diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e80e8628d5a369f7402fb0b4d6c569a64f1ac9c9 --- /dev/null +++ b/README.md @@ -0,0 +1,155 @@ +--- +library_name: peft +license: apache-2.0 +base_model: openlm-research/open_llama_3b_v2 +tags: +- generated_from_trainer +model-index: +- name: outputs/lora-out + results: [] +--- + + + +[Built with Axolotl](https://github.com/axolotl-ai-cloud/axolotl) +
See axolotl config + +axolotl version: `0.5.0` +```yaml +base_model: openlm-research/open_llama_3b_v2 +model_type: LlamaForCausalLM +tokenizer_type: LlamaTokenizer +load_in_8bit: true +load_in_4bit: false +strict: false +push_dataset_to_hub: +datasets: + - path: vicgalle/alpaca-gpt4 + type: alpaca +dataset_prepared_path: +val_set_size: 0.02 +adapter: lora +lora_model_dir: +sequence_len: 1024 +sample_packing: true +lora_r: 8 +lora_alpha: 16 +lora_dropout: 0.0 +lora_target_modules: + - gate_proj + - down_proj + - up_proj + - q_proj + - v_proj + - k_proj + - o_proj +lora_fan_in_fan_out: +wandb_project: +wandb_entity: +wandb_watch: +wandb_name: +wandb_log_model: +output_dir: ./outputs/lora-out +gradient_accumulation_steps: 1 +micro_batch_size: 2 +num_epochs: 4 +optimizer: adamw_bnb_8bit +torchdistx_path: +lr_scheduler: cosine +learning_rate: 0.0002 +train_on_inputs: false +group_by_length: false +bf16: false +fp16: true +tf32: false +gradient_checkpointing: true +early_stopping_patience: +resume_from_checkpoint: +local_rank: +logging_steps: 1 +xformers_attention: +flash_attention: true +gptq_groupsize: +s2_attention: +gptq_model_v1: +warmup_steps: 20 +evals_per_epoch: 4 +saves_per_epoch: 1 +debug: +deepspeed: +weight_decay: 0.1 +fsdp: +fsdp_config: +special_tokens: + bos_token: "" + eos_token: "" + unk_token: "" + +``` + +

+ +# outputs/lora-out + +This model is a fine-tuned version of [openlm-research/open_llama_3b_v2](https://huggingface.co./openlm-research/open_llama_3b_v2) on the None dataset. +It achieves the following results on the evaluation set: +- Loss: 1.1770 + +## Model description + +More information needed + +## Intended uses & limitations + +More information needed + +## Training and evaluation data + +More information needed + +## Training procedure + +### Training hyperparameters + +The following hyperparameters were used during training: +- learning_rate: 0.0002 +- train_batch_size: 2 +- eval_batch_size: 2 +- seed: 42 +- optimizer: Use OptimizerNames.ADAMW_BNB with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments +- lr_scheduler_type: cosine +- lr_scheduler_warmup_steps: 20 +- num_epochs: 4 +- mixed_precision_training: Native AMP + +### Training results + +| Training Loss | Epoch | Step | Validation Loss | +|:-------------:|:------:|:-----:|:---------------:| +| 1.1997 | 0.0002 | 1 | 1.3698 | +| 1.1468 | 0.25 | 1404 | 1.1159 | +| 1.2207 | 0.5 | 2808 | 1.1072 | +| 0.9448 | 0.75 | 4212 | 1.0982 | +| 1.0709 | 1.0 | 5616 | 1.0931 | +| 0.9592 | 1.2498 | 7020 | 1.1051 | +| 1.1133 | 1.4998 | 8424 | 1.1058 | +| 0.884 | 1.7498 | 9828 | 1.1018 | +| 0.9117 | 1.9998 | 11232 | 1.0963 | +| 0.9594 | 2.2496 | 12636 | 1.1336 | +| 0.9034 | 2.4996 | 14040 | 1.1338 | +| 0.6645 | 2.7496 | 15444 | 1.1326 | +| 0.8913 | 2.9996 | 16848 | 1.1309 | +| 0.9476 | 3.2495 | 18252 | 1.1752 | +| 0.9015 | 3.4995 | 19656 | 1.1762 | +| 0.6284 | 3.7495 | 21060 | 1.1768 | +| 0.7522 | 3.9995 | 22464 | 1.1770 | + + +### Framework versions + +- PEFT 0.13.2 +- Transformers 4.46.1 +- Pytorch 2.3.1+cu121 +- Datasets 3.0.1 +- Tokenizers 0.20.3 \ No newline at end of file diff --git a/adapter_config.json b/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6b6f20a570fc808390da3f2e001093ac1e56c1da --- /dev/null +++ b/adapter_config.json 
@@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "openlm-research/open_llama_3b_v2", + "bias": "none", + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "down_proj", + "o_proj", + "q_proj", + "up_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/adapter_model.bin b/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..25f077981ec7f2b43cb2ae0684498280c7a98296 --- /dev/null +++ b/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65b03db891ccca4d33155d35967b2a3c2b857232e53784271e82e9ce4f562160 +size 50982842 diff --git a/checkpoint-11232/README.md b/checkpoint-11232/README.md new file mode 100644 index 0000000000000000000000000000000000000000..719b4726992f7d0707a4253e9123dec35e4de390 --- /dev/null +++ b/checkpoint-11232/README.md @@ -0,0 +1,202 @@ +--- +base_model: openlm-research/open_llama_3b_v2 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** 
[More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/checkpoint-11232/adapter_config.json b/checkpoint-11232/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6b6f20a570fc808390da3f2e001093ac1e56c1da --- /dev/null +++ b/checkpoint-11232/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "openlm-research/open_llama_3b_v2", + "bias": "none", + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "down_proj", + "o_proj", + "q_proj", + "up_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git 
a/checkpoint-11232/adapter_model.safetensors b/checkpoint-11232/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5bb3ac9263fab3b5f0b1cc6d25d305fcb5ad958f --- /dev/null +++ b/checkpoint-11232/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91227646d5faac43c528b816cc789555bb955e55a2da268fccb375f6d47f80e2 +size 50899792 diff --git a/checkpoint-11232/optimizer.pt b/checkpoint-11232/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..7819e6803809f44ad4ced2362a126ff855f71d35 --- /dev/null +++ b/checkpoint-11232/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36ee80f0c39d96999aa9d2716aa41db37fefa4acc0f0bba7d635fb957ebd2168 +size 26231684 diff --git a/checkpoint-11232/rng_state.pth b/checkpoint-11232/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..87571ba044576778d1d6e555eff20ea04c20bbab --- /dev/null +++ b/checkpoint-11232/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5edb34d031c0c2b447f3eaadb401a4c1e7e7e6d8c096e28b7092e01a8bd48c92 +size 14244 diff --git a/checkpoint-11232/scheduler.pt b/checkpoint-11232/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f3104cf43abddc114b9a13947bb72f96cafb7034 --- /dev/null +++ b/checkpoint-11232/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba5b5a15573ac131ab58c742274eeb95853472637445e0705a6a1a1a1234b7c0 +size 1064 diff --git a/checkpoint-11232/special_tokens_map.json b/checkpoint-11232/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..72ecfeeb7e14d244c936169d2ed139eeae235ef1 --- /dev/null +++ b/checkpoint-11232/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { 
+ "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-11232/tokenizer.model b/checkpoint-11232/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..98866ff8ae3631f331c57923c921a0c9ad22b97d --- /dev/null +++ b/checkpoint-11232/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91b289e85fa20fd375d8b33dc12f77616f18abc6359804471d1fafcb425fecb8 +size 511574 diff --git a/checkpoint-11232/tokenizer_config.json b/checkpoint-11232/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c218d1b7228e3ad6055bdcf0ec15c4f188dc7d79 --- /dev/null +++ b/checkpoint-11232/tokenizer_config.json @@ -0,0 +1,43 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": true, + "model_max_length": 2048, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false, + "use_fast": true +} diff --git a/checkpoint-11232/trainer_state.json b/checkpoint-11232/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..cc8f7a4e47df175615f8a3cbe6d24ae81dd759a6 --- /dev/null +++ 
b/checkpoint-11232/trainer_state.json @@ -0,0 +1,78729 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9998219373219372, + "eval_steps": 1404, + "global_step": 11232, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00017806267806267807, + "grad_norm": 0.2854898273944855, + "learning_rate": 1e-05, + "loss": 1.1997, + "step": 1 + }, + { + "epoch": 0.00017806267806267807, + "eval_loss": 1.3698358535766602, + "eval_runtime": 24.1591, + "eval_samples_per_second": 43.089, + "eval_steps_per_second": 21.565, + "step": 1 + }, + { + "epoch": 0.00035612535612535614, + "grad_norm": 0.3508087396621704, + "learning_rate": 2e-05, + "loss": 1.4134, + "step": 2 + }, + { + "epoch": 0.0005341880341880342, + "grad_norm": 0.27050870656967163, + "learning_rate": 3e-05, + "loss": 1.3447, + "step": 3 + }, + { + "epoch": 0.0007122507122507123, + "grad_norm": 0.27706292271614075, + "learning_rate": 4e-05, + "loss": 1.0354, + "step": 4 + }, + { + "epoch": 0.0008903133903133903, + "grad_norm": 0.30398961901664734, + "learning_rate": 5e-05, + "loss": 1.1441, + "step": 5 + }, + { + "epoch": 0.0010683760683760685, + "grad_norm": 0.3103881776332855, + "learning_rate": 6e-05, + "loss": 1.341, + "step": 6 + }, + { + "epoch": 0.0012464387464387464, + "grad_norm": 0.5191189646720886, + "learning_rate": 7e-05, + "loss": 1.3457, + "step": 7 + }, + { + "epoch": 0.0014245014245014246, + "grad_norm": 0.4449467360973358, + "learning_rate": 8e-05, + "loss": 1.5051, + "step": 8 + }, + { + "epoch": 0.0016025641025641025, + "grad_norm": 0.3914581537246704, + "learning_rate": 9e-05, + "loss": 1.5525, + "step": 9 + }, + { + "epoch": 0.0017806267806267807, + "grad_norm": 0.37746086716651917, + "learning_rate": 0.0001, + "loss": 1.3266, + "step": 10 + }, + { + "epoch": 0.001958689458689459, + "grad_norm": 0.35226109623908997, + "learning_rate": 0.00011000000000000002, + "loss": 1.5416, + "step": 11 + 
}, + { + "epoch": 0.002136752136752137, + "grad_norm": 0.3343672454357147, + "learning_rate": 0.00012, + "loss": 1.3221, + "step": 12 + }, + { + "epoch": 0.0023148148148148147, + "grad_norm": 0.47298333048820496, + "learning_rate": 0.00013000000000000002, + "loss": 1.2999, + "step": 13 + }, + { + "epoch": 0.002492877492877493, + "grad_norm": 0.377814918756485, + "learning_rate": 0.00014, + "loss": 1.1688, + "step": 14 + }, + { + "epoch": 0.002670940170940171, + "grad_norm": 0.46344801783561707, + "learning_rate": 0.00015000000000000001, + "loss": 1.3565, + "step": 15 + }, + { + "epoch": 0.002849002849002849, + "grad_norm": 0.49615249037742615, + "learning_rate": 0.00016, + "loss": 1.5692, + "step": 16 + }, + { + "epoch": 0.003027065527065527, + "grad_norm": 0.5109946131706238, + "learning_rate": 0.00017, + "loss": 1.2991, + "step": 17 + }, + { + "epoch": 0.003205128205128205, + "grad_norm": 0.5125070214271545, + "learning_rate": 0.00018, + "loss": 1.3309, + "step": 18 + }, + { + "epoch": 0.003383190883190883, + "grad_norm": 0.4517767131328583, + "learning_rate": 0.00019, + "loss": 1.357, + "step": 19 + }, + { + "epoch": 0.0035612535612535613, + "grad_norm": 0.47267794609069824, + "learning_rate": 0.0002, + "loss": 1.1301, + "step": 20 + }, + { + "epoch": 0.0037393162393162395, + "grad_norm": 0.46823424100875854, + "learning_rate": 0.00019999999902035388, + "loss": 1.1195, + "step": 21 + }, + { + "epoch": 0.003917378917378918, + "grad_norm": 0.440036803483963, + "learning_rate": 0.00019999999608141548, + "loss": 1.2822, + "step": 22 + }, + { + "epoch": 0.004095441595441595, + "grad_norm": 0.371101975440979, + "learning_rate": 0.00019999999118318492, + "loss": 1.132, + "step": 23 + }, + { + "epoch": 0.004273504273504274, + "grad_norm": 0.44691094756126404, + "learning_rate": 0.00019999998432566226, + "loss": 1.2968, + "step": 24 + }, + { + "epoch": 0.004451566951566952, + "grad_norm": 0.5462725162506104, + "learning_rate": 0.0001999999755088476, + "loss": 1.1714, + 
"step": 25 + }, + { + "epoch": 0.004629629629629629, + "grad_norm": 0.39860013127326965, + "learning_rate": 0.0001999999647327412, + "loss": 1.0407, + "step": 26 + }, + { + "epoch": 0.004807692307692308, + "grad_norm": 0.5031934380531311, + "learning_rate": 0.0001999999519973432, + "loss": 1.2773, + "step": 27 + }, + { + "epoch": 0.004985754985754986, + "grad_norm": 0.42162764072418213, + "learning_rate": 0.0001999999373026539, + "loss": 1.2824, + "step": 28 + }, + { + "epoch": 0.005163817663817663, + "grad_norm": 0.40964868664741516, + "learning_rate": 0.00019999992064867353, + "loss": 1.226, + "step": 29 + }, + { + "epoch": 0.005341880341880342, + "grad_norm": 0.41650915145874023, + "learning_rate": 0.00019999990203540245, + "loss": 1.2677, + "step": 30 + }, + { + "epoch": 0.00551994301994302, + "grad_norm": 0.40052226185798645, + "learning_rate": 0.00019999988146284103, + "loss": 0.9443, + "step": 31 + }, + { + "epoch": 0.005698005698005698, + "grad_norm": 0.5198387503623962, + "learning_rate": 0.00019999985893098964, + "loss": 1.3043, + "step": 32 + }, + { + "epoch": 0.005876068376068376, + "grad_norm": 0.50941002368927, + "learning_rate": 0.00019999983443984878, + "loss": 1.2002, + "step": 33 + }, + { + "epoch": 0.006054131054131054, + "grad_norm": 0.30082932114601135, + "learning_rate": 0.00019999980798941888, + "loss": 0.9904, + "step": 34 + }, + { + "epoch": 0.006232193732193732, + "grad_norm": 0.4228935241699219, + "learning_rate": 0.00019999977957970048, + "loss": 1.1137, + "step": 35 + }, + { + "epoch": 0.00641025641025641, + "grad_norm": 0.41294750571250916, + "learning_rate": 0.0001999997492106941, + "loss": 1.3385, + "step": 36 + }, + { + "epoch": 0.006588319088319089, + "grad_norm": 0.4415493905544281, + "learning_rate": 0.00019999971688240041, + "loss": 1.1695, + "step": 37 + }, + { + "epoch": 0.006766381766381766, + "grad_norm": 0.3726460933685303, + "learning_rate": 0.00019999968259482, + "loss": 1.1734, + "step": 38 + }, + { + "epoch": 
0.006944444444444444, + "grad_norm": 0.3969627320766449, + "learning_rate": 0.0001999996463479535, + "loss": 1.1209, + "step": 39 + }, + { + "epoch": 0.007122507122507123, + "grad_norm": 0.3779667913913727, + "learning_rate": 0.0001999996081418017, + "loss": 1.1635, + "step": 40 + }, + { + "epoch": 0.0073005698005698, + "grad_norm": 0.3933636546134949, + "learning_rate": 0.0001999995679763653, + "loss": 1.1514, + "step": 41 + }, + { + "epoch": 0.007478632478632479, + "grad_norm": 0.3567957282066345, + "learning_rate": 0.00019999952585164507, + "loss": 1.2488, + "step": 42 + }, + { + "epoch": 0.007656695156695157, + "grad_norm": 0.32506081461906433, + "learning_rate": 0.00019999948176764186, + "loss": 1.149, + "step": 43 + }, + { + "epoch": 0.007834757834757835, + "grad_norm": 0.46588361263275146, + "learning_rate": 0.0001999994357243566, + "loss": 1.4263, + "step": 44 + }, + { + "epoch": 0.008012820512820512, + "grad_norm": 0.5070307850837708, + "learning_rate": 0.00019999938772179005, + "loss": 1.0698, + "step": 45 + }, + { + "epoch": 0.00819088319088319, + "grad_norm": 0.38199326395988464, + "learning_rate": 0.00019999933775994327, + "loss": 0.9907, + "step": 46 + }, + { + "epoch": 0.00836894586894587, + "grad_norm": 0.43684661388397217, + "learning_rate": 0.0001999992858388172, + "loss": 1.2905, + "step": 47 + }, + { + "epoch": 0.008547008547008548, + "grad_norm": 0.44482162594795227, + "learning_rate": 0.00019999923195841284, + "loss": 1.2153, + "step": 48 + }, + { + "epoch": 0.008725071225071225, + "grad_norm": 0.4259667694568634, + "learning_rate": 0.0001999991761187313, + "loss": 1.1582, + "step": 49 + }, + { + "epoch": 0.008903133903133903, + "grad_norm": 0.41649091243743896, + "learning_rate": 0.00019999911831977357, + "loss": 1.0185, + "step": 50 + }, + { + "epoch": 0.009081196581196582, + "grad_norm": 0.4179716110229492, + "learning_rate": 0.0001999990585615409, + "loss": 1.3579, + "step": 51 + }, + { + "epoch": 0.009259259259259259, + "grad_norm": 
0.3372558355331421, + "learning_rate": 0.00019999899684403438, + "loss": 1.0638, + "step": 52 + }, + { + "epoch": 0.009437321937321937, + "grad_norm": 0.41294020414352417, + "learning_rate": 0.00019999893316725525, + "loss": 1.1932, + "step": 53 + }, + { + "epoch": 0.009615384615384616, + "grad_norm": 0.4407919645309448, + "learning_rate": 0.00019999886753120473, + "loss": 1.4129, + "step": 54 + }, + { + "epoch": 0.009793447293447293, + "grad_norm": 0.47948843240737915, + "learning_rate": 0.00019999879993588414, + "loss": 1.2424, + "step": 55 + }, + { + "epoch": 0.009971509971509971, + "grad_norm": 0.3535355031490326, + "learning_rate": 0.00019999873038129484, + "loss": 1.0145, + "step": 56 + }, + { + "epoch": 0.01014957264957265, + "grad_norm": 0.5067078471183777, + "learning_rate": 0.00019999865886743813, + "loss": 1.4708, + "step": 57 + }, + { + "epoch": 0.010327635327635327, + "grad_norm": 0.42862898111343384, + "learning_rate": 0.0001999985853943154, + "loss": 1.0399, + "step": 58 + }, + { + "epoch": 0.010505698005698005, + "grad_norm": 0.4769059419631958, + "learning_rate": 0.00019999850996192816, + "loss": 1.1258, + "step": 59 + }, + { + "epoch": 0.010683760683760684, + "grad_norm": 0.4065442383289337, + "learning_rate": 0.0001999984325702778, + "loss": 1.2077, + "step": 60 + }, + { + "epoch": 0.010861823361823363, + "grad_norm": 0.5318329930305481, + "learning_rate": 0.0001999983532193659, + "loss": 1.2298, + "step": 61 + }, + { + "epoch": 0.01103988603988604, + "grad_norm": 0.4777173101902008, + "learning_rate": 0.000199998271909194, + "loss": 1.3195, + "step": 62 + }, + { + "epoch": 0.011217948717948718, + "grad_norm": 0.37553808093070984, + "learning_rate": 0.0001999981886397637, + "loss": 1.1188, + "step": 63 + }, + { + "epoch": 0.011396011396011397, + "grad_norm": 0.3920556902885437, + "learning_rate": 0.0001999981034110766, + "loss": 1.1448, + "step": 64 + }, + { + "epoch": 0.011574074074074073, + "grad_norm": 0.454272598028183, + "learning_rate": 
0.0001999980162231344, + "loss": 1.0812, + "step": 65 + }, + { + "epoch": 0.011752136752136752, + "grad_norm": 0.4354456663131714, + "learning_rate": 0.00019999792707593882, + "loss": 1.1174, + "step": 66 + }, + { + "epoch": 0.01193019943019943, + "grad_norm": 0.5030252933502197, + "learning_rate": 0.00019999783596949156, + "loss": 1.2925, + "step": 67 + }, + { + "epoch": 0.012108262108262107, + "grad_norm": 0.5141571164131165, + "learning_rate": 0.00019999774290379446, + "loss": 1.6193, + "step": 68 + }, + { + "epoch": 0.012286324786324786, + "grad_norm": 0.417298287153244, + "learning_rate": 0.0001999976478788493, + "loss": 1.1875, + "step": 69 + }, + { + "epoch": 0.012464387464387465, + "grad_norm": 0.4642415940761566, + "learning_rate": 0.00019999755089465795, + "loss": 1.4138, + "step": 70 + }, + { + "epoch": 0.012642450142450143, + "grad_norm": 0.43184754252433777, + "learning_rate": 0.0001999974519512223, + "loss": 1.0697, + "step": 71 + }, + { + "epoch": 0.01282051282051282, + "grad_norm": 0.46698349714279175, + "learning_rate": 0.00019999735104854436, + "loss": 0.709, + "step": 72 + }, + { + "epoch": 0.012998575498575499, + "grad_norm": 0.37253814935684204, + "learning_rate": 0.000199997248186626, + "loss": 1.2084, + "step": 73 + }, + { + "epoch": 0.013176638176638177, + "grad_norm": 0.3851388692855835, + "learning_rate": 0.0001999971433654693, + "loss": 1.0548, + "step": 74 + }, + { + "epoch": 0.013354700854700854, + "grad_norm": 0.4434688985347748, + "learning_rate": 0.00019999703658507635, + "loss": 1.4084, + "step": 75 + }, + { + "epoch": 0.013532763532763533, + "grad_norm": 0.43164482712745667, + "learning_rate": 0.00019999692784544913, + "loss": 1.4872, + "step": 76 + }, + { + "epoch": 0.013710826210826211, + "grad_norm": 0.4224303364753723, + "learning_rate": 0.00019999681714658984, + "loss": 1.2221, + "step": 77 + }, + { + "epoch": 0.013888888888888888, + "grad_norm": 0.35588955879211426, + "learning_rate": 0.00019999670448850069, + "loss": 0.84, + 
"step": 78 + }, + { + "epoch": 0.014066951566951567, + "grad_norm": 0.3970590829849243, + "learning_rate": 0.0001999965898711838, + "loss": 1.1886, + "step": 79 + }, + { + "epoch": 0.014245014245014245, + "grad_norm": 0.4331924319267273, + "learning_rate": 0.00019999647329464146, + "loss": 1.179, + "step": 80 + }, + { + "epoch": 0.014423076923076924, + "grad_norm": 0.4226946234703064, + "learning_rate": 0.00019999635475887598, + "loss": 1.1496, + "step": 81 + }, + { + "epoch": 0.0146011396011396, + "grad_norm": 0.381592720746994, + "learning_rate": 0.00019999623426388962, + "loss": 1.1774, + "step": 82 + }, + { + "epoch": 0.01477920227920228, + "grad_norm": 0.4190855622291565, + "learning_rate": 0.00019999611180968478, + "loss": 1.1491, + "step": 83 + }, + { + "epoch": 0.014957264957264958, + "grad_norm": 0.3904292583465576, + "learning_rate": 0.00019999598739626389, + "loss": 1.1275, + "step": 84 + }, + { + "epoch": 0.015135327635327635, + "grad_norm": 0.4515478014945984, + "learning_rate": 0.0001999958610236293, + "loss": 1.2404, + "step": 85 + }, + { + "epoch": 0.015313390313390313, + "grad_norm": 0.48341724276542664, + "learning_rate": 0.00019999573269178359, + "loss": 1.3572, + "step": 86 + }, + { + "epoch": 0.015491452991452992, + "grad_norm": 0.42150333523750305, + "learning_rate": 0.00019999560240072914, + "loss": 1.0203, + "step": 87 + }, + { + "epoch": 0.01566951566951567, + "grad_norm": 0.45445525646209717, + "learning_rate": 0.00019999547015046867, + "loss": 1.0677, + "step": 88 + }, + { + "epoch": 0.01584757834757835, + "grad_norm": 0.3581015467643738, + "learning_rate": 0.00019999533594100463, + "loss": 1.0693, + "step": 89 + }, + { + "epoch": 0.016025641025641024, + "grad_norm": 0.4430878758430481, + "learning_rate": 0.00019999519977233971, + "loss": 1.1591, + "step": 90 + }, + { + "epoch": 0.016203703703703703, + "grad_norm": 0.3940352201461792, + "learning_rate": 0.0001999950616444766, + "loss": 1.1325, + "step": 91 + }, + { + "epoch": 
0.01638176638176638, + "grad_norm": 0.4521673321723938, + "learning_rate": 0.00019999492155741794, + "loss": 1.3288, + "step": 92 + }, + { + "epoch": 0.01655982905982906, + "grad_norm": 0.3988296687602997, + "learning_rate": 0.00019999477951116658, + "loss": 1.0023, + "step": 93 + }, + { + "epoch": 0.01673789173789174, + "grad_norm": 0.38709723949432373, + "learning_rate": 0.00019999463550572516, + "loss": 1.2623, + "step": 94 + }, + { + "epoch": 0.016915954415954417, + "grad_norm": 0.35376182198524475, + "learning_rate": 0.00019999448954109662, + "loss": 1.0643, + "step": 95 + }, + { + "epoch": 0.017094017094017096, + "grad_norm": 0.49547120928764343, + "learning_rate": 0.00019999434161728377, + "loss": 1.2121, + "step": 96 + }, + { + "epoch": 0.01727207977207977, + "grad_norm": 0.49593672156333923, + "learning_rate": 0.00019999419173428952, + "loss": 1.1635, + "step": 97 + }, + { + "epoch": 0.01745014245014245, + "grad_norm": 0.4146541953086853, + "learning_rate": 0.0001999940398921168, + "loss": 1.1452, + "step": 98 + }, + { + "epoch": 0.017628205128205128, + "grad_norm": 0.5177254676818848, + "learning_rate": 0.00019999388609076858, + "loss": 1.2178, + "step": 99 + }, + { + "epoch": 0.017806267806267807, + "grad_norm": 0.4012768864631653, + "learning_rate": 0.0001999937303302479, + "loss": 0.9222, + "step": 100 + }, + { + "epoch": 0.017984330484330485, + "grad_norm": 0.4597131907939911, + "learning_rate": 0.00019999357261055777, + "loss": 0.979, + "step": 101 + }, + { + "epoch": 0.018162393162393164, + "grad_norm": 0.6190966963768005, + "learning_rate": 0.00019999341293170132, + "loss": 1.3909, + "step": 102 + }, + { + "epoch": 0.01834045584045584, + "grad_norm": 0.4576462209224701, + "learning_rate": 0.00019999325129368164, + "loss": 1.073, + "step": 103 + }, + { + "epoch": 0.018518518518518517, + "grad_norm": 0.4036749005317688, + "learning_rate": 0.00019999308769650192, + "loss": 1.1354, + "step": 104 + }, + { + "epoch": 0.018696581196581196, + "grad_norm": 
0.4722452759742737, + "learning_rate": 0.00019999292214016538, + "loss": 1.2039, + "step": 105 + }, + { + "epoch": 0.018874643874643875, + "grad_norm": 0.5338274240493774, + "learning_rate": 0.00019999275462467527, + "loss": 1.225, + "step": 106 + }, + { + "epoch": 0.019052706552706553, + "grad_norm": 0.4301491677761078, + "learning_rate": 0.00019999258515003484, + "loss": 1.0601, + "step": 107 + }, + { + "epoch": 0.019230769230769232, + "grad_norm": 0.33271175622940063, + "learning_rate": 0.0001999924137162474, + "loss": 0.8441, + "step": 108 + }, + { + "epoch": 0.01940883190883191, + "grad_norm": 0.4648784399032593, + "learning_rate": 0.0001999922403233163, + "loss": 1.2038, + "step": 109 + }, + { + "epoch": 0.019586894586894586, + "grad_norm": 0.37915176153182983, + "learning_rate": 0.00019999206497124504, + "loss": 1.0923, + "step": 110 + }, + { + "epoch": 0.019764957264957264, + "grad_norm": 0.3865506052970886, + "learning_rate": 0.00019999188766003695, + "loss": 0.9535, + "step": 111 + }, + { + "epoch": 0.019943019943019943, + "grad_norm": 0.35739636421203613, + "learning_rate": 0.0001999917083896955, + "loss": 1.2688, + "step": 112 + }, + { + "epoch": 0.02012108262108262, + "grad_norm": 0.3943796157836914, + "learning_rate": 0.0001999915271602243, + "loss": 1.1097, + "step": 113 + }, + { + "epoch": 0.0202991452991453, + "grad_norm": 0.44758161902427673, + "learning_rate": 0.0001999913439716268, + "loss": 1.2698, + "step": 114 + }, + { + "epoch": 0.02047720797720798, + "grad_norm": 0.3749747574329376, + "learning_rate": 0.00019999115882390664, + "loss": 1.1091, + "step": 115 + }, + { + "epoch": 0.020655270655270654, + "grad_norm": 0.3479487895965576, + "learning_rate": 0.00019999097171706745, + "loss": 1.0049, + "step": 116 + }, + { + "epoch": 0.020833333333333332, + "grad_norm": 0.4491243064403534, + "learning_rate": 0.00019999078265111285, + "loss": 1.1857, + "step": 117 + }, + { + "epoch": 0.02101139601139601, + "grad_norm": 0.345289021730423, + 
"learning_rate": 0.00019999059162604662, + "loss": 1.1397, + "step": 118 + }, + { + "epoch": 0.02118945868945869, + "grad_norm": 0.5467649698257446, + "learning_rate": 0.00019999039864187243, + "loss": 1.2196, + "step": 119 + }, + { + "epoch": 0.021367521367521368, + "grad_norm": 0.36446481943130493, + "learning_rate": 0.00019999020369859409, + "loss": 0.796, + "step": 120 + }, + { + "epoch": 0.021545584045584047, + "grad_norm": 0.4225841760635376, + "learning_rate": 0.00019999000679621543, + "loss": 0.9684, + "step": 121 + }, + { + "epoch": 0.021723646723646725, + "grad_norm": 0.4205594062805176, + "learning_rate": 0.0001999898079347403, + "loss": 1.2762, + "step": 122 + }, + { + "epoch": 0.0219017094017094, + "grad_norm": 0.43773892521858215, + "learning_rate": 0.00019998960711417257, + "loss": 1.117, + "step": 123 + }, + { + "epoch": 0.02207977207977208, + "grad_norm": 0.41279685497283936, + "learning_rate": 0.00019998940433451623, + "loss": 1.1502, + "step": 124 + }, + { + "epoch": 0.022257834757834757, + "grad_norm": 0.4090803563594818, + "learning_rate": 0.0001999891995957752, + "loss": 1.2591, + "step": 125 + }, + { + "epoch": 0.022435897435897436, + "grad_norm": 0.6000410914421082, + "learning_rate": 0.0001999889928979535, + "loss": 1.4321, + "step": 126 + }, + { + "epoch": 0.022613960113960115, + "grad_norm": 0.524264395236969, + "learning_rate": 0.00019998878424105524, + "loss": 1.1849, + "step": 127 + }, + { + "epoch": 0.022792022792022793, + "grad_norm": 0.4581047296524048, + "learning_rate": 0.00019998857362508443, + "loss": 1.0598, + "step": 128 + }, + { + "epoch": 0.022970085470085472, + "grad_norm": 0.42663446068763733, + "learning_rate": 0.00019998836105004526, + "loss": 1.1909, + "step": 129 + }, + { + "epoch": 0.023148148148148147, + "grad_norm": 0.45709118247032166, + "learning_rate": 0.00019998814651594183, + "loss": 1.2104, + "step": 130 + }, + { + "epoch": 0.023326210826210825, + "grad_norm": 0.39528369903564453, + "learning_rate": 
0.0001999879300227784, + "loss": 1.3073, + "step": 131 + }, + { + "epoch": 0.023504273504273504, + "grad_norm": 0.46896448731422424, + "learning_rate": 0.00019998771157055914, + "loss": 1.3202, + "step": 132 + }, + { + "epoch": 0.023682336182336183, + "grad_norm": 0.4386129677295685, + "learning_rate": 0.00019998749115928842, + "loss": 1.2196, + "step": 133 + }, + { + "epoch": 0.02386039886039886, + "grad_norm": 0.45920488238334656, + "learning_rate": 0.00019998726878897051, + "loss": 1.3668, + "step": 134 + }, + { + "epoch": 0.02403846153846154, + "grad_norm": 0.4115797281265259, + "learning_rate": 0.0001999870444596098, + "loss": 1.1052, + "step": 135 + }, + { + "epoch": 0.024216524216524215, + "grad_norm": 0.3860839903354645, + "learning_rate": 0.0001999868181712106, + "loss": 1.0344, + "step": 136 + }, + { + "epoch": 0.024394586894586893, + "grad_norm": 0.42514732480049133, + "learning_rate": 0.00019998658992377742, + "loss": 1.1979, + "step": 137 + }, + { + "epoch": 0.024572649572649572, + "grad_norm": 0.36001840233802795, + "learning_rate": 0.00019998635971731475, + "loss": 1.4536, + "step": 138 + }, + { + "epoch": 0.02475071225071225, + "grad_norm": 0.3739112317562103, + "learning_rate": 0.00019998612755182707, + "loss": 1.0097, + "step": 139 + }, + { + "epoch": 0.02492877492877493, + "grad_norm": 0.37545472383499146, + "learning_rate": 0.00019998589342731888, + "loss": 0.829, + "step": 140 + }, + { + "epoch": 0.025106837606837608, + "grad_norm": 0.38660728931427, + "learning_rate": 0.0001999856573437948, + "loss": 1.1324, + "step": 141 + }, + { + "epoch": 0.025284900284900286, + "grad_norm": 0.3741356432437897, + "learning_rate": 0.00019998541930125953, + "loss": 1.0934, + "step": 142 + }, + { + "epoch": 0.02546296296296296, + "grad_norm": 0.41900336742401123, + "learning_rate": 0.00019998517929971764, + "loss": 1.0336, + "step": 143 + }, + { + "epoch": 0.02564102564102564, + "grad_norm": 0.4167572259902954, + "learning_rate": 0.00019998493733917384, + 
"loss": 1.2571, + "step": 144 + }, + { + "epoch": 0.02581908831908832, + "grad_norm": 0.39437636733055115, + "learning_rate": 0.0001999846934196329, + "loss": 1.2283, + "step": 145 + }, + { + "epoch": 0.025997150997150997, + "grad_norm": 0.39129480719566345, + "learning_rate": 0.00019998444754109964, + "loss": 0.9893, + "step": 146 + }, + { + "epoch": 0.026175213675213676, + "grad_norm": 0.45533549785614014, + "learning_rate": 0.0001999841997035788, + "loss": 1.0793, + "step": 147 + }, + { + "epoch": 0.026353276353276354, + "grad_norm": 0.3741768002510071, + "learning_rate": 0.00019998394990707524, + "loss": 1.2179, + "step": 148 + }, + { + "epoch": 0.026531339031339033, + "grad_norm": 0.4066533148288727, + "learning_rate": 0.0001999836981515939, + "loss": 1.1443, + "step": 149 + }, + { + "epoch": 0.026709401709401708, + "grad_norm": 0.4851688742637634, + "learning_rate": 0.0001999834444371397, + "loss": 1.1668, + "step": 150 + }, + { + "epoch": 0.026887464387464387, + "grad_norm": 0.428091436624527, + "learning_rate": 0.0001999831887637176, + "loss": 1.2676, + "step": 151 + }, + { + "epoch": 0.027065527065527065, + "grad_norm": 0.4024655222892761, + "learning_rate": 0.0001999829311313326, + "loss": 1.3115, + "step": 152 + }, + { + "epoch": 0.027243589743589744, + "grad_norm": 0.43983033299446106, + "learning_rate": 0.00019998267153998976, + "loss": 1.1019, + "step": 153 + }, + { + "epoch": 0.027421652421652423, + "grad_norm": 0.4317505359649658, + "learning_rate": 0.0001999824099896942, + "loss": 1.3129, + "step": 154 + }, + { + "epoch": 0.0275997150997151, + "grad_norm": 0.43107882142066956, + "learning_rate": 0.000199982146480451, + "loss": 1.2134, + "step": 155 + }, + { + "epoch": 0.027777777777777776, + "grad_norm": 0.3939448297023773, + "learning_rate": 0.00019998188101226532, + "loss": 1.0321, + "step": 156 + }, + { + "epoch": 0.027955840455840455, + "grad_norm": 0.4641847610473633, + "learning_rate": 0.00019998161358514237, + "loss": 1.2369, + "step": 157 + 
}, + { + "epoch": 0.028133903133903133, + "grad_norm": 0.3538529872894287, + "learning_rate": 0.0001999813441990874, + "loss": 1.2061, + "step": 158 + }, + { + "epoch": 0.028311965811965812, + "grad_norm": 0.3277950584888458, + "learning_rate": 0.0001999810728541057, + "loss": 0.9419, + "step": 159 + }, + { + "epoch": 0.02849002849002849, + "grad_norm": 0.424710750579834, + "learning_rate": 0.00019998079955020254, + "loss": 1.3302, + "step": 160 + }, + { + "epoch": 0.02866809116809117, + "grad_norm": 0.4120834469795227, + "learning_rate": 0.00019998052428738333, + "loss": 1.079, + "step": 161 + }, + { + "epoch": 0.028846153846153848, + "grad_norm": 0.45811930298805237, + "learning_rate": 0.00019998024706565346, + "loss": 1.1259, + "step": 162 + }, + { + "epoch": 0.029024216524216523, + "grad_norm": 0.3873266875743866, + "learning_rate": 0.0001999799678850183, + "loss": 1.2124, + "step": 163 + }, + { + "epoch": 0.0292022792022792, + "grad_norm": 0.5806412696838379, + "learning_rate": 0.00019997968674548337, + "loss": 1.3467, + "step": 164 + }, + { + "epoch": 0.02938034188034188, + "grad_norm": 0.3906802833080292, + "learning_rate": 0.00019997940364705418, + "loss": 1.1438, + "step": 165 + }, + { + "epoch": 0.02955840455840456, + "grad_norm": 0.45201995968818665, + "learning_rate": 0.00019997911858973626, + "loss": 1.1469, + "step": 166 + }, + { + "epoch": 0.029736467236467237, + "grad_norm": 0.4965892732143402, + "learning_rate": 0.0001999788315735352, + "loss": 1.0829, + "step": 167 + }, + { + "epoch": 0.029914529914529916, + "grad_norm": 0.32578057050704956, + "learning_rate": 0.0001999785425984566, + "loss": 1.0432, + "step": 168 + }, + { + "epoch": 0.03009259259259259, + "grad_norm": 0.4146028161048889, + "learning_rate": 0.00019997825166450617, + "loss": 1.1657, + "step": 169 + }, + { + "epoch": 0.03027065527065527, + "grad_norm": 0.4342964291572571, + "learning_rate": 0.0001999779587716896, + "loss": 1.2038, + "step": 170 + }, + { + "epoch": 
0.030448717948717948, + "grad_norm": 0.40128546953201294, + "learning_rate": 0.00019997766392001258, + "loss": 1.3044, + "step": 171 + }, + { + "epoch": 0.030626780626780627, + "grad_norm": 0.4357539117336273, + "learning_rate": 0.00019997736710948094, + "loss": 1.2143, + "step": 172 + }, + { + "epoch": 0.030804843304843305, + "grad_norm": 0.4821035861968994, + "learning_rate": 0.00019997706834010045, + "loss": 1.0469, + "step": 173 + }, + { + "epoch": 0.030982905982905984, + "grad_norm": 0.3966675102710724, + "learning_rate": 0.000199976767611877, + "loss": 1.2122, + "step": 174 + }, + { + "epoch": 0.031160968660968662, + "grad_norm": 0.4265064299106598, + "learning_rate": 0.00019997646492481648, + "loss": 1.0871, + "step": 175 + }, + { + "epoch": 0.03133903133903134, + "grad_norm": 0.3445652723312378, + "learning_rate": 0.00019997616027892485, + "loss": 1.0412, + "step": 176 + }, + { + "epoch": 0.031517094017094016, + "grad_norm": 0.47187718749046326, + "learning_rate": 0.000199975853674208, + "loss": 1.0822, + "step": 177 + }, + { + "epoch": 0.0316951566951567, + "grad_norm": 0.37751707434654236, + "learning_rate": 0.000199975545110672, + "loss": 1.1439, + "step": 178 + }, + { + "epoch": 0.03187321937321937, + "grad_norm": 0.38792455196380615, + "learning_rate": 0.00019997523458832286, + "loss": 0.8604, + "step": 179 + }, + { + "epoch": 0.03205128205128205, + "grad_norm": 0.35199594497680664, + "learning_rate": 0.00019997492210716667, + "loss": 1.0819, + "step": 180 + }, + { + "epoch": 0.03222934472934473, + "grad_norm": 0.4828922748565674, + "learning_rate": 0.00019997460766720958, + "loss": 1.1879, + "step": 181 + }, + { + "epoch": 0.032407407407407406, + "grad_norm": 0.46153363585472107, + "learning_rate": 0.00019997429126845774, + "loss": 1.1592, + "step": 182 + }, + { + "epoch": 0.03258547008547009, + "grad_norm": 0.4844890832901001, + "learning_rate": 0.0001999739729109173, + "loss": 1.1334, + "step": 183 + }, + { + "epoch": 0.03276353276353276, + 
"grad_norm": 0.414617121219635, + "learning_rate": 0.00019997365259459457, + "loss": 1.0547, + "step": 184 + }, + { + "epoch": 0.032941595441595445, + "grad_norm": 0.46544626355171204, + "learning_rate": 0.00019997333031949581, + "loss": 1.4067, + "step": 185 + }, + { + "epoch": 0.03311965811965812, + "grad_norm": 0.48489415645599365, + "learning_rate": 0.0001999730060856273, + "loss": 1.4027, + "step": 186 + }, + { + "epoch": 0.033297720797720795, + "grad_norm": 0.3963346481323242, + "learning_rate": 0.0001999726798929954, + "loss": 1.1327, + "step": 187 + }, + { + "epoch": 0.03347578347578348, + "grad_norm": 0.3809385895729065, + "learning_rate": 0.00019997235174160652, + "loss": 1.3475, + "step": 188 + }, + { + "epoch": 0.03365384615384615, + "grad_norm": 0.3866960406303406, + "learning_rate": 0.0001999720216314671, + "loss": 1.1576, + "step": 189 + }, + { + "epoch": 0.033831908831908834, + "grad_norm": 0.34976935386657715, + "learning_rate": 0.00019997168956258356, + "loss": 0.9361, + "step": 190 + }, + { + "epoch": 0.03400997150997151, + "grad_norm": 0.38681939244270325, + "learning_rate": 0.00019997135553496243, + "loss": 1.1796, + "step": 191 + }, + { + "epoch": 0.03418803418803419, + "grad_norm": 0.41905197501182556, + "learning_rate": 0.0001999710195486103, + "loss": 1.1714, + "step": 192 + }, + { + "epoch": 0.03436609686609687, + "grad_norm": 0.42356589436531067, + "learning_rate": 0.0001999706816035337, + "loss": 1.0022, + "step": 193 + }, + { + "epoch": 0.03454415954415954, + "grad_norm": 0.3929740786552429, + "learning_rate": 0.00019997034169973925, + "loss": 1.3769, + "step": 194 + }, + { + "epoch": 0.034722222222222224, + "grad_norm": 0.4325186312198639, + "learning_rate": 0.00019996999983723366, + "loss": 1.3057, + "step": 195 + }, + { + "epoch": 0.0349002849002849, + "grad_norm": 0.3954029381275177, + "learning_rate": 0.00019996965601602355, + "loss": 1.1958, + "step": 196 + }, + { + "epoch": 0.03507834757834758, + "grad_norm": 0.34454262256622314, 
+ "learning_rate": 0.00019996931023611572, + "loss": 1.0972, + "step": 197 + }, + { + "epoch": 0.035256410256410256, + "grad_norm": 0.48900291323661804, + "learning_rate": 0.0001999689624975169, + "loss": 1.213, + "step": 198 + }, + { + "epoch": 0.03543447293447293, + "grad_norm": 0.35214388370513916, + "learning_rate": 0.00019996861280023397, + "loss": 1.0285, + "step": 199 + }, + { + "epoch": 0.03561253561253561, + "grad_norm": 0.49393126368522644, + "learning_rate": 0.00019996826114427373, + "loss": 1.2313, + "step": 200 + }, + { + "epoch": 0.03579059829059829, + "grad_norm": 0.3994458019733429, + "learning_rate": 0.00019996790752964305, + "loss": 1.0474, + "step": 201 + }, + { + "epoch": 0.03596866096866097, + "grad_norm": 0.5387318730354309, + "learning_rate": 0.0001999675519563489, + "loss": 1.3067, + "step": 202 + }, + { + "epoch": 0.036146723646723646, + "grad_norm": 0.4976751208305359, + "learning_rate": 0.00019996719442439824, + "loss": 1.2593, + "step": 203 + }, + { + "epoch": 0.03632478632478633, + "grad_norm": 0.47052907943725586, + "learning_rate": 0.0001999668349337981, + "loss": 1.1036, + "step": 204 + }, + { + "epoch": 0.036502849002849, + "grad_norm": 0.39616644382476807, + "learning_rate": 0.00019996647348455543, + "loss": 1.0481, + "step": 205 + }, + { + "epoch": 0.03668091168091168, + "grad_norm": 0.42987677454948425, + "learning_rate": 0.00019996611007667742, + "loss": 1.0923, + "step": 206 + }, + { + "epoch": 0.03685897435897436, + "grad_norm": 0.47065848112106323, + "learning_rate": 0.00019996574471017113, + "loss": 1.1403, + "step": 207 + }, + { + "epoch": 0.037037037037037035, + "grad_norm": 0.4363015592098236, + "learning_rate": 0.00019996537738504373, + "loss": 1.253, + "step": 208 + }, + { + "epoch": 0.03721509971509972, + "grad_norm": 0.4038296937942505, + "learning_rate": 0.00019996500810130243, + "loss": 1.1679, + "step": 209 + }, + { + "epoch": 0.03739316239316239, + "grad_norm": 0.5038532018661499, + "learning_rate": 
0.00019996463685895445, + "loss": 1.1182, + "step": 210 + }, + { + "epoch": 0.037571225071225074, + "grad_norm": 0.37740692496299744, + "learning_rate": 0.00019996426365800706, + "loss": 1.0465, + "step": 211 + }, + { + "epoch": 0.03774928774928775, + "grad_norm": 0.47794604301452637, + "learning_rate": 0.00019996388849846759, + "loss": 1.2836, + "step": 212 + }, + { + "epoch": 0.037927350427350424, + "grad_norm": 0.38460609316825867, + "learning_rate": 0.0001999635113803434, + "loss": 1.2099, + "step": 213 + }, + { + "epoch": 0.038105413105413107, + "grad_norm": 0.42016157507896423, + "learning_rate": 0.0001999631323036418, + "loss": 1.152, + "step": 214 + }, + { + "epoch": 0.03828347578347578, + "grad_norm": 0.4024946391582489, + "learning_rate": 0.00019996275126837033, + "loss": 1.1534, + "step": 215 + }, + { + "epoch": 0.038461538461538464, + "grad_norm": 0.4573793411254883, + "learning_rate": 0.00019996236827453642, + "loss": 1.2019, + "step": 216 + }, + { + "epoch": 0.03863960113960114, + "grad_norm": 0.3642503321170807, + "learning_rate": 0.0001999619833221475, + "loss": 1.0541, + "step": 217 + }, + { + "epoch": 0.03881766381766382, + "grad_norm": 0.38492897152900696, + "learning_rate": 0.0001999615964112112, + "loss": 1.1269, + "step": 218 + }, + { + "epoch": 0.038995726495726496, + "grad_norm": 0.427219420671463, + "learning_rate": 0.0001999612075417351, + "loss": 1.1126, + "step": 219 + }, + { + "epoch": 0.03917378917378917, + "grad_norm": 0.40781742334365845, + "learning_rate": 0.00019996081671372676, + "loss": 1.2207, + "step": 220 + }, + { + "epoch": 0.03935185185185185, + "grad_norm": 0.39229512214660645, + "learning_rate": 0.00019996042392719386, + "loss": 1.0403, + "step": 221 + }, + { + "epoch": 0.03952991452991453, + "grad_norm": 0.42038577795028687, + "learning_rate": 0.0001999600291821441, + "loss": 1.2157, + "step": 222 + }, + { + "epoch": 0.03970797720797721, + "grad_norm": 0.3963491916656494, + "learning_rate": 0.00019995963247858525, + 
"loss": 1.0532, + "step": 223 + }, + { + "epoch": 0.039886039886039885, + "grad_norm": 0.4389874041080475, + "learning_rate": 0.00019995923381652502, + "loss": 1.4279, + "step": 224 + }, + { + "epoch": 0.04006410256410257, + "grad_norm": 0.357312947511673, + "learning_rate": 0.00019995883319597123, + "loss": 0.9871, + "step": 225 + }, + { + "epoch": 0.04024216524216524, + "grad_norm": 0.3644427955150604, + "learning_rate": 0.00019995843061693181, + "loss": 1.0879, + "step": 226 + }, + { + "epoch": 0.04042022792022792, + "grad_norm": 0.4074651002883911, + "learning_rate": 0.00019995802607941453, + "loss": 1.2138, + "step": 227 + }, + { + "epoch": 0.0405982905982906, + "grad_norm": 0.40709465742111206, + "learning_rate": 0.0001999576195834274, + "loss": 1.1905, + "step": 228 + }, + { + "epoch": 0.040776353276353275, + "grad_norm": 0.4280182719230652, + "learning_rate": 0.00019995721112897838, + "loss": 1.2331, + "step": 229 + }, + { + "epoch": 0.04095441595441596, + "grad_norm": 0.37846076488494873, + "learning_rate": 0.00019995680071607544, + "loss": 1.078, + "step": 230 + }, + { + "epoch": 0.04113247863247863, + "grad_norm": 0.3877260088920593, + "learning_rate": 0.0001999563883447266, + "loss": 1.0309, + "step": 231 + }, + { + "epoch": 0.04131054131054131, + "grad_norm": 0.42886826395988464, + "learning_rate": 0.00019995597401494, + "loss": 1.0403, + "step": 232 + }, + { + "epoch": 0.04148860398860399, + "grad_norm": 0.4316534101963043, + "learning_rate": 0.00019995555772672372, + "loss": 1.2418, + "step": 233 + }, + { + "epoch": 0.041666666666666664, + "grad_norm": 0.45768865942955017, + "learning_rate": 0.00019995513948008593, + "loss": 1.233, + "step": 234 + }, + { + "epoch": 0.041844729344729346, + "grad_norm": 0.5647913813591003, + "learning_rate": 0.00019995471927503481, + "loss": 1.1346, + "step": 235 + }, + { + "epoch": 0.04202279202279202, + "grad_norm": 0.3797492980957031, + "learning_rate": 0.00019995429711157863, + "loss": 1.1574, + "step": 236 + }, + 
{ + "epoch": 0.042200854700854704, + "grad_norm": 0.4392767548561096, + "learning_rate": 0.00019995387298972562, + "loss": 0.8988, + "step": 237 + }, + { + "epoch": 0.04237891737891738, + "grad_norm": 0.37331557273864746, + "learning_rate": 0.0001999534469094841, + "loss": 1.0439, + "step": 238 + }, + { + "epoch": 0.042556980056980054, + "grad_norm": 0.3785935938358307, + "learning_rate": 0.00019995301887086245, + "loss": 0.9839, + "step": 239 + }, + { + "epoch": 0.042735042735042736, + "grad_norm": 0.4351862668991089, + "learning_rate": 0.00019995258887386898, + "loss": 1.2653, + "step": 240 + }, + { + "epoch": 0.04291310541310541, + "grad_norm": 0.399475634098053, + "learning_rate": 0.0001999521569185122, + "loss": 0.9877, + "step": 241 + }, + { + "epoch": 0.04309116809116809, + "grad_norm": 0.42332810163497925, + "learning_rate": 0.00019995172300480053, + "loss": 1.2403, + "step": 242 + }, + { + "epoch": 0.04326923076923077, + "grad_norm": 0.4397708475589752, + "learning_rate": 0.00019995128713274247, + "loss": 0.9316, + "step": 243 + }, + { + "epoch": 0.04344729344729345, + "grad_norm": 0.3614110052585602, + "learning_rate": 0.00019995084930234658, + "loss": 1.1088, + "step": 244 + }, + { + "epoch": 0.043625356125356125, + "grad_norm": 0.39433717727661133, + "learning_rate": 0.0001999504095136214, + "loss": 1.2002, + "step": 245 + }, + { + "epoch": 0.0438034188034188, + "grad_norm": 0.33088216185569763, + "learning_rate": 0.0001999499677665756, + "loss": 0.8796, + "step": 246 + }, + { + "epoch": 0.04398148148148148, + "grad_norm": 0.5239143967628479, + "learning_rate": 0.00019994952406121784, + "loss": 1.2808, + "step": 247 + }, + { + "epoch": 0.04415954415954416, + "grad_norm": 0.42156723141670227, + "learning_rate": 0.00019994907839755675, + "loss": 1.1775, + "step": 248 + }, + { + "epoch": 0.04433760683760684, + "grad_norm": 0.42569902539253235, + "learning_rate": 0.0001999486307756011, + "loss": 1.001, + "step": 249 + }, + { + "epoch": 0.044515669515669515, 
+ "grad_norm": 0.38241544365882874, + "learning_rate": 0.00019994818119535964, + "loss": 1.1064, + "step": 250 + }, + { + "epoch": 0.0446937321937322, + "grad_norm": 0.4185071885585785, + "learning_rate": 0.0001999477296568412, + "loss": 1.2109, + "step": 251 + }, + { + "epoch": 0.04487179487179487, + "grad_norm": 0.4189644157886505, + "learning_rate": 0.00019994727616005464, + "loss": 1.2902, + "step": 252 + }, + { + "epoch": 0.04504985754985755, + "grad_norm": 0.34671884775161743, + "learning_rate": 0.0001999468207050088, + "loss": 0.9429, + "step": 253 + }, + { + "epoch": 0.04522792022792023, + "grad_norm": 0.42391687631607056, + "learning_rate": 0.00019994636329171266, + "loss": 0.7179, + "step": 254 + }, + { + "epoch": 0.045405982905982904, + "grad_norm": 0.3803195655345917, + "learning_rate": 0.00019994590392017513, + "loss": 1.0318, + "step": 255 + }, + { + "epoch": 0.045584045584045586, + "grad_norm": 0.3389956057071686, + "learning_rate": 0.00019994544259040525, + "loss": 1.0485, + "step": 256 + }, + { + "epoch": 0.04576210826210826, + "grad_norm": 0.4927038550376892, + "learning_rate": 0.000199944979302412, + "loss": 1.3426, + "step": 257 + }, + { + "epoch": 0.045940170940170943, + "grad_norm": 0.33200421929359436, + "learning_rate": 0.00019994451405620453, + "loss": 1.0071, + "step": 258 + }, + { + "epoch": 0.04611823361823362, + "grad_norm": 0.38028615713119507, + "learning_rate": 0.00019994404685179195, + "loss": 1.0985, + "step": 259 + }, + { + "epoch": 0.046296296296296294, + "grad_norm": 0.3752151429653168, + "learning_rate": 0.00019994357768918333, + "loss": 0.9209, + "step": 260 + }, + { + "epoch": 0.046474358974358976, + "grad_norm": 0.43030866980552673, + "learning_rate": 0.00019994310656838796, + "loss": 0.9921, + "step": 261 + }, + { + "epoch": 0.04665242165242165, + "grad_norm": 0.4402460753917694, + "learning_rate": 0.00019994263348941502, + "loss": 1.1051, + "step": 262 + }, + { + "epoch": 0.04683048433048433, + "grad_norm": 
0.43012720346450806, + "learning_rate": 0.0001999421584522738, + "loss": 1.1839, + "step": 263 + }, + { + "epoch": 0.04700854700854701, + "grad_norm": 0.4195305407047272, + "learning_rate": 0.0001999416814569736, + "loss": 1.1749, + "step": 264 + }, + { + "epoch": 0.04718660968660968, + "grad_norm": 0.45623287558555603, + "learning_rate": 0.00019994120250352372, + "loss": 1.2433, + "step": 265 + }, + { + "epoch": 0.047364672364672365, + "grad_norm": 0.4736156761646271, + "learning_rate": 0.00019994072159193363, + "loss": 1.2882, + "step": 266 + }, + { + "epoch": 0.04754273504273504, + "grad_norm": 0.36698561906814575, + "learning_rate": 0.0001999402387222127, + "loss": 1.1486, + "step": 267 + }, + { + "epoch": 0.04772079772079772, + "grad_norm": 0.3854144215583801, + "learning_rate": 0.00019993975389437038, + "loss": 0.8115, + "step": 268 + }, + { + "epoch": 0.0478988603988604, + "grad_norm": 0.41512808203697205, + "learning_rate": 0.0001999392671084162, + "loss": 1.0959, + "step": 269 + }, + { + "epoch": 0.04807692307692308, + "grad_norm": 0.3869563341140747, + "learning_rate": 0.0001999387783643597, + "loss": 1.087, + "step": 270 + }, + { + "epoch": 0.048254985754985755, + "grad_norm": 0.4649744927883148, + "learning_rate": 0.00019993828766221044, + "loss": 1.0011, + "step": 271 + }, + { + "epoch": 0.04843304843304843, + "grad_norm": 0.40331923961639404, + "learning_rate": 0.00019993779500197803, + "loss": 1.1463, + "step": 272 + }, + { + "epoch": 0.04861111111111111, + "grad_norm": 0.3826279938220978, + "learning_rate": 0.0001999373003836721, + "loss": 1.1491, + "step": 273 + }, + { + "epoch": 0.04878917378917379, + "grad_norm": 0.3967166543006897, + "learning_rate": 0.00019993680380730243, + "loss": 1.1462, + "step": 274 + }, + { + "epoch": 0.04896723646723647, + "grad_norm": 0.4298507869243622, + "learning_rate": 0.00019993630527287865, + "loss": 1.2471, + "step": 275 + }, + { + "epoch": 0.049145299145299144, + "grad_norm": 0.41486215591430664, + 
"learning_rate": 0.0001999358047804106, + "loss": 1.287, + "step": 276 + }, + { + "epoch": 0.049323361823361826, + "grad_norm": 0.3914124369621277, + "learning_rate": 0.00019993530232990803, + "loss": 1.0935, + "step": 277 + }, + { + "epoch": 0.0495014245014245, + "grad_norm": 0.39888378977775574, + "learning_rate": 0.00019993479792138082, + "loss": 1.2347, + "step": 278 + }, + { + "epoch": 0.049679487179487176, + "grad_norm": 0.3911665678024292, + "learning_rate": 0.00019993429155483884, + "loss": 1.0917, + "step": 279 + }, + { + "epoch": 0.04985754985754986, + "grad_norm": 0.42871445417404175, + "learning_rate": 0.00019993378323029197, + "loss": 1.0277, + "step": 280 + }, + { + "epoch": 0.050035612535612534, + "grad_norm": 0.35397860407829285, + "learning_rate": 0.00019993327294775027, + "loss": 0.9549, + "step": 281 + }, + { + "epoch": 0.050213675213675216, + "grad_norm": 0.4528059959411621, + "learning_rate": 0.00019993276070722364, + "loss": 1.2338, + "step": 282 + }, + { + "epoch": 0.05039173789173789, + "grad_norm": 0.354735791683197, + "learning_rate": 0.00019993224650872218, + "loss": 1.1892, + "step": 283 + }, + { + "epoch": 0.05056980056980057, + "grad_norm": 0.44407567381858826, + "learning_rate": 0.00019993173035225592, + "loss": 1.1621, + "step": 284 + }, + { + "epoch": 0.05074786324786325, + "grad_norm": 0.4177244305610657, + "learning_rate": 0.000199931212237835, + "loss": 1.1184, + "step": 285 + }, + { + "epoch": 0.05092592592592592, + "grad_norm": 0.5627759695053101, + "learning_rate": 0.0001999306921654696, + "loss": 1.0755, + "step": 286 + }, + { + "epoch": 0.051103988603988605, + "grad_norm": 0.46767523884773254, + "learning_rate": 0.00019993017013516986, + "loss": 1.2654, + "step": 287 + }, + { + "epoch": 0.05128205128205128, + "grad_norm": 0.4163128733634949, + "learning_rate": 0.000199929646146946, + "loss": 1.1307, + "step": 288 + }, + { + "epoch": 0.05146011396011396, + "grad_norm": 0.36954161524772644, + "learning_rate": 
0.00019992912020080832, + "loss": 0.8274, + "step": 289 + }, + { + "epoch": 0.05163817663817664, + "grad_norm": 0.4770594835281372, + "learning_rate": 0.00019992859229676712, + "loss": 1.2235, + "step": 290 + }, + { + "epoch": 0.05181623931623932, + "grad_norm": 0.4174608290195465, + "learning_rate": 0.00019992806243483274, + "loss": 1.2893, + "step": 291 + }, + { + "epoch": 0.051994301994301995, + "grad_norm": 0.3794898986816406, + "learning_rate": 0.00019992753061501555, + "loss": 1.104, + "step": 292 + }, + { + "epoch": 0.05217236467236467, + "grad_norm": 0.3912592828273773, + "learning_rate": 0.000199926996837326, + "loss": 1.0043, + "step": 293 + }, + { + "epoch": 0.05235042735042735, + "grad_norm": 0.39641159772872925, + "learning_rate": 0.00019992646110177448, + "loss": 1.083, + "step": 294 + }, + { + "epoch": 0.05252849002849003, + "grad_norm": 0.3518857955932617, + "learning_rate": 0.00019992592340837157, + "loss": 0.9275, + "step": 295 + }, + { + "epoch": 0.05270655270655271, + "grad_norm": 0.3955721855163574, + "learning_rate": 0.00019992538375712777, + "loss": 1.0153, + "step": 296 + }, + { + "epoch": 0.052884615384615384, + "grad_norm": 0.3837333023548126, + "learning_rate": 0.00019992484214805364, + "loss": 1.1664, + "step": 297 + }, + { + "epoch": 0.053062678062678066, + "grad_norm": 0.39400920271873474, + "learning_rate": 0.0001999242985811598, + "loss": 1.0532, + "step": 298 + }, + { + "epoch": 0.05324074074074074, + "grad_norm": 0.39258649945259094, + "learning_rate": 0.00019992375305645692, + "loss": 1.0081, + "step": 299 + }, + { + "epoch": 0.053418803418803416, + "grad_norm": 0.49768248200416565, + "learning_rate": 0.00019992320557395566, + "loss": 1.2553, + "step": 300 + }, + { + "epoch": 0.0535968660968661, + "grad_norm": 0.364776074886322, + "learning_rate": 0.00019992265613366677, + "loss": 1.0582, + "step": 301 + }, + { + "epoch": 0.053774928774928774, + "grad_norm": 0.47317907214164734, + "learning_rate": 0.00019992210473560097, + "loss": 
1.3114, + "step": 302 + }, + { + "epoch": 0.053952991452991456, + "grad_norm": 0.3706119656562805, + "learning_rate": 0.00019992155137976917, + "loss": 0.9554, + "step": 303 + }, + { + "epoch": 0.05413105413105413, + "grad_norm": 0.42809563875198364, + "learning_rate": 0.0001999209960661821, + "loss": 1.306, + "step": 304 + }, + { + "epoch": 0.054309116809116806, + "grad_norm": 0.4514487385749817, + "learning_rate": 0.00019992043879485066, + "loss": 1.0147, + "step": 305 + }, + { + "epoch": 0.05448717948717949, + "grad_norm": 0.36672836542129517, + "learning_rate": 0.0001999198795657858, + "loss": 1.1392, + "step": 306 + }, + { + "epoch": 0.05466524216524216, + "grad_norm": 0.4206554889678955, + "learning_rate": 0.00019991931837899847, + "loss": 1.2405, + "step": 307 + }, + { + "epoch": 0.054843304843304845, + "grad_norm": 0.46168261766433716, + "learning_rate": 0.00019991875523449966, + "loss": 1.2707, + "step": 308 + }, + { + "epoch": 0.05502136752136752, + "grad_norm": 0.39503365755081177, + "learning_rate": 0.00019991819013230039, + "loss": 1.0776, + "step": 309 + }, + { + "epoch": 0.0551994301994302, + "grad_norm": 0.35244834423065186, + "learning_rate": 0.00019991762307241178, + "loss": 1.0864, + "step": 310 + }, + { + "epoch": 0.05537749287749288, + "grad_norm": 0.3865319490432739, + "learning_rate": 0.0001999170540548449, + "loss": 1.3659, + "step": 311 + }, + { + "epoch": 0.05555555555555555, + "grad_norm": 0.3666876554489136, + "learning_rate": 0.0001999164830796109, + "loss": 0.9884, + "step": 312 + }, + { + "epoch": 0.055733618233618235, + "grad_norm": 0.4278281629085541, + "learning_rate": 0.00019991591014672096, + "loss": 1.1522, + "step": 313 + }, + { + "epoch": 0.05591168091168091, + "grad_norm": 0.4172627031803131, + "learning_rate": 0.0001999153352561863, + "loss": 1.2527, + "step": 314 + }, + { + "epoch": 0.05608974358974359, + "grad_norm": 0.38872212171554565, + "learning_rate": 0.00019991475840801823, + "loss": 1.2985, + "step": 315 + }, + { + 
"epoch": 0.05626780626780627, + "grad_norm": 0.4160458445549011, + "learning_rate": 0.00019991417960222804, + "loss": 1.1347, + "step": 316 + }, + { + "epoch": 0.05644586894586895, + "grad_norm": 0.5169723033905029, + "learning_rate": 0.00019991359883882705, + "loss": 1.0819, + "step": 317 + }, + { + "epoch": 0.056623931623931624, + "grad_norm": 0.42306259274482727, + "learning_rate": 0.0001999130161178266, + "loss": 1.3139, + "step": 318 + }, + { + "epoch": 0.0568019943019943, + "grad_norm": 0.41975873708724976, + "learning_rate": 0.00019991243143923816, + "loss": 1.2277, + "step": 319 + }, + { + "epoch": 0.05698005698005698, + "grad_norm": 0.3873472511768341, + "learning_rate": 0.00019991184480307324, + "loss": 1.156, + "step": 320 + }, + { + "epoch": 0.057158119658119656, + "grad_norm": 0.43656104803085327, + "learning_rate": 0.0001999112562093432, + "loss": 1.2344, + "step": 321 + }, + { + "epoch": 0.05733618233618234, + "grad_norm": 0.3738791048526764, + "learning_rate": 0.00019991066565805968, + "loss": 0.9573, + "step": 322 + }, + { + "epoch": 0.05751424501424501, + "grad_norm": 0.3838156461715698, + "learning_rate": 0.00019991007314923418, + "loss": 0.9274, + "step": 323 + }, + { + "epoch": 0.057692307692307696, + "grad_norm": 0.4564770758152008, + "learning_rate": 0.00019990947868287837, + "loss": 1.0756, + "step": 324 + }, + { + "epoch": 0.05787037037037037, + "grad_norm": 0.4560079872608185, + "learning_rate": 0.00019990888225900386, + "loss": 1.1508, + "step": 325 + }, + { + "epoch": 0.058048433048433046, + "grad_norm": 0.44356057047843933, + "learning_rate": 0.00019990828387762236, + "loss": 1.2323, + "step": 326 + }, + { + "epoch": 0.05822649572649573, + "grad_norm": 0.46390119194984436, + "learning_rate": 0.00019990768353874553, + "loss": 1.0031, + "step": 327 + }, + { + "epoch": 0.0584045584045584, + "grad_norm": 0.4502357244491577, + "learning_rate": 0.00019990708124238525, + "loss": 1.3454, + "step": 328 + }, + { + "epoch": 0.058582621082621085, + 
"grad_norm": 0.3979945182800293, + "learning_rate": 0.0001999064769885532, + "loss": 1.2833, + "step": 329 + }, + { + "epoch": 0.05876068376068376, + "grad_norm": 0.3899286687374115, + "learning_rate": 0.00019990587077726128, + "loss": 1.0175, + "step": 330 + }, + { + "epoch": 0.05893874643874644, + "grad_norm": 0.41422948241233826, + "learning_rate": 0.00019990526260852139, + "loss": 1.1151, + "step": 331 + }, + { + "epoch": 0.05911680911680912, + "grad_norm": 0.4266608953475952, + "learning_rate": 0.0001999046524823454, + "loss": 1.1119, + "step": 332 + }, + { + "epoch": 0.05929487179487179, + "grad_norm": 0.46563324332237244, + "learning_rate": 0.00019990404039874524, + "loss": 1.2358, + "step": 333 + }, + { + "epoch": 0.059472934472934474, + "grad_norm": 0.4404347240924835, + "learning_rate": 0.00019990342635773297, + "loss": 1.1748, + "step": 334 + }, + { + "epoch": 0.05965099715099715, + "grad_norm": 0.5133237838745117, + "learning_rate": 0.00019990281035932062, + "loss": 1.1649, + "step": 335 + }, + { + "epoch": 0.05982905982905983, + "grad_norm": 0.3593895435333252, + "learning_rate": 0.00019990219240352018, + "loss": 1.0318, + "step": 336 + }, + { + "epoch": 0.06000712250712251, + "grad_norm": 0.40554583072662354, + "learning_rate": 0.00019990157249034384, + "loss": 1.1202, + "step": 337 + }, + { + "epoch": 0.06018518518518518, + "grad_norm": 0.3770706057548523, + "learning_rate": 0.00019990095061980372, + "loss": 0.9908, + "step": 338 + }, + { + "epoch": 0.060363247863247864, + "grad_norm": 0.39676955342292786, + "learning_rate": 0.000199900326791912, + "loss": 0.8176, + "step": 339 + }, + { + "epoch": 0.06054131054131054, + "grad_norm": 0.41448578238487244, + "learning_rate": 0.00019989970100668086, + "loss": 1.2877, + "step": 340 + }, + { + "epoch": 0.06071937321937322, + "grad_norm": 0.4200015068054199, + "learning_rate": 0.00019989907326412265, + "loss": 1.2293, + "step": 341 + }, + { + "epoch": 0.060897435897435896, + "grad_norm": 
0.47350621223449707, + "learning_rate": 0.0001998984435642496, + "loss": 1.2331, + "step": 342 + }, + { + "epoch": 0.06107549857549858, + "grad_norm": 0.47050634026527405, + "learning_rate": 0.00019989781190707406, + "loss": 0.8888, + "step": 343 + }, + { + "epoch": 0.06125356125356125, + "grad_norm": 0.4994896948337555, + "learning_rate": 0.00019989717829260842, + "loss": 1.0921, + "step": 344 + }, + { + "epoch": 0.06143162393162393, + "grad_norm": 0.36340200901031494, + "learning_rate": 0.0001998965427208651, + "loss": 0.9777, + "step": 345 + }, + { + "epoch": 0.06160968660968661, + "grad_norm": 0.3538152873516083, + "learning_rate": 0.00019989590519185654, + "loss": 1.0055, + "step": 346 + }, + { + "epoch": 0.061787749287749286, + "grad_norm": 0.5388944149017334, + "learning_rate": 0.00019989526570559526, + "loss": 1.1001, + "step": 347 + }, + { + "epoch": 0.06196581196581197, + "grad_norm": 0.4411574602127075, + "learning_rate": 0.00019989462426209373, + "loss": 1.0038, + "step": 348 + }, + { + "epoch": 0.06214387464387464, + "grad_norm": 0.3930876851081848, + "learning_rate": 0.00019989398086136455, + "loss": 1.1534, + "step": 349 + }, + { + "epoch": 0.062321937321937325, + "grad_norm": 0.47357070446014404, + "learning_rate": 0.00019989333550342033, + "loss": 1.2687, + "step": 350 + }, + { + "epoch": 0.0625, + "grad_norm": 0.40302303433418274, + "learning_rate": 0.00019989268818827372, + "loss": 1.1894, + "step": 351 + }, + { + "epoch": 0.06267806267806268, + "grad_norm": 0.4470510184764862, + "learning_rate": 0.00019989203891593738, + "loss": 1.2207, + "step": 352 + }, + { + "epoch": 0.06285612535612535, + "grad_norm": 0.42235100269317627, + "learning_rate": 0.00019989138768642406, + "loss": 1.2086, + "step": 353 + }, + { + "epoch": 0.06303418803418803, + "grad_norm": 0.38305309414863586, + "learning_rate": 0.0001998907344997465, + "loss": 1.0473, + "step": 354 + }, + { + "epoch": 0.06321225071225071, + "grad_norm": 0.3893027901649475, + "learning_rate": 
0.0001998900793559175, + "loss": 1.1746, + "step": 355 + }, + { + "epoch": 0.0633903133903134, + "grad_norm": 0.41206735372543335, + "learning_rate": 0.0001998894222549499, + "loss": 1.188, + "step": 356 + }, + { + "epoch": 0.06356837606837606, + "grad_norm": 0.3700513243675232, + "learning_rate": 0.00019988876319685658, + "loss": 0.9862, + "step": 357 + }, + { + "epoch": 0.06374643874643875, + "grad_norm": 0.3708794116973877, + "learning_rate": 0.0001998881021816504, + "loss": 1.2003, + "step": 358 + }, + { + "epoch": 0.06392450142450143, + "grad_norm": 0.4058014154434204, + "learning_rate": 0.00019988743920934442, + "loss": 1.2311, + "step": 359 + }, + { + "epoch": 0.0641025641025641, + "grad_norm": 0.39134132862091064, + "learning_rate": 0.00019988677427995155, + "loss": 1.001, + "step": 360 + }, + { + "epoch": 0.06428062678062678, + "grad_norm": 0.3853437602519989, + "learning_rate": 0.00019988610739348484, + "loss": 1.0725, + "step": 361 + }, + { + "epoch": 0.06445868945868946, + "grad_norm": 0.47114330530166626, + "learning_rate": 0.00019988543854995735, + "loss": 1.2196, + "step": 362 + }, + { + "epoch": 0.06463675213675214, + "grad_norm": 0.40465688705444336, + "learning_rate": 0.00019988476774938216, + "loss": 1.1869, + "step": 363 + }, + { + "epoch": 0.06481481481481481, + "grad_norm": 0.40301886200904846, + "learning_rate": 0.00019988409499177245, + "loss": 1.1765, + "step": 364 + }, + { + "epoch": 0.0649928774928775, + "grad_norm": 0.43443185091018677, + "learning_rate": 0.0001998834202771414, + "loss": 1.2022, + "step": 365 + }, + { + "epoch": 0.06517094017094018, + "grad_norm": 0.4712986350059509, + "learning_rate": 0.00019988274360550217, + "loss": 1.156, + "step": 366 + }, + { + "epoch": 0.06534900284900284, + "grad_norm": 0.4524450898170471, + "learning_rate": 0.00019988206497686815, + "loss": 1.2917, + "step": 367 + }, + { + "epoch": 0.06552706552706553, + "grad_norm": 0.40302205085754395, + "learning_rate": 0.0001998813843912525, + "loss": 
0.9993, + "step": 368 + }, + { + "epoch": 0.06570512820512821, + "grad_norm": 0.39435216784477234, + "learning_rate": 0.00019988070184866864, + "loss": 1.0914, + "step": 369 + }, + { + "epoch": 0.06588319088319089, + "grad_norm": 0.39267390966415405, + "learning_rate": 0.00019988001734912988, + "loss": 1.3138, + "step": 370 + }, + { + "epoch": 0.06606125356125356, + "grad_norm": 0.38351675868034363, + "learning_rate": 0.00019987933089264968, + "loss": 1.0997, + "step": 371 + }, + { + "epoch": 0.06623931623931624, + "grad_norm": 0.3294839859008789, + "learning_rate": 0.00019987864247924145, + "loss": 0.9656, + "step": 372 + }, + { + "epoch": 0.06641737891737892, + "grad_norm": 0.45333364605903625, + "learning_rate": 0.00019987795210891872, + "loss": 1.095, + "step": 373 + }, + { + "epoch": 0.06659544159544159, + "grad_norm": 0.4362282454967499, + "learning_rate": 0.00019987725978169501, + "loss": 1.2103, + "step": 374 + }, + { + "epoch": 0.06677350427350427, + "grad_norm": 0.41314780712127686, + "learning_rate": 0.00019987656549758385, + "loss": 1.2115, + "step": 375 + }, + { + "epoch": 0.06695156695156695, + "grad_norm": 0.4230864644050598, + "learning_rate": 0.00019987586925659888, + "loss": 1.17, + "step": 376 + }, + { + "epoch": 0.06712962962962964, + "grad_norm": 0.4703855812549591, + "learning_rate": 0.00019987517105875372, + "loss": 1.367, + "step": 377 + }, + { + "epoch": 0.0673076923076923, + "grad_norm": 0.4671297073364258, + "learning_rate": 0.00019987447090406206, + "loss": 1.2543, + "step": 378 + }, + { + "epoch": 0.06748575498575499, + "grad_norm": 0.43746981024742126, + "learning_rate": 0.0001998737687925376, + "loss": 1.214, + "step": 379 + }, + { + "epoch": 0.06766381766381767, + "grad_norm": 0.40889596939086914, + "learning_rate": 0.00019987306472419412, + "loss": 1.0496, + "step": 380 + }, + { + "epoch": 0.06784188034188034, + "grad_norm": 0.3677358627319336, + "learning_rate": 0.0001998723586990454, + "loss": 1.1242, + "step": 381 + }, + { + 
"epoch": 0.06801994301994302, + "grad_norm": 0.3892628848552704, + "learning_rate": 0.00019987165071710527, + "loss": 1.0246, + "step": 382 + }, + { + "epoch": 0.0681980056980057, + "grad_norm": 0.4281293749809265, + "learning_rate": 0.00019987094077838764, + "loss": 1.2817, + "step": 383 + }, + { + "epoch": 0.06837606837606838, + "grad_norm": 0.45030340552330017, + "learning_rate": 0.00019987022888290636, + "loss": 1.159, + "step": 384 + }, + { + "epoch": 0.06855413105413105, + "grad_norm": 0.6327905058860779, + "learning_rate": 0.00019986951503067545, + "loss": 0.9577, + "step": 385 + }, + { + "epoch": 0.06873219373219373, + "grad_norm": 0.40339627861976624, + "learning_rate": 0.0001998687992217088, + "loss": 1.138, + "step": 386 + }, + { + "epoch": 0.06891025641025642, + "grad_norm": 0.4018291234970093, + "learning_rate": 0.00019986808145602052, + "loss": 0.9109, + "step": 387 + }, + { + "epoch": 0.06908831908831908, + "grad_norm": 0.41566264629364014, + "learning_rate": 0.00019986736173362464, + "loss": 1.1516, + "step": 388 + }, + { + "epoch": 0.06926638176638177, + "grad_norm": 0.3569067418575287, + "learning_rate": 0.00019986664005453527, + "loss": 1.2329, + "step": 389 + }, + { + "epoch": 0.06944444444444445, + "grad_norm": 0.3959648907184601, + "learning_rate": 0.0001998659164187665, + "loss": 1.1041, + "step": 390 + }, + { + "epoch": 0.06962250712250712, + "grad_norm": 0.42853206396102905, + "learning_rate": 0.00019986519082633257, + "loss": 1.0859, + "step": 391 + }, + { + "epoch": 0.0698005698005698, + "grad_norm": 0.42005518078804016, + "learning_rate": 0.0001998644632772477, + "loss": 1.2017, + "step": 392 + }, + { + "epoch": 0.06997863247863248, + "grad_norm": 0.4296947419643402, + "learning_rate": 0.00019986373377152612, + "loss": 1.1464, + "step": 393 + }, + { + "epoch": 0.07015669515669516, + "grad_norm": 0.394747793674469, + "learning_rate": 0.0001998630023091821, + "loss": 1.0316, + "step": 394 + }, + { + "epoch": 0.07033475783475783, + 
"grad_norm": 0.3779357969760895, + "learning_rate": 0.00019986226889023002, + "loss": 1.1081, + "step": 395 + }, + { + "epoch": 0.07051282051282051, + "grad_norm": 0.4271804690361023, + "learning_rate": 0.00019986153351468424, + "loss": 0.985, + "step": 396 + }, + { + "epoch": 0.0706908831908832, + "grad_norm": 0.49412235617637634, + "learning_rate": 0.00019986079618255912, + "loss": 1.2606, + "step": 397 + }, + { + "epoch": 0.07086894586894586, + "grad_norm": 0.43657439947128296, + "learning_rate": 0.00019986005689386915, + "loss": 1.2266, + "step": 398 + }, + { + "epoch": 0.07104700854700854, + "grad_norm": 0.4060729444026947, + "learning_rate": 0.0001998593156486288, + "loss": 1.1787, + "step": 399 + }, + { + "epoch": 0.07122507122507123, + "grad_norm": 0.387046217918396, + "learning_rate": 0.00019985857244685264, + "loss": 0.9411, + "step": 400 + }, + { + "epoch": 0.07140313390313391, + "grad_norm": 0.4243999123573303, + "learning_rate": 0.00019985782728855516, + "loss": 1.2024, + "step": 401 + }, + { + "epoch": 0.07158119658119658, + "grad_norm": 0.43113812804222107, + "learning_rate": 0.000199857080173751, + "loss": 1.1246, + "step": 402 + }, + { + "epoch": 0.07175925925925926, + "grad_norm": 0.4653271436691284, + "learning_rate": 0.0001998563311024548, + "loss": 1.2343, + "step": 403 + }, + { + "epoch": 0.07193732193732194, + "grad_norm": 0.43260812759399414, + "learning_rate": 0.0001998555800746812, + "loss": 0.9543, + "step": 404 + }, + { + "epoch": 0.07211538461538461, + "grad_norm": 0.4635484516620636, + "learning_rate": 0.00019985482709044495, + "loss": 1.1091, + "step": 405 + }, + { + "epoch": 0.07229344729344729, + "grad_norm": 0.38362643122673035, + "learning_rate": 0.00019985407214976076, + "loss": 1.2584, + "step": 406 + }, + { + "epoch": 0.07247150997150997, + "grad_norm": 0.4068310558795929, + "learning_rate": 0.00019985331525264351, + "loss": 1.1944, + "step": 407 + }, + { + "epoch": 0.07264957264957266, + "grad_norm": 0.43909943103790283, + 
"learning_rate": 0.00019985255639910795, + "loss": 1.3748, + "step": 408 + }, + { + "epoch": 0.07282763532763532, + "grad_norm": 0.48674601316452026, + "learning_rate": 0.000199851795589169, + "loss": 1.2684, + "step": 409 + }, + { + "epoch": 0.073005698005698, + "grad_norm": 0.4218580722808838, + "learning_rate": 0.0001998510328228415, + "loss": 1.168, + "step": 410 + }, + { + "epoch": 0.07318376068376069, + "grad_norm": 0.4688236117362976, + "learning_rate": 0.00019985026810014046, + "loss": 1.3088, + "step": 411 + }, + { + "epoch": 0.07336182336182336, + "grad_norm": 0.3863612711429596, + "learning_rate": 0.00019984950142108083, + "loss": 1.0261, + "step": 412 + }, + { + "epoch": 0.07353988603988604, + "grad_norm": 0.4177640378475189, + "learning_rate": 0.00019984873278567765, + "loss": 1.1985, + "step": 413 + }, + { + "epoch": 0.07371794871794872, + "grad_norm": 0.4645586311817169, + "learning_rate": 0.00019984796219394592, + "loss": 1.2463, + "step": 414 + }, + { + "epoch": 0.0738960113960114, + "grad_norm": 0.5051766633987427, + "learning_rate": 0.00019984718964590083, + "loss": 1.3031, + "step": 415 + }, + { + "epoch": 0.07407407407407407, + "grad_norm": 0.4200040400028229, + "learning_rate": 0.0001998464151415575, + "loss": 1.0842, + "step": 416 + }, + { + "epoch": 0.07425213675213675, + "grad_norm": 0.34211036562919617, + "learning_rate": 0.000199845638680931, + "loss": 0.9659, + "step": 417 + }, + { + "epoch": 0.07443019943019943, + "grad_norm": 0.3553323447704315, + "learning_rate": 0.00019984486026403668, + "loss": 1.0102, + "step": 418 + }, + { + "epoch": 0.0746082621082621, + "grad_norm": 0.4967300295829773, + "learning_rate": 0.00019984407989088974, + "loss": 1.3125, + "step": 419 + }, + { + "epoch": 0.07478632478632478, + "grad_norm": 0.41649797558784485, + "learning_rate": 0.00019984329756150544, + "loss": 1.3092, + "step": 420 + }, + { + "epoch": 0.07496438746438747, + "grad_norm": 0.43825802206993103, + "learning_rate": 0.00019984251327589912, + 
"loss": 1.3678, + "step": 421 + }, + { + "epoch": 0.07514245014245015, + "grad_norm": 0.363394170999527, + "learning_rate": 0.00019984172703408617, + "loss": 1.305, + "step": 422 + }, + { + "epoch": 0.07532051282051282, + "grad_norm": 0.411563903093338, + "learning_rate": 0.000199840938836082, + "loss": 1.4248, + "step": 423 + }, + { + "epoch": 0.0754985754985755, + "grad_norm": 0.40548190474510193, + "learning_rate": 0.000199840148681902, + "loss": 1.1081, + "step": 424 + }, + { + "epoch": 0.07567663817663818, + "grad_norm": 0.3781099021434784, + "learning_rate": 0.00019983935657156171, + "loss": 1.185, + "step": 425 + }, + { + "epoch": 0.07585470085470085, + "grad_norm": 0.46597573161125183, + "learning_rate": 0.00019983856250507662, + "loss": 1.119, + "step": 426 + }, + { + "epoch": 0.07603276353276353, + "grad_norm": 0.3988197147846222, + "learning_rate": 0.00019983776648246232, + "loss": 1.206, + "step": 427 + }, + { + "epoch": 0.07621082621082621, + "grad_norm": 0.41210901737213135, + "learning_rate": 0.00019983696850373433, + "loss": 1.1843, + "step": 428 + }, + { + "epoch": 0.0763888888888889, + "grad_norm": 0.41870948672294617, + "learning_rate": 0.00019983616856890837, + "loss": 1.2248, + "step": 429 + }, + { + "epoch": 0.07656695156695156, + "grad_norm": 0.4320056140422821, + "learning_rate": 0.00019983536667800007, + "loss": 0.9743, + "step": 430 + }, + { + "epoch": 0.07674501424501425, + "grad_norm": 0.48455503582954407, + "learning_rate": 0.00019983456283102517, + "loss": 1.0438, + "step": 431 + }, + { + "epoch": 0.07692307692307693, + "grad_norm": 0.38712427020072937, + "learning_rate": 0.00019983375702799935, + "loss": 1.2041, + "step": 432 + }, + { + "epoch": 0.0771011396011396, + "grad_norm": 0.3578857481479645, + "learning_rate": 0.0001998329492689385, + "loss": 1.1623, + "step": 433 + }, + { + "epoch": 0.07727920227920228, + "grad_norm": 0.43065932393074036, + "learning_rate": 0.00019983213955385834, + "loss": 1.3033, + "step": 434 + }, + { + 
"epoch": 0.07745726495726496, + "grad_norm": 0.4882095754146576, + "learning_rate": 0.00019983132788277484, + "loss": 1.1635, + "step": 435 + }, + { + "epoch": 0.07763532763532764, + "grad_norm": 0.3429015874862671, + "learning_rate": 0.00019983051425570382, + "loss": 0.7289, + "step": 436 + }, + { + "epoch": 0.07781339031339031, + "grad_norm": 0.4320310056209564, + "learning_rate": 0.00019982969867266128, + "loss": 1.3685, + "step": 437 + }, + { + "epoch": 0.07799145299145299, + "grad_norm": 0.39891982078552246, + "learning_rate": 0.00019982888113366314, + "loss": 1.0444, + "step": 438 + }, + { + "epoch": 0.07816951566951567, + "grad_norm": 0.3675695061683655, + "learning_rate": 0.00019982806163872547, + "loss": 1.0527, + "step": 439 + }, + { + "epoch": 0.07834757834757834, + "grad_norm": 0.42824694514274597, + "learning_rate": 0.0001998272401878643, + "loss": 1.166, + "step": 440 + }, + { + "epoch": 0.07852564102564102, + "grad_norm": 0.3721694350242615, + "learning_rate": 0.00019982641678109575, + "loss": 1.1328, + "step": 441 + }, + { + "epoch": 0.0787037037037037, + "grad_norm": 0.33899208903312683, + "learning_rate": 0.00019982559141843592, + "loss": 1.016, + "step": 442 + }, + { + "epoch": 0.07888176638176639, + "grad_norm": 0.4029340147972107, + "learning_rate": 0.000199824764099901, + "loss": 1.0076, + "step": 443 + }, + { + "epoch": 0.07905982905982906, + "grad_norm": 0.4169132113456726, + "learning_rate": 0.0001998239348255072, + "loss": 1.208, + "step": 444 + }, + { + "epoch": 0.07923789173789174, + "grad_norm": 0.3865824043750763, + "learning_rate": 0.00019982310359527075, + "loss": 1.067, + "step": 445 + }, + { + "epoch": 0.07941595441595442, + "grad_norm": 0.4218919277191162, + "learning_rate": 0.00019982227040920796, + "loss": 1.195, + "step": 446 + }, + { + "epoch": 0.07959401709401709, + "grad_norm": 0.40504586696624756, + "learning_rate": 0.00019982143526733512, + "loss": 1.0188, + "step": 447 + }, + { + "epoch": 0.07977207977207977, + 
"grad_norm": 0.38330578804016113, + "learning_rate": 0.00019982059816966863, + "loss": 1.0484, + "step": 448 + }, + { + "epoch": 0.07995014245014245, + "grad_norm": 0.43731689453125, + "learning_rate": 0.00019981975911622488, + "loss": 1.074, + "step": 449 + }, + { + "epoch": 0.08012820512820513, + "grad_norm": 0.40858447551727295, + "learning_rate": 0.00019981891810702033, + "loss": 1.0008, + "step": 450 + }, + { + "epoch": 0.0803062678062678, + "grad_norm": 0.4031754732131958, + "learning_rate": 0.00019981807514207143, + "loss": 1.2179, + "step": 451 + }, + { + "epoch": 0.08048433048433049, + "grad_norm": 0.41920867562294006, + "learning_rate": 0.00019981723022139466, + "loss": 1.1406, + "step": 452 + }, + { + "epoch": 0.08066239316239317, + "grad_norm": 0.40305474400520325, + "learning_rate": 0.00019981638334500668, + "loss": 1.098, + "step": 453 + }, + { + "epoch": 0.08084045584045584, + "grad_norm": 0.4564182460308075, + "learning_rate": 0.00019981553451292396, + "loss": 1.419, + "step": 454 + }, + { + "epoch": 0.08101851851851852, + "grad_norm": 0.3832945227622986, + "learning_rate": 0.00019981468372516322, + "loss": 1.0919, + "step": 455 + }, + { + "epoch": 0.0811965811965812, + "grad_norm": 0.43062624335289, + "learning_rate": 0.0001998138309817411, + "loss": 1.0458, + "step": 456 + }, + { + "epoch": 0.08137464387464387, + "grad_norm": 0.3871173560619354, + "learning_rate": 0.0001998129762826743, + "loss": 1.1391, + "step": 457 + }, + { + "epoch": 0.08155270655270655, + "grad_norm": 0.43423157930374146, + "learning_rate": 0.0001998121196279796, + "loss": 1.1132, + "step": 458 + }, + { + "epoch": 0.08173076923076923, + "grad_norm": 0.4341012239456177, + "learning_rate": 0.00019981126101767372, + "loss": 1.113, + "step": 459 + }, + { + "epoch": 0.08190883190883191, + "grad_norm": 0.36748576164245605, + "learning_rate": 0.00019981040045177352, + "loss": 0.8108, + "step": 460 + }, + { + "epoch": 0.08208689458689458, + "grad_norm": 0.43133220076560974, + 
"learning_rate": 0.00019980953793029586, + "loss": 1.1861, + "step": 461 + }, + { + "epoch": 0.08226495726495726, + "grad_norm": 0.37204909324645996, + "learning_rate": 0.00019980867345325767, + "loss": 0.9222, + "step": 462 + }, + { + "epoch": 0.08244301994301995, + "grad_norm": 0.43370047211647034, + "learning_rate": 0.00019980780702067582, + "loss": 1.2984, + "step": 463 + }, + { + "epoch": 0.08262108262108261, + "grad_norm": 0.4991510808467865, + "learning_rate": 0.00019980693863256736, + "loss": 1.2222, + "step": 464 + }, + { + "epoch": 0.0827991452991453, + "grad_norm": 0.44318175315856934, + "learning_rate": 0.00019980606828894927, + "loss": 1.2262, + "step": 465 + }, + { + "epoch": 0.08297720797720798, + "grad_norm": 0.380231648683548, + "learning_rate": 0.0001998051959898386, + "loss": 1.0274, + "step": 466 + }, + { + "epoch": 0.08315527065527066, + "grad_norm": 0.39519667625427246, + "learning_rate": 0.0001998043217352524, + "loss": 1.2499, + "step": 467 + }, + { + "epoch": 0.08333333333333333, + "grad_norm": 0.457499235868454, + "learning_rate": 0.0001998034455252079, + "loss": 1.0751, + "step": 468 + }, + { + "epoch": 0.08351139601139601, + "grad_norm": 0.368522584438324, + "learning_rate": 0.00019980256735972215, + "loss": 1.0776, + "step": 469 + }, + { + "epoch": 0.08368945868945869, + "grad_norm": 0.3768427073955536, + "learning_rate": 0.00019980168723881243, + "loss": 1.2198, + "step": 470 + }, + { + "epoch": 0.08386752136752136, + "grad_norm": 0.37045565247535706, + "learning_rate": 0.000199800805162496, + "loss": 1.1816, + "step": 471 + }, + { + "epoch": 0.08404558404558404, + "grad_norm": 0.4219281077384949, + "learning_rate": 0.0001997999211307901, + "loss": 1.0515, + "step": 472 + }, + { + "epoch": 0.08422364672364673, + "grad_norm": 0.3815271258354187, + "learning_rate": 0.00019979903514371207, + "loss": 1.1709, + "step": 473 + }, + { + "epoch": 0.08440170940170941, + "grad_norm": 0.4566493630409241, + "learning_rate": 0.00019979814720127924, 
+ "loss": 1.3063, + "step": 474 + }, + { + "epoch": 0.08457977207977208, + "grad_norm": 0.4043879806995392, + "learning_rate": 0.000199797257303509, + "loss": 1.0549, + "step": 475 + }, + { + "epoch": 0.08475783475783476, + "grad_norm": 0.3897830545902252, + "learning_rate": 0.00019979636545041886, + "loss": 1.1483, + "step": 476 + }, + { + "epoch": 0.08493589743589744, + "grad_norm": 0.36097025871276855, + "learning_rate": 0.00019979547164202622, + "loss": 1.1196, + "step": 477 + }, + { + "epoch": 0.08511396011396011, + "grad_norm": 0.3766986131668091, + "learning_rate": 0.00019979457587834863, + "loss": 1.0131, + "step": 478 + }, + { + "epoch": 0.08529202279202279, + "grad_norm": 0.39460286498069763, + "learning_rate": 0.00019979367815940364, + "loss": 1.1729, + "step": 479 + }, + { + "epoch": 0.08547008547008547, + "grad_norm": 0.4137469232082367, + "learning_rate": 0.00019979277848520885, + "loss": 1.2569, + "step": 480 + }, + { + "epoch": 0.08564814814814815, + "grad_norm": 0.464688777923584, + "learning_rate": 0.00019979187685578183, + "loss": 1.2064, + "step": 481 + }, + { + "epoch": 0.08582621082621082, + "grad_norm": 0.4245518147945404, + "learning_rate": 0.0001997909732711403, + "loss": 0.9812, + "step": 482 + }, + { + "epoch": 0.0860042735042735, + "grad_norm": 0.43368837237358093, + "learning_rate": 0.00019979006773130197, + "loss": 1.2822, + "step": 483 + }, + { + "epoch": 0.08618233618233619, + "grad_norm": 0.4232824444770813, + "learning_rate": 0.00019978916023628452, + "loss": 1.1446, + "step": 484 + }, + { + "epoch": 0.08636039886039885, + "grad_norm": 0.4183506369590759, + "learning_rate": 0.00019978825078610578, + "loss": 1.2605, + "step": 485 + }, + { + "epoch": 0.08653846153846154, + "grad_norm": 0.4391268491744995, + "learning_rate": 0.00019978733938078356, + "loss": 1.2165, + "step": 486 + }, + { + "epoch": 0.08671652421652422, + "grad_norm": 0.4139612317085266, + "learning_rate": 0.0001997864260203357, + "loss": 0.9389, + "step": 487 + }, + 
{ + "epoch": 0.0868945868945869, + "grad_norm": 0.4058656096458435, + "learning_rate": 0.00019978551070478013, + "loss": 1.0652, + "step": 488 + }, + { + "epoch": 0.08707264957264957, + "grad_norm": 0.42333099246025085, + "learning_rate": 0.00019978459343413473, + "loss": 1.119, + "step": 489 + }, + { + "epoch": 0.08725071225071225, + "grad_norm": 0.4573031961917877, + "learning_rate": 0.00019978367420841754, + "loss": 1.1546, + "step": 490 + }, + { + "epoch": 0.08742877492877493, + "grad_norm": 0.4161617159843445, + "learning_rate": 0.00019978275302764655, + "loss": 1.0836, + "step": 491 + }, + { + "epoch": 0.0876068376068376, + "grad_norm": 0.422145277261734, + "learning_rate": 0.00019978182989183977, + "loss": 1.1908, + "step": 492 + }, + { + "epoch": 0.08778490028490028, + "grad_norm": 0.4588126838207245, + "learning_rate": 0.00019978090480101532, + "loss": 1.1758, + "step": 493 + }, + { + "epoch": 0.08796296296296297, + "grad_norm": 0.4425722062587738, + "learning_rate": 0.00019977997775519132, + "loss": 1.088, + "step": 494 + }, + { + "epoch": 0.08814102564102565, + "grad_norm": 0.37860307097435, + "learning_rate": 0.00019977904875438594, + "loss": 1.1532, + "step": 495 + }, + { + "epoch": 0.08831908831908832, + "grad_norm": 0.40435823798179626, + "learning_rate": 0.00019977811779861733, + "loss": 1.1271, + "step": 496 + }, + { + "epoch": 0.088497150997151, + "grad_norm": 0.42578884959220886, + "learning_rate": 0.0001997771848879038, + "loss": 0.9889, + "step": 497 + }, + { + "epoch": 0.08867521367521368, + "grad_norm": 0.3439478874206543, + "learning_rate": 0.00019977625002226361, + "loss": 1.1273, + "step": 498 + }, + { + "epoch": 0.08885327635327635, + "grad_norm": 0.362341970205307, + "learning_rate": 0.00019977531320171504, + "loss": 1.0214, + "step": 499 + }, + { + "epoch": 0.08903133903133903, + "grad_norm": 0.4305768609046936, + "learning_rate": 0.0001997743744262765, + "loss": 1.2648, + "step": 500 + }, + { + "epoch": 0.08920940170940171, + 
"grad_norm": 0.35900023579597473, + "learning_rate": 0.00019977343369596636, + "loss": 1.0274, + "step": 501 + }, + { + "epoch": 0.0893874643874644, + "grad_norm": 0.4950818717479706, + "learning_rate": 0.00019977249101080306, + "loss": 1.1483, + "step": 502 + }, + { + "epoch": 0.08956552706552706, + "grad_norm": 0.3800346553325653, + "learning_rate": 0.00019977154637080503, + "loss": 1.0636, + "step": 503 + }, + { + "epoch": 0.08974358974358974, + "grad_norm": 0.46202352643013, + "learning_rate": 0.0001997705997759908, + "loss": 1.1544, + "step": 504 + }, + { + "epoch": 0.08992165242165243, + "grad_norm": 0.36818403005599976, + "learning_rate": 0.00019976965122637895, + "loss": 0.9824, + "step": 505 + }, + { + "epoch": 0.0900997150997151, + "grad_norm": 0.40248095989227295, + "learning_rate": 0.00019976870072198805, + "loss": 1.1002, + "step": 506 + }, + { + "epoch": 0.09027777777777778, + "grad_norm": 0.3841850459575653, + "learning_rate": 0.00019976774826283667, + "loss": 1.2433, + "step": 507 + }, + { + "epoch": 0.09045584045584046, + "grad_norm": 0.46892330050468445, + "learning_rate": 0.0001997667938489435, + "loss": 1.3194, + "step": 508 + }, + { + "epoch": 0.09063390313390314, + "grad_norm": 0.39059561491012573, + "learning_rate": 0.0001997658374803273, + "loss": 1.1778, + "step": 509 + }, + { + "epoch": 0.09081196581196581, + "grad_norm": 0.3793235421180725, + "learning_rate": 0.00019976487915700672, + "loss": 1.0659, + "step": 510 + }, + { + "epoch": 0.09099002849002849, + "grad_norm": 0.39067742228507996, + "learning_rate": 0.00019976391887900058, + "loss": 1.107, + "step": 511 + }, + { + "epoch": 0.09116809116809117, + "grad_norm": 0.40121713280677795, + "learning_rate": 0.00019976295664632772, + "loss": 1.102, + "step": 512 + }, + { + "epoch": 0.09134615384615384, + "grad_norm": 0.49830010533332825, + "learning_rate": 0.00019976199245900697, + "loss": 1.1701, + "step": 513 + }, + { + "epoch": 0.09152421652421652, + "grad_norm": 0.4536968171596527, + 
"learning_rate": 0.0001997610263170572, + "loss": 1.1067, + "step": 514 + }, + { + "epoch": 0.0917022792022792, + "grad_norm": 0.3832971453666687, + "learning_rate": 0.00019976005822049735, + "loss": 1.0991, + "step": 515 + }, + { + "epoch": 0.09188034188034189, + "grad_norm": 0.4093509614467621, + "learning_rate": 0.0001997590881693464, + "loss": 1.0565, + "step": 516 + }, + { + "epoch": 0.09205840455840456, + "grad_norm": 0.46073687076568604, + "learning_rate": 0.0001997581161636233, + "loss": 1.0057, + "step": 517 + }, + { + "epoch": 0.09223646723646724, + "grad_norm": 0.5001922845840454, + "learning_rate": 0.0001997571422033472, + "loss": 1.2639, + "step": 518 + }, + { + "epoch": 0.09241452991452992, + "grad_norm": 0.4620618224143982, + "learning_rate": 0.00019975616628853713, + "loss": 1.0966, + "step": 519 + }, + { + "epoch": 0.09259259259259259, + "grad_norm": 0.3788183927536011, + "learning_rate": 0.0001997551884192122, + "loss": 0.9783, + "step": 520 + }, + { + "epoch": 0.09277065527065527, + "grad_norm": 0.45589539408683777, + "learning_rate": 0.00019975420859539154, + "loss": 1.2194, + "step": 521 + }, + { + "epoch": 0.09294871794871795, + "grad_norm": 0.40747523307800293, + "learning_rate": 0.00019975322681709443, + "loss": 1.0349, + "step": 522 + }, + { + "epoch": 0.09312678062678063, + "grad_norm": 0.5045142769813538, + "learning_rate": 0.00019975224308434002, + "loss": 1.1373, + "step": 523 + }, + { + "epoch": 0.0933048433048433, + "grad_norm": 0.40352702140808105, + "learning_rate": 0.00019975125739714767, + "loss": 1.1236, + "step": 524 + }, + { + "epoch": 0.09348290598290598, + "grad_norm": 0.4301735758781433, + "learning_rate": 0.0001997502697555366, + "loss": 1.2932, + "step": 525 + }, + { + "epoch": 0.09366096866096867, + "grad_norm": 0.36800238490104675, + "learning_rate": 0.00019974928015952624, + "loss": 1.0734, + "step": 526 + }, + { + "epoch": 0.09383903133903133, + "grad_norm": 0.4027230143547058, + "learning_rate": 
0.00019974828860913594, + "loss": 1.2776, + "step": 527 + }, + { + "epoch": 0.09401709401709402, + "grad_norm": 0.42497140169143677, + "learning_rate": 0.0001997472951043851, + "loss": 1.248, + "step": 528 + }, + { + "epoch": 0.0941951566951567, + "grad_norm": 0.3888593018054962, + "learning_rate": 0.00019974629964529325, + "loss": 1.0231, + "step": 529 + }, + { + "epoch": 0.09437321937321937, + "grad_norm": 0.3761361241340637, + "learning_rate": 0.00019974530223187986, + "loss": 1.0216, + "step": 530 + }, + { + "epoch": 0.09455128205128205, + "grad_norm": 0.42192980647087097, + "learning_rate": 0.00019974430286416448, + "loss": 1.0731, + "step": 531 + }, + { + "epoch": 0.09472934472934473, + "grad_norm": 0.44244512915611267, + "learning_rate": 0.00019974330154216667, + "loss": 1.2793, + "step": 532 + }, + { + "epoch": 0.09490740740740741, + "grad_norm": 0.378252774477005, + "learning_rate": 0.0001997422982659061, + "loss": 1.0462, + "step": 533 + }, + { + "epoch": 0.09508547008547008, + "grad_norm": 0.45589110255241394, + "learning_rate": 0.00019974129303540236, + "loss": 1.1884, + "step": 534 + }, + { + "epoch": 0.09526353276353276, + "grad_norm": 0.33930808305740356, + "learning_rate": 0.0001997402858506752, + "loss": 0.8381, + "step": 535 + }, + { + "epoch": 0.09544159544159544, + "grad_norm": 0.45408427715301514, + "learning_rate": 0.0001997392767117443, + "loss": 1.2379, + "step": 536 + }, + { + "epoch": 0.09561965811965811, + "grad_norm": 0.44125741720199585, + "learning_rate": 0.0001997382656186295, + "loss": 1.1941, + "step": 537 + }, + { + "epoch": 0.0957977207977208, + "grad_norm": 0.4075697660446167, + "learning_rate": 0.00019973725257135054, + "loss": 1.0142, + "step": 538 + }, + { + "epoch": 0.09597578347578348, + "grad_norm": 0.4258415102958679, + "learning_rate": 0.00019973623756992733, + "loss": 1.0447, + "step": 539 + }, + { + "epoch": 0.09615384615384616, + "grad_norm": 0.2738485038280487, + "learning_rate": 0.0001997352206143797, + "loss": 
0.5521, + "step": 540 + }, + { + "epoch": 0.09633190883190883, + "grad_norm": 0.38815587759017944, + "learning_rate": 0.00019973420170472762, + "loss": 1.1052, + "step": 541 + }, + { + "epoch": 0.09650997150997151, + "grad_norm": 0.3909834027290344, + "learning_rate": 0.00019973318084099106, + "loss": 1.0494, + "step": 542 + }, + { + "epoch": 0.09668803418803419, + "grad_norm": 0.4517597258090973, + "learning_rate": 0.00019973215802318996, + "loss": 1.0611, + "step": 543 + }, + { + "epoch": 0.09686609686609686, + "grad_norm": 0.48659002780914307, + "learning_rate": 0.00019973113325134442, + "loss": 0.9967, + "step": 544 + }, + { + "epoch": 0.09704415954415954, + "grad_norm": 0.4039791524410248, + "learning_rate": 0.0001997301065254745, + "loss": 1.251, + "step": 545 + }, + { + "epoch": 0.09722222222222222, + "grad_norm": 0.3985383212566376, + "learning_rate": 0.0001997290778456003, + "loss": 1.2263, + "step": 546 + }, + { + "epoch": 0.0974002849002849, + "grad_norm": 0.4540637731552124, + "learning_rate": 0.00019972804721174199, + "loss": 1.2084, + "step": 547 + }, + { + "epoch": 0.09757834757834757, + "grad_norm": 0.36867982149124146, + "learning_rate": 0.00019972701462391977, + "loss": 0.9704, + "step": 548 + }, + { + "epoch": 0.09775641025641026, + "grad_norm": 0.40199780464172363, + "learning_rate": 0.00019972598008215385, + "loss": 1.1121, + "step": 549 + }, + { + "epoch": 0.09793447293447294, + "grad_norm": 0.42728984355926514, + "learning_rate": 0.00019972494358646455, + "loss": 1.1606, + "step": 550 + }, + { + "epoch": 0.0981125356125356, + "grad_norm": 0.4212374687194824, + "learning_rate": 0.0001997239051368721, + "loss": 1.3093, + "step": 551 + }, + { + "epoch": 0.09829059829059829, + "grad_norm": 0.3972226083278656, + "learning_rate": 0.0001997228647333969, + "loss": 1.1218, + "step": 552 + }, + { + "epoch": 0.09846866096866097, + "grad_norm": 0.43649932742118835, + "learning_rate": 0.00019972182237605935, + "loss": 1.2532, + "step": 553 + }, + { + 
"epoch": 0.09864672364672365, + "grad_norm": 0.3812280595302582, + "learning_rate": 0.0001997207780648798, + "loss": 1.0409, + "step": 554 + }, + { + "epoch": 0.09882478632478632, + "grad_norm": 0.41684821248054504, + "learning_rate": 0.00019971973179987878, + "loss": 0.9569, + "step": 555 + }, + { + "epoch": 0.099002849002849, + "grad_norm": 0.38081470131874084, + "learning_rate": 0.00019971868358107674, + "loss": 1.1615, + "step": 556 + }, + { + "epoch": 0.09918091168091168, + "grad_norm": 0.3702073097229004, + "learning_rate": 0.0001997176334084943, + "loss": 1.3907, + "step": 557 + }, + { + "epoch": 0.09935897435897435, + "grad_norm": 0.3625728189945221, + "learning_rate": 0.00019971658128215193, + "loss": 1.1897, + "step": 558 + }, + { + "epoch": 0.09953703703703703, + "grad_norm": 0.3815405070781708, + "learning_rate": 0.0001997155272020703, + "loss": 1.1473, + "step": 559 + }, + { + "epoch": 0.09971509971509972, + "grad_norm": 0.48664286732673645, + "learning_rate": 0.00019971447116827004, + "loss": 1.2462, + "step": 560 + }, + { + "epoch": 0.0998931623931624, + "grad_norm": 0.3708696663379669, + "learning_rate": 0.0001997134131807719, + "loss": 1.0979, + "step": 561 + }, + { + "epoch": 0.10007122507122507, + "grad_norm": 0.44511324167251587, + "learning_rate": 0.00019971235323959654, + "loss": 1.2313, + "step": 562 + }, + { + "epoch": 0.10024928774928775, + "grad_norm": 0.3687448799610138, + "learning_rate": 0.00019971129134476473, + "loss": 1.1526, + "step": 563 + }, + { + "epoch": 0.10042735042735043, + "grad_norm": 0.4506866931915283, + "learning_rate": 0.00019971022749629735, + "loss": 1.0003, + "step": 564 + }, + { + "epoch": 0.1006054131054131, + "grad_norm": 0.41910406947135925, + "learning_rate": 0.00019970916169421515, + "loss": 1.013, + "step": 565 + }, + { + "epoch": 0.10078347578347578, + "grad_norm": 0.39728936553001404, + "learning_rate": 0.0001997080939385391, + "loss": 1.0501, + "step": 566 + }, + { + "epoch": 0.10096153846153846, + 
"grad_norm": 0.41415902972221375, + "learning_rate": 0.00019970702422929005, + "loss": 1.0791, + "step": 567 + }, + { + "epoch": 0.10113960113960115, + "grad_norm": 0.45630788803100586, + "learning_rate": 0.00019970595256648896, + "loss": 1.2884, + "step": 568 + }, + { + "epoch": 0.10131766381766381, + "grad_norm": 0.4371698796749115, + "learning_rate": 0.00019970487895015686, + "loss": 1.0684, + "step": 569 + }, + { + "epoch": 0.1014957264957265, + "grad_norm": 0.4350591003894806, + "learning_rate": 0.00019970380338031477, + "loss": 1.2415, + "step": 570 + }, + { + "epoch": 0.10167378917378918, + "grad_norm": 0.4232708215713501, + "learning_rate": 0.00019970272585698382, + "loss": 1.2656, + "step": 571 + }, + { + "epoch": 0.10185185185185185, + "grad_norm": 0.3917689919471741, + "learning_rate": 0.00019970164638018502, + "loss": 1.0178, + "step": 572 + }, + { + "epoch": 0.10202991452991453, + "grad_norm": 0.4262804388999939, + "learning_rate": 0.0001997005649499396, + "loss": 1.1805, + "step": 573 + }, + { + "epoch": 0.10220797720797721, + "grad_norm": 0.5217884182929993, + "learning_rate": 0.0001996994815662687, + "loss": 1.2392, + "step": 574 + }, + { + "epoch": 0.10238603988603989, + "grad_norm": 0.4273875057697296, + "learning_rate": 0.00019969839622919358, + "loss": 1.0844, + "step": 575 + }, + { + "epoch": 0.10256410256410256, + "grad_norm": 0.41588085889816284, + "learning_rate": 0.00019969730893873547, + "loss": 1.2437, + "step": 576 + }, + { + "epoch": 0.10274216524216524, + "grad_norm": 0.41617709398269653, + "learning_rate": 0.0001996962196949157, + "loss": 0.9519, + "step": 577 + }, + { + "epoch": 0.10292022792022792, + "grad_norm": 0.4832979142665863, + "learning_rate": 0.00019969512849775565, + "loss": 1.1889, + "step": 578 + }, + { + "epoch": 0.10309829059829059, + "grad_norm": 0.3936060965061188, + "learning_rate": 0.0001996940353472766, + "loss": 0.9888, + "step": 579 + }, + { + "epoch": 0.10327635327635327, + "grad_norm": 0.4147680997848511, + 
"learning_rate": 0.00019969294024350004, + "loss": 1.0733, + "step": 580 + }, + { + "epoch": 0.10345441595441596, + "grad_norm": 0.37791356444358826, + "learning_rate": 0.00019969184318644742, + "loss": 1.212, + "step": 581 + }, + { + "epoch": 0.10363247863247864, + "grad_norm": 0.44297221302986145, + "learning_rate": 0.00019969074417614023, + "loss": 1.0535, + "step": 582 + }, + { + "epoch": 0.10381054131054131, + "grad_norm": 0.4032835066318512, + "learning_rate": 0.0001996896432126, + "loss": 1.1869, + "step": 583 + }, + { + "epoch": 0.10398860398860399, + "grad_norm": 0.49271953105926514, + "learning_rate": 0.00019968854029584827, + "loss": 1.1661, + "step": 584 + }, + { + "epoch": 0.10416666666666667, + "grad_norm": 0.362699031829834, + "learning_rate": 0.0001996874354259067, + "loss": 0.868, + "step": 585 + }, + { + "epoch": 0.10434472934472934, + "grad_norm": 0.401795357465744, + "learning_rate": 0.0001996863286027969, + "loss": 1.1045, + "step": 586 + }, + { + "epoch": 0.10452279202279202, + "grad_norm": 0.45380479097366333, + "learning_rate": 0.00019968521982654058, + "loss": 0.8503, + "step": 587 + }, + { + "epoch": 0.1047008547008547, + "grad_norm": 0.49759066104888916, + "learning_rate": 0.00019968410909715947, + "loss": 1.4073, + "step": 588 + }, + { + "epoch": 0.10487891737891739, + "grad_norm": 0.4421198070049286, + "learning_rate": 0.0001996829964146753, + "loss": 1.1512, + "step": 589 + }, + { + "epoch": 0.10505698005698005, + "grad_norm": 0.46675658226013184, + "learning_rate": 0.00019968188177910988, + "loss": 1.0132, + "step": 590 + }, + { + "epoch": 0.10523504273504274, + "grad_norm": 0.5710657238960266, + "learning_rate": 0.00019968076519048507, + "loss": 1.267, + "step": 591 + }, + { + "epoch": 0.10541310541310542, + "grad_norm": 0.4655563235282898, + "learning_rate": 0.00019967964664882276, + "loss": 1.1204, + "step": 592 + }, + { + "epoch": 0.10559116809116809, + "grad_norm": 0.3895256519317627, + "learning_rate": 0.00019967852615414478, + 
"loss": 1.0814, + "step": 593 + }, + { + "epoch": 0.10576923076923077, + "grad_norm": 0.424216091632843, + "learning_rate": 0.00019967740370647322, + "loss": 1.1663, + "step": 594 + }, + { + "epoch": 0.10594729344729345, + "grad_norm": 0.3978985846042633, + "learning_rate": 0.00019967627930582996, + "loss": 0.909, + "step": 595 + }, + { + "epoch": 0.10612535612535613, + "grad_norm": 0.47064995765686035, + "learning_rate": 0.00019967515295223705, + "loss": 1.2351, + "step": 596 + }, + { + "epoch": 0.1063034188034188, + "grad_norm": 0.42449644207954407, + "learning_rate": 0.0001996740246457166, + "loss": 0.9739, + "step": 597 + }, + { + "epoch": 0.10648148148148148, + "grad_norm": 0.39033401012420654, + "learning_rate": 0.00019967289438629066, + "loss": 1.0933, + "step": 598 + }, + { + "epoch": 0.10665954415954416, + "grad_norm": 0.4398612678050995, + "learning_rate": 0.00019967176217398143, + "loss": 1.2479, + "step": 599 + }, + { + "epoch": 0.10683760683760683, + "grad_norm": 0.3946632742881775, + "learning_rate": 0.00019967062800881107, + "loss": 1.0417, + "step": 600 + }, + { + "epoch": 0.10701566951566951, + "grad_norm": 0.5083445906639099, + "learning_rate": 0.0001996694918908018, + "loss": 1.1109, + "step": 601 + }, + { + "epoch": 0.1071937321937322, + "grad_norm": 0.477724552154541, + "learning_rate": 0.00019966835381997585, + "loss": 1.2891, + "step": 602 + }, + { + "epoch": 0.10737179487179487, + "grad_norm": 0.4110167920589447, + "learning_rate": 0.0001996672137963556, + "loss": 1.0555, + "step": 603 + }, + { + "epoch": 0.10754985754985755, + "grad_norm": 0.44078320264816284, + "learning_rate": 0.00019966607181996334, + "loss": 0.9188, + "step": 604 + }, + { + "epoch": 0.10772792022792023, + "grad_norm": 0.41251105070114136, + "learning_rate": 0.00019966492789082142, + "loss": 1.2592, + "step": 605 + }, + { + "epoch": 0.10790598290598291, + "grad_norm": 0.37701505422592163, + "learning_rate": 0.00019966378200895227, + "loss": 1.0233, + "step": 606 + }, + { 
+ "epoch": 0.10808404558404558, + "grad_norm": 0.44624966382980347, + "learning_rate": 0.00019966263417437835, + "loss": 1.2273, + "step": 607 + }, + { + "epoch": 0.10826210826210826, + "grad_norm": 0.3618549108505249, + "learning_rate": 0.00019966148438712214, + "loss": 0.9101, + "step": 608 + }, + { + "epoch": 0.10844017094017094, + "grad_norm": 0.384574294090271, + "learning_rate": 0.00019966033264720616, + "loss": 1.1769, + "step": 609 + }, + { + "epoch": 0.10861823361823361, + "grad_norm": 0.50872403383255, + "learning_rate": 0.000199659178954653, + "loss": 1.1213, + "step": 610 + }, + { + "epoch": 0.1087962962962963, + "grad_norm": 0.39736685156822205, + "learning_rate": 0.00019965802330948527, + "loss": 1.275, + "step": 611 + }, + { + "epoch": 0.10897435897435898, + "grad_norm": 0.484660267829895, + "learning_rate": 0.00019965686571172557, + "loss": 1.1671, + "step": 612 + }, + { + "epoch": 0.10915242165242166, + "grad_norm": 0.41420218348503113, + "learning_rate": 0.0001996557061613966, + "loss": 0.9541, + "step": 613 + }, + { + "epoch": 0.10933048433048433, + "grad_norm": 0.4057196080684662, + "learning_rate": 0.00019965454465852112, + "loss": 1.0145, + "step": 614 + }, + { + "epoch": 0.10950854700854701, + "grad_norm": 0.4559510052204132, + "learning_rate": 0.00019965338120312182, + "loss": 1.0889, + "step": 615 + }, + { + "epoch": 0.10968660968660969, + "grad_norm": 0.40960055589675903, + "learning_rate": 0.00019965221579522154, + "loss": 1.1447, + "step": 616 + }, + { + "epoch": 0.10986467236467236, + "grad_norm": 0.4701732099056244, + "learning_rate": 0.0001996510484348431, + "loss": 1.2871, + "step": 617 + }, + { + "epoch": 0.11004273504273504, + "grad_norm": 0.38420796394348145, + "learning_rate": 0.0001996498791220094, + "loss": 1.058, + "step": 618 + }, + { + "epoch": 0.11022079772079772, + "grad_norm": 0.4014730453491211, + "learning_rate": 0.00019964870785674327, + "loss": 1.023, + "step": 619 + }, + { + "epoch": 0.1103988603988604, + 
"grad_norm": 0.38846179842948914, + "learning_rate": 0.00019964753463906773, + "loss": 0.9834, + "step": 620 + }, + { + "epoch": 0.11057692307692307, + "grad_norm": 0.5120236277580261, + "learning_rate": 0.00019964635946900577, + "loss": 1.2347, + "step": 621 + }, + { + "epoch": 0.11075498575498575, + "grad_norm": 0.40483301877975464, + "learning_rate": 0.00019964518234658038, + "loss": 1.131, + "step": 622 + }, + { + "epoch": 0.11093304843304844, + "grad_norm": 0.445782870054245, + "learning_rate": 0.00019964400327181464, + "loss": 0.9349, + "step": 623 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 0.490460604429245, + "learning_rate": 0.00019964282224473165, + "loss": 1.0257, + "step": 624 + }, + { + "epoch": 0.11128917378917379, + "grad_norm": 0.37585243582725525, + "learning_rate": 0.00019964163926535454, + "loss": 0.9724, + "step": 625 + }, + { + "epoch": 0.11146723646723647, + "grad_norm": 0.4160473346710205, + "learning_rate": 0.00019964045433370651, + "loss": 0.874, + "step": 626 + }, + { + "epoch": 0.11164529914529915, + "grad_norm": 0.442425012588501, + "learning_rate": 0.00019963926744981074, + "loss": 1.064, + "step": 627 + }, + { + "epoch": 0.11182336182336182, + "grad_norm": 0.4451471269130707, + "learning_rate": 0.00019963807861369054, + "loss": 1.2343, + "step": 628 + }, + { + "epoch": 0.1120014245014245, + "grad_norm": 0.5018183588981628, + "learning_rate": 0.00019963688782536913, + "loss": 1.1226, + "step": 629 + }, + { + "epoch": 0.11217948717948718, + "grad_norm": 0.43723925948143005, + "learning_rate": 0.0001996356950848699, + "loss": 1.0178, + "step": 630 + }, + { + "epoch": 0.11235754985754985, + "grad_norm": 0.4794611930847168, + "learning_rate": 0.0001996345003922162, + "loss": 0.9695, + "step": 631 + }, + { + "epoch": 0.11253561253561253, + "grad_norm": 0.5021790266036987, + "learning_rate": 0.00019963330374743143, + "loss": 1.1748, + "step": 632 + }, + { + "epoch": 0.11271367521367522, + "grad_norm": 0.47228625416755676, + 
"learning_rate": 0.00019963210515053906, + "loss": 1.2138, + "step": 633 + }, + { + "epoch": 0.1128917378917379, + "grad_norm": 0.4261155128479004, + "learning_rate": 0.00019963090460156256, + "loss": 0.9428, + "step": 634 + }, + { + "epoch": 0.11306980056980057, + "grad_norm": 0.3279525339603424, + "learning_rate": 0.00019962970210052542, + "loss": 0.7803, + "step": 635 + }, + { + "epoch": 0.11324786324786325, + "grad_norm": 0.5106086730957031, + "learning_rate": 0.00019962849764745125, + "loss": 1.113, + "step": 636 + }, + { + "epoch": 0.11342592592592593, + "grad_norm": 0.38272222876548767, + "learning_rate": 0.00019962729124236363, + "loss": 0.896, + "step": 637 + }, + { + "epoch": 0.1136039886039886, + "grad_norm": 0.39532098174095154, + "learning_rate": 0.0001996260828852862, + "loss": 0.9308, + "step": 638 + }, + { + "epoch": 0.11378205128205128, + "grad_norm": 0.44947221875190735, + "learning_rate": 0.00019962487257624262, + "loss": 1.207, + "step": 639 + }, + { + "epoch": 0.11396011396011396, + "grad_norm": 0.40684598684310913, + "learning_rate": 0.00019962366031525664, + "loss": 1.11, + "step": 640 + }, + { + "epoch": 0.11413817663817664, + "grad_norm": 0.4296625852584839, + "learning_rate": 0.00019962244610235194, + "loss": 1.2784, + "step": 641 + }, + { + "epoch": 0.11431623931623931, + "grad_norm": 0.4560794532299042, + "learning_rate": 0.0001996212299375524, + "loss": 1.1191, + "step": 642 + }, + { + "epoch": 0.114494301994302, + "grad_norm": 0.40246087312698364, + "learning_rate": 0.00019962001182088177, + "loss": 1.1401, + "step": 643 + }, + { + "epoch": 0.11467236467236468, + "grad_norm": 0.3938910663127899, + "learning_rate": 0.000199618791752364, + "loss": 1.0959, + "step": 644 + }, + { + "epoch": 0.11485042735042734, + "grad_norm": 0.4123380184173584, + "learning_rate": 0.00019961756973202287, + "loss": 1.2824, + "step": 645 + }, + { + "epoch": 0.11502849002849003, + "grad_norm": 0.41085442900657654, + "learning_rate": 0.00019961634575988243, + 
"loss": 1.1137, + "step": 646 + }, + { + "epoch": 0.11520655270655271, + "grad_norm": 0.38276201486587524, + "learning_rate": 0.0001996151198359667, + "loss": 1.0747, + "step": 647 + }, + { + "epoch": 0.11538461538461539, + "grad_norm": 0.49269407987594604, + "learning_rate": 0.00019961389196029953, + "loss": 1.1731, + "step": 648 + }, + { + "epoch": 0.11556267806267806, + "grad_norm": 0.5152469277381897, + "learning_rate": 0.00019961266213290512, + "loss": 1.3574, + "step": 649 + }, + { + "epoch": 0.11574074074074074, + "grad_norm": 0.4835714101791382, + "learning_rate": 0.0001996114303538075, + "loss": 1.2859, + "step": 650 + }, + { + "epoch": 0.11591880341880342, + "grad_norm": 0.4284524917602539, + "learning_rate": 0.00019961019662303087, + "loss": 1.1103, + "step": 651 + }, + { + "epoch": 0.11609686609686609, + "grad_norm": 0.3933276832103729, + "learning_rate": 0.00019960896094059933, + "loss": 1.2647, + "step": 652 + }, + { + "epoch": 0.11627492877492877, + "grad_norm": 0.33749741315841675, + "learning_rate": 0.00019960772330653712, + "loss": 0.819, + "step": 653 + }, + { + "epoch": 0.11645299145299146, + "grad_norm": 0.48122069239616394, + "learning_rate": 0.00019960648372086852, + "loss": 1.2781, + "step": 654 + }, + { + "epoch": 0.11663105413105414, + "grad_norm": 0.4681607186794281, + "learning_rate": 0.00019960524218361775, + "loss": 0.9723, + "step": 655 + }, + { + "epoch": 0.1168091168091168, + "grad_norm": 0.3974960148334503, + "learning_rate": 0.0001996039986948092, + "loss": 1.0302, + "step": 656 + }, + { + "epoch": 0.11698717948717949, + "grad_norm": 0.43180662393569946, + "learning_rate": 0.0001996027532544672, + "loss": 1.3265, + "step": 657 + }, + { + "epoch": 0.11716524216524217, + "grad_norm": 0.4481917917728424, + "learning_rate": 0.00019960150586261613, + "loss": 1.136, + "step": 658 + }, + { + "epoch": 0.11734330484330484, + "grad_norm": 0.43428945541381836, + "learning_rate": 0.00019960025651928045, + "loss": 1.2412, + "step": 659 + }, + 
{ + "epoch": 0.11752136752136752, + "grad_norm": 0.36211395263671875, + "learning_rate": 0.00019959900522448467, + "loss": 0.9563, + "step": 660 + }, + { + "epoch": 0.1176994301994302, + "grad_norm": 0.43585848808288574, + "learning_rate": 0.0001995977519782533, + "loss": 1.1677, + "step": 661 + }, + { + "epoch": 0.11787749287749288, + "grad_norm": 0.4232597351074219, + "learning_rate": 0.00019959649678061086, + "loss": 1.1187, + "step": 662 + }, + { + "epoch": 0.11805555555555555, + "grad_norm": 0.3304753303527832, + "learning_rate": 0.00019959523963158194, + "loss": 0.8473, + "step": 663 + }, + { + "epoch": 0.11823361823361823, + "grad_norm": 0.37600061297416687, + "learning_rate": 0.0001995939805311912, + "loss": 1.1227, + "step": 664 + }, + { + "epoch": 0.11841168091168092, + "grad_norm": 0.33417847752571106, + "learning_rate": 0.0001995927194794633, + "loss": 1.0315, + "step": 665 + }, + { + "epoch": 0.11858974358974358, + "grad_norm": 0.46799129247665405, + "learning_rate": 0.00019959145647642298, + "loss": 1.135, + "step": 666 + }, + { + "epoch": 0.11876780626780627, + "grad_norm": 0.4141576886177063, + "learning_rate": 0.0001995901915220949, + "loss": 1.0956, + "step": 667 + }, + { + "epoch": 0.11894586894586895, + "grad_norm": 0.3824596405029297, + "learning_rate": 0.0001995889246165039, + "loss": 1.1782, + "step": 668 + }, + { + "epoch": 0.11912393162393162, + "grad_norm": 0.4087786376476288, + "learning_rate": 0.00019958765575967484, + "loss": 0.9704, + "step": 669 + }, + { + "epoch": 0.1193019943019943, + "grad_norm": 0.5161317586898804, + "learning_rate": 0.00019958638495163252, + "loss": 1.2207, + "step": 670 + }, + { + "epoch": 0.11948005698005698, + "grad_norm": 0.4782274067401886, + "learning_rate": 0.0001995851121924019, + "loss": 1.1257, + "step": 671 + }, + { + "epoch": 0.11965811965811966, + "grad_norm": 0.40617331862449646, + "learning_rate": 0.00019958383748200782, + "loss": 1.1153, + "step": 672 + }, + { + "epoch": 0.11983618233618233, + 
"grad_norm": 0.40149980783462524, + "learning_rate": 0.00019958256082047533, + "loss": 0.9785, + "step": 673 + }, + { + "epoch": 0.12001424501424501, + "grad_norm": 0.4378886818885803, + "learning_rate": 0.00019958128220782942, + "loss": 1.1355, + "step": 674 + }, + { + "epoch": 0.1201923076923077, + "grad_norm": 0.4449596703052521, + "learning_rate": 0.0001995800016440952, + "loss": 1.0325, + "step": 675 + }, + { + "epoch": 0.12037037037037036, + "grad_norm": 0.4268079698085785, + "learning_rate": 0.00019957871912929765, + "loss": 1.1901, + "step": 676 + }, + { + "epoch": 0.12054843304843305, + "grad_norm": 0.4250091016292572, + "learning_rate": 0.00019957743466346198, + "loss": 1.0084, + "step": 677 + }, + { + "epoch": 0.12072649572649573, + "grad_norm": 0.40724286437034607, + "learning_rate": 0.0001995761482466133, + "loss": 1.0866, + "step": 678 + }, + { + "epoch": 0.12090455840455841, + "grad_norm": 0.42478349804878235, + "learning_rate": 0.00019957485987877688, + "loss": 1.1909, + "step": 679 + }, + { + "epoch": 0.12108262108262108, + "grad_norm": 0.371362566947937, + "learning_rate": 0.0001995735695599779, + "loss": 1.083, + "step": 680 + }, + { + "epoch": 0.12126068376068376, + "grad_norm": 0.4715283513069153, + "learning_rate": 0.0001995722772902417, + "loss": 1.2942, + "step": 681 + }, + { + "epoch": 0.12143874643874644, + "grad_norm": 0.3611983060836792, + "learning_rate": 0.00019957098306959355, + "loss": 0.9878, + "step": 682 + }, + { + "epoch": 0.12161680911680911, + "grad_norm": 0.4764883816242218, + "learning_rate": 0.00019956968689805883, + "loss": 1.0082, + "step": 683 + }, + { + "epoch": 0.12179487179487179, + "grad_norm": 0.33170604705810547, + "learning_rate": 0.00019956838877566293, + "loss": 0.8529, + "step": 684 + }, + { + "epoch": 0.12197293447293447, + "grad_norm": 0.46896886825561523, + "learning_rate": 0.00019956708870243133, + "loss": 1.0745, + "step": 685 + }, + { + "epoch": 0.12215099715099716, + "grad_norm": 0.4120674431324005, + 
"learning_rate": 0.00019956578667838941, + "loss": 1.1828, + "step": 686 + }, + { + "epoch": 0.12232905982905982, + "grad_norm": 0.45671191811561584, + "learning_rate": 0.00019956448270356275, + "loss": 1.3484, + "step": 687 + }, + { + "epoch": 0.1225071225071225, + "grad_norm": 0.4023838937282562, + "learning_rate": 0.00019956317677797687, + "loss": 0.9623, + "step": 688 + }, + { + "epoch": 0.12268518518518519, + "grad_norm": 0.5205856561660767, + "learning_rate": 0.00019956186890165737, + "loss": 1.2221, + "step": 689 + }, + { + "epoch": 0.12286324786324786, + "grad_norm": 0.43956050276756287, + "learning_rate": 0.00019956055907462987, + "loss": 1.1051, + "step": 690 + }, + { + "epoch": 0.12304131054131054, + "grad_norm": 0.4341758191585541, + "learning_rate": 0.00019955924729692003, + "loss": 0.8972, + "step": 691 + }, + { + "epoch": 0.12321937321937322, + "grad_norm": 0.42025020718574524, + "learning_rate": 0.00019955793356855357, + "loss": 1.1137, + "step": 692 + }, + { + "epoch": 0.1233974358974359, + "grad_norm": 0.44375079870224, + "learning_rate": 0.0001995566178895562, + "loss": 1.2783, + "step": 693 + }, + { + "epoch": 0.12357549857549857, + "grad_norm": 0.4703320264816284, + "learning_rate": 0.00019955530025995372, + "loss": 1.1991, + "step": 694 + }, + { + "epoch": 0.12375356125356125, + "grad_norm": 0.43781620264053345, + "learning_rate": 0.00019955398067977195, + "loss": 1.2316, + "step": 695 + }, + { + "epoch": 0.12393162393162394, + "grad_norm": 0.4362877607345581, + "learning_rate": 0.0001995526591490367, + "loss": 1.1374, + "step": 696 + }, + { + "epoch": 0.1241096866096866, + "grad_norm": 0.4434499442577362, + "learning_rate": 0.00019955133566777392, + "loss": 1.1034, + "step": 697 + }, + { + "epoch": 0.12428774928774929, + "grad_norm": 0.46613508462905884, + "learning_rate": 0.00019955001023600955, + "loss": 1.2252, + "step": 698 + }, + { + "epoch": 0.12446581196581197, + "grad_norm": 0.46226736903190613, + "learning_rate": 
0.00019954868285376945, + "loss": 1.0296, + "step": 699 + }, + { + "epoch": 0.12464387464387465, + "grad_norm": 0.4460904002189636, + "learning_rate": 0.00019954735352107977, + "loss": 1.0553, + "step": 700 + }, + { + "epoch": 0.12482193732193732, + "grad_norm": 0.36708924174308777, + "learning_rate": 0.00019954602223796648, + "loss": 0.9384, + "step": 701 + }, + { + "epoch": 0.125, + "grad_norm": 0.3780093491077423, + "learning_rate": 0.00019954468900445566, + "loss": 0.9062, + "step": 702 + }, + { + "epoch": 0.12517806267806267, + "grad_norm": 0.41797417402267456, + "learning_rate": 0.00019954335382057345, + "loss": 1.0344, + "step": 703 + }, + { + "epoch": 0.12535612535612536, + "grad_norm": 0.43710798025131226, + "learning_rate": 0.00019954201668634597, + "loss": 1.1324, + "step": 704 + }, + { + "epoch": 0.12553418803418803, + "grad_norm": 0.4732789695262909, + "learning_rate": 0.00019954067760179952, + "loss": 1.1419, + "step": 705 + }, + { + "epoch": 0.1257122507122507, + "grad_norm": 0.43248575925827026, + "learning_rate": 0.00019953933656696022, + "loss": 1.5112, + "step": 706 + }, + { + "epoch": 0.1258903133903134, + "grad_norm": 0.4074753522872925, + "learning_rate": 0.00019953799358185442, + "loss": 0.9751, + "step": 707 + }, + { + "epoch": 0.12606837606837606, + "grad_norm": 0.4586823880672455, + "learning_rate": 0.0001995366486465084, + "loss": 1.267, + "step": 708 + }, + { + "epoch": 0.12624643874643873, + "grad_norm": 0.4716857075691223, + "learning_rate": 0.0001995353017609485, + "loss": 1.1636, + "step": 709 + }, + { + "epoch": 0.12642450142450143, + "grad_norm": 0.5214398503303528, + "learning_rate": 0.00019953395292520115, + "loss": 1.2317, + "step": 710 + }, + { + "epoch": 0.1266025641025641, + "grad_norm": 0.42961129546165466, + "learning_rate": 0.00019953260213929276, + "loss": 1.0271, + "step": 711 + }, + { + "epoch": 0.1267806267806268, + "grad_norm": 0.4764653444290161, + "learning_rate": 0.00019953124940324979, + "loss": 1.1747, + "step": 
712 + }, + { + "epoch": 0.12695868945868946, + "grad_norm": 0.4420304000377655, + "learning_rate": 0.00019952989471709874, + "loss": 0.9783, + "step": 713 + }, + { + "epoch": 0.12713675213675213, + "grad_norm": 0.44114625453948975, + "learning_rate": 0.00019952853808086616, + "loss": 1.1953, + "step": 714 + }, + { + "epoch": 0.12731481481481483, + "grad_norm": 0.501923143863678, + "learning_rate": 0.0001995271794945786, + "loss": 0.9886, + "step": 715 + }, + { + "epoch": 0.1274928774928775, + "grad_norm": 0.42266538739204407, + "learning_rate": 0.00019952581895826276, + "loss": 1.2033, + "step": 716 + }, + { + "epoch": 0.12767094017094016, + "grad_norm": 0.37770554423332214, + "learning_rate": 0.00019952445647194523, + "loss": 1.0164, + "step": 717 + }, + { + "epoch": 0.12784900284900286, + "grad_norm": 0.369266152381897, + "learning_rate": 0.00019952309203565268, + "loss": 0.9186, + "step": 718 + }, + { + "epoch": 0.12802706552706553, + "grad_norm": 0.40446221828460693, + "learning_rate": 0.00019952172564941193, + "loss": 1.1576, + "step": 719 + }, + { + "epoch": 0.1282051282051282, + "grad_norm": 0.504172146320343, + "learning_rate": 0.00019952035731324967, + "loss": 1.2695, + "step": 720 + }, + { + "epoch": 0.1283831908831909, + "grad_norm": 0.37284108996391296, + "learning_rate": 0.0001995189870271928, + "loss": 1.0288, + "step": 721 + }, + { + "epoch": 0.12856125356125356, + "grad_norm": 0.41811618208885193, + "learning_rate": 0.00019951761479126805, + "loss": 1.2241, + "step": 722 + }, + { + "epoch": 0.12873931623931623, + "grad_norm": 0.44706249237060547, + "learning_rate": 0.0001995162406055024, + "loss": 1.0831, + "step": 723 + }, + { + "epoch": 0.12891737891737892, + "grad_norm": 0.426572322845459, + "learning_rate": 0.00019951486446992273, + "loss": 1.0047, + "step": 724 + }, + { + "epoch": 0.1290954415954416, + "grad_norm": 0.4446277618408203, + "learning_rate": 0.00019951348638455602, + "loss": 1.0827, + "step": 725 + }, + { + "epoch": 
0.12927350427350429, + "grad_norm": 0.3934919834136963, + "learning_rate": 0.00019951210634942926, + "loss": 0.9808, + "step": 726 + }, + { + "epoch": 0.12945156695156695, + "grad_norm": 0.4316558241844177, + "learning_rate": 0.0001995107243645695, + "loss": 1.3341, + "step": 727 + }, + { + "epoch": 0.12962962962962962, + "grad_norm": 0.43074217438697815, + "learning_rate": 0.00019950934043000382, + "loss": 1.007, + "step": 728 + }, + { + "epoch": 0.12980769230769232, + "grad_norm": 0.5212171673774719, + "learning_rate": 0.0001995079545457593, + "loss": 1.1822, + "step": 729 + }, + { + "epoch": 0.129985754985755, + "grad_norm": 0.3749600946903229, + "learning_rate": 0.00019950656671186313, + "loss": 0.9657, + "step": 730 + }, + { + "epoch": 0.13016381766381765, + "grad_norm": 0.36626043915748596, + "learning_rate": 0.00019950517692834252, + "loss": 1.1274, + "step": 731 + }, + { + "epoch": 0.13034188034188035, + "grad_norm": 0.4635467529296875, + "learning_rate": 0.00019950378519522467, + "loss": 1.2305, + "step": 732 + }, + { + "epoch": 0.13051994301994302, + "grad_norm": 0.4077455699443817, + "learning_rate": 0.00019950239151253683, + "loss": 0.9485, + "step": 733 + }, + { + "epoch": 0.1306980056980057, + "grad_norm": 0.4222758114337921, + "learning_rate": 0.0001995009958803063, + "loss": 1.0376, + "step": 734 + }, + { + "epoch": 0.13087606837606838, + "grad_norm": 0.4330402612686157, + "learning_rate": 0.0001994995982985605, + "loss": 1.1774, + "step": 735 + }, + { + "epoch": 0.13105413105413105, + "grad_norm": 0.42275673151016235, + "learning_rate": 0.00019949819876732673, + "loss": 1.1238, + "step": 736 + }, + { + "epoch": 0.13123219373219372, + "grad_norm": 0.45576968789100647, + "learning_rate": 0.00019949679728663246, + "loss": 1.0428, + "step": 737 + }, + { + "epoch": 0.13141025641025642, + "grad_norm": 0.5508752465248108, + "learning_rate": 0.00019949539385650514, + "loss": 1.3221, + "step": 738 + }, + { + "epoch": 0.13158831908831908, + "grad_norm": 
0.4115872383117676, + "learning_rate": 0.00019949398847697225, + "loss": 1.0301, + "step": 739 + }, + { + "epoch": 0.13176638176638178, + "grad_norm": 0.4662442207336426, + "learning_rate": 0.00019949258114806132, + "loss": 1.3263, + "step": 740 + }, + { + "epoch": 0.13194444444444445, + "grad_norm": 0.6077266931533813, + "learning_rate": 0.00019949117186979999, + "loss": 1.0269, + "step": 741 + }, + { + "epoch": 0.13212250712250712, + "grad_norm": 0.47039318084716797, + "learning_rate": 0.00019948976064221579, + "loss": 1.3782, + "step": 742 + }, + { + "epoch": 0.1323005698005698, + "grad_norm": 0.4773450493812561, + "learning_rate": 0.0001994883474653364, + "loss": 1.289, + "step": 743 + }, + { + "epoch": 0.13247863247863248, + "grad_norm": 0.40180155634880066, + "learning_rate": 0.00019948693233918952, + "loss": 0.8691, + "step": 744 + }, + { + "epoch": 0.13265669515669515, + "grad_norm": 0.45216289162635803, + "learning_rate": 0.00019948551526380288, + "loss": 1.071, + "step": 745 + }, + { + "epoch": 0.13283475783475784, + "grad_norm": 0.4289272427558899, + "learning_rate": 0.0001994840962392042, + "loss": 1.0422, + "step": 746 + }, + { + "epoch": 0.1330128205128205, + "grad_norm": 0.4617730379104614, + "learning_rate": 0.00019948267526542134, + "loss": 1.0835, + "step": 747 + }, + { + "epoch": 0.13319088319088318, + "grad_norm": 0.42710617184638977, + "learning_rate": 0.00019948125234248208, + "loss": 1.0535, + "step": 748 + }, + { + "epoch": 0.13336894586894588, + "grad_norm": 0.43433234095573425, + "learning_rate": 0.0001994798274704144, + "loss": 0.9313, + "step": 749 + }, + { + "epoch": 0.13354700854700854, + "grad_norm": 0.46270284056663513, + "learning_rate": 0.0001994784006492461, + "loss": 1.0903, + "step": 750 + }, + { + "epoch": 0.1337250712250712, + "grad_norm": 0.5319814682006836, + "learning_rate": 0.00019947697187900517, + "loss": 1.2329, + "step": 751 + }, + { + "epoch": 0.1339031339031339, + "grad_norm": 0.3511372208595276, + "learning_rate": 
0.00019947554115971967, + "loss": 0.7116, + "step": 752 + }, + { + "epoch": 0.13408119658119658, + "grad_norm": 0.4103890359401703, + "learning_rate": 0.00019947410849141756, + "loss": 1.1527, + "step": 753 + }, + { + "epoch": 0.13425925925925927, + "grad_norm": 0.5390757322311401, + "learning_rate": 0.00019947267387412695, + "loss": 1.1682, + "step": 754 + }, + { + "epoch": 0.13443732193732194, + "grad_norm": 0.29939723014831543, + "learning_rate": 0.0001994712373078759, + "loss": 0.5848, + "step": 755 + }, + { + "epoch": 0.1346153846153846, + "grad_norm": 0.4605920612812042, + "learning_rate": 0.0001994697987926926, + "loss": 0.9448, + "step": 756 + }, + { + "epoch": 0.1347934472934473, + "grad_norm": 0.426213800907135, + "learning_rate": 0.00019946835832860527, + "loss": 1.0487, + "step": 757 + }, + { + "epoch": 0.13497150997150997, + "grad_norm": 0.4209515154361725, + "learning_rate": 0.00019946691591564203, + "loss": 1.0951, + "step": 758 + }, + { + "epoch": 0.13514957264957264, + "grad_norm": 0.39555591344833374, + "learning_rate": 0.0001994654715538312, + "loss": 0.8754, + "step": 759 + }, + { + "epoch": 0.13532763532763534, + "grad_norm": 0.4065483510494232, + "learning_rate": 0.0001994640252432011, + "loss": 0.9451, + "step": 760 + }, + { + "epoch": 0.135505698005698, + "grad_norm": 0.4489104151725769, + "learning_rate": 0.00019946257698378003, + "loss": 1.2031, + "step": 761 + }, + { + "epoch": 0.13568376068376067, + "grad_norm": 0.39928409457206726, + "learning_rate": 0.0001994611267755964, + "loss": 1.1124, + "step": 762 + }, + { + "epoch": 0.13586182336182337, + "grad_norm": 0.4145409166812897, + "learning_rate": 0.00019945967461867858, + "loss": 1.083, + "step": 763 + }, + { + "epoch": 0.13603988603988604, + "grad_norm": 0.43508613109588623, + "learning_rate": 0.00019945822051305507, + "loss": 1.1119, + "step": 764 + }, + { + "epoch": 0.1362179487179487, + "grad_norm": 0.5186598300933838, + "learning_rate": 0.0001994567644587543, + "loss": 1.3256, + 
"step": 765 + }, + { + "epoch": 0.1363960113960114, + "grad_norm": 0.4615778625011444, + "learning_rate": 0.00019945530645580487, + "loss": 1.3906, + "step": 766 + }, + { + "epoch": 0.13657407407407407, + "grad_norm": 0.4838152527809143, + "learning_rate": 0.00019945384650423532, + "loss": 0.8169, + "step": 767 + }, + { + "epoch": 0.13675213675213677, + "grad_norm": 0.49253368377685547, + "learning_rate": 0.0001994523846040742, + "loss": 1.1613, + "step": 768 + }, + { + "epoch": 0.13693019943019943, + "grad_norm": 0.4697009325027466, + "learning_rate": 0.00019945092075535024, + "loss": 1.1722, + "step": 769 + }, + { + "epoch": 0.1371082621082621, + "grad_norm": 0.47162383794784546, + "learning_rate": 0.00019944945495809204, + "loss": 1.054, + "step": 770 + }, + { + "epoch": 0.1372863247863248, + "grad_norm": 0.4653547704219818, + "learning_rate": 0.00019944798721232835, + "loss": 1.1791, + "step": 771 + }, + { + "epoch": 0.13746438746438747, + "grad_norm": 0.4244011640548706, + "learning_rate": 0.000199446517518088, + "loss": 1.1557, + "step": 772 + }, + { + "epoch": 0.13764245014245013, + "grad_norm": 0.43812859058380127, + "learning_rate": 0.00019944504587539967, + "loss": 1.1567, + "step": 773 + }, + { + "epoch": 0.13782051282051283, + "grad_norm": 0.3984275162220001, + "learning_rate": 0.00019944357228429227, + "loss": 1.0715, + "step": 774 + }, + { + "epoch": 0.1379985754985755, + "grad_norm": 0.3794248104095459, + "learning_rate": 0.0001994420967447946, + "loss": 0.9377, + "step": 775 + }, + { + "epoch": 0.13817663817663817, + "grad_norm": 0.4214578866958618, + "learning_rate": 0.00019944061925693566, + "loss": 1.0112, + "step": 776 + }, + { + "epoch": 0.13835470085470086, + "grad_norm": 0.4738999605178833, + "learning_rate": 0.00019943913982074435, + "loss": 0.8718, + "step": 777 + }, + { + "epoch": 0.13853276353276353, + "grad_norm": 0.43455326557159424, + "learning_rate": 0.00019943765843624965, + "loss": 1.1343, + "step": 778 + }, + { + "epoch": 
0.1387108262108262, + "grad_norm": 0.44973456859588623, + "learning_rate": 0.00019943617510348062, + "loss": 1.0487, + "step": 779 + }, + { + "epoch": 0.1388888888888889, + "grad_norm": 0.4216597080230713, + "learning_rate": 0.00019943468982246628, + "loss": 1.0765, + "step": 780 + }, + { + "epoch": 0.13906695156695156, + "grad_norm": 0.5089883208274841, + "learning_rate": 0.00019943320259323578, + "loss": 1.3137, + "step": 781 + }, + { + "epoch": 0.13924501424501423, + "grad_norm": 0.4358222782611847, + "learning_rate": 0.00019943171341581822, + "loss": 1.1891, + "step": 782 + }, + { + "epoch": 0.13942307692307693, + "grad_norm": 0.40918609499931335, + "learning_rate": 0.00019943022229024275, + "loss": 1.279, + "step": 783 + }, + { + "epoch": 0.1396011396011396, + "grad_norm": 0.4614863395690918, + "learning_rate": 0.00019942872921653866, + "loss": 1.2477, + "step": 784 + }, + { + "epoch": 0.1397792022792023, + "grad_norm": 0.4141528904438019, + "learning_rate": 0.00019942723419473515, + "loss": 0.9622, + "step": 785 + }, + { + "epoch": 0.13995726495726496, + "grad_norm": 0.536139726638794, + "learning_rate": 0.00019942573722486154, + "loss": 1.2127, + "step": 786 + }, + { + "epoch": 0.14013532763532763, + "grad_norm": 0.4968845546245575, + "learning_rate": 0.0001994242383069471, + "loss": 1.2965, + "step": 787 + }, + { + "epoch": 0.14031339031339032, + "grad_norm": 0.3897174894809723, + "learning_rate": 0.00019942273744102132, + "loss": 0.9907, + "step": 788 + }, + { + "epoch": 0.140491452991453, + "grad_norm": 0.466307669878006, + "learning_rate": 0.0001994212346271135, + "loss": 1.2021, + "step": 789 + }, + { + "epoch": 0.14066951566951566, + "grad_norm": 0.49283576011657715, + "learning_rate": 0.0001994197298652531, + "loss": 1.0969, + "step": 790 + }, + { + "epoch": 0.14084757834757836, + "grad_norm": 0.4686102271080017, + "learning_rate": 0.00019941822315546964, + "loss": 1.0125, + "step": 791 + }, + { + "epoch": 0.14102564102564102, + "grad_norm": 
0.4389997124671936, + "learning_rate": 0.0001994167144977926, + "loss": 1.1294, + "step": 792 + }, + { + "epoch": 0.1412037037037037, + "grad_norm": 0.38539355993270874, + "learning_rate": 0.00019941520389225162, + "loss": 1.1231, + "step": 793 + }, + { + "epoch": 0.1413817663817664, + "grad_norm": 0.4860847592353821, + "learning_rate": 0.00019941369133887618, + "loss": 1.2268, + "step": 794 + }, + { + "epoch": 0.14155982905982906, + "grad_norm": 0.4567467272281647, + "learning_rate": 0.00019941217683769598, + "loss": 1.1482, + "step": 795 + }, + { + "epoch": 0.14173789173789172, + "grad_norm": 0.5549420714378357, + "learning_rate": 0.00019941066038874067, + "loss": 1.1899, + "step": 796 + }, + { + "epoch": 0.14191595441595442, + "grad_norm": 0.3950003385543823, + "learning_rate": 0.00019940914199204, + "loss": 0.96, + "step": 797 + }, + { + "epoch": 0.1420940170940171, + "grad_norm": 0.43845999240875244, + "learning_rate": 0.00019940762164762373, + "loss": 1.0338, + "step": 798 + }, + { + "epoch": 0.14227207977207978, + "grad_norm": 0.468537300825119, + "learning_rate": 0.00019940609935552157, + "loss": 1.2416, + "step": 799 + }, + { + "epoch": 0.14245014245014245, + "grad_norm": 0.4292038679122925, + "learning_rate": 0.0001994045751157634, + "loss": 1.1397, + "step": 800 + }, + { + "epoch": 0.14262820512820512, + "grad_norm": 0.3800995647907257, + "learning_rate": 0.00019940304892837908, + "loss": 0.939, + "step": 801 + }, + { + "epoch": 0.14280626780626782, + "grad_norm": 0.38004353642463684, + "learning_rate": 0.00019940152079339852, + "loss": 1.0485, + "step": 802 + }, + { + "epoch": 0.14298433048433049, + "grad_norm": 0.4658142924308777, + "learning_rate": 0.00019939999071085163, + "loss": 1.1561, + "step": 803 + }, + { + "epoch": 0.14316239316239315, + "grad_norm": 0.4235048294067383, + "learning_rate": 0.0001993984586807684, + "loss": 1.0516, + "step": 804 + }, + { + "epoch": 0.14334045584045585, + "grad_norm": 0.42925819754600525, + "learning_rate": 
0.00019939692470317887, + "loss": 1.2238, + "step": 805 + }, + { + "epoch": 0.14351851851851852, + "grad_norm": 0.43701639771461487, + "learning_rate": 0.00019939538877811308, + "loss": 1.0129, + "step": 806 + }, + { + "epoch": 0.14369658119658119, + "grad_norm": 0.42786353826522827, + "learning_rate": 0.00019939385090560113, + "loss": 1.1355, + "step": 807 + }, + { + "epoch": 0.14387464387464388, + "grad_norm": 0.371218740940094, + "learning_rate": 0.00019939231108567312, + "loss": 0.9712, + "step": 808 + }, + { + "epoch": 0.14405270655270655, + "grad_norm": 0.4834294617176056, + "learning_rate": 0.00019939076931835926, + "loss": 1.1375, + "step": 809 + }, + { + "epoch": 0.14423076923076922, + "grad_norm": 0.4700150191783905, + "learning_rate": 0.00019938922560368974, + "loss": 1.1943, + "step": 810 + }, + { + "epoch": 0.14440883190883191, + "grad_norm": 0.4430996775627136, + "learning_rate": 0.0001993876799416948, + "loss": 1.1976, + "step": 811 + }, + { + "epoch": 0.14458689458689458, + "grad_norm": 0.4161672592163086, + "learning_rate": 0.00019938613233240476, + "loss": 1.0291, + "step": 812 + }, + { + "epoch": 0.14476495726495728, + "grad_norm": 0.39838850498199463, + "learning_rate": 0.0001993845827758499, + "loss": 1.2103, + "step": 813 + }, + { + "epoch": 0.14494301994301995, + "grad_norm": 0.429198294878006, + "learning_rate": 0.00019938303127206057, + "loss": 0.9971, + "step": 814 + }, + { + "epoch": 0.14512108262108261, + "grad_norm": 0.4589254856109619, + "learning_rate": 0.00019938147782106719, + "loss": 1.2392, + "step": 815 + }, + { + "epoch": 0.1452991452991453, + "grad_norm": 0.42506635189056396, + "learning_rate": 0.00019937992242290023, + "loss": 1.0827, + "step": 816 + }, + { + "epoch": 0.14547720797720798, + "grad_norm": 0.3778113126754761, + "learning_rate": 0.00019937836507759012, + "loss": 1.021, + "step": 817 + }, + { + "epoch": 0.14565527065527065, + "grad_norm": 0.43071216344833374, + "learning_rate": 0.0001993768057851674, + "loss": 
1.273, + "step": 818 + }, + { + "epoch": 0.14583333333333334, + "grad_norm": 0.4944681227207184, + "learning_rate": 0.00019937524454566262, + "loss": 1.3037, + "step": 819 + }, + { + "epoch": 0.146011396011396, + "grad_norm": 0.4438824951648712, + "learning_rate": 0.00019937368135910632, + "loss": 1.1383, + "step": 820 + }, + { + "epoch": 0.14618945868945868, + "grad_norm": 0.400215744972229, + "learning_rate": 0.0001993721162255292, + "loss": 1.0669, + "step": 821 + }, + { + "epoch": 0.14636752136752137, + "grad_norm": 0.4341452121734619, + "learning_rate": 0.00019937054914496185, + "loss": 1.1431, + "step": 822 + }, + { + "epoch": 0.14654558404558404, + "grad_norm": 0.3941744267940521, + "learning_rate": 0.00019936898011743503, + "loss": 1.1593, + "step": 823 + }, + { + "epoch": 0.1467236467236467, + "grad_norm": 0.4318541884422302, + "learning_rate": 0.00019936740914297947, + "loss": 1.2814, + "step": 824 + }, + { + "epoch": 0.1469017094017094, + "grad_norm": 0.44488632678985596, + "learning_rate": 0.00019936583622162595, + "loss": 1.1054, + "step": 825 + }, + { + "epoch": 0.14707977207977208, + "grad_norm": 0.38701096177101135, + "learning_rate": 0.00019936426135340528, + "loss": 1.1086, + "step": 826 + }, + { + "epoch": 0.14725783475783477, + "grad_norm": 0.45794424414634705, + "learning_rate": 0.0001993626845383483, + "loss": 1.2395, + "step": 827 + }, + { + "epoch": 0.14743589743589744, + "grad_norm": 0.49237680435180664, + "learning_rate": 0.00019936110577648596, + "loss": 1.3483, + "step": 828 + }, + { + "epoch": 0.1476139601139601, + "grad_norm": 0.481666624546051, + "learning_rate": 0.00019935952506784914, + "loss": 1.1848, + "step": 829 + }, + { + "epoch": 0.1477920227920228, + "grad_norm": 0.4015209376811981, + "learning_rate": 0.00019935794241246883, + "loss": 1.0624, + "step": 830 + }, + { + "epoch": 0.14797008547008547, + "grad_norm": 0.47975999116897583, + "learning_rate": 0.00019935635781037606, + "loss": 1.1595, + "step": 831 + }, + { + "epoch": 
0.14814814814814814, + "grad_norm": 0.4440356492996216, + "learning_rate": 0.00019935477126160181, + "loss": 1.1325, + "step": 832 + }, + { + "epoch": 0.14832621082621084, + "grad_norm": 0.4167410731315613, + "learning_rate": 0.00019935318276617723, + "loss": 1.0662, + "step": 833 + }, + { + "epoch": 0.1485042735042735, + "grad_norm": 0.4107447862625122, + "learning_rate": 0.0001993515923241334, + "loss": 0.8816, + "step": 834 + }, + { + "epoch": 0.14868233618233617, + "grad_norm": 0.4020158648490906, + "learning_rate": 0.00019934999993550154, + "loss": 0.9797, + "step": 835 + }, + { + "epoch": 0.14886039886039887, + "grad_norm": 0.4186473786830902, + "learning_rate": 0.0001993484056003128, + "loss": 1.1243, + "step": 836 + }, + { + "epoch": 0.14903846153846154, + "grad_norm": 0.5534794926643372, + "learning_rate": 0.00019934680931859842, + "loss": 1.1189, + "step": 837 + }, + { + "epoch": 0.1492165242165242, + "grad_norm": 0.37901270389556885, + "learning_rate": 0.0001993452110903897, + "loss": 0.9241, + "step": 838 + }, + { + "epoch": 0.1493945868945869, + "grad_norm": 0.41773587465286255, + "learning_rate": 0.00019934361091571793, + "loss": 0.9467, + "step": 839 + }, + { + "epoch": 0.14957264957264957, + "grad_norm": 0.4962073564529419, + "learning_rate": 0.00019934200879461448, + "loss": 1.2423, + "step": 840 + }, + { + "epoch": 0.14975071225071226, + "grad_norm": 0.38565897941589355, + "learning_rate": 0.00019934040472711074, + "loss": 1.1545, + "step": 841 + }, + { + "epoch": 0.14992877492877493, + "grad_norm": 0.4295346736907959, + "learning_rate": 0.0001993387987132381, + "loss": 1.2482, + "step": 842 + }, + { + "epoch": 0.1501068376068376, + "grad_norm": 0.4279189705848694, + "learning_rate": 0.0001993371907530281, + "loss": 1.1135, + "step": 843 + }, + { + "epoch": 0.1502849002849003, + "grad_norm": 0.44649168848991394, + "learning_rate": 0.0001993355808465122, + "loss": 1.0734, + "step": 844 + }, + { + "epoch": 0.15046296296296297, + "grad_norm": 
0.453707218170166, + "learning_rate": 0.0001993339689937219, + "loss": 1.0992, + "step": 845 + }, + { + "epoch": 0.15064102564102563, + "grad_norm": 0.5113263726234436, + "learning_rate": 0.00019933235519468886, + "loss": 1.1792, + "step": 846 + }, + { + "epoch": 0.15081908831908833, + "grad_norm": 0.5822970271110535, + "learning_rate": 0.00019933073944944466, + "loss": 1.367, + "step": 847 + }, + { + "epoch": 0.150997150997151, + "grad_norm": 0.3946528732776642, + "learning_rate": 0.00019932912175802097, + "loss": 0.9781, + "step": 848 + }, + { + "epoch": 0.15117521367521367, + "grad_norm": 0.5429860949516296, + "learning_rate": 0.00019932750212044945, + "loss": 0.9783, + "step": 849 + }, + { + "epoch": 0.15135327635327636, + "grad_norm": 0.45847952365875244, + "learning_rate": 0.0001993258805367619, + "loss": 1.1352, + "step": 850 + }, + { + "epoch": 0.15153133903133903, + "grad_norm": 0.42770692706108093, + "learning_rate": 0.00019932425700699004, + "loss": 1.2365, + "step": 851 + }, + { + "epoch": 0.1517094017094017, + "grad_norm": 0.41845405101776123, + "learning_rate": 0.00019932263153116565, + "loss": 1.2642, + "step": 852 + }, + { + "epoch": 0.1518874643874644, + "grad_norm": 0.4641731083393097, + "learning_rate": 0.00019932100410932066, + "loss": 1.2009, + "step": 853 + }, + { + "epoch": 0.15206552706552706, + "grad_norm": 0.4128672778606415, + "learning_rate": 0.00019931937474148689, + "loss": 1.1981, + "step": 854 + }, + { + "epoch": 0.15224358974358973, + "grad_norm": 0.4730764925479889, + "learning_rate": 0.00019931774342769632, + "loss": 1.2145, + "step": 855 + }, + { + "epoch": 0.15242165242165243, + "grad_norm": 0.36611825227737427, + "learning_rate": 0.00019931611016798089, + "loss": 0.8504, + "step": 856 + }, + { + "epoch": 0.1525997150997151, + "grad_norm": 0.40944692492485046, + "learning_rate": 0.00019931447496237254, + "loss": 1.2853, + "step": 857 + }, + { + "epoch": 0.1527777777777778, + "grad_norm": 0.4521993398666382, + "learning_rate": 
0.0001993128378109034, + "loss": 1.0198, + "step": 858 + }, + { + "epoch": 0.15295584045584046, + "grad_norm": 0.42113015055656433, + "learning_rate": 0.0001993111987136055, + "loss": 1.1284, + "step": 859 + }, + { + "epoch": 0.15313390313390313, + "grad_norm": 0.4117624759674072, + "learning_rate": 0.00019930955767051098, + "loss": 1.0445, + "step": 860 + }, + { + "epoch": 0.15331196581196582, + "grad_norm": 0.4807964265346527, + "learning_rate": 0.00019930791468165197, + "loss": 1.1378, + "step": 861 + }, + { + "epoch": 0.1534900284900285, + "grad_norm": 0.4186483323574066, + "learning_rate": 0.00019930626974706063, + "loss": 1.1636, + "step": 862 + }, + { + "epoch": 0.15366809116809116, + "grad_norm": 0.3764737844467163, + "learning_rate": 0.00019930462286676926, + "loss": 0.9523, + "step": 863 + }, + { + "epoch": 0.15384615384615385, + "grad_norm": 0.4283556044101715, + "learning_rate": 0.00019930297404081008, + "loss": 1.1008, + "step": 864 + }, + { + "epoch": 0.15402421652421652, + "grad_norm": 0.4485796093940735, + "learning_rate": 0.00019930132326921541, + "loss": 1.0834, + "step": 865 + }, + { + "epoch": 0.1542022792022792, + "grad_norm": 0.3882720172405243, + "learning_rate": 0.0001992996705520176, + "loss": 1.1086, + "step": 866 + }, + { + "epoch": 0.1543803418803419, + "grad_norm": 0.44698455929756165, + "learning_rate": 0.00019929801588924902, + "loss": 1.1437, + "step": 867 + }, + { + "epoch": 0.15455840455840456, + "grad_norm": 0.46978411078453064, + "learning_rate": 0.00019929635928094208, + "loss": 1.091, + "step": 868 + }, + { + "epoch": 0.15473646723646722, + "grad_norm": 0.4717854857444763, + "learning_rate": 0.00019929470072712927, + "loss": 1.1959, + "step": 869 + }, + { + "epoch": 0.15491452991452992, + "grad_norm": 0.4324854016304016, + "learning_rate": 0.00019929304022784305, + "loss": 1.2062, + "step": 870 + }, + { + "epoch": 0.1550925925925926, + "grad_norm": 0.3948180675506592, + "learning_rate": 0.00019929137778311597, + "loss": 1.1101, 
+ "step": 871 + }, + { + "epoch": 0.15527065527065528, + "grad_norm": 0.40345287322998047, + "learning_rate": 0.0001992897133929806, + "loss": 0.8894, + "step": 872 + }, + { + "epoch": 0.15544871794871795, + "grad_norm": 0.44931963086128235, + "learning_rate": 0.00019928804705746957, + "loss": 0.9389, + "step": 873 + }, + { + "epoch": 0.15562678062678062, + "grad_norm": 0.529196560382843, + "learning_rate": 0.0001992863787766155, + "loss": 1.3362, + "step": 874 + }, + { + "epoch": 0.15580484330484332, + "grad_norm": 0.41218671202659607, + "learning_rate": 0.0001992847085504511, + "loss": 1.0727, + "step": 875 + }, + { + "epoch": 0.15598290598290598, + "grad_norm": 0.44074541330337524, + "learning_rate": 0.00019928303637900907, + "loss": 1.1091, + "step": 876 + }, + { + "epoch": 0.15616096866096865, + "grad_norm": 0.5264310240745544, + "learning_rate": 0.00019928136226232218, + "loss": 1.201, + "step": 877 + }, + { + "epoch": 0.15633903133903135, + "grad_norm": 0.4255099594593048, + "learning_rate": 0.00019927968620042324, + "loss": 1.2514, + "step": 878 + }, + { + "epoch": 0.15651709401709402, + "grad_norm": 0.4030280113220215, + "learning_rate": 0.0001992780081933451, + "loss": 1.0422, + "step": 879 + }, + { + "epoch": 0.15669515669515668, + "grad_norm": 0.5270203948020935, + "learning_rate": 0.00019927632824112058, + "loss": 1.2476, + "step": 880 + }, + { + "epoch": 0.15687321937321938, + "grad_norm": 0.37767237424850464, + "learning_rate": 0.00019927464634378268, + "loss": 1.0768, + "step": 881 + }, + { + "epoch": 0.15705128205128205, + "grad_norm": 0.4535936415195465, + "learning_rate": 0.0001992729625013643, + "loss": 1.2097, + "step": 882 + }, + { + "epoch": 0.15722934472934472, + "grad_norm": 0.4282119870185852, + "learning_rate": 0.00019927127671389843, + "loss": 1.0904, + "step": 883 + }, + { + "epoch": 0.1574074074074074, + "grad_norm": 0.3924157917499542, + "learning_rate": 0.0001992695889814181, + "loss": 0.9692, + "step": 884 + }, + { + "epoch": 
0.15758547008547008, + "grad_norm": 0.525075376033783, + "learning_rate": 0.0001992678993039564, + "loss": 1.0292, + "step": 885 + }, + { + "epoch": 0.15776353276353278, + "grad_norm": 0.4388505518436432, + "learning_rate": 0.00019926620768154644, + "loss": 1.1944, + "step": 886 + }, + { + "epoch": 0.15794159544159544, + "grad_norm": 0.4362235963344574, + "learning_rate": 0.00019926451411422132, + "loss": 0.97, + "step": 887 + }, + { + "epoch": 0.1581196581196581, + "grad_norm": 0.4265296459197998, + "learning_rate": 0.0001992628186020143, + "loss": 0.9196, + "step": 888 + }, + { + "epoch": 0.1582977207977208, + "grad_norm": 0.4019876718521118, + "learning_rate": 0.0001992611211449585, + "loss": 1.1368, + "step": 889 + }, + { + "epoch": 0.15847578347578348, + "grad_norm": 0.5003397464752197, + "learning_rate": 0.00019925942174308726, + "loss": 1.2582, + "step": 890 + }, + { + "epoch": 0.15865384615384615, + "grad_norm": 0.4774404466152191, + "learning_rate": 0.00019925772039643382, + "loss": 1.2277, + "step": 891 + }, + { + "epoch": 0.15883190883190884, + "grad_norm": 0.4590449333190918, + "learning_rate": 0.00019925601710503153, + "loss": 1.1679, + "step": 892 + }, + { + "epoch": 0.1590099715099715, + "grad_norm": 0.4221442639827728, + "learning_rate": 0.0001992543118689138, + "loss": 1.1626, + "step": 893 + }, + { + "epoch": 0.15918803418803418, + "grad_norm": 0.47613003849983215, + "learning_rate": 0.00019925260468811403, + "loss": 1.1509, + "step": 894 + }, + { + "epoch": 0.15936609686609687, + "grad_norm": 0.41706812381744385, + "learning_rate": 0.0001992508955626656, + "loss": 1.0366, + "step": 895 + }, + { + "epoch": 0.15954415954415954, + "grad_norm": 0.5064654350280762, + "learning_rate": 0.00019924918449260205, + "loss": 1.0729, + "step": 896 + }, + { + "epoch": 0.1597222222222222, + "grad_norm": 0.5019610524177551, + "learning_rate": 0.00019924747147795696, + "loss": 1.0642, + "step": 897 + }, + { + "epoch": 0.1599002849002849, + "grad_norm": 
0.4345671534538269, + "learning_rate": 0.00019924575651876378, + "loss": 1.1747, + "step": 898 + }, + { + "epoch": 0.16007834757834757, + "grad_norm": 0.4397568702697754, + "learning_rate": 0.0001992440396150562, + "loss": 1.282, + "step": 899 + }, + { + "epoch": 0.16025641025641027, + "grad_norm": 0.520187497138977, + "learning_rate": 0.0001992423207668678, + "loss": 0.976, + "step": 900 + }, + { + "epoch": 0.16043447293447294, + "grad_norm": 0.39329993724823, + "learning_rate": 0.0001992405999742323, + "loss": 0.9829, + "step": 901 + }, + { + "epoch": 0.1606125356125356, + "grad_norm": 0.42361345887184143, + "learning_rate": 0.00019923887723718339, + "loss": 1.139, + "step": 902 + }, + { + "epoch": 0.1607905982905983, + "grad_norm": 0.3846314251422882, + "learning_rate": 0.00019923715255575482, + "loss": 0.8262, + "step": 903 + }, + { + "epoch": 0.16096866096866097, + "grad_norm": 0.39258381724357605, + "learning_rate": 0.0001992354259299804, + "loss": 0.9638, + "step": 904 + }, + { + "epoch": 0.16114672364672364, + "grad_norm": 0.4000850319862366, + "learning_rate": 0.00019923369735989397, + "loss": 0.91, + "step": 905 + }, + { + "epoch": 0.16132478632478633, + "grad_norm": 0.46303513646125793, + "learning_rate": 0.00019923196684552936, + "loss": 1.1447, + "step": 906 + }, + { + "epoch": 0.161502849002849, + "grad_norm": 0.38437438011169434, + "learning_rate": 0.0001992302343869205, + "loss": 1.0212, + "step": 907 + }, + { + "epoch": 0.16168091168091167, + "grad_norm": 0.44585472345352173, + "learning_rate": 0.00019922849998410135, + "loss": 1.1964, + "step": 908 + }, + { + "epoch": 0.16185897435897437, + "grad_norm": 0.41959813237190247, + "learning_rate": 0.00019922676363710583, + "loss": 0.9925, + "step": 909 + }, + { + "epoch": 0.16203703703703703, + "grad_norm": 0.47442761063575745, + "learning_rate": 0.00019922502534596803, + "loss": 0.9237, + "step": 910 + }, + { + "epoch": 0.1622150997150997, + "grad_norm": 0.5065128207206726, + "learning_rate": 
0.00019922328511072198, + "loss": 1.2573, + "step": 911 + }, + { + "epoch": 0.1623931623931624, + "grad_norm": 0.4739879369735718, + "learning_rate": 0.0001992215429314018, + "loss": 1.4416, + "step": 912 + }, + { + "epoch": 0.16257122507122507, + "grad_norm": 0.48763832449913025, + "learning_rate": 0.00019921979880804157, + "loss": 1.0408, + "step": 913 + }, + { + "epoch": 0.16274928774928774, + "grad_norm": 0.4841614067554474, + "learning_rate": 0.0001992180527406755, + "loss": 1.1826, + "step": 914 + }, + { + "epoch": 0.16292735042735043, + "grad_norm": 0.49433308839797974, + "learning_rate": 0.0001992163047293378, + "loss": 1.3552, + "step": 915 + }, + { + "epoch": 0.1631054131054131, + "grad_norm": 0.4985002875328064, + "learning_rate": 0.0001992145547740627, + "loss": 1.2639, + "step": 916 + }, + { + "epoch": 0.1632834757834758, + "grad_norm": 0.40348032116889954, + "learning_rate": 0.00019921280287488448, + "loss": 1.1731, + "step": 917 + }, + { + "epoch": 0.16346153846153846, + "grad_norm": 0.5166002511978149, + "learning_rate": 0.0001992110490318375, + "loss": 1.0692, + "step": 918 + }, + { + "epoch": 0.16363960113960113, + "grad_norm": 0.44233468174934387, + "learning_rate": 0.00019920929324495615, + "loss": 1.0488, + "step": 919 + }, + { + "epoch": 0.16381766381766383, + "grad_norm": 0.43709903955459595, + "learning_rate": 0.00019920753551427476, + "loss": 0.8884, + "step": 920 + }, + { + "epoch": 0.1639957264957265, + "grad_norm": 0.4054167568683624, + "learning_rate": 0.00019920577583982778, + "loss": 0.9872, + "step": 921 + }, + { + "epoch": 0.16417378917378916, + "grad_norm": 0.4657362997531891, + "learning_rate": 0.0001992040142216497, + "loss": 1.4402, + "step": 922 + }, + { + "epoch": 0.16435185185185186, + "grad_norm": 0.42550426721572876, + "learning_rate": 0.0001992022506597751, + "loss": 1.0456, + "step": 923 + }, + { + "epoch": 0.16452991452991453, + "grad_norm": 0.49346762895584106, + "learning_rate": 0.00019920048515423842, + "loss": 1.527, 
+ "step": 924 + }, + { + "epoch": 0.1647079772079772, + "grad_norm": 0.3970337510108948, + "learning_rate": 0.0001991987177050743, + "loss": 1.0363, + "step": 925 + }, + { + "epoch": 0.1648860398860399, + "grad_norm": 0.4027378559112549, + "learning_rate": 0.0001991969483123174, + "loss": 0.8416, + "step": 926 + }, + { + "epoch": 0.16506410256410256, + "grad_norm": 0.4181644916534424, + "learning_rate": 0.00019919517697600237, + "loss": 1.2253, + "step": 927 + }, + { + "epoch": 0.16524216524216523, + "grad_norm": 0.43686383962631226, + "learning_rate": 0.0001991934036961639, + "loss": 1.0808, + "step": 928 + }, + { + "epoch": 0.16542022792022792, + "grad_norm": 0.4242876172065735, + "learning_rate": 0.0001991916284728367, + "loss": 0.9483, + "step": 929 + }, + { + "epoch": 0.1655982905982906, + "grad_norm": 0.3690609037876129, + "learning_rate": 0.00019918985130605563, + "loss": 0.9495, + "step": 930 + }, + { + "epoch": 0.1657763532763533, + "grad_norm": 0.42184555530548096, + "learning_rate": 0.00019918807219585546, + "loss": 1.0966, + "step": 931 + }, + { + "epoch": 0.16595441595441596, + "grad_norm": 0.4342746138572693, + "learning_rate": 0.00019918629114227106, + "loss": 1.0875, + "step": 932 + }, + { + "epoch": 0.16613247863247863, + "grad_norm": 0.4191494286060333, + "learning_rate": 0.00019918450814533737, + "loss": 1.0777, + "step": 933 + }, + { + "epoch": 0.16631054131054132, + "grad_norm": 0.37124550342559814, + "learning_rate": 0.00019918272320508922, + "loss": 1.0131, + "step": 934 + }, + { + "epoch": 0.166488603988604, + "grad_norm": 0.4475722014904022, + "learning_rate": 0.00019918093632156168, + "loss": 1.1185, + "step": 935 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 0.4629058241844177, + "learning_rate": 0.0001991791474947897, + "loss": 1.0353, + "step": 936 + }, + { + "epoch": 0.16684472934472935, + "grad_norm": 0.48192909359931946, + "learning_rate": 0.00019917735672480834, + "loss": 1.1628, + "step": 937 + }, + { + "epoch": 
0.16702279202279202, + "grad_norm": 0.5542252063751221, + "learning_rate": 0.00019917556401165273, + "loss": 1.3133, + "step": 938 + }, + { + "epoch": 0.1672008547008547, + "grad_norm": 0.4172651171684265, + "learning_rate": 0.00019917376935535796, + "loss": 1.1733, + "step": 939 + }, + { + "epoch": 0.16737891737891739, + "grad_norm": 0.4424920380115509, + "learning_rate": 0.0001991719727559592, + "loss": 1.0262, + "step": 940 + }, + { + "epoch": 0.16755698005698005, + "grad_norm": 0.4551742970943451, + "learning_rate": 0.00019917017421349162, + "loss": 1.0883, + "step": 941 + }, + { + "epoch": 0.16773504273504272, + "grad_norm": 0.45929640531539917, + "learning_rate": 0.00019916837372799048, + "loss": 1.1836, + "step": 942 + }, + { + "epoch": 0.16791310541310542, + "grad_norm": 0.4609353542327881, + "learning_rate": 0.0001991665712994911, + "loss": 1.0682, + "step": 943 + }, + { + "epoch": 0.16809116809116809, + "grad_norm": 0.42617303133010864, + "learning_rate": 0.00019916476692802873, + "loss": 1.074, + "step": 944 + }, + { + "epoch": 0.16826923076923078, + "grad_norm": 0.41919493675231934, + "learning_rate": 0.00019916296061363875, + "loss": 1.0969, + "step": 945 + }, + { + "epoch": 0.16844729344729345, + "grad_norm": 0.450979083776474, + "learning_rate": 0.00019916115235635656, + "loss": 1.1686, + "step": 946 + }, + { + "epoch": 0.16862535612535612, + "grad_norm": 0.42166751623153687, + "learning_rate": 0.00019915934215621758, + "loss": 0.9273, + "step": 947 + }, + { + "epoch": 0.16880341880341881, + "grad_norm": 0.4404160976409912, + "learning_rate": 0.00019915753001325729, + "loss": 1.1663, + "step": 948 + }, + { + "epoch": 0.16898148148148148, + "grad_norm": 0.42025226354599, + "learning_rate": 0.0001991557159275111, + "loss": 0.9433, + "step": 949 + }, + { + "epoch": 0.16915954415954415, + "grad_norm": 0.4277796745300293, + "learning_rate": 0.00019915389989901474, + "loss": 0.8475, + "step": 950 + }, + { + "epoch": 0.16933760683760685, + "grad_norm": 
0.5162755250930786, + "learning_rate": 0.00019915208192780365, + "loss": 1.1155, + "step": 951 + }, + { + "epoch": 0.16951566951566951, + "grad_norm": 0.4214856028556824, + "learning_rate": 0.00019915026201391346, + "loss": 1.173, + "step": 952 + }, + { + "epoch": 0.16969373219373218, + "grad_norm": 0.4713292419910431, + "learning_rate": 0.00019914844015737985, + "loss": 1.1615, + "step": 953 + }, + { + "epoch": 0.16987179487179488, + "grad_norm": 0.461179256439209, + "learning_rate": 0.00019914661635823854, + "loss": 1.1169, + "step": 954 + }, + { + "epoch": 0.17004985754985755, + "grad_norm": 0.46200552582740784, + "learning_rate": 0.00019914479061652527, + "loss": 1.0274, + "step": 955 + }, + { + "epoch": 0.17022792022792022, + "grad_norm": 0.40968334674835205, + "learning_rate": 0.00019914296293227572, + "loss": 1.066, + "step": 956 + }, + { + "epoch": 0.1704059829059829, + "grad_norm": 0.40877434611320496, + "learning_rate": 0.0001991411333055258, + "loss": 1.1595, + "step": 957 + }, + { + "epoch": 0.17058404558404558, + "grad_norm": 0.42940187454223633, + "learning_rate": 0.00019913930173631132, + "loss": 1.0364, + "step": 958 + }, + { + "epoch": 0.17076210826210828, + "grad_norm": 0.49648910760879517, + "learning_rate": 0.00019913746822466819, + "loss": 1.0763, + "step": 959 + }, + { + "epoch": 0.17094017094017094, + "grad_norm": 0.4353426396846771, + "learning_rate": 0.00019913563277063228, + "loss": 0.9698, + "step": 960 + }, + { + "epoch": 0.1711182336182336, + "grad_norm": 0.45079681277275085, + "learning_rate": 0.00019913379537423958, + "loss": 1.2244, + "step": 961 + }, + { + "epoch": 0.1712962962962963, + "grad_norm": 0.4276828467845917, + "learning_rate": 0.00019913195603552607, + "loss": 0.9976, + "step": 962 + }, + { + "epoch": 0.17147435897435898, + "grad_norm": 0.41122403740882874, + "learning_rate": 0.00019913011475452785, + "loss": 1.0077, + "step": 963 + }, + { + "epoch": 0.17165242165242164, + "grad_norm": 0.43170276284217834, + 
"learning_rate": 0.00019912827153128096, + "loss": 1.1402, + "step": 964 + }, + { + "epoch": 0.17183048433048434, + "grad_norm": 0.37950268387794495, + "learning_rate": 0.0001991264263658215, + "loss": 0.9818, + "step": 965 + }, + { + "epoch": 0.172008547008547, + "grad_norm": 0.477333128452301, + "learning_rate": 0.00019912457925818562, + "loss": 1.1756, + "step": 966 + }, + { + "epoch": 0.17218660968660968, + "grad_norm": 0.4326401352882385, + "learning_rate": 0.00019912273020840954, + "loss": 1.3718, + "step": 967 + }, + { + "epoch": 0.17236467236467237, + "grad_norm": 0.37711042165756226, + "learning_rate": 0.00019912087921652945, + "loss": 0.9011, + "step": 968 + }, + { + "epoch": 0.17254273504273504, + "grad_norm": 0.50013667345047, + "learning_rate": 0.00019911902628258162, + "loss": 1.1163, + "step": 969 + }, + { + "epoch": 0.1727207977207977, + "grad_norm": 0.41913339495658875, + "learning_rate": 0.0001991171714066024, + "loss": 1.2614, + "step": 970 + }, + { + "epoch": 0.1728988603988604, + "grad_norm": 0.4075855612754822, + "learning_rate": 0.00019911531458862813, + "loss": 0.8984, + "step": 971 + }, + { + "epoch": 0.17307692307692307, + "grad_norm": 0.40277954936027527, + "learning_rate": 0.00019911345582869513, + "loss": 1.0851, + "step": 972 + }, + { + "epoch": 0.17325498575498577, + "grad_norm": 0.4312847852706909, + "learning_rate": 0.00019911159512683987, + "loss": 1.1273, + "step": 973 + }, + { + "epoch": 0.17343304843304844, + "grad_norm": 0.40303611755371094, + "learning_rate": 0.0001991097324830988, + "loss": 0.9645, + "step": 974 + }, + { + "epoch": 0.1736111111111111, + "grad_norm": 0.45560577511787415, + "learning_rate": 0.00019910786789750838, + "loss": 1.0864, + "step": 975 + }, + { + "epoch": 0.1737891737891738, + "grad_norm": 0.43775680661201477, + "learning_rate": 0.00019910600137010517, + "loss": 1.028, + "step": 976 + }, + { + "epoch": 0.17396723646723647, + "grad_norm": 0.3917224407196045, + "learning_rate": 0.00019910413290092572, + 
"loss": 1.0491, + "step": 977 + }, + { + "epoch": 0.17414529914529914, + "grad_norm": 0.4068751037120819, + "learning_rate": 0.0001991022624900067, + "loss": 1.0476, + "step": 978 + }, + { + "epoch": 0.17432336182336183, + "grad_norm": 0.4463370144367218, + "learning_rate": 0.0001991003901373847, + "loss": 1.0612, + "step": 979 + }, + { + "epoch": 0.1745014245014245, + "grad_norm": 0.46949052810668945, + "learning_rate": 0.0001990985158430964, + "loss": 1.3099, + "step": 980 + }, + { + "epoch": 0.17467948717948717, + "grad_norm": 0.4250012934207916, + "learning_rate": 0.00019909663960717856, + "loss": 0.9903, + "step": 981 + }, + { + "epoch": 0.17485754985754987, + "grad_norm": 0.5293903946876526, + "learning_rate": 0.0001990947614296679, + "loss": 0.9908, + "step": 982 + }, + { + "epoch": 0.17503561253561253, + "grad_norm": 0.3838284909725189, + "learning_rate": 0.0001990928813106013, + "loss": 0.716, + "step": 983 + }, + { + "epoch": 0.1752136752136752, + "grad_norm": 0.4597751200199127, + "learning_rate": 0.0001990909992500155, + "loss": 1.0126, + "step": 984 + }, + { + "epoch": 0.1753917378917379, + "grad_norm": 0.4844081699848175, + "learning_rate": 0.0001990891152479474, + "loss": 1.1043, + "step": 985 + }, + { + "epoch": 0.17556980056980057, + "grad_norm": 0.4763399660587311, + "learning_rate": 0.00019908722930443392, + "loss": 1.019, + "step": 986 + }, + { + "epoch": 0.17574786324786323, + "grad_norm": 0.4670077860355377, + "learning_rate": 0.00019908534141951204, + "loss": 1.1382, + "step": 987 + }, + { + "epoch": 0.17592592592592593, + "grad_norm": 0.39372730255126953, + "learning_rate": 0.00019908345159321873, + "loss": 1.1219, + "step": 988 + }, + { + "epoch": 0.1761039886039886, + "grad_norm": 0.41869843006134033, + "learning_rate": 0.00019908155982559098, + "loss": 0.9461, + "step": 989 + }, + { + "epoch": 0.1762820512820513, + "grad_norm": 0.4398406147956848, + "learning_rate": 0.00019907966611666593, + "loss": 1.1328, + "step": 990 + }, + { + 
"epoch": 0.17646011396011396, + "grad_norm": 0.4315733015537262, + "learning_rate": 0.0001990777704664806, + "loss": 1.0974, + "step": 991 + }, + { + "epoch": 0.17663817663817663, + "grad_norm": 0.42859575152397156, + "learning_rate": 0.00019907587287507222, + "loss": 1.2637, + "step": 992 + }, + { + "epoch": 0.17681623931623933, + "grad_norm": 0.47928622364997864, + "learning_rate": 0.0001990739733424779, + "loss": 1.0699, + "step": 993 + }, + { + "epoch": 0.176994301994302, + "grad_norm": 0.4443826973438263, + "learning_rate": 0.00019907207186873488, + "loss": 1.0547, + "step": 994 + }, + { + "epoch": 0.17717236467236466, + "grad_norm": 0.4108099937438965, + "learning_rate": 0.00019907016845388043, + "loss": 1.1401, + "step": 995 + }, + { + "epoch": 0.17735042735042736, + "grad_norm": 0.4474675953388214, + "learning_rate": 0.00019906826309795182, + "loss": 1.0712, + "step": 996 + }, + { + "epoch": 0.17752849002849003, + "grad_norm": 0.4149756133556366, + "learning_rate": 0.00019906635580098638, + "loss": 0.9585, + "step": 997 + }, + { + "epoch": 0.1777065527065527, + "grad_norm": 0.4875968098640442, + "learning_rate": 0.00019906444656302152, + "loss": 1.0659, + "step": 998 + }, + { + "epoch": 0.1778846153846154, + "grad_norm": 0.5494784116744995, + "learning_rate": 0.0001990625353840946, + "loss": 1.2858, + "step": 999 + }, + { + "epoch": 0.17806267806267806, + "grad_norm": 0.425062358379364, + "learning_rate": 0.0001990606222642431, + "loss": 1.1826, + "step": 1000 + }, + { + "epoch": 0.17824074074074073, + "grad_norm": 0.3890725374221802, + "learning_rate": 0.00019905870720350445, + "loss": 0.9568, + "step": 1001 + }, + { + "epoch": 0.17841880341880342, + "grad_norm": 0.3884070813655853, + "learning_rate": 0.00019905679020191624, + "loss": 0.9674, + "step": 1002 + }, + { + "epoch": 0.1785968660968661, + "grad_norm": 0.49496129155158997, + "learning_rate": 0.00019905487125951597, + "loss": 0.9143, + "step": 1003 + }, + { + "epoch": 0.1787749287749288, + 
"grad_norm": 0.43448135256767273, + "learning_rate": 0.00019905295037634128, + "loss": 1.2677, + "step": 1004 + }, + { + "epoch": 0.17895299145299146, + "grad_norm": 0.47327905893325806, + "learning_rate": 0.00019905102755242982, + "loss": 0.9089, + "step": 1005 + }, + { + "epoch": 0.17913105413105412, + "grad_norm": 0.4962378442287445, + "learning_rate": 0.00019904910278781922, + "loss": 1.1748, + "step": 1006 + }, + { + "epoch": 0.17930911680911682, + "grad_norm": 0.4343934655189514, + "learning_rate": 0.0001990471760825472, + "loss": 1.2176, + "step": 1007 + }, + { + "epoch": 0.1794871794871795, + "grad_norm": 0.4695793092250824, + "learning_rate": 0.0001990452474366515, + "loss": 1.1822, + "step": 1008 + }, + { + "epoch": 0.17966524216524216, + "grad_norm": 0.4156060516834259, + "learning_rate": 0.00019904331685016995, + "loss": 0.8231, + "step": 1009 + }, + { + "epoch": 0.17984330484330485, + "grad_norm": 0.5068191885948181, + "learning_rate": 0.00019904138432314035, + "loss": 1.1363, + "step": 1010 + }, + { + "epoch": 0.18002136752136752, + "grad_norm": 0.5189786553382874, + "learning_rate": 0.00019903944985560058, + "loss": 1.3131, + "step": 1011 + }, + { + "epoch": 0.1801994301994302, + "grad_norm": 0.5126828551292419, + "learning_rate": 0.00019903751344758848, + "loss": 1.0305, + "step": 1012 + }, + { + "epoch": 0.18037749287749288, + "grad_norm": 0.41045933961868286, + "learning_rate": 0.00019903557509914205, + "loss": 1.2726, + "step": 1013 + }, + { + "epoch": 0.18055555555555555, + "grad_norm": 0.4141713082790375, + "learning_rate": 0.0001990336348102993, + "loss": 0.9606, + "step": 1014 + }, + { + "epoch": 0.18073361823361822, + "grad_norm": 0.42652079463005066, + "learning_rate": 0.00019903169258109812, + "loss": 1.0235, + "step": 1015 + }, + { + "epoch": 0.18091168091168092, + "grad_norm": 0.42098379135131836, + "learning_rate": 0.0001990297484115767, + "loss": 1.0602, + "step": 1016 + }, + { + "epoch": 0.18108974358974358, + "grad_norm": 
0.49920013546943665, + "learning_rate": 0.0001990278023017731, + "loss": 1.3322, + "step": 1017 + }, + { + "epoch": 0.18126780626780628, + "grad_norm": 0.412304550409317, + "learning_rate": 0.00019902585425172537, + "loss": 1.1011, + "step": 1018 + }, + { + "epoch": 0.18144586894586895, + "grad_norm": 0.44226935505867004, + "learning_rate": 0.00019902390426147177, + "loss": 0.9777, + "step": 1019 + }, + { + "epoch": 0.18162393162393162, + "grad_norm": 0.4685269594192505, + "learning_rate": 0.00019902195233105046, + "loss": 1.3587, + "step": 1020 + }, + { + "epoch": 0.1818019943019943, + "grad_norm": 0.4500584304332733, + "learning_rate": 0.00019901999846049968, + "loss": 0.9888, + "step": 1021 + }, + { + "epoch": 0.18198005698005698, + "grad_norm": 0.48566994071006775, + "learning_rate": 0.00019901804264985774, + "loss": 1.2364, + "step": 1022 + }, + { + "epoch": 0.18215811965811965, + "grad_norm": 0.4063156247138977, + "learning_rate": 0.00019901608489916294, + "loss": 1.2224, + "step": 1023 + }, + { + "epoch": 0.18233618233618235, + "grad_norm": 0.471276193857193, + "learning_rate": 0.00019901412520845367, + "loss": 0.9926, + "step": 1024 + }, + { + "epoch": 0.182514245014245, + "grad_norm": 0.5165421366691589, + "learning_rate": 0.00019901216357776829, + "loss": 0.9595, + "step": 1025 + }, + { + "epoch": 0.18269230769230768, + "grad_norm": 0.4746754467487335, + "learning_rate": 0.0001990102000071452, + "loss": 1.2057, + "step": 1026 + }, + { + "epoch": 0.18287037037037038, + "grad_norm": 0.44803035259246826, + "learning_rate": 0.00019900823449662297, + "loss": 1.2114, + "step": 1027 + }, + { + "epoch": 0.18304843304843305, + "grad_norm": 0.47256240248680115, + "learning_rate": 0.00019900626704624005, + "loss": 1.112, + "step": 1028 + }, + { + "epoch": 0.18322649572649571, + "grad_norm": 0.4253387153148651, + "learning_rate": 0.000199004297656035, + "loss": 0.9899, + "step": 1029 + }, + { + "epoch": 0.1834045584045584, + "grad_norm": 0.44958099722862244, + 
"learning_rate": 0.00019900232632604636, + "loss": 1.1445, + "step": 1030 + }, + { + "epoch": 0.18358262108262108, + "grad_norm": 0.5296537280082703, + "learning_rate": 0.00019900035305631285, + "loss": 1.2502, + "step": 1031 + }, + { + "epoch": 0.18376068376068377, + "grad_norm": 0.5057148933410645, + "learning_rate": 0.00019899837784687302, + "loss": 1.1426, + "step": 1032 + }, + { + "epoch": 0.18393874643874644, + "grad_norm": 0.41463762521743774, + "learning_rate": 0.00019899640069776566, + "loss": 1.1854, + "step": 1033 + }, + { + "epoch": 0.1841168091168091, + "grad_norm": 0.45800045132637024, + "learning_rate": 0.00019899442160902945, + "loss": 1.2438, + "step": 1034 + }, + { + "epoch": 0.1842948717948718, + "grad_norm": 0.43450453877449036, + "learning_rate": 0.00019899244058070324, + "loss": 1.0598, + "step": 1035 + }, + { + "epoch": 0.18447293447293447, + "grad_norm": 0.4141148626804352, + "learning_rate": 0.00019899045761282577, + "loss": 1.0465, + "step": 1036 + }, + { + "epoch": 0.18465099715099714, + "grad_norm": 0.3938458263874054, + "learning_rate": 0.0001989884727054359, + "loss": 1.0142, + "step": 1037 + }, + { + "epoch": 0.18482905982905984, + "grad_norm": 0.43898263573646545, + "learning_rate": 0.00019898648585857257, + "loss": 0.9212, + "step": 1038 + }, + { + "epoch": 0.1850071225071225, + "grad_norm": 0.4425487816333771, + "learning_rate": 0.00019898449707227465, + "loss": 1.2987, + "step": 1039 + }, + { + "epoch": 0.18518518518518517, + "grad_norm": 0.4537975490093231, + "learning_rate": 0.00019898250634658115, + "loss": 1.2023, + "step": 1040 + }, + { + "epoch": 0.18536324786324787, + "grad_norm": 0.4107198119163513, + "learning_rate": 0.00019898051368153104, + "loss": 0.8443, + "step": 1041 + }, + { + "epoch": 0.18554131054131054, + "grad_norm": 0.4389404058456421, + "learning_rate": 0.0001989785190771634, + "loss": 1.0502, + "step": 1042 + }, + { + "epoch": 0.1857193732193732, + "grad_norm": 0.4288824796676636, + "learning_rate": 
0.00019897652253351726, + "loss": 1.01, + "step": 1043 + }, + { + "epoch": 0.1858974358974359, + "grad_norm": 0.50815349817276, + "learning_rate": 0.00019897452405063178, + "loss": 1.0308, + "step": 1044 + }, + { + "epoch": 0.18607549857549857, + "grad_norm": 0.45252710580825806, + "learning_rate": 0.0001989725236285461, + "loss": 1.0967, + "step": 1045 + }, + { + "epoch": 0.18625356125356127, + "grad_norm": 0.45049402117729187, + "learning_rate": 0.00019897052126729943, + "loss": 1.0141, + "step": 1046 + }, + { + "epoch": 0.18643162393162394, + "grad_norm": 0.49637508392333984, + "learning_rate": 0.00019896851696693098, + "loss": 1.0997, + "step": 1047 + }, + { + "epoch": 0.1866096866096866, + "grad_norm": 0.4465886056423187, + "learning_rate": 0.00019896651072748005, + "loss": 1.1415, + "step": 1048 + }, + { + "epoch": 0.1867877492877493, + "grad_norm": 0.5309500694274902, + "learning_rate": 0.00019896450254898592, + "loss": 1.1028, + "step": 1049 + }, + { + "epoch": 0.18696581196581197, + "grad_norm": 0.3516653776168823, + "learning_rate": 0.00019896249243148793, + "loss": 0.9841, + "step": 1050 + }, + { + "epoch": 0.18714387464387464, + "grad_norm": 0.4529176950454712, + "learning_rate": 0.0001989604803750255, + "loss": 1.1335, + "step": 1051 + }, + { + "epoch": 0.18732193732193733, + "grad_norm": 0.47694942355155945, + "learning_rate": 0.000198958466379638, + "loss": 1.2383, + "step": 1052 + }, + { + "epoch": 0.1875, + "grad_norm": 0.5524206757545471, + "learning_rate": 0.0001989564504453649, + "loss": 1.3668, + "step": 1053 + }, + { + "epoch": 0.18767806267806267, + "grad_norm": 0.39203691482543945, + "learning_rate": 0.00019895443257224576, + "loss": 1.2203, + "step": 1054 + }, + { + "epoch": 0.18785612535612536, + "grad_norm": 0.4164120852947235, + "learning_rate": 0.00019895241276032005, + "loss": 0.8954, + "step": 1055 + }, + { + "epoch": 0.18803418803418803, + "grad_norm": 0.41217970848083496, + "learning_rate": 0.0001989503910096274, + "loss": 1.0238, + 
"step": 1056 + }, + { + "epoch": 0.1882122507122507, + "grad_norm": 0.44038307666778564, + "learning_rate": 0.00019894836732020735, + "loss": 0.8159, + "step": 1057 + }, + { + "epoch": 0.1883903133903134, + "grad_norm": 0.45780670642852783, + "learning_rate": 0.0001989463416920996, + "loss": 1.2864, + "step": 1058 + }, + { + "epoch": 0.18856837606837606, + "grad_norm": 0.5197559595108032, + "learning_rate": 0.00019894431412534384, + "loss": 1.0756, + "step": 1059 + }, + { + "epoch": 0.18874643874643873, + "grad_norm": 0.43283385038375854, + "learning_rate": 0.00019894228461997979, + "loss": 1.0642, + "step": 1060 + }, + { + "epoch": 0.18892450142450143, + "grad_norm": 0.4657376706600189, + "learning_rate": 0.00019894025317604717, + "loss": 1.1159, + "step": 1061 + }, + { + "epoch": 0.1891025641025641, + "grad_norm": 0.4474908113479614, + "learning_rate": 0.00019893821979358588, + "loss": 1.2006, + "step": 1062 + }, + { + "epoch": 0.1892806267806268, + "grad_norm": 0.43878164887428284, + "learning_rate": 0.00019893618447263566, + "loss": 1.1599, + "step": 1063 + }, + { + "epoch": 0.18945868945868946, + "grad_norm": 0.4598735272884369, + "learning_rate": 0.00019893414721323645, + "loss": 1.3346, + "step": 1064 + }, + { + "epoch": 0.18963675213675213, + "grad_norm": 0.3947420120239258, + "learning_rate": 0.00019893210801542812, + "loss": 1.1201, + "step": 1065 + }, + { + "epoch": 0.18981481481481483, + "grad_norm": 0.3401558995246887, + "learning_rate": 0.00019893006687925064, + "loss": 0.7568, + "step": 1066 + }, + { + "epoch": 0.1899928774928775, + "grad_norm": 0.4400341808795929, + "learning_rate": 0.00019892802380474405, + "loss": 1.1706, + "step": 1067 + }, + { + "epoch": 0.19017094017094016, + "grad_norm": 0.42394164204597473, + "learning_rate": 0.00019892597879194829, + "loss": 1.0163, + "step": 1068 + }, + { + "epoch": 0.19034900284900286, + "grad_norm": 0.42904096841812134, + "learning_rate": 0.00019892393184090353, + "loss": 0.9193, + "step": 1069 + }, + { + 
"epoch": 0.19052706552706553, + "grad_norm": 0.497601181268692, + "learning_rate": 0.00019892188295164977, + "loss": 1.0377, + "step": 1070 + }, + { + "epoch": 0.1907051282051282, + "grad_norm": 0.4536020755767822, + "learning_rate": 0.00019891983212422723, + "loss": 1.0946, + "step": 1071 + }, + { + "epoch": 0.1908831908831909, + "grad_norm": 0.44916942715644836, + "learning_rate": 0.00019891777935867607, + "loss": 1.0563, + "step": 1072 + }, + { + "epoch": 0.19106125356125356, + "grad_norm": 0.4256889820098877, + "learning_rate": 0.0001989157246550365, + "loss": 1.0988, + "step": 1073 + }, + { + "epoch": 0.19123931623931623, + "grad_norm": 0.5559163689613342, + "learning_rate": 0.0001989136680133488, + "loss": 0.9155, + "step": 1074 + }, + { + "epoch": 0.19141737891737892, + "grad_norm": 0.391804963350296, + "learning_rate": 0.00019891160943365322, + "loss": 0.9314, + "step": 1075 + }, + { + "epoch": 0.1915954415954416, + "grad_norm": 0.4535716474056244, + "learning_rate": 0.00019890954891599015, + "loss": 1.0768, + "step": 1076 + }, + { + "epoch": 0.19177350427350429, + "grad_norm": 0.46770521998405457, + "learning_rate": 0.00019890748646039991, + "loss": 0.8406, + "step": 1077 + }, + { + "epoch": 0.19195156695156695, + "grad_norm": 0.4875394403934479, + "learning_rate": 0.00019890542206692295, + "loss": 1.1055, + "step": 1078 + }, + { + "epoch": 0.19212962962962962, + "grad_norm": 0.5072727203369141, + "learning_rate": 0.0001989033557355997, + "loss": 1.3093, + "step": 1079 + }, + { + "epoch": 0.19230769230769232, + "grad_norm": 0.4419287443161011, + "learning_rate": 0.00019890128746647068, + "loss": 1.1916, + "step": 1080 + }, + { + "epoch": 0.192485754985755, + "grad_norm": 0.45803651213645935, + "learning_rate": 0.00019889921725957637, + "loss": 1.2579, + "step": 1081 + }, + { + "epoch": 0.19266381766381765, + "grad_norm": 0.4832262098789215, + "learning_rate": 0.0001988971451149573, + "loss": 1.3217, + "step": 1082 + }, + { + "epoch": 0.19284188034188035, + 
"grad_norm": 0.4819786250591278, + "learning_rate": 0.00019889507103265416, + "loss": 1.0979, + "step": 1083 + }, + { + "epoch": 0.19301994301994302, + "grad_norm": 0.49360713362693787, + "learning_rate": 0.0001988929950127075, + "loss": 1.0987, + "step": 1084 + }, + { + "epoch": 0.1931980056980057, + "grad_norm": 0.44209200143814087, + "learning_rate": 0.00019889091705515806, + "loss": 1.2616, + "step": 1085 + }, + { + "epoch": 0.19337606837606838, + "grad_norm": 0.41626206040382385, + "learning_rate": 0.00019888883716004654, + "loss": 1.0922, + "step": 1086 + }, + { + "epoch": 0.19355413105413105, + "grad_norm": 0.4916635751724243, + "learning_rate": 0.00019888675532741366, + "loss": 0.9331, + "step": 1087 + }, + { + "epoch": 0.19373219373219372, + "grad_norm": 0.4493125379085541, + "learning_rate": 0.00019888467155730025, + "loss": 1.1261, + "step": 1088 + }, + { + "epoch": 0.19391025641025642, + "grad_norm": 0.3755671977996826, + "learning_rate": 0.00019888258584974708, + "loss": 0.9821, + "step": 1089 + }, + { + "epoch": 0.19408831908831908, + "grad_norm": 0.41917556524276733, + "learning_rate": 0.00019888049820479507, + "loss": 1.251, + "step": 1090 + }, + { + "epoch": 0.19426638176638178, + "grad_norm": 0.46184420585632324, + "learning_rate": 0.0001988784086224851, + "loss": 1.1731, + "step": 1091 + }, + { + "epoch": 0.19444444444444445, + "grad_norm": 0.4783691465854645, + "learning_rate": 0.00019887631710285812, + "loss": 1.1635, + "step": 1092 + }, + { + "epoch": 0.19462250712250712, + "grad_norm": 0.4710482060909271, + "learning_rate": 0.00019887422364595512, + "loss": 1.0229, + "step": 1093 + }, + { + "epoch": 0.1948005698005698, + "grad_norm": 0.4738706648349762, + "learning_rate": 0.00019887212825181707, + "loss": 1.128, + "step": 1094 + }, + { + "epoch": 0.19497863247863248, + "grad_norm": 0.45665010809898376, + "learning_rate": 0.00019887003092048508, + "loss": 1.0425, + "step": 1095 + }, + { + "epoch": 0.19515669515669515, + "grad_norm": 
0.42740485072135925, + "learning_rate": 0.0001988679316520002, + "loss": 1.0738, + "step": 1096 + }, + { + "epoch": 0.19533475783475784, + "grad_norm": 0.5977092385292053, + "learning_rate": 0.0001988658304464036, + "loss": 1.2687, + "step": 1097 + }, + { + "epoch": 0.1955128205128205, + "grad_norm": 0.4411074221134186, + "learning_rate": 0.0001988637273037364, + "loss": 1.287, + "step": 1098 + }, + { + "epoch": 0.19569088319088318, + "grad_norm": 0.4409518539905548, + "learning_rate": 0.00019886162222403986, + "loss": 1.0515, + "step": 1099 + }, + { + "epoch": 0.19586894586894588, + "grad_norm": 0.4926736652851105, + "learning_rate": 0.0001988595152073552, + "loss": 1.1388, + "step": 1100 + }, + { + "epoch": 0.19604700854700854, + "grad_norm": 0.4607115387916565, + "learning_rate": 0.00019885740625372368, + "loss": 0.9803, + "step": 1101 + }, + { + "epoch": 0.1962250712250712, + "grad_norm": 0.4725342094898224, + "learning_rate": 0.0001988552953631867, + "loss": 1.199, + "step": 1102 + }, + { + "epoch": 0.1964031339031339, + "grad_norm": 0.48014503717422485, + "learning_rate": 0.00019885318253578548, + "loss": 1.1868, + "step": 1103 + }, + { + "epoch": 0.19658119658119658, + "grad_norm": 0.3872644603252411, + "learning_rate": 0.00019885106777156155, + "loss": 0.9182, + "step": 1104 + }, + { + "epoch": 0.19675925925925927, + "grad_norm": 0.4737720787525177, + "learning_rate": 0.00019884895107055627, + "loss": 1.1513, + "step": 1105 + }, + { + "epoch": 0.19693732193732194, + "grad_norm": 0.4144562780857086, + "learning_rate": 0.00019884683243281116, + "loss": 1.1711, + "step": 1106 + }, + { + "epoch": 0.1971153846153846, + "grad_norm": 0.4672079384326935, + "learning_rate": 0.00019884471185836769, + "loss": 1.0386, + "step": 1107 + }, + { + "epoch": 0.1972934472934473, + "grad_norm": 0.4558824598789215, + "learning_rate": 0.0001988425893472674, + "loss": 1.0535, + "step": 1108 + }, + { + "epoch": 0.19747150997150997, + "grad_norm": 0.5149834752082825, + 
"learning_rate": 0.00019884046489955192, + "loss": 1.0296, + "step": 1109 + }, + { + "epoch": 0.19764957264957264, + "grad_norm": 0.43444496393203735, + "learning_rate": 0.00019883833851526287, + "loss": 1.1475, + "step": 1110 + }, + { + "epoch": 0.19782763532763534, + "grad_norm": 0.46062374114990234, + "learning_rate": 0.00019883621019444188, + "loss": 1.183, + "step": 1111 + }, + { + "epoch": 0.198005698005698, + "grad_norm": 0.4893282949924469, + "learning_rate": 0.00019883407993713065, + "loss": 1.3733, + "step": 1112 + }, + { + "epoch": 0.19818376068376067, + "grad_norm": 0.5434843897819519, + "learning_rate": 0.00019883194774337096, + "loss": 1.2505, + "step": 1113 + }, + { + "epoch": 0.19836182336182337, + "grad_norm": 0.4698035418987274, + "learning_rate": 0.00019882981361320456, + "loss": 1.0152, + "step": 1114 + }, + { + "epoch": 0.19853988603988604, + "grad_norm": 0.4582163989543915, + "learning_rate": 0.00019882767754667325, + "loss": 1.1718, + "step": 1115 + }, + { + "epoch": 0.1987179487179487, + "grad_norm": 0.48744696378707886, + "learning_rate": 0.0001988255395438189, + "loss": 1.2923, + "step": 1116 + }, + { + "epoch": 0.1988960113960114, + "grad_norm": 0.4172030985355377, + "learning_rate": 0.0001988233996046834, + "loss": 0.8098, + "step": 1117 + }, + { + "epoch": 0.19907407407407407, + "grad_norm": 0.4556557834148407, + "learning_rate": 0.00019882125772930867, + "loss": 0.9654, + "step": 1118 + }, + { + "epoch": 0.19925213675213677, + "grad_norm": 0.4363219141960144, + "learning_rate": 0.00019881911391773666, + "loss": 1.0333, + "step": 1119 + }, + { + "epoch": 0.19943019943019943, + "grad_norm": 0.4336536228656769, + "learning_rate": 0.0001988169681700094, + "loss": 1.091, + "step": 1120 + }, + { + "epoch": 0.1996082621082621, + "grad_norm": 0.42073166370391846, + "learning_rate": 0.00019881482048616893, + "loss": 0.9687, + "step": 1121 + }, + { + "epoch": 0.1997863247863248, + "grad_norm": 0.4330587685108185, + "learning_rate": 
0.00019881267086625733, + "loss": 1.0512, + "step": 1122 + }, + { + "epoch": 0.19996438746438747, + "grad_norm": 0.4602276682853699, + "learning_rate": 0.0001988105193103167, + "loss": 1.1806, + "step": 1123 + }, + { + "epoch": 0.20014245014245013, + "grad_norm": 0.4271257817745209, + "learning_rate": 0.0001988083658183892, + "loss": 1.1079, + "step": 1124 + }, + { + "epoch": 0.20032051282051283, + "grad_norm": 0.35446426272392273, + "learning_rate": 0.00019880621039051707, + "loss": 0.6769, + "step": 1125 + }, + { + "epoch": 0.2004985754985755, + "grad_norm": 0.413753479719162, + "learning_rate": 0.00019880405302674244, + "loss": 1.1088, + "step": 1126 + }, + { + "epoch": 0.20067663817663817, + "grad_norm": 0.4423675835132599, + "learning_rate": 0.00019880189372710767, + "loss": 1.1371, + "step": 1127 + }, + { + "epoch": 0.20085470085470086, + "grad_norm": 0.41865605115890503, + "learning_rate": 0.00019879973249165502, + "loss": 1.0027, + "step": 1128 + }, + { + "epoch": 0.20103276353276353, + "grad_norm": 0.4109594225883484, + "learning_rate": 0.00019879756932042686, + "loss": 0.8734, + "step": 1129 + }, + { + "epoch": 0.2012108262108262, + "grad_norm": 0.42326363921165466, + "learning_rate": 0.00019879540421346555, + "loss": 0.9722, + "step": 1130 + }, + { + "epoch": 0.2013888888888889, + "grad_norm": 0.4601542055606842, + "learning_rate": 0.00019879323717081354, + "loss": 1.1251, + "step": 1131 + }, + { + "epoch": 0.20156695156695156, + "grad_norm": 0.4704367518424988, + "learning_rate": 0.00019879106819251327, + "loss": 0.9457, + "step": 1132 + }, + { + "epoch": 0.20174501424501423, + "grad_norm": 0.465023934841156, + "learning_rate": 0.00019878889727860724, + "loss": 0.9633, + "step": 1133 + }, + { + "epoch": 0.20192307692307693, + "grad_norm": 0.4572450518608093, + "learning_rate": 0.00019878672442913796, + "loss": 1.1965, + "step": 1134 + }, + { + "epoch": 0.2021011396011396, + "grad_norm": 0.4323410391807556, + "learning_rate": 0.00019878454964414807, + 
"loss": 1.1296, + "step": 1135 + }, + { + "epoch": 0.2022792022792023, + "grad_norm": 0.4513751268386841, + "learning_rate": 0.00019878237292368013, + "loss": 1.0571, + "step": 1136 + }, + { + "epoch": 0.20245726495726496, + "grad_norm": 0.45504096150398254, + "learning_rate": 0.00019878019426777677, + "loss": 1.0316, + "step": 1137 + }, + { + "epoch": 0.20263532763532763, + "grad_norm": 0.45715275406837463, + "learning_rate": 0.0001987780136764807, + "loss": 1.0528, + "step": 1138 + }, + { + "epoch": 0.20281339031339032, + "grad_norm": 0.4934465289115906, + "learning_rate": 0.00019877583114983466, + "loss": 1.3238, + "step": 1139 + }, + { + "epoch": 0.202991452991453, + "grad_norm": 0.4304082989692688, + "learning_rate": 0.0001987736466878814, + "loss": 1.1774, + "step": 1140 + }, + { + "epoch": 0.20316951566951566, + "grad_norm": 0.49721968173980713, + "learning_rate": 0.00019877146029066372, + "loss": 1.1767, + "step": 1141 + }, + { + "epoch": 0.20334757834757836, + "grad_norm": 0.3629468083381653, + "learning_rate": 0.00019876927195822445, + "loss": 0.8588, + "step": 1142 + }, + { + "epoch": 0.20352564102564102, + "grad_norm": 0.49310383200645447, + "learning_rate": 0.00019876708169060648, + "loss": 1.0588, + "step": 1143 + }, + { + "epoch": 0.2037037037037037, + "grad_norm": 0.4270328879356384, + "learning_rate": 0.00019876488948785271, + "loss": 1.1523, + "step": 1144 + }, + { + "epoch": 0.2038817663817664, + "grad_norm": 0.4559730887413025, + "learning_rate": 0.0001987626953500061, + "loss": 1.1736, + "step": 1145 + }, + { + "epoch": 0.20405982905982906, + "grad_norm": 0.5335259437561035, + "learning_rate": 0.00019876049927710962, + "loss": 0.991, + "step": 1146 + }, + { + "epoch": 0.20423789173789172, + "grad_norm": 0.43500083684921265, + "learning_rate": 0.0001987583012692063, + "loss": 1.0631, + "step": 1147 + }, + { + "epoch": 0.20441595441595442, + "grad_norm": 0.4135417938232422, + "learning_rate": 0.00019875610132633927, + "loss": 1.0896, + "step": 
1148 + }, + { + "epoch": 0.2045940170940171, + "grad_norm": 0.4078896641731262, + "learning_rate": 0.00019875389944855153, + "loss": 1.0395, + "step": 1149 + }, + { + "epoch": 0.20477207977207978, + "grad_norm": 0.46612194180488586, + "learning_rate": 0.00019875169563588632, + "loss": 1.0541, + "step": 1150 + }, + { + "epoch": 0.20495014245014245, + "grad_norm": 0.5093224048614502, + "learning_rate": 0.00019874948988838674, + "loss": 1.1486, + "step": 1151 + }, + { + "epoch": 0.20512820512820512, + "grad_norm": 0.5079755187034607, + "learning_rate": 0.00019874728220609607, + "loss": 1.2614, + "step": 1152 + }, + { + "epoch": 0.20530626780626782, + "grad_norm": 0.43663498759269714, + "learning_rate": 0.0001987450725890575, + "loss": 1.0683, + "step": 1153 + }, + { + "epoch": 0.20548433048433049, + "grad_norm": 0.5029327273368835, + "learning_rate": 0.00019874286103731435, + "loss": 1.1934, + "step": 1154 + }, + { + "epoch": 0.20566239316239315, + "grad_norm": 0.48770397901535034, + "learning_rate": 0.00019874064755090999, + "loss": 1.1634, + "step": 1155 + }, + { + "epoch": 0.20584045584045585, + "grad_norm": 0.46826690435409546, + "learning_rate": 0.00019873843212988776, + "loss": 1.0621, + "step": 1156 + }, + { + "epoch": 0.20601851851851852, + "grad_norm": 0.4810047149658203, + "learning_rate": 0.00019873621477429105, + "loss": 1.0879, + "step": 1157 + }, + { + "epoch": 0.20619658119658119, + "grad_norm": 0.4769522249698639, + "learning_rate": 0.00019873399548416335, + "loss": 1.1365, + "step": 1158 + }, + { + "epoch": 0.20637464387464388, + "grad_norm": 0.4221782982349396, + "learning_rate": 0.00019873177425954806, + "loss": 1.1168, + "step": 1159 + }, + { + "epoch": 0.20655270655270655, + "grad_norm": 0.4084923565387726, + "learning_rate": 0.00019872955110048876, + "loss": 1.2364, + "step": 1160 + }, + { + "epoch": 0.20673076923076922, + "grad_norm": 0.4781704545021057, + "learning_rate": 0.00019872732600702904, + "loss": 1.19, + "step": 1161 + }, + { + 
"epoch": 0.20690883190883191, + "grad_norm": 0.3984242081642151, + "learning_rate": 0.0001987250989792124, + "loss": 1.0568, + "step": 1162 + }, + { + "epoch": 0.20708689458689458, + "grad_norm": 0.4601972997188568, + "learning_rate": 0.00019872287001708257, + "loss": 1.1625, + "step": 1163 + }, + { + "epoch": 0.20726495726495728, + "grad_norm": 0.4853581190109253, + "learning_rate": 0.00019872063912068316, + "loss": 1.2304, + "step": 1164 + }, + { + "epoch": 0.20744301994301995, + "grad_norm": 0.41779839992523193, + "learning_rate": 0.0001987184062900579, + "loss": 0.9807, + "step": 1165 + }, + { + "epoch": 0.20762108262108261, + "grad_norm": 0.4945356249809265, + "learning_rate": 0.00019871617152525056, + "loss": 1.1861, + "step": 1166 + }, + { + "epoch": 0.2077991452991453, + "grad_norm": 0.47432294487953186, + "learning_rate": 0.00019871393482630487, + "loss": 1.1448, + "step": 1167 + }, + { + "epoch": 0.20797720797720798, + "grad_norm": 0.44647398591041565, + "learning_rate": 0.00019871169619326473, + "loss": 1.096, + "step": 1168 + }, + { + "epoch": 0.20815527065527065, + "grad_norm": 0.4643072783946991, + "learning_rate": 0.00019870945562617393, + "loss": 1.1561, + "step": 1169 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.4544340968132019, + "learning_rate": 0.0001987072131250764, + "loss": 1.0764, + "step": 1170 + }, + { + "epoch": 0.208511396011396, + "grad_norm": 0.6036561727523804, + "learning_rate": 0.00019870496869001607, + "loss": 1.3961, + "step": 1171 + }, + { + "epoch": 0.20868945868945868, + "grad_norm": 0.41348758339881897, + "learning_rate": 0.00019870272232103695, + "loss": 1.2219, + "step": 1172 + }, + { + "epoch": 0.20886752136752137, + "grad_norm": 0.4184056222438812, + "learning_rate": 0.000198700474018183, + "loss": 1.1115, + "step": 1173 + }, + { + "epoch": 0.20904558404558404, + "grad_norm": 0.41920599341392517, + "learning_rate": 0.0001986982237814983, + "loss": 0.9207, + "step": 1174 + }, + { + "epoch": 0.2092236467236467, 
+ "grad_norm": 0.4710249602794647, + "learning_rate": 0.00019869597161102694, + "loss": 1.1342, + "step": 1175 + }, + { + "epoch": 0.2094017094017094, + "grad_norm": 0.46897777915000916, + "learning_rate": 0.000198693717506813, + "loss": 0.983, + "step": 1176 + }, + { + "epoch": 0.20957977207977208, + "grad_norm": 0.4817039370536804, + "learning_rate": 0.00019869146146890074, + "loss": 1.0923, + "step": 1177 + }, + { + "epoch": 0.20975783475783477, + "grad_norm": 0.4806751012802124, + "learning_rate": 0.00019868920349733427, + "loss": 1.2296, + "step": 1178 + }, + { + "epoch": 0.20993589743589744, + "grad_norm": 0.44182994961738586, + "learning_rate": 0.0001986869435921579, + "loss": 1.1856, + "step": 1179 + }, + { + "epoch": 0.2101139601139601, + "grad_norm": 0.4282805621623993, + "learning_rate": 0.00019868468175341584, + "loss": 1.0046, + "step": 1180 + }, + { + "epoch": 0.2102920227920228, + "grad_norm": 0.5011838674545288, + "learning_rate": 0.00019868241798115242, + "loss": 1.2401, + "step": 1181 + }, + { + "epoch": 0.21047008547008547, + "grad_norm": 0.4282447397708893, + "learning_rate": 0.00019868015227541208, + "loss": 0.9338, + "step": 1182 + }, + { + "epoch": 0.21064814814814814, + "grad_norm": 0.4348810911178589, + "learning_rate": 0.00019867788463623912, + "loss": 0.926, + "step": 1183 + }, + { + "epoch": 0.21082621082621084, + "grad_norm": 0.41518425941467285, + "learning_rate": 0.00019867561506367799, + "loss": 1.2723, + "step": 1184 + }, + { + "epoch": 0.2110042735042735, + "grad_norm": 0.47346001863479614, + "learning_rate": 0.00019867334355777315, + "loss": 1.1931, + "step": 1185 + }, + { + "epoch": 0.21118233618233617, + "grad_norm": 0.4071715474128723, + "learning_rate": 0.00019867107011856914, + "loss": 0.9619, + "step": 1186 + }, + { + "epoch": 0.21136039886039887, + "grad_norm": 0.4803447425365448, + "learning_rate": 0.00019866879474611046, + "loss": 1.2, + "step": 1187 + }, + { + "epoch": 0.21153846153846154, + "grad_norm": 
0.4827699661254883, + "learning_rate": 0.00019866651744044172, + "loss": 1.0938, + "step": 1188 + }, + { + "epoch": 0.2117165242165242, + "grad_norm": 0.4528424143791199, + "learning_rate": 0.00019866423820160756, + "loss": 0.9721, + "step": 1189 + }, + { + "epoch": 0.2118945868945869, + "grad_norm": 0.43566834926605225, + "learning_rate": 0.0001986619570296526, + "loss": 1.0352, + "step": 1190 + }, + { + "epoch": 0.21207264957264957, + "grad_norm": 0.4516540467739105, + "learning_rate": 0.0001986596739246215, + "loss": 1.1333, + "step": 1191 + }, + { + "epoch": 0.21225071225071226, + "grad_norm": 0.4456641376018524, + "learning_rate": 0.00019865738888655908, + "loss": 1.2813, + "step": 1192 + }, + { + "epoch": 0.21242877492877493, + "grad_norm": 0.47048309445381165, + "learning_rate": 0.00019865510191551008, + "loss": 1.1067, + "step": 1193 + }, + { + "epoch": 0.2126068376068376, + "grad_norm": 0.4604061543941498, + "learning_rate": 0.00019865281301151928, + "loss": 0.925, + "step": 1194 + }, + { + "epoch": 0.2127849002849003, + "grad_norm": 0.49341437220573425, + "learning_rate": 0.00019865052217463153, + "loss": 1.2319, + "step": 1195 + }, + { + "epoch": 0.21296296296296297, + "grad_norm": 0.5099014639854431, + "learning_rate": 0.00019864822940489173, + "loss": 1.139, + "step": 1196 + }, + { + "epoch": 0.21314102564102563, + "grad_norm": 0.41396936774253845, + "learning_rate": 0.0001986459347023448, + "loss": 1.0594, + "step": 1197 + }, + { + "epoch": 0.21331908831908833, + "grad_norm": 0.46071869134902954, + "learning_rate": 0.0001986436380670357, + "loss": 1.0815, + "step": 1198 + }, + { + "epoch": 0.213497150997151, + "grad_norm": 0.507882297039032, + "learning_rate": 0.00019864133949900942, + "loss": 1.3841, + "step": 1199 + }, + { + "epoch": 0.21367521367521367, + "grad_norm": 0.45680439472198486, + "learning_rate": 0.00019863903899831103, + "loss": 1.0945, + "step": 1200 + }, + { + "epoch": 0.21385327635327636, + "grad_norm": 0.44277429580688477, + 
"learning_rate": 0.00019863673656498555, + "loss": 1.1655, + "step": 1201 + }, + { + "epoch": 0.21403133903133903, + "grad_norm": 0.43890756368637085, + "learning_rate": 0.00019863443219907812, + "loss": 1.1186, + "step": 1202 + }, + { + "epoch": 0.2142094017094017, + "grad_norm": 0.3910178542137146, + "learning_rate": 0.0001986321259006339, + "loss": 1.0817, + "step": 1203 + }, + { + "epoch": 0.2143874643874644, + "grad_norm": 0.3803878128528595, + "learning_rate": 0.00019862981766969803, + "loss": 0.8022, + "step": 1204 + }, + { + "epoch": 0.21456552706552706, + "grad_norm": 0.4495108425617218, + "learning_rate": 0.0001986275075063158, + "loss": 1.2212, + "step": 1205 + }, + { + "epoch": 0.21474358974358973, + "grad_norm": 0.5211976766586304, + "learning_rate": 0.00019862519541053244, + "loss": 1.2771, + "step": 1206 + }, + { + "epoch": 0.21492165242165243, + "grad_norm": 0.4313061535358429, + "learning_rate": 0.00019862288138239325, + "loss": 1.1205, + "step": 1207 + }, + { + "epoch": 0.2150997150997151, + "grad_norm": 0.47110888361930847, + "learning_rate": 0.00019862056542194355, + "loss": 1.1835, + "step": 1208 + }, + { + "epoch": 0.2152777777777778, + "grad_norm": 0.5129403471946716, + "learning_rate": 0.00019861824752922876, + "loss": 1.1655, + "step": 1209 + }, + { + "epoch": 0.21545584045584046, + "grad_norm": 0.4353938102722168, + "learning_rate": 0.00019861592770429427, + "loss": 1.2794, + "step": 1210 + }, + { + "epoch": 0.21563390313390313, + "grad_norm": 0.48590636253356934, + "learning_rate": 0.0001986136059471855, + "loss": 1.2003, + "step": 1211 + }, + { + "epoch": 0.21581196581196582, + "grad_norm": 0.4738406836986542, + "learning_rate": 0.00019861128225794804, + "loss": 1.2271, + "step": 1212 + }, + { + "epoch": 0.2159900284900285, + "grad_norm": 0.45983126759529114, + "learning_rate": 0.0001986089566366273, + "loss": 1.1896, + "step": 1213 + }, + { + "epoch": 0.21616809116809116, + "grad_norm": 0.37296006083488464, + "learning_rate": 
0.00019860662908326892, + "loss": 1.079, + "step": 1214 + }, + { + "epoch": 0.21634615384615385, + "grad_norm": 0.4442676305770874, + "learning_rate": 0.00019860429959791845, + "loss": 1.1754, + "step": 1215 + }, + { + "epoch": 0.21652421652421652, + "grad_norm": 0.4950128495693207, + "learning_rate": 0.0001986019681806216, + "loss": 1.1571, + "step": 1216 + }, + { + "epoch": 0.2167022792022792, + "grad_norm": 0.4374556541442871, + "learning_rate": 0.000198599634831424, + "loss": 1.1003, + "step": 1217 + }, + { + "epoch": 0.2168803418803419, + "grad_norm": 0.47301414608955383, + "learning_rate": 0.00019859729955037136, + "loss": 1.1426, + "step": 1218 + }, + { + "epoch": 0.21705840455840456, + "grad_norm": 0.41213178634643555, + "learning_rate": 0.00019859496233750947, + "loss": 1.0659, + "step": 1219 + }, + { + "epoch": 0.21723646723646722, + "grad_norm": 0.41601964831352234, + "learning_rate": 0.0001985926231928841, + "loss": 1.0248, + "step": 1220 + }, + { + "epoch": 0.21741452991452992, + "grad_norm": 0.46328839659690857, + "learning_rate": 0.0001985902821165411, + "loss": 1.0405, + "step": 1221 + }, + { + "epoch": 0.2175925925925926, + "grad_norm": 0.43287959694862366, + "learning_rate": 0.0001985879391085263, + "loss": 0.9202, + "step": 1222 + }, + { + "epoch": 0.21777065527065528, + "grad_norm": 0.4770444631576538, + "learning_rate": 0.00019858559416888568, + "loss": 1.0911, + "step": 1223 + }, + { + "epoch": 0.21794871794871795, + "grad_norm": 0.4756585955619812, + "learning_rate": 0.00019858324729766507, + "loss": 1.1566, + "step": 1224 + }, + { + "epoch": 0.21812678062678062, + "grad_norm": 0.4337233006954193, + "learning_rate": 0.00019858089849491054, + "loss": 0.9084, + "step": 1225 + }, + { + "epoch": 0.21830484330484332, + "grad_norm": 0.5165579319000244, + "learning_rate": 0.00019857854776066813, + "loss": 1.4154, + "step": 1226 + }, + { + "epoch": 0.21848290598290598, + "grad_norm": 0.4280378520488739, + "learning_rate": 0.00019857619509498382, + 
"loss": 1.1291, + "step": 1227 + }, + { + "epoch": 0.21866096866096865, + "grad_norm": 0.5375089049339294, + "learning_rate": 0.00019857384049790376, + "loss": 1.2985, + "step": 1228 + }, + { + "epoch": 0.21883903133903135, + "grad_norm": 0.4708811640739441, + "learning_rate": 0.00019857148396947401, + "loss": 1.0589, + "step": 1229 + }, + { + "epoch": 0.21901709401709402, + "grad_norm": 0.4744570255279541, + "learning_rate": 0.00019856912550974084, + "loss": 1.1269, + "step": 1230 + }, + { + "epoch": 0.21919515669515668, + "grad_norm": 0.5355265736579895, + "learning_rate": 0.00019856676511875043, + "loss": 1.1441, + "step": 1231 + }, + { + "epoch": 0.21937321937321938, + "grad_norm": 0.42718183994293213, + "learning_rate": 0.00019856440279654897, + "loss": 1.0244, + "step": 1232 + }, + { + "epoch": 0.21955128205128205, + "grad_norm": 0.5162127614021301, + "learning_rate": 0.00019856203854318283, + "loss": 1.2674, + "step": 1233 + }, + { + "epoch": 0.21972934472934472, + "grad_norm": 0.5180695652961731, + "learning_rate": 0.00019855967235869827, + "loss": 1.2472, + "step": 1234 + }, + { + "epoch": 0.2199074074074074, + "grad_norm": 0.4290023744106293, + "learning_rate": 0.00019855730424314167, + "loss": 1.0502, + "step": 1235 + }, + { + "epoch": 0.22008547008547008, + "grad_norm": 0.4418254792690277, + "learning_rate": 0.00019855493419655945, + "loss": 1.0589, + "step": 1236 + }, + { + "epoch": 0.22026353276353278, + "grad_norm": 0.4074663817882538, + "learning_rate": 0.000198552562218998, + "loss": 0.9197, + "step": 1237 + }, + { + "epoch": 0.22044159544159544, + "grad_norm": 0.4526660740375519, + "learning_rate": 0.00019855018831050383, + "loss": 1.2578, + "step": 1238 + }, + { + "epoch": 0.2206196581196581, + "grad_norm": 0.4747827649116516, + "learning_rate": 0.00019854781247112343, + "loss": 1.0841, + "step": 1239 + }, + { + "epoch": 0.2207977207977208, + "grad_norm": 0.41567128896713257, + "learning_rate": 0.00019854543470090334, + "loss": 1.0737, + "step": 
1240 + }, + { + "epoch": 0.22097578347578348, + "grad_norm": 0.4793100953102112, + "learning_rate": 0.00019854305499989022, + "loss": 1.1972, + "step": 1241 + }, + { + "epoch": 0.22115384615384615, + "grad_norm": 0.41755473613739014, + "learning_rate": 0.00019854067336813058, + "loss": 1.2529, + "step": 1242 + }, + { + "epoch": 0.22133190883190884, + "grad_norm": 0.40421152114868164, + "learning_rate": 0.0001985382898056712, + "loss": 1.0549, + "step": 1243 + }, + { + "epoch": 0.2215099715099715, + "grad_norm": 0.45779645442962646, + "learning_rate": 0.0001985359043125587, + "loss": 1.1586, + "step": 1244 + }, + { + "epoch": 0.22168803418803418, + "grad_norm": 0.4380546808242798, + "learning_rate": 0.00019853351688883987, + "loss": 1.1024, + "step": 1245 + }, + { + "epoch": 0.22186609686609687, + "grad_norm": 0.39917269349098206, + "learning_rate": 0.00019853112753456142, + "loss": 0.9823, + "step": 1246 + }, + { + "epoch": 0.22204415954415954, + "grad_norm": 0.4228038489818573, + "learning_rate": 0.00019852873624977022, + "loss": 1.1684, + "step": 1247 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 0.4462146759033203, + "learning_rate": 0.00019852634303451315, + "loss": 0.9027, + "step": 1248 + }, + { + "epoch": 0.2224002849002849, + "grad_norm": 0.5682163834571838, + "learning_rate": 0.000198523947888837, + "loss": 1.141, + "step": 1249 + }, + { + "epoch": 0.22257834757834757, + "grad_norm": 0.44866830110549927, + "learning_rate": 0.0001985215508127888, + "loss": 1.0759, + "step": 1250 + }, + { + "epoch": 0.22275641025641027, + "grad_norm": 0.4034106135368347, + "learning_rate": 0.00019851915180641548, + "loss": 1.0675, + "step": 1251 + }, + { + "epoch": 0.22293447293447294, + "grad_norm": 0.4780726432800293, + "learning_rate": 0.00019851675086976397, + "loss": 1.0283, + "step": 1252 + }, + { + "epoch": 0.2231125356125356, + "grad_norm": 0.48892372846603394, + "learning_rate": 0.00019851434800288145, + "loss": 1.1159, + "step": 1253 + }, + { + "epoch": 
0.2232905982905983, + "grad_norm": 0.42629215121269226, + "learning_rate": 0.0001985119432058149, + "loss": 1.0292, + "step": 1254 + }, + { + "epoch": 0.22346866096866097, + "grad_norm": 0.4496444761753082, + "learning_rate": 0.00019850953647861146, + "loss": 1.0252, + "step": 1255 + }, + { + "epoch": 0.22364672364672364, + "grad_norm": 0.4371408224105835, + "learning_rate": 0.00019850712782131828, + "loss": 1.1104, + "step": 1256 + }, + { + "epoch": 0.22382478632478633, + "grad_norm": 0.4910794496536255, + "learning_rate": 0.00019850471723398258, + "loss": 1.1928, + "step": 1257 + }, + { + "epoch": 0.224002849002849, + "grad_norm": 0.41235068440437317, + "learning_rate": 0.00019850230471665157, + "loss": 1.1261, + "step": 1258 + }, + { + "epoch": 0.22418091168091167, + "grad_norm": 0.4507700502872467, + "learning_rate": 0.0001984998902693725, + "loss": 1.0602, + "step": 1259 + }, + { + "epoch": 0.22435897435897437, + "grad_norm": 0.4654198884963989, + "learning_rate": 0.00019849747389219272, + "loss": 1.1258, + "step": 1260 + }, + { + "epoch": 0.22453703703703703, + "grad_norm": 0.439807653427124, + "learning_rate": 0.00019849505558515952, + "loss": 1.2312, + "step": 1261 + }, + { + "epoch": 0.2247150997150997, + "grad_norm": 0.4309258759021759, + "learning_rate": 0.00019849263534832035, + "loss": 1.0083, + "step": 1262 + }, + { + "epoch": 0.2248931623931624, + "grad_norm": 0.4920141100883484, + "learning_rate": 0.00019849021318172255, + "loss": 1.0254, + "step": 1263 + }, + { + "epoch": 0.22507122507122507, + "grad_norm": 0.5333457589149475, + "learning_rate": 0.00019848778908541367, + "loss": 1.3017, + "step": 1264 + }, + { + "epoch": 0.22524928774928774, + "grad_norm": 0.4096757769584656, + "learning_rate": 0.0001984853630594411, + "loss": 0.9531, + "step": 1265 + }, + { + "epoch": 0.22542735042735043, + "grad_norm": 0.5744075775146484, + "learning_rate": 0.00019848293510385244, + "loss": 1.1414, + "step": 1266 + }, + { + "epoch": 0.2256054131054131, + 
"grad_norm": 0.44707193970680237, + "learning_rate": 0.00019848050521869529, + "loss": 1.1926, + "step": 1267 + }, + { + "epoch": 0.2257834757834758, + "grad_norm": 0.4162999391555786, + "learning_rate": 0.00019847807340401716, + "loss": 1.1354, + "step": 1268 + }, + { + "epoch": 0.22596153846153846, + "grad_norm": 0.4273204207420349, + "learning_rate": 0.0001984756396598658, + "loss": 0.9956, + "step": 1269 + }, + { + "epoch": 0.22613960113960113, + "grad_norm": 0.5670466423034668, + "learning_rate": 0.00019847320398628878, + "loss": 1.2384, + "step": 1270 + }, + { + "epoch": 0.22631766381766383, + "grad_norm": 0.424544095993042, + "learning_rate": 0.00019847076638333395, + "loss": 0.9963, + "step": 1271 + }, + { + "epoch": 0.2264957264957265, + "grad_norm": 0.3716120719909668, + "learning_rate": 0.000198468326851049, + "loss": 0.865, + "step": 1272 + }, + { + "epoch": 0.22667378917378916, + "grad_norm": 0.4472847282886505, + "learning_rate": 0.00019846588538948172, + "loss": 1.174, + "step": 1273 + }, + { + "epoch": 0.22685185185185186, + "grad_norm": 0.4599195718765259, + "learning_rate": 0.00019846344199867994, + "loss": 1.289, + "step": 1274 + }, + { + "epoch": 0.22702991452991453, + "grad_norm": 0.4303213357925415, + "learning_rate": 0.0001984609966786916, + "loss": 1.1606, + "step": 1275 + }, + { + "epoch": 0.2272079772079772, + "grad_norm": 0.44893527030944824, + "learning_rate": 0.00019845854942956455, + "loss": 1.1043, + "step": 1276 + }, + { + "epoch": 0.2273860398860399, + "grad_norm": 0.40033379197120667, + "learning_rate": 0.00019845610025134676, + "loss": 1.1434, + "step": 1277 + }, + { + "epoch": 0.22756410256410256, + "grad_norm": 0.4385402202606201, + "learning_rate": 0.00019845364914408616, + "loss": 0.9943, + "step": 1278 + }, + { + "epoch": 0.22774216524216523, + "grad_norm": 0.42123618721961975, + "learning_rate": 0.0001984511961078309, + "loss": 1.0911, + "step": 1279 + }, + { + "epoch": 0.22792022792022792, + "grad_norm": 0.5558577179908752, 
+ "learning_rate": 0.00019844874114262893, + "loss": 1.3893, + "step": 1280 + }, + { + "epoch": 0.2280982905982906, + "grad_norm": 0.3996453583240509, + "learning_rate": 0.00019844628424852835, + "loss": 0.8951, + "step": 1281 + }, + { + "epoch": 0.2282763532763533, + "grad_norm": 0.3943425714969635, + "learning_rate": 0.0001984438254255774, + "loss": 1.0595, + "step": 1282 + }, + { + "epoch": 0.22845441595441596, + "grad_norm": 0.4429021179676056, + "learning_rate": 0.00019844136467382414, + "loss": 1.0853, + "step": 1283 + }, + { + "epoch": 0.22863247863247863, + "grad_norm": 0.4515686631202698, + "learning_rate": 0.00019843890199331687, + "loss": 1.0829, + "step": 1284 + }, + { + "epoch": 0.22881054131054132, + "grad_norm": 0.5157768726348877, + "learning_rate": 0.00019843643738410378, + "loss": 1.334, + "step": 1285 + }, + { + "epoch": 0.228988603988604, + "grad_norm": 0.45833173394203186, + "learning_rate": 0.0001984339708462332, + "loss": 1.1353, + "step": 1286 + }, + { + "epoch": 0.22916666666666666, + "grad_norm": 0.46610337495803833, + "learning_rate": 0.00019843150237975344, + "loss": 1.1338, + "step": 1287 + }, + { + "epoch": 0.22934472934472935, + "grad_norm": 0.5076978802680969, + "learning_rate": 0.00019842903198471286, + "loss": 1.1811, + "step": 1288 + }, + { + "epoch": 0.22952279202279202, + "grad_norm": 0.4297824800014496, + "learning_rate": 0.00019842655966115986, + "loss": 1.1799, + "step": 1289 + }, + { + "epoch": 0.2297008547008547, + "grad_norm": 0.5304586291313171, + "learning_rate": 0.0001984240854091429, + "loss": 1.1315, + "step": 1290 + }, + { + "epoch": 0.22987891737891739, + "grad_norm": 0.45359212160110474, + "learning_rate": 0.00019842160922871042, + "loss": 1.1037, + "step": 1291 + }, + { + "epoch": 0.23005698005698005, + "grad_norm": 0.4416881203651428, + "learning_rate": 0.00019841913111991096, + "loss": 1.122, + "step": 1292 + }, + { + "epoch": 0.23023504273504272, + "grad_norm": 0.46682995557785034, + "learning_rate": 
0.0001984166510827931, + "loss": 0.9808, + "step": 1293 + }, + { + "epoch": 0.23041310541310542, + "grad_norm": 0.44172337651252747, + "learning_rate": 0.00019841416911740538, + "loss": 0.9167, + "step": 1294 + }, + { + "epoch": 0.23059116809116809, + "grad_norm": 0.40562742948532104, + "learning_rate": 0.0001984116852237965, + "loss": 0.9547, + "step": 1295 + }, + { + "epoch": 0.23076923076923078, + "grad_norm": 0.4040384888648987, + "learning_rate": 0.00019840919940201503, + "loss": 1.1039, + "step": 1296 + }, + { + "epoch": 0.23094729344729345, + "grad_norm": 0.5094077587127686, + "learning_rate": 0.00019840671165210973, + "loss": 1.2283, + "step": 1297 + }, + { + "epoch": 0.23112535612535612, + "grad_norm": 0.48553213477134705, + "learning_rate": 0.00019840422197412938, + "loss": 1.0927, + "step": 1298 + }, + { + "epoch": 0.23130341880341881, + "grad_norm": 0.5197509527206421, + "learning_rate": 0.00019840173036812266, + "loss": 1.2154, + "step": 1299 + }, + { + "epoch": 0.23148148148148148, + "grad_norm": 0.42069005966186523, + "learning_rate": 0.0001983992368341385, + "loss": 1.0076, + "step": 1300 + }, + { + "epoch": 0.23165954415954415, + "grad_norm": 0.475204735994339, + "learning_rate": 0.00019839674137222567, + "loss": 1.1682, + "step": 1301 + }, + { + "epoch": 0.23183760683760685, + "grad_norm": 0.55730140209198, + "learning_rate": 0.0001983942439824331, + "loss": 1.2948, + "step": 1302 + }, + { + "epoch": 0.23201566951566951, + "grad_norm": 0.4533313512802124, + "learning_rate": 0.00019839174466480973, + "loss": 1.2691, + "step": 1303 + }, + { + "epoch": 0.23219373219373218, + "grad_norm": 0.4733520746231079, + "learning_rate": 0.0001983892434194045, + "loss": 1.2232, + "step": 1304 + }, + { + "epoch": 0.23237179487179488, + "grad_norm": 0.5085756182670593, + "learning_rate": 0.00019838674024626643, + "loss": 1.1347, + "step": 1305 + }, + { + "epoch": 0.23254985754985755, + "grad_norm": 0.4679976999759674, + "learning_rate": 0.00019838423514544456, + 
"loss": 1.0018, + "step": 1306 + }, + { + "epoch": 0.23272792022792022, + "grad_norm": 0.4234481751918793, + "learning_rate": 0.00019838172811698795, + "loss": 1.0472, + "step": 1307 + }, + { + "epoch": 0.2329059829059829, + "grad_norm": 0.5749204158782959, + "learning_rate": 0.00019837921916094579, + "loss": 1.2239, + "step": 1308 + }, + { + "epoch": 0.23308404558404558, + "grad_norm": 0.46715882420539856, + "learning_rate": 0.0001983767082773672, + "loss": 1.1924, + "step": 1309 + }, + { + "epoch": 0.23326210826210828, + "grad_norm": 0.5079745054244995, + "learning_rate": 0.00019837419546630137, + "loss": 1.1086, + "step": 1310 + }, + { + "epoch": 0.23344017094017094, + "grad_norm": 0.4419243037700653, + "learning_rate": 0.0001983716807277975, + "loss": 1.1911, + "step": 1311 + }, + { + "epoch": 0.2336182336182336, + "grad_norm": 0.5107570290565491, + "learning_rate": 0.00019836916406190493, + "loss": 1.1071, + "step": 1312 + }, + { + "epoch": 0.2337962962962963, + "grad_norm": 0.5295659303665161, + "learning_rate": 0.00019836664546867293, + "loss": 1.2905, + "step": 1313 + }, + { + "epoch": 0.23397435897435898, + "grad_norm": 0.4844837784767151, + "learning_rate": 0.00019836412494815084, + "loss": 1.3507, + "step": 1314 + }, + { + "epoch": 0.23415242165242164, + "grad_norm": 0.6166049242019653, + "learning_rate": 0.00019836160250038808, + "loss": 1.2822, + "step": 1315 + }, + { + "epoch": 0.23433048433048434, + "grad_norm": 0.3229198753833771, + "learning_rate": 0.00019835907812543402, + "loss": 0.4959, + "step": 1316 + }, + { + "epoch": 0.234508547008547, + "grad_norm": 0.5788772702217102, + "learning_rate": 0.00019835655182333815, + "loss": 1.0832, + "step": 1317 + }, + { + "epoch": 0.23468660968660968, + "grad_norm": 0.525705099105835, + "learning_rate": 0.00019835402359414997, + "loss": 1.0968, + "step": 1318 + }, + { + "epoch": 0.23486467236467237, + "grad_norm": 0.5007779002189636, + "learning_rate": 0.000198351493437919, + "loss": 1.2788, + "step": 1319 + 
}, + { + "epoch": 0.23504273504273504, + "grad_norm": 0.4276871383190155, + "learning_rate": 0.00019834896135469484, + "loss": 1.0419, + "step": 1320 + }, + { + "epoch": 0.2352207977207977, + "grad_norm": 0.5359070301055908, + "learning_rate": 0.00019834642734452708, + "loss": 1.1308, + "step": 1321 + }, + { + "epoch": 0.2353988603988604, + "grad_norm": 0.4854908883571625, + "learning_rate": 0.0001983438914074654, + "loss": 1.1211, + "step": 1322 + }, + { + "epoch": 0.23557692307692307, + "grad_norm": 0.4913707375526428, + "learning_rate": 0.0001983413535435594, + "loss": 1.2392, + "step": 1323 + }, + { + "epoch": 0.23575498575498577, + "grad_norm": 0.46755748987197876, + "learning_rate": 0.0001983388137528589, + "loss": 0.9348, + "step": 1324 + }, + { + "epoch": 0.23593304843304844, + "grad_norm": 0.4592570960521698, + "learning_rate": 0.0001983362720354136, + "loss": 1.1339, + "step": 1325 + }, + { + "epoch": 0.2361111111111111, + "grad_norm": 0.5121711492538452, + "learning_rate": 0.00019833372839127335, + "loss": 1.2973, + "step": 1326 + }, + { + "epoch": 0.2362891737891738, + "grad_norm": 0.4809017479419708, + "learning_rate": 0.000198331182820488, + "loss": 0.9849, + "step": 1327 + }, + { + "epoch": 0.23646723646723647, + "grad_norm": 0.42340895533561707, + "learning_rate": 0.00019832863532310733, + "loss": 1.0731, + "step": 1328 + }, + { + "epoch": 0.23664529914529914, + "grad_norm": 0.5388045310974121, + "learning_rate": 0.00019832608589918135, + "loss": 1.0729, + "step": 1329 + }, + { + "epoch": 0.23682336182336183, + "grad_norm": 0.43075770139694214, + "learning_rate": 0.00019832353454875992, + "loss": 1.1684, + "step": 1330 + }, + { + "epoch": 0.2370014245014245, + "grad_norm": 0.554927408695221, + "learning_rate": 0.00019832098127189313, + "loss": 1.0842, + "step": 1331 + }, + { + "epoch": 0.23717948717948717, + "grad_norm": 0.5359260439872742, + "learning_rate": 0.0001983184260686309, + "loss": 1.2399, + "step": 1332 + }, + { + "epoch": 
0.23735754985754987, + "grad_norm": 0.5141251087188721, + "learning_rate": 0.0001983158689390234, + "loss": 1.3752, + "step": 1333 + }, + { + "epoch": 0.23753561253561253, + "grad_norm": 0.4578750431537628, + "learning_rate": 0.00019831330988312067, + "loss": 1.0965, + "step": 1334 + }, + { + "epoch": 0.2377136752136752, + "grad_norm": 0.47974497079849243, + "learning_rate": 0.00019831074890097286, + "loss": 1.3379, + "step": 1335 + }, + { + "epoch": 0.2378917378917379, + "grad_norm": 0.4618176817893982, + "learning_rate": 0.00019830818599263014, + "loss": 1.274, + "step": 1336 + }, + { + "epoch": 0.23806980056980057, + "grad_norm": 0.4279816448688507, + "learning_rate": 0.00019830562115814276, + "loss": 0.996, + "step": 1337 + }, + { + "epoch": 0.23824786324786323, + "grad_norm": 0.4255026876926422, + "learning_rate": 0.0001983030543975609, + "loss": 0.969, + "step": 1338 + }, + { + "epoch": 0.23842592592592593, + "grad_norm": 0.4551412761211395, + "learning_rate": 0.00019830048571093493, + "loss": 1.0204, + "step": 1339 + }, + { + "epoch": 0.2386039886039886, + "grad_norm": 0.4747903048992157, + "learning_rate": 0.00019829791509831513, + "loss": 1.1816, + "step": 1340 + }, + { + "epoch": 0.2387820512820513, + "grad_norm": 0.47187140583992004, + "learning_rate": 0.00019829534255975188, + "loss": 1.1205, + "step": 1341 + }, + { + "epoch": 0.23896011396011396, + "grad_norm": 0.49332180619239807, + "learning_rate": 0.0001982927680952956, + "loss": 1.2657, + "step": 1342 + }, + { + "epoch": 0.23913817663817663, + "grad_norm": 0.5162837505340576, + "learning_rate": 0.0001982901917049967, + "loss": 1.2247, + "step": 1343 + }, + { + "epoch": 0.23931623931623933, + "grad_norm": 0.43407055735588074, + "learning_rate": 0.0001982876133889057, + "loss": 1.0038, + "step": 1344 + }, + { + "epoch": 0.239494301994302, + "grad_norm": 0.5132251977920532, + "learning_rate": 0.00019828503314707306, + "loss": 1.0678, + "step": 1345 + }, + { + "epoch": 0.23967236467236466, + 
"grad_norm": 0.46295464038848877, + "learning_rate": 0.00019828245097954937, + "loss": 1.1802, + "step": 1346 + }, + { + "epoch": 0.23985042735042736, + "grad_norm": 0.4682658314704895, + "learning_rate": 0.00019827986688638523, + "loss": 1.0249, + "step": 1347 + }, + { + "epoch": 0.24002849002849003, + "grad_norm": 0.49990561604499817, + "learning_rate": 0.00019827728086763125, + "loss": 1.0691, + "step": 1348 + }, + { + "epoch": 0.2402065527065527, + "grad_norm": 0.39090847969055176, + "learning_rate": 0.00019827469292333806, + "loss": 0.8367, + "step": 1349 + }, + { + "epoch": 0.2403846153846154, + "grad_norm": 0.5023905634880066, + "learning_rate": 0.00019827210305355645, + "loss": 1.0675, + "step": 1350 + }, + { + "epoch": 0.24056267806267806, + "grad_norm": 0.4744076430797577, + "learning_rate": 0.00019826951125833715, + "loss": 1.3166, + "step": 1351 + }, + { + "epoch": 0.24074074074074073, + "grad_norm": 0.44914689660072327, + "learning_rate": 0.00019826691753773088, + "loss": 0.9818, + "step": 1352 + }, + { + "epoch": 0.24091880341880342, + "grad_norm": 0.44391971826553345, + "learning_rate": 0.00019826432189178853, + "loss": 1.0448, + "step": 1353 + }, + { + "epoch": 0.2410968660968661, + "grad_norm": 0.46102839708328247, + "learning_rate": 0.00019826172432056086, + "loss": 0.9952, + "step": 1354 + }, + { + "epoch": 0.2412749287749288, + "grad_norm": 0.4796878695487976, + "learning_rate": 0.00019825912482409884, + "loss": 1.0977, + "step": 1355 + }, + { + "epoch": 0.24145299145299146, + "grad_norm": 0.5003768801689148, + "learning_rate": 0.0001982565234024534, + "loss": 1.3149, + "step": 1356 + }, + { + "epoch": 0.24163105413105412, + "grad_norm": 0.43475663661956787, + "learning_rate": 0.00019825392005567551, + "loss": 1.0527, + "step": 1357 + }, + { + "epoch": 0.24180911680911682, + "grad_norm": 0.46120527386665344, + "learning_rate": 0.00019825131478381613, + "loss": 1.2333, + "step": 1358 + }, + { + "epoch": 0.2419871794871795, + "grad_norm": 
0.43748101592063904, + "learning_rate": 0.00019824870758692638, + "loss": 0.9788, + "step": 1359 + }, + { + "epoch": 0.24216524216524216, + "grad_norm": 0.5275192856788635, + "learning_rate": 0.00019824609846505727, + "loss": 1.1473, + "step": 1360 + }, + { + "epoch": 0.24234330484330485, + "grad_norm": 0.346463143825531, + "learning_rate": 0.00019824348741825993, + "loss": 0.6824, + "step": 1361 + }, + { + "epoch": 0.24252136752136752, + "grad_norm": 0.5004115700721741, + "learning_rate": 0.00019824087444658556, + "loss": 1.1853, + "step": 1362 + }, + { + "epoch": 0.2426994301994302, + "grad_norm": 0.42746666073799133, + "learning_rate": 0.00019823825955008533, + "loss": 0.9355, + "step": 1363 + }, + { + "epoch": 0.24287749287749288, + "grad_norm": 0.4099743068218231, + "learning_rate": 0.00019823564272881047, + "loss": 1.0753, + "step": 1364 + }, + { + "epoch": 0.24305555555555555, + "grad_norm": 0.5262967944145203, + "learning_rate": 0.00019823302398281226, + "loss": 1.2324, + "step": 1365 + }, + { + "epoch": 0.24323361823361822, + "grad_norm": 0.436069518327713, + "learning_rate": 0.000198230403312142, + "loss": 1.1887, + "step": 1366 + }, + { + "epoch": 0.24341168091168092, + "grad_norm": 0.38252368569374084, + "learning_rate": 0.00019822778071685107, + "loss": 1.0211, + "step": 1367 + }, + { + "epoch": 0.24358974358974358, + "grad_norm": 0.48024141788482666, + "learning_rate": 0.00019822515619699081, + "loss": 1.065, + "step": 1368 + }, + { + "epoch": 0.24376780626780628, + "grad_norm": 0.47421589493751526, + "learning_rate": 0.00019822252975261267, + "loss": 1.0433, + "step": 1369 + }, + { + "epoch": 0.24394586894586895, + "grad_norm": 0.46094807982444763, + "learning_rate": 0.00019821990138376808, + "loss": 1.1427, + "step": 1370 + }, + { + "epoch": 0.24412393162393162, + "grad_norm": 0.5093680620193481, + "learning_rate": 0.00019821727109050856, + "loss": 1.1086, + "step": 1371 + }, + { + "epoch": 0.2443019943019943, + "grad_norm": 0.41084879636764526, + 
"learning_rate": 0.00019821463887288566, + "loss": 1.0068, + "step": 1372 + }, + { + "epoch": 0.24448005698005698, + "grad_norm": 0.4991084635257721, + "learning_rate": 0.0001982120047309509, + "loss": 1.1884, + "step": 1373 + }, + { + "epoch": 0.24465811965811965, + "grad_norm": 0.39198383688926697, + "learning_rate": 0.00019820936866475595, + "loss": 0.9776, + "step": 1374 + }, + { + "epoch": 0.24483618233618235, + "grad_norm": 0.4517424702644348, + "learning_rate": 0.00019820673067435244, + "loss": 1.1491, + "step": 1375 + }, + { + "epoch": 0.245014245014245, + "grad_norm": 0.45881983637809753, + "learning_rate": 0.00019820409075979202, + "loss": 1.1198, + "step": 1376 + }, + { + "epoch": 0.24519230769230768, + "grad_norm": 0.4498792290687561, + "learning_rate": 0.00019820144892112646, + "loss": 1.0897, + "step": 1377 + }, + { + "epoch": 0.24537037037037038, + "grad_norm": 0.4128037393093109, + "learning_rate": 0.00019819880515840752, + "loss": 0.9415, + "step": 1378 + }, + { + "epoch": 0.24554843304843305, + "grad_norm": 0.4340885281562805, + "learning_rate": 0.00019819615947168698, + "loss": 1.201, + "step": 1379 + }, + { + "epoch": 0.24572649572649571, + "grad_norm": 0.43814027309417725, + "learning_rate": 0.00019819351186101667, + "loss": 1.1039, + "step": 1380 + }, + { + "epoch": 0.2459045584045584, + "grad_norm": 0.40115082263946533, + "learning_rate": 0.00019819086232644845, + "loss": 1.2599, + "step": 1381 + }, + { + "epoch": 0.24608262108262108, + "grad_norm": 0.4947351813316345, + "learning_rate": 0.00019818821086803426, + "loss": 1.252, + "step": 1382 + }, + { + "epoch": 0.24626068376068377, + "grad_norm": 0.45179441571235657, + "learning_rate": 0.0001981855574858261, + "loss": 1.1323, + "step": 1383 + }, + { + "epoch": 0.24643874643874644, + "grad_norm": 0.47159844636917114, + "learning_rate": 0.00019818290217987587, + "loss": 1.2053, + "step": 1384 + }, + { + "epoch": 0.2466168091168091, + "grad_norm": 0.4358448386192322, + "learning_rate": 
0.0001981802449502356, + "loss": 1.1174, + "step": 1385 + }, + { + "epoch": 0.2467948717948718, + "grad_norm": 0.4588233530521393, + "learning_rate": 0.00019817758579695745, + "loss": 1.1098, + "step": 1386 + }, + { + "epoch": 0.24697293447293447, + "grad_norm": 0.4955112636089325, + "learning_rate": 0.00019817492472009338, + "loss": 1.258, + "step": 1387 + }, + { + "epoch": 0.24715099715099714, + "grad_norm": 0.4226941764354706, + "learning_rate": 0.00019817226171969565, + "loss": 1.0976, + "step": 1388 + }, + { + "epoch": 0.24732905982905984, + "grad_norm": 0.4076840579509735, + "learning_rate": 0.00019816959679581637, + "loss": 1.0121, + "step": 1389 + }, + { + "epoch": 0.2475071225071225, + "grad_norm": 0.4395063519477844, + "learning_rate": 0.0001981669299485078, + "loss": 1.3153, + "step": 1390 + }, + { + "epoch": 0.24768518518518517, + "grad_norm": 0.41010400652885437, + "learning_rate": 0.0001981642611778221, + "loss": 1.0717, + "step": 1391 + }, + { + "epoch": 0.24786324786324787, + "grad_norm": 0.43459352850914, + "learning_rate": 0.00019816159048381167, + "loss": 1.1077, + "step": 1392 + }, + { + "epoch": 0.24804131054131054, + "grad_norm": 0.46291449666023254, + "learning_rate": 0.00019815891786652875, + "loss": 1.0257, + "step": 1393 + }, + { + "epoch": 0.2482193732193732, + "grad_norm": 0.46408146619796753, + "learning_rate": 0.00019815624332602578, + "loss": 0.7899, + "step": 1394 + }, + { + "epoch": 0.2483974358974359, + "grad_norm": 0.4763357937335968, + "learning_rate": 0.00019815356686235508, + "loss": 0.9857, + "step": 1395 + }, + { + "epoch": 0.24857549857549857, + "grad_norm": 0.4766457676887512, + "learning_rate": 0.00019815088847556918, + "loss": 1.0589, + "step": 1396 + }, + { + "epoch": 0.24875356125356127, + "grad_norm": 0.4486583173274994, + "learning_rate": 0.0001981482081657205, + "loss": 1.2572, + "step": 1397 + }, + { + "epoch": 0.24893162393162394, + "grad_norm": 0.468878835439682, + "learning_rate": 0.00019814552593286155, + 
"loss": 1.101, + "step": 1398 + }, + { + "epoch": 0.2491096866096866, + "grad_norm": 0.4230278730392456, + "learning_rate": 0.0001981428417770449, + "loss": 0.9457, + "step": 1399 + }, + { + "epoch": 0.2492877492877493, + "grad_norm": 0.45630761981010437, + "learning_rate": 0.00019814015569832315, + "loss": 1.0665, + "step": 1400 + }, + { + "epoch": 0.24946581196581197, + "grad_norm": 0.5780113935470581, + "learning_rate": 0.00019813746769674893, + "loss": 1.1064, + "step": 1401 + }, + { + "epoch": 0.24964387464387464, + "grad_norm": 0.4343436658382416, + "learning_rate": 0.0001981347777723749, + "loss": 1.1132, + "step": 1402 + }, + { + "epoch": 0.24982193732193733, + "grad_norm": 0.4879056513309479, + "learning_rate": 0.0001981320859252537, + "loss": 1.1301, + "step": 1403 + }, + { + "epoch": 0.25, + "grad_norm": 0.5248328447341919, + "learning_rate": 0.00019812939215543818, + "loss": 1.1468, + "step": 1404 + }, + { + "epoch": 0.25, + "eval_loss": 1.115895390510559, + "eval_runtime": 25.0474, + "eval_samples_per_second": 41.561, + "eval_steps_per_second": 20.801, + "step": 1404 + }, + { + "epoch": 0.2501780626780627, + "grad_norm": 0.5076769590377808, + "learning_rate": 0.00019812669646298106, + "loss": 1.1428, + "step": 1405 + }, + { + "epoch": 0.25035612535612534, + "grad_norm": 0.5510252714157104, + "learning_rate": 0.00019812399884793514, + "loss": 1.3383, + "step": 1406 + }, + { + "epoch": 0.25053418803418803, + "grad_norm": 0.48918986320495605, + "learning_rate": 0.0001981212993103533, + "loss": 1.1507, + "step": 1407 + }, + { + "epoch": 0.25071225071225073, + "grad_norm": 0.4678935110569, + "learning_rate": 0.00019811859785028846, + "loss": 1.13, + "step": 1408 + }, + { + "epoch": 0.25089031339031337, + "grad_norm": 0.5155254602432251, + "learning_rate": 0.0001981158944677935, + "loss": 1.1194, + "step": 1409 + }, + { + "epoch": 0.25106837606837606, + "grad_norm": 0.4533839523792267, + "learning_rate": 0.00019811318916292142, + "loss": 0.9464, + "step": 
1410 + }, + { + "epoch": 0.25124643874643876, + "grad_norm": 0.5142433047294617, + "learning_rate": 0.00019811048193572517, + "loss": 1.0837, + "step": 1411 + }, + { + "epoch": 0.2514245014245014, + "grad_norm": 0.4330446124076843, + "learning_rate": 0.00019810777278625788, + "loss": 0.9117, + "step": 1412 + }, + { + "epoch": 0.2516025641025641, + "grad_norm": 0.44806256890296936, + "learning_rate": 0.00019810506171457254, + "loss": 1.1643, + "step": 1413 + }, + { + "epoch": 0.2517806267806268, + "grad_norm": 0.43526285886764526, + "learning_rate": 0.00019810234872072235, + "loss": 0.9776, + "step": 1414 + }, + { + "epoch": 0.25195868945868943, + "grad_norm": 0.47394511103630066, + "learning_rate": 0.00019809963380476039, + "loss": 1.0935, + "step": 1415 + }, + { + "epoch": 0.25213675213675213, + "grad_norm": 0.48961278796195984, + "learning_rate": 0.00019809691696673993, + "loss": 1.179, + "step": 1416 + }, + { + "epoch": 0.2523148148148148, + "grad_norm": 0.43153589963912964, + "learning_rate": 0.00019809419820671412, + "loss": 0.906, + "step": 1417 + }, + { + "epoch": 0.25249287749287747, + "grad_norm": 0.41187527775764465, + "learning_rate": 0.00019809147752473632, + "loss": 0.899, + "step": 1418 + }, + { + "epoch": 0.25267094017094016, + "grad_norm": 0.5003183484077454, + "learning_rate": 0.00019808875492085973, + "loss": 1.0606, + "step": 1419 + }, + { + "epoch": 0.25284900284900286, + "grad_norm": 0.4430316984653473, + "learning_rate": 0.00019808603039513778, + "loss": 0.9167, + "step": 1420 + }, + { + "epoch": 0.25302706552706555, + "grad_norm": 0.4577699601650238, + "learning_rate": 0.00019808330394762382, + "loss": 1.1184, + "step": 1421 + }, + { + "epoch": 0.2532051282051282, + "grad_norm": 0.42656826972961426, + "learning_rate": 0.0001980805755783713, + "loss": 0.9335, + "step": 1422 + }, + { + "epoch": 0.2533831908831909, + "grad_norm": 0.40980881452560425, + "learning_rate": 0.0001980778452874336, + "loss": 0.9756, + "step": 1423 + }, + { + "epoch": 
0.2535612535612536, + "grad_norm": 0.5752090811729431, + "learning_rate": 0.00019807511307486423, + "loss": 1.1694, + "step": 1424 + }, + { + "epoch": 0.2537393162393162, + "grad_norm": 0.5000349283218384, + "learning_rate": 0.00019807237894071681, + "loss": 0.9515, + "step": 1425 + }, + { + "epoch": 0.2539173789173789, + "grad_norm": 0.5159069299697876, + "learning_rate": 0.00019806964288504483, + "loss": 1.4014, + "step": 1426 + }, + { + "epoch": 0.2540954415954416, + "grad_norm": 0.5377941131591797, + "learning_rate": 0.00019806690490790194, + "loss": 1.2832, + "step": 1427 + }, + { + "epoch": 0.25427350427350426, + "grad_norm": 0.4565938711166382, + "learning_rate": 0.00019806416500934174, + "loss": 1.0629, + "step": 1428 + }, + { + "epoch": 0.25445156695156695, + "grad_norm": 0.49867144227027893, + "learning_rate": 0.00019806142318941797, + "loss": 1.2011, + "step": 1429 + }, + { + "epoch": 0.25462962962962965, + "grad_norm": 0.5111994743347168, + "learning_rate": 0.00019805867944818427, + "loss": 0.8925, + "step": 1430 + }, + { + "epoch": 0.2548076923076923, + "grad_norm": 0.5204268097877502, + "learning_rate": 0.00019805593378569448, + "loss": 1.2956, + "step": 1431 + }, + { + "epoch": 0.254985754985755, + "grad_norm": 0.3889026939868927, + "learning_rate": 0.00019805318620200234, + "loss": 1.0355, + "step": 1432 + }, + { + "epoch": 0.2551638176638177, + "grad_norm": 0.46825656294822693, + "learning_rate": 0.00019805043669716174, + "loss": 1.0444, + "step": 1433 + }, + { + "epoch": 0.2553418803418803, + "grad_norm": 0.4509420394897461, + "learning_rate": 0.00019804768527122648, + "loss": 1.0423, + "step": 1434 + }, + { + "epoch": 0.255519943019943, + "grad_norm": 0.4514774978160858, + "learning_rate": 0.0001980449319242505, + "loss": 1.1588, + "step": 1435 + }, + { + "epoch": 0.2556980056980057, + "grad_norm": 0.43019044399261475, + "learning_rate": 0.0001980421766562878, + "loss": 0.9939, + "step": 1436 + }, + { + "epoch": 0.25587606837606836, + 
"grad_norm": 0.5056091547012329, + "learning_rate": 0.00019803941946739228, + "loss": 1.1238, + "step": 1437 + }, + { + "epoch": 0.25605413105413105, + "grad_norm": 0.48664605617523193, + "learning_rate": 0.000198036660357618, + "loss": 1.0702, + "step": 1438 + }, + { + "epoch": 0.25623219373219375, + "grad_norm": 0.4500972032546997, + "learning_rate": 0.000198033899327019, + "loss": 0.9365, + "step": 1439 + }, + { + "epoch": 0.2564102564102564, + "grad_norm": 0.4800589382648468, + "learning_rate": 0.0001980311363756494, + "loss": 1.1159, + "step": 1440 + }, + { + "epoch": 0.2565883190883191, + "grad_norm": 0.3486495316028595, + "learning_rate": 0.0001980283715035633, + "loss": 0.6029, + "step": 1441 + }, + { + "epoch": 0.2567663817663818, + "grad_norm": 0.46258702874183655, + "learning_rate": 0.00019802560471081493, + "loss": 1.025, + "step": 1442 + }, + { + "epoch": 0.2569444444444444, + "grad_norm": 0.4846673607826233, + "learning_rate": 0.00019802283599745844, + "loss": 1.1105, + "step": 1443 + }, + { + "epoch": 0.2571225071225071, + "grad_norm": 0.4586990475654602, + "learning_rate": 0.00019802006536354813, + "loss": 0.9897, + "step": 1444 + }, + { + "epoch": 0.2573005698005698, + "grad_norm": 0.5177786350250244, + "learning_rate": 0.00019801729280913825, + "loss": 1.2558, + "step": 1445 + }, + { + "epoch": 0.25747863247863245, + "grad_norm": 0.43213751912117004, + "learning_rate": 0.00019801451833428312, + "loss": 1.0961, + "step": 1446 + }, + { + "epoch": 0.25765669515669515, + "grad_norm": 0.42974478006362915, + "learning_rate": 0.00019801174193903714, + "loss": 1.0659, + "step": 1447 + }, + { + "epoch": 0.25783475783475784, + "grad_norm": 0.4424504339694977, + "learning_rate": 0.00019800896362345464, + "loss": 0.9805, + "step": 1448 + }, + { + "epoch": 0.25801282051282054, + "grad_norm": 0.4734833836555481, + "learning_rate": 0.0001980061833875901, + "loss": 1.255, + "step": 1449 + }, + { + "epoch": 0.2581908831908832, + "grad_norm": 0.41024845838546753, + 
"learning_rate": 0.000198003401231498, + "loss": 1.0908, + "step": 1450 + }, + { + "epoch": 0.2583689458689459, + "grad_norm": 0.43603816628456116, + "learning_rate": 0.00019800061715523283, + "loss": 1.0611, + "step": 1451 + }, + { + "epoch": 0.25854700854700857, + "grad_norm": 0.4871339499950409, + "learning_rate": 0.00019799783115884915, + "loss": 1.1851, + "step": 1452 + }, + { + "epoch": 0.2587250712250712, + "grad_norm": 0.49758270382881165, + "learning_rate": 0.00019799504324240157, + "loss": 1.1936, + "step": 1453 + }, + { + "epoch": 0.2589031339031339, + "grad_norm": 0.4201010763645172, + "learning_rate": 0.00019799225340594466, + "loss": 1.1567, + "step": 1454 + }, + { + "epoch": 0.2590811965811966, + "grad_norm": 0.4200313091278076, + "learning_rate": 0.00019798946164953309, + "loss": 0.9666, + "step": 1455 + }, + { + "epoch": 0.25925925925925924, + "grad_norm": 0.43001702427864075, + "learning_rate": 0.0001979866679732216, + "loss": 1.0104, + "step": 1456 + }, + { + "epoch": 0.25943732193732194, + "grad_norm": 0.46733465790748596, + "learning_rate": 0.0001979838723770649, + "loss": 1.0927, + "step": 1457 + }, + { + "epoch": 0.25961538461538464, + "grad_norm": 0.4513280391693115, + "learning_rate": 0.00019798107486111773, + "loss": 1.0282, + "step": 1458 + }, + { + "epoch": 0.2597934472934473, + "grad_norm": 0.40411749482154846, + "learning_rate": 0.00019797827542543495, + "loss": 1.0789, + "step": 1459 + }, + { + "epoch": 0.25997150997151, + "grad_norm": 0.4359099268913269, + "learning_rate": 0.0001979754740700714, + "loss": 1.0616, + "step": 1460 + }, + { + "epoch": 0.26014957264957267, + "grad_norm": 0.4979047477245331, + "learning_rate": 0.00019797267079508198, + "loss": 1.2948, + "step": 1461 + }, + { + "epoch": 0.2603276353276353, + "grad_norm": 0.44698619842529297, + "learning_rate": 0.0001979698656005216, + "loss": 0.9198, + "step": 1462 + }, + { + "epoch": 0.260505698005698, + "grad_norm": 0.48437631130218506, + "learning_rate": 
0.00019796705848644516, + "loss": 1.3207, + "step": 1463 + }, + { + "epoch": 0.2606837606837607, + "grad_norm": 0.4382587671279907, + "learning_rate": 0.00019796424945290778, + "loss": 1.1315, + "step": 1464 + }, + { + "epoch": 0.26086182336182334, + "grad_norm": 0.4565944969654083, + "learning_rate": 0.0001979614384999644, + "loss": 1.1893, + "step": 1465 + }, + { + "epoch": 0.26103988603988604, + "grad_norm": 0.4705163836479187, + "learning_rate": 0.00019795862562767017, + "loss": 1.1132, + "step": 1466 + }, + { + "epoch": 0.26121794871794873, + "grad_norm": 0.525184690952301, + "learning_rate": 0.00019795581083608012, + "loss": 1.2111, + "step": 1467 + }, + { + "epoch": 0.2613960113960114, + "grad_norm": 0.45215457677841187, + "learning_rate": 0.00019795299412524945, + "loss": 1.1851, + "step": 1468 + }, + { + "epoch": 0.26157407407407407, + "grad_norm": 0.4336663484573364, + "learning_rate": 0.00019795017549523335, + "loss": 1.0147, + "step": 1469 + }, + { + "epoch": 0.26175213675213677, + "grad_norm": 0.5327649712562561, + "learning_rate": 0.00019794735494608703, + "loss": 1.1743, + "step": 1470 + }, + { + "epoch": 0.2619301994301994, + "grad_norm": 0.49972307682037354, + "learning_rate": 0.00019794453247786578, + "loss": 1.1624, + "step": 1471 + }, + { + "epoch": 0.2621082621082621, + "grad_norm": 0.43475785851478577, + "learning_rate": 0.00019794170809062485, + "loss": 0.9888, + "step": 1472 + }, + { + "epoch": 0.2622863247863248, + "grad_norm": 0.428838849067688, + "learning_rate": 0.0001979388817844196, + "loss": 0.9154, + "step": 1473 + }, + { + "epoch": 0.26246438746438744, + "grad_norm": 0.508568286895752, + "learning_rate": 0.00019793605355930544, + "loss": 1.1679, + "step": 1474 + }, + { + "epoch": 0.26264245014245013, + "grad_norm": 0.47791770100593567, + "learning_rate": 0.00019793322341533776, + "loss": 1.1375, + "step": 1475 + }, + { + "epoch": 0.26282051282051283, + "grad_norm": 0.41909220814704895, + "learning_rate": 0.00019793039135257196, + 
"loss": 1.0235, + "step": 1476 + }, + { + "epoch": 0.26299857549857547, + "grad_norm": 0.5564408302307129, + "learning_rate": 0.00019792755737106361, + "loss": 1.0756, + "step": 1477 + }, + { + "epoch": 0.26317663817663817, + "grad_norm": 0.42813625931739807, + "learning_rate": 0.0001979247214708682, + "loss": 0.8213, + "step": 1478 + }, + { + "epoch": 0.26335470085470086, + "grad_norm": 0.44495970010757446, + "learning_rate": 0.00019792188365204126, + "loss": 0.9654, + "step": 1479 + }, + { + "epoch": 0.26353276353276356, + "grad_norm": 0.47473424673080444, + "learning_rate": 0.00019791904391463846, + "loss": 1.1643, + "step": 1480 + }, + { + "epoch": 0.2637108262108262, + "grad_norm": 0.40189051628112793, + "learning_rate": 0.0001979162022587154, + "loss": 0.8687, + "step": 1481 + }, + { + "epoch": 0.2638888888888889, + "grad_norm": 0.44629937410354614, + "learning_rate": 0.00019791335868432776, + "loss": 1.0284, + "step": 1482 + }, + { + "epoch": 0.2640669515669516, + "grad_norm": 0.511275053024292, + "learning_rate": 0.00019791051319153124, + "loss": 1.2217, + "step": 1483 + }, + { + "epoch": 0.26424501424501423, + "grad_norm": 0.5136445164680481, + "learning_rate": 0.00019790766578038163, + "loss": 1.1129, + "step": 1484 + }, + { + "epoch": 0.2644230769230769, + "grad_norm": 0.4450451135635376, + "learning_rate": 0.00019790481645093469, + "loss": 0.9912, + "step": 1485 + }, + { + "epoch": 0.2646011396011396, + "grad_norm": 0.39455199241638184, + "learning_rate": 0.00019790196520324621, + "loss": 1.0887, + "step": 1486 + }, + { + "epoch": 0.26477920227920226, + "grad_norm": 0.4444045126438141, + "learning_rate": 0.00019789911203737216, + "loss": 1.1559, + "step": 1487 + }, + { + "epoch": 0.26495726495726496, + "grad_norm": 0.4769677221775055, + "learning_rate": 0.0001978962569533683, + "loss": 1.147, + "step": 1488 + }, + { + "epoch": 0.26513532763532766, + "grad_norm": 0.40226617455482483, + "learning_rate": 0.0001978933999512907, + "loss": 1.0966, + "step": 
1489 + }, + { + "epoch": 0.2653133903133903, + "grad_norm": 0.4640974700450897, + "learning_rate": 0.00019789054103119526, + "loss": 1.1002, + "step": 1490 + }, + { + "epoch": 0.265491452991453, + "grad_norm": 0.48251107335090637, + "learning_rate": 0.00019788768019313806, + "loss": 1.07, + "step": 1491 + }, + { + "epoch": 0.2656695156695157, + "grad_norm": 0.4836949408054352, + "learning_rate": 0.00019788481743717506, + "loss": 1.2992, + "step": 1492 + }, + { + "epoch": 0.26584757834757833, + "grad_norm": 0.4253857135772705, + "learning_rate": 0.00019788195276336244, + "loss": 1.1326, + "step": 1493 + }, + { + "epoch": 0.266025641025641, + "grad_norm": 0.5161862373352051, + "learning_rate": 0.0001978790861717563, + "loss": 1.2131, + "step": 1494 + }, + { + "epoch": 0.2662037037037037, + "grad_norm": 0.5223346948623657, + "learning_rate": 0.00019787621766241274, + "loss": 1.0933, + "step": 1495 + }, + { + "epoch": 0.26638176638176636, + "grad_norm": 0.37622541189193726, + "learning_rate": 0.000197873347235388, + "loss": 0.8919, + "step": 1496 + }, + { + "epoch": 0.26655982905982906, + "grad_norm": 0.4425419569015503, + "learning_rate": 0.0001978704748907384, + "loss": 1.0411, + "step": 1497 + }, + { + "epoch": 0.26673789173789175, + "grad_norm": 0.4536985456943512, + "learning_rate": 0.00019786760062852015, + "loss": 1.2747, + "step": 1498 + }, + { + "epoch": 0.2669159544159544, + "grad_norm": 0.4998049736022949, + "learning_rate": 0.00019786472444878955, + "loss": 1.3214, + "step": 1499 + }, + { + "epoch": 0.2670940170940171, + "grad_norm": 0.42104312777519226, + "learning_rate": 0.00019786184635160295, + "loss": 0.7878, + "step": 1500 + }, + { + "epoch": 0.2672720797720798, + "grad_norm": 0.5354288220405579, + "learning_rate": 0.00019785896633701678, + "loss": 1.0642, + "step": 1501 + }, + { + "epoch": 0.2674501424501424, + "grad_norm": 0.4681485891342163, + "learning_rate": 0.00019785608440508744, + "loss": 1.1737, + "step": 1502 + }, + { + "epoch": 
0.2676282051282051, + "grad_norm": 0.49107062816619873, + "learning_rate": 0.0001978532005558714, + "loss": 1.1507, + "step": 1503 + }, + { + "epoch": 0.2678062678062678, + "grad_norm": 0.4173283576965332, + "learning_rate": 0.0001978503147894252, + "loss": 1.0538, + "step": 1504 + }, + { + "epoch": 0.26798433048433046, + "grad_norm": 0.49354055523872375, + "learning_rate": 0.0001978474271058053, + "loss": 1.1043, + "step": 1505 + }, + { + "epoch": 0.26816239316239315, + "grad_norm": 0.5787215232849121, + "learning_rate": 0.00019784453750506834, + "loss": 0.9245, + "step": 1506 + }, + { + "epoch": 0.26834045584045585, + "grad_norm": 0.48982590436935425, + "learning_rate": 0.00019784164598727095, + "loss": 1.2007, + "step": 1507 + }, + { + "epoch": 0.26851851851851855, + "grad_norm": 0.4971007704734802, + "learning_rate": 0.00019783875255246973, + "loss": 1.1174, + "step": 1508 + }, + { + "epoch": 0.2686965811965812, + "grad_norm": 0.5200340151786804, + "learning_rate": 0.00019783585720072142, + "loss": 1.1967, + "step": 1509 + }, + { + "epoch": 0.2688746438746439, + "grad_norm": 0.47911885380744934, + "learning_rate": 0.00019783295993208271, + "loss": 1.162, + "step": 1510 + }, + { + "epoch": 0.2690527065527066, + "grad_norm": 0.4764275848865509, + "learning_rate": 0.00019783006074661037, + "loss": 1.1358, + "step": 1511 + }, + { + "epoch": 0.2692307692307692, + "grad_norm": 0.478545606136322, + "learning_rate": 0.00019782715964436124, + "loss": 1.0096, + "step": 1512 + }, + { + "epoch": 0.2694088319088319, + "grad_norm": 0.5512787699699402, + "learning_rate": 0.00019782425662539212, + "loss": 1.1799, + "step": 1513 + }, + { + "epoch": 0.2695868945868946, + "grad_norm": 0.5495108962059021, + "learning_rate": 0.00019782135168975988, + "loss": 1.0959, + "step": 1514 + }, + { + "epoch": 0.26976495726495725, + "grad_norm": 0.42052868008613586, + "learning_rate": 0.0001978184448375215, + "loss": 1.1872, + "step": 1515 + }, + { + "epoch": 0.26994301994301995, + 
"grad_norm": 0.4994426965713501, + "learning_rate": 0.0001978155360687339, + "loss": 1.0568, + "step": 1516 + }, + { + "epoch": 0.27012108262108264, + "grad_norm": 0.459577351808548, + "learning_rate": 0.00019781262538345402, + "loss": 1.0315, + "step": 1517 + }, + { + "epoch": 0.2702991452991453, + "grad_norm": 0.4792841374874115, + "learning_rate": 0.00019780971278173895, + "loss": 1.2055, + "step": 1518 + }, + { + "epoch": 0.270477207977208, + "grad_norm": 0.5017708539962769, + "learning_rate": 0.00019780679826364575, + "loss": 1.157, + "step": 1519 + }, + { + "epoch": 0.2706552706552707, + "grad_norm": 0.5197349786758423, + "learning_rate": 0.00019780388182923152, + "loss": 0.9101, + "step": 1520 + }, + { + "epoch": 0.2708333333333333, + "grad_norm": 0.4226742684841156, + "learning_rate": 0.00019780096347855338, + "loss": 1.0525, + "step": 1521 + }, + { + "epoch": 0.271011396011396, + "grad_norm": 0.5058164596557617, + "learning_rate": 0.00019779804321166852, + "loss": 0.931, + "step": 1522 + }, + { + "epoch": 0.2711894586894587, + "grad_norm": 0.44492244720458984, + "learning_rate": 0.00019779512102863418, + "loss": 1.0641, + "step": 1523 + }, + { + "epoch": 0.27136752136752135, + "grad_norm": 0.5348989963531494, + "learning_rate": 0.00019779219692950758, + "loss": 1.1692, + "step": 1524 + }, + { + "epoch": 0.27154558404558404, + "grad_norm": 0.4631774425506592, + "learning_rate": 0.00019778927091434602, + "loss": 1.0876, + "step": 1525 + }, + { + "epoch": 0.27172364672364674, + "grad_norm": 0.45957499742507935, + "learning_rate": 0.00019778634298320684, + "loss": 0.9527, + "step": 1526 + }, + { + "epoch": 0.2719017094017094, + "grad_norm": 0.4506755769252777, + "learning_rate": 0.00019778341313614743, + "loss": 1.086, + "step": 1527 + }, + { + "epoch": 0.2720797720797721, + "grad_norm": 0.4900587797164917, + "learning_rate": 0.00019778048137322513, + "loss": 0.9911, + "step": 1528 + }, + { + "epoch": 0.27225783475783477, + "grad_norm": 0.478127658367157, + 
"learning_rate": 0.00019777754769449745, + "loss": 1.2083, + "step": 1529 + }, + { + "epoch": 0.2724358974358974, + "grad_norm": 0.47220897674560547, + "learning_rate": 0.00019777461210002183, + "loss": 1.0313, + "step": 1530 + }, + { + "epoch": 0.2726139601139601, + "grad_norm": 0.4526277184486389, + "learning_rate": 0.0001977716745898558, + "loss": 1.2648, + "step": 1531 + }, + { + "epoch": 0.2727920227920228, + "grad_norm": 0.42907601594924927, + "learning_rate": 0.00019776873516405688, + "loss": 0.8645, + "step": 1532 + }, + { + "epoch": 0.27297008547008544, + "grad_norm": 0.43440163135528564, + "learning_rate": 0.00019776579382268272, + "loss": 0.9702, + "step": 1533 + }, + { + "epoch": 0.27314814814814814, + "grad_norm": 0.48213550448417664, + "learning_rate": 0.0001977628505657909, + "loss": 0.998, + "step": 1534 + }, + { + "epoch": 0.27332621082621084, + "grad_norm": 0.43385565280914307, + "learning_rate": 0.00019775990539343914, + "loss": 1.0575, + "step": 1535 + }, + { + "epoch": 0.27350427350427353, + "grad_norm": 0.45706847310066223, + "learning_rate": 0.00019775695830568507, + "loss": 1.3024, + "step": 1536 + }, + { + "epoch": 0.27368233618233617, + "grad_norm": 0.45769137144088745, + "learning_rate": 0.00019775400930258652, + "loss": 1.0987, + "step": 1537 + }, + { + "epoch": 0.27386039886039887, + "grad_norm": 0.44682395458221436, + "learning_rate": 0.00019775105838420117, + "loss": 1.1327, + "step": 1538 + }, + { + "epoch": 0.27403846153846156, + "grad_norm": 0.5923072099685669, + "learning_rate": 0.00019774810555058694, + "loss": 1.4766, + "step": 1539 + }, + { + "epoch": 0.2742165242165242, + "grad_norm": 0.4327206015586853, + "learning_rate": 0.0001977451508018016, + "loss": 1.1175, + "step": 1540 + }, + { + "epoch": 0.2743945868945869, + "grad_norm": 0.48036691546440125, + "learning_rate": 0.00019774219413790315, + "loss": 1.1189, + "step": 1541 + }, + { + "epoch": 0.2745726495726496, + "grad_norm": 0.41371914744377136, + "learning_rate": 
0.00019773923555894935, + "loss": 1.1366, + "step": 1542 + }, + { + "epoch": 0.27475071225071224, + "grad_norm": 0.4452378749847412, + "learning_rate": 0.00019773627506499832, + "loss": 0.9517, + "step": 1543 + }, + { + "epoch": 0.27492877492877493, + "grad_norm": 0.469098299741745, + "learning_rate": 0.00019773331265610802, + "loss": 1.0848, + "step": 1544 + }, + { + "epoch": 0.27510683760683763, + "grad_norm": 0.5390294790267944, + "learning_rate": 0.00019773034833233646, + "loss": 0.8589, + "step": 1545 + }, + { + "epoch": 0.27528490028490027, + "grad_norm": 0.5368238091468811, + "learning_rate": 0.00019772738209374174, + "loss": 1.2954, + "step": 1546 + }, + { + "epoch": 0.27546296296296297, + "grad_norm": 0.4705318510532379, + "learning_rate": 0.00019772441394038198, + "loss": 1.2252, + "step": 1547 + }, + { + "epoch": 0.27564102564102566, + "grad_norm": 0.4682813286781311, + "learning_rate": 0.00019772144387231533, + "loss": 1.0855, + "step": 1548 + }, + { + "epoch": 0.2758190883190883, + "grad_norm": 0.46876460313796997, + "learning_rate": 0.0001977184718896, + "loss": 1.1959, + "step": 1549 + }, + { + "epoch": 0.275997150997151, + "grad_norm": 0.4172806441783905, + "learning_rate": 0.00019771549799229416, + "loss": 1.2166, + "step": 1550 + }, + { + "epoch": 0.2761752136752137, + "grad_norm": 0.5088075399398804, + "learning_rate": 0.0001977125221804562, + "loss": 1.1285, + "step": 1551 + }, + { + "epoch": 0.27635327635327633, + "grad_norm": 0.4728628396987915, + "learning_rate": 0.0001977095444541443, + "loss": 1.2985, + "step": 1552 + }, + { + "epoch": 0.27653133903133903, + "grad_norm": 0.4431236684322357, + "learning_rate": 0.00019770656481341684, + "loss": 1.1298, + "step": 1553 + }, + { + "epoch": 0.2767094017094017, + "grad_norm": 0.474065363407135, + "learning_rate": 0.00019770358325833223, + "loss": 1.1915, + "step": 1554 + }, + { + "epoch": 0.27688746438746437, + "grad_norm": 0.45718875527381897, + "learning_rate": 0.00019770059978894885, + "loss": 
1.0626, + "step": 1555 + }, + { + "epoch": 0.27706552706552706, + "grad_norm": 0.49300211668014526, + "learning_rate": 0.00019769761440532522, + "loss": 1.0134, + "step": 1556 + }, + { + "epoch": 0.27724358974358976, + "grad_norm": 0.4389498829841614, + "learning_rate": 0.00019769462710751974, + "loss": 1.0292, + "step": 1557 + }, + { + "epoch": 0.2774216524216524, + "grad_norm": 0.47330448031425476, + "learning_rate": 0.000197691637895591, + "loss": 1.1273, + "step": 1558 + }, + { + "epoch": 0.2775997150997151, + "grad_norm": 0.5322058200836182, + "learning_rate": 0.00019768864676959755, + "loss": 1.059, + "step": 1559 + }, + { + "epoch": 0.2777777777777778, + "grad_norm": 0.4714536964893341, + "learning_rate": 0.000197685653729598, + "loss": 1.1987, + "step": 1560 + }, + { + "epoch": 0.27795584045584043, + "grad_norm": 0.48687809705734253, + "learning_rate": 0.00019768265877565097, + "loss": 1.3206, + "step": 1561 + }, + { + "epoch": 0.2781339031339031, + "grad_norm": 0.46066713333129883, + "learning_rate": 0.00019767966190781518, + "loss": 1.0845, + "step": 1562 + }, + { + "epoch": 0.2783119658119658, + "grad_norm": 0.44372090697288513, + "learning_rate": 0.00019767666312614935, + "loss": 1.0942, + "step": 1563 + }, + { + "epoch": 0.27849002849002846, + "grad_norm": 0.4615907073020935, + "learning_rate": 0.00019767366243071216, + "loss": 1.071, + "step": 1564 + }, + { + "epoch": 0.27866809116809116, + "grad_norm": 0.502097487449646, + "learning_rate": 0.0001976706598215625, + "loss": 1.1164, + "step": 1565 + }, + { + "epoch": 0.27884615384615385, + "grad_norm": 0.4371815621852875, + "learning_rate": 0.00019766765529875913, + "loss": 1.0252, + "step": 1566 + }, + { + "epoch": 0.27902421652421655, + "grad_norm": 0.43035808205604553, + "learning_rate": 0.00019766464886236093, + "loss": 1.073, + "step": 1567 + }, + { + "epoch": 0.2792022792022792, + "grad_norm": 0.49721601605415344, + "learning_rate": 0.00019766164051242683, + "loss": 1.0316, + "step": 1568 + }, + { 
+ "epoch": 0.2793803418803419, + "grad_norm": 0.44866231083869934, + "learning_rate": 0.00019765863024901576, + "loss": 1.0951, + "step": 1569 + }, + { + "epoch": 0.2795584045584046, + "grad_norm": 0.46318337321281433, + "learning_rate": 0.0001976556180721867, + "loss": 0.9836, + "step": 1570 + }, + { + "epoch": 0.2797364672364672, + "grad_norm": 0.4227696657180786, + "learning_rate": 0.00019765260398199868, + "loss": 1.0414, + "step": 1571 + }, + { + "epoch": 0.2799145299145299, + "grad_norm": 0.6062980890274048, + "learning_rate": 0.00019764958797851073, + "loss": 1.137, + "step": 1572 + }, + { + "epoch": 0.2800925925925926, + "grad_norm": 0.4856833219528198, + "learning_rate": 0.00019764657006178196, + "loss": 1.1361, + "step": 1573 + }, + { + "epoch": 0.28027065527065526, + "grad_norm": 0.45612895488739014, + "learning_rate": 0.00019764355023187146, + "loss": 1.0005, + "step": 1574 + }, + { + "epoch": 0.28044871794871795, + "grad_norm": 0.4143696129322052, + "learning_rate": 0.00019764052848883845, + "loss": 1.051, + "step": 1575 + }, + { + "epoch": 0.28062678062678065, + "grad_norm": 0.4532071352005005, + "learning_rate": 0.00019763750483274212, + "loss": 1.0595, + "step": 1576 + }, + { + "epoch": 0.2808048433048433, + "grad_norm": 0.4940357208251953, + "learning_rate": 0.0001976344792636417, + "loss": 1.0983, + "step": 1577 + }, + { + "epoch": 0.280982905982906, + "grad_norm": 0.44405099749565125, + "learning_rate": 0.0001976314517815965, + "loss": 1.0846, + "step": 1578 + }, + { + "epoch": 0.2811609686609687, + "grad_norm": 0.5508625507354736, + "learning_rate": 0.00019762842238666578, + "loss": 1.1722, + "step": 1579 + }, + { + "epoch": 0.2813390313390313, + "grad_norm": 0.5241084694862366, + "learning_rate": 0.00019762539107890894, + "loss": 1.351, + "step": 1580 + }, + { + "epoch": 0.281517094017094, + "grad_norm": 0.5307353734970093, + "learning_rate": 0.00019762235785838537, + "loss": 1.1868, + "step": 1581 + }, + { + "epoch": 0.2816951566951567, + 
"grad_norm": 0.45697924494743347, + "learning_rate": 0.00019761932272515447, + "loss": 1.1982, + "step": 1582 + }, + { + "epoch": 0.28187321937321935, + "grad_norm": 0.412483811378479, + "learning_rate": 0.00019761628567927574, + "loss": 1.0433, + "step": 1583 + }, + { + "epoch": 0.28205128205128205, + "grad_norm": 0.4614165425300598, + "learning_rate": 0.00019761324672080868, + "loss": 1.104, + "step": 1584 + }, + { + "epoch": 0.28222934472934474, + "grad_norm": 0.47644901275634766, + "learning_rate": 0.00019761020584981284, + "loss": 1.1037, + "step": 1585 + }, + { + "epoch": 0.2824074074074074, + "grad_norm": 0.4985184669494629, + "learning_rate": 0.00019760716306634773, + "loss": 1.2213, + "step": 1586 + }, + { + "epoch": 0.2825854700854701, + "grad_norm": 0.508301317691803, + "learning_rate": 0.00019760411837047305, + "loss": 1.1315, + "step": 1587 + }, + { + "epoch": 0.2827635327635328, + "grad_norm": 0.5346587300300598, + "learning_rate": 0.00019760107176224845, + "loss": 1.2281, + "step": 1588 + }, + { + "epoch": 0.2829415954415954, + "grad_norm": 0.5106825232505798, + "learning_rate": 0.00019759802324173357, + "loss": 1.2904, + "step": 1589 + }, + { + "epoch": 0.2831196581196581, + "grad_norm": 0.46458688378334045, + "learning_rate": 0.00019759497280898817, + "loss": 1.0861, + "step": 1590 + }, + { + "epoch": 0.2832977207977208, + "grad_norm": 0.49115365743637085, + "learning_rate": 0.00019759192046407201, + "loss": 1.0529, + "step": 1591 + }, + { + "epoch": 0.28347578347578345, + "grad_norm": 0.5114167332649231, + "learning_rate": 0.0001975888662070449, + "loss": 1.2555, + "step": 1592 + }, + { + "epoch": 0.28365384615384615, + "grad_norm": 0.45844775438308716, + "learning_rate": 0.0001975858100379667, + "loss": 1.0662, + "step": 1593 + }, + { + "epoch": 0.28383190883190884, + "grad_norm": 0.4684161841869354, + "learning_rate": 0.00019758275195689727, + "loss": 1.0537, + "step": 1594 + }, + { + "epoch": 0.28400997150997154, + "grad_norm": 
0.4816220998764038, + "learning_rate": 0.0001975796919638965, + "loss": 1.126, + "step": 1595 + }, + { + "epoch": 0.2841880341880342, + "grad_norm": 0.46578118205070496, + "learning_rate": 0.0001975766300590244, + "loss": 0.9651, + "step": 1596 + }, + { + "epoch": 0.2843660968660969, + "grad_norm": 0.4181675612926483, + "learning_rate": 0.0001975735662423409, + "loss": 1.0888, + "step": 1597 + }, + { + "epoch": 0.28454415954415957, + "grad_norm": 0.49417954683303833, + "learning_rate": 0.00019757050051390609, + "loss": 1.1878, + "step": 1598 + }, + { + "epoch": 0.2847222222222222, + "grad_norm": 0.47264960408210754, + "learning_rate": 0.00019756743287377998, + "loss": 1.027, + "step": 1599 + }, + { + "epoch": 0.2849002849002849, + "grad_norm": 0.47686338424682617, + "learning_rate": 0.0001975643633220227, + "loss": 1.1307, + "step": 1600 + }, + { + "epoch": 0.2850783475783476, + "grad_norm": 0.5571266412734985, + "learning_rate": 0.00019756129185869443, + "loss": 0.984, + "step": 1601 + }, + { + "epoch": 0.28525641025641024, + "grad_norm": 0.46942809224128723, + "learning_rate": 0.00019755821848385527, + "loss": 1.0397, + "step": 1602 + }, + { + "epoch": 0.28543447293447294, + "grad_norm": 0.6325890421867371, + "learning_rate": 0.00019755514319756551, + "loss": 1.0918, + "step": 1603 + }, + { + "epoch": 0.28561253561253563, + "grad_norm": 0.5297608375549316, + "learning_rate": 0.00019755206599988533, + "loss": 0.9911, + "step": 1604 + }, + { + "epoch": 0.2857905982905983, + "grad_norm": 0.4736945331096649, + "learning_rate": 0.00019754898689087512, + "loss": 1.0786, + "step": 1605 + }, + { + "epoch": 0.28596866096866097, + "grad_norm": 0.5048685669898987, + "learning_rate": 0.00019754590587059512, + "loss": 0.9834, + "step": 1606 + }, + { + "epoch": 0.28614672364672367, + "grad_norm": 0.3823149502277374, + "learning_rate": 0.00019754282293910574, + "loss": 0.8341, + "step": 1607 + }, + { + "epoch": 0.2863247863247863, + "grad_norm": 0.44071945548057556, + 
"learning_rate": 0.00019753973809646738, + "loss": 1.131, + "step": 1608 + }, + { + "epoch": 0.286502849002849, + "grad_norm": 0.44182759523391724, + "learning_rate": 0.00019753665134274043, + "loss": 1.0321, + "step": 1609 + }, + { + "epoch": 0.2866809116809117, + "grad_norm": 0.4486250877380371, + "learning_rate": 0.00019753356267798546, + "loss": 0.9941, + "step": 1610 + }, + { + "epoch": 0.28685897435897434, + "grad_norm": 0.42796584963798523, + "learning_rate": 0.00019753047210226292, + "loss": 1.0235, + "step": 1611 + }, + { + "epoch": 0.28703703703703703, + "grad_norm": 0.47294023633003235, + "learning_rate": 0.00019752737961563336, + "loss": 1.11, + "step": 1612 + }, + { + "epoch": 0.28721509971509973, + "grad_norm": 0.44550734758377075, + "learning_rate": 0.00019752428521815742, + "loss": 1.0849, + "step": 1613 + }, + { + "epoch": 0.28739316239316237, + "grad_norm": 0.44189929962158203, + "learning_rate": 0.0001975211889098957, + "loss": 0.8904, + "step": 1614 + }, + { + "epoch": 0.28757122507122507, + "grad_norm": 0.5302733182907104, + "learning_rate": 0.00019751809069090885, + "loss": 1.2348, + "step": 1615 + }, + { + "epoch": 0.28774928774928776, + "grad_norm": 0.5951390862464905, + "learning_rate": 0.00019751499056125762, + "loss": 1.3035, + "step": 1616 + }, + { + "epoch": 0.2879273504273504, + "grad_norm": 0.5431534647941589, + "learning_rate": 0.0001975118885210027, + "loss": 1.0016, + "step": 1617 + }, + { + "epoch": 0.2881054131054131, + "grad_norm": 0.47301986813545227, + "learning_rate": 0.00019750878457020489, + "loss": 1.2245, + "step": 1618 + }, + { + "epoch": 0.2882834757834758, + "grad_norm": 0.44785359501838684, + "learning_rate": 0.00019750567870892497, + "loss": 1.122, + "step": 1619 + }, + { + "epoch": 0.28846153846153844, + "grad_norm": 0.49494361877441406, + "learning_rate": 0.00019750257093722383, + "loss": 0.9421, + "step": 1620 + }, + { + "epoch": 0.28863960113960113, + "grad_norm": 0.4484521150588989, + "learning_rate": 
0.00019749946125516242, + "loss": 1.2146, + "step": 1621 + }, + { + "epoch": 0.28881766381766383, + "grad_norm": 0.4635269343852997, + "learning_rate": 0.00019749634966280156, + "loss": 0.976, + "step": 1622 + }, + { + "epoch": 0.28899572649572647, + "grad_norm": 0.5532249808311462, + "learning_rate": 0.00019749323616020226, + "loss": 1.1818, + "step": 1623 + }, + { + "epoch": 0.28917378917378916, + "grad_norm": 0.4730629622936249, + "learning_rate": 0.00019749012074742552, + "loss": 1.0321, + "step": 1624 + }, + { + "epoch": 0.28935185185185186, + "grad_norm": 0.47437289357185364, + "learning_rate": 0.0001974870034245324, + "loss": 1.1572, + "step": 1625 + }, + { + "epoch": 0.28952991452991456, + "grad_norm": 0.4796304404735565, + "learning_rate": 0.00019748388419158394, + "loss": 1.1667, + "step": 1626 + }, + { + "epoch": 0.2897079772079772, + "grad_norm": 0.42686304450035095, + "learning_rate": 0.0001974807630486413, + "loss": 0.9824, + "step": 1627 + }, + { + "epoch": 0.2898860398860399, + "grad_norm": 0.4444865584373474, + "learning_rate": 0.00019747763999576558, + "loss": 1.2789, + "step": 1628 + }, + { + "epoch": 0.2900641025641026, + "grad_norm": 0.5039985179901123, + "learning_rate": 0.000197474515033018, + "loss": 1.1488, + "step": 1629 + }, + { + "epoch": 0.29024216524216523, + "grad_norm": 0.581479549407959, + "learning_rate": 0.00019747138816045978, + "loss": 1.1232, + "step": 1630 + }, + { + "epoch": 0.2904202279202279, + "grad_norm": 0.5415821075439453, + "learning_rate": 0.00019746825937815222, + "loss": 1.2326, + "step": 1631 + }, + { + "epoch": 0.2905982905982906, + "grad_norm": 0.45528364181518555, + "learning_rate": 0.00019746512868615656, + "loss": 1.0246, + "step": 1632 + }, + { + "epoch": 0.29077635327635326, + "grad_norm": 0.5255574584007263, + "learning_rate": 0.00019746199608453418, + "loss": 1.0592, + "step": 1633 + }, + { + "epoch": 0.29095441595441596, + "grad_norm": 0.5064096450805664, + "learning_rate": 0.00019745886157334646, + 
"loss": 1.3439, + "step": 1634 + }, + { + "epoch": 0.29113247863247865, + "grad_norm": 0.500848650932312, + "learning_rate": 0.00019745572515265475, + "loss": 1.1212, + "step": 1635 + }, + { + "epoch": 0.2913105413105413, + "grad_norm": 0.5229088068008423, + "learning_rate": 0.00019745258682252062, + "loss": 1.1019, + "step": 1636 + }, + { + "epoch": 0.291488603988604, + "grad_norm": 0.4494398832321167, + "learning_rate": 0.00019744944658300545, + "loss": 1.1298, + "step": 1637 + }, + { + "epoch": 0.2916666666666667, + "grad_norm": 0.48383277654647827, + "learning_rate": 0.00019744630443417082, + "loss": 1.206, + "step": 1638 + }, + { + "epoch": 0.2918447293447293, + "grad_norm": 0.4870131313800812, + "learning_rate": 0.00019744316037607828, + "loss": 1.2096, + "step": 1639 + }, + { + "epoch": 0.292022792022792, + "grad_norm": 0.4153090715408325, + "learning_rate": 0.00019744001440878944, + "loss": 1.0478, + "step": 1640 + }, + { + "epoch": 0.2922008547008547, + "grad_norm": 0.4262249171733856, + "learning_rate": 0.0001974368665323659, + "loss": 1.0393, + "step": 1641 + }, + { + "epoch": 0.29237891737891736, + "grad_norm": 0.46131134033203125, + "learning_rate": 0.00019743371674686938, + "loss": 1.0908, + "step": 1642 + }, + { + "epoch": 0.29255698005698005, + "grad_norm": 0.44877463579177856, + "learning_rate": 0.0001974305650523616, + "loss": 1.1906, + "step": 1643 + }, + { + "epoch": 0.29273504273504275, + "grad_norm": 0.5199326276779175, + "learning_rate": 0.00019742741144890432, + "loss": 1.1147, + "step": 1644 + }, + { + "epoch": 0.2929131054131054, + "grad_norm": 0.48142504692077637, + "learning_rate": 0.00019742425593655924, + "loss": 1.1951, + "step": 1645 + }, + { + "epoch": 0.2930911680911681, + "grad_norm": 0.5672988891601562, + "learning_rate": 0.0001974210985153883, + "loss": 1.1817, + "step": 1646 + }, + { + "epoch": 0.2932692307692308, + "grad_norm": 0.38135233521461487, + "learning_rate": 0.00019741793918545326, + "loss": 0.8567, + "step": 1647 + 
}, + { + "epoch": 0.2934472934472934, + "grad_norm": 0.6153588891029358, + "learning_rate": 0.0001974147779468161, + "loss": 1.0593, + "step": 1648 + }, + { + "epoch": 0.2936253561253561, + "grad_norm": 0.38935527205467224, + "learning_rate": 0.0001974116147995387, + "loss": 0.9907, + "step": 1649 + }, + { + "epoch": 0.2938034188034188, + "grad_norm": 0.467351496219635, + "learning_rate": 0.0001974084497436831, + "loss": 1.091, + "step": 1650 + }, + { + "epoch": 0.29398148148148145, + "grad_norm": 0.45613420009613037, + "learning_rate": 0.00019740528277931128, + "loss": 0.6789, + "step": 1651 + }, + { + "epoch": 0.29415954415954415, + "grad_norm": 0.4045158326625824, + "learning_rate": 0.00019740211390648524, + "loss": 1.0727, + "step": 1652 + }, + { + "epoch": 0.29433760683760685, + "grad_norm": 0.5122803449630737, + "learning_rate": 0.00019739894312526714, + "loss": 1.2297, + "step": 1653 + }, + { + "epoch": 0.29451566951566954, + "grad_norm": 0.44304123520851135, + "learning_rate": 0.00019739577043571908, + "loss": 0.9562, + "step": 1654 + }, + { + "epoch": 0.2946937321937322, + "grad_norm": 0.6070618629455566, + "learning_rate": 0.00019739259583790322, + "loss": 1.2745, + "step": 1655 + }, + { + "epoch": 0.2948717948717949, + "grad_norm": 0.48815637826919556, + "learning_rate": 0.00019738941933188176, + "loss": 1.0574, + "step": 1656 + }, + { + "epoch": 0.2950498575498576, + "grad_norm": 0.5067802667617798, + "learning_rate": 0.00019738624091771693, + "loss": 1.1874, + "step": 1657 + }, + { + "epoch": 0.2952279202279202, + "grad_norm": 0.4956928491592407, + "learning_rate": 0.000197383060595471, + "loss": 1.1085, + "step": 1658 + }, + { + "epoch": 0.2954059829059829, + "grad_norm": 0.46313008666038513, + "learning_rate": 0.00019737987836520633, + "loss": 1.0548, + "step": 1659 + }, + { + "epoch": 0.2955840455840456, + "grad_norm": 0.49944064021110535, + "learning_rate": 0.0001973766942269852, + "loss": 1.1485, + "step": 1660 + }, + { + "epoch": 
0.29576210826210825, + "grad_norm": 0.4743517339229584, + "learning_rate": 0.00019737350818087003, + "loss": 0.9279, + "step": 1661 + }, + { + "epoch": 0.29594017094017094, + "grad_norm": 0.45935431122779846, + "learning_rate": 0.00019737032022692326, + "loss": 0.9574, + "step": 1662 + }, + { + "epoch": 0.29611823361823364, + "grad_norm": 0.4550873637199402, + "learning_rate": 0.00019736713036520734, + "loss": 1.1642, + "step": 1663 + }, + { + "epoch": 0.2962962962962963, + "grad_norm": 0.45252951979637146, + "learning_rate": 0.00019736393859578474, + "loss": 1.0113, + "step": 1664 + }, + { + "epoch": 0.296474358974359, + "grad_norm": 0.5147238969802856, + "learning_rate": 0.00019736074491871804, + "loss": 1.1604, + "step": 1665 + }, + { + "epoch": 0.29665242165242167, + "grad_norm": 0.5122934579849243, + "learning_rate": 0.00019735754933406977, + "loss": 0.9525, + "step": 1666 + }, + { + "epoch": 0.2968304843304843, + "grad_norm": 0.438620001077652, + "learning_rate": 0.00019735435184190257, + "loss": 1.0728, + "step": 1667 + }, + { + "epoch": 0.297008547008547, + "grad_norm": 0.41970670223236084, + "learning_rate": 0.00019735115244227908, + "loss": 0.9782, + "step": 1668 + }, + { + "epoch": 0.2971866096866097, + "grad_norm": 0.5447152256965637, + "learning_rate": 0.000197347951135262, + "loss": 1.0633, + "step": 1669 + }, + { + "epoch": 0.29736467236467234, + "grad_norm": 0.4846996068954468, + "learning_rate": 0.00019734474792091407, + "loss": 0.9019, + "step": 1670 + }, + { + "epoch": 0.29754273504273504, + "grad_norm": 0.4721437990665436, + "learning_rate": 0.00019734154279929796, + "loss": 1.1793, + "step": 1671 + }, + { + "epoch": 0.29772079772079774, + "grad_norm": 0.4659852385520935, + "learning_rate": 0.00019733833577047655, + "loss": 1.1503, + "step": 1672 + }, + { + "epoch": 0.2978988603988604, + "grad_norm": 0.3733183443546295, + "learning_rate": 0.00019733512683451268, + "loss": 0.7763, + "step": 1673 + }, + { + "epoch": 0.2980769230769231, + 
"grad_norm": 0.4898292124271393, + "learning_rate": 0.0001973319159914692, + "loss": 1.3146, + "step": 1674 + }, + { + "epoch": 0.29825498575498577, + "grad_norm": 0.41774725914001465, + "learning_rate": 0.00019732870324140899, + "loss": 1.2069, + "step": 1675 + }, + { + "epoch": 0.2984330484330484, + "grad_norm": 0.4607912003993988, + "learning_rate": 0.000197325488584395, + "loss": 1.2255, + "step": 1676 + }, + { + "epoch": 0.2986111111111111, + "grad_norm": 0.4692424237728119, + "learning_rate": 0.00019732227202049025, + "loss": 1.0793, + "step": 1677 + }, + { + "epoch": 0.2987891737891738, + "grad_norm": 0.5925022959709167, + "learning_rate": 0.00019731905354975778, + "loss": 1.0297, + "step": 1678 + }, + { + "epoch": 0.29896723646723644, + "grad_norm": 0.44047990441322327, + "learning_rate": 0.00019731583317226056, + "loss": 1.0982, + "step": 1679 + }, + { + "epoch": 0.29914529914529914, + "grad_norm": 0.5863066911697388, + "learning_rate": 0.0001973126108880618, + "loss": 1.0284, + "step": 1680 + }, + { + "epoch": 0.29932336182336183, + "grad_norm": 0.48962152004241943, + "learning_rate": 0.00019730938669722457, + "loss": 1.1861, + "step": 1681 + }, + { + "epoch": 0.29950142450142453, + "grad_norm": 0.5445577502250671, + "learning_rate": 0.00019730616059981205, + "loss": 1.2574, + "step": 1682 + }, + { + "epoch": 0.29967948717948717, + "grad_norm": 0.49327564239501953, + "learning_rate": 0.00019730293259588743, + "loss": 0.9578, + "step": 1683 + }, + { + "epoch": 0.29985754985754987, + "grad_norm": 0.4252840578556061, + "learning_rate": 0.00019729970268551398, + "loss": 1.0083, + "step": 1684 + }, + { + "epoch": 0.30003561253561256, + "grad_norm": 0.5140926241874695, + "learning_rate": 0.000197296470868755, + "loss": 1.3263, + "step": 1685 + }, + { + "epoch": 0.3002136752136752, + "grad_norm": 0.5143948197364807, + "learning_rate": 0.00019729323714567375, + "loss": 1.0424, + "step": 1686 + }, + { + "epoch": 0.3003917378917379, + "grad_norm": 
0.3811354339122772, + "learning_rate": 0.00019729000151633367, + "loss": 0.6319, + "step": 1687 + }, + { + "epoch": 0.3005698005698006, + "grad_norm": 0.5249716639518738, + "learning_rate": 0.0001972867639807981, + "loss": 1.0173, + "step": 1688 + }, + { + "epoch": 0.30074786324786323, + "grad_norm": 0.41832098364830017, + "learning_rate": 0.00019728352453913048, + "loss": 1.0503, + "step": 1689 + }, + { + "epoch": 0.30092592592592593, + "grad_norm": 0.5961149334907532, + "learning_rate": 0.00019728028319139428, + "loss": 1.1843, + "step": 1690 + }, + { + "epoch": 0.3011039886039886, + "grad_norm": 0.44083690643310547, + "learning_rate": 0.00019727703993765303, + "loss": 1.1311, + "step": 1691 + }, + { + "epoch": 0.30128205128205127, + "grad_norm": 0.4368111491203308, + "learning_rate": 0.00019727379477797022, + "loss": 0.9463, + "step": 1692 + }, + { + "epoch": 0.30146011396011396, + "grad_norm": 0.5289376974105835, + "learning_rate": 0.00019727054771240954, + "loss": 0.9836, + "step": 1693 + }, + { + "epoch": 0.30163817663817666, + "grad_norm": 0.4132843613624573, + "learning_rate": 0.00019726729874103448, + "loss": 1.1052, + "step": 1694 + }, + { + "epoch": 0.3018162393162393, + "grad_norm": 0.4919086992740631, + "learning_rate": 0.00019726404786390877, + "loss": 1.2219, + "step": 1695 + }, + { + "epoch": 0.301994301994302, + "grad_norm": 0.42561691999435425, + "learning_rate": 0.0001972607950810961, + "loss": 1.0756, + "step": 1696 + }, + { + "epoch": 0.3021723646723647, + "grad_norm": 0.5030396580696106, + "learning_rate": 0.0001972575403926602, + "loss": 1.2207, + "step": 1697 + }, + { + "epoch": 0.30235042735042733, + "grad_norm": 0.4779801666736603, + "learning_rate": 0.0001972542837986648, + "loss": 1.194, + "step": 1698 + }, + { + "epoch": 0.30252849002849, + "grad_norm": 0.45395568013191223, + "learning_rate": 0.00019725102529917377, + "loss": 1.0775, + "step": 1699 + }, + { + "epoch": 0.3027065527065527, + "grad_norm": 0.6540699005126953, + 
"learning_rate": 0.0001972477648942509, + "loss": 1.181, + "step": 1700 + }, + { + "epoch": 0.30288461538461536, + "grad_norm": 0.46281275153160095, + "learning_rate": 0.00019724450258396008, + "loss": 0.629, + "step": 1701 + }, + { + "epoch": 0.30306267806267806, + "grad_norm": 0.3452845811843872, + "learning_rate": 0.00019724123836836527, + "loss": 0.51, + "step": 1702 + }, + { + "epoch": 0.30324074074074076, + "grad_norm": 0.4507991671562195, + "learning_rate": 0.00019723797224753038, + "loss": 1.0258, + "step": 1703 + }, + { + "epoch": 0.3034188034188034, + "grad_norm": 0.5385412573814392, + "learning_rate": 0.0001972347042215194, + "loss": 1.0232, + "step": 1704 + }, + { + "epoch": 0.3035968660968661, + "grad_norm": 0.4460466504096985, + "learning_rate": 0.00019723143429039642, + "loss": 1.1307, + "step": 1705 + }, + { + "epoch": 0.3037749287749288, + "grad_norm": 0.5229718685150146, + "learning_rate": 0.00019722816245422545, + "loss": 1.0964, + "step": 1706 + }, + { + "epoch": 0.30395299145299143, + "grad_norm": 0.4776979088783264, + "learning_rate": 0.00019722488871307058, + "loss": 1.2678, + "step": 1707 + }, + { + "epoch": 0.3041310541310541, + "grad_norm": 0.5371831655502319, + "learning_rate": 0.00019722161306699601, + "loss": 1.2808, + "step": 1708 + }, + { + "epoch": 0.3043091168091168, + "grad_norm": 0.45322108268737793, + "learning_rate": 0.0001972183355160659, + "loss": 1.0775, + "step": 1709 + }, + { + "epoch": 0.30448717948717946, + "grad_norm": 0.5036569833755493, + "learning_rate": 0.00019721505606034448, + "loss": 1.1859, + "step": 1710 + }, + { + "epoch": 0.30466524216524216, + "grad_norm": 0.5425969958305359, + "learning_rate": 0.00019721177469989593, + "loss": 1.0173, + "step": 1711 + }, + { + "epoch": 0.30484330484330485, + "grad_norm": 0.5638980269432068, + "learning_rate": 0.00019720849143478462, + "loss": 1.182, + "step": 1712 + }, + { + "epoch": 0.30502136752136755, + "grad_norm": 0.5160546898841858, + "learning_rate": 
0.00019720520626507486, + "loss": 0.9853, + "step": 1713 + }, + { + "epoch": 0.3051994301994302, + "grad_norm": 0.5079004168510437, + "learning_rate": 0.000197201919190831, + "loss": 1.3154, + "step": 1714 + }, + { + "epoch": 0.3053774928774929, + "grad_norm": 0.4590355455875397, + "learning_rate": 0.00019719863021211745, + "loss": 1.007, + "step": 1715 + }, + { + "epoch": 0.3055555555555556, + "grad_norm": 0.49656423926353455, + "learning_rate": 0.00019719533932899865, + "loss": 1.2187, + "step": 1716 + }, + { + "epoch": 0.3057336182336182, + "grad_norm": 0.46426209807395935, + "learning_rate": 0.0001971920465415391, + "loss": 1.3007, + "step": 1717 + }, + { + "epoch": 0.3059116809116809, + "grad_norm": 0.5211917757987976, + "learning_rate": 0.00019718875184980328, + "loss": 1.2256, + "step": 1718 + }, + { + "epoch": 0.3060897435897436, + "grad_norm": 0.42953309416770935, + "learning_rate": 0.00019718545525385578, + "loss": 1.2838, + "step": 1719 + }, + { + "epoch": 0.30626780626780625, + "grad_norm": 0.4893105924129486, + "learning_rate": 0.00019718215675376116, + "loss": 1.052, + "step": 1720 + }, + { + "epoch": 0.30644586894586895, + "grad_norm": 0.4833602011203766, + "learning_rate": 0.00019717885634958405, + "loss": 1.069, + "step": 1721 + }, + { + "epoch": 0.30662393162393164, + "grad_norm": 0.502176821231842, + "learning_rate": 0.0001971755540413891, + "loss": 1.1659, + "step": 1722 + }, + { + "epoch": 0.3068019943019943, + "grad_norm": 0.4648856818675995, + "learning_rate": 0.00019717224982924108, + "loss": 1.1873, + "step": 1723 + }, + { + "epoch": 0.306980056980057, + "grad_norm": 0.405429869890213, + "learning_rate": 0.00019716894371320465, + "loss": 0.99, + "step": 1724 + }, + { + "epoch": 0.3071581196581197, + "grad_norm": 0.4306945204734802, + "learning_rate": 0.00019716563569334463, + "loss": 0.8751, + "step": 1725 + }, + { + "epoch": 0.3073361823361823, + "grad_norm": 0.49424824118614197, + "learning_rate": 0.00019716232576972583, + "loss": 0.9205, 
+ "step": 1726 + }, + { + "epoch": 0.307514245014245, + "grad_norm": 0.5044034123420715, + "learning_rate": 0.00019715901394241306, + "loss": 1.2042, + "step": 1727 + }, + { + "epoch": 0.3076923076923077, + "grad_norm": 0.512180507183075, + "learning_rate": 0.00019715570021147126, + "loss": 1.1644, + "step": 1728 + }, + { + "epoch": 0.30787037037037035, + "grad_norm": 0.4377981126308441, + "learning_rate": 0.00019715238457696538, + "loss": 1.1625, + "step": 1729 + }, + { + "epoch": 0.30804843304843305, + "grad_norm": 0.49107855558395386, + "learning_rate": 0.00019714906703896027, + "loss": 1.1037, + "step": 1730 + }, + { + "epoch": 0.30822649572649574, + "grad_norm": 0.47342559695243835, + "learning_rate": 0.00019714574759752105, + "loss": 1.3186, + "step": 1731 + }, + { + "epoch": 0.3084045584045584, + "grad_norm": 0.487177312374115, + "learning_rate": 0.0001971424262527127, + "loss": 1.1196, + "step": 1732 + }, + { + "epoch": 0.3085826210826211, + "grad_norm": 0.5290025472640991, + "learning_rate": 0.0001971391030046003, + "loss": 1.2103, + "step": 1733 + }, + { + "epoch": 0.3087606837606838, + "grad_norm": 0.4587760269641876, + "learning_rate": 0.00019713577785324896, + "loss": 1.1017, + "step": 1734 + }, + { + "epoch": 0.3089387464387464, + "grad_norm": 0.45323294401168823, + "learning_rate": 0.00019713245079872388, + "loss": 1.0, + "step": 1735 + }, + { + "epoch": 0.3091168091168091, + "grad_norm": 0.43414804339408875, + "learning_rate": 0.00019712912184109013, + "loss": 1.0341, + "step": 1736 + }, + { + "epoch": 0.3092948717948718, + "grad_norm": 0.49604663252830505, + "learning_rate": 0.00019712579098041304, + "loss": 0.9437, + "step": 1737 + }, + { + "epoch": 0.30947293447293445, + "grad_norm": 0.48580703139305115, + "learning_rate": 0.00019712245821675785, + "loss": 1.2622, + "step": 1738 + }, + { + "epoch": 0.30965099715099714, + "grad_norm": 0.45333603024482727, + "learning_rate": 0.00019711912355018982, + "loss": 1.2063, + "step": 1739 + }, + { + 
"epoch": 0.30982905982905984, + "grad_norm": 0.5990764498710632, + "learning_rate": 0.00019711578698077432, + "loss": 1.5097, + "step": 1740 + }, + { + "epoch": 0.31000712250712253, + "grad_norm": 0.4386102259159088, + "learning_rate": 0.0001971124485085767, + "loss": 1.1283, + "step": 1741 + }, + { + "epoch": 0.3101851851851852, + "grad_norm": 0.4476035237312317, + "learning_rate": 0.00019710910813366242, + "loss": 0.8922, + "step": 1742 + }, + { + "epoch": 0.31036324786324787, + "grad_norm": 0.5276228785514832, + "learning_rate": 0.00019710576585609685, + "loss": 1.2373, + "step": 1743 + }, + { + "epoch": 0.31054131054131057, + "grad_norm": 0.4885637164115906, + "learning_rate": 0.00019710242167594557, + "loss": 1.0881, + "step": 1744 + }, + { + "epoch": 0.3107193732193732, + "grad_norm": 0.421132355928421, + "learning_rate": 0.000197099075593274, + "loss": 1.0544, + "step": 1745 + }, + { + "epoch": 0.3108974358974359, + "grad_norm": 0.5257927179336548, + "learning_rate": 0.00019709572760814777, + "loss": 1.265, + "step": 1746 + }, + { + "epoch": 0.3110754985754986, + "grad_norm": 0.5164850950241089, + "learning_rate": 0.00019709237772063247, + "loss": 0.9593, + "step": 1747 + }, + { + "epoch": 0.31125356125356124, + "grad_norm": 0.5176383256912231, + "learning_rate": 0.00019708902593079374, + "loss": 1.0194, + "step": 1748 + }, + { + "epoch": 0.31143162393162394, + "grad_norm": 0.4620790481567383, + "learning_rate": 0.00019708567223869716, + "loss": 0.9241, + "step": 1749 + }, + { + "epoch": 0.31160968660968663, + "grad_norm": 0.48307979106903076, + "learning_rate": 0.00019708231664440854, + "loss": 1.2314, + "step": 1750 + }, + { + "epoch": 0.31178774928774927, + "grad_norm": 0.4931468069553375, + "learning_rate": 0.00019707895914799364, + "loss": 1.2065, + "step": 1751 + }, + { + "epoch": 0.31196581196581197, + "grad_norm": 0.5035979747772217, + "learning_rate": 0.00019707559974951818, + "loss": 1.1867, + "step": 1752 + }, + { + "epoch": 0.31214387464387466, + 
"grad_norm": 0.47543632984161377, + "learning_rate": 0.00019707223844904795, + "loss": 1.0603, + "step": 1753 + }, + { + "epoch": 0.3123219373219373, + "grad_norm": 0.49929797649383545, + "learning_rate": 0.00019706887524664892, + "loss": 1.0597, + "step": 1754 + }, + { + "epoch": 0.3125, + "grad_norm": 0.5075222253799438, + "learning_rate": 0.00019706551014238687, + "loss": 1.1398, + "step": 1755 + }, + { + "epoch": 0.3126780626780627, + "grad_norm": 0.5096884369850159, + "learning_rate": 0.00019706214313632784, + "loss": 1.1382, + "step": 1756 + }, + { + "epoch": 0.31285612535612534, + "grad_norm": 0.4629988372325897, + "learning_rate": 0.0001970587742285377, + "loss": 1.0009, + "step": 1757 + }, + { + "epoch": 0.31303418803418803, + "grad_norm": 0.5244084596633911, + "learning_rate": 0.00019705540341908253, + "loss": 1.047, + "step": 1758 + }, + { + "epoch": 0.31321225071225073, + "grad_norm": 0.5136716961860657, + "learning_rate": 0.00019705203070802832, + "loss": 1.29, + "step": 1759 + }, + { + "epoch": 0.31339031339031337, + "grad_norm": 0.43991541862487793, + "learning_rate": 0.0001970486560954412, + "loss": 0.9605, + "step": 1760 + }, + { + "epoch": 0.31356837606837606, + "grad_norm": 0.4633477032184601, + "learning_rate": 0.00019704527958138725, + "loss": 1.1507, + "step": 1761 + }, + { + "epoch": 0.31374643874643876, + "grad_norm": 0.4419999420642853, + "learning_rate": 0.00019704190116593266, + "loss": 0.9262, + "step": 1762 + }, + { + "epoch": 0.3139245014245014, + "grad_norm": 0.49359434843063354, + "learning_rate": 0.00019703852084914357, + "loss": 0.9348, + "step": 1763 + }, + { + "epoch": 0.3141025641025641, + "grad_norm": 0.5072139501571655, + "learning_rate": 0.00019703513863108627, + "loss": 1.1592, + "step": 1764 + }, + { + "epoch": 0.3142806267806268, + "grad_norm": 0.45969831943511963, + "learning_rate": 0.00019703175451182698, + "loss": 1.1519, + "step": 1765 + }, + { + "epoch": 0.31445868945868943, + "grad_norm": 0.5148758292198181, + 
"learning_rate": 0.00019702836849143208, + "loss": 1.1673, + "step": 1766 + }, + { + "epoch": 0.31463675213675213, + "grad_norm": 0.43033209443092346, + "learning_rate": 0.0001970249805699678, + "loss": 0.9256, + "step": 1767 + }, + { + "epoch": 0.3148148148148148, + "grad_norm": 0.48143425583839417, + "learning_rate": 0.00019702159074750058, + "loss": 1.08, + "step": 1768 + }, + { + "epoch": 0.31499287749287747, + "grad_norm": 0.4780619740486145, + "learning_rate": 0.00019701819902409685, + "loss": 1.1198, + "step": 1769 + }, + { + "epoch": 0.31517094017094016, + "grad_norm": 0.4662075936794281, + "learning_rate": 0.00019701480539982305, + "loss": 0.8424, + "step": 1770 + }, + { + "epoch": 0.31534900284900286, + "grad_norm": 0.503901481628418, + "learning_rate": 0.00019701140987474566, + "loss": 1.1026, + "step": 1771 + }, + { + "epoch": 0.31552706552706555, + "grad_norm": 0.5197132229804993, + "learning_rate": 0.00019700801244893124, + "loss": 1.2148, + "step": 1772 + }, + { + "epoch": 0.3157051282051282, + "grad_norm": 0.4746309220790863, + "learning_rate": 0.00019700461312244634, + "loss": 1.0906, + "step": 1773 + }, + { + "epoch": 0.3158831908831909, + "grad_norm": 0.5277339816093445, + "learning_rate": 0.00019700121189535752, + "loss": 1.0588, + "step": 1774 + }, + { + "epoch": 0.3160612535612536, + "grad_norm": 0.436002254486084, + "learning_rate": 0.00019699780876773147, + "loss": 1.0341, + "step": 1775 + }, + { + "epoch": 0.3162393162393162, + "grad_norm": 0.5171145796775818, + "learning_rate": 0.00019699440373963486, + "loss": 1.282, + "step": 1776 + }, + { + "epoch": 0.3164173789173789, + "grad_norm": 0.38382846117019653, + "learning_rate": 0.00019699099681113436, + "loss": 0.8908, + "step": 1777 + }, + { + "epoch": 0.3165954415954416, + "grad_norm": 0.4621630609035492, + "learning_rate": 0.0001969875879822968, + "loss": 1.1074, + "step": 1778 + }, + { + "epoch": 0.31677350427350426, + "grad_norm": 0.5543130040168762, + "learning_rate": 
0.00019698417725318892, + "loss": 0.9682, + "step": 1779 + }, + { + "epoch": 0.31695156695156695, + "grad_norm": 0.49534836411476135, + "learning_rate": 0.00019698076462387753, + "loss": 1.107, + "step": 1780 + }, + { + "epoch": 0.31712962962962965, + "grad_norm": 0.48844948410987854, + "learning_rate": 0.00019697735009442956, + "loss": 1.1295, + "step": 1781 + }, + { + "epoch": 0.3173076923076923, + "grad_norm": 0.5070686936378479, + "learning_rate": 0.00019697393366491185, + "loss": 1.083, + "step": 1782 + }, + { + "epoch": 0.317485754985755, + "grad_norm": 0.47817620635032654, + "learning_rate": 0.00019697051533539134, + "loss": 1.3014, + "step": 1783 + }, + { + "epoch": 0.3176638176638177, + "grad_norm": 0.538488507270813, + "learning_rate": 0.00019696709510593502, + "loss": 1.0354, + "step": 1784 + }, + { + "epoch": 0.3178418803418803, + "grad_norm": 0.5141439437866211, + "learning_rate": 0.0001969636729766099, + "loss": 1.2912, + "step": 1785 + }, + { + "epoch": 0.318019943019943, + "grad_norm": 0.5009665489196777, + "learning_rate": 0.00019696024894748306, + "loss": 0.9014, + "step": 1786 + }, + { + "epoch": 0.3181980056980057, + "grad_norm": 0.46199744939804077, + "learning_rate": 0.00019695682301862155, + "loss": 1.0532, + "step": 1787 + }, + { + "epoch": 0.31837606837606836, + "grad_norm": 0.4649423062801361, + "learning_rate": 0.0001969533951900925, + "loss": 0.8608, + "step": 1788 + }, + { + "epoch": 0.31855413105413105, + "grad_norm": 0.516909658908844, + "learning_rate": 0.0001969499654619631, + "loss": 1.1385, + "step": 1789 + }, + { + "epoch": 0.31873219373219375, + "grad_norm": 0.46016669273376465, + "learning_rate": 0.00019694653383430048, + "loss": 0.9168, + "step": 1790 + }, + { + "epoch": 0.3189102564102564, + "grad_norm": 0.4794938564300537, + "learning_rate": 0.00019694310030717193, + "loss": 1.0244, + "step": 1791 + }, + { + "epoch": 0.3190883190883191, + "grad_norm": 0.46577662229537964, + "learning_rate": 0.00019693966488064471, + "loss": 
1.0954, + "step": 1792 + }, + { + "epoch": 0.3192663817663818, + "grad_norm": 0.4866746962070465, + "learning_rate": 0.00019693622755478614, + "loss": 1.2925, + "step": 1793 + }, + { + "epoch": 0.3194444444444444, + "grad_norm": 0.4841702878475189, + "learning_rate": 0.00019693278832966357, + "loss": 1.119, + "step": 1794 + }, + { + "epoch": 0.3196225071225071, + "grad_norm": 0.4835243821144104, + "learning_rate": 0.00019692934720534435, + "loss": 1.1702, + "step": 1795 + }, + { + "epoch": 0.3198005698005698, + "grad_norm": 0.5200608968734741, + "learning_rate": 0.00019692590418189594, + "loss": 1.1989, + "step": 1796 + }, + { + "epoch": 0.31997863247863245, + "grad_norm": 0.5147821307182312, + "learning_rate": 0.00019692245925938577, + "loss": 1.1417, + "step": 1797 + }, + { + "epoch": 0.32015669515669515, + "grad_norm": 0.5145614743232727, + "learning_rate": 0.00019691901243788136, + "loss": 1.0571, + "step": 1798 + }, + { + "epoch": 0.32033475783475784, + "grad_norm": 0.5416026711463928, + "learning_rate": 0.00019691556371745022, + "loss": 1.188, + "step": 1799 + }, + { + "epoch": 0.32051282051282054, + "grad_norm": 0.5140644311904907, + "learning_rate": 0.00019691211309815995, + "loss": 1.1795, + "step": 1800 + }, + { + "epoch": 0.3206908831908832, + "grad_norm": 0.44219106435775757, + "learning_rate": 0.00019690866058007817, + "loss": 0.9215, + "step": 1801 + }, + { + "epoch": 0.3208689458689459, + "grad_norm": 0.49523603916168213, + "learning_rate": 0.00019690520616327245, + "loss": 1.1117, + "step": 1802 + }, + { + "epoch": 0.32104700854700857, + "grad_norm": 0.5818293690681458, + "learning_rate": 0.0001969017498478105, + "loss": 1.16, + "step": 1803 + }, + { + "epoch": 0.3212250712250712, + "grad_norm": 0.5175749659538269, + "learning_rate": 0.0001968982916337601, + "loss": 1.1999, + "step": 1804 + }, + { + "epoch": 0.3214031339031339, + "grad_norm": 0.49916017055511475, + "learning_rate": 0.00019689483152118898, + "loss": 0.9505, + "step": 1805 + }, + { + 
"epoch": 0.3215811965811966, + "grad_norm": 0.46849536895751953, + "learning_rate": 0.00019689136951016488, + "loss": 0.9627, + "step": 1806 + }, + { + "epoch": 0.32175925925925924, + "grad_norm": 0.4226818382740021, + "learning_rate": 0.00019688790560075568, + "loss": 1.037, + "step": 1807 + }, + { + "epoch": 0.32193732193732194, + "grad_norm": 0.4697103798389435, + "learning_rate": 0.00019688443979302923, + "loss": 1.1431, + "step": 1808 + }, + { + "epoch": 0.32211538461538464, + "grad_norm": 0.4999365508556366, + "learning_rate": 0.00019688097208705343, + "loss": 1.171, + "step": 1809 + }, + { + "epoch": 0.3222934472934473, + "grad_norm": 0.5229731798171997, + "learning_rate": 0.00019687750248289625, + "loss": 1.3395, + "step": 1810 + }, + { + "epoch": 0.32247150997151, + "grad_norm": 0.512525737285614, + "learning_rate": 0.00019687403098062566, + "loss": 1.1438, + "step": 1811 + }, + { + "epoch": 0.32264957264957267, + "grad_norm": 0.4558548927307129, + "learning_rate": 0.00019687055758030967, + "loss": 1.0012, + "step": 1812 + }, + { + "epoch": 0.3228276353276353, + "grad_norm": 0.45195743441581726, + "learning_rate": 0.00019686708228201636, + "loss": 1.0222, + "step": 1813 + }, + { + "epoch": 0.323005698005698, + "grad_norm": 0.5023126602172852, + "learning_rate": 0.00019686360508581373, + "loss": 1.2128, + "step": 1814 + }, + { + "epoch": 0.3231837606837607, + "grad_norm": 0.46516045928001404, + "learning_rate": 0.00019686012599177003, + "loss": 0.989, + "step": 1815 + }, + { + "epoch": 0.32336182336182334, + "grad_norm": 0.4142672121524811, + "learning_rate": 0.00019685664499995338, + "loss": 1.0144, + "step": 1816 + }, + { + "epoch": 0.32353988603988604, + "grad_norm": 0.4511009752750397, + "learning_rate": 0.0001968531621104319, + "loss": 0.885, + "step": 1817 + }, + { + "epoch": 0.32371794871794873, + "grad_norm": 0.49583545327186584, + "learning_rate": 0.00019684967732327396, + "loss": 1.0986, + "step": 1818 + }, + { + "epoch": 0.3238960113960114, + 
"grad_norm": 0.5872161388397217, + "learning_rate": 0.0001968461906385478, + "loss": 1.1482, + "step": 1819 + }, + { + "epoch": 0.32407407407407407, + "grad_norm": 0.4509563148021698, + "learning_rate": 0.00019684270205632168, + "loss": 1.0578, + "step": 1820 + }, + { + "epoch": 0.32425213675213677, + "grad_norm": 0.501345157623291, + "learning_rate": 0.00019683921157666402, + "loss": 1.1792, + "step": 1821 + }, + { + "epoch": 0.3244301994301994, + "grad_norm": 0.48257577419281006, + "learning_rate": 0.00019683571919964314, + "loss": 1.0448, + "step": 1822 + }, + { + "epoch": 0.3246082621082621, + "grad_norm": 0.5399422645568848, + "learning_rate": 0.00019683222492532752, + "loss": 1.0579, + "step": 1823 + }, + { + "epoch": 0.3247863247863248, + "grad_norm": 0.4382506012916565, + "learning_rate": 0.0001968287287537856, + "loss": 1.0246, + "step": 1824 + }, + { + "epoch": 0.32496438746438744, + "grad_norm": 0.49247491359710693, + "learning_rate": 0.00019682523068508586, + "loss": 1.318, + "step": 1825 + }, + { + "epoch": 0.32514245014245013, + "grad_norm": 0.49067625403404236, + "learning_rate": 0.0001968217307192969, + "loss": 1.1028, + "step": 1826 + }, + { + "epoch": 0.32532051282051283, + "grad_norm": 0.4832286238670349, + "learning_rate": 0.00019681822885648723, + "loss": 1.0996, + "step": 1827 + }, + { + "epoch": 0.32549857549857547, + "grad_norm": 0.47144386172294617, + "learning_rate": 0.0001968147250967255, + "loss": 1.0707, + "step": 1828 + }, + { + "epoch": 0.32567663817663817, + "grad_norm": 0.46299225091934204, + "learning_rate": 0.0001968112194400803, + "loss": 1.0461, + "step": 1829 + }, + { + "epoch": 0.32585470085470086, + "grad_norm": 0.4880816340446472, + "learning_rate": 0.00019680771188662044, + "loss": 1.1198, + "step": 1830 + }, + { + "epoch": 0.32603276353276356, + "grad_norm": 0.43837276101112366, + "learning_rate": 0.00019680420243641452, + "loss": 1.0599, + "step": 1831 + }, + { + "epoch": 0.3262108262108262, + "grad_norm": 
0.453168660402298, + "learning_rate": 0.0001968006910895314, + "loss": 1.0327, + "step": 1832 + }, + { + "epoch": 0.3263888888888889, + "grad_norm": 0.45183828473091125, + "learning_rate": 0.00019679717784603975, + "loss": 1.1381, + "step": 1833 + }, + { + "epoch": 0.3265669515669516, + "grad_norm": 0.5326765775680542, + "learning_rate": 0.00019679366270600852, + "loss": 1.3169, + "step": 1834 + }, + { + "epoch": 0.32674501424501423, + "grad_norm": 0.47468429803848267, + "learning_rate": 0.00019679014566950653, + "loss": 1.1816, + "step": 1835 + }, + { + "epoch": 0.3269230769230769, + "grad_norm": 0.5096879005432129, + "learning_rate": 0.0001967866267366027, + "loss": 1.1162, + "step": 1836 + }, + { + "epoch": 0.3271011396011396, + "grad_norm": 0.491514652967453, + "learning_rate": 0.00019678310590736598, + "loss": 1.2793, + "step": 1837 + }, + { + "epoch": 0.32727920227920226, + "grad_norm": 0.601439356803894, + "learning_rate": 0.00019677958318186533, + "loss": 0.9851, + "step": 1838 + }, + { + "epoch": 0.32745726495726496, + "grad_norm": 0.45270970463752747, + "learning_rate": 0.0001967760585601698, + "loss": 1.0042, + "step": 1839 + }, + { + "epoch": 0.32763532763532766, + "grad_norm": 0.48864325881004333, + "learning_rate": 0.00019677253204234847, + "loss": 1.0835, + "step": 1840 + }, + { + "epoch": 0.3278133903133903, + "grad_norm": 0.5855685472488403, + "learning_rate": 0.00019676900362847037, + "loss": 1.193, + "step": 1841 + }, + { + "epoch": 0.327991452991453, + "grad_norm": 0.7181013822555542, + "learning_rate": 0.00019676547331860466, + "loss": 1.2028, + "step": 1842 + }, + { + "epoch": 0.3281695156695157, + "grad_norm": 0.4517378807067871, + "learning_rate": 0.00019676194111282054, + "loss": 1.013, + "step": 1843 + }, + { + "epoch": 0.32834757834757833, + "grad_norm": 0.5477756857872009, + "learning_rate": 0.00019675840701118718, + "loss": 1.2311, + "step": 1844 + }, + { + "epoch": 0.328525641025641, + "grad_norm": 0.5194997191429138, + 
"learning_rate": 0.00019675487101377382, + "loss": 1.0953, + "step": 1845 + }, + { + "epoch": 0.3287037037037037, + "grad_norm": 0.44454067945480347, + "learning_rate": 0.00019675133312064977, + "loss": 0.8505, + "step": 1846 + }, + { + "epoch": 0.32888176638176636, + "grad_norm": 0.3938713073730469, + "learning_rate": 0.00019674779333188428, + "loss": 0.8525, + "step": 1847 + }, + { + "epoch": 0.32905982905982906, + "grad_norm": 0.4927884340286255, + "learning_rate": 0.00019674425164754682, + "loss": 1.2477, + "step": 1848 + }, + { + "epoch": 0.32923789173789175, + "grad_norm": 0.4516635239124298, + "learning_rate": 0.0001967407080677067, + "loss": 0.8333, + "step": 1849 + }, + { + "epoch": 0.3294159544159544, + "grad_norm": 0.47105780243873596, + "learning_rate": 0.00019673716259243336, + "loss": 1.0989, + "step": 1850 + }, + { + "epoch": 0.3295940170940171, + "grad_norm": 0.5192127823829651, + "learning_rate": 0.00019673361522179627, + "loss": 1.1164, + "step": 1851 + }, + { + "epoch": 0.3297720797720798, + "grad_norm": 0.5222696661949158, + "learning_rate": 0.00019673006595586495, + "loss": 1.3191, + "step": 1852 + }, + { + "epoch": 0.3299501424501424, + "grad_norm": 0.6046679019927979, + "learning_rate": 0.0001967265147947089, + "loss": 0.9782, + "step": 1853 + }, + { + "epoch": 0.3301282051282051, + "grad_norm": 0.47928622364997864, + "learning_rate": 0.00019672296173839775, + "loss": 1.2247, + "step": 1854 + }, + { + "epoch": 0.3303062678062678, + "grad_norm": 0.5435982346534729, + "learning_rate": 0.00019671940678700107, + "loss": 1.1647, + "step": 1855 + }, + { + "epoch": 0.33048433048433046, + "grad_norm": 0.46878984570503235, + "learning_rate": 0.00019671584994058856, + "loss": 1.132, + "step": 1856 + }, + { + "epoch": 0.33066239316239315, + "grad_norm": 0.5336877107620239, + "learning_rate": 0.00019671229119922986, + "loss": 1.0583, + "step": 1857 + }, + { + "epoch": 0.33084045584045585, + "grad_norm": 0.4811093807220459, + "learning_rate": 
0.0001967087305629947, + "loss": 1.0089, + "step": 1858 + }, + { + "epoch": 0.33101851851851855, + "grad_norm": 0.5140184760093689, + "learning_rate": 0.0001967051680319529, + "loss": 1.2335, + "step": 1859 + }, + { + "epoch": 0.3311965811965812, + "grad_norm": 0.5855883955955505, + "learning_rate": 0.00019670160360617418, + "loss": 1.1107, + "step": 1860 + }, + { + "epoch": 0.3313746438746439, + "grad_norm": 0.5081531405448914, + "learning_rate": 0.00019669803728572844, + "loss": 1.0669, + "step": 1861 + }, + { + "epoch": 0.3315527065527066, + "grad_norm": 0.48749417066574097, + "learning_rate": 0.0001966944690706855, + "loss": 1.1465, + "step": 1862 + }, + { + "epoch": 0.3317307692307692, + "grad_norm": 0.5175687670707703, + "learning_rate": 0.00019669089896111536, + "loss": 1.254, + "step": 1863 + }, + { + "epoch": 0.3319088319088319, + "grad_norm": 0.4198860824108124, + "learning_rate": 0.0001966873269570879, + "loss": 0.9811, + "step": 1864 + }, + { + "epoch": 0.3320868945868946, + "grad_norm": 0.5220273733139038, + "learning_rate": 0.0001966837530586731, + "loss": 1.277, + "step": 1865 + }, + { + "epoch": 0.33226495726495725, + "grad_norm": 0.551954448223114, + "learning_rate": 0.00019668017726594101, + "loss": 1.0627, + "step": 1866 + }, + { + "epoch": 0.33244301994301995, + "grad_norm": 0.5289301872253418, + "learning_rate": 0.00019667659957896166, + "loss": 1.4525, + "step": 1867 + }, + { + "epoch": 0.33262108262108264, + "grad_norm": 0.5190161466598511, + "learning_rate": 0.00019667301999780522, + "loss": 1.1064, + "step": 1868 + }, + { + "epoch": 0.3327991452991453, + "grad_norm": 0.437637060880661, + "learning_rate": 0.00019666943852254172, + "loss": 1.1304, + "step": 1869 + }, + { + "epoch": 0.332977207977208, + "grad_norm": 0.4801286458969116, + "learning_rate": 0.00019666585515324138, + "loss": 1.032, + "step": 1870 + }, + { + "epoch": 0.3331552706552707, + "grad_norm": 0.5041908621788025, + "learning_rate": 0.00019666226988997445, + "loss": 1.2611, 
+ "step": 1871 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.4529375731945038, + "learning_rate": 0.00019665868273281115, + "loss": 1.1346, + "step": 1872 + }, + { + "epoch": 0.333511396011396, + "grad_norm": 0.4797019064426422, + "learning_rate": 0.00019665509368182172, + "loss": 1.1716, + "step": 1873 + }, + { + "epoch": 0.3336894586894587, + "grad_norm": 0.5505055785179138, + "learning_rate": 0.00019665150273707652, + "loss": 0.9729, + "step": 1874 + }, + { + "epoch": 0.33386752136752135, + "grad_norm": 0.4228051006793976, + "learning_rate": 0.00019664790989864592, + "loss": 0.9023, + "step": 1875 + }, + { + "epoch": 0.33404558404558404, + "grad_norm": 0.4926959276199341, + "learning_rate": 0.00019664431516660028, + "loss": 1.0999, + "step": 1876 + }, + { + "epoch": 0.33422364672364674, + "grad_norm": 0.4273219704627991, + "learning_rate": 0.00019664071854101005, + "loss": 1.1039, + "step": 1877 + }, + { + "epoch": 0.3344017094017094, + "grad_norm": 0.48438936471939087, + "learning_rate": 0.00019663712002194566, + "loss": 1.1308, + "step": 1878 + }, + { + "epoch": 0.3345797720797721, + "grad_norm": 0.5102053284645081, + "learning_rate": 0.0001966335196094777, + "loss": 1.0618, + "step": 1879 + }, + { + "epoch": 0.33475783475783477, + "grad_norm": 0.4357300400733948, + "learning_rate": 0.00019662991730367663, + "loss": 1.0521, + "step": 1880 + }, + { + "epoch": 0.3349358974358974, + "grad_norm": 0.5052695870399475, + "learning_rate": 0.00019662631310461308, + "loss": 0.9579, + "step": 1881 + }, + { + "epoch": 0.3351139601139601, + "grad_norm": 0.4889117181301117, + "learning_rate": 0.00019662270701235762, + "loss": 1.0304, + "step": 1882 + }, + { + "epoch": 0.3352920227920228, + "grad_norm": 0.4671195149421692, + "learning_rate": 0.000196619099026981, + "loss": 1.2228, + "step": 1883 + }, + { + "epoch": 0.33547008547008544, + "grad_norm": 0.4700174331665039, + "learning_rate": 0.0001966154891485538, + "loss": 0.9634, + "step": 1884 + }, + { + "epoch": 
0.33564814814814814, + "grad_norm": 0.488817423582077, + "learning_rate": 0.00019661187737714676, + "loss": 1.2499, + "step": 1885 + }, + { + "epoch": 0.33582621082621084, + "grad_norm": 0.5336169600486755, + "learning_rate": 0.00019660826371283073, + "loss": 1.251, + "step": 1886 + }, + { + "epoch": 0.33600427350427353, + "grad_norm": 0.5054540038108826, + "learning_rate": 0.00019660464815567642, + "loss": 1.221, + "step": 1887 + }, + { + "epoch": 0.33618233618233617, + "grad_norm": 0.5078747868537903, + "learning_rate": 0.00019660103070575472, + "loss": 0.9792, + "step": 1888 + }, + { + "epoch": 0.33636039886039887, + "grad_norm": 0.498571515083313, + "learning_rate": 0.0001965974113631365, + "loss": 1.1682, + "step": 1889 + }, + { + "epoch": 0.33653846153846156, + "grad_norm": 0.49969518184661865, + "learning_rate": 0.00019659379012789264, + "loss": 1.0012, + "step": 1890 + }, + { + "epoch": 0.3367165242165242, + "grad_norm": 0.4238094687461853, + "learning_rate": 0.00019659016700009416, + "loss": 1.0455, + "step": 1891 + }, + { + "epoch": 0.3368945868945869, + "grad_norm": 0.5139104723930359, + "learning_rate": 0.000196586541979812, + "loss": 0.9979, + "step": 1892 + }, + { + "epoch": 0.3370726495726496, + "grad_norm": 0.5446547269821167, + "learning_rate": 0.00019658291506711715, + "loss": 0.9271, + "step": 1893 + }, + { + "epoch": 0.33725071225071224, + "grad_norm": 0.5284572839736938, + "learning_rate": 0.00019657928626208077, + "loss": 1.0356, + "step": 1894 + }, + { + "epoch": 0.33742877492877493, + "grad_norm": 0.49936217069625854, + "learning_rate": 0.00019657565556477387, + "loss": 0.9785, + "step": 1895 + }, + { + "epoch": 0.33760683760683763, + "grad_norm": 0.4678729772567749, + "learning_rate": 0.00019657202297526763, + "loss": 1.2135, + "step": 1896 + }, + { + "epoch": 0.33778490028490027, + "grad_norm": 0.46844249963760376, + "learning_rate": 0.0001965683884936332, + "loss": 0.9369, + "step": 1897 + }, + { + "epoch": 0.33796296296296297, + 
"grad_norm": 0.4307389557361603, + "learning_rate": 0.0001965647521199418, + "loss": 0.9301, + "step": 1898 + }, + { + "epoch": 0.33814102564102566, + "grad_norm": 0.48227834701538086, + "learning_rate": 0.00019656111385426468, + "loss": 1.3169, + "step": 1899 + }, + { + "epoch": 0.3383190883190883, + "grad_norm": 0.45860713720321655, + "learning_rate": 0.00019655747369667315, + "loss": 0.9835, + "step": 1900 + }, + { + "epoch": 0.338497150997151, + "grad_norm": 0.5522414445877075, + "learning_rate": 0.00019655383164723846, + "loss": 1.363, + "step": 1901 + }, + { + "epoch": 0.3386752136752137, + "grad_norm": 0.5283710360527039, + "learning_rate": 0.000196550187706032, + "loss": 1.1499, + "step": 1902 + }, + { + "epoch": 0.33885327635327633, + "grad_norm": 0.4419134259223938, + "learning_rate": 0.00019654654187312525, + "loss": 1.2039, + "step": 1903 + }, + { + "epoch": 0.33903133903133903, + "grad_norm": 0.49066096544265747, + "learning_rate": 0.00019654289414858952, + "loss": 0.9707, + "step": 1904 + }, + { + "epoch": 0.3392094017094017, + "grad_norm": 0.4619338810443878, + "learning_rate": 0.00019653924453249633, + "loss": 1.0849, + "step": 1905 + }, + { + "epoch": 0.33938746438746437, + "grad_norm": 0.5191119313240051, + "learning_rate": 0.0001965355930249172, + "loss": 1.1387, + "step": 1906 + }, + { + "epoch": 0.33956552706552706, + "grad_norm": 0.5245711207389832, + "learning_rate": 0.00019653193962592368, + "loss": 1.3435, + "step": 1907 + }, + { + "epoch": 0.33974358974358976, + "grad_norm": 0.49562904238700867, + "learning_rate": 0.0001965282843355873, + "loss": 1.2781, + "step": 1908 + }, + { + "epoch": 0.3399216524216524, + "grad_norm": 0.4661353826522827, + "learning_rate": 0.0001965246271539797, + "loss": 0.9317, + "step": 1909 + }, + { + "epoch": 0.3400997150997151, + "grad_norm": 0.4723222851753235, + "learning_rate": 0.00019652096808117254, + "loss": 1.0733, + "step": 1910 + }, + { + "epoch": 0.3402777777777778, + "grad_norm": 0.4358505308628082, + 
"learning_rate": 0.00019651730711723754, + "loss": 1.1461, + "step": 1911 + }, + { + "epoch": 0.34045584045584043, + "grad_norm": 0.462422251701355, + "learning_rate": 0.00019651364426224638, + "loss": 1.0914, + "step": 1912 + }, + { + "epoch": 0.3406339031339031, + "grad_norm": 0.47952914237976074, + "learning_rate": 0.0001965099795162709, + "loss": 1.0392, + "step": 1913 + }, + { + "epoch": 0.3408119658119658, + "grad_norm": 0.5036373734474182, + "learning_rate": 0.00019650631287938282, + "loss": 1.4002, + "step": 1914 + }, + { + "epoch": 0.34099002849002846, + "grad_norm": 0.5130090713500977, + "learning_rate": 0.000196502644351654, + "loss": 1.3499, + "step": 1915 + }, + { + "epoch": 0.34116809116809116, + "grad_norm": 0.4426332414150238, + "learning_rate": 0.00019649897393315635, + "loss": 1.0726, + "step": 1916 + }, + { + "epoch": 0.34134615384615385, + "grad_norm": 0.5580727458000183, + "learning_rate": 0.00019649530162396176, + "loss": 1.1164, + "step": 1917 + }, + { + "epoch": 0.34152421652421655, + "grad_norm": 0.545001745223999, + "learning_rate": 0.00019649162742414218, + "loss": 0.962, + "step": 1918 + }, + { + "epoch": 0.3417022792022792, + "grad_norm": 0.5225808024406433, + "learning_rate": 0.00019648795133376962, + "loss": 1.1415, + "step": 1919 + }, + { + "epoch": 0.3418803418803419, + "grad_norm": 0.48210129141807556, + "learning_rate": 0.0001964842733529161, + "loss": 1.1188, + "step": 1920 + }, + { + "epoch": 0.3420584045584046, + "grad_norm": 0.4515395164489746, + "learning_rate": 0.00019648059348165365, + "loss": 1.0828, + "step": 1921 + }, + { + "epoch": 0.3422364672364672, + "grad_norm": 0.5802633166313171, + "learning_rate": 0.0001964769117200544, + "loss": 1.3137, + "step": 1922 + }, + { + "epoch": 0.3424145299145299, + "grad_norm": 0.4432032108306885, + "learning_rate": 0.00019647322806819046, + "loss": 1.0523, + "step": 1923 + }, + { + "epoch": 0.3425925925925926, + "grad_norm": 0.4697614908218384, + "learning_rate": 
0.00019646954252613402, + "loss": 0.8426, + "step": 1924 + }, + { + "epoch": 0.34277065527065526, + "grad_norm": 0.4610968232154846, + "learning_rate": 0.0001964658550939573, + "loss": 0.9826, + "step": 1925 + }, + { + "epoch": 0.34294871794871795, + "grad_norm": 0.5278257727622986, + "learning_rate": 0.00019646216577173258, + "loss": 1.1064, + "step": 1926 + }, + { + "epoch": 0.34312678062678065, + "grad_norm": 0.5686144232749939, + "learning_rate": 0.00019645847455953205, + "loss": 0.9138, + "step": 1927 + }, + { + "epoch": 0.3433048433048433, + "grad_norm": 0.42894792556762695, + "learning_rate": 0.0001964547814574281, + "loss": 1.0461, + "step": 1928 + }, + { + "epoch": 0.343482905982906, + "grad_norm": 0.5567317605018616, + "learning_rate": 0.0001964510864654931, + "loss": 0.8787, + "step": 1929 + }, + { + "epoch": 0.3436609686609687, + "grad_norm": 0.5015586614608765, + "learning_rate": 0.0001964473895837994, + "loss": 1.1406, + "step": 1930 + }, + { + "epoch": 0.3438390313390313, + "grad_norm": 0.47391530871391296, + "learning_rate": 0.00019644369081241948, + "loss": 1.0685, + "step": 1931 + }, + { + "epoch": 0.344017094017094, + "grad_norm": 0.546037495136261, + "learning_rate": 0.00019643999015142574, + "loss": 1.2349, + "step": 1932 + }, + { + "epoch": 0.3441951566951567, + "grad_norm": 0.4724953770637512, + "learning_rate": 0.00019643628760089078, + "loss": 1.0621, + "step": 1933 + }, + { + "epoch": 0.34437321937321935, + "grad_norm": 0.5644593834877014, + "learning_rate": 0.00019643258316088703, + "loss": 1.2559, + "step": 1934 + }, + { + "epoch": 0.34455128205128205, + "grad_norm": 0.500815749168396, + "learning_rate": 0.00019642887683148718, + "loss": 1.0439, + "step": 1935 + }, + { + "epoch": 0.34472934472934474, + "grad_norm": 0.4932316541671753, + "learning_rate": 0.0001964251686127638, + "loss": 1.0404, + "step": 1936 + }, + { + "epoch": 0.3449074074074074, + "grad_norm": 0.48494651913642883, + "learning_rate": 0.00019642145850478954, + "loss": 
0.9951, + "step": 1937 + }, + { + "epoch": 0.3450854700854701, + "grad_norm": 0.5191963315010071, + "learning_rate": 0.00019641774650763706, + "loss": 1.1258, + "step": 1938 + }, + { + "epoch": 0.3452635327635328, + "grad_norm": 0.4439312815666199, + "learning_rate": 0.00019641403262137918, + "loss": 1.1158, + "step": 1939 + }, + { + "epoch": 0.3454415954415954, + "grad_norm": 0.4829137921333313, + "learning_rate": 0.0001964103168460886, + "loss": 1.0531, + "step": 1940 + }, + { + "epoch": 0.3456196581196581, + "grad_norm": 0.49433329701423645, + "learning_rate": 0.00019640659918183811, + "loss": 1.1295, + "step": 1941 + }, + { + "epoch": 0.3457977207977208, + "grad_norm": 0.5351347923278809, + "learning_rate": 0.00019640287962870062, + "loss": 1.2379, + "step": 1942 + }, + { + "epoch": 0.34597578347578345, + "grad_norm": 0.4845680892467499, + "learning_rate": 0.00019639915818674895, + "loss": 1.0197, + "step": 1943 + }, + { + "epoch": 0.34615384615384615, + "grad_norm": 0.5312514901161194, + "learning_rate": 0.00019639543485605604, + "loss": 0.9734, + "step": 1944 + }, + { + "epoch": 0.34633190883190884, + "grad_norm": 0.4571874737739563, + "learning_rate": 0.00019639170963669478, + "loss": 1.1012, + "step": 1945 + }, + { + "epoch": 0.34650997150997154, + "grad_norm": 0.4449031949043274, + "learning_rate": 0.00019638798252873824, + "loss": 1.1393, + "step": 1946 + }, + { + "epoch": 0.3466880341880342, + "grad_norm": 0.47470834851264954, + "learning_rate": 0.0001963842535322594, + "loss": 0.981, + "step": 1947 + }, + { + "epoch": 0.3468660968660969, + "grad_norm": 0.5386981964111328, + "learning_rate": 0.00019638052264733132, + "loss": 1.1247, + "step": 1948 + }, + { + "epoch": 0.34704415954415957, + "grad_norm": 0.535589873790741, + "learning_rate": 0.00019637678987402714, + "loss": 1.3157, + "step": 1949 + }, + { + "epoch": 0.3472222222222222, + "grad_norm": 0.49338245391845703, + "learning_rate": 0.00019637305521242, + "loss": 1.1066, + "step": 1950 + }, + { + 
"epoch": 0.3474002849002849, + "grad_norm": 0.4247688353061676, + "learning_rate": 0.00019636931866258298, + "loss": 1.0039, + "step": 1951 + }, + { + "epoch": 0.3475783475783476, + "grad_norm": 0.5351517200469971, + "learning_rate": 0.00019636558022458934, + "loss": 1.0344, + "step": 1952 + }, + { + "epoch": 0.34775641025641024, + "grad_norm": 0.4633362889289856, + "learning_rate": 0.00019636183989851238, + "loss": 1.1383, + "step": 1953 + }, + { + "epoch": 0.34793447293447294, + "grad_norm": 0.553709089756012, + "learning_rate": 0.00019635809768442535, + "loss": 1.0389, + "step": 1954 + }, + { + "epoch": 0.34811253561253563, + "grad_norm": 0.479374498128891, + "learning_rate": 0.00019635435358240154, + "loss": 1.1774, + "step": 1955 + }, + { + "epoch": 0.3482905982905983, + "grad_norm": 0.5274081230163574, + "learning_rate": 0.0001963506075925143, + "loss": 1.1809, + "step": 1956 + }, + { + "epoch": 0.34846866096866097, + "grad_norm": 0.45398542284965515, + "learning_rate": 0.0001963468597148371, + "loss": 1.0502, + "step": 1957 + }, + { + "epoch": 0.34864672364672367, + "grad_norm": 0.48201611638069153, + "learning_rate": 0.00019634310994944332, + "loss": 1.0557, + "step": 1958 + }, + { + "epoch": 0.3488247863247863, + "grad_norm": 0.6407544016838074, + "learning_rate": 0.00019633935829640642, + "loss": 1.2138, + "step": 1959 + }, + { + "epoch": 0.349002849002849, + "grad_norm": 0.5385687351226807, + "learning_rate": 0.00019633560475579995, + "loss": 1.3496, + "step": 1960 + }, + { + "epoch": 0.3491809116809117, + "grad_norm": 0.5260964035987854, + "learning_rate": 0.0001963318493276974, + "loss": 1.0253, + "step": 1961 + }, + { + "epoch": 0.34935897435897434, + "grad_norm": 0.48478585481643677, + "learning_rate": 0.00019632809201217238, + "loss": 1.137, + "step": 1962 + }, + { + "epoch": 0.34953703703703703, + "grad_norm": 0.620033860206604, + "learning_rate": 0.0001963243328092985, + "loss": 1.3445, + "step": 1963 + }, + { + "epoch": 0.34971509971509973, + 
"grad_norm": 0.5149700045585632, + "learning_rate": 0.00019632057171914942, + "loss": 1.1042, + "step": 1964 + }, + { + "epoch": 0.34989316239316237, + "grad_norm": 0.42695048451423645, + "learning_rate": 0.0001963168087417988, + "loss": 0.8789, + "step": 1965 + }, + { + "epoch": 0.35007122507122507, + "grad_norm": 0.5281283855438232, + "learning_rate": 0.00019631304387732044, + "loss": 1.1155, + "step": 1966 + }, + { + "epoch": 0.35024928774928776, + "grad_norm": 0.4994089901447296, + "learning_rate": 0.00019630927712578804, + "loss": 1.1226, + "step": 1967 + }, + { + "epoch": 0.3504273504273504, + "grad_norm": 0.4433288276195526, + "learning_rate": 0.0001963055084872754, + "loss": 1.0262, + "step": 1968 + }, + { + "epoch": 0.3506054131054131, + "grad_norm": 0.46541857719421387, + "learning_rate": 0.0001963017379618564, + "loss": 1.1438, + "step": 1969 + }, + { + "epoch": 0.3507834757834758, + "grad_norm": 0.5097604393959045, + "learning_rate": 0.00019629796554960488, + "loss": 0.9641, + "step": 1970 + }, + { + "epoch": 0.35096153846153844, + "grad_norm": 0.49461981654167175, + "learning_rate": 0.00019629419125059478, + "loss": 1.1765, + "step": 1971 + }, + { + "epoch": 0.35113960113960113, + "grad_norm": 0.4763339161872864, + "learning_rate": 0.00019629041506490005, + "loss": 1.0527, + "step": 1972 + }, + { + "epoch": 0.35131766381766383, + "grad_norm": 0.4528443217277527, + "learning_rate": 0.00019628663699259463, + "loss": 1.1409, + "step": 1973 + }, + { + "epoch": 0.35149572649572647, + "grad_norm": 0.4436309039592743, + "learning_rate": 0.00019628285703375258, + "loss": 1.0459, + "step": 1974 + }, + { + "epoch": 0.35167378917378916, + "grad_norm": 0.5146129727363586, + "learning_rate": 0.00019627907518844797, + "loss": 1.2527, + "step": 1975 + }, + { + "epoch": 0.35185185185185186, + "grad_norm": 0.5202171802520752, + "learning_rate": 0.0001962752914567549, + "loss": 1.226, + "step": 1976 + }, + { + "epoch": 0.35202991452991456, + "grad_norm": 
0.5267411470413208, + "learning_rate": 0.00019627150583874747, + "loss": 1.0898, + "step": 1977 + }, + { + "epoch": 0.3522079772079772, + "grad_norm": 0.546840250492096, + "learning_rate": 0.00019626771833449987, + "loss": 1.1716, + "step": 1978 + }, + { + "epoch": 0.3523860398860399, + "grad_norm": 0.5525290966033936, + "learning_rate": 0.0001962639289440863, + "loss": 1.1762, + "step": 1979 + }, + { + "epoch": 0.3525641025641026, + "grad_norm": 0.48967215418815613, + "learning_rate": 0.000196260137667581, + "loss": 1.1884, + "step": 1980 + }, + { + "epoch": 0.35274216524216523, + "grad_norm": 0.5908235907554626, + "learning_rate": 0.0001962563445050583, + "loss": 1.1887, + "step": 1981 + }, + { + "epoch": 0.3529202279202279, + "grad_norm": 0.46708086133003235, + "learning_rate": 0.00019625254945659245, + "loss": 0.8842, + "step": 1982 + }, + { + "epoch": 0.3530982905982906, + "grad_norm": 0.41652458906173706, + "learning_rate": 0.00019624875252225788, + "loss": 1.0268, + "step": 1983 + }, + { + "epoch": 0.35327635327635326, + "grad_norm": 0.5084529519081116, + "learning_rate": 0.00019624495370212892, + "loss": 1.0547, + "step": 1984 + }, + { + "epoch": 0.35345441595441596, + "grad_norm": 0.5667507648468018, + "learning_rate": 0.00019624115299628003, + "loss": 1.0656, + "step": 1985 + }, + { + "epoch": 0.35363247863247865, + "grad_norm": 0.5022873282432556, + "learning_rate": 0.00019623735040478568, + "loss": 1.0627, + "step": 1986 + }, + { + "epoch": 0.3538105413105413, + "grad_norm": 0.48342058062553406, + "learning_rate": 0.00019623354592772035, + "loss": 1.0976, + "step": 1987 + }, + { + "epoch": 0.353988603988604, + "grad_norm": 0.48117366433143616, + "learning_rate": 0.0001962297395651586, + "loss": 1.0515, + "step": 1988 + }, + { + "epoch": 0.3541666666666667, + "grad_norm": 0.492564857006073, + "learning_rate": 0.000196225931317175, + "loss": 1.1957, + "step": 1989 + }, + { + "epoch": 0.3543447293447293, + "grad_norm": 0.4756208658218384, + 
"learning_rate": 0.00019622212118384417, + "loss": 1.007, + "step": 1990 + }, + { + "epoch": 0.354522792022792, + "grad_norm": 0.581930935382843, + "learning_rate": 0.00019621830916524076, + "loss": 1.232, + "step": 1991 + }, + { + "epoch": 0.3547008547008547, + "grad_norm": 0.480064332485199, + "learning_rate": 0.00019621449526143947, + "loss": 1.2693, + "step": 1992 + }, + { + "epoch": 0.35487891737891736, + "grad_norm": 0.5679123401641846, + "learning_rate": 0.000196210679472515, + "loss": 1.2985, + "step": 1993 + }, + { + "epoch": 0.35505698005698005, + "grad_norm": 0.43757280707359314, + "learning_rate": 0.00019620686179854213, + "loss": 1.1387, + "step": 1994 + }, + { + "epoch": 0.35523504273504275, + "grad_norm": 0.4950634837150574, + "learning_rate": 0.00019620304223959566, + "loss": 1.1809, + "step": 1995 + }, + { + "epoch": 0.3554131054131054, + "grad_norm": 0.5574113726615906, + "learning_rate": 0.00019619922079575043, + "loss": 1.2434, + "step": 1996 + }, + { + "epoch": 0.3555911680911681, + "grad_norm": 0.5154930949211121, + "learning_rate": 0.00019619539746708128, + "loss": 1.1747, + "step": 1997 + }, + { + "epoch": 0.3557692307692308, + "grad_norm": 0.4377825856208801, + "learning_rate": 0.00019619157225366315, + "loss": 0.9547, + "step": 1998 + }, + { + "epoch": 0.3559472934472934, + "grad_norm": 0.530714213848114, + "learning_rate": 0.00019618774515557097, + "loss": 1.2057, + "step": 1999 + }, + { + "epoch": 0.3561253561253561, + "grad_norm": 0.5703464150428772, + "learning_rate": 0.00019618391617287978, + "loss": 1.3068, + "step": 2000 + }, + { + "epoch": 0.3563034188034188, + "grad_norm": 0.4862228333950043, + "learning_rate": 0.0001961800853056645, + "loss": 1.0077, + "step": 2001 + }, + { + "epoch": 0.35648148148148145, + "grad_norm": 0.5575395822525024, + "learning_rate": 0.00019617625255400028, + "loss": 1.03, + "step": 2002 + }, + { + "epoch": 0.35665954415954415, + "grad_norm": 0.4826279580593109, + "learning_rate": 0.0001961724179179622, + 
"loss": 1.268, + "step": 2003 + }, + { + "epoch": 0.35683760683760685, + "grad_norm": 0.49423274397850037, + "learning_rate": 0.00019616858139762534, + "loss": 1.1305, + "step": 2004 + }, + { + "epoch": 0.35701566951566954, + "grad_norm": 0.5208541750907898, + "learning_rate": 0.00019616474299306491, + "loss": 1.1651, + "step": 2005 + }, + { + "epoch": 0.3571937321937322, + "grad_norm": 0.5324164032936096, + "learning_rate": 0.0001961609027043561, + "loss": 1.1406, + "step": 2006 + }, + { + "epoch": 0.3573717948717949, + "grad_norm": 0.45385462045669556, + "learning_rate": 0.00019615706053157416, + "loss": 1.0716, + "step": 2007 + }, + { + "epoch": 0.3575498575498576, + "grad_norm": 0.5016173720359802, + "learning_rate": 0.00019615321647479438, + "loss": 1.0878, + "step": 2008 + }, + { + "epoch": 0.3577279202279202, + "grad_norm": 0.5073097348213196, + "learning_rate": 0.00019614937053409205, + "loss": 1.237, + "step": 2009 + }, + { + "epoch": 0.3579059829059829, + "grad_norm": 0.48880141973495483, + "learning_rate": 0.00019614552270954256, + "loss": 0.8794, + "step": 2010 + }, + { + "epoch": 0.3580840455840456, + "grad_norm": 0.43902209401130676, + "learning_rate": 0.00019614167300122126, + "loss": 0.912, + "step": 2011 + }, + { + "epoch": 0.35826210826210825, + "grad_norm": 0.42809322476387024, + "learning_rate": 0.0001961378214092036, + "loss": 0.7804, + "step": 2012 + }, + { + "epoch": 0.35844017094017094, + "grad_norm": 0.4464281499385834, + "learning_rate": 0.00019613396793356503, + "loss": 1.0004, + "step": 2013 + }, + { + "epoch": 0.35861823361823364, + "grad_norm": 0.49085676670074463, + "learning_rate": 0.00019613011257438109, + "loss": 1.1087, + "step": 2014 + }, + { + "epoch": 0.3587962962962963, + "grad_norm": 0.4997732937335968, + "learning_rate": 0.00019612625533172725, + "loss": 0.9591, + "step": 2015 + }, + { + "epoch": 0.358974358974359, + "grad_norm": 0.48442545533180237, + "learning_rate": 0.00019612239620567912, + "loss": 0.9744, + "step": 2016 
+ }, + { + "epoch": 0.35915242165242167, + "grad_norm": 0.4989205002784729, + "learning_rate": 0.00019611853519631233, + "loss": 0.9844, + "step": 2017 + }, + { + "epoch": 0.3593304843304843, + "grad_norm": 0.6107521653175354, + "learning_rate": 0.00019611467230370248, + "loss": 1.147, + "step": 2018 + }, + { + "epoch": 0.359508547008547, + "grad_norm": 0.5594844818115234, + "learning_rate": 0.00019611080752792535, + "loss": 1.3195, + "step": 2019 + }, + { + "epoch": 0.3596866096866097, + "grad_norm": 0.4786946475505829, + "learning_rate": 0.00019610694086905656, + "loss": 1.2108, + "step": 2020 + }, + { + "epoch": 0.35986467236467234, + "grad_norm": 0.5186030268669128, + "learning_rate": 0.0001961030723271719, + "loss": 1.0008, + "step": 2021 + }, + { + "epoch": 0.36004273504273504, + "grad_norm": 0.4520573318004608, + "learning_rate": 0.0001960992019023472, + "loss": 1.1307, + "step": 2022 + }, + { + "epoch": 0.36022079772079774, + "grad_norm": 0.4983210563659668, + "learning_rate": 0.00019609532959465823, + "loss": 1.1486, + "step": 2023 + }, + { + "epoch": 0.3603988603988604, + "grad_norm": 0.6209200024604797, + "learning_rate": 0.00019609145540418094, + "loss": 1.2566, + "step": 2024 + }, + { + "epoch": 0.3605769230769231, + "grad_norm": 0.47047603130340576, + "learning_rate": 0.00019608757933099117, + "loss": 1.1588, + "step": 2025 + }, + { + "epoch": 0.36075498575498577, + "grad_norm": 0.5147389769554138, + "learning_rate": 0.0001960837013751649, + "loss": 1.2113, + "step": 2026 + }, + { + "epoch": 0.3609330484330484, + "grad_norm": 0.45826098322868347, + "learning_rate": 0.00019607982153677808, + "loss": 1.13, + "step": 2027 + }, + { + "epoch": 0.3611111111111111, + "grad_norm": 0.5699561834335327, + "learning_rate": 0.00019607593981590675, + "loss": 1.2476, + "step": 2028 + }, + { + "epoch": 0.3612891737891738, + "grad_norm": 0.5349239110946655, + "learning_rate": 0.000196072056212627, + "loss": 1.2295, + "step": 2029 + }, + { + "epoch": 
0.36146723646723644, + "grad_norm": 0.6212165355682373, + "learning_rate": 0.00019606817072701484, + "loss": 1.1965, + "step": 2030 + }, + { + "epoch": 0.36164529914529914, + "grad_norm": 0.4870990216732025, + "learning_rate": 0.00019606428335914645, + "loss": 1.4464, + "step": 2031 + }, + { + "epoch": 0.36182336182336183, + "grad_norm": 0.42427828907966614, + "learning_rate": 0.00019606039410909797, + "loss": 1.1546, + "step": 2032 + }, + { + "epoch": 0.36200142450142453, + "grad_norm": 0.5081788301467896, + "learning_rate": 0.0001960565029769456, + "loss": 1.1867, + "step": 2033 + }, + { + "epoch": 0.36217948717948717, + "grad_norm": 0.4813104271888733, + "learning_rate": 0.00019605260996276565, + "loss": 1.3726, + "step": 2034 + }, + { + "epoch": 0.36235754985754987, + "grad_norm": 0.4648851156234741, + "learning_rate": 0.0001960487150666343, + "loss": 1.2434, + "step": 2035 + }, + { + "epoch": 0.36253561253561256, + "grad_norm": 0.484161913394928, + "learning_rate": 0.00019604481828862792, + "loss": 1.1309, + "step": 2036 + }, + { + "epoch": 0.3627136752136752, + "grad_norm": 0.4929439127445221, + "learning_rate": 0.00019604091962882283, + "loss": 1.1007, + "step": 2037 + }, + { + "epoch": 0.3628917378917379, + "grad_norm": 0.45599642395973206, + "learning_rate": 0.00019603701908729544, + "loss": 1.2628, + "step": 2038 + }, + { + "epoch": 0.3630698005698006, + "grad_norm": 0.45295149087905884, + "learning_rate": 0.00019603311666412213, + "loss": 0.9808, + "step": 2039 + }, + { + "epoch": 0.36324786324786323, + "grad_norm": 0.48681163787841797, + "learning_rate": 0.00019602921235937942, + "loss": 1.0574, + "step": 2040 + }, + { + "epoch": 0.36342592592592593, + "grad_norm": 0.41232365369796753, + "learning_rate": 0.00019602530617314378, + "loss": 1.0454, + "step": 2041 + }, + { + "epoch": 0.3636039886039886, + "grad_norm": 0.46214723587036133, + "learning_rate": 0.00019602139810549174, + "loss": 0.9985, + "step": 2042 + }, + { + "epoch": 0.36378205128205127, + 
"grad_norm": 0.44307878613471985, + "learning_rate": 0.00019601748815649989, + "loss": 0.9683, + "step": 2043 + }, + { + "epoch": 0.36396011396011396, + "grad_norm": 0.4809451401233673, + "learning_rate": 0.00019601357632624477, + "loss": 1.028, + "step": 2044 + }, + { + "epoch": 0.36413817663817666, + "grad_norm": 0.4638497531414032, + "learning_rate": 0.0001960096626148031, + "loss": 0.9851, + "step": 2045 + }, + { + "epoch": 0.3643162393162393, + "grad_norm": 0.5942164063453674, + "learning_rate": 0.00019600574702225153, + "loss": 1.1606, + "step": 2046 + }, + { + "epoch": 0.364494301994302, + "grad_norm": 0.5171293616294861, + "learning_rate": 0.00019600182954866675, + "loss": 1.2335, + "step": 2047 + }, + { + "epoch": 0.3646723646723647, + "grad_norm": 0.5294404625892639, + "learning_rate": 0.00019599791019412558, + "loss": 1.0966, + "step": 2048 + }, + { + "epoch": 0.36485042735042733, + "grad_norm": 0.46117448806762695, + "learning_rate": 0.00019599398895870477, + "loss": 1.0565, + "step": 2049 + }, + { + "epoch": 0.36502849002849, + "grad_norm": 0.5385118126869202, + "learning_rate": 0.00019599006584248118, + "loss": 1.0076, + "step": 2050 + }, + { + "epoch": 0.3652065527065527, + "grad_norm": 0.4915166199207306, + "learning_rate": 0.00019598614084553165, + "loss": 0.9686, + "step": 2051 + }, + { + "epoch": 0.36538461538461536, + "grad_norm": 0.46769094467163086, + "learning_rate": 0.00019598221396793303, + "loss": 1.1217, + "step": 2052 + }, + { + "epoch": 0.36556267806267806, + "grad_norm": 0.5440493822097778, + "learning_rate": 0.00019597828520976236, + "loss": 1.2344, + "step": 2053 + }, + { + "epoch": 0.36574074074074076, + "grad_norm": 0.616727352142334, + "learning_rate": 0.00019597435457109657, + "loss": 1.2953, + "step": 2054 + }, + { + "epoch": 0.3659188034188034, + "grad_norm": 0.4859183430671692, + "learning_rate": 0.00019597042205201265, + "loss": 1.16, + "step": 2055 + }, + { + "epoch": 0.3660968660968661, + "grad_norm": 0.47056329250335693, + 
"learning_rate": 0.0001959664876525877, + "loss": 0.9982, + "step": 2056 + }, + { + "epoch": 0.3662749287749288, + "grad_norm": 0.48347967863082886, + "learning_rate": 0.00019596255137289875, + "loss": 1.0966, + "step": 2057 + }, + { + "epoch": 0.36645299145299143, + "grad_norm": 0.5068454742431641, + "learning_rate": 0.00019595861321302296, + "loss": 1.2891, + "step": 2058 + }, + { + "epoch": 0.3666310541310541, + "grad_norm": 0.5702359080314636, + "learning_rate": 0.00019595467317303747, + "loss": 1.1394, + "step": 2059 + }, + { + "epoch": 0.3668091168091168, + "grad_norm": 0.5028812885284424, + "learning_rate": 0.0001959507312530195, + "loss": 1.2324, + "step": 2060 + }, + { + "epoch": 0.36698717948717946, + "grad_norm": 0.4672880172729492, + "learning_rate": 0.00019594678745304628, + "loss": 1.0581, + "step": 2061 + }, + { + "epoch": 0.36716524216524216, + "grad_norm": 0.5233900547027588, + "learning_rate": 0.00019594284177319504, + "loss": 1.138, + "step": 2062 + }, + { + "epoch": 0.36734330484330485, + "grad_norm": 0.46871712803840637, + "learning_rate": 0.00019593889421354316, + "loss": 1.2159, + "step": 2063 + }, + { + "epoch": 0.36752136752136755, + "grad_norm": 0.5180533528327942, + "learning_rate": 0.00019593494477416793, + "loss": 1.1116, + "step": 2064 + }, + { + "epoch": 0.3676994301994302, + "grad_norm": 0.5398494005203247, + "learning_rate": 0.0001959309934551467, + "loss": 1.2038, + "step": 2065 + }, + { + "epoch": 0.3678774928774929, + "grad_norm": 0.4850373864173889, + "learning_rate": 0.000195927040256557, + "loss": 1.4315, + "step": 2066 + }, + { + "epoch": 0.3680555555555556, + "grad_norm": 0.49190905690193176, + "learning_rate": 0.0001959230851784762, + "loss": 0.9993, + "step": 2067 + }, + { + "epoch": 0.3682336182336182, + "grad_norm": 0.4546903073787689, + "learning_rate": 0.00019591912822098178, + "loss": 1.0979, + "step": 2068 + }, + { + "epoch": 0.3684116809116809, + "grad_norm": 0.4726468622684479, + "learning_rate": 
0.00019591516938415133, + "loss": 1.1629, + "step": 2069 + }, + { + "epoch": 0.3685897435897436, + "grad_norm": 0.47856009006500244, + "learning_rate": 0.00019591120866806235, + "loss": 1.2048, + "step": 2070 + }, + { + "epoch": 0.36876780626780625, + "grad_norm": 0.46847718954086304, + "learning_rate": 0.0001959072460727925, + "loss": 1.0958, + "step": 2071 + }, + { + "epoch": 0.36894586894586895, + "grad_norm": 0.47164350748062134, + "learning_rate": 0.0001959032815984194, + "loss": 1.1912, + "step": 2072 + }, + { + "epoch": 0.36912393162393164, + "grad_norm": 0.4838213324546814, + "learning_rate": 0.0001958993152450207, + "loss": 1.1466, + "step": 2073 + }, + { + "epoch": 0.3693019943019943, + "grad_norm": 0.47234636545181274, + "learning_rate": 0.00019589534701267412, + "loss": 0.9475, + "step": 2074 + }, + { + "epoch": 0.369480056980057, + "grad_norm": 0.4913126826286316, + "learning_rate": 0.00019589137690145746, + "loss": 1.1571, + "step": 2075 + }, + { + "epoch": 0.3696581196581197, + "grad_norm": 0.4696233570575714, + "learning_rate": 0.00019588740491144842, + "loss": 0.9797, + "step": 2076 + }, + { + "epoch": 0.3698361823361823, + "grad_norm": 0.46146106719970703, + "learning_rate": 0.00019588343104272492, + "loss": 1.027, + "step": 2077 + }, + { + "epoch": 0.370014245014245, + "grad_norm": 0.4920627176761627, + "learning_rate": 0.00019587945529536474, + "loss": 1.1008, + "step": 2078 + }, + { + "epoch": 0.3701923076923077, + "grad_norm": 0.4854249954223633, + "learning_rate": 0.0001958754776694458, + "loss": 1.0759, + "step": 2079 + }, + { + "epoch": 0.37037037037037035, + "grad_norm": 0.4884897768497467, + "learning_rate": 0.00019587149816504608, + "loss": 1.1403, + "step": 2080 + }, + { + "epoch": 0.37054843304843305, + "grad_norm": 0.5062584280967712, + "learning_rate": 0.00019586751678224345, + "loss": 1.0185, + "step": 2081 + }, + { + "epoch": 0.37072649572649574, + "grad_norm": 0.44697675108909607, + "learning_rate": 0.000195863533521116, + "loss": 
1.0462, + "step": 2082 + }, + { + "epoch": 0.3709045584045584, + "grad_norm": 0.5122885704040527, + "learning_rate": 0.00019585954838174176, + "loss": 1.108, + "step": 2083 + }, + { + "epoch": 0.3710826210826211, + "grad_norm": 0.486650288105011, + "learning_rate": 0.0001958555613641988, + "loss": 1.126, + "step": 2084 + }, + { + "epoch": 0.3712606837606838, + "grad_norm": 0.5296297669410706, + "learning_rate": 0.00019585157246856523, + "loss": 1.1757, + "step": 2085 + }, + { + "epoch": 0.3714387464387464, + "grad_norm": 0.4935721457004547, + "learning_rate": 0.0001958475816949192, + "loss": 1.1654, + "step": 2086 + }, + { + "epoch": 0.3716168091168091, + "grad_norm": 0.6226509213447571, + "learning_rate": 0.00019584358904333891, + "loss": 1.1981, + "step": 2087 + }, + { + "epoch": 0.3717948717948718, + "grad_norm": 0.44094228744506836, + "learning_rate": 0.0001958395945139026, + "loss": 0.8468, + "step": 2088 + }, + { + "epoch": 0.37197293447293445, + "grad_norm": 0.5335884690284729, + "learning_rate": 0.00019583559810668858, + "loss": 1.1597, + "step": 2089 + }, + { + "epoch": 0.37215099715099714, + "grad_norm": 0.4585414528846741, + "learning_rate": 0.000195831599821775, + "loss": 0.9343, + "step": 2090 + }, + { + "epoch": 0.37232905982905984, + "grad_norm": 0.533087432384491, + "learning_rate": 0.00019582759965924035, + "loss": 1.1209, + "step": 2091 + }, + { + "epoch": 0.37250712250712253, + "grad_norm": 0.5302683711051941, + "learning_rate": 0.00019582359761916295, + "loss": 1.236, + "step": 2092 + }, + { + "epoch": 0.3726851851851852, + "grad_norm": 0.4522508382797241, + "learning_rate": 0.00019581959370162122, + "loss": 1.0196, + "step": 2093 + }, + { + "epoch": 0.37286324786324787, + "grad_norm": 0.52391517162323, + "learning_rate": 0.00019581558790669358, + "loss": 1.0077, + "step": 2094 + }, + { + "epoch": 0.37304131054131057, + "grad_norm": 0.47144797444343567, + "learning_rate": 0.00019581158023445854, + "loss": 1.0956, + "step": 2095 + }, + { + 
"epoch": 0.3732193732193732, + "grad_norm": 0.4486723244190216, + "learning_rate": 0.00019580757068499459, + "loss": 0.8697, + "step": 2096 + }, + { + "epoch": 0.3733974358974359, + "grad_norm": 0.4626580476760864, + "learning_rate": 0.00019580355925838034, + "loss": 0.8489, + "step": 2097 + }, + { + "epoch": 0.3735754985754986, + "grad_norm": 0.5647920370101929, + "learning_rate": 0.00019579954595469438, + "loss": 1.1458, + "step": 2098 + }, + { + "epoch": 0.37375356125356124, + "grad_norm": 0.4734349846839905, + "learning_rate": 0.00019579553077401528, + "loss": 1.1036, + "step": 2099 + }, + { + "epoch": 0.37393162393162394, + "grad_norm": 0.5624295473098755, + "learning_rate": 0.00019579151371642176, + "loss": 0.9793, + "step": 2100 + }, + { + "epoch": 0.37410968660968663, + "grad_norm": 0.47507283091545105, + "learning_rate": 0.00019578749478199256, + "loss": 1.0371, + "step": 2101 + }, + { + "epoch": 0.37428774928774927, + "grad_norm": 0.550865113735199, + "learning_rate": 0.00019578347397080633, + "loss": 1.046, + "step": 2102 + }, + { + "epoch": 0.37446581196581197, + "grad_norm": 0.5249403715133667, + "learning_rate": 0.00019577945128294193, + "loss": 1.3185, + "step": 2103 + }, + { + "epoch": 0.37464387464387466, + "grad_norm": 0.4921024739742279, + "learning_rate": 0.00019577542671847815, + "loss": 1.0758, + "step": 2104 + }, + { + "epoch": 0.3748219373219373, + "grad_norm": 0.5351784825325012, + "learning_rate": 0.00019577140027749384, + "loss": 1.067, + "step": 2105 + }, + { + "epoch": 0.375, + "grad_norm": 0.44420507550239563, + "learning_rate": 0.00019576737196006787, + "loss": 1.1065, + "step": 2106 + }, + { + "epoch": 0.3751780626780627, + "grad_norm": 0.531384289264679, + "learning_rate": 0.0001957633417662792, + "loss": 1.1634, + "step": 2107 + }, + { + "epoch": 0.37535612535612534, + "grad_norm": 0.5167618989944458, + "learning_rate": 0.00019575930969620677, + "loss": 1.1646, + "step": 2108 + }, + { + "epoch": 0.37553418803418803, + "grad_norm": 
0.41487228870391846, + "learning_rate": 0.0001957552757499296, + "loss": 0.793, + "step": 2109 + }, + { + "epoch": 0.37571225071225073, + "grad_norm": 0.5110787153244019, + "learning_rate": 0.00019575123992752672, + "loss": 1.1752, + "step": 2110 + }, + { + "epoch": 0.37589031339031337, + "grad_norm": 0.4422051012516022, + "learning_rate": 0.00019574720222907717, + "loss": 1.0102, + "step": 2111 + }, + { + "epoch": 0.37606837606837606, + "grad_norm": 0.4757538139820099, + "learning_rate": 0.0001957431626546601, + "loss": 1.0467, + "step": 2112 + }, + { + "epoch": 0.37624643874643876, + "grad_norm": 0.4736764430999756, + "learning_rate": 0.00019573912120435466, + "loss": 1.3048, + "step": 2113 + }, + { + "epoch": 0.3764245014245014, + "grad_norm": 0.49894335865974426, + "learning_rate": 0.00019573507787824004, + "loss": 1.0502, + "step": 2114 + }, + { + "epoch": 0.3766025641025641, + "grad_norm": 0.48120981454849243, + "learning_rate": 0.00019573103267639543, + "loss": 1.2405, + "step": 2115 + }, + { + "epoch": 0.3767806267806268, + "grad_norm": 0.4826737642288208, + "learning_rate": 0.0001957269855989001, + "loss": 1.1189, + "step": 2116 + }, + { + "epoch": 0.37695868945868943, + "grad_norm": 0.4736921489238739, + "learning_rate": 0.0001957229366458333, + "loss": 1.2862, + "step": 2117 + }, + { + "epoch": 0.37713675213675213, + "grad_norm": 0.3895208537578583, + "learning_rate": 0.00019571888581727446, + "loss": 1.0573, + "step": 2118 + }, + { + "epoch": 0.3773148148148148, + "grad_norm": 0.5107510089874268, + "learning_rate": 0.00019571483311330284, + "loss": 1.2913, + "step": 2119 + }, + { + "epoch": 0.37749287749287747, + "grad_norm": 0.4543241262435913, + "learning_rate": 0.00019571077853399794, + "loss": 0.949, + "step": 2120 + }, + { + "epoch": 0.37767094017094016, + "grad_norm": 0.46897491812705994, + "learning_rate": 0.00019570672207943913, + "loss": 1.2235, + "step": 2121 + }, + { + "epoch": 0.37784900284900286, + "grad_norm": 0.4812130630016327, + 
"learning_rate": 0.0001957026637497059, + "loss": 0.8857, + "step": 2122 + }, + { + "epoch": 0.37802706552706555, + "grad_norm": 0.47452476620674133, + "learning_rate": 0.00019569860354487782, + "loss": 1.0549, + "step": 2123 + }, + { + "epoch": 0.3782051282051282, + "grad_norm": 0.49879950284957886, + "learning_rate": 0.00019569454146503438, + "loss": 1.0475, + "step": 2124 + }, + { + "epoch": 0.3783831908831909, + "grad_norm": 0.4246445894241333, + "learning_rate": 0.00019569047751025518, + "loss": 0.8788, + "step": 2125 + }, + { + "epoch": 0.3785612535612536, + "grad_norm": 0.4868565499782562, + "learning_rate": 0.00019568641168061986, + "loss": 1.1801, + "step": 2126 + }, + { + "epoch": 0.3787393162393162, + "grad_norm": 0.46723654866218567, + "learning_rate": 0.0001956823439762081, + "loss": 1.1661, + "step": 2127 + }, + { + "epoch": 0.3789173789173789, + "grad_norm": 0.4989059269428253, + "learning_rate": 0.00019567827439709954, + "loss": 1.3037, + "step": 2128 + }, + { + "epoch": 0.3790954415954416, + "grad_norm": 0.441307932138443, + "learning_rate": 0.00019567420294337395, + "loss": 1.0197, + "step": 2129 + }, + { + "epoch": 0.37927350427350426, + "grad_norm": 0.5200160145759583, + "learning_rate": 0.0001956701296151111, + "loss": 1.3366, + "step": 2130 + }, + { + "epoch": 0.37945156695156695, + "grad_norm": 0.43610256910324097, + "learning_rate": 0.00019566605441239082, + "loss": 1.0148, + "step": 2131 + }, + { + "epoch": 0.37962962962962965, + "grad_norm": 0.4160982370376587, + "learning_rate": 0.00019566197733529293, + "loss": 1.0758, + "step": 2132 + }, + { + "epoch": 0.3798076923076923, + "grad_norm": 0.5007950663566589, + "learning_rate": 0.00019565789838389726, + "loss": 1.1937, + "step": 2133 + }, + { + "epoch": 0.379985754985755, + "grad_norm": 0.4991525113582611, + "learning_rate": 0.00019565381755828385, + "loss": 1.1788, + "step": 2134 + }, + { + "epoch": 0.3801638176638177, + "grad_norm": 0.6313113570213318, + "learning_rate": 
0.00019564973485853258, + "loss": 1.1241, + "step": 2135 + }, + { + "epoch": 0.3803418803418803, + "grad_norm": 0.49736538529396057, + "learning_rate": 0.0001956456502847234, + "loss": 1.0299, + "step": 2136 + }, + { + "epoch": 0.380519943019943, + "grad_norm": 0.4384380578994751, + "learning_rate": 0.00019564156383693643, + "loss": 1.132, + "step": 2137 + }, + { + "epoch": 0.3806980056980057, + "grad_norm": 0.4696183502674103, + "learning_rate": 0.00019563747551525168, + "loss": 1.1145, + "step": 2138 + }, + { + "epoch": 0.38087606837606836, + "grad_norm": 0.42039749026298523, + "learning_rate": 0.0001956333853197493, + "loss": 0.9549, + "step": 2139 + }, + { + "epoch": 0.38105413105413105, + "grad_norm": 0.5547221899032593, + "learning_rate": 0.00019562929325050936, + "loss": 1.0476, + "step": 2140 + }, + { + "epoch": 0.38123219373219375, + "grad_norm": 0.4803301692008972, + "learning_rate": 0.0001956251993076121, + "loss": 1.1285, + "step": 2141 + }, + { + "epoch": 0.3814102564102564, + "grad_norm": 0.609501838684082, + "learning_rate": 0.00019562110349113766, + "loss": 1.2375, + "step": 2142 + }, + { + "epoch": 0.3815883190883191, + "grad_norm": 0.5134759545326233, + "learning_rate": 0.00019561700580116639, + "loss": 1.0895, + "step": 2143 + }, + { + "epoch": 0.3817663817663818, + "grad_norm": 0.5086711049079895, + "learning_rate": 0.00019561290623777846, + "loss": 1.1139, + "step": 2144 + }, + { + "epoch": 0.3819444444444444, + "grad_norm": 0.5371596813201904, + "learning_rate": 0.00019560880480105428, + "loss": 0.9302, + "step": 2145 + }, + { + "epoch": 0.3821225071225071, + "grad_norm": 0.4966319799423218, + "learning_rate": 0.00019560470149107418, + "loss": 1.2485, + "step": 2146 + }, + { + "epoch": 0.3823005698005698, + "grad_norm": 0.5296950340270996, + "learning_rate": 0.00019560059630791855, + "loss": 1.4449, + "step": 2147 + }, + { + "epoch": 0.38247863247863245, + "grad_norm": 0.5564194321632385, + "learning_rate": 0.00019559648925166783, + "loss": 
1.0817, + "step": 2148 + }, + { + "epoch": 0.38265669515669515, + "grad_norm": 0.5763841867446899, + "learning_rate": 0.0001955923803224025, + "loss": 1.1915, + "step": 2149 + }, + { + "epoch": 0.38283475783475784, + "grad_norm": 0.4782295823097229, + "learning_rate": 0.00019558826952020304, + "loss": 1.1317, + "step": 2150 + }, + { + "epoch": 0.38301282051282054, + "grad_norm": 0.4876856207847595, + "learning_rate": 0.00019558415684515002, + "loss": 1.2113, + "step": 2151 + }, + { + "epoch": 0.3831908831908832, + "grad_norm": 0.4894421398639679, + "learning_rate": 0.00019558004229732398, + "loss": 1.0761, + "step": 2152 + }, + { + "epoch": 0.3833689458689459, + "grad_norm": 0.47914227843284607, + "learning_rate": 0.0001955759258768056, + "loss": 1.0869, + "step": 2153 + }, + { + "epoch": 0.38354700854700857, + "grad_norm": 0.43933629989624023, + "learning_rate": 0.00019557180758367543, + "loss": 1.0581, + "step": 2154 + }, + { + "epoch": 0.3837250712250712, + "grad_norm": 0.4078103005886078, + "learning_rate": 0.00019556768741801428, + "loss": 1.065, + "step": 2155 + }, + { + "epoch": 0.3839031339031339, + "grad_norm": 0.5112793445587158, + "learning_rate": 0.00019556356537990278, + "loss": 1.2023, + "step": 2156 + }, + { + "epoch": 0.3840811965811966, + "grad_norm": 0.4699678122997284, + "learning_rate": 0.00019555944146942177, + "loss": 1.2459, + "step": 2157 + }, + { + "epoch": 0.38425925925925924, + "grad_norm": 0.4723528027534485, + "learning_rate": 0.00019555531568665198, + "loss": 1.2204, + "step": 2158 + }, + { + "epoch": 0.38443732193732194, + "grad_norm": 0.4648225009441376, + "learning_rate": 0.00019555118803167432, + "loss": 1.1355, + "step": 2159 + }, + { + "epoch": 0.38461538461538464, + "grad_norm": 0.49861815571784973, + "learning_rate": 0.00019554705850456961, + "loss": 1.1301, + "step": 2160 + }, + { + "epoch": 0.3847934472934473, + "grad_norm": 0.4076344966888428, + "learning_rate": 0.00019554292710541874, + "loss": 0.8997, + "step": 2161 + }, + 
{ + "epoch": 0.38497150997151, + "grad_norm": 0.5510796308517456, + "learning_rate": 0.00019553879383430272, + "loss": 1.0594, + "step": 2162 + }, + { + "epoch": 0.38514957264957267, + "grad_norm": 0.55793696641922, + "learning_rate": 0.00019553465869130249, + "loss": 1.1284, + "step": 2163 + }, + { + "epoch": 0.3853276353276353, + "grad_norm": 0.5096491575241089, + "learning_rate": 0.00019553052167649906, + "loss": 1.0419, + "step": 2164 + }, + { + "epoch": 0.385505698005698, + "grad_norm": 0.49077361822128296, + "learning_rate": 0.0001955263827899735, + "loss": 1.1632, + "step": 2165 + }, + { + "epoch": 0.3856837606837607, + "grad_norm": 0.5546894073486328, + "learning_rate": 0.00019552224203180693, + "loss": 1.1487, + "step": 2166 + }, + { + "epoch": 0.38586182336182334, + "grad_norm": 0.4930037260055542, + "learning_rate": 0.00019551809940208047, + "loss": 1.2668, + "step": 2167 + }, + { + "epoch": 0.38603988603988604, + "grad_norm": 0.5600671172142029, + "learning_rate": 0.00019551395490087525, + "loss": 1.3988, + "step": 2168 + }, + { + "epoch": 0.38621794871794873, + "grad_norm": 0.45897629857063293, + "learning_rate": 0.0001955098085282725, + "loss": 0.7792, + "step": 2169 + }, + { + "epoch": 0.3863960113960114, + "grad_norm": 0.46138936281204224, + "learning_rate": 0.00019550566028435346, + "loss": 1.1749, + "step": 2170 + }, + { + "epoch": 0.38657407407407407, + "grad_norm": 0.5136167407035828, + "learning_rate": 0.0001955015101691994, + "loss": 1.0153, + "step": 2171 + }, + { + "epoch": 0.38675213675213677, + "grad_norm": 0.4886440336704254, + "learning_rate": 0.00019549735818289165, + "loss": 1.0006, + "step": 2172 + }, + { + "epoch": 0.3869301994301994, + "grad_norm": 0.4339776635169983, + "learning_rate": 0.00019549320432551154, + "loss": 1.0109, + "step": 2173 + }, + { + "epoch": 0.3871082621082621, + "grad_norm": 0.48729443550109863, + "learning_rate": 0.00019548904859714044, + "loss": 1.2016, + "step": 2174 + }, + { + "epoch": 0.3872863247863248, + 
"grad_norm": 0.5128757357597351, + "learning_rate": 0.0001954848909978598, + "loss": 1.085, + "step": 2175 + }, + { + "epoch": 0.38746438746438744, + "grad_norm": 0.49636292457580566, + "learning_rate": 0.0001954807315277511, + "loss": 1.0671, + "step": 2176 + }, + { + "epoch": 0.38764245014245013, + "grad_norm": 0.4946988821029663, + "learning_rate": 0.00019547657018689578, + "loss": 1.2091, + "step": 2177 + }, + { + "epoch": 0.38782051282051283, + "grad_norm": 0.49004554748535156, + "learning_rate": 0.00019547240697537544, + "loss": 1.0241, + "step": 2178 + }, + { + "epoch": 0.38799857549857547, + "grad_norm": 0.48750075697898865, + "learning_rate": 0.00019546824189327157, + "loss": 1.1082, + "step": 2179 + }, + { + "epoch": 0.38817663817663817, + "grad_norm": 0.47726166248321533, + "learning_rate": 0.00019546407494066585, + "loss": 1.1275, + "step": 2180 + }, + { + "epoch": 0.38835470085470086, + "grad_norm": 0.5253444910049438, + "learning_rate": 0.00019545990611763986, + "loss": 1.0164, + "step": 2181 + }, + { + "epoch": 0.38853276353276356, + "grad_norm": 0.4470371603965759, + "learning_rate": 0.00019545573542427533, + "loss": 1.0138, + "step": 2182 + }, + { + "epoch": 0.3887108262108262, + "grad_norm": 0.6645087599754333, + "learning_rate": 0.00019545156286065397, + "loss": 1.0884, + "step": 2183 + }, + { + "epoch": 0.3888888888888889, + "grad_norm": 0.498775839805603, + "learning_rate": 0.0001954473884268575, + "loss": 1.1035, + "step": 2184 + }, + { + "epoch": 0.3890669515669516, + "grad_norm": 0.5830566883087158, + "learning_rate": 0.00019544321212296772, + "loss": 1.1665, + "step": 2185 + }, + { + "epoch": 0.38924501424501423, + "grad_norm": 0.48162809014320374, + "learning_rate": 0.00019543903394906646, + "loss": 1.1035, + "step": 2186 + }, + { + "epoch": 0.3894230769230769, + "grad_norm": 0.46334075927734375, + "learning_rate": 0.0001954348539052356, + "loss": 0.9764, + "step": 2187 + }, + { + "epoch": 0.3896011396011396, + "grad_norm": 
0.6343515515327454, + "learning_rate": 0.00019543067199155704, + "loss": 0.9474, + "step": 2188 + }, + { + "epoch": 0.38977920227920226, + "grad_norm": 0.4867806136608124, + "learning_rate": 0.0001954264882081127, + "loss": 1.1161, + "step": 2189 + }, + { + "epoch": 0.38995726495726496, + "grad_norm": 0.49305734038352966, + "learning_rate": 0.00019542230255498454, + "loss": 1.1825, + "step": 2190 + }, + { + "epoch": 0.39013532763532766, + "grad_norm": 0.518465518951416, + "learning_rate": 0.00019541811503225457, + "loss": 1.0695, + "step": 2191 + }, + { + "epoch": 0.3903133903133903, + "grad_norm": 0.4892457127571106, + "learning_rate": 0.00019541392564000488, + "loss": 1.3113, + "step": 2192 + }, + { + "epoch": 0.390491452991453, + "grad_norm": 0.5150920152664185, + "learning_rate": 0.00019540973437831753, + "loss": 1.0735, + "step": 2193 + }, + { + "epoch": 0.3906695156695157, + "grad_norm": 0.5414708256721497, + "learning_rate": 0.00019540554124727462, + "loss": 1.0773, + "step": 2194 + }, + { + "epoch": 0.39084757834757833, + "grad_norm": 0.49826398491859436, + "learning_rate": 0.0001954013462469583, + "loss": 1.0542, + "step": 2195 + }, + { + "epoch": 0.391025641025641, + "grad_norm": 0.5203596949577332, + "learning_rate": 0.0001953971493774508, + "loss": 1.178, + "step": 2196 + }, + { + "epoch": 0.3912037037037037, + "grad_norm": 0.45095738768577576, + "learning_rate": 0.00019539295063883432, + "loss": 1.1254, + "step": 2197 + }, + { + "epoch": 0.39138176638176636, + "grad_norm": 0.4938857853412628, + "learning_rate": 0.00019538875003119113, + "loss": 1.1061, + "step": 2198 + }, + { + "epoch": 0.39155982905982906, + "grad_norm": 0.5260919332504272, + "learning_rate": 0.00019538454755460354, + "loss": 1.3292, + "step": 2199 + }, + { + "epoch": 0.39173789173789175, + "grad_norm": 0.46527108550071716, + "learning_rate": 0.00019538034320915388, + "loss": 1.2074, + "step": 2200 + }, + { + "epoch": 0.3919159544159544, + "grad_norm": 0.5608304738998413, + 
"learning_rate": 0.00019537613699492453, + "loss": 1.0385, + "step": 2201 + }, + { + "epoch": 0.3920940170940171, + "grad_norm": 0.5056684613227844, + "learning_rate": 0.00019537192891199792, + "loss": 1.1513, + "step": 2202 + }, + { + "epoch": 0.3922720797720798, + "grad_norm": 0.3764426112174988, + "learning_rate": 0.00019536771896045644, + "loss": 0.8966, + "step": 2203 + }, + { + "epoch": 0.3924501424501424, + "grad_norm": 0.4983638823032379, + "learning_rate": 0.0001953635071403827, + "loss": 1.097, + "step": 2204 + }, + { + "epoch": 0.3926282051282051, + "grad_norm": 0.5733919739723206, + "learning_rate": 0.00019535929345185904, + "loss": 1.4992, + "step": 2205 + }, + { + "epoch": 0.3928062678062678, + "grad_norm": 0.632064163684845, + "learning_rate": 0.00019535507789496817, + "loss": 1.0611, + "step": 2206 + }, + { + "epoch": 0.39298433048433046, + "grad_norm": 0.409978449344635, + "learning_rate": 0.00019535086046979262, + "loss": 0.7172, + "step": 2207 + }, + { + "epoch": 0.39316239316239315, + "grad_norm": 0.40910813212394714, + "learning_rate": 0.00019534664117641502, + "loss": 0.8803, + "step": 2208 + }, + { + "epoch": 0.39334045584045585, + "grad_norm": 0.4696179926395416, + "learning_rate": 0.00019534242001491807, + "loss": 1.1551, + "step": 2209 + }, + { + "epoch": 0.39351851851851855, + "grad_norm": 0.538425862789154, + "learning_rate": 0.00019533819698538444, + "loss": 1.1296, + "step": 2210 + }, + { + "epoch": 0.3936965811965812, + "grad_norm": 0.5913630723953247, + "learning_rate": 0.00019533397208789692, + "loss": 0.9757, + "step": 2211 + }, + { + "epoch": 0.3938746438746439, + "grad_norm": 0.5649870038032532, + "learning_rate": 0.00019532974532253822, + "loss": 0.9976, + "step": 2212 + }, + { + "epoch": 0.3940527065527066, + "grad_norm": 0.5012063980102539, + "learning_rate": 0.00019532551668939121, + "loss": 0.9969, + "step": 2213 + }, + { + "epoch": 0.3942307692307692, + "grad_norm": 0.5098594427108765, + "learning_rate": 
0.00019532128618853872, + "loss": 1.1229, + "step": 2214 + }, + { + "epoch": 0.3944088319088319, + "grad_norm": 0.4753342568874359, + "learning_rate": 0.0001953170538200636, + "loss": 1.0808, + "step": 2215 + }, + { + "epoch": 0.3945868945868946, + "grad_norm": 0.4770098626613617, + "learning_rate": 0.00019531281958404888, + "loss": 1.0656, + "step": 2216 + }, + { + "epoch": 0.39476495726495725, + "grad_norm": 0.6007979512214661, + "learning_rate": 0.00019530858348057746, + "loss": 1.0093, + "step": 2217 + }, + { + "epoch": 0.39494301994301995, + "grad_norm": 0.4501650929450989, + "learning_rate": 0.00019530434550973227, + "loss": 0.8557, + "step": 2218 + }, + { + "epoch": 0.39512108262108264, + "grad_norm": 0.5123980641365051, + "learning_rate": 0.00019530010567159645, + "loss": 0.9833, + "step": 2219 + }, + { + "epoch": 0.3952991452991453, + "grad_norm": 0.4623969495296478, + "learning_rate": 0.000195295863966253, + "loss": 0.913, + "step": 2220 + }, + { + "epoch": 0.395477207977208, + "grad_norm": 0.4341880679130554, + "learning_rate": 0.0001952916203937851, + "loss": 1.0234, + "step": 2221 + }, + { + "epoch": 0.3956552706552707, + "grad_norm": 0.5935006141662598, + "learning_rate": 0.00019528737495427581, + "loss": 1.061, + "step": 2222 + }, + { + "epoch": 0.3958333333333333, + "grad_norm": 0.44835174083709717, + "learning_rate": 0.00019528312764780837, + "loss": 1.1567, + "step": 2223 + }, + { + "epoch": 0.396011396011396, + "grad_norm": 0.5476976633071899, + "learning_rate": 0.00019527887847446595, + "loss": 1.2304, + "step": 2224 + }, + { + "epoch": 0.3961894586894587, + "grad_norm": 0.4487939774990082, + "learning_rate": 0.00019527462743433187, + "loss": 1.1813, + "step": 2225 + }, + { + "epoch": 0.39636752136752135, + "grad_norm": 0.4053241014480591, + "learning_rate": 0.00019527037452748936, + "loss": 0.7899, + "step": 2226 + }, + { + "epoch": 0.39654558404558404, + "grad_norm": 0.534570574760437, + "learning_rate": 0.00019526611975402176, + "loss": 
1.0681, + "step": 2227 + }, + { + "epoch": 0.39672364672364674, + "grad_norm": 0.46096158027648926, + "learning_rate": 0.00019526186311401246, + "loss": 0.9234, + "step": 2228 + }, + { + "epoch": 0.3969017094017094, + "grad_norm": 0.47363516688346863, + "learning_rate": 0.00019525760460754483, + "loss": 1.0197, + "step": 2229 + }, + { + "epoch": 0.3970797720797721, + "grad_norm": 0.46317258477211, + "learning_rate": 0.00019525334423470234, + "loss": 1.2103, + "step": 2230 + }, + { + "epoch": 0.39725783475783477, + "grad_norm": 0.4924237132072449, + "learning_rate": 0.0001952490819955684, + "loss": 1.3299, + "step": 2231 + }, + { + "epoch": 0.3974358974358974, + "grad_norm": 0.5419978499412537, + "learning_rate": 0.0001952448178902266, + "loss": 1.2526, + "step": 2232 + }, + { + "epoch": 0.3976139601139601, + "grad_norm": 0.5003267526626587, + "learning_rate": 0.00019524055191876043, + "loss": 1.1073, + "step": 2233 + }, + { + "epoch": 0.3977920227920228, + "grad_norm": 0.621789276599884, + "learning_rate": 0.00019523628408125347, + "loss": 1.3409, + "step": 2234 + }, + { + "epoch": 0.39797008547008544, + "grad_norm": 0.44235602021217346, + "learning_rate": 0.0001952320143777894, + "loss": 0.9799, + "step": 2235 + }, + { + "epoch": 0.39814814814814814, + "grad_norm": 0.49954718351364136, + "learning_rate": 0.0001952277428084518, + "loss": 1.2227, + "step": 2236 + }, + { + "epoch": 0.39832621082621084, + "grad_norm": 0.5113739967346191, + "learning_rate": 0.00019522346937332443, + "loss": 1.1644, + "step": 2237 + }, + { + "epoch": 0.39850427350427353, + "grad_norm": 0.5026139616966248, + "learning_rate": 0.00019521919407249096, + "loss": 1.0823, + "step": 2238 + }, + { + "epoch": 0.39868233618233617, + "grad_norm": 0.4943205714225769, + "learning_rate": 0.0001952149169060352, + "loss": 1.0961, + "step": 2239 + }, + { + "epoch": 0.39886039886039887, + "grad_norm": 0.4680631458759308, + "learning_rate": 0.00019521063787404094, + "loss": 0.9787, + "step": 2240 + }, + { 
+ "epoch": 0.39903846153846156, + "grad_norm": 0.5511566400527954, + "learning_rate": 0.00019520635697659202, + "loss": 1.2543, + "step": 2241 + }, + { + "epoch": 0.3992165242165242, + "grad_norm": 0.5494263172149658, + "learning_rate": 0.00019520207421377229, + "loss": 1.1978, + "step": 2242 + }, + { + "epoch": 0.3993945868945869, + "grad_norm": 0.4850340485572815, + "learning_rate": 0.00019519778958566568, + "loss": 0.8531, + "step": 2243 + }, + { + "epoch": 0.3995726495726496, + "grad_norm": 0.47168150544166565, + "learning_rate": 0.00019519350309235613, + "loss": 1.0746, + "step": 2244 + }, + { + "epoch": 0.39975071225071224, + "grad_norm": 0.571133553981781, + "learning_rate": 0.00019518921473392765, + "loss": 1.2984, + "step": 2245 + }, + { + "epoch": 0.39992877492877493, + "grad_norm": 0.4636089503765106, + "learning_rate": 0.00019518492451046427, + "loss": 1.019, + "step": 2246 + }, + { + "epoch": 0.40010683760683763, + "grad_norm": 0.4573518931865692, + "learning_rate": 0.00019518063242205, + "loss": 1.1042, + "step": 2247 + }, + { + "epoch": 0.40028490028490027, + "grad_norm": 0.49098989367485046, + "learning_rate": 0.00019517633846876894, + "loss": 1.1224, + "step": 2248 + }, + { + "epoch": 0.40046296296296297, + "grad_norm": 0.5475491881370544, + "learning_rate": 0.00019517204265070523, + "loss": 1.0984, + "step": 2249 + }, + { + "epoch": 0.40064102564102566, + "grad_norm": 0.45498281717300415, + "learning_rate": 0.00019516774496794307, + "loss": 0.8883, + "step": 2250 + }, + { + "epoch": 0.4008190883190883, + "grad_norm": 0.4908423125743866, + "learning_rate": 0.00019516344542056666, + "loss": 1.328, + "step": 2251 + }, + { + "epoch": 0.400997150997151, + "grad_norm": 0.5474920272827148, + "learning_rate": 0.0001951591440086602, + "loss": 1.3825, + "step": 2252 + }, + { + "epoch": 0.4011752136752137, + "grad_norm": 0.5165615081787109, + "learning_rate": 0.000195154840732308, + "loss": 1.33, + "step": 2253 + }, + { + "epoch": 0.40135327635327633, + 
"grad_norm": 0.5185585021972656, + "learning_rate": 0.00019515053559159435, + "loss": 1.1689, + "step": 2254 + }, + { + "epoch": 0.40153133903133903, + "grad_norm": 0.5468854904174805, + "learning_rate": 0.00019514622858660363, + "loss": 1.2708, + "step": 2255 + }, + { + "epoch": 0.4017094017094017, + "grad_norm": 0.47556906938552856, + "learning_rate": 0.0001951419197174202, + "loss": 1.0488, + "step": 2256 + }, + { + "epoch": 0.40188746438746437, + "grad_norm": 0.5521323084831238, + "learning_rate": 0.0001951376089841285, + "loss": 1.0868, + "step": 2257 + }, + { + "epoch": 0.40206552706552706, + "grad_norm": 0.6029638051986694, + "learning_rate": 0.00019513329638681296, + "loss": 1.1735, + "step": 2258 + }, + { + "epoch": 0.40224358974358976, + "grad_norm": 0.4897766411304474, + "learning_rate": 0.00019512898192555812, + "loss": 1.1687, + "step": 2259 + }, + { + "epoch": 0.4024216524216524, + "grad_norm": 0.45527184009552, + "learning_rate": 0.00019512466560044848, + "loss": 1.0352, + "step": 2260 + }, + { + "epoch": 0.4025997150997151, + "grad_norm": 0.5025625824928284, + "learning_rate": 0.00019512034741156863, + "loss": 1.2503, + "step": 2261 + }, + { + "epoch": 0.4027777777777778, + "grad_norm": 0.46415451169013977, + "learning_rate": 0.00019511602735900317, + "loss": 1.032, + "step": 2262 + }, + { + "epoch": 0.40295584045584043, + "grad_norm": 0.4812934398651123, + "learning_rate": 0.00019511170544283678, + "loss": 1.0523, + "step": 2263 + }, + { + "epoch": 0.4031339031339031, + "grad_norm": 0.49937039613723755, + "learning_rate": 0.00019510738166315404, + "loss": 1.2238, + "step": 2264 + }, + { + "epoch": 0.4033119658119658, + "grad_norm": 0.5428698062896729, + "learning_rate": 0.00019510305602003975, + "loss": 1.0361, + "step": 2265 + }, + { + "epoch": 0.40349002849002846, + "grad_norm": 0.44836854934692383, + "learning_rate": 0.0001950987285135786, + "loss": 1.169, + "step": 2266 + }, + { + "epoch": 0.40366809116809116, + "grad_norm": 0.5071489214897156, 
+ "learning_rate": 0.00019509439914385549, + "loss": 1.1567, + "step": 2267 + }, + { + "epoch": 0.40384615384615385, + "grad_norm": 0.5204613208770752, + "learning_rate": 0.00019509006791095513, + "loss": 0.9949, + "step": 2268 + }, + { + "epoch": 0.40402421652421655, + "grad_norm": 0.4583234488964081, + "learning_rate": 0.00019508573481496238, + "loss": 0.9051, + "step": 2269 + }, + { + "epoch": 0.4042022792022792, + "grad_norm": 0.5436791181564331, + "learning_rate": 0.00019508139985596222, + "loss": 1.3239, + "step": 2270 + }, + { + "epoch": 0.4043803418803419, + "grad_norm": 0.48774269223213196, + "learning_rate": 0.00019507706303403954, + "loss": 1.2102, + "step": 2271 + }, + { + "epoch": 0.4045584045584046, + "grad_norm": 0.4742540717124939, + "learning_rate": 0.00019507272434927933, + "loss": 1.1137, + "step": 2272 + }, + { + "epoch": 0.4047364672364672, + "grad_norm": 0.531148374080658, + "learning_rate": 0.00019506838380176658, + "loss": 1.3162, + "step": 2273 + }, + { + "epoch": 0.4049145299145299, + "grad_norm": 0.5002314448356628, + "learning_rate": 0.0001950640413915863, + "loss": 1.0743, + "step": 2274 + }, + { + "epoch": 0.4050925925925926, + "grad_norm": 0.39826446771621704, + "learning_rate": 0.00019505969711882366, + "loss": 0.7698, + "step": 2275 + }, + { + "epoch": 0.40527065527065526, + "grad_norm": 0.5177471041679382, + "learning_rate": 0.00019505535098356371, + "loss": 1.1821, + "step": 2276 + }, + { + "epoch": 0.40544871794871795, + "grad_norm": 0.467241108417511, + "learning_rate": 0.00019505100298589158, + "loss": 0.8036, + "step": 2277 + }, + { + "epoch": 0.40562678062678065, + "grad_norm": 0.43711844086647034, + "learning_rate": 0.00019504665312589255, + "loss": 0.8667, + "step": 2278 + }, + { + "epoch": 0.4058048433048433, + "grad_norm": 0.4929116368293762, + "learning_rate": 0.00019504230140365177, + "loss": 1.1279, + "step": 2279 + }, + { + "epoch": 0.405982905982906, + "grad_norm": 0.5279183983802795, + "learning_rate": 
0.00019503794781925452, + "loss": 1.1318, + "step": 2280 + }, + { + "epoch": 0.4061609686609687, + "grad_norm": 0.549217939376831, + "learning_rate": 0.00019503359237278608, + "loss": 1.2007, + "step": 2281 + }, + { + "epoch": 0.4063390313390313, + "grad_norm": 0.5485880374908447, + "learning_rate": 0.00019502923506433187, + "loss": 1.1079, + "step": 2282 + }, + { + "epoch": 0.406517094017094, + "grad_norm": 0.48379644751548767, + "learning_rate": 0.0001950248758939772, + "loss": 0.9978, + "step": 2283 + }, + { + "epoch": 0.4066951566951567, + "grad_norm": 0.5943657755851746, + "learning_rate": 0.00019502051486180744, + "loss": 1.0466, + "step": 2284 + }, + { + "epoch": 0.40687321937321935, + "grad_norm": 0.5721273422241211, + "learning_rate": 0.00019501615196790812, + "loss": 1.2674, + "step": 2285 + }, + { + "epoch": 0.40705128205128205, + "grad_norm": 0.47624221444129944, + "learning_rate": 0.00019501178721236464, + "loss": 1.089, + "step": 2286 + }, + { + "epoch": 0.40722934472934474, + "grad_norm": 0.5091297030448914, + "learning_rate": 0.0001950074205952626, + "loss": 1.2035, + "step": 2287 + }, + { + "epoch": 0.4074074074074074, + "grad_norm": 0.45206236839294434, + "learning_rate": 0.0001950030521166875, + "loss": 0.9188, + "step": 2288 + }, + { + "epoch": 0.4075854700854701, + "grad_norm": 0.5563844442367554, + "learning_rate": 0.00019499868177672497, + "loss": 1.3444, + "step": 2289 + }, + { + "epoch": 0.4077635327635328, + "grad_norm": 0.4971138536930084, + "learning_rate": 0.00019499430957546055, + "loss": 1.1615, + "step": 2290 + }, + { + "epoch": 0.4079415954415954, + "grad_norm": 0.49355944991111755, + "learning_rate": 0.00019498993551298, + "loss": 1.1528, + "step": 2291 + }, + { + "epoch": 0.4081196581196581, + "grad_norm": 0.534705638885498, + "learning_rate": 0.000194985559589369, + "loss": 1.197, + "step": 2292 + }, + { + "epoch": 0.4082977207977208, + "grad_norm": 0.5113020539283752, + "learning_rate": 0.0001949811818047133, + "loss": 1.109, + 
"step": 2293 + }, + { + "epoch": 0.40847578347578345, + "grad_norm": 0.4823366701602936, + "learning_rate": 0.00019497680215909858, + "loss": 1.168, + "step": 2294 + }, + { + "epoch": 0.40865384615384615, + "grad_norm": 0.500792920589447, + "learning_rate": 0.00019497242065261077, + "loss": 1.1567, + "step": 2295 + }, + { + "epoch": 0.40883190883190884, + "grad_norm": 0.5047918558120728, + "learning_rate": 0.00019496803728533566, + "loss": 1.0515, + "step": 2296 + }, + { + "epoch": 0.40900997150997154, + "grad_norm": 0.474624365568161, + "learning_rate": 0.00019496365205735913, + "loss": 1.1747, + "step": 2297 + }, + { + "epoch": 0.4091880341880342, + "grad_norm": 0.5522183179855347, + "learning_rate": 0.0001949592649687671, + "loss": 1.1506, + "step": 2298 + }, + { + "epoch": 0.4093660968660969, + "grad_norm": 0.4526083767414093, + "learning_rate": 0.00019495487601964553, + "loss": 0.9968, + "step": 2299 + }, + { + "epoch": 0.40954415954415957, + "grad_norm": 0.545845091342926, + "learning_rate": 0.00019495048521008044, + "loss": 1.146, + "step": 2300 + }, + { + "epoch": 0.4097222222222222, + "grad_norm": 0.5475544333457947, + "learning_rate": 0.00019494609254015784, + "loss": 1.0101, + "step": 2301 + }, + { + "epoch": 0.4099002849002849, + "grad_norm": 0.43419042229652405, + "learning_rate": 0.00019494169800996373, + "loss": 1.065, + "step": 2302 + }, + { + "epoch": 0.4100783475783476, + "grad_norm": 0.44998374581336975, + "learning_rate": 0.00019493730161958435, + "loss": 0.9948, + "step": 2303 + }, + { + "epoch": 0.41025641025641024, + "grad_norm": 0.5401661992073059, + "learning_rate": 0.0001949329033691057, + "loss": 1.0473, + "step": 2304 + }, + { + "epoch": 0.41043447293447294, + "grad_norm": 0.48064103722572327, + "learning_rate": 0.00019492850325861404, + "loss": 1.0486, + "step": 2305 + }, + { + "epoch": 0.41061253561253563, + "grad_norm": 0.5398300290107727, + "learning_rate": 0.00019492410128819557, + "loss": 1.0314, + "step": 2306 + }, + { + "epoch": 
0.4107905982905983, + "grad_norm": 0.4771125912666321, + "learning_rate": 0.0001949196974579365, + "loss": 0.9855, + "step": 2307 + }, + { + "epoch": 0.41096866096866097, + "grad_norm": 0.5375809669494629, + "learning_rate": 0.00019491529176792315, + "loss": 1.0777, + "step": 2308 + }, + { + "epoch": 0.41114672364672367, + "grad_norm": 0.48424094915390015, + "learning_rate": 0.00019491088421824183, + "loss": 1.0751, + "step": 2309 + }, + { + "epoch": 0.4113247863247863, + "grad_norm": 0.5054880380630493, + "learning_rate": 0.00019490647480897887, + "loss": 1.2457, + "step": 2310 + }, + { + "epoch": 0.411502849002849, + "grad_norm": 0.47118356823921204, + "learning_rate": 0.0001949020635402207, + "loss": 1.0445, + "step": 2311 + }, + { + "epoch": 0.4116809116809117, + "grad_norm": 0.47171851992607117, + "learning_rate": 0.00019489765041205375, + "loss": 1.0062, + "step": 2312 + }, + { + "epoch": 0.41185897435897434, + "grad_norm": 0.5703238844871521, + "learning_rate": 0.00019489323542456447, + "loss": 1.5639, + "step": 2313 + }, + { + "epoch": 0.41203703703703703, + "grad_norm": 0.5045075416564941, + "learning_rate": 0.00019488881857783935, + "loss": 1.1665, + "step": 2314 + }, + { + "epoch": 0.41221509971509973, + "grad_norm": 0.46835362911224365, + "learning_rate": 0.00019488439987196495, + "loss": 1.2078, + "step": 2315 + }, + { + "epoch": 0.41239316239316237, + "grad_norm": 0.5187196731567383, + "learning_rate": 0.00019487997930702785, + "loss": 1.1049, + "step": 2316 + }, + { + "epoch": 0.41257122507122507, + "grad_norm": 0.5190554857254028, + "learning_rate": 0.00019487555688311463, + "loss": 1.331, + "step": 2317 + }, + { + "epoch": 0.41274928774928776, + "grad_norm": 0.7394969463348389, + "learning_rate": 0.00019487113260031197, + "loss": 0.9646, + "step": 2318 + }, + { + "epoch": 0.4129273504273504, + "grad_norm": 0.532982349395752, + "learning_rate": 0.00019486670645870656, + "loss": 1.166, + "step": 2319 + }, + { + "epoch": 0.4131054131054131, + 
"grad_norm": 0.48659515380859375, + "learning_rate": 0.00019486227845838509, + "loss": 1.0016, + "step": 2320 + }, + { + "epoch": 0.4132834757834758, + "grad_norm": 0.5364453196525574, + "learning_rate": 0.00019485784859943434, + "loss": 1.3877, + "step": 2321 + }, + { + "epoch": 0.41346153846153844, + "grad_norm": 0.49788740277290344, + "learning_rate": 0.0001948534168819411, + "loss": 1.2949, + "step": 2322 + }, + { + "epoch": 0.41363960113960113, + "grad_norm": 0.5125377774238586, + "learning_rate": 0.00019484898330599217, + "loss": 0.9769, + "step": 2323 + }, + { + "epoch": 0.41381766381766383, + "grad_norm": 0.5434861779212952, + "learning_rate": 0.00019484454787167447, + "loss": 1.254, + "step": 2324 + }, + { + "epoch": 0.41399572649572647, + "grad_norm": 0.5324583053588867, + "learning_rate": 0.00019484011057907487, + "loss": 0.9788, + "step": 2325 + }, + { + "epoch": 0.41417378917378916, + "grad_norm": 0.4806961715221405, + "learning_rate": 0.00019483567142828033, + "loss": 1.0089, + "step": 2326 + }, + { + "epoch": 0.41435185185185186, + "grad_norm": 0.5152947306632996, + "learning_rate": 0.0001948312304193778, + "loss": 1.15, + "step": 2327 + }, + { + "epoch": 0.41452991452991456, + "grad_norm": 0.6030138731002808, + "learning_rate": 0.0001948267875524543, + "loss": 1.196, + "step": 2328 + }, + { + "epoch": 0.4147079772079772, + "grad_norm": 0.4504946768283844, + "learning_rate": 0.0001948223428275969, + "loss": 0.8742, + "step": 2329 + }, + { + "epoch": 0.4148860398860399, + "grad_norm": 0.5195745825767517, + "learning_rate": 0.00019481789624489263, + "loss": 1.0104, + "step": 2330 + }, + { + "epoch": 0.4150641025641026, + "grad_norm": 0.5269250869750977, + "learning_rate": 0.0001948134478044287, + "loss": 1.2284, + "step": 2331 + }, + { + "epoch": 0.41524216524216523, + "grad_norm": 0.5302315354347229, + "learning_rate": 0.00019480899750629218, + "loss": 1.1374, + "step": 2332 + }, + { + "epoch": 0.4154202279202279, + "grad_norm": 0.5501471161842346, + 
"learning_rate": 0.0001948045453505703, + "loss": 1.214, + "step": 2333 + }, + { + "epoch": 0.4155982905982906, + "grad_norm": 0.4674588739871979, + "learning_rate": 0.0001948000913373503, + "loss": 1.0568, + "step": 2334 + }, + { + "epoch": 0.41577635327635326, + "grad_norm": 0.5262266993522644, + "learning_rate": 0.0001947956354667195, + "loss": 1.111, + "step": 2335 + }, + { + "epoch": 0.41595441595441596, + "grad_norm": 0.4549071788787842, + "learning_rate": 0.00019479117773876507, + "loss": 1.2655, + "step": 2336 + }, + { + "epoch": 0.41613247863247865, + "grad_norm": 0.48897311091423035, + "learning_rate": 0.00019478671815357447, + "loss": 1.0543, + "step": 2337 + }, + { + "epoch": 0.4163105413105413, + "grad_norm": 0.5544867515563965, + "learning_rate": 0.000194782256711235, + "loss": 1.2276, + "step": 2338 + }, + { + "epoch": 0.416488603988604, + "grad_norm": 0.5050773024559021, + "learning_rate": 0.0001947777934118341, + "loss": 0.9781, + "step": 2339 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 0.4831899106502533, + "learning_rate": 0.00019477332825545925, + "loss": 1.0213, + "step": 2340 + }, + { + "epoch": 0.4168447293447293, + "grad_norm": 0.5392552614212036, + "learning_rate": 0.0001947688612421979, + "loss": 1.3251, + "step": 2341 + }, + { + "epoch": 0.417022792022792, + "grad_norm": 0.5003608465194702, + "learning_rate": 0.00019476439237213754, + "loss": 1.0714, + "step": 2342 + }, + { + "epoch": 0.4172008547008547, + "grad_norm": 0.5016986727714539, + "learning_rate": 0.00019475992164536582, + "loss": 1.0656, + "step": 2343 + }, + { + "epoch": 0.41737891737891736, + "grad_norm": 0.5139234066009521, + "learning_rate": 0.00019475544906197024, + "loss": 1.1317, + "step": 2344 + }, + { + "epoch": 0.41755698005698005, + "grad_norm": 0.582478940486908, + "learning_rate": 0.00019475097462203847, + "loss": 1.4209, + "step": 2345 + }, + { + "epoch": 0.41773504273504275, + "grad_norm": 0.5248767137527466, + "learning_rate": 0.00019474649832565823, 
+ "loss": 1.2965, + "step": 2346 + }, + { + "epoch": 0.4179131054131054, + "grad_norm": 0.4977390170097351, + "learning_rate": 0.00019474202017291713, + "loss": 1.3319, + "step": 2347 + }, + { + "epoch": 0.4180911680911681, + "grad_norm": 0.4868984818458557, + "learning_rate": 0.00019473754016390298, + "loss": 1.0595, + "step": 2348 + }, + { + "epoch": 0.4182692307692308, + "grad_norm": 0.5965346693992615, + "learning_rate": 0.00019473305829870353, + "loss": 1.2289, + "step": 2349 + }, + { + "epoch": 0.4184472934472934, + "grad_norm": 0.46590209007263184, + "learning_rate": 0.0001947285745774066, + "loss": 1.0468, + "step": 2350 + }, + { + "epoch": 0.4186253561253561, + "grad_norm": 0.497811883687973, + "learning_rate": 0.0001947240890001, + "loss": 1.1247, + "step": 2351 + }, + { + "epoch": 0.4188034188034188, + "grad_norm": 0.5348289012908936, + "learning_rate": 0.0001947196015668717, + "loss": 0.9496, + "step": 2352 + }, + { + "epoch": 0.41898148148148145, + "grad_norm": 0.5086174607276917, + "learning_rate": 0.0001947151122778095, + "loss": 0.8869, + "step": 2353 + }, + { + "epoch": 0.41915954415954415, + "grad_norm": 0.4844677150249481, + "learning_rate": 0.00019471062113300146, + "loss": 0.847, + "step": 2354 + }, + { + "epoch": 0.41933760683760685, + "grad_norm": 0.5395866632461548, + "learning_rate": 0.00019470612813253556, + "loss": 0.9684, + "step": 2355 + }, + { + "epoch": 0.41951566951566954, + "grad_norm": 0.479403018951416, + "learning_rate": 0.0001947016332764998, + "loss": 1.0532, + "step": 2356 + }, + { + "epoch": 0.4196937321937322, + "grad_norm": 0.5499961376190186, + "learning_rate": 0.00019469713656498227, + "loss": 1.2565, + "step": 2357 + }, + { + "epoch": 0.4198717948717949, + "grad_norm": 0.5865352153778076, + "learning_rate": 0.00019469263799807104, + "loss": 1.1349, + "step": 2358 + }, + { + "epoch": 0.4200498575498576, + "grad_norm": 0.4454309046268463, + "learning_rate": 0.00019468813757585432, + "loss": 0.9631, + "step": 2359 + }, + { 
+ "epoch": 0.4202279202279202, + "grad_norm": 0.48426875472068787, + "learning_rate": 0.00019468363529842023, + "loss": 0.9795, + "step": 2360 + }, + { + "epoch": 0.4204059829059829, + "grad_norm": 0.47428226470947266, + "learning_rate": 0.00019467913116585697, + "loss": 0.9316, + "step": 2361 + }, + { + "epoch": 0.4205840455840456, + "grad_norm": 0.5193758010864258, + "learning_rate": 0.00019467462517825282, + "loss": 1.235, + "step": 2362 + }, + { + "epoch": 0.42076210826210825, + "grad_norm": 0.49845513701438904, + "learning_rate": 0.00019467011733569607, + "loss": 1.2413, + "step": 2363 + }, + { + "epoch": 0.42094017094017094, + "grad_norm": 0.45483845472335815, + "learning_rate": 0.00019466560763827502, + "loss": 1.2817, + "step": 2364 + }, + { + "epoch": 0.42111823361823364, + "grad_norm": 0.43345287442207336, + "learning_rate": 0.00019466109608607806, + "loss": 0.8568, + "step": 2365 + }, + { + "epoch": 0.4212962962962963, + "grad_norm": 0.4467088282108307, + "learning_rate": 0.00019465658267919352, + "loss": 1.1408, + "step": 2366 + }, + { + "epoch": 0.421474358974359, + "grad_norm": 0.6705610156059265, + "learning_rate": 0.00019465206741770992, + "loss": 1.445, + "step": 2367 + }, + { + "epoch": 0.42165242165242167, + "grad_norm": 0.5037859678268433, + "learning_rate": 0.00019464755030171565, + "loss": 0.8682, + "step": 2368 + }, + { + "epoch": 0.4218304843304843, + "grad_norm": 0.49576324224472046, + "learning_rate": 0.00019464303133129928, + "loss": 0.8387, + "step": 2369 + }, + { + "epoch": 0.422008547008547, + "grad_norm": 0.5222806334495544, + "learning_rate": 0.00019463851050654927, + "loss": 1.1443, + "step": 2370 + }, + { + "epoch": 0.4221866096866097, + "grad_norm": 0.4966863989830017, + "learning_rate": 0.00019463398782755426, + "loss": 1.1555, + "step": 2371 + }, + { + "epoch": 0.42236467236467234, + "grad_norm": 0.6140168309211731, + "learning_rate": 0.00019462946329440285, + "loss": 1.2264, + "step": 2372 + }, + { + "epoch": 
0.42254273504273504, + "grad_norm": 0.4906651973724365, + "learning_rate": 0.0001946249369071837, + "loss": 1.2459, + "step": 2373 + }, + { + "epoch": 0.42272079772079774, + "grad_norm": 0.5956700444221497, + "learning_rate": 0.00019462040866598544, + "loss": 1.1521, + "step": 2374 + }, + { + "epoch": 0.4228988603988604, + "grad_norm": 0.46044886112213135, + "learning_rate": 0.00019461587857089687, + "loss": 1.2084, + "step": 2375 + }, + { + "epoch": 0.4230769230769231, + "grad_norm": 0.5109430551528931, + "learning_rate": 0.00019461134662200668, + "loss": 1.2684, + "step": 2376 + }, + { + "epoch": 0.42325498575498577, + "grad_norm": 0.4373733103275299, + "learning_rate": 0.0001946068128194037, + "loss": 1.0451, + "step": 2377 + }, + { + "epoch": 0.4234330484330484, + "grad_norm": 0.553817868232727, + "learning_rate": 0.00019460227716317673, + "loss": 1.1052, + "step": 2378 + }, + { + "epoch": 0.4236111111111111, + "grad_norm": 0.5742647647857666, + "learning_rate": 0.00019459773965341468, + "loss": 1.1647, + "step": 2379 + }, + { + "epoch": 0.4237891737891738, + "grad_norm": 0.5461940169334412, + "learning_rate": 0.00019459320029020642, + "loss": 1.0953, + "step": 2380 + }, + { + "epoch": 0.42396723646723644, + "grad_norm": 0.5837802290916443, + "learning_rate": 0.0001945886590736409, + "loss": 1.1303, + "step": 2381 + }, + { + "epoch": 0.42414529914529914, + "grad_norm": 0.5316985249519348, + "learning_rate": 0.0001945841160038071, + "loss": 1.1204, + "step": 2382 + }, + { + "epoch": 0.42432336182336183, + "grad_norm": 0.5846191048622131, + "learning_rate": 0.00019457957108079404, + "loss": 1.2622, + "step": 2383 + }, + { + "epoch": 0.42450142450142453, + "grad_norm": 0.43266957998275757, + "learning_rate": 0.00019457502430469075, + "loss": 0.9834, + "step": 2384 + }, + { + "epoch": 0.42467948717948717, + "grad_norm": 0.514081597328186, + "learning_rate": 0.00019457047567558632, + "loss": 0.8413, + "step": 2385 + }, + { + "epoch": 0.42485754985754987, + 
"grad_norm": 0.4831700325012207, + "learning_rate": 0.00019456592519356987, + "loss": 0.9244, + "step": 2386 + }, + { + "epoch": 0.42503561253561256, + "grad_norm": 0.5612850785255432, + "learning_rate": 0.00019456137285873057, + "loss": 0.9438, + "step": 2387 + }, + { + "epoch": 0.4252136752136752, + "grad_norm": 0.5197352766990662, + "learning_rate": 0.00019455681867115758, + "loss": 1.1095, + "step": 2388 + }, + { + "epoch": 0.4253917378917379, + "grad_norm": 0.5045261979103088, + "learning_rate": 0.00019455226263094018, + "loss": 1.0007, + "step": 2389 + }, + { + "epoch": 0.4255698005698006, + "grad_norm": 0.5167570114135742, + "learning_rate": 0.00019454770473816758, + "loss": 1.1335, + "step": 2390 + }, + { + "epoch": 0.42574786324786323, + "grad_norm": 0.49262070655822754, + "learning_rate": 0.00019454314499292913, + "loss": 1.0436, + "step": 2391 + }, + { + "epoch": 0.42592592592592593, + "grad_norm": 0.4489207863807678, + "learning_rate": 0.00019453858339531417, + "loss": 1.0138, + "step": 2392 + }, + { + "epoch": 0.4261039886039886, + "grad_norm": 0.6024920344352722, + "learning_rate": 0.00019453401994541203, + "loss": 1.1921, + "step": 2393 + }, + { + "epoch": 0.42628205128205127, + "grad_norm": 0.46807861328125, + "learning_rate": 0.00019452945464331215, + "loss": 1.0947, + "step": 2394 + }, + { + "epoch": 0.42646011396011396, + "grad_norm": 0.48776543140411377, + "learning_rate": 0.00019452488748910397, + "loss": 1.0029, + "step": 2395 + }, + { + "epoch": 0.42663817663817666, + "grad_norm": 0.4798663556575775, + "learning_rate": 0.000194520318482877, + "loss": 0.7863, + "step": 2396 + }, + { + "epoch": 0.4268162393162393, + "grad_norm": 0.5067816972732544, + "learning_rate": 0.0001945157476247207, + "loss": 1.0049, + "step": 2397 + }, + { + "epoch": 0.426994301994302, + "grad_norm": 0.5179638266563416, + "learning_rate": 0.00019451117491472468, + "loss": 1.1851, + "step": 2398 + }, + { + "epoch": 0.4271723646723647, + "grad_norm": 0.4782430827617645, + 
"learning_rate": 0.00019450660035297854, + "loss": 1.125, + "step": 2399 + }, + { + "epoch": 0.42735042735042733, + "grad_norm": 0.560077965259552, + "learning_rate": 0.00019450202393957186, + "loss": 1.1843, + "step": 2400 + }, + { + "epoch": 0.42752849002849, + "grad_norm": 0.5247970223426819, + "learning_rate": 0.00019449744567459436, + "loss": 1.1576, + "step": 2401 + }, + { + "epoch": 0.4277065527065527, + "grad_norm": 0.6414062976837158, + "learning_rate": 0.00019449286555813568, + "loss": 1.1833, + "step": 2402 + }, + { + "epoch": 0.42788461538461536, + "grad_norm": 0.5006586909294128, + "learning_rate": 0.00019448828359028563, + "loss": 1.1778, + "step": 2403 + }, + { + "epoch": 0.42806267806267806, + "grad_norm": 0.4946450889110565, + "learning_rate": 0.0001944836997711339, + "loss": 1.1611, + "step": 2404 + }, + { + "epoch": 0.42824074074074076, + "grad_norm": 0.4601200222969055, + "learning_rate": 0.00019447911410077037, + "loss": 1.2456, + "step": 2405 + }, + { + "epoch": 0.4284188034188034, + "grad_norm": 0.4653947651386261, + "learning_rate": 0.00019447452657928485, + "loss": 1.0941, + "step": 2406 + }, + { + "epoch": 0.4285968660968661, + "grad_norm": 0.5015713572502136, + "learning_rate": 0.00019446993720676726, + "loss": 1.3113, + "step": 2407 + }, + { + "epoch": 0.4287749287749288, + "grad_norm": 0.5803143978118896, + "learning_rate": 0.0001944653459833075, + "loss": 1.0568, + "step": 2408 + }, + { + "epoch": 0.42895299145299143, + "grad_norm": 0.5259647965431213, + "learning_rate": 0.0001944607529089955, + "loss": 1.1243, + "step": 2409 + }, + { + "epoch": 0.4291310541310541, + "grad_norm": 0.5150414109230042, + "learning_rate": 0.00019445615798392124, + "loss": 1.0676, + "step": 2410 + }, + { + "epoch": 0.4293091168091168, + "grad_norm": 0.5848649740219116, + "learning_rate": 0.0001944515612081748, + "loss": 1.0671, + "step": 2411 + }, + { + "epoch": 0.42948717948717946, + "grad_norm": 0.5696990489959717, + "learning_rate": 
0.00019444696258184626, + "loss": 1.3323, + "step": 2412 + }, + { + "epoch": 0.42966524216524216, + "grad_norm": 0.49822330474853516, + "learning_rate": 0.00019444236210502567, + "loss": 1.1004, + "step": 2413 + }, + { + "epoch": 0.42984330484330485, + "grad_norm": 0.4683490991592407, + "learning_rate": 0.00019443775977780317, + "loss": 0.9768, + "step": 2414 + }, + { + "epoch": 0.43002136752136755, + "grad_norm": 0.5703811049461365, + "learning_rate": 0.00019443315560026893, + "loss": 1.154, + "step": 2415 + }, + { + "epoch": 0.4301994301994302, + "grad_norm": 0.5121861100196838, + "learning_rate": 0.0001944285495725132, + "loss": 1.1388, + "step": 2416 + }, + { + "epoch": 0.4303774928774929, + "grad_norm": 0.4864094853401184, + "learning_rate": 0.00019442394169462619, + "loss": 0.9214, + "step": 2417 + }, + { + "epoch": 0.4305555555555556, + "grad_norm": 0.5234864354133606, + "learning_rate": 0.0001944193319666982, + "loss": 1.2787, + "step": 2418 + }, + { + "epoch": 0.4307336182336182, + "grad_norm": 0.5137650370597839, + "learning_rate": 0.00019441472038881955, + "loss": 1.1406, + "step": 2419 + }, + { + "epoch": 0.4309116809116809, + "grad_norm": 0.49687784910202026, + "learning_rate": 0.00019441010696108054, + "loss": 0.93, + "step": 2420 + }, + { + "epoch": 0.4310897435897436, + "grad_norm": 0.5078722834587097, + "learning_rate": 0.00019440549168357163, + "loss": 1.1417, + "step": 2421 + }, + { + "epoch": 0.43126780626780625, + "grad_norm": 0.4483391046524048, + "learning_rate": 0.00019440087455638324, + "loss": 0.9016, + "step": 2422 + }, + { + "epoch": 0.43144586894586895, + "grad_norm": 0.5963045954704285, + "learning_rate": 0.00019439625557960576, + "loss": 1.1567, + "step": 2423 + }, + { + "epoch": 0.43162393162393164, + "grad_norm": 0.5534471273422241, + "learning_rate": 0.0001943916347533298, + "loss": 1.1409, + "step": 2424 + }, + { + "epoch": 0.4318019943019943, + "grad_norm": 0.6400241851806641, + "learning_rate": 0.0001943870120776458, + "loss": 
1.2041, + "step": 2425 + }, + { + "epoch": 0.431980056980057, + "grad_norm": 0.4599420726299286, + "learning_rate": 0.0001943823875526444, + "loss": 1.023, + "step": 2426 + }, + { + "epoch": 0.4321581196581197, + "grad_norm": 0.4799708425998688, + "learning_rate": 0.00019437776117841614, + "loss": 1.0872, + "step": 2427 + }, + { + "epoch": 0.4323361823361823, + "grad_norm": 0.5138532519340515, + "learning_rate": 0.00019437313295505172, + "loss": 1.1175, + "step": 2428 + }, + { + "epoch": 0.432514245014245, + "grad_norm": 0.538223147392273, + "learning_rate": 0.00019436850288264183, + "loss": 1.1203, + "step": 2429 + }, + { + "epoch": 0.4326923076923077, + "grad_norm": 0.458044171333313, + "learning_rate": 0.00019436387096127713, + "loss": 1.0383, + "step": 2430 + }, + { + "epoch": 0.43287037037037035, + "grad_norm": 0.5928303599357605, + "learning_rate": 0.00019435923719104842, + "loss": 1.1191, + "step": 2431 + }, + { + "epoch": 0.43304843304843305, + "grad_norm": 0.5818437933921814, + "learning_rate": 0.00019435460157204645, + "loss": 1.0352, + "step": 2432 + }, + { + "epoch": 0.43322649572649574, + "grad_norm": 0.487341046333313, + "learning_rate": 0.0001943499641043621, + "loss": 1.2608, + "step": 2433 + }, + { + "epoch": 0.4334045584045584, + "grad_norm": 0.4737292230129242, + "learning_rate": 0.0001943453247880862, + "loss": 1.0084, + "step": 2434 + }, + { + "epoch": 0.4335826210826211, + "grad_norm": 0.4251207709312439, + "learning_rate": 0.0001943406836233096, + "loss": 0.9163, + "step": 2435 + }, + { + "epoch": 0.4337606837606838, + "grad_norm": 0.49468478560447693, + "learning_rate": 0.00019433604061012331, + "loss": 1.0293, + "step": 2436 + }, + { + "epoch": 0.4339387464387464, + "grad_norm": 0.47120022773742676, + "learning_rate": 0.00019433139574861826, + "loss": 1.0097, + "step": 2437 + }, + { + "epoch": 0.4341168091168091, + "grad_norm": 0.5060358047485352, + "learning_rate": 0.00019432674903888548, + "loss": 1.0683, + "step": 2438 + }, + { + 
"epoch": 0.4342948717948718, + "grad_norm": 0.5455917119979858, + "learning_rate": 0.00019432210048101598, + "loss": 0.8886, + "step": 2439 + }, + { + "epoch": 0.43447293447293445, + "grad_norm": 0.7960546612739563, + "learning_rate": 0.00019431745007510086, + "loss": 0.8648, + "step": 2440 + }, + { + "epoch": 0.43465099715099714, + "grad_norm": 0.5069689154624939, + "learning_rate": 0.00019431279782123126, + "loss": 1.1315, + "step": 2441 + }, + { + "epoch": 0.43482905982905984, + "grad_norm": 0.5597776174545288, + "learning_rate": 0.0001943081437194983, + "loss": 1.2281, + "step": 2442 + }, + { + "epoch": 0.43500712250712253, + "grad_norm": 0.4527420997619629, + "learning_rate": 0.00019430348776999315, + "loss": 0.7576, + "step": 2443 + }, + { + "epoch": 0.4351851851851852, + "grad_norm": 0.5625936388969421, + "learning_rate": 0.00019429882997280706, + "loss": 1.0302, + "step": 2444 + }, + { + "epoch": 0.43536324786324787, + "grad_norm": 0.5173513293266296, + "learning_rate": 0.0001942941703280313, + "loss": 1.2255, + "step": 2445 + }, + { + "epoch": 0.43554131054131057, + "grad_norm": 0.45889151096343994, + "learning_rate": 0.00019428950883575714, + "loss": 0.9322, + "step": 2446 + }, + { + "epoch": 0.4357193732193732, + "grad_norm": 0.5288477540016174, + "learning_rate": 0.00019428484549607593, + "loss": 1.0572, + "step": 2447 + }, + { + "epoch": 0.4358974358974359, + "grad_norm": 0.48328033089637756, + "learning_rate": 0.00019428018030907902, + "loss": 1.1213, + "step": 2448 + }, + { + "epoch": 0.4360754985754986, + "grad_norm": 0.5146737098693848, + "learning_rate": 0.00019427551327485786, + "loss": 0.9633, + "step": 2449 + }, + { + "epoch": 0.43625356125356124, + "grad_norm": 0.5138360261917114, + "learning_rate": 0.00019427084439350382, + "loss": 1.0561, + "step": 2450 + }, + { + "epoch": 0.43643162393162394, + "grad_norm": 0.5192533135414124, + "learning_rate": 0.00019426617366510843, + "loss": 1.1704, + "step": 2451 + }, + { + "epoch": 
0.43660968660968663, + "grad_norm": 0.4819495379924774, + "learning_rate": 0.00019426150108976318, + "loss": 1.0958, + "step": 2452 + }, + { + "epoch": 0.43678774928774927, + "grad_norm": 0.4626680910587311, + "learning_rate": 0.00019425682666755965, + "loss": 1.1872, + "step": 2453 + }, + { + "epoch": 0.43696581196581197, + "grad_norm": 0.5773931741714478, + "learning_rate": 0.00019425215039858937, + "loss": 1.0722, + "step": 2454 + }, + { + "epoch": 0.43714387464387466, + "grad_norm": 0.5003872513771057, + "learning_rate": 0.00019424747228294402, + "loss": 1.0561, + "step": 2455 + }, + { + "epoch": 0.4373219373219373, + "grad_norm": 0.47370314598083496, + "learning_rate": 0.0001942427923207152, + "loss": 1.1619, + "step": 2456 + }, + { + "epoch": 0.4375, + "grad_norm": 0.466421514749527, + "learning_rate": 0.00019423811051199466, + "loss": 1.1311, + "step": 2457 + }, + { + "epoch": 0.4376780626780627, + "grad_norm": 0.44564682245254517, + "learning_rate": 0.00019423342685687413, + "loss": 1.1889, + "step": 2458 + }, + { + "epoch": 0.43785612535612534, + "grad_norm": 0.40986698865890503, + "learning_rate": 0.00019422874135544533, + "loss": 0.7312, + "step": 2459 + }, + { + "epoch": 0.43803418803418803, + "grad_norm": 0.4714358448982239, + "learning_rate": 0.0001942240540078001, + "loss": 0.9273, + "step": 2460 + }, + { + "epoch": 0.43821225071225073, + "grad_norm": 0.5298398733139038, + "learning_rate": 0.00019421936481403025, + "loss": 1.3377, + "step": 2461 + }, + { + "epoch": 0.43839031339031337, + "grad_norm": 0.6326695680618286, + "learning_rate": 0.0001942146737742277, + "loss": 1.0258, + "step": 2462 + }, + { + "epoch": 0.43856837606837606, + "grad_norm": 0.5087653994560242, + "learning_rate": 0.00019420998088848427, + "loss": 1.0007, + "step": 2463 + }, + { + "epoch": 0.43874643874643876, + "grad_norm": 0.4895429313182831, + "learning_rate": 0.00019420528615689202, + "loss": 1.0032, + "step": 2464 + }, + { + "epoch": 0.4389245014245014, + "grad_norm": 
0.5029937028884888, + "learning_rate": 0.00019420058957954285, + "loss": 1.2877, + "step": 2465 + }, + { + "epoch": 0.4391025641025641, + "grad_norm": 0.4953192174434662, + "learning_rate": 0.00019419589115652884, + "loss": 1.0759, + "step": 2466 + }, + { + "epoch": 0.4392806267806268, + "grad_norm": 0.5081778168678284, + "learning_rate": 0.000194191190887942, + "loss": 0.8816, + "step": 2467 + }, + { + "epoch": 0.43945868945868943, + "grad_norm": 0.5065913200378418, + "learning_rate": 0.00019418648877387446, + "loss": 1.0362, + "step": 2468 + }, + { + "epoch": 0.43963675213675213, + "grad_norm": 0.540600061416626, + "learning_rate": 0.00019418178481441832, + "loss": 1.0911, + "step": 2469 + }, + { + "epoch": 0.4398148148148148, + "grad_norm": 0.5122954845428467, + "learning_rate": 0.00019417707900966572, + "loss": 0.9866, + "step": 2470 + }, + { + "epoch": 0.43999287749287747, + "grad_norm": 0.5380190014839172, + "learning_rate": 0.00019417237135970893, + "loss": 1.2775, + "step": 2471 + }, + { + "epoch": 0.44017094017094016, + "grad_norm": 1.2977570295333862, + "learning_rate": 0.00019416766186464016, + "loss": 1.3993, + "step": 2472 + }, + { + "epoch": 0.44034900284900286, + "grad_norm": 0.48105308413505554, + "learning_rate": 0.00019416295052455165, + "loss": 0.9369, + "step": 2473 + }, + { + "epoch": 0.44052706552706555, + "grad_norm": 0.4742157459259033, + "learning_rate": 0.00019415823733953574, + "loss": 1.101, + "step": 2474 + }, + { + "epoch": 0.4407051282051282, + "grad_norm": 0.4958631694316864, + "learning_rate": 0.00019415352230968473, + "loss": 0.9906, + "step": 2475 + }, + { + "epoch": 0.4408831908831909, + "grad_norm": 0.5808146595954895, + "learning_rate": 0.00019414880543509107, + "loss": 1.2315, + "step": 2476 + }, + { + "epoch": 0.4410612535612536, + "grad_norm": 0.4294755160808563, + "learning_rate": 0.00019414408671584714, + "loss": 0.8275, + "step": 2477 + }, + { + "epoch": 0.4412393162393162, + "grad_norm": 0.5346055626869202, + 
"learning_rate": 0.0001941393661520454, + "loss": 1.2432, + "step": 2478 + }, + { + "epoch": 0.4414173789173789, + "grad_norm": 0.5827590227127075, + "learning_rate": 0.00019413464374377833, + "loss": 1.3204, + "step": 2479 + }, + { + "epoch": 0.4415954415954416, + "grad_norm": 0.45688143372535706, + "learning_rate": 0.00019412991949113847, + "loss": 0.9307, + "step": 2480 + }, + { + "epoch": 0.44177350427350426, + "grad_norm": 0.512999415397644, + "learning_rate": 0.0001941251933942184, + "loss": 1.2808, + "step": 2481 + }, + { + "epoch": 0.44195156695156695, + "grad_norm": 0.4546334445476532, + "learning_rate": 0.00019412046545311064, + "loss": 1.0156, + "step": 2482 + }, + { + "epoch": 0.44212962962962965, + "grad_norm": 0.48552581667900085, + "learning_rate": 0.00019411573566790793, + "loss": 1.3798, + "step": 2483 + }, + { + "epoch": 0.4423076923076923, + "grad_norm": 0.511970579624176, + "learning_rate": 0.00019411100403870287, + "loss": 1.065, + "step": 2484 + }, + { + "epoch": 0.442485754985755, + "grad_norm": 0.6367824077606201, + "learning_rate": 0.00019410627056558815, + "loss": 1.3242, + "step": 2485 + }, + { + "epoch": 0.4426638176638177, + "grad_norm": 0.48913368582725525, + "learning_rate": 0.00019410153524865659, + "loss": 0.9761, + "step": 2486 + }, + { + "epoch": 0.4428418803418803, + "grad_norm": 0.5077710151672363, + "learning_rate": 0.0001940967980880009, + "loss": 1.1023, + "step": 2487 + }, + { + "epoch": 0.443019943019943, + "grad_norm": 0.4956335723400116, + "learning_rate": 0.00019409205908371395, + "loss": 1.1788, + "step": 2488 + }, + { + "epoch": 0.4431980056980057, + "grad_norm": 0.4726616442203522, + "learning_rate": 0.00019408731823588853, + "loss": 1.1445, + "step": 2489 + }, + { + "epoch": 0.44337606837606836, + "grad_norm": 0.5676438212394714, + "learning_rate": 0.00019408257554461757, + "loss": 1.0344, + "step": 2490 + }, + { + "epoch": 0.44355413105413105, + "grad_norm": 0.537656843662262, + "learning_rate": 
0.000194077831009994, + "loss": 0.9876, + "step": 2491 + }, + { + "epoch": 0.44373219373219375, + "grad_norm": 0.517905592918396, + "learning_rate": 0.00019407308463211074, + "loss": 1.1389, + "step": 2492 + }, + { + "epoch": 0.4439102564102564, + "grad_norm": 0.49227026104927063, + "learning_rate": 0.0001940683364110608, + "loss": 1.0351, + "step": 2493 + }, + { + "epoch": 0.4440883190883191, + "grad_norm": 0.5131173729896545, + "learning_rate": 0.00019406358634693725, + "loss": 1.0351, + "step": 2494 + }, + { + "epoch": 0.4442663817663818, + "grad_norm": 0.5064495205879211, + "learning_rate": 0.0001940588344398331, + "loss": 1.0248, + "step": 2495 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.44107526540756226, + "learning_rate": 0.00019405408068984148, + "loss": 0.8068, + "step": 2496 + }, + { + "epoch": 0.4446225071225071, + "grad_norm": 0.6711848378181458, + "learning_rate": 0.00019404932509705554, + "loss": 1.059, + "step": 2497 + }, + { + "epoch": 0.4448005698005698, + "grad_norm": 0.5862596035003662, + "learning_rate": 0.00019404456766156845, + "loss": 1.2012, + "step": 2498 + }, + { + "epoch": 0.44497863247863245, + "grad_norm": 0.5528512001037598, + "learning_rate": 0.0001940398083834734, + "loss": 1.1121, + "step": 2499 + }, + { + "epoch": 0.44515669515669515, + "grad_norm": 0.5326655507087708, + "learning_rate": 0.0001940350472628637, + "loss": 1.166, + "step": 2500 + }, + { + "epoch": 0.44533475783475784, + "grad_norm": 0.5384873747825623, + "learning_rate": 0.00019403028429983252, + "loss": 1.4111, + "step": 2501 + }, + { + "epoch": 0.44551282051282054, + "grad_norm": 0.5142310857772827, + "learning_rate": 0.0001940255194944733, + "loss": 1.3353, + "step": 2502 + }, + { + "epoch": 0.4456908831908832, + "grad_norm": 0.49124231934547424, + "learning_rate": 0.0001940207528468793, + "loss": 1.1443, + "step": 2503 + }, + { + "epoch": 0.4458689458689459, + "grad_norm": 0.509713888168335, + "learning_rate": 0.000194015984357144, + "loss": 1.1857, 
+ "step": 2504 + }, + { + "epoch": 0.44604700854700857, + "grad_norm": 0.5211275219917297, + "learning_rate": 0.00019401121402536078, + "loss": 0.9911, + "step": 2505 + }, + { + "epoch": 0.4462250712250712, + "grad_norm": 0.480340838432312, + "learning_rate": 0.00019400644185162312, + "loss": 1.1018, + "step": 2506 + }, + { + "epoch": 0.4464031339031339, + "grad_norm": 0.4212559163570404, + "learning_rate": 0.00019400166783602448, + "loss": 0.7501, + "step": 2507 + }, + { + "epoch": 0.4465811965811966, + "grad_norm": 0.5110511183738708, + "learning_rate": 0.00019399689197865846, + "loss": 1.1244, + "step": 2508 + }, + { + "epoch": 0.44675925925925924, + "grad_norm": 0.5604230165481567, + "learning_rate": 0.0001939921142796186, + "loss": 1.1066, + "step": 2509 + }, + { + "epoch": 0.44693732193732194, + "grad_norm": 0.5578675270080566, + "learning_rate": 0.0001939873347389985, + "loss": 1.0514, + "step": 2510 + }, + { + "epoch": 0.44711538461538464, + "grad_norm": 0.520908772945404, + "learning_rate": 0.00019398255335689184, + "loss": 1.1217, + "step": 2511 + }, + { + "epoch": 0.4472934472934473, + "grad_norm": 0.4405131936073303, + "learning_rate": 0.00019397777013339224, + "loss": 1.043, + "step": 2512 + }, + { + "epoch": 0.44747150997151, + "grad_norm": 0.5217751860618591, + "learning_rate": 0.0001939729850685935, + "loss": 1.1301, + "step": 2513 + }, + { + "epoch": 0.44764957264957267, + "grad_norm": 0.6151493191719055, + "learning_rate": 0.00019396819816258932, + "loss": 1.3498, + "step": 2514 + }, + { + "epoch": 0.4478276353276353, + "grad_norm": 0.5622836947441101, + "learning_rate": 0.0001939634094154735, + "loss": 1.146, + "step": 2515 + }, + { + "epoch": 0.448005698005698, + "grad_norm": 0.4671688973903656, + "learning_rate": 0.00019395861882733984, + "loss": 0.9456, + "step": 2516 + }, + { + "epoch": 0.4481837606837607, + "grad_norm": 0.453951358795166, + "learning_rate": 0.00019395382639828223, + "loss": 1.0042, + "step": 2517 + }, + { + "epoch": 
0.44836182336182334, + "grad_norm": 0.5150699615478516, + "learning_rate": 0.0001939490321283946, + "loss": 1.166, + "step": 2518 + }, + { + "epoch": 0.44853988603988604, + "grad_norm": 0.5718298554420471, + "learning_rate": 0.0001939442360177708, + "loss": 1.2033, + "step": 2519 + }, + { + "epoch": 0.44871794871794873, + "grad_norm": 0.5306782126426697, + "learning_rate": 0.00019393943806650488, + "loss": 1.0765, + "step": 2520 + }, + { + "epoch": 0.4488960113960114, + "grad_norm": 0.47633033990859985, + "learning_rate": 0.0001939346382746908, + "loss": 0.9957, + "step": 2521 + }, + { + "epoch": 0.44907407407407407, + "grad_norm": 0.496441513299942, + "learning_rate": 0.00019392983664242262, + "loss": 1.2016, + "step": 2522 + }, + { + "epoch": 0.44925213675213677, + "grad_norm": 0.45956477522850037, + "learning_rate": 0.00019392503316979442, + "loss": 1.026, + "step": 2523 + }, + { + "epoch": 0.4494301994301994, + "grad_norm": 0.5400575995445251, + "learning_rate": 0.0001939202278569003, + "loss": 1.0785, + "step": 2524 + }, + { + "epoch": 0.4496082621082621, + "grad_norm": 0.4847868084907532, + "learning_rate": 0.00019391542070383442, + "loss": 1.013, + "step": 2525 + }, + { + "epoch": 0.4497863247863248, + "grad_norm": 0.4694063663482666, + "learning_rate": 0.00019391061171069094, + "loss": 0.8793, + "step": 2526 + }, + { + "epoch": 0.44996438746438744, + "grad_norm": 0.5158169269561768, + "learning_rate": 0.00019390580087756413, + "loss": 0.9602, + "step": 2527 + }, + { + "epoch": 0.45014245014245013, + "grad_norm": 0.5404585003852844, + "learning_rate": 0.00019390098820454822, + "loss": 1.2247, + "step": 2528 + }, + { + "epoch": 0.45032051282051283, + "grad_norm": 0.5302738547325134, + "learning_rate": 0.00019389617369173752, + "loss": 0.918, + "step": 2529 + }, + { + "epoch": 0.45049857549857547, + "grad_norm": 0.5065485835075378, + "learning_rate": 0.00019389135733922634, + "loss": 1.0934, + "step": 2530 + }, + { + "epoch": 0.45067663817663817, + 
"grad_norm": 0.5491471886634827, + "learning_rate": 0.00019388653914710903, + "loss": 1.0736, + "step": 2531 + }, + { + "epoch": 0.45085470085470086, + "grad_norm": 0.4850206971168518, + "learning_rate": 0.00019388171911548005, + "loss": 1.2401, + "step": 2532 + }, + { + "epoch": 0.45103276353276356, + "grad_norm": 0.5419789552688599, + "learning_rate": 0.0001938768972444338, + "loss": 1.269, + "step": 2533 + }, + { + "epoch": 0.4512108262108262, + "grad_norm": 0.4209023714065552, + "learning_rate": 0.00019387207353406476, + "loss": 1.0544, + "step": 2534 + }, + { + "epoch": 0.4513888888888889, + "grad_norm": 0.578588604927063, + "learning_rate": 0.00019386724798446743, + "loss": 1.0564, + "step": 2535 + }, + { + "epoch": 0.4515669515669516, + "grad_norm": 0.5277524590492249, + "learning_rate": 0.00019386242059573638, + "loss": 1.1497, + "step": 2536 + }, + { + "epoch": 0.45174501424501423, + "grad_norm": 0.5536073446273804, + "learning_rate": 0.0001938575913679662, + "loss": 1.2213, + "step": 2537 + }, + { + "epoch": 0.4519230769230769, + "grad_norm": 0.5572254657745361, + "learning_rate": 0.00019385276030125143, + "loss": 1.0231, + "step": 2538 + }, + { + "epoch": 0.4521011396011396, + "grad_norm": 0.493847131729126, + "learning_rate": 0.00019384792739568686, + "loss": 0.9385, + "step": 2539 + }, + { + "epoch": 0.45227920227920226, + "grad_norm": 0.4641396403312683, + "learning_rate": 0.00019384309265136707, + "loss": 0.9332, + "step": 2540 + }, + { + "epoch": 0.45245726495726496, + "grad_norm": 0.5439442992210388, + "learning_rate": 0.00019383825606838681, + "loss": 1.317, + "step": 2541 + }, + { + "epoch": 0.45263532763532766, + "grad_norm": 0.7050970792770386, + "learning_rate": 0.00019383341764684086, + "loss": 0.9508, + "step": 2542 + }, + { + "epoch": 0.4528133903133903, + "grad_norm": 0.5013265013694763, + "learning_rate": 0.000193828577386824, + "loss": 1.2704, + "step": 2543 + }, + { + "epoch": 0.452991452991453, + "grad_norm": 0.47641924023628235, + 
"learning_rate": 0.0001938237352884311, + "loss": 1.0101, + "step": 2544 + }, + { + "epoch": 0.4531695156695157, + "grad_norm": 0.5223637819290161, + "learning_rate": 0.000193818891351757, + "loss": 1.0548, + "step": 2545 + }, + { + "epoch": 0.45334757834757833, + "grad_norm": 0.49065667390823364, + "learning_rate": 0.0001938140455768966, + "loss": 1.0927, + "step": 2546 + }, + { + "epoch": 0.453525641025641, + "grad_norm": 0.4808312654495239, + "learning_rate": 0.0001938091979639449, + "loss": 1.0599, + "step": 2547 + }, + { + "epoch": 0.4537037037037037, + "grad_norm": 0.5157489776611328, + "learning_rate": 0.0001938043485129968, + "loss": 1.2596, + "step": 2548 + }, + { + "epoch": 0.45388176638176636, + "grad_norm": 0.5983387231826782, + "learning_rate": 0.0001937994972241474, + "loss": 1.2276, + "step": 2549 + }, + { + "epoch": 0.45405982905982906, + "grad_norm": 0.49776506423950195, + "learning_rate": 0.00019379464409749163, + "loss": 1.3666, + "step": 2550 + }, + { + "epoch": 0.45423789173789175, + "grad_norm": 0.4693490266799927, + "learning_rate": 0.00019378978913312471, + "loss": 1.087, + "step": 2551 + }, + { + "epoch": 0.4544159544159544, + "grad_norm": 0.4754335880279541, + "learning_rate": 0.00019378493233114167, + "loss": 1.1282, + "step": 2552 + }, + { + "epoch": 0.4545940170940171, + "grad_norm": 0.5852862000465393, + "learning_rate": 0.00019378007369163776, + "loss": 1.1113, + "step": 2553 + }, + { + "epoch": 0.4547720797720798, + "grad_norm": 0.47442635893821716, + "learning_rate": 0.00019377521321470805, + "loss": 0.983, + "step": 2554 + }, + { + "epoch": 0.4549501424501424, + "grad_norm": 0.47432273626327515, + "learning_rate": 0.00019377035090044787, + "loss": 1.0169, + "step": 2555 + }, + { + "epoch": 0.4551282051282051, + "grad_norm": 0.4929196834564209, + "learning_rate": 0.00019376548674895246, + "loss": 1.0182, + "step": 2556 + }, + { + "epoch": 0.4553062678062678, + "grad_norm": 0.5433184504508972, + "learning_rate": 
0.00019376062076031708, + "loss": 1.1339, + "step": 2557 + }, + { + "epoch": 0.45548433048433046, + "grad_norm": 0.47430408000946045, + "learning_rate": 0.00019375575293463715, + "loss": 1.1589, + "step": 2558 + }, + { + "epoch": 0.45566239316239315, + "grad_norm": 0.46641045808792114, + "learning_rate": 0.000193750883272008, + "loss": 1.029, + "step": 2559 + }, + { + "epoch": 0.45584045584045585, + "grad_norm": 0.44476228952407837, + "learning_rate": 0.00019374601177252502, + "loss": 0.8494, + "step": 2560 + }, + { + "epoch": 0.45601851851851855, + "grad_norm": 0.4886183440685272, + "learning_rate": 0.00019374113843628366, + "loss": 1.1374, + "step": 2561 + }, + { + "epoch": 0.4561965811965812, + "grad_norm": 0.4786703288555145, + "learning_rate": 0.00019373626326337946, + "loss": 1.2861, + "step": 2562 + }, + { + "epoch": 0.4563746438746439, + "grad_norm": 0.5752716660499573, + "learning_rate": 0.0001937313862539079, + "loss": 1.2365, + "step": 2563 + }, + { + "epoch": 0.4565527065527066, + "grad_norm": 0.519176185131073, + "learning_rate": 0.00019372650740796452, + "loss": 1.2264, + "step": 2564 + }, + { + "epoch": 0.4567307692307692, + "grad_norm": 0.5927292704582214, + "learning_rate": 0.00019372162672564493, + "loss": 0.8979, + "step": 2565 + }, + { + "epoch": 0.4569088319088319, + "grad_norm": 0.5467435121536255, + "learning_rate": 0.00019371674420704478, + "loss": 1.1016, + "step": 2566 + }, + { + "epoch": 0.4570868945868946, + "grad_norm": 0.49593284726142883, + "learning_rate": 0.00019371185985225968, + "loss": 0.982, + "step": 2567 + }, + { + "epoch": 0.45726495726495725, + "grad_norm": 0.5696587562561035, + "learning_rate": 0.00019370697366138538, + "loss": 0.979, + "step": 2568 + }, + { + "epoch": 0.45744301994301995, + "grad_norm": 0.4455752968788147, + "learning_rate": 0.00019370208563451757, + "loss": 0.8832, + "step": 2569 + }, + { + "epoch": 0.45762108262108264, + "grad_norm": 0.5072923302650452, + "learning_rate": 0.00019369719577175203, + 
"loss": 1.1046, + "step": 2570 + }, + { + "epoch": 0.4577991452991453, + "grad_norm": 0.45119982957839966, + "learning_rate": 0.0001936923040731846, + "loss": 1.0083, + "step": 2571 + }, + { + "epoch": 0.457977207977208, + "grad_norm": 0.5062251091003418, + "learning_rate": 0.00019368741053891108, + "loss": 1.2771, + "step": 2572 + }, + { + "epoch": 0.4581552706552707, + "grad_norm": 0.5511104464530945, + "learning_rate": 0.0001936825151690274, + "loss": 1.0039, + "step": 2573 + }, + { + "epoch": 0.4583333333333333, + "grad_norm": 0.4721006453037262, + "learning_rate": 0.0001936776179636294, + "loss": 1.3246, + "step": 2574 + }, + { + "epoch": 0.458511396011396, + "grad_norm": 0.5021488666534424, + "learning_rate": 0.0001936727189228131, + "loss": 1.1733, + "step": 2575 + }, + { + "epoch": 0.4586894586894587, + "grad_norm": 0.5755292177200317, + "learning_rate": 0.0001936678180466745, + "loss": 1.2241, + "step": 2576 + }, + { + "epoch": 0.45886752136752135, + "grad_norm": 0.4501610994338989, + "learning_rate": 0.00019366291533530952, + "loss": 1.0503, + "step": 2577 + }, + { + "epoch": 0.45904558404558404, + "grad_norm": 0.4067458212375641, + "learning_rate": 0.00019365801078881432, + "loss": 0.8259, + "step": 2578 + }, + { + "epoch": 0.45922364672364674, + "grad_norm": 0.539730429649353, + "learning_rate": 0.0001936531044072849, + "loss": 1.1964, + "step": 2579 + }, + { + "epoch": 0.4594017094017094, + "grad_norm": 0.5624797344207764, + "learning_rate": 0.0001936481961908175, + "loss": 1.2059, + "step": 2580 + }, + { + "epoch": 0.4595797720797721, + "grad_norm": 0.43679240345954895, + "learning_rate": 0.00019364328613950824, + "loss": 1.1371, + "step": 2581 + }, + { + "epoch": 0.45975783475783477, + "grad_norm": 0.5214769244194031, + "learning_rate": 0.00019363837425345328, + "loss": 1.109, + "step": 2582 + }, + { + "epoch": 0.4599358974358974, + "grad_norm": 0.4522894024848938, + "learning_rate": 0.00019363346053274892, + "loss": 1.0532, + "step": 2583 + }, + { + 
"epoch": 0.4601139601139601, + "grad_norm": 0.44980281591415405, + "learning_rate": 0.0001936285449774914, + "loss": 0.9352, + "step": 2584 + }, + { + "epoch": 0.4602920227920228, + "grad_norm": 0.5697414875030518, + "learning_rate": 0.00019362362758777705, + "loss": 1.2171, + "step": 2585 + }, + { + "epoch": 0.46047008547008544, + "grad_norm": 0.4636315107345581, + "learning_rate": 0.00019361870836370217, + "loss": 1.0662, + "step": 2586 + }, + { + "epoch": 0.46064814814814814, + "grad_norm": 0.5144017338752747, + "learning_rate": 0.00019361378730536321, + "loss": 1.0681, + "step": 2587 + }, + { + "epoch": 0.46082621082621084, + "grad_norm": 0.5007636547088623, + "learning_rate": 0.00019360886441285654, + "loss": 1.2058, + "step": 2588 + }, + { + "epoch": 0.46100427350427353, + "grad_norm": 0.5024117231369019, + "learning_rate": 0.00019360393968627864, + "loss": 1.065, + "step": 2589 + }, + { + "epoch": 0.46118233618233617, + "grad_norm": 0.48105588555336, + "learning_rate": 0.00019359901312572596, + "loss": 1.0887, + "step": 2590 + }, + { + "epoch": 0.46136039886039887, + "grad_norm": 0.5381982326507568, + "learning_rate": 0.00019359408473129506, + "loss": 1.2754, + "step": 2591 + }, + { + "epoch": 0.46153846153846156, + "grad_norm": 0.5051333904266357, + "learning_rate": 0.0001935891545030825, + "loss": 0.9334, + "step": 2592 + }, + { + "epoch": 0.4617165242165242, + "grad_norm": 0.43818601965904236, + "learning_rate": 0.0001935842224411849, + "loss": 1.0967, + "step": 2593 + }, + { + "epoch": 0.4618945868945869, + "grad_norm": 0.4727257490158081, + "learning_rate": 0.0001935792885456988, + "loss": 0.8136, + "step": 2594 + }, + { + "epoch": 0.4620726495726496, + "grad_norm": 0.5505291223526001, + "learning_rate": 0.00019357435281672098, + "loss": 1.3113, + "step": 2595 + }, + { + "epoch": 0.46225071225071224, + "grad_norm": 0.4705682396888733, + "learning_rate": 0.0001935694152543481, + "loss": 0.9863, + "step": 2596 + }, + { + "epoch": 0.46242877492877493, + 
"grad_norm": 0.49653419852256775, + "learning_rate": 0.0001935644758586769, + "loss": 1.035, + "step": 2597 + }, + { + "epoch": 0.46260683760683763, + "grad_norm": 0.4788367748260498, + "learning_rate": 0.00019355953462980415, + "loss": 1.1253, + "step": 2598 + }, + { + "epoch": 0.46278490028490027, + "grad_norm": 0.5295125842094421, + "learning_rate": 0.00019355459156782668, + "loss": 1.0853, + "step": 2599 + }, + { + "epoch": 0.46296296296296297, + "grad_norm": 0.4878056049346924, + "learning_rate": 0.00019354964667284133, + "loss": 1.1381, + "step": 2600 + }, + { + "epoch": 0.46314102564102566, + "grad_norm": 0.5442031025886536, + "learning_rate": 0.00019354469994494497, + "loss": 1.1349, + "step": 2601 + }, + { + "epoch": 0.4633190883190883, + "grad_norm": 0.4845225214958191, + "learning_rate": 0.00019353975138423457, + "loss": 1.0538, + "step": 2602 + }, + { + "epoch": 0.463497150997151, + "grad_norm": 0.4957871437072754, + "learning_rate": 0.00019353480099080703, + "loss": 1.2765, + "step": 2603 + }, + { + "epoch": 0.4636752136752137, + "grad_norm": 0.5414339303970337, + "learning_rate": 0.00019352984876475936, + "loss": 1.1015, + "step": 2604 + }, + { + "epoch": 0.46385327635327633, + "grad_norm": 0.5171043872833252, + "learning_rate": 0.0001935248947061886, + "loss": 0.9995, + "step": 2605 + }, + { + "epoch": 0.46403133903133903, + "grad_norm": 0.46040529012680054, + "learning_rate": 0.0001935199388151918, + "loss": 1.1126, + "step": 2606 + }, + { + "epoch": 0.4642094017094017, + "grad_norm": 0.5327033400535583, + "learning_rate": 0.00019351498109186613, + "loss": 1.1983, + "step": 2607 + }, + { + "epoch": 0.46438746438746437, + "grad_norm": 0.4451361298561096, + "learning_rate": 0.0001935100215363086, + "loss": 0.9689, + "step": 2608 + }, + { + "epoch": 0.46456552706552706, + "grad_norm": 0.5462809801101685, + "learning_rate": 0.00019350506014861646, + "loss": 1.036, + "step": 2609 + }, + { + "epoch": 0.46474358974358976, + "grad_norm": 0.4907000958919525, 
+ "learning_rate": 0.00019350009692888694, + "loss": 1.0724, + "step": 2610 + }, + { + "epoch": 0.4649216524216524, + "grad_norm": 0.47523510456085205, + "learning_rate": 0.00019349513187721723, + "loss": 0.9214, + "step": 2611 + }, + { + "epoch": 0.4650997150997151, + "grad_norm": 0.539732813835144, + "learning_rate": 0.0001934901649937046, + "loss": 1.1166, + "step": 2612 + }, + { + "epoch": 0.4652777777777778, + "grad_norm": 0.4827860891819, + "learning_rate": 0.00019348519627844643, + "loss": 1.1613, + "step": 2613 + }, + { + "epoch": 0.46545584045584043, + "grad_norm": 0.5385223031044006, + "learning_rate": 0.00019348022573154, + "loss": 1.0105, + "step": 2614 + }, + { + "epoch": 0.4656339031339031, + "grad_norm": 0.4629383087158203, + "learning_rate": 0.0001934752533530828, + "loss": 1.0298, + "step": 2615 + }, + { + "epoch": 0.4658119658119658, + "grad_norm": 0.599371075630188, + "learning_rate": 0.00019347027914317212, + "loss": 1.3158, + "step": 2616 + }, + { + "epoch": 0.46599002849002846, + "grad_norm": 0.5954698324203491, + "learning_rate": 0.00019346530310190553, + "loss": 1.1882, + "step": 2617 + }, + { + "epoch": 0.46616809116809116, + "grad_norm": 0.49185171723365784, + "learning_rate": 0.00019346032522938046, + "loss": 1.0977, + "step": 2618 + }, + { + "epoch": 0.46634615384615385, + "grad_norm": 0.5145422220230103, + "learning_rate": 0.0001934553455256945, + "loss": 0.9948, + "step": 2619 + }, + { + "epoch": 0.46652421652421655, + "grad_norm": 0.6809412837028503, + "learning_rate": 0.00019345036399094517, + "loss": 1.5798, + "step": 2620 + }, + { + "epoch": 0.4667022792022792, + "grad_norm": 0.4606841206550598, + "learning_rate": 0.00019344538062523005, + "loss": 0.7357, + "step": 2621 + }, + { + "epoch": 0.4668803418803419, + "grad_norm": 0.49036628007888794, + "learning_rate": 0.00019344039542864685, + "loss": 1.1518, + "step": 2622 + }, + { + "epoch": 0.4670584045584046, + "grad_norm": 0.47904539108276367, + "learning_rate": 
0.0001934354084012932, + "loss": 0.9929, + "step": 2623 + }, + { + "epoch": 0.4672364672364672, + "grad_norm": 0.5224666595458984, + "learning_rate": 0.0001934304195432668, + "loss": 1.2544, + "step": 2624 + }, + { + "epoch": 0.4674145299145299, + "grad_norm": 0.4902483820915222, + "learning_rate": 0.00019342542885466543, + "loss": 1.0301, + "step": 2625 + }, + { + "epoch": 0.4675925925925926, + "grad_norm": 0.46824702620506287, + "learning_rate": 0.00019342043633558683, + "loss": 0.9364, + "step": 2626 + }, + { + "epoch": 0.46777065527065526, + "grad_norm": 0.46272051334381104, + "learning_rate": 0.00019341544198612888, + "loss": 1.056, + "step": 2627 + }, + { + "epoch": 0.46794871794871795, + "grad_norm": 0.6216606497764587, + "learning_rate": 0.0001934104458063894, + "loss": 1.0825, + "step": 2628 + }, + { + "epoch": 0.46812678062678065, + "grad_norm": 0.5024014115333557, + "learning_rate": 0.00019340544779646623, + "loss": 1.1832, + "step": 2629 + }, + { + "epoch": 0.4683048433048433, + "grad_norm": 0.5547130107879639, + "learning_rate": 0.00019340044795645737, + "loss": 1.1335, + "step": 2630 + }, + { + "epoch": 0.468482905982906, + "grad_norm": 0.5439161658287048, + "learning_rate": 0.0001933954462864608, + "loss": 1.0229, + "step": 2631 + }, + { + "epoch": 0.4686609686609687, + "grad_norm": 0.4782990515232086, + "learning_rate": 0.0001933904427865744, + "loss": 1.2318, + "step": 2632 + }, + { + "epoch": 0.4688390313390313, + "grad_norm": 0.5872140526771545, + "learning_rate": 0.00019338543745689633, + "loss": 1.0132, + "step": 2633 + }, + { + "epoch": 0.469017094017094, + "grad_norm": 0.44163307547569275, + "learning_rate": 0.00019338043029752458, + "loss": 1.0091, + "step": 2634 + }, + { + "epoch": 0.4691951566951567, + "grad_norm": 0.541081428527832, + "learning_rate": 0.0001933754213085573, + "loss": 1.2155, + "step": 2635 + }, + { + "epoch": 0.46937321937321935, + "grad_norm": 0.4761527478694916, + "learning_rate": 0.00019337041049009255, + "loss": 
1.1138, + "step": 2636 + }, + { + "epoch": 0.46955128205128205, + "grad_norm": 0.46414369344711304, + "learning_rate": 0.0001933653978422286, + "loss": 0.9903, + "step": 2637 + }, + { + "epoch": 0.46972934472934474, + "grad_norm": 0.5337086915969849, + "learning_rate": 0.00019336038336506363, + "loss": 1.2873, + "step": 2638 + }, + { + "epoch": 0.4699074074074074, + "grad_norm": 0.5065379738807678, + "learning_rate": 0.00019335536705869592, + "loss": 1.1436, + "step": 2639 + }, + { + "epoch": 0.4700854700854701, + "grad_norm": 0.5539217591285706, + "learning_rate": 0.0001933503489232237, + "loss": 1.2881, + "step": 2640 + }, + { + "epoch": 0.4702635327635328, + "grad_norm": 0.48303213715553284, + "learning_rate": 0.0001933453289587453, + "loss": 1.0209, + "step": 2641 + }, + { + "epoch": 0.4704415954415954, + "grad_norm": 0.6986871957778931, + "learning_rate": 0.00019334030716535908, + "loss": 1.1979, + "step": 2642 + }, + { + "epoch": 0.4706196581196581, + "grad_norm": 0.46137234568595886, + "learning_rate": 0.00019333528354316347, + "loss": 1.0682, + "step": 2643 + }, + { + "epoch": 0.4707977207977208, + "grad_norm": 0.4726654291152954, + "learning_rate": 0.00019333025809225684, + "loss": 1.1712, + "step": 2644 + }, + { + "epoch": 0.47097578347578345, + "grad_norm": 0.46188637614250183, + "learning_rate": 0.0001933252308127377, + "loss": 1.0183, + "step": 2645 + }, + { + "epoch": 0.47115384615384615, + "grad_norm": 0.5323259830474854, + "learning_rate": 0.0001933202017047045, + "loss": 0.935, + "step": 2646 + }, + { + "epoch": 0.47133190883190884, + "grad_norm": 0.5004189014434814, + "learning_rate": 0.00019331517076825582, + "loss": 1.1331, + "step": 2647 + }, + { + "epoch": 0.47150997150997154, + "grad_norm": 0.5443634986877441, + "learning_rate": 0.0001933101380034902, + "loss": 1.0514, + "step": 2648 + }, + { + "epoch": 0.4716880341880342, + "grad_norm": 0.504180371761322, + "learning_rate": 0.0001933051034105063, + "loss": 1.3099, + "step": 2649 + }, + { + 
"epoch": 0.4718660968660969, + "grad_norm": 0.5092344284057617, + "learning_rate": 0.0001933000669894027, + "loss": 1.0716, + "step": 2650 + }, + { + "epoch": 0.47204415954415957, + "grad_norm": 0.5236422419548035, + "learning_rate": 0.0001932950287402781, + "loss": 1.0981, + "step": 2651 + }, + { + "epoch": 0.4722222222222222, + "grad_norm": 0.6228063702583313, + "learning_rate": 0.0001932899886632312, + "loss": 1.3398, + "step": 2652 + }, + { + "epoch": 0.4724002849002849, + "grad_norm": 0.5112748146057129, + "learning_rate": 0.00019328494675836078, + "loss": 1.0151, + "step": 2653 + }, + { + "epoch": 0.4725783475783476, + "grad_norm": 0.5554201602935791, + "learning_rate": 0.00019327990302576563, + "loss": 1.404, + "step": 2654 + }, + { + "epoch": 0.47275641025641024, + "grad_norm": 0.5050725340843201, + "learning_rate": 0.0001932748574655445, + "loss": 0.951, + "step": 2655 + }, + { + "epoch": 0.47293447293447294, + "grad_norm": 0.5161749720573425, + "learning_rate": 0.00019326981007779636, + "loss": 1.2425, + "step": 2656 + }, + { + "epoch": 0.47311253561253563, + "grad_norm": 0.4865442216396332, + "learning_rate": 0.00019326476086262002, + "loss": 1.1175, + "step": 2657 + }, + { + "epoch": 0.4732905982905983, + "grad_norm": 0.5276186466217041, + "learning_rate": 0.0001932597098201144, + "loss": 1.3687, + "step": 2658 + }, + { + "epoch": 0.47346866096866097, + "grad_norm": 0.509139358997345, + "learning_rate": 0.00019325465695037855, + "loss": 1.0546, + "step": 2659 + }, + { + "epoch": 0.47364672364672367, + "grad_norm": 0.49815434217453003, + "learning_rate": 0.00019324960225351138, + "loss": 1.0807, + "step": 2660 + }, + { + "epoch": 0.4738247863247863, + "grad_norm": 0.5059618353843689, + "learning_rate": 0.00019324454572961197, + "loss": 1.0827, + "step": 2661 + }, + { + "epoch": 0.474002849002849, + "grad_norm": 0.5698565244674683, + "learning_rate": 0.00019323948737877942, + "loss": 1.2019, + "step": 2662 + }, + { + "epoch": 0.4741809116809117, + 
"grad_norm": 0.49661511182785034, + "learning_rate": 0.00019323442720111276, + "loss": 1.1447, + "step": 2663 + }, + { + "epoch": 0.47435897435897434, + "grad_norm": 0.46442747116088867, + "learning_rate": 0.0001932293651967112, + "loss": 0.8796, + "step": 2664 + }, + { + "epoch": 0.47453703703703703, + "grad_norm": 0.48306044936180115, + "learning_rate": 0.00019322430136567388, + "loss": 1.1358, + "step": 2665 + }, + { + "epoch": 0.47471509971509973, + "grad_norm": 0.5677350759506226, + "learning_rate": 0.00019321923570810005, + "loss": 1.1026, + "step": 2666 + }, + { + "epoch": 0.47489316239316237, + "grad_norm": 0.3700144588947296, + "learning_rate": 0.0001932141682240889, + "loss": 0.7514, + "step": 2667 + }, + { + "epoch": 0.47507122507122507, + "grad_norm": 0.6003054976463318, + "learning_rate": 0.0001932090989137398, + "loss": 1.1591, + "step": 2668 + }, + { + "epoch": 0.47524928774928776, + "grad_norm": 0.520298421382904, + "learning_rate": 0.00019320402777715204, + "loss": 1.339, + "step": 2669 + }, + { + "epoch": 0.4754273504273504, + "grad_norm": 0.46453598141670227, + "learning_rate": 0.00019319895481442493, + "loss": 0.9879, + "step": 2670 + }, + { + "epoch": 0.4756054131054131, + "grad_norm": 0.5247363448143005, + "learning_rate": 0.00019319388002565793, + "loss": 0.9862, + "step": 2671 + }, + { + "epoch": 0.4757834757834758, + "grad_norm": 0.5498613715171814, + "learning_rate": 0.00019318880341095046, + "loss": 1.2224, + "step": 2672 + }, + { + "epoch": 0.47596153846153844, + "grad_norm": 0.565838098526001, + "learning_rate": 0.00019318372497040192, + "loss": 1.0712, + "step": 2673 + }, + { + "epoch": 0.47613960113960113, + "grad_norm": 0.5797489881515503, + "learning_rate": 0.00019317864470411191, + "loss": 1.0176, + "step": 2674 + }, + { + "epoch": 0.47631766381766383, + "grad_norm": 0.5114326477050781, + "learning_rate": 0.0001931735626121799, + "loss": 1.1027, + "step": 2675 + }, + { + "epoch": 0.47649572649572647, + "grad_norm": 
0.5396515727043152, + "learning_rate": 0.00019316847869470547, + "loss": 1.1782, + "step": 2676 + }, + { + "epoch": 0.47667378917378916, + "grad_norm": 0.4812076985836029, + "learning_rate": 0.00019316339295178824, + "loss": 1.1196, + "step": 2677 + }, + { + "epoch": 0.47685185185185186, + "grad_norm": 0.4875647723674774, + "learning_rate": 0.00019315830538352787, + "loss": 1.1407, + "step": 2678 + }, + { + "epoch": 0.47702991452991456, + "grad_norm": 0.5036377906799316, + "learning_rate": 0.00019315321599002404, + "loss": 0.9842, + "step": 2679 + }, + { + "epoch": 0.4772079772079772, + "grad_norm": 0.5054177641868591, + "learning_rate": 0.00019314812477137645, + "loss": 0.8196, + "step": 2680 + }, + { + "epoch": 0.4773860398860399, + "grad_norm": 0.5050665736198425, + "learning_rate": 0.00019314303172768483, + "loss": 0.8463, + "step": 2681 + }, + { + "epoch": 0.4775641025641026, + "grad_norm": 0.5179004669189453, + "learning_rate": 0.000193137936859049, + "loss": 1.2485, + "step": 2682 + }, + { + "epoch": 0.47774216524216523, + "grad_norm": 0.44986143708229065, + "learning_rate": 0.00019313284016556876, + "loss": 0.9855, + "step": 2683 + }, + { + "epoch": 0.4779202279202279, + "grad_norm": 0.5594347715377808, + "learning_rate": 0.00019312774164734398, + "loss": 1.0987, + "step": 2684 + }, + { + "epoch": 0.4780982905982906, + "grad_norm": 0.4837244749069214, + "learning_rate": 0.0001931226413044746, + "loss": 1.1119, + "step": 2685 + }, + { + "epoch": 0.47827635327635326, + "grad_norm": 0.489145427942276, + "learning_rate": 0.0001931175391370605, + "loss": 1.1962, + "step": 2686 + }, + { + "epoch": 0.47845441595441596, + "grad_norm": 0.503568708896637, + "learning_rate": 0.00019311243514520164, + "loss": 0.9668, + "step": 2687 + }, + { + "epoch": 0.47863247863247865, + "grad_norm": 0.5401005744934082, + "learning_rate": 0.00019310732932899805, + "loss": 1.3072, + "step": 2688 + }, + { + "epoch": 0.4788105413105413, + "grad_norm": 0.526523768901825, + 
"learning_rate": 0.00019310222168854971, + "loss": 1.1387, + "step": 2689 + }, + { + "epoch": 0.478988603988604, + "grad_norm": 0.5223183631896973, + "learning_rate": 0.00019309711222395678, + "loss": 1.1391, + "step": 2690 + }, + { + "epoch": 0.4791666666666667, + "grad_norm": 0.5840879082679749, + "learning_rate": 0.00019309200093531933, + "loss": 1.1543, + "step": 2691 + }, + { + "epoch": 0.4793447293447293, + "grad_norm": 0.5173699259757996, + "learning_rate": 0.00019308688782273753, + "loss": 1.1889, + "step": 2692 + }, + { + "epoch": 0.479522792022792, + "grad_norm": 0.5417894124984741, + "learning_rate": 0.00019308177288631146, + "loss": 1.299, + "step": 2693 + }, + { + "epoch": 0.4797008547008547, + "grad_norm": 0.4890797734260559, + "learning_rate": 0.0001930766561261415, + "loss": 1.1516, + "step": 2694 + }, + { + "epoch": 0.47987891737891736, + "grad_norm": 0.5422119498252869, + "learning_rate": 0.00019307153754232772, + "loss": 1.0301, + "step": 2695 + }, + { + "epoch": 0.48005698005698005, + "grad_norm": 0.5838702917098999, + "learning_rate": 0.00019306641713497057, + "loss": 1.265, + "step": 2696 + }, + { + "epoch": 0.48023504273504275, + "grad_norm": 0.5020943284034729, + "learning_rate": 0.00019306129490417027, + "loss": 1.1119, + "step": 2697 + }, + { + "epoch": 0.4804131054131054, + "grad_norm": 0.412993460893631, + "learning_rate": 0.00019305617085002723, + "loss": 0.8083, + "step": 2698 + }, + { + "epoch": 0.4805911680911681, + "grad_norm": 0.6270101070404053, + "learning_rate": 0.00019305104497264184, + "loss": 1.3355, + "step": 2699 + }, + { + "epoch": 0.4807692307692308, + "grad_norm": 0.45256730914115906, + "learning_rate": 0.0001930459172721145, + "loss": 1.0368, + "step": 2700 + }, + { + "epoch": 0.4809472934472934, + "grad_norm": 0.5351749658584595, + "learning_rate": 0.0001930407877485457, + "loss": 1.135, + "step": 2701 + }, + { + "epoch": 0.4811253561253561, + "grad_norm": 0.49324163794517517, + "learning_rate": 0.00019303565640203593, 
+ "loss": 0.9383, + "step": 2702 + }, + { + "epoch": 0.4813034188034188, + "grad_norm": 0.5434361100196838, + "learning_rate": 0.00019303052323268576, + "loss": 1.2605, + "step": 2703 + }, + { + "epoch": 0.48148148148148145, + "grad_norm": 0.5858064889907837, + "learning_rate": 0.00019302538824059572, + "loss": 1.0846, + "step": 2704 + }, + { + "epoch": 0.48165954415954415, + "grad_norm": 0.5753700733184814, + "learning_rate": 0.00019302025142586647, + "loss": 1.0371, + "step": 2705 + }, + { + "epoch": 0.48183760683760685, + "grad_norm": 0.43102699518203735, + "learning_rate": 0.00019301511278859858, + "loss": 0.9189, + "step": 2706 + }, + { + "epoch": 0.48201566951566954, + "grad_norm": 0.4731025993824005, + "learning_rate": 0.0001930099723288928, + "loss": 1.1291, + "step": 2707 + }, + { + "epoch": 0.4821937321937322, + "grad_norm": 0.5685615539550781, + "learning_rate": 0.00019300483004684987, + "loss": 1.1006, + "step": 2708 + }, + { + "epoch": 0.4823717948717949, + "grad_norm": 0.4368155896663666, + "learning_rate": 0.00019299968594257044, + "loss": 0.9959, + "step": 2709 + }, + { + "epoch": 0.4825498575498576, + "grad_norm": 0.5594738125801086, + "learning_rate": 0.00019299454001615537, + "loss": 1.0826, + "step": 2710 + }, + { + "epoch": 0.4827279202279202, + "grad_norm": 0.48876598477363586, + "learning_rate": 0.00019298939226770548, + "loss": 1.1556, + "step": 2711 + }, + { + "epoch": 0.4829059829059829, + "grad_norm": 0.548039436340332, + "learning_rate": 0.00019298424269732157, + "loss": 1.158, + "step": 2712 + }, + { + "epoch": 0.4830840455840456, + "grad_norm": 0.4957645535469055, + "learning_rate": 0.00019297909130510464, + "loss": 0.9824, + "step": 2713 + }, + { + "epoch": 0.48326210826210825, + "grad_norm": 0.5197011232376099, + "learning_rate": 0.00019297393809115555, + "loss": 1.1074, + "step": 2714 + }, + { + "epoch": 0.48344017094017094, + "grad_norm": 0.5742064118385315, + "learning_rate": 0.00019296878305557526, + "loss": 1.0431, + "step": 
2715 + }, + { + "epoch": 0.48361823361823364, + "grad_norm": 0.5698413252830505, + "learning_rate": 0.0001929636261984648, + "loss": 1.0713, + "step": 2716 + }, + { + "epoch": 0.4837962962962963, + "grad_norm": 0.48126333951950073, + "learning_rate": 0.0001929584675199252, + "loss": 0.9274, + "step": 2717 + }, + { + "epoch": 0.483974358974359, + "grad_norm": 0.49299830198287964, + "learning_rate": 0.00019295330702005754, + "loss": 0.9392, + "step": 2718 + }, + { + "epoch": 0.48415242165242167, + "grad_norm": 0.4780774414539337, + "learning_rate": 0.0001929481446989629, + "loss": 1.1459, + "step": 2719 + }, + { + "epoch": 0.4843304843304843, + "grad_norm": 0.5462654829025269, + "learning_rate": 0.00019294298055674248, + "loss": 1.0635, + "step": 2720 + }, + { + "epoch": 0.484508547008547, + "grad_norm": 0.5371061563491821, + "learning_rate": 0.00019293781459349743, + "loss": 1.3578, + "step": 2721 + }, + { + "epoch": 0.4846866096866097, + "grad_norm": 0.46308520436286926, + "learning_rate": 0.00019293264680932893, + "loss": 0.9001, + "step": 2722 + }, + { + "epoch": 0.48486467236467234, + "grad_norm": 0.5149807929992676, + "learning_rate": 0.0001929274772043383, + "loss": 0.6908, + "step": 2723 + }, + { + "epoch": 0.48504273504273504, + "grad_norm": 0.5435031056404114, + "learning_rate": 0.00019292230577862678, + "loss": 1.2143, + "step": 2724 + }, + { + "epoch": 0.48522079772079774, + "grad_norm": 0.44217726588249207, + "learning_rate": 0.00019291713253229568, + "loss": 0.9303, + "step": 2725 + }, + { + "epoch": 0.4853988603988604, + "grad_norm": 0.6120226383209229, + "learning_rate": 0.00019291195746544643, + "loss": 1.3801, + "step": 2726 + }, + { + "epoch": 0.4855769230769231, + "grad_norm": 0.5014316439628601, + "learning_rate": 0.00019290678057818037, + "loss": 1.0631, + "step": 2727 + }, + { + "epoch": 0.48575498575498577, + "grad_norm": 0.5667829513549805, + "learning_rate": 0.00019290160187059895, + "loss": 1.3166, + "step": 2728 + }, + { + "epoch": 
0.4859330484330484, + "grad_norm": 0.5011509656906128, + "learning_rate": 0.0001928964213428036, + "loss": 1.1887, + "step": 2729 + }, + { + "epoch": 0.4861111111111111, + "grad_norm": 0.48317405581474304, + "learning_rate": 0.00019289123899489586, + "loss": 1.1125, + "step": 2730 + }, + { + "epoch": 0.4862891737891738, + "grad_norm": 0.4669005870819092, + "learning_rate": 0.00019288605482697726, + "loss": 1.0091, + "step": 2731 + }, + { + "epoch": 0.48646723646723644, + "grad_norm": 0.4330739974975586, + "learning_rate": 0.00019288086883914937, + "loss": 0.9789, + "step": 2732 + }, + { + "epoch": 0.48664529914529914, + "grad_norm": 0.48482781648635864, + "learning_rate": 0.0001928756810315138, + "loss": 1.1922, + "step": 2733 + }, + { + "epoch": 0.48682336182336183, + "grad_norm": 0.5781838297843933, + "learning_rate": 0.0001928704914041722, + "loss": 1.1793, + "step": 2734 + }, + { + "epoch": 0.48700142450142453, + "grad_norm": 0.5955413579940796, + "learning_rate": 0.00019286529995722623, + "loss": 1.1001, + "step": 2735 + }, + { + "epoch": 0.48717948717948717, + "grad_norm": 0.49204322695732117, + "learning_rate": 0.00019286010669077763, + "loss": 0.9219, + "step": 2736 + }, + { + "epoch": 0.48735754985754987, + "grad_norm": 0.5853500962257385, + "learning_rate": 0.00019285491160492813, + "loss": 1.1133, + "step": 2737 + }, + { + "epoch": 0.48753561253561256, + "grad_norm": 0.5555846095085144, + "learning_rate": 0.0001928497146997795, + "loss": 1.0915, + "step": 2738 + }, + { + "epoch": 0.4877136752136752, + "grad_norm": 0.5166759490966797, + "learning_rate": 0.00019284451597543364, + "loss": 0.9349, + "step": 2739 + }, + { + "epoch": 0.4878917378917379, + "grad_norm": 0.47816506028175354, + "learning_rate": 0.00019283931543199234, + "loss": 0.8978, + "step": 2740 + }, + { + "epoch": 0.4880698005698006, + "grad_norm": 0.5632442831993103, + "learning_rate": 0.0001928341130695575, + "loss": 1.0491, + "step": 2741 + }, + { + "epoch": 0.48824786324786323, + 
"grad_norm": 0.6532769799232483, + "learning_rate": 0.00019282890888823107, + "loss": 1.2779, + "step": 2742 + }, + { + "epoch": 0.48842592592592593, + "grad_norm": 0.5733640789985657, + "learning_rate": 0.000192823702888115, + "loss": 1.4127, + "step": 2743 + }, + { + "epoch": 0.4886039886039886, + "grad_norm": 0.5701746344566345, + "learning_rate": 0.00019281849506931132, + "loss": 1.138, + "step": 2744 + }, + { + "epoch": 0.48878205128205127, + "grad_norm": 0.5227449536323547, + "learning_rate": 0.000192813285431922, + "loss": 1.1831, + "step": 2745 + }, + { + "epoch": 0.48896011396011396, + "grad_norm": 0.48457080125808716, + "learning_rate": 0.00019280807397604915, + "loss": 1.2468, + "step": 2746 + }, + { + "epoch": 0.48913817663817666, + "grad_norm": 0.4596176743507385, + "learning_rate": 0.0001928028607017949, + "loss": 1.1098, + "step": 2747 + }, + { + "epoch": 0.4893162393162393, + "grad_norm": 0.5204966068267822, + "learning_rate": 0.00019279764560926142, + "loss": 1.1501, + "step": 2748 + }, + { + "epoch": 0.489494301994302, + "grad_norm": 0.5179490447044373, + "learning_rate": 0.0001927924286985508, + "loss": 1.2601, + "step": 2749 + }, + { + "epoch": 0.4896723646723647, + "grad_norm": 0.4563423693180084, + "learning_rate": 0.00019278720996976533, + "loss": 1.081, + "step": 2750 + }, + { + "epoch": 0.48985042735042733, + "grad_norm": 0.4906339943408966, + "learning_rate": 0.00019278198942300717, + "loss": 1.157, + "step": 2751 + }, + { + "epoch": 0.49002849002849, + "grad_norm": 0.42241403460502625, + "learning_rate": 0.00019277676705837873, + "loss": 1.0333, + "step": 2752 + }, + { + "epoch": 0.4902065527065527, + "grad_norm": 0.6310175657272339, + "learning_rate": 0.00019277154287598226, + "loss": 1.1225, + "step": 2753 + }, + { + "epoch": 0.49038461538461536, + "grad_norm": 0.5109034776687622, + "learning_rate": 0.0001927663168759201, + "loss": 1.1619, + "step": 2754 + }, + { + "epoch": 0.49056267806267806, + "grad_norm": 0.4809598922729492, + 
"learning_rate": 0.00019276108905829465, + "loss": 1.0423, + "step": 2755 + }, + { + "epoch": 0.49074074074074076, + "grad_norm": 0.557502806186676, + "learning_rate": 0.00019275585942320837, + "loss": 0.8783, + "step": 2756 + }, + { + "epoch": 0.4909188034188034, + "grad_norm": 0.5434393882751465, + "learning_rate": 0.0001927506279707637, + "loss": 1.1701, + "step": 2757 + }, + { + "epoch": 0.4910968660968661, + "grad_norm": 0.49278944730758667, + "learning_rate": 0.00019274539470106317, + "loss": 1.0447, + "step": 2758 + }, + { + "epoch": 0.4912749287749288, + "grad_norm": 0.5634264349937439, + "learning_rate": 0.00019274015961420927, + "loss": 1.0639, + "step": 2759 + }, + { + "epoch": 0.49145299145299143, + "grad_norm": 0.5632645487785339, + "learning_rate": 0.00019273492271030464, + "loss": 0.9223, + "step": 2760 + }, + { + "epoch": 0.4916310541310541, + "grad_norm": 0.5949172377586365, + "learning_rate": 0.00019272968398945177, + "loss": 0.894, + "step": 2761 + }, + { + "epoch": 0.4918091168091168, + "grad_norm": 0.5375374555587769, + "learning_rate": 0.00019272444345175342, + "loss": 1.0311, + "step": 2762 + }, + { + "epoch": 0.49198717948717946, + "grad_norm": 0.5211305022239685, + "learning_rate": 0.00019271920109731222, + "loss": 1.1531, + "step": 2763 + }, + { + "epoch": 0.49216524216524216, + "grad_norm": 0.44022253155708313, + "learning_rate": 0.00019271395692623084, + "loss": 0.9147, + "step": 2764 + }, + { + "epoch": 0.49234330484330485, + "grad_norm": 0.4682174623012543, + "learning_rate": 0.0001927087109386121, + "loss": 1.081, + "step": 2765 + }, + { + "epoch": 0.49252136752136755, + "grad_norm": 0.4971517324447632, + "learning_rate": 0.0001927034631345588, + "loss": 1.1017, + "step": 2766 + }, + { + "epoch": 0.4926994301994302, + "grad_norm": 0.5015294551849365, + "learning_rate": 0.00019269821351417364, + "loss": 1.1093, + "step": 2767 + }, + { + "epoch": 0.4928774928774929, + "grad_norm": 0.5512694716453552, + "learning_rate": 
0.00019269296207755958, + "loss": 0.9657, + "step": 2768 + }, + { + "epoch": 0.4930555555555556, + "grad_norm": 0.4914868474006653, + "learning_rate": 0.00019268770882481948, + "loss": 1.0379, + "step": 2769 + }, + { + "epoch": 0.4932336182336182, + "grad_norm": 0.567337691783905, + "learning_rate": 0.00019268245375605626, + "loss": 1.004, + "step": 2770 + }, + { + "epoch": 0.4934116809116809, + "grad_norm": 0.518489420413971, + "learning_rate": 0.0001926771968713729, + "loss": 1.0734, + "step": 2771 + }, + { + "epoch": 0.4935897435897436, + "grad_norm": 0.567742109298706, + "learning_rate": 0.00019267193817087237, + "loss": 1.1276, + "step": 2772 + }, + { + "epoch": 0.49376780626780625, + "grad_norm": 0.5287964344024658, + "learning_rate": 0.00019266667765465773, + "loss": 1.1429, + "step": 2773 + }, + { + "epoch": 0.49394586894586895, + "grad_norm": 0.5302085876464844, + "learning_rate": 0.00019266141532283207, + "loss": 1.0934, + "step": 2774 + }, + { + "epoch": 0.49412393162393164, + "grad_norm": 0.5569987297058105, + "learning_rate": 0.00019265615117549842, + "loss": 1.1453, + "step": 2775 + }, + { + "epoch": 0.4943019943019943, + "grad_norm": 0.519695520401001, + "learning_rate": 0.00019265088521275997, + "loss": 1.1255, + "step": 2776 + }, + { + "epoch": 0.494480056980057, + "grad_norm": 0.5073211193084717, + "learning_rate": 0.0001926456174347199, + "loss": 1.0609, + "step": 2777 + }, + { + "epoch": 0.4946581196581197, + "grad_norm": 0.45028239488601685, + "learning_rate": 0.00019264034784148142, + "loss": 0.9098, + "step": 2778 + }, + { + "epoch": 0.4948361823361823, + "grad_norm": 0.6641215682029724, + "learning_rate": 0.00019263507643314776, + "loss": 0.8903, + "step": 2779 + }, + { + "epoch": 0.495014245014245, + "grad_norm": 0.5281413793563843, + "learning_rate": 0.00019262980320982224, + "loss": 1.2906, + "step": 2780 + }, + { + "epoch": 0.4951923076923077, + "grad_norm": 0.6256437301635742, + "learning_rate": 0.0001926245281716081, + "loss": 1.4142, 
+ "step": 2781 + }, + { + "epoch": 0.49537037037037035, + "grad_norm": 0.5422517657279968, + "learning_rate": 0.00019261925131860877, + "loss": 1.1606, + "step": 2782 + }, + { + "epoch": 0.49554843304843305, + "grad_norm": 0.46938949823379517, + "learning_rate": 0.0001926139726509276, + "loss": 1.0333, + "step": 2783 + }, + { + "epoch": 0.49572649572649574, + "grad_norm": 0.5799683928489685, + "learning_rate": 0.000192608692168668, + "loss": 1.0333, + "step": 2784 + }, + { + "epoch": 0.4959045584045584, + "grad_norm": 0.5231602787971497, + "learning_rate": 0.0001926034098719335, + "loss": 1.1847, + "step": 2785 + }, + { + "epoch": 0.4960826210826211, + "grad_norm": 0.477845698595047, + "learning_rate": 0.00019259812576082752, + "loss": 1.0746, + "step": 2786 + }, + { + "epoch": 0.4962606837606838, + "grad_norm": 0.5490350723266602, + "learning_rate": 0.00019259283983545365, + "loss": 1.2462, + "step": 2787 + }, + { + "epoch": 0.4964387464387464, + "grad_norm": 0.5788847208023071, + "learning_rate": 0.0001925875520959154, + "loss": 1.3485, + "step": 2788 + }, + { + "epoch": 0.4966168091168091, + "grad_norm": 0.46184736490249634, + "learning_rate": 0.00019258226254231643, + "loss": 0.8673, + "step": 2789 + }, + { + "epoch": 0.4967948717948718, + "grad_norm": 0.4890633225440979, + "learning_rate": 0.0001925769711747603, + "loss": 0.9474, + "step": 2790 + }, + { + "epoch": 0.49697293447293445, + "grad_norm": 0.5719282627105713, + "learning_rate": 0.00019257167799335078, + "loss": 1.2532, + "step": 2791 + }, + { + "epoch": 0.49715099715099714, + "grad_norm": 0.5385584235191345, + "learning_rate": 0.0001925663829981915, + "loss": 1.1326, + "step": 2792 + }, + { + "epoch": 0.49732905982905984, + "grad_norm": 0.5339545011520386, + "learning_rate": 0.00019256108618938625, + "loss": 1.1362, + "step": 2793 + }, + { + "epoch": 0.49750712250712253, + "grad_norm": 0.5017803907394409, + "learning_rate": 0.00019255578756703878, + "loss": 1.0449, + "step": 2794 + }, + { + "epoch": 
0.4976851851851852, + "grad_norm": 0.6004226803779602, + "learning_rate": 0.00019255048713125294, + "loss": 0.9346, + "step": 2795 + }, + { + "epoch": 0.49786324786324787, + "grad_norm": 0.44581490755081177, + "learning_rate": 0.00019254518488213255, + "loss": 1.038, + "step": 2796 + }, + { + "epoch": 0.49804131054131057, + "grad_norm": 0.5180951356887817, + "learning_rate": 0.00019253988081978151, + "loss": 1.0479, + "step": 2797 + }, + { + "epoch": 0.4982193732193732, + "grad_norm": 0.53944993019104, + "learning_rate": 0.00019253457494430376, + "loss": 1.2598, + "step": 2798 + }, + { + "epoch": 0.4983974358974359, + "grad_norm": 0.5633010268211365, + "learning_rate": 0.00019252926725580322, + "loss": 1.205, + "step": 2799 + }, + { + "epoch": 0.4985754985754986, + "grad_norm": 0.6653175950050354, + "learning_rate": 0.0001925239577543839, + "loss": 1.2383, + "step": 2800 + }, + { + "epoch": 0.49875356125356124, + "grad_norm": 0.5083333849906921, + "learning_rate": 0.00019251864644014984, + "loss": 1.0649, + "step": 2801 + }, + { + "epoch": 0.49893162393162394, + "grad_norm": 0.4842020571231842, + "learning_rate": 0.00019251333331320506, + "loss": 1.1991, + "step": 2802 + }, + { + "epoch": 0.49910968660968663, + "grad_norm": 0.47987112402915955, + "learning_rate": 0.00019250801837365373, + "loss": 1.1686, + "step": 2803 + }, + { + "epoch": 0.49928774928774927, + "grad_norm": 0.5316333770751953, + "learning_rate": 0.00019250270162159992, + "loss": 1.1759, + "step": 2804 + }, + { + "epoch": 0.49946581196581197, + "grad_norm": 0.5015079379081726, + "learning_rate": 0.00019249738305714787, + "loss": 0.9424, + "step": 2805 + }, + { + "epoch": 0.49964387464387466, + "grad_norm": 0.6488274931907654, + "learning_rate": 0.00019249206268040172, + "loss": 1.066, + "step": 2806 + }, + { + "epoch": 0.4998219373219373, + "grad_norm": 0.40364864468574524, + "learning_rate": 0.00019248674049146574, + "loss": 0.6998, + "step": 2807 + }, + { + "epoch": 0.5, + "grad_norm": 
0.5535672903060913, + "learning_rate": 0.00019248141649044423, + "loss": 1.2207, + "step": 2808 + }, + { + "epoch": 0.5, + "eval_loss": 1.1072274446487427, + "eval_runtime": 28.6913, + "eval_samples_per_second": 36.283, + "eval_steps_per_second": 18.159, + "step": 2808 + }, + { + "epoch": 0.5001780626780626, + "grad_norm": 0.4834389090538025, + "learning_rate": 0.00019247609067744143, + "loss": 1.1686, + "step": 2809 + }, + { + "epoch": 0.5003561253561254, + "grad_norm": 0.5007249712944031, + "learning_rate": 0.00019247076305256176, + "loss": 1.1343, + "step": 2810 + }, + { + "epoch": 0.500534188034188, + "grad_norm": 0.4773348271846771, + "learning_rate": 0.00019246543361590957, + "loss": 0.9324, + "step": 2811 + }, + { + "epoch": 0.5007122507122507, + "grad_norm": 0.47324609756469727, + "learning_rate": 0.0001924601023675893, + "loss": 1.0223, + "step": 2812 + }, + { + "epoch": 0.5008903133903134, + "grad_norm": 0.5583845973014832, + "learning_rate": 0.00019245476930770537, + "loss": 1.1328, + "step": 2813 + }, + { + "epoch": 0.5010683760683761, + "grad_norm": 0.4814579486846924, + "learning_rate": 0.00019244943443636232, + "loss": 1.0528, + "step": 2814 + }, + { + "epoch": 0.5012464387464387, + "grad_norm": 0.4996104836463928, + "learning_rate": 0.00019244409775366465, + "loss": 1.2482, + "step": 2815 + }, + { + "epoch": 0.5014245014245015, + "grad_norm": 0.47870904207229614, + "learning_rate": 0.0001924387592597169, + "loss": 0.9452, + "step": 2816 + }, + { + "epoch": 0.5016025641025641, + "grad_norm": 0.5617441534996033, + "learning_rate": 0.0001924334189546237, + "loss": 1.378, + "step": 2817 + }, + { + "epoch": 0.5017806267806267, + "grad_norm": 0.4872083365917206, + "learning_rate": 0.00019242807683848967, + "loss": 1.1571, + "step": 2818 + }, + { + "epoch": 0.5019586894586895, + "grad_norm": 0.5147804021835327, + "learning_rate": 0.00019242273291141947, + "loss": 1.1086, + "step": 2819 + }, + { + "epoch": 0.5021367521367521, + "grad_norm": 
0.4698995351791382, + "learning_rate": 0.00019241738717351784, + "loss": 1.1579, + "step": 2820 + }, + { + "epoch": 0.5023148148148148, + "grad_norm": 0.5158926844596863, + "learning_rate": 0.00019241203962488946, + "loss": 1.2763, + "step": 2821 + }, + { + "epoch": 0.5024928774928775, + "grad_norm": 0.5218976736068726, + "learning_rate": 0.00019240669026563914, + "loss": 1.0633, + "step": 2822 + }, + { + "epoch": 0.5026709401709402, + "grad_norm": 0.5511452555656433, + "learning_rate": 0.0001924013390958717, + "loss": 0.9939, + "step": 2823 + }, + { + "epoch": 0.5028490028490028, + "grad_norm": 0.5227555632591248, + "learning_rate": 0.00019239598611569191, + "loss": 1.2478, + "step": 2824 + }, + { + "epoch": 0.5030270655270656, + "grad_norm": 0.5444719791412354, + "learning_rate": 0.00019239063132520475, + "loss": 1.1574, + "step": 2825 + }, + { + "epoch": 0.5032051282051282, + "grad_norm": 0.4752781093120575, + "learning_rate": 0.0001923852747245151, + "loss": 0.9034, + "step": 2826 + }, + { + "epoch": 0.5033831908831908, + "grad_norm": 0.5286496877670288, + "learning_rate": 0.00019237991631372792, + "loss": 1.1391, + "step": 2827 + }, + { + "epoch": 0.5035612535612536, + "grad_norm": 0.5009933710098267, + "learning_rate": 0.00019237455609294815, + "loss": 1.2178, + "step": 2828 + }, + { + "epoch": 0.5037393162393162, + "grad_norm": 0.5012276768684387, + "learning_rate": 0.00019236919406228085, + "loss": 0.9877, + "step": 2829 + }, + { + "epoch": 0.5039173789173789, + "grad_norm": 0.576508104801178, + "learning_rate": 0.00019236383022183106, + "loss": 1.1299, + "step": 2830 + }, + { + "epoch": 0.5040954415954416, + "grad_norm": 0.4716590642929077, + "learning_rate": 0.0001923584645717039, + "loss": 1.0451, + "step": 2831 + }, + { + "epoch": 0.5042735042735043, + "grad_norm": 0.5817418098449707, + "learning_rate": 0.00019235309711200448, + "loss": 1.0911, + "step": 2832 + }, + { + "epoch": 0.5044515669515669, + "grad_norm": 0.5695745944976807, + "learning_rate": 
0.000192347727842838, + "loss": 1.0229, + "step": 2833 + }, + { + "epoch": 0.5046296296296297, + "grad_norm": 0.49127066135406494, + "learning_rate": 0.00019234235676430958, + "loss": 1.1377, + "step": 2834 + }, + { + "epoch": 0.5048076923076923, + "grad_norm": 0.5426172614097595, + "learning_rate": 0.00019233698387652453, + "loss": 1.2427, + "step": 2835 + }, + { + "epoch": 0.5049857549857549, + "grad_norm": 0.5342385172843933, + "learning_rate": 0.0001923316091795881, + "loss": 1.1427, + "step": 2836 + }, + { + "epoch": 0.5051638176638177, + "grad_norm": 0.5480486750602722, + "learning_rate": 0.00019232623267360558, + "loss": 1.0647, + "step": 2837 + }, + { + "epoch": 0.5053418803418803, + "grad_norm": 0.4584530293941498, + "learning_rate": 0.00019232085435868235, + "loss": 1.0461, + "step": 2838 + }, + { + "epoch": 0.5055199430199431, + "grad_norm": 0.5992119908332825, + "learning_rate": 0.00019231547423492371, + "loss": 1.1456, + "step": 2839 + }, + { + "epoch": 0.5056980056980057, + "grad_norm": 0.514018177986145, + "learning_rate": 0.00019231009230243515, + "loss": 1.2559, + "step": 2840 + }, + { + "epoch": 0.5058760683760684, + "grad_norm": 0.5392283797264099, + "learning_rate": 0.0001923047085613221, + "loss": 1.044, + "step": 2841 + }, + { + "epoch": 0.5060541310541311, + "grad_norm": 0.4486566483974457, + "learning_rate": 0.00019229932301169, + "loss": 1.0679, + "step": 2842 + }, + { + "epoch": 0.5062321937321937, + "grad_norm": 0.4523460566997528, + "learning_rate": 0.00019229393565364442, + "loss": 1.1651, + "step": 2843 + }, + { + "epoch": 0.5064102564102564, + "grad_norm": 0.6032688021659851, + "learning_rate": 0.0001922885464872909, + "loss": 1.15, + "step": 2844 + }, + { + "epoch": 0.5065883190883191, + "grad_norm": 0.5883688926696777, + "learning_rate": 0.000192283155512735, + "loss": 1.2179, + "step": 2845 + }, + { + "epoch": 0.5067663817663818, + "grad_norm": 0.5534378886222839, + "learning_rate": 0.00019227776273008238, + "loss": 1.0387, + 
"step": 2846 + }, + { + "epoch": 0.5069444444444444, + "grad_norm": 0.5899033546447754, + "learning_rate": 0.00019227236813943872, + "loss": 1.0812, + "step": 2847 + }, + { + "epoch": 0.5071225071225072, + "grad_norm": 0.5718855261802673, + "learning_rate": 0.00019226697174090965, + "loss": 1.1375, + "step": 2848 + }, + { + "epoch": 0.5073005698005698, + "grad_norm": 0.5080967545509338, + "learning_rate": 0.00019226157353460094, + "loss": 1.1421, + "step": 2849 + }, + { + "epoch": 0.5074786324786325, + "grad_norm": 0.5253677368164062, + "learning_rate": 0.0001922561735206184, + "loss": 1.0166, + "step": 2850 + }, + { + "epoch": 0.5076566951566952, + "grad_norm": 0.47797444462776184, + "learning_rate": 0.00019225077169906772, + "loss": 1.0504, + "step": 2851 + }, + { + "epoch": 0.5078347578347578, + "grad_norm": 0.4911690652370453, + "learning_rate": 0.0001922453680700548, + "loss": 1.0629, + "step": 2852 + }, + { + "epoch": 0.5080128205128205, + "grad_norm": 0.49678200483322144, + "learning_rate": 0.00019223996263368557, + "loss": 1.1672, + "step": 2853 + }, + { + "epoch": 0.5081908831908832, + "grad_norm": 0.5451810359954834, + "learning_rate": 0.00019223455539006586, + "loss": 1.3031, + "step": 2854 + }, + { + "epoch": 0.5083689458689459, + "grad_norm": 0.5708984136581421, + "learning_rate": 0.00019222914633930166, + "loss": 1.0986, + "step": 2855 + }, + { + "epoch": 0.5085470085470085, + "grad_norm": 0.47232356667518616, + "learning_rate": 0.00019222373548149888, + "loss": 1.0449, + "step": 2856 + }, + { + "epoch": 0.5087250712250713, + "grad_norm": 0.6027610898017883, + "learning_rate": 0.0001922183228167636, + "loss": 0.862, + "step": 2857 + }, + { + "epoch": 0.5089031339031339, + "grad_norm": 0.5211802124977112, + "learning_rate": 0.00019221290834520188, + "loss": 1.1048, + "step": 2858 + }, + { + "epoch": 0.5090811965811965, + "grad_norm": 0.45101237297058105, + "learning_rate": 0.00019220749206691972, + "loss": 1.0046, + "step": 2859 + }, + { + "epoch": 
0.5092592592592593, + "grad_norm": 0.5526158213615417, + "learning_rate": 0.00019220207398202335, + "loss": 1.2275, + "step": 2860 + }, + { + "epoch": 0.5094373219373219, + "grad_norm": 0.48322010040283203, + "learning_rate": 0.00019219665409061885, + "loss": 0.9974, + "step": 2861 + }, + { + "epoch": 0.5096153846153846, + "grad_norm": 0.4775219261646271, + "learning_rate": 0.00019219123239281244, + "loss": 1.1852, + "step": 2862 + }, + { + "epoch": 0.5097934472934473, + "grad_norm": 0.46184200048446655, + "learning_rate": 0.00019218580888871034, + "loss": 0.9393, + "step": 2863 + }, + { + "epoch": 0.50997150997151, + "grad_norm": 0.47495174407958984, + "learning_rate": 0.00019218038357841883, + "loss": 0.9631, + "step": 2864 + }, + { + "epoch": 0.5101495726495726, + "grad_norm": 0.48600029945373535, + "learning_rate": 0.00019217495646204418, + "loss": 1.0498, + "step": 2865 + }, + { + "epoch": 0.5103276353276354, + "grad_norm": 0.5801547169685364, + "learning_rate": 0.00019216952753969274, + "loss": 1.2181, + "step": 2866 + }, + { + "epoch": 0.510505698005698, + "grad_norm": 0.5082106590270996, + "learning_rate": 0.00019216409681147085, + "loss": 1.2009, + "step": 2867 + }, + { + "epoch": 0.5106837606837606, + "grad_norm": 0.4184330701828003, + "learning_rate": 0.00019215866427748493, + "loss": 0.8462, + "step": 2868 + }, + { + "epoch": 0.5108618233618234, + "grad_norm": 0.518099844455719, + "learning_rate": 0.00019215322993784147, + "loss": 1.2091, + "step": 2869 + }, + { + "epoch": 0.511039886039886, + "grad_norm": 0.569464921951294, + "learning_rate": 0.0001921477937926469, + "loss": 1.0264, + "step": 2870 + }, + { + "epoch": 0.5112179487179487, + "grad_norm": 0.526767909526825, + "learning_rate": 0.00019214235584200768, + "loss": 1.1192, + "step": 2871 + }, + { + "epoch": 0.5113960113960114, + "grad_norm": 0.6511057019233704, + "learning_rate": 0.00019213691608603047, + "loss": 1.3193, + "step": 2872 + }, + { + "epoch": 0.5115740740740741, + "grad_norm": 
0.48536401987075806, + "learning_rate": 0.00019213147452482173, + "loss": 1.1671, + "step": 2873 + }, + { + "epoch": 0.5117521367521367, + "grad_norm": 0.7972469329833984, + "learning_rate": 0.00019212603115848818, + "loss": 1.1393, + "step": 2874 + }, + { + "epoch": 0.5119301994301995, + "grad_norm": 0.5543264746665955, + "learning_rate": 0.00019212058598713642, + "loss": 1.1436, + "step": 2875 + }, + { + "epoch": 0.5121082621082621, + "grad_norm": 0.49688720703125, + "learning_rate": 0.0001921151390108731, + "loss": 1.0897, + "step": 2876 + }, + { + "epoch": 0.5122863247863247, + "grad_norm": 0.4928736090660095, + "learning_rate": 0.000192109690229805, + "loss": 1.2426, + "step": 2877 + }, + { + "epoch": 0.5124643874643875, + "grad_norm": 0.4917896091938019, + "learning_rate": 0.0001921042396440389, + "loss": 1.0047, + "step": 2878 + }, + { + "epoch": 0.5126424501424501, + "grad_norm": 0.5485204458236694, + "learning_rate": 0.00019209878725368152, + "loss": 1.2615, + "step": 2879 + }, + { + "epoch": 0.5128205128205128, + "grad_norm": 0.5229470133781433, + "learning_rate": 0.0001920933330588397, + "loss": 1.3249, + "step": 2880 + }, + { + "epoch": 0.5129985754985755, + "grad_norm": 0.4783077538013458, + "learning_rate": 0.00019208787705962037, + "loss": 1.2004, + "step": 2881 + }, + { + "epoch": 0.5131766381766382, + "grad_norm": 0.5106910467147827, + "learning_rate": 0.00019208241925613035, + "loss": 1.1745, + "step": 2882 + }, + { + "epoch": 0.5133547008547008, + "grad_norm": 0.5308730006217957, + "learning_rate": 0.00019207695964847666, + "loss": 0.9706, + "step": 2883 + }, + { + "epoch": 0.5135327635327636, + "grad_norm": 0.5489775538444519, + "learning_rate": 0.00019207149823676617, + "loss": 1.0073, + "step": 2884 + }, + { + "epoch": 0.5137108262108262, + "grad_norm": 0.4992835521697998, + "learning_rate": 0.00019206603502110596, + "loss": 1.1053, + "step": 2885 + }, + { + "epoch": 0.5138888888888888, + "grad_norm": 0.5304922461509705, + "learning_rate": 
0.00019206057000160302, + "loss": 1.0565, + "step": 2886 + }, + { + "epoch": 0.5140669515669516, + "grad_norm": 0.46411609649658203, + "learning_rate": 0.00019205510317836448, + "loss": 0.9202, + "step": 2887 + }, + { + "epoch": 0.5142450142450142, + "grad_norm": 0.5236835479736328, + "learning_rate": 0.0001920496345514974, + "loss": 0.9075, + "step": 2888 + }, + { + "epoch": 0.5144230769230769, + "grad_norm": 0.4416964054107666, + "learning_rate": 0.00019204416412110895, + "loss": 0.9225, + "step": 2889 + }, + { + "epoch": 0.5146011396011396, + "grad_norm": 0.5470940470695496, + "learning_rate": 0.00019203869188730633, + "loss": 1.2195, + "step": 2890 + }, + { + "epoch": 0.5147792022792023, + "grad_norm": 0.5380414128303528, + "learning_rate": 0.0001920332178501967, + "loss": 1.0731, + "step": 2891 + }, + { + "epoch": 0.5149572649572649, + "grad_norm": 0.4405716359615326, + "learning_rate": 0.00019202774200988737, + "loss": 0.8739, + "step": 2892 + }, + { + "epoch": 0.5151353276353277, + "grad_norm": 0.5222984552383423, + "learning_rate": 0.0001920222643664856, + "loss": 1.1806, + "step": 2893 + }, + { + "epoch": 0.5153133903133903, + "grad_norm": 0.48545539379119873, + "learning_rate": 0.0001920167849200987, + "loss": 0.9939, + "step": 2894 + }, + { + "epoch": 0.5154914529914529, + "grad_norm": 0.45078009366989136, + "learning_rate": 0.0001920113036708341, + "loss": 1.0085, + "step": 2895 + }, + { + "epoch": 0.5156695156695157, + "grad_norm": 0.5029830932617188, + "learning_rate": 0.00019200582061879913, + "loss": 1.1095, + "step": 2896 + }, + { + "epoch": 0.5158475783475783, + "grad_norm": 0.5316143035888672, + "learning_rate": 0.00019200033576410118, + "loss": 0.9883, + "step": 2897 + }, + { + "epoch": 0.5160256410256411, + "grad_norm": 0.5282100439071655, + "learning_rate": 0.0001919948491068478, + "loss": 1.1441, + "step": 2898 + }, + { + "epoch": 0.5162037037037037, + "grad_norm": 0.5145367980003357, + "learning_rate": 0.00019198936064714647, + "loss": 
1.1999, + "step": 2899 + }, + { + "epoch": 0.5163817663817664, + "grad_norm": 0.5385651588439941, + "learning_rate": 0.00019198387038510468, + "loss": 1.1831, + "step": 2900 + }, + { + "epoch": 0.5165598290598291, + "grad_norm": 0.4971916377544403, + "learning_rate": 0.00019197837832083002, + "loss": 1.2518, + "step": 2901 + }, + { + "epoch": 0.5167378917378918, + "grad_norm": 0.5253807306289673, + "learning_rate": 0.00019197288445443016, + "loss": 1.0788, + "step": 2902 + }, + { + "epoch": 0.5169159544159544, + "grad_norm": 0.49724945425987244, + "learning_rate": 0.00019196738878601263, + "loss": 1.0985, + "step": 2903 + }, + { + "epoch": 0.5170940170940171, + "grad_norm": 0.5327325463294983, + "learning_rate": 0.0001919618913156852, + "loss": 1.2862, + "step": 2904 + }, + { + "epoch": 0.5172720797720798, + "grad_norm": 0.639999270439148, + "learning_rate": 0.00019195639204355554, + "loss": 1.2052, + "step": 2905 + }, + { + "epoch": 0.5174501424501424, + "grad_norm": 0.4630785584449768, + "learning_rate": 0.0001919508909697314, + "loss": 1.1157, + "step": 2906 + }, + { + "epoch": 0.5176282051282052, + "grad_norm": 0.513949990272522, + "learning_rate": 0.00019194538809432055, + "loss": 1.0047, + "step": 2907 + }, + { + "epoch": 0.5178062678062678, + "grad_norm": 0.488034725189209, + "learning_rate": 0.0001919398834174308, + "loss": 0.9008, + "step": 2908 + }, + { + "epoch": 0.5179843304843305, + "grad_norm": 0.4892788529396057, + "learning_rate": 0.00019193437693917006, + "loss": 1.1024, + "step": 2909 + }, + { + "epoch": 0.5181623931623932, + "grad_norm": 0.5503842830657959, + "learning_rate": 0.00019192886865964618, + "loss": 1.2283, + "step": 2910 + }, + { + "epoch": 0.5183404558404558, + "grad_norm": 0.48885393142700195, + "learning_rate": 0.00019192335857896707, + "loss": 0.9522, + "step": 2911 + }, + { + "epoch": 0.5185185185185185, + "grad_norm": 0.5479527115821838, + "learning_rate": 0.00019191784669724072, + "loss": 1.1616, + "step": 2912 + }, + { + 
"epoch": 0.5186965811965812, + "grad_norm": 0.42701148986816406, + "learning_rate": 0.00019191233301457506, + "loss": 0.8434, + "step": 2913 + }, + { + "epoch": 0.5188746438746439, + "grad_norm": 0.4273422658443451, + "learning_rate": 0.00019190681753107822, + "loss": 0.8316, + "step": 2914 + }, + { + "epoch": 0.5190527065527065, + "grad_norm": 0.5047736763954163, + "learning_rate": 0.00019190130024685818, + "loss": 1.171, + "step": 2915 + }, + { + "epoch": 0.5192307692307693, + "grad_norm": 0.5221177935600281, + "learning_rate": 0.00019189578116202307, + "loss": 1.0256, + "step": 2916 + }, + { + "epoch": 0.5194088319088319, + "grad_norm": 0.4782322943210602, + "learning_rate": 0.00019189026027668105, + "loss": 0.8598, + "step": 2917 + }, + { + "epoch": 0.5195868945868946, + "grad_norm": 0.5627185702323914, + "learning_rate": 0.00019188473759094022, + "loss": 1.1825, + "step": 2918 + }, + { + "epoch": 0.5197649572649573, + "grad_norm": 0.5036423206329346, + "learning_rate": 0.00019187921310490888, + "loss": 1.0881, + "step": 2919 + }, + { + "epoch": 0.51994301994302, + "grad_norm": 0.4271143972873688, + "learning_rate": 0.0001918736868186952, + "loss": 0.9265, + "step": 2920 + }, + { + "epoch": 0.5201210826210826, + "grad_norm": 0.5427432656288147, + "learning_rate": 0.00019186815873240747, + "loss": 1.196, + "step": 2921 + }, + { + "epoch": 0.5202991452991453, + "grad_norm": 0.5494198203086853, + "learning_rate": 0.00019186262884615402, + "loss": 1.1207, + "step": 2922 + }, + { + "epoch": 0.520477207977208, + "grad_norm": 0.5305119752883911, + "learning_rate": 0.0001918570971600432, + "loss": 1.0393, + "step": 2923 + }, + { + "epoch": 0.5206552706552706, + "grad_norm": 0.46713170409202576, + "learning_rate": 0.00019185156367418333, + "loss": 0.9583, + "step": 2924 + }, + { + "epoch": 0.5208333333333334, + "grad_norm": 0.597776472568512, + "learning_rate": 0.00019184602838868292, + "loss": 1.2978, + "step": 2925 + }, + { + "epoch": 0.521011396011396, + "grad_norm": 
0.520976722240448, + "learning_rate": 0.00019184049130365036, + "loss": 1.0515, + "step": 2926 + }, + { + "epoch": 0.5211894586894587, + "grad_norm": 0.5266290307044983, + "learning_rate": 0.00019183495241919415, + "loss": 1.0437, + "step": 2927 + }, + { + "epoch": 0.5213675213675214, + "grad_norm": 0.50911545753479, + "learning_rate": 0.00019182941173542285, + "loss": 0.9977, + "step": 2928 + }, + { + "epoch": 0.521545584045584, + "grad_norm": 0.4924670457839966, + "learning_rate": 0.00019182386925244496, + "loss": 0.9309, + "step": 2929 + }, + { + "epoch": 0.5217236467236467, + "grad_norm": 0.4979301393032074, + "learning_rate": 0.00019181832497036912, + "loss": 0.87, + "step": 2930 + }, + { + "epoch": 0.5219017094017094, + "grad_norm": 0.6307916045188904, + "learning_rate": 0.0001918127788893039, + "loss": 1.2159, + "step": 2931 + }, + { + "epoch": 0.5220797720797721, + "grad_norm": 0.4915660619735718, + "learning_rate": 0.00019180723100935802, + "loss": 1.0828, + "step": 2932 + }, + { + "epoch": 0.5222578347578347, + "grad_norm": 0.4312742352485657, + "learning_rate": 0.00019180168133064017, + "loss": 1.0496, + "step": 2933 + }, + { + "epoch": 0.5224358974358975, + "grad_norm": 0.6006124019622803, + "learning_rate": 0.00019179612985325908, + "loss": 1.0751, + "step": 2934 + }, + { + "epoch": 0.5226139601139601, + "grad_norm": 0.5332220196723938, + "learning_rate": 0.0001917905765773235, + "loss": 1.2601, + "step": 2935 + }, + { + "epoch": 0.5227920227920227, + "grad_norm": 0.4877954423427582, + "learning_rate": 0.00019178502150294223, + "loss": 1.2279, + "step": 2936 + }, + { + "epoch": 0.5229700854700855, + "grad_norm": 0.5975968837738037, + "learning_rate": 0.00019177946463022418, + "loss": 1.3371, + "step": 2937 + }, + { + "epoch": 0.5231481481481481, + "grad_norm": 0.5363923907279968, + "learning_rate": 0.00019177390595927815, + "loss": 1.0705, + "step": 2938 + }, + { + "epoch": 0.5233262108262108, + "grad_norm": 0.4314909875392914, + "learning_rate": 
0.0001917683454902131, + "loss": 0.9172, + "step": 2939 + }, + { + "epoch": 0.5235042735042735, + "grad_norm": 0.46187883615493774, + "learning_rate": 0.0001917627832231379, + "loss": 1.1201, + "step": 2940 + }, + { + "epoch": 0.5236823361823362, + "grad_norm": 0.4648260772228241, + "learning_rate": 0.00019175721915816162, + "loss": 1.1307, + "step": 2941 + }, + { + "epoch": 0.5238603988603988, + "grad_norm": 0.4427165687084198, + "learning_rate": 0.00019175165329539325, + "loss": 0.9459, + "step": 2942 + }, + { + "epoch": 0.5240384615384616, + "grad_norm": 0.4645056128501892, + "learning_rate": 0.0001917460856349418, + "loss": 0.9176, + "step": 2943 + }, + { + "epoch": 0.5242165242165242, + "grad_norm": 0.4939568042755127, + "learning_rate": 0.0001917405161769164, + "loss": 1.1056, + "step": 2944 + }, + { + "epoch": 0.5243945868945868, + "grad_norm": 0.6057310104370117, + "learning_rate": 0.00019173494492142617, + "loss": 1.2714, + "step": 2945 + }, + { + "epoch": 0.5245726495726496, + "grad_norm": 0.5038546323776245, + "learning_rate": 0.00019172937186858025, + "loss": 0.911, + "step": 2946 + }, + { + "epoch": 0.5247507122507122, + "grad_norm": 0.5521321296691895, + "learning_rate": 0.00019172379701848784, + "loss": 1.0781, + "step": 2947 + }, + { + "epoch": 0.5249287749287749, + "grad_norm": 0.516979455947876, + "learning_rate": 0.00019171822037125817, + "loss": 1.1051, + "step": 2948 + }, + { + "epoch": 0.5251068376068376, + "grad_norm": 0.5443150997161865, + "learning_rate": 0.0001917126419270005, + "loss": 1.0802, + "step": 2949 + }, + { + "epoch": 0.5252849002849003, + "grad_norm": 0.5373311042785645, + "learning_rate": 0.00019170706168582412, + "loss": 0.9313, + "step": 2950 + }, + { + "epoch": 0.5254629629629629, + "grad_norm": 0.7511917948722839, + "learning_rate": 0.0001917014796478384, + "loss": 1.1958, + "step": 2951 + }, + { + "epoch": 0.5256410256410257, + "grad_norm": 0.49893468618392944, + "learning_rate": 0.00019169589581315263, + "loss": 0.9387, 
+ "step": 2952 + }, + { + "epoch": 0.5258190883190883, + "grad_norm": 0.48010289669036865, + "learning_rate": 0.00019169031018187628, + "loss": 1.2459, + "step": 2953 + }, + { + "epoch": 0.5259971509971509, + "grad_norm": 0.48768678307533264, + "learning_rate": 0.0001916847227541188, + "loss": 1.0127, + "step": 2954 + }, + { + "epoch": 0.5261752136752137, + "grad_norm": 0.5973068475723267, + "learning_rate": 0.00019167913352998963, + "loss": 1.1685, + "step": 2955 + }, + { + "epoch": 0.5263532763532763, + "grad_norm": 0.5567806959152222, + "learning_rate": 0.00019167354250959826, + "loss": 1.142, + "step": 2956 + }, + { + "epoch": 0.5265313390313391, + "grad_norm": 0.47819700837135315, + "learning_rate": 0.00019166794969305428, + "loss": 0.712, + "step": 2957 + }, + { + "epoch": 0.5267094017094017, + "grad_norm": 0.5191744565963745, + "learning_rate": 0.00019166235508046725, + "loss": 1.2208, + "step": 2958 + }, + { + "epoch": 0.5268874643874644, + "grad_norm": 0.4987856149673462, + "learning_rate": 0.00019165675867194675, + "loss": 1.0466, + "step": 2959 + }, + { + "epoch": 0.5270655270655271, + "grad_norm": 0.5017665028572083, + "learning_rate": 0.0001916511604676025, + "loss": 1.1236, + "step": 2960 + }, + { + "epoch": 0.5272435897435898, + "grad_norm": 0.5115348696708679, + "learning_rate": 0.00019164556046754415, + "loss": 1.1497, + "step": 2961 + }, + { + "epoch": 0.5274216524216524, + "grad_norm": 0.4934345781803131, + "learning_rate": 0.0001916399586718814, + "loss": 1.0183, + "step": 2962 + }, + { + "epoch": 0.5275997150997151, + "grad_norm": 0.5033719539642334, + "learning_rate": 0.00019163435508072404, + "loss": 1.0256, + "step": 2963 + }, + { + "epoch": 0.5277777777777778, + "grad_norm": 0.5325372219085693, + "learning_rate": 0.00019162874969418184, + "loss": 1.1384, + "step": 2964 + }, + { + "epoch": 0.5279558404558404, + "grad_norm": 0.4901772141456604, + "learning_rate": 0.00019162314251236465, + "loss": 1.0831, + "step": 2965 + }, + { + "epoch": 
0.5281339031339032, + "grad_norm": 0.4743805229663849, + "learning_rate": 0.0001916175335353823, + "loss": 1.1894, + "step": 2966 + }, + { + "epoch": 0.5283119658119658, + "grad_norm": 0.5439450740814209, + "learning_rate": 0.00019161192276334466, + "loss": 1.2066, + "step": 2967 + }, + { + "epoch": 0.5284900284900285, + "grad_norm": 0.5123090744018555, + "learning_rate": 0.00019160631019636174, + "loss": 1.1829, + "step": 2968 + }, + { + "epoch": 0.5286680911680912, + "grad_norm": 0.5995343923568726, + "learning_rate": 0.00019160069583454346, + "loss": 1.4872, + "step": 2969 + }, + { + "epoch": 0.5288461538461539, + "grad_norm": 0.4596657156944275, + "learning_rate": 0.00019159507967799985, + "loss": 0.8948, + "step": 2970 + }, + { + "epoch": 0.5290242165242165, + "grad_norm": 0.5533682107925415, + "learning_rate": 0.0001915894617268409, + "loss": 1.1779, + "step": 2971 + }, + { + "epoch": 0.5292022792022792, + "grad_norm": 0.3860718309879303, + "learning_rate": 0.00019158384198117673, + "loss": 0.6424, + "step": 2972 + }, + { + "epoch": 0.5293803418803419, + "grad_norm": 0.47424063086509705, + "learning_rate": 0.0001915782204411174, + "loss": 1.1592, + "step": 2973 + }, + { + "epoch": 0.5295584045584045, + "grad_norm": 0.5050228834152222, + "learning_rate": 0.00019157259710677309, + "loss": 1.1971, + "step": 2974 + }, + { + "epoch": 0.5297364672364673, + "grad_norm": 0.6080113649368286, + "learning_rate": 0.00019156697197825396, + "loss": 1.1511, + "step": 2975 + }, + { + "epoch": 0.5299145299145299, + "grad_norm": 0.4805932641029358, + "learning_rate": 0.00019156134505567024, + "loss": 1.1033, + "step": 2976 + }, + { + "epoch": 0.5300925925925926, + "grad_norm": 0.4835345447063446, + "learning_rate": 0.00019155571633913215, + "loss": 1.1832, + "step": 2977 + }, + { + "epoch": 0.5302706552706553, + "grad_norm": 0.5183725953102112, + "learning_rate": 0.00019155008582875, + "loss": 0.9221, + "step": 2978 + }, + { + "epoch": 0.530448717948718, + "grad_norm": 
0.48015761375427246, + "learning_rate": 0.00019154445352463412, + "loss": 1.045, + "step": 2979 + }, + { + "epoch": 0.5306267806267806, + "grad_norm": 0.4670043885707855, + "learning_rate": 0.0001915388194268948, + "loss": 0.9025, + "step": 2980 + }, + { + "epoch": 0.5308048433048433, + "grad_norm": 0.5048824548721313, + "learning_rate": 0.0001915331835356425, + "loss": 1.0681, + "step": 2981 + }, + { + "epoch": 0.530982905982906, + "grad_norm": 0.4785633981227875, + "learning_rate": 0.00019152754585098758, + "loss": 1.0097, + "step": 2982 + }, + { + "epoch": 0.5311609686609686, + "grad_norm": 0.4829573333263397, + "learning_rate": 0.00019152190637304056, + "loss": 1.0856, + "step": 2983 + }, + { + "epoch": 0.5313390313390314, + "grad_norm": 0.5425563454627991, + "learning_rate": 0.00019151626510191189, + "loss": 1.2313, + "step": 2984 + }, + { + "epoch": 0.531517094017094, + "grad_norm": 0.5532251596450806, + "learning_rate": 0.0001915106220377121, + "loss": 1.0328, + "step": 2985 + }, + { + "epoch": 0.5316951566951567, + "grad_norm": 0.47016972303390503, + "learning_rate": 0.0001915049771805518, + "loss": 1.2003, + "step": 2986 + }, + { + "epoch": 0.5318732193732194, + "grad_norm": 0.5241743326187134, + "learning_rate": 0.00019149933053054153, + "loss": 1.046, + "step": 2987 + }, + { + "epoch": 0.532051282051282, + "grad_norm": 0.5043526887893677, + "learning_rate": 0.00019149368208779197, + "loss": 1.0022, + "step": 2988 + }, + { + "epoch": 0.5322293447293447, + "grad_norm": 0.5563312768936157, + "learning_rate": 0.00019148803185241374, + "loss": 1.1017, + "step": 2989 + }, + { + "epoch": 0.5324074074074074, + "grad_norm": 0.5414231419563293, + "learning_rate": 0.00019148237982451763, + "loss": 0.9649, + "step": 2990 + }, + { + "epoch": 0.5325854700854701, + "grad_norm": 0.5452231764793396, + "learning_rate": 0.0001914767260042143, + "loss": 1.2281, + "step": 2991 + }, + { + "epoch": 0.5327635327635327, + "grad_norm": 0.5500698685646057, + "learning_rate": 
0.00019147107039161454, + "loss": 1.2865, + "step": 2992 + }, + { + "epoch": 0.5329415954415955, + "grad_norm": 0.49747416377067566, + "learning_rate": 0.00019146541298682918, + "loss": 1.1296, + "step": 2993 + }, + { + "epoch": 0.5331196581196581, + "grad_norm": 0.5684167742729187, + "learning_rate": 0.00019145975378996903, + "loss": 1.0685, + "step": 2994 + }, + { + "epoch": 0.5332977207977208, + "grad_norm": 0.5411235690116882, + "learning_rate": 0.00019145409280114502, + "loss": 1.1372, + "step": 2995 + }, + { + "epoch": 0.5334757834757835, + "grad_norm": 0.5006675720214844, + "learning_rate": 0.00019144843002046806, + "loss": 1.0688, + "step": 2996 + }, + { + "epoch": 0.5336538461538461, + "grad_norm": 0.4591315686702728, + "learning_rate": 0.00019144276544804908, + "loss": 1.1071, + "step": 2997 + }, + { + "epoch": 0.5338319088319088, + "grad_norm": 0.5615306496620178, + "learning_rate": 0.000191437099083999, + "loss": 1.1033, + "step": 2998 + }, + { + "epoch": 0.5340099715099715, + "grad_norm": 0.4986817240715027, + "learning_rate": 0.00019143143092842897, + "loss": 1.176, + "step": 2999 + }, + { + "epoch": 0.5341880341880342, + "grad_norm": 0.5017120242118835, + "learning_rate": 0.00019142576098144995, + "loss": 1.0174, + "step": 3000 + }, + { + "epoch": 0.5343660968660968, + "grad_norm": 0.508298397064209, + "learning_rate": 0.0001914200892431731, + "loss": 1.164, + "step": 3001 + }, + { + "epoch": 0.5345441595441596, + "grad_norm": 0.48068809509277344, + "learning_rate": 0.0001914144157137095, + "loss": 0.7959, + "step": 3002 + }, + { + "epoch": 0.5347222222222222, + "grad_norm": 0.6347028017044067, + "learning_rate": 0.0001914087403931703, + "loss": 1.1727, + "step": 3003 + }, + { + "epoch": 0.5349002849002849, + "grad_norm": 0.5558401942253113, + "learning_rate": 0.00019140306328166676, + "loss": 1.2282, + "step": 3004 + }, + { + "epoch": 0.5350783475783476, + "grad_norm": 0.5093596577644348, + "learning_rate": 0.00019139738437931004, + "loss": 1.3258, 
+ "step": 3005 + }, + { + "epoch": 0.5352564102564102, + "grad_norm": 0.4653106927871704, + "learning_rate": 0.0001913917036862114, + "loss": 1.1062, + "step": 3006 + }, + { + "epoch": 0.5354344729344729, + "grad_norm": 0.48085781931877136, + "learning_rate": 0.00019138602120248222, + "loss": 0.9019, + "step": 3007 + }, + { + "epoch": 0.5356125356125356, + "grad_norm": 0.5174745321273804, + "learning_rate": 0.0001913803369282338, + "loss": 1.044, + "step": 3008 + }, + { + "epoch": 0.5357905982905983, + "grad_norm": 0.5359669327735901, + "learning_rate": 0.00019137465086357746, + "loss": 1.0723, + "step": 3009 + }, + { + "epoch": 0.5359686609686609, + "grad_norm": 0.5583470463752747, + "learning_rate": 0.00019136896300862467, + "loss": 1.2192, + "step": 3010 + }, + { + "epoch": 0.5361467236467237, + "grad_norm": 0.4905693829059601, + "learning_rate": 0.00019136327336348688, + "loss": 1.2372, + "step": 3011 + }, + { + "epoch": 0.5363247863247863, + "grad_norm": 0.5741264820098877, + "learning_rate": 0.0001913575819282755, + "loss": 1.1703, + "step": 3012 + }, + { + "epoch": 0.5365028490028491, + "grad_norm": 0.577033281326294, + "learning_rate": 0.0001913518887031021, + "loss": 1.1555, + "step": 3013 + }, + { + "epoch": 0.5366809116809117, + "grad_norm": 0.46795153617858887, + "learning_rate": 0.00019134619368807822, + "loss": 0.8583, + "step": 3014 + }, + { + "epoch": 0.5368589743589743, + "grad_norm": 0.5973345637321472, + "learning_rate": 0.0001913404968833154, + "loss": 1.1509, + "step": 3015 + }, + { + "epoch": 0.5370370370370371, + "grad_norm": 0.62020343542099, + "learning_rate": 0.00019133479828892531, + "loss": 1.0781, + "step": 3016 + }, + { + "epoch": 0.5372150997150997, + "grad_norm": 0.5342286229133606, + "learning_rate": 0.00019132909790501958, + "loss": 1.1556, + "step": 3017 + }, + { + "epoch": 0.5373931623931624, + "grad_norm": 0.49612846970558167, + "learning_rate": 0.0001913233957317099, + "loss": 0.9027, + "step": 3018 + }, + { + "epoch": 
0.5375712250712251, + "grad_norm": 0.5403908491134644, + "learning_rate": 0.00019131769176910796, + "loss": 1.1125, + "step": 3019 + }, + { + "epoch": 0.5377492877492878, + "grad_norm": 0.4952050447463989, + "learning_rate": 0.0001913119860173256, + "loss": 1.2329, + "step": 3020 + }, + { + "epoch": 0.5379273504273504, + "grad_norm": 0.5877819657325745, + "learning_rate": 0.0001913062784764745, + "loss": 1.2855, + "step": 3021 + }, + { + "epoch": 0.5381054131054132, + "grad_norm": 0.49312907457351685, + "learning_rate": 0.00019130056914666655, + "loss": 1.0212, + "step": 3022 + }, + { + "epoch": 0.5382834757834758, + "grad_norm": 0.45544490218162537, + "learning_rate": 0.00019129485802801366, + "loss": 0.9748, + "step": 3023 + }, + { + "epoch": 0.5384615384615384, + "grad_norm": 0.5535242557525635, + "learning_rate": 0.00019128914512062762, + "loss": 1.2134, + "step": 3024 + }, + { + "epoch": 0.5386396011396012, + "grad_norm": 0.45369696617126465, + "learning_rate": 0.00019128343042462044, + "loss": 0.9964, + "step": 3025 + }, + { + "epoch": 0.5388176638176638, + "grad_norm": 0.6240725517272949, + "learning_rate": 0.00019127771394010406, + "loss": 1.425, + "step": 3026 + }, + { + "epoch": 0.5389957264957265, + "grad_norm": 0.4859573245048523, + "learning_rate": 0.0001912719956671905, + "loss": 1.087, + "step": 3027 + }, + { + "epoch": 0.5391737891737892, + "grad_norm": 0.47529762983322144, + "learning_rate": 0.0001912662756059918, + "loss": 0.9517, + "step": 3028 + }, + { + "epoch": 0.5393518518518519, + "grad_norm": 0.5317288637161255, + "learning_rate": 0.00019126055375661997, + "loss": 1.0945, + "step": 3029 + }, + { + "epoch": 0.5395299145299145, + "grad_norm": 0.55974280834198, + "learning_rate": 0.00019125483011918722, + "loss": 1.0794, + "step": 3030 + }, + { + "epoch": 0.5397079772079773, + "grad_norm": 0.48579123616218567, + "learning_rate": 0.0001912491046938056, + "loss": 1.1421, + "step": 3031 + }, + { + "epoch": 0.5398860398860399, + "grad_norm": 
0.4917181134223938, + "learning_rate": 0.00019124337748058733, + "loss": 0.9708, + "step": 3032 + }, + { + "epoch": 0.5400641025641025, + "grad_norm": 0.525291383266449, + "learning_rate": 0.00019123764847964466, + "loss": 1.064, + "step": 3033 + }, + { + "epoch": 0.5402421652421653, + "grad_norm": 0.5733301639556885, + "learning_rate": 0.00019123191769108977, + "loss": 1.2142, + "step": 3034 + }, + { + "epoch": 0.5404202279202279, + "grad_norm": 0.5400987863540649, + "learning_rate": 0.00019122618511503494, + "loss": 1.1309, + "step": 3035 + }, + { + "epoch": 0.5405982905982906, + "grad_norm": 0.6261051893234253, + "learning_rate": 0.00019122045075159257, + "loss": 1.2112, + "step": 3036 + }, + { + "epoch": 0.5407763532763533, + "grad_norm": 0.5483576059341431, + "learning_rate": 0.0001912147146008749, + "loss": 1.2705, + "step": 3037 + }, + { + "epoch": 0.540954415954416, + "grad_norm": 0.5442137122154236, + "learning_rate": 0.00019120897666299443, + "loss": 1.2512, + "step": 3038 + }, + { + "epoch": 0.5411324786324786, + "grad_norm": 0.5680811405181885, + "learning_rate": 0.00019120323693806355, + "loss": 1.392, + "step": 3039 + }, + { + "epoch": 0.5413105413105413, + "grad_norm": 0.5237287878990173, + "learning_rate": 0.00019119749542619466, + "loss": 1.1599, + "step": 3040 + }, + { + "epoch": 0.541488603988604, + "grad_norm": 0.48119300603866577, + "learning_rate": 0.00019119175212750032, + "loss": 1.0976, + "step": 3041 + }, + { + "epoch": 0.5416666666666666, + "grad_norm": 0.507033109664917, + "learning_rate": 0.00019118600704209302, + "loss": 1.0181, + "step": 3042 + }, + { + "epoch": 0.5418447293447294, + "grad_norm": 0.484672874212265, + "learning_rate": 0.00019118026017008531, + "loss": 1.1636, + "step": 3043 + }, + { + "epoch": 0.542022792022792, + "grad_norm": 0.4923502206802368, + "learning_rate": 0.00019117451151158985, + "loss": 1.0388, + "step": 3044 + }, + { + "epoch": 0.5422008547008547, + "grad_norm": 0.4882057309150696, + "learning_rate": 
0.00019116876106671922, + "loss": 1.131, + "step": 3045 + }, + { + "epoch": 0.5423789173789174, + "grad_norm": 0.6068355441093445, + "learning_rate": 0.0001911630088355861, + "loss": 1.3218, + "step": 3046 + }, + { + "epoch": 0.54255698005698, + "grad_norm": 0.5012881755828857, + "learning_rate": 0.0001911572548183032, + "loss": 1.0514, + "step": 3047 + }, + { + "epoch": 0.5427350427350427, + "grad_norm": 0.49849793314933777, + "learning_rate": 0.00019115149901498328, + "loss": 1.0003, + "step": 3048 + }, + { + "epoch": 0.5429131054131054, + "grad_norm": 0.4934251010417938, + "learning_rate": 0.00019114574142573904, + "loss": 1.0319, + "step": 3049 + }, + { + "epoch": 0.5430911680911681, + "grad_norm": 0.4947762191295624, + "learning_rate": 0.00019113998205068334, + "loss": 1.0906, + "step": 3050 + }, + { + "epoch": 0.5432692307692307, + "grad_norm": 0.5449416041374207, + "learning_rate": 0.00019113422088992907, + "loss": 0.9093, + "step": 3051 + }, + { + "epoch": 0.5434472934472935, + "grad_norm": 0.49395284056663513, + "learning_rate": 0.00019112845794358902, + "loss": 1.0071, + "step": 3052 + }, + { + "epoch": 0.5436253561253561, + "grad_norm": 0.5478728413581848, + "learning_rate": 0.00019112269321177613, + "loss": 1.2124, + "step": 3053 + }, + { + "epoch": 0.5438034188034188, + "grad_norm": 0.6205173134803772, + "learning_rate": 0.0001911169266946034, + "loss": 1.021, + "step": 3054 + }, + { + "epoch": 0.5439814814814815, + "grad_norm": 0.4777783751487732, + "learning_rate": 0.00019111115839218372, + "loss": 0.9192, + "step": 3055 + }, + { + "epoch": 0.5441595441595442, + "grad_norm": 0.5541689991950989, + "learning_rate": 0.00019110538830463018, + "loss": 1.1248, + "step": 3056 + }, + { + "epoch": 0.5443376068376068, + "grad_norm": 0.4750942289829254, + "learning_rate": 0.0001910996164320558, + "loss": 1.3147, + "step": 3057 + }, + { + "epoch": 0.5445156695156695, + "grad_norm": 0.6283948421478271, + "learning_rate": 0.0001910938427745737, + "loss": 1.0919, + 
"step": 3058 + }, + { + "epoch": 0.5446937321937322, + "grad_norm": 0.552725076675415, + "learning_rate": 0.00019108806733229698, + "loss": 1.3807, + "step": 3059 + }, + { + "epoch": 0.5448717948717948, + "grad_norm": 0.4832848310470581, + "learning_rate": 0.0001910822901053388, + "loss": 1.0705, + "step": 3060 + }, + { + "epoch": 0.5450498575498576, + "grad_norm": 0.6468375325202942, + "learning_rate": 0.00019107651109381233, + "loss": 1.0766, + "step": 3061 + }, + { + "epoch": 0.5452279202279202, + "grad_norm": 0.5464920401573181, + "learning_rate": 0.00019107073029783083, + "loss": 1.0453, + "step": 3062 + }, + { + "epoch": 0.5454059829059829, + "grad_norm": 0.5321210026741028, + "learning_rate": 0.0001910649477175076, + "loss": 1.2326, + "step": 3063 + }, + { + "epoch": 0.5455840455840456, + "grad_norm": 0.5572962164878845, + "learning_rate": 0.00019105916335295582, + "loss": 1.0673, + "step": 3064 + }, + { + "epoch": 0.5457621082621082, + "grad_norm": 0.5239177942276001, + "learning_rate": 0.00019105337720428894, + "loss": 1.04, + "step": 3065 + }, + { + "epoch": 0.5459401709401709, + "grad_norm": 0.5633319616317749, + "learning_rate": 0.00019104758927162023, + "loss": 0.9606, + "step": 3066 + }, + { + "epoch": 0.5461182336182336, + "grad_norm": 0.5317914485931396, + "learning_rate": 0.0001910417995550632, + "loss": 1.0651, + "step": 3067 + }, + { + "epoch": 0.5462962962962963, + "grad_norm": 0.5126453638076782, + "learning_rate": 0.00019103600805473118, + "loss": 1.0316, + "step": 3068 + }, + { + "epoch": 0.5464743589743589, + "grad_norm": 0.5262107253074646, + "learning_rate": 0.00019103021477073773, + "loss": 1.0752, + "step": 3069 + }, + { + "epoch": 0.5466524216524217, + "grad_norm": 0.5384877324104309, + "learning_rate": 0.0001910244197031963, + "loss": 1.1731, + "step": 3070 + }, + { + "epoch": 0.5468304843304843, + "grad_norm": 0.5126553773880005, + "learning_rate": 0.00019101862285222048, + "loss": 1.2229, + "step": 3071 + }, + { + "epoch": 
0.5470085470085471, + "grad_norm": 0.4841194450855255, + "learning_rate": 0.0001910128242179238, + "loss": 0.9955, + "step": 3072 + }, + { + "epoch": 0.5471866096866097, + "grad_norm": 0.526546061038971, + "learning_rate": 0.00019100702380041987, + "loss": 1.2436, + "step": 3073 + }, + { + "epoch": 0.5473646723646723, + "grad_norm": 0.5085833072662354, + "learning_rate": 0.0001910012215998224, + "loss": 1.011, + "step": 3074 + }, + { + "epoch": 0.5475427350427351, + "grad_norm": 0.5149994492530823, + "learning_rate": 0.000190995417616245, + "loss": 0.8632, + "step": 3075 + }, + { + "epoch": 0.5477207977207977, + "grad_norm": 0.48079630732536316, + "learning_rate": 0.00019098961184980145, + "loss": 1.1115, + "step": 3076 + }, + { + "epoch": 0.5478988603988604, + "grad_norm": 0.5769477486610413, + "learning_rate": 0.00019098380430060546, + "loss": 0.9544, + "step": 3077 + }, + { + "epoch": 0.5480769230769231, + "grad_norm": 0.5260093808174133, + "learning_rate": 0.0001909779949687708, + "loss": 1.2354, + "step": 3078 + }, + { + "epoch": 0.5482549857549858, + "grad_norm": 0.5518734455108643, + "learning_rate": 0.00019097218385441135, + "loss": 1.1944, + "step": 3079 + }, + { + "epoch": 0.5484330484330484, + "grad_norm": 0.5436808466911316, + "learning_rate": 0.00019096637095764095, + "loss": 1.0717, + "step": 3080 + }, + { + "epoch": 0.5486111111111112, + "grad_norm": 0.4749584197998047, + "learning_rate": 0.00019096055627857344, + "loss": 1.0417, + "step": 3081 + }, + { + "epoch": 0.5487891737891738, + "grad_norm": 0.5485591292381287, + "learning_rate": 0.0001909547398173228, + "loss": 1.2515, + "step": 3082 + }, + { + "epoch": 0.5489672364672364, + "grad_norm": 0.5751016736030579, + "learning_rate": 0.00019094892157400296, + "loss": 1.2112, + "step": 3083 + }, + { + "epoch": 0.5491452991452992, + "grad_norm": 0.5404475331306458, + "learning_rate": 0.00019094310154872795, + "loss": 0.4334, + "step": 3084 + }, + { + "epoch": 0.5493233618233618, + "grad_norm": 
0.5198020935058594, + "learning_rate": 0.00019093727974161178, + "loss": 0.9759, + "step": 3085 + }, + { + "epoch": 0.5495014245014245, + "grad_norm": 0.4893439710140228, + "learning_rate": 0.0001909314561527685, + "loss": 1.1287, + "step": 3086 + }, + { + "epoch": 0.5496794871794872, + "grad_norm": 0.5675956606864929, + "learning_rate": 0.00019092563078231228, + "loss": 1.234, + "step": 3087 + }, + { + "epoch": 0.5498575498575499, + "grad_norm": 0.5539132356643677, + "learning_rate": 0.00019091980363035714, + "loss": 1.2378, + "step": 3088 + }, + { + "epoch": 0.5500356125356125, + "grad_norm": 0.5194353461265564, + "learning_rate": 0.00019091397469701735, + "loss": 1.1338, + "step": 3089 + }, + { + "epoch": 0.5502136752136753, + "grad_norm": 0.5143756866455078, + "learning_rate": 0.0001909081439824071, + "loss": 0.9118, + "step": 3090 + }, + { + "epoch": 0.5503917378917379, + "grad_norm": 0.5624327659606934, + "learning_rate": 0.0001909023114866406, + "loss": 1.035, + "step": 3091 + }, + { + "epoch": 0.5505698005698005, + "grad_norm": 0.5285067558288574, + "learning_rate": 0.0001908964772098321, + "loss": 1.0451, + "step": 3092 + }, + { + "epoch": 0.5507478632478633, + "grad_norm": 0.5730587244033813, + "learning_rate": 0.000190890641152096, + "loss": 1.0672, + "step": 3093 + }, + { + "epoch": 0.5509259259259259, + "grad_norm": 0.5822951197624207, + "learning_rate": 0.0001908848033135466, + "loss": 1.1791, + "step": 3094 + }, + { + "epoch": 0.5511039886039886, + "grad_norm": 0.596161961555481, + "learning_rate": 0.00019087896369429826, + "loss": 1.0954, + "step": 3095 + }, + { + "epoch": 0.5512820512820513, + "grad_norm": 0.5138190984725952, + "learning_rate": 0.00019087312229446542, + "loss": 0.896, + "step": 3096 + }, + { + "epoch": 0.551460113960114, + "grad_norm": 0.5061872601509094, + "learning_rate": 0.0001908672791141625, + "loss": 1.1017, + "step": 3097 + }, + { + "epoch": 0.5516381766381766, + "grad_norm": 0.5189547538757324, + "learning_rate": 
0.00019086143415350404, + "loss": 1.2906, + "step": 3098 + }, + { + "epoch": 0.5518162393162394, + "grad_norm": 0.5640039443969727, + "learning_rate": 0.00019085558741260448, + "loss": 1.1001, + "step": 3099 + }, + { + "epoch": 0.551994301994302, + "grad_norm": 0.453867107629776, + "learning_rate": 0.00019084973889157844, + "loss": 0.9731, + "step": 3100 + }, + { + "epoch": 0.5521723646723646, + "grad_norm": 0.5431303977966309, + "learning_rate": 0.0001908438885905405, + "loss": 1.3511, + "step": 3101 + }, + { + "epoch": 0.5523504273504274, + "grad_norm": 0.47693368792533875, + "learning_rate": 0.00019083803650960527, + "loss": 1.0426, + "step": 3102 + }, + { + "epoch": 0.55252849002849, + "grad_norm": 0.4663422703742981, + "learning_rate": 0.00019083218264888743, + "loss": 1.05, + "step": 3103 + }, + { + "epoch": 0.5527065527065527, + "grad_norm": 0.561354398727417, + "learning_rate": 0.00019082632700850164, + "loss": 0.9608, + "step": 3104 + }, + { + "epoch": 0.5528846153846154, + "grad_norm": 0.4981916844844818, + "learning_rate": 0.00019082046958856266, + "loss": 1.1935, + "step": 3105 + }, + { + "epoch": 0.5530626780626781, + "grad_norm": 0.5301326513290405, + "learning_rate": 0.0001908146103891852, + "loss": 1.0646, + "step": 3106 + }, + { + "epoch": 0.5532407407407407, + "grad_norm": 0.5023610591888428, + "learning_rate": 0.00019080874941048416, + "loss": 1.127, + "step": 3107 + }, + { + "epoch": 0.5534188034188035, + "grad_norm": 0.5172514319419861, + "learning_rate": 0.00019080288665257426, + "loss": 1.0435, + "step": 3108 + }, + { + "epoch": 0.5535968660968661, + "grad_norm": 0.6340598464012146, + "learning_rate": 0.00019079702211557048, + "loss": 1.3528, + "step": 3109 + }, + { + "epoch": 0.5537749287749287, + "grad_norm": 0.46882256865501404, + "learning_rate": 0.0001907911557995876, + "loss": 1.1361, + "step": 3110 + }, + { + "epoch": 0.5539529914529915, + "grad_norm": 0.6401382088661194, + "learning_rate": 0.00019078528770474068, + "loss": 1.2415, + 
"step": 3111 + }, + { + "epoch": 0.5541310541310541, + "grad_norm": 0.5141328573226929, + "learning_rate": 0.00019077941783114463, + "loss": 1.0505, + "step": 3112 + }, + { + "epoch": 0.5543091168091168, + "grad_norm": 0.522318959236145, + "learning_rate": 0.00019077354617891444, + "loss": 1.0964, + "step": 3113 + }, + { + "epoch": 0.5544871794871795, + "grad_norm": 0.539551854133606, + "learning_rate": 0.00019076767274816517, + "loss": 1.0735, + "step": 3114 + }, + { + "epoch": 0.5546652421652422, + "grad_norm": 0.495320200920105, + "learning_rate": 0.00019076179753901195, + "loss": 0.9754, + "step": 3115 + }, + { + "epoch": 0.5548433048433048, + "grad_norm": 0.5499199628829956, + "learning_rate": 0.00019075592055156984, + "loss": 1.0043, + "step": 3116 + }, + { + "epoch": 0.5550213675213675, + "grad_norm": 0.5352509617805481, + "learning_rate": 0.00019075004178595396, + "loss": 1.1701, + "step": 3117 + }, + { + "epoch": 0.5551994301994302, + "grad_norm": 0.5392300486564636, + "learning_rate": 0.00019074416124227953, + "loss": 1.1612, + "step": 3118 + }, + { + "epoch": 0.5553774928774928, + "grad_norm": 0.5195050835609436, + "learning_rate": 0.0001907382789206618, + "loss": 1.0934, + "step": 3119 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 0.5276884436607361, + "learning_rate": 0.000190732394821216, + "loss": 0.9011, + "step": 3120 + }, + { + "epoch": 0.5557336182336182, + "grad_norm": 0.6115903258323669, + "learning_rate": 0.00019072650894405734, + "loss": 1.3065, + "step": 3121 + }, + { + "epoch": 0.5559116809116809, + "grad_norm": 0.5752483010292053, + "learning_rate": 0.00019072062128930127, + "loss": 1.0063, + "step": 3122 + }, + { + "epoch": 0.5560897435897436, + "grad_norm": 0.5508273243904114, + "learning_rate": 0.00019071473185706302, + "loss": 1.2598, + "step": 3123 + }, + { + "epoch": 0.5562678062678063, + "grad_norm": 0.49712198972702026, + "learning_rate": 0.00019070884064745808, + "loss": 0.924, + "step": 3124 + }, + { + "epoch": 
0.5564458689458689, + "grad_norm": 0.572849452495575, + "learning_rate": 0.00019070294766060185, + "loss": 0.9683, + "step": 3125 + }, + { + "epoch": 0.5566239316239316, + "grad_norm": 0.4807920753955841, + "learning_rate": 0.00019069705289660976, + "loss": 1.0998, + "step": 3126 + }, + { + "epoch": 0.5568019943019943, + "grad_norm": 0.5543031096458435, + "learning_rate": 0.0001906911563555973, + "loss": 1.0878, + "step": 3127 + }, + { + "epoch": 0.5569800569800569, + "grad_norm": 0.5710418820381165, + "learning_rate": 0.00019068525803768007, + "loss": 1.0381, + "step": 3128 + }, + { + "epoch": 0.5571581196581197, + "grad_norm": 0.5169163346290588, + "learning_rate": 0.00019067935794297357, + "loss": 1.1149, + "step": 3129 + }, + { + "epoch": 0.5573361823361823, + "grad_norm": 0.6474376916885376, + "learning_rate": 0.00019067345607159345, + "loss": 0.9828, + "step": 3130 + }, + { + "epoch": 0.5575142450142451, + "grad_norm": 0.5029847621917725, + "learning_rate": 0.0001906675524236553, + "loss": 0.797, + "step": 3131 + }, + { + "epoch": 0.5576923076923077, + "grad_norm": 0.5681431293487549, + "learning_rate": 0.00019066164699927478, + "loss": 1.1565, + "step": 3132 + }, + { + "epoch": 0.5578703703703703, + "grad_norm": 0.5654549598693848, + "learning_rate": 0.00019065573979856764, + "loss": 1.2488, + "step": 3133 + }, + { + "epoch": 0.5580484330484331, + "grad_norm": 0.47653043270111084, + "learning_rate": 0.0001906498308216496, + "loss": 1.0428, + "step": 3134 + }, + { + "epoch": 0.5582264957264957, + "grad_norm": 0.5068467259407043, + "learning_rate": 0.00019064392006863643, + "loss": 0.9659, + "step": 3135 + }, + { + "epoch": 0.5584045584045584, + "grad_norm": 0.7076661586761475, + "learning_rate": 0.00019063800753964393, + "loss": 1.1289, + "step": 3136 + }, + { + "epoch": 0.5585826210826211, + "grad_norm": 0.551456868648529, + "learning_rate": 0.000190632093234788, + "loss": 1.1925, + "step": 3137 + }, + { + "epoch": 0.5587606837606838, + "grad_norm": 
0.518276035785675, + "learning_rate": 0.00019062617715418442, + "loss": 0.8681, + "step": 3138 + }, + { + "epoch": 0.5589387464387464, + "grad_norm": 0.5272278785705566, + "learning_rate": 0.0001906202592979492, + "loss": 1.0865, + "step": 3139 + }, + { + "epoch": 0.5591168091168092, + "grad_norm": 0.5344942212104797, + "learning_rate": 0.00019061433966619822, + "loss": 1.1647, + "step": 3140 + }, + { + "epoch": 0.5592948717948718, + "grad_norm": 0.5833460092544556, + "learning_rate": 0.00019060841825904753, + "loss": 1.3403, + "step": 3141 + }, + { + "epoch": 0.5594729344729344, + "grad_norm": 0.5707054734230042, + "learning_rate": 0.00019060249507661306, + "loss": 1.1236, + "step": 3142 + }, + { + "epoch": 0.5596509971509972, + "grad_norm": 0.5446065664291382, + "learning_rate": 0.00019059657011901094, + "loss": 1.017, + "step": 3143 + }, + { + "epoch": 0.5598290598290598, + "grad_norm": 0.5285109281539917, + "learning_rate": 0.0001905906433863572, + "loss": 1.3186, + "step": 3144 + }, + { + "epoch": 0.5600071225071225, + "grad_norm": 0.5308659672737122, + "learning_rate": 0.00019058471487876802, + "loss": 0.8464, + "step": 3145 + }, + { + "epoch": 0.5601851851851852, + "grad_norm": 0.5218054056167603, + "learning_rate": 0.00019057878459635948, + "loss": 1.0219, + "step": 3146 + }, + { + "epoch": 0.5603632478632479, + "grad_norm": 0.45067787170410156, + "learning_rate": 0.00019057285253924785, + "loss": 1.0364, + "step": 3147 + }, + { + "epoch": 0.5605413105413105, + "grad_norm": 0.4856041669845581, + "learning_rate": 0.0001905669187075493, + "loss": 1.1928, + "step": 3148 + }, + { + "epoch": 0.5607193732193733, + "grad_norm": 0.506912112236023, + "learning_rate": 0.00019056098310138016, + "loss": 1.119, + "step": 3149 + }, + { + "epoch": 0.5608974358974359, + "grad_norm": 0.49049463868141174, + "learning_rate": 0.00019055504572085662, + "loss": 1.2165, + "step": 3150 + }, + { + "epoch": 0.5610754985754985, + "grad_norm": 0.5250293612480164, + "learning_rate": 
0.0001905491065660951, + "loss": 1.1427, + "step": 3151 + }, + { + "epoch": 0.5612535612535613, + "grad_norm": 0.43438446521759033, + "learning_rate": 0.00019054316563721195, + "loss": 0.884, + "step": 3152 + }, + { + "epoch": 0.5614316239316239, + "grad_norm": 0.5386807918548584, + "learning_rate": 0.00019053722293432354, + "loss": 1.1494, + "step": 3153 + }, + { + "epoch": 0.5616096866096866, + "grad_norm": 0.5403809547424316, + "learning_rate": 0.00019053127845754632, + "loss": 1.1743, + "step": 3154 + }, + { + "epoch": 0.5617877492877493, + "grad_norm": 0.4759823977947235, + "learning_rate": 0.00019052533220699678, + "loss": 1.0716, + "step": 3155 + }, + { + "epoch": 0.561965811965812, + "grad_norm": 0.45332327485084534, + "learning_rate": 0.0001905193841827914, + "loss": 0.8405, + "step": 3156 + }, + { + "epoch": 0.5621438746438746, + "grad_norm": 0.5617053508758545, + "learning_rate": 0.00019051343438504671, + "loss": 1.0422, + "step": 3157 + }, + { + "epoch": 0.5623219373219374, + "grad_norm": 0.5088049173355103, + "learning_rate": 0.00019050748281387931, + "loss": 1.0067, + "step": 3158 + }, + { + "epoch": 0.5625, + "grad_norm": 0.5174484848976135, + "learning_rate": 0.00019050152946940578, + "loss": 1.0623, + "step": 3159 + }, + { + "epoch": 0.5626780626780626, + "grad_norm": 0.6093568801879883, + "learning_rate": 0.0001904955743517428, + "loss": 1.24, + "step": 3160 + }, + { + "epoch": 0.5628561253561254, + "grad_norm": 0.49063584208488464, + "learning_rate": 0.00019048961746100703, + "loss": 0.8563, + "step": 3161 + }, + { + "epoch": 0.563034188034188, + "grad_norm": 0.583940863609314, + "learning_rate": 0.00019048365879731517, + "loss": 1.0695, + "step": 3162 + }, + { + "epoch": 0.5632122507122507, + "grad_norm": 0.4943268597126007, + "learning_rate": 0.000190477698360784, + "loss": 0.8606, + "step": 3163 + }, + { + "epoch": 0.5633903133903134, + "grad_norm": 0.5050932168960571, + "learning_rate": 0.00019047173615153028, + "loss": 1.1591, + "step": 3164 
+ }, + { + "epoch": 0.5635683760683761, + "grad_norm": 0.5445677638053894, + "learning_rate": 0.0001904657721696708, + "loss": 1.262, + "step": 3165 + }, + { + "epoch": 0.5637464387464387, + "grad_norm": 0.5445297360420227, + "learning_rate": 0.00019045980641532246, + "loss": 1.223, + "step": 3166 + }, + { + "epoch": 0.5639245014245015, + "grad_norm": 0.5098413228988647, + "learning_rate": 0.00019045383888860213, + "loss": 1.0829, + "step": 3167 + }, + { + "epoch": 0.5641025641025641, + "grad_norm": 0.484998881816864, + "learning_rate": 0.0001904478695896267, + "loss": 1.0711, + "step": 3168 + }, + { + "epoch": 0.5642806267806267, + "grad_norm": 0.5515334010124207, + "learning_rate": 0.0001904418985185132, + "loss": 1.1583, + "step": 3169 + }, + { + "epoch": 0.5644586894586895, + "grad_norm": 0.545460045337677, + "learning_rate": 0.00019043592567537853, + "loss": 1.2321, + "step": 3170 + }, + { + "epoch": 0.5646367521367521, + "grad_norm": 0.5463964343070984, + "learning_rate": 0.0001904299510603398, + "loss": 1.1019, + "step": 3171 + }, + { + "epoch": 0.5648148148148148, + "grad_norm": 0.5619220733642578, + "learning_rate": 0.000190423974673514, + "loss": 1.1001, + "step": 3172 + }, + { + "epoch": 0.5649928774928775, + "grad_norm": 0.4448916018009186, + "learning_rate": 0.00019041799651501825, + "loss": 1.057, + "step": 3173 + }, + { + "epoch": 0.5651709401709402, + "grad_norm": 0.6073006987571716, + "learning_rate": 0.00019041201658496975, + "loss": 1.0306, + "step": 3174 + }, + { + "epoch": 0.5653490028490028, + "grad_norm": 0.5342072248458862, + "learning_rate": 0.0001904060348834855, + "loss": 0.9231, + "step": 3175 + }, + { + "epoch": 0.5655270655270656, + "grad_norm": 0.4505697786808014, + "learning_rate": 0.0001904000514106829, + "loss": 1.1134, + "step": 3176 + }, + { + "epoch": 0.5657051282051282, + "grad_norm": 0.5627852082252502, + "learning_rate": 0.00019039406616667902, + "loss": 1.2138, + "step": 3177 + }, + { + "epoch": 0.5658831908831908, + 
"grad_norm": 0.499734103679657, + "learning_rate": 0.0001903880791515912, + "loss": 1.1074, + "step": 3178 + }, + { + "epoch": 0.5660612535612536, + "grad_norm": 0.4768189489841461, + "learning_rate": 0.00019038209036553676, + "loss": 0.9442, + "step": 3179 + }, + { + "epoch": 0.5662393162393162, + "grad_norm": 0.5265373587608337, + "learning_rate": 0.00019037609980863298, + "loss": 1.0907, + "step": 3180 + }, + { + "epoch": 0.5664173789173789, + "grad_norm": 0.5506128072738647, + "learning_rate": 0.00019037010748099728, + "loss": 1.2541, + "step": 3181 + }, + { + "epoch": 0.5665954415954416, + "grad_norm": 0.44860872626304626, + "learning_rate": 0.00019036411338274703, + "loss": 0.893, + "step": 3182 + }, + { + "epoch": 0.5667735042735043, + "grad_norm": 0.4901522994041443, + "learning_rate": 0.00019035811751399973, + "loss": 1.0469, + "step": 3183 + }, + { + "epoch": 0.5669515669515669, + "grad_norm": 0.500868022441864, + "learning_rate": 0.0001903521198748728, + "loss": 1.0527, + "step": 3184 + }, + { + "epoch": 0.5671296296296297, + "grad_norm": 0.5508102774620056, + "learning_rate": 0.00019034612046548376, + "loss": 1.283, + "step": 3185 + }, + { + "epoch": 0.5673076923076923, + "grad_norm": 0.5079495906829834, + "learning_rate": 0.0001903401192859502, + "loss": 1.0808, + "step": 3186 + }, + { + "epoch": 0.5674857549857549, + "grad_norm": 0.5758788585662842, + "learning_rate": 0.00019033411633638964, + "loss": 1.1301, + "step": 3187 + }, + { + "epoch": 0.5676638176638177, + "grad_norm": 0.46557924151420593, + "learning_rate": 0.00019032811161691972, + "loss": 1.0205, + "step": 3188 + }, + { + "epoch": 0.5678418803418803, + "grad_norm": 0.5665056109428406, + "learning_rate": 0.0001903221051276581, + "loss": 1.1926, + "step": 3189 + }, + { + "epoch": 0.5680199430199431, + "grad_norm": 0.5948992967605591, + "learning_rate": 0.00019031609686872246, + "loss": 1.2724, + "step": 3190 + }, + { + "epoch": 0.5681980056980057, + "grad_norm": 0.6189367771148682, + 
"learning_rate": 0.00019031008684023055, + "loss": 1.2762, + "step": 3191 + }, + { + "epoch": 0.5683760683760684, + "grad_norm": 0.49511992931365967, + "learning_rate": 0.00019030407504230006, + "loss": 1.0117, + "step": 3192 + }, + { + "epoch": 0.5685541310541311, + "grad_norm": 0.5358837842941284, + "learning_rate": 0.00019029806147504878, + "loss": 0.944, + "step": 3193 + }, + { + "epoch": 0.5687321937321937, + "grad_norm": 0.458636999130249, + "learning_rate": 0.00019029204613859463, + "loss": 0.8174, + "step": 3194 + }, + { + "epoch": 0.5689102564102564, + "grad_norm": 0.5168304443359375, + "learning_rate": 0.00019028602903305535, + "loss": 1.1533, + "step": 3195 + }, + { + "epoch": 0.5690883190883191, + "grad_norm": 0.5334134697914124, + "learning_rate": 0.00019028001015854892, + "loss": 1.1868, + "step": 3196 + }, + { + "epoch": 0.5692663817663818, + "grad_norm": 0.5649123191833496, + "learning_rate": 0.0001902739895151932, + "loss": 0.9876, + "step": 3197 + }, + { + "epoch": 0.5694444444444444, + "grad_norm": 0.5647651553153992, + "learning_rate": 0.0001902679671031062, + "loss": 1.0805, + "step": 3198 + }, + { + "epoch": 0.5696225071225072, + "grad_norm": 0.5251876711845398, + "learning_rate": 0.00019026194292240587, + "loss": 1.2335, + "step": 3199 + }, + { + "epoch": 0.5698005698005698, + "grad_norm": 0.5268014669418335, + "learning_rate": 0.0001902559169732103, + "loss": 1.19, + "step": 3200 + }, + { + "epoch": 0.5699786324786325, + "grad_norm": 0.5301041007041931, + "learning_rate": 0.00019024988925563752, + "loss": 1.1173, + "step": 3201 + }, + { + "epoch": 0.5701566951566952, + "grad_norm": 0.4531562030315399, + "learning_rate": 0.00019024385976980566, + "loss": 0.7576, + "step": 3202 + }, + { + "epoch": 0.5703347578347578, + "grad_norm": 0.5779716372489929, + "learning_rate": 0.00019023782851583282, + "loss": 1.1719, + "step": 3203 + }, + { + "epoch": 0.5705128205128205, + "grad_norm": 0.4886093735694885, + "learning_rate": 0.00019023179549383716, + 
"loss": 1.085, + "step": 3204 + }, + { + "epoch": 0.5706908831908832, + "grad_norm": 0.510117769241333, + "learning_rate": 0.0001902257607039369, + "loss": 0.8931, + "step": 3205 + }, + { + "epoch": 0.5708689458689459, + "grad_norm": 0.5195479393005371, + "learning_rate": 0.00019021972414625036, + "loss": 0.9922, + "step": 3206 + }, + { + "epoch": 0.5710470085470085, + "grad_norm": 0.5791407227516174, + "learning_rate": 0.00019021368582089568, + "loss": 1.112, + "step": 3207 + }, + { + "epoch": 0.5712250712250713, + "grad_norm": 0.5056005716323853, + "learning_rate": 0.00019020764572799122, + "loss": 0.8474, + "step": 3208 + }, + { + "epoch": 0.5714031339031339, + "grad_norm": 0.5060068964958191, + "learning_rate": 0.00019020160386765537, + "loss": 1.071, + "step": 3209 + }, + { + "epoch": 0.5715811965811965, + "grad_norm": 0.5396568775177002, + "learning_rate": 0.00019019556024000648, + "loss": 1.0436, + "step": 3210 + }, + { + "epoch": 0.5717592592592593, + "grad_norm": 0.6552190780639648, + "learning_rate": 0.0001901895148451629, + "loss": 0.9869, + "step": 3211 + }, + { + "epoch": 0.5719373219373219, + "grad_norm": 0.5177004337310791, + "learning_rate": 0.00019018346768324314, + "loss": 1.0193, + "step": 3212 + }, + { + "epoch": 0.5721153846153846, + "grad_norm": 0.5192117094993591, + "learning_rate": 0.0001901774187543657, + "loss": 1.1263, + "step": 3213 + }, + { + "epoch": 0.5722934472934473, + "grad_norm": 0.4857729971408844, + "learning_rate": 0.00019017136805864906, + "loss": 0.9808, + "step": 3214 + }, + { + "epoch": 0.57247150997151, + "grad_norm": 0.5800918936729431, + "learning_rate": 0.00019016531559621177, + "loss": 1.2334, + "step": 3215 + }, + { + "epoch": 0.5726495726495726, + "grad_norm": 0.4812086522579193, + "learning_rate": 0.00019015926136717242, + "loss": 1.2409, + "step": 3216 + }, + { + "epoch": 0.5728276353276354, + "grad_norm": 0.5128398537635803, + "learning_rate": 0.00019015320537164963, + "loss": 0.9036, + "step": 3217 + }, + { + 
"epoch": 0.573005698005698, + "grad_norm": 0.4761141538619995, + "learning_rate": 0.00019014714760976205, + "loss": 1.1058, + "step": 3218 + }, + { + "epoch": 0.5731837606837606, + "grad_norm": 0.5850459933280945, + "learning_rate": 0.0001901410880816284, + "loss": 1.1011, + "step": 3219 + }, + { + "epoch": 0.5733618233618234, + "grad_norm": 0.5648714303970337, + "learning_rate": 0.00019013502678736738, + "loss": 1.0479, + "step": 3220 + }, + { + "epoch": 0.573539886039886, + "grad_norm": 0.5835902094841003, + "learning_rate": 0.00019012896372709774, + "loss": 1.0555, + "step": 3221 + }, + { + "epoch": 0.5737179487179487, + "grad_norm": 0.5155113935470581, + "learning_rate": 0.00019012289890093828, + "loss": 0.9488, + "step": 3222 + }, + { + "epoch": 0.5738960113960114, + "grad_norm": 0.5064889788627625, + "learning_rate": 0.00019011683230900784, + "loss": 0.9144, + "step": 3223 + }, + { + "epoch": 0.5740740740740741, + "grad_norm": 0.53825843334198, + "learning_rate": 0.00019011076395142527, + "loss": 1.0713, + "step": 3224 + }, + { + "epoch": 0.5742521367521367, + "grad_norm": 0.5341386198997498, + "learning_rate": 0.00019010469382830947, + "loss": 1.1438, + "step": 3225 + }, + { + "epoch": 0.5744301994301995, + "grad_norm": 0.5300050973892212, + "learning_rate": 0.00019009862193977936, + "loss": 1.0114, + "step": 3226 + }, + { + "epoch": 0.5746082621082621, + "grad_norm": 0.6033682823181152, + "learning_rate": 0.0001900925482859539, + "loss": 1.0458, + "step": 3227 + }, + { + "epoch": 0.5747863247863247, + "grad_norm": 0.5108983516693115, + "learning_rate": 0.00019008647286695215, + "loss": 1.1211, + "step": 3228 + }, + { + "epoch": 0.5749643874643875, + "grad_norm": 0.5263782739639282, + "learning_rate": 0.00019008039568289308, + "loss": 0.8647, + "step": 3229 + }, + { + "epoch": 0.5751424501424501, + "grad_norm": 0.47119566798210144, + "learning_rate": 0.0001900743167338958, + "loss": 1.019, + "step": 3230 + }, + { + "epoch": 0.5753205128205128, + "grad_norm": 
0.56391841173172, + "learning_rate": 0.00019006823602007937, + "loss": 0.9791, + "step": 3231 + }, + { + "epoch": 0.5754985754985755, + "grad_norm": 0.5364985466003418, + "learning_rate": 0.000190062153541563, + "loss": 1.1355, + "step": 3232 + }, + { + "epoch": 0.5756766381766382, + "grad_norm": 0.5098565220832825, + "learning_rate": 0.00019005606929846578, + "loss": 0.987, + "step": 3233 + }, + { + "epoch": 0.5758547008547008, + "grad_norm": 0.6640968918800354, + "learning_rate": 0.00019004998329090692, + "loss": 1.1165, + "step": 3234 + }, + { + "epoch": 0.5760327635327636, + "grad_norm": 0.5044721961021423, + "learning_rate": 0.00019004389551900578, + "loss": 0.8643, + "step": 3235 + }, + { + "epoch": 0.5762108262108262, + "grad_norm": 0.4822785258293152, + "learning_rate": 0.00019003780598288153, + "loss": 1.0735, + "step": 3236 + }, + { + "epoch": 0.5763888888888888, + "grad_norm": 0.505261242389679, + "learning_rate": 0.00019003171468265348, + "loss": 1.0001, + "step": 3237 + }, + { + "epoch": 0.5765669515669516, + "grad_norm": 0.5020412802696228, + "learning_rate": 0.00019002562161844102, + "loss": 0.9601, + "step": 3238 + }, + { + "epoch": 0.5767450142450142, + "grad_norm": 0.4920475482940674, + "learning_rate": 0.00019001952679036354, + "loss": 1.0111, + "step": 3239 + }, + { + "epoch": 0.5769230769230769, + "grad_norm": 0.5638813376426697, + "learning_rate": 0.00019001343019854042, + "loss": 1.1456, + "step": 3240 + }, + { + "epoch": 0.5771011396011396, + "grad_norm": 0.5519235134124756, + "learning_rate": 0.0001900073318430911, + "loss": 0.9258, + "step": 3241 + }, + { + "epoch": 0.5772792022792023, + "grad_norm": 0.5207770466804504, + "learning_rate": 0.0001900012317241351, + "loss": 0.9859, + "step": 3242 + }, + { + "epoch": 0.5774572649572649, + "grad_norm": 0.5493707656860352, + "learning_rate": 0.00018999512984179195, + "loss": 1.1183, + "step": 3243 + }, + { + "epoch": 0.5776353276353277, + "grad_norm": 0.4504764676094055, + "learning_rate": 
0.00018998902619618116, + "loss": 0.9363, + "step": 3244 + }, + { + "epoch": 0.5778133903133903, + "grad_norm": 0.5232836604118347, + "learning_rate": 0.00018998292078742233, + "loss": 1.1887, + "step": 3245 + }, + { + "epoch": 0.5779914529914529, + "grad_norm": 0.5715088248252869, + "learning_rate": 0.0001899768136156351, + "loss": 1.4524, + "step": 3246 + }, + { + "epoch": 0.5781695156695157, + "grad_norm": 0.59555584192276, + "learning_rate": 0.0001899707046809391, + "loss": 1.0922, + "step": 3247 + }, + { + "epoch": 0.5783475783475783, + "grad_norm": 0.4500894546508789, + "learning_rate": 0.00018996459398345404, + "loss": 1.0087, + "step": 3248 + }, + { + "epoch": 0.5785256410256411, + "grad_norm": 0.49126625061035156, + "learning_rate": 0.00018995848152329967, + "loss": 1.1512, + "step": 3249 + }, + { + "epoch": 0.5787037037037037, + "grad_norm": 0.4096335172653198, + "learning_rate": 0.00018995236730059574, + "loss": 0.7633, + "step": 3250 + }, + { + "epoch": 0.5788817663817664, + "grad_norm": 0.5364313721656799, + "learning_rate": 0.00018994625131546199, + "loss": 1.295, + "step": 3251 + }, + { + "epoch": 0.5790598290598291, + "grad_norm": 0.4897502660751343, + "learning_rate": 0.00018994013356801834, + "loss": 1.2197, + "step": 3252 + }, + { + "epoch": 0.5792378917378918, + "grad_norm": 0.5101368427276611, + "learning_rate": 0.00018993401405838456, + "loss": 1.1129, + "step": 3253 + }, + { + "epoch": 0.5794159544159544, + "grad_norm": 0.5426377654075623, + "learning_rate": 0.00018992789278668063, + "loss": 1.188, + "step": 3254 + }, + { + "epoch": 0.5795940170940171, + "grad_norm": 0.5066362023353577, + "learning_rate": 0.00018992176975302644, + "loss": 1.2802, + "step": 3255 + }, + { + "epoch": 0.5797720797720798, + "grad_norm": 0.5418947339057922, + "learning_rate": 0.00018991564495754196, + "loss": 1.1675, + "step": 3256 + }, + { + "epoch": 0.5799501424501424, + "grad_norm": 0.5139963626861572, + "learning_rate": 0.0001899095184003472, + "loss": 0.9717, 
+ "step": 3257 + }, + { + "epoch": 0.5801282051282052, + "grad_norm": 0.5167285799980164, + "learning_rate": 0.00018990339008156219, + "loss": 1.1529, + "step": 3258 + }, + { + "epoch": 0.5803062678062678, + "grad_norm": 0.53471440076828, + "learning_rate": 0.00018989726000130704, + "loss": 1.0711, + "step": 3259 + }, + { + "epoch": 0.5804843304843305, + "grad_norm": 0.49875229597091675, + "learning_rate": 0.0001898911281597018, + "loss": 1.1095, + "step": 3260 + }, + { + "epoch": 0.5806623931623932, + "grad_norm": 0.4473155438899994, + "learning_rate": 0.00018988499455686663, + "loss": 0.836, + "step": 3261 + }, + { + "epoch": 0.5808404558404558, + "grad_norm": 0.6181996464729309, + "learning_rate": 0.00018987885919292174, + "loss": 1.2787, + "step": 3262 + }, + { + "epoch": 0.5810185185185185, + "grad_norm": 0.4996899664402008, + "learning_rate": 0.00018987272206798733, + "loss": 1.2132, + "step": 3263 + }, + { + "epoch": 0.5811965811965812, + "grad_norm": 0.49979713559150696, + "learning_rate": 0.00018986658318218358, + "loss": 0.8388, + "step": 3264 + }, + { + "epoch": 0.5813746438746439, + "grad_norm": 0.5288876295089722, + "learning_rate": 0.00018986044253563084, + "loss": 1.1871, + "step": 3265 + }, + { + "epoch": 0.5815527065527065, + "grad_norm": 0.534063458442688, + "learning_rate": 0.00018985430012844937, + "loss": 0.96, + "step": 3266 + }, + { + "epoch": 0.5817307692307693, + "grad_norm": 0.5081285834312439, + "learning_rate": 0.00018984815596075953, + "loss": 1.1577, + "step": 3267 + }, + { + "epoch": 0.5819088319088319, + "grad_norm": 0.5648202896118164, + "learning_rate": 0.00018984201003268176, + "loss": 1.2235, + "step": 3268 + }, + { + "epoch": 0.5820868945868946, + "grad_norm": 0.495061993598938, + "learning_rate": 0.00018983586234433642, + "loss": 1.056, + "step": 3269 + }, + { + "epoch": 0.5822649572649573, + "grad_norm": 0.47149857878685, + "learning_rate": 0.000189829712895844, + "loss": 1.0844, + "step": 3270 + }, + { + "epoch": 
0.58244301994302, + "grad_norm": 0.6107062697410583, + "learning_rate": 0.00018982356168732492, + "loss": 0.9868, + "step": 3271 + }, + { + "epoch": 0.5826210826210826, + "grad_norm": 0.7355940341949463, + "learning_rate": 0.00018981740871889974, + "loss": 1.1448, + "step": 3272 + }, + { + "epoch": 0.5827991452991453, + "grad_norm": 0.5950441956520081, + "learning_rate": 0.00018981125399068907, + "loss": 0.9618, + "step": 3273 + }, + { + "epoch": 0.582977207977208, + "grad_norm": 0.47607290744781494, + "learning_rate": 0.0001898050975028134, + "loss": 0.957, + "step": 3274 + }, + { + "epoch": 0.5831552706552706, + "grad_norm": 0.541164755821228, + "learning_rate": 0.00018979893925539338, + "loss": 1.1426, + "step": 3275 + }, + { + "epoch": 0.5833333333333334, + "grad_norm": 0.5240640044212341, + "learning_rate": 0.00018979277924854974, + "loss": 1.1421, + "step": 3276 + }, + { + "epoch": 0.583511396011396, + "grad_norm": 0.48155727982521057, + "learning_rate": 0.00018978661748240307, + "loss": 1.0069, + "step": 3277 + }, + { + "epoch": 0.5836894586894587, + "grad_norm": 0.5559938549995422, + "learning_rate": 0.00018978045395707418, + "loss": 1.1227, + "step": 3278 + }, + { + "epoch": 0.5838675213675214, + "grad_norm": 0.5244291424751282, + "learning_rate": 0.0001897742886726838, + "loss": 1.1103, + "step": 3279 + }, + { + "epoch": 0.584045584045584, + "grad_norm": 0.5277758240699768, + "learning_rate": 0.00018976812162935268, + "loss": 1.2125, + "step": 3280 + }, + { + "epoch": 0.5842236467236467, + "grad_norm": 0.5415039658546448, + "learning_rate": 0.00018976195282720173, + "loss": 1.146, + "step": 3281 + }, + { + "epoch": 0.5844017094017094, + "grad_norm": 0.5152051448822021, + "learning_rate": 0.00018975578226635177, + "loss": 1.0092, + "step": 3282 + }, + { + "epoch": 0.5845797720797721, + "grad_norm": 0.5489452481269836, + "learning_rate": 0.00018974960994692371, + "loss": 1.2425, + "step": 3283 + }, + { + "epoch": 0.5847578347578347, + "grad_norm": 
0.491274356842041, + "learning_rate": 0.00018974343586903848, + "loss": 0.9559, + "step": 3284 + }, + { + "epoch": 0.5849358974358975, + "grad_norm": 0.5783739686012268, + "learning_rate": 0.00018973726003281707, + "loss": 1.1971, + "step": 3285 + }, + { + "epoch": 0.5851139601139601, + "grad_norm": 0.5056472420692444, + "learning_rate": 0.00018973108243838045, + "loss": 1.0313, + "step": 3286 + }, + { + "epoch": 0.5852920227920227, + "grad_norm": 0.4939729571342468, + "learning_rate": 0.00018972490308584962, + "loss": 1.1061, + "step": 3287 + }, + { + "epoch": 0.5854700854700855, + "grad_norm": 0.4889580011367798, + "learning_rate": 0.00018971872197534576, + "loss": 0.9157, + "step": 3288 + }, + { + "epoch": 0.5856481481481481, + "grad_norm": 0.40889349579811096, + "learning_rate": 0.00018971253910698993, + "loss": 0.8083, + "step": 3289 + }, + { + "epoch": 0.5858262108262108, + "grad_norm": 0.5221503973007202, + "learning_rate": 0.00018970635448090322, + "loss": 0.9995, + "step": 3290 + }, + { + "epoch": 0.5860042735042735, + "grad_norm": 0.47060561180114746, + "learning_rate": 0.00018970016809720687, + "loss": 0.9738, + "step": 3291 + }, + { + "epoch": 0.5861823361823362, + "grad_norm": 0.6083170771598816, + "learning_rate": 0.000189693979956022, + "loss": 1.188, + "step": 3292 + }, + { + "epoch": 0.5863603988603988, + "grad_norm": 0.4696751534938812, + "learning_rate": 0.00018968779005746998, + "loss": 1.089, + "step": 3293 + }, + { + "epoch": 0.5865384615384616, + "grad_norm": 0.5081014633178711, + "learning_rate": 0.00018968159840167202, + "loss": 1.1869, + "step": 3294 + }, + { + "epoch": 0.5867165242165242, + "grad_norm": 0.48042431473731995, + "learning_rate": 0.0001896754049887494, + "loss": 0.964, + "step": 3295 + }, + { + "epoch": 0.5868945868945868, + "grad_norm": 0.5075193643569946, + "learning_rate": 0.00018966920981882353, + "loss": 1.1884, + "step": 3296 + }, + { + "epoch": 0.5870726495726496, + "grad_norm": 0.5734842419624329, + "learning_rate": 
0.00018966301289201576, + "loss": 1.1475, + "step": 3297 + }, + { + "epoch": 0.5872507122507122, + "grad_norm": 0.5525311231613159, + "learning_rate": 0.00018965681420844753, + "loss": 1.241, + "step": 3298 + }, + { + "epoch": 0.5874287749287749, + "grad_norm": 0.48142680525779724, + "learning_rate": 0.00018965061376824025, + "loss": 1.0871, + "step": 3299 + }, + { + "epoch": 0.5876068376068376, + "grad_norm": 0.5360350608825684, + "learning_rate": 0.00018964441157151544, + "loss": 1.1895, + "step": 3300 + }, + { + "epoch": 0.5877849002849003, + "grad_norm": 0.5207685232162476, + "learning_rate": 0.00018963820761839457, + "loss": 0.9323, + "step": 3301 + }, + { + "epoch": 0.5879629629629629, + "grad_norm": 0.453620970249176, + "learning_rate": 0.00018963200190899926, + "loss": 0.802, + "step": 3302 + }, + { + "epoch": 0.5881410256410257, + "grad_norm": 0.5198796391487122, + "learning_rate": 0.00018962579444345106, + "loss": 1.0243, + "step": 3303 + }, + { + "epoch": 0.5883190883190883, + "grad_norm": 0.5597525835037231, + "learning_rate": 0.0001896195852218716, + "loss": 0.9351, + "step": 3304 + }, + { + "epoch": 0.5884971509971509, + "grad_norm": 0.5738299489021301, + "learning_rate": 0.00018961337424438254, + "loss": 1.3737, + "step": 3305 + }, + { + "epoch": 0.5886752136752137, + "grad_norm": 0.5569949150085449, + "learning_rate": 0.00018960716151110554, + "loss": 1.0469, + "step": 3306 + }, + { + "epoch": 0.5888532763532763, + "grad_norm": 0.5088010430335999, + "learning_rate": 0.00018960094702216238, + "loss": 1.0982, + "step": 3307 + }, + { + "epoch": 0.5890313390313391, + "grad_norm": 0.5127636790275574, + "learning_rate": 0.0001895947307776748, + "loss": 0.9986, + "step": 3308 + }, + { + "epoch": 0.5892094017094017, + "grad_norm": 0.5160682797431946, + "learning_rate": 0.00018958851277776456, + "loss": 1.0219, + "step": 3309 + }, + { + "epoch": 0.5893874643874644, + "grad_norm": 0.5380711555480957, + "learning_rate": 0.00018958229302255356, + "loss": 1.118, 
+ "step": 3310 + }, + { + "epoch": 0.5895655270655271, + "grad_norm": 0.5571228861808777, + "learning_rate": 0.0001895760715121636, + "loss": 1.0302, + "step": 3311 + }, + { + "epoch": 0.5897435897435898, + "grad_norm": 0.542266309261322, + "learning_rate": 0.00018956984824671657, + "loss": 1.0372, + "step": 3312 + }, + { + "epoch": 0.5899216524216524, + "grad_norm": 0.48350459337234497, + "learning_rate": 0.00018956362322633446, + "loss": 1.2, + "step": 3313 + }, + { + "epoch": 0.5900997150997151, + "grad_norm": 0.5001645088195801, + "learning_rate": 0.0001895573964511392, + "loss": 0.9749, + "step": 3314 + }, + { + "epoch": 0.5902777777777778, + "grad_norm": 0.5227531790733337, + "learning_rate": 0.00018955116792125276, + "loss": 1.025, + "step": 3315 + }, + { + "epoch": 0.5904558404558404, + "grad_norm": 0.522251546382904, + "learning_rate": 0.00018954493763679727, + "loss": 1.0821, + "step": 3316 + }, + { + "epoch": 0.5906339031339032, + "grad_norm": 0.5423251390457153, + "learning_rate": 0.00018953870559789467, + "loss": 1.0961, + "step": 3317 + }, + { + "epoch": 0.5908119658119658, + "grad_norm": 0.5615720748901367, + "learning_rate": 0.0001895324718046672, + "loss": 1.1209, + "step": 3318 + }, + { + "epoch": 0.5909900284900285, + "grad_norm": 0.44746771454811096, + "learning_rate": 0.00018952623625723692, + "loss": 0.9935, + "step": 3319 + }, + { + "epoch": 0.5911680911680912, + "grad_norm": 0.5993229150772095, + "learning_rate": 0.00018951999895572597, + "loss": 1.1409, + "step": 3320 + }, + { + "epoch": 0.5913461538461539, + "grad_norm": 0.4969801902770996, + "learning_rate": 0.00018951375990025666, + "loss": 1.1568, + "step": 3321 + }, + { + "epoch": 0.5915242165242165, + "grad_norm": 0.6001267433166504, + "learning_rate": 0.00018950751909095116, + "loss": 1.1135, + "step": 3322 + }, + { + "epoch": 0.5917022792022792, + "grad_norm": 0.5386021733283997, + "learning_rate": 0.00018950127652793172, + "loss": 0.947, + "step": 3323 + }, + { + "epoch": 
0.5918803418803419, + "grad_norm": 0.49043843150138855, + "learning_rate": 0.00018949503221132074, + "loss": 0.9581, + "step": 3324 + }, + { + "epoch": 0.5920584045584045, + "grad_norm": 0.5241141319274902, + "learning_rate": 0.00018948878614124048, + "loss": 1.0797, + "step": 3325 + }, + { + "epoch": 0.5922364672364673, + "grad_norm": 0.5755026340484619, + "learning_rate": 0.00018948253831781338, + "loss": 1.1046, + "step": 3326 + }, + { + "epoch": 0.5924145299145299, + "grad_norm": 0.5004449486732483, + "learning_rate": 0.00018947628874116179, + "loss": 1.1416, + "step": 3327 + }, + { + "epoch": 0.5925925925925926, + "grad_norm": 0.53347247838974, + "learning_rate": 0.00018947003741140821, + "loss": 1.2718, + "step": 3328 + }, + { + "epoch": 0.5927706552706553, + "grad_norm": 0.6473469138145447, + "learning_rate": 0.0001894637843286751, + "loss": 1.2255, + "step": 3329 + }, + { + "epoch": 0.592948717948718, + "grad_norm": 0.4750518798828125, + "learning_rate": 0.00018945752949308498, + "loss": 1.0537, + "step": 3330 + }, + { + "epoch": 0.5931267806267806, + "grad_norm": 0.5636306405067444, + "learning_rate": 0.00018945127290476043, + "loss": 0.9906, + "step": 3331 + }, + { + "epoch": 0.5933048433048433, + "grad_norm": 0.4871736466884613, + "learning_rate": 0.00018944501456382397, + "loss": 1.0549, + "step": 3332 + }, + { + "epoch": 0.593482905982906, + "grad_norm": 0.5554637312889099, + "learning_rate": 0.0001894387544703983, + "loss": 1.1587, + "step": 3333 + }, + { + "epoch": 0.5936609686609686, + "grad_norm": 0.5385799407958984, + "learning_rate": 0.000189432492624606, + "loss": 0.9565, + "step": 3334 + }, + { + "epoch": 0.5938390313390314, + "grad_norm": 0.4996553063392639, + "learning_rate": 0.00018942622902656976, + "loss": 1.0456, + "step": 3335 + }, + { + "epoch": 0.594017094017094, + "grad_norm": 0.46810707449913025, + "learning_rate": 0.00018941996367641237, + "loss": 1.119, + "step": 3336 + }, + { + "epoch": 0.5941951566951567, + "grad_norm": 
0.5672653913497925, + "learning_rate": 0.0001894136965742565, + "loss": 1.1317, + "step": 3337 + }, + { + "epoch": 0.5943732193732194, + "grad_norm": 0.4790053367614746, + "learning_rate": 0.00018940742772022504, + "loss": 1.0967, + "step": 3338 + }, + { + "epoch": 0.594551282051282, + "grad_norm": 0.5935906171798706, + "learning_rate": 0.00018940115711444072, + "loss": 1.3044, + "step": 3339 + }, + { + "epoch": 0.5947293447293447, + "grad_norm": 0.4790516793727875, + "learning_rate": 0.00018939488475702647, + "loss": 1.074, + "step": 3340 + }, + { + "epoch": 0.5949074074074074, + "grad_norm": 0.474588006734848, + "learning_rate": 0.00018938861064810516, + "loss": 1.1476, + "step": 3341 + }, + { + "epoch": 0.5950854700854701, + "grad_norm": 0.4908665120601654, + "learning_rate": 0.0001893823347877997, + "loss": 1.216, + "step": 3342 + }, + { + "epoch": 0.5952635327635327, + "grad_norm": 0.531650960445404, + "learning_rate": 0.00018937605717623307, + "loss": 1.1057, + "step": 3343 + }, + { + "epoch": 0.5954415954415955, + "grad_norm": 0.5581082105636597, + "learning_rate": 0.00018936977781352823, + "loss": 0.7972, + "step": 3344 + }, + { + "epoch": 0.5956196581196581, + "grad_norm": 0.42370662093162537, + "learning_rate": 0.00018936349669980827, + "loss": 0.8888, + "step": 3345 + }, + { + "epoch": 0.5957977207977208, + "grad_norm": 0.5817318558692932, + "learning_rate": 0.00018935721383519624, + "loss": 1.2801, + "step": 3346 + }, + { + "epoch": 0.5959757834757835, + "grad_norm": 0.4766376316547394, + "learning_rate": 0.00018935092921981524, + "loss": 1.0918, + "step": 3347 + }, + { + "epoch": 0.5961538461538461, + "grad_norm": 0.5567346811294556, + "learning_rate": 0.00018934464285378836, + "loss": 1.0269, + "step": 3348 + }, + { + "epoch": 0.5963319088319088, + "grad_norm": 0.5285565257072449, + "learning_rate": 0.0001893383547372388, + "loss": 1.1887, + "step": 3349 + }, + { + "epoch": 0.5965099715099715, + "grad_norm": 0.49052694439888, + "learning_rate": 
0.00018933206487028979, + "loss": 1.0773, + "step": 3350 + }, + { + "epoch": 0.5966880341880342, + "grad_norm": 0.6175199151039124, + "learning_rate": 0.0001893257732530645, + "loss": 1.0192, + "step": 3351 + }, + { + "epoch": 0.5968660968660968, + "grad_norm": 0.56049644947052, + "learning_rate": 0.00018931947988568628, + "loss": 0.9516, + "step": 3352 + }, + { + "epoch": 0.5970441595441596, + "grad_norm": 0.47873660922050476, + "learning_rate": 0.00018931318476827838, + "loss": 0.8174, + "step": 3353 + }, + { + "epoch": 0.5972222222222222, + "grad_norm": 0.4748854339122772, + "learning_rate": 0.00018930688790096416, + "loss": 1.0238, + "step": 3354 + }, + { + "epoch": 0.5974002849002849, + "grad_norm": 0.5382232666015625, + "learning_rate": 0.00018930058928386698, + "loss": 1.0815, + "step": 3355 + }, + { + "epoch": 0.5975783475783476, + "grad_norm": 0.5038299560546875, + "learning_rate": 0.00018929428891711027, + "loss": 1.0472, + "step": 3356 + }, + { + "epoch": 0.5977564102564102, + "grad_norm": 0.5185908079147339, + "learning_rate": 0.00018928798680081744, + "loss": 1.0435, + "step": 3357 + }, + { + "epoch": 0.5979344729344729, + "grad_norm": 0.5169877409934998, + "learning_rate": 0.00018928168293511202, + "loss": 1.0437, + "step": 3358 + }, + { + "epoch": 0.5981125356125356, + "grad_norm": 0.5218369960784912, + "learning_rate": 0.00018927537732011749, + "loss": 1.082, + "step": 3359 + }, + { + "epoch": 0.5982905982905983, + "grad_norm": 0.5358219742774963, + "learning_rate": 0.0001892690699559574, + "loss": 1.2523, + "step": 3360 + }, + { + "epoch": 0.5984686609686609, + "grad_norm": 0.47716647386550903, + "learning_rate": 0.0001892627608427553, + "loss": 1.2069, + "step": 3361 + }, + { + "epoch": 0.5986467236467237, + "grad_norm": 0.5484169125556946, + "learning_rate": 0.00018925644998063482, + "loss": 1.2016, + "step": 3362 + }, + { + "epoch": 0.5988247863247863, + "grad_norm": 0.46814846992492676, + "learning_rate": 0.00018925013736971965, + "loss": 
0.7989, + "step": 3363 + }, + { + "epoch": 0.5990028490028491, + "grad_norm": 0.5391258001327515, + "learning_rate": 0.0001892438230101334, + "loss": 1.224, + "step": 3364 + }, + { + "epoch": 0.5991809116809117, + "grad_norm": 0.5248384475708008, + "learning_rate": 0.00018923750690199987, + "loss": 1.1532, + "step": 3365 + }, + { + "epoch": 0.5993589743589743, + "grad_norm": 0.5074637532234192, + "learning_rate": 0.00018923118904544273, + "loss": 1.0968, + "step": 3366 + }, + { + "epoch": 0.5995370370370371, + "grad_norm": 0.5260029435157776, + "learning_rate": 0.00018922486944058581, + "loss": 1.1311, + "step": 3367 + }, + { + "epoch": 0.5997150997150997, + "grad_norm": 0.48497965931892395, + "learning_rate": 0.00018921854808755294, + "loss": 1.1208, + "step": 3368 + }, + { + "epoch": 0.5998931623931624, + "grad_norm": 0.5108651518821716, + "learning_rate": 0.00018921222498646792, + "loss": 1.147, + "step": 3369 + }, + { + "epoch": 0.6000712250712251, + "grad_norm": 0.5243437886238098, + "learning_rate": 0.00018920590013745471, + "loss": 0.9614, + "step": 3370 + }, + { + "epoch": 0.6002492877492878, + "grad_norm": 0.47022634744644165, + "learning_rate": 0.00018919957354063719, + "loss": 1.0579, + "step": 3371 + }, + { + "epoch": 0.6004273504273504, + "grad_norm": 0.6461413502693176, + "learning_rate": 0.00018919324519613931, + "loss": 1.2126, + "step": 3372 + }, + { + "epoch": 0.6006054131054132, + "grad_norm": 0.4654616713523865, + "learning_rate": 0.00018918691510408508, + "loss": 1.1476, + "step": 3373 + }, + { + "epoch": 0.6007834757834758, + "grad_norm": 0.48571303486824036, + "learning_rate": 0.00018918058326459854, + "loss": 1.2093, + "step": 3374 + }, + { + "epoch": 0.6009615384615384, + "grad_norm": 0.5255016684532166, + "learning_rate": 0.00018917424967780368, + "loss": 1.1538, + "step": 3375 + }, + { + "epoch": 0.6011396011396012, + "grad_norm": 0.5059894323348999, + "learning_rate": 0.00018916791434382468, + "loss": 1.0556, + "step": 3376 + }, + { + 
"epoch": 0.6013176638176638, + "grad_norm": 0.4581229090690613, + "learning_rate": 0.00018916157726278561, + "loss": 1.1468, + "step": 3377 + }, + { + "epoch": 0.6014957264957265, + "grad_norm": 0.5701818466186523, + "learning_rate": 0.00018915523843481067, + "loss": 1.3641, + "step": 3378 + }, + { + "epoch": 0.6016737891737892, + "grad_norm": 0.5007243752479553, + "learning_rate": 0.00018914889786002403, + "loss": 1.2705, + "step": 3379 + }, + { + "epoch": 0.6018518518518519, + "grad_norm": 0.5192995071411133, + "learning_rate": 0.0001891425555385499, + "loss": 0.9922, + "step": 3380 + }, + { + "epoch": 0.6020299145299145, + "grad_norm": 0.5880612134933472, + "learning_rate": 0.00018913621147051258, + "loss": 0.8783, + "step": 3381 + }, + { + "epoch": 0.6022079772079773, + "grad_norm": 0.5161563158035278, + "learning_rate": 0.0001891298656560364, + "loss": 0.9634, + "step": 3382 + }, + { + "epoch": 0.6023860398860399, + "grad_norm": 0.48450782895088196, + "learning_rate": 0.00018912351809524563, + "loss": 0.809, + "step": 3383 + }, + { + "epoch": 0.6025641025641025, + "grad_norm": 0.621537983417511, + "learning_rate": 0.00018911716878826465, + "loss": 1.2031, + "step": 3384 + }, + { + "epoch": 0.6027421652421653, + "grad_norm": 0.6014544367790222, + "learning_rate": 0.00018911081773521787, + "loss": 1.1552, + "step": 3385 + }, + { + "epoch": 0.6029202279202279, + "grad_norm": 0.49995481967926025, + "learning_rate": 0.00018910446493622976, + "loss": 0.8569, + "step": 3386 + }, + { + "epoch": 0.6030982905982906, + "grad_norm": 0.5157307386398315, + "learning_rate": 0.00018909811039142472, + "loss": 0.9515, + "step": 3387 + }, + { + "epoch": 0.6032763532763533, + "grad_norm": 0.5164140462875366, + "learning_rate": 0.0001890917541009273, + "loss": 0.9803, + "step": 3388 + }, + { + "epoch": 0.603454415954416, + "grad_norm": 0.5555596947669983, + "learning_rate": 0.00018908539606486206, + "loss": 1.2994, + "step": 3389 + }, + { + "epoch": 0.6036324786324786, + 
"grad_norm": 0.605697512626648, + "learning_rate": 0.00018907903628335353, + "loss": 1.2865, + "step": 3390 + }, + { + "epoch": 0.6038105413105413, + "grad_norm": 0.5700713992118835, + "learning_rate": 0.0001890726747565263, + "loss": 1.2493, + "step": 3391 + }, + { + "epoch": 0.603988603988604, + "grad_norm": 0.5516746044158936, + "learning_rate": 0.0001890663114845051, + "loss": 1.2743, + "step": 3392 + }, + { + "epoch": 0.6041666666666666, + "grad_norm": 0.5233162641525269, + "learning_rate": 0.0001890599464674145, + "loss": 0.9237, + "step": 3393 + }, + { + "epoch": 0.6043447293447294, + "grad_norm": 0.5709942579269409, + "learning_rate": 0.00018905357970537925, + "loss": 0.9922, + "step": 3394 + }, + { + "epoch": 0.604522792022792, + "grad_norm": 0.48403796553611755, + "learning_rate": 0.0001890472111985241, + "loss": 1.1255, + "step": 3395 + }, + { + "epoch": 0.6047008547008547, + "grad_norm": 0.628718376159668, + "learning_rate": 0.00018904084094697386, + "loss": 1.1458, + "step": 3396 + }, + { + "epoch": 0.6048789173789174, + "grad_norm": 0.46822869777679443, + "learning_rate": 0.00018903446895085328, + "loss": 0.8727, + "step": 3397 + }, + { + "epoch": 0.60505698005698, + "grad_norm": 0.505584180355072, + "learning_rate": 0.00018902809521028724, + "loss": 1.1595, + "step": 3398 + }, + { + "epoch": 0.6052350427350427, + "grad_norm": 0.4494974911212921, + "learning_rate": 0.00018902171972540058, + "loss": 0.6685, + "step": 3399 + }, + { + "epoch": 0.6054131054131054, + "grad_norm": 0.5101519227027893, + "learning_rate": 0.0001890153424963183, + "loss": 0.9313, + "step": 3400 + }, + { + "epoch": 0.6055911680911681, + "grad_norm": 0.5081079602241516, + "learning_rate": 0.00018900896352316528, + "loss": 1.2588, + "step": 3401 + }, + { + "epoch": 0.6057692307692307, + "grad_norm": 0.5784309506416321, + "learning_rate": 0.00018900258280606653, + "loss": 1.2077, + "step": 3402 + }, + { + "epoch": 0.6059472934472935, + "grad_norm": 0.4506312608718872, + 
"learning_rate": 0.00018899620034514705, + "loss": 1.05, + "step": 3403 + }, + { + "epoch": 0.6061253561253561, + "grad_norm": 0.5243048071861267, + "learning_rate": 0.0001889898161405319, + "loss": 1.2295, + "step": 3404 + }, + { + "epoch": 0.6063034188034188, + "grad_norm": 0.5447196364402771, + "learning_rate": 0.00018898343019234615, + "loss": 1.1476, + "step": 3405 + }, + { + "epoch": 0.6064814814814815, + "grad_norm": 0.46813663840293884, + "learning_rate": 0.00018897704250071492, + "loss": 1.2113, + "step": 3406 + }, + { + "epoch": 0.6066595441595442, + "grad_norm": 0.5340631604194641, + "learning_rate": 0.00018897065306576342, + "loss": 1.1656, + "step": 3407 + }, + { + "epoch": 0.6068376068376068, + "grad_norm": 0.513708233833313, + "learning_rate": 0.00018896426188761675, + "loss": 1.1616, + "step": 3408 + }, + { + "epoch": 0.6070156695156695, + "grad_norm": 0.594601035118103, + "learning_rate": 0.00018895786896640023, + "loss": 1.2564, + "step": 3409 + }, + { + "epoch": 0.6071937321937322, + "grad_norm": 0.45067599415779114, + "learning_rate": 0.000188951474302239, + "loss": 1.0107, + "step": 3410 + }, + { + "epoch": 0.6073717948717948, + "grad_norm": 0.5394250750541687, + "learning_rate": 0.00018894507789525843, + "loss": 1.4081, + "step": 3411 + }, + { + "epoch": 0.6075498575498576, + "grad_norm": 0.5612049102783203, + "learning_rate": 0.00018893867974558383, + "loss": 1.1015, + "step": 3412 + }, + { + "epoch": 0.6077279202279202, + "grad_norm": 0.4794061779975891, + "learning_rate": 0.00018893227985334056, + "loss": 1.2103, + "step": 3413 + }, + { + "epoch": 0.6079059829059829, + "grad_norm": 0.6060562133789062, + "learning_rate": 0.00018892587821865402, + "loss": 1.3693, + "step": 3414 + }, + { + "epoch": 0.6080840455840456, + "grad_norm": 0.44624534249305725, + "learning_rate": 0.00018891947484164963, + "loss": 0.8209, + "step": 3415 + }, + { + "epoch": 0.6082621082621082, + "grad_norm": 0.49297213554382324, + "learning_rate": 0.0001889130697224528, 
+ "loss": 1.2027, + "step": 3416 + }, + { + "epoch": 0.6084401709401709, + "grad_norm": 0.4431746304035187, + "learning_rate": 0.0001889066628611891, + "loss": 1.0347, + "step": 3417 + }, + { + "epoch": 0.6086182336182336, + "grad_norm": 0.5425933599472046, + "learning_rate": 0.00018890025425798404, + "loss": 1.0556, + "step": 3418 + }, + { + "epoch": 0.6087962962962963, + "grad_norm": 0.5502763390541077, + "learning_rate": 0.00018889384391296315, + "loss": 1.2362, + "step": 3419 + }, + { + "epoch": 0.6089743589743589, + "grad_norm": 0.5442292094230652, + "learning_rate": 0.00018888743182625203, + "loss": 1.1306, + "step": 3420 + }, + { + "epoch": 0.6091524216524217, + "grad_norm": 0.4651123583316803, + "learning_rate": 0.00018888101799797636, + "loss": 0.9305, + "step": 3421 + }, + { + "epoch": 0.6093304843304843, + "grad_norm": 0.4713892340660095, + "learning_rate": 0.00018887460242826177, + "loss": 1.0789, + "step": 3422 + }, + { + "epoch": 0.6095085470085471, + "grad_norm": 0.5283244848251343, + "learning_rate": 0.00018886818511723398, + "loss": 1.345, + "step": 3423 + }, + { + "epoch": 0.6096866096866097, + "grad_norm": 0.5527324080467224, + "learning_rate": 0.0001888617660650187, + "loss": 1.1297, + "step": 3424 + }, + { + "epoch": 0.6098646723646723, + "grad_norm": 0.5412901043891907, + "learning_rate": 0.00018885534527174168, + "loss": 1.1213, + "step": 3425 + }, + { + "epoch": 0.6100427350427351, + "grad_norm": 0.5295354127883911, + "learning_rate": 0.00018884892273752878, + "loss": 1.1217, + "step": 3426 + }, + { + "epoch": 0.6102207977207977, + "grad_norm": 0.461900532245636, + "learning_rate": 0.0001888424984625058, + "loss": 0.827, + "step": 3427 + }, + { + "epoch": 0.6103988603988604, + "grad_norm": 0.4922671616077423, + "learning_rate": 0.00018883607244679865, + "loss": 1.2216, + "step": 3428 + }, + { + "epoch": 0.6105769230769231, + "grad_norm": 0.5080927014350891, + "learning_rate": 0.00018882964469053317, + "loss": 1.2446, + "step": 3429 + }, + { 
+ "epoch": 0.6107549857549858, + "grad_norm": 0.5523943901062012, + "learning_rate": 0.00018882321519383534, + "loss": 1.3346, + "step": 3430 + }, + { + "epoch": 0.6109330484330484, + "grad_norm": 0.5105271935462952, + "learning_rate": 0.0001888167839568311, + "loss": 1.1311, + "step": 3431 + }, + { + "epoch": 0.6111111111111112, + "grad_norm": 0.5635872483253479, + "learning_rate": 0.0001888103509796465, + "loss": 1.1875, + "step": 3432 + }, + { + "epoch": 0.6112891737891738, + "grad_norm": 0.4619547426700592, + "learning_rate": 0.00018880391626240755, + "loss": 0.9176, + "step": 3433 + }, + { + "epoch": 0.6114672364672364, + "grad_norm": 0.5896356105804443, + "learning_rate": 0.00018879747980524034, + "loss": 1.0251, + "step": 3434 + }, + { + "epoch": 0.6116452991452992, + "grad_norm": 0.49062737822532654, + "learning_rate": 0.000188791041608271, + "loss": 1.1598, + "step": 3435 + }, + { + "epoch": 0.6118233618233618, + "grad_norm": 0.45717164874076843, + "learning_rate": 0.00018878460167162558, + "loss": 0.8647, + "step": 3436 + }, + { + "epoch": 0.6120014245014245, + "grad_norm": 0.5903525352478027, + "learning_rate": 0.00018877815999543038, + "loss": 0.9671, + "step": 3437 + }, + { + "epoch": 0.6121794871794872, + "grad_norm": 0.5315384268760681, + "learning_rate": 0.00018877171657981153, + "loss": 1.1759, + "step": 3438 + }, + { + "epoch": 0.6123575498575499, + "grad_norm": 0.5650150775909424, + "learning_rate": 0.0001887652714248953, + "loss": 1.0128, + "step": 3439 + }, + { + "epoch": 0.6125356125356125, + "grad_norm": 0.49841752648353577, + "learning_rate": 0.000188758824530808, + "loss": 1.1259, + "step": 3440 + }, + { + "epoch": 0.6127136752136753, + "grad_norm": 0.4985620975494385, + "learning_rate": 0.00018875237589767593, + "loss": 1.0158, + "step": 3441 + }, + { + "epoch": 0.6128917378917379, + "grad_norm": 0.45266565680503845, + "learning_rate": 0.00018874592552562536, + "loss": 0.93, + "step": 3442 + }, + { + "epoch": 0.6130698005698005, + 
"grad_norm": 0.5696130990982056, + "learning_rate": 0.00018873947341478274, + "loss": 1.1432, + "step": 3443 + }, + { + "epoch": 0.6132478632478633, + "grad_norm": 0.5211645364761353, + "learning_rate": 0.00018873301956527451, + "loss": 1.1317, + "step": 3444 + }, + { + "epoch": 0.6134259259259259, + "grad_norm": 0.4991866946220398, + "learning_rate": 0.00018872656397722707, + "loss": 1.0362, + "step": 3445 + }, + { + "epoch": 0.6136039886039886, + "grad_norm": 0.5109508037567139, + "learning_rate": 0.00018872010665076694, + "loss": 1.2728, + "step": 3446 + }, + { + "epoch": 0.6137820512820513, + "grad_norm": 0.5838373899459839, + "learning_rate": 0.00018871364758602058, + "loss": 1.1131, + "step": 3447 + }, + { + "epoch": 0.613960113960114, + "grad_norm": 0.5139824151992798, + "learning_rate": 0.00018870718678311462, + "loss": 1.238, + "step": 3448 + }, + { + "epoch": 0.6141381766381766, + "grad_norm": 0.4852082431316376, + "learning_rate": 0.00018870072424217562, + "loss": 1.0677, + "step": 3449 + }, + { + "epoch": 0.6143162393162394, + "grad_norm": 0.5312315225601196, + "learning_rate": 0.00018869425996333018, + "loss": 1.178, + "step": 3450 + }, + { + "epoch": 0.614494301994302, + "grad_norm": 0.6343565583229065, + "learning_rate": 0.00018868779394670492, + "loss": 0.8839, + "step": 3451 + }, + { + "epoch": 0.6146723646723646, + "grad_norm": 0.6029773950576782, + "learning_rate": 0.00018868132619242662, + "loss": 1.1188, + "step": 3452 + }, + { + "epoch": 0.6148504273504274, + "grad_norm": 0.5246016383171082, + "learning_rate": 0.00018867485670062193, + "loss": 1.0797, + "step": 3453 + }, + { + "epoch": 0.61502849002849, + "grad_norm": 0.49307698011398315, + "learning_rate": 0.00018866838547141763, + "loss": 0.9749, + "step": 3454 + }, + { + "epoch": 0.6152065527065527, + "grad_norm": 0.5232903361320496, + "learning_rate": 0.00018866191250494052, + "loss": 1.0785, + "step": 3455 + }, + { + "epoch": 0.6153846153846154, + "grad_norm": 0.5545645356178284, + 
"learning_rate": 0.0001886554378013174, + "loss": 1.0496, + "step": 3456 + }, + { + "epoch": 0.6155626780626781, + "grad_norm": 0.493945837020874, + "learning_rate": 0.00018864896136067515, + "loss": 0.9248, + "step": 3457 + }, + { + "epoch": 0.6157407407407407, + "grad_norm": 0.5223548412322998, + "learning_rate": 0.00018864248318314065, + "loss": 1.0617, + "step": 3458 + }, + { + "epoch": 0.6159188034188035, + "grad_norm": 0.5666514039039612, + "learning_rate": 0.00018863600326884082, + "loss": 0.9981, + "step": 3459 + }, + { + "epoch": 0.6160968660968661, + "grad_norm": 0.4648127257823944, + "learning_rate": 0.00018862952161790265, + "loss": 0.917, + "step": 3460 + }, + { + "epoch": 0.6162749287749287, + "grad_norm": 0.590326189994812, + "learning_rate": 0.0001886230382304531, + "loss": 1.044, + "step": 3461 + }, + { + "epoch": 0.6164529914529915, + "grad_norm": 0.5511625409126282, + "learning_rate": 0.00018861655310661925, + "loss": 1.0988, + "step": 3462 + }, + { + "epoch": 0.6166310541310541, + "grad_norm": 0.567182183265686, + "learning_rate": 0.0001886100662465281, + "loss": 1.3017, + "step": 3463 + }, + { + "epoch": 0.6168091168091168, + "grad_norm": 0.5708897709846497, + "learning_rate": 0.0001886035776503068, + "loss": 0.9123, + "step": 3464 + }, + { + "epoch": 0.6169871794871795, + "grad_norm": 0.4945180416107178, + "learning_rate": 0.0001885970873180824, + "loss": 1.1645, + "step": 3465 + }, + { + "epoch": 0.6171652421652422, + "grad_norm": 0.4713336229324341, + "learning_rate": 0.00018859059524998215, + "loss": 1.0546, + "step": 3466 + }, + { + "epoch": 0.6173433048433048, + "grad_norm": 0.532859206199646, + "learning_rate": 0.0001885841014461332, + "loss": 1.0795, + "step": 3467 + }, + { + "epoch": 0.6175213675213675, + "grad_norm": 0.5165733695030212, + "learning_rate": 0.00018857760590666284, + "loss": 1.1284, + "step": 3468 + }, + { + "epoch": 0.6176994301994302, + "grad_norm": 0.48623126745224, + "learning_rate": 0.00018857110863169826, + "loss": 
0.8618, + "step": 3469 + }, + { + "epoch": 0.6178774928774928, + "grad_norm": 0.628559947013855, + "learning_rate": 0.0001885646096213668, + "loss": 1.1089, + "step": 3470 + }, + { + "epoch": 0.6180555555555556, + "grad_norm": 0.503545880317688, + "learning_rate": 0.0001885581088757958, + "loss": 1.2311, + "step": 3471 + }, + { + "epoch": 0.6182336182336182, + "grad_norm": 0.6172101497650146, + "learning_rate": 0.00018855160639511264, + "loss": 1.2651, + "step": 3472 + }, + { + "epoch": 0.6184116809116809, + "grad_norm": 0.49572527408599854, + "learning_rate": 0.00018854510217944465, + "loss": 1.1026, + "step": 3473 + }, + { + "epoch": 0.6185897435897436, + "grad_norm": 0.5373549461364746, + "learning_rate": 0.00018853859622891938, + "loss": 1.2562, + "step": 3474 + }, + { + "epoch": 0.6187678062678063, + "grad_norm": 0.5272396206855774, + "learning_rate": 0.0001885320885436642, + "loss": 1.1763, + "step": 3475 + }, + { + "epoch": 0.6189458689458689, + "grad_norm": 0.46584269404411316, + "learning_rate": 0.00018852557912380665, + "loss": 1.1762, + "step": 3476 + }, + { + "epoch": 0.6191239316239316, + "grad_norm": 0.4798245131969452, + "learning_rate": 0.0001885190679694743, + "loss": 0.9229, + "step": 3477 + }, + { + "epoch": 0.6193019943019943, + "grad_norm": 0.5221366286277771, + "learning_rate": 0.0001885125550807947, + "loss": 1.1078, + "step": 3478 + }, + { + "epoch": 0.6194800569800569, + "grad_norm": 0.5051897168159485, + "learning_rate": 0.0001885060404578954, + "loss": 1.0055, + "step": 3479 + }, + { + "epoch": 0.6196581196581197, + "grad_norm": 0.492662250995636, + "learning_rate": 0.00018849952410090413, + "loss": 1.1172, + "step": 3480 + }, + { + "epoch": 0.6198361823361823, + "grad_norm": 0.4906775951385498, + "learning_rate": 0.00018849300600994853, + "loss": 1.1223, + "step": 3481 + }, + { + "epoch": 0.6200142450142451, + "grad_norm": 0.5032641291618347, + "learning_rate": 0.0001884864861851563, + "loss": 0.9541, + "step": 3482 + }, + { + "epoch": 
0.6201923076923077, + "grad_norm": 0.5262296795845032, + "learning_rate": 0.00018847996462665521, + "loss": 1.021, + "step": 3483 + }, + { + "epoch": 0.6203703703703703, + "grad_norm": 0.5253522992134094, + "learning_rate": 0.00018847344133457295, + "loss": 0.9075, + "step": 3484 + }, + { + "epoch": 0.6205484330484331, + "grad_norm": 0.4204299747943878, + "learning_rate": 0.00018846691630903744, + "loss": 0.895, + "step": 3485 + }, + { + "epoch": 0.6207264957264957, + "grad_norm": 0.557604193687439, + "learning_rate": 0.0001884603895501765, + "loss": 1.1758, + "step": 3486 + }, + { + "epoch": 0.6209045584045584, + "grad_norm": 0.5981321930885315, + "learning_rate": 0.00018845386105811795, + "loss": 1.1087, + "step": 3487 + }, + { + "epoch": 0.6210826210826211, + "grad_norm": 0.5285581946372986, + "learning_rate": 0.00018844733083298975, + "loss": 1.0692, + "step": 3488 + }, + { + "epoch": 0.6212606837606838, + "grad_norm": 0.5403170585632324, + "learning_rate": 0.00018844079887491986, + "loss": 1.1998, + "step": 3489 + }, + { + "epoch": 0.6214387464387464, + "grad_norm": 0.5471615791320801, + "learning_rate": 0.0001884342651840362, + "loss": 0.9556, + "step": 3490 + }, + { + "epoch": 0.6216168091168092, + "grad_norm": 0.6126871705055237, + "learning_rate": 0.00018842772976046686, + "loss": 1.2629, + "step": 3491 + }, + { + "epoch": 0.6217948717948718, + "grad_norm": 0.45669353008270264, + "learning_rate": 0.00018842119260433982, + "loss": 1.0203, + "step": 3492 + }, + { + "epoch": 0.6219729344729344, + "grad_norm": 0.4998520612716675, + "learning_rate": 0.0001884146537157832, + "loss": 1.0271, + "step": 3493 + }, + { + "epoch": 0.6221509971509972, + "grad_norm": 0.5820242166519165, + "learning_rate": 0.00018840811309492507, + "loss": 1.0321, + "step": 3494 + }, + { + "epoch": 0.6223290598290598, + "grad_norm": 0.581676185131073, + "learning_rate": 0.00018840157074189367, + "loss": 0.9219, + "step": 3495 + }, + { + "epoch": 0.6225071225071225, + "grad_norm": 
0.6044120788574219, + "learning_rate": 0.0001883950266568171, + "loss": 1.1621, + "step": 3496 + }, + { + "epoch": 0.6226851851851852, + "grad_norm": 0.5448858737945557, + "learning_rate": 0.0001883884808398236, + "loss": 1.0686, + "step": 3497 + }, + { + "epoch": 0.6228632478632479, + "grad_norm": 0.4921551048755646, + "learning_rate": 0.00018838193329104143, + "loss": 1.2259, + "step": 3498 + }, + { + "epoch": 0.6230413105413105, + "grad_norm": 0.5374335646629333, + "learning_rate": 0.00018837538401059888, + "loss": 1.2608, + "step": 3499 + }, + { + "epoch": 0.6232193732193733, + "grad_norm": 0.5123008489608765, + "learning_rate": 0.0001883688329986243, + "loss": 0.8682, + "step": 3500 + }, + { + "epoch": 0.6233974358974359, + "grad_norm": 0.566145122051239, + "learning_rate": 0.00018836228025524595, + "loss": 1.1807, + "step": 3501 + }, + { + "epoch": 0.6235754985754985, + "grad_norm": 0.6658587455749512, + "learning_rate": 0.00018835572578059233, + "loss": 1.1641, + "step": 3502 + }, + { + "epoch": 0.6237535612535613, + "grad_norm": 0.4992465078830719, + "learning_rate": 0.00018834916957479177, + "loss": 0.9125, + "step": 3503 + }, + { + "epoch": 0.6239316239316239, + "grad_norm": 0.5081812739372253, + "learning_rate": 0.00018834261163797278, + "loss": 1.0939, + "step": 3504 + }, + { + "epoch": 0.6241096866096866, + "grad_norm": 0.5168607234954834, + "learning_rate": 0.0001883360519702638, + "loss": 1.2382, + "step": 3505 + }, + { + "epoch": 0.6242877492877493, + "grad_norm": 0.5517697334289551, + "learning_rate": 0.00018832949057179344, + "loss": 1.206, + "step": 3506 + }, + { + "epoch": 0.624465811965812, + "grad_norm": 0.4505497217178345, + "learning_rate": 0.00018832292744269013, + "loss": 0.8485, + "step": 3507 + }, + { + "epoch": 0.6246438746438746, + "grad_norm": 0.5230690240859985, + "learning_rate": 0.0001883163625830826, + "loss": 1.1701, + "step": 3508 + }, + { + "epoch": 0.6248219373219374, + "grad_norm": 0.5062205195426941, + "learning_rate": 
0.00018830979599309937, + "loss": 1.0602, + "step": 3509 + }, + { + "epoch": 0.625, + "grad_norm": 0.49922460317611694, + "learning_rate": 0.00018830322767286913, + "loss": 1.1937, + "step": 3510 + }, + { + "epoch": 0.6251780626780626, + "grad_norm": 0.4637366831302643, + "learning_rate": 0.0001882966576225206, + "loss": 1.038, + "step": 3511 + }, + { + "epoch": 0.6253561253561254, + "grad_norm": 0.5330080389976501, + "learning_rate": 0.00018829008584218246, + "loss": 0.9308, + "step": 3512 + }, + { + "epoch": 0.625534188034188, + "grad_norm": 0.5443428754806519, + "learning_rate": 0.0001882835123319835, + "loss": 1.0006, + "step": 3513 + }, + { + "epoch": 0.6257122507122507, + "grad_norm": 0.5534018874168396, + "learning_rate": 0.00018827693709205253, + "loss": 1.2383, + "step": 3514 + }, + { + "epoch": 0.6258903133903134, + "grad_norm": 0.49207547307014465, + "learning_rate": 0.00018827036012251832, + "loss": 0.9804, + "step": 3515 + }, + { + "epoch": 0.6260683760683761, + "grad_norm": 0.4900086224079132, + "learning_rate": 0.0001882637814235098, + "loss": 1.012, + "step": 3516 + }, + { + "epoch": 0.6262464387464387, + "grad_norm": 0.5267475247383118, + "learning_rate": 0.00018825720099515585, + "loss": 1.1104, + "step": 3517 + }, + { + "epoch": 0.6264245014245015, + "grad_norm": 0.5711902379989624, + "learning_rate": 0.00018825061883758534, + "loss": 1.0616, + "step": 3518 + }, + { + "epoch": 0.6266025641025641, + "grad_norm": 0.5007771849632263, + "learning_rate": 0.0001882440349509273, + "loss": 0.9578, + "step": 3519 + }, + { + "epoch": 0.6267806267806267, + "grad_norm": 0.5657192468643188, + "learning_rate": 0.00018823744933531075, + "loss": 1.2768, + "step": 3520 + }, + { + "epoch": 0.6269586894586895, + "grad_norm": 0.6077173352241516, + "learning_rate": 0.00018823086199086462, + "loss": 1.147, + "step": 3521 + }, + { + "epoch": 0.6271367521367521, + "grad_norm": 0.5114718079566956, + "learning_rate": 0.000188224272917718, + "loss": 1.1176, + "step": 3522 
+ }, + { + "epoch": 0.6273148148148148, + "grad_norm": 0.4831676185131073, + "learning_rate": 0.0001882176821160001, + "loss": 0.8021, + "step": 3523 + }, + { + "epoch": 0.6274928774928775, + "grad_norm": 0.6327390670776367, + "learning_rate": 0.00018821108958583994, + "loss": 0.9449, + "step": 3524 + }, + { + "epoch": 0.6276709401709402, + "grad_norm": 0.5541796684265137, + "learning_rate": 0.00018820449532736672, + "loss": 1.2018, + "step": 3525 + }, + { + "epoch": 0.6278490028490028, + "grad_norm": 0.5224639773368835, + "learning_rate": 0.00018819789934070968, + "loss": 1.0138, + "step": 3526 + }, + { + "epoch": 0.6280270655270656, + "grad_norm": 0.49359360337257385, + "learning_rate": 0.00018819130162599798, + "loss": 1.0768, + "step": 3527 + }, + { + "epoch": 0.6282051282051282, + "grad_norm": 0.5525050759315491, + "learning_rate": 0.00018818470218336092, + "loss": 1.0883, + "step": 3528 + }, + { + "epoch": 0.6283831908831908, + "grad_norm": 0.5563427209854126, + "learning_rate": 0.00018817810101292787, + "loss": 1.1491, + "step": 3529 + }, + { + "epoch": 0.6285612535612536, + "grad_norm": 0.49363306164741516, + "learning_rate": 0.00018817149811482803, + "loss": 1.1409, + "step": 3530 + }, + { + "epoch": 0.6287393162393162, + "grad_norm": 0.5102340579032898, + "learning_rate": 0.00018816489348919086, + "loss": 1.1914, + "step": 3531 + }, + { + "epoch": 0.6289173789173789, + "grad_norm": 0.5173332691192627, + "learning_rate": 0.00018815828713614576, + "loss": 0.9308, + "step": 3532 + }, + { + "epoch": 0.6290954415954416, + "grad_norm": 0.5093010067939758, + "learning_rate": 0.00018815167905582216, + "loss": 0.9429, + "step": 3533 + }, + { + "epoch": 0.6292735042735043, + "grad_norm": 0.5453153848648071, + "learning_rate": 0.00018814506924834954, + "loss": 1.0147, + "step": 3534 + }, + { + "epoch": 0.6294515669515669, + "grad_norm": 0.5850773453712463, + "learning_rate": 0.00018813845771385737, + "loss": 1.3372, + "step": 3535 + }, + { + "epoch": 
0.6296296296296297, + "grad_norm": 0.5095621943473816, + "learning_rate": 0.00018813184445247525, + "loss": 1.0515, + "step": 3536 + }, + { + "epoch": 0.6298076923076923, + "grad_norm": 0.6216054558753967, + "learning_rate": 0.00018812522946433266, + "loss": 0.8703, + "step": 3537 + }, + { + "epoch": 0.6299857549857549, + "grad_norm": 0.4945531189441681, + "learning_rate": 0.00018811861274955932, + "loss": 1.1485, + "step": 3538 + }, + { + "epoch": 0.6301638176638177, + "grad_norm": 0.47882601618766785, + "learning_rate": 0.00018811199430828477, + "loss": 1.1107, + "step": 3539 + }, + { + "epoch": 0.6303418803418803, + "grad_norm": 0.5005326867103577, + "learning_rate": 0.00018810537414063876, + "loss": 1.0237, + "step": 3540 + }, + { + "epoch": 0.6305199430199431, + "grad_norm": 0.5382370352745056, + "learning_rate": 0.00018809875224675093, + "loss": 0.9965, + "step": 3541 + }, + { + "epoch": 0.6306980056980057, + "grad_norm": 0.47002625465393066, + "learning_rate": 0.0001880921286267511, + "loss": 1.065, + "step": 3542 + }, + { + "epoch": 0.6308760683760684, + "grad_norm": 0.4519105851650238, + "learning_rate": 0.00018808550328076897, + "loss": 0.9312, + "step": 3543 + }, + { + "epoch": 0.6310541310541311, + "grad_norm": 0.45360881090164185, + "learning_rate": 0.0001880788762089344, + "loss": 1.0739, + "step": 3544 + }, + { + "epoch": 0.6312321937321937, + "grad_norm": 0.5578218698501587, + "learning_rate": 0.00018807224741137723, + "loss": 1.2478, + "step": 3545 + }, + { + "epoch": 0.6314102564102564, + "grad_norm": 0.4838615655899048, + "learning_rate": 0.0001880656168882273, + "loss": 1.0221, + "step": 3546 + }, + { + "epoch": 0.6315883190883191, + "grad_norm": 0.5733556747436523, + "learning_rate": 0.0001880589846396146, + "loss": 1.1249, + "step": 3547 + }, + { + "epoch": 0.6317663817663818, + "grad_norm": 0.4939686954021454, + "learning_rate": 0.00018805235066566894, + "loss": 0.8559, + "step": 3548 + }, + { + "epoch": 0.6319444444444444, + "grad_norm": 
0.5072234869003296, + "learning_rate": 0.00018804571496652044, + "loss": 1.0842, + "step": 3549 + }, + { + "epoch": 0.6321225071225072, + "grad_norm": 0.4640493392944336, + "learning_rate": 0.00018803907754229903, + "loss": 1.0728, + "step": 3550 + }, + { + "epoch": 0.6323005698005698, + "grad_norm": 0.5314788818359375, + "learning_rate": 0.00018803243839313481, + "loss": 1.0752, + "step": 3551 + }, + { + "epoch": 0.6324786324786325, + "grad_norm": 0.5511462092399597, + "learning_rate": 0.0001880257975191578, + "loss": 1.0238, + "step": 3552 + }, + { + "epoch": 0.6326566951566952, + "grad_norm": 0.4980711042881012, + "learning_rate": 0.00018801915492049816, + "loss": 1.0981, + "step": 3553 + }, + { + "epoch": 0.6328347578347578, + "grad_norm": 0.7746123671531677, + "learning_rate": 0.00018801251059728604, + "loss": 1.0968, + "step": 3554 + }, + { + "epoch": 0.6330128205128205, + "grad_norm": 0.5006106495857239, + "learning_rate": 0.00018800586454965155, + "loss": 1.1802, + "step": 3555 + }, + { + "epoch": 0.6331908831908832, + "grad_norm": 0.49427780508995056, + "learning_rate": 0.000187999216777725, + "loss": 1.1257, + "step": 3556 + }, + { + "epoch": 0.6333689458689459, + "grad_norm": 0.5484146475791931, + "learning_rate": 0.00018799256728163662, + "loss": 1.1344, + "step": 3557 + }, + { + "epoch": 0.6335470085470085, + "grad_norm": 0.5007877349853516, + "learning_rate": 0.00018798591606151662, + "loss": 1.1328, + "step": 3558 + }, + { + "epoch": 0.6337250712250713, + "grad_norm": 0.5068148970603943, + "learning_rate": 0.00018797926311749544, + "loss": 0.976, + "step": 3559 + }, + { + "epoch": 0.6339031339031339, + "grad_norm": 0.44936859607696533, + "learning_rate": 0.00018797260844970334, + "loss": 0.9735, + "step": 3560 + }, + { + "epoch": 0.6340811965811965, + "grad_norm": 0.4592931866645813, + "learning_rate": 0.0001879659520582707, + "loss": 1.1306, + "step": 3561 + }, + { + "epoch": 0.6342592592592593, + "grad_norm": 0.4664020836353302, + "learning_rate": 
0.00018795929394332795, + "loss": 1.0577, + "step": 3562 + }, + { + "epoch": 0.6344373219373219, + "grad_norm": 0.5638116002082825, + "learning_rate": 0.00018795263410500556, + "loss": 1.1747, + "step": 3563 + }, + { + "epoch": 0.6346153846153846, + "grad_norm": 0.524736225605011, + "learning_rate": 0.00018794597254343401, + "loss": 0.8964, + "step": 3564 + }, + { + "epoch": 0.6347934472934473, + "grad_norm": 0.4645404517650604, + "learning_rate": 0.00018793930925874386, + "loss": 0.8673, + "step": 3565 + }, + { + "epoch": 0.63497150997151, + "grad_norm": 0.4800064265727997, + "learning_rate": 0.00018793264425106558, + "loss": 1.0334, + "step": 3566 + }, + { + "epoch": 0.6351495726495726, + "grad_norm": 0.6202501058578491, + "learning_rate": 0.0001879259775205298, + "loss": 1.1061, + "step": 3567 + }, + { + "epoch": 0.6353276353276354, + "grad_norm": 0.503383457660675, + "learning_rate": 0.00018791930906726718, + "loss": 0.8545, + "step": 3568 + }, + { + "epoch": 0.635505698005698, + "grad_norm": 0.5256780982017517, + "learning_rate": 0.00018791263889140832, + "loss": 1.0785, + "step": 3569 + }, + { + "epoch": 0.6356837606837606, + "grad_norm": 0.47562023997306824, + "learning_rate": 0.00018790596699308392, + "loss": 1.0041, + "step": 3570 + }, + { + "epoch": 0.6358618233618234, + "grad_norm": 0.5103238224983215, + "learning_rate": 0.00018789929337242469, + "loss": 1.1488, + "step": 3571 + }, + { + "epoch": 0.636039886039886, + "grad_norm": 0.5023695826530457, + "learning_rate": 0.0001878926180295614, + "loss": 1.0696, + "step": 3572 + }, + { + "epoch": 0.6362179487179487, + "grad_norm": 0.5302290916442871, + "learning_rate": 0.00018788594096462487, + "loss": 1.0554, + "step": 3573 + }, + { + "epoch": 0.6363960113960114, + "grad_norm": 0.4798361361026764, + "learning_rate": 0.00018787926217774588, + "loss": 0.8872, + "step": 3574 + }, + { + "epoch": 0.6365740740740741, + "grad_norm": 0.5529209971427917, + "learning_rate": 0.00018787258166905527, + "loss": 1.0976, + 
"step": 3575 + }, + { + "epoch": 0.6367521367521367, + "grad_norm": 0.49757125973701477, + "learning_rate": 0.00018786589943868402, + "loss": 1.0049, + "step": 3576 + }, + { + "epoch": 0.6369301994301995, + "grad_norm": 0.5497848391532898, + "learning_rate": 0.00018785921548676295, + "loss": 1.2272, + "step": 3577 + }, + { + "epoch": 0.6371082621082621, + "grad_norm": 0.5061752200126648, + "learning_rate": 0.0001878525298134231, + "loss": 1.0307, + "step": 3578 + }, + { + "epoch": 0.6372863247863247, + "grad_norm": 0.5427432656288147, + "learning_rate": 0.00018784584241879538, + "loss": 1.1064, + "step": 3579 + }, + { + "epoch": 0.6374643874643875, + "grad_norm": 0.48312774300575256, + "learning_rate": 0.0001878391533030109, + "loss": 1.078, + "step": 3580 + }, + { + "epoch": 0.6376424501424501, + "grad_norm": 0.5059898495674133, + "learning_rate": 0.00018783246246620067, + "loss": 1.0922, + "step": 3581 + }, + { + "epoch": 0.6378205128205128, + "grad_norm": 0.5144124031066895, + "learning_rate": 0.00018782576990849581, + "loss": 1.0909, + "step": 3582 + }, + { + "epoch": 0.6379985754985755, + "grad_norm": 0.5535032153129578, + "learning_rate": 0.0001878190756300274, + "loss": 1.2579, + "step": 3583 + }, + { + "epoch": 0.6381766381766382, + "grad_norm": 0.49145692586898804, + "learning_rate": 0.00018781237963092667, + "loss": 1.0823, + "step": 3584 + }, + { + "epoch": 0.6383547008547008, + "grad_norm": 0.5245576500892639, + "learning_rate": 0.00018780568191132472, + "loss": 0.9595, + "step": 3585 + }, + { + "epoch": 0.6385327635327636, + "grad_norm": 0.5026637315750122, + "learning_rate": 0.00018779898247135287, + "loss": 1.153, + "step": 3586 + }, + { + "epoch": 0.6387108262108262, + "grad_norm": 0.5092771053314209, + "learning_rate": 0.00018779228131114234, + "loss": 1.0661, + "step": 3587 + }, + { + "epoch": 0.6388888888888888, + "grad_norm": 0.517387330532074, + "learning_rate": 0.00018778557843082444, + "loss": 1.0113, + "step": 3588 + }, + { + "epoch": 
0.6390669515669516, + "grad_norm": 0.5149948000907898, + "learning_rate": 0.00018777887383053047, + "loss": 0.9483, + "step": 3589 + }, + { + "epoch": 0.6392450142450142, + "grad_norm": 0.4854544997215271, + "learning_rate": 0.00018777216751039185, + "loss": 1.22, + "step": 3590 + }, + { + "epoch": 0.6394230769230769, + "grad_norm": 0.5317271947860718, + "learning_rate": 0.0001877654594705399, + "loss": 1.2483, + "step": 3591 + }, + { + "epoch": 0.6396011396011396, + "grad_norm": 0.4554755687713623, + "learning_rate": 0.0001877587497111061, + "loss": 0.9864, + "step": 3592 + }, + { + "epoch": 0.6397792022792023, + "grad_norm": 0.4833736717700958, + "learning_rate": 0.0001877520382322219, + "loss": 0.8895, + "step": 3593 + }, + { + "epoch": 0.6399572649572649, + "grad_norm": 0.5018072724342346, + "learning_rate": 0.00018774532503401878, + "loss": 1.2523, + "step": 3594 + }, + { + "epoch": 0.6401353276353277, + "grad_norm": 0.4478762447834015, + "learning_rate": 0.00018773861011662832, + "loss": 0.8833, + "step": 3595 + }, + { + "epoch": 0.6403133903133903, + "grad_norm": 0.5686985850334167, + "learning_rate": 0.00018773189348018205, + "loss": 0.9934, + "step": 3596 + }, + { + "epoch": 0.6404914529914529, + "grad_norm": 0.5144175291061401, + "learning_rate": 0.00018772517512481157, + "loss": 0.8149, + "step": 3597 + }, + { + "epoch": 0.6406695156695157, + "grad_norm": 0.5359936356544495, + "learning_rate": 0.00018771845505064852, + "loss": 1.1822, + "step": 3598 + }, + { + "epoch": 0.6408475783475783, + "grad_norm": 0.532573938369751, + "learning_rate": 0.00018771173325782457, + "loss": 1.0361, + "step": 3599 + }, + { + "epoch": 0.6410256410256411, + "grad_norm": 0.46121537685394287, + "learning_rate": 0.00018770500974647138, + "loss": 1.0792, + "step": 3600 + }, + { + "epoch": 0.6412037037037037, + "grad_norm": 0.4804821312427521, + "learning_rate": 0.00018769828451672076, + "loss": 1.1119, + "step": 3601 + }, + { + "epoch": 0.6413817663817664, + "grad_norm": 
0.4955114722251892, + "learning_rate": 0.00018769155756870443, + "loss": 0.9312, + "step": 3602 + }, + { + "epoch": 0.6415598290598291, + "grad_norm": 0.4987298250198364, + "learning_rate": 0.00018768482890255415, + "loss": 1.2326, + "step": 3603 + }, + { + "epoch": 0.6417378917378918, + "grad_norm": 0.47216179966926575, + "learning_rate": 0.0001876780985184018, + "loss": 1.0114, + "step": 3604 + }, + { + "epoch": 0.6419159544159544, + "grad_norm": 0.5891931653022766, + "learning_rate": 0.0001876713664163793, + "loss": 1.2963, + "step": 3605 + }, + { + "epoch": 0.6420940170940171, + "grad_norm": 0.4645081162452698, + "learning_rate": 0.00018766463259661846, + "loss": 1.0874, + "step": 3606 + }, + { + "epoch": 0.6422720797720798, + "grad_norm": 0.5275476574897766, + "learning_rate": 0.00018765789705925125, + "loss": 0.9453, + "step": 3607 + }, + { + "epoch": 0.6424501424501424, + "grad_norm": 0.5884957313537598, + "learning_rate": 0.00018765115980440964, + "loss": 1.0796, + "step": 3608 + }, + { + "epoch": 0.6426282051282052, + "grad_norm": 0.4843178987503052, + "learning_rate": 0.00018764442083222567, + "loss": 1.1657, + "step": 3609 + }, + { + "epoch": 0.6428062678062678, + "grad_norm": 0.5188381671905518, + "learning_rate": 0.00018763768014283126, + "loss": 1.1109, + "step": 3610 + }, + { + "epoch": 0.6429843304843305, + "grad_norm": 0.4101468324661255, + "learning_rate": 0.00018763093773635863, + "loss": 0.895, + "step": 3611 + }, + { + "epoch": 0.6431623931623932, + "grad_norm": 0.4552084505558014, + "learning_rate": 0.00018762419361293979, + "loss": 0.9418, + "step": 3612 + }, + { + "epoch": 0.6433404558404558, + "grad_norm": 0.5924661159515381, + "learning_rate": 0.0001876174477727069, + "loss": 1.2562, + "step": 3613 + }, + { + "epoch": 0.6435185185185185, + "grad_norm": 0.5072348713874817, + "learning_rate": 0.00018761070021579212, + "loss": 1.1501, + "step": 3614 + }, + { + "epoch": 0.6436965811965812, + "grad_norm": 0.5312697887420654, + "learning_rate": 
0.0001876039509423277, + "loss": 1.0751, + "step": 3615 + }, + { + "epoch": 0.6438746438746439, + "grad_norm": 0.6046462059020996, + "learning_rate": 0.0001875971999524458, + "loss": 1.0927, + "step": 3616 + }, + { + "epoch": 0.6440527065527065, + "grad_norm": 0.4992375373840332, + "learning_rate": 0.00018759044724627876, + "loss": 0.96, + "step": 3617 + }, + { + "epoch": 0.6442307692307693, + "grad_norm": 0.4983134865760803, + "learning_rate": 0.00018758369282395886, + "loss": 1.0599, + "step": 3618 + }, + { + "epoch": 0.6444088319088319, + "grad_norm": 0.5655683279037476, + "learning_rate": 0.00018757693668561843, + "loss": 1.2372, + "step": 3619 + }, + { + "epoch": 0.6445868945868946, + "grad_norm": 0.4968827962875366, + "learning_rate": 0.00018757017883138985, + "loss": 1.1639, + "step": 3620 + }, + { + "epoch": 0.6447649572649573, + "grad_norm": 0.5831420421600342, + "learning_rate": 0.00018756341926140553, + "loss": 0.9002, + "step": 3621 + }, + { + "epoch": 0.64494301994302, + "grad_norm": 0.4828467071056366, + "learning_rate": 0.0001875566579757979, + "loss": 0.9201, + "step": 3622 + }, + { + "epoch": 0.6451210826210826, + "grad_norm": 0.5067087411880493, + "learning_rate": 0.00018754989497469943, + "loss": 0.9874, + "step": 3623 + }, + { + "epoch": 0.6452991452991453, + "grad_norm": 0.5182318091392517, + "learning_rate": 0.00018754313025824267, + "loss": 1.1291, + "step": 3624 + }, + { + "epoch": 0.645477207977208, + "grad_norm": 0.472200483083725, + "learning_rate": 0.0001875363638265601, + "loss": 1.0286, + "step": 3625 + }, + { + "epoch": 0.6456552706552706, + "grad_norm": 0.4597308039665222, + "learning_rate": 0.0001875295956797843, + "loss": 0.7517, + "step": 3626 + }, + { + "epoch": 0.6458333333333334, + "grad_norm": 0.5358221530914307, + "learning_rate": 0.00018752282581804798, + "loss": 1.2264, + "step": 3627 + }, + { + "epoch": 0.646011396011396, + "grad_norm": 0.5268992781639099, + "learning_rate": 0.00018751605424148363, + "loss": 1.0801, + 
"step": 3628 + }, + { + "epoch": 0.6461894586894587, + "grad_norm": 0.5917379260063171, + "learning_rate": 0.00018750928095022403, + "loss": 0.9538, + "step": 3629 + }, + { + "epoch": 0.6463675213675214, + "grad_norm": 0.44506707787513733, + "learning_rate": 0.00018750250594440183, + "loss": 0.9818, + "step": 3630 + }, + { + "epoch": 0.646545584045584, + "grad_norm": 0.5578880906105042, + "learning_rate": 0.00018749572922414982, + "loss": 0.9958, + "step": 3631 + }, + { + "epoch": 0.6467236467236467, + "grad_norm": 0.5155318975448608, + "learning_rate": 0.00018748895078960076, + "loss": 1.2888, + "step": 3632 + }, + { + "epoch": 0.6469017094017094, + "grad_norm": 0.5117297768592834, + "learning_rate": 0.0001874821706408874, + "loss": 1.0452, + "step": 3633 + }, + { + "epoch": 0.6470797720797721, + "grad_norm": 0.5169841647148132, + "learning_rate": 0.00018747538877814267, + "loss": 1.1649, + "step": 3634 + }, + { + "epoch": 0.6472578347578347, + "grad_norm": 0.5001181960105896, + "learning_rate": 0.00018746860520149942, + "loss": 1.1472, + "step": 3635 + }, + { + "epoch": 0.6474358974358975, + "grad_norm": 0.6289856433868408, + "learning_rate": 0.00018746181991109056, + "loss": 1.0351, + "step": 3636 + }, + { + "epoch": 0.6476139601139601, + "grad_norm": 0.5490612983703613, + "learning_rate": 0.00018745503290704897, + "loss": 0.8938, + "step": 3637 + }, + { + "epoch": 0.6477920227920227, + "grad_norm": 0.47378283739089966, + "learning_rate": 0.00018744824418950775, + "loss": 0.937, + "step": 3638 + }, + { + "epoch": 0.6479700854700855, + "grad_norm": 0.6079059839248657, + "learning_rate": 0.0001874414537585998, + "loss": 1.0486, + "step": 3639 + }, + { + "epoch": 0.6481481481481481, + "grad_norm": 0.5351769924163818, + "learning_rate": 0.00018743466161445823, + "loss": 1.0316, + "step": 3640 + }, + { + "epoch": 0.6483262108262108, + "grad_norm": 0.5516425967216492, + "learning_rate": 0.0001874278677572161, + "loss": 1.1552, + "step": 3641 + }, + { + "epoch": 
0.6485042735042735, + "grad_norm": 0.5027523636817932, + "learning_rate": 0.0001874210721870065, + "loss": 1.0491, + "step": 3642 + }, + { + "epoch": 0.6486823361823362, + "grad_norm": 0.5596168041229248, + "learning_rate": 0.00018741427490396258, + "loss": 1.0256, + "step": 3643 + }, + { + "epoch": 0.6488603988603988, + "grad_norm": 0.5601046681404114, + "learning_rate": 0.00018740747590821751, + "loss": 1.1604, + "step": 3644 + }, + { + "epoch": 0.6490384615384616, + "grad_norm": 0.49749523401260376, + "learning_rate": 0.0001874006751999046, + "loss": 1.0532, + "step": 3645 + }, + { + "epoch": 0.6492165242165242, + "grad_norm": 0.6226113438606262, + "learning_rate": 0.00018739387277915697, + "loss": 1.1402, + "step": 3646 + }, + { + "epoch": 0.6493945868945868, + "grad_norm": 0.6142009496688843, + "learning_rate": 0.00018738706864610794, + "loss": 1.2437, + "step": 3647 + }, + { + "epoch": 0.6495726495726496, + "grad_norm": 0.48814916610717773, + "learning_rate": 0.00018738026280089084, + "loss": 0.8429, + "step": 3648 + }, + { + "epoch": 0.6497507122507122, + "grad_norm": 0.5717982053756714, + "learning_rate": 0.00018737345524363902, + "loss": 1.1095, + "step": 3649 + }, + { + "epoch": 0.6499287749287749, + "grad_norm": 0.5150009989738464, + "learning_rate": 0.00018736664597448582, + "loss": 1.199, + "step": 3650 + }, + { + "epoch": 0.6501068376068376, + "grad_norm": 0.58461594581604, + "learning_rate": 0.00018735983499356472, + "loss": 1.0704, + "step": 3651 + }, + { + "epoch": 0.6502849002849003, + "grad_norm": 0.5108643770217896, + "learning_rate": 0.0001873530223010091, + "loss": 1.2039, + "step": 3652 + }, + { + "epoch": 0.6504629629629629, + "grad_norm": 0.513306736946106, + "learning_rate": 0.00018734620789695247, + "loss": 1.1448, + "step": 3653 + }, + { + "epoch": 0.6506410256410257, + "grad_norm": 0.5139986872673035, + "learning_rate": 0.00018733939178152835, + "loss": 1.0023, + "step": 3654 + }, + { + "epoch": 0.6508190883190883, + "grad_norm": 
0.5187703967094421, + "learning_rate": 0.00018733257395487027, + "loss": 1.1304, + "step": 3655 + }, + { + "epoch": 0.6509971509971509, + "grad_norm": 0.5470501184463501, + "learning_rate": 0.00018732575441711183, + "loss": 1.0272, + "step": 3656 + }, + { + "epoch": 0.6511752136752137, + "grad_norm": 0.537309467792511, + "learning_rate": 0.00018731893316838665, + "loss": 1.0806, + "step": 3657 + }, + { + "epoch": 0.6513532763532763, + "grad_norm": 0.5187864899635315, + "learning_rate": 0.00018731211020882836, + "loss": 1.0154, + "step": 3658 + }, + { + "epoch": 0.6515313390313391, + "grad_norm": 0.48373252153396606, + "learning_rate": 0.00018730528553857062, + "loss": 1.0135, + "step": 3659 + }, + { + "epoch": 0.6517094017094017, + "grad_norm": 0.5645000338554382, + "learning_rate": 0.00018729845915774716, + "loss": 0.8924, + "step": 3660 + }, + { + "epoch": 0.6518874643874644, + "grad_norm": 0.5722129940986633, + "learning_rate": 0.00018729163106649178, + "loss": 1.2416, + "step": 3661 + }, + { + "epoch": 0.6520655270655271, + "grad_norm": 0.5904877185821533, + "learning_rate": 0.00018728480126493823, + "loss": 0.9792, + "step": 3662 + }, + { + "epoch": 0.6522435897435898, + "grad_norm": 0.5224713087081909, + "learning_rate": 0.00018727796975322026, + "loss": 1.079, + "step": 3663 + }, + { + "epoch": 0.6524216524216524, + "grad_norm": 0.5667217969894409, + "learning_rate": 0.00018727113653147184, + "loss": 1.1397, + "step": 3664 + }, + { + "epoch": 0.6525997150997151, + "grad_norm": 0.5274622440338135, + "learning_rate": 0.00018726430159982677, + "loss": 1.0569, + "step": 3665 + }, + { + "epoch": 0.6527777777777778, + "grad_norm": 0.5745310187339783, + "learning_rate": 0.00018725746495841896, + "loss": 1.2129, + "step": 3666 + }, + { + "epoch": 0.6529558404558404, + "grad_norm": 0.6123398542404175, + "learning_rate": 0.0001872506266073824, + "loss": 1.186, + "step": 3667 + }, + { + "epoch": 0.6531339031339032, + "grad_norm": 0.4983387291431427, + "learning_rate": 
0.00018724378654685106, + "loss": 1.1957, + "step": 3668 + }, + { + "epoch": 0.6533119658119658, + "grad_norm": 0.5584192276000977, + "learning_rate": 0.00018723694477695897, + "loss": 1.0939, + "step": 3669 + }, + { + "epoch": 0.6534900284900285, + "grad_norm": 0.5318745374679565, + "learning_rate": 0.00018723010129784016, + "loss": 1.1869, + "step": 3670 + }, + { + "epoch": 0.6536680911680912, + "grad_norm": 0.4607617259025574, + "learning_rate": 0.0001872232561096287, + "loss": 0.8447, + "step": 3671 + }, + { + "epoch": 0.6538461538461539, + "grad_norm": 0.5312213897705078, + "learning_rate": 0.00018721640921245874, + "loss": 1.0623, + "step": 3672 + }, + { + "epoch": 0.6540242165242165, + "grad_norm": 0.5099136233329773, + "learning_rate": 0.0001872095606064644, + "loss": 0.7174, + "step": 3673 + }, + { + "epoch": 0.6542022792022792, + "grad_norm": 0.6894404888153076, + "learning_rate": 0.0001872027102917799, + "loss": 1.0251, + "step": 3674 + }, + { + "epoch": 0.6543803418803419, + "grad_norm": 0.5758535861968994, + "learning_rate": 0.00018719585826853944, + "loss": 1.1655, + "step": 3675 + }, + { + "epoch": 0.6545584045584045, + "grad_norm": 0.521824061870575, + "learning_rate": 0.0001871890045368773, + "loss": 1.1653, + "step": 3676 + }, + { + "epoch": 0.6547364672364673, + "grad_norm": 0.5370712280273438, + "learning_rate": 0.00018718214909692771, + "loss": 1.3152, + "step": 3677 + }, + { + "epoch": 0.6549145299145299, + "grad_norm": 0.4459827244281769, + "learning_rate": 0.000187175291948825, + "loss": 1.0953, + "step": 3678 + }, + { + "epoch": 0.6550925925925926, + "grad_norm": 0.44131460785865784, + "learning_rate": 0.00018716843309270353, + "loss": 0.8568, + "step": 3679 + }, + { + "epoch": 0.6552706552706553, + "grad_norm": 0.5529624819755554, + "learning_rate": 0.00018716157252869772, + "loss": 1.2085, + "step": 3680 + }, + { + "epoch": 0.655448717948718, + "grad_norm": 0.44604751467704773, + "learning_rate": 0.00018715471025694194, + "loss": 0.9605, 
+ "step": 3681 + }, + { + "epoch": 0.6556267806267806, + "grad_norm": 0.4662449359893799, + "learning_rate": 0.0001871478462775707, + "loss": 1.2092, + "step": 3682 + }, + { + "epoch": 0.6558048433048433, + "grad_norm": 0.42632922530174255, + "learning_rate": 0.0001871409805907184, + "loss": 0.9141, + "step": 3683 + }, + { + "epoch": 0.655982905982906, + "grad_norm": 0.534009575843811, + "learning_rate": 0.00018713411319651958, + "loss": 1.0147, + "step": 3684 + }, + { + "epoch": 0.6561609686609686, + "grad_norm": 0.5433241724967957, + "learning_rate": 0.00018712724409510888, + "loss": 1.1998, + "step": 3685 + }, + { + "epoch": 0.6563390313390314, + "grad_norm": 0.4771319627761841, + "learning_rate": 0.0001871203732866208, + "loss": 1.0384, + "step": 3686 + }, + { + "epoch": 0.656517094017094, + "grad_norm": 0.507641077041626, + "learning_rate": 0.00018711350077119, + "loss": 0.9608, + "step": 3687 + }, + { + "epoch": 0.6566951566951567, + "grad_norm": 0.5069413185119629, + "learning_rate": 0.00018710662654895108, + "loss": 1.055, + "step": 3688 + }, + { + "epoch": 0.6568732193732194, + "grad_norm": 0.512340247631073, + "learning_rate": 0.00018709975062003876, + "loss": 0.9506, + "step": 3689 + }, + { + "epoch": 0.657051282051282, + "grad_norm": 0.5156390070915222, + "learning_rate": 0.00018709287298458778, + "loss": 1.0089, + "step": 3690 + }, + { + "epoch": 0.6572293447293447, + "grad_norm": 0.5101696252822876, + "learning_rate": 0.0001870859936427329, + "loss": 1.0441, + "step": 3691 + }, + { + "epoch": 0.6574074074074074, + "grad_norm": 0.4394689202308655, + "learning_rate": 0.00018707911259460884, + "loss": 0.9124, + "step": 3692 + }, + { + "epoch": 0.6575854700854701, + "grad_norm": 0.4842554032802582, + "learning_rate": 0.00018707222984035043, + "loss": 1.0051, + "step": 3693 + }, + { + "epoch": 0.6577635327635327, + "grad_norm": 0.6418108344078064, + "learning_rate": 0.00018706534538009262, + "loss": 1.1165, + "step": 3694 + }, + { + "epoch": 
0.6579415954415955, + "grad_norm": 0.5596832036972046, + "learning_rate": 0.00018705845921397022, + "loss": 1.1127, + "step": 3695 + }, + { + "epoch": 0.6581196581196581, + "grad_norm": 0.6692909002304077, + "learning_rate": 0.00018705157134211813, + "loss": 1.2403, + "step": 3696 + }, + { + "epoch": 0.6582977207977208, + "grad_norm": 0.5046468377113342, + "learning_rate": 0.00018704468176467134, + "loss": 1.1016, + "step": 3697 + }, + { + "epoch": 0.6584757834757835, + "grad_norm": 0.6723586320877075, + "learning_rate": 0.00018703779048176485, + "loss": 1.1777, + "step": 3698 + }, + { + "epoch": 0.6586538461538461, + "grad_norm": 0.5269754528999329, + "learning_rate": 0.00018703089749353365, + "loss": 1.1441, + "step": 3699 + }, + { + "epoch": 0.6588319088319088, + "grad_norm": 0.5303323268890381, + "learning_rate": 0.0001870240028001128, + "loss": 1.07, + "step": 3700 + }, + { + "epoch": 0.6590099715099715, + "grad_norm": 0.4795511066913605, + "learning_rate": 0.00018701710640163738, + "loss": 1.0189, + "step": 3701 + }, + { + "epoch": 0.6591880341880342, + "grad_norm": 0.514659583568573, + "learning_rate": 0.00018701020829824255, + "loss": 1.0792, + "step": 3702 + }, + { + "epoch": 0.6593660968660968, + "grad_norm": 0.5407463312149048, + "learning_rate": 0.0001870033084900634, + "loss": 0.9346, + "step": 3703 + }, + { + "epoch": 0.6595441595441596, + "grad_norm": 0.5358424186706543, + "learning_rate": 0.0001869964069772352, + "loss": 1.1242, + "step": 3704 + }, + { + "epoch": 0.6597222222222222, + "grad_norm": 0.470825731754303, + "learning_rate": 0.00018698950375989307, + "loss": 0.9952, + "step": 3705 + }, + { + "epoch": 0.6599002849002849, + "grad_norm": 0.5711592435836792, + "learning_rate": 0.00018698259883817236, + "loss": 1.1678, + "step": 3706 + }, + { + "epoch": 0.6600783475783476, + "grad_norm": 0.5298995971679688, + "learning_rate": 0.00018697569221220832, + "loss": 0.869, + "step": 3707 + }, + { + "epoch": 0.6602564102564102, + "grad_norm": 
0.5453875064849854, + "learning_rate": 0.00018696878388213626, + "loss": 0.9706, + "step": 3708 + }, + { + "epoch": 0.6604344729344729, + "grad_norm": 0.6219926476478577, + "learning_rate": 0.00018696187384809154, + "loss": 1.1902, + "step": 3709 + }, + { + "epoch": 0.6606125356125356, + "grad_norm": 0.5972491502761841, + "learning_rate": 0.00018695496211020953, + "loss": 1.2054, + "step": 3710 + }, + { + "epoch": 0.6607905982905983, + "grad_norm": 0.5048904418945312, + "learning_rate": 0.0001869480486686257, + "loss": 1.0405, + "step": 3711 + }, + { + "epoch": 0.6609686609686609, + "grad_norm": 0.5474200248718262, + "learning_rate": 0.00018694113352347546, + "loss": 1.09, + "step": 3712 + }, + { + "epoch": 0.6611467236467237, + "grad_norm": 0.5073318481445312, + "learning_rate": 0.00018693421667489432, + "loss": 1.0698, + "step": 3713 + }, + { + "epoch": 0.6613247863247863, + "grad_norm": 0.5693208575248718, + "learning_rate": 0.0001869272981230178, + "loss": 0.9664, + "step": 3714 + }, + { + "epoch": 0.6615028490028491, + "grad_norm": 0.5678503513336182, + "learning_rate": 0.00018692037786798143, + "loss": 1.0895, + "step": 3715 + }, + { + "epoch": 0.6616809116809117, + "grad_norm": 0.4950976073741913, + "learning_rate": 0.00018691345590992082, + "loss": 0.9584, + "step": 3716 + }, + { + "epoch": 0.6618589743589743, + "grad_norm": 0.4944666624069214, + "learning_rate": 0.0001869065322489716, + "loss": 0.8607, + "step": 3717 + }, + { + "epoch": 0.6620370370370371, + "grad_norm": 0.5197804570198059, + "learning_rate": 0.0001868996068852694, + "loss": 1.2335, + "step": 3718 + }, + { + "epoch": 0.6622150997150997, + "grad_norm": 0.6550365686416626, + "learning_rate": 0.00018689267981894994, + "loss": 1.0441, + "step": 3719 + }, + { + "epoch": 0.6623931623931624, + "grad_norm": 0.5331503748893738, + "learning_rate": 0.00018688575105014888, + "loss": 1.1696, + "step": 3720 + }, + { + "epoch": 0.6625712250712251, + "grad_norm": 0.47304239869117737, + "learning_rate": 
0.00018687882057900207, + "loss": 0.9695, + "step": 3721 + }, + { + "epoch": 0.6627492877492878, + "grad_norm": 0.5653772354125977, + "learning_rate": 0.00018687188840564524, + "loss": 1.2082, + "step": 3722 + }, + { + "epoch": 0.6629273504273504, + "grad_norm": 0.5323491096496582, + "learning_rate": 0.00018686495453021417, + "loss": 0.9106, + "step": 3723 + }, + { + "epoch": 0.6631054131054132, + "grad_norm": 0.5612817406654358, + "learning_rate": 0.00018685801895284483, + "loss": 1.1302, + "step": 3724 + }, + { + "epoch": 0.6632834757834758, + "grad_norm": 0.4562164545059204, + "learning_rate": 0.000186851081673673, + "loss": 0.8886, + "step": 3725 + }, + { + "epoch": 0.6634615384615384, + "grad_norm": 0.5006430745124817, + "learning_rate": 0.00018684414269283463, + "loss": 0.9128, + "step": 3726 + }, + { + "epoch": 0.6636396011396012, + "grad_norm": 0.5305442810058594, + "learning_rate": 0.0001868372020104657, + "loss": 1.1766, + "step": 3727 + }, + { + "epoch": 0.6638176638176638, + "grad_norm": 0.6129274368286133, + "learning_rate": 0.0001868302596267022, + "loss": 1.04, + "step": 3728 + }, + { + "epoch": 0.6639957264957265, + "grad_norm": 0.5530399084091187, + "learning_rate": 0.00018682331554168013, + "loss": 1.4114, + "step": 3729 + }, + { + "epoch": 0.6641737891737892, + "grad_norm": 0.5397193431854248, + "learning_rate": 0.00018681636975553557, + "loss": 1.1945, + "step": 3730 + }, + { + "epoch": 0.6643518518518519, + "grad_norm": 0.5510205030441284, + "learning_rate": 0.00018680942226840456, + "loss": 1.0489, + "step": 3731 + }, + { + "epoch": 0.6645299145299145, + "grad_norm": 0.5519221425056458, + "learning_rate": 0.00018680247308042324, + "loss": 1.1633, + "step": 3732 + }, + { + "epoch": 0.6647079772079773, + "grad_norm": 0.4848768711090088, + "learning_rate": 0.00018679552219172784, + "loss": 0.8716, + "step": 3733 + }, + { + "epoch": 0.6648860398860399, + "grad_norm": 0.5490246415138245, + "learning_rate": 0.0001867885696024544, + "loss": 1.1347, + 
"step": 3734 + }, + { + "epoch": 0.6650641025641025, + "grad_norm": 0.5281458497047424, + "learning_rate": 0.00018678161531273928, + "loss": 1.0987, + "step": 3735 + }, + { + "epoch": 0.6652421652421653, + "grad_norm": 0.5313079953193665, + "learning_rate": 0.00018677465932271867, + "loss": 0.9705, + "step": 3736 + }, + { + "epoch": 0.6654202279202279, + "grad_norm": 0.5425750017166138, + "learning_rate": 0.0001867677016325289, + "loss": 1.1847, + "step": 3737 + }, + { + "epoch": 0.6655982905982906, + "grad_norm": 0.5796298980712891, + "learning_rate": 0.0001867607422423062, + "loss": 1.2639, + "step": 3738 + }, + { + "epoch": 0.6657763532763533, + "grad_norm": 0.49738675355911255, + "learning_rate": 0.00018675378115218702, + "loss": 1.0536, + "step": 3739 + }, + { + "epoch": 0.665954415954416, + "grad_norm": 0.665250301361084, + "learning_rate": 0.0001867468183623077, + "loss": 1.2836, + "step": 3740 + }, + { + "epoch": 0.6661324786324786, + "grad_norm": 0.5184717178344727, + "learning_rate": 0.00018673985387280469, + "loss": 1.0497, + "step": 3741 + }, + { + "epoch": 0.6663105413105413, + "grad_norm": 0.5129656791687012, + "learning_rate": 0.00018673288768381442, + "loss": 1.2041, + "step": 3742 + }, + { + "epoch": 0.666488603988604, + "grad_norm": 0.5308768153190613, + "learning_rate": 0.00018672591979547337, + "loss": 1.2092, + "step": 3743 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.5059141516685486, + "learning_rate": 0.00018671895020791812, + "loss": 1.1929, + "step": 3744 + }, + { + "epoch": 0.6668447293447294, + "grad_norm": 0.5237857103347778, + "learning_rate": 0.00018671197892128517, + "loss": 1.2538, + "step": 3745 + }, + { + "epoch": 0.667022792022792, + "grad_norm": 0.450000137090683, + "learning_rate": 0.0001867050059357111, + "loss": 0.7138, + "step": 3746 + }, + { + "epoch": 0.6672008547008547, + "grad_norm": 0.5413795709609985, + "learning_rate": 0.00018669803125133258, + "loss": 1.1383, + "step": 3747 + }, + { + "epoch": 
0.6673789173789174, + "grad_norm": 0.4657825529575348, + "learning_rate": 0.00018669105486828622, + "loss": 1.0518, + "step": 3748 + }, + { + "epoch": 0.66755698005698, + "grad_norm": 0.6198551654815674, + "learning_rate": 0.00018668407678670875, + "loss": 1.2697, + "step": 3749 + }, + { + "epoch": 0.6677350427350427, + "grad_norm": 0.5112186074256897, + "learning_rate": 0.00018667709700673685, + "loss": 0.9907, + "step": 3750 + }, + { + "epoch": 0.6679131054131054, + "grad_norm": 0.5446593761444092, + "learning_rate": 0.00018667011552850728, + "loss": 1.0708, + "step": 3751 + }, + { + "epoch": 0.6680911680911681, + "grad_norm": 0.5673866271972656, + "learning_rate": 0.00018666313235215682, + "loss": 1.05, + "step": 3752 + }, + { + "epoch": 0.6682692307692307, + "grad_norm": 0.4821988046169281, + "learning_rate": 0.00018665614747782235, + "loss": 1.0543, + "step": 3753 + }, + { + "epoch": 0.6684472934472935, + "grad_norm": 0.5158842206001282, + "learning_rate": 0.00018664916090564067, + "loss": 1.0331, + "step": 3754 + }, + { + "epoch": 0.6686253561253561, + "grad_norm": 0.45486921072006226, + "learning_rate": 0.00018664217263574865, + "loss": 0.9262, + "step": 3755 + }, + { + "epoch": 0.6688034188034188, + "grad_norm": 0.46193036437034607, + "learning_rate": 0.00018663518266828327, + "loss": 0.9858, + "step": 3756 + }, + { + "epoch": 0.6689814814814815, + "grad_norm": 0.5144094824790955, + "learning_rate": 0.00018662819100338148, + "loss": 1.0302, + "step": 3757 + }, + { + "epoch": 0.6691595441595442, + "grad_norm": 0.5246134400367737, + "learning_rate": 0.0001866211976411802, + "loss": 1.064, + "step": 3758 + }, + { + "epoch": 0.6693376068376068, + "grad_norm": 0.4853166937828064, + "learning_rate": 0.0001866142025818165, + "loss": 0.9481, + "step": 3759 + }, + { + "epoch": 0.6695156695156695, + "grad_norm": 0.5029586553573608, + "learning_rate": 0.00018660720582542743, + "loss": 0.9443, + "step": 3760 + }, + { + "epoch": 0.6696937321937322, + "grad_norm": 
0.5373172163963318, + "learning_rate": 0.0001866002073721501, + "loss": 1.1401, + "step": 3761 + }, + { + "epoch": 0.6698717948717948, + "grad_norm": 0.6236287951469421, + "learning_rate": 0.00018659320722212158, + "loss": 1.1255, + "step": 3762 + }, + { + "epoch": 0.6700498575498576, + "grad_norm": 0.5470684766769409, + "learning_rate": 0.00018658620537547903, + "loss": 1.0622, + "step": 3763 + }, + { + "epoch": 0.6702279202279202, + "grad_norm": 0.63177090883255, + "learning_rate": 0.00018657920183235964, + "loss": 0.9736, + "step": 3764 + }, + { + "epoch": 0.6704059829059829, + "grad_norm": 0.5456309914588928, + "learning_rate": 0.00018657219659290068, + "loss": 1.027, + "step": 3765 + }, + { + "epoch": 0.6705840455840456, + "grad_norm": 0.4816138744354248, + "learning_rate": 0.00018656518965723935, + "loss": 0.7801, + "step": 3766 + }, + { + "epoch": 0.6707621082621082, + "grad_norm": 0.4811640679836273, + "learning_rate": 0.00018655818102551294, + "loss": 1.0535, + "step": 3767 + }, + { + "epoch": 0.6709401709401709, + "grad_norm": 0.4677673280239105, + "learning_rate": 0.00018655117069785884, + "loss": 1.1043, + "step": 3768 + }, + { + "epoch": 0.6711182336182336, + "grad_norm": 0.5628635883331299, + "learning_rate": 0.0001865441586744143, + "loss": 1.0392, + "step": 3769 + }, + { + "epoch": 0.6712962962962963, + "grad_norm": 0.5484504103660583, + "learning_rate": 0.00018653714495531673, + "loss": 1.1533, + "step": 3770 + }, + { + "epoch": 0.6714743589743589, + "grad_norm": 0.5830571055412292, + "learning_rate": 0.0001865301295407036, + "loss": 1.2479, + "step": 3771 + }, + { + "epoch": 0.6716524216524217, + "grad_norm": 0.5516841411590576, + "learning_rate": 0.00018652311243071235, + "loss": 1.2152, + "step": 3772 + }, + { + "epoch": 0.6718304843304843, + "grad_norm": 0.6360766291618347, + "learning_rate": 0.0001865160936254804, + "loss": 1.0752, + "step": 3773 + }, + { + "epoch": 0.6720085470085471, + "grad_norm": 0.6038610935211182, + "learning_rate": 
0.00018650907312514533, + "loss": 1.2425, + "step": 3774 + }, + { + "epoch": 0.6721866096866097, + "grad_norm": 0.49572908878326416, + "learning_rate": 0.0001865020509298447, + "loss": 1.0057, + "step": 3775 + }, + { + "epoch": 0.6723646723646723, + "grad_norm": 0.4551616311073303, + "learning_rate": 0.00018649502703971607, + "loss": 1.0763, + "step": 3776 + }, + { + "epoch": 0.6725427350427351, + "grad_norm": 0.6621482372283936, + "learning_rate": 0.00018648800145489706, + "loss": 1.0306, + "step": 3777 + }, + { + "epoch": 0.6727207977207977, + "grad_norm": 0.5523806810379028, + "learning_rate": 0.0001864809741755253, + "loss": 0.9906, + "step": 3778 + }, + { + "epoch": 0.6728988603988604, + "grad_norm": 0.5527048110961914, + "learning_rate": 0.00018647394520173856, + "loss": 1.0734, + "step": 3779 + }, + { + "epoch": 0.6730769230769231, + "grad_norm": 0.573573887348175, + "learning_rate": 0.00018646691453367444, + "loss": 1.1409, + "step": 3780 + }, + { + "epoch": 0.6732549857549858, + "grad_norm": 0.6273239254951477, + "learning_rate": 0.00018645988217147079, + "loss": 0.9682, + "step": 3781 + }, + { + "epoch": 0.6734330484330484, + "grad_norm": 0.4917762279510498, + "learning_rate": 0.00018645284811526534, + "loss": 0.9681, + "step": 3782 + }, + { + "epoch": 0.6736111111111112, + "grad_norm": 0.4901154339313507, + "learning_rate": 0.0001864458123651959, + "loss": 1.1828, + "step": 3783 + }, + { + "epoch": 0.6737891737891738, + "grad_norm": 0.6292546391487122, + "learning_rate": 0.00018643877492140036, + "loss": 1.1987, + "step": 3784 + }, + { + "epoch": 0.6739672364672364, + "grad_norm": 0.5334137678146362, + "learning_rate": 0.0001864317357840166, + "loss": 1.0347, + "step": 3785 + }, + { + "epoch": 0.6741452991452992, + "grad_norm": 0.6064338684082031, + "learning_rate": 0.0001864246949531825, + "loss": 1.4154, + "step": 3786 + }, + { + "epoch": 0.6743233618233618, + "grad_norm": 0.5442034602165222, + "learning_rate": 0.000186417652429036, + "loss": 1.2604, + 
"step": 3787 + }, + { + "epoch": 0.6745014245014245, + "grad_norm": 0.490858793258667, + "learning_rate": 0.00018641060821171518, + "loss": 1.1511, + "step": 3788 + }, + { + "epoch": 0.6746794871794872, + "grad_norm": 0.571116030216217, + "learning_rate": 0.00018640356230135798, + "loss": 1.1479, + "step": 3789 + }, + { + "epoch": 0.6748575498575499, + "grad_norm": 0.4857785105705261, + "learning_rate": 0.00018639651469810247, + "loss": 0.9, + "step": 3790 + }, + { + "epoch": 0.6750356125356125, + "grad_norm": 0.5320703983306885, + "learning_rate": 0.0001863894654020867, + "loss": 1.2284, + "step": 3791 + }, + { + "epoch": 0.6752136752136753, + "grad_norm": 0.5586925745010376, + "learning_rate": 0.0001863824144134488, + "loss": 1.1183, + "step": 3792 + }, + { + "epoch": 0.6753917378917379, + "grad_norm": 0.47740885615348816, + "learning_rate": 0.000186375361732327, + "loss": 1.1512, + "step": 3793 + }, + { + "epoch": 0.6755698005698005, + "grad_norm": 0.5867732167243958, + "learning_rate": 0.00018636830735885935, + "loss": 1.1903, + "step": 3794 + }, + { + "epoch": 0.6757478632478633, + "grad_norm": 0.5013887882232666, + "learning_rate": 0.0001863612512931842, + "loss": 0.8581, + "step": 3795 + }, + { + "epoch": 0.6759259259259259, + "grad_norm": 0.6026871204376221, + "learning_rate": 0.0001863541935354397, + "loss": 0.9581, + "step": 3796 + }, + { + "epoch": 0.6761039886039886, + "grad_norm": 0.5238468647003174, + "learning_rate": 0.00018634713408576415, + "loss": 1.0949, + "step": 3797 + }, + { + "epoch": 0.6762820512820513, + "grad_norm": 0.5128598213195801, + "learning_rate": 0.00018634007294429585, + "loss": 0.8992, + "step": 3798 + }, + { + "epoch": 0.676460113960114, + "grad_norm": 0.5092771053314209, + "learning_rate": 0.00018633301011117324, + "loss": 1.0793, + "step": 3799 + }, + { + "epoch": 0.6766381766381766, + "grad_norm": 0.592566728591919, + "learning_rate": 0.00018632594558653457, + "loss": 1.3242, + "step": 3800 + }, + { + "epoch": 
0.6768162393162394, + "grad_norm": 0.4953067898750305, + "learning_rate": 0.0001863188793705184, + "loss": 0.9925, + "step": 3801 + }, + { + "epoch": 0.676994301994302, + "grad_norm": 0.4989747107028961, + "learning_rate": 0.00018631181146326305, + "loss": 1.0677, + "step": 3802 + }, + { + "epoch": 0.6771723646723646, + "grad_norm": 0.5375261902809143, + "learning_rate": 0.00018630474186490705, + "loss": 1.0556, + "step": 3803 + }, + { + "epoch": 0.6773504273504274, + "grad_norm": 0.6512624025344849, + "learning_rate": 0.00018629767057558894, + "loss": 1.2041, + "step": 3804 + }, + { + "epoch": 0.67752849002849, + "grad_norm": 0.5428260564804077, + "learning_rate": 0.00018629059759544723, + "loss": 0.9645, + "step": 3805 + }, + { + "epoch": 0.6777065527065527, + "grad_norm": 0.5598662495613098, + "learning_rate": 0.00018628352292462052, + "loss": 1.1683, + "step": 3806 + }, + { + "epoch": 0.6778846153846154, + "grad_norm": 0.49351340532302856, + "learning_rate": 0.0001862764465632474, + "loss": 1.1622, + "step": 3807 + }, + { + "epoch": 0.6780626780626781, + "grad_norm": 0.4796701669692993, + "learning_rate": 0.00018626936851146657, + "loss": 1.0017, + "step": 3808 + }, + { + "epoch": 0.6782407407407407, + "grad_norm": 0.444533109664917, + "learning_rate": 0.00018626228876941664, + "loss": 0.9145, + "step": 3809 + }, + { + "epoch": 0.6784188034188035, + "grad_norm": 0.5197392702102661, + "learning_rate": 0.00018625520733723635, + "loss": 1.283, + "step": 3810 + }, + { + "epoch": 0.6785968660968661, + "grad_norm": 0.48785829544067383, + "learning_rate": 0.00018624812421506447, + "loss": 1.1084, + "step": 3811 + }, + { + "epoch": 0.6787749287749287, + "grad_norm": 0.5083680152893066, + "learning_rate": 0.00018624103940303974, + "loss": 0.9071, + "step": 3812 + }, + { + "epoch": 0.6789529914529915, + "grad_norm": 0.553819477558136, + "learning_rate": 0.00018623395290130103, + "loss": 0.9986, + "step": 3813 + }, + { + "epoch": 0.6791310541310541, + "grad_norm": 
0.5347508788108826, + "learning_rate": 0.00018622686470998713, + "loss": 1.0148, + "step": 3814 + }, + { + "epoch": 0.6793091168091168, + "grad_norm": 0.5080769062042236, + "learning_rate": 0.00018621977482923693, + "loss": 1.0169, + "step": 3815 + }, + { + "epoch": 0.6794871794871795, + "grad_norm": 0.5444077849388123, + "learning_rate": 0.00018621268325918938, + "loss": 1.172, + "step": 3816 + }, + { + "epoch": 0.6796652421652422, + "grad_norm": 0.521946132183075, + "learning_rate": 0.00018620558999998335, + "loss": 1.0247, + "step": 3817 + }, + { + "epoch": 0.6798433048433048, + "grad_norm": 0.5257413983345032, + "learning_rate": 0.00018619849505175786, + "loss": 1.1574, + "step": 3818 + }, + { + "epoch": 0.6800213675213675, + "grad_norm": 0.5473007559776306, + "learning_rate": 0.00018619139841465193, + "loss": 1.1254, + "step": 3819 + }, + { + "epoch": 0.6801994301994302, + "grad_norm": 0.5479872226715088, + "learning_rate": 0.00018618430008880463, + "loss": 1.0196, + "step": 3820 + }, + { + "epoch": 0.6803774928774928, + "grad_norm": 0.5918973088264465, + "learning_rate": 0.00018617720007435497, + "loss": 1.082, + "step": 3821 + }, + { + "epoch": 0.6805555555555556, + "grad_norm": 0.5411791801452637, + "learning_rate": 0.0001861700983714421, + "loss": 0.7723, + "step": 3822 + }, + { + "epoch": 0.6807336182336182, + "grad_norm": 0.5466326475143433, + "learning_rate": 0.00018616299498020516, + "loss": 1.0979, + "step": 3823 + }, + { + "epoch": 0.6809116809116809, + "grad_norm": 0.5405182838439941, + "learning_rate": 0.00018615588990078332, + "loss": 0.8891, + "step": 3824 + }, + { + "epoch": 0.6810897435897436, + "grad_norm": 0.5415780544281006, + "learning_rate": 0.00018614878313331579, + "loss": 1.0927, + "step": 3825 + }, + { + "epoch": 0.6812678062678063, + "grad_norm": 0.5284909605979919, + "learning_rate": 0.00018614167467794182, + "loss": 1.0684, + "step": 3826 + }, + { + "epoch": 0.6814458689458689, + "grad_norm": 0.4873995780944824, + "learning_rate": 
0.00018613456453480062, + "loss": 1.1653, + "step": 3827 + }, + { + "epoch": 0.6816239316239316, + "grad_norm": 0.5506551265716553, + "learning_rate": 0.0001861274527040316, + "loss": 0.9876, + "step": 3828 + }, + { + "epoch": 0.6818019943019943, + "grad_norm": 0.5031297206878662, + "learning_rate": 0.0001861203391857741, + "loss": 1.067, + "step": 3829 + }, + { + "epoch": 0.6819800569800569, + "grad_norm": 0.622346043586731, + "learning_rate": 0.0001861132239801674, + "loss": 1.1514, + "step": 3830 + }, + { + "epoch": 0.6821581196581197, + "grad_norm": 0.47706183791160583, + "learning_rate": 0.000186106107087351, + "loss": 0.9857, + "step": 3831 + }, + { + "epoch": 0.6823361823361823, + "grad_norm": 0.5082845091819763, + "learning_rate": 0.00018609898850746424, + "loss": 1.123, + "step": 3832 + }, + { + "epoch": 0.6825142450142451, + "grad_norm": 0.5119805932044983, + "learning_rate": 0.00018609186824064671, + "loss": 1.1386, + "step": 3833 + }, + { + "epoch": 0.6826923076923077, + "grad_norm": 0.5247541069984436, + "learning_rate": 0.00018608474628703788, + "loss": 0.9433, + "step": 3834 + }, + { + "epoch": 0.6828703703703703, + "grad_norm": 0.4618282616138458, + "learning_rate": 0.00018607762264677722, + "loss": 0.8727, + "step": 3835 + }, + { + "epoch": 0.6830484330484331, + "grad_norm": 0.6014040112495422, + "learning_rate": 0.00018607049732000436, + "loss": 1.1823, + "step": 3836 + }, + { + "epoch": 0.6832264957264957, + "grad_norm": 0.6489043831825256, + "learning_rate": 0.00018606337030685892, + "loss": 1.1466, + "step": 3837 + }, + { + "epoch": 0.6834045584045584, + "grad_norm": 0.5527763366699219, + "learning_rate": 0.00018605624160748053, + "loss": 1.3015, + "step": 3838 + }, + { + "epoch": 0.6835826210826211, + "grad_norm": 0.5628284215927124, + "learning_rate": 0.0001860491112220088, + "loss": 1.1504, + "step": 3839 + }, + { + "epoch": 0.6837606837606838, + "grad_norm": 0.5414566993713379, + "learning_rate": 0.00018604197915058355, + "loss": 1.0155, + 
"step": 3840 + }, + { + "epoch": 0.6839387464387464, + "grad_norm": 0.5378929376602173, + "learning_rate": 0.00018603484539334443, + "loss": 0.8917, + "step": 3841 + }, + { + "epoch": 0.6841168091168092, + "grad_norm": 0.5953748822212219, + "learning_rate": 0.00018602770995043125, + "loss": 1.1971, + "step": 3842 + }, + { + "epoch": 0.6842948717948718, + "grad_norm": 0.511813759803772, + "learning_rate": 0.00018602057282198376, + "loss": 1.1345, + "step": 3843 + }, + { + "epoch": 0.6844729344729344, + "grad_norm": 0.5145484209060669, + "learning_rate": 0.00018601343400814185, + "loss": 1.0786, + "step": 3844 + }, + { + "epoch": 0.6846509971509972, + "grad_norm": 0.5199604034423828, + "learning_rate": 0.00018600629350904542, + "loss": 1.2063, + "step": 3845 + }, + { + "epoch": 0.6848290598290598, + "grad_norm": 0.5653825998306274, + "learning_rate": 0.0001859991513248343, + "loss": 1.0314, + "step": 3846 + }, + { + "epoch": 0.6850071225071225, + "grad_norm": 0.5660843849182129, + "learning_rate": 0.00018599200745564843, + "loss": 1.2754, + "step": 3847 + }, + { + "epoch": 0.6851851851851852, + "grad_norm": 0.5225719809532166, + "learning_rate": 0.00018598486190162788, + "loss": 1.0837, + "step": 3848 + }, + { + "epoch": 0.6853632478632479, + "grad_norm": 0.5011669397354126, + "learning_rate": 0.00018597771466291252, + "loss": 1.1, + "step": 3849 + }, + { + "epoch": 0.6855413105413105, + "grad_norm": 0.5923115015029907, + "learning_rate": 0.00018597056573964245, + "loss": 1.1875, + "step": 3850 + }, + { + "epoch": 0.6857193732193733, + "grad_norm": 0.5666482448577881, + "learning_rate": 0.00018596341513195776, + "loss": 1.1663, + "step": 3851 + }, + { + "epoch": 0.6858974358974359, + "grad_norm": 0.5396790504455566, + "learning_rate": 0.0001859562628399985, + "loss": 1.1179, + "step": 3852 + }, + { + "epoch": 0.6860754985754985, + "grad_norm": 0.5709532499313354, + "learning_rate": 0.00018594910886390485, + "loss": 1.0369, + "step": 3853 + }, + { + "epoch": 
0.6862535612535613, + "grad_norm": 0.45524322986602783, + "learning_rate": 0.00018594195320381692, + "loss": 1.0171, + "step": 3854 + }, + { + "epoch": 0.6864316239316239, + "grad_norm": 0.6130724549293518, + "learning_rate": 0.00018593479585987498, + "loss": 1.1944, + "step": 3855 + }, + { + "epoch": 0.6866096866096866, + "grad_norm": 0.5079745054244995, + "learning_rate": 0.0001859276368322192, + "loss": 1.2567, + "step": 3856 + }, + { + "epoch": 0.6867877492877493, + "grad_norm": 0.49919846653938293, + "learning_rate": 0.00018592047612098992, + "loss": 0.9459, + "step": 3857 + }, + { + "epoch": 0.686965811965812, + "grad_norm": 0.5776857733726501, + "learning_rate": 0.00018591331372632734, + "loss": 1.2456, + "step": 3858 + }, + { + "epoch": 0.6871438746438746, + "grad_norm": 0.4740692377090454, + "learning_rate": 0.00018590614964837188, + "loss": 1.0401, + "step": 3859 + }, + { + "epoch": 0.6873219373219374, + "grad_norm": 0.5015742182731628, + "learning_rate": 0.00018589898388726389, + "loss": 1.2052, + "step": 3860 + }, + { + "epoch": 0.6875, + "grad_norm": 0.4819730818271637, + "learning_rate": 0.0001858918164431437, + "loss": 1.007, + "step": 3861 + }, + { + "epoch": 0.6876780626780626, + "grad_norm": 0.5510426163673401, + "learning_rate": 0.00018588464731615184, + "loss": 1.0123, + "step": 3862 + }, + { + "epoch": 0.6878561253561254, + "grad_norm": 0.4950829744338989, + "learning_rate": 0.00018587747650642867, + "loss": 1.033, + "step": 3863 + }, + { + "epoch": 0.688034188034188, + "grad_norm": 0.5278680920600891, + "learning_rate": 0.0001858703040141148, + "loss": 1.0912, + "step": 3864 + }, + { + "epoch": 0.6882122507122507, + "grad_norm": 0.6359158158302307, + "learning_rate": 0.00018586312983935068, + "loss": 1.2868, + "step": 3865 + }, + { + "epoch": 0.6883903133903134, + "grad_norm": 0.5098239183425903, + "learning_rate": 0.0001858559539822769, + "loss": 0.8364, + "step": 3866 + }, + { + "epoch": 0.6885683760683761, + "grad_norm": 0.5651038289070129, 
+ "learning_rate": 0.000185848776443034, + "loss": 1.1983, + "step": 3867 + }, + { + "epoch": 0.6887464387464387, + "grad_norm": 0.5305678248405457, + "learning_rate": 0.00018584159722176272, + "loss": 1.32, + "step": 3868 + }, + { + "epoch": 0.6889245014245015, + "grad_norm": 0.5481845140457153, + "learning_rate": 0.00018583441631860368, + "loss": 1.013, + "step": 3869 + }, + { + "epoch": 0.6891025641025641, + "grad_norm": 0.5214795470237732, + "learning_rate": 0.00018582723373369753, + "loss": 1.172, + "step": 3870 + }, + { + "epoch": 0.6892806267806267, + "grad_norm": 0.6282780766487122, + "learning_rate": 0.00018582004946718502, + "loss": 1.7304, + "step": 3871 + }, + { + "epoch": 0.6894586894586895, + "grad_norm": 0.5266988277435303, + "learning_rate": 0.0001858128635192069, + "loss": 1.1418, + "step": 3872 + }, + { + "epoch": 0.6896367521367521, + "grad_norm": 0.4761001467704773, + "learning_rate": 0.000185805675889904, + "loss": 0.8585, + "step": 3873 + }, + { + "epoch": 0.6898148148148148, + "grad_norm": 0.528779923915863, + "learning_rate": 0.00018579848657941715, + "loss": 1.0036, + "step": 3874 + }, + { + "epoch": 0.6899928774928775, + "grad_norm": 0.5427684783935547, + "learning_rate": 0.00018579129558788716, + "loss": 0.9769, + "step": 3875 + }, + { + "epoch": 0.6901709401709402, + "grad_norm": 0.6229544281959534, + "learning_rate": 0.00018578410291545495, + "loss": 1.2848, + "step": 3876 + }, + { + "epoch": 0.6903490028490028, + "grad_norm": 0.6602693200111389, + "learning_rate": 0.00018577690856226147, + "loss": 1.2713, + "step": 3877 + }, + { + "epoch": 0.6905270655270656, + "grad_norm": 0.45884042978286743, + "learning_rate": 0.0001857697125284476, + "loss": 0.9143, + "step": 3878 + }, + { + "epoch": 0.6907051282051282, + "grad_norm": 0.4956444203853607, + "learning_rate": 0.00018576251481415443, + "loss": 0.9646, + "step": 3879 + }, + { + "epoch": 0.6908831908831908, + "grad_norm": 0.473561555147171, + "learning_rate": 0.00018575531541952292, + 
"loss": 0.843, + "step": 3880 + }, + { + "epoch": 0.6910612535612536, + "grad_norm": 0.4676312506198883, + "learning_rate": 0.00018574811434469415, + "loss": 0.9464, + "step": 3881 + }, + { + "epoch": 0.6912393162393162, + "grad_norm": 0.5452045202255249, + "learning_rate": 0.00018574091158980922, + "loss": 0.985, + "step": 3882 + }, + { + "epoch": 0.6914173789173789, + "grad_norm": 0.6274946331977844, + "learning_rate": 0.0001857337071550092, + "loss": 1.0357, + "step": 3883 + }, + { + "epoch": 0.6915954415954416, + "grad_norm": 0.5533788800239563, + "learning_rate": 0.00018572650104043531, + "loss": 1.2636, + "step": 3884 + }, + { + "epoch": 0.6917735042735043, + "grad_norm": 0.48312318325042725, + "learning_rate": 0.00018571929324622872, + "loss": 1.2402, + "step": 3885 + }, + { + "epoch": 0.6919515669515669, + "grad_norm": 0.6087453961372375, + "learning_rate": 0.00018571208377253062, + "loss": 1.2961, + "step": 3886 + }, + { + "epoch": 0.6921296296296297, + "grad_norm": 0.49156486988067627, + "learning_rate": 0.00018570487261948234, + "loss": 0.9585, + "step": 3887 + }, + { + "epoch": 0.6923076923076923, + "grad_norm": 0.5200015902519226, + "learning_rate": 0.0001856976597872251, + "loss": 0.9274, + "step": 3888 + }, + { + "epoch": 0.6924857549857549, + "grad_norm": 0.5185118913650513, + "learning_rate": 0.0001856904452759002, + "loss": 1.0015, + "step": 3889 + }, + { + "epoch": 0.6926638176638177, + "grad_norm": 0.5859049558639526, + "learning_rate": 0.00018568322908564904, + "loss": 1.0959, + "step": 3890 + }, + { + "epoch": 0.6928418803418803, + "grad_norm": 0.5882301926612854, + "learning_rate": 0.00018567601121661302, + "loss": 1.3214, + "step": 3891 + }, + { + "epoch": 0.6930199430199431, + "grad_norm": 0.6475503444671631, + "learning_rate": 0.0001856687916689335, + "loss": 1.3265, + "step": 3892 + }, + { + "epoch": 0.6931980056980057, + "grad_norm": 0.46175432205200195, + "learning_rate": 0.000185661570442752, + "loss": 0.8547, + "step": 3893 + }, + { + 
"epoch": 0.6933760683760684, + "grad_norm": 0.5362716913223267, + "learning_rate": 0.00018565434753820998, + "loss": 0.974, + "step": 3894 + }, + { + "epoch": 0.6935541310541311, + "grad_norm": 0.4317963719367981, + "learning_rate": 0.00018564712295544896, + "loss": 0.7653, + "step": 3895 + }, + { + "epoch": 0.6937321937321937, + "grad_norm": 0.5679717659950256, + "learning_rate": 0.00018563989669461047, + "loss": 1.0691, + "step": 3896 + }, + { + "epoch": 0.6939102564102564, + "grad_norm": 0.5058363676071167, + "learning_rate": 0.00018563266875583608, + "loss": 1.0665, + "step": 3897 + }, + { + "epoch": 0.6940883190883191, + "grad_norm": 0.5365496277809143, + "learning_rate": 0.00018562543913926746, + "loss": 0.9963, + "step": 3898 + }, + { + "epoch": 0.6942663817663818, + "grad_norm": 0.49945300817489624, + "learning_rate": 0.0001856182078450462, + "loss": 0.8668, + "step": 3899 + }, + { + "epoch": 0.6944444444444444, + "grad_norm": 0.5869430899620056, + "learning_rate": 0.00018561097487331405, + "loss": 1.1942, + "step": 3900 + }, + { + "epoch": 0.6946225071225072, + "grad_norm": 0.5188950300216675, + "learning_rate": 0.0001856037402242127, + "loss": 0.9493, + "step": 3901 + }, + { + "epoch": 0.6948005698005698, + "grad_norm": 0.510788083076477, + "learning_rate": 0.00018559650389788384, + "loss": 0.9989, + "step": 3902 + }, + { + "epoch": 0.6949786324786325, + "grad_norm": 0.5360601544380188, + "learning_rate": 0.0001855892658944693, + "loss": 1.2766, + "step": 3903 + }, + { + "epoch": 0.6951566951566952, + "grad_norm": 0.522502601146698, + "learning_rate": 0.00018558202621411093, + "loss": 0.8774, + "step": 3904 + }, + { + "epoch": 0.6953347578347578, + "grad_norm": 0.5330635905265808, + "learning_rate": 0.00018557478485695052, + "loss": 0.972, + "step": 3905 + }, + { + "epoch": 0.6955128205128205, + "grad_norm": 0.5387479066848755, + "learning_rate": 0.00018556754182312996, + "loss": 1.0574, + "step": 3906 + }, + { + "epoch": 0.6956908831908832, + 
"grad_norm": 0.5357984900474548, + "learning_rate": 0.00018556029711279116, + "loss": 1.396, + "step": 3907 + }, + { + "epoch": 0.6958689458689459, + "grad_norm": 0.5647178292274475, + "learning_rate": 0.00018555305072607612, + "loss": 1.3304, + "step": 3908 + }, + { + "epoch": 0.6960470085470085, + "grad_norm": 0.46460914611816406, + "learning_rate": 0.00018554580266312673, + "loss": 0.9574, + "step": 3909 + }, + { + "epoch": 0.6962250712250713, + "grad_norm": 0.6206206679344177, + "learning_rate": 0.00018553855292408503, + "loss": 1.1637, + "step": 3910 + }, + { + "epoch": 0.6964031339031339, + "grad_norm": 0.5899842977523804, + "learning_rate": 0.00018553130150909312, + "loss": 1.1067, + "step": 3911 + }, + { + "epoch": 0.6965811965811965, + "grad_norm": 0.47294262051582336, + "learning_rate": 0.000185524048418293, + "loss": 1.1516, + "step": 3912 + }, + { + "epoch": 0.6967592592592593, + "grad_norm": 0.5791197419166565, + "learning_rate": 0.00018551679365182684, + "loss": 1.0007, + "step": 3913 + }, + { + "epoch": 0.6969373219373219, + "grad_norm": 0.5678651332855225, + "learning_rate": 0.00018550953720983672, + "loss": 1.2698, + "step": 3914 + }, + { + "epoch": 0.6971153846153846, + "grad_norm": 0.6509683728218079, + "learning_rate": 0.0001855022790924649, + "loss": 1.0354, + "step": 3915 + }, + { + "epoch": 0.6972934472934473, + "grad_norm": 0.5176648497581482, + "learning_rate": 0.0001854950192998535, + "loss": 1.1243, + "step": 3916 + }, + { + "epoch": 0.69747150997151, + "grad_norm": 0.520631730556488, + "learning_rate": 0.00018548775783214477, + "loss": 1.1371, + "step": 3917 + }, + { + "epoch": 0.6976495726495726, + "grad_norm": 0.5408333539962769, + "learning_rate": 0.00018548049468948108, + "loss": 1.1185, + "step": 3918 + }, + { + "epoch": 0.6978276353276354, + "grad_norm": 0.5423790216445923, + "learning_rate": 0.00018547322987200461, + "loss": 1.1539, + "step": 3919 + }, + { + "epoch": 0.698005698005698, + "grad_norm": 0.5422113537788391, + 
"learning_rate": 0.0001854659633798578, + "loss": 1.171, + "step": 3920 + }, + { + "epoch": 0.6981837606837606, + "grad_norm": 0.5113416314125061, + "learning_rate": 0.00018545869521318292, + "loss": 1.0597, + "step": 3921 + }, + { + "epoch": 0.6983618233618234, + "grad_norm": 0.49901214241981506, + "learning_rate": 0.00018545142537212248, + "loss": 1.1043, + "step": 3922 + }, + { + "epoch": 0.698539886039886, + "grad_norm": 0.6606622338294983, + "learning_rate": 0.00018544415385681885, + "loss": 1.1797, + "step": 3923 + }, + { + "epoch": 0.6987179487179487, + "grad_norm": 0.4786234498023987, + "learning_rate": 0.00018543688066741454, + "loss": 0.9532, + "step": 3924 + }, + { + "epoch": 0.6988960113960114, + "grad_norm": 0.5900700688362122, + "learning_rate": 0.00018542960580405203, + "loss": 1.1171, + "step": 3925 + }, + { + "epoch": 0.6990740740740741, + "grad_norm": 0.53485506772995, + "learning_rate": 0.00018542232926687383, + "loss": 1.1535, + "step": 3926 + }, + { + "epoch": 0.6992521367521367, + "grad_norm": 0.5269177556037903, + "learning_rate": 0.00018541505105602255, + "loss": 1.0287, + "step": 3927 + }, + { + "epoch": 0.6994301994301995, + "grad_norm": 0.5185505151748657, + "learning_rate": 0.0001854077711716408, + "loss": 1.2526, + "step": 3928 + }, + { + "epoch": 0.6996082621082621, + "grad_norm": 0.5615512132644653, + "learning_rate": 0.00018540048961387115, + "loss": 1.0189, + "step": 3929 + }, + { + "epoch": 0.6997863247863247, + "grad_norm": 0.4492493271827698, + "learning_rate": 0.00018539320638285637, + "loss": 0.8917, + "step": 3930 + }, + { + "epoch": 0.6999643874643875, + "grad_norm": 0.5062302947044373, + "learning_rate": 0.00018538592147873906, + "loss": 1.053, + "step": 3931 + }, + { + "epoch": 0.7001424501424501, + "grad_norm": 0.5508798956871033, + "learning_rate": 0.000185378634901662, + "loss": 0.9638, + "step": 3932 + }, + { + "epoch": 0.7003205128205128, + "grad_norm": 0.463980108499527, + "learning_rate": 0.00018537134665176793, + 
"loss": 1.0945, + "step": 3933 + }, + { + "epoch": 0.7004985754985755, + "grad_norm": 0.5027088522911072, + "learning_rate": 0.0001853640567291997, + "loss": 1.1745, + "step": 3934 + }, + { + "epoch": 0.7006766381766382, + "grad_norm": 0.5006551146507263, + "learning_rate": 0.00018535676513410009, + "loss": 0.8521, + "step": 3935 + }, + { + "epoch": 0.7008547008547008, + "grad_norm": 0.5870724320411682, + "learning_rate": 0.000185349471866612, + "loss": 0.9197, + "step": 3936 + }, + { + "epoch": 0.7010327635327636, + "grad_norm": 0.5030696392059326, + "learning_rate": 0.00018534217692687825, + "loss": 1.1049, + "step": 3937 + }, + { + "epoch": 0.7012108262108262, + "grad_norm": 0.5212681889533997, + "learning_rate": 0.00018533488031504186, + "loss": 1.3397, + "step": 3938 + }, + { + "epoch": 0.7013888888888888, + "grad_norm": 0.5649709105491638, + "learning_rate": 0.0001853275820312458, + "loss": 1.1994, + "step": 3939 + }, + { + "epoch": 0.7015669515669516, + "grad_norm": 0.4892779290676117, + "learning_rate": 0.00018532028207563297, + "loss": 1.1511, + "step": 3940 + }, + { + "epoch": 0.7017450142450142, + "grad_norm": 0.4929407835006714, + "learning_rate": 0.00018531298044834643, + "loss": 1.0792, + "step": 3941 + }, + { + "epoch": 0.7019230769230769, + "grad_norm": 0.5645940899848938, + "learning_rate": 0.00018530567714952932, + "loss": 1.0937, + "step": 3942 + }, + { + "epoch": 0.7021011396011396, + "grad_norm": 0.5471178293228149, + "learning_rate": 0.00018529837217932466, + "loss": 1.193, + "step": 3943 + }, + { + "epoch": 0.7022792022792023, + "grad_norm": 0.576627790927887, + "learning_rate": 0.00018529106553787558, + "loss": 1.1032, + "step": 3944 + }, + { + "epoch": 0.7024572649572649, + "grad_norm": 0.5015735626220703, + "learning_rate": 0.00018528375722532526, + "loss": 1.066, + "step": 3945 + }, + { + "epoch": 0.7026353276353277, + "grad_norm": 0.5315404534339905, + "learning_rate": 0.00018527644724181683, + "loss": 1.2059, + "step": 3946 + }, + { + 
"epoch": 0.7028133903133903, + "grad_norm": 0.5516065955162048, + "learning_rate": 0.0001852691355874936, + "loss": 1.161, + "step": 3947 + }, + { + "epoch": 0.7029914529914529, + "grad_norm": 0.5026212930679321, + "learning_rate": 0.0001852618222624988, + "loss": 1.2616, + "step": 3948 + }, + { + "epoch": 0.7031695156695157, + "grad_norm": 0.49874603748321533, + "learning_rate": 0.0001852545072669757, + "loss": 0.805, + "step": 3949 + }, + { + "epoch": 0.7033475783475783, + "grad_norm": 0.47698748111724854, + "learning_rate": 0.00018524719060106763, + "loss": 1.2321, + "step": 3950 + }, + { + "epoch": 0.7035256410256411, + "grad_norm": 0.5201322436332703, + "learning_rate": 0.00018523987226491792, + "loss": 1.1577, + "step": 3951 + }, + { + "epoch": 0.7037037037037037, + "grad_norm": 0.5506543517112732, + "learning_rate": 0.00018523255225867002, + "loss": 1.2289, + "step": 3952 + }, + { + "epoch": 0.7038817663817664, + "grad_norm": 0.5691256523132324, + "learning_rate": 0.0001852252305824673, + "loss": 1.1945, + "step": 3953 + }, + { + "epoch": 0.7040598290598291, + "grad_norm": 0.5324838757514954, + "learning_rate": 0.00018521790723645322, + "loss": 1.1037, + "step": 3954 + }, + { + "epoch": 0.7042378917378918, + "grad_norm": 0.5238786339759827, + "learning_rate": 0.00018521058222077127, + "loss": 1.2075, + "step": 3955 + }, + { + "epoch": 0.7044159544159544, + "grad_norm": 0.4936453402042389, + "learning_rate": 0.00018520325553556498, + "loss": 1.0537, + "step": 3956 + }, + { + "epoch": 0.7045940170940171, + "grad_norm": 0.6198282837867737, + "learning_rate": 0.00018519592718097791, + "loss": 1.0728, + "step": 3957 + }, + { + "epoch": 0.7047720797720798, + "grad_norm": 0.44729140400886536, + "learning_rate": 0.0001851885971571536, + "loss": 0.8432, + "step": 3958 + }, + { + "epoch": 0.7049501424501424, + "grad_norm": 0.5884211659431458, + "learning_rate": 0.00018518126546423572, + "loss": 0.9515, + "step": 3959 + }, + { + "epoch": 0.7051282051282052, + 
"grad_norm": 0.5293807983398438, + "learning_rate": 0.00018517393210236788, + "loss": 1.1178, + "step": 3960 + }, + { + "epoch": 0.7053062678062678, + "grad_norm": 0.6036825180053711, + "learning_rate": 0.00018516659707169374, + "loss": 1.0408, + "step": 3961 + }, + { + "epoch": 0.7054843304843305, + "grad_norm": 0.5157122015953064, + "learning_rate": 0.0001851592603723571, + "loss": 1.2136, + "step": 3962 + }, + { + "epoch": 0.7056623931623932, + "grad_norm": 0.5354781150817871, + "learning_rate": 0.00018515192200450163, + "loss": 0.7165, + "step": 3963 + }, + { + "epoch": 0.7058404558404558, + "grad_norm": 0.6073734760284424, + "learning_rate": 0.00018514458196827111, + "loss": 1.3079, + "step": 3964 + }, + { + "epoch": 0.7060185185185185, + "grad_norm": 0.4324839413166046, + "learning_rate": 0.0001851372402638094, + "loss": 0.7903, + "step": 3965 + }, + { + "epoch": 0.7061965811965812, + "grad_norm": 0.6530333161354065, + "learning_rate": 0.00018512989689126034, + "loss": 1.3179, + "step": 3966 + }, + { + "epoch": 0.7063746438746439, + "grad_norm": 0.5500404238700867, + "learning_rate": 0.00018512255185076782, + "loss": 1.0624, + "step": 3967 + }, + { + "epoch": 0.7065527065527065, + "grad_norm": 0.6277863383293152, + "learning_rate": 0.00018511520514247567, + "loss": 1.1056, + "step": 3968 + }, + { + "epoch": 0.7067307692307693, + "grad_norm": 0.580544650554657, + "learning_rate": 0.0001851078567665279, + "loss": 0.9849, + "step": 3969 + }, + { + "epoch": 0.7069088319088319, + "grad_norm": 0.4880999028682709, + "learning_rate": 0.00018510050672306848, + "loss": 1.0185, + "step": 3970 + }, + { + "epoch": 0.7070868945868946, + "grad_norm": 0.4919959306716919, + "learning_rate": 0.0001850931550122414, + "loss": 1.0334, + "step": 3971 + }, + { + "epoch": 0.7072649572649573, + "grad_norm": 0.6001213192939758, + "learning_rate": 0.0001850858016341907, + "loss": 1.0729, + "step": 3972 + }, + { + "epoch": 0.70744301994302, + "grad_norm": 0.538690447807312, + 
"learning_rate": 0.00018507844658906052, + "loss": 1.0733, + "step": 3973 + }, + { + "epoch": 0.7076210826210826, + "grad_norm": 0.5427643656730652, + "learning_rate": 0.00018507108987699487, + "loss": 1.1207, + "step": 3974 + }, + { + "epoch": 0.7077991452991453, + "grad_norm": 0.43014347553253174, + "learning_rate": 0.00018506373149813795, + "loss": 0.7958, + "step": 3975 + }, + { + "epoch": 0.707977207977208, + "grad_norm": 0.56591796875, + "learning_rate": 0.00018505637145263394, + "loss": 1.2199, + "step": 3976 + }, + { + "epoch": 0.7081552706552706, + "grad_norm": 0.59147047996521, + "learning_rate": 0.000185049009740627, + "loss": 1.2354, + "step": 3977 + }, + { + "epoch": 0.7083333333333334, + "grad_norm": 0.5078346133232117, + "learning_rate": 0.00018504164636226137, + "loss": 0.976, + "step": 3978 + }, + { + "epoch": 0.708511396011396, + "grad_norm": 0.533302366733551, + "learning_rate": 0.00018503428131768135, + "loss": 0.9653, + "step": 3979 + }, + { + "epoch": 0.7086894586894587, + "grad_norm": 0.4985341727733612, + "learning_rate": 0.00018502691460703122, + "loss": 1.1485, + "step": 3980 + }, + { + "epoch": 0.7088675213675214, + "grad_norm": 0.5143141150474548, + "learning_rate": 0.00018501954623045532, + "loss": 1.148, + "step": 3981 + }, + { + "epoch": 0.709045584045584, + "grad_norm": 0.507189154624939, + "learning_rate": 0.00018501217618809804, + "loss": 0.9306, + "step": 3982 + }, + { + "epoch": 0.7092236467236467, + "grad_norm": 0.5246604084968567, + "learning_rate": 0.00018500480448010377, + "loss": 0.9116, + "step": 3983 + }, + { + "epoch": 0.7094017094017094, + "grad_norm": 0.5321049094200134, + "learning_rate": 0.00018499743110661693, + "loss": 0.9607, + "step": 3984 + }, + { + "epoch": 0.7095797720797721, + "grad_norm": 0.62645423412323, + "learning_rate": 0.000184990056067782, + "loss": 1.5834, + "step": 3985 + }, + { + "epoch": 0.7097578347578347, + "grad_norm": 0.486557275056839, + "learning_rate": 0.0001849826793637435, + "loss": 
1.0598, + "step": 3986 + }, + { + "epoch": 0.7099358974358975, + "grad_norm": 0.5122783184051514, + "learning_rate": 0.0001849753009946459, + "loss": 1.2213, + "step": 3987 + }, + { + "epoch": 0.7101139601139601, + "grad_norm": 0.4864068627357483, + "learning_rate": 0.0001849679209606338, + "loss": 1.2708, + "step": 3988 + }, + { + "epoch": 0.7102920227920227, + "grad_norm": 0.5860990881919861, + "learning_rate": 0.00018496053926185183, + "loss": 1.2421, + "step": 3989 + }, + { + "epoch": 0.7104700854700855, + "grad_norm": 0.471194326877594, + "learning_rate": 0.00018495315589844453, + "loss": 0.879, + "step": 3990 + }, + { + "epoch": 0.7106481481481481, + "grad_norm": 0.5626323819160461, + "learning_rate": 0.00018494577087055662, + "loss": 1.1297, + "step": 3991 + }, + { + "epoch": 0.7108262108262108, + "grad_norm": 0.4706762135028839, + "learning_rate": 0.0001849383841783328, + "loss": 1.0444, + "step": 3992 + }, + { + "epoch": 0.7110042735042735, + "grad_norm": 0.5776444673538208, + "learning_rate": 0.00018493099582191783, + "loss": 1.1773, + "step": 3993 + }, + { + "epoch": 0.7111823361823362, + "grad_norm": 0.5493253469467163, + "learning_rate": 0.00018492360580145637, + "loss": 1.0354, + "step": 3994 + }, + { + "epoch": 0.7113603988603988, + "grad_norm": 0.5328514575958252, + "learning_rate": 0.0001849162141170933, + "loss": 0.9251, + "step": 3995 + }, + { + "epoch": 0.7115384615384616, + "grad_norm": 0.5814893841743469, + "learning_rate": 0.0001849088207689734, + "loss": 1.1066, + "step": 3996 + }, + { + "epoch": 0.7117165242165242, + "grad_norm": 0.5476071834564209, + "learning_rate": 0.00018490142575724154, + "loss": 1.1613, + "step": 3997 + }, + { + "epoch": 0.7118945868945868, + "grad_norm": 0.5216463208198547, + "learning_rate": 0.00018489402908204258, + "loss": 1.2574, + "step": 3998 + }, + { + "epoch": 0.7120726495726496, + "grad_norm": 0.5110020637512207, + "learning_rate": 0.00018488663074352153, + "loss": 1.0663, + "step": 3999 + }, + { + "epoch": 
0.7122507122507122, + "grad_norm": 0.448090523481369, + "learning_rate": 0.00018487923074182326, + "loss": 0.6687, + "step": 4000 + }, + { + "epoch": 0.7124287749287749, + "grad_norm": 0.4980565011501312, + "learning_rate": 0.00018487182907709279, + "loss": 1.2365, + "step": 4001 + }, + { + "epoch": 0.7126068376068376, + "grad_norm": 0.485831081867218, + "learning_rate": 0.00018486442574947511, + "loss": 1.0941, + "step": 4002 + }, + { + "epoch": 0.7127849002849003, + "grad_norm": 0.4955040216445923, + "learning_rate": 0.00018485702075911534, + "loss": 1.248, + "step": 4003 + }, + { + "epoch": 0.7129629629629629, + "grad_norm": 0.5168375968933105, + "learning_rate": 0.00018484961410615845, + "loss": 1.1118, + "step": 4004 + }, + { + "epoch": 0.7131410256410257, + "grad_norm": 0.5255687832832336, + "learning_rate": 0.00018484220579074968, + "loss": 1.0558, + "step": 4005 + }, + { + "epoch": 0.7133190883190883, + "grad_norm": 0.5502219796180725, + "learning_rate": 0.00018483479581303416, + "loss": 1.1604, + "step": 4006 + }, + { + "epoch": 0.7134971509971509, + "grad_norm": 0.5155881643295288, + "learning_rate": 0.000184827384173157, + "loss": 0.8246, + "step": 4007 + }, + { + "epoch": 0.7136752136752137, + "grad_norm": 0.5321542024612427, + "learning_rate": 0.0001848199708712635, + "loss": 1.2058, + "step": 4008 + }, + { + "epoch": 0.7138532763532763, + "grad_norm": 0.4929848313331604, + "learning_rate": 0.00018481255590749884, + "loss": 1.4023, + "step": 4009 + }, + { + "epoch": 0.7140313390313391, + "grad_norm": 0.5070937871932983, + "learning_rate": 0.00018480513928200836, + "loss": 1.0561, + "step": 4010 + }, + { + "epoch": 0.7142094017094017, + "grad_norm": 0.5750083327293396, + "learning_rate": 0.00018479772099493728, + "loss": 1.0276, + "step": 4011 + }, + { + "epoch": 0.7143874643874644, + "grad_norm": 0.5265933275222778, + "learning_rate": 0.00018479030104643108, + "loss": 1.0295, + "step": 4012 + }, + { + "epoch": 0.7145655270655271, + "grad_norm": 
0.526830792427063, + "learning_rate": 0.00018478287943663504, + "loss": 1.0157, + "step": 4013 + }, + { + "epoch": 0.7147435897435898, + "grad_norm": 0.5344091653823853, + "learning_rate": 0.00018477545616569458, + "loss": 1.1997, + "step": 4014 + }, + { + "epoch": 0.7149216524216524, + "grad_norm": 0.4935445189476013, + "learning_rate": 0.0001847680312337552, + "loss": 1.1858, + "step": 4015 + }, + { + "epoch": 0.7150997150997151, + "grad_norm": 0.5291212797164917, + "learning_rate": 0.0001847606046409623, + "loss": 0.926, + "step": 4016 + }, + { + "epoch": 0.7152777777777778, + "grad_norm": 0.559050977230072, + "learning_rate": 0.00018475317638746142, + "loss": 1.0947, + "step": 4017 + }, + { + "epoch": 0.7154558404558404, + "grad_norm": 0.4566570222377777, + "learning_rate": 0.00018474574647339814, + "loss": 1.0334, + "step": 4018 + }, + { + "epoch": 0.7156339031339032, + "grad_norm": 0.5156155824661255, + "learning_rate": 0.000184738314898918, + "loss": 1.0076, + "step": 4019 + }, + { + "epoch": 0.7158119658119658, + "grad_norm": 0.5008716583251953, + "learning_rate": 0.00018473088166416662, + "loss": 1.0378, + "step": 4020 + }, + { + "epoch": 0.7159900284900285, + "grad_norm": 0.49556368589401245, + "learning_rate": 0.0001847234467692896, + "loss": 1.15, + "step": 4021 + }, + { + "epoch": 0.7161680911680912, + "grad_norm": 0.5464680790901184, + "learning_rate": 0.00018471601021443265, + "loss": 1.2975, + "step": 4022 + }, + { + "epoch": 0.7163461538461539, + "grad_norm": 0.6291980147361755, + "learning_rate": 0.00018470857199974144, + "loss": 1.05, + "step": 4023 + }, + { + "epoch": 0.7165242165242165, + "grad_norm": 0.5566631555557251, + "learning_rate": 0.00018470113212536176, + "loss": 1.1296, + "step": 4024 + }, + { + "epoch": 0.7167022792022792, + "grad_norm": 0.5569562911987305, + "learning_rate": 0.00018469369059143933, + "loss": 1.2484, + "step": 4025 + }, + { + "epoch": 0.7168803418803419, + "grad_norm": 0.5804716944694519, + "learning_rate": 
0.00018468624739812, + "loss": 1.0547, + "step": 4026 + }, + { + "epoch": 0.7170584045584045, + "grad_norm": 0.6316802501678467, + "learning_rate": 0.00018467880254554952, + "loss": 1.1188, + "step": 4027 + }, + { + "epoch": 0.7172364672364673, + "grad_norm": 0.6131419539451599, + "learning_rate": 0.00018467135603387385, + "loss": 1.1662, + "step": 4028 + }, + { + "epoch": 0.7174145299145299, + "grad_norm": 0.4703124761581421, + "learning_rate": 0.00018466390786323883, + "loss": 1.038, + "step": 4029 + }, + { + "epoch": 0.7175925925925926, + "grad_norm": 0.5718469023704529, + "learning_rate": 0.0001846564580337904, + "loss": 1.0786, + "step": 4030 + }, + { + "epoch": 0.7177706552706553, + "grad_norm": 0.5227612853050232, + "learning_rate": 0.00018464900654567457, + "loss": 1.0561, + "step": 4031 + }, + { + "epoch": 0.717948717948718, + "grad_norm": 0.5800358057022095, + "learning_rate": 0.00018464155339903727, + "loss": 1.0944, + "step": 4032 + }, + { + "epoch": 0.7181267806267806, + "grad_norm": 0.5562314987182617, + "learning_rate": 0.00018463409859402455, + "loss": 0.8573, + "step": 4033 + }, + { + "epoch": 0.7183048433048433, + "grad_norm": 0.6420153379440308, + "learning_rate": 0.0001846266421307825, + "loss": 1.088, + "step": 4034 + }, + { + "epoch": 0.718482905982906, + "grad_norm": 0.4745902717113495, + "learning_rate": 0.00018461918400945718, + "loss": 1.1679, + "step": 4035 + }, + { + "epoch": 0.7186609686609686, + "grad_norm": 0.5070300102233887, + "learning_rate": 0.00018461172423019475, + "loss": 1.1984, + "step": 4036 + }, + { + "epoch": 0.7188390313390314, + "grad_norm": 0.5339375138282776, + "learning_rate": 0.00018460426279314133, + "loss": 1.3038, + "step": 4037 + }, + { + "epoch": 0.719017094017094, + "grad_norm": 0.5947147607803345, + "learning_rate": 0.00018459679969844313, + "loss": 1.0103, + "step": 4038 + }, + { + "epoch": 0.7191951566951567, + "grad_norm": 0.5493791699409485, + "learning_rate": 0.00018458933494624642, + "loss": 1.1001, + 
"step": 4039 + }, + { + "epoch": 0.7193732193732194, + "grad_norm": 0.5700310468673706, + "learning_rate": 0.00018458186853669736, + "loss": 0.9006, + "step": 4040 + }, + { + "epoch": 0.719551282051282, + "grad_norm": 0.60371994972229, + "learning_rate": 0.0001845744004699423, + "loss": 1.3001, + "step": 4041 + }, + { + "epoch": 0.7197293447293447, + "grad_norm": 0.5469261407852173, + "learning_rate": 0.00018456693074612757, + "loss": 1.1745, + "step": 4042 + }, + { + "epoch": 0.7199074074074074, + "grad_norm": 0.5179165601730347, + "learning_rate": 0.00018455945936539947, + "loss": 0.9883, + "step": 4043 + }, + { + "epoch": 0.7200854700854701, + "grad_norm": 0.5396696329116821, + "learning_rate": 0.00018455198632790447, + "loss": 1.1277, + "step": 4044 + }, + { + "epoch": 0.7202635327635327, + "grad_norm": 0.4559909403324127, + "learning_rate": 0.00018454451163378888, + "loss": 0.9644, + "step": 4045 + }, + { + "epoch": 0.7204415954415955, + "grad_norm": 0.49863892793655396, + "learning_rate": 0.00018453703528319927, + "loss": 1.1276, + "step": 4046 + }, + { + "epoch": 0.7206196581196581, + "grad_norm": 0.4790710508823395, + "learning_rate": 0.000184529557276282, + "loss": 0.9443, + "step": 4047 + }, + { + "epoch": 0.7207977207977208, + "grad_norm": 0.541999876499176, + "learning_rate": 0.0001845220776131837, + "loss": 1.0681, + "step": 4048 + }, + { + "epoch": 0.7209757834757835, + "grad_norm": 0.5119109153747559, + "learning_rate": 0.00018451459629405088, + "loss": 1.2078, + "step": 4049 + }, + { + "epoch": 0.7211538461538461, + "grad_norm": 0.6141307353973389, + "learning_rate": 0.00018450711331903006, + "loss": 1.1071, + "step": 4050 + }, + { + "epoch": 0.7213319088319088, + "grad_norm": 0.48679864406585693, + "learning_rate": 0.00018449962868826795, + "loss": 0.9713, + "step": 4051 + }, + { + "epoch": 0.7215099715099715, + "grad_norm": 0.5548661947250366, + "learning_rate": 0.0001844921424019111, + "loss": 1.2099, + "step": 4052 + }, + { + "epoch": 
0.7216880341880342, + "grad_norm": 0.5000107884407043, + "learning_rate": 0.00018448465446010626, + "loss": 1.0184, + "step": 4053 + }, + { + "epoch": 0.7218660968660968, + "grad_norm": 0.6131454110145569, + "learning_rate": 0.00018447716486300013, + "loss": 1.2581, + "step": 4054 + }, + { + "epoch": 0.7220441595441596, + "grad_norm": 0.5145987868309021, + "learning_rate": 0.0001844696736107394, + "loss": 1.1646, + "step": 4055 + }, + { + "epoch": 0.7222222222222222, + "grad_norm": 0.4361337125301361, + "learning_rate": 0.00018446218070347094, + "loss": 0.8239, + "step": 4056 + }, + { + "epoch": 0.7224002849002849, + "grad_norm": 0.5549173355102539, + "learning_rate": 0.00018445468614134146, + "loss": 1.1935, + "step": 4057 + }, + { + "epoch": 0.7225783475783476, + "grad_norm": 0.5569297671318054, + "learning_rate": 0.00018444718992449789, + "loss": 1.0137, + "step": 4058 + }, + { + "epoch": 0.7227564102564102, + "grad_norm": 0.44866305589675903, + "learning_rate": 0.00018443969205308704, + "loss": 0.987, + "step": 4059 + }, + { + "epoch": 0.7229344729344729, + "grad_norm": 0.5142943263053894, + "learning_rate": 0.0001844321925272558, + "loss": 1.0837, + "step": 4060 + }, + { + "epoch": 0.7231125356125356, + "grad_norm": 0.4922119379043579, + "learning_rate": 0.0001844246913471512, + "loss": 0.8477, + "step": 4061 + }, + { + "epoch": 0.7232905982905983, + "grad_norm": 0.5245375633239746, + "learning_rate": 0.0001844171885129201, + "loss": 0.9985, + "step": 4062 + }, + { + "epoch": 0.7234686609686609, + "grad_norm": 0.45562678575515747, + "learning_rate": 0.00018440968402470956, + "loss": 0.8678, + "step": 4063 + }, + { + "epoch": 0.7236467236467237, + "grad_norm": 0.5388376712799072, + "learning_rate": 0.0001844021778826666, + "loss": 1.0586, + "step": 4064 + }, + { + "epoch": 0.7238247863247863, + "grad_norm": 0.48945263028144836, + "learning_rate": 0.00018439467008693833, + "loss": 1.0547, + "step": 4065 + }, + { + "epoch": 0.7240028490028491, + "grad_norm": 
0.5202330350875854, + "learning_rate": 0.00018438716063767178, + "loss": 1.3142, + "step": 4066 + }, + { + "epoch": 0.7241809116809117, + "grad_norm": 0.5432567000389099, + "learning_rate": 0.00018437964953501413, + "loss": 1.0192, + "step": 4067 + }, + { + "epoch": 0.7243589743589743, + "grad_norm": 0.5220325589179993, + "learning_rate": 0.00018437213677911253, + "loss": 1.0904, + "step": 4068 + }, + { + "epoch": 0.7245370370370371, + "grad_norm": 0.45711690187454224, + "learning_rate": 0.00018436462237011417, + "loss": 1.0417, + "step": 4069 + }, + { + "epoch": 0.7247150997150997, + "grad_norm": 0.560778021812439, + "learning_rate": 0.0001843571063081663, + "loss": 1.2316, + "step": 4070 + }, + { + "epoch": 0.7248931623931624, + "grad_norm": 0.591533362865448, + "learning_rate": 0.0001843495885934162, + "loss": 1.0294, + "step": 4071 + }, + { + "epoch": 0.7250712250712251, + "grad_norm": 0.5550443530082703, + "learning_rate": 0.00018434206922601106, + "loss": 1.0162, + "step": 4072 + }, + { + "epoch": 0.7252492877492878, + "grad_norm": 0.5744053721427917, + "learning_rate": 0.00018433454820609833, + "loss": 1.2774, + "step": 4073 + }, + { + "epoch": 0.7254273504273504, + "grad_norm": 0.6210703253746033, + "learning_rate": 0.0001843270255338253, + "loss": 1.2526, + "step": 4074 + }, + { + "epoch": 0.7256054131054132, + "grad_norm": 0.49684277176856995, + "learning_rate": 0.0001843195012093394, + "loss": 1.0786, + "step": 4075 + }, + { + "epoch": 0.7257834757834758, + "grad_norm": 0.5851606130599976, + "learning_rate": 0.00018431197523278802, + "loss": 1.14, + "step": 4076 + }, + { + "epoch": 0.7259615384615384, + "grad_norm": 0.5494425296783447, + "learning_rate": 0.00018430444760431862, + "loss": 1.211, + "step": 4077 + }, + { + "epoch": 0.7261396011396012, + "grad_norm": 0.5247658491134644, + "learning_rate": 0.00018429691832407867, + "loss": 0.8031, + "step": 4078 + }, + { + "epoch": 0.7263176638176638, + "grad_norm": 0.5012249946594238, + "learning_rate": 
0.00018428938739221574, + "loss": 1.1258, + "step": 4079 + }, + { + "epoch": 0.7264957264957265, + "grad_norm": 0.5226427912712097, + "learning_rate": 0.0001842818548088774, + "loss": 1.0029, + "step": 4080 + }, + { + "epoch": 0.7266737891737892, + "grad_norm": 0.45008543133735657, + "learning_rate": 0.00018427432057421114, + "loss": 1.0681, + "step": 4081 + }, + { + "epoch": 0.7268518518518519, + "grad_norm": 0.5127285122871399, + "learning_rate": 0.00018426678468836467, + "loss": 1.1069, + "step": 4082 + }, + { + "epoch": 0.7270299145299145, + "grad_norm": 0.5406150221824646, + "learning_rate": 0.0001842592471514856, + "loss": 1.052, + "step": 4083 + }, + { + "epoch": 0.7272079772079773, + "grad_norm": 0.5001157522201538, + "learning_rate": 0.0001842517079637216, + "loss": 0.9157, + "step": 4084 + }, + { + "epoch": 0.7273860398860399, + "grad_norm": 0.6169779300689697, + "learning_rate": 0.00018424416712522042, + "loss": 1.3133, + "step": 4085 + }, + { + "epoch": 0.7275641025641025, + "grad_norm": 0.4891316890716553, + "learning_rate": 0.00018423662463612974, + "loss": 0.9505, + "step": 4086 + }, + { + "epoch": 0.7277421652421653, + "grad_norm": 0.5883708596229553, + "learning_rate": 0.00018422908049659743, + "loss": 1.2797, + "step": 4087 + }, + { + "epoch": 0.7279202279202279, + "grad_norm": 0.6679072976112366, + "learning_rate": 0.00018422153470677125, + "loss": 1.1096, + "step": 4088 + }, + { + "epoch": 0.7280982905982906, + "grad_norm": 0.5178479552268982, + "learning_rate": 0.00018421398726679904, + "loss": 1.0299, + "step": 4089 + }, + { + "epoch": 0.7282763532763533, + "grad_norm": 0.6343900561332703, + "learning_rate": 0.0001842064381768287, + "loss": 1.2983, + "step": 4090 + }, + { + "epoch": 0.728454415954416, + "grad_norm": 0.43816515803337097, + "learning_rate": 0.0001841988874370081, + "loss": 0.9452, + "step": 4091 + }, + { + "epoch": 0.7286324786324786, + "grad_norm": 0.579790472984314, + "learning_rate": 0.00018419133504748528, + "loss": 1.1037, 
+ "step": 4092 + }, + { + "epoch": 0.7288105413105413, + "grad_norm": 0.571374773979187, + "learning_rate": 0.00018418378100840807, + "loss": 1.1655, + "step": 4093 + }, + { + "epoch": 0.728988603988604, + "grad_norm": 0.5163514018058777, + "learning_rate": 0.0001841762253199246, + "loss": 1.1579, + "step": 4094 + }, + { + "epoch": 0.7291666666666666, + "grad_norm": 0.6553022265434265, + "learning_rate": 0.0001841686679821828, + "loss": 0.9664, + "step": 4095 + }, + { + "epoch": 0.7293447293447294, + "grad_norm": 0.5072969198226929, + "learning_rate": 0.00018416110899533084, + "loss": 0.9416, + "step": 4096 + }, + { + "epoch": 0.729522792022792, + "grad_norm": 0.5103251338005066, + "learning_rate": 0.00018415354835951675, + "loss": 1.0715, + "step": 4097 + }, + { + "epoch": 0.7297008547008547, + "grad_norm": 0.49752289056777954, + "learning_rate": 0.00018414598607488874, + "loss": 1.1848, + "step": 4098 + }, + { + "epoch": 0.7298789173789174, + "grad_norm": 0.5361882448196411, + "learning_rate": 0.00018413842214159488, + "loss": 1.1035, + "step": 4099 + }, + { + "epoch": 0.73005698005698, + "grad_norm": 0.5167670249938965, + "learning_rate": 0.00018413085655978343, + "loss": 1.0015, + "step": 4100 + }, + { + "epoch": 0.7302350427350427, + "grad_norm": 0.5930629372596741, + "learning_rate": 0.00018412328932960263, + "loss": 0.9766, + "step": 4101 + }, + { + "epoch": 0.7304131054131054, + "grad_norm": 0.5234778523445129, + "learning_rate": 0.00018411572045120073, + "loss": 1.0317, + "step": 4102 + }, + { + "epoch": 0.7305911680911681, + "grad_norm": 0.5361374020576477, + "learning_rate": 0.000184108149924726, + "loss": 1.1228, + "step": 4103 + }, + { + "epoch": 0.7307692307692307, + "grad_norm": 0.5845770239830017, + "learning_rate": 0.0001841005777503268, + "loss": 0.9541, + "step": 4104 + }, + { + "epoch": 0.7309472934472935, + "grad_norm": 0.49320483207702637, + "learning_rate": 0.0001840930039281515, + "loss": 0.9445, + "step": 4105 + }, + { + "epoch": 
0.7311253561253561, + "grad_norm": 0.5391250252723694, + "learning_rate": 0.00018408542845834845, + "loss": 1.1983, + "step": 4106 + }, + { + "epoch": 0.7313034188034188, + "grad_norm": 0.4890393316745758, + "learning_rate": 0.00018407785134106613, + "loss": 0.8353, + "step": 4107 + }, + { + "epoch": 0.7314814814814815, + "grad_norm": 0.5839747190475464, + "learning_rate": 0.00018407027257645296, + "loss": 1.4074, + "step": 4108 + }, + { + "epoch": 0.7316595441595442, + "grad_norm": 0.5957708358764648, + "learning_rate": 0.0001840626921646574, + "loss": 1.1032, + "step": 4109 + }, + { + "epoch": 0.7318376068376068, + "grad_norm": 0.5029017925262451, + "learning_rate": 0.00018405511010582805, + "loss": 1.095, + "step": 4110 + }, + { + "epoch": 0.7320156695156695, + "grad_norm": 0.6054347157478333, + "learning_rate": 0.00018404752640011345, + "loss": 1.0366, + "step": 4111 + }, + { + "epoch": 0.7321937321937322, + "grad_norm": 0.5476830005645752, + "learning_rate": 0.00018403994104766212, + "loss": 1.0976, + "step": 4112 + }, + { + "epoch": 0.7323717948717948, + "grad_norm": 0.5000962615013123, + "learning_rate": 0.00018403235404862277, + "loss": 1.0809, + "step": 4113 + }, + { + "epoch": 0.7325498575498576, + "grad_norm": 0.5119251012802124, + "learning_rate": 0.00018402476540314394, + "loss": 1.0176, + "step": 4114 + }, + { + "epoch": 0.7327279202279202, + "grad_norm": 0.5825830698013306, + "learning_rate": 0.00018401717511137445, + "loss": 1.2357, + "step": 4115 + }, + { + "epoch": 0.7329059829059829, + "grad_norm": 0.5702941417694092, + "learning_rate": 0.0001840095831734629, + "loss": 1.1549, + "step": 4116 + }, + { + "epoch": 0.7330840455840456, + "grad_norm": 0.5660699605941772, + "learning_rate": 0.00018400198958955807, + "loss": 1.1778, + "step": 4117 + }, + { + "epoch": 0.7332621082621082, + "grad_norm": 0.5241161584854126, + "learning_rate": 0.0001839943943598088, + "loss": 0.8587, + "step": 4118 + }, + { + "epoch": 0.7334401709401709, + "grad_norm": 
0.581194281578064, + "learning_rate": 0.0001839867974843638, + "loss": 1.2169, + "step": 4119 + }, + { + "epoch": 0.7336182336182336, + "grad_norm": 0.4342379570007324, + "learning_rate": 0.00018397919896337198, + "loss": 0.9182, + "step": 4120 + }, + { + "epoch": 0.7337962962962963, + "grad_norm": 0.5708567500114441, + "learning_rate": 0.00018397159879698224, + "loss": 1.1781, + "step": 4121 + }, + { + "epoch": 0.7339743589743589, + "grad_norm": 0.5827265977859497, + "learning_rate": 0.00018396399698534344, + "loss": 1.2905, + "step": 4122 + }, + { + "epoch": 0.7341524216524217, + "grad_norm": 0.5274056792259216, + "learning_rate": 0.00018395639352860457, + "loss": 1.1786, + "step": 4123 + }, + { + "epoch": 0.7343304843304843, + "grad_norm": 0.5094266533851624, + "learning_rate": 0.00018394878842691452, + "loss": 1.2016, + "step": 4124 + }, + { + "epoch": 0.7345085470085471, + "grad_norm": 0.48779475688934326, + "learning_rate": 0.0001839411816804224, + "loss": 1.0562, + "step": 4125 + }, + { + "epoch": 0.7346866096866097, + "grad_norm": 0.5805709958076477, + "learning_rate": 0.00018393357328927716, + "loss": 1.1705, + "step": 4126 + }, + { + "epoch": 0.7348646723646723, + "grad_norm": 0.4910700023174286, + "learning_rate": 0.00018392596325362791, + "loss": 1.0682, + "step": 4127 + }, + { + "epoch": 0.7350427350427351, + "grad_norm": 0.5297428369522095, + "learning_rate": 0.0001839183515736238, + "loss": 0.9505, + "step": 4128 + }, + { + "epoch": 0.7352207977207977, + "grad_norm": 0.45442086458206177, + "learning_rate": 0.00018391073824941385, + "loss": 0.9548, + "step": 4129 + }, + { + "epoch": 0.7353988603988604, + "grad_norm": 0.49299946427345276, + "learning_rate": 0.00018390312328114733, + "loss": 1.0868, + "step": 4130 + }, + { + "epoch": 0.7355769230769231, + "grad_norm": 0.4839940369129181, + "learning_rate": 0.0001838955066689734, + "loss": 0.9565, + "step": 4131 + }, + { + "epoch": 0.7357549857549858, + "grad_norm": 0.48600608110427856, + 
"learning_rate": 0.00018388788841304128, + "loss": 1.2353, + "step": 4132 + }, + { + "epoch": 0.7359330484330484, + "grad_norm": 0.4893583357334137, + "learning_rate": 0.0001838802685135003, + "loss": 0.9595, + "step": 4133 + }, + { + "epoch": 0.7361111111111112, + "grad_norm": 0.4587398171424866, + "learning_rate": 0.00018387264697049963, + "loss": 1.1222, + "step": 4134 + }, + { + "epoch": 0.7362891737891738, + "grad_norm": 0.5361055731773376, + "learning_rate": 0.00018386502378418872, + "loss": 1.3304, + "step": 4135 + }, + { + "epoch": 0.7364672364672364, + "grad_norm": 0.5556629300117493, + "learning_rate": 0.00018385739895471686, + "loss": 1.0358, + "step": 4136 + }, + { + "epoch": 0.7366452991452992, + "grad_norm": 0.45555856823921204, + "learning_rate": 0.00018384977248223346, + "loss": 1.0081, + "step": 4137 + }, + { + "epoch": 0.7368233618233618, + "grad_norm": 0.5606052875518799, + "learning_rate": 0.00018384214436688797, + "loss": 0.9367, + "step": 4138 + }, + { + "epoch": 0.7370014245014245, + "grad_norm": 0.5428356528282166, + "learning_rate": 0.00018383451460882982, + "loss": 1.1391, + "step": 4139 + }, + { + "epoch": 0.7371794871794872, + "grad_norm": 0.4891330897808075, + "learning_rate": 0.00018382688320820853, + "loss": 0.9805, + "step": 4140 + }, + { + "epoch": 0.7373575498575499, + "grad_norm": 0.5407996773719788, + "learning_rate": 0.0001838192501651736, + "loss": 1.0532, + "step": 4141 + }, + { + "epoch": 0.7375356125356125, + "grad_norm": 0.5241971611976624, + "learning_rate": 0.00018381161547987454, + "loss": 0.9509, + "step": 4142 + }, + { + "epoch": 0.7377136752136753, + "grad_norm": 0.5370210409164429, + "learning_rate": 0.000183803979152461, + "loss": 1.2342, + "step": 4143 + }, + { + "epoch": 0.7378917378917379, + "grad_norm": 0.5470060706138611, + "learning_rate": 0.00018379634118308259, + "loss": 0.9621, + "step": 4144 + }, + { + "epoch": 0.7380698005698005, + "grad_norm": 0.546313464641571, + "learning_rate": 0.00018378870157188893, 
+ "loss": 1.1253, + "step": 4145 + }, + { + "epoch": 0.7382478632478633, + "grad_norm": 0.502027153968811, + "learning_rate": 0.00018378106031902974, + "loss": 1.1919, + "step": 4146 + }, + { + "epoch": 0.7384259259259259, + "grad_norm": 0.5282283425331116, + "learning_rate": 0.0001837734174246547, + "loss": 1.0088, + "step": 4147 + }, + { + "epoch": 0.7386039886039886, + "grad_norm": 0.5152897238731384, + "learning_rate": 0.00018376577288891355, + "loss": 1.0813, + "step": 4148 + }, + { + "epoch": 0.7387820512820513, + "grad_norm": 0.5002804398536682, + "learning_rate": 0.0001837581267119561, + "loss": 0.9797, + "step": 4149 + }, + { + "epoch": 0.738960113960114, + "grad_norm": 0.5698176026344299, + "learning_rate": 0.00018375047889393215, + "loss": 1.1099, + "step": 4150 + }, + { + "epoch": 0.7391381766381766, + "grad_norm": 0.5384604930877686, + "learning_rate": 0.00018374282943499156, + "loss": 1.1944, + "step": 4151 + }, + { + "epoch": 0.7393162393162394, + "grad_norm": 0.5483044385910034, + "learning_rate": 0.00018373517833528418, + "loss": 1.1734, + "step": 4152 + }, + { + "epoch": 0.739494301994302, + "grad_norm": 0.4824066162109375, + "learning_rate": 0.0001837275255949599, + "loss": 0.9515, + "step": 4153 + }, + { + "epoch": 0.7396723646723646, + "grad_norm": 0.45413634181022644, + "learning_rate": 0.00018371987121416873, + "loss": 0.7534, + "step": 4154 + }, + { + "epoch": 0.7398504273504274, + "grad_norm": 0.5874246954917908, + "learning_rate": 0.00018371221519306055, + "loss": 0.9464, + "step": 4155 + }, + { + "epoch": 0.74002849002849, + "grad_norm": 0.5219913125038147, + "learning_rate": 0.00018370455753178544, + "loss": 1.0494, + "step": 4156 + }, + { + "epoch": 0.7402065527065527, + "grad_norm": 0.5937709212303162, + "learning_rate": 0.00018369689823049341, + "loss": 1.0529, + "step": 4157 + }, + { + "epoch": 0.7403846153846154, + "grad_norm": 0.5204295516014099, + "learning_rate": 0.00018368923728933449, + "loss": 1.0602, + "step": 4158 + }, + { + 
"epoch": 0.7405626780626781, + "grad_norm": 0.5422890186309814, + "learning_rate": 0.00018368157470845885, + "loss": 0.9261, + "step": 4159 + }, + { + "epoch": 0.7407407407407407, + "grad_norm": 0.6163852214813232, + "learning_rate": 0.00018367391048801655, + "loss": 1.2771, + "step": 4160 + }, + { + "epoch": 0.7409188034188035, + "grad_norm": 0.5070751309394836, + "learning_rate": 0.00018366624462815785, + "loss": 1.0401, + "step": 4161 + }, + { + "epoch": 0.7410968660968661, + "grad_norm": 0.4477100968360901, + "learning_rate": 0.00018365857712903283, + "loss": 1.1463, + "step": 4162 + }, + { + "epoch": 0.7412749287749287, + "grad_norm": 0.5421462655067444, + "learning_rate": 0.0001836509079907918, + "loss": 0.9373, + "step": 4163 + }, + { + "epoch": 0.7414529914529915, + "grad_norm": 0.6162141561508179, + "learning_rate": 0.000183643237213585, + "loss": 1.1827, + "step": 4164 + }, + { + "epoch": 0.7416310541310541, + "grad_norm": 0.5653836131095886, + "learning_rate": 0.00018363556479756272, + "loss": 1.0689, + "step": 4165 + }, + { + "epoch": 0.7418091168091168, + "grad_norm": 0.57053542137146, + "learning_rate": 0.00018362789074287527, + "loss": 1.0289, + "step": 4166 + }, + { + "epoch": 0.7419871794871795, + "grad_norm": 0.5603055953979492, + "learning_rate": 0.00018362021504967304, + "loss": 1.1926, + "step": 4167 + }, + { + "epoch": 0.7421652421652422, + "grad_norm": 0.5460166335105896, + "learning_rate": 0.0001836125377181064, + "loss": 1.1488, + "step": 4168 + }, + { + "epoch": 0.7423433048433048, + "grad_norm": 0.5097107887268066, + "learning_rate": 0.00018360485874832579, + "loss": 1.0781, + "step": 4169 + }, + { + "epoch": 0.7425213675213675, + "grad_norm": 0.6280624270439148, + "learning_rate": 0.00018359717814048164, + "loss": 1.3625, + "step": 4170 + }, + { + "epoch": 0.7426994301994302, + "grad_norm": 0.4528210759162903, + "learning_rate": 0.0001835894958947244, + "loss": 0.8417, + "step": 4171 + }, + { + "epoch": 0.7428774928774928, + "grad_norm": 
0.48735132813453674, + "learning_rate": 0.00018358181201120468, + "loss": 0.9544, + "step": 4172 + }, + { + "epoch": 0.7430555555555556, + "grad_norm": 0.48388174176216125, + "learning_rate": 0.00018357412649007296, + "loss": 1.0663, + "step": 4173 + }, + { + "epoch": 0.7432336182336182, + "grad_norm": 0.5435357689857483, + "learning_rate": 0.00018356643933147986, + "loss": 1.2074, + "step": 4174 + }, + { + "epoch": 0.7434116809116809, + "grad_norm": 0.49890074133872986, + "learning_rate": 0.00018355875053557594, + "loss": 1.1322, + "step": 4175 + }, + { + "epoch": 0.7435897435897436, + "grad_norm": 0.5680708885192871, + "learning_rate": 0.0001835510601025119, + "loss": 1.1964, + "step": 4176 + }, + { + "epoch": 0.7437678062678063, + "grad_norm": 0.5002360939979553, + "learning_rate": 0.00018354336803243842, + "loss": 1.1396, + "step": 4177 + }, + { + "epoch": 0.7439458689458689, + "grad_norm": 0.5202965140342712, + "learning_rate": 0.00018353567432550616, + "loss": 1.1498, + "step": 4178 + }, + { + "epoch": 0.7441239316239316, + "grad_norm": 0.514492928981781, + "learning_rate": 0.00018352797898186588, + "loss": 1.0959, + "step": 4179 + }, + { + "epoch": 0.7443019943019943, + "grad_norm": 0.6395383477210999, + "learning_rate": 0.0001835202820016684, + "loss": 1.2867, + "step": 4180 + }, + { + "epoch": 0.7444800569800569, + "grad_norm": 0.5489062070846558, + "learning_rate": 0.00018351258338506447, + "loss": 1.1638, + "step": 4181 + }, + { + "epoch": 0.7446581196581197, + "grad_norm": 0.5705671906471252, + "learning_rate": 0.00018350488313220498, + "loss": 0.9493, + "step": 4182 + }, + { + "epoch": 0.7448361823361823, + "grad_norm": 0.5404297709465027, + "learning_rate": 0.00018349718124324076, + "loss": 0.9876, + "step": 4183 + }, + { + "epoch": 0.7450142450142451, + "grad_norm": 0.5841003060340881, + "learning_rate": 0.0001834894777183227, + "loss": 1.1225, + "step": 4184 + }, + { + "epoch": 0.7451923076923077, + "grad_norm": 0.49774688482284546, + 
"learning_rate": 0.00018348177255760178, + "loss": 1.1442, + "step": 4185 + }, + { + "epoch": 0.7453703703703703, + "grad_norm": 0.5212422609329224, + "learning_rate": 0.00018347406576122894, + "loss": 1.101, + "step": 4186 + }, + { + "epoch": 0.7455484330484331, + "grad_norm": 0.615024983882904, + "learning_rate": 0.00018346635732935517, + "loss": 1.4188, + "step": 4187 + }, + { + "epoch": 0.7457264957264957, + "grad_norm": 0.46818843483924866, + "learning_rate": 0.00018345864726213154, + "loss": 1.0071, + "step": 4188 + }, + { + "epoch": 0.7459045584045584, + "grad_norm": 0.4921121895313263, + "learning_rate": 0.00018345093555970906, + "loss": 1.015, + "step": 4189 + }, + { + "epoch": 0.7460826210826211, + "grad_norm": 0.5042136311531067, + "learning_rate": 0.00018344322222223889, + "loss": 0.9974, + "step": 4190 + }, + { + "epoch": 0.7462606837606838, + "grad_norm": 0.5872490406036377, + "learning_rate": 0.0001834355072498721, + "loss": 1.3166, + "step": 4191 + }, + { + "epoch": 0.7464387464387464, + "grad_norm": 0.559117317199707, + "learning_rate": 0.00018342779064275984, + "loss": 1.2227, + "step": 4192 + }, + { + "epoch": 0.7466168091168092, + "grad_norm": 0.5269635319709778, + "learning_rate": 0.00018342007240105336, + "loss": 1.0281, + "step": 4193 + }, + { + "epoch": 0.7467948717948718, + "grad_norm": 0.4608335793018341, + "learning_rate": 0.00018341235252490387, + "loss": 0.98, + "step": 4194 + }, + { + "epoch": 0.7469729344729344, + "grad_norm": 0.5818259119987488, + "learning_rate": 0.00018340463101446255, + "loss": 1.1544, + "step": 4195 + }, + { + "epoch": 0.7471509971509972, + "grad_norm": 0.5577529668807983, + "learning_rate": 0.00018339690786988079, + "loss": 1.3059, + "step": 4196 + }, + { + "epoch": 0.7473290598290598, + "grad_norm": 0.5430468320846558, + "learning_rate": 0.00018338918309130983, + "loss": 1.2766, + "step": 4197 + }, + { + "epoch": 0.7475071225071225, + "grad_norm": 0.4941701591014862, + "learning_rate": 0.0001833814566789011, + 
"loss": 1.193, + "step": 4198 + }, + { + "epoch": 0.7476851851851852, + "grad_norm": 0.5471884608268738, + "learning_rate": 0.00018337372863280589, + "loss": 1.2261, + "step": 4199 + }, + { + "epoch": 0.7478632478632479, + "grad_norm": 0.4641438126564026, + "learning_rate": 0.0001833659989531757, + "loss": 0.7953, + "step": 4200 + }, + { + "epoch": 0.7480413105413105, + "grad_norm": 0.5244714617729187, + "learning_rate": 0.0001833582676401619, + "loss": 0.9344, + "step": 4201 + }, + { + "epoch": 0.7482193732193733, + "grad_norm": 0.5964360237121582, + "learning_rate": 0.00018335053469391603, + "loss": 1.2072, + "step": 4202 + }, + { + "epoch": 0.7483974358974359, + "grad_norm": 0.4929158091545105, + "learning_rate": 0.00018334280011458954, + "loss": 1.2183, + "step": 4203 + }, + { + "epoch": 0.7485754985754985, + "grad_norm": 0.46221864223480225, + "learning_rate": 0.00018333506390233405, + "loss": 1.1957, + "step": 4204 + }, + { + "epoch": 0.7487535612535613, + "grad_norm": 0.6301732659339905, + "learning_rate": 0.0001833273260573011, + "loss": 1.0582, + "step": 4205 + }, + { + "epoch": 0.7489316239316239, + "grad_norm": 0.5606021881103516, + "learning_rate": 0.0001833195865796423, + "loss": 1.4034, + "step": 4206 + }, + { + "epoch": 0.7491096866096866, + "grad_norm": 0.44856077432632446, + "learning_rate": 0.00018331184546950926, + "loss": 0.8421, + "step": 4207 + }, + { + "epoch": 0.7492877492877493, + "grad_norm": 0.5487226247787476, + "learning_rate": 0.00018330410272705366, + "loss": 1.238, + "step": 4208 + }, + { + "epoch": 0.749465811965812, + "grad_norm": 0.6043636798858643, + "learning_rate": 0.00018329635835242724, + "loss": 1.1215, + "step": 4209 + }, + { + "epoch": 0.7496438746438746, + "grad_norm": 0.5145319104194641, + "learning_rate": 0.00018328861234578173, + "loss": 1.1002, + "step": 4210 + }, + { + "epoch": 0.7498219373219374, + "grad_norm": 0.5667078495025635, + "learning_rate": 0.00018328086470726884, + "loss": 1.2994, + "step": 4211 + }, + { + 
"epoch": 0.75, + "grad_norm": 0.5117634534835815, + "learning_rate": 0.00018327311543704043, + "loss": 0.9448, + "step": 4212 + }, + { + "epoch": 0.75, + "eval_loss": 1.0982474088668823, + "eval_runtime": 24.6617, + "eval_samples_per_second": 42.211, + "eval_steps_per_second": 21.126, + "step": 4212 + }, + { + "epoch": 0.7501780626780626, + "grad_norm": 0.5451585054397583, + "learning_rate": 0.00018326536453524826, + "loss": 0.9023, + "step": 4213 + }, + { + "epoch": 0.7503561253561254, + "grad_norm": 0.6585208773612976, + "learning_rate": 0.0001832576120020443, + "loss": 1.2798, + "step": 4214 + }, + { + "epoch": 0.750534188034188, + "grad_norm": 0.6444812417030334, + "learning_rate": 0.00018324985783758037, + "loss": 1.3999, + "step": 4215 + }, + { + "epoch": 0.7507122507122507, + "grad_norm": 0.6178330779075623, + "learning_rate": 0.0001832421020420084, + "loss": 1.1846, + "step": 4216 + }, + { + "epoch": 0.7508903133903134, + "grad_norm": 0.509969174861908, + "learning_rate": 0.00018323434461548036, + "loss": 1.1831, + "step": 4217 + }, + { + "epoch": 0.7510683760683761, + "grad_norm": 0.5558911561965942, + "learning_rate": 0.00018322658555814826, + "loss": 1.1599, + "step": 4218 + }, + { + "epoch": 0.7512464387464387, + "grad_norm": 0.5714917778968811, + "learning_rate": 0.0001832188248701641, + "loss": 0.9702, + "step": 4219 + }, + { + "epoch": 0.7514245014245015, + "grad_norm": 0.6136442422866821, + "learning_rate": 0.00018321106255167995, + "loss": 0.9376, + "step": 4220 + }, + { + "epoch": 0.7516025641025641, + "grad_norm": 0.5832077264785767, + "learning_rate": 0.00018320329860284785, + "loss": 1.2564, + "step": 4221 + }, + { + "epoch": 0.7517806267806267, + "grad_norm": 0.45330923795700073, + "learning_rate": 0.00018319553302381997, + "loss": 0.9321, + "step": 4222 + }, + { + "epoch": 0.7519586894586895, + "grad_norm": 0.5278468132019043, + "learning_rate": 0.00018318776581474847, + "loss": 1.1334, + "step": 4223 + }, + { + "epoch": 0.7521367521367521, + 
"grad_norm": 0.49267473816871643, + "learning_rate": 0.00018317999697578549, + "loss": 1.1577, + "step": 4224 + }, + { + "epoch": 0.7523148148148148, + "grad_norm": 0.5372124314308167, + "learning_rate": 0.00018317222650708325, + "loss": 1.037, + "step": 4225 + }, + { + "epoch": 0.7524928774928775, + "grad_norm": 0.5879829525947571, + "learning_rate": 0.000183164454408794, + "loss": 1.1312, + "step": 4226 + }, + { + "epoch": 0.7526709401709402, + "grad_norm": 0.5363932251930237, + "learning_rate": 0.00018315668068107004, + "loss": 1.174, + "step": 4227 + }, + { + "epoch": 0.7528490028490028, + "grad_norm": 0.5585991740226746, + "learning_rate": 0.00018314890532406366, + "loss": 1.2106, + "step": 4228 + }, + { + "epoch": 0.7530270655270656, + "grad_norm": 0.49395787715911865, + "learning_rate": 0.0001831411283379272, + "loss": 1.1163, + "step": 4229 + }, + { + "epoch": 0.7532051282051282, + "grad_norm": 0.5081066489219666, + "learning_rate": 0.00018313334972281306, + "loss": 1.184, + "step": 4230 + }, + { + "epoch": 0.7533831908831908, + "grad_norm": 0.40304034948349, + "learning_rate": 0.0001831255694788736, + "loss": 0.7548, + "step": 4231 + }, + { + "epoch": 0.7535612535612536, + "grad_norm": 0.4999815821647644, + "learning_rate": 0.0001831177876062613, + "loss": 1.0092, + "step": 4232 + }, + { + "epoch": 0.7537393162393162, + "grad_norm": 0.48917025327682495, + "learning_rate": 0.00018311000410512862, + "loss": 1.0354, + "step": 4233 + }, + { + "epoch": 0.7539173789173789, + "grad_norm": 0.475606769323349, + "learning_rate": 0.00018310221897562806, + "loss": 0.8728, + "step": 4234 + }, + { + "epoch": 0.7540954415954416, + "grad_norm": 0.630439817905426, + "learning_rate": 0.00018309443221791214, + "loss": 1.1436, + "step": 4235 + }, + { + "epoch": 0.7542735042735043, + "grad_norm": 0.524740993976593, + "learning_rate": 0.00018308664383213344, + "loss": 1.0487, + "step": 4236 + }, + { + "epoch": 0.7544515669515669, + "grad_norm": 0.4734523892402649, + 
"learning_rate": 0.0001830788538184445, + "loss": 1.0681, + "step": 4237 + }, + { + "epoch": 0.7546296296296297, + "grad_norm": 0.5767266750335693, + "learning_rate": 0.00018307106217699807, + "loss": 1.0599, + "step": 4238 + }, + { + "epoch": 0.7548076923076923, + "grad_norm": 0.6276642084121704, + "learning_rate": 0.0001830632689079467, + "loss": 1.2837, + "step": 4239 + }, + { + "epoch": 0.7549857549857549, + "grad_norm": 0.5539988279342651, + "learning_rate": 0.00018305547401144316, + "loss": 0.9072, + "step": 4240 + }, + { + "epoch": 0.7551638176638177, + "grad_norm": 0.4551292061805725, + "learning_rate": 0.00018304767748764014, + "loss": 1.0204, + "step": 4241 + }, + { + "epoch": 0.7553418803418803, + "grad_norm": 0.47344550490379333, + "learning_rate": 0.00018303987933669034, + "loss": 1.0473, + "step": 4242 + }, + { + "epoch": 0.7555199430199431, + "grad_norm": 0.6050213575363159, + "learning_rate": 0.00018303207955874665, + "loss": 1.1552, + "step": 4243 + }, + { + "epoch": 0.7556980056980057, + "grad_norm": 0.48943889141082764, + "learning_rate": 0.00018302427815396186, + "loss": 1.0002, + "step": 4244 + }, + { + "epoch": 0.7558760683760684, + "grad_norm": 0.5664682984352112, + "learning_rate": 0.00018301647512248878, + "loss": 1.1865, + "step": 4245 + }, + { + "epoch": 0.7560541310541311, + "grad_norm": 0.5702242255210876, + "learning_rate": 0.00018300867046448034, + "loss": 1.3029, + "step": 4246 + }, + { + "epoch": 0.7562321937321937, + "grad_norm": 0.593207836151123, + "learning_rate": 0.00018300086418008942, + "loss": 1.109, + "step": 4247 + }, + { + "epoch": 0.7564102564102564, + "grad_norm": 0.5887887477874756, + "learning_rate": 0.000182993056269469, + "loss": 1.3022, + "step": 4248 + }, + { + "epoch": 0.7565883190883191, + "grad_norm": 0.5277966260910034, + "learning_rate": 0.00018298524673277203, + "loss": 1.1738, + "step": 4249 + }, + { + "epoch": 0.7567663817663818, + "grad_norm": 0.589347779750824, + "learning_rate": 0.00018297743557015155, 
+ "loss": 1.0185, + "step": 4250 + }, + { + "epoch": 0.7569444444444444, + "grad_norm": 0.49920859932899475, + "learning_rate": 0.0001829696227817606, + "loss": 1.118, + "step": 4251 + }, + { + "epoch": 0.7571225071225072, + "grad_norm": 0.502565324306488, + "learning_rate": 0.0001829618083677522, + "loss": 1.1856, + "step": 4252 + }, + { + "epoch": 0.7573005698005698, + "grad_norm": 0.49814435839653015, + "learning_rate": 0.00018295399232827955, + "loss": 1.0432, + "step": 4253 + }, + { + "epoch": 0.7574786324786325, + "grad_norm": 0.5087502598762512, + "learning_rate": 0.00018294617466349574, + "loss": 1.2325, + "step": 4254 + }, + { + "epoch": 0.7576566951566952, + "grad_norm": 0.5107288956642151, + "learning_rate": 0.00018293835537355394, + "loss": 1.0487, + "step": 4255 + }, + { + "epoch": 0.7578347578347578, + "grad_norm": 0.524725615978241, + "learning_rate": 0.00018293053445860732, + "loss": 1.1821, + "step": 4256 + }, + { + "epoch": 0.7580128205128205, + "grad_norm": 0.5234082937240601, + "learning_rate": 0.0001829227119188092, + "loss": 0.8896, + "step": 4257 + }, + { + "epoch": 0.7581908831908832, + "grad_norm": 0.5102918744087219, + "learning_rate": 0.00018291488775431275, + "loss": 1.0246, + "step": 4258 + }, + { + "epoch": 0.7583689458689459, + "grad_norm": 0.5552714467048645, + "learning_rate": 0.00018290706196527135, + "loss": 1.0193, + "step": 4259 + }, + { + "epoch": 0.7585470085470085, + "grad_norm": 0.5395022630691528, + "learning_rate": 0.00018289923455183825, + "loss": 1.3203, + "step": 4260 + }, + { + "epoch": 0.7587250712250713, + "grad_norm": 0.7474865913391113, + "learning_rate": 0.00018289140551416692, + "loss": 1.182, + "step": 4261 + }, + { + "epoch": 0.7589031339031339, + "grad_norm": 0.4892016649246216, + "learning_rate": 0.00018288357485241066, + "loss": 0.968, + "step": 4262 + }, + { + "epoch": 0.7590811965811965, + "grad_norm": 0.4627816081047058, + "learning_rate": 0.00018287574256672291, + "loss": 0.6895, + "step": 4263 + }, + { 
+ "epoch": 0.7592592592592593, + "grad_norm": 0.6221280097961426, + "learning_rate": 0.00018286790865725715, + "loss": 0.9691, + "step": 4264 + }, + { + "epoch": 0.7594373219373219, + "grad_norm": 0.5542295575141907, + "learning_rate": 0.0001828600731241669, + "loss": 0.9996, + "step": 4265 + }, + { + "epoch": 0.7596153846153846, + "grad_norm": 0.5570770502090454, + "learning_rate": 0.00018285223596760562, + "loss": 1.1996, + "step": 4266 + }, + { + "epoch": 0.7597934472934473, + "grad_norm": 0.5495262742042542, + "learning_rate": 0.00018284439718772687, + "loss": 1.1572, + "step": 4267 + }, + { + "epoch": 0.75997150997151, + "grad_norm": 0.5006741881370544, + "learning_rate": 0.00018283655678468427, + "loss": 1.1215, + "step": 4268 + }, + { + "epoch": 0.7601495726495726, + "grad_norm": 0.4682157635688782, + "learning_rate": 0.00018282871475863144, + "loss": 1.0547, + "step": 4269 + }, + { + "epoch": 0.7603276353276354, + "grad_norm": 0.6275840997695923, + "learning_rate": 0.00018282087110972197, + "loss": 1.3855, + "step": 4270 + }, + { + "epoch": 0.760505698005698, + "grad_norm": 0.5341474413871765, + "learning_rate": 0.0001828130258381096, + "loss": 1.2024, + "step": 4271 + }, + { + "epoch": 0.7606837606837606, + "grad_norm": 0.4330833852291107, + "learning_rate": 0.000182805178943948, + "loss": 1.0508, + "step": 4272 + }, + { + "epoch": 0.7608618233618234, + "grad_norm": 0.6276537179946899, + "learning_rate": 0.00018279733042739094, + "loss": 1.1635, + "step": 4273 + }, + { + "epoch": 0.761039886039886, + "grad_norm": 0.5370199084281921, + "learning_rate": 0.00018278948028859217, + "loss": 1.0579, + "step": 4274 + }, + { + "epoch": 0.7612179487179487, + "grad_norm": 0.524959921836853, + "learning_rate": 0.00018278162852770552, + "loss": 1.0972, + "step": 4275 + }, + { + "epoch": 0.7613960113960114, + "grad_norm": 0.5029389262199402, + "learning_rate": 0.00018277377514488486, + "loss": 0.959, + "step": 4276 + }, + { + "epoch": 0.7615740740740741, + "grad_norm": 
0.49772894382476807, + "learning_rate": 0.00018276592014028397, + "loss": 1.2773, + "step": 4277 + }, + { + "epoch": 0.7617521367521367, + "grad_norm": 0.5195719003677368, + "learning_rate": 0.00018275806351405685, + "loss": 1.0676, + "step": 4278 + }, + { + "epoch": 0.7619301994301995, + "grad_norm": 0.5167942643165588, + "learning_rate": 0.00018275020526635735, + "loss": 1.0615, + "step": 4279 + }, + { + "epoch": 0.7621082621082621, + "grad_norm": 0.4958035945892334, + "learning_rate": 0.0001827423453973395, + "loss": 0.9605, + "step": 4280 + }, + { + "epoch": 0.7622863247863247, + "grad_norm": 0.6256808042526245, + "learning_rate": 0.00018273448390715728, + "loss": 1.2526, + "step": 4281 + }, + { + "epoch": 0.7624643874643875, + "grad_norm": 0.5062580108642578, + "learning_rate": 0.0001827266207959647, + "loss": 1.0604, + "step": 4282 + }, + { + "epoch": 0.7626424501424501, + "grad_norm": 0.5080778002738953, + "learning_rate": 0.00018271875606391583, + "loss": 1.1246, + "step": 4283 + }, + { + "epoch": 0.7628205128205128, + "grad_norm": 0.5069389939308167, + "learning_rate": 0.00018271088971116479, + "loss": 1.3158, + "step": 4284 + }, + { + "epoch": 0.7629985754985755, + "grad_norm": 0.7280121445655823, + "learning_rate": 0.00018270302173786567, + "loss": 1.2066, + "step": 4285 + }, + { + "epoch": 0.7631766381766382, + "grad_norm": 0.6523470282554626, + "learning_rate": 0.00018269515214417267, + "loss": 1.3236, + "step": 4286 + }, + { + "epoch": 0.7633547008547008, + "grad_norm": 0.5799322724342346, + "learning_rate": 0.00018268728093023988, + "loss": 0.9786, + "step": 4287 + }, + { + "epoch": 0.7635327635327636, + "grad_norm": 0.46675166487693787, + "learning_rate": 0.00018267940809622163, + "loss": 0.8131, + "step": 4288 + }, + { + "epoch": 0.7637108262108262, + "grad_norm": 0.5566182732582092, + "learning_rate": 0.00018267153364227214, + "loss": 1.0565, + "step": 4289 + }, + { + "epoch": 0.7638888888888888, + "grad_norm": 0.532028079032898, + 
"learning_rate": 0.00018266365756854566, + "loss": 0.952, + "step": 4290 + }, + { + "epoch": 0.7640669515669516, + "grad_norm": 0.5082666873931885, + "learning_rate": 0.00018265577987519653, + "loss": 1.0704, + "step": 4291 + }, + { + "epoch": 0.7642450142450142, + "grad_norm": 0.5223562717437744, + "learning_rate": 0.00018264790056237912, + "loss": 1.1161, + "step": 4292 + }, + { + "epoch": 0.7644230769230769, + "grad_norm": 0.48472318053245544, + "learning_rate": 0.00018264001963024778, + "loss": 0.8784, + "step": 4293 + }, + { + "epoch": 0.7646011396011396, + "grad_norm": 0.5901281833648682, + "learning_rate": 0.0001826321370789569, + "loss": 1.1031, + "step": 4294 + }, + { + "epoch": 0.7647792022792023, + "grad_norm": 0.570350706577301, + "learning_rate": 0.000182624252908661, + "loss": 0.9047, + "step": 4295 + }, + { + "epoch": 0.7649572649572649, + "grad_norm": 0.568373441696167, + "learning_rate": 0.00018261636711951445, + "loss": 1.0106, + "step": 4296 + }, + { + "epoch": 0.7651353276353277, + "grad_norm": 0.6175880432128906, + "learning_rate": 0.00018260847971167182, + "loss": 1.3531, + "step": 4297 + }, + { + "epoch": 0.7653133903133903, + "grad_norm": 0.5682594776153564, + "learning_rate": 0.00018260059068528762, + "loss": 1.1261, + "step": 4298 + }, + { + "epoch": 0.7654914529914529, + "grad_norm": 0.5050225257873535, + "learning_rate": 0.00018259270004051644, + "loss": 1.0921, + "step": 4299 + }, + { + "epoch": 0.7656695156695157, + "grad_norm": 0.5416565537452698, + "learning_rate": 0.0001825848077775129, + "loss": 1.0881, + "step": 4300 + }, + { + "epoch": 0.7658475783475783, + "grad_norm": 0.5418867468833923, + "learning_rate": 0.0001825769138964316, + "loss": 1.2069, + "step": 4301 + }, + { + "epoch": 0.7660256410256411, + "grad_norm": 0.5447866320610046, + "learning_rate": 0.00018256901839742718, + "loss": 1.1827, + "step": 4302 + }, + { + "epoch": 0.7662037037037037, + "grad_norm": 0.5482802987098694, + "learning_rate": 0.00018256112128065439, + 
"loss": 1.0492, + "step": 4303 + }, + { + "epoch": 0.7663817663817664, + "grad_norm": 0.5059601664543152, + "learning_rate": 0.0001825532225462679, + "loss": 1.0996, + "step": 4304 + }, + { + "epoch": 0.7665598290598291, + "grad_norm": 0.5153701901435852, + "learning_rate": 0.00018254532219442258, + "loss": 1.3237, + "step": 4305 + }, + { + "epoch": 0.7667378917378918, + "grad_norm": 0.5370768904685974, + "learning_rate": 0.0001825374202252731, + "loss": 0.9925, + "step": 4306 + }, + { + "epoch": 0.7669159544159544, + "grad_norm": 0.4516580402851105, + "learning_rate": 0.00018252951663897432, + "loss": 1.0749, + "step": 4307 + }, + { + "epoch": 0.7670940170940171, + "grad_norm": 0.5565171837806702, + "learning_rate": 0.0001825216114356811, + "loss": 1.1617, + "step": 4308 + }, + { + "epoch": 0.7672720797720798, + "grad_norm": 0.5212662220001221, + "learning_rate": 0.00018251370461554834, + "loss": 1.1108, + "step": 4309 + }, + { + "epoch": 0.7674501424501424, + "grad_norm": 0.49061715602874756, + "learning_rate": 0.00018250579617873095, + "loss": 1.0881, + "step": 4310 + }, + { + "epoch": 0.7676282051282052, + "grad_norm": 0.5535751581192017, + "learning_rate": 0.00018249788612538387, + "loss": 0.9341, + "step": 4311 + }, + { + "epoch": 0.7678062678062678, + "grad_norm": 0.5425209403038025, + "learning_rate": 0.00018248997445566208, + "loss": 1.1858, + "step": 4312 + }, + { + "epoch": 0.7679843304843305, + "grad_norm": 0.6224395036697388, + "learning_rate": 0.0001824820611697206, + "loss": 1.0836, + "step": 4313 + }, + { + "epoch": 0.7681623931623932, + "grad_norm": 0.4895690977573395, + "learning_rate": 0.00018247414626771445, + "loss": 0.8598, + "step": 4314 + }, + { + "epoch": 0.7683404558404558, + "grad_norm": 0.5279615521430969, + "learning_rate": 0.00018246622974979877, + "loss": 1.1742, + "step": 4315 + }, + { + "epoch": 0.7685185185185185, + "grad_norm": 0.45300471782684326, + "learning_rate": 0.0001824583116161286, + "loss": 0.8872, + "step": 4316 + }, + { 
+ "epoch": 0.7686965811965812, + "grad_norm": 0.6499692797660828, + "learning_rate": 0.00018245039186685916, + "loss": 1.2495, + "step": 4317 + }, + { + "epoch": 0.7688746438746439, + "grad_norm": 0.48151278495788574, + "learning_rate": 0.00018244247050214552, + "loss": 1.2382, + "step": 4318 + }, + { + "epoch": 0.7690527065527065, + "grad_norm": 0.6597028374671936, + "learning_rate": 0.0001824345475221429, + "loss": 1.3453, + "step": 4319 + }, + { + "epoch": 0.7692307692307693, + "grad_norm": 0.4536992609500885, + "learning_rate": 0.0001824266229270066, + "loss": 1.1141, + "step": 4320 + }, + { + "epoch": 0.7694088319088319, + "grad_norm": 0.5489405393600464, + "learning_rate": 0.00018241869671689184, + "loss": 1.0333, + "step": 4321 + }, + { + "epoch": 0.7695868945868946, + "grad_norm": 0.5741586089134216, + "learning_rate": 0.00018241076889195394, + "loss": 0.9939, + "step": 4322 + }, + { + "epoch": 0.7697649572649573, + "grad_norm": 0.47170960903167725, + "learning_rate": 0.00018240283945234823, + "loss": 0.9878, + "step": 4323 + }, + { + "epoch": 0.76994301994302, + "grad_norm": 0.4729093313217163, + "learning_rate": 0.00018239490839823004, + "loss": 1.0087, + "step": 4324 + }, + { + "epoch": 0.7701210826210826, + "grad_norm": 0.49869823455810547, + "learning_rate": 0.0001823869757297548, + "loss": 1.169, + "step": 4325 + }, + { + "epoch": 0.7702991452991453, + "grad_norm": 0.5118468403816223, + "learning_rate": 0.0001823790414470779, + "loss": 1.1092, + "step": 4326 + }, + { + "epoch": 0.770477207977208, + "grad_norm": 0.5076048970222473, + "learning_rate": 0.0001823711055503548, + "loss": 1.1028, + "step": 4327 + }, + { + "epoch": 0.7706552706552706, + "grad_norm": 0.5661569237709045, + "learning_rate": 0.00018236316803974098, + "loss": 1.1114, + "step": 4328 + }, + { + "epoch": 0.7708333333333334, + "grad_norm": 0.5542354583740234, + "learning_rate": 0.000182355228915392, + "loss": 1.0931, + "step": 4329 + }, + { + "epoch": 0.771011396011396, + "grad_norm": 
0.5476680994033813, + "learning_rate": 0.0001823472881774634, + "loss": 1.036, + "step": 4330 + }, + { + "epoch": 0.7711894586894587, + "grad_norm": 0.5449798703193665, + "learning_rate": 0.00018233934582611073, + "loss": 1.0682, + "step": 4331 + }, + { + "epoch": 0.7713675213675214, + "grad_norm": 0.61089026927948, + "learning_rate": 0.00018233140186148963, + "loss": 1.0748, + "step": 4332 + }, + { + "epoch": 0.771545584045584, + "grad_norm": 0.5015206336975098, + "learning_rate": 0.00018232345628375576, + "loss": 1.2032, + "step": 4333 + }, + { + "epoch": 0.7717236467236467, + "grad_norm": 0.579289972782135, + "learning_rate": 0.00018231550909306475, + "loss": 1.0764, + "step": 4334 + }, + { + "epoch": 0.7719017094017094, + "grad_norm": 0.5889299511909485, + "learning_rate": 0.00018230756028957235, + "loss": 1.1768, + "step": 4335 + }, + { + "epoch": 0.7720797720797721, + "grad_norm": 0.5328249335289001, + "learning_rate": 0.00018229960987343428, + "loss": 1.0055, + "step": 4336 + }, + { + "epoch": 0.7722578347578347, + "grad_norm": 0.5766382217407227, + "learning_rate": 0.0001822916578448063, + "loss": 0.9923, + "step": 4337 + }, + { + "epoch": 0.7724358974358975, + "grad_norm": 0.6448187828063965, + "learning_rate": 0.00018228370420384423, + "loss": 1.1135, + "step": 4338 + }, + { + "epoch": 0.7726139601139601, + "grad_norm": 0.5505210757255554, + "learning_rate": 0.00018227574895070394, + "loss": 1.2048, + "step": 4339 + }, + { + "epoch": 0.7727920227920227, + "grad_norm": 0.6278925538063049, + "learning_rate": 0.00018226779208554126, + "loss": 1.1045, + "step": 4340 + }, + { + "epoch": 0.7729700854700855, + "grad_norm": 0.5345009565353394, + "learning_rate": 0.00018225983360851207, + "loss": 1.0102, + "step": 4341 + }, + { + "epoch": 0.7731481481481481, + "grad_norm": 0.566633403301239, + "learning_rate": 0.00018225187351977233, + "loss": 1.0038, + "step": 4342 + }, + { + "epoch": 0.7733262108262108, + "grad_norm": 0.5066078901290894, + "learning_rate": 
0.000182243911819478, + "loss": 1.0339, + "step": 4343 + }, + { + "epoch": 0.7735042735042735, + "grad_norm": 0.5614920258522034, + "learning_rate": 0.00018223594850778503, + "loss": 1.1021, + "step": 4344 + }, + { + "epoch": 0.7736823361823362, + "grad_norm": 0.7747337818145752, + "learning_rate": 0.0001822279835848495, + "loss": 1.1129, + "step": 4345 + }, + { + "epoch": 0.7738603988603988, + "grad_norm": 0.7066529989242554, + "learning_rate": 0.00018222001705082744, + "loss": 1.3234, + "step": 4346 + }, + { + "epoch": 0.7740384615384616, + "grad_norm": 0.6340884566307068, + "learning_rate": 0.00018221204890587497, + "loss": 1.0726, + "step": 4347 + }, + { + "epoch": 0.7742165242165242, + "grad_norm": 0.5401145815849304, + "learning_rate": 0.00018220407915014818, + "loss": 0.9904, + "step": 4348 + }, + { + "epoch": 0.7743945868945868, + "grad_norm": 0.5069159269332886, + "learning_rate": 0.00018219610778380315, + "loss": 1.0654, + "step": 4349 + }, + { + "epoch": 0.7745726495726496, + "grad_norm": 0.5422839522361755, + "learning_rate": 0.00018218813480699623, + "loss": 1.1741, + "step": 4350 + }, + { + "epoch": 0.7747507122507122, + "grad_norm": 0.5550300478935242, + "learning_rate": 0.0001821801602198835, + "loss": 1.0033, + "step": 4351 + }, + { + "epoch": 0.7749287749287749, + "grad_norm": 0.5987736582756042, + "learning_rate": 0.00018217218402262123, + "loss": 0.935, + "step": 4352 + }, + { + "epoch": 0.7751068376068376, + "grad_norm": 0.6137008666992188, + "learning_rate": 0.00018216420621536573, + "loss": 1.17, + "step": 4353 + }, + { + "epoch": 0.7752849002849003, + "grad_norm": 0.47124359011650085, + "learning_rate": 0.0001821562267982733, + "loss": 0.8316, + "step": 4354 + }, + { + "epoch": 0.7754629629629629, + "grad_norm": 0.5057868361473083, + "learning_rate": 0.00018214824577150024, + "loss": 1.0246, + "step": 4355 + }, + { + "epoch": 0.7756410256410257, + "grad_norm": 0.604055643081665, + "learning_rate": 0.00018214026313520299, + "loss": 1.1272, + 
"step": 4356 + }, + { + "epoch": 0.7758190883190883, + "grad_norm": 0.6690384149551392, + "learning_rate": 0.0001821322788895379, + "loss": 1.0464, + "step": 4357 + }, + { + "epoch": 0.7759971509971509, + "grad_norm": 0.5458958745002747, + "learning_rate": 0.0001821242930346614, + "loss": 1.1712, + "step": 4358 + }, + { + "epoch": 0.7761752136752137, + "grad_norm": 0.6448663473129272, + "learning_rate": 0.00018211630557073, + "loss": 1.1125, + "step": 4359 + }, + { + "epoch": 0.7763532763532763, + "grad_norm": 0.49889448285102844, + "learning_rate": 0.00018210831649790018, + "loss": 1.097, + "step": 4360 + }, + { + "epoch": 0.7765313390313391, + "grad_norm": 0.5118046998977661, + "learning_rate": 0.00018210032581632843, + "loss": 1.009, + "step": 4361 + }, + { + "epoch": 0.7767094017094017, + "grad_norm": 0.5450068116188049, + "learning_rate": 0.00018209233352617135, + "loss": 1.1138, + "step": 4362 + }, + { + "epoch": 0.7768874643874644, + "grad_norm": 0.6147481203079224, + "learning_rate": 0.00018208433962758558, + "loss": 1.212, + "step": 4363 + }, + { + "epoch": 0.7770655270655271, + "grad_norm": 0.554176926612854, + "learning_rate": 0.00018207634412072764, + "loss": 1.1271, + "step": 4364 + }, + { + "epoch": 0.7772435897435898, + "grad_norm": 0.5872851014137268, + "learning_rate": 0.00018206834700575426, + "loss": 1.2793, + "step": 4365 + }, + { + "epoch": 0.7774216524216524, + "grad_norm": 0.5135685205459595, + "learning_rate": 0.00018206034828282207, + "loss": 0.9642, + "step": 4366 + }, + { + "epoch": 0.7775997150997151, + "grad_norm": 0.5699490308761597, + "learning_rate": 0.00018205234795208786, + "loss": 0.9086, + "step": 4367 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 0.5908057689666748, + "learning_rate": 0.00018204434601370832, + "loss": 1.1973, + "step": 4368 + }, + { + "epoch": 0.7779558404558404, + "grad_norm": 0.5777581334114075, + "learning_rate": 0.00018203634246784025, + "loss": 1.0447, + "step": 4369 + }, + { + "epoch": 
0.7781339031339032, + "grad_norm": 0.4822927713394165, + "learning_rate": 0.00018202833731464048, + "loss": 0.814, + "step": 4370 + }, + { + "epoch": 0.7783119658119658, + "grad_norm": 0.5343610644340515, + "learning_rate": 0.0001820203305542658, + "loss": 1.2785, + "step": 4371 + }, + { + "epoch": 0.7784900284900285, + "grad_norm": 0.5462222695350647, + "learning_rate": 0.00018201232218687316, + "loss": 1.1785, + "step": 4372 + }, + { + "epoch": 0.7786680911680912, + "grad_norm": 0.5177609324455261, + "learning_rate": 0.00018200431221261943, + "loss": 1.111, + "step": 4373 + }, + { + "epoch": 0.7788461538461539, + "grad_norm": 0.5324625968933105, + "learning_rate": 0.00018199630063166157, + "loss": 1.0738, + "step": 4374 + }, + { + "epoch": 0.7790242165242165, + "grad_norm": 0.6392876505851746, + "learning_rate": 0.0001819882874441565, + "loss": 1.1758, + "step": 4375 + }, + { + "epoch": 0.7792022792022792, + "grad_norm": 0.49964696168899536, + "learning_rate": 0.00018198027265026127, + "loss": 1.0556, + "step": 4376 + }, + { + "epoch": 0.7793803418803419, + "grad_norm": 0.6090660691261292, + "learning_rate": 0.00018197225625013287, + "loss": 1.0102, + "step": 4377 + }, + { + "epoch": 0.7795584045584045, + "grad_norm": 0.5242345929145813, + "learning_rate": 0.00018196423824392842, + "loss": 0.8335, + "step": 4378 + }, + { + "epoch": 0.7797364672364673, + "grad_norm": 0.5265036225318909, + "learning_rate": 0.00018195621863180498, + "loss": 1.0781, + "step": 4379 + }, + { + "epoch": 0.7799145299145299, + "grad_norm": 0.5115378499031067, + "learning_rate": 0.0001819481974139197, + "loss": 1.1658, + "step": 4380 + }, + { + "epoch": 0.7800925925925926, + "grad_norm": 0.6489549875259399, + "learning_rate": 0.00018194017459042972, + "loss": 1.0572, + "step": 4381 + }, + { + "epoch": 0.7802706552706553, + "grad_norm": 0.5800202488899231, + "learning_rate": 0.0001819321501614922, + "loss": 0.9593, + "step": 4382 + }, + { + "epoch": 0.780448717948718, + "grad_norm": 
0.5608528256416321, + "learning_rate": 0.00018192412412726443, + "loss": 1.0324, + "step": 4383 + }, + { + "epoch": 0.7806267806267806, + "grad_norm": 0.5596401691436768, + "learning_rate": 0.00018191609648790362, + "loss": 1.071, + "step": 4384 + }, + { + "epoch": 0.7808048433048433, + "grad_norm": 0.5712903141975403, + "learning_rate": 0.00018190806724356707, + "loss": 0.9011, + "step": 4385 + }, + { + "epoch": 0.780982905982906, + "grad_norm": 0.5079438090324402, + "learning_rate": 0.0001819000363944121, + "loss": 1.1194, + "step": 4386 + }, + { + "epoch": 0.7811609686609686, + "grad_norm": 0.5785079598426819, + "learning_rate": 0.00018189200394059602, + "loss": 1.1703, + "step": 4387 + }, + { + "epoch": 0.7813390313390314, + "grad_norm": 0.6901816129684448, + "learning_rate": 0.00018188396988227625, + "loss": 1.6689, + "step": 4388 + }, + { + "epoch": 0.781517094017094, + "grad_norm": 0.48107922077178955, + "learning_rate": 0.00018187593421961022, + "loss": 1.0116, + "step": 4389 + }, + { + "epoch": 0.7816951566951567, + "grad_norm": 0.5843084454536438, + "learning_rate": 0.0001818678969527553, + "loss": 1.1172, + "step": 4390 + }, + { + "epoch": 0.7818732193732194, + "grad_norm": 0.479034423828125, + "learning_rate": 0.00018185985808186902, + "loss": 0.811, + "step": 4391 + }, + { + "epoch": 0.782051282051282, + "grad_norm": 0.5864158272743225, + "learning_rate": 0.00018185181760710888, + "loss": 0.9522, + "step": 4392 + }, + { + "epoch": 0.7822293447293447, + "grad_norm": 0.4824625551700592, + "learning_rate": 0.00018184377552863242, + "loss": 0.9039, + "step": 4393 + }, + { + "epoch": 0.7824074074074074, + "grad_norm": 0.580102801322937, + "learning_rate": 0.00018183573184659717, + "loss": 1.2382, + "step": 4394 + }, + { + "epoch": 0.7825854700854701, + "grad_norm": 0.5300056338310242, + "learning_rate": 0.00018182768656116073, + "loss": 1.2268, + "step": 4395 + }, + { + "epoch": 0.7827635327635327, + "grad_norm": 0.5548123121261597, + "learning_rate": 
0.00018181963967248078, + "loss": 1.0628, + "step": 4396 + }, + { + "epoch": 0.7829415954415955, + "grad_norm": 0.5485070943832397, + "learning_rate": 0.00018181159118071496, + "loss": 0.9628, + "step": 4397 + }, + { + "epoch": 0.7831196581196581, + "grad_norm": 0.47405415773391724, + "learning_rate": 0.00018180354108602095, + "loss": 1.1413, + "step": 4398 + }, + { + "epoch": 0.7832977207977208, + "grad_norm": 0.5545752644538879, + "learning_rate": 0.0001817954893885565, + "loss": 1.3807, + "step": 4399 + }, + { + "epoch": 0.7834757834757835, + "grad_norm": 0.5339497327804565, + "learning_rate": 0.00018178743608847933, + "loss": 0.9978, + "step": 4400 + }, + { + "epoch": 0.7836538461538461, + "grad_norm": 0.5006352663040161, + "learning_rate": 0.00018177938118594725, + "loss": 0.8873, + "step": 4401 + }, + { + "epoch": 0.7838319088319088, + "grad_norm": 0.4845179319381714, + "learning_rate": 0.00018177132468111812, + "loss": 0.8866, + "step": 4402 + }, + { + "epoch": 0.7840099715099715, + "grad_norm": 0.5240967869758606, + "learning_rate": 0.0001817632665741497, + "loss": 1.0347, + "step": 4403 + }, + { + "epoch": 0.7841880341880342, + "grad_norm": 0.5311884880065918, + "learning_rate": 0.00018175520686519993, + "loss": 1.2065, + "step": 4404 + }, + { + "epoch": 0.7843660968660968, + "grad_norm": 0.5562815070152283, + "learning_rate": 0.00018174714555442673, + "loss": 1.1272, + "step": 4405 + }, + { + "epoch": 0.7845441595441596, + "grad_norm": 0.5524366497993469, + "learning_rate": 0.00018173908264198802, + "loss": 1.2337, + "step": 4406 + }, + { + "epoch": 0.7847222222222222, + "grad_norm": 0.5612216591835022, + "learning_rate": 0.0001817310181280418, + "loss": 1.1809, + "step": 4407 + }, + { + "epoch": 0.7849002849002849, + "grad_norm": 0.5315343737602234, + "learning_rate": 0.000181722952012746, + "loss": 1.0491, + "step": 4408 + }, + { + "epoch": 0.7850783475783476, + "grad_norm": 0.5233435034751892, + "learning_rate": 0.00018171488429625878, + "loss": 
1.0457, + "step": 4409 + }, + { + "epoch": 0.7852564102564102, + "grad_norm": 0.7809093594551086, + "learning_rate": 0.00018170681497873813, + "loss": 1.1578, + "step": 4410 + }, + { + "epoch": 0.7854344729344729, + "grad_norm": 0.49659839272499084, + "learning_rate": 0.00018169874406034217, + "loss": 1.0815, + "step": 4411 + }, + { + "epoch": 0.7856125356125356, + "grad_norm": 0.5020765066146851, + "learning_rate": 0.00018169067154122904, + "loss": 1.1985, + "step": 4412 + }, + { + "epoch": 0.7857905982905983, + "grad_norm": 0.6408432126045227, + "learning_rate": 0.0001816825974215569, + "loss": 1.2272, + "step": 4413 + }, + { + "epoch": 0.7859686609686609, + "grad_norm": 0.5062605142593384, + "learning_rate": 0.00018167452170148396, + "loss": 0.9663, + "step": 4414 + }, + { + "epoch": 0.7861467236467237, + "grad_norm": 0.5100119113922119, + "learning_rate": 0.0001816664443811684, + "loss": 1.0256, + "step": 4415 + }, + { + "epoch": 0.7863247863247863, + "grad_norm": 0.5277643799781799, + "learning_rate": 0.00018165836546076854, + "loss": 1.2885, + "step": 4416 + }, + { + "epoch": 0.7865028490028491, + "grad_norm": 0.5568150281906128, + "learning_rate": 0.0001816502849404426, + "loss": 1.2673, + "step": 4417 + }, + { + "epoch": 0.7866809116809117, + "grad_norm": 0.5061392188072205, + "learning_rate": 0.00018164220282034896, + "loss": 1.072, + "step": 4418 + }, + { + "epoch": 0.7868589743589743, + "grad_norm": 0.5383077263832092, + "learning_rate": 0.00018163411910064597, + "loss": 1.0621, + "step": 4419 + }, + { + "epoch": 0.7870370370370371, + "grad_norm": 0.5167948007583618, + "learning_rate": 0.00018162603378149198, + "loss": 1.099, + "step": 4420 + }, + { + "epoch": 0.7872150997150997, + "grad_norm": 0.5084534287452698, + "learning_rate": 0.0001816179468630454, + "loss": 1.3984, + "step": 4421 + }, + { + "epoch": 0.7873931623931624, + "grad_norm": 0.608762264251709, + "learning_rate": 0.00018160985834546475, + "loss": 1.3553, + "step": 4422 + }, + { + "epoch": 
0.7875712250712251, + "grad_norm": 0.4900866746902466, + "learning_rate": 0.00018160176822890842, + "loss": 1.0009, + "step": 4423 + }, + { + "epoch": 0.7877492877492878, + "grad_norm": 0.5928917527198792, + "learning_rate": 0.00018159367651353496, + "loss": 1.0523, + "step": 4424 + }, + { + "epoch": 0.7879273504273504, + "grad_norm": 0.624422013759613, + "learning_rate": 0.0001815855831995029, + "loss": 1.0519, + "step": 4425 + }, + { + "epoch": 0.7881054131054132, + "grad_norm": 0.5140150785446167, + "learning_rate": 0.00018157748828697082, + "loss": 1.048, + "step": 4426 + }, + { + "epoch": 0.7882834757834758, + "grad_norm": 0.47006943821907043, + "learning_rate": 0.00018156939177609732, + "loss": 1.0067, + "step": 4427 + }, + { + "epoch": 0.7884615384615384, + "grad_norm": 0.5178864002227783, + "learning_rate": 0.00018156129366704105, + "loss": 1.0583, + "step": 4428 + }, + { + "epoch": 0.7886396011396012, + "grad_norm": 0.5279985666275024, + "learning_rate": 0.00018155319395996066, + "loss": 1.3023, + "step": 4429 + }, + { + "epoch": 0.7888176638176638, + "grad_norm": 0.5238787531852722, + "learning_rate": 0.00018154509265501482, + "loss": 1.0851, + "step": 4430 + }, + { + "epoch": 0.7889957264957265, + "grad_norm": 0.5914917588233948, + "learning_rate": 0.00018153698975236228, + "loss": 0.9291, + "step": 4431 + }, + { + "epoch": 0.7891737891737892, + "grad_norm": 0.5046082735061646, + "learning_rate": 0.00018152888525216183, + "loss": 0.9951, + "step": 4432 + }, + { + "epoch": 0.7893518518518519, + "grad_norm": 0.5042256116867065, + "learning_rate": 0.00018152077915457225, + "loss": 1.0243, + "step": 4433 + }, + { + "epoch": 0.7895299145299145, + "grad_norm": 0.5950339436531067, + "learning_rate": 0.0001815126714597523, + "loss": 0.9803, + "step": 4434 + }, + { + "epoch": 0.7897079772079773, + "grad_norm": 0.5163764953613281, + "learning_rate": 0.0001815045621678609, + "loss": 1.0353, + "step": 4435 + }, + { + "epoch": 0.7898860398860399, + "grad_norm": 
0.5166211128234863, + "learning_rate": 0.00018149645127905691, + "loss": 0.9649, + "step": 4436 + }, + { + "epoch": 0.7900641025641025, + "grad_norm": 0.5239769220352173, + "learning_rate": 0.00018148833879349927, + "loss": 0.9747, + "step": 4437 + }, + { + "epoch": 0.7902421652421653, + "grad_norm": 0.5803237557411194, + "learning_rate": 0.00018148022471134692, + "loss": 1.315, + "step": 4438 + }, + { + "epoch": 0.7904202279202279, + "grad_norm": 0.5141370296478271, + "learning_rate": 0.00018147210903275877, + "loss": 1.0547, + "step": 4439 + }, + { + "epoch": 0.7905982905982906, + "grad_norm": 0.545788586139679, + "learning_rate": 0.00018146399175789394, + "loss": 1.0797, + "step": 4440 + }, + { + "epoch": 0.7907763532763533, + "grad_norm": 0.5273314714431763, + "learning_rate": 0.0001814558728869114, + "loss": 0.7928, + "step": 4441 + }, + { + "epoch": 0.790954415954416, + "grad_norm": 0.4614652693271637, + "learning_rate": 0.00018144775241997024, + "loss": 0.8826, + "step": 4442 + }, + { + "epoch": 0.7911324786324786, + "grad_norm": 0.6203590631484985, + "learning_rate": 0.00018143963035722958, + "loss": 1.2891, + "step": 4443 + }, + { + "epoch": 0.7913105413105413, + "grad_norm": 0.4870408773422241, + "learning_rate": 0.0001814315066988485, + "loss": 1.0717, + "step": 4444 + }, + { + "epoch": 0.791488603988604, + "grad_norm": 0.6468982696533203, + "learning_rate": 0.00018142338144498625, + "loss": 1.3398, + "step": 4445 + }, + { + "epoch": 0.7916666666666666, + "grad_norm": 0.4727918207645416, + "learning_rate": 0.00018141525459580197, + "loss": 1.0195, + "step": 4446 + }, + { + "epoch": 0.7918447293447294, + "grad_norm": 0.5080479979515076, + "learning_rate": 0.0001814071261514549, + "loss": 1.0163, + "step": 4447 + }, + { + "epoch": 0.792022792022792, + "grad_norm": 0.5380908250808716, + "learning_rate": 0.0001813989961121043, + "loss": 1.1673, + "step": 4448 + }, + { + "epoch": 0.7922008547008547, + "grad_norm": 0.5020384192466736, + "learning_rate": 
0.00018139086447790945, + "loss": 0.8591, + "step": 4449 + }, + { + "epoch": 0.7923789173789174, + "grad_norm": 0.5279949903488159, + "learning_rate": 0.0001813827312490297, + "loss": 1.1221, + "step": 4450 + }, + { + "epoch": 0.79255698005698, + "grad_norm": 0.6739233732223511, + "learning_rate": 0.00018137459642562437, + "loss": 1.2704, + "step": 4451 + }, + { + "epoch": 0.7927350427350427, + "grad_norm": 0.5112259984016418, + "learning_rate": 0.00018136646000785288, + "loss": 1.1161, + "step": 4452 + }, + { + "epoch": 0.7929131054131054, + "grad_norm": 0.5244031548500061, + "learning_rate": 0.00018135832199587463, + "loss": 0.7866, + "step": 4453 + }, + { + "epoch": 0.7930911680911681, + "grad_norm": 0.5803347229957581, + "learning_rate": 0.0001813501823898491, + "loss": 0.994, + "step": 4454 + }, + { + "epoch": 0.7932692307692307, + "grad_norm": 0.6191152930259705, + "learning_rate": 0.00018134204118993568, + "loss": 1.0725, + "step": 4455 + }, + { + "epoch": 0.7934472934472935, + "grad_norm": 0.549735963344574, + "learning_rate": 0.00018133389839629396, + "loss": 0.9915, + "step": 4456 + }, + { + "epoch": 0.7936253561253561, + "grad_norm": 0.4940381646156311, + "learning_rate": 0.00018132575400908347, + "loss": 1.1815, + "step": 4457 + }, + { + "epoch": 0.7938034188034188, + "grad_norm": 0.5009099245071411, + "learning_rate": 0.00018131760802846377, + "loss": 1.0833, + "step": 4458 + }, + { + "epoch": 0.7939814814814815, + "grad_norm": 0.595853865146637, + "learning_rate": 0.00018130946045459445, + "loss": 1.2774, + "step": 4459 + }, + { + "epoch": 0.7941595441595442, + "grad_norm": 0.534794807434082, + "learning_rate": 0.00018130131128763513, + "loss": 1.0891, + "step": 4460 + }, + { + "epoch": 0.7943376068376068, + "grad_norm": 0.5828582048416138, + "learning_rate": 0.00018129316052774557, + "loss": 1.0786, + "step": 4461 + }, + { + "epoch": 0.7945156695156695, + "grad_norm": 0.4750654697418213, + "learning_rate": 0.00018128500817508533, + "loss": 1.0818, + 
"step": 4462 + }, + { + "epoch": 0.7946937321937322, + "grad_norm": 0.5626576542854309, + "learning_rate": 0.00018127685422981426, + "loss": 1.0807, + "step": 4463 + }, + { + "epoch": 0.7948717948717948, + "grad_norm": 0.6434760093688965, + "learning_rate": 0.00018126869869209203, + "loss": 1.0908, + "step": 4464 + }, + { + "epoch": 0.7950498575498576, + "grad_norm": 0.5577414631843567, + "learning_rate": 0.00018126054156207853, + "loss": 1.0281, + "step": 4465 + }, + { + "epoch": 0.7952279202279202, + "grad_norm": 0.5001249313354492, + "learning_rate": 0.00018125238283993347, + "loss": 0.9083, + "step": 4466 + }, + { + "epoch": 0.7954059829059829, + "grad_norm": 0.5298314690589905, + "learning_rate": 0.00018124422252581676, + "loss": 0.971, + "step": 4467 + }, + { + "epoch": 0.7955840455840456, + "grad_norm": 0.4872737228870392, + "learning_rate": 0.00018123606061988832, + "loss": 1.0515, + "step": 4468 + }, + { + "epoch": 0.7957621082621082, + "grad_norm": 0.5895398259162903, + "learning_rate": 0.00018122789712230798, + "loss": 1.0771, + "step": 4469 + }, + { + "epoch": 0.7959401709401709, + "grad_norm": 0.5212514996528625, + "learning_rate": 0.00018121973203323577, + "loss": 1.0365, + "step": 4470 + }, + { + "epoch": 0.7961182336182336, + "grad_norm": 0.4679451584815979, + "learning_rate": 0.0001812115653528316, + "loss": 0.9445, + "step": 4471 + }, + { + "epoch": 0.7962962962962963, + "grad_norm": 0.5852653980255127, + "learning_rate": 0.00018120339708125552, + "loss": 1.1781, + "step": 4472 + }, + { + "epoch": 0.7964743589743589, + "grad_norm": 0.6081342697143555, + "learning_rate": 0.00018119522721866756, + "loss": 1.3881, + "step": 4473 + }, + { + "epoch": 0.7966524216524217, + "grad_norm": 0.5254155993461609, + "learning_rate": 0.00018118705576522777, + "loss": 1.2198, + "step": 4474 + }, + { + "epoch": 0.7968304843304843, + "grad_norm": 0.5959419012069702, + "learning_rate": 0.00018117888272109632, + "loss": 1.0922, + "step": 4475 + }, + { + "epoch": 
0.7970085470085471, + "grad_norm": 0.6243147253990173, + "learning_rate": 0.0001811707080864333, + "loss": 1.1782, + "step": 4476 + }, + { + "epoch": 0.7971866096866097, + "grad_norm": 0.5336906909942627, + "learning_rate": 0.0001811625318613988, + "loss": 1.167, + "step": 4477 + }, + { + "epoch": 0.7973646723646723, + "grad_norm": 0.5287907719612122, + "learning_rate": 0.00018115435404615315, + "loss": 0.9923, + "step": 4478 + }, + { + "epoch": 0.7975427350427351, + "grad_norm": 0.48941442370414734, + "learning_rate": 0.0001811461746408565, + "loss": 0.863, + "step": 4479 + }, + { + "epoch": 0.7977207977207977, + "grad_norm": 0.48465651273727417, + "learning_rate": 0.0001811379936456691, + "loss": 1.147, + "step": 4480 + }, + { + "epoch": 0.7978988603988604, + "grad_norm": 0.5676067471504211, + "learning_rate": 0.0001811298110607513, + "loss": 1.3121, + "step": 4481 + }, + { + "epoch": 0.7980769230769231, + "grad_norm": 0.4894018769264221, + "learning_rate": 0.00018112162688626337, + "loss": 1.1831, + "step": 4482 + }, + { + "epoch": 0.7982549857549858, + "grad_norm": 0.5626382827758789, + "learning_rate": 0.0001811134411223657, + "loss": 1.1977, + "step": 4483 + }, + { + "epoch": 0.7984330484330484, + "grad_norm": 0.564119815826416, + "learning_rate": 0.00018110525376921862, + "loss": 1.2686, + "step": 4484 + }, + { + "epoch": 0.7986111111111112, + "grad_norm": 0.6385740041732788, + "learning_rate": 0.00018109706482698256, + "loss": 1.2418, + "step": 4485 + }, + { + "epoch": 0.7987891737891738, + "grad_norm": 0.5550164580345154, + "learning_rate": 0.00018108887429581802, + "loss": 1.081, + "step": 4486 + }, + { + "epoch": 0.7989672364672364, + "grad_norm": 0.5583973526954651, + "learning_rate": 0.00018108068217588544, + "loss": 1.1757, + "step": 4487 + }, + { + "epoch": 0.7991452991452992, + "grad_norm": 0.5533342957496643, + "learning_rate": 0.00018107248846734527, + "loss": 1.1947, + "step": 4488 + }, + { + "epoch": 0.7993233618233618, + "grad_norm": 
0.5291479229927063, + "learning_rate": 0.00018106429317035815, + "loss": 1.2769, + "step": 4489 + }, + { + "epoch": 0.7995014245014245, + "grad_norm": 0.4680160582065582, + "learning_rate": 0.00018105609628508458, + "loss": 0.7059, + "step": 4490 + }, + { + "epoch": 0.7996794871794872, + "grad_norm": 0.5364881157875061, + "learning_rate": 0.00018104789781168517, + "loss": 1.0566, + "step": 4491 + }, + { + "epoch": 0.7998575498575499, + "grad_norm": 0.5917307734489441, + "learning_rate": 0.0001810396977503206, + "loss": 1.2263, + "step": 4492 + }, + { + "epoch": 0.8000356125356125, + "grad_norm": 0.6013199090957642, + "learning_rate": 0.0001810314961011515, + "loss": 1.2053, + "step": 4493 + }, + { + "epoch": 0.8002136752136753, + "grad_norm": 0.6005663275718689, + "learning_rate": 0.0001810232928643385, + "loss": 1.2241, + "step": 4494 + }, + { + "epoch": 0.8003917378917379, + "grad_norm": 0.49207603931427, + "learning_rate": 0.00018101508804004246, + "loss": 1.0661, + "step": 4495 + }, + { + "epoch": 0.8005698005698005, + "grad_norm": 0.4834063947200775, + "learning_rate": 0.00018100688162842401, + "loss": 1.1745, + "step": 4496 + }, + { + "epoch": 0.8007478632478633, + "grad_norm": 0.5347156524658203, + "learning_rate": 0.000180998673629644, + "loss": 1.0679, + "step": 4497 + }, + { + "epoch": 0.8009259259259259, + "grad_norm": 0.5815600156784058, + "learning_rate": 0.00018099046404386327, + "loss": 1.2652, + "step": 4498 + }, + { + "epoch": 0.8011039886039886, + "grad_norm": 0.5291135311126709, + "learning_rate": 0.00018098225287124263, + "loss": 1.2072, + "step": 4499 + }, + { + "epoch": 0.8012820512820513, + "grad_norm": 0.5779497027397156, + "learning_rate": 0.000180974040111943, + "loss": 1.3277, + "step": 4500 + }, + { + "epoch": 0.801460113960114, + "grad_norm": 0.44566696882247925, + "learning_rate": 0.0001809658257661252, + "loss": 0.7702, + "step": 4501 + }, + { + "epoch": 0.8016381766381766, + "grad_norm": 0.5407577753067017, + "learning_rate": 
0.00018095760983395027, + "loss": 1.2894, + "step": 4502 + }, + { + "epoch": 0.8018162393162394, + "grad_norm": 0.4771903455257416, + "learning_rate": 0.00018094939231557916, + "loss": 1.045, + "step": 4503 + }, + { + "epoch": 0.801994301994302, + "grad_norm": 0.5970945358276367, + "learning_rate": 0.00018094117321117286, + "loss": 1.2059, + "step": 4504 + }, + { + "epoch": 0.8021723646723646, + "grad_norm": 0.4959338903427124, + "learning_rate": 0.0001809329525208924, + "loss": 1.155, + "step": 4505 + }, + { + "epoch": 0.8023504273504274, + "grad_norm": 0.5142548084259033, + "learning_rate": 0.00018092473024489887, + "loss": 0.9413, + "step": 4506 + }, + { + "epoch": 0.80252849002849, + "grad_norm": 0.5336433053016663, + "learning_rate": 0.00018091650638335334, + "loss": 1.0699, + "step": 4507 + }, + { + "epoch": 0.8027065527065527, + "grad_norm": 0.47770628333091736, + "learning_rate": 0.00018090828093641698, + "loss": 1.1515, + "step": 4508 + }, + { + "epoch": 0.8028846153846154, + "grad_norm": 0.5443438291549683, + "learning_rate": 0.00018090005390425091, + "loss": 1.189, + "step": 4509 + }, + { + "epoch": 0.8030626780626781, + "grad_norm": 0.523179829120636, + "learning_rate": 0.00018089182528701632, + "loss": 1.1272, + "step": 4510 + }, + { + "epoch": 0.8032407407407407, + "grad_norm": 0.49628451466560364, + "learning_rate": 0.00018088359508487448, + "loss": 0.9754, + "step": 4511 + }, + { + "epoch": 0.8034188034188035, + "grad_norm": 0.5933086276054382, + "learning_rate": 0.00018087536329798663, + "loss": 1.2111, + "step": 4512 + }, + { + "epoch": 0.8035968660968661, + "grad_norm": 0.4565310776233673, + "learning_rate": 0.00018086712992651402, + "loss": 0.7729, + "step": 4513 + }, + { + "epoch": 0.8037749287749287, + "grad_norm": 0.5013461112976074, + "learning_rate": 0.00018085889497061798, + "loss": 1.2178, + "step": 4514 + }, + { + "epoch": 0.8039529914529915, + "grad_norm": 0.5170024633407593, + "learning_rate": 0.00018085065843045987, + "loss": 0.9181, 
+ "step": 4515 + }, + { + "epoch": 0.8041310541310541, + "grad_norm": 0.583363950252533, + "learning_rate": 0.00018084242030620104, + "loss": 1.1542, + "step": 4516 + }, + { + "epoch": 0.8043091168091168, + "grad_norm": 0.46835777163505554, + "learning_rate": 0.00018083418059800297, + "loss": 0.8954, + "step": 4517 + }, + { + "epoch": 0.8044871794871795, + "grad_norm": 0.5145657062530518, + "learning_rate": 0.000180825939306027, + "loss": 1.0417, + "step": 4518 + }, + { + "epoch": 0.8046652421652422, + "grad_norm": 0.47216105461120605, + "learning_rate": 0.00018081769643043467, + "loss": 0.9516, + "step": 4519 + }, + { + "epoch": 0.8048433048433048, + "grad_norm": 0.5059915781021118, + "learning_rate": 0.0001808094519713875, + "loss": 1.1643, + "step": 4520 + }, + { + "epoch": 0.8050213675213675, + "grad_norm": 0.5406439900398254, + "learning_rate": 0.00018080120592904692, + "loss": 1.2038, + "step": 4521 + }, + { + "epoch": 0.8051994301994302, + "grad_norm": 0.6123420000076294, + "learning_rate": 0.0001807929583035746, + "loss": 1.4004, + "step": 4522 + }, + { + "epoch": 0.8053774928774928, + "grad_norm": 0.49699845910072327, + "learning_rate": 0.00018078470909513208, + "loss": 1.0347, + "step": 4523 + }, + { + "epoch": 0.8055555555555556, + "grad_norm": 0.5369421243667603, + "learning_rate": 0.000180776458303881, + "loss": 1.0418, + "step": 4524 + }, + { + "epoch": 0.8057336182336182, + "grad_norm": 0.5407396554946899, + "learning_rate": 0.00018076820592998301, + "loss": 0.9546, + "step": 4525 + }, + { + "epoch": 0.8059116809116809, + "grad_norm": 0.5749752521514893, + "learning_rate": 0.00018075995197359984, + "loss": 1.1438, + "step": 4526 + }, + { + "epoch": 0.8060897435897436, + "grad_norm": 0.5523102283477783, + "learning_rate": 0.00018075169643489317, + "loss": 1.1312, + "step": 4527 + }, + { + "epoch": 0.8062678062678063, + "grad_norm": 0.5767508149147034, + "learning_rate": 0.00018074343931402472, + "loss": 1.1951, + "step": 4528 + }, + { + "epoch": 
0.8064458689458689, + "grad_norm": 0.5262924432754517, + "learning_rate": 0.00018073518061115633, + "loss": 1.1985, + "step": 4529 + }, + { + "epoch": 0.8066239316239316, + "grad_norm": 0.4742378294467926, + "learning_rate": 0.0001807269203264498, + "loss": 1.0126, + "step": 4530 + }, + { + "epoch": 0.8068019943019943, + "grad_norm": 0.5190158486366272, + "learning_rate": 0.00018071865846006692, + "loss": 0.9985, + "step": 4531 + }, + { + "epoch": 0.8069800569800569, + "grad_norm": 0.5910618305206299, + "learning_rate": 0.00018071039501216964, + "loss": 1.2776, + "step": 4532 + }, + { + "epoch": 0.8071581196581197, + "grad_norm": 0.5363098382949829, + "learning_rate": 0.00018070212998291983, + "loss": 1.3346, + "step": 4533 + }, + { + "epoch": 0.8073361823361823, + "grad_norm": 0.47711408138275146, + "learning_rate": 0.0001806938633724794, + "loss": 1.04, + "step": 4534 + }, + { + "epoch": 0.8075142450142451, + "grad_norm": 0.5092964172363281, + "learning_rate": 0.0001806855951810104, + "loss": 1.1409, + "step": 4535 + }, + { + "epoch": 0.8076923076923077, + "grad_norm": 0.5828777551651001, + "learning_rate": 0.00018067732540867472, + "loss": 1.3048, + "step": 4536 + }, + { + "epoch": 0.8078703703703703, + "grad_norm": 0.5779826045036316, + "learning_rate": 0.00018066905405563445, + "loss": 1.1599, + "step": 4537 + }, + { + "epoch": 0.8080484330484331, + "grad_norm": 0.49908435344696045, + "learning_rate": 0.00018066078112205167, + "loss": 1.1502, + "step": 4538 + }, + { + "epoch": 0.8082264957264957, + "grad_norm": 0.4772704839706421, + "learning_rate": 0.0001806525066080884, + "loss": 0.7925, + "step": 4539 + }, + { + "epoch": 0.8084045584045584, + "grad_norm": 0.4298383295536041, + "learning_rate": 0.00018064423051390683, + "loss": 0.7322, + "step": 4540 + }, + { + "epoch": 0.8085826210826211, + "grad_norm": 0.49349579215049744, + "learning_rate": 0.0001806359528396691, + "loss": 1.0021, + "step": 4541 + }, + { + "epoch": 0.8087606837606838, + "grad_norm": 
0.4698609411716461, + "learning_rate": 0.00018062767358553735, + "loss": 0.9751, + "step": 4542 + }, + { + "epoch": 0.8089387464387464, + "grad_norm": 0.4949014186859131, + "learning_rate": 0.00018061939275167385, + "loss": 0.9553, + "step": 4543 + }, + { + "epoch": 0.8091168091168092, + "grad_norm": 0.5604463815689087, + "learning_rate": 0.0001806111103382408, + "loss": 0.9894, + "step": 4544 + }, + { + "epoch": 0.8092948717948718, + "grad_norm": 0.5761561989784241, + "learning_rate": 0.00018060282634540053, + "loss": 1.258, + "step": 4545 + }, + { + "epoch": 0.8094729344729344, + "grad_norm": 0.5239115357398987, + "learning_rate": 0.00018059454077331527, + "loss": 0.9189, + "step": 4546 + }, + { + "epoch": 0.8096509971509972, + "grad_norm": 0.47902220487594604, + "learning_rate": 0.00018058625362214742, + "loss": 1.0389, + "step": 4547 + }, + { + "epoch": 0.8098290598290598, + "grad_norm": 0.6274173259735107, + "learning_rate": 0.00018057796489205936, + "loss": 1.3368, + "step": 4548 + }, + { + "epoch": 0.8100071225071225, + "grad_norm": 0.5789401531219482, + "learning_rate": 0.00018056967458321345, + "loss": 1.1473, + "step": 4549 + }, + { + "epoch": 0.8101851851851852, + "grad_norm": 0.5850043296813965, + "learning_rate": 0.0001805613826957721, + "loss": 1.2224, + "step": 4550 + }, + { + "epoch": 0.8103632478632479, + "grad_norm": 0.6310738921165466, + "learning_rate": 0.00018055308922989788, + "loss": 1.0707, + "step": 4551 + }, + { + "epoch": 0.8105413105413105, + "grad_norm": 0.5198429822921753, + "learning_rate": 0.00018054479418575317, + "loss": 0.8984, + "step": 4552 + }, + { + "epoch": 0.8107193732193733, + "grad_norm": 0.5757743120193481, + "learning_rate": 0.00018053649756350054, + "loss": 1.2007, + "step": 4553 + }, + { + "epoch": 0.8108974358974359, + "grad_norm": 0.5109567642211914, + "learning_rate": 0.0001805281993633025, + "loss": 1.0696, + "step": 4554 + }, + { + "epoch": 0.8110754985754985, + "grad_norm": 0.5030225515365601, + "learning_rate": 
0.00018051989958532173, + "loss": 0.9667, + "step": 4555 + }, + { + "epoch": 0.8112535612535613, + "grad_norm": 0.5291743874549866, + "learning_rate": 0.00018051159822972079, + "loss": 1.0219, + "step": 4556 + }, + { + "epoch": 0.8114316239316239, + "grad_norm": 0.5874896049499512, + "learning_rate": 0.00018050329529666233, + "loss": 0.8589, + "step": 4557 + }, + { + "epoch": 0.8116096866096866, + "grad_norm": 0.673284113407135, + "learning_rate": 0.000180494990786309, + "loss": 1.1902, + "step": 4558 + }, + { + "epoch": 0.8117877492877493, + "grad_norm": 0.4742524027824402, + "learning_rate": 0.00018048668469882354, + "loss": 1.0578, + "step": 4559 + }, + { + "epoch": 0.811965811965812, + "grad_norm": 0.5519167184829712, + "learning_rate": 0.0001804783770343687, + "loss": 1.083, + "step": 4560 + }, + { + "epoch": 0.8121438746438746, + "grad_norm": 0.5669941306114197, + "learning_rate": 0.00018047006779310727, + "loss": 1.0784, + "step": 4561 + }, + { + "epoch": 0.8123219373219374, + "grad_norm": 0.512759804725647, + "learning_rate": 0.000180461756975202, + "loss": 1.0361, + "step": 4562 + }, + { + "epoch": 0.8125, + "grad_norm": 0.5721749067306519, + "learning_rate": 0.00018045344458081575, + "loss": 1.0246, + "step": 4563 + }, + { + "epoch": 0.8126780626780626, + "grad_norm": 0.566430389881134, + "learning_rate": 0.00018044513061011137, + "loss": 1.1452, + "step": 4564 + }, + { + "epoch": 0.8128561253561254, + "grad_norm": 0.49391916394233704, + "learning_rate": 0.00018043681506325177, + "loss": 0.89, + "step": 4565 + }, + { + "epoch": 0.813034188034188, + "grad_norm": 0.5379437804222107, + "learning_rate": 0.00018042849794039988, + "loss": 1.1289, + "step": 4566 + }, + { + "epoch": 0.8132122507122507, + "grad_norm": 0.5667982697486877, + "learning_rate": 0.00018042017924171865, + "loss": 1.1596, + "step": 4567 + }, + { + "epoch": 0.8133903133903134, + "grad_norm": 0.6214209794998169, + "learning_rate": 0.00018041185896737109, + "loss": 1.0622, + "step": 4568 + 
}, + { + "epoch": 0.8135683760683761, + "grad_norm": 0.5442491173744202, + "learning_rate": 0.00018040353711752015, + "loss": 1.0536, + "step": 4569 + }, + { + "epoch": 0.8137464387464387, + "grad_norm": 0.5266172885894775, + "learning_rate": 0.00018039521369232894, + "loss": 1.0576, + "step": 4570 + }, + { + "epoch": 0.8139245014245015, + "grad_norm": 0.6057912111282349, + "learning_rate": 0.00018038688869196053, + "loss": 1.3067, + "step": 4571 + }, + { + "epoch": 0.8141025641025641, + "grad_norm": 0.489869087934494, + "learning_rate": 0.00018037856211657803, + "loss": 1.0279, + "step": 4572 + }, + { + "epoch": 0.8142806267806267, + "grad_norm": 0.5497978329658508, + "learning_rate": 0.00018037023396634457, + "loss": 1.1568, + "step": 4573 + }, + { + "epoch": 0.8144586894586895, + "grad_norm": 0.5243251919746399, + "learning_rate": 0.0001803619042414233, + "loss": 0.9767, + "step": 4574 + }, + { + "epoch": 0.8146367521367521, + "grad_norm": 0.503032922744751, + "learning_rate": 0.0001803535729419775, + "loss": 1.065, + "step": 4575 + }, + { + "epoch": 0.8148148148148148, + "grad_norm": 0.49955418705940247, + "learning_rate": 0.00018034524006817034, + "loss": 1.2752, + "step": 4576 + }, + { + "epoch": 0.8149928774928775, + "grad_norm": 0.5746406316757202, + "learning_rate": 0.00018033690562016508, + "loss": 1.098, + "step": 4577 + }, + { + "epoch": 0.8151709401709402, + "grad_norm": 0.5224192142486572, + "learning_rate": 0.00018032856959812507, + "loss": 1.1284, + "step": 4578 + }, + { + "epoch": 0.8153490028490028, + "grad_norm": 0.5484535694122314, + "learning_rate": 0.00018032023200221362, + "loss": 0.9182, + "step": 4579 + }, + { + "epoch": 0.8155270655270656, + "grad_norm": 0.5003355741500854, + "learning_rate": 0.00018031189283259405, + "loss": 1.136, + "step": 4580 + }, + { + "epoch": 0.8157051282051282, + "grad_norm": 0.5395768284797668, + "learning_rate": 0.00018030355208942977, + "loss": 1.2349, + "step": 4581 + }, + { + "epoch": 0.8158831908831908, + 
"grad_norm": 0.561966598033905, + "learning_rate": 0.0001802952097728842, + "loss": 0.999, + "step": 4582 + }, + { + "epoch": 0.8160612535612536, + "grad_norm": 0.4886479675769806, + "learning_rate": 0.00018028686588312083, + "loss": 0.9165, + "step": 4583 + }, + { + "epoch": 0.8162393162393162, + "grad_norm": 0.4769509732723236, + "learning_rate": 0.00018027852042030307, + "loss": 1.1377, + "step": 4584 + }, + { + "epoch": 0.8164173789173789, + "grad_norm": 0.4723633825778961, + "learning_rate": 0.00018027017338459448, + "loss": 1.0274, + "step": 4585 + }, + { + "epoch": 0.8165954415954416, + "grad_norm": 0.5773285627365112, + "learning_rate": 0.00018026182477615859, + "loss": 1.1468, + "step": 4586 + }, + { + "epoch": 0.8167735042735043, + "grad_norm": 0.5529203414916992, + "learning_rate": 0.00018025347459515895, + "loss": 1.0815, + "step": 4587 + }, + { + "epoch": 0.8169515669515669, + "grad_norm": 0.5449469685554504, + "learning_rate": 0.00018024512284175922, + "loss": 1.1637, + "step": 4588 + }, + { + "epoch": 0.8171296296296297, + "grad_norm": 0.5155341625213623, + "learning_rate": 0.00018023676951612298, + "loss": 1.1842, + "step": 4589 + }, + { + "epoch": 0.8173076923076923, + "grad_norm": 0.5569564700126648, + "learning_rate": 0.00018022841461841393, + "loss": 0.9254, + "step": 4590 + }, + { + "epoch": 0.8174857549857549, + "grad_norm": 0.45203131437301636, + "learning_rate": 0.00018022005814879573, + "loss": 0.9561, + "step": 4591 + }, + { + "epoch": 0.8176638176638177, + "grad_norm": 0.5735056400299072, + "learning_rate": 0.00018021170010743218, + "loss": 1.1402, + "step": 4592 + }, + { + "epoch": 0.8178418803418803, + "grad_norm": 0.6075260043144226, + "learning_rate": 0.00018020334049448697, + "loss": 0.8601, + "step": 4593 + }, + { + "epoch": 0.8180199430199431, + "grad_norm": 0.522682785987854, + "learning_rate": 0.0001801949793101239, + "loss": 1.0088, + "step": 4594 + }, + { + "epoch": 0.8181980056980057, + "grad_norm": 0.5648437142372131, + 
"learning_rate": 0.00018018661655450682, + "loss": 0.8359, + "step": 4595 + }, + { + "epoch": 0.8183760683760684, + "grad_norm": 0.5406472086906433, + "learning_rate": 0.00018017825222779954, + "loss": 1.1553, + "step": 4596 + }, + { + "epoch": 0.8185541310541311, + "grad_norm": 0.4917788803577423, + "learning_rate": 0.000180169886330166, + "loss": 1.2198, + "step": 4597 + }, + { + "epoch": 0.8187321937321937, + "grad_norm": 0.6293069124221802, + "learning_rate": 0.00018016151886177004, + "loss": 1.0245, + "step": 4598 + }, + { + "epoch": 0.8189102564102564, + "grad_norm": 0.47277843952178955, + "learning_rate": 0.00018015314982277564, + "loss": 1.1141, + "step": 4599 + }, + { + "epoch": 0.8190883190883191, + "grad_norm": 0.6132395267486572, + "learning_rate": 0.0001801447792133468, + "loss": 1.1227, + "step": 4600 + }, + { + "epoch": 0.8192663817663818, + "grad_norm": 0.46839597821235657, + "learning_rate": 0.00018013640703364747, + "loss": 0.9239, + "step": 4601 + }, + { + "epoch": 0.8194444444444444, + "grad_norm": 0.5055009722709656, + "learning_rate": 0.00018012803328384171, + "loss": 0.8486, + "step": 4602 + }, + { + "epoch": 0.8196225071225072, + "grad_norm": 0.5094841718673706, + "learning_rate": 0.00018011965796409362, + "loss": 0.9969, + "step": 4603 + }, + { + "epoch": 0.8198005698005698, + "grad_norm": 0.6177363395690918, + "learning_rate": 0.00018011128107456726, + "loss": 1.242, + "step": 4604 + }, + { + "epoch": 0.8199786324786325, + "grad_norm": 0.5280042290687561, + "learning_rate": 0.00018010290261542676, + "loss": 1.1569, + "step": 4605 + }, + { + "epoch": 0.8201566951566952, + "grad_norm": 0.5259367227554321, + "learning_rate": 0.00018009452258683625, + "loss": 0.9993, + "step": 4606 + }, + { + "epoch": 0.8203347578347578, + "grad_norm": 0.464469850063324, + "learning_rate": 0.00018008614098896, + "loss": 1.0288, + "step": 4607 + }, + { + "epoch": 0.8205128205128205, + "grad_norm": 0.6136324405670166, + "learning_rate": 0.00018007775782196214, + 
"loss": 1.1541, + "step": 4608 + }, + { + "epoch": 0.8206908831908832, + "grad_norm": 0.5376590490341187, + "learning_rate": 0.000180069373086007, + "loss": 1.0624, + "step": 4609 + }, + { + "epoch": 0.8208689458689459, + "grad_norm": 0.662916362285614, + "learning_rate": 0.0001800609867812588, + "loss": 1.1502, + "step": 4610 + }, + { + "epoch": 0.8210470085470085, + "grad_norm": 0.5153383612632751, + "learning_rate": 0.00018005259890788188, + "loss": 0.9789, + "step": 4611 + }, + { + "epoch": 0.8212250712250713, + "grad_norm": 0.5042359232902527, + "learning_rate": 0.00018004420946604057, + "loss": 0.9585, + "step": 4612 + }, + { + "epoch": 0.8214031339031339, + "grad_norm": 0.5395993590354919, + "learning_rate": 0.00018003581845589927, + "loss": 1.159, + "step": 4613 + }, + { + "epoch": 0.8215811965811965, + "grad_norm": 0.5561928749084473, + "learning_rate": 0.00018002742587762237, + "loss": 1.1604, + "step": 4614 + }, + { + "epoch": 0.8217592592592593, + "grad_norm": 0.5602710843086243, + "learning_rate": 0.00018001903173137432, + "loss": 0.9922, + "step": 4615 + }, + { + "epoch": 0.8219373219373219, + "grad_norm": 0.5529088377952576, + "learning_rate": 0.00018001063601731955, + "loss": 1.0943, + "step": 4616 + }, + { + "epoch": 0.8221153846153846, + "grad_norm": 0.5156456828117371, + "learning_rate": 0.00018000223873562254, + "loss": 1.1399, + "step": 4617 + }, + { + "epoch": 0.8222934472934473, + "grad_norm": 0.4868306517601013, + "learning_rate": 0.0001799938398864479, + "loss": 1.0692, + "step": 4618 + }, + { + "epoch": 0.82247150997151, + "grad_norm": 0.5372915267944336, + "learning_rate": 0.0001799854394699601, + "loss": 1.2675, + "step": 4619 + }, + { + "epoch": 0.8226495726495726, + "grad_norm": 0.6101839542388916, + "learning_rate": 0.0001799770374863238, + "loss": 0.9586, + "step": 4620 + }, + { + "epoch": 0.8228276353276354, + "grad_norm": 0.5034586787223816, + "learning_rate": 0.00017996863393570357, + "loss": 1.0885, + "step": 4621 + }, + { + 
"epoch": 0.823005698005698, + "grad_norm": 0.5608823299407959, + "learning_rate": 0.0001799602288182641, + "loss": 1.0002, + "step": 4622 + }, + { + "epoch": 0.8231837606837606, + "grad_norm": 0.5700048208236694, + "learning_rate": 0.00017995182213417, + "loss": 1.1484, + "step": 4623 + }, + { + "epoch": 0.8233618233618234, + "grad_norm": 0.5283229351043701, + "learning_rate": 0.00017994341388358608, + "loss": 1.0744, + "step": 4624 + }, + { + "epoch": 0.823539886039886, + "grad_norm": 0.5215758681297302, + "learning_rate": 0.00017993500406667703, + "loss": 1.2686, + "step": 4625 + }, + { + "epoch": 0.8237179487179487, + "grad_norm": 0.528883159160614, + "learning_rate": 0.0001799265926836076, + "loss": 1.1393, + "step": 4626 + }, + { + "epoch": 0.8238960113960114, + "grad_norm": 0.5589834451675415, + "learning_rate": 0.00017991817973454265, + "loss": 1.1744, + "step": 4627 + }, + { + "epoch": 0.8240740740740741, + "grad_norm": 0.49817174673080444, + "learning_rate": 0.00017990976521964697, + "loss": 1.0544, + "step": 4628 + }, + { + "epoch": 0.8242521367521367, + "grad_norm": 0.613961398601532, + "learning_rate": 0.00017990134913908542, + "loss": 1.0951, + "step": 4629 + }, + { + "epoch": 0.8244301994301995, + "grad_norm": 0.47278255224227905, + "learning_rate": 0.00017989293149302295, + "loss": 0.9742, + "step": 4630 + }, + { + "epoch": 0.8246082621082621, + "grad_norm": 0.49807092547416687, + "learning_rate": 0.00017988451228162443, + "loss": 1.0985, + "step": 4631 + }, + { + "epoch": 0.8247863247863247, + "grad_norm": 0.5624374747276306, + "learning_rate": 0.00017987609150505485, + "loss": 1.2446, + "step": 4632 + }, + { + "epoch": 0.8249643874643875, + "grad_norm": 0.4863535761833191, + "learning_rate": 0.00017986766916347916, + "loss": 1.0239, + "step": 4633 + }, + { + "epoch": 0.8251424501424501, + "grad_norm": 0.679585874080658, + "learning_rate": 0.00017985924525706245, + "loss": 1.1698, + "step": 4634 + }, + { + "epoch": 0.8253205128205128, + "grad_norm": 
0.5545455813407898, + "learning_rate": 0.00017985081978596967, + "loss": 1.0926, + "step": 4635 + }, + { + "epoch": 0.8254985754985755, + "grad_norm": 0.5303109288215637, + "learning_rate": 0.000179842392750366, + "loss": 1.0978, + "step": 4636 + }, + { + "epoch": 0.8256766381766382, + "grad_norm": 0.6053299307823181, + "learning_rate": 0.00017983396415041644, + "loss": 1.0596, + "step": 4637 + }, + { + "epoch": 0.8258547008547008, + "grad_norm": 0.5241885185241699, + "learning_rate": 0.00017982553398628625, + "loss": 0.8541, + "step": 4638 + }, + { + "epoch": 0.8260327635327636, + "grad_norm": 0.5934443473815918, + "learning_rate": 0.00017981710225814052, + "loss": 1.145, + "step": 4639 + }, + { + "epoch": 0.8262108262108262, + "grad_norm": 0.5341619849205017, + "learning_rate": 0.00017980866896614447, + "loss": 1.0745, + "step": 4640 + }, + { + "epoch": 0.8263888888888888, + "grad_norm": 0.6732913851737976, + "learning_rate": 0.00017980023411046336, + "loss": 1.0775, + "step": 4641 + }, + { + "epoch": 0.8265669515669516, + "grad_norm": 0.5134359002113342, + "learning_rate": 0.0001797917976912624, + "loss": 1.0298, + "step": 4642 + }, + { + "epoch": 0.8267450142450142, + "grad_norm": 0.5234783887863159, + "learning_rate": 0.00017978335970870698, + "loss": 1.1069, + "step": 4643 + }, + { + "epoch": 0.8269230769230769, + "grad_norm": 0.4776439964771271, + "learning_rate": 0.00017977492016296232, + "loss": 0.6367, + "step": 4644 + }, + { + "epoch": 0.8271011396011396, + "grad_norm": 0.53763347864151, + "learning_rate": 0.0001797664790541938, + "loss": 1.1356, + "step": 4645 + }, + { + "epoch": 0.8272792022792023, + "grad_norm": 0.5082212686538696, + "learning_rate": 0.00017975803638256682, + "loss": 0.7873, + "step": 4646 + }, + { + "epoch": 0.8274572649572649, + "grad_norm": 0.5156424641609192, + "learning_rate": 0.00017974959214824685, + "loss": 1.084, + "step": 4647 + }, + { + "epoch": 0.8276353276353277, + "grad_norm": 0.5275198817253113, + "learning_rate": 
0.00017974114635139926, + "loss": 1.1219, + "step": 4648 + }, + { + "epoch": 0.8278133903133903, + "grad_norm": 0.5548223257064819, + "learning_rate": 0.00017973269899218956, + "loss": 1.0808, + "step": 4649 + }, + { + "epoch": 0.8279914529914529, + "grad_norm": 0.535347580909729, + "learning_rate": 0.00017972425007078323, + "loss": 1.1211, + "step": 4650 + }, + { + "epoch": 0.8281695156695157, + "grad_norm": 0.5299580693244934, + "learning_rate": 0.00017971579958734587, + "loss": 0.9911, + "step": 4651 + }, + { + "epoch": 0.8283475783475783, + "grad_norm": 0.4863550066947937, + "learning_rate": 0.000179707347542043, + "loss": 0.9122, + "step": 4652 + }, + { + "epoch": 0.8285256410256411, + "grad_norm": 0.5284972190856934, + "learning_rate": 0.00017969889393504022, + "loss": 1.0424, + "step": 4653 + }, + { + "epoch": 0.8287037037037037, + "grad_norm": 0.5305661559104919, + "learning_rate": 0.00017969043876650317, + "loss": 1.1122, + "step": 4654 + }, + { + "epoch": 0.8288817663817664, + "grad_norm": 0.5645657777786255, + "learning_rate": 0.00017968198203659755, + "loss": 1.2195, + "step": 4655 + }, + { + "epoch": 0.8290598290598291, + "grad_norm": 0.521649181842804, + "learning_rate": 0.000179673523745489, + "loss": 1.2684, + "step": 4656 + }, + { + "epoch": 0.8292378917378918, + "grad_norm": 0.5984422564506531, + "learning_rate": 0.00017966506389334322, + "loss": 0.9894, + "step": 4657 + }, + { + "epoch": 0.8294159544159544, + "grad_norm": 0.5318729281425476, + "learning_rate": 0.00017965660248032603, + "loss": 1.2929, + "step": 4658 + }, + { + "epoch": 0.8295940170940171, + "grad_norm": 0.4666081368923187, + "learning_rate": 0.0001796481395066032, + "loss": 0.9646, + "step": 4659 + }, + { + "epoch": 0.8297720797720798, + "grad_norm": 0.5780388116836548, + "learning_rate": 0.00017963967497234054, + "loss": 1.1043, + "step": 4660 + }, + { + "epoch": 0.8299501424501424, + "grad_norm": 0.44089245796203613, + "learning_rate": 0.00017963120887770387, + "loss": 0.8932, 
+ "step": 4661 + }, + { + "epoch": 0.8301282051282052, + "grad_norm": 0.5198349356651306, + "learning_rate": 0.0001796227412228591, + "loss": 0.9378, + "step": 4662 + }, + { + "epoch": 0.8303062678062678, + "grad_norm": 0.5298343896865845, + "learning_rate": 0.00017961427200797206, + "loss": 1.0272, + "step": 4663 + }, + { + "epoch": 0.8304843304843305, + "grad_norm": 0.5087099671363831, + "learning_rate": 0.0001796058012332088, + "loss": 0.989, + "step": 4664 + }, + { + "epoch": 0.8306623931623932, + "grad_norm": 0.504228949546814, + "learning_rate": 0.0001795973288987352, + "loss": 1.0134, + "step": 4665 + }, + { + "epoch": 0.8308404558404558, + "grad_norm": 0.6788033843040466, + "learning_rate": 0.00017958885500471728, + "loss": 0.8856, + "step": 4666 + }, + { + "epoch": 0.8310185185185185, + "grad_norm": 0.5166172385215759, + "learning_rate": 0.00017958037955132113, + "loss": 0.8711, + "step": 4667 + }, + { + "epoch": 0.8311965811965812, + "grad_norm": 0.5712400078773499, + "learning_rate": 0.00017957190253871272, + "loss": 1.0418, + "step": 4668 + }, + { + "epoch": 0.8313746438746439, + "grad_norm": 0.5531231164932251, + "learning_rate": 0.0001795634239670582, + "loss": 0.9021, + "step": 4669 + }, + { + "epoch": 0.8315527065527065, + "grad_norm": 0.6165615916252136, + "learning_rate": 0.00017955494383652365, + "loss": 1.0927, + "step": 4670 + }, + { + "epoch": 0.8317307692307693, + "grad_norm": 0.5920368432998657, + "learning_rate": 0.00017954646214727525, + "loss": 1.231, + "step": 4671 + }, + { + "epoch": 0.8319088319088319, + "grad_norm": 0.5037244558334351, + "learning_rate": 0.00017953797889947915, + "loss": 0.85, + "step": 4672 + }, + { + "epoch": 0.8320868945868946, + "grad_norm": 0.5618211627006531, + "learning_rate": 0.0001795294940933016, + "loss": 1.145, + "step": 4673 + }, + { + "epoch": 0.8322649572649573, + "grad_norm": 0.6275593042373657, + "learning_rate": 0.00017952100772890877, + "loss": 0.9061, + "step": 4674 + }, + { + "epoch": 
0.83244301994302, + "grad_norm": 0.5376096367835999, + "learning_rate": 0.00017951251980646702, + "loss": 1.1948, + "step": 4675 + }, + { + "epoch": 0.8326210826210826, + "grad_norm": 0.5162268877029419, + "learning_rate": 0.0001795040303261426, + "loss": 1.2158, + "step": 4676 + }, + { + "epoch": 0.8327991452991453, + "grad_norm": 0.5730512142181396, + "learning_rate": 0.0001794955392881019, + "loss": 0.9962, + "step": 4677 + }, + { + "epoch": 0.832977207977208, + "grad_norm": 0.5128712058067322, + "learning_rate": 0.00017948704669251122, + "loss": 1.2797, + "step": 4678 + }, + { + "epoch": 0.8331552706552706, + "grad_norm": 0.5173979997634888, + "learning_rate": 0.00017947855253953697, + "loss": 1.1093, + "step": 4679 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.504646897315979, + "learning_rate": 0.0001794700568293456, + "loss": 1.3171, + "step": 4680 + }, + { + "epoch": 0.833511396011396, + "grad_norm": 0.5638105869293213, + "learning_rate": 0.00017946155956210356, + "loss": 0.9224, + "step": 4681 + }, + { + "epoch": 0.8336894586894587, + "grad_norm": 0.5289680361747742, + "learning_rate": 0.00017945306073797733, + "loss": 0.8919, + "step": 4682 + }, + { + "epoch": 0.8338675213675214, + "grad_norm": 0.5224629044532776, + "learning_rate": 0.0001794445603571334, + "loss": 1.0345, + "step": 4683 + }, + { + "epoch": 0.834045584045584, + "grad_norm": 0.5342282056808472, + "learning_rate": 0.00017943605841973836, + "loss": 1.2305, + "step": 4684 + }, + { + "epoch": 0.8342236467236467, + "grad_norm": 0.6118032336235046, + "learning_rate": 0.00017942755492595874, + "loss": 1.0316, + "step": 4685 + }, + { + "epoch": 0.8344017094017094, + "grad_norm": 0.49112311005592346, + "learning_rate": 0.00017941904987596121, + "loss": 0.9809, + "step": 4686 + }, + { + "epoch": 0.8345797720797721, + "grad_norm": 0.5044063925743103, + "learning_rate": 0.0001794105432699124, + "loss": 0.834, + "step": 4687 + }, + { + "epoch": 0.8347578347578347, + "grad_norm": 
0.4849987328052521, + "learning_rate": 0.00017940203510797892, + "loss": 0.9971, + "step": 4688 + }, + { + "epoch": 0.8349358974358975, + "grad_norm": 0.5539469122886658, + "learning_rate": 0.00017939352539032748, + "loss": 1.1599, + "step": 4689 + }, + { + "epoch": 0.8351139601139601, + "grad_norm": 0.5474258065223694, + "learning_rate": 0.00017938501411712485, + "loss": 1.25, + "step": 4690 + }, + { + "epoch": 0.8352920227920227, + "grad_norm": 0.4880213737487793, + "learning_rate": 0.0001793765012885378, + "loss": 1.1471, + "step": 4691 + }, + { + "epoch": 0.8354700854700855, + "grad_norm": 0.5602759718894958, + "learning_rate": 0.00017936798690473309, + "loss": 1.0723, + "step": 4692 + }, + { + "epoch": 0.8356481481481481, + "grad_norm": 0.627775251865387, + "learning_rate": 0.00017935947096587755, + "loss": 1.3768, + "step": 4693 + }, + { + "epoch": 0.8358262108262108, + "grad_norm": 0.5324847102165222, + "learning_rate": 0.00017935095347213804, + "loss": 0.9945, + "step": 4694 + }, + { + "epoch": 0.8360042735042735, + "grad_norm": 0.5244048237800598, + "learning_rate": 0.0001793424344236814, + "loss": 1.1725, + "step": 4695 + }, + { + "epoch": 0.8361823361823362, + "grad_norm": 0.5420708656311035, + "learning_rate": 0.00017933391382067462, + "loss": 1.1267, + "step": 4696 + }, + { + "epoch": 0.8363603988603988, + "grad_norm": 0.5285456776618958, + "learning_rate": 0.00017932539166328458, + "loss": 1.0368, + "step": 4697 + }, + { + "epoch": 0.8365384615384616, + "grad_norm": 0.5330373048782349, + "learning_rate": 0.00017931686795167825, + "loss": 1.1082, + "step": 4698 + }, + { + "epoch": 0.8367165242165242, + "grad_norm": 0.5516682267189026, + "learning_rate": 0.0001793083426860227, + "loss": 1.1833, + "step": 4699 + }, + { + "epoch": 0.8368945868945868, + "grad_norm": 0.5229935646057129, + "learning_rate": 0.0001792998158664849, + "loss": 0.8527, + "step": 4700 + }, + { + "epoch": 0.8370726495726496, + "grad_norm": 0.4821490943431854, + "learning_rate": 
0.00017929128749323195, + "loss": 1.1201, + "step": 4701 + }, + { + "epoch": 0.8372507122507122, + "grad_norm": 0.6276404857635498, + "learning_rate": 0.0001792827575664309, + "loss": 1.0986, + "step": 4702 + }, + { + "epoch": 0.8374287749287749, + "grad_norm": 0.5681334733963013, + "learning_rate": 0.00017927422608624897, + "loss": 1.3821, + "step": 4703 + }, + { + "epoch": 0.8376068376068376, + "grad_norm": 0.5257087349891663, + "learning_rate": 0.00017926569305285324, + "loss": 1.1033, + "step": 4704 + }, + { + "epoch": 0.8377849002849003, + "grad_norm": 0.5665168166160583, + "learning_rate": 0.0001792571584664109, + "loss": 1.104, + "step": 4705 + }, + { + "epoch": 0.8379629629629629, + "grad_norm": 0.5202076435089111, + "learning_rate": 0.00017924862232708918, + "loss": 1.052, + "step": 4706 + }, + { + "epoch": 0.8381410256410257, + "grad_norm": 0.5103010535240173, + "learning_rate": 0.00017924008463505534, + "loss": 1.1348, + "step": 4707 + }, + { + "epoch": 0.8383190883190883, + "grad_norm": 0.6811865568161011, + "learning_rate": 0.00017923154539047667, + "loss": 1.2804, + "step": 4708 + }, + { + "epoch": 0.8384971509971509, + "grad_norm": 0.46808311343193054, + "learning_rate": 0.00017922300459352042, + "loss": 0.9302, + "step": 4709 + }, + { + "epoch": 0.8386752136752137, + "grad_norm": 0.47713059186935425, + "learning_rate": 0.00017921446224435398, + "loss": 0.78, + "step": 4710 + }, + { + "epoch": 0.8388532763532763, + "grad_norm": 0.7579890489578247, + "learning_rate": 0.0001792059183431447, + "loss": 1.4776, + "step": 4711 + }, + { + "epoch": 0.8390313390313391, + "grad_norm": 0.6009423136711121, + "learning_rate": 0.00017919737289006, + "loss": 1.2679, + "step": 4712 + }, + { + "epoch": 0.8392094017094017, + "grad_norm": 0.56390780210495, + "learning_rate": 0.00017918882588526729, + "loss": 1.0402, + "step": 4713 + }, + { + "epoch": 0.8393874643874644, + "grad_norm": 0.5698862075805664, + "learning_rate": 0.00017918027732893404, + "loss": 1.2336, + 
"step": 4714 + }, + { + "epoch": 0.8395655270655271, + "grad_norm": 0.5016305446624756, + "learning_rate": 0.0001791717272212277, + "loss": 1.0373, + "step": 4715 + }, + { + "epoch": 0.8397435897435898, + "grad_norm": 0.5886971950531006, + "learning_rate": 0.0001791631755623159, + "loss": 1.1062, + "step": 4716 + }, + { + "epoch": 0.8399216524216524, + "grad_norm": 0.647833526134491, + "learning_rate": 0.00017915462235236607, + "loss": 1.0464, + "step": 4717 + }, + { + "epoch": 0.8400997150997151, + "grad_norm": 0.4961194396018982, + "learning_rate": 0.00017914606759154587, + "loss": 1.0763, + "step": 4718 + }, + { + "epoch": 0.8402777777777778, + "grad_norm": 0.47041359543800354, + "learning_rate": 0.00017913751128002288, + "loss": 1.0685, + "step": 4719 + }, + { + "epoch": 0.8404558404558404, + "grad_norm": 0.5752858519554138, + "learning_rate": 0.00017912895341796475, + "loss": 1.0577, + "step": 4720 + }, + { + "epoch": 0.8406339031339032, + "grad_norm": 0.5233224034309387, + "learning_rate": 0.00017912039400553914, + "loss": 1.1484, + "step": 4721 + }, + { + "epoch": 0.8408119658119658, + "grad_norm": 0.5327485203742981, + "learning_rate": 0.00017911183304291378, + "loss": 1.0028, + "step": 4722 + }, + { + "epoch": 0.8409900284900285, + "grad_norm": 0.5320752263069153, + "learning_rate": 0.00017910327053025638, + "loss": 1.1247, + "step": 4723 + }, + { + "epoch": 0.8411680911680912, + "grad_norm": 0.529617965221405, + "learning_rate": 0.00017909470646773477, + "loss": 1.1698, + "step": 4724 + }, + { + "epoch": 0.8413461538461539, + "grad_norm": 0.5055609345436096, + "learning_rate": 0.00017908614085551664, + "loss": 1.0925, + "step": 4725 + }, + { + "epoch": 0.8415242165242165, + "grad_norm": 0.5356255769729614, + "learning_rate": 0.00017907757369376985, + "loss": 1.0354, + "step": 4726 + }, + { + "epoch": 0.8417022792022792, + "grad_norm": 0.582834780216217, + "learning_rate": 0.00017906900498266233, + "loss": 1.1248, + "step": 4727 + }, + { + "epoch": 
0.8418803418803419, + "grad_norm": 0.5750834941864014, + "learning_rate": 0.00017906043472236188, + "loss": 1.0119, + "step": 4728 + }, + { + "epoch": 0.8420584045584045, + "grad_norm": 0.5923320055007935, + "learning_rate": 0.00017905186291303644, + "loss": 1.0662, + "step": 4729 + }, + { + "epoch": 0.8422364672364673, + "grad_norm": 0.4767811894416809, + "learning_rate": 0.00017904328955485396, + "loss": 1.0911, + "step": 4730 + }, + { + "epoch": 0.8424145299145299, + "grad_norm": 0.5294556021690369, + "learning_rate": 0.00017903471464798245, + "loss": 1.2861, + "step": 4731 + }, + { + "epoch": 0.8425925925925926, + "grad_norm": 0.599117636680603, + "learning_rate": 0.00017902613819258985, + "loss": 1.1707, + "step": 4732 + }, + { + "epoch": 0.8427706552706553, + "grad_norm": 0.5912977457046509, + "learning_rate": 0.00017901756018884424, + "loss": 1.1884, + "step": 4733 + }, + { + "epoch": 0.842948717948718, + "grad_norm": 0.587676465511322, + "learning_rate": 0.0001790089806369137, + "loss": 1.1054, + "step": 4734 + }, + { + "epoch": 0.8431267806267806, + "grad_norm": 0.6271800398826599, + "learning_rate": 0.0001790003995369663, + "loss": 1.2094, + "step": 4735 + }, + { + "epoch": 0.8433048433048433, + "grad_norm": 0.47198590636253357, + "learning_rate": 0.00017899181688917017, + "loss": 0.9561, + "step": 4736 + }, + { + "epoch": 0.843482905982906, + "grad_norm": 0.690732479095459, + "learning_rate": 0.00017898323269369351, + "loss": 1.1629, + "step": 4737 + }, + { + "epoch": 0.8436609686609686, + "grad_norm": 0.4926888048648834, + "learning_rate": 0.00017897464695070445, + "loss": 1.1097, + "step": 4738 + }, + { + "epoch": 0.8438390313390314, + "grad_norm": 0.7071278691291809, + "learning_rate": 0.00017896605966037128, + "loss": 1.195, + "step": 4739 + }, + { + "epoch": 0.844017094017094, + "grad_norm": 0.5650486350059509, + "learning_rate": 0.00017895747082286216, + "loss": 1.0107, + "step": 4740 + }, + { + "epoch": 0.8441951566951567, + "grad_norm": 
0.5291931629180908, + "learning_rate": 0.00017894888043834545, + "loss": 1.0104, + "step": 4741 + }, + { + "epoch": 0.8443732193732194, + "grad_norm": 0.5751241445541382, + "learning_rate": 0.00017894028850698942, + "loss": 1.2482, + "step": 4742 + }, + { + "epoch": 0.844551282051282, + "grad_norm": 0.5833632349967957, + "learning_rate": 0.0001789316950289624, + "loss": 1.0552, + "step": 4743 + }, + { + "epoch": 0.8447293447293447, + "grad_norm": 0.543729841709137, + "learning_rate": 0.00017892310000443282, + "loss": 1.1453, + "step": 4744 + }, + { + "epoch": 0.8449074074074074, + "grad_norm": 0.5674204230308533, + "learning_rate": 0.00017891450343356902, + "loss": 1.0757, + "step": 4745 + }, + { + "epoch": 0.8450854700854701, + "grad_norm": 0.5161892771720886, + "learning_rate": 0.00017890590531653946, + "loss": 1.1163, + "step": 4746 + }, + { + "epoch": 0.8452635327635327, + "grad_norm": 0.49907612800598145, + "learning_rate": 0.00017889730565351258, + "loss": 1.0356, + "step": 4747 + }, + { + "epoch": 0.8454415954415955, + "grad_norm": 0.4994732439517975, + "learning_rate": 0.00017888870444465692, + "loss": 1.026, + "step": 4748 + }, + { + "epoch": 0.8456196581196581, + "grad_norm": 0.6397520303726196, + "learning_rate": 0.00017888010169014095, + "loss": 0.957, + "step": 4749 + }, + { + "epoch": 0.8457977207977208, + "grad_norm": 0.5379729270935059, + "learning_rate": 0.00017887149739013327, + "loss": 1.1664, + "step": 4750 + }, + { + "epoch": 0.8459757834757835, + "grad_norm": 0.4487382769584656, + "learning_rate": 0.00017886289154480246, + "loss": 0.9377, + "step": 4751 + }, + { + "epoch": 0.8461538461538461, + "grad_norm": 0.5645943880081177, + "learning_rate": 0.00017885428415431707, + "loss": 1.273, + "step": 4752 + }, + { + "epoch": 0.8463319088319088, + "grad_norm": 0.5535289645195007, + "learning_rate": 0.00017884567521884577, + "loss": 1.1779, + "step": 4753 + }, + { + "epoch": 0.8465099715099715, + "grad_norm": 0.5039721131324768, + "learning_rate": 
0.0001788370647385573, + "loss": 1.0237, + "step": 4754 + }, + { + "epoch": 0.8466880341880342, + "grad_norm": 0.4543854892253876, + "learning_rate": 0.00017882845271362032, + "loss": 0.8149, + "step": 4755 + }, + { + "epoch": 0.8468660968660968, + "grad_norm": 0.5095639824867249, + "learning_rate": 0.00017881983914420352, + "loss": 1.0141, + "step": 4756 + }, + { + "epoch": 0.8470441595441596, + "grad_norm": 0.5341798663139343, + "learning_rate": 0.00017881122403047575, + "loss": 1.1885, + "step": 4757 + }, + { + "epoch": 0.8472222222222222, + "grad_norm": 0.5595062971115112, + "learning_rate": 0.00017880260737260573, + "loss": 0.8939, + "step": 4758 + }, + { + "epoch": 0.8474002849002849, + "grad_norm": 0.5355880260467529, + "learning_rate": 0.00017879398917076232, + "loss": 1.2434, + "step": 4759 + }, + { + "epoch": 0.8475783475783476, + "grad_norm": 0.49477261304855347, + "learning_rate": 0.0001787853694251144, + "loss": 0.979, + "step": 4760 + }, + { + "epoch": 0.8477564102564102, + "grad_norm": 0.5154359340667725, + "learning_rate": 0.00017877674813583078, + "loss": 1.0957, + "step": 4761 + }, + { + "epoch": 0.8479344729344729, + "grad_norm": 0.5651070475578308, + "learning_rate": 0.00017876812530308046, + "loss": 1.1884, + "step": 4762 + }, + { + "epoch": 0.8481125356125356, + "grad_norm": 0.537277340888977, + "learning_rate": 0.00017875950092703232, + "loss": 1.0272, + "step": 4763 + }, + { + "epoch": 0.8482905982905983, + "grad_norm": 0.5259691476821899, + "learning_rate": 0.00017875087500785538, + "loss": 1.1493, + "step": 4764 + }, + { + "epoch": 0.8484686609686609, + "grad_norm": 0.5491300225257874, + "learning_rate": 0.00017874224754571867, + "loss": 0.8316, + "step": 4765 + }, + { + "epoch": 0.8486467236467237, + "grad_norm": 0.5493744611740112, + "learning_rate": 0.00017873361854079116, + "loss": 1.2328, + "step": 4766 + }, + { + "epoch": 0.8488247863247863, + "grad_norm": 0.571002185344696, + "learning_rate": 0.00017872498799324197, + "loss": 
1.1384, + "step": 4767 + }, + { + "epoch": 0.8490028490028491, + "grad_norm": 0.538152813911438, + "learning_rate": 0.00017871635590324013, + "loss": 1.0581, + "step": 4768 + }, + { + "epoch": 0.8491809116809117, + "grad_norm": 0.5214923620223999, + "learning_rate": 0.00017870772227095486, + "loss": 1.0612, + "step": 4769 + }, + { + "epoch": 0.8493589743589743, + "grad_norm": 0.5714883804321289, + "learning_rate": 0.0001786990870965553, + "loss": 0.9076, + "step": 4770 + }, + { + "epoch": 0.8495370370370371, + "grad_norm": 0.4181775450706482, + "learning_rate": 0.00017869045038021054, + "loss": 0.8366, + "step": 4771 + }, + { + "epoch": 0.8497150997150997, + "grad_norm": 0.6266027688980103, + "learning_rate": 0.00017868181212208993, + "loss": 1.2047, + "step": 4772 + }, + { + "epoch": 0.8498931623931624, + "grad_norm": 0.5423732399940491, + "learning_rate": 0.0001786731723223626, + "loss": 1.3878, + "step": 4773 + }, + { + "epoch": 0.8500712250712251, + "grad_norm": 0.5512300133705139, + "learning_rate": 0.00017866453098119793, + "loss": 1.1132, + "step": 4774 + }, + { + "epoch": 0.8502492877492878, + "grad_norm": 0.5767185688018799, + "learning_rate": 0.00017865588809876519, + "loss": 0.97, + "step": 4775 + }, + { + "epoch": 0.8504273504273504, + "grad_norm": 0.5305790305137634, + "learning_rate": 0.00017864724367523368, + "loss": 1.1158, + "step": 4776 + }, + { + "epoch": 0.8506054131054132, + "grad_norm": 0.49702391028404236, + "learning_rate": 0.00017863859771077284, + "loss": 0.9669, + "step": 4777 + }, + { + "epoch": 0.8507834757834758, + "grad_norm": 0.5490063428878784, + "learning_rate": 0.00017862995020555205, + "loss": 1.0646, + "step": 4778 + }, + { + "epoch": 0.8509615384615384, + "grad_norm": 0.5308689475059509, + "learning_rate": 0.00017862130115974068, + "loss": 0.8922, + "step": 4779 + }, + { + "epoch": 0.8511396011396012, + "grad_norm": 0.5412983894348145, + "learning_rate": 0.00017861265057350826, + "loss": 1.1444, + "step": 4780 + }, + { + 
"epoch": 0.8513176638176638, + "grad_norm": 0.5857377052307129, + "learning_rate": 0.00017860399844702425, + "loss": 1.1643, + "step": 4781 + }, + { + "epoch": 0.8514957264957265, + "grad_norm": 0.599273681640625, + "learning_rate": 0.00017859534478045815, + "loss": 1.169, + "step": 4782 + }, + { + "epoch": 0.8516737891737892, + "grad_norm": 0.5677087903022766, + "learning_rate": 0.00017858668957397957, + "loss": 1.0793, + "step": 4783 + }, + { + "epoch": 0.8518518518518519, + "grad_norm": 0.5648362636566162, + "learning_rate": 0.00017857803282775807, + "loss": 1.1932, + "step": 4784 + }, + { + "epoch": 0.8520299145299145, + "grad_norm": 0.5138826966285706, + "learning_rate": 0.00017856937454196323, + "loss": 1.0011, + "step": 4785 + }, + { + "epoch": 0.8522079772079773, + "grad_norm": 0.5951429009437561, + "learning_rate": 0.0001785607147167647, + "loss": 1.3198, + "step": 4786 + }, + { + "epoch": 0.8523860398860399, + "grad_norm": 0.5341953039169312, + "learning_rate": 0.00017855205335233216, + "loss": 0.9094, + "step": 4787 + }, + { + "epoch": 0.8525641025641025, + "grad_norm": 0.5193579196929932, + "learning_rate": 0.00017854339044883535, + "loss": 0.892, + "step": 4788 + }, + { + "epoch": 0.8527421652421653, + "grad_norm": 0.5053097009658813, + "learning_rate": 0.00017853472600644392, + "loss": 1.0589, + "step": 4789 + }, + { + "epoch": 0.8529202279202279, + "grad_norm": 0.5819617509841919, + "learning_rate": 0.0001785260600253277, + "loss": 1.2646, + "step": 4790 + }, + { + "epoch": 0.8530982905982906, + "grad_norm": 0.5327470302581787, + "learning_rate": 0.00017851739250565645, + "loss": 1.056, + "step": 4791 + }, + { + "epoch": 0.8532763532763533, + "grad_norm": 0.5131269097328186, + "learning_rate": 0.0001785087234476, + "loss": 1.1192, + "step": 4792 + }, + { + "epoch": 0.853454415954416, + "grad_norm": 0.4698086977005005, + "learning_rate": 0.00017850005285132821, + "loss": 0.9849, + "step": 4793 + }, + { + "epoch": 0.8536324786324786, + "grad_norm": 
0.5503947734832764, + "learning_rate": 0.00017849138071701092, + "loss": 1.1139, + "step": 4794 + }, + { + "epoch": 0.8538105413105413, + "grad_norm": 0.5120903849601746, + "learning_rate": 0.0001784827070448181, + "loss": 0.9801, + "step": 4795 + }, + { + "epoch": 0.853988603988604, + "grad_norm": 0.47650405764579773, + "learning_rate": 0.00017847403183491968, + "loss": 1.0268, + "step": 4796 + }, + { + "epoch": 0.8541666666666666, + "grad_norm": 0.5773387551307678, + "learning_rate": 0.0001784653550874856, + "loss": 1.0336, + "step": 4797 + }, + { + "epoch": 0.8543447293447294, + "grad_norm": 0.545531153678894, + "learning_rate": 0.00017845667680268593, + "loss": 1.0532, + "step": 4798 + }, + { + "epoch": 0.854522792022792, + "grad_norm": 0.533161461353302, + "learning_rate": 0.0001784479969806906, + "loss": 1.1964, + "step": 4799 + }, + { + "epoch": 0.8547008547008547, + "grad_norm": 0.5880789160728455, + "learning_rate": 0.00017843931562166977, + "loss": 1.1588, + "step": 4800 + }, + { + "epoch": 0.8548789173789174, + "grad_norm": 0.5381524562835693, + "learning_rate": 0.00017843063272579346, + "loss": 1.1533, + "step": 4801 + }, + { + "epoch": 0.85505698005698, + "grad_norm": 0.6280176639556885, + "learning_rate": 0.00017842194829323187, + "loss": 1.0084, + "step": 4802 + }, + { + "epoch": 0.8552350427350427, + "grad_norm": 0.5098552703857422, + "learning_rate": 0.0001784132623241551, + "loss": 1.0804, + "step": 4803 + }, + { + "epoch": 0.8554131054131054, + "grad_norm": 0.5406526923179626, + "learning_rate": 0.00017840457481873328, + "loss": 1.2571, + "step": 4804 + }, + { + "epoch": 0.8555911680911681, + "grad_norm": 0.5859003663063049, + "learning_rate": 0.00017839588577713678, + "loss": 1.2462, + "step": 4805 + }, + { + "epoch": 0.8557692307692307, + "grad_norm": 0.6209002137184143, + "learning_rate": 0.00017838719519953572, + "loss": 1.307, + "step": 4806 + }, + { + "epoch": 0.8559472934472935, + "grad_norm": 0.525753915309906, + "learning_rate": 
0.00017837850308610037, + "loss": 1.2957, + "step": 4807 + }, + { + "epoch": 0.8561253561253561, + "grad_norm": 0.5096195340156555, + "learning_rate": 0.0001783698094370011, + "loss": 1.1433, + "step": 4808 + }, + { + "epoch": 0.8563034188034188, + "grad_norm": 0.5873076915740967, + "learning_rate": 0.0001783611142524082, + "loss": 1.2271, + "step": 4809 + }, + { + "epoch": 0.8564814814814815, + "grad_norm": 0.5093944668769836, + "learning_rate": 0.0001783524175324921, + "loss": 0.8788, + "step": 4810 + }, + { + "epoch": 0.8566595441595442, + "grad_norm": 0.5485084652900696, + "learning_rate": 0.00017834371927742307, + "loss": 1.256, + "step": 4811 + }, + { + "epoch": 0.8568376068376068, + "grad_norm": 0.5808873772621155, + "learning_rate": 0.00017833501948737163, + "loss": 0.9287, + "step": 4812 + }, + { + "epoch": 0.8570156695156695, + "grad_norm": 0.5113978385925293, + "learning_rate": 0.00017832631816250822, + "loss": 1.0372, + "step": 4813 + }, + { + "epoch": 0.8571937321937322, + "grad_norm": 0.5877016186714172, + "learning_rate": 0.0001783176153030033, + "loss": 1.3023, + "step": 4814 + }, + { + "epoch": 0.8573717948717948, + "grad_norm": 0.534328043460846, + "learning_rate": 0.00017830891090902742, + "loss": 1.1023, + "step": 4815 + }, + { + "epoch": 0.8575498575498576, + "grad_norm": 0.5781638026237488, + "learning_rate": 0.0001783002049807511, + "loss": 0.9562, + "step": 4816 + }, + { + "epoch": 0.8577279202279202, + "grad_norm": 0.5760263204574585, + "learning_rate": 0.00017829149751834487, + "loss": 0.8733, + "step": 4817 + }, + { + "epoch": 0.8579059829059829, + "grad_norm": 0.3887255787849426, + "learning_rate": 0.00017828278852197944, + "loss": 0.5949, + "step": 4818 + }, + { + "epoch": 0.8580840455840456, + "grad_norm": 0.47814446687698364, + "learning_rate": 0.00017827407799182537, + "loss": 1.0698, + "step": 4819 + }, + { + "epoch": 0.8582621082621082, + "grad_norm": 0.5520272254943848, + "learning_rate": 0.00017826536592805334, + "loss": 1.1314, 
+ "step": 4820 + }, + { + "epoch": 0.8584401709401709, + "grad_norm": 0.5285319685935974, + "learning_rate": 0.00017825665233083405, + "loss": 1.1618, + "step": 4821 + }, + { + "epoch": 0.8586182336182336, + "grad_norm": 0.6080102324485779, + "learning_rate": 0.0001782479372003382, + "loss": 1.3817, + "step": 4822 + }, + { + "epoch": 0.8587962962962963, + "grad_norm": 0.7474410533905029, + "learning_rate": 0.00017823922053673662, + "loss": 1.1321, + "step": 4823 + }, + { + "epoch": 0.8589743589743589, + "grad_norm": 0.559283435344696, + "learning_rate": 0.0001782305023402, + "loss": 1.1894, + "step": 4824 + }, + { + "epoch": 0.8591524216524217, + "grad_norm": 0.5620571374893188, + "learning_rate": 0.00017822178261089918, + "loss": 1.134, + "step": 4825 + }, + { + "epoch": 0.8593304843304843, + "grad_norm": 0.5553044676780701, + "learning_rate": 0.00017821306134900504, + "loss": 1.3222, + "step": 4826 + }, + { + "epoch": 0.8595085470085471, + "grad_norm": 0.6177778244018555, + "learning_rate": 0.00017820433855468846, + "loss": 1.2545, + "step": 4827 + }, + { + "epoch": 0.8596866096866097, + "grad_norm": 0.656233012676239, + "learning_rate": 0.0001781956142281203, + "loss": 1.1346, + "step": 4828 + }, + { + "epoch": 0.8598646723646723, + "grad_norm": 0.6710973381996155, + "learning_rate": 0.0001781868883694715, + "loss": 1.1361, + "step": 4829 + }, + { + "epoch": 0.8600427350427351, + "grad_norm": 0.5093601942062378, + "learning_rate": 0.0001781781609789131, + "loss": 1.0509, + "step": 4830 + }, + { + "epoch": 0.8602207977207977, + "grad_norm": 0.5707578063011169, + "learning_rate": 0.00017816943205661598, + "loss": 1.0964, + "step": 4831 + }, + { + "epoch": 0.8603988603988604, + "grad_norm": 0.6159597635269165, + "learning_rate": 0.00017816070160275125, + "loss": 1.0322, + "step": 4832 + }, + { + "epoch": 0.8605769230769231, + "grad_norm": 0.5430580377578735, + "learning_rate": 0.0001781519696174899, + "loss": 1.2464, + "step": 4833 + }, + { + "epoch": 
0.8607549857549858, + "grad_norm": 0.48104700446128845, + "learning_rate": 0.0001781432361010031, + "loss": 1.1031, + "step": 4834 + }, + { + "epoch": 0.8609330484330484, + "grad_norm": 0.5304946303367615, + "learning_rate": 0.0001781345010534619, + "loss": 1.0281, + "step": 4835 + }, + { + "epoch": 0.8611111111111112, + "grad_norm": 0.5230711698532104, + "learning_rate": 0.00017812576447503742, + "loss": 0.9499, + "step": 4836 + }, + { + "epoch": 0.8612891737891738, + "grad_norm": 0.5363606214523315, + "learning_rate": 0.00017811702636590093, + "loss": 1.1358, + "step": 4837 + }, + { + "epoch": 0.8614672364672364, + "grad_norm": 0.5880044102668762, + "learning_rate": 0.00017810828672622358, + "loss": 1.1765, + "step": 4838 + }, + { + "epoch": 0.8616452991452992, + "grad_norm": 0.5194395184516907, + "learning_rate": 0.0001780995455561766, + "loss": 1.1622, + "step": 4839 + }, + { + "epoch": 0.8618233618233618, + "grad_norm": 0.5114264488220215, + "learning_rate": 0.00017809080285593126, + "loss": 1.0081, + "step": 4840 + }, + { + "epoch": 0.8620014245014245, + "grad_norm": 0.6174240112304688, + "learning_rate": 0.00017808205862565886, + "loss": 1.0745, + "step": 4841 + }, + { + "epoch": 0.8621794871794872, + "grad_norm": 0.5662630200386047, + "learning_rate": 0.0001780733128655307, + "loss": 1.3369, + "step": 4842 + }, + { + "epoch": 0.8623575498575499, + "grad_norm": 0.5917882919311523, + "learning_rate": 0.00017806456557571817, + "loss": 1.1631, + "step": 4843 + }, + { + "epoch": 0.8625356125356125, + "grad_norm": 0.5305736660957336, + "learning_rate": 0.00017805581675639265, + "loss": 0.9875, + "step": 4844 + }, + { + "epoch": 0.8627136752136753, + "grad_norm": 0.5181219577789307, + "learning_rate": 0.00017804706640772556, + "loss": 0.9918, + "step": 4845 + }, + { + "epoch": 0.8628917378917379, + "grad_norm": 0.5467997789382935, + "learning_rate": 0.00017803831452988832, + "loss": 1.1395, + "step": 4846 + }, + { + "epoch": 0.8630698005698005, + "grad_norm": 
0.5494031310081482, + "learning_rate": 0.00017802956112305241, + "loss": 1.0312, + "step": 4847 + }, + { + "epoch": 0.8632478632478633, + "grad_norm": 0.5804065465927124, + "learning_rate": 0.00017802080618738931, + "loss": 1.1555, + "step": 4848 + }, + { + "epoch": 0.8634259259259259, + "grad_norm": 0.5424801111221313, + "learning_rate": 0.00017801204972307067, + "loss": 1.0215, + "step": 4849 + }, + { + "epoch": 0.8636039886039886, + "grad_norm": 0.5321891903877258, + "learning_rate": 0.0001780032917302679, + "loss": 1.0187, + "step": 4850 + }, + { + "epoch": 0.8637820512820513, + "grad_norm": 0.5543400049209595, + "learning_rate": 0.0001779945322091527, + "loss": 1.1972, + "step": 4851 + }, + { + "epoch": 0.863960113960114, + "grad_norm": 0.566649317741394, + "learning_rate": 0.00017798577115989668, + "loss": 1.0758, + "step": 4852 + }, + { + "epoch": 0.8641381766381766, + "grad_norm": 0.5538444519042969, + "learning_rate": 0.00017797700858267145, + "loss": 1.1338, + "step": 4853 + }, + { + "epoch": 0.8643162393162394, + "grad_norm": 0.5641313791275024, + "learning_rate": 0.0001779682444776487, + "loss": 1.256, + "step": 4854 + }, + { + "epoch": 0.864494301994302, + "grad_norm": 0.6377350091934204, + "learning_rate": 0.00017795947884500016, + "loss": 1.144, + "step": 4855 + }, + { + "epoch": 0.8646723646723646, + "grad_norm": 0.5581876039505005, + "learning_rate": 0.0001779507116848976, + "loss": 1.3163, + "step": 4856 + }, + { + "epoch": 0.8648504273504274, + "grad_norm": 0.5416772365570068, + "learning_rate": 0.0001779419429975128, + "loss": 1.0219, + "step": 4857 + }, + { + "epoch": 0.86502849002849, + "grad_norm": 0.5450608730316162, + "learning_rate": 0.0001779331727830175, + "loss": 1.0093, + "step": 4858 + }, + { + "epoch": 0.8652065527065527, + "grad_norm": 0.5151242017745972, + "learning_rate": 0.00017792440104158358, + "loss": 1.067, + "step": 4859 + }, + { + "epoch": 0.8653846153846154, + "grad_norm": 0.5225046873092651, + "learning_rate": 
0.0001779156277733829, + "loss": 1.0432, + "step": 4860 + }, + { + "epoch": 0.8655626780626781, + "grad_norm": 0.5168602466583252, + "learning_rate": 0.00017790685297858737, + "loss": 0.9665, + "step": 4861 + }, + { + "epoch": 0.8657407407407407, + "grad_norm": 0.5749059319496155, + "learning_rate": 0.00017789807665736889, + "loss": 1.1607, + "step": 4862 + }, + { + "epoch": 0.8659188034188035, + "grad_norm": 0.45656394958496094, + "learning_rate": 0.00017788929880989938, + "loss": 0.8362, + "step": 4863 + }, + { + "epoch": 0.8660968660968661, + "grad_norm": 0.5090615749359131, + "learning_rate": 0.00017788051943635086, + "loss": 0.9553, + "step": 4864 + }, + { + "epoch": 0.8662749287749287, + "grad_norm": 0.5381240248680115, + "learning_rate": 0.0001778717385368954, + "loss": 1.1391, + "step": 4865 + }, + { + "epoch": 0.8664529914529915, + "grad_norm": 0.522720456123352, + "learning_rate": 0.00017786295611170493, + "loss": 1.1869, + "step": 4866 + }, + { + "epoch": 0.8666310541310541, + "grad_norm": 0.530986487865448, + "learning_rate": 0.0001778541721609516, + "loss": 1.1046, + "step": 4867 + }, + { + "epoch": 0.8668091168091168, + "grad_norm": 0.5065864324569702, + "learning_rate": 0.0001778453866848075, + "loss": 1.008, + "step": 4868 + }, + { + "epoch": 0.8669871794871795, + "grad_norm": 0.5541394352912903, + "learning_rate": 0.00017783659968344476, + "loss": 1.0004, + "step": 4869 + }, + { + "epoch": 0.8671652421652422, + "grad_norm": 0.5059576630592346, + "learning_rate": 0.00017782781115703556, + "loss": 1.128, + "step": 4870 + }, + { + "epoch": 0.8673433048433048, + "grad_norm": 0.5052187442779541, + "learning_rate": 0.00017781902110575203, + "loss": 0.8544, + "step": 4871 + }, + { + "epoch": 0.8675213675213675, + "grad_norm": 0.5383397340774536, + "learning_rate": 0.00017781022952976646, + "loss": 1.1411, + "step": 4872 + }, + { + "epoch": 0.8676994301994302, + "grad_norm": 0.4760429859161377, + "learning_rate": 0.00017780143642925106, + "loss": 0.8246, + 
"step": 4873 + }, + { + "epoch": 0.8678774928774928, + "grad_norm": 0.5480535626411438, + "learning_rate": 0.00017779264180437817, + "loss": 1.013, + "step": 4874 + }, + { + "epoch": 0.8680555555555556, + "grad_norm": 0.5303317904472351, + "learning_rate": 0.00017778384565532004, + "loss": 1.0201, + "step": 4875 + }, + { + "epoch": 0.8682336182336182, + "grad_norm": 0.5365355014801025, + "learning_rate": 0.00017777504798224903, + "loss": 1.1107, + "step": 4876 + }, + { + "epoch": 0.8684116809116809, + "grad_norm": 0.5173360705375671, + "learning_rate": 0.00017776624878533754, + "loss": 1.0808, + "step": 4877 + }, + { + "epoch": 0.8685897435897436, + "grad_norm": 0.5088842511177063, + "learning_rate": 0.00017775744806475792, + "loss": 0.995, + "step": 4878 + }, + { + "epoch": 0.8687678062678063, + "grad_norm": 0.5796698927879333, + "learning_rate": 0.00017774864582068264, + "loss": 1.1485, + "step": 4879 + }, + { + "epoch": 0.8689458689458689, + "grad_norm": 0.5719375610351562, + "learning_rate": 0.00017773984205328417, + "loss": 1.0133, + "step": 4880 + }, + { + "epoch": 0.8691239316239316, + "grad_norm": 0.6396418213844299, + "learning_rate": 0.00017773103676273498, + "loss": 1.0932, + "step": 4881 + }, + { + "epoch": 0.8693019943019943, + "grad_norm": 0.5602468252182007, + "learning_rate": 0.00017772222994920763, + "loss": 0.9702, + "step": 4882 + }, + { + "epoch": 0.8694800569800569, + "grad_norm": 0.5167748332023621, + "learning_rate": 0.00017771342161287457, + "loss": 1.0528, + "step": 4883 + }, + { + "epoch": 0.8696581196581197, + "grad_norm": 0.5572916865348816, + "learning_rate": 0.00017770461175390848, + "loss": 1.1341, + "step": 4884 + }, + { + "epoch": 0.8698361823361823, + "grad_norm": 0.6666276454925537, + "learning_rate": 0.00017769580037248195, + "loss": 1.1948, + "step": 4885 + }, + { + "epoch": 0.8700142450142451, + "grad_norm": 0.5348601937294006, + "learning_rate": 0.0001776869874687676, + "loss": 1.0562, + "step": 4886 + }, + { + "epoch": 
0.8701923076923077, + "grad_norm": 0.5449648499488831, + "learning_rate": 0.00017767817304293812, + "loss": 0.988, + "step": 4887 + }, + { + "epoch": 0.8703703703703703, + "grad_norm": 0.5995045304298401, + "learning_rate": 0.0001776693570951662, + "loss": 1.2526, + "step": 4888 + }, + { + "epoch": 0.8705484330484331, + "grad_norm": 0.6575320959091187, + "learning_rate": 0.00017766053962562457, + "loss": 1.1717, + "step": 4889 + }, + { + "epoch": 0.8707264957264957, + "grad_norm": 0.5882139801979065, + "learning_rate": 0.00017765172063448597, + "loss": 1.238, + "step": 4890 + }, + { + "epoch": 0.8709045584045584, + "grad_norm": 0.5908389091491699, + "learning_rate": 0.00017764290012192325, + "loss": 1.0606, + "step": 4891 + }, + { + "epoch": 0.8710826210826211, + "grad_norm": 0.6169339418411255, + "learning_rate": 0.00017763407808810917, + "loss": 1.1456, + "step": 4892 + }, + { + "epoch": 0.8712606837606838, + "grad_norm": 0.5916035771369934, + "learning_rate": 0.0001776252545332166, + "loss": 1.0026, + "step": 4893 + }, + { + "epoch": 0.8714387464387464, + "grad_norm": 0.539995551109314, + "learning_rate": 0.00017761642945741843, + "loss": 1.2397, + "step": 4894 + }, + { + "epoch": 0.8716168091168092, + "grad_norm": 0.5346137881278992, + "learning_rate": 0.00017760760286088755, + "loss": 1.1232, + "step": 4895 + }, + { + "epoch": 0.8717948717948718, + "grad_norm": 0.570202112197876, + "learning_rate": 0.00017759877474379692, + "loss": 1.0708, + "step": 4896 + }, + { + "epoch": 0.8719729344729344, + "grad_norm": 0.5023398399353027, + "learning_rate": 0.00017758994510631948, + "loss": 1.1056, + "step": 4897 + }, + { + "epoch": 0.8721509971509972, + "grad_norm": 0.5447137951850891, + "learning_rate": 0.00017758111394862826, + "loss": 0.8776, + "step": 4898 + }, + { + "epoch": 0.8723290598290598, + "grad_norm": 0.5193906426429749, + "learning_rate": 0.00017757228127089625, + "loss": 0.9959, + "step": 4899 + }, + { + "epoch": 0.8725071225071225, + "grad_norm": 
0.5958787798881531, + "learning_rate": 0.00017756344707329656, + "loss": 1.092, + "step": 4900 + }, + { + "epoch": 0.8726851851851852, + "grad_norm": 0.521045982837677, + "learning_rate": 0.00017755461135600221, + "loss": 0.9864, + "step": 4901 + }, + { + "epoch": 0.8728632478632479, + "grad_norm": 0.5257635116577148, + "learning_rate": 0.00017754577411918638, + "loss": 1.216, + "step": 4902 + }, + { + "epoch": 0.8730413105413105, + "grad_norm": 0.5425964593887329, + "learning_rate": 0.0001775369353630222, + "loss": 1.1432, + "step": 4903 + }, + { + "epoch": 0.8732193732193733, + "grad_norm": 0.47995322942733765, + "learning_rate": 0.00017752809508768286, + "loss": 1.0227, + "step": 4904 + }, + { + "epoch": 0.8733974358974359, + "grad_norm": 0.5747429728507996, + "learning_rate": 0.0001775192532933415, + "loss": 0.9984, + "step": 4905 + }, + { + "epoch": 0.8735754985754985, + "grad_norm": 0.5745723247528076, + "learning_rate": 0.00017751040998017142, + "loss": 1.2559, + "step": 4906 + }, + { + "epoch": 0.8737535612535613, + "grad_norm": 0.6114141941070557, + "learning_rate": 0.0001775015651483459, + "loss": 1.3224, + "step": 4907 + }, + { + "epoch": 0.8739316239316239, + "grad_norm": 0.4757187068462372, + "learning_rate": 0.00017749271879803817, + "loss": 1.0352, + "step": 4908 + }, + { + "epoch": 0.8741096866096866, + "grad_norm": 0.48644450306892395, + "learning_rate": 0.0001774838709294216, + "loss": 1.0876, + "step": 4909 + }, + { + "epoch": 0.8742877492877493, + "grad_norm": 0.5652037262916565, + "learning_rate": 0.00017747502154266955, + "loss": 0.9189, + "step": 4910 + }, + { + "epoch": 0.874465811965812, + "grad_norm": 0.5289644002914429, + "learning_rate": 0.00017746617063795538, + "loss": 0.9431, + "step": 4911 + }, + { + "epoch": 0.8746438746438746, + "grad_norm": 0.594656229019165, + "learning_rate": 0.00017745731821545253, + "loss": 1.2408, + "step": 4912 + }, + { + "epoch": 0.8748219373219374, + "grad_norm": 0.5693240165710449, + "learning_rate": 
0.0001774484642753344, + "loss": 1.347, + "step": 4913 + }, + { + "epoch": 0.875, + "grad_norm": 0.5291008949279785, + "learning_rate": 0.00017743960881777456, + "loss": 1.161, + "step": 4914 + }, + { + "epoch": 0.8751780626780626, + "grad_norm": 0.5958300232887268, + "learning_rate": 0.00017743075184294642, + "loss": 1.2058, + "step": 4915 + }, + { + "epoch": 0.8753561253561254, + "grad_norm": 0.513884425163269, + "learning_rate": 0.00017742189335102354, + "loss": 1.0952, + "step": 4916 + }, + { + "epoch": 0.875534188034188, + "grad_norm": 0.5860681533813477, + "learning_rate": 0.00017741303334217948, + "loss": 1.1801, + "step": 4917 + }, + { + "epoch": 0.8757122507122507, + "grad_norm": 0.47962820529937744, + "learning_rate": 0.00017740417181658788, + "loss": 1.0785, + "step": 4918 + }, + { + "epoch": 0.8758903133903134, + "grad_norm": 0.5110440254211426, + "learning_rate": 0.00017739530877442227, + "loss": 1.1385, + "step": 4919 + }, + { + "epoch": 0.8760683760683761, + "grad_norm": 0.5106285214424133, + "learning_rate": 0.00017738644421585643, + "loss": 1.1204, + "step": 4920 + }, + { + "epoch": 0.8762464387464387, + "grad_norm": 0.5709205865859985, + "learning_rate": 0.00017737757814106393, + "loss": 1.0108, + "step": 4921 + }, + { + "epoch": 0.8764245014245015, + "grad_norm": 0.5850250124931335, + "learning_rate": 0.0001773687105502185, + "loss": 1.0059, + "step": 4922 + }, + { + "epoch": 0.8766025641025641, + "grad_norm": 0.5194727778434753, + "learning_rate": 0.00017735984144349396, + "loss": 0.9466, + "step": 4923 + }, + { + "epoch": 0.8767806267806267, + "grad_norm": 0.5246787667274475, + "learning_rate": 0.000177350970821064, + "loss": 1.1336, + "step": 4924 + }, + { + "epoch": 0.8769586894586895, + "grad_norm": 0.5798323154449463, + "learning_rate": 0.00017734209868310244, + "loss": 1.1641, + "step": 4925 + }, + { + "epoch": 0.8771367521367521, + "grad_norm": 0.5188565850257874, + "learning_rate": 0.00017733322502978314, + "loss": 0.9959, + "step": 4926 
+ }, + { + "epoch": 0.8773148148148148, + "grad_norm": 0.5969653725624084, + "learning_rate": 0.00017732434986127995, + "loss": 1.2162, + "step": 4927 + }, + { + "epoch": 0.8774928774928775, + "grad_norm": 0.5520089268684387, + "learning_rate": 0.00017731547317776674, + "loss": 1.0163, + "step": 4928 + }, + { + "epoch": 0.8776709401709402, + "grad_norm": 0.48789507150650024, + "learning_rate": 0.00017730659497941745, + "loss": 0.9757, + "step": 4929 + }, + { + "epoch": 0.8778490028490028, + "grad_norm": 0.6034960746765137, + "learning_rate": 0.000177297715266406, + "loss": 1.1278, + "step": 4930 + }, + { + "epoch": 0.8780270655270656, + "grad_norm": 0.53016597032547, + "learning_rate": 0.00017728883403890638, + "loss": 1.0637, + "step": 4931 + }, + { + "epoch": 0.8782051282051282, + "grad_norm": 0.5073726177215576, + "learning_rate": 0.00017727995129709266, + "loss": 1.1491, + "step": 4932 + }, + { + "epoch": 0.8783831908831908, + "grad_norm": 0.540605366230011, + "learning_rate": 0.00017727106704113878, + "loss": 1.0133, + "step": 4933 + }, + { + "epoch": 0.8785612535612536, + "grad_norm": 0.5346775054931641, + "learning_rate": 0.0001772621812712189, + "loss": 1.1781, + "step": 4934 + }, + { + "epoch": 0.8787393162393162, + "grad_norm": 0.5659036040306091, + "learning_rate": 0.00017725329398750702, + "loss": 1.1023, + "step": 4935 + }, + { + "epoch": 0.8789173789173789, + "grad_norm": 0.591063380241394, + "learning_rate": 0.00017724440519017738, + "loss": 1.0298, + "step": 4936 + }, + { + "epoch": 0.8790954415954416, + "grad_norm": 0.5173781514167786, + "learning_rate": 0.0001772355148794041, + "loss": 1.0483, + "step": 4937 + }, + { + "epoch": 0.8792735042735043, + "grad_norm": 0.5405352711677551, + "learning_rate": 0.0001772266230553613, + "loss": 1.0716, + "step": 4938 + }, + { + "epoch": 0.8794515669515669, + "grad_norm": 0.518442690372467, + "learning_rate": 0.00017721772971822323, + "loss": 1.1373, + "step": 4939 + }, + { + "epoch": 0.8796296296296297, + 
"grad_norm": 0.533673107624054, + "learning_rate": 0.0001772088348681642, + "loss": 1.0489, + "step": 4940 + }, + { + "epoch": 0.8798076923076923, + "grad_norm": 0.46117857098579407, + "learning_rate": 0.0001771999385053584, + "loss": 1.0297, + "step": 4941 + }, + { + "epoch": 0.8799857549857549, + "grad_norm": 0.4687997102737427, + "learning_rate": 0.0001771910406299802, + "loss": 1.071, + "step": 4942 + }, + { + "epoch": 0.8801638176638177, + "grad_norm": 0.5064153075218201, + "learning_rate": 0.0001771821412422039, + "loss": 0.9518, + "step": 4943 + }, + { + "epoch": 0.8803418803418803, + "grad_norm": 0.6561978459358215, + "learning_rate": 0.00017717324034220385, + "loss": 1.11, + "step": 4944 + }, + { + "epoch": 0.8805199430199431, + "grad_norm": 0.5551498532295227, + "learning_rate": 0.00017716433793015454, + "loss": 0.9719, + "step": 4945 + }, + { + "epoch": 0.8806980056980057, + "grad_norm": 0.47059500217437744, + "learning_rate": 0.00017715543400623025, + "loss": 0.8891, + "step": 4946 + }, + { + "epoch": 0.8808760683760684, + "grad_norm": 0.5035740733146667, + "learning_rate": 0.00017714652857060554, + "loss": 0.9671, + "step": 4947 + }, + { + "epoch": 0.8810541310541311, + "grad_norm": 0.4599960446357727, + "learning_rate": 0.00017713762162345487, + "loss": 0.9588, + "step": 4948 + }, + { + "epoch": 0.8812321937321937, + "grad_norm": 0.5087231397628784, + "learning_rate": 0.0001771287131649527, + "loss": 1.1433, + "step": 4949 + }, + { + "epoch": 0.8814102564102564, + "grad_norm": 0.5609854459762573, + "learning_rate": 0.00017711980319527366, + "loss": 1.2022, + "step": 4950 + }, + { + "epoch": 0.8815883190883191, + "grad_norm": 0.49460700154304504, + "learning_rate": 0.00017711089171459227, + "loss": 1.019, + "step": 4951 + }, + { + "epoch": 0.8817663817663818, + "grad_norm": 0.5047259330749512, + "learning_rate": 0.00017710197872308314, + "loss": 0.8301, + "step": 4952 + }, + { + "epoch": 0.8819444444444444, + "grad_norm": 0.5784406065940857, + 
"learning_rate": 0.0001770930642209209, + "loss": 0.9336, + "step": 4953 + }, + { + "epoch": 0.8821225071225072, + "grad_norm": 0.5037121772766113, + "learning_rate": 0.00017708414820828022, + "loss": 1.0199, + "step": 4954 + }, + { + "epoch": 0.8823005698005698, + "grad_norm": 0.5683804750442505, + "learning_rate": 0.00017707523068533575, + "loss": 0.9758, + "step": 4955 + }, + { + "epoch": 0.8824786324786325, + "grad_norm": 0.5167922973632812, + "learning_rate": 0.0001770663116522623, + "loss": 1.0389, + "step": 4956 + }, + { + "epoch": 0.8826566951566952, + "grad_norm": 0.5813606381416321, + "learning_rate": 0.0001770573911092345, + "loss": 1.3998, + "step": 4957 + }, + { + "epoch": 0.8828347578347578, + "grad_norm": 0.5280475616455078, + "learning_rate": 0.00017704846905642723, + "loss": 1.0545, + "step": 4958 + }, + { + "epoch": 0.8830128205128205, + "grad_norm": 0.5421732068061829, + "learning_rate": 0.00017703954549401528, + "loss": 0.899, + "step": 4959 + }, + { + "epoch": 0.8831908831908832, + "grad_norm": 0.5177720189094543, + "learning_rate": 0.00017703062042217344, + "loss": 0.975, + "step": 4960 + }, + { + "epoch": 0.8833689458689459, + "grad_norm": 0.639327883720398, + "learning_rate": 0.00017702169384107666, + "loss": 1.1936, + "step": 4961 + }, + { + "epoch": 0.8835470085470085, + "grad_norm": 0.5201572179794312, + "learning_rate": 0.00017701276575089975, + "loss": 0.9891, + "step": 4962 + }, + { + "epoch": 0.8837250712250713, + "grad_norm": 0.5304145216941833, + "learning_rate": 0.00017700383615181767, + "loss": 1.0569, + "step": 4963 + }, + { + "epoch": 0.8839031339031339, + "grad_norm": 0.6068132519721985, + "learning_rate": 0.00017699490504400538, + "loss": 1.2653, + "step": 4964 + }, + { + "epoch": 0.8840811965811965, + "grad_norm": 0.597895085811615, + "learning_rate": 0.00017698597242763787, + "loss": 1.2577, + "step": 4965 + }, + { + "epoch": 0.8842592592592593, + "grad_norm": 0.5356902480125427, + "learning_rate": 0.00017697703830289017, + 
"loss": 1.1056, + "step": 4966 + }, + { + "epoch": 0.8844373219373219, + "grad_norm": 0.5429540872573853, + "learning_rate": 0.0001769681026699373, + "loss": 1.0951, + "step": 4967 + }, + { + "epoch": 0.8846153846153846, + "grad_norm": 0.5789309144020081, + "learning_rate": 0.00017695916552895436, + "loss": 1.0786, + "step": 4968 + }, + { + "epoch": 0.8847934472934473, + "grad_norm": 0.5621341466903687, + "learning_rate": 0.0001769502268801164, + "loss": 1.0645, + "step": 4969 + }, + { + "epoch": 0.88497150997151, + "grad_norm": 0.5879453420639038, + "learning_rate": 0.00017694128672359865, + "loss": 1.2171, + "step": 4970 + }, + { + "epoch": 0.8851495726495726, + "grad_norm": 0.5005951523780823, + "learning_rate": 0.0001769323450595762, + "loss": 1.0725, + "step": 4971 + }, + { + "epoch": 0.8853276353276354, + "grad_norm": 0.5439660549163818, + "learning_rate": 0.00017692340188822425, + "loss": 1.162, + "step": 4972 + }, + { + "epoch": 0.885505698005698, + "grad_norm": 0.6309837698936462, + "learning_rate": 0.00017691445720971802, + "loss": 1.2861, + "step": 4973 + }, + { + "epoch": 0.8856837606837606, + "grad_norm": 0.4997463822364807, + "learning_rate": 0.00017690551102423282, + "loss": 1.1887, + "step": 4974 + }, + { + "epoch": 0.8858618233618234, + "grad_norm": 0.5430852174758911, + "learning_rate": 0.00017689656333194385, + "loss": 1.1231, + "step": 4975 + }, + { + "epoch": 0.886039886039886, + "grad_norm": 0.5414215922355652, + "learning_rate": 0.00017688761413302644, + "loss": 1.2345, + "step": 4976 + }, + { + "epoch": 0.8862179487179487, + "grad_norm": 0.5594443082809448, + "learning_rate": 0.00017687866342765601, + "loss": 1.0775, + "step": 4977 + }, + { + "epoch": 0.8863960113960114, + "grad_norm": 0.5827134847640991, + "learning_rate": 0.00017686971121600787, + "loss": 1.0609, + "step": 4978 + }, + { + "epoch": 0.8865740740740741, + "grad_norm": 0.5075414776802063, + "learning_rate": 0.00017686075749825738, + "loss": 0.796, + "step": 4979 + }, + { + 
"epoch": 0.8867521367521367, + "grad_norm": 0.6007544994354248, + "learning_rate": 0.00017685180227458003, + "loss": 1.1716, + "step": 4980 + }, + { + "epoch": 0.8869301994301995, + "grad_norm": 0.6458030343055725, + "learning_rate": 0.00017684284554515128, + "loss": 1.1945, + "step": 4981 + }, + { + "epoch": 0.8871082621082621, + "grad_norm": 0.5519212484359741, + "learning_rate": 0.00017683388731014657, + "loss": 1.2571, + "step": 4982 + }, + { + "epoch": 0.8872863247863247, + "grad_norm": 0.5079960227012634, + "learning_rate": 0.00017682492756974146, + "loss": 1.1186, + "step": 4983 + }, + { + "epoch": 0.8874643874643875, + "grad_norm": 0.63576740026474, + "learning_rate": 0.00017681596632411147, + "loss": 1.389, + "step": 4984 + }, + { + "epoch": 0.8876424501424501, + "grad_norm": 0.43325698375701904, + "learning_rate": 0.0001768070035734322, + "loss": 0.7757, + "step": 4985 + }, + { + "epoch": 0.8878205128205128, + "grad_norm": 0.49492064118385315, + "learning_rate": 0.00017679803931787923, + "loss": 1.0096, + "step": 4986 + }, + { + "epoch": 0.8879985754985755, + "grad_norm": 0.5561224222183228, + "learning_rate": 0.00017678907355762825, + "loss": 0.952, + "step": 4987 + }, + { + "epoch": 0.8881766381766382, + "grad_norm": 0.5392457246780396, + "learning_rate": 0.00017678010629285486, + "loss": 1.0442, + "step": 4988 + }, + { + "epoch": 0.8883547008547008, + "grad_norm": 0.4659234881401062, + "learning_rate": 0.00017677113752373482, + "loss": 0.8668, + "step": 4989 + }, + { + "epoch": 0.8885327635327636, + "grad_norm": 0.5139175057411194, + "learning_rate": 0.0001767621672504438, + "loss": 0.8386, + "step": 4990 + }, + { + "epoch": 0.8887108262108262, + "grad_norm": 0.5395823121070862, + "learning_rate": 0.00017675319547315755, + "loss": 0.9754, + "step": 4991 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.4751867949962616, + "learning_rate": 0.0001767442221920519, + "loss": 0.8775, + "step": 4992 + }, + { + "epoch": 0.8890669515669516, + 
"grad_norm": 0.5728281736373901, + "learning_rate": 0.00017673524740730265, + "loss": 1.2807, + "step": 4993 + }, + { + "epoch": 0.8892450142450142, + "grad_norm": 0.5545622110366821, + "learning_rate": 0.00017672627111908558, + "loss": 1.0039, + "step": 4994 + }, + { + "epoch": 0.8894230769230769, + "grad_norm": 0.5127374529838562, + "learning_rate": 0.00017671729332757665, + "loss": 1.0505, + "step": 4995 + }, + { + "epoch": 0.8896011396011396, + "grad_norm": 0.5238714218139648, + "learning_rate": 0.00017670831403295175, + "loss": 1.1775, + "step": 4996 + }, + { + "epoch": 0.8897792022792023, + "grad_norm": 0.5610160827636719, + "learning_rate": 0.00017669933323538674, + "loss": 1.0555, + "step": 4997 + }, + { + "epoch": 0.8899572649572649, + "grad_norm": 0.5481634736061096, + "learning_rate": 0.00017669035093505762, + "loss": 1.0802, + "step": 4998 + }, + { + "epoch": 0.8901353276353277, + "grad_norm": 0.4725174307823181, + "learning_rate": 0.0001766813671321404, + "loss": 0.9611, + "step": 4999 + }, + { + "epoch": 0.8903133903133903, + "grad_norm": 0.5184635519981384, + "learning_rate": 0.0001766723818268111, + "loss": 1.1659, + "step": 5000 + }, + { + "epoch": 0.8904914529914529, + "grad_norm": 0.5503578186035156, + "learning_rate": 0.00017666339501924575, + "loss": 1.2165, + "step": 5001 + }, + { + "epoch": 0.8906695156695157, + "grad_norm": 0.5299594402313232, + "learning_rate": 0.0001766544067096204, + "loss": 1.0196, + "step": 5002 + }, + { + "epoch": 0.8908475783475783, + "grad_norm": 0.5673944354057312, + "learning_rate": 0.00017664541689811118, + "loss": 1.2058, + "step": 5003 + }, + { + "epoch": 0.8910256410256411, + "grad_norm": 0.6057320833206177, + "learning_rate": 0.00017663642558489426, + "loss": 1.0136, + "step": 5004 + }, + { + "epoch": 0.8912037037037037, + "grad_norm": 0.4767026901245117, + "learning_rate": 0.00017662743277014578, + "loss": 0.8522, + "step": 5005 + }, + { + "epoch": 0.8913817663817664, + "grad_norm": 0.5346270203590393, + 
"learning_rate": 0.00017661843845404192, + "loss": 1.1568, + "step": 5006 + }, + { + "epoch": 0.8915598290598291, + "grad_norm": 0.5365738868713379, + "learning_rate": 0.00017660944263675891, + "loss": 1.0488, + "step": 5007 + }, + { + "epoch": 0.8917378917378918, + "grad_norm": 0.5536269545555115, + "learning_rate": 0.00017660044531847305, + "loss": 1.1216, + "step": 5008 + }, + { + "epoch": 0.8919159544159544, + "grad_norm": 0.6325978636741638, + "learning_rate": 0.00017659144649936055, + "loss": 1.2843, + "step": 5009 + }, + { + "epoch": 0.8920940170940171, + "grad_norm": 0.5890641212463379, + "learning_rate": 0.00017658244617959777, + "loss": 1.1976, + "step": 5010 + }, + { + "epoch": 0.8922720797720798, + "grad_norm": 0.604870080947876, + "learning_rate": 0.00017657344435936107, + "loss": 1.2881, + "step": 5011 + }, + { + "epoch": 0.8924501424501424, + "grad_norm": 0.49805206060409546, + "learning_rate": 0.00017656444103882676, + "loss": 0.8998, + "step": 5012 + }, + { + "epoch": 0.8926282051282052, + "grad_norm": 0.506926953792572, + "learning_rate": 0.0001765554362181713, + "loss": 1.0731, + "step": 5013 + }, + { + "epoch": 0.8928062678062678, + "grad_norm": 0.5353260636329651, + "learning_rate": 0.0001765464298975711, + "loss": 1.0676, + "step": 5014 + }, + { + "epoch": 0.8929843304843305, + "grad_norm": 0.5641853213310242, + "learning_rate": 0.0001765374220772026, + "loss": 0.9606, + "step": 5015 + }, + { + "epoch": 0.8931623931623932, + "grad_norm": 0.5049327611923218, + "learning_rate": 0.00017652841275724233, + "loss": 1.009, + "step": 5016 + }, + { + "epoch": 0.8933404558404558, + "grad_norm": 0.6255155205726624, + "learning_rate": 0.0001765194019378668, + "loss": 1.138, + "step": 5017 + }, + { + "epoch": 0.8935185185185185, + "grad_norm": 0.5816851854324341, + "learning_rate": 0.00017651038961925247, + "loss": 1.3398, + "step": 5018 + }, + { + "epoch": 0.8936965811965812, + "grad_norm": 0.5188020467758179, + "learning_rate": 0.00017650137580157605, + 
"loss": 1.0126, + "step": 5019 + }, + { + "epoch": 0.8938746438746439, + "grad_norm": 0.5231554508209229, + "learning_rate": 0.00017649236048501406, + "loss": 1.0328, + "step": 5020 + }, + { + "epoch": 0.8940527065527065, + "grad_norm": 0.7638634443283081, + "learning_rate": 0.0001764833436697432, + "loss": 1.3016, + "step": 5021 + }, + { + "epoch": 0.8942307692307693, + "grad_norm": 0.5354094505310059, + "learning_rate": 0.00017647432535594008, + "loss": 1.0646, + "step": 5022 + }, + { + "epoch": 0.8944088319088319, + "grad_norm": 0.6938086748123169, + "learning_rate": 0.0001764653055437814, + "loss": 1.2051, + "step": 5023 + }, + { + "epoch": 0.8945868945868946, + "grad_norm": 0.5546849370002747, + "learning_rate": 0.00017645628423344393, + "loss": 1.0671, + "step": 5024 + }, + { + "epoch": 0.8947649572649573, + "grad_norm": 0.49294665455818176, + "learning_rate": 0.0001764472614251044, + "loss": 1.0328, + "step": 5025 + }, + { + "epoch": 0.89494301994302, + "grad_norm": 0.5965796113014221, + "learning_rate": 0.00017643823711893956, + "loss": 1.0741, + "step": 5026 + }, + { + "epoch": 0.8951210826210826, + "grad_norm": 0.4846448302268982, + "learning_rate": 0.00017642921131512626, + "loss": 1.0409, + "step": 5027 + }, + { + "epoch": 0.8952991452991453, + "grad_norm": 0.5767390131950378, + "learning_rate": 0.00017642018401384135, + "loss": 1.018, + "step": 5028 + }, + { + "epoch": 0.895477207977208, + "grad_norm": 0.503027617931366, + "learning_rate": 0.00017641115521526167, + "loss": 1.0002, + "step": 5029 + }, + { + "epoch": 0.8956552706552706, + "grad_norm": 0.6668619513511658, + "learning_rate": 0.00017640212491956412, + "loss": 1.2154, + "step": 5030 + }, + { + "epoch": 0.8958333333333334, + "grad_norm": 0.5544148683547974, + "learning_rate": 0.00017639309312692566, + "loss": 1.2701, + "step": 5031 + }, + { + "epoch": 0.896011396011396, + "grad_norm": 0.6026872992515564, + "learning_rate": 0.00017638405983752323, + "loss": 0.9335, + "step": 5032 + }, + { + 
"epoch": 0.8961894586894587, + "grad_norm": 0.6288694143295288, + "learning_rate": 0.00017637502505153384, + "loss": 0.9075, + "step": 5033 + }, + { + "epoch": 0.8963675213675214, + "grad_norm": 0.4890204966068268, + "learning_rate": 0.00017636598876913446, + "loss": 0.8492, + "step": 5034 + }, + { + "epoch": 0.896545584045584, + "grad_norm": 0.5746598243713379, + "learning_rate": 0.00017635695099050218, + "loss": 1.1557, + "step": 5035 + }, + { + "epoch": 0.8967236467236467, + "grad_norm": 0.5165683031082153, + "learning_rate": 0.00017634791171581405, + "loss": 1.0899, + "step": 5036 + }, + { + "epoch": 0.8969017094017094, + "grad_norm": 0.4621037244796753, + "learning_rate": 0.0001763388709452472, + "loss": 1.0457, + "step": 5037 + }, + { + "epoch": 0.8970797720797721, + "grad_norm": 0.532358705997467, + "learning_rate": 0.00017632982867897876, + "loss": 1.139, + "step": 5038 + }, + { + "epoch": 0.8972578347578347, + "grad_norm": 0.5794399976730347, + "learning_rate": 0.00017632078491718587, + "loss": 1.031, + "step": 5039 + }, + { + "epoch": 0.8974358974358975, + "grad_norm": 0.5031905174255371, + "learning_rate": 0.00017631173966004576, + "loss": 0.9508, + "step": 5040 + }, + { + "epoch": 0.8976139601139601, + "grad_norm": 0.6528840065002441, + "learning_rate": 0.00017630269290773564, + "loss": 0.9974, + "step": 5041 + }, + { + "epoch": 0.8977920227920227, + "grad_norm": 0.6007558703422546, + "learning_rate": 0.00017629364466043273, + "loss": 1.0993, + "step": 5042 + }, + { + "epoch": 0.8979700854700855, + "grad_norm": 0.5104095339775085, + "learning_rate": 0.00017628459491831437, + "loss": 0.9175, + "step": 5043 + }, + { + "epoch": 0.8981481481481481, + "grad_norm": 0.5285516977310181, + "learning_rate": 0.00017627554368155782, + "loss": 0.998, + "step": 5044 + }, + { + "epoch": 0.8983262108262108, + "grad_norm": 0.5629046559333801, + "learning_rate": 0.00017626649095034045, + "loss": 1.2021, + "step": 5045 + }, + { + "epoch": 0.8985042735042735, + 
"grad_norm": 0.57548987865448, + "learning_rate": 0.00017625743672483962, + "loss": 1.2076, + "step": 5046 + }, + { + "epoch": 0.8986823361823362, + "grad_norm": 0.4883024990558624, + "learning_rate": 0.0001762483810052327, + "loss": 0.9761, + "step": 5047 + }, + { + "epoch": 0.8988603988603988, + "grad_norm": 0.6378034949302673, + "learning_rate": 0.0001762393237916972, + "loss": 1.2266, + "step": 5048 + }, + { + "epoch": 0.8990384615384616, + "grad_norm": 0.5201624035835266, + "learning_rate": 0.0001762302650844105, + "loss": 1.247, + "step": 5049 + }, + { + "epoch": 0.8992165242165242, + "grad_norm": 0.5438048243522644, + "learning_rate": 0.0001762212048835501, + "loss": 0.993, + "step": 5050 + }, + { + "epoch": 0.8993945868945868, + "grad_norm": 0.5928253531455994, + "learning_rate": 0.00017621214318929354, + "loss": 1.0469, + "step": 5051 + }, + { + "epoch": 0.8995726495726496, + "grad_norm": 0.6437996625900269, + "learning_rate": 0.00017620308000181831, + "loss": 1.3136, + "step": 5052 + }, + { + "epoch": 0.8997507122507122, + "grad_norm": 0.5961456298828125, + "learning_rate": 0.00017619401532130208, + "loss": 1.1495, + "step": 5053 + }, + { + "epoch": 0.8999287749287749, + "grad_norm": 0.497388631105423, + "learning_rate": 0.0001761849491479224, + "loss": 0.7783, + "step": 5054 + }, + { + "epoch": 0.9001068376068376, + "grad_norm": 0.5984451174736023, + "learning_rate": 0.00017617588148185687, + "loss": 1.3115, + "step": 5055 + }, + { + "epoch": 0.9002849002849003, + "grad_norm": 0.549163818359375, + "learning_rate": 0.0001761668123232832, + "loss": 1.1649, + "step": 5056 + }, + { + "epoch": 0.9004629629629629, + "grad_norm": 0.5831968188285828, + "learning_rate": 0.00017615774167237903, + "loss": 1.1749, + "step": 5057 + }, + { + "epoch": 0.9006410256410257, + "grad_norm": 0.5111076235771179, + "learning_rate": 0.00017614866952932214, + "loss": 0.8936, + "step": 5058 + }, + { + "epoch": 0.9008190883190883, + "grad_norm": 0.5740947723388672, + 
"learning_rate": 0.00017613959589429028, + "loss": 1.2606, + "step": 5059 + }, + { + "epoch": 0.9009971509971509, + "grad_norm": 0.5881099700927734, + "learning_rate": 0.0001761305207674612, + "loss": 1.3682, + "step": 5060 + }, + { + "epoch": 0.9011752136752137, + "grad_norm": 0.5007091760635376, + "learning_rate": 0.00017612144414901268, + "loss": 0.7788, + "step": 5061 + }, + { + "epoch": 0.9013532763532763, + "grad_norm": 0.5127760171890259, + "learning_rate": 0.00017611236603912262, + "loss": 1.0519, + "step": 5062 + }, + { + "epoch": 0.9015313390313391, + "grad_norm": 0.6185184121131897, + "learning_rate": 0.00017610328643796882, + "loss": 1.1672, + "step": 5063 + }, + { + "epoch": 0.9017094017094017, + "grad_norm": 0.49707287549972534, + "learning_rate": 0.00017609420534572926, + "loss": 1.1865, + "step": 5064 + }, + { + "epoch": 0.9018874643874644, + "grad_norm": 0.5667552351951599, + "learning_rate": 0.0001760851227625818, + "loss": 1.1388, + "step": 5065 + }, + { + "epoch": 0.9020655270655271, + "grad_norm": 0.50298011302948, + "learning_rate": 0.00017607603868870442, + "loss": 0.9552, + "step": 5066 + }, + { + "epoch": 0.9022435897435898, + "grad_norm": 0.5709219574928284, + "learning_rate": 0.0001760669531242751, + "loss": 1.2636, + "step": 5067 + }, + { + "epoch": 0.9024216524216524, + "grad_norm": 0.4943496286869049, + "learning_rate": 0.0001760578660694718, + "loss": 0.8951, + "step": 5068 + }, + { + "epoch": 0.9025997150997151, + "grad_norm": 0.5475931167602539, + "learning_rate": 0.00017604877752447267, + "loss": 1.1442, + "step": 5069 + }, + { + "epoch": 0.9027777777777778, + "grad_norm": 0.5280239582061768, + "learning_rate": 0.0001760396874894557, + "loss": 0.9537, + "step": 5070 + }, + { + "epoch": 0.9029558404558404, + "grad_norm": 0.5480797290802002, + "learning_rate": 0.000176030595964599, + "loss": 1.1557, + "step": 5071 + }, + { + "epoch": 0.9031339031339032, + "grad_norm": 0.5232734680175781, + "learning_rate": 0.00017602150295008073, + 
"loss": 1.0219, + "step": 5072 + }, + { + "epoch": 0.9033119658119658, + "grad_norm": 0.5448359251022339, + "learning_rate": 0.000176012408446079, + "loss": 1.1964, + "step": 5073 + }, + { + "epoch": 0.9034900284900285, + "grad_norm": 0.4841914474964142, + "learning_rate": 0.00017600331245277206, + "loss": 1.0667, + "step": 5074 + }, + { + "epoch": 0.9036680911680912, + "grad_norm": 0.5407083630561829, + "learning_rate": 0.0001759942149703381, + "loss": 1.1895, + "step": 5075 + }, + { + "epoch": 0.9038461538461539, + "grad_norm": 0.5140416026115417, + "learning_rate": 0.00017598511599895534, + "loss": 0.9402, + "step": 5076 + }, + { + "epoch": 0.9040242165242165, + "grad_norm": 0.6333765983581543, + "learning_rate": 0.00017597601553880207, + "loss": 1.239, + "step": 5077 + }, + { + "epoch": 0.9042022792022792, + "grad_norm": 0.4996028244495392, + "learning_rate": 0.00017596691359005664, + "loss": 1.0259, + "step": 5078 + }, + { + "epoch": 0.9043803418803419, + "grad_norm": 0.591892421245575, + "learning_rate": 0.00017595781015289732, + "loss": 1.2148, + "step": 5079 + }, + { + "epoch": 0.9045584045584045, + "grad_norm": 0.736499011516571, + "learning_rate": 0.0001759487052275025, + "loss": 1.1373, + "step": 5080 + }, + { + "epoch": 0.9047364672364673, + "grad_norm": 0.5951572060585022, + "learning_rate": 0.00017593959881405057, + "loss": 1.1833, + "step": 5081 + }, + { + "epoch": 0.9049145299145299, + "grad_norm": 0.5092006325721741, + "learning_rate": 0.00017593049091271996, + "loss": 0.8841, + "step": 5082 + }, + { + "epoch": 0.9050925925925926, + "grad_norm": 0.5679013729095459, + "learning_rate": 0.0001759213815236891, + "loss": 1.1056, + "step": 5083 + }, + { + "epoch": 0.9052706552706553, + "grad_norm": 0.5708174109458923, + "learning_rate": 0.0001759122706471365, + "loss": 1.1952, + "step": 5084 + }, + { + "epoch": 0.905448717948718, + "grad_norm": 0.5726733803749084, + "learning_rate": 0.00017590315828324067, + "loss": 1.1013, + "step": 5085 + }, + { + 
"epoch": 0.9056267806267806, + "grad_norm": 0.5821273326873779, + "learning_rate": 0.00017589404443218008, + "loss": 1.2323, + "step": 5086 + }, + { + "epoch": 0.9058048433048433, + "grad_norm": 0.5811445713043213, + "learning_rate": 0.00017588492909413337, + "loss": 1.2241, + "step": 5087 + }, + { + "epoch": 0.905982905982906, + "grad_norm": 0.5377545952796936, + "learning_rate": 0.0001758758122692791, + "loss": 0.9777, + "step": 5088 + }, + { + "epoch": 0.9061609686609686, + "grad_norm": 0.5985640287399292, + "learning_rate": 0.0001758666939577959, + "loss": 0.9737, + "step": 5089 + }, + { + "epoch": 0.9063390313390314, + "grad_norm": 0.6038222908973694, + "learning_rate": 0.00017585757415986247, + "loss": 1.2116, + "step": 5090 + }, + { + "epoch": 0.906517094017094, + "grad_norm": 0.6752246022224426, + "learning_rate": 0.00017584845287565743, + "loss": 1.1975, + "step": 5091 + }, + { + "epoch": 0.9066951566951567, + "grad_norm": 0.5400625467300415, + "learning_rate": 0.0001758393301053595, + "loss": 0.9669, + "step": 5092 + }, + { + "epoch": 0.9068732193732194, + "grad_norm": 0.5637784004211426, + "learning_rate": 0.00017583020584914746, + "loss": 1.2672, + "step": 5093 + }, + { + "epoch": 0.907051282051282, + "grad_norm": 0.4825877249240875, + "learning_rate": 0.00017582108010720006, + "loss": 0.9719, + "step": 5094 + }, + { + "epoch": 0.9072293447293447, + "grad_norm": 0.49902790784835815, + "learning_rate": 0.00017581195287969613, + "loss": 0.7941, + "step": 5095 + }, + { + "epoch": 0.9074074074074074, + "grad_norm": 0.5991541743278503, + "learning_rate": 0.0001758028241668144, + "loss": 1.049, + "step": 5096 + }, + { + "epoch": 0.9075854700854701, + "grad_norm": 0.5788859724998474, + "learning_rate": 0.00017579369396873384, + "loss": 1.0318, + "step": 5097 + }, + { + "epoch": 0.9077635327635327, + "grad_norm": 0.5914160013198853, + "learning_rate": 0.0001757845622856333, + "loss": 1.1007, + "step": 5098 + }, + { + "epoch": 0.9079415954415955, + "grad_norm": 
0.5361711382865906, + "learning_rate": 0.00017577542911769166, + "loss": 1.0694, + "step": 5099 + }, + { + "epoch": 0.9081196581196581, + "grad_norm": 0.5752849578857422, + "learning_rate": 0.00017576629446508792, + "loss": 1.1184, + "step": 5100 + }, + { + "epoch": 0.9082977207977208, + "grad_norm": 0.6042249798774719, + "learning_rate": 0.000175757158328001, + "loss": 1.2808, + "step": 5101 + }, + { + "epoch": 0.9084757834757835, + "grad_norm": 0.508352518081665, + "learning_rate": 0.00017574802070661, + "loss": 1.0038, + "step": 5102 + }, + { + "epoch": 0.9086538461538461, + "grad_norm": 0.5667358040809631, + "learning_rate": 0.00017573888160109385, + "loss": 1.0208, + "step": 5103 + }, + { + "epoch": 0.9088319088319088, + "grad_norm": 0.653619647026062, + "learning_rate": 0.00017572974101163165, + "loss": 1.2053, + "step": 5104 + }, + { + "epoch": 0.9090099715099715, + "grad_norm": 0.5069597363471985, + "learning_rate": 0.00017572059893840246, + "loss": 0.8634, + "step": 5105 + }, + { + "epoch": 0.9091880341880342, + "grad_norm": 0.6160602569580078, + "learning_rate": 0.00017571145538158547, + "loss": 1.2626, + "step": 5106 + }, + { + "epoch": 0.9093660968660968, + "grad_norm": 0.6335833668708801, + "learning_rate": 0.00017570231034135978, + "loss": 1.3381, + "step": 5107 + }, + { + "epoch": 0.9095441595441596, + "grad_norm": 0.5140398740768433, + "learning_rate": 0.00017569316381790454, + "loss": 1.1258, + "step": 5108 + }, + { + "epoch": 0.9097222222222222, + "grad_norm": 0.5682975649833679, + "learning_rate": 0.00017568401581139905, + "loss": 1.3367, + "step": 5109 + }, + { + "epoch": 0.9099002849002849, + "grad_norm": 0.49765729904174805, + "learning_rate": 0.00017567486632202246, + "loss": 1.1891, + "step": 5110 + }, + { + "epoch": 0.9100783475783476, + "grad_norm": 0.5139224529266357, + "learning_rate": 0.00017566571534995406, + "loss": 0.9768, + "step": 5111 + }, + { + "epoch": 0.9102564102564102, + "grad_norm": 0.5510922074317932, + "learning_rate": 
0.00017565656289537316, + "loss": 1.1552, + "step": 5112 + }, + { + "epoch": 0.9104344729344729, + "grad_norm": 0.6243364810943604, + "learning_rate": 0.00017564740895845908, + "loss": 1.1341, + "step": 5113 + }, + { + "epoch": 0.9106125356125356, + "grad_norm": 0.5334977507591248, + "learning_rate": 0.00017563825353939116, + "loss": 1.0894, + "step": 5114 + }, + { + "epoch": 0.9107905982905983, + "grad_norm": 0.5195826292037964, + "learning_rate": 0.00017562909663834878, + "loss": 1.1011, + "step": 5115 + }, + { + "epoch": 0.9109686609686609, + "grad_norm": 0.5298168063163757, + "learning_rate": 0.00017561993825551138, + "loss": 1.0079, + "step": 5116 + }, + { + "epoch": 0.9111467236467237, + "grad_norm": 0.5858965516090393, + "learning_rate": 0.00017561077839105835, + "loss": 1.2746, + "step": 5117 + }, + { + "epoch": 0.9113247863247863, + "grad_norm": 0.5572476387023926, + "learning_rate": 0.0001756016170451692, + "loss": 0.8169, + "step": 5118 + }, + { + "epoch": 0.9115028490028491, + "grad_norm": 0.5247095823287964, + "learning_rate": 0.0001755924542180234, + "loss": 1.1206, + "step": 5119 + }, + { + "epoch": 0.9116809116809117, + "grad_norm": 0.5605118274688721, + "learning_rate": 0.0001755832899098005, + "loss": 1.371, + "step": 5120 + }, + { + "epoch": 0.9118589743589743, + "grad_norm": 0.5732316970825195, + "learning_rate": 0.00017557412412068005, + "loss": 1.1248, + "step": 5121 + }, + { + "epoch": 0.9120370370370371, + "grad_norm": 0.6167279481887817, + "learning_rate": 0.0001755649568508416, + "loss": 0.94, + "step": 5122 + }, + { + "epoch": 0.9122150997150997, + "grad_norm": 0.5497499108314514, + "learning_rate": 0.00017555578810046483, + "loss": 1.0112, + "step": 5123 + }, + { + "epoch": 0.9123931623931624, + "grad_norm": 0.540762186050415, + "learning_rate": 0.00017554661786972931, + "loss": 1.1058, + "step": 5124 + }, + { + "epoch": 0.9125712250712251, + "grad_norm": 0.5943556427955627, + "learning_rate": 0.0001755374461588148, + "loss": 0.9086, + 
"step": 5125 + }, + { + "epoch": 0.9127492877492878, + "grad_norm": 0.5300756692886353, + "learning_rate": 0.0001755282729679009, + "loss": 1.1566, + "step": 5126 + }, + { + "epoch": 0.9129273504273504, + "grad_norm": 0.5390434861183167, + "learning_rate": 0.00017551909829716743, + "loss": 1.1395, + "step": 5127 + }, + { + "epoch": 0.9131054131054132, + "grad_norm": 0.627434492111206, + "learning_rate": 0.00017550992214679405, + "loss": 1.1537, + "step": 5128 + }, + { + "epoch": 0.9132834757834758, + "grad_norm": 0.4806903302669525, + "learning_rate": 0.00017550074451696063, + "loss": 0.7905, + "step": 5129 + }, + { + "epoch": 0.9134615384615384, + "grad_norm": 0.5714817047119141, + "learning_rate": 0.00017549156540784696, + "loss": 1.1042, + "step": 5130 + }, + { + "epoch": 0.9136396011396012, + "grad_norm": 0.5839236378669739, + "learning_rate": 0.0001754823848196329, + "loss": 1.0383, + "step": 5131 + }, + { + "epoch": 0.9138176638176638, + "grad_norm": 0.6089872717857361, + "learning_rate": 0.0001754732027524983, + "loss": 0.9399, + "step": 5132 + }, + { + "epoch": 0.9139957264957265, + "grad_norm": 0.4937956631183624, + "learning_rate": 0.00017546401920662307, + "loss": 0.7382, + "step": 5133 + }, + { + "epoch": 0.9141737891737892, + "grad_norm": 0.5918676257133484, + "learning_rate": 0.00017545483418218716, + "loss": 1.2207, + "step": 5134 + }, + { + "epoch": 0.9143518518518519, + "grad_norm": 0.5825346112251282, + "learning_rate": 0.0001754456476793705, + "loss": 0.9669, + "step": 5135 + }, + { + "epoch": 0.9145299145299145, + "grad_norm": 0.49829617142677307, + "learning_rate": 0.0001754364596983531, + "loss": 1.2247, + "step": 5136 + }, + { + "epoch": 0.9147079772079773, + "grad_norm": 0.5128271579742432, + "learning_rate": 0.00017542727023931497, + "loss": 0.9563, + "step": 5137 + }, + { + "epoch": 0.9148860398860399, + "grad_norm": 0.5789414644241333, + "learning_rate": 0.00017541807930243622, + "loss": 1.22, + "step": 5138 + }, + { + "epoch": 
0.9150641025641025, + "grad_norm": 0.44155433773994446, + "learning_rate": 0.00017540888688789683, + "loss": 0.9897, + "step": 5139 + }, + { + "epoch": 0.9152421652421653, + "grad_norm": 0.550464391708374, + "learning_rate": 0.00017539969299587696, + "loss": 1.0624, + "step": 5140 + }, + { + "epoch": 0.9154202279202279, + "grad_norm": 0.5019831657409668, + "learning_rate": 0.0001753904976265567, + "loss": 0.9045, + "step": 5141 + }, + { + "epoch": 0.9155982905982906, + "grad_norm": 0.589658796787262, + "learning_rate": 0.0001753813007801163, + "loss": 1.0454, + "step": 5142 + }, + { + "epoch": 0.9157763532763533, + "grad_norm": 0.5945459008216858, + "learning_rate": 0.00017537210245673586, + "loss": 1.0042, + "step": 5143 + }, + { + "epoch": 0.915954415954416, + "grad_norm": 0.5409809947013855, + "learning_rate": 0.00017536290265659566, + "loss": 1.0609, + "step": 5144 + }, + { + "epoch": 0.9161324786324786, + "grad_norm": 0.5302975177764893, + "learning_rate": 0.00017535370137987597, + "loss": 1.1394, + "step": 5145 + }, + { + "epoch": 0.9163105413105413, + "grad_norm": 0.5253351330757141, + "learning_rate": 0.00017534449862675698, + "loss": 1.2249, + "step": 5146 + }, + { + "epoch": 0.916488603988604, + "grad_norm": 0.6363829970359802, + "learning_rate": 0.00017533529439741908, + "loss": 1.1333, + "step": 5147 + }, + { + "epoch": 0.9166666666666666, + "grad_norm": 0.4703354835510254, + "learning_rate": 0.0001753260886920426, + "loss": 0.9971, + "step": 5148 + }, + { + "epoch": 0.9168447293447294, + "grad_norm": 0.6394907236099243, + "learning_rate": 0.00017531688151080786, + "loss": 1.5942, + "step": 5149 + }, + { + "epoch": 0.917022792022792, + "grad_norm": 0.5573459267616272, + "learning_rate": 0.00017530767285389527, + "loss": 0.9669, + "step": 5150 + }, + { + "epoch": 0.9172008547008547, + "grad_norm": 0.5000962615013123, + "learning_rate": 0.00017529846272148532, + "loss": 1.2151, + "step": 5151 + }, + { + "epoch": 0.9173789173789174, + "grad_norm": 
0.5550395846366882, + "learning_rate": 0.0001752892511137584, + "loss": 1.1765, + "step": 5152 + }, + { + "epoch": 0.91755698005698, + "grad_norm": 0.5461394786834717, + "learning_rate": 0.00017528003803089496, + "loss": 1.1136, + "step": 5153 + }, + { + "epoch": 0.9177350427350427, + "grad_norm": 0.5512672662734985, + "learning_rate": 0.00017527082347307558, + "loss": 1.1727, + "step": 5154 + }, + { + "epoch": 0.9179131054131054, + "grad_norm": 0.5210778713226318, + "learning_rate": 0.0001752616074404808, + "loss": 1.09, + "step": 5155 + }, + { + "epoch": 0.9180911680911681, + "grad_norm": 0.5214943289756775, + "learning_rate": 0.00017525238993329115, + "loss": 0.9654, + "step": 5156 + }, + { + "epoch": 0.9182692307692307, + "grad_norm": 0.5822862386703491, + "learning_rate": 0.00017524317095168724, + "loss": 1.0951, + "step": 5157 + }, + { + "epoch": 0.9184472934472935, + "grad_norm": 0.43948012590408325, + "learning_rate": 0.0001752339504958497, + "loss": 0.6984, + "step": 5158 + }, + { + "epoch": 0.9186253561253561, + "grad_norm": 0.5024449229240417, + "learning_rate": 0.00017522472856595916, + "loss": 0.983, + "step": 5159 + }, + { + "epoch": 0.9188034188034188, + "grad_norm": 0.5815144181251526, + "learning_rate": 0.00017521550516219636, + "loss": 0.9784, + "step": 5160 + }, + { + "epoch": 0.9189814814814815, + "grad_norm": 0.5519825220108032, + "learning_rate": 0.00017520628028474197, + "loss": 1.064, + "step": 5161 + }, + { + "epoch": 0.9191595441595442, + "grad_norm": 0.5615749955177307, + "learning_rate": 0.00017519705393377675, + "loss": 1.1284, + "step": 5162 + }, + { + "epoch": 0.9193376068376068, + "grad_norm": 0.5929917693138123, + "learning_rate": 0.00017518782610948148, + "loss": 1.1221, + "step": 5163 + }, + { + "epoch": 0.9195156695156695, + "grad_norm": 0.7116361856460571, + "learning_rate": 0.00017517859681203692, + "loss": 1.0188, + "step": 5164 + }, + { + "epoch": 0.9196937321937322, + "grad_norm": 0.5095893740653992, + "learning_rate": 
0.00017516936604162396, + "loss": 1.0724, + "step": 5165 + }, + { + "epoch": 0.9198717948717948, + "grad_norm": 0.5701385736465454, + "learning_rate": 0.00017516013379842337, + "loss": 1.0572, + "step": 5166 + }, + { + "epoch": 0.9200498575498576, + "grad_norm": 0.518412709236145, + "learning_rate": 0.00017515090008261613, + "loss": 1.0514, + "step": 5167 + }, + { + "epoch": 0.9202279202279202, + "grad_norm": 0.5324261784553528, + "learning_rate": 0.00017514166489438312, + "loss": 1.1708, + "step": 5168 + }, + { + "epoch": 0.9204059829059829, + "grad_norm": 0.5640990138053894, + "learning_rate": 0.00017513242823390525, + "loss": 1.2846, + "step": 5169 + }, + { + "epoch": 0.9205840455840456, + "grad_norm": 0.510352373123169, + "learning_rate": 0.00017512319010136356, + "loss": 1.0763, + "step": 5170 + }, + { + "epoch": 0.9207621082621082, + "grad_norm": 0.4994175136089325, + "learning_rate": 0.00017511395049693898, + "loss": 0.9665, + "step": 5171 + }, + { + "epoch": 0.9209401709401709, + "grad_norm": 0.43196994066238403, + "learning_rate": 0.00017510470942081258, + "loss": 0.761, + "step": 5172 + }, + { + "epoch": 0.9211182336182336, + "grad_norm": 0.558977484703064, + "learning_rate": 0.00017509546687316543, + "loss": 1.0758, + "step": 5173 + }, + { + "epoch": 0.9212962962962963, + "grad_norm": 0.573302149772644, + "learning_rate": 0.0001750862228541786, + "loss": 0.9635, + "step": 5174 + }, + { + "epoch": 0.9214743589743589, + "grad_norm": 0.5083786845207214, + "learning_rate": 0.00017507697736403321, + "loss": 1.0311, + "step": 5175 + }, + { + "epoch": 0.9216524216524217, + "grad_norm": 0.5478954911231995, + "learning_rate": 0.00017506773040291043, + "loss": 1.074, + "step": 5176 + }, + { + "epoch": 0.9218304843304843, + "grad_norm": 0.522376537322998, + "learning_rate": 0.00017505848197099137, + "loss": 1.1162, + "step": 5177 + }, + { + "epoch": 0.9220085470085471, + "grad_norm": 0.5946292281150818, + "learning_rate": 0.0001750492320684573, + "loss": 0.9494, + 
"step": 5178 + }, + { + "epoch": 0.9221866096866097, + "grad_norm": 0.5423247814178467, + "learning_rate": 0.00017503998069548943, + "loss": 1.0558, + "step": 5179 + }, + { + "epoch": 0.9223646723646723, + "grad_norm": 0.49960651993751526, + "learning_rate": 0.000175030727852269, + "loss": 1.0748, + "step": 5180 + }, + { + "epoch": 0.9225427350427351, + "grad_norm": 0.6066586375236511, + "learning_rate": 0.00017502147353897732, + "loss": 1.2066, + "step": 5181 + }, + { + "epoch": 0.9227207977207977, + "grad_norm": 0.57244473695755, + "learning_rate": 0.00017501221775579576, + "loss": 1.048, + "step": 5182 + }, + { + "epoch": 0.9228988603988604, + "grad_norm": 0.512464165687561, + "learning_rate": 0.00017500296050290557, + "loss": 1.1405, + "step": 5183 + }, + { + "epoch": 0.9230769230769231, + "grad_norm": 0.5380734801292419, + "learning_rate": 0.00017499370178048818, + "loss": 1.0641, + "step": 5184 + }, + { + "epoch": 0.9232549857549858, + "grad_norm": 0.47102874517440796, + "learning_rate": 0.000174984441588725, + "loss": 0.7948, + "step": 5185 + }, + { + "epoch": 0.9234330484330484, + "grad_norm": 0.6702211499214172, + "learning_rate": 0.00017497517992779747, + "loss": 1.3009, + "step": 5186 + }, + { + "epoch": 0.9236111111111112, + "grad_norm": 0.4685834050178528, + "learning_rate": 0.000174965916797887, + "loss": 0.8136, + "step": 5187 + }, + { + "epoch": 0.9237891737891738, + "grad_norm": 0.5414277911186218, + "learning_rate": 0.00017495665219917513, + "loss": 0.9708, + "step": 5188 + }, + { + "epoch": 0.9239672364672364, + "grad_norm": 0.5253050923347473, + "learning_rate": 0.0001749473861318434, + "loss": 1.0691, + "step": 5189 + }, + { + "epoch": 0.9241452991452992, + "grad_norm": 0.6009906530380249, + "learning_rate": 0.00017493811859607328, + "loss": 1.2023, + "step": 5190 + }, + { + "epoch": 0.9243233618233618, + "grad_norm": 0.5519336462020874, + "learning_rate": 0.00017492884959204643, + "loss": 1.189, + "step": 5191 + }, + { + "epoch": 
0.9245014245014245, + "grad_norm": 0.5024857521057129, + "learning_rate": 0.0001749195791199444, + "loss": 0.8685, + "step": 5192 + }, + { + "epoch": 0.9246794871794872, + "grad_norm": 0.5735679864883423, + "learning_rate": 0.00017491030717994887, + "loss": 1.1903, + "step": 5193 + }, + { + "epoch": 0.9248575498575499, + "grad_norm": 0.5338658094406128, + "learning_rate": 0.00017490103377224147, + "loss": 1.0442, + "step": 5194 + }, + { + "epoch": 0.9250356125356125, + "grad_norm": 0.46669119596481323, + "learning_rate": 0.0001748917588970039, + "loss": 0.6343, + "step": 5195 + }, + { + "epoch": 0.9252136752136753, + "grad_norm": 0.510910153388977, + "learning_rate": 0.00017488248255441793, + "loss": 0.9334, + "step": 5196 + }, + { + "epoch": 0.9253917378917379, + "grad_norm": 0.5732216238975525, + "learning_rate": 0.00017487320474466524, + "loss": 1.0483, + "step": 5197 + }, + { + "epoch": 0.9255698005698005, + "grad_norm": 0.5864318609237671, + "learning_rate": 0.00017486392546792762, + "loss": 1.0669, + "step": 5198 + }, + { + "epoch": 0.9257478632478633, + "grad_norm": 0.5074281096458435, + "learning_rate": 0.00017485464472438692, + "loss": 1.0636, + "step": 5199 + }, + { + "epoch": 0.9259259259259259, + "grad_norm": 0.5833215117454529, + "learning_rate": 0.00017484536251422496, + "loss": 1.2005, + "step": 5200 + }, + { + "epoch": 0.9261039886039886, + "grad_norm": 0.5624990463256836, + "learning_rate": 0.0001748360788376236, + "loss": 1.1623, + "step": 5201 + }, + { + "epoch": 0.9262820512820513, + "grad_norm": 0.5618230104446411, + "learning_rate": 0.00017482679369476472, + "loss": 1.0495, + "step": 5202 + }, + { + "epoch": 0.926460113960114, + "grad_norm": 0.6254985332489014, + "learning_rate": 0.00017481750708583024, + "loss": 0.9521, + "step": 5203 + }, + { + "epoch": 0.9266381766381766, + "grad_norm": 0.5488203763961792, + "learning_rate": 0.00017480821901100216, + "loss": 1.0689, + "step": 5204 + }, + { + "epoch": 0.9268162393162394, + "grad_norm": 
0.6157993674278259, + "learning_rate": 0.00017479892947046245, + "loss": 1.2852, + "step": 5205 + }, + { + "epoch": 0.926994301994302, + "grad_norm": 0.49653390049934387, + "learning_rate": 0.00017478963846439305, + "loss": 0.8616, + "step": 5206 + }, + { + "epoch": 0.9271723646723646, + "grad_norm": 0.5079081058502197, + "learning_rate": 0.00017478034599297603, + "loss": 1.0192, + "step": 5207 + }, + { + "epoch": 0.9273504273504274, + "grad_norm": 0.5392495393753052, + "learning_rate": 0.00017477105205639354, + "loss": 1.115, + "step": 5208 + }, + { + "epoch": 0.92752849002849, + "grad_norm": 0.5336191654205322, + "learning_rate": 0.00017476175665482756, + "loss": 1.1892, + "step": 5209 + }, + { + "epoch": 0.9277065527065527, + "grad_norm": 0.631712019443512, + "learning_rate": 0.00017475245978846026, + "loss": 0.9619, + "step": 5210 + }, + { + "epoch": 0.9278846153846154, + "grad_norm": 0.5123951435089111, + "learning_rate": 0.0001747431614574738, + "loss": 1.1477, + "step": 5211 + }, + { + "epoch": 0.9280626780626781, + "grad_norm": 0.5045743584632874, + "learning_rate": 0.00017473386166205038, + "loss": 0.9749, + "step": 5212 + }, + { + "epoch": 0.9282407407407407, + "grad_norm": 0.5296525359153748, + "learning_rate": 0.00017472456040237217, + "loss": 1.0736, + "step": 5213 + }, + { + "epoch": 0.9284188034188035, + "grad_norm": 0.6304933428764343, + "learning_rate": 0.00017471525767862145, + "loss": 1.2444, + "step": 5214 + }, + { + "epoch": 0.9285968660968661, + "grad_norm": 0.4851958155632019, + "learning_rate": 0.00017470595349098044, + "loss": 0.9049, + "step": 5215 + }, + { + "epoch": 0.9287749287749287, + "grad_norm": 0.5730679631233215, + "learning_rate": 0.00017469664783963148, + "loss": 1.0773, + "step": 5216 + }, + { + "epoch": 0.9289529914529915, + "grad_norm": 0.6020415425300598, + "learning_rate": 0.00017468734072475684, + "loss": 1.3247, + "step": 5217 + }, + { + "epoch": 0.9291310541310541, + "grad_norm": 0.47981077432632446, + "learning_rate": 
0.00017467803214653893, + "loss": 1.0009, + "step": 5218 + }, + { + "epoch": 0.9293091168091168, + "grad_norm": 0.5787527561187744, + "learning_rate": 0.0001746687221051601, + "loss": 1.2523, + "step": 5219 + }, + { + "epoch": 0.9294871794871795, + "grad_norm": 0.4495891332626343, + "learning_rate": 0.00017465941060080278, + "loss": 0.7364, + "step": 5220 + }, + { + "epoch": 0.9296652421652422, + "grad_norm": 0.5721768140792847, + "learning_rate": 0.0001746500976336494, + "loss": 1.015, + "step": 5221 + }, + { + "epoch": 0.9298433048433048, + "grad_norm": 0.5500208735466003, + "learning_rate": 0.0001746407832038824, + "loss": 1.053, + "step": 5222 + }, + { + "epoch": 0.9300213675213675, + "grad_norm": 0.5784386992454529, + "learning_rate": 0.00017463146731168437, + "loss": 0.9784, + "step": 5223 + }, + { + "epoch": 0.9301994301994302, + "grad_norm": 0.4960322082042694, + "learning_rate": 0.00017462214995723772, + "loss": 0.8674, + "step": 5224 + }, + { + "epoch": 0.9303774928774928, + "grad_norm": 0.5005537271499634, + "learning_rate": 0.00017461283114072508, + "loss": 1.0486, + "step": 5225 + }, + { + "epoch": 0.9305555555555556, + "grad_norm": 0.5064167380332947, + "learning_rate": 0.000174603510862329, + "loss": 0.9722, + "step": 5226 + }, + { + "epoch": 0.9307336182336182, + "grad_norm": 0.583558976650238, + "learning_rate": 0.0001745941891222321, + "loss": 0.9957, + "step": 5227 + }, + { + "epoch": 0.9309116809116809, + "grad_norm": 0.4982515871524811, + "learning_rate": 0.00017458486592061704, + "loss": 0.958, + "step": 5228 + }, + { + "epoch": 0.9310897435897436, + "grad_norm": 0.526549756526947, + "learning_rate": 0.0001745755412576664, + "loss": 1.1172, + "step": 5229 + }, + { + "epoch": 0.9312678062678063, + "grad_norm": 0.6129719018936157, + "learning_rate": 0.000174566215133563, + "loss": 1.2524, + "step": 5230 + }, + { + "epoch": 0.9314458689458689, + "grad_norm": 0.5385653972625732, + "learning_rate": 0.00017455688754848948, + "loss": 1.1655, + 
"step": 5231 + }, + { + "epoch": 0.9316239316239316, + "grad_norm": 0.5646410584449768, + "learning_rate": 0.0001745475585026287, + "loss": 0.9026, + "step": 5232 + }, + { + "epoch": 0.9318019943019943, + "grad_norm": 0.549223780632019, + "learning_rate": 0.0001745382279961633, + "loss": 0.804, + "step": 5233 + }, + { + "epoch": 0.9319800569800569, + "grad_norm": 0.48547953367233276, + "learning_rate": 0.0001745288960292762, + "loss": 1.0224, + "step": 5234 + }, + { + "epoch": 0.9321581196581197, + "grad_norm": 0.5260967016220093, + "learning_rate": 0.00017451956260215016, + "loss": 0.9688, + "step": 5235 + }, + { + "epoch": 0.9323361823361823, + "grad_norm": 0.6261999011039734, + "learning_rate": 0.00017451022771496812, + "loss": 1.2539, + "step": 5236 + }, + { + "epoch": 0.9325142450142451, + "grad_norm": 0.5801421999931335, + "learning_rate": 0.00017450089136791298, + "loss": 1.11, + "step": 5237 + }, + { + "epoch": 0.9326923076923077, + "grad_norm": 0.5833573937416077, + "learning_rate": 0.0001744915535611676, + "loss": 0.9328, + "step": 5238 + }, + { + "epoch": 0.9328703703703703, + "grad_norm": 0.5422634482383728, + "learning_rate": 0.00017448221429491496, + "loss": 1.034, + "step": 5239 + }, + { + "epoch": 0.9330484330484331, + "grad_norm": 0.5105658769607544, + "learning_rate": 0.00017447287356933808, + "loss": 0.8924, + "step": 5240 + }, + { + "epoch": 0.9332264957264957, + "grad_norm": 0.5114831924438477, + "learning_rate": 0.00017446353138461995, + "loss": 0.9328, + "step": 5241 + }, + { + "epoch": 0.9334045584045584, + "grad_norm": 0.5105039477348328, + "learning_rate": 0.00017445418774094358, + "loss": 1.0468, + "step": 5242 + }, + { + "epoch": 0.9335826210826211, + "grad_norm": 0.593250036239624, + "learning_rate": 0.00017444484263849208, + "loss": 1.0603, + "step": 5243 + }, + { + "epoch": 0.9337606837606838, + "grad_norm": 0.600788414478302, + "learning_rate": 0.00017443549607744853, + "loss": 1.1506, + "step": 5244 + }, + { + "epoch": 
0.9339387464387464, + "grad_norm": 0.5394418239593506, + "learning_rate": 0.00017442614805799605, + "loss": 1.038, + "step": 5245 + }, + { + "epoch": 0.9341168091168092, + "grad_norm": 0.5446375608444214, + "learning_rate": 0.00017441679858031786, + "loss": 1.079, + "step": 5246 + }, + { + "epoch": 0.9342948717948718, + "grad_norm": 0.5859794616699219, + "learning_rate": 0.00017440744764459702, + "loss": 1.1453, + "step": 5247 + }, + { + "epoch": 0.9344729344729344, + "grad_norm": 0.4899081289768219, + "learning_rate": 0.00017439809525101688, + "loss": 1.163, + "step": 5248 + }, + { + "epoch": 0.9346509971509972, + "grad_norm": 0.652846097946167, + "learning_rate": 0.00017438874139976055, + "loss": 1.1819, + "step": 5249 + }, + { + "epoch": 0.9348290598290598, + "grad_norm": 0.5402514934539795, + "learning_rate": 0.00017437938609101138, + "loss": 1.0159, + "step": 5250 + }, + { + "epoch": 0.9350071225071225, + "grad_norm": 0.565864086151123, + "learning_rate": 0.00017437002932495265, + "loss": 1.1121, + "step": 5251 + }, + { + "epoch": 0.9351851851851852, + "grad_norm": 0.611786425113678, + "learning_rate": 0.0001743606711017677, + "loss": 1.2511, + "step": 5252 + }, + { + "epoch": 0.9353632478632479, + "grad_norm": 0.5706882476806641, + "learning_rate": 0.00017435131142163988, + "loss": 1.128, + "step": 5253 + }, + { + "epoch": 0.9355413105413105, + "grad_norm": 0.5369367003440857, + "learning_rate": 0.00017434195028475253, + "loss": 1.0562, + "step": 5254 + }, + { + "epoch": 0.9357193732193733, + "grad_norm": 0.49957552552223206, + "learning_rate": 0.0001743325876912891, + "loss": 1.0568, + "step": 5255 + }, + { + "epoch": 0.9358974358974359, + "grad_norm": 0.5398106575012207, + "learning_rate": 0.00017432322364143305, + "loss": 1.1502, + "step": 5256 + }, + { + "epoch": 0.9360754985754985, + "grad_norm": 0.6522027254104614, + "learning_rate": 0.00017431385813536783, + "loss": 1.0591, + "step": 5257 + }, + { + "epoch": 0.9362535612535613, + "grad_norm": 
0.5872012972831726, + "learning_rate": 0.00017430449117327693, + "loss": 1.3737, + "step": 5258 + }, + { + "epoch": 0.9364316239316239, + "grad_norm": 0.5124474167823792, + "learning_rate": 0.00017429512275534382, + "loss": 1.0727, + "step": 5259 + }, + { + "epoch": 0.9366096866096866, + "grad_norm": 0.5103365778923035, + "learning_rate": 0.00017428575288175218, + "loss": 1.0339, + "step": 5260 + }, + { + "epoch": 0.9367877492877493, + "grad_norm": 0.585483729839325, + "learning_rate": 0.0001742763815526855, + "loss": 1.1844, + "step": 5261 + }, + { + "epoch": 0.936965811965812, + "grad_norm": 0.5855562090873718, + "learning_rate": 0.00017426700876832746, + "loss": 1.3234, + "step": 5262 + }, + { + "epoch": 0.9371438746438746, + "grad_norm": 0.5774588584899902, + "learning_rate": 0.00017425763452886162, + "loss": 1.0937, + "step": 5263 + }, + { + "epoch": 0.9373219373219374, + "grad_norm": 0.5718343257904053, + "learning_rate": 0.00017424825883447168, + "loss": 1.0783, + "step": 5264 + }, + { + "epoch": 0.9375, + "grad_norm": 0.5414558053016663, + "learning_rate": 0.00017423888168534136, + "loss": 1.1244, + "step": 5265 + }, + { + "epoch": 0.9376780626780626, + "grad_norm": 0.5818275809288025, + "learning_rate": 0.00017422950308165438, + "loss": 1.247, + "step": 5266 + }, + { + "epoch": 0.9378561253561254, + "grad_norm": 0.586398184299469, + "learning_rate": 0.00017422012302359448, + "loss": 1.0515, + "step": 5267 + }, + { + "epoch": 0.938034188034188, + "grad_norm": 0.5236606001853943, + "learning_rate": 0.00017421074151134544, + "loss": 1.1907, + "step": 5268 + }, + { + "epoch": 0.9382122507122507, + "grad_norm": 0.5108010172843933, + "learning_rate": 0.0001742013585450911, + "loss": 1.1125, + "step": 5269 + }, + { + "epoch": 0.9383903133903134, + "grad_norm": 0.4956454038619995, + "learning_rate": 0.00017419197412501527, + "loss": 1.0305, + "step": 5270 + }, + { + "epoch": 0.9385683760683761, + "grad_norm": 0.5432302951812744, + "learning_rate": 
0.0001741825882513018, + "loss": 1.1946, + "step": 5271 + }, + { + "epoch": 0.9387464387464387, + "grad_norm": 0.5119295716285706, + "learning_rate": 0.00017417320092413463, + "loss": 0.875, + "step": 5272 + }, + { + "epoch": 0.9389245014245015, + "grad_norm": 0.49740248918533325, + "learning_rate": 0.0001741638121436977, + "loss": 1.1093, + "step": 5273 + }, + { + "epoch": 0.9391025641025641, + "grad_norm": 0.5069027543067932, + "learning_rate": 0.00017415442191017491, + "loss": 1.2498, + "step": 5274 + }, + { + "epoch": 0.9392806267806267, + "grad_norm": 0.570264995098114, + "learning_rate": 0.00017414503022375027, + "loss": 1.0192, + "step": 5275 + }, + { + "epoch": 0.9394586894586895, + "grad_norm": 0.48129352927207947, + "learning_rate": 0.00017413563708460776, + "loss": 0.8467, + "step": 5276 + }, + { + "epoch": 0.9396367521367521, + "grad_norm": 0.5214534401893616, + "learning_rate": 0.00017412624249293148, + "loss": 0.9723, + "step": 5277 + }, + { + "epoch": 0.9398148148148148, + "grad_norm": 0.5150161385536194, + "learning_rate": 0.00017411684644890544, + "loss": 1.0906, + "step": 5278 + }, + { + "epoch": 0.9399928774928775, + "grad_norm": 0.5695852637290955, + "learning_rate": 0.00017410744895271377, + "loss": 1.2891, + "step": 5279 + }, + { + "epoch": 0.9401709401709402, + "grad_norm": 0.5613594651222229, + "learning_rate": 0.00017409805000454055, + "loss": 1.1373, + "step": 5280 + }, + { + "epoch": 0.9403490028490028, + "grad_norm": 0.5134239196777344, + "learning_rate": 0.00017408864960457004, + "loss": 1.1081, + "step": 5281 + }, + { + "epoch": 0.9405270655270656, + "grad_norm": 0.5256397724151611, + "learning_rate": 0.00017407924775298628, + "loss": 1.058, + "step": 5282 + }, + { + "epoch": 0.9407051282051282, + "grad_norm": 0.5145402550697327, + "learning_rate": 0.00017406984444997357, + "loss": 1.0667, + "step": 5283 + }, + { + "epoch": 0.9408831908831908, + "grad_norm": 0.5435704588890076, + "learning_rate": 0.0001740604396957161, + "loss": 
1.2275, + "step": 5284 + }, + { + "epoch": 0.9410612535612536, + "grad_norm": 0.5798762440681458, + "learning_rate": 0.0001740510334903982, + "loss": 1.2061, + "step": 5285 + }, + { + "epoch": 0.9412393162393162, + "grad_norm": 0.5461057424545288, + "learning_rate": 0.00017404162583420414, + "loss": 1.1585, + "step": 5286 + }, + { + "epoch": 0.9414173789173789, + "grad_norm": 0.5090487003326416, + "learning_rate": 0.00017403221672731818, + "loss": 1.2496, + "step": 5287 + }, + { + "epoch": 0.9415954415954416, + "grad_norm": 0.5171035528182983, + "learning_rate": 0.00017402280616992476, + "loss": 1.1947, + "step": 5288 + }, + { + "epoch": 0.9417735042735043, + "grad_norm": 0.5292364358901978, + "learning_rate": 0.00017401339416220818, + "loss": 1.0182, + "step": 5289 + }, + { + "epoch": 0.9419515669515669, + "grad_norm": 0.5011499524116516, + "learning_rate": 0.00017400398070435293, + "loss": 1.3363, + "step": 5290 + }, + { + "epoch": 0.9421296296296297, + "grad_norm": 0.4821554720401764, + "learning_rate": 0.0001739945657965434, + "loss": 0.9077, + "step": 5291 + }, + { + "epoch": 0.9423076923076923, + "grad_norm": 0.5849515199661255, + "learning_rate": 0.00017398514943896403, + "loss": 1.1582, + "step": 5292 + }, + { + "epoch": 0.9424857549857549, + "grad_norm": 0.49826139211654663, + "learning_rate": 0.00017397573163179937, + "loss": 1.1025, + "step": 5293 + }, + { + "epoch": 0.9426638176638177, + "grad_norm": 0.6031842827796936, + "learning_rate": 0.00017396631237523392, + "loss": 1.1932, + "step": 5294 + }, + { + "epoch": 0.9428418803418803, + "grad_norm": 0.6013330221176147, + "learning_rate": 0.00017395689166945224, + "loss": 1.2078, + "step": 5295 + }, + { + "epoch": 0.9430199430199431, + "grad_norm": 0.5147021412849426, + "learning_rate": 0.00017394746951463893, + "loss": 0.9988, + "step": 5296 + }, + { + "epoch": 0.9431980056980057, + "grad_norm": 0.5721762776374817, + "learning_rate": 0.0001739380459109785, + "loss": 1.1442, + "step": 5297 + }, + { + 
"epoch": 0.9433760683760684, + "grad_norm": 0.49272531270980835, + "learning_rate": 0.0001739286208586557, + "loss": 1.0481, + "step": 5298 + }, + { + "epoch": 0.9435541310541311, + "grad_norm": 0.6545688509941101, + "learning_rate": 0.00017391919435785514, + "loss": 1.1393, + "step": 5299 + }, + { + "epoch": 0.9437321937321937, + "grad_norm": 0.617756724357605, + "learning_rate": 0.00017390976640876152, + "loss": 1.1108, + "step": 5300 + }, + { + "epoch": 0.9439102564102564, + "grad_norm": 0.4870470464229584, + "learning_rate": 0.00017390033701155955, + "loss": 0.9028, + "step": 5301 + }, + { + "epoch": 0.9440883190883191, + "grad_norm": 0.5250138640403748, + "learning_rate": 0.000173890906166434, + "loss": 1.0326, + "step": 5302 + }, + { + "epoch": 0.9442663817663818, + "grad_norm": 0.5879467129707336, + "learning_rate": 0.00017388147387356964, + "loss": 1.1569, + "step": 5303 + }, + { + "epoch": 0.9444444444444444, + "grad_norm": 0.4790486991405487, + "learning_rate": 0.00017387204013315127, + "loss": 0.967, + "step": 5304 + }, + { + "epoch": 0.9446225071225072, + "grad_norm": 0.5884372591972351, + "learning_rate": 0.0001738626049453637, + "loss": 1.1342, + "step": 5305 + }, + { + "epoch": 0.9448005698005698, + "grad_norm": 0.4633975028991699, + "learning_rate": 0.00017385316831039187, + "loss": 0.8942, + "step": 5306 + }, + { + "epoch": 0.9449786324786325, + "grad_norm": 0.5301823019981384, + "learning_rate": 0.0001738437302284206, + "loss": 1.1683, + "step": 5307 + }, + { + "epoch": 0.9451566951566952, + "grad_norm": 0.5476770997047424, + "learning_rate": 0.00017383429069963484, + "loss": 1.1574, + "step": 5308 + }, + { + "epoch": 0.9453347578347578, + "grad_norm": 0.47689101099967957, + "learning_rate": 0.00017382484972421953, + "loss": 1.0792, + "step": 5309 + }, + { + "epoch": 0.9455128205128205, + "grad_norm": 0.526063084602356, + "learning_rate": 0.00017381540730235963, + "loss": 0.9012, + "step": 5310 + }, + { + "epoch": 0.9456908831908832, + 
"grad_norm": 0.5667058229446411, + "learning_rate": 0.0001738059634342402, + "loss": 1.0908, + "step": 5311 + }, + { + "epoch": 0.9458689458689459, + "grad_norm": 0.5402196645736694, + "learning_rate": 0.00017379651812004623, + "loss": 0.943, + "step": 5312 + }, + { + "epoch": 0.9460470085470085, + "grad_norm": 0.5288932919502258, + "learning_rate": 0.00017378707135996276, + "loss": 1.0055, + "step": 5313 + }, + { + "epoch": 0.9462250712250713, + "grad_norm": 0.5607456564903259, + "learning_rate": 0.00017377762315417492, + "loss": 1.2073, + "step": 5314 + }, + { + "epoch": 0.9464031339031339, + "grad_norm": 0.5737698674201965, + "learning_rate": 0.00017376817350286781, + "loss": 1.0001, + "step": 5315 + }, + { + "epoch": 0.9465811965811965, + "grad_norm": 0.6562079787254333, + "learning_rate": 0.00017375872240622657, + "loss": 1.1503, + "step": 5316 + }, + { + "epoch": 0.9467592592592593, + "grad_norm": 0.5407183170318604, + "learning_rate": 0.0001737492698644364, + "loss": 1.1169, + "step": 5317 + }, + { + "epoch": 0.9469373219373219, + "grad_norm": 0.5504152178764343, + "learning_rate": 0.00017373981587768248, + "loss": 1.0468, + "step": 5318 + }, + { + "epoch": 0.9471153846153846, + "grad_norm": 0.4813530743122101, + "learning_rate": 0.00017373036044615006, + "loss": 0.9707, + "step": 5319 + }, + { + "epoch": 0.9472934472934473, + "grad_norm": 0.5810509920120239, + "learning_rate": 0.00017372090357002437, + "loss": 1.4949, + "step": 5320 + }, + { + "epoch": 0.94747150997151, + "grad_norm": 0.5250222086906433, + "learning_rate": 0.00017371144524949074, + "loss": 1.0818, + "step": 5321 + }, + { + "epoch": 0.9476495726495726, + "grad_norm": 0.4852280914783478, + "learning_rate": 0.00017370198548473444, + "loss": 1.1793, + "step": 5322 + }, + { + "epoch": 0.9478276353276354, + "grad_norm": 0.5392420291900635, + "learning_rate": 0.00017369252427594086, + "loss": 1.153, + "step": 5323 + }, + { + "epoch": 0.948005698005698, + "grad_norm": 0.521294116973877, + 
"learning_rate": 0.00017368306162329533, + "loss": 0.8572, + "step": 5324 + }, + { + "epoch": 0.9481837606837606, + "grad_norm": 0.5579673647880554, + "learning_rate": 0.0001736735975269833, + "loss": 1.0452, + "step": 5325 + }, + { + "epoch": 0.9483618233618234, + "grad_norm": 0.6027318835258484, + "learning_rate": 0.0001736641319871901, + "loss": 1.3475, + "step": 5326 + }, + { + "epoch": 0.948539886039886, + "grad_norm": 0.5600738525390625, + "learning_rate": 0.00017365466500410132, + "loss": 1.0338, + "step": 5327 + }, + { + "epoch": 0.9487179487179487, + "grad_norm": 0.5691532492637634, + "learning_rate": 0.00017364519657790236, + "loss": 1.129, + "step": 5328 + }, + { + "epoch": 0.9488960113960114, + "grad_norm": 0.5161463022232056, + "learning_rate": 0.0001736357267087788, + "loss": 1.0438, + "step": 5329 + }, + { + "epoch": 0.9490740740740741, + "grad_norm": 0.5049656629562378, + "learning_rate": 0.0001736262553969161, + "loss": 0.9484, + "step": 5330 + }, + { + "epoch": 0.9492521367521367, + "grad_norm": 0.5477150678634644, + "learning_rate": 0.00017361678264249988, + "loss": 0.8995, + "step": 5331 + }, + { + "epoch": 0.9494301994301995, + "grad_norm": 0.5679608583450317, + "learning_rate": 0.0001736073084457157, + "loss": 1.241, + "step": 5332 + }, + { + "epoch": 0.9496082621082621, + "grad_norm": 0.5748196840286255, + "learning_rate": 0.00017359783280674926, + "loss": 1.0046, + "step": 5333 + }, + { + "epoch": 0.9497863247863247, + "grad_norm": 0.5677094459533691, + "learning_rate": 0.00017358835572578617, + "loss": 1.2913, + "step": 5334 + }, + { + "epoch": 0.9499643874643875, + "grad_norm": 0.49663659930229187, + "learning_rate": 0.0001735788772030121, + "loss": 1.0388, + "step": 5335 + }, + { + "epoch": 0.9501424501424501, + "grad_norm": 0.5687218904495239, + "learning_rate": 0.0001735693972386128, + "loss": 1.1631, + "step": 5336 + }, + { + "epoch": 0.9503205128205128, + "grad_norm": 0.520708441734314, + "learning_rate": 0.00017355991583277395, + 
"loss": 1.0744, + "step": 5337 + }, + { + "epoch": 0.9504985754985755, + "grad_norm": 0.5738952159881592, + "learning_rate": 0.00017355043298568137, + "loss": 1.318, + "step": 5338 + }, + { + "epoch": 0.9506766381766382, + "grad_norm": 0.5378455519676208, + "learning_rate": 0.00017354094869752085, + "loss": 0.9827, + "step": 5339 + }, + { + "epoch": 0.9508547008547008, + "grad_norm": 0.5047366619110107, + "learning_rate": 0.0001735314629684782, + "loss": 1.0966, + "step": 5340 + }, + { + "epoch": 0.9510327635327636, + "grad_norm": 0.5526043772697449, + "learning_rate": 0.0001735219757987393, + "loss": 1.059, + "step": 5341 + }, + { + "epoch": 0.9512108262108262, + "grad_norm": 0.5741400718688965, + "learning_rate": 0.00017351248718849003, + "loss": 1.1232, + "step": 5342 + }, + { + "epoch": 0.9513888888888888, + "grad_norm": 0.5421118140220642, + "learning_rate": 0.00017350299713791626, + "loss": 1.0427, + "step": 5343 + }, + { + "epoch": 0.9515669515669516, + "grad_norm": 0.4857081472873688, + "learning_rate": 0.00017349350564720392, + "loss": 0.8663, + "step": 5344 + }, + { + "epoch": 0.9517450142450142, + "grad_norm": 0.5411618947982788, + "learning_rate": 0.00017348401271653904, + "loss": 1.0317, + "step": 5345 + }, + { + "epoch": 0.9519230769230769, + "grad_norm": 0.5246246457099915, + "learning_rate": 0.00017347451834610756, + "loss": 1.0076, + "step": 5346 + }, + { + "epoch": 0.9521011396011396, + "grad_norm": 0.5278927683830261, + "learning_rate": 0.00017346502253609556, + "loss": 0.931, + "step": 5347 + }, + { + "epoch": 0.9522792022792023, + "grad_norm": 0.5934548377990723, + "learning_rate": 0.00017345552528668902, + "loss": 1.3205, + "step": 5348 + }, + { + "epoch": 0.9524572649572649, + "grad_norm": 0.5466100573539734, + "learning_rate": 0.00017344602659807406, + "loss": 0.8725, + "step": 5349 + }, + { + "epoch": 0.9526353276353277, + "grad_norm": 0.5220118761062622, + "learning_rate": 0.00017343652647043678, + "loss": 1.1642, + "step": 5350 + }, + { + 
"epoch": 0.9528133903133903, + "grad_norm": 0.6166301965713501, + "learning_rate": 0.0001734270249039633, + "loss": 0.8152, + "step": 5351 + }, + { + "epoch": 0.9529914529914529, + "grad_norm": 0.5173428058624268, + "learning_rate": 0.00017341752189883983, + "loss": 0.9296, + "step": 5352 + }, + { + "epoch": 0.9531695156695157, + "grad_norm": 0.5363461375236511, + "learning_rate": 0.0001734080174552525, + "loss": 1.3546, + "step": 5353 + }, + { + "epoch": 0.9533475783475783, + "grad_norm": 0.5333831906318665, + "learning_rate": 0.0001733985115733876, + "loss": 1.0401, + "step": 5354 + }, + { + "epoch": 0.9535256410256411, + "grad_norm": 0.5179334878921509, + "learning_rate": 0.00017338900425343132, + "loss": 1.1254, + "step": 5355 + }, + { + "epoch": 0.9537037037037037, + "grad_norm": 0.5171303153038025, + "learning_rate": 0.00017337949549556993, + "loss": 1.0518, + "step": 5356 + }, + { + "epoch": 0.9538817663817664, + "grad_norm": 0.5164596438407898, + "learning_rate": 0.00017336998529998978, + "loss": 0.8732, + "step": 5357 + }, + { + "epoch": 0.9540598290598291, + "grad_norm": 0.5555717349052429, + "learning_rate": 0.00017336047366687719, + "loss": 1.2312, + "step": 5358 + }, + { + "epoch": 0.9542378917378918, + "grad_norm": 0.45685622096061707, + "learning_rate": 0.00017335096059641847, + "loss": 0.8882, + "step": 5359 + }, + { + "epoch": 0.9544159544159544, + "grad_norm": 0.5260133743286133, + "learning_rate": 0.0001733414460888001, + "loss": 1.0952, + "step": 5360 + }, + { + "epoch": 0.9545940170940171, + "grad_norm": 0.4597703814506531, + "learning_rate": 0.0001733319301442084, + "loss": 1.0835, + "step": 5361 + }, + { + "epoch": 0.9547720797720798, + "grad_norm": 0.5279495120048523, + "learning_rate": 0.0001733224127628299, + "loss": 1.0295, + "step": 5362 + }, + { + "epoch": 0.9549501424501424, + "grad_norm": 0.48919400572776794, + "learning_rate": 0.00017331289394485104, + "loss": 0.9693, + "step": 5363 + }, + { + "epoch": 0.9551282051282052, + 
"grad_norm": 0.5639515519142151, + "learning_rate": 0.0001733033736904583, + "loss": 1.0893, + "step": 5364 + }, + { + "epoch": 0.9553062678062678, + "grad_norm": 0.49761319160461426, + "learning_rate": 0.00017329385199983823, + "loss": 1.038, + "step": 5365 + }, + { + "epoch": 0.9554843304843305, + "grad_norm": 0.5503305792808533, + "learning_rate": 0.0001732843288731774, + "loss": 0.9976, + "step": 5366 + }, + { + "epoch": 0.9556623931623932, + "grad_norm": 0.5633028745651245, + "learning_rate": 0.00017327480431066235, + "loss": 1.0602, + "step": 5367 + }, + { + "epoch": 0.9558404558404558, + "grad_norm": 0.48074454069137573, + "learning_rate": 0.00017326527831247973, + "loss": 1.0286, + "step": 5368 + }, + { + "epoch": 0.9560185185185185, + "grad_norm": 0.506597638130188, + "learning_rate": 0.0001732557508788162, + "loss": 0.9061, + "step": 5369 + }, + { + "epoch": 0.9561965811965812, + "grad_norm": 0.6570749282836914, + "learning_rate": 0.0001732462220098584, + "loss": 1.0852, + "step": 5370 + }, + { + "epoch": 0.9563746438746439, + "grad_norm": 0.5607653856277466, + "learning_rate": 0.00017323669170579302, + "loss": 1.0486, + "step": 5371 + }, + { + "epoch": 0.9565527065527065, + "grad_norm": 0.6047050356864929, + "learning_rate": 0.0001732271599668068, + "loss": 1.2175, + "step": 5372 + }, + { + "epoch": 0.9567307692307693, + "grad_norm": 0.5506869554519653, + "learning_rate": 0.00017321762679308651, + "loss": 1.0114, + "step": 5373 + }, + { + "epoch": 0.9569088319088319, + "grad_norm": 0.5868638157844543, + "learning_rate": 0.00017320809218481891, + "loss": 1.2983, + "step": 5374 + }, + { + "epoch": 0.9570868945868946, + "grad_norm": 0.539619505405426, + "learning_rate": 0.00017319855614219084, + "loss": 1.2361, + "step": 5375 + }, + { + "epoch": 0.9572649572649573, + "grad_norm": 0.5525495409965515, + "learning_rate": 0.0001731890186653891, + "loss": 1.1316, + "step": 5376 + }, + { + "epoch": 0.95744301994302, + "grad_norm": 0.5549767017364502, + 
"learning_rate": 0.0001731794797546006, + "loss": 1.0547, + "step": 5377 + }, + { + "epoch": 0.9576210826210826, + "grad_norm": 0.5356076955795288, + "learning_rate": 0.00017316993941001222, + "loss": 0.9942, + "step": 5378 + }, + { + "epoch": 0.9577991452991453, + "grad_norm": 0.5365784168243408, + "learning_rate": 0.00017316039763181084, + "loss": 1.226, + "step": 5379 + }, + { + "epoch": 0.957977207977208, + "grad_norm": 0.5190927386283875, + "learning_rate": 0.00017315085442018343, + "loss": 1.1704, + "step": 5380 + }, + { + "epoch": 0.9581552706552706, + "grad_norm": 0.526658833026886, + "learning_rate": 0.00017314130977531705, + "loss": 1.109, + "step": 5381 + }, + { + "epoch": 0.9583333333333334, + "grad_norm": 0.5373684763908386, + "learning_rate": 0.0001731317636973986, + "loss": 1.0018, + "step": 5382 + }, + { + "epoch": 0.958511396011396, + "grad_norm": 0.5714904069900513, + "learning_rate": 0.00017312221618661516, + "loss": 1.1855, + "step": 5383 + }, + { + "epoch": 0.9586894586894587, + "grad_norm": 0.5707863569259644, + "learning_rate": 0.00017311266724315377, + "loss": 0.9482, + "step": 5384 + }, + { + "epoch": 0.9588675213675214, + "grad_norm": 0.5856872797012329, + "learning_rate": 0.00017310311686720157, + "loss": 0.9543, + "step": 5385 + }, + { + "epoch": 0.959045584045584, + "grad_norm": 0.5041963458061218, + "learning_rate": 0.00017309356505894568, + "loss": 1.1427, + "step": 5386 + }, + { + "epoch": 0.9592236467236467, + "grad_norm": 0.5409179925918579, + "learning_rate": 0.00017308401181857316, + "loss": 0.8432, + "step": 5387 + }, + { + "epoch": 0.9594017094017094, + "grad_norm": 0.5248702764511108, + "learning_rate": 0.00017307445714627128, + "loss": 1.1403, + "step": 5388 + }, + { + "epoch": 0.9595797720797721, + "grad_norm": 0.50718092918396, + "learning_rate": 0.00017306490104222722, + "loss": 0.9066, + "step": 5389 + }, + { + "epoch": 0.9597578347578347, + "grad_norm": 0.5563821196556091, + "learning_rate": 0.0001730553435066282, + 
"loss": 1.0204, + "step": 5390 + }, + { + "epoch": 0.9599358974358975, + "grad_norm": 0.5696987509727478, + "learning_rate": 0.00017304578453966146, + "loss": 1.1405, + "step": 5391 + }, + { + "epoch": 0.9601139601139601, + "grad_norm": 0.5927395224571228, + "learning_rate": 0.00017303622414151435, + "loss": 1.0398, + "step": 5392 + }, + { + "epoch": 0.9602920227920227, + "grad_norm": 0.5375707745552063, + "learning_rate": 0.0001730266623123741, + "loss": 0.9519, + "step": 5393 + }, + { + "epoch": 0.9604700854700855, + "grad_norm": 0.457998126745224, + "learning_rate": 0.00017301709905242815, + "loss": 0.8743, + "step": 5394 + }, + { + "epoch": 0.9606481481481481, + "grad_norm": 0.5427796244621277, + "learning_rate": 0.00017300753436186382, + "loss": 1.078, + "step": 5395 + }, + { + "epoch": 0.9608262108262108, + "grad_norm": 0.5458595752716064, + "learning_rate": 0.0001729979682408685, + "loss": 1.1081, + "step": 5396 + }, + { + "epoch": 0.9610042735042735, + "grad_norm": 0.5495280027389526, + "learning_rate": 0.00017298840068962962, + "loss": 1.0141, + "step": 5397 + }, + { + "epoch": 0.9611823361823362, + "grad_norm": 0.5878560543060303, + "learning_rate": 0.00017297883170833465, + "loss": 1.302, + "step": 5398 + }, + { + "epoch": 0.9613603988603988, + "grad_norm": 0.5452881455421448, + "learning_rate": 0.00017296926129717108, + "loss": 0.9929, + "step": 5399 + }, + { + "epoch": 0.9615384615384616, + "grad_norm": 0.6021811366081238, + "learning_rate": 0.0001729596894563264, + "loss": 1.2629, + "step": 5400 + }, + { + "epoch": 0.9617165242165242, + "grad_norm": 0.5820204615592957, + "learning_rate": 0.0001729501161859882, + "loss": 1.0662, + "step": 5401 + }, + { + "epoch": 0.9618945868945868, + "grad_norm": 0.4953218102455139, + "learning_rate": 0.000172940541486344, + "loss": 1.047, + "step": 5402 + }, + { + "epoch": 0.9620726495726496, + "grad_norm": 0.5409793853759766, + "learning_rate": 0.00017293096535758143, + "loss": 1.1993, + "step": 5403 + }, + { + 
"epoch": 0.9622507122507122, + "grad_norm": 0.49702873826026917, + "learning_rate": 0.00017292138779988805, + "loss": 1.2471, + "step": 5404 + }, + { + "epoch": 0.9624287749287749, + "grad_norm": 0.5743489861488342, + "learning_rate": 0.00017291180881345158, + "loss": 1.0816, + "step": 5405 + }, + { + "epoch": 0.9626068376068376, + "grad_norm": 0.5747945308685303, + "learning_rate": 0.00017290222839845968, + "loss": 1.3548, + "step": 5406 + }, + { + "epoch": 0.9627849002849003, + "grad_norm": 0.5341345071792603, + "learning_rate": 0.00017289264655510005, + "loss": 1.0435, + "step": 5407 + }, + { + "epoch": 0.9629629629629629, + "grad_norm": 0.5719689130783081, + "learning_rate": 0.00017288306328356044, + "loss": 1.2319, + "step": 5408 + }, + { + "epoch": 0.9631410256410257, + "grad_norm": 0.4783279597759247, + "learning_rate": 0.0001728734785840286, + "loss": 0.9397, + "step": 5409 + }, + { + "epoch": 0.9633190883190883, + "grad_norm": 0.4730507731437683, + "learning_rate": 0.00017286389245669233, + "loss": 0.9384, + "step": 5410 + }, + { + "epoch": 0.9634971509971509, + "grad_norm": 0.5309939384460449, + "learning_rate": 0.00017285430490173944, + "loss": 1.098, + "step": 5411 + }, + { + "epoch": 0.9636752136752137, + "grad_norm": 0.5177853107452393, + "learning_rate": 0.0001728447159193578, + "loss": 1.2777, + "step": 5412 + }, + { + "epoch": 0.9638532763532763, + "grad_norm": 0.6437913775444031, + "learning_rate": 0.00017283512550973526, + "loss": 1.2661, + "step": 5413 + }, + { + "epoch": 0.9640313390313391, + "grad_norm": 0.6096072196960449, + "learning_rate": 0.00017282553367305975, + "loss": 0.9569, + "step": 5414 + }, + { + "epoch": 0.9642094017094017, + "grad_norm": 0.5104934573173523, + "learning_rate": 0.00017281594040951918, + "loss": 0.9666, + "step": 5415 + }, + { + "epoch": 0.9643874643874644, + "grad_norm": 0.6178240776062012, + "learning_rate": 0.00017280634571930153, + "loss": 1.1277, + "step": 5416 + }, + { + "epoch": 0.9645655270655271, + 
"grad_norm": 0.5749034881591797, + "learning_rate": 0.0001727967496025948, + "loss": 1.245, + "step": 5417 + }, + { + "epoch": 0.9647435897435898, + "grad_norm": 0.5036978721618652, + "learning_rate": 0.00017278715205958694, + "loss": 1.3049, + "step": 5418 + }, + { + "epoch": 0.9649216524216524, + "grad_norm": 0.5593041777610779, + "learning_rate": 0.00017277755309046605, + "loss": 1.2304, + "step": 5419 + }, + { + "epoch": 0.9650997150997151, + "grad_norm": 0.5446555614471436, + "learning_rate": 0.0001727679526954202, + "loss": 0.732, + "step": 5420 + }, + { + "epoch": 0.9652777777777778, + "grad_norm": 0.6063070297241211, + "learning_rate": 0.00017275835087463747, + "loss": 1.3723, + "step": 5421 + }, + { + "epoch": 0.9654558404558404, + "grad_norm": 0.4994211792945862, + "learning_rate": 0.00017274874762830602, + "loss": 1.0505, + "step": 5422 + }, + { + "epoch": 0.9656339031339032, + "grad_norm": 0.49396973848342896, + "learning_rate": 0.00017273914295661395, + "loss": 0.8691, + "step": 5423 + }, + { + "epoch": 0.9658119658119658, + "grad_norm": 0.5067027807235718, + "learning_rate": 0.0001727295368597495, + "loss": 0.9744, + "step": 5424 + }, + { + "epoch": 0.9659900284900285, + "grad_norm": 0.6720643043518066, + "learning_rate": 0.00017271992933790085, + "loss": 1.1513, + "step": 5425 + }, + { + "epoch": 0.9661680911680912, + "grad_norm": 0.5494341254234314, + "learning_rate": 0.00017271032039125624, + "loss": 0.8295, + "step": 5426 + }, + { + "epoch": 0.9663461538461539, + "grad_norm": 0.644332230091095, + "learning_rate": 0.00017270071002000394, + "loss": 1.0043, + "step": 5427 + }, + { + "epoch": 0.9665242165242165, + "grad_norm": 0.5658500790596008, + "learning_rate": 0.00017269109822433225, + "loss": 1.2575, + "step": 5428 + }, + { + "epoch": 0.9667022792022792, + "grad_norm": 0.5163155794143677, + "learning_rate": 0.00017268148500442952, + "loss": 1.1391, + "step": 5429 + }, + { + "epoch": 0.9668803418803419, + "grad_norm": 0.5113703608512878, + 
"learning_rate": 0.00017267187036048404, + "loss": 1.0819, + "step": 5430 + }, + { + "epoch": 0.9670584045584045, + "grad_norm": 0.6339422464370728, + "learning_rate": 0.00017266225429268426, + "loss": 1.0733, + "step": 5431 + }, + { + "epoch": 0.9672364672364673, + "grad_norm": 0.5158288478851318, + "learning_rate": 0.0001726526368012185, + "loss": 0.9518, + "step": 5432 + }, + { + "epoch": 0.9674145299145299, + "grad_norm": 0.593717634677887, + "learning_rate": 0.00017264301788627527, + "loss": 0.9416, + "step": 5433 + }, + { + "epoch": 0.9675925925925926, + "grad_norm": 0.49593186378479004, + "learning_rate": 0.00017263339754804301, + "loss": 1.0307, + "step": 5434 + }, + { + "epoch": 0.9677706552706553, + "grad_norm": 0.44032949209213257, + "learning_rate": 0.00017262377578671024, + "loss": 0.7884, + "step": 5435 + }, + { + "epoch": 0.967948717948718, + "grad_norm": 0.513073742389679, + "learning_rate": 0.00017261415260246538, + "loss": 0.9797, + "step": 5436 + }, + { + "epoch": 0.9681267806267806, + "grad_norm": 0.5737422108650208, + "learning_rate": 0.0001726045279954971, + "loss": 1.0487, + "step": 5437 + }, + { + "epoch": 0.9683048433048433, + "grad_norm": 0.5385867953300476, + "learning_rate": 0.0001725949019659939, + "loss": 1.4166, + "step": 5438 + }, + { + "epoch": 0.968482905982906, + "grad_norm": 0.5224326848983765, + "learning_rate": 0.00017258527451414438, + "loss": 1.195, + "step": 5439 + }, + { + "epoch": 0.9686609686609686, + "grad_norm": 0.5305148363113403, + "learning_rate": 0.0001725756456401372, + "loss": 1.0301, + "step": 5440 + }, + { + "epoch": 0.9688390313390314, + "grad_norm": 0.532588005065918, + "learning_rate": 0.000172566015344161, + "loss": 1.1269, + "step": 5441 + }, + { + "epoch": 0.969017094017094, + "grad_norm": 0.5812515020370483, + "learning_rate": 0.0001725563836264045, + "loss": 1.1787, + "step": 5442 + }, + { + "epoch": 0.9691951566951567, + "grad_norm": 0.4962109327316284, + "learning_rate": 0.00017254675048705638, + 
"loss": 1.0639, + "step": 5443 + }, + { + "epoch": 0.9693732193732194, + "grad_norm": 0.5094883441925049, + "learning_rate": 0.00017253711592630534, + "loss": 1.0922, + "step": 5444 + }, + { + "epoch": 0.969551282051282, + "grad_norm": 0.5728049874305725, + "learning_rate": 0.00017252747994434025, + "loss": 1.1237, + "step": 5445 + }, + { + "epoch": 0.9697293447293447, + "grad_norm": 0.5406180620193481, + "learning_rate": 0.00017251784254134983, + "loss": 1.1161, + "step": 5446 + }, + { + "epoch": 0.9699074074074074, + "grad_norm": 0.5724552869796753, + "learning_rate": 0.00017250820371752292, + "loss": 1.2205, + "step": 5447 + }, + { + "epoch": 0.9700854700854701, + "grad_norm": 0.5698846578598022, + "learning_rate": 0.0001724985634730484, + "loss": 1.1472, + "step": 5448 + }, + { + "epoch": 0.9702635327635327, + "grad_norm": 0.5315805673599243, + "learning_rate": 0.0001724889218081151, + "loss": 1.0253, + "step": 5449 + }, + { + "epoch": 0.9704415954415955, + "grad_norm": 0.5970377326011658, + "learning_rate": 0.000172479278722912, + "loss": 1.3033, + "step": 5450 + }, + { + "epoch": 0.9706196581196581, + "grad_norm": 0.6149488687515259, + "learning_rate": 0.00017246963421762798, + "loss": 1.0689, + "step": 5451 + }, + { + "epoch": 0.9707977207977208, + "grad_norm": 0.4848574995994568, + "learning_rate": 0.00017245998829245202, + "loss": 0.8829, + "step": 5452 + }, + { + "epoch": 0.9709757834757835, + "grad_norm": 0.6073294281959534, + "learning_rate": 0.00017245034094757312, + "loss": 1.2378, + "step": 5453 + }, + { + "epoch": 0.9711538461538461, + "grad_norm": 0.6362034678459167, + "learning_rate": 0.00017244069218318026, + "loss": 1.3606, + "step": 5454 + }, + { + "epoch": 0.9713319088319088, + "grad_norm": 0.5353880524635315, + "learning_rate": 0.00017243104199946257, + "loss": 1.1288, + "step": 5455 + }, + { + "epoch": 0.9715099715099715, + "grad_norm": 0.5096352100372314, + "learning_rate": 0.00017242139039660902, + "loss": 1.0056, + "step": 5456 + }, + { + 
"epoch": 0.9716880341880342, + "grad_norm": 0.5086682438850403, + "learning_rate": 0.00017241173737480884, + "loss": 1.091, + "step": 5457 + }, + { + "epoch": 0.9718660968660968, + "grad_norm": 0.5034295320510864, + "learning_rate": 0.000172402082934251, + "loss": 0.9749, + "step": 5458 + }, + { + "epoch": 0.9720441595441596, + "grad_norm": 0.5205379724502563, + "learning_rate": 0.0001723924270751248, + "loss": 1.1068, + "step": 5459 + }, + { + "epoch": 0.9722222222222222, + "grad_norm": 0.5904826521873474, + "learning_rate": 0.00017238276979761937, + "loss": 1.0613, + "step": 5460 + }, + { + "epoch": 0.9724002849002849, + "grad_norm": 0.6415045261383057, + "learning_rate": 0.0001723731111019239, + "loss": 1.2126, + "step": 5461 + }, + { + "epoch": 0.9725783475783476, + "grad_norm": 0.5769147872924805, + "learning_rate": 0.0001723634509882277, + "loss": 1.337, + "step": 5462 + }, + { + "epoch": 0.9727564102564102, + "grad_norm": 0.5585111975669861, + "learning_rate": 0.00017235378945671998, + "loss": 1.3922, + "step": 5463 + }, + { + "epoch": 0.9729344729344729, + "grad_norm": 0.5788411498069763, + "learning_rate": 0.00017234412650759008, + "loss": 0.8532, + "step": 5464 + }, + { + "epoch": 0.9731125356125356, + "grad_norm": 0.5617673397064209, + "learning_rate": 0.00017233446214102728, + "loss": 1.2575, + "step": 5465 + }, + { + "epoch": 0.9732905982905983, + "grad_norm": 0.4227815568447113, + "learning_rate": 0.00017232479635722093, + "loss": 1.0618, + "step": 5466 + }, + { + "epoch": 0.9734686609686609, + "grad_norm": 0.49751797318458557, + "learning_rate": 0.00017231512915636047, + "loss": 0.7714, + "step": 5467 + }, + { + "epoch": 0.9736467236467237, + "grad_norm": 0.5983800292015076, + "learning_rate": 0.0001723054605386353, + "loss": 1.2297, + "step": 5468 + }, + { + "epoch": 0.9738247863247863, + "grad_norm": 0.543394923210144, + "learning_rate": 0.0001722957905042348, + "loss": 1.0078, + "step": 5469 + }, + { + "epoch": 0.9740028490028491, + "grad_norm": 
0.5633566975593567, + "learning_rate": 0.00017228611905334846, + "loss": 1.0938, + "step": 5470 + }, + { + "epoch": 0.9741809116809117, + "grad_norm": 0.49377235770225525, + "learning_rate": 0.00017227644618616578, + "loss": 1.096, + "step": 5471 + }, + { + "epoch": 0.9743589743589743, + "grad_norm": 0.4963362216949463, + "learning_rate": 0.00017226677190287627, + "loss": 1.0003, + "step": 5472 + }, + { + "epoch": 0.9745370370370371, + "grad_norm": 0.4483006000518799, + "learning_rate": 0.00017225709620366953, + "loss": 0.8623, + "step": 5473 + }, + { + "epoch": 0.9747150997150997, + "grad_norm": 0.5429352521896362, + "learning_rate": 0.00017224741908873506, + "loss": 1.1383, + "step": 5474 + }, + { + "epoch": 0.9748931623931624, + "grad_norm": 0.5871657729148865, + "learning_rate": 0.0001722377405582625, + "loss": 1.2005, + "step": 5475 + }, + { + "epoch": 0.9750712250712251, + "grad_norm": 0.6002383828163147, + "learning_rate": 0.0001722280606124415, + "loss": 1.0696, + "step": 5476 + }, + { + "epoch": 0.9752492877492878, + "grad_norm": 0.5351617336273193, + "learning_rate": 0.00017221837925146164, + "loss": 1.243, + "step": 5477 + }, + { + "epoch": 0.9754273504273504, + "grad_norm": 0.46613118052482605, + "learning_rate": 0.00017220869647551268, + "loss": 1.0344, + "step": 5478 + }, + { + "epoch": 0.9756054131054132, + "grad_norm": 0.6015593409538269, + "learning_rate": 0.00017219901228478432, + "loss": 1.082, + "step": 5479 + }, + { + "epoch": 0.9757834757834758, + "grad_norm": 0.5829521417617798, + "learning_rate": 0.0001721893266794663, + "loss": 0.8683, + "step": 5480 + }, + { + "epoch": 0.9759615384615384, + "grad_norm": 0.6344960927963257, + "learning_rate": 0.00017217963965974838, + "loss": 1.1048, + "step": 5481 + }, + { + "epoch": 0.9761396011396012, + "grad_norm": 0.5586308240890503, + "learning_rate": 0.00017216995122582034, + "loss": 0.9657, + "step": 5482 + }, + { + "epoch": 0.9763176638176638, + "grad_norm": 0.48625239729881287, + "learning_rate": 
0.00017216026137787204, + "loss": 1.1026, + "step": 5483 + }, + { + "epoch": 0.9764957264957265, + "grad_norm": 0.5625223517417908, + "learning_rate": 0.00017215057011609332, + "loss": 1.1579, + "step": 5484 + }, + { + "epoch": 0.9766737891737892, + "grad_norm": 0.6016653776168823, + "learning_rate": 0.0001721408774406741, + "loss": 1.1777, + "step": 5485 + }, + { + "epoch": 0.9768518518518519, + "grad_norm": 0.5444921851158142, + "learning_rate": 0.00017213118335180418, + "loss": 1.119, + "step": 5486 + }, + { + "epoch": 0.9770299145299145, + "grad_norm": 0.5574755668640137, + "learning_rate": 0.0001721214878496736, + "loss": 1.1128, + "step": 5487 + }, + { + "epoch": 0.9772079772079773, + "grad_norm": 0.5486113429069519, + "learning_rate": 0.00017211179093447226, + "loss": 1.1673, + "step": 5488 + }, + { + "epoch": 0.9773860398860399, + "grad_norm": 0.5545483231544495, + "learning_rate": 0.00017210209260639018, + "loss": 1.1748, + "step": 5489 + }, + { + "epoch": 0.9775641025641025, + "grad_norm": 0.5756667256355286, + "learning_rate": 0.0001720923928656174, + "loss": 1.2377, + "step": 5490 + }, + { + "epoch": 0.9777421652421653, + "grad_norm": 0.5744972229003906, + "learning_rate": 0.00017208269171234392, + "loss": 1.1242, + "step": 5491 + }, + { + "epoch": 0.9779202279202279, + "grad_norm": 0.6109468340873718, + "learning_rate": 0.00017207298914675984, + "loss": 1.1948, + "step": 5492 + }, + { + "epoch": 0.9780982905982906, + "grad_norm": 0.5195167660713196, + "learning_rate": 0.00017206328516905525, + "loss": 1.0941, + "step": 5493 + }, + { + "epoch": 0.9782763532763533, + "grad_norm": 0.5549042224884033, + "learning_rate": 0.0001720535797794203, + "loss": 1.1503, + "step": 5494 + }, + { + "epoch": 0.978454415954416, + "grad_norm": 0.6317743062973022, + "learning_rate": 0.0001720438729780451, + "loss": 1.3468, + "step": 5495 + }, + { + "epoch": 0.9786324786324786, + "grad_norm": 0.5932528972625732, + "learning_rate": 0.0001720341647651199, + "loss": 1.105, + 
"step": 5496 + }, + { + "epoch": 0.9788105413105413, + "grad_norm": 0.607880175113678, + "learning_rate": 0.00017202445514083488, + "loss": 1.1465, + "step": 5497 + }, + { + "epoch": 0.978988603988604, + "grad_norm": 0.49227309226989746, + "learning_rate": 0.00017201474410538027, + "loss": 0.9075, + "step": 5498 + }, + { + "epoch": 0.9791666666666666, + "grad_norm": 0.5059443116188049, + "learning_rate": 0.00017200503165894636, + "loss": 1.0483, + "step": 5499 + }, + { + "epoch": 0.9793447293447294, + "grad_norm": 0.5792799592018127, + "learning_rate": 0.0001719953178017234, + "loss": 1.0987, + "step": 5500 + }, + { + "epoch": 0.979522792022792, + "grad_norm": 0.5010457038879395, + "learning_rate": 0.00017198560253390177, + "loss": 1.1051, + "step": 5501 + }, + { + "epoch": 0.9797008547008547, + "grad_norm": 0.5866543054580688, + "learning_rate": 0.0001719758858556718, + "loss": 1.2824, + "step": 5502 + }, + { + "epoch": 0.9798789173789174, + "grad_norm": 0.5392137169837952, + "learning_rate": 0.00017196616776722382, + "loss": 0.886, + "step": 5503 + }, + { + "epoch": 0.98005698005698, + "grad_norm": 0.5200899839401245, + "learning_rate": 0.00017195644826874834, + "loss": 1.1504, + "step": 5504 + }, + { + "epoch": 0.9802350427350427, + "grad_norm": 0.533159077167511, + "learning_rate": 0.00017194672736043569, + "loss": 1.1216, + "step": 5505 + }, + { + "epoch": 0.9804131054131054, + "grad_norm": 0.5543524622917175, + "learning_rate": 0.0001719370050424764, + "loss": 1.0161, + "step": 5506 + }, + { + "epoch": 0.9805911680911681, + "grad_norm": 0.5315365195274353, + "learning_rate": 0.00017192728131506092, + "loss": 1.0509, + "step": 5507 + }, + { + "epoch": 0.9807692307692307, + "grad_norm": 0.5406147837638855, + "learning_rate": 0.00017191755617837977, + "loss": 1.0695, + "step": 5508 + }, + { + "epoch": 0.9809472934472935, + "grad_norm": 0.4563386142253876, + "learning_rate": 0.00017190782963262354, + "loss": 0.995, + "step": 5509 + }, + { + "epoch": 
0.9811253561253561, + "grad_norm": 0.5456405282020569, + "learning_rate": 0.00017189810167798274, + "loss": 1.0546, + "step": 5510 + }, + { + "epoch": 0.9813034188034188, + "grad_norm": 0.6275575160980225, + "learning_rate": 0.00017188837231464795, + "loss": 1.0432, + "step": 5511 + }, + { + "epoch": 0.9814814814814815, + "grad_norm": 0.49735602736473083, + "learning_rate": 0.0001718786415428099, + "loss": 1.035, + "step": 5512 + }, + { + "epoch": 0.9816595441595442, + "grad_norm": 0.5234259963035583, + "learning_rate": 0.00017186890936265916, + "loss": 1.0918, + "step": 5513 + }, + { + "epoch": 0.9818376068376068, + "grad_norm": 0.5091170072555542, + "learning_rate": 0.00017185917577438643, + "loss": 1.0239, + "step": 5514 + }, + { + "epoch": 0.9820156695156695, + "grad_norm": 0.6155703067779541, + "learning_rate": 0.00017184944077818244, + "loss": 1.2366, + "step": 5515 + }, + { + "epoch": 0.9821937321937322, + "grad_norm": 0.5074070692062378, + "learning_rate": 0.0001718397043742379, + "loss": 1.0318, + "step": 5516 + }, + { + "epoch": 0.9823717948717948, + "grad_norm": 0.5234423279762268, + "learning_rate": 0.0001718299665627436, + "loss": 1.0322, + "step": 5517 + }, + { + "epoch": 0.9825498575498576, + "grad_norm": 0.5783474445343018, + "learning_rate": 0.0001718202273438903, + "loss": 0.9486, + "step": 5518 + }, + { + "epoch": 0.9827279202279202, + "grad_norm": 0.5708683133125305, + "learning_rate": 0.00017181048671786886, + "loss": 1.0785, + "step": 5519 + }, + { + "epoch": 0.9829059829059829, + "grad_norm": 0.5985961556434631, + "learning_rate": 0.00017180074468487009, + "loss": 1.198, + "step": 5520 + }, + { + "epoch": 0.9830840455840456, + "grad_norm": 0.5711352229118347, + "learning_rate": 0.0001717910012450849, + "loss": 1.0386, + "step": 5521 + }, + { + "epoch": 0.9832621082621082, + "grad_norm": 0.5338063836097717, + "learning_rate": 0.00017178125639870416, + "loss": 1.1594, + "step": 5522 + }, + { + "epoch": 0.9834401709401709, + "grad_norm": 
0.6144943237304688, + "learning_rate": 0.00017177151014591881, + "loss": 1.1083, + "step": 5523 + }, + { + "epoch": 0.9836182336182336, + "grad_norm": 0.547285795211792, + "learning_rate": 0.00017176176248691983, + "loss": 1.1507, + "step": 5524 + }, + { + "epoch": 0.9837962962962963, + "grad_norm": 0.5807644724845886, + "learning_rate": 0.00017175201342189817, + "loss": 1.3044, + "step": 5525 + }, + { + "epoch": 0.9839743589743589, + "grad_norm": 0.5229477882385254, + "learning_rate": 0.00017174226295104485, + "loss": 1.2622, + "step": 5526 + }, + { + "epoch": 0.9841524216524217, + "grad_norm": 0.6100695133209229, + "learning_rate": 0.00017173251107455094, + "loss": 1.2026, + "step": 5527 + }, + { + "epoch": 0.9843304843304843, + "grad_norm": 0.5410884618759155, + "learning_rate": 0.00017172275779260744, + "loss": 1.2964, + "step": 5528 + }, + { + "epoch": 0.9845085470085471, + "grad_norm": 0.5937406420707703, + "learning_rate": 0.00017171300310540554, + "loss": 1.1435, + "step": 5529 + }, + { + "epoch": 0.9846866096866097, + "grad_norm": 0.56817227602005, + "learning_rate": 0.00017170324701313634, + "loss": 1.0099, + "step": 5530 + }, + { + "epoch": 0.9848646723646723, + "grad_norm": 0.5776323080062866, + "learning_rate": 0.00017169348951599092, + "loss": 1.3539, + "step": 5531 + }, + { + "epoch": 0.9850427350427351, + "grad_norm": 0.5208535194396973, + "learning_rate": 0.0001716837306141605, + "loss": 1.2306, + "step": 5532 + }, + { + "epoch": 0.9852207977207977, + "grad_norm": 0.552173376083374, + "learning_rate": 0.0001716739703078363, + "loss": 1.0551, + "step": 5533 + }, + { + "epoch": 0.9853988603988604, + "grad_norm": 0.5327515602111816, + "learning_rate": 0.00017166420859720955, + "loss": 1.2443, + "step": 5534 + }, + { + "epoch": 0.9855769230769231, + "grad_norm": 0.5255244374275208, + "learning_rate": 0.0001716544454824715, + "loss": 1.005, + "step": 5535 + }, + { + "epoch": 0.9857549857549858, + "grad_norm": 0.4753847122192383, + "learning_rate": 
0.00017164468096381343, + "loss": 1.0081, + "step": 5536 + }, + { + "epoch": 0.9859330484330484, + "grad_norm": 0.5261829495429993, + "learning_rate": 0.00017163491504142665, + "loss": 1.2249, + "step": 5537 + }, + { + "epoch": 0.9861111111111112, + "grad_norm": 0.46499499678611755, + "learning_rate": 0.00017162514771550255, + "loss": 0.8759, + "step": 5538 + }, + { + "epoch": 0.9862891737891738, + "grad_norm": 0.5233004689216614, + "learning_rate": 0.00017161537898623247, + "loss": 1.0474, + "step": 5539 + }, + { + "epoch": 0.9864672364672364, + "grad_norm": 0.46905553340911865, + "learning_rate": 0.00017160560885380778, + "loss": 0.9033, + "step": 5540 + }, + { + "epoch": 0.9866452991452992, + "grad_norm": 0.5816231369972229, + "learning_rate": 0.00017159583731841998, + "loss": 1.0628, + "step": 5541 + }, + { + "epoch": 0.9868233618233618, + "grad_norm": 0.4575413167476654, + "learning_rate": 0.00017158606438026045, + "loss": 1.0446, + "step": 5542 + }, + { + "epoch": 0.9870014245014245, + "grad_norm": 0.5968109965324402, + "learning_rate": 0.00017157629003952067, + "loss": 1.032, + "step": 5543 + }, + { + "epoch": 0.9871794871794872, + "grad_norm": 0.5316148400306702, + "learning_rate": 0.00017156651429639218, + "loss": 0.9167, + "step": 5544 + }, + { + "epoch": 0.9873575498575499, + "grad_norm": 0.5185125470161438, + "learning_rate": 0.00017155673715106651, + "loss": 1.1527, + "step": 5545 + }, + { + "epoch": 0.9875356125356125, + "grad_norm": 0.5167772769927979, + "learning_rate": 0.00017154695860373525, + "loss": 0.9954, + "step": 5546 + }, + { + "epoch": 0.9877136752136753, + "grad_norm": 0.6406680345535278, + "learning_rate": 0.00017153717865458994, + "loss": 1.2758, + "step": 5547 + }, + { + "epoch": 0.9878917378917379, + "grad_norm": 0.5223956108093262, + "learning_rate": 0.00017152739730382223, + "loss": 1.1526, + "step": 5548 + }, + { + "epoch": 0.9880698005698005, + "grad_norm": 0.6131790280342102, + "learning_rate": 0.00017151761455162375, + "loss": 
1.1024, + "step": 5549 + }, + { + "epoch": 0.9882478632478633, + "grad_norm": 0.5574753880500793, + "learning_rate": 0.00017150783039818616, + "loss": 0.9733, + "step": 5550 + }, + { + "epoch": 0.9884259259259259, + "grad_norm": 0.5417882800102234, + "learning_rate": 0.0001714980448437012, + "loss": 1.2244, + "step": 5551 + }, + { + "epoch": 0.9886039886039886, + "grad_norm": 0.6217474341392517, + "learning_rate": 0.0001714882578883606, + "loss": 0.9224, + "step": 5552 + }, + { + "epoch": 0.9887820512820513, + "grad_norm": 0.5846285223960876, + "learning_rate": 0.00017147846953235606, + "loss": 1.2429, + "step": 5553 + }, + { + "epoch": 0.988960113960114, + "grad_norm": 0.5924782752990723, + "learning_rate": 0.00017146867977587936, + "loss": 0.9907, + "step": 5554 + }, + { + "epoch": 0.9891381766381766, + "grad_norm": 0.5756853818893433, + "learning_rate": 0.00017145888861912242, + "loss": 1.1266, + "step": 5555 + }, + { + "epoch": 0.9893162393162394, + "grad_norm": 0.5277376770973206, + "learning_rate": 0.00017144909606227693, + "loss": 1.1676, + "step": 5556 + }, + { + "epoch": 0.989494301994302, + "grad_norm": 0.5138902068138123, + "learning_rate": 0.00017143930210553485, + "loss": 0.9864, + "step": 5557 + }, + { + "epoch": 0.9896723646723646, + "grad_norm": 0.8072507977485657, + "learning_rate": 0.00017142950674908805, + "loss": 1.111, + "step": 5558 + }, + { + "epoch": 0.9898504273504274, + "grad_norm": 0.5641721487045288, + "learning_rate": 0.00017141970999312844, + "loss": 0.9106, + "step": 5559 + }, + { + "epoch": 0.99002849002849, + "grad_norm": 0.5260798931121826, + "learning_rate": 0.000171409911837848, + "loss": 1.1609, + "step": 5560 + }, + { + "epoch": 0.9902065527065527, + "grad_norm": 0.5398530960083008, + "learning_rate": 0.00017140011228343864, + "loss": 1.0368, + "step": 5561 + }, + { + "epoch": 0.9903846153846154, + "grad_norm": 0.6011313199996948, + "learning_rate": 0.00017139031133009245, + "loss": 1.1314, + "step": 5562 + }, + { + "epoch": 
0.9905626780626781, + "grad_norm": 0.6194971203804016, + "learning_rate": 0.00017138050897800135, + "loss": 1.3493, + "step": 5563 + }, + { + "epoch": 0.9907407407407407, + "grad_norm": 0.5779356956481934, + "learning_rate": 0.0001713707052273575, + "loss": 0.943, + "step": 5564 + }, + { + "epoch": 0.9909188034188035, + "grad_norm": 0.5321127772331238, + "learning_rate": 0.00017136090007835293, + "loss": 0.7914, + "step": 5565 + }, + { + "epoch": 0.9910968660968661, + "grad_norm": 0.5470426678657532, + "learning_rate": 0.00017135109353117977, + "loss": 1.2113, + "step": 5566 + }, + { + "epoch": 0.9912749287749287, + "grad_norm": 0.5551436543464661, + "learning_rate": 0.00017134128558603012, + "loss": 0.8932, + "step": 5567 + }, + { + "epoch": 0.9914529914529915, + "grad_norm": 0.45770928263664246, + "learning_rate": 0.0001713314762430962, + "loss": 1.0061, + "step": 5568 + }, + { + "epoch": 0.9916310541310541, + "grad_norm": 0.5578967332839966, + "learning_rate": 0.00017132166550257017, + "loss": 1.148, + "step": 5569 + }, + { + "epoch": 0.9918091168091168, + "grad_norm": 0.5086452960968018, + "learning_rate": 0.0001713118533646443, + "loss": 0.9803, + "step": 5570 + }, + { + "epoch": 0.9919871794871795, + "grad_norm": 0.4714745879173279, + "learning_rate": 0.00017130203982951078, + "loss": 1.0176, + "step": 5571 + }, + { + "epoch": 0.9921652421652422, + "grad_norm": 0.6254406571388245, + "learning_rate": 0.0001712922248973619, + "loss": 1.0932, + "step": 5572 + }, + { + "epoch": 0.9923433048433048, + "grad_norm": 0.5005003809928894, + "learning_rate": 0.00017128240856838998, + "loss": 1.0783, + "step": 5573 + }, + { + "epoch": 0.9925213675213675, + "grad_norm": 0.5668206214904785, + "learning_rate": 0.00017127259084278733, + "loss": 1.0404, + "step": 5574 + }, + { + "epoch": 0.9926994301994302, + "grad_norm": 0.4976036250591278, + "learning_rate": 0.00017126277172074632, + "loss": 1.1437, + "step": 5575 + }, + { + "epoch": 0.9928774928774928, + "grad_norm": 
0.567546546459198, + "learning_rate": 0.00017125295120245935, + "loss": 1.2188, + "step": 5576 + }, + { + "epoch": 0.9930555555555556, + "grad_norm": 0.5614372491836548, + "learning_rate": 0.0001712431292881188, + "loss": 0.9187, + "step": 5577 + }, + { + "epoch": 0.9932336182336182, + "grad_norm": 0.6117973327636719, + "learning_rate": 0.00017123330597791712, + "loss": 1.1285, + "step": 5578 + }, + { + "epoch": 0.9934116809116809, + "grad_norm": 0.6000342965126038, + "learning_rate": 0.00017122348127204676, + "loss": 0.9837, + "step": 5579 + }, + { + "epoch": 0.9935897435897436, + "grad_norm": 0.5453050136566162, + "learning_rate": 0.0001712136551707003, + "loss": 0.8771, + "step": 5580 + }, + { + "epoch": 0.9937678062678063, + "grad_norm": 0.49603891372680664, + "learning_rate": 0.00017120382767407018, + "loss": 1.0754, + "step": 5581 + }, + { + "epoch": 0.9939458689458689, + "grad_norm": 0.48031488060951233, + "learning_rate": 0.00017119399878234894, + "loss": 0.6933, + "step": 5582 + }, + { + "epoch": 0.9941239316239316, + "grad_norm": 0.6048742532730103, + "learning_rate": 0.0001711841684957292, + "loss": 0.9696, + "step": 5583 + }, + { + "epoch": 0.9943019943019943, + "grad_norm": 0.5183123350143433, + "learning_rate": 0.00017117433681440355, + "loss": 1.1313, + "step": 5584 + }, + { + "epoch": 0.9944800569800569, + "grad_norm": 0.504916250705719, + "learning_rate": 0.00017116450373856466, + "loss": 1.0273, + "step": 5585 + }, + { + "epoch": 0.9946581196581197, + "grad_norm": 0.5804886817932129, + "learning_rate": 0.0001711546692684051, + "loss": 1.1162, + "step": 5586 + }, + { + "epoch": 0.9948361823361823, + "grad_norm": 0.5531938672065735, + "learning_rate": 0.0001711448334041176, + "loss": 1.2893, + "step": 5587 + }, + { + "epoch": 0.9950142450142451, + "grad_norm": 0.5079928636550903, + "learning_rate": 0.00017113499614589492, + "loss": 1.0393, + "step": 5588 + }, + { + "epoch": 0.9951923076923077, + "grad_norm": 0.5421964526176453, + "learning_rate": 
0.00017112515749392973, + "loss": 0.8844, + "step": 5589 + }, + { + "epoch": 0.9953703703703703, + "grad_norm": 0.4834558367729187, + "learning_rate": 0.00017111531744841486, + "loss": 1.0187, + "step": 5590 + }, + { + "epoch": 0.9955484330484331, + "grad_norm": 0.6704340577125549, + "learning_rate": 0.00017110547600954307, + "loss": 0.8524, + "step": 5591 + }, + { + "epoch": 0.9957264957264957, + "grad_norm": 0.4578927159309387, + "learning_rate": 0.00017109563317750718, + "loss": 1.059, + "step": 5592 + }, + { + "epoch": 0.9959045584045584, + "grad_norm": 0.5563494563102722, + "learning_rate": 0.00017108578895250006, + "loss": 1.1211, + "step": 5593 + }, + { + "epoch": 0.9960826210826211, + "grad_norm": 0.5272170901298523, + "learning_rate": 0.00017107594333471454, + "loss": 0.9224, + "step": 5594 + }, + { + "epoch": 0.9962606837606838, + "grad_norm": 0.5697501301765442, + "learning_rate": 0.00017106609632434357, + "loss": 1.2223, + "step": 5595 + }, + { + "epoch": 0.9964387464387464, + "grad_norm": 0.5385653376579285, + "learning_rate": 0.00017105624792158007, + "loss": 1.0809, + "step": 5596 + }, + { + "epoch": 0.9966168091168092, + "grad_norm": 0.5608006119728088, + "learning_rate": 0.000171046398126617, + "loss": 1.3936, + "step": 5597 + }, + { + "epoch": 0.9967948717948718, + "grad_norm": 0.5063132643699646, + "learning_rate": 0.00017103654693964736, + "loss": 1.2086, + "step": 5598 + }, + { + "epoch": 0.9969729344729344, + "grad_norm": 0.6014235019683838, + "learning_rate": 0.00017102669436086415, + "loss": 1.1231, + "step": 5599 + }, + { + "epoch": 0.9971509971509972, + "grad_norm": 0.49549567699432373, + "learning_rate": 0.00017101684039046036, + "loss": 1.0013, + "step": 5600 + }, + { + "epoch": 0.9973290598290598, + "grad_norm": 0.517464816570282, + "learning_rate": 0.00017100698502862916, + "loss": 1.1143, + "step": 5601 + }, + { + "epoch": 0.9975071225071225, + "grad_norm": 0.514281153678894, + "learning_rate": 0.00017099712827556358, + "loss": 
1.0336, + "step": 5602 + }, + { + "epoch": 0.9976851851851852, + "grad_norm": 0.5378567576408386, + "learning_rate": 0.00017098727013145672, + "loss": 0.8278, + "step": 5603 + }, + { + "epoch": 0.9978632478632479, + "grad_norm": 0.5098404884338379, + "learning_rate": 0.0001709774105965018, + "loss": 0.9902, + "step": 5604 + }, + { + "epoch": 0.9980413105413105, + "grad_norm": 0.6231759190559387, + "learning_rate": 0.00017096754967089198, + "loss": 1.0564, + "step": 5605 + }, + { + "epoch": 0.9982193732193733, + "grad_norm": 0.47434380650520325, + "learning_rate": 0.00017095768735482042, + "loss": 0.7457, + "step": 5606 + }, + { + "epoch": 0.9983974358974359, + "grad_norm": 0.5771013498306274, + "learning_rate": 0.00017094782364848035, + "loss": 1.1191, + "step": 5607 + }, + { + "epoch": 0.9985754985754985, + "grad_norm": 0.5617234706878662, + "learning_rate": 0.00017093795855206508, + "loss": 1.0779, + "step": 5608 + }, + { + "epoch": 0.9987535612535613, + "grad_norm": 0.6573554873466492, + "learning_rate": 0.00017092809206576792, + "loss": 1.0191, + "step": 5609 + }, + { + "epoch": 0.9989316239316239, + "grad_norm": 0.482834130525589, + "learning_rate": 0.00017091822418978207, + "loss": 1.0119, + "step": 5610 + }, + { + "epoch": 0.9991096866096866, + "grad_norm": 0.47496405243873596, + "learning_rate": 0.000170908354924301, + "loss": 0.8297, + "step": 5611 + }, + { + "epoch": 0.9992877492877493, + "grad_norm": 0.5013265013694763, + "learning_rate": 0.00017089848426951796, + "loss": 1.1511, + "step": 5612 + }, + { + "epoch": 0.999465811965812, + "grad_norm": 0.5402522683143616, + "learning_rate": 0.00017088861222562643, + "loss": 1.1401, + "step": 5613 + }, + { + "epoch": 0.9996438746438746, + "grad_norm": 0.546302318572998, + "learning_rate": 0.00017087873879281977, + "loss": 0.8611, + "step": 5614 + }, + { + "epoch": 0.9998219373219374, + "grad_norm": 0.44279807806015015, + "learning_rate": 0.0001708688639712915, + "loss": 0.79, + "step": 5615 + }, + { + "epoch": 
1.0, + "grad_norm": 0.5514659285545349, + "learning_rate": 0.00017085898776123502, + "loss": 1.0709, + "step": 5616 + }, + { + "epoch": 1.0, + "eval_loss": 1.093075156211853, + "eval_runtime": 24.6155, + "eval_samples_per_second": 42.29, + "eval_steps_per_second": 21.166, + "step": 5616 + }, + { + "epoch": 1.0001780626780628, + "grad_norm": 0.6290156841278076, + "learning_rate": 0.0001708491101628439, + "loss": 1.1786, + "step": 5617 + }, + { + "epoch": 1.0001780626780628, + "grad_norm": 0.4703841209411621, + "learning_rate": 0.00017083923117631162, + "loss": 0.9548, + "step": 5618 + }, + { + "epoch": 1.0003561253561253, + "grad_norm": 0.4518105089664459, + "learning_rate": 0.0001708293508018318, + "loss": 1.0089, + "step": 5619 + }, + { + "epoch": 1.000534188034188, + "grad_norm": 0.5658619403839111, + "learning_rate": 0.00017081946903959794, + "loss": 0.9466, + "step": 5620 + }, + { + "epoch": 1.0007122507122508, + "grad_norm": 0.6153838634490967, + "learning_rate": 0.00017080958588980372, + "loss": 1.2898, + "step": 5621 + }, + { + "epoch": 1.0008903133903133, + "grad_norm": 0.5245628952980042, + "learning_rate": 0.00017079970135264275, + "loss": 1.1702, + "step": 5622 + }, + { + "epoch": 1.001068376068376, + "grad_norm": 0.5291880965232849, + "learning_rate": 0.00017078981542830875, + "loss": 1.0779, + "step": 5623 + }, + { + "epoch": 1.0012464387464388, + "grad_norm": 0.500579297542572, + "learning_rate": 0.0001707799281169953, + "loss": 0.9587, + "step": 5624 + }, + { + "epoch": 1.0014245014245013, + "grad_norm": 0.45739707350730896, + "learning_rate": 0.00017077003941889625, + "loss": 0.9373, + "step": 5625 + }, + { + "epoch": 1.001602564102564, + "grad_norm": 0.5513401031494141, + "learning_rate": 0.00017076014933420526, + "loss": 1.0368, + "step": 5626 + }, + { + "epoch": 1.0017806267806268, + "grad_norm": 0.46513232588768005, + "learning_rate": 0.00017075025786311612, + "loss": 0.9422, + "step": 5627 + }, + { + "epoch": 1.0019586894586894, + "grad_norm": 
0.4530394673347473, + "learning_rate": 0.00017074036500582267, + "loss": 0.8211, + "step": 5628 + }, + { + "epoch": 1.0021367521367521, + "grad_norm": 0.5612013339996338, + "learning_rate": 0.00017073047076251872, + "loss": 0.9466, + "step": 5629 + }, + { + "epoch": 1.0023148148148149, + "grad_norm": 0.4976879954338074, + "learning_rate": 0.00017072057513339812, + "loss": 0.8059, + "step": 5630 + }, + { + "epoch": 1.0024928774928774, + "grad_norm": 0.4842833876609802, + "learning_rate": 0.00017071067811865476, + "loss": 0.6554, + "step": 5631 + }, + { + "epoch": 1.0026709401709402, + "grad_norm": 0.5446373224258423, + "learning_rate": 0.00017070077971848257, + "loss": 1.1001, + "step": 5632 + }, + { + "epoch": 1.002849002849003, + "grad_norm": 0.5996584892272949, + "learning_rate": 0.00017069087993307544, + "loss": 1.0317, + "step": 5633 + }, + { + "epoch": 1.0030270655270654, + "grad_norm": 0.5369443297386169, + "learning_rate": 0.00017068097876262738, + "loss": 0.8019, + "step": 5634 + }, + { + "epoch": 1.0032051282051282, + "grad_norm": 0.4985966682434082, + "learning_rate": 0.00017067107620733236, + "loss": 1.0121, + "step": 5635 + }, + { + "epoch": 1.003383190883191, + "grad_norm": 0.5262824892997742, + "learning_rate": 0.0001706611722673844, + "loss": 1.0157, + "step": 5636 + }, + { + "epoch": 1.0035612535612535, + "grad_norm": 0.5912795066833496, + "learning_rate": 0.00017065126694297756, + "loss": 1.0327, + "step": 5637 + }, + { + "epoch": 1.0037393162393162, + "grad_norm": 0.5866343379020691, + "learning_rate": 0.00017064136023430595, + "loss": 1.1194, + "step": 5638 + }, + { + "epoch": 1.003917378917379, + "grad_norm": 0.5009918808937073, + "learning_rate": 0.0001706314521415636, + "loss": 1.0467, + "step": 5639 + }, + { + "epoch": 1.0040954415954415, + "grad_norm": 0.5455304384231567, + "learning_rate": 0.00017062154266494464, + "loss": 0.8749, + "step": 5640 + }, + { + "epoch": 1.0042735042735043, + "grad_norm": 0.5648258328437805, + "learning_rate": 
0.00017061163180464328, + "loss": 0.9408, + "step": 5641 + }, + { + "epoch": 1.004451566951567, + "grad_norm": 0.5276365876197815, + "learning_rate": 0.00017060171956085368, + "loss": 0.9681, + "step": 5642 + }, + { + "epoch": 1.0046296296296295, + "grad_norm": 0.5212745070457458, + "learning_rate": 0.00017059180593377007, + "loss": 0.9188, + "step": 5643 + }, + { + "epoch": 1.0048076923076923, + "grad_norm": 0.540626585483551, + "learning_rate": 0.00017058189092358664, + "loss": 1.0809, + "step": 5644 + }, + { + "epoch": 1.004985754985755, + "grad_norm": 0.5592377781867981, + "learning_rate": 0.00017057197453049767, + "loss": 0.8589, + "step": 5645 + }, + { + "epoch": 1.0051638176638176, + "grad_norm": 0.5115051865577698, + "learning_rate": 0.00017056205675469746, + "loss": 0.8006, + "step": 5646 + }, + { + "epoch": 1.0053418803418803, + "grad_norm": 0.5031117796897888, + "learning_rate": 0.00017055213759638034, + "loss": 0.9242, + "step": 5647 + }, + { + "epoch": 1.005519943019943, + "grad_norm": 0.5342774987220764, + "learning_rate": 0.00017054221705574066, + "loss": 0.8268, + "step": 5648 + }, + { + "epoch": 1.0056980056980056, + "grad_norm": 0.44480493664741516, + "learning_rate": 0.00017053229513297276, + "loss": 0.6892, + "step": 5649 + }, + { + "epoch": 1.0058760683760684, + "grad_norm": 0.5032621622085571, + "learning_rate": 0.00017052237182827105, + "loss": 0.971, + "step": 5650 + }, + { + "epoch": 1.006054131054131, + "grad_norm": 0.5611015558242798, + "learning_rate": 0.00017051244714182996, + "loss": 0.9403, + "step": 5651 + }, + { + "epoch": 1.0062321937321936, + "grad_norm": 0.5064613223075867, + "learning_rate": 0.00017050252107384393, + "loss": 0.9718, + "step": 5652 + }, + { + "epoch": 1.0064102564102564, + "grad_norm": 0.6458395719528198, + "learning_rate": 0.0001704925936245075, + "loss": 1.1161, + "step": 5653 + }, + { + "epoch": 1.0065883190883191, + "grad_norm": 0.527418315410614, + "learning_rate": 0.00017048266479401512, + "loss": 0.9315, + 
"step": 5654 + }, + { + "epoch": 1.0067663817663817, + "grad_norm": 0.5127941370010376, + "learning_rate": 0.00017047273458256133, + "loss": 0.8206, + "step": 5655 + }, + { + "epoch": 1.0069444444444444, + "grad_norm": 0.6257100105285645, + "learning_rate": 0.00017046280299034067, + "loss": 0.9854, + "step": 5656 + }, + { + "epoch": 1.0071225071225072, + "grad_norm": 0.5081700682640076, + "learning_rate": 0.0001704528700175478, + "loss": 0.9478, + "step": 5657 + }, + { + "epoch": 1.0073005698005697, + "grad_norm": 0.598127543926239, + "learning_rate": 0.00017044293566437725, + "loss": 1.0721, + "step": 5658 + }, + { + "epoch": 1.0074786324786325, + "grad_norm": 0.5429877638816833, + "learning_rate": 0.00017043299993102376, + "loss": 0.9732, + "step": 5659 + }, + { + "epoch": 1.0076566951566952, + "grad_norm": 0.6006619334220886, + "learning_rate": 0.00017042306281768194, + "loss": 1.1262, + "step": 5660 + }, + { + "epoch": 1.0078347578347577, + "grad_norm": 0.48933324217796326, + "learning_rate": 0.00017041312432454646, + "loss": 0.8596, + "step": 5661 + }, + { + "epoch": 1.0080128205128205, + "grad_norm": 0.5902166366577148, + "learning_rate": 0.0001704031844518121, + "loss": 1.1035, + "step": 5662 + }, + { + "epoch": 1.0081908831908832, + "grad_norm": 0.523597776889801, + "learning_rate": 0.0001703932431996736, + "loss": 0.7117, + "step": 5663 + }, + { + "epoch": 1.0083689458689458, + "grad_norm": 0.6313928365707397, + "learning_rate": 0.00017038330056832573, + "loss": 1.0204, + "step": 5664 + }, + { + "epoch": 1.0085470085470085, + "grad_norm": 0.5627471804618835, + "learning_rate": 0.00017037335655796328, + "loss": 0.7648, + "step": 5665 + }, + { + "epoch": 1.0087250712250713, + "grad_norm": 0.5817851424217224, + "learning_rate": 0.0001703634111687811, + "loss": 1.0452, + "step": 5666 + }, + { + "epoch": 1.0089031339031338, + "grad_norm": 0.5143535137176514, + "learning_rate": 0.00017035346440097407, + "loss": 0.9788, + "step": 5667 + }, + { + "epoch": 
1.0090811965811965, + "grad_norm": 0.5331187844276428, + "learning_rate": 0.000170343516254737, + "loss": 0.7584, + "step": 5668 + }, + { + "epoch": 1.0092592592592593, + "grad_norm": 0.5723634362220764, + "learning_rate": 0.00017033356673026487, + "loss": 0.9435, + "step": 5669 + }, + { + "epoch": 1.0094373219373218, + "grad_norm": 0.6012297868728638, + "learning_rate": 0.00017032361582775265, + "loss": 1.142, + "step": 5670 + }, + { + "epoch": 1.0096153846153846, + "grad_norm": 0.6161282658576965, + "learning_rate": 0.00017031366354739523, + "loss": 1.2823, + "step": 5671 + }, + { + "epoch": 1.0097934472934473, + "grad_norm": 0.5088054537773132, + "learning_rate": 0.00017030370988938763, + "loss": 0.9743, + "step": 5672 + }, + { + "epoch": 1.0099715099715099, + "grad_norm": 0.512003481388092, + "learning_rate": 0.0001702937548539249, + "loss": 0.9112, + "step": 5673 + }, + { + "epoch": 1.0101495726495726, + "grad_norm": 0.5565149784088135, + "learning_rate": 0.00017028379844120207, + "loss": 1.0074, + "step": 5674 + }, + { + "epoch": 1.0103276353276354, + "grad_norm": 0.6463099718093872, + "learning_rate": 0.00017027384065141418, + "loss": 1.175, + "step": 5675 + }, + { + "epoch": 1.010505698005698, + "grad_norm": 0.46999064087867737, + "learning_rate": 0.00017026388148475637, + "loss": 0.8429, + "step": 5676 + }, + { + "epoch": 1.0106837606837606, + "grad_norm": 0.5617384910583496, + "learning_rate": 0.00017025392094142377, + "loss": 1.045, + "step": 5677 + }, + { + "epoch": 1.0108618233618234, + "grad_norm": 0.5156623721122742, + "learning_rate": 0.00017024395902161154, + "loss": 1.016, + "step": 5678 + }, + { + "epoch": 1.0110398860398861, + "grad_norm": 0.5693390369415283, + "learning_rate": 0.00017023399572551484, + "loss": 0.8616, + "step": 5679 + }, + { + "epoch": 1.0112179487179487, + "grad_norm": 0.5234879851341248, + "learning_rate": 0.00017022403105332892, + "loss": 0.9244, + "step": 5680 + }, + { + "epoch": 1.0113960113960114, + "grad_norm": 
0.6513097286224365, + "learning_rate": 0.00017021406500524893, + "loss": 0.9565, + "step": 5681 + }, + { + "epoch": 1.0115740740740742, + "grad_norm": 0.5788878202438354, + "learning_rate": 0.00017020409758147022, + "loss": 0.8994, + "step": 5682 + }, + { + "epoch": 1.0117521367521367, + "grad_norm": 0.5495247840881348, + "learning_rate": 0.00017019412878218807, + "loss": 0.9371, + "step": 5683 + }, + { + "epoch": 1.0119301994301995, + "grad_norm": 0.639045238494873, + "learning_rate": 0.00017018415860759777, + "loss": 1.0297, + "step": 5684 + }, + { + "epoch": 1.0121082621082622, + "grad_norm": 0.5167784690856934, + "learning_rate": 0.0001701741870578947, + "loss": 0.8974, + "step": 5685 + }, + { + "epoch": 1.0122863247863247, + "grad_norm": 0.6131011247634888, + "learning_rate": 0.00017016421413327417, + "loss": 1.13, + "step": 5686 + }, + { + "epoch": 1.0124643874643875, + "grad_norm": 0.4804688096046448, + "learning_rate": 0.00017015423983393166, + "loss": 1.0098, + "step": 5687 + }, + { + "epoch": 1.0126424501424502, + "grad_norm": 0.6605221629142761, + "learning_rate": 0.00017014426416006253, + "loss": 1.1123, + "step": 5688 + }, + { + "epoch": 1.0128205128205128, + "grad_norm": 0.5523666739463806, + "learning_rate": 0.00017013428711186226, + "loss": 0.8226, + "step": 5689 + }, + { + "epoch": 1.0129985754985755, + "grad_norm": 0.6012941598892212, + "learning_rate": 0.00017012430868952632, + "loss": 0.8915, + "step": 5690 + }, + { + "epoch": 1.0131766381766383, + "grad_norm": 0.5830875039100647, + "learning_rate": 0.00017011432889325022, + "loss": 1.021, + "step": 5691 + }, + { + "epoch": 1.0133547008547008, + "grad_norm": 0.5546056032180786, + "learning_rate": 0.0001701043477232295, + "loss": 0.7656, + "step": 5692 + }, + { + "epoch": 1.0135327635327636, + "grad_norm": 0.5592601299285889, + "learning_rate": 0.0001700943651796597, + "loss": 1.0172, + "step": 5693 + }, + { + "epoch": 1.0137108262108263, + "grad_norm": 0.5708866715431213, + "learning_rate": 
0.00017008438126273645, + "loss": 1.0012, + "step": 5694 + }, + { + "epoch": 1.0138888888888888, + "grad_norm": 0.6856338381767273, + "learning_rate": 0.0001700743959726553, + "loss": 1.1278, + "step": 5695 + }, + { + "epoch": 1.0140669515669516, + "grad_norm": 0.6523802876472473, + "learning_rate": 0.000170064409309612, + "loss": 1.0406, + "step": 5696 + }, + { + "epoch": 1.0142450142450143, + "grad_norm": 0.6653079986572266, + "learning_rate": 0.00017005442127380208, + "loss": 1.1086, + "step": 5697 + }, + { + "epoch": 1.0144230769230769, + "grad_norm": 0.5841104388237, + "learning_rate": 0.00017004443186542133, + "loss": 0.9335, + "step": 5698 + }, + { + "epoch": 1.0146011396011396, + "grad_norm": 0.5696784257888794, + "learning_rate": 0.0001700344410846654, + "loss": 1.0247, + "step": 5699 + }, + { + "epoch": 1.0147792022792024, + "grad_norm": 0.7135653495788574, + "learning_rate": 0.00017002444893173013, + "loss": 1.0259, + "step": 5700 + }, + { + "epoch": 1.014957264957265, + "grad_norm": 0.5806999802589417, + "learning_rate": 0.00017001445540681124, + "loss": 1.0053, + "step": 5701 + }, + { + "epoch": 1.0151353276353277, + "grad_norm": 0.5298715829849243, + "learning_rate": 0.0001700044605101045, + "loss": 0.9415, + "step": 5702 + }, + { + "epoch": 1.0153133903133904, + "grad_norm": 0.5817379951477051, + "learning_rate": 0.0001699944642418058, + "loss": 1.0906, + "step": 5703 + }, + { + "epoch": 1.015491452991453, + "grad_norm": 0.6564923524856567, + "learning_rate": 0.00016998446660211098, + "loss": 0.9933, + "step": 5704 + }, + { + "epoch": 1.0156695156695157, + "grad_norm": 0.6547308564186096, + "learning_rate": 0.00016997446759121592, + "loss": 1.0045, + "step": 5705 + }, + { + "epoch": 1.0158475783475784, + "grad_norm": 0.5763013958930969, + "learning_rate": 0.00016996446720931652, + "loss": 1.0898, + "step": 5706 + }, + { + "epoch": 1.016025641025641, + "grad_norm": 0.6118074059486389, + "learning_rate": 0.00016995446545660871, + "loss": 0.9398, + 
"step": 5707 + }, + { + "epoch": 1.0162037037037037, + "grad_norm": 0.6810526251792908, + "learning_rate": 0.0001699444623332885, + "loss": 1.0968, + "step": 5708 + }, + { + "epoch": 1.0163817663817665, + "grad_norm": 0.5292752981185913, + "learning_rate": 0.00016993445783955184, + "loss": 0.7549, + "step": 5709 + }, + { + "epoch": 1.016559829059829, + "grad_norm": 0.6014277935028076, + "learning_rate": 0.00016992445197559474, + "loss": 1.1711, + "step": 5710 + }, + { + "epoch": 1.0167378917378918, + "grad_norm": 0.5089772343635559, + "learning_rate": 0.00016991444474161326, + "loss": 0.9188, + "step": 5711 + }, + { + "epoch": 1.0169159544159545, + "grad_norm": 0.567193865776062, + "learning_rate": 0.0001699044361378035, + "loss": 0.7462, + "step": 5712 + }, + { + "epoch": 1.017094017094017, + "grad_norm": 0.5638598799705505, + "learning_rate": 0.00016989442616436147, + "loss": 0.9643, + "step": 5713 + }, + { + "epoch": 1.0172720797720798, + "grad_norm": 0.5634039640426636, + "learning_rate": 0.0001698844148214834, + "loss": 1.0141, + "step": 5714 + }, + { + "epoch": 1.0174501424501425, + "grad_norm": 0.5326652526855469, + "learning_rate": 0.00016987440210936537, + "loss": 0.865, + "step": 5715 + }, + { + "epoch": 1.017628205128205, + "grad_norm": 0.5858046412467957, + "learning_rate": 0.0001698643880282036, + "loss": 0.9561, + "step": 5716 + }, + { + "epoch": 1.0178062678062678, + "grad_norm": 0.6424698829650879, + "learning_rate": 0.00016985437257819428, + "loss": 1.0169, + "step": 5717 + }, + { + "epoch": 1.0179843304843306, + "grad_norm": 0.6294280290603638, + "learning_rate": 0.00016984435575953364, + "loss": 1.0438, + "step": 5718 + }, + { + "epoch": 1.018162393162393, + "grad_norm": 0.5533088445663452, + "learning_rate": 0.00016983433757241788, + "loss": 0.8901, + "step": 5719 + }, + { + "epoch": 1.0183404558404558, + "grad_norm": 0.5148718953132629, + "learning_rate": 0.00016982431801704342, + "loss": 0.9201, + "step": 5720 + }, + { + "epoch": 
1.0185185185185186, + "grad_norm": 0.5609371662139893, + "learning_rate": 0.00016981429709360645, + "loss": 0.9347, + "step": 5721 + }, + { + "epoch": 1.0186965811965811, + "grad_norm": 0.5502731204032898, + "learning_rate": 0.00016980427480230338, + "loss": 1.0508, + "step": 5722 + }, + { + "epoch": 1.0188746438746439, + "grad_norm": 0.5880394577980042, + "learning_rate": 0.00016979425114333055, + "loss": 1.1258, + "step": 5723 + }, + { + "epoch": 1.0190527065527066, + "grad_norm": 0.5569866895675659, + "learning_rate": 0.0001697842261168843, + "loss": 0.9186, + "step": 5724 + }, + { + "epoch": 1.0192307692307692, + "grad_norm": 0.7468093037605286, + "learning_rate": 0.00016977419972316116, + "loss": 1.2066, + "step": 5725 + }, + { + "epoch": 1.019408831908832, + "grad_norm": 0.6041515469551086, + "learning_rate": 0.00016976417196235753, + "loss": 0.939, + "step": 5726 + }, + { + "epoch": 1.0195868945868947, + "grad_norm": 0.6102641224861145, + "learning_rate": 0.00016975414283466983, + "loss": 0.8334, + "step": 5727 + }, + { + "epoch": 1.0197649572649572, + "grad_norm": 0.5418640375137329, + "learning_rate": 0.00016974411234029467, + "loss": 0.8072, + "step": 5728 + }, + { + "epoch": 1.01994301994302, + "grad_norm": 0.6569705605506897, + "learning_rate": 0.00016973408047942843, + "loss": 1.103, + "step": 5729 + }, + { + "epoch": 1.0201210826210827, + "grad_norm": 0.5778102278709412, + "learning_rate": 0.00016972404725226778, + "loss": 0.9353, + "step": 5730 + }, + { + "epoch": 1.0202991452991452, + "grad_norm": 0.5474382638931274, + "learning_rate": 0.0001697140126590093, + "loss": 1.0009, + "step": 5731 + }, + { + "epoch": 1.020477207977208, + "grad_norm": 0.5869506597518921, + "learning_rate": 0.00016970397669984947, + "loss": 1.0027, + "step": 5732 + }, + { + "epoch": 1.0206552706552707, + "grad_norm": 0.5078117251396179, + "learning_rate": 0.00016969393937498508, + "loss": 0.8316, + "step": 5733 + }, + { + "epoch": 1.0208333333333333, + "grad_norm": 
0.5488452911376953, + "learning_rate": 0.0001696839006846127, + "loss": 0.8438, + "step": 5734 + }, + { + "epoch": 1.021011396011396, + "grad_norm": 0.5921052098274231, + "learning_rate": 0.00016967386062892908, + "loss": 0.9147, + "step": 5735 + }, + { + "epoch": 1.0211894586894588, + "grad_norm": 0.5486881136894226, + "learning_rate": 0.00016966381920813085, + "loss": 0.7619, + "step": 5736 + }, + { + "epoch": 1.0213675213675213, + "grad_norm": 0.5250689387321472, + "learning_rate": 0.00016965377642241483, + "loss": 0.9192, + "step": 5737 + }, + { + "epoch": 1.021545584045584, + "grad_norm": 0.5355087518692017, + "learning_rate": 0.00016964373227197773, + "loss": 0.954, + "step": 5738 + }, + { + "epoch": 1.0217236467236468, + "grad_norm": 0.6758780479431152, + "learning_rate": 0.0001696336867570164, + "loss": 1.1257, + "step": 5739 + }, + { + "epoch": 1.0219017094017093, + "grad_norm": 0.6361044049263, + "learning_rate": 0.00016962363987772756, + "loss": 1.0889, + "step": 5740 + }, + { + "epoch": 1.022079772079772, + "grad_norm": 0.5802326798439026, + "learning_rate": 0.00016961359163430819, + "loss": 0.8966, + "step": 5741 + }, + { + "epoch": 1.0222578347578348, + "grad_norm": 0.5535712242126465, + "learning_rate": 0.00016960354202695508, + "loss": 1.0007, + "step": 5742 + }, + { + "epoch": 1.0224358974358974, + "grad_norm": 0.5469220280647278, + "learning_rate": 0.00016959349105586516, + "loss": 0.8202, + "step": 5743 + }, + { + "epoch": 1.02261396011396, + "grad_norm": 0.5533008575439453, + "learning_rate": 0.00016958343872123534, + "loss": 0.9576, + "step": 5744 + }, + { + "epoch": 1.0227920227920229, + "grad_norm": 0.615132749080658, + "learning_rate": 0.00016957338502326258, + "loss": 0.8719, + "step": 5745 + }, + { + "epoch": 1.0229700854700854, + "grad_norm": 0.519075334072113, + "learning_rate": 0.0001695633299621439, + "loss": 0.8309, + "step": 5746 + }, + { + "epoch": 1.0231481481481481, + "grad_norm": 0.6249759197235107, + "learning_rate": 
0.00016955327353807624, + "loss": 1.151, + "step": 5747 + }, + { + "epoch": 1.023326210826211, + "grad_norm": 0.560299277305603, + "learning_rate": 0.00016954321575125668, + "loss": 0.7889, + "step": 5748 + }, + { + "epoch": 1.0235042735042734, + "grad_norm": 0.5735262036323547, + "learning_rate": 0.0001695331566018823, + "loss": 0.8794, + "step": 5749 + }, + { + "epoch": 1.0236823361823362, + "grad_norm": 0.5893994569778442, + "learning_rate": 0.00016952309609015012, + "loss": 0.9696, + "step": 5750 + }, + { + "epoch": 1.023860398860399, + "grad_norm": 0.6064512133598328, + "learning_rate": 0.0001695130342162573, + "loss": 0.9771, + "step": 5751 + }, + { + "epoch": 1.0240384615384615, + "grad_norm": 0.5833427309989929, + "learning_rate": 0.00016950297098040099, + "loss": 1.1768, + "step": 5752 + }, + { + "epoch": 1.0242165242165242, + "grad_norm": 0.5940282344818115, + "learning_rate": 0.00016949290638277833, + "loss": 1.0758, + "step": 5753 + }, + { + "epoch": 1.024394586894587, + "grad_norm": 0.5267124772071838, + "learning_rate": 0.00016948284042358656, + "loss": 0.772, + "step": 5754 + }, + { + "epoch": 1.0245726495726495, + "grad_norm": 0.6217982172966003, + "learning_rate": 0.00016947277310302284, + "loss": 0.8583, + "step": 5755 + }, + { + "epoch": 1.0247507122507122, + "grad_norm": 0.6192215085029602, + "learning_rate": 0.00016946270442128443, + "loss": 0.9148, + "step": 5756 + }, + { + "epoch": 1.024928774928775, + "grad_norm": 0.5337123870849609, + "learning_rate": 0.00016945263437856867, + "loss": 1.0054, + "step": 5757 + }, + { + "epoch": 1.0251068376068375, + "grad_norm": 0.5462040901184082, + "learning_rate": 0.00016944256297507276, + "loss": 1.1097, + "step": 5758 + }, + { + "epoch": 1.0252849002849003, + "grad_norm": 0.5606170892715454, + "learning_rate": 0.00016943249021099415, + "loss": 1.0192, + "step": 5759 + }, + { + "epoch": 1.025462962962963, + "grad_norm": 0.636974573135376, + "learning_rate": 0.00016942241608653008, + "loss": 1.0241, + 
"step": 5760 + }, + { + "epoch": 1.0256410256410255, + "grad_norm": 0.4895164966583252, + "learning_rate": 0.00016941234060187797, + "loss": 0.9057, + "step": 5761 + }, + { + "epoch": 1.0258190883190883, + "grad_norm": 0.5810303092002869, + "learning_rate": 0.00016940226375723527, + "loss": 1.0809, + "step": 5762 + }, + { + "epoch": 1.025997150997151, + "grad_norm": 0.6043853163719177, + "learning_rate": 0.00016939218555279937, + "loss": 1.0685, + "step": 5763 + }, + { + "epoch": 1.0261752136752136, + "grad_norm": 0.5827188491821289, + "learning_rate": 0.00016938210598876774, + "loss": 1.0236, + "step": 5764 + }, + { + "epoch": 1.0263532763532763, + "grad_norm": 0.6677887439727783, + "learning_rate": 0.0001693720250653379, + "loss": 1.0586, + "step": 5765 + }, + { + "epoch": 1.026531339031339, + "grad_norm": 0.558051347732544, + "learning_rate": 0.0001693619427827073, + "loss": 0.745, + "step": 5766 + }, + { + "epoch": 1.0267094017094016, + "grad_norm": 0.6336706280708313, + "learning_rate": 0.0001693518591410735, + "loss": 1.0658, + "step": 5767 + }, + { + "epoch": 1.0268874643874644, + "grad_norm": 0.7077126502990723, + "learning_rate": 0.00016934177414063416, + "loss": 1.18, + "step": 5768 + }, + { + "epoch": 1.0270655270655271, + "grad_norm": 0.5342326760292053, + "learning_rate": 0.00016933168778158675, + "loss": 0.8347, + "step": 5769 + }, + { + "epoch": 1.0272435897435896, + "grad_norm": 0.6116416454315186, + "learning_rate": 0.00016932160006412895, + "loss": 1.0648, + "step": 5770 + }, + { + "epoch": 1.0274216524216524, + "grad_norm": 0.5411320924758911, + "learning_rate": 0.0001693115109884584, + "loss": 1.0756, + "step": 5771 + }, + { + "epoch": 1.0275997150997151, + "grad_norm": 0.5549847483634949, + "learning_rate": 0.00016930142055477277, + "loss": 0.7259, + "step": 5772 + }, + { + "epoch": 1.0277777777777777, + "grad_norm": 0.549010694026947, + "learning_rate": 0.00016929132876326977, + "loss": 0.9488, + "step": 5773 + }, + { + "epoch": 
1.0279558404558404, + "grad_norm": 0.6302017569541931, + "learning_rate": 0.00016928123561414714, + "loss": 0.8851, + "step": 5774 + }, + { + "epoch": 1.0281339031339032, + "grad_norm": 0.5831273198127747, + "learning_rate": 0.00016927114110760257, + "loss": 0.7841, + "step": 5775 + }, + { + "epoch": 1.0283119658119657, + "grad_norm": 0.5528474450111389, + "learning_rate": 0.00016926104524383394, + "loss": 1.0108, + "step": 5776 + }, + { + "epoch": 1.0284900284900285, + "grad_norm": 0.6279126405715942, + "learning_rate": 0.00016925094802303897, + "loss": 0.8632, + "step": 5777 + }, + { + "epoch": 1.0286680911680912, + "grad_norm": 0.6783218383789062, + "learning_rate": 0.00016924084944541554, + "loss": 1.0746, + "step": 5778 + }, + { + "epoch": 1.0288461538461537, + "grad_norm": 0.5823925137519836, + "learning_rate": 0.00016923074951116153, + "loss": 1.0486, + "step": 5779 + }, + { + "epoch": 1.0290242165242165, + "grad_norm": 0.6095981597900391, + "learning_rate": 0.00016922064822047473, + "loss": 0.8113, + "step": 5780 + }, + { + "epoch": 1.0292022792022792, + "grad_norm": 0.7887664437294006, + "learning_rate": 0.00016921054557355317, + "loss": 1.2411, + "step": 5781 + }, + { + "epoch": 1.0293803418803418, + "grad_norm": 0.6511263251304626, + "learning_rate": 0.00016920044157059475, + "loss": 0.924, + "step": 5782 + }, + { + "epoch": 1.0295584045584045, + "grad_norm": 0.6045661568641663, + "learning_rate": 0.00016919033621179744, + "loss": 0.8373, + "step": 5783 + }, + { + "epoch": 1.0297364672364673, + "grad_norm": 0.6914188861846924, + "learning_rate": 0.0001691802294973592, + "loss": 0.9589, + "step": 5784 + }, + { + "epoch": 1.0299145299145298, + "grad_norm": 0.6483730673789978, + "learning_rate": 0.00016917012142747805, + "loss": 0.9871, + "step": 5785 + }, + { + "epoch": 1.0300925925925926, + "grad_norm": 0.5775033235549927, + "learning_rate": 0.0001691600120023521, + "loss": 1.0591, + "step": 5786 + }, + { + "epoch": 1.0302706552706553, + "grad_norm": 
0.6206814646720886, + "learning_rate": 0.00016914990122217932, + "loss": 0.9126, + "step": 5787 + }, + { + "epoch": 1.0304487179487178, + "grad_norm": 0.5422028303146362, + "learning_rate": 0.00016913978908715796, + "loss": 0.8227, + "step": 5788 + }, + { + "epoch": 1.0306267806267806, + "grad_norm": 0.5824416875839233, + "learning_rate": 0.000169129675597486, + "loss": 1.111, + "step": 5789 + }, + { + "epoch": 1.0308048433048433, + "grad_norm": 0.5419015884399414, + "learning_rate": 0.00016911956075336165, + "loss": 0.8941, + "step": 5790 + }, + { + "epoch": 1.0309829059829059, + "grad_norm": 0.6171557903289795, + "learning_rate": 0.0001691094445549831, + "loss": 0.8679, + "step": 5791 + }, + { + "epoch": 1.0311609686609686, + "grad_norm": 0.6136980056762695, + "learning_rate": 0.00016909932700254855, + "loss": 0.9266, + "step": 5792 + }, + { + "epoch": 1.0313390313390314, + "grad_norm": 0.6275020241737366, + "learning_rate": 0.00016908920809625624, + "loss": 1.0828, + "step": 5793 + }, + { + "epoch": 1.0315170940170941, + "grad_norm": 0.6538251638412476, + "learning_rate": 0.0001690790878363044, + "loss": 0.8413, + "step": 5794 + }, + { + "epoch": 1.0316951566951567, + "grad_norm": 0.5981295108795166, + "learning_rate": 0.00016906896622289136, + "loss": 0.9845, + "step": 5795 + }, + { + "epoch": 1.0318732193732194, + "grad_norm": 0.5390967130661011, + "learning_rate": 0.00016905884325621538, + "loss": 0.8755, + "step": 5796 + }, + { + "epoch": 1.032051282051282, + "grad_norm": 0.5534448623657227, + "learning_rate": 0.00016904871893647482, + "loss": 1.1868, + "step": 5797 + }, + { + "epoch": 1.0322293447293447, + "grad_norm": 0.664556086063385, + "learning_rate": 0.00016903859326386806, + "loss": 1.1418, + "step": 5798 + }, + { + "epoch": 1.0324074074074074, + "grad_norm": 0.5737143158912659, + "learning_rate": 0.00016902846623859346, + "loss": 1.124, + "step": 5799 + }, + { + "epoch": 1.0325854700854702, + "grad_norm": 0.6499935388565063, + "learning_rate": 
0.0001690183378608495, + "loss": 1.0331, + "step": 5800 + }, + { + "epoch": 1.0327635327635327, + "grad_norm": 0.5721518993377686, + "learning_rate": 0.00016900820813083454, + "loss": 0.8664, + "step": 5801 + }, + { + "epoch": 1.0329415954415955, + "grad_norm": 0.5651140809059143, + "learning_rate": 0.0001689980770487471, + "loss": 1.1661, + "step": 5802 + }, + { + "epoch": 1.0331196581196582, + "grad_norm": 0.5935871005058289, + "learning_rate": 0.0001689879446147857, + "loss": 0.8722, + "step": 5803 + }, + { + "epoch": 1.0332977207977208, + "grad_norm": 0.5627842545509338, + "learning_rate": 0.00016897781082914884, + "loss": 1.0036, + "step": 5804 + }, + { + "epoch": 1.0334757834757835, + "grad_norm": 0.5866895914077759, + "learning_rate": 0.00016896767569203502, + "loss": 0.9739, + "step": 5805 + }, + { + "epoch": 1.0336538461538463, + "grad_norm": 0.5568059682846069, + "learning_rate": 0.0001689575392036429, + "loss": 0.7081, + "step": 5806 + }, + { + "epoch": 1.0338319088319088, + "grad_norm": 0.6054235100746155, + "learning_rate": 0.00016894740136417103, + "loss": 1.1168, + "step": 5807 + }, + { + "epoch": 1.0340099715099715, + "grad_norm": 0.5215454697608948, + "learning_rate": 0.00016893726217381805, + "loss": 0.9172, + "step": 5808 + }, + { + "epoch": 1.0341880341880343, + "grad_norm": 0.5415732860565186, + "learning_rate": 0.00016892712163278263, + "loss": 0.7812, + "step": 5809 + }, + { + "epoch": 1.0343660968660968, + "grad_norm": 0.6341692805290222, + "learning_rate": 0.00016891697974126345, + "loss": 1.0658, + "step": 5810 + }, + { + "epoch": 1.0345441595441596, + "grad_norm": 0.6326245665550232, + "learning_rate": 0.00016890683649945922, + "loss": 1.0134, + "step": 5811 + }, + { + "epoch": 1.0347222222222223, + "grad_norm": 0.5729571580886841, + "learning_rate": 0.00016889669190756868, + "loss": 0.9139, + "step": 5812 + }, + { + "epoch": 1.0349002849002849, + "grad_norm": 0.5912853479385376, + "learning_rate": 0.00016888654596579054, + "loss": 1.122, 
+ "step": 5813 + }, + { + "epoch": 1.0350783475783476, + "grad_norm": 0.8410450220108032, + "learning_rate": 0.00016887639867432368, + "loss": 1.3009, + "step": 5814 + }, + { + "epoch": 1.0352564102564104, + "grad_norm": 0.5416620969772339, + "learning_rate": 0.00016886625003336683, + "loss": 0.8751, + "step": 5815 + }, + { + "epoch": 1.0354344729344729, + "grad_norm": 0.6367851495742798, + "learning_rate": 0.0001688561000431189, + "loss": 0.956, + "step": 5816 + }, + { + "epoch": 1.0356125356125356, + "grad_norm": 0.4618827700614929, + "learning_rate": 0.0001688459487037787, + "loss": 0.5313, + "step": 5817 + }, + { + "epoch": 1.0357905982905984, + "grad_norm": 0.7139244079589844, + "learning_rate": 0.00016883579601554516, + "loss": 1.0787, + "step": 5818 + }, + { + "epoch": 1.035968660968661, + "grad_norm": 0.6896135210990906, + "learning_rate": 0.00016882564197861715, + "loss": 0.932, + "step": 5819 + }, + { + "epoch": 1.0361467236467237, + "grad_norm": 0.5889739394187927, + "learning_rate": 0.00016881548659319372, + "loss": 0.8852, + "step": 5820 + }, + { + "epoch": 1.0363247863247864, + "grad_norm": 0.5954701900482178, + "learning_rate": 0.00016880532985947375, + "loss": 0.8192, + "step": 5821 + }, + { + "epoch": 1.036502849002849, + "grad_norm": 0.6665091514587402, + "learning_rate": 0.00016879517177765627, + "loss": 0.9578, + "step": 5822 + }, + { + "epoch": 1.0366809116809117, + "grad_norm": 0.5990539789199829, + "learning_rate": 0.00016878501234794034, + "loss": 0.9797, + "step": 5823 + }, + { + "epoch": 1.0368589743589745, + "grad_norm": 0.596755862236023, + "learning_rate": 0.00016877485157052496, + "loss": 1.173, + "step": 5824 + }, + { + "epoch": 1.037037037037037, + "grad_norm": 0.544658362865448, + "learning_rate": 0.00016876468944560923, + "loss": 1.0742, + "step": 5825 + }, + { + "epoch": 1.0372150997150997, + "grad_norm": 0.5841910243034363, + "learning_rate": 0.00016875452597339225, + "loss": 1.029, + "step": 5826 + }, + { + "epoch": 
1.0373931623931625, + "grad_norm": 0.6508592963218689, + "learning_rate": 0.00016874436115407317, + "loss": 0.9883, + "step": 5827 + }, + { + "epoch": 1.037571225071225, + "grad_norm": 0.590050458908081, + "learning_rate": 0.00016873419498785114, + "loss": 1.0713, + "step": 5828 + }, + { + "epoch": 1.0377492877492878, + "grad_norm": 0.5386307239532471, + "learning_rate": 0.00016872402747492534, + "loss": 1.0159, + "step": 5829 + }, + { + "epoch": 1.0379273504273505, + "grad_norm": 0.6173896193504333, + "learning_rate": 0.00016871385861549497, + "loss": 1.0056, + "step": 5830 + }, + { + "epoch": 1.038105413105413, + "grad_norm": 0.5377787351608276, + "learning_rate": 0.0001687036884097593, + "loss": 0.8708, + "step": 5831 + }, + { + "epoch": 1.0382834757834758, + "grad_norm": 0.5753569006919861, + "learning_rate": 0.00016869351685791756, + "loss": 1.0529, + "step": 5832 + }, + { + "epoch": 1.0384615384615385, + "grad_norm": 0.6085895299911499, + "learning_rate": 0.00016868334396016906, + "loss": 1.1017, + "step": 5833 + }, + { + "epoch": 1.038639601139601, + "grad_norm": 0.6320509910583496, + "learning_rate": 0.0001686731697167131, + "loss": 1.0543, + "step": 5834 + }, + { + "epoch": 1.0388176638176638, + "grad_norm": 0.5691760778427124, + "learning_rate": 0.00016866299412774907, + "loss": 0.9975, + "step": 5835 + }, + { + "epoch": 1.0389957264957266, + "grad_norm": 0.5990765690803528, + "learning_rate": 0.0001686528171934763, + "loss": 0.8776, + "step": 5836 + }, + { + "epoch": 1.039173789173789, + "grad_norm": 0.6650477647781372, + "learning_rate": 0.00016864263891409415, + "loss": 1.0652, + "step": 5837 + }, + { + "epoch": 1.0393518518518519, + "grad_norm": 0.6050353646278381, + "learning_rate": 0.00016863245928980212, + "loss": 0.9313, + "step": 5838 + }, + { + "epoch": 1.0395299145299146, + "grad_norm": 0.587505578994751, + "learning_rate": 0.0001686222783207996, + "loss": 0.9892, + "step": 5839 + }, + { + "epoch": 1.0397079772079771, + "grad_norm": 
0.6310170292854309, + "learning_rate": 0.00016861209600728608, + "loss": 1.1045, + "step": 5840 + }, + { + "epoch": 1.03988603988604, + "grad_norm": 0.5683430433273315, + "learning_rate": 0.0001686019123494611, + "loss": 1.0507, + "step": 5841 + }, + { + "epoch": 1.0400641025641026, + "grad_norm": 0.6621488332748413, + "learning_rate": 0.00016859172734752414, + "loss": 0.9255, + "step": 5842 + }, + { + "epoch": 1.0402421652421652, + "grad_norm": 0.6197706460952759, + "learning_rate": 0.00016858154100167475, + "loss": 1.0031, + "step": 5843 + }, + { + "epoch": 1.040420227920228, + "grad_norm": 0.6805898547172546, + "learning_rate": 0.00016857135331211257, + "loss": 0.9901, + "step": 5844 + }, + { + "epoch": 1.0405982905982907, + "grad_norm": 0.5512405633926392, + "learning_rate": 0.00016856116427903714, + "loss": 1.0033, + "step": 5845 + }, + { + "epoch": 1.0407763532763532, + "grad_norm": 0.5643384456634521, + "learning_rate": 0.00016855097390264815, + "loss": 0.9136, + "step": 5846 + }, + { + "epoch": 1.040954415954416, + "grad_norm": 0.48351922631263733, + "learning_rate": 0.0001685407821831452, + "loss": 0.6163, + "step": 5847 + }, + { + "epoch": 1.0411324786324787, + "grad_norm": 0.6256039142608643, + "learning_rate": 0.00016853058912072802, + "loss": 0.9409, + "step": 5848 + }, + { + "epoch": 1.0413105413105412, + "grad_norm": 0.6539996862411499, + "learning_rate": 0.00016852039471559627, + "loss": 0.9367, + "step": 5849 + }, + { + "epoch": 1.041488603988604, + "grad_norm": 0.6192609667778015, + "learning_rate": 0.00016851019896794975, + "loss": 0.9631, + "step": 5850 + }, + { + "epoch": 1.0416666666666667, + "grad_norm": 0.613563060760498, + "learning_rate": 0.0001685000018779882, + "loss": 0.9132, + "step": 5851 + }, + { + "epoch": 1.0418447293447293, + "grad_norm": 0.6004200577735901, + "learning_rate": 0.0001684898034459114, + "loss": 1.1313, + "step": 5852 + }, + { + "epoch": 1.042022792022792, + "grad_norm": 0.6158567070960999, + "learning_rate": 
0.0001684796036719192, + "loss": 1.0253, + "step": 5853 + }, + { + "epoch": 1.0422008547008548, + "grad_norm": 0.6362335085868835, + "learning_rate": 0.00016846940255621143, + "loss": 0.93, + "step": 5854 + }, + { + "epoch": 1.0423789173789173, + "grad_norm": 0.6148427128791809, + "learning_rate": 0.00016845920009898787, + "loss": 0.9122, + "step": 5855 + }, + { + "epoch": 1.04255698005698, + "grad_norm": 0.5119984149932861, + "learning_rate": 0.00016844899630044858, + "loss": 0.7954, + "step": 5856 + }, + { + "epoch": 1.0427350427350428, + "grad_norm": 0.571849524974823, + "learning_rate": 0.00016843879116079338, + "loss": 0.8588, + "step": 5857 + }, + { + "epoch": 1.0429131054131053, + "grad_norm": 0.6173384785652161, + "learning_rate": 0.00016842858468022221, + "loss": 1.0475, + "step": 5858 + }, + { + "epoch": 1.043091168091168, + "grad_norm": 0.566114068031311, + "learning_rate": 0.0001684183768589351, + "loss": 0.8485, + "step": 5859 + }, + { + "epoch": 1.0432692307692308, + "grad_norm": 0.653134286403656, + "learning_rate": 0.000168408167697132, + "loss": 0.9976, + "step": 5860 + }, + { + "epoch": 1.0434472934472934, + "grad_norm": 0.63815838098526, + "learning_rate": 0.00016839795719501296, + "loss": 0.7091, + "step": 5861 + }, + { + "epoch": 1.0436253561253561, + "grad_norm": 0.5109001994132996, + "learning_rate": 0.00016838774535277805, + "loss": 0.7668, + "step": 5862 + }, + { + "epoch": 1.0438034188034189, + "grad_norm": 0.6741907596588135, + "learning_rate": 0.0001683775321706273, + "loss": 1.0493, + "step": 5863 + }, + { + "epoch": 1.0439814814814814, + "grad_norm": 0.6006115674972534, + "learning_rate": 0.0001683673176487609, + "loss": 0.9784, + "step": 5864 + }, + { + "epoch": 1.0441595441595442, + "grad_norm": 0.5504778027534485, + "learning_rate": 0.0001683571017873789, + "loss": 0.9718, + "step": 5865 + }, + { + "epoch": 1.044337606837607, + "grad_norm": 0.5713102221488953, + "learning_rate": 0.00016834688458668148, + "loss": 1.12, + "step": 5866 
+ }, + { + "epoch": 1.0445156695156694, + "grad_norm": 0.7878454923629761, + "learning_rate": 0.00016833666604686886, + "loss": 1.1803, + "step": 5867 + }, + { + "epoch": 1.0446937321937322, + "grad_norm": 0.582697331905365, + "learning_rate": 0.00016832644616814122, + "loss": 0.943, + "step": 5868 + }, + { + "epoch": 1.044871794871795, + "grad_norm": 0.5300645232200623, + "learning_rate": 0.00016831622495069878, + "loss": 0.9087, + "step": 5869 + }, + { + "epoch": 1.0450498575498575, + "grad_norm": 0.5627666115760803, + "learning_rate": 0.00016830600239474186, + "loss": 1.081, + "step": 5870 + }, + { + "epoch": 1.0452279202279202, + "grad_norm": 0.6760496497154236, + "learning_rate": 0.0001682957785004707, + "loss": 1.1098, + "step": 5871 + }, + { + "epoch": 1.045405982905983, + "grad_norm": 0.6424084901809692, + "learning_rate": 0.00016828555326808565, + "loss": 0.9657, + "step": 5872 + }, + { + "epoch": 1.0455840455840455, + "grad_norm": 0.5523313283920288, + "learning_rate": 0.000168275326697787, + "loss": 1.0163, + "step": 5873 + }, + { + "epoch": 1.0457621082621082, + "grad_norm": 0.5582337975502014, + "learning_rate": 0.00016826509878977518, + "loss": 0.8825, + "step": 5874 + }, + { + "epoch": 1.045940170940171, + "grad_norm": 0.5603214502334595, + "learning_rate": 0.00016825486954425055, + "loss": 0.9032, + "step": 5875 + }, + { + "epoch": 1.0461182336182335, + "grad_norm": 0.5944222807884216, + "learning_rate": 0.00016824463896141355, + "loss": 0.9384, + "step": 5876 + }, + { + "epoch": 1.0462962962962963, + "grad_norm": 0.6220229268074036, + "learning_rate": 0.00016823440704146457, + "loss": 0.8962, + "step": 5877 + }, + { + "epoch": 1.046474358974359, + "grad_norm": 0.5607972145080566, + "learning_rate": 0.0001682241737846042, + "loss": 0.9385, + "step": 5878 + }, + { + "epoch": 1.0466524216524216, + "grad_norm": 0.6206870079040527, + "learning_rate": 0.00016821393919103282, + "loss": 1.0597, + "step": 5879 + }, + { + "epoch": 1.0468304843304843, + 
"grad_norm": 0.5126399993896484, + "learning_rate": 0.000168203703260951, + "loss": 0.9403, + "step": 5880 + }, + { + "epoch": 1.047008547008547, + "grad_norm": 0.6569282412528992, + "learning_rate": 0.00016819346599455929, + "loss": 0.8124, + "step": 5881 + }, + { + "epoch": 1.0471866096866096, + "grad_norm": 0.6670137047767639, + "learning_rate": 0.0001681832273920583, + "loss": 1.1927, + "step": 5882 + }, + { + "epoch": 1.0473646723646723, + "grad_norm": 0.5403243899345398, + "learning_rate": 0.00016817298745364862, + "loss": 0.8539, + "step": 5883 + }, + { + "epoch": 1.047542735042735, + "grad_norm": 0.5500505566596985, + "learning_rate": 0.00016816274617953086, + "loss": 1.1064, + "step": 5884 + }, + { + "epoch": 1.0477207977207976, + "grad_norm": 0.5482703447341919, + "learning_rate": 0.00016815250356990566, + "loss": 0.7276, + "step": 5885 + }, + { + "epoch": 1.0478988603988604, + "grad_norm": 0.6290771961212158, + "learning_rate": 0.00016814225962497373, + "loss": 0.9018, + "step": 5886 + }, + { + "epoch": 1.0480769230769231, + "grad_norm": 0.6404094696044922, + "learning_rate": 0.00016813201434493578, + "loss": 1.0638, + "step": 5887 + }, + { + "epoch": 1.0482549857549857, + "grad_norm": 0.5484994053840637, + "learning_rate": 0.0001681217677299926, + "loss": 1.0033, + "step": 5888 + }, + { + "epoch": 1.0484330484330484, + "grad_norm": 0.6474852561950684, + "learning_rate": 0.0001681115197803448, + "loss": 1.1017, + "step": 5889 + }, + { + "epoch": 1.0486111111111112, + "grad_norm": 0.6186243295669556, + "learning_rate": 0.0001681012704961933, + "loss": 0.9978, + "step": 5890 + }, + { + "epoch": 1.0487891737891737, + "grad_norm": 0.6244034767150879, + "learning_rate": 0.00016809101987773887, + "loss": 0.9906, + "step": 5891 + }, + { + "epoch": 1.0489672364672364, + "grad_norm": 0.5893426537513733, + "learning_rate": 0.00016808076792518235, + "loss": 0.9345, + "step": 5892 + }, + { + "epoch": 1.0491452991452992, + "grad_norm": 0.6283876299858093, + 
"learning_rate": 0.0001680705146387246, + "loss": 1.0041, + "step": 5893 + }, + { + "epoch": 1.0493233618233617, + "grad_norm": 0.6075255870819092, + "learning_rate": 0.00016806026001856656, + "loss": 1.0661, + "step": 5894 + }, + { + "epoch": 1.0495014245014245, + "grad_norm": 0.5350496768951416, + "learning_rate": 0.00016805000406490907, + "loss": 0.6789, + "step": 5895 + }, + { + "epoch": 1.0496794871794872, + "grad_norm": 0.5380373597145081, + "learning_rate": 0.00016803974677795312, + "loss": 0.8889, + "step": 5896 + }, + { + "epoch": 1.0498575498575498, + "grad_norm": 0.6145668029785156, + "learning_rate": 0.0001680294881578997, + "loss": 0.8952, + "step": 5897 + }, + { + "epoch": 1.0500356125356125, + "grad_norm": 0.5666532516479492, + "learning_rate": 0.00016801922820494972, + "loss": 0.9697, + "step": 5898 + }, + { + "epoch": 1.0502136752136753, + "grad_norm": 0.5352747440338135, + "learning_rate": 0.0001680089669193043, + "loss": 0.9619, + "step": 5899 + }, + { + "epoch": 1.0503917378917378, + "grad_norm": 0.5405527949333191, + "learning_rate": 0.00016799870430116444, + "loss": 0.8733, + "step": 5900 + }, + { + "epoch": 1.0505698005698005, + "grad_norm": 0.5936748385429382, + "learning_rate": 0.00016798844035073124, + "loss": 0.8746, + "step": 5901 + }, + { + "epoch": 1.0507478632478633, + "grad_norm": 0.539652943611145, + "learning_rate": 0.00016797817506820578, + "loss": 0.8743, + "step": 5902 + }, + { + "epoch": 1.0509259259259258, + "grad_norm": 0.644528865814209, + "learning_rate": 0.00016796790845378915, + "loss": 0.9251, + "step": 5903 + }, + { + "epoch": 1.0511039886039886, + "grad_norm": 0.5429201126098633, + "learning_rate": 0.00016795764050768258, + "loss": 0.747, + "step": 5904 + }, + { + "epoch": 1.0512820512820513, + "grad_norm": 0.6432006359100342, + "learning_rate": 0.00016794737123008725, + "loss": 0.9166, + "step": 5905 + }, + { + "epoch": 1.0514601139601139, + "grad_norm": 0.6084117293357849, + "learning_rate": 0.00016793710062120427, + 
"loss": 1.0778, + "step": 5906 + }, + { + "epoch": 1.0516381766381766, + "grad_norm": 0.5351580381393433, + "learning_rate": 0.00016792682868123495, + "loss": 0.9124, + "step": 5907 + }, + { + "epoch": 1.0518162393162394, + "grad_norm": 0.7078854441642761, + "learning_rate": 0.00016791655541038053, + "loss": 1.1209, + "step": 5908 + }, + { + "epoch": 1.051994301994302, + "grad_norm": 0.5943832993507385, + "learning_rate": 0.0001679062808088423, + "loss": 0.9077, + "step": 5909 + }, + { + "epoch": 1.0521723646723646, + "grad_norm": 0.5216894745826721, + "learning_rate": 0.00016789600487682156, + "loss": 0.9866, + "step": 5910 + }, + { + "epoch": 1.0523504273504274, + "grad_norm": 0.738451361656189, + "learning_rate": 0.00016788572761451963, + "loss": 1.1611, + "step": 5911 + }, + { + "epoch": 1.05252849002849, + "grad_norm": 0.6411251425743103, + "learning_rate": 0.00016787544902213791, + "loss": 1.1481, + "step": 5912 + }, + { + "epoch": 1.0527065527065527, + "grad_norm": 0.6768319010734558, + "learning_rate": 0.00016786516909987774, + "loss": 0.8614, + "step": 5913 + }, + { + "epoch": 1.0528846153846154, + "grad_norm": 0.5838070511817932, + "learning_rate": 0.0001678548878479406, + "loss": 0.9719, + "step": 5914 + }, + { + "epoch": 1.0530626780626782, + "grad_norm": 0.541522741317749, + "learning_rate": 0.00016784460526652784, + "loss": 0.767, + "step": 5915 + }, + { + "epoch": 1.0532407407407407, + "grad_norm": 0.6064762473106384, + "learning_rate": 0.000167834321355841, + "loss": 1.0792, + "step": 5916 + }, + { + "epoch": 1.0534188034188035, + "grad_norm": 0.5515492558479309, + "learning_rate": 0.00016782403611608152, + "loss": 0.7897, + "step": 5917 + }, + { + "epoch": 1.0535968660968662, + "grad_norm": 0.6326262950897217, + "learning_rate": 0.000167813749547451, + "loss": 0.9279, + "step": 5918 + }, + { + "epoch": 1.0537749287749287, + "grad_norm": 0.6262009739875793, + "learning_rate": 0.0001678034616501509, + "loss": 0.9752, + "step": 5919 + }, + { + 
"epoch": 1.0539529914529915, + "grad_norm": 0.6049023270606995, + "learning_rate": 0.00016779317242438278, + "loss": 0.9167, + "step": 5920 + }, + { + "epoch": 1.0541310541310542, + "grad_norm": 0.6286031007766724, + "learning_rate": 0.0001677828818703483, + "loss": 1.1277, + "step": 5921 + }, + { + "epoch": 1.0543091168091168, + "grad_norm": 0.662086009979248, + "learning_rate": 0.00016777258998824907, + "loss": 1.0824, + "step": 5922 + }, + { + "epoch": 1.0544871794871795, + "grad_norm": 0.5358783006668091, + "learning_rate": 0.00016776229677828672, + "loss": 0.825, + "step": 5923 + }, + { + "epoch": 1.0546652421652423, + "grad_norm": 0.490326464176178, + "learning_rate": 0.00016775200224066294, + "loss": 0.7916, + "step": 5924 + }, + { + "epoch": 1.0548433048433048, + "grad_norm": 0.5940443277359009, + "learning_rate": 0.0001677417063755794, + "loss": 1.0121, + "step": 5925 + }, + { + "epoch": 1.0550213675213675, + "grad_norm": 0.5974507927894592, + "learning_rate": 0.00016773140918323787, + "loss": 0.7629, + "step": 5926 + }, + { + "epoch": 1.0551994301994303, + "grad_norm": 0.5747174024581909, + "learning_rate": 0.00016772111066384003, + "loss": 0.9373, + "step": 5927 + }, + { + "epoch": 1.0553774928774928, + "grad_norm": 0.5998024940490723, + "learning_rate": 0.00016771081081758772, + "loss": 0.8543, + "step": 5928 + }, + { + "epoch": 1.0555555555555556, + "grad_norm": 0.5771155953407288, + "learning_rate": 0.00016770050964468275, + "loss": 0.9108, + "step": 5929 + }, + { + "epoch": 1.0557336182336183, + "grad_norm": 0.5695661306381226, + "learning_rate": 0.00016769020714532692, + "loss": 0.8055, + "step": 5930 + }, + { + "epoch": 1.0559116809116809, + "grad_norm": 0.6164212226867676, + "learning_rate": 0.0001676799033197221, + "loss": 1.0917, + "step": 5931 + }, + { + "epoch": 1.0560897435897436, + "grad_norm": 0.6092487573623657, + "learning_rate": 0.00016766959816807018, + "loss": 0.9276, + "step": 5932 + }, + { + "epoch": 1.0562678062678064, + 
"grad_norm": 0.5595401525497437, + "learning_rate": 0.00016765929169057305, + "loss": 0.9435, + "step": 5933 + }, + { + "epoch": 1.056445868945869, + "grad_norm": 0.5875109434127808, + "learning_rate": 0.00016764898388743263, + "loss": 0.959, + "step": 5934 + }, + { + "epoch": 1.0566239316239316, + "grad_norm": 0.6045668721199036, + "learning_rate": 0.00016763867475885088, + "loss": 0.8636, + "step": 5935 + }, + { + "epoch": 1.0568019943019944, + "grad_norm": 0.6088171005249023, + "learning_rate": 0.00016762836430502987, + "loss": 0.6807, + "step": 5936 + }, + { + "epoch": 1.056980056980057, + "grad_norm": 0.6293274760246277, + "learning_rate": 0.00016761805252617148, + "loss": 1.042, + "step": 5937 + }, + { + "epoch": 1.0571581196581197, + "grad_norm": 0.588472843170166, + "learning_rate": 0.00016760773942247785, + "loss": 0.8896, + "step": 5938 + }, + { + "epoch": 1.0573361823361824, + "grad_norm": 0.4412326216697693, + "learning_rate": 0.000167597424994151, + "loss": 0.6727, + "step": 5939 + }, + { + "epoch": 1.057514245014245, + "grad_norm": 0.6086825132369995, + "learning_rate": 0.00016758710924139302, + "loss": 0.9908, + "step": 5940 + }, + { + "epoch": 1.0576923076923077, + "grad_norm": 0.6424705386161804, + "learning_rate": 0.00016757679216440608, + "loss": 1.0182, + "step": 5941 + }, + { + "epoch": 1.0578703703703705, + "grad_norm": 0.6610676050186157, + "learning_rate": 0.00016756647376339222, + "loss": 0.9645, + "step": 5942 + }, + { + "epoch": 1.058048433048433, + "grad_norm": 0.598292887210846, + "learning_rate": 0.0001675561540385537, + "loss": 0.9694, + "step": 5943 + }, + { + "epoch": 1.0582264957264957, + "grad_norm": 0.6941167116165161, + "learning_rate": 0.00016754583299009266, + "loss": 1.0786, + "step": 5944 + }, + { + "epoch": 1.0584045584045585, + "grad_norm": 0.6543232798576355, + "learning_rate": 0.00016753551061821133, + "loss": 1.0488, + "step": 5945 + }, + { + "epoch": 1.058582621082621, + "grad_norm": 0.606159508228302, + 
"learning_rate": 0.000167525186923112, + "loss": 0.9448, + "step": 5946 + }, + { + "epoch": 1.0587606837606838, + "grad_norm": 0.5051791071891785, + "learning_rate": 0.00016751486190499685, + "loss": 0.7485, + "step": 5947 + }, + { + "epoch": 1.0589387464387465, + "grad_norm": 0.6459367275238037, + "learning_rate": 0.00016750453556406826, + "loss": 1.0055, + "step": 5948 + }, + { + "epoch": 1.059116809116809, + "grad_norm": 0.551591157913208, + "learning_rate": 0.00016749420790052852, + "loss": 0.9717, + "step": 5949 + }, + { + "epoch": 1.0592948717948718, + "grad_norm": 0.5899214148521423, + "learning_rate": 0.00016748387891458, + "loss": 0.7774, + "step": 5950 + }, + { + "epoch": 1.0594729344729346, + "grad_norm": 0.582379162311554, + "learning_rate": 0.00016747354860642503, + "loss": 0.953, + "step": 5951 + }, + { + "epoch": 1.059650997150997, + "grad_norm": 0.6035816073417664, + "learning_rate": 0.00016746321697626605, + "loss": 1.1175, + "step": 5952 + }, + { + "epoch": 1.0598290598290598, + "grad_norm": 0.6476401686668396, + "learning_rate": 0.00016745288402430548, + "loss": 0.9448, + "step": 5953 + }, + { + "epoch": 1.0600071225071226, + "grad_norm": 0.6126405596733093, + "learning_rate": 0.00016744254975074578, + "loss": 0.882, + "step": 5954 + }, + { + "epoch": 1.0601851851851851, + "grad_norm": 0.5333579182624817, + "learning_rate": 0.0001674322141557894, + "loss": 0.9539, + "step": 5955 + }, + { + "epoch": 1.0603632478632479, + "grad_norm": 0.6085022687911987, + "learning_rate": 0.0001674218772396389, + "loss": 1.0028, + "step": 5956 + }, + { + "epoch": 1.0605413105413106, + "grad_norm": 0.5809528827667236, + "learning_rate": 0.0001674115390024967, + "loss": 0.84, + "step": 5957 + }, + { + "epoch": 1.0607193732193732, + "grad_norm": 0.5820229649543762, + "learning_rate": 0.00016740119944456548, + "loss": 0.9563, + "step": 5958 + }, + { + "epoch": 1.060897435897436, + "grad_norm": 0.6349015831947327, + "learning_rate": 0.00016739085856604775, + "loss": 
0.9739, + "step": 5959 + }, + { + "epoch": 1.0610754985754987, + "grad_norm": 0.6346020102500916, + "learning_rate": 0.00016738051636714616, + "loss": 0.907, + "step": 5960 + }, + { + "epoch": 1.0612535612535612, + "grad_norm": 0.5850573778152466, + "learning_rate": 0.0001673701728480633, + "loss": 1.0688, + "step": 5961 + }, + { + "epoch": 1.061431623931624, + "grad_norm": 0.6258122324943542, + "learning_rate": 0.00016735982800900184, + "loss": 0.9997, + "step": 5962 + }, + { + "epoch": 1.0616096866096867, + "grad_norm": 0.6744239330291748, + "learning_rate": 0.00016734948185016452, + "loss": 0.9431, + "step": 5963 + }, + { + "epoch": 1.0617877492877492, + "grad_norm": 0.5769457817077637, + "learning_rate": 0.000167339134371754, + "loss": 0.9658, + "step": 5964 + }, + { + "epoch": 1.061965811965812, + "grad_norm": 0.6385112404823303, + "learning_rate": 0.000167328785573973, + "loss": 1.0199, + "step": 5965 + }, + { + "epoch": 1.0621438746438747, + "grad_norm": 0.536522388458252, + "learning_rate": 0.00016731843545702435, + "loss": 0.8496, + "step": 5966 + }, + { + "epoch": 1.0623219373219372, + "grad_norm": 0.5978497862815857, + "learning_rate": 0.00016730808402111075, + "loss": 0.8536, + "step": 5967 + }, + { + "epoch": 1.0625, + "grad_norm": 0.6091681122779846, + "learning_rate": 0.0001672977312664351, + "loss": 1.0241, + "step": 5968 + }, + { + "epoch": 1.0626780626780628, + "grad_norm": 0.5807273387908936, + "learning_rate": 0.0001672873771932002, + "loss": 1.0522, + "step": 5969 + }, + { + "epoch": 1.0628561253561253, + "grad_norm": 0.6511965990066528, + "learning_rate": 0.0001672770218016089, + "loss": 0.8908, + "step": 5970 + }, + { + "epoch": 1.063034188034188, + "grad_norm": 0.6241721510887146, + "learning_rate": 0.00016726666509186416, + "loss": 0.9854, + "step": 5971 + }, + { + "epoch": 1.0632122507122508, + "grad_norm": 0.6112468242645264, + "learning_rate": 0.0001672563070641688, + "loss": 1.0091, + "step": 5972 + }, + { + "epoch": 1.0633903133903133, 
+ "grad_norm": 0.6135509014129639, + "learning_rate": 0.00016724594771872587, + "loss": 0.8891, + "step": 5973 + }, + { + "epoch": 1.063568376068376, + "grad_norm": 0.608384370803833, + "learning_rate": 0.00016723558705573823, + "loss": 1.017, + "step": 5974 + }, + { + "epoch": 1.0637464387464388, + "grad_norm": 0.6578485369682312, + "learning_rate": 0.00016722522507540895, + "loss": 0.9165, + "step": 5975 + }, + { + "epoch": 1.0639245014245013, + "grad_norm": 0.562588095664978, + "learning_rate": 0.00016721486177794106, + "loss": 0.7989, + "step": 5976 + }, + { + "epoch": 1.064102564102564, + "grad_norm": 0.5541409254074097, + "learning_rate": 0.00016720449716353753, + "loss": 0.8917, + "step": 5977 + }, + { + "epoch": 1.0642806267806268, + "grad_norm": 0.551167905330658, + "learning_rate": 0.0001671941312324015, + "loss": 0.824, + "step": 5978 + }, + { + "epoch": 1.0644586894586894, + "grad_norm": 0.6280582547187805, + "learning_rate": 0.0001671837639847361, + "loss": 0.9708, + "step": 5979 + }, + { + "epoch": 1.0646367521367521, + "grad_norm": 0.6389226913452148, + "learning_rate": 0.00016717339542074436, + "loss": 1.0081, + "step": 5980 + }, + { + "epoch": 1.0648148148148149, + "grad_norm": 0.6677889823913574, + "learning_rate": 0.0001671630255406295, + "loss": 1.2709, + "step": 5981 + }, + { + "epoch": 1.0649928774928774, + "grad_norm": 0.5748161673545837, + "learning_rate": 0.00016715265434459465, + "loss": 0.9157, + "step": 5982 + }, + { + "epoch": 1.0651709401709402, + "grad_norm": 0.6677651405334473, + "learning_rate": 0.00016714228183284304, + "loss": 1.1097, + "step": 5983 + }, + { + "epoch": 1.065349002849003, + "grad_norm": 0.6253604292869568, + "learning_rate": 0.0001671319080055779, + "loss": 0.9819, + "step": 5984 + }, + { + "epoch": 1.0655270655270654, + "grad_norm": 0.5548844337463379, + "learning_rate": 0.0001671215328630025, + "loss": 0.9324, + "step": 5985 + }, + { + "epoch": 1.0657051282051282, + "grad_norm": 0.622062623500824, + 
"learning_rate": 0.00016711115640532004, + "loss": 0.8749, + "step": 5986 + }, + { + "epoch": 1.065883190883191, + "grad_norm": 0.6496043801307678, + "learning_rate": 0.00016710077863273394, + "loss": 1.0642, + "step": 5987 + }, + { + "epoch": 1.0660612535612535, + "grad_norm": 0.6140534281730652, + "learning_rate": 0.00016709039954544746, + "loss": 0.8928, + "step": 5988 + }, + { + "epoch": 1.0662393162393162, + "grad_norm": 0.6387218236923218, + "learning_rate": 0.00016708001914366393, + "loss": 0.9525, + "step": 5989 + }, + { + "epoch": 1.066417378917379, + "grad_norm": 0.6119858026504517, + "learning_rate": 0.0001670696374275868, + "loss": 0.8663, + "step": 5990 + }, + { + "epoch": 1.0665954415954415, + "grad_norm": 0.6722040772438049, + "learning_rate": 0.00016705925439741947, + "loss": 1.1173, + "step": 5991 + }, + { + "epoch": 1.0667735042735043, + "grad_norm": 0.8226081132888794, + "learning_rate": 0.00016704887005336534, + "loss": 1.0572, + "step": 5992 + }, + { + "epoch": 1.066951566951567, + "grad_norm": 0.7248596549034119, + "learning_rate": 0.00016703848439562785, + "loss": 1.0493, + "step": 5993 + }, + { + "epoch": 1.0671296296296295, + "grad_norm": 0.7185787558555603, + "learning_rate": 0.00016702809742441058, + "loss": 1.1366, + "step": 5994 + }, + { + "epoch": 1.0673076923076923, + "grad_norm": 0.6118780970573425, + "learning_rate": 0.00016701770913991694, + "loss": 0.9557, + "step": 5995 + }, + { + "epoch": 1.067485754985755, + "grad_norm": 0.6472596526145935, + "learning_rate": 0.0001670073195423505, + "loss": 0.9977, + "step": 5996 + }, + { + "epoch": 1.0676638176638176, + "grad_norm": 0.7110133767127991, + "learning_rate": 0.00016699692863191484, + "loss": 1.1932, + "step": 5997 + }, + { + "epoch": 1.0678418803418803, + "grad_norm": 0.5827305912971497, + "learning_rate": 0.00016698653640881354, + "loss": 0.7641, + "step": 5998 + }, + { + "epoch": 1.068019943019943, + "grad_norm": 0.527208149433136, + "learning_rate": 0.00016697614287325017, + 
"loss": 0.7683, + "step": 5999 + }, + { + "epoch": 1.0681980056980056, + "grad_norm": 0.6680626273155212, + "learning_rate": 0.00016696574802542848, + "loss": 1.1748, + "step": 6000 + }, + { + "epoch": 1.0683760683760684, + "grad_norm": 0.5947227478027344, + "learning_rate": 0.00016695535186555204, + "loss": 1.0894, + "step": 6001 + }, + { + "epoch": 1.068554131054131, + "grad_norm": 0.5828250646591187, + "learning_rate": 0.00016694495439382456, + "loss": 0.9895, + "step": 6002 + }, + { + "epoch": 1.0687321937321936, + "grad_norm": 0.5897728204727173, + "learning_rate": 0.00016693455561044978, + "loss": 0.9686, + "step": 6003 + }, + { + "epoch": 1.0689102564102564, + "grad_norm": 0.5441751480102539, + "learning_rate": 0.0001669241555156314, + "loss": 0.8948, + "step": 6004 + }, + { + "epoch": 1.0690883190883191, + "grad_norm": 0.694199800491333, + "learning_rate": 0.00016691375410957324, + "loss": 1.0824, + "step": 6005 + }, + { + "epoch": 1.0692663817663817, + "grad_norm": 0.6077630519866943, + "learning_rate": 0.00016690335139247906, + "loss": 1.0931, + "step": 6006 + }, + { + "epoch": 1.0694444444444444, + "grad_norm": 0.6558539867401123, + "learning_rate": 0.0001668929473645527, + "loss": 1.0099, + "step": 6007 + }, + { + "epoch": 1.0696225071225072, + "grad_norm": 0.5722812414169312, + "learning_rate": 0.00016688254202599798, + "loss": 0.7999, + "step": 6008 + }, + { + "epoch": 1.0698005698005697, + "grad_norm": 0.5915400981903076, + "learning_rate": 0.0001668721353770188, + "loss": 0.7866, + "step": 6009 + }, + { + "epoch": 1.0699786324786325, + "grad_norm": 0.5290952324867249, + "learning_rate": 0.00016686172741781901, + "loss": 0.793, + "step": 6010 + }, + { + "epoch": 1.0701566951566952, + "grad_norm": 0.5501774549484253, + "learning_rate": 0.00016685131814860263, + "loss": 0.8775, + "step": 6011 + }, + { + "epoch": 1.0703347578347577, + "grad_norm": 0.6192594766616821, + "learning_rate": 0.00016684090756957347, + "loss": 1.1686, + "step": 6012 + }, + { + 
"epoch": 1.0705128205128205, + "grad_norm": 0.6640267968177795, + "learning_rate": 0.00016683049568093561, + "loss": 1.1789, + "step": 6013 + }, + { + "epoch": 1.0706908831908832, + "grad_norm": 0.552893877029419, + "learning_rate": 0.00016682008248289303, + "loss": 0.7957, + "step": 6014 + }, + { + "epoch": 1.0708689458689458, + "grad_norm": 0.6406302452087402, + "learning_rate": 0.00016680966797564972, + "loss": 1.1174, + "step": 6015 + }, + { + "epoch": 1.0710470085470085, + "grad_norm": Infinity, + "learning_rate": 0.00016680966797564972, + "loss": 0.9168, + "step": 6016 + }, + { + "epoch": 1.0712250712250713, + "grad_norm": 0.6384762525558472, + "learning_rate": 0.00016679925215940975, + "loss": 0.9831, + "step": 6017 + }, + { + "epoch": 1.071403133903134, + "grad_norm": 0.5906224846839905, + "learning_rate": 0.0001667888350343772, + "loss": 0.9167, + "step": 6018 + }, + { + "epoch": 1.0715811965811965, + "grad_norm": 0.658044695854187, + "learning_rate": 0.00016677841660075617, + "loss": 1.0075, + "step": 6019 + }, + { + "epoch": 1.0717592592592593, + "grad_norm": 0.6313242316246033, + "learning_rate": 0.00016676799685875078, + "loss": 0.8551, + "step": 6020 + }, + { + "epoch": 1.0719373219373218, + "grad_norm": 0.5891841053962708, + "learning_rate": 0.00016675757580856518, + "loss": 0.8475, + "step": 6021 + }, + { + "epoch": 1.0721153846153846, + "grad_norm": 0.581317126750946, + "learning_rate": 0.00016674715345040358, + "loss": 0.9308, + "step": 6022 + }, + { + "epoch": 1.0722934472934473, + "grad_norm": 0.5952537655830383, + "learning_rate": 0.00016673672978447017, + "loss": 0.9104, + "step": 6023 + }, + { + "epoch": 1.07247150997151, + "grad_norm": 0.5934227705001831, + "learning_rate": 0.00016672630481096915, + "loss": 0.9882, + "step": 6024 + }, + { + "epoch": 1.0726495726495726, + "grad_norm": 0.5867539048194885, + "learning_rate": 0.00016671587853010482, + "loss": 1.0186, + "step": 6025 + }, + { + "epoch": 1.0728276353276354, + "grad_norm": 
0.6002280116081238, + "learning_rate": 0.00016670545094208143, + "loss": 0.92, + "step": 6026 + }, + { + "epoch": 1.073005698005698, + "grad_norm": 0.6261683702468872, + "learning_rate": 0.0001666950220471033, + "loss": 0.9293, + "step": 6027 + }, + { + "epoch": 1.0731837606837606, + "grad_norm": 0.6128147840499878, + "learning_rate": 0.00016668459184537477, + "loss": 1.0787, + "step": 6028 + }, + { + "epoch": 1.0733618233618234, + "grad_norm": 0.62148118019104, + "learning_rate": 0.00016667416033710016, + "loss": 0.8843, + "step": 6029 + }, + { + "epoch": 1.0735398860398861, + "grad_norm": 0.7166166305541992, + "learning_rate": 0.0001666637275224839, + "loss": 0.8877, + "step": 6030 + }, + { + "epoch": 1.0737179487179487, + "grad_norm": 0.5275574922561646, + "learning_rate": 0.0001666532934017304, + "loss": 0.9604, + "step": 6031 + }, + { + "epoch": 1.0738960113960114, + "grad_norm": 0.8132784962654114, + "learning_rate": 0.00016664285797504406, + "loss": 1.0203, + "step": 6032 + }, + { + "epoch": 1.074074074074074, + "grad_norm": 0.5887695550918579, + "learning_rate": 0.00016663242124262935, + "loss": 0.8819, + "step": 6033 + }, + { + "epoch": 1.0742521367521367, + "grad_norm": 0.5552900433540344, + "learning_rate": 0.00016662198320469078, + "loss": 0.7542, + "step": 6034 + }, + { + "epoch": 1.0744301994301995, + "grad_norm": 0.6228970885276794, + "learning_rate": 0.0001666115438614328, + "loss": 1.0362, + "step": 6035 + }, + { + "epoch": 1.0746082621082622, + "grad_norm": 0.7193471789360046, + "learning_rate": 0.00016660110321306003, + "loss": 1.3073, + "step": 6036 + }, + { + "epoch": 1.0747863247863247, + "grad_norm": 0.6167412996292114, + "learning_rate": 0.000166590661259777, + "loss": 0.941, + "step": 6037 + }, + { + "epoch": 1.0749643874643875, + "grad_norm": 0.5716922879219055, + "learning_rate": 0.00016658021800178827, + "loss": 0.83, + "step": 6038 + }, + { + "epoch": 1.0751424501424502, + "grad_norm": 0.6404047012329102, + "learning_rate": 
0.00016656977343929848, + "loss": 1.0617, + "step": 6039 + }, + { + "epoch": 1.0753205128205128, + "grad_norm": 0.531395435333252, + "learning_rate": 0.00016655932757251226, + "loss": 0.7785, + "step": 6040 + }, + { + "epoch": 1.0754985754985755, + "grad_norm": 0.6468462347984314, + "learning_rate": 0.0001665488804016343, + "loss": 0.7893, + "step": 6041 + }, + { + "epoch": 1.0756766381766383, + "grad_norm": 0.6539653539657593, + "learning_rate": 0.00016653843192686925, + "loss": 1.1011, + "step": 6042 + }, + { + "epoch": 1.0758547008547008, + "grad_norm": 0.630107045173645, + "learning_rate": 0.0001665279821484219, + "loss": 0.9262, + "step": 6043 + }, + { + "epoch": 1.0760327635327636, + "grad_norm": 0.5875992774963379, + "learning_rate": 0.00016651753106649688, + "loss": 1.0501, + "step": 6044 + }, + { + "epoch": 1.0762108262108263, + "grad_norm": 0.573428750038147, + "learning_rate": 0.00016650707868129904, + "loss": 1.0672, + "step": 6045 + }, + { + "epoch": 1.0763888888888888, + "grad_norm": 0.6215469241142273, + "learning_rate": 0.00016649662499303316, + "loss": 0.868, + "step": 6046 + }, + { + "epoch": 1.0765669515669516, + "grad_norm": 0.6666893362998962, + "learning_rate": 0.00016648617000190402, + "loss": 1.0965, + "step": 6047 + }, + { + "epoch": 1.0767450142450143, + "grad_norm": 0.8343498706817627, + "learning_rate": 0.00016647571370811653, + "loss": 1.2302, + "step": 6048 + }, + { + "epoch": 1.0769230769230769, + "grad_norm": 0.591147780418396, + "learning_rate": 0.0001664652561118755, + "loss": 0.9698, + "step": 6049 + }, + { + "epoch": 1.0771011396011396, + "grad_norm": 0.573375940322876, + "learning_rate": 0.00016645479721338584, + "loss": 0.8798, + "step": 6050 + }, + { + "epoch": 1.0772792022792024, + "grad_norm": 0.4956737160682678, + "learning_rate": 0.00016644433701285246, + "loss": 0.6523, + "step": 6051 + }, + { + "epoch": 1.077457264957265, + "grad_norm": 0.6896619200706482, + "learning_rate": 0.00016643387551048034, + "loss": 0.8911, + 
"step": 6052 + }, + { + "epoch": 1.0776353276353277, + "grad_norm": 0.5820416808128357, + "learning_rate": 0.00016642341270647445, + "loss": 1.1486, + "step": 6053 + }, + { + "epoch": 1.0778133903133904, + "grad_norm": 0.611132025718689, + "learning_rate": 0.00016641294860103976, + "loss": 1.0705, + "step": 6054 + }, + { + "epoch": 1.077991452991453, + "grad_norm": 0.6705698370933533, + "learning_rate": 0.00016640248319438133, + "loss": 0.9826, + "step": 6055 + }, + { + "epoch": 1.0781695156695157, + "grad_norm": 0.5987013578414917, + "learning_rate": 0.00016639201648670416, + "loss": 1.0409, + "step": 6056 + }, + { + "epoch": 1.0783475783475784, + "grad_norm": 0.6707149744033813, + "learning_rate": 0.00016638154847821332, + "loss": 1.1332, + "step": 6057 + }, + { + "epoch": 1.078525641025641, + "grad_norm": 0.6400678157806396, + "learning_rate": 0.00016637107916911393, + "loss": 1.2559, + "step": 6058 + }, + { + "epoch": 1.0787037037037037, + "grad_norm": 0.6370311379432678, + "learning_rate": 0.00016636060855961115, + "loss": 0.9752, + "step": 6059 + }, + { + "epoch": 1.0788817663817665, + "grad_norm": 0.6116052269935608, + "learning_rate": 0.00016635013664991012, + "loss": 0.8364, + "step": 6060 + }, + { + "epoch": 1.079059829059829, + "grad_norm": 0.7932127714157104, + "learning_rate": 0.00016633966344021593, + "loss": 0.939, + "step": 6061 + }, + { + "epoch": 1.0792378917378918, + "grad_norm": 0.576249897480011, + "learning_rate": 0.00016632918893073385, + "loss": 0.8911, + "step": 6062 + }, + { + "epoch": 1.0794159544159545, + "grad_norm": 0.5456888675689697, + "learning_rate": 0.00016631871312166915, + "loss": 0.8646, + "step": 6063 + }, + { + "epoch": 1.079594017094017, + "grad_norm": 0.717522919178009, + "learning_rate": 0.000166308236013227, + "loss": 1.0814, + "step": 6064 + }, + { + "epoch": 1.0797720797720798, + "grad_norm": 0.6637256145477295, + "learning_rate": 0.0001662977576056127, + "loss": 1.22, + "step": 6065 + }, + { + "epoch": 
1.0799501424501425, + "grad_norm": 0.5846666693687439, + "learning_rate": 0.0001662872778990316, + "loss": 1.1745, + "step": 6066 + }, + { + "epoch": 1.080128205128205, + "grad_norm": 0.6611326336860657, + "learning_rate": 0.00016627679689368895, + "loss": 1.1262, + "step": 6067 + }, + { + "epoch": 1.0803062678062678, + "grad_norm": 0.6022892594337463, + "learning_rate": 0.00016626631458979015, + "loss": 0.9741, + "step": 6068 + }, + { + "epoch": 1.0804843304843306, + "grad_norm": 0.5862685441970825, + "learning_rate": 0.00016625583098754058, + "loss": 0.914, + "step": 6069 + }, + { + "epoch": 1.080662393162393, + "grad_norm": 0.7089241147041321, + "learning_rate": 0.00016624534608714563, + "loss": 1.0614, + "step": 6070 + }, + { + "epoch": 1.0808404558404558, + "grad_norm": 0.5286028981208801, + "learning_rate": 0.00016623485988881076, + "loss": 0.8756, + "step": 6071 + }, + { + "epoch": 1.0810185185185186, + "grad_norm": 0.6437101364135742, + "learning_rate": 0.00016622437239274137, + "loss": 0.7222, + "step": 6072 + }, + { + "epoch": 1.0811965811965811, + "grad_norm": 0.6197740435600281, + "learning_rate": 0.000166213883599143, + "loss": 0.7876, + "step": 6073 + }, + { + "epoch": 1.0813746438746439, + "grad_norm": 0.5889328122138977, + "learning_rate": 0.0001662033935082211, + "loss": 0.9587, + "step": 6074 + }, + { + "epoch": 1.0815527065527066, + "grad_norm": 0.5353847742080688, + "learning_rate": 0.00016619290212018125, + "loss": 0.8664, + "step": 6075 + }, + { + "epoch": 1.0817307692307692, + "grad_norm": 0.7202061414718628, + "learning_rate": 0.00016618240943522898, + "loss": 1.0429, + "step": 6076 + }, + { + "epoch": 1.081908831908832, + "grad_norm": 0.5831515192985535, + "learning_rate": 0.0001661719154535699, + "loss": 1.0323, + "step": 6077 + }, + { + "epoch": 1.0820868945868947, + "grad_norm": 0.6270500421524048, + "learning_rate": 0.00016616142017540953, + "loss": 0.9272, + "step": 6078 + }, + { + "epoch": 1.0822649572649572, + "grad_norm": 
0.6064695119857788, + "learning_rate": 0.00016615092360095364, + "loss": 1.0629, + "step": 6079 + }, + { + "epoch": 1.08244301994302, + "grad_norm": 0.5578122138977051, + "learning_rate": 0.00016614042573040777, + "loss": 0.8601, + "step": 6080 + }, + { + "epoch": 1.0826210826210827, + "grad_norm": 0.5920688509941101, + "learning_rate": 0.0001661299265639777, + "loss": 1.0082, + "step": 6081 + }, + { + "epoch": 1.0827991452991452, + "grad_norm": 0.6191682815551758, + "learning_rate": 0.0001661194261018691, + "loss": 0.9645, + "step": 6082 + }, + { + "epoch": 1.082977207977208, + "grad_norm": 0.6403279304504395, + "learning_rate": 0.00016610892434428765, + "loss": 0.9263, + "step": 6083 + }, + { + "epoch": 1.0831552706552707, + "grad_norm": 0.579502284526825, + "learning_rate": 0.00016609842129143915, + "loss": 0.8997, + "step": 6084 + }, + { + "epoch": 1.0833333333333333, + "grad_norm": 0.5831437706947327, + "learning_rate": 0.00016608791694352944, + "loss": 1.0703, + "step": 6085 + }, + { + "epoch": 1.083511396011396, + "grad_norm": 0.6188452243804932, + "learning_rate": 0.00016607741130076424, + "loss": 0.8856, + "step": 6086 + }, + { + "epoch": 1.0836894586894588, + "grad_norm": 0.7413692474365234, + "learning_rate": 0.00016606690436334946, + "loss": 1.1995, + "step": 6087 + }, + { + "epoch": 1.0838675213675213, + "grad_norm": 0.5552099347114563, + "learning_rate": 0.00016605639613149093, + "loss": 0.8514, + "step": 6088 + }, + { + "epoch": 1.084045584045584, + "grad_norm": 0.5906503200531006, + "learning_rate": 0.00016604588660539452, + "loss": 0.9431, + "step": 6089 + }, + { + "epoch": 1.0842236467236468, + "grad_norm": 0.5326111316680908, + "learning_rate": 0.0001660353757852662, + "loss": 0.8306, + "step": 6090 + }, + { + "epoch": 1.0844017094017093, + "grad_norm": 0.7273091673851013, + "learning_rate": 0.0001660248636713118, + "loss": 1.1109, + "step": 6091 + }, + { + "epoch": 1.084579772079772, + "grad_norm": 0.66513592004776, + "learning_rate": 
0.00016601435026373737, + "loss": 1.0621, + "step": 6092 + }, + { + "epoch": 1.0847578347578348, + "grad_norm": 0.6470831632614136, + "learning_rate": 0.00016600383556274892, + "loss": 1.1075, + "step": 6093 + }, + { + "epoch": 1.0849358974358974, + "grad_norm": 0.6308658719062805, + "learning_rate": 0.0001659933195685524, + "loss": 0.9832, + "step": 6094 + }, + { + "epoch": 1.08511396011396, + "grad_norm": 0.6569336652755737, + "learning_rate": 0.00016598280228135388, + "loss": 0.9754, + "step": 6095 + }, + { + "epoch": 1.0852920227920229, + "grad_norm": 0.5672318339347839, + "learning_rate": 0.0001659722837013594, + "loss": 0.9075, + "step": 6096 + }, + { + "epoch": 1.0854700854700854, + "grad_norm": 0.6397247314453125, + "learning_rate": 0.00016596176382877506, + "loss": 1.0358, + "step": 6097 + }, + { + "epoch": 1.0856481481481481, + "grad_norm": 0.6046154499053955, + "learning_rate": 0.000165951242663807, + "loss": 0.9036, + "step": 6098 + }, + { + "epoch": 1.085826210826211, + "grad_norm": 0.7190790176391602, + "learning_rate": 0.00016594072020666134, + "loss": 1.05, + "step": 6099 + }, + { + "epoch": 1.0860042735042734, + "grad_norm": 0.636986255645752, + "learning_rate": 0.00016593019645754425, + "loss": 1.0648, + "step": 6100 + }, + { + "epoch": 1.0861823361823362, + "grad_norm": 0.7239426374435425, + "learning_rate": 0.00016591967141666193, + "loss": 1.3332, + "step": 6101 + }, + { + "epoch": 1.086360398860399, + "grad_norm": 0.5623281002044678, + "learning_rate": 0.00016590914508422054, + "loss": 0.997, + "step": 6102 + }, + { + "epoch": 1.0865384615384615, + "grad_norm": 0.5559574365615845, + "learning_rate": 0.00016589861746042642, + "loss": 0.9309, + "step": 6103 + }, + { + "epoch": 1.0867165242165242, + "grad_norm": 0.6056998372077942, + "learning_rate": 0.00016588808854548574, + "loss": 1.05, + "step": 6104 + }, + { + "epoch": 1.086894586894587, + "grad_norm": 0.6419603228569031, + "learning_rate": 0.00016587755833960487, + "loss": 0.8933, + "step": 
6105 + }, + { + "epoch": 1.0870726495726495, + "grad_norm": 0.5236496329307556, + "learning_rate": 0.00016586702684299006, + "loss": 1.0061, + "step": 6106 + }, + { + "epoch": 1.0872507122507122, + "grad_norm": 0.5764613747596741, + "learning_rate": 0.0001658564940558477, + "loss": 1.0218, + "step": 6107 + }, + { + "epoch": 1.087428774928775, + "grad_norm": 0.6049391627311707, + "learning_rate": 0.00016584595997838416, + "loss": 0.8157, + "step": 6108 + }, + { + "epoch": 1.0876068376068375, + "grad_norm": 0.585422933101654, + "learning_rate": 0.0001658354246108058, + "loss": 1.2761, + "step": 6109 + }, + { + "epoch": 1.0877849002849003, + "grad_norm": 0.6420125365257263, + "learning_rate": 0.00016582488795331907, + "loss": 1.1978, + "step": 6110 + }, + { + "epoch": 1.087962962962963, + "grad_norm": 0.646091878414154, + "learning_rate": 0.00016581435000613038, + "loss": 0.8946, + "step": 6111 + }, + { + "epoch": 1.0881410256410255, + "grad_norm": 0.6563934087753296, + "learning_rate": 0.00016580381076944625, + "loss": 1.0625, + "step": 6112 + }, + { + "epoch": 1.0883190883190883, + "grad_norm": 0.6796613931655884, + "learning_rate": 0.0001657932702434731, + "loss": 0.9401, + "step": 6113 + }, + { + "epoch": 1.088497150997151, + "grad_norm": 0.6248648762702942, + "learning_rate": 0.00016578272842841753, + "loss": 0.8558, + "step": 6114 + }, + { + "epoch": 1.0886752136752136, + "grad_norm": 0.5136269330978394, + "learning_rate": 0.00016577218532448605, + "loss": 0.6424, + "step": 6115 + }, + { + "epoch": 1.0888532763532763, + "grad_norm": 0.5581641793251038, + "learning_rate": 0.00016576164093188523, + "loss": 0.7923, + "step": 6116 + }, + { + "epoch": 1.089031339031339, + "grad_norm": 0.630352258682251, + "learning_rate": 0.0001657510952508216, + "loss": 0.9115, + "step": 6117 + }, + { + "epoch": 1.0892094017094016, + "grad_norm": 0.6167593002319336, + "learning_rate": 0.0001657405482815019, + "loss": 1.1112, + "step": 6118 + }, + { + "epoch": 1.0893874643874644, + 
"grad_norm": 0.5908578634262085, + "learning_rate": 0.00016573000002413271, + "loss": 1.0359, + "step": 6119 + }, + { + "epoch": 1.0895655270655271, + "grad_norm": 0.6326140761375427, + "learning_rate": 0.00016571945047892073, + "loss": 1.0459, + "step": 6120 + }, + { + "epoch": 1.0897435897435896, + "grad_norm": 0.7273572683334351, + "learning_rate": 0.00016570889964607262, + "loss": 1.0901, + "step": 6121 + }, + { + "epoch": 1.0899216524216524, + "grad_norm": 0.6168062090873718, + "learning_rate": 0.00016569834752579513, + "loss": 0.8739, + "step": 6122 + }, + { + "epoch": 1.0900997150997151, + "grad_norm": 0.5620378255844116, + "learning_rate": 0.00016568779411829497, + "loss": 0.9614, + "step": 6123 + }, + { + "epoch": 1.0902777777777777, + "grad_norm": 0.6319156885147095, + "learning_rate": 0.00016567723942377899, + "loss": 1.1031, + "step": 6124 + }, + { + "epoch": 1.0904558404558404, + "grad_norm": 0.6590072512626648, + "learning_rate": 0.00016566668344245388, + "loss": 1.0086, + "step": 6125 + }, + { + "epoch": 1.0906339031339032, + "grad_norm": 0.5823387503623962, + "learning_rate": 0.00016565612617452656, + "loss": 0.8886, + "step": 6126 + }, + { + "epoch": 1.0908119658119657, + "grad_norm": 0.5795989632606506, + "learning_rate": 0.00016564556762020381, + "loss": 0.7683, + "step": 6127 + }, + { + "epoch": 1.0909900284900285, + "grad_norm": 0.5940101742744446, + "learning_rate": 0.00016563500777969255, + "loss": 0.8873, + "step": 6128 + }, + { + "epoch": 1.0911680911680912, + "grad_norm": 0.5708247423171997, + "learning_rate": 0.00016562444665319963, + "loss": 0.7382, + "step": 6129 + }, + { + "epoch": 1.0913461538461537, + "grad_norm": 0.6339239478111267, + "learning_rate": 0.00016561388424093202, + "loss": 0.9323, + "step": 6130 + }, + { + "epoch": 1.0915242165242165, + "grad_norm": 0.720000147819519, + "learning_rate": 0.00016560332054309663, + "loss": 1.0437, + "step": 6131 + }, + { + "epoch": 1.0917022792022792, + "grad_norm": 0.686580240726471, + 
"learning_rate": 0.00016559275555990048, + "loss": 0.9841, + "step": 6132 + }, + { + "epoch": 1.091880341880342, + "grad_norm": 0.6067900061607361, + "learning_rate": 0.00016558218929155053, + "loss": 1.0862, + "step": 6133 + }, + { + "epoch": 1.0920584045584045, + "grad_norm": 0.6678896546363831, + "learning_rate": 0.00016557162173825384, + "loss": 0.8509, + "step": 6134 + }, + { + "epoch": 1.0922364672364673, + "grad_norm": 0.53044193983078, + "learning_rate": 0.0001655610529002174, + "loss": 0.9227, + "step": 6135 + }, + { + "epoch": 1.0924145299145298, + "grad_norm": 0.6499412655830383, + "learning_rate": 0.00016555048277764836, + "loss": 1.0867, + "step": 6136 + }, + { + "epoch": 1.0925925925925926, + "grad_norm": 0.6543099284172058, + "learning_rate": 0.00016553991137075374, + "loss": 0.849, + "step": 6137 + }, + { + "epoch": 1.0927706552706553, + "grad_norm": 0.5772737860679626, + "learning_rate": 0.0001655293386797407, + "loss": 0.8475, + "step": 6138 + }, + { + "epoch": 1.092948717948718, + "grad_norm": 0.616348385810852, + "learning_rate": 0.00016551876470481642, + "loss": 0.9205, + "step": 6139 + }, + { + "epoch": 1.0931267806267806, + "grad_norm": 0.7151142954826355, + "learning_rate": 0.00016550818944618801, + "loss": 1.1389, + "step": 6140 + }, + { + "epoch": 1.0933048433048433, + "grad_norm": 0.6566469669342041, + "learning_rate": 0.00016549761290406275, + "loss": 0.8216, + "step": 6141 + }, + { + "epoch": 1.0934829059829059, + "grad_norm": 0.7075428366661072, + "learning_rate": 0.00016548703507864783, + "loss": 1.065, + "step": 6142 + }, + { + "epoch": 1.0936609686609686, + "grad_norm": 0.6589360237121582, + "learning_rate": 0.00016547645597015046, + "loss": 0.9899, + "step": 6143 + }, + { + "epoch": 1.0938390313390314, + "grad_norm": 0.6445585489273071, + "learning_rate": 0.00016546587557877797, + "loss": 1.1629, + "step": 6144 + }, + { + "epoch": 1.0940170940170941, + "grad_norm": 0.6216462850570679, + "learning_rate": 0.00016545529390473763, + 
"loss": 0.9685, + "step": 6145 + }, + { + "epoch": 1.0941951566951567, + "grad_norm": 0.6195303797721863, + "learning_rate": 0.0001654447109482368, + "loss": 1.144, + "step": 6146 + }, + { + "epoch": 1.0943732193732194, + "grad_norm": 0.6625444293022156, + "learning_rate": 0.0001654341267094828, + "loss": 0.9886, + "step": 6147 + }, + { + "epoch": 1.094551282051282, + "grad_norm": 0.6449851393699646, + "learning_rate": 0.000165423541188683, + "loss": 0.9568, + "step": 6148 + }, + { + "epoch": 1.0947293447293447, + "grad_norm": 0.6490375995635986, + "learning_rate": 0.00016541295438604484, + "loss": 1.1304, + "step": 6149 + }, + { + "epoch": 1.0949074074074074, + "grad_norm": 0.6771987676620483, + "learning_rate": 0.00016540236630177574, + "loss": 1.0426, + "step": 6150 + }, + { + "epoch": 1.0950854700854702, + "grad_norm": 0.5214568376541138, + "learning_rate": 0.00016539177693608307, + "loss": 0.6742, + "step": 6151 + }, + { + "epoch": 1.0952635327635327, + "grad_norm": 0.6005097031593323, + "learning_rate": 0.00016538118628917442, + "loss": 0.9901, + "step": 6152 + }, + { + "epoch": 1.0954415954415955, + "grad_norm": 0.6449539065361023, + "learning_rate": 0.0001653705943612572, + "loss": 0.9654, + "step": 6153 + }, + { + "epoch": 1.095619658119658, + "grad_norm": 0.6443646550178528, + "learning_rate": 0.00016536000115253903, + "loss": 0.9084, + "step": 6154 + }, + { + "epoch": 1.0957977207977208, + "grad_norm": 0.6072495579719543, + "learning_rate": 0.0001653494066632274, + "loss": 0.6308, + "step": 6155 + }, + { + "epoch": 1.0959757834757835, + "grad_norm": 0.5751157999038696, + "learning_rate": 0.00016533881089352988, + "loss": 0.96, + "step": 6156 + }, + { + "epoch": 1.0961538461538463, + "grad_norm": 0.6310713291168213, + "learning_rate": 0.0001653282138436541, + "loss": 1.0997, + "step": 6157 + }, + { + "epoch": 1.0963319088319088, + "grad_norm": 0.5573651790618896, + "learning_rate": 0.00016531761551380765, + "loss": 0.9738, + "step": 6158 + }, + { + 
"epoch": 1.0965099715099715, + "grad_norm": 0.5615308880805969, + "learning_rate": 0.00016530701590419824, + "loss": 0.9658, + "step": 6159 + }, + { + "epoch": 1.0966880341880343, + "grad_norm": 0.6471942663192749, + "learning_rate": 0.0001652964150150335, + "loss": 1.0763, + "step": 6160 + }, + { + "epoch": 1.0968660968660968, + "grad_norm": 0.6305427551269531, + "learning_rate": 0.00016528581284652117, + "loss": 1.112, + "step": 6161 + }, + { + "epoch": 1.0970441595441596, + "grad_norm": 0.6881145238876343, + "learning_rate": 0.00016527520939886892, + "loss": 0.8476, + "step": 6162 + }, + { + "epoch": 1.0972222222222223, + "grad_norm": 0.6507891416549683, + "learning_rate": 0.00016526460467228458, + "loss": 1.1097, + "step": 6163 + }, + { + "epoch": 1.0974002849002849, + "grad_norm": 0.5960137844085693, + "learning_rate": 0.00016525399866697586, + "loss": 0.9934, + "step": 6164 + }, + { + "epoch": 1.0975783475783476, + "grad_norm": 0.6001808643341064, + "learning_rate": 0.0001652433913831506, + "loss": 1.0782, + "step": 6165 + }, + { + "epoch": 1.0977564102564104, + "grad_norm": 0.5639005303382874, + "learning_rate": 0.00016523278282101663, + "loss": 1.0929, + "step": 6166 + }, + { + "epoch": 1.0979344729344729, + "grad_norm": 0.5962058305740356, + "learning_rate": 0.00016522217298078177, + "loss": 1.0315, + "step": 6167 + }, + { + "epoch": 1.0981125356125356, + "grad_norm": 0.6920329928398132, + "learning_rate": 0.0001652115618626539, + "loss": 0.9176, + "step": 6168 + }, + { + "epoch": 1.0982905982905984, + "grad_norm": 0.6963527202606201, + "learning_rate": 0.00016520094946684098, + "loss": 1.2136, + "step": 6169 + }, + { + "epoch": 1.098468660968661, + "grad_norm": 0.5855711102485657, + "learning_rate": 0.00016519033579355093, + "loss": 0.8453, + "step": 6170 + }, + { + "epoch": 1.0986467236467237, + "grad_norm": 0.6454927325248718, + "learning_rate": 0.0001651797208429916, + "loss": 1.0747, + "step": 6171 + }, + { + "epoch": 1.0988247863247864, + 
"grad_norm": 0.644585907459259, + "learning_rate": 0.00016516910461537108, + "loss": 0.8165, + "step": 6172 + }, + { + "epoch": 1.099002849002849, + "grad_norm": 0.6488069891929626, + "learning_rate": 0.00016515848711089732, + "loss": 1.1048, + "step": 6173 + }, + { + "epoch": 1.0991809116809117, + "grad_norm": 0.5867953896522522, + "learning_rate": 0.00016514786832977834, + "loss": 0.63, + "step": 6174 + }, + { + "epoch": 1.0993589743589745, + "grad_norm": 0.560591459274292, + "learning_rate": 0.00016513724827222227, + "loss": 0.9255, + "step": 6175 + }, + { + "epoch": 1.099537037037037, + "grad_norm": 0.675262451171875, + "learning_rate": 0.00016512662693843707, + "loss": 0.7637, + "step": 6176 + }, + { + "epoch": 1.0997150997150997, + "grad_norm": 0.6515669822692871, + "learning_rate": 0.00016511600432863091, + "loss": 0.7579, + "step": 6177 + }, + { + "epoch": 1.0998931623931625, + "grad_norm": 0.683409571647644, + "learning_rate": 0.00016510538044301192, + "loss": 0.9183, + "step": 6178 + }, + { + "epoch": 1.100071225071225, + "grad_norm": 0.6194507479667664, + "learning_rate": 0.00016509475528178827, + "loss": 1.16, + "step": 6179 + }, + { + "epoch": 1.1002492877492878, + "grad_norm": 0.6192209720611572, + "learning_rate": 0.0001650841288451681, + "loss": 1.1392, + "step": 6180 + }, + { + "epoch": 1.1004273504273505, + "grad_norm": 0.6029189825057983, + "learning_rate": 0.0001650735011333596, + "loss": 1.1453, + "step": 6181 + }, + { + "epoch": 1.100605413105413, + "grad_norm": 0.7040731310844421, + "learning_rate": 0.00016506287214657105, + "loss": 0.9367, + "step": 6182 + }, + { + "epoch": 1.1007834757834758, + "grad_norm": 0.5909842252731323, + "learning_rate": 0.00016505224188501067, + "loss": 0.6463, + "step": 6183 + }, + { + "epoch": 1.1009615384615385, + "grad_norm": 0.6129698157310486, + "learning_rate": 0.00016504161034888674, + "loss": 0.9432, + "step": 6184 + }, + { + "epoch": 1.101139601139601, + "grad_norm": 0.6181607842445374, + "learning_rate": 
0.00016503097753840757, + "loss": 0.9934, + "step": 6185 + }, + { + "epoch": 1.1013176638176638, + "grad_norm": 0.6463226675987244, + "learning_rate": 0.0001650203434537815, + "loss": 0.8471, + "step": 6186 + }, + { + "epoch": 1.1014957264957266, + "grad_norm": 0.5999348163604736, + "learning_rate": 0.00016500970809521688, + "loss": 0.9418, + "step": 6187 + }, + { + "epoch": 1.101673789173789, + "grad_norm": 0.629504919052124, + "learning_rate": 0.00016499907146292204, + "loss": 0.9699, + "step": 6188 + }, + { + "epoch": 1.1018518518518519, + "grad_norm": 0.694767951965332, + "learning_rate": 0.00016498843355710542, + "loss": 0.8793, + "step": 6189 + }, + { + "epoch": 1.1020299145299146, + "grad_norm": 0.6205509901046753, + "learning_rate": 0.00016497779437797547, + "loss": 0.8384, + "step": 6190 + }, + { + "epoch": 1.1022079772079771, + "grad_norm": 0.6256579756736755, + "learning_rate": 0.0001649671539257406, + "loss": 0.9275, + "step": 6191 + }, + { + "epoch": 1.10238603988604, + "grad_norm": 0.6593793034553528, + "learning_rate": 0.00016495651220060933, + "loss": 1.0495, + "step": 6192 + }, + { + "epoch": 1.1025641025641026, + "grad_norm": 0.7809221148490906, + "learning_rate": 0.00016494586920279012, + "loss": 1.0485, + "step": 6193 + }, + { + "epoch": 1.1027421652421652, + "grad_norm": 0.6147717833518982, + "learning_rate": 0.0001649352249324915, + "loss": 0.8739, + "step": 6194 + }, + { + "epoch": 1.102920227920228, + "grad_norm": 0.565411388874054, + "learning_rate": 0.00016492457938992208, + "loss": 0.9759, + "step": 6195 + }, + { + "epoch": 1.1030982905982907, + "grad_norm": 0.596370279788971, + "learning_rate": 0.00016491393257529036, + "loss": 0.9658, + "step": 6196 + }, + { + "epoch": 1.1032763532763532, + "grad_norm": 0.6334326863288879, + "learning_rate": 0.00016490328448880498, + "loss": 0.8785, + "step": 6197 + }, + { + "epoch": 1.103454415954416, + "grad_norm": 0.5538334846496582, + "learning_rate": 0.0001648926351306746, + "loss": 0.7174, + 
"step": 6198 + }, + { + "epoch": 1.1036324786324787, + "grad_norm": 0.6249658465385437, + "learning_rate": 0.00016488198450110778, + "loss": 0.8579, + "step": 6199 + }, + { + "epoch": 1.1038105413105412, + "grad_norm": 0.6128895878791809, + "learning_rate": 0.00016487133260031329, + "loss": 0.8538, + "step": 6200 + }, + { + "epoch": 1.103988603988604, + "grad_norm": 0.5808702707290649, + "learning_rate": 0.0001648606794284998, + "loss": 0.8143, + "step": 6201 + }, + { + "epoch": 1.1041666666666667, + "grad_norm": 0.671419084072113, + "learning_rate": 0.00016485002498587602, + "loss": 1.1268, + "step": 6202 + }, + { + "epoch": 1.1043447293447293, + "grad_norm": 0.5706788897514343, + "learning_rate": 0.00016483936927265075, + "loss": 0.9558, + "step": 6203 + }, + { + "epoch": 1.104522792022792, + "grad_norm": 0.5700307488441467, + "learning_rate": 0.00016482871228903266, + "loss": 0.9616, + "step": 6204 + }, + { + "epoch": 1.1047008547008548, + "grad_norm": 0.5764816403388977, + "learning_rate": 0.0001648180540352307, + "loss": 0.8692, + "step": 6205 + }, + { + "epoch": 1.1048789173789173, + "grad_norm": 0.5786563754081726, + "learning_rate": 0.00016480739451145358, + "loss": 0.9406, + "step": 6206 + }, + { + "epoch": 1.10505698005698, + "grad_norm": 0.6112591624259949, + "learning_rate": 0.0001647967337179102, + "loss": 0.8999, + "step": 6207 + }, + { + "epoch": 1.1052350427350428, + "grad_norm": 0.5708907246589661, + "learning_rate": 0.00016478607165480944, + "loss": 0.9236, + "step": 6208 + }, + { + "epoch": 1.1054131054131053, + "grad_norm": 0.6742013692855835, + "learning_rate": 0.00016477540832236014, + "loss": 1.0911, + "step": 6209 + }, + { + "epoch": 1.105591168091168, + "grad_norm": 0.6382617354393005, + "learning_rate": 0.0001647647437207713, + "loss": 0.7901, + "step": 6210 + }, + { + "epoch": 1.1057692307692308, + "grad_norm": 0.6241547465324402, + "learning_rate": 0.00016475407785025188, + "loss": 1.0048, + "step": 6211 + }, + { + "epoch": 
1.1059472934472934, + "grad_norm": 0.6452877521514893, + "learning_rate": 0.00016474341071101077, + "loss": 0.8902, + "step": 6212 + }, + { + "epoch": 1.1061253561253561, + "grad_norm": 0.6212326288223267, + "learning_rate": 0.00016473274230325704, + "loss": 1.078, + "step": 6213 + }, + { + "epoch": 1.1063034188034189, + "grad_norm": 0.6870912909507751, + "learning_rate": 0.00016472207262719968, + "loss": 0.9127, + "step": 6214 + }, + { + "epoch": 1.1064814814814814, + "grad_norm": 0.6286750435829163, + "learning_rate": 0.00016471140168304777, + "loss": 1.0271, + "step": 6215 + }, + { + "epoch": 1.1066595441595442, + "grad_norm": 0.645806074142456, + "learning_rate": 0.00016470072947101036, + "loss": 1.1514, + "step": 6216 + }, + { + "epoch": 1.106837606837607, + "grad_norm": 0.6800320148468018, + "learning_rate": 0.00016469005599129653, + "loss": 0.9322, + "step": 6217 + }, + { + "epoch": 1.1070156695156694, + "grad_norm": 0.5898309946060181, + "learning_rate": 0.0001646793812441155, + "loss": 1.065, + "step": 6218 + }, + { + "epoch": 1.1071937321937322, + "grad_norm": 0.6000019907951355, + "learning_rate": 0.00016466870522967634, + "loss": 0.911, + "step": 6219 + }, + { + "epoch": 1.107371794871795, + "grad_norm": 0.6164331436157227, + "learning_rate": 0.0001646580279481882, + "loss": 0.8421, + "step": 6220 + }, + { + "epoch": 1.1075498575498575, + "grad_norm": 0.6410242319107056, + "learning_rate": 0.00016464734939986036, + "loss": 0.9688, + "step": 6221 + }, + { + "epoch": 1.1077279202279202, + "grad_norm": 0.7153300046920776, + "learning_rate": 0.00016463666958490197, + "loss": 1.0722, + "step": 6222 + }, + { + "epoch": 1.107905982905983, + "grad_norm": 0.6977026462554932, + "learning_rate": 0.00016462598850352234, + "loss": 1.0192, + "step": 6223 + }, + { + "epoch": 1.1080840455840455, + "grad_norm": 0.6379461884498596, + "learning_rate": 0.0001646153061559307, + "loss": 1.0474, + "step": 6224 + }, + { + "epoch": 1.1082621082621082, + "grad_norm": 
0.6135090589523315, + "learning_rate": 0.00016460462254233634, + "loss": 1.0082, + "step": 6225 + }, + { + "epoch": 1.108440170940171, + "grad_norm": 0.6326230764389038, + "learning_rate": 0.00016459393766294866, + "loss": 1.1097, + "step": 6226 + }, + { + "epoch": 1.1086182336182335, + "grad_norm": 0.6636839509010315, + "learning_rate": 0.0001645832515179769, + "loss": 0.9689, + "step": 6227 + }, + { + "epoch": 1.1087962962962963, + "grad_norm": 0.5713129043579102, + "learning_rate": 0.00016457256410763052, + "loss": 0.8642, + "step": 6228 + }, + { + "epoch": 1.108974358974359, + "grad_norm": 0.584204912185669, + "learning_rate": 0.00016456187543211888, + "loss": 0.9957, + "step": 6229 + }, + { + "epoch": 1.1091524216524216, + "grad_norm": 0.5920230746269226, + "learning_rate": 0.0001645511854916514, + "loss": 0.7297, + "step": 6230 + }, + { + "epoch": 1.1093304843304843, + "grad_norm": 0.6207385063171387, + "learning_rate": 0.0001645404942864375, + "loss": 0.868, + "step": 6231 + }, + { + "epoch": 1.109508547008547, + "grad_norm": 0.7267234921455383, + "learning_rate": 0.00016452980181668673, + "loss": 1.0248, + "step": 6232 + }, + { + "epoch": 1.1096866096866096, + "grad_norm": 0.5925650596618652, + "learning_rate": 0.00016451910808260852, + "loss": 1.1075, + "step": 6233 + }, + { + "epoch": 1.1098646723646723, + "grad_norm": 0.5632196664810181, + "learning_rate": 0.00016450841308441244, + "loss": 0.9865, + "step": 6234 + }, + { + "epoch": 1.110042735042735, + "grad_norm": 0.6115161180496216, + "learning_rate": 0.000164497716822308, + "loss": 1.1343, + "step": 6235 + }, + { + "epoch": 1.1102207977207976, + "grad_norm": 0.634398341178894, + "learning_rate": 0.00016448701929650477, + "loss": 1.1039, + "step": 6236 + }, + { + "epoch": 1.1103988603988604, + "grad_norm": 0.5843468308448792, + "learning_rate": 0.00016447632050721237, + "loss": 0.8462, + "step": 6237 + }, + { + "epoch": 1.1105769230769231, + "grad_norm": 0.799375593662262, + "learning_rate": 
0.0001644656204546404, + "loss": 0.9861, + "step": 6238 + }, + { + "epoch": 1.1107549857549857, + "grad_norm": 0.600289523601532, + "learning_rate": 0.0001644549191389985, + "loss": 1.0323, + "step": 6239 + }, + { + "epoch": 1.1109330484330484, + "grad_norm": 0.6154919266700745, + "learning_rate": 0.00016444421656049637, + "loss": 0.9158, + "step": 6240 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 0.6685689687728882, + "learning_rate": 0.00016443351271934367, + "loss": 1.0429, + "step": 6241 + }, + { + "epoch": 1.1112891737891737, + "grad_norm": 0.699978232383728, + "learning_rate": 0.00016442280761575016, + "loss": 1.072, + "step": 6242 + }, + { + "epoch": 1.1114672364672364, + "grad_norm": 0.6461396217346191, + "learning_rate": 0.00016441210124992556, + "loss": 0.9758, + "step": 6243 + }, + { + "epoch": 1.1116452991452992, + "grad_norm": 0.6463284492492676, + "learning_rate": 0.00016440139362207962, + "loss": 0.9205, + "step": 6244 + }, + { + "epoch": 1.1118233618233617, + "grad_norm": 0.6587556004524231, + "learning_rate": 0.00016439068473242217, + "loss": 1.0027, + "step": 6245 + }, + { + "epoch": 1.1120014245014245, + "grad_norm": 0.6896520256996155, + "learning_rate": 0.000164379974581163, + "loss": 0.9788, + "step": 6246 + }, + { + "epoch": 1.1121794871794872, + "grad_norm": 0.6766142845153809, + "learning_rate": 0.000164369263168512, + "loss": 0.9647, + "step": 6247 + }, + { + "epoch": 1.1123575498575498, + "grad_norm": 0.7024297118186951, + "learning_rate": 0.00016435855049467898, + "loss": 1.1163, + "step": 6248 + }, + { + "epoch": 1.1125356125356125, + "grad_norm": 0.6654963493347168, + "learning_rate": 0.00016434783655987385, + "loss": 0.9302, + "step": 6249 + }, + { + "epoch": 1.1127136752136753, + "grad_norm": 0.6973692774772644, + "learning_rate": 0.0001643371213643065, + "loss": 0.9585, + "step": 6250 + }, + { + "epoch": 1.1128917378917378, + "grad_norm": 0.7153545022010803, + "learning_rate": 0.000164326404908187, + "loss": 1.0485, + 
"step": 6251 + }, + { + "epoch": 1.1130698005698005, + "grad_norm": 0.6114685535430908, + "learning_rate": 0.00016431568719172516, + "loss": 0.8881, + "step": 6252 + }, + { + "epoch": 1.1132478632478633, + "grad_norm": 0.6500731706619263, + "learning_rate": 0.00016430496821513103, + "loss": 1.0658, + "step": 6253 + }, + { + "epoch": 1.113425925925926, + "grad_norm": 0.5800092220306396, + "learning_rate": 0.00016429424797861466, + "loss": 0.9158, + "step": 6254 + }, + { + "epoch": 1.1136039886039886, + "grad_norm": 0.6653759479522705, + "learning_rate": 0.00016428352648238602, + "loss": 0.9762, + "step": 6255 + }, + { + "epoch": 1.1137820512820513, + "grad_norm": 0.649208128452301, + "learning_rate": 0.00016427280372665525, + "loss": 1.1184, + "step": 6256 + }, + { + "epoch": 1.1139601139601139, + "grad_norm": 0.6665199398994446, + "learning_rate": 0.00016426207971163238, + "loss": 0.9417, + "step": 6257 + }, + { + "epoch": 1.1141381766381766, + "grad_norm": 0.6110978126525879, + "learning_rate": 0.00016425135443752758, + "loss": 1.1531, + "step": 6258 + }, + { + "epoch": 1.1143162393162394, + "grad_norm": 0.6517077088356018, + "learning_rate": 0.00016424062790455093, + "loss": 0.9055, + "step": 6259 + }, + { + "epoch": 1.114494301994302, + "grad_norm": 0.6278966665267944, + "learning_rate": 0.00016422990011291265, + "loss": 1.0087, + "step": 6260 + }, + { + "epoch": 1.1146723646723646, + "grad_norm": 0.5818809270858765, + "learning_rate": 0.00016421917106282288, + "loss": 1.0202, + "step": 6261 + }, + { + "epoch": 1.1148504273504274, + "grad_norm": 0.5670005679130554, + "learning_rate": 0.00016420844075449187, + "loss": 0.841, + "step": 6262 + }, + { + "epoch": 1.11502849002849, + "grad_norm": 0.6584762334823608, + "learning_rate": 0.00016419770918812984, + "loss": 1.0322, + "step": 6263 + }, + { + "epoch": 1.1152065527065527, + "grad_norm": 0.6023790836334229, + "learning_rate": 0.00016418697636394705, + "loss": 0.9152, + "step": 6264 + }, + { + "epoch": 
1.1153846153846154, + "grad_norm": 0.6234691739082336, + "learning_rate": 0.00016417624228215382, + "loss": 0.9555, + "step": 6265 + }, + { + "epoch": 1.1155626780626782, + "grad_norm": 0.6690816879272461, + "learning_rate": 0.00016416550694296045, + "loss": 0.9341, + "step": 6266 + }, + { + "epoch": 1.1157407407407407, + "grad_norm": 0.6030237078666687, + "learning_rate": 0.00016415477034657723, + "loss": 1.0442, + "step": 6267 + }, + { + "epoch": 1.1159188034188035, + "grad_norm": 0.5954633951187134, + "learning_rate": 0.00016414403249321455, + "loss": 0.9132, + "step": 6268 + }, + { + "epoch": 1.116096866096866, + "grad_norm": 0.7876830101013184, + "learning_rate": 0.0001641332933830828, + "loss": 0.9456, + "step": 6269 + }, + { + "epoch": 1.1162749287749287, + "grad_norm": 0.6776009798049927, + "learning_rate": 0.00016412255301639244, + "loss": 0.9022, + "step": 6270 + }, + { + "epoch": 1.1164529914529915, + "grad_norm": 0.6094426512718201, + "learning_rate": 0.0001641118113933538, + "loss": 0.9629, + "step": 6271 + }, + { + "epoch": 1.1166310541310542, + "grad_norm": 0.5818213820457458, + "learning_rate": 0.00016410106851417742, + "loss": 0.9049, + "step": 6272 + }, + { + "epoch": 1.1168091168091168, + "grad_norm": 0.5668078064918518, + "learning_rate": 0.00016409032437907377, + "loss": 1.0011, + "step": 6273 + }, + { + "epoch": 1.1169871794871795, + "grad_norm": 0.6984922289848328, + "learning_rate": 0.00016407957898825334, + "loss": 0.9454, + "step": 6274 + }, + { + "epoch": 1.1171652421652423, + "grad_norm": 0.5509830117225647, + "learning_rate": 0.00016406883234192668, + "loss": 0.9132, + "step": 6275 + }, + { + "epoch": 1.1173433048433048, + "grad_norm": 0.5117461681365967, + "learning_rate": 0.00016405808444030435, + "loss": 0.7675, + "step": 6276 + }, + { + "epoch": 1.1175213675213675, + "grad_norm": 0.6358339786529541, + "learning_rate": 0.00016404733528359688, + "loss": 0.9777, + "step": 6277 + }, + { + "epoch": 1.1176994301994303, + "grad_norm": 
0.5870591402053833, + "learning_rate": 0.00016403658487201494, + "loss": 0.8576, + "step": 6278 + }, + { + "epoch": 1.1178774928774928, + "grad_norm": 0.6532407999038696, + "learning_rate": 0.00016402583320576915, + "loss": 1.1787, + "step": 6279 + }, + { + "epoch": 1.1180555555555556, + "grad_norm": 0.6374639272689819, + "learning_rate": 0.00016401508028507017, + "loss": 0.9298, + "step": 6280 + }, + { + "epoch": 1.1182336182336183, + "grad_norm": 0.7280316352844238, + "learning_rate": 0.00016400432611012869, + "loss": 1.1081, + "step": 6281 + }, + { + "epoch": 1.1184116809116809, + "grad_norm": 0.6070699095726013, + "learning_rate": 0.00016399357068115538, + "loss": 0.9107, + "step": 6282 + }, + { + "epoch": 1.1185897435897436, + "grad_norm": 0.6701489686965942, + "learning_rate": 0.00016398281399836097, + "loss": 1.0879, + "step": 6283 + }, + { + "epoch": 1.1187678062678064, + "grad_norm": 0.6343162655830383, + "learning_rate": 0.00016397205606195626, + "loss": 0.8552, + "step": 6284 + }, + { + "epoch": 1.118945868945869, + "grad_norm": 0.6450608968734741, + "learning_rate": 0.00016396129687215198, + "loss": 1.1119, + "step": 6285 + }, + { + "epoch": 1.1191239316239316, + "grad_norm": 0.7219904661178589, + "learning_rate": 0.00016395053642915896, + "loss": 0.9081, + "step": 6286 + }, + { + "epoch": 1.1193019943019944, + "grad_norm": 0.6189733147621155, + "learning_rate": 0.00016393977473318802, + "loss": 0.9818, + "step": 6287 + }, + { + "epoch": 1.119480056980057, + "grad_norm": 0.6310907602310181, + "learning_rate": 0.00016392901178445004, + "loss": 1.0334, + "step": 6288 + }, + { + "epoch": 1.1196581196581197, + "grad_norm": 0.6556720733642578, + "learning_rate": 0.00016391824758315587, + "loss": 1.0452, + "step": 6289 + }, + { + "epoch": 1.1198361823361824, + "grad_norm": 0.6697782278060913, + "learning_rate": 0.00016390748212951638, + "loss": 0.9627, + "step": 6290 + }, + { + "epoch": 1.120014245014245, + "grad_norm": 0.6341549754142761, + "learning_rate": 
0.00016389671542374256, + "loss": 1.112, + "step": 6291 + }, + { + "epoch": 1.1201923076923077, + "grad_norm": 0.6913946270942688, + "learning_rate": 0.00016388594746604535, + "loss": 0.9622, + "step": 6292 + }, + { + "epoch": 1.1203703703703705, + "grad_norm": 0.695488691329956, + "learning_rate": 0.0001638751782566357, + "loss": 1.0951, + "step": 6293 + }, + { + "epoch": 1.120548433048433, + "grad_norm": 0.6965359449386597, + "learning_rate": 0.00016386440779572463, + "loss": 1.1742, + "step": 6294 + }, + { + "epoch": 1.1207264957264957, + "grad_norm": 0.624679684638977, + "learning_rate": 0.00016385363608352314, + "loss": 0.9756, + "step": 6295 + }, + { + "epoch": 1.1209045584045585, + "grad_norm": 0.7511318922042847, + "learning_rate": 0.0001638428631202423, + "loss": 0.907, + "step": 6296 + }, + { + "epoch": 1.121082621082621, + "grad_norm": 0.5334641337394714, + "learning_rate": 0.00016383208890609317, + "loss": 0.7932, + "step": 6297 + }, + { + "epoch": 1.1212606837606838, + "grad_norm": 0.7518552541732788, + "learning_rate": 0.00016382131344128687, + "loss": 1.1556, + "step": 6298 + }, + { + "epoch": 1.1214387464387465, + "grad_norm": 0.618618369102478, + "learning_rate": 0.00016381053672603449, + "loss": 1.1027, + "step": 6299 + }, + { + "epoch": 1.121616809116809, + "grad_norm": 0.638956606388092, + "learning_rate": 0.00016379975876054724, + "loss": 1.0377, + "step": 6300 + }, + { + "epoch": 1.1217948717948718, + "grad_norm": 0.8031370639801025, + "learning_rate": 0.0001637889795450362, + "loss": 1.0821, + "step": 6301 + }, + { + "epoch": 1.1219729344729346, + "grad_norm": 0.6710168123245239, + "learning_rate": 0.00016377819907971265, + "loss": 1.2896, + "step": 6302 + }, + { + "epoch": 1.122150997150997, + "grad_norm": 0.5850739479064941, + "learning_rate": 0.00016376741736478777, + "loss": 1.0836, + "step": 6303 + }, + { + "epoch": 1.1223290598290598, + "grad_norm": 0.6410611271858215, + "learning_rate": 0.0001637566344004728, + "loss": 1.0395, + 
"step": 6304 + }, + { + "epoch": 1.1225071225071226, + "grad_norm": 0.6884660720825195, + "learning_rate": 0.00016374585018697903, + "loss": 0.871, + "step": 6305 + }, + { + "epoch": 1.1226851851851851, + "grad_norm": 0.622207522392273, + "learning_rate": 0.00016373506472451777, + "loss": 0.9897, + "step": 6306 + }, + { + "epoch": 1.1228632478632479, + "grad_norm": 0.6018275618553162, + "learning_rate": 0.00016372427801330028, + "loss": 0.8398, + "step": 6307 + }, + { + "epoch": 1.1230413105413106, + "grad_norm": 0.6451539993286133, + "learning_rate": 0.00016371349005353796, + "loss": 0.9878, + "step": 6308 + }, + { + "epoch": 1.1232193732193732, + "grad_norm": 0.5549424886703491, + "learning_rate": 0.00016370270084544215, + "loss": 0.844, + "step": 6309 + }, + { + "epoch": 1.123397435897436, + "grad_norm": 0.6082940697669983, + "learning_rate": 0.00016369191038922423, + "loss": 1.0704, + "step": 6310 + }, + { + "epoch": 1.1235754985754987, + "grad_norm": 0.6423100829124451, + "learning_rate": 0.00016368111868509563, + "loss": 1.0639, + "step": 6311 + }, + { + "epoch": 1.1237535612535612, + "grad_norm": 0.6274200081825256, + "learning_rate": 0.00016367032573326784, + "loss": 0.9996, + "step": 6312 + }, + { + "epoch": 1.123931623931624, + "grad_norm": 0.6618558168411255, + "learning_rate": 0.00016365953153395227, + "loss": 0.8074, + "step": 6313 + }, + { + "epoch": 1.1241096866096867, + "grad_norm": 0.7624069452285767, + "learning_rate": 0.00016364873608736038, + "loss": 0.9741, + "step": 6314 + }, + { + "epoch": 1.1242877492877492, + "grad_norm": 0.5391361117362976, + "learning_rate": 0.00016363793939370375, + "loss": 0.6992, + "step": 6315 + }, + { + "epoch": 1.124465811965812, + "grad_norm": 0.7564396858215332, + "learning_rate": 0.0001636271414531939, + "loss": 1.1971, + "step": 6316 + }, + { + "epoch": 1.1246438746438747, + "grad_norm": 0.6584066152572632, + "learning_rate": 0.00016361634226604239, + "loss": 1.0842, + "step": 6317 + }, + { + "epoch": 
1.1248219373219372, + "grad_norm": 0.6851227283477783, + "learning_rate": 0.00016360554183246078, + "loss": 1.0879, + "step": 6318 + }, + { + "epoch": 1.125, + "grad_norm": 0.5699417591094971, + "learning_rate": 0.00016359474015266074, + "loss": 0.782, + "step": 6319 + }, + { + "epoch": 1.1251780626780628, + "grad_norm": 0.5495570302009583, + "learning_rate": 0.00016358393722685385, + "loss": 1.076, + "step": 6320 + }, + { + "epoch": 1.1253561253561253, + "grad_norm": 0.5872206091880798, + "learning_rate": 0.0001635731330552518, + "loss": 0.8601, + "step": 6321 + }, + { + "epoch": 1.125534188034188, + "grad_norm": 0.7012827396392822, + "learning_rate": 0.00016356232763806627, + "loss": 1.0443, + "step": 6322 + }, + { + "epoch": 1.1257122507122508, + "grad_norm": 0.6645881533622742, + "learning_rate": 0.00016355152097550897, + "loss": 1.0027, + "step": 6323 + }, + { + "epoch": 1.1258903133903133, + "grad_norm": 0.7376120090484619, + "learning_rate": 0.00016354071306779163, + "loss": 1.1941, + "step": 6324 + }, + { + "epoch": 1.126068376068376, + "grad_norm": 0.648932695388794, + "learning_rate": 0.000163529903915126, + "loss": 1.096, + "step": 6325 + }, + { + "epoch": 1.1262464387464388, + "grad_norm": 0.6186314821243286, + "learning_rate": 0.0001635190935177239, + "loss": 1.011, + "step": 6326 + }, + { + "epoch": 1.1264245014245013, + "grad_norm": 0.5964710116386414, + "learning_rate": 0.0001635082818757971, + "loss": 0.8893, + "step": 6327 + }, + { + "epoch": 1.126602564102564, + "grad_norm": 0.5264934301376343, + "learning_rate": 0.00016349746898955747, + "loss": 0.7325, + "step": 6328 + }, + { + "epoch": 1.1267806267806268, + "grad_norm": 0.6523048877716064, + "learning_rate": 0.00016348665485921678, + "loss": 1.0488, + "step": 6329 + }, + { + "epoch": 1.1269586894586894, + "grad_norm": 0.6878600120544434, + "learning_rate": 0.00016347583948498703, + "loss": 1.0926, + "step": 6330 + }, + { + "epoch": 1.1271367521367521, + "grad_norm": 0.592656672000885, + 
"learning_rate": 0.00016346502286708004, + "loss": 0.978, + "step": 6331 + }, + { + "epoch": 1.1273148148148149, + "grad_norm": 0.6338315606117249, + "learning_rate": 0.00016345420500570777, + "loss": 1.1048, + "step": 6332 + }, + { + "epoch": 1.1274928774928774, + "grad_norm": 0.5955204367637634, + "learning_rate": 0.00016344338590108218, + "loss": 0.88, + "step": 6333 + }, + { + "epoch": 1.1276709401709402, + "grad_norm": 0.690448522567749, + "learning_rate": 0.0001634325655534152, + "loss": 1.0564, + "step": 6334 + }, + { + "epoch": 1.127849002849003, + "grad_norm": 0.6125795841217041, + "learning_rate": 0.00016342174396291888, + "loss": 1.0608, + "step": 6335 + }, + { + "epoch": 1.1280270655270654, + "grad_norm": 0.6387807726860046, + "learning_rate": 0.00016341092112980523, + "loss": 0.9581, + "step": 6336 + }, + { + "epoch": 1.1282051282051282, + "grad_norm": 0.6247823238372803, + "learning_rate": 0.0001634000970542863, + "loss": 0.932, + "step": 6337 + }, + { + "epoch": 1.128383190883191, + "grad_norm": 0.5928077697753906, + "learning_rate": 0.0001633892717365742, + "loss": 0.8963, + "step": 6338 + }, + { + "epoch": 1.1285612535612535, + "grad_norm": 0.5922074913978577, + "learning_rate": 0.000163378445176881, + "loss": 0.9772, + "step": 6339 + }, + { + "epoch": 1.1287393162393162, + "grad_norm": 0.6573056578636169, + "learning_rate": 0.00016336761737541878, + "loss": 0.8233, + "step": 6340 + }, + { + "epoch": 1.128917378917379, + "grad_norm": 0.627772867679596, + "learning_rate": 0.0001633567883323998, + "loss": 0.9618, + "step": 6341 + }, + { + "epoch": 1.1290954415954415, + "grad_norm": 0.6066579818725586, + "learning_rate": 0.0001633459580480361, + "loss": 0.9066, + "step": 6342 + }, + { + "epoch": 1.1292735042735043, + "grad_norm": 0.670295000076294, + "learning_rate": 0.00016333512652253997, + "loss": 0.8003, + "step": 6343 + }, + { + "epoch": 1.129451566951567, + "grad_norm": 0.6402488946914673, + "learning_rate": 0.0001633242937561236, + "loss": 
0.998, + "step": 6344 + }, + { + "epoch": 1.1296296296296295, + "grad_norm": 0.7224995493888855, + "learning_rate": 0.00016331345974899923, + "loss": 1.0308, + "step": 6345 + }, + { + "epoch": 1.1298076923076923, + "grad_norm": 0.5019716620445251, + "learning_rate": 0.00016330262450137917, + "loss": 0.6874, + "step": 6346 + }, + { + "epoch": 1.129985754985755, + "grad_norm": 0.5774167776107788, + "learning_rate": 0.00016329178801347566, + "loss": 0.8287, + "step": 6347 + }, + { + "epoch": 1.1301638176638176, + "grad_norm": 0.7797795534133911, + "learning_rate": 0.00016328095028550103, + "loss": 1.2145, + "step": 6348 + }, + { + "epoch": 1.1303418803418803, + "grad_norm": 0.5384017825126648, + "learning_rate": 0.00016327011131766765, + "loss": 0.8022, + "step": 6349 + }, + { + "epoch": 1.130519943019943, + "grad_norm": 0.6350888609886169, + "learning_rate": 0.00016325927111018786, + "loss": 1.1178, + "step": 6350 + }, + { + "epoch": 1.1306980056980056, + "grad_norm": 0.6386831998825073, + "learning_rate": 0.0001632484296632741, + "loss": 0.967, + "step": 6351 + }, + { + "epoch": 1.1308760683760684, + "grad_norm": 0.6214167475700378, + "learning_rate": 0.0001632375869771387, + "loss": 0.9416, + "step": 6352 + }, + { + "epoch": 1.131054131054131, + "grad_norm": 0.6145567297935486, + "learning_rate": 0.00016322674305199416, + "loss": 0.9175, + "step": 6353 + }, + { + "epoch": 1.1312321937321936, + "grad_norm": 0.7027857303619385, + "learning_rate": 0.00016321589788805297, + "loss": 1.0063, + "step": 6354 + }, + { + "epoch": 1.1314102564102564, + "grad_norm": 0.6942669153213501, + "learning_rate": 0.00016320505148552755, + "loss": 0.9191, + "step": 6355 + }, + { + "epoch": 1.1315883190883191, + "grad_norm": 0.6388658285140991, + "learning_rate": 0.0001631942038446304, + "loss": 0.993, + "step": 6356 + }, + { + "epoch": 1.131766381766382, + "grad_norm": 0.6627292633056641, + "learning_rate": 0.00016318335496557415, + "loss": 1.0055, + "step": 6357 + }, + { + "epoch": 
1.1319444444444444, + "grad_norm": 0.7997342944145203, + "learning_rate": 0.0001631725048485713, + "loss": 0.9019, + "step": 6358 + }, + { + "epoch": 1.1321225071225072, + "grad_norm": 0.8817830681800842, + "learning_rate": 0.00016316165349383445, + "loss": 0.9793, + "step": 6359 + }, + { + "epoch": 1.1323005698005697, + "grad_norm": 0.5629408955574036, + "learning_rate": 0.00016315080090157621, + "loss": 0.6139, + "step": 6360 + }, + { + "epoch": 1.1324786324786325, + "grad_norm": 0.647220253944397, + "learning_rate": 0.0001631399470720092, + "loss": 0.9776, + "step": 6361 + }, + { + "epoch": 1.1326566951566952, + "grad_norm": 0.6762630939483643, + "learning_rate": 0.0001631290920053461, + "loss": 1.1027, + "step": 6362 + }, + { + "epoch": 1.132834757834758, + "grad_norm": 0.5862727761268616, + "learning_rate": 0.00016311823570179957, + "loss": 1.1359, + "step": 6363 + }, + { + "epoch": 1.1330128205128205, + "grad_norm": 0.7042981386184692, + "learning_rate": 0.00016310737816158235, + "loss": 1.142, + "step": 6364 + }, + { + "epoch": 1.1331908831908832, + "grad_norm": 0.5990639328956604, + "learning_rate": 0.00016309651938490712, + "loss": 0.9306, + "step": 6365 + }, + { + "epoch": 1.1333689458689458, + "grad_norm": 0.5894871950149536, + "learning_rate": 0.00016308565937198669, + "loss": 0.8343, + "step": 6366 + }, + { + "epoch": 1.1335470085470085, + "grad_norm": 0.6863628029823303, + "learning_rate": 0.0001630747981230338, + "loss": 0.9552, + "step": 6367 + }, + { + "epoch": 1.1337250712250713, + "grad_norm": 0.7438958287239075, + "learning_rate": 0.00016306393563826128, + "loss": 1.0422, + "step": 6368 + }, + { + "epoch": 1.133903133903134, + "grad_norm": 0.5695775747299194, + "learning_rate": 0.00016305307191788194, + "loss": 0.8633, + "step": 6369 + }, + { + "epoch": 1.1340811965811965, + "grad_norm": 0.6257741451263428, + "learning_rate": 0.00016304220696210863, + "loss": 1.0333, + "step": 6370 + }, + { + "epoch": 1.1342592592592593, + "grad_norm": 
0.6366072297096252, + "learning_rate": 0.00016303134077115425, + "loss": 1.1452, + "step": 6371 + }, + { + "epoch": 1.1344373219373218, + "grad_norm": 0.624569296836853, + "learning_rate": 0.00016302047334523168, + "loss": 1.0569, + "step": 6372 + }, + { + "epoch": 1.1346153846153846, + "grad_norm": 0.5585938096046448, + "learning_rate": 0.00016300960468455382, + "loss": 0.9612, + "step": 6373 + }, + { + "epoch": 1.1347934472934473, + "grad_norm": 0.5738831162452698, + "learning_rate": 0.00016299873478933368, + "loss": 0.9206, + "step": 6374 + }, + { + "epoch": 1.13497150997151, + "grad_norm": 0.6797143220901489, + "learning_rate": 0.00016298786365978417, + "loss": 1.0748, + "step": 6375 + }, + { + "epoch": 1.1351495726495726, + "grad_norm": 0.6341326832771301, + "learning_rate": 0.00016297699129611833, + "loss": 0.9901, + "step": 6376 + }, + { + "epoch": 1.1353276353276354, + "grad_norm": 0.6568490862846375, + "learning_rate": 0.00016296611769854916, + "loss": 1.0598, + "step": 6377 + }, + { + "epoch": 1.135505698005698, + "grad_norm": 0.6151928901672363, + "learning_rate": 0.00016295524286728973, + "loss": 0.8352, + "step": 6378 + }, + { + "epoch": 1.1356837606837606, + "grad_norm": 0.7209593057632446, + "learning_rate": 0.0001629443668025531, + "loss": 0.9945, + "step": 6379 + }, + { + "epoch": 1.1358618233618234, + "grad_norm": 0.6600689888000488, + "learning_rate": 0.00016293348950455235, + "loss": 1.0572, + "step": 6380 + }, + { + "epoch": 1.1360398860398861, + "grad_norm": 0.5587523579597473, + "learning_rate": 0.0001629226109735006, + "loss": 0.8526, + "step": 6381 + }, + { + "epoch": 1.1362179487179487, + "grad_norm": 0.6184542775154114, + "learning_rate": 0.00016291173120961102, + "loss": 0.8246, + "step": 6382 + }, + { + "epoch": 1.1363960113960114, + "grad_norm": 0.6604713797569275, + "learning_rate": 0.00016290085021309673, + "loss": 1.0349, + "step": 6383 + }, + { + "epoch": 1.136574074074074, + "grad_norm": 0.5880835056304932, + "learning_rate": 
0.00016288996798417097, + "loss": 0.8726, + "step": 6384 + }, + { + "epoch": 1.1367521367521367, + "grad_norm": 0.5770880579948425, + "learning_rate": 0.00016287908452304692, + "loss": 0.7639, + "step": 6385 + }, + { + "epoch": 1.1369301994301995, + "grad_norm": 0.5719713568687439, + "learning_rate": 0.00016286819982993782, + "loss": 0.9717, + "step": 6386 + }, + { + "epoch": 1.1371082621082622, + "grad_norm": 0.7028461694717407, + "learning_rate": 0.00016285731390505695, + "loss": 1.0147, + "step": 6387 + }, + { + "epoch": 1.1372863247863247, + "grad_norm": 0.5396828651428223, + "learning_rate": 0.00016284642674861756, + "loss": 0.8119, + "step": 6388 + }, + { + "epoch": 1.1374643874643875, + "grad_norm": 0.592580258846283, + "learning_rate": 0.00016283553836083303, + "loss": 1.0914, + "step": 6389 + }, + { + "epoch": 1.13764245014245, + "grad_norm": 0.634596586227417, + "learning_rate": 0.00016282464874191663, + "loss": 1.1037, + "step": 6390 + }, + { + "epoch": 1.1378205128205128, + "grad_norm": 0.6462705731391907, + "learning_rate": 0.00016281375789208176, + "loss": 1.1523, + "step": 6391 + }, + { + "epoch": 1.1379985754985755, + "grad_norm": 0.6527917385101318, + "learning_rate": 0.0001628028658115418, + "loss": 1.0415, + "step": 6392 + }, + { + "epoch": 1.1381766381766383, + "grad_norm": 0.6309964060783386, + "learning_rate": 0.00016279197250051013, + "loss": 0.9747, + "step": 6393 + }, + { + "epoch": 1.1383547008547008, + "grad_norm": 0.6342993974685669, + "learning_rate": 0.00016278107795920018, + "loss": 0.9897, + "step": 6394 + }, + { + "epoch": 1.1385327635327636, + "grad_norm": 0.7149887084960938, + "learning_rate": 0.00016277018218782544, + "loss": 0.9659, + "step": 6395 + }, + { + "epoch": 1.138710826210826, + "grad_norm": 0.7219462394714355, + "learning_rate": 0.00016275928518659938, + "loss": 0.9301, + "step": 6396 + }, + { + "epoch": 1.1388888888888888, + "grad_norm": 0.6649485230445862, + "learning_rate": 0.0001627483869557355, + "loss": 0.9012, + 
"step": 6397 + }, + { + "epoch": 1.1390669515669516, + "grad_norm": 0.6910027861595154, + "learning_rate": 0.00016273748749544731, + "loss": 0.956, + "step": 6398 + }, + { + "epoch": 1.1392450142450143, + "grad_norm": 0.6369016766548157, + "learning_rate": 0.00016272658680594837, + "loss": 0.8027, + "step": 6399 + }, + { + "epoch": 1.1394230769230769, + "grad_norm": 0.6540524959564209, + "learning_rate": 0.00016271568488745227, + "loss": 1.2397, + "step": 6400 + }, + { + "epoch": 1.1396011396011396, + "grad_norm": 0.5912376046180725, + "learning_rate": 0.00016270478174017263, + "loss": 0.8453, + "step": 6401 + }, + { + "epoch": 1.1397792022792024, + "grad_norm": 0.6847240924835205, + "learning_rate": 0.00016269387736432303, + "loss": 0.9776, + "step": 6402 + }, + { + "epoch": 1.139957264957265, + "grad_norm": 0.6465024352073669, + "learning_rate": 0.00016268297176011716, + "loss": 0.8971, + "step": 6403 + }, + { + "epoch": 1.1401353276353277, + "grad_norm": 0.6639063954353333, + "learning_rate": 0.00016267206492776866, + "loss": 0.9756, + "step": 6404 + }, + { + "epoch": 1.1403133903133904, + "grad_norm": 0.6343763470649719, + "learning_rate": 0.00016266115686749123, + "loss": 0.9368, + "step": 6405 + }, + { + "epoch": 1.140491452991453, + "grad_norm": 0.7144993543624878, + "learning_rate": 0.0001626502475794986, + "loss": 0.9285, + "step": 6406 + }, + { + "epoch": 1.1406695156695157, + "grad_norm": 0.6217414736747742, + "learning_rate": 0.00016263933706400451, + "loss": 0.8867, + "step": 6407 + }, + { + "epoch": 1.1408475783475784, + "grad_norm": 0.6843730807304382, + "learning_rate": 0.00016262842532122274, + "loss": 0.9863, + "step": 6408 + }, + { + "epoch": 1.141025641025641, + "grad_norm": 0.6866166591644287, + "learning_rate": 0.00016261751235136705, + "loss": 1.0517, + "step": 6409 + }, + { + "epoch": 1.1412037037037037, + "grad_norm": 0.6650584936141968, + "learning_rate": 0.0001626065981546513, + "loss": 1.0629, + "step": 6410 + }, + { + "epoch": 
1.1413817663817665, + "grad_norm": 0.5805012583732605, + "learning_rate": 0.00016259568273128933, + "loss": 0.8175, + "step": 6411 + }, + { + "epoch": 1.141559829059829, + "grad_norm": 0.7005903124809265, + "learning_rate": 0.00016258476608149497, + "loss": 1.0267, + "step": 6412 + }, + { + "epoch": 1.1417378917378918, + "grad_norm": 0.6293461322784424, + "learning_rate": 0.00016257384820548217, + "loss": 1.1034, + "step": 6413 + }, + { + "epoch": 1.1419159544159545, + "grad_norm": 0.6281774640083313, + "learning_rate": 0.00016256292910346476, + "loss": 1.0775, + "step": 6414 + }, + { + "epoch": 1.142094017094017, + "grad_norm": 0.5912862420082092, + "learning_rate": 0.0001625520087756567, + "loss": 0.9589, + "step": 6415 + }, + { + "epoch": 1.1422720797720798, + "grad_norm": 0.5813978314399719, + "learning_rate": 0.00016254108722227198, + "loss": 0.9195, + "step": 6416 + }, + { + "epoch": 1.1424501424501425, + "grad_norm": 0.650805652141571, + "learning_rate": 0.00016253016444352458, + "loss": 1.0207, + "step": 6417 + }, + { + "epoch": 1.142628205128205, + "grad_norm": 0.6909520030021667, + "learning_rate": 0.00016251924043962851, + "loss": 0.9854, + "step": 6418 + }, + { + "epoch": 1.1428062678062678, + "grad_norm": 0.6054595112800598, + "learning_rate": 0.0001625083152107978, + "loss": 0.852, + "step": 6419 + }, + { + "epoch": 1.1429843304843306, + "grad_norm": 0.601078987121582, + "learning_rate": 0.00016249738875724647, + "loss": 0.9609, + "step": 6420 + }, + { + "epoch": 1.143162393162393, + "grad_norm": 0.5340180397033691, + "learning_rate": 0.00016248646107918868, + "loss": 0.8364, + "step": 6421 + }, + { + "epoch": 1.1433404558404558, + "grad_norm": 0.6687821745872498, + "learning_rate": 0.00016247553217683846, + "loss": 1.005, + "step": 6422 + }, + { + "epoch": 1.1435185185185186, + "grad_norm": 0.6347902417182922, + "learning_rate": 0.00016246460205040998, + "loss": 1.026, + "step": 6423 + }, + { + "epoch": 1.1436965811965811, + "grad_norm": 
0.6136734485626221, + "learning_rate": 0.00016245367070011736, + "loss": 0.7811, + "step": 6424 + }, + { + "epoch": 1.1438746438746439, + "grad_norm": 0.6591334342956543, + "learning_rate": 0.00016244273812617482, + "loss": 0.991, + "step": 6425 + }, + { + "epoch": 1.1440527065527066, + "grad_norm": 0.6062475442886353, + "learning_rate": 0.00016243180432879656, + "loss": 0.9879, + "step": 6426 + }, + { + "epoch": 1.1442307692307692, + "grad_norm": 0.5941380858421326, + "learning_rate": 0.00016242086930819678, + "loss": 0.9771, + "step": 6427 + }, + { + "epoch": 1.144408831908832, + "grad_norm": 0.7320533990859985, + "learning_rate": 0.00016240993306458973, + "loss": 1.0919, + "step": 6428 + }, + { + "epoch": 1.1445868945868947, + "grad_norm": 0.6998075246810913, + "learning_rate": 0.00016239899559818962, + "loss": 1.0721, + "step": 6429 + }, + { + "epoch": 1.1447649572649572, + "grad_norm": 0.847931444644928, + "learning_rate": 0.0001623880569092109, + "loss": 0.8759, + "step": 6430 + }, + { + "epoch": 1.14494301994302, + "grad_norm": 0.6670104265213013, + "learning_rate": 0.00016237711699786775, + "loss": 1.0515, + "step": 6431 + }, + { + "epoch": 1.1451210826210827, + "grad_norm": 0.601759672164917, + "learning_rate": 0.00016236617586437463, + "loss": 0.7298, + "step": 6432 + }, + { + "epoch": 1.1452991452991452, + "grad_norm": 0.6411594152450562, + "learning_rate": 0.00016235523350894578, + "loss": 0.9336, + "step": 6433 + }, + { + "epoch": 1.145477207977208, + "grad_norm": 0.6485120058059692, + "learning_rate": 0.0001623442899317957, + "loss": 1.1215, + "step": 6434 + }, + { + "epoch": 1.1456552706552707, + "grad_norm": 0.6041508316993713, + "learning_rate": 0.00016233334513313875, + "loss": 0.8917, + "step": 6435 + }, + { + "epoch": 1.1458333333333333, + "grad_norm": 0.6292745471000671, + "learning_rate": 0.0001623223991131894, + "loss": 0.9976, + "step": 6436 + }, + { + "epoch": 1.146011396011396, + "grad_norm": 0.5442200303077698, + "learning_rate": 
0.0001623114518721621, + "loss": 0.8072, + "step": 6437 + }, + { + "epoch": 1.1461894586894588, + "grad_norm": 0.6668170094490051, + "learning_rate": 0.00016230050341027136, + "loss": 0.9641, + "step": 6438 + }, + { + "epoch": 1.1463675213675213, + "grad_norm": 0.644186794757843, + "learning_rate": 0.00016228955372773164, + "loss": 0.9248, + "step": 6439 + }, + { + "epoch": 1.146545584045584, + "grad_norm": 0.6661991477012634, + "learning_rate": 0.00016227860282475753, + "loss": 0.8719, + "step": 6440 + }, + { + "epoch": 1.1467236467236468, + "grad_norm": 0.5232062935829163, + "learning_rate": 0.00016226765070156355, + "loss": 0.5418, + "step": 6441 + }, + { + "epoch": 1.1469017094017093, + "grad_norm": 0.573176383972168, + "learning_rate": 0.00016225669735836436, + "loss": 1.0858, + "step": 6442 + }, + { + "epoch": 1.147079772079772, + "grad_norm": 0.6137439608573914, + "learning_rate": 0.00016224574279537446, + "loss": 1.1205, + "step": 6443 + }, + { + "epoch": 1.1472578347578348, + "grad_norm": 0.6328136920928955, + "learning_rate": 0.00016223478701280855, + "loss": 0.8957, + "step": 6444 + }, + { + "epoch": 1.1474358974358974, + "grad_norm": 0.6687374114990234, + "learning_rate": 0.00016222383001088126, + "loss": 1.0318, + "step": 6445 + }, + { + "epoch": 1.14761396011396, + "grad_norm": 0.6057115793228149, + "learning_rate": 0.0001622128717898073, + "loss": 0.9575, + "step": 6446 + }, + { + "epoch": 1.1477920227920229, + "grad_norm": 0.6758735775947571, + "learning_rate": 0.0001622019123498013, + "loss": 1.2273, + "step": 6447 + }, + { + "epoch": 1.1479700854700854, + "grad_norm": 0.6233550310134888, + "learning_rate": 0.0001621909516910781, + "loss": 0.7875, + "step": 6448 + }, + { + "epoch": 1.1481481481481481, + "grad_norm": 0.6371827721595764, + "learning_rate": 0.0001621799898138524, + "loss": 1.0488, + "step": 6449 + }, + { + "epoch": 1.148326210826211, + "grad_norm": 0.6179831624031067, + "learning_rate": 0.00016216902671833892, + "loss": 0.9792, + 
"step": 6450 + }, + { + "epoch": 1.1485042735042734, + "grad_norm": 0.6234193444252014, + "learning_rate": 0.00016215806240475256, + "loss": 0.927, + "step": 6451 + }, + { + "epoch": 1.1486823361823362, + "grad_norm": 0.6940563917160034, + "learning_rate": 0.00016214709687330803, + "loss": 1.047, + "step": 6452 + }, + { + "epoch": 1.148860398860399, + "grad_norm": 0.6567606925964355, + "learning_rate": 0.00016213613012422027, + "loss": 0.9695, + "step": 6453 + }, + { + "epoch": 1.1490384615384615, + "grad_norm": 0.7374183535575867, + "learning_rate": 0.0001621251621577041, + "loss": 1.0443, + "step": 6454 + }, + { + "epoch": 1.1492165242165242, + "grad_norm": 0.6789869666099548, + "learning_rate": 0.00016211419297397443, + "loss": 1.0319, + "step": 6455 + }, + { + "epoch": 1.149394586894587, + "grad_norm": 0.6225521564483643, + "learning_rate": 0.00016210322257324619, + "loss": 1.0529, + "step": 6456 + }, + { + "epoch": 1.1495726495726495, + "grad_norm": 0.619701623916626, + "learning_rate": 0.00016209225095573432, + "loss": 0.962, + "step": 6457 + }, + { + "epoch": 1.1497507122507122, + "grad_norm": 0.6132834553718567, + "learning_rate": 0.00016208127812165375, + "loss": 0.9588, + "step": 6458 + }, + { + "epoch": 1.149928774928775, + "grad_norm": 0.6005367040634155, + "learning_rate": 0.00016207030407121954, + "loss": 0.9497, + "step": 6459 + }, + { + "epoch": 1.1501068376068375, + "grad_norm": 0.575309157371521, + "learning_rate": 0.00016205932880464664, + "loss": 1.0035, + "step": 6460 + }, + { + "epoch": 1.1502849002849003, + "grad_norm": 0.5958710312843323, + "learning_rate": 0.0001620483523221501, + "loss": 1.0004, + "step": 6461 + }, + { + "epoch": 1.150462962962963, + "grad_norm": 0.5934719443321228, + "learning_rate": 0.000162037374623945, + "loss": 0.8694, + "step": 6462 + }, + { + "epoch": 1.1506410256410255, + "grad_norm": 0.6042510271072388, + "learning_rate": 0.00016202639571024643, + "loss": 0.8598, + "step": 6463 + }, + { + "epoch": 
1.1508190883190883, + "grad_norm": 0.6206158399581909, + "learning_rate": 0.00016201541558126946, + "loss": 0.961, + "step": 6464 + }, + { + "epoch": 1.150997150997151, + "grad_norm": 0.5997715592384338, + "learning_rate": 0.00016200443423722925, + "loss": 0.8686, + "step": 6465 + }, + { + "epoch": 1.1511752136752136, + "grad_norm": 0.742457926273346, + "learning_rate": 0.00016199345167834098, + "loss": 1.1113, + "step": 6466 + }, + { + "epoch": 1.1513532763532763, + "grad_norm": 0.6772766709327698, + "learning_rate": 0.00016198246790481976, + "loss": 1.0717, + "step": 6467 + }, + { + "epoch": 1.151531339031339, + "grad_norm": 0.6127712726593018, + "learning_rate": 0.0001619714829168809, + "loss": 0.8887, + "step": 6468 + }, + { + "epoch": 1.1517094017094016, + "grad_norm": 0.5585067272186279, + "learning_rate": 0.00016196049671473954, + "loss": 1.0144, + "step": 6469 + }, + { + "epoch": 1.1518874643874644, + "grad_norm": 0.6269431710243225, + "learning_rate": 0.00016194950929861092, + "loss": 1.0206, + "step": 6470 + }, + { + "epoch": 1.1520655270655271, + "grad_norm": 0.6270785331726074, + "learning_rate": 0.0001619385206687104, + "loss": 1.0517, + "step": 6471 + }, + { + "epoch": 1.1522435897435896, + "grad_norm": 0.744712233543396, + "learning_rate": 0.00016192753082525322, + "loss": 1.0699, + "step": 6472 + }, + { + "epoch": 1.1524216524216524, + "grad_norm": 0.7025929689407349, + "learning_rate": 0.00016191653976845474, + "loss": 0.951, + "step": 6473 + }, + { + "epoch": 1.1525997150997151, + "grad_norm": 0.6175379753112793, + "learning_rate": 0.00016190554749853024, + "loss": 1.2153, + "step": 6474 + }, + { + "epoch": 1.1527777777777777, + "grad_norm": 0.6212149858474731, + "learning_rate": 0.00016189455401569513, + "loss": 1.0428, + "step": 6475 + }, + { + "epoch": 1.1529558404558404, + "grad_norm": 0.6716817617416382, + "learning_rate": 0.00016188355932016484, + "loss": 1.179, + "step": 6476 + }, + { + "epoch": 1.1531339031339032, + "grad_norm": 
0.6247739791870117, + "learning_rate": 0.00016187256341215476, + "loss": 0.9451, + "step": 6477 + }, + { + "epoch": 1.153311965811966, + "grad_norm": 0.6223008036613464, + "learning_rate": 0.00016186156629188032, + "loss": 0.9915, + "step": 6478 + }, + { + "epoch": 1.1534900284900285, + "grad_norm": 0.5610866546630859, + "learning_rate": 0.000161850567959557, + "loss": 0.7741, + "step": 6479 + }, + { + "epoch": 1.1536680911680912, + "grad_norm": 0.6241226196289062, + "learning_rate": 0.0001618395684154003, + "loss": 1.2193, + "step": 6480 + }, + { + "epoch": 1.1538461538461537, + "grad_norm": 0.703789472579956, + "learning_rate": 0.00016182856765962567, + "loss": 1.0725, + "step": 6481 + }, + { + "epoch": 1.1540242165242165, + "grad_norm": 0.6802006959915161, + "learning_rate": 0.00016181756569244872, + "loss": 1.0908, + "step": 6482 + }, + { + "epoch": 1.1542022792022792, + "grad_norm": 0.6504136919975281, + "learning_rate": 0.000161806562514085, + "loss": 0.9706, + "step": 6483 + }, + { + "epoch": 1.154380341880342, + "grad_norm": 0.7217034101486206, + "learning_rate": 0.00016179555812475003, + "loss": 0.9084, + "step": 6484 + }, + { + "epoch": 1.1545584045584045, + "grad_norm": 0.5919039249420166, + "learning_rate": 0.0001617845525246595, + "loss": 0.949, + "step": 6485 + }, + { + "epoch": 1.1547364672364673, + "grad_norm": 0.6160184741020203, + "learning_rate": 0.00016177354571402902, + "loss": 0.8144, + "step": 6486 + }, + { + "epoch": 1.1549145299145298, + "grad_norm": 0.7323806285858154, + "learning_rate": 0.00016176253769307426, + "loss": 1.0528, + "step": 6487 + }, + { + "epoch": 1.1550925925925926, + "grad_norm": 0.6051317453384399, + "learning_rate": 0.0001617515284620108, + "loss": 0.9558, + "step": 6488 + }, + { + "epoch": 1.1552706552706553, + "grad_norm": 0.6418905258178711, + "learning_rate": 0.00016174051802105447, + "loss": 1.062, + "step": 6489 + }, + { + "epoch": 1.155448717948718, + "grad_norm": 0.6914883852005005, + "learning_rate": 
0.00016172950637042096, + "loss": 0.9999, + "step": 6490 + }, + { + "epoch": 1.1556267806267806, + "grad_norm": 0.5558316707611084, + "learning_rate": 0.000161718493510326, + "loss": 0.9561, + "step": 6491 + }, + { + "epoch": 1.1558048433048433, + "grad_norm": 0.6632496118545532, + "learning_rate": 0.00016170747944098531, + "loss": 1.0133, + "step": 6492 + }, + { + "epoch": 1.1559829059829059, + "grad_norm": 0.6407149434089661, + "learning_rate": 0.00016169646416261478, + "loss": 1.0563, + "step": 6493 + }, + { + "epoch": 1.1561609686609686, + "grad_norm": 0.8128494024276733, + "learning_rate": 0.0001616854476754302, + "loss": 1.1559, + "step": 6494 + }, + { + "epoch": 1.1563390313390314, + "grad_norm": 0.6403429508209229, + "learning_rate": 0.00016167442997964742, + "loss": 1.0983, + "step": 6495 + }, + { + "epoch": 1.1565170940170941, + "grad_norm": 0.76612788438797, + "learning_rate": 0.0001616634110754823, + "loss": 0.973, + "step": 6496 + }, + { + "epoch": 1.1566951566951567, + "grad_norm": 0.6914355754852295, + "learning_rate": 0.0001616523909631507, + "loss": 0.9307, + "step": 6497 + }, + { + "epoch": 1.1568732193732194, + "grad_norm": 0.546602725982666, + "learning_rate": 0.00016164136964286863, + "loss": 1.0328, + "step": 6498 + }, + { + "epoch": 1.157051282051282, + "grad_norm": 0.5695818662643433, + "learning_rate": 0.00016163034711485193, + "loss": 0.9607, + "step": 6499 + }, + { + "epoch": 1.1572293447293447, + "grad_norm": 0.5649738311767578, + "learning_rate": 0.00016161932337931662, + "loss": 1.1521, + "step": 6500 + }, + { + "epoch": 1.1574074074074074, + "grad_norm": 0.6437582969665527, + "learning_rate": 0.00016160829843647867, + "loss": 0.9613, + "step": 6501 + }, + { + "epoch": 1.1575854700854702, + "grad_norm": 0.5841929316520691, + "learning_rate": 0.0001615972722865541, + "loss": 0.8187, + "step": 6502 + }, + { + "epoch": 1.1577635327635327, + "grad_norm": 0.6481246948242188, + "learning_rate": 0.00016158624492975892, + "loss": 1.0447, + 
"step": 6503 + }, + { + "epoch": 1.1579415954415955, + "grad_norm": 0.629804790019989, + "learning_rate": 0.0001615752163663092, + "loss": 0.9034, + "step": 6504 + }, + { + "epoch": 1.158119658119658, + "grad_norm": 0.5797054171562195, + "learning_rate": 0.00016156418659642104, + "loss": 0.8168, + "step": 6505 + }, + { + "epoch": 1.1582977207977208, + "grad_norm": 0.588424563407898, + "learning_rate": 0.00016155315562031052, + "loss": 0.828, + "step": 6506 + }, + { + "epoch": 1.1584757834757835, + "grad_norm": 0.7120068669319153, + "learning_rate": 0.0001615421234381938, + "loss": 1.0637, + "step": 6507 + }, + { + "epoch": 1.1586538461538463, + "grad_norm": 0.6635081768035889, + "learning_rate": 0.00016153109005028702, + "loss": 0.9838, + "step": 6508 + }, + { + "epoch": 1.1588319088319088, + "grad_norm": 0.6080414056777954, + "learning_rate": 0.00016152005545680634, + "loss": 0.983, + "step": 6509 + }, + { + "epoch": 1.1590099715099715, + "grad_norm": 0.7131237983703613, + "learning_rate": 0.00016150901965796796, + "loss": 1.1053, + "step": 6510 + }, + { + "epoch": 1.159188034188034, + "grad_norm": 0.6051005125045776, + "learning_rate": 0.00016149798265398813, + "loss": 0.9903, + "step": 6511 + }, + { + "epoch": 1.1593660968660968, + "grad_norm": 0.6193733811378479, + "learning_rate": 0.00016148694444508306, + "loss": 1.0478, + "step": 6512 + }, + { + "epoch": 1.1595441595441596, + "grad_norm": 0.567888081073761, + "learning_rate": 0.00016147590503146905, + "loss": 0.7995, + "step": 6513 + }, + { + "epoch": 1.1597222222222223, + "grad_norm": 0.6889783143997192, + "learning_rate": 0.00016146486441336242, + "loss": 0.9684, + "step": 6514 + }, + { + "epoch": 1.1599002849002849, + "grad_norm": 0.6470308303833008, + "learning_rate": 0.0001614538225909794, + "loss": 0.9824, + "step": 6515 + }, + { + "epoch": 1.1600783475783476, + "grad_norm": 0.6833886504173279, + "learning_rate": 0.00016144277956453638, + "loss": 0.9845, + "step": 6516 + }, + { + "epoch": 
1.1602564102564104, + "grad_norm": 0.5827815532684326, + "learning_rate": 0.00016143173533424978, + "loss": 0.9476, + "step": 6517 + }, + { + "epoch": 1.1604344729344729, + "grad_norm": 0.6701242327690125, + "learning_rate": 0.00016142068990033593, + "loss": 1.0839, + "step": 6518 + }, + { + "epoch": 1.1606125356125356, + "grad_norm": 0.5844996571540833, + "learning_rate": 0.00016140964326301122, + "loss": 0.8861, + "step": 6519 + }, + { + "epoch": 1.1607905982905984, + "grad_norm": 0.5831994414329529, + "learning_rate": 0.00016139859542249214, + "loss": 0.9817, + "step": 6520 + }, + { + "epoch": 1.160968660968661, + "grad_norm": 0.6830124855041504, + "learning_rate": 0.0001613875463789951, + "loss": 0.8749, + "step": 6521 + }, + { + "epoch": 1.1611467236467237, + "grad_norm": 0.6003018021583557, + "learning_rate": 0.00016137649613273667, + "loss": 0.9593, + "step": 6522 + }, + { + "epoch": 1.1613247863247864, + "grad_norm": 0.5973994731903076, + "learning_rate": 0.00016136544468393327, + "loss": 1.0384, + "step": 6523 + }, + { + "epoch": 1.161502849002849, + "grad_norm": 0.6702523827552795, + "learning_rate": 0.00016135439203280143, + "loss": 1.0431, + "step": 6524 + }, + { + "epoch": 1.1616809116809117, + "grad_norm": 0.6160697937011719, + "learning_rate": 0.00016134333817955775, + "loss": 1.0339, + "step": 6525 + }, + { + "epoch": 1.1618589743589745, + "grad_norm": 0.7078264355659485, + "learning_rate": 0.0001613322831244188, + "loss": 1.0285, + "step": 6526 + }, + { + "epoch": 1.162037037037037, + "grad_norm": 0.5744216442108154, + "learning_rate": 0.00016132122686760117, + "loss": 0.6589, + "step": 6527 + }, + { + "epoch": 1.1622150997150997, + "grad_norm": 0.6802098155021667, + "learning_rate": 0.00016131016940932146, + "loss": 0.9532, + "step": 6528 + }, + { + "epoch": 1.1623931623931625, + "grad_norm": 0.6523237228393555, + "learning_rate": 0.00016129911074979635, + "loss": 0.9409, + "step": 6529 + }, + { + "epoch": 1.162571225071225, + "grad_norm": 
0.710307776927948, + "learning_rate": 0.00016128805088924252, + "loss": 1.2536, + "step": 6530 + }, + { + "epoch": 1.1627492877492878, + "grad_norm": 0.6349819898605347, + "learning_rate": 0.0001612769898278766, + "loss": 1.0857, + "step": 6531 + }, + { + "epoch": 1.1629273504273505, + "grad_norm": 0.5348139405250549, + "learning_rate": 0.00016126592756591542, + "loss": 0.5969, + "step": 6532 + }, + { + "epoch": 1.163105413105413, + "grad_norm": 0.635619580745697, + "learning_rate": 0.00016125486410357564, + "loss": 0.9885, + "step": 6533 + }, + { + "epoch": 1.1632834757834758, + "grad_norm": 0.6434559226036072, + "learning_rate": 0.000161243799441074, + "loss": 0.8377, + "step": 6534 + }, + { + "epoch": 1.1634615384615385, + "grad_norm": 0.6509647369384766, + "learning_rate": 0.00016123273357862737, + "loss": 0.8393, + "step": 6535 + }, + { + "epoch": 1.163639601139601, + "grad_norm": 0.6179081797599792, + "learning_rate": 0.0001612216665164525, + "loss": 0.9143, + "step": 6536 + }, + { + "epoch": 1.1638176638176638, + "grad_norm": 0.5923223495483398, + "learning_rate": 0.0001612105982547663, + "loss": 1.0185, + "step": 6537 + }, + { + "epoch": 1.1639957264957266, + "grad_norm": 0.702150285243988, + "learning_rate": 0.00016119952879378556, + "loss": 0.863, + "step": 6538 + }, + { + "epoch": 1.164173789173789, + "grad_norm": 0.6596643328666687, + "learning_rate": 0.00016118845813372715, + "loss": 1.0089, + "step": 6539 + }, + { + "epoch": 1.1643518518518519, + "grad_norm": 0.7675769329071045, + "learning_rate": 0.00016117738627480804, + "loss": 1.0179, + "step": 6540 + }, + { + "epoch": 1.1645299145299146, + "grad_norm": 0.6742541193962097, + "learning_rate": 0.00016116631321724513, + "loss": 1.0663, + "step": 6541 + }, + { + "epoch": 1.1647079772079771, + "grad_norm": 0.7379785776138306, + "learning_rate": 0.0001611552389612554, + "loss": 1.0162, + "step": 6542 + }, + { + "epoch": 1.16488603988604, + "grad_norm": 0.5729365944862366, + "learning_rate": 
0.00016114416350705577, + "loss": 0.8146, + "step": 6543 + }, + { + "epoch": 1.1650641025641026, + "grad_norm": 0.6481349468231201, + "learning_rate": 0.00016113308685486327, + "loss": 1.0748, + "step": 6544 + }, + { + "epoch": 1.1652421652421652, + "grad_norm": 0.5588181018829346, + "learning_rate": 0.00016112200900489493, + "loss": 0.7511, + "step": 6545 + }, + { + "epoch": 1.165420227920228, + "grad_norm": 0.674363911151886, + "learning_rate": 0.0001611109299573678, + "loss": 0.9852, + "step": 6546 + }, + { + "epoch": 1.1655982905982907, + "grad_norm": 0.6712620854377747, + "learning_rate": 0.00016109984971249893, + "loss": 0.9558, + "step": 6547 + }, + { + "epoch": 1.1657763532763532, + "grad_norm": 0.5260626077651978, + "learning_rate": 0.00016108876827050544, + "loss": 0.7008, + "step": 6548 + }, + { + "epoch": 1.165954415954416, + "grad_norm": 0.6056292057037354, + "learning_rate": 0.00016107768563160445, + "loss": 0.7756, + "step": 6549 + }, + { + "epoch": 1.1661324786324787, + "grad_norm": 0.5725821256637573, + "learning_rate": 0.00016106660179601308, + "loss": 0.8228, + "step": 6550 + }, + { + "epoch": 1.1663105413105412, + "grad_norm": 0.6708397269248962, + "learning_rate": 0.00016105551676394848, + "loss": 1.0711, + "step": 6551 + }, + { + "epoch": 1.166488603988604, + "grad_norm": 0.645453155040741, + "learning_rate": 0.00016104443053562787, + "loss": 0.9299, + "step": 6552 + }, + { + "epoch": 1.1666666666666667, + "grad_norm": 0.6743524074554443, + "learning_rate": 0.00016103334311126847, + "loss": 0.8977, + "step": 6553 + }, + { + "epoch": 1.1668447293447293, + "grad_norm": 0.7248545289039612, + "learning_rate": 0.0001610222544910875, + "loss": 1.2135, + "step": 6554 + }, + { + "epoch": 1.167022792022792, + "grad_norm": 0.5798853635787964, + "learning_rate": 0.00016101116467530217, + "loss": 0.857, + "step": 6555 + }, + { + "epoch": 1.1672008547008548, + "grad_norm": 0.6828082799911499, + "learning_rate": 0.00016100007366412985, + "loss": 0.9405, + 
"step": 6556 + }, + { + "epoch": 1.1673789173789173, + "grad_norm": 0.6820163130760193, + "learning_rate": 0.0001609889814577878, + "loss": 0.9144, + "step": 6557 + }, + { + "epoch": 1.16755698005698, + "grad_norm": 0.6482275128364563, + "learning_rate": 0.00016097788805649333, + "loss": 0.8586, + "step": 6558 + }, + { + "epoch": 1.1677350427350428, + "grad_norm": 0.6404715180397034, + "learning_rate": 0.00016096679346046385, + "loss": 0.7018, + "step": 6559 + }, + { + "epoch": 1.1679131054131053, + "grad_norm": 0.6315203309059143, + "learning_rate": 0.0001609556976699167, + "loss": 0.9602, + "step": 6560 + }, + { + "epoch": 1.168091168091168, + "grad_norm": 0.5521387457847595, + "learning_rate": 0.00016094460068506925, + "loss": 0.9294, + "step": 6561 + }, + { + "epoch": 1.1682692307692308, + "grad_norm": 0.583372175693512, + "learning_rate": 0.00016093350250613895, + "loss": 1.077, + "step": 6562 + }, + { + "epoch": 1.1684472934472934, + "grad_norm": 0.5990512371063232, + "learning_rate": 0.00016092240313334325, + "loss": 1.0102, + "step": 6563 + }, + { + "epoch": 1.1686253561253561, + "grad_norm": 0.675128161907196, + "learning_rate": 0.00016091130256689964, + "loss": 1.0407, + "step": 6564 + }, + { + "epoch": 1.1688034188034189, + "grad_norm": 0.48797324299812317, + "learning_rate": 0.00016090020080702556, + "loss": 0.7821, + "step": 6565 + }, + { + "epoch": 1.1689814814814814, + "grad_norm": 0.7487484216690063, + "learning_rate": 0.00016088909785393857, + "loss": 1.0444, + "step": 6566 + }, + { + "epoch": 1.1691595441595442, + "grad_norm": 0.6288858652114868, + "learning_rate": 0.00016087799370785618, + "loss": 1.1854, + "step": 6567 + }, + { + "epoch": 1.169337606837607, + "grad_norm": 0.6639021635055542, + "learning_rate": 0.000160866888368996, + "loss": 0.9632, + "step": 6568 + }, + { + "epoch": 1.1695156695156694, + "grad_norm": 0.6553738713264465, + "learning_rate": 0.00016085578183757556, + "loss": 1.2765, + "step": 6569 + }, + { + "epoch": 
1.1696937321937322, + "grad_norm": 0.7489066123962402, + "learning_rate": 0.00016084467411381248, + "loss": 1.0705, + "step": 6570 + }, + { + "epoch": 1.169871794871795, + "grad_norm": 0.7079828381538391, + "learning_rate": 0.00016083356519792444, + "loss": 0.8256, + "step": 6571 + }, + { + "epoch": 1.1700498575498575, + "grad_norm": 0.7065926790237427, + "learning_rate": 0.00016082245509012902, + "loss": 1.0439, + "step": 6572 + }, + { + "epoch": 1.1702279202279202, + "grad_norm": 0.6113346815109253, + "learning_rate": 0.00016081134379064395, + "loss": 0.9153, + "step": 6573 + }, + { + "epoch": 1.170405982905983, + "grad_norm": 0.6094171404838562, + "learning_rate": 0.0001608002312996869, + "loss": 0.9723, + "step": 6574 + }, + { + "epoch": 1.1705840455840455, + "grad_norm": 0.6208072900772095, + "learning_rate": 0.00016078911761747565, + "loss": 0.948, + "step": 6575 + }, + { + "epoch": 1.1707621082621082, + "grad_norm": 0.5736680626869202, + "learning_rate": 0.00016077800274422792, + "loss": 0.9155, + "step": 6576 + }, + { + "epoch": 1.170940170940171, + "grad_norm": 0.6793957948684692, + "learning_rate": 0.0001607668866801615, + "loss": 0.9574, + "step": 6577 + }, + { + "epoch": 1.1711182336182335, + "grad_norm": 0.6251805424690247, + "learning_rate": 0.00016075576942549413, + "loss": 1.0319, + "step": 6578 + }, + { + "epoch": 1.1712962962962963, + "grad_norm": 0.628882110118866, + "learning_rate": 0.0001607446509804437, + "loss": 0.9336, + "step": 6579 + }, + { + "epoch": 1.171474358974359, + "grad_norm": 0.6712356805801392, + "learning_rate": 0.000160733531345228, + "loss": 1.0958, + "step": 6580 + }, + { + "epoch": 1.1716524216524216, + "grad_norm": 0.599365770816803, + "learning_rate": 0.0001607224105200649, + "loss": 0.9814, + "step": 6581 + }, + { + "epoch": 1.1718304843304843, + "grad_norm": 0.5798245668411255, + "learning_rate": 0.00016071128850517235, + "loss": 1.0355, + "step": 6582 + }, + { + "epoch": 1.172008547008547, + "grad_norm": 
0.7646229863166809, + "learning_rate": 0.00016070016530076817, + "loss": 0.9976, + "step": 6583 + }, + { + "epoch": 1.1721866096866096, + "grad_norm": 0.6371127367019653, + "learning_rate": 0.0001606890409070704, + "loss": 0.9588, + "step": 6584 + }, + { + "epoch": 1.1723646723646723, + "grad_norm": 0.6497066617012024, + "learning_rate": 0.0001606779153242969, + "loss": 0.8817, + "step": 6585 + }, + { + "epoch": 1.172542735042735, + "grad_norm": 0.7255781888961792, + "learning_rate": 0.0001606667885526657, + "loss": 1.1319, + "step": 6586 + }, + { + "epoch": 1.1727207977207976, + "grad_norm": 0.67711341381073, + "learning_rate": 0.00016065566059239483, + "loss": 1.0755, + "step": 6587 + }, + { + "epoch": 1.1728988603988604, + "grad_norm": 0.6159650087356567, + "learning_rate": 0.00016064453144370227, + "loss": 0.9892, + "step": 6588 + }, + { + "epoch": 1.1730769230769231, + "grad_norm": 0.658938467502594, + "learning_rate": 0.00016063340110680609, + "loss": 0.9131, + "step": 6589 + }, + { + "epoch": 1.1732549857549857, + "grad_norm": 0.6754795908927917, + "learning_rate": 0.00016062226958192438, + "loss": 1.0119, + "step": 6590 + }, + { + "epoch": 1.1734330484330484, + "grad_norm": 0.6453405022621155, + "learning_rate": 0.00016061113686927523, + "loss": 0.997, + "step": 6591 + }, + { + "epoch": 1.1736111111111112, + "grad_norm": 0.6580284237861633, + "learning_rate": 0.00016060000296907675, + "loss": 0.8432, + "step": 6592 + }, + { + "epoch": 1.173789173789174, + "grad_norm": 0.6588153839111328, + "learning_rate": 0.00016058886788154712, + "loss": 1.0725, + "step": 6593 + }, + { + "epoch": 1.1739672364672364, + "grad_norm": 0.6247910857200623, + "learning_rate": 0.00016057773160690447, + "loss": 0.8736, + "step": 6594 + }, + { + "epoch": 1.1741452991452992, + "grad_norm": 0.579594075679779, + "learning_rate": 0.000160566594145367, + "loss": 0.8809, + "step": 6595 + }, + { + "epoch": 1.1743233618233617, + "grad_norm": 0.6738116145133972, + "learning_rate": 
0.00016055545549715293, + "loss": 0.825, + "step": 6596 + }, + { + "epoch": 1.1745014245014245, + "grad_norm": 0.6658982634544373, + "learning_rate": 0.00016054431566248054, + "loss": 1.0809, + "step": 6597 + }, + { + "epoch": 1.1746794871794872, + "grad_norm": 0.5367915630340576, + "learning_rate": 0.00016053317464156803, + "loss": 0.9005, + "step": 6598 + }, + { + "epoch": 1.17485754985755, + "grad_norm": 0.7243228554725647, + "learning_rate": 0.00016052203243463372, + "loss": 1.0573, + "step": 6599 + }, + { + "epoch": 1.1750356125356125, + "grad_norm": 0.6359432935714722, + "learning_rate": 0.0001605108890418959, + "loss": 0.8569, + "step": 6600 + }, + { + "epoch": 1.1752136752136753, + "grad_norm": 0.6565225720405579, + "learning_rate": 0.0001604997444635729, + "loss": 0.9748, + "step": 6601 + }, + { + "epoch": 1.1753917378917378, + "grad_norm": 0.7124663591384888, + "learning_rate": 0.0001604885986998831, + "loss": 1.0271, + "step": 6602 + }, + { + "epoch": 1.1755698005698005, + "grad_norm": 0.659766435623169, + "learning_rate": 0.00016047745175104487, + "loss": 1.0635, + "step": 6603 + }, + { + "epoch": 1.1757478632478633, + "grad_norm": 0.5874318480491638, + "learning_rate": 0.00016046630361727656, + "loss": 0.9257, + "step": 6604 + }, + { + "epoch": 1.175925925925926, + "grad_norm": 0.587345540523529, + "learning_rate": 0.0001604551542987967, + "loss": 1.0759, + "step": 6605 + }, + { + "epoch": 1.1761039886039886, + "grad_norm": 0.733567476272583, + "learning_rate": 0.00016044400379582364, + "loss": 0.9877, + "step": 6606 + }, + { + "epoch": 1.1762820512820513, + "grad_norm": 0.6538317203521729, + "learning_rate": 0.0001604328521085759, + "loss": 1.0094, + "step": 6607 + }, + { + "epoch": 1.1764601139601139, + "grad_norm": 0.6279696822166443, + "learning_rate": 0.00016042169923727195, + "loss": 1.1049, + "step": 6608 + }, + { + "epoch": 1.1766381766381766, + "grad_norm": 0.6949752569198608, + "learning_rate": 0.00016041054518213033, + "loss": 1.1418, + 
"step": 6609 + }, + { + "epoch": 1.1768162393162394, + "grad_norm": 0.6144010424613953, + "learning_rate": 0.00016039938994336957, + "loss": 1.0306, + "step": 6610 + }, + { + "epoch": 1.176994301994302, + "grad_norm": 0.5868683457374573, + "learning_rate": 0.00016038823352120823, + "loss": 0.9894, + "step": 6611 + }, + { + "epoch": 1.1771723646723646, + "grad_norm": 0.7181115746498108, + "learning_rate": 0.0001603770759158649, + "loss": 1.1674, + "step": 6612 + }, + { + "epoch": 1.1773504273504274, + "grad_norm": 0.6271308064460754, + "learning_rate": 0.00016036591712755818, + "loss": 0.9726, + "step": 6613 + }, + { + "epoch": 1.17752849002849, + "grad_norm": 0.6922675371170044, + "learning_rate": 0.00016035475715650668, + "loss": 0.9142, + "step": 6614 + }, + { + "epoch": 1.1777065527065527, + "grad_norm": 0.6838833689689636, + "learning_rate": 0.00016034359600292913, + "loss": 1.1627, + "step": 6615 + }, + { + "epoch": 1.1778846153846154, + "grad_norm": 0.6628252267837524, + "learning_rate": 0.00016033243366704418, + "loss": 0.739, + "step": 6616 + }, + { + "epoch": 1.1780626780626782, + "grad_norm": 0.6367576122283936, + "learning_rate": 0.0001603212701490705, + "loss": 0.9015, + "step": 6617 + }, + { + "epoch": 1.1782407407407407, + "grad_norm": 0.6498967409133911, + "learning_rate": 0.00016031010544922687, + "loss": 0.9645, + "step": 6618 + }, + { + "epoch": 1.1784188034188035, + "grad_norm": 0.468795508146286, + "learning_rate": 0.00016029893956773198, + "loss": 0.7305, + "step": 6619 + }, + { + "epoch": 1.178596866096866, + "grad_norm": 0.6355500817298889, + "learning_rate": 0.00016028777250480465, + "loss": 0.9183, + "step": 6620 + }, + { + "epoch": 1.1787749287749287, + "grad_norm": 0.7582615613937378, + "learning_rate": 0.0001602766042606636, + "loss": 1.1641, + "step": 6621 + }, + { + "epoch": 1.1789529914529915, + "grad_norm": 0.580035924911499, + "learning_rate": 0.00016026543483552776, + "loss": 0.9164, + "step": 6622 + }, + { + "epoch": 
1.1791310541310542, + "grad_norm": 0.6198559999465942, + "learning_rate": 0.00016025426422961592, + "loss": 0.9803, + "step": 6623 + }, + { + "epoch": 1.1793091168091168, + "grad_norm": 0.59112149477005, + "learning_rate": 0.0001602430924431469, + "loss": 0.8645, + "step": 6624 + }, + { + "epoch": 1.1794871794871795, + "grad_norm": 0.6200533509254456, + "learning_rate": 0.00016023191947633965, + "loss": 1.068, + "step": 6625 + }, + { + "epoch": 1.179665242165242, + "grad_norm": 0.6077516078948975, + "learning_rate": 0.00016022074532941305, + "loss": 1.0017, + "step": 6626 + }, + { + "epoch": 1.1798433048433048, + "grad_norm": 0.6770145893096924, + "learning_rate": 0.00016020957000258606, + "loss": 0.9022, + "step": 6627 + }, + { + "epoch": 1.1800213675213675, + "grad_norm": 0.6478054523468018, + "learning_rate": 0.0001601983934960776, + "loss": 0.8615, + "step": 6628 + }, + { + "epoch": 1.1801994301994303, + "grad_norm": 0.6528988480567932, + "learning_rate": 0.00016018721581010666, + "loss": 1.0015, + "step": 6629 + }, + { + "epoch": 1.1803774928774928, + "grad_norm": 0.6160712242126465, + "learning_rate": 0.0001601760369448923, + "loss": 0.9382, + "step": 6630 + }, + { + "epoch": 1.1805555555555556, + "grad_norm": 0.5755789875984192, + "learning_rate": 0.00016016485690065345, + "loss": 1.0551, + "step": 6631 + }, + { + "epoch": 1.180733618233618, + "grad_norm": 0.8495022654533386, + "learning_rate": 0.00016015367567760925, + "loss": 0.9295, + "step": 6632 + }, + { + "epoch": 1.1809116809116809, + "grad_norm": 0.6010929346084595, + "learning_rate": 0.0001601424932759787, + "loss": 1.0413, + "step": 6633 + }, + { + "epoch": 1.1810897435897436, + "grad_norm": 0.6953579187393188, + "learning_rate": 0.00016013130969598093, + "loss": 1.0149, + "step": 6634 + }, + { + "epoch": 1.1812678062678064, + "grad_norm": 0.6949529647827148, + "learning_rate": 0.0001601201249378351, + "loss": 0.9992, + "step": 6635 + }, + { + "epoch": 1.181445868945869, + "grad_norm": 
0.6471893787384033, + "learning_rate": 0.00016010893900176028, + "loss": 0.7985, + "step": 6636 + }, + { + "epoch": 1.1816239316239316, + "grad_norm": 0.6524858474731445, + "learning_rate": 0.00016009775188797568, + "loss": 0.9517, + "step": 6637 + }, + { + "epoch": 1.1818019943019944, + "grad_norm": 0.639214038848877, + "learning_rate": 0.00016008656359670046, + "loss": 1.0357, + "step": 6638 + }, + { + "epoch": 1.181980056980057, + "grad_norm": 0.6039628386497498, + "learning_rate": 0.00016007537412815386, + "loss": 1.0536, + "step": 6639 + }, + { + "epoch": 1.1821581196581197, + "grad_norm": 0.653540313243866, + "learning_rate": 0.00016006418348255507, + "loss": 0.9414, + "step": 6640 + }, + { + "epoch": 1.1823361823361824, + "grad_norm": 0.6331741809844971, + "learning_rate": 0.0001600529916601234, + "loss": 1.0352, + "step": 6641 + }, + { + "epoch": 1.182514245014245, + "grad_norm": 0.7552719712257385, + "learning_rate": 0.00016004179866107812, + "loss": 1.1103, + "step": 6642 + }, + { + "epoch": 1.1826923076923077, + "grad_norm": 0.6795875430107117, + "learning_rate": 0.00016003060448563852, + "loss": 1.1246, + "step": 6643 + }, + { + "epoch": 1.1828703703703705, + "grad_norm": 0.6308842301368713, + "learning_rate": 0.0001600194091340239, + "loss": 0.9532, + "step": 6644 + }, + { + "epoch": 1.183048433048433, + "grad_norm": 0.5640553832054138, + "learning_rate": 0.00016000821260645366, + "loss": 0.7491, + "step": 6645 + }, + { + "epoch": 1.1832264957264957, + "grad_norm": 0.5611832141876221, + "learning_rate": 0.00015999701490314712, + "loss": 0.9239, + "step": 6646 + }, + { + "epoch": 1.1834045584045585, + "grad_norm": 0.5881187915802002, + "learning_rate": 0.00015998581602432374, + "loss": 0.9246, + "step": 6647 + }, + { + "epoch": 1.183582621082621, + "grad_norm": 0.7291010022163391, + "learning_rate": 0.00015997461597020291, + "loss": 1.0314, + "step": 6648 + }, + { + "epoch": 1.1837606837606838, + "grad_norm": 0.6784794926643372, + "learning_rate": 
0.00015996341474100402, + "loss": 1.0011, + "step": 6649 + }, + { + "epoch": 1.1839387464387465, + "grad_norm": 0.7083746194839478, + "learning_rate": 0.00015995221233694663, + "loss": 1.0336, + "step": 6650 + }, + { + "epoch": 1.184116809116809, + "grad_norm": 0.7081790566444397, + "learning_rate": 0.00015994100875825015, + "loss": 1.2386, + "step": 6651 + }, + { + "epoch": 1.1842948717948718, + "grad_norm": 0.5938812494277954, + "learning_rate": 0.00015992980400513415, + "loss": 0.7549, + "step": 6652 + }, + { + "epoch": 1.1844729344729346, + "grad_norm": 0.7084267139434814, + "learning_rate": 0.00015991859807781811, + "loss": 1.1194, + "step": 6653 + }, + { + "epoch": 1.184650997150997, + "grad_norm": 0.6391362547874451, + "learning_rate": 0.0001599073909765216, + "loss": 1.0857, + "step": 6654 + }, + { + "epoch": 1.1848290598290598, + "grad_norm": 0.8074106574058533, + "learning_rate": 0.00015989618270146423, + "loss": 1.1715, + "step": 6655 + }, + { + "epoch": 1.1850071225071226, + "grad_norm": 0.5778565406799316, + "learning_rate": 0.0001598849732528656, + "loss": 0.8843, + "step": 6656 + }, + { + "epoch": 1.1851851851851851, + "grad_norm": 0.6955079436302185, + "learning_rate": 0.00015987376263094526, + "loss": 1.0281, + "step": 6657 + }, + { + "epoch": 1.1853632478632479, + "grad_norm": 0.6789296269416809, + "learning_rate": 0.00015986255083592297, + "loss": 0.9739, + "step": 6658 + }, + { + "epoch": 1.1855413105413106, + "grad_norm": 0.6294292211532593, + "learning_rate": 0.00015985133786801834, + "loss": 1.0692, + "step": 6659 + }, + { + "epoch": 1.1857193732193732, + "grad_norm": 0.5604581832885742, + "learning_rate": 0.00015984012372745107, + "loss": 0.9059, + "step": 6660 + }, + { + "epoch": 1.185897435897436, + "grad_norm": 0.6727550625801086, + "learning_rate": 0.00015982890841444088, + "loss": 1.049, + "step": 6661 + }, + { + "epoch": 1.1860754985754987, + "grad_norm": 0.620914101600647, + "learning_rate": 0.0001598176919292075, + "loss": 1.1021, + 
"step": 6662 + }, + { + "epoch": 1.1862535612535612, + "grad_norm": 0.6696683168411255, + "learning_rate": 0.00015980647427197076, + "loss": 0.9053, + "step": 6663 + }, + { + "epoch": 1.186431623931624, + "grad_norm": 0.6713385581970215, + "learning_rate": 0.00015979525544295036, + "loss": 0.9596, + "step": 6664 + }, + { + "epoch": 1.1866096866096867, + "grad_norm": 0.7643477320671082, + "learning_rate": 0.00015978403544236614, + "loss": 0.882, + "step": 6665 + }, + { + "epoch": 1.1867877492877492, + "grad_norm": 0.5890966057777405, + "learning_rate": 0.00015977281427043794, + "loss": 1.0215, + "step": 6666 + }, + { + "epoch": 1.186965811965812, + "grad_norm": 0.7287502288818359, + "learning_rate": 0.0001597615919273856, + "loss": 1.0111, + "step": 6667 + }, + { + "epoch": 1.1871438746438747, + "grad_norm": 0.5713803172111511, + "learning_rate": 0.00015975036841342903, + "loss": 1.0068, + "step": 6668 + }, + { + "epoch": 1.1873219373219372, + "grad_norm": 0.5113094449043274, + "learning_rate": 0.0001597391437287881, + "loss": 0.9018, + "step": 6669 + }, + { + "epoch": 1.1875, + "grad_norm": 0.585640013217926, + "learning_rate": 0.00015972791787368276, + "loss": 1.0375, + "step": 6670 + }, + { + "epoch": 1.1876780626780628, + "grad_norm": 0.5778326392173767, + "learning_rate": 0.00015971669084833293, + "loss": 0.9975, + "step": 6671 + }, + { + "epoch": 1.1878561253561253, + "grad_norm": 0.6707763075828552, + "learning_rate": 0.0001597054626529586, + "loss": 1.0048, + "step": 6672 + }, + { + "epoch": 1.188034188034188, + "grad_norm": 0.6113292574882507, + "learning_rate": 0.00015969423328777974, + "loss": 1.1447, + "step": 6673 + }, + { + "epoch": 1.1882122507122508, + "grad_norm": 0.6075651049613953, + "learning_rate": 0.00015968300275301638, + "loss": 0.9212, + "step": 6674 + }, + { + "epoch": 1.1883903133903133, + "grad_norm": 0.6990494132041931, + "learning_rate": 0.00015967177104888857, + "loss": 0.9952, + "step": 6675 + }, + { + "epoch": 1.188568376068376, + 
"grad_norm": 0.6228706240653992, + "learning_rate": 0.00015966053817561638, + "loss": 1.0187, + "step": 6676 + }, + { + "epoch": 1.1887464387464388, + "grad_norm": 0.6387844085693359, + "learning_rate": 0.00015964930413341985, + "loss": 1.1614, + "step": 6677 + }, + { + "epoch": 1.1889245014245013, + "grad_norm": 0.6501925587654114, + "learning_rate": 0.00015963806892251915, + "loss": 1.0366, + "step": 6678 + }, + { + "epoch": 1.189102564102564, + "grad_norm": 0.6923739910125732, + "learning_rate": 0.00015962683254313435, + "loss": 1.1992, + "step": 6679 + }, + { + "epoch": 1.1892806267806268, + "grad_norm": 0.6640275120735168, + "learning_rate": 0.00015961559499548563, + "loss": 0.8883, + "step": 6680 + }, + { + "epoch": 1.1894586894586894, + "grad_norm": 0.6493857502937317, + "learning_rate": 0.00015960435627979317, + "loss": 1.1368, + "step": 6681 + }, + { + "epoch": 1.1896367521367521, + "grad_norm": 0.6357189416885376, + "learning_rate": 0.0001595931163962772, + "loss": 1.0502, + "step": 6682 + }, + { + "epoch": 1.1898148148148149, + "grad_norm": 0.5756343007087708, + "learning_rate": 0.0001595818753451579, + "loss": 0.9871, + "step": 6683 + }, + { + "epoch": 1.1899928774928774, + "grad_norm": 0.7369210124015808, + "learning_rate": 0.0001595706331266555, + "loss": 1.3229, + "step": 6684 + }, + { + "epoch": 1.1901709401709402, + "grad_norm": 0.7140820622444153, + "learning_rate": 0.0001595593897409903, + "loss": 1.1154, + "step": 6685 + }, + { + "epoch": 1.190349002849003, + "grad_norm": 0.696973443031311, + "learning_rate": 0.00015954814518838255, + "loss": 0.9806, + "step": 6686 + }, + { + "epoch": 1.1905270655270654, + "grad_norm": 0.5299260020256042, + "learning_rate": 0.00015953689946905262, + "loss": 0.771, + "step": 6687 + }, + { + "epoch": 1.1907051282051282, + "grad_norm": 0.6814879775047302, + "learning_rate": 0.00015952565258322085, + "loss": 0.8444, + "step": 6688 + }, + { + "epoch": 1.190883190883191, + "grad_norm": 0.6215870976448059, + 
"learning_rate": 0.00015951440453110754, + "loss": 1.0743, + "step": 6689 + }, + { + "epoch": 1.1910612535612535, + "grad_norm": 0.7017203569412231, + "learning_rate": 0.00015950315531293308, + "loss": 1.185, + "step": 6690 + }, + { + "epoch": 1.1912393162393162, + "grad_norm": 0.7147250175476074, + "learning_rate": 0.00015949190492891795, + "loss": 1.0646, + "step": 6691 + }, + { + "epoch": 1.191417378917379, + "grad_norm": 0.5867117047309875, + "learning_rate": 0.00015948065337928252, + "loss": 1.0554, + "step": 6692 + }, + { + "epoch": 1.1915954415954415, + "grad_norm": 0.6813527345657349, + "learning_rate": 0.0001594694006642472, + "loss": 1.1451, + "step": 6693 + }, + { + "epoch": 1.1917735042735043, + "grad_norm": 0.5192593932151794, + "learning_rate": 0.00015945814678403256, + "loss": 0.7886, + "step": 6694 + }, + { + "epoch": 1.191951566951567, + "grad_norm": 0.6537744402885437, + "learning_rate": 0.00015944689173885904, + "loss": 0.9905, + "step": 6695 + }, + { + "epoch": 1.1921296296296295, + "grad_norm": 0.7350276112556458, + "learning_rate": 0.00015943563552894716, + "loss": 0.9009, + "step": 6696 + }, + { + "epoch": 1.1923076923076923, + "grad_norm": 0.7086381316184998, + "learning_rate": 0.00015942437815451746, + "loss": 0.9117, + "step": 6697 + }, + { + "epoch": 1.192485754985755, + "grad_norm": 0.6774969696998596, + "learning_rate": 0.00015941311961579054, + "loss": 1.1172, + "step": 6698 + }, + { + "epoch": 1.1926638176638176, + "grad_norm": 0.7034362554550171, + "learning_rate": 0.00015940185991298694, + "loss": 0.8054, + "step": 6699 + }, + { + "epoch": 1.1928418803418803, + "grad_norm": 0.66145920753479, + "learning_rate": 0.00015939059904632728, + "loss": 0.7417, + "step": 6700 + }, + { + "epoch": 1.193019943019943, + "grad_norm": 0.6590890884399414, + "learning_rate": 0.00015937933701603223, + "loss": 0.9169, + "step": 6701 + }, + { + "epoch": 1.1931980056980056, + "grad_norm": 0.7492850422859192, + "learning_rate": 0.0001593680738223224, + 
"loss": 1.0529, + "step": 6702 + }, + { + "epoch": 1.1933760683760684, + "grad_norm": 0.7103236317634583, + "learning_rate": 0.00015935680946541848, + "loss": 1.1377, + "step": 6703 + }, + { + "epoch": 1.193554131054131, + "grad_norm": 0.6164175868034363, + "learning_rate": 0.00015934554394554122, + "loss": 0.8636, + "step": 6704 + }, + { + "epoch": 1.1937321937321936, + "grad_norm": 0.6667410135269165, + "learning_rate": 0.0001593342772629113, + "loss": 1.0073, + "step": 6705 + }, + { + "epoch": 1.1939102564102564, + "grad_norm": 0.6785695552825928, + "learning_rate": 0.00015932300941774944, + "loss": 1.0752, + "step": 6706 + }, + { + "epoch": 1.1940883190883191, + "grad_norm": 0.6446872353553772, + "learning_rate": 0.0001593117404102765, + "loss": 0.9509, + "step": 6707 + }, + { + "epoch": 1.194266381766382, + "grad_norm": 0.6607686877250671, + "learning_rate": 0.00015930047024071317, + "loss": 1.0902, + "step": 6708 + }, + { + "epoch": 1.1944444444444444, + "grad_norm": 0.664804995059967, + "learning_rate": 0.0001592891989092803, + "loss": 0.9783, + "step": 6709 + }, + { + "epoch": 1.1946225071225072, + "grad_norm": 0.7147907018661499, + "learning_rate": 0.00015927792641619876, + "loss": 1.0558, + "step": 6710 + }, + { + "epoch": 1.1948005698005697, + "grad_norm": 0.6858944296836853, + "learning_rate": 0.0001592666527616894, + "loss": 1.0514, + "step": 6711 + }, + { + "epoch": 1.1949786324786325, + "grad_norm": 0.598463773727417, + "learning_rate": 0.0001592553779459731, + "loss": 0.8927, + "step": 6712 + }, + { + "epoch": 1.1951566951566952, + "grad_norm": 0.6872668862342834, + "learning_rate": 0.00015924410196927076, + "loss": 1.016, + "step": 6713 + }, + { + "epoch": 1.195334757834758, + "grad_norm": 0.6547996401786804, + "learning_rate": 0.00015923282483180326, + "loss": 1.1573, + "step": 6714 + }, + { + "epoch": 1.1955128205128205, + "grad_norm": 0.6254705786705017, + "learning_rate": 0.00015922154653379167, + "loss": 1.0179, + "step": 6715 + }, + { + 
"epoch": 1.1956908831908832, + "grad_norm": 0.6049207448959351, + "learning_rate": 0.00015921026707545684, + "loss": 1.0713, + "step": 6716 + }, + { + "epoch": 1.1958689458689458, + "grad_norm": 0.6042858958244324, + "learning_rate": 0.0001591989864570199, + "loss": 0.919, + "step": 6717 + }, + { + "epoch": 1.1960470085470085, + "grad_norm": 0.6521187424659729, + "learning_rate": 0.0001591877046787017, + "loss": 1.0112, + "step": 6718 + }, + { + "epoch": 1.1962250712250713, + "grad_norm": 0.766260027885437, + "learning_rate": 0.00015917642174072348, + "loss": 0.9774, + "step": 6719 + }, + { + "epoch": 1.196403133903134, + "grad_norm": 0.7066532373428345, + "learning_rate": 0.00015916513764330613, + "loss": 1.1112, + "step": 6720 + }, + { + "epoch": 1.1965811965811965, + "grad_norm": 0.7351508140563965, + "learning_rate": 0.00015915385238667083, + "loss": 0.9841, + "step": 6721 + }, + { + "epoch": 1.1967592592592593, + "grad_norm": 0.6133812069892883, + "learning_rate": 0.0001591425659710387, + "loss": 0.8629, + "step": 6722 + }, + { + "epoch": 1.1969373219373218, + "grad_norm": 0.7244157791137695, + "learning_rate": 0.00015913127839663083, + "loss": 1.1584, + "step": 6723 + }, + { + "epoch": 1.1971153846153846, + "grad_norm": 0.5986210107803345, + "learning_rate": 0.00015911998966366842, + "loss": 0.8507, + "step": 6724 + }, + { + "epoch": 1.1972934472934473, + "grad_norm": 0.6087439060211182, + "learning_rate": 0.00015910869977237257, + "loss": 0.884, + "step": 6725 + }, + { + "epoch": 1.19747150997151, + "grad_norm": 0.7546007633209229, + "learning_rate": 0.00015909740872296457, + "loss": 1.1449, + "step": 6726 + }, + { + "epoch": 1.1976495726495726, + "grad_norm": 0.6437731385231018, + "learning_rate": 0.0001590861165156656, + "loss": 0.7845, + "step": 6727 + }, + { + "epoch": 1.1978276353276354, + "grad_norm": 0.6281737089157104, + "learning_rate": 0.00015907482315069693, + "loss": 0.8969, + "step": 6728 + }, + { + "epoch": 1.198005698005698, + "grad_norm": 
0.6196113228797913, + "learning_rate": 0.00015906352862827983, + "loss": 1.0264, + "step": 6729 + }, + { + "epoch": 1.1981837606837606, + "grad_norm": 0.5990965962409973, + "learning_rate": 0.00015905223294863553, + "loss": 1.0017, + "step": 6730 + }, + { + "epoch": 1.1983618233618234, + "grad_norm": 0.6509191393852234, + "learning_rate": 0.00015904093611198542, + "loss": 1.1066, + "step": 6731 + }, + { + "epoch": 1.1985398860398861, + "grad_norm": 0.6648043990135193, + "learning_rate": 0.00015902963811855085, + "loss": 1.077, + "step": 6732 + }, + { + "epoch": 1.1987179487179487, + "grad_norm": 0.7071963548660278, + "learning_rate": 0.00015901833896855307, + "loss": 1.1346, + "step": 6733 + }, + { + "epoch": 1.1988960113960114, + "grad_norm": 0.5889959335327148, + "learning_rate": 0.0001590070386622136, + "loss": 0.9525, + "step": 6734 + }, + { + "epoch": 1.199074074074074, + "grad_norm": 0.6233037710189819, + "learning_rate": 0.00015899573719975376, + "loss": 1.0513, + "step": 6735 + }, + { + "epoch": 1.1992521367521367, + "grad_norm": 0.7912302613258362, + "learning_rate": 0.000158984434581395, + "loss": 0.8749, + "step": 6736 + }, + { + "epoch": 1.1994301994301995, + "grad_norm": 0.5783160924911499, + "learning_rate": 0.0001589731308073588, + "loss": 0.7173, + "step": 6737 + }, + { + "epoch": 1.1996082621082622, + "grad_norm": 0.718950092792511, + "learning_rate": 0.00015896182587786658, + "loss": 1.0815, + "step": 6738 + }, + { + "epoch": 1.1997863247863247, + "grad_norm": 0.6700926423072815, + "learning_rate": 0.0001589505197931399, + "loss": 1.0817, + "step": 6739 + }, + { + "epoch": 1.1999643874643875, + "grad_norm": 0.7614455223083496, + "learning_rate": 0.0001589392125534002, + "loss": 0.9707, + "step": 6740 + }, + { + "epoch": 1.20014245014245, + "grad_norm": 0.6998619437217712, + "learning_rate": 0.00015892790415886906, + "loss": 1.0541, + "step": 6741 + }, + { + "epoch": 1.2003205128205128, + "grad_norm": 0.6127668619155884, + "learning_rate": 
0.0001589165946097681, + "loss": 0.9147, + "step": 6742 + }, + { + "epoch": 1.2004985754985755, + "grad_norm": 0.7112005352973938, + "learning_rate": 0.00015890528390631885, + "loss": 0.868, + "step": 6743 + }, + { + "epoch": 1.2006766381766383, + "grad_norm": 0.6631024479866028, + "learning_rate": 0.0001588939720487429, + "loss": 0.9277, + "step": 6744 + }, + { + "epoch": 1.2008547008547008, + "grad_norm": 0.6106321215629578, + "learning_rate": 0.00015888265903726188, + "loss": 1.0223, + "step": 6745 + }, + { + "epoch": 1.2010327635327636, + "grad_norm": 0.6400851607322693, + "learning_rate": 0.00015887134487209753, + "loss": 1.1279, + "step": 6746 + }, + { + "epoch": 1.201210826210826, + "grad_norm": 0.6298650503158569, + "learning_rate": 0.00015886002955347147, + "loss": 0.9481, + "step": 6747 + }, + { + "epoch": 1.2013888888888888, + "grad_norm": 0.647974967956543, + "learning_rate": 0.00015884871308160538, + "loss": 1.1513, + "step": 6748 + }, + { + "epoch": 1.2015669515669516, + "grad_norm": 0.6770651936531067, + "learning_rate": 0.000158837395456721, + "loss": 0.9914, + "step": 6749 + }, + { + "epoch": 1.2017450142450143, + "grad_norm": 0.6708947420120239, + "learning_rate": 0.0001588260766790401, + "loss": 1.1848, + "step": 6750 + }, + { + "epoch": 1.2019230769230769, + "grad_norm": 0.5624440908432007, + "learning_rate": 0.00015881475674878442, + "loss": 0.9848, + "step": 6751 + }, + { + "epoch": 1.2021011396011396, + "grad_norm": 0.5512633919715881, + "learning_rate": 0.00015880343566617575, + "loss": 1.0308, + "step": 6752 + }, + { + "epoch": 1.2022792022792024, + "grad_norm": 0.5621042251586914, + "learning_rate": 0.0001587921134314359, + "loss": 0.8724, + "step": 6753 + }, + { + "epoch": 1.202457264957265, + "grad_norm": 0.6881251931190491, + "learning_rate": 0.00015878079004478675, + "loss": 0.9771, + "step": 6754 + }, + { + "epoch": 1.2026353276353277, + "grad_norm": 0.729998767375946, + "learning_rate": 0.0001587694655064501, + "loss": 1.002, + 
"step": 6755 + }, + { + "epoch": 1.2028133903133904, + "grad_norm": 0.5972567200660706, + "learning_rate": 0.00015875813981664787, + "loss": 1.0571, + "step": 6756 + }, + { + "epoch": 1.202991452991453, + "grad_norm": 0.6319229006767273, + "learning_rate": 0.00015874681297560196, + "loss": 0.9294, + "step": 6757 + }, + { + "epoch": 1.2031695156695157, + "grad_norm": 0.6751521825790405, + "learning_rate": 0.00015873548498353428, + "loss": 0.783, + "step": 6758 + }, + { + "epoch": 1.2033475783475784, + "grad_norm": 0.6476554870605469, + "learning_rate": 0.00015872415584066677, + "loss": 0.8939, + "step": 6759 + }, + { + "epoch": 1.203525641025641, + "grad_norm": 0.6530960202217102, + "learning_rate": 0.0001587128255472214, + "loss": 0.9828, + "step": 6760 + }, + { + "epoch": 1.2037037037037037, + "grad_norm": 0.6708502173423767, + "learning_rate": 0.00015870149410342023, + "loss": 0.9285, + "step": 6761 + }, + { + "epoch": 1.2038817663817665, + "grad_norm": 0.7749543190002441, + "learning_rate": 0.0001586901615094852, + "loss": 1.1295, + "step": 6762 + }, + { + "epoch": 1.204059829059829, + "grad_norm": 0.6750495433807373, + "learning_rate": 0.00015867882776563836, + "loss": 1.0562, + "step": 6763 + }, + { + "epoch": 1.2042378917378918, + "grad_norm": 0.6892416477203369, + "learning_rate": 0.00015866749287210178, + "loss": 0.7207, + "step": 6764 + }, + { + "epoch": 1.2044159544159545, + "grad_norm": 0.7066485285758972, + "learning_rate": 0.00015865615682909758, + "loss": 1.0489, + "step": 6765 + }, + { + "epoch": 1.204594017094017, + "grad_norm": 0.5669938325881958, + "learning_rate": 0.00015864481963684783, + "loss": 0.8149, + "step": 6766 + }, + { + "epoch": 1.2047720797720798, + "grad_norm": 0.6467341780662537, + "learning_rate": 0.0001586334812955746, + "loss": 0.9595, + "step": 6767 + }, + { + "epoch": 1.2049501424501425, + "grad_norm": 0.6026045680046082, + "learning_rate": 0.0001586221418055002, + "loss": 0.9832, + "step": 6768 + }, + { + "epoch": 
1.205128205128205, + "grad_norm": 0.7655174732208252, + "learning_rate": 0.00015861080116684665, + "loss": 0.9796, + "step": 6769 + }, + { + "epoch": 1.2053062678062678, + "grad_norm": 0.6386621594429016, + "learning_rate": 0.00015859945937983624, + "loss": 0.9368, + "step": 6770 + }, + { + "epoch": 1.2054843304843306, + "grad_norm": 0.7088032364845276, + "learning_rate": 0.0001585881164446911, + "loss": 1.0167, + "step": 6771 + }, + { + "epoch": 1.205662393162393, + "grad_norm": 0.6015275716781616, + "learning_rate": 0.0001585767723616336, + "loss": 0.8551, + "step": 6772 + }, + { + "epoch": 1.2058404558404558, + "grad_norm": 0.7013260722160339, + "learning_rate": 0.00015856542713088583, + "loss": 0.8009, + "step": 6773 + }, + { + "epoch": 1.2060185185185186, + "grad_norm": 0.6931240558624268, + "learning_rate": 0.00015855408075267024, + "loss": 0.9964, + "step": 6774 + }, + { + "epoch": 1.2061965811965811, + "grad_norm": 0.7274388670921326, + "learning_rate": 0.00015854273322720908, + "loss": 1.0991, + "step": 6775 + }, + { + "epoch": 1.2063746438746439, + "grad_norm": 0.6353716254234314, + "learning_rate": 0.00015853138455472466, + "loss": 1.0893, + "step": 6776 + }, + { + "epoch": 1.2065527065527066, + "grad_norm": 0.6958979368209839, + "learning_rate": 0.00015852003473543932, + "loss": 1.0238, + "step": 6777 + }, + { + "epoch": 1.2067307692307692, + "grad_norm": 0.626838743686676, + "learning_rate": 0.00015850868376957551, + "loss": 0.9384, + "step": 6778 + }, + { + "epoch": 1.206908831908832, + "grad_norm": 0.5455024242401123, + "learning_rate": 0.00015849733165735556, + "loss": 0.8068, + "step": 6779 + }, + { + "epoch": 1.2070868945868947, + "grad_norm": 0.6337353587150574, + "learning_rate": 0.0001584859783990019, + "loss": 1.1341, + "step": 6780 + }, + { + "epoch": 1.2072649572649572, + "grad_norm": 0.6318019032478333, + "learning_rate": 0.000158474623994737, + "loss": 1.1095, + "step": 6781 + }, + { + "epoch": 1.20744301994302, + "grad_norm": 
0.8183810710906982, + "learning_rate": 0.00015846326844478332, + "loss": 1.1471, + "step": 6782 + }, + { + "epoch": 1.2076210826210827, + "grad_norm": 0.6140483021736145, + "learning_rate": 0.00015845191174936334, + "loss": 0.8538, + "step": 6783 + }, + { + "epoch": 1.2077991452991452, + "grad_norm": 0.7570197582244873, + "learning_rate": 0.0001584405539086996, + "loss": 1.427, + "step": 6784 + }, + { + "epoch": 1.207977207977208, + "grad_norm": 0.7616991996765137, + "learning_rate": 0.00015842919492301455, + "loss": 1.2214, + "step": 6785 + }, + { + "epoch": 1.2081552706552707, + "grad_norm": 0.561996579170227, + "learning_rate": 0.00015841783479253084, + "loss": 0.8916, + "step": 6786 + }, + { + "epoch": 1.2083333333333333, + "grad_norm": 0.6124222874641418, + "learning_rate": 0.000158406473517471, + "loss": 0.9637, + "step": 6787 + }, + { + "epoch": 1.208511396011396, + "grad_norm": 0.6053098440170288, + "learning_rate": 0.00015839511109805762, + "loss": 1.0365, + "step": 6788 + }, + { + "epoch": 1.2086894586894588, + "grad_norm": 0.6451675295829773, + "learning_rate": 0.00015838374753451338, + "loss": 1.0497, + "step": 6789 + }, + { + "epoch": 1.2088675213675213, + "grad_norm": 0.6789399981498718, + "learning_rate": 0.00015837238282706087, + "loss": 0.9286, + "step": 6790 + }, + { + "epoch": 1.209045584045584, + "grad_norm": 0.5742998123168945, + "learning_rate": 0.0001583610169759228, + "loss": 1.082, + "step": 6791 + }, + { + "epoch": 1.2092236467236468, + "grad_norm": 0.6813693642616272, + "learning_rate": 0.0001583496499813218, + "loss": 0.9785, + "step": 6792 + }, + { + "epoch": 1.2094017094017093, + "grad_norm": 0.6150603890419006, + "learning_rate": 0.0001583382818434806, + "loss": 0.9533, + "step": 6793 + }, + { + "epoch": 1.209579772079772, + "grad_norm": 0.6905919909477234, + "learning_rate": 0.000158326912562622, + "loss": 1.0132, + "step": 6794 + }, + { + "epoch": 1.2097578347578348, + "grad_norm": 0.5861411094665527, + "learning_rate": 
0.0001583155421389687, + "loss": 0.7071, + "step": 6795 + }, + { + "epoch": 1.2099358974358974, + "grad_norm": 0.6822740435600281, + "learning_rate": 0.0001583041705727435, + "loss": 1.1366, + "step": 6796 + }, + { + "epoch": 1.21011396011396, + "grad_norm": 0.6013675928115845, + "learning_rate": 0.00015829279786416916, + "loss": 0.9232, + "step": 6797 + }, + { + "epoch": 1.2102920227920229, + "grad_norm": 0.650675356388092, + "learning_rate": 0.00015828142401346857, + "loss": 0.887, + "step": 6798 + }, + { + "epoch": 1.2104700854700854, + "grad_norm": 0.6764078736305237, + "learning_rate": 0.00015827004902086456, + "loss": 0.8423, + "step": 6799 + }, + { + "epoch": 1.2106481481481481, + "grad_norm": 0.6460821628570557, + "learning_rate": 0.00015825867288657994, + "loss": 1.0074, + "step": 6800 + }, + { + "epoch": 1.210826210826211, + "grad_norm": 0.692562997341156, + "learning_rate": 0.00015824729561083768, + "loss": 0.7978, + "step": 6801 + }, + { + "epoch": 1.2110042735042734, + "grad_norm": 0.7255034446716309, + "learning_rate": 0.00015823591719386066, + "loss": 1.071, + "step": 6802 + }, + { + "epoch": 1.2111823361823362, + "grad_norm": 0.6598904728889465, + "learning_rate": 0.0001582245376358718, + "loss": 0.9736, + "step": 6803 + }, + { + "epoch": 1.211360398860399, + "grad_norm": 0.6372483968734741, + "learning_rate": 0.0001582131569370941, + "loss": 0.9029, + "step": 6804 + }, + { + "epoch": 1.2115384615384615, + "grad_norm": 0.5907173156738281, + "learning_rate": 0.00015820177509775048, + "loss": 0.918, + "step": 6805 + }, + { + "epoch": 1.2117165242165242, + "grad_norm": 0.6252630949020386, + "learning_rate": 0.00015819039211806404, + "loss": 0.7801, + "step": 6806 + }, + { + "epoch": 1.211894586894587, + "grad_norm": 0.5793096423149109, + "learning_rate": 0.0001581790079982577, + "loss": 0.5769, + "step": 6807 + }, + { + "epoch": 1.2120726495726495, + "grad_norm": 0.7267270684242249, + "learning_rate": 0.00015816762273855454, + "loss": 1.1428, + "step": 
6808 + }, + { + "epoch": 1.2122507122507122, + "grad_norm": 0.7481234073638916, + "learning_rate": 0.00015815623633917767, + "loss": 1.0209, + "step": 6809 + }, + { + "epoch": 1.212428774928775, + "grad_norm": 0.6114386916160583, + "learning_rate": 0.00015814484880035017, + "loss": 0.9073, + "step": 6810 + }, + { + "epoch": 1.2126068376068375, + "grad_norm": 0.6871182322502136, + "learning_rate": 0.00015813346012229516, + "loss": 1.151, + "step": 6811 + }, + { + "epoch": 1.2127849002849003, + "grad_norm": 0.6380293965339661, + "learning_rate": 0.0001581220703052357, + "loss": 1.0981, + "step": 6812 + }, + { + "epoch": 1.212962962962963, + "grad_norm": 0.6013718247413635, + "learning_rate": 0.00015811067934939503, + "loss": 0.8832, + "step": 6813 + }, + { + "epoch": 1.2131410256410255, + "grad_norm": 0.5816897749900818, + "learning_rate": 0.00015809928725499632, + "loss": 1.063, + "step": 6814 + }, + { + "epoch": 1.2133190883190883, + "grad_norm": 0.5970914363861084, + "learning_rate": 0.00015808789402226278, + "loss": 1.1177, + "step": 6815 + }, + { + "epoch": 1.213497150997151, + "grad_norm": 0.7624936103820801, + "learning_rate": 0.00015807649965141762, + "loss": 1.048, + "step": 6816 + }, + { + "epoch": 1.2136752136752136, + "grad_norm": 0.636263906955719, + "learning_rate": 0.0001580651041426841, + "loss": 0.9743, + "step": 6817 + }, + { + "epoch": 1.2138532763532763, + "grad_norm": 0.641090452671051, + "learning_rate": 0.00015805370749628547, + "loss": 1.0227, + "step": 6818 + }, + { + "epoch": 1.214031339031339, + "grad_norm": 0.6484021544456482, + "learning_rate": 0.00015804230971244504, + "loss": 0.9615, + "step": 6819 + }, + { + "epoch": 1.2142094017094016, + "grad_norm": 0.6473353505134583, + "learning_rate": 0.00015803091079138613, + "loss": 1.0507, + "step": 6820 + }, + { + "epoch": 1.2143874643874644, + "grad_norm": 0.5477129220962524, + "learning_rate": 0.00015801951073333206, + "loss": 0.7928, + "step": 6821 + }, + { + "epoch": 1.2145655270655271, + 
"grad_norm": 0.7256210446357727, + "learning_rate": 0.0001580081095385062, + "loss": 1.0172, + "step": 6822 + }, + { + "epoch": 1.2147435897435896, + "grad_norm": 0.5785418748855591, + "learning_rate": 0.00015799670720713195, + "loss": 0.8478, + "step": 6823 + }, + { + "epoch": 1.2149216524216524, + "grad_norm": 0.6782996654510498, + "learning_rate": 0.00015798530373943267, + "loss": 1.1819, + "step": 6824 + }, + { + "epoch": 1.2150997150997151, + "grad_norm": 0.6513699293136597, + "learning_rate": 0.00015797389913563186, + "loss": 0.9626, + "step": 6825 + }, + { + "epoch": 1.2152777777777777, + "grad_norm": 0.6503037214279175, + "learning_rate": 0.0001579624933959529, + "loss": 1.0282, + "step": 6826 + }, + { + "epoch": 1.2154558404558404, + "grad_norm": 0.581501841545105, + "learning_rate": 0.0001579510865206193, + "loss": 0.8976, + "step": 6827 + }, + { + "epoch": 1.2156339031339032, + "grad_norm": 0.6696721911430359, + "learning_rate": 0.00015793967850985454, + "loss": 0.6418, + "step": 6828 + }, + { + "epoch": 1.215811965811966, + "grad_norm": 0.6577274203300476, + "learning_rate": 0.00015792826936388213, + "loss": 1.0615, + "step": 6829 + }, + { + "epoch": 1.2159900284900285, + "grad_norm": 0.66291743516922, + "learning_rate": 0.00015791685908292564, + "loss": 0.8582, + "step": 6830 + }, + { + "epoch": 1.2161680911680912, + "grad_norm": 0.6548362374305725, + "learning_rate": 0.0001579054476672086, + "loss": 1.0343, + "step": 6831 + }, + { + "epoch": 1.2163461538461537, + "grad_norm": 0.6381218433380127, + "learning_rate": 0.00015789403511695457, + "loss": 0.8133, + "step": 6832 + }, + { + "epoch": 1.2165242165242165, + "grad_norm": 0.7217492461204529, + "learning_rate": 0.00015788262143238722, + "loss": 0.9183, + "step": 6833 + }, + { + "epoch": 1.2167022792022792, + "grad_norm": 0.610454797744751, + "learning_rate": 0.00015787120661373013, + "loss": 0.8488, + "step": 6834 + }, + { + "epoch": 1.216880341880342, + "grad_norm": 0.592771053314209, + 
"learning_rate": 0.00015785979066120696, + "loss": 0.8673, + "step": 6835 + }, + { + "epoch": 1.2170584045584045, + "grad_norm": 0.5787834525108337, + "learning_rate": 0.00015784837357504138, + "loss": 0.7945, + "step": 6836 + }, + { + "epoch": 1.2172364672364673, + "grad_norm": 0.6814196109771729, + "learning_rate": 0.0001578369553554571, + "loss": 0.8906, + "step": 6837 + }, + { + "epoch": 1.2174145299145298, + "grad_norm": 0.6383981108665466, + "learning_rate": 0.00015782553600267787, + "loss": 0.8962, + "step": 6838 + }, + { + "epoch": 1.2175925925925926, + "grad_norm": 0.6733864545822144, + "learning_rate": 0.0001578141155169273, + "loss": 1.2077, + "step": 6839 + }, + { + "epoch": 1.2177706552706553, + "grad_norm": 0.5891284346580505, + "learning_rate": 0.0001578026938984293, + "loss": 0.9477, + "step": 6840 + }, + { + "epoch": 1.217948717948718, + "grad_norm": 0.7220266461372375, + "learning_rate": 0.00015779127114740757, + "loss": 1.0343, + "step": 6841 + }, + { + "epoch": 1.2181267806267806, + "grad_norm": 0.6566546559333801, + "learning_rate": 0.0001577798472640859, + "loss": 0.9576, + "step": 6842 + }, + { + "epoch": 1.2183048433048433, + "grad_norm": 0.6428449153900146, + "learning_rate": 0.0001577684222486882, + "loss": 0.8957, + "step": 6843 + }, + { + "epoch": 1.2184829059829059, + "grad_norm": 0.6542909741401672, + "learning_rate": 0.00015775699610143823, + "loss": 0.9942, + "step": 6844 + }, + { + "epoch": 1.2186609686609686, + "grad_norm": 0.7101675868034363, + "learning_rate": 0.00015774556882255992, + "loss": 1.015, + "step": 6845 + }, + { + "epoch": 1.2188390313390314, + "grad_norm": 0.6606267094612122, + "learning_rate": 0.00015773414041227713, + "loss": 1.1406, + "step": 6846 + }, + { + "epoch": 1.2190170940170941, + "grad_norm": 0.67124342918396, + "learning_rate": 0.00015772271087081383, + "loss": 1.2392, + "step": 6847 + }, + { + "epoch": 1.2191951566951567, + "grad_norm": 0.6615056991577148, + "learning_rate": 0.0001577112801983939, + 
"loss": 1.1583, + "step": 6848 + }, + { + "epoch": 1.2193732193732194, + "grad_norm": 0.6941317319869995, + "learning_rate": 0.0001576998483952413, + "loss": 1.0255, + "step": 6849 + }, + { + "epoch": 1.219551282051282, + "grad_norm": 0.5740683674812317, + "learning_rate": 0.00015768841546158005, + "loss": 1.0393, + "step": 6850 + }, + { + "epoch": 1.2197293447293447, + "grad_norm": 0.7143667340278625, + "learning_rate": 0.00015767698139763415, + "loss": 0.7564, + "step": 6851 + }, + { + "epoch": 1.2199074074074074, + "grad_norm": 0.6730484366416931, + "learning_rate": 0.00015766554620362758, + "loss": 1.2221, + "step": 6852 + }, + { + "epoch": 1.2200854700854702, + "grad_norm": 0.6883087754249573, + "learning_rate": 0.00015765410987978444, + "loss": 1.0156, + "step": 6853 + }, + { + "epoch": 1.2202635327635327, + "grad_norm": 0.6585961580276489, + "learning_rate": 0.00015764267242632875, + "loss": 1.0888, + "step": 6854 + }, + { + "epoch": 1.2204415954415955, + "grad_norm": 0.6325246691703796, + "learning_rate": 0.00015763123384348465, + "loss": 0.973, + "step": 6855 + }, + { + "epoch": 1.220619658119658, + "grad_norm": 0.5930588245391846, + "learning_rate": 0.00015761979413147627, + "loss": 0.8551, + "step": 6856 + }, + { + "epoch": 1.2207977207977208, + "grad_norm": 0.6440611481666565, + "learning_rate": 0.0001576083532905277, + "loss": 0.8396, + "step": 6857 + }, + { + "epoch": 1.2209757834757835, + "grad_norm": 0.6796659231185913, + "learning_rate": 0.00015759691132086315, + "loss": 1.0662, + "step": 6858 + }, + { + "epoch": 1.2211538461538463, + "grad_norm": 0.6813400983810425, + "learning_rate": 0.00015758546822270674, + "loss": 1.0457, + "step": 6859 + }, + { + "epoch": 1.2213319088319088, + "grad_norm": 0.6871716976165771, + "learning_rate": 0.00015757402399628272, + "loss": 1.1675, + "step": 6860 + }, + { + "epoch": 1.2215099715099715, + "grad_norm": 0.6431481838226318, + "learning_rate": 0.00015756257864181524, + "loss": 0.9366, + "step": 6861 + }, + { + 
"epoch": 1.221688034188034, + "grad_norm": 0.6061800718307495, + "learning_rate": 0.00015755113215952868, + "loss": 0.9267, + "step": 6862 + }, + { + "epoch": 1.2218660968660968, + "grad_norm": 0.5755770206451416, + "learning_rate": 0.00015753968454964722, + "loss": 0.7342, + "step": 6863 + }, + { + "epoch": 1.2220441595441596, + "grad_norm": 0.571345329284668, + "learning_rate": 0.00015752823581239515, + "loss": 0.8943, + "step": 6864 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 0.6925615668296814, + "learning_rate": 0.0001575167859479968, + "loss": 0.8801, + "step": 6865 + }, + { + "epoch": 1.2224002849002849, + "grad_norm": 0.6812975406646729, + "learning_rate": 0.00015750533495667655, + "loss": 0.9567, + "step": 6866 + }, + { + "epoch": 1.2225783475783476, + "grad_norm": 0.8216777443885803, + "learning_rate": 0.00015749388283865868, + "loss": 1.0908, + "step": 6867 + }, + { + "epoch": 1.2227564102564104, + "grad_norm": 0.6051010489463806, + "learning_rate": 0.00015748242959416763, + "loss": 0.8851, + "step": 6868 + }, + { + "epoch": 1.2229344729344729, + "grad_norm": 0.7750816345214844, + "learning_rate": 0.00015747097522342775, + "loss": 1.1526, + "step": 6869 + }, + { + "epoch": 1.2231125356125356, + "grad_norm": 0.6240930557250977, + "learning_rate": 0.00015745951972666355, + "loss": 1.0603, + "step": 6870 + }, + { + "epoch": 1.2232905982905984, + "grad_norm": 0.7228875160217285, + "learning_rate": 0.00015744806310409937, + "loss": 1.1028, + "step": 6871 + }, + { + "epoch": 1.223468660968661, + "grad_norm": 0.724075436592102, + "learning_rate": 0.00015743660535595978, + "loss": 0.8983, + "step": 6872 + }, + { + "epoch": 1.2236467236467237, + "grad_norm": 0.6398203372955322, + "learning_rate": 0.00015742514648246916, + "loss": 1.0548, + "step": 6873 + }, + { + "epoch": 1.2238247863247864, + "grad_norm": 0.7024285793304443, + "learning_rate": 0.00015741368648385212, + "loss": 1.0172, + "step": 6874 + }, + { + "epoch": 1.224002849002849, + 
"grad_norm": 0.6717609763145447, + "learning_rate": 0.00015740222536033316, + "loss": 0.9002, + "step": 6875 + }, + { + "epoch": 1.2241809116809117, + "grad_norm": 0.5886133313179016, + "learning_rate": 0.00015739076311213686, + "loss": 0.8614, + "step": 6876 + }, + { + "epoch": 1.2243589743589745, + "grad_norm": 0.6856684684753418, + "learning_rate": 0.00015737929973948776, + "loss": 1.1633, + "step": 6877 + }, + { + "epoch": 1.224537037037037, + "grad_norm": 0.6771421432495117, + "learning_rate": 0.00015736783524261045, + "loss": 1.0921, + "step": 6878 + }, + { + "epoch": 1.2247150997150997, + "grad_norm": 0.5016412138938904, + "learning_rate": 0.0001573563696217296, + "loss": 0.6732, + "step": 6879 + }, + { + "epoch": 1.2248931623931625, + "grad_norm": 0.7595276236534119, + "learning_rate": 0.00015734490287706984, + "loss": 1.0427, + "step": 6880 + }, + { + "epoch": 1.225071225071225, + "grad_norm": 0.6664281487464905, + "learning_rate": 0.00015733343500885582, + "loss": 1.2836, + "step": 6881 + }, + { + "epoch": 1.2252492877492878, + "grad_norm": 0.6662577390670776, + "learning_rate": 0.00015732196601731224, + "loss": 1.1288, + "step": 6882 + }, + { + "epoch": 1.2254273504273505, + "grad_norm": 0.6238988041877747, + "learning_rate": 0.00015731049590266385, + "loss": 1.0809, + "step": 6883 + }, + { + "epoch": 1.225605413105413, + "grad_norm": 0.6483062505722046, + "learning_rate": 0.00015729902466513532, + "loss": 0.9992, + "step": 6884 + }, + { + "epoch": 1.2257834757834758, + "grad_norm": 0.6890861988067627, + "learning_rate": 0.0001572875523049514, + "loss": 1.1844, + "step": 6885 + }, + { + "epoch": 1.2259615384615385, + "grad_norm": 0.7087607383728027, + "learning_rate": 0.00015727607882233695, + "loss": 1.013, + "step": 6886 + }, + { + "epoch": 1.226139601139601, + "grad_norm": 0.709048867225647, + "learning_rate": 0.00015726460421751668, + "loss": 0.9748, + "step": 6887 + }, + { + "epoch": 1.2263176638176638, + "grad_norm": 0.5918150544166565, + 
"learning_rate": 0.00015725312849071546, + "loss": 0.9978, + "step": 6888 + }, + { + "epoch": 1.2264957264957266, + "grad_norm": 0.4343377947807312, + "learning_rate": 0.0001572416516421581, + "loss": 0.6233, + "step": 6889 + }, + { + "epoch": 1.226673789173789, + "grad_norm": 0.6360403895378113, + "learning_rate": 0.00015723017367206952, + "loss": 0.9698, + "step": 6890 + }, + { + "epoch": 1.2268518518518519, + "grad_norm": 0.7261984944343567, + "learning_rate": 0.00015721869458067454, + "loss": 1.0426, + "step": 6891 + }, + { + "epoch": 1.2270299145299146, + "grad_norm": 0.6806774139404297, + "learning_rate": 0.0001572072143681981, + "loss": 0.9692, + "step": 6892 + }, + { + "epoch": 1.2272079772079771, + "grad_norm": 0.7140612006187439, + "learning_rate": 0.00015719573303486515, + "loss": 1.0828, + "step": 6893 + }, + { + "epoch": 1.22738603988604, + "grad_norm": 0.5383326411247253, + "learning_rate": 0.0001571842505809006, + "loss": 1.012, + "step": 6894 + }, + { + "epoch": 1.2275641025641026, + "grad_norm": 0.5992259383201599, + "learning_rate": 0.0001571727670065295, + "loss": 0.876, + "step": 6895 + }, + { + "epoch": 1.2277421652421652, + "grad_norm": 0.636696457862854, + "learning_rate": 0.00015716128231197676, + "loss": 1.1001, + "step": 6896 + }, + { + "epoch": 1.227920227920228, + "grad_norm": 0.5980371236801147, + "learning_rate": 0.00015714979649746744, + "loss": 0.937, + "step": 6897 + }, + { + "epoch": 1.2280982905982907, + "grad_norm": 0.7678794860839844, + "learning_rate": 0.00015713830956322656, + "loss": 1.1965, + "step": 6898 + }, + { + "epoch": 1.2282763532763532, + "grad_norm": 0.6918835639953613, + "learning_rate": 0.00015712682150947923, + "loss": 0.8578, + "step": 6899 + }, + { + "epoch": 1.228454415954416, + "grad_norm": 0.6463451385498047, + "learning_rate": 0.00015711533233645048, + "loss": 1.009, + "step": 6900 + }, + { + "epoch": 1.2286324786324787, + "grad_norm": 0.6720646023750305, + "learning_rate": 0.00015710384204436549, + "loss": 
1.0031, + "step": 6901 + }, + { + "epoch": 1.2288105413105412, + "grad_norm": 0.6618736982345581, + "learning_rate": 0.00015709235063344926, + "loss": 0.9017, + "step": 6902 + }, + { + "epoch": 1.228988603988604, + "grad_norm": 0.6789427399635315, + "learning_rate": 0.0001570808581039271, + "loss": 1.1289, + "step": 6903 + }, + { + "epoch": 1.2291666666666667, + "grad_norm": 0.6395950317382812, + "learning_rate": 0.00015706936445602403, + "loss": 1.1051, + "step": 6904 + }, + { + "epoch": 1.2293447293447293, + "grad_norm": 0.7023917436599731, + "learning_rate": 0.00015705786968996533, + "loss": 1.2876, + "step": 6905 + }, + { + "epoch": 1.229522792022792, + "grad_norm": 0.7473352551460266, + "learning_rate": 0.00015704637380597623, + "loss": 1.237, + "step": 6906 + }, + { + "epoch": 1.2297008547008548, + "grad_norm": 0.6952672004699707, + "learning_rate": 0.00015703487680428192, + "loss": 1.0674, + "step": 6907 + }, + { + "epoch": 1.2298789173789173, + "grad_norm": 0.5968644022941589, + "learning_rate": 0.0001570233786851077, + "loss": 0.9169, + "step": 6908 + }, + { + "epoch": 1.23005698005698, + "grad_norm": 0.7219798564910889, + "learning_rate": 0.0001570118794486788, + "loss": 1.0556, + "step": 6909 + }, + { + "epoch": 1.2302350427350428, + "grad_norm": 0.6603400707244873, + "learning_rate": 0.0001570003790952206, + "loss": 0.9596, + "step": 6910 + }, + { + "epoch": 1.2304131054131053, + "grad_norm": 0.5972838401794434, + "learning_rate": 0.0001569888776249583, + "loss": 0.9168, + "step": 6911 + }, + { + "epoch": 1.230591168091168, + "grad_norm": 0.792585551738739, + "learning_rate": 0.00015697737503811738, + "loss": 1.1074, + "step": 6912 + }, + { + "epoch": 1.2307692307692308, + "grad_norm": 0.5845609903335571, + "learning_rate": 0.00015696587133492314, + "loss": 0.8413, + "step": 6913 + }, + { + "epoch": 1.2309472934472934, + "grad_norm": 0.6603896021842957, + "learning_rate": 0.000156954366515601, + "loss": 0.9109, + "step": 6914 + }, + { + "epoch": 
1.2311253561253561, + "grad_norm": 0.6367142796516418, + "learning_rate": 0.00015694286058037636, + "loss": 1.0119, + "step": 6915 + }, + { + "epoch": 1.2313034188034189, + "grad_norm": 0.693854570388794, + "learning_rate": 0.00015693135352947465, + "loss": 1.0925, + "step": 6916 + }, + { + "epoch": 1.2314814814814814, + "grad_norm": 0.6570404171943665, + "learning_rate": 0.00015691984536312135, + "loss": 0.9731, + "step": 6917 + }, + { + "epoch": 1.2316595441595442, + "grad_norm": 0.6778639554977417, + "learning_rate": 0.0001569083360815419, + "loss": 1.1415, + "step": 6918 + }, + { + "epoch": 1.231837606837607, + "grad_norm": 0.6656233668327332, + "learning_rate": 0.00015689682568496182, + "loss": 0.8603, + "step": 6919 + }, + { + "epoch": 1.2320156695156694, + "grad_norm": 0.6569861173629761, + "learning_rate": 0.00015688531417360665, + "loss": 0.8374, + "step": 6920 + }, + { + "epoch": 1.2321937321937322, + "grad_norm": 0.6746888160705566, + "learning_rate": 0.0001568738015477019, + "loss": 1.1395, + "step": 6921 + }, + { + "epoch": 1.232371794871795, + "grad_norm": 0.6180813908576965, + "learning_rate": 0.00015686228780747316, + "loss": 1.0049, + "step": 6922 + }, + { + "epoch": 1.2325498575498575, + "grad_norm": 0.7326146960258484, + "learning_rate": 0.000156850772953146, + "loss": 1.2389, + "step": 6923 + }, + { + "epoch": 1.2327279202279202, + "grad_norm": 0.5912215709686279, + "learning_rate": 0.00015683925698494608, + "loss": 1.0174, + "step": 6924 + }, + { + "epoch": 1.232905982905983, + "grad_norm": 0.5214745402336121, + "learning_rate": 0.00015682773990309895, + "loss": 0.5778, + "step": 6925 + }, + { + "epoch": 1.2330840455840455, + "grad_norm": 0.6862079501152039, + "learning_rate": 0.00015681622170783034, + "loss": 0.896, + "step": 6926 + }, + { + "epoch": 1.2332621082621082, + "grad_norm": 0.7858926057815552, + "learning_rate": 0.00015680470239936586, + "loss": 1.0714, + "step": 6927 + }, + { + "epoch": 1.233440170940171, + "grad_norm": 
0.6706146597862244, + "learning_rate": 0.00015679318197793127, + "loss": 1.0157, + "step": 6928 + }, + { + "epoch": 1.2336182336182335, + "grad_norm": 0.6657105088233948, + "learning_rate": 0.00015678166044375225, + "loss": 0.9674, + "step": 6929 + }, + { + "epoch": 1.2337962962962963, + "grad_norm": 0.6790838837623596, + "learning_rate": 0.0001567701377970545, + "loss": 0.9744, + "step": 6930 + }, + { + "epoch": 1.233974358974359, + "grad_norm": 0.6469771862030029, + "learning_rate": 0.00015675861403806386, + "loss": 1.0205, + "step": 6931 + }, + { + "epoch": 1.2341524216524216, + "grad_norm": 0.4926300346851349, + "learning_rate": 0.0001567470891670061, + "loss": 0.6336, + "step": 6932 + }, + { + "epoch": 1.2343304843304843, + "grad_norm": 0.6762157082557678, + "learning_rate": 0.000156735563184107, + "loss": 1.059, + "step": 6933 + }, + { + "epoch": 1.234508547008547, + "grad_norm": 0.6998521685600281, + "learning_rate": 0.0001567240360895924, + "loss": 1.0586, + "step": 6934 + }, + { + "epoch": 1.2346866096866096, + "grad_norm": 0.5947706699371338, + "learning_rate": 0.00015671250788368814, + "loss": 0.8815, + "step": 6935 + }, + { + "epoch": 1.2348646723646723, + "grad_norm": 0.6966122984886169, + "learning_rate": 0.0001567009785666201, + "loss": 1.0105, + "step": 6936 + }, + { + "epoch": 1.235042735042735, + "grad_norm": 0.6747866272926331, + "learning_rate": 0.0001566894481386142, + "loss": 0.8783, + "step": 6937 + }, + { + "epoch": 1.2352207977207976, + "grad_norm": 0.6348921060562134, + "learning_rate": 0.0001566779165998963, + "loss": 0.7813, + "step": 6938 + }, + { + "epoch": 1.2353988603988604, + "grad_norm": 0.596466600894928, + "learning_rate": 0.00015666638395069236, + "loss": 0.8689, + "step": 6939 + }, + { + "epoch": 1.2355769230769231, + "grad_norm": 0.6926795244216919, + "learning_rate": 0.00015665485019122834, + "loss": 1.0266, + "step": 6940 + }, + { + "epoch": 1.2357549857549857, + "grad_norm": 0.6590100526809692, + "learning_rate": 
0.00015664331532173022, + "loss": 1.128, + "step": 6941 + }, + { + "epoch": 1.2359330484330484, + "grad_norm": 0.7422109246253967, + "learning_rate": 0.00015663177934242402, + "loss": 0.8495, + "step": 6942 + }, + { + "epoch": 1.2361111111111112, + "grad_norm": 0.6463228464126587, + "learning_rate": 0.0001566202422535357, + "loss": 1.0941, + "step": 6943 + }, + { + "epoch": 1.236289173789174, + "grad_norm": 0.7278686761856079, + "learning_rate": 0.0001566087040552914, + "loss": 1.2039, + "step": 6944 + }, + { + "epoch": 1.2364672364672364, + "grad_norm": 0.6917086839675903, + "learning_rate": 0.00015659716474791712, + "loss": 1.042, + "step": 6945 + }, + { + "epoch": 1.2366452991452992, + "grad_norm": 0.637205183506012, + "learning_rate": 0.00015658562433163898, + "loss": 1.0379, + "step": 6946 + }, + { + "epoch": 1.2368233618233617, + "grad_norm": 0.6706623435020447, + "learning_rate": 0.00015657408280668307, + "loss": 1.0347, + "step": 6947 + }, + { + "epoch": 1.2370014245014245, + "grad_norm": 0.6435480713844299, + "learning_rate": 0.00015656254017327553, + "loss": 0.7708, + "step": 6948 + }, + { + "epoch": 1.2371794871794872, + "grad_norm": 0.5703113675117493, + "learning_rate": 0.0001565509964316425, + "loss": 0.8786, + "step": 6949 + }, + { + "epoch": 1.23735754985755, + "grad_norm": 0.6438127160072327, + "learning_rate": 0.00015653945158201018, + "loss": 0.9435, + "step": 6950 + }, + { + "epoch": 1.2375356125356125, + "grad_norm": 0.68101966381073, + "learning_rate": 0.00015652790562460474, + "loss": 1.1062, + "step": 6951 + }, + { + "epoch": 1.2377136752136753, + "grad_norm": 0.661230206489563, + "learning_rate": 0.00015651635855965242, + "loss": 1.0113, + "step": 6952 + }, + { + "epoch": 1.2378917378917378, + "grad_norm": 0.6399117708206177, + "learning_rate": 0.0001565048103873795, + "loss": 1.1423, + "step": 6953 + }, + { + "epoch": 1.2380698005698005, + "grad_norm": 0.7614672780036926, + "learning_rate": 0.00015649326110801215, + "loss": 1.0359, + 
"step": 6954 + }, + { + "epoch": 1.2382478632478633, + "grad_norm": 0.6461986303329468, + "learning_rate": 0.00015648171072177674, + "loss": 1.0145, + "step": 6955 + }, + { + "epoch": 1.238425925925926, + "grad_norm": 0.5902668833732605, + "learning_rate": 0.0001564701592288995, + "loss": 0.9451, + "step": 6956 + }, + { + "epoch": 1.2386039886039886, + "grad_norm": 0.5686020255088806, + "learning_rate": 0.00015645860662960682, + "loss": 0.7512, + "step": 6957 + }, + { + "epoch": 1.2387820512820513, + "grad_norm": 0.6640077829360962, + "learning_rate": 0.00015644705292412503, + "loss": 0.7133, + "step": 6958 + }, + { + "epoch": 1.2389601139601139, + "grad_norm": 0.7402132749557495, + "learning_rate": 0.00015643549811268049, + "loss": 1.0903, + "step": 6959 + }, + { + "epoch": 1.2391381766381766, + "grad_norm": 0.62332683801651, + "learning_rate": 0.00015642394219549962, + "loss": 0.9378, + "step": 6960 + }, + { + "epoch": 1.2393162393162394, + "grad_norm": 0.6374901533126831, + "learning_rate": 0.00015641238517280877, + "loss": 1.0746, + "step": 6961 + }, + { + "epoch": 1.239494301994302, + "grad_norm": 0.5939112901687622, + "learning_rate": 0.00015640082704483443, + "loss": 0.7185, + "step": 6962 + }, + { + "epoch": 1.2396723646723646, + "grad_norm": 0.8378096222877502, + "learning_rate": 0.00015638926781180306, + "loss": 1.1932, + "step": 6963 + }, + { + "epoch": 1.2398504273504274, + "grad_norm": 0.5707982778549194, + "learning_rate": 0.0001563777074739411, + "loss": 0.9834, + "step": 6964 + }, + { + "epoch": 1.24002849002849, + "grad_norm": 0.6339748501777649, + "learning_rate": 0.00015636614603147512, + "loss": 1.0307, + "step": 6965 + }, + { + "epoch": 1.2402065527065527, + "grad_norm": 0.7353155016899109, + "learning_rate": 0.00015635458348463156, + "loss": 1.0311, + "step": 6966 + }, + { + "epoch": 1.2403846153846154, + "grad_norm": 0.8307726979255676, + "learning_rate": 0.00015634301983363704, + "loss": 1.0673, + "step": 6967 + }, + { + "epoch": 
1.2405626780626782, + "grad_norm": 0.5299199819564819, + "learning_rate": 0.00015633145507871807, + "loss": 0.6649, + "step": 6968 + }, + { + "epoch": 1.2407407407407407, + "grad_norm": 0.6162533760070801, + "learning_rate": 0.00015631988922010126, + "loss": 0.8096, + "step": 6969 + }, + { + "epoch": 1.2409188034188035, + "grad_norm": 0.6212689876556396, + "learning_rate": 0.0001563083222580132, + "loss": 1.0371, + "step": 6970 + }, + { + "epoch": 1.241096866096866, + "grad_norm": 0.6148123145103455, + "learning_rate": 0.00015629675419268055, + "loss": 1.0439, + "step": 6971 + }, + { + "epoch": 1.2412749287749287, + "grad_norm": 0.6163684129714966, + "learning_rate": 0.00015628518502432994, + "loss": 0.9075, + "step": 6972 + }, + { + "epoch": 1.2414529914529915, + "grad_norm": 0.5127472877502441, + "learning_rate": 0.00015627361475318807, + "loss": 0.6138, + "step": 6973 + }, + { + "epoch": 1.2416310541310542, + "grad_norm": 0.6508103013038635, + "learning_rate": 0.0001562620433794816, + "loss": 0.9608, + "step": 6974 + }, + { + "epoch": 1.2418091168091168, + "grad_norm": 0.6711046695709229, + "learning_rate": 0.0001562504709034373, + "loss": 1.1494, + "step": 6975 + }, + { + "epoch": 1.2419871794871795, + "grad_norm": 0.6831514835357666, + "learning_rate": 0.00015623889732528182, + "loss": 0.9664, + "step": 6976 + }, + { + "epoch": 1.242165242165242, + "grad_norm": 0.693732738494873, + "learning_rate": 0.00015622732264524198, + "loss": 0.9055, + "step": 6977 + }, + { + "epoch": 1.2423433048433048, + "grad_norm": 0.8475173711776733, + "learning_rate": 0.00015621574686354456, + "loss": 1.2014, + "step": 6978 + }, + { + "epoch": 1.2425213675213675, + "grad_norm": 0.6342347264289856, + "learning_rate": 0.0001562041699804164, + "loss": 1.0691, + "step": 6979 + }, + { + "epoch": 1.2426994301994303, + "grad_norm": 0.620517373085022, + "learning_rate": 0.00015619259199608422, + "loss": 0.7318, + "step": 6980 + }, + { + "epoch": 1.2428774928774928, + "grad_norm": 
0.589567244052887, + "learning_rate": 0.000156181012910775, + "loss": 1.0656, + "step": 6981 + }, + { + "epoch": 1.2430555555555556, + "grad_norm": 0.7570258975028992, + "learning_rate": 0.00015616943272471546, + "loss": 1.0517, + "step": 6982 + }, + { + "epoch": 1.243233618233618, + "grad_norm": 0.6232032775878906, + "learning_rate": 0.00015615785143813262, + "loss": 0.8867, + "step": 6983 + }, + { + "epoch": 1.2434116809116809, + "grad_norm": 0.630095899105072, + "learning_rate": 0.0001561462690512533, + "loss": 0.9287, + "step": 6984 + }, + { + "epoch": 1.2435897435897436, + "grad_norm": 0.7410848140716553, + "learning_rate": 0.00015613468556430454, + "loss": 1.162, + "step": 6985 + }, + { + "epoch": 1.2437678062678064, + "grad_norm": 0.7574684023857117, + "learning_rate": 0.00015612310097751317, + "loss": 1.2118, + "step": 6986 + }, + { + "epoch": 1.243945868945869, + "grad_norm": 0.580760657787323, + "learning_rate": 0.0001561115152911062, + "loss": 1.0612, + "step": 6987 + }, + { + "epoch": 1.2441239316239316, + "grad_norm": 0.6105104088783264, + "learning_rate": 0.00015609992850531073, + "loss": 0.9262, + "step": 6988 + }, + { + "epoch": 1.2443019943019944, + "grad_norm": 0.669435441493988, + "learning_rate": 0.00015608834062035362, + "loss": 0.9595, + "step": 6989 + }, + { + "epoch": 1.244480056980057, + "grad_norm": 0.6530314683914185, + "learning_rate": 0.00015607675163646206, + "loss": 0.7987, + "step": 6990 + }, + { + "epoch": 1.2446581196581197, + "grad_norm": 0.5801477432250977, + "learning_rate": 0.00015606516155386297, + "loss": 0.7667, + "step": 6991 + }, + { + "epoch": 1.2448361823361824, + "grad_norm": 0.5773885250091553, + "learning_rate": 0.00015605357037278355, + "loss": 0.847, + "step": 6992 + }, + { + "epoch": 1.245014245014245, + "grad_norm": 0.5399810075759888, + "learning_rate": 0.00015604197809345082, + "loss": 0.9284, + "step": 6993 + }, + { + "epoch": 1.2451923076923077, + "grad_norm": 0.5910452604293823, + "learning_rate": 
0.000156030384716092, + "loss": 1.0004, + "step": 6994 + }, + { + "epoch": 1.2453703703703705, + "grad_norm": 0.5979224443435669, + "learning_rate": 0.00015601879024093414, + "loss": 0.9027, + "step": 6995 + }, + { + "epoch": 1.245548433048433, + "grad_norm": 0.6092126369476318, + "learning_rate": 0.0001560071946682045, + "loss": 0.9755, + "step": 6996 + }, + { + "epoch": 1.2457264957264957, + "grad_norm": 0.6536708474159241, + "learning_rate": 0.0001559955979981302, + "loss": 1.1828, + "step": 6997 + }, + { + "epoch": 1.2459045584045585, + "grad_norm": 0.6602030992507935, + "learning_rate": 0.00015598400023093847, + "loss": 1.0395, + "step": 6998 + }, + { + "epoch": 1.246082621082621, + "grad_norm": 0.6864825487136841, + "learning_rate": 0.00015597240136685657, + "loss": 1.083, + "step": 6999 + }, + { + "epoch": 1.2462606837606838, + "grad_norm": 0.6194674968719482, + "learning_rate": 0.0001559608014061117, + "loss": 1.0461, + "step": 7000 + }, + { + "epoch": 1.2464387464387465, + "grad_norm": 0.5879074335098267, + "learning_rate": 0.00015594920034893122, + "loss": 1.076, + "step": 7001 + }, + { + "epoch": 1.246616809116809, + "grad_norm": 0.6514387726783752, + "learning_rate": 0.00015593759819554234, + "loss": 1.0396, + "step": 7002 + }, + { + "epoch": 1.2467948717948718, + "grad_norm": 0.5988301634788513, + "learning_rate": 0.00015592599494617247, + "loss": 0.9501, + "step": 7003 + }, + { + "epoch": 1.2469729344729346, + "grad_norm": 0.6282773613929749, + "learning_rate": 0.00015591439060104887, + "loss": 1.1002, + "step": 7004 + }, + { + "epoch": 1.247150997150997, + "grad_norm": 0.6910465955734253, + "learning_rate": 0.00015590278516039896, + "loss": 1.1771, + "step": 7005 + }, + { + "epoch": 1.2473290598290598, + "grad_norm": 0.6097282767295837, + "learning_rate": 0.00015589117862445007, + "loss": 1.0707, + "step": 7006 + }, + { + "epoch": 1.2475071225071226, + "grad_norm": 0.7076875567436218, + "learning_rate": 0.00015587957099342967, + "loss": 1.0078, + 
"step": 7007 + }, + { + "epoch": 1.2476851851851851, + "grad_norm": 0.6776556372642517, + "learning_rate": 0.00015586796226756518, + "loss": 0.8971, + "step": 7008 + }, + { + "epoch": 1.2478632478632479, + "grad_norm": 0.6506341695785522, + "learning_rate": 0.00015585635244708398, + "loss": 0.9727, + "step": 7009 + }, + { + "epoch": 1.2480413105413106, + "grad_norm": 0.624724805355072, + "learning_rate": 0.00015584474153221357, + "loss": 0.9858, + "step": 7010 + }, + { + "epoch": 1.2482193732193732, + "grad_norm": 0.6070096492767334, + "learning_rate": 0.0001558331295231815, + "loss": 0.9385, + "step": 7011 + }, + { + "epoch": 1.248397435897436, + "grad_norm": 0.6948656439781189, + "learning_rate": 0.00015582151642021524, + "loss": 0.9425, + "step": 7012 + }, + { + "epoch": 1.2485754985754987, + "grad_norm": 0.6559088230133057, + "learning_rate": 0.0001558099022235423, + "loss": 1.0002, + "step": 7013 + }, + { + "epoch": 1.2487535612535612, + "grad_norm": 0.6097117066383362, + "learning_rate": 0.00015579828693339026, + "loss": 1.0234, + "step": 7014 + }, + { + "epoch": 1.248931623931624, + "grad_norm": 0.6612260341644287, + "learning_rate": 0.00015578667054998673, + "loss": 1.1376, + "step": 7015 + }, + { + "epoch": 1.2491096866096867, + "grad_norm": 0.6305607557296753, + "learning_rate": 0.00015577505307355925, + "loss": 0.9127, + "step": 7016 + }, + { + "epoch": 1.2492877492877492, + "grad_norm": 0.6648319959640503, + "learning_rate": 0.00015576343450433549, + "loss": 0.8697, + "step": 7017 + }, + { + "epoch": 1.249465811965812, + "grad_norm": 0.7642946839332581, + "learning_rate": 0.00015575181484254303, + "loss": 1.0998, + "step": 7018 + }, + { + "epoch": 1.2496438746438747, + "grad_norm": 0.6775243282318115, + "learning_rate": 0.00015574019408840962, + "loss": 1.0186, + "step": 7019 + }, + { + "epoch": 1.2498219373219372, + "grad_norm": 0.6075591444969177, + "learning_rate": 0.00015572857224216286, + "loss": 0.9592, + "step": 7020 + }, + { + "epoch": 
1.2498219373219372, + "eval_loss": 1.105136752128601, + "eval_runtime": 24.4793, + "eval_samples_per_second": 42.526, + "eval_steps_per_second": 21.283, + "step": 7020 + }, + { + "epoch": 1.25, + "grad_norm": 0.5856962203979492, + "learning_rate": 0.0001557169493040305, + "loss": 0.8336, + "step": 7021 + }, + { + "epoch": 1.2501780626780628, + "grad_norm": 0.6451364159584045, + "learning_rate": 0.00015570532527424028, + "loss": 0.8805, + "step": 7022 + }, + { + "epoch": 1.2503561253561253, + "grad_norm": 0.6266474723815918, + "learning_rate": 0.00015569370015301991, + "loss": 1.0023, + "step": 7023 + }, + { + "epoch": 1.250534188034188, + "grad_norm": 0.5547378063201904, + "learning_rate": 0.00015568207394059722, + "loss": 0.7385, + "step": 7024 + }, + { + "epoch": 1.2507122507122508, + "grad_norm": 0.604169487953186, + "learning_rate": 0.0001556704466371999, + "loss": 0.9194, + "step": 7025 + }, + { + "epoch": 1.2508903133903133, + "grad_norm": 0.7054405212402344, + "learning_rate": 0.00015565881824305586, + "loss": 1.1864, + "step": 7026 + }, + { + "epoch": 1.251068376068376, + "grad_norm": 0.6429929733276367, + "learning_rate": 0.0001556471887583929, + "loss": 1.0129, + "step": 7027 + }, + { + "epoch": 1.2512464387464388, + "grad_norm": 0.695957362651825, + "learning_rate": 0.00015563555818343887, + "loss": 1.2994, + "step": 7028 + }, + { + "epoch": 1.2514245014245013, + "grad_norm": 0.5889938473701477, + "learning_rate": 0.0001556239265184216, + "loss": 1.0109, + "step": 7029 + }, + { + "epoch": 1.251602564102564, + "grad_norm": 0.6424569487571716, + "learning_rate": 0.0001556122937635691, + "loss": 0.8585, + "step": 7030 + }, + { + "epoch": 1.2517806267806268, + "grad_norm": 0.5561244487762451, + "learning_rate": 0.0001556006599191092, + "loss": 0.9994, + "step": 7031 + }, + { + "epoch": 1.2519586894586894, + "grad_norm": 0.6355302333831787, + "learning_rate": 0.00015558902498526988, + "loss": 0.9495, + "step": 7032 + }, + { + "epoch": 1.2521367521367521, + 
"grad_norm": 0.6272686719894409, + "learning_rate": 0.00015557738896227908, + "loss": 0.7611, + "step": 7033 + }, + { + "epoch": 1.2523148148148149, + "grad_norm": 0.7069199085235596, + "learning_rate": 0.00015556575185036482, + "loss": 1.0612, + "step": 7034 + }, + { + "epoch": 1.2524928774928774, + "grad_norm": 0.6635094285011292, + "learning_rate": 0.00015555411364975505, + "loss": 1.1182, + "step": 7035 + }, + { + "epoch": 1.2526709401709402, + "grad_norm": 0.6112014651298523, + "learning_rate": 0.00015554247436067785, + "loss": 0.8677, + "step": 7036 + }, + { + "epoch": 1.252849002849003, + "grad_norm": 0.678963303565979, + "learning_rate": 0.00015553083398336126, + "loss": 1.1421, + "step": 7037 + }, + { + "epoch": 1.2530270655270654, + "grad_norm": 0.6291939616203308, + "learning_rate": 0.0001555191925180333, + "loss": 0.9157, + "step": 7038 + }, + { + "epoch": 1.2532051282051282, + "grad_norm": 0.6519795656204224, + "learning_rate": 0.0001555075499649221, + "loss": 1.0074, + "step": 7039 + }, + { + "epoch": 1.253383190883191, + "grad_norm": 0.6063529849052429, + "learning_rate": 0.00015549590632425576, + "loss": 1.0205, + "step": 7040 + }, + { + "epoch": 1.2535612535612537, + "grad_norm": 0.7055633664131165, + "learning_rate": 0.00015548426159626242, + "loss": 1.0254, + "step": 7041 + }, + { + "epoch": 1.2537393162393162, + "grad_norm": 0.6783022880554199, + "learning_rate": 0.00015547261578117025, + "loss": 1.1017, + "step": 7042 + }, + { + "epoch": 1.253917378917379, + "grad_norm": 0.7055003643035889, + "learning_rate": 0.0001554609688792074, + "loss": 1.0269, + "step": 7043 + }, + { + "epoch": 1.2540954415954415, + "grad_norm": 0.6465007662773132, + "learning_rate": 0.0001554493208906021, + "loss": 1.0492, + "step": 7044 + }, + { + "epoch": 1.2542735042735043, + "grad_norm": 0.6443775296211243, + "learning_rate": 0.0001554376718155825, + "loss": 0.9778, + "step": 7045 + }, + { + "epoch": 1.254451566951567, + "grad_norm": 0.695214569568634, + 
"learning_rate": 0.0001554260216543769, + "loss": 0.8792, + "step": 7046 + }, + { + "epoch": 1.2546296296296298, + "grad_norm": 0.6777814626693726, + "learning_rate": 0.00015541437040721354, + "loss": 0.8944, + "step": 7047 + }, + { + "epoch": 1.2548076923076923, + "grad_norm": 0.6269369721412659, + "learning_rate": 0.0001554027180743207, + "loss": 0.8825, + "step": 7048 + }, + { + "epoch": 1.254985754985755, + "grad_norm": 0.6197061538696289, + "learning_rate": 0.0001553910646559267, + "loss": 0.9823, + "step": 7049 + }, + { + "epoch": 1.2551638176638176, + "grad_norm": 0.681347131729126, + "learning_rate": 0.00015537941015225984, + "loss": 0.995, + "step": 7050 + }, + { + "epoch": 1.2553418803418803, + "grad_norm": 0.6224286556243896, + "learning_rate": 0.00015536775456354848, + "loss": 0.7714, + "step": 7051 + }, + { + "epoch": 1.255519943019943, + "grad_norm": 0.6113278269767761, + "learning_rate": 0.00015535609789002098, + "loss": 0.9859, + "step": 7052 + }, + { + "epoch": 1.2556980056980058, + "grad_norm": 0.6985422372817993, + "learning_rate": 0.00015534444013190577, + "loss": 0.8785, + "step": 7053 + }, + { + "epoch": 1.2558760683760684, + "grad_norm": 0.5602933168411255, + "learning_rate": 0.00015533278128943118, + "loss": 0.8341, + "step": 7054 + }, + { + "epoch": 1.256054131054131, + "grad_norm": 0.587684690952301, + "learning_rate": 0.0001553211213628257, + "loss": 0.7933, + "step": 7055 + }, + { + "epoch": 1.2562321937321936, + "grad_norm": 0.692997932434082, + "learning_rate": 0.0001553094603523178, + "loss": 1.0957, + "step": 7056 + }, + { + "epoch": 1.2564102564102564, + "grad_norm": 0.6925587058067322, + "learning_rate": 0.00015529779825813588, + "loss": 0.8602, + "step": 7057 + }, + { + "epoch": 1.2565883190883191, + "grad_norm": 0.6383063197135925, + "learning_rate": 0.0001552861350805085, + "loss": 0.9933, + "step": 7058 + }, + { + "epoch": 1.256766381766382, + "grad_norm": 0.6520544290542603, + "learning_rate": 0.00015527447081966413, + "loss": 
0.9498, + "step": 7059 + }, + { + "epoch": 1.2569444444444444, + "grad_norm": 0.7353914380073547, + "learning_rate": 0.00015526280547583133, + "loss": 1.1071, + "step": 7060 + }, + { + "epoch": 1.2571225071225072, + "grad_norm": 0.7141618132591248, + "learning_rate": 0.00015525113904923864, + "loss": 0.8333, + "step": 7061 + }, + { + "epoch": 1.2573005698005697, + "grad_norm": 0.6194499731063843, + "learning_rate": 0.00015523947154011468, + "loss": 0.9421, + "step": 7062 + }, + { + "epoch": 1.2574786324786325, + "grad_norm": 0.7514514327049255, + "learning_rate": 0.00015522780294868803, + "loss": 1.226, + "step": 7063 + }, + { + "epoch": 1.2576566951566952, + "grad_norm": 0.762923538684845, + "learning_rate": 0.0001552161332751873, + "loss": 1.1893, + "step": 7064 + }, + { + "epoch": 1.257834757834758, + "grad_norm": 0.6265730261802673, + "learning_rate": 0.00015520446251984113, + "loss": 0.6604, + "step": 7065 + }, + { + "epoch": 1.2580128205128205, + "grad_norm": 0.6447750329971313, + "learning_rate": 0.0001551927906828782, + "loss": 0.9814, + "step": 7066 + }, + { + "epoch": 1.2581908831908832, + "grad_norm": 0.5791042447090149, + "learning_rate": 0.00015518111776452722, + "loss": 0.8283, + "step": 7067 + }, + { + "epoch": 1.2583689458689458, + "grad_norm": 0.5267777442932129, + "learning_rate": 0.00015516944376501682, + "loss": 0.5748, + "step": 7068 + }, + { + "epoch": 1.2585470085470085, + "grad_norm": 0.7343912720680237, + "learning_rate": 0.0001551577686845758, + "loss": 1.1777, + "step": 7069 + }, + { + "epoch": 1.2587250712250713, + "grad_norm": 0.645746111869812, + "learning_rate": 0.00015514609252343284, + "loss": 0.9356, + "step": 7070 + }, + { + "epoch": 1.258903133903134, + "grad_norm": 0.6993104219436646, + "learning_rate": 0.0001551344152818168, + "loss": 1.06, + "step": 7071 + }, + { + "epoch": 1.2590811965811965, + "grad_norm": 0.6661365628242493, + "learning_rate": 0.0001551227369599564, + "loss": 1.061, + "step": 7072 + }, + { + "epoch": 
1.2592592592592593, + "grad_norm": 0.7833736538887024, + "learning_rate": 0.0001551110575580805, + "loss": 0.9674, + "step": 7073 + }, + { + "epoch": 1.2594373219373218, + "grad_norm": 0.5878575444221497, + "learning_rate": 0.00015509937707641787, + "loss": 0.9002, + "step": 7074 + }, + { + "epoch": 1.2596153846153846, + "grad_norm": 0.6402907371520996, + "learning_rate": 0.00015508769551519745, + "loss": 1.0157, + "step": 7075 + }, + { + "epoch": 1.2597934472934473, + "grad_norm": 0.6794611215591431, + "learning_rate": 0.00015507601287464805, + "loss": 1.052, + "step": 7076 + }, + { + "epoch": 1.25997150997151, + "grad_norm": 0.706922173500061, + "learning_rate": 0.0001550643291549986, + "loss": 1.0814, + "step": 7077 + }, + { + "epoch": 1.2601495726495726, + "grad_norm": 0.6722953915596008, + "learning_rate": 0.000155052644356478, + "loss": 1.1402, + "step": 7078 + }, + { + "epoch": 1.2603276353276354, + "grad_norm": 0.6619611978530884, + "learning_rate": 0.00015504095847931518, + "loss": 0.9583, + "step": 7079 + }, + { + "epoch": 1.260505698005698, + "grad_norm": 0.5645583271980286, + "learning_rate": 0.00015502927152373914, + "loss": 0.6746, + "step": 7080 + }, + { + "epoch": 1.2606837606837606, + "grad_norm": 0.6634977459907532, + "learning_rate": 0.00015501758348997882, + "loss": 1.0451, + "step": 7081 + }, + { + "epoch": 1.2608618233618234, + "grad_norm": 0.7167651057243347, + "learning_rate": 0.00015500589437826326, + "loss": 0.931, + "step": 7082 + }, + { + "epoch": 1.2610398860398861, + "grad_norm": 0.6179340481758118, + "learning_rate": 0.00015499420418882146, + "loss": 1.0953, + "step": 7083 + }, + { + "epoch": 1.2612179487179487, + "grad_norm": 0.6948468685150146, + "learning_rate": 0.00015498251292188247, + "loss": 1.0277, + "step": 7084 + }, + { + "epoch": 1.2613960113960114, + "grad_norm": 0.6256045699119568, + "learning_rate": 0.00015497082057767532, + "loss": 1.0154, + "step": 7085 + }, + { + "epoch": 1.261574074074074, + "grad_norm": 
0.6457428336143494, + "learning_rate": 0.0001549591271564292, + "loss": 0.9693, + "step": 7086 + }, + { + "epoch": 1.2617521367521367, + "grad_norm": 0.722259521484375, + "learning_rate": 0.0001549474326583731, + "loss": 0.9176, + "step": 7087 + }, + { + "epoch": 1.2619301994301995, + "grad_norm": 0.742477297782898, + "learning_rate": 0.0001549357370837362, + "loss": 0.9813, + "step": 7088 + }, + { + "epoch": 1.2621082621082622, + "grad_norm": 0.5981723666191101, + "learning_rate": 0.0001549240404327477, + "loss": 0.8943, + "step": 7089 + }, + { + "epoch": 1.2622863247863247, + "grad_norm": 0.6266574859619141, + "learning_rate": 0.00015491234270563665, + "loss": 0.8439, + "step": 7090 + }, + { + "epoch": 1.2624643874643875, + "grad_norm": 0.6723998188972473, + "learning_rate": 0.00015490064390263238, + "loss": 1.2278, + "step": 7091 + }, + { + "epoch": 1.26264245014245, + "grad_norm": 0.6628100275993347, + "learning_rate": 0.00015488894402396398, + "loss": 0.9526, + "step": 7092 + }, + { + "epoch": 1.2628205128205128, + "grad_norm": 0.6661350727081299, + "learning_rate": 0.0001548772430698608, + "loss": 0.974, + "step": 7093 + }, + { + "epoch": 1.2629985754985755, + "grad_norm": 0.8210669755935669, + "learning_rate": 0.000154865541040552, + "loss": 1.1142, + "step": 7094 + }, + { + "epoch": 1.2631766381766383, + "grad_norm": 0.6329003572463989, + "learning_rate": 0.0001548538379362669, + "loss": 0.8485, + "step": 7095 + }, + { + "epoch": 1.2633547008547008, + "grad_norm": 0.6288384795188904, + "learning_rate": 0.0001548421337572348, + "loss": 0.816, + "step": 7096 + }, + { + "epoch": 1.2635327635327636, + "grad_norm": 0.631060004234314, + "learning_rate": 0.00015483042850368504, + "loss": 0.8237, + "step": 7097 + }, + { + "epoch": 1.263710826210826, + "grad_norm": 0.7343839406967163, + "learning_rate": 0.0001548187221758469, + "loss": 1.1507, + "step": 7098 + }, + { + "epoch": 1.2638888888888888, + "grad_norm": 0.6313042640686035, + "learning_rate": 
0.0001548070147739498, + "loss": 0.7762, + "step": 7099 + }, + { + "epoch": 1.2640669515669516, + "grad_norm": 0.6449850797653198, + "learning_rate": 0.00015479530629822308, + "loss": 0.9225, + "step": 7100 + }, + { + "epoch": 1.2642450142450143, + "grad_norm": 0.6371589303016663, + "learning_rate": 0.00015478359674889617, + "loss": 1.0088, + "step": 7101 + }, + { + "epoch": 1.2644230769230769, + "grad_norm": 0.6483678221702576, + "learning_rate": 0.00015477188612619849, + "loss": 0.6234, + "step": 7102 + }, + { + "epoch": 1.2646011396011396, + "grad_norm": 0.6945441365242004, + "learning_rate": 0.00015476017443035947, + "loss": 1.123, + "step": 7103 + }, + { + "epoch": 1.2647792022792022, + "grad_norm": 0.6356340050697327, + "learning_rate": 0.00015474846166160856, + "loss": 0.9923, + "step": 7104 + }, + { + "epoch": 1.264957264957265, + "grad_norm": 0.6774702668190002, + "learning_rate": 0.00015473674782017532, + "loss": 0.9694, + "step": 7105 + }, + { + "epoch": 1.2651353276353277, + "grad_norm": 0.6332793831825256, + "learning_rate": 0.0001547250329062892, + "loss": 1.0633, + "step": 7106 + }, + { + "epoch": 1.2653133903133904, + "grad_norm": 0.6563684344291687, + "learning_rate": 0.00015471331692017972, + "loss": 1.0893, + "step": 7107 + }, + { + "epoch": 1.265491452991453, + "grad_norm": 0.7318371534347534, + "learning_rate": 0.0001547015998620765, + "loss": 1.1777, + "step": 7108 + }, + { + "epoch": 1.2656695156695157, + "grad_norm": 0.7099173069000244, + "learning_rate": 0.000154689881732209, + "loss": 1.1717, + "step": 7109 + }, + { + "epoch": 1.2658475783475782, + "grad_norm": 0.661078691482544, + "learning_rate": 0.00015467816253080693, + "loss": 1.0448, + "step": 7110 + }, + { + "epoch": 1.266025641025641, + "grad_norm": 0.6206802129745483, + "learning_rate": 0.0001546664422580998, + "loss": 0.9334, + "step": 7111 + }, + { + "epoch": 1.2662037037037037, + "grad_norm": 0.6514355540275574, + "learning_rate": 0.00015465472091431728, + "loss": 0.9533, + 
"step": 7112 + }, + { + "epoch": 1.2663817663817665, + "grad_norm": 0.6090209484100342, + "learning_rate": 0.0001546429984996891, + "loss": 0.9206, + "step": 7113 + }, + { + "epoch": 1.266559829059829, + "grad_norm": 0.6345987915992737, + "learning_rate": 0.00015463127501444488, + "loss": 1.0537, + "step": 7114 + }, + { + "epoch": 1.2667378917378918, + "grad_norm": 0.6095160245895386, + "learning_rate": 0.0001546195504588143, + "loss": 0.8652, + "step": 7115 + }, + { + "epoch": 1.2669159544159543, + "grad_norm": 0.6751621961593628, + "learning_rate": 0.00015460782483302707, + "loss": 0.9001, + "step": 7116 + }, + { + "epoch": 1.267094017094017, + "grad_norm": 0.6261575222015381, + "learning_rate": 0.00015459609813731295, + "loss": 0.929, + "step": 7117 + }, + { + "epoch": 1.2672720797720798, + "grad_norm": 0.589495837688446, + "learning_rate": 0.0001545843703719017, + "loss": 0.9023, + "step": 7118 + }, + { + "epoch": 1.2674501424501425, + "grad_norm": 0.6364617943763733, + "learning_rate": 0.00015457264153702311, + "loss": 0.8261, + "step": 7119 + }, + { + "epoch": 1.267628205128205, + "grad_norm": 0.6685599684715271, + "learning_rate": 0.00015456091163290698, + "loss": 1.1267, + "step": 7120 + }, + { + "epoch": 1.2678062678062678, + "grad_norm": 0.6440932750701904, + "learning_rate": 0.0001545491806597831, + "loss": 0.9643, + "step": 7121 + }, + { + "epoch": 1.2679843304843303, + "grad_norm": 0.7641597390174866, + "learning_rate": 0.00015453744861788137, + "loss": 1.1577, + "step": 7122 + }, + { + "epoch": 1.268162393162393, + "grad_norm": 0.6965937614440918, + "learning_rate": 0.00015452571550743163, + "loss": 0.7835, + "step": 7123 + }, + { + "epoch": 1.2683404558404558, + "grad_norm": 0.6332844495773315, + "learning_rate": 0.00015451398132866376, + "loss": 0.9794, + "step": 7124 + }, + { + "epoch": 1.2685185185185186, + "grad_norm": 0.6719903349876404, + "learning_rate": 0.00015450224608180765, + "loss": 0.9795, + "step": 7125 + }, + { + "epoch": 
1.2686965811965811, + "grad_norm": 0.567414402961731, + "learning_rate": 0.00015449050976709328, + "loss": 0.9737, + "step": 7126 + }, + { + "epoch": 1.2688746438746439, + "grad_norm": 0.6810645461082458, + "learning_rate": 0.0001544787723847505, + "loss": 1.2358, + "step": 7127 + }, + { + "epoch": 1.2690527065527066, + "grad_norm": 0.6693191528320312, + "learning_rate": 0.00015446703393500938, + "loss": 0.9475, + "step": 7128 + }, + { + "epoch": 1.2692307692307692, + "grad_norm": 0.7077522277832031, + "learning_rate": 0.00015445529441809988, + "loss": 1.013, + "step": 7129 + }, + { + "epoch": 1.269408831908832, + "grad_norm": 0.6596258878707886, + "learning_rate": 0.000154443553834252, + "loss": 1.1506, + "step": 7130 + }, + { + "epoch": 1.2695868945868947, + "grad_norm": 0.6721500754356384, + "learning_rate": 0.0001544318121836958, + "loss": 0.8848, + "step": 7131 + }, + { + "epoch": 1.2697649572649572, + "grad_norm": 0.6943998336791992, + "learning_rate": 0.00015442006946666132, + "loss": 1.1118, + "step": 7132 + }, + { + "epoch": 1.26994301994302, + "grad_norm": 0.6132234930992126, + "learning_rate": 0.0001544083256833786, + "loss": 0.9932, + "step": 7133 + }, + { + "epoch": 1.2701210826210827, + "grad_norm": 0.7337939739227295, + "learning_rate": 0.00015439658083407775, + "loss": 1.0973, + "step": 7134 + }, + { + "epoch": 1.2702991452991452, + "grad_norm": 0.6551772356033325, + "learning_rate": 0.00015438483491898893, + "loss": 1.0006, + "step": 7135 + }, + { + "epoch": 1.270477207977208, + "grad_norm": 0.660068929195404, + "learning_rate": 0.00015437308793834223, + "loss": 0.9291, + "step": 7136 + }, + { + "epoch": 1.2706552706552707, + "grad_norm": 0.7622788548469543, + "learning_rate": 0.00015436133989236783, + "loss": 1.0782, + "step": 7137 + }, + { + "epoch": 1.2708333333333333, + "grad_norm": 0.848494291305542, + "learning_rate": 0.00015434959078129587, + "loss": 1.2001, + "step": 7138 + }, + { + "epoch": 1.271011396011396, + "grad_norm": 
0.6222602725028992, + "learning_rate": 0.0001543378406053566, + "loss": 1.011, + "step": 7139 + }, + { + "epoch": 1.2711894586894588, + "grad_norm": 0.6164663434028625, + "learning_rate": 0.00015432608936478026, + "loss": 1.0282, + "step": 7140 + }, + { + "epoch": 1.2713675213675213, + "grad_norm": 0.7236546277999878, + "learning_rate": 0.000154314337059797, + "loss": 1.0112, + "step": 7141 + }, + { + "epoch": 1.271545584045584, + "grad_norm": 0.6891111135482788, + "learning_rate": 0.00015430258369063715, + "loss": 1.1191, + "step": 7142 + }, + { + "epoch": 1.2717236467236468, + "grad_norm": 0.6600295901298523, + "learning_rate": 0.00015429082925753099, + "loss": 0.9561, + "step": 7143 + }, + { + "epoch": 1.2719017094017093, + "grad_norm": 0.6819902062416077, + "learning_rate": 0.0001542790737607088, + "loss": 1.0631, + "step": 7144 + }, + { + "epoch": 1.272079772079772, + "grad_norm": 0.6518470644950867, + "learning_rate": 0.0001542673172004009, + "loss": 1.0806, + "step": 7145 + }, + { + "epoch": 1.2722578347578348, + "grad_norm": 0.737501859664917, + "learning_rate": 0.00015425555957683767, + "loss": 1.0144, + "step": 7146 + }, + { + "epoch": 1.2724358974358974, + "grad_norm": 0.6245740652084351, + "learning_rate": 0.00015424380089024944, + "loss": 1.0612, + "step": 7147 + }, + { + "epoch": 1.27261396011396, + "grad_norm": 0.7118125557899475, + "learning_rate": 0.0001542320411408666, + "loss": 1.1458, + "step": 7148 + }, + { + "epoch": 1.2727920227920229, + "grad_norm": 0.6965761780738831, + "learning_rate": 0.00015422028032891958, + "loss": 0.8052, + "step": 7149 + }, + { + "epoch": 1.2729700854700854, + "grad_norm": 0.7661466598510742, + "learning_rate": 0.0001542085184546388, + "loss": 1.1245, + "step": 7150 + }, + { + "epoch": 1.2731481481481481, + "grad_norm": 0.7238876223564148, + "learning_rate": 0.00015419675551825475, + "loss": 0.9346, + "step": 7151 + }, + { + "epoch": 1.273326210826211, + "grad_norm": 0.669562041759491, + "learning_rate": 
0.0001541849915199978, + "loss": 0.7816, + "step": 7152 + }, + { + "epoch": 1.2735042735042734, + "grad_norm": 0.6799174547195435, + "learning_rate": 0.00015417322646009855, + "loss": 1.047, + "step": 7153 + }, + { + "epoch": 1.2736823361823362, + "grad_norm": 0.6012796759605408, + "learning_rate": 0.00015416146033878745, + "loss": 1.0101, + "step": 7154 + }, + { + "epoch": 1.273860398860399, + "grad_norm": 0.7008427977561951, + "learning_rate": 0.00015414969315629505, + "loss": 1.1321, + "step": 7155 + }, + { + "epoch": 1.2740384615384617, + "grad_norm": 0.6555556058883667, + "learning_rate": 0.0001541379249128519, + "loss": 0.9926, + "step": 7156 + }, + { + "epoch": 1.2742165242165242, + "grad_norm": 0.6324251294136047, + "learning_rate": 0.00015412615560868854, + "loss": 0.9051, + "step": 7157 + }, + { + "epoch": 1.274394586894587, + "grad_norm": 0.6035568714141846, + "learning_rate": 0.0001541143852440356, + "loss": 0.8248, + "step": 7158 + }, + { + "epoch": 1.2745726495726495, + "grad_norm": 0.6733569502830505, + "learning_rate": 0.0001541026138191237, + "loss": 0.9149, + "step": 7159 + }, + { + "epoch": 1.2747507122507122, + "grad_norm": 0.8306798338890076, + "learning_rate": 0.0001540908413341835, + "loss": 1.0694, + "step": 7160 + }, + { + "epoch": 1.274928774928775, + "grad_norm": 0.6649713516235352, + "learning_rate": 0.00015407906778944563, + "loss": 1.1358, + "step": 7161 + }, + { + "epoch": 1.2751068376068377, + "grad_norm": 0.6889697909355164, + "learning_rate": 0.00015406729318514074, + "loss": 1.0096, + "step": 7162 + }, + { + "epoch": 1.2752849002849003, + "grad_norm": 0.6948645114898682, + "learning_rate": 0.0001540555175214996, + "loss": 1.0649, + "step": 7163 + }, + { + "epoch": 1.275462962962963, + "grad_norm": 0.6844844818115234, + "learning_rate": 0.0001540437407987528, + "loss": 0.884, + "step": 7164 + }, + { + "epoch": 1.2756410256410255, + "grad_norm": 0.7124526500701904, + "learning_rate": 0.00015403196301713124, + "loss": 1.1307, + 
"step": 7165 + }, + { + "epoch": 1.2758190883190883, + "grad_norm": 0.7328375577926636, + "learning_rate": 0.00015402018417686556, + "loss": 1.0348, + "step": 7166 + }, + { + "epoch": 1.275997150997151, + "grad_norm": 0.5872696042060852, + "learning_rate": 0.00015400840427818663, + "loss": 0.9827, + "step": 7167 + }, + { + "epoch": 1.2761752136752138, + "grad_norm": 0.6370702385902405, + "learning_rate": 0.00015399662332132519, + "loss": 0.9171, + "step": 7168 + }, + { + "epoch": 1.2763532763532763, + "grad_norm": 0.6481866240501404, + "learning_rate": 0.00015398484130651205, + "loss": 0.8704, + "step": 7169 + }, + { + "epoch": 1.276531339031339, + "grad_norm": 0.598739743232727, + "learning_rate": 0.00015397305823397812, + "loss": 0.8097, + "step": 7170 + }, + { + "epoch": 1.2767094017094016, + "grad_norm": 0.5941228270530701, + "learning_rate": 0.00015396127410395423, + "loss": 0.8853, + "step": 7171 + }, + { + "epoch": 1.2768874643874644, + "grad_norm": 0.6485885381698608, + "learning_rate": 0.00015394948891667127, + "loss": 0.702, + "step": 7172 + }, + { + "epoch": 1.2770655270655271, + "grad_norm": 0.5314942598342896, + "learning_rate": 0.00015393770267236017, + "loss": 0.7899, + "step": 7173 + }, + { + "epoch": 1.2772435897435899, + "grad_norm": 0.6113781929016113, + "learning_rate": 0.00015392591537125182, + "loss": 0.9871, + "step": 7174 + }, + { + "epoch": 1.2774216524216524, + "grad_norm": 0.5625866651535034, + "learning_rate": 0.00015391412701357715, + "loss": 0.8246, + "step": 7175 + }, + { + "epoch": 1.2775997150997151, + "grad_norm": 0.6006998419761658, + "learning_rate": 0.00015390233759956718, + "loss": 0.899, + "step": 7176 + }, + { + "epoch": 1.2777777777777777, + "grad_norm": 0.6916918158531189, + "learning_rate": 0.0001538905471294529, + "loss": 1.0443, + "step": 7177 + }, + { + "epoch": 1.2779558404558404, + "grad_norm": 0.6263536810874939, + "learning_rate": 0.00015387875560346525, + "loss": 0.9159, + "step": 7178 + }, + { + "epoch": 
1.2781339031339032, + "grad_norm": 0.6563085913658142, + "learning_rate": 0.00015386696302183535, + "loss": 0.994, + "step": 7179 + }, + { + "epoch": 1.278311965811966, + "grad_norm": 0.6312007904052734, + "learning_rate": 0.00015385516938479416, + "loss": 0.9148, + "step": 7180 + }, + { + "epoch": 1.2784900284900285, + "grad_norm": 0.6408209204673767, + "learning_rate": 0.00015384337469257284, + "loss": 1.0508, + "step": 7181 + }, + { + "epoch": 1.2786680911680912, + "grad_norm": 0.656234085559845, + "learning_rate": 0.00015383157894540244, + "loss": 0.9952, + "step": 7182 + }, + { + "epoch": 1.2788461538461537, + "grad_norm": 0.7401639819145203, + "learning_rate": 0.00015381978214351407, + "loss": 1.1615, + "step": 7183 + }, + { + "epoch": 1.2790242165242165, + "grad_norm": 0.5746055841445923, + "learning_rate": 0.00015380798428713885, + "loss": 0.9142, + "step": 7184 + }, + { + "epoch": 1.2792022792022792, + "grad_norm": 0.8061720728874207, + "learning_rate": 0.00015379618537650797, + "loss": 1.13, + "step": 7185 + }, + { + "epoch": 1.279380341880342, + "grad_norm": 0.6336073875427246, + "learning_rate": 0.0001537843854118526, + "loss": 1.0581, + "step": 7186 + }, + { + "epoch": 1.2795584045584045, + "grad_norm": 0.6549856066703796, + "learning_rate": 0.0001537725843934039, + "loss": 1.09, + "step": 7187 + }, + { + "epoch": 1.2797364672364673, + "grad_norm": 0.5759010910987854, + "learning_rate": 0.00015376078232139315, + "loss": 0.8441, + "step": 7188 + }, + { + "epoch": 1.2799145299145298, + "grad_norm": 0.5733884572982788, + "learning_rate": 0.00015374897919605152, + "loss": 0.9086, + "step": 7189 + }, + { + "epoch": 1.2800925925925926, + "grad_norm": 0.6505870819091797, + "learning_rate": 0.0001537371750176103, + "loss": 1.1683, + "step": 7190 + }, + { + "epoch": 1.2802706552706553, + "grad_norm": 0.6744688153266907, + "learning_rate": 0.00015372536978630077, + "loss": 0.9483, + "step": 7191 + }, + { + "epoch": 1.280448717948718, + "grad_norm": 
0.598098874092102, + "learning_rate": 0.0001537135635023542, + "loss": 0.7747, + "step": 7192 + }, + { + "epoch": 1.2806267806267806, + "grad_norm": 0.6711761951446533, + "learning_rate": 0.00015370175616600195, + "loss": 1.1897, + "step": 7193 + }, + { + "epoch": 1.2808048433048433, + "grad_norm": 0.6207453608512878, + "learning_rate": 0.00015368994777747536, + "loss": 1.0063, + "step": 7194 + }, + { + "epoch": 1.2809829059829059, + "grad_norm": 0.6701686382293701, + "learning_rate": 0.00015367813833700575, + "loss": 1.0864, + "step": 7195 + }, + { + "epoch": 1.2811609686609686, + "grad_norm": 0.5916469693183899, + "learning_rate": 0.00015366632784482456, + "loss": 0.8786, + "step": 7196 + }, + { + "epoch": 1.2813390313390314, + "grad_norm": 0.6567547917366028, + "learning_rate": 0.00015365451630116312, + "loss": 0.9977, + "step": 7197 + }, + { + "epoch": 1.2815170940170941, + "grad_norm": 0.7287433743476868, + "learning_rate": 0.00015364270370625294, + "loss": 1.1248, + "step": 7198 + }, + { + "epoch": 1.2816951566951567, + "grad_norm": 0.7736039161682129, + "learning_rate": 0.0001536308900603254, + "loss": 0.9832, + "step": 7199 + }, + { + "epoch": 1.2818732193732194, + "grad_norm": 0.6799852252006531, + "learning_rate": 0.00015361907536361194, + "loss": 1.0275, + "step": 7200 + }, + { + "epoch": 1.282051282051282, + "grad_norm": 0.5975812673568726, + "learning_rate": 0.00015360725961634407, + "loss": 1.0516, + "step": 7201 + }, + { + "epoch": 1.2822293447293447, + "grad_norm": 0.616307258605957, + "learning_rate": 0.00015359544281875337, + "loss": 0.8095, + "step": 7202 + }, + { + "epoch": 1.2824074074074074, + "grad_norm": 0.6357580423355103, + "learning_rate": 0.00015358362497107126, + "loss": 0.9186, + "step": 7203 + }, + { + "epoch": 1.2825854700854702, + "grad_norm": 0.679333508014679, + "learning_rate": 0.00015357180607352935, + "loss": 0.9433, + "step": 7204 + }, + { + "epoch": 1.2827635327635327, + "grad_norm": 0.6345439553260803, + "learning_rate": 
0.00015355998612635914, + "loss": 0.9186, + "step": 7205 + }, + { + "epoch": 1.2829415954415955, + "grad_norm": 0.6256508827209473, + "learning_rate": 0.00015354816512979231, + "loss": 0.9984, + "step": 7206 + }, + { + "epoch": 1.283119658119658, + "grad_norm": 0.7973852753639221, + "learning_rate": 0.00015353634308406044, + "loss": 1.1145, + "step": 7207 + }, + { + "epoch": 1.2832977207977208, + "grad_norm": 0.711125910282135, + "learning_rate": 0.0001535245199893951, + "loss": 1.1947, + "step": 7208 + }, + { + "epoch": 1.2834757834757835, + "grad_norm": 0.6096055507659912, + "learning_rate": 0.00015351269584602798, + "loss": 1.0078, + "step": 7209 + }, + { + "epoch": 1.2836538461538463, + "grad_norm": 0.7089232802391052, + "learning_rate": 0.00015350087065419077, + "loss": 1.112, + "step": 7210 + }, + { + "epoch": 1.2838319088319088, + "grad_norm": 0.716199517250061, + "learning_rate": 0.00015348904441411508, + "loss": 1.1015, + "step": 7211 + }, + { + "epoch": 1.2840099715099715, + "grad_norm": 0.6374632716178894, + "learning_rate": 0.00015347721712603276, + "loss": 1.0519, + "step": 7212 + }, + { + "epoch": 1.284188034188034, + "grad_norm": 0.6500036716461182, + "learning_rate": 0.0001534653887901754, + "loss": 1.1719, + "step": 7213 + }, + { + "epoch": 1.2843660968660968, + "grad_norm": 0.7249937653541565, + "learning_rate": 0.00015345355940677485, + "loss": 1.0188, + "step": 7214 + }, + { + "epoch": 1.2845441595441596, + "grad_norm": 0.6645919680595398, + "learning_rate": 0.00015344172897606285, + "loss": 0.9788, + "step": 7215 + }, + { + "epoch": 1.2847222222222223, + "grad_norm": 0.7032710313796997, + "learning_rate": 0.00015342989749827113, + "loss": 1.1093, + "step": 7216 + }, + { + "epoch": 1.2849002849002849, + "grad_norm": 0.622767984867096, + "learning_rate": 0.0001534180649736316, + "loss": 0.8978, + "step": 7217 + }, + { + "epoch": 1.2850783475783476, + "grad_norm": 0.7499693036079407, + "learning_rate": 0.00015340623140237605, + "loss": 1.2232, + 
"step": 7218 + }, + { + "epoch": 1.2852564102564101, + "grad_norm": 0.6308625936508179, + "learning_rate": 0.00015339439678473636, + "loss": 0.8621, + "step": 7219 + }, + { + "epoch": 1.2854344729344729, + "grad_norm": 0.6513667106628418, + "learning_rate": 0.00015338256112094434, + "loss": 1.0541, + "step": 7220 + }, + { + "epoch": 1.2856125356125356, + "grad_norm": 0.6080937385559082, + "learning_rate": 0.00015337072441123193, + "loss": 0.8474, + "step": 7221 + }, + { + "epoch": 1.2857905982905984, + "grad_norm": 0.6742652058601379, + "learning_rate": 0.00015335888665583104, + "loss": 1.0172, + "step": 7222 + }, + { + "epoch": 1.285968660968661, + "grad_norm": 0.620810866355896, + "learning_rate": 0.00015334704785497364, + "loss": 1.049, + "step": 7223 + }, + { + "epoch": 1.2861467236467237, + "grad_norm": 0.5733018517494202, + "learning_rate": 0.00015333520800889165, + "loss": 0.7371, + "step": 7224 + }, + { + "epoch": 1.2863247863247862, + "grad_norm": 0.6447640061378479, + "learning_rate": 0.00015332336711781702, + "loss": 0.9925, + "step": 7225 + }, + { + "epoch": 1.286502849002849, + "grad_norm": 0.6764999628067017, + "learning_rate": 0.00015331152518198183, + "loss": 0.9052, + "step": 7226 + }, + { + "epoch": 1.2866809116809117, + "grad_norm": 0.6492836475372314, + "learning_rate": 0.00015329968220161803, + "loss": 0.9493, + "step": 7227 + }, + { + "epoch": 1.2868589743589745, + "grad_norm": 0.666157603263855, + "learning_rate": 0.00015328783817695766, + "loss": 1.0626, + "step": 7228 + }, + { + "epoch": 1.287037037037037, + "grad_norm": 0.7098026871681213, + "learning_rate": 0.00015327599310823283, + "loss": 1.0461, + "step": 7229 + }, + { + "epoch": 1.2872150997150997, + "grad_norm": 0.637778103351593, + "learning_rate": 0.00015326414699567555, + "loss": 0.9383, + "step": 7230 + }, + { + "epoch": 1.2873931623931623, + "grad_norm": 0.6816399693489075, + "learning_rate": 0.00015325229983951798, + "loss": 1.0647, + "step": 7231 + }, + { + "epoch": 
1.287571225071225, + "grad_norm": 0.668689489364624, + "learning_rate": 0.0001532404516399922, + "loss": 1.0479, + "step": 7232 + }, + { + "epoch": 1.2877492877492878, + "grad_norm": 0.6459103226661682, + "learning_rate": 0.0001532286023973304, + "loss": 1.1751, + "step": 7233 + }, + { + "epoch": 1.2879273504273505, + "grad_norm": 0.679999589920044, + "learning_rate": 0.00015321675211176468, + "loss": 0.7541, + "step": 7234 + }, + { + "epoch": 1.288105413105413, + "grad_norm": 0.5415067672729492, + "learning_rate": 0.00015320490078352724, + "loss": 0.822, + "step": 7235 + }, + { + "epoch": 1.2882834757834758, + "grad_norm": 0.6817963719367981, + "learning_rate": 0.00015319304841285032, + "loss": 0.9424, + "step": 7236 + }, + { + "epoch": 1.2884615384615383, + "grad_norm": 0.6187505125999451, + "learning_rate": 0.0001531811949999661, + "loss": 0.8596, + "step": 7237 + }, + { + "epoch": 1.288639601139601, + "grad_norm": 0.6737838387489319, + "learning_rate": 0.00015316934054510685, + "loss": 1.0046, + "step": 7238 + }, + { + "epoch": 1.2888176638176638, + "grad_norm": 0.6445996761322021, + "learning_rate": 0.00015315748504850482, + "loss": 1.01, + "step": 7239 + }, + { + "epoch": 1.2889957264957266, + "grad_norm": 0.7279136180877686, + "learning_rate": 0.0001531456285103923, + "loss": 0.9066, + "step": 7240 + }, + { + "epoch": 1.289173789173789, + "grad_norm": 0.6619178652763367, + "learning_rate": 0.00015313377093100153, + "loss": 0.8977, + "step": 7241 + }, + { + "epoch": 1.2893518518518519, + "grad_norm": 0.7644323110580444, + "learning_rate": 0.000153121912310565, + "loss": 1.3085, + "step": 7242 + }, + { + "epoch": 1.2895299145299146, + "grad_norm": 0.645882248878479, + "learning_rate": 0.00015311005264931487, + "loss": 1.0337, + "step": 7243 + }, + { + "epoch": 1.2897079772079771, + "grad_norm": 0.6868017911911011, + "learning_rate": 0.0001530981919474836, + "loss": 0.9616, + "step": 7244 + }, + { + "epoch": 1.28988603988604, + "grad_norm": 0.7176693677902222, 
+ "learning_rate": 0.00015308633020530362, + "loss": 1.1975, + "step": 7245 + }, + { + "epoch": 1.2900641025641026, + "grad_norm": 0.7358015775680542, + "learning_rate": 0.00015307446742300718, + "loss": 0.9308, + "step": 7246 + }, + { + "epoch": 1.2902421652421652, + "grad_norm": 0.7330248355865479, + "learning_rate": 0.00015306260360082688, + "loss": 0.9518, + "step": 7247 + }, + { + "epoch": 1.290420227920228, + "grad_norm": 0.6571981310844421, + "learning_rate": 0.00015305073873899503, + "loss": 0.9531, + "step": 7248 + }, + { + "epoch": 1.2905982905982907, + "grad_norm": 0.5968486666679382, + "learning_rate": 0.00015303887283774417, + "loss": 0.9245, + "step": 7249 + }, + { + "epoch": 1.2907763532763532, + "grad_norm": 0.6398176550865173, + "learning_rate": 0.0001530270058973068, + "loss": 1.0452, + "step": 7250 + }, + { + "epoch": 1.290954415954416, + "grad_norm": 0.5462267994880676, + "learning_rate": 0.00015301513791791542, + "loss": 0.8451, + "step": 7251 + }, + { + "epoch": 1.2911324786324787, + "grad_norm": 0.7536166906356812, + "learning_rate": 0.00015300326889980252, + "loss": 1.0086, + "step": 7252 + }, + { + "epoch": 1.2913105413105412, + "grad_norm": 0.6208569407463074, + "learning_rate": 0.00015299139884320065, + "loss": 0.7437, + "step": 7253 + }, + { + "epoch": 1.291488603988604, + "grad_norm": 0.7025452852249146, + "learning_rate": 0.00015297952774834242, + "loss": 0.8874, + "step": 7254 + }, + { + "epoch": 1.2916666666666667, + "grad_norm": 0.6758308410644531, + "learning_rate": 0.00015296765561546041, + "loss": 1.0378, + "step": 7255 + }, + { + "epoch": 1.2918447293447293, + "grad_norm": 0.7170431613922119, + "learning_rate": 0.00015295578244478724, + "loss": 1.0111, + "step": 7256 + }, + { + "epoch": 1.292022792022792, + "grad_norm": 0.6263511180877686, + "learning_rate": 0.00015294390823655544, + "loss": 0.7836, + "step": 7257 + }, + { + "epoch": 1.2922008547008548, + "grad_norm": 0.5887803435325623, + "learning_rate": 0.0001529320329909978, 
+ "loss": 1.068, + "step": 7258 + }, + { + "epoch": 1.2923789173789173, + "grad_norm": 0.5955889821052551, + "learning_rate": 0.00015292015670834692, + "loss": 0.8903, + "step": 7259 + }, + { + "epoch": 1.29255698005698, + "grad_norm": 0.630449652671814, + "learning_rate": 0.00015290827938883552, + "loss": 1.1096, + "step": 7260 + }, + { + "epoch": 1.2927350427350428, + "grad_norm": 0.7405480146408081, + "learning_rate": 0.00015289640103269625, + "loss": 1.0648, + "step": 7261 + }, + { + "epoch": 1.2929131054131053, + "grad_norm": 0.6082221865653992, + "learning_rate": 0.00015288452164016191, + "loss": 0.9266, + "step": 7262 + }, + { + "epoch": 1.293091168091168, + "grad_norm": 0.6211720108985901, + "learning_rate": 0.00015287264121146524, + "loss": 0.849, + "step": 7263 + }, + { + "epoch": 1.2932692307692308, + "grad_norm": 0.6481043100357056, + "learning_rate": 0.00015286075974683898, + "loss": 0.7761, + "step": 7264 + }, + { + "epoch": 1.2934472934472934, + "grad_norm": 0.5957167744636536, + "learning_rate": 0.00015284887724651593, + "loss": 0.8942, + "step": 7265 + }, + { + "epoch": 1.2936253561253561, + "grad_norm": 0.7272268533706665, + "learning_rate": 0.00015283699371072894, + "loss": 1.0913, + "step": 7266 + }, + { + "epoch": 1.2938034188034189, + "grad_norm": 0.5902758836746216, + "learning_rate": 0.0001528251091397108, + "loss": 1.1045, + "step": 7267 + }, + { + "epoch": 1.2939814814814814, + "grad_norm": 0.6382482051849365, + "learning_rate": 0.00015281322353369436, + "loss": 0.9265, + "step": 7268 + }, + { + "epoch": 1.2941595441595442, + "grad_norm": 0.6556048393249512, + "learning_rate": 0.00015280133689291256, + "loss": 1.0536, + "step": 7269 + }, + { + "epoch": 1.294337606837607, + "grad_norm": 0.680895209312439, + "learning_rate": 0.00015278944921759822, + "loss": 0.9996, + "step": 7270 + }, + { + "epoch": 1.2945156695156697, + "grad_norm": 0.670317530632019, + "learning_rate": 0.00015277756050798428, + "loss": 1.1402, + "step": 7271 + }, + { + 
"epoch": 1.2946937321937322, + "grad_norm": 0.6312688589096069, + "learning_rate": 0.0001527656707643037, + "loss": 1.0669, + "step": 7272 + }, + { + "epoch": 1.294871794871795, + "grad_norm": 0.6267009973526001, + "learning_rate": 0.0001527537799867894, + "loss": 0.8985, + "step": 7273 + }, + { + "epoch": 1.2950498575498575, + "grad_norm": 0.7069001197814941, + "learning_rate": 0.00015274188817567436, + "loss": 0.9478, + "step": 7274 + }, + { + "epoch": 1.2952279202279202, + "grad_norm": 0.7229067087173462, + "learning_rate": 0.00015272999533119162, + "loss": 0.9005, + "step": 7275 + }, + { + "epoch": 1.295405982905983, + "grad_norm": 0.6254632472991943, + "learning_rate": 0.00015271810145357412, + "loss": 0.9746, + "step": 7276 + }, + { + "epoch": 1.2955840455840457, + "grad_norm": 0.6772669553756714, + "learning_rate": 0.00015270620654305494, + "loss": 1.1714, + "step": 7277 + }, + { + "epoch": 1.2957621082621082, + "grad_norm": 0.605576753616333, + "learning_rate": 0.00015269431059986713, + "loss": 0.7735, + "step": 7278 + }, + { + "epoch": 1.295940170940171, + "grad_norm": 0.7144771814346313, + "learning_rate": 0.00015268241362424378, + "loss": 0.9757, + "step": 7279 + }, + { + "epoch": 1.2961182336182335, + "grad_norm": 0.5275486707687378, + "learning_rate": 0.00015267051561641798, + "loss": 0.5669, + "step": 7280 + }, + { + "epoch": 1.2962962962962963, + "grad_norm": 0.6619452238082886, + "learning_rate": 0.00015265861657662284, + "loss": 0.9511, + "step": 7281 + }, + { + "epoch": 1.296474358974359, + "grad_norm": 0.6788223385810852, + "learning_rate": 0.00015264671650509147, + "loss": 1.2649, + "step": 7282 + }, + { + "epoch": 1.2966524216524218, + "grad_norm": 0.6198732852935791, + "learning_rate": 0.00015263481540205706, + "loss": 1.0659, + "step": 7283 + }, + { + "epoch": 1.2968304843304843, + "grad_norm": 0.6038815975189209, + "learning_rate": 0.0001526229132677528, + "loss": 1.0655, + "step": 7284 + }, + { + "epoch": 1.297008547008547, + "grad_norm": 
0.7616196870803833, + "learning_rate": 0.00015261101010241186, + "loss": 1.131, + "step": 7285 + }, + { + "epoch": 1.2971866096866096, + "grad_norm": 0.7002527713775635, + "learning_rate": 0.00015259910590626746, + "loss": 1.1375, + "step": 7286 + }, + { + "epoch": 1.2973646723646723, + "grad_norm": 0.6067437529563904, + "learning_rate": 0.00015258720067955284, + "loss": 0.9306, + "step": 7287 + }, + { + "epoch": 1.297542735042735, + "grad_norm": 0.653232216835022, + "learning_rate": 0.00015257529442250128, + "loss": 1.107, + "step": 7288 + }, + { + "epoch": 1.2977207977207978, + "grad_norm": 0.6969175934791565, + "learning_rate": 0.00015256338713534603, + "loss": 0.8365, + "step": 7289 + }, + { + "epoch": 1.2978988603988604, + "grad_norm": 0.6176731586456299, + "learning_rate": 0.00015255147881832043, + "loss": 0.9707, + "step": 7290 + }, + { + "epoch": 1.2980769230769231, + "grad_norm": 0.6543741822242737, + "learning_rate": 0.00015253956947165772, + "loss": 0.7714, + "step": 7291 + }, + { + "epoch": 1.2982549857549857, + "grad_norm": 0.5224920511245728, + "learning_rate": 0.00015252765909559135, + "loss": 0.7469, + "step": 7292 + }, + { + "epoch": 1.2984330484330484, + "grad_norm": 0.638708770275116, + "learning_rate": 0.00015251574769035455, + "loss": 1.0965, + "step": 7293 + }, + { + "epoch": 1.2986111111111112, + "grad_norm": 0.6742943525314331, + "learning_rate": 0.0001525038352561808, + "loss": 1.1286, + "step": 7294 + }, + { + "epoch": 1.298789173789174, + "grad_norm": 0.6027839183807373, + "learning_rate": 0.00015249192179330346, + "loss": 0.8824, + "step": 7295 + }, + { + "epoch": 1.2989672364672364, + "grad_norm": 0.7462167143821716, + "learning_rate": 0.00015248000730195597, + "loss": 0.94, + "step": 7296 + }, + { + "epoch": 1.2991452991452992, + "grad_norm": 0.6972534656524658, + "learning_rate": 0.00015246809178237172, + "loss": 1.0664, + "step": 7297 + }, + { + "epoch": 1.2993233618233617, + "grad_norm": 0.569949209690094, + "learning_rate": 
0.0001524561752347842, + "loss": 0.691, + "step": 7298 + }, + { + "epoch": 1.2995014245014245, + "grad_norm": 0.6066586375236511, + "learning_rate": 0.00015244425765942695, + "loss": 1.083, + "step": 7299 + }, + { + "epoch": 1.2996794871794872, + "grad_norm": 0.6927483677864075, + "learning_rate": 0.00015243233905653337, + "loss": 1.0068, + "step": 7300 + }, + { + "epoch": 1.29985754985755, + "grad_norm": 0.752824604511261, + "learning_rate": 0.00015242041942633704, + "loss": 0.9946, + "step": 7301 + }, + { + "epoch": 1.3000356125356125, + "grad_norm": 0.6532080173492432, + "learning_rate": 0.0001524084987690715, + "loss": 1.2326, + "step": 7302 + }, + { + "epoch": 1.3002136752136753, + "grad_norm": 0.7954180836677551, + "learning_rate": 0.0001523965770849703, + "loss": 1.1105, + "step": 7303 + }, + { + "epoch": 1.3003917378917378, + "grad_norm": 0.5971781015396118, + "learning_rate": 0.000152384654374267, + "loss": 1.0984, + "step": 7304 + }, + { + "epoch": 1.3005698005698005, + "grad_norm": 0.7778682112693787, + "learning_rate": 0.0001523727306371952, + "loss": 1.0795, + "step": 7305 + }, + { + "epoch": 1.3007478632478633, + "grad_norm": 0.6712004542350769, + "learning_rate": 0.00015236080587398856, + "loss": 1.0814, + "step": 7306 + }, + { + "epoch": 1.300925925925926, + "grad_norm": 0.581048846244812, + "learning_rate": 0.00015234888008488066, + "loss": 0.9868, + "step": 7307 + }, + { + "epoch": 1.3011039886039886, + "grad_norm": 0.697695791721344, + "learning_rate": 0.00015233695327010523, + "loss": 1.1045, + "step": 7308 + }, + { + "epoch": 1.3012820512820513, + "grad_norm": 0.6858421564102173, + "learning_rate": 0.00015232502542989593, + "loss": 1.0769, + "step": 7309 + }, + { + "epoch": 1.3014601139601139, + "grad_norm": 0.6312826871871948, + "learning_rate": 0.00015231309656448642, + "loss": 0.9523, + "step": 7310 + }, + { + "epoch": 1.3016381766381766, + "grad_norm": 0.9243300557136536, + "learning_rate": 0.0001523011666741105, + "loss": 0.947, + "step": 
7311 + }, + { + "epoch": 1.3018162393162394, + "grad_norm": 0.6808217763900757, + "learning_rate": 0.00015228923575900184, + "loss": 0.8631, + "step": 7312 + }, + { + "epoch": 1.301994301994302, + "grad_norm": 0.6713891625404358, + "learning_rate": 0.00015227730381939424, + "loss": 0.9157, + "step": 7313 + }, + { + "epoch": 1.3021723646723646, + "grad_norm": 0.6802582740783691, + "learning_rate": 0.00015226537085552146, + "loss": 1.041, + "step": 7314 + }, + { + "epoch": 1.3023504273504274, + "grad_norm": 0.6543951034545898, + "learning_rate": 0.0001522534368676173, + "loss": 0.8709, + "step": 7315 + }, + { + "epoch": 1.30252849002849, + "grad_norm": 0.6290678381919861, + "learning_rate": 0.0001522415018559156, + "loss": 1.0568, + "step": 7316 + }, + { + "epoch": 1.3027065527065527, + "grad_norm": 0.6590015292167664, + "learning_rate": 0.0001522295658206502, + "loss": 0.9919, + "step": 7317 + }, + { + "epoch": 1.3028846153846154, + "grad_norm": 0.6374103426933289, + "learning_rate": 0.00015221762876205494, + "loss": 0.878, + "step": 7318 + }, + { + "epoch": 1.3030626780626782, + "grad_norm": 0.7247048616409302, + "learning_rate": 0.00015220569068036372, + "loss": 1.061, + "step": 7319 + }, + { + "epoch": 1.3032407407407407, + "grad_norm": 0.6450991630554199, + "learning_rate": 0.00015219375157581047, + "loss": 0.9389, + "step": 7320 + }, + { + "epoch": 1.3034188034188035, + "grad_norm": 0.8039840459823608, + "learning_rate": 0.00015218181144862903, + "loss": 1.0692, + "step": 7321 + }, + { + "epoch": 1.303596866096866, + "grad_norm": 0.6539456248283386, + "learning_rate": 0.00015216987029905346, + "loss": 1.0478, + "step": 7322 + }, + { + "epoch": 1.3037749287749287, + "grad_norm": 0.60880047082901, + "learning_rate": 0.00015215792812731758, + "loss": 0.8412, + "step": 7323 + }, + { + "epoch": 1.3039529914529915, + "grad_norm": 0.6757258176803589, + "learning_rate": 0.0001521459849336555, + "loss": 0.896, + "step": 7324 + }, + { + "epoch": 1.3041310541310542, + 
"grad_norm": 0.6735622882843018, + "learning_rate": 0.00015213404071830116, + "loss": 1.1078, + "step": 7325 + }, + { + "epoch": 1.3043091168091168, + "grad_norm": 0.7321233749389648, + "learning_rate": 0.00015212209548148858, + "loss": 1.1021, + "step": 7326 + }, + { + "epoch": 1.3044871794871795, + "grad_norm": 0.6678910851478577, + "learning_rate": 0.00015211014922345182, + "loss": 1.0043, + "step": 7327 + }, + { + "epoch": 1.304665242165242, + "grad_norm": 0.6876940727233887, + "learning_rate": 0.0001520982019444249, + "loss": 1.0376, + "step": 7328 + }, + { + "epoch": 1.3048433048433048, + "grad_norm": 0.6171853542327881, + "learning_rate": 0.00015208625364464195, + "loss": 0.839, + "step": 7329 + }, + { + "epoch": 1.3050213675213675, + "grad_norm": 0.6449569463729858, + "learning_rate": 0.0001520743043243371, + "loss": 1.0908, + "step": 7330 + }, + { + "epoch": 1.3051994301994303, + "grad_norm": 0.6894628405570984, + "learning_rate": 0.00015206235398374443, + "loss": 1.0263, + "step": 7331 + }, + { + "epoch": 1.3053774928774928, + "grad_norm": 0.5853552222251892, + "learning_rate": 0.00015205040262309804, + "loss": 0.8342, + "step": 7332 + }, + { + "epoch": 1.3055555555555556, + "grad_norm": 0.5934799313545227, + "learning_rate": 0.00015203845024263214, + "loss": 0.9464, + "step": 7333 + }, + { + "epoch": 1.305733618233618, + "grad_norm": 0.668927788734436, + "learning_rate": 0.00015202649684258095, + "loss": 0.9018, + "step": 7334 + }, + { + "epoch": 1.3059116809116809, + "grad_norm": 0.676810085773468, + "learning_rate": 0.0001520145424231786, + "loss": 0.9284, + "step": 7335 + }, + { + "epoch": 1.3060897435897436, + "grad_norm": 0.6223878264427185, + "learning_rate": 0.00015200258698465935, + "loss": 1.0779, + "step": 7336 + }, + { + "epoch": 1.3062678062678064, + "grad_norm": 0.6092363595962524, + "learning_rate": 0.00015199063052725745, + "loss": 0.8602, + "step": 7337 + }, + { + "epoch": 1.306445868945869, + "grad_norm": 0.7668731212615967, + 
"learning_rate": 0.00015197867305120712, + "loss": 1.0756, + "step": 7338 + }, + { + "epoch": 1.3066239316239316, + "grad_norm": 0.6485331654548645, + "learning_rate": 0.00015196671455674268, + "loss": 1.0193, + "step": 7339 + }, + { + "epoch": 1.3068019943019942, + "grad_norm": 0.5661036372184753, + "learning_rate": 0.0001519547550440984, + "loss": 0.8321, + "step": 7340 + }, + { + "epoch": 1.306980056980057, + "grad_norm": 0.6270507574081421, + "learning_rate": 0.00015194279451350866, + "loss": 0.6403, + "step": 7341 + }, + { + "epoch": 1.3071581196581197, + "grad_norm": 0.7283764481544495, + "learning_rate": 0.00015193083296520773, + "loss": 1.0401, + "step": 7342 + }, + { + "epoch": 1.3073361823361824, + "grad_norm": 0.658835232257843, + "learning_rate": 0.00015191887039943, + "loss": 1.0172, + "step": 7343 + }, + { + "epoch": 1.307514245014245, + "grad_norm": 0.6288984417915344, + "learning_rate": 0.00015190690681640988, + "loss": 0.8649, + "step": 7344 + }, + { + "epoch": 1.3076923076923077, + "grad_norm": 0.666442334651947, + "learning_rate": 0.00015189494221638176, + "loss": 1.0757, + "step": 7345 + }, + { + "epoch": 1.3078703703703702, + "grad_norm": 0.6116433143615723, + "learning_rate": 0.00015188297659958003, + "loss": 0.9244, + "step": 7346 + }, + { + "epoch": 1.308048433048433, + "grad_norm": 0.6378964185714722, + "learning_rate": 0.0001518710099662392, + "loss": 0.9629, + "step": 7347 + }, + { + "epoch": 1.3082264957264957, + "grad_norm": 0.6258945465087891, + "learning_rate": 0.00015185904231659357, + "loss": 0.8524, + "step": 7348 + }, + { + "epoch": 1.3084045584045585, + "grad_norm": 0.6498504877090454, + "learning_rate": 0.0001518470736508778, + "loss": 0.9685, + "step": 7349 + }, + { + "epoch": 1.308582621082621, + "grad_norm": 0.6928247809410095, + "learning_rate": 0.00015183510396932635, + "loss": 0.9054, + "step": 7350 + }, + { + "epoch": 1.3087606837606838, + "grad_norm": 0.6350936889648438, + "learning_rate": 0.0001518231332721737, + 
"loss": 1.0039, + "step": 7351 + }, + { + "epoch": 1.3089387464387463, + "grad_norm": 0.6652286648750305, + "learning_rate": 0.00015181116155965437, + "loss": 0.8946, + "step": 7352 + }, + { + "epoch": 1.309116809116809, + "grad_norm": 0.6554864048957825, + "learning_rate": 0.000151799188832003, + "loss": 0.9518, + "step": 7353 + }, + { + "epoch": 1.3092948717948718, + "grad_norm": 0.7523114085197449, + "learning_rate": 0.0001517872150894541, + "loss": 0.9462, + "step": 7354 + }, + { + "epoch": 1.3094729344729346, + "grad_norm": 0.7113336324691772, + "learning_rate": 0.0001517752403322423, + "loss": 1.2347, + "step": 7355 + }, + { + "epoch": 1.309650997150997, + "grad_norm": 0.6461622714996338, + "learning_rate": 0.00015176326456060223, + "loss": 0.8891, + "step": 7356 + }, + { + "epoch": 1.3098290598290598, + "grad_norm": 0.7429143190383911, + "learning_rate": 0.00015175128777476852, + "loss": 1.1944, + "step": 7357 + }, + { + "epoch": 1.3100071225071226, + "grad_norm": 0.6816306114196777, + "learning_rate": 0.00015173930997497585, + "loss": 1.1445, + "step": 7358 + }, + { + "epoch": 1.3101851851851851, + "grad_norm": 0.6644450426101685, + "learning_rate": 0.00015172733116145884, + "loss": 0.9808, + "step": 7359 + }, + { + "epoch": 1.3103632478632479, + "grad_norm": 0.6921063661575317, + "learning_rate": 0.00015171535133445225, + "loss": 1.0162, + "step": 7360 + }, + { + "epoch": 1.3105413105413106, + "grad_norm": 0.6386187672615051, + "learning_rate": 0.00015170337049419082, + "loss": 0.9951, + "step": 7361 + }, + { + "epoch": 1.3107193732193732, + "grad_norm": 0.6505418419837952, + "learning_rate": 0.0001516913886409092, + "loss": 0.8872, + "step": 7362 + }, + { + "epoch": 1.310897435897436, + "grad_norm": 0.6415576934814453, + "learning_rate": 0.00015167940577484222, + "loss": 1.056, + "step": 7363 + }, + { + "epoch": 1.3110754985754987, + "grad_norm": 0.6691195964813232, + "learning_rate": 0.00015166742189622458, + "loss": 1.0561, + "step": 7364 + }, + { + 
"epoch": 1.3112535612535612, + "grad_norm": 0.6376257538795471, + "learning_rate": 0.00015165543700529122, + "loss": 0.8499, + "step": 7365 + }, + { + "epoch": 1.311431623931624, + "grad_norm": 0.6270790696144104, + "learning_rate": 0.00015164345110227684, + "loss": 1.0244, + "step": 7366 + }, + { + "epoch": 1.3116096866096867, + "grad_norm": 0.7120122313499451, + "learning_rate": 0.0001516314641874163, + "loss": 1.0476, + "step": 7367 + }, + { + "epoch": 1.3117877492877492, + "grad_norm": 0.6152660250663757, + "learning_rate": 0.0001516194762609445, + "loss": 0.897, + "step": 7368 + }, + { + "epoch": 1.311965811965812, + "grad_norm": 0.7578088045120239, + "learning_rate": 0.00015160748732309626, + "loss": 1.1609, + "step": 7369 + }, + { + "epoch": 1.3121438746438747, + "grad_norm": 0.6594924330711365, + "learning_rate": 0.00015159549737410656, + "loss": 1.1706, + "step": 7370 + }, + { + "epoch": 1.3123219373219372, + "grad_norm": 0.6559173464775085, + "learning_rate": 0.00015158350641421024, + "loss": 0.9452, + "step": 7371 + }, + { + "epoch": 1.3125, + "grad_norm": 0.6667516231536865, + "learning_rate": 0.00015157151444364226, + "loss": 0.8153, + "step": 7372 + }, + { + "epoch": 1.3126780626780628, + "grad_norm": 0.7054803371429443, + "learning_rate": 0.00015155952146263761, + "loss": 0.9887, + "step": 7373 + }, + { + "epoch": 1.3128561253561253, + "grad_norm": 0.7035902142524719, + "learning_rate": 0.00015154752747143123, + "loss": 1.1832, + "step": 7374 + }, + { + "epoch": 1.313034188034188, + "grad_norm": 0.6297488212585449, + "learning_rate": 0.00015153553247025813, + "loss": 0.9602, + "step": 7375 + }, + { + "epoch": 1.3132122507122508, + "grad_norm": 0.6851378083229065, + "learning_rate": 0.00015152353645935335, + "loss": 1.0743, + "step": 7376 + }, + { + "epoch": 1.3133903133903133, + "grad_norm": 0.6215537786483765, + "learning_rate": 0.00015151153943895187, + "loss": 0.9484, + "step": 7377 + }, + { + "epoch": 1.313568376068376, + "grad_norm": 
0.6848666071891785, + "learning_rate": 0.0001514995414092888, + "loss": 1.0978, + "step": 7378 + }, + { + "epoch": 1.3137464387464388, + "grad_norm": 0.7527492642402649, + "learning_rate": 0.00015148754237059918, + "loss": 1.083, + "step": 7379 + }, + { + "epoch": 1.3139245014245013, + "grad_norm": 0.6264588236808777, + "learning_rate": 0.00015147554232311814, + "loss": 0.9995, + "step": 7380 + }, + { + "epoch": 1.314102564102564, + "grad_norm": 0.6666619181632996, + "learning_rate": 0.00015146354126708075, + "loss": 1.0156, + "step": 7381 + }, + { + "epoch": 1.3142806267806268, + "grad_norm": 0.6626597046852112, + "learning_rate": 0.00015145153920272222, + "loss": 1.0047, + "step": 7382 + }, + { + "epoch": 1.3144586894586894, + "grad_norm": 0.5975428223609924, + "learning_rate": 0.0001514395361302776, + "loss": 0.806, + "step": 7383 + }, + { + "epoch": 1.3146367521367521, + "grad_norm": 0.6509957909584045, + "learning_rate": 0.00015142753204998218, + "loss": 0.8871, + "step": 7384 + }, + { + "epoch": 1.3148148148148149, + "grad_norm": 0.6672926545143127, + "learning_rate": 0.00015141552696207108, + "loss": 0.9616, + "step": 7385 + }, + { + "epoch": 1.3149928774928774, + "grad_norm": 0.6965435147285461, + "learning_rate": 0.00015140352086677954, + "loss": 1.124, + "step": 7386 + }, + { + "epoch": 1.3151709401709402, + "grad_norm": 0.6559258103370667, + "learning_rate": 0.00015139151376434277, + "loss": 1.0271, + "step": 7387 + }, + { + "epoch": 1.315349002849003, + "grad_norm": 0.7613587379455566, + "learning_rate": 0.00015137950565499608, + "loss": 1.0349, + "step": 7388 + }, + { + "epoch": 1.3155270655270654, + "grad_norm": 0.7001944780349731, + "learning_rate": 0.0001513674965389747, + "loss": 0.8551, + "step": 7389 + }, + { + "epoch": 1.3157051282051282, + "grad_norm": 0.6087043285369873, + "learning_rate": 0.0001513554864165139, + "loss": 0.7118, + "step": 7390 + }, + { + "epoch": 1.315883190883191, + "grad_norm": 0.71526700258255, + "learning_rate": 
0.00015134347528784908, + "loss": 1.0478, + "step": 7391 + }, + { + "epoch": 1.3160612535612537, + "grad_norm": 0.6182073950767517, + "learning_rate": 0.00015133146315321548, + "loss": 0.9474, + "step": 7392 + }, + { + "epoch": 1.3162393162393162, + "grad_norm": 0.7771387696266174, + "learning_rate": 0.0001513194500128485, + "loss": 1.0544, + "step": 7393 + }, + { + "epoch": 1.316417378917379, + "grad_norm": 0.7108260989189148, + "learning_rate": 0.00015130743586698353, + "loss": 0.8813, + "step": 7394 + }, + { + "epoch": 1.3165954415954415, + "grad_norm": 0.7057309150695801, + "learning_rate": 0.0001512954207158559, + "loss": 0.899, + "step": 7395 + }, + { + "epoch": 1.3167735042735043, + "grad_norm": 0.6139237880706787, + "learning_rate": 0.00015128340455970106, + "loss": 0.8885, + "step": 7396 + }, + { + "epoch": 1.316951566951567, + "grad_norm": 0.7166598439216614, + "learning_rate": 0.00015127138739875443, + "loss": 0.9792, + "step": 7397 + }, + { + "epoch": 1.3171296296296298, + "grad_norm": 0.6916186809539795, + "learning_rate": 0.00015125936923325153, + "loss": 0.8871, + "step": 7398 + }, + { + "epoch": 1.3173076923076923, + "grad_norm": 0.7189087271690369, + "learning_rate": 0.0001512473500634277, + "loss": 0.8302, + "step": 7399 + }, + { + "epoch": 1.317485754985755, + "grad_norm": 0.5739200115203857, + "learning_rate": 0.00015123532988951853, + "loss": 0.9137, + "step": 7400 + }, + { + "epoch": 1.3176638176638176, + "grad_norm": 0.7661057114601135, + "learning_rate": 0.00015122330871175952, + "loss": 1.1255, + "step": 7401 + }, + { + "epoch": 1.3178418803418803, + "grad_norm": 0.6487592458724976, + "learning_rate": 0.00015121128653038617, + "loss": 1.0519, + "step": 7402 + }, + { + "epoch": 1.318019943019943, + "grad_norm": 0.693134605884552, + "learning_rate": 0.00015119926334563406, + "loss": 0.9585, + "step": 7403 + }, + { + "epoch": 1.3181980056980058, + "grad_norm": 0.5895997285842896, + "learning_rate": 0.0001511872391577387, + "loss": 0.8033, + 
"step": 7404 + }, + { + "epoch": 1.3183760683760684, + "grad_norm": 0.654876172542572, + "learning_rate": 0.00015117521396693575, + "loss": 1.0082, + "step": 7405 + }, + { + "epoch": 1.318554131054131, + "grad_norm": 0.5877239108085632, + "learning_rate": 0.0001511631877734608, + "loss": 1.0147, + "step": 7406 + }, + { + "epoch": 1.3187321937321936, + "grad_norm": 0.6109837889671326, + "learning_rate": 0.00015115116057754944, + "loss": 0.7498, + "step": 7407 + }, + { + "epoch": 1.3189102564102564, + "grad_norm": 0.643856942653656, + "learning_rate": 0.00015113913237943736, + "loss": 1.0417, + "step": 7408 + }, + { + "epoch": 1.3190883190883191, + "grad_norm": 0.654077410697937, + "learning_rate": 0.00015112710317936022, + "loss": 1.1809, + "step": 7409 + }, + { + "epoch": 1.319266381766382, + "grad_norm": 0.6785375475883484, + "learning_rate": 0.00015111507297755367, + "loss": 0.9447, + "step": 7410 + }, + { + "epoch": 1.3194444444444444, + "grad_norm": 0.6513382196426392, + "learning_rate": 0.00015110304177425347, + "loss": 0.8286, + "step": 7411 + }, + { + "epoch": 1.3196225071225072, + "grad_norm": 0.6536405682563782, + "learning_rate": 0.00015109100956969533, + "loss": 1.1959, + "step": 7412 + }, + { + "epoch": 1.3198005698005697, + "grad_norm": 0.6633172035217285, + "learning_rate": 0.00015107897636411498, + "loss": 0.8839, + "step": 7413 + }, + { + "epoch": 1.3199786324786325, + "grad_norm": 0.5773791670799255, + "learning_rate": 0.00015106694215774821, + "loss": 0.9785, + "step": 7414 + }, + { + "epoch": 1.3201566951566952, + "grad_norm": 0.7005468010902405, + "learning_rate": 0.00015105490695083078, + "loss": 1.0752, + "step": 7415 + }, + { + "epoch": 1.320334757834758, + "grad_norm": 0.6509538888931274, + "learning_rate": 0.0001510428707435985, + "loss": 0.9886, + "step": 7416 + }, + { + "epoch": 1.3205128205128205, + "grad_norm": 0.6607788801193237, + "learning_rate": 0.0001510308335362872, + "loss": 0.9756, + "step": 7417 + }, + { + "epoch": 
1.3206908831908832, + "grad_norm": 0.5977858304977417, + "learning_rate": 0.00015101879532913274, + "loss": 1.0574, + "step": 7418 + }, + { + "epoch": 1.3208689458689458, + "grad_norm": 0.6478607058525085, + "learning_rate": 0.00015100675612237096, + "loss": 1.0076, + "step": 7419 + }, + { + "epoch": 1.3210470085470085, + "grad_norm": 0.6386681199073792, + "learning_rate": 0.00015099471591623775, + "loss": 0.9639, + "step": 7420 + }, + { + "epoch": 1.3212250712250713, + "grad_norm": 0.6348143815994263, + "learning_rate": 0.000150982674710969, + "loss": 1.0226, + "step": 7421 + }, + { + "epoch": 1.321403133903134, + "grad_norm": 0.6737388372421265, + "learning_rate": 0.00015097063250680068, + "loss": 0.9985, + "step": 7422 + }, + { + "epoch": 1.3215811965811965, + "grad_norm": 0.7302656769752502, + "learning_rate": 0.00015095858930396866, + "loss": 0.9969, + "step": 7423 + }, + { + "epoch": 1.3217592592592593, + "grad_norm": 0.7062691450119019, + "learning_rate": 0.00015094654510270898, + "loss": 0.9137, + "step": 7424 + }, + { + "epoch": 1.3219373219373218, + "grad_norm": 0.6289888620376587, + "learning_rate": 0.00015093449990325754, + "loss": 0.9231, + "step": 7425 + }, + { + "epoch": 1.3221153846153846, + "grad_norm": 0.643284261226654, + "learning_rate": 0.0001509224537058504, + "loss": 0.8981, + "step": 7426 + }, + { + "epoch": 1.3222934472934473, + "grad_norm": 0.7019244432449341, + "learning_rate": 0.00015091040651072355, + "loss": 0.9994, + "step": 7427 + }, + { + "epoch": 1.32247150997151, + "grad_norm": 0.5982088446617126, + "learning_rate": 0.0001508983583181131, + "loss": 0.9365, + "step": 7428 + }, + { + "epoch": 1.3226495726495726, + "grad_norm": 0.6086063385009766, + "learning_rate": 0.00015088630912825498, + "loss": 0.8621, + "step": 7429 + }, + { + "epoch": 1.3228276353276354, + "grad_norm": 0.6829213500022888, + "learning_rate": 0.00015087425894138535, + "loss": 1.1959, + "step": 7430 + }, + { + "epoch": 1.323005698005698, + "grad_norm": 
0.6538017392158508, + "learning_rate": 0.00015086220775774033, + "loss": 0.9412, + "step": 7431 + }, + { + "epoch": 1.3231837606837606, + "grad_norm": 0.6334070563316345, + "learning_rate": 0.00015085015557755597, + "loss": 0.9044, + "step": 7432 + }, + { + "epoch": 1.3233618233618234, + "grad_norm": 0.6514624357223511, + "learning_rate": 0.00015083810240106845, + "loss": 0.8859, + "step": 7433 + }, + { + "epoch": 1.3235398860398861, + "grad_norm": 0.7130434513092041, + "learning_rate": 0.00015082604822851397, + "loss": 1.2845, + "step": 7434 + }, + { + "epoch": 1.3237179487179487, + "grad_norm": 0.609419584274292, + "learning_rate": 0.00015081399306012862, + "loss": 1.0725, + "step": 7435 + }, + { + "epoch": 1.3238960113960114, + "grad_norm": 0.586807370185852, + "learning_rate": 0.0001508019368961486, + "loss": 0.9032, + "step": 7436 + }, + { + "epoch": 1.324074074074074, + "grad_norm": 0.6937291026115417, + "learning_rate": 0.0001507898797368102, + "loss": 0.7975, + "step": 7437 + }, + { + "epoch": 1.3242521367521367, + "grad_norm": 0.6804966330528259, + "learning_rate": 0.00015077782158234962, + "loss": 1.1018, + "step": 7438 + }, + { + "epoch": 1.3244301994301995, + "grad_norm": 0.6110677123069763, + "learning_rate": 0.0001507657624330031, + "loss": 0.7988, + "step": 7439 + }, + { + "epoch": 1.3246082621082622, + "grad_norm": 0.6340961456298828, + "learning_rate": 0.0001507537022890069, + "loss": 0.844, + "step": 7440 + }, + { + "epoch": 1.3247863247863247, + "grad_norm": 0.7291021943092346, + "learning_rate": 0.00015074164115059735, + "loss": 0.9867, + "step": 7441 + }, + { + "epoch": 1.3249643874643875, + "grad_norm": 0.6818505525588989, + "learning_rate": 0.00015072957901801076, + "loss": 1.1541, + "step": 7442 + }, + { + "epoch": 1.32514245014245, + "grad_norm": 0.6174707412719727, + "learning_rate": 0.00015071751589148345, + "loss": 1.1679, + "step": 7443 + }, + { + "epoch": 1.3253205128205128, + "grad_norm": 0.6481367945671082, + "learning_rate": 
0.00015070545177125176, + "loss": 1.0955, + "step": 7444 + }, + { + "epoch": 1.3254985754985755, + "grad_norm": 0.6752339005470276, + "learning_rate": 0.00015069338665755203, + "loss": 0.8651, + "step": 7445 + }, + { + "epoch": 1.3256766381766383, + "grad_norm": 0.6608055830001831, + "learning_rate": 0.00015068132055062077, + "loss": 0.9553, + "step": 7446 + }, + { + "epoch": 1.3258547008547008, + "grad_norm": 0.5933246612548828, + "learning_rate": 0.00015066925345069425, + "loss": 0.8584, + "step": 7447 + }, + { + "epoch": 1.3260327635327636, + "grad_norm": 0.6301844716072083, + "learning_rate": 0.000150657185358009, + "loss": 0.8583, + "step": 7448 + }, + { + "epoch": 1.326210826210826, + "grad_norm": 0.7359434962272644, + "learning_rate": 0.00015064511627280145, + "loss": 1.0905, + "step": 7449 + }, + { + "epoch": 1.3263888888888888, + "grad_norm": 0.6334579586982727, + "learning_rate": 0.00015063304619530806, + "loss": 0.9814, + "step": 7450 + }, + { + "epoch": 1.3265669515669516, + "grad_norm": 0.6974197626113892, + "learning_rate": 0.00015062097512576528, + "loss": 0.9302, + "step": 7451 + }, + { + "epoch": 1.3267450142450143, + "grad_norm": 0.6895849704742432, + "learning_rate": 0.00015060890306440965, + "loss": 1.0175, + "step": 7452 + }, + { + "epoch": 1.3269230769230769, + "grad_norm": 0.5938003659248352, + "learning_rate": 0.00015059683001147767, + "loss": 0.8084, + "step": 7453 + }, + { + "epoch": 1.3271011396011396, + "grad_norm": 0.6821470856666565, + "learning_rate": 0.00015058475596720596, + "loss": 0.9897, + "step": 7454 + }, + { + "epoch": 1.3272792022792022, + "grad_norm": 0.5507164001464844, + "learning_rate": 0.00015057268093183104, + "loss": 0.7012, + "step": 7455 + }, + { + "epoch": 1.327457264957265, + "grad_norm": 0.6216199398040771, + "learning_rate": 0.00015056060490558945, + "loss": 1.0281, + "step": 7456 + }, + { + "epoch": 1.3276353276353277, + "grad_norm": 0.6674157977104187, + "learning_rate": 0.00015054852788871787, + "loss": 
0.8776, + "step": 7457 + }, + { + "epoch": 1.3278133903133904, + "grad_norm": 0.666963517665863, + "learning_rate": 0.0001505364498814529, + "loss": 1.0742, + "step": 7458 + }, + { + "epoch": 1.327991452991453, + "grad_norm": 0.6205331683158875, + "learning_rate": 0.00015052437088403114, + "loss": 1.1109, + "step": 7459 + }, + { + "epoch": 1.3281695156695157, + "grad_norm": 0.6402750611305237, + "learning_rate": 0.00015051229089668933, + "loss": 1.0648, + "step": 7460 + }, + { + "epoch": 1.3283475783475782, + "grad_norm": 0.7445703744888306, + "learning_rate": 0.00015050020991966406, + "loss": 0.8989, + "step": 7461 + }, + { + "epoch": 1.328525641025641, + "grad_norm": 0.8131299614906311, + "learning_rate": 0.00015048812795319212, + "loss": 0.9552, + "step": 7462 + }, + { + "epoch": 1.3287037037037037, + "grad_norm": 0.7007313966751099, + "learning_rate": 0.00015047604499751017, + "loss": 0.9899, + "step": 7463 + }, + { + "epoch": 1.3288817663817665, + "grad_norm": 0.60536789894104, + "learning_rate": 0.000150463961052855, + "loss": 0.7694, + "step": 7464 + }, + { + "epoch": 1.329059829059829, + "grad_norm": 0.6910434365272522, + "learning_rate": 0.00015045187611946331, + "loss": 0.9575, + "step": 7465 + }, + { + "epoch": 1.3292378917378918, + "grad_norm": 0.7693352103233337, + "learning_rate": 0.00015043979019757194, + "loss": 1.1987, + "step": 7466 + }, + { + "epoch": 1.3294159544159543, + "grad_norm": 0.6675218939781189, + "learning_rate": 0.00015042770328741763, + "loss": 1.0099, + "step": 7467 + }, + { + "epoch": 1.329594017094017, + "grad_norm": 0.8040883541107178, + "learning_rate": 0.00015041561538923722, + "loss": 0.9493, + "step": 7468 + }, + { + "epoch": 1.3297720797720798, + "grad_norm": 0.6765826344490051, + "learning_rate": 0.00015040352650326762, + "loss": 1.1035, + "step": 7469 + }, + { + "epoch": 1.3299501424501425, + "grad_norm": 0.7099924087524414, + "learning_rate": 0.0001503914366297456, + "loss": 0.9198, + "step": 7470 + }, + { + "epoch": 
1.330128205128205, + "grad_norm": 0.6673682928085327, + "learning_rate": 0.00015037934576890804, + "loss": 1.0234, + "step": 7471 + }, + { + "epoch": 1.3303062678062678, + "grad_norm": 0.7022300958633423, + "learning_rate": 0.00015036725392099184, + "loss": 1.3875, + "step": 7472 + }, + { + "epoch": 1.3304843304843303, + "grad_norm": 0.6997060179710388, + "learning_rate": 0.00015035516108623394, + "loss": 0.8114, + "step": 7473 + }, + { + "epoch": 1.330662393162393, + "grad_norm": 0.6262350678443909, + "learning_rate": 0.00015034306726487127, + "loss": 1.128, + "step": 7474 + }, + { + "epoch": 1.3308404558404558, + "grad_norm": 0.6330382227897644, + "learning_rate": 0.00015033097245714078, + "loss": 0.9032, + "step": 7475 + }, + { + "epoch": 1.3310185185185186, + "grad_norm": 0.6527551412582397, + "learning_rate": 0.00015031887666327944, + "loss": 0.9311, + "step": 7476 + }, + { + "epoch": 1.3311965811965811, + "grad_norm": 0.6754798889160156, + "learning_rate": 0.00015030677988352422, + "loss": 1.0626, + "step": 7477 + }, + { + "epoch": 1.3313746438746439, + "grad_norm": 0.6397945284843445, + "learning_rate": 0.00015029468211811216, + "loss": 0.9222, + "step": 7478 + }, + { + "epoch": 1.3315527065527066, + "grad_norm": 0.8163481950759888, + "learning_rate": 0.0001502825833672803, + "loss": 1.1827, + "step": 7479 + }, + { + "epoch": 1.3317307692307692, + "grad_norm": 0.6645621657371521, + "learning_rate": 0.00015027048363126566, + "loss": 0.9744, + "step": 7480 + }, + { + "epoch": 1.331908831908832, + "grad_norm": 0.6943182349205017, + "learning_rate": 0.0001502583829103053, + "loss": 1.1597, + "step": 7481 + }, + { + "epoch": 1.3320868945868947, + "grad_norm": 0.6283710598945618, + "learning_rate": 0.00015024628120463636, + "loss": 0.9514, + "step": 7482 + }, + { + "epoch": 1.3322649572649572, + "grad_norm": 0.6159678101539612, + "learning_rate": 0.0001502341785144959, + "loss": 0.9752, + "step": 7483 + }, + { + "epoch": 1.33244301994302, + "grad_norm": 
0.6259802579879761, + "learning_rate": 0.00015022207484012107, + "loss": 0.9356, + "step": 7484 + }, + { + "epoch": 1.3326210826210827, + "grad_norm": 0.7322365641593933, + "learning_rate": 0.00015020997018174904, + "loss": 1.2072, + "step": 7485 + }, + { + "epoch": 1.3327991452991452, + "grad_norm": 0.6323443651199341, + "learning_rate": 0.0001501978645396169, + "loss": 1.1661, + "step": 7486 + }, + { + "epoch": 1.332977207977208, + "grad_norm": 0.7811527848243713, + "learning_rate": 0.00015018575791396187, + "loss": 1.0304, + "step": 7487 + }, + { + "epoch": 1.3331552706552707, + "grad_norm": 0.7221232056617737, + "learning_rate": 0.0001501736503050212, + "loss": 0.8838, + "step": 7488 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.6980099081993103, + "learning_rate": 0.00015016154171303207, + "loss": 1.1841, + "step": 7489 + }, + { + "epoch": 1.333511396011396, + "grad_norm": 0.6802879571914673, + "learning_rate": 0.00015014943213823175, + "loss": 0.959, + "step": 7490 + }, + { + "epoch": 1.3336894586894588, + "grad_norm": 0.637698233127594, + "learning_rate": 0.00015013732158085746, + "loss": 1.0517, + "step": 7491 + }, + { + "epoch": 1.3338675213675213, + "grad_norm": 0.6386787295341492, + "learning_rate": 0.0001501252100411465, + "loss": 0.7125, + "step": 7492 + }, + { + "epoch": 1.334045584045584, + "grad_norm": 0.6287358403205872, + "learning_rate": 0.0001501130975193362, + "loss": 0.8913, + "step": 7493 + }, + { + "epoch": 1.3342236467236468, + "grad_norm": 0.6142337322235107, + "learning_rate": 0.00015010098401566386, + "loss": 0.8149, + "step": 7494 + }, + { + "epoch": 1.3344017094017093, + "grad_norm": 0.6369916200637817, + "learning_rate": 0.0001500888695303668, + "loss": 1.0186, + "step": 7495 + }, + { + "epoch": 1.334579772079772, + "grad_norm": 0.7526934146881104, + "learning_rate": 0.0001500767540636824, + "loss": 1.2421, + "step": 7496 + }, + { + "epoch": 1.3347578347578348, + "grad_norm": 0.7278095483779907, + "learning_rate": 
0.00015006463761584802, + "loss": 0.9856, + "step": 7497 + }, + { + "epoch": 1.3349358974358974, + "grad_norm": 0.6165127158164978, + "learning_rate": 0.00015005252018710104, + "loss": 1.0041, + "step": 7498 + }, + { + "epoch": 1.33511396011396, + "grad_norm": 0.637856662273407, + "learning_rate": 0.00015004040177767896, + "loss": 0.9134, + "step": 7499 + }, + { + "epoch": 1.3352920227920229, + "grad_norm": 0.661227285861969, + "learning_rate": 0.00015002828238781912, + "loss": 1.0393, + "step": 7500 + }, + { + "epoch": 1.3354700854700854, + "grad_norm": 0.6061869859695435, + "learning_rate": 0.000150016162017759, + "loss": 0.8453, + "step": 7501 + }, + { + "epoch": 1.3356481481481481, + "grad_norm": 0.6938419938087463, + "learning_rate": 0.0001500040406677361, + "loss": 1.0338, + "step": 7502 + }, + { + "epoch": 1.335826210826211, + "grad_norm": 0.6672863960266113, + "learning_rate": 0.0001499919183379879, + "loss": 0.8765, + "step": 7503 + }, + { + "epoch": 1.3360042735042734, + "grad_norm": 0.6200515031814575, + "learning_rate": 0.00014997979502875193, + "loss": 0.8286, + "step": 7504 + }, + { + "epoch": 1.3361823361823362, + "grad_norm": 0.6287549138069153, + "learning_rate": 0.00014996767074026567, + "loss": 0.9761, + "step": 7505 + }, + { + "epoch": 1.336360398860399, + "grad_norm": 0.6036837100982666, + "learning_rate": 0.0001499555454727667, + "loss": 1.0506, + "step": 7506 + }, + { + "epoch": 1.3365384615384617, + "grad_norm": 0.6875260472297668, + "learning_rate": 0.0001499434192264926, + "loss": 1.001, + "step": 7507 + }, + { + "epoch": 1.3367165242165242, + "grad_norm": 0.6558469533920288, + "learning_rate": 0.00014993129200168096, + "loss": 0.6874, + "step": 7508 + }, + { + "epoch": 1.336894586894587, + "grad_norm": 0.604167103767395, + "learning_rate": 0.00014991916379856934, + "loss": 1.0173, + "step": 7509 + }, + { + "epoch": 1.3370726495726495, + "grad_norm": 0.5941442251205444, + "learning_rate": 0.00014990703461739544, + "loss": 0.8569, + "step": 
7510 + }, + { + "epoch": 1.3372507122507122, + "grad_norm": 0.7645071148872375, + "learning_rate": 0.00014989490445839687, + "loss": 1.0172, + "step": 7511 + }, + { + "epoch": 1.337428774928775, + "grad_norm": 0.5491678714752197, + "learning_rate": 0.00014988277332181126, + "loss": 0.8018, + "step": 7512 + }, + { + "epoch": 1.3376068376068377, + "grad_norm": 0.583322286605835, + "learning_rate": 0.00014987064120787635, + "loss": 0.8704, + "step": 7513 + }, + { + "epoch": 1.3377849002849003, + "grad_norm": 0.7385724186897278, + "learning_rate": 0.00014985850811682984, + "loss": 1.1121, + "step": 7514 + }, + { + "epoch": 1.337962962962963, + "grad_norm": 0.6842585206031799, + "learning_rate": 0.00014984637404890941, + "loss": 0.914, + "step": 7515 + }, + { + "epoch": 1.3381410256410255, + "grad_norm": 0.6771186590194702, + "learning_rate": 0.00014983423900435285, + "loss": 1.0838, + "step": 7516 + }, + { + "epoch": 1.3383190883190883, + "grad_norm": 0.7562049031257629, + "learning_rate": 0.00014982210298339788, + "loss": 1.123, + "step": 7517 + }, + { + "epoch": 1.338497150997151, + "grad_norm": 0.7617804408073425, + "learning_rate": 0.0001498099659862823, + "loss": 0.9438, + "step": 7518 + }, + { + "epoch": 1.3386752136752138, + "grad_norm": 0.561958909034729, + "learning_rate": 0.00014979782801324392, + "loss": 0.8739, + "step": 7519 + }, + { + "epoch": 1.3388532763532763, + "grad_norm": 0.7726154923439026, + "learning_rate": 0.00014978568906452052, + "loss": 1.1306, + "step": 7520 + }, + { + "epoch": 1.339031339031339, + "grad_norm": 0.6658660173416138, + "learning_rate": 0.00014977354914035002, + "loss": 1.0214, + "step": 7521 + }, + { + "epoch": 1.3392094017094016, + "grad_norm": 0.6385402679443359, + "learning_rate": 0.00014976140824097015, + "loss": 0.8851, + "step": 7522 + }, + { + "epoch": 1.3393874643874644, + "grad_norm": 0.6315767168998718, + "learning_rate": 0.0001497492663666189, + "loss": 0.986, + "step": 7523 + }, + { + "epoch": 1.3395655270655271, + 
"grad_norm": 0.6379088759422302, + "learning_rate": 0.0001497371235175341, + "loss": 0.9322, + "step": 7524 + }, + { + "epoch": 1.3397435897435899, + "grad_norm": 0.6605859994888306, + "learning_rate": 0.0001497249796939537, + "loss": 1.1112, + "step": 7525 + }, + { + "epoch": 1.3399216524216524, + "grad_norm": 0.7342822551727295, + "learning_rate": 0.0001497128348961156, + "loss": 0.9798, + "step": 7526 + }, + { + "epoch": 1.3400997150997151, + "grad_norm": 0.5667192935943604, + "learning_rate": 0.0001497006891242578, + "loss": 0.7493, + "step": 7527 + }, + { + "epoch": 1.3402777777777777, + "grad_norm": 0.6106827855110168, + "learning_rate": 0.0001496885423786182, + "loss": 1.0924, + "step": 7528 + }, + { + "epoch": 1.3404558404558404, + "grad_norm": 0.6207202076911926, + "learning_rate": 0.00014967639465943486, + "loss": 1.1123, + "step": 7529 + }, + { + "epoch": 1.3406339031339032, + "grad_norm": 0.6272760033607483, + "learning_rate": 0.00014966424596694574, + "loss": 0.9275, + "step": 7530 + }, + { + "epoch": 1.340811965811966, + "grad_norm": 0.6485986113548279, + "learning_rate": 0.0001496520963013889, + "loss": 1.1491, + "step": 7531 + }, + { + "epoch": 1.3409900284900285, + "grad_norm": 0.5743561387062073, + "learning_rate": 0.00014963994566300238, + "loss": 1.1101, + "step": 7532 + }, + { + "epoch": 1.3411680911680912, + "grad_norm": 0.6508657336235046, + "learning_rate": 0.00014962779405202424, + "loss": 1.0368, + "step": 7533 + }, + { + "epoch": 1.3413461538461537, + "grad_norm": 0.6598748564720154, + "learning_rate": 0.00014961564146869259, + "loss": 1.1064, + "step": 7534 + }, + { + "epoch": 1.3415242165242165, + "grad_norm": 0.6722840070724487, + "learning_rate": 0.00014960348791324547, + "loss": 0.9758, + "step": 7535 + }, + { + "epoch": 1.3417022792022792, + "grad_norm": 0.5807220935821533, + "learning_rate": 0.00014959133338592108, + "loss": 0.9936, + "step": 7536 + }, + { + "epoch": 1.341880341880342, + "grad_norm": 0.6318647265434265, + 
"learning_rate": 0.00014957917788695752, + "loss": 0.907, + "step": 7537 + }, + { + "epoch": 1.3420584045584045, + "grad_norm": 0.6725485324859619, + "learning_rate": 0.00014956702141659295, + "loss": 0.988, + "step": 7538 + }, + { + "epoch": 1.3422364672364673, + "grad_norm": 0.6675217747688293, + "learning_rate": 0.0001495548639750656, + "loss": 1.0194, + "step": 7539 + }, + { + "epoch": 1.3424145299145298, + "grad_norm": 0.6976884603500366, + "learning_rate": 0.0001495427055626136, + "loss": 1.2515, + "step": 7540 + }, + { + "epoch": 1.3425925925925926, + "grad_norm": 0.654941737651825, + "learning_rate": 0.0001495305461794752, + "loss": 1.2072, + "step": 7541 + }, + { + "epoch": 1.3427706552706553, + "grad_norm": 0.7085291743278503, + "learning_rate": 0.00014951838582588864, + "loss": 0.9772, + "step": 7542 + }, + { + "epoch": 1.342948717948718, + "grad_norm": 0.6319566965103149, + "learning_rate": 0.00014950622450209217, + "loss": 1.0162, + "step": 7543 + }, + { + "epoch": 1.3431267806267806, + "grad_norm": 0.6272495985031128, + "learning_rate": 0.00014949406220832407, + "loss": 0.7985, + "step": 7544 + }, + { + "epoch": 1.3433048433048433, + "grad_norm": 0.6352069973945618, + "learning_rate": 0.00014948189894482266, + "loss": 1.0041, + "step": 7545 + }, + { + "epoch": 1.3434829059829059, + "grad_norm": 0.6071867346763611, + "learning_rate": 0.0001494697347118262, + "loss": 0.9486, + "step": 7546 + }, + { + "epoch": 1.3436609686609686, + "grad_norm": 0.6458829641342163, + "learning_rate": 0.00014945756950957308, + "loss": 0.9417, + "step": 7547 + }, + { + "epoch": 1.3438390313390314, + "grad_norm": 0.6472262740135193, + "learning_rate": 0.0001494454033383016, + "loss": 1.056, + "step": 7548 + }, + { + "epoch": 1.3440170940170941, + "grad_norm": 0.6985635161399841, + "learning_rate": 0.00014943323619825017, + "loss": 1.0483, + "step": 7549 + }, + { + "epoch": 1.3441951566951567, + "grad_norm": 0.6379460096359253, + "learning_rate": 0.00014942106808965718, + 
"loss": 0.9552, + "step": 7550 + }, + { + "epoch": 1.3443732193732194, + "grad_norm": 0.7036557793617249, + "learning_rate": 0.00014940889901276098, + "loss": 0.9647, + "step": 7551 + }, + { + "epoch": 1.344551282051282, + "grad_norm": 0.6697289943695068, + "learning_rate": 0.0001493967289678001, + "loss": 0.9029, + "step": 7552 + }, + { + "epoch": 1.3447293447293447, + "grad_norm": 0.6336250901222229, + "learning_rate": 0.00014938455795501286, + "loss": 0.9458, + "step": 7553 + }, + { + "epoch": 1.3449074074074074, + "grad_norm": 0.7279673218727112, + "learning_rate": 0.00014937238597463785, + "loss": 1.0228, + "step": 7554 + }, + { + "epoch": 1.3450854700854702, + "grad_norm": 0.6514406204223633, + "learning_rate": 0.00014936021302691349, + "loss": 0.8265, + "step": 7555 + }, + { + "epoch": 1.3452635327635327, + "grad_norm": 0.6405338644981384, + "learning_rate": 0.0001493480391120783, + "loss": 0.9516, + "step": 7556 + }, + { + "epoch": 1.3454415954415955, + "grad_norm": 0.6442672610282898, + "learning_rate": 0.00014933586423037076, + "loss": 0.9279, + "step": 7557 + }, + { + "epoch": 1.345619658119658, + "grad_norm": 0.7588633894920349, + "learning_rate": 0.00014932368838202945, + "loss": 1.0976, + "step": 7558 + }, + { + "epoch": 1.3457977207977208, + "grad_norm": 0.5536739230155945, + "learning_rate": 0.00014931151156729296, + "loss": 0.713, + "step": 7559 + }, + { + "epoch": 1.3459757834757835, + "grad_norm": 0.6897570490837097, + "learning_rate": 0.00014929933378639981, + "loss": 0.9521, + "step": 7560 + }, + { + "epoch": 1.3461538461538463, + "grad_norm": 0.6654927134513855, + "learning_rate": 0.00014928715503958863, + "loss": 0.8506, + "step": 7561 + }, + { + "epoch": 1.3463319088319088, + "grad_norm": 0.655806839466095, + "learning_rate": 0.00014927497532709808, + "loss": 0.8636, + "step": 7562 + }, + { + "epoch": 1.3465099715099715, + "grad_norm": 0.6547064185142517, + "learning_rate": 0.00014926279464916667, + "loss": 0.9155, + "step": 7563 + }, + { + 
"epoch": 1.346688034188034, + "grad_norm": 0.7555415034294128, + "learning_rate": 0.00014925061300603316, + "loss": 0.8791, + "step": 7564 + }, + { + "epoch": 1.3468660968660968, + "grad_norm": 0.7439392805099487, + "learning_rate": 0.0001492384303979362, + "loss": 1.1669, + "step": 7565 + }, + { + "epoch": 1.3470441595441596, + "grad_norm": 0.6016925573348999, + "learning_rate": 0.0001492262468251145, + "loss": 0.9811, + "step": 7566 + }, + { + "epoch": 1.3472222222222223, + "grad_norm": 0.644652783870697, + "learning_rate": 0.00014921406228780675, + "loss": 0.7096, + "step": 7567 + }, + { + "epoch": 1.3474002849002849, + "grad_norm": 0.721814751625061, + "learning_rate": 0.00014920187678625166, + "loss": 0.9933, + "step": 7568 + }, + { + "epoch": 1.3475783475783476, + "grad_norm": 0.6212092638015747, + "learning_rate": 0.000149189690320688, + "loss": 0.8499, + "step": 7569 + }, + { + "epoch": 1.3477564102564101, + "grad_norm": 0.6235958337783813, + "learning_rate": 0.00014917750289135455, + "loss": 0.9189, + "step": 7570 + }, + { + "epoch": 1.3479344729344729, + "grad_norm": 0.6309674978256226, + "learning_rate": 0.0001491653144984901, + "loss": 0.9744, + "step": 7571 + }, + { + "epoch": 1.3481125356125356, + "grad_norm": 0.7606496214866638, + "learning_rate": 0.00014915312514233344, + "loss": 1.0181, + "step": 7572 + }, + { + "epoch": 1.3482905982905984, + "grad_norm": 0.6892654895782471, + "learning_rate": 0.00014914093482312342, + "loss": 0.9517, + "step": 7573 + }, + { + "epoch": 1.348468660968661, + "grad_norm": 0.6746503114700317, + "learning_rate": 0.0001491287435410988, + "loss": 1.056, + "step": 7574 + }, + { + "epoch": 1.3486467236467237, + "grad_norm": 0.5892919301986694, + "learning_rate": 0.00014911655129649858, + "loss": 1.0515, + "step": 7575 + }, + { + "epoch": 1.3488247863247862, + "grad_norm": 0.6278096437454224, + "learning_rate": 0.0001491043580895615, + "loss": 0.864, + "step": 7576 + }, + { + "epoch": 1.349002849002849, + "grad_norm": 
0.7017706632614136, + "learning_rate": 0.0001490921639205266, + "loss": 1.0618, + "step": 7577 + }, + { + "epoch": 1.3491809116809117, + "grad_norm": 0.7318746447563171, + "learning_rate": 0.00014907996878963268, + "loss": 0.9905, + "step": 7578 + }, + { + "epoch": 1.3493589743589745, + "grad_norm": 0.6485885977745056, + "learning_rate": 0.00014906777269711873, + "loss": 1.0498, + "step": 7579 + }, + { + "epoch": 1.349537037037037, + "grad_norm": 0.644902229309082, + "learning_rate": 0.00014905557564322372, + "loss": 0.885, + "step": 7580 + }, + { + "epoch": 1.3497150997150997, + "grad_norm": 0.6567610502243042, + "learning_rate": 0.0001490433776281866, + "loss": 0.8938, + "step": 7581 + }, + { + "epoch": 1.3498931623931623, + "grad_norm": 0.6233102679252625, + "learning_rate": 0.0001490311786522464, + "loss": 0.9007, + "step": 7582 + }, + { + "epoch": 1.350071225071225, + "grad_norm": 0.6962146759033203, + "learning_rate": 0.00014901897871564206, + "loss": 0.9257, + "step": 7583 + }, + { + "epoch": 1.3502492877492878, + "grad_norm": 0.6986933350563049, + "learning_rate": 0.00014900677781861266, + "loss": 1.0089, + "step": 7584 + }, + { + "epoch": 1.3504273504273505, + "grad_norm": 0.7527925968170166, + "learning_rate": 0.00014899457596139729, + "loss": 1.0762, + "step": 7585 + }, + { + "epoch": 1.350605413105413, + "grad_norm": 0.69191974401474, + "learning_rate": 0.00014898237314423494, + "loss": 0.9829, + "step": 7586 + }, + { + "epoch": 1.3507834757834758, + "grad_norm": 0.7866443395614624, + "learning_rate": 0.00014897016936736478, + "loss": 1.0911, + "step": 7587 + }, + { + "epoch": 1.3509615384615383, + "grad_norm": 0.7087522745132446, + "learning_rate": 0.00014895796463102587, + "loss": 1.0693, + "step": 7588 + }, + { + "epoch": 1.351139601139601, + "grad_norm": 0.704276442527771, + "learning_rate": 0.00014894575893545736, + "loss": 0.9082, + "step": 7589 + }, + { + "epoch": 1.3513176638176638, + "grad_norm": 0.7074487805366516, + "learning_rate": 
0.00014893355228089833, + "loss": 0.8731, + "step": 7590 + }, + { + "epoch": 1.3514957264957266, + "grad_norm": 0.6542425155639648, + "learning_rate": 0.00014892134466758803, + "loss": 0.9325, + "step": 7591 + }, + { + "epoch": 1.351673789173789, + "grad_norm": 0.6577230095863342, + "learning_rate": 0.0001489091360957656, + "loss": 0.8468, + "step": 7592 + }, + { + "epoch": 1.3518518518518519, + "grad_norm": 0.638534426689148, + "learning_rate": 0.00014889692656567025, + "loss": 0.8598, + "step": 7593 + }, + { + "epoch": 1.3520299145299146, + "grad_norm": 0.751133918762207, + "learning_rate": 0.0001488847160775412, + "loss": 1.0006, + "step": 7594 + }, + { + "epoch": 1.3522079772079771, + "grad_norm": 0.6272708773612976, + "learning_rate": 0.00014887250463161767, + "loss": 0.8782, + "step": 7595 + }, + { + "epoch": 1.35238603988604, + "grad_norm": 0.7242439985275269, + "learning_rate": 0.00014886029222813897, + "loss": 1.2443, + "step": 7596 + }, + { + "epoch": 1.3525641025641026, + "grad_norm": 0.6199275851249695, + "learning_rate": 0.0001488480788673443, + "loss": 0.9211, + "step": 7597 + }, + { + "epoch": 1.3527421652421652, + "grad_norm": 0.6401306986808777, + "learning_rate": 0.00014883586454947305, + "loss": 0.8808, + "step": 7598 + }, + { + "epoch": 1.352920227920228, + "grad_norm": 0.6340938806533813, + "learning_rate": 0.00014882364927476443, + "loss": 0.9406, + "step": 7599 + }, + { + "epoch": 1.3530982905982907, + "grad_norm": 0.6388604044914246, + "learning_rate": 0.00014881143304345783, + "loss": 1.0674, + "step": 7600 + }, + { + "epoch": 1.3532763532763532, + "grad_norm": 0.7562061548233032, + "learning_rate": 0.00014879921585579263, + "loss": 1.0959, + "step": 7601 + }, + { + "epoch": 1.353454415954416, + "grad_norm": 0.6303606033325195, + "learning_rate": 0.00014878699771200815, + "loss": 0.9641, + "step": 7602 + }, + { + "epoch": 1.3536324786324787, + "grad_norm": 0.8623232841491699, + "learning_rate": 0.00014877477861234382, + "loss": 1.1529, + 
"step": 7603 + }, + { + "epoch": 1.3538105413105412, + "grad_norm": 0.6607624888420105, + "learning_rate": 0.00014876255855703896, + "loss": 0.6291, + "step": 7604 + }, + { + "epoch": 1.353988603988604, + "grad_norm": 0.6226931214332581, + "learning_rate": 0.0001487503375463331, + "loss": 0.7485, + "step": 7605 + }, + { + "epoch": 1.3541666666666667, + "grad_norm": 0.7626705169677734, + "learning_rate": 0.00014873811558046565, + "loss": 0.9694, + "step": 7606 + }, + { + "epoch": 1.3543447293447293, + "grad_norm": 0.5436057448387146, + "learning_rate": 0.00014872589265967605, + "loss": 0.6173, + "step": 7607 + }, + { + "epoch": 1.354522792022792, + "grad_norm": 0.7822177410125732, + "learning_rate": 0.00014871366878420382, + "loss": 1.0048, + "step": 7608 + }, + { + "epoch": 1.3547008547008548, + "grad_norm": 0.6955201625823975, + "learning_rate": 0.00014870144395428848, + "loss": 0.9487, + "step": 7609 + }, + { + "epoch": 1.3548789173789173, + "grad_norm": 0.6625505685806274, + "learning_rate": 0.00014868921817016943, + "loss": 0.9389, + "step": 7610 + }, + { + "epoch": 1.35505698005698, + "grad_norm": 0.6625354886054993, + "learning_rate": 0.00014867699143208634, + "loss": 0.9538, + "step": 7611 + }, + { + "epoch": 1.3552350427350428, + "grad_norm": 0.7426592707633972, + "learning_rate": 0.00014866476374027874, + "loss": 1.2566, + "step": 7612 + }, + { + "epoch": 1.3554131054131053, + "grad_norm": 0.6856544017791748, + "learning_rate": 0.00014865253509498616, + "loss": 0.9663, + "step": 7613 + }, + { + "epoch": 1.355591168091168, + "grad_norm": 0.6343915462493896, + "learning_rate": 0.00014864030549644825, + "loss": 0.9416, + "step": 7614 + }, + { + "epoch": 1.3557692307692308, + "grad_norm": 0.6319553256034851, + "learning_rate": 0.00014862807494490454, + "loss": 0.9335, + "step": 7615 + }, + { + "epoch": 1.3559472934472934, + "grad_norm": 0.6919772624969482, + "learning_rate": 0.00014861584344059476, + "loss": 0.8516, + "step": 7616 + }, + { + "epoch": 
1.3561253561253561, + "grad_norm": 0.6405790448188782, + "learning_rate": 0.00014860361098375851, + "loss": 1.1278, + "step": 7617 + }, + { + "epoch": 1.3563034188034189, + "grad_norm": 0.7591732144355774, + "learning_rate": 0.00014859137757463548, + "loss": 1.0961, + "step": 7618 + }, + { + "epoch": 1.3564814814814814, + "grad_norm": 0.6166727542877197, + "learning_rate": 0.0001485791432134653, + "loss": 0.9358, + "step": 7619 + }, + { + "epoch": 1.3566595441595442, + "grad_norm": 0.7068707346916199, + "learning_rate": 0.00014856690790048777, + "loss": 0.8325, + "step": 7620 + }, + { + "epoch": 1.356837606837607, + "grad_norm": 0.8465402722358704, + "learning_rate": 0.00014855467163594257, + "loss": 1.0047, + "step": 7621 + }, + { + "epoch": 1.3570156695156697, + "grad_norm": 0.7403460741043091, + "learning_rate": 0.00014854243442006943, + "loss": 1.0907, + "step": 7622 + }, + { + "epoch": 1.3571937321937322, + "grad_norm": 0.6939566135406494, + "learning_rate": 0.00014853019625310813, + "loss": 0.9156, + "step": 7623 + }, + { + "epoch": 1.357371794871795, + "grad_norm": 0.6425924897193909, + "learning_rate": 0.0001485179571352984, + "loss": 0.8156, + "step": 7624 + }, + { + "epoch": 1.3575498575498575, + "grad_norm": 0.7091902494430542, + "learning_rate": 0.00014850571706688013, + "loss": 1.0483, + "step": 7625 + }, + { + "epoch": 1.3577279202279202, + "grad_norm": 0.663342297077179, + "learning_rate": 0.00014849347604809312, + "loss": 1.0405, + "step": 7626 + }, + { + "epoch": 1.357905982905983, + "grad_norm": 0.6727671027183533, + "learning_rate": 0.00014848123407917716, + "loss": 1.0389, + "step": 7627 + }, + { + "epoch": 1.3580840455840457, + "grad_norm": 0.6572692394256592, + "learning_rate": 0.0001484689911603721, + "loss": 1.0489, + "step": 7628 + }, + { + "epoch": 1.3582621082621082, + "grad_norm": 0.7629066109657288, + "learning_rate": 0.0001484567472919179, + "loss": 1.0372, + "step": 7629 + }, + { + "epoch": 1.358440170940171, + "grad_norm": 
0.7848913669586182, + "learning_rate": 0.00014844450247405435, + "loss": 0.9437, + "step": 7630 + }, + { + "epoch": 1.3586182336182335, + "grad_norm": 0.715949535369873, + "learning_rate": 0.00014843225670702143, + "loss": 1.1949, + "step": 7631 + }, + { + "epoch": 1.3587962962962963, + "grad_norm": 0.6498245596885681, + "learning_rate": 0.00014842000999105905, + "loss": 0.8845, + "step": 7632 + }, + { + "epoch": 1.358974358974359, + "grad_norm": 0.7251074910163879, + "learning_rate": 0.00014840776232640716, + "loss": 1.093, + "step": 7633 + }, + { + "epoch": 1.3591524216524218, + "grad_norm": 0.6223580837249756, + "learning_rate": 0.0001483955137133057, + "loss": 1.0344, + "step": 7634 + }, + { + "epoch": 1.3593304843304843, + "grad_norm": 0.6504943370819092, + "learning_rate": 0.00014838326415199472, + "loss": 1.109, + "step": 7635 + }, + { + "epoch": 1.359508547008547, + "grad_norm": 0.5912374258041382, + "learning_rate": 0.00014837101364271416, + "loss": 1.0756, + "step": 7636 + }, + { + "epoch": 1.3596866096866096, + "grad_norm": 0.6116467714309692, + "learning_rate": 0.00014835876218570408, + "loss": 0.7871, + "step": 7637 + }, + { + "epoch": 1.3598646723646723, + "grad_norm": 0.7013412117958069, + "learning_rate": 0.0001483465097812045, + "loss": 1.0003, + "step": 7638 + }, + { + "epoch": 1.360042735042735, + "grad_norm": 0.5930750370025635, + "learning_rate": 0.00014833425642945552, + "loss": 0.9926, + "step": 7639 + }, + { + "epoch": 1.3602207977207978, + "grad_norm": 0.732955276966095, + "learning_rate": 0.00014832200213069717, + "loss": 1.2801, + "step": 7640 + }, + { + "epoch": 1.3603988603988604, + "grad_norm": 0.6836149096488953, + "learning_rate": 0.00014830974688516958, + "loss": 0.9292, + "step": 7641 + }, + { + "epoch": 1.3605769230769231, + "grad_norm": 0.6531919836997986, + "learning_rate": 0.00014829749069311283, + "loss": 0.9551, + "step": 7642 + }, + { + "epoch": 1.3607549857549857, + "grad_norm": 0.719093382358551, + "learning_rate": 
0.0001482852335547671, + "loss": 0.8588, + "step": 7643 + }, + { + "epoch": 1.3609330484330484, + "grad_norm": 0.6144105792045593, + "learning_rate": 0.00014827297547037252, + "loss": 0.9033, + "step": 7644 + }, + { + "epoch": 1.3611111111111112, + "grad_norm": 0.789241373538971, + "learning_rate": 0.00014826071644016926, + "loss": 1.1916, + "step": 7645 + }, + { + "epoch": 1.361289173789174, + "grad_norm": 0.6137418746948242, + "learning_rate": 0.0001482484564643975, + "loss": 0.9648, + "step": 7646 + }, + { + "epoch": 1.3614672364672364, + "grad_norm": 0.6789261698722839, + "learning_rate": 0.00014823619554329745, + "loss": 0.829, + "step": 7647 + }, + { + "epoch": 1.3616452991452992, + "grad_norm": 0.6508790254592896, + "learning_rate": 0.0001482239336771094, + "loss": 0.942, + "step": 7648 + }, + { + "epoch": 1.3618233618233617, + "grad_norm": 0.6725571751594543, + "learning_rate": 0.00014821167086607353, + "loss": 0.8884, + "step": 7649 + }, + { + "epoch": 1.3620014245014245, + "grad_norm": 0.6252003908157349, + "learning_rate": 0.00014819940711043012, + "loss": 0.9778, + "step": 7650 + }, + { + "epoch": 1.3621794871794872, + "grad_norm": 0.6950626969337463, + "learning_rate": 0.00014818714241041943, + "loss": 1.2104, + "step": 7651 + }, + { + "epoch": 1.36235754985755, + "grad_norm": 0.6527379155158997, + "learning_rate": 0.0001481748767662818, + "loss": 0.7845, + "step": 7652 + }, + { + "epoch": 1.3625356125356125, + "grad_norm": 0.7438235282897949, + "learning_rate": 0.00014816261017825755, + "loss": 0.9513, + "step": 7653 + }, + { + "epoch": 1.3627136752136753, + "grad_norm": 0.6412696838378906, + "learning_rate": 0.000148150342646587, + "loss": 0.8478, + "step": 7654 + }, + { + "epoch": 1.3628917378917378, + "grad_norm": 0.658481240272522, + "learning_rate": 0.00014813807417151046, + "loss": 0.6816, + "step": 7655 + }, + { + "epoch": 1.3630698005698005, + "grad_norm": 0.6170126795768738, + "learning_rate": 0.0001481258047532684, + "loss": 0.8862, + 
"step": 7656 + }, + { + "epoch": 1.3632478632478633, + "grad_norm": 0.7049173712730408, + "learning_rate": 0.0001481135343921012, + "loss": 1.0027, + "step": 7657 + }, + { + "epoch": 1.363425925925926, + "grad_norm": 0.7780741453170776, + "learning_rate": 0.0001481012630882492, + "loss": 1.0183, + "step": 7658 + }, + { + "epoch": 1.3636039886039886, + "grad_norm": 0.6658362746238708, + "learning_rate": 0.00014808899084195286, + "loss": 0.878, + "step": 7659 + }, + { + "epoch": 1.3637820512820513, + "grad_norm": 0.7192076444625854, + "learning_rate": 0.00014807671765345267, + "loss": 1.2269, + "step": 7660 + }, + { + "epoch": 1.3639601139601139, + "grad_norm": 0.7038660049438477, + "learning_rate": 0.00014806444352298903, + "loss": 0.889, + "step": 7661 + }, + { + "epoch": 1.3641381766381766, + "grad_norm": 0.622803270816803, + "learning_rate": 0.00014805216845080249, + "loss": 0.9623, + "step": 7662 + }, + { + "epoch": 1.3643162393162394, + "grad_norm": 0.9157076478004456, + "learning_rate": 0.00014803989243713353, + "loss": 1.106, + "step": 7663 + }, + { + "epoch": 1.364494301994302, + "grad_norm": 0.6369999647140503, + "learning_rate": 0.00014802761548222268, + "loss": 0.9755, + "step": 7664 + }, + { + "epoch": 1.3646723646723646, + "grad_norm": 0.8318394422531128, + "learning_rate": 0.00014801533758631045, + "loss": 1.1786, + "step": 7665 + }, + { + "epoch": 1.3648504273504274, + "grad_norm": 0.7065796852111816, + "learning_rate": 0.00014800305874963744, + "loss": 1.2066, + "step": 7666 + }, + { + "epoch": 1.36502849002849, + "grad_norm": 0.6570265293121338, + "learning_rate": 0.0001479907789724442, + "loss": 1.0084, + "step": 7667 + }, + { + "epoch": 1.3652065527065527, + "grad_norm": 0.637321949005127, + "learning_rate": 0.00014797849825497135, + "loss": 0.9075, + "step": 7668 + }, + { + "epoch": 1.3653846153846154, + "grad_norm": 0.7656470537185669, + "learning_rate": 0.00014796621659745948, + "loss": 1.1497, + "step": 7669 + }, + { + "epoch": 
1.3655626780626782, + "grad_norm": 0.6798120737075806, + "learning_rate": 0.0001479539340001493, + "loss": 0.8154, + "step": 7670 + }, + { + "epoch": 1.3657407407407407, + "grad_norm": 0.7004328966140747, + "learning_rate": 0.0001479416504632813, + "loss": 1.0513, + "step": 7671 + }, + { + "epoch": 1.3659188034188035, + "grad_norm": 0.6551713943481445, + "learning_rate": 0.0001479293659870963, + "loss": 0.8735, + "step": 7672 + }, + { + "epoch": 1.366096866096866, + "grad_norm": 0.7685719132423401, + "learning_rate": 0.00014791708057183494, + "loss": 1.111, + "step": 7673 + }, + { + "epoch": 1.3662749287749287, + "grad_norm": 0.673624575138092, + "learning_rate": 0.0001479047942177379, + "loss": 0.9418, + "step": 7674 + }, + { + "epoch": 1.3664529914529915, + "grad_norm": 0.6281047463417053, + "learning_rate": 0.00014789250692504597, + "loss": 1.0938, + "step": 7675 + }, + { + "epoch": 1.3666310541310542, + "grad_norm": 0.5846312642097473, + "learning_rate": 0.0001478802186939998, + "loss": 0.6352, + "step": 7676 + }, + { + "epoch": 1.3668091168091168, + "grad_norm": 0.7037251591682434, + "learning_rate": 0.00014786792952484025, + "loss": 1.1775, + "step": 7677 + }, + { + "epoch": 1.3669871794871795, + "grad_norm": 0.69822758436203, + "learning_rate": 0.00014785563941780808, + "loss": 1.0877, + "step": 7678 + }, + { + "epoch": 1.367165242165242, + "grad_norm": 0.7229313254356384, + "learning_rate": 0.000147843348373144, + "loss": 1.0305, + "step": 7679 + }, + { + "epoch": 1.3673433048433048, + "grad_norm": 0.665771484375, + "learning_rate": 0.00014783105639108897, + "loss": 0.9056, + "step": 7680 + }, + { + "epoch": 1.3675213675213675, + "grad_norm": 0.6418357491493225, + "learning_rate": 0.00014781876347188367, + "loss": 0.9374, + "step": 7681 + }, + { + "epoch": 1.3676994301994303, + "grad_norm": 0.7255483269691467, + "learning_rate": 0.0001478064696157691, + "loss": 0.8533, + "step": 7682 + }, + { + "epoch": 1.3678774928774928, + "grad_norm": 0.668064534664154, 
+ "learning_rate": 0.00014779417482298603, + "loss": 0.9002, + "step": 7683 + }, + { + "epoch": 1.3680555555555556, + "grad_norm": 0.6797603368759155, + "learning_rate": 0.0001477818790937754, + "loss": 0.9733, + "step": 7684 + }, + { + "epoch": 1.368233618233618, + "grad_norm": 0.6905350685119629, + "learning_rate": 0.0001477695824283781, + "loss": 0.7985, + "step": 7685 + }, + { + "epoch": 1.3684116809116809, + "grad_norm": 0.6846137046813965, + "learning_rate": 0.00014775728482703507, + "loss": 0.9154, + "step": 7686 + }, + { + "epoch": 1.3685897435897436, + "grad_norm": 0.6686832904815674, + "learning_rate": 0.00014774498628998726, + "loss": 0.926, + "step": 7687 + }, + { + "epoch": 1.3687678062678064, + "grad_norm": 0.7050234079360962, + "learning_rate": 0.00014773268681747561, + "loss": 0.9386, + "step": 7688 + }, + { + "epoch": 1.368945868945869, + "grad_norm": 0.7048354744911194, + "learning_rate": 0.00014772038640974112, + "loss": 1.1483, + "step": 7689 + }, + { + "epoch": 1.3691239316239316, + "grad_norm": 0.698192298412323, + "learning_rate": 0.0001477080850670248, + "loss": 1.1452, + "step": 7690 + }, + { + "epoch": 1.3693019943019942, + "grad_norm": 0.6838962435722351, + "learning_rate": 0.00014769578278956766, + "loss": 0.9789, + "step": 7691 + }, + { + "epoch": 1.369480056980057, + "grad_norm": 0.6636955142021179, + "learning_rate": 0.00014768347957761074, + "loss": 0.931, + "step": 7692 + }, + { + "epoch": 1.3696581196581197, + "grad_norm": 0.706030547618866, + "learning_rate": 0.0001476711754313951, + "loss": 1.1096, + "step": 7693 + }, + { + "epoch": 1.3698361823361824, + "grad_norm": 0.6771288514137268, + "learning_rate": 0.00014765887035116178, + "loss": 0.9641, + "step": 7694 + }, + { + "epoch": 1.370014245014245, + "grad_norm": 0.6805008053779602, + "learning_rate": 0.00014764656433715188, + "loss": 0.8724, + "step": 7695 + }, + { + "epoch": 1.3701923076923077, + "grad_norm": 0.6599233746528625, + "learning_rate": 0.00014763425738960657, + 
"loss": 0.8477, + "step": 7696 + }, + { + "epoch": 1.3703703703703702, + "grad_norm": 0.7036116123199463, + "learning_rate": 0.0001476219495087669, + "loss": 1.0991, + "step": 7697 + }, + { + "epoch": 1.370548433048433, + "grad_norm": 0.6677989363670349, + "learning_rate": 0.0001476096406948741, + "loss": 1.2397, + "step": 7698 + }, + { + "epoch": 1.3707264957264957, + "grad_norm": 0.5652269721031189, + "learning_rate": 0.00014759733094816928, + "loss": 0.9302, + "step": 7699 + }, + { + "epoch": 1.3709045584045585, + "grad_norm": 0.6670156121253967, + "learning_rate": 0.00014758502026889362, + "loss": 0.8362, + "step": 7700 + }, + { + "epoch": 1.371082621082621, + "grad_norm": 0.6705406904220581, + "learning_rate": 0.00014757270865728832, + "loss": 0.876, + "step": 7701 + }, + { + "epoch": 1.3712606837606838, + "grad_norm": 0.6020053625106812, + "learning_rate": 0.00014756039611359465, + "loss": 0.9182, + "step": 7702 + }, + { + "epoch": 1.3714387464387463, + "grad_norm": 0.6370134949684143, + "learning_rate": 0.0001475480826380538, + "loss": 1.1063, + "step": 7703 + }, + { + "epoch": 1.371616809116809, + "grad_norm": 0.6906460523605347, + "learning_rate": 0.00014753576823090705, + "loss": 0.988, + "step": 7704 + }, + { + "epoch": 1.3717948717948718, + "grad_norm": 0.6047569513320923, + "learning_rate": 0.00014752345289239567, + "loss": 1.15, + "step": 7705 + }, + { + "epoch": 1.3719729344729346, + "grad_norm": 0.7019868493080139, + "learning_rate": 0.00014751113662276095, + "loss": 1.1185, + "step": 7706 + }, + { + "epoch": 1.372150997150997, + "grad_norm": 0.6534035801887512, + "learning_rate": 0.00014749881942224417, + "loss": 0.9006, + "step": 7707 + }, + { + "epoch": 1.3723290598290598, + "grad_norm": 0.6111651659011841, + "learning_rate": 0.00014748650129108674, + "loss": 0.935, + "step": 7708 + }, + { + "epoch": 1.3725071225071226, + "grad_norm": 0.6678512096405029, + "learning_rate": 0.00014747418222952995, + "loss": 0.8771, + "step": 7709 + }, + { + 
"epoch": 1.3726851851851851, + "grad_norm": 0.607829749584198, + "learning_rate": 0.00014746186223781518, + "loss": 1.0509, + "step": 7710 + }, + { + "epoch": 1.3728632478632479, + "grad_norm": 0.7274412512779236, + "learning_rate": 0.00014744954131618382, + "loss": 0.9545, + "step": 7711 + }, + { + "epoch": 1.3730413105413106, + "grad_norm": 0.640333354473114, + "learning_rate": 0.00014743721946487723, + "loss": 1.018, + "step": 7712 + }, + { + "epoch": 1.3732193732193732, + "grad_norm": 0.6772079467773438, + "learning_rate": 0.0001474248966841369, + "loss": 1.0983, + "step": 7713 + }, + { + "epoch": 1.373397435897436, + "grad_norm": 0.49630534648895264, + "learning_rate": 0.00014741257297420422, + "loss": 0.5238, + "step": 7714 + }, + { + "epoch": 1.3735754985754987, + "grad_norm": 0.6316596269607544, + "learning_rate": 0.00014740024833532068, + "loss": 1.1342, + "step": 7715 + }, + { + "epoch": 1.3737535612535612, + "grad_norm": 0.5928404331207275, + "learning_rate": 0.00014738792276772775, + "loss": 0.7987, + "step": 7716 + }, + { + "epoch": 1.373931623931624, + "grad_norm": 0.6773418188095093, + "learning_rate": 0.00014737559627166688, + "loss": 0.934, + "step": 7717 + }, + { + "epoch": 1.3741096866096867, + "grad_norm": 0.7895028591156006, + "learning_rate": 0.00014736326884737963, + "loss": 0.984, + "step": 7718 + }, + { + "epoch": 1.3742877492877492, + "grad_norm": 0.7074753046035767, + "learning_rate": 0.00014735094049510752, + "loss": 1.0093, + "step": 7719 + }, + { + "epoch": 1.374465811965812, + "grad_norm": 0.5389847159385681, + "learning_rate": 0.00014733861121509208, + "loss": 0.8138, + "step": 7720 + }, + { + "epoch": 1.3746438746438747, + "grad_norm": 0.6138495206832886, + "learning_rate": 0.00014732628100757493, + "loss": 0.9282, + "step": 7721 + }, + { + "epoch": 1.3748219373219372, + "grad_norm": 0.7609560489654541, + "learning_rate": 0.00014731394987279757, + "loss": 0.9859, + "step": 7722 + }, + { + "epoch": 1.375, + "grad_norm": 
0.6806198954582214, + "learning_rate": 0.00014730161781100165, + "loss": 0.8932, + "step": 7723 + }, + { + "epoch": 1.3751780626780628, + "grad_norm": 0.7229103446006775, + "learning_rate": 0.0001472892848224288, + "loss": 0.956, + "step": 7724 + }, + { + "epoch": 1.3753561253561253, + "grad_norm": 0.6157994866371155, + "learning_rate": 0.00014727695090732066, + "loss": 1.0285, + "step": 7725 + }, + { + "epoch": 1.375534188034188, + "grad_norm": 0.5885980129241943, + "learning_rate": 0.00014726461606591885, + "loss": 0.9174, + "step": 7726 + }, + { + "epoch": 1.3757122507122508, + "grad_norm": 0.6655769944190979, + "learning_rate": 0.0001472522802984651, + "loss": 0.9059, + "step": 7727 + }, + { + "epoch": 1.3758903133903133, + "grad_norm": 0.7075541019439697, + "learning_rate": 0.00014723994360520105, + "loss": 1.0055, + "step": 7728 + }, + { + "epoch": 1.376068376068376, + "grad_norm": 0.6947159171104431, + "learning_rate": 0.00014722760598636847, + "loss": 0.9782, + "step": 7729 + }, + { + "epoch": 1.3762464387464388, + "grad_norm": 0.6629964709281921, + "learning_rate": 0.00014721526744220905, + "loss": 0.9427, + "step": 7730 + }, + { + "epoch": 1.3764245014245013, + "grad_norm": 0.7385284304618835, + "learning_rate": 0.00014720292797296453, + "loss": 0.9953, + "step": 7731 + }, + { + "epoch": 1.376602564102564, + "grad_norm": 0.6123563051223755, + "learning_rate": 0.0001471905875788767, + "loss": 1.0103, + "step": 7732 + }, + { + "epoch": 1.3767806267806268, + "grad_norm": 0.6457047462463379, + "learning_rate": 0.00014717824626018732, + "loss": 0.9779, + "step": 7733 + }, + { + "epoch": 1.3769586894586894, + "grad_norm": 0.6196442246437073, + "learning_rate": 0.00014716590401713824, + "loss": 0.8747, + "step": 7734 + }, + { + "epoch": 1.3771367521367521, + "grad_norm": 0.7932298183441162, + "learning_rate": 0.00014715356084997122, + "loss": 1.1617, + "step": 7735 + }, + { + "epoch": 1.3773148148148149, + "grad_norm": 0.787304699420929, + "learning_rate": 
0.00014714121675892815, + "loss": 1.1383, + "step": 7736 + }, + { + "epoch": 1.3774928774928774, + "grad_norm": 0.672795295715332, + "learning_rate": 0.00014712887174425085, + "loss": 1.2563, + "step": 7737 + }, + { + "epoch": 1.3776709401709402, + "grad_norm": 0.6505744457244873, + "learning_rate": 0.00014711652580618123, + "loss": 0.9194, + "step": 7738 + }, + { + "epoch": 1.377849002849003, + "grad_norm": 0.8141193985939026, + "learning_rate": 0.00014710417894496115, + "loss": 1.1428, + "step": 7739 + }, + { + "epoch": 1.3780270655270654, + "grad_norm": 0.6269707679748535, + "learning_rate": 0.00014709183116083253, + "loss": 0.7164, + "step": 7740 + }, + { + "epoch": 1.3782051282051282, + "grad_norm": 0.6737076640129089, + "learning_rate": 0.0001470794824540373, + "loss": 0.9965, + "step": 7741 + }, + { + "epoch": 1.378383190883191, + "grad_norm": 0.6451728343963623, + "learning_rate": 0.0001470671328248174, + "loss": 1.0539, + "step": 7742 + }, + { + "epoch": 1.3785612535612537, + "grad_norm": 0.6480295062065125, + "learning_rate": 0.00014705478227341486, + "loss": 0.9118, + "step": 7743 + }, + { + "epoch": 1.3787393162393162, + "grad_norm": 0.7429090738296509, + "learning_rate": 0.00014704243080007154, + "loss": 1.0031, + "step": 7744 + }, + { + "epoch": 1.378917378917379, + "grad_norm": 0.5601376891136169, + "learning_rate": 0.00014703007840502955, + "loss": 0.849, + "step": 7745 + }, + { + "epoch": 1.3790954415954415, + "grad_norm": 0.7067657113075256, + "learning_rate": 0.00014701772508853088, + "loss": 1.3067, + "step": 7746 + }, + { + "epoch": 1.3792735042735043, + "grad_norm": 0.7016390562057495, + "learning_rate": 0.00014700537085081755, + "loss": 1.0236, + "step": 7747 + }, + { + "epoch": 1.379451566951567, + "grad_norm": 0.6505000591278076, + "learning_rate": 0.0001469930156921316, + "loss": 1.0121, + "step": 7748 + }, + { + "epoch": 1.3796296296296298, + "grad_norm": 0.8515380620956421, + "learning_rate": 0.00014698065961271512, + "loss": 1.0413, + 
"step": 7749 + }, + { + "epoch": 1.3798076923076923, + "grad_norm": 0.6322008371353149, + "learning_rate": 0.00014696830261281025, + "loss": 0.8306, + "step": 7750 + }, + { + "epoch": 1.379985754985755, + "grad_norm": 0.7090431451797485, + "learning_rate": 0.00014695594469265902, + "loss": 1.1829, + "step": 7751 + }, + { + "epoch": 1.3801638176638176, + "grad_norm": 0.5913167595863342, + "learning_rate": 0.00014694358585250363, + "loss": 0.9769, + "step": 7752 + }, + { + "epoch": 1.3803418803418803, + "grad_norm": 0.7345432639122009, + "learning_rate": 0.00014693122609258616, + "loss": 0.9928, + "step": 7753 + }, + { + "epoch": 1.380519943019943, + "grad_norm": 0.6158214211463928, + "learning_rate": 0.00014691886541314884, + "loss": 1.1166, + "step": 7754 + }, + { + "epoch": 1.3806980056980058, + "grad_norm": 0.6874041557312012, + "learning_rate": 0.0001469065038144338, + "loss": 1.0808, + "step": 7755 + }, + { + "epoch": 1.3808760683760684, + "grad_norm": 0.8135195970535278, + "learning_rate": 0.00014689414129668326, + "loss": 0.9482, + "step": 7756 + }, + { + "epoch": 1.381054131054131, + "grad_norm": 0.6389174461364746, + "learning_rate": 0.00014688177786013944, + "loss": 1.039, + "step": 7757 + }, + { + "epoch": 1.3812321937321936, + "grad_norm": 0.6953016519546509, + "learning_rate": 0.00014686941350504454, + "loss": 0.9426, + "step": 7758 + }, + { + "epoch": 1.3814102564102564, + "grad_norm": 0.8171859383583069, + "learning_rate": 0.00014685704823164087, + "loss": 1.0393, + "step": 7759 + }, + { + "epoch": 1.3815883190883191, + "grad_norm": 0.6968414783477783, + "learning_rate": 0.0001468446820401707, + "loss": 1.1167, + "step": 7760 + }, + { + "epoch": 1.381766381766382, + "grad_norm": 0.6916623711585999, + "learning_rate": 0.00014683231493087628, + "loss": 1.1886, + "step": 7761 + }, + { + "epoch": 1.3819444444444444, + "grad_norm": 0.7351683378219604, + "learning_rate": 0.00014681994690399992, + "loss": 0.9893, + "step": 7762 + }, + { + "epoch": 
1.3821225071225072, + "grad_norm": 0.6617491245269775, + "learning_rate": 0.00014680757795978395, + "loss": 1.0505, + "step": 7763 + }, + { + "epoch": 1.3823005698005697, + "grad_norm": 0.6627485156059265, + "learning_rate": 0.00014679520809847074, + "loss": 0.9878, + "step": 7764 + }, + { + "epoch": 1.3824786324786325, + "grad_norm": 0.704636812210083, + "learning_rate": 0.00014678283732030264, + "loss": 0.8332, + "step": 7765 + }, + { + "epoch": 1.3826566951566952, + "grad_norm": 0.698853075504303, + "learning_rate": 0.00014677046562552203, + "loss": 1.0926, + "step": 7766 + }, + { + "epoch": 1.382834757834758, + "grad_norm": 0.6695869565010071, + "learning_rate": 0.0001467580930143713, + "loss": 1.0626, + "step": 7767 + }, + { + "epoch": 1.3830128205128205, + "grad_norm": 0.672173023223877, + "learning_rate": 0.00014674571948709286, + "loss": 0.8842, + "step": 7768 + }, + { + "epoch": 1.3831908831908832, + "grad_norm": 0.6735473871231079, + "learning_rate": 0.00014673334504392916, + "loss": 0.9382, + "step": 7769 + }, + { + "epoch": 1.3833689458689458, + "grad_norm": 0.6864013075828552, + "learning_rate": 0.00014672096968512265, + "loss": 1.1369, + "step": 7770 + }, + { + "epoch": 1.3835470085470085, + "grad_norm": 0.7154954075813293, + "learning_rate": 0.0001467085934109158, + "loss": 1.1447, + "step": 7771 + }, + { + "epoch": 1.3837250712250713, + "grad_norm": 0.5934487581253052, + "learning_rate": 0.0001466962162215511, + "loss": 0.8923, + "step": 7772 + }, + { + "epoch": 1.383903133903134, + "grad_norm": 0.8116832971572876, + "learning_rate": 0.00014668383811727097, + "loss": 1.0997, + "step": 7773 + }, + { + "epoch": 1.3840811965811965, + "grad_norm": 0.8661674857139587, + "learning_rate": 0.00014667145909831808, + "loss": 1.0112, + "step": 7774 + }, + { + "epoch": 1.3842592592592593, + "grad_norm": 0.5173856616020203, + "learning_rate": 0.00014665907916493488, + "loss": 0.6571, + "step": 7775 + }, + { + "epoch": 1.3844373219373218, + "grad_norm": 
0.6165067553520203, + "learning_rate": 0.00014664669831736395, + "loss": 1.0992, + "step": 7776 + }, + { + "epoch": 1.3846153846153846, + "grad_norm": 0.6564429998397827, + "learning_rate": 0.00014663431655584787, + "loss": 0.9103, + "step": 7777 + }, + { + "epoch": 1.3847934472934473, + "grad_norm": 0.7162124514579773, + "learning_rate": 0.00014662193388062923, + "loss": 1.0645, + "step": 7778 + }, + { + "epoch": 1.38497150997151, + "grad_norm": 0.6391215920448303, + "learning_rate": 0.00014660955029195064, + "loss": 0.902, + "step": 7779 + }, + { + "epoch": 1.3851495726495726, + "grad_norm": 0.6876635551452637, + "learning_rate": 0.00014659716579005475, + "loss": 1.0924, + "step": 7780 + }, + { + "epoch": 1.3853276353276354, + "grad_norm": 0.7254653573036194, + "learning_rate": 0.00014658478037518418, + "loss": 1.0135, + "step": 7781 + }, + { + "epoch": 1.385505698005698, + "grad_norm": 0.6900535225868225, + "learning_rate": 0.00014657239404758162, + "loss": 0.983, + "step": 7782 + }, + { + "epoch": 1.3856837606837606, + "grad_norm": 0.7477042078971863, + "learning_rate": 0.00014656000680748975, + "loss": 1.0707, + "step": 7783 + }, + { + "epoch": 1.3858618233618234, + "grad_norm": 0.5756927132606506, + "learning_rate": 0.00014654761865515124, + "loss": 0.8881, + "step": 7784 + }, + { + "epoch": 1.3860398860398861, + "grad_norm": 0.6736083626747131, + "learning_rate": 0.00014653522959080884, + "loss": 1.0193, + "step": 7785 + }, + { + "epoch": 1.3862179487179487, + "grad_norm": 0.616179883480072, + "learning_rate": 0.0001465228396147053, + "loss": 0.8676, + "step": 7786 + }, + { + "epoch": 1.3863960113960114, + "grad_norm": 0.7956456542015076, + "learning_rate": 0.00014651044872708338, + "loss": 0.9787, + "step": 7787 + }, + { + "epoch": 1.386574074074074, + "grad_norm": 0.6613463163375854, + "learning_rate": 0.00014649805692818578, + "loss": 1.0032, + "step": 7788 + }, + { + "epoch": 1.3867521367521367, + "grad_norm": 0.6215800642967224, + "learning_rate": 
0.0001464856642182554, + "loss": 1.0123, + "step": 7789 + }, + { + "epoch": 1.3869301994301995, + "grad_norm": 0.6701171398162842, + "learning_rate": 0.00014647327059753496, + "loss": 0.9108, + "step": 7790 + }, + { + "epoch": 1.3871082621082622, + "grad_norm": 0.6213465929031372, + "learning_rate": 0.00014646087606626736, + "loss": 0.9313, + "step": 7791 + }, + { + "epoch": 1.3872863247863247, + "grad_norm": 0.7535304427146912, + "learning_rate": 0.00014644848062469535, + "loss": 1.0813, + "step": 7792 + }, + { + "epoch": 1.3874643874643875, + "grad_norm": 0.6778230667114258, + "learning_rate": 0.0001464360842730619, + "loss": 1.0405, + "step": 7793 + }, + { + "epoch": 1.38764245014245, + "grad_norm": 0.7816025614738464, + "learning_rate": 0.0001464236870116098, + "loss": 0.9228, + "step": 7794 + }, + { + "epoch": 1.3878205128205128, + "grad_norm": 0.6815229058265686, + "learning_rate": 0.00014641128884058203, + "loss": 0.9607, + "step": 7795 + }, + { + "epoch": 1.3879985754985755, + "grad_norm": 0.7027714848518372, + "learning_rate": 0.00014639888976022145, + "loss": 0.9379, + "step": 7796 + }, + { + "epoch": 1.3881766381766383, + "grad_norm": 0.7636353373527527, + "learning_rate": 0.00014638648977077104, + "loss": 1.1186, + "step": 7797 + }, + { + "epoch": 1.3883547008547008, + "grad_norm": 0.6732974052429199, + "learning_rate": 0.00014637408887247365, + "loss": 1.1378, + "step": 7798 + }, + { + "epoch": 1.3885327635327636, + "grad_norm": 0.7539397478103638, + "learning_rate": 0.0001463616870655724, + "loss": 0.999, + "step": 7799 + }, + { + "epoch": 1.388710826210826, + "grad_norm": 0.6872972846031189, + "learning_rate": 0.00014634928435031013, + "loss": 0.9564, + "step": 7800 + }, + { + "epoch": 1.3888888888888888, + "grad_norm": 0.6823115348815918, + "learning_rate": 0.00014633688072693, + "loss": 0.9745, + "step": 7801 + }, + { + "epoch": 1.3890669515669516, + "grad_norm": 0.6462571620941162, + "learning_rate": 0.00014632447619567488, + "loss": 0.8314, + 
"step": 7802 + }, + { + "epoch": 1.3892450142450143, + "grad_norm": 0.7245402932167053, + "learning_rate": 0.0001463120707567879, + "loss": 0.8291, + "step": 7803 + }, + { + "epoch": 1.3894230769230769, + "grad_norm": 0.697179913520813, + "learning_rate": 0.00014629966441051208, + "loss": 1.017, + "step": 7804 + }, + { + "epoch": 1.3896011396011396, + "grad_norm": 0.6304250359535217, + "learning_rate": 0.00014628725715709053, + "loss": 0.9262, + "step": 7805 + }, + { + "epoch": 1.3897792022792022, + "grad_norm": 0.5780240297317505, + "learning_rate": 0.00014627484899676634, + "loss": 0.6596, + "step": 7806 + }, + { + "epoch": 1.389957264957265, + "grad_norm": 0.8030684590339661, + "learning_rate": 0.0001462624399297826, + "loss": 0.9977, + "step": 7807 + }, + { + "epoch": 1.3901353276353277, + "grad_norm": 0.7999774813652039, + "learning_rate": 0.00014625002995638246, + "loss": 1.1036, + "step": 7808 + }, + { + "epoch": 1.3903133903133904, + "grad_norm": 0.7054862976074219, + "learning_rate": 0.00014623761907680904, + "loss": 1.1435, + "step": 7809 + }, + { + "epoch": 1.390491452991453, + "grad_norm": 0.6660647392272949, + "learning_rate": 0.00014622520729130556, + "loss": 0.703, + "step": 7810 + }, + { + "epoch": 1.3906695156695157, + "grad_norm": 0.6339690089225769, + "learning_rate": 0.00014621279460011515, + "loss": 1.0451, + "step": 7811 + }, + { + "epoch": 1.3908475783475782, + "grad_norm": 0.8568736910820007, + "learning_rate": 0.00014620038100348102, + "loss": 1.009, + "step": 7812 + }, + { + "epoch": 1.391025641025641, + "grad_norm": 0.7126797437667847, + "learning_rate": 0.00014618796650164642, + "loss": 0.9592, + "step": 7813 + }, + { + "epoch": 1.3912037037037037, + "grad_norm": 0.6768994331359863, + "learning_rate": 0.00014617555109485453, + "loss": 1.09, + "step": 7814 + }, + { + "epoch": 1.3913817663817665, + "grad_norm": 0.7609471678733826, + "learning_rate": 0.00014616313478334864, + "loss": 0.9781, + "step": 7815 + }, + { + "epoch": 
1.391559829059829, + "grad_norm": 0.7107006907463074, + "learning_rate": 0.00014615071756737203, + "loss": 0.9769, + "step": 7816 + }, + { + "epoch": 1.3917378917378918, + "grad_norm": 0.6324763894081116, + "learning_rate": 0.00014613829944716802, + "loss": 1.089, + "step": 7817 + }, + { + "epoch": 1.3919159544159543, + "grad_norm": 0.6617186069488525, + "learning_rate": 0.00014612588042297984, + "loss": 1.0466, + "step": 7818 + }, + { + "epoch": 1.392094017094017, + "grad_norm": 0.7881436944007874, + "learning_rate": 0.00014611346049505083, + "loss": 1.003, + "step": 7819 + }, + { + "epoch": 1.3922720797720798, + "grad_norm": 0.7391049861907959, + "learning_rate": 0.00014610103966362437, + "loss": 1.0531, + "step": 7820 + }, + { + "epoch": 1.3924501424501425, + "grad_norm": 0.6299472451210022, + "learning_rate": 0.00014608861792894383, + "loss": 0.8433, + "step": 7821 + }, + { + "epoch": 1.392628205128205, + "grad_norm": 0.6053452491760254, + "learning_rate": 0.00014607619529125255, + "loss": 0.7945, + "step": 7822 + }, + { + "epoch": 1.3928062678062678, + "grad_norm": 0.7160114645957947, + "learning_rate": 0.0001460637717507939, + "loss": 1.1604, + "step": 7823 + }, + { + "epoch": 1.3929843304843303, + "grad_norm": 0.6308854222297668, + "learning_rate": 0.00014605134730781135, + "loss": 1.0918, + "step": 7824 + }, + { + "epoch": 1.393162393162393, + "grad_norm": 0.7187000513076782, + "learning_rate": 0.00014603892196254833, + "loss": 1.0594, + "step": 7825 + }, + { + "epoch": 1.3933404558404558, + "grad_norm": 0.7516581416130066, + "learning_rate": 0.00014602649571524826, + "loss": 0.9222, + "step": 7826 + }, + { + "epoch": 1.3935185185185186, + "grad_norm": 0.6340481638908386, + "learning_rate": 0.00014601406856615463, + "loss": 0.8131, + "step": 7827 + }, + { + "epoch": 1.3936965811965811, + "grad_norm": 0.8161744475364685, + "learning_rate": 0.0001460016405155109, + "loss": 0.8695, + "step": 7828 + }, + { + "epoch": 1.3938746438746439, + "grad_norm": 
0.6926971077919006, + "learning_rate": 0.0001459892115635606, + "loss": 0.9548, + "step": 7829 + }, + { + "epoch": 1.3940527065527066, + "grad_norm": 0.6669796109199524, + "learning_rate": 0.0001459767817105472, + "loss": 0.9255, + "step": 7830 + }, + { + "epoch": 1.3942307692307692, + "grad_norm": 0.6626184582710266, + "learning_rate": 0.00014596435095671432, + "loss": 1.1141, + "step": 7831 + }, + { + "epoch": 1.394408831908832, + "grad_norm": 0.6755738854408264, + "learning_rate": 0.00014595191930230546, + "loss": 0.9596, + "step": 7832 + }, + { + "epoch": 1.3945868945868947, + "grad_norm": 0.6034863591194153, + "learning_rate": 0.00014593948674756417, + "loss": 0.8088, + "step": 7833 + }, + { + "epoch": 1.3947649572649572, + "grad_norm": 0.5638226866722107, + "learning_rate": 0.00014592705329273406, + "loss": 0.5828, + "step": 7834 + }, + { + "epoch": 1.39494301994302, + "grad_norm": 0.6902222633361816, + "learning_rate": 0.0001459146189380588, + "loss": 0.7954, + "step": 7835 + }, + { + "epoch": 1.3951210826210827, + "grad_norm": 0.7579947710037231, + "learning_rate": 0.0001459021836837819, + "loss": 1.1301, + "step": 7836 + }, + { + "epoch": 1.3952991452991452, + "grad_norm": 0.6894911527633667, + "learning_rate": 0.00014588974753014712, + "loss": 1.082, + "step": 7837 + }, + { + "epoch": 1.395477207977208, + "grad_norm": 0.6330230832099915, + "learning_rate": 0.000145877310477398, + "loss": 0.7614, + "step": 7838 + }, + { + "epoch": 1.3956552706552707, + "grad_norm": 0.6164960265159607, + "learning_rate": 0.00014586487252577832, + "loss": 0.8981, + "step": 7839 + }, + { + "epoch": 1.3958333333333333, + "grad_norm": 0.6575061678886414, + "learning_rate": 0.0001458524336755317, + "loss": 0.9735, + "step": 7840 + }, + { + "epoch": 1.396011396011396, + "grad_norm": 0.687921941280365, + "learning_rate": 0.00014583999392690195, + "loss": 0.9207, + "step": 7841 + }, + { + "epoch": 1.3961894586894588, + "grad_norm": 0.6175212860107422, + "learning_rate": 
0.00014582755328013274, + "loss": 1.0444, + "step": 7842 + }, + { + "epoch": 1.3963675213675213, + "grad_norm": 0.6351733207702637, + "learning_rate": 0.00014581511173546781, + "loss": 1.0143, + "step": 7843 + }, + { + "epoch": 1.396545584045584, + "grad_norm": 0.7235051989555359, + "learning_rate": 0.00014580266929315093, + "loss": 0.9108, + "step": 7844 + }, + { + "epoch": 1.3967236467236468, + "grad_norm": 0.6432043313980103, + "learning_rate": 0.00014579022595342586, + "loss": 0.8674, + "step": 7845 + }, + { + "epoch": 1.3969017094017093, + "grad_norm": 0.7775412797927856, + "learning_rate": 0.00014577778171653648, + "loss": 1.0637, + "step": 7846 + }, + { + "epoch": 1.397079772079772, + "grad_norm": 0.6748763918876648, + "learning_rate": 0.00014576533658272655, + "loss": 1.0356, + "step": 7847 + }, + { + "epoch": 1.3972578347578348, + "grad_norm": 0.6940401196479797, + "learning_rate": 0.00014575289055223994, + "loss": 0.9937, + "step": 7848 + }, + { + "epoch": 1.3974358974358974, + "grad_norm": 0.6971304416656494, + "learning_rate": 0.00014574044362532045, + "loss": 0.9753, + "step": 7849 + }, + { + "epoch": 1.39761396011396, + "grad_norm": 0.6576017141342163, + "learning_rate": 0.00014572799580221197, + "loss": 1.1233, + "step": 7850 + }, + { + "epoch": 1.3977920227920229, + "grad_norm": 0.6270702481269836, + "learning_rate": 0.00014571554708315843, + "loss": 0.9771, + "step": 7851 + }, + { + "epoch": 1.3979700854700854, + "grad_norm": 0.6898425817489624, + "learning_rate": 0.00014570309746840372, + "loss": 0.9235, + "step": 7852 + }, + { + "epoch": 1.3981481481481481, + "grad_norm": 0.7017102241516113, + "learning_rate": 0.00014569064695819174, + "loss": 1.1056, + "step": 7853 + }, + { + "epoch": 1.398326210826211, + "grad_norm": 0.6298288702964783, + "learning_rate": 0.00014567819555276647, + "loss": 0.8635, + "step": 7854 + }, + { + "epoch": 1.3985042735042734, + "grad_norm": 0.7173134684562683, + "learning_rate": 0.00014566574325237182, + "loss": 1.0893, 
+ "step": 7855 + }, + { + "epoch": 1.3986823361823362, + "grad_norm": 0.7541036605834961, + "learning_rate": 0.0001456532900572518, + "loss": 1.0996, + "step": 7856 + }, + { + "epoch": 1.398860398860399, + "grad_norm": 0.6204771399497986, + "learning_rate": 0.0001456408359676504, + "loss": 0.7601, + "step": 7857 + }, + { + "epoch": 1.3990384615384617, + "grad_norm": 0.629557192325592, + "learning_rate": 0.00014562838098381163, + "loss": 0.9239, + "step": 7858 + }, + { + "epoch": 1.3992165242165242, + "grad_norm": 0.6878390908241272, + "learning_rate": 0.00014561592510597954, + "loss": 0.9641, + "step": 7859 + }, + { + "epoch": 1.399394586894587, + "grad_norm": 0.7490049004554749, + "learning_rate": 0.00014560346833439813, + "loss": 1.0198, + "step": 7860 + }, + { + "epoch": 1.3995726495726495, + "grad_norm": 0.6337960958480835, + "learning_rate": 0.0001455910106693115, + "loss": 0.8709, + "step": 7861 + }, + { + "epoch": 1.3997507122507122, + "grad_norm": 0.6210524439811707, + "learning_rate": 0.0001455785521109637, + "loss": 1.1049, + "step": 7862 + }, + { + "epoch": 1.399928774928775, + "grad_norm": 0.7894936203956604, + "learning_rate": 0.00014556609265959887, + "loss": 0.8933, + "step": 7863 + }, + { + "epoch": 1.4001068376068377, + "grad_norm": 0.6888098120689392, + "learning_rate": 0.00014555363231546112, + "loss": 0.9738, + "step": 7864 + }, + { + "epoch": 1.4002849002849003, + "grad_norm": 0.608799934387207, + "learning_rate": 0.00014554117107879456, + "loss": 0.9103, + "step": 7865 + }, + { + "epoch": 1.400462962962963, + "grad_norm": 0.7390474081039429, + "learning_rate": 0.00014552870894984335, + "loss": 1.2484, + "step": 7866 + }, + { + "epoch": 1.4006410256410255, + "grad_norm": 0.6513381600379944, + "learning_rate": 0.00014551624592885169, + "loss": 0.8523, + "step": 7867 + }, + { + "epoch": 1.4008190883190883, + "grad_norm": 0.6357464790344238, + "learning_rate": 0.00014550378201606373, + "loss": 0.9594, + "step": 7868 + }, + { + "epoch": 
1.400997150997151, + "grad_norm": 0.6893286108970642, + "learning_rate": 0.0001454913172117237, + "loss": 0.9798, + "step": 7869 + }, + { + "epoch": 1.4011752136752138, + "grad_norm": 0.6566550731658936, + "learning_rate": 0.0001454788515160758, + "loss": 1.0532, + "step": 7870 + }, + { + "epoch": 1.4013532763532763, + "grad_norm": 0.6442158222198486, + "learning_rate": 0.00014546638492936425, + "loss": 1.0789, + "step": 7871 + }, + { + "epoch": 1.401531339031339, + "grad_norm": 0.7570971846580505, + "learning_rate": 0.0001454539174518334, + "loss": 0.9806, + "step": 7872 + }, + { + "epoch": 1.4017094017094016, + "grad_norm": 0.6180047392845154, + "learning_rate": 0.0001454414490837274, + "loss": 0.857, + "step": 7873 + }, + { + "epoch": 1.4018874643874644, + "grad_norm": 0.7143170237541199, + "learning_rate": 0.0001454289798252906, + "loss": 0.8815, + "step": 7874 + }, + { + "epoch": 1.4020655270655271, + "grad_norm": 0.6388922929763794, + "learning_rate": 0.00014541650967676736, + "loss": 0.95, + "step": 7875 + }, + { + "epoch": 1.4022435897435899, + "grad_norm": 0.7137351632118225, + "learning_rate": 0.00014540403863840193, + "loss": 0.8973, + "step": 7876 + }, + { + "epoch": 1.4024216524216524, + "grad_norm": 0.656315267086029, + "learning_rate": 0.0001453915667104387, + "loss": 1.149, + "step": 7877 + }, + { + "epoch": 1.4025997150997151, + "grad_norm": 0.7234711647033691, + "learning_rate": 0.000145379093893122, + "loss": 0.9798, + "step": 7878 + }, + { + "epoch": 1.4027777777777777, + "grad_norm": 0.6595289707183838, + "learning_rate": 0.00014536662018669623, + "loss": 1.2704, + "step": 7879 + }, + { + "epoch": 1.4029558404558404, + "grad_norm": 0.6760551333427429, + "learning_rate": 0.00014535414559140576, + "loss": 0.8672, + "step": 7880 + }, + { + "epoch": 1.4031339031339032, + "grad_norm": 0.5916706919670105, + "learning_rate": 0.000145341670107495, + "loss": 0.888, + "step": 7881 + }, + { + "epoch": 1.403311965811966, + "grad_norm": 0.7272133231163025, 
+ "learning_rate": 0.00014532919373520846, + "loss": 1.0466, + "step": 7882 + }, + { + "epoch": 1.4034900284900285, + "grad_norm": 0.8512467741966248, + "learning_rate": 0.00014531671647479048, + "loss": 1.2482, + "step": 7883 + }, + { + "epoch": 1.4036680911680912, + "grad_norm": 0.5536492466926575, + "learning_rate": 0.0001453042383264856, + "loss": 0.7823, + "step": 7884 + }, + { + "epoch": 1.4038461538461537, + "grad_norm": 0.7262215614318848, + "learning_rate": 0.0001452917592905383, + "loss": 0.9713, + "step": 7885 + }, + { + "epoch": 1.4040242165242165, + "grad_norm": 0.7146059274673462, + "learning_rate": 0.00014527927936719304, + "loss": 1.1064, + "step": 7886 + }, + { + "epoch": 1.4042022792022792, + "grad_norm": 0.5915318131446838, + "learning_rate": 0.00014526679855669436, + "loss": 0.8567, + "step": 7887 + }, + { + "epoch": 1.404380341880342, + "grad_norm": 0.6548298001289368, + "learning_rate": 0.00014525431685928682, + "loss": 1.1359, + "step": 7888 + }, + { + "epoch": 1.4045584045584045, + "grad_norm": 0.7482563853263855, + "learning_rate": 0.0001452418342752149, + "loss": 0.9095, + "step": 7889 + }, + { + "epoch": 1.4047364672364673, + "grad_norm": 0.6660130023956299, + "learning_rate": 0.0001452293508047233, + "loss": 1.2343, + "step": 7890 + }, + { + "epoch": 1.4049145299145298, + "grad_norm": 0.7457148432731628, + "learning_rate": 0.00014521686644805644, + "loss": 1.2086, + "step": 7891 + }, + { + "epoch": 1.4050925925925926, + "grad_norm": 0.5957929491996765, + "learning_rate": 0.00014520438120545906, + "loss": 0.9724, + "step": 7892 + }, + { + "epoch": 1.4052706552706553, + "grad_norm": 0.6832270622253418, + "learning_rate": 0.00014519189507717573, + "loss": 0.9903, + "step": 7893 + }, + { + "epoch": 1.405448717948718, + "grad_norm": 0.6202489733695984, + "learning_rate": 0.00014517940806345109, + "loss": 0.962, + "step": 7894 + }, + { + "epoch": 1.4056267806267806, + "grad_norm": 0.6419472694396973, + "learning_rate": 0.0001451669201645298, + 
"loss": 0.8147, + "step": 7895 + }, + { + "epoch": 1.4058048433048433, + "grad_norm": 0.61143958568573, + "learning_rate": 0.00014515443138065652, + "loss": 0.8674, + "step": 7896 + }, + { + "epoch": 1.4059829059829059, + "grad_norm": 0.7527356743812561, + "learning_rate": 0.00014514194171207597, + "loss": 1.0581, + "step": 7897 + }, + { + "epoch": 1.4061609686609686, + "grad_norm": 0.7195194363594055, + "learning_rate": 0.00014512945115903285, + "loss": 1.0268, + "step": 7898 + }, + { + "epoch": 1.4063390313390314, + "grad_norm": 0.7919661998748779, + "learning_rate": 0.00014511695972177187, + "loss": 1.0259, + "step": 7899 + }, + { + "epoch": 1.4065170940170941, + "grad_norm": 0.6774758696556091, + "learning_rate": 0.00014510446740053783, + "loss": 1.1214, + "step": 7900 + }, + { + "epoch": 1.4066951566951567, + "grad_norm": 0.6102406978607178, + "learning_rate": 0.0001450919741955754, + "loss": 1.1846, + "step": 7901 + }, + { + "epoch": 1.4068732193732194, + "grad_norm": 0.7189443707466125, + "learning_rate": 0.00014507948010712942, + "loss": 0.7758, + "step": 7902 + }, + { + "epoch": 1.407051282051282, + "grad_norm": 0.654153048992157, + "learning_rate": 0.00014506698513544467, + "loss": 0.899, + "step": 7903 + }, + { + "epoch": 1.4072293447293447, + "grad_norm": 0.637934684753418, + "learning_rate": 0.00014505448928076598, + "loss": 0.8301, + "step": 7904 + }, + { + "epoch": 1.4074074074074074, + "grad_norm": 0.7504615783691406, + "learning_rate": 0.00014504199254333812, + "loss": 0.9883, + "step": 7905 + }, + { + "epoch": 1.4075854700854702, + "grad_norm": 0.7902522683143616, + "learning_rate": 0.00014502949492340602, + "loss": 0.9615, + "step": 7906 + }, + { + "epoch": 1.4077635327635327, + "grad_norm": 0.5832732319831848, + "learning_rate": 0.0001450169964212145, + "loss": 0.7136, + "step": 7907 + }, + { + "epoch": 1.4079415954415955, + "grad_norm": 0.6025400757789612, + "learning_rate": 0.00014500449703700846, + "loss": 0.8812, + "step": 7908 + }, + { + 
"epoch": 1.408119658119658, + "grad_norm": 0.6412411332130432, + "learning_rate": 0.0001449919967710328, + "loss": 0.9346, + "step": 7909 + }, + { + "epoch": 1.4082977207977208, + "grad_norm": 0.7546970844268799, + "learning_rate": 0.00014497949562353242, + "loss": 1.0794, + "step": 7910 + }, + { + "epoch": 1.4084757834757835, + "grad_norm": 0.6175593733787537, + "learning_rate": 0.00014496699359475222, + "loss": 0.8939, + "step": 7911 + }, + { + "epoch": 1.4086538461538463, + "grad_norm": 0.6571716666221619, + "learning_rate": 0.00014495449068493722, + "loss": 1.1003, + "step": 7912 + }, + { + "epoch": 1.4088319088319088, + "grad_norm": 0.7038990259170532, + "learning_rate": 0.00014494198689433236, + "loss": 0.8844, + "step": 7913 + }, + { + "epoch": 1.4090099715099715, + "grad_norm": 0.7007337212562561, + "learning_rate": 0.00014492948222318263, + "loss": 1.2038, + "step": 7914 + }, + { + "epoch": 1.409188034188034, + "grad_norm": 0.7318591475486755, + "learning_rate": 0.00014491697667173302, + "loss": 1.0388, + "step": 7915 + }, + { + "epoch": 1.4093660968660968, + "grad_norm": 0.7010329961776733, + "learning_rate": 0.00014490447024022855, + "loss": 1.1485, + "step": 7916 + }, + { + "epoch": 1.4095441595441596, + "grad_norm": 0.7844831347465515, + "learning_rate": 0.0001448919629289143, + "loss": 1.1417, + "step": 7917 + }, + { + "epoch": 1.4097222222222223, + "grad_norm": 0.6953392624855042, + "learning_rate": 0.00014487945473803525, + "loss": 0.9546, + "step": 7918 + }, + { + "epoch": 1.4099002849002849, + "grad_norm": 0.6307587623596191, + "learning_rate": 0.00014486694566783655, + "loss": 0.9912, + "step": 7919 + }, + { + "epoch": 1.4100783475783476, + "grad_norm": 0.6200215816497803, + "learning_rate": 0.00014485443571856326, + "loss": 1.0998, + "step": 7920 + }, + { + "epoch": 1.4102564102564101, + "grad_norm": 0.7096502184867859, + "learning_rate": 0.00014484192489046043, + "loss": 0.9587, + "step": 7921 + }, + { + "epoch": 1.4104344729344729, + 
"grad_norm": 0.6965526342391968, + "learning_rate": 0.00014482941318377327, + "loss": 0.8791, + "step": 7922 + }, + { + "epoch": 1.4106125356125356, + "grad_norm": 0.7303466200828552, + "learning_rate": 0.00014481690059874687, + "loss": 1.084, + "step": 7923 + }, + { + "epoch": 1.4107905982905984, + "grad_norm": 0.6144066452980042, + "learning_rate": 0.00014480438713562638, + "loss": 0.9646, + "step": 7924 + }, + { + "epoch": 1.410968660968661, + "grad_norm": 0.645222008228302, + "learning_rate": 0.00014479187279465704, + "loss": 0.728, + "step": 7925 + }, + { + "epoch": 1.4111467236467237, + "grad_norm": 0.6069912314414978, + "learning_rate": 0.000144779357576084, + "loss": 0.842, + "step": 7926 + }, + { + "epoch": 1.4113247863247862, + "grad_norm": 0.6212135553359985, + "learning_rate": 0.00014476684148015243, + "loss": 0.9817, + "step": 7927 + }, + { + "epoch": 1.411502849002849, + "grad_norm": 0.6893343329429626, + "learning_rate": 0.00014475432450710763, + "loss": 1.0265, + "step": 7928 + }, + { + "epoch": 1.4116809116809117, + "grad_norm": 0.6842793822288513, + "learning_rate": 0.00014474180665719478, + "loss": 1.0593, + "step": 7929 + }, + { + "epoch": 1.4118589743589745, + "grad_norm": 0.74690842628479, + "learning_rate": 0.0001447292879306592, + "loss": 0.9096, + "step": 7930 + }, + { + "epoch": 1.412037037037037, + "grad_norm": 0.6624761819839478, + "learning_rate": 0.00014471676832774613, + "loss": 1.2244, + "step": 7931 + }, + { + "epoch": 1.4122150997150997, + "grad_norm": 0.6205778121948242, + "learning_rate": 0.00014470424784870088, + "loss": 1.1, + "step": 7932 + }, + { + "epoch": 1.4123931623931623, + "grad_norm": 0.7592337131500244, + "learning_rate": 0.00014469172649376875, + "loss": 0.963, + "step": 7933 + }, + { + "epoch": 1.412571225071225, + "grad_norm": 0.673328697681427, + "learning_rate": 0.00014467920426319508, + "loss": 0.8923, + "step": 7934 + }, + { + "epoch": 1.4127492877492878, + "grad_norm": 0.6064394116401672, + "learning_rate": 
0.00014466668115722522, + "loss": 0.9679, + "step": 7935 + }, + { + "epoch": 1.4129273504273505, + "grad_norm": 0.7738677859306335, + "learning_rate": 0.00014465415717610454, + "loss": 1.0678, + "step": 7936 + }, + { + "epoch": 1.413105413105413, + "grad_norm": 0.7013397812843323, + "learning_rate": 0.00014464163232007836, + "loss": 0.9017, + "step": 7937 + }, + { + "epoch": 1.4132834757834758, + "grad_norm": 0.713291347026825, + "learning_rate": 0.0001446291065893922, + "loss": 1.1953, + "step": 7938 + }, + { + "epoch": 1.4134615384615383, + "grad_norm": 0.7538655996322632, + "learning_rate": 0.00014461657998429136, + "loss": 1.0571, + "step": 7939 + }, + { + "epoch": 1.413639601139601, + "grad_norm": 0.6358973383903503, + "learning_rate": 0.00014460405250502133, + "loss": 0.8552, + "step": 7940 + }, + { + "epoch": 1.4138176638176638, + "grad_norm": 0.67508864402771, + "learning_rate": 0.00014459152415182756, + "loss": 1.0293, + "step": 7941 + }, + { + "epoch": 1.4139957264957266, + "grad_norm": 0.7074598670005798, + "learning_rate": 0.00014457899492495546, + "loss": 1.2102, + "step": 7942 + }, + { + "epoch": 1.414173789173789, + "grad_norm": 0.7157037854194641, + "learning_rate": 0.00014456646482465058, + "loss": 1.0566, + "step": 7943 + }, + { + "epoch": 1.4143518518518519, + "grad_norm": 0.7918477058410645, + "learning_rate": 0.00014455393385115844, + "loss": 1.3727, + "step": 7944 + }, + { + "epoch": 1.4145299145299146, + "grad_norm": 0.569144606590271, + "learning_rate": 0.0001445414020047245, + "loss": 0.7251, + "step": 7945 + }, + { + "epoch": 1.4147079772079771, + "grad_norm": 0.7589054107666016, + "learning_rate": 0.0001445288692855943, + "loss": 1.0155, + "step": 7946 + }, + { + "epoch": 1.41488603988604, + "grad_norm": 0.7531685829162598, + "learning_rate": 0.0001445163356940134, + "loss": 0.8404, + "step": 7947 + }, + { + "epoch": 1.4150641025641026, + "grad_norm": 0.5730917453765869, + "learning_rate": 0.0001445038012302274, + "loss": 0.8215, + 
"step": 7948 + }, + { + "epoch": 1.4152421652421652, + "grad_norm": 0.6960710883140564, + "learning_rate": 0.00014449126589448187, + "loss": 0.7902, + "step": 7949 + }, + { + "epoch": 1.415420227920228, + "grad_norm": 0.8207054138183594, + "learning_rate": 0.0001444787296870224, + "loss": 1.493, + "step": 7950 + }, + { + "epoch": 1.4155982905982907, + "grad_norm": 0.5854668617248535, + "learning_rate": 0.00014446619260809462, + "loss": 0.9262, + "step": 7951 + }, + { + "epoch": 1.4157763532763532, + "grad_norm": 0.5458414554595947, + "learning_rate": 0.00014445365465794413, + "loss": 0.8431, + "step": 7952 + }, + { + "epoch": 1.415954415954416, + "grad_norm": 0.6880569458007812, + "learning_rate": 0.00014444111583681666, + "loss": 1.0184, + "step": 7953 + }, + { + "epoch": 1.4161324786324787, + "grad_norm": 0.6391083598136902, + "learning_rate": 0.00014442857614495783, + "loss": 0.88, + "step": 7954 + }, + { + "epoch": 1.4163105413105412, + "grad_norm": 0.6246135234832764, + "learning_rate": 0.00014441603558261335, + "loss": 0.776, + "step": 7955 + }, + { + "epoch": 1.416488603988604, + "grad_norm": 0.6263493895530701, + "learning_rate": 0.00014440349415002893, + "loss": 0.9069, + "step": 7956 + }, + { + "epoch": 1.4166666666666667, + "grad_norm": 0.7123475670814514, + "learning_rate": 0.00014439095184745024, + "loss": 0.8339, + "step": 7957 + }, + { + "epoch": 1.4168447293447293, + "grad_norm": 0.7171050906181335, + "learning_rate": 0.00014437840867512309, + "loss": 1.0633, + "step": 7958 + }, + { + "epoch": 1.417022792022792, + "grad_norm": 0.7097769975662231, + "learning_rate": 0.00014436586463329322, + "loss": 1.0852, + "step": 7959 + }, + { + "epoch": 1.4172008547008548, + "grad_norm": 0.6889223456382751, + "learning_rate": 0.00014435331972220637, + "loss": 0.916, + "step": 7960 + }, + { + "epoch": 1.4173789173789173, + "grad_norm": 0.6674435138702393, + "learning_rate": 0.0001443407739421084, + "loss": 0.9307, + "step": 7961 + }, + { + "epoch": 
1.41755698005698, + "grad_norm": 0.6578894853591919, + "learning_rate": 0.00014432822729324503, + "loss": 0.8767, + "step": 7962 + }, + { + "epoch": 1.4177350427350428, + "grad_norm": 0.7145379781723022, + "learning_rate": 0.00014431567977586212, + "loss": 0.9962, + "step": 7963 + }, + { + "epoch": 1.4179131054131053, + "grad_norm": 0.6916680335998535, + "learning_rate": 0.00014430313139020555, + "loss": 1.0464, + "step": 7964 + }, + { + "epoch": 1.418091168091168, + "grad_norm": 0.6296181678771973, + "learning_rate": 0.00014429058213652116, + "loss": 1.0699, + "step": 7965 + }, + { + "epoch": 1.4182692307692308, + "grad_norm": 0.5640227198600769, + "learning_rate": 0.00014427803201505482, + "loss": 0.7006, + "step": 7966 + }, + { + "epoch": 1.4184472934472934, + "grad_norm": 0.7181212306022644, + "learning_rate": 0.0001442654810260524, + "loss": 1.1648, + "step": 7967 + }, + { + "epoch": 1.4186253561253561, + "grad_norm": 0.6830772757530212, + "learning_rate": 0.00014425292916975984, + "loss": 1.0641, + "step": 7968 + }, + { + "epoch": 1.4188034188034189, + "grad_norm": 0.665716290473938, + "learning_rate": 0.00014424037644642307, + "loss": 0.8769, + "step": 7969 + }, + { + "epoch": 1.4189814814814814, + "grad_norm": 0.8088666796684265, + "learning_rate": 0.00014422782285628802, + "loss": 1.1496, + "step": 7970 + }, + { + "epoch": 1.4191595441595442, + "grad_norm": 0.7186072468757629, + "learning_rate": 0.00014421526839960064, + "loss": 0.7421, + "step": 7971 + }, + { + "epoch": 1.419337606837607, + "grad_norm": 0.6405926942825317, + "learning_rate": 0.00014420271307660694, + "loss": 1.0139, + "step": 7972 + }, + { + "epoch": 1.4195156695156697, + "grad_norm": 0.7097104787826538, + "learning_rate": 0.0001441901568875529, + "loss": 1.1582, + "step": 7973 + }, + { + "epoch": 1.4196937321937322, + "grad_norm": 0.7347947359085083, + "learning_rate": 0.00014417759983268452, + "loss": 0.9751, + "step": 7974 + }, + { + "epoch": 1.419871794871795, + "grad_norm": 
0.6999621987342834, + "learning_rate": 0.00014416504191224787, + "loss": 0.9419, + "step": 7975 + }, + { + "epoch": 1.4200498575498575, + "grad_norm": 0.6500616073608398, + "learning_rate": 0.00014415248312648897, + "loss": 0.9407, + "step": 7976 + }, + { + "epoch": 1.4202279202279202, + "grad_norm": 0.6368781328201294, + "learning_rate": 0.00014413992347565383, + "loss": 1.1224, + "step": 7977 + }, + { + "epoch": 1.420405982905983, + "grad_norm": 0.6422648429870605, + "learning_rate": 0.00014412736295998864, + "loss": 0.9573, + "step": 7978 + }, + { + "epoch": 1.4205840455840457, + "grad_norm": 0.744057297706604, + "learning_rate": 0.00014411480157973942, + "loss": 1.1384, + "step": 7979 + }, + { + "epoch": 1.4207621082621082, + "grad_norm": 0.5905839204788208, + "learning_rate": 0.00014410223933515232, + "loss": 0.8212, + "step": 7980 + }, + { + "epoch": 1.420940170940171, + "grad_norm": 0.5905438661575317, + "learning_rate": 0.0001440896762264734, + "loss": 0.8281, + "step": 7981 + }, + { + "epoch": 1.4211182336182335, + "grad_norm": 0.7087140679359436, + "learning_rate": 0.00014407711225394892, + "loss": 1.0165, + "step": 7982 + }, + { + "epoch": 1.4212962962962963, + "grad_norm": 0.6173902153968811, + "learning_rate": 0.00014406454741782495, + "loss": 0.8823, + "step": 7983 + }, + { + "epoch": 1.421474358974359, + "grad_norm": 0.6649761199951172, + "learning_rate": 0.00014405198171834772, + "loss": 0.9489, + "step": 7984 + }, + { + "epoch": 1.4216524216524218, + "grad_norm": 0.619286835193634, + "learning_rate": 0.00014403941515576344, + "loss": 0.8149, + "step": 7985 + }, + { + "epoch": 1.4218304843304843, + "grad_norm": 0.6358469724655151, + "learning_rate": 0.0001440268477303183, + "loss": 1.0558, + "step": 7986 + }, + { + "epoch": 1.422008547008547, + "grad_norm": 0.7239769697189331, + "learning_rate": 0.0001440142794422585, + "loss": 1.0528, + "step": 7987 + }, + { + "epoch": 1.4221866096866096, + "grad_norm": 0.681168794631958, + "learning_rate": 
0.00014400171029183036, + "loss": 1.0867, + "step": 7988 + }, + { + "epoch": 1.4223646723646723, + "grad_norm": 0.6741157174110413, + "learning_rate": 0.0001439891402792801, + "loss": 0.9153, + "step": 7989 + }, + { + "epoch": 1.422542735042735, + "grad_norm": 0.5881659984588623, + "learning_rate": 0.00014397656940485403, + "loss": 0.92, + "step": 7990 + }, + { + "epoch": 1.4227207977207978, + "grad_norm": 0.637093722820282, + "learning_rate": 0.00014396399766879842, + "loss": 0.921, + "step": 7991 + }, + { + "epoch": 1.4228988603988604, + "grad_norm": 0.7760605216026306, + "learning_rate": 0.0001439514250713596, + "loss": 1.1451, + "step": 7992 + }, + { + "epoch": 1.4230769230769231, + "grad_norm": 0.6619600653648376, + "learning_rate": 0.00014393885161278393, + "loss": 1.0365, + "step": 7993 + }, + { + "epoch": 1.4232549857549857, + "grad_norm": 0.5354374051094055, + "learning_rate": 0.0001439262772933177, + "loss": 0.8718, + "step": 7994 + }, + { + "epoch": 1.4234330484330484, + "grad_norm": 0.7063560485839844, + "learning_rate": 0.00014391370211320735, + "loss": 0.8258, + "step": 7995 + }, + { + "epoch": 1.4236111111111112, + "grad_norm": 0.6876368522644043, + "learning_rate": 0.00014390112607269923, + "loss": 0.9579, + "step": 7996 + }, + { + "epoch": 1.423789173789174, + "grad_norm": 0.6976612210273743, + "learning_rate": 0.00014388854917203974, + "loss": 1.0376, + "step": 7997 + }, + { + "epoch": 1.4239672364672364, + "grad_norm": 0.6157355308532715, + "learning_rate": 0.00014387597141147525, + "loss": 0.8743, + "step": 7998 + }, + { + "epoch": 1.4241452991452992, + "grad_norm": 0.7273156046867371, + "learning_rate": 0.0001438633927912523, + "loss": 1.101, + "step": 7999 + }, + { + "epoch": 1.4243233618233617, + "grad_norm": 0.918380618095398, + "learning_rate": 0.0001438508133116173, + "loss": 0.9625, + "step": 8000 + }, + { + "epoch": 1.4245014245014245, + "grad_norm": 0.626040518283844, + "learning_rate": 0.00014383823297281666, + "loss": 0.9552, + 
"step": 8001 + }, + { + "epoch": 1.4246794871794872, + "grad_norm": 0.7320386171340942, + "learning_rate": 0.00014382565177509693, + "loss": 1.0719, + "step": 8002 + }, + { + "epoch": 1.42485754985755, + "grad_norm": 0.7283148169517517, + "learning_rate": 0.0001438130697187046, + "loss": 1.0455, + "step": 8003 + }, + { + "epoch": 1.4250356125356125, + "grad_norm": 0.6614177823066711, + "learning_rate": 0.00014380048680388613, + "loss": 0.9876, + "step": 8004 + }, + { + "epoch": 1.4252136752136753, + "grad_norm": 0.6726453900337219, + "learning_rate": 0.00014378790303088817, + "loss": 0.9861, + "step": 8005 + }, + { + "epoch": 1.4253917378917378, + "grad_norm": 0.7968725562095642, + "learning_rate": 0.00014377531839995718, + "loss": 1.1662, + "step": 8006 + }, + { + "epoch": 1.4255698005698005, + "grad_norm": 0.6510586738586426, + "learning_rate": 0.0001437627329113398, + "loss": 0.9452, + "step": 8007 + }, + { + "epoch": 1.4257478632478633, + "grad_norm": 0.6933155655860901, + "learning_rate": 0.00014375014656528253, + "loss": 1.0149, + "step": 8008 + }, + { + "epoch": 1.425925925925926, + "grad_norm": 0.7141832113265991, + "learning_rate": 0.00014373755936203204, + "loss": 1.0667, + "step": 8009 + }, + { + "epoch": 1.4261039886039886, + "grad_norm": 0.6352181434631348, + "learning_rate": 0.00014372497130183494, + "loss": 0.8652, + "step": 8010 + }, + { + "epoch": 1.4262820512820513, + "grad_norm": 0.7494860291481018, + "learning_rate": 0.00014371238238493786, + "loss": 0.9592, + "step": 8011 + }, + { + "epoch": 1.4264601139601139, + "grad_norm": 0.610556423664093, + "learning_rate": 0.00014369979261158746, + "loss": 0.7015, + "step": 8012 + }, + { + "epoch": 1.4266381766381766, + "grad_norm": 0.7305756211280823, + "learning_rate": 0.00014368720198203037, + "loss": 0.9681, + "step": 8013 + }, + { + "epoch": 1.4268162393162394, + "grad_norm": 0.6964020133018494, + "learning_rate": 0.0001436746104965133, + "loss": 1.1166, + "step": 8014 + }, + { + "epoch": 
1.426994301994302, + "grad_norm": 0.7449237108230591, + "learning_rate": 0.00014366201815528302, + "loss": 1.1331, + "step": 8015 + }, + { + "epoch": 1.4271723646723646, + "grad_norm": 0.625834047794342, + "learning_rate": 0.00014364942495858615, + "loss": 0.8796, + "step": 8016 + }, + { + "epoch": 1.4273504273504274, + "grad_norm": 0.664559006690979, + "learning_rate": 0.0001436368309066695, + "loss": 1.0263, + "step": 8017 + }, + { + "epoch": 1.42752849002849, + "grad_norm": null, + "learning_rate": 0.0001436368309066695, + "loss": 1.0731, + "step": 8018 + }, + { + "epoch": 1.4277065527065527, + "grad_norm": 0.6714464426040649, + "learning_rate": 0.00014362423599977977, + "loss": 0.9345, + "step": 8019 + }, + { + "epoch": 1.4278846153846154, + "grad_norm": 0.7595751285552979, + "learning_rate": 0.00014361164023816376, + "loss": 0.9646, + "step": 8020 + }, + { + "epoch": 1.4280626780626782, + "grad_norm": 0.6413954496383667, + "learning_rate": 0.00014359904362206828, + "loss": 1.0471, + "step": 8021 + }, + { + "epoch": 1.4282407407407407, + "grad_norm": 0.7298843264579773, + "learning_rate": 0.00014358644615174008, + "loss": 0.8932, + "step": 8022 + }, + { + "epoch": 1.4284188034188035, + "grad_norm": 0.8022156953811646, + "learning_rate": 0.00014357384782742602, + "loss": 1.0437, + "step": 8023 + }, + { + "epoch": 1.428596866096866, + "grad_norm": 0.7264443635940552, + "learning_rate": 0.00014356124864937296, + "loss": 0.9368, + "step": 8024 + }, + { + "epoch": 1.4287749287749287, + "grad_norm": 0.6819384098052979, + "learning_rate": 0.00014354864861782768, + "loss": 1.0, + "step": 8025 + }, + { + "epoch": 1.4289529914529915, + "grad_norm": 0.5945104956626892, + "learning_rate": 0.0001435360477330371, + "loss": 0.8108, + "step": 8026 + }, + { + "epoch": 1.4291310541310542, + "grad_norm": 0.6497398018836975, + "learning_rate": 0.0001435234459952481, + "loss": 0.8712, + "step": 8027 + }, + { + "epoch": 1.4293091168091168, + "grad_norm": 0.6424077749252319, + 
"learning_rate": 0.0001435108434047076, + "loss": 0.9172, + "step": 8028 + }, + { + "epoch": 1.4294871794871795, + "grad_norm": 0.6806963086128235, + "learning_rate": 0.00014349823996166253, + "loss": 1.1648, + "step": 8029 + }, + { + "epoch": 1.429665242165242, + "grad_norm": 0.6601083874702454, + "learning_rate": 0.00014348563566635977, + "loss": 0.9453, + "step": 8030 + }, + { + "epoch": 1.4298433048433048, + "grad_norm": 0.7024385929107666, + "learning_rate": 0.00014347303051904636, + "loss": 1.074, + "step": 8031 + }, + { + "epoch": 1.4300213675213675, + "grad_norm": 0.7094005942344666, + "learning_rate": 0.00014346042451996918, + "loss": 0.9976, + "step": 8032 + }, + { + "epoch": 1.4301994301994303, + "grad_norm": 0.6775936484336853, + "learning_rate": 0.0001434478176693753, + "loss": 0.9039, + "step": 8033 + }, + { + "epoch": 1.4303774928774928, + "grad_norm": 0.6920986771583557, + "learning_rate": 0.00014343520996751166, + "loss": 0.9122, + "step": 8034 + }, + { + "epoch": 1.4305555555555556, + "grad_norm": 0.720690906047821, + "learning_rate": 0.00014342260141462528, + "loss": 1.1028, + "step": 8035 + }, + { + "epoch": 1.430733618233618, + "grad_norm": 0.624546229839325, + "learning_rate": 0.00014340999201096328, + "loss": 0.9083, + "step": 8036 + }, + { + "epoch": 1.4309116809116809, + "grad_norm": 0.6560490727424622, + "learning_rate": 0.00014339738175677265, + "loss": 0.8029, + "step": 8037 + }, + { + "epoch": 1.4310897435897436, + "grad_norm": 0.8266100883483887, + "learning_rate": 0.00014338477065230047, + "loss": 0.9655, + "step": 8038 + }, + { + "epoch": 1.4312678062678064, + "grad_norm": 0.6593570113182068, + "learning_rate": 0.00014337215869779385, + "loss": 1.0299, + "step": 8039 + }, + { + "epoch": 1.431445868945869, + "grad_norm": 0.6321794390678406, + "learning_rate": 0.00014335954589349986, + "loss": 0.8755, + "step": 8040 + }, + { + "epoch": 1.4316239316239316, + "grad_norm": 0.7030870318412781, + "learning_rate": 0.00014334693223966562, + 
"loss": 1.1226, + "step": 8041 + }, + { + "epoch": 1.4318019943019942, + "grad_norm": 0.7794312238693237, + "learning_rate": 0.0001433343177365383, + "loss": 1.1252, + "step": 8042 + }, + { + "epoch": 1.431980056980057, + "grad_norm": 0.6115018129348755, + "learning_rate": 0.00014332170238436507, + "loss": 0.8753, + "step": 8043 + }, + { + "epoch": 1.4321581196581197, + "grad_norm": 0.8525674939155579, + "learning_rate": 0.00014330908618339304, + "loss": 0.9135, + "step": 8044 + }, + { + "epoch": 1.4323361823361824, + "grad_norm": 0.6869912147521973, + "learning_rate": 0.00014329646913386948, + "loss": 0.868, + "step": 8045 + }, + { + "epoch": 1.432514245014245, + "grad_norm": 0.5877542495727539, + "learning_rate": 0.0001432838512360415, + "loss": 0.9051, + "step": 8046 + }, + { + "epoch": 1.4326923076923077, + "grad_norm": 0.6609327793121338, + "learning_rate": 0.0001432712324901564, + "loss": 0.9084, + "step": 8047 + }, + { + "epoch": 1.4328703703703702, + "grad_norm": 0.6318345069885254, + "learning_rate": 0.0001432586128964614, + "loss": 0.8291, + "step": 8048 + }, + { + "epoch": 1.433048433048433, + "grad_norm": 0.6973567008972168, + "learning_rate": 0.0001432459924552037, + "loss": 0.97, + "step": 8049 + }, + { + "epoch": 1.4332264957264957, + "grad_norm": 0.6838201284408569, + "learning_rate": 0.00014323337116663062, + "loss": 1.0957, + "step": 8050 + }, + { + "epoch": 1.4334045584045585, + "grad_norm": 0.7472857236862183, + "learning_rate": 0.00014322074903098944, + "loss": 1.0981, + "step": 8051 + }, + { + "epoch": 1.433582621082621, + "grad_norm": 0.7723061442375183, + "learning_rate": 0.0001432081260485275, + "loss": 1.2231, + "step": 8052 + }, + { + "epoch": 1.4337606837606838, + "grad_norm": 0.681834876537323, + "learning_rate": 0.00014319550221949208, + "loss": 1.073, + "step": 8053 + }, + { + "epoch": 1.4339387464387463, + "grad_norm": 0.6566045880317688, + "learning_rate": 0.00014318287754413051, + "loss": 1.1298, + "step": 8054 + }, + { + "epoch": 
1.434116809116809, + "grad_norm": 0.6792440414428711, + "learning_rate": 0.00014317025202269015, + "loss": 1.2224, + "step": 8055 + }, + { + "epoch": 1.4342948717948718, + "grad_norm": 0.7946709394454956, + "learning_rate": 0.00014315762565541838, + "loss": 1.0728, + "step": 8056 + }, + { + "epoch": 1.4344729344729346, + "grad_norm": 0.633466899394989, + "learning_rate": 0.00014314499844256262, + "loss": 0.944, + "step": 8057 + }, + { + "epoch": 1.434650997150997, + "grad_norm": 0.7308502197265625, + "learning_rate": 0.00014313237038437023, + "loss": 1.0684, + "step": 8058 + }, + { + "epoch": 1.4348290598290598, + "grad_norm": 0.6483737230300903, + "learning_rate": 0.00014311974148108862, + "loss": 1.0843, + "step": 8059 + }, + { + "epoch": 1.4350071225071226, + "grad_norm": 0.6301209926605225, + "learning_rate": 0.00014310711173296526, + "loss": 1.0083, + "step": 8060 + }, + { + "epoch": 1.4351851851851851, + "grad_norm": 0.6674302816390991, + "learning_rate": 0.00014309448114024757, + "loss": 0.9877, + "step": 8061 + }, + { + "epoch": 1.4353632478632479, + "grad_norm": 0.6888732314109802, + "learning_rate": 0.00014308184970318307, + "loss": 0.9937, + "step": 8062 + }, + { + "epoch": 1.4355413105413106, + "grad_norm": 0.6922950148582458, + "learning_rate": 0.00014306921742201923, + "loss": 1.0149, + "step": 8063 + }, + { + "epoch": 1.4357193732193732, + "grad_norm": 0.6050686240196228, + "learning_rate": 0.00014305658429700352, + "loss": 0.7882, + "step": 8064 + }, + { + "epoch": 1.435897435897436, + "grad_norm": 0.5080767869949341, + "learning_rate": 0.00014304395032838348, + "loss": 0.7796, + "step": 8065 + }, + { + "epoch": 1.4360754985754987, + "grad_norm": 0.6382707953453064, + "learning_rate": 0.00014303131551640668, + "loss": 0.965, + "step": 8066 + }, + { + "epoch": 1.4362535612535612, + "grad_norm": 0.7153477668762207, + "learning_rate": 0.00014301867986132063, + "loss": 1.1277, + "step": 8067 + }, + { + "epoch": 1.436431623931624, + "grad_norm": 
0.6208404898643494, + "learning_rate": 0.00014300604336337292, + "loss": 0.8246, + "step": 8068 + }, + { + "epoch": 1.4366096866096867, + "grad_norm": 0.719695508480072, + "learning_rate": 0.0001429934060228111, + "loss": 0.7681, + "step": 8069 + }, + { + "epoch": 1.4367877492877492, + "grad_norm": 0.6219030618667603, + "learning_rate": 0.0001429807678398828, + "loss": 1.0425, + "step": 8070 + }, + { + "epoch": 1.436965811965812, + "grad_norm": 0.6080238819122314, + "learning_rate": 0.00014296812881483566, + "loss": 0.8762, + "step": 8071 + }, + { + "epoch": 1.4371438746438747, + "grad_norm": 0.6264194846153259, + "learning_rate": 0.00014295548894791729, + "loss": 1.087, + "step": 8072 + }, + { + "epoch": 1.4373219373219372, + "grad_norm": 0.6503600478172302, + "learning_rate": 0.00014294284823937535, + "loss": 1.0583, + "step": 8073 + }, + { + "epoch": 1.4375, + "grad_norm": 0.7623817324638367, + "learning_rate": 0.0001429302066894575, + "loss": 1.2372, + "step": 8074 + }, + { + "epoch": 1.4376780626780628, + "grad_norm": 0.7020344138145447, + "learning_rate": 0.00014291756429841144, + "loss": 1.2163, + "step": 8075 + }, + { + "epoch": 1.4378561253561253, + "grad_norm": 0.7070338129997253, + "learning_rate": 0.00014290492106648484, + "loss": 0.986, + "step": 8076 + }, + { + "epoch": 1.438034188034188, + "grad_norm": 0.6407621502876282, + "learning_rate": 0.00014289227699392545, + "loss": 0.9329, + "step": 8077 + }, + { + "epoch": 1.4382122507122508, + "grad_norm": 0.6836710572242737, + "learning_rate": 0.00014287963208098098, + "loss": 0.9252, + "step": 8078 + }, + { + "epoch": 1.4383903133903133, + "grad_norm": 0.648642897605896, + "learning_rate": 0.00014286698632789922, + "loss": 1.0457, + "step": 8079 + }, + { + "epoch": 1.438568376068376, + "grad_norm": 0.7015881538391113, + "learning_rate": 0.0001428543397349279, + "loss": 1.0516, + "step": 8080 + }, + { + "epoch": 1.4387464387464388, + "grad_norm": 0.6031532883644104, + "learning_rate": 
0.0001428416923023148, + "loss": 0.9423, + "step": 8081 + }, + { + "epoch": 1.4389245014245013, + "grad_norm": 0.8235578536987305, + "learning_rate": 0.00014282904403030772, + "loss": 1.3433, + "step": 8082 + }, + { + "epoch": 1.439102564102564, + "grad_norm": 0.7355761528015137, + "learning_rate": 0.00014281639491915452, + "loss": 1.0128, + "step": 8083 + }, + { + "epoch": 1.4392806267806268, + "grad_norm": 0.7429629564285278, + "learning_rate": 0.00014280374496910303, + "loss": 0.8546, + "step": 8084 + }, + { + "epoch": 1.4394586894586894, + "grad_norm": 0.5831776857376099, + "learning_rate": 0.00014279109418040105, + "loss": 0.9021, + "step": 8085 + }, + { + "epoch": 1.4396367521367521, + "grad_norm": 0.6585184931755066, + "learning_rate": 0.00014277844255329645, + "loss": 0.9256, + "step": 8086 + }, + { + "epoch": 1.4398148148148149, + "grad_norm": 0.6412501931190491, + "learning_rate": 0.00014276579008803717, + "loss": 0.9305, + "step": 8087 + }, + { + "epoch": 1.4399928774928774, + "grad_norm": 0.6305423378944397, + "learning_rate": 0.00014275313678487102, + "loss": 0.9471, + "step": 8088 + }, + { + "epoch": 1.4401709401709402, + "grad_norm": 0.7160914540290833, + "learning_rate": 0.00014274048264404602, + "loss": 0.8798, + "step": 8089 + }, + { + "epoch": 1.440349002849003, + "grad_norm": 0.6740858554840088, + "learning_rate": 0.00014272782766581004, + "loss": 0.9022, + "step": 8090 + }, + { + "epoch": 1.4405270655270654, + "grad_norm": 0.7554821968078613, + "learning_rate": 0.000142715171850411, + "loss": 1.0924, + "step": 8091 + }, + { + "epoch": 1.4407051282051282, + "grad_norm": 0.7361162304878235, + "learning_rate": 0.00014270251519809694, + "loss": 0.9907, + "step": 8092 + }, + { + "epoch": 1.440883190883191, + "grad_norm": 0.731813371181488, + "learning_rate": 0.0001426898577091158, + "loss": 1.1765, + "step": 8093 + }, + { + "epoch": 1.4410612535612537, + "grad_norm": 0.6877756714820862, + "learning_rate": 0.00014267719938371558, + "loss": 1.0536, + 
"step": 8094 + }, + { + "epoch": 1.4412393162393162, + "grad_norm": 0.6724407076835632, + "learning_rate": 0.00014266454022214426, + "loss": 1.1895, + "step": 8095 + }, + { + "epoch": 1.441417378917379, + "grad_norm": 0.6946671605110168, + "learning_rate": 0.0001426518802246499, + "loss": 1.0437, + "step": 8096 + }, + { + "epoch": 1.4415954415954415, + "grad_norm": 0.7032839059829712, + "learning_rate": 0.00014263921939148058, + "loss": 1.1363, + "step": 8097 + }, + { + "epoch": 1.4417735042735043, + "grad_norm": 0.6942192316055298, + "learning_rate": 0.00014262655772288434, + "loss": 1.315, + "step": 8098 + }, + { + "epoch": 1.441951566951567, + "grad_norm": 0.7002301812171936, + "learning_rate": 0.00014261389521910922, + "loss": 1.0546, + "step": 8099 + }, + { + "epoch": 1.4421296296296298, + "grad_norm": 0.7260788083076477, + "learning_rate": 0.00014260123188040335, + "loss": 0.9374, + "step": 8100 + }, + { + "epoch": 1.4423076923076923, + "grad_norm": 0.6629201173782349, + "learning_rate": 0.00014258856770701486, + "loss": 0.8632, + "step": 8101 + }, + { + "epoch": 1.442485754985755, + "grad_norm": 0.6570318937301636, + "learning_rate": 0.0001425759026991918, + "loss": 1.0102, + "step": 8102 + }, + { + "epoch": 1.4426638176638176, + "grad_norm": 0.7696560621261597, + "learning_rate": 0.00014256323685718242, + "loss": 0.9703, + "step": 8103 + }, + { + "epoch": 1.4428418803418803, + "grad_norm": 0.7206611633300781, + "learning_rate": 0.00014255057018123482, + "loss": 1.1728, + "step": 8104 + }, + { + "epoch": 1.443019943019943, + "grad_norm": 0.6871611475944519, + "learning_rate": 0.0001425379026715972, + "loss": 0.9377, + "step": 8105 + }, + { + "epoch": 1.4431980056980058, + "grad_norm": 0.6027442812919617, + "learning_rate": 0.00014252523432851775, + "loss": 0.9212, + "step": 8106 + }, + { + "epoch": 1.4433760683760684, + "grad_norm": 0.7149752378463745, + "learning_rate": 0.00014251256515224463, + "loss": 0.9654, + "step": 8107 + }, + { + "epoch": 
1.443554131054131, + "grad_norm": 0.5949522256851196, + "learning_rate": 0.00014249989514302614, + "loss": 1.0646, + "step": 8108 + }, + { + "epoch": 1.4437321937321936, + "grad_norm": 0.7345452904701233, + "learning_rate": 0.0001424872243011105, + "loss": 0.9801, + "step": 8109 + }, + { + "epoch": 1.4439102564102564, + "grad_norm": 0.8045009970664978, + "learning_rate": 0.00014247455262674592, + "loss": 1.3529, + "step": 8110 + }, + { + "epoch": 1.4440883190883191, + "grad_norm": 0.6712123155593872, + "learning_rate": 0.00014246188012018073, + "loss": 1.0416, + "step": 8111 + }, + { + "epoch": 1.444266381766382, + "grad_norm": 0.7811154127120972, + "learning_rate": 0.00014244920678166322, + "loss": 1.2019, + "step": 8112 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.6834486126899719, + "learning_rate": 0.00014243653261144167, + "loss": 0.986, + "step": 8113 + }, + { + "epoch": 1.4446225071225072, + "grad_norm": 0.6901041269302368, + "learning_rate": 0.00014242385760976443, + "loss": 1.0988, + "step": 8114 + }, + { + "epoch": 1.4448005698005697, + "grad_norm": 0.6233634948730469, + "learning_rate": 0.00014241118177687982, + "loss": 0.7748, + "step": 8115 + }, + { + "epoch": 1.4449786324786325, + "grad_norm": 0.6899837851524353, + "learning_rate": 0.00014239850511303624, + "loss": 0.9734, + "step": 8116 + }, + { + "epoch": 1.4451566951566952, + "grad_norm": 0.6316244006156921, + "learning_rate": 0.00014238582761848197, + "loss": 0.7888, + "step": 8117 + }, + { + "epoch": 1.445334757834758, + "grad_norm": 0.6074259877204895, + "learning_rate": 0.00014237314929346545, + "loss": 0.8843, + "step": 8118 + }, + { + "epoch": 1.4455128205128205, + "grad_norm": 0.6112192273139954, + "learning_rate": 0.00014236047013823516, + "loss": 0.8529, + "step": 8119 + }, + { + "epoch": 1.4456908831908832, + "grad_norm": 0.6883894801139832, + "learning_rate": 0.0001423477901530394, + "loss": 0.9506, + "step": 8120 + }, + { + "epoch": 1.4458689458689458, + "grad_norm": 
0.7248309254646301, + "learning_rate": 0.00014233510933812666, + "loss": 0.9573, + "step": 8121 + }, + { + "epoch": 1.4460470085470085, + "grad_norm": 0.6853367686271667, + "learning_rate": 0.00014232242769374542, + "loss": 0.9903, + "step": 8122 + }, + { + "epoch": 1.4462250712250713, + "grad_norm": 0.7179274559020996, + "learning_rate": 0.0001423097452201441, + "loss": 0.9157, + "step": 8123 + }, + { + "epoch": 1.446403133903134, + "grad_norm": 0.6704817414283752, + "learning_rate": 0.00014229706191757127, + "loss": 1.1361, + "step": 8124 + }, + { + "epoch": 1.4465811965811965, + "grad_norm": 0.6380739212036133, + "learning_rate": 0.00014228437778627533, + "loss": 0.9336, + "step": 8125 + }, + { + "epoch": 1.4467592592592593, + "grad_norm": 0.6275362372398376, + "learning_rate": 0.00014227169282650487, + "loss": 0.9617, + "step": 8126 + }, + { + "epoch": 1.4469373219373218, + "grad_norm": 0.5644828677177429, + "learning_rate": 0.00014225900703850836, + "loss": 0.7384, + "step": 8127 + }, + { + "epoch": 1.4471153846153846, + "grad_norm": 0.6522284150123596, + "learning_rate": 0.00014224632042253443, + "loss": 1.1098, + "step": 8128 + }, + { + "epoch": 1.4472934472934473, + "grad_norm": 0.6228049993515015, + "learning_rate": 0.0001422336329788316, + "loss": 1.1061, + "step": 8129 + }, + { + "epoch": 1.44747150997151, + "grad_norm": 0.6092000603675842, + "learning_rate": 0.00014222094470764848, + "loss": 0.808, + "step": 8130 + }, + { + "epoch": 1.4476495726495726, + "grad_norm": 0.667435348033905, + "learning_rate": 0.00014220825560923363, + "loss": 1.1223, + "step": 8131 + }, + { + "epoch": 1.4478276353276354, + "grad_norm": 0.6080766320228577, + "learning_rate": 0.0001421955656838357, + "loss": 1.0099, + "step": 8132 + }, + { + "epoch": 1.448005698005698, + "grad_norm": 0.7597638368606567, + "learning_rate": 0.00014218287493170332, + "loss": 0.9718, + "step": 8133 + }, + { + "epoch": 1.4481837606837606, + "grad_norm": 0.574130654335022, + "learning_rate": 
0.0001421701833530851, + "loss": 0.7745, + "step": 8134 + }, + { + "epoch": 1.4483618233618234, + "grad_norm": 0.6372822523117065, + "learning_rate": 0.0001421574909482298, + "loss": 1.0088, + "step": 8135 + }, + { + "epoch": 1.4485398860398861, + "grad_norm": 0.6759644746780396, + "learning_rate": 0.000142144797717386, + "loss": 0.9684, + "step": 8136 + }, + { + "epoch": 1.4487179487179487, + "grad_norm": 0.706351637840271, + "learning_rate": 0.00014213210366080244, + "loss": 1.021, + "step": 8137 + }, + { + "epoch": 1.4488960113960114, + "grad_norm": 0.6976894736289978, + "learning_rate": 0.0001421194087787278, + "loss": 1.1038, + "step": 8138 + }, + { + "epoch": 1.449074074074074, + "grad_norm": 0.7322551012039185, + "learning_rate": 0.00014210671307141092, + "loss": 1.0213, + "step": 8139 + }, + { + "epoch": 1.4492521367521367, + "grad_norm": 0.5885626077651978, + "learning_rate": 0.0001420940165391004, + "loss": 0.821, + "step": 8140 + }, + { + "epoch": 1.4494301994301995, + "grad_norm": 0.7009791135787964, + "learning_rate": 0.0001420813191820451, + "loss": 0.8647, + "step": 8141 + }, + { + "epoch": 1.4496082621082622, + "grad_norm": 0.5715423822402954, + "learning_rate": 0.00014206862100049375, + "loss": 0.873, + "step": 8142 + }, + { + "epoch": 1.4497863247863247, + "grad_norm": 1.1452178955078125, + "learning_rate": 0.00014205592199469514, + "loss": 1.2523, + "step": 8143 + }, + { + "epoch": 1.4499643874643875, + "grad_norm": 0.8076814413070679, + "learning_rate": 0.00014204322216489814, + "loss": 1.1071, + "step": 8144 + }, + { + "epoch": 1.45014245014245, + "grad_norm": 0.7325751185417175, + "learning_rate": 0.00014203052151135154, + "loss": 0.9846, + "step": 8145 + }, + { + "epoch": 1.4503205128205128, + "grad_norm": 0.7009061574935913, + "learning_rate": 0.00014201782003430417, + "loss": 0.8153, + "step": 8146 + }, + { + "epoch": 1.4504985754985755, + "grad_norm": 0.6502353549003601, + "learning_rate": 0.0001420051177340049, + "loss": 0.8959, + "step": 
8147 + }, + { + "epoch": 1.4506766381766383, + "grad_norm": 0.6134430170059204, + "learning_rate": 0.00014199241461070261, + "loss": 0.9683, + "step": 8148 + }, + { + "epoch": 1.4508547008547008, + "grad_norm": 0.720160722732544, + "learning_rate": 0.0001419797106646462, + "loss": 0.9579, + "step": 8149 + }, + { + "epoch": 1.4510327635327636, + "grad_norm": 0.6141422986984253, + "learning_rate": 0.00014196700589608454, + "loss": 0.9427, + "step": 8150 + }, + { + "epoch": 1.451210826210826, + "grad_norm": 0.6835139393806458, + "learning_rate": 0.00014195430030526656, + "loss": 1.0374, + "step": 8151 + }, + { + "epoch": 1.4513888888888888, + "grad_norm": 0.6829691529273987, + "learning_rate": 0.00014194159389244128, + "loss": 0.9418, + "step": 8152 + }, + { + "epoch": 1.4515669515669516, + "grad_norm": 0.7142195701599121, + "learning_rate": 0.00014192888665785755, + "loss": 1.1876, + "step": 8153 + }, + { + "epoch": 1.4517450142450143, + "grad_norm": 0.6719943284988403, + "learning_rate": 0.0001419161786017644, + "loss": 1.1417, + "step": 8154 + }, + { + "epoch": 1.4519230769230769, + "grad_norm": 0.6478939652442932, + "learning_rate": 0.0001419034697244108, + "loss": 0.943, + "step": 8155 + }, + { + "epoch": 1.4521011396011396, + "grad_norm": 0.6308888792991638, + "learning_rate": 0.00014189076002604575, + "loss": 0.9842, + "step": 8156 + }, + { + "epoch": 1.4522792022792022, + "grad_norm": 0.673559844493866, + "learning_rate": 0.00014187804950691827, + "loss": 0.8108, + "step": 8157 + }, + { + "epoch": 1.452457264957265, + "grad_norm": 0.5895359516143799, + "learning_rate": 0.00014186533816727744, + "loss": 0.8187, + "step": 8158 + }, + { + "epoch": 1.4526353276353277, + "grad_norm": 0.6703287363052368, + "learning_rate": 0.00014185262600737225, + "loss": 0.9012, + "step": 8159 + }, + { + "epoch": 1.4528133903133904, + "grad_norm": 0.697728157043457, + "learning_rate": 0.00014183991302745182, + "loss": 1.2572, + "step": 8160 + }, + { + "epoch": 1.452991452991453, + 
"grad_norm": 0.599371075630188, + "learning_rate": 0.00014182719922776514, + "loss": 1.078, + "step": 8161 + }, + { + "epoch": 1.4531695156695157, + "grad_norm": 0.6774863600730896, + "learning_rate": 0.00014181448460856143, + "loss": 1.0607, + "step": 8162 + }, + { + "epoch": 1.4533475783475782, + "grad_norm": 0.6872009038925171, + "learning_rate": 0.00014180176917008976, + "loss": 1.0713, + "step": 8163 + }, + { + "epoch": 1.453525641025641, + "grad_norm": 0.7949981093406677, + "learning_rate": 0.00014178905291259926, + "loss": 1.0471, + "step": 8164 + }, + { + "epoch": 1.4537037037037037, + "grad_norm": 0.6592127084732056, + "learning_rate": 0.00014177633583633908, + "loss": 0.8409, + "step": 8165 + }, + { + "epoch": 1.4538817663817665, + "grad_norm": 0.6745635867118835, + "learning_rate": 0.00014176361794155837, + "loss": 1.0859, + "step": 8166 + }, + { + "epoch": 1.454059829059829, + "grad_norm": 0.6661605834960938, + "learning_rate": 0.00014175089922850633, + "loss": 1.0587, + "step": 8167 + }, + { + "epoch": 1.4542378917378918, + "grad_norm": 0.6697571873664856, + "learning_rate": 0.00014173817969743212, + "loss": 0.8876, + "step": 8168 + }, + { + "epoch": 1.4544159544159543, + "grad_norm": 0.6162588000297546, + "learning_rate": 0.000141725459348585, + "loss": 0.9575, + "step": 8169 + }, + { + "epoch": 1.454594017094017, + "grad_norm": 0.6235088109970093, + "learning_rate": 0.00014171273818221422, + "loss": 0.9209, + "step": 8170 + }, + { + "epoch": 1.4547720797720798, + "grad_norm": 0.6744212508201599, + "learning_rate": 0.00014170001619856896, + "loss": 0.9704, + "step": 8171 + }, + { + "epoch": 1.4549501424501425, + "grad_norm": 0.6781345009803772, + "learning_rate": 0.0001416872933978985, + "loss": 1.1507, + "step": 8172 + }, + { + "epoch": 1.455128205128205, + "grad_norm": 0.7160060405731201, + "learning_rate": 0.0001416745697804521, + "loss": 1.2529, + "step": 8173 + }, + { + "epoch": 1.4553062678062678, + "grad_norm": 0.6742389798164368, + 
"learning_rate": 0.00014166184534647913, + "loss": 1.0168, + "step": 8174 + }, + { + "epoch": 1.4554843304843303, + "grad_norm": 0.6685828566551208, + "learning_rate": 0.0001416491200962288, + "loss": 1.0807, + "step": 8175 + }, + { + "epoch": 1.455662393162393, + "grad_norm": 0.6998327374458313, + "learning_rate": 0.0001416363940299505, + "loss": 1.1711, + "step": 8176 + }, + { + "epoch": 1.4558404558404558, + "grad_norm": 0.7132518291473389, + "learning_rate": 0.00014162366714789358, + "loss": 1.1392, + "step": 8177 + }, + { + "epoch": 1.4560185185185186, + "grad_norm": 0.6995887160301208, + "learning_rate": 0.0001416109394503073, + "loss": 1.3335, + "step": 8178 + }, + { + "epoch": 1.4561965811965811, + "grad_norm": 0.7161234021186829, + "learning_rate": 0.00014159821093744115, + "loss": 0.9725, + "step": 8179 + }, + { + "epoch": 1.4563746438746439, + "grad_norm": 0.7678874135017395, + "learning_rate": 0.00014158548160954446, + "loss": 1.1578, + "step": 8180 + }, + { + "epoch": 1.4565527065527066, + "grad_norm": 0.67372065782547, + "learning_rate": 0.00014157275146686662, + "loss": 1.0867, + "step": 8181 + }, + { + "epoch": 1.4567307692307692, + "grad_norm": 0.7757831811904907, + "learning_rate": 0.00014156002050965712, + "loss": 0.9768, + "step": 8182 + }, + { + "epoch": 1.456908831908832, + "grad_norm": 0.7174801230430603, + "learning_rate": 0.00014154728873816533, + "loss": 1.1712, + "step": 8183 + }, + { + "epoch": 1.4570868945868947, + "grad_norm": 0.5972673892974854, + "learning_rate": 0.0001415345561526407, + "loss": 0.9571, + "step": 8184 + }, + { + "epoch": 1.4572649572649572, + "grad_norm": 0.7999650835990906, + "learning_rate": 0.00014152182275333275, + "loss": 1.0583, + "step": 8185 + }, + { + "epoch": 1.45744301994302, + "grad_norm": 0.6737848520278931, + "learning_rate": 0.00014150908854049091, + "loss": 1.0562, + "step": 8186 + }, + { + "epoch": 1.4576210826210827, + "grad_norm": 0.7756418585777283, + "learning_rate": 0.00014149635351436474, + 
"loss": 1.2301, + "step": 8187 + }, + { + "epoch": 1.4577991452991452, + "grad_norm": 0.5633914470672607, + "learning_rate": 0.00014148361767520374, + "loss": 0.8847, + "step": 8188 + }, + { + "epoch": 1.457977207977208, + "grad_norm": 0.8462759256362915, + "learning_rate": 0.00014147088102325737, + "loss": 0.8046, + "step": 8189 + }, + { + "epoch": 1.4581552706552707, + "grad_norm": 0.7081632614135742, + "learning_rate": 0.00014145814355877526, + "loss": 1.0764, + "step": 8190 + }, + { + "epoch": 1.4583333333333333, + "grad_norm": 0.7357106804847717, + "learning_rate": 0.00014144540528200698, + "loss": 1.0202, + "step": 8191 + }, + { + "epoch": 1.458511396011396, + "grad_norm": 0.603566586971283, + "learning_rate": 0.00014143266619320204, + "loss": 0.8214, + "step": 8192 + }, + { + "epoch": 1.4586894586894588, + "grad_norm": 0.6829110383987427, + "learning_rate": 0.00014141992629261007, + "loss": 0.9479, + "step": 8193 + }, + { + "epoch": 1.4588675213675213, + "grad_norm": 0.6822739839553833, + "learning_rate": 0.00014140718558048072, + "loss": 0.9117, + "step": 8194 + }, + { + "epoch": 1.459045584045584, + "grad_norm": 0.7383607029914856, + "learning_rate": 0.00014139444405706356, + "loss": 0.9819, + "step": 8195 + }, + { + "epoch": 1.4592236467236468, + "grad_norm": 0.6319897770881653, + "learning_rate": 0.00014138170172260826, + "loss": 1.0508, + "step": 8196 + }, + { + "epoch": 1.4594017094017093, + "grad_norm": 0.6804461479187012, + "learning_rate": 0.0001413689585773645, + "loss": 0.992, + "step": 8197 + }, + { + "epoch": 1.459579772079772, + "grad_norm": 0.6198720335960388, + "learning_rate": 0.0001413562146215819, + "loss": 1.0113, + "step": 8198 + }, + { + "epoch": 1.4597578347578348, + "grad_norm": 0.5968540906906128, + "learning_rate": 0.0001413434698555102, + "loss": 0.7562, + "step": 8199 + }, + { + "epoch": 1.4599358974358974, + "grad_norm": 0.5370334982872009, + "learning_rate": 0.00014133072427939913, + "loss": 0.9238, + "step": 8200 + }, + { + 
"epoch": 1.46011396011396, + "grad_norm": 0.6652548909187317, + "learning_rate": 0.00014131797789349832, + "loss": 0.9464, + "step": 8201 + }, + { + "epoch": 1.4602920227920229, + "grad_norm": 0.637852668762207, + "learning_rate": 0.00014130523069805757, + "loss": 1.0395, + "step": 8202 + }, + { + "epoch": 1.4604700854700854, + "grad_norm": 0.8186550140380859, + "learning_rate": 0.00014129248269332664, + "loss": 1.2116, + "step": 8203 + }, + { + "epoch": 1.4606481481481481, + "grad_norm": 0.5290196537971497, + "learning_rate": 0.00014127973387955528, + "loss": 0.7331, + "step": 8204 + }, + { + "epoch": 1.460826210826211, + "grad_norm": 0.6516342163085938, + "learning_rate": 0.00014126698425699332, + "loss": 0.9275, + "step": 8205 + }, + { + "epoch": 1.4610042735042734, + "grad_norm": 0.767254114151001, + "learning_rate": 0.00014125423382589048, + "loss": 0.9355, + "step": 8206 + }, + { + "epoch": 1.4611823361823362, + "grad_norm": 0.6476777195930481, + "learning_rate": 0.00014124148258649668, + "loss": 0.9263, + "step": 8207 + }, + { + "epoch": 1.461360398860399, + "grad_norm": 0.6737871766090393, + "learning_rate": 0.00014122873053906167, + "loss": 0.9815, + "step": 8208 + }, + { + "epoch": 1.4615384615384617, + "grad_norm": 0.6311159729957581, + "learning_rate": 0.00014121597768383532, + "loss": 0.9607, + "step": 8209 + }, + { + "epoch": 1.4617165242165242, + "grad_norm": 0.6061250567436218, + "learning_rate": 0.00014120322402106752, + "loss": 0.7428, + "step": 8210 + }, + { + "epoch": 1.461894586894587, + "grad_norm": 0.6916252970695496, + "learning_rate": 0.00014119046955100815, + "loss": 0.9664, + "step": 8211 + }, + { + "epoch": 1.4620726495726495, + "grad_norm": 0.6583660840988159, + "learning_rate": 0.00014117771427390706, + "loss": 1.0645, + "step": 8212 + }, + { + "epoch": 1.4622507122507122, + "grad_norm": 0.7034604549407959, + "learning_rate": 0.00014116495819001425, + "loss": 0.9223, + "step": 8213 + }, + { + "epoch": 1.462428774928775, + "grad_norm": 
0.6378605961799622, + "learning_rate": 0.00014115220129957954, + "loss": 0.7963, + "step": 8214 + }, + { + "epoch": 1.4626068376068377, + "grad_norm": 0.6251596212387085, + "learning_rate": 0.00014113944360285297, + "loss": 0.9852, + "step": 8215 + }, + { + "epoch": 1.4627849002849003, + "grad_norm": 0.7055560946464539, + "learning_rate": 0.00014112668510008446, + "loss": 0.9342, + "step": 8216 + }, + { + "epoch": 1.462962962962963, + "grad_norm": 0.6250377893447876, + "learning_rate": 0.00014111392579152396, + "loss": 0.9886, + "step": 8217 + }, + { + "epoch": 1.4631410256410255, + "grad_norm": 0.6011185050010681, + "learning_rate": 0.00014110116567742152, + "loss": 0.8465, + "step": 8218 + }, + { + "epoch": 1.4633190883190883, + "grad_norm": 0.6632489562034607, + "learning_rate": 0.0001410884047580271, + "loss": 0.8619, + "step": 8219 + }, + { + "epoch": 1.463497150997151, + "grad_norm": 0.7194828987121582, + "learning_rate": 0.00014107564303359076, + "loss": 1.1231, + "step": 8220 + }, + { + "epoch": 1.4636752136752138, + "grad_norm": 0.7640393376350403, + "learning_rate": 0.0001410628805043625, + "loss": 1.1955, + "step": 8221 + }, + { + "epoch": 1.4638532763532763, + "grad_norm": 0.9118906259536743, + "learning_rate": 0.0001410501171705924, + "loss": 1.0555, + "step": 8222 + }, + { + "epoch": 1.464031339031339, + "grad_norm": 0.7545066475868225, + "learning_rate": 0.00014103735303253053, + "loss": 0.9425, + "step": 8223 + }, + { + "epoch": 1.4642094017094016, + "grad_norm": 0.6848801970481873, + "learning_rate": 0.000141024588090427, + "loss": 1.0418, + "step": 8224 + }, + { + "epoch": 1.4643874643874644, + "grad_norm": 0.6825160384178162, + "learning_rate": 0.00014101182234453185, + "loss": 0.9615, + "step": 8225 + }, + { + "epoch": 1.4645655270655271, + "grad_norm": 0.8258556723594666, + "learning_rate": 0.00014099905579509527, + "loss": 1.1237, + "step": 8226 + }, + { + "epoch": 1.4647435897435899, + "grad_norm": 0.6427522897720337, + "learning_rate": 
0.00014098628844236733, + "loss": 1.0853, + "step": 8227 + }, + { + "epoch": 1.4649216524216524, + "grad_norm": 0.6476351022720337, + "learning_rate": 0.00014097352028659825, + "loss": 1.1286, + "step": 8228 + }, + { + "epoch": 1.4650997150997151, + "grad_norm": 0.7621034383773804, + "learning_rate": 0.00014096075132803812, + "loss": 1.1402, + "step": 8229 + }, + { + "epoch": 1.4652777777777777, + "grad_norm": 0.6629892587661743, + "learning_rate": 0.00014094798156693718, + "loss": 0.7108, + "step": 8230 + }, + { + "epoch": 1.4654558404558404, + "grad_norm": 0.6902043223381042, + "learning_rate": 0.00014093521100354557, + "loss": 1.1761, + "step": 8231 + }, + { + "epoch": 1.4656339031339032, + "grad_norm": 0.7422910928726196, + "learning_rate": 0.00014092243963811357, + "loss": 0.867, + "step": 8232 + }, + { + "epoch": 1.465811965811966, + "grad_norm": 0.7424963712692261, + "learning_rate": 0.00014090966747089137, + "loss": 1.015, + "step": 8233 + }, + { + "epoch": 1.4659900284900285, + "grad_norm": 0.6855891942977905, + "learning_rate": 0.0001408968945021292, + "loss": 0.9624, + "step": 8234 + }, + { + "epoch": 1.4661680911680912, + "grad_norm": 0.5968918204307556, + "learning_rate": 0.00014088412073207736, + "loss": 0.9243, + "step": 8235 + }, + { + "epoch": 1.4663461538461537, + "grad_norm": 0.6153344511985779, + "learning_rate": 0.0001408713461609861, + "loss": 1.0305, + "step": 8236 + }, + { + "epoch": 1.4665242165242165, + "grad_norm": 0.6627458333969116, + "learning_rate": 0.0001408585707891057, + "loss": 1.1102, + "step": 8237 + }, + { + "epoch": 1.4667022792022792, + "grad_norm": 0.6475233435630798, + "learning_rate": 0.0001408457946166865, + "loss": 1.0045, + "step": 8238 + }, + { + "epoch": 1.466880341880342, + "grad_norm": 0.6792858839035034, + "learning_rate": 0.00014083301764397876, + "loss": 1.0092, + "step": 8239 + }, + { + "epoch": 1.4670584045584045, + "grad_norm": 0.6916255354881287, + "learning_rate": 0.00014082023987123293, + "loss": 1.0761, + 
"step": 8240 + }, + { + "epoch": 1.4672364672364673, + "grad_norm": 0.7901251912117004, + "learning_rate": 0.00014080746129869923, + "loss": 0.8002, + "step": 8241 + }, + { + "epoch": 1.4674145299145298, + "grad_norm": 0.8078263401985168, + "learning_rate": 0.00014079468192662812, + "loss": 0.9738, + "step": 8242 + }, + { + "epoch": 1.4675925925925926, + "grad_norm": 0.6370784640312195, + "learning_rate": 0.00014078190175526996, + "loss": 1.0256, + "step": 8243 + }, + { + "epoch": 1.4677706552706553, + "grad_norm": 0.6087532639503479, + "learning_rate": 0.0001407691207848752, + "loss": 0.9747, + "step": 8244 + }, + { + "epoch": 1.467948717948718, + "grad_norm": 0.6333357691764832, + "learning_rate": 0.00014075633901569414, + "loss": 1.0135, + "step": 8245 + }, + { + "epoch": 1.4681267806267806, + "grad_norm": 0.6914255619049072, + "learning_rate": 0.00014074355644797733, + "loss": 1.0261, + "step": 8246 + }, + { + "epoch": 1.4683048433048433, + "grad_norm": 0.6374734044075012, + "learning_rate": 0.00014073077308197513, + "loss": 0.9197, + "step": 8247 + }, + { + "epoch": 1.4684829059829059, + "grad_norm": 0.8023789525032043, + "learning_rate": 0.00014071798891793807, + "loss": 1.1085, + "step": 8248 + }, + { + "epoch": 1.4686609686609686, + "grad_norm": 0.7722933888435364, + "learning_rate": 0.0001407052039561166, + "loss": 1.2018, + "step": 8249 + }, + { + "epoch": 1.4688390313390314, + "grad_norm": 0.6823393106460571, + "learning_rate": 0.0001406924181967612, + "loss": 1.088, + "step": 8250 + }, + { + "epoch": 1.4690170940170941, + "grad_norm": 0.7037357687950134, + "learning_rate": 0.00014067963164012242, + "loss": 1.0324, + "step": 8251 + }, + { + "epoch": 1.4691951566951567, + "grad_norm": 0.6549737453460693, + "learning_rate": 0.00014066684428645074, + "loss": 1.152, + "step": 8252 + }, + { + "epoch": 1.4693732193732194, + "grad_norm": 0.5349790453910828, + "learning_rate": 0.00014065405613599674, + "loss": 0.6996, + "step": 8253 + }, + { + "epoch": 
1.469551282051282, + "grad_norm": 0.6760679483413696, + "learning_rate": 0.00014064126718901096, + "loss": 0.9856, + "step": 8254 + }, + { + "epoch": 1.4697293447293447, + "grad_norm": 0.5912436842918396, + "learning_rate": 0.00014062847744574395, + "loss": 1.0076, + "step": 8255 + }, + { + "epoch": 1.4699074074074074, + "grad_norm": 0.75101637840271, + "learning_rate": 0.00014061568690644632, + "loss": 1.0033, + "step": 8256 + }, + { + "epoch": 1.4700854700854702, + "grad_norm": 0.6233504414558411, + "learning_rate": 0.00014060289557136873, + "loss": 0.8525, + "step": 8257 + }, + { + "epoch": 1.4702635327635327, + "grad_norm": 0.659570038318634, + "learning_rate": 0.00014059010344076171, + "loss": 0.855, + "step": 8258 + }, + { + "epoch": 1.4704415954415955, + "grad_norm": 0.8096539974212646, + "learning_rate": 0.00014057731051487593, + "loss": 0.9905, + "step": 8259 + }, + { + "epoch": 1.470619658119658, + "grad_norm": 0.5829728245735168, + "learning_rate": 0.00014056451679396204, + "loss": 0.7974, + "step": 8260 + }, + { + "epoch": 1.4707977207977208, + "grad_norm": 0.6176979541778564, + "learning_rate": 0.0001405517222782707, + "loss": 0.9556, + "step": 8261 + }, + { + "epoch": 1.4709757834757835, + "grad_norm": 0.6322479248046875, + "learning_rate": 0.00014053892696805264, + "loss": 0.8837, + "step": 8262 + }, + { + "epoch": 1.4711538461538463, + "grad_norm": 0.6886917948722839, + "learning_rate": 0.0001405261308635585, + "loss": 0.9242, + "step": 8263 + }, + { + "epoch": 1.4713319088319088, + "grad_norm": 0.7474521994590759, + "learning_rate": 0.00014051333396503901, + "loss": 0.9906, + "step": 8264 + }, + { + "epoch": 1.4715099715099715, + "grad_norm": 0.7120978832244873, + "learning_rate": 0.00014050053627274488, + "loss": 1.1074, + "step": 8265 + }, + { + "epoch": 1.471688034188034, + "grad_norm": 0.6778998374938965, + "learning_rate": 0.0001404877377869269, + "loss": 1.0027, + "step": 8266 + }, + { + "epoch": 1.4718660968660968, + "grad_norm": 
0.6832901239395142, + "learning_rate": 0.0001404749385078358, + "loss": 0.9399, + "step": 8267 + }, + { + "epoch": 1.4720441595441596, + "grad_norm": 0.7428423762321472, + "learning_rate": 0.00014046213843572236, + "loss": 1.0591, + "step": 8268 + }, + { + "epoch": 1.4722222222222223, + "grad_norm": 0.7522720098495483, + "learning_rate": 0.00014044933757083737, + "loss": 1.1184, + "step": 8269 + }, + { + "epoch": 1.4724002849002849, + "grad_norm": 0.7714734673500061, + "learning_rate": 0.00014043653591343163, + "loss": 1.0783, + "step": 8270 + }, + { + "epoch": 1.4725783475783476, + "grad_norm": 0.5860890746116638, + "learning_rate": 0.00014042373346375597, + "loss": 0.8394, + "step": 8271 + }, + { + "epoch": 1.4727564102564101, + "grad_norm": 0.6400395035743713, + "learning_rate": 0.0001404109302220612, + "loss": 0.9153, + "step": 8272 + }, + { + "epoch": 1.4729344729344729, + "grad_norm": 0.7441139817237854, + "learning_rate": 0.00014039812618859827, + "loss": 0.9224, + "step": 8273 + }, + { + "epoch": 1.4731125356125356, + "grad_norm": 0.6030932664871216, + "learning_rate": 0.00014038532136361793, + "loss": 1.0783, + "step": 8274 + }, + { + "epoch": 1.4732905982905984, + "grad_norm": 0.7243345975875854, + "learning_rate": 0.0001403725157473711, + "loss": 0.9894, + "step": 8275 + }, + { + "epoch": 1.473468660968661, + "grad_norm": 0.6880641579627991, + "learning_rate": 0.0001403597093401087, + "loss": 0.9459, + "step": 8276 + }, + { + "epoch": 1.4736467236467237, + "grad_norm": 0.6263882517814636, + "learning_rate": 0.00014034690214208165, + "loss": 0.8781, + "step": 8277 + }, + { + "epoch": 1.4738247863247862, + "grad_norm": 0.7159495949745178, + "learning_rate": 0.00014033409415354085, + "loss": 1.0511, + "step": 8278 + }, + { + "epoch": 1.474002849002849, + "grad_norm": 0.7182226181030273, + "learning_rate": 0.00014032128537473727, + "loss": 1.1196, + "step": 8279 + }, + { + "epoch": 1.4741809116809117, + "grad_norm": 0.744478166103363, + "learning_rate": 
0.00014030847580592186, + "loss": 1.0747, + "step": 8280 + }, + { + "epoch": 1.4743589743589745, + "grad_norm": 0.6806797385215759, + "learning_rate": 0.00014029566544734558, + "loss": 1.1519, + "step": 8281 + }, + { + "epoch": 1.474537037037037, + "grad_norm": 0.6813502311706543, + "learning_rate": 0.00014028285429925946, + "loss": 0.968, + "step": 8282 + }, + { + "epoch": 1.4747150997150997, + "grad_norm": 0.639784574508667, + "learning_rate": 0.00014027004236191452, + "loss": 1.0685, + "step": 8283 + }, + { + "epoch": 1.4748931623931623, + "grad_norm": 0.6325878500938416, + "learning_rate": 0.00014025722963556173, + "loss": 1.0358, + "step": 8284 + }, + { + "epoch": 1.475071225071225, + "grad_norm": 0.7012955546379089, + "learning_rate": 0.00014024441612045215, + "loss": 1.1059, + "step": 8285 + }, + { + "epoch": 1.4752492877492878, + "grad_norm": 0.690380334854126, + "learning_rate": 0.00014023160181683684, + "loss": 0.9628, + "step": 8286 + }, + { + "epoch": 1.4754273504273505, + "grad_norm": 0.7178516983985901, + "learning_rate": 0.00014021878672496686, + "loss": 0.963, + "step": 8287 + }, + { + "epoch": 1.475605413105413, + "grad_norm": 0.7049064636230469, + "learning_rate": 0.0001402059708450933, + "loss": 0.8996, + "step": 8288 + }, + { + "epoch": 1.4757834757834758, + "grad_norm": 0.6777819395065308, + "learning_rate": 0.00014019315417746728, + "loss": 1.0696, + "step": 8289 + }, + { + "epoch": 1.4759615384615383, + "grad_norm": 0.5948763489723206, + "learning_rate": 0.00014018033672233987, + "loss": 0.928, + "step": 8290 + }, + { + "epoch": 1.476139601139601, + "grad_norm": 0.7183942198753357, + "learning_rate": 0.00014016751847996224, + "loss": 1.1053, + "step": 8291 + }, + { + "epoch": 1.4763176638176638, + "grad_norm": 0.7426177263259888, + "learning_rate": 0.00014015469945058556, + "loss": 0.9504, + "step": 8292 + }, + { + "epoch": 1.4764957264957266, + "grad_norm": 0.6508159041404724, + "learning_rate": 0.0001401418796344609, + "loss": 1.1176, + 
"step": 8293 + }, + { + "epoch": 1.476673789173789, + "grad_norm": 0.6954567432403564, + "learning_rate": 0.00014012905903183954, + "loss": 0.9238, + "step": 8294 + }, + { + "epoch": 1.4768518518518519, + "grad_norm": 0.7023960947990417, + "learning_rate": 0.0001401162376429726, + "loss": 1.2032, + "step": 8295 + }, + { + "epoch": 1.4770299145299146, + "grad_norm": 0.7174739837646484, + "learning_rate": 0.00014010341546811134, + "loss": 0.9385, + "step": 8296 + }, + { + "epoch": 1.4772079772079771, + "grad_norm": 0.611980140209198, + "learning_rate": 0.00014009059250750695, + "loss": 0.9469, + "step": 8297 + }, + { + "epoch": 1.47738603988604, + "grad_norm": 0.6362917423248291, + "learning_rate": 0.0001400777687614107, + "loss": 1.1406, + "step": 8298 + }, + { + "epoch": 1.4775641025641026, + "grad_norm": 0.6884697675704956, + "learning_rate": 0.00014006494423007381, + "loss": 0.7915, + "step": 8299 + }, + { + "epoch": 1.4777421652421652, + "grad_norm": 0.6266025304794312, + "learning_rate": 0.00014005211891374755, + "loss": 0.94, + "step": 8300 + }, + { + "epoch": 1.477920227920228, + "grad_norm": 0.6130280494689941, + "learning_rate": 0.00014003929281268323, + "loss": 0.9369, + "step": 8301 + }, + { + "epoch": 1.4780982905982907, + "grad_norm": 0.7244207859039307, + "learning_rate": 0.00014002646592713215, + "loss": 1.1449, + "step": 8302 + }, + { + "epoch": 1.4782763532763532, + "grad_norm": 0.6527345776557922, + "learning_rate": 0.0001400136382573456, + "loss": 0.7792, + "step": 8303 + }, + { + "epoch": 1.478454415954416, + "grad_norm": 0.7102689743041992, + "learning_rate": 0.00014000080980357496, + "loss": 0.9577, + "step": 8304 + }, + { + "epoch": 1.4786324786324787, + "grad_norm": 0.6179325580596924, + "learning_rate": 0.00013998798056607154, + "loss": 0.827, + "step": 8305 + }, + { + "epoch": 1.4788105413105412, + "grad_norm": 0.761234700679779, + "learning_rate": 0.00013997515054508668, + "loss": 1.0576, + "step": 8306 + }, + { + "epoch": 
1.478988603988604, + "grad_norm": 0.6200914978981018, + "learning_rate": 0.0001399623197408718, + "loss": 1.0514, + "step": 8307 + }, + { + "epoch": 1.4791666666666667, + "grad_norm": 0.5961193442344666, + "learning_rate": 0.0001399494881536783, + "loss": 0.7846, + "step": 8308 + }, + { + "epoch": 1.4793447293447293, + "grad_norm": 0.645984411239624, + "learning_rate": 0.00013993665578375758, + "loss": 0.9927, + "step": 8309 + }, + { + "epoch": 1.479522792022792, + "grad_norm": 0.7258989810943604, + "learning_rate": 0.000139923822631361, + "loss": 0.7567, + "step": 8310 + }, + { + "epoch": 1.4797008547008548, + "grad_norm": 0.708882212638855, + "learning_rate": 0.00013991098869674007, + "loss": 1.1147, + "step": 8311 + }, + { + "epoch": 1.4798789173789173, + "grad_norm": 0.669262707233429, + "learning_rate": 0.00013989815398014624, + "loss": 0.7142, + "step": 8312 + }, + { + "epoch": 1.48005698005698, + "grad_norm": 0.7398767471313477, + "learning_rate": 0.00013988531848183096, + "loss": 1.043, + "step": 8313 + }, + { + "epoch": 1.4802350427350428, + "grad_norm": 0.753197193145752, + "learning_rate": 0.0001398724822020457, + "loss": 1.058, + "step": 8314 + }, + { + "epoch": 1.4804131054131053, + "grad_norm": 0.663526177406311, + "learning_rate": 0.000139859645141042, + "loss": 1.1272, + "step": 8315 + }, + { + "epoch": 1.480591168091168, + "grad_norm": 0.6537514925003052, + "learning_rate": 0.00013984680729907135, + "loss": 1.011, + "step": 8316 + }, + { + "epoch": 1.4807692307692308, + "grad_norm": 0.707554817199707, + "learning_rate": 0.00013983396867638527, + "loss": 1.0593, + "step": 8317 + }, + { + "epoch": 1.4809472934472934, + "grad_norm": 0.6261475086212158, + "learning_rate": 0.00013982112927323533, + "loss": 1.0731, + "step": 8318 + }, + { + "epoch": 1.4811253561253561, + "grad_norm": 0.6694258451461792, + "learning_rate": 0.00013980828908987308, + "loss": 1.0703, + "step": 8319 + }, + { + "epoch": 1.4813034188034189, + "grad_norm": 0.7793164253234863, + 
"learning_rate": 0.00013979544812655012, + "loss": 1.0447, + "step": 8320 + }, + { + "epoch": 1.4814814814814814, + "grad_norm": 0.6496448516845703, + "learning_rate": 0.00013978260638351802, + "loss": 1.0208, + "step": 8321 + }, + { + "epoch": 1.4816595441595442, + "grad_norm": 0.5992059111595154, + "learning_rate": 0.00013976976386102834, + "loss": 0.9717, + "step": 8322 + }, + { + "epoch": 1.481837606837607, + "grad_norm": 0.7473567128181458, + "learning_rate": 0.0001397569205593328, + "loss": 0.9612, + "step": 8323 + }, + { + "epoch": 1.4820156695156697, + "grad_norm": 0.657558798789978, + "learning_rate": 0.00013974407647868297, + "loss": 1.2137, + "step": 8324 + }, + { + "epoch": 1.4821937321937322, + "grad_norm": 0.7040614485740662, + "learning_rate": 0.00013973123161933055, + "loss": 1.007, + "step": 8325 + }, + { + "epoch": 1.482371794871795, + "grad_norm": 0.6098681092262268, + "learning_rate": 0.00013971838598152717, + "loss": 1.0595, + "step": 8326 + }, + { + "epoch": 1.4825498575498575, + "grad_norm": 0.7194869518280029, + "learning_rate": 0.0001397055395655245, + "loss": 0.9632, + "step": 8327 + }, + { + "epoch": 1.4827279202279202, + "grad_norm": 0.645972728729248, + "learning_rate": 0.00013969269237157426, + "loss": 1.0712, + "step": 8328 + }, + { + "epoch": 1.482905982905983, + "grad_norm": 0.6580560207366943, + "learning_rate": 0.0001396798443999282, + "loss": 1.2117, + "step": 8329 + }, + { + "epoch": 1.4830840455840457, + "grad_norm": 0.6624418497085571, + "learning_rate": 0.00013966699565083802, + "loss": 0.8529, + "step": 8330 + }, + { + "epoch": 1.4832621082621082, + "grad_norm": 0.659896731376648, + "learning_rate": 0.00013965414612455545, + "loss": 0.9359, + "step": 8331 + }, + { + "epoch": 1.483440170940171, + "grad_norm": 0.6690883636474609, + "learning_rate": 0.00013964129582133222, + "loss": 0.971, + "step": 8332 + }, + { + "epoch": 1.4836182336182335, + "grad_norm": 0.6767334938049316, + "learning_rate": 0.00013962844474142022, + 
"loss": 1.0137, + "step": 8333 + }, + { + "epoch": 1.4837962962962963, + "grad_norm": 0.6412752270698547, + "learning_rate": 0.0001396155928850711, + "loss": 1.2812, + "step": 8334 + }, + { + "epoch": 1.483974358974359, + "grad_norm": 0.6731469035148621, + "learning_rate": 0.0001396027402525368, + "loss": 0.8723, + "step": 8335 + }, + { + "epoch": 1.4841524216524218, + "grad_norm": 0.7327923774719238, + "learning_rate": 0.000139589886844069, + "loss": 0.9606, + "step": 8336 + }, + { + "epoch": 1.4843304843304843, + "grad_norm": 0.6194515824317932, + "learning_rate": 0.00013957703265991963, + "loss": 0.8514, + "step": 8337 + }, + { + "epoch": 1.484508547008547, + "grad_norm": 0.7250012755393982, + "learning_rate": 0.00013956417770034053, + "loss": 0.9755, + "step": 8338 + }, + { + "epoch": 1.4846866096866096, + "grad_norm": 0.7484263181686401, + "learning_rate": 0.00013955132196558358, + "loss": 1.0376, + "step": 8339 + }, + { + "epoch": 1.4848646723646723, + "grad_norm": 0.7593362331390381, + "learning_rate": 0.00013953846545590058, + "loss": 1.3011, + "step": 8340 + }, + { + "epoch": 1.485042735042735, + "grad_norm": 0.6670466065406799, + "learning_rate": 0.00013952560817154352, + "loss": 0.9726, + "step": 8341 + }, + { + "epoch": 1.4852207977207978, + "grad_norm": 0.8001134395599365, + "learning_rate": 0.00013951275011276425, + "loss": 1.1447, + "step": 8342 + }, + { + "epoch": 1.4853988603988604, + "grad_norm": 0.741450309753418, + "learning_rate": 0.00013949989127981475, + "loss": 1.1101, + "step": 8343 + }, + { + "epoch": 1.4855769230769231, + "grad_norm": 0.6594467163085938, + "learning_rate": 0.00013948703167294694, + "loss": 1.0205, + "step": 8344 + }, + { + "epoch": 1.4857549857549857, + "grad_norm": 0.6303030252456665, + "learning_rate": 0.00013947417129241276, + "loss": 0.9179, + "step": 8345 + }, + { + "epoch": 1.4859330484330484, + "grad_norm": 0.6352720856666565, + "learning_rate": 0.00013946131013846418, + "loss": 1.158, + "step": 8346 + }, + { + 
"epoch": 1.4861111111111112, + "grad_norm": 0.6720923781394958, + "learning_rate": 0.0001394484482113532, + "loss": 0.8805, + "step": 8347 + }, + { + "epoch": 1.486289173789174, + "grad_norm": 0.7186421751976013, + "learning_rate": 0.00013943558551133186, + "loss": 0.8951, + "step": 8348 + }, + { + "epoch": 1.4864672364672364, + "grad_norm": 0.6038698554039001, + "learning_rate": 0.00013942272203865214, + "loss": 1.0079, + "step": 8349 + }, + { + "epoch": 1.4866452991452992, + "grad_norm": 0.665790319442749, + "learning_rate": 0.00013940985779356606, + "loss": 0.8853, + "step": 8350 + }, + { + "epoch": 1.4868233618233617, + "grad_norm": 0.6941595673561096, + "learning_rate": 0.00013939699277632568, + "loss": 1.1404, + "step": 8351 + }, + { + "epoch": 1.4870014245014245, + "grad_norm": 0.7943871021270752, + "learning_rate": 0.00013938412698718305, + "loss": 0.9961, + "step": 8352 + }, + { + "epoch": 1.4871794871794872, + "grad_norm": 0.6363818645477295, + "learning_rate": 0.00013937126042639028, + "loss": 0.8621, + "step": 8353 + }, + { + "epoch": 1.48735754985755, + "grad_norm": 0.7986421585083008, + "learning_rate": 0.00013935839309419943, + "loss": 1.0547, + "step": 8354 + }, + { + "epoch": 1.4875356125356125, + "grad_norm": 0.5890130400657654, + "learning_rate": 0.00013934552499086266, + "loss": 0.9863, + "step": 8355 + }, + { + "epoch": 1.4877136752136753, + "grad_norm": 0.7915370464324951, + "learning_rate": 0.00013933265611663207, + "loss": 1.0385, + "step": 8356 + }, + { + "epoch": 1.4878917378917378, + "grad_norm": 0.7062503695487976, + "learning_rate": 0.00013931978647175973, + "loss": 1.0984, + "step": 8357 + }, + { + "epoch": 1.4880698005698005, + "grad_norm": 0.6496769785881042, + "learning_rate": 0.00013930691605649792, + "loss": 1.0884, + "step": 8358 + }, + { + "epoch": 1.4882478632478633, + "grad_norm": 0.6527266502380371, + "learning_rate": 0.0001392940448710987, + "loss": 1.0366, + "step": 8359 + }, + { + "epoch": 1.488425925925926, + "grad_norm": 
0.6269870400428772, + "learning_rate": 0.00013928117291581431, + "loss": 0.9097, + "step": 8360 + }, + { + "epoch": 1.4886039886039886, + "grad_norm": 0.6581160426139832, + "learning_rate": 0.00013926830019089694, + "loss": 0.8694, + "step": 8361 + }, + { + "epoch": 1.4887820512820513, + "grad_norm": 0.6196219325065613, + "learning_rate": 0.0001392554266965988, + "loss": 0.8054, + "step": 8362 + }, + { + "epoch": 1.4889601139601139, + "grad_norm": 0.6246176362037659, + "learning_rate": 0.0001392425524331721, + "loss": 0.9309, + "step": 8363 + }, + { + "epoch": 1.4891381766381766, + "grad_norm": 0.7293874025344849, + "learning_rate": 0.00013922967740086914, + "loss": 1.051, + "step": 8364 + }, + { + "epoch": 1.4893162393162394, + "grad_norm": 0.6581604480743408, + "learning_rate": 0.00013921680159994213, + "loss": 0.8475, + "step": 8365 + }, + { + "epoch": 1.489494301994302, + "grad_norm": 0.6294612288475037, + "learning_rate": 0.00013920392503064335, + "loss": 0.6946, + "step": 8366 + }, + { + "epoch": 1.4896723646723646, + "grad_norm": 0.5725370645523071, + "learning_rate": 0.00013919104769322512, + "loss": 0.7838, + "step": 8367 + }, + { + "epoch": 1.4898504273504274, + "grad_norm": 0.681520402431488, + "learning_rate": 0.00013917816958793967, + "loss": 0.99, + "step": 8368 + }, + { + "epoch": 1.49002849002849, + "grad_norm": 0.6660219430923462, + "learning_rate": 0.00013916529071503943, + "loss": 0.9113, + "step": 8369 + }, + { + "epoch": 1.4902065527065527, + "grad_norm": 0.7567862272262573, + "learning_rate": 0.00013915241107477665, + "loss": 1.2498, + "step": 8370 + }, + { + "epoch": 1.4903846153846154, + "grad_norm": 0.7366036176681519, + "learning_rate": 0.00013913953066740372, + "loss": 1.115, + "step": 8371 + }, + { + "epoch": 1.4905626780626782, + "grad_norm": 0.6201434135437012, + "learning_rate": 0.00013912664949317297, + "loss": 0.8447, + "step": 8372 + }, + { + "epoch": 1.4907407407407407, + "grad_norm": 0.7618655562400818, + "learning_rate": 
0.00013911376755233683, + "loss": 0.9696, + "step": 8373 + }, + { + "epoch": 1.4909188034188035, + "grad_norm": 0.6716726422309875, + "learning_rate": 0.00013910088484514764, + "loss": 0.9753, + "step": 8374 + }, + { + "epoch": 1.491096866096866, + "grad_norm": 0.6745659112930298, + "learning_rate": 0.0001390880013718579, + "loss": 1.134, + "step": 8375 + }, + { + "epoch": 1.4912749287749287, + "grad_norm": 0.7524410486221313, + "learning_rate": 0.0001390751171327199, + "loss": 1.0235, + "step": 8376 + }, + { + "epoch": 1.4914529914529915, + "grad_norm": 0.7409411072731018, + "learning_rate": 0.00013906223212798615, + "loss": 0.752, + "step": 8377 + }, + { + "epoch": 1.4916310541310542, + "grad_norm": 0.7016384601593018, + "learning_rate": 0.00013904934635790913, + "loss": 1.1712, + "step": 8378 + }, + { + "epoch": 1.4918091168091168, + "grad_norm": 0.6537824869155884, + "learning_rate": 0.00013903645982274129, + "loss": 1.1162, + "step": 8379 + }, + { + "epoch": 1.4919871794871795, + "grad_norm": 0.6460806727409363, + "learning_rate": 0.0001390235725227351, + "loss": 0.9389, + "step": 8380 + }, + { + "epoch": 1.492165242165242, + "grad_norm": 0.6405501365661621, + "learning_rate": 0.0001390106844581431, + "loss": 1.0508, + "step": 8381 + }, + { + "epoch": 1.4923433048433048, + "grad_norm": 0.6672594547271729, + "learning_rate": 0.00013899779562921775, + "loss": 1.0018, + "step": 8382 + }, + { + "epoch": 1.4925213675213675, + "grad_norm": 0.6303185820579529, + "learning_rate": 0.0001389849060362116, + "loss": 0.9964, + "step": 8383 + }, + { + "epoch": 1.4926994301994303, + "grad_norm": 0.6981508731842041, + "learning_rate": 0.00013897201567937719, + "loss": 1.174, + "step": 8384 + }, + { + "epoch": 1.4928774928774928, + "grad_norm": 0.6195989847183228, + "learning_rate": 0.0001389591245589671, + "loss": 0.9254, + "step": 8385 + }, + { + "epoch": 1.4930555555555556, + "grad_norm": 0.6232163310050964, + "learning_rate": 0.00013894623267523393, + "loss": 0.7151, + 
"step": 8386 + }, + { + "epoch": 1.493233618233618, + "grad_norm": 0.673067033290863, + "learning_rate": 0.0001389333400284302, + "loss": 1.0156, + "step": 8387 + }, + { + "epoch": 1.4934116809116809, + "grad_norm": 0.706266462802887, + "learning_rate": 0.00013892044661880856, + "loss": 0.9387, + "step": 8388 + }, + { + "epoch": 1.4935897435897436, + "grad_norm": 0.742640495300293, + "learning_rate": 0.00013890755244662161, + "loss": 1.1597, + "step": 8389 + }, + { + "epoch": 1.4937678062678064, + "grad_norm": 0.6856846809387207, + "learning_rate": 0.000138894657512122, + "loss": 0.9998, + "step": 8390 + }, + { + "epoch": 1.493945868945869, + "grad_norm": 0.7214110493659973, + "learning_rate": 0.0001388817618155624, + "loss": 1.1867, + "step": 8391 + }, + { + "epoch": 1.4941239316239316, + "grad_norm": 0.7346787452697754, + "learning_rate": 0.0001388688653571954, + "loss": 0.9071, + "step": 8392 + }, + { + "epoch": 1.4943019943019942, + "grad_norm": 0.7019181847572327, + "learning_rate": 0.00013885596813727373, + "loss": 1.0472, + "step": 8393 + }, + { + "epoch": 1.494480056980057, + "grad_norm": 0.6780814528465271, + "learning_rate": 0.00013884307015605012, + "loss": 1.0031, + "step": 8394 + }, + { + "epoch": 1.4946581196581197, + "grad_norm": 0.6722873449325562, + "learning_rate": 0.0001388301714137772, + "loss": 0.8889, + "step": 8395 + }, + { + "epoch": 1.4948361823361824, + "grad_norm": 0.6736134886741638, + "learning_rate": 0.00013881727191070777, + "loss": 0.8695, + "step": 8396 + }, + { + "epoch": 1.495014245014245, + "grad_norm": 0.632648766040802, + "learning_rate": 0.00013880437164709452, + "loss": 0.9391, + "step": 8397 + }, + { + "epoch": 1.4951923076923077, + "grad_norm": 0.7004299163818359, + "learning_rate": 0.0001387914706231902, + "loss": 1.1423, + "step": 8398 + }, + { + "epoch": 1.4953703703703702, + "grad_norm": 0.5787134766578674, + "learning_rate": 0.0001387785688392476, + "loss": 0.9953, + "step": 8399 + }, + { + "epoch": 1.495548433048433, 
+ "grad_norm": 0.6671785712242126, + "learning_rate": 0.0001387656662955195, + "loss": 0.9356, + "step": 8400 + }, + { + "epoch": 1.4957264957264957, + "grad_norm": 0.7216096520423889, + "learning_rate": 0.0001387527629922587, + "loss": 0.9065, + "step": 8401 + }, + { + "epoch": 1.4959045584045585, + "grad_norm": 0.6469849348068237, + "learning_rate": 0.00013873985892971801, + "loss": 1.0664, + "step": 8402 + }, + { + "epoch": 1.496082621082621, + "grad_norm": 0.5598217248916626, + "learning_rate": 0.00013872695410815027, + "loss": 0.8834, + "step": 8403 + }, + { + "epoch": 1.4962606837606838, + "grad_norm": 0.6860302686691284, + "learning_rate": 0.00013871404852780828, + "loss": 0.9061, + "step": 8404 + }, + { + "epoch": 1.4964387464387463, + "grad_norm": 0.7101688385009766, + "learning_rate": 0.00013870114218894497, + "loss": 1.0236, + "step": 8405 + }, + { + "epoch": 1.496616809116809, + "grad_norm": 0.6494225859642029, + "learning_rate": 0.00013868823509181313, + "loss": 0.9631, + "step": 8406 + }, + { + "epoch": 1.4967948717948718, + "grad_norm": 0.6804189085960388, + "learning_rate": 0.00013867532723666574, + "loss": 0.9341, + "step": 8407 + }, + { + "epoch": 1.4969729344729346, + "grad_norm": 0.8493942022323608, + "learning_rate": 0.00013866241862375562, + "loss": 1.1451, + "step": 8408 + }, + { + "epoch": 1.497150997150997, + "grad_norm": 0.6248497366905212, + "learning_rate": 0.00013864950925333576, + "loss": 0.8584, + "step": 8409 + }, + { + "epoch": 1.4973290598290598, + "grad_norm": 0.6238769292831421, + "learning_rate": 0.00013863659912565903, + "loss": 1.1612, + "step": 8410 + }, + { + "epoch": 1.4975071225071226, + "grad_norm": 0.8538609147071838, + "learning_rate": 0.0001386236882409784, + "loss": 1.0817, + "step": 8411 + }, + { + "epoch": 1.4976851851851851, + "grad_norm": 0.7301406264305115, + "learning_rate": 0.00013861077659954683, + "loss": 0.943, + "step": 8412 + }, + { + "epoch": 1.4978632478632479, + "grad_norm": 0.6573456525802612, + 
"learning_rate": 0.0001385978642016173, + "loss": 1.0154, + "step": 8413 + }, + { + "epoch": 1.4980413105413106, + "grad_norm": 0.7634185552597046, + "learning_rate": 0.0001385849510474428, + "loss": 1.0432, + "step": 8414 + }, + { + "epoch": 1.4982193732193732, + "grad_norm": 0.6156686544418335, + "learning_rate": 0.00013857203713727633, + "loss": 1.0442, + "step": 8415 + }, + { + "epoch": 1.498397435897436, + "grad_norm": 0.5386871695518494, + "learning_rate": 0.00013855912247137092, + "loss": 0.9055, + "step": 8416 + }, + { + "epoch": 1.4985754985754987, + "grad_norm": 0.7108574509620667, + "learning_rate": 0.00013854620704997962, + "loss": 0.9705, + "step": 8417 + }, + { + "epoch": 1.4987535612535612, + "grad_norm": 0.7313347458839417, + "learning_rate": 0.00013853329087335547, + "loss": 0.7541, + "step": 8418 + }, + { + "epoch": 1.498931623931624, + "grad_norm": 0.8369119167327881, + "learning_rate": 0.0001385203739417515, + "loss": 1.1317, + "step": 8419 + }, + { + "epoch": 1.4991096866096867, + "grad_norm": 0.6763789057731628, + "learning_rate": 0.00013850745625542085, + "loss": 0.7909, + "step": 8420 + }, + { + "epoch": 1.4992877492877492, + "grad_norm": 0.7369635105133057, + "learning_rate": 0.00013849453781461656, + "loss": 1.1454, + "step": 8421 + }, + { + "epoch": 1.499465811965812, + "grad_norm": 0.7165971398353577, + "learning_rate": 0.0001384816186195918, + "loss": 1.1927, + "step": 8422 + }, + { + "epoch": 1.4996438746438747, + "grad_norm": 0.7502337694168091, + "learning_rate": 0.00013846869867059966, + "loss": 1.0592, + "step": 8423 + }, + { + "epoch": 1.4998219373219372, + "grad_norm": 0.7207813858985901, + "learning_rate": 0.00013845577796789326, + "loss": 1.1133, + "step": 8424 + }, + { + "epoch": 1.4998219373219372, + "eval_loss": 1.1057652235031128, + "eval_runtime": 24.7975, + "eval_samples_per_second": 41.98, + "eval_steps_per_second": 21.01, + "step": 8424 + }, + { + "epoch": 1.5, + "grad_norm": 0.6962727308273315, + "learning_rate": 
0.00013844285651172576, + "loss": 1.0711, + "step": 8425 + }, + { + "epoch": 1.5001780626780628, + "grad_norm": 0.6585133075714111, + "learning_rate": 0.00013842993430235038, + "loss": 0.9793, + "step": 8426 + }, + { + "epoch": 1.5003561253561255, + "grad_norm": 0.7045056819915771, + "learning_rate": 0.00013841701134002029, + "loss": 1.0046, + "step": 8427 + }, + { + "epoch": 1.500534188034188, + "grad_norm": 0.6788702011108398, + "learning_rate": 0.00013840408762498863, + "loss": 0.9539, + "step": 8428 + }, + { + "epoch": 1.5007122507122506, + "grad_norm": 0.7253114581108093, + "learning_rate": 0.00013839116315750863, + "loss": 0.9446, + "step": 8429 + }, + { + "epoch": 1.5008903133903133, + "grad_norm": 0.6103765368461609, + "learning_rate": 0.0001383782379378336, + "loss": 0.7862, + "step": 8430 + }, + { + "epoch": 1.501068376068376, + "grad_norm": 0.6662353873252869, + "learning_rate": 0.00013836531196621666, + "loss": 1.2178, + "step": 8431 + }, + { + "epoch": 1.5012464387464388, + "grad_norm": 0.6871803998947144, + "learning_rate": 0.00013835238524291117, + "loss": 0.9263, + "step": 8432 + }, + { + "epoch": 1.5014245014245016, + "grad_norm": 0.62713223695755, + "learning_rate": 0.00013833945776817034, + "loss": 0.8879, + "step": 8433 + }, + { + "epoch": 1.501602564102564, + "grad_norm": 0.6698164343833923, + "learning_rate": 0.00013832652954224748, + "loss": 0.9847, + "step": 8434 + }, + { + "epoch": 1.5017806267806266, + "grad_norm": 0.6855883002281189, + "learning_rate": 0.0001383136005653959, + "loss": 0.8614, + "step": 8435 + }, + { + "epoch": 1.5019586894586894, + "grad_norm": 0.7028802037239075, + "learning_rate": 0.0001383006708378689, + "loss": 1.0153, + "step": 8436 + }, + { + "epoch": 1.5021367521367521, + "grad_norm": 0.6710380911827087, + "learning_rate": 0.00013828774035991981, + "loss": 1.0163, + "step": 8437 + }, + { + "epoch": 1.5023148148148149, + "grad_norm": 0.618984580039978, + "learning_rate": 0.000138274809131802, + "loss": 1.0015, + 
"step": 8438 + }, + { + "epoch": 1.5024928774928776, + "grad_norm": 0.6881645321846008, + "learning_rate": 0.00013826187715376882, + "loss": 0.9776, + "step": 8439 + }, + { + "epoch": 1.5026709401709402, + "grad_norm": 0.6715859770774841, + "learning_rate": 0.00013824894442607358, + "loss": 0.9129, + "step": 8440 + }, + { + "epoch": 1.5028490028490027, + "grad_norm": 0.5940943360328674, + "learning_rate": 0.0001382360109489698, + "loss": 1.0724, + "step": 8441 + }, + { + "epoch": 1.5030270655270654, + "grad_norm": 0.6536458134651184, + "learning_rate": 0.0001382230767227108, + "loss": 1.0162, + "step": 8442 + }, + { + "epoch": 1.5032051282051282, + "grad_norm": 0.6163156628608704, + "learning_rate": 0.00013821014174755, + "loss": 1.0521, + "step": 8443 + }, + { + "epoch": 1.503383190883191, + "grad_norm": 0.7592282891273499, + "learning_rate": 0.00013819720602374082, + "loss": 0.9525, + "step": 8444 + }, + { + "epoch": 1.5035612535612537, + "grad_norm": 0.6672595143318176, + "learning_rate": 0.0001381842695515368, + "loss": 0.9359, + "step": 8445 + }, + { + "epoch": 1.5037393162393162, + "grad_norm": 0.6395034193992615, + "learning_rate": 0.0001381713323311913, + "loss": 1.166, + "step": 8446 + }, + { + "epoch": 1.5039173789173788, + "grad_norm": 0.5958148837089539, + "learning_rate": 0.00013815839436295783, + "loss": 0.9885, + "step": 8447 + }, + { + "epoch": 1.5040954415954415, + "grad_norm": 0.676555871963501, + "learning_rate": 0.0001381454556470899, + "loss": 1.0637, + "step": 8448 + }, + { + "epoch": 1.5042735042735043, + "grad_norm": 0.642428994178772, + "learning_rate": 0.00013813251618384102, + "loss": 0.9288, + "step": 8449 + }, + { + "epoch": 1.504451566951567, + "grad_norm": 0.6730920076370239, + "learning_rate": 0.00013811957597346467, + "loss": 1.1345, + "step": 8450 + }, + { + "epoch": 1.5046296296296298, + "grad_norm": 0.7824259996414185, + "learning_rate": 0.00013810663501621443, + "loss": 0.7532, + "step": 8451 + }, + { + "epoch": 
1.5048076923076923, + "grad_norm": 0.8184825778007507, + "learning_rate": 0.00013809369331234386, + "loss": 1.2674, + "step": 8452 + }, + { + "epoch": 1.5049857549857548, + "grad_norm": 0.7369286417961121, + "learning_rate": 0.00013808075086210647, + "loss": 1.0978, + "step": 8453 + }, + { + "epoch": 1.5051638176638176, + "grad_norm": 0.6336679458618164, + "learning_rate": 0.00013806780766575588, + "loss": 1.0922, + "step": 8454 + }, + { + "epoch": 1.5053418803418803, + "grad_norm": 0.700219452381134, + "learning_rate": 0.0001380548637235457, + "loss": 1.0908, + "step": 8455 + }, + { + "epoch": 1.505519943019943, + "grad_norm": 0.6346127986907959, + "learning_rate": 0.0001380419190357295, + "loss": 1.1265, + "step": 8456 + }, + { + "epoch": 1.5056980056980058, + "grad_norm": 0.8653196096420288, + "learning_rate": 0.00013802897360256093, + "loss": 1.0466, + "step": 8457 + }, + { + "epoch": 1.5058760683760684, + "grad_norm": 0.6589069962501526, + "learning_rate": 0.0001380160274242936, + "loss": 1.245, + "step": 8458 + }, + { + "epoch": 1.506054131054131, + "grad_norm": 0.6527602076530457, + "learning_rate": 0.00013800308050118117, + "loss": 1.1539, + "step": 8459 + }, + { + "epoch": 1.5062321937321936, + "grad_norm": 0.6005436182022095, + "learning_rate": 0.00013799013283347734, + "loss": 0.899, + "step": 8460 + }, + { + "epoch": 1.5064102564102564, + "grad_norm": 0.6954274773597717, + "learning_rate": 0.0001379771844214358, + "loss": 1.1245, + "step": 8461 + }, + { + "epoch": 1.5065883190883191, + "grad_norm": 0.658764660358429, + "learning_rate": 0.00013796423526531019, + "loss": 0.9884, + "step": 8462 + }, + { + "epoch": 1.506766381766382, + "grad_norm": 0.652214527130127, + "learning_rate": 0.0001379512853653543, + "loss": 0.9711, + "step": 8463 + }, + { + "epoch": 1.5069444444444444, + "grad_norm": 0.5680044889450073, + "learning_rate": 0.00013793833472182176, + "loss": 0.9055, + "step": 8464 + }, + { + "epoch": 1.5071225071225072, + "grad_norm": 
0.7524166703224182, + "learning_rate": 0.0001379253833349664, + "loss": 1.1163, + "step": 8465 + }, + { + "epoch": 1.5073005698005697, + "grad_norm": 0.692936897277832, + "learning_rate": 0.0001379124312050419, + "loss": 0.899, + "step": 8466 + }, + { + "epoch": 1.5074786324786325, + "grad_norm": 0.6871617436408997, + "learning_rate": 0.00013789947833230207, + "loss": 0.9416, + "step": 8467 + }, + { + "epoch": 1.5076566951566952, + "grad_norm": 0.5983462333679199, + "learning_rate": 0.0001378865247170007, + "loss": 0.9776, + "step": 8468 + }, + { + "epoch": 1.507834757834758, + "grad_norm": 0.6486790180206299, + "learning_rate": 0.0001378735703593916, + "loss": 0.9346, + "step": 8469 + }, + { + "epoch": 1.5080128205128205, + "grad_norm": 0.6843809485435486, + "learning_rate": 0.00013786061525972857, + "loss": 1.1276, + "step": 8470 + }, + { + "epoch": 1.5081908831908832, + "grad_norm": 0.5734516382217407, + "learning_rate": 0.00013784765941826538, + "loss": 0.6939, + "step": 8471 + }, + { + "epoch": 1.5083689458689458, + "grad_norm": 0.6126381754875183, + "learning_rate": 0.00013783470283525596, + "loss": 0.8609, + "step": 8472 + }, + { + "epoch": 1.5085470085470085, + "grad_norm": 0.7570928335189819, + "learning_rate": 0.00013782174551095415, + "loss": 0.8809, + "step": 8473 + }, + { + "epoch": 1.5087250712250713, + "grad_norm": 0.6911360025405884, + "learning_rate": 0.00013780878744561377, + "loss": 0.9916, + "step": 8474 + }, + { + "epoch": 1.508903133903134, + "grad_norm": 0.6651954650878906, + "learning_rate": 0.00013779582863948878, + "loss": 1.0012, + "step": 8475 + }, + { + "epoch": 1.5090811965811965, + "grad_norm": 0.845396876335144, + "learning_rate": 0.000137782869092833, + "loss": 0.8455, + "step": 8476 + }, + { + "epoch": 1.5092592592592593, + "grad_norm": 0.6958050727844238, + "learning_rate": 0.00013776990880590042, + "loss": 1.0264, + "step": 8477 + }, + { + "epoch": 1.5094373219373218, + "grad_norm": 0.6950124502182007, + "learning_rate": 
0.00013775694777894493, + "loss": 1.0547, + "step": 8478 + }, + { + "epoch": 1.5096153846153846, + "grad_norm": 0.7243088483810425, + "learning_rate": 0.00013774398601222045, + "loss": 1.0999, + "step": 8479 + }, + { + "epoch": 1.5097934472934473, + "grad_norm": 0.6820448040962219, + "learning_rate": 0.00013773102350598097, + "loss": 0.823, + "step": 8480 + }, + { + "epoch": 1.50997150997151, + "grad_norm": 0.689996063709259, + "learning_rate": 0.0001377180602604805, + "loss": 1.049, + "step": 8481 + }, + { + "epoch": 1.5101495726495726, + "grad_norm": 0.6763314604759216, + "learning_rate": 0.000137705096275973, + "loss": 0.9633, + "step": 8482 + }, + { + "epoch": 1.5103276353276354, + "grad_norm": 0.6760517358779907, + "learning_rate": 0.00013769213155271243, + "loss": 1.0326, + "step": 8483 + }, + { + "epoch": 1.510505698005698, + "grad_norm": 0.7181188464164734, + "learning_rate": 0.00013767916609095285, + "loss": 0.9629, + "step": 8484 + }, + { + "epoch": 1.5106837606837606, + "grad_norm": 0.7102212905883789, + "learning_rate": 0.0001376661998909483, + "loss": 1.2714, + "step": 8485 + }, + { + "epoch": 1.5108618233618234, + "grad_norm": 0.6719805598258972, + "learning_rate": 0.00013765323295295278, + "loss": 0.7848, + "step": 8486 + }, + { + "epoch": 1.5110398860398861, + "grad_norm": 0.6592095494270325, + "learning_rate": 0.0001376402652772204, + "loss": 0.882, + "step": 8487 + }, + { + "epoch": 1.5112179487179487, + "grad_norm": 0.6858693361282349, + "learning_rate": 0.00013762729686400522, + "loss": 0.9418, + "step": 8488 + }, + { + "epoch": 1.5113960113960114, + "grad_norm": 0.7183199524879456, + "learning_rate": 0.0001376143277135613, + "loss": 1.0611, + "step": 8489 + }, + { + "epoch": 1.511574074074074, + "grad_norm": 0.6294263005256653, + "learning_rate": 0.00013760135782614277, + "loss": 0.864, + "step": 8490 + }, + { + "epoch": 1.5117521367521367, + "grad_norm": 0.6762619614601135, + "learning_rate": 0.00013758838720200376, + "loss": 1.0295, + "step": 
8491 + }, + { + "epoch": 1.5119301994301995, + "grad_norm": 0.6919726133346558, + "learning_rate": 0.00013757541584139834, + "loss": 1.0803, + "step": 8492 + }, + { + "epoch": 1.5121082621082622, + "grad_norm": 0.6801241040229797, + "learning_rate": 0.00013756244374458075, + "loss": 1.1394, + "step": 8493 + }, + { + "epoch": 1.5122863247863247, + "grad_norm": 0.6758754253387451, + "learning_rate": 0.0001375494709118051, + "loss": 1.0053, + "step": 8494 + }, + { + "epoch": 1.5124643874643875, + "grad_norm": 0.6727001070976257, + "learning_rate": 0.00013753649734332555, + "loss": 1.1407, + "step": 8495 + }, + { + "epoch": 1.51264245014245, + "grad_norm": 0.693913459777832, + "learning_rate": 0.00013752352303939632, + "loss": 1.1804, + "step": 8496 + }, + { + "epoch": 1.5128205128205128, + "grad_norm": 0.6122510433197021, + "learning_rate": 0.0001375105480002716, + "loss": 0.917, + "step": 8497 + }, + { + "epoch": 1.5129985754985755, + "grad_norm": 0.6305009722709656, + "learning_rate": 0.00013749757222620562, + "loss": 1.1075, + "step": 8498 + }, + { + "epoch": 1.5131766381766383, + "grad_norm": 0.7249642610549927, + "learning_rate": 0.0001374845957174526, + "loss": 0.9107, + "step": 8499 + }, + { + "epoch": 1.5133547008547008, + "grad_norm": 0.6922136545181274, + "learning_rate": 0.0001374716184742668, + "loss": 0.9974, + "step": 8500 + }, + { + "epoch": 1.5135327635327636, + "grad_norm": 0.6989904046058655, + "learning_rate": 0.00013745864049690245, + "loss": 0.9866, + "step": 8501 + }, + { + "epoch": 1.513710826210826, + "grad_norm": 0.6284058094024658, + "learning_rate": 0.0001374456617856139, + "loss": 0.8658, + "step": 8502 + }, + { + "epoch": 1.5138888888888888, + "grad_norm": 0.615388810634613, + "learning_rate": 0.00013743268234065535, + "loss": 0.7876, + "step": 8503 + }, + { + "epoch": 1.5140669515669516, + "grad_norm": 0.6212600469589233, + "learning_rate": 0.0001374197021622812, + "loss": 0.855, + "step": 8504 + }, + { + "epoch": 1.5142450142450143, + 
"grad_norm": 0.6312419772148132, + "learning_rate": 0.00013740672125074567, + "loss": 0.9252, + "step": 8505 + }, + { + "epoch": 1.5144230769230769, + "grad_norm": 0.7094576954841614, + "learning_rate": 0.00013739373960630315, + "loss": 0.7655, + "step": 8506 + }, + { + "epoch": 1.5146011396011396, + "grad_norm": 0.5583470463752747, + "learning_rate": 0.000137380757229208, + "loss": 0.7855, + "step": 8507 + }, + { + "epoch": 1.5147792022792022, + "grad_norm": 0.6798399686813354, + "learning_rate": 0.00013736777411971457, + "loss": 0.9935, + "step": 8508 + }, + { + "epoch": 1.514957264957265, + "grad_norm": 0.7835991978645325, + "learning_rate": 0.00013735479027807723, + "loss": 1.1603, + "step": 8509 + }, + { + "epoch": 1.5151353276353277, + "grad_norm": 0.6230790615081787, + "learning_rate": 0.00013734180570455033, + "loss": 1.1463, + "step": 8510 + }, + { + "epoch": 1.5153133903133904, + "grad_norm": 0.646603524684906, + "learning_rate": 0.00013732882039938835, + "loss": 0.9564, + "step": 8511 + }, + { + "epoch": 1.515491452991453, + "grad_norm": 0.6619647145271301, + "learning_rate": 0.0001373158343628457, + "loss": 0.8492, + "step": 8512 + }, + { + "epoch": 1.5156695156695157, + "grad_norm": 0.6458454132080078, + "learning_rate": 0.00013730284759517675, + "loss": 1.0049, + "step": 8513 + }, + { + "epoch": 1.5158475783475782, + "grad_norm": 0.7415743470191956, + "learning_rate": 0.00013728986009663602, + "loss": 0.872, + "step": 8514 + }, + { + "epoch": 1.516025641025641, + "grad_norm": 0.6198840141296387, + "learning_rate": 0.00013727687186747793, + "loss": 0.8645, + "step": 8515 + }, + { + "epoch": 1.5162037037037037, + "grad_norm": 0.7160853147506714, + "learning_rate": 0.00013726388290795697, + "loss": 1.0144, + "step": 8516 + }, + { + "epoch": 1.5163817663817665, + "grad_norm": 0.6604135632514954, + "learning_rate": 0.00013725089321832765, + "loss": 0.9827, + "step": 8517 + }, + { + "epoch": 1.5165598290598292, + "grad_norm": 0.6480790972709656, + 
"learning_rate": 0.00013723790279884443, + "loss": 1.0357, + "step": 8518 + }, + { + "epoch": 1.5167378917378918, + "grad_norm": 0.6207128167152405, + "learning_rate": 0.00013722491164976187, + "loss": 0.9467, + "step": 8519 + }, + { + "epoch": 1.5169159544159543, + "grad_norm": 0.6024298667907715, + "learning_rate": 0.00013721191977133452, + "loss": 0.8821, + "step": 8520 + }, + { + "epoch": 1.517094017094017, + "grad_norm": 0.684898316860199, + "learning_rate": 0.00013719892716381688, + "loss": 0.9823, + "step": 8521 + }, + { + "epoch": 1.5172720797720798, + "grad_norm": 0.7460635304450989, + "learning_rate": 0.00013718593382746355, + "loss": 1.2573, + "step": 8522 + }, + { + "epoch": 1.5174501424501425, + "grad_norm": 0.7193243503570557, + "learning_rate": 0.00013717293976252907, + "loss": 1.0162, + "step": 8523 + }, + { + "epoch": 1.5176282051282053, + "grad_norm": 0.6328752040863037, + "learning_rate": 0.0001371599449692681, + "loss": 0.8183, + "step": 8524 + }, + { + "epoch": 1.5178062678062678, + "grad_norm": 0.658784806728363, + "learning_rate": 0.00013714694944793517, + "loss": 0.9315, + "step": 8525 + }, + { + "epoch": 1.5179843304843303, + "grad_norm": 0.7875827550888062, + "learning_rate": 0.00013713395319878493, + "loss": 1.0889, + "step": 8526 + }, + { + "epoch": 1.518162393162393, + "grad_norm": 0.6580079793930054, + "learning_rate": 0.00013712095622207203, + "loss": 1.0276, + "step": 8527 + }, + { + "epoch": 1.5183404558404558, + "grad_norm": 0.6214027404785156, + "learning_rate": 0.00013710795851805106, + "loss": 0.9692, + "step": 8528 + }, + { + "epoch": 1.5185185185185186, + "grad_norm": 0.7839403748512268, + "learning_rate": 0.0001370949600869768, + "loss": 0.7378, + "step": 8529 + }, + { + "epoch": 1.5186965811965814, + "grad_norm": 0.6632764339447021, + "learning_rate": 0.0001370819609291038, + "loss": 0.9431, + "step": 8530 + }, + { + "epoch": 1.5188746438746439, + "grad_norm": 0.7071712017059326, + "learning_rate": 0.00013706896104468682, + 
"loss": 0.7684, + "step": 8531 + }, + { + "epoch": 1.5190527065527064, + "grad_norm": 0.7494829297065735, + "learning_rate": 0.00013705596043398058, + "loss": 0.9709, + "step": 8532 + }, + { + "epoch": 1.5192307692307692, + "grad_norm": 0.6408106088638306, + "learning_rate": 0.00013704295909723973, + "loss": 0.8494, + "step": 8533 + }, + { + "epoch": 1.519408831908832, + "grad_norm": 0.6043150424957275, + "learning_rate": 0.0001370299570347191, + "loss": 0.7485, + "step": 8534 + }, + { + "epoch": 1.5195868945868947, + "grad_norm": 0.6944992542266846, + "learning_rate": 0.00013701695424667336, + "loss": 0.8403, + "step": 8535 + }, + { + "epoch": 1.5197649572649574, + "grad_norm": 0.7730217576026917, + "learning_rate": 0.00013700395073335726, + "loss": 0.9122, + "step": 8536 + }, + { + "epoch": 1.51994301994302, + "grad_norm": 0.6300255060195923, + "learning_rate": 0.00013699094649502564, + "loss": 0.9185, + "step": 8537 + }, + { + "epoch": 1.5201210826210825, + "grad_norm": 0.648676335811615, + "learning_rate": 0.00013697794153193327, + "loss": 0.9897, + "step": 8538 + }, + { + "epoch": 1.5202991452991452, + "grad_norm": 0.7365788817405701, + "learning_rate": 0.00013696493584433494, + "loss": 0.958, + "step": 8539 + }, + { + "epoch": 1.520477207977208, + "grad_norm": 0.6634557247161865, + "learning_rate": 0.00013695192943248552, + "loss": 0.9389, + "step": 8540 + }, + { + "epoch": 1.5206552706552707, + "grad_norm": 0.6110827922821045, + "learning_rate": 0.00013693892229663977, + "loss": 0.9341, + "step": 8541 + }, + { + "epoch": 1.5208333333333335, + "grad_norm": 0.7207275032997131, + "learning_rate": 0.00013692591443705256, + "loss": 0.9526, + "step": 8542 + }, + { + "epoch": 1.521011396011396, + "grad_norm": 0.7071022391319275, + "learning_rate": 0.0001369129058539788, + "loss": 0.9572, + "step": 8543 + }, + { + "epoch": 1.5211894586894585, + "grad_norm": 0.5898227691650391, + "learning_rate": 0.0001368998965476733, + "loss": 0.921, + "step": 8544 + }, + { + 
"epoch": 1.5213675213675213, + "grad_norm": 0.7542559504508972, + "learning_rate": 0.000136886886518391, + "loss": 0.7799, + "step": 8545 + }, + { + "epoch": 1.521545584045584, + "grad_norm": 0.6904959678649902, + "learning_rate": 0.00013687387576638674, + "loss": 0.9601, + "step": 8546 + }, + { + "epoch": 1.5217236467236468, + "grad_norm": 0.763414204120636, + "learning_rate": 0.00013686086429191553, + "loss": 1.0046, + "step": 8547 + }, + { + "epoch": 1.5219017094017095, + "grad_norm": 0.6879960298538208, + "learning_rate": 0.00013684785209523224, + "loss": 0.9615, + "step": 8548 + }, + { + "epoch": 1.522079772079772, + "grad_norm": 0.7166057229042053, + "learning_rate": 0.00013683483917659186, + "loss": 0.9481, + "step": 8549 + }, + { + "epoch": 1.5222578347578346, + "grad_norm": 0.6384348273277283, + "learning_rate": 0.0001368218255362493, + "loss": 1.1037, + "step": 8550 + }, + { + "epoch": 1.5224358974358974, + "grad_norm": 0.6564528346061707, + "learning_rate": 0.00013680881117445953, + "loss": 0.951, + "step": 8551 + }, + { + "epoch": 1.52261396011396, + "grad_norm": 0.749301016330719, + "learning_rate": 0.00013679579609147762, + "loss": 0.9324, + "step": 8552 + }, + { + "epoch": 1.5227920227920229, + "grad_norm": 0.8130472898483276, + "learning_rate": 0.00013678278028755848, + "loss": 1.0178, + "step": 8553 + }, + { + "epoch": 1.5229700854700856, + "grad_norm": 0.6763297319412231, + "learning_rate": 0.0001367697637629572, + "loss": 0.9224, + "step": 8554 + }, + { + "epoch": 1.5231481481481481, + "grad_norm": 0.6630885601043701, + "learning_rate": 0.00013675674651792878, + "loss": 1.0254, + "step": 8555 + }, + { + "epoch": 1.5233262108262107, + "grad_norm": 0.7377206087112427, + "learning_rate": 0.00013674372855272825, + "loss": 1.0413, + "step": 8556 + }, + { + "epoch": 1.5235042735042734, + "grad_norm": 0.5270320177078247, + "learning_rate": 0.00013673070986761068, + "loss": 0.7124, + "step": 8557 + }, + { + "epoch": 1.5236823361823362, + "grad_norm": 
0.5941976308822632, + "learning_rate": 0.00013671769046283116, + "loss": 1.0281, + "step": 8558 + }, + { + "epoch": 1.523860398860399, + "grad_norm": 0.6131376028060913, + "learning_rate": 0.0001367046703386448, + "loss": 0.7593, + "step": 8559 + }, + { + "epoch": 1.5240384615384617, + "grad_norm": 0.7381763458251953, + "learning_rate": 0.00013669164949530664, + "loss": 1.148, + "step": 8560 + }, + { + "epoch": 1.5242165242165242, + "grad_norm": 0.683274507522583, + "learning_rate": 0.00013667862793307185, + "loss": 0.8354, + "step": 8561 + }, + { + "epoch": 1.5243945868945867, + "grad_norm": 0.6912649273872375, + "learning_rate": 0.0001366656056521955, + "loss": 0.9043, + "step": 8562 + }, + { + "epoch": 1.5245726495726495, + "grad_norm": 0.5999594330787659, + "learning_rate": 0.0001366525826529328, + "loss": 0.6138, + "step": 8563 + }, + { + "epoch": 1.5247507122507122, + "grad_norm": 0.7185927629470825, + "learning_rate": 0.00013663955893553892, + "loss": 0.895, + "step": 8564 + }, + { + "epoch": 1.524928774928775, + "grad_norm": 0.5967002511024475, + "learning_rate": 0.00013662653450026893, + "loss": 0.9636, + "step": 8565 + }, + { + "epoch": 1.5251068376068377, + "grad_norm": 0.7122953534126282, + "learning_rate": 0.00013661350934737813, + "loss": 0.9465, + "step": 8566 + }, + { + "epoch": 1.5252849002849003, + "grad_norm": 0.705326497554779, + "learning_rate": 0.00013660048347712163, + "loss": 1.121, + "step": 8567 + }, + { + "epoch": 1.5254629629629628, + "grad_norm": 0.6023733019828796, + "learning_rate": 0.0001365874568897547, + "loss": 0.9881, + "step": 8568 + }, + { + "epoch": 1.5256410256410255, + "grad_norm": 0.6883122324943542, + "learning_rate": 0.0001365744295855326, + "loss": 1.2372, + "step": 8569 + }, + { + "epoch": 1.5258190883190883, + "grad_norm": 0.718126654624939, + "learning_rate": 0.0001365614015647105, + "loss": 1.0888, + "step": 8570 + }, + { + "epoch": 1.525997150997151, + "grad_norm": 0.6649243831634521, + "learning_rate": 
0.00013654837282754367, + "loss": 1.0458, + "step": 8571 + }, + { + "epoch": 1.5261752136752138, + "grad_norm": 0.6959797143936157, + "learning_rate": 0.00013653534337428738, + "loss": 0.9282, + "step": 8572 + }, + { + "epoch": 1.5263532763532763, + "grad_norm": 0.6069976687431335, + "learning_rate": 0.00013652231320519697, + "loss": 0.9706, + "step": 8573 + }, + { + "epoch": 1.526531339031339, + "grad_norm": 0.7085374593734741, + "learning_rate": 0.0001365092823205277, + "loss": 1.1241, + "step": 8574 + }, + { + "epoch": 1.5267094017094016, + "grad_norm": 0.575106143951416, + "learning_rate": 0.00013649625072053488, + "loss": 0.9814, + "step": 8575 + }, + { + "epoch": 1.5268874643874644, + "grad_norm": 0.6541273593902588, + "learning_rate": 0.00013648321840547384, + "loss": 1.0155, + "step": 8576 + }, + { + "epoch": 1.5270655270655271, + "grad_norm": 0.6754382848739624, + "learning_rate": 0.0001364701853755999, + "loss": 1.0284, + "step": 8577 + }, + { + "epoch": 1.5272435897435899, + "grad_norm": 0.6219634413719177, + "learning_rate": 0.00013645715163116846, + "loss": 1.1539, + "step": 8578 + }, + { + "epoch": 1.5274216524216524, + "grad_norm": 0.7625157833099365, + "learning_rate": 0.00013644411717243486, + "loss": 1.1157, + "step": 8579 + }, + { + "epoch": 1.5275997150997151, + "grad_norm": 0.6944296956062317, + "learning_rate": 0.0001364310819996545, + "loss": 0.8309, + "step": 8580 + }, + { + "epoch": 1.5277777777777777, + "grad_norm": 0.7198494672775269, + "learning_rate": 0.00013641804611308277, + "loss": 1.0883, + "step": 8581 + }, + { + "epoch": 1.5279558404558404, + "grad_norm": 0.6398822069168091, + "learning_rate": 0.00013640500951297508, + "loss": 1.0173, + "step": 8582 + }, + { + "epoch": 1.5281339031339032, + "grad_norm": 0.7306683659553528, + "learning_rate": 0.00013639197219958682, + "loss": 0.9979, + "step": 8583 + }, + { + "epoch": 1.528311965811966, + "grad_norm": 0.6873512268066406, + "learning_rate": 0.00013637893417317348, + "loss": 0.7883, 
+ "step": 8584 + }, + { + "epoch": 1.5284900284900285, + "grad_norm": 0.6482085585594177, + "learning_rate": 0.00013636589543399052, + "loss": 0.9367, + "step": 8585 + }, + { + "epoch": 1.5286680911680912, + "grad_norm": 0.8161232471466064, + "learning_rate": 0.00013635285598229336, + "loss": 1.0582, + "step": 8586 + }, + { + "epoch": 1.5288461538461537, + "grad_norm": 0.6722155809402466, + "learning_rate": 0.0001363398158183375, + "loss": 0.9805, + "step": 8587 + }, + { + "epoch": 1.5290242165242165, + "grad_norm": 0.7175397872924805, + "learning_rate": 0.00013632677494237845, + "loss": 1.0747, + "step": 8588 + }, + { + "epoch": 1.5292022792022792, + "grad_norm": 0.6665592789649963, + "learning_rate": 0.00013631373335467172, + "loss": 1.006, + "step": 8589 + }, + { + "epoch": 1.529380341880342, + "grad_norm": 0.7002299427986145, + "learning_rate": 0.0001363006910554728, + "loss": 1.0702, + "step": 8590 + }, + { + "epoch": 1.5295584045584045, + "grad_norm": 0.7712168097496033, + "learning_rate": 0.00013628764804503725, + "loss": 1.0628, + "step": 8591 + }, + { + "epoch": 1.5297364672364673, + "grad_norm": 0.6620795130729675, + "learning_rate": 0.0001362746043236206, + "loss": 1.01, + "step": 8592 + }, + { + "epoch": 1.5299145299145298, + "grad_norm": 0.6374393701553345, + "learning_rate": 0.00013626155989147846, + "loss": 0.9106, + "step": 8593 + }, + { + "epoch": 1.5300925925925926, + "grad_norm": 0.6531631946563721, + "learning_rate": 0.00013624851474886636, + "loss": 1.0488, + "step": 8594 + }, + { + "epoch": 1.5302706552706553, + "grad_norm": 0.6843775510787964, + "learning_rate": 0.00013623546889603993, + "loss": 0.8599, + "step": 8595 + }, + { + "epoch": 1.530448717948718, + "grad_norm": 0.7232706546783447, + "learning_rate": 0.00013622242233325476, + "loss": 1.0875, + "step": 8596 + }, + { + "epoch": 1.5306267806267806, + "grad_norm": 0.695691704750061, + "learning_rate": 0.00013620937506076644, + "loss": 0.9835, + "step": 8597 + }, + { + "epoch": 
1.5308048433048433, + "grad_norm": 0.6321248412132263, + "learning_rate": 0.00013619632707883065, + "loss": 0.9778, + "step": 8598 + }, + { + "epoch": 1.5309829059829059, + "grad_norm": 0.6469168663024902, + "learning_rate": 0.00013618327838770303, + "loss": 0.9968, + "step": 8599 + }, + { + "epoch": 1.5311609686609686, + "grad_norm": 0.6798683404922485, + "learning_rate": 0.00013617022898763925, + "loss": 0.78, + "step": 8600 + }, + { + "epoch": 1.5313390313390314, + "grad_norm": 0.6932336091995239, + "learning_rate": 0.00013615717887889496, + "loss": 0.9473, + "step": 8601 + }, + { + "epoch": 1.5315170940170941, + "grad_norm": 0.7304185628890991, + "learning_rate": 0.00013614412806172585, + "loss": 1.0478, + "step": 8602 + }, + { + "epoch": 1.5316951566951567, + "grad_norm": 0.6585272550582886, + "learning_rate": 0.00013613107653638763, + "loss": 0.8563, + "step": 8603 + }, + { + "epoch": 1.5318732193732194, + "grad_norm": 0.6804470419883728, + "learning_rate": 0.00013611802430313604, + "loss": 0.9839, + "step": 8604 + }, + { + "epoch": 1.532051282051282, + "grad_norm": 0.7271378040313721, + "learning_rate": 0.0001361049713622268, + "loss": 1.0906, + "step": 8605 + }, + { + "epoch": 1.5322293447293447, + "grad_norm": 0.7731603980064392, + "learning_rate": 0.00013609191771391562, + "loss": 1.1318, + "step": 8606 + }, + { + "epoch": 1.5324074074074074, + "grad_norm": 0.6143709421157837, + "learning_rate": 0.0001360788633584583, + "loss": 0.8726, + "step": 8607 + }, + { + "epoch": 1.5325854700854702, + "grad_norm": 0.6847203373908997, + "learning_rate": 0.00013606580829611056, + "loss": 0.9963, + "step": 8608 + }, + { + "epoch": 1.5327635327635327, + "grad_norm": 0.7561219334602356, + "learning_rate": 0.0001360527525271283, + "loss": 0.8873, + "step": 8609 + }, + { + "epoch": 1.5329415954415955, + "grad_norm": 0.7997925281524658, + "learning_rate": 0.0001360396960517672, + "loss": 0.7675, + "step": 8610 + }, + { + "epoch": 1.533119658119658, + "grad_norm": 
0.7206357717514038, + "learning_rate": 0.00013602663887028315, + "loss": 1.0084, + "step": 8611 + }, + { + "epoch": 1.5332977207977208, + "grad_norm": 0.6454238891601562, + "learning_rate": 0.00013601358098293194, + "loss": 0.8194, + "step": 8612 + }, + { + "epoch": 1.5334757834757835, + "grad_norm": 0.5531884431838989, + "learning_rate": 0.0001360005223899694, + "loss": 0.8596, + "step": 8613 + }, + { + "epoch": 1.5336538461538463, + "grad_norm": 0.659161388874054, + "learning_rate": 0.00013598746309165144, + "loss": 1.0363, + "step": 8614 + }, + { + "epoch": 1.5338319088319088, + "grad_norm": 0.6958948373794556, + "learning_rate": 0.00013597440308823385, + "loss": 0.9852, + "step": 8615 + }, + { + "epoch": 1.5340099715099715, + "grad_norm": 0.7147171497344971, + "learning_rate": 0.0001359613423799726, + "loss": 1.0506, + "step": 8616 + }, + { + "epoch": 1.534188034188034, + "grad_norm": 0.604450523853302, + "learning_rate": 0.00013594828096712353, + "loss": 0.9344, + "step": 8617 + }, + { + "epoch": 1.5343660968660968, + "grad_norm": 0.714547336101532, + "learning_rate": 0.00013593521884994257, + "loss": 1.1583, + "step": 8618 + }, + { + "epoch": 1.5345441595441596, + "grad_norm": 0.6864442825317383, + "learning_rate": 0.00013592215602868565, + "loss": 0.991, + "step": 8619 + }, + { + "epoch": 1.5347222222222223, + "grad_norm": 0.6384446620941162, + "learning_rate": 0.00013590909250360873, + "loss": 0.8799, + "step": 8620 + }, + { + "epoch": 1.5349002849002849, + "grad_norm": 0.7307949662208557, + "learning_rate": 0.00013589602827496772, + "loss": 1.0276, + "step": 8621 + }, + { + "epoch": 1.5350783475783476, + "grad_norm": 0.6620129942893982, + "learning_rate": 0.00013588296334301862, + "loss": 0.9378, + "step": 8622 + }, + { + "epoch": 1.5352564102564101, + "grad_norm": 0.7216851711273193, + "learning_rate": 0.00013586989770801735, + "loss": 0.8984, + "step": 8623 + }, + { + "epoch": 1.5354344729344729, + "grad_norm": 0.7319885492324829, + "learning_rate": 
0.00013585683137022, + "loss": 1.0357, + "step": 8624 + }, + { + "epoch": 1.5356125356125356, + "grad_norm": 0.7455703616142273, + "learning_rate": 0.00013584376432988247, + "loss": 0.9727, + "step": 8625 + }, + { + "epoch": 1.5357905982905984, + "grad_norm": 0.7285277247428894, + "learning_rate": 0.0001358306965872609, + "loss": 1.1132, + "step": 8626 + }, + { + "epoch": 1.535968660968661, + "grad_norm": 0.6250096559524536, + "learning_rate": 0.00013581762814261124, + "loss": 0.8538, + "step": 8627 + }, + { + "epoch": 1.5361467236467237, + "grad_norm": 0.6252279281616211, + "learning_rate": 0.0001358045589961895, + "loss": 0.822, + "step": 8628 + }, + { + "epoch": 1.5363247863247862, + "grad_norm": 0.7723368406295776, + "learning_rate": 0.0001357914891482519, + "loss": 0.9841, + "step": 8629 + }, + { + "epoch": 1.536502849002849, + "grad_norm": 0.6855236887931824, + "learning_rate": 0.00013577841859905435, + "loss": 0.9512, + "step": 8630 + }, + { + "epoch": 1.5366809116809117, + "grad_norm": 0.8320944309234619, + "learning_rate": 0.00013576534734885303, + "loss": 1.0324, + "step": 8631 + }, + { + "epoch": 1.5368589743589745, + "grad_norm": 0.6970052123069763, + "learning_rate": 0.00013575227539790405, + "loss": 0.9874, + "step": 8632 + }, + { + "epoch": 1.5370370370370372, + "grad_norm": 0.7774853110313416, + "learning_rate": 0.00013573920274646345, + "loss": 0.962, + "step": 8633 + }, + { + "epoch": 1.5372150997150997, + "grad_norm": 0.6479182839393616, + "learning_rate": 0.0001357261293947875, + "loss": 0.9438, + "step": 8634 + }, + { + "epoch": 1.5373931623931623, + "grad_norm": 0.6855679750442505, + "learning_rate": 0.00013571305534313218, + "loss": 1.0898, + "step": 8635 + }, + { + "epoch": 1.537571225071225, + "grad_norm": 0.6527835726737976, + "learning_rate": 0.00013569998059175377, + "loss": 0.954, + "step": 8636 + }, + { + "epoch": 1.5377492877492878, + "grad_norm": 0.6601176857948303, + "learning_rate": 0.00013568690514090837, + "loss": 1.0183, + 
"step": 8637 + }, + { + "epoch": 1.5379273504273505, + "grad_norm": 0.6628120541572571, + "learning_rate": 0.0001356738289908522, + "loss": 1.0651, + "step": 8638 + }, + { + "epoch": 1.5381054131054133, + "grad_norm": 0.7492203712463379, + "learning_rate": 0.00013566075214184147, + "loss": 1.2438, + "step": 8639 + }, + { + "epoch": 1.5382834757834758, + "grad_norm": 0.6781343817710876, + "learning_rate": 0.00013564767459413237, + "loss": 0.9413, + "step": 8640 + }, + { + "epoch": 1.5384615384615383, + "grad_norm": 0.6890891790390015, + "learning_rate": 0.00013563459634798115, + "loss": 0.9912, + "step": 8641 + }, + { + "epoch": 1.538639601139601, + "grad_norm": 0.722820520401001, + "learning_rate": 0.00013562151740364404, + "loss": 1.1799, + "step": 8642 + }, + { + "epoch": 1.5388176638176638, + "grad_norm": 0.738369882106781, + "learning_rate": 0.0001356084377613773, + "loss": 1.1313, + "step": 8643 + }, + { + "epoch": 1.5389957264957266, + "grad_norm": 0.6232718229293823, + "learning_rate": 0.00013559535742143717, + "loss": 0.9035, + "step": 8644 + }, + { + "epoch": 1.5391737891737893, + "grad_norm": 0.7371624708175659, + "learning_rate": 0.00013558227638407996, + "loss": 1.3377, + "step": 8645 + }, + { + "epoch": 1.5393518518518519, + "grad_norm": 0.658353865146637, + "learning_rate": 0.00013556919464956197, + "loss": 0.9591, + "step": 8646 + }, + { + "epoch": 1.5395299145299144, + "grad_norm": 0.6205827593803406, + "learning_rate": 0.0001355561122181395, + "loss": 0.9217, + "step": 8647 + }, + { + "epoch": 1.5397079772079771, + "grad_norm": 0.5892502069473267, + "learning_rate": 0.00013554302909006888, + "loss": 0.8893, + "step": 8648 + }, + { + "epoch": 1.53988603988604, + "grad_norm": 1.224568486213684, + "learning_rate": 0.0001355299452656064, + "loss": 0.8237, + "step": 8649 + }, + { + "epoch": 1.5400641025641026, + "grad_norm": 0.7732635736465454, + "learning_rate": 0.0001355168607450085, + "loss": 1.1043, + "step": 8650 + }, + { + "epoch": 
1.5402421652421654, + "grad_norm": 0.6365402340888977, + "learning_rate": 0.00013550377552853146, + "loss": 1.0345, + "step": 8651 + }, + { + "epoch": 1.540420227920228, + "grad_norm": 0.7046400904655457, + "learning_rate": 0.00013549068961643171, + "loss": 1.0361, + "step": 8652 + }, + { + "epoch": 1.5405982905982905, + "grad_norm": 0.6760256886482239, + "learning_rate": 0.0001354776030089656, + "loss": 0.9437, + "step": 8653 + }, + { + "epoch": 1.5407763532763532, + "grad_norm": 0.6180984973907471, + "learning_rate": 0.00013546451570638958, + "loss": 0.9737, + "step": 8654 + }, + { + "epoch": 1.540954415954416, + "grad_norm": 0.6221960186958313, + "learning_rate": 0.00013545142770896005, + "loss": 0.9313, + "step": 8655 + }, + { + "epoch": 1.5411324786324787, + "grad_norm": 0.6887816786766052, + "learning_rate": 0.0001354383390169334, + "loss": 1.1736, + "step": 8656 + }, + { + "epoch": 1.5413105413105415, + "grad_norm": 0.5840606093406677, + "learning_rate": 0.00013542524963056614, + "loss": 0.9269, + "step": 8657 + }, + { + "epoch": 1.541488603988604, + "grad_norm": 0.7396654486656189, + "learning_rate": 0.00013541215955011472, + "loss": 1.1189, + "step": 8658 + }, + { + "epoch": 1.5416666666666665, + "grad_norm": 0.780616819858551, + "learning_rate": 0.00013539906877583555, + "loss": 1.1251, + "step": 8659 + }, + { + "epoch": 1.5418447293447293, + "grad_norm": 0.6975206732749939, + "learning_rate": 0.0001353859773079852, + "loss": 1.2134, + "step": 8660 + }, + { + "epoch": 1.542022792022792, + "grad_norm": 0.7572869658470154, + "learning_rate": 0.00013537288514682013, + "loss": 0.9396, + "step": 8661 + }, + { + "epoch": 1.5422008547008548, + "grad_norm": 0.6252159476280212, + "learning_rate": 0.00013535979229259686, + "loss": 0.8449, + "step": 8662 + }, + { + "epoch": 1.5423789173789175, + "grad_norm": 0.7321650981903076, + "learning_rate": 0.0001353466987455719, + "loss": 1.3263, + "step": 8663 + }, + { + "epoch": 1.54255698005698, + "grad_norm": 
0.7168700695037842, + "learning_rate": 0.00013533360450600177, + "loss": 0.8923, + "step": 8664 + }, + { + "epoch": 1.5427350427350426, + "grad_norm": 0.5931934714317322, + "learning_rate": 0.00013532050957414313, + "loss": 0.8448, + "step": 8665 + }, + { + "epoch": 1.5429131054131053, + "grad_norm": 0.6621279120445251, + "learning_rate": 0.00013530741395025245, + "loss": 1.1023, + "step": 8666 + }, + { + "epoch": 1.543091168091168, + "grad_norm": 0.7133732438087463, + "learning_rate": 0.00013529431763458633, + "loss": 0.9986, + "step": 8667 + }, + { + "epoch": 1.5432692307692308, + "grad_norm": 0.7589015960693359, + "learning_rate": 0.0001352812206274014, + "loss": 1.0111, + "step": 8668 + }, + { + "epoch": 1.5434472934472936, + "grad_norm": 0.6958192586898804, + "learning_rate": 0.0001352681229289542, + "loss": 0.9466, + "step": 8669 + }, + { + "epoch": 1.5436253561253561, + "grad_norm": 0.7539750337600708, + "learning_rate": 0.0001352550245395014, + "loss": 1.0974, + "step": 8670 + }, + { + "epoch": 1.5438034188034186, + "grad_norm": 0.7003816366195679, + "learning_rate": 0.00013524192545929964, + "loss": 1.0354, + "step": 8671 + }, + { + "epoch": 1.5439814814814814, + "grad_norm": 0.6503025889396667, + "learning_rate": 0.00013522882568860558, + "loss": 1.0476, + "step": 8672 + }, + { + "epoch": 1.5441595441595442, + "grad_norm": 0.6757345199584961, + "learning_rate": 0.00013521572522767584, + "loss": 0.864, + "step": 8673 + }, + { + "epoch": 1.544337606837607, + "grad_norm": 0.6857611536979675, + "learning_rate": 0.0001352026240767671, + "loss": 1.1627, + "step": 8674 + }, + { + "epoch": 1.5445156695156697, + "grad_norm": 0.5775430798530579, + "learning_rate": 0.0001351895222361361, + "loss": 0.7444, + "step": 8675 + }, + { + "epoch": 1.5446937321937322, + "grad_norm": 0.7511499524116516, + "learning_rate": 0.00013517641970603952, + "loss": 1.1547, + "step": 8676 + }, + { + "epoch": 1.5448717948717947, + "grad_norm": 0.6727504730224609, + "learning_rate": 
0.00013516331648673403, + "loss": 1.0829, + "step": 8677 + }, + { + "epoch": 1.5450498575498575, + "grad_norm": 0.6128812432289124, + "learning_rate": 0.00013515021257847642, + "loss": 0.9318, + "step": 8678 + }, + { + "epoch": 1.5452279202279202, + "grad_norm": 0.7309781312942505, + "learning_rate": 0.00013513710798152343, + "loss": 1.0844, + "step": 8679 + }, + { + "epoch": 1.545405982905983, + "grad_norm": 0.695655882358551, + "learning_rate": 0.00013512400269613176, + "loss": 1.113, + "step": 8680 + }, + { + "epoch": 1.5455840455840457, + "grad_norm": 0.696441650390625, + "learning_rate": 0.00013511089672255824, + "loss": 1.0499, + "step": 8681 + }, + { + "epoch": 1.5457621082621082, + "grad_norm": 0.6309961080551147, + "learning_rate": 0.00013509779006105964, + "loss": 0.8759, + "step": 8682 + }, + { + "epoch": 1.5459401709401708, + "grad_norm": 0.6155984401702881, + "learning_rate": 0.00013508468271189277, + "loss": 0.8967, + "step": 8683 + }, + { + "epoch": 1.5461182336182335, + "grad_norm": 0.6786884665489197, + "learning_rate": 0.00013507157467531442, + "loss": 1.0806, + "step": 8684 + }, + { + "epoch": 1.5462962962962963, + "grad_norm": 0.6494075059890747, + "learning_rate": 0.00013505846595158138, + "loss": 1.0196, + "step": 8685 + }, + { + "epoch": 1.546474358974359, + "grad_norm": 0.7599824070930481, + "learning_rate": 0.00013504535654095055, + "loss": 0.8662, + "step": 8686 + }, + { + "epoch": 1.5466524216524218, + "grad_norm": 0.6017210483551025, + "learning_rate": 0.00013503224644367877, + "loss": 0.872, + "step": 8687 + }, + { + "epoch": 1.5468304843304843, + "grad_norm": 0.7972410321235657, + "learning_rate": 0.00013501913566002288, + "loss": 1.0958, + "step": 8688 + }, + { + "epoch": 1.547008547008547, + "grad_norm": 0.7572960257530212, + "learning_rate": 0.00013500602419023978, + "loss": 1.0219, + "step": 8689 + }, + { + "epoch": 1.5471866096866096, + "grad_norm": 0.6329224109649658, + "learning_rate": 0.00013499291203458635, + "loss": 0.8636, + 
"step": 8690 + }, + { + "epoch": 1.5473646723646723, + "grad_norm": 0.6777113080024719, + "learning_rate": 0.0001349797991933195, + "loss": 1.0297, + "step": 8691 + }, + { + "epoch": 1.547542735042735, + "grad_norm": 0.6449527144432068, + "learning_rate": 0.00013496668566669617, + "loss": 1.0296, + "step": 8692 + }, + { + "epoch": 1.5477207977207978, + "grad_norm": 0.8236973881721497, + "learning_rate": 0.00013495357145497326, + "loss": 0.8569, + "step": 8693 + }, + { + "epoch": 1.5478988603988604, + "grad_norm": 0.6753743290901184, + "learning_rate": 0.0001349404565584077, + "loss": 1.0733, + "step": 8694 + }, + { + "epoch": 1.5480769230769231, + "grad_norm": 0.6642967462539673, + "learning_rate": 0.0001349273409772565, + "loss": 0.9437, + "step": 8695 + }, + { + "epoch": 1.5482549857549857, + "grad_norm": 0.6470823884010315, + "learning_rate": 0.00013491422471177661, + "loss": 0.999, + "step": 8696 + }, + { + "epoch": 1.5484330484330484, + "grad_norm": 0.7287036776542664, + "learning_rate": 0.000134901107762225, + "loss": 0.9396, + "step": 8697 + }, + { + "epoch": 1.5486111111111112, + "grad_norm": 0.6258324980735779, + "learning_rate": 0.00013488799012885872, + "loss": 1.045, + "step": 8698 + }, + { + "epoch": 1.548789173789174, + "grad_norm": 0.6540539860725403, + "learning_rate": 0.00013487487181193473, + "loss": 0.9939, + "step": 8699 + }, + { + "epoch": 1.5489672364672364, + "grad_norm": 0.7129563093185425, + "learning_rate": 0.00013486175281171003, + "loss": 1.2079, + "step": 8700 + }, + { + "epoch": 1.5491452991452992, + "grad_norm": 0.6383145451545715, + "learning_rate": 0.00013484863312844173, + "loss": 0.9999, + "step": 8701 + }, + { + "epoch": 1.5493233618233617, + "grad_norm": 0.6310200691223145, + "learning_rate": 0.0001348355127623869, + "loss": 1.1193, + "step": 8702 + }, + { + "epoch": 1.5495014245014245, + "grad_norm": 0.6370054483413696, + "learning_rate": 0.0001348223917138025, + "loss": 1.0213, + "step": 8703 + }, + { + "epoch": 
1.5496794871794872, + "grad_norm": 0.7052688598632812, + "learning_rate": 0.00013480926998294573, + "loss": 0.8773, + "step": 8704 + }, + { + "epoch": 1.54985754985755, + "grad_norm": 0.6369579434394836, + "learning_rate": 0.00013479614757007355, + "loss": 1.0072, + "step": 8705 + }, + { + "epoch": 1.5500356125356125, + "grad_norm": 0.7152075171470642, + "learning_rate": 0.0001347830244754432, + "loss": 1.0409, + "step": 8706 + }, + { + "epoch": 1.5502136752136753, + "grad_norm": 0.654183566570282, + "learning_rate": 0.00013476990069931173, + "loss": 0.9363, + "step": 8707 + }, + { + "epoch": 1.5503917378917378, + "grad_norm": 0.6700537204742432, + "learning_rate": 0.00013475677624193627, + "loss": 0.985, + "step": 8708 + }, + { + "epoch": 1.5505698005698005, + "grad_norm": 0.7195445895195007, + "learning_rate": 0.00013474365110357402, + "loss": 0.988, + "step": 8709 + }, + { + "epoch": 1.5507478632478633, + "grad_norm": 0.6019890904426575, + "learning_rate": 0.00013473052528448201, + "loss": 0.9915, + "step": 8710 + }, + { + "epoch": 1.550925925925926, + "grad_norm": 0.7787565588951111, + "learning_rate": 0.0001347173987849176, + "loss": 0.9676, + "step": 8711 + }, + { + "epoch": 1.5511039886039886, + "grad_norm": 0.6997103691101074, + "learning_rate": 0.00013470427160513782, + "loss": 1.1158, + "step": 8712 + }, + { + "epoch": 1.5512820512820513, + "grad_norm": 0.6259464025497437, + "learning_rate": 0.00013469114374539998, + "loss": 0.8784, + "step": 8713 + }, + { + "epoch": 1.5514601139601139, + "grad_norm": 0.6159056425094604, + "learning_rate": 0.00013467801520596122, + "loss": 0.9184, + "step": 8714 + }, + { + "epoch": 1.5516381766381766, + "grad_norm": 0.6823606491088867, + "learning_rate": 0.00013466488598707876, + "loss": 0.9542, + "step": 8715 + }, + { + "epoch": 1.5518162393162394, + "grad_norm": 0.6781585812568665, + "learning_rate": 0.0001346517560890099, + "loss": 1.1761, + "step": 8716 + }, + { + "epoch": 1.551994301994302, + "grad_norm": 
0.6313831806182861, + "learning_rate": 0.00013463862551201184, + "loss": 0.8935, + "step": 8717 + }, + { + "epoch": 1.5521723646723646, + "grad_norm": 0.7466186881065369, + "learning_rate": 0.0001346254942563419, + "loss": 1.0583, + "step": 8718 + }, + { + "epoch": 1.5523504273504274, + "grad_norm": 0.7073680758476257, + "learning_rate": 0.0001346123623222573, + "loss": 0.9863, + "step": 8719 + }, + { + "epoch": 1.55252849002849, + "grad_norm": 0.6286870241165161, + "learning_rate": 0.00013459922971001536, + "loss": 0.9921, + "step": 8720 + }, + { + "epoch": 1.5527065527065527, + "grad_norm": 0.6047035455703735, + "learning_rate": 0.0001345860964198734, + "loss": 0.9155, + "step": 8721 + }, + { + "epoch": 1.5528846153846154, + "grad_norm": 0.5909964442253113, + "learning_rate": 0.00013457296245208874, + "loss": 0.9593, + "step": 8722 + }, + { + "epoch": 1.5530626780626782, + "grad_norm": 0.7838597893714905, + "learning_rate": 0.00013455982780691869, + "loss": 0.8872, + "step": 8723 + }, + { + "epoch": 1.5532407407407407, + "grad_norm": 0.6914706230163574, + "learning_rate": 0.00013454669248462063, + "loss": 0.9104, + "step": 8724 + }, + { + "epoch": 1.5534188034188035, + "grad_norm": 0.6777952909469604, + "learning_rate": 0.00013453355648545182, + "loss": 0.9839, + "step": 8725 + }, + { + "epoch": 1.553596866096866, + "grad_norm": 0.7482799291610718, + "learning_rate": 0.00013452041980966978, + "loss": 1.1164, + "step": 8726 + }, + { + "epoch": 1.5537749287749287, + "grad_norm": 0.6616327166557312, + "learning_rate": 0.0001345072824575318, + "loss": 0.9574, + "step": 8727 + }, + { + "epoch": 1.5539529914529915, + "grad_norm": 0.7193203568458557, + "learning_rate": 0.00013449414442929532, + "loss": 1.0609, + "step": 8728 + }, + { + "epoch": 1.5541310541310542, + "grad_norm": 0.6599446535110474, + "learning_rate": 0.0001344810057252177, + "loss": 0.9574, + "step": 8729 + }, + { + "epoch": 1.5543091168091168, + "grad_norm": 0.7221707105636597, + "learning_rate": 
0.00013446786634555642, + "loss": 0.9819, + "step": 8730 + }, + { + "epoch": 1.5544871794871795, + "grad_norm": 0.6531312465667725, + "learning_rate": 0.0001344547262905689, + "loss": 0.9986, + "step": 8731 + }, + { + "epoch": 1.554665242165242, + "grad_norm": 0.6879804730415344, + "learning_rate": 0.0001344415855605126, + "loss": 1.1078, + "step": 8732 + }, + { + "epoch": 1.5548433048433048, + "grad_norm": 0.708907425403595, + "learning_rate": 0.00013442844415564498, + "loss": 1.0221, + "step": 8733 + }, + { + "epoch": 1.5550213675213675, + "grad_norm": 0.7957375645637512, + "learning_rate": 0.0001344153020762235, + "loss": 1.3101, + "step": 8734 + }, + { + "epoch": 1.5551994301994303, + "grad_norm": 0.7068197727203369, + "learning_rate": 0.00013440215932250567, + "loss": 0.8995, + "step": 8735 + }, + { + "epoch": 1.5553774928774928, + "grad_norm": 0.6455841064453125, + "learning_rate": 0.00013438901589474898, + "loss": 0.7244, + "step": 8736 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 0.7500516772270203, + "learning_rate": 0.00013437587179321097, + "loss": 1.0161, + "step": 8737 + }, + { + "epoch": 1.555733618233618, + "grad_norm": 0.5983143448829651, + "learning_rate": 0.00013436272701814917, + "loss": 0.9922, + "step": 8738 + }, + { + "epoch": 1.5559116809116809, + "grad_norm": 0.8761729598045349, + "learning_rate": 0.0001343495815698211, + "loss": 1.022, + "step": 8739 + }, + { + "epoch": 1.5560897435897436, + "grad_norm": 0.6901857852935791, + "learning_rate": 0.00013433643544848438, + "loss": 1.0668, + "step": 8740 + }, + { + "epoch": 1.5562678062678064, + "grad_norm": 0.6770836114883423, + "learning_rate": 0.00013432328865439647, + "loss": 0.9516, + "step": 8741 + }, + { + "epoch": 1.556445868945869, + "grad_norm": 0.6138805150985718, + "learning_rate": 0.00013431014118781505, + "loss": 0.8682, + "step": 8742 + }, + { + "epoch": 1.5566239316239316, + "grad_norm": 0.6796693801879883, + "learning_rate": 0.00013429699304899772, + "loss": 1.1132, + 
"step": 8743 + }, + { + "epoch": 1.5568019943019942, + "grad_norm": 0.6626394987106323, + "learning_rate": 0.000134283844238202, + "loss": 0.9273, + "step": 8744 + }, + { + "epoch": 1.556980056980057, + "grad_norm": 0.7088519334793091, + "learning_rate": 0.00013427069475568563, + "loss": 0.8915, + "step": 8745 + }, + { + "epoch": 1.5571581196581197, + "grad_norm": 0.6244857311248779, + "learning_rate": 0.0001342575446017061, + "loss": 0.9466, + "step": 8746 + }, + { + "epoch": 1.5573361823361824, + "grad_norm": 0.6969038248062134, + "learning_rate": 0.00013424439377652123, + "loss": 1.2307, + "step": 8747 + }, + { + "epoch": 1.5575142450142452, + "grad_norm": 0.6636740565299988, + "learning_rate": 0.0001342312422803886, + "loss": 0.9456, + "step": 8748 + }, + { + "epoch": 1.5576923076923077, + "grad_norm": 0.7863389253616333, + "learning_rate": 0.00013421809011356586, + "loss": 1.1888, + "step": 8749 + }, + { + "epoch": 1.5578703703703702, + "grad_norm": 0.7504058480262756, + "learning_rate": 0.00013420493727631073, + "loss": 1.2602, + "step": 8750 + }, + { + "epoch": 1.558048433048433, + "grad_norm": 0.7173139452934265, + "learning_rate": 0.00013419178376888085, + "loss": 1.0726, + "step": 8751 + }, + { + "epoch": 1.5582264957264957, + "grad_norm": 0.6517474055290222, + "learning_rate": 0.00013417862959153406, + "loss": 1.1299, + "step": 8752 + }, + { + "epoch": 1.5584045584045585, + "grad_norm": 0.8911739587783813, + "learning_rate": 0.00013416547474452803, + "loss": 1.105, + "step": 8753 + }, + { + "epoch": 1.5585826210826212, + "grad_norm": 0.7116649150848389, + "learning_rate": 0.00013415231922812049, + "loss": 0.8037, + "step": 8754 + }, + { + "epoch": 1.5587606837606838, + "grad_norm": 0.6935904026031494, + "learning_rate": 0.00013413916304256916, + "loss": 1.2778, + "step": 8755 + }, + { + "epoch": 1.5589387464387463, + "grad_norm": 0.652763843536377, + "learning_rate": 0.00013412600618813186, + "loss": 0.9188, + "step": 8756 + }, + { + "epoch": 
1.559116809116809, + "grad_norm": 0.6545276641845703, + "learning_rate": 0.00013411284866506637, + "loss": 1.0116, + "step": 8757 + }, + { + "epoch": 1.5592948717948718, + "grad_norm": 0.632165253162384, + "learning_rate": 0.0001340996904736305, + "loss": 0.8538, + "step": 8758 + }, + { + "epoch": 1.5594729344729346, + "grad_norm": 0.6719664931297302, + "learning_rate": 0.000134086531614082, + "loss": 1.1877, + "step": 8759 + }, + { + "epoch": 1.5596509971509973, + "grad_norm": 0.6691158413887024, + "learning_rate": 0.00013407337208667873, + "loss": 1.0411, + "step": 8760 + }, + { + "epoch": 1.5598290598290598, + "grad_norm": 0.7711479067802429, + "learning_rate": 0.0001340602118916785, + "loss": 0.9995, + "step": 8761 + }, + { + "epoch": 1.5600071225071224, + "grad_norm": 0.7229881286621094, + "learning_rate": 0.0001340470510293392, + "loss": 1.1751, + "step": 8762 + }, + { + "epoch": 1.5601851851851851, + "grad_norm": 0.7183271646499634, + "learning_rate": 0.00013403388949991864, + "loss": 0.9371, + "step": 8763 + }, + { + "epoch": 1.5603632478632479, + "grad_norm": 0.8142383098602295, + "learning_rate": 0.00013402072730367475, + "loss": 1.0199, + "step": 8764 + }, + { + "epoch": 1.5605413105413106, + "grad_norm": 0.6349362134933472, + "learning_rate": 0.00013400756444086534, + "loss": 0.8453, + "step": 8765 + }, + { + "epoch": 1.5607193732193734, + "grad_norm": 0.651900589466095, + "learning_rate": 0.00013399440091174834, + "loss": 0.8952, + "step": 8766 + }, + { + "epoch": 1.560897435897436, + "grad_norm": 0.6873346567153931, + "learning_rate": 0.00013398123671658172, + "loss": 0.9438, + "step": 8767 + }, + { + "epoch": 1.5610754985754984, + "grad_norm": 0.7404754757881165, + "learning_rate": 0.00013396807185562333, + "loss": 1.123, + "step": 8768 + }, + { + "epoch": 1.5612535612535612, + "grad_norm": 0.7449641227722168, + "learning_rate": 0.00013395490632913111, + "loss": 0.9407, + "step": 8769 + }, + { + "epoch": 1.561431623931624, + "grad_norm": 
0.7393384575843811, + "learning_rate": 0.0001339417401373631, + "loss": 1.0209, + "step": 8770 + }, + { + "epoch": 1.5616096866096867, + "grad_norm": 0.6787426471710205, + "learning_rate": 0.00013392857328057713, + "loss": 0.9768, + "step": 8771 + }, + { + "epoch": 1.5617877492877494, + "grad_norm": 0.6295693516731262, + "learning_rate": 0.00013391540575903127, + "loss": 0.9011, + "step": 8772 + }, + { + "epoch": 1.561965811965812, + "grad_norm": 0.7114503979682922, + "learning_rate": 0.00013390223757298354, + "loss": 1.0696, + "step": 8773 + }, + { + "epoch": 1.5621438746438745, + "grad_norm": 0.7540110349655151, + "learning_rate": 0.00013388906872269184, + "loss": 1.0071, + "step": 8774 + }, + { + "epoch": 1.5623219373219372, + "grad_norm": 0.6472305059432983, + "learning_rate": 0.00013387589920841423, + "loss": 1.105, + "step": 8775 + }, + { + "epoch": 1.5625, + "grad_norm": 0.6936793327331543, + "learning_rate": 0.00013386272903040874, + "loss": 0.885, + "step": 8776 + }, + { + "epoch": 1.5626780626780628, + "grad_norm": 0.7487989068031311, + "learning_rate": 0.00013384955818893343, + "loss": 0.7842, + "step": 8777 + }, + { + "epoch": 1.5628561253561255, + "grad_norm": 0.6109505891799927, + "learning_rate": 0.00013383638668424633, + "loss": 0.9461, + "step": 8778 + }, + { + "epoch": 1.563034188034188, + "grad_norm": 0.6650055646896362, + "learning_rate": 0.00013382321451660558, + "loss": 1.0463, + "step": 8779 + }, + { + "epoch": 1.5632122507122506, + "grad_norm": 0.7147329449653625, + "learning_rate": 0.00013381004168626915, + "loss": 0.946, + "step": 8780 + }, + { + "epoch": 1.5633903133903133, + "grad_norm": 0.6919382810592651, + "learning_rate": 0.00013379686819349522, + "loss": 0.8946, + "step": 8781 + }, + { + "epoch": 1.563568376068376, + "grad_norm": 0.7339401245117188, + "learning_rate": 0.00013378369403854184, + "loss": 0.9625, + "step": 8782 + }, + { + "epoch": 1.5637464387464388, + "grad_norm": 0.6337129473686218, + "learning_rate": 
0.00013377051922166717, + "loss": 1.0854, + "step": 8783 + }, + { + "epoch": 1.5639245014245016, + "grad_norm": 0.7301266193389893, + "learning_rate": 0.0001337573437431293, + "loss": 1.017, + "step": 8784 + }, + { + "epoch": 1.564102564102564, + "grad_norm": 0.689540684223175, + "learning_rate": 0.00013374416760318644, + "loss": 0.8734, + "step": 8785 + }, + { + "epoch": 1.5642806267806266, + "grad_norm": 0.7121307849884033, + "learning_rate": 0.0001337309908020967, + "loss": 1.0827, + "step": 8786 + }, + { + "epoch": 1.5644586894586894, + "grad_norm": 0.6715386509895325, + "learning_rate": 0.00013371781334011826, + "loss": 0.946, + "step": 8787 + }, + { + "epoch": 1.5646367521367521, + "grad_norm": 0.6895501613616943, + "learning_rate": 0.00013370463521750932, + "loss": 1.1113, + "step": 8788 + }, + { + "epoch": 1.5648148148148149, + "grad_norm": 0.6592531204223633, + "learning_rate": 0.00013369145643452805, + "loss": 0.9952, + "step": 8789 + }, + { + "epoch": 1.5649928774928776, + "grad_norm": 0.7495190501213074, + "learning_rate": 0.0001336782769914327, + "loss": 1.0936, + "step": 8790 + }, + { + "epoch": 1.5651709401709402, + "grad_norm": 0.7273977398872375, + "learning_rate": 0.00013366509688848147, + "loss": 1.1749, + "step": 8791 + }, + { + "epoch": 1.5653490028490027, + "grad_norm": 0.6447354555130005, + "learning_rate": 0.0001336519161259326, + "loss": 0.8638, + "step": 8792 + }, + { + "epoch": 1.5655270655270654, + "grad_norm": 0.6572020053863525, + "learning_rate": 0.00013363873470404432, + "loss": 0.8005, + "step": 8793 + }, + { + "epoch": 1.5657051282051282, + "grad_norm": 0.676418662071228, + "learning_rate": 0.00013362555262307491, + "loss": 0.7651, + "step": 8794 + }, + { + "epoch": 1.565883190883191, + "grad_norm": 0.6886745095252991, + "learning_rate": 0.0001336123698832827, + "loss": 1.0765, + "step": 8795 + }, + { + "epoch": 1.5660612535612537, + "grad_norm": 0.8134182095527649, + "learning_rate": 0.00013359918648492584, + "loss": 1.2228, + 
"step": 8796 + }, + { + "epoch": 1.5662393162393162, + "grad_norm": 0.7210384011268616, + "learning_rate": 0.00013358600242826277, + "loss": 0.8247, + "step": 8797 + }, + { + "epoch": 1.5664173789173788, + "grad_norm": 0.7086136341094971, + "learning_rate": 0.00013357281771355175, + "loss": 1.0323, + "step": 8798 + }, + { + "epoch": 1.5665954415954415, + "grad_norm": 0.7419785857200623, + "learning_rate": 0.0001335596323410511, + "loss": 1.213, + "step": 8799 + }, + { + "epoch": 1.5667735042735043, + "grad_norm": 0.6390291452407837, + "learning_rate": 0.0001335464463110192, + "loss": 1.0403, + "step": 8800 + }, + { + "epoch": 1.566951566951567, + "grad_norm": 0.6111941337585449, + "learning_rate": 0.00013353325962371434, + "loss": 0.9747, + "step": 8801 + }, + { + "epoch": 1.5671296296296298, + "grad_norm": 0.6792671084403992, + "learning_rate": 0.00013352007227939488, + "loss": 1.1179, + "step": 8802 + }, + { + "epoch": 1.5673076923076923, + "grad_norm": 0.6656535863876343, + "learning_rate": 0.0001335068842783193, + "loss": 0.9214, + "step": 8803 + }, + { + "epoch": 1.5674857549857548, + "grad_norm": 0.6910907626152039, + "learning_rate": 0.0001334936956207459, + "loss": 1.0609, + "step": 8804 + }, + { + "epoch": 1.5676638176638176, + "grad_norm": 0.65049147605896, + "learning_rate": 0.00013348050630693315, + "loss": 0.7189, + "step": 8805 + }, + { + "epoch": 1.5678418803418803, + "grad_norm": 0.6258065104484558, + "learning_rate": 0.0001334673163371394, + "loss": 1.0683, + "step": 8806 + }, + { + "epoch": 1.568019943019943, + "grad_norm": 0.7518934607505798, + "learning_rate": 0.00013345412571162305, + "loss": 1.2415, + "step": 8807 + }, + { + "epoch": 1.5681980056980058, + "grad_norm": 0.7395275235176086, + "learning_rate": 0.00013344093443064267, + "loss": 0.9153, + "step": 8808 + }, + { + "epoch": 1.5683760683760684, + "grad_norm": 0.6789839267730713, + "learning_rate": 0.00013342774249445663, + "loss": 0.8051, + "step": 8809 + }, + { + "epoch": 
1.568554131054131, + "grad_norm": 0.786247193813324, + "learning_rate": 0.00013341454990332342, + "loss": 1.203, + "step": 8810 + }, + { + "epoch": 1.5687321937321936, + "grad_norm": 0.6858161687850952, + "learning_rate": 0.00013340135665750153, + "loss": 0.9494, + "step": 8811 + }, + { + "epoch": 1.5689102564102564, + "grad_norm": 0.7245797514915466, + "learning_rate": 0.0001333881627572494, + "loss": 1.0544, + "step": 8812 + }, + { + "epoch": 1.5690883190883191, + "grad_norm": 0.6176164150238037, + "learning_rate": 0.00013337496820282563, + "loss": 0.9084, + "step": 8813 + }, + { + "epoch": 1.569266381766382, + "grad_norm": 0.7342953681945801, + "learning_rate": 0.00013336177299448868, + "loss": 1.0006, + "step": 8814 + }, + { + "epoch": 1.5694444444444444, + "grad_norm": 0.5183523297309875, + "learning_rate": 0.00013334857713249708, + "loss": 0.6295, + "step": 8815 + }, + { + "epoch": 1.5696225071225072, + "grad_norm": 0.6664513349533081, + "learning_rate": 0.00013333538061710936, + "loss": 0.7569, + "step": 8816 + }, + { + "epoch": 1.5698005698005697, + "grad_norm": 0.7051160931587219, + "learning_rate": 0.0001333221834485841, + "loss": 0.9917, + "step": 8817 + }, + { + "epoch": 1.5699786324786325, + "grad_norm": 0.7888057231903076, + "learning_rate": 0.0001333089856271799, + "loss": 1.0337, + "step": 8818 + }, + { + "epoch": 1.5701566951566952, + "grad_norm": 0.6796144247055054, + "learning_rate": 0.00013329578715315534, + "loss": 1.0915, + "step": 8819 + }, + { + "epoch": 1.570334757834758, + "grad_norm": 0.7442883849143982, + "learning_rate": 0.000133282588026769, + "loss": 1.1695, + "step": 8820 + }, + { + "epoch": 1.5705128205128205, + "grad_norm": 0.6164735555648804, + "learning_rate": 0.00013326938824827946, + "loss": 1.0143, + "step": 8821 + }, + { + "epoch": 1.5706908831908832, + "grad_norm": 0.6526502966880798, + "learning_rate": 0.00013325618781794539, + "loss": 0.8402, + "step": 8822 + }, + { + "epoch": 1.5708689458689458, + "grad_norm": 
0.6376087069511414, + "learning_rate": 0.00013324298673602535, + "loss": 0.7582, + "step": 8823 + }, + { + "epoch": 1.5710470085470085, + "grad_norm": 0.6888708472251892, + "learning_rate": 0.00013322978500277807, + "loss": 0.997, + "step": 8824 + }, + { + "epoch": 1.5712250712250713, + "grad_norm": 0.553656280040741, + "learning_rate": 0.0001332165826184622, + "loss": 0.6917, + "step": 8825 + }, + { + "epoch": 1.571403133903134, + "grad_norm": 0.643285870552063, + "learning_rate": 0.0001332033795833364, + "loss": 0.8689, + "step": 8826 + }, + { + "epoch": 1.5715811965811965, + "grad_norm": 0.6210280060768127, + "learning_rate": 0.00013319017589765933, + "loss": 0.9047, + "step": 8827 + }, + { + "epoch": 1.5717592592592593, + "grad_norm": 0.7612366676330566, + "learning_rate": 0.0001331769715616897, + "loss": 0.9818, + "step": 8828 + }, + { + "epoch": 1.5719373219373218, + "grad_norm": 0.5970702171325684, + "learning_rate": 0.00013316376657568628, + "loss": 0.82, + "step": 8829 + }, + { + "epoch": 1.5721153846153846, + "grad_norm": 0.7182583808898926, + "learning_rate": 0.0001331505609399077, + "loss": 1.0633, + "step": 8830 + }, + { + "epoch": 1.5722934472934473, + "grad_norm": 0.7230739593505859, + "learning_rate": 0.00013313735465461278, + "loss": 0.977, + "step": 8831 + }, + { + "epoch": 1.57247150997151, + "grad_norm": 0.6752985119819641, + "learning_rate": 0.00013312414772006018, + "loss": 0.9666, + "step": 8832 + }, + { + "epoch": 1.5726495726495726, + "grad_norm": 0.7724275588989258, + "learning_rate": 0.00013311094013650877, + "loss": 1.148, + "step": 8833 + }, + { + "epoch": 1.5728276353276354, + "grad_norm": 0.7216386198997498, + "learning_rate": 0.00013309773190421724, + "loss": 0.9935, + "step": 8834 + }, + { + "epoch": 1.573005698005698, + "grad_norm": 0.6422320008277893, + "learning_rate": 0.0001330845230234444, + "loss": 0.9383, + "step": 8835 + }, + { + "epoch": 1.5731837606837606, + "grad_norm": 0.669538140296936, + "learning_rate": 
0.00013307131349444906, + "loss": 1.0866, + "step": 8836 + }, + { + "epoch": 1.5733618233618234, + "grad_norm": 0.6994584798812866, + "learning_rate": 0.00013305810331749003, + "loss": 0.7882, + "step": 8837 + }, + { + "epoch": 1.5735398860398861, + "grad_norm": 0.8094269633293152, + "learning_rate": 0.00013304489249282617, + "loss": 1.2316, + "step": 8838 + }, + { + "epoch": 1.5737179487179487, + "grad_norm": 0.7180120348930359, + "learning_rate": 0.00013303168102071625, + "loss": 0.9795, + "step": 8839 + }, + { + "epoch": 1.5738960113960114, + "grad_norm": 0.6191438436508179, + "learning_rate": 0.00013301846890141918, + "loss": 0.8957, + "step": 8840 + }, + { + "epoch": 1.574074074074074, + "grad_norm": 0.671094536781311, + "learning_rate": 0.00013300525613519382, + "loss": 1.059, + "step": 8841 + }, + { + "epoch": 1.5742521367521367, + "grad_norm": 0.8062624931335449, + "learning_rate": 0.000132992042722299, + "loss": 0.9782, + "step": 8842 + }, + { + "epoch": 1.5744301994301995, + "grad_norm": 0.6674807667732239, + "learning_rate": 0.00013297882866299362, + "loss": 0.7765, + "step": 8843 + }, + { + "epoch": 1.5746082621082622, + "grad_norm": 0.6369131803512573, + "learning_rate": 0.00013296561395753664, + "loss": 0.97, + "step": 8844 + }, + { + "epoch": 1.5747863247863247, + "grad_norm": 0.7913636565208435, + "learning_rate": 0.00013295239860618691, + "loss": 1.0458, + "step": 8845 + }, + { + "epoch": 1.5749643874643875, + "grad_norm": 0.6722261905670166, + "learning_rate": 0.0001329391826092034, + "loss": 1.1118, + "step": 8846 + }, + { + "epoch": 1.57514245014245, + "grad_norm": 0.6936299800872803, + "learning_rate": 0.00013292596596684502, + "loss": 1.009, + "step": 8847 + }, + { + "epoch": 1.5753205128205128, + "grad_norm": 0.7009961009025574, + "learning_rate": 0.00013291274867937073, + "loss": 0.9904, + "step": 8848 + }, + { + "epoch": 1.5754985754985755, + "grad_norm": 0.6900732517242432, + "learning_rate": 0.0001328995307470395, + "loss": 1.0488, + 
"step": 8849 + }, + { + "epoch": 1.5756766381766383, + "grad_norm": 0.6389018297195435, + "learning_rate": 0.00013288631217011032, + "loss": 0.9444, + "step": 8850 + }, + { + "epoch": 1.5758547008547008, + "grad_norm": 0.6370900869369507, + "learning_rate": 0.00013287309294884216, + "loss": 0.7465, + "step": 8851 + }, + { + "epoch": 1.5760327635327636, + "grad_norm": 0.6463848948478699, + "learning_rate": 0.00013285987308349405, + "loss": 0.896, + "step": 8852 + }, + { + "epoch": 1.576210826210826, + "grad_norm": 0.6022449731826782, + "learning_rate": 0.00013284665257432495, + "loss": 0.8822, + "step": 8853 + }, + { + "epoch": 1.5763888888888888, + "grad_norm": 0.768189013004303, + "learning_rate": 0.00013283343142159396, + "loss": 0.9862, + "step": 8854 + }, + { + "epoch": 1.5765669515669516, + "grad_norm": 0.6642358303070068, + "learning_rate": 0.00013282020962556007, + "loss": 1.0713, + "step": 8855 + }, + { + "epoch": 1.5767450142450143, + "grad_norm": 0.6883034706115723, + "learning_rate": 0.00013280698718648234, + "loss": 1.0351, + "step": 8856 + }, + { + "epoch": 1.5769230769230769, + "grad_norm": 0.602808952331543, + "learning_rate": 0.00013279376410461988, + "loss": 0.7615, + "step": 8857 + }, + { + "epoch": 1.5771011396011396, + "grad_norm": 0.5968614220619202, + "learning_rate": 0.0001327805403802317, + "loss": 0.9443, + "step": 8858 + }, + { + "epoch": 1.5772792022792022, + "grad_norm": 0.7314837574958801, + "learning_rate": 0.00013276731601357696, + "loss": 0.8784, + "step": 8859 + }, + { + "epoch": 1.577457264957265, + "grad_norm": 0.619754433631897, + "learning_rate": 0.0001327540910049147, + "loss": 0.954, + "step": 8860 + }, + { + "epoch": 1.5776353276353277, + "grad_norm": 0.7195139527320862, + "learning_rate": 0.0001327408653545041, + "loss": 1.0227, + "step": 8861 + }, + { + "epoch": 1.5778133903133904, + "grad_norm": 0.6796214580535889, + "learning_rate": 0.0001327276390626042, + "loss": 1.0593, + "step": 8862 + }, + { + "epoch": 
1.577991452991453, + "grad_norm": 0.6576255559921265, + "learning_rate": 0.00013271441212947427, + "loss": 0.7921, + "step": 8863 + }, + { + "epoch": 1.5781695156695157, + "grad_norm": 0.7222092151641846, + "learning_rate": 0.00013270118455537336, + "loss": 1.0545, + "step": 8864 + }, + { + "epoch": 1.5783475783475782, + "grad_norm": 0.7159737348556519, + "learning_rate": 0.00013268795634056066, + "loss": 0.9664, + "step": 8865 + }, + { + "epoch": 1.578525641025641, + "grad_norm": 0.7120481133460999, + "learning_rate": 0.00013267472748529536, + "loss": 1.0148, + "step": 8866 + }, + { + "epoch": 1.5787037037037037, + "grad_norm": 0.7353253364562988, + "learning_rate": 0.00013266149798983666, + "loss": 0.9288, + "step": 8867 + }, + { + "epoch": 1.5788817663817665, + "grad_norm": 0.6652441620826721, + "learning_rate": 0.00013264826785444375, + "loss": 0.8246, + "step": 8868 + }, + { + "epoch": 1.5790598290598292, + "grad_norm": 0.7254189252853394, + "learning_rate": 0.00013263503707937584, + "loss": 0.9892, + "step": 8869 + }, + { + "epoch": 1.5792378917378918, + "grad_norm": 0.6305747032165527, + "learning_rate": 0.00013262180566489223, + "loss": 0.8931, + "step": 8870 + }, + { + "epoch": 1.5794159544159543, + "grad_norm": 0.6560617089271545, + "learning_rate": 0.00013260857361125205, + "loss": 0.9245, + "step": 8871 + }, + { + "epoch": 1.579594017094017, + "grad_norm": 0.7304151654243469, + "learning_rate": 0.00013259534091871462, + "loss": 1.009, + "step": 8872 + }, + { + "epoch": 1.5797720797720798, + "grad_norm": 0.782636821269989, + "learning_rate": 0.00013258210758753918, + "loss": 1.1123, + "step": 8873 + }, + { + "epoch": 1.5799501424501425, + "grad_norm": 0.6992011070251465, + "learning_rate": 0.00013256887361798504, + "loss": 1.099, + "step": 8874 + }, + { + "epoch": 1.5801282051282053, + "grad_norm": 0.7159731984138489, + "learning_rate": 0.00013255563901031148, + "loss": 1.0257, + "step": 8875 + }, + { + "epoch": 1.5803062678062678, + "grad_norm": 
0.6055454611778259, + "learning_rate": 0.0001325424037647778, + "loss": 0.9199, + "step": 8876 + }, + { + "epoch": 1.5804843304843303, + "grad_norm": 0.6838310360908508, + "learning_rate": 0.00013252916788164334, + "loss": 0.8644, + "step": 8877 + }, + { + "epoch": 1.580662393162393, + "grad_norm": 0.7067445516586304, + "learning_rate": 0.00013251593136116738, + "loss": 1.0285, + "step": 8878 + }, + { + "epoch": 1.5808404558404558, + "grad_norm": 0.7021774649620056, + "learning_rate": 0.00013250269420360928, + "loss": 1.1263, + "step": 8879 + }, + { + "epoch": 1.5810185185185186, + "grad_norm": 0.6586757302284241, + "learning_rate": 0.00013248945640922843, + "loss": 0.906, + "step": 8880 + }, + { + "epoch": 1.5811965811965814, + "grad_norm": 0.6673910021781921, + "learning_rate": 0.00013247621797828418, + "loss": 1.0652, + "step": 8881 + }, + { + "epoch": 1.5813746438746439, + "grad_norm": 0.6763964295387268, + "learning_rate": 0.00013246297891103588, + "loss": 1.0227, + "step": 8882 + }, + { + "epoch": 1.5815527065527064, + "grad_norm": 0.6536892056465149, + "learning_rate": 0.00013244973920774298, + "loss": 0.9026, + "step": 8883 + }, + { + "epoch": 1.5817307692307692, + "grad_norm": 0.8010411858558655, + "learning_rate": 0.0001324364988686648, + "loss": 1.1167, + "step": 8884 + }, + { + "epoch": 1.581908831908832, + "grad_norm": 0.8159251809120178, + "learning_rate": 0.00013242325789406082, + "loss": 1.233, + "step": 8885 + }, + { + "epoch": 1.5820868945868947, + "grad_norm": 0.6487745046615601, + "learning_rate": 0.00013241001628419048, + "loss": 0.9888, + "step": 8886 + }, + { + "epoch": 1.5822649572649574, + "grad_norm": 0.6750285029411316, + "learning_rate": 0.00013239677403931318, + "loss": 0.8874, + "step": 8887 + }, + { + "epoch": 1.58244301994302, + "grad_norm": 0.7164602875709534, + "learning_rate": 0.0001323835311596884, + "loss": 1.2029, + "step": 8888 + }, + { + "epoch": 1.5826210826210825, + "grad_norm": 0.6081351041793823, + "learning_rate": 
0.00013237028764557558, + "loss": 0.9593, + "step": 8889 + }, + { + "epoch": 1.5827991452991452, + "grad_norm": 0.7235409021377563, + "learning_rate": 0.00013235704349723424, + "loss": 1.5324, + "step": 8890 + }, + { + "epoch": 1.582977207977208, + "grad_norm": 0.6658480763435364, + "learning_rate": 0.0001323437987149238, + "loss": 0.9756, + "step": 8891 + }, + { + "epoch": 1.5831552706552707, + "grad_norm": 0.7924265265464783, + "learning_rate": 0.00013233055329890387, + "loss": 0.9329, + "step": 8892 + }, + { + "epoch": 1.5833333333333335, + "grad_norm": 0.6262093186378479, + "learning_rate": 0.0001323173072494339, + "loss": 0.8288, + "step": 8893 + }, + { + "epoch": 1.583511396011396, + "grad_norm": 0.6851989030838013, + "learning_rate": 0.0001323040605667734, + "loss": 0.9822, + "step": 8894 + }, + { + "epoch": 1.5836894586894585, + "grad_norm": 0.6963728666305542, + "learning_rate": 0.00013229081325118194, + "loss": 1.0416, + "step": 8895 + }, + { + "epoch": 1.5838675213675213, + "grad_norm": 0.6017457842826843, + "learning_rate": 0.0001322775653029191, + "loss": 0.8123, + "step": 8896 + }, + { + "epoch": 1.584045584045584, + "grad_norm": 0.7396472096443176, + "learning_rate": 0.0001322643167222444, + "loss": 1.0339, + "step": 8897 + }, + { + "epoch": 1.5842236467236468, + "grad_norm": 0.6360299587249756, + "learning_rate": 0.00013225106750941744, + "loss": 0.9463, + "step": 8898 + }, + { + "epoch": 1.5844017094017095, + "grad_norm": 0.6297624111175537, + "learning_rate": 0.00013223781766469783, + "loss": 0.9921, + "step": 8899 + }, + { + "epoch": 1.584579772079772, + "grad_norm": 0.7722037434577942, + "learning_rate": 0.0001322245671883451, + "loss": 0.8394, + "step": 8900 + }, + { + "epoch": 1.5847578347578346, + "grad_norm": 0.677364706993103, + "learning_rate": 0.00013221131608061895, + "loss": 1.0954, + "step": 8901 + }, + { + "epoch": 1.5849358974358974, + "grad_norm": 0.6954908967018127, + "learning_rate": 0.00013219806434177899, + "loss": 1.0637, + 
"step": 8902 + }, + { + "epoch": 1.58511396011396, + "grad_norm": 0.7079192996025085, + "learning_rate": 0.00013218481197208484, + "loss": 1.039, + "step": 8903 + }, + { + "epoch": 1.5852920227920229, + "grad_norm": 0.7070451378822327, + "learning_rate": 0.00013217155897179611, + "loss": 1.0025, + "step": 8904 + }, + { + "epoch": 1.5854700854700856, + "grad_norm": 0.6940776705741882, + "learning_rate": 0.00013215830534117257, + "loss": 0.8039, + "step": 8905 + }, + { + "epoch": 1.5856481481481481, + "grad_norm": 0.6545892953872681, + "learning_rate": 0.00013214505108047382, + "loss": 0.9347, + "step": 8906 + }, + { + "epoch": 1.5858262108262107, + "grad_norm": 0.6769635081291199, + "learning_rate": 0.00013213179618995957, + "loss": 1.0321, + "step": 8907 + }, + { + "epoch": 1.5860042735042734, + "grad_norm": 0.6505448222160339, + "learning_rate": 0.00013211854066988953, + "loss": 1.0558, + "step": 8908 + }, + { + "epoch": 1.5861823361823362, + "grad_norm": 0.6764090061187744, + "learning_rate": 0.00013210528452052336, + "loss": 0.8407, + "step": 8909 + }, + { + "epoch": 1.586360398860399, + "grad_norm": 0.6454851627349854, + "learning_rate": 0.00013209202774212088, + "loss": 0.7439, + "step": 8910 + }, + { + "epoch": 1.5865384615384617, + "grad_norm": 0.6911695599555969, + "learning_rate": 0.00013207877033494177, + "loss": 0.9625, + "step": 8911 + }, + { + "epoch": 1.5867165242165242, + "grad_norm": 0.7405226826667786, + "learning_rate": 0.0001320655122992458, + "loss": 1.054, + "step": 8912 + }, + { + "epoch": 1.5868945868945867, + "grad_norm": 0.7362869381904602, + "learning_rate": 0.00013205225363529274, + "loss": 1.0516, + "step": 8913 + }, + { + "epoch": 1.5870726495726495, + "grad_norm": 0.6923766136169434, + "learning_rate": 0.0001320389943433423, + "loss": 1.2323, + "step": 8914 + }, + { + "epoch": 1.5872507122507122, + "grad_norm": 0.7980395555496216, + "learning_rate": 0.00013202573442365435, + "loss": 1.0229, + "step": 8915 + }, + { + "epoch": 
1.587428774928775, + "grad_norm": 0.7211610078811646, + "learning_rate": 0.00013201247387648868, + "loss": 1.0666, + "step": 8916 + }, + { + "epoch": 1.5876068376068377, + "grad_norm": 0.6728795766830444, + "learning_rate": 0.00013199921270210506, + "loss": 1.0322, + "step": 8917 + }, + { + "epoch": 1.5877849002849003, + "grad_norm": 0.6226436495780945, + "learning_rate": 0.00013198595090076337, + "loss": 1.0517, + "step": 8918 + }, + { + "epoch": 1.5879629629629628, + "grad_norm": 0.6396511197090149, + "learning_rate": 0.0001319726884727234, + "loss": 0.8662, + "step": 8919 + }, + { + "epoch": 1.5881410256410255, + "grad_norm": 0.5664374828338623, + "learning_rate": 0.00013195942541824497, + "loss": 0.6601, + "step": 8920 + }, + { + "epoch": 1.5883190883190883, + "grad_norm": 0.6556946039199829, + "learning_rate": 0.00013194616173758806, + "loss": 0.9662, + "step": 8921 + }, + { + "epoch": 1.588497150997151, + "grad_norm": 0.7332060933113098, + "learning_rate": 0.00013193289743101245, + "loss": 0.7687, + "step": 8922 + }, + { + "epoch": 1.5886752136752138, + "grad_norm": 0.6103306412696838, + "learning_rate": 0.00013191963249877805, + "loss": 0.8329, + "step": 8923 + }, + { + "epoch": 1.5888532763532763, + "grad_norm": 0.63165283203125, + "learning_rate": 0.00013190636694114475, + "loss": 0.8336, + "step": 8924 + }, + { + "epoch": 1.589031339031339, + "grad_norm": 0.6955820322036743, + "learning_rate": 0.00013189310075837246, + "loss": 1.0457, + "step": 8925 + }, + { + "epoch": 1.5892094017094016, + "grad_norm": 0.6911605596542358, + "learning_rate": 0.00013187983395072114, + "loss": 0.9389, + "step": 8926 + }, + { + "epoch": 1.5893874643874644, + "grad_norm": 0.6493414640426636, + "learning_rate": 0.00013186656651845068, + "loss": 0.9821, + "step": 8927 + }, + { + "epoch": 1.5895655270655271, + "grad_norm": 0.6168226599693298, + "learning_rate": 0.00013185329846182107, + "loss": 1.0259, + "step": 8928 + }, + { + "epoch": 1.5897435897435899, + "grad_norm": 
0.6460188627243042, + "learning_rate": 0.0001318400297810922, + "loss": 0.9836, + "step": 8929 + }, + { + "epoch": 1.5899216524216524, + "grad_norm": 0.6630695462226868, + "learning_rate": 0.0001318267604765241, + "loss": 0.8936, + "step": 8930 + }, + { + "epoch": 1.5900997150997151, + "grad_norm": 0.6308651566505432, + "learning_rate": 0.00013181349054837676, + "loss": 0.9583, + "step": 8931 + }, + { + "epoch": 1.5902777777777777, + "grad_norm": 0.6508499979972839, + "learning_rate": 0.00013180021999691018, + "loss": 0.7647, + "step": 8932 + }, + { + "epoch": 1.5904558404558404, + "grad_norm": 0.6625795960426331, + "learning_rate": 0.00013178694882238432, + "loss": 1.0329, + "step": 8933 + }, + { + "epoch": 1.5906339031339032, + "grad_norm": 0.6721987128257751, + "learning_rate": 0.00013177367702505924, + "loss": 0.9377, + "step": 8934 + }, + { + "epoch": 1.590811965811966, + "grad_norm": 0.7295519709587097, + "learning_rate": 0.00013176040460519497, + "loss": 0.9396, + "step": 8935 + }, + { + "epoch": 1.5909900284900285, + "grad_norm": 0.6673944592475891, + "learning_rate": 0.0001317471315630515, + "loss": 1.0284, + "step": 8936 + }, + { + "epoch": 1.5911680911680912, + "grad_norm": 0.6858960390090942, + "learning_rate": 0.00013173385789888898, + "loss": 1.2022, + "step": 8937 + }, + { + "epoch": 1.5913461538461537, + "grad_norm": 0.5836796164512634, + "learning_rate": 0.00013172058361296743, + "loss": 1.0078, + "step": 8938 + }, + { + "epoch": 1.5915242165242165, + "grad_norm": 0.7732513546943665, + "learning_rate": 0.00013170730870554694, + "loss": 1.0912, + "step": 8939 + }, + { + "epoch": 1.5917022792022792, + "grad_norm": 0.7095892429351807, + "learning_rate": 0.0001316940331768876, + "loss": 1.0506, + "step": 8940 + }, + { + "epoch": 1.591880341880342, + "grad_norm": 0.757534384727478, + "learning_rate": 0.00013168075702724952, + "loss": 1.036, + "step": 8941 + }, + { + "epoch": 1.5920584045584045, + "grad_norm": 0.6719361543655396, + "learning_rate": 
0.00013166748025689282, + "loss": 0.9406, + "step": 8942 + }, + { + "epoch": 1.5922364672364673, + "grad_norm": 0.6955735087394714, + "learning_rate": 0.00013165420286607763, + "loss": 0.9325, + "step": 8943 + }, + { + "epoch": 1.5924145299145298, + "grad_norm": 0.6810322999954224, + "learning_rate": 0.00013164092485506407, + "loss": 1.0402, + "step": 8944 + }, + { + "epoch": 1.5925925925925926, + "grad_norm": 0.6346224546432495, + "learning_rate": 0.00013162764622411233, + "loss": 0.9725, + "step": 8945 + }, + { + "epoch": 1.5927706552706553, + "grad_norm": 0.728705883026123, + "learning_rate": 0.00013161436697348258, + "loss": 0.9665, + "step": 8946 + }, + { + "epoch": 1.592948717948718, + "grad_norm": 0.6838595271110535, + "learning_rate": 0.00013160108710343494, + "loss": 0.9771, + "step": 8947 + }, + { + "epoch": 1.5931267806267806, + "grad_norm": 0.7052602767944336, + "learning_rate": 0.00013158780661422966, + "loss": 0.8819, + "step": 8948 + }, + { + "epoch": 1.5933048433048433, + "grad_norm": 0.7237630486488342, + "learning_rate": 0.00013157452550612697, + "loss": 1.0609, + "step": 8949 + }, + { + "epoch": 1.5934829059829059, + "grad_norm": 0.6554936766624451, + "learning_rate": 0.00013156124377938699, + "loss": 0.8592, + "step": 8950 + }, + { + "epoch": 1.5936609686609686, + "grad_norm": 0.6125665307044983, + "learning_rate": 0.00013154796143427, + "loss": 0.8399, + "step": 8951 + }, + { + "epoch": 1.5938390313390314, + "grad_norm": 0.6930897235870361, + "learning_rate": 0.0001315346784710363, + "loss": 0.9965, + "step": 8952 + }, + { + "epoch": 1.5940170940170941, + "grad_norm": 0.7808064818382263, + "learning_rate": 0.00013152139488994605, + "loss": 1.0527, + "step": 8953 + }, + { + "epoch": 1.5941951566951567, + "grad_norm": 0.6125522255897522, + "learning_rate": 0.0001315081106912595, + "loss": 1.1159, + "step": 8954 + }, + { + "epoch": 1.5943732193732194, + "grad_norm": 0.5863428711891174, + "learning_rate": 0.00013149482587523703, + "loss": 0.84, + 
"step": 8955 + }, + { + "epoch": 1.594551282051282, + "grad_norm": 0.7170202732086182, + "learning_rate": 0.00013148154044213882, + "loss": 1.0821, + "step": 8956 + }, + { + "epoch": 1.5947293447293447, + "grad_norm": 0.6409463882446289, + "learning_rate": 0.00013146825439222528, + "loss": 1.0097, + "step": 8957 + }, + { + "epoch": 1.5949074074074074, + "grad_norm": 0.7037690281867981, + "learning_rate": 0.00013145496772575666, + "loss": 1.1511, + "step": 8958 + }, + { + "epoch": 1.5950854700854702, + "grad_norm": 0.6400953531265259, + "learning_rate": 0.00013144168044299326, + "loss": 1.0809, + "step": 8959 + }, + { + "epoch": 1.5952635327635327, + "grad_norm": 0.6129940152168274, + "learning_rate": 0.00013142839254419545, + "loss": 0.8481, + "step": 8960 + }, + { + "epoch": 1.5954415954415955, + "grad_norm": 0.7452271580696106, + "learning_rate": 0.00013141510402962358, + "loss": 1.0649, + "step": 8961 + }, + { + "epoch": 1.595619658119658, + "grad_norm": 0.7407623529434204, + "learning_rate": 0.000131401814899538, + "loss": 0.9084, + "step": 8962 + }, + { + "epoch": 1.5957977207977208, + "grad_norm": 0.7103050947189331, + "learning_rate": 0.0001313885251541991, + "loss": 0.946, + "step": 8963 + }, + { + "epoch": 1.5959757834757835, + "grad_norm": 0.5566636323928833, + "learning_rate": 0.00013137523479386727, + "loss": 0.6781, + "step": 8964 + }, + { + "epoch": 1.5961538461538463, + "grad_norm": 0.8137457966804504, + "learning_rate": 0.00013136194381880288, + "loss": 0.9273, + "step": 8965 + }, + { + "epoch": 1.5963319088319088, + "grad_norm": 0.779330849647522, + "learning_rate": 0.0001313486522292663, + "loss": 1.1105, + "step": 8966 + }, + { + "epoch": 1.5965099715099715, + "grad_norm": 0.6807126998901367, + "learning_rate": 0.00013133536002551808, + "loss": 1.0728, + "step": 8967 + }, + { + "epoch": 1.596688034188034, + "grad_norm": 0.7371507287025452, + "learning_rate": 0.00013132206720781853, + "loss": 0.979, + "step": 8968 + }, + { + "epoch": 
1.5968660968660968, + "grad_norm": 0.6811465620994568, + "learning_rate": 0.00013130877377642814, + "loss": 0.9821, + "step": 8969 + }, + { + "epoch": 1.5970441595441596, + "grad_norm": 0.6732743978500366, + "learning_rate": 0.00013129547973160738, + "loss": 0.8511, + "step": 8970 + }, + { + "epoch": 1.5972222222222223, + "grad_norm": 0.594901978969574, + "learning_rate": 0.0001312821850736167, + "loss": 0.9674, + "step": 8971 + }, + { + "epoch": 1.5974002849002849, + "grad_norm": 0.6743764281272888, + "learning_rate": 0.00013126888980271657, + "loss": 0.9268, + "step": 8972 + }, + { + "epoch": 1.5975783475783476, + "grad_norm": 0.7532161474227905, + "learning_rate": 0.00013125559391916752, + "loss": 1.0474, + "step": 8973 + }, + { + "epoch": 1.5977564102564101, + "grad_norm": 0.6331499814987183, + "learning_rate": 0.00013124229742323, + "loss": 1.05, + "step": 8974 + }, + { + "epoch": 1.5979344729344729, + "grad_norm": 0.7418690323829651, + "learning_rate": 0.0001312290003151646, + "loss": 0.9475, + "step": 8975 + }, + { + "epoch": 1.5981125356125356, + "grad_norm": 0.6511179804801941, + "learning_rate": 0.0001312157025952318, + "loss": 0.9206, + "step": 8976 + }, + { + "epoch": 1.5982905982905984, + "grad_norm": 0.6380775570869446, + "learning_rate": 0.00013120240426369215, + "loss": 0.9953, + "step": 8977 + }, + { + "epoch": 1.598468660968661, + "grad_norm": 0.8483675122261047, + "learning_rate": 0.00013118910532080623, + "loss": 0.9454, + "step": 8978 + }, + { + "epoch": 1.5986467236467237, + "grad_norm": 0.6700518727302551, + "learning_rate": 0.00013117580576683455, + "loss": 1.0413, + "step": 8979 + }, + { + "epoch": 1.5988247863247862, + "grad_norm": 0.7750083208084106, + "learning_rate": 0.00013116250560203774, + "loss": 1.1868, + "step": 8980 + }, + { + "epoch": 1.599002849002849, + "grad_norm": 0.7474972009658813, + "learning_rate": 0.00013114920482667635, + "loss": 1.0876, + "step": 8981 + }, + { + "epoch": 1.5991809116809117, + "grad_norm": 
0.6920070052146912, + "learning_rate": 0.000131135903441011, + "loss": 1.0787, + "step": 8982 + }, + { + "epoch": 1.5993589743589745, + "grad_norm": 0.7572436928749084, + "learning_rate": 0.00013112260144530232, + "loss": 0.9798, + "step": 8983 + }, + { + "epoch": 1.5995370370370372, + "grad_norm": 0.6983019709587097, + "learning_rate": 0.00013110929883981088, + "loss": 1.1115, + "step": 8984 + }, + { + "epoch": 1.5997150997150997, + "grad_norm": 0.6352120041847229, + "learning_rate": 0.0001310959956247974, + "loss": 0.9962, + "step": 8985 + }, + { + "epoch": 1.5998931623931623, + "grad_norm": 0.596858561038971, + "learning_rate": 0.00013108269180052244, + "loss": 0.8686, + "step": 8986 + }, + { + "epoch": 1.600071225071225, + "grad_norm": 0.6237605214118958, + "learning_rate": 0.00013106938736724672, + "loss": 0.9166, + "step": 8987 + }, + { + "epoch": 1.6002492877492878, + "grad_norm": 0.6818585395812988, + "learning_rate": 0.0001310560823252309, + "loss": 0.9993, + "step": 8988 + }, + { + "epoch": 1.6004273504273505, + "grad_norm": 0.6372287273406982, + "learning_rate": 0.00013104277667473564, + "loss": 0.8589, + "step": 8989 + }, + { + "epoch": 1.6006054131054133, + "grad_norm": 0.6057302355766296, + "learning_rate": 0.0001310294704160217, + "loss": 0.9325, + "step": 8990 + }, + { + "epoch": 1.6007834757834758, + "grad_norm": 0.6999384164810181, + "learning_rate": 0.0001310161635493497, + "loss": 0.8691, + "step": 8991 + }, + { + "epoch": 1.6009615384615383, + "grad_norm": 0.6182113289833069, + "learning_rate": 0.00013100285607498045, + "loss": 1.0271, + "step": 8992 + }, + { + "epoch": 1.601139601139601, + "grad_norm": 0.6681149005889893, + "learning_rate": 0.0001309895479931746, + "loss": 0.989, + "step": 8993 + }, + { + "epoch": 1.6013176638176638, + "grad_norm": 0.6187826991081238, + "learning_rate": 0.00013097623930419293, + "loss": 0.8051, + "step": 8994 + }, + { + "epoch": 1.6014957264957266, + "grad_norm": 0.698793888092041, + "learning_rate": 
0.00013096293000829621, + "loss": 1.0762, + "step": 8995 + }, + { + "epoch": 1.6016737891737893, + "grad_norm": 0.693149745464325, + "learning_rate": 0.0001309496201057452, + "loss": 1.0894, + "step": 8996 + }, + { + "epoch": 1.6018518518518519, + "grad_norm": 0.6664052605628967, + "learning_rate": 0.00013093630959680068, + "loss": 0.9835, + "step": 8997 + }, + { + "epoch": 1.6020299145299144, + "grad_norm": 0.6919469833374023, + "learning_rate": 0.0001309229984817234, + "loss": 0.9062, + "step": 8998 + }, + { + "epoch": 1.6022079772079771, + "grad_norm": 0.704781174659729, + "learning_rate": 0.00013090968676077427, + "loss": 0.8582, + "step": 8999 + }, + { + "epoch": 1.60238603988604, + "grad_norm": 0.8055264949798584, + "learning_rate": 0.000130896374434214, + "loss": 0.9813, + "step": 9000 + }, + { + "epoch": 1.6025641025641026, + "grad_norm": 0.6301952004432678, + "learning_rate": 0.00013088306150230348, + "loss": 0.7056, + "step": 9001 + }, + { + "epoch": 1.6027421652421654, + "grad_norm": 0.698544442653656, + "learning_rate": 0.00013086974796530347, + "loss": 0.9806, + "step": 9002 + }, + { + "epoch": 1.602920227920228, + "grad_norm": 0.669548511505127, + "learning_rate": 0.00013085643382347491, + "loss": 1.0317, + "step": 9003 + }, + { + "epoch": 1.6030982905982905, + "grad_norm": 0.6404716372489929, + "learning_rate": 0.00013084311907707864, + "loss": 0.8885, + "step": 9004 + }, + { + "epoch": 1.6032763532763532, + "grad_norm": 0.6968616843223572, + "learning_rate": 0.0001308298037263755, + "loss": 1.0665, + "step": 9005 + }, + { + "epoch": 1.603454415954416, + "grad_norm": 0.849311113357544, + "learning_rate": 0.00013081648777162644, + "loss": 1.1404, + "step": 9006 + }, + { + "epoch": 1.6036324786324787, + "grad_norm": 0.6603094935417175, + "learning_rate": 0.00013080317121309223, + "loss": 0.8341, + "step": 9007 + }, + { + "epoch": 1.6038105413105415, + "grad_norm": 0.6777810454368591, + "learning_rate": 0.00013078985405103394, + "loss": 1.044, + "step": 
9008 + }, + { + "epoch": 1.603988603988604, + "grad_norm": 0.6783546209335327, + "learning_rate": 0.0001307765362857124, + "loss": 1.042, + "step": 9009 + }, + { + "epoch": 1.6041666666666665, + "grad_norm": 0.7251788377761841, + "learning_rate": 0.00013076321791738858, + "loss": 0.9004, + "step": 9010 + }, + { + "epoch": 1.6043447293447293, + "grad_norm": 0.7885342240333557, + "learning_rate": 0.00013074989894632338, + "loss": 1.1966, + "step": 9011 + }, + { + "epoch": 1.604522792022792, + "grad_norm": 0.7171013355255127, + "learning_rate": 0.0001307365793727778, + "loss": 1.2242, + "step": 9012 + }, + { + "epoch": 1.6047008547008548, + "grad_norm": 0.6027249693870544, + "learning_rate": 0.00013072325919701283, + "loss": 0.917, + "step": 9013 + }, + { + "epoch": 1.6048789173789175, + "grad_norm": 0.5957151055335999, + "learning_rate": 0.00013070993841928936, + "loss": 0.9154, + "step": 9014 + }, + { + "epoch": 1.60505698005698, + "grad_norm": 0.6190659403800964, + "learning_rate": 0.00013069661703986847, + "loss": 0.7071, + "step": 9015 + }, + { + "epoch": 1.6052350427350426, + "grad_norm": 0.6454868316650391, + "learning_rate": 0.00013068329505901117, + "loss": 0.8381, + "step": 9016 + }, + { + "epoch": 1.6054131054131053, + "grad_norm": 0.6255491375923157, + "learning_rate": 0.00013066997247697837, + "loss": 0.7515, + "step": 9017 + }, + { + "epoch": 1.605591168091168, + "grad_norm": 0.6214072108268738, + "learning_rate": 0.0001306566492940312, + "loss": 1.0101, + "step": 9018 + }, + { + "epoch": 1.6057692307692308, + "grad_norm": 0.7244150638580322, + "learning_rate": 0.0001306433255104307, + "loss": 1.2558, + "step": 9019 + }, + { + "epoch": 1.6059472934472936, + "grad_norm": 0.6162270903587341, + "learning_rate": 0.00013063000112643785, + "loss": 1.1009, + "step": 9020 + }, + { + "epoch": 1.6061253561253561, + "grad_norm": 0.7309414744377136, + "learning_rate": 0.0001306166761423138, + "loss": 1.1973, + "step": 9021 + }, + { + "epoch": 1.6063034188034186, + 
"grad_norm": 0.7150956392288208, + "learning_rate": 0.00013060335055831957, + "loss": 0.9136, + "step": 9022 + }, + { + "epoch": 1.6064814814814814, + "grad_norm": 0.8187742829322815, + "learning_rate": 0.00013059002437471623, + "loss": 1.0524, + "step": 9023 + }, + { + "epoch": 1.6066595441595442, + "grad_norm": 0.7928692698478699, + "learning_rate": 0.00013057669759176493, + "loss": 1.0249, + "step": 9024 + }, + { + "epoch": 1.606837606837607, + "grad_norm": 0.6929279565811157, + "learning_rate": 0.00013056337020972677, + "loss": 1.1804, + "step": 9025 + }, + { + "epoch": 1.6070156695156697, + "grad_norm": 0.6771654486656189, + "learning_rate": 0.00013055004222886285, + "loss": 1.0284, + "step": 9026 + }, + { + "epoch": 1.6071937321937322, + "grad_norm": 0.6689024567604065, + "learning_rate": 0.0001305367136494343, + "loss": 1.0431, + "step": 9027 + }, + { + "epoch": 1.6073717948717947, + "grad_norm": 0.71135413646698, + "learning_rate": 0.0001305233844717023, + "loss": 0.9692, + "step": 9028 + }, + { + "epoch": 1.6075498575498575, + "grad_norm": 0.5459749698638916, + "learning_rate": 0.00013051005469592796, + "loss": 0.5643, + "step": 9029 + }, + { + "epoch": 1.6077279202279202, + "grad_norm": 0.7225865125656128, + "learning_rate": 0.00013049672432237253, + "loss": 1.0954, + "step": 9030 + }, + { + "epoch": 1.607905982905983, + "grad_norm": 0.6878093481063843, + "learning_rate": 0.0001304833933512971, + "loss": 0.894, + "step": 9031 + }, + { + "epoch": 1.6080840455840457, + "grad_norm": 0.6967248320579529, + "learning_rate": 0.00013047006178296288, + "loss": 1.0356, + "step": 9032 + }, + { + "epoch": 1.6082621082621082, + "grad_norm": 0.6404993534088135, + "learning_rate": 0.00013045672961763114, + "loss": 0.8528, + "step": 9033 + }, + { + "epoch": 1.6084401709401708, + "grad_norm": 0.5919156074523926, + "learning_rate": 0.000130443396855563, + "loss": 0.7196, + "step": 9034 + }, + { + "epoch": 1.6086182336182335, + "grad_norm": 0.6792302131652832, + 
"learning_rate": 0.00013043006349701977, + "loss": 0.9519, + "step": 9035 + }, + { + "epoch": 1.6087962962962963, + "grad_norm": 0.6263542175292969, + "learning_rate": 0.00013041672954226268, + "loss": 1.0483, + "step": 9036 + }, + { + "epoch": 1.608974358974359, + "grad_norm": 0.5865579843521118, + "learning_rate": 0.00013040339499155294, + "loss": 0.8794, + "step": 9037 + }, + { + "epoch": 1.6091524216524218, + "grad_norm": 0.8383142948150635, + "learning_rate": 0.00013039005984515181, + "loss": 0.8929, + "step": 9038 + }, + { + "epoch": 1.6093304843304843, + "grad_norm": 0.6438691020011902, + "learning_rate": 0.00013037672410332063, + "loss": 0.9957, + "step": 9039 + }, + { + "epoch": 1.609508547008547, + "grad_norm": 0.74748694896698, + "learning_rate": 0.0001303633877663206, + "loss": 0.9809, + "step": 9040 + }, + { + "epoch": 1.6096866096866096, + "grad_norm": 0.6697205901145935, + "learning_rate": 0.00013035005083441312, + "loss": 0.9556, + "step": 9041 + }, + { + "epoch": 1.6098646723646723, + "grad_norm": 0.6577828526496887, + "learning_rate": 0.00013033671330785941, + "loss": 0.8956, + "step": 9042 + }, + { + "epoch": 1.610042735042735, + "grad_norm": 0.6423429846763611, + "learning_rate": 0.0001303233751869208, + "loss": 0.8467, + "step": 9043 + }, + { + "epoch": 1.6102207977207978, + "grad_norm": 0.6552175879478455, + "learning_rate": 0.00013031003647185867, + "loss": 0.8656, + "step": 9044 + }, + { + "epoch": 1.6103988603988604, + "grad_norm": 0.6755174398422241, + "learning_rate": 0.00013029669716293433, + "loss": 0.7836, + "step": 9045 + }, + { + "epoch": 1.6105769230769231, + "grad_norm": 0.6832906007766724, + "learning_rate": 0.00013028335726040914, + "loss": 1.1531, + "step": 9046 + }, + { + "epoch": 1.6107549857549857, + "grad_norm": 0.6498637795448303, + "learning_rate": 0.00013027001676454446, + "loss": 0.8637, + "step": 9047 + }, + { + "epoch": 1.6109330484330484, + "grad_norm": 0.6792944073677063, + "learning_rate": 0.0001302566756756017, + 
"loss": 1.0865, + "step": 9048 + }, + { + "epoch": 1.6111111111111112, + "grad_norm": 0.6801337003707886, + "learning_rate": 0.00013024333399384226, + "loss": 1.0738, + "step": 9049 + }, + { + "epoch": 1.611289173789174, + "grad_norm": 0.675216794013977, + "learning_rate": 0.0001302299917195275, + "loss": 1.1074, + "step": 9050 + }, + { + "epoch": 1.6114672364672364, + "grad_norm": 0.6418983340263367, + "learning_rate": 0.00013021664885291885, + "loss": 1.0025, + "step": 9051 + }, + { + "epoch": 1.6116452991452992, + "grad_norm": 0.7778789401054382, + "learning_rate": 0.0001302033053942777, + "loss": 1.0847, + "step": 9052 + }, + { + "epoch": 1.6118233618233617, + "grad_norm": 0.7672827243804932, + "learning_rate": 0.00013018996134386555, + "loss": 1.0565, + "step": 9053 + }, + { + "epoch": 1.6120014245014245, + "grad_norm": 0.6770617961883545, + "learning_rate": 0.00013017661670194382, + "loss": 0.9069, + "step": 9054 + }, + { + "epoch": 1.6121794871794872, + "grad_norm": 0.7161242961883545, + "learning_rate": 0.00013016327146877393, + "loss": 1.1301, + "step": 9055 + }, + { + "epoch": 1.61235754985755, + "grad_norm": 0.6923251152038574, + "learning_rate": 0.00013014992564461746, + "loss": 0.9546, + "step": 9056 + }, + { + "epoch": 1.6125356125356125, + "grad_norm": 0.622953474521637, + "learning_rate": 0.0001301365792297358, + "loss": 0.8152, + "step": 9057 + }, + { + "epoch": 1.6127136752136753, + "grad_norm": 0.7477008104324341, + "learning_rate": 0.00013012323222439046, + "loss": 0.8428, + "step": 9058 + }, + { + "epoch": 1.6128917378917378, + "grad_norm": 0.6612883806228638, + "learning_rate": 0.000130109884628843, + "loss": 1.0678, + "step": 9059 + }, + { + "epoch": 1.6130698005698005, + "grad_norm": 0.6406781077384949, + "learning_rate": 0.00013009653644335486, + "loss": 0.6792, + "step": 9060 + }, + { + "epoch": 1.6132478632478633, + "grad_norm": 0.6279141902923584, + "learning_rate": 0.00013008318766818763, + "loss": 0.9826, + "step": 9061 + }, + { + 
"epoch": 1.613425925925926, + "grad_norm": 0.6616412401199341, + "learning_rate": 0.00013006983830360285, + "loss": 1.0691, + "step": 9062 + }, + { + "epoch": 1.6136039886039886, + "grad_norm": 0.6520406603813171, + "learning_rate": 0.000130056488349862, + "loss": 0.9487, + "step": 9063 + }, + { + "epoch": 1.6137820512820513, + "grad_norm": 0.6378647089004517, + "learning_rate": 0.00013004313780722672, + "loss": 0.8557, + "step": 9064 + }, + { + "epoch": 1.6139601139601139, + "grad_norm": 0.6547569036483765, + "learning_rate": 0.00013002978667595857, + "loss": 0.879, + "step": 9065 + }, + { + "epoch": 1.6141381766381766, + "grad_norm": 0.7347842454910278, + "learning_rate": 0.00013001643495631914, + "loss": 1.0757, + "step": 9066 + }, + { + "epoch": 1.6143162393162394, + "grad_norm": 0.5988406538963318, + "learning_rate": 0.00013000308264857002, + "loss": 0.6754, + "step": 9067 + }, + { + "epoch": 1.614494301994302, + "grad_norm": 0.6949366331100464, + "learning_rate": 0.00012998972975297282, + "loss": 1.1236, + "step": 9068 + }, + { + "epoch": 1.6146723646723646, + "grad_norm": 0.7095484137535095, + "learning_rate": 0.00012997637626978913, + "loss": 1.0124, + "step": 9069 + }, + { + "epoch": 1.6148504273504274, + "grad_norm": 0.6634095311164856, + "learning_rate": 0.00012996302219928064, + "loss": 1.2018, + "step": 9070 + }, + { + "epoch": 1.61502849002849, + "grad_norm": 0.6894524693489075, + "learning_rate": 0.000129949667541709, + "loss": 0.9959, + "step": 9071 + }, + { + "epoch": 1.6152065527065527, + "grad_norm": 0.672334611415863, + "learning_rate": 0.00012993631229733582, + "loss": 1.0369, + "step": 9072 + }, + { + "epoch": 1.6153846153846154, + "grad_norm": 0.725759744644165, + "learning_rate": 0.00012992295646642278, + "loss": 1.0079, + "step": 9073 + }, + { + "epoch": 1.6155626780626782, + "grad_norm": 0.7941585779190063, + "learning_rate": 0.00012990960004923154, + "loss": 0.9468, + "step": 9074 + }, + { + "epoch": 1.6157407407407407, + "grad_norm": 
0.6556950807571411, + "learning_rate": 0.00012989624304602385, + "loss": 0.9915, + "step": 9075 + }, + { + "epoch": 1.6159188034188035, + "grad_norm": 0.7515892386436462, + "learning_rate": 0.0001298828854570614, + "loss": 1.0924, + "step": 9076 + }, + { + "epoch": 1.616096866096866, + "grad_norm": 0.6944101452827454, + "learning_rate": 0.00012986952728260586, + "loss": 0.9632, + "step": 9077 + }, + { + "epoch": 1.6162749287749287, + "grad_norm": 0.6286170482635498, + "learning_rate": 0.000129856168522919, + "loss": 1.0311, + "step": 9078 + }, + { + "epoch": 1.6164529914529915, + "grad_norm": 0.8362757563591003, + "learning_rate": 0.0001298428091782625, + "loss": 1.1232, + "step": 9079 + }, + { + "epoch": 1.6166310541310542, + "grad_norm": 0.6199851632118225, + "learning_rate": 0.0001298294492488982, + "loss": 0.9454, + "step": 9080 + }, + { + "epoch": 1.6168091168091168, + "grad_norm": 0.7541791796684265, + "learning_rate": 0.0001298160887350878, + "loss": 0.9759, + "step": 9081 + }, + { + "epoch": 1.6169871794871795, + "grad_norm": 0.6940878033638, + "learning_rate": 0.00012980272763709304, + "loss": 0.9258, + "step": 9082 + }, + { + "epoch": 1.617165242165242, + "grad_norm": 0.6934045553207397, + "learning_rate": 0.00012978936595517575, + "loss": 1.0142, + "step": 9083 + }, + { + "epoch": 1.6173433048433048, + "grad_norm": 0.8147503733634949, + "learning_rate": 0.00012977600368959774, + "loss": 0.964, + "step": 9084 + }, + { + "epoch": 1.6175213675213675, + "grad_norm": 0.6583107709884644, + "learning_rate": 0.00012976264084062079, + "loss": 1.0315, + "step": 9085 + }, + { + "epoch": 1.6176994301994303, + "grad_norm": 0.7192013263702393, + "learning_rate": 0.0001297492774085067, + "loss": 0.9528, + "step": 9086 + }, + { + "epoch": 1.6178774928774928, + "grad_norm": 0.665888786315918, + "learning_rate": 0.00012973591339351733, + "loss": 1.0188, + "step": 9087 + }, + { + "epoch": 1.6180555555555556, + "grad_norm": 0.7170987725257874, + "learning_rate": 
0.0001297225487959145, + "loss": 0.8969, + "step": 9088 + }, + { + "epoch": 1.618233618233618, + "grad_norm": 0.6768732070922852, + "learning_rate": 0.00012970918361596007, + "loss": 1.1951, + "step": 9089 + }, + { + "epoch": 1.6184116809116809, + "grad_norm": 0.6640290021896362, + "learning_rate": 0.00012969581785391592, + "loss": 0.9649, + "step": 9090 + }, + { + "epoch": 1.6185897435897436, + "grad_norm": 0.6200813055038452, + "learning_rate": 0.00012968245151004392, + "loss": 0.9446, + "step": 9091 + }, + { + "epoch": 1.6187678062678064, + "grad_norm": 0.6815837621688843, + "learning_rate": 0.0001296690845846059, + "loss": 1.0506, + "step": 9092 + }, + { + "epoch": 1.618945868945869, + "grad_norm": 0.7252637147903442, + "learning_rate": 0.0001296557170778638, + "loss": 1.1977, + "step": 9093 + }, + { + "epoch": 1.6191239316239316, + "grad_norm": 0.5609107613563538, + "learning_rate": 0.00012964234899007955, + "loss": 0.8009, + "step": 9094 + }, + { + "epoch": 1.6193019943019942, + "grad_norm": 0.6539437770843506, + "learning_rate": 0.00012962898032151506, + "loss": 0.8482, + "step": 9095 + }, + { + "epoch": 1.619480056980057, + "grad_norm": 0.6993300914764404, + "learning_rate": 0.0001296156110724322, + "loss": 1.0725, + "step": 9096 + }, + { + "epoch": 1.6196581196581197, + "grad_norm": 0.6768273711204529, + "learning_rate": 0.000129602241243093, + "loss": 0.9247, + "step": 9097 + }, + { + "epoch": 1.6198361823361824, + "grad_norm": 0.6896265745162964, + "learning_rate": 0.00012958887083375939, + "loss": 0.9526, + "step": 9098 + }, + { + "epoch": 1.6200142450142452, + "grad_norm": 0.7475146651268005, + "learning_rate": 0.00012957549984469327, + "loss": 0.8302, + "step": 9099 + }, + { + "epoch": 1.6201923076923077, + "grad_norm": 0.6622769236564636, + "learning_rate": 0.00012956212827615674, + "loss": 0.9505, + "step": 9100 + }, + { + "epoch": 1.6203703703703702, + "grad_norm": 0.6938058137893677, + "learning_rate": 0.00012954875612841167, + "loss": 0.9757, + 
"step": 9101 + }, + { + "epoch": 1.620548433048433, + "grad_norm": 0.7453510761260986, + "learning_rate": 0.0001295353834017201, + "loss": 1.0919, + "step": 9102 + }, + { + "epoch": 1.6207264957264957, + "grad_norm": 0.7868932485580444, + "learning_rate": 0.0001295220100963441, + "loss": 0.9265, + "step": 9103 + }, + { + "epoch": 1.6209045584045585, + "grad_norm": 0.6779825091362, + "learning_rate": 0.00012950863621254558, + "loss": 0.98, + "step": 9104 + }, + { + "epoch": 1.6210826210826212, + "grad_norm": 0.6825897097587585, + "learning_rate": 0.00012949526175058662, + "loss": 0.9218, + "step": 9105 + }, + { + "epoch": 1.6212606837606838, + "grad_norm": 0.6686047911643982, + "learning_rate": 0.00012948188671072934, + "loss": 0.9546, + "step": 9106 + }, + { + "epoch": 1.6214387464387463, + "grad_norm": 0.7456090450286865, + "learning_rate": 0.0001294685110932357, + "loss": 1.0819, + "step": 9107 + }, + { + "epoch": 1.621616809116809, + "grad_norm": 0.7111441493034363, + "learning_rate": 0.0001294551348983678, + "loss": 0.9916, + "step": 9108 + }, + { + "epoch": 1.6217948717948718, + "grad_norm": 0.6534699201583862, + "learning_rate": 0.00012944175812638773, + "loss": 1.0374, + "step": 9109 + }, + { + "epoch": 1.6219729344729346, + "grad_norm": 0.6046397089958191, + "learning_rate": 0.00012942838077755758, + "loss": 0.7922, + "step": 9110 + }, + { + "epoch": 1.6221509971509973, + "grad_norm": 0.7736679911613464, + "learning_rate": 0.00012941500285213942, + "loss": 1.0056, + "step": 9111 + }, + { + "epoch": 1.6223290598290598, + "grad_norm": 0.6850929260253906, + "learning_rate": 0.00012940162435039538, + "loss": 0.9538, + "step": 9112 + }, + { + "epoch": 1.6225071225071224, + "grad_norm": 0.6305751800537109, + "learning_rate": 0.00012938824527258756, + "loss": 0.9341, + "step": 9113 + }, + { + "epoch": 1.6226851851851851, + "grad_norm": 0.6740923523902893, + "learning_rate": 0.0001293748656189782, + "loss": 1.0037, + "step": 9114 + }, + { + "epoch": 
1.6228632478632479, + "grad_norm": 0.6579762101173401, + "learning_rate": 0.00012936148538982928, + "loss": 1.0022, + "step": 9115 + }, + { + "epoch": 1.6230413105413106, + "grad_norm": 0.6500434279441833, + "learning_rate": 0.0001293481045854031, + "loss": 0.8589, + "step": 9116 + }, + { + "epoch": 1.6232193732193734, + "grad_norm": 0.7825912237167358, + "learning_rate": 0.00012933472320596177, + "loss": 1.0345, + "step": 9117 + }, + { + "epoch": 1.623397435897436, + "grad_norm": 0.8341414332389832, + "learning_rate": 0.0001293213412517675, + "loss": 1.0314, + "step": 9118 + }, + { + "epoch": 1.6235754985754984, + "grad_norm": 0.63664311170578, + "learning_rate": 0.00012930795872308242, + "loss": 0.819, + "step": 9119 + }, + { + "epoch": 1.6237535612535612, + "grad_norm": 0.6800840497016907, + "learning_rate": 0.00012929457562016878, + "loss": 0.95, + "step": 9120 + }, + { + "epoch": 1.623931623931624, + "grad_norm": 0.754165530204773, + "learning_rate": 0.0001292811919432888, + "loss": 1.1193, + "step": 9121 + }, + { + "epoch": 1.6241096866096867, + "grad_norm": 0.678871750831604, + "learning_rate": 0.00012926780769270465, + "loss": 0.9015, + "step": 9122 + }, + { + "epoch": 1.6242877492877494, + "grad_norm": 0.6642945408821106, + "learning_rate": 0.00012925442286867866, + "loss": 0.9095, + "step": 9123 + }, + { + "epoch": 1.624465811965812, + "grad_norm": 0.6089697480201721, + "learning_rate": 0.000129241037471473, + "loss": 0.8994, + "step": 9124 + }, + { + "epoch": 1.6246438746438745, + "grad_norm": 0.7320881485939026, + "learning_rate": 0.00012922765150134995, + "loss": 1.0518, + "step": 9125 + }, + { + "epoch": 1.6248219373219372, + "grad_norm": 0.7308032512664795, + "learning_rate": 0.0001292142649585718, + "loss": 1.0557, + "step": 9126 + }, + { + "epoch": 1.625, + "grad_norm": 0.6896602511405945, + "learning_rate": 0.0001292008778434008, + "loss": 1.145, + "step": 9127 + }, + { + "epoch": 1.6251780626780628, + "grad_norm": 0.6112532615661621, + 
"learning_rate": 0.00012918749015609926, + "loss": 0.9611, + "step": 9128 + }, + { + "epoch": 1.6253561253561255, + "grad_norm": 0.6856057643890381, + "learning_rate": 0.00012917410189692947, + "loss": 1.0124, + "step": 9129 + }, + { + "epoch": 1.625534188034188, + "grad_norm": 0.699252188205719, + "learning_rate": 0.00012916071306615378, + "loss": 0.8854, + "step": 9130 + }, + { + "epoch": 1.6257122507122506, + "grad_norm": 0.6306683421134949, + "learning_rate": 0.0001291473236640345, + "loss": 1.0722, + "step": 9131 + }, + { + "epoch": 1.6258903133903133, + "grad_norm": 0.6358118653297424, + "learning_rate": 0.00012913393369083393, + "loss": 0.889, + "step": 9132 + }, + { + "epoch": 1.626068376068376, + "grad_norm": 0.6953601837158203, + "learning_rate": 0.00012912054314681445, + "loss": 1.0168, + "step": 9133 + }, + { + "epoch": 1.6262464387464388, + "grad_norm": 0.6742331385612488, + "learning_rate": 0.00012910715203223844, + "loss": 0.8152, + "step": 9134 + }, + { + "epoch": 1.6264245014245016, + "grad_norm": 0.5872861742973328, + "learning_rate": 0.00012909376034736823, + "loss": 0.8702, + "step": 9135 + }, + { + "epoch": 1.626602564102564, + "grad_norm": 0.7580631971359253, + "learning_rate": 0.00012908036809246623, + "loss": 0.994, + "step": 9136 + }, + { + "epoch": 1.6267806267806266, + "grad_norm": 0.7544930577278137, + "learning_rate": 0.00012906697526779488, + "loss": 0.7475, + "step": 9137 + }, + { + "epoch": 1.6269586894586894, + "grad_norm": 0.6850766539573669, + "learning_rate": 0.00012905358187361647, + "loss": 1.0943, + "step": 9138 + }, + { + "epoch": 1.6271367521367521, + "grad_norm": 0.6821565628051758, + "learning_rate": 0.0001290401879101935, + "loss": 1.2928, + "step": 9139 + }, + { + "epoch": 1.6273148148148149, + "grad_norm": 0.6961034536361694, + "learning_rate": 0.00012902679337778835, + "loss": 0.8694, + "step": 9140 + }, + { + "epoch": 1.6274928774928776, + "grad_norm": 0.7159550786018372, + "learning_rate": 0.00012901339827666353, + 
"loss": 0.8827, + "step": 9141 + }, + { + "epoch": 1.6276709401709402, + "grad_norm": 0.7491081953048706, + "learning_rate": 0.0001290000026070814, + "loss": 0.8159, + "step": 9142 + }, + { + "epoch": 1.6278490028490027, + "grad_norm": 0.7107849717140198, + "learning_rate": 0.00012898660636930447, + "loss": 1.0625, + "step": 9143 + }, + { + "epoch": 1.6280270655270654, + "grad_norm": 0.7227210998535156, + "learning_rate": 0.0001289732095635952, + "loss": 0.9744, + "step": 9144 + }, + { + "epoch": 1.6282051282051282, + "grad_norm": 0.7141995429992676, + "learning_rate": 0.00012895981219021607, + "loss": 0.9836, + "step": 9145 + }, + { + "epoch": 1.628383190883191, + "grad_norm": 0.6445552706718445, + "learning_rate": 0.00012894641424942958, + "loss": 1.0183, + "step": 9146 + }, + { + "epoch": 1.6285612535612537, + "grad_norm": 0.698783278465271, + "learning_rate": 0.00012893301574149824, + "loss": 0.8392, + "step": 9147 + }, + { + "epoch": 1.6287393162393162, + "grad_norm": 0.6529116034507751, + "learning_rate": 0.00012891961666668458, + "loss": 0.9317, + "step": 9148 + }, + { + "epoch": 1.6289173789173788, + "grad_norm": 0.7780548930168152, + "learning_rate": 0.0001289062170252511, + "loss": 1.2406, + "step": 9149 + }, + { + "epoch": 1.6290954415954415, + "grad_norm": 0.6500990986824036, + "learning_rate": 0.0001288928168174603, + "loss": 1.0381, + "step": 9150 + }, + { + "epoch": 1.6292735042735043, + "grad_norm": 0.7098208665847778, + "learning_rate": 0.00012887941604357482, + "loss": 1.2126, + "step": 9151 + }, + { + "epoch": 1.629451566951567, + "grad_norm": 0.730648398399353, + "learning_rate": 0.0001288660147038572, + "loss": 0.8351, + "step": 9152 + }, + { + "epoch": 1.6296296296296298, + "grad_norm": 0.5520278215408325, + "learning_rate": 0.0001288526127985699, + "loss": 0.5877, + "step": 9153 + }, + { + "epoch": 1.6298076923076923, + "grad_norm": 0.7611770033836365, + "learning_rate": 0.00012883921032797563, + "loss": 1.2227, + "step": 9154 + }, + { + 
"epoch": 1.6299857549857548, + "grad_norm": 0.636820375919342, + "learning_rate": 0.00012882580729233696, + "loss": 0.8305, + "step": 9155 + }, + { + "epoch": 1.6301638176638176, + "grad_norm": 0.694492518901825, + "learning_rate": 0.00012881240369191644, + "loss": 1.0452, + "step": 9156 + }, + { + "epoch": 1.6303418803418803, + "grad_norm": 0.67826908826828, + "learning_rate": 0.00012879899952697677, + "loss": 0.8345, + "step": 9157 + }, + { + "epoch": 1.630519943019943, + "grad_norm": 0.5891323685646057, + "learning_rate": 0.00012878559479778052, + "loss": 0.8367, + "step": 9158 + }, + { + "epoch": 1.6306980056980058, + "grad_norm": 0.6766192317008972, + "learning_rate": 0.0001287721895045903, + "loss": 0.8319, + "step": 9159 + }, + { + "epoch": 1.6308760683760684, + "grad_norm": 0.5306392908096313, + "learning_rate": 0.0001287587836476688, + "loss": 0.7945, + "step": 9160 + }, + { + "epoch": 1.631054131054131, + "grad_norm": 0.6677970290184021, + "learning_rate": 0.0001287453772272787, + "loss": 1.1228, + "step": 9161 + }, + { + "epoch": 1.6312321937321936, + "grad_norm": 0.810052752494812, + "learning_rate": 0.00012873197024368266, + "loss": 0.8395, + "step": 9162 + }, + { + "epoch": 1.6314102564102564, + "grad_norm": 0.7619220018386841, + "learning_rate": 0.00012871856269714333, + "loss": 1.3713, + "step": 9163 + }, + { + "epoch": 1.6315883190883191, + "grad_norm": 0.6564521193504333, + "learning_rate": 0.00012870515458792342, + "loss": 1.0513, + "step": 9164 + }, + { + "epoch": 1.631766381766382, + "grad_norm": 0.6874445676803589, + "learning_rate": 0.00012869174591628564, + "loss": 1.0255, + "step": 9165 + }, + { + "epoch": 1.6319444444444444, + "grad_norm": 0.6958737373352051, + "learning_rate": 0.0001286783366824927, + "loss": 0.9361, + "step": 9166 + }, + { + "epoch": 1.6321225071225072, + "grad_norm": 0.6909199357032776, + "learning_rate": 0.0001286649268868073, + "loss": 0.9855, + "step": 9167 + }, + { + "epoch": 1.6323005698005697, + "grad_norm": 
0.7671375274658203, + "learning_rate": 0.00012865151652949225, + "loss": 1.084, + "step": 9168 + }, + { + "epoch": 1.6324786324786325, + "grad_norm": 0.750200092792511, + "learning_rate": 0.00012863810561081023, + "loss": 0.9341, + "step": 9169 + }, + { + "epoch": 1.6326566951566952, + "grad_norm": 0.6595860123634338, + "learning_rate": 0.00012862469413102402, + "loss": 0.9386, + "step": 9170 + }, + { + "epoch": 1.632834757834758, + "grad_norm": 0.622373640537262, + "learning_rate": 0.0001286112820903964, + "loss": 0.7697, + "step": 9171 + }, + { + "epoch": 1.6330128205128205, + "grad_norm": 0.9628498554229736, + "learning_rate": 0.00012859786948919014, + "loss": 1.2629, + "step": 9172 + }, + { + "epoch": 1.6331908831908832, + "grad_norm": 0.7610561847686768, + "learning_rate": 0.000128584456327668, + "loss": 0.9748, + "step": 9173 + }, + { + "epoch": 1.6333689458689458, + "grad_norm": 0.6585374474525452, + "learning_rate": 0.00012857104260609285, + "loss": 0.9049, + "step": 9174 + }, + { + "epoch": 1.6335470085470085, + "grad_norm": 0.6996221542358398, + "learning_rate": 0.00012855762832472746, + "loss": 0.8893, + "step": 9175 + }, + { + "epoch": 1.6337250712250713, + "grad_norm": 0.6226270198822021, + "learning_rate": 0.00012854421348383466, + "loss": 0.8913, + "step": 9176 + }, + { + "epoch": 1.633903133903134, + "grad_norm": 0.6570866107940674, + "learning_rate": 0.00012853079808367731, + "loss": 0.8632, + "step": 9177 + }, + { + "epoch": 1.6340811965811965, + "grad_norm": 0.6899664402008057, + "learning_rate": 0.00012851738212451826, + "loss": 0.8177, + "step": 9178 + }, + { + "epoch": 1.6342592592592593, + "grad_norm": 0.75257807970047, + "learning_rate": 0.0001285039656066203, + "loss": 0.9096, + "step": 9179 + }, + { + "epoch": 1.6344373219373218, + "grad_norm": 0.6614963412284851, + "learning_rate": 0.00012849054853024638, + "loss": 0.9255, + "step": 9180 + }, + { + "epoch": 1.6346153846153846, + "grad_norm": 0.7245957851409912, + "learning_rate": 
0.00012847713089565933, + "loss": 1.0122, + "step": 9181 + }, + { + "epoch": 1.6347934472934473, + "grad_norm": 0.7332839369773865, + "learning_rate": 0.00012846371270312204, + "loss": 0.8484, + "step": 9182 + }, + { + "epoch": 1.63497150997151, + "grad_norm": 0.628089189529419, + "learning_rate": 0.00012845029395289748, + "loss": 1.0171, + "step": 9183 + }, + { + "epoch": 1.6351495726495726, + "grad_norm": 0.7493528723716736, + "learning_rate": 0.00012843687464524848, + "loss": 1.1635, + "step": 9184 + }, + { + "epoch": 1.6353276353276354, + "grad_norm": 0.6328163146972656, + "learning_rate": 0.00012842345478043799, + "loss": 1.1254, + "step": 9185 + }, + { + "epoch": 1.635505698005698, + "grad_norm": 0.6720291376113892, + "learning_rate": 0.00012841003435872894, + "loss": 0.9729, + "step": 9186 + }, + { + "epoch": 1.6356837606837606, + "grad_norm": 0.6657332181930542, + "learning_rate": 0.00012839661338038427, + "loss": 1.1047, + "step": 9187 + }, + { + "epoch": 1.6358618233618234, + "grad_norm": 0.7416180968284607, + "learning_rate": 0.000128383191845667, + "loss": 0.9505, + "step": 9188 + }, + { + "epoch": 1.6360398860398861, + "grad_norm": 0.8737816214561462, + "learning_rate": 0.00012836976975484, + "loss": 1.0518, + "step": 9189 + }, + { + "epoch": 1.6362179487179487, + "grad_norm": 0.7351877093315125, + "learning_rate": 0.0001283563471081663, + "loss": 1.1152, + "step": 9190 + }, + { + "epoch": 1.6363960113960114, + "grad_norm": 0.6442788243293762, + "learning_rate": 0.00012834292390590893, + "loss": 0.9432, + "step": 9191 + }, + { + "epoch": 1.636574074074074, + "grad_norm": 0.6848029494285583, + "learning_rate": 0.0001283295001483308, + "loss": 0.8528, + "step": 9192 + }, + { + "epoch": 1.6367521367521367, + "grad_norm": 0.6627060174942017, + "learning_rate": 0.00012831607583569497, + "loss": 1.0222, + "step": 9193 + }, + { + "epoch": 1.6369301994301995, + "grad_norm": 0.7319555878639221, + "learning_rate": 0.00012830265096826446, + "loss": 0.9392, + 
"step": 9194 + }, + { + "epoch": 1.6371082621082622, + "grad_norm": 0.6986424326896667, + "learning_rate": 0.0001282892255463023, + "loss": 1.2095, + "step": 9195 + }, + { + "epoch": 1.6372863247863247, + "grad_norm": 0.6649929881095886, + "learning_rate": 0.0001282757995700715, + "loss": 0.9426, + "step": 9196 + }, + { + "epoch": 1.6374643874643875, + "grad_norm": 0.6789031624794006, + "learning_rate": 0.0001282623730398352, + "loss": 0.9705, + "step": 9197 + }, + { + "epoch": 1.63764245014245, + "grad_norm": 0.6388779878616333, + "learning_rate": 0.00012824894595585637, + "loss": 1.0698, + "step": 9198 + }, + { + "epoch": 1.6378205128205128, + "grad_norm": 0.636832594871521, + "learning_rate": 0.00012823551831839814, + "loss": 0.9445, + "step": 9199 + }, + { + "epoch": 1.6379985754985755, + "grad_norm": 0.670190691947937, + "learning_rate": 0.0001282220901277236, + "loss": 0.9847, + "step": 9200 + }, + { + "epoch": 1.6381766381766383, + "grad_norm": 0.6020209193229675, + "learning_rate": 0.0001282086613840958, + "loss": 1.0047, + "step": 9201 + }, + { + "epoch": 1.6383547008547008, + "grad_norm": 0.6648211479187012, + "learning_rate": 0.0001281952320877779, + "loss": 0.8717, + "step": 9202 + }, + { + "epoch": 1.6385327635327636, + "grad_norm": 0.7207710146903992, + "learning_rate": 0.000128181802239033, + "loss": 1.1232, + "step": 9203 + }, + { + "epoch": 1.638710826210826, + "grad_norm": 0.800992488861084, + "learning_rate": 0.0001281683718381242, + "loss": 1.0688, + "step": 9204 + }, + { + "epoch": 1.6388888888888888, + "grad_norm": 0.789398193359375, + "learning_rate": 0.0001281549408853147, + "loss": 1.1772, + "step": 9205 + }, + { + "epoch": 1.6390669515669516, + "grad_norm": 0.6514480710029602, + "learning_rate": 0.0001281415093808676, + "loss": 1.1685, + "step": 9206 + }, + { + "epoch": 1.6392450142450143, + "grad_norm": 0.6914686560630798, + "learning_rate": 0.00012812807732504608, + "loss": 1.1307, + "step": 9207 + }, + { + "epoch": 1.6394230769230769, + 
"grad_norm": 0.6788144111633301, + "learning_rate": 0.00012811464471811334, + "loss": 1.1735, + "step": 9208 + }, + { + "epoch": 1.6396011396011396, + "grad_norm": 0.7049870491027832, + "learning_rate": 0.00012810121156033252, + "loss": 1.0128, + "step": 9209 + }, + { + "epoch": 1.6397792022792022, + "grad_norm": 0.7156766057014465, + "learning_rate": 0.00012808777785196687, + "loss": 0.9503, + "step": 9210 + }, + { + "epoch": 1.639957264957265, + "grad_norm": 0.651716411113739, + "learning_rate": 0.0001280743435932795, + "loss": 1.1227, + "step": 9211 + }, + { + "epoch": 1.6401353276353277, + "grad_norm": 0.7276262044906616, + "learning_rate": 0.0001280609087845337, + "loss": 1.06, + "step": 9212 + }, + { + "epoch": 1.6403133903133904, + "grad_norm": 0.6591095924377441, + "learning_rate": 0.0001280474734259927, + "loss": 1.0861, + "step": 9213 + }, + { + "epoch": 1.640491452991453, + "grad_norm": 0.6675926446914673, + "learning_rate": 0.00012803403751791975, + "loss": 0.9815, + "step": 9214 + }, + { + "epoch": 1.6406695156695157, + "grad_norm": 0.6391474008560181, + "learning_rate": 0.00012802060106057803, + "loss": 0.8027, + "step": 9215 + }, + { + "epoch": 1.6408475783475782, + "grad_norm": 0.6384556293487549, + "learning_rate": 0.00012800716405423086, + "loss": 0.7877, + "step": 9216 + }, + { + "epoch": 1.641025641025641, + "grad_norm": 0.661191463470459, + "learning_rate": 0.00012799372649914146, + "loss": 0.9725, + "step": 9217 + }, + { + "epoch": 1.6412037037037037, + "grad_norm": 0.7418332695960999, + "learning_rate": 0.0001279802883955732, + "loss": 1.1756, + "step": 9218 + }, + { + "epoch": 1.6413817663817665, + "grad_norm": 0.6588954329490662, + "learning_rate": 0.00012796684974378928, + "loss": 1.0428, + "step": 9219 + }, + { + "epoch": 1.6415598290598292, + "grad_norm": 0.7566093802452087, + "learning_rate": 0.000127953410544053, + "loss": 1.1254, + "step": 9220 + }, + { + "epoch": 1.6417378917378918, + "grad_norm": 0.6801039576530457, + 
"learning_rate": 0.00012793997079662777, + "loss": 1.0854, + "step": 9221 + }, + { + "epoch": 1.6419159544159543, + "grad_norm": 0.7262716889381409, + "learning_rate": 0.0001279265305017768, + "loss": 0.9343, + "step": 9222 + }, + { + "epoch": 1.642094017094017, + "grad_norm": 0.628625750541687, + "learning_rate": 0.0001279130896597635, + "loss": 0.8942, + "step": 9223 + }, + { + "epoch": 1.6422720797720798, + "grad_norm": 0.6183576583862305, + "learning_rate": 0.0001278996482708512, + "loss": 0.9284, + "step": 9224 + }, + { + "epoch": 1.6424501424501425, + "grad_norm": 0.7912000417709351, + "learning_rate": 0.00012788620633530327, + "loss": 1.3043, + "step": 9225 + }, + { + "epoch": 1.6426282051282053, + "grad_norm": 0.6982026100158691, + "learning_rate": 0.00012787276385338298, + "loss": 1.0224, + "step": 9226 + }, + { + "epoch": 1.6428062678062678, + "grad_norm": 0.6734985709190369, + "learning_rate": 0.00012785932082535386, + "loss": 0.8781, + "step": 9227 + }, + { + "epoch": 1.6429843304843303, + "grad_norm": 0.8799532055854797, + "learning_rate": 0.0001278458772514792, + "loss": 1.1482, + "step": 9228 + }, + { + "epoch": 1.643162393162393, + "grad_norm": 0.590295672416687, + "learning_rate": 0.0001278324331320224, + "loss": 0.9502, + "step": 9229 + }, + { + "epoch": 1.6433404558404558, + "grad_norm": 0.6562125086784363, + "learning_rate": 0.0001278189884672469, + "loss": 0.9834, + "step": 9230 + }, + { + "epoch": 1.6435185185185186, + "grad_norm": 0.6848936676979065, + "learning_rate": 0.00012780554325741612, + "loss": 1.0414, + "step": 9231 + }, + { + "epoch": 1.6436965811965814, + "grad_norm": 0.5985032320022583, + "learning_rate": 0.00012779209750279344, + "loss": 0.9469, + "step": 9232 + }, + { + "epoch": 1.6438746438746439, + "grad_norm": 0.7500917911529541, + "learning_rate": 0.00012777865120364238, + "loss": 0.9626, + "step": 9233 + }, + { + "epoch": 1.6440527065527064, + "grad_norm": 0.6565709114074707, + "learning_rate": 0.00012776520436022634, + 
"loss": 1.0594, + "step": 9234 + }, + { + "epoch": 1.6442307692307692, + "grad_norm": 0.8005441427230835, + "learning_rate": 0.00012775175697280882, + "loss": 1.2379, + "step": 9235 + }, + { + "epoch": 1.644408831908832, + "grad_norm": 0.6734150648117065, + "learning_rate": 0.00012773830904165326, + "loss": 0.9171, + "step": 9236 + }, + { + "epoch": 1.6445868945868947, + "grad_norm": 0.6950868368148804, + "learning_rate": 0.00012772486056702314, + "loss": 1.1782, + "step": 9237 + }, + { + "epoch": 1.6447649572649574, + "grad_norm": 0.8009599447250366, + "learning_rate": 0.000127711411549182, + "loss": 1.0288, + "step": 9238 + }, + { + "epoch": 1.64494301994302, + "grad_norm": 0.6227970719337463, + "learning_rate": 0.0001276979619883933, + "loss": 0.9327, + "step": 9239 + }, + { + "epoch": 1.6451210826210825, + "grad_norm": 0.6828190088272095, + "learning_rate": 0.00012768451188492058, + "loss": 0.9816, + "step": 9240 + }, + { + "epoch": 1.6452991452991452, + "grad_norm": 0.9689767360687256, + "learning_rate": 0.00012767106123902738, + "loss": 0.9049, + "step": 9241 + }, + { + "epoch": 1.645477207977208, + "grad_norm": 0.677061140537262, + "learning_rate": 0.00012765761005097717, + "loss": 0.9472, + "step": 9242 + }, + { + "epoch": 1.6456552706552707, + "grad_norm": 0.7227110862731934, + "learning_rate": 0.00012764415832103356, + "loss": 1.0384, + "step": 9243 + }, + { + "epoch": 1.6458333333333335, + "grad_norm": 0.6540094614028931, + "learning_rate": 0.0001276307060494601, + "loss": 0.8166, + "step": 9244 + }, + { + "epoch": 1.646011396011396, + "grad_norm": 0.6921904683113098, + "learning_rate": 0.00012761725323652033, + "loss": 0.9746, + "step": 9245 + }, + { + "epoch": 1.6461894586894585, + "grad_norm": 0.6742660999298096, + "learning_rate": 0.0001276037998824779, + "loss": 0.8441, + "step": 9246 + }, + { + "epoch": 1.6463675213675213, + "grad_norm": 0.6611103415489197, + "learning_rate": 0.0001275903459875963, + "loss": 1.087, + "step": 9247 + }, + { + 
"epoch": 1.646545584045584, + "grad_norm": 0.6805498003959656, + "learning_rate": 0.00012757689155213923, + "loss": 0.923, + "step": 9248 + }, + { + "epoch": 1.6467236467236468, + "grad_norm": 0.6598179340362549, + "learning_rate": 0.00012756343657637024, + "loss": 0.9371, + "step": 9249 + }, + { + "epoch": 1.6469017094017095, + "grad_norm": 0.7147273421287537, + "learning_rate": 0.00012754998106055297, + "loss": 1.053, + "step": 9250 + }, + { + "epoch": 1.647079772079772, + "grad_norm": 0.72414630651474, + "learning_rate": 0.00012753652500495103, + "loss": 1.0547, + "step": 9251 + }, + { + "epoch": 1.6472578347578346, + "grad_norm": 0.7784913182258606, + "learning_rate": 0.00012752306840982811, + "loss": 0.9012, + "step": 9252 + }, + { + "epoch": 1.6474358974358974, + "grad_norm": 0.644026517868042, + "learning_rate": 0.0001275096112754478, + "loss": 1.0911, + "step": 9253 + }, + { + "epoch": 1.64761396011396, + "grad_norm": 0.691124677658081, + "learning_rate": 0.00012749615360207382, + "loss": 0.9918, + "step": 9254 + }, + { + "epoch": 1.6477920227920229, + "grad_norm": 0.6632972359657288, + "learning_rate": 0.00012748269538996986, + "loss": 0.9438, + "step": 9255 + }, + { + "epoch": 1.6479700854700856, + "grad_norm": 0.6548733115196228, + "learning_rate": 0.00012746923663939955, + "loss": 1.1082, + "step": 9256 + }, + { + "epoch": 1.6481481481481481, + "grad_norm": 0.6737542748451233, + "learning_rate": 0.00012745577735062664, + "loss": 0.9255, + "step": 9257 + }, + { + "epoch": 1.6483262108262107, + "grad_norm": 0.686862051486969, + "learning_rate": 0.00012744231752391479, + "loss": 0.9493, + "step": 9258 + }, + { + "epoch": 1.6485042735042734, + "grad_norm": 0.6096474528312683, + "learning_rate": 0.00012742885715952772, + "loss": 0.6849, + "step": 9259 + }, + { + "epoch": 1.6486823361823362, + "grad_norm": 0.702751636505127, + "learning_rate": 0.00012741539625772918, + "loss": 1.0335, + "step": 9260 + }, + { + "epoch": 1.648860398860399, + "grad_norm": 
0.7470958232879639, + "learning_rate": 0.0001274019348187829, + "loss": 1.105, + "step": 9261 + }, + { + "epoch": 1.6490384615384617, + "grad_norm": 0.6642739176750183, + "learning_rate": 0.0001273884728429526, + "loss": 1.01, + "step": 9262 + }, + { + "epoch": 1.6492165242165242, + "grad_norm": 0.6470904350280762, + "learning_rate": 0.00012737501033050213, + "loss": 0.9009, + "step": 9263 + }, + { + "epoch": 1.6493945868945867, + "grad_norm": 0.7487246990203857, + "learning_rate": 0.00012736154728169518, + "loss": 0.9832, + "step": 9264 + }, + { + "epoch": 1.6495726495726495, + "grad_norm": 0.7370779514312744, + "learning_rate": 0.00012734808369679553, + "loss": 1.0464, + "step": 9265 + }, + { + "epoch": 1.6497507122507122, + "grad_norm": 0.7942814826965332, + "learning_rate": 0.00012733461957606702, + "loss": 1.102, + "step": 9266 + }, + { + "epoch": 1.649928774928775, + "grad_norm": 0.6535606980323792, + "learning_rate": 0.00012732115491977336, + "loss": 1.0655, + "step": 9267 + }, + { + "epoch": 1.6501068376068377, + "grad_norm": 0.601716935634613, + "learning_rate": 0.00012730768972817847, + "loss": 0.8236, + "step": 9268 + }, + { + "epoch": 1.6502849002849003, + "grad_norm": 0.7375118732452393, + "learning_rate": 0.00012729422400154614, + "loss": 0.9313, + "step": 9269 + }, + { + "epoch": 1.6504629629629628, + "grad_norm": 0.7360411882400513, + "learning_rate": 0.00012728075774014018, + "loss": 0.9254, + "step": 9270 + }, + { + "epoch": 1.6506410256410255, + "grad_norm": 0.8453929424285889, + "learning_rate": 0.00012726729094422444, + "loss": 1.0975, + "step": 9271 + }, + { + "epoch": 1.6508190883190883, + "grad_norm": 0.5615501999855042, + "learning_rate": 0.00012725382361406274, + "loss": 0.8243, + "step": 9272 + }, + { + "epoch": 1.650997150997151, + "grad_norm": 0.6494898796081543, + "learning_rate": 0.000127240355749919, + "loss": 0.9766, + "step": 9273 + }, + { + "epoch": 1.6511752136752138, + "grad_norm": 0.6544778347015381, + "learning_rate": 
0.0001272268873520571, + "loss": 0.9969, + "step": 9274 + }, + { + "epoch": 1.6513532763532763, + "grad_norm": 0.6937400698661804, + "learning_rate": 0.00012721341842074092, + "loss": 1.0626, + "step": 9275 + }, + { + "epoch": 1.651531339031339, + "grad_norm": 0.7068421244621277, + "learning_rate": 0.0001271999489562343, + "loss": 1.0068, + "step": 9276 + }, + { + "epoch": 1.6517094017094016, + "grad_norm": 0.6425052285194397, + "learning_rate": 0.0001271864789588012, + "loss": 0.8716, + "step": 9277 + }, + { + "epoch": 1.6518874643874644, + "grad_norm": 0.6895090341567993, + "learning_rate": 0.0001271730084287055, + "loss": 1.081, + "step": 9278 + }, + { + "epoch": 1.6520655270655271, + "grad_norm": 0.6773712038993835, + "learning_rate": 0.00012715953736621116, + "loss": 0.7586, + "step": 9279 + }, + { + "epoch": 1.6522435897435899, + "grad_norm": 0.6085716485977173, + "learning_rate": 0.0001271460657715821, + "loss": 0.8627, + "step": 9280 + }, + { + "epoch": 1.6524216524216524, + "grad_norm": 0.6415461897850037, + "learning_rate": 0.00012713259364508227, + "loss": 0.9751, + "step": 9281 + }, + { + "epoch": 1.6525997150997151, + "grad_norm": 0.6460939645767212, + "learning_rate": 0.00012711912098697565, + "loss": 0.9578, + "step": 9282 + }, + { + "epoch": 1.6527777777777777, + "grad_norm": 0.6076797246932983, + "learning_rate": 0.00012710564779752615, + "loss": 0.9627, + "step": 9283 + }, + { + "epoch": 1.6529558404558404, + "grad_norm": 0.710782527923584, + "learning_rate": 0.00012709217407699783, + "loss": 0.8725, + "step": 9284 + }, + { + "epoch": 1.6531339031339032, + "grad_norm": 0.6793623566627502, + "learning_rate": 0.00012707869982565463, + "loss": 0.908, + "step": 9285 + }, + { + "epoch": 1.653311965811966, + "grad_norm": 0.6841681003570557, + "learning_rate": 0.00012706522504376055, + "loss": 0.8546, + "step": 9286 + }, + { + "epoch": 1.6534900284900285, + "grad_norm": 0.7908675670623779, + "learning_rate": 0.0001270517497315796, + "loss": 0.9409, + 
"step": 9287 + }, + { + "epoch": 1.6536680911680912, + "grad_norm": 0.6918683648109436, + "learning_rate": 0.0001270382738893758, + "loss": 1.0493, + "step": 9288 + }, + { + "epoch": 1.6538461538461537, + "grad_norm": 0.6891819834709167, + "learning_rate": 0.00012702479751741322, + "loss": 1.0675, + "step": 9289 + }, + { + "epoch": 1.6540242165242165, + "grad_norm": 0.6965166926383972, + "learning_rate": 0.00012701132061595586, + "loss": 0.8563, + "step": 9290 + }, + { + "epoch": 1.6542022792022792, + "grad_norm": 0.7549001574516296, + "learning_rate": 0.00012699784318526779, + "loss": 1.1572, + "step": 9291 + }, + { + "epoch": 1.654380341880342, + "grad_norm": 0.6100513339042664, + "learning_rate": 0.00012698436522561303, + "loss": 0.897, + "step": 9292 + }, + { + "epoch": 1.6545584045584045, + "grad_norm": 0.6477037668228149, + "learning_rate": 0.00012697088673725574, + "loss": 0.7961, + "step": 9293 + }, + { + "epoch": 1.6547364672364673, + "grad_norm": 0.7402619123458862, + "learning_rate": 0.0001269574077204599, + "loss": 1.2001, + "step": 9294 + }, + { + "epoch": 1.6549145299145298, + "grad_norm": 0.7162346243858337, + "learning_rate": 0.0001269439281754897, + "loss": 0.9963, + "step": 9295 + }, + { + "epoch": 1.6550925925925926, + "grad_norm": 0.6757413744926453, + "learning_rate": 0.0001269304481026092, + "loss": 1.0476, + "step": 9296 + }, + { + "epoch": 1.6552706552706553, + "grad_norm": 0.6455655097961426, + "learning_rate": 0.0001269169675020825, + "loss": 0.9716, + "step": 9297 + }, + { + "epoch": 1.655448717948718, + "grad_norm": 0.7705031037330627, + "learning_rate": 0.0001269034863741737, + "loss": 0.9886, + "step": 9298 + }, + { + "epoch": 1.6556267806267806, + "grad_norm": 0.6084272861480713, + "learning_rate": 0.000126890004719147, + "loss": 0.8231, + "step": 9299 + }, + { + "epoch": 1.6558048433048433, + "grad_norm": 0.7051045298576355, + "learning_rate": 0.00012687652253726652, + "loss": 0.8673, + "step": 9300 + }, + { + "epoch": 
1.6559829059829059, + "grad_norm": 0.731675386428833, + "learning_rate": 0.0001268630398287964, + "loss": 0.8609, + "step": 9301 + }, + { + "epoch": 1.6561609686609686, + "grad_norm": 0.6796799302101135, + "learning_rate": 0.00012684955659400087, + "loss": 1.0157, + "step": 9302 + }, + { + "epoch": 1.6563390313390314, + "grad_norm": 0.6270264983177185, + "learning_rate": 0.000126836072833144, + "loss": 0.8924, + "step": 9303 + }, + { + "epoch": 1.6565170940170941, + "grad_norm": 0.7235464453697205, + "learning_rate": 0.00012682258854649004, + "loss": 0.8904, + "step": 9304 + }, + { + "epoch": 1.6566951566951567, + "grad_norm": 0.7644724249839783, + "learning_rate": 0.00012680910373430318, + "loss": 0.9119, + "step": 9305 + }, + { + "epoch": 1.6568732193732194, + "grad_norm": 0.661411702632904, + "learning_rate": 0.00012679561839684764, + "loss": 1.0066, + "step": 9306 + }, + { + "epoch": 1.657051282051282, + "grad_norm": 0.6981723308563232, + "learning_rate": 0.0001267821325343876, + "loss": 1.2579, + "step": 9307 + }, + { + "epoch": 1.6572293447293447, + "grad_norm": 0.6469807028770447, + "learning_rate": 0.0001267686461471873, + "loss": 0.8678, + "step": 9308 + }, + { + "epoch": 1.6574074074074074, + "grad_norm": 0.8255495429039001, + "learning_rate": 0.000126755159235511, + "loss": 0.9053, + "step": 9309 + }, + { + "epoch": 1.6575854700854702, + "grad_norm": 0.6882261037826538, + "learning_rate": 0.00012674167179962294, + "loss": 0.8364, + "step": 9310 + }, + { + "epoch": 1.6577635327635327, + "grad_norm": 0.6816701889038086, + "learning_rate": 0.00012672818383978733, + "loss": 0.9627, + "step": 9311 + }, + { + "epoch": 1.6579415954415955, + "grad_norm": 0.6993424892425537, + "learning_rate": 0.00012671469535626852, + "loss": 0.8337, + "step": 9312 + }, + { + "epoch": 1.658119658119658, + "grad_norm": 0.6271458864212036, + "learning_rate": 0.00012670120634933075, + "loss": 0.8322, + "step": 9313 + }, + { + "epoch": 1.6582977207977208, + "grad_norm": 
0.7012003660202026, + "learning_rate": 0.00012668771681923827, + "loss": 0.8895, + "step": 9314 + }, + { + "epoch": 1.6584757834757835, + "grad_norm": 0.6704670190811157, + "learning_rate": 0.00012667422676625547, + "loss": 1.0544, + "step": 9315 + }, + { + "epoch": 1.6586538461538463, + "grad_norm": 0.6189491748809814, + "learning_rate": 0.0001266607361906466, + "loss": 0.9623, + "step": 9316 + }, + { + "epoch": 1.6588319088319088, + "grad_norm": 0.7065694332122803, + "learning_rate": 0.000126647245092676, + "loss": 0.8874, + "step": 9317 + }, + { + "epoch": 1.6590099715099715, + "grad_norm": 0.7473452687263489, + "learning_rate": 0.00012663375347260795, + "loss": 1.0576, + "step": 9318 + }, + { + "epoch": 1.659188034188034, + "grad_norm": 0.6839408874511719, + "learning_rate": 0.0001266202613307068, + "loss": 0.9127, + "step": 9319 + }, + { + "epoch": 1.6593660968660968, + "grad_norm": 0.7154020071029663, + "learning_rate": 0.00012660676866723699, + "loss": 1.1174, + "step": 9320 + }, + { + "epoch": 1.6595441595441596, + "grad_norm": 0.7123729586601257, + "learning_rate": 0.0001265932754824628, + "loss": 0.9617, + "step": 9321 + }, + { + "epoch": 1.6597222222222223, + "grad_norm": 0.7537810802459717, + "learning_rate": 0.0001265797817766486, + "loss": 1.0333, + "step": 9322 + }, + { + "epoch": 1.6599002849002849, + "grad_norm": 0.706551730632782, + "learning_rate": 0.00012656628755005884, + "loss": 1.0838, + "step": 9323 + }, + { + "epoch": 1.6600783475783476, + "grad_norm": 0.8104004859924316, + "learning_rate": 0.0001265527928029578, + "loss": 0.9807, + "step": 9324 + }, + { + "epoch": 1.6602564102564101, + "grad_norm": 0.6892881989479065, + "learning_rate": 0.00012653929753560998, + "loss": 0.9941, + "step": 9325 + }, + { + "epoch": 1.6604344729344729, + "grad_norm": 0.5919203758239746, + "learning_rate": 0.00012652580174827974, + "loss": 0.9268, + "step": 9326 + }, + { + "epoch": 1.6606125356125356, + "grad_norm": 0.6715863347053528, + "learning_rate": 
0.00012651230544123154, + "loss": 1.0912, + "step": 9327 + }, + { + "epoch": 1.6607905982905984, + "grad_norm": 0.6765137314796448, + "learning_rate": 0.0001264988086147298, + "loss": 1.1576, + "step": 9328 + }, + { + "epoch": 1.660968660968661, + "grad_norm": 0.6781638860702515, + "learning_rate": 0.00012648531126903888, + "loss": 1.1162, + "step": 9329 + }, + { + "epoch": 1.6611467236467237, + "grad_norm": 0.715871274471283, + "learning_rate": 0.00012647181340442337, + "loss": 0.714, + "step": 9330 + }, + { + "epoch": 1.6613247863247862, + "grad_norm": 0.6237258315086365, + "learning_rate": 0.00012645831502114762, + "loss": 0.8512, + "step": 9331 + }, + { + "epoch": 1.661502849002849, + "grad_norm": 0.6668339967727661, + "learning_rate": 0.0001264448161194762, + "loss": 1.0384, + "step": 9332 + }, + { + "epoch": 1.6616809116809117, + "grad_norm": 0.8316730260848999, + "learning_rate": 0.00012643131669967352, + "loss": 0.8931, + "step": 9333 + }, + { + "epoch": 1.6618589743589745, + "grad_norm": 0.7013183832168579, + "learning_rate": 0.00012641781676200406, + "loss": 1.0548, + "step": 9334 + }, + { + "epoch": 1.6620370370370372, + "grad_norm": 0.6980466842651367, + "learning_rate": 0.00012640431630673243, + "loss": 0.8988, + "step": 9335 + }, + { + "epoch": 1.6622150997150997, + "grad_norm": 0.7045995593070984, + "learning_rate": 0.000126390815334123, + "loss": 1.107, + "step": 9336 + }, + { + "epoch": 1.6623931623931623, + "grad_norm": 0.6699773669242859, + "learning_rate": 0.00012637731384444043, + "loss": 1.1757, + "step": 9337 + }, + { + "epoch": 1.662571225071225, + "grad_norm": 0.6489999294281006, + "learning_rate": 0.00012636381183794916, + "loss": 0.9282, + "step": 9338 + }, + { + "epoch": 1.6627492877492878, + "grad_norm": 0.7085952758789062, + "learning_rate": 0.00012635030931491375, + "loss": 1.0221, + "step": 9339 + }, + { + "epoch": 1.6629273504273505, + "grad_norm": 0.6893135905265808, + "learning_rate": 0.00012633680627559878, + "loss": 1.0517, + 
"step": 9340 + }, + { + "epoch": 1.6631054131054133, + "grad_norm": 0.5659682154655457, + "learning_rate": 0.00012632330272026882, + "loss": 0.6294, + "step": 9341 + }, + { + "epoch": 1.6632834757834758, + "grad_norm": 0.6889018416404724, + "learning_rate": 0.00012630979864918838, + "loss": 1.0735, + "step": 9342 + }, + { + "epoch": 1.6634615384615383, + "grad_norm": 0.7333424687385559, + "learning_rate": 0.00012629629406262212, + "loss": 0.9079, + "step": 9343 + }, + { + "epoch": 1.663639601139601, + "grad_norm": 0.6340580582618713, + "learning_rate": 0.00012628278896083462, + "loss": 0.9738, + "step": 9344 + }, + { + "epoch": 1.6638176638176638, + "grad_norm": 0.7042564749717712, + "learning_rate": 0.00012626928334409044, + "loss": 0.959, + "step": 9345 + }, + { + "epoch": 1.6639957264957266, + "grad_norm": 0.711757242679596, + "learning_rate": 0.00012625577721265424, + "loss": 0.8113, + "step": 9346 + }, + { + "epoch": 1.6641737891737893, + "grad_norm": 0.7723299264907837, + "learning_rate": 0.0001262422705667906, + "loss": 1.1724, + "step": 9347 + }, + { + "epoch": 1.6643518518518519, + "grad_norm": 0.711334228515625, + "learning_rate": 0.00012622876340676422, + "loss": 1.0121, + "step": 9348 + }, + { + "epoch": 1.6645299145299144, + "grad_norm": 0.6954590678215027, + "learning_rate": 0.0001262152557328397, + "loss": 1.2093, + "step": 9349 + }, + { + "epoch": 1.6647079772079771, + "grad_norm": 0.6341620087623596, + "learning_rate": 0.00012620174754528166, + "loss": 1.0535, + "step": 9350 + }, + { + "epoch": 1.66488603988604, + "grad_norm": 0.6434268355369568, + "learning_rate": 0.00012618823884435484, + "loss": 0.8964, + "step": 9351 + }, + { + "epoch": 1.6650641025641026, + "grad_norm": 0.7685084939002991, + "learning_rate": 0.00012617472963032385, + "loss": 1.0639, + "step": 9352 + }, + { + "epoch": 1.6652421652421654, + "grad_norm": 0.6347958445549011, + "learning_rate": 0.00012616121990345345, + "loss": 1.0252, + "step": 9353 + }, + { + "epoch": 
1.665420227920228, + "grad_norm": 0.647722601890564, + "learning_rate": 0.0001261477096640083, + "loss": 0.9527, + "step": 9354 + }, + { + "epoch": 1.6655982905982905, + "grad_norm": 0.5942047834396362, + "learning_rate": 0.000126134198912253, + "loss": 1.0062, + "step": 9355 + }, + { + "epoch": 1.6657763532763532, + "grad_norm": 0.683555006980896, + "learning_rate": 0.00012612068764845247, + "loss": 0.8101, + "step": 9356 + }, + { + "epoch": 1.665954415954416, + "grad_norm": 0.6832289099693298, + "learning_rate": 0.00012610717587287128, + "loss": 1.1436, + "step": 9357 + }, + { + "epoch": 1.6661324786324787, + "grad_norm": 0.7035253047943115, + "learning_rate": 0.00012609366358577422, + "loss": 0.9724, + "step": 9358 + }, + { + "epoch": 1.6663105413105415, + "grad_norm": 0.6471409797668457, + "learning_rate": 0.00012608015078742604, + "loss": 0.776, + "step": 9359 + }, + { + "epoch": 1.666488603988604, + "grad_norm": 0.7069687247276306, + "learning_rate": 0.00012606663747809145, + "loss": 0.9667, + "step": 9360 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.6744135618209839, + "learning_rate": 0.00012605312365803525, + "loss": 1.1152, + "step": 9361 + }, + { + "epoch": 1.6668447293447293, + "grad_norm": 0.7212334275245667, + "learning_rate": 0.00012603960932752227, + "loss": 1.1543, + "step": 9362 + }, + { + "epoch": 1.667022792022792, + "grad_norm": 0.6501669883728027, + "learning_rate": 0.0001260260944868172, + "loss": 0.8595, + "step": 9363 + }, + { + "epoch": 1.6672008547008548, + "grad_norm": 0.6970864534378052, + "learning_rate": 0.00012601257913618486, + "loss": 0.9364, + "step": 9364 + }, + { + "epoch": 1.6673789173789175, + "grad_norm": 0.6802223324775696, + "learning_rate": 0.00012599906327589007, + "loss": 0.8429, + "step": 9365 + }, + { + "epoch": 1.66755698005698, + "grad_norm": 0.6842933893203735, + "learning_rate": 0.00012598554690619764, + "loss": 1.1255, + "step": 9366 + }, + { + "epoch": 1.6677350427350426, + "grad_norm": 
0.6547088623046875, + "learning_rate": 0.0001259720300273724, + "loss": 0.983, + "step": 9367 + }, + { + "epoch": 1.6679131054131053, + "grad_norm": 0.620424211025238, + "learning_rate": 0.0001259585126396792, + "loss": 0.918, + "step": 9368 + }, + { + "epoch": 1.668091168091168, + "grad_norm": 0.5659816861152649, + "learning_rate": 0.00012594499474338287, + "loss": 0.7788, + "step": 9369 + }, + { + "epoch": 1.6682692307692308, + "grad_norm": 0.5904595255851746, + "learning_rate": 0.00012593147633874826, + "loss": 0.801, + "step": 9370 + }, + { + "epoch": 1.6684472934472936, + "grad_norm": 0.6444024443626404, + "learning_rate": 0.0001259179574260402, + "loss": 1.0997, + "step": 9371 + }, + { + "epoch": 1.6686253561253561, + "grad_norm": 0.6408827304840088, + "learning_rate": 0.00012590443800552365, + "loss": 0.9839, + "step": 9372 + }, + { + "epoch": 1.6688034188034186, + "grad_norm": 0.752391517162323, + "learning_rate": 0.00012589091807746345, + "loss": 1.0249, + "step": 9373 + }, + { + "epoch": 1.6689814814814814, + "grad_norm": 0.8256397247314453, + "learning_rate": 0.00012587739764212448, + "loss": 0.9541, + "step": 9374 + }, + { + "epoch": 1.6691595441595442, + "grad_norm": 0.7878768444061279, + "learning_rate": 0.00012586387669977166, + "loss": 1.0071, + "step": 9375 + }, + { + "epoch": 1.669337606837607, + "grad_norm": 0.6179735660552979, + "learning_rate": 0.0001258503552506699, + "loss": 0.8495, + "step": 9376 + }, + { + "epoch": 1.6695156695156697, + "grad_norm": 0.6699580550193787, + "learning_rate": 0.00012583683329508413, + "loss": 0.8999, + "step": 9377 + }, + { + "epoch": 1.6696937321937322, + "grad_norm": 0.6542006731033325, + "learning_rate": 0.00012582331083327929, + "loss": 1.0357, + "step": 9378 + }, + { + "epoch": 1.6698717948717947, + "grad_norm": 0.7275210618972778, + "learning_rate": 0.0001258097878655203, + "loss": 1.0259, + "step": 9379 + }, + { + "epoch": 1.6700498575498575, + "grad_norm": 0.6836326122283936, + "learning_rate": 
0.00012579626439207216, + "loss": 1.0428, + "step": 9380 + }, + { + "epoch": 1.6702279202279202, + "grad_norm": 0.760123610496521, + "learning_rate": 0.00012578274041319978, + "loss": 0.9716, + "step": 9381 + }, + { + "epoch": 1.670405982905983, + "grad_norm": 0.5525194406509399, + "learning_rate": 0.00012576921592916818, + "loss": 0.8253, + "step": 9382 + }, + { + "epoch": 1.6705840455840457, + "grad_norm": 0.6881270408630371, + "learning_rate": 0.00012575569094024232, + "loss": 1.0571, + "step": 9383 + }, + { + "epoch": 1.6707621082621082, + "grad_norm": 0.6776245832443237, + "learning_rate": 0.0001257421654466872, + "loss": 0.9119, + "step": 9384 + }, + { + "epoch": 1.6709401709401708, + "grad_norm": 0.7903014421463013, + "learning_rate": 0.0001257286394487678, + "loss": 1.0626, + "step": 9385 + }, + { + "epoch": 1.6711182336182335, + "grad_norm": 0.61158287525177, + "learning_rate": 0.0001257151129467492, + "loss": 0.9378, + "step": 9386 + }, + { + "epoch": 1.6712962962962963, + "grad_norm": 0.655189573764801, + "learning_rate": 0.00012570158594089637, + "loss": 0.9334, + "step": 9387 + }, + { + "epoch": 1.671474358974359, + "grad_norm": 0.6707320809364319, + "learning_rate": 0.0001256880584314743, + "loss": 1.1802, + "step": 9388 + }, + { + "epoch": 1.6716524216524218, + "grad_norm": 0.847341775894165, + "learning_rate": 0.00012567453041874814, + "loss": 1.1169, + "step": 9389 + }, + { + "epoch": 1.6718304843304843, + "grad_norm": 0.6136410236358643, + "learning_rate": 0.00012566100190298287, + "loss": 0.8959, + "step": 9390 + }, + { + "epoch": 1.672008547008547, + "grad_norm": 0.7203437089920044, + "learning_rate": 0.00012564747288444357, + "loss": 0.9803, + "step": 9391 + }, + { + "epoch": 1.6721866096866096, + "grad_norm": 0.7832576632499695, + "learning_rate": 0.00012563394336339534, + "loss": 0.8696, + "step": 9392 + }, + { + "epoch": 1.6723646723646723, + "grad_norm": 0.6940804719924927, + "learning_rate": 0.00012562041334010323, + "loss": 1.0571, + 
"step": 9393 + }, + { + "epoch": 1.672542735042735, + "grad_norm": 0.6042298674583435, + "learning_rate": 0.00012560688281483234, + "loss": 0.8835, + "step": 9394 + }, + { + "epoch": 1.6727207977207978, + "grad_norm": 0.7870675921440125, + "learning_rate": 0.00012559335178784776, + "loss": 1.1585, + "step": 9395 + }, + { + "epoch": 1.6728988603988604, + "grad_norm": 0.7448568940162659, + "learning_rate": 0.00012557982025941463, + "loss": 0.9699, + "step": 9396 + }, + { + "epoch": 1.6730769230769231, + "grad_norm": 0.7226544618606567, + "learning_rate": 0.00012556628822979807, + "loss": 0.7817, + "step": 9397 + }, + { + "epoch": 1.6732549857549857, + "grad_norm": 0.5652043223381042, + "learning_rate": 0.0001255527556992632, + "loss": 0.8077, + "step": 9398 + }, + { + "epoch": 1.6734330484330484, + "grad_norm": 0.6459930539131165, + "learning_rate": 0.00012553922266807517, + "loss": 1.22, + "step": 9399 + }, + { + "epoch": 1.6736111111111112, + "grad_norm": 0.7568991780281067, + "learning_rate": 0.00012552568913649912, + "loss": 1.1559, + "step": 9400 + }, + { + "epoch": 1.673789173789174, + "grad_norm": 0.7462680339813232, + "learning_rate": 0.0001255121551048002, + "loss": 1.1438, + "step": 9401 + }, + { + "epoch": 1.6739672364672364, + "grad_norm": 0.6653871536254883, + "learning_rate": 0.0001254986205732436, + "loss": 0.9468, + "step": 9402 + }, + { + "epoch": 1.6741452991452992, + "grad_norm": 0.6261825561523438, + "learning_rate": 0.0001254850855420945, + "loss": 0.8558, + "step": 9403 + }, + { + "epoch": 1.6743233618233617, + "grad_norm": 0.6442354321479797, + "learning_rate": 0.0001254715500116181, + "loss": 0.8605, + "step": 9404 + }, + { + "epoch": 1.6745014245014245, + "grad_norm": 0.7483665943145752, + "learning_rate": 0.00012545801398207958, + "loss": 0.9089, + "step": 9405 + }, + { + "epoch": 1.6746794871794872, + "grad_norm": 0.7319819927215576, + "learning_rate": 0.00012544447745374416, + "loss": 0.9937, + "step": 9406 + }, + { + "epoch": 
1.67485754985755, + "grad_norm": 0.703014075756073, + "learning_rate": 0.00012543094042687708, + "loss": 0.9597, + "step": 9407 + }, + { + "epoch": 1.6750356125356125, + "grad_norm": 0.6593887209892273, + "learning_rate": 0.00012541740290174353, + "loss": 0.844, + "step": 9408 + }, + { + "epoch": 1.6752136752136753, + "grad_norm": 0.6567463874816895, + "learning_rate": 0.00012540386487860879, + "loss": 1.0744, + "step": 9409 + }, + { + "epoch": 1.6753917378917378, + "grad_norm": 0.7784611582756042, + "learning_rate": 0.00012539032635773805, + "loss": 0.974, + "step": 9410 + }, + { + "epoch": 1.6755698005698005, + "grad_norm": 0.6760087609291077, + "learning_rate": 0.00012537678733939663, + "loss": 0.8948, + "step": 9411 + }, + { + "epoch": 1.6757478632478633, + "grad_norm": 0.825965940952301, + "learning_rate": 0.0001253632478238498, + "loss": 1.1196, + "step": 9412 + }, + { + "epoch": 1.675925925925926, + "grad_norm": 0.7215564250946045, + "learning_rate": 0.00012534970781136277, + "loss": 1.1774, + "step": 9413 + }, + { + "epoch": 1.6761039886039886, + "grad_norm": 0.6548578143119812, + "learning_rate": 0.00012533616730220094, + "loss": 0.8671, + "step": 9414 + }, + { + "epoch": 1.6762820512820513, + "grad_norm": 0.7257684469223022, + "learning_rate": 0.00012532262629662947, + "loss": 1.105, + "step": 9415 + }, + { + "epoch": 1.6764601139601139, + "grad_norm": 0.6695847511291504, + "learning_rate": 0.00012530908479491378, + "loss": 0.9189, + "step": 9416 + }, + { + "epoch": 1.6766381766381766, + "grad_norm": 0.684695303440094, + "learning_rate": 0.00012529554279731915, + "loss": 1.066, + "step": 9417 + }, + { + "epoch": 1.6768162393162394, + "grad_norm": 0.7107276320457458, + "learning_rate": 0.0001252820003041109, + "loss": 0.9311, + "step": 9418 + }, + { + "epoch": 1.676994301994302, + "grad_norm": 0.6755440831184387, + "learning_rate": 0.0001252684573155544, + "loss": 1.1036, + "step": 9419 + }, + { + "epoch": 1.6771723646723646, + "grad_norm": 
0.7571110725402832, + "learning_rate": 0.00012525491383191491, + "loss": 1.0244, + "step": 9420 + }, + { + "epoch": 1.6773504273504274, + "grad_norm": 0.6960614323616028, + "learning_rate": 0.0001252413698534579, + "loss": 0.9077, + "step": 9421 + }, + { + "epoch": 1.67752849002849, + "grad_norm": 0.6675550937652588, + "learning_rate": 0.00012522782538044867, + "loss": 1.0543, + "step": 9422 + }, + { + "epoch": 1.6777065527065527, + "grad_norm": 0.6637391448020935, + "learning_rate": 0.0001252142804131526, + "loss": 0.9471, + "step": 9423 + }, + { + "epoch": 1.6778846153846154, + "grad_norm": 0.6382880210876465, + "learning_rate": 0.00012520073495183508, + "loss": 0.9729, + "step": 9424 + }, + { + "epoch": 1.6780626780626782, + "grad_norm": 0.731922447681427, + "learning_rate": 0.0001251871889967615, + "loss": 1.0385, + "step": 9425 + }, + { + "epoch": 1.6782407407407407, + "grad_norm": 0.5868890285491943, + "learning_rate": 0.00012517364254819728, + "loss": 0.8466, + "step": 9426 + }, + { + "epoch": 1.6784188034188035, + "grad_norm": 0.8535677790641785, + "learning_rate": 0.00012516009560640786, + "loss": 1.1009, + "step": 9427 + }, + { + "epoch": 1.678596866096866, + "grad_norm": 0.7044199705123901, + "learning_rate": 0.0001251465481716586, + "loss": 1.0862, + "step": 9428 + }, + { + "epoch": 1.6787749287749287, + "grad_norm": 0.7207323312759399, + "learning_rate": 0.00012513300024421498, + "loss": 1.064, + "step": 9429 + }, + { + "epoch": 1.6789529914529915, + "grad_norm": 0.7739703059196472, + "learning_rate": 0.0001251194518243424, + "loss": 1.1738, + "step": 9430 + }, + { + "epoch": 1.6791310541310542, + "grad_norm": 0.6829344630241394, + "learning_rate": 0.00012510590291230637, + "loss": 1.0555, + "step": 9431 + }, + { + "epoch": 1.6793091168091168, + "grad_norm": 0.6760238409042358, + "learning_rate": 0.0001250923535083723, + "loss": 1.2177, + "step": 9432 + }, + { + "epoch": 1.6794871794871795, + "grad_norm": 0.6666911840438843, + "learning_rate": 
0.0001250788036128057, + "loss": 0.8957, + "step": 9433 + }, + { + "epoch": 1.679665242165242, + "grad_norm": 0.747797429561615, + "learning_rate": 0.00012506525322587207, + "loss": 0.9793, + "step": 9434 + }, + { + "epoch": 1.6798433048433048, + "grad_norm": 0.6261107325553894, + "learning_rate": 0.00012505170234783686, + "loss": 0.7781, + "step": 9435 + }, + { + "epoch": 1.6800213675213675, + "grad_norm": 0.7055163979530334, + "learning_rate": 0.00012503815097896555, + "loss": 1.0617, + "step": 9436 + }, + { + "epoch": 1.6801994301994303, + "grad_norm": 0.5567409992218018, + "learning_rate": 0.00012502459911952371, + "loss": 0.7911, + "step": 9437 + }, + { + "epoch": 1.6803774928774928, + "grad_norm": 0.7410423159599304, + "learning_rate": 0.0001250110467697768, + "loss": 1.1041, + "step": 9438 + }, + { + "epoch": 1.6805555555555556, + "grad_norm": 0.6185283064842224, + "learning_rate": 0.00012499749392999045, + "loss": 0.8101, + "step": 9439 + }, + { + "epoch": 1.680733618233618, + "grad_norm": 0.6988311409950256, + "learning_rate": 0.0001249839406004301, + "loss": 0.8579, + "step": 9440 + }, + { + "epoch": 1.6809116809116809, + "grad_norm": 0.5588746070861816, + "learning_rate": 0.00012497038678136132, + "loss": 0.8035, + "step": 9441 + }, + { + "epoch": 1.6810897435897436, + "grad_norm": 0.6568905711174011, + "learning_rate": 0.0001249568324730497, + "loss": 0.7455, + "step": 9442 + }, + { + "epoch": 1.6812678062678064, + "grad_norm": 0.6924821138381958, + "learning_rate": 0.00012494327767576078, + "loss": 1.134, + "step": 9443 + }, + { + "epoch": 1.681445868945869, + "grad_norm": 0.6940170526504517, + "learning_rate": 0.00012492972238976018, + "loss": 0.9719, + "step": 9444 + }, + { + "epoch": 1.6816239316239316, + "grad_norm": 0.667465090751648, + "learning_rate": 0.00012491616661531343, + "loss": 0.953, + "step": 9445 + }, + { + "epoch": 1.6818019943019942, + "grad_norm": 0.7693275809288025, + "learning_rate": 0.00012490261035268612, + "loss": 1.1342, + 
"step": 9446 + }, + { + "epoch": 1.681980056980057, + "grad_norm": 0.7243115305900574, + "learning_rate": 0.00012488905360214393, + "loss": 1.1847, + "step": 9447 + }, + { + "epoch": 1.6821581196581197, + "grad_norm": 0.657357931137085, + "learning_rate": 0.00012487549636395245, + "loss": 0.8747, + "step": 9448 + }, + { + "epoch": 1.6823361823361824, + "grad_norm": 0.7471592426300049, + "learning_rate": 0.00012486193863837727, + "loss": 1.0472, + "step": 9449 + }, + { + "epoch": 1.6825142450142452, + "grad_norm": 0.7476530075073242, + "learning_rate": 0.00012484838042568406, + "loss": 1.0708, + "step": 9450 + }, + { + "epoch": 1.6826923076923077, + "grad_norm": 0.6031121611595154, + "learning_rate": 0.00012483482172613846, + "loss": 0.8243, + "step": 9451 + }, + { + "epoch": 1.6828703703703702, + "grad_norm": 0.6733492016792297, + "learning_rate": 0.00012482126254000607, + "loss": 0.7808, + "step": 9452 + }, + { + "epoch": 1.683048433048433, + "grad_norm": 0.5865318179130554, + "learning_rate": 0.00012480770286755265, + "loss": 0.829, + "step": 9453 + }, + { + "epoch": 1.6832264957264957, + "grad_norm": 0.6805713772773743, + "learning_rate": 0.0001247941427090438, + "loss": 0.7206, + "step": 9454 + }, + { + "epoch": 1.6834045584045585, + "grad_norm": 0.6514836549758911, + "learning_rate": 0.0001247805820647453, + "loss": 0.9499, + "step": 9455 + }, + { + "epoch": 1.6835826210826212, + "grad_norm": 0.7432990074157715, + "learning_rate": 0.0001247670209349227, + "loss": 1.1324, + "step": 9456 + }, + { + "epoch": 1.6837606837606838, + "grad_norm": 0.6348414421081543, + "learning_rate": 0.00012475345931984178, + "loss": 0.8246, + "step": 9457 + }, + { + "epoch": 1.6839387464387463, + "grad_norm": 0.7194374203681946, + "learning_rate": 0.00012473989721976825, + "loss": 0.9634, + "step": 9458 + }, + { + "epoch": 1.684116809116809, + "grad_norm": 0.7869647741317749, + "learning_rate": 0.00012472633463496785, + "loss": 1.2115, + "step": 9459 + }, + { + "epoch": 
1.6842948717948718, + "grad_norm": 0.6672070026397705, + "learning_rate": 0.00012471277156570623, + "loss": 0.9842, + "step": 9460 + }, + { + "epoch": 1.6844729344729346, + "grad_norm": 0.6611466407775879, + "learning_rate": 0.00012469920801224925, + "loss": 0.9343, + "step": 9461 + }, + { + "epoch": 1.6846509971509973, + "grad_norm": 0.6715068221092224, + "learning_rate": 0.0001246856439748626, + "loss": 0.6852, + "step": 9462 + }, + { + "epoch": 1.6848290598290598, + "grad_norm": 0.641942024230957, + "learning_rate": 0.00012467207945381198, + "loss": 0.8863, + "step": 9463 + }, + { + "epoch": 1.6850071225071224, + "grad_norm": 0.8414762616157532, + "learning_rate": 0.00012465851444936325, + "loss": 1.3404, + "step": 9464 + }, + { + "epoch": 1.6851851851851851, + "grad_norm": 0.715752363204956, + "learning_rate": 0.00012464494896178216, + "loss": 1.123, + "step": 9465 + }, + { + "epoch": 1.6853632478632479, + "grad_norm": 0.6913973093032837, + "learning_rate": 0.00012463138299133447, + "loss": 1.0659, + "step": 9466 + }, + { + "epoch": 1.6855413105413106, + "grad_norm": 0.6998484134674072, + "learning_rate": 0.000124617816538286, + "loss": 1.0555, + "step": 9467 + }, + { + "epoch": 1.6857193732193734, + "grad_norm": 0.7313308119773865, + "learning_rate": 0.00012460424960290256, + "loss": 1.0915, + "step": 9468 + }, + { + "epoch": 1.685897435897436, + "grad_norm": 0.6790569424629211, + "learning_rate": 0.00012459068218544995, + "loss": 1.0214, + "step": 9469 + }, + { + "epoch": 1.6860754985754984, + "grad_norm": 0.6494466662406921, + "learning_rate": 0.00012457711428619402, + "loss": 0.9476, + "step": 9470 + }, + { + "epoch": 1.6862535612535612, + "grad_norm": 0.8048526048660278, + "learning_rate": 0.0001245635459054006, + "loss": 1.1852, + "step": 9471 + }, + { + "epoch": 1.686431623931624, + "grad_norm": 0.6237879395484924, + "learning_rate": 0.0001245499770433355, + "loss": 1.0106, + "step": 9472 + }, + { + "epoch": 1.6866096866096867, + "grad_norm": 
0.6282906532287598, + "learning_rate": 0.0001245364077002646, + "loss": 0.9858, + "step": 9473 + }, + { + "epoch": 1.6867877492877494, + "grad_norm": 0.7239370346069336, + "learning_rate": 0.00012452283787645375, + "loss": 0.9586, + "step": 9474 + }, + { + "epoch": 1.686965811965812, + "grad_norm": 0.6438776850700378, + "learning_rate": 0.00012450926757216887, + "loss": 0.9198, + "step": 9475 + }, + { + "epoch": 1.6871438746438745, + "grad_norm": 0.6451360583305359, + "learning_rate": 0.00012449569678767578, + "loss": 1.0183, + "step": 9476 + }, + { + "epoch": 1.6873219373219372, + "grad_norm": 0.6950216293334961, + "learning_rate": 0.0001244821255232404, + "loss": 0.9048, + "step": 9477 + }, + { + "epoch": 1.6875, + "grad_norm": 0.710489809513092, + "learning_rate": 0.00012446855377912865, + "loss": 1.1596, + "step": 9478 + }, + { + "epoch": 1.6876780626780628, + "grad_norm": 0.6819305419921875, + "learning_rate": 0.0001244549815556064, + "loss": 0.8486, + "step": 9479 + }, + { + "epoch": 1.6878561253561255, + "grad_norm": 0.7185879945755005, + "learning_rate": 0.00012444140885293958, + "loss": 0.9539, + "step": 9480 + }, + { + "epoch": 1.688034188034188, + "grad_norm": 0.8181464672088623, + "learning_rate": 0.00012442783567139415, + "loss": 1.0038, + "step": 9481 + }, + { + "epoch": 1.6882122507122506, + "grad_norm": 0.47161349654197693, + "learning_rate": 0.000124414262011236, + "loss": 0.67, + "step": 9482 + }, + { + "epoch": 1.6883903133903133, + "grad_norm": 0.7752482295036316, + "learning_rate": 0.00012440068787273112, + "loss": 0.9944, + "step": 9483 + }, + { + "epoch": 1.688568376068376, + "grad_norm": 0.7119397521018982, + "learning_rate": 0.00012438711325614543, + "loss": 0.9098, + "step": 9484 + }, + { + "epoch": 1.6887464387464388, + "grad_norm": 0.7161153554916382, + "learning_rate": 0.00012437353816174493, + "loss": 1.0003, + "step": 9485 + }, + { + "epoch": 1.6889245014245016, + "grad_norm": 0.5989507436752319, + "learning_rate": 
0.0001243599625897956, + "loss": 1.0301, + "step": 9486 + }, + { + "epoch": 1.689102564102564, + "grad_norm": 0.7906841039657593, + "learning_rate": 0.00012434638654056334, + "loss": 1.0388, + "step": 9487 + }, + { + "epoch": 1.6892806267806266, + "grad_norm": 0.6679551601409912, + "learning_rate": 0.00012433281001431428, + "loss": 0.9505, + "step": 9488 + }, + { + "epoch": 1.6894586894586894, + "grad_norm": 0.7090578675270081, + "learning_rate": 0.0001243192330113143, + "loss": 0.8616, + "step": 9489 + }, + { + "epoch": 1.6896367521367521, + "grad_norm": 0.6401308178901672, + "learning_rate": 0.00012430565553182949, + "loss": 0.9099, + "step": 9490 + }, + { + "epoch": 1.6898148148148149, + "grad_norm": 0.7360149621963501, + "learning_rate": 0.00012429207757612586, + "loss": 1.0233, + "step": 9491 + }, + { + "epoch": 1.6899928774928776, + "grad_norm": 0.6736137270927429, + "learning_rate": 0.00012427849914446946, + "loss": 0.9803, + "step": 9492 + }, + { + "epoch": 1.6901709401709402, + "grad_norm": 0.7728668451309204, + "learning_rate": 0.00012426492023712623, + "loss": 1.2316, + "step": 9493 + }, + { + "epoch": 1.6903490028490027, + "grad_norm": 0.789718508720398, + "learning_rate": 0.00012425134085436234, + "loss": 1.1218, + "step": 9494 + }, + { + "epoch": 1.6905270655270654, + "grad_norm": 0.7314121723175049, + "learning_rate": 0.0001242377609964438, + "loss": 1.1294, + "step": 9495 + }, + { + "epoch": 1.6907051282051282, + "grad_norm": 0.7222046256065369, + "learning_rate": 0.0001242241806636367, + "loss": 1.0288, + "step": 9496 + }, + { + "epoch": 1.690883190883191, + "grad_norm": 0.7546363472938538, + "learning_rate": 0.00012421059985620708, + "loss": 0.8781, + "step": 9497 + }, + { + "epoch": 1.6910612535612537, + "grad_norm": 0.7502550482749939, + "learning_rate": 0.00012419701857442104, + "loss": 0.927, + "step": 9498 + }, + { + "epoch": 1.6912393162393162, + "grad_norm": 0.6244059205055237, + "learning_rate": 0.00012418343681854473, + "loss": 0.9689, + 
"step": 9499 + }, + { + "epoch": 1.6914173789173788, + "grad_norm": 0.7214263677597046, + "learning_rate": 0.00012416985458884417, + "loss": 1.0842, + "step": 9500 + }, + { + "epoch": 1.6915954415954415, + "grad_norm": 0.6960242390632629, + "learning_rate": 0.00012415627188558555, + "loss": 0.9766, + "step": 9501 + }, + { + "epoch": 1.6917735042735043, + "grad_norm": 0.6687830686569214, + "learning_rate": 0.00012414268870903494, + "loss": 1.0222, + "step": 9502 + }, + { + "epoch": 1.691951566951567, + "grad_norm": 0.8611155152320862, + "learning_rate": 0.00012412910505945848, + "loss": 1.1792, + "step": 9503 + }, + { + "epoch": 1.6921296296296298, + "grad_norm": 0.6655587553977966, + "learning_rate": 0.00012411552093712235, + "loss": 0.8763, + "step": 9504 + }, + { + "epoch": 1.6923076923076923, + "grad_norm": 0.7829837799072266, + "learning_rate": 0.00012410193634229268, + "loss": 1.0803, + "step": 9505 + }, + { + "epoch": 1.6924857549857548, + "grad_norm": 0.7951042652130127, + "learning_rate": 0.00012408835127523566, + "loss": 1.0925, + "step": 9506 + }, + { + "epoch": 1.6926638176638176, + "grad_norm": 0.715495228767395, + "learning_rate": 0.0001240747657362174, + "loss": 1.2411, + "step": 9507 + }, + { + "epoch": 1.6928418803418803, + "grad_norm": 0.6779513359069824, + "learning_rate": 0.00012406117972550414, + "loss": 0.8886, + "step": 9508 + }, + { + "epoch": 1.693019943019943, + "grad_norm": 0.647588312625885, + "learning_rate": 0.00012404759324336203, + "loss": 1.107, + "step": 9509 + }, + { + "epoch": 1.6931980056980058, + "grad_norm": 0.7398989796638489, + "learning_rate": 0.00012403400629005726, + "loss": 1.0256, + "step": 9510 + }, + { + "epoch": 1.6933760683760684, + "grad_norm": 0.7572638392448425, + "learning_rate": 0.0001240204188658561, + "loss": 0.9662, + "step": 9511 + }, + { + "epoch": 1.693554131054131, + "grad_norm": 0.7044163346290588, + "learning_rate": 0.00012400683097102473, + "loss": 1.1388, + "step": 9512 + }, + { + "epoch": 
1.6937321937321936, + "grad_norm": 0.7889094948768616, + "learning_rate": 0.00012399324260582936, + "loss": 1.0453, + "step": 9513 + }, + { + "epoch": 1.6939102564102564, + "grad_norm": 0.7977854609489441, + "learning_rate": 0.00012397965377053627, + "loss": 1.015, + "step": 9514 + }, + { + "epoch": 1.6940883190883191, + "grad_norm": 0.6223814487457275, + "learning_rate": 0.00012396606446541165, + "loss": 0.7985, + "step": 9515 + }, + { + "epoch": 1.694266381766382, + "grad_norm": 0.8307462334632874, + "learning_rate": 0.0001239524746907218, + "loss": 0.8899, + "step": 9516 + }, + { + "epoch": 1.6944444444444444, + "grad_norm": 0.7780544757843018, + "learning_rate": 0.00012393888444673295, + "loss": 0.9406, + "step": 9517 + }, + { + "epoch": 1.6946225071225072, + "grad_norm": 0.6894499659538269, + "learning_rate": 0.0001239252937337114, + "loss": 0.9412, + "step": 9518 + }, + { + "epoch": 1.6948005698005697, + "grad_norm": 0.7000680565834045, + "learning_rate": 0.00012391170255192342, + "loss": 1.0314, + "step": 9519 + }, + { + "epoch": 1.6949786324786325, + "grad_norm": 0.6772416830062866, + "learning_rate": 0.0001238981109016353, + "loss": 0.9153, + "step": 9520 + }, + { + "epoch": 1.6951566951566952, + "grad_norm": 0.7069609761238098, + "learning_rate": 0.00012388451878311333, + "loss": 1.1777, + "step": 9521 + }, + { + "epoch": 1.695334757834758, + "grad_norm": 0.6138432621955872, + "learning_rate": 0.00012387092619662386, + "loss": 0.8085, + "step": 9522 + }, + { + "epoch": 1.6955128205128205, + "grad_norm": 0.6122859716415405, + "learning_rate": 0.00012385733314243313, + "loss": 0.8534, + "step": 9523 + }, + { + "epoch": 1.6956908831908832, + "grad_norm": 0.7499903440475464, + "learning_rate": 0.00012384373962080755, + "loss": 0.9329, + "step": 9524 + }, + { + "epoch": 1.6958689458689458, + "grad_norm": 0.6413441896438599, + "learning_rate": 0.00012383014563201343, + "loss": 0.9609, + "step": 9525 + }, + { + "epoch": 1.6960470085470085, + "grad_norm": 
0.7467969059944153, + "learning_rate": 0.0001238165511763171, + "loss": 0.9142, + "step": 9526 + }, + { + "epoch": 1.6962250712250713, + "grad_norm": 0.6540884375572205, + "learning_rate": 0.00012380295625398494, + "loss": 0.9503, + "step": 9527 + }, + { + "epoch": 1.696403133903134, + "grad_norm": 0.6298567652702332, + "learning_rate": 0.00012378936086528326, + "loss": 0.8853, + "step": 9528 + }, + { + "epoch": 1.6965811965811965, + "grad_norm": 0.8003417253494263, + "learning_rate": 0.00012377576501047845, + "loss": 0.969, + "step": 9529 + }, + { + "epoch": 1.6967592592592593, + "grad_norm": 0.8318493962287903, + "learning_rate": 0.00012376216868983697, + "loss": 1.1413, + "step": 9530 + }, + { + "epoch": 1.6969373219373218, + "grad_norm": 0.8294426202774048, + "learning_rate": 0.00012374857190362515, + "loss": 1.1885, + "step": 9531 + }, + { + "epoch": 1.6971153846153846, + "grad_norm": 0.7502955198287964, + "learning_rate": 0.0001237349746521094, + "loss": 1.233, + "step": 9532 + }, + { + "epoch": 1.6972934472934473, + "grad_norm": 0.6306588649749756, + "learning_rate": 0.00012372137693555612, + "loss": 1.2255, + "step": 9533 + }, + { + "epoch": 1.69747150997151, + "grad_norm": 0.7802746891975403, + "learning_rate": 0.0001237077787542317, + "loss": 1.2054, + "step": 9534 + }, + { + "epoch": 1.6976495726495726, + "grad_norm": 0.685114860534668, + "learning_rate": 0.00012369418010840265, + "loss": 0.9865, + "step": 9535 + }, + { + "epoch": 1.6978276353276354, + "grad_norm": 0.6656857132911682, + "learning_rate": 0.00012368058099833536, + "loss": 1.1579, + "step": 9536 + }, + { + "epoch": 1.698005698005698, + "grad_norm": 0.6596674919128418, + "learning_rate": 0.00012366698142429625, + "loss": 0.9104, + "step": 9537 + }, + { + "epoch": 1.6981837606837606, + "grad_norm": 0.6025584936141968, + "learning_rate": 0.00012365338138655183, + "loss": 1.117, + "step": 9538 + }, + { + "epoch": 1.6983618233618234, + "grad_norm": 0.671585202217102, + "learning_rate": 
0.0001236397808853685, + "loss": 1.0271, + "step": 9539 + }, + { + "epoch": 1.6985398860398861, + "grad_norm": 0.7467984557151794, + "learning_rate": 0.0001236261799210128, + "loss": 1.0411, + "step": 9540 + }, + { + "epoch": 1.6987179487179487, + "grad_norm": 0.6251640915870667, + "learning_rate": 0.0001236125784937512, + "loss": 0.7154, + "step": 9541 + }, + { + "epoch": 1.6988960113960114, + "grad_norm": 0.7560956478118896, + "learning_rate": 0.00012359897660385016, + "loss": 1.0048, + "step": 9542 + }, + { + "epoch": 1.699074074074074, + "grad_norm": 0.6144903302192688, + "learning_rate": 0.00012358537425157618, + "loss": 1.1294, + "step": 9543 + }, + { + "epoch": 1.6992521367521367, + "grad_norm": 0.7839425206184387, + "learning_rate": 0.00012357177143719578, + "loss": 1.0725, + "step": 9544 + }, + { + "epoch": 1.6994301994301995, + "grad_norm": 0.6488651037216187, + "learning_rate": 0.00012355816816097553, + "loss": 0.9267, + "step": 9545 + }, + { + "epoch": 1.6996082621082622, + "grad_norm": 0.6848782896995544, + "learning_rate": 0.00012354456442318187, + "loss": 1.0426, + "step": 9546 + }, + { + "epoch": 1.6997863247863247, + "grad_norm": 0.7164611220359802, + "learning_rate": 0.0001235309602240814, + "loss": 0.8208, + "step": 9547 + }, + { + "epoch": 1.6999643874643875, + "grad_norm": 0.6725530624389648, + "learning_rate": 0.0001235173555639406, + "loss": 0.9366, + "step": 9548 + }, + { + "epoch": 1.70014245014245, + "grad_norm": 0.6958004236221313, + "learning_rate": 0.00012350375044302612, + "loss": 1.0185, + "step": 9549 + }, + { + "epoch": 1.7003205128205128, + "grad_norm": 0.8035947680473328, + "learning_rate": 0.00012349014486160445, + "loss": 1.065, + "step": 9550 + }, + { + "epoch": 1.7004985754985755, + "grad_norm": 0.6705633997917175, + "learning_rate": 0.00012347653881994222, + "loss": 0.8381, + "step": 9551 + }, + { + "epoch": 1.7006766381766383, + "grad_norm": 0.6652300357818604, + "learning_rate": 0.00012346293231830596, + "loss": 1.1428, + 
"step": 9552 + }, + { + "epoch": 1.7008547008547008, + "grad_norm": 0.6719335913658142, + "learning_rate": 0.0001234493253569623, + "loss": 1.0138, + "step": 9553 + }, + { + "epoch": 1.7010327635327636, + "grad_norm": 0.746981680393219, + "learning_rate": 0.0001234357179361778, + "loss": 1.1169, + "step": 9554 + }, + { + "epoch": 1.701210826210826, + "grad_norm": 0.6768170595169067, + "learning_rate": 0.0001234221100562191, + "loss": 0.9065, + "step": 9555 + }, + { + "epoch": 1.7013888888888888, + "grad_norm": 0.7127171754837036, + "learning_rate": 0.00012340850171735278, + "loss": 0.9467, + "step": 9556 + }, + { + "epoch": 1.7015669515669516, + "grad_norm": 0.6802694797515869, + "learning_rate": 0.00012339489291984554, + "loss": 0.8938, + "step": 9557 + }, + { + "epoch": 1.7017450142450143, + "grad_norm": 0.7101455926895142, + "learning_rate": 0.00012338128366396394, + "loss": 1.1939, + "step": 9558 + }, + { + "epoch": 1.7019230769230769, + "grad_norm": 0.621223509311676, + "learning_rate": 0.00012336767394997467, + "loss": 0.7583, + "step": 9559 + }, + { + "epoch": 1.7021011396011396, + "grad_norm": 0.7130763530731201, + "learning_rate": 0.00012335406377814439, + "loss": 0.8684, + "step": 9560 + }, + { + "epoch": 1.7022792022792022, + "grad_norm": 0.6761086583137512, + "learning_rate": 0.00012334045314873972, + "loss": 1.0197, + "step": 9561 + }, + { + "epoch": 1.702457264957265, + "grad_norm": 0.7030459642410278, + "learning_rate": 0.00012332684206202736, + "loss": 0.8627, + "step": 9562 + }, + { + "epoch": 1.7026353276353277, + "grad_norm": 0.6278037428855896, + "learning_rate": 0.000123313230518274, + "loss": 0.8953, + "step": 9563 + }, + { + "epoch": 1.7028133903133904, + "grad_norm": 0.6450623869895935, + "learning_rate": 0.00012329961851774627, + "loss": 0.8826, + "step": 9564 + }, + { + "epoch": 1.702991452991453, + "grad_norm": 0.7324244976043701, + "learning_rate": 0.00012328600606071097, + "loss": 1.0133, + "step": 9565 + }, + { + "epoch": 
1.7031695156695157, + "grad_norm": 0.6560033559799194, + "learning_rate": 0.00012327239314743473, + "loss": 0.9601, + "step": 9566 + }, + { + "epoch": 1.7033475783475782, + "grad_norm": 0.6693514585494995, + "learning_rate": 0.0001232587797781843, + "loss": 0.9447, + "step": 9567 + }, + { + "epoch": 1.703525641025641, + "grad_norm": 0.6403199434280396, + "learning_rate": 0.00012324516595322638, + "loss": 0.8554, + "step": 9568 + }, + { + "epoch": 1.7037037037037037, + "grad_norm": 0.8290280103683472, + "learning_rate": 0.00012323155167282774, + "loss": 1.1877, + "step": 9569 + }, + { + "epoch": 1.7038817663817665, + "grad_norm": 0.7207778692245483, + "learning_rate": 0.00012321793693725509, + "loss": 1.0978, + "step": 9570 + }, + { + "epoch": 1.7040598290598292, + "grad_norm": 0.8794265985488892, + "learning_rate": 0.00012320432174677519, + "loss": 0.9387, + "step": 9571 + }, + { + "epoch": 1.7042378917378918, + "grad_norm": 0.6683359146118164, + "learning_rate": 0.00012319070610165484, + "loss": 0.9227, + "step": 9572 + }, + { + "epoch": 1.7044159544159543, + "grad_norm": 0.7342001795768738, + "learning_rate": 0.00012317709000216076, + "loss": 0.9453, + "step": 9573 + }, + { + "epoch": 1.704594017094017, + "grad_norm": 0.6315770149230957, + "learning_rate": 0.00012316347344855973, + "loss": 0.8263, + "step": 9574 + }, + { + "epoch": 1.7047720797720798, + "grad_norm": 0.7697155475616455, + "learning_rate": 0.00012314985644111857, + "loss": 1.0238, + "step": 9575 + }, + { + "epoch": 1.7049501424501425, + "grad_norm": 0.6674068570137024, + "learning_rate": 0.00012313623898010408, + "loss": 1.0823, + "step": 9576 + }, + { + "epoch": 1.7051282051282053, + "grad_norm": 0.6995484232902527, + "learning_rate": 0.00012312262106578304, + "loss": 1.2001, + "step": 9577 + }, + { + "epoch": 1.7053062678062678, + "grad_norm": 0.7639257907867432, + "learning_rate": 0.00012310900269842226, + "loss": 1.3438, + "step": 9578 + }, + { + "epoch": 1.7054843304843303, + "grad_norm": 
0.6486390233039856, + "learning_rate": 0.00012309538387828857, + "loss": 0.9924, + "step": 9579 + }, + { + "epoch": 1.705662393162393, + "grad_norm": 0.6737813949584961, + "learning_rate": 0.00012308176460564885, + "loss": 0.8722, + "step": 9580 + }, + { + "epoch": 1.7058404558404558, + "grad_norm": 0.6462090611457825, + "learning_rate": 0.00012306814488076987, + "loss": 1.1013, + "step": 9581 + }, + { + "epoch": 1.7060185185185186, + "grad_norm": 0.7887832522392273, + "learning_rate": 0.00012305452470391852, + "loss": 0.9998, + "step": 9582 + }, + { + "epoch": 1.7061965811965814, + "grad_norm": 0.6345070004463196, + "learning_rate": 0.00012304090407536165, + "loss": 1.0305, + "step": 9583 + }, + { + "epoch": 1.7063746438746439, + "grad_norm": 0.6398460268974304, + "learning_rate": 0.0001230272829953661, + "loss": 1.2243, + "step": 9584 + }, + { + "epoch": 1.7065527065527064, + "grad_norm": 0.6501944065093994, + "learning_rate": 0.00012301366146419879, + "loss": 0.9425, + "step": 9585 + }, + { + "epoch": 1.7067307692307692, + "grad_norm": 0.6406761407852173, + "learning_rate": 0.00012300003948212661, + "loss": 0.948, + "step": 9586 + }, + { + "epoch": 1.706908831908832, + "grad_norm": 0.7114266157150269, + "learning_rate": 0.00012298641704941644, + "loss": 1.1291, + "step": 9587 + }, + { + "epoch": 1.7070868945868947, + "grad_norm": 0.6653099656105042, + "learning_rate": 0.00012297279416633515, + "loss": 1.0156, + "step": 9588 + }, + { + "epoch": 1.7072649572649574, + "grad_norm": 0.5970917344093323, + "learning_rate": 0.0001229591708331497, + "loss": 0.9424, + "step": 9589 + }, + { + "epoch": 1.70744301994302, + "grad_norm": 0.6861461400985718, + "learning_rate": 0.00012294554705012694, + "loss": 0.7581, + "step": 9590 + }, + { + "epoch": 1.7076210826210825, + "grad_norm": 0.6930568218231201, + "learning_rate": 0.00012293192281753393, + "loss": 1.0544, + "step": 9591 + }, + { + "epoch": 1.7077991452991452, + "grad_norm": 0.7420656085014343, + "learning_rate": 
0.00012291829813563748, + "loss": 0.7092, + "step": 9592 + }, + { + "epoch": 1.707977207977208, + "grad_norm": 0.6607801914215088, + "learning_rate": 0.0001229046730047046, + "loss": 0.5544, + "step": 9593 + }, + { + "epoch": 1.7081552706552707, + "grad_norm": 0.8419139385223389, + "learning_rate": 0.00012289104742500224, + "loss": 1.0443, + "step": 9594 + }, + { + "epoch": 1.7083333333333335, + "grad_norm": 0.6774617433547974, + "learning_rate": 0.00012287742139679734, + "loss": 1.0098, + "step": 9595 + }, + { + "epoch": 1.708511396011396, + "grad_norm": 0.7517698407173157, + "learning_rate": 0.0001228637949203569, + "loss": 1.1145, + "step": 9596 + }, + { + "epoch": 1.7086894586894585, + "grad_norm": 0.6048635840415955, + "learning_rate": 0.00012285016799594791, + "loss": 0.7398, + "step": 9597 + }, + { + "epoch": 1.7088675213675213, + "grad_norm": 0.8054425716400146, + "learning_rate": 0.00012283654062383734, + "loss": 1.0893, + "step": 9598 + }, + { + "epoch": 1.709045584045584, + "grad_norm": 0.8694897294044495, + "learning_rate": 0.0001228229128042922, + "loss": 1.2366, + "step": 9599 + }, + { + "epoch": 1.7092236467236468, + "grad_norm": 0.7460638880729675, + "learning_rate": 0.00012280928453757946, + "loss": 1.1753, + "step": 9600 + }, + { + "epoch": 1.7094017094017095, + "grad_norm": 0.6714958548545837, + "learning_rate": 0.00012279565582396618, + "loss": 1.0473, + "step": 9601 + }, + { + "epoch": 1.709579772079772, + "grad_norm": 0.6893340945243835, + "learning_rate": 0.00012278202666371937, + "loss": 1.2761, + "step": 9602 + }, + { + "epoch": 1.7097578347578346, + "grad_norm": 0.6816153526306152, + "learning_rate": 0.00012276839705710612, + "loss": 0.991, + "step": 9603 + }, + { + "epoch": 1.7099358974358974, + "grad_norm": 0.6961633563041687, + "learning_rate": 0.0001227547670043934, + "loss": 1.0634, + "step": 9604 + }, + { + "epoch": 1.71011396011396, + "grad_norm": 0.643734872341156, + "learning_rate": 0.0001227411365058483, + "loss": 0.8672, + 
"step": 9605 + }, + { + "epoch": 1.7102920227920229, + "grad_norm": 0.7313315272331238, + "learning_rate": 0.00012272750556173784, + "loss": 1.1152, + "step": 9606 + }, + { + "epoch": 1.7104700854700856, + "grad_norm": 0.6464954614639282, + "learning_rate": 0.00012271387417232916, + "loss": 0.8798, + "step": 9607 + }, + { + "epoch": 1.7106481481481481, + "grad_norm": 0.8365204334259033, + "learning_rate": 0.00012270024233788929, + "loss": 1.213, + "step": 9608 + }, + { + "epoch": 1.7108262108262107, + "grad_norm": 0.6460705995559692, + "learning_rate": 0.0001226866100586853, + "loss": 0.9232, + "step": 9609 + }, + { + "epoch": 1.7110042735042734, + "grad_norm": 0.6446022987365723, + "learning_rate": 0.00012267297733498434, + "loss": 0.8295, + "step": 9610 + }, + { + "epoch": 1.7111823361823362, + "grad_norm": 0.7692012190818787, + "learning_rate": 0.00012265934416705345, + "loss": 1.0715, + "step": 9611 + }, + { + "epoch": 1.711360398860399, + "grad_norm": 0.671154260635376, + "learning_rate": 0.0001226457105551598, + "loss": 0.9752, + "step": 9612 + }, + { + "epoch": 1.7115384615384617, + "grad_norm": 0.6525935530662537, + "learning_rate": 0.00012263207649957053, + "loss": 1.09, + "step": 9613 + }, + { + "epoch": 1.7117165242165242, + "grad_norm": 0.6984749436378479, + "learning_rate": 0.0001226184420005527, + "loss": 0.9956, + "step": 9614 + }, + { + "epoch": 1.7118945868945867, + "grad_norm": 0.6769809126853943, + "learning_rate": 0.0001226048070583735, + "loss": 1.0151, + "step": 9615 + }, + { + "epoch": 1.7120726495726495, + "grad_norm": 0.6085978746414185, + "learning_rate": 0.00012259117167330005, + "loss": 0.8706, + "step": 9616 + }, + { + "epoch": 1.7122507122507122, + "grad_norm": 0.7335749268531799, + "learning_rate": 0.00012257753584559952, + "loss": 1.0575, + "step": 9617 + }, + { + "epoch": 1.712428774928775, + "grad_norm": 0.7392038106918335, + "learning_rate": 0.0001225638995755391, + "loss": 0.8763, + "step": 9618 + }, + { + "epoch": 
1.7126068376068377, + "grad_norm": 0.6708608865737915, + "learning_rate": 0.00012255026286338592, + "loss": 1.131, + "step": 9619 + }, + { + "epoch": 1.7127849002849003, + "grad_norm": 0.726657509803772, + "learning_rate": 0.0001225366257094072, + "loss": 1.0569, + "step": 9620 + }, + { + "epoch": 1.7129629629629628, + "grad_norm": 0.749098002910614, + "learning_rate": 0.0001225229881138701, + "loss": 0.9196, + "step": 9621 + }, + { + "epoch": 1.7131410256410255, + "grad_norm": 0.6550580263137817, + "learning_rate": 0.00012250935007704182, + "loss": 1.0244, + "step": 9622 + }, + { + "epoch": 1.7133190883190883, + "grad_norm": 0.7714282274246216, + "learning_rate": 0.00012249571159918962, + "loss": 1.1025, + "step": 9623 + }, + { + "epoch": 1.713497150997151, + "grad_norm": 0.7869850397109985, + "learning_rate": 0.00012248207268058064, + "loss": 0.9238, + "step": 9624 + }, + { + "epoch": 1.7136752136752138, + "grad_norm": 0.7187856435775757, + "learning_rate": 0.00012246843332148216, + "loss": 1.081, + "step": 9625 + }, + { + "epoch": 1.7138532763532763, + "grad_norm": 0.6634210348129272, + "learning_rate": 0.00012245479352216142, + "loss": 1.1944, + "step": 9626 + }, + { + "epoch": 1.714031339031339, + "grad_norm": 0.6609212160110474, + "learning_rate": 0.00012244115328288567, + "loss": 0.9613, + "step": 9627 + }, + { + "epoch": 1.7142094017094016, + "grad_norm": 0.7906867861747742, + "learning_rate": 0.0001224275126039221, + "loss": 1.2692, + "step": 9628 + }, + { + "epoch": 1.7143874643874644, + "grad_norm": 0.8037096858024597, + "learning_rate": 0.000122413871485538, + "loss": 0.9823, + "step": 9629 + }, + { + "epoch": 1.7145655270655271, + "grad_norm": 0.7740145921707153, + "learning_rate": 0.00012240022992800068, + "loss": 1.1937, + "step": 9630 + }, + { + "epoch": 1.7147435897435899, + "grad_norm": 0.595372200012207, + "learning_rate": 0.00012238658793157738, + "loss": 0.9153, + "step": 9631 + }, + { + "epoch": 1.7149216524216524, + "grad_norm": 
0.6671900749206543, + "learning_rate": 0.0001223729454965354, + "loss": 1.0895, + "step": 9632 + }, + { + "epoch": 1.7150997150997151, + "grad_norm": 0.5805774927139282, + "learning_rate": 0.000122359302623142, + "loss": 1.0001, + "step": 9633 + }, + { + "epoch": 1.7152777777777777, + "grad_norm": 0.8851602673530579, + "learning_rate": 0.00012234565931166456, + "loss": 1.2828, + "step": 9634 + }, + { + "epoch": 1.7154558404558404, + "grad_norm": 0.6960011720657349, + "learning_rate": 0.0001223320155623703, + "loss": 1.0622, + "step": 9635 + }, + { + "epoch": 1.7156339031339032, + "grad_norm": 0.5587009191513062, + "learning_rate": 0.0001223183713755266, + "loss": 0.83, + "step": 9636 + }, + { + "epoch": 1.715811965811966, + "grad_norm": 0.6892730593681335, + "learning_rate": 0.00012230472675140076, + "loss": 0.9214, + "step": 9637 + }, + { + "epoch": 1.7159900284900285, + "grad_norm": 0.6545090079307556, + "learning_rate": 0.00012229108169026017, + "loss": 0.829, + "step": 9638 + }, + { + "epoch": 1.7161680911680912, + "grad_norm": 0.6539101600646973, + "learning_rate": 0.00012227743619237213, + "loss": 1.0686, + "step": 9639 + }, + { + "epoch": 1.7163461538461537, + "grad_norm": 0.5887274146080017, + "learning_rate": 0.000122263790258004, + "loss": 0.9285, + "step": 9640 + }, + { + "epoch": 1.7165242165242165, + "grad_norm": 0.6328918933868408, + "learning_rate": 0.00012225014388742313, + "loss": 0.9684, + "step": 9641 + }, + { + "epoch": 1.7167022792022792, + "grad_norm": 0.6377436518669128, + "learning_rate": 0.00012223649708089694, + "loss": 0.9425, + "step": 9642 + }, + { + "epoch": 1.716880341880342, + "grad_norm": 0.6967392563819885, + "learning_rate": 0.00012222284983869275, + "loss": 0.9342, + "step": 9643 + }, + { + "epoch": 1.7170584045584045, + "grad_norm": 0.7051317691802979, + "learning_rate": 0.00012220920216107802, + "loss": 1.1843, + "step": 9644 + }, + { + "epoch": 1.7172364672364673, + "grad_norm": 0.6864503622055054, + "learning_rate": 
0.00012219555404832007, + "loss": 1.0371, + "step": 9645 + }, + { + "epoch": 1.7174145299145298, + "grad_norm": 0.583454430103302, + "learning_rate": 0.00012218190550068638, + "loss": 0.6774, + "step": 9646 + }, + { + "epoch": 1.7175925925925926, + "grad_norm": 0.6755677461624146, + "learning_rate": 0.0001221682565184443, + "loss": 0.9517, + "step": 9647 + }, + { + "epoch": 1.7177706552706553, + "grad_norm": 0.7230031490325928, + "learning_rate": 0.0001221546071018613, + "loss": 1.0385, + "step": 9648 + }, + { + "epoch": 1.717948717948718, + "grad_norm": 0.7381200194358826, + "learning_rate": 0.0001221409572512048, + "loss": 0.9893, + "step": 9649 + }, + { + "epoch": 1.7181267806267806, + "grad_norm": 0.7079094648361206, + "learning_rate": 0.0001221273069667422, + "loss": 0.7793, + "step": 9650 + }, + { + "epoch": 1.7183048433048433, + "grad_norm": 0.6666881442070007, + "learning_rate": 0.00012211365624874106, + "loss": 0.9752, + "step": 9651 + }, + { + "epoch": 1.7184829059829059, + "grad_norm": 0.6196922659873962, + "learning_rate": 0.00012210000509746868, + "loss": 0.922, + "step": 9652 + }, + { + "epoch": 1.7186609686609686, + "grad_norm": 0.657879650592804, + "learning_rate": 0.00012208635351319266, + "loss": 1.2583, + "step": 9653 + }, + { + "epoch": 1.7188390313390314, + "grad_norm": 0.7240459322929382, + "learning_rate": 0.00012207270149618043, + "loss": 0.8479, + "step": 9654 + }, + { + "epoch": 1.7190170940170941, + "grad_norm": 0.8293825387954712, + "learning_rate": 0.00012205904904669945, + "loss": 0.9092, + "step": 9655 + }, + { + "epoch": 1.7191951566951567, + "grad_norm": 0.6907553672790527, + "learning_rate": 0.0001220453961650172, + "loss": 1.0543, + "step": 9656 + }, + { + "epoch": 1.7193732193732194, + "grad_norm": 0.7178300023078918, + "learning_rate": 0.00012203174285140124, + "loss": 0.9147, + "step": 9657 + }, + { + "epoch": 1.719551282051282, + "grad_norm": 0.7037166357040405, + "learning_rate": 0.00012201808910611905, + "loss": 0.8685, + 
"step": 9658 + }, + { + "epoch": 1.7197293447293447, + "grad_norm": 0.5850751996040344, + "learning_rate": 0.00012200443492943813, + "loss": 0.72, + "step": 9659 + }, + { + "epoch": 1.7199074074074074, + "grad_norm": 0.744239330291748, + "learning_rate": 0.00012199078032162603, + "loss": 0.9717, + "step": 9660 + }, + { + "epoch": 1.7200854700854702, + "grad_norm": 0.6509126424789429, + "learning_rate": 0.00012197712528295025, + "loss": 0.9768, + "step": 9661 + }, + { + "epoch": 1.7202635327635327, + "grad_norm": 0.623220682144165, + "learning_rate": 0.00012196346981367837, + "loss": 0.9824, + "step": 9662 + }, + { + "epoch": 1.7204415954415955, + "grad_norm": 0.6376451849937439, + "learning_rate": 0.00012194981391407792, + "loss": 0.8228, + "step": 9663 + }, + { + "epoch": 1.720619658119658, + "grad_norm": 0.794830322265625, + "learning_rate": 0.00012193615758441648, + "loss": 0.9168, + "step": 9664 + }, + { + "epoch": 1.7207977207977208, + "grad_norm": 0.7812975645065308, + "learning_rate": 0.0001219225008249616, + "loss": 0.8625, + "step": 9665 + }, + { + "epoch": 1.7209757834757835, + "grad_norm": 0.6843218207359314, + "learning_rate": 0.0001219088436359808, + "loss": 1.0176, + "step": 9666 + }, + { + "epoch": 1.7211538461538463, + "grad_norm": 0.6924905180931091, + "learning_rate": 0.00012189518601774178, + "loss": 0.855, + "step": 9667 + }, + { + "epoch": 1.7213319088319088, + "grad_norm": 0.6348826289176941, + "learning_rate": 0.00012188152797051202, + "loss": 1.1596, + "step": 9668 + }, + { + "epoch": 1.7215099715099715, + "grad_norm": 0.7170482873916626, + "learning_rate": 0.00012186786949455922, + "loss": 0.9811, + "step": 9669 + }, + { + "epoch": 1.721688034188034, + "grad_norm": 0.7471763491630554, + "learning_rate": 0.00012185421059015094, + "loss": 1.0925, + "step": 9670 + }, + { + "epoch": 1.7218660968660968, + "grad_norm": 0.6771119236946106, + "learning_rate": 0.00012184055125755481, + "loss": 0.9403, + "step": 9671 + }, + { + "epoch": 
1.7220441595441596, + "grad_norm": 0.4335343539714813, + "learning_rate": 0.0001218268914970384, + "loss": 0.4925, + "step": 9672 + }, + { + "epoch": 1.7222222222222223, + "grad_norm": 0.6652585864067078, + "learning_rate": 0.00012181323130886943, + "loss": 0.7684, + "step": 9673 + }, + { + "epoch": 1.7224002849002849, + "grad_norm": 0.6465467810630798, + "learning_rate": 0.00012179957069331548, + "loss": 0.9011, + "step": 9674 + }, + { + "epoch": 1.7225783475783476, + "grad_norm": 0.6725688576698303, + "learning_rate": 0.00012178590965064427, + "loss": 0.9563, + "step": 9675 + }, + { + "epoch": 1.7227564102564101, + "grad_norm": 0.6223418712615967, + "learning_rate": 0.00012177224818112341, + "loss": 0.9099, + "step": 9676 + }, + { + "epoch": 1.7229344729344729, + "grad_norm": 0.79325270652771, + "learning_rate": 0.00012175858628502053, + "loss": 1.0318, + "step": 9677 + }, + { + "epoch": 1.7231125356125356, + "grad_norm": 0.6735602617263794, + "learning_rate": 0.0001217449239626034, + "loss": 1.0797, + "step": 9678 + }, + { + "epoch": 1.7232905982905984, + "grad_norm": 0.7082492113113403, + "learning_rate": 0.00012173126121413962, + "loss": 1.1341, + "step": 9679 + }, + { + "epoch": 1.723468660968661, + "grad_norm": 0.6563859581947327, + "learning_rate": 0.00012171759803989696, + "loss": 0.8778, + "step": 9680 + }, + { + "epoch": 1.7236467236467237, + "grad_norm": 0.6867792010307312, + "learning_rate": 0.00012170393444014306, + "loss": 0.8301, + "step": 9681 + }, + { + "epoch": 1.7238247863247862, + "grad_norm": 0.7870511412620544, + "learning_rate": 0.00012169027041514562, + "loss": 0.9165, + "step": 9682 + }, + { + "epoch": 1.724002849002849, + "grad_norm": 0.8006493449211121, + "learning_rate": 0.00012167660596517241, + "loss": 1.0395, + "step": 9683 + }, + { + "epoch": 1.7241809116809117, + "grad_norm": 0.6936125159263611, + "learning_rate": 0.00012166294109049114, + "loss": 1.1037, + "step": 9684 + }, + { + "epoch": 1.7243589743589745, + "grad_norm": 
0.8176514506340027, + "learning_rate": 0.00012164927579136956, + "loss": 0.8791, + "step": 9685 + }, + { + "epoch": 1.7245370370370372, + "grad_norm": 0.6948300004005432, + "learning_rate": 0.00012163561006807537, + "loss": 0.9292, + "step": 9686 + }, + { + "epoch": 1.7247150997150997, + "grad_norm": 0.6237453818321228, + "learning_rate": 0.00012162194392087634, + "loss": 0.8553, + "step": 9687 + }, + { + "epoch": 1.7248931623931623, + "grad_norm": 0.6198007464408875, + "learning_rate": 0.00012160827735004021, + "loss": 0.9599, + "step": 9688 + }, + { + "epoch": 1.725071225071225, + "grad_norm": 0.639838695526123, + "learning_rate": 0.00012159461035583482, + "loss": 0.9328, + "step": 9689 + }, + { + "epoch": 1.7252492877492878, + "grad_norm": 0.7264436483383179, + "learning_rate": 0.00012158094293852789, + "loss": 1.0247, + "step": 9690 + }, + { + "epoch": 1.7254273504273505, + "grad_norm": 0.6320534348487854, + "learning_rate": 0.00012156727509838721, + "loss": 1.1222, + "step": 9691 + }, + { + "epoch": 1.7256054131054133, + "grad_norm": 0.6204122304916382, + "learning_rate": 0.00012155360683568056, + "loss": 0.9765, + "step": 9692 + }, + { + "epoch": 1.7257834757834758, + "grad_norm": 0.7026457190513611, + "learning_rate": 0.00012153993815067579, + "loss": 1.0178, + "step": 9693 + }, + { + "epoch": 1.7259615384615383, + "grad_norm": 0.6471006870269775, + "learning_rate": 0.00012152626904364067, + "loss": 1.0035, + "step": 9694 + }, + { + "epoch": 1.726139601139601, + "grad_norm": 0.6875706911087036, + "learning_rate": 0.00012151259951484301, + "loss": 0.7921, + "step": 9695 + }, + { + "epoch": 1.7263176638176638, + "grad_norm": 0.6963251233100891, + "learning_rate": 0.00012149892956455067, + "loss": 0.9677, + "step": 9696 + }, + { + "epoch": 1.7264957264957266, + "grad_norm": 0.9077282547950745, + "learning_rate": 0.00012148525919303142, + "loss": 0.9362, + "step": 9697 + }, + { + "epoch": 1.7266737891737893, + "grad_norm": 0.7347434163093567, + "learning_rate": 
0.00012147158840055319, + "loss": 0.8712, + "step": 9698 + }, + { + "epoch": 1.7268518518518519, + "grad_norm": 0.7206630110740662, + "learning_rate": 0.00012145791718738377, + "loss": 1.032, + "step": 9699 + }, + { + "epoch": 1.7270299145299144, + "grad_norm": 0.7174662947654724, + "learning_rate": 0.00012144424555379106, + "loss": 0.954, + "step": 9700 + }, + { + "epoch": 1.7272079772079771, + "grad_norm": 0.7442345023155212, + "learning_rate": 0.0001214305735000429, + "loss": 1.0709, + "step": 9701 + }, + { + "epoch": 1.72738603988604, + "grad_norm": 0.6154376268386841, + "learning_rate": 0.00012141690102640715, + "loss": 0.9365, + "step": 9702 + }, + { + "epoch": 1.7275641025641026, + "grad_norm": 0.6213796734809875, + "learning_rate": 0.00012140322813315172, + "loss": 0.8337, + "step": 9703 + }, + { + "epoch": 1.7277421652421654, + "grad_norm": 0.7682011127471924, + "learning_rate": 0.0001213895548205445, + "loss": 1.1579, + "step": 9704 + }, + { + "epoch": 1.727920227920228, + "grad_norm": 0.6796970963478088, + "learning_rate": 0.0001213758810888534, + "loss": 0.8875, + "step": 9705 + }, + { + "epoch": 1.7280982905982905, + "grad_norm": 0.7203732132911682, + "learning_rate": 0.0001213622069383463, + "loss": 0.7827, + "step": 9706 + }, + { + "epoch": 1.7282763532763532, + "grad_norm": 0.6151877045631409, + "learning_rate": 0.00012134853236929111, + "loss": 1.0282, + "step": 9707 + }, + { + "epoch": 1.728454415954416, + "grad_norm": 0.6665124297142029, + "learning_rate": 0.0001213348573819558, + "loss": 1.0636, + "step": 9708 + }, + { + "epoch": 1.7286324786324787, + "grad_norm": 0.7334614396095276, + "learning_rate": 0.00012132118197660829, + "loss": 1.0889, + "step": 9709 + }, + { + "epoch": 1.7288105413105415, + "grad_norm": 0.7267759442329407, + "learning_rate": 0.00012130750615351649, + "loss": 1.096, + "step": 9710 + }, + { + "epoch": 1.728988603988604, + "grad_norm": 0.6542944312095642, + "learning_rate": 0.00012129382991294837, + "loss": 1.0855, + 
"step": 9711 + }, + { + "epoch": 1.7291666666666665, + "grad_norm": 0.694523274898529, + "learning_rate": 0.00012128015325517193, + "loss": 0.8482, + "step": 9712 + }, + { + "epoch": 1.7293447293447293, + "grad_norm": 0.7879082560539246, + "learning_rate": 0.00012126647618045504, + "loss": 1.2356, + "step": 9713 + }, + { + "epoch": 1.729522792022792, + "grad_norm": 0.7108420729637146, + "learning_rate": 0.00012125279868906574, + "loss": 1.0185, + "step": 9714 + }, + { + "epoch": 1.7297008547008548, + "grad_norm": 0.6928725838661194, + "learning_rate": 0.000121239120781272, + "loss": 1.1507, + "step": 9715 + }, + { + "epoch": 1.7298789173789175, + "grad_norm": 0.6195241212844849, + "learning_rate": 0.00012122544245734182, + "loss": 0.8656, + "step": 9716 + }, + { + "epoch": 1.73005698005698, + "grad_norm": 0.5962017774581909, + "learning_rate": 0.00012121176371754317, + "loss": 0.918, + "step": 9717 + }, + { + "epoch": 1.7302350427350426, + "grad_norm": 0.7409394979476929, + "learning_rate": 0.00012119808456214407, + "loss": 1.0283, + "step": 9718 + }, + { + "epoch": 1.7304131054131053, + "grad_norm": 0.6571973562240601, + "learning_rate": 0.00012118440499141257, + "loss": 1.1015, + "step": 9719 + }, + { + "epoch": 1.730591168091168, + "grad_norm": 0.681394100189209, + "learning_rate": 0.00012117072500561664, + "loss": 0.8247, + "step": 9720 + }, + { + "epoch": 1.7307692307692308, + "grad_norm": 0.7278251647949219, + "learning_rate": 0.00012115704460502432, + "loss": 1.0693, + "step": 9721 + }, + { + "epoch": 1.7309472934472936, + "grad_norm": 0.6569405794143677, + "learning_rate": 0.0001211433637899037, + "loss": 0.8992, + "step": 9722 + }, + { + "epoch": 1.7311253561253561, + "grad_norm": 0.6305136680603027, + "learning_rate": 0.00012112968256052272, + "loss": 0.8543, + "step": 9723 + }, + { + "epoch": 1.7313034188034186, + "grad_norm": 0.6111339330673218, + "learning_rate": 0.00012111600091714956, + "loss": 0.991, + "step": 9724 + }, + { + "epoch": 
1.7314814814814814, + "grad_norm": 0.646973192691803, + "learning_rate": 0.00012110231886005223, + "loss": 0.8855, + "step": 9725 + }, + { + "epoch": 1.7316595441595442, + "grad_norm": 0.7054407000541687, + "learning_rate": 0.00012108863638949879, + "loss": 1.0816, + "step": 9726 + }, + { + "epoch": 1.731837606837607, + "grad_norm": 0.6592162847518921, + "learning_rate": 0.00012107495350575729, + "loss": 1.0961, + "step": 9727 + }, + { + "epoch": 1.7320156695156697, + "grad_norm": 0.6615595817565918, + "learning_rate": 0.00012106127020909587, + "loss": 0.9669, + "step": 9728 + }, + { + "epoch": 1.7321937321937322, + "grad_norm": 0.9030881524085999, + "learning_rate": 0.00012104758649978263, + "loss": 0.9438, + "step": 9729 + }, + { + "epoch": 1.7323717948717947, + "grad_norm": 0.6776516437530518, + "learning_rate": 0.00012103390237808566, + "loss": 0.8967, + "step": 9730 + }, + { + "epoch": 1.7325498575498575, + "grad_norm": 0.6010605096817017, + "learning_rate": 0.00012102021784427306, + "loss": 0.8893, + "step": 9731 + }, + { + "epoch": 1.7327279202279202, + "grad_norm": 0.6540384292602539, + "learning_rate": 0.00012100653289861295, + "loss": 0.9328, + "step": 9732 + }, + { + "epoch": 1.732905982905983, + "grad_norm": 0.6836950182914734, + "learning_rate": 0.00012099284754137345, + "loss": 0.9019, + "step": 9733 + }, + { + "epoch": 1.7330840455840457, + "grad_norm": 0.7597874402999878, + "learning_rate": 0.00012097916177282274, + "loss": 1.0093, + "step": 9734 + }, + { + "epoch": 1.7332621082621082, + "grad_norm": 0.7686513066291809, + "learning_rate": 0.00012096547559322892, + "loss": 0.8685, + "step": 9735 + }, + { + "epoch": 1.7334401709401708, + "grad_norm": 0.613777220249176, + "learning_rate": 0.0001209517890028602, + "loss": 0.8317, + "step": 9736 + }, + { + "epoch": 1.7336182336182335, + "grad_norm": 0.6788455843925476, + "learning_rate": 0.00012093810200198466, + "loss": 0.866, + "step": 9737 + }, + { + "epoch": 1.7337962962962963, + "grad_norm": 
0.616801381111145, + "learning_rate": 0.00012092441459087047, + "loss": 0.8299, + "step": 9738 + }, + { + "epoch": 1.733974358974359, + "grad_norm": 0.731987476348877, + "learning_rate": 0.00012091072676978589, + "loss": 1.089, + "step": 9739 + }, + { + "epoch": 1.7341524216524218, + "grad_norm": 0.7042871117591858, + "learning_rate": 0.00012089703853899905, + "loss": 0.8667, + "step": 9740 + }, + { + "epoch": 1.7343304843304843, + "grad_norm": 0.62722247838974, + "learning_rate": 0.00012088334989877817, + "loss": 0.9185, + "step": 9741 + }, + { + "epoch": 1.734508547008547, + "grad_norm": 0.6354684829711914, + "learning_rate": 0.0001208696608493914, + "loss": 0.9951, + "step": 9742 + }, + { + "epoch": 1.7346866096866096, + "grad_norm": 0.658647894859314, + "learning_rate": 0.00012085597139110698, + "loss": 0.9324, + "step": 9743 + }, + { + "epoch": 1.7348646723646723, + "grad_norm": 0.84359210729599, + "learning_rate": 0.00012084228152419312, + "loss": 1.0861, + "step": 9744 + }, + { + "epoch": 1.735042735042735, + "grad_norm": 0.6293938755989075, + "learning_rate": 0.00012082859124891807, + "loss": 0.9676, + "step": 9745 + }, + { + "epoch": 1.7352207977207978, + "grad_norm": 0.6398760676383972, + "learning_rate": 0.00012081490056555004, + "loss": 0.8502, + "step": 9746 + }, + { + "epoch": 1.7353988603988604, + "grad_norm": 0.6918041706085205, + "learning_rate": 0.00012080120947435726, + "loss": 1.0081, + "step": 9747 + }, + { + "epoch": 1.7355769230769231, + "grad_norm": 0.7374079823493958, + "learning_rate": 0.00012078751797560798, + "loss": 0.9485, + "step": 9748 + }, + { + "epoch": 1.7357549857549857, + "grad_norm": 0.7392128705978394, + "learning_rate": 0.00012077382606957049, + "loss": 0.9283, + "step": 9749 + }, + { + "epoch": 1.7359330484330484, + "grad_norm": 0.701320230960846, + "learning_rate": 0.00012076013375651303, + "loss": 1.0339, + "step": 9750 + }, + { + "epoch": 1.7361111111111112, + "grad_norm": 0.6316696405410767, + "learning_rate": 
0.00012074644103670387, + "loss": 0.9097, + "step": 9751 + }, + { + "epoch": 1.736289173789174, + "grad_norm": 0.6892024278640747, + "learning_rate": 0.00012073274791041132, + "loss": 1.0863, + "step": 9752 + }, + { + "epoch": 1.7364672364672364, + "grad_norm": 0.6032847762107849, + "learning_rate": 0.00012071905437790361, + "loss": 0.9305, + "step": 9753 + }, + { + "epoch": 1.7366452991452992, + "grad_norm": 0.6659184098243713, + "learning_rate": 0.00012070536043944907, + "loss": 0.9793, + "step": 9754 + }, + { + "epoch": 1.7368233618233617, + "grad_norm": 0.7413665056228638, + "learning_rate": 0.00012069166609531602, + "loss": 1.0523, + "step": 9755 + }, + { + "epoch": 1.7370014245014245, + "grad_norm": 0.7814368009567261, + "learning_rate": 0.00012067797134577275, + "loss": 0.9988, + "step": 9756 + }, + { + "epoch": 1.7371794871794872, + "grad_norm": 0.6174948811531067, + "learning_rate": 0.00012066427619108757, + "loss": 0.9002, + "step": 9757 + }, + { + "epoch": 1.73735754985755, + "grad_norm": 0.6521819233894348, + "learning_rate": 0.00012065058063152885, + "loss": 1.1307, + "step": 9758 + }, + { + "epoch": 1.7375356125356125, + "grad_norm": 0.6797493696212769, + "learning_rate": 0.00012063688466736489, + "loss": 0.84, + "step": 9759 + }, + { + "epoch": 1.7377136752136753, + "grad_norm": 0.6496474146842957, + "learning_rate": 0.00012062318829886404, + "loss": 0.86, + "step": 9760 + }, + { + "epoch": 1.7378917378917378, + "grad_norm": 0.6701306104660034, + "learning_rate": 0.00012060949152629467, + "loss": 0.9422, + "step": 9761 + }, + { + "epoch": 1.7380698005698005, + "grad_norm": 0.7331172823905945, + "learning_rate": 0.00012059579434992512, + "loss": 1.1648, + "step": 9762 + }, + { + "epoch": 1.7382478632478633, + "grad_norm": 0.63930743932724, + "learning_rate": 0.00012058209677002375, + "loss": 1.0617, + "step": 9763 + }, + { + "epoch": 1.738425925925926, + "grad_norm": 0.668851912021637, + "learning_rate": 0.00012056839878685895, + "loss": 0.8219, + 
"step": 9764 + }, + { + "epoch": 1.7386039886039886, + "grad_norm": 0.7305747270584106, + "learning_rate": 0.00012055470040069912, + "loss": 1.0416, + "step": 9765 + }, + { + "epoch": 1.7387820512820513, + "grad_norm": 0.6931866407394409, + "learning_rate": 0.00012054100161181264, + "loss": 1.0588, + "step": 9766 + }, + { + "epoch": 1.7389601139601139, + "grad_norm": 0.6565485000610352, + "learning_rate": 0.00012052730242046785, + "loss": 0.7885, + "step": 9767 + }, + { + "epoch": 1.7391381766381766, + "grad_norm": 0.739985466003418, + "learning_rate": 0.00012051360282693327, + "loss": 1.0973, + "step": 9768 + }, + { + "epoch": 1.7393162393162394, + "grad_norm": 0.6477079391479492, + "learning_rate": 0.00012049990283147723, + "loss": 0.9841, + "step": 9769 + }, + { + "epoch": 1.739494301994302, + "grad_norm": 0.7018330097198486, + "learning_rate": 0.00012048620243436819, + "loss": 1.0869, + "step": 9770 + }, + { + "epoch": 1.7396723646723646, + "grad_norm": 0.7087421417236328, + "learning_rate": 0.00012047250163587456, + "loss": 0.916, + "step": 9771 + }, + { + "epoch": 1.7398504273504274, + "grad_norm": 0.8747151494026184, + "learning_rate": 0.00012045880043626481, + "loss": 0.8245, + "step": 9772 + }, + { + "epoch": 1.74002849002849, + "grad_norm": 0.777498722076416, + "learning_rate": 0.00012044509883580735, + "loss": 1.071, + "step": 9773 + }, + { + "epoch": 1.7402065527065527, + "grad_norm": 0.6668971180915833, + "learning_rate": 0.00012043139683477062, + "loss": 1.0447, + "step": 9774 + }, + { + "epoch": 1.7403846153846154, + "grad_norm": 0.6702026724815369, + "learning_rate": 0.00012041769443342317, + "loss": 0.8688, + "step": 9775 + }, + { + "epoch": 1.7405626780626782, + "grad_norm": 0.7866267561912537, + "learning_rate": 0.00012040399163203337, + "loss": 1.0842, + "step": 9776 + }, + { + "epoch": 1.7407407407407407, + "grad_norm": 0.7655110955238342, + "learning_rate": 0.00012039028843086977, + "loss": 1.2417, + "step": 9777 + }, + { + "epoch": 
1.7409188034188035, + "grad_norm": 0.7084119915962219, + "learning_rate": 0.0001203765848302008, + "loss": 0.9844, + "step": 9778 + }, + { + "epoch": 1.741096866096866, + "grad_norm": 0.7135398983955383, + "learning_rate": 0.00012036288083029497, + "loss": 1.1102, + "step": 9779 + }, + { + "epoch": 1.7412749287749287, + "grad_norm": 0.6784615516662598, + "learning_rate": 0.0001203491764314208, + "loss": 1.0349, + "step": 9780 + }, + { + "epoch": 1.7414529914529915, + "grad_norm": 0.7170301079750061, + "learning_rate": 0.00012033547163384682, + "loss": 1.0899, + "step": 9781 + }, + { + "epoch": 1.7416310541310542, + "grad_norm": 0.6692060828208923, + "learning_rate": 0.0001203217664378415, + "loss": 1.0486, + "step": 9782 + }, + { + "epoch": 1.7418091168091168, + "grad_norm": 0.6730037927627563, + "learning_rate": 0.00012030806084367336, + "loss": 0.9684, + "step": 9783 + }, + { + "epoch": 1.7419871794871795, + "grad_norm": 0.5983504056930542, + "learning_rate": 0.00012029435485161096, + "loss": 0.7106, + "step": 9784 + }, + { + "epoch": 1.742165242165242, + "grad_norm": 0.6834231615066528, + "learning_rate": 0.00012028064846192284, + "loss": 0.803, + "step": 9785 + }, + { + "epoch": 1.7423433048433048, + "grad_norm": 0.621046245098114, + "learning_rate": 0.00012026694167487755, + "loss": 0.9129, + "step": 9786 + }, + { + "epoch": 1.7425213675213675, + "grad_norm": 0.6348989605903625, + "learning_rate": 0.00012025323449074361, + "loss": 1.0076, + "step": 9787 + }, + { + "epoch": 1.7426994301994303, + "grad_norm": 0.6139974594116211, + "learning_rate": 0.00012023952690978966, + "loss": 1.0756, + "step": 9788 + }, + { + "epoch": 1.7428774928774928, + "grad_norm": 0.6473259925842285, + "learning_rate": 0.00012022581893228419, + "loss": 1.0568, + "step": 9789 + }, + { + "epoch": 1.7430555555555556, + "grad_norm": 0.6133778095245361, + "learning_rate": 0.00012021211055849581, + "loss": 0.8722, + "step": 9790 + }, + { + "epoch": 1.743233618233618, + "grad_norm": 
0.6934139728546143, + "learning_rate": 0.00012019840178869315, + "loss": 1.0329, + "step": 9791 + }, + { + "epoch": 1.7434116809116809, + "grad_norm": 0.6730150580406189, + "learning_rate": 0.00012018469262314474, + "loss": 0.9326, + "step": 9792 + }, + { + "epoch": 1.7435897435897436, + "grad_norm": 0.6805521249771118, + "learning_rate": 0.0001201709830621192, + "loss": 1.0527, + "step": 9793 + }, + { + "epoch": 1.7437678062678064, + "grad_norm": 0.6972569823265076, + "learning_rate": 0.00012015727310588516, + "loss": 1.0024, + "step": 9794 + }, + { + "epoch": 1.743945868945869, + "grad_norm": 0.7329187989234924, + "learning_rate": 0.00012014356275471122, + "loss": 1.1864, + "step": 9795 + }, + { + "epoch": 1.7441239316239316, + "grad_norm": 0.7220240831375122, + "learning_rate": 0.00012012985200886602, + "loss": 0.8831, + "step": 9796 + }, + { + "epoch": 1.7443019943019942, + "grad_norm": 0.7829749584197998, + "learning_rate": 0.00012011614086861818, + "loss": 1.0365, + "step": 9797 + }, + { + "epoch": 1.744480056980057, + "grad_norm": 0.7148944735527039, + "learning_rate": 0.00012010242933423637, + "loss": 1.0413, + "step": 9798 + }, + { + "epoch": 1.7446581196581197, + "grad_norm": 0.5607262253761292, + "learning_rate": 0.00012008871740598917, + "loss": 0.8154, + "step": 9799 + }, + { + "epoch": 1.7448361823361824, + "grad_norm": 0.754626452922821, + "learning_rate": 0.00012007500508414531, + "loss": 1.0569, + "step": 9800 + }, + { + "epoch": 1.7450142450142452, + "grad_norm": 0.7216293215751648, + "learning_rate": 0.00012006129236897343, + "loss": 1.1641, + "step": 9801 + }, + { + "epoch": 1.7451923076923077, + "grad_norm": 0.6575515270233154, + "learning_rate": 0.0001200475792607422, + "loss": 0.9063, + "step": 9802 + }, + { + "epoch": 1.7453703703703702, + "grad_norm": 0.7411505579948425, + "learning_rate": 0.00012003386575972031, + "loss": 0.9791, + "step": 9803 + }, + { + "epoch": 1.745548433048433, + "grad_norm": 0.6945903301239014, + "learning_rate": 
0.0001200201518661764, + "loss": 0.8111, + "step": 9804 + }, + { + "epoch": 1.7457264957264957, + "grad_norm": 0.5760970115661621, + "learning_rate": 0.00012000643758037924, + "loss": 1.1054, + "step": 9805 + }, + { + "epoch": 1.7459045584045585, + "grad_norm": 0.6732224225997925, + "learning_rate": 0.00011999272290259748, + "loss": 0.8992, + "step": 9806 + }, + { + "epoch": 1.7460826210826212, + "grad_norm": 0.673270046710968, + "learning_rate": 0.00011997900783309983, + "loss": 1.0554, + "step": 9807 + }, + { + "epoch": 1.7462606837606838, + "grad_norm": 0.7233314514160156, + "learning_rate": 0.00011996529237215503, + "loss": 1.066, + "step": 9808 + }, + { + "epoch": 1.7464387464387463, + "grad_norm": 0.7016494274139404, + "learning_rate": 0.00011995157652003183, + "loss": 0.891, + "step": 9809 + }, + { + "epoch": 1.746616809116809, + "grad_norm": 0.9377092719078064, + "learning_rate": 0.00011993786027699889, + "loss": 0.8626, + "step": 9810 + }, + { + "epoch": 1.7467948717948718, + "grad_norm": 0.6825845241546631, + "learning_rate": 0.00011992414364332503, + "loss": 0.8996, + "step": 9811 + }, + { + "epoch": 1.7469729344729346, + "grad_norm": 0.6836053729057312, + "learning_rate": 0.00011991042661927896, + "loss": 0.9338, + "step": 9812 + }, + { + "epoch": 1.7471509971509973, + "grad_norm": 0.6462908387184143, + "learning_rate": 0.00011989670920512943, + "loss": 1.1185, + "step": 9813 + }, + { + "epoch": 1.7473290598290598, + "grad_norm": 0.7191921472549438, + "learning_rate": 0.00011988299140114522, + "loss": 0.9084, + "step": 9814 + }, + { + "epoch": 1.7475071225071224, + "grad_norm": 0.6951598525047302, + "learning_rate": 0.00011986927320759508, + "loss": 1.0653, + "step": 9815 + }, + { + "epoch": 1.7476851851851851, + "grad_norm": 0.7512598037719727, + "learning_rate": 0.00011985555462474784, + "loss": 1.0259, + "step": 9816 + }, + { + "epoch": 1.7478632478632479, + "grad_norm": 0.6885492205619812, + "learning_rate": 0.00011984183565287226, + "loss": 0.7148, 
+ "step": 9817 + }, + { + "epoch": 1.7480413105413106, + "grad_norm": 0.6880139708518982, + "learning_rate": 0.00011982811629223709, + "loss": 1.1567, + "step": 9818 + }, + { + "epoch": 1.7482193732193734, + "grad_norm": 0.7381170392036438, + "learning_rate": 0.0001198143965431112, + "loss": 0.8483, + "step": 9819 + }, + { + "epoch": 1.748397435897436, + "grad_norm": 0.6761063933372498, + "learning_rate": 0.00011980067640576333, + "loss": 0.9498, + "step": 9820 + }, + { + "epoch": 1.7485754985754984, + "grad_norm": 0.6454669237136841, + "learning_rate": 0.00011978695588046238, + "loss": 0.7336, + "step": 9821 + }, + { + "epoch": 1.7487535612535612, + "grad_norm": 0.6026871800422668, + "learning_rate": 0.00011977323496747712, + "loss": 0.8618, + "step": 9822 + }, + { + "epoch": 1.748931623931624, + "grad_norm": 0.6877408027648926, + "learning_rate": 0.0001197595136670764, + "loss": 0.9146, + "step": 9823 + }, + { + "epoch": 1.7491096866096867, + "grad_norm": 0.6874892115592957, + "learning_rate": 0.00011974579197952906, + "loss": 1.1628, + "step": 9824 + }, + { + "epoch": 1.7492877492877494, + "grad_norm": 0.7464384436607361, + "learning_rate": 0.00011973206990510393, + "loss": 1.007, + "step": 9825 + }, + { + "epoch": 1.749465811965812, + "grad_norm": 0.7281473278999329, + "learning_rate": 0.00011971834744406986, + "loss": 1.0776, + "step": 9826 + }, + { + "epoch": 1.7496438746438745, + "grad_norm": 0.6112284660339355, + "learning_rate": 0.00011970462459669575, + "loss": 0.7616, + "step": 9827 + }, + { + "epoch": 1.7498219373219372, + "grad_norm": 0.6498035192489624, + "learning_rate": 0.00011969090136325048, + "loss": 0.884, + "step": 9828 + }, + { + "epoch": 1.7498219373219372, + "eval_loss": 1.1018389463424683, + "eval_runtime": 24.5594, + "eval_samples_per_second": 42.387, + "eval_steps_per_second": 21.214, + "step": 9828 + }, + { + "epoch": 1.75, + "grad_norm": 0.6746426224708557, + "learning_rate": 0.00011967717774400289, + "loss": 0.9023, + "step": 9829 + }, 
+ { + "epoch": 1.7501780626780628, + "grad_norm": 0.6513423323631287, + "learning_rate": 0.00011966345373922188, + "loss": 0.9786, + "step": 9830 + }, + { + "epoch": 1.7503561253561255, + "grad_norm": 0.7053804397583008, + "learning_rate": 0.00011964972934917632, + "loss": 1.0667, + "step": 9831 + }, + { + "epoch": 1.750534188034188, + "grad_norm": 0.6769008040428162, + "learning_rate": 0.00011963600457413513, + "loss": 0.8596, + "step": 9832 + }, + { + "epoch": 1.7507122507122506, + "grad_norm": 0.7162246108055115, + "learning_rate": 0.00011962227941436725, + "loss": 1.0746, + "step": 9833 + }, + { + "epoch": 1.7508903133903133, + "grad_norm": 0.7665811777114868, + "learning_rate": 0.00011960855387014156, + "loss": 1.0056, + "step": 9834 + }, + { + "epoch": 1.751068376068376, + "grad_norm": 0.6186950206756592, + "learning_rate": 0.00011959482794172696, + "loss": 0.9016, + "step": 9835 + }, + { + "epoch": 1.7512464387464388, + "grad_norm": 0.8018904328346252, + "learning_rate": 0.00011958110162939245, + "loss": 0.9534, + "step": 9836 + }, + { + "epoch": 1.7514245014245016, + "grad_norm": 0.8239033818244934, + "learning_rate": 0.0001195673749334069, + "loss": 1.214, + "step": 9837 + }, + { + "epoch": 1.751602564102564, + "grad_norm": 0.7886297106742859, + "learning_rate": 0.00011955364785403931, + "loss": 0.9672, + "step": 9838 + }, + { + "epoch": 1.7517806267806266, + "grad_norm": 0.6463177800178528, + "learning_rate": 0.00011953992039155862, + "loss": 0.9184, + "step": 9839 + }, + { + "epoch": 1.7519586894586894, + "grad_norm": 0.7374706864356995, + "learning_rate": 0.00011952619254623374, + "loss": 0.9988, + "step": 9840 + }, + { + "epoch": 1.7521367521367521, + "grad_norm": 0.7456657886505127, + "learning_rate": 0.00011951246431833369, + "loss": 1.2197, + "step": 9841 + }, + { + "epoch": 1.7523148148148149, + "grad_norm": 0.6644248962402344, + "learning_rate": 0.00011949873570812746, + "loss": 0.9449, + "step": 9842 + }, + { + "epoch": 1.7524928774928776, + 
"grad_norm": 0.707919180393219, + "learning_rate": 0.000119485006715884, + "loss": 0.774, + "step": 9843 + }, + { + "epoch": 1.7526709401709402, + "grad_norm": 0.6273906826972961, + "learning_rate": 0.00011947127734187231, + "loss": 0.8682, + "step": 9844 + }, + { + "epoch": 1.7528490028490027, + "grad_norm": 0.8335350155830383, + "learning_rate": 0.00011945754758636136, + "loss": 1.2282, + "step": 9845 + }, + { + "epoch": 1.7530270655270654, + "grad_norm": 0.6849051117897034, + "learning_rate": 0.00011944381744962022, + "loss": 1.1091, + "step": 9846 + }, + { + "epoch": 1.7532051282051282, + "grad_norm": 0.8571760058403015, + "learning_rate": 0.00011943008693191781, + "loss": 0.9806, + "step": 9847 + }, + { + "epoch": 1.753383190883191, + "grad_norm": 0.7045019268989563, + "learning_rate": 0.00011941635603352328, + "loss": 0.9217, + "step": 9848 + }, + { + "epoch": 1.7535612535612537, + "grad_norm": 0.6820187568664551, + "learning_rate": 0.00011940262475470556, + "loss": 0.9983, + "step": 9849 + }, + { + "epoch": 1.7537393162393162, + "grad_norm": 0.7400697469711304, + "learning_rate": 0.00011938889309573374, + "loss": 0.9521, + "step": 9850 + }, + { + "epoch": 1.7539173789173788, + "grad_norm": 0.7027658820152283, + "learning_rate": 0.00011937516105687678, + "loss": 1.0749, + "step": 9851 + }, + { + "epoch": 1.7540954415954415, + "grad_norm": 0.6778307557106018, + "learning_rate": 0.00011936142863840382, + "loss": 1.0249, + "step": 9852 + }, + { + "epoch": 1.7542735042735043, + "grad_norm": 0.6787961721420288, + "learning_rate": 0.00011934769584058389, + "loss": 1.0014, + "step": 9853 + }, + { + "epoch": 1.754451566951567, + "grad_norm": 0.7515636086463928, + "learning_rate": 0.00011933396266368606, + "loss": 1.0351, + "step": 9854 + }, + { + "epoch": 1.7546296296296298, + "grad_norm": 0.6620134115219116, + "learning_rate": 0.00011932022910797938, + "loss": 1.0294, + "step": 9855 + }, + { + "epoch": 1.7548076923076923, + "grad_norm": 0.8260951638221741, + 
"learning_rate": 0.00011930649517373294, + "loss": 0.9078, + "step": 9856 + }, + { + "epoch": 1.7549857549857548, + "grad_norm": 0.7680675983428955, + "learning_rate": 0.00011929276086121584, + "loss": 0.92, + "step": 9857 + }, + { + "epoch": 1.7551638176638176, + "grad_norm": 0.7104191184043884, + "learning_rate": 0.00011927902617069717, + "loss": 0.9937, + "step": 9858 + }, + { + "epoch": 1.7553418803418803, + "grad_norm": 0.7185840606689453, + "learning_rate": 0.00011926529110244603, + "loss": 0.9775, + "step": 9859 + }, + { + "epoch": 1.755519943019943, + "grad_norm": 0.7114652991294861, + "learning_rate": 0.00011925155565673151, + "loss": 0.883, + "step": 9860 + }, + { + "epoch": 1.7556980056980058, + "grad_norm": 0.6906639337539673, + "learning_rate": 0.00011923781983382276, + "loss": 0.9789, + "step": 9861 + }, + { + "epoch": 1.7558760683760684, + "grad_norm": 0.706908106803894, + "learning_rate": 0.00011922408363398892, + "loss": 1.1186, + "step": 9862 + }, + { + "epoch": 1.756054131054131, + "grad_norm": 0.7532939910888672, + "learning_rate": 0.00011921034705749908, + "loss": 0.977, + "step": 9863 + }, + { + "epoch": 1.7562321937321936, + "grad_norm": 0.7397763729095459, + "learning_rate": 0.0001191966101046224, + "loss": 1.1121, + "step": 9864 + }, + { + "epoch": 1.7564102564102564, + "grad_norm": 0.6955398321151733, + "learning_rate": 0.00011918287277562801, + "loss": 1.0439, + "step": 9865 + }, + { + "epoch": 1.7565883190883191, + "grad_norm": 0.7485929727554321, + "learning_rate": 0.00011916913507078507, + "loss": 1.1644, + "step": 9866 + }, + { + "epoch": 1.756766381766382, + "grad_norm": 0.6337487101554871, + "learning_rate": 0.00011915539699036274, + "loss": 0.8216, + "step": 9867 + }, + { + "epoch": 1.7569444444444444, + "grad_norm": 0.6628872752189636, + "learning_rate": 0.00011914165853463022, + "loss": 0.9584, + "step": 9868 + }, + { + "epoch": 1.7571225071225072, + "grad_norm": 0.6577547788619995, + "learning_rate": 0.00011912791970385666, + 
"loss": 0.9484, + "step": 9869 + }, + { + "epoch": 1.7573005698005697, + "grad_norm": 0.6409304738044739, + "learning_rate": 0.00011911418049831127, + "loss": 1.1256, + "step": 9870 + }, + { + "epoch": 1.7574786324786325, + "grad_norm": 0.7499844431877136, + "learning_rate": 0.00011910044091826319, + "loss": 0.7991, + "step": 9871 + }, + { + "epoch": 1.7576566951566952, + "grad_norm": 0.6786715388298035, + "learning_rate": 0.00011908670096398165, + "loss": 1.0368, + "step": 9872 + }, + { + "epoch": 1.757834757834758, + "grad_norm": 0.6432101130485535, + "learning_rate": 0.00011907296063573585, + "loss": 0.9059, + "step": 9873 + }, + { + "epoch": 1.7580128205128205, + "grad_norm": 0.6542613506317139, + "learning_rate": 0.00011905921993379503, + "loss": 0.9866, + "step": 9874 + }, + { + "epoch": 1.7581908831908832, + "grad_norm": 0.6048218011856079, + "learning_rate": 0.00011904547885842838, + "loss": 0.9488, + "step": 9875 + }, + { + "epoch": 1.7583689458689458, + "grad_norm": 0.7694938778877258, + "learning_rate": 0.00011903173740990512, + "loss": 1.1026, + "step": 9876 + }, + { + "epoch": 1.7585470085470085, + "grad_norm": 0.6621627807617188, + "learning_rate": 0.00011901799558849451, + "loss": 1.135, + "step": 9877 + }, + { + "epoch": 1.7587250712250713, + "grad_norm": 0.6561587452888489, + "learning_rate": 0.0001190042533944658, + "loss": 0.9322, + "step": 9878 + }, + { + "epoch": 1.758903133903134, + "grad_norm": 0.7846759557723999, + "learning_rate": 0.00011899051082808821, + "loss": 0.9324, + "step": 9879 + }, + { + "epoch": 1.7590811965811965, + "grad_norm": 0.6004071831703186, + "learning_rate": 0.00011897676788963101, + "loss": 0.9641, + "step": 9880 + }, + { + "epoch": 1.7592592592592593, + "grad_norm": 0.6731070280075073, + "learning_rate": 0.00011896302457936344, + "loss": 1.1437, + "step": 9881 + }, + { + "epoch": 1.7594373219373218, + "grad_norm": 0.6768675446510315, + "learning_rate": 0.00011894928089755481, + "loss": 1.0707, + "step": 9882 + }, + { 
+ "epoch": 1.7596153846153846, + "grad_norm": 0.8368878960609436, + "learning_rate": 0.0001189355368444744, + "loss": 1.0435, + "step": 9883 + }, + { + "epoch": 1.7597934472934473, + "grad_norm": 0.6132324934005737, + "learning_rate": 0.00011892179242039149, + "loss": 0.8889, + "step": 9884 + }, + { + "epoch": 1.75997150997151, + "grad_norm": 0.7598093152046204, + "learning_rate": 0.00011890804762557535, + "loss": 1.151, + "step": 9885 + }, + { + "epoch": 1.7601495726495726, + "grad_norm": 0.7317715883255005, + "learning_rate": 0.00011889430246029527, + "loss": 0.9992, + "step": 9886 + }, + { + "epoch": 1.7603276353276354, + "grad_norm": 0.7664858102798462, + "learning_rate": 0.00011888055692482059, + "loss": 0.8398, + "step": 9887 + }, + { + "epoch": 1.760505698005698, + "grad_norm": 0.6916853189468384, + "learning_rate": 0.00011886681101942063, + "loss": 0.9507, + "step": 9888 + }, + { + "epoch": 1.7606837606837606, + "grad_norm": 0.7103399634361267, + "learning_rate": 0.0001188530647443647, + "loss": 0.915, + "step": 9889 + }, + { + "epoch": 1.7608618233618234, + "grad_norm": 0.6177804470062256, + "learning_rate": 0.00011883931809992215, + "loss": 0.721, + "step": 9890 + }, + { + "epoch": 1.7610398860398861, + "grad_norm": 0.7523959279060364, + "learning_rate": 0.00011882557108636227, + "loss": 0.99, + "step": 9891 + }, + { + "epoch": 1.7612179487179487, + "grad_norm": 0.6211134791374207, + "learning_rate": 0.00011881182370395442, + "loss": 0.8089, + "step": 9892 + }, + { + "epoch": 1.7613960113960114, + "grad_norm": 0.6660307049751282, + "learning_rate": 0.00011879807595296802, + "loss": 1.1062, + "step": 9893 + }, + { + "epoch": 1.761574074074074, + "grad_norm": 0.7039240598678589, + "learning_rate": 0.00011878432783367232, + "loss": 0.9739, + "step": 9894 + }, + { + "epoch": 1.7617521367521367, + "grad_norm": 0.658064603805542, + "learning_rate": 0.00011877057934633675, + "loss": 0.9438, + "step": 9895 + }, + { + "epoch": 1.7619301994301995, + "grad_norm": 
0.8227152228355408, + "learning_rate": 0.00011875683049123068, + "loss": 0.8385, + "step": 9896 + }, + { + "epoch": 1.7621082621082622, + "grad_norm": 0.6622483730316162, + "learning_rate": 0.00011874308126862346, + "loss": 0.9432, + "step": 9897 + }, + { + "epoch": 1.7622863247863247, + "grad_norm": 0.7211357951164246, + "learning_rate": 0.00011872933167878453, + "loss": 1.2471, + "step": 9898 + }, + { + "epoch": 1.7624643874643875, + "grad_norm": 0.6177424192428589, + "learning_rate": 0.00011871558172198322, + "loss": 0.8892, + "step": 9899 + }, + { + "epoch": 1.76264245014245, + "grad_norm": 0.6924285888671875, + "learning_rate": 0.00011870183139848898, + "loss": 1.021, + "step": 9900 + }, + { + "epoch": 1.7628205128205128, + "grad_norm": 0.6168648600578308, + "learning_rate": 0.0001186880807085712, + "loss": 0.9013, + "step": 9901 + }, + { + "epoch": 1.7629985754985755, + "grad_norm": 0.6410452723503113, + "learning_rate": 0.00011867432965249929, + "loss": 0.6686, + "step": 9902 + }, + { + "epoch": 1.7631766381766383, + "grad_norm": 0.6959559917449951, + "learning_rate": 0.0001186605782305427, + "loss": 0.9814, + "step": 9903 + }, + { + "epoch": 1.7633547008547008, + "grad_norm": 0.7456178069114685, + "learning_rate": 0.00011864682644297085, + "loss": 1.0151, + "step": 9904 + }, + { + "epoch": 1.7635327635327636, + "grad_norm": 0.6499991416931152, + "learning_rate": 0.00011863307429005317, + "loss": 0.83, + "step": 9905 + }, + { + "epoch": 1.763710826210826, + "grad_norm": 0.643344521522522, + "learning_rate": 0.00011861932177205908, + "loss": 0.8853, + "step": 9906 + }, + { + "epoch": 1.7638888888888888, + "grad_norm": 0.6570441722869873, + "learning_rate": 0.00011860556888925804, + "loss": 0.9179, + "step": 9907 + }, + { + "epoch": 1.7640669515669516, + "grad_norm": 0.6892307996749878, + "learning_rate": 0.00011859181564191957, + "loss": 0.9657, + "step": 9908 + }, + { + "epoch": 1.7642450142450143, + "grad_norm": 0.648158073425293, + "learning_rate": 
0.0001185780620303131, + "loss": 0.9179, + "step": 9909 + }, + { + "epoch": 1.7644230769230769, + "grad_norm": 0.5833603143692017, + "learning_rate": 0.00011856430805470808, + "loss": 0.8505, + "step": 9910 + }, + { + "epoch": 1.7646011396011396, + "grad_norm": 0.8302416205406189, + "learning_rate": 0.000118550553715374, + "loss": 0.8948, + "step": 9911 + }, + { + "epoch": 1.7647792022792022, + "grad_norm": 0.7075300216674805, + "learning_rate": 0.00011853679901258035, + "loss": 1.2467, + "step": 9912 + }, + { + "epoch": 1.764957264957265, + "grad_norm": 0.81916344165802, + "learning_rate": 0.00011852304394659666, + "loss": 0.9963, + "step": 9913 + }, + { + "epoch": 1.7651353276353277, + "grad_norm": 0.6492435932159424, + "learning_rate": 0.00011850928851769239, + "loss": 1.0704, + "step": 9914 + }, + { + "epoch": 1.7653133903133904, + "grad_norm": 0.7301090359687805, + "learning_rate": 0.00011849553272613704, + "loss": 1.0477, + "step": 9915 + }, + { + "epoch": 1.765491452991453, + "grad_norm": 0.7280275821685791, + "learning_rate": 0.00011848177657220019, + "loss": 0.9124, + "step": 9916 + }, + { + "epoch": 1.7656695156695157, + "grad_norm": 0.6948845386505127, + "learning_rate": 0.00011846802005615127, + "loss": 1.2275, + "step": 9917 + }, + { + "epoch": 1.7658475783475782, + "grad_norm": 0.6553834676742554, + "learning_rate": 0.0001184542631782599, + "loss": 1.2311, + "step": 9918 + }, + { + "epoch": 1.766025641025641, + "grad_norm": 0.6899739503860474, + "learning_rate": 0.00011844050593879556, + "loss": 0.8936, + "step": 9919 + }, + { + "epoch": 1.7662037037037037, + "grad_norm": 0.6076815128326416, + "learning_rate": 0.00011842674833802782, + "loss": 0.8432, + "step": 9920 + }, + { + "epoch": 1.7663817663817665, + "grad_norm": 0.7650902271270752, + "learning_rate": 0.00011841299037622624, + "loss": 1.0447, + "step": 9921 + }, + { + "epoch": 1.7665598290598292, + "grad_norm": 0.6864938735961914, + "learning_rate": 0.00011839923205366032, + "loss": 0.936, + 
"step": 9922 + }, + { + "epoch": 1.7667378917378918, + "grad_norm": 0.7176852226257324, + "learning_rate": 0.0001183854733705997, + "loss": 0.9764, + "step": 9923 + }, + { + "epoch": 1.7669159544159543, + "grad_norm": 0.6513439416885376, + "learning_rate": 0.00011837171432731393, + "loss": 1.0095, + "step": 9924 + }, + { + "epoch": 1.767094017094017, + "grad_norm": 0.8031024932861328, + "learning_rate": 0.00011835795492407256, + "loss": 1.1348, + "step": 9925 + }, + { + "epoch": 1.7672720797720798, + "grad_norm": 0.7659830451011658, + "learning_rate": 0.00011834419516114518, + "loss": 0.9058, + "step": 9926 + }, + { + "epoch": 1.7674501424501425, + "grad_norm": 0.8864039778709412, + "learning_rate": 0.00011833043503880145, + "loss": 1.0342, + "step": 9927 + }, + { + "epoch": 1.7676282051282053, + "grad_norm": 0.6870512962341309, + "learning_rate": 0.00011831667455731088, + "loss": 0.9361, + "step": 9928 + }, + { + "epoch": 1.7678062678062678, + "grad_norm": 0.6458830833435059, + "learning_rate": 0.00011830291371694315, + "loss": 0.8215, + "step": 9929 + }, + { + "epoch": 1.7679843304843303, + "grad_norm": 0.7456086874008179, + "learning_rate": 0.00011828915251796787, + "loss": 1.1243, + "step": 9930 + }, + { + "epoch": 1.768162393162393, + "grad_norm": 0.6834850311279297, + "learning_rate": 0.00011827539096065459, + "loss": 0.9536, + "step": 9931 + }, + { + "epoch": 1.7683404558404558, + "grad_norm": 0.643864631652832, + "learning_rate": 0.00011826162904527302, + "loss": 1.1707, + "step": 9932 + }, + { + "epoch": 1.7685185185185186, + "grad_norm": 0.6312864422798157, + "learning_rate": 0.00011824786677209275, + "loss": 0.7937, + "step": 9933 + }, + { + "epoch": 1.7686965811965814, + "grad_norm": 0.6092729568481445, + "learning_rate": 0.00011823410414138343, + "loss": 0.8787, + "step": 9934 + }, + { + "epoch": 1.7688746438746439, + "grad_norm": 0.6859988570213318, + "learning_rate": 0.00011822034115341474, + "loss": 0.9691, + "step": 9935 + }, + { + "epoch": 
1.7690527065527064, + "grad_norm": 0.7219935059547424, + "learning_rate": 0.0001182065778084563, + "loss": 1.0606, + "step": 9936 + }, + { + "epoch": 1.7692307692307692, + "grad_norm": 0.6596202850341797, + "learning_rate": 0.00011819281410677778, + "loss": 1.0543, + "step": 9937 + }, + { + "epoch": 1.769408831908832, + "grad_norm": 0.6616338491439819, + "learning_rate": 0.00011817905004864887, + "loss": 0.9757, + "step": 9938 + }, + { + "epoch": 1.7695868945868947, + "grad_norm": 0.6637360453605652, + "learning_rate": 0.00011816528563433924, + "loss": 0.925, + "step": 9939 + }, + { + "epoch": 1.7697649572649574, + "grad_norm": 0.8422333002090454, + "learning_rate": 0.00011815152086411859, + "loss": 1.1343, + "step": 9940 + }, + { + "epoch": 1.76994301994302, + "grad_norm": 0.6638204455375671, + "learning_rate": 0.00011813775573825656, + "loss": 1.2136, + "step": 9941 + }, + { + "epoch": 1.7701210826210825, + "grad_norm": 0.7258831858634949, + "learning_rate": 0.0001181239902570229, + "loss": 0.7308, + "step": 9942 + }, + { + "epoch": 1.7702991452991452, + "grad_norm": 0.730582594871521, + "learning_rate": 0.0001181102244206873, + "loss": 1.1097, + "step": 9943 + }, + { + "epoch": 1.770477207977208, + "grad_norm": 0.7324019074440002, + "learning_rate": 0.00011809645822951946, + "loss": 0.9802, + "step": 9944 + }, + { + "epoch": 1.7706552706552707, + "grad_norm": 0.5565997958183289, + "learning_rate": 0.00011808269168378914, + "loss": 0.7079, + "step": 9945 + }, + { + "epoch": 1.7708333333333335, + "grad_norm": 0.6395503282546997, + "learning_rate": 0.00011806892478376601, + "loss": 1.0048, + "step": 9946 + }, + { + "epoch": 1.771011396011396, + "grad_norm": 0.7670905590057373, + "learning_rate": 0.00011805515752971985, + "loss": 1.2509, + "step": 9947 + }, + { + "epoch": 1.7711894586894585, + "grad_norm": 0.5945813655853271, + "learning_rate": 0.00011804138992192037, + "loss": 0.8856, + "step": 9948 + }, + { + "epoch": 1.7713675213675213, + "grad_norm": 
0.7355493307113647, + "learning_rate": 0.00011802762196063737, + "loss": 0.9629, + "step": 9949 + }, + { + "epoch": 1.771545584045584, + "grad_norm": 0.7024806141853333, + "learning_rate": 0.00011801385364614055, + "loss": 1.1351, + "step": 9950 + }, + { + "epoch": 1.7717236467236468, + "grad_norm": 0.6553003191947937, + "learning_rate": 0.00011800008497869968, + "loss": 0.911, + "step": 9951 + }, + { + "epoch": 1.7719017094017095, + "grad_norm": 0.6883971691131592, + "learning_rate": 0.00011798631595858454, + "loss": 1.0099, + "step": 9952 + }, + { + "epoch": 1.772079772079772, + "grad_norm": 0.7106832265853882, + "learning_rate": 0.00011797254658606489, + "loss": 1.0298, + "step": 9953 + }, + { + "epoch": 1.7722578347578346, + "grad_norm": 0.7902877926826477, + "learning_rate": 0.00011795877686141055, + "loss": 1.0572, + "step": 9954 + }, + { + "epoch": 1.7724358974358974, + "grad_norm": 0.7105007171630859, + "learning_rate": 0.00011794500678489126, + "loss": 1.1725, + "step": 9955 + }, + { + "epoch": 1.77261396011396, + "grad_norm": 0.7314959764480591, + "learning_rate": 0.00011793123635677685, + "loss": 1.1074, + "step": 9956 + }, + { + "epoch": 1.7727920227920229, + "grad_norm": 0.6358618140220642, + "learning_rate": 0.00011791746557733712, + "loss": 0.8786, + "step": 9957 + }, + { + "epoch": 1.7729700854700856, + "grad_norm": 0.6441367864608765, + "learning_rate": 0.00011790369444684187, + "loss": 1.1332, + "step": 9958 + }, + { + "epoch": 1.7731481481481481, + "grad_norm": 0.686787486076355, + "learning_rate": 0.0001178899229655609, + "loss": 0.9566, + "step": 9959 + }, + { + "epoch": 1.7733262108262107, + "grad_norm": 0.653840184211731, + "learning_rate": 0.00011787615113376407, + "loss": 0.8763, + "step": 9960 + }, + { + "epoch": 1.7735042735042734, + "grad_norm": 0.7106643915176392, + "learning_rate": 0.00011786237895172119, + "loss": 0.9929, + "step": 9961 + }, + { + "epoch": 1.7736823361823362, + "grad_norm": 0.6634044051170349, + "learning_rate": 
0.0001178486064197021, + "loss": 0.7467, + "step": 9962 + }, + { + "epoch": 1.773860398860399, + "grad_norm": 0.7087352871894836, + "learning_rate": 0.00011783483353797663, + "loss": 1.0104, + "step": 9963 + }, + { + "epoch": 1.7740384615384617, + "grad_norm": 0.8088061213493347, + "learning_rate": 0.00011782106030681466, + "loss": 1.0376, + "step": 9964 + }, + { + "epoch": 1.7742165242165242, + "grad_norm": 0.7204688787460327, + "learning_rate": 0.00011780728672648604, + "loss": 0.8556, + "step": 9965 + }, + { + "epoch": 1.7743945868945867, + "grad_norm": 0.7893314957618713, + "learning_rate": 0.0001177935127972606, + "loss": 0.9764, + "step": 9966 + }, + { + "epoch": 1.7745726495726495, + "grad_norm": 0.6098896265029907, + "learning_rate": 0.00011777973851940826, + "loss": 0.9407, + "step": 9967 + }, + { + "epoch": 1.7747507122507122, + "grad_norm": 0.6420868039131165, + "learning_rate": 0.0001177659638931989, + "loss": 1.1328, + "step": 9968 + }, + { + "epoch": 1.774928774928775, + "grad_norm": 0.7732378244400024, + "learning_rate": 0.00011775218891890234, + "loss": 1.1236, + "step": 9969 + }, + { + "epoch": 1.7751068376068377, + "grad_norm": 0.6591582894325256, + "learning_rate": 0.00011773841359678855, + "loss": 1.1523, + "step": 9970 + }, + { + "epoch": 1.7752849002849003, + "grad_norm": 0.6337170004844666, + "learning_rate": 0.00011772463792712738, + "loss": 1.1998, + "step": 9971 + }, + { + "epoch": 1.7754629629629628, + "grad_norm": 0.6400532126426697, + "learning_rate": 0.00011771086191018874, + "loss": 0.9543, + "step": 9972 + }, + { + "epoch": 1.7756410256410255, + "grad_norm": 0.6431527733802795, + "learning_rate": 0.00011769708554624257, + "loss": 0.8164, + "step": 9973 + }, + { + "epoch": 1.7758190883190883, + "grad_norm": 0.7303599119186401, + "learning_rate": 0.00011768330883555876, + "loss": 0.9553, + "step": 9974 + }, + { + "epoch": 1.775997150997151, + "grad_norm": 0.7838605642318726, + "learning_rate": 0.00011766953177840725, + "loss": 0.9759, 
+ "step": 9975 + }, + { + "epoch": 1.7761752136752138, + "grad_norm": 0.6505265831947327, + "learning_rate": 0.00011765575437505796, + "loss": 0.8527, + "step": 9976 + }, + { + "epoch": 1.7763532763532763, + "grad_norm": 0.7336180806159973, + "learning_rate": 0.00011764197662578086, + "loss": 1.1098, + "step": 9977 + }, + { + "epoch": 1.776531339031339, + "grad_norm": 0.7040138244628906, + "learning_rate": 0.00011762819853084586, + "loss": 1.1289, + "step": 9978 + }, + { + "epoch": 1.7767094017094016, + "grad_norm": 0.6414867043495178, + "learning_rate": 0.00011761442009052293, + "loss": 1.0826, + "step": 9979 + }, + { + "epoch": 1.7768874643874644, + "grad_norm": 0.6760666370391846, + "learning_rate": 0.00011760064130508204, + "loss": 1.0188, + "step": 9980 + }, + { + "epoch": 1.7770655270655271, + "grad_norm": 0.7864978909492493, + "learning_rate": 0.00011758686217479316, + "loss": 1.1938, + "step": 9981 + }, + { + "epoch": 1.7772435897435899, + "grad_norm": 0.7964870929718018, + "learning_rate": 0.00011757308269992622, + "loss": 0.9876, + "step": 9982 + }, + { + "epoch": 1.7774216524216524, + "grad_norm": 0.5158692002296448, + "learning_rate": 0.00011755930288075123, + "loss": 0.6508, + "step": 9983 + }, + { + "epoch": 1.7775997150997151, + "grad_norm": 0.7208606600761414, + "learning_rate": 0.00011754552271753819, + "loss": 1.0738, + "step": 9984 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 0.6811334490776062, + "learning_rate": 0.00011753174221055705, + "loss": 1.1216, + "step": 9985 + }, + { + "epoch": 1.7779558404558404, + "grad_norm": 0.6389986276626587, + "learning_rate": 0.00011751796136007787, + "loss": 0.9664, + "step": 9986 + }, + { + "epoch": 1.7781339031339032, + "grad_norm": 0.7081875205039978, + "learning_rate": 0.00011750418016637064, + "loss": 0.9365, + "step": 9987 + }, + { + "epoch": 1.778311965811966, + "grad_norm": 0.7291778326034546, + "learning_rate": 0.00011749039862970535, + "loss": 1.3222, + "step": 9988 + }, + { + "epoch": 
1.7784900284900285, + "grad_norm": 0.6790453791618347, + "learning_rate": 0.000117476616750352, + "loss": 0.9537, + "step": 9989 + }, + { + "epoch": 1.7786680911680912, + "grad_norm": 0.6271076202392578, + "learning_rate": 0.00011746283452858069, + "loss": 0.9842, + "step": 9990 + }, + { + "epoch": 1.7788461538461537, + "grad_norm": 0.675628662109375, + "learning_rate": 0.00011744905196466138, + "loss": 0.8675, + "step": 9991 + }, + { + "epoch": 1.7790242165242165, + "grad_norm": 0.7328314185142517, + "learning_rate": 0.00011743526905886417, + "loss": 0.9793, + "step": 9992 + }, + { + "epoch": 1.7792022792022792, + "grad_norm": 0.698764979839325, + "learning_rate": 0.00011742148581145908, + "loss": 0.9527, + "step": 9993 + }, + { + "epoch": 1.779380341880342, + "grad_norm": 0.6911364793777466, + "learning_rate": 0.00011740770222271616, + "loss": 1.1069, + "step": 9994 + }, + { + "epoch": 1.7795584045584045, + "grad_norm": 0.6990836262702942, + "learning_rate": 0.00011739391829290547, + "loss": 0.9132, + "step": 9995 + }, + { + "epoch": 1.7797364672364673, + "grad_norm": 0.7056801319122314, + "learning_rate": 0.0001173801340222971, + "loss": 1.053, + "step": 9996 + }, + { + "epoch": 1.7799145299145298, + "grad_norm": 0.7453791499137878, + "learning_rate": 0.0001173663494111611, + "loss": 0.8806, + "step": 9997 + }, + { + "epoch": 1.7800925925925926, + "grad_norm": 0.7211771011352539, + "learning_rate": 0.00011735256445976757, + "loss": 0.9968, + "step": 9998 + }, + { + "epoch": 1.7802706552706553, + "grad_norm": 0.7259734272956848, + "learning_rate": 0.00011733877916838656, + "loss": 1.167, + "step": 9999 + }, + { + "epoch": 1.780448717948718, + "grad_norm": 0.6931926012039185, + "learning_rate": 0.00011732499353728821, + "loss": 1.0634, + "step": 10000 + }, + { + "epoch": 1.7806267806267806, + "grad_norm": 0.6900074481964111, + "learning_rate": 0.00011731120756674259, + "loss": 0.9718, + "step": 10001 + }, + { + "epoch": 1.7808048433048433, + "grad_norm": 
0.6817582845687866, + "learning_rate": 0.00011729742125701984, + "loss": 1.0896, + "step": 10002 + }, + { + "epoch": 1.7809829059829059, + "grad_norm": 0.6901891231536865, + "learning_rate": 0.00011728363460839003, + "loss": 1.0163, + "step": 10003 + }, + { + "epoch": 1.7811609686609686, + "grad_norm": 0.9138323664665222, + "learning_rate": 0.00011726984762112328, + "loss": 1.1713, + "step": 10004 + }, + { + "epoch": 1.7813390313390314, + "grad_norm": 0.6105810403823853, + "learning_rate": 0.00011725606029548977, + "loss": 0.9331, + "step": 10005 + }, + { + "epoch": 1.7815170940170941, + "grad_norm": 0.5605259537696838, + "learning_rate": 0.0001172422726317596, + "loss": 0.7154, + "step": 10006 + }, + { + "epoch": 1.7816951566951567, + "grad_norm": 0.6950963735580444, + "learning_rate": 0.00011722848463020292, + "loss": 1.0093, + "step": 10007 + }, + { + "epoch": 1.7818732193732194, + "grad_norm": 0.6806309819221497, + "learning_rate": 0.00011721469629108988, + "loss": 0.8662, + "step": 10008 + }, + { + "epoch": 1.782051282051282, + "grad_norm": 0.7528520226478577, + "learning_rate": 0.00011720090761469063, + "loss": 0.8567, + "step": 10009 + }, + { + "epoch": 1.7822293447293447, + "grad_norm": 0.6617229580879211, + "learning_rate": 0.00011718711860127529, + "loss": 1.0378, + "step": 10010 + }, + { + "epoch": 1.7824074074074074, + "grad_norm": 0.6468376517295837, + "learning_rate": 0.00011717332925111411, + "loss": 1.0658, + "step": 10011 + }, + { + "epoch": 1.7825854700854702, + "grad_norm": 0.7141897082328796, + "learning_rate": 0.00011715953956447721, + "loss": 1.023, + "step": 10012 + }, + { + "epoch": 1.7827635327635327, + "grad_norm": 0.5777570605278015, + "learning_rate": 0.00011714574954163475, + "loss": 0.9154, + "step": 10013 + }, + { + "epoch": 1.7829415954415955, + "grad_norm": 0.7536137700080872, + "learning_rate": 0.00011713195918285695, + "loss": 0.9651, + "step": 10014 + }, + { + "epoch": 1.783119658119658, + "grad_norm": 0.6977683305740356, + 
"learning_rate": 0.00011711816848841402, + "loss": 0.7977, + "step": 10015 + }, + { + "epoch": 1.7832977207977208, + "grad_norm": 0.6522472500801086, + "learning_rate": 0.00011710437745857614, + "loss": 0.8834, + "step": 10016 + }, + { + "epoch": 1.7834757834757835, + "grad_norm": 0.6263057589530945, + "learning_rate": 0.0001170905860936135, + "loss": 1.0576, + "step": 10017 + }, + { + "epoch": 1.7836538461538463, + "grad_norm": 0.6470699310302734, + "learning_rate": 0.00011707679439379635, + "loss": 0.9412, + "step": 10018 + }, + { + "epoch": 1.7838319088319088, + "grad_norm": Infinity, + "learning_rate": 0.00011707679439379635, + "loss": 1.1746, + "step": 10019 + }, + { + "epoch": 1.7840099715099715, + "grad_norm": 0.6022017002105713, + "learning_rate": 0.00011706300235939485, + "loss": 0.8945, + "step": 10020 + }, + { + "epoch": 1.784188034188034, + "grad_norm": 0.637208104133606, + "learning_rate": 0.00011704920999067927, + "loss": 1.0215, + "step": 10021 + }, + { + "epoch": 1.7843660968660968, + "grad_norm": 0.7467851042747498, + "learning_rate": 0.00011703541728791987, + "loss": 1.0341, + "step": 10022 + }, + { + "epoch": 1.7845441595441596, + "grad_norm": 0.7562711238861084, + "learning_rate": 0.00011702162425138683, + "loss": 0.9748, + "step": 10023 + }, + { + "epoch": 1.7847222222222223, + "grad_norm": 0.6480089426040649, + "learning_rate": 0.00011700783088135043, + "loss": 1.05, + "step": 10024 + }, + { + "epoch": 1.7849002849002849, + "grad_norm": 0.6293981671333313, + "learning_rate": 0.00011699403717808091, + "loss": 1.0376, + "step": 10025 + }, + { + "epoch": 1.7850783475783476, + "grad_norm": 0.6821253895759583, + "learning_rate": 0.00011698024314184853, + "loss": 1.0542, + "step": 10026 + }, + { + "epoch": 1.7852564102564101, + "grad_norm": 0.6681216359138489, + "learning_rate": 0.00011696644877292356, + "loss": 1.0018, + "step": 10027 + }, + { + "epoch": 1.7854344729344729, + "grad_norm": 0.6788804531097412, + "learning_rate": 
0.00011695265407157628, + "loss": 1.1823, + "step": 10028 + }, + { + "epoch": 1.7856125356125356, + "grad_norm": 0.6147881150245667, + "learning_rate": 0.00011693885903807697, + "loss": 0.9246, + "step": 10029 + }, + { + "epoch": 1.7857905982905984, + "grad_norm": 0.7952296137809753, + "learning_rate": 0.00011692506367269588, + "loss": 1.0528, + "step": 10030 + }, + { + "epoch": 1.785968660968661, + "grad_norm": 0.6985954642295837, + "learning_rate": 0.00011691126797570333, + "loss": 0.9173, + "step": 10031 + }, + { + "epoch": 1.7861467236467237, + "grad_norm": 0.6211223602294922, + "learning_rate": 0.00011689747194736961, + "loss": 0.7527, + "step": 10032 + }, + { + "epoch": 1.7863247863247862, + "grad_norm": 0.7531208992004395, + "learning_rate": 0.00011688367558796507, + "loss": 1.1087, + "step": 10033 + }, + { + "epoch": 1.786502849002849, + "grad_norm": 0.7742924690246582, + "learning_rate": 0.00011686987889775996, + "loss": 1.1512, + "step": 10034 + }, + { + "epoch": 1.7866809116809117, + "grad_norm": 0.7046231627464294, + "learning_rate": 0.00011685608187702459, + "loss": 1.0516, + "step": 10035 + }, + { + "epoch": 1.7868589743589745, + "grad_norm": 0.6264076232910156, + "learning_rate": 0.00011684228452602933, + "loss": 0.8938, + "step": 10036 + }, + { + "epoch": 1.7870370370370372, + "grad_norm": 0.6342145800590515, + "learning_rate": 0.00011682848684504448, + "loss": 0.8177, + "step": 10037 + }, + { + "epoch": 1.7872150997150997, + "grad_norm": 0.6609861254692078, + "learning_rate": 0.00011681468883434041, + "loss": 0.9692, + "step": 10038 + }, + { + "epoch": 1.7873931623931623, + "grad_norm": 0.7918622493743896, + "learning_rate": 0.00011680089049418743, + "loss": 0.8246, + "step": 10039 + }, + { + "epoch": 1.787571225071225, + "grad_norm": 0.697712779045105, + "learning_rate": 0.00011678709182485592, + "loss": 0.8981, + "step": 10040 + }, + { + "epoch": 1.7877492877492878, + "grad_norm": 0.6747658252716064, + "learning_rate": 0.00011677329282661617, + 
"loss": 1.1243, + "step": 10041 + }, + { + "epoch": 1.7879273504273505, + "grad_norm": 0.6525771617889404, + "learning_rate": 0.00011675949349973863, + "loss": 0.852, + "step": 10042 + }, + { + "epoch": 1.7881054131054133, + "grad_norm": 0.7062464952468872, + "learning_rate": 0.00011674569384449363, + "loss": 1.2582, + "step": 10043 + }, + { + "epoch": 1.7882834757834758, + "grad_norm": 0.6453786492347717, + "learning_rate": 0.00011673189386115154, + "loss": 0.868, + "step": 10044 + }, + { + "epoch": 1.7884615384615383, + "grad_norm": 0.7939708232879639, + "learning_rate": 0.00011671809354998273, + "loss": 0.7553, + "step": 10045 + }, + { + "epoch": 1.788639601139601, + "grad_norm": 0.6466066837310791, + "learning_rate": 0.00011670429291125761, + "loss": 0.942, + "step": 10046 + }, + { + "epoch": 1.7888176638176638, + "grad_norm": 0.7380510568618774, + "learning_rate": 0.00011669049194524657, + "loss": 1.044, + "step": 10047 + }, + { + "epoch": 1.7889957264957266, + "grad_norm": 0.6719707250595093, + "learning_rate": 0.00011667669065222002, + "loss": 1.1624, + "step": 10048 + }, + { + "epoch": 1.7891737891737893, + "grad_norm": 0.6996603012084961, + "learning_rate": 0.00011666288903244837, + "loss": 1.001, + "step": 10049 + }, + { + "epoch": 1.7893518518518519, + "grad_norm": 0.696590006351471, + "learning_rate": 0.00011664908708620202, + "loss": 1.17, + "step": 10050 + }, + { + "epoch": 1.7895299145299144, + "grad_norm": 0.7226764559745789, + "learning_rate": 0.00011663528481375137, + "loss": 1.0762, + "step": 10051 + }, + { + "epoch": 1.7897079772079771, + "grad_norm": 0.6117866635322571, + "learning_rate": 0.00011662148221536689, + "loss": 0.9199, + "step": 10052 + }, + { + "epoch": 1.78988603988604, + "grad_norm": 0.6424985527992249, + "learning_rate": 0.000116607679291319, + "loss": 1.1672, + "step": 10053 + }, + { + "epoch": 1.7900641025641026, + "grad_norm": 0.6390290856361389, + "learning_rate": 0.00011659387604187813, + "loss": 1.1895, + "step": 10054 + }, 
+ { + "epoch": 1.7902421652421654, + "grad_norm": 0.6553205251693726, + "learning_rate": 0.00011658007246731473, + "loss": 1.0967, + "step": 10055 + }, + { + "epoch": 1.790420227920228, + "grad_norm": 0.7737570405006409, + "learning_rate": 0.00011656626856789922, + "loss": 0.9637, + "step": 10056 + }, + { + "epoch": 1.7905982905982905, + "grad_norm": 0.644296407699585, + "learning_rate": 0.00011655246434390212, + "loss": 0.9933, + "step": 10057 + }, + { + "epoch": 1.7907763532763532, + "grad_norm": 0.8154410123825073, + "learning_rate": 0.00011653865979559388, + "loss": 0.9623, + "step": 10058 + }, + { + "epoch": 1.790954415954416, + "grad_norm": 0.7181384563446045, + "learning_rate": 0.00011652485492324495, + "loss": 0.9113, + "step": 10059 + }, + { + "epoch": 1.7911324786324787, + "grad_norm": 0.7835097908973694, + "learning_rate": 0.00011651104972712582, + "loss": 1.0804, + "step": 10060 + }, + { + "epoch": 1.7913105413105415, + "grad_norm": 0.6843693852424622, + "learning_rate": 0.00011649724420750691, + "loss": 1.0242, + "step": 10061 + }, + { + "epoch": 1.791488603988604, + "grad_norm": 0.8364703059196472, + "learning_rate": 0.00011648343836465885, + "loss": 0.8445, + "step": 10062 + }, + { + "epoch": 1.7916666666666665, + "grad_norm": 0.7122092843055725, + "learning_rate": 0.00011646963219885201, + "loss": 1.0453, + "step": 10063 + }, + { + "epoch": 1.7918447293447293, + "grad_norm": 0.7018755078315735, + "learning_rate": 0.00011645582571035696, + "loss": 0.9753, + "step": 10064 + }, + { + "epoch": 1.792022792022792, + "grad_norm": 0.6522594094276428, + "learning_rate": 0.00011644201889944419, + "loss": 1.0328, + "step": 10065 + }, + { + "epoch": 1.7922008547008548, + "grad_norm": 0.70301353931427, + "learning_rate": 0.00011642821176638419, + "loss": 0.9143, + "step": 10066 + }, + { + "epoch": 1.7923789173789175, + "grad_norm": 0.6255469918251038, + "learning_rate": 0.0001164144043114475, + "loss": 0.9527, + "step": 10067 + }, + { + "epoch": 
1.79255698005698, + "grad_norm": 0.6780602931976318, + "learning_rate": 0.0001164005965349047, + "loss": 0.9192, + "step": 10068 + }, + { + "epoch": 1.7927350427350426, + "grad_norm": 0.6025984287261963, + "learning_rate": 0.00011638678843702626, + "loss": 0.9055, + "step": 10069 + }, + { + "epoch": 1.7929131054131053, + "grad_norm": 0.6430829763412476, + "learning_rate": 0.00011637298001808275, + "loss": 0.9359, + "step": 10070 + }, + { + "epoch": 1.793091168091168, + "grad_norm": 0.6388106942176819, + "learning_rate": 0.0001163591712783447, + "loss": 0.8847, + "step": 10071 + }, + { + "epoch": 1.7932692307692308, + "grad_norm": 0.706347644329071, + "learning_rate": 0.00011634536221808265, + "loss": 0.9055, + "step": 10072 + }, + { + "epoch": 1.7934472934472936, + "grad_norm": 0.661226749420166, + "learning_rate": 0.00011633155283756721, + "loss": 1.118, + "step": 10073 + }, + { + "epoch": 1.7936253561253561, + "grad_norm": 0.543207049369812, + "learning_rate": 0.00011631774313706891, + "loss": 0.8856, + "step": 10074 + }, + { + "epoch": 1.7938034188034186, + "grad_norm": 0.6514154672622681, + "learning_rate": 0.00011630393311685835, + "loss": 0.8967, + "step": 10075 + }, + { + "epoch": 1.7939814814814814, + "grad_norm": 0.8669198155403137, + "learning_rate": 0.00011629012277720607, + "loss": 1.0362, + "step": 10076 + }, + { + "epoch": 1.7941595441595442, + "grad_norm": 0.7256068587303162, + "learning_rate": 0.00011627631211838266, + "loss": 1.1948, + "step": 10077 + }, + { + "epoch": 1.794337606837607, + "grad_norm": 0.6504935622215271, + "learning_rate": 0.00011626250114065875, + "loss": 0.8309, + "step": 10078 + }, + { + "epoch": 1.7945156695156697, + "grad_norm": 0.6964160799980164, + "learning_rate": 0.0001162486898443049, + "loss": 0.9593, + "step": 10079 + }, + { + "epoch": 1.7946937321937322, + "grad_norm": 0.668727695941925, + "learning_rate": 0.00011623487822959174, + "loss": 0.8897, + "step": 10080 + }, + { + "epoch": 1.7948717948717947, + "grad_norm": 
0.6907223463058472, + "learning_rate": 0.00011622106629678986, + "loss": 0.897, + "step": 10081 + }, + { + "epoch": 1.7950498575498575, + "grad_norm": 0.6652865409851074, + "learning_rate": 0.00011620725404616985, + "loss": 0.9321, + "step": 10082 + }, + { + "epoch": 1.7952279202279202, + "grad_norm": 0.6523811221122742, + "learning_rate": 0.00011619344147800239, + "loss": 0.8991, + "step": 10083 + }, + { + "epoch": 1.795405982905983, + "grad_norm": 0.6162952184677124, + "learning_rate": 0.0001161796285925581, + "loss": 0.8061, + "step": 10084 + }, + { + "epoch": 1.7955840455840457, + "grad_norm": 0.670606791973114, + "learning_rate": 0.0001161658153901076, + "loss": 0.9341, + "step": 10085 + }, + { + "epoch": 1.7957621082621082, + "grad_norm": 0.6372489333152771, + "learning_rate": 0.00011615200187092148, + "loss": 1.1049, + "step": 10086 + }, + { + "epoch": 1.7959401709401708, + "grad_norm": 0.7311037182807922, + "learning_rate": 0.00011613818803527045, + "loss": 1.0881, + "step": 10087 + }, + { + "epoch": 1.7961182336182335, + "grad_norm": 0.7440751194953918, + "learning_rate": 0.00011612437388342518, + "loss": 0.9487, + "step": 10088 + }, + { + "epoch": 1.7962962962962963, + "grad_norm": 0.6605934500694275, + "learning_rate": 0.00011611055941565629, + "loss": 0.8757, + "step": 10089 + }, + { + "epoch": 1.796474358974359, + "grad_norm": 0.7546001076698303, + "learning_rate": 0.00011609674463223446, + "loss": 0.9368, + "step": 10090 + }, + { + "epoch": 1.7966524216524218, + "grad_norm": 0.7001389861106873, + "learning_rate": 0.00011608292953343036, + "loss": 0.9098, + "step": 10091 + }, + { + "epoch": 1.7968304843304843, + "grad_norm": 0.6898102760314941, + "learning_rate": 0.00011606911411951462, + "loss": 0.8821, + "step": 10092 + }, + { + "epoch": 1.797008547008547, + "grad_norm": 0.7020773887634277, + "learning_rate": 0.00011605529839075801, + "loss": 1.2775, + "step": 10093 + }, + { + "epoch": 1.7971866096866096, + "grad_norm": 0.6061446070671082, + 
"learning_rate": 0.0001160414823474312, + "loss": 1.0156, + "step": 10094 + }, + { + "epoch": 1.7973646723646723, + "grad_norm": 0.6746069192886353, + "learning_rate": 0.00011602766598980484, + "loss": 0.8223, + "step": 10095 + }, + { + "epoch": 1.797542735042735, + "grad_norm": 0.655829131603241, + "learning_rate": 0.00011601384931814967, + "loss": 0.9482, + "step": 10096 + }, + { + "epoch": 1.7977207977207978, + "grad_norm": 0.6762703061103821, + "learning_rate": 0.00011600003233273636, + "loss": 1.0191, + "step": 10097 + }, + { + "epoch": 1.7978988603988604, + "grad_norm": 0.7610527276992798, + "learning_rate": 0.00011598621503383566, + "loss": 1.0771, + "step": 10098 + }, + { + "epoch": 1.7980769230769231, + "grad_norm": 0.6857240200042725, + "learning_rate": 0.0001159723974217183, + "loss": 0.8325, + "step": 10099 + }, + { + "epoch": 1.7982549857549857, + "grad_norm": 0.6897954940795898, + "learning_rate": 0.00011595857949665501, + "loss": 1.0064, + "step": 10100 + }, + { + "epoch": 1.7984330484330484, + "grad_norm": 0.7023211717605591, + "learning_rate": 0.00011594476125891649, + "loss": 1.1346, + "step": 10101 + }, + { + "epoch": 1.7986111111111112, + "grad_norm": 0.8131003975868225, + "learning_rate": 0.00011593094270877347, + "loss": 1.0384, + "step": 10102 + }, + { + "epoch": 1.798789173789174, + "grad_norm": 0.6504445672035217, + "learning_rate": 0.00011591712384649676, + "loss": 0.8172, + "step": 10103 + }, + { + "epoch": 1.7989672364672364, + "grad_norm": 0.7379748821258545, + "learning_rate": 0.00011590330467235704, + "loss": 1.0118, + "step": 10104 + }, + { + "epoch": 1.7991452991452992, + "grad_norm": 0.8867329955101013, + "learning_rate": 0.0001158894851866251, + "loss": 1.023, + "step": 10105 + }, + { + "epoch": 1.7993233618233617, + "grad_norm": 0.7057412266731262, + "learning_rate": 0.00011587566538957173, + "loss": 0.8415, + "step": 10106 + }, + { + "epoch": 1.7995014245014245, + "grad_norm": 0.7479654550552368, + "learning_rate": 
0.00011586184528146769, + "loss": 0.9663, + "step": 10107 + }, + { + "epoch": 1.7996794871794872, + "grad_norm": 0.6280845403671265, + "learning_rate": 0.00011584802486258368, + "loss": 0.973, + "step": 10108 + }, + { + "epoch": 1.79985754985755, + "grad_norm": 0.6735749840736389, + "learning_rate": 0.00011583420413319059, + "loss": 0.8631, + "step": 10109 + }, + { + "epoch": 1.8000356125356125, + "grad_norm": 0.5940406918525696, + "learning_rate": 0.00011582038309355918, + "loss": 0.8533, + "step": 10110 + }, + { + "epoch": 1.8002136752136753, + "grad_norm": 0.6923874020576477, + "learning_rate": 0.00011580656174396021, + "loss": 1.1105, + "step": 10111 + }, + { + "epoch": 1.8003917378917378, + "grad_norm": 0.6996715664863586, + "learning_rate": 0.00011579274008466447, + "loss": 0.9952, + "step": 10112 + }, + { + "epoch": 1.8005698005698005, + "grad_norm": 0.656561553478241, + "learning_rate": 0.00011577891811594281, + "loss": 0.9621, + "step": 10113 + }, + { + "epoch": 1.8007478632478633, + "grad_norm": 0.7121242880821228, + "learning_rate": 0.00011576509583806605, + "loss": 0.8658, + "step": 10114 + }, + { + "epoch": 1.800925925925926, + "grad_norm": 0.7864459753036499, + "learning_rate": 0.00011575127325130498, + "loss": 0.9867, + "step": 10115 + }, + { + "epoch": 1.8011039886039886, + "grad_norm": 0.6086452007293701, + "learning_rate": 0.00011573745035593042, + "loss": 0.8625, + "step": 10116 + }, + { + "epoch": 1.8012820512820513, + "grad_norm": 0.6553642749786377, + "learning_rate": 0.00011572362715221321, + "loss": 0.8475, + "step": 10117 + }, + { + "epoch": 1.8014601139601139, + "grad_norm": 0.6677348017692566, + "learning_rate": 0.00011570980364042419, + "loss": 0.9672, + "step": 10118 + }, + { + "epoch": 1.8016381766381766, + "grad_norm": 0.6275015473365784, + "learning_rate": 0.0001156959798208342, + "loss": 0.8663, + "step": 10119 + }, + { + "epoch": 1.8018162393162394, + "grad_norm": 0.787568211555481, + "learning_rate": 0.0001156821556937141, + 
"loss": 1.0188, + "step": 10120 + }, + { + "epoch": 1.801994301994302, + "grad_norm": 0.6983163356781006, + "learning_rate": 0.00011566833125933473, + "loss": 1.0767, + "step": 10121 + }, + { + "epoch": 1.8021723646723646, + "grad_norm": 0.7008936405181885, + "learning_rate": 0.00011565450651796695, + "loss": 1.0116, + "step": 10122 + }, + { + "epoch": 1.8023504273504274, + "grad_norm": 0.7694976925849915, + "learning_rate": 0.00011564068146988163, + "loss": 1.0227, + "step": 10123 + }, + { + "epoch": 1.80252849002849, + "grad_norm": 0.9530014991760254, + "learning_rate": 0.00011562685611534967, + "loss": 0.907, + "step": 10124 + }, + { + "epoch": 1.8027065527065527, + "grad_norm": 0.6714984178543091, + "learning_rate": 0.00011561303045464189, + "loss": 0.9501, + "step": 10125 + }, + { + "epoch": 1.8028846153846154, + "grad_norm": 0.7233797311782837, + "learning_rate": 0.00011559920448802925, + "loss": 1.021, + "step": 10126 + }, + { + "epoch": 1.8030626780626782, + "grad_norm": 0.7600540518760681, + "learning_rate": 0.0001155853782157826, + "loss": 1.1056, + "step": 10127 + }, + { + "epoch": 1.8032407407407407, + "grad_norm": 0.7836297750473022, + "learning_rate": 0.00011557155163817281, + "loss": 0.9906, + "step": 10128 + }, + { + "epoch": 1.8034188034188035, + "grad_norm": 0.7161104083061218, + "learning_rate": 0.00011555772475547084, + "loss": 0.9541, + "step": 10129 + }, + { + "epoch": 1.803596866096866, + "grad_norm": 0.6613732576370239, + "learning_rate": 0.00011554389756794757, + "loss": 0.9188, + "step": 10130 + }, + { + "epoch": 1.8037749287749287, + "grad_norm": 0.6415915489196777, + "learning_rate": 0.00011553007007587391, + "loss": 0.9928, + "step": 10131 + }, + { + "epoch": 1.8039529914529915, + "grad_norm": 0.7730516195297241, + "learning_rate": 0.0001155162422795208, + "loss": 1.0654, + "step": 10132 + }, + { + "epoch": 1.8041310541310542, + "grad_norm": 0.6769654750823975, + "learning_rate": 0.00011550241417915913, + "loss": 1.0678, + "step": 10133 
+ }, + { + "epoch": 1.8043091168091168, + "grad_norm": 0.6542425751686096, + "learning_rate": 0.00011548858577505988, + "loss": 0.9796, + "step": 10134 + }, + { + "epoch": 1.8044871794871795, + "grad_norm": 0.7282404899597168, + "learning_rate": 0.00011547475706749395, + "loss": 1.0314, + "step": 10135 + }, + { + "epoch": 1.804665242165242, + "grad_norm": 0.6450245976448059, + "learning_rate": 0.00011546092805673232, + "loss": 0.9564, + "step": 10136 + }, + { + "epoch": 1.8048433048433048, + "grad_norm": 0.65577632188797, + "learning_rate": 0.0001154470987430459, + "loss": 1.0219, + "step": 10137 + }, + { + "epoch": 1.8050213675213675, + "grad_norm": 0.7151737809181213, + "learning_rate": 0.00011543326912670567, + "loss": 0.9245, + "step": 10138 + }, + { + "epoch": 1.8051994301994303, + "grad_norm": 0.6695905327796936, + "learning_rate": 0.00011541943920798259, + "loss": 0.9535, + "step": 10139 + }, + { + "epoch": 1.8053774928774928, + "grad_norm": 0.7443813681602478, + "learning_rate": 0.00011540560898714767, + "loss": 1.1697, + "step": 10140 + }, + { + "epoch": 1.8055555555555556, + "grad_norm": 0.5701992511749268, + "learning_rate": 0.0001153917784644718, + "loss": 0.7868, + "step": 10141 + }, + { + "epoch": 1.805733618233618, + "grad_norm": 0.6992354989051819, + "learning_rate": 0.00011537794764022605, + "loss": 0.9856, + "step": 10142 + }, + { + "epoch": 1.8059116809116809, + "grad_norm": 0.6354477405548096, + "learning_rate": 0.00011536411651468131, + "loss": 0.8752, + "step": 10143 + }, + { + "epoch": 1.8060897435897436, + "grad_norm": 0.6952932476997375, + "learning_rate": 0.00011535028508810864, + "loss": 0.9446, + "step": 10144 + }, + { + "epoch": 1.8062678062678064, + "grad_norm": 0.5527541637420654, + "learning_rate": 0.00011533645336077901, + "loss": 0.5486, + "step": 10145 + }, + { + "epoch": 1.806445868945869, + "grad_norm": 0.685046374797821, + "learning_rate": 0.00011532262133296345, + "loss": 0.9529, + "step": 10146 + }, + { + "epoch": 
1.8066239316239316, + "grad_norm": 0.6927558779716492, + "learning_rate": 0.00011530878900493296, + "loss": 1.1758, + "step": 10147 + }, + { + "epoch": 1.8068019943019942, + "grad_norm": 0.6758309602737427, + "learning_rate": 0.00011529495637695855, + "loss": 1.0076, + "step": 10148 + }, + { + "epoch": 1.806980056980057, + "grad_norm": 0.6739441156387329, + "learning_rate": 0.00011528112344931121, + "loss": 1.1914, + "step": 10149 + }, + { + "epoch": 1.8071581196581197, + "grad_norm": 0.7031944394111633, + "learning_rate": 0.00011526729022226204, + "loss": 0.783, + "step": 10150 + }, + { + "epoch": 1.8073361823361824, + "grad_norm": 0.6476930975914001, + "learning_rate": 0.00011525345669608202, + "loss": 0.9595, + "step": 10151 + }, + { + "epoch": 1.8075142450142452, + "grad_norm": 0.710498571395874, + "learning_rate": 0.00011523962287104222, + "loss": 0.8821, + "step": 10152 + }, + { + "epoch": 1.8076923076923077, + "grad_norm": 0.6664412617683411, + "learning_rate": 0.00011522578874741365, + "loss": 1.0182, + "step": 10153 + }, + { + "epoch": 1.8078703703703702, + "grad_norm": 0.8374263048171997, + "learning_rate": 0.00011521195432546737, + "loss": 0.9394, + "step": 10154 + }, + { + "epoch": 1.808048433048433, + "grad_norm": 0.6770764589309692, + "learning_rate": 0.00011519811960547447, + "loss": 1.0568, + "step": 10155 + }, + { + "epoch": 1.8082264957264957, + "grad_norm": 0.7014045715332031, + "learning_rate": 0.00011518428458770595, + "loss": 1.1705, + "step": 10156 + }, + { + "epoch": 1.8084045584045585, + "grad_norm": 0.6590061187744141, + "learning_rate": 0.00011517044927243295, + "loss": 1.1233, + "step": 10157 + }, + { + "epoch": 1.8085826210826212, + "grad_norm": 0.6093801856040955, + "learning_rate": 0.00011515661365992647, + "loss": 0.953, + "step": 10158 + }, + { + "epoch": 1.8087606837606838, + "grad_norm": 0.6197089552879333, + "learning_rate": 0.00011514277775045768, + "loss": 0.9414, + "step": 10159 + }, + { + "epoch": 1.8089387464387463, + 
"grad_norm": 0.7530463337898254, + "learning_rate": 0.00011512894154429759, + "loss": 0.9168, + "step": 10160 + }, + { + "epoch": 1.809116809116809, + "grad_norm": 0.6051347851753235, + "learning_rate": 0.00011511510504171735, + "loss": 0.9132, + "step": 10161 + }, + { + "epoch": 1.8092948717948718, + "grad_norm": 0.6388311982154846, + "learning_rate": 0.000115101268242988, + "loss": 0.6551, + "step": 10162 + }, + { + "epoch": 1.8094729344729346, + "grad_norm": 0.7040972709655762, + "learning_rate": 0.00011508743114838063, + "loss": 0.9409, + "step": 10163 + }, + { + "epoch": 1.8096509971509973, + "grad_norm": 0.7669548392295837, + "learning_rate": 0.00011507359375816644, + "loss": 1.0376, + "step": 10164 + }, + { + "epoch": 1.8098290598290598, + "grad_norm": 0.7309662699699402, + "learning_rate": 0.00011505975607261646, + "loss": 0.9071, + "step": 10165 + }, + { + "epoch": 1.8100071225071224, + "grad_norm": 0.6624547839164734, + "learning_rate": 0.00011504591809200187, + "loss": 1.0765, + "step": 10166 + }, + { + "epoch": 1.8101851851851851, + "grad_norm": 0.7719045281410217, + "learning_rate": 0.00011503207981659376, + "loss": 0.9244, + "step": 10167 + }, + { + "epoch": 1.8103632478632479, + "grad_norm": 0.6701484322547913, + "learning_rate": 0.0001150182412466633, + "loss": 0.9475, + "step": 10168 + }, + { + "epoch": 1.8105413105413106, + "grad_norm": 0.5604981184005737, + "learning_rate": 0.00011500440238248154, + "loss": 0.6268, + "step": 10169 + }, + { + "epoch": 1.8107193732193734, + "grad_norm": 0.6736510992050171, + "learning_rate": 0.00011499056322431973, + "loss": 0.9088, + "step": 10170 + }, + { + "epoch": 1.810897435897436, + "grad_norm": 0.7428455948829651, + "learning_rate": 0.00011497672377244897, + "loss": 0.9298, + "step": 10171 + }, + { + "epoch": 1.8110754985754984, + "grad_norm": 0.6543142795562744, + "learning_rate": 0.00011496288402714042, + "loss": 0.8863, + "step": 10172 + }, + { + "epoch": 1.8112535612535612, + "grad_norm": 
0.6809250712394714, + "learning_rate": 0.00011494904398866524, + "loss": 0.977, + "step": 10173 + }, + { + "epoch": 1.811431623931624, + "grad_norm": 0.8105120062828064, + "learning_rate": 0.00011493520365729456, + "loss": 1.2115, + "step": 10174 + }, + { + "epoch": 1.8116096866096867, + "grad_norm": 0.6985095143318176, + "learning_rate": 0.00011492136303329964, + "loss": 0.8233, + "step": 10175 + }, + { + "epoch": 1.8117877492877494, + "grad_norm": 0.7198361754417419, + "learning_rate": 0.00011490752211695158, + "loss": 1.0552, + "step": 10176 + }, + { + "epoch": 1.811965811965812, + "grad_norm": 0.7077036499977112, + "learning_rate": 0.0001148936809085216, + "loss": 0.9171, + "step": 10177 + }, + { + "epoch": 1.8121438746438745, + "grad_norm": 0.9362925887107849, + "learning_rate": 0.00011487983940828089, + "loss": 0.9042, + "step": 10178 + }, + { + "epoch": 1.8123219373219372, + "grad_norm": 0.6732819676399231, + "learning_rate": 0.0001148659976165006, + "loss": 1.1033, + "step": 10179 + }, + { + "epoch": 1.8125, + "grad_norm": 0.747702419757843, + "learning_rate": 0.00011485215553345201, + "loss": 1.0692, + "step": 10180 + }, + { + "epoch": 1.8126780626780628, + "grad_norm": 0.7011259198188782, + "learning_rate": 0.00011483831315940627, + "loss": 0.9278, + "step": 10181 + }, + { + "epoch": 1.8128561253561255, + "grad_norm": 0.8542702198028564, + "learning_rate": 0.00011482447049463462, + "loss": 0.9476, + "step": 10182 + }, + { + "epoch": 1.813034188034188, + "grad_norm": 0.6975166201591492, + "learning_rate": 0.00011481062753940825, + "loss": 0.9486, + "step": 10183 + }, + { + "epoch": 1.8132122507122506, + "grad_norm": 0.8239036798477173, + "learning_rate": 0.0001147967842939984, + "loss": 1.0518, + "step": 10184 + }, + { + "epoch": 1.8133903133903133, + "grad_norm": 0.7559717297554016, + "learning_rate": 0.00011478294075867628, + "loss": 1.1877, + "step": 10185 + }, + { + "epoch": 1.813568376068376, + "grad_norm": 0.6755532026290894, + "learning_rate": 
0.00011476909693371318, + "loss": 0.9287, + "step": 10186 + }, + { + "epoch": 1.8137464387464388, + "grad_norm": 0.6561332941055298, + "learning_rate": 0.0001147552528193803, + "loss": 0.83, + "step": 10187 + }, + { + "epoch": 1.8139245014245016, + "grad_norm": 0.7223508954048157, + "learning_rate": 0.00011474140841594887, + "loss": 1.1259, + "step": 10188 + }, + { + "epoch": 1.814102564102564, + "grad_norm": 0.7920593023300171, + "learning_rate": 0.0001147275637236902, + "loss": 1.0925, + "step": 10189 + }, + { + "epoch": 1.8142806267806266, + "grad_norm": 0.6896616816520691, + "learning_rate": 0.00011471371874287546, + "loss": 1.0204, + "step": 10190 + }, + { + "epoch": 1.8144586894586894, + "grad_norm": 0.6149865388870239, + "learning_rate": 0.00011469987347377602, + "loss": 1.1249, + "step": 10191 + }, + { + "epoch": 1.8146367521367521, + "grad_norm": 0.6650002598762512, + "learning_rate": 0.00011468602791666307, + "loss": 0.9723, + "step": 10192 + }, + { + "epoch": 1.8148148148148149, + "grad_norm": 0.7298738956451416, + "learning_rate": 0.00011467218207180792, + "loss": 1.0225, + "step": 10193 + }, + { + "epoch": 1.8149928774928776, + "grad_norm": 0.8075628876686096, + "learning_rate": 0.00011465833593948183, + "loss": 1.0429, + "step": 10194 + }, + { + "epoch": 1.8151709401709402, + "grad_norm": 0.8196593523025513, + "learning_rate": 0.0001146444895199561, + "loss": 0.9148, + "step": 10195 + }, + { + "epoch": 1.8153490028490027, + "grad_norm": 0.6394698023796082, + "learning_rate": 0.00011463064281350204, + "loss": 0.9781, + "step": 10196 + }, + { + "epoch": 1.8155270655270654, + "grad_norm": 0.7302836775779724, + "learning_rate": 0.00011461679582039091, + "loss": 1.0394, + "step": 10197 + }, + { + "epoch": 1.8157051282051282, + "grad_norm": 0.7066670060157776, + "learning_rate": 0.00011460294854089404, + "loss": 1.1153, + "step": 10198 + }, + { + "epoch": 1.815883190883191, + "grad_norm": 0.6471068263053894, + "learning_rate": 0.0001145891009752827, + 
"loss": 1.1533, + "step": 10199 + }, + { + "epoch": 1.8160612535612537, + "grad_norm": 0.6842355132102966, + "learning_rate": 0.00011457525312382826, + "loss": 0.953, + "step": 10200 + }, + { + "epoch": 1.8162393162393162, + "grad_norm": 0.6720319986343384, + "learning_rate": 0.00011456140498680202, + "loss": 1.003, + "step": 10201 + }, + { + "epoch": 1.8164173789173788, + "grad_norm": 0.632017970085144, + "learning_rate": 0.00011454755656447527, + "loss": 0.8148, + "step": 10202 + }, + { + "epoch": 1.8165954415954415, + "grad_norm": 0.7193828225135803, + "learning_rate": 0.00011453370785711939, + "loss": 1.0098, + "step": 10203 + }, + { + "epoch": 1.8167735042735043, + "grad_norm": 0.7098045349121094, + "learning_rate": 0.00011451985886500566, + "loss": 1.1276, + "step": 10204 + }, + { + "epoch": 1.816951566951567, + "grad_norm": 0.7076733708381653, + "learning_rate": 0.00011450600958840547, + "loss": 1.1216, + "step": 10205 + }, + { + "epoch": 1.8171296296296298, + "grad_norm": 0.6864610314369202, + "learning_rate": 0.00011449216002759018, + "loss": 0.9896, + "step": 10206 + }, + { + "epoch": 1.8173076923076923, + "grad_norm": 0.737727701663971, + "learning_rate": 0.0001144783101828311, + "loss": 0.9447, + "step": 10207 + }, + { + "epoch": 1.8174857549857548, + "grad_norm": 0.6562525033950806, + "learning_rate": 0.00011446446005439964, + "loss": 1.1208, + "step": 10208 + }, + { + "epoch": 1.8176638176638176, + "grad_norm": 0.7203826308250427, + "learning_rate": 0.0001144506096425671, + "loss": 1.1339, + "step": 10209 + }, + { + "epoch": 1.8178418803418803, + "grad_norm": 0.6657233834266663, + "learning_rate": 0.00011443675894760489, + "loss": 0.8307, + "step": 10210 + }, + { + "epoch": 1.818019943019943, + "grad_norm": 0.7032586932182312, + "learning_rate": 0.00011442290796978437, + "loss": 0.8546, + "step": 10211 + }, + { + "epoch": 1.8181980056980058, + "grad_norm": 0.6989460587501526, + "learning_rate": 0.00011440905670937696, + "loss": 1.0749, + "step": 10212 
+ }, + { + "epoch": 1.8183760683760684, + "grad_norm": 0.6461085677146912, + "learning_rate": 0.00011439520516665399, + "loss": 0.984, + "step": 10213 + }, + { + "epoch": 1.818554131054131, + "grad_norm": 0.7077372670173645, + "learning_rate": 0.00011438135334188689, + "loss": 1.0813, + "step": 10214 + }, + { + "epoch": 1.8187321937321936, + "grad_norm": 0.6724075675010681, + "learning_rate": 0.00011436750123534704, + "loss": 0.9975, + "step": 10215 + }, + { + "epoch": 1.8189102564102564, + "grad_norm": 0.6205753684043884, + "learning_rate": 0.00011435364884730583, + "loss": 0.7414, + "step": 10216 + }, + { + "epoch": 1.8190883190883191, + "grad_norm": 0.6416093707084656, + "learning_rate": 0.00011433979617803472, + "loss": 1.0024, + "step": 10217 + }, + { + "epoch": 1.819266381766382, + "grad_norm": 0.7817183136940002, + "learning_rate": 0.00011432594322780508, + "loss": 1.0577, + "step": 10218 + }, + { + "epoch": 1.8194444444444444, + "grad_norm": 0.688220202922821, + "learning_rate": 0.00011431208999688835, + "loss": 1.0301, + "step": 10219 + }, + { + "epoch": 1.8196225071225072, + "grad_norm": 0.6464754343032837, + "learning_rate": 0.0001142982364855559, + "loss": 1.0608, + "step": 10220 + }, + { + "epoch": 1.8198005698005697, + "grad_norm": 0.6607306599617004, + "learning_rate": 0.00011428438269407926, + "loss": 1.1203, + "step": 10221 + }, + { + "epoch": 1.8199786324786325, + "grad_norm": 0.5779942870140076, + "learning_rate": 0.00011427052862272982, + "loss": 0.7895, + "step": 10222 + }, + { + "epoch": 1.8201566951566952, + "grad_norm": 0.7599068880081177, + "learning_rate": 0.000114256674271779, + "loss": 0.883, + "step": 10223 + }, + { + "epoch": 1.820334757834758, + "grad_norm": 0.6578865051269531, + "learning_rate": 0.00011424281964149824, + "loss": 1.101, + "step": 10224 + }, + { + "epoch": 1.8205128205128205, + "grad_norm": 0.7090746760368347, + "learning_rate": 0.00011422896473215905, + "loss": 0.9514, + "step": 10225 + }, + { + "epoch": 
1.8206908831908832, + "grad_norm": 0.7537758946418762, + "learning_rate": 0.00011421510954403281, + "loss": 1.2193, + "step": 10226 + }, + { + "epoch": 1.8208689458689458, + "grad_norm": 0.670183002948761, + "learning_rate": 0.00011420125407739106, + "loss": 1.1408, + "step": 10227 + }, + { + "epoch": 1.8210470085470085, + "grad_norm": 0.742520809173584, + "learning_rate": 0.00011418739833250524, + "loss": 0.8826, + "step": 10228 + }, + { + "epoch": 1.8212250712250713, + "grad_norm": 0.6542800664901733, + "learning_rate": 0.00011417354230964683, + "loss": 1.0039, + "step": 10229 + }, + { + "epoch": 1.821403133903134, + "grad_norm": 0.6713709235191345, + "learning_rate": 0.00011415968600908727, + "loss": 0.9351, + "step": 10230 + }, + { + "epoch": 1.8215811965811965, + "grad_norm": 0.6794951558113098, + "learning_rate": 0.0001141458294310981, + "loss": 0.9491, + "step": 10231 + }, + { + "epoch": 1.8217592592592593, + "grad_norm": 0.6921972632408142, + "learning_rate": 0.00011413197257595079, + "loss": 1.1342, + "step": 10232 + }, + { + "epoch": 1.8219373219373218, + "grad_norm": 0.702586829662323, + "learning_rate": 0.00011411811544391682, + "loss": 0.9992, + "step": 10233 + }, + { + "epoch": 1.8221153846153846, + "grad_norm": 0.8147975206375122, + "learning_rate": 0.00011410425803526772, + "loss": 1.0507, + "step": 10234 + }, + { + "epoch": 1.8222934472934473, + "grad_norm": 0.66419517993927, + "learning_rate": 0.00011409040035027496, + "loss": 1.0426, + "step": 10235 + }, + { + "epoch": 1.82247150997151, + "grad_norm": 0.6132485866546631, + "learning_rate": 0.00011407654238921011, + "loss": 0.9859, + "step": 10236 + }, + { + "epoch": 1.8226495726495726, + "grad_norm": 0.7522366046905518, + "learning_rate": 0.00011406268415234462, + "loss": 0.9379, + "step": 10237 + }, + { + "epoch": 1.8228276353276354, + "grad_norm": 0.6335554122924805, + "learning_rate": 0.00011404882563995007, + "loss": 0.9322, + "step": 10238 + }, + { + "epoch": 1.823005698005698, + 
"grad_norm": 0.7577497363090515, + "learning_rate": 0.00011403496685229797, + "loss": 1.1383, + "step": 10239 + }, + { + "epoch": 1.8231837606837606, + "grad_norm": 0.6796886920928955, + "learning_rate": 0.00011402110778965982, + "loss": 1.0092, + "step": 10240 + }, + { + "epoch": 1.8233618233618234, + "grad_norm": 0.7676617503166199, + "learning_rate": 0.0001140072484523072, + "loss": 1.0137, + "step": 10241 + }, + { + "epoch": 1.8235398860398861, + "grad_norm": 0.7807821035385132, + "learning_rate": 0.00011399338884051165, + "loss": 0.8987, + "step": 10242 + }, + { + "epoch": 1.8237179487179487, + "grad_norm": 0.7169568538665771, + "learning_rate": 0.00011397952895454473, + "loss": 0.8984, + "step": 10243 + }, + { + "epoch": 1.8238960113960114, + "grad_norm": 0.6564654111862183, + "learning_rate": 0.00011396566879467793, + "loss": 1.0255, + "step": 10244 + }, + { + "epoch": 1.824074074074074, + "grad_norm": 0.7290034294128418, + "learning_rate": 0.00011395180836118292, + "loss": 0.9962, + "step": 10245 + }, + { + "epoch": 1.8242521367521367, + "grad_norm": 0.6610758900642395, + "learning_rate": 0.00011393794765433115, + "loss": 1.102, + "step": 10246 + }, + { + "epoch": 1.8244301994301995, + "grad_norm": 0.6875932216644287, + "learning_rate": 0.0001139240866743943, + "loss": 0.9963, + "step": 10247 + }, + { + "epoch": 1.8246082621082622, + "grad_norm": 0.7595645189285278, + "learning_rate": 0.00011391022542164387, + "loss": 1.1285, + "step": 10248 + }, + { + "epoch": 1.8247863247863247, + "grad_norm": 0.6752721667289734, + "learning_rate": 0.0001138963638963515, + "loss": 0.9447, + "step": 10249 + }, + { + "epoch": 1.8249643874643875, + "grad_norm": 0.6697955131530762, + "learning_rate": 0.00011388250209878873, + "loss": 1.0804, + "step": 10250 + }, + { + "epoch": 1.82514245014245, + "grad_norm": 0.6546956896781921, + "learning_rate": 0.00011386864002922713, + "loss": 0.9626, + "step": 10251 + }, + { + "epoch": 1.8253205128205128, + "grad_norm": 
0.8002896904945374, + "learning_rate": 0.00011385477768793838, + "loss": 1.1933, + "step": 10252 + }, + { + "epoch": 1.8254985754985755, + "grad_norm": 0.6566781401634216, + "learning_rate": 0.00011384091507519403, + "loss": 0.9802, + "step": 10253 + }, + { + "epoch": 1.8256766381766383, + "grad_norm": 0.617420494556427, + "learning_rate": 0.00011382705219126572, + "loss": 1.1098, + "step": 10254 + }, + { + "epoch": 1.8258547008547008, + "grad_norm": 0.6558036208152771, + "learning_rate": 0.00011381318903642504, + "loss": 1.0291, + "step": 10255 + }, + { + "epoch": 1.8260327635327636, + "grad_norm": 0.6295637488365173, + "learning_rate": 0.00011379932561094358, + "loss": 1.0792, + "step": 10256 + }, + { + "epoch": 1.826210826210826, + "grad_norm": 0.7475154399871826, + "learning_rate": 0.00011378546191509303, + "loss": 1.1362, + "step": 10257 + }, + { + "epoch": 1.8263888888888888, + "grad_norm": 0.6814939379692078, + "learning_rate": 0.00011377159794914498, + "loss": 0.9131, + "step": 10258 + }, + { + "epoch": 1.8265669515669516, + "grad_norm": 0.6726876497268677, + "learning_rate": 0.00011375773371337111, + "loss": 0.9147, + "step": 10259 + }, + { + "epoch": 1.8267450142450143, + "grad_norm": 0.785943865776062, + "learning_rate": 0.00011374386920804298, + "loss": 1.0137, + "step": 10260 + }, + { + "epoch": 1.8269230769230769, + "grad_norm": 0.7614478468894958, + "learning_rate": 0.0001137300044334323, + "loss": 1.2118, + "step": 10261 + }, + { + "epoch": 1.8271011396011396, + "grad_norm": 0.7317564487457275, + "learning_rate": 0.00011371613938981072, + "loss": 1.0602, + "step": 10262 + }, + { + "epoch": 1.8272792022792022, + "grad_norm": 0.6716432571411133, + "learning_rate": 0.00011370227407744987, + "loss": 0.952, + "step": 10263 + }, + { + "epoch": 1.827457264957265, + "grad_norm": 0.6946425437927246, + "learning_rate": 0.00011368840849662139, + "loss": 1.0554, + "step": 10264 + }, + { + "epoch": 1.8276353276353277, + "grad_norm": 0.6692264080047607, + 
"learning_rate": 0.00011367454264759703, + "loss": 0.8944, + "step": 10265 + }, + { + "epoch": 1.8278133903133904, + "grad_norm": 0.6931505799293518, + "learning_rate": 0.00011366067653064838, + "loss": 0.9045, + "step": 10266 + }, + { + "epoch": 1.827991452991453, + "grad_norm": 0.7233194708824158, + "learning_rate": 0.00011364681014604716, + "loss": 0.9441, + "step": 10267 + }, + { + "epoch": 1.8281695156695157, + "grad_norm": 0.6451242566108704, + "learning_rate": 0.00011363294349406506, + "loss": 0.9948, + "step": 10268 + }, + { + "epoch": 1.8283475783475782, + "grad_norm": 0.6993351578712463, + "learning_rate": 0.00011361907657497375, + "loss": 1.1057, + "step": 10269 + }, + { + "epoch": 1.828525641025641, + "grad_norm": 0.7241137623786926, + "learning_rate": 0.00011360520938904493, + "loss": 0.974, + "step": 10270 + }, + { + "epoch": 1.8287037037037037, + "grad_norm": 0.6349480152130127, + "learning_rate": 0.00011359134193655027, + "loss": 0.9026, + "step": 10271 + }, + { + "epoch": 1.8288817663817665, + "grad_norm": 0.6916826963424683, + "learning_rate": 0.00011357747421776151, + "loss": 0.9153, + "step": 10272 + }, + { + "epoch": 1.8290598290598292, + "grad_norm": 0.879770040512085, + "learning_rate": 0.00011356360623295037, + "loss": 1.0818, + "step": 10273 + }, + { + "epoch": 1.8292378917378918, + "grad_norm": 0.6293807029724121, + "learning_rate": 0.00011354973798238853, + "loss": 1.1164, + "step": 10274 + }, + { + "epoch": 1.8294159544159543, + "grad_norm": 0.7070622444152832, + "learning_rate": 0.0001135358694663477, + "loss": 0.8795, + "step": 10275 + }, + { + "epoch": 1.829594017094017, + "grad_norm": 0.6847673654556274, + "learning_rate": 0.00011352200068509962, + "loss": 0.9173, + "step": 10276 + }, + { + "epoch": 1.8297720797720798, + "grad_norm": 0.6552146077156067, + "learning_rate": 0.00011350813163891605, + "loss": 1.0425, + "step": 10277 + }, + { + "epoch": 1.8299501424501425, + "grad_norm": 0.6432808041572571, + "learning_rate": 
0.0001134942623280687, + "loss": 0.9418, + "step": 10278 + }, + { + "epoch": 1.8301282051282053, + "grad_norm": 0.7412393093109131, + "learning_rate": 0.00011348039275282931, + "loss": 1.1212, + "step": 10279 + }, + { + "epoch": 1.8303062678062678, + "grad_norm": 0.6543423533439636, + "learning_rate": 0.00011346652291346965, + "loss": 1.0553, + "step": 10280 + }, + { + "epoch": 1.8304843304843303, + "grad_norm": 0.7159286141395569, + "learning_rate": 0.00011345265281026138, + "loss": 1.0582, + "step": 10281 + }, + { + "epoch": 1.830662393162393, + "grad_norm": 0.6443323493003845, + "learning_rate": 0.00011343878244347639, + "loss": 0.9462, + "step": 10282 + }, + { + "epoch": 1.8308404558404558, + "grad_norm": 0.7592014074325562, + "learning_rate": 0.00011342491181338634, + "loss": 1.2718, + "step": 10283 + }, + { + "epoch": 1.8310185185185186, + "grad_norm": 0.627109944820404, + "learning_rate": 0.00011341104092026302, + "loss": 1.0177, + "step": 10284 + }, + { + "epoch": 1.8311965811965814, + "grad_norm": 0.8061598539352417, + "learning_rate": 0.00011339716976437827, + "loss": 0.9416, + "step": 10285 + }, + { + "epoch": 1.8313746438746439, + "grad_norm": 0.6584261059761047, + "learning_rate": 0.00011338329834600377, + "loss": 0.8297, + "step": 10286 + }, + { + "epoch": 1.8315527065527064, + "grad_norm": 0.6329470276832581, + "learning_rate": 0.00011336942666541133, + "loss": 0.8386, + "step": 10287 + }, + { + "epoch": 1.8317307692307692, + "grad_norm": 0.6833979487419128, + "learning_rate": 0.00011335555472287275, + "loss": 0.9407, + "step": 10288 + }, + { + "epoch": 1.831908831908832, + "grad_norm": 0.7663840651512146, + "learning_rate": 0.00011334168251865985, + "loss": 1.0018, + "step": 10289 + }, + { + "epoch": 1.8320868945868947, + "grad_norm": 0.7751262784004211, + "learning_rate": 0.00011332781005304436, + "loss": 1.0576, + "step": 10290 + }, + { + "epoch": 1.8322649572649574, + "grad_norm": 0.6857370138168335, + "learning_rate": 0.00011331393732629814, + 
"loss": 0.9888, + "step": 10291 + }, + { + "epoch": 1.83244301994302, + "grad_norm": 0.7534535527229309, + "learning_rate": 0.00011330006433869296, + "loss": 1.0834, + "step": 10292 + }, + { + "epoch": 1.8326210826210825, + "grad_norm": 0.6785250306129456, + "learning_rate": 0.00011328619109050065, + "loss": 1.0471, + "step": 10293 + }, + { + "epoch": 1.8327991452991452, + "grad_norm": 0.7023689150810242, + "learning_rate": 0.00011327231758199303, + "loss": 1.0652, + "step": 10294 + }, + { + "epoch": 1.832977207977208, + "grad_norm": 0.6776610612869263, + "learning_rate": 0.00011325844381344192, + "loss": 0.9504, + "step": 10295 + }, + { + "epoch": 1.8331552706552707, + "grad_norm": 0.7704112529754639, + "learning_rate": 0.00011324456978511917, + "loss": 0.9712, + "step": 10296 + }, + { + "epoch": 1.8333333333333335, + "grad_norm": 0.601502537727356, + "learning_rate": 0.00011323069549729654, + "loss": 1.075, + "step": 10297 + }, + { + "epoch": 1.833511396011396, + "grad_norm": 0.6282439231872559, + "learning_rate": 0.00011321682095024596, + "loss": 0.9238, + "step": 10298 + }, + { + "epoch": 1.8336894586894585, + "grad_norm": 0.6873499155044556, + "learning_rate": 0.00011320294614423921, + "loss": 1.0464, + "step": 10299 + }, + { + "epoch": 1.8338675213675213, + "grad_norm": 0.6063792705535889, + "learning_rate": 0.00011318907107954815, + "loss": 0.9732, + "step": 10300 + }, + { + "epoch": 1.834045584045584, + "grad_norm": 0.5830921530723572, + "learning_rate": 0.00011317519575644464, + "loss": 0.7568, + "step": 10301 + }, + { + "epoch": 1.8342236467236468, + "grad_norm": 0.6394222378730774, + "learning_rate": 0.00011316132017520053, + "loss": 0.9958, + "step": 10302 + }, + { + "epoch": 1.8344017094017095, + "grad_norm": 0.7052412033081055, + "learning_rate": 0.00011314744433608773, + "loss": 0.9129, + "step": 10303 + }, + { + "epoch": 1.834579772079772, + "grad_norm": 0.7287624478340149, + "learning_rate": 0.00011313356823937801, + "loss": 0.8608, + "step": 10304 
+ }, + { + "epoch": 1.8347578347578346, + "grad_norm": 0.702937662601471, + "learning_rate": 0.00011311969188534334, + "loss": 1.3074, + "step": 10305 + }, + { + "epoch": 1.8349358974358974, + "grad_norm": 0.6693850159645081, + "learning_rate": 0.00011310581527425557, + "loss": 0.928, + "step": 10306 + }, + { + "epoch": 1.83511396011396, + "grad_norm": 0.8153932094573975, + "learning_rate": 0.00011309193840638654, + "loss": 1.1771, + "step": 10307 + }, + { + "epoch": 1.8352920227920229, + "grad_norm": 0.6517418622970581, + "learning_rate": 0.00011307806128200821, + "loss": 0.9634, + "step": 10308 + }, + { + "epoch": 1.8354700854700856, + "grad_norm": 0.6626226305961609, + "learning_rate": 0.00011306418390139245, + "loss": 0.9371, + "step": 10309 + }, + { + "epoch": 1.8356481481481481, + "grad_norm": 0.7397477030754089, + "learning_rate": 0.0001130503062648111, + "loss": 0.9398, + "step": 10310 + }, + { + "epoch": 1.8358262108262107, + "grad_norm": 0.6790265440940857, + "learning_rate": 0.00011303642837253614, + "loss": 0.9728, + "step": 10311 + }, + { + "epoch": 1.8360042735042734, + "grad_norm": 0.6266449093818665, + "learning_rate": 0.00011302255022483941, + "loss": 0.847, + "step": 10312 + }, + { + "epoch": 1.8361823361823362, + "grad_norm": 0.791657030582428, + "learning_rate": 0.00011300867182199288, + "loss": 0.8342, + "step": 10313 + }, + { + "epoch": 1.836360398860399, + "grad_norm": 0.7128583788871765, + "learning_rate": 0.00011299479316426846, + "loss": 0.9591, + "step": 10314 + }, + { + "epoch": 1.8365384615384617, + "grad_norm": 0.659928023815155, + "learning_rate": 0.00011298091425193806, + "loss": 1.0282, + "step": 10315 + }, + { + "epoch": 1.8367165242165242, + "grad_norm": 0.6641396284103394, + "learning_rate": 0.00011296703508527363, + "loss": 1.0161, + "step": 10316 + }, + { + "epoch": 1.8368945868945867, + "grad_norm": 0.7921316027641296, + "learning_rate": 0.00011295315566454702, + "loss": 0.8897, + "step": 10317 + }, + { + "epoch": 
1.8370726495726495, + "grad_norm": 0.6900694966316223, + "learning_rate": 0.00011293927599003029, + "loss": 1.0094, + "step": 10318 + }, + { + "epoch": 1.8372507122507122, + "grad_norm": 0.8054366707801819, + "learning_rate": 0.0001129253960619953, + "loss": 0.9489, + "step": 10319 + }, + { + "epoch": 1.837428774928775, + "grad_norm": 0.6623767018318176, + "learning_rate": 0.00011291151588071405, + "loss": 0.92, + "step": 10320 + }, + { + "epoch": 1.8376068376068377, + "grad_norm": 0.6143901348114014, + "learning_rate": 0.00011289763544645846, + "loss": 0.8093, + "step": 10321 + }, + { + "epoch": 1.8377849002849003, + "grad_norm": 0.8207027316093445, + "learning_rate": 0.00011288375475950046, + "loss": 1.2402, + "step": 10322 + }, + { + "epoch": 1.8379629629629628, + "grad_norm": 0.6759985685348511, + "learning_rate": 0.00011286987382011209, + "loss": 0.9179, + "step": 10323 + }, + { + "epoch": 1.8381410256410255, + "grad_norm": 0.745439887046814, + "learning_rate": 0.00011285599262856523, + "loss": 0.8157, + "step": 10324 + }, + { + "epoch": 1.8383190883190883, + "grad_norm": 0.6873317360877991, + "learning_rate": 0.00011284211118513194, + "loss": 0.8681, + "step": 10325 + }, + { + "epoch": 1.838497150997151, + "grad_norm": 0.7060160040855408, + "learning_rate": 0.00011282822949008416, + "loss": 1.0833, + "step": 10326 + }, + { + "epoch": 1.8386752136752138, + "grad_norm": 0.8079642653465271, + "learning_rate": 0.00011281434754369389, + "loss": 0.8639, + "step": 10327 + }, + { + "epoch": 1.8388532763532763, + "grad_norm": 0.6434001922607422, + "learning_rate": 0.00011280046534623303, + "loss": 0.9269, + "step": 10328 + }, + { + "epoch": 1.839031339031339, + "grad_norm": 0.7005292773246765, + "learning_rate": 0.0001127865828979737, + "loss": 1.1475, + "step": 10329 + }, + { + "epoch": 1.8392094017094016, + "grad_norm": 0.7004852890968323, + "learning_rate": 0.00011277270019918784, + "loss": 0.9467, + "step": 10330 + }, + { + "epoch": 1.8393874643874644, + 
"grad_norm": 0.7542549967765808, + "learning_rate": 0.00011275881725014743, + "loss": 1.0371, + "step": 10331 + }, + { + "epoch": 1.8395655270655271, + "grad_norm": 0.674051821231842, + "learning_rate": 0.00011274493405112452, + "loss": 1.1097, + "step": 10332 + }, + { + "epoch": 1.8397435897435899, + "grad_norm": 0.8136405348777771, + "learning_rate": 0.00011273105060239107, + "loss": 0.9718, + "step": 10333 + }, + { + "epoch": 1.8399216524216524, + "grad_norm": 0.6524073481559753, + "learning_rate": 0.00011271716690421916, + "loss": 0.9953, + "step": 10334 + }, + { + "epoch": 1.8400997150997151, + "grad_norm": 0.7436625957489014, + "learning_rate": 0.00011270328295688077, + "loss": 1.0722, + "step": 10335 + }, + { + "epoch": 1.8402777777777777, + "grad_norm": 0.6815723180770874, + "learning_rate": 0.00011268939876064795, + "loss": 1.0924, + "step": 10336 + }, + { + "epoch": 1.8404558404558404, + "grad_norm": 0.6923388242721558, + "learning_rate": 0.0001126755143157927, + "loss": 0.921, + "step": 10337 + }, + { + "epoch": 1.8406339031339032, + "grad_norm": 0.7464849948883057, + "learning_rate": 0.00011266162962258708, + "loss": 1.0549, + "step": 10338 + }, + { + "epoch": 1.840811965811966, + "grad_norm": 0.6621805429458618, + "learning_rate": 0.00011264774468130315, + "loss": 1.0764, + "step": 10339 + }, + { + "epoch": 1.8409900284900285, + "grad_norm": 0.7370132803916931, + "learning_rate": 0.00011263385949221295, + "loss": 0.7818, + "step": 10340 + }, + { + "epoch": 1.8411680911680912, + "grad_norm": 0.673100471496582, + "learning_rate": 0.00011261997405558848, + "loss": 1.04, + "step": 10341 + }, + { + "epoch": 1.8413461538461537, + "grad_norm": 0.5978201031684875, + "learning_rate": 0.00011260608837170183, + "loss": 0.9644, + "step": 10342 + }, + { + "epoch": 1.8415242165242165, + "grad_norm": 0.6868628263473511, + "learning_rate": 0.00011259220244082507, + "loss": 0.9533, + "step": 10343 + }, + { + "epoch": 1.8417022792022792, + "grad_norm": 
0.6580314636230469, + "learning_rate": 0.0001125783162632303, + "loss": 0.9506, + "step": 10344 + }, + { + "epoch": 1.841880341880342, + "grad_norm": 0.7238291501998901, + "learning_rate": 0.00011256442983918951, + "loss": 0.8663, + "step": 10345 + }, + { + "epoch": 1.8420584045584045, + "grad_norm": 0.5838520526885986, + "learning_rate": 0.00011255054316897484, + "loss": 0.9606, + "step": 10346 + }, + { + "epoch": 1.8422364672364673, + "grad_norm": 0.7102842926979065, + "learning_rate": 0.00011253665625285836, + "loss": 0.801, + "step": 10347 + }, + { + "epoch": 1.8424145299145298, + "grad_norm": 0.6449147462844849, + "learning_rate": 0.0001125227690911121, + "loss": 1.0827, + "step": 10348 + }, + { + "epoch": 1.8425925925925926, + "grad_norm": 0.6355304718017578, + "learning_rate": 0.00011250888168400823, + "loss": 1.0369, + "step": 10349 + }, + { + "epoch": 1.8427706552706553, + "grad_norm": 0.678977906703949, + "learning_rate": 0.0001124949940318188, + "loss": 0.9491, + "step": 10350 + }, + { + "epoch": 1.842948717948718, + "grad_norm": 0.6366633772850037, + "learning_rate": 0.00011248110613481592, + "loss": 0.7272, + "step": 10351 + }, + { + "epoch": 1.8431267806267806, + "grad_norm": 0.6639098525047302, + "learning_rate": 0.00011246721799327171, + "loss": 1.0313, + "step": 10352 + }, + { + "epoch": 1.8433048433048433, + "grad_norm": 0.6034720540046692, + "learning_rate": 0.00011245332960745822, + "loss": 0.7141, + "step": 10353 + }, + { + "epoch": 1.8434829059829059, + "grad_norm": 0.8118346333503723, + "learning_rate": 0.00011243944097764763, + "loss": 1.171, + "step": 10354 + }, + { + "epoch": 1.8436609686609686, + "grad_norm": 0.6706618070602417, + "learning_rate": 0.00011242555210411203, + "loss": 0.9578, + "step": 10355 + }, + { + "epoch": 1.8438390313390314, + "grad_norm": 0.619562029838562, + "learning_rate": 0.00011241166298712355, + "loss": 0.9883, + "step": 10356 + }, + { + "epoch": 1.8440170940170941, + "grad_norm": 0.6471936106681824, + 
"learning_rate": 0.00011239777362695434, + "loss": 0.8897, + "step": 10357 + }, + { + "epoch": 1.8441951566951567, + "grad_norm": 0.7179005742073059, + "learning_rate": 0.00011238388402387645, + "loss": 0.9646, + "step": 10358 + }, + { + "epoch": 1.8443732193732194, + "grad_norm": 0.7726966738700867, + "learning_rate": 0.00011236999417816214, + "loss": 0.8855, + "step": 10359 + }, + { + "epoch": 1.844551282051282, + "grad_norm": 0.6733565330505371, + "learning_rate": 0.00011235610409008346, + "loss": 1.0379, + "step": 10360 + }, + { + "epoch": 1.8447293447293447, + "grad_norm": 0.7317814826965332, + "learning_rate": 0.0001123422137599126, + "loss": 0.8528, + "step": 10361 + }, + { + "epoch": 1.8449074074074074, + "grad_norm": 0.6727005839347839, + "learning_rate": 0.0001123283231879217, + "loss": 0.9612, + "step": 10362 + }, + { + "epoch": 1.8450854700854702, + "grad_norm": 0.6350542306900024, + "learning_rate": 0.00011231443237438289, + "loss": 0.9939, + "step": 10363 + }, + { + "epoch": 1.8452635327635327, + "grad_norm": 0.693148672580719, + "learning_rate": 0.00011230054131956836, + "loss": 1.0149, + "step": 10364 + }, + { + "epoch": 1.8454415954415955, + "grad_norm": 0.7263579368591309, + "learning_rate": 0.0001122866500237503, + "loss": 1.1044, + "step": 10365 + }, + { + "epoch": 1.845619658119658, + "grad_norm": 0.7044230699539185, + "learning_rate": 0.00011227275848720085, + "loss": 1.0677, + "step": 10366 + }, + { + "epoch": 1.8457977207977208, + "grad_norm": 0.6895326972007751, + "learning_rate": 0.00011225886671019219, + "loss": 1.1025, + "step": 10367 + }, + { + "epoch": 1.8459757834757835, + "grad_norm": 0.6045145988464355, + "learning_rate": 0.00011224497469299651, + "loss": 0.8079, + "step": 10368 + }, + { + "epoch": 1.8461538461538463, + "grad_norm": 0.6613210439682007, + "learning_rate": 0.00011223108243588599, + "loss": 1.0345, + "step": 10369 + }, + { + "epoch": 1.8463319088319088, + "grad_norm": 0.6288960576057434, + "learning_rate": 
0.0001122171899391328, + "loss": 1.0166, + "step": 10370 + }, + { + "epoch": 1.8465099715099715, + "grad_norm": 0.6158748865127563, + "learning_rate": 0.00011220329720300917, + "loss": 0.895, + "step": 10371 + }, + { + "epoch": 1.846688034188034, + "grad_norm": 0.6583057641983032, + "learning_rate": 0.00011218940422778728, + "loss": 0.8059, + "step": 10372 + }, + { + "epoch": 1.8468660968660968, + "grad_norm": 0.6761550903320312, + "learning_rate": 0.00011217551101373932, + "loss": 0.9253, + "step": 10373 + }, + { + "epoch": 1.8470441595441596, + "grad_norm": 0.5969263315200806, + "learning_rate": 0.0001121616175611375, + "loss": 0.8549, + "step": 10374 + }, + { + "epoch": 1.8472222222222223, + "grad_norm": 0.7994722723960876, + "learning_rate": 0.00011214772387025407, + "loss": 0.9918, + "step": 10375 + }, + { + "epoch": 1.8474002849002849, + "grad_norm": 0.6949167847633362, + "learning_rate": 0.00011213382994136123, + "loss": 1.1853, + "step": 10376 + }, + { + "epoch": 1.8475783475783476, + "grad_norm": 0.7356176376342773, + "learning_rate": 0.00011211993577473121, + "loss": 0.8809, + "step": 10377 + }, + { + "epoch": 1.8477564102564101, + "grad_norm": 0.7110268473625183, + "learning_rate": 0.0001121060413706362, + "loss": 0.9805, + "step": 10378 + }, + { + "epoch": 1.8479344729344729, + "grad_norm": 0.6509962677955627, + "learning_rate": 0.00011209214672934846, + "loss": 0.8899, + "step": 10379 + }, + { + "epoch": 1.8481125356125356, + "grad_norm": 0.6103082299232483, + "learning_rate": 0.00011207825185114025, + "loss": 0.8576, + "step": 10380 + }, + { + "epoch": 1.8482905982905984, + "grad_norm": 0.6261070966720581, + "learning_rate": 0.00011206435673628377, + "loss": 0.8884, + "step": 10381 + }, + { + "epoch": 1.848468660968661, + "grad_norm": 0.7629222273826599, + "learning_rate": 0.00011205046138505126, + "loss": 1.1714, + "step": 10382 + }, + { + "epoch": 1.8486467236467237, + "grad_norm": 0.617957353591919, + "learning_rate": 0.000112036565797715, + 
"loss": 0.9546, + "step": 10383 + }, + { + "epoch": 1.8488247863247862, + "grad_norm": 0.6926987171173096, + "learning_rate": 0.00011202266997454724, + "loss": 0.8842, + "step": 10384 + }, + { + "epoch": 1.849002849002849, + "grad_norm": 0.602758526802063, + "learning_rate": 0.00011200877391582025, + "loss": 0.9782, + "step": 10385 + }, + { + "epoch": 1.8491809116809117, + "grad_norm": 0.706731915473938, + "learning_rate": 0.00011199487762180627, + "loss": 0.8176, + "step": 10386 + }, + { + "epoch": 1.8493589743589745, + "grad_norm": 0.7135118842124939, + "learning_rate": 0.0001119809810927776, + "loss": 0.9277, + "step": 10387 + }, + { + "epoch": 1.8495370370370372, + "grad_norm": 0.7484592199325562, + "learning_rate": 0.00011196708432900647, + "loss": 1.0733, + "step": 10388 + }, + { + "epoch": 1.8497150997150997, + "grad_norm": 0.7087157964706421, + "learning_rate": 0.00011195318733076519, + "loss": 0.9443, + "step": 10389 + }, + { + "epoch": 1.8498931623931623, + "grad_norm": 0.6511468291282654, + "learning_rate": 0.00011193929009832602, + "loss": 0.955, + "step": 10390 + }, + { + "epoch": 1.850071225071225, + "grad_norm": 0.6386628746986389, + "learning_rate": 0.0001119253926319613, + "loss": 1.0357, + "step": 10391 + }, + { + "epoch": 1.8502492877492878, + "grad_norm": 0.6400021314620972, + "learning_rate": 0.00011191149493194327, + "loss": 0.8094, + "step": 10392 + }, + { + "epoch": 1.8504273504273505, + "grad_norm": 0.7942537069320679, + "learning_rate": 0.00011189759699854423, + "loss": 0.9717, + "step": 10393 + }, + { + "epoch": 1.8506054131054133, + "grad_norm": 0.7230474948883057, + "learning_rate": 0.00011188369883203647, + "loss": 0.9043, + "step": 10394 + }, + { + "epoch": 1.8507834757834758, + "grad_norm": 0.8837162852287292, + "learning_rate": 0.00011186980043269235, + "loss": 1.2821, + "step": 10395 + }, + { + "epoch": 1.8509615384615383, + "grad_norm": 0.7260291576385498, + "learning_rate": 0.00011185590180078413, + "loss": 1.1672, + "step": 
10396 + }, + { + "epoch": 1.851139601139601, + "grad_norm": 0.6290066242218018, + "learning_rate": 0.00011184200293658415, + "loss": 0.8942, + "step": 10397 + }, + { + "epoch": 1.8513176638176638, + "grad_norm": 0.6571013331413269, + "learning_rate": 0.00011182810384036475, + "loss": 1.0753, + "step": 10398 + }, + { + "epoch": 1.8514957264957266, + "grad_norm": 0.6494737267494202, + "learning_rate": 0.00011181420451239817, + "loss": 0.8833, + "step": 10399 + }, + { + "epoch": 1.8516737891737893, + "grad_norm": 0.7383694648742676, + "learning_rate": 0.00011180030495295684, + "loss": 1.0094, + "step": 10400 + }, + { + "epoch": 1.8518518518518519, + "grad_norm": 0.6713876724243164, + "learning_rate": 0.00011178640516231302, + "loss": 0.975, + "step": 10401 + }, + { + "epoch": 1.8520299145299144, + "grad_norm": 0.8041042685508728, + "learning_rate": 0.00011177250514073912, + "loss": 1.1419, + "step": 10402 + }, + { + "epoch": 1.8522079772079771, + "grad_norm": 0.7035061120986938, + "learning_rate": 0.00011175860488850738, + "loss": 1.0921, + "step": 10403 + }, + { + "epoch": 1.85238603988604, + "grad_norm": 0.6135673522949219, + "learning_rate": 0.00011174470440589022, + "loss": 0.9611, + "step": 10404 + }, + { + "epoch": 1.8525641025641026, + "grad_norm": 0.7868386507034302, + "learning_rate": 0.00011173080369315999, + "loss": 0.8561, + "step": 10405 + }, + { + "epoch": 1.8527421652421654, + "grad_norm": 0.6575735211372375, + "learning_rate": 0.00011171690275058902, + "loss": 1.0256, + "step": 10406 + }, + { + "epoch": 1.852920227920228, + "grad_norm": 0.7514392137527466, + "learning_rate": 0.00011170300157844969, + "loss": 1.0868, + "step": 10407 + }, + { + "epoch": 1.8530982905982905, + "grad_norm": 0.6915257573127747, + "learning_rate": 0.00011168910017701436, + "loss": 1.1223, + "step": 10408 + }, + { + "epoch": 1.8532763532763532, + "grad_norm": 0.7406772971153259, + "learning_rate": 0.00011167519854655535, + "loss": 1.0922, + "step": 10409 + }, + { + "epoch": 
1.853454415954416, + "grad_norm": 0.6632742881774902, + "learning_rate": 0.0001116612966873451, + "loss": 0.9082, + "step": 10410 + }, + { + "epoch": 1.8536324786324787, + "grad_norm": 0.8154461979866028, + "learning_rate": 0.00011164739459965598, + "loss": 1.1126, + "step": 10411 + }, + { + "epoch": 1.8538105413105415, + "grad_norm": 0.895764172077179, + "learning_rate": 0.00011163349228376037, + "loss": 1.0589, + "step": 10412 + }, + { + "epoch": 1.853988603988604, + "grad_norm": 0.6746504902839661, + "learning_rate": 0.00011161958973993063, + "loss": 1.0184, + "step": 10413 + }, + { + "epoch": 1.8541666666666665, + "grad_norm": 0.7271263003349304, + "learning_rate": 0.00011160568696843916, + "loss": 0.9989, + "step": 10414 + }, + { + "epoch": 1.8543447293447293, + "grad_norm": 0.7503132820129395, + "learning_rate": 0.00011159178396955836, + "loss": 1.0783, + "step": 10415 + }, + { + "epoch": 1.854522792022792, + "grad_norm": 0.6768177151679993, + "learning_rate": 0.00011157788074356066, + "loss": 0.9916, + "step": 10416 + }, + { + "epoch": 1.8547008547008548, + "grad_norm": 0.6804978251457214, + "learning_rate": 0.00011156397729071842, + "loss": 0.9534, + "step": 10417 + }, + { + "epoch": 1.8548789173789175, + "grad_norm": 0.7144617438316345, + "learning_rate": 0.00011155007361130408, + "loss": 0.991, + "step": 10418 + }, + { + "epoch": 1.85505698005698, + "grad_norm": 0.6816750168800354, + "learning_rate": 0.00011153616970559, + "loss": 0.9551, + "step": 10419 + }, + { + "epoch": 1.8552350427350426, + "grad_norm": 0.6620030999183655, + "learning_rate": 0.00011152226557384866, + "loss": 0.8854, + "step": 10420 + }, + { + "epoch": 1.8554131054131053, + "grad_norm": 0.8400058746337891, + "learning_rate": 0.00011150836121635249, + "loss": 1.1593, + "step": 10421 + }, + { + "epoch": 1.855591168091168, + "grad_norm": 0.6666815280914307, + "learning_rate": 0.00011149445663337385, + "loss": 1.2112, + "step": 10422 + }, + { + "epoch": 1.8557692307692308, + "grad_norm": 
0.7298431396484375, + "learning_rate": 0.00011148055182518522, + "loss": 0.9721, + "step": 10423 + }, + { + "epoch": 1.8559472934472936, + "grad_norm": 0.66816645860672, + "learning_rate": 0.00011146664679205903, + "loss": 1.0945, + "step": 10424 + }, + { + "epoch": 1.8561253561253561, + "grad_norm": 0.5979483127593994, + "learning_rate": 0.00011145274153426771, + "loss": 1.0176, + "step": 10425 + }, + { + "epoch": 1.8563034188034186, + "grad_norm": 0.6579445600509644, + "learning_rate": 0.00011143883605208372, + "loss": 0.9143, + "step": 10426 + }, + { + "epoch": 1.8564814814814814, + "grad_norm": 0.6871697902679443, + "learning_rate": 0.0001114249303457795, + "loss": 1.071, + "step": 10427 + }, + { + "epoch": 1.8566595441595442, + "grad_norm": 0.6683333516120911, + "learning_rate": 0.0001114110244156275, + "loss": 0.7809, + "step": 10428 + }, + { + "epoch": 1.856837606837607, + "grad_norm": 0.6122907996177673, + "learning_rate": 0.0001113971182619002, + "loss": 0.8329, + "step": 10429 + }, + { + "epoch": 1.8570156695156697, + "grad_norm": 0.6510575413703918, + "learning_rate": 0.00011138321188487, + "loss": 1.0068, + "step": 10430 + }, + { + "epoch": 1.8571937321937322, + "grad_norm": 0.6417793035507202, + "learning_rate": 0.00011136930528480945, + "loss": 1.0093, + "step": 10431 + }, + { + "epoch": 1.8573717948717947, + "grad_norm": 0.595824658870697, + "learning_rate": 0.00011135539846199096, + "loss": 0.9856, + "step": 10432 + }, + { + "epoch": 1.8575498575498575, + "grad_norm": 0.7594470381736755, + "learning_rate": 0.00011134149141668704, + "loss": 0.8173, + "step": 10433 + }, + { + "epoch": 1.8577279202279202, + "grad_norm": 0.7078324556350708, + "learning_rate": 0.00011132758414917016, + "loss": 1.0236, + "step": 10434 + }, + { + "epoch": 1.857905982905983, + "grad_norm": 0.6830437779426575, + "learning_rate": 0.00011131367665971275, + "loss": 0.8483, + "step": 10435 + }, + { + "epoch": 1.8580840455840457, + "grad_norm": 0.6856399774551392, + 
"learning_rate": 0.0001112997689485874, + "loss": 0.8729, + "step": 10436 + }, + { + "epoch": 1.8582621082621082, + "grad_norm": 0.6530426144599915, + "learning_rate": 0.00011128586101606653, + "loss": 0.8616, + "step": 10437 + }, + { + "epoch": 1.8584401709401708, + "grad_norm": 0.6341808438301086, + "learning_rate": 0.00011127195286242267, + "loss": 0.896, + "step": 10438 + }, + { + "epoch": 1.8586182336182335, + "grad_norm": 0.6278257966041565, + "learning_rate": 0.00011125804448792831, + "loss": 0.8309, + "step": 10439 + }, + { + "epoch": 1.8587962962962963, + "grad_norm": 0.708705723285675, + "learning_rate": 0.00011124413589285594, + "loss": 1.1065, + "step": 10440 + }, + { + "epoch": 1.858974358974359, + "grad_norm": 0.6845232248306274, + "learning_rate": 0.00011123022707747808, + "loss": 0.9292, + "step": 10441 + }, + { + "epoch": 1.8591524216524218, + "grad_norm": 0.749204695224762, + "learning_rate": 0.00011121631804206726, + "loss": 1.0487, + "step": 10442 + }, + { + "epoch": 1.8593304843304843, + "grad_norm": 0.7123128771781921, + "learning_rate": 0.00011120240878689599, + "loss": 0.9138, + "step": 10443 + }, + { + "epoch": 1.859508547008547, + "grad_norm": 0.6862115263938904, + "learning_rate": 0.00011118849931223679, + "loss": 1.0675, + "step": 10444 + }, + { + "epoch": 1.8596866096866096, + "grad_norm": 0.7245760560035706, + "learning_rate": 0.00011117458961836215, + "loss": 0.9643, + "step": 10445 + }, + { + "epoch": 1.8598646723646723, + "grad_norm": 0.701574444770813, + "learning_rate": 0.0001111606797055447, + "loss": 1.0022, + "step": 10446 + }, + { + "epoch": 1.860042735042735, + "grad_norm": 0.7292088270187378, + "learning_rate": 0.0001111467695740569, + "loss": 0.9465, + "step": 10447 + }, + { + "epoch": 1.8602207977207978, + "grad_norm": 0.7045044302940369, + "learning_rate": 0.0001111328592241713, + "loss": 1.0942, + "step": 10448 + }, + { + "epoch": 1.8603988603988604, + "grad_norm": 0.7181426286697388, + "learning_rate": 
0.00011111894865616046, + "loss": 1.2108, + "step": 10449 + }, + { + "epoch": 1.8605769230769231, + "grad_norm": 0.6083306074142456, + "learning_rate": 0.00011110503787029689, + "loss": 0.929, + "step": 10450 + }, + { + "epoch": 1.8607549857549857, + "grad_norm": 0.6847347617149353, + "learning_rate": 0.00011109112686685319, + "loss": 1.0911, + "step": 10451 + }, + { + "epoch": 1.8609330484330484, + "grad_norm": 0.7131744027137756, + "learning_rate": 0.0001110772156461019, + "loss": 0.9649, + "step": 10452 + }, + { + "epoch": 1.8611111111111112, + "grad_norm": 0.7920312881469727, + "learning_rate": 0.00011106330420831559, + "loss": 0.9965, + "step": 10453 + }, + { + "epoch": 1.861289173789174, + "grad_norm": 0.6640987992286682, + "learning_rate": 0.00011104939255376681, + "loss": 1.2346, + "step": 10454 + }, + { + "epoch": 1.8614672364672364, + "grad_norm": 0.5878208875656128, + "learning_rate": 0.00011103548068272811, + "loss": 0.8565, + "step": 10455 + }, + { + "epoch": 1.8616452991452992, + "grad_norm": 0.6636882424354553, + "learning_rate": 0.0001110215685954721, + "loss": 0.8556, + "step": 10456 + }, + { + "epoch": 1.8618233618233617, + "grad_norm": 0.5985570549964905, + "learning_rate": 0.00011100765629227137, + "loss": 1.0291, + "step": 10457 + }, + { + "epoch": 1.8620014245014245, + "grad_norm": 0.7546643614768982, + "learning_rate": 0.00011099374377339846, + "loss": 1.0199, + "step": 10458 + }, + { + "epoch": 1.8621794871794872, + "grad_norm": 0.6529727578163147, + "learning_rate": 0.00011097983103912602, + "loss": 1.0826, + "step": 10459 + }, + { + "epoch": 1.86235754985755, + "grad_norm": 0.6394338607788086, + "learning_rate": 0.00011096591808972654, + "loss": 0.9896, + "step": 10460 + }, + { + "epoch": 1.8625356125356125, + "grad_norm": 0.6508805751800537, + "learning_rate": 0.00011095200492547271, + "loss": 0.9659, + "step": 10461 + }, + { + "epoch": 1.8627136752136753, + "grad_norm": 0.7085812091827393, + "learning_rate": 0.00011093809154663705, + 
"loss": 0.9998, + "step": 10462 + }, + { + "epoch": 1.8628917378917378, + "grad_norm": 0.6488457322120667, + "learning_rate": 0.00011092417795349226, + "loss": 0.9757, + "step": 10463 + }, + { + "epoch": 1.8630698005698005, + "grad_norm": 0.6405763626098633, + "learning_rate": 0.0001109102641463109, + "loss": 0.8188, + "step": 10464 + }, + { + "epoch": 1.8632478632478633, + "grad_norm": 0.713361918926239, + "learning_rate": 0.00011089635012536554, + "loss": 0.886, + "step": 10465 + }, + { + "epoch": 1.863425925925926, + "grad_norm": 0.5752255916595459, + "learning_rate": 0.00011088243589092886, + "loss": 1.0223, + "step": 10466 + }, + { + "epoch": 1.8636039886039886, + "grad_norm": 0.6722734570503235, + "learning_rate": 0.00011086852144327344, + "loss": 0.9499, + "step": 10467 + }, + { + "epoch": 1.8637820512820513, + "grad_norm": 0.5516420006752014, + "learning_rate": 0.00011085460678267194, + "loss": 0.7767, + "step": 10468 + }, + { + "epoch": 1.8639601139601139, + "grad_norm": 0.731257438659668, + "learning_rate": 0.00011084069190939697, + "loss": 1.2299, + "step": 10469 + }, + { + "epoch": 1.8641381766381766, + "grad_norm": 0.7977055907249451, + "learning_rate": 0.00011082677682372114, + "loss": 0.9109, + "step": 10470 + }, + { + "epoch": 1.8643162393162394, + "grad_norm": 0.679900586605072, + "learning_rate": 0.0001108128615259171, + "loss": 0.9319, + "step": 10471 + }, + { + "epoch": 1.864494301994302, + "grad_norm": 0.7428545951843262, + "learning_rate": 0.00011079894601625754, + "loss": 0.8585, + "step": 10472 + }, + { + "epoch": 1.8646723646723646, + "grad_norm": 0.6560967564582825, + "learning_rate": 0.00011078503029501504, + "loss": 1.0069, + "step": 10473 + }, + { + "epoch": 1.8648504273504274, + "grad_norm": 0.636202871799469, + "learning_rate": 0.00011077111436246228, + "loss": 1.0329, + "step": 10474 + }, + { + "epoch": 1.86502849002849, + "grad_norm": 0.6666205525398254, + "learning_rate": 0.00011075719821887191, + "loss": 1.0123, + "step": 10475 + 
}, + { + "epoch": 1.8652065527065527, + "grad_norm": 0.7089471220970154, + "learning_rate": 0.00011074328186451657, + "loss": 0.7851, + "step": 10476 + }, + { + "epoch": 1.8653846153846154, + "grad_norm": 0.6054788827896118, + "learning_rate": 0.00011072936529966895, + "loss": 0.8224, + "step": 10477 + }, + { + "epoch": 1.8655626780626782, + "grad_norm": 0.6009029150009155, + "learning_rate": 0.00011071544852460172, + "loss": 0.865, + "step": 10478 + }, + { + "epoch": 1.8657407407407407, + "grad_norm": 0.6238716244697571, + "learning_rate": 0.00011070153153958753, + "loss": 0.8685, + "step": 10479 + }, + { + "epoch": 1.8659188034188035, + "grad_norm": 0.719985842704773, + "learning_rate": 0.00011068761434489903, + "loss": 1.2204, + "step": 10480 + }, + { + "epoch": 1.866096866096866, + "grad_norm": 0.72972172498703, + "learning_rate": 0.00011067369694080895, + "loss": 1.0454, + "step": 10481 + }, + { + "epoch": 1.8662749287749287, + "grad_norm": 0.6741998791694641, + "learning_rate": 0.00011065977932758995, + "loss": 0.9992, + "step": 10482 + }, + { + "epoch": 1.8664529914529915, + "grad_norm": 0.6150268912315369, + "learning_rate": 0.00011064586150551472, + "loss": 0.8866, + "step": 10483 + }, + { + "epoch": 1.8666310541310542, + "grad_norm": 0.8253782391548157, + "learning_rate": 0.00011063194347485597, + "loss": 1.1173, + "step": 10484 + }, + { + "epoch": 1.8668091168091168, + "grad_norm": 0.7176247835159302, + "learning_rate": 0.00011061802523588636, + "loss": 1.0414, + "step": 10485 + }, + { + "epoch": 1.8669871794871795, + "grad_norm": 0.6372736096382141, + "learning_rate": 0.00011060410678887858, + "loss": 1.0548, + "step": 10486 + }, + { + "epoch": 1.867165242165242, + "grad_norm": 0.7107454538345337, + "learning_rate": 0.00011059018813410538, + "loss": 1.2298, + "step": 10487 + }, + { + "epoch": 1.8673433048433048, + "grad_norm": 0.7113911509513855, + "learning_rate": 0.00011057626927183944, + "loss": 0.9598, + "step": 10488 + }, + { + "epoch": 
1.8675213675213675, + "grad_norm": 0.6734410524368286, + "learning_rate": 0.00011056235020235346, + "loss": 0.9475, + "step": 10489 + }, + { + "epoch": 1.8676994301994303, + "grad_norm": 0.6875202655792236, + "learning_rate": 0.0001105484309259202, + "loss": 1.0735, + "step": 10490 + }, + { + "epoch": 1.8678774928774928, + "grad_norm": 0.6908353567123413, + "learning_rate": 0.0001105345114428123, + "loss": 1.0558, + "step": 10491 + }, + { + "epoch": 1.8680555555555556, + "grad_norm": 0.6283324360847473, + "learning_rate": 0.00011052059175330256, + "loss": 0.8872, + "step": 10492 + }, + { + "epoch": 1.868233618233618, + "grad_norm": 0.6422587633132935, + "learning_rate": 0.00011050667185766368, + "loss": 1.1022, + "step": 10493 + }, + { + "epoch": 1.8684116809116809, + "grad_norm": 0.7075859904289246, + "learning_rate": 0.0001104927517561684, + "loss": 1.1389, + "step": 10494 + }, + { + "epoch": 1.8685897435897436, + "grad_norm": 0.5896905064582825, + "learning_rate": 0.00011047883144908944, + "loss": 0.7732, + "step": 10495 + }, + { + "epoch": 1.8687678062678064, + "grad_norm": 0.7647629976272583, + "learning_rate": 0.00011046491093669953, + "loss": 0.9983, + "step": 10496 + }, + { + "epoch": 1.868945868945869, + "grad_norm": 0.5864735841751099, + "learning_rate": 0.00011045099021927144, + "loss": 0.8427, + "step": 10497 + }, + { + "epoch": 1.8691239316239316, + "grad_norm": 0.6766837239265442, + "learning_rate": 0.00011043706929707791, + "loss": 0.9595, + "step": 10498 + }, + { + "epoch": 1.8693019943019942, + "grad_norm": 0.5480074286460876, + "learning_rate": 0.00011042314817039168, + "loss": 0.691, + "step": 10499 + }, + { + "epoch": 1.869480056980057, + "grad_norm": 0.6259615421295166, + "learning_rate": 0.00011040922683948553, + "loss": 0.9991, + "step": 10500 + }, + { + "epoch": 1.8696581196581197, + "grad_norm": 0.5950598120689392, + "learning_rate": 0.00011039530530463218, + "loss": 0.7413, + "step": 10501 + }, + { + "epoch": 1.8698361823361824, + 
"grad_norm": 0.8099377751350403, + "learning_rate": 0.00011038138356610441, + "loss": 1.1351, + "step": 10502 + }, + { + "epoch": 1.8700142450142452, + "grad_norm": 0.6716185212135315, + "learning_rate": 0.00011036746162417501, + "loss": 1.1057, + "step": 10503 + }, + { + "epoch": 1.8701923076923077, + "grad_norm": 0.7993219494819641, + "learning_rate": 0.00011035353947911675, + "loss": 1.2095, + "step": 10504 + }, + { + "epoch": 1.8703703703703702, + "grad_norm": 0.6381276249885559, + "learning_rate": 0.00011033961713120237, + "loss": 1.0261, + "step": 10505 + }, + { + "epoch": 1.870548433048433, + "grad_norm": 0.6326032280921936, + "learning_rate": 0.00011032569458070469, + "loss": 0.8664, + "step": 10506 + }, + { + "epoch": 1.8707264957264957, + "grad_norm": 0.6864820718765259, + "learning_rate": 0.00011031177182789644, + "loss": 0.9959, + "step": 10507 + }, + { + "epoch": 1.8709045584045585, + "grad_norm": 0.6341838240623474, + "learning_rate": 0.00011029784887305048, + "loss": 0.8029, + "step": 10508 + }, + { + "epoch": 1.8710826210826212, + "grad_norm": 0.6559172868728638, + "learning_rate": 0.00011028392571643957, + "loss": 0.9282, + "step": 10509 + }, + { + "epoch": 1.8712606837606838, + "grad_norm": 0.6976849436759949, + "learning_rate": 0.0001102700023583365, + "loss": 1.0198, + "step": 10510 + }, + { + "epoch": 1.8714387464387463, + "grad_norm": 0.7159395217895508, + "learning_rate": 0.00011025607879901402, + "loss": 1.1585, + "step": 10511 + }, + { + "epoch": 1.871616809116809, + "grad_norm": 0.7168624997138977, + "learning_rate": 0.000110242155038745, + "loss": 1.0558, + "step": 10512 + }, + { + "epoch": 1.8717948717948718, + "grad_norm": 0.5784319043159485, + "learning_rate": 0.00011022823107780224, + "loss": 0.9481, + "step": 10513 + }, + { + "epoch": 1.8719729344729346, + "grad_norm": 0.6602259874343872, + "learning_rate": 0.00011021430691645856, + "loss": 1.0538, + "step": 10514 + }, + { + "epoch": 1.8721509971509973, + "grad_norm": 
0.6874588131904602, + "learning_rate": 0.00011020038255498672, + "loss": 1.1396, + "step": 10515 + }, + { + "epoch": 1.8723290598290598, + "grad_norm": 0.7311663031578064, + "learning_rate": 0.00011018645799365956, + "loss": 1.084, + "step": 10516 + }, + { + "epoch": 1.8725071225071224, + "grad_norm": 0.7097118496894836, + "learning_rate": 0.00011017253323274996, + "loss": 0.9872, + "step": 10517 + }, + { + "epoch": 1.8726851851851851, + "grad_norm": 0.6667875051498413, + "learning_rate": 0.00011015860827253068, + "loss": 1.105, + "step": 10518 + }, + { + "epoch": 1.8728632478632479, + "grad_norm": 0.6807677745819092, + "learning_rate": 0.0001101446831132746, + "loss": 0.9093, + "step": 10519 + }, + { + "epoch": 1.8730413105413106, + "grad_norm": 0.6885797381401062, + "learning_rate": 0.0001101307577552545, + "loss": 0.8479, + "step": 10520 + }, + { + "epoch": 1.8732193732193734, + "grad_norm": 0.6269213557243347, + "learning_rate": 0.00011011683219874323, + "loss": 0.9457, + "step": 10521 + }, + { + "epoch": 1.873397435897436, + "grad_norm": 0.7096766829490662, + "learning_rate": 0.00011010290644401364, + "loss": 1.0971, + "step": 10522 + }, + { + "epoch": 1.8735754985754984, + "grad_norm": 0.6909209489822388, + "learning_rate": 0.00011008898049133863, + "loss": 0.9928, + "step": 10523 + }, + { + "epoch": 1.8737535612535612, + "grad_norm": 0.6586211323738098, + "learning_rate": 0.000110075054340991, + "loss": 0.818, + "step": 10524 + }, + { + "epoch": 1.873931623931624, + "grad_norm": 0.5934817790985107, + "learning_rate": 0.0001100611279932436, + "loss": 0.7698, + "step": 10525 + }, + { + "epoch": 1.8741096866096867, + "grad_norm": 0.6361709237098694, + "learning_rate": 0.00011004720144836931, + "loss": 0.9465, + "step": 10526 + }, + { + "epoch": 1.8742877492877494, + "grad_norm": 0.6742212176322937, + "learning_rate": 0.00011003327470664095, + "loss": 1.0998, + "step": 10527 + }, + { + "epoch": 1.874465811965812, + "grad_norm": 0.6634946465492249, + 
"learning_rate": 0.00011001934776833143, + "loss": 0.8328, + "step": 10528 + }, + { + "epoch": 1.8746438746438745, + "grad_norm": 0.6754063963890076, + "learning_rate": 0.0001100054206337136, + "loss": 1.147, + "step": 10529 + }, + { + "epoch": 1.8748219373219372, + "grad_norm": 0.5951135158538818, + "learning_rate": 0.00010999149330306036, + "loss": 0.8956, + "step": 10530 + }, + { + "epoch": 1.875, + "grad_norm": 0.6140317320823669, + "learning_rate": 0.00010997756577664455, + "loss": 0.9368, + "step": 10531 + }, + { + "epoch": 1.8751780626780628, + "grad_norm": 0.6419258713722229, + "learning_rate": 0.00010996363805473904, + "loss": 0.9817, + "step": 10532 + }, + { + "epoch": 1.8753561253561255, + "grad_norm": 0.7173396348953247, + "learning_rate": 0.00010994971013761677, + "loss": 0.9638, + "step": 10533 + }, + { + "epoch": 1.875534188034188, + "grad_norm": 0.8125925660133362, + "learning_rate": 0.0001099357820255506, + "loss": 1.0996, + "step": 10534 + }, + { + "epoch": 1.8757122507122506, + "grad_norm": 0.6191564798355103, + "learning_rate": 0.00010992185371881341, + "loss": 0.8266, + "step": 10535 + }, + { + "epoch": 1.8758903133903133, + "grad_norm": 0.6632885336875916, + "learning_rate": 0.0001099079252176781, + "loss": 1.1884, + "step": 10536 + }, + { + "epoch": 1.876068376068376, + "grad_norm": 0.7323372960090637, + "learning_rate": 0.00010989399652241759, + "loss": 1.0842, + "step": 10537 + }, + { + "epoch": 1.8762464387464388, + "grad_norm": 0.7553854584693909, + "learning_rate": 0.00010988006763330476, + "loss": 0.9948, + "step": 10538 + }, + { + "epoch": 1.8764245014245016, + "grad_norm": 0.5887658596038818, + "learning_rate": 0.00010986613855061255, + "loss": 0.7653, + "step": 10539 + }, + { + "epoch": 1.876602564102564, + "grad_norm": 0.6849574446678162, + "learning_rate": 0.00010985220927461384, + "loss": 1.152, + "step": 10540 + }, + { + "epoch": 1.8767806267806266, + "grad_norm": 0.6985000371932983, + "learning_rate": 0.00010983827980558155, + 
"loss": 0.9869, + "step": 10541 + }, + { + "epoch": 1.8769586894586894, + "grad_norm": 0.6885373592376709, + "learning_rate": 0.00010982435014378858, + "loss": 1.1803, + "step": 10542 + }, + { + "epoch": 1.8771367521367521, + "grad_norm": 0.7610142827033997, + "learning_rate": 0.00010981042028950788, + "loss": 0.9219, + "step": 10543 + }, + { + "epoch": 1.8773148148148149, + "grad_norm": 0.6545612215995789, + "learning_rate": 0.00010979649024301242, + "loss": 1.0337, + "step": 10544 + }, + { + "epoch": 1.8774928774928776, + "grad_norm": 0.7307698130607605, + "learning_rate": 0.00010978256000457505, + "loss": 0.9726, + "step": 10545 + }, + { + "epoch": 1.8776709401709402, + "grad_norm": 0.68310546875, + "learning_rate": 0.00010976862957446877, + "loss": 1.161, + "step": 10546 + }, + { + "epoch": 1.8778490028490027, + "grad_norm": 0.6114758253097534, + "learning_rate": 0.00010975469895296646, + "loss": 0.8863, + "step": 10547 + }, + { + "epoch": 1.8780270655270654, + "grad_norm": 0.732390820980072, + "learning_rate": 0.00010974076814034106, + "loss": 1.0339, + "step": 10548 + }, + { + "epoch": 1.8782051282051282, + "grad_norm": 0.6741712689399719, + "learning_rate": 0.0001097268371368656, + "loss": 1.0024, + "step": 10549 + }, + { + "epoch": 1.878383190883191, + "grad_norm": 0.6374897360801697, + "learning_rate": 0.00010971290594281294, + "loss": 0.91, + "step": 10550 + }, + { + "epoch": 1.8785612535612537, + "grad_norm": 0.6434261202812195, + "learning_rate": 0.00010969897455845608, + "loss": 1.0048, + "step": 10551 + }, + { + "epoch": 1.8787393162393162, + "grad_norm": 0.6573047041893005, + "learning_rate": 0.00010968504298406794, + "loss": 1.118, + "step": 10552 + }, + { + "epoch": 1.8789173789173788, + "grad_norm": 0.6686552166938782, + "learning_rate": 0.00010967111121992152, + "loss": 1.089, + "step": 10553 + }, + { + "epoch": 1.8790954415954415, + "grad_norm": 0.7899606823921204, + "learning_rate": 0.00010965717926628976, + "loss": 1.059, + "step": 10554 + }, 
+ { + "epoch": 1.8792735042735043, + "grad_norm": 0.5808879733085632, + "learning_rate": 0.00010964324712344564, + "loss": 0.9369, + "step": 10555 + }, + { + "epoch": 1.879451566951567, + "grad_norm": 0.6322834491729736, + "learning_rate": 0.00010962931479166211, + "loss": 0.8783, + "step": 10556 + }, + { + "epoch": 1.8796296296296298, + "grad_norm": 0.647002637386322, + "learning_rate": 0.00010961538227121218, + "loss": 0.9468, + "step": 10557 + }, + { + "epoch": 1.8798076923076923, + "grad_norm": 0.6581854820251465, + "learning_rate": 0.0001096014495623688, + "loss": 1.0077, + "step": 10558 + }, + { + "epoch": 1.8799857549857548, + "grad_norm": 0.6879259943962097, + "learning_rate": 0.00010958751666540496, + "loss": 0.976, + "step": 10559 + }, + { + "epoch": 1.8801638176638176, + "grad_norm": 0.7055090665817261, + "learning_rate": 0.00010957358358059364, + "loss": 0.8903, + "step": 10560 + }, + { + "epoch": 1.8803418803418803, + "grad_norm": 0.6865016222000122, + "learning_rate": 0.00010955965030820782, + "loss": 0.9872, + "step": 10561 + }, + { + "epoch": 1.880519943019943, + "grad_norm": 0.663436770439148, + "learning_rate": 0.00010954571684852055, + "loss": 1.0485, + "step": 10562 + }, + { + "epoch": 1.8806980056980058, + "grad_norm": 0.6861656904220581, + "learning_rate": 0.00010953178320180475, + "loss": 1.0691, + "step": 10563 + }, + { + "epoch": 1.8808760683760684, + "grad_norm": 0.8045449256896973, + "learning_rate": 0.0001095178493683335, + "loss": 1.1534, + "step": 10564 + }, + { + "epoch": 1.881054131054131, + "grad_norm": 0.6493151187896729, + "learning_rate": 0.00010950391534837973, + "loss": 0.8756, + "step": 10565 + }, + { + "epoch": 1.8812321937321936, + "grad_norm": 0.7057121992111206, + "learning_rate": 0.00010948998114221651, + "loss": 1.1709, + "step": 10566 + }, + { + "epoch": 1.8814102564102564, + "grad_norm": 0.7708197236061096, + "learning_rate": 0.0001094760467501168, + "loss": 1.0037, + "step": 10567 + }, + { + "epoch": 
1.8815883190883191, + "grad_norm": 0.7234642505645752, + "learning_rate": 0.00010946211217235364, + "loss": 1.0757, + "step": 10568 + }, + { + "epoch": 1.881766381766382, + "grad_norm": 0.6964395642280579, + "learning_rate": 0.00010944817740920006, + "loss": 1.0769, + "step": 10569 + }, + { + "epoch": 1.8819444444444444, + "grad_norm": 0.7465848922729492, + "learning_rate": 0.00010943424246092906, + "loss": 0.9772, + "step": 10570 + }, + { + "epoch": 1.8821225071225072, + "grad_norm": 0.7145788073539734, + "learning_rate": 0.0001094203073278137, + "loss": 0.9638, + "step": 10571 + }, + { + "epoch": 1.8823005698005697, + "grad_norm": 0.7421764135360718, + "learning_rate": 0.00010940637201012698, + "loss": 1.0324, + "step": 10572 + }, + { + "epoch": 1.8824786324786325, + "grad_norm": 0.7373253107070923, + "learning_rate": 0.0001093924365081419, + "loss": 1.1554, + "step": 10573 + }, + { + "epoch": 1.8826566951566952, + "grad_norm": 0.6861984729766846, + "learning_rate": 0.00010937850082213156, + "loss": 0.9899, + "step": 10574 + }, + { + "epoch": 1.882834757834758, + "grad_norm": 0.6173393130302429, + "learning_rate": 0.000109364564952369, + "loss": 0.8495, + "step": 10575 + }, + { + "epoch": 1.8830128205128205, + "grad_norm": 0.6871610879898071, + "learning_rate": 0.00010935062889912723, + "loss": 1.2164, + "step": 10576 + }, + { + "epoch": 1.8831908831908832, + "grad_norm": 0.7062903642654419, + "learning_rate": 0.00010933669266267931, + "loss": 1.1077, + "step": 10577 + }, + { + "epoch": 1.8833689458689458, + "grad_norm": 0.6574689745903015, + "learning_rate": 0.00010932275624329828, + "loss": 0.9326, + "step": 10578 + }, + { + "epoch": 1.8835470085470085, + "grad_norm": 0.636385440826416, + "learning_rate": 0.00010930881964125723, + "loss": 1.0581, + "step": 10579 + }, + { + "epoch": 1.8837250712250713, + "grad_norm": 0.6178432106971741, + "learning_rate": 0.0001092948828568292, + "loss": 1.1288, + "step": 10580 + }, + { + "epoch": 1.883903133903134, + 
"grad_norm": 0.6509431600570679, + "learning_rate": 0.00010928094589028721, + "loss": 1.0113, + "step": 10581 + }, + { + "epoch": 1.8840811965811965, + "grad_norm": 0.6543706059455872, + "learning_rate": 0.00010926700874190441, + "loss": 1.0041, + "step": 10582 + }, + { + "epoch": 1.8842592592592593, + "grad_norm": 0.6815463304519653, + "learning_rate": 0.0001092530714119538, + "loss": 1.0892, + "step": 10583 + }, + { + "epoch": 1.8844373219373218, + "grad_norm": 0.6787421107292175, + "learning_rate": 0.00010923913390070846, + "loss": 1.2693, + "step": 10584 + }, + { + "epoch": 1.8846153846153846, + "grad_norm": 0.6953850984573364, + "learning_rate": 0.00010922519620844151, + "loss": 0.9848, + "step": 10585 + }, + { + "epoch": 1.8847934472934473, + "grad_norm": 0.7061360478401184, + "learning_rate": 0.000109211258335426, + "loss": 0.949, + "step": 10586 + }, + { + "epoch": 1.88497150997151, + "grad_norm": 0.6845372915267944, + "learning_rate": 0.00010919732028193504, + "loss": 0.9554, + "step": 10587 + }, + { + "epoch": 1.8851495726495726, + "grad_norm": 0.6524720788002014, + "learning_rate": 0.00010918338204824165, + "loss": 1.1037, + "step": 10588 + }, + { + "epoch": 1.8853276353276354, + "grad_norm": 0.6410523653030396, + "learning_rate": 0.00010916944363461899, + "loss": 0.9085, + "step": 10589 + }, + { + "epoch": 1.885505698005698, + "grad_norm": 0.7109059691429138, + "learning_rate": 0.00010915550504134014, + "loss": 1.0526, + "step": 10590 + }, + { + "epoch": 1.8856837606837606, + "grad_norm": 0.7781991362571716, + "learning_rate": 0.00010914156626867818, + "loss": 0.9737, + "step": 10591 + }, + { + "epoch": 1.8858618233618234, + "grad_norm": 0.7173767685890198, + "learning_rate": 0.00010912762731690623, + "loss": 0.8862, + "step": 10592 + }, + { + "epoch": 1.8860398860398861, + "grad_norm": 0.7650504112243652, + "learning_rate": 0.00010911368818629732, + "loss": 1.2175, + "step": 10593 + }, + { + "epoch": 1.8862179487179487, + "grad_norm": 
0.6316116452217102, + "learning_rate": 0.00010909974887712468, + "loss": 0.8332, + "step": 10594 + }, + { + "epoch": 1.8863960113960114, + "grad_norm": 0.6504800319671631, + "learning_rate": 0.00010908580938966138, + "loss": 0.8864, + "step": 10595 + }, + { + "epoch": 1.886574074074074, + "grad_norm": 0.675507128238678, + "learning_rate": 0.00010907186972418049, + "loss": 0.8523, + "step": 10596 + }, + { + "epoch": 1.8867521367521367, + "grad_norm": 0.6535763144493103, + "learning_rate": 0.00010905792988095515, + "loss": 1.0786, + "step": 10597 + }, + { + "epoch": 1.8869301994301995, + "grad_norm": 0.7071853280067444, + "learning_rate": 0.0001090439898602585, + "loss": 0.9319, + "step": 10598 + }, + { + "epoch": 1.8871082621082622, + "grad_norm": 0.699466347694397, + "learning_rate": 0.00010903004966236365, + "loss": 0.9573, + "step": 10599 + }, + { + "epoch": 1.8872863247863247, + "grad_norm": 0.7099201083183289, + "learning_rate": 0.00010901610928754375, + "loss": 0.9447, + "step": 10600 + }, + { + "epoch": 1.8874643874643875, + "grad_norm": 0.6140450835227966, + "learning_rate": 0.00010900216873607189, + "loss": 1.0227, + "step": 10601 + }, + { + "epoch": 1.88764245014245, + "grad_norm": 0.6613629460334778, + "learning_rate": 0.00010898822800822127, + "loss": 1.0152, + "step": 10602 + }, + { + "epoch": 1.8878205128205128, + "grad_norm": 0.7334819436073303, + "learning_rate": 0.00010897428710426498, + "loss": 1.1452, + "step": 10603 + }, + { + "epoch": 1.8879985754985755, + "grad_norm": 0.6819368004798889, + "learning_rate": 0.00010896034602447616, + "loss": 1.0504, + "step": 10604 + }, + { + "epoch": 1.8881766381766383, + "grad_norm": 0.6781361103057861, + "learning_rate": 0.00010894640476912799, + "loss": 0.8719, + "step": 10605 + }, + { + "epoch": 1.8883547008547008, + "grad_norm": 0.621960461139679, + "learning_rate": 0.00010893246333849361, + "loss": 0.9264, + "step": 10606 + }, + { + "epoch": 1.8885327635327636, + "grad_norm": 0.6350592374801636, + 
"learning_rate": 0.00010891852173284615, + "loss": 1.0042, + "step": 10607 + }, + { + "epoch": 1.888710826210826, + "grad_norm": 0.6650694012641907, + "learning_rate": 0.00010890457995245879, + "loss": 1.1387, + "step": 10608 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 0.6515723466873169, + "learning_rate": 0.00010889063799760468, + "loss": 0.9508, + "step": 10609 + }, + { + "epoch": 1.8890669515669516, + "grad_norm": 0.6368890404701233, + "learning_rate": 0.000108876695868557, + "loss": 0.8051, + "step": 10610 + }, + { + "epoch": 1.8892450142450143, + "grad_norm": 0.7971013188362122, + "learning_rate": 0.00010886275356558888, + "loss": 0.8629, + "step": 10611 + }, + { + "epoch": 1.8894230769230769, + "grad_norm": 0.6739095449447632, + "learning_rate": 0.00010884881108897353, + "loss": 0.9606, + "step": 10612 + }, + { + "epoch": 1.8896011396011396, + "grad_norm": 0.7754076719284058, + "learning_rate": 0.00010883486843898412, + "loss": 1.0751, + "step": 10613 + }, + { + "epoch": 1.8897792022792022, + "grad_norm": 0.6538285613059998, + "learning_rate": 0.00010882092561589379, + "loss": 0.9288, + "step": 10614 + }, + { + "epoch": 1.889957264957265, + "grad_norm": 0.7373257875442505, + "learning_rate": 0.00010880698261997577, + "loss": 0.9884, + "step": 10615 + }, + { + "epoch": 1.8901353276353277, + "grad_norm": 0.6575660109519958, + "learning_rate": 0.00010879303945150321, + "loss": 1.0307, + "step": 10616 + }, + { + "epoch": 1.8903133903133904, + "grad_norm": 0.7500179409980774, + "learning_rate": 0.00010877909611074932, + "loss": 1.0812, + "step": 10617 + }, + { + "epoch": 1.890491452991453, + "grad_norm": 0.7607308030128479, + "learning_rate": 0.00010876515259798727, + "loss": 0.9746, + "step": 10618 + }, + { + "epoch": 1.8906695156695157, + "grad_norm": 0.7930253744125366, + "learning_rate": 0.00010875120891349024, + "loss": 0.7911, + "step": 10619 + }, + { + "epoch": 1.8908475783475782, + "grad_norm": 0.635254979133606, + "learning_rate": 
0.00010873726505753148, + "loss": 1.0468, + "step": 10620 + }, + { + "epoch": 1.891025641025641, + "grad_norm": 0.7579759359359741, + "learning_rate": 0.00010872332103038414, + "loss": 0.9558, + "step": 10621 + }, + { + "epoch": 1.8912037037037037, + "grad_norm": 0.5841903686523438, + "learning_rate": 0.00010870937683232146, + "loss": 0.913, + "step": 10622 + }, + { + "epoch": 1.8913817663817665, + "grad_norm": 0.7088860273361206, + "learning_rate": 0.00010869543246361664, + "loss": 1.0814, + "step": 10623 + }, + { + "epoch": 1.8915598290598292, + "grad_norm": 0.6713772416114807, + "learning_rate": 0.00010868148792454285, + "loss": 0.9972, + "step": 10624 + }, + { + "epoch": 1.8917378917378918, + "grad_norm": 0.6733243465423584, + "learning_rate": 0.00010866754321537338, + "loss": 0.9596, + "step": 10625 + }, + { + "epoch": 1.8919159544159543, + "grad_norm": 0.7747747898101807, + "learning_rate": 0.00010865359833638138, + "loss": 1.0871, + "step": 10626 + }, + { + "epoch": 1.892094017094017, + "grad_norm": 0.677175760269165, + "learning_rate": 0.00010863965328784011, + "loss": 0.9939, + "step": 10627 + }, + { + "epoch": 1.8922720797720798, + "grad_norm": 0.7883930206298828, + "learning_rate": 0.00010862570807002279, + "loss": 1.0708, + "step": 10628 + }, + { + "epoch": 1.8924501424501425, + "grad_norm": 0.7003030180931091, + "learning_rate": 0.00010861176268320261, + "loss": 0.9791, + "step": 10629 + }, + { + "epoch": 1.8926282051282053, + "grad_norm": 0.7450358271598816, + "learning_rate": 0.00010859781712765284, + "loss": 0.9672, + "step": 10630 + }, + { + "epoch": 1.8928062678062678, + "grad_norm": 0.7776696085929871, + "learning_rate": 0.00010858387140364672, + "loss": 1.1037, + "step": 10631 + }, + { + "epoch": 1.8929843304843303, + "grad_norm": 0.6896173357963562, + "learning_rate": 0.00010856992551145745, + "loss": 1.0048, + "step": 10632 + }, + { + "epoch": 1.893162393162393, + "grad_norm": 0.5997697710990906, + "learning_rate": 0.00010855597945135834, + 
"loss": 0.8025, + "step": 10633 + }, + { + "epoch": 1.8933404558404558, + "grad_norm": 0.8781484365463257, + "learning_rate": 0.00010854203322362251, + "loss": 1.0014, + "step": 10634 + }, + { + "epoch": 1.8935185185185186, + "grad_norm": 0.6348843574523926, + "learning_rate": 0.00010852808682852334, + "loss": 0.9857, + "step": 10635 + }, + { + "epoch": 1.8936965811965814, + "grad_norm": 0.9704267978668213, + "learning_rate": 0.000108514140266334, + "loss": 1.0522, + "step": 10636 + }, + { + "epoch": 1.8938746438746439, + "grad_norm": 0.70372074842453, + "learning_rate": 0.00010850019353732779, + "loss": 1.1044, + "step": 10637 + }, + { + "epoch": 1.8940527065527064, + "grad_norm": 0.6528043150901794, + "learning_rate": 0.00010848624664177793, + "loss": 0.9328, + "step": 10638 + }, + { + "epoch": 1.8942307692307692, + "grad_norm": 0.6299768090248108, + "learning_rate": 0.00010847229957995768, + "loss": 1.0099, + "step": 10639 + }, + { + "epoch": 1.894408831908832, + "grad_norm": 0.6347038149833679, + "learning_rate": 0.00010845835235214034, + "loss": 1.1354, + "step": 10640 + }, + { + "epoch": 1.8945868945868947, + "grad_norm": 0.7087811827659607, + "learning_rate": 0.00010844440495859913, + "loss": 1.0543, + "step": 10641 + }, + { + "epoch": 1.8947649572649574, + "grad_norm": 0.7386305332183838, + "learning_rate": 0.00010843045739960738, + "loss": 0.9192, + "step": 10642 + }, + { + "epoch": 1.89494301994302, + "grad_norm": 0.6047097444534302, + "learning_rate": 0.00010841650967543833, + "loss": 0.8668, + "step": 10643 + }, + { + "epoch": 1.8951210826210825, + "grad_norm": 0.6779503226280212, + "learning_rate": 0.00010840256178636523, + "loss": 0.9263, + "step": 10644 + }, + { + "epoch": 1.8952991452991452, + "grad_norm": 0.7398194670677185, + "learning_rate": 0.00010838861373266138, + "loss": 0.9534, + "step": 10645 + }, + { + "epoch": 1.895477207977208, + "grad_norm": 0.8138558864593506, + "learning_rate": 0.00010837466551460011, + "loss": 0.9835, + "step": 10646 
+ }, + { + "epoch": 1.8956552706552707, + "grad_norm": 0.8847818374633789, + "learning_rate": 0.00010836071713245466, + "loss": 0.9769, + "step": 10647 + }, + { + "epoch": 1.8958333333333335, + "grad_norm": 0.6824164390563965, + "learning_rate": 0.0001083467685864983, + "loss": 0.9901, + "step": 10648 + }, + { + "epoch": 1.896011396011396, + "grad_norm": 0.6318182945251465, + "learning_rate": 0.00010833281987700436, + "loss": 0.7677, + "step": 10649 + }, + { + "epoch": 1.8961894586894585, + "grad_norm": 0.7372074127197266, + "learning_rate": 0.00010831887100424612, + "loss": 0.9858, + "step": 10650 + }, + { + "epoch": 1.8963675213675213, + "grad_norm": 0.7246516346931458, + "learning_rate": 0.00010830492196849688, + "loss": 0.9644, + "step": 10651 + }, + { + "epoch": 1.896545584045584, + "grad_norm": 0.6517095565795898, + "learning_rate": 0.00010829097277002997, + "loss": 1.1733, + "step": 10652 + }, + { + "epoch": 1.8967236467236468, + "grad_norm": 0.6931695342063904, + "learning_rate": 0.00010827702340911867, + "loss": 0.9923, + "step": 10653 + }, + { + "epoch": 1.8969017094017095, + "grad_norm": 0.6210272312164307, + "learning_rate": 0.00010826307388603628, + "loss": 0.8757, + "step": 10654 + }, + { + "epoch": 1.897079772079772, + "grad_norm": 0.7011165618896484, + "learning_rate": 0.00010824912420105611, + "loss": 1.0011, + "step": 10655 + }, + { + "epoch": 1.8972578347578346, + "grad_norm": 0.7431246638298035, + "learning_rate": 0.0001082351743544515, + "loss": 1.1498, + "step": 10656 + }, + { + "epoch": 1.8974358974358974, + "grad_norm": 0.7099978923797607, + "learning_rate": 0.00010822122434649576, + "loss": 1.0673, + "step": 10657 + }, + { + "epoch": 1.89761396011396, + "grad_norm": 0.7375551462173462, + "learning_rate": 0.00010820727417746219, + "loss": 1.0157, + "step": 10658 + }, + { + "epoch": 1.8977920227920229, + "grad_norm": 0.8155642151832581, + "learning_rate": 0.00010819332384762413, + "loss": 1.229, + "step": 10659 + }, + { + "epoch": 
1.8979700854700856, + "grad_norm": 0.6917914748191833, + "learning_rate": 0.00010817937335725493, + "loss": 0.9701, + "step": 10660 + }, + { + "epoch": 1.8981481481481481, + "grad_norm": 0.8498218059539795, + "learning_rate": 0.00010816542270662786, + "loss": 1.0123, + "step": 10661 + }, + { + "epoch": 1.8983262108262107, + "grad_norm": 0.7234359979629517, + "learning_rate": 0.00010815147189601634, + "loss": 1.0755, + "step": 10662 + }, + { + "epoch": 1.8985042735042734, + "grad_norm": 0.6997553110122681, + "learning_rate": 0.00010813752092569365, + "loss": 1.1594, + "step": 10663 + }, + { + "epoch": 1.8986823361823362, + "grad_norm": 0.6519457101821899, + "learning_rate": 0.00010812356979593314, + "loss": 0.9609, + "step": 10664 + }, + { + "epoch": 1.898860398860399, + "grad_norm": 0.7215374708175659, + "learning_rate": 0.00010810961850700813, + "loss": 1.1392, + "step": 10665 + }, + { + "epoch": 1.8990384615384617, + "grad_norm": 0.7766093611717224, + "learning_rate": 0.00010809566705919202, + "loss": 1.0256, + "step": 10666 + }, + { + "epoch": 1.8992165242165242, + "grad_norm": 0.6520358920097351, + "learning_rate": 0.00010808171545275814, + "loss": 1.0434, + "step": 10667 + }, + { + "epoch": 1.8993945868945867, + "grad_norm": 0.7454953193664551, + "learning_rate": 0.00010806776368797982, + "loss": 1.2323, + "step": 10668 + }, + { + "epoch": 1.8995726495726495, + "grad_norm": 0.6891530752182007, + "learning_rate": 0.00010805381176513043, + "loss": 1.1104, + "step": 10669 + }, + { + "epoch": 1.8997507122507122, + "grad_norm": 0.6609626412391663, + "learning_rate": 0.00010803985968448331, + "loss": 0.8565, + "step": 10670 + }, + { + "epoch": 1.899928774928775, + "grad_norm": 0.6650999188423157, + "learning_rate": 0.00010802590744631187, + "loss": 1.1003, + "step": 10671 + }, + { + "epoch": 1.9001068376068377, + "grad_norm": 0.5794292092323303, + "learning_rate": 0.00010801195505088945, + "loss": 0.528, + "step": 10672 + }, + { + "epoch": 1.9002849002849003, + 
"grad_norm": 1.0802743434906006, + "learning_rate": 0.00010799800249848939, + "loss": 0.8861, + "step": 10673 + }, + { + "epoch": 1.9004629629629628, + "grad_norm": 0.650833249092102, + "learning_rate": 0.00010798404978938513, + "loss": 0.9962, + "step": 10674 + }, + { + "epoch": 1.9006410256410255, + "grad_norm": 0.7290451526641846, + "learning_rate": 0.00010797009692384994, + "loss": 1.0764, + "step": 10675 + }, + { + "epoch": 1.9008190883190883, + "grad_norm": 0.6273928880691528, + "learning_rate": 0.00010795614390215727, + "loss": 0.9478, + "step": 10676 + }, + { + "epoch": 1.900997150997151, + "grad_norm": 0.6939455270767212, + "learning_rate": 0.00010794219072458052, + "loss": 0.8991, + "step": 10677 + }, + { + "epoch": 1.9011752136752138, + "grad_norm": 0.7455828189849854, + "learning_rate": 0.00010792823739139302, + "loss": 0.8902, + "step": 10678 + }, + { + "epoch": 1.9013532763532763, + "grad_norm": 0.6894607543945312, + "learning_rate": 0.00010791428390286817, + "loss": 0.9355, + "step": 10679 + }, + { + "epoch": 1.901531339031339, + "grad_norm": 0.6844658851623535, + "learning_rate": 0.00010790033025927936, + "loss": 0.9835, + "step": 10680 + }, + { + "epoch": 1.9017094017094016, + "grad_norm": 0.6646730899810791, + "learning_rate": 0.00010788637646090001, + "loss": 0.9376, + "step": 10681 + }, + { + "epoch": 1.9018874643874644, + "grad_norm": 0.6494864225387573, + "learning_rate": 0.00010787242250800349, + "loss": 0.8533, + "step": 10682 + }, + { + "epoch": 1.9020655270655271, + "grad_norm": 0.686198353767395, + "learning_rate": 0.0001078584684008632, + "loss": 0.8075, + "step": 10683 + }, + { + "epoch": 1.9022435897435899, + "grad_norm": 0.7014855742454529, + "learning_rate": 0.00010784451413975256, + "loss": 1.0805, + "step": 10684 + }, + { + "epoch": 1.9024216524216524, + "grad_norm": 0.7191864252090454, + "learning_rate": 0.00010783055972494496, + "loss": 0.9375, + "step": 10685 + }, + { + "epoch": 1.9025997150997151, + "grad_norm": 
0.8114212155342102, + "learning_rate": 0.00010781660515671379, + "loss": 0.9716, + "step": 10686 + }, + { + "epoch": 1.9027777777777777, + "grad_norm": 0.7423529028892517, + "learning_rate": 0.0001078026504353325, + "loss": 0.9066, + "step": 10687 + }, + { + "epoch": 1.9029558404558404, + "grad_norm": 0.6517882347106934, + "learning_rate": 0.00010778869556107447, + "loss": 0.9908, + "step": 10688 + }, + { + "epoch": 1.9031339031339032, + "grad_norm": 0.6983367800712585, + "learning_rate": 0.00010777474053421315, + "loss": 1.1048, + "step": 10689 + }, + { + "epoch": 1.903311965811966, + "grad_norm": 0.597766101360321, + "learning_rate": 0.00010776078535502193, + "loss": 0.84, + "step": 10690 + }, + { + "epoch": 1.9034900284900285, + "grad_norm": 0.7335455417633057, + "learning_rate": 0.00010774683002377422, + "loss": 1.0387, + "step": 10691 + }, + { + "epoch": 1.9036680911680912, + "grad_norm": 0.6742176413536072, + "learning_rate": 0.0001077328745407435, + "loss": 0.9743, + "step": 10692 + }, + { + "epoch": 1.9038461538461537, + "grad_norm": 0.7954961657524109, + "learning_rate": 0.00010771891890620316, + "loss": 1.1025, + "step": 10693 + }, + { + "epoch": 1.9040242165242165, + "grad_norm": 0.733351469039917, + "learning_rate": 0.00010770496312042664, + "loss": 1.028, + "step": 10694 + }, + { + "epoch": 1.9042022792022792, + "grad_norm": 0.7059772610664368, + "learning_rate": 0.00010769100718368734, + "loss": 1.0103, + "step": 10695 + }, + { + "epoch": 1.904380341880342, + "grad_norm": 0.6234813332557678, + "learning_rate": 0.00010767705109625877, + "loss": 0.6893, + "step": 10696 + }, + { + "epoch": 1.9045584045584045, + "grad_norm": 0.6670311689376831, + "learning_rate": 0.0001076630948584143, + "loss": 1.1386, + "step": 10697 + }, + { + "epoch": 1.9047364672364673, + "grad_norm": 0.7444894909858704, + "learning_rate": 0.00010764913847042744, + "loss": 0.8524, + "step": 10698 + }, + { + "epoch": 1.9049145299145298, + "grad_norm": 0.6252964735031128, + 
"learning_rate": 0.00010763518193257158, + "loss": 0.9407, + "step": 10699 + }, + { + "epoch": 1.9050925925925926, + "grad_norm": 0.7794382572174072, + "learning_rate": 0.0001076212252451202, + "loss": 1.05, + "step": 10700 + }, + { + "epoch": 1.9052706552706553, + "grad_norm": 0.6313693523406982, + "learning_rate": 0.00010760726840834671, + "loss": 0.8667, + "step": 10701 + }, + { + "epoch": 1.905448717948718, + "grad_norm": 0.6766461730003357, + "learning_rate": 0.00010759331142252462, + "loss": 0.9675, + "step": 10702 + }, + { + "epoch": 1.9056267806267806, + "grad_norm": 0.7457365393638611, + "learning_rate": 0.00010757935428792739, + "loss": 0.9177, + "step": 10703 + }, + { + "epoch": 1.9058048433048433, + "grad_norm": 0.6649872064590454, + "learning_rate": 0.00010756539700482844, + "loss": 0.8703, + "step": 10704 + }, + { + "epoch": 1.9059829059829059, + "grad_norm": 0.8418740034103394, + "learning_rate": 0.00010755143957350127, + "loss": 0.8993, + "step": 10705 + }, + { + "epoch": 1.9061609686609686, + "grad_norm": 0.6767167448997498, + "learning_rate": 0.00010753748199421929, + "loss": 1.0063, + "step": 10706 + }, + { + "epoch": 1.9063390313390314, + "grad_norm": 0.6959242820739746, + "learning_rate": 0.00010752352426725603, + "loss": 1.0516, + "step": 10707 + }, + { + "epoch": 1.9065170940170941, + "grad_norm": 0.7106529474258423, + "learning_rate": 0.00010750956639288493, + "loss": 0.9596, + "step": 10708 + }, + { + "epoch": 1.9066951566951567, + "grad_norm": 0.7611243724822998, + "learning_rate": 0.00010749560837137949, + "loss": 1.0739, + "step": 10709 + }, + { + "epoch": 1.9068732193732194, + "grad_norm": 0.6684338450431824, + "learning_rate": 0.00010748165020301317, + "loss": 1.1437, + "step": 10710 + }, + { + "epoch": 1.907051282051282, + "grad_norm": 0.5957385897636414, + "learning_rate": 0.00010746769188805945, + "loss": 0.8802, + "step": 10711 + }, + { + "epoch": 1.9072293447293447, + "grad_norm": 0.69919353723526, + "learning_rate": 
0.00010745373342679184, + "loss": 1.1891, + "step": 10712 + }, + { + "epoch": 1.9074074074074074, + "grad_norm": 0.7562127709388733, + "learning_rate": 0.0001074397748194838, + "loss": 0.8717, + "step": 10713 + }, + { + "epoch": 1.9075854700854702, + "grad_norm": 0.6420038938522339, + "learning_rate": 0.00010742581606640882, + "loss": 1.1196, + "step": 10714 + }, + { + "epoch": 1.9077635327635327, + "grad_norm": 0.7545611262321472, + "learning_rate": 0.00010741185716784039, + "loss": 1.161, + "step": 10715 + }, + { + "epoch": 1.9079415954415955, + "grad_norm": 0.6467727422714233, + "learning_rate": 0.000107397898124052, + "loss": 0.8029, + "step": 10716 + }, + { + "epoch": 1.908119658119658, + "grad_norm": 0.6129235625267029, + "learning_rate": 0.00010738393893531722, + "loss": 0.8802, + "step": 10717 + }, + { + "epoch": 1.9082977207977208, + "grad_norm": 0.6416113376617432, + "learning_rate": 0.00010736997960190946, + "loss": 0.8465, + "step": 10718 + }, + { + "epoch": 1.9084757834757835, + "grad_norm": 0.6609050631523132, + "learning_rate": 0.00010735602012410229, + "loss": 0.9484, + "step": 10719 + }, + { + "epoch": 1.9086538461538463, + "grad_norm": 0.6302639842033386, + "learning_rate": 0.00010734206050216913, + "loss": 0.898, + "step": 10720 + }, + { + "epoch": 1.9088319088319088, + "grad_norm": 0.7291215658187866, + "learning_rate": 0.00010732810073638358, + "loss": 0.9544, + "step": 10721 + }, + { + "epoch": 1.9090099715099715, + "grad_norm": 0.6436966061592102, + "learning_rate": 0.0001073141408270191, + "loss": 0.956, + "step": 10722 + }, + { + "epoch": 1.909188034188034, + "grad_norm": 0.6247875094413757, + "learning_rate": 0.00010730018077434924, + "loss": 0.8704, + "step": 10723 + }, + { + "epoch": 1.9093660968660968, + "grad_norm": 0.7599029541015625, + "learning_rate": 0.00010728622057864753, + "loss": 1.2024, + "step": 10724 + }, + { + "epoch": 1.9095441595441596, + "grad_norm": 0.6894544959068298, + "learning_rate": 0.00010727226024018744, + 
"loss": 1.1226, + "step": 10725 + }, + { + "epoch": 1.9097222222222223, + "grad_norm": 0.6920733451843262, + "learning_rate": 0.0001072582997592425, + "loss": 0.7682, + "step": 10726 + }, + { + "epoch": 1.9099002849002849, + "grad_norm": 0.6013005375862122, + "learning_rate": 0.00010724433913608627, + "loss": 0.9462, + "step": 10727 + }, + { + "epoch": 1.9100783475783476, + "grad_norm": 0.7466302514076233, + "learning_rate": 0.00010723037837099225, + "loss": 0.9507, + "step": 10728 + }, + { + "epoch": 1.9102564102564101, + "grad_norm": 0.7070091962814331, + "learning_rate": 0.00010721641746423401, + "loss": 1.0704, + "step": 10729 + }, + { + "epoch": 1.9104344729344729, + "grad_norm": 0.6747950315475464, + "learning_rate": 0.00010720245641608506, + "loss": 0.7899, + "step": 10730 + }, + { + "epoch": 1.9106125356125356, + "grad_norm": 0.7338371276855469, + "learning_rate": 0.00010718849522681891, + "loss": 0.9574, + "step": 10731 + }, + { + "epoch": 1.9107905982905984, + "grad_norm": 0.6923216581344604, + "learning_rate": 0.00010717453389670915, + "loss": 1.0725, + "step": 10732 + }, + { + "epoch": 1.910968660968661, + "grad_norm": 0.6050783395767212, + "learning_rate": 0.0001071605724260293, + "loss": 0.9224, + "step": 10733 + }, + { + "epoch": 1.9111467236467237, + "grad_norm": 0.6854597330093384, + "learning_rate": 0.00010714661081505291, + "loss": 0.9749, + "step": 10734 + }, + { + "epoch": 1.9113247863247862, + "grad_norm": 0.7661508321762085, + "learning_rate": 0.00010713264906405351, + "loss": 1.1564, + "step": 10735 + }, + { + "epoch": 1.911502849002849, + "grad_norm": 0.6389622688293457, + "learning_rate": 0.00010711868717330467, + "loss": 0.8148, + "step": 10736 + }, + { + "epoch": 1.9116809116809117, + "grad_norm": 0.6318161487579346, + "learning_rate": 0.00010710472514307996, + "loss": 0.7833, + "step": 10737 + }, + { + "epoch": 1.9118589743589745, + "grad_norm": 0.8646727800369263, + "learning_rate": 0.00010709076297365292, + "loss": 1.2682, + "step": 
10738 + }, + { + "epoch": 1.9120370370370372, + "grad_norm": 0.6085501909255981, + "learning_rate": 0.0001070768006652971, + "loss": 0.8706, + "step": 10739 + }, + { + "epoch": 1.9122150997150997, + "grad_norm": 0.8259731531143188, + "learning_rate": 0.00010706283821828607, + "loss": 0.9014, + "step": 10740 + }, + { + "epoch": 1.9123931623931623, + "grad_norm": 0.6509148478507996, + "learning_rate": 0.0001070488756328934, + "loss": 0.8814, + "step": 10741 + }, + { + "epoch": 1.912571225071225, + "grad_norm": 0.7241966128349304, + "learning_rate": 0.00010703491290939264, + "loss": 0.9925, + "step": 10742 + }, + { + "epoch": 1.9127492877492878, + "grad_norm": 0.7736822366714478, + "learning_rate": 0.00010702095004805738, + "loss": 1.0881, + "step": 10743 + }, + { + "epoch": 1.9129273504273505, + "grad_norm": 0.6912824511528015, + "learning_rate": 0.00010700698704916123, + "loss": 1.2334, + "step": 10744 + }, + { + "epoch": 1.9131054131054133, + "grad_norm": 0.825065553188324, + "learning_rate": 0.0001069930239129777, + "loss": 0.9783, + "step": 10745 + }, + { + "epoch": 1.9132834757834758, + "grad_norm": 0.7650560140609741, + "learning_rate": 0.00010697906063978038, + "loss": 0.9788, + "step": 10746 + }, + { + "epoch": 1.9134615384615383, + "grad_norm": 0.7368232607841492, + "learning_rate": 0.00010696509722984287, + "loss": 0.8704, + "step": 10747 + }, + { + "epoch": 1.913639601139601, + "grad_norm": 0.6630628108978271, + "learning_rate": 0.00010695113368343875, + "loss": 1.1993, + "step": 10748 + }, + { + "epoch": 1.9138176638176638, + "grad_norm": 0.6842190027236938, + "learning_rate": 0.0001069371700008416, + "loss": 0.9128, + "step": 10749 + }, + { + "epoch": 1.9139957264957266, + "grad_norm": 0.591655969619751, + "learning_rate": 0.00010692320618232503, + "loss": 1.0607, + "step": 10750 + }, + { + "epoch": 1.9141737891737893, + "grad_norm": 0.74644535779953, + "learning_rate": 0.0001069092422281626, + "loss": 1.0937, + "step": 10751 + }, + { + "epoch": 
1.9143518518518519, + "grad_norm": 0.7123813629150391, + "learning_rate": 0.00010689527813862792, + "loss": 0.9043, + "step": 10752 + }, + { + "epoch": 1.9145299145299144, + "grad_norm": 0.6850089430809021, + "learning_rate": 0.0001068813139139946, + "loss": 1.0908, + "step": 10753 + }, + { + "epoch": 1.9147079772079771, + "grad_norm": 0.5882078409194946, + "learning_rate": 0.00010686734955453623, + "loss": 0.829, + "step": 10754 + }, + { + "epoch": 1.91488603988604, + "grad_norm": 0.6741717457771301, + "learning_rate": 0.00010685338506052642, + "loss": 0.9197, + "step": 10755 + }, + { + "epoch": 1.9150641025641026, + "grad_norm": 0.6597354412078857, + "learning_rate": 0.00010683942043223876, + "loss": 0.8778, + "step": 10756 + }, + { + "epoch": 1.9152421652421654, + "grad_norm": 0.6682151556015015, + "learning_rate": 0.00010682545566994684, + "loss": 0.9305, + "step": 10757 + }, + { + "epoch": 1.915420227920228, + "grad_norm": 0.8283176422119141, + "learning_rate": 0.00010681149077392431, + "loss": 1.0164, + "step": 10758 + }, + { + "epoch": 1.9155982905982905, + "grad_norm": 0.648845374584198, + "learning_rate": 0.00010679752574444477, + "loss": 1.0114, + "step": 10759 + }, + { + "epoch": 1.9157763532763532, + "grad_norm": 0.755913496017456, + "learning_rate": 0.00010678356058178182, + "loss": 1.1142, + "step": 10760 + }, + { + "epoch": 1.915954415954416, + "grad_norm": 0.7334780097007751, + "learning_rate": 0.00010676959528620911, + "loss": 0.8758, + "step": 10761 + }, + { + "epoch": 1.9161324786324787, + "grad_norm": 0.9132041335105896, + "learning_rate": 0.00010675562985800025, + "loss": 0.995, + "step": 10762 + }, + { + "epoch": 1.9163105413105415, + "grad_norm": 0.7070860266685486, + "learning_rate": 0.00010674166429742882, + "loss": 0.9856, + "step": 10763 + }, + { + "epoch": 1.916488603988604, + "grad_norm": 0.7143638134002686, + "learning_rate": 0.00010672769860476853, + "loss": 1.0612, + "step": 10764 + }, + { + "epoch": 1.9166666666666665, + 
"grad_norm": 0.815717339515686, + "learning_rate": 0.00010671373278029293, + "loss": 1.1539, + "step": 10765 + }, + { + "epoch": 1.9168447293447293, + "grad_norm": 0.6379499435424805, + "learning_rate": 0.0001066997668242757, + "loss": 0.8295, + "step": 10766 + }, + { + "epoch": 1.917022792022792, + "grad_norm": 0.6482511758804321, + "learning_rate": 0.00010668580073699044, + "loss": 1.0079, + "step": 10767 + }, + { + "epoch": 1.9172008547008548, + "grad_norm": 0.7382873296737671, + "learning_rate": 0.00010667183451871082, + "loss": 0.8973, + "step": 10768 + }, + { + "epoch": 1.9173789173789175, + "grad_norm": 0.7818579077720642, + "learning_rate": 0.00010665786816971044, + "loss": 1.2131, + "step": 10769 + }, + { + "epoch": 1.91755698005698, + "grad_norm": 0.6960901021957397, + "learning_rate": 0.000106643901690263, + "loss": 1.1466, + "step": 10770 + }, + { + "epoch": 1.9177350427350426, + "grad_norm": 0.696966826915741, + "learning_rate": 0.00010662993508064208, + "loss": 0.854, + "step": 10771 + }, + { + "epoch": 1.9179131054131053, + "grad_norm": 0.6745442152023315, + "learning_rate": 0.00010661596834112133, + "loss": 0.9559, + "step": 10772 + }, + { + "epoch": 1.918091168091168, + "grad_norm": 0.7436230778694153, + "learning_rate": 0.00010660200147197447, + "loss": 1.1367, + "step": 10773 + }, + { + "epoch": 1.9182692307692308, + "grad_norm": 0.6051676869392395, + "learning_rate": 0.00010658803447347509, + "loss": 1.05, + "step": 10774 + }, + { + "epoch": 1.9184472934472936, + "grad_norm": 0.5662530660629272, + "learning_rate": 0.00010657406734589686, + "loss": 0.8697, + "step": 10775 + }, + { + "epoch": 1.9186253561253561, + "grad_norm": 0.6640757322311401, + "learning_rate": 0.00010656010008951344, + "loss": 1.0636, + "step": 10776 + }, + { + "epoch": 1.9188034188034186, + "grad_norm": 0.6994011998176575, + "learning_rate": 0.00010654613270459848, + "loss": 0.9326, + "step": 10777 + }, + { + "epoch": 1.9189814814814814, + "grad_norm": 0.6827420592308044, + 
"learning_rate": 0.00010653216519142563, + "loss": 0.8667, + "step": 10778 + }, + { + "epoch": 1.9191595441595442, + "grad_norm": 0.6814691424369812, + "learning_rate": 0.00010651819755026862, + "loss": 0.828, + "step": 10779 + }, + { + "epoch": 1.919337606837607, + "grad_norm": 0.7033611536026001, + "learning_rate": 0.00010650422978140103, + "loss": 1.0427, + "step": 10780 + }, + { + "epoch": 1.9195156695156697, + "grad_norm": 0.7098833322525024, + "learning_rate": 0.00010649026188509657, + "loss": 1.1723, + "step": 10781 + }, + { + "epoch": 1.9196937321937322, + "grad_norm": 0.7184767723083496, + "learning_rate": 0.00010647629386162893, + "loss": 0.852, + "step": 10782 + }, + { + "epoch": 1.9198717948717947, + "grad_norm": 0.6682565808296204, + "learning_rate": 0.00010646232571127175, + "loss": 0.8827, + "step": 10783 + }, + { + "epoch": 1.9200498575498575, + "grad_norm": 0.6699280142784119, + "learning_rate": 0.00010644835743429873, + "loss": 0.8346, + "step": 10784 + }, + { + "epoch": 1.9202279202279202, + "grad_norm": 0.8041857481002808, + "learning_rate": 0.00010643438903098355, + "loss": 0.9622, + "step": 10785 + }, + { + "epoch": 1.920405982905983, + "grad_norm": 0.7315110564231873, + "learning_rate": 0.00010642042050159986, + "loss": 1.0443, + "step": 10786 + }, + { + "epoch": 1.9205840455840457, + "grad_norm": 0.5850204229354858, + "learning_rate": 0.0001064064518464214, + "loss": 1.0155, + "step": 10787 + }, + { + "epoch": 1.9207621082621082, + "grad_norm": 0.7320640683174133, + "learning_rate": 0.00010639248306572178, + "loss": 1.1556, + "step": 10788 + }, + { + "epoch": 1.9209401709401708, + "grad_norm": 0.689804196357727, + "learning_rate": 0.00010637851415977478, + "loss": 1.1058, + "step": 10789 + }, + { + "epoch": 1.9211182336182335, + "grad_norm": 0.6433262228965759, + "learning_rate": 0.000106364545128854, + "loss": 1.0916, + "step": 10790 + }, + { + "epoch": 1.9212962962962963, + "grad_norm": 0.6802626252174377, + "learning_rate": 
0.00010635057597323323, + "loss": 1.126, + "step": 10791 + }, + { + "epoch": 1.921474358974359, + "grad_norm": 0.7503384351730347, + "learning_rate": 0.00010633660669318608, + "loss": 0.9354, + "step": 10792 + }, + { + "epoch": 1.9216524216524218, + "grad_norm": 0.6370253562927246, + "learning_rate": 0.00010632263728898629, + "loss": 0.9976, + "step": 10793 + }, + { + "epoch": 1.9218304843304843, + "grad_norm": 0.7566042542457581, + "learning_rate": 0.00010630866776090755, + "loss": 1.0311, + "step": 10794 + }, + { + "epoch": 1.922008547008547, + "grad_norm": 0.7011943459510803, + "learning_rate": 0.0001062946981092236, + "loss": 0.8777, + "step": 10795 + }, + { + "epoch": 1.9221866096866096, + "grad_norm": 0.6621114015579224, + "learning_rate": 0.00010628072833420811, + "loss": 0.9615, + "step": 10796 + }, + { + "epoch": 1.9223646723646723, + "grad_norm": 0.6863150000572205, + "learning_rate": 0.00010626675843613478, + "loss": 1.071, + "step": 10797 + }, + { + "epoch": 1.922542735042735, + "grad_norm": 0.597970724105835, + "learning_rate": 0.00010625278841527733, + "loss": 0.8661, + "step": 10798 + }, + { + "epoch": 1.9227207977207978, + "grad_norm": 0.5958755612373352, + "learning_rate": 0.00010623881827190947, + "loss": 0.9075, + "step": 10799 + }, + { + "epoch": 1.9228988603988604, + "grad_norm": 0.7764523029327393, + "learning_rate": 0.00010622484800630494, + "loss": 1.0576, + "step": 10800 + }, + { + "epoch": 1.9230769230769231, + "grad_norm": 0.774156391620636, + "learning_rate": 0.00010621087761873748, + "loss": 0.9273, + "step": 10801 + }, + { + "epoch": 1.9232549857549857, + "grad_norm": 0.6321687698364258, + "learning_rate": 0.00010619690710948074, + "loss": 0.8805, + "step": 10802 + }, + { + "epoch": 1.9234330484330484, + "grad_norm": 0.659538984298706, + "learning_rate": 0.00010618293647880846, + "loss": 0.9845, + "step": 10803 + }, + { + "epoch": 1.9236111111111112, + "grad_norm": 0.6931299567222595, + "learning_rate": 0.00010616896572699442, + 
"loss": 1.2005, + "step": 10804 + }, + { + "epoch": 1.923789173789174, + "grad_norm": 0.6054762005805969, + "learning_rate": 0.00010615499485431228, + "loss": 0.825, + "step": 10805 + }, + { + "epoch": 1.9239672364672364, + "grad_norm": 0.6631526947021484, + "learning_rate": 0.00010614102386103584, + "loss": 0.9149, + "step": 10806 + }, + { + "epoch": 1.9241452991452992, + "grad_norm": 0.6667893528938293, + "learning_rate": 0.00010612705274743878, + "loss": 1.014, + "step": 10807 + }, + { + "epoch": 1.9243233618233617, + "grad_norm": 0.861302375793457, + "learning_rate": 0.00010611308151379482, + "loss": 1.0809, + "step": 10808 + }, + { + "epoch": 1.9245014245014245, + "grad_norm": 0.6997994780540466, + "learning_rate": 0.00010609911016037777, + "loss": 0.8897, + "step": 10809 + }, + { + "epoch": 1.9246794871794872, + "grad_norm": 0.5689206123352051, + "learning_rate": 0.00010608513868746131, + "loss": 0.7517, + "step": 10810 + }, + { + "epoch": 1.92485754985755, + "grad_norm": 0.5972287654876709, + "learning_rate": 0.00010607116709531918, + "loss": 0.9015, + "step": 10811 + }, + { + "epoch": 1.9250356125356125, + "grad_norm": 0.7115643620491028, + "learning_rate": 0.00010605719538422519, + "loss": 0.6974, + "step": 10812 + }, + { + "epoch": 1.9252136752136753, + "grad_norm": 0.6548098921775818, + "learning_rate": 0.00010604322355445297, + "loss": 0.7075, + "step": 10813 + }, + { + "epoch": 1.9253917378917378, + "grad_norm": 0.6666337847709656, + "learning_rate": 0.00010602925160627639, + "loss": 1.0389, + "step": 10814 + }, + { + "epoch": 1.9255698005698005, + "grad_norm": 0.7754444479942322, + "learning_rate": 0.00010601527953996913, + "loss": 1.0674, + "step": 10815 + }, + { + "epoch": 1.9257478632478633, + "grad_norm": 0.6602712869644165, + "learning_rate": 0.00010600130735580498, + "loss": 1.2622, + "step": 10816 + }, + { + "epoch": 1.925925925925926, + "grad_norm": 0.6974020004272461, + "learning_rate": 0.00010598733505405767, + "loss": 0.9748, + "step": 
10817 + }, + { + "epoch": 1.9261039886039886, + "grad_norm": 0.6236271858215332, + "learning_rate": 0.00010597336263500095, + "loss": 0.9463, + "step": 10818 + }, + { + "epoch": 1.9262820512820513, + "grad_norm": 0.6856079697608948, + "learning_rate": 0.00010595939009890859, + "loss": 0.9484, + "step": 10819 + }, + { + "epoch": 1.9264601139601139, + "grad_norm": 0.7300925850868225, + "learning_rate": 0.00010594541744605437, + "loss": 0.9702, + "step": 10820 + }, + { + "epoch": 1.9266381766381766, + "grad_norm": 0.6546478867530823, + "learning_rate": 0.00010593144467671208, + "loss": 0.8235, + "step": 10821 + }, + { + "epoch": 1.9268162393162394, + "grad_norm": 0.7215169072151184, + "learning_rate": 0.00010591747179115543, + "loss": 0.9986, + "step": 10822 + }, + { + "epoch": 1.926994301994302, + "grad_norm": 0.7304712533950806, + "learning_rate": 0.00010590349878965822, + "loss": 1.099, + "step": 10823 + }, + { + "epoch": 1.9271723646723646, + "grad_norm": 0.5883305668830872, + "learning_rate": 0.0001058895256724942, + "loss": 1.0647, + "step": 10824 + }, + { + "epoch": 1.9273504273504274, + "grad_norm": 0.8067272305488586, + "learning_rate": 0.00010587555243993716, + "loss": 1.0295, + "step": 10825 + }, + { + "epoch": 1.92752849002849, + "grad_norm": 0.6607550978660583, + "learning_rate": 0.00010586157909226089, + "loss": 0.8669, + "step": 10826 + }, + { + "epoch": 1.9277065527065527, + "grad_norm": 0.7256106734275818, + "learning_rate": 0.00010584760562973914, + "loss": 1.1674, + "step": 10827 + }, + { + "epoch": 1.9278846153846154, + "grad_norm": 0.6584621071815491, + "learning_rate": 0.00010583363205264574, + "loss": 0.8901, + "step": 10828 + }, + { + "epoch": 1.9280626780626782, + "grad_norm": 0.7200617790222168, + "learning_rate": 0.00010581965836125439, + "loss": 1.0463, + "step": 10829 + }, + { + "epoch": 1.9282407407407407, + "grad_norm": 0.7244223952293396, + "learning_rate": 0.00010580568455583894, + "loss": 1.0973, + "step": 10830 + }, + { + "epoch": 
1.9284188034188035, + "grad_norm": 0.7678009867668152, + "learning_rate": 0.00010579171063667317, + "loss": 1.1753, + "step": 10831 + }, + { + "epoch": 1.928596866096866, + "grad_norm": 0.6455881595611572, + "learning_rate": 0.00010577773660403085, + "loss": 0.8988, + "step": 10832 + }, + { + "epoch": 1.9287749287749287, + "grad_norm": 0.6804864406585693, + "learning_rate": 0.0001057637624581858, + "loss": 0.8156, + "step": 10833 + }, + { + "epoch": 1.9289529914529915, + "grad_norm": 0.7874828577041626, + "learning_rate": 0.00010574978819941176, + "loss": 1.1876, + "step": 10834 + }, + { + "epoch": 1.9291310541310542, + "grad_norm": 0.7396490573883057, + "learning_rate": 0.00010573581382798261, + "loss": 0.8709, + "step": 10835 + }, + { + "epoch": 1.9293091168091168, + "grad_norm": 0.6800381541252136, + "learning_rate": 0.00010572183934417209, + "loss": 0.9906, + "step": 10836 + }, + { + "epoch": 1.9294871794871795, + "grad_norm": 0.7077754139900208, + "learning_rate": 0.000105707864748254, + "loss": 0.9785, + "step": 10837 + }, + { + "epoch": 1.929665242165242, + "grad_norm": 0.693249523639679, + "learning_rate": 0.00010569389004050216, + "loss": 0.9515, + "step": 10838 + }, + { + "epoch": 1.9298433048433048, + "grad_norm": 0.706924319267273, + "learning_rate": 0.00010567991522119037, + "loss": 1.074, + "step": 10839 + }, + { + "epoch": 1.9300213675213675, + "grad_norm": 0.6504101157188416, + "learning_rate": 0.00010566594029059244, + "loss": 1.0635, + "step": 10840 + }, + { + "epoch": 1.9301994301994303, + "grad_norm": 0.7620238661766052, + "learning_rate": 0.00010565196524898219, + "loss": 0.944, + "step": 10841 + }, + { + "epoch": 1.9303774928774928, + "grad_norm": 0.6713484525680542, + "learning_rate": 0.00010563799009663344, + "loss": 0.749, + "step": 10842 + }, + { + "epoch": 1.9305555555555556, + "grad_norm": 0.9279242157936096, + "learning_rate": 0.00010562401483381997, + "loss": 0.961, + "step": 10843 + }, + { + "epoch": 1.930733618233618, + "grad_norm": 
0.6710723638534546, + "learning_rate": 0.00010561003946081558, + "loss": 1.1288, + "step": 10844 + }, + { + "epoch": 1.9309116809116809, + "grad_norm": 0.7751701474189758, + "learning_rate": 0.00010559606397789416, + "loss": 0.9435, + "step": 10845 + }, + { + "epoch": 1.9310897435897436, + "grad_norm": 0.6741766929626465, + "learning_rate": 0.00010558208838532948, + "loss": 1.0299, + "step": 10846 + }, + { + "epoch": 1.9312678062678064, + "grad_norm": 0.6988041400909424, + "learning_rate": 0.00010556811268339539, + "loss": 1.0236, + "step": 10847 + }, + { + "epoch": 1.931445868945869, + "grad_norm": 0.6353505253791809, + "learning_rate": 0.00010555413687236568, + "loss": 1.0361, + "step": 10848 + }, + { + "epoch": 1.9316239316239316, + "grad_norm": 0.7162703275680542, + "learning_rate": 0.0001055401609525142, + "loss": 1.0931, + "step": 10849 + }, + { + "epoch": 1.9318019943019942, + "grad_norm": 0.61545330286026, + "learning_rate": 0.00010552618492411476, + "loss": 0.8829, + "step": 10850 + }, + { + "epoch": 1.931980056980057, + "grad_norm": 0.6304612159729004, + "learning_rate": 0.00010551220878744124, + "loss": 0.8574, + "step": 10851 + }, + { + "epoch": 1.9321581196581197, + "grad_norm": 0.6372067928314209, + "learning_rate": 0.00010549823254276743, + "loss": 1.0949, + "step": 10852 + }, + { + "epoch": 1.9323361823361824, + "grad_norm": 0.6952856779098511, + "learning_rate": 0.00010548425619036715, + "loss": 0.9232, + "step": 10853 + }, + { + "epoch": 1.9325142450142452, + "grad_norm": 0.6510106325149536, + "learning_rate": 0.00010547027973051427, + "loss": 1.0753, + "step": 10854 + }, + { + "epoch": 1.9326923076923077, + "grad_norm": 0.6377716064453125, + "learning_rate": 0.00010545630316348263, + "loss": 0.8466, + "step": 10855 + }, + { + "epoch": 1.9328703703703702, + "grad_norm": 0.7366968393325806, + "learning_rate": 0.00010544232648954606, + "loss": 0.9351, + "step": 10856 + }, + { + "epoch": 1.933048433048433, + "grad_norm": 0.703652024269104, + 
"learning_rate": 0.00010542834970897843, + "loss": 1.0032, + "step": 10857 + }, + { + "epoch": 1.9332264957264957, + "grad_norm": 0.6685494780540466, + "learning_rate": 0.00010541437282205355, + "loss": 0.8818, + "step": 10858 + }, + { + "epoch": 1.9334045584045585, + "grad_norm": 0.6594362854957581, + "learning_rate": 0.00010540039582904527, + "loss": 0.9535, + "step": 10859 + }, + { + "epoch": 1.9335826210826212, + "grad_norm": 0.8003259301185608, + "learning_rate": 0.00010538641873022744, + "loss": 0.8852, + "step": 10860 + }, + { + "epoch": 1.9337606837606838, + "grad_norm": 0.6567012071609497, + "learning_rate": 0.00010537244152587393, + "loss": 1.0832, + "step": 10861 + }, + { + "epoch": 1.9339387464387463, + "grad_norm": 0.6714941263198853, + "learning_rate": 0.00010535846421625862, + "loss": 1.1047, + "step": 10862 + }, + { + "epoch": 1.934116809116809, + "grad_norm": 0.6998924612998962, + "learning_rate": 0.00010534448680165531, + "loss": 0.8827, + "step": 10863 + }, + { + "epoch": 1.9342948717948718, + "grad_norm": 0.6065765619277954, + "learning_rate": 0.0001053305092823379, + "loss": 0.5773, + "step": 10864 + }, + { + "epoch": 1.9344729344729346, + "grad_norm": 0.7678273916244507, + "learning_rate": 0.0001053165316585802, + "loss": 0.9199, + "step": 10865 + }, + { + "epoch": 1.9346509971509973, + "grad_norm": 0.7071540951728821, + "learning_rate": 0.00010530255393065613, + "loss": 1.0292, + "step": 10866 + }, + { + "epoch": 1.9348290598290598, + "grad_norm": 0.6329835057258606, + "learning_rate": 0.00010528857609883956, + "loss": 0.9915, + "step": 10867 + }, + { + "epoch": 1.9350071225071224, + "grad_norm": 0.6274038553237915, + "learning_rate": 0.00010527459816340427, + "loss": 0.8499, + "step": 10868 + }, + { + "epoch": 1.9351851851851851, + "grad_norm": 0.6564371585845947, + "learning_rate": 0.00010526062012462424, + "loss": 1.1707, + "step": 10869 + }, + { + "epoch": 1.9353632478632479, + "grad_norm": 0.8561269044876099, + "learning_rate": 
0.00010524664198277326, + "loss": 1.148, + "step": 10870 + }, + { + "epoch": 1.9355413105413106, + "grad_norm": 0.6322671175003052, + "learning_rate": 0.00010523266373812521, + "loss": 0.9165, + "step": 10871 + }, + { + "epoch": 1.9357193732193734, + "grad_norm": 0.7602947354316711, + "learning_rate": 0.00010521868539095403, + "loss": 0.9647, + "step": 10872 + }, + { + "epoch": 1.935897435897436, + "grad_norm": 0.5962168574333191, + "learning_rate": 0.00010520470694153353, + "loss": 0.8585, + "step": 10873 + }, + { + "epoch": 1.9360754985754984, + "grad_norm": 0.7498637437820435, + "learning_rate": 0.00010519072839013757, + "loss": 0.9828, + "step": 10874 + }, + { + "epoch": 1.9362535612535612, + "grad_norm": 0.6841256022453308, + "learning_rate": 0.00010517674973704012, + "loss": 0.9991, + "step": 10875 + }, + { + "epoch": 1.936431623931624, + "grad_norm": 0.8281826972961426, + "learning_rate": 0.00010516277098251499, + "loss": 1.028, + "step": 10876 + }, + { + "epoch": 1.9366096866096867, + "grad_norm": 0.6673563718795776, + "learning_rate": 0.0001051487921268361, + "loss": 1.1594, + "step": 10877 + }, + { + "epoch": 1.9367877492877494, + "grad_norm": 0.7833667993545532, + "learning_rate": 0.00010513481317027733, + "loss": 0.7675, + "step": 10878 + }, + { + "epoch": 1.936965811965812, + "grad_norm": 0.6087225675582886, + "learning_rate": 0.00010512083411311253, + "loss": 0.7803, + "step": 10879 + }, + { + "epoch": 1.9371438746438745, + "grad_norm": 0.6758120656013489, + "learning_rate": 0.00010510685495561563, + "loss": 1.0621, + "step": 10880 + }, + { + "epoch": 1.9373219373219372, + "grad_norm": 0.6720096468925476, + "learning_rate": 0.00010509287569806055, + "loss": 0.8502, + "step": 10881 + }, + { + "epoch": 1.9375, + "grad_norm": 0.6233887672424316, + "learning_rate": 0.00010507889634072113, + "loss": 1.0127, + "step": 10882 + }, + { + "epoch": 1.9376780626780628, + "grad_norm": 0.667742908000946, + "learning_rate": 0.00010506491688387127, + "loss": 0.9086, 
+ "step": 10883 + }, + { + "epoch": 1.9378561253561255, + "grad_norm": 0.6533677577972412, + "learning_rate": 0.00010505093732778492, + "loss": 0.9724, + "step": 10884 + }, + { + "epoch": 1.938034188034188, + "grad_norm": 0.7171359062194824, + "learning_rate": 0.00010503695767273591, + "loss": 0.9915, + "step": 10885 + }, + { + "epoch": 1.9382122507122506, + "grad_norm": 0.723655641078949, + "learning_rate": 0.0001050229779189982, + "loss": 0.8981, + "step": 10886 + }, + { + "epoch": 1.9383903133903133, + "grad_norm": 0.6863494515419006, + "learning_rate": 0.00010500899806684568, + "loss": 1.2577, + "step": 10887 + }, + { + "epoch": 1.938568376068376, + "grad_norm": 0.8174706697463989, + "learning_rate": 0.00010499501811655224, + "loss": 0.9848, + "step": 10888 + }, + { + "epoch": 1.9387464387464388, + "grad_norm": 0.6378024220466614, + "learning_rate": 0.00010498103806839179, + "loss": 0.8499, + "step": 10889 + }, + { + "epoch": 1.9389245014245016, + "grad_norm": 0.6734544634819031, + "learning_rate": 0.00010496705792263823, + "loss": 0.8446, + "step": 10890 + }, + { + "epoch": 1.939102564102564, + "grad_norm": 0.6802361607551575, + "learning_rate": 0.00010495307767956551, + "loss": 0.9285, + "step": 10891 + }, + { + "epoch": 1.9392806267806266, + "grad_norm": 0.7821299433708191, + "learning_rate": 0.00010493909733944752, + "loss": 1.08, + "step": 10892 + }, + { + "epoch": 1.9394586894586894, + "grad_norm": 0.6204990148544312, + "learning_rate": 0.00010492511690255818, + "loss": 0.7861, + "step": 10893 + }, + { + "epoch": 1.9396367521367521, + "grad_norm": 0.6386391520500183, + "learning_rate": 0.0001049111363691714, + "loss": 0.9162, + "step": 10894 + }, + { + "epoch": 1.9398148148148149, + "grad_norm": 0.6885092854499817, + "learning_rate": 0.0001048971557395611, + "loss": 1.0026, + "step": 10895 + }, + { + "epoch": 1.9399928774928776, + "grad_norm": 0.6962558627128601, + "learning_rate": 0.00010488317501400122, + "loss": 1.146, + "step": 10896 + }, + { + 
"epoch": 1.9401709401709402, + "grad_norm": 0.6283716559410095, + "learning_rate": 0.00010486919419276566, + "loss": 1.0268, + "step": 10897 + }, + { + "epoch": 1.9403490028490027, + "grad_norm": 0.7183622717857361, + "learning_rate": 0.00010485521327612835, + "loss": 1.0123, + "step": 10898 + }, + { + "epoch": 1.9405270655270654, + "grad_norm": 0.6354197263717651, + "learning_rate": 0.00010484123226436321, + "loss": 0.871, + "step": 10899 + }, + { + "epoch": 1.9407051282051282, + "grad_norm": 0.804358184337616, + "learning_rate": 0.00010482725115774421, + "loss": 1.1001, + "step": 10900 + }, + { + "epoch": 1.940883190883191, + "grad_norm": 0.6896754503250122, + "learning_rate": 0.00010481326995654524, + "loss": 1.0976, + "step": 10901 + }, + { + "epoch": 1.9410612535612537, + "grad_norm": 0.9108015894889832, + "learning_rate": 0.00010479928866104023, + "loss": 0.8785, + "step": 10902 + }, + { + "epoch": 1.9412393162393162, + "grad_norm": 0.6963121294975281, + "learning_rate": 0.00010478530727150316, + "loss": 1.0458, + "step": 10903 + }, + { + "epoch": 1.9414173789173788, + "grad_norm": 0.6657114624977112, + "learning_rate": 0.00010477132578820792, + "loss": 0.8188, + "step": 10904 + }, + { + "epoch": 1.9415954415954415, + "grad_norm": 0.671716034412384, + "learning_rate": 0.00010475734421142847, + "loss": 1.0915, + "step": 10905 + }, + { + "epoch": 1.9417735042735043, + "grad_norm": 0.6790717244148254, + "learning_rate": 0.0001047433625414387, + "loss": 0.9688, + "step": 10906 + }, + { + "epoch": 1.941951566951567, + "grad_norm": 0.6411764621734619, + "learning_rate": 0.00010472938077851264, + "loss": 1.0387, + "step": 10907 + }, + { + "epoch": 1.9421296296296298, + "grad_norm": 0.8579615950584412, + "learning_rate": 0.00010471539892292417, + "loss": 1.1635, + "step": 10908 + }, + { + "epoch": 1.9423076923076923, + "grad_norm": 0.7031029462814331, + "learning_rate": 0.00010470141697494726, + "loss": 0.9813, + "step": 10909 + }, + { + "epoch": 1.9424857549857548, 
+ "grad_norm": 0.6657388806343079, + "learning_rate": 0.00010468743493485584, + "loss": 0.7947, + "step": 10910 + }, + { + "epoch": 1.9426638176638176, + "grad_norm": 0.6364194750785828, + "learning_rate": 0.00010467345280292389, + "loss": 0.8554, + "step": 10911 + }, + { + "epoch": 1.9428418803418803, + "grad_norm": 0.7394127249717712, + "learning_rate": 0.00010465947057942534, + "loss": 0.822, + "step": 10912 + }, + { + "epoch": 1.943019943019943, + "grad_norm": 0.6557473540306091, + "learning_rate": 0.00010464548826463411, + "loss": 1.0025, + "step": 10913 + }, + { + "epoch": 1.9431980056980058, + "grad_norm": 0.6530601382255554, + "learning_rate": 0.00010463150585882422, + "loss": 1.0828, + "step": 10914 + }, + { + "epoch": 1.9433760683760684, + "grad_norm": 0.7376404404640198, + "learning_rate": 0.00010461752336226957, + "loss": 0.9413, + "step": 10915 + }, + { + "epoch": 1.943554131054131, + "grad_norm": 0.7110656499862671, + "learning_rate": 0.00010460354077524417, + "loss": 0.9162, + "step": 10916 + }, + { + "epoch": 1.9437321937321936, + "grad_norm": 0.6515666246414185, + "learning_rate": 0.00010458955809802194, + "loss": 0.9211, + "step": 10917 + }, + { + "epoch": 1.9439102564102564, + "grad_norm": 0.6888720989227295, + "learning_rate": 0.00010457557533087683, + "loss": 1.0632, + "step": 10918 + }, + { + "epoch": 1.9440883190883191, + "grad_norm": 0.7246627807617188, + "learning_rate": 0.00010456159247408286, + "loss": 0.9807, + "step": 10919 + }, + { + "epoch": 1.944266381766382, + "grad_norm": 0.727834165096283, + "learning_rate": 0.00010454760952791394, + "loss": 1.0793, + "step": 10920 + }, + { + "epoch": 1.9444444444444444, + "grad_norm": 0.6365306377410889, + "learning_rate": 0.00010453362649264407, + "loss": 1.0415, + "step": 10921 + }, + { + "epoch": 1.9446225071225072, + "grad_norm": 0.7187839150428772, + "learning_rate": 0.0001045196433685472, + "loss": 1.007, + "step": 10922 + }, + { + "epoch": 1.9448005698005697, + "grad_norm": 
0.5905138254165649, + "learning_rate": 0.00010450566015589732, + "loss": 0.9818, + "step": 10923 + }, + { + "epoch": 1.9449786324786325, + "grad_norm": 0.7008894085884094, + "learning_rate": 0.00010449167685496837, + "loss": 0.8444, + "step": 10924 + }, + { + "epoch": 1.9451566951566952, + "grad_norm": 0.6126312017440796, + "learning_rate": 0.00010447769346603435, + "loss": 0.7207, + "step": 10925 + }, + { + "epoch": 1.945334757834758, + "grad_norm": 0.7513176202774048, + "learning_rate": 0.00010446370998936922, + "loss": 0.8693, + "step": 10926 + }, + { + "epoch": 1.9455128205128205, + "grad_norm": 0.6382531523704529, + "learning_rate": 0.00010444972642524697, + "loss": 0.8379, + "step": 10927 + }, + { + "epoch": 1.9456908831908832, + "grad_norm": 0.7062170505523682, + "learning_rate": 0.0001044357427739416, + "loss": 1.0525, + "step": 10928 + }, + { + "epoch": 1.9458689458689458, + "grad_norm": 0.6954067349433899, + "learning_rate": 0.00010442175903572703, + "loss": 1.0238, + "step": 10929 + }, + { + "epoch": 1.9460470085470085, + "grad_norm": 0.7257117033004761, + "learning_rate": 0.00010440777521087731, + "loss": 1.1413, + "step": 10930 + }, + { + "epoch": 1.9462250712250713, + "grad_norm": 0.6617701053619385, + "learning_rate": 0.00010439379129966635, + "loss": 1.0089, + "step": 10931 + }, + { + "epoch": 1.946403133903134, + "grad_norm": 0.6860800385475159, + "learning_rate": 0.00010437980730236821, + "loss": 1.1778, + "step": 10932 + }, + { + "epoch": 1.9465811965811965, + "grad_norm": 0.846235454082489, + "learning_rate": 0.00010436582321925684, + "loss": 0.9851, + "step": 10933 + }, + { + "epoch": 1.9467592592592593, + "grad_norm": 0.6385617852210999, + "learning_rate": 0.00010435183905060623, + "loss": 0.9542, + "step": 10934 + }, + { + "epoch": 1.9469373219373218, + "grad_norm": 0.7137401700019836, + "learning_rate": 0.00010433785479669038, + "loss": 1.0499, + "step": 10935 + }, + { + "epoch": 1.9471153846153846, + "grad_norm": 0.6269308924674988, + 
"learning_rate": 0.00010432387045778324, + "loss": 0.8929, + "step": 10936 + }, + { + "epoch": 1.9472934472934473, + "grad_norm": 0.7903163433074951, + "learning_rate": 0.00010430988603415888, + "loss": 0.9812, + "step": 10937 + }, + { + "epoch": 1.94747150997151, + "grad_norm": 0.6006736159324646, + "learning_rate": 0.00010429590152609121, + "loss": 0.7959, + "step": 10938 + }, + { + "epoch": 1.9476495726495726, + "grad_norm": 0.6061521768569946, + "learning_rate": 0.00010428191693385431, + "loss": 0.8748, + "step": 10939 + }, + { + "epoch": 1.9478276353276354, + "grad_norm": 0.6637623906135559, + "learning_rate": 0.00010426793225772216, + "loss": 0.7047, + "step": 10940 + }, + { + "epoch": 1.948005698005698, + "grad_norm": 0.7650586366653442, + "learning_rate": 0.00010425394749796874, + "loss": 1.0018, + "step": 10941 + }, + { + "epoch": 1.9481837606837606, + "grad_norm": 0.6575125455856323, + "learning_rate": 0.000104239962654868, + "loss": 0.8915, + "step": 10942 + }, + { + "epoch": 1.9483618233618234, + "grad_norm": 0.6315393447875977, + "learning_rate": 0.00010422597772869404, + "loss": 1.1884, + "step": 10943 + }, + { + "epoch": 1.9485398860398861, + "grad_norm": 0.7607148885726929, + "learning_rate": 0.00010421199271972083, + "loss": 0.9341, + "step": 10944 + }, + { + "epoch": 1.9487179487179487, + "grad_norm": 0.6491827964782715, + "learning_rate": 0.00010419800762822239, + "loss": 0.9991, + "step": 10945 + }, + { + "epoch": 1.9488960113960114, + "grad_norm": 0.6294243335723877, + "learning_rate": 0.00010418402245447265, + "loss": 0.9253, + "step": 10946 + }, + { + "epoch": 1.949074074074074, + "grad_norm": 0.6472215056419373, + "learning_rate": 0.00010417003719874571, + "loss": 1.0402, + "step": 10947 + }, + { + "epoch": 1.9492521367521367, + "grad_norm": 0.7377899885177612, + "learning_rate": 0.00010415605186131559, + "loss": 1.046, + "step": 10948 + }, + { + "epoch": 1.9494301994301995, + "grad_norm": 0.6391907334327698, + "learning_rate": 
0.00010414206644245623, + "loss": 0.8529, + "step": 10949 + }, + { + "epoch": 1.9496082621082622, + "grad_norm": 0.7101355195045471, + "learning_rate": 0.0001041280809424417, + "loss": 0.925, + "step": 10950 + }, + { + "epoch": 1.9497863247863247, + "grad_norm": 0.7891978025436401, + "learning_rate": 0.00010411409536154597, + "loss": 1.0691, + "step": 10951 + }, + { + "epoch": 1.9499643874643875, + "grad_norm": 0.7225242853164673, + "learning_rate": 0.00010410010970004311, + "loss": 1.158, + "step": 10952 + }, + { + "epoch": 1.95014245014245, + "grad_norm": 0.6073256731033325, + "learning_rate": 0.00010408612395820714, + "loss": 0.9977, + "step": 10953 + }, + { + "epoch": 1.9503205128205128, + "grad_norm": 0.6373769044876099, + "learning_rate": 0.00010407213813631203, + "loss": 1.019, + "step": 10954 + }, + { + "epoch": 1.9504985754985755, + "grad_norm": 0.7451884746551514, + "learning_rate": 0.00010405815223463184, + "loss": 0.9497, + "step": 10955 + }, + { + "epoch": 1.9506766381766383, + "grad_norm": 0.7760418057441711, + "learning_rate": 0.00010404416625344058, + "loss": 1.0378, + "step": 10956 + }, + { + "epoch": 1.9508547008547008, + "grad_norm": 0.7057808041572571, + "learning_rate": 0.00010403018019301228, + "loss": 0.8953, + "step": 10957 + }, + { + "epoch": 1.9510327635327636, + "grad_norm": 0.6599584817886353, + "learning_rate": 0.00010401619405362095, + "loss": 0.8859, + "step": 10958 + }, + { + "epoch": 1.951210826210826, + "grad_norm": 0.6977253556251526, + "learning_rate": 0.00010400220783554069, + "loss": 0.9038, + "step": 10959 + }, + { + "epoch": 1.9513888888888888, + "grad_norm": 0.6930267810821533, + "learning_rate": 0.00010398822153904546, + "loss": 1.1547, + "step": 10960 + }, + { + "epoch": 1.9515669515669516, + "grad_norm": 0.6301694512367249, + "learning_rate": 0.00010397423516440931, + "loss": 0.8875, + "step": 10961 + }, + { + "epoch": 1.9517450142450143, + "grad_norm": 0.7447484135627747, + "learning_rate": 0.00010396024871190628, + 
"loss": 1.0454, + "step": 10962 + }, + { + "epoch": 1.9519230769230769, + "grad_norm": 0.8666765093803406, + "learning_rate": 0.00010394626218181041, + "loss": 1.2211, + "step": 10963 + }, + { + "epoch": 1.9521011396011396, + "grad_norm": 0.599354088306427, + "learning_rate": 0.00010393227557439573, + "loss": 1.0419, + "step": 10964 + }, + { + "epoch": 1.9522792022792022, + "grad_norm": 0.6991702914237976, + "learning_rate": 0.00010391828888993627, + "loss": 0.8217, + "step": 10965 + }, + { + "epoch": 1.952457264957265, + "grad_norm": 0.7467028498649597, + "learning_rate": 0.0001039043021287061, + "loss": 0.8708, + "step": 10966 + }, + { + "epoch": 1.9526353276353277, + "grad_norm": 0.6806215047836304, + "learning_rate": 0.0001038903152909792, + "loss": 1.218, + "step": 10967 + }, + { + "epoch": 1.9528133903133904, + "grad_norm": 0.6704212427139282, + "learning_rate": 0.00010387632837702968, + "loss": 0.8428, + "step": 10968 + }, + { + "epoch": 1.952991452991453, + "grad_norm": 0.6843154430389404, + "learning_rate": 0.00010386234138713155, + "loss": 0.9729, + "step": 10969 + }, + { + "epoch": 1.9531695156695157, + "grad_norm": 0.6619821190834045, + "learning_rate": 0.00010384835432155888, + "loss": 1.021, + "step": 10970 + }, + { + "epoch": 1.9533475783475782, + "grad_norm": 0.6249803900718689, + "learning_rate": 0.0001038343671805857, + "loss": 0.9321, + "step": 10971 + }, + { + "epoch": 1.953525641025641, + "grad_norm": 0.7361689805984497, + "learning_rate": 0.00010382037996448604, + "loss": 0.9451, + "step": 10972 + }, + { + "epoch": 1.9537037037037037, + "grad_norm": 0.6464847922325134, + "learning_rate": 0.00010380639267353398, + "loss": 1.0188, + "step": 10973 + }, + { + "epoch": 1.9538817663817665, + "grad_norm": 0.5975635647773743, + "learning_rate": 0.00010379240530800356, + "loss": 0.9025, + "step": 10974 + }, + { + "epoch": 1.9540598290598292, + "grad_norm": 0.6734475493431091, + "learning_rate": 0.00010377841786816884, + "loss": 1.0742, + "step": 10975 
+ }, + { + "epoch": 1.9542378917378918, + "grad_norm": 0.7318592667579651, + "learning_rate": 0.00010376443035430386, + "loss": 1.1082, + "step": 10976 + }, + { + "epoch": 1.9544159544159543, + "grad_norm": 0.7696142792701721, + "learning_rate": 0.00010375044276668271, + "loss": 0.8421, + "step": 10977 + }, + { + "epoch": 1.954594017094017, + "grad_norm": 0.68442302942276, + "learning_rate": 0.00010373645510557939, + "loss": 1.0794, + "step": 10978 + }, + { + "epoch": 1.9547720797720798, + "grad_norm": 0.7582547068595886, + "learning_rate": 0.00010372246737126801, + "loss": 1.0332, + "step": 10979 + }, + { + "epoch": 1.9549501424501425, + "grad_norm": 0.6529998183250427, + "learning_rate": 0.00010370847956402262, + "loss": 1.1833, + "step": 10980 + }, + { + "epoch": 1.9551282051282053, + "grad_norm": 0.7565605044364929, + "learning_rate": 0.00010369449168411729, + "loss": 1.0494, + "step": 10981 + }, + { + "epoch": 1.9553062678062678, + "grad_norm": 0.6346915364265442, + "learning_rate": 0.00010368050373182605, + "loss": 1.0052, + "step": 10982 + }, + { + "epoch": 1.9554843304843303, + "grad_norm": 0.7021830081939697, + "learning_rate": 0.00010366651570742298, + "loss": 0.9716, + "step": 10983 + }, + { + "epoch": 1.955662393162393, + "grad_norm": 0.6464530825614929, + "learning_rate": 0.00010365252761118218, + "loss": 0.9802, + "step": 10984 + }, + { + "epoch": 1.9558404558404558, + "grad_norm": 0.6845090985298157, + "learning_rate": 0.00010363853944337768, + "loss": 0.9529, + "step": 10985 + }, + { + "epoch": 1.9560185185185186, + "grad_norm": 0.7178115248680115, + "learning_rate": 0.00010362455120428356, + "loss": 0.9968, + "step": 10986 + }, + { + "epoch": 1.9561965811965814, + "grad_norm": 0.6131038069725037, + "learning_rate": 0.00010361056289417385, + "loss": 1.0559, + "step": 10987 + }, + { + "epoch": 1.9563746438746439, + "grad_norm": 0.6946909427642822, + "learning_rate": 0.0001035965745133227, + "loss": 1.0457, + "step": 10988 + }, + { + "epoch": 
1.9565527065527064, + "grad_norm": 0.7376706600189209, + "learning_rate": 0.00010358258606200413, + "loss": 0.7775, + "step": 10989 + }, + { + "epoch": 1.9567307692307692, + "grad_norm": 0.6864920854568481, + "learning_rate": 0.00010356859754049225, + "loss": 0.8798, + "step": 10990 + }, + { + "epoch": 1.956908831908832, + "grad_norm": 0.6301153302192688, + "learning_rate": 0.0001035546089490611, + "loss": 0.8757, + "step": 10991 + }, + { + "epoch": 1.9570868945868947, + "grad_norm": 0.7184807062149048, + "learning_rate": 0.00010354062028798474, + "loss": 1.0783, + "step": 10992 + }, + { + "epoch": 1.9572649572649574, + "grad_norm": 0.7138563394546509, + "learning_rate": 0.00010352663155753732, + "loss": 1.0328, + "step": 10993 + }, + { + "epoch": 1.95744301994302, + "grad_norm": 0.6565547585487366, + "learning_rate": 0.00010351264275799286, + "loss": 1.1312, + "step": 10994 + }, + { + "epoch": 1.9576210826210825, + "grad_norm": 0.7055862545967102, + "learning_rate": 0.00010349865388962547, + "loss": 1.0787, + "step": 10995 + }, + { + "epoch": 1.9577991452991452, + "grad_norm": 0.6184022426605225, + "learning_rate": 0.00010348466495270926, + "loss": 0.9635, + "step": 10996 + }, + { + "epoch": 1.957977207977208, + "grad_norm": 0.6563652753829956, + "learning_rate": 0.0001034706759475182, + "loss": 0.772, + "step": 10997 + }, + { + "epoch": 1.9581552706552707, + "grad_norm": 0.6103591322898865, + "learning_rate": 0.00010345668687432651, + "loss": 0.8113, + "step": 10998 + }, + { + "epoch": 1.9583333333333335, + "grad_norm": 0.6715512275695801, + "learning_rate": 0.0001034426977334082, + "loss": 1.1841, + "step": 10999 + }, + { + "epoch": 1.958511396011396, + "grad_norm": 0.680092453956604, + "learning_rate": 0.00010342870852503739, + "loss": 0.9992, + "step": 11000 + }, + { + "epoch": 1.9586894586894585, + "grad_norm": 0.828472375869751, + "learning_rate": 0.00010341471924948816, + "loss": 1.0975, + "step": 11001 + }, + { + "epoch": 1.9588675213675213, + "grad_norm": 
0.758441686630249, + "learning_rate": 0.00010340072990703463, + "loss": 1.0632, + "step": 11002 + }, + { + "epoch": 1.959045584045584, + "grad_norm": 0.6847560405731201, + "learning_rate": 0.00010338674049795079, + "loss": 1.0054, + "step": 11003 + }, + { + "epoch": 1.9592236467236468, + "grad_norm": 0.707626223564148, + "learning_rate": 0.00010337275102251085, + "loss": 0.9427, + "step": 11004 + }, + { + "epoch": 1.9594017094017095, + "grad_norm": 0.769036591053009, + "learning_rate": 0.00010335876148098887, + "loss": 1.0424, + "step": 11005 + }, + { + "epoch": 1.959579772079772, + "grad_norm": 0.822695791721344, + "learning_rate": 0.00010334477187365892, + "loss": 1.1573, + "step": 11006 + }, + { + "epoch": 1.9597578347578346, + "grad_norm": 0.6290286183357239, + "learning_rate": 0.00010333078220079513, + "loss": 0.936, + "step": 11007 + }, + { + "epoch": 1.9599358974358974, + "grad_norm": 0.6802252531051636, + "learning_rate": 0.00010331679246267155, + "loss": 0.8049, + "step": 11008 + }, + { + "epoch": 1.96011396011396, + "grad_norm": 0.6652607321739197, + "learning_rate": 0.00010330280265956232, + "loss": 0.926, + "step": 11009 + }, + { + "epoch": 1.9602920227920229, + "grad_norm": 0.7057216763496399, + "learning_rate": 0.00010328881279174154, + "loss": 0.9464, + "step": 11010 + }, + { + "epoch": 1.9604700854700856, + "grad_norm": 0.6951601505279541, + "learning_rate": 0.00010327482285948331, + "loss": 0.9882, + "step": 11011 + }, + { + "epoch": 1.9606481481481481, + "grad_norm": 0.6537632942199707, + "learning_rate": 0.00010326083286306174, + "loss": 0.8663, + "step": 11012 + }, + { + "epoch": 1.9608262108262107, + "grad_norm": 0.7252047657966614, + "learning_rate": 0.0001032468428027509, + "loss": 1.1377, + "step": 11013 + }, + { + "epoch": 1.9610042735042734, + "grad_norm": 0.6494104266166687, + "learning_rate": 0.00010323285267882492, + "loss": 0.8072, + "step": 11014 + }, + { + "epoch": 1.9611823361823362, + "grad_norm": 0.8463460206985474, + 
"learning_rate": 0.00010321886249155792, + "loss": 1.22, + "step": 11015 + }, + { + "epoch": 1.961360398860399, + "grad_norm": 0.6071396470069885, + "learning_rate": 0.00010320487224122401, + "loss": 0.7975, + "step": 11016 + }, + { + "epoch": 1.9615384615384617, + "grad_norm": 0.6546960473060608, + "learning_rate": 0.00010319088192809725, + "loss": 1.1729, + "step": 11017 + }, + { + "epoch": 1.9617165242165242, + "grad_norm": 0.7399442791938782, + "learning_rate": 0.00010317689155245178, + "loss": 1.092, + "step": 11018 + }, + { + "epoch": 1.9618945868945867, + "grad_norm": 0.7103837728500366, + "learning_rate": 0.00010316290111456175, + "loss": 0.8436, + "step": 11019 + }, + { + "epoch": 1.9620726495726495, + "grad_norm": 0.6990065574645996, + "learning_rate": 0.00010314891061470125, + "loss": 0.9003, + "step": 11020 + }, + { + "epoch": 1.9622507122507122, + "grad_norm": 0.7945666313171387, + "learning_rate": 0.00010313492005314438, + "loss": 0.8812, + "step": 11021 + }, + { + "epoch": 1.962428774928775, + "grad_norm": 0.6177538633346558, + "learning_rate": 0.00010312092943016527, + "loss": 1.0091, + "step": 11022 + }, + { + "epoch": 1.9626068376068377, + "grad_norm": 0.7260771989822388, + "learning_rate": 0.000103106938746038, + "loss": 0.9376, + "step": 11023 + }, + { + "epoch": 1.9627849002849003, + "grad_norm": 0.6726518273353577, + "learning_rate": 0.00010309294800103674, + "loss": 0.8048, + "step": 11024 + }, + { + "epoch": 1.9629629629629628, + "grad_norm": 0.8759992122650146, + "learning_rate": 0.00010307895719543562, + "loss": 1.0248, + "step": 11025 + }, + { + "epoch": 1.9631410256410255, + "grad_norm": 0.683437168598175, + "learning_rate": 0.00010306496632950868, + "loss": 1.0314, + "step": 11026 + }, + { + "epoch": 1.9633190883190883, + "grad_norm": 0.7255756258964539, + "learning_rate": 0.00010305097540353012, + "loss": 0.9828, + "step": 11027 + }, + { + "epoch": 1.963497150997151, + "grad_norm": 0.6904804706573486, + "learning_rate": 
0.000103036984417774, + "loss": 0.9054, + "step": 11028 + }, + { + "epoch": 1.9636752136752138, + "grad_norm": 0.6906846761703491, + "learning_rate": 0.00010302299337251451, + "loss": 1.0287, + "step": 11029 + }, + { + "epoch": 1.9638532763532763, + "grad_norm": 0.6677078008651733, + "learning_rate": 0.00010300900226802575, + "loss": 0.8742, + "step": 11030 + }, + { + "epoch": 1.964031339031339, + "grad_norm": 0.6144888997077942, + "learning_rate": 0.00010299501110458183, + "loss": 0.6942, + "step": 11031 + }, + { + "epoch": 1.9642094017094016, + "grad_norm": 0.753010094165802, + "learning_rate": 0.0001029810198824569, + "loss": 0.9018, + "step": 11032 + }, + { + "epoch": 1.9643874643874644, + "grad_norm": 0.6872276663780212, + "learning_rate": 0.00010296702860192505, + "loss": 1.1647, + "step": 11033 + }, + { + "epoch": 1.9645655270655271, + "grad_norm": 0.709000289440155, + "learning_rate": 0.00010295303726326047, + "loss": 0.9143, + "step": 11034 + }, + { + "epoch": 1.9647435897435899, + "grad_norm": 0.6507021188735962, + "learning_rate": 0.00010293904586673723, + "loss": 1.006, + "step": 11035 + }, + { + "epoch": 1.9649216524216524, + "grad_norm": 0.6789946556091309, + "learning_rate": 0.00010292505441262952, + "loss": 0.9049, + "step": 11036 + }, + { + "epoch": 1.9650997150997151, + "grad_norm": 0.7156081795692444, + "learning_rate": 0.00010291106290121143, + "loss": 0.9195, + "step": 11037 + }, + { + "epoch": 1.9652777777777777, + "grad_norm": 0.6770932078361511, + "learning_rate": 0.0001028970713327571, + "loss": 0.9524, + "step": 11038 + }, + { + "epoch": 1.9654558404558404, + "grad_norm": 0.7304288148880005, + "learning_rate": 0.00010288307970754067, + "loss": 0.9276, + "step": 11039 + }, + { + "epoch": 1.9656339031339032, + "grad_norm": 0.7603645324707031, + "learning_rate": 0.0001028690880258363, + "loss": 1.2157, + "step": 11040 + }, + { + "epoch": 1.965811965811966, + "grad_norm": 0.6875246167182922, + "learning_rate": 0.00010285509628791811, + "loss": 
1.0269, + "step": 11041 + }, + { + "epoch": 1.9659900284900285, + "grad_norm": 0.7234818935394287, + "learning_rate": 0.00010284110449406026, + "loss": 0.9695, + "step": 11042 + }, + { + "epoch": 1.9661680911680912, + "grad_norm": 0.7322804927825928, + "learning_rate": 0.00010282711264453684, + "loss": 0.9752, + "step": 11043 + }, + { + "epoch": 1.9663461538461537, + "grad_norm": 0.7524822950363159, + "learning_rate": 0.00010281312073962202, + "loss": 1.2144, + "step": 11044 + }, + { + "epoch": 1.9665242165242165, + "grad_norm": 0.6623101234436035, + "learning_rate": 0.00010279912877958995, + "loss": 1.1334, + "step": 11045 + }, + { + "epoch": 1.9667022792022792, + "grad_norm": 0.7814893126487732, + "learning_rate": 0.00010278513676471477, + "loss": 1.266, + "step": 11046 + }, + { + "epoch": 1.966880341880342, + "grad_norm": 0.7129884362220764, + "learning_rate": 0.00010277114469527063, + "loss": 1.0918, + "step": 11047 + }, + { + "epoch": 1.9670584045584045, + "grad_norm": 0.6996828317642212, + "learning_rate": 0.00010275715257153164, + "loss": 0.9269, + "step": 11048 + }, + { + "epoch": 1.9672364672364673, + "grad_norm": 0.6439059972763062, + "learning_rate": 0.00010274316039377198, + "loss": 1.1998, + "step": 11049 + }, + { + "epoch": 1.9674145299145298, + "grad_norm": 0.6837672591209412, + "learning_rate": 0.00010272916816226581, + "loss": 0.8899, + "step": 11050 + }, + { + "epoch": 1.9675925925925926, + "grad_norm": 0.702583909034729, + "learning_rate": 0.00010271517587728726, + "loss": 1.1862, + "step": 11051 + }, + { + "epoch": 1.9677706552706553, + "grad_norm": 0.6627798676490784, + "learning_rate": 0.00010270118353911047, + "loss": 0.898, + "step": 11052 + }, + { + "epoch": 1.967948717948718, + "grad_norm": 0.7628579139709473, + "learning_rate": 0.00010268719114800957, + "loss": 1.006, + "step": 11053 + }, + { + "epoch": 1.9681267806267806, + "grad_norm": 0.6425395607948303, + "learning_rate": 0.00010267319870425877, + "loss": 0.962, + "step": 11054 + }, + 
{ + "epoch": 1.9683048433048433, + "grad_norm": 0.7462666630744934, + "learning_rate": 0.00010265920620813219, + "loss": 1.0703, + "step": 11055 + }, + { + "epoch": 1.9684829059829059, + "grad_norm": 0.67641681432724, + "learning_rate": 0.00010264521365990401, + "loss": 1.1077, + "step": 11056 + }, + { + "epoch": 1.9686609686609686, + "grad_norm": 0.6716381311416626, + "learning_rate": 0.0001026312210598483, + "loss": 1.1048, + "step": 11057 + }, + { + "epoch": 1.9688390313390314, + "grad_norm": 0.7207448482513428, + "learning_rate": 0.00010261722840823935, + "loss": 0.9236, + "step": 11058 + }, + { + "epoch": 1.9690170940170941, + "grad_norm": 0.7208544015884399, + "learning_rate": 0.0001026032357053512, + "loss": 1.0814, + "step": 11059 + }, + { + "epoch": 1.9691951566951567, + "grad_norm": 0.6076363325119019, + "learning_rate": 0.00010258924295145807, + "loss": 0.9388, + "step": 11060 + }, + { + "epoch": 1.9693732193732194, + "grad_norm": 0.6460439562797546, + "learning_rate": 0.00010257525014683411, + "loss": 0.9506, + "step": 11061 + }, + { + "epoch": 1.969551282051282, + "grad_norm": 0.7449939250946045, + "learning_rate": 0.00010256125729175348, + "loss": 1.0209, + "step": 11062 + }, + { + "epoch": 1.9697293447293447, + "grad_norm": 0.640885055065155, + "learning_rate": 0.00010254726438649031, + "loss": 1.0235, + "step": 11063 + }, + { + "epoch": 1.9699074074074074, + "grad_norm": 0.6872261166572571, + "learning_rate": 0.00010253327143131879, + "loss": 0.9217, + "step": 11064 + }, + { + "epoch": 1.9700854700854702, + "grad_norm": 0.6213285326957703, + "learning_rate": 0.0001025192784265131, + "loss": 0.8204, + "step": 11065 + }, + { + "epoch": 1.9702635327635327, + "grad_norm": 0.6594449281692505, + "learning_rate": 0.00010250528537234736, + "loss": 0.9789, + "step": 11066 + }, + { + "epoch": 1.9704415954415955, + "grad_norm": 0.7098729610443115, + "learning_rate": 0.00010249129226909577, + "loss": 1.2551, + "step": 11067 + }, + { + "epoch": 
1.970619658119658, + "grad_norm": 0.7455953359603882, + "learning_rate": 0.0001024772991170325, + "loss": 1.0281, + "step": 11068 + }, + { + "epoch": 1.9707977207977208, + "grad_norm": 0.6657416224479675, + "learning_rate": 0.00010246330591643166, + "loss": 0.9421, + "step": 11069 + }, + { + "epoch": 1.9709757834757835, + "grad_norm": 0.6480659246444702, + "learning_rate": 0.00010244931266756748, + "loss": 0.9424, + "step": 11070 + }, + { + "epoch": 1.9711538461538463, + "grad_norm": 0.6440510749816895, + "learning_rate": 0.00010243531937071411, + "loss": 0.9651, + "step": 11071 + }, + { + "epoch": 1.9713319088319088, + "grad_norm": 0.6329794526100159, + "learning_rate": 0.00010242132602614571, + "loss": 0.9233, + "step": 11072 + }, + { + "epoch": 1.9715099715099715, + "grad_norm": 0.6694819927215576, + "learning_rate": 0.00010240733263413646, + "loss": 0.884, + "step": 11073 + }, + { + "epoch": 1.971688034188034, + "grad_norm": 0.7702556848526001, + "learning_rate": 0.0001023933391949605, + "loss": 1.216, + "step": 11074 + }, + { + "epoch": 1.9718660968660968, + "grad_norm": 0.6587536931037903, + "learning_rate": 0.00010237934570889207, + "loss": 0.9324, + "step": 11075 + }, + { + "epoch": 1.9720441595441596, + "grad_norm": 0.7919837832450867, + "learning_rate": 0.00010236535217620529, + "loss": 1.0011, + "step": 11076 + }, + { + "epoch": 1.9722222222222223, + "grad_norm": 0.6604606509208679, + "learning_rate": 0.00010235135859717433, + "loss": 0.929, + "step": 11077 + }, + { + "epoch": 1.9724002849002849, + "grad_norm": 0.7158446907997131, + "learning_rate": 0.0001023373649720734, + "loss": 0.8912, + "step": 11078 + }, + { + "epoch": 1.9725783475783476, + "grad_norm": 0.7450904846191406, + "learning_rate": 0.00010232337130117666, + "loss": 1.0782, + "step": 11079 + }, + { + "epoch": 1.9727564102564101, + "grad_norm": 0.6687077283859253, + "learning_rate": 0.00010230937758475827, + "loss": 1.0662, + "step": 11080 + }, + { + "epoch": 1.9729344729344729, + 
"grad_norm": 0.7188364267349243, + "learning_rate": 0.00010229538382309245, + "loss": 1.024, + "step": 11081 + }, + { + "epoch": 1.9731125356125356, + "grad_norm": 0.6787814497947693, + "learning_rate": 0.00010228139001645334, + "loss": 0.9559, + "step": 11082 + }, + { + "epoch": 1.9732905982905984, + "grad_norm": 0.6834072470664978, + "learning_rate": 0.00010226739616511513, + "loss": 0.8143, + "step": 11083 + }, + { + "epoch": 1.973468660968661, + "grad_norm": 0.6651090979576111, + "learning_rate": 0.00010225340226935201, + "loss": 1.05, + "step": 11084 + }, + { + "epoch": 1.9736467236467237, + "grad_norm": 0.7125018835067749, + "learning_rate": 0.00010223940832943813, + "loss": 1.0275, + "step": 11085 + }, + { + "epoch": 1.9738247863247862, + "grad_norm": 0.6886870861053467, + "learning_rate": 0.00010222541434564772, + "loss": 1.0972, + "step": 11086 + }, + { + "epoch": 1.974002849002849, + "grad_norm": 0.7068913578987122, + "learning_rate": 0.00010221142031825492, + "loss": 0.9248, + "step": 11087 + }, + { + "epoch": 1.9741809116809117, + "grad_norm": 0.7752319574356079, + "learning_rate": 0.00010219742624753397, + "loss": 0.9754, + "step": 11088 + }, + { + "epoch": 1.9743589743589745, + "grad_norm": 0.7915459871292114, + "learning_rate": 0.00010218343213375896, + "loss": 1.2589, + "step": 11089 + }, + { + "epoch": 1.9745370370370372, + "grad_norm": 0.6597068309783936, + "learning_rate": 0.00010216943797720418, + "loss": 1.0004, + "step": 11090 + }, + { + "epoch": 1.9747150997150997, + "grad_norm": 0.7060620188713074, + "learning_rate": 0.00010215544377814375, + "loss": 0.9968, + "step": 11091 + }, + { + "epoch": 1.9748931623931623, + "grad_norm": 0.6815677881240845, + "learning_rate": 0.0001021414495368519, + "loss": 0.8889, + "step": 11092 + }, + { + "epoch": 1.975071225071225, + "grad_norm": 0.6872935891151428, + "learning_rate": 0.00010212745525360277, + "loss": 1.1582, + "step": 11093 + }, + { + "epoch": 1.9752492877492878, + "grad_norm": 
0.6781140565872192, + "learning_rate": 0.00010211346092867056, + "loss": 0.9988, + "step": 11094 + }, + { + "epoch": 1.9754273504273505, + "grad_norm": 0.6959224343299866, + "learning_rate": 0.00010209946656232949, + "loss": 1.1097, + "step": 11095 + }, + { + "epoch": 1.9756054131054133, + "grad_norm": 0.7205058336257935, + "learning_rate": 0.00010208547215485376, + "loss": 0.9951, + "step": 11096 + }, + { + "epoch": 1.9757834757834758, + "grad_norm": 0.6968751549720764, + "learning_rate": 0.00010207147770651748, + "loss": 0.9313, + "step": 11097 + }, + { + "epoch": 1.9759615384615383, + "grad_norm": 0.6688823103904724, + "learning_rate": 0.00010205748321759494, + "loss": 0.9439, + "step": 11098 + }, + { + "epoch": 1.976139601139601, + "grad_norm": 0.6169568300247192, + "learning_rate": 0.00010204348868836028, + "loss": 1.123, + "step": 11099 + }, + { + "epoch": 1.9763176638176638, + "grad_norm": 0.6995537281036377, + "learning_rate": 0.00010202949411908768, + "loss": 1.1928, + "step": 11100 + }, + { + "epoch": 1.9764957264957266, + "grad_norm": 0.7102637887001038, + "learning_rate": 0.00010201549951005138, + "loss": 1.0265, + "step": 11101 + }, + { + "epoch": 1.9766737891737893, + "grad_norm": 0.6820045113563538, + "learning_rate": 0.00010200150486152558, + "loss": 0.9309, + "step": 11102 + }, + { + "epoch": 1.9768518518518519, + "grad_norm": 0.7050938010215759, + "learning_rate": 0.00010198751017378443, + "loss": 1.0047, + "step": 11103 + }, + { + "epoch": 1.9770299145299144, + "grad_norm": 0.6418201923370361, + "learning_rate": 0.00010197351544710214, + "loss": 1.1172, + "step": 11104 + }, + { + "epoch": 1.9772079772079771, + "grad_norm": 0.6681215763092041, + "learning_rate": 0.0001019595206817529, + "loss": 1.0621, + "step": 11105 + }, + { + "epoch": 1.97738603988604, + "grad_norm": 0.7725709676742554, + "learning_rate": 0.00010194552587801094, + "loss": 1.0044, + "step": 11106 + }, + { + "epoch": 1.9775641025641026, + "grad_norm": 0.6870455741882324, + 
"learning_rate": 0.00010193153103615045, + "loss": 1.2652, + "step": 11107 + }, + { + "epoch": 1.9777421652421654, + "grad_norm": 0.6352108120918274, + "learning_rate": 0.00010191753615644561, + "loss": 1.1081, + "step": 11108 + }, + { + "epoch": 1.977920227920228, + "grad_norm": 0.7322626113891602, + "learning_rate": 0.00010190354123917066, + "loss": 1.0003, + "step": 11109 + }, + { + "epoch": 1.9780982905982905, + "grad_norm": 0.6240935921669006, + "learning_rate": 0.00010188954628459972, + "loss": 0.8925, + "step": 11110 + }, + { + "epoch": 1.9782763532763532, + "grad_norm": 0.6648945212364197, + "learning_rate": 0.00010187555129300708, + "loss": 1.0882, + "step": 11111 + }, + { + "epoch": 1.978454415954416, + "grad_norm": 0.6704208850860596, + "learning_rate": 0.00010186155626466692, + "loss": 0.8873, + "step": 11112 + }, + { + "epoch": 1.9786324786324787, + "grad_norm": 0.6716459393501282, + "learning_rate": 0.00010184756119985341, + "loss": 1.0045, + "step": 11113 + }, + { + "epoch": 1.9788105413105415, + "grad_norm": 0.81277996301651, + "learning_rate": 0.0001018335660988408, + "loss": 0.8867, + "step": 11114 + }, + { + "epoch": 1.978988603988604, + "grad_norm": 0.7008311748504639, + "learning_rate": 0.00010181957096190323, + "loss": 0.9391, + "step": 11115 + }, + { + "epoch": 1.9791666666666665, + "grad_norm": 0.727676272392273, + "learning_rate": 0.00010180557578931498, + "loss": 1.0157, + "step": 11116 + }, + { + "epoch": 1.9793447293447293, + "grad_norm": 0.7058015465736389, + "learning_rate": 0.00010179158058135018, + "loss": 1.0, + "step": 11117 + }, + { + "epoch": 1.979522792022792, + "grad_norm": 0.7770412564277649, + "learning_rate": 0.00010177758533828312, + "loss": 1.0428, + "step": 11118 + }, + { + "epoch": 1.9797008547008548, + "grad_norm": 0.6557414531707764, + "learning_rate": 0.00010176359006038798, + "loss": 0.8557, + "step": 11119 + }, + { + "epoch": 1.9798789173789175, + "grad_norm": 0.7681090235710144, + "learning_rate": 
0.00010174959474793894, + "loss": 0.867, + "step": 11120 + }, + { + "epoch": 1.98005698005698, + "grad_norm": 0.7915860414505005, + "learning_rate": 0.0001017355994012102, + "loss": 0.9961, + "step": 11121 + }, + { + "epoch": 1.9802350427350426, + "grad_norm": 0.8039166927337646, + "learning_rate": 0.00010172160402047604, + "loss": 1.1378, + "step": 11122 + }, + { + "epoch": 1.9804131054131053, + "grad_norm": 0.6641189455986023, + "learning_rate": 0.0001017076086060106, + "loss": 0.8914, + "step": 11123 + }, + { + "epoch": 1.980591168091168, + "grad_norm": 0.7673811316490173, + "learning_rate": 0.00010169361315808812, + "loss": 1.018, + "step": 11124 + }, + { + "epoch": 1.9807692307692308, + "grad_norm": 0.7320558428764343, + "learning_rate": 0.00010167961767698279, + "loss": 1.0515, + "step": 11125 + }, + { + "epoch": 1.9809472934472936, + "grad_norm": 0.5717357993125916, + "learning_rate": 0.00010166562216296886, + "loss": 0.7619, + "step": 11126 + }, + { + "epoch": 1.9811253561253561, + "grad_norm": 0.6638465523719788, + "learning_rate": 0.00010165162661632052, + "loss": 1.0161, + "step": 11127 + }, + { + "epoch": 1.9813034188034186, + "grad_norm": 0.7293243408203125, + "learning_rate": 0.00010163763103731201, + "loss": 1.063, + "step": 11128 + }, + { + "epoch": 1.9814814814814814, + "grad_norm": 0.634694516658783, + "learning_rate": 0.00010162363542621752, + "loss": 0.8945, + "step": 11129 + }, + { + "epoch": 1.9816595441595442, + "grad_norm": 0.7086902856826782, + "learning_rate": 0.00010160963978331122, + "loss": 1.0542, + "step": 11130 + }, + { + "epoch": 1.981837606837607, + "grad_norm": 0.5939825773239136, + "learning_rate": 0.00010159564410886742, + "loss": 0.7822, + "step": 11131 + }, + { + "epoch": 1.9820156695156697, + "grad_norm": 0.722183346748352, + "learning_rate": 0.00010158164840316027, + "loss": 1.0252, + "step": 11132 + }, + { + "epoch": 1.9821937321937322, + "grad_norm": 0.7300103306770325, + "learning_rate": 0.000101567652666464, + "loss": 
0.9099, + "step": 11133 + }, + { + "epoch": 1.9823717948717947, + "grad_norm": 0.7148736119270325, + "learning_rate": 0.00010155365689905285, + "loss": 1.0149, + "step": 11134 + }, + { + "epoch": 1.9825498575498575, + "grad_norm": 0.8214462995529175, + "learning_rate": 0.000101539661101201, + "loss": 1.0127, + "step": 11135 + }, + { + "epoch": 1.9827279202279202, + "grad_norm": 0.7111126780509949, + "learning_rate": 0.00010152566527318265, + "loss": 1.045, + "step": 11136 + }, + { + "epoch": 1.982905982905983, + "grad_norm": 0.6640021800994873, + "learning_rate": 0.00010151166941527213, + "loss": 0.9618, + "step": 11137 + }, + { + "epoch": 1.9830840455840457, + "grad_norm": 0.7177722454071045, + "learning_rate": 0.00010149767352774358, + "loss": 1.0373, + "step": 11138 + }, + { + "epoch": 1.9832621082621082, + "grad_norm": 0.6728883981704712, + "learning_rate": 0.00010148367761087121, + "loss": 0.9886, + "step": 11139 + }, + { + "epoch": 1.9834401709401708, + "grad_norm": 0.7060428857803345, + "learning_rate": 0.00010146968166492926, + "loss": 1.042, + "step": 11140 + }, + { + "epoch": 1.9836182336182335, + "grad_norm": 0.706253707408905, + "learning_rate": 0.00010145568569019192, + "loss": 1.2249, + "step": 11141 + }, + { + "epoch": 1.9837962962962963, + "grad_norm": 0.618221640586853, + "learning_rate": 0.00010144168968693348, + "loss": 0.9223, + "step": 11142 + }, + { + "epoch": 1.983974358974359, + "grad_norm": 0.7005748748779297, + "learning_rate": 0.00010142769365542814, + "loss": 1.2735, + "step": 11143 + }, + { + "epoch": 1.9841524216524218, + "grad_norm": 0.6059799194335938, + "learning_rate": 0.0001014136975959501, + "loss": 0.7216, + "step": 11144 + }, + { + "epoch": 1.9843304843304843, + "grad_norm": 0.7169116735458374, + "learning_rate": 0.00010139970150877358, + "loss": 0.9541, + "step": 11145 + }, + { + "epoch": 1.984508547008547, + "grad_norm": 0.7402058839797974, + "learning_rate": 0.00010138570539417281, + "loss": 1.1268, + "step": 11146 + }, + { 
+ "epoch": 1.9846866096866096, + "grad_norm": 0.7204117178916931, + "learning_rate": 0.00010137170925242201, + "loss": 1.1557, + "step": 11147 + }, + { + "epoch": 1.9848646723646723, + "grad_norm": 0.589163064956665, + "learning_rate": 0.00010135771308379545, + "loss": 0.9863, + "step": 11148 + }, + { + "epoch": 1.985042735042735, + "grad_norm": 0.6342785358428955, + "learning_rate": 0.00010134371688856732, + "loss": 0.9294, + "step": 11149 + }, + { + "epoch": 1.9852207977207978, + "grad_norm": 0.7144256234169006, + "learning_rate": 0.00010132972066701183, + "loss": 0.9428, + "step": 11150 + }, + { + "epoch": 1.9853988603988604, + "grad_norm": 0.658032238483429, + "learning_rate": 0.00010131572441940322, + "loss": 0.9749, + "step": 11151 + }, + { + "epoch": 1.9855769230769231, + "grad_norm": 0.7609163522720337, + "learning_rate": 0.00010130172814601576, + "loss": 1.1771, + "step": 11152 + }, + { + "epoch": 1.9857549857549857, + "grad_norm": 0.6531760692596436, + "learning_rate": 0.00010128773184712361, + "loss": 0.8529, + "step": 11153 + }, + { + "epoch": 1.9859330484330484, + "grad_norm": 0.6983599066734314, + "learning_rate": 0.00010127373552300103, + "loss": 1.0307, + "step": 11154 + }, + { + "epoch": 1.9861111111111112, + "grad_norm": 0.7121559381484985, + "learning_rate": 0.00010125973917392224, + "loss": 0.9426, + "step": 11155 + }, + { + "epoch": 1.986289173789174, + "grad_norm": 0.6282170414924622, + "learning_rate": 0.0001012457428001615, + "loss": 0.8983, + "step": 11156 + }, + { + "epoch": 1.9864672364672364, + "grad_norm": 0.6960387825965881, + "learning_rate": 0.000101231746401993, + "loss": 0.9001, + "step": 11157 + }, + { + "epoch": 1.9866452991452992, + "grad_norm": 0.7523152232170105, + "learning_rate": 0.000101217749979691, + "loss": 1.3462, + "step": 11158 + }, + { + "epoch": 1.9868233618233617, + "grad_norm": 0.71713787317276, + "learning_rate": 0.00010120375353352971, + "loss": 1.0147, + "step": 11159 + }, + { + "epoch": 1.9870014245014245, + 
"grad_norm": 0.7304390072822571, + "learning_rate": 0.00010118975706378339, + "loss": 0.8436, + "step": 11160 + }, + { + "epoch": 1.9871794871794872, + "grad_norm": 0.789968729019165, + "learning_rate": 0.00010117576057072622, + "loss": 1.1162, + "step": 11161 + }, + { + "epoch": 1.98735754985755, + "grad_norm": 0.6752170920372009, + "learning_rate": 0.00010116176405463249, + "loss": 1.0619, + "step": 11162 + }, + { + "epoch": 1.9875356125356125, + "grad_norm": 0.681398868560791, + "learning_rate": 0.0001011477675157764, + "loss": 0.8981, + "step": 11163 + }, + { + "epoch": 1.9877136752136753, + "grad_norm": 0.61469566822052, + "learning_rate": 0.0001011337709544322, + "loss": 1.0139, + "step": 11164 + }, + { + "epoch": 1.9878917378917378, + "grad_norm": 0.7524265050888062, + "learning_rate": 0.0001011197743708741, + "loss": 1.1571, + "step": 11165 + }, + { + "epoch": 1.9880698005698005, + "grad_norm": 0.6289594173431396, + "learning_rate": 0.00010110577776537633, + "loss": 0.93, + "step": 11166 + }, + { + "epoch": 1.9882478632478633, + "grad_norm": 0.6991903781890869, + "learning_rate": 0.00010109178113821318, + "loss": 1.1176, + "step": 11167 + }, + { + "epoch": 1.988425925925926, + "grad_norm": 0.7604053020477295, + "learning_rate": 0.00010107778448965883, + "loss": 1.0497, + "step": 11168 + }, + { + "epoch": 1.9886039886039886, + "grad_norm": 0.7166453003883362, + "learning_rate": 0.00010106378781998753, + "loss": 1.1237, + "step": 11169 + }, + { + "epoch": 1.9887820512820513, + "grad_norm": 0.6071686744689941, + "learning_rate": 0.00010104979112947352, + "loss": 0.8934, + "step": 11170 + }, + { + "epoch": 1.9889601139601139, + "grad_norm": 0.6618169546127319, + "learning_rate": 0.00010103579441839101, + "loss": 1.0596, + "step": 11171 + }, + { + "epoch": 1.9891381766381766, + "grad_norm": 0.6838458776473999, + "learning_rate": 0.0001010217976870143, + "loss": 1.0167, + "step": 11172 + }, + { + "epoch": 1.9893162393162394, + "grad_norm": 0.6369979381561279, + 
"learning_rate": 0.00010100780093561757, + "loss": 0.9001, + "step": 11173 + }, + { + "epoch": 1.989494301994302, + "grad_norm": 0.661313533782959, + "learning_rate": 0.00010099380416447508, + "loss": 0.8952, + "step": 11174 + }, + { + "epoch": 1.9896723646723646, + "grad_norm": 0.6991600394248962, + "learning_rate": 0.00010097980737386106, + "loss": 1.0083, + "step": 11175 + }, + { + "epoch": 1.9898504273504274, + "grad_norm": 0.618748664855957, + "learning_rate": 0.00010096581056404972, + "loss": 0.8797, + "step": 11176 + }, + { + "epoch": 1.99002849002849, + "grad_norm": 0.7039223909378052, + "learning_rate": 0.00010095181373531535, + "loss": 1.0385, + "step": 11177 + }, + { + "epoch": 1.9902065527065527, + "grad_norm": 0.7598999738693237, + "learning_rate": 0.00010093781688793216, + "loss": 0.9205, + "step": 11178 + }, + { + "epoch": 1.9903846153846154, + "grad_norm": 0.6355955600738525, + "learning_rate": 0.00010092382002217441, + "loss": 0.8646, + "step": 11179 + }, + { + "epoch": 1.9905626780626782, + "grad_norm": 0.8024569153785706, + "learning_rate": 0.00010090982313831634, + "loss": 1.1678, + "step": 11180 + }, + { + "epoch": 1.9907407407407407, + "grad_norm": 0.5960529446601868, + "learning_rate": 0.00010089582623663216, + "loss": 0.8277, + "step": 11181 + }, + { + "epoch": 1.9909188034188035, + "grad_norm": 0.6323728561401367, + "learning_rate": 0.00010088182931739609, + "loss": 0.948, + "step": 11182 + }, + { + "epoch": 1.991096866096866, + "grad_norm": 0.7532381415367126, + "learning_rate": 0.00010086783238088244, + "loss": 1.2948, + "step": 11183 + }, + { + "epoch": 1.9912749287749287, + "grad_norm": 0.5740166306495667, + "learning_rate": 0.00010085383542736543, + "loss": 0.7019, + "step": 11184 + }, + { + "epoch": 1.9914529914529915, + "grad_norm": 0.616985559463501, + "learning_rate": 0.00010083983845711929, + "loss": 1.0802, + "step": 11185 + }, + { + "epoch": 1.9916310541310542, + "grad_norm": 0.7505929470062256, + "learning_rate": 
0.00010082584147041824, + "loss": 1.0523, + "step": 11186 + }, + { + "epoch": 1.9918091168091168, + "grad_norm": 0.7147656679153442, + "learning_rate": 0.00010081184446753653, + "loss": 1.0019, + "step": 11187 + }, + { + "epoch": 1.9919871794871795, + "grad_norm": 0.7301992774009705, + "learning_rate": 0.00010079784744874845, + "loss": 1.0329, + "step": 11188 + }, + { + "epoch": 1.992165242165242, + "grad_norm": 0.6847206354141235, + "learning_rate": 0.00010078385041432819, + "loss": 1.0367, + "step": 11189 + }, + { + "epoch": 1.9923433048433048, + "grad_norm": 0.7310990691184998, + "learning_rate": 0.00010076985336455, + "loss": 1.1675, + "step": 11190 + }, + { + "epoch": 1.9925213675213675, + "grad_norm": 0.6916858553886414, + "learning_rate": 0.00010075585629968813, + "loss": 0.8615, + "step": 11191 + }, + { + "epoch": 1.9926994301994303, + "grad_norm": 0.6519390344619751, + "learning_rate": 0.00010074185922001685, + "loss": 0.8105, + "step": 11192 + }, + { + "epoch": 1.9928774928774928, + "grad_norm": 0.7437400817871094, + "learning_rate": 0.00010072786212581036, + "loss": 0.9993, + "step": 11193 + }, + { + "epoch": 1.9930555555555556, + "grad_norm": 0.5048928260803223, + "learning_rate": 0.00010071386501734292, + "loss": 0.7912, + "step": 11194 + }, + { + "epoch": 1.993233618233618, + "grad_norm": 0.8042343258857727, + "learning_rate": 0.00010069986789488882, + "loss": 0.9156, + "step": 11195 + }, + { + "epoch": 1.9934116809116809, + "grad_norm": 0.7188669443130493, + "learning_rate": 0.0001006858707587222, + "loss": 1.0474, + "step": 11196 + }, + { + "epoch": 1.9935897435897436, + "grad_norm": 0.7377660870552063, + "learning_rate": 0.00010067187360911738, + "loss": 0.7013, + "step": 11197 + }, + { + "epoch": 1.9937678062678064, + "grad_norm": 0.6684696078300476, + "learning_rate": 0.00010065787644634861, + "loss": 0.9199, + "step": 11198 + }, + { + "epoch": 1.993945868945869, + "grad_norm": 0.7341524958610535, + "learning_rate": 0.00010064387927069012, + 
"loss": 1.0925, + "step": 11199 + }, + { + "epoch": 1.9941239316239316, + "grad_norm": 0.685745120048523, + "learning_rate": 0.00010062988208241614, + "loss": 1.083, + "step": 11200 + }, + { + "epoch": 1.9943019943019942, + "grad_norm": 0.6923556327819824, + "learning_rate": 0.00010061588488180096, + "loss": 1.2728, + "step": 11201 + }, + { + "epoch": 1.994480056980057, + "grad_norm": 0.6663293242454529, + "learning_rate": 0.00010060188766911876, + "loss": 1.0937, + "step": 11202 + }, + { + "epoch": 1.9946581196581197, + "grad_norm": 0.7963639497756958, + "learning_rate": 0.00010058789044464383, + "loss": 1.0592, + "step": 11203 + }, + { + "epoch": 1.9948361823361824, + "grad_norm": 0.6362990140914917, + "learning_rate": 0.00010057389320865042, + "loss": 0.8872, + "step": 11204 + }, + { + "epoch": 1.9950142450142452, + "grad_norm": 0.7752974033355713, + "learning_rate": 0.00010055989596141278, + "loss": 1.043, + "step": 11205 + }, + { + "epoch": 1.9951923076923077, + "grad_norm": 0.7125133275985718, + "learning_rate": 0.00010054589870320512, + "loss": 1.0015, + "step": 11206 + }, + { + "epoch": 1.9953703703703702, + "grad_norm": 0.7102736830711365, + "learning_rate": 0.00010053190143430169, + "loss": 1.0052, + "step": 11207 + }, + { + "epoch": 1.995548433048433, + "grad_norm": 0.8628628849983215, + "learning_rate": 0.00010051790415497677, + "loss": 1.2351, + "step": 11208 + }, + { + "epoch": 1.9957264957264957, + "grad_norm": 0.7233129739761353, + "learning_rate": 0.00010050390686550462, + "loss": 1.0848, + "step": 11209 + }, + { + "epoch": 1.9959045584045585, + "grad_norm": 0.5936228036880493, + "learning_rate": 0.00010048990956615944, + "loss": 0.7998, + "step": 11210 + }, + { + "epoch": 1.9960826210826212, + "grad_norm": 0.7345388531684875, + "learning_rate": 0.0001004759122572155, + "loss": 1.0329, + "step": 11211 + }, + { + "epoch": 1.9962606837606838, + "grad_norm": 0.7344130873680115, + "learning_rate": 0.00010046191493894703, + "loss": 1.1563, + "step": 
11212 + }, + { + "epoch": 1.9964387464387463, + "grad_norm": 0.6979942321777344, + "learning_rate": 0.00010044791761162833, + "loss": 0.9269, + "step": 11213 + }, + { + "epoch": 1.996616809116809, + "grad_norm": 0.67514967918396, + "learning_rate": 0.0001004339202755336, + "loss": 0.9028, + "step": 11214 + }, + { + "epoch": 1.9967948717948718, + "grad_norm": 0.6379111409187317, + "learning_rate": 0.00010041992293093712, + "loss": 0.7816, + "step": 11215 + }, + { + "epoch": 1.9969729344729346, + "grad_norm": 0.693976104259491, + "learning_rate": 0.00010040592557811308, + "loss": 0.8411, + "step": 11216 + }, + { + "epoch": 1.9971509971509973, + "grad_norm": 0.5952646732330322, + "learning_rate": 0.0001003919282173358, + "loss": 0.8681, + "step": 11217 + }, + { + "epoch": 1.9973290598290598, + "grad_norm": 0.7452160716056824, + "learning_rate": 0.00010037793084887948, + "loss": 1.0198, + "step": 11218 + }, + { + "epoch": 1.9975071225071224, + "grad_norm": 0.6683938503265381, + "learning_rate": 0.00010036393347301841, + "loss": 0.8162, + "step": 11219 + }, + { + "epoch": 1.9976851851851851, + "grad_norm": 0.6849120855331421, + "learning_rate": 0.00010034993609002683, + "loss": 1.0668, + "step": 11220 + }, + { + "epoch": 1.9978632478632479, + "grad_norm": 0.8782517910003662, + "learning_rate": 0.00010033593870017897, + "loss": 1.222, + "step": 11221 + }, + { + "epoch": 1.9980413105413106, + "grad_norm": 0.6482772827148438, + "learning_rate": 0.00010032194130374908, + "loss": 0.7722, + "step": 11222 + }, + { + "epoch": 1.9982193732193734, + "grad_norm": 0.8595399260520935, + "learning_rate": 0.00010030794390101142, + "loss": 1.3004, + "step": 11223 + }, + { + "epoch": 1.998397435897436, + "grad_norm": 0.7258931994438171, + "learning_rate": 0.00010029394649224024, + "loss": 0.8825, + "step": 11224 + }, + { + "epoch": 1.9985754985754984, + "grad_norm": 0.6291348934173584, + "learning_rate": 0.00010027994907770981, + "loss": 0.8681, + "step": 11225 + }, + { + "epoch": 
1.9987535612535612, + "grad_norm": 0.7528844475746155, + "learning_rate": 0.00010026595165769434, + "loss": 1.1443, + "step": 11226 + }, + { + "epoch": 1.998931623931624, + "grad_norm": 0.654017984867096, + "learning_rate": 0.0001002519542324681, + "loss": 0.8585, + "step": 11227 + }, + { + "epoch": 1.9991096866096867, + "grad_norm": 0.6812533736228943, + "learning_rate": 0.00010023795680230532, + "loss": 0.8757, + "step": 11228 + }, + { + "epoch": 1.9992877492877494, + "grad_norm": 0.7120179533958435, + "learning_rate": 0.0001002239593674803, + "loss": 1.0159, + "step": 11229 + }, + { + "epoch": 1.999465811965812, + "grad_norm": 0.6943802237510681, + "learning_rate": 0.00010020996192826725, + "loss": 1.0193, + "step": 11230 + }, + { + "epoch": 1.9996438746438745, + "grad_norm": 0.7227906584739685, + "learning_rate": 0.00010019596448494047, + "loss": 1.1536, + "step": 11231 + }, + { + "epoch": 1.9998219373219372, + "grad_norm": 0.6233312487602234, + "learning_rate": 0.00010018196703777411, + "loss": 0.9117, + "step": 11232 + }, + { + "epoch": 1.9998219373219372, + "eval_loss": 1.0963108539581299, + "eval_runtime": 24.4478, + "eval_samples_per_second": 42.58, + "eval_steps_per_second": 21.311, + "step": 11232 + } + ], + "logging_steps": 1, + "max_steps": 22464, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 5616, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.3382624499400704e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-11232/training_args.bin b/checkpoint-11232/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..1245f6a2afbe9a6eefbb6d141231d555e0b0bf84 --- /dev/null +++ b/checkpoint-11232/training_args.bin @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:86de370014ed2be86ea27c820b434ceec5e097da2b5f9b08d0eac9aa564d8961 +size 6200 diff --git a/checkpoint-16848/README.md b/checkpoint-16848/README.md new file mode 100644 index 0000000000000000000000000000000000000000..719b4726992f7d0707a4253e9123dec35e4de390 --- /dev/null +++ b/checkpoint-16848/README.md @@ -0,0 +1,202 @@ +--- +base_model: openlm-research/open_llama_3b_v2 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/checkpoint-16848/adapter_config.json b/checkpoint-16848/adapter_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..6b6f20a570fc808390da3f2e001093ac1e56c1da --- /dev/null +++ b/checkpoint-16848/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "openlm-research/open_llama_3b_v2", + "bias": "none", + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "down_proj", + "o_proj", + "q_proj", + "up_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-16848/adapter_model.safetensors b/checkpoint-16848/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3e168b3124d125c747860a13472082cb4b3c7ff5 --- /dev/null +++ b/checkpoint-16848/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03085ad0ad1bd99c21ebcb38449bb38ae5c59f4fd968a0b6bb26d8cd80e9e417 +size 50899792 diff --git a/checkpoint-16848/optimizer.pt b/checkpoint-16848/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..3a2c6b78d5691087b2f44d23513f47d44f07c4a6 --- /dev/null +++ b/checkpoint-16848/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71fb7e3d382d3193410d495df1ec8962896c45111c77639022c41a01d17ffee4 +size 26231684 diff --git a/checkpoint-16848/rng_state.pth b/checkpoint-16848/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..9402f707925fbc8129d9caeef7089abcb232912c --- /dev/null +++ b/checkpoint-16848/rng_state.pth @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:52cca5856c568bc52c683b690919168fa27bfbdfefc6e0a62355afa6011157c3 +size 14244 diff --git a/checkpoint-16848/scheduler.pt b/checkpoint-16848/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..911075f926d55fdd3ae06b342e6974c78d70efb0 --- /dev/null +++ b/checkpoint-16848/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3acc4594645e05c91ea7d2972486dd125352a9c7ef49354924ff3431645d5b82 +size 1064 diff --git a/checkpoint-16848/special_tokens_map.json b/checkpoint-16848/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..72ecfeeb7e14d244c936169d2ed139eeae235ef1 --- /dev/null +++ b/checkpoint-16848/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-16848/tokenizer.model b/checkpoint-16848/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..98866ff8ae3631f331c57923c921a0c9ad22b97d --- /dev/null +++ b/checkpoint-16848/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91b289e85fa20fd375d8b33dc12f77616f18abc6359804471d1fafcb425fecb8 +size 511574 diff --git a/checkpoint-16848/tokenizer_config.json b/checkpoint-16848/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c218d1b7228e3ad6055bdcf0ec15c4f188dc7d79 --- /dev/null +++ b/checkpoint-16848/tokenizer_config.json @@ -0,0 +1,43 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": true, + "model_max_length": 2048, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false, + "use_fast": true +} diff --git a/checkpoint-16848/trainer_state.json b/checkpoint-16848/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a81badf41324718b70079c4e113b372df15bd5ea --- /dev/null +++ b/checkpoint-16848/trainer_state.json @@ -0,0 +1,118073 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.9996438746438745, + "eval_steps": 1404, + "global_step": 16848, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00017806267806267807, + "grad_norm": 0.2854898273944855, + "learning_rate": 1e-05, + "loss": 1.1997, + "step": 1 + }, + { + "epoch": 0.00017806267806267807, + "eval_loss": 1.3698358535766602, + "eval_runtime": 24.1591, + "eval_samples_per_second": 43.089, + "eval_steps_per_second": 21.565, + "step": 1 + }, + { + "epoch": 0.00035612535612535614, + "grad_norm": 0.3508087396621704, + "learning_rate": 2e-05, + "loss": 1.4134, + "step": 2 + }, + { + "epoch": 0.0005341880341880342, + "grad_norm": 0.27050870656967163, + "learning_rate": 3e-05, + "loss": 1.3447, + "step": 3 + }, + { + "epoch": 0.0007122507122507123, + "grad_norm": 0.27706292271614075, + "learning_rate": 4e-05, + "loss": 1.0354, + "step": 4 + }, + { + "epoch": 
0.0008903133903133903, + "grad_norm": 0.30398961901664734, + "learning_rate": 5e-05, + "loss": 1.1441, + "step": 5 + }, + { + "epoch": 0.0010683760683760685, + "grad_norm": 0.3103881776332855, + "learning_rate": 6e-05, + "loss": 1.341, + "step": 6 + }, + { + "epoch": 0.0012464387464387464, + "grad_norm": 0.5191189646720886, + "learning_rate": 7e-05, + "loss": 1.3457, + "step": 7 + }, + { + "epoch": 0.0014245014245014246, + "grad_norm": 0.4449467360973358, + "learning_rate": 8e-05, + "loss": 1.5051, + "step": 8 + }, + { + "epoch": 0.0016025641025641025, + "grad_norm": 0.3914581537246704, + "learning_rate": 9e-05, + "loss": 1.5525, + "step": 9 + }, + { + "epoch": 0.0017806267806267807, + "grad_norm": 0.37746086716651917, + "learning_rate": 0.0001, + "loss": 1.3266, + "step": 10 + }, + { + "epoch": 0.001958689458689459, + "grad_norm": 0.35226109623908997, + "learning_rate": 0.00011000000000000002, + "loss": 1.5416, + "step": 11 + }, + { + "epoch": 0.002136752136752137, + "grad_norm": 0.3343672454357147, + "learning_rate": 0.00012, + "loss": 1.3221, + "step": 12 + }, + { + "epoch": 0.0023148148148148147, + "grad_norm": 0.47298333048820496, + "learning_rate": 0.00013000000000000002, + "loss": 1.2999, + "step": 13 + }, + { + "epoch": 0.002492877492877493, + "grad_norm": 0.377814918756485, + "learning_rate": 0.00014, + "loss": 1.1688, + "step": 14 + }, + { + "epoch": 0.002670940170940171, + "grad_norm": 0.46344801783561707, + "learning_rate": 0.00015000000000000001, + "loss": 1.3565, + "step": 15 + }, + { + "epoch": 0.002849002849002849, + "grad_norm": 0.49615249037742615, + "learning_rate": 0.00016, + "loss": 1.5692, + "step": 16 + }, + { + "epoch": 0.003027065527065527, + "grad_norm": 0.5109946131706238, + "learning_rate": 0.00017, + "loss": 1.2991, + "step": 17 + }, + { + "epoch": 0.003205128205128205, + "grad_norm": 0.5125070214271545, + "learning_rate": 0.00018, + "loss": 1.3309, + "step": 18 + }, + { + "epoch": 0.003383190883190883, + "grad_norm": 
0.4517767131328583, + "learning_rate": 0.00019, + "loss": 1.357, + "step": 19 + }, + { + "epoch": 0.0035612535612535613, + "grad_norm": 0.47267794609069824, + "learning_rate": 0.0002, + "loss": 1.1301, + "step": 20 + }, + { + "epoch": 0.0037393162393162395, + "grad_norm": 0.46823424100875854, + "learning_rate": 0.00019999999902035388, + "loss": 1.1195, + "step": 21 + }, + { + "epoch": 0.003917378917378918, + "grad_norm": 0.440036803483963, + "learning_rate": 0.00019999999608141548, + "loss": 1.2822, + "step": 22 + }, + { + "epoch": 0.004095441595441595, + "grad_norm": 0.371101975440979, + "learning_rate": 0.00019999999118318492, + "loss": 1.132, + "step": 23 + }, + { + "epoch": 0.004273504273504274, + "grad_norm": 0.44691094756126404, + "learning_rate": 0.00019999998432566226, + "loss": 1.2968, + "step": 24 + }, + { + "epoch": 0.004451566951566952, + "grad_norm": 0.5462725162506104, + "learning_rate": 0.0001999999755088476, + "loss": 1.1714, + "step": 25 + }, + { + "epoch": 0.004629629629629629, + "grad_norm": 0.39860013127326965, + "learning_rate": 0.0001999999647327412, + "loss": 1.0407, + "step": 26 + }, + { + "epoch": 0.004807692307692308, + "grad_norm": 0.5031934380531311, + "learning_rate": 0.0001999999519973432, + "loss": 1.2773, + "step": 27 + }, + { + "epoch": 0.004985754985754986, + "grad_norm": 0.42162764072418213, + "learning_rate": 0.0001999999373026539, + "loss": 1.2824, + "step": 28 + }, + { + "epoch": 0.005163817663817663, + "grad_norm": 0.40964868664741516, + "learning_rate": 0.00019999992064867353, + "loss": 1.226, + "step": 29 + }, + { + "epoch": 0.005341880341880342, + "grad_norm": 0.41650915145874023, + "learning_rate": 0.00019999990203540245, + "loss": 1.2677, + "step": 30 + }, + { + "epoch": 0.00551994301994302, + "grad_norm": 0.40052226185798645, + "learning_rate": 0.00019999988146284103, + "loss": 0.9443, + "step": 31 + }, + { + "epoch": 0.005698005698005698, + "grad_norm": 0.5198387503623962, + "learning_rate": 0.00019999985893098964, + 
"loss": 1.3043, + "step": 32 + }, + { + "epoch": 0.005876068376068376, + "grad_norm": 0.50941002368927, + "learning_rate": 0.00019999983443984878, + "loss": 1.2002, + "step": 33 + }, + { + "epoch": 0.006054131054131054, + "grad_norm": 0.30082932114601135, + "learning_rate": 0.00019999980798941888, + "loss": 0.9904, + "step": 34 + }, + { + "epoch": 0.006232193732193732, + "grad_norm": 0.4228935241699219, + "learning_rate": 0.00019999977957970048, + "loss": 1.1137, + "step": 35 + }, + { + "epoch": 0.00641025641025641, + "grad_norm": 0.41294750571250916, + "learning_rate": 0.0001999997492106941, + "loss": 1.3385, + "step": 36 + }, + { + "epoch": 0.006588319088319089, + "grad_norm": 0.4415493905544281, + "learning_rate": 0.00019999971688240041, + "loss": 1.1695, + "step": 37 + }, + { + "epoch": 0.006766381766381766, + "grad_norm": 0.3726460933685303, + "learning_rate": 0.00019999968259482, + "loss": 1.1734, + "step": 38 + }, + { + "epoch": 0.006944444444444444, + "grad_norm": 0.3969627320766449, + "learning_rate": 0.0001999996463479535, + "loss": 1.1209, + "step": 39 + }, + { + "epoch": 0.007122507122507123, + "grad_norm": 0.3779667913913727, + "learning_rate": 0.0001999996081418017, + "loss": 1.1635, + "step": 40 + }, + { + "epoch": 0.0073005698005698, + "grad_norm": 0.3933636546134949, + "learning_rate": 0.0001999995679763653, + "loss": 1.1514, + "step": 41 + }, + { + "epoch": 0.007478632478632479, + "grad_norm": 0.3567957282066345, + "learning_rate": 0.00019999952585164507, + "loss": 1.2488, + "step": 42 + }, + { + "epoch": 0.007656695156695157, + "grad_norm": 0.32506081461906433, + "learning_rate": 0.00019999948176764186, + "loss": 1.149, + "step": 43 + }, + { + "epoch": 0.007834757834757835, + "grad_norm": 0.46588361263275146, + "learning_rate": 0.0001999994357243566, + "loss": 1.4263, + "step": 44 + }, + { + "epoch": 0.008012820512820512, + "grad_norm": 0.5070307850837708, + "learning_rate": 0.00019999938772179005, + "loss": 1.0698, + "step": 45 + }, + { + 
"epoch": 0.00819088319088319, + "grad_norm": 0.38199326395988464, + "learning_rate": 0.00019999933775994327, + "loss": 0.9907, + "step": 46 + }, + { + "epoch": 0.00836894586894587, + "grad_norm": 0.43684661388397217, + "learning_rate": 0.0001999992858388172, + "loss": 1.2905, + "step": 47 + }, + { + "epoch": 0.008547008547008548, + "grad_norm": 0.44482162594795227, + "learning_rate": 0.00019999923195841284, + "loss": 1.2153, + "step": 48 + }, + { + "epoch": 0.008725071225071225, + "grad_norm": 0.4259667694568634, + "learning_rate": 0.0001999991761187313, + "loss": 1.1582, + "step": 49 + }, + { + "epoch": 0.008903133903133903, + "grad_norm": 0.41649091243743896, + "learning_rate": 0.00019999911831977357, + "loss": 1.0185, + "step": 50 + }, + { + "epoch": 0.009081196581196582, + "grad_norm": 0.4179716110229492, + "learning_rate": 0.0001999990585615409, + "loss": 1.3579, + "step": 51 + }, + { + "epoch": 0.009259259259259259, + "grad_norm": 0.3372558355331421, + "learning_rate": 0.00019999899684403438, + "loss": 1.0638, + "step": 52 + }, + { + "epoch": 0.009437321937321937, + "grad_norm": 0.41294020414352417, + "learning_rate": 0.00019999893316725525, + "loss": 1.1932, + "step": 53 + }, + { + "epoch": 0.009615384615384616, + "grad_norm": 0.4407919645309448, + "learning_rate": 0.00019999886753120473, + "loss": 1.4129, + "step": 54 + }, + { + "epoch": 0.009793447293447293, + "grad_norm": 0.47948843240737915, + "learning_rate": 0.00019999879993588414, + "loss": 1.2424, + "step": 55 + }, + { + "epoch": 0.009971509971509971, + "grad_norm": 0.3535355031490326, + "learning_rate": 0.00019999873038129484, + "loss": 1.0145, + "step": 56 + }, + { + "epoch": 0.01014957264957265, + "grad_norm": 0.5067078471183777, + "learning_rate": 0.00019999865886743813, + "loss": 1.4708, + "step": 57 + }, + { + "epoch": 0.010327635327635327, + "grad_norm": 0.42862898111343384, + "learning_rate": 0.0001999985853943154, + "loss": 1.0399, + "step": 58 + }, + { + "epoch": 0.010505698005698005, + 
"grad_norm": 0.4769059419631958, + "learning_rate": 0.00019999850996192816, + "loss": 1.1258, + "step": 59 + }, + { + "epoch": 0.010683760683760684, + "grad_norm": 0.4065442383289337, + "learning_rate": 0.0001999984325702778, + "loss": 1.2077, + "step": 60 + }, + { + "epoch": 0.010861823361823363, + "grad_norm": 0.5318329930305481, + "learning_rate": 0.0001999983532193659, + "loss": 1.2298, + "step": 61 + }, + { + "epoch": 0.01103988603988604, + "grad_norm": 0.4777173101902008, + "learning_rate": 0.000199998271909194, + "loss": 1.3195, + "step": 62 + }, + { + "epoch": 0.011217948717948718, + "grad_norm": 0.37553808093070984, + "learning_rate": 0.0001999981886397637, + "loss": 1.1188, + "step": 63 + }, + { + "epoch": 0.011396011396011397, + "grad_norm": 0.3920556902885437, + "learning_rate": 0.0001999981034110766, + "loss": 1.1448, + "step": 64 + }, + { + "epoch": 0.011574074074074073, + "grad_norm": 0.454272598028183, + "learning_rate": 0.0001999980162231344, + "loss": 1.0812, + "step": 65 + }, + { + "epoch": 0.011752136752136752, + "grad_norm": 0.4354456663131714, + "learning_rate": 0.00019999792707593882, + "loss": 1.1174, + "step": 66 + }, + { + "epoch": 0.01193019943019943, + "grad_norm": 0.5030252933502197, + "learning_rate": 0.00019999783596949156, + "loss": 1.2925, + "step": 67 + }, + { + "epoch": 0.012108262108262107, + "grad_norm": 0.5141571164131165, + "learning_rate": 0.00019999774290379446, + "loss": 1.6193, + "step": 68 + }, + { + "epoch": 0.012286324786324786, + "grad_norm": 0.417298287153244, + "learning_rate": 0.0001999976478788493, + "loss": 1.1875, + "step": 69 + }, + { + "epoch": 0.012464387464387465, + "grad_norm": 0.4642415940761566, + "learning_rate": 0.00019999755089465795, + "loss": 1.4138, + "step": 70 + }, + { + "epoch": 0.012642450142450143, + "grad_norm": 0.43184754252433777, + "learning_rate": 0.0001999974519512223, + "loss": 1.0697, + "step": 71 + }, + { + "epoch": 0.01282051282051282, + "grad_norm": 0.46698349714279175, + 
"learning_rate": 0.00019999735104854436, + "loss": 0.709, + "step": 72 + }, + { + "epoch": 0.012998575498575499, + "grad_norm": 0.37253814935684204, + "learning_rate": 0.000199997248186626, + "loss": 1.2084, + "step": 73 + }, + { + "epoch": 0.013176638176638177, + "grad_norm": 0.3851388692855835, + "learning_rate": 0.0001999971433654693, + "loss": 1.0548, + "step": 74 + }, + { + "epoch": 0.013354700854700854, + "grad_norm": 0.4434688985347748, + "learning_rate": 0.00019999703658507635, + "loss": 1.4084, + "step": 75 + }, + { + "epoch": 0.013532763532763533, + "grad_norm": 0.43164482712745667, + "learning_rate": 0.00019999692784544913, + "loss": 1.4872, + "step": 76 + }, + { + "epoch": 0.013710826210826211, + "grad_norm": 0.4224303364753723, + "learning_rate": 0.00019999681714658984, + "loss": 1.2221, + "step": 77 + }, + { + "epoch": 0.013888888888888888, + "grad_norm": 0.35588955879211426, + "learning_rate": 0.00019999670448850069, + "loss": 0.84, + "step": 78 + }, + { + "epoch": 0.014066951566951567, + "grad_norm": 0.3970590829849243, + "learning_rate": 0.0001999965898711838, + "loss": 1.1886, + "step": 79 + }, + { + "epoch": 0.014245014245014245, + "grad_norm": 0.4331924319267273, + "learning_rate": 0.00019999647329464146, + "loss": 1.179, + "step": 80 + }, + { + "epoch": 0.014423076923076924, + "grad_norm": 0.4226946234703064, + "learning_rate": 0.00019999635475887598, + "loss": 1.1496, + "step": 81 + }, + { + "epoch": 0.0146011396011396, + "grad_norm": 0.381592720746994, + "learning_rate": 0.00019999623426388962, + "loss": 1.1774, + "step": 82 + }, + { + "epoch": 0.01477920227920228, + "grad_norm": 0.4190855622291565, + "learning_rate": 0.00019999611180968478, + "loss": 1.1491, + "step": 83 + }, + { + "epoch": 0.014957264957264958, + "grad_norm": 0.3904292583465576, + "learning_rate": 0.00019999598739626389, + "loss": 1.1275, + "step": 84 + }, + { + "epoch": 0.015135327635327635, + "grad_norm": 0.4515478014945984, + "learning_rate": 0.0001999958610236293, + 
"loss": 1.2404, + "step": 85 + }, + { + "epoch": 0.015313390313390313, + "grad_norm": 0.48341724276542664, + "learning_rate": 0.00019999573269178359, + "loss": 1.3572, + "step": 86 + }, + { + "epoch": 0.015491452991452992, + "grad_norm": 0.42150333523750305, + "learning_rate": 0.00019999560240072914, + "loss": 1.0203, + "step": 87 + }, + { + "epoch": 0.01566951566951567, + "grad_norm": 0.45445525646209717, + "learning_rate": 0.00019999547015046867, + "loss": 1.0677, + "step": 88 + }, + { + "epoch": 0.01584757834757835, + "grad_norm": 0.3581015467643738, + "learning_rate": 0.00019999533594100463, + "loss": 1.0693, + "step": 89 + }, + { + "epoch": 0.016025641025641024, + "grad_norm": 0.4430878758430481, + "learning_rate": 0.00019999519977233971, + "loss": 1.1591, + "step": 90 + }, + { + "epoch": 0.016203703703703703, + "grad_norm": 0.3940352201461792, + "learning_rate": 0.0001999950616444766, + "loss": 1.1325, + "step": 91 + }, + { + "epoch": 0.01638176638176638, + "grad_norm": 0.4521673321723938, + "learning_rate": 0.00019999492155741794, + "loss": 1.3288, + "step": 92 + }, + { + "epoch": 0.01655982905982906, + "grad_norm": 0.3988296687602997, + "learning_rate": 0.00019999477951116658, + "loss": 1.0023, + "step": 93 + }, + { + "epoch": 0.01673789173789174, + "grad_norm": 0.38709723949432373, + "learning_rate": 0.00019999463550572516, + "loss": 1.2623, + "step": 94 + }, + { + "epoch": 0.016915954415954417, + "grad_norm": 0.35376182198524475, + "learning_rate": 0.00019999448954109662, + "loss": 1.0643, + "step": 95 + }, + { + "epoch": 0.017094017094017096, + "grad_norm": 0.49547120928764343, + "learning_rate": 0.00019999434161728377, + "loss": 1.2121, + "step": 96 + }, + { + "epoch": 0.01727207977207977, + "grad_norm": 0.49593672156333923, + "learning_rate": 0.00019999419173428952, + "loss": 1.1635, + "step": 97 + }, + { + "epoch": 0.01745014245014245, + "grad_norm": 0.4146541953086853, + "learning_rate": 0.0001999940398921168, + "loss": 1.1452, + "step": 98 + }, + { 
+ "epoch": 0.017628205128205128, + "grad_norm": 0.5177254676818848, + "learning_rate": 0.00019999388609076858, + "loss": 1.2178, + "step": 99 + }, + { + "epoch": 0.017806267806267807, + "grad_norm": 0.4012768864631653, + "learning_rate": 0.0001999937303302479, + "loss": 0.9222, + "step": 100 + }, + { + "epoch": 0.017984330484330485, + "grad_norm": 0.4597131907939911, + "learning_rate": 0.00019999357261055777, + "loss": 0.979, + "step": 101 + }, + { + "epoch": 0.018162393162393164, + "grad_norm": 0.6190966963768005, + "learning_rate": 0.00019999341293170132, + "loss": 1.3909, + "step": 102 + }, + { + "epoch": 0.01834045584045584, + "grad_norm": 0.4576462209224701, + "learning_rate": 0.00019999325129368164, + "loss": 1.073, + "step": 103 + }, + { + "epoch": 0.018518518518518517, + "grad_norm": 0.4036749005317688, + "learning_rate": 0.00019999308769650192, + "loss": 1.1354, + "step": 104 + }, + { + "epoch": 0.018696581196581196, + "grad_norm": 0.4722452759742737, + "learning_rate": 0.00019999292214016538, + "loss": 1.2039, + "step": 105 + }, + { + "epoch": 0.018874643874643875, + "grad_norm": 0.5338274240493774, + "learning_rate": 0.00019999275462467527, + "loss": 1.225, + "step": 106 + }, + { + "epoch": 0.019052706552706553, + "grad_norm": 0.4301491677761078, + "learning_rate": 0.00019999258515003484, + "loss": 1.0601, + "step": 107 + }, + { + "epoch": 0.019230769230769232, + "grad_norm": 0.33271175622940063, + "learning_rate": 0.0001999924137162474, + "loss": 0.8441, + "step": 108 + }, + { + "epoch": 0.01940883190883191, + "grad_norm": 0.4648784399032593, + "learning_rate": 0.0001999922403233163, + "loss": 1.2038, + "step": 109 + }, + { + "epoch": 0.019586894586894586, + "grad_norm": 0.37915176153182983, + "learning_rate": 0.00019999206497124504, + "loss": 1.0923, + "step": 110 + }, + { + "epoch": 0.019764957264957264, + "grad_norm": 0.3865506052970886, + "learning_rate": 0.00019999188766003695, + "loss": 0.9535, + "step": 111 + }, + { + "epoch": 
0.019943019943019943, + "grad_norm": 0.35739636421203613, + "learning_rate": 0.0001999917083896955, + "loss": 1.2688, + "step": 112 + }, + { + "epoch": 0.02012108262108262, + "grad_norm": 0.3943796157836914, + "learning_rate": 0.0001999915271602243, + "loss": 1.1097, + "step": 113 + }, + { + "epoch": 0.0202991452991453, + "grad_norm": 0.44758161902427673, + "learning_rate": 0.0001999913439716268, + "loss": 1.2698, + "step": 114 + }, + { + "epoch": 0.02047720797720798, + "grad_norm": 0.3749747574329376, + "learning_rate": 0.00019999115882390664, + "loss": 1.1091, + "step": 115 + }, + { + "epoch": 0.020655270655270654, + "grad_norm": 0.3479487895965576, + "learning_rate": 0.00019999097171706745, + "loss": 1.0049, + "step": 116 + }, + { + "epoch": 0.020833333333333332, + "grad_norm": 0.4491243064403534, + "learning_rate": 0.00019999078265111285, + "loss": 1.1857, + "step": 117 + }, + { + "epoch": 0.02101139601139601, + "grad_norm": 0.345289021730423, + "learning_rate": 0.00019999059162604662, + "loss": 1.1397, + "step": 118 + }, + { + "epoch": 0.02118945868945869, + "grad_norm": 0.5467649698257446, + "learning_rate": 0.00019999039864187243, + "loss": 1.2196, + "step": 119 + }, + { + "epoch": 0.021367521367521368, + "grad_norm": 0.36446481943130493, + "learning_rate": 0.00019999020369859409, + "loss": 0.796, + "step": 120 + }, + { + "epoch": 0.021545584045584047, + "grad_norm": 0.4225841760635376, + "learning_rate": 0.00019999000679621543, + "loss": 0.9684, + "step": 121 + }, + { + "epoch": 0.021723646723646725, + "grad_norm": 0.4205594062805176, + "learning_rate": 0.0001999898079347403, + "loss": 1.2762, + "step": 122 + }, + { + "epoch": 0.0219017094017094, + "grad_norm": 0.43773892521858215, + "learning_rate": 0.00019998960711417257, + "loss": 1.117, + "step": 123 + }, + { + "epoch": 0.02207977207977208, + "grad_norm": 0.41279685497283936, + "learning_rate": 0.00019998940433451623, + "loss": 1.1502, + "step": 124 + }, + { + "epoch": 0.022257834757834757, + 
"grad_norm": 0.4090803563594818, + "learning_rate": 0.0001999891995957752, + "loss": 1.2591, + "step": 125 + }, + { + "epoch": 0.022435897435897436, + "grad_norm": 0.6000410914421082, + "learning_rate": 0.0001999889928979535, + "loss": 1.4321, + "step": 126 + }, + { + "epoch": 0.022613960113960115, + "grad_norm": 0.524264395236969, + "learning_rate": 0.00019998878424105524, + "loss": 1.1849, + "step": 127 + }, + { + "epoch": 0.022792022792022793, + "grad_norm": 0.4581047296524048, + "learning_rate": 0.00019998857362508443, + "loss": 1.0598, + "step": 128 + }, + { + "epoch": 0.022970085470085472, + "grad_norm": 0.42663446068763733, + "learning_rate": 0.00019998836105004526, + "loss": 1.1909, + "step": 129 + }, + { + "epoch": 0.023148148148148147, + "grad_norm": 0.45709118247032166, + "learning_rate": 0.00019998814651594183, + "loss": 1.2104, + "step": 130 + }, + { + "epoch": 0.023326210826210825, + "grad_norm": 0.39528369903564453, + "learning_rate": 0.0001999879300227784, + "loss": 1.3073, + "step": 131 + }, + { + "epoch": 0.023504273504273504, + "grad_norm": 0.46896448731422424, + "learning_rate": 0.00019998771157055914, + "loss": 1.3202, + "step": 132 + }, + { + "epoch": 0.023682336182336183, + "grad_norm": 0.4386129677295685, + "learning_rate": 0.00019998749115928842, + "loss": 1.2196, + "step": 133 + }, + { + "epoch": 0.02386039886039886, + "grad_norm": 0.45920488238334656, + "learning_rate": 0.00019998726878897051, + "loss": 1.3668, + "step": 134 + }, + { + "epoch": 0.02403846153846154, + "grad_norm": 0.4115797281265259, + "learning_rate": 0.0001999870444596098, + "loss": 1.1052, + "step": 135 + }, + { + "epoch": 0.024216524216524215, + "grad_norm": 0.3860839903354645, + "learning_rate": 0.0001999868181712106, + "loss": 1.0344, + "step": 136 + }, + { + "epoch": 0.024394586894586893, + "grad_norm": 0.42514732480049133, + "learning_rate": 0.00019998658992377742, + "loss": 1.1979, + "step": 137 + }, + { + "epoch": 0.024572649572649572, + "grad_norm": 
0.36001840233802795, + "learning_rate": 0.00019998635971731475, + "loss": 1.4536, + "step": 138 + }, + { + "epoch": 0.02475071225071225, + "grad_norm": 0.3739112317562103, + "learning_rate": 0.00019998612755182707, + "loss": 1.0097, + "step": 139 + }, + { + "epoch": 0.02492877492877493, + "grad_norm": 0.37545472383499146, + "learning_rate": 0.00019998589342731888, + "loss": 0.829, + "step": 140 + }, + { + "epoch": 0.025106837606837608, + "grad_norm": 0.38660728931427, + "learning_rate": 0.0001999856573437948, + "loss": 1.1324, + "step": 141 + }, + { + "epoch": 0.025284900284900286, + "grad_norm": 0.3741356432437897, + "learning_rate": 0.00019998541930125953, + "loss": 1.0934, + "step": 142 + }, + { + "epoch": 0.02546296296296296, + "grad_norm": 0.41900336742401123, + "learning_rate": 0.00019998517929971764, + "loss": 1.0336, + "step": 143 + }, + { + "epoch": 0.02564102564102564, + "grad_norm": 0.4167572259902954, + "learning_rate": 0.00019998493733917384, + "loss": 1.2571, + "step": 144 + }, + { + "epoch": 0.02581908831908832, + "grad_norm": 0.39437636733055115, + "learning_rate": 0.0001999846934196329, + "loss": 1.2283, + "step": 145 + }, + { + "epoch": 0.025997150997150997, + "grad_norm": 0.39129480719566345, + "learning_rate": 0.00019998444754109964, + "loss": 0.9893, + "step": 146 + }, + { + "epoch": 0.026175213675213676, + "grad_norm": 0.45533549785614014, + "learning_rate": 0.0001999841997035788, + "loss": 1.0793, + "step": 147 + }, + { + "epoch": 0.026353276353276354, + "grad_norm": 0.3741768002510071, + "learning_rate": 0.00019998394990707524, + "loss": 1.2179, + "step": 148 + }, + { + "epoch": 0.026531339031339033, + "grad_norm": 0.4066533148288727, + "learning_rate": 0.0001999836981515939, + "loss": 1.1443, + "step": 149 + }, + { + "epoch": 0.026709401709401708, + "grad_norm": 0.4851688742637634, + "learning_rate": 0.0001999834444371397, + "loss": 1.1668, + "step": 150 + }, + { + "epoch": 0.026887464387464387, + "grad_norm": 0.428091436624527, + 
"learning_rate": 0.0001999831887637176, + "loss": 1.2676, + "step": 151 + }, + { + "epoch": 0.027065527065527065, + "grad_norm": 0.4024655222892761, + "learning_rate": 0.0001999829311313326, + "loss": 1.3115, + "step": 152 + }, + { + "epoch": 0.027243589743589744, + "grad_norm": 0.43983033299446106, + "learning_rate": 0.00019998267153998976, + "loss": 1.1019, + "step": 153 + }, + { + "epoch": 0.027421652421652423, + "grad_norm": 0.4317505359649658, + "learning_rate": 0.0001999824099896942, + "loss": 1.3129, + "step": 154 + }, + { + "epoch": 0.0275997150997151, + "grad_norm": 0.43107882142066956, + "learning_rate": 0.000199982146480451, + "loss": 1.2134, + "step": 155 + }, + { + "epoch": 0.027777777777777776, + "grad_norm": 0.3939448297023773, + "learning_rate": 0.00019998188101226532, + "loss": 1.0321, + "step": 156 + }, + { + "epoch": 0.027955840455840455, + "grad_norm": 0.4641847610473633, + "learning_rate": 0.00019998161358514237, + "loss": 1.2369, + "step": 157 + }, + { + "epoch": 0.028133903133903133, + "grad_norm": 0.3538529872894287, + "learning_rate": 0.0001999813441990874, + "loss": 1.2061, + "step": 158 + }, + { + "epoch": 0.028311965811965812, + "grad_norm": 0.3277950584888458, + "learning_rate": 0.0001999810728541057, + "loss": 0.9419, + "step": 159 + }, + { + "epoch": 0.02849002849002849, + "grad_norm": 0.424710750579834, + "learning_rate": 0.00019998079955020254, + "loss": 1.3302, + "step": 160 + }, + { + "epoch": 0.02866809116809117, + "grad_norm": 0.4120834469795227, + "learning_rate": 0.00019998052428738333, + "loss": 1.079, + "step": 161 + }, + { + "epoch": 0.028846153846153848, + "grad_norm": 0.45811930298805237, + "learning_rate": 0.00019998024706565346, + "loss": 1.1259, + "step": 162 + }, + { + "epoch": 0.029024216524216523, + "grad_norm": 0.3873266875743866, + "learning_rate": 0.0001999799678850183, + "loss": 1.2124, + "step": 163 + }, + { + "epoch": 0.0292022792022792, + "grad_norm": 0.5806412696838379, + "learning_rate": 
0.00019997968674548337, + "loss": 1.3467, + "step": 164 + }, + { + "epoch": 0.02938034188034188, + "grad_norm": 0.3906802833080292, + "learning_rate": 0.00019997940364705418, + "loss": 1.1438, + "step": 165 + }, + { + "epoch": 0.02955840455840456, + "grad_norm": 0.45201995968818665, + "learning_rate": 0.00019997911858973626, + "loss": 1.1469, + "step": 166 + }, + { + "epoch": 0.029736467236467237, + "grad_norm": 0.4965892732143402, + "learning_rate": 0.0001999788315735352, + "loss": 1.0829, + "step": 167 + }, + { + "epoch": 0.029914529914529916, + "grad_norm": 0.32578057050704956, + "learning_rate": 0.0001999785425984566, + "loss": 1.0432, + "step": 168 + }, + { + "epoch": 0.03009259259259259, + "grad_norm": 0.4146028161048889, + "learning_rate": 0.00019997825166450617, + "loss": 1.1657, + "step": 169 + }, + { + "epoch": 0.03027065527065527, + "grad_norm": 0.4342964291572571, + "learning_rate": 0.0001999779587716896, + "loss": 1.2038, + "step": 170 + }, + { + "epoch": 0.030448717948717948, + "grad_norm": 0.40128546953201294, + "learning_rate": 0.00019997766392001258, + "loss": 1.3044, + "step": 171 + }, + { + "epoch": 0.030626780626780627, + "grad_norm": 0.4357539117336273, + "learning_rate": 0.00019997736710948094, + "loss": 1.2143, + "step": 172 + }, + { + "epoch": 0.030804843304843305, + "grad_norm": 0.4821035861968994, + "learning_rate": 0.00019997706834010045, + "loss": 1.0469, + "step": 173 + }, + { + "epoch": 0.030982905982905984, + "grad_norm": 0.3966675102710724, + "learning_rate": 0.000199976767611877, + "loss": 1.2122, + "step": 174 + }, + { + "epoch": 0.031160968660968662, + "grad_norm": 0.4265064299106598, + "learning_rate": 0.00019997646492481648, + "loss": 1.0871, + "step": 175 + }, + { + "epoch": 0.03133903133903134, + "grad_norm": 0.3445652723312378, + "learning_rate": 0.00019997616027892485, + "loss": 1.0412, + "step": 176 + }, + { + "epoch": 0.031517094017094016, + "grad_norm": 0.47187718749046326, + "learning_rate": 0.000199975853674208, + 
"loss": 1.0822, + "step": 177 + }, + { + "epoch": 0.0316951566951567, + "grad_norm": 0.37751707434654236, + "learning_rate": 0.000199975545110672, + "loss": 1.1439, + "step": 178 + }, + { + "epoch": 0.03187321937321937, + "grad_norm": 0.38792455196380615, + "learning_rate": 0.00019997523458832286, + "loss": 0.8604, + "step": 179 + }, + { + "epoch": 0.03205128205128205, + "grad_norm": 0.35199594497680664, + "learning_rate": 0.00019997492210716667, + "loss": 1.0819, + "step": 180 + }, + { + "epoch": 0.03222934472934473, + "grad_norm": 0.4828922748565674, + "learning_rate": 0.00019997460766720958, + "loss": 1.1879, + "step": 181 + }, + { + "epoch": 0.032407407407407406, + "grad_norm": 0.46153363585472107, + "learning_rate": 0.00019997429126845774, + "loss": 1.1592, + "step": 182 + }, + { + "epoch": 0.03258547008547009, + "grad_norm": 0.4844890832901001, + "learning_rate": 0.0001999739729109173, + "loss": 1.1334, + "step": 183 + }, + { + "epoch": 0.03276353276353276, + "grad_norm": 0.414617121219635, + "learning_rate": 0.00019997365259459457, + "loss": 1.0547, + "step": 184 + }, + { + "epoch": 0.032941595441595445, + "grad_norm": 0.46544626355171204, + "learning_rate": 0.00019997333031949581, + "loss": 1.4067, + "step": 185 + }, + { + "epoch": 0.03311965811965812, + "grad_norm": 0.48489415645599365, + "learning_rate": 0.0001999730060856273, + "loss": 1.4027, + "step": 186 + }, + { + "epoch": 0.033297720797720795, + "grad_norm": 0.3963346481323242, + "learning_rate": 0.0001999726798929954, + "loss": 1.1327, + "step": 187 + }, + { + "epoch": 0.03347578347578348, + "grad_norm": 0.3809385895729065, + "learning_rate": 0.00019997235174160652, + "loss": 1.3475, + "step": 188 + }, + { + "epoch": 0.03365384615384615, + "grad_norm": 0.3866960406303406, + "learning_rate": 0.0001999720216314671, + "loss": 1.1576, + "step": 189 + }, + { + "epoch": 0.033831908831908834, + "grad_norm": 0.34976935386657715, + "learning_rate": 0.00019997168956258356, + "loss": 0.9361, + "step": 190 + 
}, + { + "epoch": 0.03400997150997151, + "grad_norm": 0.38681939244270325, + "learning_rate": 0.00019997135553496243, + "loss": 1.1796, + "step": 191 + }, + { + "epoch": 0.03418803418803419, + "grad_norm": 0.41905197501182556, + "learning_rate": 0.0001999710195486103, + "loss": 1.1714, + "step": 192 + }, + { + "epoch": 0.03436609686609687, + "grad_norm": 0.42356589436531067, + "learning_rate": 0.0001999706816035337, + "loss": 1.0022, + "step": 193 + }, + { + "epoch": 0.03454415954415954, + "grad_norm": 0.3929740786552429, + "learning_rate": 0.00019997034169973925, + "loss": 1.3769, + "step": 194 + }, + { + "epoch": 0.034722222222222224, + "grad_norm": 0.4325186312198639, + "learning_rate": 0.00019996999983723366, + "loss": 1.3057, + "step": 195 + }, + { + "epoch": 0.0349002849002849, + "grad_norm": 0.3954029381275177, + "learning_rate": 0.00019996965601602355, + "loss": 1.1958, + "step": 196 + }, + { + "epoch": 0.03507834757834758, + "grad_norm": 0.34454262256622314, + "learning_rate": 0.00019996931023611572, + "loss": 1.0972, + "step": 197 + }, + { + "epoch": 0.035256410256410256, + "grad_norm": 0.48900291323661804, + "learning_rate": 0.0001999689624975169, + "loss": 1.213, + "step": 198 + }, + { + "epoch": 0.03543447293447293, + "grad_norm": 0.35214388370513916, + "learning_rate": 0.00019996861280023397, + "loss": 1.0285, + "step": 199 + }, + { + "epoch": 0.03561253561253561, + "grad_norm": 0.49393126368522644, + "learning_rate": 0.00019996826114427373, + "loss": 1.2313, + "step": 200 + }, + { + "epoch": 0.03579059829059829, + "grad_norm": 0.3994458019733429, + "learning_rate": 0.00019996790752964305, + "loss": 1.0474, + "step": 201 + }, + { + "epoch": 0.03596866096866097, + "grad_norm": 0.5387318730354309, + "learning_rate": 0.0001999675519563489, + "loss": 1.3067, + "step": 202 + }, + { + "epoch": 0.036146723646723646, + "grad_norm": 0.4976751208305359, + "learning_rate": 0.00019996719442439824, + "loss": 1.2593, + "step": 203 + }, + { + "epoch": 
0.03632478632478633, + "grad_norm": 0.47052907943725586, + "learning_rate": 0.0001999668349337981, + "loss": 1.1036, + "step": 204 + }, + { + "epoch": 0.036502849002849, + "grad_norm": 0.39616644382476807, + "learning_rate": 0.00019996647348455543, + "loss": 1.0481, + "step": 205 + }, + { + "epoch": 0.03668091168091168, + "grad_norm": 0.42987677454948425, + "learning_rate": 0.00019996611007667742, + "loss": 1.0923, + "step": 206 + }, + { + "epoch": 0.03685897435897436, + "grad_norm": 0.47065848112106323, + "learning_rate": 0.00019996574471017113, + "loss": 1.1403, + "step": 207 + }, + { + "epoch": 0.037037037037037035, + "grad_norm": 0.4363015592098236, + "learning_rate": 0.00019996537738504373, + "loss": 1.253, + "step": 208 + }, + { + "epoch": 0.03721509971509972, + "grad_norm": 0.4038296937942505, + "learning_rate": 0.00019996500810130243, + "loss": 1.1679, + "step": 209 + }, + { + "epoch": 0.03739316239316239, + "grad_norm": 0.5038532018661499, + "learning_rate": 0.00019996463685895445, + "loss": 1.1182, + "step": 210 + }, + { + "epoch": 0.037571225071225074, + "grad_norm": 0.37740692496299744, + "learning_rate": 0.00019996426365800706, + "loss": 1.0465, + "step": 211 + }, + { + "epoch": 0.03774928774928775, + "grad_norm": 0.47794604301452637, + "learning_rate": 0.00019996388849846759, + "loss": 1.2836, + "step": 212 + }, + { + "epoch": 0.037927350427350424, + "grad_norm": 0.38460609316825867, + "learning_rate": 0.0001999635113803434, + "loss": 1.2099, + "step": 213 + }, + { + "epoch": 0.038105413105413107, + "grad_norm": 0.42016157507896423, + "learning_rate": 0.0001999631323036418, + "loss": 1.152, + "step": 214 + }, + { + "epoch": 0.03828347578347578, + "grad_norm": 0.4024946391582489, + "learning_rate": 0.00019996275126837033, + "loss": 1.1534, + "step": 215 + }, + { + "epoch": 0.038461538461538464, + "grad_norm": 0.4573793411254883, + "learning_rate": 0.00019996236827453642, + "loss": 1.2019, + "step": 216 + }, + { + "epoch": 0.03863960113960114, + 
"grad_norm": 0.3642503321170807, + "learning_rate": 0.0001999619833221475, + "loss": 1.0541, + "step": 217 + }, + { + "epoch": 0.03881766381766382, + "grad_norm": 0.38492897152900696, + "learning_rate": 0.0001999615964112112, + "loss": 1.1269, + "step": 218 + }, + { + "epoch": 0.038995726495726496, + "grad_norm": 0.427219420671463, + "learning_rate": 0.0001999612075417351, + "loss": 1.1126, + "step": 219 + }, + { + "epoch": 0.03917378917378917, + "grad_norm": 0.40781742334365845, + "learning_rate": 0.00019996081671372676, + "loss": 1.2207, + "step": 220 + }, + { + "epoch": 0.03935185185185185, + "grad_norm": 0.39229512214660645, + "learning_rate": 0.00019996042392719386, + "loss": 1.0403, + "step": 221 + }, + { + "epoch": 0.03952991452991453, + "grad_norm": 0.42038577795028687, + "learning_rate": 0.0001999600291821441, + "loss": 1.2157, + "step": 222 + }, + { + "epoch": 0.03970797720797721, + "grad_norm": 0.3963491916656494, + "learning_rate": 0.00019995963247858525, + "loss": 1.0532, + "step": 223 + }, + { + "epoch": 0.039886039886039885, + "grad_norm": 0.4389874041080475, + "learning_rate": 0.00019995923381652502, + "loss": 1.4279, + "step": 224 + }, + { + "epoch": 0.04006410256410257, + "grad_norm": 0.357312947511673, + "learning_rate": 0.00019995883319597123, + "loss": 0.9871, + "step": 225 + }, + { + "epoch": 0.04024216524216524, + "grad_norm": 0.3644427955150604, + "learning_rate": 0.00019995843061693181, + "loss": 1.0879, + "step": 226 + }, + { + "epoch": 0.04042022792022792, + "grad_norm": 0.4074651002883911, + "learning_rate": 0.00019995802607941453, + "loss": 1.2138, + "step": 227 + }, + { + "epoch": 0.0405982905982906, + "grad_norm": 0.40709465742111206, + "learning_rate": 0.0001999576195834274, + "loss": 1.1905, + "step": 228 + }, + { + "epoch": 0.040776353276353275, + "grad_norm": 0.4280182719230652, + "learning_rate": 0.00019995721112897838, + "loss": 1.2331, + "step": 229 + }, + { + "epoch": 0.04095441595441596, + "grad_norm": 0.37846076488494873, + 
"learning_rate": 0.00019995680071607544, + "loss": 1.078, + "step": 230 + }, + { + "epoch": 0.04113247863247863, + "grad_norm": 0.3877260088920593, + "learning_rate": 0.0001999563883447266, + "loss": 1.0309, + "step": 231 + }, + { + "epoch": 0.04131054131054131, + "grad_norm": 0.42886826395988464, + "learning_rate": 0.00019995597401494, + "loss": 1.0403, + "step": 232 + }, + { + "epoch": 0.04148860398860399, + "grad_norm": 0.4316534101963043, + "learning_rate": 0.00019995555772672372, + "loss": 1.2418, + "step": 233 + }, + { + "epoch": 0.041666666666666664, + "grad_norm": 0.45768865942955017, + "learning_rate": 0.00019995513948008593, + "loss": 1.233, + "step": 234 + }, + { + "epoch": 0.041844729344729346, + "grad_norm": 0.5647913813591003, + "learning_rate": 0.00019995471927503481, + "loss": 1.1346, + "step": 235 + }, + { + "epoch": 0.04202279202279202, + "grad_norm": 0.3797492980957031, + "learning_rate": 0.00019995429711157863, + "loss": 1.1574, + "step": 236 + }, + { + "epoch": 0.042200854700854704, + "grad_norm": 0.4392767548561096, + "learning_rate": 0.00019995387298972562, + "loss": 0.8988, + "step": 237 + }, + { + "epoch": 0.04237891737891738, + "grad_norm": 0.37331557273864746, + "learning_rate": 0.0001999534469094841, + "loss": 1.0439, + "step": 238 + }, + { + "epoch": 0.042556980056980054, + "grad_norm": 0.3785935938358307, + "learning_rate": 0.00019995301887086245, + "loss": 0.9839, + "step": 239 + }, + { + "epoch": 0.042735042735042736, + "grad_norm": 0.4351862668991089, + "learning_rate": 0.00019995258887386898, + "loss": 1.2653, + "step": 240 + }, + { + "epoch": 0.04291310541310541, + "grad_norm": 0.399475634098053, + "learning_rate": 0.0001999521569185122, + "loss": 0.9877, + "step": 241 + }, + { + "epoch": 0.04309116809116809, + "grad_norm": 0.42332810163497925, + "learning_rate": 0.00019995172300480053, + "loss": 1.2403, + "step": 242 + }, + { + "epoch": 0.04326923076923077, + "grad_norm": 0.4397708475589752, + "learning_rate": 
0.00019995128713274247, + "loss": 0.9316, + "step": 243 + }, + { + "epoch": 0.04344729344729345, + "grad_norm": 0.3614110052585602, + "learning_rate": 0.00019995084930234658, + "loss": 1.1088, + "step": 244 + }, + { + "epoch": 0.043625356125356125, + "grad_norm": 0.39433717727661133, + "learning_rate": 0.0001999504095136214, + "loss": 1.2002, + "step": 245 + }, + { + "epoch": 0.0438034188034188, + "grad_norm": 0.33088216185569763, + "learning_rate": 0.0001999499677665756, + "loss": 0.8796, + "step": 246 + }, + { + "epoch": 0.04398148148148148, + "grad_norm": 0.5239143967628479, + "learning_rate": 0.00019994952406121784, + "loss": 1.2808, + "step": 247 + }, + { + "epoch": 0.04415954415954416, + "grad_norm": 0.42156723141670227, + "learning_rate": 0.00019994907839755675, + "loss": 1.1775, + "step": 248 + }, + { + "epoch": 0.04433760683760684, + "grad_norm": 0.42569902539253235, + "learning_rate": 0.0001999486307756011, + "loss": 1.001, + "step": 249 + }, + { + "epoch": 0.044515669515669515, + "grad_norm": 0.38241544365882874, + "learning_rate": 0.00019994818119535964, + "loss": 1.1064, + "step": 250 + }, + { + "epoch": 0.0446937321937322, + "grad_norm": 0.4185071885585785, + "learning_rate": 0.0001999477296568412, + "loss": 1.2109, + "step": 251 + }, + { + "epoch": 0.04487179487179487, + "grad_norm": 0.4189644157886505, + "learning_rate": 0.00019994727616005464, + "loss": 1.2902, + "step": 252 + }, + { + "epoch": 0.04504985754985755, + "grad_norm": 0.34671884775161743, + "learning_rate": 0.0001999468207050088, + "loss": 0.9429, + "step": 253 + }, + { + "epoch": 0.04522792022792023, + "grad_norm": 0.42391687631607056, + "learning_rate": 0.00019994636329171266, + "loss": 0.7179, + "step": 254 + }, + { + "epoch": 0.045405982905982904, + "grad_norm": 0.3803195655345917, + "learning_rate": 0.00019994590392017513, + "loss": 1.0318, + "step": 255 + }, + { + "epoch": 0.045584045584045586, + "grad_norm": 0.3389956057071686, + "learning_rate": 0.00019994544259040525, + "loss": 
1.0485, + "step": 256 + }, + { + "epoch": 0.04576210826210826, + "grad_norm": 0.4927038550376892, + "learning_rate": 0.000199944979302412, + "loss": 1.3426, + "step": 257 + }, + { + "epoch": 0.045940170940170943, + "grad_norm": 0.33200421929359436, + "learning_rate": 0.00019994451405620453, + "loss": 1.0071, + "step": 258 + }, + { + "epoch": 0.04611823361823362, + "grad_norm": 0.38028615713119507, + "learning_rate": 0.00019994404685179195, + "loss": 1.0985, + "step": 259 + }, + { + "epoch": 0.046296296296296294, + "grad_norm": 0.3752151429653168, + "learning_rate": 0.00019994357768918333, + "loss": 0.9209, + "step": 260 + }, + { + "epoch": 0.046474358974358976, + "grad_norm": 0.43030866980552673, + "learning_rate": 0.00019994310656838796, + "loss": 0.9921, + "step": 261 + }, + { + "epoch": 0.04665242165242165, + "grad_norm": 0.4402460753917694, + "learning_rate": 0.00019994263348941502, + "loss": 1.1051, + "step": 262 + }, + { + "epoch": 0.04683048433048433, + "grad_norm": 0.43012720346450806, + "learning_rate": 0.0001999421584522738, + "loss": 1.1839, + "step": 263 + }, + { + "epoch": 0.04700854700854701, + "grad_norm": 0.4195305407047272, + "learning_rate": 0.0001999416814569736, + "loss": 1.1749, + "step": 264 + }, + { + "epoch": 0.04718660968660968, + "grad_norm": 0.45623287558555603, + "learning_rate": 0.00019994120250352372, + "loss": 1.2433, + "step": 265 + }, + { + "epoch": 0.047364672364672365, + "grad_norm": 0.4736156761646271, + "learning_rate": 0.00019994072159193363, + "loss": 1.2882, + "step": 266 + }, + { + "epoch": 0.04754273504273504, + "grad_norm": 0.36698561906814575, + "learning_rate": 0.0001999402387222127, + "loss": 1.1486, + "step": 267 + }, + { + "epoch": 0.04772079772079772, + "grad_norm": 0.3854144215583801, + "learning_rate": 0.00019993975389437038, + "loss": 0.8115, + "step": 268 + }, + { + "epoch": 0.0478988603988604, + "grad_norm": 0.41512808203697205, + "learning_rate": 0.0001999392671084162, + "loss": 1.0959, + "step": 269 + }, + { + 
"epoch": 0.04807692307692308, + "grad_norm": 0.3869563341140747, + "learning_rate": 0.0001999387783643597, + "loss": 1.087, + "step": 270 + }, + { + "epoch": 0.048254985754985755, + "grad_norm": 0.4649744927883148, + "learning_rate": 0.00019993828766221044, + "loss": 1.0011, + "step": 271 + }, + { + "epoch": 0.04843304843304843, + "grad_norm": 0.40331923961639404, + "learning_rate": 0.00019993779500197803, + "loss": 1.1463, + "step": 272 + }, + { + "epoch": 0.04861111111111111, + "grad_norm": 0.3826279938220978, + "learning_rate": 0.0001999373003836721, + "loss": 1.1491, + "step": 273 + }, + { + "epoch": 0.04878917378917379, + "grad_norm": 0.3967166543006897, + "learning_rate": 0.00019993680380730243, + "loss": 1.1462, + "step": 274 + }, + { + "epoch": 0.04896723646723647, + "grad_norm": 0.4298507869243622, + "learning_rate": 0.00019993630527287865, + "loss": 1.2471, + "step": 275 + }, + { + "epoch": 0.049145299145299144, + "grad_norm": 0.41486215591430664, + "learning_rate": 0.0001999358047804106, + "loss": 1.287, + "step": 276 + }, + { + "epoch": 0.049323361823361826, + "grad_norm": 0.3914124369621277, + "learning_rate": 0.00019993530232990803, + "loss": 1.0935, + "step": 277 + }, + { + "epoch": 0.0495014245014245, + "grad_norm": 0.39888378977775574, + "learning_rate": 0.00019993479792138082, + "loss": 1.2347, + "step": 278 + }, + { + "epoch": 0.049679487179487176, + "grad_norm": 0.3911665678024292, + "learning_rate": 0.00019993429155483884, + "loss": 1.0917, + "step": 279 + }, + { + "epoch": 0.04985754985754986, + "grad_norm": 0.42871445417404175, + "learning_rate": 0.00019993378323029197, + "loss": 1.0277, + "step": 280 + }, + { + "epoch": 0.050035612535612534, + "grad_norm": 0.35397860407829285, + "learning_rate": 0.00019993327294775027, + "loss": 0.9549, + "step": 281 + }, + { + "epoch": 0.050213675213675216, + "grad_norm": 0.4528059959411621, + "learning_rate": 0.00019993276070722364, + "loss": 1.2338, + "step": 282 + }, + { + "epoch": 0.05039173789173789, + 
"grad_norm": 0.354735791683197, + "learning_rate": 0.00019993224650872218, + "loss": 1.1892, + "step": 283 + }, + { + "epoch": 0.05056980056980057, + "grad_norm": 0.44407567381858826, + "learning_rate": 0.00019993173035225592, + "loss": 1.1621, + "step": 284 + }, + { + "epoch": 0.05074786324786325, + "grad_norm": 0.4177244305610657, + "learning_rate": 0.000199931212237835, + "loss": 1.1184, + "step": 285 + }, + { + "epoch": 0.05092592592592592, + "grad_norm": 0.5627759695053101, + "learning_rate": 0.0001999306921654696, + "loss": 1.0755, + "step": 286 + }, + { + "epoch": 0.051103988603988605, + "grad_norm": 0.46767523884773254, + "learning_rate": 0.00019993017013516986, + "loss": 1.2654, + "step": 287 + }, + { + "epoch": 0.05128205128205128, + "grad_norm": 0.4163128733634949, + "learning_rate": 0.000199929646146946, + "loss": 1.1307, + "step": 288 + }, + { + "epoch": 0.05146011396011396, + "grad_norm": 0.36954161524772644, + "learning_rate": 0.00019992912020080832, + "loss": 0.8274, + "step": 289 + }, + { + "epoch": 0.05163817663817664, + "grad_norm": 0.4770594835281372, + "learning_rate": 0.00019992859229676712, + "loss": 1.2235, + "step": 290 + }, + { + "epoch": 0.05181623931623932, + "grad_norm": 0.4174608290195465, + "learning_rate": 0.00019992806243483274, + "loss": 1.2893, + "step": 291 + }, + { + "epoch": 0.051994301994301995, + "grad_norm": 0.3794898986816406, + "learning_rate": 0.00019992753061501555, + "loss": 1.104, + "step": 292 + }, + { + "epoch": 0.05217236467236467, + "grad_norm": 0.3912592828273773, + "learning_rate": 0.000199926996837326, + "loss": 1.0043, + "step": 293 + }, + { + "epoch": 0.05235042735042735, + "grad_norm": 0.39641159772872925, + "learning_rate": 0.00019992646110177448, + "loss": 1.083, + "step": 294 + }, + { + "epoch": 0.05252849002849003, + "grad_norm": 0.3518857955932617, + "learning_rate": 0.00019992592340837157, + "loss": 0.9275, + "step": 295 + }, + { + "epoch": 0.05270655270655271, + "grad_norm": 0.3955721855163574, + 
"learning_rate": 0.00019992538375712777, + "loss": 1.0153, + "step": 296 + }, + { + "epoch": 0.052884615384615384, + "grad_norm": 0.3837333023548126, + "learning_rate": 0.00019992484214805364, + "loss": 1.1664, + "step": 297 + }, + { + "epoch": 0.053062678062678066, + "grad_norm": 0.39400920271873474, + "learning_rate": 0.0001999242985811598, + "loss": 1.0532, + "step": 298 + }, + { + "epoch": 0.05324074074074074, + "grad_norm": 0.39258649945259094, + "learning_rate": 0.00019992375305645692, + "loss": 1.0081, + "step": 299 + }, + { + "epoch": 0.053418803418803416, + "grad_norm": 0.49768248200416565, + "learning_rate": 0.00019992320557395566, + "loss": 1.2553, + "step": 300 + }, + { + "epoch": 0.0535968660968661, + "grad_norm": 0.364776074886322, + "learning_rate": 0.00019992265613366677, + "loss": 1.0582, + "step": 301 + }, + { + "epoch": 0.053774928774928774, + "grad_norm": 0.47317907214164734, + "learning_rate": 0.00019992210473560097, + "loss": 1.3114, + "step": 302 + }, + { + "epoch": 0.053952991452991456, + "grad_norm": 0.3706119656562805, + "learning_rate": 0.00019992155137976917, + "loss": 0.9554, + "step": 303 + }, + { + "epoch": 0.05413105413105413, + "grad_norm": 0.42809563875198364, + "learning_rate": 0.0001999209960661821, + "loss": 1.306, + "step": 304 + }, + { + "epoch": 0.054309116809116806, + "grad_norm": 0.4514487385749817, + "learning_rate": 0.00019992043879485066, + "loss": 1.0147, + "step": 305 + }, + { + "epoch": 0.05448717948717949, + "grad_norm": 0.36672836542129517, + "learning_rate": 0.0001999198795657858, + "loss": 1.1392, + "step": 306 + }, + { + "epoch": 0.05466524216524216, + "grad_norm": 0.4206554889678955, + "learning_rate": 0.00019991931837899847, + "loss": 1.2405, + "step": 307 + }, + { + "epoch": 0.054843304843304845, + "grad_norm": 0.46168261766433716, + "learning_rate": 0.00019991875523449966, + "loss": 1.2707, + "step": 308 + }, + { + "epoch": 0.05502136752136752, + "grad_norm": 0.39503365755081177, + "learning_rate": 
0.00019991819013230039, + "loss": 1.0776, + "step": 309 + }, + { + "epoch": 0.0551994301994302, + "grad_norm": 0.35244834423065186, + "learning_rate": 0.00019991762307241178, + "loss": 1.0864, + "step": 310 + }, + { + "epoch": 0.05537749287749288, + "grad_norm": 0.3865319490432739, + "learning_rate": 0.0001999170540548449, + "loss": 1.3659, + "step": 311 + }, + { + "epoch": 0.05555555555555555, + "grad_norm": 0.3666876554489136, + "learning_rate": 0.0001999164830796109, + "loss": 0.9884, + "step": 312 + }, + { + "epoch": 0.055733618233618235, + "grad_norm": 0.4278281629085541, + "learning_rate": 0.00019991591014672096, + "loss": 1.1522, + "step": 313 + }, + { + "epoch": 0.05591168091168091, + "grad_norm": 0.4172627031803131, + "learning_rate": 0.0001999153352561863, + "loss": 1.2527, + "step": 314 + }, + { + "epoch": 0.05608974358974359, + "grad_norm": 0.38872212171554565, + "learning_rate": 0.00019991475840801823, + "loss": 1.2985, + "step": 315 + }, + { + "epoch": 0.05626780626780627, + "grad_norm": 0.4160458445549011, + "learning_rate": 0.00019991417960222804, + "loss": 1.1347, + "step": 316 + }, + { + "epoch": 0.05644586894586895, + "grad_norm": 0.5169723033905029, + "learning_rate": 0.00019991359883882705, + "loss": 1.0819, + "step": 317 + }, + { + "epoch": 0.056623931623931624, + "grad_norm": 0.42306259274482727, + "learning_rate": 0.0001999130161178266, + "loss": 1.3139, + "step": 318 + }, + { + "epoch": 0.0568019943019943, + "grad_norm": 0.41975873708724976, + "learning_rate": 0.00019991243143923816, + "loss": 1.2277, + "step": 319 + }, + { + "epoch": 0.05698005698005698, + "grad_norm": 0.3873472511768341, + "learning_rate": 0.00019991184480307324, + "loss": 1.156, + "step": 320 + }, + { + "epoch": 0.057158119658119656, + "grad_norm": 0.43656104803085327, + "learning_rate": 0.0001999112562093432, + "loss": 1.2344, + "step": 321 + }, + { + "epoch": 0.05733618233618234, + "grad_norm": 0.3738791048526764, + "learning_rate": 0.00019991066565805968, + "loss": 
0.9573, + "step": 322 + }, + { + "epoch": 0.05751424501424501, + "grad_norm": 0.3838156461715698, + "learning_rate": 0.00019991007314923418, + "loss": 0.9274, + "step": 323 + }, + { + "epoch": 0.057692307692307696, + "grad_norm": 0.4564770758152008, + "learning_rate": 0.00019990947868287837, + "loss": 1.0756, + "step": 324 + }, + { + "epoch": 0.05787037037037037, + "grad_norm": 0.4560079872608185, + "learning_rate": 0.00019990888225900386, + "loss": 1.1508, + "step": 325 + }, + { + "epoch": 0.058048433048433046, + "grad_norm": 0.44356057047843933, + "learning_rate": 0.00019990828387762236, + "loss": 1.2323, + "step": 326 + }, + { + "epoch": 0.05822649572649573, + "grad_norm": 0.46390119194984436, + "learning_rate": 0.00019990768353874553, + "loss": 1.0031, + "step": 327 + }, + { + "epoch": 0.0584045584045584, + "grad_norm": 0.4502357244491577, + "learning_rate": 0.00019990708124238525, + "loss": 1.3454, + "step": 328 + }, + { + "epoch": 0.058582621082621085, + "grad_norm": 0.3979945182800293, + "learning_rate": 0.0001999064769885532, + "loss": 1.2833, + "step": 329 + }, + { + "epoch": 0.05876068376068376, + "grad_norm": 0.3899286687374115, + "learning_rate": 0.00019990587077726128, + "loss": 1.0175, + "step": 330 + }, + { + "epoch": 0.05893874643874644, + "grad_norm": 0.41422948241233826, + "learning_rate": 0.00019990526260852139, + "loss": 1.1151, + "step": 331 + }, + { + "epoch": 0.05911680911680912, + "grad_norm": 0.4266608953475952, + "learning_rate": 0.0001999046524823454, + "loss": 1.1119, + "step": 332 + }, + { + "epoch": 0.05929487179487179, + "grad_norm": 0.46563324332237244, + "learning_rate": 0.00019990404039874524, + "loss": 1.2358, + "step": 333 + }, + { + "epoch": 0.059472934472934474, + "grad_norm": 0.4404347240924835, + "learning_rate": 0.00019990342635773297, + "loss": 1.1748, + "step": 334 + }, + { + "epoch": 0.05965099715099715, + "grad_norm": 0.5133237838745117, + "learning_rate": 0.00019990281035932062, + "loss": 1.1649, + "step": 335 + }, + { 
+ "epoch": 0.05982905982905983, + "grad_norm": 0.3593895435333252, + "learning_rate": 0.00019990219240352018, + "loss": 1.0318, + "step": 336 + }, + { + "epoch": 0.06000712250712251, + "grad_norm": 0.40554583072662354, + "learning_rate": 0.00019990157249034384, + "loss": 1.1202, + "step": 337 + }, + { + "epoch": 0.06018518518518518, + "grad_norm": 0.3770706057548523, + "learning_rate": 0.00019990095061980372, + "loss": 0.9908, + "step": 338 + }, + { + "epoch": 0.060363247863247864, + "grad_norm": 0.39676955342292786, + "learning_rate": 0.000199900326791912, + "loss": 0.8176, + "step": 339 + }, + { + "epoch": 0.06054131054131054, + "grad_norm": 0.41448578238487244, + "learning_rate": 0.00019989970100668086, + "loss": 1.2877, + "step": 340 + }, + { + "epoch": 0.06071937321937322, + "grad_norm": 0.4200015068054199, + "learning_rate": 0.00019989907326412265, + "loss": 1.2293, + "step": 341 + }, + { + "epoch": 0.060897435897435896, + "grad_norm": 0.47350621223449707, + "learning_rate": 0.0001998984435642496, + "loss": 1.2331, + "step": 342 + }, + { + "epoch": 0.06107549857549858, + "grad_norm": 0.47050634026527405, + "learning_rate": 0.00019989781190707406, + "loss": 0.8888, + "step": 343 + }, + { + "epoch": 0.06125356125356125, + "grad_norm": 0.4994896948337555, + "learning_rate": 0.00019989717829260842, + "loss": 1.0921, + "step": 344 + }, + { + "epoch": 0.06143162393162393, + "grad_norm": 0.36340200901031494, + "learning_rate": 0.0001998965427208651, + "loss": 0.9777, + "step": 345 + }, + { + "epoch": 0.06160968660968661, + "grad_norm": 0.3538152873516083, + "learning_rate": 0.00019989590519185654, + "loss": 1.0055, + "step": 346 + }, + { + "epoch": 0.061787749287749286, + "grad_norm": 0.5388944149017334, + "learning_rate": 0.00019989526570559526, + "loss": 1.1001, + "step": 347 + }, + { + "epoch": 0.06196581196581197, + "grad_norm": 0.4411574602127075, + "learning_rate": 0.00019989462426209373, + "loss": 1.0038, + "step": 348 + }, + { + "epoch": 0.06214387464387464, 
+ "grad_norm": 0.3930876851081848, + "learning_rate": 0.00019989398086136455, + "loss": 1.1534, + "step": 349 + }, + { + "epoch": 0.062321937321937325, + "grad_norm": 0.47357070446014404, + "learning_rate": 0.00019989333550342033, + "loss": 1.2687, + "step": 350 + }, + { + "epoch": 0.0625, + "grad_norm": 0.40302303433418274, + "learning_rate": 0.00019989268818827372, + "loss": 1.1894, + "step": 351 + }, + { + "epoch": 0.06267806267806268, + "grad_norm": 0.4470510184764862, + "learning_rate": 0.00019989203891593738, + "loss": 1.2207, + "step": 352 + }, + { + "epoch": 0.06285612535612535, + "grad_norm": 0.42235100269317627, + "learning_rate": 0.00019989138768642406, + "loss": 1.2086, + "step": 353 + }, + { + "epoch": 0.06303418803418803, + "grad_norm": 0.38305309414863586, + "learning_rate": 0.0001998907344997465, + "loss": 1.0473, + "step": 354 + }, + { + "epoch": 0.06321225071225071, + "grad_norm": 0.3893027901649475, + "learning_rate": 0.0001998900793559175, + "loss": 1.1746, + "step": 355 + }, + { + "epoch": 0.0633903133903134, + "grad_norm": 0.41206735372543335, + "learning_rate": 0.0001998894222549499, + "loss": 1.188, + "step": 356 + }, + { + "epoch": 0.06356837606837606, + "grad_norm": 0.3700513243675232, + "learning_rate": 0.00019988876319685658, + "loss": 0.9862, + "step": 357 + }, + { + "epoch": 0.06374643874643875, + "grad_norm": 0.3708794116973877, + "learning_rate": 0.0001998881021816504, + "loss": 1.2003, + "step": 358 + }, + { + "epoch": 0.06392450142450143, + "grad_norm": 0.4058014154434204, + "learning_rate": 0.00019988743920934442, + "loss": 1.2311, + "step": 359 + }, + { + "epoch": 0.0641025641025641, + "grad_norm": 0.39134132862091064, + "learning_rate": 0.00019988677427995155, + "loss": 1.001, + "step": 360 + }, + { + "epoch": 0.06428062678062678, + "grad_norm": 0.3853437602519989, + "learning_rate": 0.00019988610739348484, + "loss": 1.0725, + "step": 361 + }, + { + "epoch": 0.06445868945868946, + "grad_norm": 0.47114330530166626, + 
"learning_rate": 0.00019988543854995735, + "loss": 1.2196, + "step": 362 + }, + { + "epoch": 0.06463675213675214, + "grad_norm": 0.40465688705444336, + "learning_rate": 0.00019988476774938216, + "loss": 1.1869, + "step": 363 + }, + { + "epoch": 0.06481481481481481, + "grad_norm": 0.40301886200904846, + "learning_rate": 0.00019988409499177245, + "loss": 1.1765, + "step": 364 + }, + { + "epoch": 0.0649928774928775, + "grad_norm": 0.43443185091018677, + "learning_rate": 0.0001998834202771414, + "loss": 1.2022, + "step": 365 + }, + { + "epoch": 0.06517094017094018, + "grad_norm": 0.4712986350059509, + "learning_rate": 0.00019988274360550217, + "loss": 1.156, + "step": 366 + }, + { + "epoch": 0.06534900284900284, + "grad_norm": 0.4524450898170471, + "learning_rate": 0.00019988206497686815, + "loss": 1.2917, + "step": 367 + }, + { + "epoch": 0.06552706552706553, + "grad_norm": 0.40302205085754395, + "learning_rate": 0.0001998813843912525, + "loss": 0.9993, + "step": 368 + }, + { + "epoch": 0.06570512820512821, + "grad_norm": 0.39435216784477234, + "learning_rate": 0.00019988070184866864, + "loss": 1.0914, + "step": 369 + }, + { + "epoch": 0.06588319088319089, + "grad_norm": 0.39267390966415405, + "learning_rate": 0.00019988001734912988, + "loss": 1.3138, + "step": 370 + }, + { + "epoch": 0.06606125356125356, + "grad_norm": 0.38351675868034363, + "learning_rate": 0.00019987933089264968, + "loss": 1.0997, + "step": 371 + }, + { + "epoch": 0.06623931623931624, + "grad_norm": 0.3294839859008789, + "learning_rate": 0.00019987864247924145, + "loss": 0.9656, + "step": 372 + }, + { + "epoch": 0.06641737891737892, + "grad_norm": 0.45333364605903625, + "learning_rate": 0.00019987795210891872, + "loss": 1.095, + "step": 373 + }, + { + "epoch": 0.06659544159544159, + "grad_norm": 0.4362282454967499, + "learning_rate": 0.00019987725978169501, + "loss": 1.2103, + "step": 374 + }, + { + "epoch": 0.06677350427350427, + "grad_norm": 0.41314780712127686, + "learning_rate": 
0.00019987656549758385, + "loss": 1.2115, + "step": 375 + }, + { + "epoch": 0.06695156695156695, + "grad_norm": 0.4230864644050598, + "learning_rate": 0.00019987586925659888, + "loss": 1.17, + "step": 376 + }, + { + "epoch": 0.06712962962962964, + "grad_norm": 0.4703855812549591, + "learning_rate": 0.00019987517105875372, + "loss": 1.367, + "step": 377 + }, + { + "epoch": 0.0673076923076923, + "grad_norm": 0.4671297073364258, + "learning_rate": 0.00019987447090406206, + "loss": 1.2543, + "step": 378 + }, + { + "epoch": 0.06748575498575499, + "grad_norm": 0.43746981024742126, + "learning_rate": 0.0001998737687925376, + "loss": 1.214, + "step": 379 + }, + { + "epoch": 0.06766381766381767, + "grad_norm": 0.40889596939086914, + "learning_rate": 0.00019987306472419412, + "loss": 1.0496, + "step": 380 + }, + { + "epoch": 0.06784188034188034, + "grad_norm": 0.3677358627319336, + "learning_rate": 0.0001998723586990454, + "loss": 1.1242, + "step": 381 + }, + { + "epoch": 0.06801994301994302, + "grad_norm": 0.3892628848552704, + "learning_rate": 0.00019987165071710527, + "loss": 1.0246, + "step": 382 + }, + { + "epoch": 0.0681980056980057, + "grad_norm": 0.4281293749809265, + "learning_rate": 0.00019987094077838764, + "loss": 1.2817, + "step": 383 + }, + { + "epoch": 0.06837606837606838, + "grad_norm": 0.45030340552330017, + "learning_rate": 0.00019987022888290636, + "loss": 1.159, + "step": 384 + }, + { + "epoch": 0.06855413105413105, + "grad_norm": 0.6327905058860779, + "learning_rate": 0.00019986951503067545, + "loss": 0.9577, + "step": 385 + }, + { + "epoch": 0.06873219373219373, + "grad_norm": 0.40339627861976624, + "learning_rate": 0.0001998687992217088, + "loss": 1.138, + "step": 386 + }, + { + "epoch": 0.06891025641025642, + "grad_norm": 0.4018291234970093, + "learning_rate": 0.00019986808145602052, + "loss": 0.9109, + "step": 387 + }, + { + "epoch": 0.06908831908831908, + "grad_norm": 0.41566264629364014, + "learning_rate": 0.00019986736173362464, + "loss": 1.1516, 
+ "step": 388 + }, + { + "epoch": 0.06926638176638177, + "grad_norm": 0.3569067418575287, + "learning_rate": 0.00019986664005453527, + "loss": 1.2329, + "step": 389 + }, + { + "epoch": 0.06944444444444445, + "grad_norm": 0.3959648907184601, + "learning_rate": 0.0001998659164187665, + "loss": 1.1041, + "step": 390 + }, + { + "epoch": 0.06962250712250712, + "grad_norm": 0.42853206396102905, + "learning_rate": 0.00019986519082633257, + "loss": 1.0859, + "step": 391 + }, + { + "epoch": 0.0698005698005698, + "grad_norm": 0.42005518078804016, + "learning_rate": 0.0001998644632772477, + "loss": 1.2017, + "step": 392 + }, + { + "epoch": 0.06997863247863248, + "grad_norm": 0.4296947419643402, + "learning_rate": 0.00019986373377152612, + "loss": 1.1464, + "step": 393 + }, + { + "epoch": 0.07015669515669516, + "grad_norm": 0.394747793674469, + "learning_rate": 0.0001998630023091821, + "loss": 1.0316, + "step": 394 + }, + { + "epoch": 0.07033475783475783, + "grad_norm": 0.3779357969760895, + "learning_rate": 0.00019986226889023002, + "loss": 1.1081, + "step": 395 + }, + { + "epoch": 0.07051282051282051, + "grad_norm": 0.4271804690361023, + "learning_rate": 0.00019986153351468424, + "loss": 0.985, + "step": 396 + }, + { + "epoch": 0.0706908831908832, + "grad_norm": 0.49412235617637634, + "learning_rate": 0.00019986079618255912, + "loss": 1.2606, + "step": 397 + }, + { + "epoch": 0.07086894586894586, + "grad_norm": 0.43657439947128296, + "learning_rate": 0.00019986005689386915, + "loss": 1.2266, + "step": 398 + }, + { + "epoch": 0.07104700854700854, + "grad_norm": 0.4060729444026947, + "learning_rate": 0.0001998593156486288, + "loss": 1.1787, + "step": 399 + }, + { + "epoch": 0.07122507122507123, + "grad_norm": 0.387046217918396, + "learning_rate": 0.00019985857244685264, + "loss": 0.9411, + "step": 400 + }, + { + "epoch": 0.07140313390313391, + "grad_norm": 0.4243999123573303, + "learning_rate": 0.00019985782728855516, + "loss": 1.2024, + "step": 401 + }, + { + "epoch": 
0.07158119658119658, + "grad_norm": 0.43113812804222107, + "learning_rate": 0.000199857080173751, + "loss": 1.1246, + "step": 402 + }, + { + "epoch": 0.07175925925925926, + "grad_norm": 0.4653271436691284, + "learning_rate": 0.0001998563311024548, + "loss": 1.2343, + "step": 403 + }, + { + "epoch": 0.07193732193732194, + "grad_norm": 0.43260812759399414, + "learning_rate": 0.0001998555800746812, + "loss": 0.9543, + "step": 404 + }, + { + "epoch": 0.07211538461538461, + "grad_norm": 0.4635484516620636, + "learning_rate": 0.00019985482709044495, + "loss": 1.1091, + "step": 405 + }, + { + "epoch": 0.07229344729344729, + "grad_norm": 0.38362643122673035, + "learning_rate": 0.00019985407214976076, + "loss": 1.2584, + "step": 406 + }, + { + "epoch": 0.07247150997150997, + "grad_norm": 0.4068310558795929, + "learning_rate": 0.00019985331525264351, + "loss": 1.1944, + "step": 407 + }, + { + "epoch": 0.07264957264957266, + "grad_norm": 0.43909943103790283, + "learning_rate": 0.00019985255639910795, + "loss": 1.3748, + "step": 408 + }, + { + "epoch": 0.07282763532763532, + "grad_norm": 0.48674601316452026, + "learning_rate": 0.000199851795589169, + "loss": 1.2684, + "step": 409 + }, + { + "epoch": 0.073005698005698, + "grad_norm": 0.4218580722808838, + "learning_rate": 0.0001998510328228415, + "loss": 1.168, + "step": 410 + }, + { + "epoch": 0.07318376068376069, + "grad_norm": 0.4688236117362976, + "learning_rate": 0.00019985026810014046, + "loss": 1.3088, + "step": 411 + }, + { + "epoch": 0.07336182336182336, + "grad_norm": 0.3863612711429596, + "learning_rate": 0.00019984950142108083, + "loss": 1.0261, + "step": 412 + }, + { + "epoch": 0.07353988603988604, + "grad_norm": 0.4177640378475189, + "learning_rate": 0.00019984873278567765, + "loss": 1.1985, + "step": 413 + }, + { + "epoch": 0.07371794871794872, + "grad_norm": 0.4645586311817169, + "learning_rate": 0.00019984796219394592, + "loss": 1.2463, + "step": 414 + }, + { + "epoch": 0.0738960113960114, + "grad_norm": 
0.5051766633987427, + "learning_rate": 0.00019984718964590083, + "loss": 1.3031, + "step": 415 + }, + { + "epoch": 0.07407407407407407, + "grad_norm": 0.4200040400028229, + "learning_rate": 0.0001998464151415575, + "loss": 1.0842, + "step": 416 + }, + { + "epoch": 0.07425213675213675, + "grad_norm": 0.34211036562919617, + "learning_rate": 0.000199845638680931, + "loss": 0.9659, + "step": 417 + }, + { + "epoch": 0.07443019943019943, + "grad_norm": 0.3553323447704315, + "learning_rate": 0.00019984486026403668, + "loss": 1.0102, + "step": 418 + }, + { + "epoch": 0.0746082621082621, + "grad_norm": 0.4967300295829773, + "learning_rate": 0.00019984407989088974, + "loss": 1.3125, + "step": 419 + }, + { + "epoch": 0.07478632478632478, + "grad_norm": 0.41649797558784485, + "learning_rate": 0.00019984329756150544, + "loss": 1.3092, + "step": 420 + }, + { + "epoch": 0.07496438746438747, + "grad_norm": 0.43825802206993103, + "learning_rate": 0.00019984251327589912, + "loss": 1.3678, + "step": 421 + }, + { + "epoch": 0.07514245014245015, + "grad_norm": 0.363394170999527, + "learning_rate": 0.00019984172703408617, + "loss": 1.305, + "step": 422 + }, + { + "epoch": 0.07532051282051282, + "grad_norm": 0.411563903093338, + "learning_rate": 0.000199840938836082, + "loss": 1.4248, + "step": 423 + }, + { + "epoch": 0.0754985754985755, + "grad_norm": 0.40548190474510193, + "learning_rate": 0.000199840148681902, + "loss": 1.1081, + "step": 424 + }, + { + "epoch": 0.07567663817663818, + "grad_norm": 0.3781099021434784, + "learning_rate": 0.00019983935657156171, + "loss": 1.185, + "step": 425 + }, + { + "epoch": 0.07585470085470085, + "grad_norm": 0.46597573161125183, + "learning_rate": 0.00019983856250507662, + "loss": 1.119, + "step": 426 + }, + { + "epoch": 0.07603276353276353, + "grad_norm": 0.3988197147846222, + "learning_rate": 0.00019983776648246232, + "loss": 1.206, + "step": 427 + }, + { + "epoch": 0.07621082621082621, + "grad_norm": 0.41210901737213135, + "learning_rate": 
0.00019983696850373433, + "loss": 1.1843, + "step": 428 + }, + { + "epoch": 0.0763888888888889, + "grad_norm": 0.41870948672294617, + "learning_rate": 0.00019983616856890837, + "loss": 1.2248, + "step": 429 + }, + { + "epoch": 0.07656695156695156, + "grad_norm": 0.4320056140422821, + "learning_rate": 0.00019983536667800007, + "loss": 0.9743, + "step": 430 + }, + { + "epoch": 0.07674501424501425, + "grad_norm": 0.48455503582954407, + "learning_rate": 0.00019983456283102517, + "loss": 1.0438, + "step": 431 + }, + { + "epoch": 0.07692307692307693, + "grad_norm": 0.38712427020072937, + "learning_rate": 0.00019983375702799935, + "loss": 1.2041, + "step": 432 + }, + { + "epoch": 0.0771011396011396, + "grad_norm": 0.3578857481479645, + "learning_rate": 0.0001998329492689385, + "loss": 1.1623, + "step": 433 + }, + { + "epoch": 0.07727920227920228, + "grad_norm": 0.43065932393074036, + "learning_rate": 0.00019983213955385834, + "loss": 1.3033, + "step": 434 + }, + { + "epoch": 0.07745726495726496, + "grad_norm": 0.4882095754146576, + "learning_rate": 0.00019983132788277484, + "loss": 1.1635, + "step": 435 + }, + { + "epoch": 0.07763532763532764, + "grad_norm": 0.3429015874862671, + "learning_rate": 0.00019983051425570382, + "loss": 0.7289, + "step": 436 + }, + { + "epoch": 0.07781339031339031, + "grad_norm": 0.4320310056209564, + "learning_rate": 0.00019982969867266128, + "loss": 1.3685, + "step": 437 + }, + { + "epoch": 0.07799145299145299, + "grad_norm": 0.39891982078552246, + "learning_rate": 0.00019982888113366314, + "loss": 1.0444, + "step": 438 + }, + { + "epoch": 0.07816951566951567, + "grad_norm": 0.3675695061683655, + "learning_rate": 0.00019982806163872547, + "loss": 1.0527, + "step": 439 + }, + { + "epoch": 0.07834757834757834, + "grad_norm": 0.42824694514274597, + "learning_rate": 0.0001998272401878643, + "loss": 1.166, + "step": 440 + }, + { + "epoch": 0.07852564102564102, + "grad_norm": 0.3721694350242615, + "learning_rate": 0.00019982641678109575, + "loss": 
1.1328, + "step": 441 + }, + { + "epoch": 0.0787037037037037, + "grad_norm": 0.33899208903312683, + "learning_rate": 0.00019982559141843592, + "loss": 1.016, + "step": 442 + }, + { + "epoch": 0.07888176638176639, + "grad_norm": 0.4029340147972107, + "learning_rate": 0.000199824764099901, + "loss": 1.0076, + "step": 443 + }, + { + "epoch": 0.07905982905982906, + "grad_norm": 0.4169132113456726, + "learning_rate": 0.0001998239348255072, + "loss": 1.208, + "step": 444 + }, + { + "epoch": 0.07923789173789174, + "grad_norm": 0.3865824043750763, + "learning_rate": 0.00019982310359527075, + "loss": 1.067, + "step": 445 + }, + { + "epoch": 0.07941595441595442, + "grad_norm": 0.4218919277191162, + "learning_rate": 0.00019982227040920796, + "loss": 1.195, + "step": 446 + }, + { + "epoch": 0.07959401709401709, + "grad_norm": 0.40504586696624756, + "learning_rate": 0.00019982143526733512, + "loss": 1.0188, + "step": 447 + }, + { + "epoch": 0.07977207977207977, + "grad_norm": 0.38330578804016113, + "learning_rate": 0.00019982059816966863, + "loss": 1.0484, + "step": 448 + }, + { + "epoch": 0.07995014245014245, + "grad_norm": 0.43731689453125, + "learning_rate": 0.00019981975911622488, + "loss": 1.074, + "step": 449 + }, + { + "epoch": 0.08012820512820513, + "grad_norm": 0.40858447551727295, + "learning_rate": 0.00019981891810702033, + "loss": 1.0008, + "step": 450 + }, + { + "epoch": 0.0803062678062678, + "grad_norm": 0.4031754732131958, + "learning_rate": 0.00019981807514207143, + "loss": 1.2179, + "step": 451 + }, + { + "epoch": 0.08048433048433049, + "grad_norm": 0.41920867562294006, + "learning_rate": 0.00019981723022139466, + "loss": 1.1406, + "step": 452 + }, + { + "epoch": 0.08066239316239317, + "grad_norm": 0.40305474400520325, + "learning_rate": 0.00019981638334500668, + "loss": 1.098, + "step": 453 + }, + { + "epoch": 0.08084045584045584, + "grad_norm": 0.4564182460308075, + "learning_rate": 0.00019981553451292396, + "loss": 1.419, + "step": 454 + }, + { + "epoch": 
0.08101851851851852, + "grad_norm": 0.3832945227622986, + "learning_rate": 0.00019981468372516322, + "loss": 1.0919, + "step": 455 + }, + { + "epoch": 0.0811965811965812, + "grad_norm": 0.43062624335289, + "learning_rate": 0.0001998138309817411, + "loss": 1.0458, + "step": 456 + }, + { + "epoch": 0.08137464387464387, + "grad_norm": 0.3871173560619354, + "learning_rate": 0.0001998129762826743, + "loss": 1.1391, + "step": 457 + }, + { + "epoch": 0.08155270655270655, + "grad_norm": 0.43423157930374146, + "learning_rate": 0.0001998121196279796, + "loss": 1.1132, + "step": 458 + }, + { + "epoch": 0.08173076923076923, + "grad_norm": 0.4341012239456177, + "learning_rate": 0.00019981126101767372, + "loss": 1.113, + "step": 459 + }, + { + "epoch": 0.08190883190883191, + "grad_norm": 0.36748576164245605, + "learning_rate": 0.00019981040045177352, + "loss": 0.8108, + "step": 460 + }, + { + "epoch": 0.08208689458689458, + "grad_norm": 0.43133220076560974, + "learning_rate": 0.00019980953793029586, + "loss": 1.1861, + "step": 461 + }, + { + "epoch": 0.08226495726495726, + "grad_norm": 0.37204909324645996, + "learning_rate": 0.00019980867345325767, + "loss": 0.9222, + "step": 462 + }, + { + "epoch": 0.08244301994301995, + "grad_norm": 0.43370047211647034, + "learning_rate": 0.00019980780702067582, + "loss": 1.2984, + "step": 463 + }, + { + "epoch": 0.08262108262108261, + "grad_norm": 0.4991510808467865, + "learning_rate": 0.00019980693863256736, + "loss": 1.2222, + "step": 464 + }, + { + "epoch": 0.0827991452991453, + "grad_norm": 0.44318175315856934, + "learning_rate": 0.00019980606828894927, + "loss": 1.2262, + "step": 465 + }, + { + "epoch": 0.08297720797720798, + "grad_norm": 0.380231648683548, + "learning_rate": 0.0001998051959898386, + "loss": 1.0274, + "step": 466 + }, + { + "epoch": 0.08315527065527066, + "grad_norm": 0.39519667625427246, + "learning_rate": 0.0001998043217352524, + "loss": 1.2499, + "step": 467 + }, + { + "epoch": 0.08333333333333333, + "grad_norm": 
0.457499235868454, + "learning_rate": 0.0001998034455252079, + "loss": 1.0751, + "step": 468 + }, + { + "epoch": 0.08351139601139601, + "grad_norm": 0.368522584438324, + "learning_rate": 0.00019980256735972215, + "loss": 1.0776, + "step": 469 + }, + { + "epoch": 0.08368945868945869, + "grad_norm": 0.3768427073955536, + "learning_rate": 0.00019980168723881243, + "loss": 1.2198, + "step": 470 + }, + { + "epoch": 0.08386752136752136, + "grad_norm": 0.37045565247535706, + "learning_rate": 0.000199800805162496, + "loss": 1.1816, + "step": 471 + }, + { + "epoch": 0.08404558404558404, + "grad_norm": 0.4219281077384949, + "learning_rate": 0.0001997999211307901, + "loss": 1.0515, + "step": 472 + }, + { + "epoch": 0.08422364672364673, + "grad_norm": 0.3815271258354187, + "learning_rate": 0.00019979903514371207, + "loss": 1.1709, + "step": 473 + }, + { + "epoch": 0.08440170940170941, + "grad_norm": 0.4566493630409241, + "learning_rate": 0.00019979814720127924, + "loss": 1.3063, + "step": 474 + }, + { + "epoch": 0.08457977207977208, + "grad_norm": 0.4043879806995392, + "learning_rate": 0.000199797257303509, + "loss": 1.0549, + "step": 475 + }, + { + "epoch": 0.08475783475783476, + "grad_norm": 0.3897830545902252, + "learning_rate": 0.00019979636545041886, + "loss": 1.1483, + "step": 476 + }, + { + "epoch": 0.08493589743589744, + "grad_norm": 0.36097025871276855, + "learning_rate": 0.00019979547164202622, + "loss": 1.1196, + "step": 477 + }, + { + "epoch": 0.08511396011396011, + "grad_norm": 0.3766986131668091, + "learning_rate": 0.00019979457587834863, + "loss": 1.0131, + "step": 478 + }, + { + "epoch": 0.08529202279202279, + "grad_norm": 0.39460286498069763, + "learning_rate": 0.00019979367815940364, + "loss": 1.1729, + "step": 479 + }, + { + "epoch": 0.08547008547008547, + "grad_norm": 0.4137469232082367, + "learning_rate": 0.00019979277848520885, + "loss": 1.2569, + "step": 480 + }, + { + "epoch": 0.08564814814814815, + "grad_norm": 0.464688777923584, + "learning_rate": 
0.00019979187685578183, + "loss": 1.2064, + "step": 481 + }, + { + "epoch": 0.08582621082621082, + "grad_norm": 0.4245518147945404, + "learning_rate": 0.0001997909732711403, + "loss": 0.9812, + "step": 482 + }, + { + "epoch": 0.0860042735042735, + "grad_norm": 0.43368837237358093, + "learning_rate": 0.00019979006773130197, + "loss": 1.2822, + "step": 483 + }, + { + "epoch": 0.08618233618233619, + "grad_norm": 0.4232824444770813, + "learning_rate": 0.00019978916023628452, + "loss": 1.1446, + "step": 484 + }, + { + "epoch": 0.08636039886039885, + "grad_norm": 0.4183506369590759, + "learning_rate": 0.00019978825078610578, + "loss": 1.2605, + "step": 485 + }, + { + "epoch": 0.08653846153846154, + "grad_norm": 0.4391268491744995, + "learning_rate": 0.00019978733938078356, + "loss": 1.2165, + "step": 486 + }, + { + "epoch": 0.08671652421652422, + "grad_norm": 0.4139612317085266, + "learning_rate": 0.0001997864260203357, + "loss": 0.9389, + "step": 487 + }, + { + "epoch": 0.0868945868945869, + "grad_norm": 0.4058656096458435, + "learning_rate": 0.00019978551070478013, + "loss": 1.0652, + "step": 488 + }, + { + "epoch": 0.08707264957264957, + "grad_norm": 0.42333099246025085, + "learning_rate": 0.00019978459343413473, + "loss": 1.119, + "step": 489 + }, + { + "epoch": 0.08725071225071225, + "grad_norm": 0.4573031961917877, + "learning_rate": 0.00019978367420841754, + "loss": 1.1546, + "step": 490 + }, + { + "epoch": 0.08742877492877493, + "grad_norm": 0.4161617159843445, + "learning_rate": 0.00019978275302764655, + "loss": 1.0836, + "step": 491 + }, + { + "epoch": 0.0876068376068376, + "grad_norm": 0.422145277261734, + "learning_rate": 0.00019978182989183977, + "loss": 1.1908, + "step": 492 + }, + { + "epoch": 0.08778490028490028, + "grad_norm": 0.4588126838207245, + "learning_rate": 0.00019978090480101532, + "loss": 1.1758, + "step": 493 + }, + { + "epoch": 0.08796296296296297, + "grad_norm": 0.4425722062587738, + "learning_rate": 0.00019977997775519132, + "loss": 1.088, 
+ "step": 494 + }, + { + "epoch": 0.08814102564102565, + "grad_norm": 0.37860307097435, + "learning_rate": 0.00019977904875438594, + "loss": 1.1532, + "step": 495 + }, + { + "epoch": 0.08831908831908832, + "grad_norm": 0.40435823798179626, + "learning_rate": 0.00019977811779861733, + "loss": 1.1271, + "step": 496 + }, + { + "epoch": 0.088497150997151, + "grad_norm": 0.42578884959220886, + "learning_rate": 0.0001997771848879038, + "loss": 0.9889, + "step": 497 + }, + { + "epoch": 0.08867521367521368, + "grad_norm": 0.3439478874206543, + "learning_rate": 0.00019977625002226361, + "loss": 1.1273, + "step": 498 + }, + { + "epoch": 0.08885327635327635, + "grad_norm": 0.362341970205307, + "learning_rate": 0.00019977531320171504, + "loss": 1.0214, + "step": 499 + }, + { + "epoch": 0.08903133903133903, + "grad_norm": 0.4305768609046936, + "learning_rate": 0.0001997743744262765, + "loss": 1.2648, + "step": 500 + }, + { + "epoch": 0.08920940170940171, + "grad_norm": 0.35900023579597473, + "learning_rate": 0.00019977343369596636, + "loss": 1.0274, + "step": 501 + }, + { + "epoch": 0.0893874643874644, + "grad_norm": 0.4950818717479706, + "learning_rate": 0.00019977249101080306, + "loss": 1.1483, + "step": 502 + }, + { + "epoch": 0.08956552706552706, + "grad_norm": 0.3800346553325653, + "learning_rate": 0.00019977154637080503, + "loss": 1.0636, + "step": 503 + }, + { + "epoch": 0.08974358974358974, + "grad_norm": 0.46202352643013, + "learning_rate": 0.0001997705997759908, + "loss": 1.1544, + "step": 504 + }, + { + "epoch": 0.08992165242165243, + "grad_norm": 0.36818403005599976, + "learning_rate": 0.00019976965122637895, + "loss": 0.9824, + "step": 505 + }, + { + "epoch": 0.0900997150997151, + "grad_norm": 0.40248095989227295, + "learning_rate": 0.00019976870072198805, + "loss": 1.1002, + "step": 506 + }, + { + "epoch": 0.09027777777777778, + "grad_norm": 0.3841850459575653, + "learning_rate": 0.00019976774826283667, + "loss": 1.2433, + "step": 507 + }, + { + "epoch": 
0.09045584045584046, + "grad_norm": 0.46892330050468445, + "learning_rate": 0.0001997667938489435, + "loss": 1.3194, + "step": 508 + }, + { + "epoch": 0.09063390313390314, + "grad_norm": 0.39059561491012573, + "learning_rate": 0.0001997658374803273, + "loss": 1.1778, + "step": 509 + }, + { + "epoch": 0.09081196581196581, + "grad_norm": 0.3793235421180725, + "learning_rate": 0.00019976487915700672, + "loss": 1.0659, + "step": 510 + }, + { + "epoch": 0.09099002849002849, + "grad_norm": 0.39067742228507996, + "learning_rate": 0.00019976391887900058, + "loss": 1.107, + "step": 511 + }, + { + "epoch": 0.09116809116809117, + "grad_norm": 0.40121713280677795, + "learning_rate": 0.00019976295664632772, + "loss": 1.102, + "step": 512 + }, + { + "epoch": 0.09134615384615384, + "grad_norm": 0.49830010533332825, + "learning_rate": 0.00019976199245900697, + "loss": 1.1701, + "step": 513 + }, + { + "epoch": 0.09152421652421652, + "grad_norm": 0.4536968171596527, + "learning_rate": 0.0001997610263170572, + "loss": 1.1067, + "step": 514 + }, + { + "epoch": 0.0917022792022792, + "grad_norm": 0.3832971453666687, + "learning_rate": 0.00019976005822049735, + "loss": 1.0991, + "step": 515 + }, + { + "epoch": 0.09188034188034189, + "grad_norm": 0.4093509614467621, + "learning_rate": 0.0001997590881693464, + "loss": 1.0565, + "step": 516 + }, + { + "epoch": 0.09205840455840456, + "grad_norm": 0.46073687076568604, + "learning_rate": 0.0001997581161636233, + "loss": 1.0057, + "step": 517 + }, + { + "epoch": 0.09223646723646724, + "grad_norm": 0.5001922845840454, + "learning_rate": 0.0001997571422033472, + "loss": 1.2639, + "step": 518 + }, + { + "epoch": 0.09241452991452992, + "grad_norm": 0.4620618224143982, + "learning_rate": 0.00019975616628853713, + "loss": 1.0966, + "step": 519 + }, + { + "epoch": 0.09259259259259259, + "grad_norm": 0.3788183927536011, + "learning_rate": 0.0001997551884192122, + "loss": 0.9783, + "step": 520 + }, + { + "epoch": 0.09277065527065527, + "grad_norm": 
0.45589539408683777, + "learning_rate": 0.00019975420859539154, + "loss": 1.2194, + "step": 521 + }, + { + "epoch": 0.09294871794871795, + "grad_norm": 0.40747523307800293, + "learning_rate": 0.00019975322681709443, + "loss": 1.0349, + "step": 522 + }, + { + "epoch": 0.09312678062678063, + "grad_norm": 0.5045142769813538, + "learning_rate": 0.00019975224308434002, + "loss": 1.1373, + "step": 523 + }, + { + "epoch": 0.0933048433048433, + "grad_norm": 0.40352702140808105, + "learning_rate": 0.00019975125739714767, + "loss": 1.1236, + "step": 524 + }, + { + "epoch": 0.09348290598290598, + "grad_norm": 0.4301735758781433, + "learning_rate": 0.0001997502697555366, + "loss": 1.2932, + "step": 525 + }, + { + "epoch": 0.09366096866096867, + "grad_norm": 0.36800238490104675, + "learning_rate": 0.00019974928015952624, + "loss": 1.0734, + "step": 526 + }, + { + "epoch": 0.09383903133903133, + "grad_norm": 0.4027230143547058, + "learning_rate": 0.00019974828860913594, + "loss": 1.2776, + "step": 527 + }, + { + "epoch": 0.09401709401709402, + "grad_norm": 0.42497140169143677, + "learning_rate": 0.0001997472951043851, + "loss": 1.248, + "step": 528 + }, + { + "epoch": 0.0941951566951567, + "grad_norm": 0.3888593018054962, + "learning_rate": 0.00019974629964529325, + "loss": 1.0231, + "step": 529 + }, + { + "epoch": 0.09437321937321937, + "grad_norm": 0.3761361241340637, + "learning_rate": 0.00019974530223187986, + "loss": 1.0216, + "step": 530 + }, + { + "epoch": 0.09455128205128205, + "grad_norm": 0.42192980647087097, + "learning_rate": 0.00019974430286416448, + "loss": 1.0731, + "step": 531 + }, + { + "epoch": 0.09472934472934473, + "grad_norm": 0.44244512915611267, + "learning_rate": 0.00019974330154216667, + "loss": 1.2793, + "step": 532 + }, + { + "epoch": 0.09490740740740741, + "grad_norm": 0.378252774477005, + "learning_rate": 0.0001997422982659061, + "loss": 1.0462, + "step": 533 + }, + { + "epoch": 0.09508547008547008, + "grad_norm": 0.45589110255241394, + 
"learning_rate": 0.00019974129303540236, + "loss": 1.1884, + "step": 534 + }, + { + "epoch": 0.09526353276353276, + "grad_norm": 0.33930808305740356, + "learning_rate": 0.0001997402858506752, + "loss": 0.8381, + "step": 535 + }, + { + "epoch": 0.09544159544159544, + "grad_norm": 0.45408427715301514, + "learning_rate": 0.0001997392767117443, + "loss": 1.2379, + "step": 536 + }, + { + "epoch": 0.09561965811965811, + "grad_norm": 0.44125741720199585, + "learning_rate": 0.0001997382656186295, + "loss": 1.1941, + "step": 537 + }, + { + "epoch": 0.0957977207977208, + "grad_norm": 0.4075697660446167, + "learning_rate": 0.00019973725257135054, + "loss": 1.0142, + "step": 538 + }, + { + "epoch": 0.09597578347578348, + "grad_norm": 0.4258415102958679, + "learning_rate": 0.00019973623756992733, + "loss": 1.0447, + "step": 539 + }, + { + "epoch": 0.09615384615384616, + "grad_norm": 0.2738485038280487, + "learning_rate": 0.0001997352206143797, + "loss": 0.5521, + "step": 540 + }, + { + "epoch": 0.09633190883190883, + "grad_norm": 0.38815587759017944, + "learning_rate": 0.00019973420170472762, + "loss": 1.1052, + "step": 541 + }, + { + "epoch": 0.09650997150997151, + "grad_norm": 0.3909834027290344, + "learning_rate": 0.00019973318084099106, + "loss": 1.0494, + "step": 542 + }, + { + "epoch": 0.09668803418803419, + "grad_norm": 0.4517597258090973, + "learning_rate": 0.00019973215802318996, + "loss": 1.0611, + "step": 543 + }, + { + "epoch": 0.09686609686609686, + "grad_norm": 0.48659002780914307, + "learning_rate": 0.00019973113325134442, + "loss": 0.9967, + "step": 544 + }, + { + "epoch": 0.09704415954415954, + "grad_norm": 0.4039791524410248, + "learning_rate": 0.0001997301065254745, + "loss": 1.251, + "step": 545 + }, + { + "epoch": 0.09722222222222222, + "grad_norm": 0.3985383212566376, + "learning_rate": 0.0001997290778456003, + "loss": 1.2263, + "step": 546 + }, + { + "epoch": 0.0974002849002849, + "grad_norm": 0.4540637731552124, + "learning_rate": 0.00019972804721174199, 
+ "loss": 1.2084, + "step": 547 + }, + { + "epoch": 0.09757834757834757, + "grad_norm": 0.36867982149124146, + "learning_rate": 0.00019972701462391977, + "loss": 0.9704, + "step": 548 + }, + { + "epoch": 0.09775641025641026, + "grad_norm": 0.40199780464172363, + "learning_rate": 0.00019972598008215385, + "loss": 1.1121, + "step": 549 + }, + { + "epoch": 0.09793447293447294, + "grad_norm": 0.42728984355926514, + "learning_rate": 0.00019972494358646455, + "loss": 1.1606, + "step": 550 + }, + { + "epoch": 0.0981125356125356, + "grad_norm": 0.4212374687194824, + "learning_rate": 0.0001997239051368721, + "loss": 1.3093, + "step": 551 + }, + { + "epoch": 0.09829059829059829, + "grad_norm": 0.3972226083278656, + "learning_rate": 0.0001997228647333969, + "loss": 1.1218, + "step": 552 + }, + { + "epoch": 0.09846866096866097, + "grad_norm": 0.43649932742118835, + "learning_rate": 0.00019972182237605935, + "loss": 1.2532, + "step": 553 + }, + { + "epoch": 0.09864672364672365, + "grad_norm": 0.3812280595302582, + "learning_rate": 0.0001997207780648798, + "loss": 1.0409, + "step": 554 + }, + { + "epoch": 0.09882478632478632, + "grad_norm": 0.41684821248054504, + "learning_rate": 0.00019971973179987878, + "loss": 0.9569, + "step": 555 + }, + { + "epoch": 0.099002849002849, + "grad_norm": 0.38081470131874084, + "learning_rate": 0.00019971868358107674, + "loss": 1.1615, + "step": 556 + }, + { + "epoch": 0.09918091168091168, + "grad_norm": 0.3702073097229004, + "learning_rate": 0.0001997176334084943, + "loss": 1.3907, + "step": 557 + }, + { + "epoch": 0.09935897435897435, + "grad_norm": 0.3625728189945221, + "learning_rate": 0.00019971658128215193, + "loss": 1.1897, + "step": 558 + }, + { + "epoch": 0.09953703703703703, + "grad_norm": 0.3815405070781708, + "learning_rate": 0.0001997155272020703, + "loss": 1.1473, + "step": 559 + }, + { + "epoch": 0.09971509971509972, + "grad_norm": 0.48664286732673645, + "learning_rate": 0.00019971447116827004, + "loss": 1.2462, + "step": 560 + }, 
+ { + "epoch": 0.0998931623931624, + "grad_norm": 0.3708696663379669, + "learning_rate": 0.0001997134131807719, + "loss": 1.0979, + "step": 561 + }, + { + "epoch": 0.10007122507122507, + "grad_norm": 0.44511324167251587, + "learning_rate": 0.00019971235323959654, + "loss": 1.2313, + "step": 562 + }, + { + "epoch": 0.10024928774928775, + "grad_norm": 0.3687448799610138, + "learning_rate": 0.00019971129134476473, + "loss": 1.1526, + "step": 563 + }, + { + "epoch": 0.10042735042735043, + "grad_norm": 0.4506866931915283, + "learning_rate": 0.00019971022749629735, + "loss": 1.0003, + "step": 564 + }, + { + "epoch": 0.1006054131054131, + "grad_norm": 0.41910406947135925, + "learning_rate": 0.00019970916169421515, + "loss": 1.013, + "step": 565 + }, + { + "epoch": 0.10078347578347578, + "grad_norm": 0.39728936553001404, + "learning_rate": 0.0001997080939385391, + "loss": 1.0501, + "step": 566 + }, + { + "epoch": 0.10096153846153846, + "grad_norm": 0.41415902972221375, + "learning_rate": 0.00019970702422929005, + "loss": 1.0791, + "step": 567 + }, + { + "epoch": 0.10113960113960115, + "grad_norm": 0.45630788803100586, + "learning_rate": 0.00019970595256648896, + "loss": 1.2884, + "step": 568 + }, + { + "epoch": 0.10131766381766381, + "grad_norm": 0.4371698796749115, + "learning_rate": 0.00019970487895015686, + "loss": 1.0684, + "step": 569 + }, + { + "epoch": 0.1014957264957265, + "grad_norm": 0.4350591003894806, + "learning_rate": 0.00019970380338031477, + "loss": 1.2415, + "step": 570 + }, + { + "epoch": 0.10167378917378918, + "grad_norm": 0.4232708215713501, + "learning_rate": 0.00019970272585698382, + "loss": 1.2656, + "step": 571 + }, + { + "epoch": 0.10185185185185185, + "grad_norm": 0.3917689919471741, + "learning_rate": 0.00019970164638018502, + "loss": 1.0178, + "step": 572 + }, + { + "epoch": 0.10202991452991453, + "grad_norm": 0.4262804388999939, + "learning_rate": 0.0001997005649499396, + "loss": 1.1805, + "step": 573 + }, + { + "epoch": 0.10220797720797721, + 
"grad_norm": 0.5217884182929993, + "learning_rate": 0.0001996994815662687, + "loss": 1.2392, + "step": 574 + }, + { + "epoch": 0.10238603988603989, + "grad_norm": 0.4273875057697296, + "learning_rate": 0.00019969839622919358, + "loss": 1.0844, + "step": 575 + }, + { + "epoch": 0.10256410256410256, + "grad_norm": 0.41588085889816284, + "learning_rate": 0.00019969730893873547, + "loss": 1.2437, + "step": 576 + }, + { + "epoch": 0.10274216524216524, + "grad_norm": 0.41617709398269653, + "learning_rate": 0.0001996962196949157, + "loss": 0.9519, + "step": 577 + }, + { + "epoch": 0.10292022792022792, + "grad_norm": 0.4832979142665863, + "learning_rate": 0.00019969512849775565, + "loss": 1.1889, + "step": 578 + }, + { + "epoch": 0.10309829059829059, + "grad_norm": 0.3936060965061188, + "learning_rate": 0.0001996940353472766, + "loss": 0.9888, + "step": 579 + }, + { + "epoch": 0.10327635327635327, + "grad_norm": 0.4147680997848511, + "learning_rate": 0.00019969294024350004, + "loss": 1.0733, + "step": 580 + }, + { + "epoch": 0.10345441595441596, + "grad_norm": 0.37791356444358826, + "learning_rate": 0.00019969184318644742, + "loss": 1.212, + "step": 581 + }, + { + "epoch": 0.10363247863247864, + "grad_norm": 0.44297221302986145, + "learning_rate": 0.00019969074417614023, + "loss": 1.0535, + "step": 582 + }, + { + "epoch": 0.10381054131054131, + "grad_norm": 0.4032835066318512, + "learning_rate": 0.0001996896432126, + "loss": 1.1869, + "step": 583 + }, + { + "epoch": 0.10398860398860399, + "grad_norm": 0.49271953105926514, + "learning_rate": 0.00019968854029584827, + "loss": 1.1661, + "step": 584 + }, + { + "epoch": 0.10416666666666667, + "grad_norm": 0.362699031829834, + "learning_rate": 0.0001996874354259067, + "loss": 0.868, + "step": 585 + }, + { + "epoch": 0.10434472934472934, + "grad_norm": 0.401795357465744, + "learning_rate": 0.0001996863286027969, + "loss": 1.1045, + "step": 586 + }, + { + "epoch": 0.10452279202279202, + "grad_norm": 0.45380479097366333, + 
"learning_rate": 0.00019968521982654058, + "loss": 0.8503, + "step": 587 + }, + { + "epoch": 0.1047008547008547, + "grad_norm": 0.49759066104888916, + "learning_rate": 0.00019968410909715947, + "loss": 1.4073, + "step": 588 + }, + { + "epoch": 0.10487891737891739, + "grad_norm": 0.4421198070049286, + "learning_rate": 0.0001996829964146753, + "loss": 1.1512, + "step": 589 + }, + { + "epoch": 0.10505698005698005, + "grad_norm": 0.46675658226013184, + "learning_rate": 0.00019968188177910988, + "loss": 1.0132, + "step": 590 + }, + { + "epoch": 0.10523504273504274, + "grad_norm": 0.5710657238960266, + "learning_rate": 0.00019968076519048507, + "loss": 1.267, + "step": 591 + }, + { + "epoch": 0.10541310541310542, + "grad_norm": 0.4655563235282898, + "learning_rate": 0.00019967964664882276, + "loss": 1.1204, + "step": 592 + }, + { + "epoch": 0.10559116809116809, + "grad_norm": 0.3895256519317627, + "learning_rate": 0.00019967852615414478, + "loss": 1.0814, + "step": 593 + }, + { + "epoch": 0.10576923076923077, + "grad_norm": 0.424216091632843, + "learning_rate": 0.00019967740370647322, + "loss": 1.1663, + "step": 594 + }, + { + "epoch": 0.10594729344729345, + "grad_norm": 0.3978985846042633, + "learning_rate": 0.00019967627930582996, + "loss": 0.909, + "step": 595 + }, + { + "epoch": 0.10612535612535613, + "grad_norm": 0.47064995765686035, + "learning_rate": 0.00019967515295223705, + "loss": 1.2351, + "step": 596 + }, + { + "epoch": 0.1063034188034188, + "grad_norm": 0.42449644207954407, + "learning_rate": 0.0001996740246457166, + "loss": 0.9739, + "step": 597 + }, + { + "epoch": 0.10648148148148148, + "grad_norm": 0.39033401012420654, + "learning_rate": 0.00019967289438629066, + "loss": 1.0933, + "step": 598 + }, + { + "epoch": 0.10665954415954416, + "grad_norm": 0.4398612678050995, + "learning_rate": 0.00019967176217398143, + "loss": 1.2479, + "step": 599 + }, + { + "epoch": 0.10683760683760683, + "grad_norm": 0.3946632742881775, + "learning_rate": 
0.00019967062800881107, + "loss": 1.0417, + "step": 600 + }, + { + "epoch": 0.10701566951566951, + "grad_norm": 0.5083445906639099, + "learning_rate": 0.0001996694918908018, + "loss": 1.1109, + "step": 601 + }, + { + "epoch": 0.1071937321937322, + "grad_norm": 0.477724552154541, + "learning_rate": 0.00019966835381997585, + "loss": 1.2891, + "step": 602 + }, + { + "epoch": 0.10737179487179487, + "grad_norm": 0.4110167920589447, + "learning_rate": 0.0001996672137963556, + "loss": 1.0555, + "step": 603 + }, + { + "epoch": 0.10754985754985755, + "grad_norm": 0.44078320264816284, + "learning_rate": 0.00019966607181996334, + "loss": 0.9188, + "step": 604 + }, + { + "epoch": 0.10772792022792023, + "grad_norm": 0.41251105070114136, + "learning_rate": 0.00019966492789082142, + "loss": 1.2592, + "step": 605 + }, + { + "epoch": 0.10790598290598291, + "grad_norm": 0.37701505422592163, + "learning_rate": 0.00019966378200895227, + "loss": 1.0233, + "step": 606 + }, + { + "epoch": 0.10808404558404558, + "grad_norm": 0.44624966382980347, + "learning_rate": 0.00019966263417437835, + "loss": 1.2273, + "step": 607 + }, + { + "epoch": 0.10826210826210826, + "grad_norm": 0.3618549108505249, + "learning_rate": 0.00019966148438712214, + "loss": 0.9101, + "step": 608 + }, + { + "epoch": 0.10844017094017094, + "grad_norm": 0.384574294090271, + "learning_rate": 0.00019966033264720616, + "loss": 1.1769, + "step": 609 + }, + { + "epoch": 0.10861823361823361, + "grad_norm": 0.50872403383255, + "learning_rate": 0.000199659178954653, + "loss": 1.1213, + "step": 610 + }, + { + "epoch": 0.1087962962962963, + "grad_norm": 0.39736685156822205, + "learning_rate": 0.00019965802330948527, + "loss": 1.275, + "step": 611 + }, + { + "epoch": 0.10897435897435898, + "grad_norm": 0.484660267829895, + "learning_rate": 0.00019965686571172557, + "loss": 1.1671, + "step": 612 + }, + { + "epoch": 0.10915242165242166, + "grad_norm": 0.41420218348503113, + "learning_rate": 0.0001996557061613966, + "loss": 0.9541, + 
"step": 613 + }, + { + "epoch": 0.10933048433048433, + "grad_norm": 0.4057196080684662, + "learning_rate": 0.00019965454465852112, + "loss": 1.0145, + "step": 614 + }, + { + "epoch": 0.10950854700854701, + "grad_norm": 0.4559510052204132, + "learning_rate": 0.00019965338120312182, + "loss": 1.0889, + "step": 615 + }, + { + "epoch": 0.10968660968660969, + "grad_norm": 0.40960055589675903, + "learning_rate": 0.00019965221579522154, + "loss": 1.1447, + "step": 616 + }, + { + "epoch": 0.10986467236467236, + "grad_norm": 0.4701732099056244, + "learning_rate": 0.0001996510484348431, + "loss": 1.2871, + "step": 617 + }, + { + "epoch": 0.11004273504273504, + "grad_norm": 0.38420796394348145, + "learning_rate": 0.0001996498791220094, + "loss": 1.058, + "step": 618 + }, + { + "epoch": 0.11022079772079772, + "grad_norm": 0.4014730453491211, + "learning_rate": 0.00019964870785674327, + "loss": 1.023, + "step": 619 + }, + { + "epoch": 0.1103988603988604, + "grad_norm": 0.38846179842948914, + "learning_rate": 0.00019964753463906773, + "loss": 0.9834, + "step": 620 + }, + { + "epoch": 0.11057692307692307, + "grad_norm": 0.5120236277580261, + "learning_rate": 0.00019964635946900577, + "loss": 1.2347, + "step": 621 + }, + { + "epoch": 0.11075498575498575, + "grad_norm": 0.40483301877975464, + "learning_rate": 0.00019964518234658038, + "loss": 1.131, + "step": 622 + }, + { + "epoch": 0.11093304843304844, + "grad_norm": 0.445782870054245, + "learning_rate": 0.00019964400327181464, + "loss": 0.9349, + "step": 623 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 0.490460604429245, + "learning_rate": 0.00019964282224473165, + "loss": 1.0257, + "step": 624 + }, + { + "epoch": 0.11128917378917379, + "grad_norm": 0.37585243582725525, + "learning_rate": 0.00019964163926535454, + "loss": 0.9724, + "step": 625 + }, + { + "epoch": 0.11146723646723647, + "grad_norm": 0.4160473346710205, + "learning_rate": 0.00019964045433370651, + "loss": 0.874, + "step": 626 + }, + { + "epoch": 
0.11164529914529915, + "grad_norm": 0.442425012588501, + "learning_rate": 0.00019963926744981074, + "loss": 1.064, + "step": 627 + }, + { + "epoch": 0.11182336182336182, + "grad_norm": 0.4451471269130707, + "learning_rate": 0.00019963807861369054, + "loss": 1.2343, + "step": 628 + }, + { + "epoch": 0.1120014245014245, + "grad_norm": 0.5018183588981628, + "learning_rate": 0.00019963688782536913, + "loss": 1.1226, + "step": 629 + }, + { + "epoch": 0.11217948717948718, + "grad_norm": 0.43723925948143005, + "learning_rate": 0.0001996356950848699, + "loss": 1.0178, + "step": 630 + }, + { + "epoch": 0.11235754985754985, + "grad_norm": 0.4794611930847168, + "learning_rate": 0.0001996345003922162, + "loss": 0.9695, + "step": 631 + }, + { + "epoch": 0.11253561253561253, + "grad_norm": 0.5021790266036987, + "learning_rate": 0.00019963330374743143, + "loss": 1.1748, + "step": 632 + }, + { + "epoch": 0.11271367521367522, + "grad_norm": 0.47228625416755676, + "learning_rate": 0.00019963210515053906, + "loss": 1.2138, + "step": 633 + }, + { + "epoch": 0.1128917378917379, + "grad_norm": 0.4261155128479004, + "learning_rate": 0.00019963090460156256, + "loss": 0.9428, + "step": 634 + }, + { + "epoch": 0.11306980056980057, + "grad_norm": 0.3279525339603424, + "learning_rate": 0.00019962970210052542, + "loss": 0.7803, + "step": 635 + }, + { + "epoch": 0.11324786324786325, + "grad_norm": 0.5106086730957031, + "learning_rate": 0.00019962849764745125, + "loss": 1.113, + "step": 636 + }, + { + "epoch": 0.11342592592592593, + "grad_norm": 0.38272222876548767, + "learning_rate": 0.00019962729124236363, + "loss": 0.896, + "step": 637 + }, + { + "epoch": 0.1136039886039886, + "grad_norm": 0.39532098174095154, + "learning_rate": 0.0001996260828852862, + "loss": 0.9308, + "step": 638 + }, + { + "epoch": 0.11378205128205128, + "grad_norm": 0.44947221875190735, + "learning_rate": 0.00019962487257624262, + "loss": 1.207, + "step": 639 + }, + { + "epoch": 0.11396011396011396, + "grad_norm": 
0.40684598684310913, + "learning_rate": 0.00019962366031525664, + "loss": 1.11, + "step": 640 + }, + { + "epoch": 0.11413817663817664, + "grad_norm": 0.4296625852584839, + "learning_rate": 0.00019962244610235194, + "loss": 1.2784, + "step": 641 + }, + { + "epoch": 0.11431623931623931, + "grad_norm": 0.4560794532299042, + "learning_rate": 0.0001996212299375524, + "loss": 1.1191, + "step": 642 + }, + { + "epoch": 0.114494301994302, + "grad_norm": 0.40246087312698364, + "learning_rate": 0.00019962001182088177, + "loss": 1.1401, + "step": 643 + }, + { + "epoch": 0.11467236467236468, + "grad_norm": 0.3938910663127899, + "learning_rate": 0.000199618791752364, + "loss": 1.0959, + "step": 644 + }, + { + "epoch": 0.11485042735042734, + "grad_norm": 0.4123380184173584, + "learning_rate": 0.00019961756973202287, + "loss": 1.2824, + "step": 645 + }, + { + "epoch": 0.11502849002849003, + "grad_norm": 0.41085442900657654, + "learning_rate": 0.00019961634575988243, + "loss": 1.1137, + "step": 646 + }, + { + "epoch": 0.11520655270655271, + "grad_norm": 0.38276201486587524, + "learning_rate": 0.0001996151198359667, + "loss": 1.0747, + "step": 647 + }, + { + "epoch": 0.11538461538461539, + "grad_norm": 0.49269407987594604, + "learning_rate": 0.00019961389196029953, + "loss": 1.1731, + "step": 648 + }, + { + "epoch": 0.11556267806267806, + "grad_norm": 0.5152469277381897, + "learning_rate": 0.00019961266213290512, + "loss": 1.3574, + "step": 649 + }, + { + "epoch": 0.11574074074074074, + "grad_norm": 0.4835714101791382, + "learning_rate": 0.0001996114303538075, + "loss": 1.2859, + "step": 650 + }, + { + "epoch": 0.11591880341880342, + "grad_norm": 0.4284524917602539, + "learning_rate": 0.00019961019662303087, + "loss": 1.1103, + "step": 651 + }, + { + "epoch": 0.11609686609686609, + "grad_norm": 0.3933276832103729, + "learning_rate": 0.00019960896094059933, + "loss": 1.2647, + "step": 652 + }, + { + "epoch": 0.11627492877492877, + "grad_norm": 0.33749741315841675, + "learning_rate": 
0.00019960772330653712, + "loss": 0.819, + "step": 653 + }, + { + "epoch": 0.11645299145299146, + "grad_norm": 0.48122069239616394, + "learning_rate": 0.00019960648372086852, + "loss": 1.2781, + "step": 654 + }, + { + "epoch": 0.11663105413105414, + "grad_norm": 0.4681607186794281, + "learning_rate": 0.00019960524218361775, + "loss": 0.9723, + "step": 655 + }, + { + "epoch": 0.1168091168091168, + "grad_norm": 0.3974960148334503, + "learning_rate": 0.0001996039986948092, + "loss": 1.0302, + "step": 656 + }, + { + "epoch": 0.11698717948717949, + "grad_norm": 0.43180662393569946, + "learning_rate": 0.0001996027532544672, + "loss": 1.3265, + "step": 657 + }, + { + "epoch": 0.11716524216524217, + "grad_norm": 0.4481917917728424, + "learning_rate": 0.00019960150586261613, + "loss": 1.136, + "step": 658 + }, + { + "epoch": 0.11734330484330484, + "grad_norm": 0.43428945541381836, + "learning_rate": 0.00019960025651928045, + "loss": 1.2412, + "step": 659 + }, + { + "epoch": 0.11752136752136752, + "grad_norm": 0.36211395263671875, + "learning_rate": 0.00019959900522448467, + "loss": 0.9563, + "step": 660 + }, + { + "epoch": 0.1176994301994302, + "grad_norm": 0.43585848808288574, + "learning_rate": 0.0001995977519782533, + "loss": 1.1677, + "step": 661 + }, + { + "epoch": 0.11787749287749288, + "grad_norm": 0.4232597351074219, + "learning_rate": 0.00019959649678061086, + "loss": 1.1187, + "step": 662 + }, + { + "epoch": 0.11805555555555555, + "grad_norm": 0.3304753303527832, + "learning_rate": 0.00019959523963158194, + "loss": 0.8473, + "step": 663 + }, + { + "epoch": 0.11823361823361823, + "grad_norm": 0.37600061297416687, + "learning_rate": 0.0001995939805311912, + "loss": 1.1227, + "step": 664 + }, + { + "epoch": 0.11841168091168092, + "grad_norm": 0.33417847752571106, + "learning_rate": 0.0001995927194794633, + "loss": 1.0315, + "step": 665 + }, + { + "epoch": 0.11858974358974358, + "grad_norm": 0.46799129247665405, + "learning_rate": 0.00019959145647642298, + "loss": 
1.135, + "step": 666 + }, + { + "epoch": 0.11876780626780627, + "grad_norm": 0.4141576886177063, + "learning_rate": 0.0001995901915220949, + "loss": 1.0956, + "step": 667 + }, + { + "epoch": 0.11894586894586895, + "grad_norm": 0.3824596405029297, + "learning_rate": 0.0001995889246165039, + "loss": 1.1782, + "step": 668 + }, + { + "epoch": 0.11912393162393162, + "grad_norm": 0.4087786376476288, + "learning_rate": 0.00019958765575967484, + "loss": 0.9704, + "step": 669 + }, + { + "epoch": 0.1193019943019943, + "grad_norm": 0.5161317586898804, + "learning_rate": 0.00019958638495163252, + "loss": 1.2207, + "step": 670 + }, + { + "epoch": 0.11948005698005698, + "grad_norm": 0.4782274067401886, + "learning_rate": 0.0001995851121924019, + "loss": 1.1257, + "step": 671 + }, + { + "epoch": 0.11965811965811966, + "grad_norm": 0.40617331862449646, + "learning_rate": 0.00019958383748200782, + "loss": 1.1153, + "step": 672 + }, + { + "epoch": 0.11983618233618233, + "grad_norm": 0.40149980783462524, + "learning_rate": 0.00019958256082047533, + "loss": 0.9785, + "step": 673 + }, + { + "epoch": 0.12001424501424501, + "grad_norm": 0.4378886818885803, + "learning_rate": 0.00019958128220782942, + "loss": 1.1355, + "step": 674 + }, + { + "epoch": 0.1201923076923077, + "grad_norm": 0.4449596703052521, + "learning_rate": 0.0001995800016440952, + "loss": 1.0325, + "step": 675 + }, + { + "epoch": 0.12037037037037036, + "grad_norm": 0.4268079698085785, + "learning_rate": 0.00019957871912929765, + "loss": 1.1901, + "step": 676 + }, + { + "epoch": 0.12054843304843305, + "grad_norm": 0.4250091016292572, + "learning_rate": 0.00019957743466346198, + "loss": 1.0084, + "step": 677 + }, + { + "epoch": 0.12072649572649573, + "grad_norm": 0.40724286437034607, + "learning_rate": 0.0001995761482466133, + "loss": 1.0866, + "step": 678 + }, + { + "epoch": 0.12090455840455841, + "grad_norm": 0.42478349804878235, + "learning_rate": 0.00019957485987877688, + "loss": 1.1909, + "step": 679 + }, + { + 
"epoch": 0.12108262108262108, + "grad_norm": 0.371362566947937, + "learning_rate": 0.0001995735695599779, + "loss": 1.083, + "step": 680 + }, + { + "epoch": 0.12126068376068376, + "grad_norm": 0.4715283513069153, + "learning_rate": 0.0001995722772902417, + "loss": 1.2942, + "step": 681 + }, + { + "epoch": 0.12143874643874644, + "grad_norm": 0.3611983060836792, + "learning_rate": 0.00019957098306959355, + "loss": 0.9878, + "step": 682 + }, + { + "epoch": 0.12161680911680911, + "grad_norm": 0.4764883816242218, + "learning_rate": 0.00019956968689805883, + "loss": 1.0082, + "step": 683 + }, + { + "epoch": 0.12179487179487179, + "grad_norm": 0.33170604705810547, + "learning_rate": 0.00019956838877566293, + "loss": 0.8529, + "step": 684 + }, + { + "epoch": 0.12197293447293447, + "grad_norm": 0.46896886825561523, + "learning_rate": 0.00019956708870243133, + "loss": 1.0745, + "step": 685 + }, + { + "epoch": 0.12215099715099716, + "grad_norm": 0.4120674431324005, + "learning_rate": 0.00019956578667838941, + "loss": 1.1828, + "step": 686 + }, + { + "epoch": 0.12232905982905982, + "grad_norm": 0.45671191811561584, + "learning_rate": 0.00019956448270356275, + "loss": 1.3484, + "step": 687 + }, + { + "epoch": 0.1225071225071225, + "grad_norm": 0.4023838937282562, + "learning_rate": 0.00019956317677797687, + "loss": 0.9623, + "step": 688 + }, + { + "epoch": 0.12268518518518519, + "grad_norm": 0.5205856561660767, + "learning_rate": 0.00019956186890165737, + "loss": 1.2221, + "step": 689 + }, + { + "epoch": 0.12286324786324786, + "grad_norm": 0.43956050276756287, + "learning_rate": 0.00019956055907462987, + "loss": 1.1051, + "step": 690 + }, + { + "epoch": 0.12304131054131054, + "grad_norm": 0.4341758191585541, + "learning_rate": 0.00019955924729692003, + "loss": 0.8972, + "step": 691 + }, + { + "epoch": 0.12321937321937322, + "grad_norm": 0.42025020718574524, + "learning_rate": 0.00019955793356855357, + "loss": 1.1137, + "step": 692 + }, + { + "epoch": 0.1233974358974359, + 
"grad_norm": 0.44375079870224, + "learning_rate": 0.0001995566178895562, + "loss": 1.2783, + "step": 693 + }, + { + "epoch": 0.12357549857549857, + "grad_norm": 0.4703320264816284, + "learning_rate": 0.00019955530025995372, + "loss": 1.1991, + "step": 694 + }, + { + "epoch": 0.12375356125356125, + "grad_norm": 0.43781620264053345, + "learning_rate": 0.00019955398067977195, + "loss": 1.2316, + "step": 695 + }, + { + "epoch": 0.12393162393162394, + "grad_norm": 0.4362877607345581, + "learning_rate": 0.0001995526591490367, + "loss": 1.1374, + "step": 696 + }, + { + "epoch": 0.1241096866096866, + "grad_norm": 0.4434499442577362, + "learning_rate": 0.00019955133566777392, + "loss": 1.1034, + "step": 697 + }, + { + "epoch": 0.12428774928774929, + "grad_norm": 0.46613508462905884, + "learning_rate": 0.00019955001023600955, + "loss": 1.2252, + "step": 698 + }, + { + "epoch": 0.12446581196581197, + "grad_norm": 0.46226736903190613, + "learning_rate": 0.00019954868285376945, + "loss": 1.0296, + "step": 699 + }, + { + "epoch": 0.12464387464387465, + "grad_norm": 0.4460904002189636, + "learning_rate": 0.00019954735352107977, + "loss": 1.0553, + "step": 700 + }, + { + "epoch": 0.12482193732193732, + "grad_norm": 0.36708924174308777, + "learning_rate": 0.00019954602223796648, + "loss": 0.9384, + "step": 701 + }, + { + "epoch": 0.125, + "grad_norm": 0.3780093491077423, + "learning_rate": 0.00019954468900445566, + "loss": 0.9062, + "step": 702 + }, + { + "epoch": 0.12517806267806267, + "grad_norm": 0.41797417402267456, + "learning_rate": 0.00019954335382057345, + "loss": 1.0344, + "step": 703 + }, + { + "epoch": 0.12535612535612536, + "grad_norm": 0.43710798025131226, + "learning_rate": 0.00019954201668634597, + "loss": 1.1324, + "step": 704 + }, + { + "epoch": 0.12553418803418803, + "grad_norm": 0.4732789695262909, + "learning_rate": 0.00019954067760179952, + "loss": 1.1419, + "step": 705 + }, + { + "epoch": 0.1257122507122507, + "grad_norm": 0.43248575925827026, + 
"learning_rate": 0.00019953933656696022, + "loss": 1.5112, + "step": 706 + }, + { + "epoch": 0.1258903133903134, + "grad_norm": 0.4074753522872925, + "learning_rate": 0.00019953799358185442, + "loss": 0.9751, + "step": 707 + }, + { + "epoch": 0.12606837606837606, + "grad_norm": 0.4586823880672455, + "learning_rate": 0.0001995366486465084, + "loss": 1.267, + "step": 708 + }, + { + "epoch": 0.12624643874643873, + "grad_norm": 0.4716857075691223, + "learning_rate": 0.0001995353017609485, + "loss": 1.1636, + "step": 709 + }, + { + "epoch": 0.12642450142450143, + "grad_norm": 0.5214398503303528, + "learning_rate": 0.00019953395292520115, + "loss": 1.2317, + "step": 710 + }, + { + "epoch": 0.1266025641025641, + "grad_norm": 0.42961129546165466, + "learning_rate": 0.00019953260213929276, + "loss": 1.0271, + "step": 711 + }, + { + "epoch": 0.1267806267806268, + "grad_norm": 0.4764653444290161, + "learning_rate": 0.00019953124940324979, + "loss": 1.1747, + "step": 712 + }, + { + "epoch": 0.12695868945868946, + "grad_norm": 0.4420304000377655, + "learning_rate": 0.00019952989471709874, + "loss": 0.9783, + "step": 713 + }, + { + "epoch": 0.12713675213675213, + "grad_norm": 0.44114625453948975, + "learning_rate": 0.00019952853808086616, + "loss": 1.1953, + "step": 714 + }, + { + "epoch": 0.12731481481481483, + "grad_norm": 0.501923143863678, + "learning_rate": 0.0001995271794945786, + "loss": 0.9886, + "step": 715 + }, + { + "epoch": 0.1274928774928775, + "grad_norm": 0.42266538739204407, + "learning_rate": 0.00019952581895826276, + "loss": 1.2033, + "step": 716 + }, + { + "epoch": 0.12767094017094016, + "grad_norm": 0.37770554423332214, + "learning_rate": 0.00019952445647194523, + "loss": 1.0164, + "step": 717 + }, + { + "epoch": 0.12784900284900286, + "grad_norm": 0.369266152381897, + "learning_rate": 0.00019952309203565268, + "loss": 0.9186, + "step": 718 + }, + { + "epoch": 0.12802706552706553, + "grad_norm": 0.40446221828460693, + "learning_rate": 0.00019952172564941193, 
+ "loss": 1.1576, + "step": 719 + }, + { + "epoch": 0.1282051282051282, + "grad_norm": 0.504172146320343, + "learning_rate": 0.00019952035731324967, + "loss": 1.2695, + "step": 720 + }, + { + "epoch": 0.1283831908831909, + "grad_norm": 0.37284108996391296, + "learning_rate": 0.0001995189870271928, + "loss": 1.0288, + "step": 721 + }, + { + "epoch": 0.12856125356125356, + "grad_norm": 0.41811618208885193, + "learning_rate": 0.00019951761479126805, + "loss": 1.2241, + "step": 722 + }, + { + "epoch": 0.12873931623931623, + "grad_norm": 0.44706249237060547, + "learning_rate": 0.0001995162406055024, + "loss": 1.0831, + "step": 723 + }, + { + "epoch": 0.12891737891737892, + "grad_norm": 0.426572322845459, + "learning_rate": 0.00019951486446992273, + "loss": 1.0047, + "step": 724 + }, + { + "epoch": 0.1290954415954416, + "grad_norm": 0.4446277618408203, + "learning_rate": 0.00019951348638455602, + "loss": 1.0827, + "step": 725 + }, + { + "epoch": 0.12927350427350429, + "grad_norm": 0.3934919834136963, + "learning_rate": 0.00019951210634942926, + "loss": 0.9808, + "step": 726 + }, + { + "epoch": 0.12945156695156695, + "grad_norm": 0.4316558241844177, + "learning_rate": 0.0001995107243645695, + "loss": 1.3341, + "step": 727 + }, + { + "epoch": 0.12962962962962962, + "grad_norm": 0.43074217438697815, + "learning_rate": 0.00019950934043000382, + "loss": 1.007, + "step": 728 + }, + { + "epoch": 0.12980769230769232, + "grad_norm": 0.5212171673774719, + "learning_rate": 0.0001995079545457593, + "loss": 1.1822, + "step": 729 + }, + { + "epoch": 0.129985754985755, + "grad_norm": 0.3749600946903229, + "learning_rate": 0.00019950656671186313, + "loss": 0.9657, + "step": 730 + }, + { + "epoch": 0.13016381766381765, + "grad_norm": 0.36626043915748596, + "learning_rate": 0.00019950517692834252, + "loss": 1.1274, + "step": 731 + }, + { + "epoch": 0.13034188034188035, + "grad_norm": 0.4635467529296875, + "learning_rate": 0.00019950378519522467, + "loss": 1.2305, + "step": 732 + }, + { + 
"epoch": 0.13051994301994302, + "grad_norm": 0.4077455699443817, + "learning_rate": 0.00019950239151253683, + "loss": 0.9485, + "step": 733 + }, + { + "epoch": 0.1306980056980057, + "grad_norm": 0.4222758114337921, + "learning_rate": 0.0001995009958803063, + "loss": 1.0376, + "step": 734 + }, + { + "epoch": 0.13087606837606838, + "grad_norm": 0.4330402612686157, + "learning_rate": 0.0001994995982985605, + "loss": 1.1774, + "step": 735 + }, + { + "epoch": 0.13105413105413105, + "grad_norm": 0.42275673151016235, + "learning_rate": 0.00019949819876732673, + "loss": 1.1238, + "step": 736 + }, + { + "epoch": 0.13123219373219372, + "grad_norm": 0.45576968789100647, + "learning_rate": 0.00019949679728663246, + "loss": 1.0428, + "step": 737 + }, + { + "epoch": 0.13141025641025642, + "grad_norm": 0.5508752465248108, + "learning_rate": 0.00019949539385650514, + "loss": 1.3221, + "step": 738 + }, + { + "epoch": 0.13158831908831908, + "grad_norm": 0.4115872383117676, + "learning_rate": 0.00019949398847697225, + "loss": 1.0301, + "step": 739 + }, + { + "epoch": 0.13176638176638178, + "grad_norm": 0.4662442207336426, + "learning_rate": 0.00019949258114806132, + "loss": 1.3263, + "step": 740 + }, + { + "epoch": 0.13194444444444445, + "grad_norm": 0.6077266931533813, + "learning_rate": 0.00019949117186979999, + "loss": 1.0269, + "step": 741 + }, + { + "epoch": 0.13212250712250712, + "grad_norm": 0.47039318084716797, + "learning_rate": 0.00019948976064221579, + "loss": 1.3782, + "step": 742 + }, + { + "epoch": 0.1323005698005698, + "grad_norm": 0.4773450493812561, + "learning_rate": 0.0001994883474653364, + "loss": 1.289, + "step": 743 + }, + { + "epoch": 0.13247863247863248, + "grad_norm": 0.40180155634880066, + "learning_rate": 0.00019948693233918952, + "loss": 0.8691, + "step": 744 + }, + { + "epoch": 0.13265669515669515, + "grad_norm": 0.45216289162635803, + "learning_rate": 0.00019948551526380288, + "loss": 1.071, + "step": 745 + }, + { + "epoch": 0.13283475783475784, + 
"grad_norm": 0.4289272427558899, + "learning_rate": 0.0001994840962392042, + "loss": 1.0422, + "step": 746 + }, + { + "epoch": 0.1330128205128205, + "grad_norm": 0.4617730379104614, + "learning_rate": 0.00019948267526542134, + "loss": 1.0835, + "step": 747 + }, + { + "epoch": 0.13319088319088318, + "grad_norm": 0.42710617184638977, + "learning_rate": 0.00019948125234248208, + "loss": 1.0535, + "step": 748 + }, + { + "epoch": 0.13336894586894588, + "grad_norm": 0.43433234095573425, + "learning_rate": 0.0001994798274704144, + "loss": 0.9313, + "step": 749 + }, + { + "epoch": 0.13354700854700854, + "grad_norm": 0.46270284056663513, + "learning_rate": 0.0001994784006492461, + "loss": 1.0903, + "step": 750 + }, + { + "epoch": 0.1337250712250712, + "grad_norm": 0.5319814682006836, + "learning_rate": 0.00019947697187900517, + "loss": 1.2329, + "step": 751 + }, + { + "epoch": 0.1339031339031339, + "grad_norm": 0.3511372208595276, + "learning_rate": 0.00019947554115971967, + "loss": 0.7116, + "step": 752 + }, + { + "epoch": 0.13408119658119658, + "grad_norm": 0.4103890359401703, + "learning_rate": 0.00019947410849141756, + "loss": 1.1527, + "step": 753 + }, + { + "epoch": 0.13425925925925927, + "grad_norm": 0.5390757322311401, + "learning_rate": 0.00019947267387412695, + "loss": 1.1682, + "step": 754 + }, + { + "epoch": 0.13443732193732194, + "grad_norm": 0.29939723014831543, + "learning_rate": 0.0001994712373078759, + "loss": 0.5848, + "step": 755 + }, + { + "epoch": 0.1346153846153846, + "grad_norm": 0.4605920612812042, + "learning_rate": 0.0001994697987926926, + "loss": 0.9448, + "step": 756 + }, + { + "epoch": 0.1347934472934473, + "grad_norm": 0.426213800907135, + "learning_rate": 0.00019946835832860527, + "loss": 1.0487, + "step": 757 + }, + { + "epoch": 0.13497150997150997, + "grad_norm": 0.4209515154361725, + "learning_rate": 0.00019946691591564203, + "loss": 1.0951, + "step": 758 + }, + { + "epoch": 0.13514957264957264, + "grad_norm": 0.39555591344833374, + 
"learning_rate": 0.0001994654715538312, + "loss": 0.8754, + "step": 759 + }, + { + "epoch": 0.13532763532763534, + "grad_norm": 0.4065483510494232, + "learning_rate": 0.0001994640252432011, + "loss": 0.9451, + "step": 760 + }, + { + "epoch": 0.135505698005698, + "grad_norm": 0.4489104151725769, + "learning_rate": 0.00019946257698378003, + "loss": 1.2031, + "step": 761 + }, + { + "epoch": 0.13568376068376067, + "grad_norm": 0.39928409457206726, + "learning_rate": 0.0001994611267755964, + "loss": 1.1124, + "step": 762 + }, + { + "epoch": 0.13586182336182337, + "grad_norm": 0.4145409166812897, + "learning_rate": 0.00019945967461867858, + "loss": 1.083, + "step": 763 + }, + { + "epoch": 0.13603988603988604, + "grad_norm": 0.43508613109588623, + "learning_rate": 0.00019945822051305507, + "loss": 1.1119, + "step": 764 + }, + { + "epoch": 0.1362179487179487, + "grad_norm": 0.5186598300933838, + "learning_rate": 0.0001994567644587543, + "loss": 1.3256, + "step": 765 + }, + { + "epoch": 0.1363960113960114, + "grad_norm": 0.4615778625011444, + "learning_rate": 0.00019945530645580487, + "loss": 1.3906, + "step": 766 + }, + { + "epoch": 0.13657407407407407, + "grad_norm": 0.4838152527809143, + "learning_rate": 0.00019945384650423532, + "loss": 0.8169, + "step": 767 + }, + { + "epoch": 0.13675213675213677, + "grad_norm": 0.49253368377685547, + "learning_rate": 0.0001994523846040742, + "loss": 1.1613, + "step": 768 + }, + { + "epoch": 0.13693019943019943, + "grad_norm": 0.4697009325027466, + "learning_rate": 0.00019945092075535024, + "loss": 1.1722, + "step": 769 + }, + { + "epoch": 0.1371082621082621, + "grad_norm": 0.47162383794784546, + "learning_rate": 0.00019944945495809204, + "loss": 1.054, + "step": 770 + }, + { + "epoch": 0.1372863247863248, + "grad_norm": 0.4653547704219818, + "learning_rate": 0.00019944798721232835, + "loss": 1.1791, + "step": 771 + }, + { + "epoch": 0.13746438746438747, + "grad_norm": 0.4244011640548706, + "learning_rate": 0.000199446517518088, + 
"loss": 1.1557, + "step": 772 + }, + { + "epoch": 0.13764245014245013, + "grad_norm": 0.43812859058380127, + "learning_rate": 0.00019944504587539967, + "loss": 1.1567, + "step": 773 + }, + { + "epoch": 0.13782051282051283, + "grad_norm": 0.3984275162220001, + "learning_rate": 0.00019944357228429227, + "loss": 1.0715, + "step": 774 + }, + { + "epoch": 0.1379985754985755, + "grad_norm": 0.3794248104095459, + "learning_rate": 0.0001994420967447946, + "loss": 0.9377, + "step": 775 + }, + { + "epoch": 0.13817663817663817, + "grad_norm": 0.4214578866958618, + "learning_rate": 0.00019944061925693566, + "loss": 1.0112, + "step": 776 + }, + { + "epoch": 0.13835470085470086, + "grad_norm": 0.4738999605178833, + "learning_rate": 0.00019943913982074435, + "loss": 0.8718, + "step": 777 + }, + { + "epoch": 0.13853276353276353, + "grad_norm": 0.43455326557159424, + "learning_rate": 0.00019943765843624965, + "loss": 1.1343, + "step": 778 + }, + { + "epoch": 0.1387108262108262, + "grad_norm": 0.44973456859588623, + "learning_rate": 0.00019943617510348062, + "loss": 1.0487, + "step": 779 + }, + { + "epoch": 0.1388888888888889, + "grad_norm": 0.4216597080230713, + "learning_rate": 0.00019943468982246628, + "loss": 1.0765, + "step": 780 + }, + { + "epoch": 0.13906695156695156, + "grad_norm": 0.5089883208274841, + "learning_rate": 0.00019943320259323578, + "loss": 1.3137, + "step": 781 + }, + { + "epoch": 0.13924501424501423, + "grad_norm": 0.4358222782611847, + "learning_rate": 0.00019943171341581822, + "loss": 1.1891, + "step": 782 + }, + { + "epoch": 0.13942307692307693, + "grad_norm": 0.40918609499931335, + "learning_rate": 0.00019943022229024275, + "loss": 1.279, + "step": 783 + }, + { + "epoch": 0.1396011396011396, + "grad_norm": 0.4614863395690918, + "learning_rate": 0.00019942872921653866, + "loss": 1.2477, + "step": 784 + }, + { + "epoch": 0.1397792022792023, + "grad_norm": 0.4141528904438019, + "learning_rate": 0.00019942723419473515, + "loss": 0.9622, + "step": 785 + }, + { 
+ "epoch": 0.13995726495726496, + "grad_norm": 0.536139726638794, + "learning_rate": 0.00019942573722486154, + "loss": 1.2127, + "step": 786 + }, + { + "epoch": 0.14013532763532763, + "grad_norm": 0.4968845546245575, + "learning_rate": 0.0001994242383069471, + "loss": 1.2965, + "step": 787 + }, + { + "epoch": 0.14031339031339032, + "grad_norm": 0.3897174894809723, + "learning_rate": 0.00019942273744102132, + "loss": 0.9907, + "step": 788 + }, + { + "epoch": 0.140491452991453, + "grad_norm": 0.466307669878006, + "learning_rate": 0.0001994212346271135, + "loss": 1.2021, + "step": 789 + }, + { + "epoch": 0.14066951566951566, + "grad_norm": 0.49283576011657715, + "learning_rate": 0.0001994197298652531, + "loss": 1.0969, + "step": 790 + }, + { + "epoch": 0.14084757834757836, + "grad_norm": 0.4686102271080017, + "learning_rate": 0.00019941822315546964, + "loss": 1.0125, + "step": 791 + }, + { + "epoch": 0.14102564102564102, + "grad_norm": 0.4389997124671936, + "learning_rate": 0.0001994167144977926, + "loss": 1.1294, + "step": 792 + }, + { + "epoch": 0.1412037037037037, + "grad_norm": 0.38539355993270874, + "learning_rate": 0.00019941520389225162, + "loss": 1.1231, + "step": 793 + }, + { + "epoch": 0.1413817663817664, + "grad_norm": 0.4860847592353821, + "learning_rate": 0.00019941369133887618, + "loss": 1.2268, + "step": 794 + }, + { + "epoch": 0.14155982905982906, + "grad_norm": 0.4567467272281647, + "learning_rate": 0.00019941217683769598, + "loss": 1.1482, + "step": 795 + }, + { + "epoch": 0.14173789173789172, + "grad_norm": 0.5549420714378357, + "learning_rate": 0.00019941066038874067, + "loss": 1.1899, + "step": 796 + }, + { + "epoch": 0.14191595441595442, + "grad_norm": 0.3950003385543823, + "learning_rate": 0.00019940914199204, + "loss": 0.96, + "step": 797 + }, + { + "epoch": 0.1420940170940171, + "grad_norm": 0.43845999240875244, + "learning_rate": 0.00019940762164762373, + "loss": 1.0338, + "step": 798 + }, + { + "epoch": 0.14227207977207978, + "grad_norm": 
0.468537300825119, + "learning_rate": 0.00019940609935552157, + "loss": 1.2416, + "step": 799 + }, + { + "epoch": 0.14245014245014245, + "grad_norm": 0.4292038679122925, + "learning_rate": 0.0001994045751157634, + "loss": 1.1397, + "step": 800 + }, + { + "epoch": 0.14262820512820512, + "grad_norm": 0.3800995647907257, + "learning_rate": 0.00019940304892837908, + "loss": 0.939, + "step": 801 + }, + { + "epoch": 0.14280626780626782, + "grad_norm": 0.38004353642463684, + "learning_rate": 0.00019940152079339852, + "loss": 1.0485, + "step": 802 + }, + { + "epoch": 0.14298433048433049, + "grad_norm": 0.4658142924308777, + "learning_rate": 0.00019939999071085163, + "loss": 1.1561, + "step": 803 + }, + { + "epoch": 0.14316239316239315, + "grad_norm": 0.4235048294067383, + "learning_rate": 0.0001993984586807684, + "loss": 1.0516, + "step": 804 + }, + { + "epoch": 0.14334045584045585, + "grad_norm": 0.42925819754600525, + "learning_rate": 0.00019939692470317887, + "loss": 1.2238, + "step": 805 + }, + { + "epoch": 0.14351851851851852, + "grad_norm": 0.43701639771461487, + "learning_rate": 0.00019939538877811308, + "loss": 1.0129, + "step": 806 + }, + { + "epoch": 0.14369658119658119, + "grad_norm": 0.42786353826522827, + "learning_rate": 0.00019939385090560113, + "loss": 1.1355, + "step": 807 + }, + { + "epoch": 0.14387464387464388, + "grad_norm": 0.371218740940094, + "learning_rate": 0.00019939231108567312, + "loss": 0.9712, + "step": 808 + }, + { + "epoch": 0.14405270655270655, + "grad_norm": 0.4834294617176056, + "learning_rate": 0.00019939076931835926, + "loss": 1.1375, + "step": 809 + }, + { + "epoch": 0.14423076923076922, + "grad_norm": 0.4700150191783905, + "learning_rate": 0.00019938922560368974, + "loss": 1.1943, + "step": 810 + }, + { + "epoch": 0.14440883190883191, + "grad_norm": 0.4430996775627136, + "learning_rate": 0.0001993876799416948, + "loss": 1.1976, + "step": 811 + }, + { + "epoch": 0.14458689458689458, + "grad_norm": 0.4161672592163086, + "learning_rate": 
0.00019938613233240476, + "loss": 1.0291, + "step": 812 + }, + { + "epoch": 0.14476495726495728, + "grad_norm": 0.39838850498199463, + "learning_rate": 0.0001993845827758499, + "loss": 1.2103, + "step": 813 + }, + { + "epoch": 0.14494301994301995, + "grad_norm": 0.429198294878006, + "learning_rate": 0.00019938303127206057, + "loss": 0.9971, + "step": 814 + }, + { + "epoch": 0.14512108262108261, + "grad_norm": 0.4589254856109619, + "learning_rate": 0.00019938147782106719, + "loss": 1.2392, + "step": 815 + }, + { + "epoch": 0.1452991452991453, + "grad_norm": 0.42506635189056396, + "learning_rate": 0.00019937992242290023, + "loss": 1.0827, + "step": 816 + }, + { + "epoch": 0.14547720797720798, + "grad_norm": 0.3778113126754761, + "learning_rate": 0.00019937836507759012, + "loss": 1.021, + "step": 817 + }, + { + "epoch": 0.14565527065527065, + "grad_norm": 0.43071216344833374, + "learning_rate": 0.0001993768057851674, + "loss": 1.273, + "step": 818 + }, + { + "epoch": 0.14583333333333334, + "grad_norm": 0.4944681227207184, + "learning_rate": 0.00019937524454566262, + "loss": 1.3037, + "step": 819 + }, + { + "epoch": 0.146011396011396, + "grad_norm": 0.4438824951648712, + "learning_rate": 0.00019937368135910632, + "loss": 1.1383, + "step": 820 + }, + { + "epoch": 0.14618945868945868, + "grad_norm": 0.400215744972229, + "learning_rate": 0.0001993721162255292, + "loss": 1.0669, + "step": 821 + }, + { + "epoch": 0.14636752136752137, + "grad_norm": 0.4341452121734619, + "learning_rate": 0.00019937054914496185, + "loss": 1.1431, + "step": 822 + }, + { + "epoch": 0.14654558404558404, + "grad_norm": 0.3941744267940521, + "learning_rate": 0.00019936898011743503, + "loss": 1.1593, + "step": 823 + }, + { + "epoch": 0.1467236467236467, + "grad_norm": 0.4318541884422302, + "learning_rate": 0.00019936740914297947, + "loss": 1.2814, + "step": 824 + }, + { + "epoch": 0.1469017094017094, + "grad_norm": 0.44488632678985596, + "learning_rate": 0.00019936583622162595, + "loss": 1.1054, + 
"step": 825 + }, + { + "epoch": 0.14707977207977208, + "grad_norm": 0.38701096177101135, + "learning_rate": 0.00019936426135340528, + "loss": 1.1086, + "step": 826 + }, + { + "epoch": 0.14725783475783477, + "grad_norm": 0.45794424414634705, + "learning_rate": 0.0001993626845383483, + "loss": 1.2395, + "step": 827 + }, + { + "epoch": 0.14743589743589744, + "grad_norm": 0.49237680435180664, + "learning_rate": 0.00019936110577648596, + "loss": 1.3483, + "step": 828 + }, + { + "epoch": 0.1476139601139601, + "grad_norm": 0.481666624546051, + "learning_rate": 0.00019935952506784914, + "loss": 1.1848, + "step": 829 + }, + { + "epoch": 0.1477920227920228, + "grad_norm": 0.4015209376811981, + "learning_rate": 0.00019935794241246883, + "loss": 1.0624, + "step": 830 + }, + { + "epoch": 0.14797008547008547, + "grad_norm": 0.47975999116897583, + "learning_rate": 0.00019935635781037606, + "loss": 1.1595, + "step": 831 + }, + { + "epoch": 0.14814814814814814, + "grad_norm": 0.4440356492996216, + "learning_rate": 0.00019935477126160181, + "loss": 1.1325, + "step": 832 + }, + { + "epoch": 0.14832621082621084, + "grad_norm": 0.4167410731315613, + "learning_rate": 0.00019935318276617723, + "loss": 1.0662, + "step": 833 + }, + { + "epoch": 0.1485042735042735, + "grad_norm": 0.4107447862625122, + "learning_rate": 0.0001993515923241334, + "loss": 0.8816, + "step": 834 + }, + { + "epoch": 0.14868233618233617, + "grad_norm": 0.4020158648490906, + "learning_rate": 0.00019934999993550154, + "loss": 0.9797, + "step": 835 + }, + { + "epoch": 0.14886039886039887, + "grad_norm": 0.4186473786830902, + "learning_rate": 0.0001993484056003128, + "loss": 1.1243, + "step": 836 + }, + { + "epoch": 0.14903846153846154, + "grad_norm": 0.5534794926643372, + "learning_rate": 0.00019934680931859842, + "loss": 1.1189, + "step": 837 + }, + { + "epoch": 0.1492165242165242, + "grad_norm": 0.37901270389556885, + "learning_rate": 0.0001993452110903897, + "loss": 0.9241, + "step": 838 + }, + { + "epoch": 
0.1493945868945869, + "grad_norm": 0.41773587465286255, + "learning_rate": 0.00019934361091571793, + "loss": 0.9467, + "step": 839 + }, + { + "epoch": 0.14957264957264957, + "grad_norm": 0.4962073564529419, + "learning_rate": 0.00019934200879461448, + "loss": 1.2423, + "step": 840 + }, + { + "epoch": 0.14975071225071226, + "grad_norm": 0.38565897941589355, + "learning_rate": 0.00019934040472711074, + "loss": 1.1545, + "step": 841 + }, + { + "epoch": 0.14992877492877493, + "grad_norm": 0.4295346736907959, + "learning_rate": 0.0001993387987132381, + "loss": 1.2482, + "step": 842 + }, + { + "epoch": 0.1501068376068376, + "grad_norm": 0.4279189705848694, + "learning_rate": 0.0001993371907530281, + "loss": 1.1135, + "step": 843 + }, + { + "epoch": 0.1502849002849003, + "grad_norm": 0.44649168848991394, + "learning_rate": 0.0001993355808465122, + "loss": 1.0734, + "step": 844 + }, + { + "epoch": 0.15046296296296297, + "grad_norm": 0.453707218170166, + "learning_rate": 0.0001993339689937219, + "loss": 1.0992, + "step": 845 + }, + { + "epoch": 0.15064102564102563, + "grad_norm": 0.5113263726234436, + "learning_rate": 0.00019933235519468886, + "loss": 1.1792, + "step": 846 + }, + { + "epoch": 0.15081908831908833, + "grad_norm": 0.5822970271110535, + "learning_rate": 0.00019933073944944466, + "loss": 1.367, + "step": 847 + }, + { + "epoch": 0.150997150997151, + "grad_norm": 0.3946528732776642, + "learning_rate": 0.00019932912175802097, + "loss": 0.9781, + "step": 848 + }, + { + "epoch": 0.15117521367521367, + "grad_norm": 0.5429860949516296, + "learning_rate": 0.00019932750212044945, + "loss": 0.9783, + "step": 849 + }, + { + "epoch": 0.15135327635327636, + "grad_norm": 0.45847952365875244, + "learning_rate": 0.0001993258805367619, + "loss": 1.1352, + "step": 850 + }, + { + "epoch": 0.15153133903133903, + "grad_norm": 0.42770692706108093, + "learning_rate": 0.00019932425700699004, + "loss": 1.2365, + "step": 851 + }, + { + "epoch": 0.1517094017094017, + "grad_norm": 
0.41845405101776123, + "learning_rate": 0.00019932263153116565, + "loss": 1.2642, + "step": 852 + }, + { + "epoch": 0.1518874643874644, + "grad_norm": 0.4641731083393097, + "learning_rate": 0.00019932100410932066, + "loss": 1.2009, + "step": 853 + }, + { + "epoch": 0.15206552706552706, + "grad_norm": 0.4128672778606415, + "learning_rate": 0.00019931937474148689, + "loss": 1.1981, + "step": 854 + }, + { + "epoch": 0.15224358974358973, + "grad_norm": 0.4730764925479889, + "learning_rate": 0.00019931774342769632, + "loss": 1.2145, + "step": 855 + }, + { + "epoch": 0.15242165242165243, + "grad_norm": 0.36611825227737427, + "learning_rate": 0.00019931611016798089, + "loss": 0.8504, + "step": 856 + }, + { + "epoch": 0.1525997150997151, + "grad_norm": 0.40944692492485046, + "learning_rate": 0.00019931447496237254, + "loss": 1.2853, + "step": 857 + }, + { + "epoch": 0.1527777777777778, + "grad_norm": 0.4521993398666382, + "learning_rate": 0.0001993128378109034, + "loss": 1.0198, + "step": 858 + }, + { + "epoch": 0.15295584045584046, + "grad_norm": 0.42113015055656433, + "learning_rate": 0.0001993111987136055, + "loss": 1.1284, + "step": 859 + }, + { + "epoch": 0.15313390313390313, + "grad_norm": 0.4117624759674072, + "learning_rate": 0.00019930955767051098, + "loss": 1.0445, + "step": 860 + }, + { + "epoch": 0.15331196581196582, + "grad_norm": 0.4807964265346527, + "learning_rate": 0.00019930791468165197, + "loss": 1.1378, + "step": 861 + }, + { + "epoch": 0.1534900284900285, + "grad_norm": 0.4186483323574066, + "learning_rate": 0.00019930626974706063, + "loss": 1.1636, + "step": 862 + }, + { + "epoch": 0.15366809116809116, + "grad_norm": 0.3764737844467163, + "learning_rate": 0.00019930462286676926, + "loss": 0.9523, + "step": 863 + }, + { + "epoch": 0.15384615384615385, + "grad_norm": 0.4283556044101715, + "learning_rate": 0.00019930297404081008, + "loss": 1.1008, + "step": 864 + }, + { + "epoch": 0.15402421652421652, + "grad_norm": 0.4485796093940735, + "learning_rate": 
0.00019930132326921541, + "loss": 1.0834, + "step": 865 + }, + { + "epoch": 0.1542022792022792, + "grad_norm": 0.3882720172405243, + "learning_rate": 0.0001992996705520176, + "loss": 1.1086, + "step": 866 + }, + { + "epoch": 0.1543803418803419, + "grad_norm": 0.44698455929756165, + "learning_rate": 0.00019929801588924902, + "loss": 1.1437, + "step": 867 + }, + { + "epoch": 0.15455840455840456, + "grad_norm": 0.46978411078453064, + "learning_rate": 0.00019929635928094208, + "loss": 1.091, + "step": 868 + }, + { + "epoch": 0.15473646723646722, + "grad_norm": 0.4717854857444763, + "learning_rate": 0.00019929470072712927, + "loss": 1.1959, + "step": 869 + }, + { + "epoch": 0.15491452991452992, + "grad_norm": 0.4324854016304016, + "learning_rate": 0.00019929304022784305, + "loss": 1.2062, + "step": 870 + }, + { + "epoch": 0.1550925925925926, + "grad_norm": 0.3948180675506592, + "learning_rate": 0.00019929137778311597, + "loss": 1.1101, + "step": 871 + }, + { + "epoch": 0.15527065527065528, + "grad_norm": 0.40345287322998047, + "learning_rate": 0.0001992897133929806, + "loss": 0.8894, + "step": 872 + }, + { + "epoch": 0.15544871794871795, + "grad_norm": 0.44931963086128235, + "learning_rate": 0.00019928804705746957, + "loss": 0.9389, + "step": 873 + }, + { + "epoch": 0.15562678062678062, + "grad_norm": 0.529196560382843, + "learning_rate": 0.0001992863787766155, + "loss": 1.3362, + "step": 874 + }, + { + "epoch": 0.15580484330484332, + "grad_norm": 0.41218671202659607, + "learning_rate": 0.0001992847085504511, + "loss": 1.0727, + "step": 875 + }, + { + "epoch": 0.15598290598290598, + "grad_norm": 0.44074541330337524, + "learning_rate": 0.00019928303637900907, + "loss": 1.1091, + "step": 876 + }, + { + "epoch": 0.15616096866096865, + "grad_norm": 0.5264310240745544, + "learning_rate": 0.00019928136226232218, + "loss": 1.201, + "step": 877 + }, + { + "epoch": 0.15633903133903135, + "grad_norm": 0.4255099594593048, + "learning_rate": 0.00019927968620042324, + "loss": 
1.2514, + "step": 878 + }, + { + "epoch": 0.15651709401709402, + "grad_norm": 0.4030280113220215, + "learning_rate": 0.0001992780081933451, + "loss": 1.0422, + "step": 879 + }, + { + "epoch": 0.15669515669515668, + "grad_norm": 0.5270203948020935, + "learning_rate": 0.00019927632824112058, + "loss": 1.2476, + "step": 880 + }, + { + "epoch": 0.15687321937321938, + "grad_norm": 0.37767237424850464, + "learning_rate": 0.00019927464634378268, + "loss": 1.0768, + "step": 881 + }, + { + "epoch": 0.15705128205128205, + "grad_norm": 0.4535936415195465, + "learning_rate": 0.0001992729625013643, + "loss": 1.2097, + "step": 882 + }, + { + "epoch": 0.15722934472934472, + "grad_norm": 0.4282119870185852, + "learning_rate": 0.00019927127671389843, + "loss": 1.0904, + "step": 883 + }, + { + "epoch": 0.1574074074074074, + "grad_norm": 0.3924157917499542, + "learning_rate": 0.0001992695889814181, + "loss": 0.9692, + "step": 884 + }, + { + "epoch": 0.15758547008547008, + "grad_norm": 0.525075376033783, + "learning_rate": 0.0001992678993039564, + "loss": 1.0292, + "step": 885 + }, + { + "epoch": 0.15776353276353278, + "grad_norm": 0.4388505518436432, + "learning_rate": 0.00019926620768154644, + "loss": 1.1944, + "step": 886 + }, + { + "epoch": 0.15794159544159544, + "grad_norm": 0.4362235963344574, + "learning_rate": 0.00019926451411422132, + "loss": 0.97, + "step": 887 + }, + { + "epoch": 0.1581196581196581, + "grad_norm": 0.4265296459197998, + "learning_rate": 0.0001992628186020143, + "loss": 0.9196, + "step": 888 + }, + { + "epoch": 0.1582977207977208, + "grad_norm": 0.4019876718521118, + "learning_rate": 0.0001992611211449585, + "loss": 1.1368, + "step": 889 + }, + { + "epoch": 0.15847578347578348, + "grad_norm": 0.5003397464752197, + "learning_rate": 0.00019925942174308726, + "loss": 1.2582, + "step": 890 + }, + { + "epoch": 0.15865384615384615, + "grad_norm": 0.4774404466152191, + "learning_rate": 0.00019925772039643382, + "loss": 1.2277, + "step": 891 + }, + { + "epoch": 
0.15883190883190884, + "grad_norm": 0.4590449333190918, + "learning_rate": 0.00019925601710503153, + "loss": 1.1679, + "step": 892 + }, + { + "epoch": 0.1590099715099715, + "grad_norm": 0.4221442639827728, + "learning_rate": 0.0001992543118689138, + "loss": 1.1626, + "step": 893 + }, + { + "epoch": 0.15918803418803418, + "grad_norm": 0.47613003849983215, + "learning_rate": 0.00019925260468811403, + "loss": 1.1509, + "step": 894 + }, + { + "epoch": 0.15936609686609687, + "grad_norm": 0.41706812381744385, + "learning_rate": 0.0001992508955626656, + "loss": 1.0366, + "step": 895 + }, + { + "epoch": 0.15954415954415954, + "grad_norm": 0.5064654350280762, + "learning_rate": 0.00019924918449260205, + "loss": 1.0729, + "step": 896 + }, + { + "epoch": 0.1597222222222222, + "grad_norm": 0.5019610524177551, + "learning_rate": 0.00019924747147795696, + "loss": 1.0642, + "step": 897 + }, + { + "epoch": 0.1599002849002849, + "grad_norm": 0.4345671534538269, + "learning_rate": 0.00019924575651876378, + "loss": 1.1747, + "step": 898 + }, + { + "epoch": 0.16007834757834757, + "grad_norm": 0.4397568702697754, + "learning_rate": 0.0001992440396150562, + "loss": 1.282, + "step": 899 + }, + { + "epoch": 0.16025641025641027, + "grad_norm": 0.520187497138977, + "learning_rate": 0.0001992423207668678, + "loss": 0.976, + "step": 900 + }, + { + "epoch": 0.16043447293447294, + "grad_norm": 0.39329993724823, + "learning_rate": 0.0001992405999742323, + "loss": 0.9829, + "step": 901 + }, + { + "epoch": 0.1606125356125356, + "grad_norm": 0.42361345887184143, + "learning_rate": 0.00019923887723718339, + "loss": 1.139, + "step": 902 + }, + { + "epoch": 0.1607905982905983, + "grad_norm": 0.3846314251422882, + "learning_rate": 0.00019923715255575482, + "loss": 0.8262, + "step": 903 + }, + { + "epoch": 0.16096866096866097, + "grad_norm": 0.39258381724357605, + "learning_rate": 0.0001992354259299804, + "loss": 0.9638, + "step": 904 + }, + { + "epoch": 0.16114672364672364, + "grad_norm": 
0.4000850319862366, + "learning_rate": 0.00019923369735989397, + "loss": 0.91, + "step": 905 + }, + { + "epoch": 0.16132478632478633, + "grad_norm": 0.46303513646125793, + "learning_rate": 0.00019923196684552936, + "loss": 1.1447, + "step": 906 + }, + { + "epoch": 0.161502849002849, + "grad_norm": 0.38437438011169434, + "learning_rate": 0.0001992302343869205, + "loss": 1.0212, + "step": 907 + }, + { + "epoch": 0.16168091168091167, + "grad_norm": 0.44585472345352173, + "learning_rate": 0.00019922849998410135, + "loss": 1.1964, + "step": 908 + }, + { + "epoch": 0.16185897435897437, + "grad_norm": 0.41959813237190247, + "learning_rate": 0.00019922676363710583, + "loss": 0.9925, + "step": 909 + }, + { + "epoch": 0.16203703703703703, + "grad_norm": 0.47442761063575745, + "learning_rate": 0.00019922502534596803, + "loss": 0.9237, + "step": 910 + }, + { + "epoch": 0.1622150997150997, + "grad_norm": 0.5065128207206726, + "learning_rate": 0.00019922328511072198, + "loss": 1.2573, + "step": 911 + }, + { + "epoch": 0.1623931623931624, + "grad_norm": 0.4739879369735718, + "learning_rate": 0.0001992215429314018, + "loss": 1.4416, + "step": 912 + }, + { + "epoch": 0.16257122507122507, + "grad_norm": 0.48763832449913025, + "learning_rate": 0.00019921979880804157, + "loss": 1.0408, + "step": 913 + }, + { + "epoch": 0.16274928774928774, + "grad_norm": 0.4841614067554474, + "learning_rate": 0.0001992180527406755, + "loss": 1.1826, + "step": 914 + }, + { + "epoch": 0.16292735042735043, + "grad_norm": 0.49433308839797974, + "learning_rate": 0.0001992163047293378, + "loss": 1.3552, + "step": 915 + }, + { + "epoch": 0.1631054131054131, + "grad_norm": 0.4985002875328064, + "learning_rate": 0.0001992145547740627, + "loss": 1.2639, + "step": 916 + }, + { + "epoch": 0.1632834757834758, + "grad_norm": 0.40348032116889954, + "learning_rate": 0.00019921280287488448, + "loss": 1.1731, + "step": 917 + }, + { + "epoch": 0.16346153846153846, + "grad_norm": 0.5166002511978149, + "learning_rate": 
0.0001992110490318375, + "loss": 1.0692, + "step": 918 + }, + { + "epoch": 0.16363960113960113, + "grad_norm": 0.44233468174934387, + "learning_rate": 0.00019920929324495615, + "loss": 1.0488, + "step": 919 + }, + { + "epoch": 0.16381766381766383, + "grad_norm": 0.43709903955459595, + "learning_rate": 0.00019920753551427476, + "loss": 0.8884, + "step": 920 + }, + { + "epoch": 0.1639957264957265, + "grad_norm": 0.4054167568683624, + "learning_rate": 0.00019920577583982778, + "loss": 0.9872, + "step": 921 + }, + { + "epoch": 0.16417378917378916, + "grad_norm": 0.4657362997531891, + "learning_rate": 0.0001992040142216497, + "loss": 1.4402, + "step": 922 + }, + { + "epoch": 0.16435185185185186, + "grad_norm": 0.42550426721572876, + "learning_rate": 0.0001992022506597751, + "loss": 1.0456, + "step": 923 + }, + { + "epoch": 0.16452991452991453, + "grad_norm": 0.49346762895584106, + "learning_rate": 0.00019920048515423842, + "loss": 1.527, + "step": 924 + }, + { + "epoch": 0.1647079772079772, + "grad_norm": 0.3970337510108948, + "learning_rate": 0.0001991987177050743, + "loss": 1.0363, + "step": 925 + }, + { + "epoch": 0.1648860398860399, + "grad_norm": 0.4027378559112549, + "learning_rate": 0.0001991969483123174, + "loss": 0.8416, + "step": 926 + }, + { + "epoch": 0.16506410256410256, + "grad_norm": 0.4181644916534424, + "learning_rate": 0.00019919517697600237, + "loss": 1.2253, + "step": 927 + }, + { + "epoch": 0.16524216524216523, + "grad_norm": 0.43686383962631226, + "learning_rate": 0.0001991934036961639, + "loss": 1.0808, + "step": 928 + }, + { + "epoch": 0.16542022792022792, + "grad_norm": 0.4242876172065735, + "learning_rate": 0.0001991916284728367, + "loss": 0.9483, + "step": 929 + }, + { + "epoch": 0.1655982905982906, + "grad_norm": 0.3690609037876129, + "learning_rate": 0.00019918985130605563, + "loss": 0.9495, + "step": 930 + }, + { + "epoch": 0.1657763532763533, + "grad_norm": 0.42184555530548096, + "learning_rate": 0.00019918807219585546, + "loss": 1.0966, + 
"step": 931 + }, + { + "epoch": 0.16595441595441596, + "grad_norm": 0.4342746138572693, + "learning_rate": 0.00019918629114227106, + "loss": 1.0875, + "step": 932 + }, + { + "epoch": 0.16613247863247863, + "grad_norm": 0.4191494286060333, + "learning_rate": 0.00019918450814533737, + "loss": 1.0777, + "step": 933 + }, + { + "epoch": 0.16631054131054132, + "grad_norm": 0.37124550342559814, + "learning_rate": 0.00019918272320508922, + "loss": 1.0131, + "step": 934 + }, + { + "epoch": 0.166488603988604, + "grad_norm": 0.4475722014904022, + "learning_rate": 0.00019918093632156168, + "loss": 1.1185, + "step": 935 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 0.4629058241844177, + "learning_rate": 0.0001991791474947897, + "loss": 1.0353, + "step": 936 + }, + { + "epoch": 0.16684472934472935, + "grad_norm": 0.48192909359931946, + "learning_rate": 0.00019917735672480834, + "loss": 1.1628, + "step": 937 + }, + { + "epoch": 0.16702279202279202, + "grad_norm": 0.5542252063751221, + "learning_rate": 0.00019917556401165273, + "loss": 1.3133, + "step": 938 + }, + { + "epoch": 0.1672008547008547, + "grad_norm": 0.4172651171684265, + "learning_rate": 0.00019917376935535796, + "loss": 1.1733, + "step": 939 + }, + { + "epoch": 0.16737891737891739, + "grad_norm": 0.4424920380115509, + "learning_rate": 0.0001991719727559592, + "loss": 1.0262, + "step": 940 + }, + { + "epoch": 0.16755698005698005, + "grad_norm": 0.4551742970943451, + "learning_rate": 0.00019917017421349162, + "loss": 1.0883, + "step": 941 + }, + { + "epoch": 0.16773504273504272, + "grad_norm": 0.45929640531539917, + "learning_rate": 0.00019916837372799048, + "loss": 1.1836, + "step": 942 + }, + { + "epoch": 0.16791310541310542, + "grad_norm": 0.4609353542327881, + "learning_rate": 0.0001991665712994911, + "loss": 1.0682, + "step": 943 + }, + { + "epoch": 0.16809116809116809, + "grad_norm": 0.42617303133010864, + "learning_rate": 0.00019916476692802873, + "loss": 1.074, + "step": 944 + }, + { + "epoch": 
0.16826923076923078, + "grad_norm": 0.41919493675231934, + "learning_rate": 0.00019916296061363875, + "loss": 1.0969, + "step": 945 + }, + { + "epoch": 0.16844729344729345, + "grad_norm": 0.450979083776474, + "learning_rate": 0.00019916115235635656, + "loss": 1.1686, + "step": 946 + }, + { + "epoch": 0.16862535612535612, + "grad_norm": 0.42166751623153687, + "learning_rate": 0.00019915934215621758, + "loss": 0.9273, + "step": 947 + }, + { + "epoch": 0.16880341880341881, + "grad_norm": 0.4404160976409912, + "learning_rate": 0.00019915753001325729, + "loss": 1.1663, + "step": 948 + }, + { + "epoch": 0.16898148148148148, + "grad_norm": 0.42025226354599, + "learning_rate": 0.0001991557159275111, + "loss": 0.9433, + "step": 949 + }, + { + "epoch": 0.16915954415954415, + "grad_norm": 0.4277796745300293, + "learning_rate": 0.00019915389989901474, + "loss": 0.8475, + "step": 950 + }, + { + "epoch": 0.16933760683760685, + "grad_norm": 0.5162755250930786, + "learning_rate": 0.00019915208192780365, + "loss": 1.1155, + "step": 951 + }, + { + "epoch": 0.16951566951566951, + "grad_norm": 0.4214856028556824, + "learning_rate": 0.00019915026201391346, + "loss": 1.173, + "step": 952 + }, + { + "epoch": 0.16969373219373218, + "grad_norm": 0.4713292419910431, + "learning_rate": 0.00019914844015737985, + "loss": 1.1615, + "step": 953 + }, + { + "epoch": 0.16987179487179488, + "grad_norm": 0.461179256439209, + "learning_rate": 0.00019914661635823854, + "loss": 1.1169, + "step": 954 + }, + { + "epoch": 0.17004985754985755, + "grad_norm": 0.46200552582740784, + "learning_rate": 0.00019914479061652527, + "loss": 1.0274, + "step": 955 + }, + { + "epoch": 0.17022792022792022, + "grad_norm": 0.40968334674835205, + "learning_rate": 0.00019914296293227572, + "loss": 1.066, + "step": 956 + }, + { + "epoch": 0.1704059829059829, + "grad_norm": 0.40877434611320496, + "learning_rate": 0.0001991411333055258, + "loss": 1.1595, + "step": 957 + }, + { + "epoch": 0.17058404558404558, + "grad_norm": 
0.42940187454223633, + "learning_rate": 0.00019913930173631132, + "loss": 1.0364, + "step": 958 + }, + { + "epoch": 0.17076210826210828, + "grad_norm": 0.49648910760879517, + "learning_rate": 0.00019913746822466819, + "loss": 1.0763, + "step": 959 + }, + { + "epoch": 0.17094017094017094, + "grad_norm": 0.4353426396846771, + "learning_rate": 0.00019913563277063228, + "loss": 0.9698, + "step": 960 + }, + { + "epoch": 0.1711182336182336, + "grad_norm": 0.45079681277275085, + "learning_rate": 0.00019913379537423958, + "loss": 1.2244, + "step": 961 + }, + { + "epoch": 0.1712962962962963, + "grad_norm": 0.4276828467845917, + "learning_rate": 0.00019913195603552607, + "loss": 0.9976, + "step": 962 + }, + { + "epoch": 0.17147435897435898, + "grad_norm": 0.41122403740882874, + "learning_rate": 0.00019913011475452785, + "loss": 1.0077, + "step": 963 + }, + { + "epoch": 0.17165242165242164, + "grad_norm": 0.43170276284217834, + "learning_rate": 0.00019912827153128096, + "loss": 1.1402, + "step": 964 + }, + { + "epoch": 0.17183048433048434, + "grad_norm": 0.37950268387794495, + "learning_rate": 0.0001991264263658215, + "loss": 0.9818, + "step": 965 + }, + { + "epoch": 0.172008547008547, + "grad_norm": 0.477333128452301, + "learning_rate": 0.00019912457925818562, + "loss": 1.1756, + "step": 966 + }, + { + "epoch": 0.17218660968660968, + "grad_norm": 0.4326401352882385, + "learning_rate": 0.00019912273020840954, + "loss": 1.3718, + "step": 967 + }, + { + "epoch": 0.17236467236467237, + "grad_norm": 0.37711042165756226, + "learning_rate": 0.00019912087921652945, + "loss": 0.9011, + "step": 968 + }, + { + "epoch": 0.17254273504273504, + "grad_norm": 0.50013667345047, + "learning_rate": 0.00019911902628258162, + "loss": 1.1163, + "step": 969 + }, + { + "epoch": 0.1727207977207977, + "grad_norm": 0.41913339495658875, + "learning_rate": 0.0001991171714066024, + "loss": 1.2614, + "step": 970 + }, + { + "epoch": 0.1728988603988604, + "grad_norm": 0.4075855612754822, + "learning_rate": 
0.00019911531458862813, + "loss": 0.8984, + "step": 971 + }, + { + "epoch": 0.17307692307692307, + "grad_norm": 0.40277954936027527, + "learning_rate": 0.00019911345582869513, + "loss": 1.0851, + "step": 972 + }, + { + "epoch": 0.17325498575498577, + "grad_norm": 0.4312847852706909, + "learning_rate": 0.00019911159512683987, + "loss": 1.1273, + "step": 973 + }, + { + "epoch": 0.17343304843304844, + "grad_norm": 0.40303611755371094, + "learning_rate": 0.0001991097324830988, + "loss": 0.9645, + "step": 974 + }, + { + "epoch": 0.1736111111111111, + "grad_norm": 0.45560577511787415, + "learning_rate": 0.00019910786789750838, + "loss": 1.0864, + "step": 975 + }, + { + "epoch": 0.1737891737891738, + "grad_norm": 0.43775680661201477, + "learning_rate": 0.00019910600137010517, + "loss": 1.028, + "step": 976 + }, + { + "epoch": 0.17396723646723647, + "grad_norm": 0.3917224407196045, + "learning_rate": 0.00019910413290092572, + "loss": 1.0491, + "step": 977 + }, + { + "epoch": 0.17414529914529914, + "grad_norm": 0.4068751037120819, + "learning_rate": 0.0001991022624900067, + "loss": 1.0476, + "step": 978 + }, + { + "epoch": 0.17432336182336183, + "grad_norm": 0.4463370144367218, + "learning_rate": 0.0001991003901373847, + "loss": 1.0612, + "step": 979 + }, + { + "epoch": 0.1745014245014245, + "grad_norm": 0.46949052810668945, + "learning_rate": 0.0001990985158430964, + "loss": 1.3099, + "step": 980 + }, + { + "epoch": 0.17467948717948717, + "grad_norm": 0.4250012934207916, + "learning_rate": 0.00019909663960717856, + "loss": 0.9903, + "step": 981 + }, + { + "epoch": 0.17485754985754987, + "grad_norm": 0.5293903946876526, + "learning_rate": 0.0001990947614296679, + "loss": 0.9908, + "step": 982 + }, + { + "epoch": 0.17503561253561253, + "grad_norm": 0.3838284909725189, + "learning_rate": 0.0001990928813106013, + "loss": 0.716, + "step": 983 + }, + { + "epoch": 0.1752136752136752, + "grad_norm": 0.4597751200199127, + "learning_rate": 0.0001990909992500155, + "loss": 1.0126, + 
"step": 984 + }, + { + "epoch": 0.1753917378917379, + "grad_norm": 0.4844081699848175, + "learning_rate": 0.0001990891152479474, + "loss": 1.1043, + "step": 985 + }, + { + "epoch": 0.17556980056980057, + "grad_norm": 0.4763399660587311, + "learning_rate": 0.00019908722930443392, + "loss": 1.019, + "step": 986 + }, + { + "epoch": 0.17574786324786323, + "grad_norm": 0.4670077860355377, + "learning_rate": 0.00019908534141951204, + "loss": 1.1382, + "step": 987 + }, + { + "epoch": 0.17592592592592593, + "grad_norm": 0.39372730255126953, + "learning_rate": 0.00019908345159321873, + "loss": 1.1219, + "step": 988 + }, + { + "epoch": 0.1761039886039886, + "grad_norm": 0.41869843006134033, + "learning_rate": 0.00019908155982559098, + "loss": 0.9461, + "step": 989 + }, + { + "epoch": 0.1762820512820513, + "grad_norm": 0.4398406147956848, + "learning_rate": 0.00019907966611666593, + "loss": 1.1328, + "step": 990 + }, + { + "epoch": 0.17646011396011396, + "grad_norm": 0.4315733015537262, + "learning_rate": 0.0001990777704664806, + "loss": 1.0974, + "step": 991 + }, + { + "epoch": 0.17663817663817663, + "grad_norm": 0.42859575152397156, + "learning_rate": 0.00019907587287507222, + "loss": 1.2637, + "step": 992 + }, + { + "epoch": 0.17681623931623933, + "grad_norm": 0.47928622364997864, + "learning_rate": 0.0001990739733424779, + "loss": 1.0699, + "step": 993 + }, + { + "epoch": 0.176994301994302, + "grad_norm": 0.4443826973438263, + "learning_rate": 0.00019907207186873488, + "loss": 1.0547, + "step": 994 + }, + { + "epoch": 0.17717236467236466, + "grad_norm": 0.4108099937438965, + "learning_rate": 0.00019907016845388043, + "loss": 1.1401, + "step": 995 + }, + { + "epoch": 0.17735042735042736, + "grad_norm": 0.4474675953388214, + "learning_rate": 0.00019906826309795182, + "loss": 1.0712, + "step": 996 + }, + { + "epoch": 0.17752849002849003, + "grad_norm": 0.4149756133556366, + "learning_rate": 0.00019906635580098638, + "loss": 0.9585, + "step": 997 + }, + { + "epoch": 
0.1777065527065527, + "grad_norm": 0.4875968098640442, + "learning_rate": 0.00019906444656302152, + "loss": 1.0659, + "step": 998 + }, + { + "epoch": 0.1778846153846154, + "grad_norm": 0.5494784116744995, + "learning_rate": 0.0001990625353840946, + "loss": 1.2858, + "step": 999 + }, + { + "epoch": 0.17806267806267806, + "grad_norm": 0.425062358379364, + "learning_rate": 0.0001990606222642431, + "loss": 1.1826, + "step": 1000 + }, + { + "epoch": 0.17824074074074073, + "grad_norm": 0.3890725374221802, + "learning_rate": 0.00019905870720350445, + "loss": 0.9568, + "step": 1001 + }, + { + "epoch": 0.17841880341880342, + "grad_norm": 0.3884070813655853, + "learning_rate": 0.00019905679020191624, + "loss": 0.9674, + "step": 1002 + }, + { + "epoch": 0.1785968660968661, + "grad_norm": 0.49496129155158997, + "learning_rate": 0.00019905487125951597, + "loss": 0.9143, + "step": 1003 + }, + { + "epoch": 0.1787749287749288, + "grad_norm": 0.43448135256767273, + "learning_rate": 0.00019905295037634128, + "loss": 1.2677, + "step": 1004 + }, + { + "epoch": 0.17895299145299146, + "grad_norm": 0.47327905893325806, + "learning_rate": 0.00019905102755242982, + "loss": 0.9089, + "step": 1005 + }, + { + "epoch": 0.17913105413105412, + "grad_norm": 0.4962378442287445, + "learning_rate": 0.00019904910278781922, + "loss": 1.1748, + "step": 1006 + }, + { + "epoch": 0.17930911680911682, + "grad_norm": 0.4343934655189514, + "learning_rate": 0.0001990471760825472, + "loss": 1.2176, + "step": 1007 + }, + { + "epoch": 0.1794871794871795, + "grad_norm": 0.4695793092250824, + "learning_rate": 0.0001990452474366515, + "loss": 1.1822, + "step": 1008 + }, + { + "epoch": 0.17966524216524216, + "grad_norm": 0.4156060516834259, + "learning_rate": 0.00019904331685016995, + "loss": 0.8231, + "step": 1009 + }, + { + "epoch": 0.17984330484330485, + "grad_norm": 0.5068191885948181, + "learning_rate": 0.00019904138432314035, + "loss": 1.1363, + "step": 1010 + }, + { + "epoch": 0.18002136752136752, + 
"grad_norm": 0.5189786553382874, + "learning_rate": 0.00019903944985560058, + "loss": 1.3131, + "step": 1011 + }, + { + "epoch": 0.1801994301994302, + "grad_norm": 0.5126828551292419, + "learning_rate": 0.00019903751344758848, + "loss": 1.0305, + "step": 1012 + }, + { + "epoch": 0.18037749287749288, + "grad_norm": 0.41045933961868286, + "learning_rate": 0.00019903557509914205, + "loss": 1.2726, + "step": 1013 + }, + { + "epoch": 0.18055555555555555, + "grad_norm": 0.4141713082790375, + "learning_rate": 0.0001990336348102993, + "loss": 0.9606, + "step": 1014 + }, + { + "epoch": 0.18073361823361822, + "grad_norm": 0.42652079463005066, + "learning_rate": 0.00019903169258109812, + "loss": 1.0235, + "step": 1015 + }, + { + "epoch": 0.18091168091168092, + "grad_norm": 0.42098379135131836, + "learning_rate": 0.0001990297484115767, + "loss": 1.0602, + "step": 1016 + }, + { + "epoch": 0.18108974358974358, + "grad_norm": 0.49920013546943665, + "learning_rate": 0.0001990278023017731, + "loss": 1.3322, + "step": 1017 + }, + { + "epoch": 0.18126780626780628, + "grad_norm": 0.412304550409317, + "learning_rate": 0.00019902585425172537, + "loss": 1.1011, + "step": 1018 + }, + { + "epoch": 0.18144586894586895, + "grad_norm": 0.44226935505867004, + "learning_rate": 0.00019902390426147177, + "loss": 0.9777, + "step": 1019 + }, + { + "epoch": 0.18162393162393162, + "grad_norm": 0.4685269594192505, + "learning_rate": 0.00019902195233105046, + "loss": 1.3587, + "step": 1020 + }, + { + "epoch": 0.1818019943019943, + "grad_norm": 0.4500584304332733, + "learning_rate": 0.00019901999846049968, + "loss": 0.9888, + "step": 1021 + }, + { + "epoch": 0.18198005698005698, + "grad_norm": 0.48566994071006775, + "learning_rate": 0.00019901804264985774, + "loss": 1.2364, + "step": 1022 + }, + { + "epoch": 0.18215811965811965, + "grad_norm": 0.4063156247138977, + "learning_rate": 0.00019901608489916294, + "loss": 1.2224, + "step": 1023 + }, + { + "epoch": 0.18233618233618235, + "grad_norm": 
0.471276193857193, + "learning_rate": 0.00019901412520845367, + "loss": 0.9926, + "step": 1024 + }, + { + "epoch": 0.182514245014245, + "grad_norm": 0.5165421366691589, + "learning_rate": 0.00019901216357776829, + "loss": 0.9595, + "step": 1025 + }, + { + "epoch": 0.18269230769230768, + "grad_norm": 0.4746754467487335, + "learning_rate": 0.0001990102000071452, + "loss": 1.2057, + "step": 1026 + }, + { + "epoch": 0.18287037037037038, + "grad_norm": 0.44803035259246826, + "learning_rate": 0.00019900823449662297, + "loss": 1.2114, + "step": 1027 + }, + { + "epoch": 0.18304843304843305, + "grad_norm": 0.47256240248680115, + "learning_rate": 0.00019900626704624005, + "loss": 1.112, + "step": 1028 + }, + { + "epoch": 0.18322649572649571, + "grad_norm": 0.4253387153148651, + "learning_rate": 0.000199004297656035, + "loss": 0.9899, + "step": 1029 + }, + { + "epoch": 0.1834045584045584, + "grad_norm": 0.44958099722862244, + "learning_rate": 0.00019900232632604636, + "loss": 1.1445, + "step": 1030 + }, + { + "epoch": 0.18358262108262108, + "grad_norm": 0.5296537280082703, + "learning_rate": 0.00019900035305631285, + "loss": 1.2502, + "step": 1031 + }, + { + "epoch": 0.18376068376068377, + "grad_norm": 0.5057148933410645, + "learning_rate": 0.00019899837784687302, + "loss": 1.1426, + "step": 1032 + }, + { + "epoch": 0.18393874643874644, + "grad_norm": 0.41463762521743774, + "learning_rate": 0.00019899640069776566, + "loss": 1.1854, + "step": 1033 + }, + { + "epoch": 0.1841168091168091, + "grad_norm": 0.45800045132637024, + "learning_rate": 0.00019899442160902945, + "loss": 1.2438, + "step": 1034 + }, + { + "epoch": 0.1842948717948718, + "grad_norm": 0.43450453877449036, + "learning_rate": 0.00019899244058070324, + "loss": 1.0598, + "step": 1035 + }, + { + "epoch": 0.18447293447293447, + "grad_norm": 0.4141148626804352, + "learning_rate": 0.00019899045761282577, + "loss": 1.0465, + "step": 1036 + }, + { + "epoch": 0.18465099715099714, + "grad_norm": 0.3938458263874054, + 
"learning_rate": 0.0001989884727054359, + "loss": 1.0142, + "step": 1037 + }, + { + "epoch": 0.18482905982905984, + "grad_norm": 0.43898263573646545, + "learning_rate": 0.00019898648585857257, + "loss": 0.9212, + "step": 1038 + }, + { + "epoch": 0.1850071225071225, + "grad_norm": 0.4425487816333771, + "learning_rate": 0.00019898449707227465, + "loss": 1.2987, + "step": 1039 + }, + { + "epoch": 0.18518518518518517, + "grad_norm": 0.4537975490093231, + "learning_rate": 0.00019898250634658115, + "loss": 1.2023, + "step": 1040 + }, + { + "epoch": 0.18536324786324787, + "grad_norm": 0.4107198119163513, + "learning_rate": 0.00019898051368153104, + "loss": 0.8443, + "step": 1041 + }, + { + "epoch": 0.18554131054131054, + "grad_norm": 0.4389404058456421, + "learning_rate": 0.0001989785190771634, + "loss": 1.0502, + "step": 1042 + }, + { + "epoch": 0.1857193732193732, + "grad_norm": 0.4288824796676636, + "learning_rate": 0.00019897652253351726, + "loss": 1.01, + "step": 1043 + }, + { + "epoch": 0.1858974358974359, + "grad_norm": 0.50815349817276, + "learning_rate": 0.00019897452405063178, + "loss": 1.0308, + "step": 1044 + }, + { + "epoch": 0.18607549857549857, + "grad_norm": 0.45252710580825806, + "learning_rate": 0.0001989725236285461, + "loss": 1.0967, + "step": 1045 + }, + { + "epoch": 0.18625356125356127, + "grad_norm": 0.45049402117729187, + "learning_rate": 0.00019897052126729943, + "loss": 1.0141, + "step": 1046 + }, + { + "epoch": 0.18643162393162394, + "grad_norm": 0.49637508392333984, + "learning_rate": 0.00019896851696693098, + "loss": 1.0997, + "step": 1047 + }, + { + "epoch": 0.1866096866096866, + "grad_norm": 0.4465886056423187, + "learning_rate": 0.00019896651072748005, + "loss": 1.1415, + "step": 1048 + }, + { + "epoch": 0.1867877492877493, + "grad_norm": 0.5309500694274902, + "learning_rate": 0.00019896450254898592, + "loss": 1.1028, + "step": 1049 + }, + { + "epoch": 0.18696581196581197, + "grad_norm": 0.3516653776168823, + "learning_rate": 
0.00019896249243148793, + "loss": 0.9841, + "step": 1050 + }, + { + "epoch": 0.18714387464387464, + "grad_norm": 0.4529176950454712, + "learning_rate": 0.0001989604803750255, + "loss": 1.1335, + "step": 1051 + }, + { + "epoch": 0.18732193732193733, + "grad_norm": 0.47694942355155945, + "learning_rate": 0.000198958466379638, + "loss": 1.2383, + "step": 1052 + }, + { + "epoch": 0.1875, + "grad_norm": 0.5524206757545471, + "learning_rate": 0.0001989564504453649, + "loss": 1.3668, + "step": 1053 + }, + { + "epoch": 0.18767806267806267, + "grad_norm": 0.39203691482543945, + "learning_rate": 0.00019895443257224576, + "loss": 1.2203, + "step": 1054 + }, + { + "epoch": 0.18785612535612536, + "grad_norm": 0.4164120852947235, + "learning_rate": 0.00019895241276032005, + "loss": 0.8954, + "step": 1055 + }, + { + "epoch": 0.18803418803418803, + "grad_norm": 0.41217970848083496, + "learning_rate": 0.0001989503910096274, + "loss": 1.0238, + "step": 1056 + }, + { + "epoch": 0.1882122507122507, + "grad_norm": 0.44038307666778564, + "learning_rate": 0.00019894836732020735, + "loss": 0.8159, + "step": 1057 + }, + { + "epoch": 0.1883903133903134, + "grad_norm": 0.45780670642852783, + "learning_rate": 0.0001989463416920996, + "loss": 1.2864, + "step": 1058 + }, + { + "epoch": 0.18856837606837606, + "grad_norm": 0.5197559595108032, + "learning_rate": 0.00019894431412534384, + "loss": 1.0756, + "step": 1059 + }, + { + "epoch": 0.18874643874643873, + "grad_norm": 0.43283385038375854, + "learning_rate": 0.00019894228461997979, + "loss": 1.0642, + "step": 1060 + }, + { + "epoch": 0.18892450142450143, + "grad_norm": 0.4657376706600189, + "learning_rate": 0.00019894025317604717, + "loss": 1.1159, + "step": 1061 + }, + { + "epoch": 0.1891025641025641, + "grad_norm": 0.4474908113479614, + "learning_rate": 0.00019893821979358588, + "loss": 1.2006, + "step": 1062 + }, + { + "epoch": 0.1892806267806268, + "grad_norm": 0.43878164887428284, + "learning_rate": 0.00019893618447263566, + "loss": 
1.1599, + "step": 1063 + }, + { + "epoch": 0.18945868945868946, + "grad_norm": 0.4598735272884369, + "learning_rate": 0.00019893414721323645, + "loss": 1.3346, + "step": 1064 + }, + { + "epoch": 0.18963675213675213, + "grad_norm": 0.3947420120239258, + "learning_rate": 0.00019893210801542812, + "loss": 1.1201, + "step": 1065 + }, + { + "epoch": 0.18981481481481483, + "grad_norm": 0.3401558995246887, + "learning_rate": 0.00019893006687925064, + "loss": 0.7568, + "step": 1066 + }, + { + "epoch": 0.1899928774928775, + "grad_norm": 0.4400341808795929, + "learning_rate": 0.00019892802380474405, + "loss": 1.1706, + "step": 1067 + }, + { + "epoch": 0.19017094017094016, + "grad_norm": 0.42394164204597473, + "learning_rate": 0.00019892597879194829, + "loss": 1.0163, + "step": 1068 + }, + { + "epoch": 0.19034900284900286, + "grad_norm": 0.42904096841812134, + "learning_rate": 0.00019892393184090353, + "loss": 0.9193, + "step": 1069 + }, + { + "epoch": 0.19052706552706553, + "grad_norm": 0.497601181268692, + "learning_rate": 0.00019892188295164977, + "loss": 1.0377, + "step": 1070 + }, + { + "epoch": 0.1907051282051282, + "grad_norm": 0.4536020755767822, + "learning_rate": 0.00019891983212422723, + "loss": 1.0946, + "step": 1071 + }, + { + "epoch": 0.1908831908831909, + "grad_norm": 0.44916942715644836, + "learning_rate": 0.00019891777935867607, + "loss": 1.0563, + "step": 1072 + }, + { + "epoch": 0.19106125356125356, + "grad_norm": 0.4256889820098877, + "learning_rate": 0.0001989157246550365, + "loss": 1.0988, + "step": 1073 + }, + { + "epoch": 0.19123931623931623, + "grad_norm": 0.5559163689613342, + "learning_rate": 0.0001989136680133488, + "loss": 0.9155, + "step": 1074 + }, + { + "epoch": 0.19141737891737892, + "grad_norm": 0.391804963350296, + "learning_rate": 0.00019891160943365322, + "loss": 0.9314, + "step": 1075 + }, + { + "epoch": 0.1915954415954416, + "grad_norm": 0.4535716474056244, + "learning_rate": 0.00019890954891599015, + "loss": 1.0768, + "step": 1076 + }, 
+ { + "epoch": 0.19177350427350429, + "grad_norm": 0.46770521998405457, + "learning_rate": 0.00019890748646039991, + "loss": 0.8406, + "step": 1077 + }, + { + "epoch": 0.19195156695156695, + "grad_norm": 0.4875394403934479, + "learning_rate": 0.00019890542206692295, + "loss": 1.1055, + "step": 1078 + }, + { + "epoch": 0.19212962962962962, + "grad_norm": 0.5072727203369141, + "learning_rate": 0.0001989033557355997, + "loss": 1.3093, + "step": 1079 + }, + { + "epoch": 0.19230769230769232, + "grad_norm": 0.4419287443161011, + "learning_rate": 0.00019890128746647068, + "loss": 1.1916, + "step": 1080 + }, + { + "epoch": 0.192485754985755, + "grad_norm": 0.45803651213645935, + "learning_rate": 0.00019889921725957637, + "loss": 1.2579, + "step": 1081 + }, + { + "epoch": 0.19266381766381765, + "grad_norm": 0.4832262098789215, + "learning_rate": 0.0001988971451149573, + "loss": 1.3217, + "step": 1082 + }, + { + "epoch": 0.19284188034188035, + "grad_norm": 0.4819786250591278, + "learning_rate": 0.00019889507103265416, + "loss": 1.0979, + "step": 1083 + }, + { + "epoch": 0.19301994301994302, + "grad_norm": 0.49360713362693787, + "learning_rate": 0.0001988929950127075, + "loss": 1.0987, + "step": 1084 + }, + { + "epoch": 0.1931980056980057, + "grad_norm": 0.44209200143814087, + "learning_rate": 0.00019889091705515806, + "loss": 1.2616, + "step": 1085 + }, + { + "epoch": 0.19337606837606838, + "grad_norm": 0.41626206040382385, + "learning_rate": 0.00019888883716004654, + "loss": 1.0922, + "step": 1086 + }, + { + "epoch": 0.19355413105413105, + "grad_norm": 0.4916635751724243, + "learning_rate": 0.00019888675532741366, + "loss": 0.9331, + "step": 1087 + }, + { + "epoch": 0.19373219373219372, + "grad_norm": 0.4493125379085541, + "learning_rate": 0.00019888467155730025, + "loss": 1.1261, + "step": 1088 + }, + { + "epoch": 0.19391025641025642, + "grad_norm": 0.3755671977996826, + "learning_rate": 0.00019888258584974708, + "loss": 0.9821, + "step": 1089 + }, + { + "epoch": 
0.19408831908831908, + "grad_norm": 0.41917556524276733, + "learning_rate": 0.00019888049820479507, + "loss": 1.251, + "step": 1090 + }, + { + "epoch": 0.19426638176638178, + "grad_norm": 0.46184420585632324, + "learning_rate": 0.0001988784086224851, + "loss": 1.1731, + "step": 1091 + }, + { + "epoch": 0.19444444444444445, + "grad_norm": 0.4783691465854645, + "learning_rate": 0.00019887631710285812, + "loss": 1.1635, + "step": 1092 + }, + { + "epoch": 0.19462250712250712, + "grad_norm": 0.4710482060909271, + "learning_rate": 0.00019887422364595512, + "loss": 1.0229, + "step": 1093 + }, + { + "epoch": 0.1948005698005698, + "grad_norm": 0.4738706648349762, + "learning_rate": 0.00019887212825181707, + "loss": 1.128, + "step": 1094 + }, + { + "epoch": 0.19497863247863248, + "grad_norm": 0.45665010809898376, + "learning_rate": 0.00019887003092048508, + "loss": 1.0425, + "step": 1095 + }, + { + "epoch": 0.19515669515669515, + "grad_norm": 0.42740485072135925, + "learning_rate": 0.0001988679316520002, + "loss": 1.0738, + "step": 1096 + }, + { + "epoch": 0.19533475783475784, + "grad_norm": 0.5977092385292053, + "learning_rate": 0.0001988658304464036, + "loss": 1.2687, + "step": 1097 + }, + { + "epoch": 0.1955128205128205, + "grad_norm": 0.4411074221134186, + "learning_rate": 0.0001988637273037364, + "loss": 1.287, + "step": 1098 + }, + { + "epoch": 0.19569088319088318, + "grad_norm": 0.4409518539905548, + "learning_rate": 0.00019886162222403986, + "loss": 1.0515, + "step": 1099 + }, + { + "epoch": 0.19586894586894588, + "grad_norm": 0.4926736652851105, + "learning_rate": 0.0001988595152073552, + "loss": 1.1388, + "step": 1100 + }, + { + "epoch": 0.19604700854700854, + "grad_norm": 0.4607115387916565, + "learning_rate": 0.00019885740625372368, + "loss": 0.9803, + "step": 1101 + }, + { + "epoch": 0.1962250712250712, + "grad_norm": 0.4725342094898224, + "learning_rate": 0.0001988552953631867, + "loss": 1.199, + "step": 1102 + }, + { + "epoch": 0.1964031339031339, + 
"grad_norm": 0.48014503717422485, + "learning_rate": 0.00019885318253578548, + "loss": 1.1868, + "step": 1103 + }, + { + "epoch": 0.19658119658119658, + "grad_norm": 0.3872644603252411, + "learning_rate": 0.00019885106777156155, + "loss": 0.9182, + "step": 1104 + }, + { + "epoch": 0.19675925925925927, + "grad_norm": 0.4737720787525177, + "learning_rate": 0.00019884895107055627, + "loss": 1.1513, + "step": 1105 + }, + { + "epoch": 0.19693732193732194, + "grad_norm": 0.4144562780857086, + "learning_rate": 0.00019884683243281116, + "loss": 1.1711, + "step": 1106 + }, + { + "epoch": 0.1971153846153846, + "grad_norm": 0.4672079384326935, + "learning_rate": 0.00019884471185836769, + "loss": 1.0386, + "step": 1107 + }, + { + "epoch": 0.1972934472934473, + "grad_norm": 0.4558824598789215, + "learning_rate": 0.0001988425893472674, + "loss": 1.0535, + "step": 1108 + }, + { + "epoch": 0.19747150997150997, + "grad_norm": 0.5149834752082825, + "learning_rate": 0.00019884046489955192, + "loss": 1.0296, + "step": 1109 + }, + { + "epoch": 0.19764957264957264, + "grad_norm": 0.43444496393203735, + "learning_rate": 0.00019883833851526287, + "loss": 1.1475, + "step": 1110 + }, + { + "epoch": 0.19782763532763534, + "grad_norm": 0.46062374114990234, + "learning_rate": 0.00019883621019444188, + "loss": 1.183, + "step": 1111 + }, + { + "epoch": 0.198005698005698, + "grad_norm": 0.4893282949924469, + "learning_rate": 0.00019883407993713065, + "loss": 1.3733, + "step": 1112 + }, + { + "epoch": 0.19818376068376067, + "grad_norm": 0.5434843897819519, + "learning_rate": 0.00019883194774337096, + "loss": 1.2505, + "step": 1113 + }, + { + "epoch": 0.19836182336182337, + "grad_norm": 0.4698035418987274, + "learning_rate": 0.00019882981361320456, + "loss": 1.0152, + "step": 1114 + }, + { + "epoch": 0.19853988603988604, + "grad_norm": 0.4582163989543915, + "learning_rate": 0.00019882767754667325, + "loss": 1.1718, + "step": 1115 + }, + { + "epoch": 0.1987179487179487, + "grad_norm": 
0.48744696378707886, + "learning_rate": 0.0001988255395438189, + "loss": 1.2923, + "step": 1116 + }, + { + "epoch": 0.1988960113960114, + "grad_norm": 0.4172030985355377, + "learning_rate": 0.0001988233996046834, + "loss": 0.8098, + "step": 1117 + }, + { + "epoch": 0.19907407407407407, + "grad_norm": 0.4556557834148407, + "learning_rate": 0.00019882125772930867, + "loss": 0.9654, + "step": 1118 + }, + { + "epoch": 0.19925213675213677, + "grad_norm": 0.4363219141960144, + "learning_rate": 0.00019881911391773666, + "loss": 1.0333, + "step": 1119 + }, + { + "epoch": 0.19943019943019943, + "grad_norm": 0.4336536228656769, + "learning_rate": 0.0001988169681700094, + "loss": 1.091, + "step": 1120 + }, + { + "epoch": 0.1996082621082621, + "grad_norm": 0.42073166370391846, + "learning_rate": 0.00019881482048616893, + "loss": 0.9687, + "step": 1121 + }, + { + "epoch": 0.1997863247863248, + "grad_norm": 0.4330587685108185, + "learning_rate": 0.00019881267086625733, + "loss": 1.0512, + "step": 1122 + }, + { + "epoch": 0.19996438746438747, + "grad_norm": 0.4602276682853699, + "learning_rate": 0.0001988105193103167, + "loss": 1.1806, + "step": 1123 + }, + { + "epoch": 0.20014245014245013, + "grad_norm": 0.4271257817745209, + "learning_rate": 0.0001988083658183892, + "loss": 1.1079, + "step": 1124 + }, + { + "epoch": 0.20032051282051283, + "grad_norm": 0.35446426272392273, + "learning_rate": 0.00019880621039051707, + "loss": 0.6769, + "step": 1125 + }, + { + "epoch": 0.2004985754985755, + "grad_norm": 0.413753479719162, + "learning_rate": 0.00019880405302674244, + "loss": 1.1088, + "step": 1126 + }, + { + "epoch": 0.20067663817663817, + "grad_norm": 0.4423675835132599, + "learning_rate": 0.00019880189372710767, + "loss": 1.1371, + "step": 1127 + }, + { + "epoch": 0.20085470085470086, + "grad_norm": 0.41865605115890503, + "learning_rate": 0.00019879973249165502, + "loss": 1.0027, + "step": 1128 + }, + { + "epoch": 0.20103276353276353, + "grad_norm": 0.4109594225883484, + 
"learning_rate": 0.00019879756932042686, + "loss": 0.8734, + "step": 1129 + }, + { + "epoch": 0.2012108262108262, + "grad_norm": 0.42326363921165466, + "learning_rate": 0.00019879540421346555, + "loss": 0.9722, + "step": 1130 + }, + { + "epoch": 0.2013888888888889, + "grad_norm": 0.4601542055606842, + "learning_rate": 0.00019879323717081354, + "loss": 1.1251, + "step": 1131 + }, + { + "epoch": 0.20156695156695156, + "grad_norm": 0.4704367518424988, + "learning_rate": 0.00019879106819251327, + "loss": 0.9457, + "step": 1132 + }, + { + "epoch": 0.20174501424501423, + "grad_norm": 0.465023934841156, + "learning_rate": 0.00019878889727860724, + "loss": 0.9633, + "step": 1133 + }, + { + "epoch": 0.20192307692307693, + "grad_norm": 0.4572450518608093, + "learning_rate": 0.00019878672442913796, + "loss": 1.1965, + "step": 1134 + }, + { + "epoch": 0.2021011396011396, + "grad_norm": 0.4323410391807556, + "learning_rate": 0.00019878454964414807, + "loss": 1.1296, + "step": 1135 + }, + { + "epoch": 0.2022792022792023, + "grad_norm": 0.4513751268386841, + "learning_rate": 0.00019878237292368013, + "loss": 1.0571, + "step": 1136 + }, + { + "epoch": 0.20245726495726496, + "grad_norm": 0.45504096150398254, + "learning_rate": 0.00019878019426777677, + "loss": 1.0316, + "step": 1137 + }, + { + "epoch": 0.20263532763532763, + "grad_norm": 0.45715275406837463, + "learning_rate": 0.0001987780136764807, + "loss": 1.0528, + "step": 1138 + }, + { + "epoch": 0.20281339031339032, + "grad_norm": 0.4934465289115906, + "learning_rate": 0.00019877583114983466, + "loss": 1.3238, + "step": 1139 + }, + { + "epoch": 0.202991452991453, + "grad_norm": 0.4304082989692688, + "learning_rate": 0.0001987736466878814, + "loss": 1.1774, + "step": 1140 + }, + { + "epoch": 0.20316951566951566, + "grad_norm": 0.49721968173980713, + "learning_rate": 0.00019877146029066372, + "loss": 1.1767, + "step": 1141 + }, + { + "epoch": 0.20334757834757836, + "grad_norm": 0.3629468083381653, + "learning_rate": 
0.00019876927195822445, + "loss": 0.8588, + "step": 1142 + }, + { + "epoch": 0.20352564102564102, + "grad_norm": 0.49310383200645447, + "learning_rate": 0.00019876708169060648, + "loss": 1.0588, + "step": 1143 + }, + { + "epoch": 0.2037037037037037, + "grad_norm": 0.4270328879356384, + "learning_rate": 0.00019876488948785271, + "loss": 1.1523, + "step": 1144 + }, + { + "epoch": 0.2038817663817664, + "grad_norm": 0.4559730887413025, + "learning_rate": 0.0001987626953500061, + "loss": 1.1736, + "step": 1145 + }, + { + "epoch": 0.20405982905982906, + "grad_norm": 0.5335259437561035, + "learning_rate": 0.00019876049927710962, + "loss": 0.991, + "step": 1146 + }, + { + "epoch": 0.20423789173789172, + "grad_norm": 0.43500083684921265, + "learning_rate": 0.0001987583012692063, + "loss": 1.0631, + "step": 1147 + }, + { + "epoch": 0.20441595441595442, + "grad_norm": 0.4135417938232422, + "learning_rate": 0.00019875610132633927, + "loss": 1.0896, + "step": 1148 + }, + { + "epoch": 0.2045940170940171, + "grad_norm": 0.4078896641731262, + "learning_rate": 0.00019875389944855153, + "loss": 1.0395, + "step": 1149 + }, + { + "epoch": 0.20477207977207978, + "grad_norm": 0.46612194180488586, + "learning_rate": 0.00019875169563588632, + "loss": 1.0541, + "step": 1150 + }, + { + "epoch": 0.20495014245014245, + "grad_norm": 0.5093224048614502, + "learning_rate": 0.00019874948988838674, + "loss": 1.1486, + "step": 1151 + }, + { + "epoch": 0.20512820512820512, + "grad_norm": 0.5079755187034607, + "learning_rate": 0.00019874728220609607, + "loss": 1.2614, + "step": 1152 + }, + { + "epoch": 0.20530626780626782, + "grad_norm": 0.43663498759269714, + "learning_rate": 0.0001987450725890575, + "loss": 1.0683, + "step": 1153 + }, + { + "epoch": 0.20548433048433049, + "grad_norm": 0.5029327273368835, + "learning_rate": 0.00019874286103731435, + "loss": 1.1934, + "step": 1154 + }, + { + "epoch": 0.20566239316239315, + "grad_norm": 0.48770397901535034, + "learning_rate": 0.00019874064755090999, + 
"loss": 1.1634, + "step": 1155 + }, + { + "epoch": 0.20584045584045585, + "grad_norm": 0.46826690435409546, + "learning_rate": 0.00019873843212988776, + "loss": 1.0621, + "step": 1156 + }, + { + "epoch": 0.20601851851851852, + "grad_norm": 0.4810047149658203, + "learning_rate": 0.00019873621477429105, + "loss": 1.0879, + "step": 1157 + }, + { + "epoch": 0.20619658119658119, + "grad_norm": 0.4769522249698639, + "learning_rate": 0.00019873399548416335, + "loss": 1.1365, + "step": 1158 + }, + { + "epoch": 0.20637464387464388, + "grad_norm": 0.4221782982349396, + "learning_rate": 0.00019873177425954806, + "loss": 1.1168, + "step": 1159 + }, + { + "epoch": 0.20655270655270655, + "grad_norm": 0.4084923565387726, + "learning_rate": 0.00019872955110048876, + "loss": 1.2364, + "step": 1160 + }, + { + "epoch": 0.20673076923076922, + "grad_norm": 0.4781704545021057, + "learning_rate": 0.00019872732600702904, + "loss": 1.19, + "step": 1161 + }, + { + "epoch": 0.20690883190883191, + "grad_norm": 0.3984242081642151, + "learning_rate": 0.0001987250989792124, + "loss": 1.0568, + "step": 1162 + }, + { + "epoch": 0.20708689458689458, + "grad_norm": 0.4601972997188568, + "learning_rate": 0.00019872287001708257, + "loss": 1.1625, + "step": 1163 + }, + { + "epoch": 0.20726495726495728, + "grad_norm": 0.4853581190109253, + "learning_rate": 0.00019872063912068316, + "loss": 1.2304, + "step": 1164 + }, + { + "epoch": 0.20744301994301995, + "grad_norm": 0.41779839992523193, + "learning_rate": 0.0001987184062900579, + "loss": 0.9807, + "step": 1165 + }, + { + "epoch": 0.20762108262108261, + "grad_norm": 0.4945356249809265, + "learning_rate": 0.00019871617152525056, + "loss": 1.1861, + "step": 1166 + }, + { + "epoch": 0.2077991452991453, + "grad_norm": 0.47432294487953186, + "learning_rate": 0.00019871393482630487, + "loss": 1.1448, + "step": 1167 + }, + { + "epoch": 0.20797720797720798, + "grad_norm": 0.44647398591041565, + "learning_rate": 0.00019871169619326473, + "loss": 1.096, + "step": 
1168 + }, + { + "epoch": 0.20815527065527065, + "grad_norm": 0.4643072783946991, + "learning_rate": 0.00019870945562617393, + "loss": 1.1561, + "step": 1169 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.4544340968132019, + "learning_rate": 0.0001987072131250764, + "loss": 1.0764, + "step": 1170 + }, + { + "epoch": 0.208511396011396, + "grad_norm": 0.6036561727523804, + "learning_rate": 0.00019870496869001607, + "loss": 1.3961, + "step": 1171 + }, + { + "epoch": 0.20868945868945868, + "grad_norm": 0.41348758339881897, + "learning_rate": 0.00019870272232103695, + "loss": 1.2219, + "step": 1172 + }, + { + "epoch": 0.20886752136752137, + "grad_norm": 0.4184056222438812, + "learning_rate": 0.000198700474018183, + "loss": 1.1115, + "step": 1173 + }, + { + "epoch": 0.20904558404558404, + "grad_norm": 0.41920599341392517, + "learning_rate": 0.0001986982237814983, + "loss": 0.9207, + "step": 1174 + }, + { + "epoch": 0.2092236467236467, + "grad_norm": 0.4710249602794647, + "learning_rate": 0.00019869597161102694, + "loss": 1.1342, + "step": 1175 + }, + { + "epoch": 0.2094017094017094, + "grad_norm": 0.46897777915000916, + "learning_rate": 0.000198693717506813, + "loss": 0.983, + "step": 1176 + }, + { + "epoch": 0.20957977207977208, + "grad_norm": 0.4817039370536804, + "learning_rate": 0.00019869146146890074, + "loss": 1.0923, + "step": 1177 + }, + { + "epoch": 0.20975783475783477, + "grad_norm": 0.4806751012802124, + "learning_rate": 0.00019868920349733427, + "loss": 1.2296, + "step": 1178 + }, + { + "epoch": 0.20993589743589744, + "grad_norm": 0.44182994961738586, + "learning_rate": 0.0001986869435921579, + "loss": 1.1856, + "step": 1179 + }, + { + "epoch": 0.2101139601139601, + "grad_norm": 0.4282805621623993, + "learning_rate": 0.00019868468175341584, + "loss": 1.0046, + "step": 1180 + }, + { + "epoch": 0.2102920227920228, + "grad_norm": 0.5011838674545288, + "learning_rate": 0.00019868241798115242, + "loss": 1.2401, + "step": 1181 + }, + { + "epoch": 
0.21047008547008547, + "grad_norm": 0.4282447397708893, + "learning_rate": 0.00019868015227541208, + "loss": 0.9338, + "step": 1182 + }, + { + "epoch": 0.21064814814814814, + "grad_norm": 0.4348810911178589, + "learning_rate": 0.00019867788463623912, + "loss": 0.926, + "step": 1183 + }, + { + "epoch": 0.21082621082621084, + "grad_norm": 0.41518425941467285, + "learning_rate": 0.00019867561506367799, + "loss": 1.2723, + "step": 1184 + }, + { + "epoch": 0.2110042735042735, + "grad_norm": 0.47346001863479614, + "learning_rate": 0.00019867334355777315, + "loss": 1.1931, + "step": 1185 + }, + { + "epoch": 0.21118233618233617, + "grad_norm": 0.4071715474128723, + "learning_rate": 0.00019867107011856914, + "loss": 0.9619, + "step": 1186 + }, + { + "epoch": 0.21136039886039887, + "grad_norm": 0.4803447425365448, + "learning_rate": 0.00019866879474611046, + "loss": 1.2, + "step": 1187 + }, + { + "epoch": 0.21153846153846154, + "grad_norm": 0.4827699661254883, + "learning_rate": 0.00019866651744044172, + "loss": 1.0938, + "step": 1188 + }, + { + "epoch": 0.2117165242165242, + "grad_norm": 0.4528424143791199, + "learning_rate": 0.00019866423820160756, + "loss": 0.9721, + "step": 1189 + }, + { + "epoch": 0.2118945868945869, + "grad_norm": 0.43566834926605225, + "learning_rate": 0.0001986619570296526, + "loss": 1.0352, + "step": 1190 + }, + { + "epoch": 0.21207264957264957, + "grad_norm": 0.4516540467739105, + "learning_rate": 0.0001986596739246215, + "loss": 1.1333, + "step": 1191 + }, + { + "epoch": 0.21225071225071226, + "grad_norm": 0.4456641376018524, + "learning_rate": 0.00019865738888655908, + "loss": 1.2813, + "step": 1192 + }, + { + "epoch": 0.21242877492877493, + "grad_norm": 0.47048309445381165, + "learning_rate": 0.00019865510191551008, + "loss": 1.1067, + "step": 1193 + }, + { + "epoch": 0.2126068376068376, + "grad_norm": 0.4604061543941498, + "learning_rate": 0.00019865281301151928, + "loss": 0.925, + "step": 1194 + }, + { + "epoch": 0.2127849002849003, + 
"grad_norm": 0.49341437220573425, + "learning_rate": 0.00019865052217463153, + "loss": 1.2319, + "step": 1195 + }, + { + "epoch": 0.21296296296296297, + "grad_norm": 0.5099014639854431, + "learning_rate": 0.00019864822940489173, + "loss": 1.139, + "step": 1196 + }, + { + "epoch": 0.21314102564102563, + "grad_norm": 0.41396936774253845, + "learning_rate": 0.0001986459347023448, + "loss": 1.0594, + "step": 1197 + }, + { + "epoch": 0.21331908831908833, + "grad_norm": 0.46071869134902954, + "learning_rate": 0.0001986436380670357, + "loss": 1.0815, + "step": 1198 + }, + { + "epoch": 0.213497150997151, + "grad_norm": 0.507882297039032, + "learning_rate": 0.00019864133949900942, + "loss": 1.3841, + "step": 1199 + }, + { + "epoch": 0.21367521367521367, + "grad_norm": 0.45680439472198486, + "learning_rate": 0.00019863903899831103, + "loss": 1.0945, + "step": 1200 + }, + { + "epoch": 0.21385327635327636, + "grad_norm": 0.44277429580688477, + "learning_rate": 0.00019863673656498555, + "loss": 1.1655, + "step": 1201 + }, + { + "epoch": 0.21403133903133903, + "grad_norm": 0.43890756368637085, + "learning_rate": 0.00019863443219907812, + "loss": 1.1186, + "step": 1202 + }, + { + "epoch": 0.2142094017094017, + "grad_norm": 0.3910178542137146, + "learning_rate": 0.0001986321259006339, + "loss": 1.0817, + "step": 1203 + }, + { + "epoch": 0.2143874643874644, + "grad_norm": 0.3803878128528595, + "learning_rate": 0.00019862981766969803, + "loss": 0.8022, + "step": 1204 + }, + { + "epoch": 0.21456552706552706, + "grad_norm": 0.4495108425617218, + "learning_rate": 0.0001986275075063158, + "loss": 1.2212, + "step": 1205 + }, + { + "epoch": 0.21474358974358973, + "grad_norm": 0.5211976766586304, + "learning_rate": 0.00019862519541053244, + "loss": 1.2771, + "step": 1206 + }, + { + "epoch": 0.21492165242165243, + "grad_norm": 0.4313061535358429, + "learning_rate": 0.00019862288138239325, + "loss": 1.1205, + "step": 1207 + }, + { + "epoch": 0.2150997150997151, + "grad_norm": 
0.47110888361930847, + "learning_rate": 0.00019862056542194355, + "loss": 1.1835, + "step": 1208 + }, + { + "epoch": 0.2152777777777778, + "grad_norm": 0.5129403471946716, + "learning_rate": 0.00019861824752922876, + "loss": 1.1655, + "step": 1209 + }, + { + "epoch": 0.21545584045584046, + "grad_norm": 0.4353938102722168, + "learning_rate": 0.00019861592770429427, + "loss": 1.2794, + "step": 1210 + }, + { + "epoch": 0.21563390313390313, + "grad_norm": 0.48590636253356934, + "learning_rate": 0.0001986136059471855, + "loss": 1.2003, + "step": 1211 + }, + { + "epoch": 0.21581196581196582, + "grad_norm": 0.4738406836986542, + "learning_rate": 0.00019861128225794804, + "loss": 1.2271, + "step": 1212 + }, + { + "epoch": 0.2159900284900285, + "grad_norm": 0.45983126759529114, + "learning_rate": 0.0001986089566366273, + "loss": 1.1896, + "step": 1213 + }, + { + "epoch": 0.21616809116809116, + "grad_norm": 0.37296006083488464, + "learning_rate": 0.00019860662908326892, + "loss": 1.079, + "step": 1214 + }, + { + "epoch": 0.21634615384615385, + "grad_norm": 0.4442676305770874, + "learning_rate": 0.00019860429959791845, + "loss": 1.1754, + "step": 1215 + }, + { + "epoch": 0.21652421652421652, + "grad_norm": 0.4950128495693207, + "learning_rate": 0.0001986019681806216, + "loss": 1.1571, + "step": 1216 + }, + { + "epoch": 0.2167022792022792, + "grad_norm": 0.4374556541442871, + "learning_rate": 0.000198599634831424, + "loss": 1.1003, + "step": 1217 + }, + { + "epoch": 0.2168803418803419, + "grad_norm": 0.47301414608955383, + "learning_rate": 0.00019859729955037136, + "loss": 1.1426, + "step": 1218 + }, + { + "epoch": 0.21705840455840456, + "grad_norm": 0.41213178634643555, + "learning_rate": 0.00019859496233750947, + "loss": 1.0659, + "step": 1219 + }, + { + "epoch": 0.21723646723646722, + "grad_norm": 0.41601964831352234, + "learning_rate": 0.0001985926231928841, + "loss": 1.0248, + "step": 1220 + }, + { + "epoch": 0.21741452991452992, + "grad_norm": 0.46328839659690857, + 
"learning_rate": 0.0001985902821165411, + "loss": 1.0405, + "step": 1221 + }, + { + "epoch": 0.2175925925925926, + "grad_norm": 0.43287959694862366, + "learning_rate": 0.0001985879391085263, + "loss": 0.9202, + "step": 1222 + }, + { + "epoch": 0.21777065527065528, + "grad_norm": 0.4770444631576538, + "learning_rate": 0.00019858559416888568, + "loss": 1.0911, + "step": 1223 + }, + { + "epoch": 0.21794871794871795, + "grad_norm": 0.4756585955619812, + "learning_rate": 0.00019858324729766507, + "loss": 1.1566, + "step": 1224 + }, + { + "epoch": 0.21812678062678062, + "grad_norm": 0.4337233006954193, + "learning_rate": 0.00019858089849491054, + "loss": 0.9084, + "step": 1225 + }, + { + "epoch": 0.21830484330484332, + "grad_norm": 0.5165579319000244, + "learning_rate": 0.00019857854776066813, + "loss": 1.4154, + "step": 1226 + }, + { + "epoch": 0.21848290598290598, + "grad_norm": 0.4280378520488739, + "learning_rate": 0.00019857619509498382, + "loss": 1.1291, + "step": 1227 + }, + { + "epoch": 0.21866096866096865, + "grad_norm": 0.5375089049339294, + "learning_rate": 0.00019857384049790376, + "loss": 1.2985, + "step": 1228 + }, + { + "epoch": 0.21883903133903135, + "grad_norm": 0.4708811640739441, + "learning_rate": 0.00019857148396947401, + "loss": 1.0589, + "step": 1229 + }, + { + "epoch": 0.21901709401709402, + "grad_norm": 0.4744570255279541, + "learning_rate": 0.00019856912550974084, + "loss": 1.1269, + "step": 1230 + }, + { + "epoch": 0.21919515669515668, + "grad_norm": 0.5355265736579895, + "learning_rate": 0.00019856676511875043, + "loss": 1.1441, + "step": 1231 + }, + { + "epoch": 0.21937321937321938, + "grad_norm": 0.42718183994293213, + "learning_rate": 0.00019856440279654897, + "loss": 1.0244, + "step": 1232 + }, + { + "epoch": 0.21955128205128205, + "grad_norm": 0.5162127614021301, + "learning_rate": 0.00019856203854318283, + "loss": 1.2674, + "step": 1233 + }, + { + "epoch": 0.21972934472934472, + "grad_norm": 0.5180695652961731, + "learning_rate": 
0.00019855967235869827, + "loss": 1.2472, + "step": 1234 + }, + { + "epoch": 0.2199074074074074, + "grad_norm": 0.4290023744106293, + "learning_rate": 0.00019855730424314167, + "loss": 1.0502, + "step": 1235 + }, + { + "epoch": 0.22008547008547008, + "grad_norm": 0.4418254792690277, + "learning_rate": 0.00019855493419655945, + "loss": 1.0589, + "step": 1236 + }, + { + "epoch": 0.22026353276353278, + "grad_norm": 0.4074663817882538, + "learning_rate": 0.000198552562218998, + "loss": 0.9197, + "step": 1237 + }, + { + "epoch": 0.22044159544159544, + "grad_norm": 0.4526660740375519, + "learning_rate": 0.00019855018831050383, + "loss": 1.2578, + "step": 1238 + }, + { + "epoch": 0.2206196581196581, + "grad_norm": 0.4747827649116516, + "learning_rate": 0.00019854781247112343, + "loss": 1.0841, + "step": 1239 + }, + { + "epoch": 0.2207977207977208, + "grad_norm": 0.41567128896713257, + "learning_rate": 0.00019854543470090334, + "loss": 1.0737, + "step": 1240 + }, + { + "epoch": 0.22097578347578348, + "grad_norm": 0.4793100953102112, + "learning_rate": 0.00019854305499989022, + "loss": 1.1972, + "step": 1241 + }, + { + "epoch": 0.22115384615384615, + "grad_norm": 0.41755473613739014, + "learning_rate": 0.00019854067336813058, + "loss": 1.2529, + "step": 1242 + }, + { + "epoch": 0.22133190883190884, + "grad_norm": 0.40421152114868164, + "learning_rate": 0.0001985382898056712, + "loss": 1.0549, + "step": 1243 + }, + { + "epoch": 0.2215099715099715, + "grad_norm": 0.45779645442962646, + "learning_rate": 0.0001985359043125587, + "loss": 1.1586, + "step": 1244 + }, + { + "epoch": 0.22168803418803418, + "grad_norm": 0.4380546808242798, + "learning_rate": 0.00019853351688883987, + "loss": 1.1024, + "step": 1245 + }, + { + "epoch": 0.22186609686609687, + "grad_norm": 0.39917269349098206, + "learning_rate": 0.00019853112753456142, + "loss": 0.9823, + "step": 1246 + }, + { + "epoch": 0.22204415954415954, + "grad_norm": 0.4228038489818573, + "learning_rate": 0.00019852873624977022, + 
"loss": 1.1684, + "step": 1247 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 0.4462146759033203, + "learning_rate": 0.00019852634303451315, + "loss": 0.9027, + "step": 1248 + }, + { + "epoch": 0.2224002849002849, + "grad_norm": 0.5682163834571838, + "learning_rate": 0.000198523947888837, + "loss": 1.141, + "step": 1249 + }, + { + "epoch": 0.22257834757834757, + "grad_norm": 0.44866830110549927, + "learning_rate": 0.0001985215508127888, + "loss": 1.0759, + "step": 1250 + }, + { + "epoch": 0.22275641025641027, + "grad_norm": 0.4034106135368347, + "learning_rate": 0.00019851915180641548, + "loss": 1.0675, + "step": 1251 + }, + { + "epoch": 0.22293447293447294, + "grad_norm": 0.4780726432800293, + "learning_rate": 0.00019851675086976397, + "loss": 1.0283, + "step": 1252 + }, + { + "epoch": 0.2231125356125356, + "grad_norm": 0.48892372846603394, + "learning_rate": 0.00019851434800288145, + "loss": 1.1159, + "step": 1253 + }, + { + "epoch": 0.2232905982905983, + "grad_norm": 0.42629215121269226, + "learning_rate": 0.0001985119432058149, + "loss": 1.0292, + "step": 1254 + }, + { + "epoch": 0.22346866096866097, + "grad_norm": 0.4496444761753082, + "learning_rate": 0.00019850953647861146, + "loss": 1.0252, + "step": 1255 + }, + { + "epoch": 0.22364672364672364, + "grad_norm": 0.4371408224105835, + "learning_rate": 0.00019850712782131828, + "loss": 1.1104, + "step": 1256 + }, + { + "epoch": 0.22382478632478633, + "grad_norm": 0.4910794496536255, + "learning_rate": 0.00019850471723398258, + "loss": 1.1928, + "step": 1257 + }, + { + "epoch": 0.224002849002849, + "grad_norm": 0.41235068440437317, + "learning_rate": 0.00019850230471665157, + "loss": 1.1261, + "step": 1258 + }, + { + "epoch": 0.22418091168091167, + "grad_norm": 0.4507700502872467, + "learning_rate": 0.0001984998902693725, + "loss": 1.0602, + "step": 1259 + }, + { + "epoch": 0.22435897435897437, + "grad_norm": 0.4654198884963989, + "learning_rate": 0.00019849747389219272, + "loss": 1.1258, + "step": 1260 
+ }, + { + "epoch": 0.22453703703703703, + "grad_norm": 0.439807653427124, + "learning_rate": 0.00019849505558515952, + "loss": 1.2312, + "step": 1261 + }, + { + "epoch": 0.2247150997150997, + "grad_norm": 0.4309258759021759, + "learning_rate": 0.00019849263534832035, + "loss": 1.0083, + "step": 1262 + }, + { + "epoch": 0.2248931623931624, + "grad_norm": 0.4920141100883484, + "learning_rate": 0.00019849021318172255, + "loss": 1.0254, + "step": 1263 + }, + { + "epoch": 0.22507122507122507, + "grad_norm": 0.5333457589149475, + "learning_rate": 0.00019848778908541367, + "loss": 1.3017, + "step": 1264 + }, + { + "epoch": 0.22524928774928774, + "grad_norm": 0.4096757769584656, + "learning_rate": 0.0001984853630594411, + "loss": 0.9531, + "step": 1265 + }, + { + "epoch": 0.22542735042735043, + "grad_norm": 0.5744075775146484, + "learning_rate": 0.00019848293510385244, + "loss": 1.1414, + "step": 1266 + }, + { + "epoch": 0.2256054131054131, + "grad_norm": 0.44707193970680237, + "learning_rate": 0.00019848050521869529, + "loss": 1.1926, + "step": 1267 + }, + { + "epoch": 0.2257834757834758, + "grad_norm": 0.4162999391555786, + "learning_rate": 0.00019847807340401716, + "loss": 1.1354, + "step": 1268 + }, + { + "epoch": 0.22596153846153846, + "grad_norm": 0.4273204207420349, + "learning_rate": 0.0001984756396598658, + "loss": 0.9956, + "step": 1269 + }, + { + "epoch": 0.22613960113960113, + "grad_norm": 0.5670466423034668, + "learning_rate": 0.00019847320398628878, + "loss": 1.2384, + "step": 1270 + }, + { + "epoch": 0.22631766381766383, + "grad_norm": 0.424544095993042, + "learning_rate": 0.00019847076638333395, + "loss": 0.9963, + "step": 1271 + }, + { + "epoch": 0.2264957264957265, + "grad_norm": 0.3716120719909668, + "learning_rate": 0.000198468326851049, + "loss": 0.865, + "step": 1272 + }, + { + "epoch": 0.22667378917378916, + "grad_norm": 0.4472847282886505, + "learning_rate": 0.00019846588538948172, + "loss": 1.174, + "step": 1273 + }, + { + "epoch": 
0.22685185185185186, + "grad_norm": 0.4599195718765259, + "learning_rate": 0.00019846344199867994, + "loss": 1.289, + "step": 1274 + }, + { + "epoch": 0.22702991452991453, + "grad_norm": 0.4303213357925415, + "learning_rate": 0.0001984609966786916, + "loss": 1.1606, + "step": 1275 + }, + { + "epoch": 0.2272079772079772, + "grad_norm": 0.44893527030944824, + "learning_rate": 0.00019845854942956455, + "loss": 1.1043, + "step": 1276 + }, + { + "epoch": 0.2273860398860399, + "grad_norm": 0.40033379197120667, + "learning_rate": 0.00019845610025134676, + "loss": 1.1434, + "step": 1277 + }, + { + "epoch": 0.22756410256410256, + "grad_norm": 0.4385402202606201, + "learning_rate": 0.00019845364914408616, + "loss": 0.9943, + "step": 1278 + }, + { + "epoch": 0.22774216524216523, + "grad_norm": 0.42123618721961975, + "learning_rate": 0.0001984511961078309, + "loss": 1.0911, + "step": 1279 + }, + { + "epoch": 0.22792022792022792, + "grad_norm": 0.5558577179908752, + "learning_rate": 0.00019844874114262893, + "loss": 1.3893, + "step": 1280 + }, + { + "epoch": 0.2280982905982906, + "grad_norm": 0.3996453583240509, + "learning_rate": 0.00019844628424852835, + "loss": 0.8951, + "step": 1281 + }, + { + "epoch": 0.2282763532763533, + "grad_norm": 0.3943425714969635, + "learning_rate": 0.0001984438254255774, + "loss": 1.0595, + "step": 1282 + }, + { + "epoch": 0.22845441595441596, + "grad_norm": 0.4429021179676056, + "learning_rate": 0.00019844136467382414, + "loss": 1.0853, + "step": 1283 + }, + { + "epoch": 0.22863247863247863, + "grad_norm": 0.4515686631202698, + "learning_rate": 0.00019843890199331687, + "loss": 1.0829, + "step": 1284 + }, + { + "epoch": 0.22881054131054132, + "grad_norm": 0.5157768726348877, + "learning_rate": 0.00019843643738410378, + "loss": 1.334, + "step": 1285 + }, + { + "epoch": 0.228988603988604, + "grad_norm": 0.45833173394203186, + "learning_rate": 0.0001984339708462332, + "loss": 1.1353, + "step": 1286 + }, + { + "epoch": 0.22916666666666666, + 
"grad_norm": 0.46610337495803833, + "learning_rate": 0.00019843150237975344, + "loss": 1.1338, + "step": 1287 + }, + { + "epoch": 0.22934472934472935, + "grad_norm": 0.5076978802680969, + "learning_rate": 0.00019842903198471286, + "loss": 1.1811, + "step": 1288 + }, + { + "epoch": 0.22952279202279202, + "grad_norm": 0.4297824800014496, + "learning_rate": 0.00019842655966115986, + "loss": 1.1799, + "step": 1289 + }, + { + "epoch": 0.2297008547008547, + "grad_norm": 0.5304586291313171, + "learning_rate": 0.0001984240854091429, + "loss": 1.1315, + "step": 1290 + }, + { + "epoch": 0.22987891737891739, + "grad_norm": 0.45359212160110474, + "learning_rate": 0.00019842160922871042, + "loss": 1.1037, + "step": 1291 + }, + { + "epoch": 0.23005698005698005, + "grad_norm": 0.4416881203651428, + "learning_rate": 0.00019841913111991096, + "loss": 1.122, + "step": 1292 + }, + { + "epoch": 0.23023504273504272, + "grad_norm": 0.46682995557785034, + "learning_rate": 0.0001984166510827931, + "loss": 0.9808, + "step": 1293 + }, + { + "epoch": 0.23041310541310542, + "grad_norm": 0.44172337651252747, + "learning_rate": 0.00019841416911740538, + "loss": 0.9167, + "step": 1294 + }, + { + "epoch": 0.23059116809116809, + "grad_norm": 0.40562742948532104, + "learning_rate": 0.0001984116852237965, + "loss": 0.9547, + "step": 1295 + }, + { + "epoch": 0.23076923076923078, + "grad_norm": 0.4040384888648987, + "learning_rate": 0.00019840919940201503, + "loss": 1.1039, + "step": 1296 + }, + { + "epoch": 0.23094729344729345, + "grad_norm": 0.5094077587127686, + "learning_rate": 0.00019840671165210973, + "loss": 1.2283, + "step": 1297 + }, + { + "epoch": 0.23112535612535612, + "grad_norm": 0.48553213477134705, + "learning_rate": 0.00019840422197412938, + "loss": 1.0927, + "step": 1298 + }, + { + "epoch": 0.23130341880341881, + "grad_norm": 0.5197509527206421, + "learning_rate": 0.00019840173036812266, + "loss": 1.2154, + "step": 1299 + }, + { + "epoch": 0.23148148148148148, + "grad_norm": 
0.42069005966186523, + "learning_rate": 0.0001983992368341385, + "loss": 1.0076, + "step": 1300 + }, + { + "epoch": 0.23165954415954415, + "grad_norm": 0.475204735994339, + "learning_rate": 0.00019839674137222567, + "loss": 1.1682, + "step": 1301 + }, + { + "epoch": 0.23183760683760685, + "grad_norm": 0.55730140209198, + "learning_rate": 0.0001983942439824331, + "loss": 1.2948, + "step": 1302 + }, + { + "epoch": 0.23201566951566951, + "grad_norm": 0.4533313512802124, + "learning_rate": 0.00019839174466480973, + "loss": 1.2691, + "step": 1303 + }, + { + "epoch": 0.23219373219373218, + "grad_norm": 0.4733520746231079, + "learning_rate": 0.0001983892434194045, + "loss": 1.2232, + "step": 1304 + }, + { + "epoch": 0.23237179487179488, + "grad_norm": 0.5085756182670593, + "learning_rate": 0.00019838674024626643, + "loss": 1.1347, + "step": 1305 + }, + { + "epoch": 0.23254985754985755, + "grad_norm": 0.4679976999759674, + "learning_rate": 0.00019838423514544456, + "loss": 1.0018, + "step": 1306 + }, + { + "epoch": 0.23272792022792022, + "grad_norm": 0.4234481751918793, + "learning_rate": 0.00019838172811698795, + "loss": 1.0472, + "step": 1307 + }, + { + "epoch": 0.2329059829059829, + "grad_norm": 0.5749204158782959, + "learning_rate": 0.00019837921916094579, + "loss": 1.2239, + "step": 1308 + }, + { + "epoch": 0.23308404558404558, + "grad_norm": 0.46715882420539856, + "learning_rate": 0.0001983767082773672, + "loss": 1.1924, + "step": 1309 + }, + { + "epoch": 0.23326210826210828, + "grad_norm": 0.5079745054244995, + "learning_rate": 0.00019837419546630137, + "loss": 1.1086, + "step": 1310 + }, + { + "epoch": 0.23344017094017094, + "grad_norm": 0.4419243037700653, + "learning_rate": 0.0001983716807277975, + "loss": 1.1911, + "step": 1311 + }, + { + "epoch": 0.2336182336182336, + "grad_norm": 0.5107570290565491, + "learning_rate": 0.00019836916406190493, + "loss": 1.1071, + "step": 1312 + }, + { + "epoch": 0.2337962962962963, + "grad_norm": 0.5295659303665161, + 
"learning_rate": 0.00019836664546867293, + "loss": 1.2905, + "step": 1313 + }, + { + "epoch": 0.23397435897435898, + "grad_norm": 0.4844837784767151, + "learning_rate": 0.00019836412494815084, + "loss": 1.3507, + "step": 1314 + }, + { + "epoch": 0.23415242165242164, + "grad_norm": 0.6166049242019653, + "learning_rate": 0.00019836160250038808, + "loss": 1.2822, + "step": 1315 + }, + { + "epoch": 0.23433048433048434, + "grad_norm": 0.3229198753833771, + "learning_rate": 0.00019835907812543402, + "loss": 0.4959, + "step": 1316 + }, + { + "epoch": 0.234508547008547, + "grad_norm": 0.5788772702217102, + "learning_rate": 0.00019835655182333815, + "loss": 1.0832, + "step": 1317 + }, + { + "epoch": 0.23468660968660968, + "grad_norm": 0.525705099105835, + "learning_rate": 0.00019835402359414997, + "loss": 1.0968, + "step": 1318 + }, + { + "epoch": 0.23486467236467237, + "grad_norm": 0.5007779002189636, + "learning_rate": 0.000198351493437919, + "loss": 1.2788, + "step": 1319 + }, + { + "epoch": 0.23504273504273504, + "grad_norm": 0.4276871383190155, + "learning_rate": 0.00019834896135469484, + "loss": 1.0419, + "step": 1320 + }, + { + "epoch": 0.2352207977207977, + "grad_norm": 0.5359070301055908, + "learning_rate": 0.00019834642734452708, + "loss": 1.1308, + "step": 1321 + }, + { + "epoch": 0.2353988603988604, + "grad_norm": 0.4854908883571625, + "learning_rate": 0.0001983438914074654, + "loss": 1.1211, + "step": 1322 + }, + { + "epoch": 0.23557692307692307, + "grad_norm": 0.4913707375526428, + "learning_rate": 0.0001983413535435594, + "loss": 1.2392, + "step": 1323 + }, + { + "epoch": 0.23575498575498577, + "grad_norm": 0.46755748987197876, + "learning_rate": 0.0001983388137528589, + "loss": 0.9348, + "step": 1324 + }, + { + "epoch": 0.23593304843304844, + "grad_norm": 0.4592570960521698, + "learning_rate": 0.0001983362720354136, + "loss": 1.1339, + "step": 1325 + }, + { + "epoch": 0.2361111111111111, + "grad_norm": 0.5121711492538452, + "learning_rate": 
0.00019833372839127335, + "loss": 1.2973, + "step": 1326 + }, + { + "epoch": 0.2362891737891738, + "grad_norm": 0.4809017479419708, + "learning_rate": 0.000198331182820488, + "loss": 0.9849, + "step": 1327 + }, + { + "epoch": 0.23646723646723647, + "grad_norm": 0.42340895533561707, + "learning_rate": 0.00019832863532310733, + "loss": 1.0731, + "step": 1328 + }, + { + "epoch": 0.23664529914529914, + "grad_norm": 0.5388045310974121, + "learning_rate": 0.00019832608589918135, + "loss": 1.0729, + "step": 1329 + }, + { + "epoch": 0.23682336182336183, + "grad_norm": 0.43075770139694214, + "learning_rate": 0.00019832353454875992, + "loss": 1.1684, + "step": 1330 + }, + { + "epoch": 0.2370014245014245, + "grad_norm": 0.554927408695221, + "learning_rate": 0.00019832098127189313, + "loss": 1.0842, + "step": 1331 + }, + { + "epoch": 0.23717948717948717, + "grad_norm": 0.5359260439872742, + "learning_rate": 0.0001983184260686309, + "loss": 1.2399, + "step": 1332 + }, + { + "epoch": 0.23735754985754987, + "grad_norm": 0.5141251087188721, + "learning_rate": 0.0001983158689390234, + "loss": 1.3752, + "step": 1333 + }, + { + "epoch": 0.23753561253561253, + "grad_norm": 0.4578750431537628, + "learning_rate": 0.00019831330988312067, + "loss": 1.0965, + "step": 1334 + }, + { + "epoch": 0.2377136752136752, + "grad_norm": 0.47974497079849243, + "learning_rate": 0.00019831074890097286, + "loss": 1.3379, + "step": 1335 + }, + { + "epoch": 0.2378917378917379, + "grad_norm": 0.4618176817893982, + "learning_rate": 0.00019830818599263014, + "loss": 1.274, + "step": 1336 + }, + { + "epoch": 0.23806980056980057, + "grad_norm": 0.4279816448688507, + "learning_rate": 0.00019830562115814276, + "loss": 0.996, + "step": 1337 + }, + { + "epoch": 0.23824786324786323, + "grad_norm": 0.4255026876926422, + "learning_rate": 0.0001983030543975609, + "loss": 0.969, + "step": 1338 + }, + { + "epoch": 0.23842592592592593, + "grad_norm": 0.4551412761211395, + "learning_rate": 0.00019830048571093493, + "loss": 
1.0204, + "step": 1339 + }, + { + "epoch": 0.2386039886039886, + "grad_norm": 0.4747903048992157, + "learning_rate": 0.00019829791509831513, + "loss": 1.1816, + "step": 1340 + }, + { + "epoch": 0.2387820512820513, + "grad_norm": 0.47187140583992004, + "learning_rate": 0.00019829534255975188, + "loss": 1.1205, + "step": 1341 + }, + { + "epoch": 0.23896011396011396, + "grad_norm": 0.49332180619239807, + "learning_rate": 0.0001982927680952956, + "loss": 1.2657, + "step": 1342 + }, + { + "epoch": 0.23913817663817663, + "grad_norm": 0.5162837505340576, + "learning_rate": 0.0001982901917049967, + "loss": 1.2247, + "step": 1343 + }, + { + "epoch": 0.23931623931623933, + "grad_norm": 0.43407055735588074, + "learning_rate": 0.0001982876133889057, + "loss": 1.0038, + "step": 1344 + }, + { + "epoch": 0.239494301994302, + "grad_norm": 0.5132251977920532, + "learning_rate": 0.00019828503314707306, + "loss": 1.0678, + "step": 1345 + }, + { + "epoch": 0.23967236467236466, + "grad_norm": 0.46295464038848877, + "learning_rate": 0.00019828245097954937, + "loss": 1.1802, + "step": 1346 + }, + { + "epoch": 0.23985042735042736, + "grad_norm": 0.4682658314704895, + "learning_rate": 0.00019827986688638523, + "loss": 1.0249, + "step": 1347 + }, + { + "epoch": 0.24002849002849003, + "grad_norm": 0.49990561604499817, + "learning_rate": 0.00019827728086763125, + "loss": 1.0691, + "step": 1348 + }, + { + "epoch": 0.2402065527065527, + "grad_norm": 0.39090847969055176, + "learning_rate": 0.00019827469292333806, + "loss": 0.8367, + "step": 1349 + }, + { + "epoch": 0.2403846153846154, + "grad_norm": 0.5023905634880066, + "learning_rate": 0.00019827210305355645, + "loss": 1.0675, + "step": 1350 + }, + { + "epoch": 0.24056267806267806, + "grad_norm": 0.4744076430797577, + "learning_rate": 0.00019826951125833715, + "loss": 1.3166, + "step": 1351 + }, + { + "epoch": 0.24074074074074073, + "grad_norm": 0.44914689660072327, + "learning_rate": 0.00019826691753773088, + "loss": 0.9818, + "step": 1352 + 
}, + { + "epoch": 0.24091880341880342, + "grad_norm": 0.44391971826553345, + "learning_rate": 0.00019826432189178853, + "loss": 1.0448, + "step": 1353 + }, + { + "epoch": 0.2410968660968661, + "grad_norm": 0.46102839708328247, + "learning_rate": 0.00019826172432056086, + "loss": 0.9952, + "step": 1354 + }, + { + "epoch": 0.2412749287749288, + "grad_norm": 0.4796878695487976, + "learning_rate": 0.00019825912482409884, + "loss": 1.0977, + "step": 1355 + }, + { + "epoch": 0.24145299145299146, + "grad_norm": 0.5003768801689148, + "learning_rate": 0.0001982565234024534, + "loss": 1.3149, + "step": 1356 + }, + { + "epoch": 0.24163105413105412, + "grad_norm": 0.43475663661956787, + "learning_rate": 0.00019825392005567551, + "loss": 1.0527, + "step": 1357 + }, + { + "epoch": 0.24180911680911682, + "grad_norm": 0.46120527386665344, + "learning_rate": 0.00019825131478381613, + "loss": 1.2333, + "step": 1358 + }, + { + "epoch": 0.2419871794871795, + "grad_norm": 0.43748101592063904, + "learning_rate": 0.00019824870758692638, + "loss": 0.9788, + "step": 1359 + }, + { + "epoch": 0.24216524216524216, + "grad_norm": 0.5275192856788635, + "learning_rate": 0.00019824609846505727, + "loss": 1.1473, + "step": 1360 + }, + { + "epoch": 0.24234330484330485, + "grad_norm": 0.346463143825531, + "learning_rate": 0.00019824348741825993, + "loss": 0.6824, + "step": 1361 + }, + { + "epoch": 0.24252136752136752, + "grad_norm": 0.5004115700721741, + "learning_rate": 0.00019824087444658556, + "loss": 1.1853, + "step": 1362 + }, + { + "epoch": 0.2426994301994302, + "grad_norm": 0.42746666073799133, + "learning_rate": 0.00019823825955008533, + "loss": 0.9355, + "step": 1363 + }, + { + "epoch": 0.24287749287749288, + "grad_norm": 0.4099743068218231, + "learning_rate": 0.00019823564272881047, + "loss": 1.0753, + "step": 1364 + }, + { + "epoch": 0.24305555555555555, + "grad_norm": 0.5262967944145203, + "learning_rate": 0.00019823302398281226, + "loss": 1.2324, + "step": 1365 + }, + { + "epoch": 
0.24323361823361822, + "grad_norm": 0.436069518327713, + "learning_rate": 0.000198230403312142, + "loss": 1.1887, + "step": 1366 + }, + { + "epoch": 0.24341168091168092, + "grad_norm": 0.38252368569374084, + "learning_rate": 0.00019822778071685107, + "loss": 1.0211, + "step": 1367 + }, + { + "epoch": 0.24358974358974358, + "grad_norm": 0.48024141788482666, + "learning_rate": 0.00019822515619699081, + "loss": 1.065, + "step": 1368 + }, + { + "epoch": 0.24376780626780628, + "grad_norm": 0.47421589493751526, + "learning_rate": 0.00019822252975261267, + "loss": 1.0433, + "step": 1369 + }, + { + "epoch": 0.24394586894586895, + "grad_norm": 0.46094807982444763, + "learning_rate": 0.00019821990138376808, + "loss": 1.1427, + "step": 1370 + }, + { + "epoch": 0.24412393162393162, + "grad_norm": 0.5093680620193481, + "learning_rate": 0.00019821727109050856, + "loss": 1.1086, + "step": 1371 + }, + { + "epoch": 0.2443019943019943, + "grad_norm": 0.41084879636764526, + "learning_rate": 0.00019821463887288566, + "loss": 1.0068, + "step": 1372 + }, + { + "epoch": 0.24448005698005698, + "grad_norm": 0.4991084635257721, + "learning_rate": 0.0001982120047309509, + "loss": 1.1884, + "step": 1373 + }, + { + "epoch": 0.24465811965811965, + "grad_norm": 0.39198383688926697, + "learning_rate": 0.00019820936866475595, + "loss": 0.9776, + "step": 1374 + }, + { + "epoch": 0.24483618233618235, + "grad_norm": 0.4517424702644348, + "learning_rate": 0.00019820673067435244, + "loss": 1.1491, + "step": 1375 + }, + { + "epoch": 0.245014245014245, + "grad_norm": 0.45881983637809753, + "learning_rate": 0.00019820409075979202, + "loss": 1.1198, + "step": 1376 + }, + { + "epoch": 0.24519230769230768, + "grad_norm": 0.4498792290687561, + "learning_rate": 0.00019820144892112646, + "loss": 1.0897, + "step": 1377 + }, + { + "epoch": 0.24537037037037038, + "grad_norm": 0.4128037393093109, + "learning_rate": 0.00019819880515840752, + "loss": 0.9415, + "step": 1378 + }, + { + "epoch": 0.24554843304843305, + 
"grad_norm": 0.4340885281562805, + "learning_rate": 0.00019819615947168698, + "loss": 1.201, + "step": 1379 + }, + { + "epoch": 0.24572649572649571, + "grad_norm": 0.43814027309417725, + "learning_rate": 0.00019819351186101667, + "loss": 1.1039, + "step": 1380 + }, + { + "epoch": 0.2459045584045584, + "grad_norm": 0.40115082263946533, + "learning_rate": 0.00019819086232644845, + "loss": 1.2599, + "step": 1381 + }, + { + "epoch": 0.24608262108262108, + "grad_norm": 0.4947351813316345, + "learning_rate": 0.00019818821086803426, + "loss": 1.252, + "step": 1382 + }, + { + "epoch": 0.24626068376068377, + "grad_norm": 0.45179441571235657, + "learning_rate": 0.0001981855574858261, + "loss": 1.1323, + "step": 1383 + }, + { + "epoch": 0.24643874643874644, + "grad_norm": 0.47159844636917114, + "learning_rate": 0.00019818290217987587, + "loss": 1.2053, + "step": 1384 + }, + { + "epoch": 0.2466168091168091, + "grad_norm": 0.4358448386192322, + "learning_rate": 0.0001981802449502356, + "loss": 1.1174, + "step": 1385 + }, + { + "epoch": 0.2467948717948718, + "grad_norm": 0.4588233530521393, + "learning_rate": 0.00019817758579695745, + "loss": 1.1098, + "step": 1386 + }, + { + "epoch": 0.24697293447293447, + "grad_norm": 0.4955112636089325, + "learning_rate": 0.00019817492472009338, + "loss": 1.258, + "step": 1387 + }, + { + "epoch": 0.24715099715099714, + "grad_norm": 0.4226941764354706, + "learning_rate": 0.00019817226171969565, + "loss": 1.0976, + "step": 1388 + }, + { + "epoch": 0.24732905982905984, + "grad_norm": 0.4076840579509735, + "learning_rate": 0.00019816959679581637, + "loss": 1.0121, + "step": 1389 + }, + { + "epoch": 0.2475071225071225, + "grad_norm": 0.4395063519477844, + "learning_rate": 0.0001981669299485078, + "loss": 1.3153, + "step": 1390 + }, + { + "epoch": 0.24768518518518517, + "grad_norm": 0.41010400652885437, + "learning_rate": 0.0001981642611778221, + "loss": 1.0717, + "step": 1391 + }, + { + "epoch": 0.24786324786324787, + "grad_norm": 
0.43459352850914, + "learning_rate": 0.00019816159048381167, + "loss": 1.1077, + "step": 1392 + }, + { + "epoch": 0.24804131054131054, + "grad_norm": 0.46291449666023254, + "learning_rate": 0.00019815891786652875, + "loss": 1.0257, + "step": 1393 + }, + { + "epoch": 0.2482193732193732, + "grad_norm": 0.46408146619796753, + "learning_rate": 0.00019815624332602578, + "loss": 0.7899, + "step": 1394 + }, + { + "epoch": 0.2483974358974359, + "grad_norm": 0.4763357937335968, + "learning_rate": 0.00019815356686235508, + "loss": 0.9857, + "step": 1395 + }, + { + "epoch": 0.24857549857549857, + "grad_norm": 0.4766457676887512, + "learning_rate": 0.00019815088847556918, + "loss": 1.0589, + "step": 1396 + }, + { + "epoch": 0.24875356125356127, + "grad_norm": 0.4486583173274994, + "learning_rate": 0.0001981482081657205, + "loss": 1.2572, + "step": 1397 + }, + { + "epoch": 0.24893162393162394, + "grad_norm": 0.468878835439682, + "learning_rate": 0.00019814552593286155, + "loss": 1.101, + "step": 1398 + }, + { + "epoch": 0.2491096866096866, + "grad_norm": 0.4230278730392456, + "learning_rate": 0.0001981428417770449, + "loss": 0.9457, + "step": 1399 + }, + { + "epoch": 0.2492877492877493, + "grad_norm": 0.45630761981010437, + "learning_rate": 0.00019814015569832315, + "loss": 1.0665, + "step": 1400 + }, + { + "epoch": 0.24946581196581197, + "grad_norm": 0.5780113935470581, + "learning_rate": 0.00019813746769674893, + "loss": 1.1064, + "step": 1401 + }, + { + "epoch": 0.24964387464387464, + "grad_norm": 0.4343436658382416, + "learning_rate": 0.0001981347777723749, + "loss": 1.1132, + "step": 1402 + }, + { + "epoch": 0.24982193732193733, + "grad_norm": 0.4879056513309479, + "learning_rate": 0.0001981320859252537, + "loss": 1.1301, + "step": 1403 + }, + { + "epoch": 0.25, + "grad_norm": 0.5248328447341919, + "learning_rate": 0.00019812939215543818, + "loss": 1.1468, + "step": 1404 + }, + { + "epoch": 0.25, + "eval_loss": 1.115895390510559, + "eval_runtime": 25.0474, + 
"eval_samples_per_second": 41.561, + "eval_steps_per_second": 20.801, + "step": 1404 + }, + { + "epoch": 0.2501780626780627, + "grad_norm": 0.5076769590377808, + "learning_rate": 0.00019812669646298106, + "loss": 1.1428, + "step": 1405 + }, + { + "epoch": 0.25035612535612534, + "grad_norm": 0.5510252714157104, + "learning_rate": 0.00019812399884793514, + "loss": 1.3383, + "step": 1406 + }, + { + "epoch": 0.25053418803418803, + "grad_norm": 0.48918986320495605, + "learning_rate": 0.0001981212993103533, + "loss": 1.1507, + "step": 1407 + }, + { + "epoch": 0.25071225071225073, + "grad_norm": 0.4678935110569, + "learning_rate": 0.00019811859785028846, + "loss": 1.13, + "step": 1408 + }, + { + "epoch": 0.25089031339031337, + "grad_norm": 0.5155254602432251, + "learning_rate": 0.0001981158944677935, + "loss": 1.1194, + "step": 1409 + }, + { + "epoch": 0.25106837606837606, + "grad_norm": 0.4533839523792267, + "learning_rate": 0.00019811318916292142, + "loss": 0.9464, + "step": 1410 + }, + { + "epoch": 0.25124643874643876, + "grad_norm": 0.5142433047294617, + "learning_rate": 0.00019811048193572517, + "loss": 1.0837, + "step": 1411 + }, + { + "epoch": 0.2514245014245014, + "grad_norm": 0.4330446124076843, + "learning_rate": 0.00019810777278625788, + "loss": 0.9117, + "step": 1412 + }, + { + "epoch": 0.2516025641025641, + "grad_norm": 0.44806256890296936, + "learning_rate": 0.00019810506171457254, + "loss": 1.1643, + "step": 1413 + }, + { + "epoch": 0.2517806267806268, + "grad_norm": 0.43526285886764526, + "learning_rate": 0.00019810234872072235, + "loss": 0.9776, + "step": 1414 + }, + { + "epoch": 0.25195868945868943, + "grad_norm": 0.47394511103630066, + "learning_rate": 0.00019809963380476039, + "loss": 1.0935, + "step": 1415 + }, + { + "epoch": 0.25213675213675213, + "grad_norm": 0.48961278796195984, + "learning_rate": 0.00019809691696673993, + "loss": 1.179, + "step": 1416 + }, + { + "epoch": 0.2523148148148148, + "grad_norm": 0.43153589963912964, + "learning_rate": 
0.00019809419820671412, + "loss": 0.906, + "step": 1417 + }, + { + "epoch": 0.25249287749287747, + "grad_norm": 0.41187527775764465, + "learning_rate": 0.00019809147752473632, + "loss": 0.899, + "step": 1418 + }, + { + "epoch": 0.25267094017094016, + "grad_norm": 0.5003183484077454, + "learning_rate": 0.00019808875492085973, + "loss": 1.0606, + "step": 1419 + }, + { + "epoch": 0.25284900284900286, + "grad_norm": 0.4430316984653473, + "learning_rate": 0.00019808603039513778, + "loss": 0.9167, + "step": 1420 + }, + { + "epoch": 0.25302706552706555, + "grad_norm": 0.4577699601650238, + "learning_rate": 0.00019808330394762382, + "loss": 1.1184, + "step": 1421 + }, + { + "epoch": 0.2532051282051282, + "grad_norm": 0.42656826972961426, + "learning_rate": 0.0001980805755783713, + "loss": 0.9335, + "step": 1422 + }, + { + "epoch": 0.2533831908831909, + "grad_norm": 0.40980881452560425, + "learning_rate": 0.0001980778452874336, + "loss": 0.9756, + "step": 1423 + }, + { + "epoch": 0.2535612535612536, + "grad_norm": 0.5752090811729431, + "learning_rate": 0.00019807511307486423, + "loss": 1.1694, + "step": 1424 + }, + { + "epoch": 0.2537393162393162, + "grad_norm": 0.5000349283218384, + "learning_rate": 0.00019807237894071681, + "loss": 0.9515, + "step": 1425 + }, + { + "epoch": 0.2539173789173789, + "grad_norm": 0.5159069299697876, + "learning_rate": 0.00019806964288504483, + "loss": 1.4014, + "step": 1426 + }, + { + "epoch": 0.2540954415954416, + "grad_norm": 0.5377941131591797, + "learning_rate": 0.00019806690490790194, + "loss": 1.2832, + "step": 1427 + }, + { + "epoch": 0.25427350427350426, + "grad_norm": 0.4565938711166382, + "learning_rate": 0.00019806416500934174, + "loss": 1.0629, + "step": 1428 + }, + { + "epoch": 0.25445156695156695, + "grad_norm": 0.49867144227027893, + "learning_rate": 0.00019806142318941797, + "loss": 1.2011, + "step": 1429 + }, + { + "epoch": 0.25462962962962965, + "grad_norm": 0.5111994743347168, + "learning_rate": 0.00019805867944818427, + 
"loss": 0.8925, + "step": 1430 + }, + { + "epoch": 0.2548076923076923, + "grad_norm": 0.5204268097877502, + "learning_rate": 0.00019805593378569448, + "loss": 1.2956, + "step": 1431 + }, + { + "epoch": 0.254985754985755, + "grad_norm": 0.3889026939868927, + "learning_rate": 0.00019805318620200234, + "loss": 1.0355, + "step": 1432 + }, + { + "epoch": 0.2551638176638177, + "grad_norm": 0.46825656294822693, + "learning_rate": 0.00019805043669716174, + "loss": 1.0444, + "step": 1433 + }, + { + "epoch": 0.2553418803418803, + "grad_norm": 0.4509420394897461, + "learning_rate": 0.00019804768527122648, + "loss": 1.0423, + "step": 1434 + }, + { + "epoch": 0.255519943019943, + "grad_norm": 0.4514774978160858, + "learning_rate": 0.0001980449319242505, + "loss": 1.1588, + "step": 1435 + }, + { + "epoch": 0.2556980056980057, + "grad_norm": 0.43019044399261475, + "learning_rate": 0.0001980421766562878, + "loss": 0.9939, + "step": 1436 + }, + { + "epoch": 0.25587606837606836, + "grad_norm": 0.5056091547012329, + "learning_rate": 0.00019803941946739228, + "loss": 1.1238, + "step": 1437 + }, + { + "epoch": 0.25605413105413105, + "grad_norm": 0.48664605617523193, + "learning_rate": 0.000198036660357618, + "loss": 1.0702, + "step": 1438 + }, + { + "epoch": 0.25623219373219375, + "grad_norm": 0.4500972032546997, + "learning_rate": 0.000198033899327019, + "loss": 0.9365, + "step": 1439 + }, + { + "epoch": 0.2564102564102564, + "grad_norm": 0.4800589382648468, + "learning_rate": 0.0001980311363756494, + "loss": 1.1159, + "step": 1440 + }, + { + "epoch": 0.2565883190883191, + "grad_norm": 0.3486495316028595, + "learning_rate": 0.0001980283715035633, + "loss": 0.6029, + "step": 1441 + }, + { + "epoch": 0.2567663817663818, + "grad_norm": 0.46258702874183655, + "learning_rate": 0.00019802560471081493, + "loss": 1.025, + "step": 1442 + }, + { + "epoch": 0.2569444444444444, + "grad_norm": 0.4846673607826233, + "learning_rate": 0.00019802283599745844, + "loss": 1.1105, + "step": 1443 + }, + { 
+ "epoch": 0.2571225071225071, + "grad_norm": 0.4586990475654602, + "learning_rate": 0.00019802006536354813, + "loss": 0.9897, + "step": 1444 + }, + { + "epoch": 0.2573005698005698, + "grad_norm": 0.5177786350250244, + "learning_rate": 0.00019801729280913825, + "loss": 1.2558, + "step": 1445 + }, + { + "epoch": 0.25747863247863245, + "grad_norm": 0.43213751912117004, + "learning_rate": 0.00019801451833428312, + "loss": 1.0961, + "step": 1446 + }, + { + "epoch": 0.25765669515669515, + "grad_norm": 0.42974478006362915, + "learning_rate": 0.00019801174193903714, + "loss": 1.0659, + "step": 1447 + }, + { + "epoch": 0.25783475783475784, + "grad_norm": 0.4424504339694977, + "learning_rate": 0.00019800896362345464, + "loss": 0.9805, + "step": 1448 + }, + { + "epoch": 0.25801282051282054, + "grad_norm": 0.4734833836555481, + "learning_rate": 0.0001980061833875901, + "loss": 1.255, + "step": 1449 + }, + { + "epoch": 0.2581908831908832, + "grad_norm": 0.41024845838546753, + "learning_rate": 0.000198003401231498, + "loss": 1.0908, + "step": 1450 + }, + { + "epoch": 0.2583689458689459, + "grad_norm": 0.43603816628456116, + "learning_rate": 0.00019800061715523283, + "loss": 1.0611, + "step": 1451 + }, + { + "epoch": 0.25854700854700857, + "grad_norm": 0.4871339499950409, + "learning_rate": 0.00019799783115884915, + "loss": 1.1851, + "step": 1452 + }, + { + "epoch": 0.2587250712250712, + "grad_norm": 0.49758270382881165, + "learning_rate": 0.00019799504324240157, + "loss": 1.1936, + "step": 1453 + }, + { + "epoch": 0.2589031339031339, + "grad_norm": 0.4201010763645172, + "learning_rate": 0.00019799225340594466, + "loss": 1.1567, + "step": 1454 + }, + { + "epoch": 0.2590811965811966, + "grad_norm": 0.4200313091278076, + "learning_rate": 0.00019798946164953309, + "loss": 0.9666, + "step": 1455 + }, + { + "epoch": 0.25925925925925924, + "grad_norm": 0.43001702427864075, + "learning_rate": 0.0001979866679732216, + "loss": 1.0104, + "step": 1456 + }, + { + "epoch": 
0.25943732193732194, + "grad_norm": 0.46733465790748596, + "learning_rate": 0.0001979838723770649, + "loss": 1.0927, + "step": 1457 + }, + { + "epoch": 0.25961538461538464, + "grad_norm": 0.4513280391693115, + "learning_rate": 0.00019798107486111773, + "loss": 1.0282, + "step": 1458 + }, + { + "epoch": 0.2597934472934473, + "grad_norm": 0.40411749482154846, + "learning_rate": 0.00019797827542543495, + "loss": 1.0789, + "step": 1459 + }, + { + "epoch": 0.25997150997151, + "grad_norm": 0.4359099268913269, + "learning_rate": 0.0001979754740700714, + "loss": 1.0616, + "step": 1460 + }, + { + "epoch": 0.26014957264957267, + "grad_norm": 0.4979047477245331, + "learning_rate": 0.00019797267079508198, + "loss": 1.2948, + "step": 1461 + }, + { + "epoch": 0.2603276353276353, + "grad_norm": 0.44698619842529297, + "learning_rate": 0.0001979698656005216, + "loss": 0.9198, + "step": 1462 + }, + { + "epoch": 0.260505698005698, + "grad_norm": 0.48437631130218506, + "learning_rate": 0.00019796705848644516, + "loss": 1.3207, + "step": 1463 + }, + { + "epoch": 0.2606837606837607, + "grad_norm": 0.4382587671279907, + "learning_rate": 0.00019796424945290778, + "loss": 1.1315, + "step": 1464 + }, + { + "epoch": 0.26086182336182334, + "grad_norm": 0.4565944969654083, + "learning_rate": 0.0001979614384999644, + "loss": 1.1893, + "step": 1465 + }, + { + "epoch": 0.26103988603988604, + "grad_norm": 0.4705163836479187, + "learning_rate": 0.00019795862562767017, + "loss": 1.1132, + "step": 1466 + }, + { + "epoch": 0.26121794871794873, + "grad_norm": 0.525184690952301, + "learning_rate": 0.00019795581083608012, + "loss": 1.2111, + "step": 1467 + }, + { + "epoch": 0.2613960113960114, + "grad_norm": 0.45215457677841187, + "learning_rate": 0.00019795299412524945, + "loss": 1.1851, + "step": 1468 + }, + { + "epoch": 0.26157407407407407, + "grad_norm": 0.4336663484573364, + "learning_rate": 0.00019795017549523335, + "loss": 1.0147, + "step": 1469 + }, + { + "epoch": 0.26175213675213677, + 
"grad_norm": 0.5327649712562561, + "learning_rate": 0.00019794735494608703, + "loss": 1.1743, + "step": 1470 + }, + { + "epoch": 0.2619301994301994, + "grad_norm": 0.49972307682037354, + "learning_rate": 0.00019794453247786578, + "loss": 1.1624, + "step": 1471 + }, + { + "epoch": 0.2621082621082621, + "grad_norm": 0.43475785851478577, + "learning_rate": 0.00019794170809062485, + "loss": 0.9888, + "step": 1472 + }, + { + "epoch": 0.2622863247863248, + "grad_norm": 0.428838849067688, + "learning_rate": 0.0001979388817844196, + "loss": 0.9154, + "step": 1473 + }, + { + "epoch": 0.26246438746438744, + "grad_norm": 0.508568286895752, + "learning_rate": 0.00019793605355930544, + "loss": 1.1679, + "step": 1474 + }, + { + "epoch": 0.26264245014245013, + "grad_norm": 0.47791770100593567, + "learning_rate": 0.00019793322341533776, + "loss": 1.1375, + "step": 1475 + }, + { + "epoch": 0.26282051282051283, + "grad_norm": 0.41909220814704895, + "learning_rate": 0.00019793039135257196, + "loss": 1.0235, + "step": 1476 + }, + { + "epoch": 0.26299857549857547, + "grad_norm": 0.5564408302307129, + "learning_rate": 0.00019792755737106361, + "loss": 1.0756, + "step": 1477 + }, + { + "epoch": 0.26317663817663817, + "grad_norm": 0.42813625931739807, + "learning_rate": 0.0001979247214708682, + "loss": 0.8213, + "step": 1478 + }, + { + "epoch": 0.26335470085470086, + "grad_norm": 0.44495970010757446, + "learning_rate": 0.00019792188365204126, + "loss": 0.9654, + "step": 1479 + }, + { + "epoch": 0.26353276353276356, + "grad_norm": 0.47473424673080444, + "learning_rate": 0.00019791904391463846, + "loss": 1.1643, + "step": 1480 + }, + { + "epoch": 0.2637108262108262, + "grad_norm": 0.40189051628112793, + "learning_rate": 0.0001979162022587154, + "loss": 0.8687, + "step": 1481 + }, + { + "epoch": 0.2638888888888889, + "grad_norm": 0.44629937410354614, + "learning_rate": 0.00019791335868432776, + "loss": 1.0284, + "step": 1482 + }, + { + "epoch": 0.2640669515669516, + "grad_norm": 
0.511275053024292, + "learning_rate": 0.00019791051319153124, + "loss": 1.2217, + "step": 1483 + }, + { + "epoch": 0.26424501424501423, + "grad_norm": 0.5136445164680481, + "learning_rate": 0.00019790766578038163, + "loss": 1.1129, + "step": 1484 + }, + { + "epoch": 0.2644230769230769, + "grad_norm": 0.4450451135635376, + "learning_rate": 0.00019790481645093469, + "loss": 0.9912, + "step": 1485 + }, + { + "epoch": 0.2646011396011396, + "grad_norm": 0.39455199241638184, + "learning_rate": 0.00019790196520324621, + "loss": 1.0887, + "step": 1486 + }, + { + "epoch": 0.26477920227920226, + "grad_norm": 0.4444045126438141, + "learning_rate": 0.00019789911203737216, + "loss": 1.1559, + "step": 1487 + }, + { + "epoch": 0.26495726495726496, + "grad_norm": 0.4769677221775055, + "learning_rate": 0.0001978962569533683, + "loss": 1.147, + "step": 1488 + }, + { + "epoch": 0.26513532763532766, + "grad_norm": 0.40226617455482483, + "learning_rate": 0.0001978933999512907, + "loss": 1.0966, + "step": 1489 + }, + { + "epoch": 0.2653133903133903, + "grad_norm": 0.4640974700450897, + "learning_rate": 0.00019789054103119526, + "loss": 1.1002, + "step": 1490 + }, + { + "epoch": 0.265491452991453, + "grad_norm": 0.48251107335090637, + "learning_rate": 0.00019788768019313806, + "loss": 1.07, + "step": 1491 + }, + { + "epoch": 0.2656695156695157, + "grad_norm": 0.4836949408054352, + "learning_rate": 0.00019788481743717506, + "loss": 1.2992, + "step": 1492 + }, + { + "epoch": 0.26584757834757833, + "grad_norm": 0.4253857135772705, + "learning_rate": 0.00019788195276336244, + "loss": 1.1326, + "step": 1493 + }, + { + "epoch": 0.266025641025641, + "grad_norm": 0.5161862373352051, + "learning_rate": 0.0001978790861717563, + "loss": 1.2131, + "step": 1494 + }, + { + "epoch": 0.2662037037037037, + "grad_norm": 0.5223346948623657, + "learning_rate": 0.00019787621766241274, + "loss": 1.0933, + "step": 1495 + }, + { + "epoch": 0.26638176638176636, + "grad_norm": 0.37622541189193726, + 
"learning_rate": 0.000197873347235388, + "loss": 0.8919, + "step": 1496 + }, + { + "epoch": 0.26655982905982906, + "grad_norm": 0.4425419569015503, + "learning_rate": 0.0001978704748907384, + "loss": 1.0411, + "step": 1497 + }, + { + "epoch": 0.26673789173789175, + "grad_norm": 0.4536985456943512, + "learning_rate": 0.00019786760062852015, + "loss": 1.2747, + "step": 1498 + }, + { + "epoch": 0.2669159544159544, + "grad_norm": 0.4998049736022949, + "learning_rate": 0.00019786472444878955, + "loss": 1.3214, + "step": 1499 + }, + { + "epoch": 0.2670940170940171, + "grad_norm": 0.42104312777519226, + "learning_rate": 0.00019786184635160295, + "loss": 0.7878, + "step": 1500 + }, + { + "epoch": 0.2672720797720798, + "grad_norm": 0.5354288220405579, + "learning_rate": 0.00019785896633701678, + "loss": 1.0642, + "step": 1501 + }, + { + "epoch": 0.2674501424501424, + "grad_norm": 0.4681485891342163, + "learning_rate": 0.00019785608440508744, + "loss": 1.1737, + "step": 1502 + }, + { + "epoch": 0.2676282051282051, + "grad_norm": 0.49107062816619873, + "learning_rate": 0.0001978532005558714, + "loss": 1.1507, + "step": 1503 + }, + { + "epoch": 0.2678062678062678, + "grad_norm": 0.4173283576965332, + "learning_rate": 0.0001978503147894252, + "loss": 1.0538, + "step": 1504 + }, + { + "epoch": 0.26798433048433046, + "grad_norm": 0.49354055523872375, + "learning_rate": 0.0001978474271058053, + "loss": 1.1043, + "step": 1505 + }, + { + "epoch": 0.26816239316239315, + "grad_norm": 0.5787215232849121, + "learning_rate": 0.00019784453750506834, + "loss": 0.9245, + "step": 1506 + }, + { + "epoch": 0.26834045584045585, + "grad_norm": 0.48982590436935425, + "learning_rate": 0.00019784164598727095, + "loss": 1.2007, + "step": 1507 + }, + { + "epoch": 0.26851851851851855, + "grad_norm": 0.4971007704734802, + "learning_rate": 0.00019783875255246973, + "loss": 1.1174, + "step": 1508 + }, + { + "epoch": 0.2686965811965812, + "grad_norm": 0.5200340151786804, + "learning_rate": 
0.00019783585720072142, + "loss": 1.1967, + "step": 1509 + }, + { + "epoch": 0.2688746438746439, + "grad_norm": 0.47911885380744934, + "learning_rate": 0.00019783295993208271, + "loss": 1.162, + "step": 1510 + }, + { + "epoch": 0.2690527065527066, + "grad_norm": 0.4764275848865509, + "learning_rate": 0.00019783006074661037, + "loss": 1.1358, + "step": 1511 + }, + { + "epoch": 0.2692307692307692, + "grad_norm": 0.478545606136322, + "learning_rate": 0.00019782715964436124, + "loss": 1.0096, + "step": 1512 + }, + { + "epoch": 0.2694088319088319, + "grad_norm": 0.5512787699699402, + "learning_rate": 0.00019782425662539212, + "loss": 1.1799, + "step": 1513 + }, + { + "epoch": 0.2695868945868946, + "grad_norm": 0.5495108962059021, + "learning_rate": 0.00019782135168975988, + "loss": 1.0959, + "step": 1514 + }, + { + "epoch": 0.26976495726495725, + "grad_norm": 0.42052868008613586, + "learning_rate": 0.0001978184448375215, + "loss": 1.1872, + "step": 1515 + }, + { + "epoch": 0.26994301994301995, + "grad_norm": 0.4994426965713501, + "learning_rate": 0.0001978155360687339, + "loss": 1.0568, + "step": 1516 + }, + { + "epoch": 0.27012108262108264, + "grad_norm": 0.459577351808548, + "learning_rate": 0.00019781262538345402, + "loss": 1.0315, + "step": 1517 + }, + { + "epoch": 0.2702991452991453, + "grad_norm": 0.4792841374874115, + "learning_rate": 0.00019780971278173895, + "loss": 1.2055, + "step": 1518 + }, + { + "epoch": 0.270477207977208, + "grad_norm": 0.5017708539962769, + "learning_rate": 0.00019780679826364575, + "loss": 1.157, + "step": 1519 + }, + { + "epoch": 0.2706552706552707, + "grad_norm": 0.5197349786758423, + "learning_rate": 0.00019780388182923152, + "loss": 0.9101, + "step": 1520 + }, + { + "epoch": 0.2708333333333333, + "grad_norm": 0.4226742684841156, + "learning_rate": 0.00019780096347855338, + "loss": 1.0525, + "step": 1521 + }, + { + "epoch": 0.271011396011396, + "grad_norm": 0.5058164596557617, + "learning_rate": 0.00019779804321166852, + "loss": 
0.931, + "step": 1522 + }, + { + "epoch": 0.2711894586894587, + "grad_norm": 0.44492244720458984, + "learning_rate": 0.00019779512102863418, + "loss": 1.0641, + "step": 1523 + }, + { + "epoch": 0.27136752136752135, + "grad_norm": 0.5348989963531494, + "learning_rate": 0.00019779219692950758, + "loss": 1.1692, + "step": 1524 + }, + { + "epoch": 0.27154558404558404, + "grad_norm": 0.4631774425506592, + "learning_rate": 0.00019778927091434602, + "loss": 1.0876, + "step": 1525 + }, + { + "epoch": 0.27172364672364674, + "grad_norm": 0.45957499742507935, + "learning_rate": 0.00019778634298320684, + "loss": 0.9527, + "step": 1526 + }, + { + "epoch": 0.2719017094017094, + "grad_norm": 0.4506755769252777, + "learning_rate": 0.00019778341313614743, + "loss": 1.086, + "step": 1527 + }, + { + "epoch": 0.2720797720797721, + "grad_norm": 0.4900587797164917, + "learning_rate": 0.00019778048137322513, + "loss": 0.9911, + "step": 1528 + }, + { + "epoch": 0.27225783475783477, + "grad_norm": 0.478127658367157, + "learning_rate": 0.00019777754769449745, + "loss": 1.2083, + "step": 1529 + }, + { + "epoch": 0.2724358974358974, + "grad_norm": 0.47220897674560547, + "learning_rate": 0.00019777461210002183, + "loss": 1.0313, + "step": 1530 + }, + { + "epoch": 0.2726139601139601, + "grad_norm": 0.4526277184486389, + "learning_rate": 0.0001977716745898558, + "loss": 1.2648, + "step": 1531 + }, + { + "epoch": 0.2727920227920228, + "grad_norm": 0.42907601594924927, + "learning_rate": 0.00019776873516405688, + "loss": 0.8645, + "step": 1532 + }, + { + "epoch": 0.27297008547008544, + "grad_norm": 0.43440163135528564, + "learning_rate": 0.00019776579382268272, + "loss": 0.9702, + "step": 1533 + }, + { + "epoch": 0.27314814814814814, + "grad_norm": 0.48213550448417664, + "learning_rate": 0.0001977628505657909, + "loss": 0.998, + "step": 1534 + }, + { + "epoch": 0.27332621082621084, + "grad_norm": 0.43385565280914307, + "learning_rate": 0.00019775990539343914, + "loss": 1.0575, + "step": 1535 + }, 
+ { + "epoch": 0.27350427350427353, + "grad_norm": 0.45706847310066223, + "learning_rate": 0.00019775695830568507, + "loss": 1.3024, + "step": 1536 + }, + { + "epoch": 0.27368233618233617, + "grad_norm": 0.45769137144088745, + "learning_rate": 0.00019775400930258652, + "loss": 1.0987, + "step": 1537 + }, + { + "epoch": 0.27386039886039887, + "grad_norm": 0.44682395458221436, + "learning_rate": 0.00019775105838420117, + "loss": 1.1327, + "step": 1538 + }, + { + "epoch": 0.27403846153846156, + "grad_norm": 0.5923072099685669, + "learning_rate": 0.00019774810555058694, + "loss": 1.4766, + "step": 1539 + }, + { + "epoch": 0.2742165242165242, + "grad_norm": 0.4327206015586853, + "learning_rate": 0.0001977451508018016, + "loss": 1.1175, + "step": 1540 + }, + { + "epoch": 0.2743945868945869, + "grad_norm": 0.48036691546440125, + "learning_rate": 0.00019774219413790315, + "loss": 1.1189, + "step": 1541 + }, + { + "epoch": 0.2745726495726496, + "grad_norm": 0.41371914744377136, + "learning_rate": 0.00019773923555894935, + "loss": 1.1366, + "step": 1542 + }, + { + "epoch": 0.27475071225071224, + "grad_norm": 0.4452378749847412, + "learning_rate": 0.00019773627506499832, + "loss": 0.9517, + "step": 1543 + }, + { + "epoch": 0.27492877492877493, + "grad_norm": 0.469098299741745, + "learning_rate": 0.00019773331265610802, + "loss": 1.0848, + "step": 1544 + }, + { + "epoch": 0.27510683760683763, + "grad_norm": 0.5390294790267944, + "learning_rate": 0.00019773034833233646, + "loss": 0.8589, + "step": 1545 + }, + { + "epoch": 0.27528490028490027, + "grad_norm": 0.5368238091468811, + "learning_rate": 0.00019772738209374174, + "loss": 1.2954, + "step": 1546 + }, + { + "epoch": 0.27546296296296297, + "grad_norm": 0.4705318510532379, + "learning_rate": 0.00019772441394038198, + "loss": 1.2252, + "step": 1547 + }, + { + "epoch": 0.27564102564102566, + "grad_norm": 0.4682813286781311, + "learning_rate": 0.00019772144387231533, + "loss": 1.0855, + "step": 1548 + }, + { + "epoch": 
0.2758190883190883, + "grad_norm": 0.46876460313796997, + "learning_rate": 0.0001977184718896, + "loss": 1.1959, + "step": 1549 + }, + { + "epoch": 0.275997150997151, + "grad_norm": 0.4172806441783905, + "learning_rate": 0.00019771549799229416, + "loss": 1.2166, + "step": 1550 + }, + { + "epoch": 0.2761752136752137, + "grad_norm": 0.5088075399398804, + "learning_rate": 0.0001977125221804562, + "loss": 1.1285, + "step": 1551 + }, + { + "epoch": 0.27635327635327633, + "grad_norm": 0.4728628396987915, + "learning_rate": 0.0001977095444541443, + "loss": 1.2985, + "step": 1552 + }, + { + "epoch": 0.27653133903133903, + "grad_norm": 0.4431236684322357, + "learning_rate": 0.00019770656481341684, + "loss": 1.1298, + "step": 1553 + }, + { + "epoch": 0.2767094017094017, + "grad_norm": 0.474065363407135, + "learning_rate": 0.00019770358325833223, + "loss": 1.1915, + "step": 1554 + }, + { + "epoch": 0.27688746438746437, + "grad_norm": 0.45718875527381897, + "learning_rate": 0.00019770059978894885, + "loss": 1.0626, + "step": 1555 + }, + { + "epoch": 0.27706552706552706, + "grad_norm": 0.49300211668014526, + "learning_rate": 0.00019769761440532522, + "loss": 1.0134, + "step": 1556 + }, + { + "epoch": 0.27724358974358976, + "grad_norm": 0.4389498829841614, + "learning_rate": 0.00019769462710751974, + "loss": 1.0292, + "step": 1557 + }, + { + "epoch": 0.2774216524216524, + "grad_norm": 0.47330448031425476, + "learning_rate": 0.000197691637895591, + "loss": 1.1273, + "step": 1558 + }, + { + "epoch": 0.2775997150997151, + "grad_norm": 0.5322058200836182, + "learning_rate": 0.00019768864676959755, + "loss": 1.059, + "step": 1559 + }, + { + "epoch": 0.2777777777777778, + "grad_norm": 0.4714536964893341, + "learning_rate": 0.000197685653729598, + "loss": 1.1987, + "step": 1560 + }, + { + "epoch": 0.27795584045584043, + "grad_norm": 0.48687809705734253, + "learning_rate": 0.00019768265877565097, + "loss": 1.3206, + "step": 1561 + }, + { + "epoch": 0.2781339031339031, + "grad_norm": 
0.46066713333129883, + "learning_rate": 0.00019767966190781518, + "loss": 1.0845, + "step": 1562 + }, + { + "epoch": 0.2783119658119658, + "grad_norm": 0.44372090697288513, + "learning_rate": 0.00019767666312614935, + "loss": 1.0942, + "step": 1563 + }, + { + "epoch": 0.27849002849002846, + "grad_norm": 0.4615907073020935, + "learning_rate": 0.00019767366243071216, + "loss": 1.071, + "step": 1564 + }, + { + "epoch": 0.27866809116809116, + "grad_norm": 0.502097487449646, + "learning_rate": 0.0001976706598215625, + "loss": 1.1164, + "step": 1565 + }, + { + "epoch": 0.27884615384615385, + "grad_norm": 0.4371815621852875, + "learning_rate": 0.00019766765529875913, + "loss": 1.0252, + "step": 1566 + }, + { + "epoch": 0.27902421652421655, + "grad_norm": 0.43035808205604553, + "learning_rate": 0.00019766464886236093, + "loss": 1.073, + "step": 1567 + }, + { + "epoch": 0.2792022792022792, + "grad_norm": 0.49721601605415344, + "learning_rate": 0.00019766164051242683, + "loss": 1.0316, + "step": 1568 + }, + { + "epoch": 0.2793803418803419, + "grad_norm": 0.44866231083869934, + "learning_rate": 0.00019765863024901576, + "loss": 1.0951, + "step": 1569 + }, + { + "epoch": 0.2795584045584046, + "grad_norm": 0.46318337321281433, + "learning_rate": 0.0001976556180721867, + "loss": 0.9836, + "step": 1570 + }, + { + "epoch": 0.2797364672364672, + "grad_norm": 0.4227696657180786, + "learning_rate": 0.00019765260398199868, + "loss": 1.0414, + "step": 1571 + }, + { + "epoch": 0.2799145299145299, + "grad_norm": 0.6062980890274048, + "learning_rate": 0.00019764958797851073, + "loss": 1.137, + "step": 1572 + }, + { + "epoch": 0.2800925925925926, + "grad_norm": 0.4856833219528198, + "learning_rate": 0.00019764657006178196, + "loss": 1.1361, + "step": 1573 + }, + { + "epoch": 0.28027065527065526, + "grad_norm": 0.45612895488739014, + "learning_rate": 0.00019764355023187146, + "loss": 1.0005, + "step": 1574 + }, + { + "epoch": 0.28044871794871795, + "grad_norm": 0.4143696129322052, + 
"learning_rate": 0.00019764052848883845, + "loss": 1.051, + "step": 1575 + }, + { + "epoch": 0.28062678062678065, + "grad_norm": 0.4532071352005005, + "learning_rate": 0.00019763750483274212, + "loss": 1.0595, + "step": 1576 + }, + { + "epoch": 0.2808048433048433, + "grad_norm": 0.4940357208251953, + "learning_rate": 0.0001976344792636417, + "loss": 1.0983, + "step": 1577 + }, + { + "epoch": 0.280982905982906, + "grad_norm": 0.44405099749565125, + "learning_rate": 0.0001976314517815965, + "loss": 1.0846, + "step": 1578 + }, + { + "epoch": 0.2811609686609687, + "grad_norm": 0.5508625507354736, + "learning_rate": 0.00019762842238666578, + "loss": 1.1722, + "step": 1579 + }, + { + "epoch": 0.2813390313390313, + "grad_norm": 0.5241084694862366, + "learning_rate": 0.00019762539107890894, + "loss": 1.351, + "step": 1580 + }, + { + "epoch": 0.281517094017094, + "grad_norm": 0.5307353734970093, + "learning_rate": 0.00019762235785838537, + "loss": 1.1868, + "step": 1581 + }, + { + "epoch": 0.2816951566951567, + "grad_norm": 0.45697924494743347, + "learning_rate": 0.00019761932272515447, + "loss": 1.1982, + "step": 1582 + }, + { + "epoch": 0.28187321937321935, + "grad_norm": 0.412483811378479, + "learning_rate": 0.00019761628567927574, + "loss": 1.0433, + "step": 1583 + }, + { + "epoch": 0.28205128205128205, + "grad_norm": 0.4614165425300598, + "learning_rate": 0.00019761324672080868, + "loss": 1.104, + "step": 1584 + }, + { + "epoch": 0.28222934472934474, + "grad_norm": 0.47644901275634766, + "learning_rate": 0.00019761020584981284, + "loss": 1.1037, + "step": 1585 + }, + { + "epoch": 0.2824074074074074, + "grad_norm": 0.4985184669494629, + "learning_rate": 0.00019760716306634773, + "loss": 1.2213, + "step": 1586 + }, + { + "epoch": 0.2825854700854701, + "grad_norm": 0.508301317691803, + "learning_rate": 0.00019760411837047305, + "loss": 1.1315, + "step": 1587 + }, + { + "epoch": 0.2827635327635328, + "grad_norm": 0.5346587300300598, + "learning_rate": 
0.00019760107176224845, + "loss": 1.2281, + "step": 1588 + }, + { + "epoch": 0.2829415954415954, + "grad_norm": 0.5106825232505798, + "learning_rate": 0.00019759802324173357, + "loss": 1.2904, + "step": 1589 + }, + { + "epoch": 0.2831196581196581, + "grad_norm": 0.46458688378334045, + "learning_rate": 0.00019759497280898817, + "loss": 1.0861, + "step": 1590 + }, + { + "epoch": 0.2832977207977208, + "grad_norm": 0.49115365743637085, + "learning_rate": 0.00019759192046407201, + "loss": 1.0529, + "step": 1591 + }, + { + "epoch": 0.28347578347578345, + "grad_norm": 0.5114167332649231, + "learning_rate": 0.0001975888662070449, + "loss": 1.2555, + "step": 1592 + }, + { + "epoch": 0.28365384615384615, + "grad_norm": 0.45844775438308716, + "learning_rate": 0.0001975858100379667, + "loss": 1.0662, + "step": 1593 + }, + { + "epoch": 0.28383190883190884, + "grad_norm": 0.4684161841869354, + "learning_rate": 0.00019758275195689727, + "loss": 1.0537, + "step": 1594 + }, + { + "epoch": 0.28400997150997154, + "grad_norm": 0.4816220998764038, + "learning_rate": 0.0001975796919638965, + "loss": 1.126, + "step": 1595 + }, + { + "epoch": 0.2841880341880342, + "grad_norm": 0.46578118205070496, + "learning_rate": 0.0001975766300590244, + "loss": 0.9651, + "step": 1596 + }, + { + "epoch": 0.2843660968660969, + "grad_norm": 0.4181675612926483, + "learning_rate": 0.0001975735662423409, + "loss": 1.0888, + "step": 1597 + }, + { + "epoch": 0.28454415954415957, + "grad_norm": 0.49417954683303833, + "learning_rate": 0.00019757050051390609, + "loss": 1.1878, + "step": 1598 + }, + { + "epoch": 0.2847222222222222, + "grad_norm": 0.47264960408210754, + "learning_rate": 0.00019756743287377998, + "loss": 1.027, + "step": 1599 + }, + { + "epoch": 0.2849002849002849, + "grad_norm": 0.47686338424682617, + "learning_rate": 0.0001975643633220227, + "loss": 1.1307, + "step": 1600 + }, + { + "epoch": 0.2850783475783476, + "grad_norm": 0.5571266412734985, + "learning_rate": 0.00019756129185869443, + 
"loss": 0.984, + "step": 1601 + }, + { + "epoch": 0.28525641025641024, + "grad_norm": 0.46942809224128723, + "learning_rate": 0.00019755821848385527, + "loss": 1.0397, + "step": 1602 + }, + { + "epoch": 0.28543447293447294, + "grad_norm": 0.6325890421867371, + "learning_rate": 0.00019755514319756551, + "loss": 1.0918, + "step": 1603 + }, + { + "epoch": 0.28561253561253563, + "grad_norm": 0.5297608375549316, + "learning_rate": 0.00019755206599988533, + "loss": 0.9911, + "step": 1604 + }, + { + "epoch": 0.2857905982905983, + "grad_norm": 0.4736945331096649, + "learning_rate": 0.00019754898689087512, + "loss": 1.0786, + "step": 1605 + }, + { + "epoch": 0.28596866096866097, + "grad_norm": 0.5048685669898987, + "learning_rate": 0.00019754590587059512, + "loss": 0.9834, + "step": 1606 + }, + { + "epoch": 0.28614672364672367, + "grad_norm": 0.3823149502277374, + "learning_rate": 0.00019754282293910574, + "loss": 0.8341, + "step": 1607 + }, + { + "epoch": 0.2863247863247863, + "grad_norm": 0.44071945548057556, + "learning_rate": 0.00019753973809646738, + "loss": 1.131, + "step": 1608 + }, + { + "epoch": 0.286502849002849, + "grad_norm": 0.44182759523391724, + "learning_rate": 0.00019753665134274043, + "loss": 1.0321, + "step": 1609 + }, + { + "epoch": 0.2866809116809117, + "grad_norm": 0.4486250877380371, + "learning_rate": 0.00019753356267798546, + "loss": 0.9941, + "step": 1610 + }, + { + "epoch": 0.28685897435897434, + "grad_norm": 0.42796584963798523, + "learning_rate": 0.00019753047210226292, + "loss": 1.0235, + "step": 1611 + }, + { + "epoch": 0.28703703703703703, + "grad_norm": 0.47294023633003235, + "learning_rate": 0.00019752737961563336, + "loss": 1.11, + "step": 1612 + }, + { + "epoch": 0.28721509971509973, + "grad_norm": 0.44550734758377075, + "learning_rate": 0.00019752428521815742, + "loss": 1.0849, + "step": 1613 + }, + { + "epoch": 0.28739316239316237, + "grad_norm": 0.44189929962158203, + "learning_rate": 0.0001975211889098957, + "loss": 0.8904, + "step": 
1614 + }, + { + "epoch": 0.28757122507122507, + "grad_norm": 0.5302733182907104, + "learning_rate": 0.00019751809069090885, + "loss": 1.2348, + "step": 1615 + }, + { + "epoch": 0.28774928774928776, + "grad_norm": 0.5951390862464905, + "learning_rate": 0.00019751499056125762, + "loss": 1.3035, + "step": 1616 + }, + { + "epoch": 0.2879273504273504, + "grad_norm": 0.5431534647941589, + "learning_rate": 0.0001975118885210027, + "loss": 1.0016, + "step": 1617 + }, + { + "epoch": 0.2881054131054131, + "grad_norm": 0.47301986813545227, + "learning_rate": 0.00019750878457020489, + "loss": 1.2245, + "step": 1618 + }, + { + "epoch": 0.2882834757834758, + "grad_norm": 0.44785359501838684, + "learning_rate": 0.00019750567870892497, + "loss": 1.122, + "step": 1619 + }, + { + "epoch": 0.28846153846153844, + "grad_norm": 0.49494361877441406, + "learning_rate": 0.00019750257093722383, + "loss": 0.9421, + "step": 1620 + }, + { + "epoch": 0.28863960113960113, + "grad_norm": 0.4484521150588989, + "learning_rate": 0.00019749946125516242, + "loss": 1.2146, + "step": 1621 + }, + { + "epoch": 0.28881766381766383, + "grad_norm": 0.4635269343852997, + "learning_rate": 0.00019749634966280156, + "loss": 0.976, + "step": 1622 + }, + { + "epoch": 0.28899572649572647, + "grad_norm": 0.5532249808311462, + "learning_rate": 0.00019749323616020226, + "loss": 1.1818, + "step": 1623 + }, + { + "epoch": 0.28917378917378916, + "grad_norm": 0.4730629622936249, + "learning_rate": 0.00019749012074742552, + "loss": 1.0321, + "step": 1624 + }, + { + "epoch": 0.28935185185185186, + "grad_norm": 0.47437289357185364, + "learning_rate": 0.0001974870034245324, + "loss": 1.1572, + "step": 1625 + }, + { + "epoch": 0.28952991452991456, + "grad_norm": 0.4796304404735565, + "learning_rate": 0.00019748388419158394, + "loss": 1.1667, + "step": 1626 + }, + { + "epoch": 0.2897079772079772, + "grad_norm": 0.42686304450035095, + "learning_rate": 0.0001974807630486413, + "loss": 0.9824, + "step": 1627 + }, + { + "epoch": 
0.2898860398860399, + "grad_norm": 0.4444865584373474, + "learning_rate": 0.00019747763999576558, + "loss": 1.2789, + "step": 1628 + }, + { + "epoch": 0.2900641025641026, + "grad_norm": 0.5039985179901123, + "learning_rate": 0.000197474515033018, + "loss": 1.1488, + "step": 1629 + }, + { + "epoch": 0.29024216524216523, + "grad_norm": 0.581479549407959, + "learning_rate": 0.00019747138816045978, + "loss": 1.1232, + "step": 1630 + }, + { + "epoch": 0.2904202279202279, + "grad_norm": 0.5415821075439453, + "learning_rate": 0.00019746825937815222, + "loss": 1.2326, + "step": 1631 + }, + { + "epoch": 0.2905982905982906, + "grad_norm": 0.45528364181518555, + "learning_rate": 0.00019746512868615656, + "loss": 1.0246, + "step": 1632 + }, + { + "epoch": 0.29077635327635326, + "grad_norm": 0.5255574584007263, + "learning_rate": 0.00019746199608453418, + "loss": 1.0592, + "step": 1633 + }, + { + "epoch": 0.29095441595441596, + "grad_norm": 0.5064096450805664, + "learning_rate": 0.00019745886157334646, + "loss": 1.3439, + "step": 1634 + }, + { + "epoch": 0.29113247863247865, + "grad_norm": 0.500848650932312, + "learning_rate": 0.00019745572515265475, + "loss": 1.1212, + "step": 1635 + }, + { + "epoch": 0.2913105413105413, + "grad_norm": 0.5229088068008423, + "learning_rate": 0.00019745258682252062, + "loss": 1.1019, + "step": 1636 + }, + { + "epoch": 0.291488603988604, + "grad_norm": 0.4494398832321167, + "learning_rate": 0.00019744944658300545, + "loss": 1.1298, + "step": 1637 + }, + { + "epoch": 0.2916666666666667, + "grad_norm": 0.48383277654647827, + "learning_rate": 0.00019744630443417082, + "loss": 1.206, + "step": 1638 + }, + { + "epoch": 0.2918447293447293, + "grad_norm": 0.4870131313800812, + "learning_rate": 0.00019744316037607828, + "loss": 1.2096, + "step": 1639 + }, + { + "epoch": 0.292022792022792, + "grad_norm": 0.4153090715408325, + "learning_rate": 0.00019744001440878944, + "loss": 1.0478, + "step": 1640 + }, + { + "epoch": 0.2922008547008547, + "grad_norm": 
0.4262249171733856, + "learning_rate": 0.0001974368665323659, + "loss": 1.0393, + "step": 1641 + }, + { + "epoch": 0.29237891737891736, + "grad_norm": 0.46131134033203125, + "learning_rate": 0.00019743371674686938, + "loss": 1.0908, + "step": 1642 + }, + { + "epoch": 0.29255698005698005, + "grad_norm": 0.44877463579177856, + "learning_rate": 0.0001974305650523616, + "loss": 1.1906, + "step": 1643 + }, + { + "epoch": 0.29273504273504275, + "grad_norm": 0.5199326276779175, + "learning_rate": 0.00019742741144890432, + "loss": 1.1147, + "step": 1644 + }, + { + "epoch": 0.2929131054131054, + "grad_norm": 0.48142504692077637, + "learning_rate": 0.00019742425593655924, + "loss": 1.1951, + "step": 1645 + }, + { + "epoch": 0.2930911680911681, + "grad_norm": 0.5672988891601562, + "learning_rate": 0.0001974210985153883, + "loss": 1.1817, + "step": 1646 + }, + { + "epoch": 0.2932692307692308, + "grad_norm": 0.38135233521461487, + "learning_rate": 0.00019741793918545326, + "loss": 0.8567, + "step": 1647 + }, + { + "epoch": 0.2934472934472934, + "grad_norm": 0.6153588891029358, + "learning_rate": 0.0001974147779468161, + "loss": 1.0593, + "step": 1648 + }, + { + "epoch": 0.2936253561253561, + "grad_norm": 0.38935527205467224, + "learning_rate": 0.0001974116147995387, + "loss": 0.9907, + "step": 1649 + }, + { + "epoch": 0.2938034188034188, + "grad_norm": 0.467351496219635, + "learning_rate": 0.0001974084497436831, + "loss": 1.091, + "step": 1650 + }, + { + "epoch": 0.29398148148148145, + "grad_norm": 0.45613420009613037, + "learning_rate": 0.00019740528277931128, + "loss": 0.6789, + "step": 1651 + }, + { + "epoch": 0.29415954415954415, + "grad_norm": 0.4045158326625824, + "learning_rate": 0.00019740211390648524, + "loss": 1.0727, + "step": 1652 + }, + { + "epoch": 0.29433760683760685, + "grad_norm": 0.5122803449630737, + "learning_rate": 0.00019739894312526714, + "loss": 1.2297, + "step": 1653 + }, + { + "epoch": 0.29451566951566954, + "grad_norm": 0.44304123520851135, + 
"learning_rate": 0.00019739577043571908, + "loss": 0.9562, + "step": 1654 + }, + { + "epoch": 0.2946937321937322, + "grad_norm": 0.6070618629455566, + "learning_rate": 0.00019739259583790322, + "loss": 1.2745, + "step": 1655 + }, + { + "epoch": 0.2948717948717949, + "grad_norm": 0.48815637826919556, + "learning_rate": 0.00019738941933188176, + "loss": 1.0574, + "step": 1656 + }, + { + "epoch": 0.2950498575498576, + "grad_norm": 0.5067802667617798, + "learning_rate": 0.00019738624091771693, + "loss": 1.1874, + "step": 1657 + }, + { + "epoch": 0.2952279202279202, + "grad_norm": 0.4956928491592407, + "learning_rate": 0.000197383060595471, + "loss": 1.1085, + "step": 1658 + }, + { + "epoch": 0.2954059829059829, + "grad_norm": 0.46313008666038513, + "learning_rate": 0.00019737987836520633, + "loss": 1.0548, + "step": 1659 + }, + { + "epoch": 0.2955840455840456, + "grad_norm": 0.49944064021110535, + "learning_rate": 0.0001973766942269852, + "loss": 1.1485, + "step": 1660 + }, + { + "epoch": 0.29576210826210825, + "grad_norm": 0.4743517339229584, + "learning_rate": 0.00019737350818087003, + "loss": 0.9279, + "step": 1661 + }, + { + "epoch": 0.29594017094017094, + "grad_norm": 0.45935431122779846, + "learning_rate": 0.00019737032022692326, + "loss": 0.9574, + "step": 1662 + }, + { + "epoch": 0.29611823361823364, + "grad_norm": 0.4550873637199402, + "learning_rate": 0.00019736713036520734, + "loss": 1.1642, + "step": 1663 + }, + { + "epoch": 0.2962962962962963, + "grad_norm": 0.45252951979637146, + "learning_rate": 0.00019736393859578474, + "loss": 1.0113, + "step": 1664 + }, + { + "epoch": 0.296474358974359, + "grad_norm": 0.5147238969802856, + "learning_rate": 0.00019736074491871804, + "loss": 1.1604, + "step": 1665 + }, + { + "epoch": 0.29665242165242167, + "grad_norm": 0.5122934579849243, + "learning_rate": 0.00019735754933406977, + "loss": 0.9525, + "step": 1666 + }, + { + "epoch": 0.2968304843304843, + "grad_norm": 0.438620001077652, + "learning_rate": 
0.00019735435184190257, + "loss": 1.0728, + "step": 1667 + }, + { + "epoch": 0.297008547008547, + "grad_norm": 0.41970670223236084, + "learning_rate": 0.00019735115244227908, + "loss": 0.9782, + "step": 1668 + }, + { + "epoch": 0.2971866096866097, + "grad_norm": 0.5447152256965637, + "learning_rate": 0.000197347951135262, + "loss": 1.0633, + "step": 1669 + }, + { + "epoch": 0.29736467236467234, + "grad_norm": 0.4846996068954468, + "learning_rate": 0.00019734474792091407, + "loss": 0.9019, + "step": 1670 + }, + { + "epoch": 0.29754273504273504, + "grad_norm": 0.4721437990665436, + "learning_rate": 0.00019734154279929796, + "loss": 1.1793, + "step": 1671 + }, + { + "epoch": 0.29772079772079774, + "grad_norm": 0.4659852385520935, + "learning_rate": 0.00019733833577047655, + "loss": 1.1503, + "step": 1672 + }, + { + "epoch": 0.2978988603988604, + "grad_norm": 0.3733183443546295, + "learning_rate": 0.00019733512683451268, + "loss": 0.7763, + "step": 1673 + }, + { + "epoch": 0.2980769230769231, + "grad_norm": 0.4898292124271393, + "learning_rate": 0.0001973319159914692, + "loss": 1.3146, + "step": 1674 + }, + { + "epoch": 0.29825498575498577, + "grad_norm": 0.41774725914001465, + "learning_rate": 0.00019732870324140899, + "loss": 1.2069, + "step": 1675 + }, + { + "epoch": 0.2984330484330484, + "grad_norm": 0.4607912003993988, + "learning_rate": 0.000197325488584395, + "loss": 1.2255, + "step": 1676 + }, + { + "epoch": 0.2986111111111111, + "grad_norm": 0.4692424237728119, + "learning_rate": 0.00019732227202049025, + "loss": 1.0793, + "step": 1677 + }, + { + "epoch": 0.2987891737891738, + "grad_norm": 0.5925022959709167, + "learning_rate": 0.00019731905354975778, + "loss": 1.0297, + "step": 1678 + }, + { + "epoch": 0.29896723646723644, + "grad_norm": 0.44047990441322327, + "learning_rate": 0.00019731583317226056, + "loss": 1.0982, + "step": 1679 + }, + { + "epoch": 0.29914529914529914, + "grad_norm": 0.5863066911697388, + "learning_rate": 0.0001973126108880618, + "loss": 
1.0284, + "step": 1680 + }, + { + "epoch": 0.29932336182336183, + "grad_norm": 0.48962152004241943, + "learning_rate": 0.00019730938669722457, + "loss": 1.1861, + "step": 1681 + }, + { + "epoch": 0.29950142450142453, + "grad_norm": 0.5445577502250671, + "learning_rate": 0.00019730616059981205, + "loss": 1.2574, + "step": 1682 + }, + { + "epoch": 0.29967948717948717, + "grad_norm": 0.49327564239501953, + "learning_rate": 0.00019730293259588743, + "loss": 0.9578, + "step": 1683 + }, + { + "epoch": 0.29985754985754987, + "grad_norm": 0.4252840578556061, + "learning_rate": 0.00019729970268551398, + "loss": 1.0083, + "step": 1684 + }, + { + "epoch": 0.30003561253561256, + "grad_norm": 0.5140926241874695, + "learning_rate": 0.000197296470868755, + "loss": 1.3263, + "step": 1685 + }, + { + "epoch": 0.3002136752136752, + "grad_norm": 0.5143948197364807, + "learning_rate": 0.00019729323714567375, + "loss": 1.0424, + "step": 1686 + }, + { + "epoch": 0.3003917378917379, + "grad_norm": 0.3811354339122772, + "learning_rate": 0.00019729000151633367, + "loss": 0.6319, + "step": 1687 + }, + { + "epoch": 0.3005698005698006, + "grad_norm": 0.5249716639518738, + "learning_rate": 0.0001972867639807981, + "loss": 1.0173, + "step": 1688 + }, + { + "epoch": 0.30074786324786323, + "grad_norm": 0.41832098364830017, + "learning_rate": 0.00019728352453913048, + "loss": 1.0503, + "step": 1689 + }, + { + "epoch": 0.30092592592592593, + "grad_norm": 0.5961149334907532, + "learning_rate": 0.00019728028319139428, + "loss": 1.1843, + "step": 1690 + }, + { + "epoch": 0.3011039886039886, + "grad_norm": 0.44083690643310547, + "learning_rate": 0.00019727703993765303, + "loss": 1.1311, + "step": 1691 + }, + { + "epoch": 0.30128205128205127, + "grad_norm": 0.4368111491203308, + "learning_rate": 0.00019727379477797022, + "loss": 0.9463, + "step": 1692 + }, + { + "epoch": 0.30146011396011396, + "grad_norm": 0.5289376974105835, + "learning_rate": 0.00019727054771240954, + "loss": 0.9836, + "step": 1693 + 
}, + { + "epoch": 0.30163817663817666, + "grad_norm": 0.4132843613624573, + "learning_rate": 0.00019726729874103448, + "loss": 1.1052, + "step": 1694 + }, + { + "epoch": 0.3018162393162393, + "grad_norm": 0.4919086992740631, + "learning_rate": 0.00019726404786390877, + "loss": 1.2219, + "step": 1695 + }, + { + "epoch": 0.301994301994302, + "grad_norm": 0.42561691999435425, + "learning_rate": 0.0001972607950810961, + "loss": 1.0756, + "step": 1696 + }, + { + "epoch": 0.3021723646723647, + "grad_norm": 0.5030396580696106, + "learning_rate": 0.0001972575403926602, + "loss": 1.2207, + "step": 1697 + }, + { + "epoch": 0.30235042735042733, + "grad_norm": 0.4779801666736603, + "learning_rate": 0.0001972542837986648, + "loss": 1.194, + "step": 1698 + }, + { + "epoch": 0.30252849002849, + "grad_norm": 0.45395568013191223, + "learning_rate": 0.00019725102529917377, + "loss": 1.0775, + "step": 1699 + }, + { + "epoch": 0.3027065527065527, + "grad_norm": 0.6540699005126953, + "learning_rate": 0.0001972477648942509, + "loss": 1.181, + "step": 1700 + }, + { + "epoch": 0.30288461538461536, + "grad_norm": 0.46281275153160095, + "learning_rate": 0.00019724450258396008, + "loss": 0.629, + "step": 1701 + }, + { + "epoch": 0.30306267806267806, + "grad_norm": 0.3452845811843872, + "learning_rate": 0.00019724123836836527, + "loss": 0.51, + "step": 1702 + }, + { + "epoch": 0.30324074074074076, + "grad_norm": 0.4507991671562195, + "learning_rate": 0.00019723797224753038, + "loss": 1.0258, + "step": 1703 + }, + { + "epoch": 0.3034188034188034, + "grad_norm": 0.5385412573814392, + "learning_rate": 0.0001972347042215194, + "loss": 1.0232, + "step": 1704 + }, + { + "epoch": 0.3035968660968661, + "grad_norm": 0.4460466504096985, + "learning_rate": 0.00019723143429039642, + "loss": 1.1307, + "step": 1705 + }, + { + "epoch": 0.3037749287749288, + "grad_norm": 0.5229718685150146, + "learning_rate": 0.00019722816245422545, + "loss": 1.0964, + "step": 1706 + }, + { + "epoch": 0.30395299145299143, + 
"grad_norm": 0.4776979088783264, + "learning_rate": 0.00019722488871307058, + "loss": 1.2678, + "step": 1707 + }, + { + "epoch": 0.3041310541310541, + "grad_norm": 0.5371831655502319, + "learning_rate": 0.00019722161306699601, + "loss": 1.2808, + "step": 1708 + }, + { + "epoch": 0.3043091168091168, + "grad_norm": 0.45322108268737793, + "learning_rate": 0.0001972183355160659, + "loss": 1.0775, + "step": 1709 + }, + { + "epoch": 0.30448717948717946, + "grad_norm": 0.5036569833755493, + "learning_rate": 0.00019721505606034448, + "loss": 1.1859, + "step": 1710 + }, + { + "epoch": 0.30466524216524216, + "grad_norm": 0.5425969958305359, + "learning_rate": 0.00019721177469989593, + "loss": 1.0173, + "step": 1711 + }, + { + "epoch": 0.30484330484330485, + "grad_norm": 0.5638980269432068, + "learning_rate": 0.00019720849143478462, + "loss": 1.182, + "step": 1712 + }, + { + "epoch": 0.30502136752136755, + "grad_norm": 0.5160546898841858, + "learning_rate": 0.00019720520626507486, + "loss": 0.9853, + "step": 1713 + }, + { + "epoch": 0.3051994301994302, + "grad_norm": 0.5079004168510437, + "learning_rate": 0.000197201919190831, + "loss": 1.3154, + "step": 1714 + }, + { + "epoch": 0.3053774928774929, + "grad_norm": 0.4590355455875397, + "learning_rate": 0.00019719863021211745, + "loss": 1.007, + "step": 1715 + }, + { + "epoch": 0.3055555555555556, + "grad_norm": 0.49656423926353455, + "learning_rate": 0.00019719533932899865, + "loss": 1.2187, + "step": 1716 + }, + { + "epoch": 0.3057336182336182, + "grad_norm": 0.46426209807395935, + "learning_rate": 0.0001971920465415391, + "loss": 1.3007, + "step": 1717 + }, + { + "epoch": 0.3059116809116809, + "grad_norm": 0.5211917757987976, + "learning_rate": 0.00019718875184980328, + "loss": 1.2256, + "step": 1718 + }, + { + "epoch": 0.3060897435897436, + "grad_norm": 0.42953309416770935, + "learning_rate": 0.00019718545525385578, + "loss": 1.2838, + "step": 1719 + }, + { + "epoch": 0.30626780626780625, + "grad_norm": 0.4893105924129486, 
+ "learning_rate": 0.00019718215675376116, + "loss": 1.052, + "step": 1720 + }, + { + "epoch": 0.30644586894586895, + "grad_norm": 0.4833602011203766, + "learning_rate": 0.00019717885634958405, + "loss": 1.069, + "step": 1721 + }, + { + "epoch": 0.30662393162393164, + "grad_norm": 0.502176821231842, + "learning_rate": 0.0001971755540413891, + "loss": 1.1659, + "step": 1722 + }, + { + "epoch": 0.3068019943019943, + "grad_norm": 0.4648856818675995, + "learning_rate": 0.00019717224982924108, + "loss": 1.1873, + "step": 1723 + }, + { + "epoch": 0.306980056980057, + "grad_norm": 0.405429869890213, + "learning_rate": 0.00019716894371320465, + "loss": 0.99, + "step": 1724 + }, + { + "epoch": 0.3071581196581197, + "grad_norm": 0.4306945204734802, + "learning_rate": 0.00019716563569334463, + "loss": 0.8751, + "step": 1725 + }, + { + "epoch": 0.3073361823361823, + "grad_norm": 0.49424824118614197, + "learning_rate": 0.00019716232576972583, + "loss": 0.9205, + "step": 1726 + }, + { + "epoch": 0.307514245014245, + "grad_norm": 0.5044034123420715, + "learning_rate": 0.00019715901394241306, + "loss": 1.2042, + "step": 1727 + }, + { + "epoch": 0.3076923076923077, + "grad_norm": 0.512180507183075, + "learning_rate": 0.00019715570021147126, + "loss": 1.1644, + "step": 1728 + }, + { + "epoch": 0.30787037037037035, + "grad_norm": 0.4377981126308441, + "learning_rate": 0.00019715238457696538, + "loss": 1.1625, + "step": 1729 + }, + { + "epoch": 0.30804843304843305, + "grad_norm": 0.49107855558395386, + "learning_rate": 0.00019714906703896027, + "loss": 1.1037, + "step": 1730 + }, + { + "epoch": 0.30822649572649574, + "grad_norm": 0.47342559695243835, + "learning_rate": 0.00019714574759752105, + "loss": 1.3186, + "step": 1731 + }, + { + "epoch": 0.3084045584045584, + "grad_norm": 0.487177312374115, + "learning_rate": 0.0001971424262527127, + "loss": 1.1196, + "step": 1732 + }, + { + "epoch": 0.3085826210826211, + "grad_norm": 0.5290025472640991, + "learning_rate": 
0.0001971391030046003, + "loss": 1.2103, + "step": 1733 + }, + { + "epoch": 0.3087606837606838, + "grad_norm": 0.4587760269641876, + "learning_rate": 0.00019713577785324896, + "loss": 1.1017, + "step": 1734 + }, + { + "epoch": 0.3089387464387464, + "grad_norm": 0.45323294401168823, + "learning_rate": 0.00019713245079872388, + "loss": 1.0, + "step": 1735 + }, + { + "epoch": 0.3091168091168091, + "grad_norm": 0.43414804339408875, + "learning_rate": 0.00019712912184109013, + "loss": 1.0341, + "step": 1736 + }, + { + "epoch": 0.3092948717948718, + "grad_norm": 0.49604663252830505, + "learning_rate": 0.00019712579098041304, + "loss": 0.9437, + "step": 1737 + }, + { + "epoch": 0.30947293447293445, + "grad_norm": 0.48580703139305115, + "learning_rate": 0.00019712245821675785, + "loss": 1.2622, + "step": 1738 + }, + { + "epoch": 0.30965099715099714, + "grad_norm": 0.45333603024482727, + "learning_rate": 0.00019711912355018982, + "loss": 1.2063, + "step": 1739 + }, + { + "epoch": 0.30982905982905984, + "grad_norm": 0.5990764498710632, + "learning_rate": 0.00019711578698077432, + "loss": 1.5097, + "step": 1740 + }, + { + "epoch": 0.31000712250712253, + "grad_norm": 0.4386102259159088, + "learning_rate": 0.0001971124485085767, + "loss": 1.1283, + "step": 1741 + }, + { + "epoch": 0.3101851851851852, + "grad_norm": 0.4476035237312317, + "learning_rate": 0.00019710910813366242, + "loss": 0.8922, + "step": 1742 + }, + { + "epoch": 0.31036324786324787, + "grad_norm": 0.5276228785514832, + "learning_rate": 0.00019710576585609685, + "loss": 1.2373, + "step": 1743 + }, + { + "epoch": 0.31054131054131057, + "grad_norm": 0.4885637164115906, + "learning_rate": 0.00019710242167594557, + "loss": 1.0881, + "step": 1744 + }, + { + "epoch": 0.3107193732193732, + "grad_norm": 0.421132355928421, + "learning_rate": 0.000197099075593274, + "loss": 1.0544, + "step": 1745 + }, + { + "epoch": 0.3108974358974359, + "grad_norm": 0.5257927179336548, + "learning_rate": 0.00019709572760814777, + "loss": 
1.265, + "step": 1746 + }, + { + "epoch": 0.3110754985754986, + "grad_norm": 0.5164850950241089, + "learning_rate": 0.00019709237772063247, + "loss": 0.9593, + "step": 1747 + }, + { + "epoch": 0.31125356125356124, + "grad_norm": 0.5176383256912231, + "learning_rate": 0.00019708902593079374, + "loss": 1.0194, + "step": 1748 + }, + { + "epoch": 0.31143162393162394, + "grad_norm": 0.4620790481567383, + "learning_rate": 0.00019708567223869716, + "loss": 0.9241, + "step": 1749 + }, + { + "epoch": 0.31160968660968663, + "grad_norm": 0.48307979106903076, + "learning_rate": 0.00019708231664440854, + "loss": 1.2314, + "step": 1750 + }, + { + "epoch": 0.31178774928774927, + "grad_norm": 0.4931468069553375, + "learning_rate": 0.00019707895914799364, + "loss": 1.2065, + "step": 1751 + }, + { + "epoch": 0.31196581196581197, + "grad_norm": 0.5035979747772217, + "learning_rate": 0.00019707559974951818, + "loss": 1.1867, + "step": 1752 + }, + { + "epoch": 0.31214387464387466, + "grad_norm": 0.47543632984161377, + "learning_rate": 0.00019707223844904795, + "loss": 1.0603, + "step": 1753 + }, + { + "epoch": 0.3123219373219373, + "grad_norm": 0.49929797649383545, + "learning_rate": 0.00019706887524664892, + "loss": 1.0597, + "step": 1754 + }, + { + "epoch": 0.3125, + "grad_norm": 0.5075222253799438, + "learning_rate": 0.00019706551014238687, + "loss": 1.1398, + "step": 1755 + }, + { + "epoch": 0.3126780626780627, + "grad_norm": 0.5096884369850159, + "learning_rate": 0.00019706214313632784, + "loss": 1.1382, + "step": 1756 + }, + { + "epoch": 0.31285612535612534, + "grad_norm": 0.4629988372325897, + "learning_rate": 0.0001970587742285377, + "loss": 1.0009, + "step": 1757 + }, + { + "epoch": 0.31303418803418803, + "grad_norm": 0.5244084596633911, + "learning_rate": 0.00019705540341908253, + "loss": 1.047, + "step": 1758 + }, + { + "epoch": 0.31321225071225073, + "grad_norm": 0.5136716961860657, + "learning_rate": 0.00019705203070802832, + "loss": 1.29, + "step": 1759 + }, + { + 
"epoch": 0.31339031339031337, + "grad_norm": 0.43991541862487793, + "learning_rate": 0.0001970486560954412, + "loss": 0.9605, + "step": 1760 + }, + { + "epoch": 0.31356837606837606, + "grad_norm": 0.4633477032184601, + "learning_rate": 0.00019704527958138725, + "loss": 1.1507, + "step": 1761 + }, + { + "epoch": 0.31374643874643876, + "grad_norm": 0.4419999420642853, + "learning_rate": 0.00019704190116593266, + "loss": 0.9262, + "step": 1762 + }, + { + "epoch": 0.3139245014245014, + "grad_norm": 0.49359434843063354, + "learning_rate": 0.00019703852084914357, + "loss": 0.9348, + "step": 1763 + }, + { + "epoch": 0.3141025641025641, + "grad_norm": 0.5072139501571655, + "learning_rate": 0.00019703513863108627, + "loss": 1.1592, + "step": 1764 + }, + { + "epoch": 0.3142806267806268, + "grad_norm": 0.45969831943511963, + "learning_rate": 0.00019703175451182698, + "loss": 1.1519, + "step": 1765 + }, + { + "epoch": 0.31445868945868943, + "grad_norm": 0.5148758292198181, + "learning_rate": 0.00019702836849143208, + "loss": 1.1673, + "step": 1766 + }, + { + "epoch": 0.31463675213675213, + "grad_norm": 0.43033209443092346, + "learning_rate": 0.0001970249805699678, + "loss": 0.9256, + "step": 1767 + }, + { + "epoch": 0.3148148148148148, + "grad_norm": 0.48143425583839417, + "learning_rate": 0.00019702159074750058, + "loss": 1.08, + "step": 1768 + }, + { + "epoch": 0.31499287749287747, + "grad_norm": 0.4780619740486145, + "learning_rate": 0.00019701819902409685, + "loss": 1.1198, + "step": 1769 + }, + { + "epoch": 0.31517094017094016, + "grad_norm": 0.4662075936794281, + "learning_rate": 0.00019701480539982305, + "loss": 0.8424, + "step": 1770 + }, + { + "epoch": 0.31534900284900286, + "grad_norm": 0.503901481628418, + "learning_rate": 0.00019701140987474566, + "loss": 1.1026, + "step": 1771 + }, + { + "epoch": 0.31552706552706555, + "grad_norm": 0.5197132229804993, + "learning_rate": 0.00019700801244893124, + "loss": 1.2148, + "step": 1772 + }, + { + "epoch": 
0.3157051282051282, + "grad_norm": 0.4746309220790863, + "learning_rate": 0.00019700461312244634, + "loss": 1.0906, + "step": 1773 + }, + { + "epoch": 0.3158831908831909, + "grad_norm": 0.5277339816093445, + "learning_rate": 0.00019700121189535752, + "loss": 1.0588, + "step": 1774 + }, + { + "epoch": 0.3160612535612536, + "grad_norm": 0.436002254486084, + "learning_rate": 0.00019699780876773147, + "loss": 1.0341, + "step": 1775 + }, + { + "epoch": 0.3162393162393162, + "grad_norm": 0.5171145796775818, + "learning_rate": 0.00019699440373963486, + "loss": 1.282, + "step": 1776 + }, + { + "epoch": 0.3164173789173789, + "grad_norm": 0.38382846117019653, + "learning_rate": 0.00019699099681113436, + "loss": 0.8908, + "step": 1777 + }, + { + "epoch": 0.3165954415954416, + "grad_norm": 0.4621630609035492, + "learning_rate": 0.0001969875879822968, + "loss": 1.1074, + "step": 1778 + }, + { + "epoch": 0.31677350427350426, + "grad_norm": 0.5543130040168762, + "learning_rate": 0.00019698417725318892, + "loss": 0.9682, + "step": 1779 + }, + { + "epoch": 0.31695156695156695, + "grad_norm": 0.49534836411476135, + "learning_rate": 0.00019698076462387753, + "loss": 1.107, + "step": 1780 + }, + { + "epoch": 0.31712962962962965, + "grad_norm": 0.48844948410987854, + "learning_rate": 0.00019697735009442956, + "loss": 1.1295, + "step": 1781 + }, + { + "epoch": 0.3173076923076923, + "grad_norm": 0.5070686936378479, + "learning_rate": 0.00019697393366491185, + "loss": 1.083, + "step": 1782 + }, + { + "epoch": 0.317485754985755, + "grad_norm": 0.47817620635032654, + "learning_rate": 0.00019697051533539134, + "loss": 1.3014, + "step": 1783 + }, + { + "epoch": 0.3176638176638177, + "grad_norm": 0.538488507270813, + "learning_rate": 0.00019696709510593502, + "loss": 1.0354, + "step": 1784 + }, + { + "epoch": 0.3178418803418803, + "grad_norm": 0.5141439437866211, + "learning_rate": 0.0001969636729766099, + "loss": 1.2912, + "step": 1785 + }, + { + "epoch": 0.318019943019943, + "grad_norm": 
0.5009665489196777, + "learning_rate": 0.00019696024894748306, + "loss": 0.9014, + "step": 1786 + }, + { + "epoch": 0.3181980056980057, + "grad_norm": 0.46199744939804077, + "learning_rate": 0.00019695682301862155, + "loss": 1.0532, + "step": 1787 + }, + { + "epoch": 0.31837606837606836, + "grad_norm": 0.4649423062801361, + "learning_rate": 0.0001969533951900925, + "loss": 0.8608, + "step": 1788 + }, + { + "epoch": 0.31855413105413105, + "grad_norm": 0.516909658908844, + "learning_rate": 0.0001969499654619631, + "loss": 1.1385, + "step": 1789 + }, + { + "epoch": 0.31873219373219375, + "grad_norm": 0.46016669273376465, + "learning_rate": 0.00019694653383430048, + "loss": 0.9168, + "step": 1790 + }, + { + "epoch": 0.3189102564102564, + "grad_norm": 0.4794938564300537, + "learning_rate": 0.00019694310030717193, + "loss": 1.0244, + "step": 1791 + }, + { + "epoch": 0.3190883190883191, + "grad_norm": 0.46577662229537964, + "learning_rate": 0.00019693966488064471, + "loss": 1.0954, + "step": 1792 + }, + { + "epoch": 0.3192663817663818, + "grad_norm": 0.4866746962070465, + "learning_rate": 0.00019693622755478614, + "loss": 1.2925, + "step": 1793 + }, + { + "epoch": 0.3194444444444444, + "grad_norm": 0.4841702878475189, + "learning_rate": 0.00019693278832966357, + "loss": 1.119, + "step": 1794 + }, + { + "epoch": 0.3196225071225071, + "grad_norm": 0.4835243821144104, + "learning_rate": 0.00019692934720534435, + "loss": 1.1702, + "step": 1795 + }, + { + "epoch": 0.3198005698005698, + "grad_norm": 0.5200608968734741, + "learning_rate": 0.00019692590418189594, + "loss": 1.1989, + "step": 1796 + }, + { + "epoch": 0.31997863247863245, + "grad_norm": 0.5147821307182312, + "learning_rate": 0.00019692245925938577, + "loss": 1.1417, + "step": 1797 + }, + { + "epoch": 0.32015669515669515, + "grad_norm": 0.5145614743232727, + "learning_rate": 0.00019691901243788136, + "loss": 1.0571, + "step": 1798 + }, + { + "epoch": 0.32033475783475784, + "grad_norm": 0.5416026711463928, + 
"learning_rate": 0.00019691556371745022, + "loss": 1.188, + "step": 1799 + }, + { + "epoch": 0.32051282051282054, + "grad_norm": 0.5140644311904907, + "learning_rate": 0.00019691211309815995, + "loss": 1.1795, + "step": 1800 + }, + { + "epoch": 0.3206908831908832, + "grad_norm": 0.44219106435775757, + "learning_rate": 0.00019690866058007817, + "loss": 0.9215, + "step": 1801 + }, + { + "epoch": 0.3208689458689459, + "grad_norm": 0.49523603916168213, + "learning_rate": 0.00019690520616327245, + "loss": 1.1117, + "step": 1802 + }, + { + "epoch": 0.32104700854700857, + "grad_norm": 0.5818293690681458, + "learning_rate": 0.0001969017498478105, + "loss": 1.16, + "step": 1803 + }, + { + "epoch": 0.3212250712250712, + "grad_norm": 0.5175749659538269, + "learning_rate": 0.0001968982916337601, + "loss": 1.1999, + "step": 1804 + }, + { + "epoch": 0.3214031339031339, + "grad_norm": 0.49916017055511475, + "learning_rate": 0.00019689483152118898, + "loss": 0.9505, + "step": 1805 + }, + { + "epoch": 0.3215811965811966, + "grad_norm": 0.46849536895751953, + "learning_rate": 0.00019689136951016488, + "loss": 0.9627, + "step": 1806 + }, + { + "epoch": 0.32175925925925924, + "grad_norm": 0.4226818382740021, + "learning_rate": 0.00019688790560075568, + "loss": 1.037, + "step": 1807 + }, + { + "epoch": 0.32193732193732194, + "grad_norm": 0.4697103798389435, + "learning_rate": 0.00019688443979302923, + "loss": 1.1431, + "step": 1808 + }, + { + "epoch": 0.32211538461538464, + "grad_norm": 0.4999365508556366, + "learning_rate": 0.00019688097208705343, + "loss": 1.171, + "step": 1809 + }, + { + "epoch": 0.3222934472934473, + "grad_norm": 0.5229731798171997, + "learning_rate": 0.00019687750248289625, + "loss": 1.3395, + "step": 1810 + }, + { + "epoch": 0.32247150997151, + "grad_norm": 0.512525737285614, + "learning_rate": 0.00019687403098062566, + "loss": 1.1438, + "step": 1811 + }, + { + "epoch": 0.32264957264957267, + "grad_norm": 0.4558548927307129, + "learning_rate": 
0.00019687055758030967, + "loss": 1.0012, + "step": 1812 + }, + { + "epoch": 0.3228276353276353, + "grad_norm": 0.45195743441581726, + "learning_rate": 0.00019686708228201636, + "loss": 1.0222, + "step": 1813 + }, + { + "epoch": 0.323005698005698, + "grad_norm": 0.5023126602172852, + "learning_rate": 0.00019686360508581373, + "loss": 1.2128, + "step": 1814 + }, + { + "epoch": 0.3231837606837607, + "grad_norm": 0.46516045928001404, + "learning_rate": 0.00019686012599177003, + "loss": 0.989, + "step": 1815 + }, + { + "epoch": 0.32336182336182334, + "grad_norm": 0.4142672121524811, + "learning_rate": 0.00019685664499995338, + "loss": 1.0144, + "step": 1816 + }, + { + "epoch": 0.32353988603988604, + "grad_norm": 0.4511009752750397, + "learning_rate": 0.0001968531621104319, + "loss": 0.885, + "step": 1817 + }, + { + "epoch": 0.32371794871794873, + "grad_norm": 0.49583545327186584, + "learning_rate": 0.00019684967732327396, + "loss": 1.0986, + "step": 1818 + }, + { + "epoch": 0.3238960113960114, + "grad_norm": 0.5872161388397217, + "learning_rate": 0.0001968461906385478, + "loss": 1.1482, + "step": 1819 + }, + { + "epoch": 0.32407407407407407, + "grad_norm": 0.4509563148021698, + "learning_rate": 0.00019684270205632168, + "loss": 1.0578, + "step": 1820 + }, + { + "epoch": 0.32425213675213677, + "grad_norm": 0.501345157623291, + "learning_rate": 0.00019683921157666402, + "loss": 1.1792, + "step": 1821 + }, + { + "epoch": 0.3244301994301994, + "grad_norm": 0.48257577419281006, + "learning_rate": 0.00019683571919964314, + "loss": 1.0448, + "step": 1822 + }, + { + "epoch": 0.3246082621082621, + "grad_norm": 0.5399422645568848, + "learning_rate": 0.00019683222492532752, + "loss": 1.0579, + "step": 1823 + }, + { + "epoch": 0.3247863247863248, + "grad_norm": 0.4382506012916565, + "learning_rate": 0.0001968287287537856, + "loss": 1.0246, + "step": 1824 + }, + { + "epoch": 0.32496438746438744, + "grad_norm": 0.49247491359710693, + "learning_rate": 0.00019682523068508586, + 
"loss": 1.318, + "step": 1825 + }, + { + "epoch": 0.32514245014245013, + "grad_norm": 0.49067625403404236, + "learning_rate": 0.0001968217307192969, + "loss": 1.1028, + "step": 1826 + }, + { + "epoch": 0.32532051282051283, + "grad_norm": 0.4832286238670349, + "learning_rate": 0.00019681822885648723, + "loss": 1.0996, + "step": 1827 + }, + { + "epoch": 0.32549857549857547, + "grad_norm": 0.47144386172294617, + "learning_rate": 0.0001968147250967255, + "loss": 1.0707, + "step": 1828 + }, + { + "epoch": 0.32567663817663817, + "grad_norm": 0.46299225091934204, + "learning_rate": 0.0001968112194400803, + "loss": 1.0461, + "step": 1829 + }, + { + "epoch": 0.32585470085470086, + "grad_norm": 0.4880816340446472, + "learning_rate": 0.00019680771188662044, + "loss": 1.1198, + "step": 1830 + }, + { + "epoch": 0.32603276353276356, + "grad_norm": 0.43837276101112366, + "learning_rate": 0.00019680420243641452, + "loss": 1.0599, + "step": 1831 + }, + { + "epoch": 0.3262108262108262, + "grad_norm": 0.453168660402298, + "learning_rate": 0.0001968006910895314, + "loss": 1.0327, + "step": 1832 + }, + { + "epoch": 0.3263888888888889, + "grad_norm": 0.45183828473091125, + "learning_rate": 0.00019679717784603975, + "loss": 1.1381, + "step": 1833 + }, + { + "epoch": 0.3265669515669516, + "grad_norm": 0.5326765775680542, + "learning_rate": 0.00019679366270600852, + "loss": 1.3169, + "step": 1834 + }, + { + "epoch": 0.32674501424501423, + "grad_norm": 0.47468429803848267, + "learning_rate": 0.00019679014566950653, + "loss": 1.1816, + "step": 1835 + }, + { + "epoch": 0.3269230769230769, + "grad_norm": 0.5096879005432129, + "learning_rate": 0.0001967866267366027, + "loss": 1.1162, + "step": 1836 + }, + { + "epoch": 0.3271011396011396, + "grad_norm": 0.491514652967453, + "learning_rate": 0.00019678310590736598, + "loss": 1.2793, + "step": 1837 + }, + { + "epoch": 0.32727920227920226, + "grad_norm": 0.601439356803894, + "learning_rate": 0.00019677958318186533, + "loss": 0.9851, + "step": 1838 
+ }, + { + "epoch": 0.32745726495726496, + "grad_norm": 0.45270970463752747, + "learning_rate": 0.0001967760585601698, + "loss": 1.0042, + "step": 1839 + }, + { + "epoch": 0.32763532763532766, + "grad_norm": 0.48864325881004333, + "learning_rate": 0.00019677253204234847, + "loss": 1.0835, + "step": 1840 + }, + { + "epoch": 0.3278133903133903, + "grad_norm": 0.5855685472488403, + "learning_rate": 0.00019676900362847037, + "loss": 1.193, + "step": 1841 + }, + { + "epoch": 0.327991452991453, + "grad_norm": 0.7181013822555542, + "learning_rate": 0.00019676547331860466, + "loss": 1.2028, + "step": 1842 + }, + { + "epoch": 0.3281695156695157, + "grad_norm": 0.4517378807067871, + "learning_rate": 0.00019676194111282054, + "loss": 1.013, + "step": 1843 + }, + { + "epoch": 0.32834757834757833, + "grad_norm": 0.5477756857872009, + "learning_rate": 0.00019675840701118718, + "loss": 1.2311, + "step": 1844 + }, + { + "epoch": 0.328525641025641, + "grad_norm": 0.5194997191429138, + "learning_rate": 0.00019675487101377382, + "loss": 1.0953, + "step": 1845 + }, + { + "epoch": 0.3287037037037037, + "grad_norm": 0.44454067945480347, + "learning_rate": 0.00019675133312064977, + "loss": 0.8505, + "step": 1846 + }, + { + "epoch": 0.32888176638176636, + "grad_norm": 0.3938713073730469, + "learning_rate": 0.00019674779333188428, + "loss": 0.8525, + "step": 1847 + }, + { + "epoch": 0.32905982905982906, + "grad_norm": 0.4927884340286255, + "learning_rate": 0.00019674425164754682, + "loss": 1.2477, + "step": 1848 + }, + { + "epoch": 0.32923789173789175, + "grad_norm": 0.4516635239124298, + "learning_rate": 0.0001967407080677067, + "loss": 0.8333, + "step": 1849 + }, + { + "epoch": 0.3294159544159544, + "grad_norm": 0.47105780243873596, + "learning_rate": 0.00019673716259243336, + "loss": 1.0989, + "step": 1850 + }, + { + "epoch": 0.3295940170940171, + "grad_norm": 0.5192127823829651, + "learning_rate": 0.00019673361522179627, + "loss": 1.1164, + "step": 1851 + }, + { + "epoch": 
0.3297720797720798, + "grad_norm": 0.5222696661949158, + "learning_rate": 0.00019673006595586495, + "loss": 1.3191, + "step": 1852 + }, + { + "epoch": 0.3299501424501424, + "grad_norm": 0.6046679019927979, + "learning_rate": 0.0001967265147947089, + "loss": 0.9782, + "step": 1853 + }, + { + "epoch": 0.3301282051282051, + "grad_norm": 0.47928622364997864, + "learning_rate": 0.00019672296173839775, + "loss": 1.2247, + "step": 1854 + }, + { + "epoch": 0.3303062678062678, + "grad_norm": 0.5435982346534729, + "learning_rate": 0.00019671940678700107, + "loss": 1.1647, + "step": 1855 + }, + { + "epoch": 0.33048433048433046, + "grad_norm": 0.46878984570503235, + "learning_rate": 0.00019671584994058856, + "loss": 1.132, + "step": 1856 + }, + { + "epoch": 0.33066239316239315, + "grad_norm": 0.5336877107620239, + "learning_rate": 0.00019671229119922986, + "loss": 1.0583, + "step": 1857 + }, + { + "epoch": 0.33084045584045585, + "grad_norm": 0.4811093807220459, + "learning_rate": 0.0001967087305629947, + "loss": 1.0089, + "step": 1858 + }, + { + "epoch": 0.33101851851851855, + "grad_norm": 0.5140184760093689, + "learning_rate": 0.0001967051680319529, + "loss": 1.2335, + "step": 1859 + }, + { + "epoch": 0.3311965811965812, + "grad_norm": 0.5855883955955505, + "learning_rate": 0.00019670160360617418, + "loss": 1.1107, + "step": 1860 + }, + { + "epoch": 0.3313746438746439, + "grad_norm": 0.5081531405448914, + "learning_rate": 0.00019669803728572844, + "loss": 1.0669, + "step": 1861 + }, + { + "epoch": 0.3315527065527066, + "grad_norm": 0.48749417066574097, + "learning_rate": 0.0001966944690706855, + "loss": 1.1465, + "step": 1862 + }, + { + "epoch": 0.3317307692307692, + "grad_norm": 0.5175687670707703, + "learning_rate": 0.00019669089896111536, + "loss": 1.254, + "step": 1863 + }, + { + "epoch": 0.3319088319088319, + "grad_norm": 0.4198860824108124, + "learning_rate": 0.0001966873269570879, + "loss": 0.9811, + "step": 1864 + }, + { + "epoch": 0.3320868945868946, + "grad_norm": 
0.5220273733139038, + "learning_rate": 0.0001966837530586731, + "loss": 1.277, + "step": 1865 + }, + { + "epoch": 0.33226495726495725, + "grad_norm": 0.551954448223114, + "learning_rate": 0.00019668017726594101, + "loss": 1.0627, + "step": 1866 + }, + { + "epoch": 0.33244301994301995, + "grad_norm": 0.5289301872253418, + "learning_rate": 0.00019667659957896166, + "loss": 1.4525, + "step": 1867 + }, + { + "epoch": 0.33262108262108264, + "grad_norm": 0.5190161466598511, + "learning_rate": 0.00019667301999780522, + "loss": 1.1064, + "step": 1868 + }, + { + "epoch": 0.3327991452991453, + "grad_norm": 0.437637060880661, + "learning_rate": 0.00019666943852254172, + "loss": 1.1304, + "step": 1869 + }, + { + "epoch": 0.332977207977208, + "grad_norm": 0.4801286458969116, + "learning_rate": 0.00019666585515324138, + "loss": 1.032, + "step": 1870 + }, + { + "epoch": 0.3331552706552707, + "grad_norm": 0.5041908621788025, + "learning_rate": 0.00019666226988997445, + "loss": 1.2611, + "step": 1871 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.4529375731945038, + "learning_rate": 0.00019665868273281115, + "loss": 1.1346, + "step": 1872 + }, + { + "epoch": 0.333511396011396, + "grad_norm": 0.4797019064426422, + "learning_rate": 0.00019665509368182172, + "loss": 1.1716, + "step": 1873 + }, + { + "epoch": 0.3336894586894587, + "grad_norm": 0.5505055785179138, + "learning_rate": 0.00019665150273707652, + "loss": 0.9729, + "step": 1874 + }, + { + "epoch": 0.33386752136752135, + "grad_norm": 0.4228051006793976, + "learning_rate": 0.00019664790989864592, + "loss": 0.9023, + "step": 1875 + }, + { + "epoch": 0.33404558404558404, + "grad_norm": 0.4926959276199341, + "learning_rate": 0.00019664431516660028, + "loss": 1.0999, + "step": 1876 + }, + { + "epoch": 0.33422364672364674, + "grad_norm": 0.4273219704627991, + "learning_rate": 0.00019664071854101005, + "loss": 1.1039, + "step": 1877 + }, + { + "epoch": 0.3344017094017094, + "grad_norm": 0.48438936471939087, + 
"learning_rate": 0.00019663712002194566, + "loss": 1.1308, + "step": 1878 + }, + { + "epoch": 0.3345797720797721, + "grad_norm": 0.5102053284645081, + "learning_rate": 0.0001966335196094777, + "loss": 1.0618, + "step": 1879 + }, + { + "epoch": 0.33475783475783477, + "grad_norm": 0.4357300400733948, + "learning_rate": 0.00019662991730367663, + "loss": 1.0521, + "step": 1880 + }, + { + "epoch": 0.3349358974358974, + "grad_norm": 0.5052695870399475, + "learning_rate": 0.00019662631310461308, + "loss": 0.9579, + "step": 1881 + }, + { + "epoch": 0.3351139601139601, + "grad_norm": 0.4889117181301117, + "learning_rate": 0.00019662270701235762, + "loss": 1.0304, + "step": 1882 + }, + { + "epoch": 0.3352920227920228, + "grad_norm": 0.4671195149421692, + "learning_rate": 0.000196619099026981, + "loss": 1.2228, + "step": 1883 + }, + { + "epoch": 0.33547008547008544, + "grad_norm": 0.4700174331665039, + "learning_rate": 0.0001966154891485538, + "loss": 0.9634, + "step": 1884 + }, + { + "epoch": 0.33564814814814814, + "grad_norm": 0.488817423582077, + "learning_rate": 0.00019661187737714676, + "loss": 1.2499, + "step": 1885 + }, + { + "epoch": 0.33582621082621084, + "grad_norm": 0.5336169600486755, + "learning_rate": 0.00019660826371283073, + "loss": 1.251, + "step": 1886 + }, + { + "epoch": 0.33600427350427353, + "grad_norm": 0.5054540038108826, + "learning_rate": 0.00019660464815567642, + "loss": 1.221, + "step": 1887 + }, + { + "epoch": 0.33618233618233617, + "grad_norm": 0.5078747868537903, + "learning_rate": 0.00019660103070575472, + "loss": 0.9792, + "step": 1888 + }, + { + "epoch": 0.33636039886039887, + "grad_norm": 0.498571515083313, + "learning_rate": 0.0001965974113631365, + "loss": 1.1682, + "step": 1889 + }, + { + "epoch": 0.33653846153846156, + "grad_norm": 0.49969518184661865, + "learning_rate": 0.00019659379012789264, + "loss": 1.0012, + "step": 1890 + }, + { + "epoch": 0.3367165242165242, + "grad_norm": 0.4238094687461853, + "learning_rate": 
0.00019659016700009416, + "loss": 1.0455, + "step": 1891 + }, + { + "epoch": 0.3368945868945869, + "grad_norm": 0.5139104723930359, + "learning_rate": 0.000196586541979812, + "loss": 0.9979, + "step": 1892 + }, + { + "epoch": 0.3370726495726496, + "grad_norm": 0.5446547269821167, + "learning_rate": 0.00019658291506711715, + "loss": 0.9271, + "step": 1893 + }, + { + "epoch": 0.33725071225071224, + "grad_norm": 0.5284572839736938, + "learning_rate": 0.00019657928626208077, + "loss": 1.0356, + "step": 1894 + }, + { + "epoch": 0.33742877492877493, + "grad_norm": 0.49936217069625854, + "learning_rate": 0.00019657565556477387, + "loss": 0.9785, + "step": 1895 + }, + { + "epoch": 0.33760683760683763, + "grad_norm": 0.4678729772567749, + "learning_rate": 0.00019657202297526763, + "loss": 1.2135, + "step": 1896 + }, + { + "epoch": 0.33778490028490027, + "grad_norm": 0.46844249963760376, + "learning_rate": 0.0001965683884936332, + "loss": 0.9369, + "step": 1897 + }, + { + "epoch": 0.33796296296296297, + "grad_norm": 0.4307389557361603, + "learning_rate": 0.0001965647521199418, + "loss": 0.9301, + "step": 1898 + }, + { + "epoch": 0.33814102564102566, + "grad_norm": 0.48227834701538086, + "learning_rate": 0.00019656111385426468, + "loss": 1.3169, + "step": 1899 + }, + { + "epoch": 0.3383190883190883, + "grad_norm": 0.45860713720321655, + "learning_rate": 0.00019655747369667315, + "loss": 0.9835, + "step": 1900 + }, + { + "epoch": 0.338497150997151, + "grad_norm": 0.5522414445877075, + "learning_rate": 0.00019655383164723846, + "loss": 1.363, + "step": 1901 + }, + { + "epoch": 0.3386752136752137, + "grad_norm": 0.5283710360527039, + "learning_rate": 0.000196550187706032, + "loss": 1.1499, + "step": 1902 + }, + { + "epoch": 0.33885327635327633, + "grad_norm": 0.4419134259223938, + "learning_rate": 0.00019654654187312525, + "loss": 1.2039, + "step": 1903 + }, + { + "epoch": 0.33903133903133903, + "grad_norm": 0.49066096544265747, + "learning_rate": 0.00019654289414858952, + 
"loss": 0.9707, + "step": 1904 + }, + { + "epoch": 0.3392094017094017, + "grad_norm": 0.4619338810443878, + "learning_rate": 0.00019653924453249633, + "loss": 1.0849, + "step": 1905 + }, + { + "epoch": 0.33938746438746437, + "grad_norm": 0.5191119313240051, + "learning_rate": 0.0001965355930249172, + "loss": 1.1387, + "step": 1906 + }, + { + "epoch": 0.33956552706552706, + "grad_norm": 0.5245711207389832, + "learning_rate": 0.00019653193962592368, + "loss": 1.3435, + "step": 1907 + }, + { + "epoch": 0.33974358974358976, + "grad_norm": 0.49562904238700867, + "learning_rate": 0.0001965282843355873, + "loss": 1.2781, + "step": 1908 + }, + { + "epoch": 0.3399216524216524, + "grad_norm": 0.4661353826522827, + "learning_rate": 0.0001965246271539797, + "loss": 0.9317, + "step": 1909 + }, + { + "epoch": 0.3400997150997151, + "grad_norm": 0.4723222851753235, + "learning_rate": 0.00019652096808117254, + "loss": 1.0733, + "step": 1910 + }, + { + "epoch": 0.3402777777777778, + "grad_norm": 0.4358505308628082, + "learning_rate": 0.00019651730711723754, + "loss": 1.1461, + "step": 1911 + }, + { + "epoch": 0.34045584045584043, + "grad_norm": 0.462422251701355, + "learning_rate": 0.00019651364426224638, + "loss": 1.0914, + "step": 1912 + }, + { + "epoch": 0.3406339031339031, + "grad_norm": 0.47952914237976074, + "learning_rate": 0.0001965099795162709, + "loss": 1.0392, + "step": 1913 + }, + { + "epoch": 0.3408119658119658, + "grad_norm": 0.5036373734474182, + "learning_rate": 0.00019650631287938282, + "loss": 1.4002, + "step": 1914 + }, + { + "epoch": 0.34099002849002846, + "grad_norm": 0.5130090713500977, + "learning_rate": 0.000196502644351654, + "loss": 1.3499, + "step": 1915 + }, + { + "epoch": 0.34116809116809116, + "grad_norm": 0.4426332414150238, + "learning_rate": 0.00019649897393315635, + "loss": 1.0726, + "step": 1916 + }, + { + "epoch": 0.34134615384615385, + "grad_norm": 0.5580727458000183, + "learning_rate": 0.00019649530162396176, + "loss": 1.1164, + "step": 1917 + 
}, + { + "epoch": 0.34152421652421655, + "grad_norm": 0.545001745223999, + "learning_rate": 0.00019649162742414218, + "loss": 0.962, + "step": 1918 + }, + { + "epoch": 0.3417022792022792, + "grad_norm": 0.5225808024406433, + "learning_rate": 0.00019648795133376962, + "loss": 1.1415, + "step": 1919 + }, + { + "epoch": 0.3418803418803419, + "grad_norm": 0.48210129141807556, + "learning_rate": 0.0001964842733529161, + "loss": 1.1188, + "step": 1920 + }, + { + "epoch": 0.3420584045584046, + "grad_norm": 0.4515395164489746, + "learning_rate": 0.00019648059348165365, + "loss": 1.0828, + "step": 1921 + }, + { + "epoch": 0.3422364672364672, + "grad_norm": 0.5802633166313171, + "learning_rate": 0.0001964769117200544, + "loss": 1.3137, + "step": 1922 + }, + { + "epoch": 0.3424145299145299, + "grad_norm": 0.4432032108306885, + "learning_rate": 0.00019647322806819046, + "loss": 1.0523, + "step": 1923 + }, + { + "epoch": 0.3425925925925926, + "grad_norm": 0.4697614908218384, + "learning_rate": 0.00019646954252613402, + "loss": 0.8426, + "step": 1924 + }, + { + "epoch": 0.34277065527065526, + "grad_norm": 0.4610968232154846, + "learning_rate": 0.0001964658550939573, + "loss": 0.9826, + "step": 1925 + }, + { + "epoch": 0.34294871794871795, + "grad_norm": 0.5278257727622986, + "learning_rate": 0.00019646216577173258, + "loss": 1.1064, + "step": 1926 + }, + { + "epoch": 0.34312678062678065, + "grad_norm": 0.5686144232749939, + "learning_rate": 0.00019645847455953205, + "loss": 0.9138, + "step": 1927 + }, + { + "epoch": 0.3433048433048433, + "grad_norm": 0.42894792556762695, + "learning_rate": 0.0001964547814574281, + "loss": 1.0461, + "step": 1928 + }, + { + "epoch": 0.343482905982906, + "grad_norm": 0.5567317605018616, + "learning_rate": 0.0001964510864654931, + "loss": 0.8787, + "step": 1929 + }, + { + "epoch": 0.3436609686609687, + "grad_norm": 0.5015586614608765, + "learning_rate": 0.0001964473895837994, + "loss": 1.1406, + "step": 1930 + }, + { + "epoch": 0.3438390313390313, + 
"grad_norm": 0.47391530871391296, + "learning_rate": 0.00019644369081241948, + "loss": 1.0685, + "step": 1931 + }, + { + "epoch": 0.344017094017094, + "grad_norm": 0.546037495136261, + "learning_rate": 0.00019643999015142574, + "loss": 1.2349, + "step": 1932 + }, + { + "epoch": 0.3441951566951567, + "grad_norm": 0.4724953770637512, + "learning_rate": 0.00019643628760089078, + "loss": 1.0621, + "step": 1933 + }, + { + "epoch": 0.34437321937321935, + "grad_norm": 0.5644593834877014, + "learning_rate": 0.00019643258316088703, + "loss": 1.2559, + "step": 1934 + }, + { + "epoch": 0.34455128205128205, + "grad_norm": 0.500815749168396, + "learning_rate": 0.00019642887683148718, + "loss": 1.0439, + "step": 1935 + }, + { + "epoch": 0.34472934472934474, + "grad_norm": 0.4932316541671753, + "learning_rate": 0.0001964251686127638, + "loss": 1.0404, + "step": 1936 + }, + { + "epoch": 0.3449074074074074, + "grad_norm": 0.48494651913642883, + "learning_rate": 0.00019642145850478954, + "loss": 0.9951, + "step": 1937 + }, + { + "epoch": 0.3450854700854701, + "grad_norm": 0.5191963315010071, + "learning_rate": 0.00019641774650763706, + "loss": 1.1258, + "step": 1938 + }, + { + "epoch": 0.3452635327635328, + "grad_norm": 0.4439312815666199, + "learning_rate": 0.00019641403262137918, + "loss": 1.1158, + "step": 1939 + }, + { + "epoch": 0.3454415954415954, + "grad_norm": 0.4829137921333313, + "learning_rate": 0.0001964103168460886, + "loss": 1.0531, + "step": 1940 + }, + { + "epoch": 0.3456196581196581, + "grad_norm": 0.49433329701423645, + "learning_rate": 0.00019640659918183811, + "loss": 1.1295, + "step": 1941 + }, + { + "epoch": 0.3457977207977208, + "grad_norm": 0.5351347923278809, + "learning_rate": 0.00019640287962870062, + "loss": 1.2379, + "step": 1942 + }, + { + "epoch": 0.34597578347578345, + "grad_norm": 0.4845680892467499, + "learning_rate": 0.00019639915818674895, + "loss": 1.0197, + "step": 1943 + }, + { + "epoch": 0.34615384615384615, + "grad_norm": 0.5312514901161194, 
+ "learning_rate": 0.00019639543485605604, + "loss": 0.9734, + "step": 1944 + }, + { + "epoch": 0.34633190883190884, + "grad_norm": 0.4571874737739563, + "learning_rate": 0.00019639170963669478, + "loss": 1.1012, + "step": 1945 + }, + { + "epoch": 0.34650997150997154, + "grad_norm": 0.4449031949043274, + "learning_rate": 0.00019638798252873824, + "loss": 1.1393, + "step": 1946 + }, + { + "epoch": 0.3466880341880342, + "grad_norm": 0.47470834851264954, + "learning_rate": 0.0001963842535322594, + "loss": 0.981, + "step": 1947 + }, + { + "epoch": 0.3468660968660969, + "grad_norm": 0.5386981964111328, + "learning_rate": 0.00019638052264733132, + "loss": 1.1247, + "step": 1948 + }, + { + "epoch": 0.34704415954415957, + "grad_norm": 0.535589873790741, + "learning_rate": 0.00019637678987402714, + "loss": 1.3157, + "step": 1949 + }, + { + "epoch": 0.3472222222222222, + "grad_norm": 0.49338245391845703, + "learning_rate": 0.00019637305521242, + "loss": 1.1066, + "step": 1950 + }, + { + "epoch": 0.3474002849002849, + "grad_norm": 0.4247688353061676, + "learning_rate": 0.00019636931866258298, + "loss": 1.0039, + "step": 1951 + }, + { + "epoch": 0.3475783475783476, + "grad_norm": 0.5351517200469971, + "learning_rate": 0.00019636558022458934, + "loss": 1.0344, + "step": 1952 + }, + { + "epoch": 0.34775641025641024, + "grad_norm": 0.4633362889289856, + "learning_rate": 0.00019636183989851238, + "loss": 1.1383, + "step": 1953 + }, + { + "epoch": 0.34793447293447294, + "grad_norm": 0.553709089756012, + "learning_rate": 0.00019635809768442535, + "loss": 1.0389, + "step": 1954 + }, + { + "epoch": 0.34811253561253563, + "grad_norm": 0.479374498128891, + "learning_rate": 0.00019635435358240154, + "loss": 1.1774, + "step": 1955 + }, + { + "epoch": 0.3482905982905983, + "grad_norm": 0.5274081230163574, + "learning_rate": 0.0001963506075925143, + "loss": 1.1809, + "step": 1956 + }, + { + "epoch": 0.34846866096866097, + "grad_norm": 0.45398542284965515, + "learning_rate": 
0.0001963468597148371, + "loss": 1.0502, + "step": 1957 + }, + { + "epoch": 0.34864672364672367, + "grad_norm": 0.48201611638069153, + "learning_rate": 0.00019634310994944332, + "loss": 1.0557, + "step": 1958 + }, + { + "epoch": 0.3488247863247863, + "grad_norm": 0.6407544016838074, + "learning_rate": 0.00019633935829640642, + "loss": 1.2138, + "step": 1959 + }, + { + "epoch": 0.349002849002849, + "grad_norm": 0.5385687351226807, + "learning_rate": 0.00019633560475579995, + "loss": 1.3496, + "step": 1960 + }, + { + "epoch": 0.3491809116809117, + "grad_norm": 0.5260964035987854, + "learning_rate": 0.0001963318493276974, + "loss": 1.0253, + "step": 1961 + }, + { + "epoch": 0.34935897435897434, + "grad_norm": 0.48478585481643677, + "learning_rate": 0.00019632809201217238, + "loss": 1.137, + "step": 1962 + }, + { + "epoch": 0.34953703703703703, + "grad_norm": 0.620033860206604, + "learning_rate": 0.0001963243328092985, + "loss": 1.3445, + "step": 1963 + }, + { + "epoch": 0.34971509971509973, + "grad_norm": 0.5149700045585632, + "learning_rate": 0.00019632057171914942, + "loss": 1.1042, + "step": 1964 + }, + { + "epoch": 0.34989316239316237, + "grad_norm": 0.42695048451423645, + "learning_rate": 0.0001963168087417988, + "loss": 0.8789, + "step": 1965 + }, + { + "epoch": 0.35007122507122507, + "grad_norm": 0.5281283855438232, + "learning_rate": 0.00019631304387732044, + "loss": 1.1155, + "step": 1966 + }, + { + "epoch": 0.35024928774928776, + "grad_norm": 0.4994089901447296, + "learning_rate": 0.00019630927712578804, + "loss": 1.1226, + "step": 1967 + }, + { + "epoch": 0.3504273504273504, + "grad_norm": 0.4433288276195526, + "learning_rate": 0.0001963055084872754, + "loss": 1.0262, + "step": 1968 + }, + { + "epoch": 0.3506054131054131, + "grad_norm": 0.46541857719421387, + "learning_rate": 0.0001963017379618564, + "loss": 1.1438, + "step": 1969 + }, + { + "epoch": 0.3507834757834758, + "grad_norm": 0.5097604393959045, + "learning_rate": 0.00019629796554960488, + "loss": 
0.9641, + "step": 1970 + }, + { + "epoch": 0.35096153846153844, + "grad_norm": 0.49461981654167175, + "learning_rate": 0.00019629419125059478, + "loss": 1.1765, + "step": 1971 + }, + { + "epoch": 0.35113960113960113, + "grad_norm": 0.4763339161872864, + "learning_rate": 0.00019629041506490005, + "loss": 1.0527, + "step": 1972 + }, + { + "epoch": 0.35131766381766383, + "grad_norm": 0.4528443217277527, + "learning_rate": 0.00019628663699259463, + "loss": 1.1409, + "step": 1973 + }, + { + "epoch": 0.35149572649572647, + "grad_norm": 0.4436309039592743, + "learning_rate": 0.00019628285703375258, + "loss": 1.0459, + "step": 1974 + }, + { + "epoch": 0.35167378917378916, + "grad_norm": 0.5146129727363586, + "learning_rate": 0.00019627907518844797, + "loss": 1.2527, + "step": 1975 + }, + { + "epoch": 0.35185185185185186, + "grad_norm": 0.5202171802520752, + "learning_rate": 0.0001962752914567549, + "loss": 1.226, + "step": 1976 + }, + { + "epoch": 0.35202991452991456, + "grad_norm": 0.5267411470413208, + "learning_rate": 0.00019627150583874747, + "loss": 1.0898, + "step": 1977 + }, + { + "epoch": 0.3522079772079772, + "grad_norm": 0.546840250492096, + "learning_rate": 0.00019626771833449987, + "loss": 1.1716, + "step": 1978 + }, + { + "epoch": 0.3523860398860399, + "grad_norm": 0.5525290966033936, + "learning_rate": 0.0001962639289440863, + "loss": 1.1762, + "step": 1979 + }, + { + "epoch": 0.3525641025641026, + "grad_norm": 0.48967215418815613, + "learning_rate": 0.000196260137667581, + "loss": 1.1884, + "step": 1980 + }, + { + "epoch": 0.35274216524216523, + "grad_norm": 0.5908235907554626, + "learning_rate": 0.0001962563445050583, + "loss": 1.1887, + "step": 1981 + }, + { + "epoch": 0.3529202279202279, + "grad_norm": 0.46708086133003235, + "learning_rate": 0.00019625254945659245, + "loss": 0.8842, + "step": 1982 + }, + { + "epoch": 0.3530982905982906, + "grad_norm": 0.41652458906173706, + "learning_rate": 0.00019624875252225788, + "loss": 1.0268, + "step": 1983 + }, + { 
+ "epoch": 0.35327635327635326, + "grad_norm": 0.5084529519081116, + "learning_rate": 0.00019624495370212892, + "loss": 1.0547, + "step": 1984 + }, + { + "epoch": 0.35345441595441596, + "grad_norm": 0.5667507648468018, + "learning_rate": 0.00019624115299628003, + "loss": 1.0656, + "step": 1985 + }, + { + "epoch": 0.35363247863247865, + "grad_norm": 0.5022873282432556, + "learning_rate": 0.00019623735040478568, + "loss": 1.0627, + "step": 1986 + }, + { + "epoch": 0.3538105413105413, + "grad_norm": 0.48342058062553406, + "learning_rate": 0.00019623354592772035, + "loss": 1.0976, + "step": 1987 + }, + { + "epoch": 0.353988603988604, + "grad_norm": 0.48117366433143616, + "learning_rate": 0.0001962297395651586, + "loss": 1.0515, + "step": 1988 + }, + { + "epoch": 0.3541666666666667, + "grad_norm": 0.492564857006073, + "learning_rate": 0.000196225931317175, + "loss": 1.1957, + "step": 1989 + }, + { + "epoch": 0.3543447293447293, + "grad_norm": 0.4756208658218384, + "learning_rate": 0.00019622212118384417, + "loss": 1.007, + "step": 1990 + }, + { + "epoch": 0.354522792022792, + "grad_norm": 0.581930935382843, + "learning_rate": 0.00019621830916524076, + "loss": 1.232, + "step": 1991 + }, + { + "epoch": 0.3547008547008547, + "grad_norm": 0.480064332485199, + "learning_rate": 0.00019621449526143947, + "loss": 1.2693, + "step": 1992 + }, + { + "epoch": 0.35487891737891736, + "grad_norm": 0.5679123401641846, + "learning_rate": 0.000196210679472515, + "loss": 1.2985, + "step": 1993 + }, + { + "epoch": 0.35505698005698005, + "grad_norm": 0.43757280707359314, + "learning_rate": 0.00019620686179854213, + "loss": 1.1387, + "step": 1994 + }, + { + "epoch": 0.35523504273504275, + "grad_norm": 0.4950634837150574, + "learning_rate": 0.00019620304223959566, + "loss": 1.1809, + "step": 1995 + }, + { + "epoch": 0.3554131054131054, + "grad_norm": 0.5574113726615906, + "learning_rate": 0.00019619922079575043, + "loss": 1.2434, + "step": 1996 + }, + { + "epoch": 0.3555911680911681, + 
"grad_norm": 0.5154930949211121, + "learning_rate": 0.00019619539746708128, + "loss": 1.1747, + "step": 1997 + }, + { + "epoch": 0.3557692307692308, + "grad_norm": 0.4377825856208801, + "learning_rate": 0.00019619157225366315, + "loss": 0.9547, + "step": 1998 + }, + { + "epoch": 0.3559472934472934, + "grad_norm": 0.530714213848114, + "learning_rate": 0.00019618774515557097, + "loss": 1.2057, + "step": 1999 + }, + { + "epoch": 0.3561253561253561, + "grad_norm": 0.5703464150428772, + "learning_rate": 0.00019618391617287978, + "loss": 1.3068, + "step": 2000 + }, + { + "epoch": 0.3563034188034188, + "grad_norm": 0.4862228333950043, + "learning_rate": 0.0001961800853056645, + "loss": 1.0077, + "step": 2001 + }, + { + "epoch": 0.35648148148148145, + "grad_norm": 0.5575395822525024, + "learning_rate": 0.00019617625255400028, + "loss": 1.03, + "step": 2002 + }, + { + "epoch": 0.35665954415954415, + "grad_norm": 0.4826279580593109, + "learning_rate": 0.0001961724179179622, + "loss": 1.268, + "step": 2003 + }, + { + "epoch": 0.35683760683760685, + "grad_norm": 0.49423274397850037, + "learning_rate": 0.00019616858139762534, + "loss": 1.1305, + "step": 2004 + }, + { + "epoch": 0.35701566951566954, + "grad_norm": 0.5208541750907898, + "learning_rate": 0.00019616474299306491, + "loss": 1.1651, + "step": 2005 + }, + { + "epoch": 0.3571937321937322, + "grad_norm": 0.5324164032936096, + "learning_rate": 0.0001961609027043561, + "loss": 1.1406, + "step": 2006 + }, + { + "epoch": 0.3573717948717949, + "grad_norm": 0.45385462045669556, + "learning_rate": 0.00019615706053157416, + "loss": 1.0716, + "step": 2007 + }, + { + "epoch": 0.3575498575498576, + "grad_norm": 0.5016173720359802, + "learning_rate": 0.00019615321647479438, + "loss": 1.0878, + "step": 2008 + }, + { + "epoch": 0.3577279202279202, + "grad_norm": 0.5073097348213196, + "learning_rate": 0.00019614937053409205, + "loss": 1.237, + "step": 2009 + }, + { + "epoch": 0.3579059829059829, + "grad_norm": 0.48880141973495483, + 
"learning_rate": 0.00019614552270954256, + "loss": 0.8794, + "step": 2010 + }, + { + "epoch": 0.3580840455840456, + "grad_norm": 0.43902209401130676, + "learning_rate": 0.00019614167300122126, + "loss": 0.912, + "step": 2011 + }, + { + "epoch": 0.35826210826210825, + "grad_norm": 0.42809322476387024, + "learning_rate": 0.0001961378214092036, + "loss": 0.7804, + "step": 2012 + }, + { + "epoch": 0.35844017094017094, + "grad_norm": 0.4464281499385834, + "learning_rate": 0.00019613396793356503, + "loss": 1.0004, + "step": 2013 + }, + { + "epoch": 0.35861823361823364, + "grad_norm": 0.49085676670074463, + "learning_rate": 0.00019613011257438109, + "loss": 1.1087, + "step": 2014 + }, + { + "epoch": 0.3587962962962963, + "grad_norm": 0.4997732937335968, + "learning_rate": 0.00019612625533172725, + "loss": 0.9591, + "step": 2015 + }, + { + "epoch": 0.358974358974359, + "grad_norm": 0.48442545533180237, + "learning_rate": 0.00019612239620567912, + "loss": 0.9744, + "step": 2016 + }, + { + "epoch": 0.35915242165242167, + "grad_norm": 0.4989205002784729, + "learning_rate": 0.00019611853519631233, + "loss": 0.9844, + "step": 2017 + }, + { + "epoch": 0.3593304843304843, + "grad_norm": 0.6107521653175354, + "learning_rate": 0.00019611467230370248, + "loss": 1.147, + "step": 2018 + }, + { + "epoch": 0.359508547008547, + "grad_norm": 0.5594844818115234, + "learning_rate": 0.00019611080752792535, + "loss": 1.3195, + "step": 2019 + }, + { + "epoch": 0.3596866096866097, + "grad_norm": 0.4786946475505829, + "learning_rate": 0.00019610694086905656, + "loss": 1.2108, + "step": 2020 + }, + { + "epoch": 0.35986467236467234, + "grad_norm": 0.5186030268669128, + "learning_rate": 0.0001961030723271719, + "loss": 1.0008, + "step": 2021 + }, + { + "epoch": 0.36004273504273504, + "grad_norm": 0.4520573318004608, + "learning_rate": 0.0001960992019023472, + "loss": 1.1307, + "step": 2022 + }, + { + "epoch": 0.36022079772079774, + "grad_norm": 0.4983210563659668, + "learning_rate": 
0.00019609532959465823, + "loss": 1.1486, + "step": 2023 + }, + { + "epoch": 0.3603988603988604, + "grad_norm": 0.6209200024604797, + "learning_rate": 0.00019609145540418094, + "loss": 1.2566, + "step": 2024 + }, + { + "epoch": 0.3605769230769231, + "grad_norm": 0.47047603130340576, + "learning_rate": 0.00019608757933099117, + "loss": 1.1588, + "step": 2025 + }, + { + "epoch": 0.36075498575498577, + "grad_norm": 0.5147389769554138, + "learning_rate": 0.0001960837013751649, + "loss": 1.2113, + "step": 2026 + }, + { + "epoch": 0.3609330484330484, + "grad_norm": 0.45826098322868347, + "learning_rate": 0.00019607982153677808, + "loss": 1.13, + "step": 2027 + }, + { + "epoch": 0.3611111111111111, + "grad_norm": 0.5699561834335327, + "learning_rate": 0.00019607593981590675, + "loss": 1.2476, + "step": 2028 + }, + { + "epoch": 0.3612891737891738, + "grad_norm": 0.5349239110946655, + "learning_rate": 0.000196072056212627, + "loss": 1.2295, + "step": 2029 + }, + { + "epoch": 0.36146723646723644, + "grad_norm": 0.6212165355682373, + "learning_rate": 0.00019606817072701484, + "loss": 1.1965, + "step": 2030 + }, + { + "epoch": 0.36164529914529914, + "grad_norm": 0.4870990216732025, + "learning_rate": 0.00019606428335914645, + "loss": 1.4464, + "step": 2031 + }, + { + "epoch": 0.36182336182336183, + "grad_norm": 0.42427828907966614, + "learning_rate": 0.00019606039410909797, + "loss": 1.1546, + "step": 2032 + }, + { + "epoch": 0.36200142450142453, + "grad_norm": 0.5081788301467896, + "learning_rate": 0.0001960565029769456, + "loss": 1.1867, + "step": 2033 + }, + { + "epoch": 0.36217948717948717, + "grad_norm": 0.4813104271888733, + "learning_rate": 0.00019605260996276565, + "loss": 1.3726, + "step": 2034 + }, + { + "epoch": 0.36235754985754987, + "grad_norm": 0.4648851156234741, + "learning_rate": 0.0001960487150666343, + "loss": 1.2434, + "step": 2035 + }, + { + "epoch": 0.36253561253561256, + "grad_norm": 0.484161913394928, + "learning_rate": 0.00019604481828862792, + "loss": 
1.1309, + "step": 2036 + }, + { + "epoch": 0.3627136752136752, + "grad_norm": 0.4929439127445221, + "learning_rate": 0.00019604091962882283, + "loss": 1.1007, + "step": 2037 + }, + { + "epoch": 0.3628917378917379, + "grad_norm": 0.45599642395973206, + "learning_rate": 0.00019603701908729544, + "loss": 1.2628, + "step": 2038 + }, + { + "epoch": 0.3630698005698006, + "grad_norm": 0.45295149087905884, + "learning_rate": 0.00019603311666412213, + "loss": 0.9808, + "step": 2039 + }, + { + "epoch": 0.36324786324786323, + "grad_norm": 0.48681163787841797, + "learning_rate": 0.00019602921235937942, + "loss": 1.0574, + "step": 2040 + }, + { + "epoch": 0.36342592592592593, + "grad_norm": 0.41232365369796753, + "learning_rate": 0.00019602530617314378, + "loss": 1.0454, + "step": 2041 + }, + { + "epoch": 0.3636039886039886, + "grad_norm": 0.46214723587036133, + "learning_rate": 0.00019602139810549174, + "loss": 0.9985, + "step": 2042 + }, + { + "epoch": 0.36378205128205127, + "grad_norm": 0.44307878613471985, + "learning_rate": 0.00019601748815649989, + "loss": 0.9683, + "step": 2043 + }, + { + "epoch": 0.36396011396011396, + "grad_norm": 0.4809451401233673, + "learning_rate": 0.00019601357632624477, + "loss": 1.028, + "step": 2044 + }, + { + "epoch": 0.36413817663817666, + "grad_norm": 0.4638497531414032, + "learning_rate": 0.0001960096626148031, + "loss": 0.9851, + "step": 2045 + }, + { + "epoch": 0.3643162393162393, + "grad_norm": 0.5942164063453674, + "learning_rate": 0.00019600574702225153, + "loss": 1.1606, + "step": 2046 + }, + { + "epoch": 0.364494301994302, + "grad_norm": 0.5171293616294861, + "learning_rate": 0.00019600182954866675, + "loss": 1.2335, + "step": 2047 + }, + { + "epoch": 0.3646723646723647, + "grad_norm": 0.5294404625892639, + "learning_rate": 0.00019599791019412558, + "loss": 1.0966, + "step": 2048 + }, + { + "epoch": 0.36485042735042733, + "grad_norm": 0.46117448806762695, + "learning_rate": 0.00019599398895870477, + "loss": 1.0565, + "step": 2049 + 
}, + { + "epoch": 0.36502849002849, + "grad_norm": 0.5385118126869202, + "learning_rate": 0.00019599006584248118, + "loss": 1.0076, + "step": 2050 + }, + { + "epoch": 0.3652065527065527, + "grad_norm": 0.4915166199207306, + "learning_rate": 0.00019598614084553165, + "loss": 0.9686, + "step": 2051 + }, + { + "epoch": 0.36538461538461536, + "grad_norm": 0.46769094467163086, + "learning_rate": 0.00019598221396793303, + "loss": 1.1217, + "step": 2052 + }, + { + "epoch": 0.36556267806267806, + "grad_norm": 0.5440493822097778, + "learning_rate": 0.00019597828520976236, + "loss": 1.2344, + "step": 2053 + }, + { + "epoch": 0.36574074074074076, + "grad_norm": 0.616727352142334, + "learning_rate": 0.00019597435457109657, + "loss": 1.2953, + "step": 2054 + }, + { + "epoch": 0.3659188034188034, + "grad_norm": 0.4859183430671692, + "learning_rate": 0.00019597042205201265, + "loss": 1.16, + "step": 2055 + }, + { + "epoch": 0.3660968660968661, + "grad_norm": 0.47056329250335693, + "learning_rate": 0.0001959664876525877, + "loss": 0.9982, + "step": 2056 + }, + { + "epoch": 0.3662749287749288, + "grad_norm": 0.48347967863082886, + "learning_rate": 0.00019596255137289875, + "loss": 1.0966, + "step": 2057 + }, + { + "epoch": 0.36645299145299143, + "grad_norm": 0.5068454742431641, + "learning_rate": 0.00019595861321302296, + "loss": 1.2891, + "step": 2058 + }, + { + "epoch": 0.3666310541310541, + "grad_norm": 0.5702359080314636, + "learning_rate": 0.00019595467317303747, + "loss": 1.1394, + "step": 2059 + }, + { + "epoch": 0.3668091168091168, + "grad_norm": 0.5028812885284424, + "learning_rate": 0.0001959507312530195, + "loss": 1.2324, + "step": 2060 + }, + { + "epoch": 0.36698717948717946, + "grad_norm": 0.4672880172729492, + "learning_rate": 0.00019594678745304628, + "loss": 1.0581, + "step": 2061 + }, + { + "epoch": 0.36716524216524216, + "grad_norm": 0.5233900547027588, + "learning_rate": 0.00019594284177319504, + "loss": 1.138, + "step": 2062 + }, + { + "epoch": 
0.36734330484330485, + "grad_norm": 0.46871712803840637, + "learning_rate": 0.00019593889421354316, + "loss": 1.2159, + "step": 2063 + }, + { + "epoch": 0.36752136752136755, + "grad_norm": 0.5180533528327942, + "learning_rate": 0.00019593494477416793, + "loss": 1.1116, + "step": 2064 + }, + { + "epoch": 0.3676994301994302, + "grad_norm": 0.5398494005203247, + "learning_rate": 0.0001959309934551467, + "loss": 1.2038, + "step": 2065 + }, + { + "epoch": 0.3678774928774929, + "grad_norm": 0.4850373864173889, + "learning_rate": 0.000195927040256557, + "loss": 1.4315, + "step": 2066 + }, + { + "epoch": 0.3680555555555556, + "grad_norm": 0.49190905690193176, + "learning_rate": 0.0001959230851784762, + "loss": 0.9993, + "step": 2067 + }, + { + "epoch": 0.3682336182336182, + "grad_norm": 0.4546903073787689, + "learning_rate": 0.00019591912822098178, + "loss": 1.0979, + "step": 2068 + }, + { + "epoch": 0.3684116809116809, + "grad_norm": 0.4726468622684479, + "learning_rate": 0.00019591516938415133, + "loss": 1.1629, + "step": 2069 + }, + { + "epoch": 0.3685897435897436, + "grad_norm": 0.47856009006500244, + "learning_rate": 0.00019591120866806235, + "loss": 1.2048, + "step": 2070 + }, + { + "epoch": 0.36876780626780625, + "grad_norm": 0.46847718954086304, + "learning_rate": 0.0001959072460727925, + "loss": 1.0958, + "step": 2071 + }, + { + "epoch": 0.36894586894586895, + "grad_norm": 0.47164350748062134, + "learning_rate": 0.0001959032815984194, + "loss": 1.1912, + "step": 2072 + }, + { + "epoch": 0.36912393162393164, + "grad_norm": 0.4838213324546814, + "learning_rate": 0.0001958993152450207, + "loss": 1.1466, + "step": 2073 + }, + { + "epoch": 0.3693019943019943, + "grad_norm": 0.47234636545181274, + "learning_rate": 0.00019589534701267412, + "loss": 0.9475, + "step": 2074 + }, + { + "epoch": 0.369480056980057, + "grad_norm": 0.4913126826286316, + "learning_rate": 0.00019589137690145746, + "loss": 1.1571, + "step": 2075 + }, + { + "epoch": 0.3696581196581197, + 
"grad_norm": 0.4696233570575714, + "learning_rate": 0.00019588740491144842, + "loss": 0.9797, + "step": 2076 + }, + { + "epoch": 0.3698361823361823, + "grad_norm": 0.46146106719970703, + "learning_rate": 0.00019588343104272492, + "loss": 1.027, + "step": 2077 + }, + { + "epoch": 0.370014245014245, + "grad_norm": 0.4920627176761627, + "learning_rate": 0.00019587945529536474, + "loss": 1.1008, + "step": 2078 + }, + { + "epoch": 0.3701923076923077, + "grad_norm": 0.4854249954223633, + "learning_rate": 0.0001958754776694458, + "loss": 1.0759, + "step": 2079 + }, + { + "epoch": 0.37037037037037035, + "grad_norm": 0.4884897768497467, + "learning_rate": 0.00019587149816504608, + "loss": 1.1403, + "step": 2080 + }, + { + "epoch": 0.37054843304843305, + "grad_norm": 0.5062584280967712, + "learning_rate": 0.00019586751678224345, + "loss": 1.0185, + "step": 2081 + }, + { + "epoch": 0.37072649572649574, + "grad_norm": 0.44697675108909607, + "learning_rate": 0.000195863533521116, + "loss": 1.0462, + "step": 2082 + }, + { + "epoch": 0.3709045584045584, + "grad_norm": 0.5122885704040527, + "learning_rate": 0.00019585954838174176, + "loss": 1.108, + "step": 2083 + }, + { + "epoch": 0.3710826210826211, + "grad_norm": 0.486650288105011, + "learning_rate": 0.0001958555613641988, + "loss": 1.126, + "step": 2084 + }, + { + "epoch": 0.3712606837606838, + "grad_norm": 0.5296297669410706, + "learning_rate": 0.00019585157246856523, + "loss": 1.1757, + "step": 2085 + }, + { + "epoch": 0.3714387464387464, + "grad_norm": 0.4935721457004547, + "learning_rate": 0.0001958475816949192, + "loss": 1.1654, + "step": 2086 + }, + { + "epoch": 0.3716168091168091, + "grad_norm": 0.6226509213447571, + "learning_rate": 0.00019584358904333891, + "loss": 1.1981, + "step": 2087 + }, + { + "epoch": 0.3717948717948718, + "grad_norm": 0.44094228744506836, + "learning_rate": 0.0001958395945139026, + "loss": 0.8468, + "step": 2088 + }, + { + "epoch": 0.37197293447293445, + "grad_norm": 0.5335884690284729, + 
"learning_rate": 0.00019583559810668858, + "loss": 1.1597, + "step": 2089 + }, + { + "epoch": 0.37215099715099714, + "grad_norm": 0.4585414528846741, + "learning_rate": 0.000195831599821775, + "loss": 0.9343, + "step": 2090 + }, + { + "epoch": 0.37232905982905984, + "grad_norm": 0.533087432384491, + "learning_rate": 0.00019582759965924035, + "loss": 1.1209, + "step": 2091 + }, + { + "epoch": 0.37250712250712253, + "grad_norm": 0.5302683711051941, + "learning_rate": 0.00019582359761916295, + "loss": 1.236, + "step": 2092 + }, + { + "epoch": 0.3726851851851852, + "grad_norm": 0.4522508382797241, + "learning_rate": 0.00019581959370162122, + "loss": 1.0196, + "step": 2093 + }, + { + "epoch": 0.37286324786324787, + "grad_norm": 0.52391517162323, + "learning_rate": 0.00019581558790669358, + "loss": 1.0077, + "step": 2094 + }, + { + "epoch": 0.37304131054131057, + "grad_norm": 0.47144797444343567, + "learning_rate": 0.00019581158023445854, + "loss": 1.0956, + "step": 2095 + }, + { + "epoch": 0.3732193732193732, + "grad_norm": 0.4486723244190216, + "learning_rate": 0.00019580757068499459, + "loss": 0.8697, + "step": 2096 + }, + { + "epoch": 0.3733974358974359, + "grad_norm": 0.4626580476760864, + "learning_rate": 0.00019580355925838034, + "loss": 0.8489, + "step": 2097 + }, + { + "epoch": 0.3735754985754986, + "grad_norm": 0.5647920370101929, + "learning_rate": 0.00019579954595469438, + "loss": 1.1458, + "step": 2098 + }, + { + "epoch": 0.37375356125356124, + "grad_norm": 0.4734349846839905, + "learning_rate": 0.00019579553077401528, + "loss": 1.1036, + "step": 2099 + }, + { + "epoch": 0.37393162393162394, + "grad_norm": 0.5624295473098755, + "learning_rate": 0.00019579151371642176, + "loss": 0.9793, + "step": 2100 + }, + { + "epoch": 0.37410968660968663, + "grad_norm": 0.47507283091545105, + "learning_rate": 0.00019578749478199256, + "loss": 1.0371, + "step": 2101 + }, + { + "epoch": 0.37428774928774927, + "grad_norm": 0.550865113735199, + "learning_rate": 
0.00019578347397080633, + "loss": 1.046, + "step": 2102 + }, + { + "epoch": 0.37446581196581197, + "grad_norm": 0.5249403715133667, + "learning_rate": 0.00019577945128294193, + "loss": 1.3185, + "step": 2103 + }, + { + "epoch": 0.37464387464387466, + "grad_norm": 0.4921024739742279, + "learning_rate": 0.00019577542671847815, + "loss": 1.0758, + "step": 2104 + }, + { + "epoch": 0.3748219373219373, + "grad_norm": 0.5351784825325012, + "learning_rate": 0.00019577140027749384, + "loss": 1.067, + "step": 2105 + }, + { + "epoch": 0.375, + "grad_norm": 0.44420507550239563, + "learning_rate": 0.00019576737196006787, + "loss": 1.1065, + "step": 2106 + }, + { + "epoch": 0.3751780626780627, + "grad_norm": 0.531384289264679, + "learning_rate": 0.0001957633417662792, + "loss": 1.1634, + "step": 2107 + }, + { + "epoch": 0.37535612535612534, + "grad_norm": 0.5167618989944458, + "learning_rate": 0.00019575930969620677, + "loss": 1.1646, + "step": 2108 + }, + { + "epoch": 0.37553418803418803, + "grad_norm": 0.41487228870391846, + "learning_rate": 0.0001957552757499296, + "loss": 0.793, + "step": 2109 + }, + { + "epoch": 0.37571225071225073, + "grad_norm": 0.5110787153244019, + "learning_rate": 0.00019575123992752672, + "loss": 1.1752, + "step": 2110 + }, + { + "epoch": 0.37589031339031337, + "grad_norm": 0.4422051012516022, + "learning_rate": 0.00019574720222907717, + "loss": 1.0102, + "step": 2111 + }, + { + "epoch": 0.37606837606837606, + "grad_norm": 0.4757538139820099, + "learning_rate": 0.0001957431626546601, + "loss": 1.0467, + "step": 2112 + }, + { + "epoch": 0.37624643874643876, + "grad_norm": 0.4736764430999756, + "learning_rate": 0.00019573912120435466, + "loss": 1.3048, + "step": 2113 + }, + { + "epoch": 0.3764245014245014, + "grad_norm": 0.49894335865974426, + "learning_rate": 0.00019573507787824004, + "loss": 1.0502, + "step": 2114 + }, + { + "epoch": 0.3766025641025641, + "grad_norm": 0.48120981454849243, + "learning_rate": 0.00019573103267639543, + "loss": 1.2405, + 
"step": 2115 + }, + { + "epoch": 0.3767806267806268, + "grad_norm": 0.4826737642288208, + "learning_rate": 0.0001957269855989001, + "loss": 1.1189, + "step": 2116 + }, + { + "epoch": 0.37695868945868943, + "grad_norm": 0.4736921489238739, + "learning_rate": 0.0001957229366458333, + "loss": 1.2862, + "step": 2117 + }, + { + "epoch": 0.37713675213675213, + "grad_norm": 0.3895208537578583, + "learning_rate": 0.00019571888581727446, + "loss": 1.0573, + "step": 2118 + }, + { + "epoch": 0.3773148148148148, + "grad_norm": 0.5107510089874268, + "learning_rate": 0.00019571483311330284, + "loss": 1.2913, + "step": 2119 + }, + { + "epoch": 0.37749287749287747, + "grad_norm": 0.4543241262435913, + "learning_rate": 0.00019571077853399794, + "loss": 0.949, + "step": 2120 + }, + { + "epoch": 0.37767094017094016, + "grad_norm": 0.46897491812705994, + "learning_rate": 0.00019570672207943913, + "loss": 1.2235, + "step": 2121 + }, + { + "epoch": 0.37784900284900286, + "grad_norm": 0.4812130630016327, + "learning_rate": 0.0001957026637497059, + "loss": 0.8857, + "step": 2122 + }, + { + "epoch": 0.37802706552706555, + "grad_norm": 0.47452476620674133, + "learning_rate": 0.00019569860354487782, + "loss": 1.0549, + "step": 2123 + }, + { + "epoch": 0.3782051282051282, + "grad_norm": 0.49879950284957886, + "learning_rate": 0.00019569454146503438, + "loss": 1.0475, + "step": 2124 + }, + { + "epoch": 0.3783831908831909, + "grad_norm": 0.4246445894241333, + "learning_rate": 0.00019569047751025518, + "loss": 0.8788, + "step": 2125 + }, + { + "epoch": 0.3785612535612536, + "grad_norm": 0.4868565499782562, + "learning_rate": 0.00019568641168061986, + "loss": 1.1801, + "step": 2126 + }, + { + "epoch": 0.3787393162393162, + "grad_norm": 0.46723654866218567, + "learning_rate": 0.0001956823439762081, + "loss": 1.1661, + "step": 2127 + }, + { + "epoch": 0.3789173789173789, + "grad_norm": 0.4989059269428253, + "learning_rate": 0.00019567827439709954, + "loss": 1.3037, + "step": 2128 + }, + { + 
"epoch": 0.3790954415954416, + "grad_norm": 0.441307932138443, + "learning_rate": 0.00019567420294337395, + "loss": 1.0197, + "step": 2129 + }, + { + "epoch": 0.37927350427350426, + "grad_norm": 0.5200160145759583, + "learning_rate": 0.0001956701296151111, + "loss": 1.3366, + "step": 2130 + }, + { + "epoch": 0.37945156695156695, + "grad_norm": 0.43610256910324097, + "learning_rate": 0.00019566605441239082, + "loss": 1.0148, + "step": 2131 + }, + { + "epoch": 0.37962962962962965, + "grad_norm": 0.4160982370376587, + "learning_rate": 0.00019566197733529293, + "loss": 1.0758, + "step": 2132 + }, + { + "epoch": 0.3798076923076923, + "grad_norm": 0.5007950663566589, + "learning_rate": 0.00019565789838389726, + "loss": 1.1937, + "step": 2133 + }, + { + "epoch": 0.379985754985755, + "grad_norm": 0.4991525113582611, + "learning_rate": 0.00019565381755828385, + "loss": 1.1788, + "step": 2134 + }, + { + "epoch": 0.3801638176638177, + "grad_norm": 0.6313113570213318, + "learning_rate": 0.00019564973485853258, + "loss": 1.1241, + "step": 2135 + }, + { + "epoch": 0.3803418803418803, + "grad_norm": 0.49736538529396057, + "learning_rate": 0.0001956456502847234, + "loss": 1.0299, + "step": 2136 + }, + { + "epoch": 0.380519943019943, + "grad_norm": 0.4384380578994751, + "learning_rate": 0.00019564156383693643, + "loss": 1.132, + "step": 2137 + }, + { + "epoch": 0.3806980056980057, + "grad_norm": 0.4696183502674103, + "learning_rate": 0.00019563747551525168, + "loss": 1.1145, + "step": 2138 + }, + { + "epoch": 0.38087606837606836, + "grad_norm": 0.42039749026298523, + "learning_rate": 0.0001956333853197493, + "loss": 0.9549, + "step": 2139 + }, + { + "epoch": 0.38105413105413105, + "grad_norm": 0.5547221899032593, + "learning_rate": 0.00019562929325050936, + "loss": 1.0476, + "step": 2140 + }, + { + "epoch": 0.38123219373219375, + "grad_norm": 0.4803301692008972, + "learning_rate": 0.0001956251993076121, + "loss": 1.1285, + "step": 2141 + }, + { + "epoch": 0.3814102564102564, + 
"grad_norm": 0.609501838684082, + "learning_rate": 0.00019562110349113766, + "loss": 1.2375, + "step": 2142 + }, + { + "epoch": 0.3815883190883191, + "grad_norm": 0.5134759545326233, + "learning_rate": 0.00019561700580116639, + "loss": 1.0895, + "step": 2143 + }, + { + "epoch": 0.3817663817663818, + "grad_norm": 0.5086711049079895, + "learning_rate": 0.00019561290623777846, + "loss": 1.1139, + "step": 2144 + }, + { + "epoch": 0.3819444444444444, + "grad_norm": 0.5371596813201904, + "learning_rate": 0.00019560880480105428, + "loss": 0.9302, + "step": 2145 + }, + { + "epoch": 0.3821225071225071, + "grad_norm": 0.4966319799423218, + "learning_rate": 0.00019560470149107418, + "loss": 1.2485, + "step": 2146 + }, + { + "epoch": 0.3823005698005698, + "grad_norm": 0.5296950340270996, + "learning_rate": 0.00019560059630791855, + "loss": 1.4449, + "step": 2147 + }, + { + "epoch": 0.38247863247863245, + "grad_norm": 0.5564194321632385, + "learning_rate": 0.00019559648925166783, + "loss": 1.0817, + "step": 2148 + }, + { + "epoch": 0.38265669515669515, + "grad_norm": 0.5763841867446899, + "learning_rate": 0.0001955923803224025, + "loss": 1.1915, + "step": 2149 + }, + { + "epoch": 0.38283475783475784, + "grad_norm": 0.4782295823097229, + "learning_rate": 0.00019558826952020304, + "loss": 1.1317, + "step": 2150 + }, + { + "epoch": 0.38301282051282054, + "grad_norm": 0.4876856207847595, + "learning_rate": 0.00019558415684515002, + "loss": 1.2113, + "step": 2151 + }, + { + "epoch": 0.3831908831908832, + "grad_norm": 0.4894421398639679, + "learning_rate": 0.00019558004229732398, + "loss": 1.0761, + "step": 2152 + }, + { + "epoch": 0.3833689458689459, + "grad_norm": 0.47914227843284607, + "learning_rate": 0.0001955759258768056, + "loss": 1.0869, + "step": 2153 + }, + { + "epoch": 0.38354700854700857, + "grad_norm": 0.43933629989624023, + "learning_rate": 0.00019557180758367543, + "loss": 1.0581, + "step": 2154 + }, + { + "epoch": 0.3837250712250712, + "grad_norm": 0.4078103005886078, 
+ "learning_rate": 0.00019556768741801428, + "loss": 1.065, + "step": 2155 + }, + { + "epoch": 0.3839031339031339, + "grad_norm": 0.5112793445587158, + "learning_rate": 0.00019556356537990278, + "loss": 1.2023, + "step": 2156 + }, + { + "epoch": 0.3840811965811966, + "grad_norm": 0.4699678122997284, + "learning_rate": 0.00019555944146942177, + "loss": 1.2459, + "step": 2157 + }, + { + "epoch": 0.38425925925925924, + "grad_norm": 0.4723528027534485, + "learning_rate": 0.00019555531568665198, + "loss": 1.2204, + "step": 2158 + }, + { + "epoch": 0.38443732193732194, + "grad_norm": 0.4648225009441376, + "learning_rate": 0.00019555118803167432, + "loss": 1.1355, + "step": 2159 + }, + { + "epoch": 0.38461538461538464, + "grad_norm": 0.49861815571784973, + "learning_rate": 0.00019554705850456961, + "loss": 1.1301, + "step": 2160 + }, + { + "epoch": 0.3847934472934473, + "grad_norm": 0.4076344966888428, + "learning_rate": 0.00019554292710541874, + "loss": 0.8997, + "step": 2161 + }, + { + "epoch": 0.38497150997151, + "grad_norm": 0.5510796308517456, + "learning_rate": 0.00019553879383430272, + "loss": 1.0594, + "step": 2162 + }, + { + "epoch": 0.38514957264957267, + "grad_norm": 0.55793696641922, + "learning_rate": 0.00019553465869130249, + "loss": 1.1284, + "step": 2163 + }, + { + "epoch": 0.3853276353276353, + "grad_norm": 0.5096491575241089, + "learning_rate": 0.00019553052167649906, + "loss": 1.0419, + "step": 2164 + }, + { + "epoch": 0.385505698005698, + "grad_norm": 0.49077361822128296, + "learning_rate": 0.0001955263827899735, + "loss": 1.1632, + "step": 2165 + }, + { + "epoch": 0.3856837606837607, + "grad_norm": 0.5546894073486328, + "learning_rate": 0.00019552224203180693, + "loss": 1.1487, + "step": 2166 + }, + { + "epoch": 0.38586182336182334, + "grad_norm": 0.4930037260055542, + "learning_rate": 0.00019551809940208047, + "loss": 1.2668, + "step": 2167 + }, + { + "epoch": 0.38603988603988604, + "grad_norm": 0.5600671172142029, + "learning_rate": 
0.00019551395490087525, + "loss": 1.3988, + "step": 2168 + }, + { + "epoch": 0.38621794871794873, + "grad_norm": 0.45897629857063293, + "learning_rate": 0.0001955098085282725, + "loss": 0.7792, + "step": 2169 + }, + { + "epoch": 0.3863960113960114, + "grad_norm": 0.46138936281204224, + "learning_rate": 0.00019550566028435346, + "loss": 1.1749, + "step": 2170 + }, + { + "epoch": 0.38657407407407407, + "grad_norm": 0.5136167407035828, + "learning_rate": 0.0001955015101691994, + "loss": 1.0153, + "step": 2171 + }, + { + "epoch": 0.38675213675213677, + "grad_norm": 0.4886440336704254, + "learning_rate": 0.00019549735818289165, + "loss": 1.0006, + "step": 2172 + }, + { + "epoch": 0.3869301994301994, + "grad_norm": 0.4339776635169983, + "learning_rate": 0.00019549320432551154, + "loss": 1.0109, + "step": 2173 + }, + { + "epoch": 0.3871082621082621, + "grad_norm": 0.48729443550109863, + "learning_rate": 0.00019548904859714044, + "loss": 1.2016, + "step": 2174 + }, + { + "epoch": 0.3872863247863248, + "grad_norm": 0.5128757357597351, + "learning_rate": 0.0001954848909978598, + "loss": 1.085, + "step": 2175 + }, + { + "epoch": 0.38746438746438744, + "grad_norm": 0.49636292457580566, + "learning_rate": 0.0001954807315277511, + "loss": 1.0671, + "step": 2176 + }, + { + "epoch": 0.38764245014245013, + "grad_norm": 0.4946988821029663, + "learning_rate": 0.00019547657018689578, + "loss": 1.2091, + "step": 2177 + }, + { + "epoch": 0.38782051282051283, + "grad_norm": 0.49004554748535156, + "learning_rate": 0.00019547240697537544, + "loss": 1.0241, + "step": 2178 + }, + { + "epoch": 0.38799857549857547, + "grad_norm": 0.48750075697898865, + "learning_rate": 0.00019546824189327157, + "loss": 1.1082, + "step": 2179 + }, + { + "epoch": 0.38817663817663817, + "grad_norm": 0.47726166248321533, + "learning_rate": 0.00019546407494066585, + "loss": 1.1275, + "step": 2180 + }, + { + "epoch": 0.38835470085470086, + "grad_norm": 0.5253444910049438, + "learning_rate": 0.00019545990611763986, + 
"loss": 1.0164, + "step": 2181 + }, + { + "epoch": 0.38853276353276356, + "grad_norm": 0.4470371603965759, + "learning_rate": 0.00019545573542427533, + "loss": 1.0138, + "step": 2182 + }, + { + "epoch": 0.3887108262108262, + "grad_norm": 0.6645087599754333, + "learning_rate": 0.00019545156286065397, + "loss": 1.0884, + "step": 2183 + }, + { + "epoch": 0.3888888888888889, + "grad_norm": 0.498775839805603, + "learning_rate": 0.0001954473884268575, + "loss": 1.1035, + "step": 2184 + }, + { + "epoch": 0.3890669515669516, + "grad_norm": 0.5830566883087158, + "learning_rate": 0.00019544321212296772, + "loss": 1.1665, + "step": 2185 + }, + { + "epoch": 0.38924501424501423, + "grad_norm": 0.48162809014320374, + "learning_rate": 0.00019543903394906646, + "loss": 1.1035, + "step": 2186 + }, + { + "epoch": 0.3894230769230769, + "grad_norm": 0.46334075927734375, + "learning_rate": 0.0001954348539052356, + "loss": 0.9764, + "step": 2187 + }, + { + "epoch": 0.3896011396011396, + "grad_norm": 0.6343515515327454, + "learning_rate": 0.00019543067199155704, + "loss": 0.9474, + "step": 2188 + }, + { + "epoch": 0.38977920227920226, + "grad_norm": 0.4867806136608124, + "learning_rate": 0.0001954264882081127, + "loss": 1.1161, + "step": 2189 + }, + { + "epoch": 0.38995726495726496, + "grad_norm": 0.49305734038352966, + "learning_rate": 0.00019542230255498454, + "loss": 1.1825, + "step": 2190 + }, + { + "epoch": 0.39013532763532766, + "grad_norm": 0.518465518951416, + "learning_rate": 0.00019541811503225457, + "loss": 1.0695, + "step": 2191 + }, + { + "epoch": 0.3903133903133903, + "grad_norm": 0.4892457127571106, + "learning_rate": 0.00019541392564000488, + "loss": 1.3113, + "step": 2192 + }, + { + "epoch": 0.390491452991453, + "grad_norm": 0.5150920152664185, + "learning_rate": 0.00019540973437831753, + "loss": 1.0735, + "step": 2193 + }, + { + "epoch": 0.3906695156695157, + "grad_norm": 0.5414708256721497, + "learning_rate": 0.00019540554124727462, + "loss": 1.0773, + "step": 2194 + 
}, + { + "epoch": 0.39084757834757833, + "grad_norm": 0.49826398491859436, + "learning_rate": 0.0001954013462469583, + "loss": 1.0542, + "step": 2195 + }, + { + "epoch": 0.391025641025641, + "grad_norm": 0.5203596949577332, + "learning_rate": 0.0001953971493774508, + "loss": 1.178, + "step": 2196 + }, + { + "epoch": 0.3912037037037037, + "grad_norm": 0.45095738768577576, + "learning_rate": 0.00019539295063883432, + "loss": 1.1254, + "step": 2197 + }, + { + "epoch": 0.39138176638176636, + "grad_norm": 0.4938857853412628, + "learning_rate": 0.00019538875003119113, + "loss": 1.1061, + "step": 2198 + }, + { + "epoch": 0.39155982905982906, + "grad_norm": 0.5260919332504272, + "learning_rate": 0.00019538454755460354, + "loss": 1.3292, + "step": 2199 + }, + { + "epoch": 0.39173789173789175, + "grad_norm": 0.46527108550071716, + "learning_rate": 0.00019538034320915388, + "loss": 1.2074, + "step": 2200 + }, + { + "epoch": 0.3919159544159544, + "grad_norm": 0.5608304738998413, + "learning_rate": 0.00019537613699492453, + "loss": 1.0385, + "step": 2201 + }, + { + "epoch": 0.3920940170940171, + "grad_norm": 0.5056684613227844, + "learning_rate": 0.00019537192891199792, + "loss": 1.1513, + "step": 2202 + }, + { + "epoch": 0.3922720797720798, + "grad_norm": 0.3764426112174988, + "learning_rate": 0.00019536771896045644, + "loss": 0.8966, + "step": 2203 + }, + { + "epoch": 0.3924501424501424, + "grad_norm": 0.4983638823032379, + "learning_rate": 0.0001953635071403827, + "loss": 1.097, + "step": 2204 + }, + { + "epoch": 0.3926282051282051, + "grad_norm": 0.5733919739723206, + "learning_rate": 0.00019535929345185904, + "loss": 1.4992, + "step": 2205 + }, + { + "epoch": 0.3928062678062678, + "grad_norm": 0.632064163684845, + "learning_rate": 0.00019535507789496817, + "loss": 1.0611, + "step": 2206 + }, + { + "epoch": 0.39298433048433046, + "grad_norm": 0.409978449344635, + "learning_rate": 0.00019535086046979262, + "loss": 0.7172, + "step": 2207 + }, + { + "epoch": 
0.39316239316239315, + "grad_norm": 0.40910813212394714, + "learning_rate": 0.00019534664117641502, + "loss": 0.8803, + "step": 2208 + }, + { + "epoch": 0.39334045584045585, + "grad_norm": 0.4696179926395416, + "learning_rate": 0.00019534242001491807, + "loss": 1.1551, + "step": 2209 + }, + { + "epoch": 0.39351851851851855, + "grad_norm": 0.538425862789154, + "learning_rate": 0.00019533819698538444, + "loss": 1.1296, + "step": 2210 + }, + { + "epoch": 0.3936965811965812, + "grad_norm": 0.5913630723953247, + "learning_rate": 0.00019533397208789692, + "loss": 0.9757, + "step": 2211 + }, + { + "epoch": 0.3938746438746439, + "grad_norm": 0.5649870038032532, + "learning_rate": 0.00019532974532253822, + "loss": 0.9976, + "step": 2212 + }, + { + "epoch": 0.3940527065527066, + "grad_norm": 0.5012063980102539, + "learning_rate": 0.00019532551668939121, + "loss": 0.9969, + "step": 2213 + }, + { + "epoch": 0.3942307692307692, + "grad_norm": 0.5098594427108765, + "learning_rate": 0.00019532128618853872, + "loss": 1.1229, + "step": 2214 + }, + { + "epoch": 0.3944088319088319, + "grad_norm": 0.4753342568874359, + "learning_rate": 0.0001953170538200636, + "loss": 1.0808, + "step": 2215 + }, + { + "epoch": 0.3945868945868946, + "grad_norm": 0.4770098626613617, + "learning_rate": 0.00019531281958404888, + "loss": 1.0656, + "step": 2216 + }, + { + "epoch": 0.39476495726495725, + "grad_norm": 0.6007979512214661, + "learning_rate": 0.00019530858348057746, + "loss": 1.0093, + "step": 2217 + }, + { + "epoch": 0.39494301994301995, + "grad_norm": 0.4501650929450989, + "learning_rate": 0.00019530434550973227, + "loss": 0.8557, + "step": 2218 + }, + { + "epoch": 0.39512108262108264, + "grad_norm": 0.5123980641365051, + "learning_rate": 0.00019530010567159645, + "loss": 0.9833, + "step": 2219 + }, + { + "epoch": 0.3952991452991453, + "grad_norm": 0.4623969495296478, + "learning_rate": 0.000195295863966253, + "loss": 0.913, + "step": 2220 + }, + { + "epoch": 0.395477207977208, + "grad_norm": 
0.4341880679130554, + "learning_rate": 0.0001952916203937851, + "loss": 1.0234, + "step": 2221 + }, + { + "epoch": 0.3956552706552707, + "grad_norm": 0.5935006141662598, + "learning_rate": 0.00019528737495427581, + "loss": 1.061, + "step": 2222 + }, + { + "epoch": 0.3958333333333333, + "grad_norm": 0.44835174083709717, + "learning_rate": 0.00019528312764780837, + "loss": 1.1567, + "step": 2223 + }, + { + "epoch": 0.396011396011396, + "grad_norm": 0.5476976633071899, + "learning_rate": 0.00019527887847446595, + "loss": 1.2304, + "step": 2224 + }, + { + "epoch": 0.3961894586894587, + "grad_norm": 0.4487939774990082, + "learning_rate": 0.00019527462743433187, + "loss": 1.1813, + "step": 2225 + }, + { + "epoch": 0.39636752136752135, + "grad_norm": 0.4053241014480591, + "learning_rate": 0.00019527037452748936, + "loss": 0.7899, + "step": 2226 + }, + { + "epoch": 0.39654558404558404, + "grad_norm": 0.534570574760437, + "learning_rate": 0.00019526611975402176, + "loss": 1.0681, + "step": 2227 + }, + { + "epoch": 0.39672364672364674, + "grad_norm": 0.46096158027648926, + "learning_rate": 0.00019526186311401246, + "loss": 0.9234, + "step": 2228 + }, + { + "epoch": 0.3969017094017094, + "grad_norm": 0.47363516688346863, + "learning_rate": 0.00019525760460754483, + "loss": 1.0197, + "step": 2229 + }, + { + "epoch": 0.3970797720797721, + "grad_norm": 0.46317258477211, + "learning_rate": 0.00019525334423470234, + "loss": 1.2103, + "step": 2230 + }, + { + "epoch": 0.39725783475783477, + "grad_norm": 0.4924237132072449, + "learning_rate": 0.0001952490819955684, + "loss": 1.3299, + "step": 2231 + }, + { + "epoch": 0.3974358974358974, + "grad_norm": 0.5419978499412537, + "learning_rate": 0.0001952448178902266, + "loss": 1.2526, + "step": 2232 + }, + { + "epoch": 0.3976139601139601, + "grad_norm": 0.5003267526626587, + "learning_rate": 0.00019524055191876043, + "loss": 1.1073, + "step": 2233 + }, + { + "epoch": 0.3977920227920228, + "grad_norm": 0.621789276599884, + "learning_rate": 
0.00019523628408125347, + "loss": 1.3409, + "step": 2234 + }, + { + "epoch": 0.39797008547008544, + "grad_norm": 0.44235602021217346, + "learning_rate": 0.0001952320143777894, + "loss": 0.9799, + "step": 2235 + }, + { + "epoch": 0.39814814814814814, + "grad_norm": 0.49954718351364136, + "learning_rate": 0.0001952277428084518, + "loss": 1.2227, + "step": 2236 + }, + { + "epoch": 0.39832621082621084, + "grad_norm": 0.5113739967346191, + "learning_rate": 0.00019522346937332443, + "loss": 1.1644, + "step": 2237 + }, + { + "epoch": 0.39850427350427353, + "grad_norm": 0.5026139616966248, + "learning_rate": 0.00019521919407249096, + "loss": 1.0823, + "step": 2238 + }, + { + "epoch": 0.39868233618233617, + "grad_norm": 0.4943205714225769, + "learning_rate": 0.0001952149169060352, + "loss": 1.0961, + "step": 2239 + }, + { + "epoch": 0.39886039886039887, + "grad_norm": 0.4680631458759308, + "learning_rate": 0.00019521063787404094, + "loss": 0.9787, + "step": 2240 + }, + { + "epoch": 0.39903846153846156, + "grad_norm": 0.5511566400527954, + "learning_rate": 0.00019520635697659202, + "loss": 1.2543, + "step": 2241 + }, + { + "epoch": 0.3992165242165242, + "grad_norm": 0.5494263172149658, + "learning_rate": 0.00019520207421377229, + "loss": 1.1978, + "step": 2242 + }, + { + "epoch": 0.3993945868945869, + "grad_norm": 0.4850340485572815, + "learning_rate": 0.00019519778958566568, + "loss": 0.8531, + "step": 2243 + }, + { + "epoch": 0.3995726495726496, + "grad_norm": 0.47168150544166565, + "learning_rate": 0.00019519350309235613, + "loss": 1.0746, + "step": 2244 + }, + { + "epoch": 0.39975071225071224, + "grad_norm": 0.571133553981781, + "learning_rate": 0.00019518921473392765, + "loss": 1.2984, + "step": 2245 + }, + { + "epoch": 0.39992877492877493, + "grad_norm": 0.4636089503765106, + "learning_rate": 0.00019518492451046427, + "loss": 1.019, + "step": 2246 + }, + { + "epoch": 0.40010683760683763, + "grad_norm": 0.4573518931865692, + "learning_rate": 0.00019518063242205, + 
"loss": 1.1042, + "step": 2247 + }, + { + "epoch": 0.40028490028490027, + "grad_norm": 0.49098989367485046, + "learning_rate": 0.00019517633846876894, + "loss": 1.1224, + "step": 2248 + }, + { + "epoch": 0.40046296296296297, + "grad_norm": 0.5475491881370544, + "learning_rate": 0.00019517204265070523, + "loss": 1.0984, + "step": 2249 + }, + { + "epoch": 0.40064102564102566, + "grad_norm": 0.45498281717300415, + "learning_rate": 0.00019516774496794307, + "loss": 0.8883, + "step": 2250 + }, + { + "epoch": 0.4008190883190883, + "grad_norm": 0.4908423125743866, + "learning_rate": 0.00019516344542056666, + "loss": 1.328, + "step": 2251 + }, + { + "epoch": 0.400997150997151, + "grad_norm": 0.5474920272827148, + "learning_rate": 0.0001951591440086602, + "loss": 1.3825, + "step": 2252 + }, + { + "epoch": 0.4011752136752137, + "grad_norm": 0.5165615081787109, + "learning_rate": 0.000195154840732308, + "loss": 1.33, + "step": 2253 + }, + { + "epoch": 0.40135327635327633, + "grad_norm": 0.5185585021972656, + "learning_rate": 0.00019515053559159435, + "loss": 1.1689, + "step": 2254 + }, + { + "epoch": 0.40153133903133903, + "grad_norm": 0.5468854904174805, + "learning_rate": 0.00019514622858660363, + "loss": 1.2708, + "step": 2255 + }, + { + "epoch": 0.4017094017094017, + "grad_norm": 0.47556906938552856, + "learning_rate": 0.0001951419197174202, + "loss": 1.0488, + "step": 2256 + }, + { + "epoch": 0.40188746438746437, + "grad_norm": 0.5521323084831238, + "learning_rate": 0.0001951376089841285, + "loss": 1.0868, + "step": 2257 + }, + { + "epoch": 0.40206552706552706, + "grad_norm": 0.6029638051986694, + "learning_rate": 0.00019513329638681296, + "loss": 1.1735, + "step": 2258 + }, + { + "epoch": 0.40224358974358976, + "grad_norm": 0.4897766411304474, + "learning_rate": 0.00019512898192555812, + "loss": 1.1687, + "step": 2259 + }, + { + "epoch": 0.4024216524216524, + "grad_norm": 0.45527184009552, + "learning_rate": 0.00019512466560044848, + "loss": 1.0352, + "step": 2260 + }, 
+ { + "epoch": 0.4025997150997151, + "grad_norm": 0.5025625824928284, + "learning_rate": 0.00019512034741156863, + "loss": 1.2503, + "step": 2261 + }, + { + "epoch": 0.4027777777777778, + "grad_norm": 0.46415451169013977, + "learning_rate": 0.00019511602735900317, + "loss": 1.032, + "step": 2262 + }, + { + "epoch": 0.40295584045584043, + "grad_norm": 0.4812934398651123, + "learning_rate": 0.00019511170544283678, + "loss": 1.0523, + "step": 2263 + }, + { + "epoch": 0.4031339031339031, + "grad_norm": 0.49937039613723755, + "learning_rate": 0.00019510738166315404, + "loss": 1.2238, + "step": 2264 + }, + { + "epoch": 0.4033119658119658, + "grad_norm": 0.5428698062896729, + "learning_rate": 0.00019510305602003975, + "loss": 1.0361, + "step": 2265 + }, + { + "epoch": 0.40349002849002846, + "grad_norm": 0.44836854934692383, + "learning_rate": 0.0001950987285135786, + "loss": 1.169, + "step": 2266 + }, + { + "epoch": 0.40366809116809116, + "grad_norm": 0.5071489214897156, + "learning_rate": 0.00019509439914385549, + "loss": 1.1567, + "step": 2267 + }, + { + "epoch": 0.40384615384615385, + "grad_norm": 0.5204613208770752, + "learning_rate": 0.00019509006791095513, + "loss": 0.9949, + "step": 2268 + }, + { + "epoch": 0.40402421652421655, + "grad_norm": 0.4583234488964081, + "learning_rate": 0.00019508573481496238, + "loss": 0.9051, + "step": 2269 + }, + { + "epoch": 0.4042022792022792, + "grad_norm": 0.5436791181564331, + "learning_rate": 0.00019508139985596222, + "loss": 1.3239, + "step": 2270 + }, + { + "epoch": 0.4043803418803419, + "grad_norm": 0.48774269223213196, + "learning_rate": 0.00019507706303403954, + "loss": 1.2102, + "step": 2271 + }, + { + "epoch": 0.4045584045584046, + "grad_norm": 0.4742540717124939, + "learning_rate": 0.00019507272434927933, + "loss": 1.1137, + "step": 2272 + }, + { + "epoch": 0.4047364672364672, + "grad_norm": 0.531148374080658, + "learning_rate": 0.00019506838380176658, + "loss": 1.3162, + "step": 2273 + }, + { + "epoch": 
0.4049145299145299, + "grad_norm": 0.5002314448356628, + "learning_rate": 0.0001950640413915863, + "loss": 1.0743, + "step": 2274 + }, + { + "epoch": 0.4050925925925926, + "grad_norm": 0.39826446771621704, + "learning_rate": 0.00019505969711882366, + "loss": 0.7698, + "step": 2275 + }, + { + "epoch": 0.40527065527065526, + "grad_norm": 0.5177471041679382, + "learning_rate": 0.00019505535098356371, + "loss": 1.1821, + "step": 2276 + }, + { + "epoch": 0.40544871794871795, + "grad_norm": 0.467241108417511, + "learning_rate": 0.00019505100298589158, + "loss": 0.8036, + "step": 2277 + }, + { + "epoch": 0.40562678062678065, + "grad_norm": 0.43711844086647034, + "learning_rate": 0.00019504665312589255, + "loss": 0.8667, + "step": 2278 + }, + { + "epoch": 0.4058048433048433, + "grad_norm": 0.4929116368293762, + "learning_rate": 0.00019504230140365177, + "loss": 1.1279, + "step": 2279 + }, + { + "epoch": 0.405982905982906, + "grad_norm": 0.5279183983802795, + "learning_rate": 0.00019503794781925452, + "loss": 1.1318, + "step": 2280 + }, + { + "epoch": 0.4061609686609687, + "grad_norm": 0.549217939376831, + "learning_rate": 0.00019503359237278608, + "loss": 1.2007, + "step": 2281 + }, + { + "epoch": 0.4063390313390313, + "grad_norm": 0.5485880374908447, + "learning_rate": 0.00019502923506433187, + "loss": 1.1079, + "step": 2282 + }, + { + "epoch": 0.406517094017094, + "grad_norm": 0.48379644751548767, + "learning_rate": 0.0001950248758939772, + "loss": 0.9978, + "step": 2283 + }, + { + "epoch": 0.4066951566951567, + "grad_norm": 0.5943657755851746, + "learning_rate": 0.00019502051486180744, + "loss": 1.0466, + "step": 2284 + }, + { + "epoch": 0.40687321937321935, + "grad_norm": 0.5721273422241211, + "learning_rate": 0.00019501615196790812, + "loss": 1.2674, + "step": 2285 + }, + { + "epoch": 0.40705128205128205, + "grad_norm": 0.47624221444129944, + "learning_rate": 0.00019501178721236464, + "loss": 1.089, + "step": 2286 + }, + { + "epoch": 0.40722934472934474, + 
"grad_norm": 0.5091297030448914, + "learning_rate": 0.0001950074205952626, + "loss": 1.2035, + "step": 2287 + }, + { + "epoch": 0.4074074074074074, + "grad_norm": 0.45206236839294434, + "learning_rate": 0.0001950030521166875, + "loss": 0.9188, + "step": 2288 + }, + { + "epoch": 0.4075854700854701, + "grad_norm": 0.5563844442367554, + "learning_rate": 0.00019499868177672497, + "loss": 1.3444, + "step": 2289 + }, + { + "epoch": 0.4077635327635328, + "grad_norm": 0.4971138536930084, + "learning_rate": 0.00019499430957546055, + "loss": 1.1615, + "step": 2290 + }, + { + "epoch": 0.4079415954415954, + "grad_norm": 0.49355944991111755, + "learning_rate": 0.00019498993551298, + "loss": 1.1528, + "step": 2291 + }, + { + "epoch": 0.4081196581196581, + "grad_norm": 0.534705638885498, + "learning_rate": 0.000194985559589369, + "loss": 1.197, + "step": 2292 + }, + { + "epoch": 0.4082977207977208, + "grad_norm": 0.5113020539283752, + "learning_rate": 0.0001949811818047133, + "loss": 1.109, + "step": 2293 + }, + { + "epoch": 0.40847578347578345, + "grad_norm": 0.4823366701602936, + "learning_rate": 0.00019497680215909858, + "loss": 1.168, + "step": 2294 + }, + { + "epoch": 0.40865384615384615, + "grad_norm": 0.500792920589447, + "learning_rate": 0.00019497242065261077, + "loss": 1.1567, + "step": 2295 + }, + { + "epoch": 0.40883190883190884, + "grad_norm": 0.5047918558120728, + "learning_rate": 0.00019496803728533566, + "loss": 1.0515, + "step": 2296 + }, + { + "epoch": 0.40900997150997154, + "grad_norm": 0.474624365568161, + "learning_rate": 0.00019496365205735913, + "loss": 1.1747, + "step": 2297 + }, + { + "epoch": 0.4091880341880342, + "grad_norm": 0.5522183179855347, + "learning_rate": 0.0001949592649687671, + "loss": 1.1506, + "step": 2298 + }, + { + "epoch": 0.4093660968660969, + "grad_norm": 0.4526083767414093, + "learning_rate": 0.00019495487601964553, + "loss": 0.9968, + "step": 2299 + }, + { + "epoch": 0.40954415954415957, + "grad_norm": 0.545845091342926, + 
"learning_rate": 0.00019495048521008044, + "loss": 1.146, + "step": 2300 + }, + { + "epoch": 0.4097222222222222, + "grad_norm": 0.5475544333457947, + "learning_rate": 0.00019494609254015784, + "loss": 1.0101, + "step": 2301 + }, + { + "epoch": 0.4099002849002849, + "grad_norm": 0.43419042229652405, + "learning_rate": 0.00019494169800996373, + "loss": 1.065, + "step": 2302 + }, + { + "epoch": 0.4100783475783476, + "grad_norm": 0.44998374581336975, + "learning_rate": 0.00019493730161958435, + "loss": 0.9948, + "step": 2303 + }, + { + "epoch": 0.41025641025641024, + "grad_norm": 0.5401661992073059, + "learning_rate": 0.0001949329033691057, + "loss": 1.0473, + "step": 2304 + }, + { + "epoch": 0.41043447293447294, + "grad_norm": 0.48064103722572327, + "learning_rate": 0.00019492850325861404, + "loss": 1.0486, + "step": 2305 + }, + { + "epoch": 0.41061253561253563, + "grad_norm": 0.5398300290107727, + "learning_rate": 0.00019492410128819557, + "loss": 1.0314, + "step": 2306 + }, + { + "epoch": 0.4107905982905983, + "grad_norm": 0.4771125912666321, + "learning_rate": 0.0001949196974579365, + "loss": 0.9855, + "step": 2307 + }, + { + "epoch": 0.41096866096866097, + "grad_norm": 0.5375809669494629, + "learning_rate": 0.00019491529176792315, + "loss": 1.0777, + "step": 2308 + }, + { + "epoch": 0.41114672364672367, + "grad_norm": 0.48424094915390015, + "learning_rate": 0.00019491088421824183, + "loss": 1.0751, + "step": 2309 + }, + { + "epoch": 0.4113247863247863, + "grad_norm": 0.5054880380630493, + "learning_rate": 0.00019490647480897887, + "loss": 1.2457, + "step": 2310 + }, + { + "epoch": 0.411502849002849, + "grad_norm": 0.47118356823921204, + "learning_rate": 0.0001949020635402207, + "loss": 1.0445, + "step": 2311 + }, + { + "epoch": 0.4116809116809117, + "grad_norm": 0.47171851992607117, + "learning_rate": 0.00019489765041205375, + "loss": 1.0062, + "step": 2312 + }, + { + "epoch": 0.41185897435897434, + "grad_norm": 0.5703238844871521, + "learning_rate": 
0.00019489323542456447, + "loss": 1.5639, + "step": 2313 + }, + { + "epoch": 0.41203703703703703, + "grad_norm": 0.5045075416564941, + "learning_rate": 0.00019488881857783935, + "loss": 1.1665, + "step": 2314 + }, + { + "epoch": 0.41221509971509973, + "grad_norm": 0.46835362911224365, + "learning_rate": 0.00019488439987196495, + "loss": 1.2078, + "step": 2315 + }, + { + "epoch": 0.41239316239316237, + "grad_norm": 0.5187196731567383, + "learning_rate": 0.00019487997930702785, + "loss": 1.1049, + "step": 2316 + }, + { + "epoch": 0.41257122507122507, + "grad_norm": 0.5190554857254028, + "learning_rate": 0.00019487555688311463, + "loss": 1.331, + "step": 2317 + }, + { + "epoch": 0.41274928774928776, + "grad_norm": 0.7394969463348389, + "learning_rate": 0.00019487113260031197, + "loss": 0.9646, + "step": 2318 + }, + { + "epoch": 0.4129273504273504, + "grad_norm": 0.532982349395752, + "learning_rate": 0.00019486670645870656, + "loss": 1.166, + "step": 2319 + }, + { + "epoch": 0.4131054131054131, + "grad_norm": 0.48659515380859375, + "learning_rate": 0.00019486227845838509, + "loss": 1.0016, + "step": 2320 + }, + { + "epoch": 0.4132834757834758, + "grad_norm": 0.5364453196525574, + "learning_rate": 0.00019485784859943434, + "loss": 1.3877, + "step": 2321 + }, + { + "epoch": 0.41346153846153844, + "grad_norm": 0.49788740277290344, + "learning_rate": 0.0001948534168819411, + "loss": 1.2949, + "step": 2322 + }, + { + "epoch": 0.41363960113960113, + "grad_norm": 0.5125377774238586, + "learning_rate": 0.00019484898330599217, + "loss": 0.9769, + "step": 2323 + }, + { + "epoch": 0.41381766381766383, + "grad_norm": 0.5434861779212952, + "learning_rate": 0.00019484454787167447, + "loss": 1.254, + "step": 2324 + }, + { + "epoch": 0.41399572649572647, + "grad_norm": 0.5324583053588867, + "learning_rate": 0.00019484011057907487, + "loss": 0.9788, + "step": 2325 + }, + { + "epoch": 0.41417378917378916, + "grad_norm": 0.4806961715221405, + "learning_rate": 0.00019483567142828033, + 
"loss": 1.0089, + "step": 2326 + }, + { + "epoch": 0.41435185185185186, + "grad_norm": 0.5152947306632996, + "learning_rate": 0.0001948312304193778, + "loss": 1.15, + "step": 2327 + }, + { + "epoch": 0.41452991452991456, + "grad_norm": 0.6030138731002808, + "learning_rate": 0.0001948267875524543, + "loss": 1.196, + "step": 2328 + }, + { + "epoch": 0.4147079772079772, + "grad_norm": 0.4504946768283844, + "learning_rate": 0.0001948223428275969, + "loss": 0.8742, + "step": 2329 + }, + { + "epoch": 0.4148860398860399, + "grad_norm": 0.5195745825767517, + "learning_rate": 0.00019481789624489263, + "loss": 1.0104, + "step": 2330 + }, + { + "epoch": 0.4150641025641026, + "grad_norm": 0.5269250869750977, + "learning_rate": 0.0001948134478044287, + "loss": 1.2284, + "step": 2331 + }, + { + "epoch": 0.41524216524216523, + "grad_norm": 0.5302315354347229, + "learning_rate": 0.00019480899750629218, + "loss": 1.1374, + "step": 2332 + }, + { + "epoch": 0.4154202279202279, + "grad_norm": 0.5501471161842346, + "learning_rate": 0.0001948045453505703, + "loss": 1.214, + "step": 2333 + }, + { + "epoch": 0.4155982905982906, + "grad_norm": 0.4674588739871979, + "learning_rate": 0.0001948000913373503, + "loss": 1.0568, + "step": 2334 + }, + { + "epoch": 0.41577635327635326, + "grad_norm": 0.5262266993522644, + "learning_rate": 0.0001947956354667195, + "loss": 1.111, + "step": 2335 + }, + { + "epoch": 0.41595441595441596, + "grad_norm": 0.4549071788787842, + "learning_rate": 0.00019479117773876507, + "loss": 1.2655, + "step": 2336 + }, + { + "epoch": 0.41613247863247865, + "grad_norm": 0.48897311091423035, + "learning_rate": 0.00019478671815357447, + "loss": 1.0543, + "step": 2337 + }, + { + "epoch": 0.4163105413105413, + "grad_norm": 0.5544867515563965, + "learning_rate": 0.000194782256711235, + "loss": 1.2276, + "step": 2338 + }, + { + "epoch": 0.416488603988604, + "grad_norm": 0.5050773024559021, + "learning_rate": 0.0001947777934118341, + "loss": 0.9781, + "step": 2339 + }, + { + 
"epoch": 0.4166666666666667, + "grad_norm": 0.4831899106502533, + "learning_rate": 0.00019477332825545925, + "loss": 1.0213, + "step": 2340 + }, + { + "epoch": 0.4168447293447293, + "grad_norm": 0.5392552614212036, + "learning_rate": 0.0001947688612421979, + "loss": 1.3251, + "step": 2341 + }, + { + "epoch": 0.417022792022792, + "grad_norm": 0.5003608465194702, + "learning_rate": 0.00019476439237213754, + "loss": 1.0714, + "step": 2342 + }, + { + "epoch": 0.4172008547008547, + "grad_norm": 0.5016986727714539, + "learning_rate": 0.00019475992164536582, + "loss": 1.0656, + "step": 2343 + }, + { + "epoch": 0.41737891737891736, + "grad_norm": 0.5139234066009521, + "learning_rate": 0.00019475544906197024, + "loss": 1.1317, + "step": 2344 + }, + { + "epoch": 0.41755698005698005, + "grad_norm": 0.582478940486908, + "learning_rate": 0.00019475097462203847, + "loss": 1.4209, + "step": 2345 + }, + { + "epoch": 0.41773504273504275, + "grad_norm": 0.5248767137527466, + "learning_rate": 0.00019474649832565823, + "loss": 1.2965, + "step": 2346 + }, + { + "epoch": 0.4179131054131054, + "grad_norm": 0.4977390170097351, + "learning_rate": 0.00019474202017291713, + "loss": 1.3319, + "step": 2347 + }, + { + "epoch": 0.4180911680911681, + "grad_norm": 0.4868984818458557, + "learning_rate": 0.00019473754016390298, + "loss": 1.0595, + "step": 2348 + }, + { + "epoch": 0.4182692307692308, + "grad_norm": 0.5965346693992615, + "learning_rate": 0.00019473305829870353, + "loss": 1.2289, + "step": 2349 + }, + { + "epoch": 0.4184472934472934, + "grad_norm": 0.46590209007263184, + "learning_rate": 0.0001947285745774066, + "loss": 1.0468, + "step": 2350 + }, + { + "epoch": 0.4186253561253561, + "grad_norm": 0.497811883687973, + "learning_rate": 0.0001947240890001, + "loss": 1.1247, + "step": 2351 + }, + { + "epoch": 0.4188034188034188, + "grad_norm": 0.5348289012908936, + "learning_rate": 0.0001947196015668717, + "loss": 0.9496, + "step": 2352 + }, + { + "epoch": 0.41898148148148145, + 
"grad_norm": 0.5086174607276917, + "learning_rate": 0.0001947151122778095, + "loss": 0.8869, + "step": 2353 + }, + { + "epoch": 0.41915954415954415, + "grad_norm": 0.4844677150249481, + "learning_rate": 0.00019471062113300146, + "loss": 0.847, + "step": 2354 + }, + { + "epoch": 0.41933760683760685, + "grad_norm": 0.5395866632461548, + "learning_rate": 0.00019470612813253556, + "loss": 0.9684, + "step": 2355 + }, + { + "epoch": 0.41951566951566954, + "grad_norm": 0.479403018951416, + "learning_rate": 0.0001947016332764998, + "loss": 1.0532, + "step": 2356 + }, + { + "epoch": 0.4196937321937322, + "grad_norm": 0.5499961376190186, + "learning_rate": 0.00019469713656498227, + "loss": 1.2565, + "step": 2357 + }, + { + "epoch": 0.4198717948717949, + "grad_norm": 0.5865352153778076, + "learning_rate": 0.00019469263799807104, + "loss": 1.1349, + "step": 2358 + }, + { + "epoch": 0.4200498575498576, + "grad_norm": 0.4454309046268463, + "learning_rate": 0.00019468813757585432, + "loss": 0.9631, + "step": 2359 + }, + { + "epoch": 0.4202279202279202, + "grad_norm": 0.48426875472068787, + "learning_rate": 0.00019468363529842023, + "loss": 0.9795, + "step": 2360 + }, + { + "epoch": 0.4204059829059829, + "grad_norm": 0.47428226470947266, + "learning_rate": 0.00019467913116585697, + "loss": 0.9316, + "step": 2361 + }, + { + "epoch": 0.4205840455840456, + "grad_norm": 0.5193758010864258, + "learning_rate": 0.00019467462517825282, + "loss": 1.235, + "step": 2362 + }, + { + "epoch": 0.42076210826210825, + "grad_norm": 0.49845513701438904, + "learning_rate": 0.00019467011733569607, + "loss": 1.2413, + "step": 2363 + }, + { + "epoch": 0.42094017094017094, + "grad_norm": 0.45483845472335815, + "learning_rate": 0.00019466560763827502, + "loss": 1.2817, + "step": 2364 + }, + { + "epoch": 0.42111823361823364, + "grad_norm": 0.43345287442207336, + "learning_rate": 0.00019466109608607806, + "loss": 0.8568, + "step": 2365 + }, + { + "epoch": 0.4212962962962963, + "grad_norm": 
0.4467088282108307, + "learning_rate": 0.00019465658267919352, + "loss": 1.1408, + "step": 2366 + }, + { + "epoch": 0.421474358974359, + "grad_norm": 0.6705610156059265, + "learning_rate": 0.00019465206741770992, + "loss": 1.445, + "step": 2367 + }, + { + "epoch": 0.42165242165242167, + "grad_norm": 0.5037859678268433, + "learning_rate": 0.00019464755030171565, + "loss": 0.8682, + "step": 2368 + }, + { + "epoch": 0.4218304843304843, + "grad_norm": 0.49576324224472046, + "learning_rate": 0.00019464303133129928, + "loss": 0.8387, + "step": 2369 + }, + { + "epoch": 0.422008547008547, + "grad_norm": 0.5222806334495544, + "learning_rate": 0.00019463851050654927, + "loss": 1.1443, + "step": 2370 + }, + { + "epoch": 0.4221866096866097, + "grad_norm": 0.4966863989830017, + "learning_rate": 0.00019463398782755426, + "loss": 1.1555, + "step": 2371 + }, + { + "epoch": 0.42236467236467234, + "grad_norm": 0.6140168309211731, + "learning_rate": 0.00019462946329440285, + "loss": 1.2264, + "step": 2372 + }, + { + "epoch": 0.42254273504273504, + "grad_norm": 0.4906651973724365, + "learning_rate": 0.0001946249369071837, + "loss": 1.2459, + "step": 2373 + }, + { + "epoch": 0.42272079772079774, + "grad_norm": 0.5956700444221497, + "learning_rate": 0.00019462040866598544, + "loss": 1.1521, + "step": 2374 + }, + { + "epoch": 0.4228988603988604, + "grad_norm": 0.46044886112213135, + "learning_rate": 0.00019461587857089687, + "loss": 1.2084, + "step": 2375 + }, + { + "epoch": 0.4230769230769231, + "grad_norm": 0.5109430551528931, + "learning_rate": 0.00019461134662200668, + "loss": 1.2684, + "step": 2376 + }, + { + "epoch": 0.42325498575498577, + "grad_norm": 0.4373733103275299, + "learning_rate": 0.0001946068128194037, + "loss": 1.0451, + "step": 2377 + }, + { + "epoch": 0.4234330484330484, + "grad_norm": 0.553817868232727, + "learning_rate": 0.00019460227716317673, + "loss": 1.1052, + "step": 2378 + }, + { + "epoch": 0.4236111111111111, + "grad_norm": 0.5742647647857666, + 
"learning_rate": 0.00019459773965341468, + "loss": 1.1647, + "step": 2379 + }, + { + "epoch": 0.4237891737891738, + "grad_norm": 0.5461940169334412, + "learning_rate": 0.00019459320029020642, + "loss": 1.0953, + "step": 2380 + }, + { + "epoch": 0.42396723646723644, + "grad_norm": 0.5837802290916443, + "learning_rate": 0.0001945886590736409, + "loss": 1.1303, + "step": 2381 + }, + { + "epoch": 0.42414529914529914, + "grad_norm": 0.5316985249519348, + "learning_rate": 0.0001945841160038071, + "loss": 1.1204, + "step": 2382 + }, + { + "epoch": 0.42432336182336183, + "grad_norm": 0.5846191048622131, + "learning_rate": 0.00019457957108079404, + "loss": 1.2622, + "step": 2383 + }, + { + "epoch": 0.42450142450142453, + "grad_norm": 0.43266957998275757, + "learning_rate": 0.00019457502430469075, + "loss": 0.9834, + "step": 2384 + }, + { + "epoch": 0.42467948717948717, + "grad_norm": 0.514081597328186, + "learning_rate": 0.00019457047567558632, + "loss": 0.8413, + "step": 2385 + }, + { + "epoch": 0.42485754985754987, + "grad_norm": 0.4831700325012207, + "learning_rate": 0.00019456592519356987, + "loss": 0.9244, + "step": 2386 + }, + { + "epoch": 0.42503561253561256, + "grad_norm": 0.5612850785255432, + "learning_rate": 0.00019456137285873057, + "loss": 0.9438, + "step": 2387 + }, + { + "epoch": 0.4252136752136752, + "grad_norm": 0.5197352766990662, + "learning_rate": 0.00019455681867115758, + "loss": 1.1095, + "step": 2388 + }, + { + "epoch": 0.4253917378917379, + "grad_norm": 0.5045261979103088, + "learning_rate": 0.00019455226263094018, + "loss": 1.0007, + "step": 2389 + }, + { + "epoch": 0.4255698005698006, + "grad_norm": 0.5167570114135742, + "learning_rate": 0.00019454770473816758, + "loss": 1.1335, + "step": 2390 + }, + { + "epoch": 0.42574786324786323, + "grad_norm": 0.49262070655822754, + "learning_rate": 0.00019454314499292913, + "loss": 1.0436, + "step": 2391 + }, + { + "epoch": 0.42592592592592593, + "grad_norm": 0.4489207863807678, + "learning_rate": 
0.00019453858339531417, + "loss": 1.0138, + "step": 2392 + }, + { + "epoch": 0.4261039886039886, + "grad_norm": 0.6024920344352722, + "learning_rate": 0.00019453401994541203, + "loss": 1.1921, + "step": 2393 + }, + { + "epoch": 0.42628205128205127, + "grad_norm": 0.46807861328125, + "learning_rate": 0.00019452945464331215, + "loss": 1.0947, + "step": 2394 + }, + { + "epoch": 0.42646011396011396, + "grad_norm": 0.48776543140411377, + "learning_rate": 0.00019452488748910397, + "loss": 1.0029, + "step": 2395 + }, + { + "epoch": 0.42663817663817666, + "grad_norm": 0.4798663556575775, + "learning_rate": 0.000194520318482877, + "loss": 0.7863, + "step": 2396 + }, + { + "epoch": 0.4268162393162393, + "grad_norm": 0.5067816972732544, + "learning_rate": 0.0001945157476247207, + "loss": 1.0049, + "step": 2397 + }, + { + "epoch": 0.426994301994302, + "grad_norm": 0.5179638266563416, + "learning_rate": 0.00019451117491472468, + "loss": 1.1851, + "step": 2398 + }, + { + "epoch": 0.4271723646723647, + "grad_norm": 0.4782430827617645, + "learning_rate": 0.00019450660035297854, + "loss": 1.125, + "step": 2399 + }, + { + "epoch": 0.42735042735042733, + "grad_norm": 0.560077965259552, + "learning_rate": 0.00019450202393957186, + "loss": 1.1843, + "step": 2400 + }, + { + "epoch": 0.42752849002849, + "grad_norm": 0.5247970223426819, + "learning_rate": 0.00019449744567459436, + "loss": 1.1576, + "step": 2401 + }, + { + "epoch": 0.4277065527065527, + "grad_norm": 0.6414062976837158, + "learning_rate": 0.00019449286555813568, + "loss": 1.1833, + "step": 2402 + }, + { + "epoch": 0.42788461538461536, + "grad_norm": 0.5006586909294128, + "learning_rate": 0.00019448828359028563, + "loss": 1.1778, + "step": 2403 + }, + { + "epoch": 0.42806267806267806, + "grad_norm": 0.4946450889110565, + "learning_rate": 0.0001944836997711339, + "loss": 1.1611, + "step": 2404 + }, + { + "epoch": 0.42824074074074076, + "grad_norm": 0.4601200222969055, + "learning_rate": 0.00019447911410077037, + "loss": 
1.2456, + "step": 2405 + }, + { + "epoch": 0.4284188034188034, + "grad_norm": 0.4653947651386261, + "learning_rate": 0.00019447452657928485, + "loss": 1.0941, + "step": 2406 + }, + { + "epoch": 0.4285968660968661, + "grad_norm": 0.5015713572502136, + "learning_rate": 0.00019446993720676726, + "loss": 1.3113, + "step": 2407 + }, + { + "epoch": 0.4287749287749288, + "grad_norm": 0.5803143978118896, + "learning_rate": 0.0001944653459833075, + "loss": 1.0568, + "step": 2408 + }, + { + "epoch": 0.42895299145299143, + "grad_norm": 0.5259647965431213, + "learning_rate": 0.0001944607529089955, + "loss": 1.1243, + "step": 2409 + }, + { + "epoch": 0.4291310541310541, + "grad_norm": 0.5150414109230042, + "learning_rate": 0.00019445615798392124, + "loss": 1.0676, + "step": 2410 + }, + { + "epoch": 0.4293091168091168, + "grad_norm": 0.5848649740219116, + "learning_rate": 0.0001944515612081748, + "loss": 1.0671, + "step": 2411 + }, + { + "epoch": 0.42948717948717946, + "grad_norm": 0.5696990489959717, + "learning_rate": 0.00019444696258184626, + "loss": 1.3323, + "step": 2412 + }, + { + "epoch": 0.42966524216524216, + "grad_norm": 0.49822330474853516, + "learning_rate": 0.00019444236210502567, + "loss": 1.1004, + "step": 2413 + }, + { + "epoch": 0.42984330484330485, + "grad_norm": 0.4683490991592407, + "learning_rate": 0.00019443775977780317, + "loss": 0.9768, + "step": 2414 + }, + { + "epoch": 0.43002136752136755, + "grad_norm": 0.5703811049461365, + "learning_rate": 0.00019443315560026893, + "loss": 1.154, + "step": 2415 + }, + { + "epoch": 0.4301994301994302, + "grad_norm": 0.5121861100196838, + "learning_rate": 0.0001944285495725132, + "loss": 1.1388, + "step": 2416 + }, + { + "epoch": 0.4303774928774929, + "grad_norm": 0.4864094853401184, + "learning_rate": 0.00019442394169462619, + "loss": 0.9214, + "step": 2417 + }, + { + "epoch": 0.4305555555555556, + "grad_norm": 0.5234864354133606, + "learning_rate": 0.0001944193319666982, + "loss": 1.2787, + "step": 2418 + }, + { + 
"epoch": 0.4307336182336182, + "grad_norm": 0.5137650370597839, + "learning_rate": 0.00019441472038881955, + "loss": 1.1406, + "step": 2419 + }, + { + "epoch": 0.4309116809116809, + "grad_norm": 0.49687784910202026, + "learning_rate": 0.00019441010696108054, + "loss": 0.93, + "step": 2420 + }, + { + "epoch": 0.4310897435897436, + "grad_norm": 0.5078722834587097, + "learning_rate": 0.00019440549168357163, + "loss": 1.1417, + "step": 2421 + }, + { + "epoch": 0.43126780626780625, + "grad_norm": 0.4483391046524048, + "learning_rate": 0.00019440087455638324, + "loss": 0.9016, + "step": 2422 + }, + { + "epoch": 0.43144586894586895, + "grad_norm": 0.5963045954704285, + "learning_rate": 0.00019439625557960576, + "loss": 1.1567, + "step": 2423 + }, + { + "epoch": 0.43162393162393164, + "grad_norm": 0.5534471273422241, + "learning_rate": 0.0001943916347533298, + "loss": 1.1409, + "step": 2424 + }, + { + "epoch": 0.4318019943019943, + "grad_norm": 0.6400241851806641, + "learning_rate": 0.0001943870120776458, + "loss": 1.2041, + "step": 2425 + }, + { + "epoch": 0.431980056980057, + "grad_norm": 0.4599420726299286, + "learning_rate": 0.0001943823875526444, + "loss": 1.023, + "step": 2426 + }, + { + "epoch": 0.4321581196581197, + "grad_norm": 0.4799708425998688, + "learning_rate": 0.00019437776117841614, + "loss": 1.0872, + "step": 2427 + }, + { + "epoch": 0.4323361823361823, + "grad_norm": 0.5138532519340515, + "learning_rate": 0.00019437313295505172, + "loss": 1.1175, + "step": 2428 + }, + { + "epoch": 0.432514245014245, + "grad_norm": 0.538223147392273, + "learning_rate": 0.00019436850288264183, + "loss": 1.1203, + "step": 2429 + }, + { + "epoch": 0.4326923076923077, + "grad_norm": 0.458044171333313, + "learning_rate": 0.00019436387096127713, + "loss": 1.0383, + "step": 2430 + }, + { + "epoch": 0.43287037037037035, + "grad_norm": 0.5928303599357605, + "learning_rate": 0.00019435923719104842, + "loss": 1.1191, + "step": 2431 + }, + { + "epoch": 0.43304843304843305, + 
"grad_norm": 0.5818437933921814, + "learning_rate": 0.00019435460157204645, + "loss": 1.0352, + "step": 2432 + }, + { + "epoch": 0.43322649572649574, + "grad_norm": 0.487341046333313, + "learning_rate": 0.0001943499641043621, + "loss": 1.2608, + "step": 2433 + }, + { + "epoch": 0.4334045584045584, + "grad_norm": 0.4737292230129242, + "learning_rate": 0.0001943453247880862, + "loss": 1.0084, + "step": 2434 + }, + { + "epoch": 0.4335826210826211, + "grad_norm": 0.4251207709312439, + "learning_rate": 0.0001943406836233096, + "loss": 0.9163, + "step": 2435 + }, + { + "epoch": 0.4337606837606838, + "grad_norm": 0.49468478560447693, + "learning_rate": 0.00019433604061012331, + "loss": 1.0293, + "step": 2436 + }, + { + "epoch": 0.4339387464387464, + "grad_norm": 0.47120022773742676, + "learning_rate": 0.00019433139574861826, + "loss": 1.0097, + "step": 2437 + }, + { + "epoch": 0.4341168091168091, + "grad_norm": 0.5060358047485352, + "learning_rate": 0.00019432674903888548, + "loss": 1.0683, + "step": 2438 + }, + { + "epoch": 0.4342948717948718, + "grad_norm": 0.5455917119979858, + "learning_rate": 0.00019432210048101598, + "loss": 0.8886, + "step": 2439 + }, + { + "epoch": 0.43447293447293445, + "grad_norm": 0.7960546612739563, + "learning_rate": 0.00019431745007510086, + "loss": 0.8648, + "step": 2440 + }, + { + "epoch": 0.43465099715099714, + "grad_norm": 0.5069689154624939, + "learning_rate": 0.00019431279782123126, + "loss": 1.1315, + "step": 2441 + }, + { + "epoch": 0.43482905982905984, + "grad_norm": 0.5597776174545288, + "learning_rate": 0.0001943081437194983, + "loss": 1.2281, + "step": 2442 + }, + { + "epoch": 0.43500712250712253, + "grad_norm": 0.4527420997619629, + "learning_rate": 0.00019430348776999315, + "loss": 0.7576, + "step": 2443 + }, + { + "epoch": 0.4351851851851852, + "grad_norm": 0.5625936388969421, + "learning_rate": 0.00019429882997280706, + "loss": 1.0302, + "step": 2444 + }, + { + "epoch": 0.43536324786324787, + "grad_norm": 0.5173513293266296, 
+ "learning_rate": 0.0001942941703280313, + "loss": 1.2255, + "step": 2445 + }, + { + "epoch": 0.43554131054131057, + "grad_norm": 0.45889151096343994, + "learning_rate": 0.00019428950883575714, + "loss": 0.9322, + "step": 2446 + }, + { + "epoch": 0.4357193732193732, + "grad_norm": 0.5288477540016174, + "learning_rate": 0.00019428484549607593, + "loss": 1.0572, + "step": 2447 + }, + { + "epoch": 0.4358974358974359, + "grad_norm": 0.48328033089637756, + "learning_rate": 0.00019428018030907902, + "loss": 1.1213, + "step": 2448 + }, + { + "epoch": 0.4360754985754986, + "grad_norm": 0.5146737098693848, + "learning_rate": 0.00019427551327485786, + "loss": 0.9633, + "step": 2449 + }, + { + "epoch": 0.43625356125356124, + "grad_norm": 0.5138360261917114, + "learning_rate": 0.00019427084439350382, + "loss": 1.0561, + "step": 2450 + }, + { + "epoch": 0.43643162393162394, + "grad_norm": 0.5192533135414124, + "learning_rate": 0.00019426617366510843, + "loss": 1.1704, + "step": 2451 + }, + { + "epoch": 0.43660968660968663, + "grad_norm": 0.4819495379924774, + "learning_rate": 0.00019426150108976318, + "loss": 1.0958, + "step": 2452 + }, + { + "epoch": 0.43678774928774927, + "grad_norm": 0.4626680910587311, + "learning_rate": 0.00019425682666755965, + "loss": 1.1872, + "step": 2453 + }, + { + "epoch": 0.43696581196581197, + "grad_norm": 0.5773931741714478, + "learning_rate": 0.00019425215039858937, + "loss": 1.0722, + "step": 2454 + }, + { + "epoch": 0.43714387464387466, + "grad_norm": 0.5003872513771057, + "learning_rate": 0.00019424747228294402, + "loss": 1.0561, + "step": 2455 + }, + { + "epoch": 0.4373219373219373, + "grad_norm": 0.47370314598083496, + "learning_rate": 0.0001942427923207152, + "loss": 1.1619, + "step": 2456 + }, + { + "epoch": 0.4375, + "grad_norm": 0.466421514749527, + "learning_rate": 0.00019423811051199466, + "loss": 1.1311, + "step": 2457 + }, + { + "epoch": 0.4376780626780627, + "grad_norm": 0.44564682245254517, + "learning_rate": 
0.00019423342685687413, + "loss": 1.1889, + "step": 2458 + }, + { + "epoch": 0.43785612535612534, + "grad_norm": 0.40986698865890503, + "learning_rate": 0.00019422874135544533, + "loss": 0.7312, + "step": 2459 + }, + { + "epoch": 0.43803418803418803, + "grad_norm": 0.4714358448982239, + "learning_rate": 0.0001942240540078001, + "loss": 0.9273, + "step": 2460 + }, + { + "epoch": 0.43821225071225073, + "grad_norm": 0.5298398733139038, + "learning_rate": 0.00019421936481403025, + "loss": 1.3377, + "step": 2461 + }, + { + "epoch": 0.43839031339031337, + "grad_norm": 0.6326695680618286, + "learning_rate": 0.0001942146737742277, + "loss": 1.0258, + "step": 2462 + }, + { + "epoch": 0.43856837606837606, + "grad_norm": 0.5087653994560242, + "learning_rate": 0.00019420998088848427, + "loss": 1.0007, + "step": 2463 + }, + { + "epoch": 0.43874643874643876, + "grad_norm": 0.4895429313182831, + "learning_rate": 0.00019420528615689202, + "loss": 1.0032, + "step": 2464 + }, + { + "epoch": 0.4389245014245014, + "grad_norm": 0.5029937028884888, + "learning_rate": 0.00019420058957954285, + "loss": 1.2877, + "step": 2465 + }, + { + "epoch": 0.4391025641025641, + "grad_norm": 0.4953192174434662, + "learning_rate": 0.00019419589115652884, + "loss": 1.0759, + "step": 2466 + }, + { + "epoch": 0.4392806267806268, + "grad_norm": 0.5081778168678284, + "learning_rate": 0.000194191190887942, + "loss": 0.8816, + "step": 2467 + }, + { + "epoch": 0.43945868945868943, + "grad_norm": 0.5065913200378418, + "learning_rate": 0.00019418648877387446, + "loss": 1.0362, + "step": 2468 + }, + { + "epoch": 0.43963675213675213, + "grad_norm": 0.540600061416626, + "learning_rate": 0.00019418178481441832, + "loss": 1.0911, + "step": 2469 + }, + { + "epoch": 0.4398148148148148, + "grad_norm": 0.5122954845428467, + "learning_rate": 0.00019417707900966572, + "loss": 0.9866, + "step": 2470 + }, + { + "epoch": 0.43999287749287747, + "grad_norm": 0.5380190014839172, + "learning_rate": 0.00019417237135970893, + 
"loss": 1.2775, + "step": 2471 + }, + { + "epoch": 0.44017094017094016, + "grad_norm": 1.2977570295333862, + "learning_rate": 0.00019416766186464016, + "loss": 1.3993, + "step": 2472 + }, + { + "epoch": 0.44034900284900286, + "grad_norm": 0.48105308413505554, + "learning_rate": 0.00019416295052455165, + "loss": 0.9369, + "step": 2473 + }, + { + "epoch": 0.44052706552706555, + "grad_norm": 0.4742157459259033, + "learning_rate": 0.00019415823733953574, + "loss": 1.101, + "step": 2474 + }, + { + "epoch": 0.4407051282051282, + "grad_norm": 0.4958631694316864, + "learning_rate": 0.00019415352230968473, + "loss": 0.9906, + "step": 2475 + }, + { + "epoch": 0.4408831908831909, + "grad_norm": 0.5808146595954895, + "learning_rate": 0.00019414880543509107, + "loss": 1.2315, + "step": 2476 + }, + { + "epoch": 0.4410612535612536, + "grad_norm": 0.4294755160808563, + "learning_rate": 0.00019414408671584714, + "loss": 0.8275, + "step": 2477 + }, + { + "epoch": 0.4412393162393162, + "grad_norm": 0.5346055626869202, + "learning_rate": 0.0001941393661520454, + "loss": 1.2432, + "step": 2478 + }, + { + "epoch": 0.4414173789173789, + "grad_norm": 0.5827590227127075, + "learning_rate": 0.00019413464374377833, + "loss": 1.3204, + "step": 2479 + }, + { + "epoch": 0.4415954415954416, + "grad_norm": 0.45688143372535706, + "learning_rate": 0.00019412991949113847, + "loss": 0.9307, + "step": 2480 + }, + { + "epoch": 0.44177350427350426, + "grad_norm": 0.512999415397644, + "learning_rate": 0.0001941251933942184, + "loss": 1.2808, + "step": 2481 + }, + { + "epoch": 0.44195156695156695, + "grad_norm": 0.4546334445476532, + "learning_rate": 0.00019412046545311064, + "loss": 1.0156, + "step": 2482 + }, + { + "epoch": 0.44212962962962965, + "grad_norm": 0.48552581667900085, + "learning_rate": 0.00019411573566790793, + "loss": 1.3798, + "step": 2483 + }, + { + "epoch": 0.4423076923076923, + "grad_norm": 0.511970579624176, + "learning_rate": 0.00019411100403870287, + "loss": 1.065, + "step": 2484 + 
}, + { + "epoch": 0.442485754985755, + "grad_norm": 0.6367824077606201, + "learning_rate": 0.00019410627056558815, + "loss": 1.3242, + "step": 2485 + }, + { + "epoch": 0.4426638176638177, + "grad_norm": 0.48913368582725525, + "learning_rate": 0.00019410153524865659, + "loss": 0.9761, + "step": 2486 + }, + { + "epoch": 0.4428418803418803, + "grad_norm": 0.5077710151672363, + "learning_rate": 0.0001940967980880009, + "loss": 1.1023, + "step": 2487 + }, + { + "epoch": 0.443019943019943, + "grad_norm": 0.4956335723400116, + "learning_rate": 0.00019409205908371395, + "loss": 1.1788, + "step": 2488 + }, + { + "epoch": 0.4431980056980057, + "grad_norm": 0.4726616442203522, + "learning_rate": 0.00019408731823588853, + "loss": 1.1445, + "step": 2489 + }, + { + "epoch": 0.44337606837606836, + "grad_norm": 0.5676438212394714, + "learning_rate": 0.00019408257554461757, + "loss": 1.0344, + "step": 2490 + }, + { + "epoch": 0.44355413105413105, + "grad_norm": 0.537656843662262, + "learning_rate": 0.000194077831009994, + "loss": 0.9876, + "step": 2491 + }, + { + "epoch": 0.44373219373219375, + "grad_norm": 0.517905592918396, + "learning_rate": 0.00019407308463211074, + "loss": 1.1389, + "step": 2492 + }, + { + "epoch": 0.4439102564102564, + "grad_norm": 0.49227026104927063, + "learning_rate": 0.0001940683364110608, + "loss": 1.0351, + "step": 2493 + }, + { + "epoch": 0.4440883190883191, + "grad_norm": 0.5131173729896545, + "learning_rate": 0.00019406358634693725, + "loss": 1.0351, + "step": 2494 + }, + { + "epoch": 0.4442663817663818, + "grad_norm": 0.5064495205879211, + "learning_rate": 0.0001940588344398331, + "loss": 1.0248, + "step": 2495 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.44107526540756226, + "learning_rate": 0.00019405408068984148, + "loss": 0.8068, + "step": 2496 + }, + { + "epoch": 0.4446225071225071, + "grad_norm": 0.6711848378181458, + "learning_rate": 0.00019404932509705554, + "loss": 1.059, + "step": 2497 + }, + { + "epoch": 0.4448005698005698, + 
"grad_norm": 0.5862596035003662, + "learning_rate": 0.00019404456766156845, + "loss": 1.2012, + "step": 2498 + }, + { + "epoch": 0.44497863247863245, + "grad_norm": 0.5528512001037598, + "learning_rate": 0.0001940398083834734, + "loss": 1.1121, + "step": 2499 + }, + { + "epoch": 0.44515669515669515, + "grad_norm": 0.5326655507087708, + "learning_rate": 0.0001940350472628637, + "loss": 1.166, + "step": 2500 + }, + { + "epoch": 0.44533475783475784, + "grad_norm": 0.5384873747825623, + "learning_rate": 0.00019403028429983252, + "loss": 1.4111, + "step": 2501 + }, + { + "epoch": 0.44551282051282054, + "grad_norm": 0.5142310857772827, + "learning_rate": 0.0001940255194944733, + "loss": 1.3353, + "step": 2502 + }, + { + "epoch": 0.4456908831908832, + "grad_norm": 0.49124231934547424, + "learning_rate": 0.0001940207528468793, + "loss": 1.1443, + "step": 2503 + }, + { + "epoch": 0.4458689458689459, + "grad_norm": 0.509713888168335, + "learning_rate": 0.000194015984357144, + "loss": 1.1857, + "step": 2504 + }, + { + "epoch": 0.44604700854700857, + "grad_norm": 0.5211275219917297, + "learning_rate": 0.00019401121402536078, + "loss": 0.9911, + "step": 2505 + }, + { + "epoch": 0.4462250712250712, + "grad_norm": 0.480340838432312, + "learning_rate": 0.00019400644185162312, + "loss": 1.1018, + "step": 2506 + }, + { + "epoch": 0.4464031339031339, + "grad_norm": 0.4212559163570404, + "learning_rate": 0.00019400166783602448, + "loss": 0.7501, + "step": 2507 + }, + { + "epoch": 0.4465811965811966, + "grad_norm": 0.5110511183738708, + "learning_rate": 0.00019399689197865846, + "loss": 1.1244, + "step": 2508 + }, + { + "epoch": 0.44675925925925924, + "grad_norm": 0.5604230165481567, + "learning_rate": 0.0001939921142796186, + "loss": 1.1066, + "step": 2509 + }, + { + "epoch": 0.44693732193732194, + "grad_norm": 0.5578675270080566, + "learning_rate": 0.0001939873347389985, + "loss": 1.0514, + "step": 2510 + }, + { + "epoch": 0.44711538461538464, + "grad_norm": 0.520908772945404, + 
"learning_rate": 0.00019398255335689184, + "loss": 1.1217, + "step": 2511 + }, + { + "epoch": 0.4472934472934473, + "grad_norm": 0.4405131936073303, + "learning_rate": 0.00019397777013339224, + "loss": 1.043, + "step": 2512 + }, + { + "epoch": 0.44747150997151, + "grad_norm": 0.5217751860618591, + "learning_rate": 0.0001939729850685935, + "loss": 1.1301, + "step": 2513 + }, + { + "epoch": 0.44764957264957267, + "grad_norm": 0.6151493191719055, + "learning_rate": 0.00019396819816258932, + "loss": 1.3498, + "step": 2514 + }, + { + "epoch": 0.4478276353276353, + "grad_norm": 0.5622836947441101, + "learning_rate": 0.0001939634094154735, + "loss": 1.146, + "step": 2515 + }, + { + "epoch": 0.448005698005698, + "grad_norm": 0.4671688973903656, + "learning_rate": 0.00019395861882733984, + "loss": 0.9456, + "step": 2516 + }, + { + "epoch": 0.4481837606837607, + "grad_norm": 0.453951358795166, + "learning_rate": 0.00019395382639828223, + "loss": 1.0042, + "step": 2517 + }, + { + "epoch": 0.44836182336182334, + "grad_norm": 0.5150699615478516, + "learning_rate": 0.0001939490321283946, + "loss": 1.166, + "step": 2518 + }, + { + "epoch": 0.44853988603988604, + "grad_norm": 0.5718298554420471, + "learning_rate": 0.0001939442360177708, + "loss": 1.2033, + "step": 2519 + }, + { + "epoch": 0.44871794871794873, + "grad_norm": 0.5306782126426697, + "learning_rate": 0.00019393943806650488, + "loss": 1.0765, + "step": 2520 + }, + { + "epoch": 0.4488960113960114, + "grad_norm": 0.47633033990859985, + "learning_rate": 0.0001939346382746908, + "loss": 0.9957, + "step": 2521 + }, + { + "epoch": 0.44907407407407407, + "grad_norm": 0.496441513299942, + "learning_rate": 0.00019392983664242262, + "loss": 1.2016, + "step": 2522 + }, + { + "epoch": 0.44925213675213677, + "grad_norm": 0.45956477522850037, + "learning_rate": 0.00019392503316979442, + "loss": 1.026, + "step": 2523 + }, + { + "epoch": 0.4494301994301994, + "grad_norm": 0.5400575995445251, + "learning_rate": 0.0001939202278569003, + 
"loss": 1.0785, + "step": 2524 + }, + { + "epoch": 0.4496082621082621, + "grad_norm": 0.4847868084907532, + "learning_rate": 0.00019391542070383442, + "loss": 1.013, + "step": 2525 + }, + { + "epoch": 0.4497863247863248, + "grad_norm": 0.4694063663482666, + "learning_rate": 0.00019391061171069094, + "loss": 0.8793, + "step": 2526 + }, + { + "epoch": 0.44996438746438744, + "grad_norm": 0.5158169269561768, + "learning_rate": 0.00019390580087756413, + "loss": 0.9602, + "step": 2527 + }, + { + "epoch": 0.45014245014245013, + "grad_norm": 0.5404585003852844, + "learning_rate": 0.00019390098820454822, + "loss": 1.2247, + "step": 2528 + }, + { + "epoch": 0.45032051282051283, + "grad_norm": 0.5302738547325134, + "learning_rate": 0.00019389617369173752, + "loss": 0.918, + "step": 2529 + }, + { + "epoch": 0.45049857549857547, + "grad_norm": 0.5065485835075378, + "learning_rate": 0.00019389135733922634, + "loss": 1.0934, + "step": 2530 + }, + { + "epoch": 0.45067663817663817, + "grad_norm": 0.5491471886634827, + "learning_rate": 0.00019388653914710903, + "loss": 1.0736, + "step": 2531 + }, + { + "epoch": 0.45085470085470086, + "grad_norm": 0.4850206971168518, + "learning_rate": 0.00019388171911548005, + "loss": 1.2401, + "step": 2532 + }, + { + "epoch": 0.45103276353276356, + "grad_norm": 0.5419789552688599, + "learning_rate": 0.0001938768972444338, + "loss": 1.269, + "step": 2533 + }, + { + "epoch": 0.4512108262108262, + "grad_norm": 0.4209023714065552, + "learning_rate": 0.00019387207353406476, + "loss": 1.0544, + "step": 2534 + }, + { + "epoch": 0.4513888888888889, + "grad_norm": 0.578588604927063, + "learning_rate": 0.00019386724798446743, + "loss": 1.0564, + "step": 2535 + }, + { + "epoch": 0.4515669515669516, + "grad_norm": 0.5277524590492249, + "learning_rate": 0.00019386242059573638, + "loss": 1.1497, + "step": 2536 + }, + { + "epoch": 0.45174501424501423, + "grad_norm": 0.5536073446273804, + "learning_rate": 0.0001938575913679662, + "loss": 1.2213, + "step": 2537 + 
}, + { + "epoch": 0.4519230769230769, + "grad_norm": 0.5572254657745361, + "learning_rate": 0.00019385276030125143, + "loss": 1.0231, + "step": 2538 + }, + { + "epoch": 0.4521011396011396, + "grad_norm": 0.493847131729126, + "learning_rate": 0.00019384792739568686, + "loss": 0.9385, + "step": 2539 + }, + { + "epoch": 0.45227920227920226, + "grad_norm": 0.4641396403312683, + "learning_rate": 0.00019384309265136707, + "loss": 0.9332, + "step": 2540 + }, + { + "epoch": 0.45245726495726496, + "grad_norm": 0.5439442992210388, + "learning_rate": 0.00019383825606838681, + "loss": 1.317, + "step": 2541 + }, + { + "epoch": 0.45263532763532766, + "grad_norm": 0.7050970792770386, + "learning_rate": 0.00019383341764684086, + "loss": 0.9508, + "step": 2542 + }, + { + "epoch": 0.4528133903133903, + "grad_norm": 0.5013265013694763, + "learning_rate": 0.000193828577386824, + "loss": 1.2704, + "step": 2543 + }, + { + "epoch": 0.452991452991453, + "grad_norm": 0.47641924023628235, + "learning_rate": 0.0001938237352884311, + "loss": 1.0101, + "step": 2544 + }, + { + "epoch": 0.4531695156695157, + "grad_norm": 0.5223637819290161, + "learning_rate": 0.000193818891351757, + "loss": 1.0548, + "step": 2545 + }, + { + "epoch": 0.45334757834757833, + "grad_norm": 0.49065667390823364, + "learning_rate": 0.0001938140455768966, + "loss": 1.0927, + "step": 2546 + }, + { + "epoch": 0.453525641025641, + "grad_norm": 0.4808312654495239, + "learning_rate": 0.0001938091979639449, + "loss": 1.0599, + "step": 2547 + }, + { + "epoch": 0.4537037037037037, + "grad_norm": 0.5157489776611328, + "learning_rate": 0.0001938043485129968, + "loss": 1.2596, + "step": 2548 + }, + { + "epoch": 0.45388176638176636, + "grad_norm": 0.5983387231826782, + "learning_rate": 0.0001937994972241474, + "loss": 1.2276, + "step": 2549 + }, + { + "epoch": 0.45405982905982906, + "grad_norm": 0.49776506423950195, + "learning_rate": 0.00019379464409749163, + "loss": 1.3666, + "step": 2550 + }, + { + "epoch": 0.45423789173789175, + 
"grad_norm": 0.4693490266799927, + "learning_rate": 0.00019378978913312471, + "loss": 1.087, + "step": 2551 + }, + { + "epoch": 0.4544159544159544, + "grad_norm": 0.4754335880279541, + "learning_rate": 0.00019378493233114167, + "loss": 1.1282, + "step": 2552 + }, + { + "epoch": 0.4545940170940171, + "grad_norm": 0.5852862000465393, + "learning_rate": 0.00019378007369163776, + "loss": 1.1113, + "step": 2553 + }, + { + "epoch": 0.4547720797720798, + "grad_norm": 0.47442635893821716, + "learning_rate": 0.00019377521321470805, + "loss": 0.983, + "step": 2554 + }, + { + "epoch": 0.4549501424501424, + "grad_norm": 0.47432273626327515, + "learning_rate": 0.00019377035090044787, + "loss": 1.0169, + "step": 2555 + }, + { + "epoch": 0.4551282051282051, + "grad_norm": 0.4929196834564209, + "learning_rate": 0.00019376548674895246, + "loss": 1.0182, + "step": 2556 + }, + { + "epoch": 0.4553062678062678, + "grad_norm": 0.5433184504508972, + "learning_rate": 0.00019376062076031708, + "loss": 1.1339, + "step": 2557 + }, + { + "epoch": 0.45548433048433046, + "grad_norm": 0.47430408000946045, + "learning_rate": 0.00019375575293463715, + "loss": 1.1589, + "step": 2558 + }, + { + "epoch": 0.45566239316239315, + "grad_norm": 0.46641045808792114, + "learning_rate": 0.000193750883272008, + "loss": 1.029, + "step": 2559 + }, + { + "epoch": 0.45584045584045585, + "grad_norm": 0.44476228952407837, + "learning_rate": 0.00019374601177252502, + "loss": 0.8494, + "step": 2560 + }, + { + "epoch": 0.45601851851851855, + "grad_norm": 0.4886183440685272, + "learning_rate": 0.00019374113843628366, + "loss": 1.1374, + "step": 2561 + }, + { + "epoch": 0.4561965811965812, + "grad_norm": 0.4786703288555145, + "learning_rate": 0.00019373626326337946, + "loss": 1.2861, + "step": 2562 + }, + { + "epoch": 0.4563746438746439, + "grad_norm": 0.5752716660499573, + "learning_rate": 0.0001937313862539079, + "loss": 1.2365, + "step": 2563 + }, + { + "epoch": 0.4565527065527066, + "grad_norm": 0.519176185131073, + 
"learning_rate": 0.00019372650740796452, + "loss": 1.2264, + "step": 2564 + }, + { + "epoch": 0.4567307692307692, + "grad_norm": 0.5927292704582214, + "learning_rate": 0.00019372162672564493, + "loss": 0.8979, + "step": 2565 + }, + { + "epoch": 0.4569088319088319, + "grad_norm": 0.5467435121536255, + "learning_rate": 0.00019371674420704478, + "loss": 1.1016, + "step": 2566 + }, + { + "epoch": 0.4570868945868946, + "grad_norm": 0.49593284726142883, + "learning_rate": 0.00019371185985225968, + "loss": 0.982, + "step": 2567 + }, + { + "epoch": 0.45726495726495725, + "grad_norm": 0.5696587562561035, + "learning_rate": 0.00019370697366138538, + "loss": 0.979, + "step": 2568 + }, + { + "epoch": 0.45744301994301995, + "grad_norm": 0.4455752968788147, + "learning_rate": 0.00019370208563451757, + "loss": 0.8832, + "step": 2569 + }, + { + "epoch": 0.45762108262108264, + "grad_norm": 0.5072923302650452, + "learning_rate": 0.00019369719577175203, + "loss": 1.1046, + "step": 2570 + }, + { + "epoch": 0.4577991452991453, + "grad_norm": 0.45119982957839966, + "learning_rate": 0.0001936923040731846, + "loss": 1.0083, + "step": 2571 + }, + { + "epoch": 0.457977207977208, + "grad_norm": 0.5062251091003418, + "learning_rate": 0.00019368741053891108, + "loss": 1.2771, + "step": 2572 + }, + { + "epoch": 0.4581552706552707, + "grad_norm": 0.5511104464530945, + "learning_rate": 0.0001936825151690274, + "loss": 1.0039, + "step": 2573 + }, + { + "epoch": 0.4583333333333333, + "grad_norm": 0.4721006453037262, + "learning_rate": 0.0001936776179636294, + "loss": 1.3246, + "step": 2574 + }, + { + "epoch": 0.458511396011396, + "grad_norm": 0.5021488666534424, + "learning_rate": 0.0001936727189228131, + "loss": 1.1733, + "step": 2575 + }, + { + "epoch": 0.4586894586894587, + "grad_norm": 0.5755292177200317, + "learning_rate": 0.0001936678180466745, + "loss": 1.2241, + "step": 2576 + }, + { + "epoch": 0.45886752136752135, + "grad_norm": 0.4501610994338989, + "learning_rate": 
0.00019366291533530952, + "loss": 1.0503, + "step": 2577 + }, + { + "epoch": 0.45904558404558404, + "grad_norm": 0.4067458212375641, + "learning_rate": 0.00019365801078881432, + "loss": 0.8259, + "step": 2578 + }, + { + "epoch": 0.45922364672364674, + "grad_norm": 0.539730429649353, + "learning_rate": 0.0001936531044072849, + "loss": 1.1964, + "step": 2579 + }, + { + "epoch": 0.4594017094017094, + "grad_norm": 0.5624797344207764, + "learning_rate": 0.0001936481961908175, + "loss": 1.2059, + "step": 2580 + }, + { + "epoch": 0.4595797720797721, + "grad_norm": 0.43679240345954895, + "learning_rate": 0.00019364328613950824, + "loss": 1.1371, + "step": 2581 + }, + { + "epoch": 0.45975783475783477, + "grad_norm": 0.5214769244194031, + "learning_rate": 0.00019363837425345328, + "loss": 1.109, + "step": 2582 + }, + { + "epoch": 0.4599358974358974, + "grad_norm": 0.4522894024848938, + "learning_rate": 0.00019363346053274892, + "loss": 1.0532, + "step": 2583 + }, + { + "epoch": 0.4601139601139601, + "grad_norm": 0.44980281591415405, + "learning_rate": 0.0001936285449774914, + "loss": 0.9352, + "step": 2584 + }, + { + "epoch": 0.4602920227920228, + "grad_norm": 0.5697414875030518, + "learning_rate": 0.00019362362758777705, + "loss": 1.2171, + "step": 2585 + }, + { + "epoch": 0.46047008547008544, + "grad_norm": 0.4636315107345581, + "learning_rate": 0.00019361870836370217, + "loss": 1.0662, + "step": 2586 + }, + { + "epoch": 0.46064814814814814, + "grad_norm": 0.5144017338752747, + "learning_rate": 0.00019361378730536321, + "loss": 1.0681, + "step": 2587 + }, + { + "epoch": 0.46082621082621084, + "grad_norm": 0.5007636547088623, + "learning_rate": 0.00019360886441285654, + "loss": 1.2058, + "step": 2588 + }, + { + "epoch": 0.46100427350427353, + "grad_norm": 0.5024117231369019, + "learning_rate": 0.00019360393968627864, + "loss": 1.065, + "step": 2589 + }, + { + "epoch": 0.46118233618233617, + "grad_norm": 0.48105588555336, + "learning_rate": 0.00019359901312572596, + "loss": 
1.0887, + "step": 2590 + }, + { + "epoch": 0.46136039886039887, + "grad_norm": 0.5381982326507568, + "learning_rate": 0.00019359408473129506, + "loss": 1.2754, + "step": 2591 + }, + { + "epoch": 0.46153846153846156, + "grad_norm": 0.5051333904266357, + "learning_rate": 0.0001935891545030825, + "loss": 0.9334, + "step": 2592 + }, + { + "epoch": 0.4617165242165242, + "grad_norm": 0.43818601965904236, + "learning_rate": 0.0001935842224411849, + "loss": 1.0967, + "step": 2593 + }, + { + "epoch": 0.4618945868945869, + "grad_norm": 0.4727257490158081, + "learning_rate": 0.0001935792885456988, + "loss": 0.8136, + "step": 2594 + }, + { + "epoch": 0.4620726495726496, + "grad_norm": 0.5505291223526001, + "learning_rate": 0.00019357435281672098, + "loss": 1.3113, + "step": 2595 + }, + { + "epoch": 0.46225071225071224, + "grad_norm": 0.4705682396888733, + "learning_rate": 0.0001935694152543481, + "loss": 0.9863, + "step": 2596 + }, + { + "epoch": 0.46242877492877493, + "grad_norm": 0.49653419852256775, + "learning_rate": 0.0001935644758586769, + "loss": 1.035, + "step": 2597 + }, + { + "epoch": 0.46260683760683763, + "grad_norm": 0.4788367748260498, + "learning_rate": 0.00019355953462980415, + "loss": 1.1253, + "step": 2598 + }, + { + "epoch": 0.46278490028490027, + "grad_norm": 0.5295125842094421, + "learning_rate": 0.00019355459156782668, + "loss": 1.0853, + "step": 2599 + }, + { + "epoch": 0.46296296296296297, + "grad_norm": 0.4878056049346924, + "learning_rate": 0.00019354964667284133, + "loss": 1.1381, + "step": 2600 + }, + { + "epoch": 0.46314102564102566, + "grad_norm": 0.5442031025886536, + "learning_rate": 0.00019354469994494497, + "loss": 1.1349, + "step": 2601 + }, + { + "epoch": 0.4633190883190883, + "grad_norm": 0.4845225214958191, + "learning_rate": 0.00019353975138423457, + "loss": 1.0538, + "step": 2602 + }, + { + "epoch": 0.463497150997151, + "grad_norm": 0.4957871437072754, + "learning_rate": 0.00019353480099080703, + "loss": 1.2765, + "step": 2603 + }, + { + 
"epoch": 0.4636752136752137, + "grad_norm": 0.5414339303970337, + "learning_rate": 0.00019352984876475936, + "loss": 1.1015, + "step": 2604 + }, + { + "epoch": 0.46385327635327633, + "grad_norm": 0.5171043872833252, + "learning_rate": 0.0001935248947061886, + "loss": 0.9995, + "step": 2605 + }, + { + "epoch": 0.46403133903133903, + "grad_norm": 0.46040529012680054, + "learning_rate": 0.0001935199388151918, + "loss": 1.1126, + "step": 2606 + }, + { + "epoch": 0.4642094017094017, + "grad_norm": 0.5327033400535583, + "learning_rate": 0.00019351498109186613, + "loss": 1.1983, + "step": 2607 + }, + { + "epoch": 0.46438746438746437, + "grad_norm": 0.4451361298561096, + "learning_rate": 0.0001935100215363086, + "loss": 0.9689, + "step": 2608 + }, + { + "epoch": 0.46456552706552706, + "grad_norm": 0.5462809801101685, + "learning_rate": 0.00019350506014861646, + "loss": 1.036, + "step": 2609 + }, + { + "epoch": 0.46474358974358976, + "grad_norm": 0.4907000958919525, + "learning_rate": 0.00019350009692888694, + "loss": 1.0724, + "step": 2610 + }, + { + "epoch": 0.4649216524216524, + "grad_norm": 0.47523510456085205, + "learning_rate": 0.00019349513187721723, + "loss": 0.9214, + "step": 2611 + }, + { + "epoch": 0.4650997150997151, + "grad_norm": 0.539732813835144, + "learning_rate": 0.0001934901649937046, + "loss": 1.1166, + "step": 2612 + }, + { + "epoch": 0.4652777777777778, + "grad_norm": 0.4827860891819, + "learning_rate": 0.00019348519627844643, + "loss": 1.1613, + "step": 2613 + }, + { + "epoch": 0.46545584045584043, + "grad_norm": 0.5385223031044006, + "learning_rate": 0.00019348022573154, + "loss": 1.0105, + "step": 2614 + }, + { + "epoch": 0.4656339031339031, + "grad_norm": 0.4629383087158203, + "learning_rate": 0.0001934752533530828, + "loss": 1.0298, + "step": 2615 + }, + { + "epoch": 0.4658119658119658, + "grad_norm": 0.599371075630188, + "learning_rate": 0.00019347027914317212, + "loss": 1.3158, + "step": 2616 + }, + { + "epoch": 0.46599002849002846, + 
"grad_norm": 0.5954698324203491, + "learning_rate": 0.00019346530310190553, + "loss": 1.1882, + "step": 2617 + }, + { + "epoch": 0.46616809116809116, + "grad_norm": 0.49185171723365784, + "learning_rate": 0.00019346032522938046, + "loss": 1.0977, + "step": 2618 + }, + { + "epoch": 0.46634615384615385, + "grad_norm": 0.5145422220230103, + "learning_rate": 0.0001934553455256945, + "loss": 0.9948, + "step": 2619 + }, + { + "epoch": 0.46652421652421655, + "grad_norm": 0.6809412837028503, + "learning_rate": 0.00019345036399094517, + "loss": 1.5798, + "step": 2620 + }, + { + "epoch": 0.4667022792022792, + "grad_norm": 0.4606841206550598, + "learning_rate": 0.00019344538062523005, + "loss": 0.7357, + "step": 2621 + }, + { + "epoch": 0.4668803418803419, + "grad_norm": 0.49036628007888794, + "learning_rate": 0.00019344039542864685, + "loss": 1.1518, + "step": 2622 + }, + { + "epoch": 0.4670584045584046, + "grad_norm": 0.47904539108276367, + "learning_rate": 0.0001934354084012932, + "loss": 0.9929, + "step": 2623 + }, + { + "epoch": 0.4672364672364672, + "grad_norm": 0.5224666595458984, + "learning_rate": 0.0001934304195432668, + "loss": 1.2544, + "step": 2624 + }, + { + "epoch": 0.4674145299145299, + "grad_norm": 0.4902483820915222, + "learning_rate": 0.00019342542885466543, + "loss": 1.0301, + "step": 2625 + }, + { + "epoch": 0.4675925925925926, + "grad_norm": 0.46824702620506287, + "learning_rate": 0.00019342043633558683, + "loss": 0.9364, + "step": 2626 + }, + { + "epoch": 0.46777065527065526, + "grad_norm": 0.46272051334381104, + "learning_rate": 0.00019341544198612888, + "loss": 1.056, + "step": 2627 + }, + { + "epoch": 0.46794871794871795, + "grad_norm": 0.6216606497764587, + "learning_rate": 0.0001934104458063894, + "loss": 1.0825, + "step": 2628 + }, + { + "epoch": 0.46812678062678065, + "grad_norm": 0.5024014115333557, + "learning_rate": 0.00019340544779646623, + "loss": 1.1832, + "step": 2629 + }, + { + "epoch": 0.4683048433048433, + "grad_norm": 
0.5547130107879639, + "learning_rate": 0.00019340044795645737, + "loss": 1.1335, + "step": 2630 + }, + { + "epoch": 0.468482905982906, + "grad_norm": 0.5439161658287048, + "learning_rate": 0.0001933954462864608, + "loss": 1.0229, + "step": 2631 + }, + { + "epoch": 0.4686609686609687, + "grad_norm": 0.4782990515232086, + "learning_rate": 0.0001933904427865744, + "loss": 1.2318, + "step": 2632 + }, + { + "epoch": 0.4688390313390313, + "grad_norm": 0.5872140526771545, + "learning_rate": 0.00019338543745689633, + "loss": 1.0132, + "step": 2633 + }, + { + "epoch": 0.469017094017094, + "grad_norm": 0.44163307547569275, + "learning_rate": 0.00019338043029752458, + "loss": 1.0091, + "step": 2634 + }, + { + "epoch": 0.4691951566951567, + "grad_norm": 0.541081428527832, + "learning_rate": 0.0001933754213085573, + "loss": 1.2155, + "step": 2635 + }, + { + "epoch": 0.46937321937321935, + "grad_norm": 0.4761527478694916, + "learning_rate": 0.00019337041049009255, + "loss": 1.1138, + "step": 2636 + }, + { + "epoch": 0.46955128205128205, + "grad_norm": 0.46414369344711304, + "learning_rate": 0.0001933653978422286, + "loss": 0.9903, + "step": 2637 + }, + { + "epoch": 0.46972934472934474, + "grad_norm": 0.5337086915969849, + "learning_rate": 0.00019336038336506363, + "loss": 1.2873, + "step": 2638 + }, + { + "epoch": 0.4699074074074074, + "grad_norm": 0.5065379738807678, + "learning_rate": 0.00019335536705869592, + "loss": 1.1436, + "step": 2639 + }, + { + "epoch": 0.4700854700854701, + "grad_norm": 0.5539217591285706, + "learning_rate": 0.0001933503489232237, + "loss": 1.2881, + "step": 2640 + }, + { + "epoch": 0.4702635327635328, + "grad_norm": 0.48303213715553284, + "learning_rate": 0.0001933453289587453, + "loss": 1.0209, + "step": 2641 + }, + { + "epoch": 0.4704415954415954, + "grad_norm": 0.6986871957778931, + "learning_rate": 0.00019334030716535908, + "loss": 1.1979, + "step": 2642 + }, + { + "epoch": 0.4706196581196581, + "grad_norm": 0.46137234568595886, + "learning_rate": 
0.00019333528354316347, + "loss": 1.0682, + "step": 2643 + }, + { + "epoch": 0.4707977207977208, + "grad_norm": 0.4726654291152954, + "learning_rate": 0.00019333025809225684, + "loss": 1.1712, + "step": 2644 + }, + { + "epoch": 0.47097578347578345, + "grad_norm": 0.46188637614250183, + "learning_rate": 0.0001933252308127377, + "loss": 1.0183, + "step": 2645 + }, + { + "epoch": 0.47115384615384615, + "grad_norm": 0.5323259830474854, + "learning_rate": 0.0001933202017047045, + "loss": 0.935, + "step": 2646 + }, + { + "epoch": 0.47133190883190884, + "grad_norm": 0.5004189014434814, + "learning_rate": 0.00019331517076825582, + "loss": 1.1331, + "step": 2647 + }, + { + "epoch": 0.47150997150997154, + "grad_norm": 0.5443634986877441, + "learning_rate": 0.0001933101380034902, + "loss": 1.0514, + "step": 2648 + }, + { + "epoch": 0.4716880341880342, + "grad_norm": 0.504180371761322, + "learning_rate": 0.0001933051034105063, + "loss": 1.3099, + "step": 2649 + }, + { + "epoch": 0.4718660968660969, + "grad_norm": 0.5092344284057617, + "learning_rate": 0.0001933000669894027, + "loss": 1.0716, + "step": 2650 + }, + { + "epoch": 0.47204415954415957, + "grad_norm": 0.5236422419548035, + "learning_rate": 0.0001932950287402781, + "loss": 1.0981, + "step": 2651 + }, + { + "epoch": 0.4722222222222222, + "grad_norm": 0.6228063702583313, + "learning_rate": 0.0001932899886632312, + "loss": 1.3398, + "step": 2652 + }, + { + "epoch": 0.4724002849002849, + "grad_norm": 0.5112748146057129, + "learning_rate": 0.00019328494675836078, + "loss": 1.0151, + "step": 2653 + }, + { + "epoch": 0.4725783475783476, + "grad_norm": 0.5554201602935791, + "learning_rate": 0.00019327990302576563, + "loss": 1.404, + "step": 2654 + }, + { + "epoch": 0.47275641025641024, + "grad_norm": 0.5050725340843201, + "learning_rate": 0.0001932748574655445, + "loss": 0.951, + "step": 2655 + }, + { + "epoch": 0.47293447293447294, + "grad_norm": 0.5161749720573425, + "learning_rate": 0.00019326981007779636, + "loss": 
1.2425, + "step": 2656 + }, + { + "epoch": 0.47311253561253563, + "grad_norm": 0.4865442216396332, + "learning_rate": 0.00019326476086262002, + "loss": 1.1175, + "step": 2657 + }, + { + "epoch": 0.4732905982905983, + "grad_norm": 0.5276186466217041, + "learning_rate": 0.0001932597098201144, + "loss": 1.3687, + "step": 2658 + }, + { + "epoch": 0.47346866096866097, + "grad_norm": 0.509139358997345, + "learning_rate": 0.00019325465695037855, + "loss": 1.0546, + "step": 2659 + }, + { + "epoch": 0.47364672364672367, + "grad_norm": 0.49815434217453003, + "learning_rate": 0.00019324960225351138, + "loss": 1.0807, + "step": 2660 + }, + { + "epoch": 0.4738247863247863, + "grad_norm": 0.5059618353843689, + "learning_rate": 0.00019324454572961197, + "loss": 1.0827, + "step": 2661 + }, + { + "epoch": 0.474002849002849, + "grad_norm": 0.5698565244674683, + "learning_rate": 0.00019323948737877942, + "loss": 1.2019, + "step": 2662 + }, + { + "epoch": 0.4741809116809117, + "grad_norm": 0.49661511182785034, + "learning_rate": 0.00019323442720111276, + "loss": 1.1447, + "step": 2663 + }, + { + "epoch": 0.47435897435897434, + "grad_norm": 0.46442747116088867, + "learning_rate": 0.0001932293651967112, + "loss": 0.8796, + "step": 2664 + }, + { + "epoch": 0.47453703703703703, + "grad_norm": 0.48306044936180115, + "learning_rate": 0.00019322430136567388, + "loss": 1.1358, + "step": 2665 + }, + { + "epoch": 0.47471509971509973, + "grad_norm": 0.5677350759506226, + "learning_rate": 0.00019321923570810005, + "loss": 1.1026, + "step": 2666 + }, + { + "epoch": 0.47489316239316237, + "grad_norm": 0.3700144588947296, + "learning_rate": 0.0001932141682240889, + "loss": 0.7514, + "step": 2667 + }, + { + "epoch": 0.47507122507122507, + "grad_norm": 0.6003054976463318, + "learning_rate": 0.0001932090989137398, + "loss": 1.1591, + "step": 2668 + }, + { + "epoch": 0.47524928774928776, + "grad_norm": 0.520298421382904, + "learning_rate": 0.00019320402777715204, + "loss": 1.339, + "step": 2669 + }, + { 
+ "epoch": 0.4754273504273504, + "grad_norm": 0.46453598141670227, + "learning_rate": 0.00019319895481442493, + "loss": 0.9879, + "step": 2670 + }, + { + "epoch": 0.4756054131054131, + "grad_norm": 0.5247363448143005, + "learning_rate": 0.00019319388002565793, + "loss": 0.9862, + "step": 2671 + }, + { + "epoch": 0.4757834757834758, + "grad_norm": 0.5498613715171814, + "learning_rate": 0.00019318880341095046, + "loss": 1.2224, + "step": 2672 + }, + { + "epoch": 0.47596153846153844, + "grad_norm": 0.565838098526001, + "learning_rate": 0.00019318372497040192, + "loss": 1.0712, + "step": 2673 + }, + { + "epoch": 0.47613960113960113, + "grad_norm": 0.5797489881515503, + "learning_rate": 0.00019317864470411191, + "loss": 1.0176, + "step": 2674 + }, + { + "epoch": 0.47631766381766383, + "grad_norm": 0.5114326477050781, + "learning_rate": 0.0001931735626121799, + "loss": 1.1027, + "step": 2675 + }, + { + "epoch": 0.47649572649572647, + "grad_norm": 0.5396515727043152, + "learning_rate": 0.00019316847869470547, + "loss": 1.1782, + "step": 2676 + }, + { + "epoch": 0.47667378917378916, + "grad_norm": 0.4812076985836029, + "learning_rate": 0.00019316339295178824, + "loss": 1.1196, + "step": 2677 + }, + { + "epoch": 0.47685185185185186, + "grad_norm": 0.4875647723674774, + "learning_rate": 0.00019315830538352787, + "loss": 1.1407, + "step": 2678 + }, + { + "epoch": 0.47702991452991456, + "grad_norm": 0.5036377906799316, + "learning_rate": 0.00019315321599002404, + "loss": 0.9842, + "step": 2679 + }, + { + "epoch": 0.4772079772079772, + "grad_norm": 0.5054177641868591, + "learning_rate": 0.00019314812477137645, + "loss": 0.8196, + "step": 2680 + }, + { + "epoch": 0.4773860398860399, + "grad_norm": 0.5050665736198425, + "learning_rate": 0.00019314303172768483, + "loss": 0.8463, + "step": 2681 + }, + { + "epoch": 0.4775641025641026, + "grad_norm": 0.5179004669189453, + "learning_rate": 0.000193137936859049, + "loss": 1.2485, + "step": 2682 + }, + { + "epoch": 0.47774216524216523, 
+ "grad_norm": 0.44986143708229065, + "learning_rate": 0.00019313284016556876, + "loss": 0.9855, + "step": 2683 + }, + { + "epoch": 0.4779202279202279, + "grad_norm": 0.5594347715377808, + "learning_rate": 0.00019312774164734398, + "loss": 1.0987, + "step": 2684 + }, + { + "epoch": 0.4780982905982906, + "grad_norm": 0.4837244749069214, + "learning_rate": 0.0001931226413044746, + "loss": 1.1119, + "step": 2685 + }, + { + "epoch": 0.47827635327635326, + "grad_norm": 0.489145427942276, + "learning_rate": 0.0001931175391370605, + "loss": 1.1962, + "step": 2686 + }, + { + "epoch": 0.47845441595441596, + "grad_norm": 0.503568708896637, + "learning_rate": 0.00019311243514520164, + "loss": 0.9668, + "step": 2687 + }, + { + "epoch": 0.47863247863247865, + "grad_norm": 0.5401005744934082, + "learning_rate": 0.00019310732932899805, + "loss": 1.3072, + "step": 2688 + }, + { + "epoch": 0.4788105413105413, + "grad_norm": 0.526523768901825, + "learning_rate": 0.00019310222168854971, + "loss": 1.1387, + "step": 2689 + }, + { + "epoch": 0.478988603988604, + "grad_norm": 0.5223183631896973, + "learning_rate": 0.00019309711222395678, + "loss": 1.1391, + "step": 2690 + }, + { + "epoch": 0.4791666666666667, + "grad_norm": 0.5840879082679749, + "learning_rate": 0.00019309200093531933, + "loss": 1.1543, + "step": 2691 + }, + { + "epoch": 0.4793447293447293, + "grad_norm": 0.5173699259757996, + "learning_rate": 0.00019308688782273753, + "loss": 1.1889, + "step": 2692 + }, + { + "epoch": 0.479522792022792, + "grad_norm": 0.5417894124984741, + "learning_rate": 0.00019308177288631146, + "loss": 1.299, + "step": 2693 + }, + { + "epoch": 0.4797008547008547, + "grad_norm": 0.4890797734260559, + "learning_rate": 0.0001930766561261415, + "loss": 1.1516, + "step": 2694 + }, + { + "epoch": 0.47987891737891736, + "grad_norm": 0.5422119498252869, + "learning_rate": 0.00019307153754232772, + "loss": 1.0301, + "step": 2695 + }, + { + "epoch": 0.48005698005698005, + "grad_norm": 0.5838702917098999, + 
"learning_rate": 0.00019306641713497057, + "loss": 1.265, + "step": 2696 + }, + { + "epoch": 0.48023504273504275, + "grad_norm": 0.5020943284034729, + "learning_rate": 0.00019306129490417027, + "loss": 1.1119, + "step": 2697 + }, + { + "epoch": 0.4804131054131054, + "grad_norm": 0.412993460893631, + "learning_rate": 0.00019305617085002723, + "loss": 0.8083, + "step": 2698 + }, + { + "epoch": 0.4805911680911681, + "grad_norm": 0.6270101070404053, + "learning_rate": 0.00019305104497264184, + "loss": 1.3355, + "step": 2699 + }, + { + "epoch": 0.4807692307692308, + "grad_norm": 0.45256730914115906, + "learning_rate": 0.0001930459172721145, + "loss": 1.0368, + "step": 2700 + }, + { + "epoch": 0.4809472934472934, + "grad_norm": 0.5351749658584595, + "learning_rate": 0.0001930407877485457, + "loss": 1.135, + "step": 2701 + }, + { + "epoch": 0.4811253561253561, + "grad_norm": 0.49324163794517517, + "learning_rate": 0.00019303565640203593, + "loss": 0.9383, + "step": 2702 + }, + { + "epoch": 0.4813034188034188, + "grad_norm": 0.5434361100196838, + "learning_rate": 0.00019303052323268576, + "loss": 1.2605, + "step": 2703 + }, + { + "epoch": 0.48148148148148145, + "grad_norm": 0.5858064889907837, + "learning_rate": 0.00019302538824059572, + "loss": 1.0846, + "step": 2704 + }, + { + "epoch": 0.48165954415954415, + "grad_norm": 0.5753700733184814, + "learning_rate": 0.00019302025142586647, + "loss": 1.0371, + "step": 2705 + }, + { + "epoch": 0.48183760683760685, + "grad_norm": 0.43102699518203735, + "learning_rate": 0.00019301511278859858, + "loss": 0.9189, + "step": 2706 + }, + { + "epoch": 0.48201566951566954, + "grad_norm": 0.4731025993824005, + "learning_rate": 0.0001930099723288928, + "loss": 1.1291, + "step": 2707 + }, + { + "epoch": 0.4821937321937322, + "grad_norm": 0.5685615539550781, + "learning_rate": 0.00019300483004684987, + "loss": 1.1006, + "step": 2708 + }, + { + "epoch": 0.4823717948717949, + "grad_norm": 0.4368155896663666, + "learning_rate": 
0.00019299968594257044, + "loss": 0.9959, + "step": 2709 + }, + { + "epoch": 0.4825498575498576, + "grad_norm": 0.5594738125801086, + "learning_rate": 0.00019299454001615537, + "loss": 1.0826, + "step": 2710 + }, + { + "epoch": 0.4827279202279202, + "grad_norm": 0.48876598477363586, + "learning_rate": 0.00019298939226770548, + "loss": 1.1556, + "step": 2711 + }, + { + "epoch": 0.4829059829059829, + "grad_norm": 0.548039436340332, + "learning_rate": 0.00019298424269732157, + "loss": 1.158, + "step": 2712 + }, + { + "epoch": 0.4830840455840456, + "grad_norm": 0.4957645535469055, + "learning_rate": 0.00019297909130510464, + "loss": 0.9824, + "step": 2713 + }, + { + "epoch": 0.48326210826210825, + "grad_norm": 0.5197011232376099, + "learning_rate": 0.00019297393809115555, + "loss": 1.1074, + "step": 2714 + }, + { + "epoch": 0.48344017094017094, + "grad_norm": 0.5742064118385315, + "learning_rate": 0.00019296878305557526, + "loss": 1.0431, + "step": 2715 + }, + { + "epoch": 0.48361823361823364, + "grad_norm": 0.5698413252830505, + "learning_rate": 0.0001929636261984648, + "loss": 1.0713, + "step": 2716 + }, + { + "epoch": 0.4837962962962963, + "grad_norm": 0.48126333951950073, + "learning_rate": 0.0001929584675199252, + "loss": 0.9274, + "step": 2717 + }, + { + "epoch": 0.483974358974359, + "grad_norm": 0.49299830198287964, + "learning_rate": 0.00019295330702005754, + "loss": 0.9392, + "step": 2718 + }, + { + "epoch": 0.48415242165242167, + "grad_norm": 0.4780774414539337, + "learning_rate": 0.0001929481446989629, + "loss": 1.1459, + "step": 2719 + }, + { + "epoch": 0.4843304843304843, + "grad_norm": 0.5462654829025269, + "learning_rate": 0.00019294298055674248, + "loss": 1.0635, + "step": 2720 + }, + { + "epoch": 0.484508547008547, + "grad_norm": 0.5371061563491821, + "learning_rate": 0.00019293781459349743, + "loss": 1.3578, + "step": 2721 + }, + { + "epoch": 0.4846866096866097, + "grad_norm": 0.46308520436286926, + "learning_rate": 0.00019293264680932893, + "loss": 
0.9001, + "step": 2722 + }, + { + "epoch": 0.48486467236467234, + "grad_norm": 0.5149807929992676, + "learning_rate": 0.0001929274772043383, + "loss": 0.6908, + "step": 2723 + }, + { + "epoch": 0.48504273504273504, + "grad_norm": 0.5435031056404114, + "learning_rate": 0.00019292230577862678, + "loss": 1.2143, + "step": 2724 + }, + { + "epoch": 0.48522079772079774, + "grad_norm": 0.44217726588249207, + "learning_rate": 0.00019291713253229568, + "loss": 0.9303, + "step": 2725 + }, + { + "epoch": 0.4853988603988604, + "grad_norm": 0.6120226383209229, + "learning_rate": 0.00019291195746544643, + "loss": 1.3801, + "step": 2726 + }, + { + "epoch": 0.4855769230769231, + "grad_norm": 0.5014316439628601, + "learning_rate": 0.00019290678057818037, + "loss": 1.0631, + "step": 2727 + }, + { + "epoch": 0.48575498575498577, + "grad_norm": 0.5667829513549805, + "learning_rate": 0.00019290160187059895, + "loss": 1.3166, + "step": 2728 + }, + { + "epoch": 0.4859330484330484, + "grad_norm": 0.5011509656906128, + "learning_rate": 0.0001928964213428036, + "loss": 1.1887, + "step": 2729 + }, + { + "epoch": 0.4861111111111111, + "grad_norm": 0.48317405581474304, + "learning_rate": 0.00019289123899489586, + "loss": 1.1125, + "step": 2730 + }, + { + "epoch": 0.4862891737891738, + "grad_norm": 0.4669005870819092, + "learning_rate": 0.00019288605482697726, + "loss": 1.0091, + "step": 2731 + }, + { + "epoch": 0.48646723646723644, + "grad_norm": 0.4330739974975586, + "learning_rate": 0.00019288086883914937, + "loss": 0.9789, + "step": 2732 + }, + { + "epoch": 0.48664529914529914, + "grad_norm": 0.48482781648635864, + "learning_rate": 0.0001928756810315138, + "loss": 1.1922, + "step": 2733 + }, + { + "epoch": 0.48682336182336183, + "grad_norm": 0.5781838297843933, + "learning_rate": 0.0001928704914041722, + "loss": 1.1793, + "step": 2734 + }, + { + "epoch": 0.48700142450142453, + "grad_norm": 0.5955413579940796, + "learning_rate": 0.00019286529995722623, + "loss": 1.1001, + "step": 2735 + }, + 
{ + "epoch": 0.48717948717948717, + "grad_norm": 0.49204322695732117, + "learning_rate": 0.00019286010669077763, + "loss": 0.9219, + "step": 2736 + }, + { + "epoch": 0.48735754985754987, + "grad_norm": 0.5853500962257385, + "learning_rate": 0.00019285491160492813, + "loss": 1.1133, + "step": 2737 + }, + { + "epoch": 0.48753561253561256, + "grad_norm": 0.5555846095085144, + "learning_rate": 0.0001928497146997795, + "loss": 1.0915, + "step": 2738 + }, + { + "epoch": 0.4877136752136752, + "grad_norm": 0.5166759490966797, + "learning_rate": 0.00019284451597543364, + "loss": 0.9349, + "step": 2739 + }, + { + "epoch": 0.4878917378917379, + "grad_norm": 0.47816506028175354, + "learning_rate": 0.00019283931543199234, + "loss": 0.8978, + "step": 2740 + }, + { + "epoch": 0.4880698005698006, + "grad_norm": 0.5632442831993103, + "learning_rate": 0.0001928341130695575, + "loss": 1.0491, + "step": 2741 + }, + { + "epoch": 0.48824786324786323, + "grad_norm": 0.6532769799232483, + "learning_rate": 0.00019282890888823107, + "loss": 1.2779, + "step": 2742 + }, + { + "epoch": 0.48842592592592593, + "grad_norm": 0.5733640789985657, + "learning_rate": 0.000192823702888115, + "loss": 1.4127, + "step": 2743 + }, + { + "epoch": 0.4886039886039886, + "grad_norm": 0.5701746344566345, + "learning_rate": 0.00019281849506931132, + "loss": 1.138, + "step": 2744 + }, + { + "epoch": 0.48878205128205127, + "grad_norm": 0.5227449536323547, + "learning_rate": 0.000192813285431922, + "loss": 1.1831, + "step": 2745 + }, + { + "epoch": 0.48896011396011396, + "grad_norm": 0.48457080125808716, + "learning_rate": 0.00019280807397604915, + "loss": 1.2468, + "step": 2746 + }, + { + "epoch": 0.48913817663817666, + "grad_norm": 0.4596176743507385, + "learning_rate": 0.0001928028607017949, + "loss": 1.1098, + "step": 2747 + }, + { + "epoch": 0.4893162393162393, + "grad_norm": 0.5204966068267822, + "learning_rate": 0.00019279764560926142, + "loss": 1.1501, + "step": 2748 + }, + { + "epoch": 0.489494301994302, + 
"grad_norm": 0.5179490447044373, + "learning_rate": 0.0001927924286985508, + "loss": 1.2601, + "step": 2749 + }, + { + "epoch": 0.4896723646723647, + "grad_norm": 0.4563423693180084, + "learning_rate": 0.00019278720996976533, + "loss": 1.081, + "step": 2750 + }, + { + "epoch": 0.48985042735042733, + "grad_norm": 0.4906339943408966, + "learning_rate": 0.00019278198942300717, + "loss": 1.157, + "step": 2751 + }, + { + "epoch": 0.49002849002849, + "grad_norm": 0.42241403460502625, + "learning_rate": 0.00019277676705837873, + "loss": 1.0333, + "step": 2752 + }, + { + "epoch": 0.4902065527065527, + "grad_norm": 0.6310175657272339, + "learning_rate": 0.00019277154287598226, + "loss": 1.1225, + "step": 2753 + }, + { + "epoch": 0.49038461538461536, + "grad_norm": 0.5109034776687622, + "learning_rate": 0.0001927663168759201, + "loss": 1.1619, + "step": 2754 + }, + { + "epoch": 0.49056267806267806, + "grad_norm": 0.4809598922729492, + "learning_rate": 0.00019276108905829465, + "loss": 1.0423, + "step": 2755 + }, + { + "epoch": 0.49074074074074076, + "grad_norm": 0.557502806186676, + "learning_rate": 0.00019275585942320837, + "loss": 0.8783, + "step": 2756 + }, + { + "epoch": 0.4909188034188034, + "grad_norm": 0.5434393882751465, + "learning_rate": 0.0001927506279707637, + "loss": 1.1701, + "step": 2757 + }, + { + "epoch": 0.4910968660968661, + "grad_norm": 0.49278944730758667, + "learning_rate": 0.00019274539470106317, + "loss": 1.0447, + "step": 2758 + }, + { + "epoch": 0.4912749287749288, + "grad_norm": 0.5634264349937439, + "learning_rate": 0.00019274015961420927, + "loss": 1.0639, + "step": 2759 + }, + { + "epoch": 0.49145299145299143, + "grad_norm": 0.5632645487785339, + "learning_rate": 0.00019273492271030464, + "loss": 0.9223, + "step": 2760 + }, + { + "epoch": 0.4916310541310541, + "grad_norm": 0.5949172377586365, + "learning_rate": 0.00019272968398945177, + "loss": 0.894, + "step": 2761 + }, + { + "epoch": 0.4918091168091168, + "grad_norm": 0.5375374555587769, + 
"learning_rate": 0.00019272444345175342, + "loss": 1.0311, + "step": 2762 + }, + { + "epoch": 0.49198717948717946, + "grad_norm": 0.5211305022239685, + "learning_rate": 0.00019271920109731222, + "loss": 1.1531, + "step": 2763 + }, + { + "epoch": 0.49216524216524216, + "grad_norm": 0.44022253155708313, + "learning_rate": 0.00019271395692623084, + "loss": 0.9147, + "step": 2764 + }, + { + "epoch": 0.49234330484330485, + "grad_norm": 0.4682174623012543, + "learning_rate": 0.0001927087109386121, + "loss": 1.081, + "step": 2765 + }, + { + "epoch": 0.49252136752136755, + "grad_norm": 0.4971517324447632, + "learning_rate": 0.0001927034631345588, + "loss": 1.1017, + "step": 2766 + }, + { + "epoch": 0.4926994301994302, + "grad_norm": 0.5015294551849365, + "learning_rate": 0.00019269821351417364, + "loss": 1.1093, + "step": 2767 + }, + { + "epoch": 0.4928774928774929, + "grad_norm": 0.5512694716453552, + "learning_rate": 0.00019269296207755958, + "loss": 0.9657, + "step": 2768 + }, + { + "epoch": 0.4930555555555556, + "grad_norm": 0.4914868474006653, + "learning_rate": 0.00019268770882481948, + "loss": 1.0379, + "step": 2769 + }, + { + "epoch": 0.4932336182336182, + "grad_norm": 0.567337691783905, + "learning_rate": 0.00019268245375605626, + "loss": 1.004, + "step": 2770 + }, + { + "epoch": 0.4934116809116809, + "grad_norm": 0.518489420413971, + "learning_rate": 0.0001926771968713729, + "loss": 1.0734, + "step": 2771 + }, + { + "epoch": 0.4935897435897436, + "grad_norm": 0.567742109298706, + "learning_rate": 0.00019267193817087237, + "loss": 1.1276, + "step": 2772 + }, + { + "epoch": 0.49376780626780625, + "grad_norm": 0.5287964344024658, + "learning_rate": 0.00019266667765465773, + "loss": 1.1429, + "step": 2773 + }, + { + "epoch": 0.49394586894586895, + "grad_norm": 0.5302085876464844, + "learning_rate": 0.00019266141532283207, + "loss": 1.0934, + "step": 2774 + }, + { + "epoch": 0.49412393162393164, + "grad_norm": 0.5569987297058105, + "learning_rate": 
0.00019265615117549842, + "loss": 1.1453, + "step": 2775 + }, + { + "epoch": 0.4943019943019943, + "grad_norm": 0.519695520401001, + "learning_rate": 0.00019265088521275997, + "loss": 1.1255, + "step": 2776 + }, + { + "epoch": 0.494480056980057, + "grad_norm": 0.5073211193084717, + "learning_rate": 0.0001926456174347199, + "loss": 1.0609, + "step": 2777 + }, + { + "epoch": 0.4946581196581197, + "grad_norm": 0.45028239488601685, + "learning_rate": 0.00019264034784148142, + "loss": 0.9098, + "step": 2778 + }, + { + "epoch": 0.4948361823361823, + "grad_norm": 0.6641215682029724, + "learning_rate": 0.00019263507643314776, + "loss": 0.8903, + "step": 2779 + }, + { + "epoch": 0.495014245014245, + "grad_norm": 0.5281413793563843, + "learning_rate": 0.00019262980320982224, + "loss": 1.2906, + "step": 2780 + }, + { + "epoch": 0.4951923076923077, + "grad_norm": 0.6256437301635742, + "learning_rate": 0.0001926245281716081, + "loss": 1.4142, + "step": 2781 + }, + { + "epoch": 0.49537037037037035, + "grad_norm": 0.5422517657279968, + "learning_rate": 0.00019261925131860877, + "loss": 1.1606, + "step": 2782 + }, + { + "epoch": 0.49554843304843305, + "grad_norm": 0.46938949823379517, + "learning_rate": 0.0001926139726509276, + "loss": 1.0333, + "step": 2783 + }, + { + "epoch": 0.49572649572649574, + "grad_norm": 0.5799683928489685, + "learning_rate": 0.000192608692168668, + "loss": 1.0333, + "step": 2784 + }, + { + "epoch": 0.4959045584045584, + "grad_norm": 0.5231602787971497, + "learning_rate": 0.0001926034098719335, + "loss": 1.1847, + "step": 2785 + }, + { + "epoch": 0.4960826210826211, + "grad_norm": 0.477845698595047, + "learning_rate": 0.00019259812576082752, + "loss": 1.0746, + "step": 2786 + }, + { + "epoch": 0.4962606837606838, + "grad_norm": 0.5490350723266602, + "learning_rate": 0.00019259283983545365, + "loss": 1.2462, + "step": 2787 + }, + { + "epoch": 0.4964387464387464, + "grad_norm": 0.5788847208023071, + "learning_rate": 0.0001925875520959154, + "loss": 1.3485, 
+ "step": 2788 + }, + { + "epoch": 0.4966168091168091, + "grad_norm": 0.46184736490249634, + "learning_rate": 0.00019258226254231643, + "loss": 0.8673, + "step": 2789 + }, + { + "epoch": 0.4967948717948718, + "grad_norm": 0.4890633225440979, + "learning_rate": 0.0001925769711747603, + "loss": 0.9474, + "step": 2790 + }, + { + "epoch": 0.49697293447293445, + "grad_norm": 0.5719282627105713, + "learning_rate": 0.00019257167799335078, + "loss": 1.2532, + "step": 2791 + }, + { + "epoch": 0.49715099715099714, + "grad_norm": 0.5385584235191345, + "learning_rate": 0.0001925663829981915, + "loss": 1.1326, + "step": 2792 + }, + { + "epoch": 0.49732905982905984, + "grad_norm": 0.5339545011520386, + "learning_rate": 0.00019256108618938625, + "loss": 1.1362, + "step": 2793 + }, + { + "epoch": 0.49750712250712253, + "grad_norm": 0.5017803907394409, + "learning_rate": 0.00019255578756703878, + "loss": 1.0449, + "step": 2794 + }, + { + "epoch": 0.4976851851851852, + "grad_norm": 0.6004226803779602, + "learning_rate": 0.00019255048713125294, + "loss": 0.9346, + "step": 2795 + }, + { + "epoch": 0.49786324786324787, + "grad_norm": 0.44581490755081177, + "learning_rate": 0.00019254518488213255, + "loss": 1.038, + "step": 2796 + }, + { + "epoch": 0.49804131054131057, + "grad_norm": 0.5180951356887817, + "learning_rate": 0.00019253988081978151, + "loss": 1.0479, + "step": 2797 + }, + { + "epoch": 0.4982193732193732, + "grad_norm": 0.53944993019104, + "learning_rate": 0.00019253457494430376, + "loss": 1.2598, + "step": 2798 + }, + { + "epoch": 0.4983974358974359, + "grad_norm": 0.5633010268211365, + "learning_rate": 0.00019252926725580322, + "loss": 1.205, + "step": 2799 + }, + { + "epoch": 0.4985754985754986, + "grad_norm": 0.6653175950050354, + "learning_rate": 0.0001925239577543839, + "loss": 1.2383, + "step": 2800 + }, + { + "epoch": 0.49875356125356124, + "grad_norm": 0.5083333849906921, + "learning_rate": 0.00019251864644014984, + "loss": 1.0649, + "step": 2801 + }, + { + "epoch": 
0.49893162393162394, + "grad_norm": 0.4842020571231842, + "learning_rate": 0.00019251333331320506, + "loss": 1.1991, + "step": 2802 + }, + { + "epoch": 0.49910968660968663, + "grad_norm": 0.47987112402915955, + "learning_rate": 0.00019250801837365373, + "loss": 1.1686, + "step": 2803 + }, + { + "epoch": 0.49928774928774927, + "grad_norm": 0.5316333770751953, + "learning_rate": 0.00019250270162159992, + "loss": 1.1759, + "step": 2804 + }, + { + "epoch": 0.49946581196581197, + "grad_norm": 0.5015079379081726, + "learning_rate": 0.00019249738305714787, + "loss": 0.9424, + "step": 2805 + }, + { + "epoch": 0.49964387464387466, + "grad_norm": 0.6488274931907654, + "learning_rate": 0.00019249206268040172, + "loss": 1.066, + "step": 2806 + }, + { + "epoch": 0.4998219373219373, + "grad_norm": 0.40364864468574524, + "learning_rate": 0.00019248674049146574, + "loss": 0.6998, + "step": 2807 + }, + { + "epoch": 0.5, + "grad_norm": 0.5535672903060913, + "learning_rate": 0.00019248141649044423, + "loss": 1.2207, + "step": 2808 + }, + { + "epoch": 0.5, + "eval_loss": 1.1072274446487427, + "eval_runtime": 28.6913, + "eval_samples_per_second": 36.283, + "eval_steps_per_second": 18.159, + "step": 2808 + }, + { + "epoch": 0.5001780626780626, + "grad_norm": 0.4834389090538025, + "learning_rate": 0.00019247609067744143, + "loss": 1.1686, + "step": 2809 + }, + { + "epoch": 0.5003561253561254, + "grad_norm": 0.5007249712944031, + "learning_rate": 0.00019247076305256176, + "loss": 1.1343, + "step": 2810 + }, + { + "epoch": 0.500534188034188, + "grad_norm": 0.4773348271846771, + "learning_rate": 0.00019246543361590957, + "loss": 0.9324, + "step": 2811 + }, + { + "epoch": 0.5007122507122507, + "grad_norm": 0.47324609756469727, + "learning_rate": 0.0001924601023675893, + "loss": 1.0223, + "step": 2812 + }, + { + "epoch": 0.5008903133903134, + "grad_norm": 0.5583845973014832, + "learning_rate": 0.00019245476930770537, + "loss": 1.1328, + "step": 2813 + }, + { + "epoch": 0.5010683760683761, + 
"grad_norm": 0.4814579486846924, + "learning_rate": 0.00019244943443636232, + "loss": 1.0528, + "step": 2814 + }, + { + "epoch": 0.5012464387464387, + "grad_norm": 0.4996104836463928, + "learning_rate": 0.00019244409775366465, + "loss": 1.2482, + "step": 2815 + }, + { + "epoch": 0.5014245014245015, + "grad_norm": 0.47870904207229614, + "learning_rate": 0.0001924387592597169, + "loss": 0.9452, + "step": 2816 + }, + { + "epoch": 0.5016025641025641, + "grad_norm": 0.5617441534996033, + "learning_rate": 0.0001924334189546237, + "loss": 1.378, + "step": 2817 + }, + { + "epoch": 0.5017806267806267, + "grad_norm": 0.4872083365917206, + "learning_rate": 0.00019242807683848967, + "loss": 1.1571, + "step": 2818 + }, + { + "epoch": 0.5019586894586895, + "grad_norm": 0.5147804021835327, + "learning_rate": 0.00019242273291141947, + "loss": 1.1086, + "step": 2819 + }, + { + "epoch": 0.5021367521367521, + "grad_norm": 0.4698995351791382, + "learning_rate": 0.00019241738717351784, + "loss": 1.1579, + "step": 2820 + }, + { + "epoch": 0.5023148148148148, + "grad_norm": 0.5158926844596863, + "learning_rate": 0.00019241203962488946, + "loss": 1.2763, + "step": 2821 + }, + { + "epoch": 0.5024928774928775, + "grad_norm": 0.5218976736068726, + "learning_rate": 0.00019240669026563914, + "loss": 1.0633, + "step": 2822 + }, + { + "epoch": 0.5026709401709402, + "grad_norm": 0.5511452555656433, + "learning_rate": 0.0001924013390958717, + "loss": 0.9939, + "step": 2823 + }, + { + "epoch": 0.5028490028490028, + "grad_norm": 0.5227555632591248, + "learning_rate": 0.00019239598611569191, + "loss": 1.2478, + "step": 2824 + }, + { + "epoch": 0.5030270655270656, + "grad_norm": 0.5444719791412354, + "learning_rate": 0.00019239063132520475, + "loss": 1.1574, + "step": 2825 + }, + { + "epoch": 0.5032051282051282, + "grad_norm": 0.4752781093120575, + "learning_rate": 0.0001923852747245151, + "loss": 0.9034, + "step": 2826 + }, + { + "epoch": 0.5033831908831908, + "grad_norm": 0.5286496877670288, + 
"learning_rate": 0.00019237991631372792, + "loss": 1.1391, + "step": 2827 + }, + { + "epoch": 0.5035612535612536, + "grad_norm": 0.5009933710098267, + "learning_rate": 0.00019237455609294815, + "loss": 1.2178, + "step": 2828 + }, + { + "epoch": 0.5037393162393162, + "grad_norm": 0.5012276768684387, + "learning_rate": 0.00019236919406228085, + "loss": 0.9877, + "step": 2829 + }, + { + "epoch": 0.5039173789173789, + "grad_norm": 0.576508104801178, + "learning_rate": 0.00019236383022183106, + "loss": 1.1299, + "step": 2830 + }, + { + "epoch": 0.5040954415954416, + "grad_norm": 0.4716590642929077, + "learning_rate": 0.0001923584645717039, + "loss": 1.0451, + "step": 2831 + }, + { + "epoch": 0.5042735042735043, + "grad_norm": 0.5817418098449707, + "learning_rate": 0.00019235309711200448, + "loss": 1.0911, + "step": 2832 + }, + { + "epoch": 0.5044515669515669, + "grad_norm": 0.5695745944976807, + "learning_rate": 0.000192347727842838, + "loss": 1.0229, + "step": 2833 + }, + { + "epoch": 0.5046296296296297, + "grad_norm": 0.49127066135406494, + "learning_rate": 0.00019234235676430958, + "loss": 1.1377, + "step": 2834 + }, + { + "epoch": 0.5048076923076923, + "grad_norm": 0.5426172614097595, + "learning_rate": 0.00019233698387652453, + "loss": 1.2427, + "step": 2835 + }, + { + "epoch": 0.5049857549857549, + "grad_norm": 0.5342385172843933, + "learning_rate": 0.0001923316091795881, + "loss": 1.1427, + "step": 2836 + }, + { + "epoch": 0.5051638176638177, + "grad_norm": 0.5480486750602722, + "learning_rate": 0.00019232623267360558, + "loss": 1.0647, + "step": 2837 + }, + { + "epoch": 0.5053418803418803, + "grad_norm": 0.4584530293941498, + "learning_rate": 0.00019232085435868235, + "loss": 1.0461, + "step": 2838 + }, + { + "epoch": 0.5055199430199431, + "grad_norm": 0.5992119908332825, + "learning_rate": 0.00019231547423492371, + "loss": 1.1456, + "step": 2839 + }, + { + "epoch": 0.5056980056980057, + "grad_norm": 0.514018177986145, + "learning_rate": 0.00019231009230243515, 
+ "loss": 1.2559, + "step": 2840 + }, + { + "epoch": 0.5058760683760684, + "grad_norm": 0.5392283797264099, + "learning_rate": 0.0001923047085613221, + "loss": 1.044, + "step": 2841 + }, + { + "epoch": 0.5060541310541311, + "grad_norm": 0.4486566483974457, + "learning_rate": 0.00019229932301169, + "loss": 1.0679, + "step": 2842 + }, + { + "epoch": 0.5062321937321937, + "grad_norm": 0.4523460566997528, + "learning_rate": 0.00019229393565364442, + "loss": 1.1651, + "step": 2843 + }, + { + "epoch": 0.5064102564102564, + "grad_norm": 0.6032688021659851, + "learning_rate": 0.0001922885464872909, + "loss": 1.15, + "step": 2844 + }, + { + "epoch": 0.5065883190883191, + "grad_norm": 0.5883688926696777, + "learning_rate": 0.000192283155512735, + "loss": 1.2179, + "step": 2845 + }, + { + "epoch": 0.5067663817663818, + "grad_norm": 0.5534378886222839, + "learning_rate": 0.00019227776273008238, + "loss": 1.0387, + "step": 2846 + }, + { + "epoch": 0.5069444444444444, + "grad_norm": 0.5899033546447754, + "learning_rate": 0.00019227236813943872, + "loss": 1.0812, + "step": 2847 + }, + { + "epoch": 0.5071225071225072, + "grad_norm": 0.5718855261802673, + "learning_rate": 0.00019226697174090965, + "loss": 1.1375, + "step": 2848 + }, + { + "epoch": 0.5073005698005698, + "grad_norm": 0.5080967545509338, + "learning_rate": 0.00019226157353460094, + "loss": 1.1421, + "step": 2849 + }, + { + "epoch": 0.5074786324786325, + "grad_norm": 0.5253677368164062, + "learning_rate": 0.0001922561735206184, + "loss": 1.0166, + "step": 2850 + }, + { + "epoch": 0.5076566951566952, + "grad_norm": 0.47797444462776184, + "learning_rate": 0.00019225077169906772, + "loss": 1.0504, + "step": 2851 + }, + { + "epoch": 0.5078347578347578, + "grad_norm": 0.4911690652370453, + "learning_rate": 0.0001922453680700548, + "loss": 1.0629, + "step": 2852 + }, + { + "epoch": 0.5080128205128205, + "grad_norm": 0.49678200483322144, + "learning_rate": 0.00019223996263368557, + "loss": 1.1672, + "step": 2853 + }, + { + 
"epoch": 0.5081908831908832, + "grad_norm": 0.5451810359954834, + "learning_rate": 0.00019223455539006586, + "loss": 1.3031, + "step": 2854 + }, + { + "epoch": 0.5083689458689459, + "grad_norm": 0.5708984136581421, + "learning_rate": 0.00019222914633930166, + "loss": 1.0986, + "step": 2855 + }, + { + "epoch": 0.5085470085470085, + "grad_norm": 0.47232356667518616, + "learning_rate": 0.00019222373548149888, + "loss": 1.0449, + "step": 2856 + }, + { + "epoch": 0.5087250712250713, + "grad_norm": 0.6027610898017883, + "learning_rate": 0.0001922183228167636, + "loss": 0.862, + "step": 2857 + }, + { + "epoch": 0.5089031339031339, + "grad_norm": 0.5211802124977112, + "learning_rate": 0.00019221290834520188, + "loss": 1.1048, + "step": 2858 + }, + { + "epoch": 0.5090811965811965, + "grad_norm": 0.45101237297058105, + "learning_rate": 0.00019220749206691972, + "loss": 1.0046, + "step": 2859 + }, + { + "epoch": 0.5092592592592593, + "grad_norm": 0.5526158213615417, + "learning_rate": 0.00019220207398202335, + "loss": 1.2275, + "step": 2860 + }, + { + "epoch": 0.5094373219373219, + "grad_norm": 0.48322010040283203, + "learning_rate": 0.00019219665409061885, + "loss": 0.9974, + "step": 2861 + }, + { + "epoch": 0.5096153846153846, + "grad_norm": 0.4775219261646271, + "learning_rate": 0.00019219123239281244, + "loss": 1.1852, + "step": 2862 + }, + { + "epoch": 0.5097934472934473, + "grad_norm": 0.46184200048446655, + "learning_rate": 0.00019218580888871034, + "loss": 0.9393, + "step": 2863 + }, + { + "epoch": 0.50997150997151, + "grad_norm": 0.47495174407958984, + "learning_rate": 0.00019218038357841883, + "loss": 0.9631, + "step": 2864 + }, + { + "epoch": 0.5101495726495726, + "grad_norm": 0.48600029945373535, + "learning_rate": 0.00019217495646204418, + "loss": 1.0498, + "step": 2865 + }, + { + "epoch": 0.5103276353276354, + "grad_norm": 0.5801547169685364, + "learning_rate": 0.00019216952753969274, + "loss": 1.2181, + "step": 2866 + }, + { + "epoch": 0.510505698005698, + 
"grad_norm": 0.5082106590270996, + "learning_rate": 0.00019216409681147085, + "loss": 1.2009, + "step": 2867 + }, + { + "epoch": 0.5106837606837606, + "grad_norm": 0.4184330701828003, + "learning_rate": 0.00019215866427748493, + "loss": 0.8462, + "step": 2868 + }, + { + "epoch": 0.5108618233618234, + "grad_norm": 0.518099844455719, + "learning_rate": 0.00019215322993784147, + "loss": 1.2091, + "step": 2869 + }, + { + "epoch": 0.511039886039886, + "grad_norm": 0.569464921951294, + "learning_rate": 0.0001921477937926469, + "loss": 1.0264, + "step": 2870 + }, + { + "epoch": 0.5112179487179487, + "grad_norm": 0.526767909526825, + "learning_rate": 0.00019214235584200768, + "loss": 1.1192, + "step": 2871 + }, + { + "epoch": 0.5113960113960114, + "grad_norm": 0.6511057019233704, + "learning_rate": 0.00019213691608603047, + "loss": 1.3193, + "step": 2872 + }, + { + "epoch": 0.5115740740740741, + "grad_norm": 0.48536401987075806, + "learning_rate": 0.00019213147452482173, + "loss": 1.1671, + "step": 2873 + }, + { + "epoch": 0.5117521367521367, + "grad_norm": 0.7972469329833984, + "learning_rate": 0.00019212603115848818, + "loss": 1.1393, + "step": 2874 + }, + { + "epoch": 0.5119301994301995, + "grad_norm": 0.5543264746665955, + "learning_rate": 0.00019212058598713642, + "loss": 1.1436, + "step": 2875 + }, + { + "epoch": 0.5121082621082621, + "grad_norm": 0.49688720703125, + "learning_rate": 0.0001921151390108731, + "loss": 1.0897, + "step": 2876 + }, + { + "epoch": 0.5122863247863247, + "grad_norm": 0.4928736090660095, + "learning_rate": 0.000192109690229805, + "loss": 1.2426, + "step": 2877 + }, + { + "epoch": 0.5124643874643875, + "grad_norm": 0.4917896091938019, + "learning_rate": 0.0001921042396440389, + "loss": 1.0047, + "step": 2878 + }, + { + "epoch": 0.5126424501424501, + "grad_norm": 0.5485204458236694, + "learning_rate": 0.00019209878725368152, + "loss": 1.2615, + "step": 2879 + }, + { + "epoch": 0.5128205128205128, + "grad_norm": 0.5229470133781433, + 
"learning_rate": 0.0001920933330588397, + "loss": 1.3249, + "step": 2880 + }, + { + "epoch": 0.5129985754985755, + "grad_norm": 0.4783077538013458, + "learning_rate": 0.00019208787705962037, + "loss": 1.2004, + "step": 2881 + }, + { + "epoch": 0.5131766381766382, + "grad_norm": 0.5106910467147827, + "learning_rate": 0.00019208241925613035, + "loss": 1.1745, + "step": 2882 + }, + { + "epoch": 0.5133547008547008, + "grad_norm": 0.5308730006217957, + "learning_rate": 0.00019207695964847666, + "loss": 0.9706, + "step": 2883 + }, + { + "epoch": 0.5135327635327636, + "grad_norm": 0.5489775538444519, + "learning_rate": 0.00019207149823676617, + "loss": 1.0073, + "step": 2884 + }, + { + "epoch": 0.5137108262108262, + "grad_norm": 0.4992835521697998, + "learning_rate": 0.00019206603502110596, + "loss": 1.1053, + "step": 2885 + }, + { + "epoch": 0.5138888888888888, + "grad_norm": 0.5304922461509705, + "learning_rate": 0.00019206057000160302, + "loss": 1.0565, + "step": 2886 + }, + { + "epoch": 0.5140669515669516, + "grad_norm": 0.46411609649658203, + "learning_rate": 0.00019205510317836448, + "loss": 0.9202, + "step": 2887 + }, + { + "epoch": 0.5142450142450142, + "grad_norm": 0.5236835479736328, + "learning_rate": 0.0001920496345514974, + "loss": 0.9075, + "step": 2888 + }, + { + "epoch": 0.5144230769230769, + "grad_norm": 0.4416964054107666, + "learning_rate": 0.00019204416412110895, + "loss": 0.9225, + "step": 2889 + }, + { + "epoch": 0.5146011396011396, + "grad_norm": 0.5470940470695496, + "learning_rate": 0.00019203869188730633, + "loss": 1.2195, + "step": 2890 + }, + { + "epoch": 0.5147792022792023, + "grad_norm": 0.5380414128303528, + "learning_rate": 0.0001920332178501967, + "loss": 1.0731, + "step": 2891 + }, + { + "epoch": 0.5149572649572649, + "grad_norm": 0.4405716359615326, + "learning_rate": 0.00019202774200988737, + "loss": 0.8739, + "step": 2892 + }, + { + "epoch": 0.5151353276353277, + "grad_norm": 0.5222984552383423, + "learning_rate": 
0.0001920222643664856, + "loss": 1.1806, + "step": 2893 + }, + { + "epoch": 0.5153133903133903, + "grad_norm": 0.48545539379119873, + "learning_rate": 0.0001920167849200987, + "loss": 0.9939, + "step": 2894 + }, + { + "epoch": 0.5154914529914529, + "grad_norm": 0.45078009366989136, + "learning_rate": 0.0001920113036708341, + "loss": 1.0085, + "step": 2895 + }, + { + "epoch": 0.5156695156695157, + "grad_norm": 0.5029830932617188, + "learning_rate": 0.00019200582061879913, + "loss": 1.1095, + "step": 2896 + }, + { + "epoch": 0.5158475783475783, + "grad_norm": 0.5316143035888672, + "learning_rate": 0.00019200033576410118, + "loss": 0.9883, + "step": 2897 + }, + { + "epoch": 0.5160256410256411, + "grad_norm": 0.5282100439071655, + "learning_rate": 0.0001919948491068478, + "loss": 1.1441, + "step": 2898 + }, + { + "epoch": 0.5162037037037037, + "grad_norm": 0.5145367980003357, + "learning_rate": 0.00019198936064714647, + "loss": 1.1999, + "step": 2899 + }, + { + "epoch": 0.5163817663817664, + "grad_norm": 0.5385651588439941, + "learning_rate": 0.00019198387038510468, + "loss": 1.1831, + "step": 2900 + }, + { + "epoch": 0.5165598290598291, + "grad_norm": 0.4971916377544403, + "learning_rate": 0.00019197837832083002, + "loss": 1.2518, + "step": 2901 + }, + { + "epoch": 0.5167378917378918, + "grad_norm": 0.5253807306289673, + "learning_rate": 0.00019197288445443016, + "loss": 1.0788, + "step": 2902 + }, + { + "epoch": 0.5169159544159544, + "grad_norm": 0.49724945425987244, + "learning_rate": 0.00019196738878601263, + "loss": 1.0985, + "step": 2903 + }, + { + "epoch": 0.5170940170940171, + "grad_norm": 0.5327325463294983, + "learning_rate": 0.0001919618913156852, + "loss": 1.2862, + "step": 2904 + }, + { + "epoch": 0.5172720797720798, + "grad_norm": 0.639999270439148, + "learning_rate": 0.00019195639204355554, + "loss": 1.2052, + "step": 2905 + }, + { + "epoch": 0.5174501424501424, + "grad_norm": 0.4630785584449768, + "learning_rate": 0.0001919508909697314, + "loss": 
1.1157, + "step": 2906 + }, + { + "epoch": 0.5176282051282052, + "grad_norm": 0.513949990272522, + "learning_rate": 0.00019194538809432055, + "loss": 1.0047, + "step": 2907 + }, + { + "epoch": 0.5178062678062678, + "grad_norm": 0.488034725189209, + "learning_rate": 0.0001919398834174308, + "loss": 0.9008, + "step": 2908 + }, + { + "epoch": 0.5179843304843305, + "grad_norm": 0.4892788529396057, + "learning_rate": 0.00019193437693917006, + "loss": 1.1024, + "step": 2909 + }, + { + "epoch": 0.5181623931623932, + "grad_norm": 0.5503842830657959, + "learning_rate": 0.00019192886865964618, + "loss": 1.2283, + "step": 2910 + }, + { + "epoch": 0.5183404558404558, + "grad_norm": 0.48885393142700195, + "learning_rate": 0.00019192335857896707, + "loss": 0.9522, + "step": 2911 + }, + { + "epoch": 0.5185185185185185, + "grad_norm": 0.5479527115821838, + "learning_rate": 0.00019191784669724072, + "loss": 1.1616, + "step": 2912 + }, + { + "epoch": 0.5186965811965812, + "grad_norm": 0.42701148986816406, + "learning_rate": 0.00019191233301457506, + "loss": 0.8434, + "step": 2913 + }, + { + "epoch": 0.5188746438746439, + "grad_norm": 0.4273422658443451, + "learning_rate": 0.00019190681753107822, + "loss": 0.8316, + "step": 2914 + }, + { + "epoch": 0.5190527065527065, + "grad_norm": 0.5047736763954163, + "learning_rate": 0.00019190130024685818, + "loss": 1.171, + "step": 2915 + }, + { + "epoch": 0.5192307692307693, + "grad_norm": 0.5221177935600281, + "learning_rate": 0.00019189578116202307, + "loss": 1.0256, + "step": 2916 + }, + { + "epoch": 0.5194088319088319, + "grad_norm": 0.4782322943210602, + "learning_rate": 0.00019189026027668105, + "loss": 0.8598, + "step": 2917 + }, + { + "epoch": 0.5195868945868946, + "grad_norm": 0.5627185702323914, + "learning_rate": 0.00019188473759094022, + "loss": 1.1825, + "step": 2918 + }, + { + "epoch": 0.5197649572649573, + "grad_norm": 0.5036423206329346, + "learning_rate": 0.00019187921310490888, + "loss": 1.0881, + "step": 2919 + }, + { + 
"epoch": 0.51994301994302, + "grad_norm": 0.4271143972873688, + "learning_rate": 0.0001918736868186952, + "loss": 0.9265, + "step": 2920 + }, + { + "epoch": 0.5201210826210826, + "grad_norm": 0.5427432656288147, + "learning_rate": 0.00019186815873240747, + "loss": 1.196, + "step": 2921 + }, + { + "epoch": 0.5202991452991453, + "grad_norm": 0.5494198203086853, + "learning_rate": 0.00019186262884615402, + "loss": 1.1207, + "step": 2922 + }, + { + "epoch": 0.520477207977208, + "grad_norm": 0.5305119752883911, + "learning_rate": 0.0001918570971600432, + "loss": 1.0393, + "step": 2923 + }, + { + "epoch": 0.5206552706552706, + "grad_norm": 0.46713170409202576, + "learning_rate": 0.00019185156367418333, + "loss": 0.9583, + "step": 2924 + }, + { + "epoch": 0.5208333333333334, + "grad_norm": 0.597776472568512, + "learning_rate": 0.00019184602838868292, + "loss": 1.2978, + "step": 2925 + }, + { + "epoch": 0.521011396011396, + "grad_norm": 0.520976722240448, + "learning_rate": 0.00019184049130365036, + "loss": 1.0515, + "step": 2926 + }, + { + "epoch": 0.5211894586894587, + "grad_norm": 0.5266290307044983, + "learning_rate": 0.00019183495241919415, + "loss": 1.0437, + "step": 2927 + }, + { + "epoch": 0.5213675213675214, + "grad_norm": 0.50911545753479, + "learning_rate": 0.00019182941173542285, + "loss": 0.9977, + "step": 2928 + }, + { + "epoch": 0.521545584045584, + "grad_norm": 0.4924670457839966, + "learning_rate": 0.00019182386925244496, + "loss": 0.9309, + "step": 2929 + }, + { + "epoch": 0.5217236467236467, + "grad_norm": 0.4979301393032074, + "learning_rate": 0.00019181832497036912, + "loss": 0.87, + "step": 2930 + }, + { + "epoch": 0.5219017094017094, + "grad_norm": 0.6307916045188904, + "learning_rate": 0.0001918127788893039, + "loss": 1.2159, + "step": 2931 + }, + { + "epoch": 0.5220797720797721, + "grad_norm": 0.4915660619735718, + "learning_rate": 0.00019180723100935802, + "loss": 1.0828, + "step": 2932 + }, + { + "epoch": 0.5222578347578347, + "grad_norm": 
0.4312742352485657, + "learning_rate": 0.00019180168133064017, + "loss": 1.0496, + "step": 2933 + }, + { + "epoch": 0.5224358974358975, + "grad_norm": 0.6006124019622803, + "learning_rate": 0.00019179612985325908, + "loss": 1.0751, + "step": 2934 + }, + { + "epoch": 0.5226139601139601, + "grad_norm": 0.5332220196723938, + "learning_rate": 0.0001917905765773235, + "loss": 1.2601, + "step": 2935 + }, + { + "epoch": 0.5227920227920227, + "grad_norm": 0.4877954423427582, + "learning_rate": 0.00019178502150294223, + "loss": 1.2279, + "step": 2936 + }, + { + "epoch": 0.5229700854700855, + "grad_norm": 0.5975968837738037, + "learning_rate": 0.00019177946463022418, + "loss": 1.3371, + "step": 2937 + }, + { + "epoch": 0.5231481481481481, + "grad_norm": 0.5363923907279968, + "learning_rate": 0.00019177390595927815, + "loss": 1.0705, + "step": 2938 + }, + { + "epoch": 0.5233262108262108, + "grad_norm": 0.4314909875392914, + "learning_rate": 0.0001917683454902131, + "loss": 0.9172, + "step": 2939 + }, + { + "epoch": 0.5235042735042735, + "grad_norm": 0.46187883615493774, + "learning_rate": 0.0001917627832231379, + "loss": 1.1201, + "step": 2940 + }, + { + "epoch": 0.5236823361823362, + "grad_norm": 0.4648260772228241, + "learning_rate": 0.00019175721915816162, + "loss": 1.1307, + "step": 2941 + }, + { + "epoch": 0.5238603988603988, + "grad_norm": 0.4427165687084198, + "learning_rate": 0.00019175165329539325, + "loss": 0.9459, + "step": 2942 + }, + { + "epoch": 0.5240384615384616, + "grad_norm": 0.4645056128501892, + "learning_rate": 0.0001917460856349418, + "loss": 0.9176, + "step": 2943 + }, + { + "epoch": 0.5242165242165242, + "grad_norm": 0.4939568042755127, + "learning_rate": 0.0001917405161769164, + "loss": 1.1056, + "step": 2944 + }, + { + "epoch": 0.5243945868945868, + "grad_norm": 0.6057310104370117, + "learning_rate": 0.00019173494492142617, + "loss": 1.2714, + "step": 2945 + }, + { + "epoch": 0.5245726495726496, + "grad_norm": 0.5038546323776245, + "learning_rate": 
0.00019172937186858025, + "loss": 0.911, + "step": 2946 + }, + { + "epoch": 0.5247507122507122, + "grad_norm": 0.5521321296691895, + "learning_rate": 0.00019172379701848784, + "loss": 1.0781, + "step": 2947 + }, + { + "epoch": 0.5249287749287749, + "grad_norm": 0.516979455947876, + "learning_rate": 0.00019171822037125817, + "loss": 1.1051, + "step": 2948 + }, + { + "epoch": 0.5251068376068376, + "grad_norm": 0.5443150997161865, + "learning_rate": 0.0001917126419270005, + "loss": 1.0802, + "step": 2949 + }, + { + "epoch": 0.5252849002849003, + "grad_norm": 0.5373311042785645, + "learning_rate": 0.00019170706168582412, + "loss": 0.9313, + "step": 2950 + }, + { + "epoch": 0.5254629629629629, + "grad_norm": 0.7511917948722839, + "learning_rate": 0.0001917014796478384, + "loss": 1.1958, + "step": 2951 + }, + { + "epoch": 0.5256410256410257, + "grad_norm": 0.49893468618392944, + "learning_rate": 0.00019169589581315263, + "loss": 0.9387, + "step": 2952 + }, + { + "epoch": 0.5258190883190883, + "grad_norm": 0.48010289669036865, + "learning_rate": 0.00019169031018187628, + "loss": 1.2459, + "step": 2953 + }, + { + "epoch": 0.5259971509971509, + "grad_norm": 0.48768678307533264, + "learning_rate": 0.0001916847227541188, + "loss": 1.0127, + "step": 2954 + }, + { + "epoch": 0.5261752136752137, + "grad_norm": 0.5973068475723267, + "learning_rate": 0.00019167913352998963, + "loss": 1.1685, + "step": 2955 + }, + { + "epoch": 0.5263532763532763, + "grad_norm": 0.5567806959152222, + "learning_rate": 0.00019167354250959826, + "loss": 1.142, + "step": 2956 + }, + { + "epoch": 0.5265313390313391, + "grad_norm": 0.47819700837135315, + "learning_rate": 0.00019166794969305428, + "loss": 0.712, + "step": 2957 + }, + { + "epoch": 0.5267094017094017, + "grad_norm": 0.5191744565963745, + "learning_rate": 0.00019166235508046725, + "loss": 1.2208, + "step": 2958 + }, + { + "epoch": 0.5268874643874644, + "grad_norm": 0.4987856149673462, + "learning_rate": 0.00019165675867194675, + "loss": 
1.0466, + "step": 2959 + }, + { + "epoch": 0.5270655270655271, + "grad_norm": 0.5017665028572083, + "learning_rate": 0.0001916511604676025, + "loss": 1.1236, + "step": 2960 + }, + { + "epoch": 0.5272435897435898, + "grad_norm": 0.5115348696708679, + "learning_rate": 0.00019164556046754415, + "loss": 1.1497, + "step": 2961 + }, + { + "epoch": 0.5274216524216524, + "grad_norm": 0.4934345781803131, + "learning_rate": 0.0001916399586718814, + "loss": 1.0183, + "step": 2962 + }, + { + "epoch": 0.5275997150997151, + "grad_norm": 0.5033719539642334, + "learning_rate": 0.00019163435508072404, + "loss": 1.0256, + "step": 2963 + }, + { + "epoch": 0.5277777777777778, + "grad_norm": 0.5325372219085693, + "learning_rate": 0.00019162874969418184, + "loss": 1.1384, + "step": 2964 + }, + { + "epoch": 0.5279558404558404, + "grad_norm": 0.4901772141456604, + "learning_rate": 0.00019162314251236465, + "loss": 1.0831, + "step": 2965 + }, + { + "epoch": 0.5281339031339032, + "grad_norm": 0.4743805229663849, + "learning_rate": 0.0001916175335353823, + "loss": 1.1894, + "step": 2966 + }, + { + "epoch": 0.5283119658119658, + "grad_norm": 0.5439450740814209, + "learning_rate": 0.00019161192276334466, + "loss": 1.2066, + "step": 2967 + }, + { + "epoch": 0.5284900284900285, + "grad_norm": 0.5123090744018555, + "learning_rate": 0.00019160631019636174, + "loss": 1.1829, + "step": 2968 + }, + { + "epoch": 0.5286680911680912, + "grad_norm": 0.5995343923568726, + "learning_rate": 0.00019160069583454346, + "loss": 1.4872, + "step": 2969 + }, + { + "epoch": 0.5288461538461539, + "grad_norm": 0.4596657156944275, + "learning_rate": 0.00019159507967799985, + "loss": 0.8948, + "step": 2970 + }, + { + "epoch": 0.5290242165242165, + "grad_norm": 0.5533682107925415, + "learning_rate": 0.0001915894617268409, + "loss": 1.1779, + "step": 2971 + }, + { + "epoch": 0.5292022792022792, + "grad_norm": 0.3860718309879303, + "learning_rate": 0.00019158384198117673, + "loss": 0.6424, + "step": 2972 + }, + { + 
"epoch": 0.5293803418803419, + "grad_norm": 0.47424063086509705, + "learning_rate": 0.0001915782204411174, + "loss": 1.1592, + "step": 2973 + }, + { + "epoch": 0.5295584045584045, + "grad_norm": 0.5050228834152222, + "learning_rate": 0.00019157259710677309, + "loss": 1.1971, + "step": 2974 + }, + { + "epoch": 0.5297364672364673, + "grad_norm": 0.6080113649368286, + "learning_rate": 0.00019156697197825396, + "loss": 1.1511, + "step": 2975 + }, + { + "epoch": 0.5299145299145299, + "grad_norm": 0.4805932641029358, + "learning_rate": 0.00019156134505567024, + "loss": 1.1033, + "step": 2976 + }, + { + "epoch": 0.5300925925925926, + "grad_norm": 0.4835345447063446, + "learning_rate": 0.00019155571633913215, + "loss": 1.1832, + "step": 2977 + }, + { + "epoch": 0.5302706552706553, + "grad_norm": 0.5183725953102112, + "learning_rate": 0.00019155008582875, + "loss": 0.9221, + "step": 2978 + }, + { + "epoch": 0.530448717948718, + "grad_norm": 0.48015761375427246, + "learning_rate": 0.00019154445352463412, + "loss": 1.045, + "step": 2979 + }, + { + "epoch": 0.5306267806267806, + "grad_norm": 0.4670043885707855, + "learning_rate": 0.0001915388194268948, + "loss": 0.9025, + "step": 2980 + }, + { + "epoch": 0.5308048433048433, + "grad_norm": 0.5048824548721313, + "learning_rate": 0.0001915331835356425, + "loss": 1.0681, + "step": 2981 + }, + { + "epoch": 0.530982905982906, + "grad_norm": 0.4785633981227875, + "learning_rate": 0.00019152754585098758, + "loss": 1.0097, + "step": 2982 + }, + { + "epoch": 0.5311609686609686, + "grad_norm": 0.4829573333263397, + "learning_rate": 0.00019152190637304056, + "loss": 1.0856, + "step": 2983 + }, + { + "epoch": 0.5313390313390314, + "grad_norm": 0.5425563454627991, + "learning_rate": 0.00019151626510191189, + "loss": 1.2313, + "step": 2984 + }, + { + "epoch": 0.531517094017094, + "grad_norm": 0.5532251596450806, + "learning_rate": 0.0001915106220377121, + "loss": 1.0328, + "step": 2985 + }, + { + "epoch": 0.5316951566951567, + "grad_norm": 
0.47016972303390503, + "learning_rate": 0.0001915049771805518, + "loss": 1.2003, + "step": 2986 + }, + { + "epoch": 0.5318732193732194, + "grad_norm": 0.5241743326187134, + "learning_rate": 0.00019149933053054153, + "loss": 1.046, + "step": 2987 + }, + { + "epoch": 0.532051282051282, + "grad_norm": 0.5043526887893677, + "learning_rate": 0.00019149368208779197, + "loss": 1.0022, + "step": 2988 + }, + { + "epoch": 0.5322293447293447, + "grad_norm": 0.5563312768936157, + "learning_rate": 0.00019148803185241374, + "loss": 1.1017, + "step": 2989 + }, + { + "epoch": 0.5324074074074074, + "grad_norm": 0.5414231419563293, + "learning_rate": 0.00019148237982451763, + "loss": 0.9649, + "step": 2990 + }, + { + "epoch": 0.5325854700854701, + "grad_norm": 0.5452231764793396, + "learning_rate": 0.0001914767260042143, + "loss": 1.2281, + "step": 2991 + }, + { + "epoch": 0.5327635327635327, + "grad_norm": 0.5500698685646057, + "learning_rate": 0.00019147107039161454, + "loss": 1.2865, + "step": 2992 + }, + { + "epoch": 0.5329415954415955, + "grad_norm": 0.49747416377067566, + "learning_rate": 0.00019146541298682918, + "loss": 1.1296, + "step": 2993 + }, + { + "epoch": 0.5331196581196581, + "grad_norm": 0.5684167742729187, + "learning_rate": 0.00019145975378996903, + "loss": 1.0685, + "step": 2994 + }, + { + "epoch": 0.5332977207977208, + "grad_norm": 0.5411235690116882, + "learning_rate": 0.00019145409280114502, + "loss": 1.1372, + "step": 2995 + }, + { + "epoch": 0.5334757834757835, + "grad_norm": 0.5006675720214844, + "learning_rate": 0.00019144843002046806, + "loss": 1.0688, + "step": 2996 + }, + { + "epoch": 0.5336538461538461, + "grad_norm": 0.4591315686702728, + "learning_rate": 0.00019144276544804908, + "loss": 1.1071, + "step": 2997 + }, + { + "epoch": 0.5338319088319088, + "grad_norm": 0.5615306496620178, + "learning_rate": 0.000191437099083999, + "loss": 1.1033, + "step": 2998 + }, + { + "epoch": 0.5340099715099715, + "grad_norm": 0.4986817240715027, + "learning_rate": 
0.00019143143092842897, + "loss": 1.176, + "step": 2999 + }, + { + "epoch": 0.5341880341880342, + "grad_norm": 0.5017120242118835, + "learning_rate": 0.00019142576098144995, + "loss": 1.0174, + "step": 3000 + }, + { + "epoch": 0.5343660968660968, + "grad_norm": 0.508298397064209, + "learning_rate": 0.0001914200892431731, + "loss": 1.164, + "step": 3001 + }, + { + "epoch": 0.5345441595441596, + "grad_norm": 0.48068809509277344, + "learning_rate": 0.0001914144157137095, + "loss": 0.7959, + "step": 3002 + }, + { + "epoch": 0.5347222222222222, + "grad_norm": 0.6347028017044067, + "learning_rate": 0.0001914087403931703, + "loss": 1.1727, + "step": 3003 + }, + { + "epoch": 0.5349002849002849, + "grad_norm": 0.5558401942253113, + "learning_rate": 0.00019140306328166676, + "loss": 1.2282, + "step": 3004 + }, + { + "epoch": 0.5350783475783476, + "grad_norm": 0.5093596577644348, + "learning_rate": 0.00019139738437931004, + "loss": 1.3258, + "step": 3005 + }, + { + "epoch": 0.5352564102564102, + "grad_norm": 0.4653106927871704, + "learning_rate": 0.0001913917036862114, + "loss": 1.1062, + "step": 3006 + }, + { + "epoch": 0.5354344729344729, + "grad_norm": 0.48085781931877136, + "learning_rate": 0.00019138602120248222, + "loss": 0.9019, + "step": 3007 + }, + { + "epoch": 0.5356125356125356, + "grad_norm": 0.5174745321273804, + "learning_rate": 0.0001913803369282338, + "loss": 1.044, + "step": 3008 + }, + { + "epoch": 0.5357905982905983, + "grad_norm": 0.5359669327735901, + "learning_rate": 0.00019137465086357746, + "loss": 1.0723, + "step": 3009 + }, + { + "epoch": 0.5359686609686609, + "grad_norm": 0.5583470463752747, + "learning_rate": 0.00019136896300862467, + "loss": 1.2192, + "step": 3010 + }, + { + "epoch": 0.5361467236467237, + "grad_norm": 0.4905693829059601, + "learning_rate": 0.00019136327336348688, + "loss": 1.2372, + "step": 3011 + }, + { + "epoch": 0.5363247863247863, + "grad_norm": 0.5741264820098877, + "learning_rate": 0.0001913575819282755, + "loss": 1.1703, + 
"step": 3012 + }, + { + "epoch": 0.5365028490028491, + "grad_norm": 0.577033281326294, + "learning_rate": 0.0001913518887031021, + "loss": 1.1555, + "step": 3013 + }, + { + "epoch": 0.5366809116809117, + "grad_norm": 0.46795153617858887, + "learning_rate": 0.00019134619368807822, + "loss": 0.8583, + "step": 3014 + }, + { + "epoch": 0.5368589743589743, + "grad_norm": 0.5973345637321472, + "learning_rate": 0.0001913404968833154, + "loss": 1.1509, + "step": 3015 + }, + { + "epoch": 0.5370370370370371, + "grad_norm": 0.62020343542099, + "learning_rate": 0.00019133479828892531, + "loss": 1.0781, + "step": 3016 + }, + { + "epoch": 0.5372150997150997, + "grad_norm": 0.5342286229133606, + "learning_rate": 0.00019132909790501958, + "loss": 1.1556, + "step": 3017 + }, + { + "epoch": 0.5373931623931624, + "grad_norm": 0.49612846970558167, + "learning_rate": 0.0001913233957317099, + "loss": 0.9027, + "step": 3018 + }, + { + "epoch": 0.5375712250712251, + "grad_norm": 0.5403908491134644, + "learning_rate": 0.00019131769176910796, + "loss": 1.1125, + "step": 3019 + }, + { + "epoch": 0.5377492877492878, + "grad_norm": 0.4952050447463989, + "learning_rate": 0.0001913119860173256, + "loss": 1.2329, + "step": 3020 + }, + { + "epoch": 0.5379273504273504, + "grad_norm": 0.5877819657325745, + "learning_rate": 0.0001913062784764745, + "loss": 1.2855, + "step": 3021 + }, + { + "epoch": 0.5381054131054132, + "grad_norm": 0.49312907457351685, + "learning_rate": 0.00019130056914666655, + "loss": 1.0212, + "step": 3022 + }, + { + "epoch": 0.5382834757834758, + "grad_norm": 0.45544490218162537, + "learning_rate": 0.00019129485802801366, + "loss": 0.9748, + "step": 3023 + }, + { + "epoch": 0.5384615384615384, + "grad_norm": 0.5535242557525635, + "learning_rate": 0.00019128914512062762, + "loss": 1.2134, + "step": 3024 + }, + { + "epoch": 0.5386396011396012, + "grad_norm": 0.45369696617126465, + "learning_rate": 0.00019128343042462044, + "loss": 0.9964, + "step": 3025 + }, + { + "epoch": 
0.5388176638176638, + "grad_norm": 0.6240725517272949, + "learning_rate": 0.00019127771394010406, + "loss": 1.425, + "step": 3026 + }, + { + "epoch": 0.5389957264957265, + "grad_norm": 0.4859573245048523, + "learning_rate": 0.0001912719956671905, + "loss": 1.087, + "step": 3027 + }, + { + "epoch": 0.5391737891737892, + "grad_norm": 0.47529762983322144, + "learning_rate": 0.0001912662756059918, + "loss": 0.9517, + "step": 3028 + }, + { + "epoch": 0.5393518518518519, + "grad_norm": 0.5317288637161255, + "learning_rate": 0.00019126055375661997, + "loss": 1.0945, + "step": 3029 + }, + { + "epoch": 0.5395299145299145, + "grad_norm": 0.55974280834198, + "learning_rate": 0.00019125483011918722, + "loss": 1.0794, + "step": 3030 + }, + { + "epoch": 0.5397079772079773, + "grad_norm": 0.48579123616218567, + "learning_rate": 0.0001912491046938056, + "loss": 1.1421, + "step": 3031 + }, + { + "epoch": 0.5398860398860399, + "grad_norm": 0.4917181134223938, + "learning_rate": 0.00019124337748058733, + "loss": 0.9708, + "step": 3032 + }, + { + "epoch": 0.5400641025641025, + "grad_norm": 0.525291383266449, + "learning_rate": 0.00019123764847964466, + "loss": 1.064, + "step": 3033 + }, + { + "epoch": 0.5402421652421653, + "grad_norm": 0.5733301639556885, + "learning_rate": 0.00019123191769108977, + "loss": 1.2142, + "step": 3034 + }, + { + "epoch": 0.5404202279202279, + "grad_norm": 0.5400987863540649, + "learning_rate": 0.00019122618511503494, + "loss": 1.1309, + "step": 3035 + }, + { + "epoch": 0.5405982905982906, + "grad_norm": 0.6261051893234253, + "learning_rate": 0.00019122045075159257, + "loss": 1.2112, + "step": 3036 + }, + { + "epoch": 0.5407763532763533, + "grad_norm": 0.5483576059341431, + "learning_rate": 0.0001912147146008749, + "loss": 1.2705, + "step": 3037 + }, + { + "epoch": 0.540954415954416, + "grad_norm": 0.5442137122154236, + "learning_rate": 0.00019120897666299443, + "loss": 1.2512, + "step": 3038 + }, + { + "epoch": 0.5411324786324786, + "grad_norm": 
0.5680811405181885, + "learning_rate": 0.00019120323693806355, + "loss": 1.392, + "step": 3039 + }, + { + "epoch": 0.5413105413105413, + "grad_norm": 0.5237287878990173, + "learning_rate": 0.00019119749542619466, + "loss": 1.1599, + "step": 3040 + }, + { + "epoch": 0.541488603988604, + "grad_norm": 0.48119300603866577, + "learning_rate": 0.00019119175212750032, + "loss": 1.0976, + "step": 3041 + }, + { + "epoch": 0.5416666666666666, + "grad_norm": 0.507033109664917, + "learning_rate": 0.00019118600704209302, + "loss": 1.0181, + "step": 3042 + }, + { + "epoch": 0.5418447293447294, + "grad_norm": 0.484672874212265, + "learning_rate": 0.00019118026017008531, + "loss": 1.1636, + "step": 3043 + }, + { + "epoch": 0.542022792022792, + "grad_norm": 0.4923502206802368, + "learning_rate": 0.00019117451151158985, + "loss": 1.0388, + "step": 3044 + }, + { + "epoch": 0.5422008547008547, + "grad_norm": 0.4882057309150696, + "learning_rate": 0.00019116876106671922, + "loss": 1.131, + "step": 3045 + }, + { + "epoch": 0.5423789173789174, + "grad_norm": 0.6068355441093445, + "learning_rate": 0.0001911630088355861, + "loss": 1.3218, + "step": 3046 + }, + { + "epoch": 0.54255698005698, + "grad_norm": 0.5012881755828857, + "learning_rate": 0.0001911572548183032, + "loss": 1.0514, + "step": 3047 + }, + { + "epoch": 0.5427350427350427, + "grad_norm": 0.49849793314933777, + "learning_rate": 0.00019115149901498328, + "loss": 1.0003, + "step": 3048 + }, + { + "epoch": 0.5429131054131054, + "grad_norm": 0.4934251010417938, + "learning_rate": 0.00019114574142573904, + "loss": 1.0319, + "step": 3049 + }, + { + "epoch": 0.5430911680911681, + "grad_norm": 0.4947762191295624, + "learning_rate": 0.00019113998205068334, + "loss": 1.0906, + "step": 3050 + }, + { + "epoch": 0.5432692307692307, + "grad_norm": 0.5449416041374207, + "learning_rate": 0.00019113422088992907, + "loss": 0.9093, + "step": 3051 + }, + { + "epoch": 0.5434472934472935, + "grad_norm": 0.49395284056663513, + "learning_rate": 
0.00019112845794358902, + "loss": 1.0071, + "step": 3052 + }, + { + "epoch": 0.5436253561253561, + "grad_norm": 0.5478728413581848, + "learning_rate": 0.00019112269321177613, + "loss": 1.2124, + "step": 3053 + }, + { + "epoch": 0.5438034188034188, + "grad_norm": 0.6205173134803772, + "learning_rate": 0.0001911169266946034, + "loss": 1.021, + "step": 3054 + }, + { + "epoch": 0.5439814814814815, + "grad_norm": 0.4777783751487732, + "learning_rate": 0.00019111115839218372, + "loss": 0.9192, + "step": 3055 + }, + { + "epoch": 0.5441595441595442, + "grad_norm": 0.5541689991950989, + "learning_rate": 0.00019110538830463018, + "loss": 1.1248, + "step": 3056 + }, + { + "epoch": 0.5443376068376068, + "grad_norm": 0.4750942289829254, + "learning_rate": 0.0001910996164320558, + "loss": 1.3147, + "step": 3057 + }, + { + "epoch": 0.5445156695156695, + "grad_norm": 0.6283948421478271, + "learning_rate": 0.0001910938427745737, + "loss": 1.0919, + "step": 3058 + }, + { + "epoch": 0.5446937321937322, + "grad_norm": 0.552725076675415, + "learning_rate": 0.00019108806733229698, + "loss": 1.3807, + "step": 3059 + }, + { + "epoch": 0.5448717948717948, + "grad_norm": 0.4832848310470581, + "learning_rate": 0.0001910822901053388, + "loss": 1.0705, + "step": 3060 + }, + { + "epoch": 0.5450498575498576, + "grad_norm": 0.6468375325202942, + "learning_rate": 0.00019107651109381233, + "loss": 1.0766, + "step": 3061 + }, + { + "epoch": 0.5452279202279202, + "grad_norm": 0.5464920401573181, + "learning_rate": 0.00019107073029783083, + "loss": 1.0453, + "step": 3062 + }, + { + "epoch": 0.5454059829059829, + "grad_norm": 0.5321210026741028, + "learning_rate": 0.0001910649477175076, + "loss": 1.2326, + "step": 3063 + }, + { + "epoch": 0.5455840455840456, + "grad_norm": 0.5572962164878845, + "learning_rate": 0.00019105916335295582, + "loss": 1.0673, + "step": 3064 + }, + { + "epoch": 0.5457621082621082, + "grad_norm": 0.5239177942276001, + "learning_rate": 0.00019105337720428894, + "loss": 1.04, + 
"step": 3065 + }, + { + "epoch": 0.5459401709401709, + "grad_norm": 0.5633319616317749, + "learning_rate": 0.00019104758927162023, + "loss": 0.9606, + "step": 3066 + }, + { + "epoch": 0.5461182336182336, + "grad_norm": 0.5317914485931396, + "learning_rate": 0.0001910417995550632, + "loss": 1.0651, + "step": 3067 + }, + { + "epoch": 0.5462962962962963, + "grad_norm": 0.5126453638076782, + "learning_rate": 0.00019103600805473118, + "loss": 1.0316, + "step": 3068 + }, + { + "epoch": 0.5464743589743589, + "grad_norm": 0.5262107253074646, + "learning_rate": 0.00019103021477073773, + "loss": 1.0752, + "step": 3069 + }, + { + "epoch": 0.5466524216524217, + "grad_norm": 0.5384877324104309, + "learning_rate": 0.0001910244197031963, + "loss": 1.1731, + "step": 3070 + }, + { + "epoch": 0.5468304843304843, + "grad_norm": 0.5126553773880005, + "learning_rate": 0.00019101862285222048, + "loss": 1.2229, + "step": 3071 + }, + { + "epoch": 0.5470085470085471, + "grad_norm": 0.4841194450855255, + "learning_rate": 0.0001910128242179238, + "loss": 0.9955, + "step": 3072 + }, + { + "epoch": 0.5471866096866097, + "grad_norm": 0.526546061038971, + "learning_rate": 0.00019100702380041987, + "loss": 1.2436, + "step": 3073 + }, + { + "epoch": 0.5473646723646723, + "grad_norm": 0.5085833072662354, + "learning_rate": 0.0001910012215998224, + "loss": 1.011, + "step": 3074 + }, + { + "epoch": 0.5475427350427351, + "grad_norm": 0.5149994492530823, + "learning_rate": 0.000190995417616245, + "loss": 0.8632, + "step": 3075 + }, + { + "epoch": 0.5477207977207977, + "grad_norm": 0.48079630732536316, + "learning_rate": 0.00019098961184980145, + "loss": 1.1115, + "step": 3076 + }, + { + "epoch": 0.5478988603988604, + "grad_norm": 0.5769477486610413, + "learning_rate": 0.00019098380430060546, + "loss": 0.9544, + "step": 3077 + }, + { + "epoch": 0.5480769230769231, + "grad_norm": 0.5260093808174133, + "learning_rate": 0.0001909779949687708, + "loss": 1.2354, + "step": 3078 + }, + { + "epoch": 
0.5482549857549858, + "grad_norm": 0.5518734455108643, + "learning_rate": 0.00019097218385441135, + "loss": 1.1944, + "step": 3079 + }, + { + "epoch": 0.5484330484330484, + "grad_norm": 0.5436808466911316, + "learning_rate": 0.00019096637095764095, + "loss": 1.0717, + "step": 3080 + }, + { + "epoch": 0.5486111111111112, + "grad_norm": 0.4749584197998047, + "learning_rate": 0.00019096055627857344, + "loss": 1.0417, + "step": 3081 + }, + { + "epoch": 0.5487891737891738, + "grad_norm": 0.5485591292381287, + "learning_rate": 0.0001909547398173228, + "loss": 1.2515, + "step": 3082 + }, + { + "epoch": 0.5489672364672364, + "grad_norm": 0.5751016736030579, + "learning_rate": 0.00019094892157400296, + "loss": 1.2112, + "step": 3083 + }, + { + "epoch": 0.5491452991452992, + "grad_norm": 0.5404475331306458, + "learning_rate": 0.00019094310154872795, + "loss": 0.4334, + "step": 3084 + }, + { + "epoch": 0.5493233618233618, + "grad_norm": 0.5198020935058594, + "learning_rate": 0.00019093727974161178, + "loss": 0.9759, + "step": 3085 + }, + { + "epoch": 0.5495014245014245, + "grad_norm": 0.4893439710140228, + "learning_rate": 0.0001909314561527685, + "loss": 1.1287, + "step": 3086 + }, + { + "epoch": 0.5496794871794872, + "grad_norm": 0.5675956606864929, + "learning_rate": 0.00019092563078231228, + "loss": 1.234, + "step": 3087 + }, + { + "epoch": 0.5498575498575499, + "grad_norm": 0.5539132356643677, + "learning_rate": 0.00019091980363035714, + "loss": 1.2378, + "step": 3088 + }, + { + "epoch": 0.5500356125356125, + "grad_norm": 0.5194353461265564, + "learning_rate": 0.00019091397469701735, + "loss": 1.1338, + "step": 3089 + }, + { + "epoch": 0.5502136752136753, + "grad_norm": 0.5143756866455078, + "learning_rate": 0.0001909081439824071, + "loss": 0.9118, + "step": 3090 + }, + { + "epoch": 0.5503917378917379, + "grad_norm": 0.5624327659606934, + "learning_rate": 0.0001909023114866406, + "loss": 1.035, + "step": 3091 + }, + { + "epoch": 0.5505698005698005, + "grad_norm": 
0.5285067558288574, + "learning_rate": 0.0001908964772098321, + "loss": 1.0451, + "step": 3092 + }, + { + "epoch": 0.5507478632478633, + "grad_norm": 0.5730587244033813, + "learning_rate": 0.000190890641152096, + "loss": 1.0672, + "step": 3093 + }, + { + "epoch": 0.5509259259259259, + "grad_norm": 0.5822951197624207, + "learning_rate": 0.0001908848033135466, + "loss": 1.1791, + "step": 3094 + }, + { + "epoch": 0.5511039886039886, + "grad_norm": 0.596161961555481, + "learning_rate": 0.00019087896369429826, + "loss": 1.0954, + "step": 3095 + }, + { + "epoch": 0.5512820512820513, + "grad_norm": 0.5138190984725952, + "learning_rate": 0.00019087312229446542, + "loss": 0.896, + "step": 3096 + }, + { + "epoch": 0.551460113960114, + "grad_norm": 0.5061872601509094, + "learning_rate": 0.0001908672791141625, + "loss": 1.1017, + "step": 3097 + }, + { + "epoch": 0.5516381766381766, + "grad_norm": 0.5189547538757324, + "learning_rate": 0.00019086143415350404, + "loss": 1.2906, + "step": 3098 + }, + { + "epoch": 0.5518162393162394, + "grad_norm": 0.5640039443969727, + "learning_rate": 0.00019085558741260448, + "loss": 1.1001, + "step": 3099 + }, + { + "epoch": 0.551994301994302, + "grad_norm": 0.453867107629776, + "learning_rate": 0.00019084973889157844, + "loss": 0.9731, + "step": 3100 + }, + { + "epoch": 0.5521723646723646, + "grad_norm": 0.5431303977966309, + "learning_rate": 0.0001908438885905405, + "loss": 1.3511, + "step": 3101 + }, + { + "epoch": 0.5523504273504274, + "grad_norm": 0.47693368792533875, + "learning_rate": 0.00019083803650960527, + "loss": 1.0426, + "step": 3102 + }, + { + "epoch": 0.55252849002849, + "grad_norm": 0.4663422703742981, + "learning_rate": 0.00019083218264888743, + "loss": 1.05, + "step": 3103 + }, + { + "epoch": 0.5527065527065527, + "grad_norm": 0.561354398727417, + "learning_rate": 0.00019082632700850164, + "loss": 0.9608, + "step": 3104 + }, + { + "epoch": 0.5528846153846154, + "grad_norm": 0.4981916844844818, + "learning_rate": 
0.00019082046958856266, + "loss": 1.1935, + "step": 3105 + }, + { + "epoch": 0.5530626780626781, + "grad_norm": 0.5301326513290405, + "learning_rate": 0.0001908146103891852, + "loss": 1.0646, + "step": 3106 + }, + { + "epoch": 0.5532407407407407, + "grad_norm": 0.5023610591888428, + "learning_rate": 0.00019080874941048416, + "loss": 1.127, + "step": 3107 + }, + { + "epoch": 0.5534188034188035, + "grad_norm": 0.5172514319419861, + "learning_rate": 0.00019080288665257426, + "loss": 1.0435, + "step": 3108 + }, + { + "epoch": 0.5535968660968661, + "grad_norm": 0.6340598464012146, + "learning_rate": 0.00019079702211557048, + "loss": 1.3528, + "step": 3109 + }, + { + "epoch": 0.5537749287749287, + "grad_norm": 0.46882256865501404, + "learning_rate": 0.0001907911557995876, + "loss": 1.1361, + "step": 3110 + }, + { + "epoch": 0.5539529914529915, + "grad_norm": 0.6401382088661194, + "learning_rate": 0.00019078528770474068, + "loss": 1.2415, + "step": 3111 + }, + { + "epoch": 0.5541310541310541, + "grad_norm": 0.5141328573226929, + "learning_rate": 0.00019077941783114463, + "loss": 1.0505, + "step": 3112 + }, + { + "epoch": 0.5543091168091168, + "grad_norm": 0.522318959236145, + "learning_rate": 0.00019077354617891444, + "loss": 1.0964, + "step": 3113 + }, + { + "epoch": 0.5544871794871795, + "grad_norm": 0.539551854133606, + "learning_rate": 0.00019076767274816517, + "loss": 1.0735, + "step": 3114 + }, + { + "epoch": 0.5546652421652422, + "grad_norm": 0.495320200920105, + "learning_rate": 0.00019076179753901195, + "loss": 0.9754, + "step": 3115 + }, + { + "epoch": 0.5548433048433048, + "grad_norm": 0.5499199628829956, + "learning_rate": 0.00019075592055156984, + "loss": 1.0043, + "step": 3116 + }, + { + "epoch": 0.5550213675213675, + "grad_norm": 0.5352509617805481, + "learning_rate": 0.00019075004178595396, + "loss": 1.1701, + "step": 3117 + }, + { + "epoch": 0.5551994301994302, + "grad_norm": 0.5392300486564636, + "learning_rate": 0.00019074416124227953, + "loss": 1.1612, 
+ "step": 3118 + }, + { + "epoch": 0.5553774928774928, + "grad_norm": 0.5195050835609436, + "learning_rate": 0.0001907382789206618, + "loss": 1.0934, + "step": 3119 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 0.5276884436607361, + "learning_rate": 0.000190732394821216, + "loss": 0.9011, + "step": 3120 + }, + { + "epoch": 0.5557336182336182, + "grad_norm": 0.6115903258323669, + "learning_rate": 0.00019072650894405734, + "loss": 1.3065, + "step": 3121 + }, + { + "epoch": 0.5559116809116809, + "grad_norm": 0.5752483010292053, + "learning_rate": 0.00019072062128930127, + "loss": 1.0063, + "step": 3122 + }, + { + "epoch": 0.5560897435897436, + "grad_norm": 0.5508273243904114, + "learning_rate": 0.00019071473185706302, + "loss": 1.2598, + "step": 3123 + }, + { + "epoch": 0.5562678062678063, + "grad_norm": 0.49712198972702026, + "learning_rate": 0.00019070884064745808, + "loss": 0.924, + "step": 3124 + }, + { + "epoch": 0.5564458689458689, + "grad_norm": 0.572849452495575, + "learning_rate": 0.00019070294766060185, + "loss": 0.9683, + "step": 3125 + }, + { + "epoch": 0.5566239316239316, + "grad_norm": 0.4807920753955841, + "learning_rate": 0.00019069705289660976, + "loss": 1.0998, + "step": 3126 + }, + { + "epoch": 0.5568019943019943, + "grad_norm": 0.5543031096458435, + "learning_rate": 0.0001906911563555973, + "loss": 1.0878, + "step": 3127 + }, + { + "epoch": 0.5569800569800569, + "grad_norm": 0.5710418820381165, + "learning_rate": 0.00019068525803768007, + "loss": 1.0381, + "step": 3128 + }, + { + "epoch": 0.5571581196581197, + "grad_norm": 0.5169163346290588, + "learning_rate": 0.00019067935794297357, + "loss": 1.1149, + "step": 3129 + }, + { + "epoch": 0.5573361823361823, + "grad_norm": 0.6474376916885376, + "learning_rate": 0.00019067345607159345, + "loss": 0.9828, + "step": 3130 + }, + { + "epoch": 0.5575142450142451, + "grad_norm": 0.5029847621917725, + "learning_rate": 0.0001906675524236553, + "loss": 0.797, + "step": 3131 + }, + { + "epoch": 
0.5576923076923077, + "grad_norm": 0.5681431293487549, + "learning_rate": 0.00019066164699927478, + "loss": 1.1565, + "step": 3132 + }, + { + "epoch": 0.5578703703703703, + "grad_norm": 0.5654549598693848, + "learning_rate": 0.00019065573979856764, + "loss": 1.2488, + "step": 3133 + }, + { + "epoch": 0.5580484330484331, + "grad_norm": 0.47653043270111084, + "learning_rate": 0.0001906498308216496, + "loss": 1.0428, + "step": 3134 + }, + { + "epoch": 0.5582264957264957, + "grad_norm": 0.5068467259407043, + "learning_rate": 0.00019064392006863643, + "loss": 0.9659, + "step": 3135 + }, + { + "epoch": 0.5584045584045584, + "grad_norm": 0.7076661586761475, + "learning_rate": 0.00019063800753964393, + "loss": 1.1289, + "step": 3136 + }, + { + "epoch": 0.5585826210826211, + "grad_norm": 0.551456868648529, + "learning_rate": 0.000190632093234788, + "loss": 1.1925, + "step": 3137 + }, + { + "epoch": 0.5587606837606838, + "grad_norm": 0.518276035785675, + "learning_rate": 0.00019062617715418442, + "loss": 0.8681, + "step": 3138 + }, + { + "epoch": 0.5589387464387464, + "grad_norm": 0.5272278785705566, + "learning_rate": 0.0001906202592979492, + "loss": 1.0865, + "step": 3139 + }, + { + "epoch": 0.5591168091168092, + "grad_norm": 0.5344942212104797, + "learning_rate": 0.00019061433966619822, + "loss": 1.1647, + "step": 3140 + }, + { + "epoch": 0.5592948717948718, + "grad_norm": 0.5833460092544556, + "learning_rate": 0.00019060841825904753, + "loss": 1.3403, + "step": 3141 + }, + { + "epoch": 0.5594729344729344, + "grad_norm": 0.5707054734230042, + "learning_rate": 0.00019060249507661306, + "loss": 1.1236, + "step": 3142 + }, + { + "epoch": 0.5596509971509972, + "grad_norm": 0.5446065664291382, + "learning_rate": 0.00019059657011901094, + "loss": 1.017, + "step": 3143 + }, + { + "epoch": 0.5598290598290598, + "grad_norm": 0.5285109281539917, + "learning_rate": 0.0001905906433863572, + "loss": 1.3186, + "step": 3144 + }, + { + "epoch": 0.5600071225071225, + "grad_norm": 
0.5308659672737122, + "learning_rate": 0.00019058471487876802, + "loss": 0.8464, + "step": 3145 + }, + { + "epoch": 0.5601851851851852, + "grad_norm": 0.5218054056167603, + "learning_rate": 0.00019057878459635948, + "loss": 1.0219, + "step": 3146 + }, + { + "epoch": 0.5603632478632479, + "grad_norm": 0.45067787170410156, + "learning_rate": 0.00019057285253924785, + "loss": 1.0364, + "step": 3147 + }, + { + "epoch": 0.5605413105413105, + "grad_norm": 0.4856041669845581, + "learning_rate": 0.0001905669187075493, + "loss": 1.1928, + "step": 3148 + }, + { + "epoch": 0.5607193732193733, + "grad_norm": 0.506912112236023, + "learning_rate": 0.00019056098310138016, + "loss": 1.119, + "step": 3149 + }, + { + "epoch": 0.5608974358974359, + "grad_norm": 0.49049463868141174, + "learning_rate": 0.00019055504572085662, + "loss": 1.2165, + "step": 3150 + }, + { + "epoch": 0.5610754985754985, + "grad_norm": 0.5250293612480164, + "learning_rate": 0.0001905491065660951, + "loss": 1.1427, + "step": 3151 + }, + { + "epoch": 0.5612535612535613, + "grad_norm": 0.43438446521759033, + "learning_rate": 0.00019054316563721195, + "loss": 0.884, + "step": 3152 + }, + { + "epoch": 0.5614316239316239, + "grad_norm": 0.5386807918548584, + "learning_rate": 0.00019053722293432354, + "loss": 1.1494, + "step": 3153 + }, + { + "epoch": 0.5616096866096866, + "grad_norm": 0.5403809547424316, + "learning_rate": 0.00019053127845754632, + "loss": 1.1743, + "step": 3154 + }, + { + "epoch": 0.5617877492877493, + "grad_norm": 0.4759823977947235, + "learning_rate": 0.00019052533220699678, + "loss": 1.0716, + "step": 3155 + }, + { + "epoch": 0.561965811965812, + "grad_norm": 0.45332327485084534, + "learning_rate": 0.0001905193841827914, + "loss": 0.8405, + "step": 3156 + }, + { + "epoch": 0.5621438746438746, + "grad_norm": 0.5617053508758545, + "learning_rate": 0.00019051343438504671, + "loss": 1.0422, + "step": 3157 + }, + { + "epoch": 0.5623219373219374, + "grad_norm": 0.5088049173355103, + "learning_rate": 
0.00019050748281387931, + "loss": 1.0067, + "step": 3158 + }, + { + "epoch": 0.5625, + "grad_norm": 0.5174484848976135, + "learning_rate": 0.00019050152946940578, + "loss": 1.0623, + "step": 3159 + }, + { + "epoch": 0.5626780626780626, + "grad_norm": 0.6093568801879883, + "learning_rate": 0.0001904955743517428, + "loss": 1.24, + "step": 3160 + }, + { + "epoch": 0.5628561253561254, + "grad_norm": 0.49063584208488464, + "learning_rate": 0.00019048961746100703, + "loss": 0.8563, + "step": 3161 + }, + { + "epoch": 0.563034188034188, + "grad_norm": 0.583940863609314, + "learning_rate": 0.00019048365879731517, + "loss": 1.0695, + "step": 3162 + }, + { + "epoch": 0.5632122507122507, + "grad_norm": 0.4943268597126007, + "learning_rate": 0.000190477698360784, + "loss": 0.8606, + "step": 3163 + }, + { + "epoch": 0.5633903133903134, + "grad_norm": 0.5050932168960571, + "learning_rate": 0.00019047173615153028, + "loss": 1.1591, + "step": 3164 + }, + { + "epoch": 0.5635683760683761, + "grad_norm": 0.5445677638053894, + "learning_rate": 0.0001904657721696708, + "loss": 1.262, + "step": 3165 + }, + { + "epoch": 0.5637464387464387, + "grad_norm": 0.5445297360420227, + "learning_rate": 0.00019045980641532246, + "loss": 1.223, + "step": 3166 + }, + { + "epoch": 0.5639245014245015, + "grad_norm": 0.5098413228988647, + "learning_rate": 0.00019045383888860213, + "loss": 1.0829, + "step": 3167 + }, + { + "epoch": 0.5641025641025641, + "grad_norm": 0.484998881816864, + "learning_rate": 0.0001904478695896267, + "loss": 1.0711, + "step": 3168 + }, + { + "epoch": 0.5642806267806267, + "grad_norm": 0.5515334010124207, + "learning_rate": 0.0001904418985185132, + "loss": 1.1583, + "step": 3169 + }, + { + "epoch": 0.5644586894586895, + "grad_norm": 0.545460045337677, + "learning_rate": 0.00019043592567537853, + "loss": 1.2321, + "step": 3170 + }, + { + "epoch": 0.5646367521367521, + "grad_norm": 0.5463964343070984, + "learning_rate": 0.0001904299510603398, + "loss": 1.1019, + "step": 3171 + }, 
+ { + "epoch": 0.5648148148148148, + "grad_norm": 0.5619220733642578, + "learning_rate": 0.000190423974673514, + "loss": 1.1001, + "step": 3172 + }, + { + "epoch": 0.5649928774928775, + "grad_norm": 0.4448916018009186, + "learning_rate": 0.00019041799651501825, + "loss": 1.057, + "step": 3173 + }, + { + "epoch": 0.5651709401709402, + "grad_norm": 0.6073006987571716, + "learning_rate": 0.00019041201658496975, + "loss": 1.0306, + "step": 3174 + }, + { + "epoch": 0.5653490028490028, + "grad_norm": 0.5342072248458862, + "learning_rate": 0.0001904060348834855, + "loss": 0.9231, + "step": 3175 + }, + { + "epoch": 0.5655270655270656, + "grad_norm": 0.4505697786808014, + "learning_rate": 0.0001904000514106829, + "loss": 1.1134, + "step": 3176 + }, + { + "epoch": 0.5657051282051282, + "grad_norm": 0.5627852082252502, + "learning_rate": 0.00019039406616667902, + "loss": 1.2138, + "step": 3177 + }, + { + "epoch": 0.5658831908831908, + "grad_norm": 0.499734103679657, + "learning_rate": 0.0001903880791515912, + "loss": 1.1074, + "step": 3178 + }, + { + "epoch": 0.5660612535612536, + "grad_norm": 0.4768189489841461, + "learning_rate": 0.00019038209036553676, + "loss": 0.9442, + "step": 3179 + }, + { + "epoch": 0.5662393162393162, + "grad_norm": 0.5265373587608337, + "learning_rate": 0.00019037609980863298, + "loss": 1.0907, + "step": 3180 + }, + { + "epoch": 0.5664173789173789, + "grad_norm": 0.5506128072738647, + "learning_rate": 0.00019037010748099728, + "loss": 1.2541, + "step": 3181 + }, + { + "epoch": 0.5665954415954416, + "grad_norm": 0.44860872626304626, + "learning_rate": 0.00019036411338274703, + "loss": 0.893, + "step": 3182 + }, + { + "epoch": 0.5667735042735043, + "grad_norm": 0.4901522994041443, + "learning_rate": 0.00019035811751399973, + "loss": 1.0469, + "step": 3183 + }, + { + "epoch": 0.5669515669515669, + "grad_norm": 0.500868022441864, + "learning_rate": 0.0001903521198748728, + "loss": 1.0527, + "step": 3184 + }, + { + "epoch": 0.5671296296296297, + 
"grad_norm": 0.5508102774620056, + "learning_rate": 0.00019034612046548376, + "loss": 1.283, + "step": 3185 + }, + { + "epoch": 0.5673076923076923, + "grad_norm": 0.5079495906829834, + "learning_rate": 0.0001903401192859502, + "loss": 1.0808, + "step": 3186 + }, + { + "epoch": 0.5674857549857549, + "grad_norm": 0.5758788585662842, + "learning_rate": 0.00019033411633638964, + "loss": 1.1301, + "step": 3187 + }, + { + "epoch": 0.5676638176638177, + "grad_norm": 0.46557924151420593, + "learning_rate": 0.00019032811161691972, + "loss": 1.0205, + "step": 3188 + }, + { + "epoch": 0.5678418803418803, + "grad_norm": 0.5665056109428406, + "learning_rate": 0.0001903221051276581, + "loss": 1.1926, + "step": 3189 + }, + { + "epoch": 0.5680199430199431, + "grad_norm": 0.5948992967605591, + "learning_rate": 0.00019031609686872246, + "loss": 1.2724, + "step": 3190 + }, + { + "epoch": 0.5681980056980057, + "grad_norm": 0.6189367771148682, + "learning_rate": 0.00019031008684023055, + "loss": 1.2762, + "step": 3191 + }, + { + "epoch": 0.5683760683760684, + "grad_norm": 0.49511992931365967, + "learning_rate": 0.00019030407504230006, + "loss": 1.0117, + "step": 3192 + }, + { + "epoch": 0.5685541310541311, + "grad_norm": 0.5358837842941284, + "learning_rate": 0.00019029806147504878, + "loss": 0.944, + "step": 3193 + }, + { + "epoch": 0.5687321937321937, + "grad_norm": 0.458636999130249, + "learning_rate": 0.00019029204613859463, + "loss": 0.8174, + "step": 3194 + }, + { + "epoch": 0.5689102564102564, + "grad_norm": 0.5168304443359375, + "learning_rate": 0.00019028602903305535, + "loss": 1.1533, + "step": 3195 + }, + { + "epoch": 0.5690883190883191, + "grad_norm": 0.5334134697914124, + "learning_rate": 0.00019028001015854892, + "loss": 1.1868, + "step": 3196 + }, + { + "epoch": 0.5692663817663818, + "grad_norm": 0.5649123191833496, + "learning_rate": 0.0001902739895151932, + "loss": 0.9876, + "step": 3197 + }, + { + "epoch": 0.5694444444444444, + "grad_norm": 0.5647651553153992, + 
"learning_rate": 0.0001902679671031062, + "loss": 1.0805, + "step": 3198 + }, + { + "epoch": 0.5696225071225072, + "grad_norm": 0.5251876711845398, + "learning_rate": 0.00019026194292240587, + "loss": 1.2335, + "step": 3199 + }, + { + "epoch": 0.5698005698005698, + "grad_norm": 0.5268014669418335, + "learning_rate": 0.0001902559169732103, + "loss": 1.19, + "step": 3200 + }, + { + "epoch": 0.5699786324786325, + "grad_norm": 0.5301041007041931, + "learning_rate": 0.00019024988925563752, + "loss": 1.1173, + "step": 3201 + }, + { + "epoch": 0.5701566951566952, + "grad_norm": 0.4531562030315399, + "learning_rate": 0.00019024385976980566, + "loss": 0.7576, + "step": 3202 + }, + { + "epoch": 0.5703347578347578, + "grad_norm": 0.5779716372489929, + "learning_rate": 0.00019023782851583282, + "loss": 1.1719, + "step": 3203 + }, + { + "epoch": 0.5705128205128205, + "grad_norm": 0.4886093735694885, + "learning_rate": 0.00019023179549383716, + "loss": 1.085, + "step": 3204 + }, + { + "epoch": 0.5706908831908832, + "grad_norm": 0.510117769241333, + "learning_rate": 0.0001902257607039369, + "loss": 0.8931, + "step": 3205 + }, + { + "epoch": 0.5708689458689459, + "grad_norm": 0.5195479393005371, + "learning_rate": 0.00019021972414625036, + "loss": 0.9922, + "step": 3206 + }, + { + "epoch": 0.5710470085470085, + "grad_norm": 0.5791407227516174, + "learning_rate": 0.00019021368582089568, + "loss": 1.112, + "step": 3207 + }, + { + "epoch": 0.5712250712250713, + "grad_norm": 0.5056005716323853, + "learning_rate": 0.00019020764572799122, + "loss": 0.8474, + "step": 3208 + }, + { + "epoch": 0.5714031339031339, + "grad_norm": 0.5060068964958191, + "learning_rate": 0.00019020160386765537, + "loss": 1.071, + "step": 3209 + }, + { + "epoch": 0.5715811965811965, + "grad_norm": 0.5396568775177002, + "learning_rate": 0.00019019556024000648, + "loss": 1.0436, + "step": 3210 + }, + { + "epoch": 0.5717592592592593, + "grad_norm": 0.6552190780639648, + "learning_rate": 0.0001901895148451629, + 
"loss": 0.9869, + "step": 3211 + }, + { + "epoch": 0.5719373219373219, + "grad_norm": 0.5177004337310791, + "learning_rate": 0.00019018346768324314, + "loss": 1.0193, + "step": 3212 + }, + { + "epoch": 0.5721153846153846, + "grad_norm": 0.5192117094993591, + "learning_rate": 0.0001901774187543657, + "loss": 1.1263, + "step": 3213 + }, + { + "epoch": 0.5722934472934473, + "grad_norm": 0.4857729971408844, + "learning_rate": 0.00019017136805864906, + "loss": 0.9808, + "step": 3214 + }, + { + "epoch": 0.57247150997151, + "grad_norm": 0.5800918936729431, + "learning_rate": 0.00019016531559621177, + "loss": 1.2334, + "step": 3215 + }, + { + "epoch": 0.5726495726495726, + "grad_norm": 0.4812086522579193, + "learning_rate": 0.00019015926136717242, + "loss": 1.2409, + "step": 3216 + }, + { + "epoch": 0.5728276353276354, + "grad_norm": 0.5128398537635803, + "learning_rate": 0.00019015320537164963, + "loss": 0.9036, + "step": 3217 + }, + { + "epoch": 0.573005698005698, + "grad_norm": 0.4761141538619995, + "learning_rate": 0.00019014714760976205, + "loss": 1.1058, + "step": 3218 + }, + { + "epoch": 0.5731837606837606, + "grad_norm": 0.5850459933280945, + "learning_rate": 0.0001901410880816284, + "loss": 1.1011, + "step": 3219 + }, + { + "epoch": 0.5733618233618234, + "grad_norm": 0.5648714303970337, + "learning_rate": 0.00019013502678736738, + "loss": 1.0479, + "step": 3220 + }, + { + "epoch": 0.573539886039886, + "grad_norm": 0.5835902094841003, + "learning_rate": 0.00019012896372709774, + "loss": 1.0555, + "step": 3221 + }, + { + "epoch": 0.5737179487179487, + "grad_norm": 0.5155113935470581, + "learning_rate": 0.00019012289890093828, + "loss": 0.9488, + "step": 3222 + }, + { + "epoch": 0.5738960113960114, + "grad_norm": 0.5064889788627625, + "learning_rate": 0.00019011683230900784, + "loss": 0.9144, + "step": 3223 + }, + { + "epoch": 0.5740740740740741, + "grad_norm": 0.53825843334198, + "learning_rate": 0.00019011076395142527, + "loss": 1.0713, + "step": 3224 + }, + { + 
"epoch": 0.5742521367521367, + "grad_norm": 0.5341386198997498, + "learning_rate": 0.00019010469382830947, + "loss": 1.1438, + "step": 3225 + }, + { + "epoch": 0.5744301994301995, + "grad_norm": 0.5300050973892212, + "learning_rate": 0.00019009862193977936, + "loss": 1.0114, + "step": 3226 + }, + { + "epoch": 0.5746082621082621, + "grad_norm": 0.6033682823181152, + "learning_rate": 0.0001900925482859539, + "loss": 1.0458, + "step": 3227 + }, + { + "epoch": 0.5747863247863247, + "grad_norm": 0.5108983516693115, + "learning_rate": 0.00019008647286695215, + "loss": 1.1211, + "step": 3228 + }, + { + "epoch": 0.5749643874643875, + "grad_norm": 0.5263782739639282, + "learning_rate": 0.00019008039568289308, + "loss": 0.8647, + "step": 3229 + }, + { + "epoch": 0.5751424501424501, + "grad_norm": 0.47119566798210144, + "learning_rate": 0.0001900743167338958, + "loss": 1.019, + "step": 3230 + }, + { + "epoch": 0.5753205128205128, + "grad_norm": 0.56391841173172, + "learning_rate": 0.00019006823602007937, + "loss": 0.9791, + "step": 3231 + }, + { + "epoch": 0.5754985754985755, + "grad_norm": 0.5364985466003418, + "learning_rate": 0.000190062153541563, + "loss": 1.1355, + "step": 3232 + }, + { + "epoch": 0.5756766381766382, + "grad_norm": 0.5098565220832825, + "learning_rate": 0.00019005606929846578, + "loss": 0.987, + "step": 3233 + }, + { + "epoch": 0.5758547008547008, + "grad_norm": 0.6640968918800354, + "learning_rate": 0.00019004998329090692, + "loss": 1.1165, + "step": 3234 + }, + { + "epoch": 0.5760327635327636, + "grad_norm": 0.5044721961021423, + "learning_rate": 0.00019004389551900578, + "loss": 0.8643, + "step": 3235 + }, + { + "epoch": 0.5762108262108262, + "grad_norm": 0.4822785258293152, + "learning_rate": 0.00019003780598288153, + "loss": 1.0735, + "step": 3236 + }, + { + "epoch": 0.5763888888888888, + "grad_norm": 0.505261242389679, + "learning_rate": 0.00019003171468265348, + "loss": 1.0001, + "step": 3237 + }, + { + "epoch": 0.5765669515669516, + "grad_norm": 
0.5020412802696228, + "learning_rate": 0.00019002562161844102, + "loss": 0.9601, + "step": 3238 + }, + { + "epoch": 0.5767450142450142, + "grad_norm": 0.4920475482940674, + "learning_rate": 0.00019001952679036354, + "loss": 1.0111, + "step": 3239 + }, + { + "epoch": 0.5769230769230769, + "grad_norm": 0.5638813376426697, + "learning_rate": 0.00019001343019854042, + "loss": 1.1456, + "step": 3240 + }, + { + "epoch": 0.5771011396011396, + "grad_norm": 0.5519235134124756, + "learning_rate": 0.0001900073318430911, + "loss": 0.9258, + "step": 3241 + }, + { + "epoch": 0.5772792022792023, + "grad_norm": 0.5207770466804504, + "learning_rate": 0.0001900012317241351, + "loss": 0.9859, + "step": 3242 + }, + { + "epoch": 0.5774572649572649, + "grad_norm": 0.5493707656860352, + "learning_rate": 0.00018999512984179195, + "loss": 1.1183, + "step": 3243 + }, + { + "epoch": 0.5776353276353277, + "grad_norm": 0.4504764676094055, + "learning_rate": 0.00018998902619618116, + "loss": 0.9363, + "step": 3244 + }, + { + "epoch": 0.5778133903133903, + "grad_norm": 0.5232836604118347, + "learning_rate": 0.00018998292078742233, + "loss": 1.1887, + "step": 3245 + }, + { + "epoch": 0.5779914529914529, + "grad_norm": 0.5715088248252869, + "learning_rate": 0.0001899768136156351, + "loss": 1.4524, + "step": 3246 + }, + { + "epoch": 0.5781695156695157, + "grad_norm": 0.59555584192276, + "learning_rate": 0.0001899707046809391, + "loss": 1.0922, + "step": 3247 + }, + { + "epoch": 0.5783475783475783, + "grad_norm": 0.4500894546508789, + "learning_rate": 0.00018996459398345404, + "loss": 1.0087, + "step": 3248 + }, + { + "epoch": 0.5785256410256411, + "grad_norm": 0.49126625061035156, + "learning_rate": 0.00018995848152329967, + "loss": 1.1512, + "step": 3249 + }, + { + "epoch": 0.5787037037037037, + "grad_norm": 0.4096335172653198, + "learning_rate": 0.00018995236730059574, + "loss": 0.7633, + "step": 3250 + }, + { + "epoch": 0.5788817663817664, + "grad_norm": 0.5364313721656799, + "learning_rate": 
0.00018994625131546199, + "loss": 1.295, + "step": 3251 + }, + { + "epoch": 0.5790598290598291, + "grad_norm": 0.4897502660751343, + "learning_rate": 0.00018994013356801834, + "loss": 1.2197, + "step": 3252 + }, + { + "epoch": 0.5792378917378918, + "grad_norm": 0.5101368427276611, + "learning_rate": 0.00018993401405838456, + "loss": 1.1129, + "step": 3253 + }, + { + "epoch": 0.5794159544159544, + "grad_norm": 0.5426377654075623, + "learning_rate": 0.00018992789278668063, + "loss": 1.188, + "step": 3254 + }, + { + "epoch": 0.5795940170940171, + "grad_norm": 0.5066362023353577, + "learning_rate": 0.00018992176975302644, + "loss": 1.2802, + "step": 3255 + }, + { + "epoch": 0.5797720797720798, + "grad_norm": 0.5418947339057922, + "learning_rate": 0.00018991564495754196, + "loss": 1.1675, + "step": 3256 + }, + { + "epoch": 0.5799501424501424, + "grad_norm": 0.5139963626861572, + "learning_rate": 0.0001899095184003472, + "loss": 0.9717, + "step": 3257 + }, + { + "epoch": 0.5801282051282052, + "grad_norm": 0.5167285799980164, + "learning_rate": 0.00018990339008156219, + "loss": 1.1529, + "step": 3258 + }, + { + "epoch": 0.5803062678062678, + "grad_norm": 0.53471440076828, + "learning_rate": 0.00018989726000130704, + "loss": 1.0711, + "step": 3259 + }, + { + "epoch": 0.5804843304843305, + "grad_norm": 0.49875229597091675, + "learning_rate": 0.0001898911281597018, + "loss": 1.1095, + "step": 3260 + }, + { + "epoch": 0.5806623931623932, + "grad_norm": 0.4473155438899994, + "learning_rate": 0.00018988499455686663, + "loss": 0.836, + "step": 3261 + }, + { + "epoch": 0.5808404558404558, + "grad_norm": 0.6181996464729309, + "learning_rate": 0.00018987885919292174, + "loss": 1.2787, + "step": 3262 + }, + { + "epoch": 0.5810185185185185, + "grad_norm": 0.4996899664402008, + "learning_rate": 0.00018987272206798733, + "loss": 1.2132, + "step": 3263 + }, + { + "epoch": 0.5811965811965812, + "grad_norm": 0.49979713559150696, + "learning_rate": 0.00018986658318218358, + "loss": 0.8388, 
+ "step": 3264 + }, + { + "epoch": 0.5813746438746439, + "grad_norm": 0.5288876295089722, + "learning_rate": 0.00018986044253563084, + "loss": 1.1871, + "step": 3265 + }, + { + "epoch": 0.5815527065527065, + "grad_norm": 0.534063458442688, + "learning_rate": 0.00018985430012844937, + "loss": 0.96, + "step": 3266 + }, + { + "epoch": 0.5817307692307693, + "grad_norm": 0.5081285834312439, + "learning_rate": 0.00018984815596075953, + "loss": 1.1577, + "step": 3267 + }, + { + "epoch": 0.5819088319088319, + "grad_norm": 0.5648202896118164, + "learning_rate": 0.00018984201003268176, + "loss": 1.2235, + "step": 3268 + }, + { + "epoch": 0.5820868945868946, + "grad_norm": 0.495061993598938, + "learning_rate": 0.00018983586234433642, + "loss": 1.056, + "step": 3269 + }, + { + "epoch": 0.5822649572649573, + "grad_norm": 0.47149857878685, + "learning_rate": 0.000189829712895844, + "loss": 1.0844, + "step": 3270 + }, + { + "epoch": 0.58244301994302, + "grad_norm": 0.6107062697410583, + "learning_rate": 0.00018982356168732492, + "loss": 0.9868, + "step": 3271 + }, + { + "epoch": 0.5826210826210826, + "grad_norm": 0.7355940341949463, + "learning_rate": 0.00018981740871889974, + "loss": 1.1448, + "step": 3272 + }, + { + "epoch": 0.5827991452991453, + "grad_norm": 0.5950441956520081, + "learning_rate": 0.00018981125399068907, + "loss": 0.9618, + "step": 3273 + }, + { + "epoch": 0.582977207977208, + "grad_norm": 0.47607290744781494, + "learning_rate": 0.0001898050975028134, + "loss": 0.957, + "step": 3274 + }, + { + "epoch": 0.5831552706552706, + "grad_norm": 0.541164755821228, + "learning_rate": 0.00018979893925539338, + "loss": 1.1426, + "step": 3275 + }, + { + "epoch": 0.5833333333333334, + "grad_norm": 0.5240640044212341, + "learning_rate": 0.00018979277924854974, + "loss": 1.1421, + "step": 3276 + }, + { + "epoch": 0.583511396011396, + "grad_norm": 0.48155727982521057, + "learning_rate": 0.00018978661748240307, + "loss": 1.0069, + "step": 3277 + }, + { + "epoch": 
0.5836894586894587, + "grad_norm": 0.5559938549995422, + "learning_rate": 0.00018978045395707418, + "loss": 1.1227, + "step": 3278 + }, + { + "epoch": 0.5838675213675214, + "grad_norm": 0.5244291424751282, + "learning_rate": 0.0001897742886726838, + "loss": 1.1103, + "step": 3279 + }, + { + "epoch": 0.584045584045584, + "grad_norm": 0.5277758240699768, + "learning_rate": 0.00018976812162935268, + "loss": 1.2125, + "step": 3280 + }, + { + "epoch": 0.5842236467236467, + "grad_norm": 0.5415039658546448, + "learning_rate": 0.00018976195282720173, + "loss": 1.146, + "step": 3281 + }, + { + "epoch": 0.5844017094017094, + "grad_norm": 0.5152051448822021, + "learning_rate": 0.00018975578226635177, + "loss": 1.0092, + "step": 3282 + }, + { + "epoch": 0.5845797720797721, + "grad_norm": 0.5489452481269836, + "learning_rate": 0.00018974960994692371, + "loss": 1.2425, + "step": 3283 + }, + { + "epoch": 0.5847578347578347, + "grad_norm": 0.491274356842041, + "learning_rate": 0.00018974343586903848, + "loss": 0.9559, + "step": 3284 + }, + { + "epoch": 0.5849358974358975, + "grad_norm": 0.5783739686012268, + "learning_rate": 0.00018973726003281707, + "loss": 1.1971, + "step": 3285 + }, + { + "epoch": 0.5851139601139601, + "grad_norm": 0.5056472420692444, + "learning_rate": 0.00018973108243838045, + "loss": 1.0313, + "step": 3286 + }, + { + "epoch": 0.5852920227920227, + "grad_norm": 0.4939729571342468, + "learning_rate": 0.00018972490308584962, + "loss": 1.1061, + "step": 3287 + }, + { + "epoch": 0.5854700854700855, + "grad_norm": 0.4889580011367798, + "learning_rate": 0.00018971872197534576, + "loss": 0.9157, + "step": 3288 + }, + { + "epoch": 0.5856481481481481, + "grad_norm": 0.40889349579811096, + "learning_rate": 0.00018971253910698993, + "loss": 0.8083, + "step": 3289 + }, + { + "epoch": 0.5858262108262108, + "grad_norm": 0.5221503973007202, + "learning_rate": 0.00018970635448090322, + "loss": 0.9995, + "step": 3290 + }, + { + "epoch": 0.5860042735042735, + "grad_norm": 
0.47060561180114746, + "learning_rate": 0.00018970016809720687, + "loss": 0.9738, + "step": 3291 + }, + { + "epoch": 0.5861823361823362, + "grad_norm": 0.6083170771598816, + "learning_rate": 0.000189693979956022, + "loss": 1.188, + "step": 3292 + }, + { + "epoch": 0.5863603988603988, + "grad_norm": 0.4696751534938812, + "learning_rate": 0.00018968779005746998, + "loss": 1.089, + "step": 3293 + }, + { + "epoch": 0.5865384615384616, + "grad_norm": 0.5081014633178711, + "learning_rate": 0.00018968159840167202, + "loss": 1.1869, + "step": 3294 + }, + { + "epoch": 0.5867165242165242, + "grad_norm": 0.48042431473731995, + "learning_rate": 0.0001896754049887494, + "loss": 0.964, + "step": 3295 + }, + { + "epoch": 0.5868945868945868, + "grad_norm": 0.5075193643569946, + "learning_rate": 0.00018966920981882353, + "loss": 1.1884, + "step": 3296 + }, + { + "epoch": 0.5870726495726496, + "grad_norm": 0.5734842419624329, + "learning_rate": 0.00018966301289201576, + "loss": 1.1475, + "step": 3297 + }, + { + "epoch": 0.5872507122507122, + "grad_norm": 0.5525311231613159, + "learning_rate": 0.00018965681420844753, + "loss": 1.241, + "step": 3298 + }, + { + "epoch": 0.5874287749287749, + "grad_norm": 0.48142680525779724, + "learning_rate": 0.00018965061376824025, + "loss": 1.0871, + "step": 3299 + }, + { + "epoch": 0.5876068376068376, + "grad_norm": 0.5360350608825684, + "learning_rate": 0.00018964441157151544, + "loss": 1.1895, + "step": 3300 + }, + { + "epoch": 0.5877849002849003, + "grad_norm": 0.5207685232162476, + "learning_rate": 0.00018963820761839457, + "loss": 0.9323, + "step": 3301 + }, + { + "epoch": 0.5879629629629629, + "grad_norm": 0.453620970249176, + "learning_rate": 0.00018963200190899926, + "loss": 0.802, + "step": 3302 + }, + { + "epoch": 0.5881410256410257, + "grad_norm": 0.5198796391487122, + "learning_rate": 0.00018962579444345106, + "loss": 1.0243, + "step": 3303 + }, + { + "epoch": 0.5883190883190883, + "grad_norm": 0.5597525835037231, + "learning_rate": 
0.0001896195852218716, + "loss": 0.9351, + "step": 3304 + }, + { + "epoch": 0.5884971509971509, + "grad_norm": 0.5738299489021301, + "learning_rate": 0.00018961337424438254, + "loss": 1.3737, + "step": 3305 + }, + { + "epoch": 0.5886752136752137, + "grad_norm": 0.5569949150085449, + "learning_rate": 0.00018960716151110554, + "loss": 1.0469, + "step": 3306 + }, + { + "epoch": 0.5888532763532763, + "grad_norm": 0.5088010430335999, + "learning_rate": 0.00018960094702216238, + "loss": 1.0982, + "step": 3307 + }, + { + "epoch": 0.5890313390313391, + "grad_norm": 0.5127636790275574, + "learning_rate": 0.0001895947307776748, + "loss": 0.9986, + "step": 3308 + }, + { + "epoch": 0.5892094017094017, + "grad_norm": 0.5160682797431946, + "learning_rate": 0.00018958851277776456, + "loss": 1.0219, + "step": 3309 + }, + { + "epoch": 0.5893874643874644, + "grad_norm": 0.5380711555480957, + "learning_rate": 0.00018958229302255356, + "loss": 1.118, + "step": 3310 + }, + { + "epoch": 0.5895655270655271, + "grad_norm": 0.5571228861808777, + "learning_rate": 0.0001895760715121636, + "loss": 1.0302, + "step": 3311 + }, + { + "epoch": 0.5897435897435898, + "grad_norm": 0.542266309261322, + "learning_rate": 0.00018956984824671657, + "loss": 1.0372, + "step": 3312 + }, + { + "epoch": 0.5899216524216524, + "grad_norm": 0.48350459337234497, + "learning_rate": 0.00018956362322633446, + "loss": 1.2, + "step": 3313 + }, + { + "epoch": 0.5900997150997151, + "grad_norm": 0.5001645088195801, + "learning_rate": 0.0001895573964511392, + "loss": 0.9749, + "step": 3314 + }, + { + "epoch": 0.5902777777777778, + "grad_norm": 0.5227531790733337, + "learning_rate": 0.00018955116792125276, + "loss": 1.025, + "step": 3315 + }, + { + "epoch": 0.5904558404558404, + "grad_norm": 0.522251546382904, + "learning_rate": 0.00018954493763679727, + "loss": 1.0821, + "step": 3316 + }, + { + "epoch": 0.5906339031339032, + "grad_norm": 0.5423251390457153, + "learning_rate": 0.00018953870559789467, + "loss": 1.0961, + 
"step": 3317 + }, + { + "epoch": 0.5908119658119658, + "grad_norm": 0.5615720748901367, + "learning_rate": 0.0001895324718046672, + "loss": 1.1209, + "step": 3318 + }, + { + "epoch": 0.5909900284900285, + "grad_norm": 0.44746771454811096, + "learning_rate": 0.00018952623625723692, + "loss": 0.9935, + "step": 3319 + }, + { + "epoch": 0.5911680911680912, + "grad_norm": 0.5993229150772095, + "learning_rate": 0.00018951999895572597, + "loss": 1.1409, + "step": 3320 + }, + { + "epoch": 0.5913461538461539, + "grad_norm": 0.4969801902770996, + "learning_rate": 0.00018951375990025666, + "loss": 1.1568, + "step": 3321 + }, + { + "epoch": 0.5915242165242165, + "grad_norm": 0.6001267433166504, + "learning_rate": 0.00018950751909095116, + "loss": 1.1135, + "step": 3322 + }, + { + "epoch": 0.5917022792022792, + "grad_norm": 0.5386021733283997, + "learning_rate": 0.00018950127652793172, + "loss": 0.947, + "step": 3323 + }, + { + "epoch": 0.5918803418803419, + "grad_norm": 0.49043843150138855, + "learning_rate": 0.00018949503221132074, + "loss": 0.9581, + "step": 3324 + }, + { + "epoch": 0.5920584045584045, + "grad_norm": 0.5241141319274902, + "learning_rate": 0.00018948878614124048, + "loss": 1.0797, + "step": 3325 + }, + { + "epoch": 0.5922364672364673, + "grad_norm": 0.5755026340484619, + "learning_rate": 0.00018948253831781338, + "loss": 1.1046, + "step": 3326 + }, + { + "epoch": 0.5924145299145299, + "grad_norm": 0.5004449486732483, + "learning_rate": 0.00018947628874116179, + "loss": 1.1416, + "step": 3327 + }, + { + "epoch": 0.5925925925925926, + "grad_norm": 0.53347247838974, + "learning_rate": 0.00018947003741140821, + "loss": 1.2718, + "step": 3328 + }, + { + "epoch": 0.5927706552706553, + "grad_norm": 0.6473469138145447, + "learning_rate": 0.0001894637843286751, + "loss": 1.2255, + "step": 3329 + }, + { + "epoch": 0.592948717948718, + "grad_norm": 0.4750518798828125, + "learning_rate": 0.00018945752949308498, + "loss": 1.0537, + "step": 3330 + }, + { + "epoch": 
0.5931267806267806, + "grad_norm": 0.5636306405067444, + "learning_rate": 0.00018945127290476043, + "loss": 0.9906, + "step": 3331 + }, + { + "epoch": 0.5933048433048433, + "grad_norm": 0.4871736466884613, + "learning_rate": 0.00018944501456382397, + "loss": 1.0549, + "step": 3332 + }, + { + "epoch": 0.593482905982906, + "grad_norm": 0.5554637312889099, + "learning_rate": 0.0001894387544703983, + "loss": 1.1587, + "step": 3333 + }, + { + "epoch": 0.5936609686609686, + "grad_norm": 0.5385799407958984, + "learning_rate": 0.000189432492624606, + "loss": 0.9565, + "step": 3334 + }, + { + "epoch": 0.5938390313390314, + "grad_norm": 0.4996553063392639, + "learning_rate": 0.00018942622902656976, + "loss": 1.0456, + "step": 3335 + }, + { + "epoch": 0.594017094017094, + "grad_norm": 0.46810707449913025, + "learning_rate": 0.00018941996367641237, + "loss": 1.119, + "step": 3336 + }, + { + "epoch": 0.5941951566951567, + "grad_norm": 0.5672653913497925, + "learning_rate": 0.0001894136965742565, + "loss": 1.1317, + "step": 3337 + }, + { + "epoch": 0.5943732193732194, + "grad_norm": 0.4790053367614746, + "learning_rate": 0.00018940742772022504, + "loss": 1.0967, + "step": 3338 + }, + { + "epoch": 0.594551282051282, + "grad_norm": 0.5935906171798706, + "learning_rate": 0.00018940115711444072, + "loss": 1.3044, + "step": 3339 + }, + { + "epoch": 0.5947293447293447, + "grad_norm": 0.4790516793727875, + "learning_rate": 0.00018939488475702647, + "loss": 1.074, + "step": 3340 + }, + { + "epoch": 0.5949074074074074, + "grad_norm": 0.474588006734848, + "learning_rate": 0.00018938861064810516, + "loss": 1.1476, + "step": 3341 + }, + { + "epoch": 0.5950854700854701, + "grad_norm": 0.4908665120601654, + "learning_rate": 0.0001893823347877997, + "loss": 1.216, + "step": 3342 + }, + { + "epoch": 0.5952635327635327, + "grad_norm": 0.531650960445404, + "learning_rate": 0.00018937605717623307, + "loss": 1.1057, + "step": 3343 + }, + { + "epoch": 0.5954415954415955, + "grad_norm": 
0.5581082105636597, + "learning_rate": 0.00018936977781352823, + "loss": 0.7972, + "step": 3344 + }, + { + "epoch": 0.5956196581196581, + "grad_norm": 0.42370662093162537, + "learning_rate": 0.00018936349669980827, + "loss": 0.8888, + "step": 3345 + }, + { + "epoch": 0.5957977207977208, + "grad_norm": 0.5817318558692932, + "learning_rate": 0.00018935721383519624, + "loss": 1.2801, + "step": 3346 + }, + { + "epoch": 0.5959757834757835, + "grad_norm": 0.4766376316547394, + "learning_rate": 0.00018935092921981524, + "loss": 1.0918, + "step": 3347 + }, + { + "epoch": 0.5961538461538461, + "grad_norm": 0.5567346811294556, + "learning_rate": 0.00018934464285378836, + "loss": 1.0269, + "step": 3348 + }, + { + "epoch": 0.5963319088319088, + "grad_norm": 0.5285565257072449, + "learning_rate": 0.0001893383547372388, + "loss": 1.1887, + "step": 3349 + }, + { + "epoch": 0.5965099715099715, + "grad_norm": 0.49052694439888, + "learning_rate": 0.00018933206487028979, + "loss": 1.0773, + "step": 3350 + }, + { + "epoch": 0.5966880341880342, + "grad_norm": 0.6175199151039124, + "learning_rate": 0.0001893257732530645, + "loss": 1.0192, + "step": 3351 + }, + { + "epoch": 0.5968660968660968, + "grad_norm": 0.56049644947052, + "learning_rate": 0.00018931947988568628, + "loss": 0.9516, + "step": 3352 + }, + { + "epoch": 0.5970441595441596, + "grad_norm": 0.47873660922050476, + "learning_rate": 0.00018931318476827838, + "loss": 0.8174, + "step": 3353 + }, + { + "epoch": 0.5972222222222222, + "grad_norm": 0.4748854339122772, + "learning_rate": 0.00018930688790096416, + "loss": 1.0238, + "step": 3354 + }, + { + "epoch": 0.5974002849002849, + "grad_norm": 0.5382232666015625, + "learning_rate": 0.00018930058928386698, + "loss": 1.0815, + "step": 3355 + }, + { + "epoch": 0.5975783475783476, + "grad_norm": 0.5038299560546875, + "learning_rate": 0.00018929428891711027, + "loss": 1.0472, + "step": 3356 + }, + { + "epoch": 0.5977564102564102, + "grad_norm": 0.5185908079147339, + "learning_rate": 
0.00018928798680081744, + "loss": 1.0435, + "step": 3357 + }, + { + "epoch": 0.5979344729344729, + "grad_norm": 0.5169877409934998, + "learning_rate": 0.00018928168293511202, + "loss": 1.0437, + "step": 3358 + }, + { + "epoch": 0.5981125356125356, + "grad_norm": 0.5218369960784912, + "learning_rate": 0.00018927537732011749, + "loss": 1.082, + "step": 3359 + }, + { + "epoch": 0.5982905982905983, + "grad_norm": 0.5358219742774963, + "learning_rate": 0.0001892690699559574, + "loss": 1.2523, + "step": 3360 + }, + { + "epoch": 0.5984686609686609, + "grad_norm": 0.47716647386550903, + "learning_rate": 0.0001892627608427553, + "loss": 1.2069, + "step": 3361 + }, + { + "epoch": 0.5986467236467237, + "grad_norm": 0.5484169125556946, + "learning_rate": 0.00018925644998063482, + "loss": 1.2016, + "step": 3362 + }, + { + "epoch": 0.5988247863247863, + "grad_norm": 0.46814846992492676, + "learning_rate": 0.00018925013736971965, + "loss": 0.7989, + "step": 3363 + }, + { + "epoch": 0.5990028490028491, + "grad_norm": 0.5391258001327515, + "learning_rate": 0.0001892438230101334, + "loss": 1.224, + "step": 3364 + }, + { + "epoch": 0.5991809116809117, + "grad_norm": 0.5248384475708008, + "learning_rate": 0.00018923750690199987, + "loss": 1.1532, + "step": 3365 + }, + { + "epoch": 0.5993589743589743, + "grad_norm": 0.5074637532234192, + "learning_rate": 0.00018923118904544273, + "loss": 1.0968, + "step": 3366 + }, + { + "epoch": 0.5995370370370371, + "grad_norm": 0.5260029435157776, + "learning_rate": 0.00018922486944058581, + "loss": 1.1311, + "step": 3367 + }, + { + "epoch": 0.5997150997150997, + "grad_norm": 0.48497965931892395, + "learning_rate": 0.00018921854808755294, + "loss": 1.1208, + "step": 3368 + }, + { + "epoch": 0.5998931623931624, + "grad_norm": 0.5108651518821716, + "learning_rate": 0.00018921222498646792, + "loss": 1.147, + "step": 3369 + }, + { + "epoch": 0.6000712250712251, + "grad_norm": 0.5243437886238098, + "learning_rate": 0.00018920590013745471, + "loss": 
0.9614, + "step": 3370 + }, + { + "epoch": 0.6002492877492878, + "grad_norm": 0.47022634744644165, + "learning_rate": 0.00018919957354063719, + "loss": 1.0579, + "step": 3371 + }, + { + "epoch": 0.6004273504273504, + "grad_norm": 0.6461413502693176, + "learning_rate": 0.00018919324519613931, + "loss": 1.2126, + "step": 3372 + }, + { + "epoch": 0.6006054131054132, + "grad_norm": 0.4654616713523865, + "learning_rate": 0.00018918691510408508, + "loss": 1.1476, + "step": 3373 + }, + { + "epoch": 0.6007834757834758, + "grad_norm": 0.48571303486824036, + "learning_rate": 0.00018918058326459854, + "loss": 1.2093, + "step": 3374 + }, + { + "epoch": 0.6009615384615384, + "grad_norm": 0.5255016684532166, + "learning_rate": 0.00018917424967780368, + "loss": 1.1538, + "step": 3375 + }, + { + "epoch": 0.6011396011396012, + "grad_norm": 0.5059894323348999, + "learning_rate": 0.00018916791434382468, + "loss": 1.0556, + "step": 3376 + }, + { + "epoch": 0.6013176638176638, + "grad_norm": 0.4581229090690613, + "learning_rate": 0.00018916157726278561, + "loss": 1.1468, + "step": 3377 + }, + { + "epoch": 0.6014957264957265, + "grad_norm": 0.5701818466186523, + "learning_rate": 0.00018915523843481067, + "loss": 1.3641, + "step": 3378 + }, + { + "epoch": 0.6016737891737892, + "grad_norm": 0.5007243752479553, + "learning_rate": 0.00018914889786002403, + "loss": 1.2705, + "step": 3379 + }, + { + "epoch": 0.6018518518518519, + "grad_norm": 0.5192995071411133, + "learning_rate": 0.0001891425555385499, + "loss": 0.9922, + "step": 3380 + }, + { + "epoch": 0.6020299145299145, + "grad_norm": 0.5880612134933472, + "learning_rate": 0.00018913621147051258, + "loss": 0.8783, + "step": 3381 + }, + { + "epoch": 0.6022079772079773, + "grad_norm": 0.5161563158035278, + "learning_rate": 0.0001891298656560364, + "loss": 0.9634, + "step": 3382 + }, + { + "epoch": 0.6023860398860399, + "grad_norm": 0.48450782895088196, + "learning_rate": 0.00018912351809524563, + "loss": 0.809, + "step": 3383 + }, + { + 
"epoch": 0.6025641025641025, + "grad_norm": 0.621537983417511, + "learning_rate": 0.00018911716878826465, + "loss": 1.2031, + "step": 3384 + }, + { + "epoch": 0.6027421652421653, + "grad_norm": 0.6014544367790222, + "learning_rate": 0.00018911081773521787, + "loss": 1.1552, + "step": 3385 + }, + { + "epoch": 0.6029202279202279, + "grad_norm": 0.49995481967926025, + "learning_rate": 0.00018910446493622976, + "loss": 0.8569, + "step": 3386 + }, + { + "epoch": 0.6030982905982906, + "grad_norm": 0.5157307386398315, + "learning_rate": 0.00018909811039142472, + "loss": 0.9515, + "step": 3387 + }, + { + "epoch": 0.6032763532763533, + "grad_norm": 0.5164140462875366, + "learning_rate": 0.0001890917541009273, + "loss": 0.9803, + "step": 3388 + }, + { + "epoch": 0.603454415954416, + "grad_norm": 0.5555596947669983, + "learning_rate": 0.00018908539606486206, + "loss": 1.2994, + "step": 3389 + }, + { + "epoch": 0.6036324786324786, + "grad_norm": 0.605697512626648, + "learning_rate": 0.00018907903628335353, + "loss": 1.2865, + "step": 3390 + }, + { + "epoch": 0.6038105413105413, + "grad_norm": 0.5700713992118835, + "learning_rate": 0.0001890726747565263, + "loss": 1.2493, + "step": 3391 + }, + { + "epoch": 0.603988603988604, + "grad_norm": 0.5516746044158936, + "learning_rate": 0.0001890663114845051, + "loss": 1.2743, + "step": 3392 + }, + { + "epoch": 0.6041666666666666, + "grad_norm": 0.5233162641525269, + "learning_rate": 0.0001890599464674145, + "loss": 0.9237, + "step": 3393 + }, + { + "epoch": 0.6043447293447294, + "grad_norm": 0.5709942579269409, + "learning_rate": 0.00018905357970537925, + "loss": 0.9922, + "step": 3394 + }, + { + "epoch": 0.604522792022792, + "grad_norm": 0.48403796553611755, + "learning_rate": 0.0001890472111985241, + "loss": 1.1255, + "step": 3395 + }, + { + "epoch": 0.6047008547008547, + "grad_norm": 0.628718376159668, + "learning_rate": 0.00018904084094697386, + "loss": 1.1458, + "step": 3396 + }, + { + "epoch": 0.6048789173789174, + "grad_norm": 
0.46822869777679443, + "learning_rate": 0.00018903446895085328, + "loss": 0.8727, + "step": 3397 + }, + { + "epoch": 0.60505698005698, + "grad_norm": 0.505584180355072, + "learning_rate": 0.00018902809521028724, + "loss": 1.1595, + "step": 3398 + }, + { + "epoch": 0.6052350427350427, + "grad_norm": 0.4494974911212921, + "learning_rate": 0.00018902171972540058, + "loss": 0.6685, + "step": 3399 + }, + { + "epoch": 0.6054131054131054, + "grad_norm": 0.5101519227027893, + "learning_rate": 0.0001890153424963183, + "loss": 0.9313, + "step": 3400 + }, + { + "epoch": 0.6055911680911681, + "grad_norm": 0.5081079602241516, + "learning_rate": 0.00018900896352316528, + "loss": 1.2588, + "step": 3401 + }, + { + "epoch": 0.6057692307692307, + "grad_norm": 0.5784309506416321, + "learning_rate": 0.00018900258280606653, + "loss": 1.2077, + "step": 3402 + }, + { + "epoch": 0.6059472934472935, + "grad_norm": 0.4506312608718872, + "learning_rate": 0.00018899620034514705, + "loss": 1.05, + "step": 3403 + }, + { + "epoch": 0.6061253561253561, + "grad_norm": 0.5243048071861267, + "learning_rate": 0.0001889898161405319, + "loss": 1.2295, + "step": 3404 + }, + { + "epoch": 0.6063034188034188, + "grad_norm": 0.5447196364402771, + "learning_rate": 0.00018898343019234615, + "loss": 1.1476, + "step": 3405 + }, + { + "epoch": 0.6064814814814815, + "grad_norm": 0.46813663840293884, + "learning_rate": 0.00018897704250071492, + "loss": 1.2113, + "step": 3406 + }, + { + "epoch": 0.6066595441595442, + "grad_norm": 0.5340631604194641, + "learning_rate": 0.00018897065306576342, + "loss": 1.1656, + "step": 3407 + }, + { + "epoch": 0.6068376068376068, + "grad_norm": 0.513708233833313, + "learning_rate": 0.00018896426188761675, + "loss": 1.1616, + "step": 3408 + }, + { + "epoch": 0.6070156695156695, + "grad_norm": 0.594601035118103, + "learning_rate": 0.00018895786896640023, + "loss": 1.2564, + "step": 3409 + }, + { + "epoch": 0.6071937321937322, + "grad_norm": 0.45067599415779114, + "learning_rate": 
0.000188951474302239, + "loss": 1.0107, + "step": 3410 + }, + { + "epoch": 0.6073717948717948, + "grad_norm": 0.5394250750541687, + "learning_rate": 0.00018894507789525843, + "loss": 1.4081, + "step": 3411 + }, + { + "epoch": 0.6075498575498576, + "grad_norm": 0.5612049102783203, + "learning_rate": 0.00018893867974558383, + "loss": 1.1015, + "step": 3412 + }, + { + "epoch": 0.6077279202279202, + "grad_norm": 0.4794061779975891, + "learning_rate": 0.00018893227985334056, + "loss": 1.2103, + "step": 3413 + }, + { + "epoch": 0.6079059829059829, + "grad_norm": 0.6060562133789062, + "learning_rate": 0.00018892587821865402, + "loss": 1.3693, + "step": 3414 + }, + { + "epoch": 0.6080840455840456, + "grad_norm": 0.44624534249305725, + "learning_rate": 0.00018891947484164963, + "loss": 0.8209, + "step": 3415 + }, + { + "epoch": 0.6082621082621082, + "grad_norm": 0.49297213554382324, + "learning_rate": 0.0001889130697224528, + "loss": 1.2027, + "step": 3416 + }, + { + "epoch": 0.6084401709401709, + "grad_norm": 0.4431746304035187, + "learning_rate": 0.0001889066628611891, + "loss": 1.0347, + "step": 3417 + }, + { + "epoch": 0.6086182336182336, + "grad_norm": 0.5425933599472046, + "learning_rate": 0.00018890025425798404, + "loss": 1.0556, + "step": 3418 + }, + { + "epoch": 0.6087962962962963, + "grad_norm": 0.5502763390541077, + "learning_rate": 0.00018889384391296315, + "loss": 1.2362, + "step": 3419 + }, + { + "epoch": 0.6089743589743589, + "grad_norm": 0.5442292094230652, + "learning_rate": 0.00018888743182625203, + "loss": 1.1306, + "step": 3420 + }, + { + "epoch": 0.6091524216524217, + "grad_norm": 0.4651123583316803, + "learning_rate": 0.00018888101799797636, + "loss": 0.9305, + "step": 3421 + }, + { + "epoch": 0.6093304843304843, + "grad_norm": 0.4713892340660095, + "learning_rate": 0.00018887460242826177, + "loss": 1.0789, + "step": 3422 + }, + { + "epoch": 0.6095085470085471, + "grad_norm": 0.5283244848251343, + "learning_rate": 0.00018886818511723398, + "loss": 
1.345, + "step": 3423 + }, + { + "epoch": 0.6096866096866097, + "grad_norm": 0.5527324080467224, + "learning_rate": 0.0001888617660650187, + "loss": 1.1297, + "step": 3424 + }, + { + "epoch": 0.6098646723646723, + "grad_norm": 0.5412901043891907, + "learning_rate": 0.00018885534527174168, + "loss": 1.1213, + "step": 3425 + }, + { + "epoch": 0.6100427350427351, + "grad_norm": 0.5295354127883911, + "learning_rate": 0.00018884892273752878, + "loss": 1.1217, + "step": 3426 + }, + { + "epoch": 0.6102207977207977, + "grad_norm": 0.461900532245636, + "learning_rate": 0.0001888424984625058, + "loss": 0.827, + "step": 3427 + }, + { + "epoch": 0.6103988603988604, + "grad_norm": 0.4922671616077423, + "learning_rate": 0.00018883607244679865, + "loss": 1.2216, + "step": 3428 + }, + { + "epoch": 0.6105769230769231, + "grad_norm": 0.5080927014350891, + "learning_rate": 0.00018882964469053317, + "loss": 1.2446, + "step": 3429 + }, + { + "epoch": 0.6107549857549858, + "grad_norm": 0.5523943901062012, + "learning_rate": 0.00018882321519383534, + "loss": 1.3346, + "step": 3430 + }, + { + "epoch": 0.6109330484330484, + "grad_norm": 0.5105271935462952, + "learning_rate": 0.0001888167839568311, + "loss": 1.1311, + "step": 3431 + }, + { + "epoch": 0.6111111111111112, + "grad_norm": 0.5635872483253479, + "learning_rate": 0.0001888103509796465, + "loss": 1.1875, + "step": 3432 + }, + { + "epoch": 0.6112891737891738, + "grad_norm": 0.4619547426700592, + "learning_rate": 0.00018880391626240755, + "loss": 0.9176, + "step": 3433 + }, + { + "epoch": 0.6114672364672364, + "grad_norm": 0.5896356105804443, + "learning_rate": 0.00018879747980524034, + "loss": 1.0251, + "step": 3434 + }, + { + "epoch": 0.6116452991452992, + "grad_norm": 0.49062737822532654, + "learning_rate": 0.000188791041608271, + "loss": 1.1598, + "step": 3435 + }, + { + "epoch": 0.6118233618233618, + "grad_norm": 0.45717164874076843, + "learning_rate": 0.00018878460167162558, + "loss": 0.8647, + "step": 3436 + }, + { + "epoch": 
0.6120014245014245, + "grad_norm": 0.5903525352478027, + "learning_rate": 0.00018877815999543038, + "loss": 0.9671, + "step": 3437 + }, + { + "epoch": 0.6121794871794872, + "grad_norm": 0.5315384268760681, + "learning_rate": 0.00018877171657981153, + "loss": 1.1759, + "step": 3438 + }, + { + "epoch": 0.6123575498575499, + "grad_norm": 0.5650150775909424, + "learning_rate": 0.0001887652714248953, + "loss": 1.0128, + "step": 3439 + }, + { + "epoch": 0.6125356125356125, + "grad_norm": 0.49841752648353577, + "learning_rate": 0.000188758824530808, + "loss": 1.1259, + "step": 3440 + }, + { + "epoch": 0.6127136752136753, + "grad_norm": 0.4985620975494385, + "learning_rate": 0.00018875237589767593, + "loss": 1.0158, + "step": 3441 + }, + { + "epoch": 0.6128917378917379, + "grad_norm": 0.45266565680503845, + "learning_rate": 0.00018874592552562536, + "loss": 0.93, + "step": 3442 + }, + { + "epoch": 0.6130698005698005, + "grad_norm": 0.5696130990982056, + "learning_rate": 0.00018873947341478274, + "loss": 1.1432, + "step": 3443 + }, + { + "epoch": 0.6132478632478633, + "grad_norm": 0.5211645364761353, + "learning_rate": 0.00018873301956527451, + "loss": 1.1317, + "step": 3444 + }, + { + "epoch": 0.6134259259259259, + "grad_norm": 0.4991866946220398, + "learning_rate": 0.00018872656397722707, + "loss": 1.0362, + "step": 3445 + }, + { + "epoch": 0.6136039886039886, + "grad_norm": 0.5109508037567139, + "learning_rate": 0.00018872010665076694, + "loss": 1.2728, + "step": 3446 + }, + { + "epoch": 0.6137820512820513, + "grad_norm": 0.5838373899459839, + "learning_rate": 0.00018871364758602058, + "loss": 1.1131, + "step": 3447 + }, + { + "epoch": 0.613960113960114, + "grad_norm": 0.5139824151992798, + "learning_rate": 0.00018870718678311462, + "loss": 1.238, + "step": 3448 + }, + { + "epoch": 0.6141381766381766, + "grad_norm": 0.4852082431316376, + "learning_rate": 0.00018870072424217562, + "loss": 1.0677, + "step": 3449 + }, + { + "epoch": 0.6143162393162394, + "grad_norm": 
0.5312315225601196, + "learning_rate": 0.00018869425996333018, + "loss": 1.178, + "step": 3450 + }, + { + "epoch": 0.614494301994302, + "grad_norm": 0.6343565583229065, + "learning_rate": 0.00018868779394670492, + "loss": 0.8839, + "step": 3451 + }, + { + "epoch": 0.6146723646723646, + "grad_norm": 0.6029773950576782, + "learning_rate": 0.00018868132619242662, + "loss": 1.1188, + "step": 3452 + }, + { + "epoch": 0.6148504273504274, + "grad_norm": 0.5246016383171082, + "learning_rate": 0.00018867485670062193, + "loss": 1.0797, + "step": 3453 + }, + { + "epoch": 0.61502849002849, + "grad_norm": 0.49307698011398315, + "learning_rate": 0.00018866838547141763, + "loss": 0.9749, + "step": 3454 + }, + { + "epoch": 0.6152065527065527, + "grad_norm": 0.5232903361320496, + "learning_rate": 0.00018866191250494052, + "loss": 1.0785, + "step": 3455 + }, + { + "epoch": 0.6153846153846154, + "grad_norm": 0.5545645356178284, + "learning_rate": 0.0001886554378013174, + "loss": 1.0496, + "step": 3456 + }, + { + "epoch": 0.6155626780626781, + "grad_norm": 0.493945837020874, + "learning_rate": 0.00018864896136067515, + "loss": 0.9248, + "step": 3457 + }, + { + "epoch": 0.6157407407407407, + "grad_norm": 0.5223548412322998, + "learning_rate": 0.00018864248318314065, + "loss": 1.0617, + "step": 3458 + }, + { + "epoch": 0.6159188034188035, + "grad_norm": 0.5666514039039612, + "learning_rate": 0.00018863600326884082, + "loss": 0.9981, + "step": 3459 + }, + { + "epoch": 0.6160968660968661, + "grad_norm": 0.4648127257823944, + "learning_rate": 0.00018862952161790265, + "loss": 0.917, + "step": 3460 + }, + { + "epoch": 0.6162749287749287, + "grad_norm": 0.590326189994812, + "learning_rate": 0.0001886230382304531, + "loss": 1.044, + "step": 3461 + }, + { + "epoch": 0.6164529914529915, + "grad_norm": 0.5511625409126282, + "learning_rate": 0.00018861655310661925, + "loss": 1.0988, + "step": 3462 + }, + { + "epoch": 0.6166310541310541, + "grad_norm": 0.567182183265686, + "learning_rate": 
0.0001886100662465281, + "loss": 1.3017, + "step": 3463 + }, + { + "epoch": 0.6168091168091168, + "grad_norm": 0.5708897709846497, + "learning_rate": 0.0001886035776503068, + "loss": 0.9123, + "step": 3464 + }, + { + "epoch": 0.6169871794871795, + "grad_norm": 0.4945180416107178, + "learning_rate": 0.0001885970873180824, + "loss": 1.1645, + "step": 3465 + }, + { + "epoch": 0.6171652421652422, + "grad_norm": 0.4713336229324341, + "learning_rate": 0.00018859059524998215, + "loss": 1.0546, + "step": 3466 + }, + { + "epoch": 0.6173433048433048, + "grad_norm": 0.532859206199646, + "learning_rate": 0.0001885841014461332, + "loss": 1.0795, + "step": 3467 + }, + { + "epoch": 0.6175213675213675, + "grad_norm": 0.5165733695030212, + "learning_rate": 0.00018857760590666284, + "loss": 1.1284, + "step": 3468 + }, + { + "epoch": 0.6176994301994302, + "grad_norm": 0.48623126745224, + "learning_rate": 0.00018857110863169826, + "loss": 0.8618, + "step": 3469 + }, + { + "epoch": 0.6178774928774928, + "grad_norm": 0.628559947013855, + "learning_rate": 0.0001885646096213668, + "loss": 1.1089, + "step": 3470 + }, + { + "epoch": 0.6180555555555556, + "grad_norm": 0.503545880317688, + "learning_rate": 0.0001885581088757958, + "loss": 1.2311, + "step": 3471 + }, + { + "epoch": 0.6182336182336182, + "grad_norm": 0.6172101497650146, + "learning_rate": 0.00018855160639511264, + "loss": 1.2651, + "step": 3472 + }, + { + "epoch": 0.6184116809116809, + "grad_norm": 0.49572527408599854, + "learning_rate": 0.00018854510217944465, + "loss": 1.1026, + "step": 3473 + }, + { + "epoch": 0.6185897435897436, + "grad_norm": 0.5373549461364746, + "learning_rate": 0.00018853859622891938, + "loss": 1.2562, + "step": 3474 + }, + { + "epoch": 0.6187678062678063, + "grad_norm": 0.5272396206855774, + "learning_rate": 0.0001885320885436642, + "loss": 1.1763, + "step": 3475 + }, + { + "epoch": 0.6189458689458689, + "grad_norm": 0.46584269404411316, + "learning_rate": 0.00018852557912380665, + "loss": 1.1762, + 
"step": 3476 + }, + { + "epoch": 0.6191239316239316, + "grad_norm": 0.4798245131969452, + "learning_rate": 0.0001885190679694743, + "loss": 0.9229, + "step": 3477 + }, + { + "epoch": 0.6193019943019943, + "grad_norm": 0.5221366286277771, + "learning_rate": 0.0001885125550807947, + "loss": 1.1078, + "step": 3478 + }, + { + "epoch": 0.6194800569800569, + "grad_norm": 0.5051897168159485, + "learning_rate": 0.0001885060404578954, + "loss": 1.0055, + "step": 3479 + }, + { + "epoch": 0.6196581196581197, + "grad_norm": 0.492662250995636, + "learning_rate": 0.00018849952410090413, + "loss": 1.1172, + "step": 3480 + }, + { + "epoch": 0.6198361823361823, + "grad_norm": 0.4906775951385498, + "learning_rate": 0.00018849300600994853, + "loss": 1.1223, + "step": 3481 + }, + { + "epoch": 0.6200142450142451, + "grad_norm": 0.5032641291618347, + "learning_rate": 0.0001884864861851563, + "loss": 0.9541, + "step": 3482 + }, + { + "epoch": 0.6201923076923077, + "grad_norm": 0.5262296795845032, + "learning_rate": 0.00018847996462665521, + "loss": 1.021, + "step": 3483 + }, + { + "epoch": 0.6203703703703703, + "grad_norm": 0.5253522992134094, + "learning_rate": 0.00018847344133457295, + "loss": 0.9075, + "step": 3484 + }, + { + "epoch": 0.6205484330484331, + "grad_norm": 0.4204299747943878, + "learning_rate": 0.00018846691630903744, + "loss": 0.895, + "step": 3485 + }, + { + "epoch": 0.6207264957264957, + "grad_norm": 0.557604193687439, + "learning_rate": 0.0001884603895501765, + "loss": 1.1758, + "step": 3486 + }, + { + "epoch": 0.6209045584045584, + "grad_norm": 0.5981321930885315, + "learning_rate": 0.00018845386105811795, + "loss": 1.1087, + "step": 3487 + }, + { + "epoch": 0.6210826210826211, + "grad_norm": 0.5285581946372986, + "learning_rate": 0.00018844733083298975, + "loss": 1.0692, + "step": 3488 + }, + { + "epoch": 0.6212606837606838, + "grad_norm": 0.5403170585632324, + "learning_rate": 0.00018844079887491986, + "loss": 1.1998, + "step": 3489 + }, + { + "epoch": 
0.6214387464387464, + "grad_norm": 0.5471615791320801, + "learning_rate": 0.0001884342651840362, + "loss": 0.9556, + "step": 3490 + }, + { + "epoch": 0.6216168091168092, + "grad_norm": 0.6126871705055237, + "learning_rate": 0.00018842772976046686, + "loss": 1.2629, + "step": 3491 + }, + { + "epoch": 0.6217948717948718, + "grad_norm": 0.45669353008270264, + "learning_rate": 0.00018842119260433982, + "loss": 1.0203, + "step": 3492 + }, + { + "epoch": 0.6219729344729344, + "grad_norm": 0.4998520612716675, + "learning_rate": 0.0001884146537157832, + "loss": 1.0271, + "step": 3493 + }, + { + "epoch": 0.6221509971509972, + "grad_norm": 0.5820242166519165, + "learning_rate": 0.00018840811309492507, + "loss": 1.0321, + "step": 3494 + }, + { + "epoch": 0.6223290598290598, + "grad_norm": 0.581676185131073, + "learning_rate": 0.00018840157074189367, + "loss": 0.9219, + "step": 3495 + }, + { + "epoch": 0.6225071225071225, + "grad_norm": 0.6044120788574219, + "learning_rate": 0.0001883950266568171, + "loss": 1.1621, + "step": 3496 + }, + { + "epoch": 0.6226851851851852, + "grad_norm": 0.5448858737945557, + "learning_rate": 0.0001883884808398236, + "loss": 1.0686, + "step": 3497 + }, + { + "epoch": 0.6228632478632479, + "grad_norm": 0.4921551048755646, + "learning_rate": 0.00018838193329104143, + "loss": 1.2259, + "step": 3498 + }, + { + "epoch": 0.6230413105413105, + "grad_norm": 0.5374335646629333, + "learning_rate": 0.00018837538401059888, + "loss": 1.2608, + "step": 3499 + }, + { + "epoch": 0.6232193732193733, + "grad_norm": 0.5123008489608765, + "learning_rate": 0.0001883688329986243, + "loss": 0.8682, + "step": 3500 + }, + { + "epoch": 0.6233974358974359, + "grad_norm": 0.566145122051239, + "learning_rate": 0.00018836228025524595, + "loss": 1.1807, + "step": 3501 + }, + { + "epoch": 0.6235754985754985, + "grad_norm": 0.6658587455749512, + "learning_rate": 0.00018835572578059233, + "loss": 1.1641, + "step": 3502 + }, + { + "epoch": 0.6237535612535613, + "grad_norm": 
0.4992465078830719, + "learning_rate": 0.00018834916957479177, + "loss": 0.9125, + "step": 3503 + }, + { + "epoch": 0.6239316239316239, + "grad_norm": 0.5081812739372253, + "learning_rate": 0.00018834261163797278, + "loss": 1.0939, + "step": 3504 + }, + { + "epoch": 0.6241096866096866, + "grad_norm": 0.5168607234954834, + "learning_rate": 0.0001883360519702638, + "loss": 1.2382, + "step": 3505 + }, + { + "epoch": 0.6242877492877493, + "grad_norm": 0.5517697334289551, + "learning_rate": 0.00018832949057179344, + "loss": 1.206, + "step": 3506 + }, + { + "epoch": 0.624465811965812, + "grad_norm": 0.4505497217178345, + "learning_rate": 0.00018832292744269013, + "loss": 0.8485, + "step": 3507 + }, + { + "epoch": 0.6246438746438746, + "grad_norm": 0.5230690240859985, + "learning_rate": 0.0001883163625830826, + "loss": 1.1701, + "step": 3508 + }, + { + "epoch": 0.6248219373219374, + "grad_norm": 0.5062205195426941, + "learning_rate": 0.00018830979599309937, + "loss": 1.0602, + "step": 3509 + }, + { + "epoch": 0.625, + "grad_norm": 0.49922460317611694, + "learning_rate": 0.00018830322767286913, + "loss": 1.1937, + "step": 3510 + }, + { + "epoch": 0.6251780626780626, + "grad_norm": 0.4637366831302643, + "learning_rate": 0.0001882966576225206, + "loss": 1.038, + "step": 3511 + }, + { + "epoch": 0.6253561253561254, + "grad_norm": 0.5330080389976501, + "learning_rate": 0.00018829008584218246, + "loss": 0.9308, + "step": 3512 + }, + { + "epoch": 0.625534188034188, + "grad_norm": 0.5443428754806519, + "learning_rate": 0.0001882835123319835, + "loss": 1.0006, + "step": 3513 + }, + { + "epoch": 0.6257122507122507, + "grad_norm": 0.5534018874168396, + "learning_rate": 0.00018827693709205253, + "loss": 1.2383, + "step": 3514 + }, + { + "epoch": 0.6258903133903134, + "grad_norm": 0.49207547307014465, + "learning_rate": 0.00018827036012251832, + "loss": 0.9804, + "step": 3515 + }, + { + "epoch": 0.6260683760683761, + "grad_norm": 0.4900086224079132, + "learning_rate": 
0.0001882637814235098, + "loss": 1.012, + "step": 3516 + }, + { + "epoch": 0.6262464387464387, + "grad_norm": 0.5267475247383118, + "learning_rate": 0.00018825720099515585, + "loss": 1.1104, + "step": 3517 + }, + { + "epoch": 0.6264245014245015, + "grad_norm": 0.5711902379989624, + "learning_rate": 0.00018825061883758534, + "loss": 1.0616, + "step": 3518 + }, + { + "epoch": 0.6266025641025641, + "grad_norm": 0.5007771849632263, + "learning_rate": 0.0001882440349509273, + "loss": 0.9578, + "step": 3519 + }, + { + "epoch": 0.6267806267806267, + "grad_norm": 0.5657192468643188, + "learning_rate": 0.00018823744933531075, + "loss": 1.2768, + "step": 3520 + }, + { + "epoch": 0.6269586894586895, + "grad_norm": 0.6077173352241516, + "learning_rate": 0.00018823086199086462, + "loss": 1.147, + "step": 3521 + }, + { + "epoch": 0.6271367521367521, + "grad_norm": 0.5114718079566956, + "learning_rate": 0.000188224272917718, + "loss": 1.1176, + "step": 3522 + }, + { + "epoch": 0.6273148148148148, + "grad_norm": 0.4831676185131073, + "learning_rate": 0.0001882176821160001, + "loss": 0.8021, + "step": 3523 + }, + { + "epoch": 0.6274928774928775, + "grad_norm": 0.6327390670776367, + "learning_rate": 0.00018821108958583994, + "loss": 0.9449, + "step": 3524 + }, + { + "epoch": 0.6276709401709402, + "grad_norm": 0.5541796684265137, + "learning_rate": 0.00018820449532736672, + "loss": 1.2018, + "step": 3525 + }, + { + "epoch": 0.6278490028490028, + "grad_norm": 0.5224639773368835, + "learning_rate": 0.00018819789934070968, + "loss": 1.0138, + "step": 3526 + }, + { + "epoch": 0.6280270655270656, + "grad_norm": 0.49359360337257385, + "learning_rate": 0.00018819130162599798, + "loss": 1.0768, + "step": 3527 + }, + { + "epoch": 0.6282051282051282, + "grad_norm": 0.5525050759315491, + "learning_rate": 0.00018818470218336092, + "loss": 1.0883, + "step": 3528 + }, + { + "epoch": 0.6283831908831908, + "grad_norm": 0.5563427209854126, + "learning_rate": 0.00018817810101292787, + "loss": 1.1491, 
+ "step": 3529 + }, + { + "epoch": 0.6285612535612536, + "grad_norm": 0.49363306164741516, + "learning_rate": 0.00018817149811482803, + "loss": 1.1409, + "step": 3530 + }, + { + "epoch": 0.6287393162393162, + "grad_norm": 0.5102340579032898, + "learning_rate": 0.00018816489348919086, + "loss": 1.1914, + "step": 3531 + }, + { + "epoch": 0.6289173789173789, + "grad_norm": 0.5173332691192627, + "learning_rate": 0.00018815828713614576, + "loss": 0.9308, + "step": 3532 + }, + { + "epoch": 0.6290954415954416, + "grad_norm": 0.5093010067939758, + "learning_rate": 0.00018815167905582216, + "loss": 0.9429, + "step": 3533 + }, + { + "epoch": 0.6292735042735043, + "grad_norm": 0.5453153848648071, + "learning_rate": 0.00018814506924834954, + "loss": 1.0147, + "step": 3534 + }, + { + "epoch": 0.6294515669515669, + "grad_norm": 0.5850773453712463, + "learning_rate": 0.00018813845771385737, + "loss": 1.3372, + "step": 3535 + }, + { + "epoch": 0.6296296296296297, + "grad_norm": 0.5095621943473816, + "learning_rate": 0.00018813184445247525, + "loss": 1.0515, + "step": 3536 + }, + { + "epoch": 0.6298076923076923, + "grad_norm": 0.6216054558753967, + "learning_rate": 0.00018812522946433266, + "loss": 0.8703, + "step": 3537 + }, + { + "epoch": 0.6299857549857549, + "grad_norm": 0.4945531189441681, + "learning_rate": 0.00018811861274955932, + "loss": 1.1485, + "step": 3538 + }, + { + "epoch": 0.6301638176638177, + "grad_norm": 0.47882601618766785, + "learning_rate": 0.00018811199430828477, + "loss": 1.1107, + "step": 3539 + }, + { + "epoch": 0.6303418803418803, + "grad_norm": 0.5005326867103577, + "learning_rate": 0.00018810537414063876, + "loss": 1.0237, + "step": 3540 + }, + { + "epoch": 0.6305199430199431, + "grad_norm": 0.5382370352745056, + "learning_rate": 0.00018809875224675093, + "loss": 0.9965, + "step": 3541 + }, + { + "epoch": 0.6306980056980057, + "grad_norm": 0.47002625465393066, + "learning_rate": 0.0001880921286267511, + "loss": 1.065, + "step": 3542 + }, + { + "epoch": 
0.6308760683760684, + "grad_norm": 0.4519105851650238, + "learning_rate": 0.00018808550328076897, + "loss": 0.9312, + "step": 3543 + }, + { + "epoch": 0.6310541310541311, + "grad_norm": 0.45360881090164185, + "learning_rate": 0.0001880788762089344, + "loss": 1.0739, + "step": 3544 + }, + { + "epoch": 0.6312321937321937, + "grad_norm": 0.5578218698501587, + "learning_rate": 0.00018807224741137723, + "loss": 1.2478, + "step": 3545 + }, + { + "epoch": 0.6314102564102564, + "grad_norm": 0.4838615655899048, + "learning_rate": 0.0001880656168882273, + "loss": 1.0221, + "step": 3546 + }, + { + "epoch": 0.6315883190883191, + "grad_norm": 0.5733556747436523, + "learning_rate": 0.0001880589846396146, + "loss": 1.1249, + "step": 3547 + }, + { + "epoch": 0.6317663817663818, + "grad_norm": 0.4939686954021454, + "learning_rate": 0.00018805235066566894, + "loss": 0.8559, + "step": 3548 + }, + { + "epoch": 0.6319444444444444, + "grad_norm": 0.5072234869003296, + "learning_rate": 0.00018804571496652044, + "loss": 1.0842, + "step": 3549 + }, + { + "epoch": 0.6321225071225072, + "grad_norm": 0.4640493392944336, + "learning_rate": 0.00018803907754229903, + "loss": 1.0728, + "step": 3550 + }, + { + "epoch": 0.6323005698005698, + "grad_norm": 0.5314788818359375, + "learning_rate": 0.00018803243839313481, + "loss": 1.0752, + "step": 3551 + }, + { + "epoch": 0.6324786324786325, + "grad_norm": 0.5511462092399597, + "learning_rate": 0.0001880257975191578, + "loss": 1.0238, + "step": 3552 + }, + { + "epoch": 0.6326566951566952, + "grad_norm": 0.4980711042881012, + "learning_rate": 0.00018801915492049816, + "loss": 1.0981, + "step": 3553 + }, + { + "epoch": 0.6328347578347578, + "grad_norm": 0.7746123671531677, + "learning_rate": 0.00018801251059728604, + "loss": 1.0968, + "step": 3554 + }, + { + "epoch": 0.6330128205128205, + "grad_norm": 0.5006106495857239, + "learning_rate": 0.00018800586454965155, + "loss": 1.1802, + "step": 3555 + }, + { + "epoch": 0.6331908831908832, + "grad_norm": 
0.49427780508995056, + "learning_rate": 0.000187999216777725, + "loss": 1.1257, + "step": 3556 + }, + { + "epoch": 0.6333689458689459, + "grad_norm": 0.5484146475791931, + "learning_rate": 0.00018799256728163662, + "loss": 1.1344, + "step": 3557 + }, + { + "epoch": 0.6335470085470085, + "grad_norm": 0.5007877349853516, + "learning_rate": 0.00018798591606151662, + "loss": 1.1328, + "step": 3558 + }, + { + "epoch": 0.6337250712250713, + "grad_norm": 0.5068148970603943, + "learning_rate": 0.00018797926311749544, + "loss": 0.976, + "step": 3559 + }, + { + "epoch": 0.6339031339031339, + "grad_norm": 0.44936859607696533, + "learning_rate": 0.00018797260844970334, + "loss": 0.9735, + "step": 3560 + }, + { + "epoch": 0.6340811965811965, + "grad_norm": 0.4592931866645813, + "learning_rate": 0.0001879659520582707, + "loss": 1.1306, + "step": 3561 + }, + { + "epoch": 0.6342592592592593, + "grad_norm": 0.4664020836353302, + "learning_rate": 0.00018795929394332795, + "loss": 1.0577, + "step": 3562 + }, + { + "epoch": 0.6344373219373219, + "grad_norm": 0.5638116002082825, + "learning_rate": 0.00018795263410500556, + "loss": 1.1747, + "step": 3563 + }, + { + "epoch": 0.6346153846153846, + "grad_norm": 0.524736225605011, + "learning_rate": 0.00018794597254343401, + "loss": 0.8964, + "step": 3564 + }, + { + "epoch": 0.6347934472934473, + "grad_norm": 0.4645404517650604, + "learning_rate": 0.00018793930925874386, + "loss": 0.8673, + "step": 3565 + }, + { + "epoch": 0.63497150997151, + "grad_norm": 0.4800064265727997, + "learning_rate": 0.00018793264425106558, + "loss": 1.0334, + "step": 3566 + }, + { + "epoch": 0.6351495726495726, + "grad_norm": 0.6202501058578491, + "learning_rate": 0.0001879259775205298, + "loss": 1.1061, + "step": 3567 + }, + { + "epoch": 0.6353276353276354, + "grad_norm": 0.503383457660675, + "learning_rate": 0.00018791930906726718, + "loss": 0.8545, + "step": 3568 + }, + { + "epoch": 0.635505698005698, + "grad_norm": 0.5256780982017517, + "learning_rate": 
0.00018791263889140832, + "loss": 1.0785, + "step": 3569 + }, + { + "epoch": 0.6356837606837606, + "grad_norm": 0.47562023997306824, + "learning_rate": 0.00018790596699308392, + "loss": 1.0041, + "step": 3570 + }, + { + "epoch": 0.6358618233618234, + "grad_norm": 0.5103238224983215, + "learning_rate": 0.00018789929337242469, + "loss": 1.1488, + "step": 3571 + }, + { + "epoch": 0.636039886039886, + "grad_norm": 0.5023695826530457, + "learning_rate": 0.0001878926180295614, + "loss": 1.0696, + "step": 3572 + }, + { + "epoch": 0.6362179487179487, + "grad_norm": 0.5302290916442871, + "learning_rate": 0.00018788594096462487, + "loss": 1.0554, + "step": 3573 + }, + { + "epoch": 0.6363960113960114, + "grad_norm": 0.4798361361026764, + "learning_rate": 0.00018787926217774588, + "loss": 0.8872, + "step": 3574 + }, + { + "epoch": 0.6365740740740741, + "grad_norm": 0.5529209971427917, + "learning_rate": 0.00018787258166905527, + "loss": 1.0976, + "step": 3575 + }, + { + "epoch": 0.6367521367521367, + "grad_norm": 0.49757125973701477, + "learning_rate": 0.00018786589943868402, + "loss": 1.0049, + "step": 3576 + }, + { + "epoch": 0.6369301994301995, + "grad_norm": 0.5497848391532898, + "learning_rate": 0.00018785921548676295, + "loss": 1.2272, + "step": 3577 + }, + { + "epoch": 0.6371082621082621, + "grad_norm": 0.5061752200126648, + "learning_rate": 0.0001878525298134231, + "loss": 1.0307, + "step": 3578 + }, + { + "epoch": 0.6372863247863247, + "grad_norm": 0.5427432656288147, + "learning_rate": 0.00018784584241879538, + "loss": 1.1064, + "step": 3579 + }, + { + "epoch": 0.6374643874643875, + "grad_norm": 0.48312774300575256, + "learning_rate": 0.0001878391533030109, + "loss": 1.078, + "step": 3580 + }, + { + "epoch": 0.6376424501424501, + "grad_norm": 0.5059898495674133, + "learning_rate": 0.00018783246246620067, + "loss": 1.0922, + "step": 3581 + }, + { + "epoch": 0.6378205128205128, + "grad_norm": 0.5144124031066895, + "learning_rate": 0.00018782576990849581, + "loss": 
1.0909, + "step": 3582 + }, + { + "epoch": 0.6379985754985755, + "grad_norm": 0.5535032153129578, + "learning_rate": 0.0001878190756300274, + "loss": 1.2579, + "step": 3583 + }, + { + "epoch": 0.6381766381766382, + "grad_norm": 0.49145692586898804, + "learning_rate": 0.00018781237963092667, + "loss": 1.0823, + "step": 3584 + }, + { + "epoch": 0.6383547008547008, + "grad_norm": 0.5245576500892639, + "learning_rate": 0.00018780568191132472, + "loss": 0.9595, + "step": 3585 + }, + { + "epoch": 0.6385327635327636, + "grad_norm": 0.5026637315750122, + "learning_rate": 0.00018779898247135287, + "loss": 1.153, + "step": 3586 + }, + { + "epoch": 0.6387108262108262, + "grad_norm": 0.5092771053314209, + "learning_rate": 0.00018779228131114234, + "loss": 1.0661, + "step": 3587 + }, + { + "epoch": 0.6388888888888888, + "grad_norm": 0.517387330532074, + "learning_rate": 0.00018778557843082444, + "loss": 1.0113, + "step": 3588 + }, + { + "epoch": 0.6390669515669516, + "grad_norm": 0.5149948000907898, + "learning_rate": 0.00018777887383053047, + "loss": 0.9483, + "step": 3589 + }, + { + "epoch": 0.6392450142450142, + "grad_norm": 0.4854544997215271, + "learning_rate": 0.00018777216751039185, + "loss": 1.22, + "step": 3590 + }, + { + "epoch": 0.6394230769230769, + "grad_norm": 0.5317271947860718, + "learning_rate": 0.0001877654594705399, + "loss": 1.2483, + "step": 3591 + }, + { + "epoch": 0.6396011396011396, + "grad_norm": 0.4554755687713623, + "learning_rate": 0.0001877587497111061, + "loss": 0.9864, + "step": 3592 + }, + { + "epoch": 0.6397792022792023, + "grad_norm": 0.4833736717700958, + "learning_rate": 0.0001877520382322219, + "loss": 0.8895, + "step": 3593 + }, + { + "epoch": 0.6399572649572649, + "grad_norm": 0.5018072724342346, + "learning_rate": 0.00018774532503401878, + "loss": 1.2523, + "step": 3594 + }, + { + "epoch": 0.6401353276353277, + "grad_norm": 0.4478762447834015, + "learning_rate": 0.00018773861011662832, + "loss": 0.8833, + "step": 3595 + }, + { + "epoch": 
0.6403133903133903, + "grad_norm": 0.5686985850334167, + "learning_rate": 0.00018773189348018205, + "loss": 0.9934, + "step": 3596 + }, + { + "epoch": 0.6404914529914529, + "grad_norm": 0.5144175291061401, + "learning_rate": 0.00018772517512481157, + "loss": 0.8149, + "step": 3597 + }, + { + "epoch": 0.6406695156695157, + "grad_norm": 0.5359936356544495, + "learning_rate": 0.00018771845505064852, + "loss": 1.1822, + "step": 3598 + }, + { + "epoch": 0.6408475783475783, + "grad_norm": 0.532573938369751, + "learning_rate": 0.00018771173325782457, + "loss": 1.0361, + "step": 3599 + }, + { + "epoch": 0.6410256410256411, + "grad_norm": 0.46121537685394287, + "learning_rate": 0.00018770500974647138, + "loss": 1.0792, + "step": 3600 + }, + { + "epoch": 0.6412037037037037, + "grad_norm": 0.4804821312427521, + "learning_rate": 0.00018769828451672076, + "loss": 1.1119, + "step": 3601 + }, + { + "epoch": 0.6413817663817664, + "grad_norm": 0.4955114722251892, + "learning_rate": 0.00018769155756870443, + "loss": 0.9312, + "step": 3602 + }, + { + "epoch": 0.6415598290598291, + "grad_norm": 0.4987298250198364, + "learning_rate": 0.00018768482890255415, + "loss": 1.2326, + "step": 3603 + }, + { + "epoch": 0.6417378917378918, + "grad_norm": 0.47216179966926575, + "learning_rate": 0.0001876780985184018, + "loss": 1.0114, + "step": 3604 + }, + { + "epoch": 0.6419159544159544, + "grad_norm": 0.5891931653022766, + "learning_rate": 0.0001876713664163793, + "loss": 1.2963, + "step": 3605 + }, + { + "epoch": 0.6420940170940171, + "grad_norm": 0.4645081162452698, + "learning_rate": 0.00018766463259661846, + "loss": 1.0874, + "step": 3606 + }, + { + "epoch": 0.6422720797720798, + "grad_norm": 0.5275476574897766, + "learning_rate": 0.00018765789705925125, + "loss": 0.9453, + "step": 3607 + }, + { + "epoch": 0.6424501424501424, + "grad_norm": 0.5884957313537598, + "learning_rate": 0.00018765115980440964, + "loss": 1.0796, + "step": 3608 + }, + { + "epoch": 0.6426282051282052, + "grad_norm": 
0.4843178987503052, + "learning_rate": 0.00018764442083222567, + "loss": 1.1657, + "step": 3609 + }, + { + "epoch": 0.6428062678062678, + "grad_norm": 0.5188381671905518, + "learning_rate": 0.00018763768014283126, + "loss": 1.1109, + "step": 3610 + }, + { + "epoch": 0.6429843304843305, + "grad_norm": 0.4101468324661255, + "learning_rate": 0.00018763093773635863, + "loss": 0.895, + "step": 3611 + }, + { + "epoch": 0.6431623931623932, + "grad_norm": 0.4552084505558014, + "learning_rate": 0.00018762419361293979, + "loss": 0.9418, + "step": 3612 + }, + { + "epoch": 0.6433404558404558, + "grad_norm": 0.5924661159515381, + "learning_rate": 0.0001876174477727069, + "loss": 1.2562, + "step": 3613 + }, + { + "epoch": 0.6435185185185185, + "grad_norm": 0.5072348713874817, + "learning_rate": 0.00018761070021579212, + "loss": 1.1501, + "step": 3614 + }, + { + "epoch": 0.6436965811965812, + "grad_norm": 0.5312697887420654, + "learning_rate": 0.0001876039509423277, + "loss": 1.0751, + "step": 3615 + }, + { + "epoch": 0.6438746438746439, + "grad_norm": 0.6046462059020996, + "learning_rate": 0.0001875971999524458, + "loss": 1.0927, + "step": 3616 + }, + { + "epoch": 0.6440527065527065, + "grad_norm": 0.4992375373840332, + "learning_rate": 0.00018759044724627876, + "loss": 0.96, + "step": 3617 + }, + { + "epoch": 0.6442307692307693, + "grad_norm": 0.4983134865760803, + "learning_rate": 0.00018758369282395886, + "loss": 1.0599, + "step": 3618 + }, + { + "epoch": 0.6444088319088319, + "grad_norm": 0.5655683279037476, + "learning_rate": 0.00018757693668561843, + "loss": 1.2372, + "step": 3619 + }, + { + "epoch": 0.6445868945868946, + "grad_norm": 0.4968827962875366, + "learning_rate": 0.00018757017883138985, + "loss": 1.1639, + "step": 3620 + }, + { + "epoch": 0.6447649572649573, + "grad_norm": 0.5831420421600342, + "learning_rate": 0.00018756341926140553, + "loss": 0.9002, + "step": 3621 + }, + { + "epoch": 0.64494301994302, + "grad_norm": 0.4828467071056366, + "learning_rate": 
0.0001875566579757979, + "loss": 0.9201, + "step": 3622 + }, + { + "epoch": 0.6451210826210826, + "grad_norm": 0.5067087411880493, + "learning_rate": 0.00018754989497469943, + "loss": 0.9874, + "step": 3623 + }, + { + "epoch": 0.6452991452991453, + "grad_norm": 0.5182318091392517, + "learning_rate": 0.00018754313025824267, + "loss": 1.1291, + "step": 3624 + }, + { + "epoch": 0.645477207977208, + "grad_norm": 0.472200483083725, + "learning_rate": 0.0001875363638265601, + "loss": 1.0286, + "step": 3625 + }, + { + "epoch": 0.6456552706552706, + "grad_norm": 0.4597308039665222, + "learning_rate": 0.0001875295956797843, + "loss": 0.7517, + "step": 3626 + }, + { + "epoch": 0.6458333333333334, + "grad_norm": 0.5358221530914307, + "learning_rate": 0.00018752282581804798, + "loss": 1.2264, + "step": 3627 + }, + { + "epoch": 0.646011396011396, + "grad_norm": 0.5268992781639099, + "learning_rate": 0.00018751605424148363, + "loss": 1.0801, + "step": 3628 + }, + { + "epoch": 0.6461894586894587, + "grad_norm": 0.5917379260063171, + "learning_rate": 0.00018750928095022403, + "loss": 0.9538, + "step": 3629 + }, + { + "epoch": 0.6463675213675214, + "grad_norm": 0.44506707787513733, + "learning_rate": 0.00018750250594440183, + "loss": 0.9818, + "step": 3630 + }, + { + "epoch": 0.646545584045584, + "grad_norm": 0.5578880906105042, + "learning_rate": 0.00018749572922414982, + "loss": 0.9958, + "step": 3631 + }, + { + "epoch": 0.6467236467236467, + "grad_norm": 0.5155318975448608, + "learning_rate": 0.00018748895078960076, + "loss": 1.2888, + "step": 3632 + }, + { + "epoch": 0.6469017094017094, + "grad_norm": 0.5117297768592834, + "learning_rate": 0.0001874821706408874, + "loss": 1.0452, + "step": 3633 + }, + { + "epoch": 0.6470797720797721, + "grad_norm": 0.5169841647148132, + "learning_rate": 0.00018747538877814267, + "loss": 1.1649, + "step": 3634 + }, + { + "epoch": 0.6472578347578347, + "grad_norm": 0.5001181960105896, + "learning_rate": 0.00018746860520149942, + "loss": 1.1472, + 
"step": 3635 + }, + { + "epoch": 0.6474358974358975, + "grad_norm": 0.6289856433868408, + "learning_rate": 0.00018746181991109056, + "loss": 1.0351, + "step": 3636 + }, + { + "epoch": 0.6476139601139601, + "grad_norm": 0.5490612983703613, + "learning_rate": 0.00018745503290704897, + "loss": 0.8938, + "step": 3637 + }, + { + "epoch": 0.6477920227920227, + "grad_norm": 0.47378283739089966, + "learning_rate": 0.00018744824418950775, + "loss": 0.937, + "step": 3638 + }, + { + "epoch": 0.6479700854700855, + "grad_norm": 0.6079059839248657, + "learning_rate": 0.0001874414537585998, + "loss": 1.0486, + "step": 3639 + }, + { + "epoch": 0.6481481481481481, + "grad_norm": 0.5351769924163818, + "learning_rate": 0.00018743466161445823, + "loss": 1.0316, + "step": 3640 + }, + { + "epoch": 0.6483262108262108, + "grad_norm": 0.5516425967216492, + "learning_rate": 0.0001874278677572161, + "loss": 1.1552, + "step": 3641 + }, + { + "epoch": 0.6485042735042735, + "grad_norm": 0.5027523636817932, + "learning_rate": 0.0001874210721870065, + "loss": 1.0491, + "step": 3642 + }, + { + "epoch": 0.6486823361823362, + "grad_norm": 0.5596168041229248, + "learning_rate": 0.00018741427490396258, + "loss": 1.0256, + "step": 3643 + }, + { + "epoch": 0.6488603988603988, + "grad_norm": 0.5601046681404114, + "learning_rate": 0.00018740747590821751, + "loss": 1.1604, + "step": 3644 + }, + { + "epoch": 0.6490384615384616, + "grad_norm": 0.49749523401260376, + "learning_rate": 0.0001874006751999046, + "loss": 1.0532, + "step": 3645 + }, + { + "epoch": 0.6492165242165242, + "grad_norm": 0.6226113438606262, + "learning_rate": 0.00018739387277915697, + "loss": 1.1402, + "step": 3646 + }, + { + "epoch": 0.6493945868945868, + "grad_norm": 0.6142009496688843, + "learning_rate": 0.00018738706864610794, + "loss": 1.2437, + "step": 3647 + }, + { + "epoch": 0.6495726495726496, + "grad_norm": 0.48814916610717773, + "learning_rate": 0.00018738026280089084, + "loss": 0.8429, + "step": 3648 + }, + { + "epoch": 
0.6497507122507122, + "grad_norm": 0.5717982053756714, + "learning_rate": 0.00018737345524363902, + "loss": 1.1095, + "step": 3649 + }, + { + "epoch": 0.6499287749287749, + "grad_norm": 0.5150009989738464, + "learning_rate": 0.00018736664597448582, + "loss": 1.199, + "step": 3650 + }, + { + "epoch": 0.6501068376068376, + "grad_norm": 0.58461594581604, + "learning_rate": 0.00018735983499356472, + "loss": 1.0704, + "step": 3651 + }, + { + "epoch": 0.6502849002849003, + "grad_norm": 0.5108643770217896, + "learning_rate": 0.0001873530223010091, + "loss": 1.2039, + "step": 3652 + }, + { + "epoch": 0.6504629629629629, + "grad_norm": 0.513306736946106, + "learning_rate": 0.00018734620789695247, + "loss": 1.1448, + "step": 3653 + }, + { + "epoch": 0.6506410256410257, + "grad_norm": 0.5139986872673035, + "learning_rate": 0.00018733939178152835, + "loss": 1.0023, + "step": 3654 + }, + { + "epoch": 0.6508190883190883, + "grad_norm": 0.5187703967094421, + "learning_rate": 0.00018733257395487027, + "loss": 1.1304, + "step": 3655 + }, + { + "epoch": 0.6509971509971509, + "grad_norm": 0.5470501184463501, + "learning_rate": 0.00018732575441711183, + "loss": 1.0272, + "step": 3656 + }, + { + "epoch": 0.6511752136752137, + "grad_norm": 0.537309467792511, + "learning_rate": 0.00018731893316838665, + "loss": 1.0806, + "step": 3657 + }, + { + "epoch": 0.6513532763532763, + "grad_norm": 0.5187864899635315, + "learning_rate": 0.00018731211020882836, + "loss": 1.0154, + "step": 3658 + }, + { + "epoch": 0.6515313390313391, + "grad_norm": 0.48373252153396606, + "learning_rate": 0.00018730528553857062, + "loss": 1.0135, + "step": 3659 + }, + { + "epoch": 0.6517094017094017, + "grad_norm": 0.5645000338554382, + "learning_rate": 0.00018729845915774716, + "loss": 0.8924, + "step": 3660 + }, + { + "epoch": 0.6518874643874644, + "grad_norm": 0.5722129940986633, + "learning_rate": 0.00018729163106649178, + "loss": 1.2416, + "step": 3661 + }, + { + "epoch": 0.6520655270655271, + "grad_norm": 
0.5904877185821533, + "learning_rate": 0.00018728480126493823, + "loss": 0.9792, + "step": 3662 + }, + { + "epoch": 0.6522435897435898, + "grad_norm": 0.5224713087081909, + "learning_rate": 0.00018727796975322026, + "loss": 1.079, + "step": 3663 + }, + { + "epoch": 0.6524216524216524, + "grad_norm": 0.5667217969894409, + "learning_rate": 0.00018727113653147184, + "loss": 1.1397, + "step": 3664 + }, + { + "epoch": 0.6525997150997151, + "grad_norm": 0.5274622440338135, + "learning_rate": 0.00018726430159982677, + "loss": 1.0569, + "step": 3665 + }, + { + "epoch": 0.6527777777777778, + "grad_norm": 0.5745310187339783, + "learning_rate": 0.00018725746495841896, + "loss": 1.2129, + "step": 3666 + }, + { + "epoch": 0.6529558404558404, + "grad_norm": 0.6123398542404175, + "learning_rate": 0.0001872506266073824, + "loss": 1.186, + "step": 3667 + }, + { + "epoch": 0.6531339031339032, + "grad_norm": 0.4983387291431427, + "learning_rate": 0.00018724378654685106, + "loss": 1.1957, + "step": 3668 + }, + { + "epoch": 0.6533119658119658, + "grad_norm": 0.5584192276000977, + "learning_rate": 0.00018723694477695897, + "loss": 1.0939, + "step": 3669 + }, + { + "epoch": 0.6534900284900285, + "grad_norm": 0.5318745374679565, + "learning_rate": 0.00018723010129784016, + "loss": 1.1869, + "step": 3670 + }, + { + "epoch": 0.6536680911680912, + "grad_norm": 0.4607617259025574, + "learning_rate": 0.0001872232561096287, + "loss": 0.8447, + "step": 3671 + }, + { + "epoch": 0.6538461538461539, + "grad_norm": 0.5312213897705078, + "learning_rate": 0.00018721640921245874, + "loss": 1.0623, + "step": 3672 + }, + { + "epoch": 0.6540242165242165, + "grad_norm": 0.5099136233329773, + "learning_rate": 0.0001872095606064644, + "loss": 0.7174, + "step": 3673 + }, + { + "epoch": 0.6542022792022792, + "grad_norm": 0.6894404888153076, + "learning_rate": 0.0001872027102917799, + "loss": 1.0251, + "step": 3674 + }, + { + "epoch": 0.6543803418803419, + "grad_norm": 0.5758535861968994, + "learning_rate": 
0.00018719585826853944, + "loss": 1.1655, + "step": 3675 + }, + { + "epoch": 0.6545584045584045, + "grad_norm": 0.521824061870575, + "learning_rate": 0.0001871890045368773, + "loss": 1.1653, + "step": 3676 + }, + { + "epoch": 0.6547364672364673, + "grad_norm": 0.5370712280273438, + "learning_rate": 0.00018718214909692771, + "loss": 1.3152, + "step": 3677 + }, + { + "epoch": 0.6549145299145299, + "grad_norm": 0.4459827244281769, + "learning_rate": 0.000187175291948825, + "loss": 1.0953, + "step": 3678 + }, + { + "epoch": 0.6550925925925926, + "grad_norm": 0.44131460785865784, + "learning_rate": 0.00018716843309270353, + "loss": 0.8568, + "step": 3679 + }, + { + "epoch": 0.6552706552706553, + "grad_norm": 0.5529624819755554, + "learning_rate": 0.00018716157252869772, + "loss": 1.2085, + "step": 3680 + }, + { + "epoch": 0.655448717948718, + "grad_norm": 0.44604751467704773, + "learning_rate": 0.00018715471025694194, + "loss": 0.9605, + "step": 3681 + }, + { + "epoch": 0.6556267806267806, + "grad_norm": 0.4662449359893799, + "learning_rate": 0.0001871478462775707, + "loss": 1.2092, + "step": 3682 + }, + { + "epoch": 0.6558048433048433, + "grad_norm": 0.42632922530174255, + "learning_rate": 0.0001871409805907184, + "loss": 0.9141, + "step": 3683 + }, + { + "epoch": 0.655982905982906, + "grad_norm": 0.534009575843811, + "learning_rate": 0.00018713411319651958, + "loss": 1.0147, + "step": 3684 + }, + { + "epoch": 0.6561609686609686, + "grad_norm": 0.5433241724967957, + "learning_rate": 0.00018712724409510888, + "loss": 1.1998, + "step": 3685 + }, + { + "epoch": 0.6563390313390314, + "grad_norm": 0.4771319627761841, + "learning_rate": 0.0001871203732866208, + "loss": 1.0384, + "step": 3686 + }, + { + "epoch": 0.656517094017094, + "grad_norm": 0.507641077041626, + "learning_rate": 0.00018711350077119, + "loss": 0.9608, + "step": 3687 + }, + { + "epoch": 0.6566951566951567, + "grad_norm": 0.5069413185119629, + "learning_rate": 0.00018710662654895108, + "loss": 1.055, + 
"step": 3688 + }, + { + "epoch": 0.6568732193732194, + "grad_norm": 0.512340247631073, + "learning_rate": 0.00018709975062003876, + "loss": 0.9506, + "step": 3689 + }, + { + "epoch": 0.657051282051282, + "grad_norm": 0.5156390070915222, + "learning_rate": 0.00018709287298458778, + "loss": 1.0089, + "step": 3690 + }, + { + "epoch": 0.6572293447293447, + "grad_norm": 0.5101696252822876, + "learning_rate": 0.0001870859936427329, + "loss": 1.0441, + "step": 3691 + }, + { + "epoch": 0.6574074074074074, + "grad_norm": 0.4394689202308655, + "learning_rate": 0.00018707911259460884, + "loss": 0.9124, + "step": 3692 + }, + { + "epoch": 0.6575854700854701, + "grad_norm": 0.4842554032802582, + "learning_rate": 0.00018707222984035043, + "loss": 1.0051, + "step": 3693 + }, + { + "epoch": 0.6577635327635327, + "grad_norm": 0.6418108344078064, + "learning_rate": 0.00018706534538009262, + "loss": 1.1165, + "step": 3694 + }, + { + "epoch": 0.6579415954415955, + "grad_norm": 0.5596832036972046, + "learning_rate": 0.00018705845921397022, + "loss": 1.1127, + "step": 3695 + }, + { + "epoch": 0.6581196581196581, + "grad_norm": 0.6692909002304077, + "learning_rate": 0.00018705157134211813, + "loss": 1.2403, + "step": 3696 + }, + { + "epoch": 0.6582977207977208, + "grad_norm": 0.5046468377113342, + "learning_rate": 0.00018704468176467134, + "loss": 1.1016, + "step": 3697 + }, + { + "epoch": 0.6584757834757835, + "grad_norm": 0.6723586320877075, + "learning_rate": 0.00018703779048176485, + "loss": 1.1777, + "step": 3698 + }, + { + "epoch": 0.6586538461538461, + "grad_norm": 0.5269754528999329, + "learning_rate": 0.00018703089749353365, + "loss": 1.1441, + "step": 3699 + }, + { + "epoch": 0.6588319088319088, + "grad_norm": 0.5303323268890381, + "learning_rate": 0.0001870240028001128, + "loss": 1.07, + "step": 3700 + }, + { + "epoch": 0.6590099715099715, + "grad_norm": 0.4795511066913605, + "learning_rate": 0.00018701710640163738, + "loss": 1.0189, + "step": 3701 + }, + { + "epoch": 
0.6591880341880342, + "grad_norm": 0.514659583568573, + "learning_rate": 0.00018701020829824255, + "loss": 1.0792, + "step": 3702 + }, + { + "epoch": 0.6593660968660968, + "grad_norm": 0.5407463312149048, + "learning_rate": 0.0001870033084900634, + "loss": 0.9346, + "step": 3703 + }, + { + "epoch": 0.6595441595441596, + "grad_norm": 0.5358424186706543, + "learning_rate": 0.0001869964069772352, + "loss": 1.1242, + "step": 3704 + }, + { + "epoch": 0.6597222222222222, + "grad_norm": 0.470825731754303, + "learning_rate": 0.00018698950375989307, + "loss": 0.9952, + "step": 3705 + }, + { + "epoch": 0.6599002849002849, + "grad_norm": 0.5711592435836792, + "learning_rate": 0.00018698259883817236, + "loss": 1.1678, + "step": 3706 + }, + { + "epoch": 0.6600783475783476, + "grad_norm": 0.5298995971679688, + "learning_rate": 0.00018697569221220832, + "loss": 0.869, + "step": 3707 + }, + { + "epoch": 0.6602564102564102, + "grad_norm": 0.5453875064849854, + "learning_rate": 0.00018696878388213626, + "loss": 0.9706, + "step": 3708 + }, + { + "epoch": 0.6604344729344729, + "grad_norm": 0.6219926476478577, + "learning_rate": 0.00018696187384809154, + "loss": 1.1902, + "step": 3709 + }, + { + "epoch": 0.6606125356125356, + "grad_norm": 0.5972491502761841, + "learning_rate": 0.00018695496211020953, + "loss": 1.2054, + "step": 3710 + }, + { + "epoch": 0.6607905982905983, + "grad_norm": 0.5048904418945312, + "learning_rate": 0.0001869480486686257, + "loss": 1.0405, + "step": 3711 + }, + { + "epoch": 0.6609686609686609, + "grad_norm": 0.5474200248718262, + "learning_rate": 0.00018694113352347546, + "loss": 1.09, + "step": 3712 + }, + { + "epoch": 0.6611467236467237, + "grad_norm": 0.5073318481445312, + "learning_rate": 0.00018693421667489432, + "loss": 1.0698, + "step": 3713 + }, + { + "epoch": 0.6613247863247863, + "grad_norm": 0.5693208575248718, + "learning_rate": 0.0001869272981230178, + "loss": 0.9664, + "step": 3714 + }, + { + "epoch": 0.6615028490028491, + "grad_norm": 
0.5678503513336182, + "learning_rate": 0.00018692037786798143, + "loss": 1.0895, + "step": 3715 + }, + { + "epoch": 0.6616809116809117, + "grad_norm": 0.4950976073741913, + "learning_rate": 0.00018691345590992082, + "loss": 0.9584, + "step": 3716 + }, + { + "epoch": 0.6618589743589743, + "grad_norm": 0.4944666624069214, + "learning_rate": 0.0001869065322489716, + "loss": 0.8607, + "step": 3717 + }, + { + "epoch": 0.6620370370370371, + "grad_norm": 0.5197804570198059, + "learning_rate": 0.0001868996068852694, + "loss": 1.2335, + "step": 3718 + }, + { + "epoch": 0.6622150997150997, + "grad_norm": 0.6550365686416626, + "learning_rate": 0.00018689267981894994, + "loss": 1.0441, + "step": 3719 + }, + { + "epoch": 0.6623931623931624, + "grad_norm": 0.5331503748893738, + "learning_rate": 0.00018688575105014888, + "loss": 1.1696, + "step": 3720 + }, + { + "epoch": 0.6625712250712251, + "grad_norm": 0.47304239869117737, + "learning_rate": 0.00018687882057900207, + "loss": 0.9695, + "step": 3721 + }, + { + "epoch": 0.6627492877492878, + "grad_norm": 0.5653772354125977, + "learning_rate": 0.00018687188840564524, + "loss": 1.2082, + "step": 3722 + }, + { + "epoch": 0.6629273504273504, + "grad_norm": 0.5323491096496582, + "learning_rate": 0.00018686495453021417, + "loss": 0.9106, + "step": 3723 + }, + { + "epoch": 0.6631054131054132, + "grad_norm": 0.5612817406654358, + "learning_rate": 0.00018685801895284483, + "loss": 1.1302, + "step": 3724 + }, + { + "epoch": 0.6632834757834758, + "grad_norm": 0.4562164545059204, + "learning_rate": 0.000186851081673673, + "loss": 0.8886, + "step": 3725 + }, + { + "epoch": 0.6634615384615384, + "grad_norm": 0.5006430745124817, + "learning_rate": 0.00018684414269283463, + "loss": 0.9128, + "step": 3726 + }, + { + "epoch": 0.6636396011396012, + "grad_norm": 0.5305442810058594, + "learning_rate": 0.0001868372020104657, + "loss": 1.1766, + "step": 3727 + }, + { + "epoch": 0.6638176638176638, + "grad_norm": 0.6129274368286133, + "learning_rate": 
0.0001868302596267022, + "loss": 1.04, + "step": 3728 + }, + { + "epoch": 0.6639957264957265, + "grad_norm": 0.5530399084091187, + "learning_rate": 0.00018682331554168013, + "loss": 1.4114, + "step": 3729 + }, + { + "epoch": 0.6641737891737892, + "grad_norm": 0.5397193431854248, + "learning_rate": 0.00018681636975553557, + "loss": 1.1945, + "step": 3730 + }, + { + "epoch": 0.6643518518518519, + "grad_norm": 0.5510205030441284, + "learning_rate": 0.00018680942226840456, + "loss": 1.0489, + "step": 3731 + }, + { + "epoch": 0.6645299145299145, + "grad_norm": 0.5519221425056458, + "learning_rate": 0.00018680247308042324, + "loss": 1.1633, + "step": 3732 + }, + { + "epoch": 0.6647079772079773, + "grad_norm": 0.4848768711090088, + "learning_rate": 0.00018679552219172784, + "loss": 0.8716, + "step": 3733 + }, + { + "epoch": 0.6648860398860399, + "grad_norm": 0.5490246415138245, + "learning_rate": 0.0001867885696024544, + "loss": 1.1347, + "step": 3734 + }, + { + "epoch": 0.6650641025641025, + "grad_norm": 0.5281458497047424, + "learning_rate": 0.00018678161531273928, + "loss": 1.0987, + "step": 3735 + }, + { + "epoch": 0.6652421652421653, + "grad_norm": 0.5313079953193665, + "learning_rate": 0.00018677465932271867, + "loss": 0.9705, + "step": 3736 + }, + { + "epoch": 0.6654202279202279, + "grad_norm": 0.5425750017166138, + "learning_rate": 0.0001867677016325289, + "loss": 1.1847, + "step": 3737 + }, + { + "epoch": 0.6655982905982906, + "grad_norm": 0.5796298980712891, + "learning_rate": 0.0001867607422423062, + "loss": 1.2639, + "step": 3738 + }, + { + "epoch": 0.6657763532763533, + "grad_norm": 0.49738675355911255, + "learning_rate": 0.00018675378115218702, + "loss": 1.0536, + "step": 3739 + }, + { + "epoch": 0.665954415954416, + "grad_norm": 0.665250301361084, + "learning_rate": 0.0001867468183623077, + "loss": 1.2836, + "step": 3740 + }, + { + "epoch": 0.6661324786324786, + "grad_norm": 0.5184717178344727, + "learning_rate": 0.00018673985387280469, + "loss": 1.0497, + 
"step": 3741 + }, + { + "epoch": 0.6663105413105413, + "grad_norm": 0.5129656791687012, + "learning_rate": 0.00018673288768381442, + "loss": 1.2041, + "step": 3742 + }, + { + "epoch": 0.666488603988604, + "grad_norm": 0.5308768153190613, + "learning_rate": 0.00018672591979547337, + "loss": 1.2092, + "step": 3743 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.5059141516685486, + "learning_rate": 0.00018671895020791812, + "loss": 1.1929, + "step": 3744 + }, + { + "epoch": 0.6668447293447294, + "grad_norm": 0.5237857103347778, + "learning_rate": 0.00018671197892128517, + "loss": 1.2538, + "step": 3745 + }, + { + "epoch": 0.667022792022792, + "grad_norm": 0.450000137090683, + "learning_rate": 0.0001867050059357111, + "loss": 0.7138, + "step": 3746 + }, + { + "epoch": 0.6672008547008547, + "grad_norm": 0.5413795709609985, + "learning_rate": 0.00018669803125133258, + "loss": 1.1383, + "step": 3747 + }, + { + "epoch": 0.6673789173789174, + "grad_norm": 0.4657825529575348, + "learning_rate": 0.00018669105486828622, + "loss": 1.0518, + "step": 3748 + }, + { + "epoch": 0.66755698005698, + "grad_norm": 0.6198551654815674, + "learning_rate": 0.00018668407678670875, + "loss": 1.2697, + "step": 3749 + }, + { + "epoch": 0.6677350427350427, + "grad_norm": 0.5112186074256897, + "learning_rate": 0.00018667709700673685, + "loss": 0.9907, + "step": 3750 + }, + { + "epoch": 0.6679131054131054, + "grad_norm": 0.5446593761444092, + "learning_rate": 0.00018667011552850728, + "loss": 1.0708, + "step": 3751 + }, + { + "epoch": 0.6680911680911681, + "grad_norm": 0.5673866271972656, + "learning_rate": 0.00018666313235215682, + "loss": 1.05, + "step": 3752 + }, + { + "epoch": 0.6682692307692307, + "grad_norm": 0.4821988046169281, + "learning_rate": 0.00018665614747782235, + "loss": 1.0543, + "step": 3753 + }, + { + "epoch": 0.6684472934472935, + "grad_norm": 0.5158842206001282, + "learning_rate": 0.00018664916090564067, + "loss": 1.0331, + "step": 3754 + }, + { + "epoch": 
0.6686253561253561, + "grad_norm": 0.45486921072006226, + "learning_rate": 0.00018664217263574865, + "loss": 0.9262, + "step": 3755 + }, + { + "epoch": 0.6688034188034188, + "grad_norm": 0.46193036437034607, + "learning_rate": 0.00018663518266828327, + "loss": 0.9858, + "step": 3756 + }, + { + "epoch": 0.6689814814814815, + "grad_norm": 0.5144094824790955, + "learning_rate": 0.00018662819100338148, + "loss": 1.0302, + "step": 3757 + }, + { + "epoch": 0.6691595441595442, + "grad_norm": 0.5246134400367737, + "learning_rate": 0.0001866211976411802, + "loss": 1.064, + "step": 3758 + }, + { + "epoch": 0.6693376068376068, + "grad_norm": 0.4853166937828064, + "learning_rate": 0.0001866142025818165, + "loss": 0.9481, + "step": 3759 + }, + { + "epoch": 0.6695156695156695, + "grad_norm": 0.5029586553573608, + "learning_rate": 0.00018660720582542743, + "loss": 0.9443, + "step": 3760 + }, + { + "epoch": 0.6696937321937322, + "grad_norm": 0.5373172163963318, + "learning_rate": 0.0001866002073721501, + "loss": 1.1401, + "step": 3761 + }, + { + "epoch": 0.6698717948717948, + "grad_norm": 0.6236287951469421, + "learning_rate": 0.00018659320722212158, + "loss": 1.1255, + "step": 3762 + }, + { + "epoch": 0.6700498575498576, + "grad_norm": 0.5470684766769409, + "learning_rate": 0.00018658620537547903, + "loss": 1.0622, + "step": 3763 + }, + { + "epoch": 0.6702279202279202, + "grad_norm": 0.63177090883255, + "learning_rate": 0.00018657920183235964, + "loss": 0.9736, + "step": 3764 + }, + { + "epoch": 0.6704059829059829, + "grad_norm": 0.5456309914588928, + "learning_rate": 0.00018657219659290068, + "loss": 1.027, + "step": 3765 + }, + { + "epoch": 0.6705840455840456, + "grad_norm": 0.4816138744354248, + "learning_rate": 0.00018656518965723935, + "loss": 0.7801, + "step": 3766 + }, + { + "epoch": 0.6707621082621082, + "grad_norm": 0.4811640679836273, + "learning_rate": 0.00018655818102551294, + "loss": 1.0535, + "step": 3767 + }, + { + "epoch": 0.6709401709401709, + "grad_norm": 
0.4677673280239105, + "learning_rate": 0.00018655117069785884, + "loss": 1.1043, + "step": 3768 + }, + { + "epoch": 0.6711182336182336, + "grad_norm": 0.5628635883331299, + "learning_rate": 0.0001865441586744143, + "loss": 1.0392, + "step": 3769 + }, + { + "epoch": 0.6712962962962963, + "grad_norm": 0.5484504103660583, + "learning_rate": 0.00018653714495531673, + "loss": 1.1533, + "step": 3770 + }, + { + "epoch": 0.6714743589743589, + "grad_norm": 0.5830571055412292, + "learning_rate": 0.0001865301295407036, + "loss": 1.2479, + "step": 3771 + }, + { + "epoch": 0.6716524216524217, + "grad_norm": 0.5516841411590576, + "learning_rate": 0.00018652311243071235, + "loss": 1.2152, + "step": 3772 + }, + { + "epoch": 0.6718304843304843, + "grad_norm": 0.6360766291618347, + "learning_rate": 0.0001865160936254804, + "loss": 1.0752, + "step": 3773 + }, + { + "epoch": 0.6720085470085471, + "grad_norm": 0.6038610935211182, + "learning_rate": 0.00018650907312514533, + "loss": 1.2425, + "step": 3774 + }, + { + "epoch": 0.6721866096866097, + "grad_norm": 0.49572908878326416, + "learning_rate": 0.0001865020509298447, + "loss": 1.0057, + "step": 3775 + }, + { + "epoch": 0.6723646723646723, + "grad_norm": 0.4551616311073303, + "learning_rate": 0.00018649502703971607, + "loss": 1.0763, + "step": 3776 + }, + { + "epoch": 0.6725427350427351, + "grad_norm": 0.6621482372283936, + "learning_rate": 0.00018648800145489706, + "loss": 1.0306, + "step": 3777 + }, + { + "epoch": 0.6727207977207977, + "grad_norm": 0.5523806810379028, + "learning_rate": 0.0001864809741755253, + "loss": 0.9906, + "step": 3778 + }, + { + "epoch": 0.6728988603988604, + "grad_norm": 0.5527048110961914, + "learning_rate": 0.00018647394520173856, + "loss": 1.0734, + "step": 3779 + }, + { + "epoch": 0.6730769230769231, + "grad_norm": 0.573573887348175, + "learning_rate": 0.00018646691453367444, + "loss": 1.1409, + "step": 3780 + }, + { + "epoch": 0.6732549857549858, + "grad_norm": 0.6273239254951477, + "learning_rate": 
0.00018645988217147079, + "loss": 0.9682, + "step": 3781 + }, + { + "epoch": 0.6734330484330484, + "grad_norm": 0.4917762279510498, + "learning_rate": 0.00018645284811526534, + "loss": 0.9681, + "step": 3782 + }, + { + "epoch": 0.6736111111111112, + "grad_norm": 0.4901154339313507, + "learning_rate": 0.0001864458123651959, + "loss": 1.1828, + "step": 3783 + }, + { + "epoch": 0.6737891737891738, + "grad_norm": 0.6292546391487122, + "learning_rate": 0.00018643877492140036, + "loss": 1.1987, + "step": 3784 + }, + { + "epoch": 0.6739672364672364, + "grad_norm": 0.5334137678146362, + "learning_rate": 0.0001864317357840166, + "loss": 1.0347, + "step": 3785 + }, + { + "epoch": 0.6741452991452992, + "grad_norm": 0.6064338684082031, + "learning_rate": 0.0001864246949531825, + "loss": 1.4154, + "step": 3786 + }, + { + "epoch": 0.6743233618233618, + "grad_norm": 0.5442034602165222, + "learning_rate": 0.000186417652429036, + "loss": 1.2604, + "step": 3787 + }, + { + "epoch": 0.6745014245014245, + "grad_norm": 0.490858793258667, + "learning_rate": 0.00018641060821171518, + "loss": 1.1511, + "step": 3788 + }, + { + "epoch": 0.6746794871794872, + "grad_norm": 0.571116030216217, + "learning_rate": 0.00018640356230135798, + "loss": 1.1479, + "step": 3789 + }, + { + "epoch": 0.6748575498575499, + "grad_norm": 0.4857785105705261, + "learning_rate": 0.00018639651469810247, + "loss": 0.9, + "step": 3790 + }, + { + "epoch": 0.6750356125356125, + "grad_norm": 0.5320703983306885, + "learning_rate": 0.0001863894654020867, + "loss": 1.2284, + "step": 3791 + }, + { + "epoch": 0.6752136752136753, + "grad_norm": 0.5586925745010376, + "learning_rate": 0.0001863824144134488, + "loss": 1.1183, + "step": 3792 + }, + { + "epoch": 0.6753917378917379, + "grad_norm": 0.47740885615348816, + "learning_rate": 0.000186375361732327, + "loss": 1.1512, + "step": 3793 + }, + { + "epoch": 0.6755698005698005, + "grad_norm": 0.5867732167243958, + "learning_rate": 0.00018636830735885935, + "loss": 1.1903, + 
"step": 3794 + }, + { + "epoch": 0.6757478632478633, + "grad_norm": 0.5013887882232666, + "learning_rate": 0.0001863612512931842, + "loss": 0.8581, + "step": 3795 + }, + { + "epoch": 0.6759259259259259, + "grad_norm": 0.6026871204376221, + "learning_rate": 0.0001863541935354397, + "loss": 0.9581, + "step": 3796 + }, + { + "epoch": 0.6761039886039886, + "grad_norm": 0.5238468647003174, + "learning_rate": 0.00018634713408576415, + "loss": 1.0949, + "step": 3797 + }, + { + "epoch": 0.6762820512820513, + "grad_norm": 0.5128598213195801, + "learning_rate": 0.00018634007294429585, + "loss": 0.8992, + "step": 3798 + }, + { + "epoch": 0.676460113960114, + "grad_norm": 0.5092771053314209, + "learning_rate": 0.00018633301011117324, + "loss": 1.0793, + "step": 3799 + }, + { + "epoch": 0.6766381766381766, + "grad_norm": 0.592566728591919, + "learning_rate": 0.00018632594558653457, + "loss": 1.3242, + "step": 3800 + }, + { + "epoch": 0.6768162393162394, + "grad_norm": 0.4953067898750305, + "learning_rate": 0.0001863188793705184, + "loss": 0.9925, + "step": 3801 + }, + { + "epoch": 0.676994301994302, + "grad_norm": 0.4989747107028961, + "learning_rate": 0.00018631181146326305, + "loss": 1.0677, + "step": 3802 + }, + { + "epoch": 0.6771723646723646, + "grad_norm": 0.5375261902809143, + "learning_rate": 0.00018630474186490705, + "loss": 1.0556, + "step": 3803 + }, + { + "epoch": 0.6773504273504274, + "grad_norm": 0.6512624025344849, + "learning_rate": 0.00018629767057558894, + "loss": 1.2041, + "step": 3804 + }, + { + "epoch": 0.67752849002849, + "grad_norm": 0.5428260564804077, + "learning_rate": 0.00018629059759544723, + "loss": 0.9645, + "step": 3805 + }, + { + "epoch": 0.6777065527065527, + "grad_norm": 0.5598662495613098, + "learning_rate": 0.00018628352292462052, + "loss": 1.1683, + "step": 3806 + }, + { + "epoch": 0.6778846153846154, + "grad_norm": 0.49351340532302856, + "learning_rate": 0.0001862764465632474, + "loss": 1.1622, + "step": 3807 + }, + { + "epoch": 
0.6780626780626781, + "grad_norm": 0.4796701669692993, + "learning_rate": 0.00018626936851146657, + "loss": 1.0017, + "step": 3808 + }, + { + "epoch": 0.6782407407407407, + "grad_norm": 0.444533109664917, + "learning_rate": 0.00018626228876941664, + "loss": 0.9145, + "step": 3809 + }, + { + "epoch": 0.6784188034188035, + "grad_norm": 0.5197392702102661, + "learning_rate": 0.00018625520733723635, + "loss": 1.283, + "step": 3810 + }, + { + "epoch": 0.6785968660968661, + "grad_norm": 0.48785829544067383, + "learning_rate": 0.00018624812421506447, + "loss": 1.1084, + "step": 3811 + }, + { + "epoch": 0.6787749287749287, + "grad_norm": 0.5083680152893066, + "learning_rate": 0.00018624103940303974, + "loss": 0.9071, + "step": 3812 + }, + { + "epoch": 0.6789529914529915, + "grad_norm": 0.553819477558136, + "learning_rate": 0.00018623395290130103, + "loss": 0.9986, + "step": 3813 + }, + { + "epoch": 0.6791310541310541, + "grad_norm": 0.5347508788108826, + "learning_rate": 0.00018622686470998713, + "loss": 1.0148, + "step": 3814 + }, + { + "epoch": 0.6793091168091168, + "grad_norm": 0.5080769062042236, + "learning_rate": 0.00018621977482923693, + "loss": 1.0169, + "step": 3815 + }, + { + "epoch": 0.6794871794871795, + "grad_norm": 0.5444077849388123, + "learning_rate": 0.00018621268325918938, + "loss": 1.172, + "step": 3816 + }, + { + "epoch": 0.6796652421652422, + "grad_norm": 0.521946132183075, + "learning_rate": 0.00018620558999998335, + "loss": 1.0247, + "step": 3817 + }, + { + "epoch": 0.6798433048433048, + "grad_norm": 0.5257413983345032, + "learning_rate": 0.00018619849505175786, + "loss": 1.1574, + "step": 3818 + }, + { + "epoch": 0.6800213675213675, + "grad_norm": 0.5473007559776306, + "learning_rate": 0.00018619139841465193, + "loss": 1.1254, + "step": 3819 + }, + { + "epoch": 0.6801994301994302, + "grad_norm": 0.5479872226715088, + "learning_rate": 0.00018618430008880463, + "loss": 1.0196, + "step": 3820 + }, + { + "epoch": 0.6803774928774928, + "grad_norm": 
0.5918973088264465, + "learning_rate": 0.00018617720007435497, + "loss": 1.082, + "step": 3821 + }, + { + "epoch": 0.6805555555555556, + "grad_norm": 0.5411791801452637, + "learning_rate": 0.0001861700983714421, + "loss": 0.7723, + "step": 3822 + }, + { + "epoch": 0.6807336182336182, + "grad_norm": 0.5466326475143433, + "learning_rate": 0.00018616299498020516, + "loss": 1.0979, + "step": 3823 + }, + { + "epoch": 0.6809116809116809, + "grad_norm": 0.5405182838439941, + "learning_rate": 0.00018615588990078332, + "loss": 0.8891, + "step": 3824 + }, + { + "epoch": 0.6810897435897436, + "grad_norm": 0.5415780544281006, + "learning_rate": 0.00018614878313331579, + "loss": 1.0927, + "step": 3825 + }, + { + "epoch": 0.6812678062678063, + "grad_norm": 0.5284909605979919, + "learning_rate": 0.00018614167467794182, + "loss": 1.0684, + "step": 3826 + }, + { + "epoch": 0.6814458689458689, + "grad_norm": 0.4873995780944824, + "learning_rate": 0.00018613456453480062, + "loss": 1.1653, + "step": 3827 + }, + { + "epoch": 0.6816239316239316, + "grad_norm": 0.5506551265716553, + "learning_rate": 0.0001861274527040316, + "loss": 0.9876, + "step": 3828 + }, + { + "epoch": 0.6818019943019943, + "grad_norm": 0.5031297206878662, + "learning_rate": 0.0001861203391857741, + "loss": 1.067, + "step": 3829 + }, + { + "epoch": 0.6819800569800569, + "grad_norm": 0.622346043586731, + "learning_rate": 0.0001861132239801674, + "loss": 1.1514, + "step": 3830 + }, + { + "epoch": 0.6821581196581197, + "grad_norm": 0.47706183791160583, + "learning_rate": 0.000186106107087351, + "loss": 0.9857, + "step": 3831 + }, + { + "epoch": 0.6823361823361823, + "grad_norm": 0.5082845091819763, + "learning_rate": 0.00018609898850746424, + "loss": 1.123, + "step": 3832 + }, + { + "epoch": 0.6825142450142451, + "grad_norm": 0.5119805932044983, + "learning_rate": 0.00018609186824064671, + "loss": 1.1386, + "step": 3833 + }, + { + "epoch": 0.6826923076923077, + "grad_norm": 0.5247541069984436, + "learning_rate": 
0.00018608474628703788, + "loss": 0.9433, + "step": 3834 + }, + { + "epoch": 0.6828703703703703, + "grad_norm": 0.4618282616138458, + "learning_rate": 0.00018607762264677722, + "loss": 0.8727, + "step": 3835 + }, + { + "epoch": 0.6830484330484331, + "grad_norm": 0.6014040112495422, + "learning_rate": 0.00018607049732000436, + "loss": 1.1823, + "step": 3836 + }, + { + "epoch": 0.6832264957264957, + "grad_norm": 0.6489043831825256, + "learning_rate": 0.00018606337030685892, + "loss": 1.1466, + "step": 3837 + }, + { + "epoch": 0.6834045584045584, + "grad_norm": 0.5527763366699219, + "learning_rate": 0.00018605624160748053, + "loss": 1.3015, + "step": 3838 + }, + { + "epoch": 0.6835826210826211, + "grad_norm": 0.5628284215927124, + "learning_rate": 0.0001860491112220088, + "loss": 1.1504, + "step": 3839 + }, + { + "epoch": 0.6837606837606838, + "grad_norm": 0.5414566993713379, + "learning_rate": 0.00018604197915058355, + "loss": 1.0155, + "step": 3840 + }, + { + "epoch": 0.6839387464387464, + "grad_norm": 0.5378929376602173, + "learning_rate": 0.00018603484539334443, + "loss": 0.8917, + "step": 3841 + }, + { + "epoch": 0.6841168091168092, + "grad_norm": 0.5953748822212219, + "learning_rate": 0.00018602770995043125, + "loss": 1.1971, + "step": 3842 + }, + { + "epoch": 0.6842948717948718, + "grad_norm": 0.511813759803772, + "learning_rate": 0.00018602057282198376, + "loss": 1.1345, + "step": 3843 + }, + { + "epoch": 0.6844729344729344, + "grad_norm": 0.5145484209060669, + "learning_rate": 0.00018601343400814185, + "loss": 1.0786, + "step": 3844 + }, + { + "epoch": 0.6846509971509972, + "grad_norm": 0.5199604034423828, + "learning_rate": 0.00018600629350904542, + "loss": 1.2063, + "step": 3845 + }, + { + "epoch": 0.6848290598290598, + "grad_norm": 0.5653825998306274, + "learning_rate": 0.0001859991513248343, + "loss": 1.0314, + "step": 3846 + }, + { + "epoch": 0.6850071225071225, + "grad_norm": 0.5660843849182129, + "learning_rate": 0.00018599200745564843, + "loss": 
1.2754, + "step": 3847 + }, + { + "epoch": 0.6851851851851852, + "grad_norm": 0.5225719809532166, + "learning_rate": 0.00018598486190162788, + "loss": 1.0837, + "step": 3848 + }, + { + "epoch": 0.6853632478632479, + "grad_norm": 0.5011669397354126, + "learning_rate": 0.00018597771466291252, + "loss": 1.1, + "step": 3849 + }, + { + "epoch": 0.6855413105413105, + "grad_norm": 0.5923115015029907, + "learning_rate": 0.00018597056573964245, + "loss": 1.1875, + "step": 3850 + }, + { + "epoch": 0.6857193732193733, + "grad_norm": 0.5666482448577881, + "learning_rate": 0.00018596341513195776, + "loss": 1.1663, + "step": 3851 + }, + { + "epoch": 0.6858974358974359, + "grad_norm": 0.5396790504455566, + "learning_rate": 0.0001859562628399985, + "loss": 1.1179, + "step": 3852 + }, + { + "epoch": 0.6860754985754985, + "grad_norm": 0.5709532499313354, + "learning_rate": 0.00018594910886390485, + "loss": 1.0369, + "step": 3853 + }, + { + "epoch": 0.6862535612535613, + "grad_norm": 0.45524322986602783, + "learning_rate": 0.00018594195320381692, + "loss": 1.0171, + "step": 3854 + }, + { + "epoch": 0.6864316239316239, + "grad_norm": 0.6130724549293518, + "learning_rate": 0.00018593479585987498, + "loss": 1.1944, + "step": 3855 + }, + { + "epoch": 0.6866096866096866, + "grad_norm": 0.5079745054244995, + "learning_rate": 0.0001859276368322192, + "loss": 1.2567, + "step": 3856 + }, + { + "epoch": 0.6867877492877493, + "grad_norm": 0.49919846653938293, + "learning_rate": 0.00018592047612098992, + "loss": 0.9459, + "step": 3857 + }, + { + "epoch": 0.686965811965812, + "grad_norm": 0.5776857733726501, + "learning_rate": 0.00018591331372632734, + "loss": 1.2456, + "step": 3858 + }, + { + "epoch": 0.6871438746438746, + "grad_norm": 0.4740692377090454, + "learning_rate": 0.00018590614964837188, + "loss": 1.0401, + "step": 3859 + }, + { + "epoch": 0.6873219373219374, + "grad_norm": 0.5015742182731628, + "learning_rate": 0.00018589898388726389, + "loss": 1.2052, + "step": 3860 + }, + { + 
"epoch": 0.6875, + "grad_norm": 0.4819730818271637, + "learning_rate": 0.0001858918164431437, + "loss": 1.007, + "step": 3861 + }, + { + "epoch": 0.6876780626780626, + "grad_norm": 0.5510426163673401, + "learning_rate": 0.00018588464731615184, + "loss": 1.0123, + "step": 3862 + }, + { + "epoch": 0.6878561253561254, + "grad_norm": 0.4950829744338989, + "learning_rate": 0.00018587747650642867, + "loss": 1.033, + "step": 3863 + }, + { + "epoch": 0.688034188034188, + "grad_norm": 0.5278680920600891, + "learning_rate": 0.0001858703040141148, + "loss": 1.0912, + "step": 3864 + }, + { + "epoch": 0.6882122507122507, + "grad_norm": 0.6359158158302307, + "learning_rate": 0.00018586312983935068, + "loss": 1.2868, + "step": 3865 + }, + { + "epoch": 0.6883903133903134, + "grad_norm": 0.5098239183425903, + "learning_rate": 0.0001858559539822769, + "loss": 0.8364, + "step": 3866 + }, + { + "epoch": 0.6885683760683761, + "grad_norm": 0.5651038289070129, + "learning_rate": 0.000185848776443034, + "loss": 1.1983, + "step": 3867 + }, + { + "epoch": 0.6887464387464387, + "grad_norm": 0.5305678248405457, + "learning_rate": 0.00018584159722176272, + "loss": 1.32, + "step": 3868 + }, + { + "epoch": 0.6889245014245015, + "grad_norm": 0.5481845140457153, + "learning_rate": 0.00018583441631860368, + "loss": 1.013, + "step": 3869 + }, + { + "epoch": 0.6891025641025641, + "grad_norm": 0.5214795470237732, + "learning_rate": 0.00018582723373369753, + "loss": 1.172, + "step": 3870 + }, + { + "epoch": 0.6892806267806267, + "grad_norm": 0.6282780766487122, + "learning_rate": 0.00018582004946718502, + "loss": 1.7304, + "step": 3871 + }, + { + "epoch": 0.6894586894586895, + "grad_norm": 0.5266988277435303, + "learning_rate": 0.0001858128635192069, + "loss": 1.1418, + "step": 3872 + }, + { + "epoch": 0.6896367521367521, + "grad_norm": 0.4761001467704773, + "learning_rate": 0.000185805675889904, + "loss": 0.8585, + "step": 3873 + }, + { + "epoch": 0.6898148148148148, + "grad_norm": 0.528779923915863, 
+ "learning_rate": 0.00018579848657941715, + "loss": 1.0036, + "step": 3874 + }, + { + "epoch": 0.6899928774928775, + "grad_norm": 0.5427684783935547, + "learning_rate": 0.00018579129558788716, + "loss": 0.9769, + "step": 3875 + }, + { + "epoch": 0.6901709401709402, + "grad_norm": 0.6229544281959534, + "learning_rate": 0.00018578410291545495, + "loss": 1.2848, + "step": 3876 + }, + { + "epoch": 0.6903490028490028, + "grad_norm": 0.6602693200111389, + "learning_rate": 0.00018577690856226147, + "loss": 1.2713, + "step": 3877 + }, + { + "epoch": 0.6905270655270656, + "grad_norm": 0.45884042978286743, + "learning_rate": 0.0001857697125284476, + "loss": 0.9143, + "step": 3878 + }, + { + "epoch": 0.6907051282051282, + "grad_norm": 0.4956444203853607, + "learning_rate": 0.00018576251481415443, + "loss": 0.9646, + "step": 3879 + }, + { + "epoch": 0.6908831908831908, + "grad_norm": 0.473561555147171, + "learning_rate": 0.00018575531541952292, + "loss": 0.843, + "step": 3880 + }, + { + "epoch": 0.6910612535612536, + "grad_norm": 0.4676312506198883, + "learning_rate": 0.00018574811434469415, + "loss": 0.9464, + "step": 3881 + }, + { + "epoch": 0.6912393162393162, + "grad_norm": 0.5452045202255249, + "learning_rate": 0.00018574091158980922, + "loss": 0.985, + "step": 3882 + }, + { + "epoch": 0.6914173789173789, + "grad_norm": 0.6274946331977844, + "learning_rate": 0.0001857337071550092, + "loss": 1.0357, + "step": 3883 + }, + { + "epoch": 0.6915954415954416, + "grad_norm": 0.5533788800239563, + "learning_rate": 0.00018572650104043531, + "loss": 1.2636, + "step": 3884 + }, + { + "epoch": 0.6917735042735043, + "grad_norm": 0.48312318325042725, + "learning_rate": 0.00018571929324622872, + "loss": 1.2402, + "step": 3885 + }, + { + "epoch": 0.6919515669515669, + "grad_norm": 0.6087453961372375, + "learning_rate": 0.00018571208377253062, + "loss": 1.2961, + "step": 3886 + }, + { + "epoch": 0.6921296296296297, + "grad_norm": 0.49156486988067627, + "learning_rate": 
0.00018570487261948234, + "loss": 0.9585, + "step": 3887 + }, + { + "epoch": 0.6923076923076923, + "grad_norm": 0.5200015902519226, + "learning_rate": 0.0001856976597872251, + "loss": 0.9274, + "step": 3888 + }, + { + "epoch": 0.6924857549857549, + "grad_norm": 0.5185118913650513, + "learning_rate": 0.0001856904452759002, + "loss": 1.0015, + "step": 3889 + }, + { + "epoch": 0.6926638176638177, + "grad_norm": 0.5859049558639526, + "learning_rate": 0.00018568322908564904, + "loss": 1.0959, + "step": 3890 + }, + { + "epoch": 0.6928418803418803, + "grad_norm": 0.5882301926612854, + "learning_rate": 0.00018567601121661302, + "loss": 1.3214, + "step": 3891 + }, + { + "epoch": 0.6930199430199431, + "grad_norm": 0.6475503444671631, + "learning_rate": 0.0001856687916689335, + "loss": 1.3265, + "step": 3892 + }, + { + "epoch": 0.6931980056980057, + "grad_norm": 0.46175432205200195, + "learning_rate": 0.000185661570442752, + "loss": 0.8547, + "step": 3893 + }, + { + "epoch": 0.6933760683760684, + "grad_norm": 0.5362716913223267, + "learning_rate": 0.00018565434753820998, + "loss": 0.974, + "step": 3894 + }, + { + "epoch": 0.6935541310541311, + "grad_norm": 0.4317963719367981, + "learning_rate": 0.00018564712295544896, + "loss": 0.7653, + "step": 3895 + }, + { + "epoch": 0.6937321937321937, + "grad_norm": 0.5679717659950256, + "learning_rate": 0.00018563989669461047, + "loss": 1.0691, + "step": 3896 + }, + { + "epoch": 0.6939102564102564, + "grad_norm": 0.5058363676071167, + "learning_rate": 0.00018563266875583608, + "loss": 1.0665, + "step": 3897 + }, + { + "epoch": 0.6940883190883191, + "grad_norm": 0.5365496277809143, + "learning_rate": 0.00018562543913926746, + "loss": 0.9963, + "step": 3898 + }, + { + "epoch": 0.6942663817663818, + "grad_norm": 0.49945300817489624, + "learning_rate": 0.0001856182078450462, + "loss": 0.8668, + "step": 3899 + }, + { + "epoch": 0.6944444444444444, + "grad_norm": 0.5869430899620056, + "learning_rate": 0.00018561097487331405, + "loss": 1.1942, 
+ "step": 3900 + }, + { + "epoch": 0.6946225071225072, + "grad_norm": 0.5188950300216675, + "learning_rate": 0.0001856037402242127, + "loss": 0.9493, + "step": 3901 + }, + { + "epoch": 0.6948005698005698, + "grad_norm": 0.510788083076477, + "learning_rate": 0.00018559650389788384, + "loss": 0.9989, + "step": 3902 + }, + { + "epoch": 0.6949786324786325, + "grad_norm": 0.5360601544380188, + "learning_rate": 0.0001855892658944693, + "loss": 1.2766, + "step": 3903 + }, + { + "epoch": 0.6951566951566952, + "grad_norm": 0.522502601146698, + "learning_rate": 0.00018558202621411093, + "loss": 0.8774, + "step": 3904 + }, + { + "epoch": 0.6953347578347578, + "grad_norm": 0.5330635905265808, + "learning_rate": 0.00018557478485695052, + "loss": 0.972, + "step": 3905 + }, + { + "epoch": 0.6955128205128205, + "grad_norm": 0.5387479066848755, + "learning_rate": 0.00018556754182312996, + "loss": 1.0574, + "step": 3906 + }, + { + "epoch": 0.6956908831908832, + "grad_norm": 0.5357984900474548, + "learning_rate": 0.00018556029711279116, + "loss": 1.396, + "step": 3907 + }, + { + "epoch": 0.6958689458689459, + "grad_norm": 0.5647178292274475, + "learning_rate": 0.00018555305072607612, + "loss": 1.3304, + "step": 3908 + }, + { + "epoch": 0.6960470085470085, + "grad_norm": 0.46460914611816406, + "learning_rate": 0.00018554580266312673, + "loss": 0.9574, + "step": 3909 + }, + { + "epoch": 0.6962250712250713, + "grad_norm": 0.6206206679344177, + "learning_rate": 0.00018553855292408503, + "loss": 1.1637, + "step": 3910 + }, + { + "epoch": 0.6964031339031339, + "grad_norm": 0.5899842977523804, + "learning_rate": 0.00018553130150909312, + "loss": 1.1067, + "step": 3911 + }, + { + "epoch": 0.6965811965811965, + "grad_norm": 0.47294262051582336, + "learning_rate": 0.000185524048418293, + "loss": 1.1516, + "step": 3912 + }, + { + "epoch": 0.6967592592592593, + "grad_norm": 0.5791197419166565, + "learning_rate": 0.00018551679365182684, + "loss": 1.0007, + "step": 3913 + }, + { + "epoch": 
0.6969373219373219, + "grad_norm": 0.5678651332855225, + "learning_rate": 0.00018550953720983672, + "loss": 1.2698, + "step": 3914 + }, + { + "epoch": 0.6971153846153846, + "grad_norm": 0.6509683728218079, + "learning_rate": 0.0001855022790924649, + "loss": 1.0354, + "step": 3915 + }, + { + "epoch": 0.6972934472934473, + "grad_norm": 0.5176648497581482, + "learning_rate": 0.0001854950192998535, + "loss": 1.1243, + "step": 3916 + }, + { + "epoch": 0.69747150997151, + "grad_norm": 0.520631730556488, + "learning_rate": 0.00018548775783214477, + "loss": 1.1371, + "step": 3917 + }, + { + "epoch": 0.6976495726495726, + "grad_norm": 0.5408333539962769, + "learning_rate": 0.00018548049468948108, + "loss": 1.1185, + "step": 3918 + }, + { + "epoch": 0.6978276353276354, + "grad_norm": 0.5423790216445923, + "learning_rate": 0.00018547322987200461, + "loss": 1.1539, + "step": 3919 + }, + { + "epoch": 0.698005698005698, + "grad_norm": 0.5422113537788391, + "learning_rate": 0.0001854659633798578, + "loss": 1.171, + "step": 3920 + }, + { + "epoch": 0.6981837606837606, + "grad_norm": 0.5113416314125061, + "learning_rate": 0.00018545869521318292, + "loss": 1.0597, + "step": 3921 + }, + { + "epoch": 0.6983618233618234, + "grad_norm": 0.49901214241981506, + "learning_rate": 0.00018545142537212248, + "loss": 1.1043, + "step": 3922 + }, + { + "epoch": 0.698539886039886, + "grad_norm": 0.6606622338294983, + "learning_rate": 0.00018544415385681885, + "loss": 1.1797, + "step": 3923 + }, + { + "epoch": 0.6987179487179487, + "grad_norm": 0.4786234498023987, + "learning_rate": 0.00018543688066741454, + "loss": 0.9532, + "step": 3924 + }, + { + "epoch": 0.6988960113960114, + "grad_norm": 0.5900700688362122, + "learning_rate": 0.00018542960580405203, + "loss": 1.1171, + "step": 3925 + }, + { + "epoch": 0.6990740740740741, + "grad_norm": 0.53485506772995, + "learning_rate": 0.00018542232926687383, + "loss": 1.1535, + "step": 3926 + }, + { + "epoch": 0.6992521367521367, + "grad_norm": 
0.5269177556037903, + "learning_rate": 0.00018541505105602255, + "loss": 1.0287, + "step": 3927 + }, + { + "epoch": 0.6994301994301995, + "grad_norm": 0.5185505151748657, + "learning_rate": 0.0001854077711716408, + "loss": 1.2526, + "step": 3928 + }, + { + "epoch": 0.6996082621082621, + "grad_norm": 0.5615512132644653, + "learning_rate": 0.00018540048961387115, + "loss": 1.0189, + "step": 3929 + }, + { + "epoch": 0.6997863247863247, + "grad_norm": 0.4492493271827698, + "learning_rate": 0.00018539320638285637, + "loss": 0.8917, + "step": 3930 + }, + { + "epoch": 0.6999643874643875, + "grad_norm": 0.5062302947044373, + "learning_rate": 0.00018538592147873906, + "loss": 1.053, + "step": 3931 + }, + { + "epoch": 0.7001424501424501, + "grad_norm": 0.5508798956871033, + "learning_rate": 0.000185378634901662, + "loss": 0.9638, + "step": 3932 + }, + { + "epoch": 0.7003205128205128, + "grad_norm": 0.463980108499527, + "learning_rate": 0.00018537134665176793, + "loss": 1.0945, + "step": 3933 + }, + { + "epoch": 0.7004985754985755, + "grad_norm": 0.5027088522911072, + "learning_rate": 0.0001853640567291997, + "loss": 1.1745, + "step": 3934 + }, + { + "epoch": 0.7006766381766382, + "grad_norm": 0.5006551146507263, + "learning_rate": 0.00018535676513410009, + "loss": 0.8521, + "step": 3935 + }, + { + "epoch": 0.7008547008547008, + "grad_norm": 0.5870724320411682, + "learning_rate": 0.000185349471866612, + "loss": 0.9197, + "step": 3936 + }, + { + "epoch": 0.7010327635327636, + "grad_norm": 0.5030696392059326, + "learning_rate": 0.00018534217692687825, + "loss": 1.1049, + "step": 3937 + }, + { + "epoch": 0.7012108262108262, + "grad_norm": 0.5212681889533997, + "learning_rate": 0.00018533488031504186, + "loss": 1.3397, + "step": 3938 + }, + { + "epoch": 0.7013888888888888, + "grad_norm": 0.5649709105491638, + "learning_rate": 0.0001853275820312458, + "loss": 1.1994, + "step": 3939 + }, + { + "epoch": 0.7015669515669516, + "grad_norm": 0.4892779290676117, + "learning_rate": 
0.00018532028207563297, + "loss": 1.1511, + "step": 3940 + }, + { + "epoch": 0.7017450142450142, + "grad_norm": 0.4929407835006714, + "learning_rate": 0.00018531298044834643, + "loss": 1.0792, + "step": 3941 + }, + { + "epoch": 0.7019230769230769, + "grad_norm": 0.5645940899848938, + "learning_rate": 0.00018530567714952932, + "loss": 1.0937, + "step": 3942 + }, + { + "epoch": 0.7021011396011396, + "grad_norm": 0.5471178293228149, + "learning_rate": 0.00018529837217932466, + "loss": 1.193, + "step": 3943 + }, + { + "epoch": 0.7022792022792023, + "grad_norm": 0.576627790927887, + "learning_rate": 0.00018529106553787558, + "loss": 1.1032, + "step": 3944 + }, + { + "epoch": 0.7024572649572649, + "grad_norm": 0.5015735626220703, + "learning_rate": 0.00018528375722532526, + "loss": 1.066, + "step": 3945 + }, + { + "epoch": 0.7026353276353277, + "grad_norm": 0.5315404534339905, + "learning_rate": 0.00018527644724181683, + "loss": 1.2059, + "step": 3946 + }, + { + "epoch": 0.7028133903133903, + "grad_norm": 0.5516065955162048, + "learning_rate": 0.0001852691355874936, + "loss": 1.161, + "step": 3947 + }, + { + "epoch": 0.7029914529914529, + "grad_norm": 0.5026212930679321, + "learning_rate": 0.0001852618222624988, + "loss": 1.2616, + "step": 3948 + }, + { + "epoch": 0.7031695156695157, + "grad_norm": 0.49874603748321533, + "learning_rate": 0.0001852545072669757, + "loss": 0.805, + "step": 3949 + }, + { + "epoch": 0.7033475783475783, + "grad_norm": 0.47698748111724854, + "learning_rate": 0.00018524719060106763, + "loss": 1.2321, + "step": 3950 + }, + { + "epoch": 0.7035256410256411, + "grad_norm": 0.5201322436332703, + "learning_rate": 0.00018523987226491792, + "loss": 1.1577, + "step": 3951 + }, + { + "epoch": 0.7037037037037037, + "grad_norm": 0.5506543517112732, + "learning_rate": 0.00018523255225867002, + "loss": 1.2289, + "step": 3952 + }, + { + "epoch": 0.7038817663817664, + "grad_norm": 0.5691256523132324, + "learning_rate": 0.0001852252305824673, + "loss": 1.1945, + 
"step": 3953 + }, + { + "epoch": 0.7040598290598291, + "grad_norm": 0.5324838757514954, + "learning_rate": 0.00018521790723645322, + "loss": 1.1037, + "step": 3954 + }, + { + "epoch": 0.7042378917378918, + "grad_norm": 0.5238786339759827, + "learning_rate": 0.00018521058222077127, + "loss": 1.2075, + "step": 3955 + }, + { + "epoch": 0.7044159544159544, + "grad_norm": 0.4936453402042389, + "learning_rate": 0.00018520325553556498, + "loss": 1.0537, + "step": 3956 + }, + { + "epoch": 0.7045940170940171, + "grad_norm": 0.6198282837867737, + "learning_rate": 0.00018519592718097791, + "loss": 1.0728, + "step": 3957 + }, + { + "epoch": 0.7047720797720798, + "grad_norm": 0.44729140400886536, + "learning_rate": 0.0001851885971571536, + "loss": 0.8432, + "step": 3958 + }, + { + "epoch": 0.7049501424501424, + "grad_norm": 0.5884211659431458, + "learning_rate": 0.00018518126546423572, + "loss": 0.9515, + "step": 3959 + }, + { + "epoch": 0.7051282051282052, + "grad_norm": 0.5293807983398438, + "learning_rate": 0.00018517393210236788, + "loss": 1.1178, + "step": 3960 + }, + { + "epoch": 0.7053062678062678, + "grad_norm": 0.6036825180053711, + "learning_rate": 0.00018516659707169374, + "loss": 1.0408, + "step": 3961 + }, + { + "epoch": 0.7054843304843305, + "grad_norm": 0.5157122015953064, + "learning_rate": 0.0001851592603723571, + "loss": 1.2136, + "step": 3962 + }, + { + "epoch": 0.7056623931623932, + "grad_norm": 0.5354781150817871, + "learning_rate": 0.00018515192200450163, + "loss": 0.7165, + "step": 3963 + }, + { + "epoch": 0.7058404558404558, + "grad_norm": 0.6073734760284424, + "learning_rate": 0.00018514458196827111, + "loss": 1.3079, + "step": 3964 + }, + { + "epoch": 0.7060185185185185, + "grad_norm": 0.4324839413166046, + "learning_rate": 0.0001851372402638094, + "loss": 0.7903, + "step": 3965 + }, + { + "epoch": 0.7061965811965812, + "grad_norm": 0.6530333161354065, + "learning_rate": 0.00018512989689126034, + "loss": 1.3179, + "step": 3966 + }, + { + "epoch": 
0.7063746438746439, + "grad_norm": 0.5500404238700867, + "learning_rate": 0.00018512255185076782, + "loss": 1.0624, + "step": 3967 + }, + { + "epoch": 0.7065527065527065, + "grad_norm": 0.6277863383293152, + "learning_rate": 0.00018511520514247567, + "loss": 1.1056, + "step": 3968 + }, + { + "epoch": 0.7067307692307693, + "grad_norm": 0.580544650554657, + "learning_rate": 0.0001851078567665279, + "loss": 0.9849, + "step": 3969 + }, + { + "epoch": 0.7069088319088319, + "grad_norm": 0.4880999028682709, + "learning_rate": 0.00018510050672306848, + "loss": 1.0185, + "step": 3970 + }, + { + "epoch": 0.7070868945868946, + "grad_norm": 0.4919959306716919, + "learning_rate": 0.0001850931550122414, + "loss": 1.0334, + "step": 3971 + }, + { + "epoch": 0.7072649572649573, + "grad_norm": 0.6001213192939758, + "learning_rate": 0.0001850858016341907, + "loss": 1.0729, + "step": 3972 + }, + { + "epoch": 0.70744301994302, + "grad_norm": 0.538690447807312, + "learning_rate": 0.00018507844658906052, + "loss": 1.0733, + "step": 3973 + }, + { + "epoch": 0.7076210826210826, + "grad_norm": 0.5427643656730652, + "learning_rate": 0.00018507108987699487, + "loss": 1.1207, + "step": 3974 + }, + { + "epoch": 0.7077991452991453, + "grad_norm": 0.43014347553253174, + "learning_rate": 0.00018506373149813795, + "loss": 0.7958, + "step": 3975 + }, + { + "epoch": 0.707977207977208, + "grad_norm": 0.56591796875, + "learning_rate": 0.00018505637145263394, + "loss": 1.2199, + "step": 3976 + }, + { + "epoch": 0.7081552706552706, + "grad_norm": 0.59147047996521, + "learning_rate": 0.000185049009740627, + "loss": 1.2354, + "step": 3977 + }, + { + "epoch": 0.7083333333333334, + "grad_norm": 0.5078346133232117, + "learning_rate": 0.00018504164636226137, + "loss": 0.976, + "step": 3978 + }, + { + "epoch": 0.708511396011396, + "grad_norm": 0.533302366733551, + "learning_rate": 0.00018503428131768135, + "loss": 0.9653, + "step": 3979 + }, + { + "epoch": 0.7086894586894587, + "grad_norm": 0.4985341727733612, 
+ "learning_rate": 0.00018502691460703122, + "loss": 1.1485, + "step": 3980 + }, + { + "epoch": 0.7088675213675214, + "grad_norm": 0.5143141150474548, + "learning_rate": 0.00018501954623045532, + "loss": 1.148, + "step": 3981 + }, + { + "epoch": 0.709045584045584, + "grad_norm": 0.507189154624939, + "learning_rate": 0.00018501217618809804, + "loss": 0.9306, + "step": 3982 + }, + { + "epoch": 0.7092236467236467, + "grad_norm": 0.5246604084968567, + "learning_rate": 0.00018500480448010377, + "loss": 0.9116, + "step": 3983 + }, + { + "epoch": 0.7094017094017094, + "grad_norm": 0.5321049094200134, + "learning_rate": 0.00018499743110661693, + "loss": 0.9607, + "step": 3984 + }, + { + "epoch": 0.7095797720797721, + "grad_norm": 0.62645423412323, + "learning_rate": 0.000184990056067782, + "loss": 1.5834, + "step": 3985 + }, + { + "epoch": 0.7097578347578347, + "grad_norm": 0.486557275056839, + "learning_rate": 0.0001849826793637435, + "loss": 1.0598, + "step": 3986 + }, + { + "epoch": 0.7099358974358975, + "grad_norm": 0.5122783184051514, + "learning_rate": 0.0001849753009946459, + "loss": 1.2213, + "step": 3987 + }, + { + "epoch": 0.7101139601139601, + "grad_norm": 0.4864068627357483, + "learning_rate": 0.0001849679209606338, + "loss": 1.2708, + "step": 3988 + }, + { + "epoch": 0.7102920227920227, + "grad_norm": 0.5860990881919861, + "learning_rate": 0.00018496053926185183, + "loss": 1.2421, + "step": 3989 + }, + { + "epoch": 0.7104700854700855, + "grad_norm": 0.471194326877594, + "learning_rate": 0.00018495315589844453, + "loss": 0.879, + "step": 3990 + }, + { + "epoch": 0.7106481481481481, + "grad_norm": 0.5626323819160461, + "learning_rate": 0.00018494577087055662, + "loss": 1.1297, + "step": 3991 + }, + { + "epoch": 0.7108262108262108, + "grad_norm": 0.4706762135028839, + "learning_rate": 0.0001849383841783328, + "loss": 1.0444, + "step": 3992 + }, + { + "epoch": 0.7110042735042735, + "grad_norm": 0.5776444673538208, + "learning_rate": 0.00018493099582191783, + 
"loss": 1.1773, + "step": 3993 + }, + { + "epoch": 0.7111823361823362, + "grad_norm": 0.5493253469467163, + "learning_rate": 0.00018492360580145637, + "loss": 1.0354, + "step": 3994 + }, + { + "epoch": 0.7113603988603988, + "grad_norm": 0.5328514575958252, + "learning_rate": 0.0001849162141170933, + "loss": 0.9251, + "step": 3995 + }, + { + "epoch": 0.7115384615384616, + "grad_norm": 0.5814893841743469, + "learning_rate": 0.0001849088207689734, + "loss": 1.1066, + "step": 3996 + }, + { + "epoch": 0.7117165242165242, + "grad_norm": 0.5476071834564209, + "learning_rate": 0.00018490142575724154, + "loss": 1.1613, + "step": 3997 + }, + { + "epoch": 0.7118945868945868, + "grad_norm": 0.5216463208198547, + "learning_rate": 0.00018489402908204258, + "loss": 1.2574, + "step": 3998 + }, + { + "epoch": 0.7120726495726496, + "grad_norm": 0.5110020637512207, + "learning_rate": 0.00018488663074352153, + "loss": 1.0663, + "step": 3999 + }, + { + "epoch": 0.7122507122507122, + "grad_norm": 0.448090523481369, + "learning_rate": 0.00018487923074182326, + "loss": 0.6687, + "step": 4000 + }, + { + "epoch": 0.7124287749287749, + "grad_norm": 0.4980565011501312, + "learning_rate": 0.00018487182907709279, + "loss": 1.2365, + "step": 4001 + }, + { + "epoch": 0.7126068376068376, + "grad_norm": 0.485831081867218, + "learning_rate": 0.00018486442574947511, + "loss": 1.0941, + "step": 4002 + }, + { + "epoch": 0.7127849002849003, + "grad_norm": 0.4955040216445923, + "learning_rate": 0.00018485702075911534, + "loss": 1.248, + "step": 4003 + }, + { + "epoch": 0.7129629629629629, + "grad_norm": 0.5168375968933105, + "learning_rate": 0.00018484961410615845, + "loss": 1.1118, + "step": 4004 + }, + { + "epoch": 0.7131410256410257, + "grad_norm": 0.5255687832832336, + "learning_rate": 0.00018484220579074968, + "loss": 1.0558, + "step": 4005 + }, + { + "epoch": 0.7133190883190883, + "grad_norm": 0.5502219796180725, + "learning_rate": 0.00018483479581303416, + "loss": 1.1604, + "step": 4006 + }, + { + 
"epoch": 0.7134971509971509, + "grad_norm": 0.5155881643295288, + "learning_rate": 0.000184827384173157, + "loss": 0.8246, + "step": 4007 + }, + { + "epoch": 0.7136752136752137, + "grad_norm": 0.5321542024612427, + "learning_rate": 0.0001848199708712635, + "loss": 1.2058, + "step": 4008 + }, + { + "epoch": 0.7138532763532763, + "grad_norm": 0.4929848313331604, + "learning_rate": 0.00018481255590749884, + "loss": 1.4023, + "step": 4009 + }, + { + "epoch": 0.7140313390313391, + "grad_norm": 0.5070937871932983, + "learning_rate": 0.00018480513928200836, + "loss": 1.0561, + "step": 4010 + }, + { + "epoch": 0.7142094017094017, + "grad_norm": 0.5750083327293396, + "learning_rate": 0.00018479772099493728, + "loss": 1.0276, + "step": 4011 + }, + { + "epoch": 0.7143874643874644, + "grad_norm": 0.5265933275222778, + "learning_rate": 0.00018479030104643108, + "loss": 1.0295, + "step": 4012 + }, + { + "epoch": 0.7145655270655271, + "grad_norm": 0.526830792427063, + "learning_rate": 0.00018478287943663504, + "loss": 1.0157, + "step": 4013 + }, + { + "epoch": 0.7147435897435898, + "grad_norm": 0.5344091653823853, + "learning_rate": 0.00018477545616569458, + "loss": 1.1997, + "step": 4014 + }, + { + "epoch": 0.7149216524216524, + "grad_norm": 0.4935445189476013, + "learning_rate": 0.0001847680312337552, + "loss": 1.1858, + "step": 4015 + }, + { + "epoch": 0.7150997150997151, + "grad_norm": 0.5291212797164917, + "learning_rate": 0.0001847606046409623, + "loss": 0.926, + "step": 4016 + }, + { + "epoch": 0.7152777777777778, + "grad_norm": 0.559050977230072, + "learning_rate": 0.00018475317638746142, + "loss": 1.0947, + "step": 4017 + }, + { + "epoch": 0.7154558404558404, + "grad_norm": 0.4566570222377777, + "learning_rate": 0.00018474574647339814, + "loss": 1.0334, + "step": 4018 + }, + { + "epoch": 0.7156339031339032, + "grad_norm": 0.5156155824661255, + "learning_rate": 0.000184738314898918, + "loss": 1.0076, + "step": 4019 + }, + { + "epoch": 0.7158119658119658, + "grad_norm": 
0.5008716583251953, + "learning_rate": 0.00018473088166416662, + "loss": 1.0378, + "step": 4020 + }, + { + "epoch": 0.7159900284900285, + "grad_norm": 0.49556368589401245, + "learning_rate": 0.0001847234467692896, + "loss": 1.15, + "step": 4021 + }, + { + "epoch": 0.7161680911680912, + "grad_norm": 0.5464680790901184, + "learning_rate": 0.00018471601021443265, + "loss": 1.2975, + "step": 4022 + }, + { + "epoch": 0.7163461538461539, + "grad_norm": 0.6291980147361755, + "learning_rate": 0.00018470857199974144, + "loss": 1.05, + "step": 4023 + }, + { + "epoch": 0.7165242165242165, + "grad_norm": 0.5566631555557251, + "learning_rate": 0.00018470113212536176, + "loss": 1.1296, + "step": 4024 + }, + { + "epoch": 0.7167022792022792, + "grad_norm": 0.5569562911987305, + "learning_rate": 0.00018469369059143933, + "loss": 1.2484, + "step": 4025 + }, + { + "epoch": 0.7168803418803419, + "grad_norm": 0.5804716944694519, + "learning_rate": 0.00018468624739812, + "loss": 1.0547, + "step": 4026 + }, + { + "epoch": 0.7170584045584045, + "grad_norm": 0.6316802501678467, + "learning_rate": 0.00018467880254554952, + "loss": 1.1188, + "step": 4027 + }, + { + "epoch": 0.7172364672364673, + "grad_norm": 0.6131419539451599, + "learning_rate": 0.00018467135603387385, + "loss": 1.1662, + "step": 4028 + }, + { + "epoch": 0.7174145299145299, + "grad_norm": 0.4703124761581421, + "learning_rate": 0.00018466390786323883, + "loss": 1.038, + "step": 4029 + }, + { + "epoch": 0.7175925925925926, + "grad_norm": 0.5718469023704529, + "learning_rate": 0.0001846564580337904, + "loss": 1.0786, + "step": 4030 + }, + { + "epoch": 0.7177706552706553, + "grad_norm": 0.5227612853050232, + "learning_rate": 0.00018464900654567457, + "loss": 1.0561, + "step": 4031 + }, + { + "epoch": 0.717948717948718, + "grad_norm": 0.5800358057022095, + "learning_rate": 0.00018464155339903727, + "loss": 1.0944, + "step": 4032 + }, + { + "epoch": 0.7181267806267806, + "grad_norm": 0.5562314987182617, + "learning_rate": 
0.00018463409859402455, + "loss": 0.8573, + "step": 4033 + }, + { + "epoch": 0.7183048433048433, + "grad_norm": 0.6420153379440308, + "learning_rate": 0.0001846266421307825, + "loss": 1.088, + "step": 4034 + }, + { + "epoch": 0.718482905982906, + "grad_norm": 0.4745902717113495, + "learning_rate": 0.00018461918400945718, + "loss": 1.1679, + "step": 4035 + }, + { + "epoch": 0.7186609686609686, + "grad_norm": 0.5070300102233887, + "learning_rate": 0.00018461172423019475, + "loss": 1.1984, + "step": 4036 + }, + { + "epoch": 0.7188390313390314, + "grad_norm": 0.5339375138282776, + "learning_rate": 0.00018460426279314133, + "loss": 1.3038, + "step": 4037 + }, + { + "epoch": 0.719017094017094, + "grad_norm": 0.5947147607803345, + "learning_rate": 0.00018459679969844313, + "loss": 1.0103, + "step": 4038 + }, + { + "epoch": 0.7191951566951567, + "grad_norm": 0.5493791699409485, + "learning_rate": 0.00018458933494624642, + "loss": 1.1001, + "step": 4039 + }, + { + "epoch": 0.7193732193732194, + "grad_norm": 0.5700310468673706, + "learning_rate": 0.00018458186853669736, + "loss": 0.9006, + "step": 4040 + }, + { + "epoch": 0.719551282051282, + "grad_norm": 0.60371994972229, + "learning_rate": 0.0001845744004699423, + "loss": 1.3001, + "step": 4041 + }, + { + "epoch": 0.7197293447293447, + "grad_norm": 0.5469261407852173, + "learning_rate": 0.00018456693074612757, + "loss": 1.1745, + "step": 4042 + }, + { + "epoch": 0.7199074074074074, + "grad_norm": 0.5179165601730347, + "learning_rate": 0.00018455945936539947, + "loss": 0.9883, + "step": 4043 + }, + { + "epoch": 0.7200854700854701, + "grad_norm": 0.5396696329116821, + "learning_rate": 0.00018455198632790447, + "loss": 1.1277, + "step": 4044 + }, + { + "epoch": 0.7202635327635327, + "grad_norm": 0.4559909403324127, + "learning_rate": 0.00018454451163378888, + "loss": 0.9644, + "step": 4045 + }, + { + "epoch": 0.7204415954415955, + "grad_norm": 0.49863892793655396, + "learning_rate": 0.00018453703528319927, + "loss": 1.1276, + 
"step": 4046 + }, + { + "epoch": 0.7206196581196581, + "grad_norm": 0.4790710508823395, + "learning_rate": 0.000184529557276282, + "loss": 0.9443, + "step": 4047 + }, + { + "epoch": 0.7207977207977208, + "grad_norm": 0.541999876499176, + "learning_rate": 0.0001845220776131837, + "loss": 1.0681, + "step": 4048 + }, + { + "epoch": 0.7209757834757835, + "grad_norm": 0.5119109153747559, + "learning_rate": 0.00018451459629405088, + "loss": 1.2078, + "step": 4049 + }, + { + "epoch": 0.7211538461538461, + "grad_norm": 0.6141307353973389, + "learning_rate": 0.00018450711331903006, + "loss": 1.1071, + "step": 4050 + }, + { + "epoch": 0.7213319088319088, + "grad_norm": 0.48679864406585693, + "learning_rate": 0.00018449962868826795, + "loss": 0.9713, + "step": 4051 + }, + { + "epoch": 0.7215099715099715, + "grad_norm": 0.5548661947250366, + "learning_rate": 0.0001844921424019111, + "loss": 1.2099, + "step": 4052 + }, + { + "epoch": 0.7216880341880342, + "grad_norm": 0.5000107884407043, + "learning_rate": 0.00018448465446010626, + "loss": 1.0184, + "step": 4053 + }, + { + "epoch": 0.7218660968660968, + "grad_norm": 0.6131454110145569, + "learning_rate": 0.00018447716486300013, + "loss": 1.2581, + "step": 4054 + }, + { + "epoch": 0.7220441595441596, + "grad_norm": 0.5145987868309021, + "learning_rate": 0.0001844696736107394, + "loss": 1.1646, + "step": 4055 + }, + { + "epoch": 0.7222222222222222, + "grad_norm": 0.4361337125301361, + "learning_rate": 0.00018446218070347094, + "loss": 0.8239, + "step": 4056 + }, + { + "epoch": 0.7224002849002849, + "grad_norm": 0.5549173355102539, + "learning_rate": 0.00018445468614134146, + "loss": 1.1935, + "step": 4057 + }, + { + "epoch": 0.7225783475783476, + "grad_norm": 0.5569297671318054, + "learning_rate": 0.00018444718992449789, + "loss": 1.0137, + "step": 4058 + }, + { + "epoch": 0.7227564102564102, + "grad_norm": 0.44866305589675903, + "learning_rate": 0.00018443969205308704, + "loss": 0.987, + "step": 4059 + }, + { + "epoch": 
0.7229344729344729, + "grad_norm": 0.5142943263053894, + "learning_rate": 0.0001844321925272558, + "loss": 1.0837, + "step": 4060 + }, + { + "epoch": 0.7231125356125356, + "grad_norm": 0.4922119379043579, + "learning_rate": 0.0001844246913471512, + "loss": 0.8477, + "step": 4061 + }, + { + "epoch": 0.7232905982905983, + "grad_norm": 0.5245375633239746, + "learning_rate": 0.0001844171885129201, + "loss": 0.9985, + "step": 4062 + }, + { + "epoch": 0.7234686609686609, + "grad_norm": 0.45562678575515747, + "learning_rate": 0.00018440968402470956, + "loss": 0.8678, + "step": 4063 + }, + { + "epoch": 0.7236467236467237, + "grad_norm": 0.5388376712799072, + "learning_rate": 0.0001844021778826666, + "loss": 1.0586, + "step": 4064 + }, + { + "epoch": 0.7238247863247863, + "grad_norm": 0.48945263028144836, + "learning_rate": 0.00018439467008693833, + "loss": 1.0547, + "step": 4065 + }, + { + "epoch": 0.7240028490028491, + "grad_norm": 0.5202330350875854, + "learning_rate": 0.00018438716063767178, + "loss": 1.3142, + "step": 4066 + }, + { + "epoch": 0.7241809116809117, + "grad_norm": 0.5432567000389099, + "learning_rate": 0.00018437964953501413, + "loss": 1.0192, + "step": 4067 + }, + { + "epoch": 0.7243589743589743, + "grad_norm": 0.5220325589179993, + "learning_rate": 0.00018437213677911253, + "loss": 1.0904, + "step": 4068 + }, + { + "epoch": 0.7245370370370371, + "grad_norm": 0.45711690187454224, + "learning_rate": 0.00018436462237011417, + "loss": 1.0417, + "step": 4069 + }, + { + "epoch": 0.7247150997150997, + "grad_norm": 0.560778021812439, + "learning_rate": 0.0001843571063081663, + "loss": 1.2316, + "step": 4070 + }, + { + "epoch": 0.7248931623931624, + "grad_norm": 0.591533362865448, + "learning_rate": 0.0001843495885934162, + "loss": 1.0294, + "step": 4071 + }, + { + "epoch": 0.7250712250712251, + "grad_norm": 0.5550443530082703, + "learning_rate": 0.00018434206922601106, + "loss": 1.0162, + "step": 4072 + }, + { + "epoch": 0.7252492877492878, + "grad_norm": 
0.5744053721427917, + "learning_rate": 0.00018433454820609833, + "loss": 1.2774, + "step": 4073 + }, + { + "epoch": 0.7254273504273504, + "grad_norm": 0.6210703253746033, + "learning_rate": 0.0001843270255338253, + "loss": 1.2526, + "step": 4074 + }, + { + "epoch": 0.7256054131054132, + "grad_norm": 0.49684277176856995, + "learning_rate": 0.0001843195012093394, + "loss": 1.0786, + "step": 4075 + }, + { + "epoch": 0.7257834757834758, + "grad_norm": 0.5851606130599976, + "learning_rate": 0.00018431197523278802, + "loss": 1.14, + "step": 4076 + }, + { + "epoch": 0.7259615384615384, + "grad_norm": 0.5494425296783447, + "learning_rate": 0.00018430444760431862, + "loss": 1.211, + "step": 4077 + }, + { + "epoch": 0.7261396011396012, + "grad_norm": 0.5247658491134644, + "learning_rate": 0.00018429691832407867, + "loss": 0.8031, + "step": 4078 + }, + { + "epoch": 0.7263176638176638, + "grad_norm": 0.5012249946594238, + "learning_rate": 0.00018428938739221574, + "loss": 1.1258, + "step": 4079 + }, + { + "epoch": 0.7264957264957265, + "grad_norm": 0.5226427912712097, + "learning_rate": 0.0001842818548088774, + "loss": 1.0029, + "step": 4080 + }, + { + "epoch": 0.7266737891737892, + "grad_norm": 0.45008543133735657, + "learning_rate": 0.00018427432057421114, + "loss": 1.0681, + "step": 4081 + }, + { + "epoch": 0.7268518518518519, + "grad_norm": 0.5127285122871399, + "learning_rate": 0.00018426678468836467, + "loss": 1.1069, + "step": 4082 + }, + { + "epoch": 0.7270299145299145, + "grad_norm": 0.5406150221824646, + "learning_rate": 0.0001842592471514856, + "loss": 1.052, + "step": 4083 + }, + { + "epoch": 0.7272079772079773, + "grad_norm": 0.5001157522201538, + "learning_rate": 0.0001842517079637216, + "loss": 0.9157, + "step": 4084 + }, + { + "epoch": 0.7273860398860399, + "grad_norm": 0.6169779300689697, + "learning_rate": 0.00018424416712522042, + "loss": 1.3133, + "step": 4085 + }, + { + "epoch": 0.7275641025641025, + "grad_norm": 0.4891316890716553, + "learning_rate": 
0.00018423662463612974, + "loss": 0.9505, + "step": 4086 + }, + { + "epoch": 0.7277421652421653, + "grad_norm": 0.5883708596229553, + "learning_rate": 0.00018422908049659743, + "loss": 1.2797, + "step": 4087 + }, + { + "epoch": 0.7279202279202279, + "grad_norm": 0.6679072976112366, + "learning_rate": 0.00018422153470677125, + "loss": 1.1096, + "step": 4088 + }, + { + "epoch": 0.7280982905982906, + "grad_norm": 0.5178479552268982, + "learning_rate": 0.00018421398726679904, + "loss": 1.0299, + "step": 4089 + }, + { + "epoch": 0.7282763532763533, + "grad_norm": 0.6343900561332703, + "learning_rate": 0.0001842064381768287, + "loss": 1.2983, + "step": 4090 + }, + { + "epoch": 0.728454415954416, + "grad_norm": 0.43816515803337097, + "learning_rate": 0.0001841988874370081, + "loss": 0.9452, + "step": 4091 + }, + { + "epoch": 0.7286324786324786, + "grad_norm": 0.579790472984314, + "learning_rate": 0.00018419133504748528, + "loss": 1.1037, + "step": 4092 + }, + { + "epoch": 0.7288105413105413, + "grad_norm": 0.571374773979187, + "learning_rate": 0.00018418378100840807, + "loss": 1.1655, + "step": 4093 + }, + { + "epoch": 0.728988603988604, + "grad_norm": 0.5163514018058777, + "learning_rate": 0.0001841762253199246, + "loss": 1.1579, + "step": 4094 + }, + { + "epoch": 0.7291666666666666, + "grad_norm": 0.6553022265434265, + "learning_rate": 0.0001841686679821828, + "loss": 0.9664, + "step": 4095 + }, + { + "epoch": 0.7293447293447294, + "grad_norm": 0.5072969198226929, + "learning_rate": 0.00018416110899533084, + "loss": 0.9416, + "step": 4096 + }, + { + "epoch": 0.729522792022792, + "grad_norm": 0.5103251338005066, + "learning_rate": 0.00018415354835951675, + "loss": 1.0715, + "step": 4097 + }, + { + "epoch": 0.7297008547008547, + "grad_norm": 0.49752289056777954, + "learning_rate": 0.00018414598607488874, + "loss": 1.1848, + "step": 4098 + }, + { + "epoch": 0.7298789173789174, + "grad_norm": 0.5361882448196411, + "learning_rate": 0.00018413842214159488, + "loss": 1.1035, + 
"step": 4099 + }, + { + "epoch": 0.73005698005698, + "grad_norm": 0.5167670249938965, + "learning_rate": 0.00018413085655978343, + "loss": 1.0015, + "step": 4100 + }, + { + "epoch": 0.7302350427350427, + "grad_norm": 0.5930629372596741, + "learning_rate": 0.00018412328932960263, + "loss": 0.9766, + "step": 4101 + }, + { + "epoch": 0.7304131054131054, + "grad_norm": 0.5234778523445129, + "learning_rate": 0.00018411572045120073, + "loss": 1.0317, + "step": 4102 + }, + { + "epoch": 0.7305911680911681, + "grad_norm": 0.5361374020576477, + "learning_rate": 0.000184108149924726, + "loss": 1.1228, + "step": 4103 + }, + { + "epoch": 0.7307692307692307, + "grad_norm": 0.5845770239830017, + "learning_rate": 0.0001841005777503268, + "loss": 0.9541, + "step": 4104 + }, + { + "epoch": 0.7309472934472935, + "grad_norm": 0.49320483207702637, + "learning_rate": 0.0001840930039281515, + "loss": 0.9445, + "step": 4105 + }, + { + "epoch": 0.7311253561253561, + "grad_norm": 0.5391250252723694, + "learning_rate": 0.00018408542845834845, + "loss": 1.1983, + "step": 4106 + }, + { + "epoch": 0.7313034188034188, + "grad_norm": 0.4890393316745758, + "learning_rate": 0.00018407785134106613, + "loss": 0.8353, + "step": 4107 + }, + { + "epoch": 0.7314814814814815, + "grad_norm": 0.5839747190475464, + "learning_rate": 0.00018407027257645296, + "loss": 1.4074, + "step": 4108 + }, + { + "epoch": 0.7316595441595442, + "grad_norm": 0.5957708358764648, + "learning_rate": 0.0001840626921646574, + "loss": 1.1032, + "step": 4109 + }, + { + "epoch": 0.7318376068376068, + "grad_norm": 0.5029017925262451, + "learning_rate": 0.00018405511010582805, + "loss": 1.095, + "step": 4110 + }, + { + "epoch": 0.7320156695156695, + "grad_norm": 0.6054347157478333, + "learning_rate": 0.00018404752640011345, + "loss": 1.0366, + "step": 4111 + }, + { + "epoch": 0.7321937321937322, + "grad_norm": 0.5476830005645752, + "learning_rate": 0.00018403994104766212, + "loss": 1.0976, + "step": 4112 + }, + { + "epoch": 
0.7323717948717948, + "grad_norm": 0.5000962615013123, + "learning_rate": 0.00018403235404862277, + "loss": 1.0809, + "step": 4113 + }, + { + "epoch": 0.7325498575498576, + "grad_norm": 0.5119251012802124, + "learning_rate": 0.00018402476540314394, + "loss": 1.0176, + "step": 4114 + }, + { + "epoch": 0.7327279202279202, + "grad_norm": 0.5825830698013306, + "learning_rate": 0.00018401717511137445, + "loss": 1.2357, + "step": 4115 + }, + { + "epoch": 0.7329059829059829, + "grad_norm": 0.5702941417694092, + "learning_rate": 0.0001840095831734629, + "loss": 1.1549, + "step": 4116 + }, + { + "epoch": 0.7330840455840456, + "grad_norm": 0.5660699605941772, + "learning_rate": 0.00018400198958955807, + "loss": 1.1778, + "step": 4117 + }, + { + "epoch": 0.7332621082621082, + "grad_norm": 0.5241161584854126, + "learning_rate": 0.0001839943943598088, + "loss": 0.8587, + "step": 4118 + }, + { + "epoch": 0.7334401709401709, + "grad_norm": 0.581194281578064, + "learning_rate": 0.0001839867974843638, + "loss": 1.2169, + "step": 4119 + }, + { + "epoch": 0.7336182336182336, + "grad_norm": 0.4342379570007324, + "learning_rate": 0.00018397919896337198, + "loss": 0.9182, + "step": 4120 + }, + { + "epoch": 0.7337962962962963, + "grad_norm": 0.5708567500114441, + "learning_rate": 0.00018397159879698224, + "loss": 1.1781, + "step": 4121 + }, + { + "epoch": 0.7339743589743589, + "grad_norm": 0.5827265977859497, + "learning_rate": 0.00018396399698534344, + "loss": 1.2905, + "step": 4122 + }, + { + "epoch": 0.7341524216524217, + "grad_norm": 0.5274056792259216, + "learning_rate": 0.00018395639352860457, + "loss": 1.1786, + "step": 4123 + }, + { + "epoch": 0.7343304843304843, + "grad_norm": 0.5094266533851624, + "learning_rate": 0.00018394878842691452, + "loss": 1.2016, + "step": 4124 + }, + { + "epoch": 0.7345085470085471, + "grad_norm": 0.48779475688934326, + "learning_rate": 0.0001839411816804224, + "loss": 1.0562, + "step": 4125 + }, + { + "epoch": 0.7346866096866097, + "grad_norm": 
0.5805709958076477, + "learning_rate": 0.00018393357328927716, + "loss": 1.1705, + "step": 4126 + }, + { + "epoch": 0.7348646723646723, + "grad_norm": 0.4910700023174286, + "learning_rate": 0.00018392596325362791, + "loss": 1.0682, + "step": 4127 + }, + { + "epoch": 0.7350427350427351, + "grad_norm": 0.5297428369522095, + "learning_rate": 0.0001839183515736238, + "loss": 0.9505, + "step": 4128 + }, + { + "epoch": 0.7352207977207977, + "grad_norm": 0.45442086458206177, + "learning_rate": 0.00018391073824941385, + "loss": 0.9548, + "step": 4129 + }, + { + "epoch": 0.7353988603988604, + "grad_norm": 0.49299946427345276, + "learning_rate": 0.00018390312328114733, + "loss": 1.0868, + "step": 4130 + }, + { + "epoch": 0.7355769230769231, + "grad_norm": 0.4839940369129181, + "learning_rate": 0.0001838955066689734, + "loss": 0.9565, + "step": 4131 + }, + { + "epoch": 0.7357549857549858, + "grad_norm": 0.48600608110427856, + "learning_rate": 0.00018388788841304128, + "loss": 1.2353, + "step": 4132 + }, + { + "epoch": 0.7359330484330484, + "grad_norm": 0.4893583357334137, + "learning_rate": 0.0001838802685135003, + "loss": 0.9595, + "step": 4133 + }, + { + "epoch": 0.7361111111111112, + "grad_norm": 0.4587398171424866, + "learning_rate": 0.00018387264697049963, + "loss": 1.1222, + "step": 4134 + }, + { + "epoch": 0.7362891737891738, + "grad_norm": 0.5361055731773376, + "learning_rate": 0.00018386502378418872, + "loss": 1.3304, + "step": 4135 + }, + { + "epoch": 0.7364672364672364, + "grad_norm": 0.5556629300117493, + "learning_rate": 0.00018385739895471686, + "loss": 1.0358, + "step": 4136 + }, + { + "epoch": 0.7366452991452992, + "grad_norm": 0.45555856823921204, + "learning_rate": 0.00018384977248223346, + "loss": 1.0081, + "step": 4137 + }, + { + "epoch": 0.7368233618233618, + "grad_norm": 0.5606052875518799, + "learning_rate": 0.00018384214436688797, + "loss": 0.9367, + "step": 4138 + }, + { + "epoch": 0.7370014245014245, + "grad_norm": 0.5428356528282166, + 
"learning_rate": 0.00018383451460882982, + "loss": 1.1391, + "step": 4139 + }, + { + "epoch": 0.7371794871794872, + "grad_norm": 0.4891330897808075, + "learning_rate": 0.00018382688320820853, + "loss": 0.9805, + "step": 4140 + }, + { + "epoch": 0.7373575498575499, + "grad_norm": 0.5407996773719788, + "learning_rate": 0.0001838192501651736, + "loss": 1.0532, + "step": 4141 + }, + { + "epoch": 0.7375356125356125, + "grad_norm": 0.5241971611976624, + "learning_rate": 0.00018381161547987454, + "loss": 0.9509, + "step": 4142 + }, + { + "epoch": 0.7377136752136753, + "grad_norm": 0.5370210409164429, + "learning_rate": 0.000183803979152461, + "loss": 1.2342, + "step": 4143 + }, + { + "epoch": 0.7378917378917379, + "grad_norm": 0.5470060706138611, + "learning_rate": 0.00018379634118308259, + "loss": 0.9621, + "step": 4144 + }, + { + "epoch": 0.7380698005698005, + "grad_norm": 0.546313464641571, + "learning_rate": 0.00018378870157188893, + "loss": 1.1253, + "step": 4145 + }, + { + "epoch": 0.7382478632478633, + "grad_norm": 0.502027153968811, + "learning_rate": 0.00018378106031902974, + "loss": 1.1919, + "step": 4146 + }, + { + "epoch": 0.7384259259259259, + "grad_norm": 0.5282283425331116, + "learning_rate": 0.0001837734174246547, + "loss": 1.0088, + "step": 4147 + }, + { + "epoch": 0.7386039886039886, + "grad_norm": 0.5152897238731384, + "learning_rate": 0.00018376577288891355, + "loss": 1.0813, + "step": 4148 + }, + { + "epoch": 0.7387820512820513, + "grad_norm": 0.5002804398536682, + "learning_rate": 0.0001837581267119561, + "loss": 0.9797, + "step": 4149 + }, + { + "epoch": 0.738960113960114, + "grad_norm": 0.5698176026344299, + "learning_rate": 0.00018375047889393215, + "loss": 1.1099, + "step": 4150 + }, + { + "epoch": 0.7391381766381766, + "grad_norm": 0.5384604930877686, + "learning_rate": 0.00018374282943499156, + "loss": 1.1944, + "step": 4151 + }, + { + "epoch": 0.7393162393162394, + "grad_norm": 0.5483044385910034, + "learning_rate": 0.00018373517833528418, + 
"loss": 1.1734, + "step": 4152 + }, + { + "epoch": 0.739494301994302, + "grad_norm": 0.4824066162109375, + "learning_rate": 0.0001837275255949599, + "loss": 0.9515, + "step": 4153 + }, + { + "epoch": 0.7396723646723646, + "grad_norm": 0.45413634181022644, + "learning_rate": 0.00018371987121416873, + "loss": 0.7534, + "step": 4154 + }, + { + "epoch": 0.7398504273504274, + "grad_norm": 0.5874246954917908, + "learning_rate": 0.00018371221519306055, + "loss": 0.9464, + "step": 4155 + }, + { + "epoch": 0.74002849002849, + "grad_norm": 0.5219913125038147, + "learning_rate": 0.00018370455753178544, + "loss": 1.0494, + "step": 4156 + }, + { + "epoch": 0.7402065527065527, + "grad_norm": 0.5937709212303162, + "learning_rate": 0.00018369689823049341, + "loss": 1.0529, + "step": 4157 + }, + { + "epoch": 0.7403846153846154, + "grad_norm": 0.5204295516014099, + "learning_rate": 0.00018368923728933449, + "loss": 1.0602, + "step": 4158 + }, + { + "epoch": 0.7405626780626781, + "grad_norm": 0.5422890186309814, + "learning_rate": 0.00018368157470845885, + "loss": 0.9261, + "step": 4159 + }, + { + "epoch": 0.7407407407407407, + "grad_norm": 0.6163852214813232, + "learning_rate": 0.00018367391048801655, + "loss": 1.2771, + "step": 4160 + }, + { + "epoch": 0.7409188034188035, + "grad_norm": 0.5070751309394836, + "learning_rate": 0.00018366624462815785, + "loss": 1.0401, + "step": 4161 + }, + { + "epoch": 0.7410968660968661, + "grad_norm": 0.4477100968360901, + "learning_rate": 0.00018365857712903283, + "loss": 1.1463, + "step": 4162 + }, + { + "epoch": 0.7412749287749287, + "grad_norm": 0.5421462655067444, + "learning_rate": 0.0001836509079907918, + "loss": 0.9373, + "step": 4163 + }, + { + "epoch": 0.7414529914529915, + "grad_norm": 0.6162141561508179, + "learning_rate": 0.000183643237213585, + "loss": 1.1827, + "step": 4164 + }, + { + "epoch": 0.7416310541310541, + "grad_norm": 0.5653836131095886, + "learning_rate": 0.00018363556479756272, + "loss": 1.0689, + "step": 4165 + }, + { + 
"epoch": 0.7418091168091168, + "grad_norm": 0.57053542137146, + "learning_rate": 0.00018362789074287527, + "loss": 1.0289, + "step": 4166 + }, + { + "epoch": 0.7419871794871795, + "grad_norm": 0.5603055953979492, + "learning_rate": 0.00018362021504967304, + "loss": 1.1926, + "step": 4167 + }, + { + "epoch": 0.7421652421652422, + "grad_norm": 0.5460166335105896, + "learning_rate": 0.0001836125377181064, + "loss": 1.1488, + "step": 4168 + }, + { + "epoch": 0.7423433048433048, + "grad_norm": 0.5097107887268066, + "learning_rate": 0.00018360485874832579, + "loss": 1.0781, + "step": 4169 + }, + { + "epoch": 0.7425213675213675, + "grad_norm": 0.6280624270439148, + "learning_rate": 0.00018359717814048164, + "loss": 1.3625, + "step": 4170 + }, + { + "epoch": 0.7426994301994302, + "grad_norm": 0.4528210759162903, + "learning_rate": 0.0001835894958947244, + "loss": 0.8417, + "step": 4171 + }, + { + "epoch": 0.7428774928774928, + "grad_norm": 0.48735132813453674, + "learning_rate": 0.00018358181201120468, + "loss": 0.9544, + "step": 4172 + }, + { + "epoch": 0.7430555555555556, + "grad_norm": 0.48388174176216125, + "learning_rate": 0.00018357412649007296, + "loss": 1.0663, + "step": 4173 + }, + { + "epoch": 0.7432336182336182, + "grad_norm": 0.5435357689857483, + "learning_rate": 0.00018356643933147986, + "loss": 1.2074, + "step": 4174 + }, + { + "epoch": 0.7434116809116809, + "grad_norm": 0.49890074133872986, + "learning_rate": 0.00018355875053557594, + "loss": 1.1322, + "step": 4175 + }, + { + "epoch": 0.7435897435897436, + "grad_norm": 0.5680708885192871, + "learning_rate": 0.0001835510601025119, + "loss": 1.1964, + "step": 4176 + }, + { + "epoch": 0.7437678062678063, + "grad_norm": 0.5002360939979553, + "learning_rate": 0.00018354336803243842, + "loss": 1.1396, + "step": 4177 + }, + { + "epoch": 0.7439458689458689, + "grad_norm": 0.5202965140342712, + "learning_rate": 0.00018353567432550616, + "loss": 1.1498, + "step": 4178 + }, + { + "epoch": 0.7441239316239316, + 
"grad_norm": 0.514492928981781, + "learning_rate": 0.00018352797898186588, + "loss": 1.0959, + "step": 4179 + }, + { + "epoch": 0.7443019943019943, + "grad_norm": 0.6395383477210999, + "learning_rate": 0.0001835202820016684, + "loss": 1.2867, + "step": 4180 + }, + { + "epoch": 0.7444800569800569, + "grad_norm": 0.5489062070846558, + "learning_rate": 0.00018351258338506447, + "loss": 1.1638, + "step": 4181 + }, + { + "epoch": 0.7446581196581197, + "grad_norm": 0.5705671906471252, + "learning_rate": 0.00018350488313220498, + "loss": 0.9493, + "step": 4182 + }, + { + "epoch": 0.7448361823361823, + "grad_norm": 0.5404297709465027, + "learning_rate": 0.00018349718124324076, + "loss": 0.9876, + "step": 4183 + }, + { + "epoch": 0.7450142450142451, + "grad_norm": 0.5841003060340881, + "learning_rate": 0.0001834894777183227, + "loss": 1.1225, + "step": 4184 + }, + { + "epoch": 0.7451923076923077, + "grad_norm": 0.49774688482284546, + "learning_rate": 0.00018348177255760178, + "loss": 1.1442, + "step": 4185 + }, + { + "epoch": 0.7453703703703703, + "grad_norm": 0.5212422609329224, + "learning_rate": 0.00018347406576122894, + "loss": 1.101, + "step": 4186 + }, + { + "epoch": 0.7455484330484331, + "grad_norm": 0.615024983882904, + "learning_rate": 0.00018346635732935517, + "loss": 1.4188, + "step": 4187 + }, + { + "epoch": 0.7457264957264957, + "grad_norm": 0.46818843483924866, + "learning_rate": 0.00018345864726213154, + "loss": 1.0071, + "step": 4188 + }, + { + "epoch": 0.7459045584045584, + "grad_norm": 0.4921121895313263, + "learning_rate": 0.00018345093555970906, + "loss": 1.015, + "step": 4189 + }, + { + "epoch": 0.7460826210826211, + "grad_norm": 0.5042136311531067, + "learning_rate": 0.00018344322222223889, + "loss": 0.9974, + "step": 4190 + }, + { + "epoch": 0.7462606837606838, + "grad_norm": 0.5872490406036377, + "learning_rate": 0.0001834355072498721, + "loss": 1.3166, + "step": 4191 + }, + { + "epoch": 0.7464387464387464, + "grad_norm": 0.559117317199707, + 
"learning_rate": 0.00018342779064275984, + "loss": 1.2227, + "step": 4192 + }, + { + "epoch": 0.7466168091168092, + "grad_norm": 0.5269635319709778, + "learning_rate": 0.00018342007240105336, + "loss": 1.0281, + "step": 4193 + }, + { + "epoch": 0.7467948717948718, + "grad_norm": 0.4608335793018341, + "learning_rate": 0.00018341235252490387, + "loss": 0.98, + "step": 4194 + }, + { + "epoch": 0.7469729344729344, + "grad_norm": 0.5818259119987488, + "learning_rate": 0.00018340463101446255, + "loss": 1.1544, + "step": 4195 + }, + { + "epoch": 0.7471509971509972, + "grad_norm": 0.5577529668807983, + "learning_rate": 0.00018339690786988079, + "loss": 1.3059, + "step": 4196 + }, + { + "epoch": 0.7473290598290598, + "grad_norm": 0.5430468320846558, + "learning_rate": 0.00018338918309130983, + "loss": 1.2766, + "step": 4197 + }, + { + "epoch": 0.7475071225071225, + "grad_norm": 0.4941701591014862, + "learning_rate": 0.0001833814566789011, + "loss": 1.193, + "step": 4198 + }, + { + "epoch": 0.7476851851851852, + "grad_norm": 0.5471884608268738, + "learning_rate": 0.00018337372863280589, + "loss": 1.2261, + "step": 4199 + }, + { + "epoch": 0.7478632478632479, + "grad_norm": 0.4641438126564026, + "learning_rate": 0.0001833659989531757, + "loss": 0.7953, + "step": 4200 + }, + { + "epoch": 0.7480413105413105, + "grad_norm": 0.5244714617729187, + "learning_rate": 0.0001833582676401619, + "loss": 0.9344, + "step": 4201 + }, + { + "epoch": 0.7482193732193733, + "grad_norm": 0.5964360237121582, + "learning_rate": 0.00018335053469391603, + "loss": 1.2072, + "step": 4202 + }, + { + "epoch": 0.7483974358974359, + "grad_norm": 0.4929158091545105, + "learning_rate": 0.00018334280011458954, + "loss": 1.2183, + "step": 4203 + }, + { + "epoch": 0.7485754985754985, + "grad_norm": 0.46221864223480225, + "learning_rate": 0.00018333506390233405, + "loss": 1.1957, + "step": 4204 + }, + { + "epoch": 0.7487535612535613, + "grad_norm": 0.6301732659339905, + "learning_rate": 0.0001833273260573011, + 
"loss": 1.0582, + "step": 4205 + }, + { + "epoch": 0.7489316239316239, + "grad_norm": 0.5606021881103516, + "learning_rate": 0.0001833195865796423, + "loss": 1.4034, + "step": 4206 + }, + { + "epoch": 0.7491096866096866, + "grad_norm": 0.44856077432632446, + "learning_rate": 0.00018331184546950926, + "loss": 0.8421, + "step": 4207 + }, + { + "epoch": 0.7492877492877493, + "grad_norm": 0.5487226247787476, + "learning_rate": 0.00018330410272705366, + "loss": 1.238, + "step": 4208 + }, + { + "epoch": 0.749465811965812, + "grad_norm": 0.6043636798858643, + "learning_rate": 0.00018329635835242724, + "loss": 1.1215, + "step": 4209 + }, + { + "epoch": 0.7496438746438746, + "grad_norm": 0.5145319104194641, + "learning_rate": 0.00018328861234578173, + "loss": 1.1002, + "step": 4210 + }, + { + "epoch": 0.7498219373219374, + "grad_norm": 0.5667078495025635, + "learning_rate": 0.00018328086470726884, + "loss": 1.2994, + "step": 4211 + }, + { + "epoch": 0.75, + "grad_norm": 0.5117634534835815, + "learning_rate": 0.00018327311543704043, + "loss": 0.9448, + "step": 4212 + }, + { + "epoch": 0.75, + "eval_loss": 1.0982474088668823, + "eval_runtime": 24.6617, + "eval_samples_per_second": 42.211, + "eval_steps_per_second": 21.126, + "step": 4212 + }, + { + "epoch": 0.7501780626780626, + "grad_norm": 0.5451585054397583, + "learning_rate": 0.00018326536453524826, + "loss": 0.9023, + "step": 4213 + }, + { + "epoch": 0.7503561253561254, + "grad_norm": 0.6585208773612976, + "learning_rate": 0.0001832576120020443, + "loss": 1.2798, + "step": 4214 + }, + { + "epoch": 0.750534188034188, + "grad_norm": 0.6444812417030334, + "learning_rate": 0.00018324985783758037, + "loss": 1.3999, + "step": 4215 + }, + { + "epoch": 0.7507122507122507, + "grad_norm": 0.6178330779075623, + "learning_rate": 0.0001832421020420084, + "loss": 1.1846, + "step": 4216 + }, + { + "epoch": 0.7508903133903134, + "grad_norm": 0.509969174861908, + "learning_rate": 0.00018323434461548036, + "loss": 1.1831, + "step": 4217 + 
}, + { + "epoch": 0.7510683760683761, + "grad_norm": 0.5558911561965942, + "learning_rate": 0.00018322658555814826, + "loss": 1.1599, + "step": 4218 + }, + { + "epoch": 0.7512464387464387, + "grad_norm": 0.5714917778968811, + "learning_rate": 0.0001832188248701641, + "loss": 0.9702, + "step": 4219 + }, + { + "epoch": 0.7514245014245015, + "grad_norm": 0.6136442422866821, + "learning_rate": 0.00018321106255167995, + "loss": 0.9376, + "step": 4220 + }, + { + "epoch": 0.7516025641025641, + "grad_norm": 0.5832077264785767, + "learning_rate": 0.00018320329860284785, + "loss": 1.2564, + "step": 4221 + }, + { + "epoch": 0.7517806267806267, + "grad_norm": 0.45330923795700073, + "learning_rate": 0.00018319553302381997, + "loss": 0.9321, + "step": 4222 + }, + { + "epoch": 0.7519586894586895, + "grad_norm": 0.5278468132019043, + "learning_rate": 0.00018318776581474847, + "loss": 1.1334, + "step": 4223 + }, + { + "epoch": 0.7521367521367521, + "grad_norm": 0.49267473816871643, + "learning_rate": 0.00018317999697578549, + "loss": 1.1577, + "step": 4224 + }, + { + "epoch": 0.7523148148148148, + "grad_norm": 0.5372124314308167, + "learning_rate": 0.00018317222650708325, + "loss": 1.037, + "step": 4225 + }, + { + "epoch": 0.7524928774928775, + "grad_norm": 0.5879829525947571, + "learning_rate": 0.000183164454408794, + "loss": 1.1312, + "step": 4226 + }, + { + "epoch": 0.7526709401709402, + "grad_norm": 0.5363932251930237, + "learning_rate": 0.00018315668068107004, + "loss": 1.174, + "step": 4227 + }, + { + "epoch": 0.7528490028490028, + "grad_norm": 0.5585991740226746, + "learning_rate": 0.00018314890532406366, + "loss": 1.2106, + "step": 4228 + }, + { + "epoch": 0.7530270655270656, + "grad_norm": 0.49395787715911865, + "learning_rate": 0.0001831411283379272, + "loss": 1.1163, + "step": 4229 + }, + { + "epoch": 0.7532051282051282, + "grad_norm": 0.5081066489219666, + "learning_rate": 0.00018313334972281306, + "loss": 1.184, + "step": 4230 + }, + { + "epoch": 0.7533831908831908, + 
"grad_norm": 0.40304034948349, + "learning_rate": 0.0001831255694788736, + "loss": 0.7548, + "step": 4231 + }, + { + "epoch": 0.7535612535612536, + "grad_norm": 0.4999815821647644, + "learning_rate": 0.0001831177876062613, + "loss": 1.0092, + "step": 4232 + }, + { + "epoch": 0.7537393162393162, + "grad_norm": 0.48917025327682495, + "learning_rate": 0.00018311000410512862, + "loss": 1.0354, + "step": 4233 + }, + { + "epoch": 0.7539173789173789, + "grad_norm": 0.475606769323349, + "learning_rate": 0.00018310221897562806, + "loss": 0.8728, + "step": 4234 + }, + { + "epoch": 0.7540954415954416, + "grad_norm": 0.630439817905426, + "learning_rate": 0.00018309443221791214, + "loss": 1.1436, + "step": 4235 + }, + { + "epoch": 0.7542735042735043, + "grad_norm": 0.524740993976593, + "learning_rate": 0.00018308664383213344, + "loss": 1.0487, + "step": 4236 + }, + { + "epoch": 0.7544515669515669, + "grad_norm": 0.4734523892402649, + "learning_rate": 0.0001830788538184445, + "loss": 1.0681, + "step": 4237 + }, + { + "epoch": 0.7546296296296297, + "grad_norm": 0.5767266750335693, + "learning_rate": 0.00018307106217699807, + "loss": 1.0599, + "step": 4238 + }, + { + "epoch": 0.7548076923076923, + "grad_norm": 0.6276642084121704, + "learning_rate": 0.0001830632689079467, + "loss": 1.2837, + "step": 4239 + }, + { + "epoch": 0.7549857549857549, + "grad_norm": 0.5539988279342651, + "learning_rate": 0.00018305547401144316, + "loss": 0.9072, + "step": 4240 + }, + { + "epoch": 0.7551638176638177, + "grad_norm": 0.4551292061805725, + "learning_rate": 0.00018304767748764014, + "loss": 1.0204, + "step": 4241 + }, + { + "epoch": 0.7553418803418803, + "grad_norm": 0.47344550490379333, + "learning_rate": 0.00018303987933669034, + "loss": 1.0473, + "step": 4242 + }, + { + "epoch": 0.7555199430199431, + "grad_norm": 0.6050213575363159, + "learning_rate": 0.00018303207955874665, + "loss": 1.1552, + "step": 4243 + }, + { + "epoch": 0.7556980056980057, + "grad_norm": 0.48943889141082764, + 
"learning_rate": 0.00018302427815396186, + "loss": 1.0002, + "step": 4244 + }, + { + "epoch": 0.7558760683760684, + "grad_norm": 0.5664682984352112, + "learning_rate": 0.00018301647512248878, + "loss": 1.1865, + "step": 4245 + }, + { + "epoch": 0.7560541310541311, + "grad_norm": 0.5702242255210876, + "learning_rate": 0.00018300867046448034, + "loss": 1.3029, + "step": 4246 + }, + { + "epoch": 0.7562321937321937, + "grad_norm": 0.593207836151123, + "learning_rate": 0.00018300086418008942, + "loss": 1.109, + "step": 4247 + }, + { + "epoch": 0.7564102564102564, + "grad_norm": 0.5887887477874756, + "learning_rate": 0.000182993056269469, + "loss": 1.3022, + "step": 4248 + }, + { + "epoch": 0.7565883190883191, + "grad_norm": 0.5277966260910034, + "learning_rate": 0.00018298524673277203, + "loss": 1.1738, + "step": 4249 + }, + { + "epoch": 0.7567663817663818, + "grad_norm": 0.589347779750824, + "learning_rate": 0.00018297743557015155, + "loss": 1.0185, + "step": 4250 + }, + { + "epoch": 0.7569444444444444, + "grad_norm": 0.49920859932899475, + "learning_rate": 0.0001829696227817606, + "loss": 1.118, + "step": 4251 + }, + { + "epoch": 0.7571225071225072, + "grad_norm": 0.502565324306488, + "learning_rate": 0.0001829618083677522, + "loss": 1.1856, + "step": 4252 + }, + { + "epoch": 0.7573005698005698, + "grad_norm": 0.49814435839653015, + "learning_rate": 0.00018295399232827955, + "loss": 1.0432, + "step": 4253 + }, + { + "epoch": 0.7574786324786325, + "grad_norm": 0.5087502598762512, + "learning_rate": 0.00018294617466349574, + "loss": 1.2325, + "step": 4254 + }, + { + "epoch": 0.7576566951566952, + "grad_norm": 0.5107288956642151, + "learning_rate": 0.00018293835537355394, + "loss": 1.0487, + "step": 4255 + }, + { + "epoch": 0.7578347578347578, + "grad_norm": 0.524725615978241, + "learning_rate": 0.00018293053445860732, + "loss": 1.1821, + "step": 4256 + }, + { + "epoch": 0.7580128205128205, + "grad_norm": 0.5234082937240601, + "learning_rate": 0.0001829227119188092, + 
"loss": 0.8896, + "step": 4257 + }, + { + "epoch": 0.7581908831908832, + "grad_norm": 0.5102918744087219, + "learning_rate": 0.00018291488775431275, + "loss": 1.0246, + "step": 4258 + }, + { + "epoch": 0.7583689458689459, + "grad_norm": 0.5552714467048645, + "learning_rate": 0.00018290706196527135, + "loss": 1.0193, + "step": 4259 + }, + { + "epoch": 0.7585470085470085, + "grad_norm": 0.5395022630691528, + "learning_rate": 0.00018289923455183825, + "loss": 1.3203, + "step": 4260 + }, + { + "epoch": 0.7587250712250713, + "grad_norm": 0.7474865913391113, + "learning_rate": 0.00018289140551416692, + "loss": 1.182, + "step": 4261 + }, + { + "epoch": 0.7589031339031339, + "grad_norm": 0.4892016649246216, + "learning_rate": 0.00018288357485241066, + "loss": 0.968, + "step": 4262 + }, + { + "epoch": 0.7590811965811965, + "grad_norm": 0.4627816081047058, + "learning_rate": 0.00018287574256672291, + "loss": 0.6895, + "step": 4263 + }, + { + "epoch": 0.7592592592592593, + "grad_norm": 0.6221280097961426, + "learning_rate": 0.00018286790865725715, + "loss": 0.9691, + "step": 4264 + }, + { + "epoch": 0.7594373219373219, + "grad_norm": 0.5542295575141907, + "learning_rate": 0.0001828600731241669, + "loss": 0.9996, + "step": 4265 + }, + { + "epoch": 0.7596153846153846, + "grad_norm": 0.5570770502090454, + "learning_rate": 0.00018285223596760562, + "loss": 1.1996, + "step": 4266 + }, + { + "epoch": 0.7597934472934473, + "grad_norm": 0.5495262742042542, + "learning_rate": 0.00018284439718772687, + "loss": 1.1572, + "step": 4267 + }, + { + "epoch": 0.75997150997151, + "grad_norm": 0.5006741881370544, + "learning_rate": 0.00018283655678468427, + "loss": 1.1215, + "step": 4268 + }, + { + "epoch": 0.7601495726495726, + "grad_norm": 0.4682157635688782, + "learning_rate": 0.00018282871475863144, + "loss": 1.0547, + "step": 4269 + }, + { + "epoch": 0.7603276353276354, + "grad_norm": 0.6275840997695923, + "learning_rate": 0.00018282087110972197, + "loss": 1.3855, + "step": 4270 + }, + { + 
"epoch": 0.760505698005698, + "grad_norm": 0.5341474413871765, + "learning_rate": 0.0001828130258381096, + "loss": 1.2024, + "step": 4271 + }, + { + "epoch": 0.7606837606837606, + "grad_norm": 0.4330833852291107, + "learning_rate": 0.000182805178943948, + "loss": 1.0508, + "step": 4272 + }, + { + "epoch": 0.7608618233618234, + "grad_norm": 0.6276537179946899, + "learning_rate": 0.00018279733042739094, + "loss": 1.1635, + "step": 4273 + }, + { + "epoch": 0.761039886039886, + "grad_norm": 0.5370199084281921, + "learning_rate": 0.00018278948028859217, + "loss": 1.0579, + "step": 4274 + }, + { + "epoch": 0.7612179487179487, + "grad_norm": 0.524959921836853, + "learning_rate": 0.00018278162852770552, + "loss": 1.0972, + "step": 4275 + }, + { + "epoch": 0.7613960113960114, + "grad_norm": 0.5029389262199402, + "learning_rate": 0.00018277377514488486, + "loss": 0.959, + "step": 4276 + }, + { + "epoch": 0.7615740740740741, + "grad_norm": 0.49772894382476807, + "learning_rate": 0.00018276592014028397, + "loss": 1.2773, + "step": 4277 + }, + { + "epoch": 0.7617521367521367, + "grad_norm": 0.5195719003677368, + "learning_rate": 0.00018275806351405685, + "loss": 1.0676, + "step": 4278 + }, + { + "epoch": 0.7619301994301995, + "grad_norm": 0.5167942643165588, + "learning_rate": 0.00018275020526635735, + "loss": 1.0615, + "step": 4279 + }, + { + "epoch": 0.7621082621082621, + "grad_norm": 0.4958035945892334, + "learning_rate": 0.0001827423453973395, + "loss": 0.9605, + "step": 4280 + }, + { + "epoch": 0.7622863247863247, + "grad_norm": 0.6256808042526245, + "learning_rate": 0.00018273448390715728, + "loss": 1.2526, + "step": 4281 + }, + { + "epoch": 0.7624643874643875, + "grad_norm": 0.5062580108642578, + "learning_rate": 0.0001827266207959647, + "loss": 1.0604, + "step": 4282 + }, + { + "epoch": 0.7626424501424501, + "grad_norm": 0.5080778002738953, + "learning_rate": 0.00018271875606391583, + "loss": 1.1246, + "step": 4283 + }, + { + "epoch": 0.7628205128205128, + "grad_norm": 
0.5069389939308167, + "learning_rate": 0.00018271088971116479, + "loss": 1.3158, + "step": 4284 + }, + { + "epoch": 0.7629985754985755, + "grad_norm": 0.7280121445655823, + "learning_rate": 0.00018270302173786567, + "loss": 1.2066, + "step": 4285 + }, + { + "epoch": 0.7631766381766382, + "grad_norm": 0.6523470282554626, + "learning_rate": 0.00018269515214417267, + "loss": 1.3236, + "step": 4286 + }, + { + "epoch": 0.7633547008547008, + "grad_norm": 0.5799322724342346, + "learning_rate": 0.00018268728093023988, + "loss": 0.9786, + "step": 4287 + }, + { + "epoch": 0.7635327635327636, + "grad_norm": 0.46675166487693787, + "learning_rate": 0.00018267940809622163, + "loss": 0.8131, + "step": 4288 + }, + { + "epoch": 0.7637108262108262, + "grad_norm": 0.5566182732582092, + "learning_rate": 0.00018267153364227214, + "loss": 1.0565, + "step": 4289 + }, + { + "epoch": 0.7638888888888888, + "grad_norm": 0.532028079032898, + "learning_rate": 0.00018266365756854566, + "loss": 0.952, + "step": 4290 + }, + { + "epoch": 0.7640669515669516, + "grad_norm": 0.5082666873931885, + "learning_rate": 0.00018265577987519653, + "loss": 1.0704, + "step": 4291 + }, + { + "epoch": 0.7642450142450142, + "grad_norm": 0.5223562717437744, + "learning_rate": 0.00018264790056237912, + "loss": 1.1161, + "step": 4292 + }, + { + "epoch": 0.7644230769230769, + "grad_norm": 0.48472318053245544, + "learning_rate": 0.00018264001963024778, + "loss": 0.8784, + "step": 4293 + }, + { + "epoch": 0.7646011396011396, + "grad_norm": 0.5901281833648682, + "learning_rate": 0.0001826321370789569, + "loss": 1.1031, + "step": 4294 + }, + { + "epoch": 0.7647792022792023, + "grad_norm": 0.570350706577301, + "learning_rate": 0.000182624252908661, + "loss": 0.9047, + "step": 4295 + }, + { + "epoch": 0.7649572649572649, + "grad_norm": 0.568373441696167, + "learning_rate": 0.00018261636711951445, + "loss": 1.0106, + "step": 4296 + }, + { + "epoch": 0.7651353276353277, + "grad_norm": 0.6175880432128906, + "learning_rate": 
0.00018260847971167182, + "loss": 1.3531, + "step": 4297 + }, + { + "epoch": 0.7653133903133903, + "grad_norm": 0.5682594776153564, + "learning_rate": 0.00018260059068528762, + "loss": 1.1261, + "step": 4298 + }, + { + "epoch": 0.7654914529914529, + "grad_norm": 0.5050225257873535, + "learning_rate": 0.00018259270004051644, + "loss": 1.0921, + "step": 4299 + }, + { + "epoch": 0.7656695156695157, + "grad_norm": 0.5416565537452698, + "learning_rate": 0.0001825848077775129, + "loss": 1.0881, + "step": 4300 + }, + { + "epoch": 0.7658475783475783, + "grad_norm": 0.5418867468833923, + "learning_rate": 0.0001825769138964316, + "loss": 1.2069, + "step": 4301 + }, + { + "epoch": 0.7660256410256411, + "grad_norm": 0.5447866320610046, + "learning_rate": 0.00018256901839742718, + "loss": 1.1827, + "step": 4302 + }, + { + "epoch": 0.7662037037037037, + "grad_norm": 0.5482802987098694, + "learning_rate": 0.00018256112128065439, + "loss": 1.0492, + "step": 4303 + }, + { + "epoch": 0.7663817663817664, + "grad_norm": 0.5059601664543152, + "learning_rate": 0.0001825532225462679, + "loss": 1.0996, + "step": 4304 + }, + { + "epoch": 0.7665598290598291, + "grad_norm": 0.5153701901435852, + "learning_rate": 0.00018254532219442258, + "loss": 1.3237, + "step": 4305 + }, + { + "epoch": 0.7667378917378918, + "grad_norm": 0.5370768904685974, + "learning_rate": 0.0001825374202252731, + "loss": 0.9925, + "step": 4306 + }, + { + "epoch": 0.7669159544159544, + "grad_norm": 0.4516580402851105, + "learning_rate": 0.00018252951663897432, + "loss": 1.0749, + "step": 4307 + }, + { + "epoch": 0.7670940170940171, + "grad_norm": 0.5565171837806702, + "learning_rate": 0.0001825216114356811, + "loss": 1.1617, + "step": 4308 + }, + { + "epoch": 0.7672720797720798, + "grad_norm": 0.5212662220001221, + "learning_rate": 0.00018251370461554834, + "loss": 1.1108, + "step": 4309 + }, + { + "epoch": 0.7674501424501424, + "grad_norm": 0.49061715602874756, + "learning_rate": 0.00018250579617873095, + "loss": 
1.0881, + "step": 4310 + }, + { + "epoch": 0.7676282051282052, + "grad_norm": 0.5535751581192017, + "learning_rate": 0.00018249788612538387, + "loss": 0.9341, + "step": 4311 + }, + { + "epoch": 0.7678062678062678, + "grad_norm": 0.5425209403038025, + "learning_rate": 0.00018248997445566208, + "loss": 1.1858, + "step": 4312 + }, + { + "epoch": 0.7679843304843305, + "grad_norm": 0.6224395036697388, + "learning_rate": 0.0001824820611697206, + "loss": 1.0836, + "step": 4313 + }, + { + "epoch": 0.7681623931623932, + "grad_norm": 0.4895690977573395, + "learning_rate": 0.00018247414626771445, + "loss": 0.8598, + "step": 4314 + }, + { + "epoch": 0.7683404558404558, + "grad_norm": 0.5279615521430969, + "learning_rate": 0.00018246622974979877, + "loss": 1.1742, + "step": 4315 + }, + { + "epoch": 0.7685185185185185, + "grad_norm": 0.45300471782684326, + "learning_rate": 0.0001824583116161286, + "loss": 0.8872, + "step": 4316 + }, + { + "epoch": 0.7686965811965812, + "grad_norm": 0.6499692797660828, + "learning_rate": 0.00018245039186685916, + "loss": 1.2495, + "step": 4317 + }, + { + "epoch": 0.7688746438746439, + "grad_norm": 0.48151278495788574, + "learning_rate": 0.00018244247050214552, + "loss": 1.2382, + "step": 4318 + }, + { + "epoch": 0.7690527065527065, + "grad_norm": 0.6597028374671936, + "learning_rate": 0.0001824345475221429, + "loss": 1.3453, + "step": 4319 + }, + { + "epoch": 0.7692307692307693, + "grad_norm": 0.4536992609500885, + "learning_rate": 0.0001824266229270066, + "loss": 1.1141, + "step": 4320 + }, + { + "epoch": 0.7694088319088319, + "grad_norm": 0.5489405393600464, + "learning_rate": 0.00018241869671689184, + "loss": 1.0333, + "step": 4321 + }, + { + "epoch": 0.7695868945868946, + "grad_norm": 0.5741586089134216, + "learning_rate": 0.00018241076889195394, + "loss": 0.9939, + "step": 4322 + }, + { + "epoch": 0.7697649572649573, + "grad_norm": 0.47170960903167725, + "learning_rate": 0.00018240283945234823, + "loss": 0.9878, + "step": 4323 + }, + { + 
"epoch": 0.76994301994302, + "grad_norm": 0.4729093313217163, + "learning_rate": 0.00018239490839823004, + "loss": 1.0087, + "step": 4324 + }, + { + "epoch": 0.7701210826210826, + "grad_norm": 0.49869823455810547, + "learning_rate": 0.0001823869757297548, + "loss": 1.169, + "step": 4325 + }, + { + "epoch": 0.7702991452991453, + "grad_norm": 0.5118468403816223, + "learning_rate": 0.0001823790414470779, + "loss": 1.1092, + "step": 4326 + }, + { + "epoch": 0.770477207977208, + "grad_norm": 0.5076048970222473, + "learning_rate": 0.0001823711055503548, + "loss": 1.1028, + "step": 4327 + }, + { + "epoch": 0.7706552706552706, + "grad_norm": 0.5661569237709045, + "learning_rate": 0.00018236316803974098, + "loss": 1.1114, + "step": 4328 + }, + { + "epoch": 0.7708333333333334, + "grad_norm": 0.5542354583740234, + "learning_rate": 0.000182355228915392, + "loss": 1.0931, + "step": 4329 + }, + { + "epoch": 0.771011396011396, + "grad_norm": 0.5476680994033813, + "learning_rate": 0.0001823472881774634, + "loss": 1.036, + "step": 4330 + }, + { + "epoch": 0.7711894586894587, + "grad_norm": 0.5449798703193665, + "learning_rate": 0.00018233934582611073, + "loss": 1.0682, + "step": 4331 + }, + { + "epoch": 0.7713675213675214, + "grad_norm": 0.61089026927948, + "learning_rate": 0.00018233140186148963, + "loss": 1.0748, + "step": 4332 + }, + { + "epoch": 0.771545584045584, + "grad_norm": 0.5015206336975098, + "learning_rate": 0.00018232345628375576, + "loss": 1.2032, + "step": 4333 + }, + { + "epoch": 0.7717236467236467, + "grad_norm": 0.579289972782135, + "learning_rate": 0.00018231550909306475, + "loss": 1.0764, + "step": 4334 + }, + { + "epoch": 0.7719017094017094, + "grad_norm": 0.5889299511909485, + "learning_rate": 0.00018230756028957235, + "loss": 1.1768, + "step": 4335 + }, + { + "epoch": 0.7720797720797721, + "grad_norm": 0.5328249335289001, + "learning_rate": 0.00018229960987343428, + "loss": 1.0055, + "step": 4336 + }, + { + "epoch": 0.7722578347578347, + "grad_norm": 
0.5766382217407227, + "learning_rate": 0.0001822916578448063, + "loss": 0.9923, + "step": 4337 + }, + { + "epoch": 0.7724358974358975, + "grad_norm": 0.6448187828063965, + "learning_rate": 0.00018228370420384423, + "loss": 1.1135, + "step": 4338 + }, + { + "epoch": 0.7726139601139601, + "grad_norm": 0.5505210757255554, + "learning_rate": 0.00018227574895070394, + "loss": 1.2048, + "step": 4339 + }, + { + "epoch": 0.7727920227920227, + "grad_norm": 0.6278925538063049, + "learning_rate": 0.00018226779208554126, + "loss": 1.1045, + "step": 4340 + }, + { + "epoch": 0.7729700854700855, + "grad_norm": 0.5345009565353394, + "learning_rate": 0.00018225983360851207, + "loss": 1.0102, + "step": 4341 + }, + { + "epoch": 0.7731481481481481, + "grad_norm": 0.566633403301239, + "learning_rate": 0.00018225187351977233, + "loss": 1.0038, + "step": 4342 + }, + { + "epoch": 0.7733262108262108, + "grad_norm": 0.5066078901290894, + "learning_rate": 0.000182243911819478, + "loss": 1.0339, + "step": 4343 + }, + { + "epoch": 0.7735042735042735, + "grad_norm": 0.5614920258522034, + "learning_rate": 0.00018223594850778503, + "loss": 1.1021, + "step": 4344 + }, + { + "epoch": 0.7736823361823362, + "grad_norm": 0.7747337818145752, + "learning_rate": 0.0001822279835848495, + "loss": 1.1129, + "step": 4345 + }, + { + "epoch": 0.7738603988603988, + "grad_norm": 0.7066529989242554, + "learning_rate": 0.00018222001705082744, + "loss": 1.3234, + "step": 4346 + }, + { + "epoch": 0.7740384615384616, + "grad_norm": 0.6340884566307068, + "learning_rate": 0.00018221204890587497, + "loss": 1.0726, + "step": 4347 + }, + { + "epoch": 0.7742165242165242, + "grad_norm": 0.5401145815849304, + "learning_rate": 0.00018220407915014818, + "loss": 0.9904, + "step": 4348 + }, + { + "epoch": 0.7743945868945868, + "grad_norm": 0.5069159269332886, + "learning_rate": 0.00018219610778380315, + "loss": 1.0654, + "step": 4349 + }, + { + "epoch": 0.7745726495726496, + "grad_norm": 0.5422839522361755, + "learning_rate": 
0.00018218813480699623, + "loss": 1.1741, + "step": 4350 + }, + { + "epoch": 0.7747507122507122, + "grad_norm": 0.5550300478935242, + "learning_rate": 0.0001821801602198835, + "loss": 1.0033, + "step": 4351 + }, + { + "epoch": 0.7749287749287749, + "grad_norm": 0.5987736582756042, + "learning_rate": 0.00018217218402262123, + "loss": 0.935, + "step": 4352 + }, + { + "epoch": 0.7751068376068376, + "grad_norm": 0.6137008666992188, + "learning_rate": 0.00018216420621536573, + "loss": 1.17, + "step": 4353 + }, + { + "epoch": 0.7752849002849003, + "grad_norm": 0.47124359011650085, + "learning_rate": 0.0001821562267982733, + "loss": 0.8316, + "step": 4354 + }, + { + "epoch": 0.7754629629629629, + "grad_norm": 0.5057868361473083, + "learning_rate": 0.00018214824577150024, + "loss": 1.0246, + "step": 4355 + }, + { + "epoch": 0.7756410256410257, + "grad_norm": 0.604055643081665, + "learning_rate": 0.00018214026313520299, + "loss": 1.1272, + "step": 4356 + }, + { + "epoch": 0.7758190883190883, + "grad_norm": 0.6690384149551392, + "learning_rate": 0.0001821322788895379, + "loss": 1.0464, + "step": 4357 + }, + { + "epoch": 0.7759971509971509, + "grad_norm": 0.5458958745002747, + "learning_rate": 0.0001821242930346614, + "loss": 1.1712, + "step": 4358 + }, + { + "epoch": 0.7761752136752137, + "grad_norm": 0.6448663473129272, + "learning_rate": 0.00018211630557073, + "loss": 1.1125, + "step": 4359 + }, + { + "epoch": 0.7763532763532763, + "grad_norm": 0.49889448285102844, + "learning_rate": 0.00018210831649790018, + "loss": 1.097, + "step": 4360 + }, + { + "epoch": 0.7765313390313391, + "grad_norm": 0.5118046998977661, + "learning_rate": 0.00018210032581632843, + "loss": 1.009, + "step": 4361 + }, + { + "epoch": 0.7767094017094017, + "grad_norm": 0.5450068116188049, + "learning_rate": 0.00018209233352617135, + "loss": 1.1138, + "step": 4362 + }, + { + "epoch": 0.7768874643874644, + "grad_norm": 0.6147481203079224, + "learning_rate": 0.00018208433962758558, + "loss": 1.212, + 
"step": 4363 + }, + { + "epoch": 0.7770655270655271, + "grad_norm": 0.554176926612854, + "learning_rate": 0.00018207634412072764, + "loss": 1.1271, + "step": 4364 + }, + { + "epoch": 0.7772435897435898, + "grad_norm": 0.5872851014137268, + "learning_rate": 0.00018206834700575426, + "loss": 1.2793, + "step": 4365 + }, + { + "epoch": 0.7774216524216524, + "grad_norm": 0.5135685205459595, + "learning_rate": 0.00018206034828282207, + "loss": 0.9642, + "step": 4366 + }, + { + "epoch": 0.7775997150997151, + "grad_norm": 0.5699490308761597, + "learning_rate": 0.00018205234795208786, + "loss": 0.9086, + "step": 4367 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 0.5908057689666748, + "learning_rate": 0.00018204434601370832, + "loss": 1.1973, + "step": 4368 + }, + { + "epoch": 0.7779558404558404, + "grad_norm": 0.5777581334114075, + "learning_rate": 0.00018203634246784025, + "loss": 1.0447, + "step": 4369 + }, + { + "epoch": 0.7781339031339032, + "grad_norm": 0.4822927713394165, + "learning_rate": 0.00018202833731464048, + "loss": 0.814, + "step": 4370 + }, + { + "epoch": 0.7783119658119658, + "grad_norm": 0.5343610644340515, + "learning_rate": 0.0001820203305542658, + "loss": 1.2785, + "step": 4371 + }, + { + "epoch": 0.7784900284900285, + "grad_norm": 0.5462222695350647, + "learning_rate": 0.00018201232218687316, + "loss": 1.1785, + "step": 4372 + }, + { + "epoch": 0.7786680911680912, + "grad_norm": 0.5177609324455261, + "learning_rate": 0.00018200431221261943, + "loss": 1.111, + "step": 4373 + }, + { + "epoch": 0.7788461538461539, + "grad_norm": 0.5324625968933105, + "learning_rate": 0.00018199630063166157, + "loss": 1.0738, + "step": 4374 + }, + { + "epoch": 0.7790242165242165, + "grad_norm": 0.6392876505851746, + "learning_rate": 0.0001819882874441565, + "loss": 1.1758, + "step": 4375 + }, + { + "epoch": 0.7792022792022792, + "grad_norm": 0.49964696168899536, + "learning_rate": 0.00018198027265026127, + "loss": 1.0556, + "step": 4376 + }, + { + "epoch": 
0.7793803418803419, + "grad_norm": 0.6090660691261292, + "learning_rate": 0.00018197225625013287, + "loss": 1.0102, + "step": 4377 + }, + { + "epoch": 0.7795584045584045, + "grad_norm": 0.5242345929145813, + "learning_rate": 0.00018196423824392842, + "loss": 0.8335, + "step": 4378 + }, + { + "epoch": 0.7797364672364673, + "grad_norm": 0.5265036225318909, + "learning_rate": 0.00018195621863180498, + "loss": 1.0781, + "step": 4379 + }, + { + "epoch": 0.7799145299145299, + "grad_norm": 0.5115378499031067, + "learning_rate": 0.0001819481974139197, + "loss": 1.1658, + "step": 4380 + }, + { + "epoch": 0.7800925925925926, + "grad_norm": 0.6489549875259399, + "learning_rate": 0.00018194017459042972, + "loss": 1.0572, + "step": 4381 + }, + { + "epoch": 0.7802706552706553, + "grad_norm": 0.5800202488899231, + "learning_rate": 0.0001819321501614922, + "loss": 0.9593, + "step": 4382 + }, + { + "epoch": 0.780448717948718, + "grad_norm": 0.5608528256416321, + "learning_rate": 0.00018192412412726443, + "loss": 1.0324, + "step": 4383 + }, + { + "epoch": 0.7806267806267806, + "grad_norm": 0.5596401691436768, + "learning_rate": 0.00018191609648790362, + "loss": 1.071, + "step": 4384 + }, + { + "epoch": 0.7808048433048433, + "grad_norm": 0.5712903141975403, + "learning_rate": 0.00018190806724356707, + "loss": 0.9011, + "step": 4385 + }, + { + "epoch": 0.780982905982906, + "grad_norm": 0.5079438090324402, + "learning_rate": 0.0001819000363944121, + "loss": 1.1194, + "step": 4386 + }, + { + "epoch": 0.7811609686609686, + "grad_norm": 0.5785079598426819, + "learning_rate": 0.00018189200394059602, + "loss": 1.1703, + "step": 4387 + }, + { + "epoch": 0.7813390313390314, + "grad_norm": 0.6901816129684448, + "learning_rate": 0.00018188396988227625, + "loss": 1.6689, + "step": 4388 + }, + { + "epoch": 0.781517094017094, + "grad_norm": 0.48107922077178955, + "learning_rate": 0.00018187593421961022, + "loss": 1.0116, + "step": 4389 + }, + { + "epoch": 0.7816951566951567, + "grad_norm": 
0.5843084454536438, + "learning_rate": 0.0001818678969527553, + "loss": 1.1172, + "step": 4390 + }, + { + "epoch": 0.7818732193732194, + "grad_norm": 0.479034423828125, + "learning_rate": 0.00018185985808186902, + "loss": 0.811, + "step": 4391 + }, + { + "epoch": 0.782051282051282, + "grad_norm": 0.5864158272743225, + "learning_rate": 0.00018185181760710888, + "loss": 0.9522, + "step": 4392 + }, + { + "epoch": 0.7822293447293447, + "grad_norm": 0.4824625551700592, + "learning_rate": 0.00018184377552863242, + "loss": 0.9039, + "step": 4393 + }, + { + "epoch": 0.7824074074074074, + "grad_norm": 0.580102801322937, + "learning_rate": 0.00018183573184659717, + "loss": 1.2382, + "step": 4394 + }, + { + "epoch": 0.7825854700854701, + "grad_norm": 0.5300056338310242, + "learning_rate": 0.00018182768656116073, + "loss": 1.2268, + "step": 4395 + }, + { + "epoch": 0.7827635327635327, + "grad_norm": 0.5548123121261597, + "learning_rate": 0.00018181963967248078, + "loss": 1.0628, + "step": 4396 + }, + { + "epoch": 0.7829415954415955, + "grad_norm": 0.5485070943832397, + "learning_rate": 0.00018181159118071496, + "loss": 0.9628, + "step": 4397 + }, + { + "epoch": 0.7831196581196581, + "grad_norm": 0.47405415773391724, + "learning_rate": 0.00018180354108602095, + "loss": 1.1413, + "step": 4398 + }, + { + "epoch": 0.7832977207977208, + "grad_norm": 0.5545752644538879, + "learning_rate": 0.0001817954893885565, + "loss": 1.3807, + "step": 4399 + }, + { + "epoch": 0.7834757834757835, + "grad_norm": 0.5339497327804565, + "learning_rate": 0.00018178743608847933, + "loss": 0.9978, + "step": 4400 + }, + { + "epoch": 0.7836538461538461, + "grad_norm": 0.5006352663040161, + "learning_rate": 0.00018177938118594725, + "loss": 0.8873, + "step": 4401 + }, + { + "epoch": 0.7838319088319088, + "grad_norm": 0.4845179319381714, + "learning_rate": 0.00018177132468111812, + "loss": 0.8866, + "step": 4402 + }, + { + "epoch": 0.7840099715099715, + "grad_norm": 0.5240967869758606, + "learning_rate": 
0.0001817632665741497, + "loss": 1.0347, + "step": 4403 + }, + { + "epoch": 0.7841880341880342, + "grad_norm": 0.5311884880065918, + "learning_rate": 0.00018175520686519993, + "loss": 1.2065, + "step": 4404 + }, + { + "epoch": 0.7843660968660968, + "grad_norm": 0.5562815070152283, + "learning_rate": 0.00018174714555442673, + "loss": 1.1272, + "step": 4405 + }, + { + "epoch": 0.7845441595441596, + "grad_norm": 0.5524366497993469, + "learning_rate": 0.00018173908264198802, + "loss": 1.2337, + "step": 4406 + }, + { + "epoch": 0.7847222222222222, + "grad_norm": 0.5612216591835022, + "learning_rate": 0.0001817310181280418, + "loss": 1.1809, + "step": 4407 + }, + { + "epoch": 0.7849002849002849, + "grad_norm": 0.5315343737602234, + "learning_rate": 0.000181722952012746, + "loss": 1.0491, + "step": 4408 + }, + { + "epoch": 0.7850783475783476, + "grad_norm": 0.5233435034751892, + "learning_rate": 0.00018171488429625878, + "loss": 1.0457, + "step": 4409 + }, + { + "epoch": 0.7852564102564102, + "grad_norm": 0.7809093594551086, + "learning_rate": 0.00018170681497873813, + "loss": 1.1578, + "step": 4410 + }, + { + "epoch": 0.7854344729344729, + "grad_norm": 0.49659839272499084, + "learning_rate": 0.00018169874406034217, + "loss": 1.0815, + "step": 4411 + }, + { + "epoch": 0.7856125356125356, + "grad_norm": 0.5020765066146851, + "learning_rate": 0.00018169067154122904, + "loss": 1.1985, + "step": 4412 + }, + { + "epoch": 0.7857905982905983, + "grad_norm": 0.6408432126045227, + "learning_rate": 0.0001816825974215569, + "loss": 1.2272, + "step": 4413 + }, + { + "epoch": 0.7859686609686609, + "grad_norm": 0.5062605142593384, + "learning_rate": 0.00018167452170148396, + "loss": 0.9663, + "step": 4414 + }, + { + "epoch": 0.7861467236467237, + "grad_norm": 0.5100119113922119, + "learning_rate": 0.0001816664443811684, + "loss": 1.0256, + "step": 4415 + }, + { + "epoch": 0.7863247863247863, + "grad_norm": 0.5277643799781799, + "learning_rate": 0.00018165836546076854, + "loss": 1.2885, 
+ "step": 4416 + }, + { + "epoch": 0.7865028490028491, + "grad_norm": 0.5568150281906128, + "learning_rate": 0.0001816502849404426, + "loss": 1.2673, + "step": 4417 + }, + { + "epoch": 0.7866809116809117, + "grad_norm": 0.5061392188072205, + "learning_rate": 0.00018164220282034896, + "loss": 1.072, + "step": 4418 + }, + { + "epoch": 0.7868589743589743, + "grad_norm": 0.5383077263832092, + "learning_rate": 0.00018163411910064597, + "loss": 1.0621, + "step": 4419 + }, + { + "epoch": 0.7870370370370371, + "grad_norm": 0.5167948007583618, + "learning_rate": 0.00018162603378149198, + "loss": 1.099, + "step": 4420 + }, + { + "epoch": 0.7872150997150997, + "grad_norm": 0.5084534287452698, + "learning_rate": 0.0001816179468630454, + "loss": 1.3984, + "step": 4421 + }, + { + "epoch": 0.7873931623931624, + "grad_norm": 0.608762264251709, + "learning_rate": 0.00018160985834546475, + "loss": 1.3553, + "step": 4422 + }, + { + "epoch": 0.7875712250712251, + "grad_norm": 0.4900866746902466, + "learning_rate": 0.00018160176822890842, + "loss": 1.0009, + "step": 4423 + }, + { + "epoch": 0.7877492877492878, + "grad_norm": 0.5928917527198792, + "learning_rate": 0.00018159367651353496, + "loss": 1.0523, + "step": 4424 + }, + { + "epoch": 0.7879273504273504, + "grad_norm": 0.624422013759613, + "learning_rate": 0.0001815855831995029, + "loss": 1.0519, + "step": 4425 + }, + { + "epoch": 0.7881054131054132, + "grad_norm": 0.5140150785446167, + "learning_rate": 0.00018157748828697082, + "loss": 1.048, + "step": 4426 + }, + { + "epoch": 0.7882834757834758, + "grad_norm": 0.47006943821907043, + "learning_rate": 0.00018156939177609732, + "loss": 1.0067, + "step": 4427 + }, + { + "epoch": 0.7884615384615384, + "grad_norm": 0.5178864002227783, + "learning_rate": 0.00018156129366704105, + "loss": 1.0583, + "step": 4428 + }, + { + "epoch": 0.7886396011396012, + "grad_norm": 0.5279985666275024, + "learning_rate": 0.00018155319395996066, + "loss": 1.3023, + "step": 4429 + }, + { + "epoch": 
0.7888176638176638, + "grad_norm": 0.5238787531852722, + "learning_rate": 0.00018154509265501482, + "loss": 1.0851, + "step": 4430 + }, + { + "epoch": 0.7889957264957265, + "grad_norm": 0.5914917588233948, + "learning_rate": 0.00018153698975236228, + "loss": 0.9291, + "step": 4431 + }, + { + "epoch": 0.7891737891737892, + "grad_norm": 0.5046082735061646, + "learning_rate": 0.00018152888525216183, + "loss": 0.9951, + "step": 4432 + }, + { + "epoch": 0.7893518518518519, + "grad_norm": 0.5042256116867065, + "learning_rate": 0.00018152077915457225, + "loss": 1.0243, + "step": 4433 + }, + { + "epoch": 0.7895299145299145, + "grad_norm": 0.5950339436531067, + "learning_rate": 0.0001815126714597523, + "loss": 0.9803, + "step": 4434 + }, + { + "epoch": 0.7897079772079773, + "grad_norm": 0.5163764953613281, + "learning_rate": 0.0001815045621678609, + "loss": 1.0353, + "step": 4435 + }, + { + "epoch": 0.7898860398860399, + "grad_norm": 0.5166211128234863, + "learning_rate": 0.00018149645127905691, + "loss": 0.9649, + "step": 4436 + }, + { + "epoch": 0.7900641025641025, + "grad_norm": 0.5239769220352173, + "learning_rate": 0.00018148833879349927, + "loss": 0.9747, + "step": 4437 + }, + { + "epoch": 0.7902421652421653, + "grad_norm": 0.5803237557411194, + "learning_rate": 0.00018148022471134692, + "loss": 1.315, + "step": 4438 + }, + { + "epoch": 0.7904202279202279, + "grad_norm": 0.5141370296478271, + "learning_rate": 0.00018147210903275877, + "loss": 1.0547, + "step": 4439 + }, + { + "epoch": 0.7905982905982906, + "grad_norm": 0.545788586139679, + "learning_rate": 0.00018146399175789394, + "loss": 1.0797, + "step": 4440 + }, + { + "epoch": 0.7907763532763533, + "grad_norm": 0.5273314714431763, + "learning_rate": 0.0001814558728869114, + "loss": 0.7928, + "step": 4441 + }, + { + "epoch": 0.790954415954416, + "grad_norm": 0.4614652693271637, + "learning_rate": 0.00018144775241997024, + "loss": 0.8826, + "step": 4442 + }, + { + "epoch": 0.7911324786324786, + "grad_norm": 
0.6203590631484985, + "learning_rate": 0.00018143963035722958, + "loss": 1.2891, + "step": 4443 + }, + { + "epoch": 0.7913105413105413, + "grad_norm": 0.4870408773422241, + "learning_rate": 0.0001814315066988485, + "loss": 1.0717, + "step": 4444 + }, + { + "epoch": 0.791488603988604, + "grad_norm": 0.6468982696533203, + "learning_rate": 0.00018142338144498625, + "loss": 1.3398, + "step": 4445 + }, + { + "epoch": 0.7916666666666666, + "grad_norm": 0.4727918207645416, + "learning_rate": 0.00018141525459580197, + "loss": 1.0195, + "step": 4446 + }, + { + "epoch": 0.7918447293447294, + "grad_norm": 0.5080479979515076, + "learning_rate": 0.0001814071261514549, + "loss": 1.0163, + "step": 4447 + }, + { + "epoch": 0.792022792022792, + "grad_norm": 0.5380908250808716, + "learning_rate": 0.0001813989961121043, + "loss": 1.1673, + "step": 4448 + }, + { + "epoch": 0.7922008547008547, + "grad_norm": 0.5020384192466736, + "learning_rate": 0.00018139086447790945, + "loss": 0.8591, + "step": 4449 + }, + { + "epoch": 0.7923789173789174, + "grad_norm": 0.5279949903488159, + "learning_rate": 0.0001813827312490297, + "loss": 1.1221, + "step": 4450 + }, + { + "epoch": 0.79255698005698, + "grad_norm": 0.6739233732223511, + "learning_rate": 0.00018137459642562437, + "loss": 1.2704, + "step": 4451 + }, + { + "epoch": 0.7927350427350427, + "grad_norm": 0.5112259984016418, + "learning_rate": 0.00018136646000785288, + "loss": 1.1161, + "step": 4452 + }, + { + "epoch": 0.7929131054131054, + "grad_norm": 0.5244031548500061, + "learning_rate": 0.00018135832199587463, + "loss": 0.7866, + "step": 4453 + }, + { + "epoch": 0.7930911680911681, + "grad_norm": 0.5803347229957581, + "learning_rate": 0.0001813501823898491, + "loss": 0.994, + "step": 4454 + }, + { + "epoch": 0.7932692307692307, + "grad_norm": 0.6191152930259705, + "learning_rate": 0.00018134204118993568, + "loss": 1.0725, + "step": 4455 + }, + { + "epoch": 0.7934472934472935, + "grad_norm": 0.549735963344574, + "learning_rate": 
0.00018133389839629396, + "loss": 0.9915, + "step": 4456 + }, + { + "epoch": 0.7936253561253561, + "grad_norm": 0.4940381646156311, + "learning_rate": 0.00018132575400908347, + "loss": 1.1815, + "step": 4457 + }, + { + "epoch": 0.7938034188034188, + "grad_norm": 0.5009099245071411, + "learning_rate": 0.00018131760802846377, + "loss": 1.0833, + "step": 4458 + }, + { + "epoch": 0.7939814814814815, + "grad_norm": 0.595853865146637, + "learning_rate": 0.00018130946045459445, + "loss": 1.2774, + "step": 4459 + }, + { + "epoch": 0.7941595441595442, + "grad_norm": 0.534794807434082, + "learning_rate": 0.00018130131128763513, + "loss": 1.0891, + "step": 4460 + }, + { + "epoch": 0.7943376068376068, + "grad_norm": 0.5828582048416138, + "learning_rate": 0.00018129316052774557, + "loss": 1.0786, + "step": 4461 + }, + { + "epoch": 0.7945156695156695, + "grad_norm": 0.4750654697418213, + "learning_rate": 0.00018128500817508533, + "loss": 1.0818, + "step": 4462 + }, + { + "epoch": 0.7946937321937322, + "grad_norm": 0.5626576542854309, + "learning_rate": 0.00018127685422981426, + "loss": 1.0807, + "step": 4463 + }, + { + "epoch": 0.7948717948717948, + "grad_norm": 0.6434760093688965, + "learning_rate": 0.00018126869869209203, + "loss": 1.0908, + "step": 4464 + }, + { + "epoch": 0.7950498575498576, + "grad_norm": 0.5577414631843567, + "learning_rate": 0.00018126054156207853, + "loss": 1.0281, + "step": 4465 + }, + { + "epoch": 0.7952279202279202, + "grad_norm": 0.5001249313354492, + "learning_rate": 0.00018125238283993347, + "loss": 0.9083, + "step": 4466 + }, + { + "epoch": 0.7954059829059829, + "grad_norm": 0.5298314690589905, + "learning_rate": 0.00018124422252581676, + "loss": 0.971, + "step": 4467 + }, + { + "epoch": 0.7955840455840456, + "grad_norm": 0.4872737228870392, + "learning_rate": 0.00018123606061988832, + "loss": 1.0515, + "step": 4468 + }, + { + "epoch": 0.7957621082621082, + "grad_norm": 0.5895398259162903, + "learning_rate": 0.00018122789712230798, + "loss": 
1.0771, + "step": 4469 + }, + { + "epoch": 0.7959401709401709, + "grad_norm": 0.5212514996528625, + "learning_rate": 0.00018121973203323577, + "loss": 1.0365, + "step": 4470 + }, + { + "epoch": 0.7961182336182336, + "grad_norm": 0.4679451584815979, + "learning_rate": 0.0001812115653528316, + "loss": 0.9445, + "step": 4471 + }, + { + "epoch": 0.7962962962962963, + "grad_norm": 0.5852653980255127, + "learning_rate": 0.00018120339708125552, + "loss": 1.1781, + "step": 4472 + }, + { + "epoch": 0.7964743589743589, + "grad_norm": 0.6081342697143555, + "learning_rate": 0.00018119522721866756, + "loss": 1.3881, + "step": 4473 + }, + { + "epoch": 0.7966524216524217, + "grad_norm": 0.5254155993461609, + "learning_rate": 0.00018118705576522777, + "loss": 1.2198, + "step": 4474 + }, + { + "epoch": 0.7968304843304843, + "grad_norm": 0.5959419012069702, + "learning_rate": 0.00018117888272109632, + "loss": 1.0922, + "step": 4475 + }, + { + "epoch": 0.7970085470085471, + "grad_norm": 0.6243147253990173, + "learning_rate": 0.0001811707080864333, + "loss": 1.1782, + "step": 4476 + }, + { + "epoch": 0.7971866096866097, + "grad_norm": 0.5336906909942627, + "learning_rate": 0.0001811625318613988, + "loss": 1.167, + "step": 4477 + }, + { + "epoch": 0.7973646723646723, + "grad_norm": 0.5287907719612122, + "learning_rate": 0.00018115435404615315, + "loss": 0.9923, + "step": 4478 + }, + { + "epoch": 0.7975427350427351, + "grad_norm": 0.48941442370414734, + "learning_rate": 0.0001811461746408565, + "loss": 0.863, + "step": 4479 + }, + { + "epoch": 0.7977207977207977, + "grad_norm": 0.48465651273727417, + "learning_rate": 0.0001811379936456691, + "loss": 1.147, + "step": 4480 + }, + { + "epoch": 0.7978988603988604, + "grad_norm": 0.5676067471504211, + "learning_rate": 0.0001811298110607513, + "loss": 1.3121, + "step": 4481 + }, + { + "epoch": 0.7980769230769231, + "grad_norm": 0.4894018769264221, + "learning_rate": 0.00018112162688626337, + "loss": 1.1831, + "step": 4482 + }, + { + "epoch": 
0.7982549857549858, + "grad_norm": 0.5626382827758789, + "learning_rate": 0.0001811134411223657, + "loss": 1.1977, + "step": 4483 + }, + { + "epoch": 0.7984330484330484, + "grad_norm": 0.564119815826416, + "learning_rate": 0.00018110525376921862, + "loss": 1.2686, + "step": 4484 + }, + { + "epoch": 0.7986111111111112, + "grad_norm": 0.6385740041732788, + "learning_rate": 0.00018109706482698256, + "loss": 1.2418, + "step": 4485 + }, + { + "epoch": 0.7987891737891738, + "grad_norm": 0.5550164580345154, + "learning_rate": 0.00018108887429581802, + "loss": 1.081, + "step": 4486 + }, + { + "epoch": 0.7989672364672364, + "grad_norm": 0.5583973526954651, + "learning_rate": 0.00018108068217588544, + "loss": 1.1757, + "step": 4487 + }, + { + "epoch": 0.7991452991452992, + "grad_norm": 0.5533342957496643, + "learning_rate": 0.00018107248846734527, + "loss": 1.1947, + "step": 4488 + }, + { + "epoch": 0.7993233618233618, + "grad_norm": 0.5291479229927063, + "learning_rate": 0.00018106429317035815, + "loss": 1.2769, + "step": 4489 + }, + { + "epoch": 0.7995014245014245, + "grad_norm": 0.4680160582065582, + "learning_rate": 0.00018105609628508458, + "loss": 0.7059, + "step": 4490 + }, + { + "epoch": 0.7996794871794872, + "grad_norm": 0.5364881157875061, + "learning_rate": 0.00018104789781168517, + "loss": 1.0566, + "step": 4491 + }, + { + "epoch": 0.7998575498575499, + "grad_norm": 0.5917307734489441, + "learning_rate": 0.0001810396977503206, + "loss": 1.2263, + "step": 4492 + }, + { + "epoch": 0.8000356125356125, + "grad_norm": 0.6013199090957642, + "learning_rate": 0.0001810314961011515, + "loss": 1.2053, + "step": 4493 + }, + { + "epoch": 0.8002136752136753, + "grad_norm": 0.6005663275718689, + "learning_rate": 0.0001810232928643385, + "loss": 1.2241, + "step": 4494 + }, + { + "epoch": 0.8003917378917379, + "grad_norm": 0.49207603931427, + "learning_rate": 0.00018101508804004246, + "loss": 1.0661, + "step": 4495 + }, + { + "epoch": 0.8005698005698005, + "grad_norm": 
0.4834063947200775, + "learning_rate": 0.00018100688162842401, + "loss": 1.1745, + "step": 4496 + }, + { + "epoch": 0.8007478632478633, + "grad_norm": 0.5347156524658203, + "learning_rate": 0.000180998673629644, + "loss": 1.0679, + "step": 4497 + }, + { + "epoch": 0.8009259259259259, + "grad_norm": 0.5815600156784058, + "learning_rate": 0.00018099046404386327, + "loss": 1.2652, + "step": 4498 + }, + { + "epoch": 0.8011039886039886, + "grad_norm": 0.5291135311126709, + "learning_rate": 0.00018098225287124263, + "loss": 1.2072, + "step": 4499 + }, + { + "epoch": 0.8012820512820513, + "grad_norm": 0.5779497027397156, + "learning_rate": 0.000180974040111943, + "loss": 1.3277, + "step": 4500 + }, + { + "epoch": 0.801460113960114, + "grad_norm": 0.44566696882247925, + "learning_rate": 0.0001809658257661252, + "loss": 0.7702, + "step": 4501 + }, + { + "epoch": 0.8016381766381766, + "grad_norm": 0.5407577753067017, + "learning_rate": 0.00018095760983395027, + "loss": 1.2894, + "step": 4502 + }, + { + "epoch": 0.8018162393162394, + "grad_norm": 0.4771903455257416, + "learning_rate": 0.00018094939231557916, + "loss": 1.045, + "step": 4503 + }, + { + "epoch": 0.801994301994302, + "grad_norm": 0.5970945358276367, + "learning_rate": 0.00018094117321117286, + "loss": 1.2059, + "step": 4504 + }, + { + "epoch": 0.8021723646723646, + "grad_norm": 0.4959338903427124, + "learning_rate": 0.0001809329525208924, + "loss": 1.155, + "step": 4505 + }, + { + "epoch": 0.8023504273504274, + "grad_norm": 0.5142548084259033, + "learning_rate": 0.00018092473024489887, + "loss": 0.9413, + "step": 4506 + }, + { + "epoch": 0.80252849002849, + "grad_norm": 0.5336433053016663, + "learning_rate": 0.00018091650638335334, + "loss": 1.0699, + "step": 4507 + }, + { + "epoch": 0.8027065527065527, + "grad_norm": 0.47770628333091736, + "learning_rate": 0.00018090828093641698, + "loss": 1.1515, + "step": 4508 + }, + { + "epoch": 0.8028846153846154, + "grad_norm": 0.5443438291549683, + "learning_rate": 
0.00018090005390425091, + "loss": 1.189, + "step": 4509 + }, + { + "epoch": 0.8030626780626781, + "grad_norm": 0.523179829120636, + "learning_rate": 0.00018089182528701632, + "loss": 1.1272, + "step": 4510 + }, + { + "epoch": 0.8032407407407407, + "grad_norm": 0.49628451466560364, + "learning_rate": 0.00018088359508487448, + "loss": 0.9754, + "step": 4511 + }, + { + "epoch": 0.8034188034188035, + "grad_norm": 0.5933086276054382, + "learning_rate": 0.00018087536329798663, + "loss": 1.2111, + "step": 4512 + }, + { + "epoch": 0.8035968660968661, + "grad_norm": 0.4565310776233673, + "learning_rate": 0.00018086712992651402, + "loss": 0.7729, + "step": 4513 + }, + { + "epoch": 0.8037749287749287, + "grad_norm": 0.5013461112976074, + "learning_rate": 0.00018085889497061798, + "loss": 1.2178, + "step": 4514 + }, + { + "epoch": 0.8039529914529915, + "grad_norm": 0.5170024633407593, + "learning_rate": 0.00018085065843045987, + "loss": 0.9181, + "step": 4515 + }, + { + "epoch": 0.8041310541310541, + "grad_norm": 0.583363950252533, + "learning_rate": 0.00018084242030620104, + "loss": 1.1542, + "step": 4516 + }, + { + "epoch": 0.8043091168091168, + "grad_norm": 0.46835777163505554, + "learning_rate": 0.00018083418059800297, + "loss": 0.8954, + "step": 4517 + }, + { + "epoch": 0.8044871794871795, + "grad_norm": 0.5145657062530518, + "learning_rate": 0.000180825939306027, + "loss": 1.0417, + "step": 4518 + }, + { + "epoch": 0.8046652421652422, + "grad_norm": 0.47216105461120605, + "learning_rate": 0.00018081769643043467, + "loss": 0.9516, + "step": 4519 + }, + { + "epoch": 0.8048433048433048, + "grad_norm": 0.5059915781021118, + "learning_rate": 0.0001808094519713875, + "loss": 1.1643, + "step": 4520 + }, + { + "epoch": 0.8050213675213675, + "grad_norm": 0.5406439900398254, + "learning_rate": 0.00018080120592904692, + "loss": 1.2038, + "step": 4521 + }, + { + "epoch": 0.8051994301994302, + "grad_norm": 0.6123420000076294, + "learning_rate": 0.0001807929583035746, + "loss": 
1.4004, + "step": 4522 + }, + { + "epoch": 0.8053774928774928, + "grad_norm": 0.49699845910072327, + "learning_rate": 0.00018078470909513208, + "loss": 1.0347, + "step": 4523 + }, + { + "epoch": 0.8055555555555556, + "grad_norm": 0.5369421243667603, + "learning_rate": 0.000180776458303881, + "loss": 1.0418, + "step": 4524 + }, + { + "epoch": 0.8057336182336182, + "grad_norm": 0.5407396554946899, + "learning_rate": 0.00018076820592998301, + "loss": 0.9546, + "step": 4525 + }, + { + "epoch": 0.8059116809116809, + "grad_norm": 0.5749752521514893, + "learning_rate": 0.00018075995197359984, + "loss": 1.1438, + "step": 4526 + }, + { + "epoch": 0.8060897435897436, + "grad_norm": 0.5523102283477783, + "learning_rate": 0.00018075169643489317, + "loss": 1.1312, + "step": 4527 + }, + { + "epoch": 0.8062678062678063, + "grad_norm": 0.5767508149147034, + "learning_rate": 0.00018074343931402472, + "loss": 1.1951, + "step": 4528 + }, + { + "epoch": 0.8064458689458689, + "grad_norm": 0.5262924432754517, + "learning_rate": 0.00018073518061115633, + "loss": 1.1985, + "step": 4529 + }, + { + "epoch": 0.8066239316239316, + "grad_norm": 0.4742378294467926, + "learning_rate": 0.0001807269203264498, + "loss": 1.0126, + "step": 4530 + }, + { + "epoch": 0.8068019943019943, + "grad_norm": 0.5190158486366272, + "learning_rate": 0.00018071865846006692, + "loss": 0.9985, + "step": 4531 + }, + { + "epoch": 0.8069800569800569, + "grad_norm": 0.5910618305206299, + "learning_rate": 0.00018071039501216964, + "loss": 1.2776, + "step": 4532 + }, + { + "epoch": 0.8071581196581197, + "grad_norm": 0.5363098382949829, + "learning_rate": 0.00018070212998291983, + "loss": 1.3346, + "step": 4533 + }, + { + "epoch": 0.8073361823361823, + "grad_norm": 0.47711408138275146, + "learning_rate": 0.0001806938633724794, + "loss": 1.04, + "step": 4534 + }, + { + "epoch": 0.8075142450142451, + "grad_norm": 0.5092964172363281, + "learning_rate": 0.0001806855951810104, + "loss": 1.1409, + "step": 4535 + }, + { + 
"epoch": 0.8076923076923077, + "grad_norm": 0.5828777551651001, + "learning_rate": 0.00018067732540867472, + "loss": 1.3048, + "step": 4536 + }, + { + "epoch": 0.8078703703703703, + "grad_norm": 0.5779826045036316, + "learning_rate": 0.00018066905405563445, + "loss": 1.1599, + "step": 4537 + }, + { + "epoch": 0.8080484330484331, + "grad_norm": 0.49908435344696045, + "learning_rate": 0.00018066078112205167, + "loss": 1.1502, + "step": 4538 + }, + { + "epoch": 0.8082264957264957, + "grad_norm": 0.4772704839706421, + "learning_rate": 0.0001806525066080884, + "loss": 0.7925, + "step": 4539 + }, + { + "epoch": 0.8084045584045584, + "grad_norm": 0.4298383295536041, + "learning_rate": 0.00018064423051390683, + "loss": 0.7322, + "step": 4540 + }, + { + "epoch": 0.8085826210826211, + "grad_norm": 0.49349579215049744, + "learning_rate": 0.0001806359528396691, + "loss": 1.0021, + "step": 4541 + }, + { + "epoch": 0.8087606837606838, + "grad_norm": 0.4698609411716461, + "learning_rate": 0.00018062767358553735, + "loss": 0.9751, + "step": 4542 + }, + { + "epoch": 0.8089387464387464, + "grad_norm": 0.4949014186859131, + "learning_rate": 0.00018061939275167385, + "loss": 0.9553, + "step": 4543 + }, + { + "epoch": 0.8091168091168092, + "grad_norm": 0.5604463815689087, + "learning_rate": 0.0001806111103382408, + "loss": 0.9894, + "step": 4544 + }, + { + "epoch": 0.8092948717948718, + "grad_norm": 0.5761561989784241, + "learning_rate": 0.00018060282634540053, + "loss": 1.258, + "step": 4545 + }, + { + "epoch": 0.8094729344729344, + "grad_norm": 0.5239115357398987, + "learning_rate": 0.00018059454077331527, + "loss": 0.9189, + "step": 4546 + }, + { + "epoch": 0.8096509971509972, + "grad_norm": 0.47902220487594604, + "learning_rate": 0.00018058625362214742, + "loss": 1.0389, + "step": 4547 + }, + { + "epoch": 0.8098290598290598, + "grad_norm": 0.6274173259735107, + "learning_rate": 0.00018057796489205936, + "loss": 1.3368, + "step": 4548 + }, + { + "epoch": 0.8100071225071225, + 
"grad_norm": 0.5789401531219482, + "learning_rate": 0.00018056967458321345, + "loss": 1.1473, + "step": 4549 + }, + { + "epoch": 0.8101851851851852, + "grad_norm": 0.5850043296813965, + "learning_rate": 0.0001805613826957721, + "loss": 1.2224, + "step": 4550 + }, + { + "epoch": 0.8103632478632479, + "grad_norm": 0.6310738921165466, + "learning_rate": 0.00018055308922989788, + "loss": 1.0707, + "step": 4551 + }, + { + "epoch": 0.8105413105413105, + "grad_norm": 0.5198429822921753, + "learning_rate": 0.00018054479418575317, + "loss": 0.8984, + "step": 4552 + }, + { + "epoch": 0.8107193732193733, + "grad_norm": 0.5757743120193481, + "learning_rate": 0.00018053649756350054, + "loss": 1.2007, + "step": 4553 + }, + { + "epoch": 0.8108974358974359, + "grad_norm": 0.5109567642211914, + "learning_rate": 0.0001805281993633025, + "loss": 1.0696, + "step": 4554 + }, + { + "epoch": 0.8110754985754985, + "grad_norm": 0.5030225515365601, + "learning_rate": 0.00018051989958532173, + "loss": 0.9667, + "step": 4555 + }, + { + "epoch": 0.8112535612535613, + "grad_norm": 0.5291743874549866, + "learning_rate": 0.00018051159822972079, + "loss": 1.0219, + "step": 4556 + }, + { + "epoch": 0.8114316239316239, + "grad_norm": 0.5874896049499512, + "learning_rate": 0.00018050329529666233, + "loss": 0.8589, + "step": 4557 + }, + { + "epoch": 0.8116096866096866, + "grad_norm": 0.673284113407135, + "learning_rate": 0.000180494990786309, + "loss": 1.1902, + "step": 4558 + }, + { + "epoch": 0.8117877492877493, + "grad_norm": 0.4742524027824402, + "learning_rate": 0.00018048668469882354, + "loss": 1.0578, + "step": 4559 + }, + { + "epoch": 0.811965811965812, + "grad_norm": 0.5519167184829712, + "learning_rate": 0.0001804783770343687, + "loss": 1.083, + "step": 4560 + }, + { + "epoch": 0.8121438746438746, + "grad_norm": 0.5669941306114197, + "learning_rate": 0.00018047006779310727, + "loss": 1.0784, + "step": 4561 + }, + { + "epoch": 0.8123219373219374, + "grad_norm": 0.512759804725647, + 
"learning_rate": 0.000180461756975202, + "loss": 1.0361, + "step": 4562 + }, + { + "epoch": 0.8125, + "grad_norm": 0.5721749067306519, + "learning_rate": 0.00018045344458081575, + "loss": 1.0246, + "step": 4563 + }, + { + "epoch": 0.8126780626780626, + "grad_norm": 0.566430389881134, + "learning_rate": 0.00018044513061011137, + "loss": 1.1452, + "step": 4564 + }, + { + "epoch": 0.8128561253561254, + "grad_norm": 0.49391916394233704, + "learning_rate": 0.00018043681506325177, + "loss": 0.89, + "step": 4565 + }, + { + "epoch": 0.813034188034188, + "grad_norm": 0.5379437804222107, + "learning_rate": 0.00018042849794039988, + "loss": 1.1289, + "step": 4566 + }, + { + "epoch": 0.8132122507122507, + "grad_norm": 0.5667982697486877, + "learning_rate": 0.00018042017924171865, + "loss": 1.1596, + "step": 4567 + }, + { + "epoch": 0.8133903133903134, + "grad_norm": 0.6214209794998169, + "learning_rate": 0.00018041185896737109, + "loss": 1.0622, + "step": 4568 + }, + { + "epoch": 0.8135683760683761, + "grad_norm": 0.5442491173744202, + "learning_rate": 0.00018040353711752015, + "loss": 1.0536, + "step": 4569 + }, + { + "epoch": 0.8137464387464387, + "grad_norm": 0.5266172885894775, + "learning_rate": 0.00018039521369232894, + "loss": 1.0576, + "step": 4570 + }, + { + "epoch": 0.8139245014245015, + "grad_norm": 0.6057912111282349, + "learning_rate": 0.00018038688869196053, + "loss": 1.3067, + "step": 4571 + }, + { + "epoch": 0.8141025641025641, + "grad_norm": 0.489869087934494, + "learning_rate": 0.00018037856211657803, + "loss": 1.0279, + "step": 4572 + }, + { + "epoch": 0.8142806267806267, + "grad_norm": 0.5497978329658508, + "learning_rate": 0.00018037023396634457, + "loss": 1.1568, + "step": 4573 + }, + { + "epoch": 0.8144586894586895, + "grad_norm": 0.5243251919746399, + "learning_rate": 0.0001803619042414233, + "loss": 0.9767, + "step": 4574 + }, + { + "epoch": 0.8146367521367521, + "grad_norm": 0.503032922744751, + "learning_rate": 0.0001803535729419775, + "loss": 1.065, 
+ "step": 4575 + }, + { + "epoch": 0.8148148148148148, + "grad_norm": 0.49955418705940247, + "learning_rate": 0.00018034524006817034, + "loss": 1.2752, + "step": 4576 + }, + { + "epoch": 0.8149928774928775, + "grad_norm": 0.5746406316757202, + "learning_rate": 0.00018033690562016508, + "loss": 1.098, + "step": 4577 + }, + { + "epoch": 0.8151709401709402, + "grad_norm": 0.5224192142486572, + "learning_rate": 0.00018032856959812507, + "loss": 1.1284, + "step": 4578 + }, + { + "epoch": 0.8153490028490028, + "grad_norm": 0.5484535694122314, + "learning_rate": 0.00018032023200221362, + "loss": 0.9182, + "step": 4579 + }, + { + "epoch": 0.8155270655270656, + "grad_norm": 0.5003355741500854, + "learning_rate": 0.00018031189283259405, + "loss": 1.136, + "step": 4580 + }, + { + "epoch": 0.8157051282051282, + "grad_norm": 0.5395768284797668, + "learning_rate": 0.00018030355208942977, + "loss": 1.2349, + "step": 4581 + }, + { + "epoch": 0.8158831908831908, + "grad_norm": 0.561966598033905, + "learning_rate": 0.0001802952097728842, + "loss": 0.999, + "step": 4582 + }, + { + "epoch": 0.8160612535612536, + "grad_norm": 0.4886479675769806, + "learning_rate": 0.00018028686588312083, + "loss": 0.9165, + "step": 4583 + }, + { + "epoch": 0.8162393162393162, + "grad_norm": 0.4769509732723236, + "learning_rate": 0.00018027852042030307, + "loss": 1.1377, + "step": 4584 + }, + { + "epoch": 0.8164173789173789, + "grad_norm": 0.4723633825778961, + "learning_rate": 0.00018027017338459448, + "loss": 1.0274, + "step": 4585 + }, + { + "epoch": 0.8165954415954416, + "grad_norm": 0.5773285627365112, + "learning_rate": 0.00018026182477615859, + "loss": 1.1468, + "step": 4586 + }, + { + "epoch": 0.8167735042735043, + "grad_norm": 0.5529203414916992, + "learning_rate": 0.00018025347459515895, + "loss": 1.0815, + "step": 4587 + }, + { + "epoch": 0.8169515669515669, + "grad_norm": 0.5449469685554504, + "learning_rate": 0.00018024512284175922, + "loss": 1.1637, + "step": 4588 + }, + { + "epoch": 
0.8171296296296297, + "grad_norm": 0.5155341625213623, + "learning_rate": 0.00018023676951612298, + "loss": 1.1842, + "step": 4589 + }, + { + "epoch": 0.8173076923076923, + "grad_norm": 0.5569564700126648, + "learning_rate": 0.00018022841461841393, + "loss": 0.9254, + "step": 4590 + }, + { + "epoch": 0.8174857549857549, + "grad_norm": 0.45203131437301636, + "learning_rate": 0.00018022005814879573, + "loss": 0.9561, + "step": 4591 + }, + { + "epoch": 0.8176638176638177, + "grad_norm": 0.5735056400299072, + "learning_rate": 0.00018021170010743218, + "loss": 1.1402, + "step": 4592 + }, + { + "epoch": 0.8178418803418803, + "grad_norm": 0.6075260043144226, + "learning_rate": 0.00018020334049448697, + "loss": 0.8601, + "step": 4593 + }, + { + "epoch": 0.8180199430199431, + "grad_norm": 0.522682785987854, + "learning_rate": 0.0001801949793101239, + "loss": 1.0088, + "step": 4594 + }, + { + "epoch": 0.8181980056980057, + "grad_norm": 0.5648437142372131, + "learning_rate": 0.00018018661655450682, + "loss": 0.8359, + "step": 4595 + }, + { + "epoch": 0.8183760683760684, + "grad_norm": 0.5406472086906433, + "learning_rate": 0.00018017825222779954, + "loss": 1.1553, + "step": 4596 + }, + { + "epoch": 0.8185541310541311, + "grad_norm": 0.4917788803577423, + "learning_rate": 0.000180169886330166, + "loss": 1.2198, + "step": 4597 + }, + { + "epoch": 0.8187321937321937, + "grad_norm": 0.6293069124221802, + "learning_rate": 0.00018016151886177004, + "loss": 1.0245, + "step": 4598 + }, + { + "epoch": 0.8189102564102564, + "grad_norm": 0.47277843952178955, + "learning_rate": 0.00018015314982277564, + "loss": 1.1141, + "step": 4599 + }, + { + "epoch": 0.8190883190883191, + "grad_norm": 0.6132395267486572, + "learning_rate": 0.0001801447792133468, + "loss": 1.1227, + "step": 4600 + }, + { + "epoch": 0.8192663817663818, + "grad_norm": 0.46839597821235657, + "learning_rate": 0.00018013640703364747, + "loss": 0.9239, + "step": 4601 + }, + { + "epoch": 0.8194444444444444, + "grad_norm": 
0.5055009722709656, + "learning_rate": 0.00018012803328384171, + "loss": 0.8486, + "step": 4602 + }, + { + "epoch": 0.8196225071225072, + "grad_norm": 0.5094841718673706, + "learning_rate": 0.00018011965796409362, + "loss": 0.9969, + "step": 4603 + }, + { + "epoch": 0.8198005698005698, + "grad_norm": 0.6177363395690918, + "learning_rate": 0.00018011128107456726, + "loss": 1.242, + "step": 4604 + }, + { + "epoch": 0.8199786324786325, + "grad_norm": 0.5280042290687561, + "learning_rate": 0.00018010290261542676, + "loss": 1.1569, + "step": 4605 + }, + { + "epoch": 0.8201566951566952, + "grad_norm": 0.5259367227554321, + "learning_rate": 0.00018009452258683625, + "loss": 0.9993, + "step": 4606 + }, + { + "epoch": 0.8203347578347578, + "grad_norm": 0.464469850063324, + "learning_rate": 0.00018008614098896, + "loss": 1.0288, + "step": 4607 + }, + { + "epoch": 0.8205128205128205, + "grad_norm": 0.6136324405670166, + "learning_rate": 0.00018007775782196214, + "loss": 1.1541, + "step": 4608 + }, + { + "epoch": 0.8206908831908832, + "grad_norm": 0.5376590490341187, + "learning_rate": 0.000180069373086007, + "loss": 1.0624, + "step": 4609 + }, + { + "epoch": 0.8208689458689459, + "grad_norm": 0.662916362285614, + "learning_rate": 0.0001800609867812588, + "loss": 1.1502, + "step": 4610 + }, + { + "epoch": 0.8210470085470085, + "grad_norm": 0.5153383612632751, + "learning_rate": 0.00018005259890788188, + "loss": 0.9789, + "step": 4611 + }, + { + "epoch": 0.8212250712250713, + "grad_norm": 0.5042359232902527, + "learning_rate": 0.00018004420946604057, + "loss": 0.9585, + "step": 4612 + }, + { + "epoch": 0.8214031339031339, + "grad_norm": 0.5395993590354919, + "learning_rate": 0.00018003581845589927, + "loss": 1.159, + "step": 4613 + }, + { + "epoch": 0.8215811965811965, + "grad_norm": 0.5561928749084473, + "learning_rate": 0.00018002742587762237, + "loss": 1.1604, + "step": 4614 + }, + { + "epoch": 0.8217592592592593, + "grad_norm": 0.5602710843086243, + "learning_rate": 
0.00018001903173137432, + "loss": 0.9922, + "step": 4615 + }, + { + "epoch": 0.8219373219373219, + "grad_norm": 0.5529088377952576, + "learning_rate": 0.00018001063601731955, + "loss": 1.0943, + "step": 4616 + }, + { + "epoch": 0.8221153846153846, + "grad_norm": 0.5156456828117371, + "learning_rate": 0.00018000223873562254, + "loss": 1.1399, + "step": 4617 + }, + { + "epoch": 0.8222934472934473, + "grad_norm": 0.4868306517601013, + "learning_rate": 0.0001799938398864479, + "loss": 1.0692, + "step": 4618 + }, + { + "epoch": 0.82247150997151, + "grad_norm": 0.5372915267944336, + "learning_rate": 0.0001799854394699601, + "loss": 1.2675, + "step": 4619 + }, + { + "epoch": 0.8226495726495726, + "grad_norm": 0.6101839542388916, + "learning_rate": 0.0001799770374863238, + "loss": 0.9586, + "step": 4620 + }, + { + "epoch": 0.8228276353276354, + "grad_norm": 0.5034586787223816, + "learning_rate": 0.00017996863393570357, + "loss": 1.0885, + "step": 4621 + }, + { + "epoch": 0.823005698005698, + "grad_norm": 0.5608823299407959, + "learning_rate": 0.0001799602288182641, + "loss": 1.0002, + "step": 4622 + }, + { + "epoch": 0.8231837606837606, + "grad_norm": 0.5700048208236694, + "learning_rate": 0.00017995182213417, + "loss": 1.1484, + "step": 4623 + }, + { + "epoch": 0.8233618233618234, + "grad_norm": 0.5283229351043701, + "learning_rate": 0.00017994341388358608, + "loss": 1.0744, + "step": 4624 + }, + { + "epoch": 0.823539886039886, + "grad_norm": 0.5215758681297302, + "learning_rate": 0.00017993500406667703, + "loss": 1.2686, + "step": 4625 + }, + { + "epoch": 0.8237179487179487, + "grad_norm": 0.528883159160614, + "learning_rate": 0.0001799265926836076, + "loss": 1.1393, + "step": 4626 + }, + { + "epoch": 0.8238960113960114, + "grad_norm": 0.5589834451675415, + "learning_rate": 0.00017991817973454265, + "loss": 1.1744, + "step": 4627 + }, + { + "epoch": 0.8240740740740741, + "grad_norm": 0.49817174673080444, + "learning_rate": 0.00017990976521964697, + "loss": 1.0544, + 
"step": 4628 + }, + { + "epoch": 0.8242521367521367, + "grad_norm": 0.613961398601532, + "learning_rate": 0.00017990134913908542, + "loss": 1.0951, + "step": 4629 + }, + { + "epoch": 0.8244301994301995, + "grad_norm": 0.47278255224227905, + "learning_rate": 0.00017989293149302295, + "loss": 0.9742, + "step": 4630 + }, + { + "epoch": 0.8246082621082621, + "grad_norm": 0.49807092547416687, + "learning_rate": 0.00017988451228162443, + "loss": 1.0985, + "step": 4631 + }, + { + "epoch": 0.8247863247863247, + "grad_norm": 0.5624374747276306, + "learning_rate": 0.00017987609150505485, + "loss": 1.2446, + "step": 4632 + }, + { + "epoch": 0.8249643874643875, + "grad_norm": 0.4863535761833191, + "learning_rate": 0.00017986766916347916, + "loss": 1.0239, + "step": 4633 + }, + { + "epoch": 0.8251424501424501, + "grad_norm": 0.679585874080658, + "learning_rate": 0.00017985924525706245, + "loss": 1.1698, + "step": 4634 + }, + { + "epoch": 0.8253205128205128, + "grad_norm": 0.5545455813407898, + "learning_rate": 0.00017985081978596967, + "loss": 1.0926, + "step": 4635 + }, + { + "epoch": 0.8254985754985755, + "grad_norm": 0.5303109288215637, + "learning_rate": 0.000179842392750366, + "loss": 1.0978, + "step": 4636 + }, + { + "epoch": 0.8256766381766382, + "grad_norm": 0.6053299307823181, + "learning_rate": 0.00017983396415041644, + "loss": 1.0596, + "step": 4637 + }, + { + "epoch": 0.8258547008547008, + "grad_norm": 0.5241885185241699, + "learning_rate": 0.00017982553398628625, + "loss": 0.8541, + "step": 4638 + }, + { + "epoch": 0.8260327635327636, + "grad_norm": 0.5934443473815918, + "learning_rate": 0.00017981710225814052, + "loss": 1.145, + "step": 4639 + }, + { + "epoch": 0.8262108262108262, + "grad_norm": 0.5341619849205017, + "learning_rate": 0.00017980866896614447, + "loss": 1.0745, + "step": 4640 + }, + { + "epoch": 0.8263888888888888, + "grad_norm": 0.6732913851737976, + "learning_rate": 0.00017980023411046336, + "loss": 1.0775, + "step": 4641 + }, + { + "epoch": 
0.8265669515669516, + "grad_norm": 0.5134359002113342, + "learning_rate": 0.0001797917976912624, + "loss": 1.0298, + "step": 4642 + }, + { + "epoch": 0.8267450142450142, + "grad_norm": 0.5234783887863159, + "learning_rate": 0.00017978335970870698, + "loss": 1.1069, + "step": 4643 + }, + { + "epoch": 0.8269230769230769, + "grad_norm": 0.4776439964771271, + "learning_rate": 0.00017977492016296232, + "loss": 0.6367, + "step": 4644 + }, + { + "epoch": 0.8271011396011396, + "grad_norm": 0.53763347864151, + "learning_rate": 0.0001797664790541938, + "loss": 1.1356, + "step": 4645 + }, + { + "epoch": 0.8272792022792023, + "grad_norm": 0.5082212686538696, + "learning_rate": 0.00017975803638256682, + "loss": 0.7873, + "step": 4646 + }, + { + "epoch": 0.8274572649572649, + "grad_norm": 0.5156424641609192, + "learning_rate": 0.00017974959214824685, + "loss": 1.084, + "step": 4647 + }, + { + "epoch": 0.8276353276353277, + "grad_norm": 0.5275198817253113, + "learning_rate": 0.00017974114635139926, + "loss": 1.1219, + "step": 4648 + }, + { + "epoch": 0.8278133903133903, + "grad_norm": 0.5548223257064819, + "learning_rate": 0.00017973269899218956, + "loss": 1.0808, + "step": 4649 + }, + { + "epoch": 0.8279914529914529, + "grad_norm": 0.535347580909729, + "learning_rate": 0.00017972425007078323, + "loss": 1.1211, + "step": 4650 + }, + { + "epoch": 0.8281695156695157, + "grad_norm": 0.5299580693244934, + "learning_rate": 0.00017971579958734587, + "loss": 0.9911, + "step": 4651 + }, + { + "epoch": 0.8283475783475783, + "grad_norm": 0.4863550066947937, + "learning_rate": 0.000179707347542043, + "loss": 0.9122, + "step": 4652 + }, + { + "epoch": 0.8285256410256411, + "grad_norm": 0.5284972190856934, + "learning_rate": 0.00017969889393504022, + "loss": 1.0424, + "step": 4653 + }, + { + "epoch": 0.8287037037037037, + "grad_norm": 0.5305661559104919, + "learning_rate": 0.00017969043876650317, + "loss": 1.1122, + "step": 4654 + }, + { + "epoch": 0.8288817663817664, + "grad_norm": 
0.5645657777786255, + "learning_rate": 0.00017968198203659755, + "loss": 1.2195, + "step": 4655 + }, + { + "epoch": 0.8290598290598291, + "grad_norm": 0.521649181842804, + "learning_rate": 0.000179673523745489, + "loss": 1.2684, + "step": 4656 + }, + { + "epoch": 0.8292378917378918, + "grad_norm": 0.5984422564506531, + "learning_rate": 0.00017966506389334322, + "loss": 0.9894, + "step": 4657 + }, + { + "epoch": 0.8294159544159544, + "grad_norm": 0.5318729281425476, + "learning_rate": 0.00017965660248032603, + "loss": 1.2929, + "step": 4658 + }, + { + "epoch": 0.8295940170940171, + "grad_norm": 0.4666081368923187, + "learning_rate": 0.0001796481395066032, + "loss": 0.9646, + "step": 4659 + }, + { + "epoch": 0.8297720797720798, + "grad_norm": 0.5780388116836548, + "learning_rate": 0.00017963967497234054, + "loss": 1.1043, + "step": 4660 + }, + { + "epoch": 0.8299501424501424, + "grad_norm": 0.44089245796203613, + "learning_rate": 0.00017963120887770387, + "loss": 0.8932, + "step": 4661 + }, + { + "epoch": 0.8301282051282052, + "grad_norm": 0.5198349356651306, + "learning_rate": 0.0001796227412228591, + "loss": 0.9378, + "step": 4662 + }, + { + "epoch": 0.8303062678062678, + "grad_norm": 0.5298343896865845, + "learning_rate": 0.00017961427200797206, + "loss": 1.0272, + "step": 4663 + }, + { + "epoch": 0.8304843304843305, + "grad_norm": 0.5087099671363831, + "learning_rate": 0.0001796058012332088, + "loss": 0.989, + "step": 4664 + }, + { + "epoch": 0.8306623931623932, + "grad_norm": 0.504228949546814, + "learning_rate": 0.0001795973288987352, + "loss": 1.0134, + "step": 4665 + }, + { + "epoch": 0.8308404558404558, + "grad_norm": 0.6788033843040466, + "learning_rate": 0.00017958885500471728, + "loss": 0.8856, + "step": 4666 + }, + { + "epoch": 0.8310185185185185, + "grad_norm": 0.5166172385215759, + "learning_rate": 0.00017958037955132113, + "loss": 0.8711, + "step": 4667 + }, + { + "epoch": 0.8311965811965812, + "grad_norm": 0.5712400078773499, + "learning_rate": 
0.00017957190253871272, + "loss": 1.0418, + "step": 4668 + }, + { + "epoch": 0.8313746438746439, + "grad_norm": 0.5531231164932251, + "learning_rate": 0.0001795634239670582, + "loss": 0.9021, + "step": 4669 + }, + { + "epoch": 0.8315527065527065, + "grad_norm": 0.6165615916252136, + "learning_rate": 0.00017955494383652365, + "loss": 1.0927, + "step": 4670 + }, + { + "epoch": 0.8317307692307693, + "grad_norm": 0.5920368432998657, + "learning_rate": 0.00017954646214727525, + "loss": 1.231, + "step": 4671 + }, + { + "epoch": 0.8319088319088319, + "grad_norm": 0.5037244558334351, + "learning_rate": 0.00017953797889947915, + "loss": 0.85, + "step": 4672 + }, + { + "epoch": 0.8320868945868946, + "grad_norm": 0.5618211627006531, + "learning_rate": 0.0001795294940933016, + "loss": 1.145, + "step": 4673 + }, + { + "epoch": 0.8322649572649573, + "grad_norm": 0.6275593042373657, + "learning_rate": 0.00017952100772890877, + "loss": 0.9061, + "step": 4674 + }, + { + "epoch": 0.83244301994302, + "grad_norm": 0.5376096367835999, + "learning_rate": 0.00017951251980646702, + "loss": 1.1948, + "step": 4675 + }, + { + "epoch": 0.8326210826210826, + "grad_norm": 0.5162268877029419, + "learning_rate": 0.0001795040303261426, + "loss": 1.2158, + "step": 4676 + }, + { + "epoch": 0.8327991452991453, + "grad_norm": 0.5730512142181396, + "learning_rate": 0.0001794955392881019, + "loss": 0.9962, + "step": 4677 + }, + { + "epoch": 0.832977207977208, + "grad_norm": 0.5128712058067322, + "learning_rate": 0.00017948704669251122, + "loss": 1.2797, + "step": 4678 + }, + { + "epoch": 0.8331552706552706, + "grad_norm": 0.5173979997634888, + "learning_rate": 0.00017947855253953697, + "loss": 1.1093, + "step": 4679 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.504646897315979, + "learning_rate": 0.0001794700568293456, + "loss": 1.3171, + "step": 4680 + }, + { + "epoch": 0.833511396011396, + "grad_norm": 0.5638105869293213, + "learning_rate": 0.00017946155956210356, + "loss": 0.9224, + 
"step": 4681 + }, + { + "epoch": 0.8336894586894587, + "grad_norm": 0.5289680361747742, + "learning_rate": 0.00017945306073797733, + "loss": 0.8919, + "step": 4682 + }, + { + "epoch": 0.8338675213675214, + "grad_norm": 0.5224629044532776, + "learning_rate": 0.0001794445603571334, + "loss": 1.0345, + "step": 4683 + }, + { + "epoch": 0.834045584045584, + "grad_norm": 0.5342282056808472, + "learning_rate": 0.00017943605841973836, + "loss": 1.2305, + "step": 4684 + }, + { + "epoch": 0.8342236467236467, + "grad_norm": 0.6118032336235046, + "learning_rate": 0.00017942755492595874, + "loss": 1.0316, + "step": 4685 + }, + { + "epoch": 0.8344017094017094, + "grad_norm": 0.49112311005592346, + "learning_rate": 0.00017941904987596121, + "loss": 0.9809, + "step": 4686 + }, + { + "epoch": 0.8345797720797721, + "grad_norm": 0.5044063925743103, + "learning_rate": 0.0001794105432699124, + "loss": 0.834, + "step": 4687 + }, + { + "epoch": 0.8347578347578347, + "grad_norm": 0.4849987328052521, + "learning_rate": 0.00017940203510797892, + "loss": 0.9971, + "step": 4688 + }, + { + "epoch": 0.8349358974358975, + "grad_norm": 0.5539469122886658, + "learning_rate": 0.00017939352539032748, + "loss": 1.1599, + "step": 4689 + }, + { + "epoch": 0.8351139601139601, + "grad_norm": 0.5474258065223694, + "learning_rate": 0.00017938501411712485, + "loss": 1.25, + "step": 4690 + }, + { + "epoch": 0.8352920227920227, + "grad_norm": 0.4880213737487793, + "learning_rate": 0.0001793765012885378, + "loss": 1.1471, + "step": 4691 + }, + { + "epoch": 0.8354700854700855, + "grad_norm": 0.5602759718894958, + "learning_rate": 0.00017936798690473309, + "loss": 1.0723, + "step": 4692 + }, + { + "epoch": 0.8356481481481481, + "grad_norm": 0.627775251865387, + "learning_rate": 0.00017935947096587755, + "loss": 1.3768, + "step": 4693 + }, + { + "epoch": 0.8358262108262108, + "grad_norm": 0.5324847102165222, + "learning_rate": 0.00017935095347213804, + "loss": 0.9945, + "step": 4694 + }, + { + "epoch": 
0.8360042735042735, + "grad_norm": 0.5244048237800598, + "learning_rate": 0.0001793424344236814, + "loss": 1.1725, + "step": 4695 + }, + { + "epoch": 0.8361823361823362, + "grad_norm": 0.5420708656311035, + "learning_rate": 0.00017933391382067462, + "loss": 1.1267, + "step": 4696 + }, + { + "epoch": 0.8363603988603988, + "grad_norm": 0.5285456776618958, + "learning_rate": 0.00017932539166328458, + "loss": 1.0368, + "step": 4697 + }, + { + "epoch": 0.8365384615384616, + "grad_norm": 0.5330373048782349, + "learning_rate": 0.00017931686795167825, + "loss": 1.1082, + "step": 4698 + }, + { + "epoch": 0.8367165242165242, + "grad_norm": 0.5516682267189026, + "learning_rate": 0.0001793083426860227, + "loss": 1.1833, + "step": 4699 + }, + { + "epoch": 0.8368945868945868, + "grad_norm": 0.5229935646057129, + "learning_rate": 0.0001792998158664849, + "loss": 0.8527, + "step": 4700 + }, + { + "epoch": 0.8370726495726496, + "grad_norm": 0.4821490943431854, + "learning_rate": 0.00017929128749323195, + "loss": 1.1201, + "step": 4701 + }, + { + "epoch": 0.8372507122507122, + "grad_norm": 0.6276404857635498, + "learning_rate": 0.0001792827575664309, + "loss": 1.0986, + "step": 4702 + }, + { + "epoch": 0.8374287749287749, + "grad_norm": 0.5681334733963013, + "learning_rate": 0.00017927422608624897, + "loss": 1.3821, + "step": 4703 + }, + { + "epoch": 0.8376068376068376, + "grad_norm": 0.5257087349891663, + "learning_rate": 0.00017926569305285324, + "loss": 1.1033, + "step": 4704 + }, + { + "epoch": 0.8377849002849003, + "grad_norm": 0.5665168166160583, + "learning_rate": 0.0001792571584664109, + "loss": 1.104, + "step": 4705 + }, + { + "epoch": 0.8379629629629629, + "grad_norm": 0.5202076435089111, + "learning_rate": 0.00017924862232708918, + "loss": 1.052, + "step": 4706 + }, + { + "epoch": 0.8381410256410257, + "grad_norm": 0.5103010535240173, + "learning_rate": 0.00017924008463505534, + "loss": 1.1348, + "step": 4707 + }, + { + "epoch": 0.8383190883190883, + "grad_norm": 
0.6811865568161011, + "learning_rate": 0.00017923154539047667, + "loss": 1.2804, + "step": 4708 + }, + { + "epoch": 0.8384971509971509, + "grad_norm": 0.46808311343193054, + "learning_rate": 0.00017922300459352042, + "loss": 0.9302, + "step": 4709 + }, + { + "epoch": 0.8386752136752137, + "grad_norm": 0.47713059186935425, + "learning_rate": 0.00017921446224435398, + "loss": 0.78, + "step": 4710 + }, + { + "epoch": 0.8388532763532763, + "grad_norm": 0.7579890489578247, + "learning_rate": 0.0001792059183431447, + "loss": 1.4776, + "step": 4711 + }, + { + "epoch": 0.8390313390313391, + "grad_norm": 0.6009423136711121, + "learning_rate": 0.00017919737289006, + "loss": 1.2679, + "step": 4712 + }, + { + "epoch": 0.8392094017094017, + "grad_norm": 0.56390780210495, + "learning_rate": 0.00017918882588526729, + "loss": 1.0402, + "step": 4713 + }, + { + "epoch": 0.8393874643874644, + "grad_norm": 0.5698862075805664, + "learning_rate": 0.00017918027732893404, + "loss": 1.2336, + "step": 4714 + }, + { + "epoch": 0.8395655270655271, + "grad_norm": 0.5016305446624756, + "learning_rate": 0.0001791717272212277, + "loss": 1.0373, + "step": 4715 + }, + { + "epoch": 0.8397435897435898, + "grad_norm": 0.5886971950531006, + "learning_rate": 0.0001791631755623159, + "loss": 1.1062, + "step": 4716 + }, + { + "epoch": 0.8399216524216524, + "grad_norm": 0.647833526134491, + "learning_rate": 0.00017915462235236607, + "loss": 1.0464, + "step": 4717 + }, + { + "epoch": 0.8400997150997151, + "grad_norm": 0.4961194396018982, + "learning_rate": 0.00017914606759154587, + "loss": 1.0763, + "step": 4718 + }, + { + "epoch": 0.8402777777777778, + "grad_norm": 0.47041359543800354, + "learning_rate": 0.00017913751128002288, + "loss": 1.0685, + "step": 4719 + }, + { + "epoch": 0.8404558404558404, + "grad_norm": 0.5752858519554138, + "learning_rate": 0.00017912895341796475, + "loss": 1.0577, + "step": 4720 + }, + { + "epoch": 0.8406339031339032, + "grad_norm": 0.5233224034309387, + "learning_rate": 
0.00017912039400553914, + "loss": 1.1484, + "step": 4721 + }, + { + "epoch": 0.8408119658119658, + "grad_norm": 0.5327485203742981, + "learning_rate": 0.00017911183304291378, + "loss": 1.0028, + "step": 4722 + }, + { + "epoch": 0.8409900284900285, + "grad_norm": 0.5320752263069153, + "learning_rate": 0.00017910327053025638, + "loss": 1.1247, + "step": 4723 + }, + { + "epoch": 0.8411680911680912, + "grad_norm": 0.529617965221405, + "learning_rate": 0.00017909470646773477, + "loss": 1.1698, + "step": 4724 + }, + { + "epoch": 0.8413461538461539, + "grad_norm": 0.5055609345436096, + "learning_rate": 0.00017908614085551664, + "loss": 1.0925, + "step": 4725 + }, + { + "epoch": 0.8415242165242165, + "grad_norm": 0.5356255769729614, + "learning_rate": 0.00017907757369376985, + "loss": 1.0354, + "step": 4726 + }, + { + "epoch": 0.8417022792022792, + "grad_norm": 0.582834780216217, + "learning_rate": 0.00017906900498266233, + "loss": 1.1248, + "step": 4727 + }, + { + "epoch": 0.8418803418803419, + "grad_norm": 0.5750834941864014, + "learning_rate": 0.00017906043472236188, + "loss": 1.0119, + "step": 4728 + }, + { + "epoch": 0.8420584045584045, + "grad_norm": 0.5923320055007935, + "learning_rate": 0.00017905186291303644, + "loss": 1.0662, + "step": 4729 + }, + { + "epoch": 0.8422364672364673, + "grad_norm": 0.4767811894416809, + "learning_rate": 0.00017904328955485396, + "loss": 1.0911, + "step": 4730 + }, + { + "epoch": 0.8424145299145299, + "grad_norm": 0.5294556021690369, + "learning_rate": 0.00017903471464798245, + "loss": 1.2861, + "step": 4731 + }, + { + "epoch": 0.8425925925925926, + "grad_norm": 0.599117636680603, + "learning_rate": 0.00017902613819258985, + "loss": 1.1707, + "step": 4732 + }, + { + "epoch": 0.8427706552706553, + "grad_norm": 0.5912977457046509, + "learning_rate": 0.00017901756018884424, + "loss": 1.1884, + "step": 4733 + }, + { + "epoch": 0.842948717948718, + "grad_norm": 0.587676465511322, + "learning_rate": 0.0001790089806369137, + "loss": 1.1054, 
+ "step": 4734 + }, + { + "epoch": 0.8431267806267806, + "grad_norm": 0.6271800398826599, + "learning_rate": 0.0001790003995369663, + "loss": 1.2094, + "step": 4735 + }, + { + "epoch": 0.8433048433048433, + "grad_norm": 0.47198590636253357, + "learning_rate": 0.00017899181688917017, + "loss": 0.9561, + "step": 4736 + }, + { + "epoch": 0.843482905982906, + "grad_norm": 0.690732479095459, + "learning_rate": 0.00017898323269369351, + "loss": 1.1629, + "step": 4737 + }, + { + "epoch": 0.8436609686609686, + "grad_norm": 0.4926888048648834, + "learning_rate": 0.00017897464695070445, + "loss": 1.1097, + "step": 4738 + }, + { + "epoch": 0.8438390313390314, + "grad_norm": 0.7071278691291809, + "learning_rate": 0.00017896605966037128, + "loss": 1.195, + "step": 4739 + }, + { + "epoch": 0.844017094017094, + "grad_norm": 0.5650486350059509, + "learning_rate": 0.00017895747082286216, + "loss": 1.0107, + "step": 4740 + }, + { + "epoch": 0.8441951566951567, + "grad_norm": 0.5291931629180908, + "learning_rate": 0.00017894888043834545, + "loss": 1.0104, + "step": 4741 + }, + { + "epoch": 0.8443732193732194, + "grad_norm": 0.5751241445541382, + "learning_rate": 0.00017894028850698942, + "loss": 1.2482, + "step": 4742 + }, + { + "epoch": 0.844551282051282, + "grad_norm": 0.5833632349967957, + "learning_rate": 0.0001789316950289624, + "loss": 1.0552, + "step": 4743 + }, + { + "epoch": 0.8447293447293447, + "grad_norm": 0.543729841709137, + "learning_rate": 0.00017892310000443282, + "loss": 1.1453, + "step": 4744 + }, + { + "epoch": 0.8449074074074074, + "grad_norm": 0.5674204230308533, + "learning_rate": 0.00017891450343356902, + "loss": 1.0757, + "step": 4745 + }, + { + "epoch": 0.8450854700854701, + "grad_norm": 0.5161892771720886, + "learning_rate": 0.00017890590531653946, + "loss": 1.1163, + "step": 4746 + }, + { + "epoch": 0.8452635327635327, + "grad_norm": 0.49907612800598145, + "learning_rate": 0.00017889730565351258, + "loss": 1.0356, + "step": 4747 + }, + { + "epoch": 
0.8454415954415955, + "grad_norm": 0.4994732439517975, + "learning_rate": 0.00017888870444465692, + "loss": 1.026, + "step": 4748 + }, + { + "epoch": 0.8456196581196581, + "grad_norm": 0.6397520303726196, + "learning_rate": 0.00017888010169014095, + "loss": 0.957, + "step": 4749 + }, + { + "epoch": 0.8457977207977208, + "grad_norm": 0.5379729270935059, + "learning_rate": 0.00017887149739013327, + "loss": 1.1664, + "step": 4750 + }, + { + "epoch": 0.8459757834757835, + "grad_norm": 0.4487382769584656, + "learning_rate": 0.00017886289154480246, + "loss": 0.9377, + "step": 4751 + }, + { + "epoch": 0.8461538461538461, + "grad_norm": 0.5645943880081177, + "learning_rate": 0.00017885428415431707, + "loss": 1.273, + "step": 4752 + }, + { + "epoch": 0.8463319088319088, + "grad_norm": 0.5535289645195007, + "learning_rate": 0.00017884567521884577, + "loss": 1.1779, + "step": 4753 + }, + { + "epoch": 0.8465099715099715, + "grad_norm": 0.5039721131324768, + "learning_rate": 0.0001788370647385573, + "loss": 1.0237, + "step": 4754 + }, + { + "epoch": 0.8466880341880342, + "grad_norm": 0.4543854892253876, + "learning_rate": 0.00017882845271362032, + "loss": 0.8149, + "step": 4755 + }, + { + "epoch": 0.8468660968660968, + "grad_norm": 0.5095639824867249, + "learning_rate": 0.00017881983914420352, + "loss": 1.0141, + "step": 4756 + }, + { + "epoch": 0.8470441595441596, + "grad_norm": 0.5341798663139343, + "learning_rate": 0.00017881122403047575, + "loss": 1.1885, + "step": 4757 + }, + { + "epoch": 0.8472222222222222, + "grad_norm": 0.5595062971115112, + "learning_rate": 0.00017880260737260573, + "loss": 0.8939, + "step": 4758 + }, + { + "epoch": 0.8474002849002849, + "grad_norm": 0.5355880260467529, + "learning_rate": 0.00017879398917076232, + "loss": 1.2434, + "step": 4759 + }, + { + "epoch": 0.8475783475783476, + "grad_norm": 0.49477261304855347, + "learning_rate": 0.0001787853694251144, + "loss": 0.979, + "step": 4760 + }, + { + "epoch": 0.8477564102564102, + "grad_norm": 
0.5154359340667725, + "learning_rate": 0.00017877674813583078, + "loss": 1.0957, + "step": 4761 + }, + { + "epoch": 0.8479344729344729, + "grad_norm": 0.5651070475578308, + "learning_rate": 0.00017876812530308046, + "loss": 1.1884, + "step": 4762 + }, + { + "epoch": 0.8481125356125356, + "grad_norm": 0.537277340888977, + "learning_rate": 0.00017875950092703232, + "loss": 1.0272, + "step": 4763 + }, + { + "epoch": 0.8482905982905983, + "grad_norm": 0.5259691476821899, + "learning_rate": 0.00017875087500785538, + "loss": 1.1493, + "step": 4764 + }, + { + "epoch": 0.8484686609686609, + "grad_norm": 0.5491300225257874, + "learning_rate": 0.00017874224754571867, + "loss": 0.8316, + "step": 4765 + }, + { + "epoch": 0.8486467236467237, + "grad_norm": 0.5493744611740112, + "learning_rate": 0.00017873361854079116, + "loss": 1.2328, + "step": 4766 + }, + { + "epoch": 0.8488247863247863, + "grad_norm": 0.571002185344696, + "learning_rate": 0.00017872498799324197, + "loss": 1.1384, + "step": 4767 + }, + { + "epoch": 0.8490028490028491, + "grad_norm": 0.538152813911438, + "learning_rate": 0.00017871635590324013, + "loss": 1.0581, + "step": 4768 + }, + { + "epoch": 0.8491809116809117, + "grad_norm": 0.5214923620223999, + "learning_rate": 0.00017870772227095486, + "loss": 1.0612, + "step": 4769 + }, + { + "epoch": 0.8493589743589743, + "grad_norm": 0.5714883804321289, + "learning_rate": 0.0001786990870965553, + "loss": 0.9076, + "step": 4770 + }, + { + "epoch": 0.8495370370370371, + "grad_norm": 0.4181775450706482, + "learning_rate": 0.00017869045038021054, + "loss": 0.8366, + "step": 4771 + }, + { + "epoch": 0.8497150997150997, + "grad_norm": 0.6266027688980103, + "learning_rate": 0.00017868181212208993, + "loss": 1.2047, + "step": 4772 + }, + { + "epoch": 0.8498931623931624, + "grad_norm": 0.5423732399940491, + "learning_rate": 0.0001786731723223626, + "loss": 1.3878, + "step": 4773 + }, + { + "epoch": 0.8500712250712251, + "grad_norm": 0.5512300133705139, + "learning_rate": 
0.00017866453098119793, + "loss": 1.1132, + "step": 4774 + }, + { + "epoch": 0.8502492877492878, + "grad_norm": 0.5767185688018799, + "learning_rate": 0.00017865588809876519, + "loss": 0.97, + "step": 4775 + }, + { + "epoch": 0.8504273504273504, + "grad_norm": 0.5305790305137634, + "learning_rate": 0.00017864724367523368, + "loss": 1.1158, + "step": 4776 + }, + { + "epoch": 0.8506054131054132, + "grad_norm": 0.49702391028404236, + "learning_rate": 0.00017863859771077284, + "loss": 0.9669, + "step": 4777 + }, + { + "epoch": 0.8507834757834758, + "grad_norm": 0.5490063428878784, + "learning_rate": 0.00017862995020555205, + "loss": 1.0646, + "step": 4778 + }, + { + "epoch": 0.8509615384615384, + "grad_norm": 0.5308689475059509, + "learning_rate": 0.00017862130115974068, + "loss": 0.8922, + "step": 4779 + }, + { + "epoch": 0.8511396011396012, + "grad_norm": 0.5412983894348145, + "learning_rate": 0.00017861265057350826, + "loss": 1.1444, + "step": 4780 + }, + { + "epoch": 0.8513176638176638, + "grad_norm": 0.5857377052307129, + "learning_rate": 0.00017860399844702425, + "loss": 1.1643, + "step": 4781 + }, + { + "epoch": 0.8514957264957265, + "grad_norm": 0.599273681640625, + "learning_rate": 0.00017859534478045815, + "loss": 1.169, + "step": 4782 + }, + { + "epoch": 0.8516737891737892, + "grad_norm": 0.5677087903022766, + "learning_rate": 0.00017858668957397957, + "loss": 1.0793, + "step": 4783 + }, + { + "epoch": 0.8518518518518519, + "grad_norm": 0.5648362636566162, + "learning_rate": 0.00017857803282775807, + "loss": 1.1932, + "step": 4784 + }, + { + "epoch": 0.8520299145299145, + "grad_norm": 0.5138826966285706, + "learning_rate": 0.00017856937454196323, + "loss": 1.0011, + "step": 4785 + }, + { + "epoch": 0.8522079772079773, + "grad_norm": 0.5951429009437561, + "learning_rate": 0.0001785607147167647, + "loss": 1.3198, + "step": 4786 + }, + { + "epoch": 0.8523860398860399, + "grad_norm": 0.5341953039169312, + "learning_rate": 0.00017855205335233216, + "loss": 
0.9094, + "step": 4787 + }, + { + "epoch": 0.8525641025641025, + "grad_norm": 0.5193579196929932, + "learning_rate": 0.00017854339044883535, + "loss": 0.892, + "step": 4788 + }, + { + "epoch": 0.8527421652421653, + "grad_norm": 0.5053097009658813, + "learning_rate": 0.00017853472600644392, + "loss": 1.0589, + "step": 4789 + }, + { + "epoch": 0.8529202279202279, + "grad_norm": 0.5819617509841919, + "learning_rate": 0.0001785260600253277, + "loss": 1.2646, + "step": 4790 + }, + { + "epoch": 0.8530982905982906, + "grad_norm": 0.5327470302581787, + "learning_rate": 0.00017851739250565645, + "loss": 1.056, + "step": 4791 + }, + { + "epoch": 0.8532763532763533, + "grad_norm": 0.5131269097328186, + "learning_rate": 0.0001785087234476, + "loss": 1.1192, + "step": 4792 + }, + { + "epoch": 0.853454415954416, + "grad_norm": 0.4698086977005005, + "learning_rate": 0.00017850005285132821, + "loss": 0.9849, + "step": 4793 + }, + { + "epoch": 0.8536324786324786, + "grad_norm": 0.5503947734832764, + "learning_rate": 0.00017849138071701092, + "loss": 1.1139, + "step": 4794 + }, + { + "epoch": 0.8538105413105413, + "grad_norm": 0.5120903849601746, + "learning_rate": 0.0001784827070448181, + "loss": 0.9801, + "step": 4795 + }, + { + "epoch": 0.853988603988604, + "grad_norm": 0.47650405764579773, + "learning_rate": 0.00017847403183491968, + "loss": 1.0268, + "step": 4796 + }, + { + "epoch": 0.8541666666666666, + "grad_norm": 0.5773387551307678, + "learning_rate": 0.0001784653550874856, + "loss": 1.0336, + "step": 4797 + }, + { + "epoch": 0.8543447293447294, + "grad_norm": 0.545531153678894, + "learning_rate": 0.00017845667680268593, + "loss": 1.0532, + "step": 4798 + }, + { + "epoch": 0.854522792022792, + "grad_norm": 0.533161461353302, + "learning_rate": 0.0001784479969806906, + "loss": 1.1964, + "step": 4799 + }, + { + "epoch": 0.8547008547008547, + "grad_norm": 0.5880789160728455, + "learning_rate": 0.00017843931562166977, + "loss": 1.1588, + "step": 4800 + }, + { + "epoch": 
0.8548789173789174, + "grad_norm": 0.5381524562835693, + "learning_rate": 0.00017843063272579346, + "loss": 1.1533, + "step": 4801 + }, + { + "epoch": 0.85505698005698, + "grad_norm": 0.6280176639556885, + "learning_rate": 0.00017842194829323187, + "loss": 1.0084, + "step": 4802 + }, + { + "epoch": 0.8552350427350427, + "grad_norm": 0.5098552703857422, + "learning_rate": 0.0001784132623241551, + "loss": 1.0804, + "step": 4803 + }, + { + "epoch": 0.8554131054131054, + "grad_norm": 0.5406526923179626, + "learning_rate": 0.00017840457481873328, + "loss": 1.2571, + "step": 4804 + }, + { + "epoch": 0.8555911680911681, + "grad_norm": 0.5859003663063049, + "learning_rate": 0.00017839588577713678, + "loss": 1.2462, + "step": 4805 + }, + { + "epoch": 0.8557692307692307, + "grad_norm": 0.6209002137184143, + "learning_rate": 0.00017838719519953572, + "loss": 1.307, + "step": 4806 + }, + { + "epoch": 0.8559472934472935, + "grad_norm": 0.525753915309906, + "learning_rate": 0.00017837850308610037, + "loss": 1.2957, + "step": 4807 + }, + { + "epoch": 0.8561253561253561, + "grad_norm": 0.5096195340156555, + "learning_rate": 0.0001783698094370011, + "loss": 1.1433, + "step": 4808 + }, + { + "epoch": 0.8563034188034188, + "grad_norm": 0.5873076915740967, + "learning_rate": 0.0001783611142524082, + "loss": 1.2271, + "step": 4809 + }, + { + "epoch": 0.8564814814814815, + "grad_norm": 0.5093944668769836, + "learning_rate": 0.0001783524175324921, + "loss": 0.8788, + "step": 4810 + }, + { + "epoch": 0.8566595441595442, + "grad_norm": 0.5485084652900696, + "learning_rate": 0.00017834371927742307, + "loss": 1.256, + "step": 4811 + }, + { + "epoch": 0.8568376068376068, + "grad_norm": 0.5808873772621155, + "learning_rate": 0.00017833501948737163, + "loss": 0.9287, + "step": 4812 + }, + { + "epoch": 0.8570156695156695, + "grad_norm": 0.5113978385925293, + "learning_rate": 0.00017832631816250822, + "loss": 1.0372, + "step": 4813 + }, + { + "epoch": 0.8571937321937322, + "grad_norm": 
0.5877016186714172, + "learning_rate": 0.0001783176153030033, + "loss": 1.3023, + "step": 4814 + }, + { + "epoch": 0.8573717948717948, + "grad_norm": 0.534328043460846, + "learning_rate": 0.00017830891090902742, + "loss": 1.1023, + "step": 4815 + }, + { + "epoch": 0.8575498575498576, + "grad_norm": 0.5781638026237488, + "learning_rate": 0.0001783002049807511, + "loss": 0.9562, + "step": 4816 + }, + { + "epoch": 0.8577279202279202, + "grad_norm": 0.5760263204574585, + "learning_rate": 0.00017829149751834487, + "loss": 0.8733, + "step": 4817 + }, + { + "epoch": 0.8579059829059829, + "grad_norm": 0.3887255787849426, + "learning_rate": 0.00017828278852197944, + "loss": 0.5949, + "step": 4818 + }, + { + "epoch": 0.8580840455840456, + "grad_norm": 0.47814446687698364, + "learning_rate": 0.00017827407799182537, + "loss": 1.0698, + "step": 4819 + }, + { + "epoch": 0.8582621082621082, + "grad_norm": 0.5520272254943848, + "learning_rate": 0.00017826536592805334, + "loss": 1.1314, + "step": 4820 + }, + { + "epoch": 0.8584401709401709, + "grad_norm": 0.5285319685935974, + "learning_rate": 0.00017825665233083405, + "loss": 1.1618, + "step": 4821 + }, + { + "epoch": 0.8586182336182336, + "grad_norm": 0.6080102324485779, + "learning_rate": 0.0001782479372003382, + "loss": 1.3817, + "step": 4822 + }, + { + "epoch": 0.8587962962962963, + "grad_norm": 0.7474410533905029, + "learning_rate": 0.00017823922053673662, + "loss": 1.1321, + "step": 4823 + }, + { + "epoch": 0.8589743589743589, + "grad_norm": 0.559283435344696, + "learning_rate": 0.0001782305023402, + "loss": 1.1894, + "step": 4824 + }, + { + "epoch": 0.8591524216524217, + "grad_norm": 0.5620571374893188, + "learning_rate": 0.00017822178261089918, + "loss": 1.134, + "step": 4825 + }, + { + "epoch": 0.8593304843304843, + "grad_norm": 0.5553044676780701, + "learning_rate": 0.00017821306134900504, + "loss": 1.3222, + "step": 4826 + }, + { + "epoch": 0.8595085470085471, + "grad_norm": 0.6177778244018555, + "learning_rate": 
0.00017820433855468846, + "loss": 1.2545, + "step": 4827 + }, + { + "epoch": 0.8596866096866097, + "grad_norm": 0.656233012676239, + "learning_rate": 0.0001781956142281203, + "loss": 1.1346, + "step": 4828 + }, + { + "epoch": 0.8598646723646723, + "grad_norm": 0.6710973381996155, + "learning_rate": 0.0001781868883694715, + "loss": 1.1361, + "step": 4829 + }, + { + "epoch": 0.8600427350427351, + "grad_norm": 0.5093601942062378, + "learning_rate": 0.0001781781609789131, + "loss": 1.0509, + "step": 4830 + }, + { + "epoch": 0.8602207977207977, + "grad_norm": 0.5707578063011169, + "learning_rate": 0.00017816943205661598, + "loss": 1.0964, + "step": 4831 + }, + { + "epoch": 0.8603988603988604, + "grad_norm": 0.6159597635269165, + "learning_rate": 0.00017816070160275125, + "loss": 1.0322, + "step": 4832 + }, + { + "epoch": 0.8605769230769231, + "grad_norm": 0.5430580377578735, + "learning_rate": 0.0001781519696174899, + "loss": 1.2464, + "step": 4833 + }, + { + "epoch": 0.8607549857549858, + "grad_norm": 0.48104700446128845, + "learning_rate": 0.0001781432361010031, + "loss": 1.1031, + "step": 4834 + }, + { + "epoch": 0.8609330484330484, + "grad_norm": 0.5304946303367615, + "learning_rate": 0.0001781345010534619, + "loss": 1.0281, + "step": 4835 + }, + { + "epoch": 0.8611111111111112, + "grad_norm": 0.5230711698532104, + "learning_rate": 0.00017812576447503742, + "loss": 0.9499, + "step": 4836 + }, + { + "epoch": 0.8612891737891738, + "grad_norm": 0.5363606214523315, + "learning_rate": 0.00017811702636590093, + "loss": 1.1358, + "step": 4837 + }, + { + "epoch": 0.8614672364672364, + "grad_norm": 0.5880044102668762, + "learning_rate": 0.00017810828672622358, + "loss": 1.1765, + "step": 4838 + }, + { + "epoch": 0.8616452991452992, + "grad_norm": 0.5194395184516907, + "learning_rate": 0.0001780995455561766, + "loss": 1.1622, + "step": 4839 + }, + { + "epoch": 0.8618233618233618, + "grad_norm": 0.5114264488220215, + "learning_rate": 0.00017809080285593126, + "loss": 1.0081, + 
"step": 4840 + }, + { + "epoch": 0.8620014245014245, + "grad_norm": 0.6174240112304688, + "learning_rate": 0.00017808205862565886, + "loss": 1.0745, + "step": 4841 + }, + { + "epoch": 0.8621794871794872, + "grad_norm": 0.5662630200386047, + "learning_rate": 0.0001780733128655307, + "loss": 1.3369, + "step": 4842 + }, + { + "epoch": 0.8623575498575499, + "grad_norm": 0.5917882919311523, + "learning_rate": 0.00017806456557571817, + "loss": 1.1631, + "step": 4843 + }, + { + "epoch": 0.8625356125356125, + "grad_norm": 0.5305736660957336, + "learning_rate": 0.00017805581675639265, + "loss": 0.9875, + "step": 4844 + }, + { + "epoch": 0.8627136752136753, + "grad_norm": 0.5181219577789307, + "learning_rate": 0.00017804706640772556, + "loss": 0.9918, + "step": 4845 + }, + { + "epoch": 0.8628917378917379, + "grad_norm": 0.5467997789382935, + "learning_rate": 0.00017803831452988832, + "loss": 1.1395, + "step": 4846 + }, + { + "epoch": 0.8630698005698005, + "grad_norm": 0.5494031310081482, + "learning_rate": 0.00017802956112305241, + "loss": 1.0312, + "step": 4847 + }, + { + "epoch": 0.8632478632478633, + "grad_norm": 0.5804065465927124, + "learning_rate": 0.00017802080618738931, + "loss": 1.1555, + "step": 4848 + }, + { + "epoch": 0.8634259259259259, + "grad_norm": 0.5424801111221313, + "learning_rate": 0.00017801204972307067, + "loss": 1.0215, + "step": 4849 + }, + { + "epoch": 0.8636039886039886, + "grad_norm": 0.5321891903877258, + "learning_rate": 0.0001780032917302679, + "loss": 1.0187, + "step": 4850 + }, + { + "epoch": 0.8637820512820513, + "grad_norm": 0.5543400049209595, + "learning_rate": 0.0001779945322091527, + "loss": 1.1972, + "step": 4851 + }, + { + "epoch": 0.863960113960114, + "grad_norm": 0.566649317741394, + "learning_rate": 0.00017798577115989668, + "loss": 1.0758, + "step": 4852 + }, + { + "epoch": 0.8641381766381766, + "grad_norm": 0.5538444519042969, + "learning_rate": 0.00017797700858267145, + "loss": 1.1338, + "step": 4853 + }, + { + "epoch": 
0.8643162393162394, + "grad_norm": 0.5641313791275024, + "learning_rate": 0.0001779682444776487, + "loss": 1.256, + "step": 4854 + }, + { + "epoch": 0.864494301994302, + "grad_norm": 0.6377350091934204, + "learning_rate": 0.00017795947884500016, + "loss": 1.144, + "step": 4855 + }, + { + "epoch": 0.8646723646723646, + "grad_norm": 0.5581876039505005, + "learning_rate": 0.0001779507116848976, + "loss": 1.3163, + "step": 4856 + }, + { + "epoch": 0.8648504273504274, + "grad_norm": 0.5416772365570068, + "learning_rate": 0.0001779419429975128, + "loss": 1.0219, + "step": 4857 + }, + { + "epoch": 0.86502849002849, + "grad_norm": 0.5450608730316162, + "learning_rate": 0.0001779331727830175, + "loss": 1.0093, + "step": 4858 + }, + { + "epoch": 0.8652065527065527, + "grad_norm": 0.5151242017745972, + "learning_rate": 0.00017792440104158358, + "loss": 1.067, + "step": 4859 + }, + { + "epoch": 0.8653846153846154, + "grad_norm": 0.5225046873092651, + "learning_rate": 0.0001779156277733829, + "loss": 1.0432, + "step": 4860 + }, + { + "epoch": 0.8655626780626781, + "grad_norm": 0.5168602466583252, + "learning_rate": 0.00017790685297858737, + "loss": 0.9665, + "step": 4861 + }, + { + "epoch": 0.8657407407407407, + "grad_norm": 0.5749059319496155, + "learning_rate": 0.00017789807665736889, + "loss": 1.1607, + "step": 4862 + }, + { + "epoch": 0.8659188034188035, + "grad_norm": 0.45656394958496094, + "learning_rate": 0.00017788929880989938, + "loss": 0.8362, + "step": 4863 + }, + { + "epoch": 0.8660968660968661, + "grad_norm": 0.5090615749359131, + "learning_rate": 0.00017788051943635086, + "loss": 0.9553, + "step": 4864 + }, + { + "epoch": 0.8662749287749287, + "grad_norm": 0.5381240248680115, + "learning_rate": 0.0001778717385368954, + "loss": 1.1391, + "step": 4865 + }, + { + "epoch": 0.8664529914529915, + "grad_norm": 0.522720456123352, + "learning_rate": 0.00017786295611170493, + "loss": 1.1869, + "step": 4866 + }, + { + "epoch": 0.8666310541310541, + "grad_norm": 
0.530986487865448, + "learning_rate": 0.0001778541721609516, + "loss": 1.1046, + "step": 4867 + }, + { + "epoch": 0.8668091168091168, + "grad_norm": 0.5065864324569702, + "learning_rate": 0.0001778453866848075, + "loss": 1.008, + "step": 4868 + }, + { + "epoch": 0.8669871794871795, + "grad_norm": 0.5541394352912903, + "learning_rate": 0.00017783659968344476, + "loss": 1.0004, + "step": 4869 + }, + { + "epoch": 0.8671652421652422, + "grad_norm": 0.5059576630592346, + "learning_rate": 0.00017782781115703556, + "loss": 1.128, + "step": 4870 + }, + { + "epoch": 0.8673433048433048, + "grad_norm": 0.5052187442779541, + "learning_rate": 0.00017781902110575203, + "loss": 0.8544, + "step": 4871 + }, + { + "epoch": 0.8675213675213675, + "grad_norm": 0.5383397340774536, + "learning_rate": 0.00017781022952976646, + "loss": 1.1411, + "step": 4872 + }, + { + "epoch": 0.8676994301994302, + "grad_norm": 0.4760429859161377, + "learning_rate": 0.00017780143642925106, + "loss": 0.8246, + "step": 4873 + }, + { + "epoch": 0.8678774928774928, + "grad_norm": 0.5480535626411438, + "learning_rate": 0.00017779264180437817, + "loss": 1.013, + "step": 4874 + }, + { + "epoch": 0.8680555555555556, + "grad_norm": 0.5303317904472351, + "learning_rate": 0.00017778384565532004, + "loss": 1.0201, + "step": 4875 + }, + { + "epoch": 0.8682336182336182, + "grad_norm": 0.5365355014801025, + "learning_rate": 0.00017777504798224903, + "loss": 1.1107, + "step": 4876 + }, + { + "epoch": 0.8684116809116809, + "grad_norm": 0.5173360705375671, + "learning_rate": 0.00017776624878533754, + "loss": 1.0808, + "step": 4877 + }, + { + "epoch": 0.8685897435897436, + "grad_norm": 0.5088842511177063, + "learning_rate": 0.00017775744806475792, + "loss": 0.995, + "step": 4878 + }, + { + "epoch": 0.8687678062678063, + "grad_norm": 0.5796698927879333, + "learning_rate": 0.00017774864582068264, + "loss": 1.1485, + "step": 4879 + }, + { + "epoch": 0.8689458689458689, + "grad_norm": 0.5719375610351562, + "learning_rate": 
0.00017773984205328417, + "loss": 1.0133, + "step": 4880 + }, + { + "epoch": 0.8691239316239316, + "grad_norm": 0.6396418213844299, + "learning_rate": 0.00017773103676273498, + "loss": 1.0932, + "step": 4881 + }, + { + "epoch": 0.8693019943019943, + "grad_norm": 0.5602468252182007, + "learning_rate": 0.00017772222994920763, + "loss": 0.9702, + "step": 4882 + }, + { + "epoch": 0.8694800569800569, + "grad_norm": 0.5167748332023621, + "learning_rate": 0.00017771342161287457, + "loss": 1.0528, + "step": 4883 + }, + { + "epoch": 0.8696581196581197, + "grad_norm": 0.5572916865348816, + "learning_rate": 0.00017770461175390848, + "loss": 1.1341, + "step": 4884 + }, + { + "epoch": 0.8698361823361823, + "grad_norm": 0.6666276454925537, + "learning_rate": 0.00017769580037248195, + "loss": 1.1948, + "step": 4885 + }, + { + "epoch": 0.8700142450142451, + "grad_norm": 0.5348601937294006, + "learning_rate": 0.0001776869874687676, + "loss": 1.0562, + "step": 4886 + }, + { + "epoch": 0.8701923076923077, + "grad_norm": 0.5449648499488831, + "learning_rate": 0.00017767817304293812, + "loss": 0.988, + "step": 4887 + }, + { + "epoch": 0.8703703703703703, + "grad_norm": 0.5995045304298401, + "learning_rate": 0.0001776693570951662, + "loss": 1.2526, + "step": 4888 + }, + { + "epoch": 0.8705484330484331, + "grad_norm": 0.6575320959091187, + "learning_rate": 0.00017766053962562457, + "loss": 1.1717, + "step": 4889 + }, + { + "epoch": 0.8707264957264957, + "grad_norm": 0.5882139801979065, + "learning_rate": 0.00017765172063448597, + "loss": 1.238, + "step": 4890 + }, + { + "epoch": 0.8709045584045584, + "grad_norm": 0.5908389091491699, + "learning_rate": 0.00017764290012192325, + "loss": 1.0606, + "step": 4891 + }, + { + "epoch": 0.8710826210826211, + "grad_norm": 0.6169339418411255, + "learning_rate": 0.00017763407808810917, + "loss": 1.1456, + "step": 4892 + }, + { + "epoch": 0.8712606837606838, + "grad_norm": 0.5916035771369934, + "learning_rate": 0.0001776252545332166, + "loss": 1.0026, 
+ "step": 4893 + }, + { + "epoch": 0.8714387464387464, + "grad_norm": 0.539995551109314, + "learning_rate": 0.00017761642945741843, + "loss": 1.2397, + "step": 4894 + }, + { + "epoch": 0.8716168091168092, + "grad_norm": 0.5346137881278992, + "learning_rate": 0.00017760760286088755, + "loss": 1.1232, + "step": 4895 + }, + { + "epoch": 0.8717948717948718, + "grad_norm": 0.570202112197876, + "learning_rate": 0.00017759877474379692, + "loss": 1.0708, + "step": 4896 + }, + { + "epoch": 0.8719729344729344, + "grad_norm": 0.5023398399353027, + "learning_rate": 0.00017758994510631948, + "loss": 1.1056, + "step": 4897 + }, + { + "epoch": 0.8721509971509972, + "grad_norm": 0.5447137951850891, + "learning_rate": 0.00017758111394862826, + "loss": 0.8776, + "step": 4898 + }, + { + "epoch": 0.8723290598290598, + "grad_norm": 0.5193906426429749, + "learning_rate": 0.00017757228127089625, + "loss": 0.9959, + "step": 4899 + }, + { + "epoch": 0.8725071225071225, + "grad_norm": 0.5958787798881531, + "learning_rate": 0.00017756344707329656, + "loss": 1.092, + "step": 4900 + }, + { + "epoch": 0.8726851851851852, + "grad_norm": 0.521045982837677, + "learning_rate": 0.00017755461135600221, + "loss": 0.9864, + "step": 4901 + }, + { + "epoch": 0.8728632478632479, + "grad_norm": 0.5257635116577148, + "learning_rate": 0.00017754577411918638, + "loss": 1.216, + "step": 4902 + }, + { + "epoch": 0.8730413105413105, + "grad_norm": 0.5425964593887329, + "learning_rate": 0.0001775369353630222, + "loss": 1.1432, + "step": 4903 + }, + { + "epoch": 0.8732193732193733, + "grad_norm": 0.47995322942733765, + "learning_rate": 0.00017752809508768286, + "loss": 1.0227, + "step": 4904 + }, + { + "epoch": 0.8733974358974359, + "grad_norm": 0.5747429728507996, + "learning_rate": 0.0001775192532933415, + "loss": 0.9984, + "step": 4905 + }, + { + "epoch": 0.8735754985754985, + "grad_norm": 0.5745723247528076, + "learning_rate": 0.00017751040998017142, + "loss": 1.2559, + "step": 4906 + }, + { + "epoch": 
0.8737535612535613, + "grad_norm": 0.6114141941070557, + "learning_rate": 0.0001775015651483459, + "loss": 1.3224, + "step": 4907 + }, + { + "epoch": 0.8739316239316239, + "grad_norm": 0.4757187068462372, + "learning_rate": 0.00017749271879803817, + "loss": 1.0352, + "step": 4908 + }, + { + "epoch": 0.8741096866096866, + "grad_norm": 0.48644450306892395, + "learning_rate": 0.0001774838709294216, + "loss": 1.0876, + "step": 4909 + }, + { + "epoch": 0.8742877492877493, + "grad_norm": 0.5652037262916565, + "learning_rate": 0.00017747502154266955, + "loss": 0.9189, + "step": 4910 + }, + { + "epoch": 0.874465811965812, + "grad_norm": 0.5289644002914429, + "learning_rate": 0.00017746617063795538, + "loss": 0.9431, + "step": 4911 + }, + { + "epoch": 0.8746438746438746, + "grad_norm": 0.594656229019165, + "learning_rate": 0.00017745731821545253, + "loss": 1.2408, + "step": 4912 + }, + { + "epoch": 0.8748219373219374, + "grad_norm": 0.5693240165710449, + "learning_rate": 0.0001774484642753344, + "loss": 1.347, + "step": 4913 + }, + { + "epoch": 0.875, + "grad_norm": 0.5291008949279785, + "learning_rate": 0.00017743960881777456, + "loss": 1.161, + "step": 4914 + }, + { + "epoch": 0.8751780626780626, + "grad_norm": 0.5958300232887268, + "learning_rate": 0.00017743075184294642, + "loss": 1.2058, + "step": 4915 + }, + { + "epoch": 0.8753561253561254, + "grad_norm": 0.513884425163269, + "learning_rate": 0.00017742189335102354, + "loss": 1.0952, + "step": 4916 + }, + { + "epoch": 0.875534188034188, + "grad_norm": 0.5860681533813477, + "learning_rate": 0.00017741303334217948, + "loss": 1.1801, + "step": 4917 + }, + { + "epoch": 0.8757122507122507, + "grad_norm": 0.47962820529937744, + "learning_rate": 0.00017740417181658788, + "loss": 1.0785, + "step": 4918 + }, + { + "epoch": 0.8758903133903134, + "grad_norm": 0.5110440254211426, + "learning_rate": 0.00017739530877442227, + "loss": 1.1385, + "step": 4919 + }, + { + "epoch": 0.8760683760683761, + "grad_norm": 0.5106285214424133, + 
"learning_rate": 0.00017738644421585643, + "loss": 1.1204, + "step": 4920 + }, + { + "epoch": 0.8762464387464387, + "grad_norm": 0.5709205865859985, + "learning_rate": 0.00017737757814106393, + "loss": 1.0108, + "step": 4921 + }, + { + "epoch": 0.8764245014245015, + "grad_norm": 0.5850250124931335, + "learning_rate": 0.0001773687105502185, + "loss": 1.0059, + "step": 4922 + }, + { + "epoch": 0.8766025641025641, + "grad_norm": 0.5194727778434753, + "learning_rate": 0.00017735984144349396, + "loss": 0.9466, + "step": 4923 + }, + { + "epoch": 0.8767806267806267, + "grad_norm": 0.5246787667274475, + "learning_rate": 0.000177350970821064, + "loss": 1.1336, + "step": 4924 + }, + { + "epoch": 0.8769586894586895, + "grad_norm": 0.5798323154449463, + "learning_rate": 0.00017734209868310244, + "loss": 1.1641, + "step": 4925 + }, + { + "epoch": 0.8771367521367521, + "grad_norm": 0.5188565850257874, + "learning_rate": 0.00017733322502978314, + "loss": 0.9959, + "step": 4926 + }, + { + "epoch": 0.8773148148148148, + "grad_norm": 0.5969653725624084, + "learning_rate": 0.00017732434986127995, + "loss": 1.2162, + "step": 4927 + }, + { + "epoch": 0.8774928774928775, + "grad_norm": 0.5520089268684387, + "learning_rate": 0.00017731547317776674, + "loss": 1.0163, + "step": 4928 + }, + { + "epoch": 0.8776709401709402, + "grad_norm": 0.48789507150650024, + "learning_rate": 0.00017730659497941745, + "loss": 0.9757, + "step": 4929 + }, + { + "epoch": 0.8778490028490028, + "grad_norm": 0.6034960746765137, + "learning_rate": 0.000177297715266406, + "loss": 1.1278, + "step": 4930 + }, + { + "epoch": 0.8780270655270656, + "grad_norm": 0.53016597032547, + "learning_rate": 0.00017728883403890638, + "loss": 1.0637, + "step": 4931 + }, + { + "epoch": 0.8782051282051282, + "grad_norm": 0.5073726177215576, + "learning_rate": 0.00017727995129709266, + "loss": 1.1491, + "step": 4932 + }, + { + "epoch": 0.8783831908831908, + "grad_norm": 0.540605366230011, + "learning_rate": 0.00017727106704113878, + 
"loss": 1.0133, + "step": 4933 + }, + { + "epoch": 0.8785612535612536, + "grad_norm": 0.5346775054931641, + "learning_rate": 0.0001772621812712189, + "loss": 1.1781, + "step": 4934 + }, + { + "epoch": 0.8787393162393162, + "grad_norm": 0.5659036040306091, + "learning_rate": 0.00017725329398750702, + "loss": 1.1023, + "step": 4935 + }, + { + "epoch": 0.8789173789173789, + "grad_norm": 0.591063380241394, + "learning_rate": 0.00017724440519017738, + "loss": 1.0298, + "step": 4936 + }, + { + "epoch": 0.8790954415954416, + "grad_norm": 0.5173781514167786, + "learning_rate": 0.0001772355148794041, + "loss": 1.0483, + "step": 4937 + }, + { + "epoch": 0.8792735042735043, + "grad_norm": 0.5405352711677551, + "learning_rate": 0.0001772266230553613, + "loss": 1.0716, + "step": 4938 + }, + { + "epoch": 0.8794515669515669, + "grad_norm": 0.518442690372467, + "learning_rate": 0.00017721772971822323, + "loss": 1.1373, + "step": 4939 + }, + { + "epoch": 0.8796296296296297, + "grad_norm": 0.533673107624054, + "learning_rate": 0.0001772088348681642, + "loss": 1.0489, + "step": 4940 + }, + { + "epoch": 0.8798076923076923, + "grad_norm": 0.46117857098579407, + "learning_rate": 0.0001771999385053584, + "loss": 1.0297, + "step": 4941 + }, + { + "epoch": 0.8799857549857549, + "grad_norm": 0.4687997102737427, + "learning_rate": 0.0001771910406299802, + "loss": 1.071, + "step": 4942 + }, + { + "epoch": 0.8801638176638177, + "grad_norm": 0.5064153075218201, + "learning_rate": 0.0001771821412422039, + "loss": 0.9518, + "step": 4943 + }, + { + "epoch": 0.8803418803418803, + "grad_norm": 0.6561978459358215, + "learning_rate": 0.00017717324034220385, + "loss": 1.11, + "step": 4944 + }, + { + "epoch": 0.8805199430199431, + "grad_norm": 0.5551498532295227, + "learning_rate": 0.00017716433793015454, + "loss": 0.9719, + "step": 4945 + }, + { + "epoch": 0.8806980056980057, + "grad_norm": 0.47059500217437744, + "learning_rate": 0.00017715543400623025, + "loss": 0.8891, + "step": 4946 + }, + { + 
"epoch": 0.8808760683760684, + "grad_norm": 0.5035740733146667, + "learning_rate": 0.00017714652857060554, + "loss": 0.9671, + "step": 4947 + }, + { + "epoch": 0.8810541310541311, + "grad_norm": 0.4599960446357727, + "learning_rate": 0.00017713762162345487, + "loss": 0.9588, + "step": 4948 + }, + { + "epoch": 0.8812321937321937, + "grad_norm": 0.5087231397628784, + "learning_rate": 0.0001771287131649527, + "loss": 1.1433, + "step": 4949 + }, + { + "epoch": 0.8814102564102564, + "grad_norm": 0.5609854459762573, + "learning_rate": 0.00017711980319527366, + "loss": 1.2022, + "step": 4950 + }, + { + "epoch": 0.8815883190883191, + "grad_norm": 0.49460700154304504, + "learning_rate": 0.00017711089171459227, + "loss": 1.019, + "step": 4951 + }, + { + "epoch": 0.8817663817663818, + "grad_norm": 0.5047259330749512, + "learning_rate": 0.00017710197872308314, + "loss": 0.8301, + "step": 4952 + }, + { + "epoch": 0.8819444444444444, + "grad_norm": 0.5784406065940857, + "learning_rate": 0.0001770930642209209, + "loss": 0.9336, + "step": 4953 + }, + { + "epoch": 0.8821225071225072, + "grad_norm": 0.5037121772766113, + "learning_rate": 0.00017708414820828022, + "loss": 1.0199, + "step": 4954 + }, + { + "epoch": 0.8823005698005698, + "grad_norm": 0.5683804750442505, + "learning_rate": 0.00017707523068533575, + "loss": 0.9758, + "step": 4955 + }, + { + "epoch": 0.8824786324786325, + "grad_norm": 0.5167922973632812, + "learning_rate": 0.0001770663116522623, + "loss": 1.0389, + "step": 4956 + }, + { + "epoch": 0.8826566951566952, + "grad_norm": 0.5813606381416321, + "learning_rate": 0.0001770573911092345, + "loss": 1.3998, + "step": 4957 + }, + { + "epoch": 0.8828347578347578, + "grad_norm": 0.5280475616455078, + "learning_rate": 0.00017704846905642723, + "loss": 1.0545, + "step": 4958 + }, + { + "epoch": 0.8830128205128205, + "grad_norm": 0.5421732068061829, + "learning_rate": 0.00017703954549401528, + "loss": 0.899, + "step": 4959 + }, + { + "epoch": 0.8831908831908832, + 
"grad_norm": 0.5177720189094543, + "learning_rate": 0.00017703062042217344, + "loss": 0.975, + "step": 4960 + }, + { + "epoch": 0.8833689458689459, + "grad_norm": 0.639327883720398, + "learning_rate": 0.00017702169384107666, + "loss": 1.1936, + "step": 4961 + }, + { + "epoch": 0.8835470085470085, + "grad_norm": 0.5201572179794312, + "learning_rate": 0.00017701276575089975, + "loss": 0.9891, + "step": 4962 + }, + { + "epoch": 0.8837250712250713, + "grad_norm": 0.5304145216941833, + "learning_rate": 0.00017700383615181767, + "loss": 1.0569, + "step": 4963 + }, + { + "epoch": 0.8839031339031339, + "grad_norm": 0.6068132519721985, + "learning_rate": 0.00017699490504400538, + "loss": 1.2653, + "step": 4964 + }, + { + "epoch": 0.8840811965811965, + "grad_norm": 0.597895085811615, + "learning_rate": 0.00017698597242763787, + "loss": 1.2577, + "step": 4965 + }, + { + "epoch": 0.8842592592592593, + "grad_norm": 0.5356902480125427, + "learning_rate": 0.00017697703830289017, + "loss": 1.1056, + "step": 4966 + }, + { + "epoch": 0.8844373219373219, + "grad_norm": 0.5429540872573853, + "learning_rate": 0.0001769681026699373, + "loss": 1.0951, + "step": 4967 + }, + { + "epoch": 0.8846153846153846, + "grad_norm": 0.5789309144020081, + "learning_rate": 0.00017695916552895436, + "loss": 1.0786, + "step": 4968 + }, + { + "epoch": 0.8847934472934473, + "grad_norm": 0.5621341466903687, + "learning_rate": 0.0001769502268801164, + "loss": 1.0645, + "step": 4969 + }, + { + "epoch": 0.88497150997151, + "grad_norm": 0.5879453420639038, + "learning_rate": 0.00017694128672359865, + "loss": 1.2171, + "step": 4970 + }, + { + "epoch": 0.8851495726495726, + "grad_norm": 0.5005951523780823, + "learning_rate": 0.0001769323450595762, + "loss": 1.0725, + "step": 4971 + }, + { + "epoch": 0.8853276353276354, + "grad_norm": 0.5439660549163818, + "learning_rate": 0.00017692340188822425, + "loss": 1.162, + "step": 4972 + }, + { + "epoch": 0.885505698005698, + "grad_norm": 0.6309837698936462, + 
"learning_rate": 0.00017691445720971802, + "loss": 1.2861, + "step": 4973 + }, + { + "epoch": 0.8856837606837606, + "grad_norm": 0.4997463822364807, + "learning_rate": 0.00017690551102423282, + "loss": 1.1887, + "step": 4974 + }, + { + "epoch": 0.8858618233618234, + "grad_norm": 0.5430852174758911, + "learning_rate": 0.00017689656333194385, + "loss": 1.1231, + "step": 4975 + }, + { + "epoch": 0.886039886039886, + "grad_norm": 0.5414215922355652, + "learning_rate": 0.00017688761413302644, + "loss": 1.2345, + "step": 4976 + }, + { + "epoch": 0.8862179487179487, + "grad_norm": 0.5594443082809448, + "learning_rate": 0.00017687866342765601, + "loss": 1.0775, + "step": 4977 + }, + { + "epoch": 0.8863960113960114, + "grad_norm": 0.5827134847640991, + "learning_rate": 0.00017686971121600787, + "loss": 1.0609, + "step": 4978 + }, + { + "epoch": 0.8865740740740741, + "grad_norm": 0.5075414776802063, + "learning_rate": 0.00017686075749825738, + "loss": 0.796, + "step": 4979 + }, + { + "epoch": 0.8867521367521367, + "grad_norm": 0.6007544994354248, + "learning_rate": 0.00017685180227458003, + "loss": 1.1716, + "step": 4980 + }, + { + "epoch": 0.8869301994301995, + "grad_norm": 0.6458030343055725, + "learning_rate": 0.00017684284554515128, + "loss": 1.1945, + "step": 4981 + }, + { + "epoch": 0.8871082621082621, + "grad_norm": 0.5519212484359741, + "learning_rate": 0.00017683388731014657, + "loss": 1.2571, + "step": 4982 + }, + { + "epoch": 0.8872863247863247, + "grad_norm": 0.5079960227012634, + "learning_rate": 0.00017682492756974146, + "loss": 1.1186, + "step": 4983 + }, + { + "epoch": 0.8874643874643875, + "grad_norm": 0.63576740026474, + "learning_rate": 0.00017681596632411147, + "loss": 1.389, + "step": 4984 + }, + { + "epoch": 0.8876424501424501, + "grad_norm": 0.43325698375701904, + "learning_rate": 0.0001768070035734322, + "loss": 0.7757, + "step": 4985 + }, + { + "epoch": 0.8878205128205128, + "grad_norm": 0.49492064118385315, + "learning_rate": 0.00017679803931787923, 
+ "loss": 1.0096, + "step": 4986 + }, + { + "epoch": 0.8879985754985755, + "grad_norm": 0.5561224222183228, + "learning_rate": 0.00017678907355762825, + "loss": 0.952, + "step": 4987 + }, + { + "epoch": 0.8881766381766382, + "grad_norm": 0.5392457246780396, + "learning_rate": 0.00017678010629285486, + "loss": 1.0442, + "step": 4988 + }, + { + "epoch": 0.8883547008547008, + "grad_norm": 0.4659234881401062, + "learning_rate": 0.00017677113752373482, + "loss": 0.8668, + "step": 4989 + }, + { + "epoch": 0.8885327635327636, + "grad_norm": 0.5139175057411194, + "learning_rate": 0.0001767621672504438, + "loss": 0.8386, + "step": 4990 + }, + { + "epoch": 0.8887108262108262, + "grad_norm": 0.5395823121070862, + "learning_rate": 0.00017675319547315755, + "loss": 0.9754, + "step": 4991 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.4751867949962616, + "learning_rate": 0.0001767442221920519, + "loss": 0.8775, + "step": 4992 + }, + { + "epoch": 0.8890669515669516, + "grad_norm": 0.5728281736373901, + "learning_rate": 0.00017673524740730265, + "loss": 1.2807, + "step": 4993 + }, + { + "epoch": 0.8892450142450142, + "grad_norm": 0.5545622110366821, + "learning_rate": 0.00017672627111908558, + "loss": 1.0039, + "step": 4994 + }, + { + "epoch": 0.8894230769230769, + "grad_norm": 0.5127374529838562, + "learning_rate": 0.00017671729332757665, + "loss": 1.0505, + "step": 4995 + }, + { + "epoch": 0.8896011396011396, + "grad_norm": 0.5238714218139648, + "learning_rate": 0.00017670831403295175, + "loss": 1.1775, + "step": 4996 + }, + { + "epoch": 0.8897792022792023, + "grad_norm": 0.5610160827636719, + "learning_rate": 0.00017669933323538674, + "loss": 1.0555, + "step": 4997 + }, + { + "epoch": 0.8899572649572649, + "grad_norm": 0.5481634736061096, + "learning_rate": 0.00017669035093505762, + "loss": 1.0802, + "step": 4998 + }, + { + "epoch": 0.8901353276353277, + "grad_norm": 0.4725174307823181, + "learning_rate": 0.0001766813671321404, + "loss": 0.9611, + "step": 4999 + }, + 
{ + "epoch": 0.8903133903133903, + "grad_norm": 0.5184635519981384, + "learning_rate": 0.0001766723818268111, + "loss": 1.1659, + "step": 5000 + }, + { + "epoch": 0.8904914529914529, + "grad_norm": 0.5503578186035156, + "learning_rate": 0.00017666339501924575, + "loss": 1.2165, + "step": 5001 + }, + { + "epoch": 0.8906695156695157, + "grad_norm": 0.5299594402313232, + "learning_rate": 0.0001766544067096204, + "loss": 1.0196, + "step": 5002 + }, + { + "epoch": 0.8908475783475783, + "grad_norm": 0.5673944354057312, + "learning_rate": 0.00017664541689811118, + "loss": 1.2058, + "step": 5003 + }, + { + "epoch": 0.8910256410256411, + "grad_norm": 0.6057320833206177, + "learning_rate": 0.00017663642558489426, + "loss": 1.0136, + "step": 5004 + }, + { + "epoch": 0.8912037037037037, + "grad_norm": 0.4767026901245117, + "learning_rate": 0.00017662743277014578, + "loss": 0.8522, + "step": 5005 + }, + { + "epoch": 0.8913817663817664, + "grad_norm": 0.5346270203590393, + "learning_rate": 0.00017661843845404192, + "loss": 1.1568, + "step": 5006 + }, + { + "epoch": 0.8915598290598291, + "grad_norm": 0.5365738868713379, + "learning_rate": 0.00017660944263675891, + "loss": 1.0488, + "step": 5007 + }, + { + "epoch": 0.8917378917378918, + "grad_norm": 0.5536269545555115, + "learning_rate": 0.00017660044531847305, + "loss": 1.1216, + "step": 5008 + }, + { + "epoch": 0.8919159544159544, + "grad_norm": 0.6325978636741638, + "learning_rate": 0.00017659144649936055, + "loss": 1.2843, + "step": 5009 + }, + { + "epoch": 0.8920940170940171, + "grad_norm": 0.5890641212463379, + "learning_rate": 0.00017658244617959777, + "loss": 1.1976, + "step": 5010 + }, + { + "epoch": 0.8922720797720798, + "grad_norm": 0.604870080947876, + "learning_rate": 0.00017657344435936107, + "loss": 1.2881, + "step": 5011 + }, + { + "epoch": 0.8924501424501424, + "grad_norm": 0.49805206060409546, + "learning_rate": 0.00017656444103882676, + "loss": 0.8998, + "step": 5012 + }, + { + "epoch": 0.8926282051282052, + 
"grad_norm": 0.506926953792572, + "learning_rate": 0.0001765554362181713, + "loss": 1.0731, + "step": 5013 + }, + { + "epoch": 0.8928062678062678, + "grad_norm": 0.5353260636329651, + "learning_rate": 0.0001765464298975711, + "loss": 1.0676, + "step": 5014 + }, + { + "epoch": 0.8929843304843305, + "grad_norm": 0.5641853213310242, + "learning_rate": 0.0001765374220772026, + "loss": 0.9606, + "step": 5015 + }, + { + "epoch": 0.8931623931623932, + "grad_norm": 0.5049327611923218, + "learning_rate": 0.00017652841275724233, + "loss": 1.009, + "step": 5016 + }, + { + "epoch": 0.8933404558404558, + "grad_norm": 0.6255155205726624, + "learning_rate": 0.0001765194019378668, + "loss": 1.138, + "step": 5017 + }, + { + "epoch": 0.8935185185185185, + "grad_norm": 0.5816851854324341, + "learning_rate": 0.00017651038961925247, + "loss": 1.3398, + "step": 5018 + }, + { + "epoch": 0.8936965811965812, + "grad_norm": 0.5188020467758179, + "learning_rate": 0.00017650137580157605, + "loss": 1.0126, + "step": 5019 + }, + { + "epoch": 0.8938746438746439, + "grad_norm": 0.5231554508209229, + "learning_rate": 0.00017649236048501406, + "loss": 1.0328, + "step": 5020 + }, + { + "epoch": 0.8940527065527065, + "grad_norm": 0.7638634443283081, + "learning_rate": 0.0001764833436697432, + "loss": 1.3016, + "step": 5021 + }, + { + "epoch": 0.8942307692307693, + "grad_norm": 0.5354094505310059, + "learning_rate": 0.00017647432535594008, + "loss": 1.0646, + "step": 5022 + }, + { + "epoch": 0.8944088319088319, + "grad_norm": 0.6938086748123169, + "learning_rate": 0.0001764653055437814, + "loss": 1.2051, + "step": 5023 + }, + { + "epoch": 0.8945868945868946, + "grad_norm": 0.5546849370002747, + "learning_rate": 0.00017645628423344393, + "loss": 1.0671, + "step": 5024 + }, + { + "epoch": 0.8947649572649573, + "grad_norm": 0.49294665455818176, + "learning_rate": 0.0001764472614251044, + "loss": 1.0328, + "step": 5025 + }, + { + "epoch": 0.89494301994302, + "grad_norm": 0.5965796113014221, + 
"learning_rate": 0.00017643823711893956, + "loss": 1.0741, + "step": 5026 + }, + { + "epoch": 0.8951210826210826, + "grad_norm": 0.4846448302268982, + "learning_rate": 0.00017642921131512626, + "loss": 1.0409, + "step": 5027 + }, + { + "epoch": 0.8952991452991453, + "grad_norm": 0.5767390131950378, + "learning_rate": 0.00017642018401384135, + "loss": 1.018, + "step": 5028 + }, + { + "epoch": 0.895477207977208, + "grad_norm": 0.503027617931366, + "learning_rate": 0.00017641115521526167, + "loss": 1.0002, + "step": 5029 + }, + { + "epoch": 0.8956552706552706, + "grad_norm": 0.6668619513511658, + "learning_rate": 0.00017640212491956412, + "loss": 1.2154, + "step": 5030 + }, + { + "epoch": 0.8958333333333334, + "grad_norm": 0.5544148683547974, + "learning_rate": 0.00017639309312692566, + "loss": 1.2701, + "step": 5031 + }, + { + "epoch": 0.896011396011396, + "grad_norm": 0.6026872992515564, + "learning_rate": 0.00017638405983752323, + "loss": 0.9335, + "step": 5032 + }, + { + "epoch": 0.8961894586894587, + "grad_norm": 0.6288694143295288, + "learning_rate": 0.00017637502505153384, + "loss": 0.9075, + "step": 5033 + }, + { + "epoch": 0.8963675213675214, + "grad_norm": 0.4890204966068268, + "learning_rate": 0.00017636598876913446, + "loss": 0.8492, + "step": 5034 + }, + { + "epoch": 0.896545584045584, + "grad_norm": 0.5746598243713379, + "learning_rate": 0.00017635695099050218, + "loss": 1.1557, + "step": 5035 + }, + { + "epoch": 0.8967236467236467, + "grad_norm": 0.5165683031082153, + "learning_rate": 0.00017634791171581405, + "loss": 1.0899, + "step": 5036 + }, + { + "epoch": 0.8969017094017094, + "grad_norm": 0.4621037244796753, + "learning_rate": 0.0001763388709452472, + "loss": 1.0457, + "step": 5037 + }, + { + "epoch": 0.8970797720797721, + "grad_norm": 0.532358705997467, + "learning_rate": 0.00017632982867897876, + "loss": 1.139, + "step": 5038 + }, + { + "epoch": 0.8972578347578347, + "grad_norm": 0.5794399976730347, + "learning_rate": 0.00017632078491718587, + 
"loss": 1.031, + "step": 5039 + }, + { + "epoch": 0.8974358974358975, + "grad_norm": 0.5031905174255371, + "learning_rate": 0.00017631173966004576, + "loss": 0.9508, + "step": 5040 + }, + { + "epoch": 0.8976139601139601, + "grad_norm": 0.6528840065002441, + "learning_rate": 0.00017630269290773564, + "loss": 0.9974, + "step": 5041 + }, + { + "epoch": 0.8977920227920227, + "grad_norm": 0.6007558703422546, + "learning_rate": 0.00017629364466043273, + "loss": 1.0993, + "step": 5042 + }, + { + "epoch": 0.8979700854700855, + "grad_norm": 0.5104095339775085, + "learning_rate": 0.00017628459491831437, + "loss": 0.9175, + "step": 5043 + }, + { + "epoch": 0.8981481481481481, + "grad_norm": 0.5285516977310181, + "learning_rate": 0.00017627554368155782, + "loss": 0.998, + "step": 5044 + }, + { + "epoch": 0.8983262108262108, + "grad_norm": 0.5629046559333801, + "learning_rate": 0.00017626649095034045, + "loss": 1.2021, + "step": 5045 + }, + { + "epoch": 0.8985042735042735, + "grad_norm": 0.57548987865448, + "learning_rate": 0.00017625743672483962, + "loss": 1.2076, + "step": 5046 + }, + { + "epoch": 0.8986823361823362, + "grad_norm": 0.4883024990558624, + "learning_rate": 0.0001762483810052327, + "loss": 0.9761, + "step": 5047 + }, + { + "epoch": 0.8988603988603988, + "grad_norm": 0.6378034949302673, + "learning_rate": 0.0001762393237916972, + "loss": 1.2266, + "step": 5048 + }, + { + "epoch": 0.8990384615384616, + "grad_norm": 0.5201624035835266, + "learning_rate": 0.0001762302650844105, + "loss": 1.247, + "step": 5049 + }, + { + "epoch": 0.8992165242165242, + "grad_norm": 0.5438048243522644, + "learning_rate": 0.0001762212048835501, + "loss": 0.993, + "step": 5050 + }, + { + "epoch": 0.8993945868945868, + "grad_norm": 0.5928253531455994, + "learning_rate": 0.00017621214318929354, + "loss": 1.0469, + "step": 5051 + }, + { + "epoch": 0.8995726495726496, + "grad_norm": 0.6437996625900269, + "learning_rate": 0.00017620308000181831, + "loss": 1.3136, + "step": 5052 + }, + { + 
"epoch": 0.8997507122507122, + "grad_norm": 0.5961456298828125, + "learning_rate": 0.00017619401532130208, + "loss": 1.1495, + "step": 5053 + }, + { + "epoch": 0.8999287749287749, + "grad_norm": 0.497388631105423, + "learning_rate": 0.0001761849491479224, + "loss": 0.7783, + "step": 5054 + }, + { + "epoch": 0.9001068376068376, + "grad_norm": 0.5984451174736023, + "learning_rate": 0.00017617588148185687, + "loss": 1.3115, + "step": 5055 + }, + { + "epoch": 0.9002849002849003, + "grad_norm": 0.549163818359375, + "learning_rate": 0.0001761668123232832, + "loss": 1.1649, + "step": 5056 + }, + { + "epoch": 0.9004629629629629, + "grad_norm": 0.5831968188285828, + "learning_rate": 0.00017615774167237903, + "loss": 1.1749, + "step": 5057 + }, + { + "epoch": 0.9006410256410257, + "grad_norm": 0.5111076235771179, + "learning_rate": 0.00017614866952932214, + "loss": 0.8936, + "step": 5058 + }, + { + "epoch": 0.9008190883190883, + "grad_norm": 0.5740947723388672, + "learning_rate": 0.00017613959589429028, + "loss": 1.2606, + "step": 5059 + }, + { + "epoch": 0.9009971509971509, + "grad_norm": 0.5881099700927734, + "learning_rate": 0.0001761305207674612, + "loss": 1.3682, + "step": 5060 + }, + { + "epoch": 0.9011752136752137, + "grad_norm": 0.5007091760635376, + "learning_rate": 0.00017612144414901268, + "loss": 0.7788, + "step": 5061 + }, + { + "epoch": 0.9013532763532763, + "grad_norm": 0.5127760171890259, + "learning_rate": 0.00017611236603912262, + "loss": 1.0519, + "step": 5062 + }, + { + "epoch": 0.9015313390313391, + "grad_norm": 0.6185184121131897, + "learning_rate": 0.00017610328643796882, + "loss": 1.1672, + "step": 5063 + }, + { + "epoch": 0.9017094017094017, + "grad_norm": 0.49707287549972534, + "learning_rate": 0.00017609420534572926, + "loss": 1.1865, + "step": 5064 + }, + { + "epoch": 0.9018874643874644, + "grad_norm": 0.5667552351951599, + "learning_rate": 0.0001760851227625818, + "loss": 1.1388, + "step": 5065 + }, + { + "epoch": 0.9020655270655271, + 
"grad_norm": 0.50298011302948, + "learning_rate": 0.00017607603868870442, + "loss": 0.9552, + "step": 5066 + }, + { + "epoch": 0.9022435897435898, + "grad_norm": 0.5709219574928284, + "learning_rate": 0.0001760669531242751, + "loss": 1.2636, + "step": 5067 + }, + { + "epoch": 0.9024216524216524, + "grad_norm": 0.4943496286869049, + "learning_rate": 0.0001760578660694718, + "loss": 0.8951, + "step": 5068 + }, + { + "epoch": 0.9025997150997151, + "grad_norm": 0.5475931167602539, + "learning_rate": 0.00017604877752447267, + "loss": 1.1442, + "step": 5069 + }, + { + "epoch": 0.9027777777777778, + "grad_norm": 0.5280239582061768, + "learning_rate": 0.0001760396874894557, + "loss": 0.9537, + "step": 5070 + }, + { + "epoch": 0.9029558404558404, + "grad_norm": 0.5480797290802002, + "learning_rate": 0.000176030595964599, + "loss": 1.1557, + "step": 5071 + }, + { + "epoch": 0.9031339031339032, + "grad_norm": 0.5232734680175781, + "learning_rate": 0.00017602150295008073, + "loss": 1.0219, + "step": 5072 + }, + { + "epoch": 0.9033119658119658, + "grad_norm": 0.5448359251022339, + "learning_rate": 0.000176012408446079, + "loss": 1.1964, + "step": 5073 + }, + { + "epoch": 0.9034900284900285, + "grad_norm": 0.4841914474964142, + "learning_rate": 0.00017600331245277206, + "loss": 1.0667, + "step": 5074 + }, + { + "epoch": 0.9036680911680912, + "grad_norm": 0.5407083630561829, + "learning_rate": 0.0001759942149703381, + "loss": 1.1895, + "step": 5075 + }, + { + "epoch": 0.9038461538461539, + "grad_norm": 0.5140416026115417, + "learning_rate": 0.00017598511599895534, + "loss": 0.9402, + "step": 5076 + }, + { + "epoch": 0.9040242165242165, + "grad_norm": 0.6333765983581543, + "learning_rate": 0.00017597601553880207, + "loss": 1.239, + "step": 5077 + }, + { + "epoch": 0.9042022792022792, + "grad_norm": 0.4996028244495392, + "learning_rate": 0.00017596691359005664, + "loss": 1.0259, + "step": 5078 + }, + { + "epoch": 0.9043803418803419, + "grad_norm": 0.591892421245575, + 
"learning_rate": 0.00017595781015289732, + "loss": 1.2148, + "step": 5079 + }, + { + "epoch": 0.9045584045584045, + "grad_norm": 0.736499011516571, + "learning_rate": 0.0001759487052275025, + "loss": 1.1373, + "step": 5080 + }, + { + "epoch": 0.9047364672364673, + "grad_norm": 0.5951572060585022, + "learning_rate": 0.00017593959881405057, + "loss": 1.1833, + "step": 5081 + }, + { + "epoch": 0.9049145299145299, + "grad_norm": 0.5092006325721741, + "learning_rate": 0.00017593049091271996, + "loss": 0.8841, + "step": 5082 + }, + { + "epoch": 0.9050925925925926, + "grad_norm": 0.5679013729095459, + "learning_rate": 0.0001759213815236891, + "loss": 1.1056, + "step": 5083 + }, + { + "epoch": 0.9052706552706553, + "grad_norm": 0.5708174109458923, + "learning_rate": 0.0001759122706471365, + "loss": 1.1952, + "step": 5084 + }, + { + "epoch": 0.905448717948718, + "grad_norm": 0.5726733803749084, + "learning_rate": 0.00017590315828324067, + "loss": 1.1013, + "step": 5085 + }, + { + "epoch": 0.9056267806267806, + "grad_norm": 0.5821273326873779, + "learning_rate": 0.00017589404443218008, + "loss": 1.2323, + "step": 5086 + }, + { + "epoch": 0.9058048433048433, + "grad_norm": 0.5811445713043213, + "learning_rate": 0.00017588492909413337, + "loss": 1.2241, + "step": 5087 + }, + { + "epoch": 0.905982905982906, + "grad_norm": 0.5377545952796936, + "learning_rate": 0.0001758758122692791, + "loss": 0.9777, + "step": 5088 + }, + { + "epoch": 0.9061609686609686, + "grad_norm": 0.5985640287399292, + "learning_rate": 0.0001758666939577959, + "loss": 0.9737, + "step": 5089 + }, + { + "epoch": 0.9063390313390314, + "grad_norm": 0.6038222908973694, + "learning_rate": 0.00017585757415986247, + "loss": 1.2116, + "step": 5090 + }, + { + "epoch": 0.906517094017094, + "grad_norm": 0.6752246022224426, + "learning_rate": 0.00017584845287565743, + "loss": 1.1975, + "step": 5091 + }, + { + "epoch": 0.9066951566951567, + "grad_norm": 0.5400625467300415, + "learning_rate": 0.0001758393301053595, + 
"loss": 0.9669, + "step": 5092 + }, + { + "epoch": 0.9068732193732194, + "grad_norm": 0.5637784004211426, + "learning_rate": 0.00017583020584914746, + "loss": 1.2672, + "step": 5093 + }, + { + "epoch": 0.907051282051282, + "grad_norm": 0.4825877249240875, + "learning_rate": 0.00017582108010720006, + "loss": 0.9719, + "step": 5094 + }, + { + "epoch": 0.9072293447293447, + "grad_norm": 0.49902790784835815, + "learning_rate": 0.00017581195287969613, + "loss": 0.7941, + "step": 5095 + }, + { + "epoch": 0.9074074074074074, + "grad_norm": 0.5991541743278503, + "learning_rate": 0.0001758028241668144, + "loss": 1.049, + "step": 5096 + }, + { + "epoch": 0.9075854700854701, + "grad_norm": 0.5788859724998474, + "learning_rate": 0.00017579369396873384, + "loss": 1.0318, + "step": 5097 + }, + { + "epoch": 0.9077635327635327, + "grad_norm": 0.5914160013198853, + "learning_rate": 0.0001757845622856333, + "loss": 1.1007, + "step": 5098 + }, + { + "epoch": 0.9079415954415955, + "grad_norm": 0.5361711382865906, + "learning_rate": 0.00017577542911769166, + "loss": 1.0694, + "step": 5099 + }, + { + "epoch": 0.9081196581196581, + "grad_norm": 0.5752849578857422, + "learning_rate": 0.00017576629446508792, + "loss": 1.1184, + "step": 5100 + }, + { + "epoch": 0.9082977207977208, + "grad_norm": 0.6042249798774719, + "learning_rate": 0.000175757158328001, + "loss": 1.2808, + "step": 5101 + }, + { + "epoch": 0.9084757834757835, + "grad_norm": 0.508352518081665, + "learning_rate": 0.00017574802070661, + "loss": 1.0038, + "step": 5102 + }, + { + "epoch": 0.9086538461538461, + "grad_norm": 0.5667358040809631, + "learning_rate": 0.00017573888160109385, + "loss": 1.0208, + "step": 5103 + }, + { + "epoch": 0.9088319088319088, + "grad_norm": 0.653619647026062, + "learning_rate": 0.00017572974101163165, + "loss": 1.2053, + "step": 5104 + }, + { + "epoch": 0.9090099715099715, + "grad_norm": 0.5069597363471985, + "learning_rate": 0.00017572059893840246, + "loss": 0.8634, + "step": 5105 + }, + { + 
"epoch": 0.9091880341880342, + "grad_norm": 0.6160602569580078, + "learning_rate": 0.00017571145538158547, + "loss": 1.2626, + "step": 5106 + }, + { + "epoch": 0.9093660968660968, + "grad_norm": 0.6335833668708801, + "learning_rate": 0.00017570231034135978, + "loss": 1.3381, + "step": 5107 + }, + { + "epoch": 0.9095441595441596, + "grad_norm": 0.5140398740768433, + "learning_rate": 0.00017569316381790454, + "loss": 1.1258, + "step": 5108 + }, + { + "epoch": 0.9097222222222222, + "grad_norm": 0.5682975649833679, + "learning_rate": 0.00017568401581139905, + "loss": 1.3367, + "step": 5109 + }, + { + "epoch": 0.9099002849002849, + "grad_norm": 0.49765729904174805, + "learning_rate": 0.00017567486632202246, + "loss": 1.1891, + "step": 5110 + }, + { + "epoch": 0.9100783475783476, + "grad_norm": 0.5139224529266357, + "learning_rate": 0.00017566571534995406, + "loss": 0.9768, + "step": 5111 + }, + { + "epoch": 0.9102564102564102, + "grad_norm": 0.5510922074317932, + "learning_rate": 0.00017565656289537316, + "loss": 1.1552, + "step": 5112 + }, + { + "epoch": 0.9104344729344729, + "grad_norm": 0.6243364810943604, + "learning_rate": 0.00017564740895845908, + "loss": 1.1341, + "step": 5113 + }, + { + "epoch": 0.9106125356125356, + "grad_norm": 0.5334977507591248, + "learning_rate": 0.00017563825353939116, + "loss": 1.0894, + "step": 5114 + }, + { + "epoch": 0.9107905982905983, + "grad_norm": 0.5195826292037964, + "learning_rate": 0.00017562909663834878, + "loss": 1.1011, + "step": 5115 + }, + { + "epoch": 0.9109686609686609, + "grad_norm": 0.5298168063163757, + "learning_rate": 0.00017561993825551138, + "loss": 1.0079, + "step": 5116 + }, + { + "epoch": 0.9111467236467237, + "grad_norm": 0.5858965516090393, + "learning_rate": 0.00017561077839105835, + "loss": 1.2746, + "step": 5117 + }, + { + "epoch": 0.9113247863247863, + "grad_norm": 0.5572476387023926, + "learning_rate": 0.0001756016170451692, + "loss": 0.8169, + "step": 5118 + }, + { + "epoch": 0.9115028490028491, + 
"grad_norm": 0.5247095823287964, + "learning_rate": 0.0001755924542180234, + "loss": 1.1206, + "step": 5119 + }, + { + "epoch": 0.9116809116809117, + "grad_norm": 0.5605118274688721, + "learning_rate": 0.0001755832899098005, + "loss": 1.371, + "step": 5120 + }, + { + "epoch": 0.9118589743589743, + "grad_norm": 0.5732316970825195, + "learning_rate": 0.00017557412412068005, + "loss": 1.1248, + "step": 5121 + }, + { + "epoch": 0.9120370370370371, + "grad_norm": 0.6167279481887817, + "learning_rate": 0.0001755649568508416, + "loss": 0.94, + "step": 5122 + }, + { + "epoch": 0.9122150997150997, + "grad_norm": 0.5497499108314514, + "learning_rate": 0.00017555578810046483, + "loss": 1.0112, + "step": 5123 + }, + { + "epoch": 0.9123931623931624, + "grad_norm": 0.540762186050415, + "learning_rate": 0.00017554661786972931, + "loss": 1.1058, + "step": 5124 + }, + { + "epoch": 0.9125712250712251, + "grad_norm": 0.5943556427955627, + "learning_rate": 0.0001755374461588148, + "loss": 0.9086, + "step": 5125 + }, + { + "epoch": 0.9127492877492878, + "grad_norm": 0.5300756692886353, + "learning_rate": 0.0001755282729679009, + "loss": 1.1566, + "step": 5126 + }, + { + "epoch": 0.9129273504273504, + "grad_norm": 0.5390434861183167, + "learning_rate": 0.00017551909829716743, + "loss": 1.1395, + "step": 5127 + }, + { + "epoch": 0.9131054131054132, + "grad_norm": 0.627434492111206, + "learning_rate": 0.00017550992214679405, + "loss": 1.1537, + "step": 5128 + }, + { + "epoch": 0.9132834757834758, + "grad_norm": 0.4806903302669525, + "learning_rate": 0.00017550074451696063, + "loss": 0.7905, + "step": 5129 + }, + { + "epoch": 0.9134615384615384, + "grad_norm": 0.5714817047119141, + "learning_rate": 0.00017549156540784696, + "loss": 1.1042, + "step": 5130 + }, + { + "epoch": 0.9136396011396012, + "grad_norm": 0.5839236378669739, + "learning_rate": 0.0001754823848196329, + "loss": 1.0383, + "step": 5131 + }, + { + "epoch": 0.9138176638176638, + "grad_norm": 0.6089872717857361, + 
"learning_rate": 0.0001754732027524983, + "loss": 0.9399, + "step": 5132 + }, + { + "epoch": 0.9139957264957265, + "grad_norm": 0.4937956631183624, + "learning_rate": 0.00017546401920662307, + "loss": 0.7382, + "step": 5133 + }, + { + "epoch": 0.9141737891737892, + "grad_norm": 0.5918676257133484, + "learning_rate": 0.00017545483418218716, + "loss": 1.2207, + "step": 5134 + }, + { + "epoch": 0.9143518518518519, + "grad_norm": 0.5825346112251282, + "learning_rate": 0.0001754456476793705, + "loss": 0.9669, + "step": 5135 + }, + { + "epoch": 0.9145299145299145, + "grad_norm": 0.49829617142677307, + "learning_rate": 0.0001754364596983531, + "loss": 1.2247, + "step": 5136 + }, + { + "epoch": 0.9147079772079773, + "grad_norm": 0.5128271579742432, + "learning_rate": 0.00017542727023931497, + "loss": 0.9563, + "step": 5137 + }, + { + "epoch": 0.9148860398860399, + "grad_norm": 0.5789414644241333, + "learning_rate": 0.00017541807930243622, + "loss": 1.22, + "step": 5138 + }, + { + "epoch": 0.9150641025641025, + "grad_norm": 0.44155433773994446, + "learning_rate": 0.00017540888688789683, + "loss": 0.9897, + "step": 5139 + }, + { + "epoch": 0.9152421652421653, + "grad_norm": 0.550464391708374, + "learning_rate": 0.00017539969299587696, + "loss": 1.0624, + "step": 5140 + }, + { + "epoch": 0.9154202279202279, + "grad_norm": 0.5019831657409668, + "learning_rate": 0.0001753904976265567, + "loss": 0.9045, + "step": 5141 + }, + { + "epoch": 0.9155982905982906, + "grad_norm": 0.589658796787262, + "learning_rate": 0.0001753813007801163, + "loss": 1.0454, + "step": 5142 + }, + { + "epoch": 0.9157763532763533, + "grad_norm": 0.5945459008216858, + "learning_rate": 0.00017537210245673586, + "loss": 1.0042, + "step": 5143 + }, + { + "epoch": 0.915954415954416, + "grad_norm": 0.5409809947013855, + "learning_rate": 0.00017536290265659566, + "loss": 1.0609, + "step": 5144 + }, + { + "epoch": 0.9161324786324786, + "grad_norm": 0.5302975177764893, + "learning_rate": 0.00017535370137987597, + 
"loss": 1.1394, + "step": 5145 + }, + { + "epoch": 0.9163105413105413, + "grad_norm": 0.5253351330757141, + "learning_rate": 0.00017534449862675698, + "loss": 1.2249, + "step": 5146 + }, + { + "epoch": 0.916488603988604, + "grad_norm": 0.6363829970359802, + "learning_rate": 0.00017533529439741908, + "loss": 1.1333, + "step": 5147 + }, + { + "epoch": 0.9166666666666666, + "grad_norm": 0.4703354835510254, + "learning_rate": 0.0001753260886920426, + "loss": 0.9971, + "step": 5148 + }, + { + "epoch": 0.9168447293447294, + "grad_norm": 0.6394907236099243, + "learning_rate": 0.00017531688151080786, + "loss": 1.5942, + "step": 5149 + }, + { + "epoch": 0.917022792022792, + "grad_norm": 0.5573459267616272, + "learning_rate": 0.00017530767285389527, + "loss": 0.9669, + "step": 5150 + }, + { + "epoch": 0.9172008547008547, + "grad_norm": 0.5000962615013123, + "learning_rate": 0.00017529846272148532, + "loss": 1.2151, + "step": 5151 + }, + { + "epoch": 0.9173789173789174, + "grad_norm": 0.5550395846366882, + "learning_rate": 0.0001752892511137584, + "loss": 1.1765, + "step": 5152 + }, + { + "epoch": 0.91755698005698, + "grad_norm": 0.5461394786834717, + "learning_rate": 0.00017528003803089496, + "loss": 1.1136, + "step": 5153 + }, + { + "epoch": 0.9177350427350427, + "grad_norm": 0.5512672662734985, + "learning_rate": 0.00017527082347307558, + "loss": 1.1727, + "step": 5154 + }, + { + "epoch": 0.9179131054131054, + "grad_norm": 0.5210778713226318, + "learning_rate": 0.0001752616074404808, + "loss": 1.09, + "step": 5155 + }, + { + "epoch": 0.9180911680911681, + "grad_norm": 0.5214943289756775, + "learning_rate": 0.00017525238993329115, + "loss": 0.9654, + "step": 5156 + }, + { + "epoch": 0.9182692307692307, + "grad_norm": 0.5822862386703491, + "learning_rate": 0.00017524317095168724, + "loss": 1.0951, + "step": 5157 + }, + { + "epoch": 0.9184472934472935, + "grad_norm": 0.43948012590408325, + "learning_rate": 0.0001752339504958497, + "loss": 0.6984, + "step": 5158 + }, + { + 
"epoch": 0.9186253561253561, + "grad_norm": 0.5024449229240417, + "learning_rate": 0.00017522472856595916, + "loss": 0.983, + "step": 5159 + }, + { + "epoch": 0.9188034188034188, + "grad_norm": 0.5815144181251526, + "learning_rate": 0.00017521550516219636, + "loss": 0.9784, + "step": 5160 + }, + { + "epoch": 0.9189814814814815, + "grad_norm": 0.5519825220108032, + "learning_rate": 0.00017520628028474197, + "loss": 1.064, + "step": 5161 + }, + { + "epoch": 0.9191595441595442, + "grad_norm": 0.5615749955177307, + "learning_rate": 0.00017519705393377675, + "loss": 1.1284, + "step": 5162 + }, + { + "epoch": 0.9193376068376068, + "grad_norm": 0.5929917693138123, + "learning_rate": 0.00017518782610948148, + "loss": 1.1221, + "step": 5163 + }, + { + "epoch": 0.9195156695156695, + "grad_norm": 0.7116361856460571, + "learning_rate": 0.00017517859681203692, + "loss": 1.0188, + "step": 5164 + }, + { + "epoch": 0.9196937321937322, + "grad_norm": 0.5095893740653992, + "learning_rate": 0.00017516936604162396, + "loss": 1.0724, + "step": 5165 + }, + { + "epoch": 0.9198717948717948, + "grad_norm": 0.5701385736465454, + "learning_rate": 0.00017516013379842337, + "loss": 1.0572, + "step": 5166 + }, + { + "epoch": 0.9200498575498576, + "grad_norm": 0.518412709236145, + "learning_rate": 0.00017515090008261613, + "loss": 1.0514, + "step": 5167 + }, + { + "epoch": 0.9202279202279202, + "grad_norm": 0.5324261784553528, + "learning_rate": 0.00017514166489438312, + "loss": 1.1708, + "step": 5168 + }, + { + "epoch": 0.9204059829059829, + "grad_norm": 0.5640990138053894, + "learning_rate": 0.00017513242823390525, + "loss": 1.2846, + "step": 5169 + }, + { + "epoch": 0.9205840455840456, + "grad_norm": 0.510352373123169, + "learning_rate": 0.00017512319010136356, + "loss": 1.0763, + "step": 5170 + }, + { + "epoch": 0.9207621082621082, + "grad_norm": 0.4994175136089325, + "learning_rate": 0.00017511395049693898, + "loss": 0.9665, + "step": 5171 + }, + { + "epoch": 0.9209401709401709, + 
"grad_norm": 0.43196994066238403, + "learning_rate": 0.00017510470942081258, + "loss": 0.761, + "step": 5172 + }, + { + "epoch": 0.9211182336182336, + "grad_norm": 0.558977484703064, + "learning_rate": 0.00017509546687316543, + "loss": 1.0758, + "step": 5173 + }, + { + "epoch": 0.9212962962962963, + "grad_norm": 0.573302149772644, + "learning_rate": 0.0001750862228541786, + "loss": 0.9635, + "step": 5174 + }, + { + "epoch": 0.9214743589743589, + "grad_norm": 0.5083786845207214, + "learning_rate": 0.00017507697736403321, + "loss": 1.0311, + "step": 5175 + }, + { + "epoch": 0.9216524216524217, + "grad_norm": 0.5478954911231995, + "learning_rate": 0.00017506773040291043, + "loss": 1.074, + "step": 5176 + }, + { + "epoch": 0.9218304843304843, + "grad_norm": 0.522376537322998, + "learning_rate": 0.00017505848197099137, + "loss": 1.1162, + "step": 5177 + }, + { + "epoch": 0.9220085470085471, + "grad_norm": 0.5946292281150818, + "learning_rate": 0.0001750492320684573, + "loss": 0.9494, + "step": 5178 + }, + { + "epoch": 0.9221866096866097, + "grad_norm": 0.5423247814178467, + "learning_rate": 0.00017503998069548943, + "loss": 1.0558, + "step": 5179 + }, + { + "epoch": 0.9223646723646723, + "grad_norm": 0.49960651993751526, + "learning_rate": 0.000175030727852269, + "loss": 1.0748, + "step": 5180 + }, + { + "epoch": 0.9225427350427351, + "grad_norm": 0.6066586375236511, + "learning_rate": 0.00017502147353897732, + "loss": 1.2066, + "step": 5181 + }, + { + "epoch": 0.9227207977207977, + "grad_norm": 0.57244473695755, + "learning_rate": 0.00017501221775579576, + "loss": 1.048, + "step": 5182 + }, + { + "epoch": 0.9228988603988604, + "grad_norm": 0.512464165687561, + "learning_rate": 0.00017500296050290557, + "loss": 1.1405, + "step": 5183 + }, + { + "epoch": 0.9230769230769231, + "grad_norm": 0.5380734801292419, + "learning_rate": 0.00017499370178048818, + "loss": 1.0641, + "step": 5184 + }, + { + "epoch": 0.9232549857549858, + "grad_norm": 0.47102874517440796, + 
"learning_rate": 0.000174984441588725, + "loss": 0.7948, + "step": 5185 + }, + { + "epoch": 0.9234330484330484, + "grad_norm": 0.6702211499214172, + "learning_rate": 0.00017497517992779747, + "loss": 1.3009, + "step": 5186 + }, + { + "epoch": 0.9236111111111112, + "grad_norm": 0.4685834050178528, + "learning_rate": 0.000174965916797887, + "loss": 0.8136, + "step": 5187 + }, + { + "epoch": 0.9237891737891738, + "grad_norm": 0.5414277911186218, + "learning_rate": 0.00017495665219917513, + "loss": 0.9708, + "step": 5188 + }, + { + "epoch": 0.9239672364672364, + "grad_norm": 0.5253050923347473, + "learning_rate": 0.0001749473861318434, + "loss": 1.0691, + "step": 5189 + }, + { + "epoch": 0.9241452991452992, + "grad_norm": 0.6009906530380249, + "learning_rate": 0.00017493811859607328, + "loss": 1.2023, + "step": 5190 + }, + { + "epoch": 0.9243233618233618, + "grad_norm": 0.5519336462020874, + "learning_rate": 0.00017492884959204643, + "loss": 1.189, + "step": 5191 + }, + { + "epoch": 0.9245014245014245, + "grad_norm": 0.5024857521057129, + "learning_rate": 0.0001749195791199444, + "loss": 0.8685, + "step": 5192 + }, + { + "epoch": 0.9246794871794872, + "grad_norm": 0.5735679864883423, + "learning_rate": 0.00017491030717994887, + "loss": 1.1903, + "step": 5193 + }, + { + "epoch": 0.9248575498575499, + "grad_norm": 0.5338658094406128, + "learning_rate": 0.00017490103377224147, + "loss": 1.0442, + "step": 5194 + }, + { + "epoch": 0.9250356125356125, + "grad_norm": 0.46669119596481323, + "learning_rate": 0.0001748917588970039, + "loss": 0.6343, + "step": 5195 + }, + { + "epoch": 0.9252136752136753, + "grad_norm": 0.510910153388977, + "learning_rate": 0.00017488248255441793, + "loss": 0.9334, + "step": 5196 + }, + { + "epoch": 0.9253917378917379, + "grad_norm": 0.5732216238975525, + "learning_rate": 0.00017487320474466524, + "loss": 1.0483, + "step": 5197 + }, + { + "epoch": 0.9255698005698005, + "grad_norm": 0.5864318609237671, + "learning_rate": 0.00017486392546792762, + 
"loss": 1.0669, + "step": 5198 + }, + { + "epoch": 0.9257478632478633, + "grad_norm": 0.5074281096458435, + "learning_rate": 0.00017485464472438692, + "loss": 1.0636, + "step": 5199 + }, + { + "epoch": 0.9259259259259259, + "grad_norm": 0.5833215117454529, + "learning_rate": 0.00017484536251422496, + "loss": 1.2005, + "step": 5200 + }, + { + "epoch": 0.9261039886039886, + "grad_norm": 0.5624990463256836, + "learning_rate": 0.0001748360788376236, + "loss": 1.1623, + "step": 5201 + }, + { + "epoch": 0.9262820512820513, + "grad_norm": 0.5618230104446411, + "learning_rate": 0.00017482679369476472, + "loss": 1.0495, + "step": 5202 + }, + { + "epoch": 0.926460113960114, + "grad_norm": 0.6254985332489014, + "learning_rate": 0.00017481750708583024, + "loss": 0.9521, + "step": 5203 + }, + { + "epoch": 0.9266381766381766, + "grad_norm": 0.5488203763961792, + "learning_rate": 0.00017480821901100216, + "loss": 1.0689, + "step": 5204 + }, + { + "epoch": 0.9268162393162394, + "grad_norm": 0.6157993674278259, + "learning_rate": 0.00017479892947046245, + "loss": 1.2852, + "step": 5205 + }, + { + "epoch": 0.926994301994302, + "grad_norm": 0.49653390049934387, + "learning_rate": 0.00017478963846439305, + "loss": 0.8616, + "step": 5206 + }, + { + "epoch": 0.9271723646723646, + "grad_norm": 0.5079081058502197, + "learning_rate": 0.00017478034599297603, + "loss": 1.0192, + "step": 5207 + }, + { + "epoch": 0.9273504273504274, + "grad_norm": 0.5392495393753052, + "learning_rate": 0.00017477105205639354, + "loss": 1.115, + "step": 5208 + }, + { + "epoch": 0.92752849002849, + "grad_norm": 0.5336191654205322, + "learning_rate": 0.00017476175665482756, + "loss": 1.1892, + "step": 5209 + }, + { + "epoch": 0.9277065527065527, + "grad_norm": 0.631712019443512, + "learning_rate": 0.00017475245978846026, + "loss": 0.9619, + "step": 5210 + }, + { + "epoch": 0.9278846153846154, + "grad_norm": 0.5123951435089111, + "learning_rate": 0.0001747431614574738, + "loss": 1.1477, + "step": 5211 + }, + { + 
"epoch": 0.9280626780626781, + "grad_norm": 0.5045743584632874, + "learning_rate": 0.00017473386166205038, + "loss": 0.9749, + "step": 5212 + }, + { + "epoch": 0.9282407407407407, + "grad_norm": 0.5296525359153748, + "learning_rate": 0.00017472456040237217, + "loss": 1.0736, + "step": 5213 + }, + { + "epoch": 0.9284188034188035, + "grad_norm": 0.6304933428764343, + "learning_rate": 0.00017471525767862145, + "loss": 1.2444, + "step": 5214 + }, + { + "epoch": 0.9285968660968661, + "grad_norm": 0.4851958155632019, + "learning_rate": 0.00017470595349098044, + "loss": 0.9049, + "step": 5215 + }, + { + "epoch": 0.9287749287749287, + "grad_norm": 0.5730679631233215, + "learning_rate": 0.00017469664783963148, + "loss": 1.0773, + "step": 5216 + }, + { + "epoch": 0.9289529914529915, + "grad_norm": 0.6020415425300598, + "learning_rate": 0.00017468734072475684, + "loss": 1.3247, + "step": 5217 + }, + { + "epoch": 0.9291310541310541, + "grad_norm": 0.47981077432632446, + "learning_rate": 0.00017467803214653893, + "loss": 1.0009, + "step": 5218 + }, + { + "epoch": 0.9293091168091168, + "grad_norm": 0.5787527561187744, + "learning_rate": 0.0001746687221051601, + "loss": 1.2523, + "step": 5219 + }, + { + "epoch": 0.9294871794871795, + "grad_norm": 0.4495891332626343, + "learning_rate": 0.00017465941060080278, + "loss": 0.7364, + "step": 5220 + }, + { + "epoch": 0.9296652421652422, + "grad_norm": 0.5721768140792847, + "learning_rate": 0.0001746500976336494, + "loss": 1.015, + "step": 5221 + }, + { + "epoch": 0.9298433048433048, + "grad_norm": 0.5500208735466003, + "learning_rate": 0.0001746407832038824, + "loss": 1.053, + "step": 5222 + }, + { + "epoch": 0.9300213675213675, + "grad_norm": 0.5784386992454529, + "learning_rate": 0.00017463146731168437, + "loss": 0.9784, + "step": 5223 + }, + { + "epoch": 0.9301994301994302, + "grad_norm": 0.4960322082042694, + "learning_rate": 0.00017462214995723772, + "loss": 0.8674, + "step": 5224 + }, + { + "epoch": 0.9303774928774928, + 
"grad_norm": 0.5005537271499634, + "learning_rate": 0.00017461283114072508, + "loss": 1.0486, + "step": 5225 + }, + { + "epoch": 0.9305555555555556, + "grad_norm": 0.5064167380332947, + "learning_rate": 0.000174603510862329, + "loss": 0.9722, + "step": 5226 + }, + { + "epoch": 0.9307336182336182, + "grad_norm": 0.583558976650238, + "learning_rate": 0.0001745941891222321, + "loss": 0.9957, + "step": 5227 + }, + { + "epoch": 0.9309116809116809, + "grad_norm": 0.4982515871524811, + "learning_rate": 0.00017458486592061704, + "loss": 0.958, + "step": 5228 + }, + { + "epoch": 0.9310897435897436, + "grad_norm": 0.526549756526947, + "learning_rate": 0.0001745755412576664, + "loss": 1.1172, + "step": 5229 + }, + { + "epoch": 0.9312678062678063, + "grad_norm": 0.6129719018936157, + "learning_rate": 0.000174566215133563, + "loss": 1.2524, + "step": 5230 + }, + { + "epoch": 0.9314458689458689, + "grad_norm": 0.5385653972625732, + "learning_rate": 0.00017455688754848948, + "loss": 1.1655, + "step": 5231 + }, + { + "epoch": 0.9316239316239316, + "grad_norm": 0.5646410584449768, + "learning_rate": 0.0001745475585026287, + "loss": 0.9026, + "step": 5232 + }, + { + "epoch": 0.9318019943019943, + "grad_norm": 0.549223780632019, + "learning_rate": 0.0001745382279961633, + "loss": 0.804, + "step": 5233 + }, + { + "epoch": 0.9319800569800569, + "grad_norm": 0.48547953367233276, + "learning_rate": 0.0001745288960292762, + "loss": 1.0224, + "step": 5234 + }, + { + "epoch": 0.9321581196581197, + "grad_norm": 0.5260967016220093, + "learning_rate": 0.00017451956260215016, + "loss": 0.9688, + "step": 5235 + }, + { + "epoch": 0.9323361823361823, + "grad_norm": 0.6261999011039734, + "learning_rate": 0.00017451022771496812, + "loss": 1.2539, + "step": 5236 + }, + { + "epoch": 0.9325142450142451, + "grad_norm": 0.5801421999931335, + "learning_rate": 0.00017450089136791298, + "loss": 1.11, + "step": 5237 + }, + { + "epoch": 0.9326923076923077, + "grad_norm": 0.5833573937416077, + "learning_rate": 
0.0001744915535611676, + "loss": 0.9328, + "step": 5238 + }, + { + "epoch": 0.9328703703703703, + "grad_norm": 0.5422634482383728, + "learning_rate": 0.00017448221429491496, + "loss": 1.034, + "step": 5239 + }, + { + "epoch": 0.9330484330484331, + "grad_norm": 0.5105658769607544, + "learning_rate": 0.00017447287356933808, + "loss": 0.8924, + "step": 5240 + }, + { + "epoch": 0.9332264957264957, + "grad_norm": 0.5114831924438477, + "learning_rate": 0.00017446353138461995, + "loss": 0.9328, + "step": 5241 + }, + { + "epoch": 0.9334045584045584, + "grad_norm": 0.5105039477348328, + "learning_rate": 0.00017445418774094358, + "loss": 1.0468, + "step": 5242 + }, + { + "epoch": 0.9335826210826211, + "grad_norm": 0.593250036239624, + "learning_rate": 0.00017444484263849208, + "loss": 1.0603, + "step": 5243 + }, + { + "epoch": 0.9337606837606838, + "grad_norm": 0.600788414478302, + "learning_rate": 0.00017443549607744853, + "loss": 1.1506, + "step": 5244 + }, + { + "epoch": 0.9339387464387464, + "grad_norm": 0.5394418239593506, + "learning_rate": 0.00017442614805799605, + "loss": 1.038, + "step": 5245 + }, + { + "epoch": 0.9341168091168092, + "grad_norm": 0.5446375608444214, + "learning_rate": 0.00017441679858031786, + "loss": 1.079, + "step": 5246 + }, + { + "epoch": 0.9342948717948718, + "grad_norm": 0.5859794616699219, + "learning_rate": 0.00017440744764459702, + "loss": 1.1453, + "step": 5247 + }, + { + "epoch": 0.9344729344729344, + "grad_norm": 0.4899081289768219, + "learning_rate": 0.00017439809525101688, + "loss": 1.163, + "step": 5248 + }, + { + "epoch": 0.9346509971509972, + "grad_norm": 0.652846097946167, + "learning_rate": 0.00017438874139976055, + "loss": 1.1819, + "step": 5249 + }, + { + "epoch": 0.9348290598290598, + "grad_norm": 0.5402514934539795, + "learning_rate": 0.00017437938609101138, + "loss": 1.0159, + "step": 5250 + }, + { + "epoch": 0.9350071225071225, + "grad_norm": 0.565864086151123, + "learning_rate": 0.00017437002932495265, + "loss": 1.1121, + 
"step": 5251 + }, + { + "epoch": 0.9351851851851852, + "grad_norm": 0.611786425113678, + "learning_rate": 0.0001743606711017677, + "loss": 1.2511, + "step": 5252 + }, + { + "epoch": 0.9353632478632479, + "grad_norm": 0.5706882476806641, + "learning_rate": 0.00017435131142163988, + "loss": 1.128, + "step": 5253 + }, + { + "epoch": 0.9355413105413105, + "grad_norm": 0.5369367003440857, + "learning_rate": 0.00017434195028475253, + "loss": 1.0562, + "step": 5254 + }, + { + "epoch": 0.9357193732193733, + "grad_norm": 0.49957552552223206, + "learning_rate": 0.0001743325876912891, + "loss": 1.0568, + "step": 5255 + }, + { + "epoch": 0.9358974358974359, + "grad_norm": 0.5398106575012207, + "learning_rate": 0.00017432322364143305, + "loss": 1.1502, + "step": 5256 + }, + { + "epoch": 0.9360754985754985, + "grad_norm": 0.6522027254104614, + "learning_rate": 0.00017431385813536783, + "loss": 1.0591, + "step": 5257 + }, + { + "epoch": 0.9362535612535613, + "grad_norm": 0.5872012972831726, + "learning_rate": 0.00017430449117327693, + "loss": 1.3737, + "step": 5258 + }, + { + "epoch": 0.9364316239316239, + "grad_norm": 0.5124474167823792, + "learning_rate": 0.00017429512275534382, + "loss": 1.0727, + "step": 5259 + }, + { + "epoch": 0.9366096866096866, + "grad_norm": 0.5103365778923035, + "learning_rate": 0.00017428575288175218, + "loss": 1.0339, + "step": 5260 + }, + { + "epoch": 0.9367877492877493, + "grad_norm": 0.585483729839325, + "learning_rate": 0.0001742763815526855, + "loss": 1.1844, + "step": 5261 + }, + { + "epoch": 0.936965811965812, + "grad_norm": 0.5855562090873718, + "learning_rate": 0.00017426700876832746, + "loss": 1.3234, + "step": 5262 + }, + { + "epoch": 0.9371438746438746, + "grad_norm": 0.5774588584899902, + "learning_rate": 0.00017425763452886162, + "loss": 1.0937, + "step": 5263 + }, + { + "epoch": 0.9373219373219374, + "grad_norm": 0.5718343257904053, + "learning_rate": 0.00017424825883447168, + "loss": 1.0783, + "step": 5264 + }, + { + "epoch": 0.9375, + 
"grad_norm": 0.5414558053016663, + "learning_rate": 0.00017423888168534136, + "loss": 1.1244, + "step": 5265 + }, + { + "epoch": 0.9376780626780626, + "grad_norm": 0.5818275809288025, + "learning_rate": 0.00017422950308165438, + "loss": 1.247, + "step": 5266 + }, + { + "epoch": 0.9378561253561254, + "grad_norm": 0.586398184299469, + "learning_rate": 0.00017422012302359448, + "loss": 1.0515, + "step": 5267 + }, + { + "epoch": 0.938034188034188, + "grad_norm": 0.5236606001853943, + "learning_rate": 0.00017421074151134544, + "loss": 1.1907, + "step": 5268 + }, + { + "epoch": 0.9382122507122507, + "grad_norm": 0.5108010172843933, + "learning_rate": 0.0001742013585450911, + "loss": 1.1125, + "step": 5269 + }, + { + "epoch": 0.9383903133903134, + "grad_norm": 0.4956454038619995, + "learning_rate": 0.00017419197412501527, + "loss": 1.0305, + "step": 5270 + }, + { + "epoch": 0.9385683760683761, + "grad_norm": 0.5432302951812744, + "learning_rate": 0.0001741825882513018, + "loss": 1.1946, + "step": 5271 + }, + { + "epoch": 0.9387464387464387, + "grad_norm": 0.5119295716285706, + "learning_rate": 0.00017417320092413463, + "loss": 0.875, + "step": 5272 + }, + { + "epoch": 0.9389245014245015, + "grad_norm": 0.49740248918533325, + "learning_rate": 0.0001741638121436977, + "loss": 1.1093, + "step": 5273 + }, + { + "epoch": 0.9391025641025641, + "grad_norm": 0.5069027543067932, + "learning_rate": 0.00017415442191017491, + "loss": 1.2498, + "step": 5274 + }, + { + "epoch": 0.9392806267806267, + "grad_norm": 0.570264995098114, + "learning_rate": 0.00017414503022375027, + "loss": 1.0192, + "step": 5275 + }, + { + "epoch": 0.9394586894586895, + "grad_norm": 0.48129352927207947, + "learning_rate": 0.00017413563708460776, + "loss": 0.8467, + "step": 5276 + }, + { + "epoch": 0.9396367521367521, + "grad_norm": 0.5214534401893616, + "learning_rate": 0.00017412624249293148, + "loss": 0.9723, + "step": 5277 + }, + { + "epoch": 0.9398148148148148, + "grad_norm": 0.5150161385536194, + 
"learning_rate": 0.00017411684644890544, + "loss": 1.0906, + "step": 5278 + }, + { + "epoch": 0.9399928774928775, + "grad_norm": 0.5695852637290955, + "learning_rate": 0.00017410744895271377, + "loss": 1.2891, + "step": 5279 + }, + { + "epoch": 0.9401709401709402, + "grad_norm": 0.5613594651222229, + "learning_rate": 0.00017409805000454055, + "loss": 1.1373, + "step": 5280 + }, + { + "epoch": 0.9403490028490028, + "grad_norm": 0.5134239196777344, + "learning_rate": 0.00017408864960457004, + "loss": 1.1081, + "step": 5281 + }, + { + "epoch": 0.9405270655270656, + "grad_norm": 0.5256397724151611, + "learning_rate": 0.00017407924775298628, + "loss": 1.058, + "step": 5282 + }, + { + "epoch": 0.9407051282051282, + "grad_norm": 0.5145402550697327, + "learning_rate": 0.00017406984444997357, + "loss": 1.0667, + "step": 5283 + }, + { + "epoch": 0.9408831908831908, + "grad_norm": 0.5435704588890076, + "learning_rate": 0.0001740604396957161, + "loss": 1.2275, + "step": 5284 + }, + { + "epoch": 0.9410612535612536, + "grad_norm": 0.5798762440681458, + "learning_rate": 0.0001740510334903982, + "loss": 1.2061, + "step": 5285 + }, + { + "epoch": 0.9412393162393162, + "grad_norm": 0.5461057424545288, + "learning_rate": 0.00017404162583420414, + "loss": 1.1585, + "step": 5286 + }, + { + "epoch": 0.9414173789173789, + "grad_norm": 0.5090487003326416, + "learning_rate": 0.00017403221672731818, + "loss": 1.2496, + "step": 5287 + }, + { + "epoch": 0.9415954415954416, + "grad_norm": 0.5171035528182983, + "learning_rate": 0.00017402280616992476, + "loss": 1.1947, + "step": 5288 + }, + { + "epoch": 0.9417735042735043, + "grad_norm": 0.5292364358901978, + "learning_rate": 0.00017401339416220818, + "loss": 1.0182, + "step": 5289 + }, + { + "epoch": 0.9419515669515669, + "grad_norm": 0.5011499524116516, + "learning_rate": 0.00017400398070435293, + "loss": 1.3363, + "step": 5290 + }, + { + "epoch": 0.9421296296296297, + "grad_norm": 0.4821554720401764, + "learning_rate": 0.0001739945657965434, 
+ "loss": 0.9077, + "step": 5291 + }, + { + "epoch": 0.9423076923076923, + "grad_norm": 0.5849515199661255, + "learning_rate": 0.00017398514943896403, + "loss": 1.1582, + "step": 5292 + }, + { + "epoch": 0.9424857549857549, + "grad_norm": 0.49826139211654663, + "learning_rate": 0.00017397573163179937, + "loss": 1.1025, + "step": 5293 + }, + { + "epoch": 0.9426638176638177, + "grad_norm": 0.6031842827796936, + "learning_rate": 0.00017396631237523392, + "loss": 1.1932, + "step": 5294 + }, + { + "epoch": 0.9428418803418803, + "grad_norm": 0.6013330221176147, + "learning_rate": 0.00017395689166945224, + "loss": 1.2078, + "step": 5295 + }, + { + "epoch": 0.9430199430199431, + "grad_norm": 0.5147021412849426, + "learning_rate": 0.00017394746951463893, + "loss": 0.9988, + "step": 5296 + }, + { + "epoch": 0.9431980056980057, + "grad_norm": 0.5721762776374817, + "learning_rate": 0.0001739380459109785, + "loss": 1.1442, + "step": 5297 + }, + { + "epoch": 0.9433760683760684, + "grad_norm": 0.49272531270980835, + "learning_rate": 0.0001739286208586557, + "loss": 1.0481, + "step": 5298 + }, + { + "epoch": 0.9435541310541311, + "grad_norm": 0.6545688509941101, + "learning_rate": 0.00017391919435785514, + "loss": 1.1393, + "step": 5299 + }, + { + "epoch": 0.9437321937321937, + "grad_norm": 0.617756724357605, + "learning_rate": 0.00017390976640876152, + "loss": 1.1108, + "step": 5300 + }, + { + "epoch": 0.9439102564102564, + "grad_norm": 0.4870470464229584, + "learning_rate": 0.00017390033701155955, + "loss": 0.9028, + "step": 5301 + }, + { + "epoch": 0.9440883190883191, + "grad_norm": 0.5250138640403748, + "learning_rate": 0.000173890906166434, + "loss": 1.0326, + "step": 5302 + }, + { + "epoch": 0.9442663817663818, + "grad_norm": 0.5879467129707336, + "learning_rate": 0.00017388147387356964, + "loss": 1.1569, + "step": 5303 + }, + { + "epoch": 0.9444444444444444, + "grad_norm": 0.4790486991405487, + "learning_rate": 0.00017387204013315127, + "loss": 0.967, + "step": 5304 + }, + 
{ + "epoch": 0.9446225071225072, + "grad_norm": 0.5884372591972351, + "learning_rate": 0.0001738626049453637, + "loss": 1.1342, + "step": 5305 + }, + { + "epoch": 0.9448005698005698, + "grad_norm": 0.4633975028991699, + "learning_rate": 0.00017385316831039187, + "loss": 0.8942, + "step": 5306 + }, + { + "epoch": 0.9449786324786325, + "grad_norm": 0.5301823019981384, + "learning_rate": 0.0001738437302284206, + "loss": 1.1683, + "step": 5307 + }, + { + "epoch": 0.9451566951566952, + "grad_norm": 0.5476770997047424, + "learning_rate": 0.00017383429069963484, + "loss": 1.1574, + "step": 5308 + }, + { + "epoch": 0.9453347578347578, + "grad_norm": 0.47689101099967957, + "learning_rate": 0.00017382484972421953, + "loss": 1.0792, + "step": 5309 + }, + { + "epoch": 0.9455128205128205, + "grad_norm": 0.526063084602356, + "learning_rate": 0.00017381540730235963, + "loss": 0.9012, + "step": 5310 + }, + { + "epoch": 0.9456908831908832, + "grad_norm": 0.5667058229446411, + "learning_rate": 0.0001738059634342402, + "loss": 1.0908, + "step": 5311 + }, + { + "epoch": 0.9458689458689459, + "grad_norm": 0.5402196645736694, + "learning_rate": 0.00017379651812004623, + "loss": 0.943, + "step": 5312 + }, + { + "epoch": 0.9460470085470085, + "grad_norm": 0.5288932919502258, + "learning_rate": 0.00017378707135996276, + "loss": 1.0055, + "step": 5313 + }, + { + "epoch": 0.9462250712250713, + "grad_norm": 0.5607456564903259, + "learning_rate": 0.00017377762315417492, + "loss": 1.2073, + "step": 5314 + }, + { + "epoch": 0.9464031339031339, + "grad_norm": 0.5737698674201965, + "learning_rate": 0.00017376817350286781, + "loss": 1.0001, + "step": 5315 + }, + { + "epoch": 0.9465811965811965, + "grad_norm": 0.6562079787254333, + "learning_rate": 0.00017375872240622657, + "loss": 1.1503, + "step": 5316 + }, + { + "epoch": 0.9467592592592593, + "grad_norm": 0.5407183170318604, + "learning_rate": 0.0001737492698644364, + "loss": 1.1169, + "step": 5317 + }, + { + "epoch": 0.9469373219373219, + 
"grad_norm": 0.5504152178764343, + "learning_rate": 0.00017373981587768248, + "loss": 1.0468, + "step": 5318 + }, + { + "epoch": 0.9471153846153846, + "grad_norm": 0.4813530743122101, + "learning_rate": 0.00017373036044615006, + "loss": 0.9707, + "step": 5319 + }, + { + "epoch": 0.9472934472934473, + "grad_norm": 0.5810509920120239, + "learning_rate": 0.00017372090357002437, + "loss": 1.4949, + "step": 5320 + }, + { + "epoch": 0.94747150997151, + "grad_norm": 0.5250222086906433, + "learning_rate": 0.00017371144524949074, + "loss": 1.0818, + "step": 5321 + }, + { + "epoch": 0.9476495726495726, + "grad_norm": 0.4852280914783478, + "learning_rate": 0.00017370198548473444, + "loss": 1.1793, + "step": 5322 + }, + { + "epoch": 0.9478276353276354, + "grad_norm": 0.5392420291900635, + "learning_rate": 0.00017369252427594086, + "loss": 1.153, + "step": 5323 + }, + { + "epoch": 0.948005698005698, + "grad_norm": 0.521294116973877, + "learning_rate": 0.00017368306162329533, + "loss": 0.8572, + "step": 5324 + }, + { + "epoch": 0.9481837606837606, + "grad_norm": 0.5579673647880554, + "learning_rate": 0.0001736735975269833, + "loss": 1.0452, + "step": 5325 + }, + { + "epoch": 0.9483618233618234, + "grad_norm": 0.6027318835258484, + "learning_rate": 0.0001736641319871901, + "loss": 1.3475, + "step": 5326 + }, + { + "epoch": 0.948539886039886, + "grad_norm": 0.5600738525390625, + "learning_rate": 0.00017365466500410132, + "loss": 1.0338, + "step": 5327 + }, + { + "epoch": 0.9487179487179487, + "grad_norm": 0.5691532492637634, + "learning_rate": 0.00017364519657790236, + "loss": 1.129, + "step": 5328 + }, + { + "epoch": 0.9488960113960114, + "grad_norm": 0.5161463022232056, + "learning_rate": 0.0001736357267087788, + "loss": 1.0438, + "step": 5329 + }, + { + "epoch": 0.9490740740740741, + "grad_norm": 0.5049656629562378, + "learning_rate": 0.0001736262553969161, + "loss": 0.9484, + "step": 5330 + }, + { + "epoch": 0.9492521367521367, + "grad_norm": 0.5477150678634644, + 
"learning_rate": 0.00017361678264249988, + "loss": 0.8995, + "step": 5331 + }, + { + "epoch": 0.9494301994301995, + "grad_norm": 0.5679608583450317, + "learning_rate": 0.0001736073084457157, + "loss": 1.241, + "step": 5332 + }, + { + "epoch": 0.9496082621082621, + "grad_norm": 0.5748196840286255, + "learning_rate": 0.00017359783280674926, + "loss": 1.0046, + "step": 5333 + }, + { + "epoch": 0.9497863247863247, + "grad_norm": 0.5677094459533691, + "learning_rate": 0.00017358835572578617, + "loss": 1.2913, + "step": 5334 + }, + { + "epoch": 0.9499643874643875, + "grad_norm": 0.49663659930229187, + "learning_rate": 0.0001735788772030121, + "loss": 1.0388, + "step": 5335 + }, + { + "epoch": 0.9501424501424501, + "grad_norm": 0.5687218904495239, + "learning_rate": 0.0001735693972386128, + "loss": 1.1631, + "step": 5336 + }, + { + "epoch": 0.9503205128205128, + "grad_norm": 0.520708441734314, + "learning_rate": 0.00017355991583277395, + "loss": 1.0744, + "step": 5337 + }, + { + "epoch": 0.9504985754985755, + "grad_norm": 0.5738952159881592, + "learning_rate": 0.00017355043298568137, + "loss": 1.318, + "step": 5338 + }, + { + "epoch": 0.9506766381766382, + "grad_norm": 0.5378455519676208, + "learning_rate": 0.00017354094869752085, + "loss": 0.9827, + "step": 5339 + }, + { + "epoch": 0.9508547008547008, + "grad_norm": 0.5047366619110107, + "learning_rate": 0.0001735314629684782, + "loss": 1.0966, + "step": 5340 + }, + { + "epoch": 0.9510327635327636, + "grad_norm": 0.5526043772697449, + "learning_rate": 0.0001735219757987393, + "loss": 1.059, + "step": 5341 + }, + { + "epoch": 0.9512108262108262, + "grad_norm": 0.5741400718688965, + "learning_rate": 0.00017351248718849003, + "loss": 1.1232, + "step": 5342 + }, + { + "epoch": 0.9513888888888888, + "grad_norm": 0.5421118140220642, + "learning_rate": 0.00017350299713791626, + "loss": 1.0427, + "step": 5343 + }, + { + "epoch": 0.9515669515669516, + "grad_norm": 0.4857081472873688, + "learning_rate": 0.00017349350564720392, + 
"loss": 0.8663, + "step": 5344 + }, + { + "epoch": 0.9517450142450142, + "grad_norm": 0.5411618947982788, + "learning_rate": 0.00017348401271653904, + "loss": 1.0317, + "step": 5345 + }, + { + "epoch": 0.9519230769230769, + "grad_norm": 0.5246246457099915, + "learning_rate": 0.00017347451834610756, + "loss": 1.0076, + "step": 5346 + }, + { + "epoch": 0.9521011396011396, + "grad_norm": 0.5278927683830261, + "learning_rate": 0.00017346502253609556, + "loss": 0.931, + "step": 5347 + }, + { + "epoch": 0.9522792022792023, + "grad_norm": 0.5934548377990723, + "learning_rate": 0.00017345552528668902, + "loss": 1.3205, + "step": 5348 + }, + { + "epoch": 0.9524572649572649, + "grad_norm": 0.5466100573539734, + "learning_rate": 0.00017344602659807406, + "loss": 0.8725, + "step": 5349 + }, + { + "epoch": 0.9526353276353277, + "grad_norm": 0.5220118761062622, + "learning_rate": 0.00017343652647043678, + "loss": 1.1642, + "step": 5350 + }, + { + "epoch": 0.9528133903133903, + "grad_norm": 0.6166301965713501, + "learning_rate": 0.0001734270249039633, + "loss": 0.8152, + "step": 5351 + }, + { + "epoch": 0.9529914529914529, + "grad_norm": 0.5173428058624268, + "learning_rate": 0.00017341752189883983, + "loss": 0.9296, + "step": 5352 + }, + { + "epoch": 0.9531695156695157, + "grad_norm": 0.5363461375236511, + "learning_rate": 0.0001734080174552525, + "loss": 1.3546, + "step": 5353 + }, + { + "epoch": 0.9533475783475783, + "grad_norm": 0.5333831906318665, + "learning_rate": 0.0001733985115733876, + "loss": 1.0401, + "step": 5354 + }, + { + "epoch": 0.9535256410256411, + "grad_norm": 0.5179334878921509, + "learning_rate": 0.00017338900425343132, + "loss": 1.1254, + "step": 5355 + }, + { + "epoch": 0.9537037037037037, + "grad_norm": 0.5171303153038025, + "learning_rate": 0.00017337949549556993, + "loss": 1.0518, + "step": 5356 + }, + { + "epoch": 0.9538817663817664, + "grad_norm": 0.5164596438407898, + "learning_rate": 0.00017336998529998978, + "loss": 0.8732, + "step": 5357 + }, + { 
+ "epoch": 0.9540598290598291, + "grad_norm": 0.5555717349052429, + "learning_rate": 0.00017336047366687719, + "loss": 1.2312, + "step": 5358 + }, + { + "epoch": 0.9542378917378918, + "grad_norm": 0.45685622096061707, + "learning_rate": 0.00017335096059641847, + "loss": 0.8882, + "step": 5359 + }, + { + "epoch": 0.9544159544159544, + "grad_norm": 0.5260133743286133, + "learning_rate": 0.0001733414460888001, + "loss": 1.0952, + "step": 5360 + }, + { + "epoch": 0.9545940170940171, + "grad_norm": 0.4597703814506531, + "learning_rate": 0.0001733319301442084, + "loss": 1.0835, + "step": 5361 + }, + { + "epoch": 0.9547720797720798, + "grad_norm": 0.5279495120048523, + "learning_rate": 0.0001733224127628299, + "loss": 1.0295, + "step": 5362 + }, + { + "epoch": 0.9549501424501424, + "grad_norm": 0.48919400572776794, + "learning_rate": 0.00017331289394485104, + "loss": 0.9693, + "step": 5363 + }, + { + "epoch": 0.9551282051282052, + "grad_norm": 0.5639515519142151, + "learning_rate": 0.0001733033736904583, + "loss": 1.0893, + "step": 5364 + }, + { + "epoch": 0.9553062678062678, + "grad_norm": 0.49761319160461426, + "learning_rate": 0.00017329385199983823, + "loss": 1.038, + "step": 5365 + }, + { + "epoch": 0.9554843304843305, + "grad_norm": 0.5503305792808533, + "learning_rate": 0.0001732843288731774, + "loss": 0.9976, + "step": 5366 + }, + { + "epoch": 0.9556623931623932, + "grad_norm": 0.5633028745651245, + "learning_rate": 0.00017327480431066235, + "loss": 1.0602, + "step": 5367 + }, + { + "epoch": 0.9558404558404558, + "grad_norm": 0.48074454069137573, + "learning_rate": 0.00017326527831247973, + "loss": 1.0286, + "step": 5368 + }, + { + "epoch": 0.9560185185185185, + "grad_norm": 0.506597638130188, + "learning_rate": 0.0001732557508788162, + "loss": 0.9061, + "step": 5369 + }, + { + "epoch": 0.9561965811965812, + "grad_norm": 0.6570749282836914, + "learning_rate": 0.0001732462220098584, + "loss": 1.0852, + "step": 5370 + }, + { + "epoch": 0.9563746438746439, + 
"grad_norm": 0.5607653856277466, + "learning_rate": 0.00017323669170579302, + "loss": 1.0486, + "step": 5371 + }, + { + "epoch": 0.9565527065527065, + "grad_norm": 0.6047050356864929, + "learning_rate": 0.0001732271599668068, + "loss": 1.2175, + "step": 5372 + }, + { + "epoch": 0.9567307692307693, + "grad_norm": 0.5506869554519653, + "learning_rate": 0.00017321762679308651, + "loss": 1.0114, + "step": 5373 + }, + { + "epoch": 0.9569088319088319, + "grad_norm": 0.5868638157844543, + "learning_rate": 0.00017320809218481891, + "loss": 1.2983, + "step": 5374 + }, + { + "epoch": 0.9570868945868946, + "grad_norm": 0.539619505405426, + "learning_rate": 0.00017319855614219084, + "loss": 1.2361, + "step": 5375 + }, + { + "epoch": 0.9572649572649573, + "grad_norm": 0.5525495409965515, + "learning_rate": 0.0001731890186653891, + "loss": 1.1316, + "step": 5376 + }, + { + "epoch": 0.95744301994302, + "grad_norm": 0.5549767017364502, + "learning_rate": 0.0001731794797546006, + "loss": 1.0547, + "step": 5377 + }, + { + "epoch": 0.9576210826210826, + "grad_norm": 0.5356076955795288, + "learning_rate": 0.00017316993941001222, + "loss": 0.9942, + "step": 5378 + }, + { + "epoch": 0.9577991452991453, + "grad_norm": 0.5365784168243408, + "learning_rate": 0.00017316039763181084, + "loss": 1.226, + "step": 5379 + }, + { + "epoch": 0.957977207977208, + "grad_norm": 0.5190927386283875, + "learning_rate": 0.00017315085442018343, + "loss": 1.1704, + "step": 5380 + }, + { + "epoch": 0.9581552706552706, + "grad_norm": 0.526658833026886, + "learning_rate": 0.00017314130977531705, + "loss": 1.109, + "step": 5381 + }, + { + "epoch": 0.9583333333333334, + "grad_norm": 0.5373684763908386, + "learning_rate": 0.0001731317636973986, + "loss": 1.0018, + "step": 5382 + }, + { + "epoch": 0.958511396011396, + "grad_norm": 0.5714904069900513, + "learning_rate": 0.00017312221618661516, + "loss": 1.1855, + "step": 5383 + }, + { + "epoch": 0.9586894586894587, + "grad_norm": 0.5707863569259644, + 
"learning_rate": 0.00017311266724315377, + "loss": 0.9482, + "step": 5384 + }, + { + "epoch": 0.9588675213675214, + "grad_norm": 0.5856872797012329, + "learning_rate": 0.00017310311686720157, + "loss": 0.9543, + "step": 5385 + }, + { + "epoch": 0.959045584045584, + "grad_norm": 0.5041963458061218, + "learning_rate": 0.00017309356505894568, + "loss": 1.1427, + "step": 5386 + }, + { + "epoch": 0.9592236467236467, + "grad_norm": 0.5409179925918579, + "learning_rate": 0.00017308401181857316, + "loss": 0.8432, + "step": 5387 + }, + { + "epoch": 0.9594017094017094, + "grad_norm": 0.5248702764511108, + "learning_rate": 0.00017307445714627128, + "loss": 1.1403, + "step": 5388 + }, + { + "epoch": 0.9595797720797721, + "grad_norm": 0.50718092918396, + "learning_rate": 0.00017306490104222722, + "loss": 0.9066, + "step": 5389 + }, + { + "epoch": 0.9597578347578347, + "grad_norm": 0.5563821196556091, + "learning_rate": 0.0001730553435066282, + "loss": 1.0204, + "step": 5390 + }, + { + "epoch": 0.9599358974358975, + "grad_norm": 0.5696987509727478, + "learning_rate": 0.00017304578453966146, + "loss": 1.1405, + "step": 5391 + }, + { + "epoch": 0.9601139601139601, + "grad_norm": 0.5927395224571228, + "learning_rate": 0.00017303622414151435, + "loss": 1.0398, + "step": 5392 + }, + { + "epoch": 0.9602920227920227, + "grad_norm": 0.5375707745552063, + "learning_rate": 0.0001730266623123741, + "loss": 0.9519, + "step": 5393 + }, + { + "epoch": 0.9604700854700855, + "grad_norm": 0.457998126745224, + "learning_rate": 0.00017301709905242815, + "loss": 0.8743, + "step": 5394 + }, + { + "epoch": 0.9606481481481481, + "grad_norm": 0.5427796244621277, + "learning_rate": 0.00017300753436186382, + "loss": 1.078, + "step": 5395 + }, + { + "epoch": 0.9608262108262108, + "grad_norm": 0.5458595752716064, + "learning_rate": 0.0001729979682408685, + "loss": 1.1081, + "step": 5396 + }, + { + "epoch": 0.9610042735042735, + "grad_norm": 0.5495280027389526, + "learning_rate": 0.00017298840068962962, + 
"loss": 1.0141, + "step": 5397 + }, + { + "epoch": 0.9611823361823362, + "grad_norm": 0.5878560543060303, + "learning_rate": 0.00017297883170833465, + "loss": 1.302, + "step": 5398 + }, + { + "epoch": 0.9613603988603988, + "grad_norm": 0.5452881455421448, + "learning_rate": 0.00017296926129717108, + "loss": 0.9929, + "step": 5399 + }, + { + "epoch": 0.9615384615384616, + "grad_norm": 0.6021811366081238, + "learning_rate": 0.0001729596894563264, + "loss": 1.2629, + "step": 5400 + }, + { + "epoch": 0.9617165242165242, + "grad_norm": 0.5820204615592957, + "learning_rate": 0.0001729501161859882, + "loss": 1.0662, + "step": 5401 + }, + { + "epoch": 0.9618945868945868, + "grad_norm": 0.4953218102455139, + "learning_rate": 0.000172940541486344, + "loss": 1.047, + "step": 5402 + }, + { + "epoch": 0.9620726495726496, + "grad_norm": 0.5409793853759766, + "learning_rate": 0.00017293096535758143, + "loss": 1.1993, + "step": 5403 + }, + { + "epoch": 0.9622507122507122, + "grad_norm": 0.49702873826026917, + "learning_rate": 0.00017292138779988805, + "loss": 1.2471, + "step": 5404 + }, + { + "epoch": 0.9624287749287749, + "grad_norm": 0.5743489861488342, + "learning_rate": 0.00017291180881345158, + "loss": 1.0816, + "step": 5405 + }, + { + "epoch": 0.9626068376068376, + "grad_norm": 0.5747945308685303, + "learning_rate": 0.00017290222839845968, + "loss": 1.3548, + "step": 5406 + }, + { + "epoch": 0.9627849002849003, + "grad_norm": 0.5341345071792603, + "learning_rate": 0.00017289264655510005, + "loss": 1.0435, + "step": 5407 + }, + { + "epoch": 0.9629629629629629, + "grad_norm": 0.5719689130783081, + "learning_rate": 0.00017288306328356044, + "loss": 1.2319, + "step": 5408 + }, + { + "epoch": 0.9631410256410257, + "grad_norm": 0.4783279597759247, + "learning_rate": 0.0001728734785840286, + "loss": 0.9397, + "step": 5409 + }, + { + "epoch": 0.9633190883190883, + "grad_norm": 0.4730507731437683, + "learning_rate": 0.00017286389245669233, + "loss": 0.9384, + "step": 5410 + }, + { + 
"epoch": 0.9634971509971509, + "grad_norm": 0.5309939384460449, + "learning_rate": 0.00017285430490173944, + "loss": 1.098, + "step": 5411 + }, + { + "epoch": 0.9636752136752137, + "grad_norm": 0.5177853107452393, + "learning_rate": 0.0001728447159193578, + "loss": 1.2777, + "step": 5412 + }, + { + "epoch": 0.9638532763532763, + "grad_norm": 0.6437913775444031, + "learning_rate": 0.00017283512550973526, + "loss": 1.2661, + "step": 5413 + }, + { + "epoch": 0.9640313390313391, + "grad_norm": 0.6096072196960449, + "learning_rate": 0.00017282553367305975, + "loss": 0.9569, + "step": 5414 + }, + { + "epoch": 0.9642094017094017, + "grad_norm": 0.5104934573173523, + "learning_rate": 0.00017281594040951918, + "loss": 0.9666, + "step": 5415 + }, + { + "epoch": 0.9643874643874644, + "grad_norm": 0.6178240776062012, + "learning_rate": 0.00017280634571930153, + "loss": 1.1277, + "step": 5416 + }, + { + "epoch": 0.9645655270655271, + "grad_norm": 0.5749034881591797, + "learning_rate": 0.0001727967496025948, + "loss": 1.245, + "step": 5417 + }, + { + "epoch": 0.9647435897435898, + "grad_norm": 0.5036978721618652, + "learning_rate": 0.00017278715205958694, + "loss": 1.3049, + "step": 5418 + }, + { + "epoch": 0.9649216524216524, + "grad_norm": 0.5593041777610779, + "learning_rate": 0.00017277755309046605, + "loss": 1.2304, + "step": 5419 + }, + { + "epoch": 0.9650997150997151, + "grad_norm": 0.5446555614471436, + "learning_rate": 0.0001727679526954202, + "loss": 0.732, + "step": 5420 + }, + { + "epoch": 0.9652777777777778, + "grad_norm": 0.6063070297241211, + "learning_rate": 0.00017275835087463747, + "loss": 1.3723, + "step": 5421 + }, + { + "epoch": 0.9654558404558404, + "grad_norm": 0.4994211792945862, + "learning_rate": 0.00017274874762830602, + "loss": 1.0505, + "step": 5422 + }, + { + "epoch": 0.9656339031339032, + "grad_norm": 0.49396973848342896, + "learning_rate": 0.00017273914295661395, + "loss": 0.8691, + "step": 5423 + }, + { + "epoch": 0.9658119658119658, + 
"grad_norm": 0.5067027807235718, + "learning_rate": 0.0001727295368597495, + "loss": 0.9744, + "step": 5424 + }, + { + "epoch": 0.9659900284900285, + "grad_norm": 0.6720643043518066, + "learning_rate": 0.00017271992933790085, + "loss": 1.1513, + "step": 5425 + }, + { + "epoch": 0.9661680911680912, + "grad_norm": 0.5494341254234314, + "learning_rate": 0.00017271032039125624, + "loss": 0.8295, + "step": 5426 + }, + { + "epoch": 0.9663461538461539, + "grad_norm": 0.644332230091095, + "learning_rate": 0.00017270071002000394, + "loss": 1.0043, + "step": 5427 + }, + { + "epoch": 0.9665242165242165, + "grad_norm": 0.5658500790596008, + "learning_rate": 0.00017269109822433225, + "loss": 1.2575, + "step": 5428 + }, + { + "epoch": 0.9667022792022792, + "grad_norm": 0.5163155794143677, + "learning_rate": 0.00017268148500442952, + "loss": 1.1391, + "step": 5429 + }, + { + "epoch": 0.9668803418803419, + "grad_norm": 0.5113703608512878, + "learning_rate": 0.00017267187036048404, + "loss": 1.0819, + "step": 5430 + }, + { + "epoch": 0.9670584045584045, + "grad_norm": 0.6339422464370728, + "learning_rate": 0.00017266225429268426, + "loss": 1.0733, + "step": 5431 + }, + { + "epoch": 0.9672364672364673, + "grad_norm": 0.5158288478851318, + "learning_rate": 0.0001726526368012185, + "loss": 0.9518, + "step": 5432 + }, + { + "epoch": 0.9674145299145299, + "grad_norm": 0.593717634677887, + "learning_rate": 0.00017264301788627527, + "loss": 0.9416, + "step": 5433 + }, + { + "epoch": 0.9675925925925926, + "grad_norm": 0.49593186378479004, + "learning_rate": 0.00017263339754804301, + "loss": 1.0307, + "step": 5434 + }, + { + "epoch": 0.9677706552706553, + "grad_norm": 0.44032949209213257, + "learning_rate": 0.00017262377578671024, + "loss": 0.7884, + "step": 5435 + }, + { + "epoch": 0.967948717948718, + "grad_norm": 0.513073742389679, + "learning_rate": 0.00017261415260246538, + "loss": 0.9797, + "step": 5436 + }, + { + "epoch": 0.9681267806267806, + "grad_norm": 0.5737422108650208, + 
"learning_rate": 0.0001726045279954971, + "loss": 1.0487, + "step": 5437 + }, + { + "epoch": 0.9683048433048433, + "grad_norm": 0.5385867953300476, + "learning_rate": 0.0001725949019659939, + "loss": 1.4166, + "step": 5438 + }, + { + "epoch": 0.968482905982906, + "grad_norm": 0.5224326848983765, + "learning_rate": 0.00017258527451414438, + "loss": 1.195, + "step": 5439 + }, + { + "epoch": 0.9686609686609686, + "grad_norm": 0.5305148363113403, + "learning_rate": 0.0001725756456401372, + "loss": 1.0301, + "step": 5440 + }, + { + "epoch": 0.9688390313390314, + "grad_norm": 0.532588005065918, + "learning_rate": 0.000172566015344161, + "loss": 1.1269, + "step": 5441 + }, + { + "epoch": 0.969017094017094, + "grad_norm": 0.5812515020370483, + "learning_rate": 0.0001725563836264045, + "loss": 1.1787, + "step": 5442 + }, + { + "epoch": 0.9691951566951567, + "grad_norm": 0.4962109327316284, + "learning_rate": 0.00017254675048705638, + "loss": 1.0639, + "step": 5443 + }, + { + "epoch": 0.9693732193732194, + "grad_norm": 0.5094883441925049, + "learning_rate": 0.00017253711592630534, + "loss": 1.0922, + "step": 5444 + }, + { + "epoch": 0.969551282051282, + "grad_norm": 0.5728049874305725, + "learning_rate": 0.00017252747994434025, + "loss": 1.1237, + "step": 5445 + }, + { + "epoch": 0.9697293447293447, + "grad_norm": 0.5406180620193481, + "learning_rate": 0.00017251784254134983, + "loss": 1.1161, + "step": 5446 + }, + { + "epoch": 0.9699074074074074, + "grad_norm": 0.5724552869796753, + "learning_rate": 0.00017250820371752292, + "loss": 1.2205, + "step": 5447 + }, + { + "epoch": 0.9700854700854701, + "grad_norm": 0.5698846578598022, + "learning_rate": 0.0001724985634730484, + "loss": 1.1472, + "step": 5448 + }, + { + "epoch": 0.9702635327635327, + "grad_norm": 0.5315805673599243, + "learning_rate": 0.0001724889218081151, + "loss": 1.0253, + "step": 5449 + }, + { + "epoch": 0.9704415954415955, + "grad_norm": 0.5970377326011658, + "learning_rate": 0.000172479278722912, + "loss": 
1.3033, + "step": 5450 + }, + { + "epoch": 0.9706196581196581, + "grad_norm": 0.6149488687515259, + "learning_rate": 0.00017246963421762798, + "loss": 1.0689, + "step": 5451 + }, + { + "epoch": 0.9707977207977208, + "grad_norm": 0.4848574995994568, + "learning_rate": 0.00017245998829245202, + "loss": 0.8829, + "step": 5452 + }, + { + "epoch": 0.9709757834757835, + "grad_norm": 0.6073294281959534, + "learning_rate": 0.00017245034094757312, + "loss": 1.2378, + "step": 5453 + }, + { + "epoch": 0.9711538461538461, + "grad_norm": 0.6362034678459167, + "learning_rate": 0.00017244069218318026, + "loss": 1.3606, + "step": 5454 + }, + { + "epoch": 0.9713319088319088, + "grad_norm": 0.5353880524635315, + "learning_rate": 0.00017243104199946257, + "loss": 1.1288, + "step": 5455 + }, + { + "epoch": 0.9715099715099715, + "grad_norm": 0.5096352100372314, + "learning_rate": 0.00017242139039660902, + "loss": 1.0056, + "step": 5456 + }, + { + "epoch": 0.9716880341880342, + "grad_norm": 0.5086682438850403, + "learning_rate": 0.00017241173737480884, + "loss": 1.091, + "step": 5457 + }, + { + "epoch": 0.9718660968660968, + "grad_norm": 0.5034295320510864, + "learning_rate": 0.000172402082934251, + "loss": 0.9749, + "step": 5458 + }, + { + "epoch": 0.9720441595441596, + "grad_norm": 0.5205379724502563, + "learning_rate": 0.0001723924270751248, + "loss": 1.1068, + "step": 5459 + }, + { + "epoch": 0.9722222222222222, + "grad_norm": 0.5904826521873474, + "learning_rate": 0.00017238276979761937, + "loss": 1.0613, + "step": 5460 + }, + { + "epoch": 0.9724002849002849, + "grad_norm": 0.6415045261383057, + "learning_rate": 0.0001723731111019239, + "loss": 1.2126, + "step": 5461 + }, + { + "epoch": 0.9725783475783476, + "grad_norm": 0.5769147872924805, + "learning_rate": 0.0001723634509882277, + "loss": 1.337, + "step": 5462 + }, + { + "epoch": 0.9727564102564102, + "grad_norm": 0.5585111975669861, + "learning_rate": 0.00017235378945671998, + "loss": 1.3922, + "step": 5463 + }, + { + "epoch": 
0.9729344729344729, + "grad_norm": 0.5788411498069763, + "learning_rate": 0.00017234412650759008, + "loss": 0.8532, + "step": 5464 + }, + { + "epoch": 0.9731125356125356, + "grad_norm": 0.5617673397064209, + "learning_rate": 0.00017233446214102728, + "loss": 1.2575, + "step": 5465 + }, + { + "epoch": 0.9732905982905983, + "grad_norm": 0.4227815568447113, + "learning_rate": 0.00017232479635722093, + "loss": 1.0618, + "step": 5466 + }, + { + "epoch": 0.9734686609686609, + "grad_norm": 0.49751797318458557, + "learning_rate": 0.00017231512915636047, + "loss": 0.7714, + "step": 5467 + }, + { + "epoch": 0.9736467236467237, + "grad_norm": 0.5983800292015076, + "learning_rate": 0.0001723054605386353, + "loss": 1.2297, + "step": 5468 + }, + { + "epoch": 0.9738247863247863, + "grad_norm": 0.543394923210144, + "learning_rate": 0.0001722957905042348, + "loss": 1.0078, + "step": 5469 + }, + { + "epoch": 0.9740028490028491, + "grad_norm": 0.5633566975593567, + "learning_rate": 0.00017228611905334846, + "loss": 1.0938, + "step": 5470 + }, + { + "epoch": 0.9741809116809117, + "grad_norm": 0.49377235770225525, + "learning_rate": 0.00017227644618616578, + "loss": 1.096, + "step": 5471 + }, + { + "epoch": 0.9743589743589743, + "grad_norm": 0.4963362216949463, + "learning_rate": 0.00017226677190287627, + "loss": 1.0003, + "step": 5472 + }, + { + "epoch": 0.9745370370370371, + "grad_norm": 0.4483006000518799, + "learning_rate": 0.00017225709620366953, + "loss": 0.8623, + "step": 5473 + }, + { + "epoch": 0.9747150997150997, + "grad_norm": 0.5429352521896362, + "learning_rate": 0.00017224741908873506, + "loss": 1.1383, + "step": 5474 + }, + { + "epoch": 0.9748931623931624, + "grad_norm": 0.5871657729148865, + "learning_rate": 0.0001722377405582625, + "loss": 1.2005, + "step": 5475 + }, + { + "epoch": 0.9750712250712251, + "grad_norm": 0.6002383828163147, + "learning_rate": 0.0001722280606124415, + "loss": 1.0696, + "step": 5476 + }, + { + "epoch": 0.9752492877492878, + "grad_norm": 
0.5351617336273193, + "learning_rate": 0.00017221837925146164, + "loss": 1.243, + "step": 5477 + }, + { + "epoch": 0.9754273504273504, + "grad_norm": 0.46613118052482605, + "learning_rate": 0.00017220869647551268, + "loss": 1.0344, + "step": 5478 + }, + { + "epoch": 0.9756054131054132, + "grad_norm": 0.6015593409538269, + "learning_rate": 0.00017219901228478432, + "loss": 1.082, + "step": 5479 + }, + { + "epoch": 0.9757834757834758, + "grad_norm": 0.5829521417617798, + "learning_rate": 0.0001721893266794663, + "loss": 0.8683, + "step": 5480 + }, + { + "epoch": 0.9759615384615384, + "grad_norm": 0.6344960927963257, + "learning_rate": 0.00017217963965974838, + "loss": 1.1048, + "step": 5481 + }, + { + "epoch": 0.9761396011396012, + "grad_norm": 0.5586308240890503, + "learning_rate": 0.00017216995122582034, + "loss": 0.9657, + "step": 5482 + }, + { + "epoch": 0.9763176638176638, + "grad_norm": 0.48625239729881287, + "learning_rate": 0.00017216026137787204, + "loss": 1.1026, + "step": 5483 + }, + { + "epoch": 0.9764957264957265, + "grad_norm": 0.5625223517417908, + "learning_rate": 0.00017215057011609332, + "loss": 1.1579, + "step": 5484 + }, + { + "epoch": 0.9766737891737892, + "grad_norm": 0.6016653776168823, + "learning_rate": 0.0001721408774406741, + "loss": 1.1777, + "step": 5485 + }, + { + "epoch": 0.9768518518518519, + "grad_norm": 0.5444921851158142, + "learning_rate": 0.00017213118335180418, + "loss": 1.119, + "step": 5486 + }, + { + "epoch": 0.9770299145299145, + "grad_norm": 0.5574755668640137, + "learning_rate": 0.0001721214878496736, + "loss": 1.1128, + "step": 5487 + }, + { + "epoch": 0.9772079772079773, + "grad_norm": 0.5486113429069519, + "learning_rate": 0.00017211179093447226, + "loss": 1.1673, + "step": 5488 + }, + { + "epoch": 0.9773860398860399, + "grad_norm": 0.5545483231544495, + "learning_rate": 0.00017210209260639018, + "loss": 1.1748, + "step": 5489 + }, + { + "epoch": 0.9775641025641025, + "grad_norm": 0.5756667256355286, + "learning_rate": 
0.0001720923928656174, + "loss": 1.2377, + "step": 5490 + }, + { + "epoch": 0.9777421652421653, + "grad_norm": 0.5744972229003906, + "learning_rate": 0.00017208269171234392, + "loss": 1.1242, + "step": 5491 + }, + { + "epoch": 0.9779202279202279, + "grad_norm": 0.6109468340873718, + "learning_rate": 0.00017207298914675984, + "loss": 1.1948, + "step": 5492 + }, + { + "epoch": 0.9780982905982906, + "grad_norm": 0.5195167660713196, + "learning_rate": 0.00017206328516905525, + "loss": 1.0941, + "step": 5493 + }, + { + "epoch": 0.9782763532763533, + "grad_norm": 0.5549042224884033, + "learning_rate": 0.0001720535797794203, + "loss": 1.1503, + "step": 5494 + }, + { + "epoch": 0.978454415954416, + "grad_norm": 0.6317743062973022, + "learning_rate": 0.0001720438729780451, + "loss": 1.3468, + "step": 5495 + }, + { + "epoch": 0.9786324786324786, + "grad_norm": 0.5932528972625732, + "learning_rate": 0.0001720341647651199, + "loss": 1.105, + "step": 5496 + }, + { + "epoch": 0.9788105413105413, + "grad_norm": 0.607880175113678, + "learning_rate": 0.00017202445514083488, + "loss": 1.1465, + "step": 5497 + }, + { + "epoch": 0.978988603988604, + "grad_norm": 0.49227309226989746, + "learning_rate": 0.00017201474410538027, + "loss": 0.9075, + "step": 5498 + }, + { + "epoch": 0.9791666666666666, + "grad_norm": 0.5059443116188049, + "learning_rate": 0.00017200503165894636, + "loss": 1.0483, + "step": 5499 + }, + { + "epoch": 0.9793447293447294, + "grad_norm": 0.5792799592018127, + "learning_rate": 0.0001719953178017234, + "loss": 1.0987, + "step": 5500 + }, + { + "epoch": 0.979522792022792, + "grad_norm": 0.5010457038879395, + "learning_rate": 0.00017198560253390177, + "loss": 1.1051, + "step": 5501 + }, + { + "epoch": 0.9797008547008547, + "grad_norm": 0.5866543054580688, + "learning_rate": 0.0001719758858556718, + "loss": 1.2824, + "step": 5502 + }, + { + "epoch": 0.9798789173789174, + "grad_norm": 0.5392137169837952, + "learning_rate": 0.00017196616776722382, + "loss": 0.886, + 
"step": 5503 + }, + { + "epoch": 0.98005698005698, + "grad_norm": 0.5200899839401245, + "learning_rate": 0.00017195644826874834, + "loss": 1.1504, + "step": 5504 + }, + { + "epoch": 0.9802350427350427, + "grad_norm": 0.533159077167511, + "learning_rate": 0.00017194672736043569, + "loss": 1.1216, + "step": 5505 + }, + { + "epoch": 0.9804131054131054, + "grad_norm": 0.5543524622917175, + "learning_rate": 0.0001719370050424764, + "loss": 1.0161, + "step": 5506 + }, + { + "epoch": 0.9805911680911681, + "grad_norm": 0.5315365195274353, + "learning_rate": 0.00017192728131506092, + "loss": 1.0509, + "step": 5507 + }, + { + "epoch": 0.9807692307692307, + "grad_norm": 0.5406147837638855, + "learning_rate": 0.00017191755617837977, + "loss": 1.0695, + "step": 5508 + }, + { + "epoch": 0.9809472934472935, + "grad_norm": 0.4563386142253876, + "learning_rate": 0.00017190782963262354, + "loss": 0.995, + "step": 5509 + }, + { + "epoch": 0.9811253561253561, + "grad_norm": 0.5456405282020569, + "learning_rate": 0.00017189810167798274, + "loss": 1.0546, + "step": 5510 + }, + { + "epoch": 0.9813034188034188, + "grad_norm": 0.6275575160980225, + "learning_rate": 0.00017188837231464795, + "loss": 1.0432, + "step": 5511 + }, + { + "epoch": 0.9814814814814815, + "grad_norm": 0.49735602736473083, + "learning_rate": 0.0001718786415428099, + "loss": 1.035, + "step": 5512 + }, + { + "epoch": 0.9816595441595442, + "grad_norm": 0.5234259963035583, + "learning_rate": 0.00017186890936265916, + "loss": 1.0918, + "step": 5513 + }, + { + "epoch": 0.9818376068376068, + "grad_norm": 0.5091170072555542, + "learning_rate": 0.00017185917577438643, + "loss": 1.0239, + "step": 5514 + }, + { + "epoch": 0.9820156695156695, + "grad_norm": 0.6155703067779541, + "learning_rate": 0.00017184944077818244, + "loss": 1.2366, + "step": 5515 + }, + { + "epoch": 0.9821937321937322, + "grad_norm": 0.5074070692062378, + "learning_rate": 0.0001718397043742379, + "loss": 1.0318, + "step": 5516 + }, + { + "epoch": 
0.9823717948717948, + "grad_norm": 0.5234423279762268, + "learning_rate": 0.0001718299665627436, + "loss": 1.0322, + "step": 5517 + }, + { + "epoch": 0.9825498575498576, + "grad_norm": 0.5783474445343018, + "learning_rate": 0.0001718202273438903, + "loss": 0.9486, + "step": 5518 + }, + { + "epoch": 0.9827279202279202, + "grad_norm": 0.5708683133125305, + "learning_rate": 0.00017181048671786886, + "loss": 1.0785, + "step": 5519 + }, + { + "epoch": 0.9829059829059829, + "grad_norm": 0.5985961556434631, + "learning_rate": 0.00017180074468487009, + "loss": 1.198, + "step": 5520 + }, + { + "epoch": 0.9830840455840456, + "grad_norm": 0.5711352229118347, + "learning_rate": 0.0001717910012450849, + "loss": 1.0386, + "step": 5521 + }, + { + "epoch": 0.9832621082621082, + "grad_norm": 0.5338063836097717, + "learning_rate": 0.00017178125639870416, + "loss": 1.1594, + "step": 5522 + }, + { + "epoch": 0.9834401709401709, + "grad_norm": 0.6144943237304688, + "learning_rate": 0.00017177151014591881, + "loss": 1.1083, + "step": 5523 + }, + { + "epoch": 0.9836182336182336, + "grad_norm": 0.547285795211792, + "learning_rate": 0.00017176176248691983, + "loss": 1.1507, + "step": 5524 + }, + { + "epoch": 0.9837962962962963, + "grad_norm": 0.5807644724845886, + "learning_rate": 0.00017175201342189817, + "loss": 1.3044, + "step": 5525 + }, + { + "epoch": 0.9839743589743589, + "grad_norm": 0.5229477882385254, + "learning_rate": 0.00017174226295104485, + "loss": 1.2622, + "step": 5526 + }, + { + "epoch": 0.9841524216524217, + "grad_norm": 0.6100695133209229, + "learning_rate": 0.00017173251107455094, + "loss": 1.2026, + "step": 5527 + }, + { + "epoch": 0.9843304843304843, + "grad_norm": 0.5410884618759155, + "learning_rate": 0.00017172275779260744, + "loss": 1.2964, + "step": 5528 + }, + { + "epoch": 0.9845085470085471, + "grad_norm": 0.5937406420707703, + "learning_rate": 0.00017171300310540554, + "loss": 1.1435, + "step": 5529 + }, + { + "epoch": 0.9846866096866097, + "grad_norm": 
0.56817227602005, + "learning_rate": 0.00017170324701313634, + "loss": 1.0099, + "step": 5530 + }, + { + "epoch": 0.9848646723646723, + "grad_norm": 0.5776323080062866, + "learning_rate": 0.00017169348951599092, + "loss": 1.3539, + "step": 5531 + }, + { + "epoch": 0.9850427350427351, + "grad_norm": 0.5208535194396973, + "learning_rate": 0.0001716837306141605, + "loss": 1.2306, + "step": 5532 + }, + { + "epoch": 0.9852207977207977, + "grad_norm": 0.552173376083374, + "learning_rate": 0.0001716739703078363, + "loss": 1.0551, + "step": 5533 + }, + { + "epoch": 0.9853988603988604, + "grad_norm": 0.5327515602111816, + "learning_rate": 0.00017166420859720955, + "loss": 1.2443, + "step": 5534 + }, + { + "epoch": 0.9855769230769231, + "grad_norm": 0.5255244374275208, + "learning_rate": 0.0001716544454824715, + "loss": 1.005, + "step": 5535 + }, + { + "epoch": 0.9857549857549858, + "grad_norm": 0.4753847122192383, + "learning_rate": 0.00017164468096381343, + "loss": 1.0081, + "step": 5536 + }, + { + "epoch": 0.9859330484330484, + "grad_norm": 0.5261829495429993, + "learning_rate": 0.00017163491504142665, + "loss": 1.2249, + "step": 5537 + }, + { + "epoch": 0.9861111111111112, + "grad_norm": 0.46499499678611755, + "learning_rate": 0.00017162514771550255, + "loss": 0.8759, + "step": 5538 + }, + { + "epoch": 0.9862891737891738, + "grad_norm": 0.5233004689216614, + "learning_rate": 0.00017161537898623247, + "loss": 1.0474, + "step": 5539 + }, + { + "epoch": 0.9864672364672364, + "grad_norm": 0.46905553340911865, + "learning_rate": 0.00017160560885380778, + "loss": 0.9033, + "step": 5540 + }, + { + "epoch": 0.9866452991452992, + "grad_norm": 0.5816231369972229, + "learning_rate": 0.00017159583731841998, + "loss": 1.0628, + "step": 5541 + }, + { + "epoch": 0.9868233618233618, + "grad_norm": 0.4575413167476654, + "learning_rate": 0.00017158606438026045, + "loss": 1.0446, + "step": 5542 + }, + { + "epoch": 0.9870014245014245, + "grad_norm": 0.5968109965324402, + "learning_rate": 
0.00017157629003952067, + "loss": 1.032, + "step": 5543 + }, + { + "epoch": 0.9871794871794872, + "grad_norm": 0.5316148400306702, + "learning_rate": 0.00017156651429639218, + "loss": 0.9167, + "step": 5544 + }, + { + "epoch": 0.9873575498575499, + "grad_norm": 0.5185125470161438, + "learning_rate": 0.00017155673715106651, + "loss": 1.1527, + "step": 5545 + }, + { + "epoch": 0.9875356125356125, + "grad_norm": 0.5167772769927979, + "learning_rate": 0.00017154695860373525, + "loss": 0.9954, + "step": 5546 + }, + { + "epoch": 0.9877136752136753, + "grad_norm": 0.6406680345535278, + "learning_rate": 0.00017153717865458994, + "loss": 1.2758, + "step": 5547 + }, + { + "epoch": 0.9878917378917379, + "grad_norm": 0.5223956108093262, + "learning_rate": 0.00017152739730382223, + "loss": 1.1526, + "step": 5548 + }, + { + "epoch": 0.9880698005698005, + "grad_norm": 0.6131790280342102, + "learning_rate": 0.00017151761455162375, + "loss": 1.1024, + "step": 5549 + }, + { + "epoch": 0.9882478632478633, + "grad_norm": 0.5574753880500793, + "learning_rate": 0.00017150783039818616, + "loss": 0.9733, + "step": 5550 + }, + { + "epoch": 0.9884259259259259, + "grad_norm": 0.5417882800102234, + "learning_rate": 0.0001714980448437012, + "loss": 1.2244, + "step": 5551 + }, + { + "epoch": 0.9886039886039886, + "grad_norm": 0.6217474341392517, + "learning_rate": 0.0001714882578883606, + "loss": 0.9224, + "step": 5552 + }, + { + "epoch": 0.9887820512820513, + "grad_norm": 0.5846285223960876, + "learning_rate": 0.00017147846953235606, + "loss": 1.2429, + "step": 5553 + }, + { + "epoch": 0.988960113960114, + "grad_norm": 0.5924782752990723, + "learning_rate": 0.00017146867977587936, + "loss": 0.9907, + "step": 5554 + }, + { + "epoch": 0.9891381766381766, + "grad_norm": 0.5756853818893433, + "learning_rate": 0.00017145888861912242, + "loss": 1.1266, + "step": 5555 + }, + { + "epoch": 0.9893162393162394, + "grad_norm": 0.5277376770973206, + "learning_rate": 0.00017144909606227693, + "loss": 
1.1676, + "step": 5556 + }, + { + "epoch": 0.989494301994302, + "grad_norm": 0.5138902068138123, + "learning_rate": 0.00017143930210553485, + "loss": 0.9864, + "step": 5557 + }, + { + "epoch": 0.9896723646723646, + "grad_norm": 0.8072507977485657, + "learning_rate": 0.00017142950674908805, + "loss": 1.111, + "step": 5558 + }, + { + "epoch": 0.9898504273504274, + "grad_norm": 0.5641721487045288, + "learning_rate": 0.00017141970999312844, + "loss": 0.9106, + "step": 5559 + }, + { + "epoch": 0.99002849002849, + "grad_norm": 0.5260798931121826, + "learning_rate": 0.000171409911837848, + "loss": 1.1609, + "step": 5560 + }, + { + "epoch": 0.9902065527065527, + "grad_norm": 0.5398530960083008, + "learning_rate": 0.00017140011228343864, + "loss": 1.0368, + "step": 5561 + }, + { + "epoch": 0.9903846153846154, + "grad_norm": 0.6011313199996948, + "learning_rate": 0.00017139031133009245, + "loss": 1.1314, + "step": 5562 + }, + { + "epoch": 0.9905626780626781, + "grad_norm": 0.6194971203804016, + "learning_rate": 0.00017138050897800135, + "loss": 1.3493, + "step": 5563 + }, + { + "epoch": 0.9907407407407407, + "grad_norm": 0.5779356956481934, + "learning_rate": 0.0001713707052273575, + "loss": 0.943, + "step": 5564 + }, + { + "epoch": 0.9909188034188035, + "grad_norm": 0.5321127772331238, + "learning_rate": 0.00017136090007835293, + "loss": 0.7914, + "step": 5565 + }, + { + "epoch": 0.9910968660968661, + "grad_norm": 0.5470426678657532, + "learning_rate": 0.00017135109353117977, + "loss": 1.2113, + "step": 5566 + }, + { + "epoch": 0.9912749287749287, + "grad_norm": 0.5551436543464661, + "learning_rate": 0.00017134128558603012, + "loss": 0.8932, + "step": 5567 + }, + { + "epoch": 0.9914529914529915, + "grad_norm": 0.45770928263664246, + "learning_rate": 0.0001713314762430962, + "loss": 1.0061, + "step": 5568 + }, + { + "epoch": 0.9916310541310541, + "grad_norm": 0.5578967332839966, + "learning_rate": 0.00017132166550257017, + "loss": 1.148, + "step": 5569 + }, + { + "epoch": 
0.9918091168091168, + "grad_norm": 0.5086452960968018, + "learning_rate": 0.0001713118533646443, + "loss": 0.9803, + "step": 5570 + }, + { + "epoch": 0.9919871794871795, + "grad_norm": 0.4714745879173279, + "learning_rate": 0.00017130203982951078, + "loss": 1.0176, + "step": 5571 + }, + { + "epoch": 0.9921652421652422, + "grad_norm": 0.6254406571388245, + "learning_rate": 0.0001712922248973619, + "loss": 1.0932, + "step": 5572 + }, + { + "epoch": 0.9923433048433048, + "grad_norm": 0.5005003809928894, + "learning_rate": 0.00017128240856838998, + "loss": 1.0783, + "step": 5573 + }, + { + "epoch": 0.9925213675213675, + "grad_norm": 0.5668206214904785, + "learning_rate": 0.00017127259084278733, + "loss": 1.0404, + "step": 5574 + }, + { + "epoch": 0.9926994301994302, + "grad_norm": 0.4976036250591278, + "learning_rate": 0.00017126277172074632, + "loss": 1.1437, + "step": 5575 + }, + { + "epoch": 0.9928774928774928, + "grad_norm": 0.567546546459198, + "learning_rate": 0.00017125295120245935, + "loss": 1.2188, + "step": 5576 + }, + { + "epoch": 0.9930555555555556, + "grad_norm": 0.5614372491836548, + "learning_rate": 0.0001712431292881188, + "loss": 0.9187, + "step": 5577 + }, + { + "epoch": 0.9932336182336182, + "grad_norm": 0.6117973327636719, + "learning_rate": 0.00017123330597791712, + "loss": 1.1285, + "step": 5578 + }, + { + "epoch": 0.9934116809116809, + "grad_norm": 0.6000342965126038, + "learning_rate": 0.00017122348127204676, + "loss": 0.9837, + "step": 5579 + }, + { + "epoch": 0.9935897435897436, + "grad_norm": 0.5453050136566162, + "learning_rate": 0.0001712136551707003, + "loss": 0.8771, + "step": 5580 + }, + { + "epoch": 0.9937678062678063, + "grad_norm": 0.49603891372680664, + "learning_rate": 0.00017120382767407018, + "loss": 1.0754, + "step": 5581 + }, + { + "epoch": 0.9939458689458689, + "grad_norm": 0.48031488060951233, + "learning_rate": 0.00017119399878234894, + "loss": 0.6933, + "step": 5582 + }, + { + "epoch": 0.9941239316239316, + "grad_norm": 
0.6048742532730103, + "learning_rate": 0.0001711841684957292, + "loss": 0.9696, + "step": 5583 + }, + { + "epoch": 0.9943019943019943, + "grad_norm": 0.5183123350143433, + "learning_rate": 0.00017117433681440355, + "loss": 1.1313, + "step": 5584 + }, + { + "epoch": 0.9944800569800569, + "grad_norm": 0.504916250705719, + "learning_rate": 0.00017116450373856466, + "loss": 1.0273, + "step": 5585 + }, + { + "epoch": 0.9946581196581197, + "grad_norm": 0.5804886817932129, + "learning_rate": 0.0001711546692684051, + "loss": 1.1162, + "step": 5586 + }, + { + "epoch": 0.9948361823361823, + "grad_norm": 0.5531938672065735, + "learning_rate": 0.0001711448334041176, + "loss": 1.2893, + "step": 5587 + }, + { + "epoch": 0.9950142450142451, + "grad_norm": 0.5079928636550903, + "learning_rate": 0.00017113499614589492, + "loss": 1.0393, + "step": 5588 + }, + { + "epoch": 0.9951923076923077, + "grad_norm": 0.5421964526176453, + "learning_rate": 0.00017112515749392973, + "loss": 0.8844, + "step": 5589 + }, + { + "epoch": 0.9953703703703703, + "grad_norm": 0.4834558367729187, + "learning_rate": 0.00017111531744841486, + "loss": 1.0187, + "step": 5590 + }, + { + "epoch": 0.9955484330484331, + "grad_norm": 0.6704340577125549, + "learning_rate": 0.00017110547600954307, + "loss": 0.8524, + "step": 5591 + }, + { + "epoch": 0.9957264957264957, + "grad_norm": 0.4578927159309387, + "learning_rate": 0.00017109563317750718, + "loss": 1.059, + "step": 5592 + }, + { + "epoch": 0.9959045584045584, + "grad_norm": 0.5563494563102722, + "learning_rate": 0.00017108578895250006, + "loss": 1.1211, + "step": 5593 + }, + { + "epoch": 0.9960826210826211, + "grad_norm": 0.5272170901298523, + "learning_rate": 0.00017107594333471454, + "loss": 0.9224, + "step": 5594 + }, + { + "epoch": 0.9962606837606838, + "grad_norm": 0.5697501301765442, + "learning_rate": 0.00017106609632434357, + "loss": 1.2223, + "step": 5595 + }, + { + "epoch": 0.9964387464387464, + "grad_norm": 0.5385653376579285, + "learning_rate": 
0.00017105624792158007, + "loss": 1.0809, + "step": 5596 + }, + { + "epoch": 0.9966168091168092, + "grad_norm": 0.5608006119728088, + "learning_rate": 0.000171046398126617, + "loss": 1.3936, + "step": 5597 + }, + { + "epoch": 0.9967948717948718, + "grad_norm": 0.5063132643699646, + "learning_rate": 0.00017103654693964736, + "loss": 1.2086, + "step": 5598 + }, + { + "epoch": 0.9969729344729344, + "grad_norm": 0.6014235019683838, + "learning_rate": 0.00017102669436086415, + "loss": 1.1231, + "step": 5599 + }, + { + "epoch": 0.9971509971509972, + "grad_norm": 0.49549567699432373, + "learning_rate": 0.00017101684039046036, + "loss": 1.0013, + "step": 5600 + }, + { + "epoch": 0.9973290598290598, + "grad_norm": 0.517464816570282, + "learning_rate": 0.00017100698502862916, + "loss": 1.1143, + "step": 5601 + }, + { + "epoch": 0.9975071225071225, + "grad_norm": 0.514281153678894, + "learning_rate": 0.00017099712827556358, + "loss": 1.0336, + "step": 5602 + }, + { + "epoch": 0.9976851851851852, + "grad_norm": 0.5378567576408386, + "learning_rate": 0.00017098727013145672, + "loss": 0.8278, + "step": 5603 + }, + { + "epoch": 0.9978632478632479, + "grad_norm": 0.5098404884338379, + "learning_rate": 0.0001709774105965018, + "loss": 0.9902, + "step": 5604 + }, + { + "epoch": 0.9980413105413105, + "grad_norm": 0.6231759190559387, + "learning_rate": 0.00017096754967089198, + "loss": 1.0564, + "step": 5605 + }, + { + "epoch": 0.9982193732193733, + "grad_norm": 0.47434380650520325, + "learning_rate": 0.00017095768735482042, + "loss": 0.7457, + "step": 5606 + }, + { + "epoch": 0.9983974358974359, + "grad_norm": 0.5771013498306274, + "learning_rate": 0.00017094782364848035, + "loss": 1.1191, + "step": 5607 + }, + { + "epoch": 0.9985754985754985, + "grad_norm": 0.5617234706878662, + "learning_rate": 0.00017093795855206508, + "loss": 1.0779, + "step": 5608 + }, + { + "epoch": 0.9987535612535613, + "grad_norm": 0.6573554873466492, + "learning_rate": 0.00017092809206576792, + "loss": 
1.0191, + "step": 5609 + }, + { + "epoch": 0.9989316239316239, + "grad_norm": 0.482834130525589, + "learning_rate": 0.00017091822418978207, + "loss": 1.0119, + "step": 5610 + }, + { + "epoch": 0.9991096866096866, + "grad_norm": 0.47496405243873596, + "learning_rate": 0.000170908354924301, + "loss": 0.8297, + "step": 5611 + }, + { + "epoch": 0.9992877492877493, + "grad_norm": 0.5013265013694763, + "learning_rate": 0.00017089848426951796, + "loss": 1.1511, + "step": 5612 + }, + { + "epoch": 0.999465811965812, + "grad_norm": 0.5402522683143616, + "learning_rate": 0.00017088861222562643, + "loss": 1.1401, + "step": 5613 + }, + { + "epoch": 0.9996438746438746, + "grad_norm": 0.546302318572998, + "learning_rate": 0.00017087873879281977, + "loss": 0.8611, + "step": 5614 + }, + { + "epoch": 0.9998219373219374, + "grad_norm": 0.44279807806015015, + "learning_rate": 0.0001708688639712915, + "loss": 0.79, + "step": 5615 + }, + { + "epoch": 1.0, + "grad_norm": 0.5514659285545349, + "learning_rate": 0.00017085898776123502, + "loss": 1.0709, + "step": 5616 + }, + { + "epoch": 1.0, + "eval_loss": 1.093075156211853, + "eval_runtime": 24.6155, + "eval_samples_per_second": 42.29, + "eval_steps_per_second": 21.166, + "step": 5616 + }, + { + "epoch": 1.0001780626780628, + "grad_norm": 0.6290156841278076, + "learning_rate": 0.0001708491101628439, + "loss": 1.1786, + "step": 5617 + }, + { + "epoch": 1.0001780626780628, + "grad_norm": 0.4703841209411621, + "learning_rate": 0.00017083923117631162, + "loss": 0.9548, + "step": 5618 + }, + { + "epoch": 1.0003561253561253, + "grad_norm": 0.4518105089664459, + "learning_rate": 0.0001708293508018318, + "loss": 1.0089, + "step": 5619 + }, + { + "epoch": 1.000534188034188, + "grad_norm": 0.5658619403839111, + "learning_rate": 0.00017081946903959794, + "loss": 0.9466, + "step": 5620 + }, + { + "epoch": 1.0007122507122508, + "grad_norm": 0.6153838634490967, + "learning_rate": 0.00017080958588980372, + "loss": 1.2898, + "step": 5621 + }, + { + 
"epoch": 1.0008903133903133, + "grad_norm": 0.5245628952980042, + "learning_rate": 0.00017079970135264275, + "loss": 1.1702, + "step": 5622 + }, + { + "epoch": 1.001068376068376, + "grad_norm": 0.5291880965232849, + "learning_rate": 0.00017078981542830875, + "loss": 1.0779, + "step": 5623 + }, + { + "epoch": 1.0012464387464388, + "grad_norm": 0.500579297542572, + "learning_rate": 0.0001707799281169953, + "loss": 0.9587, + "step": 5624 + }, + { + "epoch": 1.0014245014245013, + "grad_norm": 0.45739707350730896, + "learning_rate": 0.00017077003941889625, + "loss": 0.9373, + "step": 5625 + }, + { + "epoch": 1.001602564102564, + "grad_norm": 0.5513401031494141, + "learning_rate": 0.00017076014933420526, + "loss": 1.0368, + "step": 5626 + }, + { + "epoch": 1.0017806267806268, + "grad_norm": 0.46513232588768005, + "learning_rate": 0.00017075025786311612, + "loss": 0.9422, + "step": 5627 + }, + { + "epoch": 1.0019586894586894, + "grad_norm": 0.4530394673347473, + "learning_rate": 0.00017074036500582267, + "loss": 0.8211, + "step": 5628 + }, + { + "epoch": 1.0021367521367521, + "grad_norm": 0.5612013339996338, + "learning_rate": 0.00017073047076251872, + "loss": 0.9466, + "step": 5629 + }, + { + "epoch": 1.0023148148148149, + "grad_norm": 0.4976879954338074, + "learning_rate": 0.00017072057513339812, + "loss": 0.8059, + "step": 5630 + }, + { + "epoch": 1.0024928774928774, + "grad_norm": 0.4842833876609802, + "learning_rate": 0.00017071067811865476, + "loss": 0.6554, + "step": 5631 + }, + { + "epoch": 1.0026709401709402, + "grad_norm": 0.5446373224258423, + "learning_rate": 0.00017070077971848257, + "loss": 1.1001, + "step": 5632 + }, + { + "epoch": 1.002849002849003, + "grad_norm": 0.5996584892272949, + "learning_rate": 0.00017069087993307544, + "loss": 1.0317, + "step": 5633 + }, + { + "epoch": 1.0030270655270654, + "grad_norm": 0.5369443297386169, + "learning_rate": 0.00017068097876262738, + "loss": 0.8019, + "step": 5634 + }, + { + "epoch": 1.0032051282051282, + 
"grad_norm": 0.4985966682434082, + "learning_rate": 0.00017067107620733236, + "loss": 1.0121, + "step": 5635 + }, + { + "epoch": 1.003383190883191, + "grad_norm": 0.5262824892997742, + "learning_rate": 0.0001706611722673844, + "loss": 1.0157, + "step": 5636 + }, + { + "epoch": 1.0035612535612535, + "grad_norm": 0.5912795066833496, + "learning_rate": 0.00017065126694297756, + "loss": 1.0327, + "step": 5637 + }, + { + "epoch": 1.0037393162393162, + "grad_norm": 0.5866343379020691, + "learning_rate": 0.00017064136023430595, + "loss": 1.1194, + "step": 5638 + }, + { + "epoch": 1.003917378917379, + "grad_norm": 0.5009918808937073, + "learning_rate": 0.0001706314521415636, + "loss": 1.0467, + "step": 5639 + }, + { + "epoch": 1.0040954415954415, + "grad_norm": 0.5455304384231567, + "learning_rate": 0.00017062154266494464, + "loss": 0.8749, + "step": 5640 + }, + { + "epoch": 1.0042735042735043, + "grad_norm": 0.5648258328437805, + "learning_rate": 0.00017061163180464328, + "loss": 0.9408, + "step": 5641 + }, + { + "epoch": 1.004451566951567, + "grad_norm": 0.5276365876197815, + "learning_rate": 0.00017060171956085368, + "loss": 0.9681, + "step": 5642 + }, + { + "epoch": 1.0046296296296295, + "grad_norm": 0.5212745070457458, + "learning_rate": 0.00017059180593377007, + "loss": 0.9188, + "step": 5643 + }, + { + "epoch": 1.0048076923076923, + "grad_norm": 0.540626585483551, + "learning_rate": 0.00017058189092358664, + "loss": 1.0809, + "step": 5644 + }, + { + "epoch": 1.004985754985755, + "grad_norm": 0.5592377781867981, + "learning_rate": 0.00017057197453049767, + "loss": 0.8589, + "step": 5645 + }, + { + "epoch": 1.0051638176638176, + "grad_norm": 0.5115051865577698, + "learning_rate": 0.00017056205675469746, + "loss": 0.8006, + "step": 5646 + }, + { + "epoch": 1.0053418803418803, + "grad_norm": 0.5031117796897888, + "learning_rate": 0.00017055213759638034, + "loss": 0.9242, + "step": 5647 + }, + { + "epoch": 1.005519943019943, + "grad_norm": 0.5342774987220764, + 
"learning_rate": 0.00017054221705574066, + "loss": 0.8268, + "step": 5648 + }, + { + "epoch": 1.0056980056980056, + "grad_norm": 0.44480493664741516, + "learning_rate": 0.00017053229513297276, + "loss": 0.6892, + "step": 5649 + }, + { + "epoch": 1.0058760683760684, + "grad_norm": 0.5032621622085571, + "learning_rate": 0.00017052237182827105, + "loss": 0.971, + "step": 5650 + }, + { + "epoch": 1.006054131054131, + "grad_norm": 0.5611015558242798, + "learning_rate": 0.00017051244714182996, + "loss": 0.9403, + "step": 5651 + }, + { + "epoch": 1.0062321937321936, + "grad_norm": 0.5064613223075867, + "learning_rate": 0.00017050252107384393, + "loss": 0.9718, + "step": 5652 + }, + { + "epoch": 1.0064102564102564, + "grad_norm": 0.6458395719528198, + "learning_rate": 0.0001704925936245075, + "loss": 1.1161, + "step": 5653 + }, + { + "epoch": 1.0065883190883191, + "grad_norm": 0.527418315410614, + "learning_rate": 0.00017048266479401512, + "loss": 0.9315, + "step": 5654 + }, + { + "epoch": 1.0067663817663817, + "grad_norm": 0.5127941370010376, + "learning_rate": 0.00017047273458256133, + "loss": 0.8206, + "step": 5655 + }, + { + "epoch": 1.0069444444444444, + "grad_norm": 0.6257100105285645, + "learning_rate": 0.00017046280299034067, + "loss": 0.9854, + "step": 5656 + }, + { + "epoch": 1.0071225071225072, + "grad_norm": 0.5081700682640076, + "learning_rate": 0.0001704528700175478, + "loss": 0.9478, + "step": 5657 + }, + { + "epoch": 1.0073005698005697, + "grad_norm": 0.598127543926239, + "learning_rate": 0.00017044293566437725, + "loss": 1.0721, + "step": 5658 + }, + { + "epoch": 1.0074786324786325, + "grad_norm": 0.5429877638816833, + "learning_rate": 0.00017043299993102376, + "loss": 0.9732, + "step": 5659 + }, + { + "epoch": 1.0076566951566952, + "grad_norm": 0.6006619334220886, + "learning_rate": 0.00017042306281768194, + "loss": 1.1262, + "step": 5660 + }, + { + "epoch": 1.0078347578347577, + "grad_norm": 0.48933324217796326, + "learning_rate": 0.00017041312432454646, 
+ "loss": 0.8596, + "step": 5661 + }, + { + "epoch": 1.0080128205128205, + "grad_norm": 0.5902166366577148, + "learning_rate": 0.0001704031844518121, + "loss": 1.1035, + "step": 5662 + }, + { + "epoch": 1.0081908831908832, + "grad_norm": 0.523597776889801, + "learning_rate": 0.0001703932431996736, + "loss": 0.7117, + "step": 5663 + }, + { + "epoch": 1.0083689458689458, + "grad_norm": 0.6313928365707397, + "learning_rate": 0.00017038330056832573, + "loss": 1.0204, + "step": 5664 + }, + { + "epoch": 1.0085470085470085, + "grad_norm": 0.5627471804618835, + "learning_rate": 0.00017037335655796328, + "loss": 0.7648, + "step": 5665 + }, + { + "epoch": 1.0087250712250713, + "grad_norm": 0.5817851424217224, + "learning_rate": 0.0001703634111687811, + "loss": 1.0452, + "step": 5666 + }, + { + "epoch": 1.0089031339031338, + "grad_norm": 0.5143535137176514, + "learning_rate": 0.00017035346440097407, + "loss": 0.9788, + "step": 5667 + }, + { + "epoch": 1.0090811965811965, + "grad_norm": 0.5331187844276428, + "learning_rate": 0.000170343516254737, + "loss": 0.7584, + "step": 5668 + }, + { + "epoch": 1.0092592592592593, + "grad_norm": 0.5723634362220764, + "learning_rate": 0.00017033356673026487, + "loss": 0.9435, + "step": 5669 + }, + { + "epoch": 1.0094373219373218, + "grad_norm": 0.6012297868728638, + "learning_rate": 0.00017032361582775265, + "loss": 1.142, + "step": 5670 + }, + { + "epoch": 1.0096153846153846, + "grad_norm": 0.6161282658576965, + "learning_rate": 0.00017031366354739523, + "loss": 1.2823, + "step": 5671 + }, + { + "epoch": 1.0097934472934473, + "grad_norm": 0.5088054537773132, + "learning_rate": 0.00017030370988938763, + "loss": 0.9743, + "step": 5672 + }, + { + "epoch": 1.0099715099715099, + "grad_norm": 0.512003481388092, + "learning_rate": 0.0001702937548539249, + "loss": 0.9112, + "step": 5673 + }, + { + "epoch": 1.0101495726495726, + "grad_norm": 0.5565149784088135, + "learning_rate": 0.00017028379844120207, + "loss": 1.0074, + "step": 5674 + }, + { + 
"epoch": 1.0103276353276354, + "grad_norm": 0.6463099718093872, + "learning_rate": 0.00017027384065141418, + "loss": 1.175, + "step": 5675 + }, + { + "epoch": 1.010505698005698, + "grad_norm": 0.46999064087867737, + "learning_rate": 0.00017026388148475637, + "loss": 0.8429, + "step": 5676 + }, + { + "epoch": 1.0106837606837606, + "grad_norm": 0.5617384910583496, + "learning_rate": 0.00017025392094142377, + "loss": 1.045, + "step": 5677 + }, + { + "epoch": 1.0108618233618234, + "grad_norm": 0.5156623721122742, + "learning_rate": 0.00017024395902161154, + "loss": 1.016, + "step": 5678 + }, + { + "epoch": 1.0110398860398861, + "grad_norm": 0.5693390369415283, + "learning_rate": 0.00017023399572551484, + "loss": 0.8616, + "step": 5679 + }, + { + "epoch": 1.0112179487179487, + "grad_norm": 0.5234879851341248, + "learning_rate": 0.00017022403105332892, + "loss": 0.9244, + "step": 5680 + }, + { + "epoch": 1.0113960113960114, + "grad_norm": 0.6513097286224365, + "learning_rate": 0.00017021406500524893, + "loss": 0.9565, + "step": 5681 + }, + { + "epoch": 1.0115740740740742, + "grad_norm": 0.5788878202438354, + "learning_rate": 0.00017020409758147022, + "loss": 0.8994, + "step": 5682 + }, + { + "epoch": 1.0117521367521367, + "grad_norm": 0.5495247840881348, + "learning_rate": 0.00017019412878218807, + "loss": 0.9371, + "step": 5683 + }, + { + "epoch": 1.0119301994301995, + "grad_norm": 0.639045238494873, + "learning_rate": 0.00017018415860759777, + "loss": 1.0297, + "step": 5684 + }, + { + "epoch": 1.0121082621082622, + "grad_norm": 0.5167784690856934, + "learning_rate": 0.0001701741870578947, + "loss": 0.8974, + "step": 5685 + }, + { + "epoch": 1.0122863247863247, + "grad_norm": 0.6131011247634888, + "learning_rate": 0.00017016421413327417, + "loss": 1.13, + "step": 5686 + }, + { + "epoch": 1.0124643874643875, + "grad_norm": 0.4804688096046448, + "learning_rate": 0.00017015423983393166, + "loss": 1.0098, + "step": 5687 + }, + { + "epoch": 1.0126424501424502, + "grad_norm": 
0.6605221629142761, + "learning_rate": 0.00017014426416006253, + "loss": 1.1123, + "step": 5688 + }, + { + "epoch": 1.0128205128205128, + "grad_norm": 0.5523666739463806, + "learning_rate": 0.00017013428711186226, + "loss": 0.8226, + "step": 5689 + }, + { + "epoch": 1.0129985754985755, + "grad_norm": 0.6012941598892212, + "learning_rate": 0.00017012430868952632, + "loss": 0.8915, + "step": 5690 + }, + { + "epoch": 1.0131766381766383, + "grad_norm": 0.5830875039100647, + "learning_rate": 0.00017011432889325022, + "loss": 1.021, + "step": 5691 + }, + { + "epoch": 1.0133547008547008, + "grad_norm": 0.5546056032180786, + "learning_rate": 0.0001701043477232295, + "loss": 0.7656, + "step": 5692 + }, + { + "epoch": 1.0135327635327636, + "grad_norm": 0.5592601299285889, + "learning_rate": 0.0001700943651796597, + "loss": 1.0172, + "step": 5693 + }, + { + "epoch": 1.0137108262108263, + "grad_norm": 0.5708866715431213, + "learning_rate": 0.00017008438126273645, + "loss": 1.0012, + "step": 5694 + }, + { + "epoch": 1.0138888888888888, + "grad_norm": 0.6856338381767273, + "learning_rate": 0.0001700743959726553, + "loss": 1.1278, + "step": 5695 + }, + { + "epoch": 1.0140669515669516, + "grad_norm": 0.6523802876472473, + "learning_rate": 0.000170064409309612, + "loss": 1.0406, + "step": 5696 + }, + { + "epoch": 1.0142450142450143, + "grad_norm": 0.6653079986572266, + "learning_rate": 0.00017005442127380208, + "loss": 1.1086, + "step": 5697 + }, + { + "epoch": 1.0144230769230769, + "grad_norm": 0.5841104388237, + "learning_rate": 0.00017004443186542133, + "loss": 0.9335, + "step": 5698 + }, + { + "epoch": 1.0146011396011396, + "grad_norm": 0.5696784257888794, + "learning_rate": 0.0001700344410846654, + "loss": 1.0247, + "step": 5699 + }, + { + "epoch": 1.0147792022792024, + "grad_norm": 0.7135653495788574, + "learning_rate": 0.00017002444893173013, + "loss": 1.0259, + "step": 5700 + }, + { + "epoch": 1.014957264957265, + "grad_norm": 0.5806999802589417, + "learning_rate": 
0.00017001445540681124, + "loss": 1.0053, + "step": 5701 + }, + { + "epoch": 1.0151353276353277, + "grad_norm": 0.5298715829849243, + "learning_rate": 0.0001700044605101045, + "loss": 0.9415, + "step": 5702 + }, + { + "epoch": 1.0153133903133904, + "grad_norm": 0.5817379951477051, + "learning_rate": 0.0001699944642418058, + "loss": 1.0906, + "step": 5703 + }, + { + "epoch": 1.015491452991453, + "grad_norm": 0.6564923524856567, + "learning_rate": 0.00016998446660211098, + "loss": 0.9933, + "step": 5704 + }, + { + "epoch": 1.0156695156695157, + "grad_norm": 0.6547308564186096, + "learning_rate": 0.00016997446759121592, + "loss": 1.0045, + "step": 5705 + }, + { + "epoch": 1.0158475783475784, + "grad_norm": 0.5763013958930969, + "learning_rate": 0.00016996446720931652, + "loss": 1.0898, + "step": 5706 + }, + { + "epoch": 1.016025641025641, + "grad_norm": 0.6118074059486389, + "learning_rate": 0.00016995446545660871, + "loss": 0.9398, + "step": 5707 + }, + { + "epoch": 1.0162037037037037, + "grad_norm": 0.6810526251792908, + "learning_rate": 0.0001699444623332885, + "loss": 1.0968, + "step": 5708 + }, + { + "epoch": 1.0163817663817665, + "grad_norm": 0.5292752981185913, + "learning_rate": 0.00016993445783955184, + "loss": 0.7549, + "step": 5709 + }, + { + "epoch": 1.016559829059829, + "grad_norm": 0.6014277935028076, + "learning_rate": 0.00016992445197559474, + "loss": 1.1711, + "step": 5710 + }, + { + "epoch": 1.0167378917378918, + "grad_norm": 0.5089772343635559, + "learning_rate": 0.00016991444474161326, + "loss": 0.9188, + "step": 5711 + }, + { + "epoch": 1.0169159544159545, + "grad_norm": 0.567193865776062, + "learning_rate": 0.0001699044361378035, + "loss": 0.7462, + "step": 5712 + }, + { + "epoch": 1.017094017094017, + "grad_norm": 0.5638598799705505, + "learning_rate": 0.00016989442616436147, + "loss": 0.9643, + "step": 5713 + }, + { + "epoch": 1.0172720797720798, + "grad_norm": 0.5634039640426636, + "learning_rate": 0.0001698844148214834, + "loss": 1.0141, + 
"step": 5714 + }, + { + "epoch": 1.0174501424501425, + "grad_norm": 0.5326652526855469, + "learning_rate": 0.00016987440210936537, + "loss": 0.865, + "step": 5715 + }, + { + "epoch": 1.017628205128205, + "grad_norm": 0.5858046412467957, + "learning_rate": 0.0001698643880282036, + "loss": 0.9561, + "step": 5716 + }, + { + "epoch": 1.0178062678062678, + "grad_norm": 0.6424698829650879, + "learning_rate": 0.00016985437257819428, + "loss": 1.0169, + "step": 5717 + }, + { + "epoch": 1.0179843304843306, + "grad_norm": 0.6294280290603638, + "learning_rate": 0.00016984435575953364, + "loss": 1.0438, + "step": 5718 + }, + { + "epoch": 1.018162393162393, + "grad_norm": 0.5533088445663452, + "learning_rate": 0.00016983433757241788, + "loss": 0.8901, + "step": 5719 + }, + { + "epoch": 1.0183404558404558, + "grad_norm": 0.5148718953132629, + "learning_rate": 0.00016982431801704342, + "loss": 0.9201, + "step": 5720 + }, + { + "epoch": 1.0185185185185186, + "grad_norm": 0.5609371662139893, + "learning_rate": 0.00016981429709360645, + "loss": 0.9347, + "step": 5721 + }, + { + "epoch": 1.0186965811965811, + "grad_norm": 0.5502731204032898, + "learning_rate": 0.00016980427480230338, + "loss": 1.0508, + "step": 5722 + }, + { + "epoch": 1.0188746438746439, + "grad_norm": 0.5880394577980042, + "learning_rate": 0.00016979425114333055, + "loss": 1.1258, + "step": 5723 + }, + { + "epoch": 1.0190527065527066, + "grad_norm": 0.5569866895675659, + "learning_rate": 0.0001697842261168843, + "loss": 0.9186, + "step": 5724 + }, + { + "epoch": 1.0192307692307692, + "grad_norm": 0.7468093037605286, + "learning_rate": 0.00016977419972316116, + "loss": 1.2066, + "step": 5725 + }, + { + "epoch": 1.019408831908832, + "grad_norm": 0.6041515469551086, + "learning_rate": 0.00016976417196235753, + "loss": 0.939, + "step": 5726 + }, + { + "epoch": 1.0195868945868947, + "grad_norm": 0.6102641224861145, + "learning_rate": 0.00016975414283466983, + "loss": 0.8334, + "step": 5727 + }, + { + "epoch": 
1.0197649572649572, + "grad_norm": 0.5418640375137329, + "learning_rate": 0.00016974411234029467, + "loss": 0.8072, + "step": 5728 + }, + { + "epoch": 1.01994301994302, + "grad_norm": 0.6569705605506897, + "learning_rate": 0.00016973408047942843, + "loss": 1.103, + "step": 5729 + }, + { + "epoch": 1.0201210826210827, + "grad_norm": 0.5778102278709412, + "learning_rate": 0.00016972404725226778, + "loss": 0.9353, + "step": 5730 + }, + { + "epoch": 1.0202991452991452, + "grad_norm": 0.5474382638931274, + "learning_rate": 0.0001697140126590093, + "loss": 1.0009, + "step": 5731 + }, + { + "epoch": 1.020477207977208, + "grad_norm": 0.5869506597518921, + "learning_rate": 0.00016970397669984947, + "loss": 1.0027, + "step": 5732 + }, + { + "epoch": 1.0206552706552707, + "grad_norm": 0.5078117251396179, + "learning_rate": 0.00016969393937498508, + "loss": 0.8316, + "step": 5733 + }, + { + "epoch": 1.0208333333333333, + "grad_norm": 0.5488452911376953, + "learning_rate": 0.0001696839006846127, + "loss": 0.8438, + "step": 5734 + }, + { + "epoch": 1.021011396011396, + "grad_norm": 0.5921052098274231, + "learning_rate": 0.00016967386062892908, + "loss": 0.9147, + "step": 5735 + }, + { + "epoch": 1.0211894586894588, + "grad_norm": 0.5486881136894226, + "learning_rate": 0.00016966381920813085, + "loss": 0.7619, + "step": 5736 + }, + { + "epoch": 1.0213675213675213, + "grad_norm": 0.5250689387321472, + "learning_rate": 0.00016965377642241483, + "loss": 0.9192, + "step": 5737 + }, + { + "epoch": 1.021545584045584, + "grad_norm": 0.5355087518692017, + "learning_rate": 0.00016964373227197773, + "loss": 0.954, + "step": 5738 + }, + { + "epoch": 1.0217236467236468, + "grad_norm": 0.6758780479431152, + "learning_rate": 0.0001696336867570164, + "loss": 1.1257, + "step": 5739 + }, + { + "epoch": 1.0219017094017093, + "grad_norm": 0.6361044049263, + "learning_rate": 0.00016962363987772756, + "loss": 1.0889, + "step": 5740 + }, + { + "epoch": 1.022079772079772, + "grad_norm": 
0.5802326798439026, + "learning_rate": 0.00016961359163430819, + "loss": 0.8966, + "step": 5741 + }, + { + "epoch": 1.0222578347578348, + "grad_norm": 0.5535712242126465, + "learning_rate": 0.00016960354202695508, + "loss": 1.0007, + "step": 5742 + }, + { + "epoch": 1.0224358974358974, + "grad_norm": 0.5469220280647278, + "learning_rate": 0.00016959349105586516, + "loss": 0.8202, + "step": 5743 + }, + { + "epoch": 1.02261396011396, + "grad_norm": 0.5533008575439453, + "learning_rate": 0.00016958343872123534, + "loss": 0.9576, + "step": 5744 + }, + { + "epoch": 1.0227920227920229, + "grad_norm": 0.615132749080658, + "learning_rate": 0.00016957338502326258, + "loss": 0.8719, + "step": 5745 + }, + { + "epoch": 1.0229700854700854, + "grad_norm": 0.519075334072113, + "learning_rate": 0.0001695633299621439, + "loss": 0.8309, + "step": 5746 + }, + { + "epoch": 1.0231481481481481, + "grad_norm": 0.6249759197235107, + "learning_rate": 0.00016955327353807624, + "loss": 1.151, + "step": 5747 + }, + { + "epoch": 1.023326210826211, + "grad_norm": 0.560299277305603, + "learning_rate": 0.00016954321575125668, + "loss": 0.7889, + "step": 5748 + }, + { + "epoch": 1.0235042735042734, + "grad_norm": 0.5735262036323547, + "learning_rate": 0.0001695331566018823, + "loss": 0.8794, + "step": 5749 + }, + { + "epoch": 1.0236823361823362, + "grad_norm": 0.5893994569778442, + "learning_rate": 0.00016952309609015012, + "loss": 0.9696, + "step": 5750 + }, + { + "epoch": 1.023860398860399, + "grad_norm": 0.6064512133598328, + "learning_rate": 0.0001695130342162573, + "loss": 0.9771, + "step": 5751 + }, + { + "epoch": 1.0240384615384615, + "grad_norm": 0.5833427309989929, + "learning_rate": 0.00016950297098040099, + "loss": 1.1768, + "step": 5752 + }, + { + "epoch": 1.0242165242165242, + "grad_norm": 0.5940282344818115, + "learning_rate": 0.00016949290638277833, + "loss": 1.0758, + "step": 5753 + }, + { + "epoch": 1.024394586894587, + "grad_norm": 0.5267124772071838, + "learning_rate": 
0.00016948284042358656, + "loss": 0.772, + "step": 5754 + }, + { + "epoch": 1.0245726495726495, + "grad_norm": 0.6217982172966003, + "learning_rate": 0.00016947277310302284, + "loss": 0.8583, + "step": 5755 + }, + { + "epoch": 1.0247507122507122, + "grad_norm": 0.6192215085029602, + "learning_rate": 0.00016946270442128443, + "loss": 0.9148, + "step": 5756 + }, + { + "epoch": 1.024928774928775, + "grad_norm": 0.5337123870849609, + "learning_rate": 0.00016945263437856867, + "loss": 1.0054, + "step": 5757 + }, + { + "epoch": 1.0251068376068375, + "grad_norm": 0.5462040901184082, + "learning_rate": 0.00016944256297507276, + "loss": 1.1097, + "step": 5758 + }, + { + "epoch": 1.0252849002849003, + "grad_norm": 0.5606170892715454, + "learning_rate": 0.00016943249021099415, + "loss": 1.0192, + "step": 5759 + }, + { + "epoch": 1.025462962962963, + "grad_norm": 0.636974573135376, + "learning_rate": 0.00016942241608653008, + "loss": 1.0241, + "step": 5760 + }, + { + "epoch": 1.0256410256410255, + "grad_norm": 0.4895164966583252, + "learning_rate": 0.00016941234060187797, + "loss": 0.9057, + "step": 5761 + }, + { + "epoch": 1.0258190883190883, + "grad_norm": 0.5810303092002869, + "learning_rate": 0.00016940226375723527, + "loss": 1.0809, + "step": 5762 + }, + { + "epoch": 1.025997150997151, + "grad_norm": 0.6043853163719177, + "learning_rate": 0.00016939218555279937, + "loss": 1.0685, + "step": 5763 + }, + { + "epoch": 1.0261752136752136, + "grad_norm": 0.5827188491821289, + "learning_rate": 0.00016938210598876774, + "loss": 1.0236, + "step": 5764 + }, + { + "epoch": 1.0263532763532763, + "grad_norm": 0.6677887439727783, + "learning_rate": 0.0001693720250653379, + "loss": 1.0586, + "step": 5765 + }, + { + "epoch": 1.026531339031339, + "grad_norm": 0.558051347732544, + "learning_rate": 0.0001693619427827073, + "loss": 0.745, + "step": 5766 + }, + { + "epoch": 1.0267094017094016, + "grad_norm": 0.6336706280708313, + "learning_rate": 0.0001693518591410735, + "loss": 1.0658, + 
"step": 5767 + }, + { + "epoch": 1.0268874643874644, + "grad_norm": 0.7077126502990723, + "learning_rate": 0.00016934177414063416, + "loss": 1.18, + "step": 5768 + }, + { + "epoch": 1.0270655270655271, + "grad_norm": 0.5342326760292053, + "learning_rate": 0.00016933168778158675, + "loss": 0.8347, + "step": 5769 + }, + { + "epoch": 1.0272435897435896, + "grad_norm": 0.6116416454315186, + "learning_rate": 0.00016932160006412895, + "loss": 1.0648, + "step": 5770 + }, + { + "epoch": 1.0274216524216524, + "grad_norm": 0.5411320924758911, + "learning_rate": 0.0001693115109884584, + "loss": 1.0756, + "step": 5771 + }, + { + "epoch": 1.0275997150997151, + "grad_norm": 0.5549847483634949, + "learning_rate": 0.00016930142055477277, + "loss": 0.7259, + "step": 5772 + }, + { + "epoch": 1.0277777777777777, + "grad_norm": 0.549010694026947, + "learning_rate": 0.00016929132876326977, + "loss": 0.9488, + "step": 5773 + }, + { + "epoch": 1.0279558404558404, + "grad_norm": 0.6302017569541931, + "learning_rate": 0.00016928123561414714, + "loss": 0.8851, + "step": 5774 + }, + { + "epoch": 1.0281339031339032, + "grad_norm": 0.5831273198127747, + "learning_rate": 0.00016927114110760257, + "loss": 0.7841, + "step": 5775 + }, + { + "epoch": 1.0283119658119657, + "grad_norm": 0.5528474450111389, + "learning_rate": 0.00016926104524383394, + "loss": 1.0108, + "step": 5776 + }, + { + "epoch": 1.0284900284900285, + "grad_norm": 0.6279126405715942, + "learning_rate": 0.00016925094802303897, + "loss": 0.8632, + "step": 5777 + }, + { + "epoch": 1.0286680911680912, + "grad_norm": 0.6783218383789062, + "learning_rate": 0.00016924084944541554, + "loss": 1.0746, + "step": 5778 + }, + { + "epoch": 1.0288461538461537, + "grad_norm": 0.5823925137519836, + "learning_rate": 0.00016923074951116153, + "loss": 1.0486, + "step": 5779 + }, + { + "epoch": 1.0290242165242165, + "grad_norm": 0.6095981597900391, + "learning_rate": 0.00016922064822047473, + "loss": 0.8113, + "step": 5780 + }, + { + "epoch": 
1.0292022792022792, + "grad_norm": 0.7887664437294006, + "learning_rate": 0.00016921054557355317, + "loss": 1.2411, + "step": 5781 + }, + { + "epoch": 1.0293803418803418, + "grad_norm": 0.6511263251304626, + "learning_rate": 0.00016920044157059475, + "loss": 0.924, + "step": 5782 + }, + { + "epoch": 1.0295584045584045, + "grad_norm": 0.6045661568641663, + "learning_rate": 0.00016919033621179744, + "loss": 0.8373, + "step": 5783 + }, + { + "epoch": 1.0297364672364673, + "grad_norm": 0.6914188861846924, + "learning_rate": 0.0001691802294973592, + "loss": 0.9589, + "step": 5784 + }, + { + "epoch": 1.0299145299145298, + "grad_norm": 0.6483730673789978, + "learning_rate": 0.00016917012142747805, + "loss": 0.9871, + "step": 5785 + }, + { + "epoch": 1.0300925925925926, + "grad_norm": 0.5775033235549927, + "learning_rate": 0.0001691600120023521, + "loss": 1.0591, + "step": 5786 + }, + { + "epoch": 1.0302706552706553, + "grad_norm": 0.6206814646720886, + "learning_rate": 0.00016914990122217932, + "loss": 0.9126, + "step": 5787 + }, + { + "epoch": 1.0304487179487178, + "grad_norm": 0.5422028303146362, + "learning_rate": 0.00016913978908715796, + "loss": 0.8227, + "step": 5788 + }, + { + "epoch": 1.0306267806267806, + "grad_norm": 0.5824416875839233, + "learning_rate": 0.000169129675597486, + "loss": 1.111, + "step": 5789 + }, + { + "epoch": 1.0308048433048433, + "grad_norm": 0.5419015884399414, + "learning_rate": 0.00016911956075336165, + "loss": 0.8941, + "step": 5790 + }, + { + "epoch": 1.0309829059829059, + "grad_norm": 0.6171557903289795, + "learning_rate": 0.0001691094445549831, + "loss": 0.8679, + "step": 5791 + }, + { + "epoch": 1.0311609686609686, + "grad_norm": 0.6136980056762695, + "learning_rate": 0.00016909932700254855, + "loss": 0.9266, + "step": 5792 + }, + { + "epoch": 1.0313390313390314, + "grad_norm": 0.6275020241737366, + "learning_rate": 0.00016908920809625624, + "loss": 1.0828, + "step": 5793 + }, + { + "epoch": 1.0315170940170941, + "grad_norm": 
0.6538251638412476, + "learning_rate": 0.0001690790878363044, + "loss": 0.8413, + "step": 5794 + }, + { + "epoch": 1.0316951566951567, + "grad_norm": 0.5981295108795166, + "learning_rate": 0.00016906896622289136, + "loss": 0.9845, + "step": 5795 + }, + { + "epoch": 1.0318732193732194, + "grad_norm": 0.5390967130661011, + "learning_rate": 0.00016905884325621538, + "loss": 0.8755, + "step": 5796 + }, + { + "epoch": 1.032051282051282, + "grad_norm": 0.5534448623657227, + "learning_rate": 0.00016904871893647482, + "loss": 1.1868, + "step": 5797 + }, + { + "epoch": 1.0322293447293447, + "grad_norm": 0.664556086063385, + "learning_rate": 0.00016903859326386806, + "loss": 1.1418, + "step": 5798 + }, + { + "epoch": 1.0324074074074074, + "grad_norm": 0.5737143158912659, + "learning_rate": 0.00016902846623859346, + "loss": 1.124, + "step": 5799 + }, + { + "epoch": 1.0325854700854702, + "grad_norm": 0.6499935388565063, + "learning_rate": 0.0001690183378608495, + "loss": 1.0331, + "step": 5800 + }, + { + "epoch": 1.0327635327635327, + "grad_norm": 0.5721518993377686, + "learning_rate": 0.00016900820813083454, + "loss": 0.8664, + "step": 5801 + }, + { + "epoch": 1.0329415954415955, + "grad_norm": 0.5651140809059143, + "learning_rate": 0.0001689980770487471, + "loss": 1.1661, + "step": 5802 + }, + { + "epoch": 1.0331196581196582, + "grad_norm": 0.5935871005058289, + "learning_rate": 0.0001689879446147857, + "loss": 0.8722, + "step": 5803 + }, + { + "epoch": 1.0332977207977208, + "grad_norm": 0.5627842545509338, + "learning_rate": 0.00016897781082914884, + "loss": 1.0036, + "step": 5804 + }, + { + "epoch": 1.0334757834757835, + "grad_norm": 0.5866895914077759, + "learning_rate": 0.00016896767569203502, + "loss": 0.9739, + "step": 5805 + }, + { + "epoch": 1.0336538461538463, + "grad_norm": 0.5568059682846069, + "learning_rate": 0.0001689575392036429, + "loss": 0.7081, + "step": 5806 + }, + { + "epoch": 1.0338319088319088, + "grad_norm": 0.6054235100746155, + "learning_rate": 
0.00016894740136417103, + "loss": 1.1168, + "step": 5807 + }, + { + "epoch": 1.0340099715099715, + "grad_norm": 0.5215454697608948, + "learning_rate": 0.00016893726217381805, + "loss": 0.9172, + "step": 5808 + }, + { + "epoch": 1.0341880341880343, + "grad_norm": 0.5415732860565186, + "learning_rate": 0.00016892712163278263, + "loss": 0.7812, + "step": 5809 + }, + { + "epoch": 1.0343660968660968, + "grad_norm": 0.6341692805290222, + "learning_rate": 0.00016891697974126345, + "loss": 1.0658, + "step": 5810 + }, + { + "epoch": 1.0345441595441596, + "grad_norm": 0.6326245665550232, + "learning_rate": 0.00016890683649945922, + "loss": 1.0134, + "step": 5811 + }, + { + "epoch": 1.0347222222222223, + "grad_norm": 0.5729571580886841, + "learning_rate": 0.00016889669190756868, + "loss": 0.9139, + "step": 5812 + }, + { + "epoch": 1.0349002849002849, + "grad_norm": 0.5912853479385376, + "learning_rate": 0.00016888654596579054, + "loss": 1.122, + "step": 5813 + }, + { + "epoch": 1.0350783475783476, + "grad_norm": 0.8410450220108032, + "learning_rate": 0.00016887639867432368, + "loss": 1.3009, + "step": 5814 + }, + { + "epoch": 1.0352564102564104, + "grad_norm": 0.5416620969772339, + "learning_rate": 0.00016886625003336683, + "loss": 0.8751, + "step": 5815 + }, + { + "epoch": 1.0354344729344729, + "grad_norm": 0.6367851495742798, + "learning_rate": 0.0001688561000431189, + "loss": 0.956, + "step": 5816 + }, + { + "epoch": 1.0356125356125356, + "grad_norm": 0.4618827700614929, + "learning_rate": 0.0001688459487037787, + "loss": 0.5313, + "step": 5817 + }, + { + "epoch": 1.0357905982905984, + "grad_norm": 0.7139244079589844, + "learning_rate": 0.00016883579601554516, + "loss": 1.0787, + "step": 5818 + }, + { + "epoch": 1.035968660968661, + "grad_norm": 0.6896135210990906, + "learning_rate": 0.00016882564197861715, + "loss": 0.932, + "step": 5819 + }, + { + "epoch": 1.0361467236467237, + "grad_norm": 0.5889739394187927, + "learning_rate": 0.00016881548659319372, + "loss": 0.8852, 
+ "step": 5820 + }, + { + "epoch": 1.0363247863247864, + "grad_norm": 0.5954701900482178, + "learning_rate": 0.00016880532985947375, + "loss": 0.8192, + "step": 5821 + }, + { + "epoch": 1.036502849002849, + "grad_norm": 0.6665091514587402, + "learning_rate": 0.00016879517177765627, + "loss": 0.9578, + "step": 5822 + }, + { + "epoch": 1.0366809116809117, + "grad_norm": 0.5990539789199829, + "learning_rate": 0.00016878501234794034, + "loss": 0.9797, + "step": 5823 + }, + { + "epoch": 1.0368589743589745, + "grad_norm": 0.596755862236023, + "learning_rate": 0.00016877485157052496, + "loss": 1.173, + "step": 5824 + }, + { + "epoch": 1.037037037037037, + "grad_norm": 0.544658362865448, + "learning_rate": 0.00016876468944560923, + "loss": 1.0742, + "step": 5825 + }, + { + "epoch": 1.0372150997150997, + "grad_norm": 0.5841910243034363, + "learning_rate": 0.00016875452597339225, + "loss": 1.029, + "step": 5826 + }, + { + "epoch": 1.0373931623931625, + "grad_norm": 0.6508592963218689, + "learning_rate": 0.00016874436115407317, + "loss": 0.9883, + "step": 5827 + }, + { + "epoch": 1.037571225071225, + "grad_norm": 0.590050458908081, + "learning_rate": 0.00016873419498785114, + "loss": 1.0713, + "step": 5828 + }, + { + "epoch": 1.0377492877492878, + "grad_norm": 0.5386307239532471, + "learning_rate": 0.00016872402747492534, + "loss": 1.0159, + "step": 5829 + }, + { + "epoch": 1.0379273504273505, + "grad_norm": 0.6173896193504333, + "learning_rate": 0.00016871385861549497, + "loss": 1.0056, + "step": 5830 + }, + { + "epoch": 1.038105413105413, + "grad_norm": 0.5377787351608276, + "learning_rate": 0.0001687036884097593, + "loss": 0.8708, + "step": 5831 + }, + { + "epoch": 1.0382834757834758, + "grad_norm": 0.5753569006919861, + "learning_rate": 0.00016869351685791756, + "loss": 1.0529, + "step": 5832 + }, + { + "epoch": 1.0384615384615385, + "grad_norm": 0.6085895299911499, + "learning_rate": 0.00016868334396016906, + "loss": 1.1017, + "step": 5833 + }, + { + "epoch": 
1.038639601139601, + "grad_norm": 0.6320509910583496, + "learning_rate": 0.0001686731697167131, + "loss": 1.0543, + "step": 5834 + }, + { + "epoch": 1.0388176638176638, + "grad_norm": 0.5691760778427124, + "learning_rate": 0.00016866299412774907, + "loss": 0.9975, + "step": 5835 + }, + { + "epoch": 1.0389957264957266, + "grad_norm": 0.5990765690803528, + "learning_rate": 0.0001686528171934763, + "loss": 0.8776, + "step": 5836 + }, + { + "epoch": 1.039173789173789, + "grad_norm": 0.6650477647781372, + "learning_rate": 0.00016864263891409415, + "loss": 1.0652, + "step": 5837 + }, + { + "epoch": 1.0393518518518519, + "grad_norm": 0.6050353646278381, + "learning_rate": 0.00016863245928980212, + "loss": 0.9313, + "step": 5838 + }, + { + "epoch": 1.0395299145299146, + "grad_norm": 0.587505578994751, + "learning_rate": 0.0001686222783207996, + "loss": 0.9892, + "step": 5839 + }, + { + "epoch": 1.0397079772079771, + "grad_norm": 0.6310170292854309, + "learning_rate": 0.00016861209600728608, + "loss": 1.1045, + "step": 5840 + }, + { + "epoch": 1.03988603988604, + "grad_norm": 0.5683430433273315, + "learning_rate": 0.0001686019123494611, + "loss": 1.0507, + "step": 5841 + }, + { + "epoch": 1.0400641025641026, + "grad_norm": 0.6621488332748413, + "learning_rate": 0.00016859172734752414, + "loss": 0.9255, + "step": 5842 + }, + { + "epoch": 1.0402421652421652, + "grad_norm": 0.6197706460952759, + "learning_rate": 0.00016858154100167475, + "loss": 1.0031, + "step": 5843 + }, + { + "epoch": 1.040420227920228, + "grad_norm": 0.6805898547172546, + "learning_rate": 0.00016857135331211257, + "loss": 0.9901, + "step": 5844 + }, + { + "epoch": 1.0405982905982907, + "grad_norm": 0.5512405633926392, + "learning_rate": 0.00016856116427903714, + "loss": 1.0033, + "step": 5845 + }, + { + "epoch": 1.0407763532763532, + "grad_norm": 0.5643384456634521, + "learning_rate": 0.00016855097390264815, + "loss": 0.9136, + "step": 5846 + }, + { + "epoch": 1.040954415954416, + "grad_norm": 
0.48351922631263733, + "learning_rate": 0.0001685407821831452, + "loss": 0.6163, + "step": 5847 + }, + { + "epoch": 1.0411324786324787, + "grad_norm": 0.6256039142608643, + "learning_rate": 0.00016853058912072802, + "loss": 0.9409, + "step": 5848 + }, + { + "epoch": 1.0413105413105412, + "grad_norm": 0.6539996862411499, + "learning_rate": 0.00016852039471559627, + "loss": 0.9367, + "step": 5849 + }, + { + "epoch": 1.041488603988604, + "grad_norm": 0.6192609667778015, + "learning_rate": 0.00016851019896794975, + "loss": 0.9631, + "step": 5850 + }, + { + "epoch": 1.0416666666666667, + "grad_norm": 0.613563060760498, + "learning_rate": 0.0001685000018779882, + "loss": 0.9132, + "step": 5851 + }, + { + "epoch": 1.0418447293447293, + "grad_norm": 0.6004200577735901, + "learning_rate": 0.0001684898034459114, + "loss": 1.1313, + "step": 5852 + }, + { + "epoch": 1.042022792022792, + "grad_norm": 0.6158567070960999, + "learning_rate": 0.0001684796036719192, + "loss": 1.0253, + "step": 5853 + }, + { + "epoch": 1.0422008547008548, + "grad_norm": 0.6362335085868835, + "learning_rate": 0.00016846940255621143, + "loss": 0.93, + "step": 5854 + }, + { + "epoch": 1.0423789173789173, + "grad_norm": 0.6148427128791809, + "learning_rate": 0.00016845920009898787, + "loss": 0.9122, + "step": 5855 + }, + { + "epoch": 1.04255698005698, + "grad_norm": 0.5119984149932861, + "learning_rate": 0.00016844899630044858, + "loss": 0.7954, + "step": 5856 + }, + { + "epoch": 1.0427350427350428, + "grad_norm": 0.571849524974823, + "learning_rate": 0.00016843879116079338, + "loss": 0.8588, + "step": 5857 + }, + { + "epoch": 1.0429131054131053, + "grad_norm": 0.6173384785652161, + "learning_rate": 0.00016842858468022221, + "loss": 1.0475, + "step": 5858 + }, + { + "epoch": 1.043091168091168, + "grad_norm": 0.566114068031311, + "learning_rate": 0.0001684183768589351, + "loss": 0.8485, + "step": 5859 + }, + { + "epoch": 1.0432692307692308, + "grad_norm": 0.653134286403656, + "learning_rate": 
0.000168408167697132, + "loss": 0.9976, + "step": 5860 + }, + { + "epoch": 1.0434472934472934, + "grad_norm": 0.63815838098526, + "learning_rate": 0.00016839795719501296, + "loss": 0.7091, + "step": 5861 + }, + { + "epoch": 1.0436253561253561, + "grad_norm": 0.5109001994132996, + "learning_rate": 0.00016838774535277805, + "loss": 0.7668, + "step": 5862 + }, + { + "epoch": 1.0438034188034189, + "grad_norm": 0.6741907596588135, + "learning_rate": 0.0001683775321706273, + "loss": 1.0493, + "step": 5863 + }, + { + "epoch": 1.0439814814814814, + "grad_norm": 0.6006115674972534, + "learning_rate": 0.0001683673176487609, + "loss": 0.9784, + "step": 5864 + }, + { + "epoch": 1.0441595441595442, + "grad_norm": 0.5504778027534485, + "learning_rate": 0.0001683571017873789, + "loss": 0.9718, + "step": 5865 + }, + { + "epoch": 1.044337606837607, + "grad_norm": 0.5713102221488953, + "learning_rate": 0.00016834688458668148, + "loss": 1.12, + "step": 5866 + }, + { + "epoch": 1.0445156695156694, + "grad_norm": 0.7878454923629761, + "learning_rate": 0.00016833666604686886, + "loss": 1.1803, + "step": 5867 + }, + { + "epoch": 1.0446937321937322, + "grad_norm": 0.582697331905365, + "learning_rate": 0.00016832644616814122, + "loss": 0.943, + "step": 5868 + }, + { + "epoch": 1.044871794871795, + "grad_norm": 0.5300645232200623, + "learning_rate": 0.00016831622495069878, + "loss": 0.9087, + "step": 5869 + }, + { + "epoch": 1.0450498575498575, + "grad_norm": 0.5627666115760803, + "learning_rate": 0.00016830600239474186, + "loss": 1.081, + "step": 5870 + }, + { + "epoch": 1.0452279202279202, + "grad_norm": 0.6760496497154236, + "learning_rate": 0.0001682957785004707, + "loss": 1.1098, + "step": 5871 + }, + { + "epoch": 1.045405982905983, + "grad_norm": 0.6424084901809692, + "learning_rate": 0.00016828555326808565, + "loss": 0.9657, + "step": 5872 + }, + { + "epoch": 1.0455840455840455, + "grad_norm": 0.5523313283920288, + "learning_rate": 0.000168275326697787, + "loss": 1.0163, + "step": 
5873 + }, + { + "epoch": 1.0457621082621082, + "grad_norm": 0.5582337975502014, + "learning_rate": 0.00016826509878977518, + "loss": 0.8825, + "step": 5874 + }, + { + "epoch": 1.045940170940171, + "grad_norm": 0.5603214502334595, + "learning_rate": 0.00016825486954425055, + "loss": 0.9032, + "step": 5875 + }, + { + "epoch": 1.0461182336182335, + "grad_norm": 0.5944222807884216, + "learning_rate": 0.00016824463896141355, + "loss": 0.9384, + "step": 5876 + }, + { + "epoch": 1.0462962962962963, + "grad_norm": 0.6220229268074036, + "learning_rate": 0.00016823440704146457, + "loss": 0.8962, + "step": 5877 + }, + { + "epoch": 1.046474358974359, + "grad_norm": 0.5607972145080566, + "learning_rate": 0.0001682241737846042, + "loss": 0.9385, + "step": 5878 + }, + { + "epoch": 1.0466524216524216, + "grad_norm": 0.6206870079040527, + "learning_rate": 0.00016821393919103282, + "loss": 1.0597, + "step": 5879 + }, + { + "epoch": 1.0468304843304843, + "grad_norm": 0.5126399993896484, + "learning_rate": 0.000168203703260951, + "loss": 0.9403, + "step": 5880 + }, + { + "epoch": 1.047008547008547, + "grad_norm": 0.6569282412528992, + "learning_rate": 0.00016819346599455929, + "loss": 0.8124, + "step": 5881 + }, + { + "epoch": 1.0471866096866096, + "grad_norm": 0.6670137047767639, + "learning_rate": 0.0001681832273920583, + "loss": 1.1927, + "step": 5882 + }, + { + "epoch": 1.0473646723646723, + "grad_norm": 0.5403243899345398, + "learning_rate": 0.00016817298745364862, + "loss": 0.8539, + "step": 5883 + }, + { + "epoch": 1.047542735042735, + "grad_norm": 0.5500505566596985, + "learning_rate": 0.00016816274617953086, + "loss": 1.1064, + "step": 5884 + }, + { + "epoch": 1.0477207977207976, + "grad_norm": 0.5482703447341919, + "learning_rate": 0.00016815250356990566, + "loss": 0.7276, + "step": 5885 + }, + { + "epoch": 1.0478988603988604, + "grad_norm": 0.6290771961212158, + "learning_rate": 0.00016814225962497373, + "loss": 0.9018, + "step": 5886 + }, + { + "epoch": 1.0480769230769231, 
+ "grad_norm": 0.6404094696044922, + "learning_rate": 0.00016813201434493578, + "loss": 1.0638, + "step": 5887 + }, + { + "epoch": 1.0482549857549857, + "grad_norm": 0.5484994053840637, + "learning_rate": 0.0001681217677299926, + "loss": 1.0033, + "step": 5888 + }, + { + "epoch": 1.0484330484330484, + "grad_norm": 0.6474852561950684, + "learning_rate": 0.0001681115197803448, + "loss": 1.1017, + "step": 5889 + }, + { + "epoch": 1.0486111111111112, + "grad_norm": 0.6186243295669556, + "learning_rate": 0.0001681012704961933, + "loss": 0.9978, + "step": 5890 + }, + { + "epoch": 1.0487891737891737, + "grad_norm": 0.6244034767150879, + "learning_rate": 0.00016809101987773887, + "loss": 0.9906, + "step": 5891 + }, + { + "epoch": 1.0489672364672364, + "grad_norm": 0.5893426537513733, + "learning_rate": 0.00016808076792518235, + "loss": 0.9345, + "step": 5892 + }, + { + "epoch": 1.0491452991452992, + "grad_norm": 0.6283876299858093, + "learning_rate": 0.0001680705146387246, + "loss": 1.0041, + "step": 5893 + }, + { + "epoch": 1.0493233618233617, + "grad_norm": 0.6075255870819092, + "learning_rate": 0.00016806026001856656, + "loss": 1.0661, + "step": 5894 + }, + { + "epoch": 1.0495014245014245, + "grad_norm": 0.5350496768951416, + "learning_rate": 0.00016805000406490907, + "loss": 0.6789, + "step": 5895 + }, + { + "epoch": 1.0496794871794872, + "grad_norm": 0.5380373597145081, + "learning_rate": 0.00016803974677795312, + "loss": 0.8889, + "step": 5896 + }, + { + "epoch": 1.0498575498575498, + "grad_norm": 0.6145668029785156, + "learning_rate": 0.0001680294881578997, + "loss": 0.8952, + "step": 5897 + }, + { + "epoch": 1.0500356125356125, + "grad_norm": 0.5666532516479492, + "learning_rate": 0.00016801922820494972, + "loss": 0.9697, + "step": 5898 + }, + { + "epoch": 1.0502136752136753, + "grad_norm": 0.5352747440338135, + "learning_rate": 0.0001680089669193043, + "loss": 0.9619, + "step": 5899 + }, + { + "epoch": 1.0503917378917378, + "grad_norm": 0.5405527949333191, + 
"learning_rate": 0.00016799870430116444, + "loss": 0.8733, + "step": 5900 + }, + { + "epoch": 1.0505698005698005, + "grad_norm": 0.5936748385429382, + "learning_rate": 0.00016798844035073124, + "loss": 0.8746, + "step": 5901 + }, + { + "epoch": 1.0507478632478633, + "grad_norm": 0.539652943611145, + "learning_rate": 0.00016797817506820578, + "loss": 0.8743, + "step": 5902 + }, + { + "epoch": 1.0509259259259258, + "grad_norm": 0.644528865814209, + "learning_rate": 0.00016796790845378915, + "loss": 0.9251, + "step": 5903 + }, + { + "epoch": 1.0511039886039886, + "grad_norm": 0.5429201126098633, + "learning_rate": 0.00016795764050768258, + "loss": 0.747, + "step": 5904 + }, + { + "epoch": 1.0512820512820513, + "grad_norm": 0.6432006359100342, + "learning_rate": 0.00016794737123008725, + "loss": 0.9166, + "step": 5905 + }, + { + "epoch": 1.0514601139601139, + "grad_norm": 0.6084117293357849, + "learning_rate": 0.00016793710062120427, + "loss": 1.0778, + "step": 5906 + }, + { + "epoch": 1.0516381766381766, + "grad_norm": 0.5351580381393433, + "learning_rate": 0.00016792682868123495, + "loss": 0.9124, + "step": 5907 + }, + { + "epoch": 1.0518162393162394, + "grad_norm": 0.7078854441642761, + "learning_rate": 0.00016791655541038053, + "loss": 1.1209, + "step": 5908 + }, + { + "epoch": 1.051994301994302, + "grad_norm": 0.5943832993507385, + "learning_rate": 0.0001679062808088423, + "loss": 0.9077, + "step": 5909 + }, + { + "epoch": 1.0521723646723646, + "grad_norm": 0.5216894745826721, + "learning_rate": 0.00016789600487682156, + "loss": 0.9866, + "step": 5910 + }, + { + "epoch": 1.0523504273504274, + "grad_norm": 0.738451361656189, + "learning_rate": 0.00016788572761451963, + "loss": 1.1611, + "step": 5911 + }, + { + "epoch": 1.05252849002849, + "grad_norm": 0.6411251425743103, + "learning_rate": 0.00016787544902213791, + "loss": 1.1481, + "step": 5912 + }, + { + "epoch": 1.0527065527065527, + "grad_norm": 0.6768319010734558, + "learning_rate": 0.00016786516909987774, + 
"loss": 0.8614, + "step": 5913 + }, + { + "epoch": 1.0528846153846154, + "grad_norm": 0.5838070511817932, + "learning_rate": 0.0001678548878479406, + "loss": 0.9719, + "step": 5914 + }, + { + "epoch": 1.0530626780626782, + "grad_norm": 0.541522741317749, + "learning_rate": 0.00016784460526652784, + "loss": 0.767, + "step": 5915 + }, + { + "epoch": 1.0532407407407407, + "grad_norm": 0.6064762473106384, + "learning_rate": 0.000167834321355841, + "loss": 1.0792, + "step": 5916 + }, + { + "epoch": 1.0534188034188035, + "grad_norm": 0.5515492558479309, + "learning_rate": 0.00016782403611608152, + "loss": 0.7897, + "step": 5917 + }, + { + "epoch": 1.0535968660968662, + "grad_norm": 0.6326262950897217, + "learning_rate": 0.000167813749547451, + "loss": 0.9279, + "step": 5918 + }, + { + "epoch": 1.0537749287749287, + "grad_norm": 0.6262009739875793, + "learning_rate": 0.0001678034616501509, + "loss": 0.9752, + "step": 5919 + }, + { + "epoch": 1.0539529914529915, + "grad_norm": 0.6049023270606995, + "learning_rate": 0.00016779317242438278, + "loss": 0.9167, + "step": 5920 + }, + { + "epoch": 1.0541310541310542, + "grad_norm": 0.6286031007766724, + "learning_rate": 0.0001677828818703483, + "loss": 1.1277, + "step": 5921 + }, + { + "epoch": 1.0543091168091168, + "grad_norm": 0.662086009979248, + "learning_rate": 0.00016777258998824907, + "loss": 1.0824, + "step": 5922 + }, + { + "epoch": 1.0544871794871795, + "grad_norm": 0.5358783006668091, + "learning_rate": 0.00016776229677828672, + "loss": 0.825, + "step": 5923 + }, + { + "epoch": 1.0546652421652423, + "grad_norm": 0.490326464176178, + "learning_rate": 0.00016775200224066294, + "loss": 0.7916, + "step": 5924 + }, + { + "epoch": 1.0548433048433048, + "grad_norm": 0.5940443277359009, + "learning_rate": 0.0001677417063755794, + "loss": 1.0121, + "step": 5925 + }, + { + "epoch": 1.0550213675213675, + "grad_norm": 0.5974507927894592, + "learning_rate": 0.00016773140918323787, + "loss": 0.7629, + "step": 5926 + }, + { + 
"epoch": 1.0551994301994303, + "grad_norm": 0.5747174024581909, + "learning_rate": 0.00016772111066384003, + "loss": 0.9373, + "step": 5927 + }, + { + "epoch": 1.0553774928774928, + "grad_norm": 0.5998024940490723, + "learning_rate": 0.00016771081081758772, + "loss": 0.8543, + "step": 5928 + }, + { + "epoch": 1.0555555555555556, + "grad_norm": 0.5771155953407288, + "learning_rate": 0.00016770050964468275, + "loss": 0.9108, + "step": 5929 + }, + { + "epoch": 1.0557336182336183, + "grad_norm": 0.5695661306381226, + "learning_rate": 0.00016769020714532692, + "loss": 0.8055, + "step": 5930 + }, + { + "epoch": 1.0559116809116809, + "grad_norm": 0.6164212226867676, + "learning_rate": 0.0001676799033197221, + "loss": 1.0917, + "step": 5931 + }, + { + "epoch": 1.0560897435897436, + "grad_norm": 0.6092487573623657, + "learning_rate": 0.00016766959816807018, + "loss": 0.9276, + "step": 5932 + }, + { + "epoch": 1.0562678062678064, + "grad_norm": 0.5595401525497437, + "learning_rate": 0.00016765929169057305, + "loss": 0.9435, + "step": 5933 + }, + { + "epoch": 1.056445868945869, + "grad_norm": 0.5875109434127808, + "learning_rate": 0.00016764898388743263, + "loss": 0.959, + "step": 5934 + }, + { + "epoch": 1.0566239316239316, + "grad_norm": 0.6045668721199036, + "learning_rate": 0.00016763867475885088, + "loss": 0.8636, + "step": 5935 + }, + { + "epoch": 1.0568019943019944, + "grad_norm": 0.6088171005249023, + "learning_rate": 0.00016762836430502987, + "loss": 0.6807, + "step": 5936 + }, + { + "epoch": 1.056980056980057, + "grad_norm": 0.6293274760246277, + "learning_rate": 0.00016761805252617148, + "loss": 1.042, + "step": 5937 + }, + { + "epoch": 1.0571581196581197, + "grad_norm": 0.588472843170166, + "learning_rate": 0.00016760773942247785, + "loss": 0.8896, + "step": 5938 + }, + { + "epoch": 1.0573361823361824, + "grad_norm": 0.4412326216697693, + "learning_rate": 0.000167597424994151, + "loss": 0.6727, + "step": 5939 + }, + { + "epoch": 1.057514245014245, + "grad_norm": 
0.6086825132369995, + "learning_rate": 0.00016758710924139302, + "loss": 0.9908, + "step": 5940 + }, + { + "epoch": 1.0576923076923077, + "grad_norm": 0.6424705386161804, + "learning_rate": 0.00016757679216440608, + "loss": 1.0182, + "step": 5941 + }, + { + "epoch": 1.0578703703703705, + "grad_norm": 0.6610676050186157, + "learning_rate": 0.00016756647376339222, + "loss": 0.9645, + "step": 5942 + }, + { + "epoch": 1.058048433048433, + "grad_norm": 0.598292887210846, + "learning_rate": 0.0001675561540385537, + "loss": 0.9694, + "step": 5943 + }, + { + "epoch": 1.0582264957264957, + "grad_norm": 0.6941167116165161, + "learning_rate": 0.00016754583299009266, + "loss": 1.0786, + "step": 5944 + }, + { + "epoch": 1.0584045584045585, + "grad_norm": 0.6543232798576355, + "learning_rate": 0.00016753551061821133, + "loss": 1.0488, + "step": 5945 + }, + { + "epoch": 1.058582621082621, + "grad_norm": 0.606159508228302, + "learning_rate": 0.000167525186923112, + "loss": 0.9448, + "step": 5946 + }, + { + "epoch": 1.0587606837606838, + "grad_norm": 0.5051791071891785, + "learning_rate": 0.00016751486190499685, + "loss": 0.7485, + "step": 5947 + }, + { + "epoch": 1.0589387464387465, + "grad_norm": 0.6459367275238037, + "learning_rate": 0.00016750453556406826, + "loss": 1.0055, + "step": 5948 + }, + { + "epoch": 1.059116809116809, + "grad_norm": 0.551591157913208, + "learning_rate": 0.00016749420790052852, + "loss": 0.9717, + "step": 5949 + }, + { + "epoch": 1.0592948717948718, + "grad_norm": 0.5899214148521423, + "learning_rate": 0.00016748387891458, + "loss": 0.7774, + "step": 5950 + }, + { + "epoch": 1.0594729344729346, + "grad_norm": 0.582379162311554, + "learning_rate": 0.00016747354860642503, + "loss": 0.953, + "step": 5951 + }, + { + "epoch": 1.059650997150997, + "grad_norm": 0.6035816073417664, + "learning_rate": 0.00016746321697626605, + "loss": 1.1175, + "step": 5952 + }, + { + "epoch": 1.0598290598290598, + "grad_norm": 0.6476401686668396, + "learning_rate": 
0.00016745288402430548, + "loss": 0.9448, + "step": 5953 + }, + { + "epoch": 1.0600071225071226, + "grad_norm": 0.6126405596733093, + "learning_rate": 0.00016744254975074578, + "loss": 0.882, + "step": 5954 + }, + { + "epoch": 1.0601851851851851, + "grad_norm": 0.5333579182624817, + "learning_rate": 0.0001674322141557894, + "loss": 0.9539, + "step": 5955 + }, + { + "epoch": 1.0603632478632479, + "grad_norm": 0.6085022687911987, + "learning_rate": 0.0001674218772396389, + "loss": 1.0028, + "step": 5956 + }, + { + "epoch": 1.0605413105413106, + "grad_norm": 0.5809528827667236, + "learning_rate": 0.0001674115390024967, + "loss": 0.84, + "step": 5957 + }, + { + "epoch": 1.0607193732193732, + "grad_norm": 0.5820229649543762, + "learning_rate": 0.00016740119944456548, + "loss": 0.9563, + "step": 5958 + }, + { + "epoch": 1.060897435897436, + "grad_norm": 0.6349015831947327, + "learning_rate": 0.00016739085856604775, + "loss": 0.9739, + "step": 5959 + }, + { + "epoch": 1.0610754985754987, + "grad_norm": 0.6346020102500916, + "learning_rate": 0.00016738051636714616, + "loss": 0.907, + "step": 5960 + }, + { + "epoch": 1.0612535612535612, + "grad_norm": 0.5850573778152466, + "learning_rate": 0.0001673701728480633, + "loss": 1.0688, + "step": 5961 + }, + { + "epoch": 1.061431623931624, + "grad_norm": 0.6258122324943542, + "learning_rate": 0.00016735982800900184, + "loss": 0.9997, + "step": 5962 + }, + { + "epoch": 1.0616096866096867, + "grad_norm": 0.6744239330291748, + "learning_rate": 0.00016734948185016452, + "loss": 0.9431, + "step": 5963 + }, + { + "epoch": 1.0617877492877492, + "grad_norm": 0.5769457817077637, + "learning_rate": 0.000167339134371754, + "loss": 0.9658, + "step": 5964 + }, + { + "epoch": 1.061965811965812, + "grad_norm": 0.6385112404823303, + "learning_rate": 0.000167328785573973, + "loss": 1.0199, + "step": 5965 + }, + { + "epoch": 1.0621438746438747, + "grad_norm": 0.536522388458252, + "learning_rate": 0.00016731843545702435, + "loss": 0.8496, + "step": 
5966 + }, + { + "epoch": 1.0623219373219372, + "grad_norm": 0.5978497862815857, + "learning_rate": 0.00016730808402111075, + "loss": 0.8536, + "step": 5967 + }, + { + "epoch": 1.0625, + "grad_norm": 0.6091681122779846, + "learning_rate": 0.0001672977312664351, + "loss": 1.0241, + "step": 5968 + }, + { + "epoch": 1.0626780626780628, + "grad_norm": 0.5807273387908936, + "learning_rate": 0.0001672873771932002, + "loss": 1.0522, + "step": 5969 + }, + { + "epoch": 1.0628561253561253, + "grad_norm": 0.6511965990066528, + "learning_rate": 0.0001672770218016089, + "loss": 0.8908, + "step": 5970 + }, + { + "epoch": 1.063034188034188, + "grad_norm": 0.6241721510887146, + "learning_rate": 0.00016726666509186416, + "loss": 0.9854, + "step": 5971 + }, + { + "epoch": 1.0632122507122508, + "grad_norm": 0.6112468242645264, + "learning_rate": 0.0001672563070641688, + "loss": 1.0091, + "step": 5972 + }, + { + "epoch": 1.0633903133903133, + "grad_norm": 0.6135509014129639, + "learning_rate": 0.00016724594771872587, + "loss": 0.8891, + "step": 5973 + }, + { + "epoch": 1.063568376068376, + "grad_norm": 0.608384370803833, + "learning_rate": 0.00016723558705573823, + "loss": 1.017, + "step": 5974 + }, + { + "epoch": 1.0637464387464388, + "grad_norm": 0.6578485369682312, + "learning_rate": 0.00016722522507540895, + "loss": 0.9165, + "step": 5975 + }, + { + "epoch": 1.0639245014245013, + "grad_norm": 0.562588095664978, + "learning_rate": 0.00016721486177794106, + "loss": 0.7989, + "step": 5976 + }, + { + "epoch": 1.064102564102564, + "grad_norm": 0.5541409254074097, + "learning_rate": 0.00016720449716353753, + "loss": 0.8917, + "step": 5977 + }, + { + "epoch": 1.0642806267806268, + "grad_norm": 0.551167905330658, + "learning_rate": 0.0001671941312324015, + "loss": 0.824, + "step": 5978 + }, + { + "epoch": 1.0644586894586894, + "grad_norm": 0.6280582547187805, + "learning_rate": 0.0001671837639847361, + "loss": 0.9708, + "step": 5979 + }, + { + "epoch": 1.0646367521367521, + "grad_norm": 
0.6389226913452148, + "learning_rate": 0.00016717339542074436, + "loss": 1.0081, + "step": 5980 + }, + { + "epoch": 1.0648148148148149, + "grad_norm": 0.6677889823913574, + "learning_rate": 0.0001671630255406295, + "loss": 1.2709, + "step": 5981 + }, + { + "epoch": 1.0649928774928774, + "grad_norm": 0.5748161673545837, + "learning_rate": 0.00016715265434459465, + "loss": 0.9157, + "step": 5982 + }, + { + "epoch": 1.0651709401709402, + "grad_norm": 0.6677651405334473, + "learning_rate": 0.00016714228183284304, + "loss": 1.1097, + "step": 5983 + }, + { + "epoch": 1.065349002849003, + "grad_norm": 0.6253604292869568, + "learning_rate": 0.0001671319080055779, + "loss": 0.9819, + "step": 5984 + }, + { + "epoch": 1.0655270655270654, + "grad_norm": 0.5548844337463379, + "learning_rate": 0.0001671215328630025, + "loss": 0.9324, + "step": 5985 + }, + { + "epoch": 1.0657051282051282, + "grad_norm": 0.622062623500824, + "learning_rate": 0.00016711115640532004, + "loss": 0.8749, + "step": 5986 + }, + { + "epoch": 1.065883190883191, + "grad_norm": 0.6496043801307678, + "learning_rate": 0.00016710077863273394, + "loss": 1.0642, + "step": 5987 + }, + { + "epoch": 1.0660612535612535, + "grad_norm": 0.6140534281730652, + "learning_rate": 0.00016709039954544746, + "loss": 0.8928, + "step": 5988 + }, + { + "epoch": 1.0662393162393162, + "grad_norm": 0.6387218236923218, + "learning_rate": 0.00016708001914366393, + "loss": 0.9525, + "step": 5989 + }, + { + "epoch": 1.066417378917379, + "grad_norm": 0.6119858026504517, + "learning_rate": 0.0001670696374275868, + "loss": 0.8663, + "step": 5990 + }, + { + "epoch": 1.0665954415954415, + "grad_norm": 0.6722040772438049, + "learning_rate": 0.00016705925439741947, + "loss": 1.1173, + "step": 5991 + }, + { + "epoch": 1.0667735042735043, + "grad_norm": 0.8226081132888794, + "learning_rate": 0.00016704887005336534, + "loss": 1.0572, + "step": 5992 + }, + { + "epoch": 1.066951566951567, + "grad_norm": 0.7248596549034119, + "learning_rate": 
0.00016703848439562785, + "loss": 1.0493, + "step": 5993 + }, + { + "epoch": 1.0671296296296295, + "grad_norm": 0.7185787558555603, + "learning_rate": 0.00016702809742441058, + "loss": 1.1366, + "step": 5994 + }, + { + "epoch": 1.0673076923076923, + "grad_norm": 0.6118780970573425, + "learning_rate": 0.00016701770913991694, + "loss": 0.9557, + "step": 5995 + }, + { + "epoch": 1.067485754985755, + "grad_norm": 0.6472596526145935, + "learning_rate": 0.0001670073195423505, + "loss": 0.9977, + "step": 5996 + }, + { + "epoch": 1.0676638176638176, + "grad_norm": 0.7110133767127991, + "learning_rate": 0.00016699692863191484, + "loss": 1.1932, + "step": 5997 + }, + { + "epoch": 1.0678418803418803, + "grad_norm": 0.5827305912971497, + "learning_rate": 0.00016698653640881354, + "loss": 0.7641, + "step": 5998 + }, + { + "epoch": 1.068019943019943, + "grad_norm": 0.527208149433136, + "learning_rate": 0.00016697614287325017, + "loss": 0.7683, + "step": 5999 + }, + { + "epoch": 1.0681980056980056, + "grad_norm": 0.6680626273155212, + "learning_rate": 0.00016696574802542848, + "loss": 1.1748, + "step": 6000 + }, + { + "epoch": 1.0683760683760684, + "grad_norm": 0.5947227478027344, + "learning_rate": 0.00016695535186555204, + "loss": 1.0894, + "step": 6001 + }, + { + "epoch": 1.068554131054131, + "grad_norm": 0.5828250646591187, + "learning_rate": 0.00016694495439382456, + "loss": 0.9895, + "step": 6002 + }, + { + "epoch": 1.0687321937321936, + "grad_norm": 0.5897728204727173, + "learning_rate": 0.00016693455561044978, + "loss": 0.9686, + "step": 6003 + }, + { + "epoch": 1.0689102564102564, + "grad_norm": 0.5441751480102539, + "learning_rate": 0.0001669241555156314, + "loss": 0.8948, + "step": 6004 + }, + { + "epoch": 1.0690883190883191, + "grad_norm": 0.694199800491333, + "learning_rate": 0.00016691375410957324, + "loss": 1.0824, + "step": 6005 + }, + { + "epoch": 1.0692663817663817, + "grad_norm": 0.6077630519866943, + "learning_rate": 0.00016690335139247906, + "loss": 1.0931, + 
"step": 6006 + }, + { + "epoch": 1.0694444444444444, + "grad_norm": 0.6558539867401123, + "learning_rate": 0.0001668929473645527, + "loss": 1.0099, + "step": 6007 + }, + { + "epoch": 1.0696225071225072, + "grad_norm": 0.5722812414169312, + "learning_rate": 0.00016688254202599798, + "loss": 0.7999, + "step": 6008 + }, + { + "epoch": 1.0698005698005697, + "grad_norm": 0.5915400981903076, + "learning_rate": 0.0001668721353770188, + "loss": 0.7866, + "step": 6009 + }, + { + "epoch": 1.0699786324786325, + "grad_norm": 0.5290952324867249, + "learning_rate": 0.00016686172741781901, + "loss": 0.793, + "step": 6010 + }, + { + "epoch": 1.0701566951566952, + "grad_norm": 0.5501774549484253, + "learning_rate": 0.00016685131814860263, + "loss": 0.8775, + "step": 6011 + }, + { + "epoch": 1.0703347578347577, + "grad_norm": 0.6192594766616821, + "learning_rate": 0.00016684090756957347, + "loss": 1.1686, + "step": 6012 + }, + { + "epoch": 1.0705128205128205, + "grad_norm": 0.6640267968177795, + "learning_rate": 0.00016683049568093561, + "loss": 1.1789, + "step": 6013 + }, + { + "epoch": 1.0706908831908832, + "grad_norm": 0.552893877029419, + "learning_rate": 0.00016682008248289303, + "loss": 0.7957, + "step": 6014 + }, + { + "epoch": 1.0708689458689458, + "grad_norm": 0.6406302452087402, + "learning_rate": 0.00016680966797564972, + "loss": 1.1174, + "step": 6015 + }, + { + "epoch": 1.0710470085470085, + "grad_norm": Infinity, + "learning_rate": 0.00016680966797564972, + "loss": 0.9168, + "step": 6016 + }, + { + "epoch": 1.0712250712250713, + "grad_norm": 0.6384762525558472, + "learning_rate": 0.00016679925215940975, + "loss": 0.9831, + "step": 6017 + }, + { + "epoch": 1.071403133903134, + "grad_norm": 0.5906224846839905, + "learning_rate": 0.0001667888350343772, + "loss": 0.9167, + "step": 6018 + }, + { + "epoch": 1.0715811965811965, + "grad_norm": 0.658044695854187, + "learning_rate": 0.00016677841660075617, + "loss": 1.0075, + "step": 6019 + }, + { + "epoch": 1.0717592592592593, 
+ "grad_norm": 0.6313242316246033, + "learning_rate": 0.00016676799685875078, + "loss": 0.8551, + "step": 6020 + }, + { + "epoch": 1.0719373219373218, + "grad_norm": 0.5891841053962708, + "learning_rate": 0.00016675757580856518, + "loss": 0.8475, + "step": 6021 + }, + { + "epoch": 1.0721153846153846, + "grad_norm": 0.581317126750946, + "learning_rate": 0.00016674715345040358, + "loss": 0.9308, + "step": 6022 + }, + { + "epoch": 1.0722934472934473, + "grad_norm": 0.5952537655830383, + "learning_rate": 0.00016673672978447017, + "loss": 0.9104, + "step": 6023 + }, + { + "epoch": 1.07247150997151, + "grad_norm": 0.5934227705001831, + "learning_rate": 0.00016672630481096915, + "loss": 0.9882, + "step": 6024 + }, + { + "epoch": 1.0726495726495726, + "grad_norm": 0.5867539048194885, + "learning_rate": 0.00016671587853010482, + "loss": 1.0186, + "step": 6025 + }, + { + "epoch": 1.0728276353276354, + "grad_norm": 0.6002280116081238, + "learning_rate": 0.00016670545094208143, + "loss": 0.92, + "step": 6026 + }, + { + "epoch": 1.073005698005698, + "grad_norm": 0.6261683702468872, + "learning_rate": 0.0001666950220471033, + "loss": 0.9293, + "step": 6027 + }, + { + "epoch": 1.0731837606837606, + "grad_norm": 0.6128147840499878, + "learning_rate": 0.00016668459184537477, + "loss": 1.0787, + "step": 6028 + }, + { + "epoch": 1.0733618233618234, + "grad_norm": 0.62148118019104, + "learning_rate": 0.00016667416033710016, + "loss": 0.8843, + "step": 6029 + }, + { + "epoch": 1.0735398860398861, + "grad_norm": 0.7166166305541992, + "learning_rate": 0.0001666637275224839, + "loss": 0.8877, + "step": 6030 + }, + { + "epoch": 1.0737179487179487, + "grad_norm": 0.5275574922561646, + "learning_rate": 0.0001666532934017304, + "loss": 0.9604, + "step": 6031 + }, + { + "epoch": 1.0738960113960114, + "grad_norm": 0.8132784962654114, + "learning_rate": 0.00016664285797504406, + "loss": 1.0203, + "step": 6032 + }, + { + "epoch": 1.074074074074074, + "grad_norm": 0.5887695550918579, + 
"learning_rate": 0.00016663242124262935, + "loss": 0.8819, + "step": 6033 + }, + { + "epoch": 1.0742521367521367, + "grad_norm": 0.5552900433540344, + "learning_rate": 0.00016662198320469078, + "loss": 0.7542, + "step": 6034 + }, + { + "epoch": 1.0744301994301995, + "grad_norm": 0.6228970885276794, + "learning_rate": 0.0001666115438614328, + "loss": 1.0362, + "step": 6035 + }, + { + "epoch": 1.0746082621082622, + "grad_norm": 0.7193471789360046, + "learning_rate": 0.00016660110321306003, + "loss": 1.3073, + "step": 6036 + }, + { + "epoch": 1.0747863247863247, + "grad_norm": 0.6167412996292114, + "learning_rate": 0.000166590661259777, + "loss": 0.941, + "step": 6037 + }, + { + "epoch": 1.0749643874643875, + "grad_norm": 0.5716922879219055, + "learning_rate": 0.00016658021800178827, + "loss": 0.83, + "step": 6038 + }, + { + "epoch": 1.0751424501424502, + "grad_norm": 0.6404047012329102, + "learning_rate": 0.00016656977343929848, + "loss": 1.0617, + "step": 6039 + }, + { + "epoch": 1.0753205128205128, + "grad_norm": 0.531395435333252, + "learning_rate": 0.00016655932757251226, + "loss": 0.7785, + "step": 6040 + }, + { + "epoch": 1.0754985754985755, + "grad_norm": 0.6468462347984314, + "learning_rate": 0.0001665488804016343, + "loss": 0.7893, + "step": 6041 + }, + { + "epoch": 1.0756766381766383, + "grad_norm": 0.6539653539657593, + "learning_rate": 0.00016653843192686925, + "loss": 1.1011, + "step": 6042 + }, + { + "epoch": 1.0758547008547008, + "grad_norm": 0.630107045173645, + "learning_rate": 0.0001665279821484219, + "loss": 0.9262, + "step": 6043 + }, + { + "epoch": 1.0760327635327636, + "grad_norm": 0.5875992774963379, + "learning_rate": 0.00016651753106649688, + "loss": 1.0501, + "step": 6044 + }, + { + "epoch": 1.0762108262108263, + "grad_norm": 0.573428750038147, + "learning_rate": 0.00016650707868129904, + "loss": 1.0672, + "step": 6045 + }, + { + "epoch": 1.0763888888888888, + "grad_norm": 0.6215469241142273, + "learning_rate": 0.00016649662499303316, + 
"loss": 0.868, + "step": 6046 + }, + { + "epoch": 1.0765669515669516, + "grad_norm": 0.6666893362998962, + "learning_rate": 0.00016648617000190402, + "loss": 1.0965, + "step": 6047 + }, + { + "epoch": 1.0767450142450143, + "grad_norm": 0.8343498706817627, + "learning_rate": 0.00016647571370811653, + "loss": 1.2302, + "step": 6048 + }, + { + "epoch": 1.0769230769230769, + "grad_norm": 0.591147780418396, + "learning_rate": 0.0001664652561118755, + "loss": 0.9698, + "step": 6049 + }, + { + "epoch": 1.0771011396011396, + "grad_norm": 0.573375940322876, + "learning_rate": 0.00016645479721338584, + "loss": 0.8798, + "step": 6050 + }, + { + "epoch": 1.0772792022792024, + "grad_norm": 0.4956737160682678, + "learning_rate": 0.00016644433701285246, + "loss": 0.6523, + "step": 6051 + }, + { + "epoch": 1.077457264957265, + "grad_norm": 0.6896619200706482, + "learning_rate": 0.00016643387551048034, + "loss": 0.8911, + "step": 6052 + }, + { + "epoch": 1.0776353276353277, + "grad_norm": 0.5820416808128357, + "learning_rate": 0.00016642341270647445, + "loss": 1.1486, + "step": 6053 + }, + { + "epoch": 1.0778133903133904, + "grad_norm": 0.611132025718689, + "learning_rate": 0.00016641294860103976, + "loss": 1.0705, + "step": 6054 + }, + { + "epoch": 1.077991452991453, + "grad_norm": 0.6705698370933533, + "learning_rate": 0.00016640248319438133, + "loss": 0.9826, + "step": 6055 + }, + { + "epoch": 1.0781695156695157, + "grad_norm": 0.5987013578414917, + "learning_rate": 0.00016639201648670416, + "loss": 1.0409, + "step": 6056 + }, + { + "epoch": 1.0783475783475784, + "grad_norm": 0.6707149744033813, + "learning_rate": 0.00016638154847821332, + "loss": 1.1332, + "step": 6057 + }, + { + "epoch": 1.078525641025641, + "grad_norm": 0.6400678157806396, + "learning_rate": 0.00016637107916911393, + "loss": 1.2559, + "step": 6058 + }, + { + "epoch": 1.0787037037037037, + "grad_norm": 0.6370311379432678, + "learning_rate": 0.00016636060855961115, + "loss": 0.9752, + "step": 6059 + }, + { + 
"epoch": 1.0788817663817665, + "grad_norm": 0.6116052269935608, + "learning_rate": 0.00016635013664991012, + "loss": 0.8364, + "step": 6060 + }, + { + "epoch": 1.079059829059829, + "grad_norm": 0.7932127714157104, + "learning_rate": 0.00016633966344021593, + "loss": 0.939, + "step": 6061 + }, + { + "epoch": 1.0792378917378918, + "grad_norm": 0.576249897480011, + "learning_rate": 0.00016632918893073385, + "loss": 0.8911, + "step": 6062 + }, + { + "epoch": 1.0794159544159545, + "grad_norm": 0.5456888675689697, + "learning_rate": 0.00016631871312166915, + "loss": 0.8646, + "step": 6063 + }, + { + "epoch": 1.079594017094017, + "grad_norm": 0.717522919178009, + "learning_rate": 0.000166308236013227, + "loss": 1.0814, + "step": 6064 + }, + { + "epoch": 1.0797720797720798, + "grad_norm": 0.6637256145477295, + "learning_rate": 0.0001662977576056127, + "loss": 1.22, + "step": 6065 + }, + { + "epoch": 1.0799501424501425, + "grad_norm": 0.5846666693687439, + "learning_rate": 0.0001662872778990316, + "loss": 1.1745, + "step": 6066 + }, + { + "epoch": 1.080128205128205, + "grad_norm": 0.6611326336860657, + "learning_rate": 0.00016627679689368895, + "loss": 1.1262, + "step": 6067 + }, + { + "epoch": 1.0803062678062678, + "grad_norm": 0.6022892594337463, + "learning_rate": 0.00016626631458979015, + "loss": 0.9741, + "step": 6068 + }, + { + "epoch": 1.0804843304843306, + "grad_norm": 0.5862685441970825, + "learning_rate": 0.00016625583098754058, + "loss": 0.914, + "step": 6069 + }, + { + "epoch": 1.080662393162393, + "grad_norm": 0.7089241147041321, + "learning_rate": 0.00016624534608714563, + "loss": 1.0614, + "step": 6070 + }, + { + "epoch": 1.0808404558404558, + "grad_norm": 0.5286028981208801, + "learning_rate": 0.00016623485988881076, + "loss": 0.8756, + "step": 6071 + }, + { + "epoch": 1.0810185185185186, + "grad_norm": 0.6437101364135742, + "learning_rate": 0.00016622437239274137, + "loss": 0.7222, + "step": 6072 + }, + { + "epoch": 1.0811965811965811, + "grad_norm": 
0.6197740435600281, + "learning_rate": 0.000166213883599143, + "loss": 0.7876, + "step": 6073 + }, + { + "epoch": 1.0813746438746439, + "grad_norm": 0.5889328122138977, + "learning_rate": 0.0001662033935082211, + "loss": 0.9587, + "step": 6074 + }, + { + "epoch": 1.0815527065527066, + "grad_norm": 0.5353847742080688, + "learning_rate": 0.00016619290212018125, + "loss": 0.8664, + "step": 6075 + }, + { + "epoch": 1.0817307692307692, + "grad_norm": 0.7202061414718628, + "learning_rate": 0.00016618240943522898, + "loss": 1.0429, + "step": 6076 + }, + { + "epoch": 1.081908831908832, + "grad_norm": 0.5831515192985535, + "learning_rate": 0.0001661719154535699, + "loss": 1.0323, + "step": 6077 + }, + { + "epoch": 1.0820868945868947, + "grad_norm": 0.6270500421524048, + "learning_rate": 0.00016616142017540953, + "loss": 0.9272, + "step": 6078 + }, + { + "epoch": 1.0822649572649572, + "grad_norm": 0.6064695119857788, + "learning_rate": 0.00016615092360095364, + "loss": 1.0629, + "step": 6079 + }, + { + "epoch": 1.08244301994302, + "grad_norm": 0.5578122138977051, + "learning_rate": 0.00016614042573040777, + "loss": 0.8601, + "step": 6080 + }, + { + "epoch": 1.0826210826210827, + "grad_norm": 0.5920688509941101, + "learning_rate": 0.0001661299265639777, + "loss": 1.0082, + "step": 6081 + }, + { + "epoch": 1.0827991452991452, + "grad_norm": 0.6191682815551758, + "learning_rate": 0.0001661194261018691, + "loss": 0.9645, + "step": 6082 + }, + { + "epoch": 1.082977207977208, + "grad_norm": 0.6403279304504395, + "learning_rate": 0.00016610892434428765, + "loss": 0.9263, + "step": 6083 + }, + { + "epoch": 1.0831552706552707, + "grad_norm": 0.579502284526825, + "learning_rate": 0.00016609842129143915, + "loss": 0.8997, + "step": 6084 + }, + { + "epoch": 1.0833333333333333, + "grad_norm": 0.5831437706947327, + "learning_rate": 0.00016608791694352944, + "loss": 1.0703, + "step": 6085 + }, + { + "epoch": 1.083511396011396, + "grad_norm": 0.6188452243804932, + "learning_rate": 
0.00016607741130076424, + "loss": 0.8856, + "step": 6086 + }, + { + "epoch": 1.0836894586894588, + "grad_norm": 0.7413692474365234, + "learning_rate": 0.00016606690436334946, + "loss": 1.1995, + "step": 6087 + }, + { + "epoch": 1.0838675213675213, + "grad_norm": 0.5552099347114563, + "learning_rate": 0.00016605639613149093, + "loss": 0.8514, + "step": 6088 + }, + { + "epoch": 1.084045584045584, + "grad_norm": 0.5906503200531006, + "learning_rate": 0.00016604588660539452, + "loss": 0.9431, + "step": 6089 + }, + { + "epoch": 1.0842236467236468, + "grad_norm": 0.5326111316680908, + "learning_rate": 0.0001660353757852662, + "loss": 0.8306, + "step": 6090 + }, + { + "epoch": 1.0844017094017093, + "grad_norm": 0.7273091673851013, + "learning_rate": 0.0001660248636713118, + "loss": 1.1109, + "step": 6091 + }, + { + "epoch": 1.084579772079772, + "grad_norm": 0.66513592004776, + "learning_rate": 0.00016601435026373737, + "loss": 1.0621, + "step": 6092 + }, + { + "epoch": 1.0847578347578348, + "grad_norm": 0.6470831632614136, + "learning_rate": 0.00016600383556274892, + "loss": 1.1075, + "step": 6093 + }, + { + "epoch": 1.0849358974358974, + "grad_norm": 0.6308658719062805, + "learning_rate": 0.0001659933195685524, + "loss": 0.9832, + "step": 6094 + }, + { + "epoch": 1.08511396011396, + "grad_norm": 0.6569336652755737, + "learning_rate": 0.00016598280228135388, + "loss": 0.9754, + "step": 6095 + }, + { + "epoch": 1.0852920227920229, + "grad_norm": 0.5672318339347839, + "learning_rate": 0.0001659722837013594, + "loss": 0.9075, + "step": 6096 + }, + { + "epoch": 1.0854700854700854, + "grad_norm": 0.6397247314453125, + "learning_rate": 0.00016596176382877506, + "loss": 1.0358, + "step": 6097 + }, + { + "epoch": 1.0856481481481481, + "grad_norm": 0.6046154499053955, + "learning_rate": 0.000165951242663807, + "loss": 0.9036, + "step": 6098 + }, + { + "epoch": 1.085826210826211, + "grad_norm": 0.7190790176391602, + "learning_rate": 0.00016594072020666134, + "loss": 1.05, + "step": 
6099 + }, + { + "epoch": 1.0860042735042734, + "grad_norm": 0.636986255645752, + "learning_rate": 0.00016593019645754425, + "loss": 1.0648, + "step": 6100 + }, + { + "epoch": 1.0861823361823362, + "grad_norm": 0.7239426374435425, + "learning_rate": 0.00016591967141666193, + "loss": 1.3332, + "step": 6101 + }, + { + "epoch": 1.086360398860399, + "grad_norm": 0.5623281002044678, + "learning_rate": 0.00016590914508422054, + "loss": 0.997, + "step": 6102 + }, + { + "epoch": 1.0865384615384615, + "grad_norm": 0.5559574365615845, + "learning_rate": 0.00016589861746042642, + "loss": 0.9309, + "step": 6103 + }, + { + "epoch": 1.0867165242165242, + "grad_norm": 0.6056998372077942, + "learning_rate": 0.00016588808854548574, + "loss": 1.05, + "step": 6104 + }, + { + "epoch": 1.086894586894587, + "grad_norm": 0.6419603228569031, + "learning_rate": 0.00016587755833960487, + "loss": 0.8933, + "step": 6105 + }, + { + "epoch": 1.0870726495726495, + "grad_norm": 0.5236496329307556, + "learning_rate": 0.00016586702684299006, + "loss": 1.0061, + "step": 6106 + }, + { + "epoch": 1.0872507122507122, + "grad_norm": 0.5764613747596741, + "learning_rate": 0.0001658564940558477, + "loss": 1.0218, + "step": 6107 + }, + { + "epoch": 1.087428774928775, + "grad_norm": 0.6049391627311707, + "learning_rate": 0.00016584595997838416, + "loss": 0.8157, + "step": 6108 + }, + { + "epoch": 1.0876068376068375, + "grad_norm": 0.585422933101654, + "learning_rate": 0.0001658354246108058, + "loss": 1.2761, + "step": 6109 + }, + { + "epoch": 1.0877849002849003, + "grad_norm": 0.6420125365257263, + "learning_rate": 0.00016582488795331907, + "loss": 1.1978, + "step": 6110 + }, + { + "epoch": 1.087962962962963, + "grad_norm": 0.646091878414154, + "learning_rate": 0.00016581435000613038, + "loss": 0.8946, + "step": 6111 + }, + { + "epoch": 1.0881410256410255, + "grad_norm": 0.6563934087753296, + "learning_rate": 0.00016580381076944625, + "loss": 1.0625, + "step": 6112 + }, + { + "epoch": 1.0883190883190883, + 
"grad_norm": 0.6796613931655884, + "learning_rate": 0.0001657932702434731, + "loss": 0.9401, + "step": 6113 + }, + { + "epoch": 1.088497150997151, + "grad_norm": 0.6248648762702942, + "learning_rate": 0.00016578272842841753, + "loss": 0.8558, + "step": 6114 + }, + { + "epoch": 1.0886752136752136, + "grad_norm": 0.5136269330978394, + "learning_rate": 0.00016577218532448605, + "loss": 0.6424, + "step": 6115 + }, + { + "epoch": 1.0888532763532763, + "grad_norm": 0.5581641793251038, + "learning_rate": 0.00016576164093188523, + "loss": 0.7923, + "step": 6116 + }, + { + "epoch": 1.089031339031339, + "grad_norm": 0.630352258682251, + "learning_rate": 0.0001657510952508216, + "loss": 0.9115, + "step": 6117 + }, + { + "epoch": 1.0892094017094016, + "grad_norm": 0.6167593002319336, + "learning_rate": 0.0001657405482815019, + "loss": 1.1112, + "step": 6118 + }, + { + "epoch": 1.0893874643874644, + "grad_norm": 0.5908578634262085, + "learning_rate": 0.00016573000002413271, + "loss": 1.0359, + "step": 6119 + }, + { + "epoch": 1.0895655270655271, + "grad_norm": 0.6326140761375427, + "learning_rate": 0.00016571945047892073, + "loss": 1.0459, + "step": 6120 + }, + { + "epoch": 1.0897435897435896, + "grad_norm": 0.7273572683334351, + "learning_rate": 0.00016570889964607262, + "loss": 1.0901, + "step": 6121 + }, + { + "epoch": 1.0899216524216524, + "grad_norm": 0.6168062090873718, + "learning_rate": 0.00016569834752579513, + "loss": 0.8739, + "step": 6122 + }, + { + "epoch": 1.0900997150997151, + "grad_norm": 0.5620378255844116, + "learning_rate": 0.00016568779411829497, + "loss": 0.9614, + "step": 6123 + }, + { + "epoch": 1.0902777777777777, + "grad_norm": 0.6319156885147095, + "learning_rate": 0.00016567723942377899, + "loss": 1.1031, + "step": 6124 + }, + { + "epoch": 1.0904558404558404, + "grad_norm": 0.6590072512626648, + "learning_rate": 0.00016566668344245388, + "loss": 1.0086, + "step": 6125 + }, + { + "epoch": 1.0906339031339032, + "grad_norm": 0.5823387503623962, + 
"learning_rate": 0.00016565612617452656, + "loss": 0.8886, + "step": 6126 + }, + { + "epoch": 1.0908119658119657, + "grad_norm": 0.5795989632606506, + "learning_rate": 0.00016564556762020381, + "loss": 0.7683, + "step": 6127 + }, + { + "epoch": 1.0909900284900285, + "grad_norm": 0.5940101742744446, + "learning_rate": 0.00016563500777969255, + "loss": 0.8873, + "step": 6128 + }, + { + "epoch": 1.0911680911680912, + "grad_norm": 0.5708247423171997, + "learning_rate": 0.00016562444665319963, + "loss": 0.7382, + "step": 6129 + }, + { + "epoch": 1.0913461538461537, + "grad_norm": 0.6339239478111267, + "learning_rate": 0.00016561388424093202, + "loss": 0.9323, + "step": 6130 + }, + { + "epoch": 1.0915242165242165, + "grad_norm": 0.720000147819519, + "learning_rate": 0.00016560332054309663, + "loss": 1.0437, + "step": 6131 + }, + { + "epoch": 1.0917022792022792, + "grad_norm": 0.686580240726471, + "learning_rate": 0.00016559275555990048, + "loss": 0.9841, + "step": 6132 + }, + { + "epoch": 1.091880341880342, + "grad_norm": 0.6067900061607361, + "learning_rate": 0.00016558218929155053, + "loss": 1.0862, + "step": 6133 + }, + { + "epoch": 1.0920584045584045, + "grad_norm": 0.6678896546363831, + "learning_rate": 0.00016557162173825384, + "loss": 0.8509, + "step": 6134 + }, + { + "epoch": 1.0922364672364673, + "grad_norm": 0.53044193983078, + "learning_rate": 0.0001655610529002174, + "loss": 0.9227, + "step": 6135 + }, + { + "epoch": 1.0924145299145298, + "grad_norm": 0.6499412655830383, + "learning_rate": 0.00016555048277764836, + "loss": 1.0867, + "step": 6136 + }, + { + "epoch": 1.0925925925925926, + "grad_norm": 0.6543099284172058, + "learning_rate": 0.00016553991137075374, + "loss": 0.849, + "step": 6137 + }, + { + "epoch": 1.0927706552706553, + "grad_norm": 0.5772737860679626, + "learning_rate": 0.0001655293386797407, + "loss": 0.8475, + "step": 6138 + }, + { + "epoch": 1.092948717948718, + "grad_norm": 0.616348385810852, + "learning_rate": 0.00016551876470481642, + 
"loss": 0.9205, + "step": 6139 + }, + { + "epoch": 1.0931267806267806, + "grad_norm": 0.7151142954826355, + "learning_rate": 0.00016550818944618801, + "loss": 1.1389, + "step": 6140 + }, + { + "epoch": 1.0933048433048433, + "grad_norm": 0.6566469669342041, + "learning_rate": 0.00016549761290406275, + "loss": 0.8216, + "step": 6141 + }, + { + "epoch": 1.0934829059829059, + "grad_norm": 0.7075428366661072, + "learning_rate": 0.00016548703507864783, + "loss": 1.065, + "step": 6142 + }, + { + "epoch": 1.0936609686609686, + "grad_norm": 0.6589360237121582, + "learning_rate": 0.00016547645597015046, + "loss": 0.9899, + "step": 6143 + }, + { + "epoch": 1.0938390313390314, + "grad_norm": 0.6445585489273071, + "learning_rate": 0.00016546587557877797, + "loss": 1.1629, + "step": 6144 + }, + { + "epoch": 1.0940170940170941, + "grad_norm": 0.6216462850570679, + "learning_rate": 0.00016545529390473763, + "loss": 0.9685, + "step": 6145 + }, + { + "epoch": 1.0941951566951567, + "grad_norm": 0.6195303797721863, + "learning_rate": 0.0001654447109482368, + "loss": 1.144, + "step": 6146 + }, + { + "epoch": 1.0943732193732194, + "grad_norm": 0.6625444293022156, + "learning_rate": 0.0001654341267094828, + "loss": 0.9886, + "step": 6147 + }, + { + "epoch": 1.094551282051282, + "grad_norm": 0.6449851393699646, + "learning_rate": 0.000165423541188683, + "loss": 0.9568, + "step": 6148 + }, + { + "epoch": 1.0947293447293447, + "grad_norm": 0.6490375995635986, + "learning_rate": 0.00016541295438604484, + "loss": 1.1304, + "step": 6149 + }, + { + "epoch": 1.0949074074074074, + "grad_norm": 0.6771987676620483, + "learning_rate": 0.00016540236630177574, + "loss": 1.0426, + "step": 6150 + }, + { + "epoch": 1.0950854700854702, + "grad_norm": 0.5214568376541138, + "learning_rate": 0.00016539177693608307, + "loss": 0.6742, + "step": 6151 + }, + { + "epoch": 1.0952635327635327, + "grad_norm": 0.6005097031593323, + "learning_rate": 0.00016538118628917442, + "loss": 0.9901, + "step": 6152 + }, + { + 
"epoch": 1.0954415954415955, + "grad_norm": 0.6449539065361023, + "learning_rate": 0.0001653705943612572, + "loss": 0.9654, + "step": 6153 + }, + { + "epoch": 1.095619658119658, + "grad_norm": 0.6443646550178528, + "learning_rate": 0.00016536000115253903, + "loss": 0.9084, + "step": 6154 + }, + { + "epoch": 1.0957977207977208, + "grad_norm": 0.6072495579719543, + "learning_rate": 0.0001653494066632274, + "loss": 0.6308, + "step": 6155 + }, + { + "epoch": 1.0959757834757835, + "grad_norm": 0.5751157999038696, + "learning_rate": 0.00016533881089352988, + "loss": 0.96, + "step": 6156 + }, + { + "epoch": 1.0961538461538463, + "grad_norm": 0.6310713291168213, + "learning_rate": 0.0001653282138436541, + "loss": 1.0997, + "step": 6157 + }, + { + "epoch": 1.0963319088319088, + "grad_norm": 0.5573651790618896, + "learning_rate": 0.00016531761551380765, + "loss": 0.9738, + "step": 6158 + }, + { + "epoch": 1.0965099715099715, + "grad_norm": 0.5615308880805969, + "learning_rate": 0.00016530701590419824, + "loss": 0.9658, + "step": 6159 + }, + { + "epoch": 1.0966880341880343, + "grad_norm": 0.6471942663192749, + "learning_rate": 0.0001652964150150335, + "loss": 1.0763, + "step": 6160 + }, + { + "epoch": 1.0968660968660968, + "grad_norm": 0.6305427551269531, + "learning_rate": 0.00016528581284652117, + "loss": 1.112, + "step": 6161 + }, + { + "epoch": 1.0970441595441596, + "grad_norm": 0.6881145238876343, + "learning_rate": 0.00016527520939886892, + "loss": 0.8476, + "step": 6162 + }, + { + "epoch": 1.0972222222222223, + "grad_norm": 0.6507891416549683, + "learning_rate": 0.00016526460467228458, + "loss": 1.1097, + "step": 6163 + }, + { + "epoch": 1.0974002849002849, + "grad_norm": 0.5960137844085693, + "learning_rate": 0.00016525399866697586, + "loss": 0.9934, + "step": 6164 + }, + { + "epoch": 1.0975783475783476, + "grad_norm": 0.6001808643341064, + "learning_rate": 0.0001652433913831506, + "loss": 1.0782, + "step": 6165 + }, + { + "epoch": 1.0977564102564104, + "grad_norm": 
0.5639005303382874, + "learning_rate": 0.00016523278282101663, + "loss": 1.0929, + "step": 6166 + }, + { + "epoch": 1.0979344729344729, + "grad_norm": 0.5962058305740356, + "learning_rate": 0.00016522217298078177, + "loss": 1.0315, + "step": 6167 + }, + { + "epoch": 1.0981125356125356, + "grad_norm": 0.6920329928398132, + "learning_rate": 0.0001652115618626539, + "loss": 0.9176, + "step": 6168 + }, + { + "epoch": 1.0982905982905984, + "grad_norm": 0.6963527202606201, + "learning_rate": 0.00016520094946684098, + "loss": 1.2136, + "step": 6169 + }, + { + "epoch": 1.098468660968661, + "grad_norm": 0.5855711102485657, + "learning_rate": 0.00016519033579355093, + "loss": 0.8453, + "step": 6170 + }, + { + "epoch": 1.0986467236467237, + "grad_norm": 0.6454927325248718, + "learning_rate": 0.0001651797208429916, + "loss": 1.0747, + "step": 6171 + }, + { + "epoch": 1.0988247863247864, + "grad_norm": 0.644585907459259, + "learning_rate": 0.00016516910461537108, + "loss": 0.8165, + "step": 6172 + }, + { + "epoch": 1.099002849002849, + "grad_norm": 0.6488069891929626, + "learning_rate": 0.00016515848711089732, + "loss": 1.1048, + "step": 6173 + }, + { + "epoch": 1.0991809116809117, + "grad_norm": 0.5867953896522522, + "learning_rate": 0.00016514786832977834, + "loss": 0.63, + "step": 6174 + }, + { + "epoch": 1.0993589743589745, + "grad_norm": 0.560591459274292, + "learning_rate": 0.00016513724827222227, + "loss": 0.9255, + "step": 6175 + }, + { + "epoch": 1.099537037037037, + "grad_norm": 0.675262451171875, + "learning_rate": 0.00016512662693843707, + "loss": 0.7637, + "step": 6176 + }, + { + "epoch": 1.0997150997150997, + "grad_norm": 0.6515669822692871, + "learning_rate": 0.00016511600432863091, + "loss": 0.7579, + "step": 6177 + }, + { + "epoch": 1.0998931623931625, + "grad_norm": 0.683409571647644, + "learning_rate": 0.00016510538044301192, + "loss": 0.9183, + "step": 6178 + }, + { + "epoch": 1.100071225071225, + "grad_norm": 0.6194507479667664, + "learning_rate": 
0.00016509475528178827, + "loss": 1.16, + "step": 6179 + }, + { + "epoch": 1.1002492877492878, + "grad_norm": 0.6192209720611572, + "learning_rate": 0.0001650841288451681, + "loss": 1.1392, + "step": 6180 + }, + { + "epoch": 1.1004273504273505, + "grad_norm": 0.6029189825057983, + "learning_rate": 0.0001650735011333596, + "loss": 1.1453, + "step": 6181 + }, + { + "epoch": 1.100605413105413, + "grad_norm": 0.7040731310844421, + "learning_rate": 0.00016506287214657105, + "loss": 0.9367, + "step": 6182 + }, + { + "epoch": 1.1007834757834758, + "grad_norm": 0.5909842252731323, + "learning_rate": 0.00016505224188501067, + "loss": 0.6463, + "step": 6183 + }, + { + "epoch": 1.1009615384615385, + "grad_norm": 0.6129698157310486, + "learning_rate": 0.00016504161034888674, + "loss": 0.9432, + "step": 6184 + }, + { + "epoch": 1.101139601139601, + "grad_norm": 0.6181607842445374, + "learning_rate": 0.00016503097753840757, + "loss": 0.9934, + "step": 6185 + }, + { + "epoch": 1.1013176638176638, + "grad_norm": 0.6463226675987244, + "learning_rate": 0.0001650203434537815, + "loss": 0.8471, + "step": 6186 + }, + { + "epoch": 1.1014957264957266, + "grad_norm": 0.5999348163604736, + "learning_rate": 0.00016500970809521688, + "loss": 0.9418, + "step": 6187 + }, + { + "epoch": 1.101673789173789, + "grad_norm": 0.629504919052124, + "learning_rate": 0.00016499907146292204, + "loss": 0.9699, + "step": 6188 + }, + { + "epoch": 1.1018518518518519, + "grad_norm": 0.694767951965332, + "learning_rate": 0.00016498843355710542, + "loss": 0.8793, + "step": 6189 + }, + { + "epoch": 1.1020299145299146, + "grad_norm": 0.6205509901046753, + "learning_rate": 0.00016497779437797547, + "loss": 0.8384, + "step": 6190 + }, + { + "epoch": 1.1022079772079771, + "grad_norm": 0.6256579756736755, + "learning_rate": 0.0001649671539257406, + "loss": 0.9275, + "step": 6191 + }, + { + "epoch": 1.10238603988604, + "grad_norm": 0.6593793034553528, + "learning_rate": 0.00016495651220060933, + "loss": 1.0495, + 
"step": 6192 + }, + { + "epoch": 1.1025641025641026, + "grad_norm": 0.7809221148490906, + "learning_rate": 0.00016494586920279012, + "loss": 1.0485, + "step": 6193 + }, + { + "epoch": 1.1027421652421652, + "grad_norm": 0.6147717833518982, + "learning_rate": 0.0001649352249324915, + "loss": 0.8739, + "step": 6194 + }, + { + "epoch": 1.102920227920228, + "grad_norm": 0.565411388874054, + "learning_rate": 0.00016492457938992208, + "loss": 0.9759, + "step": 6195 + }, + { + "epoch": 1.1030982905982907, + "grad_norm": 0.596370279788971, + "learning_rate": 0.00016491393257529036, + "loss": 0.9658, + "step": 6196 + }, + { + "epoch": 1.1032763532763532, + "grad_norm": 0.6334326863288879, + "learning_rate": 0.00016490328448880498, + "loss": 0.8785, + "step": 6197 + }, + { + "epoch": 1.103454415954416, + "grad_norm": 0.5538334846496582, + "learning_rate": 0.0001648926351306746, + "loss": 0.7174, + "step": 6198 + }, + { + "epoch": 1.1036324786324787, + "grad_norm": 0.6249658465385437, + "learning_rate": 0.00016488198450110778, + "loss": 0.8579, + "step": 6199 + }, + { + "epoch": 1.1038105413105412, + "grad_norm": 0.6128895878791809, + "learning_rate": 0.00016487133260031329, + "loss": 0.8538, + "step": 6200 + }, + { + "epoch": 1.103988603988604, + "grad_norm": 0.5808702707290649, + "learning_rate": 0.0001648606794284998, + "loss": 0.8143, + "step": 6201 + }, + { + "epoch": 1.1041666666666667, + "grad_norm": 0.671419084072113, + "learning_rate": 0.00016485002498587602, + "loss": 1.1268, + "step": 6202 + }, + { + "epoch": 1.1043447293447293, + "grad_norm": 0.5706788897514343, + "learning_rate": 0.00016483936927265075, + "loss": 0.9558, + "step": 6203 + }, + { + "epoch": 1.104522792022792, + "grad_norm": 0.5700307488441467, + "learning_rate": 0.00016482871228903266, + "loss": 0.9616, + "step": 6204 + }, + { + "epoch": 1.1047008547008548, + "grad_norm": 0.5764816403388977, + "learning_rate": 0.0001648180540352307, + "loss": 0.8692, + "step": 6205 + }, + { + "epoch": 
1.1048789173789173, + "grad_norm": 0.5786563754081726, + "learning_rate": 0.00016480739451145358, + "loss": 0.9406, + "step": 6206 + }, + { + "epoch": 1.10505698005698, + "grad_norm": 0.6112591624259949, + "learning_rate": 0.0001647967337179102, + "loss": 0.8999, + "step": 6207 + }, + { + "epoch": 1.1052350427350428, + "grad_norm": 0.5708907246589661, + "learning_rate": 0.00016478607165480944, + "loss": 0.9236, + "step": 6208 + }, + { + "epoch": 1.1054131054131053, + "grad_norm": 0.6742013692855835, + "learning_rate": 0.00016477540832236014, + "loss": 1.0911, + "step": 6209 + }, + { + "epoch": 1.105591168091168, + "grad_norm": 0.6382617354393005, + "learning_rate": 0.0001647647437207713, + "loss": 0.7901, + "step": 6210 + }, + { + "epoch": 1.1057692307692308, + "grad_norm": 0.6241547465324402, + "learning_rate": 0.00016475407785025188, + "loss": 1.0048, + "step": 6211 + }, + { + "epoch": 1.1059472934472934, + "grad_norm": 0.6452877521514893, + "learning_rate": 0.00016474341071101077, + "loss": 0.8902, + "step": 6212 + }, + { + "epoch": 1.1061253561253561, + "grad_norm": 0.6212326288223267, + "learning_rate": 0.00016473274230325704, + "loss": 1.078, + "step": 6213 + }, + { + "epoch": 1.1063034188034189, + "grad_norm": 0.6870912909507751, + "learning_rate": 0.00016472207262719968, + "loss": 0.9127, + "step": 6214 + }, + { + "epoch": 1.1064814814814814, + "grad_norm": 0.6286750435829163, + "learning_rate": 0.00016471140168304777, + "loss": 1.0271, + "step": 6215 + }, + { + "epoch": 1.1066595441595442, + "grad_norm": 0.645806074142456, + "learning_rate": 0.00016470072947101036, + "loss": 1.1514, + "step": 6216 + }, + { + "epoch": 1.106837606837607, + "grad_norm": 0.6800320148468018, + "learning_rate": 0.00016469005599129653, + "loss": 0.9322, + "step": 6217 + }, + { + "epoch": 1.1070156695156694, + "grad_norm": 0.5898309946060181, + "learning_rate": 0.0001646793812441155, + "loss": 1.065, + "step": 6218 + }, + { + "epoch": 1.1071937321937322, + "grad_norm": 
0.6000019907951355, + "learning_rate": 0.00016466870522967634, + "loss": 0.911, + "step": 6219 + }, + { + "epoch": 1.107371794871795, + "grad_norm": 0.6164331436157227, + "learning_rate": 0.0001646580279481882, + "loss": 0.8421, + "step": 6220 + }, + { + "epoch": 1.1075498575498575, + "grad_norm": 0.6410242319107056, + "learning_rate": 0.00016464734939986036, + "loss": 0.9688, + "step": 6221 + }, + { + "epoch": 1.1077279202279202, + "grad_norm": 0.7153300046920776, + "learning_rate": 0.00016463666958490197, + "loss": 1.0722, + "step": 6222 + }, + { + "epoch": 1.107905982905983, + "grad_norm": 0.6977026462554932, + "learning_rate": 0.00016462598850352234, + "loss": 1.0192, + "step": 6223 + }, + { + "epoch": 1.1080840455840455, + "grad_norm": 0.6379461884498596, + "learning_rate": 0.0001646153061559307, + "loss": 1.0474, + "step": 6224 + }, + { + "epoch": 1.1082621082621082, + "grad_norm": 0.6135090589523315, + "learning_rate": 0.00016460462254233634, + "loss": 1.0082, + "step": 6225 + }, + { + "epoch": 1.108440170940171, + "grad_norm": 0.6326230764389038, + "learning_rate": 0.00016459393766294866, + "loss": 1.1097, + "step": 6226 + }, + { + "epoch": 1.1086182336182335, + "grad_norm": 0.6636839509010315, + "learning_rate": 0.0001645832515179769, + "loss": 0.9689, + "step": 6227 + }, + { + "epoch": 1.1087962962962963, + "grad_norm": 0.5713129043579102, + "learning_rate": 0.00016457256410763052, + "loss": 0.8642, + "step": 6228 + }, + { + "epoch": 1.108974358974359, + "grad_norm": 0.584204912185669, + "learning_rate": 0.00016456187543211888, + "loss": 0.9957, + "step": 6229 + }, + { + "epoch": 1.1091524216524216, + "grad_norm": 0.5920230746269226, + "learning_rate": 0.0001645511854916514, + "loss": 0.7297, + "step": 6230 + }, + { + "epoch": 1.1093304843304843, + "grad_norm": 0.6207385063171387, + "learning_rate": 0.0001645404942864375, + "loss": 0.868, + "step": 6231 + }, + { + "epoch": 1.109508547008547, + "grad_norm": 0.7267234921455383, + "learning_rate": 
0.00016452980181668673, + "loss": 1.0248, + "step": 6232 + }, + { + "epoch": 1.1096866096866096, + "grad_norm": 0.5925650596618652, + "learning_rate": 0.00016451910808260852, + "loss": 1.1075, + "step": 6233 + }, + { + "epoch": 1.1098646723646723, + "grad_norm": 0.5632196664810181, + "learning_rate": 0.00016450841308441244, + "loss": 0.9865, + "step": 6234 + }, + { + "epoch": 1.110042735042735, + "grad_norm": 0.6115161180496216, + "learning_rate": 0.000164497716822308, + "loss": 1.1343, + "step": 6235 + }, + { + "epoch": 1.1102207977207976, + "grad_norm": 0.634398341178894, + "learning_rate": 0.00016448701929650477, + "loss": 1.1039, + "step": 6236 + }, + { + "epoch": 1.1103988603988604, + "grad_norm": 0.5843468308448792, + "learning_rate": 0.00016447632050721237, + "loss": 0.8462, + "step": 6237 + }, + { + "epoch": 1.1105769230769231, + "grad_norm": 0.799375593662262, + "learning_rate": 0.0001644656204546404, + "loss": 0.9861, + "step": 6238 + }, + { + "epoch": 1.1107549857549857, + "grad_norm": 0.600289523601532, + "learning_rate": 0.0001644549191389985, + "loss": 1.0323, + "step": 6239 + }, + { + "epoch": 1.1109330484330484, + "grad_norm": 0.6154919266700745, + "learning_rate": 0.00016444421656049637, + "loss": 0.9158, + "step": 6240 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 0.6685689687728882, + "learning_rate": 0.00016443351271934367, + "loss": 1.0429, + "step": 6241 + }, + { + "epoch": 1.1112891737891737, + "grad_norm": 0.699978232383728, + "learning_rate": 0.00016442280761575016, + "loss": 1.072, + "step": 6242 + }, + { + "epoch": 1.1114672364672364, + "grad_norm": 0.6461396217346191, + "learning_rate": 0.00016441210124992556, + "loss": 0.9758, + "step": 6243 + }, + { + "epoch": 1.1116452991452992, + "grad_norm": 0.6463284492492676, + "learning_rate": 0.00016440139362207962, + "loss": 0.9205, + "step": 6244 + }, + { + "epoch": 1.1118233618233617, + "grad_norm": 0.6587556004524231, + "learning_rate": 0.00016439068473242217, + "loss": 1.0027, + 
"step": 6245 + }, + { + "epoch": 1.1120014245014245, + "grad_norm": 0.6896520256996155, + "learning_rate": 0.000164379974581163, + "loss": 0.9788, + "step": 6246 + }, + { + "epoch": 1.1121794871794872, + "grad_norm": 0.6766142845153809, + "learning_rate": 0.000164369263168512, + "loss": 0.9647, + "step": 6247 + }, + { + "epoch": 1.1123575498575498, + "grad_norm": 0.7024297118186951, + "learning_rate": 0.00016435855049467898, + "loss": 1.1163, + "step": 6248 + }, + { + "epoch": 1.1125356125356125, + "grad_norm": 0.6654963493347168, + "learning_rate": 0.00016434783655987385, + "loss": 0.9302, + "step": 6249 + }, + { + "epoch": 1.1127136752136753, + "grad_norm": 0.6973692774772644, + "learning_rate": 0.0001643371213643065, + "loss": 0.9585, + "step": 6250 + }, + { + "epoch": 1.1128917378917378, + "grad_norm": 0.7153545022010803, + "learning_rate": 0.000164326404908187, + "loss": 1.0485, + "step": 6251 + }, + { + "epoch": 1.1130698005698005, + "grad_norm": 0.6114685535430908, + "learning_rate": 0.00016431568719172516, + "loss": 0.8881, + "step": 6252 + }, + { + "epoch": 1.1132478632478633, + "grad_norm": 0.6500731706619263, + "learning_rate": 0.00016430496821513103, + "loss": 1.0658, + "step": 6253 + }, + { + "epoch": 1.113425925925926, + "grad_norm": 0.5800092220306396, + "learning_rate": 0.00016429424797861466, + "loss": 0.9158, + "step": 6254 + }, + { + "epoch": 1.1136039886039886, + "grad_norm": 0.6653759479522705, + "learning_rate": 0.00016428352648238602, + "loss": 0.9762, + "step": 6255 + }, + { + "epoch": 1.1137820512820513, + "grad_norm": 0.649208128452301, + "learning_rate": 0.00016427280372665525, + "loss": 1.1184, + "step": 6256 + }, + { + "epoch": 1.1139601139601139, + "grad_norm": 0.6665199398994446, + "learning_rate": 0.00016426207971163238, + "loss": 0.9417, + "step": 6257 + }, + { + "epoch": 1.1141381766381766, + "grad_norm": 0.6110978126525879, + "learning_rate": 0.00016425135443752758, + "loss": 1.1531, + "step": 6258 + }, + { + "epoch": 
1.1143162393162394, + "grad_norm": 0.6517077088356018, + "learning_rate": 0.00016424062790455093, + "loss": 0.9055, + "step": 6259 + }, + { + "epoch": 1.114494301994302, + "grad_norm": 0.6278966665267944, + "learning_rate": 0.00016422990011291265, + "loss": 1.0087, + "step": 6260 + }, + { + "epoch": 1.1146723646723646, + "grad_norm": 0.5818809270858765, + "learning_rate": 0.00016421917106282288, + "loss": 1.0202, + "step": 6261 + }, + { + "epoch": 1.1148504273504274, + "grad_norm": 0.5670005679130554, + "learning_rate": 0.00016420844075449187, + "loss": 0.841, + "step": 6262 + }, + { + "epoch": 1.11502849002849, + "grad_norm": 0.6584762334823608, + "learning_rate": 0.00016419770918812984, + "loss": 1.0322, + "step": 6263 + }, + { + "epoch": 1.1152065527065527, + "grad_norm": 0.6023790836334229, + "learning_rate": 0.00016418697636394705, + "loss": 0.9152, + "step": 6264 + }, + { + "epoch": 1.1153846153846154, + "grad_norm": 0.6234691739082336, + "learning_rate": 0.00016417624228215382, + "loss": 0.9555, + "step": 6265 + }, + { + "epoch": 1.1155626780626782, + "grad_norm": 0.6690816879272461, + "learning_rate": 0.00016416550694296045, + "loss": 0.9341, + "step": 6266 + }, + { + "epoch": 1.1157407407407407, + "grad_norm": 0.6030237078666687, + "learning_rate": 0.00016415477034657723, + "loss": 1.0442, + "step": 6267 + }, + { + "epoch": 1.1159188034188035, + "grad_norm": 0.5954633951187134, + "learning_rate": 0.00016414403249321455, + "loss": 0.9132, + "step": 6268 + }, + { + "epoch": 1.116096866096866, + "grad_norm": 0.7876830101013184, + "learning_rate": 0.0001641332933830828, + "loss": 0.9456, + "step": 6269 + }, + { + "epoch": 1.1162749287749287, + "grad_norm": 0.6776009798049927, + "learning_rate": 0.00016412255301639244, + "loss": 0.9022, + "step": 6270 + }, + { + "epoch": 1.1164529914529915, + "grad_norm": 0.6094426512718201, + "learning_rate": 0.0001641118113933538, + "loss": 0.9629, + "step": 6271 + }, + { + "epoch": 1.1166310541310542, + "grad_norm": 
0.5818213820457458, + "learning_rate": 0.00016410106851417742, + "loss": 0.9049, + "step": 6272 + }, + { + "epoch": 1.1168091168091168, + "grad_norm": 0.5668078064918518, + "learning_rate": 0.00016409032437907377, + "loss": 1.0011, + "step": 6273 + }, + { + "epoch": 1.1169871794871795, + "grad_norm": 0.6984922289848328, + "learning_rate": 0.00016407957898825334, + "loss": 0.9454, + "step": 6274 + }, + { + "epoch": 1.1171652421652423, + "grad_norm": 0.5509830117225647, + "learning_rate": 0.00016406883234192668, + "loss": 0.9132, + "step": 6275 + }, + { + "epoch": 1.1173433048433048, + "grad_norm": 0.5117461681365967, + "learning_rate": 0.00016405808444030435, + "loss": 0.7675, + "step": 6276 + }, + { + "epoch": 1.1175213675213675, + "grad_norm": 0.6358339786529541, + "learning_rate": 0.00016404733528359688, + "loss": 0.9777, + "step": 6277 + }, + { + "epoch": 1.1176994301994303, + "grad_norm": 0.5870591402053833, + "learning_rate": 0.00016403658487201494, + "loss": 0.8576, + "step": 6278 + }, + { + "epoch": 1.1178774928774928, + "grad_norm": 0.6532407999038696, + "learning_rate": 0.00016402583320576915, + "loss": 1.1787, + "step": 6279 + }, + { + "epoch": 1.1180555555555556, + "grad_norm": 0.6374639272689819, + "learning_rate": 0.00016401508028507017, + "loss": 0.9298, + "step": 6280 + }, + { + "epoch": 1.1182336182336183, + "grad_norm": 0.7280316352844238, + "learning_rate": 0.00016400432611012869, + "loss": 1.1081, + "step": 6281 + }, + { + "epoch": 1.1184116809116809, + "grad_norm": 0.6070699095726013, + "learning_rate": 0.00016399357068115538, + "loss": 0.9107, + "step": 6282 + }, + { + "epoch": 1.1185897435897436, + "grad_norm": 0.6701489686965942, + "learning_rate": 0.00016398281399836097, + "loss": 1.0879, + "step": 6283 + }, + { + "epoch": 1.1187678062678064, + "grad_norm": 0.6343162655830383, + "learning_rate": 0.00016397205606195626, + "loss": 0.8552, + "step": 6284 + }, + { + "epoch": 1.118945868945869, + "grad_norm": 0.6450608968734741, + 
"learning_rate": 0.00016396129687215198, + "loss": 1.1119, + "step": 6285 + }, + { + "epoch": 1.1191239316239316, + "grad_norm": 0.7219904661178589, + "learning_rate": 0.00016395053642915896, + "loss": 0.9081, + "step": 6286 + }, + { + "epoch": 1.1193019943019944, + "grad_norm": 0.6189733147621155, + "learning_rate": 0.00016393977473318802, + "loss": 0.9818, + "step": 6287 + }, + { + "epoch": 1.119480056980057, + "grad_norm": 0.6310907602310181, + "learning_rate": 0.00016392901178445004, + "loss": 1.0334, + "step": 6288 + }, + { + "epoch": 1.1196581196581197, + "grad_norm": 0.6556720733642578, + "learning_rate": 0.00016391824758315587, + "loss": 1.0452, + "step": 6289 + }, + { + "epoch": 1.1198361823361824, + "grad_norm": 0.6697782278060913, + "learning_rate": 0.00016390748212951638, + "loss": 0.9627, + "step": 6290 + }, + { + "epoch": 1.120014245014245, + "grad_norm": 0.6341549754142761, + "learning_rate": 0.00016389671542374256, + "loss": 1.112, + "step": 6291 + }, + { + "epoch": 1.1201923076923077, + "grad_norm": 0.6913946270942688, + "learning_rate": 0.00016388594746604535, + "loss": 0.9622, + "step": 6292 + }, + { + "epoch": 1.1203703703703705, + "grad_norm": 0.695488691329956, + "learning_rate": 0.0001638751782566357, + "loss": 1.0951, + "step": 6293 + }, + { + "epoch": 1.120548433048433, + "grad_norm": 0.6965359449386597, + "learning_rate": 0.00016386440779572463, + "loss": 1.1742, + "step": 6294 + }, + { + "epoch": 1.1207264957264957, + "grad_norm": 0.624679684638977, + "learning_rate": 0.00016385363608352314, + "loss": 0.9756, + "step": 6295 + }, + { + "epoch": 1.1209045584045585, + "grad_norm": 0.7511318922042847, + "learning_rate": 0.0001638428631202423, + "loss": 0.907, + "step": 6296 + }, + { + "epoch": 1.121082621082621, + "grad_norm": 0.5334641337394714, + "learning_rate": 0.00016383208890609317, + "loss": 0.7932, + "step": 6297 + }, + { + "epoch": 1.1212606837606838, + "grad_norm": 0.7518552541732788, + "learning_rate": 0.00016382131344128687, + 
"loss": 1.1556, + "step": 6298 + }, + { + "epoch": 1.1214387464387465, + "grad_norm": 0.618618369102478, + "learning_rate": 0.00016381053672603449, + "loss": 1.1027, + "step": 6299 + }, + { + "epoch": 1.121616809116809, + "grad_norm": 0.638956606388092, + "learning_rate": 0.00016379975876054724, + "loss": 1.0377, + "step": 6300 + }, + { + "epoch": 1.1217948717948718, + "grad_norm": 0.8031370639801025, + "learning_rate": 0.0001637889795450362, + "loss": 1.0821, + "step": 6301 + }, + { + "epoch": 1.1219729344729346, + "grad_norm": 0.6710168123245239, + "learning_rate": 0.00016377819907971265, + "loss": 1.2896, + "step": 6302 + }, + { + "epoch": 1.122150997150997, + "grad_norm": 0.5850739479064941, + "learning_rate": 0.00016376741736478777, + "loss": 1.0836, + "step": 6303 + }, + { + "epoch": 1.1223290598290598, + "grad_norm": 0.6410611271858215, + "learning_rate": 0.0001637566344004728, + "loss": 1.0395, + "step": 6304 + }, + { + "epoch": 1.1225071225071226, + "grad_norm": 0.6884660720825195, + "learning_rate": 0.00016374585018697903, + "loss": 0.871, + "step": 6305 + }, + { + "epoch": 1.1226851851851851, + "grad_norm": 0.622207522392273, + "learning_rate": 0.00016373506472451777, + "loss": 0.9897, + "step": 6306 + }, + { + "epoch": 1.1228632478632479, + "grad_norm": 0.6018275618553162, + "learning_rate": 0.00016372427801330028, + "loss": 0.8398, + "step": 6307 + }, + { + "epoch": 1.1230413105413106, + "grad_norm": 0.6451539993286133, + "learning_rate": 0.00016371349005353796, + "loss": 0.9878, + "step": 6308 + }, + { + "epoch": 1.1232193732193732, + "grad_norm": 0.5549424886703491, + "learning_rate": 0.00016370270084544215, + "loss": 0.844, + "step": 6309 + }, + { + "epoch": 1.123397435897436, + "grad_norm": 0.6082940697669983, + "learning_rate": 0.00016369191038922423, + "loss": 1.0704, + "step": 6310 + }, + { + "epoch": 1.1235754985754987, + "grad_norm": 0.6423100829124451, + "learning_rate": 0.00016368111868509563, + "loss": 1.0639, + "step": 6311 + }, + { + 
"epoch": 1.1237535612535612, + "grad_norm": 0.6274200081825256, + "learning_rate": 0.00016367032573326784, + "loss": 0.9996, + "step": 6312 + }, + { + "epoch": 1.123931623931624, + "grad_norm": 0.6618558168411255, + "learning_rate": 0.00016365953153395227, + "loss": 0.8074, + "step": 6313 + }, + { + "epoch": 1.1241096866096867, + "grad_norm": 0.7624069452285767, + "learning_rate": 0.00016364873608736038, + "loss": 0.9741, + "step": 6314 + }, + { + "epoch": 1.1242877492877492, + "grad_norm": 0.5391361117362976, + "learning_rate": 0.00016363793939370375, + "loss": 0.6992, + "step": 6315 + }, + { + "epoch": 1.124465811965812, + "grad_norm": 0.7564396858215332, + "learning_rate": 0.0001636271414531939, + "loss": 1.1971, + "step": 6316 + }, + { + "epoch": 1.1246438746438747, + "grad_norm": 0.6584066152572632, + "learning_rate": 0.00016361634226604239, + "loss": 1.0842, + "step": 6317 + }, + { + "epoch": 1.1248219373219372, + "grad_norm": 0.6851227283477783, + "learning_rate": 0.00016360554183246078, + "loss": 1.0879, + "step": 6318 + }, + { + "epoch": 1.125, + "grad_norm": 0.5699417591094971, + "learning_rate": 0.00016359474015266074, + "loss": 0.782, + "step": 6319 + }, + { + "epoch": 1.1251780626780628, + "grad_norm": 0.5495570302009583, + "learning_rate": 0.00016358393722685385, + "loss": 1.076, + "step": 6320 + }, + { + "epoch": 1.1253561253561253, + "grad_norm": 0.5872206091880798, + "learning_rate": 0.0001635731330552518, + "loss": 0.8601, + "step": 6321 + }, + { + "epoch": 1.125534188034188, + "grad_norm": 0.7012827396392822, + "learning_rate": 0.00016356232763806627, + "loss": 1.0443, + "step": 6322 + }, + { + "epoch": 1.1257122507122508, + "grad_norm": 0.6645881533622742, + "learning_rate": 0.00016355152097550897, + "loss": 1.0027, + "step": 6323 + }, + { + "epoch": 1.1258903133903133, + "grad_norm": 0.7376120090484619, + "learning_rate": 0.00016354071306779163, + "loss": 1.1941, + "step": 6324 + }, + { + "epoch": 1.126068376068376, + "grad_norm": 
0.648932695388794, + "learning_rate": 0.000163529903915126, + "loss": 1.096, + "step": 6325 + }, + { + "epoch": 1.1262464387464388, + "grad_norm": 0.6186314821243286, + "learning_rate": 0.0001635190935177239, + "loss": 1.011, + "step": 6326 + }, + { + "epoch": 1.1264245014245013, + "grad_norm": 0.5964710116386414, + "learning_rate": 0.0001635082818757971, + "loss": 0.8893, + "step": 6327 + }, + { + "epoch": 1.126602564102564, + "grad_norm": 0.5264934301376343, + "learning_rate": 0.00016349746898955747, + "loss": 0.7325, + "step": 6328 + }, + { + "epoch": 1.1267806267806268, + "grad_norm": 0.6523048877716064, + "learning_rate": 0.00016348665485921678, + "loss": 1.0488, + "step": 6329 + }, + { + "epoch": 1.1269586894586894, + "grad_norm": 0.6878600120544434, + "learning_rate": 0.00016347583948498703, + "loss": 1.0926, + "step": 6330 + }, + { + "epoch": 1.1271367521367521, + "grad_norm": 0.592656672000885, + "learning_rate": 0.00016346502286708004, + "loss": 0.978, + "step": 6331 + }, + { + "epoch": 1.1273148148148149, + "grad_norm": 0.6338315606117249, + "learning_rate": 0.00016345420500570777, + "loss": 1.1048, + "step": 6332 + }, + { + "epoch": 1.1274928774928774, + "grad_norm": 0.5955204367637634, + "learning_rate": 0.00016344338590108218, + "loss": 0.88, + "step": 6333 + }, + { + "epoch": 1.1276709401709402, + "grad_norm": 0.690448522567749, + "learning_rate": 0.0001634325655534152, + "loss": 1.0564, + "step": 6334 + }, + { + "epoch": 1.127849002849003, + "grad_norm": 0.6125795841217041, + "learning_rate": 0.00016342174396291888, + "loss": 1.0608, + "step": 6335 + }, + { + "epoch": 1.1280270655270654, + "grad_norm": 0.6387807726860046, + "learning_rate": 0.00016341092112980523, + "loss": 0.9581, + "step": 6336 + }, + { + "epoch": 1.1282051282051282, + "grad_norm": 0.6247823238372803, + "learning_rate": 0.0001634000970542863, + "loss": 0.932, + "step": 6337 + }, + { + "epoch": 1.128383190883191, + "grad_norm": 0.5928077697753906, + "learning_rate": 
0.0001633892717365742, + "loss": 0.8963, + "step": 6338 + }, + { + "epoch": 1.1285612535612535, + "grad_norm": 0.5922074913978577, + "learning_rate": 0.000163378445176881, + "loss": 0.9772, + "step": 6339 + }, + { + "epoch": 1.1287393162393162, + "grad_norm": 0.6573056578636169, + "learning_rate": 0.00016336761737541878, + "loss": 0.8233, + "step": 6340 + }, + { + "epoch": 1.128917378917379, + "grad_norm": 0.627772867679596, + "learning_rate": 0.0001633567883323998, + "loss": 0.9618, + "step": 6341 + }, + { + "epoch": 1.1290954415954415, + "grad_norm": 0.6066579818725586, + "learning_rate": 0.0001633459580480361, + "loss": 0.9066, + "step": 6342 + }, + { + "epoch": 1.1292735042735043, + "grad_norm": 0.670295000076294, + "learning_rate": 0.00016333512652253997, + "loss": 0.8003, + "step": 6343 + }, + { + "epoch": 1.129451566951567, + "grad_norm": 0.6402488946914673, + "learning_rate": 0.0001633242937561236, + "loss": 0.998, + "step": 6344 + }, + { + "epoch": 1.1296296296296295, + "grad_norm": 0.7224995493888855, + "learning_rate": 0.00016331345974899923, + "loss": 1.0308, + "step": 6345 + }, + { + "epoch": 1.1298076923076923, + "grad_norm": 0.5019716620445251, + "learning_rate": 0.00016330262450137917, + "loss": 0.6874, + "step": 6346 + }, + { + "epoch": 1.129985754985755, + "grad_norm": 0.5774167776107788, + "learning_rate": 0.00016329178801347566, + "loss": 0.8287, + "step": 6347 + }, + { + "epoch": 1.1301638176638176, + "grad_norm": 0.7797795534133911, + "learning_rate": 0.00016328095028550103, + "loss": 1.2145, + "step": 6348 + }, + { + "epoch": 1.1303418803418803, + "grad_norm": 0.5384017825126648, + "learning_rate": 0.00016327011131766765, + "loss": 0.8022, + "step": 6349 + }, + { + "epoch": 1.130519943019943, + "grad_norm": 0.6350888609886169, + "learning_rate": 0.00016325927111018786, + "loss": 1.1178, + "step": 6350 + }, + { + "epoch": 1.1306980056980056, + "grad_norm": 0.6386831998825073, + "learning_rate": 0.0001632484296632741, + "loss": 0.967, + "step": 
6351 + }, + { + "epoch": 1.1308760683760684, + "grad_norm": 0.6214167475700378, + "learning_rate": 0.0001632375869771387, + "loss": 0.9416, + "step": 6352 + }, + { + "epoch": 1.131054131054131, + "grad_norm": 0.6145567297935486, + "learning_rate": 0.00016322674305199416, + "loss": 0.9175, + "step": 6353 + }, + { + "epoch": 1.1312321937321936, + "grad_norm": 0.7027857303619385, + "learning_rate": 0.00016321589788805297, + "loss": 1.0063, + "step": 6354 + }, + { + "epoch": 1.1314102564102564, + "grad_norm": 0.6942669153213501, + "learning_rate": 0.00016320505148552755, + "loss": 0.9191, + "step": 6355 + }, + { + "epoch": 1.1315883190883191, + "grad_norm": 0.6388658285140991, + "learning_rate": 0.0001631942038446304, + "loss": 0.993, + "step": 6356 + }, + { + "epoch": 1.131766381766382, + "grad_norm": 0.6627292633056641, + "learning_rate": 0.00016318335496557415, + "loss": 1.0055, + "step": 6357 + }, + { + "epoch": 1.1319444444444444, + "grad_norm": 0.7997342944145203, + "learning_rate": 0.0001631725048485713, + "loss": 0.9019, + "step": 6358 + }, + { + "epoch": 1.1321225071225072, + "grad_norm": 0.8817830681800842, + "learning_rate": 0.00016316165349383445, + "loss": 0.9793, + "step": 6359 + }, + { + "epoch": 1.1323005698005697, + "grad_norm": 0.5629408955574036, + "learning_rate": 0.00016315080090157621, + "loss": 0.6139, + "step": 6360 + }, + { + "epoch": 1.1324786324786325, + "grad_norm": 0.647220253944397, + "learning_rate": 0.0001631399470720092, + "loss": 0.9776, + "step": 6361 + }, + { + "epoch": 1.1326566951566952, + "grad_norm": 0.6762630939483643, + "learning_rate": 0.0001631290920053461, + "loss": 1.1027, + "step": 6362 + }, + { + "epoch": 1.132834757834758, + "grad_norm": 0.5862727761268616, + "learning_rate": 0.00016311823570179957, + "loss": 1.1359, + "step": 6363 + }, + { + "epoch": 1.1330128205128205, + "grad_norm": 0.7042981386184692, + "learning_rate": 0.00016310737816158235, + "loss": 1.142, + "step": 6364 + }, + { + "epoch": 1.1331908831908832, + 
"grad_norm": 0.5990639328956604, + "learning_rate": 0.00016309651938490712, + "loss": 0.9306, + "step": 6365 + }, + { + "epoch": 1.1333689458689458, + "grad_norm": 0.5894871950149536, + "learning_rate": 0.00016308565937198669, + "loss": 0.8343, + "step": 6366 + }, + { + "epoch": 1.1335470085470085, + "grad_norm": 0.6863628029823303, + "learning_rate": 0.0001630747981230338, + "loss": 0.9552, + "step": 6367 + }, + { + "epoch": 1.1337250712250713, + "grad_norm": 0.7438958287239075, + "learning_rate": 0.00016306393563826128, + "loss": 1.0422, + "step": 6368 + }, + { + "epoch": 1.133903133903134, + "grad_norm": 0.5695775747299194, + "learning_rate": 0.00016305307191788194, + "loss": 0.8633, + "step": 6369 + }, + { + "epoch": 1.1340811965811965, + "grad_norm": 0.6257741451263428, + "learning_rate": 0.00016304220696210863, + "loss": 1.0333, + "step": 6370 + }, + { + "epoch": 1.1342592592592593, + "grad_norm": 0.6366072297096252, + "learning_rate": 0.00016303134077115425, + "loss": 1.1452, + "step": 6371 + }, + { + "epoch": 1.1344373219373218, + "grad_norm": 0.624569296836853, + "learning_rate": 0.00016302047334523168, + "loss": 1.0569, + "step": 6372 + }, + { + "epoch": 1.1346153846153846, + "grad_norm": 0.5585938096046448, + "learning_rate": 0.00016300960468455382, + "loss": 0.9612, + "step": 6373 + }, + { + "epoch": 1.1347934472934473, + "grad_norm": 0.5738831162452698, + "learning_rate": 0.00016299873478933368, + "loss": 0.9206, + "step": 6374 + }, + { + "epoch": 1.13497150997151, + "grad_norm": 0.6797143220901489, + "learning_rate": 0.00016298786365978417, + "loss": 1.0748, + "step": 6375 + }, + { + "epoch": 1.1351495726495726, + "grad_norm": 0.6341326832771301, + "learning_rate": 0.00016297699129611833, + "loss": 0.9901, + "step": 6376 + }, + { + "epoch": 1.1353276353276354, + "grad_norm": 0.6568490862846375, + "learning_rate": 0.00016296611769854916, + "loss": 1.0598, + "step": 6377 + }, + { + "epoch": 1.135505698005698, + "grad_norm": 0.6151928901672363, + 
"learning_rate": 0.00016295524286728973, + "loss": 0.8352, + "step": 6378 + }, + { + "epoch": 1.1356837606837606, + "grad_norm": 0.7209593057632446, + "learning_rate": 0.0001629443668025531, + "loss": 0.9945, + "step": 6379 + }, + { + "epoch": 1.1358618233618234, + "grad_norm": 0.6600689888000488, + "learning_rate": 0.00016293348950455235, + "loss": 1.0572, + "step": 6380 + }, + { + "epoch": 1.1360398860398861, + "grad_norm": 0.5587523579597473, + "learning_rate": 0.0001629226109735006, + "loss": 0.8526, + "step": 6381 + }, + { + "epoch": 1.1362179487179487, + "grad_norm": 0.6184542775154114, + "learning_rate": 0.00016291173120961102, + "loss": 0.8246, + "step": 6382 + }, + { + "epoch": 1.1363960113960114, + "grad_norm": 0.6604713797569275, + "learning_rate": 0.00016290085021309673, + "loss": 1.0349, + "step": 6383 + }, + { + "epoch": 1.136574074074074, + "grad_norm": 0.5880835056304932, + "learning_rate": 0.00016288996798417097, + "loss": 0.8726, + "step": 6384 + }, + { + "epoch": 1.1367521367521367, + "grad_norm": 0.5770880579948425, + "learning_rate": 0.00016287908452304692, + "loss": 0.7639, + "step": 6385 + }, + { + "epoch": 1.1369301994301995, + "grad_norm": 0.5719713568687439, + "learning_rate": 0.00016286819982993782, + "loss": 0.9717, + "step": 6386 + }, + { + "epoch": 1.1371082621082622, + "grad_norm": 0.7028461694717407, + "learning_rate": 0.00016285731390505695, + "loss": 1.0147, + "step": 6387 + }, + { + "epoch": 1.1372863247863247, + "grad_norm": 0.5396828651428223, + "learning_rate": 0.00016284642674861756, + "loss": 0.8119, + "step": 6388 + }, + { + "epoch": 1.1374643874643875, + "grad_norm": 0.592580258846283, + "learning_rate": 0.00016283553836083303, + "loss": 1.0914, + "step": 6389 + }, + { + "epoch": 1.13764245014245, + "grad_norm": 0.634596586227417, + "learning_rate": 0.00016282464874191663, + "loss": 1.1037, + "step": 6390 + }, + { + "epoch": 1.1378205128205128, + "grad_norm": 0.6462705731391907, + "learning_rate": 0.00016281375789208176, + 
"loss": 1.1523, + "step": 6391 + }, + { + "epoch": 1.1379985754985755, + "grad_norm": 0.6527917385101318, + "learning_rate": 0.0001628028658115418, + "loss": 1.0415, + "step": 6392 + }, + { + "epoch": 1.1381766381766383, + "grad_norm": 0.6309964060783386, + "learning_rate": 0.00016279197250051013, + "loss": 0.9747, + "step": 6393 + }, + { + "epoch": 1.1383547008547008, + "grad_norm": 0.6342993974685669, + "learning_rate": 0.00016278107795920018, + "loss": 0.9897, + "step": 6394 + }, + { + "epoch": 1.1385327635327636, + "grad_norm": 0.7149887084960938, + "learning_rate": 0.00016277018218782544, + "loss": 0.9659, + "step": 6395 + }, + { + "epoch": 1.138710826210826, + "grad_norm": 0.7219462394714355, + "learning_rate": 0.00016275928518659938, + "loss": 0.9301, + "step": 6396 + }, + { + "epoch": 1.1388888888888888, + "grad_norm": 0.6649485230445862, + "learning_rate": 0.0001627483869557355, + "loss": 0.9012, + "step": 6397 + }, + { + "epoch": 1.1390669515669516, + "grad_norm": 0.6910027861595154, + "learning_rate": 0.00016273748749544731, + "loss": 0.956, + "step": 6398 + }, + { + "epoch": 1.1392450142450143, + "grad_norm": 0.6369016766548157, + "learning_rate": 0.00016272658680594837, + "loss": 0.8027, + "step": 6399 + }, + { + "epoch": 1.1394230769230769, + "grad_norm": 0.6540524959564209, + "learning_rate": 0.00016271568488745227, + "loss": 1.2397, + "step": 6400 + }, + { + "epoch": 1.1396011396011396, + "grad_norm": 0.5912376046180725, + "learning_rate": 0.00016270478174017263, + "loss": 0.8453, + "step": 6401 + }, + { + "epoch": 1.1397792022792024, + "grad_norm": 0.6847240924835205, + "learning_rate": 0.00016269387736432303, + "loss": 0.9776, + "step": 6402 + }, + { + "epoch": 1.139957264957265, + "grad_norm": 0.6465024352073669, + "learning_rate": 0.00016268297176011716, + "loss": 0.8971, + "step": 6403 + }, + { + "epoch": 1.1401353276353277, + "grad_norm": 0.6639063954353333, + "learning_rate": 0.00016267206492776866, + "loss": 0.9756, + "step": 6404 + }, + { + 
"epoch": 1.1403133903133904, + "grad_norm": 0.6343763470649719, + "learning_rate": 0.00016266115686749123, + "loss": 0.9368, + "step": 6405 + }, + { + "epoch": 1.140491452991453, + "grad_norm": 0.7144993543624878, + "learning_rate": 0.0001626502475794986, + "loss": 0.9285, + "step": 6406 + }, + { + "epoch": 1.1406695156695157, + "grad_norm": 0.6217414736747742, + "learning_rate": 0.00016263933706400451, + "loss": 0.8867, + "step": 6407 + }, + { + "epoch": 1.1408475783475784, + "grad_norm": 0.6843730807304382, + "learning_rate": 0.00016262842532122274, + "loss": 0.9863, + "step": 6408 + }, + { + "epoch": 1.141025641025641, + "grad_norm": 0.6866166591644287, + "learning_rate": 0.00016261751235136705, + "loss": 1.0517, + "step": 6409 + }, + { + "epoch": 1.1412037037037037, + "grad_norm": 0.6650584936141968, + "learning_rate": 0.0001626065981546513, + "loss": 1.0629, + "step": 6410 + }, + { + "epoch": 1.1413817663817665, + "grad_norm": 0.5805012583732605, + "learning_rate": 0.00016259568273128933, + "loss": 0.8175, + "step": 6411 + }, + { + "epoch": 1.141559829059829, + "grad_norm": 0.7005903124809265, + "learning_rate": 0.00016258476608149497, + "loss": 1.0267, + "step": 6412 + }, + { + "epoch": 1.1417378917378918, + "grad_norm": 0.6293461322784424, + "learning_rate": 0.00016257384820548217, + "loss": 1.1034, + "step": 6413 + }, + { + "epoch": 1.1419159544159545, + "grad_norm": 0.6281774640083313, + "learning_rate": 0.00016256292910346476, + "loss": 1.0775, + "step": 6414 + }, + { + "epoch": 1.142094017094017, + "grad_norm": 0.5912862420082092, + "learning_rate": 0.0001625520087756567, + "loss": 0.9589, + "step": 6415 + }, + { + "epoch": 1.1422720797720798, + "grad_norm": 0.5813978314399719, + "learning_rate": 0.00016254108722227198, + "loss": 0.9195, + "step": 6416 + }, + { + "epoch": 1.1424501424501425, + "grad_norm": 0.650805652141571, + "learning_rate": 0.00016253016444352458, + "loss": 1.0207, + "step": 6417 + }, + { + "epoch": 1.142628205128205, + "grad_norm": 
0.6909520030021667, + "learning_rate": 0.00016251924043962851, + "loss": 0.9854, + "step": 6418 + }, + { + "epoch": 1.1428062678062678, + "grad_norm": 0.6054595112800598, + "learning_rate": 0.0001625083152107978, + "loss": 0.852, + "step": 6419 + }, + { + "epoch": 1.1429843304843306, + "grad_norm": 0.601078987121582, + "learning_rate": 0.00016249738875724647, + "loss": 0.9609, + "step": 6420 + }, + { + "epoch": 1.143162393162393, + "grad_norm": 0.5340180397033691, + "learning_rate": 0.00016248646107918868, + "loss": 0.8364, + "step": 6421 + }, + { + "epoch": 1.1433404558404558, + "grad_norm": 0.6687821745872498, + "learning_rate": 0.00016247553217683846, + "loss": 1.005, + "step": 6422 + }, + { + "epoch": 1.1435185185185186, + "grad_norm": 0.6347902417182922, + "learning_rate": 0.00016246460205040998, + "loss": 1.026, + "step": 6423 + }, + { + "epoch": 1.1436965811965811, + "grad_norm": 0.6136734485626221, + "learning_rate": 0.00016245367070011736, + "loss": 0.7811, + "step": 6424 + }, + { + "epoch": 1.1438746438746439, + "grad_norm": 0.6591334342956543, + "learning_rate": 0.00016244273812617482, + "loss": 0.991, + "step": 6425 + }, + { + "epoch": 1.1440527065527066, + "grad_norm": 0.6062475442886353, + "learning_rate": 0.00016243180432879656, + "loss": 0.9879, + "step": 6426 + }, + { + "epoch": 1.1442307692307692, + "grad_norm": 0.5941380858421326, + "learning_rate": 0.00016242086930819678, + "loss": 0.9771, + "step": 6427 + }, + { + "epoch": 1.144408831908832, + "grad_norm": 0.7320533990859985, + "learning_rate": 0.00016240993306458973, + "loss": 1.0919, + "step": 6428 + }, + { + "epoch": 1.1445868945868947, + "grad_norm": 0.6998075246810913, + "learning_rate": 0.00016239899559818962, + "loss": 1.0721, + "step": 6429 + }, + { + "epoch": 1.1447649572649572, + "grad_norm": 0.847931444644928, + "learning_rate": 0.0001623880569092109, + "loss": 0.8759, + "step": 6430 + }, + { + "epoch": 1.14494301994302, + "grad_norm": 0.6670104265213013, + "learning_rate": 
0.00016237711699786775, + "loss": 1.0515, + "step": 6431 + }, + { + "epoch": 1.1451210826210827, + "grad_norm": 0.601759672164917, + "learning_rate": 0.00016236617586437463, + "loss": 0.7298, + "step": 6432 + }, + { + "epoch": 1.1452991452991452, + "grad_norm": 0.6411594152450562, + "learning_rate": 0.00016235523350894578, + "loss": 0.9336, + "step": 6433 + }, + { + "epoch": 1.145477207977208, + "grad_norm": 0.6485120058059692, + "learning_rate": 0.0001623442899317957, + "loss": 1.1215, + "step": 6434 + }, + { + "epoch": 1.1456552706552707, + "grad_norm": 0.6041508316993713, + "learning_rate": 0.00016233334513313875, + "loss": 0.8917, + "step": 6435 + }, + { + "epoch": 1.1458333333333333, + "grad_norm": 0.6292745471000671, + "learning_rate": 0.0001623223991131894, + "loss": 0.9976, + "step": 6436 + }, + { + "epoch": 1.146011396011396, + "grad_norm": 0.5442200303077698, + "learning_rate": 0.0001623114518721621, + "loss": 0.8072, + "step": 6437 + }, + { + "epoch": 1.1461894586894588, + "grad_norm": 0.6668170094490051, + "learning_rate": 0.00016230050341027136, + "loss": 0.9641, + "step": 6438 + }, + { + "epoch": 1.1463675213675213, + "grad_norm": 0.644186794757843, + "learning_rate": 0.00016228955372773164, + "loss": 0.9248, + "step": 6439 + }, + { + "epoch": 1.146545584045584, + "grad_norm": 0.6661991477012634, + "learning_rate": 0.00016227860282475753, + "loss": 0.8719, + "step": 6440 + }, + { + "epoch": 1.1467236467236468, + "grad_norm": 0.5232062935829163, + "learning_rate": 0.00016226765070156355, + "loss": 0.5418, + "step": 6441 + }, + { + "epoch": 1.1469017094017093, + "grad_norm": 0.573176383972168, + "learning_rate": 0.00016225669735836436, + "loss": 1.0858, + "step": 6442 + }, + { + "epoch": 1.147079772079772, + "grad_norm": 0.6137439608573914, + "learning_rate": 0.00016224574279537446, + "loss": 1.1205, + "step": 6443 + }, + { + "epoch": 1.1472578347578348, + "grad_norm": 0.6328136920928955, + "learning_rate": 0.00016223478701280855, + "loss": 0.8957, + 
"step": 6444 + }, + { + "epoch": 1.1474358974358974, + "grad_norm": 0.6687374114990234, + "learning_rate": 0.00016222383001088126, + "loss": 1.0318, + "step": 6445 + }, + { + "epoch": 1.14761396011396, + "grad_norm": 0.6057115793228149, + "learning_rate": 0.0001622128717898073, + "loss": 0.9575, + "step": 6446 + }, + { + "epoch": 1.1477920227920229, + "grad_norm": 0.6758735775947571, + "learning_rate": 0.0001622019123498013, + "loss": 1.2273, + "step": 6447 + }, + { + "epoch": 1.1479700854700854, + "grad_norm": 0.6233550310134888, + "learning_rate": 0.0001621909516910781, + "loss": 0.7875, + "step": 6448 + }, + { + "epoch": 1.1481481481481481, + "grad_norm": 0.6371827721595764, + "learning_rate": 0.0001621799898138524, + "loss": 1.0488, + "step": 6449 + }, + { + "epoch": 1.148326210826211, + "grad_norm": 0.6179831624031067, + "learning_rate": 0.00016216902671833892, + "loss": 0.9792, + "step": 6450 + }, + { + "epoch": 1.1485042735042734, + "grad_norm": 0.6234193444252014, + "learning_rate": 0.00016215806240475256, + "loss": 0.927, + "step": 6451 + }, + { + "epoch": 1.1486823361823362, + "grad_norm": 0.6940563917160034, + "learning_rate": 0.00016214709687330803, + "loss": 1.047, + "step": 6452 + }, + { + "epoch": 1.148860398860399, + "grad_norm": 0.6567606925964355, + "learning_rate": 0.00016213613012422027, + "loss": 0.9695, + "step": 6453 + }, + { + "epoch": 1.1490384615384615, + "grad_norm": 0.7374183535575867, + "learning_rate": 0.0001621251621577041, + "loss": 1.0443, + "step": 6454 + }, + { + "epoch": 1.1492165242165242, + "grad_norm": 0.6789869666099548, + "learning_rate": 0.00016211419297397443, + "loss": 1.0319, + "step": 6455 + }, + { + "epoch": 1.149394586894587, + "grad_norm": 0.6225521564483643, + "learning_rate": 0.00016210322257324619, + "loss": 1.0529, + "step": 6456 + }, + { + "epoch": 1.1495726495726495, + "grad_norm": 0.619701623916626, + "learning_rate": 0.00016209225095573432, + "loss": 0.962, + "step": 6457 + }, + { + "epoch": 
1.1497507122507122, + "grad_norm": 0.6132834553718567, + "learning_rate": 0.00016208127812165375, + "loss": 0.9588, + "step": 6458 + }, + { + "epoch": 1.149928774928775, + "grad_norm": 0.6005367040634155, + "learning_rate": 0.00016207030407121954, + "loss": 0.9497, + "step": 6459 + }, + { + "epoch": 1.1501068376068375, + "grad_norm": 0.575309157371521, + "learning_rate": 0.00016205932880464664, + "loss": 1.0035, + "step": 6460 + }, + { + "epoch": 1.1502849002849003, + "grad_norm": 0.5958710312843323, + "learning_rate": 0.0001620483523221501, + "loss": 1.0004, + "step": 6461 + }, + { + "epoch": 1.150462962962963, + "grad_norm": 0.5934719443321228, + "learning_rate": 0.000162037374623945, + "loss": 0.8694, + "step": 6462 + }, + { + "epoch": 1.1506410256410255, + "grad_norm": 0.6042510271072388, + "learning_rate": 0.00016202639571024643, + "loss": 0.8598, + "step": 6463 + }, + { + "epoch": 1.1508190883190883, + "grad_norm": 0.6206158399581909, + "learning_rate": 0.00016201541558126946, + "loss": 0.961, + "step": 6464 + }, + { + "epoch": 1.150997150997151, + "grad_norm": 0.5997715592384338, + "learning_rate": 0.00016200443423722925, + "loss": 0.8686, + "step": 6465 + }, + { + "epoch": 1.1511752136752136, + "grad_norm": 0.742457926273346, + "learning_rate": 0.00016199345167834098, + "loss": 1.1113, + "step": 6466 + }, + { + "epoch": 1.1513532763532763, + "grad_norm": 0.6772766709327698, + "learning_rate": 0.00016198246790481976, + "loss": 1.0717, + "step": 6467 + }, + { + "epoch": 1.151531339031339, + "grad_norm": 0.6127712726593018, + "learning_rate": 0.0001619714829168809, + "loss": 0.8887, + "step": 6468 + }, + { + "epoch": 1.1517094017094016, + "grad_norm": 0.5585067272186279, + "learning_rate": 0.00016196049671473954, + "loss": 1.0144, + "step": 6469 + }, + { + "epoch": 1.1518874643874644, + "grad_norm": 0.6269431710243225, + "learning_rate": 0.00016194950929861092, + "loss": 1.0206, + "step": 6470 + }, + { + "epoch": 1.1520655270655271, + "grad_norm": 
0.6270785331726074, + "learning_rate": 0.0001619385206687104, + "loss": 1.0517, + "step": 6471 + }, + { + "epoch": 1.1522435897435896, + "grad_norm": 0.744712233543396, + "learning_rate": 0.00016192753082525322, + "loss": 1.0699, + "step": 6472 + }, + { + "epoch": 1.1524216524216524, + "grad_norm": 0.7025929689407349, + "learning_rate": 0.00016191653976845474, + "loss": 0.951, + "step": 6473 + }, + { + "epoch": 1.1525997150997151, + "grad_norm": 0.6175379753112793, + "learning_rate": 0.00016190554749853024, + "loss": 1.2153, + "step": 6474 + }, + { + "epoch": 1.1527777777777777, + "grad_norm": 0.6212149858474731, + "learning_rate": 0.00016189455401569513, + "loss": 1.0428, + "step": 6475 + }, + { + "epoch": 1.1529558404558404, + "grad_norm": 0.6716817617416382, + "learning_rate": 0.00016188355932016484, + "loss": 1.179, + "step": 6476 + }, + { + "epoch": 1.1531339031339032, + "grad_norm": 0.6247739791870117, + "learning_rate": 0.00016187256341215476, + "loss": 0.9451, + "step": 6477 + }, + { + "epoch": 1.153311965811966, + "grad_norm": 0.6223008036613464, + "learning_rate": 0.00016186156629188032, + "loss": 0.9915, + "step": 6478 + }, + { + "epoch": 1.1534900284900285, + "grad_norm": 0.5610866546630859, + "learning_rate": 0.000161850567959557, + "loss": 0.7741, + "step": 6479 + }, + { + "epoch": 1.1536680911680912, + "grad_norm": 0.6241226196289062, + "learning_rate": 0.0001618395684154003, + "loss": 1.2193, + "step": 6480 + }, + { + "epoch": 1.1538461538461537, + "grad_norm": 0.703789472579956, + "learning_rate": 0.00016182856765962567, + "loss": 1.0725, + "step": 6481 + }, + { + "epoch": 1.1540242165242165, + "grad_norm": 0.6802006959915161, + "learning_rate": 0.00016181756569244872, + "loss": 1.0908, + "step": 6482 + }, + { + "epoch": 1.1542022792022792, + "grad_norm": 0.6504136919975281, + "learning_rate": 0.000161806562514085, + "loss": 0.9706, + "step": 6483 + }, + { + "epoch": 1.154380341880342, + "grad_norm": 0.7217034101486206, + "learning_rate": 
0.00016179555812475003, + "loss": 0.9084, + "step": 6484 + }, + { + "epoch": 1.1545584045584045, + "grad_norm": 0.5919039249420166, + "learning_rate": 0.0001617845525246595, + "loss": 0.949, + "step": 6485 + }, + { + "epoch": 1.1547364672364673, + "grad_norm": 0.6160184741020203, + "learning_rate": 0.00016177354571402902, + "loss": 0.8144, + "step": 6486 + }, + { + "epoch": 1.1549145299145298, + "grad_norm": 0.7323806285858154, + "learning_rate": 0.00016176253769307426, + "loss": 1.0528, + "step": 6487 + }, + { + "epoch": 1.1550925925925926, + "grad_norm": 0.6051317453384399, + "learning_rate": 0.0001617515284620108, + "loss": 0.9558, + "step": 6488 + }, + { + "epoch": 1.1552706552706553, + "grad_norm": 0.6418905258178711, + "learning_rate": 0.00016174051802105447, + "loss": 1.062, + "step": 6489 + }, + { + "epoch": 1.155448717948718, + "grad_norm": 0.6914883852005005, + "learning_rate": 0.00016172950637042096, + "loss": 0.9999, + "step": 6490 + }, + { + "epoch": 1.1556267806267806, + "grad_norm": 0.5558316707611084, + "learning_rate": 0.000161718493510326, + "loss": 0.9561, + "step": 6491 + }, + { + "epoch": 1.1558048433048433, + "grad_norm": 0.6632496118545532, + "learning_rate": 0.00016170747944098531, + "loss": 1.0133, + "step": 6492 + }, + { + "epoch": 1.1559829059829059, + "grad_norm": 0.6407149434089661, + "learning_rate": 0.00016169646416261478, + "loss": 1.0563, + "step": 6493 + }, + { + "epoch": 1.1561609686609686, + "grad_norm": 0.8128494024276733, + "learning_rate": 0.0001616854476754302, + "loss": 1.1559, + "step": 6494 + }, + { + "epoch": 1.1563390313390314, + "grad_norm": 0.6403429508209229, + "learning_rate": 0.00016167442997964742, + "loss": 1.0983, + "step": 6495 + }, + { + "epoch": 1.1565170940170941, + "grad_norm": 0.76612788438797, + "learning_rate": 0.0001616634110754823, + "loss": 0.973, + "step": 6496 + }, + { + "epoch": 1.1566951566951567, + "grad_norm": 0.6914355754852295, + "learning_rate": 0.0001616523909631507, + "loss": 0.9307, + 
"step": 6497 + }, + { + "epoch": 1.1568732193732194, + "grad_norm": 0.546602725982666, + "learning_rate": 0.00016164136964286863, + "loss": 1.0328, + "step": 6498 + }, + { + "epoch": 1.157051282051282, + "grad_norm": 0.5695818662643433, + "learning_rate": 0.00016163034711485193, + "loss": 0.9607, + "step": 6499 + }, + { + "epoch": 1.1572293447293447, + "grad_norm": 0.5649738311767578, + "learning_rate": 0.00016161932337931662, + "loss": 1.1521, + "step": 6500 + }, + { + "epoch": 1.1574074074074074, + "grad_norm": 0.6437582969665527, + "learning_rate": 0.00016160829843647867, + "loss": 0.9613, + "step": 6501 + }, + { + "epoch": 1.1575854700854702, + "grad_norm": 0.5841929316520691, + "learning_rate": 0.0001615972722865541, + "loss": 0.8187, + "step": 6502 + }, + { + "epoch": 1.1577635327635327, + "grad_norm": 0.6481246948242188, + "learning_rate": 0.00016158624492975892, + "loss": 1.0447, + "step": 6503 + }, + { + "epoch": 1.1579415954415955, + "grad_norm": 0.629804790019989, + "learning_rate": 0.0001615752163663092, + "loss": 0.9034, + "step": 6504 + }, + { + "epoch": 1.158119658119658, + "grad_norm": 0.5797054171562195, + "learning_rate": 0.00016156418659642104, + "loss": 0.8168, + "step": 6505 + }, + { + "epoch": 1.1582977207977208, + "grad_norm": 0.588424563407898, + "learning_rate": 0.00016155315562031052, + "loss": 0.828, + "step": 6506 + }, + { + "epoch": 1.1584757834757835, + "grad_norm": 0.7120068669319153, + "learning_rate": 0.0001615421234381938, + "loss": 1.0637, + "step": 6507 + }, + { + "epoch": 1.1586538461538463, + "grad_norm": 0.6635081768035889, + "learning_rate": 0.00016153109005028702, + "loss": 0.9838, + "step": 6508 + }, + { + "epoch": 1.1588319088319088, + "grad_norm": 0.6080414056777954, + "learning_rate": 0.00016152005545680634, + "loss": 0.983, + "step": 6509 + }, + { + "epoch": 1.1590099715099715, + "grad_norm": 0.7131237983703613, + "learning_rate": 0.00016150901965796796, + "loss": 1.1053, + "step": 6510 + }, + { + "epoch": 
1.159188034188034, + "grad_norm": 0.6051005125045776, + "learning_rate": 0.00016149798265398813, + "loss": 0.9903, + "step": 6511 + }, + { + "epoch": 1.1593660968660968, + "grad_norm": 0.6193733811378479, + "learning_rate": 0.00016148694444508306, + "loss": 1.0478, + "step": 6512 + }, + { + "epoch": 1.1595441595441596, + "grad_norm": 0.567888081073761, + "learning_rate": 0.00016147590503146905, + "loss": 0.7995, + "step": 6513 + }, + { + "epoch": 1.1597222222222223, + "grad_norm": 0.6889783143997192, + "learning_rate": 0.00016146486441336242, + "loss": 0.9684, + "step": 6514 + }, + { + "epoch": 1.1599002849002849, + "grad_norm": 0.6470308303833008, + "learning_rate": 0.0001614538225909794, + "loss": 0.9824, + "step": 6515 + }, + { + "epoch": 1.1600783475783476, + "grad_norm": 0.6833886504173279, + "learning_rate": 0.00016144277956453638, + "loss": 0.9845, + "step": 6516 + }, + { + "epoch": 1.1602564102564104, + "grad_norm": 0.5827815532684326, + "learning_rate": 0.00016143173533424978, + "loss": 0.9476, + "step": 6517 + }, + { + "epoch": 1.1604344729344729, + "grad_norm": 0.6701242327690125, + "learning_rate": 0.00016142068990033593, + "loss": 1.0839, + "step": 6518 + }, + { + "epoch": 1.1606125356125356, + "grad_norm": 0.5844996571540833, + "learning_rate": 0.00016140964326301122, + "loss": 0.8861, + "step": 6519 + }, + { + "epoch": 1.1607905982905984, + "grad_norm": 0.5831994414329529, + "learning_rate": 0.00016139859542249214, + "loss": 0.9817, + "step": 6520 + }, + { + "epoch": 1.160968660968661, + "grad_norm": 0.6830124855041504, + "learning_rate": 0.0001613875463789951, + "loss": 0.8749, + "step": 6521 + }, + { + "epoch": 1.1611467236467237, + "grad_norm": 0.6003018021583557, + "learning_rate": 0.00016137649613273667, + "loss": 0.9593, + "step": 6522 + }, + { + "epoch": 1.1613247863247864, + "grad_norm": 0.5973994731903076, + "learning_rate": 0.00016136544468393327, + "loss": 1.0384, + "step": 6523 + }, + { + "epoch": 1.161502849002849, + "grad_norm": 
0.6702523827552795, + "learning_rate": 0.00016135439203280143, + "loss": 1.0431, + "step": 6524 + }, + { + "epoch": 1.1616809116809117, + "grad_norm": 0.6160697937011719, + "learning_rate": 0.00016134333817955775, + "loss": 1.0339, + "step": 6525 + }, + { + "epoch": 1.1618589743589745, + "grad_norm": 0.7078264355659485, + "learning_rate": 0.0001613322831244188, + "loss": 1.0285, + "step": 6526 + }, + { + "epoch": 1.162037037037037, + "grad_norm": 0.5744216442108154, + "learning_rate": 0.00016132122686760117, + "loss": 0.6589, + "step": 6527 + }, + { + "epoch": 1.1622150997150997, + "grad_norm": 0.6802098155021667, + "learning_rate": 0.00016131016940932146, + "loss": 0.9532, + "step": 6528 + }, + { + "epoch": 1.1623931623931625, + "grad_norm": 0.6523237228393555, + "learning_rate": 0.00016129911074979635, + "loss": 0.9409, + "step": 6529 + }, + { + "epoch": 1.162571225071225, + "grad_norm": 0.710307776927948, + "learning_rate": 0.00016128805088924252, + "loss": 1.2536, + "step": 6530 + }, + { + "epoch": 1.1627492877492878, + "grad_norm": 0.6349819898605347, + "learning_rate": 0.0001612769898278766, + "loss": 1.0857, + "step": 6531 + }, + { + "epoch": 1.1629273504273505, + "grad_norm": 0.5348139405250549, + "learning_rate": 0.00016126592756591542, + "loss": 0.5969, + "step": 6532 + }, + { + "epoch": 1.163105413105413, + "grad_norm": 0.635619580745697, + "learning_rate": 0.00016125486410357564, + "loss": 0.9885, + "step": 6533 + }, + { + "epoch": 1.1632834757834758, + "grad_norm": 0.6434559226036072, + "learning_rate": 0.000161243799441074, + "loss": 0.8377, + "step": 6534 + }, + { + "epoch": 1.1634615384615385, + "grad_norm": 0.6509647369384766, + "learning_rate": 0.00016123273357862737, + "loss": 0.8393, + "step": 6535 + }, + { + "epoch": 1.163639601139601, + "grad_norm": 0.6179081797599792, + "learning_rate": 0.0001612216665164525, + "loss": 0.9143, + "step": 6536 + }, + { + "epoch": 1.1638176638176638, + "grad_norm": 0.5923223495483398, + "learning_rate": 
0.0001612105982547663, + "loss": 1.0185, + "step": 6537 + }, + { + "epoch": 1.1639957264957266, + "grad_norm": 0.702150285243988, + "learning_rate": 0.00016119952879378556, + "loss": 0.863, + "step": 6538 + }, + { + "epoch": 1.164173789173789, + "grad_norm": 0.6596643328666687, + "learning_rate": 0.00016118845813372715, + "loss": 1.0089, + "step": 6539 + }, + { + "epoch": 1.1643518518518519, + "grad_norm": 0.7675769329071045, + "learning_rate": 0.00016117738627480804, + "loss": 1.0179, + "step": 6540 + }, + { + "epoch": 1.1645299145299146, + "grad_norm": 0.6742541193962097, + "learning_rate": 0.00016116631321724513, + "loss": 1.0663, + "step": 6541 + }, + { + "epoch": 1.1647079772079771, + "grad_norm": 0.7379785776138306, + "learning_rate": 0.0001611552389612554, + "loss": 1.0162, + "step": 6542 + }, + { + "epoch": 1.16488603988604, + "grad_norm": 0.5729365944862366, + "learning_rate": 0.00016114416350705577, + "loss": 0.8146, + "step": 6543 + }, + { + "epoch": 1.1650641025641026, + "grad_norm": 0.6481349468231201, + "learning_rate": 0.00016113308685486327, + "loss": 1.0748, + "step": 6544 + }, + { + "epoch": 1.1652421652421652, + "grad_norm": 0.5588181018829346, + "learning_rate": 0.00016112200900489493, + "loss": 0.7511, + "step": 6545 + }, + { + "epoch": 1.165420227920228, + "grad_norm": 0.674363911151886, + "learning_rate": 0.0001611109299573678, + "loss": 0.9852, + "step": 6546 + }, + { + "epoch": 1.1655982905982907, + "grad_norm": 0.6712620854377747, + "learning_rate": 0.00016109984971249893, + "loss": 0.9558, + "step": 6547 + }, + { + "epoch": 1.1657763532763532, + "grad_norm": 0.5260626077651978, + "learning_rate": 0.00016108876827050544, + "loss": 0.7008, + "step": 6548 + }, + { + "epoch": 1.165954415954416, + "grad_norm": 0.6056292057037354, + "learning_rate": 0.00016107768563160445, + "loss": 0.7756, + "step": 6549 + }, + { + "epoch": 1.1661324786324787, + "grad_norm": 0.5725821256637573, + "learning_rate": 0.00016106660179601308, + "loss": 0.8228, + 
"step": 6550 + }, + { + "epoch": 1.1663105413105412, + "grad_norm": 0.6708397269248962, + "learning_rate": 0.00016105551676394848, + "loss": 1.0711, + "step": 6551 + }, + { + "epoch": 1.166488603988604, + "grad_norm": 0.645453155040741, + "learning_rate": 0.00016104443053562787, + "loss": 0.9299, + "step": 6552 + }, + { + "epoch": 1.1666666666666667, + "grad_norm": 0.6743524074554443, + "learning_rate": 0.00016103334311126847, + "loss": 0.8977, + "step": 6553 + }, + { + "epoch": 1.1668447293447293, + "grad_norm": 0.7248545289039612, + "learning_rate": 0.0001610222544910875, + "loss": 1.2135, + "step": 6554 + }, + { + "epoch": 1.167022792022792, + "grad_norm": 0.5798853635787964, + "learning_rate": 0.00016101116467530217, + "loss": 0.857, + "step": 6555 + }, + { + "epoch": 1.1672008547008548, + "grad_norm": 0.6828082799911499, + "learning_rate": 0.00016100007366412985, + "loss": 0.9405, + "step": 6556 + }, + { + "epoch": 1.1673789173789173, + "grad_norm": 0.6820163130760193, + "learning_rate": 0.0001609889814577878, + "loss": 0.9144, + "step": 6557 + }, + { + "epoch": 1.16755698005698, + "grad_norm": 0.6482275128364563, + "learning_rate": 0.00016097788805649333, + "loss": 0.8586, + "step": 6558 + }, + { + "epoch": 1.1677350427350428, + "grad_norm": 0.6404715180397034, + "learning_rate": 0.00016096679346046385, + "loss": 0.7018, + "step": 6559 + }, + { + "epoch": 1.1679131054131053, + "grad_norm": 0.6315203309059143, + "learning_rate": 0.0001609556976699167, + "loss": 0.9602, + "step": 6560 + }, + { + "epoch": 1.168091168091168, + "grad_norm": 0.5521387457847595, + "learning_rate": 0.00016094460068506925, + "loss": 0.9294, + "step": 6561 + }, + { + "epoch": 1.1682692307692308, + "grad_norm": 0.583372175693512, + "learning_rate": 0.00016093350250613895, + "loss": 1.077, + "step": 6562 + }, + { + "epoch": 1.1684472934472934, + "grad_norm": 0.5990512371063232, + "learning_rate": 0.00016092240313334325, + "loss": 1.0102, + "step": 6563 + }, + { + "epoch": 
1.1686253561253561, + "grad_norm": 0.675128161907196, + "learning_rate": 0.00016091130256689964, + "loss": 1.0407, + "step": 6564 + }, + { + "epoch": 1.1688034188034189, + "grad_norm": 0.48797324299812317, + "learning_rate": 0.00016090020080702556, + "loss": 0.7821, + "step": 6565 + }, + { + "epoch": 1.1689814814814814, + "grad_norm": 0.7487484216690063, + "learning_rate": 0.00016088909785393857, + "loss": 1.0444, + "step": 6566 + }, + { + "epoch": 1.1691595441595442, + "grad_norm": 0.6288858652114868, + "learning_rate": 0.00016087799370785618, + "loss": 1.1854, + "step": 6567 + }, + { + "epoch": 1.169337606837607, + "grad_norm": 0.6639021635055542, + "learning_rate": 0.000160866888368996, + "loss": 0.9632, + "step": 6568 + }, + { + "epoch": 1.1695156695156694, + "grad_norm": 0.6553738713264465, + "learning_rate": 0.00016085578183757556, + "loss": 1.2765, + "step": 6569 + }, + { + "epoch": 1.1696937321937322, + "grad_norm": 0.7489066123962402, + "learning_rate": 0.00016084467411381248, + "loss": 1.0705, + "step": 6570 + }, + { + "epoch": 1.169871794871795, + "grad_norm": 0.7079828381538391, + "learning_rate": 0.00016083356519792444, + "loss": 0.8256, + "step": 6571 + }, + { + "epoch": 1.1700498575498575, + "grad_norm": 0.7065926790237427, + "learning_rate": 0.00016082245509012902, + "loss": 1.0439, + "step": 6572 + }, + { + "epoch": 1.1702279202279202, + "grad_norm": 0.6113346815109253, + "learning_rate": 0.00016081134379064395, + "loss": 0.9153, + "step": 6573 + }, + { + "epoch": 1.170405982905983, + "grad_norm": 0.6094171404838562, + "learning_rate": 0.0001608002312996869, + "loss": 0.9723, + "step": 6574 + }, + { + "epoch": 1.1705840455840455, + "grad_norm": 0.6208072900772095, + "learning_rate": 0.00016078911761747565, + "loss": 0.948, + "step": 6575 + }, + { + "epoch": 1.1707621082621082, + "grad_norm": 0.5736680626869202, + "learning_rate": 0.00016077800274422792, + "loss": 0.9155, + "step": 6576 + }, + { + "epoch": 1.170940170940171, + "grad_norm": 
0.6793957948684692, + "learning_rate": 0.0001607668866801615, + "loss": 0.9574, + "step": 6577 + }, + { + "epoch": 1.1711182336182335, + "grad_norm": 0.6251805424690247, + "learning_rate": 0.00016075576942549413, + "loss": 1.0319, + "step": 6578 + }, + { + "epoch": 1.1712962962962963, + "grad_norm": 0.628882110118866, + "learning_rate": 0.0001607446509804437, + "loss": 0.9336, + "step": 6579 + }, + { + "epoch": 1.171474358974359, + "grad_norm": 0.6712356805801392, + "learning_rate": 0.000160733531345228, + "loss": 1.0958, + "step": 6580 + }, + { + "epoch": 1.1716524216524216, + "grad_norm": 0.599365770816803, + "learning_rate": 0.0001607224105200649, + "loss": 0.9814, + "step": 6581 + }, + { + "epoch": 1.1718304843304843, + "grad_norm": 0.5798245668411255, + "learning_rate": 0.00016071128850517235, + "loss": 1.0355, + "step": 6582 + }, + { + "epoch": 1.172008547008547, + "grad_norm": 0.7646229863166809, + "learning_rate": 0.00016070016530076817, + "loss": 0.9976, + "step": 6583 + }, + { + "epoch": 1.1721866096866096, + "grad_norm": 0.6371127367019653, + "learning_rate": 0.0001606890409070704, + "loss": 0.9588, + "step": 6584 + }, + { + "epoch": 1.1723646723646723, + "grad_norm": 0.6497066617012024, + "learning_rate": 0.0001606779153242969, + "loss": 0.8817, + "step": 6585 + }, + { + "epoch": 1.172542735042735, + "grad_norm": 0.7255781888961792, + "learning_rate": 0.0001606667885526657, + "loss": 1.1319, + "step": 6586 + }, + { + "epoch": 1.1727207977207976, + "grad_norm": 0.67711341381073, + "learning_rate": 0.00016065566059239483, + "loss": 1.0755, + "step": 6587 + }, + { + "epoch": 1.1728988603988604, + "grad_norm": 0.6159650087356567, + "learning_rate": 0.00016064453144370227, + "loss": 0.9892, + "step": 6588 + }, + { + "epoch": 1.1730769230769231, + "grad_norm": 0.658938467502594, + "learning_rate": 0.00016063340110680609, + "loss": 0.9131, + "step": 6589 + }, + { + "epoch": 1.1732549857549857, + "grad_norm": 0.6754795908927917, + "learning_rate": 
0.00016062226958192438, + "loss": 1.0119, + "step": 6590 + }, + { + "epoch": 1.1734330484330484, + "grad_norm": 0.6453405022621155, + "learning_rate": 0.00016061113686927523, + "loss": 0.997, + "step": 6591 + }, + { + "epoch": 1.1736111111111112, + "grad_norm": 0.6580284237861633, + "learning_rate": 0.00016060000296907675, + "loss": 0.8432, + "step": 6592 + }, + { + "epoch": 1.173789173789174, + "grad_norm": 0.6588153839111328, + "learning_rate": 0.00016058886788154712, + "loss": 1.0725, + "step": 6593 + }, + { + "epoch": 1.1739672364672364, + "grad_norm": 0.6247910857200623, + "learning_rate": 0.00016057773160690447, + "loss": 0.8736, + "step": 6594 + }, + { + "epoch": 1.1741452991452992, + "grad_norm": 0.579594075679779, + "learning_rate": 0.000160566594145367, + "loss": 0.8809, + "step": 6595 + }, + { + "epoch": 1.1743233618233617, + "grad_norm": 0.6738116145133972, + "learning_rate": 0.00016055545549715293, + "loss": 0.825, + "step": 6596 + }, + { + "epoch": 1.1745014245014245, + "grad_norm": 0.6658982634544373, + "learning_rate": 0.00016054431566248054, + "loss": 1.0809, + "step": 6597 + }, + { + "epoch": 1.1746794871794872, + "grad_norm": 0.5367915630340576, + "learning_rate": 0.00016053317464156803, + "loss": 0.9005, + "step": 6598 + }, + { + "epoch": 1.17485754985755, + "grad_norm": 0.7243228554725647, + "learning_rate": 0.00016052203243463372, + "loss": 1.0573, + "step": 6599 + }, + { + "epoch": 1.1750356125356125, + "grad_norm": 0.6359432935714722, + "learning_rate": 0.0001605108890418959, + "loss": 0.8569, + "step": 6600 + }, + { + "epoch": 1.1752136752136753, + "grad_norm": 0.6565225720405579, + "learning_rate": 0.0001604997444635729, + "loss": 0.9748, + "step": 6601 + }, + { + "epoch": 1.1753917378917378, + "grad_norm": 0.7124663591384888, + "learning_rate": 0.0001604885986998831, + "loss": 1.0271, + "step": 6602 + }, + { + "epoch": 1.1755698005698005, + "grad_norm": 0.659766435623169, + "learning_rate": 0.00016047745175104487, + "loss": 1.0635, + 
"step": 6603 + }, + { + "epoch": 1.1757478632478633, + "grad_norm": 0.5874318480491638, + "learning_rate": 0.00016046630361727656, + "loss": 0.9257, + "step": 6604 + }, + { + "epoch": 1.175925925925926, + "grad_norm": 0.587345540523529, + "learning_rate": 0.0001604551542987967, + "loss": 1.0759, + "step": 6605 + }, + { + "epoch": 1.1761039886039886, + "grad_norm": 0.733567476272583, + "learning_rate": 0.00016044400379582364, + "loss": 0.9877, + "step": 6606 + }, + { + "epoch": 1.1762820512820513, + "grad_norm": 0.6538317203521729, + "learning_rate": 0.0001604328521085759, + "loss": 1.0094, + "step": 6607 + }, + { + "epoch": 1.1764601139601139, + "grad_norm": 0.6279696822166443, + "learning_rate": 0.00016042169923727195, + "loss": 1.1049, + "step": 6608 + }, + { + "epoch": 1.1766381766381766, + "grad_norm": 0.6949752569198608, + "learning_rate": 0.00016041054518213033, + "loss": 1.1418, + "step": 6609 + }, + { + "epoch": 1.1768162393162394, + "grad_norm": 0.6144010424613953, + "learning_rate": 0.00016039938994336957, + "loss": 1.0306, + "step": 6610 + }, + { + "epoch": 1.176994301994302, + "grad_norm": 0.5868683457374573, + "learning_rate": 0.00016038823352120823, + "loss": 0.9894, + "step": 6611 + }, + { + "epoch": 1.1771723646723646, + "grad_norm": 0.7181115746498108, + "learning_rate": 0.0001603770759158649, + "loss": 1.1674, + "step": 6612 + }, + { + "epoch": 1.1773504273504274, + "grad_norm": 0.6271308064460754, + "learning_rate": 0.00016036591712755818, + "loss": 0.9726, + "step": 6613 + }, + { + "epoch": 1.17752849002849, + "grad_norm": 0.6922675371170044, + "learning_rate": 0.00016035475715650668, + "loss": 0.9142, + "step": 6614 + }, + { + "epoch": 1.1777065527065527, + "grad_norm": 0.6838833689689636, + "learning_rate": 0.00016034359600292913, + "loss": 1.1627, + "step": 6615 + }, + { + "epoch": 1.1778846153846154, + "grad_norm": 0.6628252267837524, + "learning_rate": 0.00016033243366704418, + "loss": 0.739, + "step": 6616 + }, + { + "epoch": 
1.1780626780626782, + "grad_norm": 0.6367576122283936, + "learning_rate": 0.0001603212701490705, + "loss": 0.9015, + "step": 6617 + }, + { + "epoch": 1.1782407407407407, + "grad_norm": 0.6498967409133911, + "learning_rate": 0.00016031010544922687, + "loss": 0.9645, + "step": 6618 + }, + { + "epoch": 1.1784188034188035, + "grad_norm": 0.468795508146286, + "learning_rate": 0.00016029893956773198, + "loss": 0.7305, + "step": 6619 + }, + { + "epoch": 1.178596866096866, + "grad_norm": 0.6355500817298889, + "learning_rate": 0.00016028777250480465, + "loss": 0.9183, + "step": 6620 + }, + { + "epoch": 1.1787749287749287, + "grad_norm": 0.7582615613937378, + "learning_rate": 0.0001602766042606636, + "loss": 1.1641, + "step": 6621 + }, + { + "epoch": 1.1789529914529915, + "grad_norm": 0.580035924911499, + "learning_rate": 0.00016026543483552776, + "loss": 0.9164, + "step": 6622 + }, + { + "epoch": 1.1791310541310542, + "grad_norm": 0.6198559999465942, + "learning_rate": 0.00016025426422961592, + "loss": 0.9803, + "step": 6623 + }, + { + "epoch": 1.1793091168091168, + "grad_norm": 0.59112149477005, + "learning_rate": 0.0001602430924431469, + "loss": 0.8645, + "step": 6624 + }, + { + "epoch": 1.1794871794871795, + "grad_norm": 0.6200533509254456, + "learning_rate": 0.00016023191947633965, + "loss": 1.068, + "step": 6625 + }, + { + "epoch": 1.179665242165242, + "grad_norm": 0.6077516078948975, + "learning_rate": 0.00016022074532941305, + "loss": 1.0017, + "step": 6626 + }, + { + "epoch": 1.1798433048433048, + "grad_norm": 0.6770145893096924, + "learning_rate": 0.00016020957000258606, + "loss": 0.9022, + "step": 6627 + }, + { + "epoch": 1.1800213675213675, + "grad_norm": 0.6478054523468018, + "learning_rate": 0.0001601983934960776, + "loss": 0.8615, + "step": 6628 + }, + { + "epoch": 1.1801994301994303, + "grad_norm": 0.6528988480567932, + "learning_rate": 0.00016018721581010666, + "loss": 1.0015, + "step": 6629 + }, + { + "epoch": 1.1803774928774928, + "grad_norm": 
0.6160712242126465, + "learning_rate": 0.0001601760369448923, + "loss": 0.9382, + "step": 6630 + }, + { + "epoch": 1.1805555555555556, + "grad_norm": 0.5755789875984192, + "learning_rate": 0.00016016485690065345, + "loss": 1.0551, + "step": 6631 + }, + { + "epoch": 1.180733618233618, + "grad_norm": 0.8495022654533386, + "learning_rate": 0.00016015367567760925, + "loss": 0.9295, + "step": 6632 + }, + { + "epoch": 1.1809116809116809, + "grad_norm": 0.6010929346084595, + "learning_rate": 0.0001601424932759787, + "loss": 1.0413, + "step": 6633 + }, + { + "epoch": 1.1810897435897436, + "grad_norm": 0.6953579187393188, + "learning_rate": 0.00016013130969598093, + "loss": 1.0149, + "step": 6634 + }, + { + "epoch": 1.1812678062678064, + "grad_norm": 0.6949529647827148, + "learning_rate": 0.0001601201249378351, + "loss": 0.9992, + "step": 6635 + }, + { + "epoch": 1.181445868945869, + "grad_norm": 0.6471893787384033, + "learning_rate": 0.00016010893900176028, + "loss": 0.7985, + "step": 6636 + }, + { + "epoch": 1.1816239316239316, + "grad_norm": 0.6524858474731445, + "learning_rate": 0.00016009775188797568, + "loss": 0.9517, + "step": 6637 + }, + { + "epoch": 1.1818019943019944, + "grad_norm": 0.639214038848877, + "learning_rate": 0.00016008656359670046, + "loss": 1.0357, + "step": 6638 + }, + { + "epoch": 1.181980056980057, + "grad_norm": 0.6039628386497498, + "learning_rate": 0.00016007537412815386, + "loss": 1.0536, + "step": 6639 + }, + { + "epoch": 1.1821581196581197, + "grad_norm": 0.653540313243866, + "learning_rate": 0.00016006418348255507, + "loss": 0.9414, + "step": 6640 + }, + { + "epoch": 1.1823361823361824, + "grad_norm": 0.6331741809844971, + "learning_rate": 0.0001600529916601234, + "loss": 1.0352, + "step": 6641 + }, + { + "epoch": 1.182514245014245, + "grad_norm": 0.7552719712257385, + "learning_rate": 0.00016004179866107812, + "loss": 1.1103, + "step": 6642 + }, + { + "epoch": 1.1826923076923077, + "grad_norm": 0.6795875430107117, + "learning_rate": 
0.00016003060448563852, + "loss": 1.1246, + "step": 6643 + }, + { + "epoch": 1.1828703703703705, + "grad_norm": 0.6308842301368713, + "learning_rate": 0.0001600194091340239, + "loss": 0.9532, + "step": 6644 + }, + { + "epoch": 1.183048433048433, + "grad_norm": 0.5640553832054138, + "learning_rate": 0.00016000821260645366, + "loss": 0.7491, + "step": 6645 + }, + { + "epoch": 1.1832264957264957, + "grad_norm": 0.5611832141876221, + "learning_rate": 0.00015999701490314712, + "loss": 0.9239, + "step": 6646 + }, + { + "epoch": 1.1834045584045585, + "grad_norm": 0.5881187915802002, + "learning_rate": 0.00015998581602432374, + "loss": 0.9246, + "step": 6647 + }, + { + "epoch": 1.183582621082621, + "grad_norm": 0.7291010022163391, + "learning_rate": 0.00015997461597020291, + "loss": 1.0314, + "step": 6648 + }, + { + "epoch": 1.1837606837606838, + "grad_norm": 0.6784794926643372, + "learning_rate": 0.00015996341474100402, + "loss": 1.0011, + "step": 6649 + }, + { + "epoch": 1.1839387464387465, + "grad_norm": 0.7083746194839478, + "learning_rate": 0.00015995221233694663, + "loss": 1.0336, + "step": 6650 + }, + { + "epoch": 1.184116809116809, + "grad_norm": 0.7081790566444397, + "learning_rate": 0.00015994100875825015, + "loss": 1.2386, + "step": 6651 + }, + { + "epoch": 1.1842948717948718, + "grad_norm": 0.5938812494277954, + "learning_rate": 0.00015992980400513415, + "loss": 0.7549, + "step": 6652 + }, + { + "epoch": 1.1844729344729346, + "grad_norm": 0.7084267139434814, + "learning_rate": 0.00015991859807781811, + "loss": 1.1194, + "step": 6653 + }, + { + "epoch": 1.184650997150997, + "grad_norm": 0.6391362547874451, + "learning_rate": 0.0001599073909765216, + "loss": 1.0857, + "step": 6654 + }, + { + "epoch": 1.1848290598290598, + "grad_norm": 0.8074106574058533, + "learning_rate": 0.00015989618270146423, + "loss": 1.1715, + "step": 6655 + }, + { + "epoch": 1.1850071225071226, + "grad_norm": 0.5778565406799316, + "learning_rate": 0.0001598849732528656, + "loss": 0.8843, + 
"step": 6656 + }, + { + "epoch": 1.1851851851851851, + "grad_norm": 0.6955079436302185, + "learning_rate": 0.00015987376263094526, + "loss": 1.0281, + "step": 6657 + }, + { + "epoch": 1.1853632478632479, + "grad_norm": 0.6789296269416809, + "learning_rate": 0.00015986255083592297, + "loss": 0.9739, + "step": 6658 + }, + { + "epoch": 1.1855413105413106, + "grad_norm": 0.6294292211532593, + "learning_rate": 0.00015985133786801834, + "loss": 1.0692, + "step": 6659 + }, + { + "epoch": 1.1857193732193732, + "grad_norm": 0.5604581832885742, + "learning_rate": 0.00015984012372745107, + "loss": 0.9059, + "step": 6660 + }, + { + "epoch": 1.185897435897436, + "grad_norm": 0.6727550625801086, + "learning_rate": 0.00015982890841444088, + "loss": 1.049, + "step": 6661 + }, + { + "epoch": 1.1860754985754987, + "grad_norm": 0.620914101600647, + "learning_rate": 0.0001598176919292075, + "loss": 1.1021, + "step": 6662 + }, + { + "epoch": 1.1862535612535612, + "grad_norm": 0.6696683168411255, + "learning_rate": 0.00015980647427197076, + "loss": 0.9053, + "step": 6663 + }, + { + "epoch": 1.186431623931624, + "grad_norm": 0.6713385581970215, + "learning_rate": 0.00015979525544295036, + "loss": 0.9596, + "step": 6664 + }, + { + "epoch": 1.1866096866096867, + "grad_norm": 0.7643477320671082, + "learning_rate": 0.00015978403544236614, + "loss": 0.882, + "step": 6665 + }, + { + "epoch": 1.1867877492877492, + "grad_norm": 0.5890966057777405, + "learning_rate": 0.00015977281427043794, + "loss": 1.0215, + "step": 6666 + }, + { + "epoch": 1.186965811965812, + "grad_norm": 0.7287502288818359, + "learning_rate": 0.0001597615919273856, + "loss": 1.0111, + "step": 6667 + }, + { + "epoch": 1.1871438746438747, + "grad_norm": 0.5713803172111511, + "learning_rate": 0.00015975036841342903, + "loss": 1.0068, + "step": 6668 + }, + { + "epoch": 1.1873219373219372, + "grad_norm": 0.5113094449043274, + "learning_rate": 0.0001597391437287881, + "loss": 0.9018, + "step": 6669 + }, + { + "epoch": 1.1875, + 
"grad_norm": 0.585640013217926, + "learning_rate": 0.00015972791787368276, + "loss": 1.0375, + "step": 6670 + }, + { + "epoch": 1.1876780626780628, + "grad_norm": 0.5778326392173767, + "learning_rate": 0.00015971669084833293, + "loss": 0.9975, + "step": 6671 + }, + { + "epoch": 1.1878561253561253, + "grad_norm": 0.6707763075828552, + "learning_rate": 0.0001597054626529586, + "loss": 1.0048, + "step": 6672 + }, + { + "epoch": 1.188034188034188, + "grad_norm": 0.6113292574882507, + "learning_rate": 0.00015969423328777974, + "loss": 1.1447, + "step": 6673 + }, + { + "epoch": 1.1882122507122508, + "grad_norm": 0.6075651049613953, + "learning_rate": 0.00015968300275301638, + "loss": 0.9212, + "step": 6674 + }, + { + "epoch": 1.1883903133903133, + "grad_norm": 0.6990494132041931, + "learning_rate": 0.00015967177104888857, + "loss": 0.9952, + "step": 6675 + }, + { + "epoch": 1.188568376068376, + "grad_norm": 0.6228706240653992, + "learning_rate": 0.00015966053817561638, + "loss": 1.0187, + "step": 6676 + }, + { + "epoch": 1.1887464387464388, + "grad_norm": 0.6387844085693359, + "learning_rate": 0.00015964930413341985, + "loss": 1.1614, + "step": 6677 + }, + { + "epoch": 1.1889245014245013, + "grad_norm": 0.6501925587654114, + "learning_rate": 0.00015963806892251915, + "loss": 1.0366, + "step": 6678 + }, + { + "epoch": 1.189102564102564, + "grad_norm": 0.6923739910125732, + "learning_rate": 0.00015962683254313435, + "loss": 1.1992, + "step": 6679 + }, + { + "epoch": 1.1892806267806268, + "grad_norm": 0.6640275120735168, + "learning_rate": 0.00015961559499548563, + "loss": 0.8883, + "step": 6680 + }, + { + "epoch": 1.1894586894586894, + "grad_norm": 0.6493857502937317, + "learning_rate": 0.00015960435627979317, + "loss": 1.1368, + "step": 6681 + }, + { + "epoch": 1.1896367521367521, + "grad_norm": 0.6357189416885376, + "learning_rate": 0.0001595931163962772, + "loss": 1.0502, + "step": 6682 + }, + { + "epoch": 1.1898148148148149, + "grad_norm": 0.5756343007087708, + 
"learning_rate": 0.0001595818753451579, + "loss": 0.9871, + "step": 6683 + }, + { + "epoch": 1.1899928774928774, + "grad_norm": 0.7369210124015808, + "learning_rate": 0.0001595706331266555, + "loss": 1.3229, + "step": 6684 + }, + { + "epoch": 1.1901709401709402, + "grad_norm": 0.7140820622444153, + "learning_rate": 0.0001595593897409903, + "loss": 1.1154, + "step": 6685 + }, + { + "epoch": 1.190349002849003, + "grad_norm": 0.696973443031311, + "learning_rate": 0.00015954814518838255, + "loss": 0.9806, + "step": 6686 + }, + { + "epoch": 1.1905270655270654, + "grad_norm": 0.5299260020256042, + "learning_rate": 0.00015953689946905262, + "loss": 0.771, + "step": 6687 + }, + { + "epoch": 1.1907051282051282, + "grad_norm": 0.6814879775047302, + "learning_rate": 0.00015952565258322085, + "loss": 0.8444, + "step": 6688 + }, + { + "epoch": 1.190883190883191, + "grad_norm": 0.6215870976448059, + "learning_rate": 0.00015951440453110754, + "loss": 1.0743, + "step": 6689 + }, + { + "epoch": 1.1910612535612535, + "grad_norm": 0.7017203569412231, + "learning_rate": 0.00015950315531293308, + "loss": 1.185, + "step": 6690 + }, + { + "epoch": 1.1912393162393162, + "grad_norm": 0.7147250175476074, + "learning_rate": 0.00015949190492891795, + "loss": 1.0646, + "step": 6691 + }, + { + "epoch": 1.191417378917379, + "grad_norm": 0.5867117047309875, + "learning_rate": 0.00015948065337928252, + "loss": 1.0554, + "step": 6692 + }, + { + "epoch": 1.1915954415954415, + "grad_norm": 0.6813527345657349, + "learning_rate": 0.0001594694006642472, + "loss": 1.1451, + "step": 6693 + }, + { + "epoch": 1.1917735042735043, + "grad_norm": 0.5192593932151794, + "learning_rate": 0.00015945814678403256, + "loss": 0.7886, + "step": 6694 + }, + { + "epoch": 1.191951566951567, + "grad_norm": 0.6537744402885437, + "learning_rate": 0.00015944689173885904, + "loss": 0.9905, + "step": 6695 + }, + { + "epoch": 1.1921296296296295, + "grad_norm": 0.7350276112556458, + "learning_rate": 0.00015943563552894716, + 
"loss": 0.9009, + "step": 6696 + }, + { + "epoch": 1.1923076923076923, + "grad_norm": 0.7086381316184998, + "learning_rate": 0.00015942437815451746, + "loss": 0.9117, + "step": 6697 + }, + { + "epoch": 1.192485754985755, + "grad_norm": 0.6774969696998596, + "learning_rate": 0.00015941311961579054, + "loss": 1.1172, + "step": 6698 + }, + { + "epoch": 1.1926638176638176, + "grad_norm": 0.7034362554550171, + "learning_rate": 0.00015940185991298694, + "loss": 0.8054, + "step": 6699 + }, + { + "epoch": 1.1928418803418803, + "grad_norm": 0.66145920753479, + "learning_rate": 0.00015939059904632728, + "loss": 0.7417, + "step": 6700 + }, + { + "epoch": 1.193019943019943, + "grad_norm": 0.6590890884399414, + "learning_rate": 0.00015937933701603223, + "loss": 0.9169, + "step": 6701 + }, + { + "epoch": 1.1931980056980056, + "grad_norm": 0.7492850422859192, + "learning_rate": 0.0001593680738223224, + "loss": 1.0529, + "step": 6702 + }, + { + "epoch": 1.1933760683760684, + "grad_norm": 0.7103236317634583, + "learning_rate": 0.00015935680946541848, + "loss": 1.1377, + "step": 6703 + }, + { + "epoch": 1.193554131054131, + "grad_norm": 0.6164175868034363, + "learning_rate": 0.00015934554394554122, + "loss": 0.8636, + "step": 6704 + }, + { + "epoch": 1.1937321937321936, + "grad_norm": 0.6667410135269165, + "learning_rate": 0.0001593342772629113, + "loss": 1.0073, + "step": 6705 + }, + { + "epoch": 1.1939102564102564, + "grad_norm": 0.6785695552825928, + "learning_rate": 0.00015932300941774944, + "loss": 1.0752, + "step": 6706 + }, + { + "epoch": 1.1940883190883191, + "grad_norm": 0.6446872353553772, + "learning_rate": 0.0001593117404102765, + "loss": 0.9509, + "step": 6707 + }, + { + "epoch": 1.194266381766382, + "grad_norm": 0.6607686877250671, + "learning_rate": 0.00015930047024071317, + "loss": 1.0902, + "step": 6708 + }, + { + "epoch": 1.1944444444444444, + "grad_norm": 0.664804995059967, + "learning_rate": 0.0001592891989092803, + "loss": 0.9783, + "step": 6709 + }, + { + 
"epoch": 1.1946225071225072, + "grad_norm": 0.7147907018661499, + "learning_rate": 0.00015927792641619876, + "loss": 1.0558, + "step": 6710 + }, + { + "epoch": 1.1948005698005697, + "grad_norm": 0.6858944296836853, + "learning_rate": 0.0001592666527616894, + "loss": 1.0514, + "step": 6711 + }, + { + "epoch": 1.1949786324786325, + "grad_norm": 0.598463773727417, + "learning_rate": 0.0001592553779459731, + "loss": 0.8927, + "step": 6712 + }, + { + "epoch": 1.1951566951566952, + "grad_norm": 0.6872668862342834, + "learning_rate": 0.00015924410196927076, + "loss": 1.016, + "step": 6713 + }, + { + "epoch": 1.195334757834758, + "grad_norm": 0.6547996401786804, + "learning_rate": 0.00015923282483180326, + "loss": 1.1573, + "step": 6714 + }, + { + "epoch": 1.1955128205128205, + "grad_norm": 0.6254705786705017, + "learning_rate": 0.00015922154653379167, + "loss": 1.0179, + "step": 6715 + }, + { + "epoch": 1.1956908831908832, + "grad_norm": 0.6049207448959351, + "learning_rate": 0.00015921026707545684, + "loss": 1.0713, + "step": 6716 + }, + { + "epoch": 1.1958689458689458, + "grad_norm": 0.6042858958244324, + "learning_rate": 0.0001591989864570199, + "loss": 0.919, + "step": 6717 + }, + { + "epoch": 1.1960470085470085, + "grad_norm": 0.6521187424659729, + "learning_rate": 0.0001591877046787017, + "loss": 1.0112, + "step": 6718 + }, + { + "epoch": 1.1962250712250713, + "grad_norm": 0.766260027885437, + "learning_rate": 0.00015917642174072348, + "loss": 0.9774, + "step": 6719 + }, + { + "epoch": 1.196403133903134, + "grad_norm": 0.7066532373428345, + "learning_rate": 0.00015916513764330613, + "loss": 1.1112, + "step": 6720 + }, + { + "epoch": 1.1965811965811965, + "grad_norm": 0.7351508140563965, + "learning_rate": 0.00015915385238667083, + "loss": 0.9841, + "step": 6721 + }, + { + "epoch": 1.1967592592592593, + "grad_norm": 0.6133812069892883, + "learning_rate": 0.0001591425659710387, + "loss": 0.8629, + "step": 6722 + }, + { + "epoch": 1.1969373219373218, + "grad_norm": 
0.7244157791137695, + "learning_rate": 0.00015913127839663083, + "loss": 1.1584, + "step": 6723 + }, + { + "epoch": 1.1971153846153846, + "grad_norm": 0.5986210107803345, + "learning_rate": 0.00015911998966366842, + "loss": 0.8507, + "step": 6724 + }, + { + "epoch": 1.1972934472934473, + "grad_norm": 0.6087439060211182, + "learning_rate": 0.00015910869977237257, + "loss": 0.884, + "step": 6725 + }, + { + "epoch": 1.19747150997151, + "grad_norm": 0.7546007633209229, + "learning_rate": 0.00015909740872296457, + "loss": 1.1449, + "step": 6726 + }, + { + "epoch": 1.1976495726495726, + "grad_norm": 0.6437731385231018, + "learning_rate": 0.0001590861165156656, + "loss": 0.7845, + "step": 6727 + }, + { + "epoch": 1.1978276353276354, + "grad_norm": 0.6281737089157104, + "learning_rate": 0.00015907482315069693, + "loss": 0.8969, + "step": 6728 + }, + { + "epoch": 1.198005698005698, + "grad_norm": 0.6196113228797913, + "learning_rate": 0.00015906352862827983, + "loss": 1.0264, + "step": 6729 + }, + { + "epoch": 1.1981837606837606, + "grad_norm": 0.5990965962409973, + "learning_rate": 0.00015905223294863553, + "loss": 1.0017, + "step": 6730 + }, + { + "epoch": 1.1983618233618234, + "grad_norm": 0.6509191393852234, + "learning_rate": 0.00015904093611198542, + "loss": 1.1066, + "step": 6731 + }, + { + "epoch": 1.1985398860398861, + "grad_norm": 0.6648043990135193, + "learning_rate": 0.00015902963811855085, + "loss": 1.077, + "step": 6732 + }, + { + "epoch": 1.1987179487179487, + "grad_norm": 0.7071963548660278, + "learning_rate": 0.00015901833896855307, + "loss": 1.1346, + "step": 6733 + }, + { + "epoch": 1.1988960113960114, + "grad_norm": 0.5889959335327148, + "learning_rate": 0.0001590070386622136, + "loss": 0.9525, + "step": 6734 + }, + { + "epoch": 1.199074074074074, + "grad_norm": 0.6233037710189819, + "learning_rate": 0.00015899573719975376, + "loss": 1.0513, + "step": 6735 + }, + { + "epoch": 1.1992521367521367, + "grad_norm": 0.7912302613258362, + "learning_rate": 
0.000158984434581395, + "loss": 0.8749, + "step": 6736 + }, + { + "epoch": 1.1994301994301995, + "grad_norm": 0.5783160924911499, + "learning_rate": 0.0001589731308073588, + "loss": 0.7173, + "step": 6737 + }, + { + "epoch": 1.1996082621082622, + "grad_norm": 0.718950092792511, + "learning_rate": 0.00015896182587786658, + "loss": 1.0815, + "step": 6738 + }, + { + "epoch": 1.1997863247863247, + "grad_norm": 0.6700926423072815, + "learning_rate": 0.0001589505197931399, + "loss": 1.0817, + "step": 6739 + }, + { + "epoch": 1.1999643874643875, + "grad_norm": 0.7614455223083496, + "learning_rate": 0.0001589392125534002, + "loss": 0.9707, + "step": 6740 + }, + { + "epoch": 1.20014245014245, + "grad_norm": 0.6998619437217712, + "learning_rate": 0.00015892790415886906, + "loss": 1.0541, + "step": 6741 + }, + { + "epoch": 1.2003205128205128, + "grad_norm": 0.6127668619155884, + "learning_rate": 0.0001589165946097681, + "loss": 0.9147, + "step": 6742 + }, + { + "epoch": 1.2004985754985755, + "grad_norm": 0.7112005352973938, + "learning_rate": 0.00015890528390631885, + "loss": 0.868, + "step": 6743 + }, + { + "epoch": 1.2006766381766383, + "grad_norm": 0.6631024479866028, + "learning_rate": 0.0001588939720487429, + "loss": 0.9277, + "step": 6744 + }, + { + "epoch": 1.2008547008547008, + "grad_norm": 0.6106321215629578, + "learning_rate": 0.00015888265903726188, + "loss": 1.0223, + "step": 6745 + }, + { + "epoch": 1.2010327635327636, + "grad_norm": 0.6400851607322693, + "learning_rate": 0.00015887134487209753, + "loss": 1.1279, + "step": 6746 + }, + { + "epoch": 1.201210826210826, + "grad_norm": 0.6298650503158569, + "learning_rate": 0.00015886002955347147, + "loss": 0.9481, + "step": 6747 + }, + { + "epoch": 1.2013888888888888, + "grad_norm": 0.647974967956543, + "learning_rate": 0.00015884871308160538, + "loss": 1.1513, + "step": 6748 + }, + { + "epoch": 1.2015669515669516, + "grad_norm": 0.6770651936531067, + "learning_rate": 0.000158837395456721, + "loss": 0.9914, + "step": 
6749 + }, + { + "epoch": 1.2017450142450143, + "grad_norm": 0.6708947420120239, + "learning_rate": 0.0001588260766790401, + "loss": 1.1848, + "step": 6750 + }, + { + "epoch": 1.2019230769230769, + "grad_norm": 0.5624440908432007, + "learning_rate": 0.00015881475674878442, + "loss": 0.9848, + "step": 6751 + }, + { + "epoch": 1.2021011396011396, + "grad_norm": 0.5512633919715881, + "learning_rate": 0.00015880343566617575, + "loss": 1.0308, + "step": 6752 + }, + { + "epoch": 1.2022792022792024, + "grad_norm": 0.5621042251586914, + "learning_rate": 0.0001587921134314359, + "loss": 0.8724, + "step": 6753 + }, + { + "epoch": 1.202457264957265, + "grad_norm": 0.6881251931190491, + "learning_rate": 0.00015878079004478675, + "loss": 0.9771, + "step": 6754 + }, + { + "epoch": 1.2026353276353277, + "grad_norm": 0.729998767375946, + "learning_rate": 0.0001587694655064501, + "loss": 1.002, + "step": 6755 + }, + { + "epoch": 1.2028133903133904, + "grad_norm": 0.5972567200660706, + "learning_rate": 0.00015875813981664787, + "loss": 1.0571, + "step": 6756 + }, + { + "epoch": 1.202991452991453, + "grad_norm": 0.6319229006767273, + "learning_rate": 0.00015874681297560196, + "loss": 0.9294, + "step": 6757 + }, + { + "epoch": 1.2031695156695157, + "grad_norm": 0.6751521825790405, + "learning_rate": 0.00015873548498353428, + "loss": 0.783, + "step": 6758 + }, + { + "epoch": 1.2033475783475784, + "grad_norm": 0.6476554870605469, + "learning_rate": 0.00015872415584066677, + "loss": 0.8939, + "step": 6759 + }, + { + "epoch": 1.203525641025641, + "grad_norm": 0.6530960202217102, + "learning_rate": 0.0001587128255472214, + "loss": 0.9828, + "step": 6760 + }, + { + "epoch": 1.2037037037037037, + "grad_norm": 0.6708502173423767, + "learning_rate": 0.00015870149410342023, + "loss": 0.9285, + "step": 6761 + }, + { + "epoch": 1.2038817663817665, + "grad_norm": 0.7749543190002441, + "learning_rate": 0.0001586901615094852, + "loss": 1.1295, + "step": 6762 + }, + { + "epoch": 1.204059829059829, + 
"grad_norm": 0.6750495433807373, + "learning_rate": 0.00015867882776563836, + "loss": 1.0562, + "step": 6763 + }, + { + "epoch": 1.2042378917378918, + "grad_norm": 0.6892416477203369, + "learning_rate": 0.00015866749287210178, + "loss": 0.7207, + "step": 6764 + }, + { + "epoch": 1.2044159544159545, + "grad_norm": 0.7066485285758972, + "learning_rate": 0.00015865615682909758, + "loss": 1.0489, + "step": 6765 + }, + { + "epoch": 1.204594017094017, + "grad_norm": 0.5669938325881958, + "learning_rate": 0.00015864481963684783, + "loss": 0.8149, + "step": 6766 + }, + { + "epoch": 1.2047720797720798, + "grad_norm": 0.6467341780662537, + "learning_rate": 0.0001586334812955746, + "loss": 0.9595, + "step": 6767 + }, + { + "epoch": 1.2049501424501425, + "grad_norm": 0.6026045680046082, + "learning_rate": 0.0001586221418055002, + "loss": 0.9832, + "step": 6768 + }, + { + "epoch": 1.205128205128205, + "grad_norm": 0.7655174732208252, + "learning_rate": 0.00015861080116684665, + "loss": 0.9796, + "step": 6769 + }, + { + "epoch": 1.2053062678062678, + "grad_norm": 0.6386621594429016, + "learning_rate": 0.00015859945937983624, + "loss": 0.9368, + "step": 6770 + }, + { + "epoch": 1.2054843304843306, + "grad_norm": 0.7088032364845276, + "learning_rate": 0.0001585881164446911, + "loss": 1.0167, + "step": 6771 + }, + { + "epoch": 1.205662393162393, + "grad_norm": 0.6015275716781616, + "learning_rate": 0.0001585767723616336, + "loss": 0.8551, + "step": 6772 + }, + { + "epoch": 1.2058404558404558, + "grad_norm": 0.7013260722160339, + "learning_rate": 0.00015856542713088583, + "loss": 0.8009, + "step": 6773 + }, + { + "epoch": 1.2060185185185186, + "grad_norm": 0.6931240558624268, + "learning_rate": 0.00015855408075267024, + "loss": 0.9964, + "step": 6774 + }, + { + "epoch": 1.2061965811965811, + "grad_norm": 0.7274388670921326, + "learning_rate": 0.00015854273322720908, + "loss": 1.0991, + "step": 6775 + }, + { + "epoch": 1.2063746438746439, + "grad_norm": 0.6353716254234314, + 
"learning_rate": 0.00015853138455472466, + "loss": 1.0893, + "step": 6776 + }, + { + "epoch": 1.2065527065527066, + "grad_norm": 0.6958979368209839, + "learning_rate": 0.00015852003473543932, + "loss": 1.0238, + "step": 6777 + }, + { + "epoch": 1.2067307692307692, + "grad_norm": 0.626838743686676, + "learning_rate": 0.00015850868376957551, + "loss": 0.9384, + "step": 6778 + }, + { + "epoch": 1.206908831908832, + "grad_norm": 0.5455024242401123, + "learning_rate": 0.00015849733165735556, + "loss": 0.8068, + "step": 6779 + }, + { + "epoch": 1.2070868945868947, + "grad_norm": 0.6337353587150574, + "learning_rate": 0.0001584859783990019, + "loss": 1.1341, + "step": 6780 + }, + { + "epoch": 1.2072649572649572, + "grad_norm": 0.6318019032478333, + "learning_rate": 0.000158474623994737, + "loss": 1.1095, + "step": 6781 + }, + { + "epoch": 1.20744301994302, + "grad_norm": 0.8183810710906982, + "learning_rate": 0.00015846326844478332, + "loss": 1.1471, + "step": 6782 + }, + { + "epoch": 1.2076210826210827, + "grad_norm": 0.6140483021736145, + "learning_rate": 0.00015845191174936334, + "loss": 0.8538, + "step": 6783 + }, + { + "epoch": 1.2077991452991452, + "grad_norm": 0.7570197582244873, + "learning_rate": 0.0001584405539086996, + "loss": 1.427, + "step": 6784 + }, + { + "epoch": 1.207977207977208, + "grad_norm": 0.7616991996765137, + "learning_rate": 0.00015842919492301455, + "loss": 1.2214, + "step": 6785 + }, + { + "epoch": 1.2081552706552707, + "grad_norm": 0.561996579170227, + "learning_rate": 0.00015841783479253084, + "loss": 0.8916, + "step": 6786 + }, + { + "epoch": 1.2083333333333333, + "grad_norm": 0.6124222874641418, + "learning_rate": 0.000158406473517471, + "loss": 0.9637, + "step": 6787 + }, + { + "epoch": 1.208511396011396, + "grad_norm": 0.6053098440170288, + "learning_rate": 0.00015839511109805762, + "loss": 1.0365, + "step": 6788 + }, + { + "epoch": 1.2086894586894588, + "grad_norm": 0.6451675295829773, + "learning_rate": 0.00015838374753451338, + "loss": 
1.0497, + "step": 6789 + }, + { + "epoch": 1.2088675213675213, + "grad_norm": 0.6789399981498718, + "learning_rate": 0.00015837238282706087, + "loss": 0.9286, + "step": 6790 + }, + { + "epoch": 1.209045584045584, + "grad_norm": 0.5742998123168945, + "learning_rate": 0.0001583610169759228, + "loss": 1.082, + "step": 6791 + }, + { + "epoch": 1.2092236467236468, + "grad_norm": 0.6813693642616272, + "learning_rate": 0.0001583496499813218, + "loss": 0.9785, + "step": 6792 + }, + { + "epoch": 1.2094017094017093, + "grad_norm": 0.6150603890419006, + "learning_rate": 0.0001583382818434806, + "loss": 0.9533, + "step": 6793 + }, + { + "epoch": 1.209579772079772, + "grad_norm": 0.6905919909477234, + "learning_rate": 0.000158326912562622, + "loss": 1.0132, + "step": 6794 + }, + { + "epoch": 1.2097578347578348, + "grad_norm": 0.5861411094665527, + "learning_rate": 0.0001583155421389687, + "loss": 0.7071, + "step": 6795 + }, + { + "epoch": 1.2099358974358974, + "grad_norm": 0.6822740435600281, + "learning_rate": 0.0001583041705727435, + "loss": 1.1366, + "step": 6796 + }, + { + "epoch": 1.21011396011396, + "grad_norm": 0.6013675928115845, + "learning_rate": 0.00015829279786416916, + "loss": 0.9232, + "step": 6797 + }, + { + "epoch": 1.2102920227920229, + "grad_norm": 0.650675356388092, + "learning_rate": 0.00015828142401346857, + "loss": 0.887, + "step": 6798 + }, + { + "epoch": 1.2104700854700854, + "grad_norm": 0.6764078736305237, + "learning_rate": 0.00015827004902086456, + "loss": 0.8423, + "step": 6799 + }, + { + "epoch": 1.2106481481481481, + "grad_norm": 0.6460821628570557, + "learning_rate": 0.00015825867288657994, + "loss": 1.0074, + "step": 6800 + }, + { + "epoch": 1.210826210826211, + "grad_norm": 0.692562997341156, + "learning_rate": 0.00015824729561083768, + "loss": 0.7978, + "step": 6801 + }, + { + "epoch": 1.2110042735042734, + "grad_norm": 0.7255034446716309, + "learning_rate": 0.00015823591719386066, + "loss": 1.071, + "step": 6802 + }, + { + "epoch": 
1.2111823361823362, + "grad_norm": 0.6598904728889465, + "learning_rate": 0.0001582245376358718, + "loss": 0.9736, + "step": 6803 + }, + { + "epoch": 1.211360398860399, + "grad_norm": 0.6372483968734741, + "learning_rate": 0.0001582131569370941, + "loss": 0.9029, + "step": 6804 + }, + { + "epoch": 1.2115384615384615, + "grad_norm": 0.5907173156738281, + "learning_rate": 0.00015820177509775048, + "loss": 0.918, + "step": 6805 + }, + { + "epoch": 1.2117165242165242, + "grad_norm": 0.6252630949020386, + "learning_rate": 0.00015819039211806404, + "loss": 0.7801, + "step": 6806 + }, + { + "epoch": 1.211894586894587, + "grad_norm": 0.5793096423149109, + "learning_rate": 0.0001581790079982577, + "loss": 0.5769, + "step": 6807 + }, + { + "epoch": 1.2120726495726495, + "grad_norm": 0.7267270684242249, + "learning_rate": 0.00015816762273855454, + "loss": 1.1428, + "step": 6808 + }, + { + "epoch": 1.2122507122507122, + "grad_norm": 0.7481234073638916, + "learning_rate": 0.00015815623633917767, + "loss": 1.0209, + "step": 6809 + }, + { + "epoch": 1.212428774928775, + "grad_norm": 0.6114386916160583, + "learning_rate": 0.00015814484880035017, + "loss": 0.9073, + "step": 6810 + }, + { + "epoch": 1.2126068376068375, + "grad_norm": 0.6871182322502136, + "learning_rate": 0.00015813346012229516, + "loss": 1.151, + "step": 6811 + }, + { + "epoch": 1.2127849002849003, + "grad_norm": 0.6380293965339661, + "learning_rate": 0.0001581220703052357, + "loss": 1.0981, + "step": 6812 + }, + { + "epoch": 1.212962962962963, + "grad_norm": 0.6013718247413635, + "learning_rate": 0.00015811067934939503, + "loss": 0.8832, + "step": 6813 + }, + { + "epoch": 1.2131410256410255, + "grad_norm": 0.5816897749900818, + "learning_rate": 0.00015809928725499632, + "loss": 1.063, + "step": 6814 + }, + { + "epoch": 1.2133190883190883, + "grad_norm": 0.5970914363861084, + "learning_rate": 0.00015808789402226278, + "loss": 1.1177, + "step": 6815 + }, + { + "epoch": 1.213497150997151, + "grad_norm": 
0.7624936103820801, + "learning_rate": 0.00015807649965141762, + "loss": 1.048, + "step": 6816 + }, + { + "epoch": 1.2136752136752136, + "grad_norm": 0.636263906955719, + "learning_rate": 0.0001580651041426841, + "loss": 0.9743, + "step": 6817 + }, + { + "epoch": 1.2138532763532763, + "grad_norm": 0.641090452671051, + "learning_rate": 0.00015805370749628547, + "loss": 1.0227, + "step": 6818 + }, + { + "epoch": 1.214031339031339, + "grad_norm": 0.6484021544456482, + "learning_rate": 0.00015804230971244504, + "loss": 0.9615, + "step": 6819 + }, + { + "epoch": 1.2142094017094016, + "grad_norm": 0.6473353505134583, + "learning_rate": 0.00015803091079138613, + "loss": 1.0507, + "step": 6820 + }, + { + "epoch": 1.2143874643874644, + "grad_norm": 0.5477129220962524, + "learning_rate": 0.00015801951073333206, + "loss": 0.7928, + "step": 6821 + }, + { + "epoch": 1.2145655270655271, + "grad_norm": 0.7256210446357727, + "learning_rate": 0.0001580081095385062, + "loss": 1.0172, + "step": 6822 + }, + { + "epoch": 1.2147435897435896, + "grad_norm": 0.5785418748855591, + "learning_rate": 0.00015799670720713195, + "loss": 0.8478, + "step": 6823 + }, + { + "epoch": 1.2149216524216524, + "grad_norm": 0.6782996654510498, + "learning_rate": 0.00015798530373943267, + "loss": 1.1819, + "step": 6824 + }, + { + "epoch": 1.2150997150997151, + "grad_norm": 0.6513699293136597, + "learning_rate": 0.00015797389913563186, + "loss": 0.9626, + "step": 6825 + }, + { + "epoch": 1.2152777777777777, + "grad_norm": 0.6503037214279175, + "learning_rate": 0.0001579624933959529, + "loss": 1.0282, + "step": 6826 + }, + { + "epoch": 1.2154558404558404, + "grad_norm": 0.581501841545105, + "learning_rate": 0.0001579510865206193, + "loss": 0.8976, + "step": 6827 + }, + { + "epoch": 1.2156339031339032, + "grad_norm": 0.6696721911430359, + "learning_rate": 0.00015793967850985454, + "loss": 0.6418, + "step": 6828 + }, + { + "epoch": 1.215811965811966, + "grad_norm": 0.6577274203300476, + "learning_rate": 
0.00015792826936388213, + "loss": 1.0615, + "step": 6829 + }, + { + "epoch": 1.2159900284900285, + "grad_norm": 0.66291743516922, + "learning_rate": 0.00015791685908292564, + "loss": 0.8582, + "step": 6830 + }, + { + "epoch": 1.2161680911680912, + "grad_norm": 0.6548362374305725, + "learning_rate": 0.0001579054476672086, + "loss": 1.0343, + "step": 6831 + }, + { + "epoch": 1.2163461538461537, + "grad_norm": 0.6381218433380127, + "learning_rate": 0.00015789403511695457, + "loss": 0.8133, + "step": 6832 + }, + { + "epoch": 1.2165242165242165, + "grad_norm": 0.7217492461204529, + "learning_rate": 0.00015788262143238722, + "loss": 0.9183, + "step": 6833 + }, + { + "epoch": 1.2167022792022792, + "grad_norm": 0.610454797744751, + "learning_rate": 0.00015787120661373013, + "loss": 0.8488, + "step": 6834 + }, + { + "epoch": 1.216880341880342, + "grad_norm": 0.592771053314209, + "learning_rate": 0.00015785979066120696, + "loss": 0.8673, + "step": 6835 + }, + { + "epoch": 1.2170584045584045, + "grad_norm": 0.5787834525108337, + "learning_rate": 0.00015784837357504138, + "loss": 0.7945, + "step": 6836 + }, + { + "epoch": 1.2172364672364673, + "grad_norm": 0.6814196109771729, + "learning_rate": 0.0001578369553554571, + "loss": 0.8906, + "step": 6837 + }, + { + "epoch": 1.2174145299145298, + "grad_norm": 0.6383981108665466, + "learning_rate": 0.00015782553600267787, + "loss": 0.8962, + "step": 6838 + }, + { + "epoch": 1.2175925925925926, + "grad_norm": 0.6733864545822144, + "learning_rate": 0.0001578141155169273, + "loss": 1.2077, + "step": 6839 + }, + { + "epoch": 1.2177706552706553, + "grad_norm": 0.5891284346580505, + "learning_rate": 0.0001578026938984293, + "loss": 0.9477, + "step": 6840 + }, + { + "epoch": 1.217948717948718, + "grad_norm": 0.7220266461372375, + "learning_rate": 0.00015779127114740757, + "loss": 1.0343, + "step": 6841 + }, + { + "epoch": 1.2181267806267806, + "grad_norm": 0.6566546559333801, + "learning_rate": 0.0001577798472640859, + "loss": 0.9576, + 
"step": 6842 + }, + { + "epoch": 1.2183048433048433, + "grad_norm": 0.6428449153900146, + "learning_rate": 0.0001577684222486882, + "loss": 0.8957, + "step": 6843 + }, + { + "epoch": 1.2184829059829059, + "grad_norm": 0.6542909741401672, + "learning_rate": 0.00015775699610143823, + "loss": 0.9942, + "step": 6844 + }, + { + "epoch": 1.2186609686609686, + "grad_norm": 0.7101675868034363, + "learning_rate": 0.00015774556882255992, + "loss": 1.015, + "step": 6845 + }, + { + "epoch": 1.2188390313390314, + "grad_norm": 0.6606267094612122, + "learning_rate": 0.00015773414041227713, + "loss": 1.1406, + "step": 6846 + }, + { + "epoch": 1.2190170940170941, + "grad_norm": 0.67124342918396, + "learning_rate": 0.00015772271087081383, + "loss": 1.2392, + "step": 6847 + }, + { + "epoch": 1.2191951566951567, + "grad_norm": 0.6615056991577148, + "learning_rate": 0.0001577112801983939, + "loss": 1.1583, + "step": 6848 + }, + { + "epoch": 1.2193732193732194, + "grad_norm": 0.6941317319869995, + "learning_rate": 0.0001576998483952413, + "loss": 1.0255, + "step": 6849 + }, + { + "epoch": 1.219551282051282, + "grad_norm": 0.5740683674812317, + "learning_rate": 0.00015768841546158005, + "loss": 1.0393, + "step": 6850 + }, + { + "epoch": 1.2197293447293447, + "grad_norm": 0.7143667340278625, + "learning_rate": 0.00015767698139763415, + "loss": 0.7564, + "step": 6851 + }, + { + "epoch": 1.2199074074074074, + "grad_norm": 0.6730484366416931, + "learning_rate": 0.00015766554620362758, + "loss": 1.2221, + "step": 6852 + }, + { + "epoch": 1.2200854700854702, + "grad_norm": 0.6883087754249573, + "learning_rate": 0.00015765410987978444, + "loss": 1.0156, + "step": 6853 + }, + { + "epoch": 1.2202635327635327, + "grad_norm": 0.6585961580276489, + "learning_rate": 0.00015764267242632875, + "loss": 1.0888, + "step": 6854 + }, + { + "epoch": 1.2204415954415955, + "grad_norm": 0.6325246691703796, + "learning_rate": 0.00015763123384348465, + "loss": 0.973, + "step": 6855 + }, + { + "epoch": 
1.220619658119658, + "grad_norm": 0.5930588245391846, + "learning_rate": 0.00015761979413147627, + "loss": 0.8551, + "step": 6856 + }, + { + "epoch": 1.2207977207977208, + "grad_norm": 0.6440611481666565, + "learning_rate": 0.0001576083532905277, + "loss": 0.8396, + "step": 6857 + }, + { + "epoch": 1.2209757834757835, + "grad_norm": 0.6796659231185913, + "learning_rate": 0.00015759691132086315, + "loss": 1.0662, + "step": 6858 + }, + { + "epoch": 1.2211538461538463, + "grad_norm": 0.6813400983810425, + "learning_rate": 0.00015758546822270674, + "loss": 1.0457, + "step": 6859 + }, + { + "epoch": 1.2213319088319088, + "grad_norm": 0.6871716976165771, + "learning_rate": 0.00015757402399628272, + "loss": 1.1675, + "step": 6860 + }, + { + "epoch": 1.2215099715099715, + "grad_norm": 0.6431481838226318, + "learning_rate": 0.00015756257864181524, + "loss": 0.9366, + "step": 6861 + }, + { + "epoch": 1.221688034188034, + "grad_norm": 0.6061800718307495, + "learning_rate": 0.00015755113215952868, + "loss": 0.9267, + "step": 6862 + }, + { + "epoch": 1.2218660968660968, + "grad_norm": 0.5755770206451416, + "learning_rate": 0.00015753968454964722, + "loss": 0.7342, + "step": 6863 + }, + { + "epoch": 1.2220441595441596, + "grad_norm": 0.571345329284668, + "learning_rate": 0.00015752823581239515, + "loss": 0.8943, + "step": 6864 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 0.6925615668296814, + "learning_rate": 0.0001575167859479968, + "loss": 0.8801, + "step": 6865 + }, + { + "epoch": 1.2224002849002849, + "grad_norm": 0.6812975406646729, + "learning_rate": 0.00015750533495667655, + "loss": 0.9567, + "step": 6866 + }, + { + "epoch": 1.2225783475783476, + "grad_norm": 0.8216777443885803, + "learning_rate": 0.00015749388283865868, + "loss": 1.0908, + "step": 6867 + }, + { + "epoch": 1.2227564102564104, + "grad_norm": 0.6051010489463806, + "learning_rate": 0.00015748242959416763, + "loss": 0.8851, + "step": 6868 + }, + { + "epoch": 1.2229344729344729, + "grad_norm": 
0.7750816345214844, + "learning_rate": 0.00015747097522342775, + "loss": 1.1526, + "step": 6869 + }, + { + "epoch": 1.2231125356125356, + "grad_norm": 0.6240930557250977, + "learning_rate": 0.00015745951972666355, + "loss": 1.0603, + "step": 6870 + }, + { + "epoch": 1.2232905982905984, + "grad_norm": 0.7228875160217285, + "learning_rate": 0.00015744806310409937, + "loss": 1.1028, + "step": 6871 + }, + { + "epoch": 1.223468660968661, + "grad_norm": 0.724075436592102, + "learning_rate": 0.00015743660535595978, + "loss": 0.8983, + "step": 6872 + }, + { + "epoch": 1.2236467236467237, + "grad_norm": 0.6398203372955322, + "learning_rate": 0.00015742514648246916, + "loss": 1.0548, + "step": 6873 + }, + { + "epoch": 1.2238247863247864, + "grad_norm": 0.7024285793304443, + "learning_rate": 0.00015741368648385212, + "loss": 1.0172, + "step": 6874 + }, + { + "epoch": 1.224002849002849, + "grad_norm": 0.6717609763145447, + "learning_rate": 0.00015740222536033316, + "loss": 0.9002, + "step": 6875 + }, + { + "epoch": 1.2241809116809117, + "grad_norm": 0.5886133313179016, + "learning_rate": 0.00015739076311213686, + "loss": 0.8614, + "step": 6876 + }, + { + "epoch": 1.2243589743589745, + "grad_norm": 0.6856684684753418, + "learning_rate": 0.00015737929973948776, + "loss": 1.1633, + "step": 6877 + }, + { + "epoch": 1.224537037037037, + "grad_norm": 0.6771421432495117, + "learning_rate": 0.00015736783524261045, + "loss": 1.0921, + "step": 6878 + }, + { + "epoch": 1.2247150997150997, + "grad_norm": 0.5016412138938904, + "learning_rate": 0.0001573563696217296, + "loss": 0.6732, + "step": 6879 + }, + { + "epoch": 1.2248931623931625, + "grad_norm": 0.7595276236534119, + "learning_rate": 0.00015734490287706984, + "loss": 1.0427, + "step": 6880 + }, + { + "epoch": 1.225071225071225, + "grad_norm": 0.6664281487464905, + "learning_rate": 0.00015733343500885582, + "loss": 1.2836, + "step": 6881 + }, + { + "epoch": 1.2252492877492878, + "grad_norm": 0.6662577390670776, + "learning_rate": 
0.00015732196601731224, + "loss": 1.1288, + "step": 6882 + }, + { + "epoch": 1.2254273504273505, + "grad_norm": 0.6238988041877747, + "learning_rate": 0.00015731049590266385, + "loss": 1.0809, + "step": 6883 + }, + { + "epoch": 1.225605413105413, + "grad_norm": 0.6483062505722046, + "learning_rate": 0.00015729902466513532, + "loss": 0.9992, + "step": 6884 + }, + { + "epoch": 1.2257834757834758, + "grad_norm": 0.6890861988067627, + "learning_rate": 0.0001572875523049514, + "loss": 1.1844, + "step": 6885 + }, + { + "epoch": 1.2259615384615385, + "grad_norm": 0.7087607383728027, + "learning_rate": 0.00015727607882233695, + "loss": 1.013, + "step": 6886 + }, + { + "epoch": 1.226139601139601, + "grad_norm": 0.709048867225647, + "learning_rate": 0.00015726460421751668, + "loss": 0.9748, + "step": 6887 + }, + { + "epoch": 1.2263176638176638, + "grad_norm": 0.5918150544166565, + "learning_rate": 0.00015725312849071546, + "loss": 0.9978, + "step": 6888 + }, + { + "epoch": 1.2264957264957266, + "grad_norm": 0.4343377947807312, + "learning_rate": 0.0001572416516421581, + "loss": 0.6233, + "step": 6889 + }, + { + "epoch": 1.226673789173789, + "grad_norm": 0.6360403895378113, + "learning_rate": 0.00015723017367206952, + "loss": 0.9698, + "step": 6890 + }, + { + "epoch": 1.2268518518518519, + "grad_norm": 0.7261984944343567, + "learning_rate": 0.00015721869458067454, + "loss": 1.0426, + "step": 6891 + }, + { + "epoch": 1.2270299145299146, + "grad_norm": 0.6806774139404297, + "learning_rate": 0.0001572072143681981, + "loss": 0.9692, + "step": 6892 + }, + { + "epoch": 1.2272079772079771, + "grad_norm": 0.7140612006187439, + "learning_rate": 0.00015719573303486515, + "loss": 1.0828, + "step": 6893 + }, + { + "epoch": 1.22738603988604, + "grad_norm": 0.5383326411247253, + "learning_rate": 0.0001571842505809006, + "loss": 1.012, + "step": 6894 + }, + { + "epoch": 1.2275641025641026, + "grad_norm": 0.5992259383201599, + "learning_rate": 0.0001571727670065295, + "loss": 0.876, + 
"step": 6895 + }, + { + "epoch": 1.2277421652421652, + "grad_norm": 0.636696457862854, + "learning_rate": 0.00015716128231197676, + "loss": 1.1001, + "step": 6896 + }, + { + "epoch": 1.227920227920228, + "grad_norm": 0.5980371236801147, + "learning_rate": 0.00015714979649746744, + "loss": 0.937, + "step": 6897 + }, + { + "epoch": 1.2280982905982907, + "grad_norm": 0.7678794860839844, + "learning_rate": 0.00015713830956322656, + "loss": 1.1965, + "step": 6898 + }, + { + "epoch": 1.2282763532763532, + "grad_norm": 0.6918835639953613, + "learning_rate": 0.00015712682150947923, + "loss": 0.8578, + "step": 6899 + }, + { + "epoch": 1.228454415954416, + "grad_norm": 0.6463451385498047, + "learning_rate": 0.00015711533233645048, + "loss": 1.009, + "step": 6900 + }, + { + "epoch": 1.2286324786324787, + "grad_norm": 0.6720646023750305, + "learning_rate": 0.00015710384204436549, + "loss": 1.0031, + "step": 6901 + }, + { + "epoch": 1.2288105413105412, + "grad_norm": 0.6618736982345581, + "learning_rate": 0.00015709235063344926, + "loss": 0.9017, + "step": 6902 + }, + { + "epoch": 1.228988603988604, + "grad_norm": 0.6789427399635315, + "learning_rate": 0.0001570808581039271, + "loss": 1.1289, + "step": 6903 + }, + { + "epoch": 1.2291666666666667, + "grad_norm": 0.6395950317382812, + "learning_rate": 0.00015706936445602403, + "loss": 1.1051, + "step": 6904 + }, + { + "epoch": 1.2293447293447293, + "grad_norm": 0.7023917436599731, + "learning_rate": 0.00015705786968996533, + "loss": 1.2876, + "step": 6905 + }, + { + "epoch": 1.229522792022792, + "grad_norm": 0.7473352551460266, + "learning_rate": 0.00015704637380597623, + "loss": 1.237, + "step": 6906 + }, + { + "epoch": 1.2297008547008548, + "grad_norm": 0.6952672004699707, + "learning_rate": 0.00015703487680428192, + "loss": 1.0674, + "step": 6907 + }, + { + "epoch": 1.2298789173789173, + "grad_norm": 0.5968644022941589, + "learning_rate": 0.0001570233786851077, + "loss": 0.9169, + "step": 6908 + }, + { + "epoch": 
1.23005698005698, + "grad_norm": 0.7219798564910889, + "learning_rate": 0.0001570118794486788, + "loss": 1.0556, + "step": 6909 + }, + { + "epoch": 1.2302350427350428, + "grad_norm": 0.6603400707244873, + "learning_rate": 0.0001570003790952206, + "loss": 0.9596, + "step": 6910 + }, + { + "epoch": 1.2304131054131053, + "grad_norm": 0.5972838401794434, + "learning_rate": 0.0001569888776249583, + "loss": 0.9168, + "step": 6911 + }, + { + "epoch": 1.230591168091168, + "grad_norm": 0.792585551738739, + "learning_rate": 0.00015697737503811738, + "loss": 1.1074, + "step": 6912 + }, + { + "epoch": 1.2307692307692308, + "grad_norm": 0.5845609903335571, + "learning_rate": 0.00015696587133492314, + "loss": 0.8413, + "step": 6913 + }, + { + "epoch": 1.2309472934472934, + "grad_norm": 0.6603896021842957, + "learning_rate": 0.000156954366515601, + "loss": 0.9109, + "step": 6914 + }, + { + "epoch": 1.2311253561253561, + "grad_norm": 0.6367142796516418, + "learning_rate": 0.00015694286058037636, + "loss": 1.0119, + "step": 6915 + }, + { + "epoch": 1.2313034188034189, + "grad_norm": 0.693854570388794, + "learning_rate": 0.00015693135352947465, + "loss": 1.0925, + "step": 6916 + }, + { + "epoch": 1.2314814814814814, + "grad_norm": 0.6570404171943665, + "learning_rate": 0.00015691984536312135, + "loss": 0.9731, + "step": 6917 + }, + { + "epoch": 1.2316595441595442, + "grad_norm": 0.6778639554977417, + "learning_rate": 0.0001569083360815419, + "loss": 1.1415, + "step": 6918 + }, + { + "epoch": 1.231837606837607, + "grad_norm": 0.6656233668327332, + "learning_rate": 0.00015689682568496182, + "loss": 0.8603, + "step": 6919 + }, + { + "epoch": 1.2320156695156694, + "grad_norm": 0.6569861173629761, + "learning_rate": 0.00015688531417360665, + "loss": 0.8374, + "step": 6920 + }, + { + "epoch": 1.2321937321937322, + "grad_norm": 0.6746888160705566, + "learning_rate": 0.0001568738015477019, + "loss": 1.1395, + "step": 6921 + }, + { + "epoch": 1.232371794871795, + "grad_norm": 
0.6180813908576965, + "learning_rate": 0.00015686228780747316, + "loss": 1.0049, + "step": 6922 + }, + { + "epoch": 1.2325498575498575, + "grad_norm": 0.7326146960258484, + "learning_rate": 0.000156850772953146, + "loss": 1.2389, + "step": 6923 + }, + { + "epoch": 1.2327279202279202, + "grad_norm": 0.5912215709686279, + "learning_rate": 0.00015683925698494608, + "loss": 1.0174, + "step": 6924 + }, + { + "epoch": 1.232905982905983, + "grad_norm": 0.5214745402336121, + "learning_rate": 0.00015682773990309895, + "loss": 0.5778, + "step": 6925 + }, + { + "epoch": 1.2330840455840455, + "grad_norm": 0.6862079501152039, + "learning_rate": 0.00015681622170783034, + "loss": 0.896, + "step": 6926 + }, + { + "epoch": 1.2332621082621082, + "grad_norm": 0.7858926057815552, + "learning_rate": 0.00015680470239936586, + "loss": 1.0714, + "step": 6927 + }, + { + "epoch": 1.233440170940171, + "grad_norm": 0.6706146597862244, + "learning_rate": 0.00015679318197793127, + "loss": 1.0157, + "step": 6928 + }, + { + "epoch": 1.2336182336182335, + "grad_norm": 0.6657105088233948, + "learning_rate": 0.00015678166044375225, + "loss": 0.9674, + "step": 6929 + }, + { + "epoch": 1.2337962962962963, + "grad_norm": 0.6790838837623596, + "learning_rate": 0.0001567701377970545, + "loss": 0.9744, + "step": 6930 + }, + { + "epoch": 1.233974358974359, + "grad_norm": 0.6469771862030029, + "learning_rate": 0.00015675861403806386, + "loss": 1.0205, + "step": 6931 + }, + { + "epoch": 1.2341524216524216, + "grad_norm": 0.4926300346851349, + "learning_rate": 0.0001567470891670061, + "loss": 0.6336, + "step": 6932 + }, + { + "epoch": 1.2343304843304843, + "grad_norm": 0.6762157082557678, + "learning_rate": 0.000156735563184107, + "loss": 1.059, + "step": 6933 + }, + { + "epoch": 1.234508547008547, + "grad_norm": 0.6998521685600281, + "learning_rate": 0.0001567240360895924, + "loss": 1.0586, + "step": 6934 + }, + { + "epoch": 1.2346866096866096, + "grad_norm": 0.5947706699371338, + "learning_rate": 
0.00015671250788368814, + "loss": 0.8815, + "step": 6935 + }, + { + "epoch": 1.2348646723646723, + "grad_norm": 0.6966122984886169, + "learning_rate": 0.0001567009785666201, + "loss": 1.0105, + "step": 6936 + }, + { + "epoch": 1.235042735042735, + "grad_norm": 0.6747866272926331, + "learning_rate": 0.0001566894481386142, + "loss": 0.8783, + "step": 6937 + }, + { + "epoch": 1.2352207977207976, + "grad_norm": 0.6348921060562134, + "learning_rate": 0.0001566779165998963, + "loss": 0.7813, + "step": 6938 + }, + { + "epoch": 1.2353988603988604, + "grad_norm": 0.596466600894928, + "learning_rate": 0.00015666638395069236, + "loss": 0.8689, + "step": 6939 + }, + { + "epoch": 1.2355769230769231, + "grad_norm": 0.6926795244216919, + "learning_rate": 0.00015665485019122834, + "loss": 1.0266, + "step": 6940 + }, + { + "epoch": 1.2357549857549857, + "grad_norm": 0.6590100526809692, + "learning_rate": 0.00015664331532173022, + "loss": 1.128, + "step": 6941 + }, + { + "epoch": 1.2359330484330484, + "grad_norm": 0.7422109246253967, + "learning_rate": 0.00015663177934242402, + "loss": 0.8495, + "step": 6942 + }, + { + "epoch": 1.2361111111111112, + "grad_norm": 0.6463228464126587, + "learning_rate": 0.0001566202422535357, + "loss": 1.0941, + "step": 6943 + }, + { + "epoch": 1.236289173789174, + "grad_norm": 0.7278686761856079, + "learning_rate": 0.0001566087040552914, + "loss": 1.2039, + "step": 6944 + }, + { + "epoch": 1.2364672364672364, + "grad_norm": 0.6917086839675903, + "learning_rate": 0.00015659716474791712, + "loss": 1.042, + "step": 6945 + }, + { + "epoch": 1.2366452991452992, + "grad_norm": 0.637205183506012, + "learning_rate": 0.00015658562433163898, + "loss": 1.0379, + "step": 6946 + }, + { + "epoch": 1.2368233618233617, + "grad_norm": 0.6706623435020447, + "learning_rate": 0.00015657408280668307, + "loss": 1.0347, + "step": 6947 + }, + { + "epoch": 1.2370014245014245, + "grad_norm": 0.6435480713844299, + "learning_rate": 0.00015656254017327553, + "loss": 0.7708, + 
"step": 6948 + }, + { + "epoch": 1.2371794871794872, + "grad_norm": 0.5703113675117493, + "learning_rate": 0.0001565509964316425, + "loss": 0.8786, + "step": 6949 + }, + { + "epoch": 1.23735754985755, + "grad_norm": 0.6438127160072327, + "learning_rate": 0.00015653945158201018, + "loss": 0.9435, + "step": 6950 + }, + { + "epoch": 1.2375356125356125, + "grad_norm": 0.68101966381073, + "learning_rate": 0.00015652790562460474, + "loss": 1.1062, + "step": 6951 + }, + { + "epoch": 1.2377136752136753, + "grad_norm": 0.661230206489563, + "learning_rate": 0.00015651635855965242, + "loss": 1.0113, + "step": 6952 + }, + { + "epoch": 1.2378917378917378, + "grad_norm": 0.6399117708206177, + "learning_rate": 0.0001565048103873795, + "loss": 1.1423, + "step": 6953 + }, + { + "epoch": 1.2380698005698005, + "grad_norm": 0.7614672780036926, + "learning_rate": 0.00015649326110801215, + "loss": 1.0359, + "step": 6954 + }, + { + "epoch": 1.2382478632478633, + "grad_norm": 0.6461986303329468, + "learning_rate": 0.00015648171072177674, + "loss": 1.0145, + "step": 6955 + }, + { + "epoch": 1.238425925925926, + "grad_norm": 0.5902668833732605, + "learning_rate": 0.0001564701592288995, + "loss": 0.9451, + "step": 6956 + }, + { + "epoch": 1.2386039886039886, + "grad_norm": 0.5686020255088806, + "learning_rate": 0.00015645860662960682, + "loss": 0.7512, + "step": 6957 + }, + { + "epoch": 1.2387820512820513, + "grad_norm": 0.6640077829360962, + "learning_rate": 0.00015644705292412503, + "loss": 0.7133, + "step": 6958 + }, + { + "epoch": 1.2389601139601139, + "grad_norm": 0.7402132749557495, + "learning_rate": 0.00015643549811268049, + "loss": 1.0903, + "step": 6959 + }, + { + "epoch": 1.2391381766381766, + "grad_norm": 0.62332683801651, + "learning_rate": 0.00015642394219549962, + "loss": 0.9378, + "step": 6960 + }, + { + "epoch": 1.2393162393162394, + "grad_norm": 0.6374901533126831, + "learning_rate": 0.00015641238517280877, + "loss": 1.0746, + "step": 6961 + }, + { + "epoch": 
1.239494301994302, + "grad_norm": 0.5939112901687622, + "learning_rate": 0.00015640082704483443, + "loss": 0.7185, + "step": 6962 + }, + { + "epoch": 1.2396723646723646, + "grad_norm": 0.8378096222877502, + "learning_rate": 0.00015638926781180306, + "loss": 1.1932, + "step": 6963 + }, + { + "epoch": 1.2398504273504274, + "grad_norm": 0.5707982778549194, + "learning_rate": 0.0001563777074739411, + "loss": 0.9834, + "step": 6964 + }, + { + "epoch": 1.24002849002849, + "grad_norm": 0.6339748501777649, + "learning_rate": 0.00015636614603147512, + "loss": 1.0307, + "step": 6965 + }, + { + "epoch": 1.2402065527065527, + "grad_norm": 0.7353155016899109, + "learning_rate": 0.00015635458348463156, + "loss": 1.0311, + "step": 6966 + }, + { + "epoch": 1.2403846153846154, + "grad_norm": 0.8307726979255676, + "learning_rate": 0.00015634301983363704, + "loss": 1.0673, + "step": 6967 + }, + { + "epoch": 1.2405626780626782, + "grad_norm": 0.5299199819564819, + "learning_rate": 0.00015633145507871807, + "loss": 0.6649, + "step": 6968 + }, + { + "epoch": 1.2407407407407407, + "grad_norm": 0.6162533760070801, + "learning_rate": 0.00015631988922010126, + "loss": 0.8096, + "step": 6969 + }, + { + "epoch": 1.2409188034188035, + "grad_norm": 0.6212689876556396, + "learning_rate": 0.0001563083222580132, + "loss": 1.0371, + "step": 6970 + }, + { + "epoch": 1.241096866096866, + "grad_norm": 0.6148123145103455, + "learning_rate": 0.00015629675419268055, + "loss": 1.0439, + "step": 6971 + }, + { + "epoch": 1.2412749287749287, + "grad_norm": 0.6163684129714966, + "learning_rate": 0.00015628518502432994, + "loss": 0.9075, + "step": 6972 + }, + { + "epoch": 1.2414529914529915, + "grad_norm": 0.5127472877502441, + "learning_rate": 0.00015627361475318807, + "loss": 0.6138, + "step": 6973 + }, + { + "epoch": 1.2416310541310542, + "grad_norm": 0.6508103013038635, + "learning_rate": 0.0001562620433794816, + "loss": 0.9608, + "step": 6974 + }, + { + "epoch": 1.2418091168091168, + "grad_norm": 
0.6711046695709229, + "learning_rate": 0.0001562504709034373, + "loss": 1.1494, + "step": 6975 + }, + { + "epoch": 1.2419871794871795, + "grad_norm": 0.6831514835357666, + "learning_rate": 0.00015623889732528182, + "loss": 0.9664, + "step": 6976 + }, + { + "epoch": 1.242165242165242, + "grad_norm": 0.693732738494873, + "learning_rate": 0.00015622732264524198, + "loss": 0.9055, + "step": 6977 + }, + { + "epoch": 1.2423433048433048, + "grad_norm": 0.8475173711776733, + "learning_rate": 0.00015621574686354456, + "loss": 1.2014, + "step": 6978 + }, + { + "epoch": 1.2425213675213675, + "grad_norm": 0.6342347264289856, + "learning_rate": 0.0001562041699804164, + "loss": 1.0691, + "step": 6979 + }, + { + "epoch": 1.2426994301994303, + "grad_norm": 0.620517373085022, + "learning_rate": 0.00015619259199608422, + "loss": 0.7318, + "step": 6980 + }, + { + "epoch": 1.2428774928774928, + "grad_norm": 0.589567244052887, + "learning_rate": 0.000156181012910775, + "loss": 1.0656, + "step": 6981 + }, + { + "epoch": 1.2430555555555556, + "grad_norm": 0.7570258975028992, + "learning_rate": 0.00015616943272471546, + "loss": 1.0517, + "step": 6982 + }, + { + "epoch": 1.243233618233618, + "grad_norm": 0.6232032775878906, + "learning_rate": 0.00015615785143813262, + "loss": 0.8867, + "step": 6983 + }, + { + "epoch": 1.2434116809116809, + "grad_norm": 0.630095899105072, + "learning_rate": 0.0001561462690512533, + "loss": 0.9287, + "step": 6984 + }, + { + "epoch": 1.2435897435897436, + "grad_norm": 0.7410848140716553, + "learning_rate": 0.00015613468556430454, + "loss": 1.162, + "step": 6985 + }, + { + "epoch": 1.2437678062678064, + "grad_norm": 0.7574684023857117, + "learning_rate": 0.00015612310097751317, + "loss": 1.2118, + "step": 6986 + }, + { + "epoch": 1.243945868945869, + "grad_norm": 0.580760657787323, + "learning_rate": 0.0001561115152911062, + "loss": 1.0612, + "step": 6987 + }, + { + "epoch": 1.2441239316239316, + "grad_norm": 0.6105104088783264, + "learning_rate": 
0.00015609992850531073, + "loss": 0.9262, + "step": 6988 + }, + { + "epoch": 1.2443019943019944, + "grad_norm": 0.669435441493988, + "learning_rate": 0.00015608834062035362, + "loss": 0.9595, + "step": 6989 + }, + { + "epoch": 1.244480056980057, + "grad_norm": 0.6530314683914185, + "learning_rate": 0.00015607675163646206, + "loss": 0.7987, + "step": 6990 + }, + { + "epoch": 1.2446581196581197, + "grad_norm": 0.5801477432250977, + "learning_rate": 0.00015606516155386297, + "loss": 0.7667, + "step": 6991 + }, + { + "epoch": 1.2448361823361824, + "grad_norm": 0.5773885250091553, + "learning_rate": 0.00015605357037278355, + "loss": 0.847, + "step": 6992 + }, + { + "epoch": 1.245014245014245, + "grad_norm": 0.5399810075759888, + "learning_rate": 0.00015604197809345082, + "loss": 0.9284, + "step": 6993 + }, + { + "epoch": 1.2451923076923077, + "grad_norm": 0.5910452604293823, + "learning_rate": 0.000156030384716092, + "loss": 1.0004, + "step": 6994 + }, + { + "epoch": 1.2453703703703705, + "grad_norm": 0.5979224443435669, + "learning_rate": 0.00015601879024093414, + "loss": 0.9027, + "step": 6995 + }, + { + "epoch": 1.245548433048433, + "grad_norm": 0.6092126369476318, + "learning_rate": 0.0001560071946682045, + "loss": 0.9755, + "step": 6996 + }, + { + "epoch": 1.2457264957264957, + "grad_norm": 0.6536708474159241, + "learning_rate": 0.0001559955979981302, + "loss": 1.1828, + "step": 6997 + }, + { + "epoch": 1.2459045584045585, + "grad_norm": 0.6602030992507935, + "learning_rate": 0.00015598400023093847, + "loss": 1.0395, + "step": 6998 + }, + { + "epoch": 1.246082621082621, + "grad_norm": 0.6864825487136841, + "learning_rate": 0.00015597240136685657, + "loss": 1.083, + "step": 6999 + }, + { + "epoch": 1.2462606837606838, + "grad_norm": 0.6194674968719482, + "learning_rate": 0.0001559608014061117, + "loss": 1.0461, + "step": 7000 + }, + { + "epoch": 1.2464387464387465, + "grad_norm": 0.5879074335098267, + "learning_rate": 0.00015594920034893122, + "loss": 1.076, + 
"step": 7001 + }, + { + "epoch": 1.246616809116809, + "grad_norm": 0.6514387726783752, + "learning_rate": 0.00015593759819554234, + "loss": 1.0396, + "step": 7002 + }, + { + "epoch": 1.2467948717948718, + "grad_norm": 0.5988301634788513, + "learning_rate": 0.00015592599494617247, + "loss": 0.9501, + "step": 7003 + }, + { + "epoch": 1.2469729344729346, + "grad_norm": 0.6282773613929749, + "learning_rate": 0.00015591439060104887, + "loss": 1.1002, + "step": 7004 + }, + { + "epoch": 1.247150997150997, + "grad_norm": 0.6910465955734253, + "learning_rate": 0.00015590278516039896, + "loss": 1.1771, + "step": 7005 + }, + { + "epoch": 1.2473290598290598, + "grad_norm": 0.6097282767295837, + "learning_rate": 0.00015589117862445007, + "loss": 1.0707, + "step": 7006 + }, + { + "epoch": 1.2475071225071226, + "grad_norm": 0.7076875567436218, + "learning_rate": 0.00015587957099342967, + "loss": 1.0078, + "step": 7007 + }, + { + "epoch": 1.2476851851851851, + "grad_norm": 0.6776556372642517, + "learning_rate": 0.00015586796226756518, + "loss": 0.8971, + "step": 7008 + }, + { + "epoch": 1.2478632478632479, + "grad_norm": 0.6506341695785522, + "learning_rate": 0.00015585635244708398, + "loss": 0.9727, + "step": 7009 + }, + { + "epoch": 1.2480413105413106, + "grad_norm": 0.624724805355072, + "learning_rate": 0.00015584474153221357, + "loss": 0.9858, + "step": 7010 + }, + { + "epoch": 1.2482193732193732, + "grad_norm": 0.6070096492767334, + "learning_rate": 0.0001558331295231815, + "loss": 0.9385, + "step": 7011 + }, + { + "epoch": 1.248397435897436, + "grad_norm": 0.6948656439781189, + "learning_rate": 0.00015582151642021524, + "loss": 0.9425, + "step": 7012 + }, + { + "epoch": 1.2485754985754987, + "grad_norm": 0.6559088230133057, + "learning_rate": 0.0001558099022235423, + "loss": 1.0002, + "step": 7013 + }, + { + "epoch": 1.2487535612535612, + "grad_norm": 0.6097117066383362, + "learning_rate": 0.00015579828693339026, + "loss": 1.0234, + "step": 7014 + }, + { + "epoch": 
1.248931623931624, + "grad_norm": 0.6612260341644287, + "learning_rate": 0.00015578667054998673, + "loss": 1.1376, + "step": 7015 + }, + { + "epoch": 1.2491096866096867, + "grad_norm": 0.6305607557296753, + "learning_rate": 0.00015577505307355925, + "loss": 0.9127, + "step": 7016 + }, + { + "epoch": 1.2492877492877492, + "grad_norm": 0.6648319959640503, + "learning_rate": 0.00015576343450433549, + "loss": 0.8697, + "step": 7017 + }, + { + "epoch": 1.249465811965812, + "grad_norm": 0.7642946839332581, + "learning_rate": 0.00015575181484254303, + "loss": 1.0998, + "step": 7018 + }, + { + "epoch": 1.2496438746438747, + "grad_norm": 0.6775243282318115, + "learning_rate": 0.00015574019408840962, + "loss": 1.0186, + "step": 7019 + }, + { + "epoch": 1.2498219373219372, + "grad_norm": 0.6075591444969177, + "learning_rate": 0.00015572857224216286, + "loss": 0.9592, + "step": 7020 + }, + { + "epoch": 1.2498219373219372, + "eval_loss": 1.105136752128601, + "eval_runtime": 24.4793, + "eval_samples_per_second": 42.526, + "eval_steps_per_second": 21.283, + "step": 7020 + }, + { + "epoch": 1.25, + "grad_norm": 0.5856962203979492, + "learning_rate": 0.0001557169493040305, + "loss": 0.8336, + "step": 7021 + }, + { + "epoch": 1.2501780626780628, + "grad_norm": 0.6451364159584045, + "learning_rate": 0.00015570532527424028, + "loss": 0.8805, + "step": 7022 + }, + { + "epoch": 1.2503561253561253, + "grad_norm": 0.6266474723815918, + "learning_rate": 0.00015569370015301991, + "loss": 1.0023, + "step": 7023 + }, + { + "epoch": 1.250534188034188, + "grad_norm": 0.5547378063201904, + "learning_rate": 0.00015568207394059722, + "loss": 0.7385, + "step": 7024 + }, + { + "epoch": 1.2507122507122508, + "grad_norm": 0.604169487953186, + "learning_rate": 0.0001556704466371999, + "loss": 0.9194, + "step": 7025 + }, + { + "epoch": 1.2508903133903133, + "grad_norm": 0.7054405212402344, + "learning_rate": 0.00015565881824305586, + "loss": 1.1864, + "step": 7026 + }, + { + "epoch": 1.251068376068376, 
+ "grad_norm": 0.6429929733276367, + "learning_rate": 0.0001556471887583929, + "loss": 1.0129, + "step": 7027 + }, + { + "epoch": 1.2512464387464388, + "grad_norm": 0.695957362651825, + "learning_rate": 0.00015563555818343887, + "loss": 1.2994, + "step": 7028 + }, + { + "epoch": 1.2514245014245013, + "grad_norm": 0.5889938473701477, + "learning_rate": 0.0001556239265184216, + "loss": 1.0109, + "step": 7029 + }, + { + "epoch": 1.251602564102564, + "grad_norm": 0.6424569487571716, + "learning_rate": 0.0001556122937635691, + "loss": 0.8585, + "step": 7030 + }, + { + "epoch": 1.2517806267806268, + "grad_norm": 0.5561244487762451, + "learning_rate": 0.0001556006599191092, + "loss": 0.9994, + "step": 7031 + }, + { + "epoch": 1.2519586894586894, + "grad_norm": 0.6355302333831787, + "learning_rate": 0.00015558902498526988, + "loss": 0.9495, + "step": 7032 + }, + { + "epoch": 1.2521367521367521, + "grad_norm": 0.6272686719894409, + "learning_rate": 0.00015557738896227908, + "loss": 0.7611, + "step": 7033 + }, + { + "epoch": 1.2523148148148149, + "grad_norm": 0.7069199085235596, + "learning_rate": 0.00015556575185036482, + "loss": 1.0612, + "step": 7034 + }, + { + "epoch": 1.2524928774928774, + "grad_norm": 0.6635094285011292, + "learning_rate": 0.00015555411364975505, + "loss": 1.1182, + "step": 7035 + }, + { + "epoch": 1.2526709401709402, + "grad_norm": 0.6112014651298523, + "learning_rate": 0.00015554247436067785, + "loss": 0.8677, + "step": 7036 + }, + { + "epoch": 1.252849002849003, + "grad_norm": 0.678963303565979, + "learning_rate": 0.00015553083398336126, + "loss": 1.1421, + "step": 7037 + }, + { + "epoch": 1.2530270655270654, + "grad_norm": 0.6291939616203308, + "learning_rate": 0.0001555191925180333, + "loss": 0.9157, + "step": 7038 + }, + { + "epoch": 1.2532051282051282, + "grad_norm": 0.6519795656204224, + "learning_rate": 0.0001555075499649221, + "loss": 1.0074, + "step": 7039 + }, + { + "epoch": 1.253383190883191, + "grad_norm": 0.6063529849052429, + 
"learning_rate": 0.00015549590632425576, + "loss": 1.0205, + "step": 7040 + }, + { + "epoch": 1.2535612535612537, + "grad_norm": 0.7055633664131165, + "learning_rate": 0.00015548426159626242, + "loss": 1.0254, + "step": 7041 + }, + { + "epoch": 1.2537393162393162, + "grad_norm": 0.6783022880554199, + "learning_rate": 0.00015547261578117025, + "loss": 1.1017, + "step": 7042 + }, + { + "epoch": 1.253917378917379, + "grad_norm": 0.7055003643035889, + "learning_rate": 0.0001554609688792074, + "loss": 1.0269, + "step": 7043 + }, + { + "epoch": 1.2540954415954415, + "grad_norm": 0.6465007662773132, + "learning_rate": 0.0001554493208906021, + "loss": 1.0492, + "step": 7044 + }, + { + "epoch": 1.2542735042735043, + "grad_norm": 0.6443775296211243, + "learning_rate": 0.0001554376718155825, + "loss": 0.9778, + "step": 7045 + }, + { + "epoch": 1.254451566951567, + "grad_norm": 0.695214569568634, + "learning_rate": 0.0001554260216543769, + "loss": 0.8792, + "step": 7046 + }, + { + "epoch": 1.2546296296296298, + "grad_norm": 0.6777814626693726, + "learning_rate": 0.00015541437040721354, + "loss": 0.8944, + "step": 7047 + }, + { + "epoch": 1.2548076923076923, + "grad_norm": 0.6269369721412659, + "learning_rate": 0.0001554027180743207, + "loss": 0.8825, + "step": 7048 + }, + { + "epoch": 1.254985754985755, + "grad_norm": 0.6197061538696289, + "learning_rate": 0.0001553910646559267, + "loss": 0.9823, + "step": 7049 + }, + { + "epoch": 1.2551638176638176, + "grad_norm": 0.681347131729126, + "learning_rate": 0.00015537941015225984, + "loss": 0.995, + "step": 7050 + }, + { + "epoch": 1.2553418803418803, + "grad_norm": 0.6224286556243896, + "learning_rate": 0.00015536775456354848, + "loss": 0.7714, + "step": 7051 + }, + { + "epoch": 1.255519943019943, + "grad_norm": 0.6113278269767761, + "learning_rate": 0.00015535609789002098, + "loss": 0.9859, + "step": 7052 + }, + { + "epoch": 1.2556980056980058, + "grad_norm": 0.6985422372817993, + "learning_rate": 0.00015534444013190577, + 
"loss": 0.8785, + "step": 7053 + }, + { + "epoch": 1.2558760683760684, + "grad_norm": 0.5602933168411255, + "learning_rate": 0.00015533278128943118, + "loss": 0.8341, + "step": 7054 + }, + { + "epoch": 1.256054131054131, + "grad_norm": 0.587684690952301, + "learning_rate": 0.0001553211213628257, + "loss": 0.7933, + "step": 7055 + }, + { + "epoch": 1.2562321937321936, + "grad_norm": 0.692997932434082, + "learning_rate": 0.0001553094603523178, + "loss": 1.0957, + "step": 7056 + }, + { + "epoch": 1.2564102564102564, + "grad_norm": 0.6925587058067322, + "learning_rate": 0.00015529779825813588, + "loss": 0.8602, + "step": 7057 + }, + { + "epoch": 1.2565883190883191, + "grad_norm": 0.6383063197135925, + "learning_rate": 0.0001552861350805085, + "loss": 0.9933, + "step": 7058 + }, + { + "epoch": 1.256766381766382, + "grad_norm": 0.6520544290542603, + "learning_rate": 0.00015527447081966413, + "loss": 0.9498, + "step": 7059 + }, + { + "epoch": 1.2569444444444444, + "grad_norm": 0.7353914380073547, + "learning_rate": 0.00015526280547583133, + "loss": 1.1071, + "step": 7060 + }, + { + "epoch": 1.2571225071225072, + "grad_norm": 0.7141618132591248, + "learning_rate": 0.00015525113904923864, + "loss": 0.8333, + "step": 7061 + }, + { + "epoch": 1.2573005698005697, + "grad_norm": 0.6194499731063843, + "learning_rate": 0.00015523947154011468, + "loss": 0.9421, + "step": 7062 + }, + { + "epoch": 1.2574786324786325, + "grad_norm": 0.7514514327049255, + "learning_rate": 0.00015522780294868803, + "loss": 1.226, + "step": 7063 + }, + { + "epoch": 1.2576566951566952, + "grad_norm": 0.762923538684845, + "learning_rate": 0.0001552161332751873, + "loss": 1.1893, + "step": 7064 + }, + { + "epoch": 1.257834757834758, + "grad_norm": 0.6265730261802673, + "learning_rate": 0.00015520446251984113, + "loss": 0.6604, + "step": 7065 + }, + { + "epoch": 1.2580128205128205, + "grad_norm": 0.6447750329971313, + "learning_rate": 0.0001551927906828782, + "loss": 0.9814, + "step": 7066 + }, + { + 
"epoch": 1.2581908831908832, + "grad_norm": 0.5791042447090149, + "learning_rate": 0.00015518111776452722, + "loss": 0.8283, + "step": 7067 + }, + { + "epoch": 1.2583689458689458, + "grad_norm": 0.5267777442932129, + "learning_rate": 0.00015516944376501682, + "loss": 0.5748, + "step": 7068 + }, + { + "epoch": 1.2585470085470085, + "grad_norm": 0.7343912720680237, + "learning_rate": 0.0001551577686845758, + "loss": 1.1777, + "step": 7069 + }, + { + "epoch": 1.2587250712250713, + "grad_norm": 0.645746111869812, + "learning_rate": 0.00015514609252343284, + "loss": 0.9356, + "step": 7070 + }, + { + "epoch": 1.258903133903134, + "grad_norm": 0.6993104219436646, + "learning_rate": 0.0001551344152818168, + "loss": 1.06, + "step": 7071 + }, + { + "epoch": 1.2590811965811965, + "grad_norm": 0.6661365628242493, + "learning_rate": 0.0001551227369599564, + "loss": 1.061, + "step": 7072 + }, + { + "epoch": 1.2592592592592593, + "grad_norm": 0.7833736538887024, + "learning_rate": 0.0001551110575580805, + "loss": 0.9674, + "step": 7073 + }, + { + "epoch": 1.2594373219373218, + "grad_norm": 0.5878575444221497, + "learning_rate": 0.00015509937707641787, + "loss": 0.9002, + "step": 7074 + }, + { + "epoch": 1.2596153846153846, + "grad_norm": 0.6402907371520996, + "learning_rate": 0.00015508769551519745, + "loss": 1.0157, + "step": 7075 + }, + { + "epoch": 1.2597934472934473, + "grad_norm": 0.6794611215591431, + "learning_rate": 0.00015507601287464805, + "loss": 1.052, + "step": 7076 + }, + { + "epoch": 1.25997150997151, + "grad_norm": 0.706922173500061, + "learning_rate": 0.0001550643291549986, + "loss": 1.0814, + "step": 7077 + }, + { + "epoch": 1.2601495726495726, + "grad_norm": 0.6722953915596008, + "learning_rate": 0.000155052644356478, + "loss": 1.1402, + "step": 7078 + }, + { + "epoch": 1.2603276353276354, + "grad_norm": 0.6619611978530884, + "learning_rate": 0.00015504095847931518, + "loss": 0.9583, + "step": 7079 + }, + { + "epoch": 1.260505698005698, + "grad_norm": 
0.5645583271980286, + "learning_rate": 0.00015502927152373914, + "loss": 0.6746, + "step": 7080 + }, + { + "epoch": 1.2606837606837606, + "grad_norm": 0.6634977459907532, + "learning_rate": 0.00015501758348997882, + "loss": 1.0451, + "step": 7081 + }, + { + "epoch": 1.2608618233618234, + "grad_norm": 0.7167651057243347, + "learning_rate": 0.00015500589437826326, + "loss": 0.931, + "step": 7082 + }, + { + "epoch": 1.2610398860398861, + "grad_norm": 0.6179340481758118, + "learning_rate": 0.00015499420418882146, + "loss": 1.0953, + "step": 7083 + }, + { + "epoch": 1.2612179487179487, + "grad_norm": 0.6948468685150146, + "learning_rate": 0.00015498251292188247, + "loss": 1.0277, + "step": 7084 + }, + { + "epoch": 1.2613960113960114, + "grad_norm": 0.6256045699119568, + "learning_rate": 0.00015497082057767532, + "loss": 1.0154, + "step": 7085 + }, + { + "epoch": 1.261574074074074, + "grad_norm": 0.6457428336143494, + "learning_rate": 0.0001549591271564292, + "loss": 0.9693, + "step": 7086 + }, + { + "epoch": 1.2617521367521367, + "grad_norm": 0.722259521484375, + "learning_rate": 0.0001549474326583731, + "loss": 0.9176, + "step": 7087 + }, + { + "epoch": 1.2619301994301995, + "grad_norm": 0.742477297782898, + "learning_rate": 0.0001549357370837362, + "loss": 0.9813, + "step": 7088 + }, + { + "epoch": 1.2621082621082622, + "grad_norm": 0.5981723666191101, + "learning_rate": 0.0001549240404327477, + "loss": 0.8943, + "step": 7089 + }, + { + "epoch": 1.2622863247863247, + "grad_norm": 0.6266574859619141, + "learning_rate": 0.00015491234270563665, + "loss": 0.8439, + "step": 7090 + }, + { + "epoch": 1.2624643874643875, + "grad_norm": 0.6723998188972473, + "learning_rate": 0.00015490064390263238, + "loss": 1.2278, + "step": 7091 + }, + { + "epoch": 1.26264245014245, + "grad_norm": 0.6628100275993347, + "learning_rate": 0.00015488894402396398, + "loss": 0.9526, + "step": 7092 + }, + { + "epoch": 1.2628205128205128, + "grad_norm": 0.6661350727081299, + "learning_rate": 
0.0001548772430698608, + "loss": 0.974, + "step": 7093 + }, + { + "epoch": 1.2629985754985755, + "grad_norm": 0.8210669755935669, + "learning_rate": 0.000154865541040552, + "loss": 1.1142, + "step": 7094 + }, + { + "epoch": 1.2631766381766383, + "grad_norm": 0.6329003572463989, + "learning_rate": 0.0001548538379362669, + "loss": 0.8485, + "step": 7095 + }, + { + "epoch": 1.2633547008547008, + "grad_norm": 0.6288384795188904, + "learning_rate": 0.0001548421337572348, + "loss": 0.816, + "step": 7096 + }, + { + "epoch": 1.2635327635327636, + "grad_norm": 0.631060004234314, + "learning_rate": 0.00015483042850368504, + "loss": 0.8237, + "step": 7097 + }, + { + "epoch": 1.263710826210826, + "grad_norm": 0.7343839406967163, + "learning_rate": 0.0001548187221758469, + "loss": 1.1507, + "step": 7098 + }, + { + "epoch": 1.2638888888888888, + "grad_norm": 0.6313042640686035, + "learning_rate": 0.0001548070147739498, + "loss": 0.7762, + "step": 7099 + }, + { + "epoch": 1.2640669515669516, + "grad_norm": 0.6449850797653198, + "learning_rate": 0.00015479530629822308, + "loss": 0.9225, + "step": 7100 + }, + { + "epoch": 1.2642450142450143, + "grad_norm": 0.6371589303016663, + "learning_rate": 0.00015478359674889617, + "loss": 1.0088, + "step": 7101 + }, + { + "epoch": 1.2644230769230769, + "grad_norm": 0.6483678221702576, + "learning_rate": 0.00015477188612619849, + "loss": 0.6234, + "step": 7102 + }, + { + "epoch": 1.2646011396011396, + "grad_norm": 0.6945441365242004, + "learning_rate": 0.00015476017443035947, + "loss": 1.123, + "step": 7103 + }, + { + "epoch": 1.2647792022792022, + "grad_norm": 0.6356340050697327, + "learning_rate": 0.00015474846166160856, + "loss": 0.9923, + "step": 7104 + }, + { + "epoch": 1.264957264957265, + "grad_norm": 0.6774702668190002, + "learning_rate": 0.00015473674782017532, + "loss": 0.9694, + "step": 7105 + }, + { + "epoch": 1.2651353276353277, + "grad_norm": 0.6332793831825256, + "learning_rate": 0.0001547250329062892, + "loss": 1.0633, + 
"step": 7106 + }, + { + "epoch": 1.2653133903133904, + "grad_norm": 0.6563684344291687, + "learning_rate": 0.00015471331692017972, + "loss": 1.0893, + "step": 7107 + }, + { + "epoch": 1.265491452991453, + "grad_norm": 0.7318371534347534, + "learning_rate": 0.0001547015998620765, + "loss": 1.1777, + "step": 7108 + }, + { + "epoch": 1.2656695156695157, + "grad_norm": 0.7099173069000244, + "learning_rate": 0.000154689881732209, + "loss": 1.1717, + "step": 7109 + }, + { + "epoch": 1.2658475783475782, + "grad_norm": 0.661078691482544, + "learning_rate": 0.00015467816253080693, + "loss": 1.0448, + "step": 7110 + }, + { + "epoch": 1.266025641025641, + "grad_norm": 0.6206802129745483, + "learning_rate": 0.0001546664422580998, + "loss": 0.9334, + "step": 7111 + }, + { + "epoch": 1.2662037037037037, + "grad_norm": 0.6514355540275574, + "learning_rate": 0.00015465472091431728, + "loss": 0.9533, + "step": 7112 + }, + { + "epoch": 1.2663817663817665, + "grad_norm": 0.6090209484100342, + "learning_rate": 0.0001546429984996891, + "loss": 0.9206, + "step": 7113 + }, + { + "epoch": 1.266559829059829, + "grad_norm": 0.6345987915992737, + "learning_rate": 0.00015463127501444488, + "loss": 1.0537, + "step": 7114 + }, + { + "epoch": 1.2667378917378918, + "grad_norm": 0.6095160245895386, + "learning_rate": 0.0001546195504588143, + "loss": 0.8652, + "step": 7115 + }, + { + "epoch": 1.2669159544159543, + "grad_norm": 0.6751621961593628, + "learning_rate": 0.00015460782483302707, + "loss": 0.9001, + "step": 7116 + }, + { + "epoch": 1.267094017094017, + "grad_norm": 0.6261575222015381, + "learning_rate": 0.00015459609813731295, + "loss": 0.929, + "step": 7117 + }, + { + "epoch": 1.2672720797720798, + "grad_norm": 0.589495837688446, + "learning_rate": 0.0001545843703719017, + "loss": 0.9023, + "step": 7118 + }, + { + "epoch": 1.2674501424501425, + "grad_norm": 0.6364617943763733, + "learning_rate": 0.00015457264153702311, + "loss": 0.8261, + "step": 7119 + }, + { + "epoch": 
1.267628205128205, + "grad_norm": 0.6685599684715271, + "learning_rate": 0.00015456091163290698, + "loss": 1.1267, + "step": 7120 + }, + { + "epoch": 1.2678062678062678, + "grad_norm": 0.6440932750701904, + "learning_rate": 0.0001545491806597831, + "loss": 0.9643, + "step": 7121 + }, + { + "epoch": 1.2679843304843303, + "grad_norm": 0.7641597390174866, + "learning_rate": 0.00015453744861788137, + "loss": 1.1577, + "step": 7122 + }, + { + "epoch": 1.268162393162393, + "grad_norm": 0.6965937614440918, + "learning_rate": 0.00015452571550743163, + "loss": 0.7835, + "step": 7123 + }, + { + "epoch": 1.2683404558404558, + "grad_norm": 0.6332844495773315, + "learning_rate": 0.00015451398132866376, + "loss": 0.9794, + "step": 7124 + }, + { + "epoch": 1.2685185185185186, + "grad_norm": 0.6719903349876404, + "learning_rate": 0.00015450224608180765, + "loss": 0.9795, + "step": 7125 + }, + { + "epoch": 1.2686965811965811, + "grad_norm": 0.567414402961731, + "learning_rate": 0.00015449050976709328, + "loss": 0.9737, + "step": 7126 + }, + { + "epoch": 1.2688746438746439, + "grad_norm": 0.6810645461082458, + "learning_rate": 0.0001544787723847505, + "loss": 1.2358, + "step": 7127 + }, + { + "epoch": 1.2690527065527066, + "grad_norm": 0.6693191528320312, + "learning_rate": 0.00015446703393500938, + "loss": 0.9475, + "step": 7128 + }, + { + "epoch": 1.2692307692307692, + "grad_norm": 0.7077522277832031, + "learning_rate": 0.00015445529441809988, + "loss": 1.013, + "step": 7129 + }, + { + "epoch": 1.269408831908832, + "grad_norm": 0.6596258878707886, + "learning_rate": 0.000154443553834252, + "loss": 1.1506, + "step": 7130 + }, + { + "epoch": 1.2695868945868947, + "grad_norm": 0.6721500754356384, + "learning_rate": 0.0001544318121836958, + "loss": 0.8848, + "step": 7131 + }, + { + "epoch": 1.2697649572649572, + "grad_norm": 0.6943998336791992, + "learning_rate": 0.00015442006946666132, + "loss": 1.1118, + "step": 7132 + }, + { + "epoch": 1.26994301994302, + "grad_norm": 
0.6132234930992126, + "learning_rate": 0.0001544083256833786, + "loss": 0.9932, + "step": 7133 + }, + { + "epoch": 1.2701210826210827, + "grad_norm": 0.7337939739227295, + "learning_rate": 0.00015439658083407775, + "loss": 1.0973, + "step": 7134 + }, + { + "epoch": 1.2702991452991452, + "grad_norm": 0.6551772356033325, + "learning_rate": 0.00015438483491898893, + "loss": 1.0006, + "step": 7135 + }, + { + "epoch": 1.270477207977208, + "grad_norm": 0.660068929195404, + "learning_rate": 0.00015437308793834223, + "loss": 0.9291, + "step": 7136 + }, + { + "epoch": 1.2706552706552707, + "grad_norm": 0.7622788548469543, + "learning_rate": 0.00015436133989236783, + "loss": 1.0782, + "step": 7137 + }, + { + "epoch": 1.2708333333333333, + "grad_norm": 0.848494291305542, + "learning_rate": 0.00015434959078129587, + "loss": 1.2001, + "step": 7138 + }, + { + "epoch": 1.271011396011396, + "grad_norm": 0.6222602725028992, + "learning_rate": 0.0001543378406053566, + "loss": 1.011, + "step": 7139 + }, + { + "epoch": 1.2711894586894588, + "grad_norm": 0.6164663434028625, + "learning_rate": 0.00015432608936478026, + "loss": 1.0282, + "step": 7140 + }, + { + "epoch": 1.2713675213675213, + "grad_norm": 0.7236546277999878, + "learning_rate": 0.000154314337059797, + "loss": 1.0112, + "step": 7141 + }, + { + "epoch": 1.271545584045584, + "grad_norm": 0.6891111135482788, + "learning_rate": 0.00015430258369063715, + "loss": 1.1191, + "step": 7142 + }, + { + "epoch": 1.2717236467236468, + "grad_norm": 0.6600295901298523, + "learning_rate": 0.00015429082925753099, + "loss": 0.9561, + "step": 7143 + }, + { + "epoch": 1.2719017094017093, + "grad_norm": 0.6819902062416077, + "learning_rate": 0.0001542790737607088, + "loss": 1.0631, + "step": 7144 + }, + { + "epoch": 1.272079772079772, + "grad_norm": 0.6518470644950867, + "learning_rate": 0.0001542673172004009, + "loss": 1.0806, + "step": 7145 + }, + { + "epoch": 1.2722578347578348, + "grad_norm": 0.737501859664917, + "learning_rate": 
0.00015425555957683767, + "loss": 1.0144, + "step": 7146 + }, + { + "epoch": 1.2724358974358974, + "grad_norm": 0.6245740652084351, + "learning_rate": 0.00015424380089024944, + "loss": 1.0612, + "step": 7147 + }, + { + "epoch": 1.27261396011396, + "grad_norm": 0.7118125557899475, + "learning_rate": 0.0001542320411408666, + "loss": 1.1458, + "step": 7148 + }, + { + "epoch": 1.2727920227920229, + "grad_norm": 0.6965761780738831, + "learning_rate": 0.00015422028032891958, + "loss": 0.8052, + "step": 7149 + }, + { + "epoch": 1.2729700854700854, + "grad_norm": 0.7661466598510742, + "learning_rate": 0.0001542085184546388, + "loss": 1.1245, + "step": 7150 + }, + { + "epoch": 1.2731481481481481, + "grad_norm": 0.7238876223564148, + "learning_rate": 0.00015419675551825475, + "loss": 0.9346, + "step": 7151 + }, + { + "epoch": 1.273326210826211, + "grad_norm": 0.669562041759491, + "learning_rate": 0.0001541849915199978, + "loss": 0.7816, + "step": 7152 + }, + { + "epoch": 1.2735042735042734, + "grad_norm": 0.6799174547195435, + "learning_rate": 0.00015417322646009855, + "loss": 1.047, + "step": 7153 + }, + { + "epoch": 1.2736823361823362, + "grad_norm": 0.6012796759605408, + "learning_rate": 0.00015416146033878745, + "loss": 1.0101, + "step": 7154 + }, + { + "epoch": 1.273860398860399, + "grad_norm": 0.7008427977561951, + "learning_rate": 0.00015414969315629505, + "loss": 1.1321, + "step": 7155 + }, + { + "epoch": 1.2740384615384617, + "grad_norm": 0.6555556058883667, + "learning_rate": 0.0001541379249128519, + "loss": 0.9926, + "step": 7156 + }, + { + "epoch": 1.2742165242165242, + "grad_norm": 0.6324251294136047, + "learning_rate": 0.00015412615560868854, + "loss": 0.9051, + "step": 7157 + }, + { + "epoch": 1.274394586894587, + "grad_norm": 0.6035568714141846, + "learning_rate": 0.0001541143852440356, + "loss": 0.8248, + "step": 7158 + }, + { + "epoch": 1.2745726495726495, + "grad_norm": 0.6733569502830505, + "learning_rate": 0.0001541026138191237, + "loss": 0.9149, + 
"step": 7159 + }, + { + "epoch": 1.2747507122507122, + "grad_norm": 0.8306798338890076, + "learning_rate": 0.0001540908413341835, + "loss": 1.0694, + "step": 7160 + }, + { + "epoch": 1.274928774928775, + "grad_norm": 0.6649713516235352, + "learning_rate": 0.00015407906778944563, + "loss": 1.1358, + "step": 7161 + }, + { + "epoch": 1.2751068376068377, + "grad_norm": 0.6889697909355164, + "learning_rate": 0.00015406729318514074, + "loss": 1.0096, + "step": 7162 + }, + { + "epoch": 1.2752849002849003, + "grad_norm": 0.6948645114898682, + "learning_rate": 0.0001540555175214996, + "loss": 1.0649, + "step": 7163 + }, + { + "epoch": 1.275462962962963, + "grad_norm": 0.6844844818115234, + "learning_rate": 0.0001540437407987528, + "loss": 0.884, + "step": 7164 + }, + { + "epoch": 1.2756410256410255, + "grad_norm": 0.7124526500701904, + "learning_rate": 0.00015403196301713124, + "loss": 1.1307, + "step": 7165 + }, + { + "epoch": 1.2758190883190883, + "grad_norm": 0.7328375577926636, + "learning_rate": 0.00015402018417686556, + "loss": 1.0348, + "step": 7166 + }, + { + "epoch": 1.275997150997151, + "grad_norm": 0.5872696042060852, + "learning_rate": 0.00015400840427818663, + "loss": 0.9827, + "step": 7167 + }, + { + "epoch": 1.2761752136752138, + "grad_norm": 0.6370702385902405, + "learning_rate": 0.00015399662332132519, + "loss": 0.9171, + "step": 7168 + }, + { + "epoch": 1.2763532763532763, + "grad_norm": 0.6481866240501404, + "learning_rate": 0.00015398484130651205, + "loss": 0.8704, + "step": 7169 + }, + { + "epoch": 1.276531339031339, + "grad_norm": 0.598739743232727, + "learning_rate": 0.00015397305823397812, + "loss": 0.8097, + "step": 7170 + }, + { + "epoch": 1.2767094017094016, + "grad_norm": 0.5941228270530701, + "learning_rate": 0.00015396127410395423, + "loss": 0.8853, + "step": 7171 + }, + { + "epoch": 1.2768874643874644, + "grad_norm": 0.6485885381698608, + "learning_rate": 0.00015394948891667127, + "loss": 0.702, + "step": 7172 + }, + { + "epoch": 
1.2770655270655271, + "grad_norm": 0.5314942598342896, + "learning_rate": 0.00015393770267236017, + "loss": 0.7899, + "step": 7173 + }, + { + "epoch": 1.2772435897435899, + "grad_norm": 0.6113781929016113, + "learning_rate": 0.00015392591537125182, + "loss": 0.9871, + "step": 7174 + }, + { + "epoch": 1.2774216524216524, + "grad_norm": 0.5625866651535034, + "learning_rate": 0.00015391412701357715, + "loss": 0.8246, + "step": 7175 + }, + { + "epoch": 1.2775997150997151, + "grad_norm": 0.6006998419761658, + "learning_rate": 0.00015390233759956718, + "loss": 0.899, + "step": 7176 + }, + { + "epoch": 1.2777777777777777, + "grad_norm": 0.6916918158531189, + "learning_rate": 0.0001538905471294529, + "loss": 1.0443, + "step": 7177 + }, + { + "epoch": 1.2779558404558404, + "grad_norm": 0.6263536810874939, + "learning_rate": 0.00015387875560346525, + "loss": 0.9159, + "step": 7178 + }, + { + "epoch": 1.2781339031339032, + "grad_norm": 0.6563085913658142, + "learning_rate": 0.00015386696302183535, + "loss": 0.994, + "step": 7179 + }, + { + "epoch": 1.278311965811966, + "grad_norm": 0.6312007904052734, + "learning_rate": 0.00015385516938479416, + "loss": 0.9148, + "step": 7180 + }, + { + "epoch": 1.2784900284900285, + "grad_norm": 0.6408209204673767, + "learning_rate": 0.00015384337469257284, + "loss": 1.0508, + "step": 7181 + }, + { + "epoch": 1.2786680911680912, + "grad_norm": 0.656234085559845, + "learning_rate": 0.00015383157894540244, + "loss": 0.9952, + "step": 7182 + }, + { + "epoch": 1.2788461538461537, + "grad_norm": 0.7401639819145203, + "learning_rate": 0.00015381978214351407, + "loss": 1.1615, + "step": 7183 + }, + { + "epoch": 1.2790242165242165, + "grad_norm": 0.5746055841445923, + "learning_rate": 0.00015380798428713885, + "loss": 0.9142, + "step": 7184 + }, + { + "epoch": 1.2792022792022792, + "grad_norm": 0.8061720728874207, + "learning_rate": 0.00015379618537650797, + "loss": 1.13, + "step": 7185 + }, + { + "epoch": 1.279380341880342, + "grad_norm": 
0.6336073875427246, + "learning_rate": 0.0001537843854118526, + "loss": 1.0581, + "step": 7186 + }, + { + "epoch": 1.2795584045584045, + "grad_norm": 0.6549856066703796, + "learning_rate": 0.0001537725843934039, + "loss": 1.09, + "step": 7187 + }, + { + "epoch": 1.2797364672364673, + "grad_norm": 0.5759010910987854, + "learning_rate": 0.00015376078232139315, + "loss": 0.8441, + "step": 7188 + }, + { + "epoch": 1.2799145299145298, + "grad_norm": 0.5733884572982788, + "learning_rate": 0.00015374897919605152, + "loss": 0.9086, + "step": 7189 + }, + { + "epoch": 1.2800925925925926, + "grad_norm": 0.6505870819091797, + "learning_rate": 0.0001537371750176103, + "loss": 1.1683, + "step": 7190 + }, + { + "epoch": 1.2802706552706553, + "grad_norm": 0.6744688153266907, + "learning_rate": 0.00015372536978630077, + "loss": 0.9483, + "step": 7191 + }, + { + "epoch": 1.280448717948718, + "grad_norm": 0.598098874092102, + "learning_rate": 0.0001537135635023542, + "loss": 0.7747, + "step": 7192 + }, + { + "epoch": 1.2806267806267806, + "grad_norm": 0.6711761951446533, + "learning_rate": 0.00015370175616600195, + "loss": 1.1897, + "step": 7193 + }, + { + "epoch": 1.2808048433048433, + "grad_norm": 0.6207453608512878, + "learning_rate": 0.00015368994777747536, + "loss": 1.0063, + "step": 7194 + }, + { + "epoch": 1.2809829059829059, + "grad_norm": 0.6701686382293701, + "learning_rate": 0.00015367813833700575, + "loss": 1.0864, + "step": 7195 + }, + { + "epoch": 1.2811609686609686, + "grad_norm": 0.5916469693183899, + "learning_rate": 0.00015366632784482456, + "loss": 0.8786, + "step": 7196 + }, + { + "epoch": 1.2813390313390314, + "grad_norm": 0.6567547917366028, + "learning_rate": 0.00015365451630116312, + "loss": 0.9977, + "step": 7197 + }, + { + "epoch": 1.2815170940170941, + "grad_norm": 0.7287433743476868, + "learning_rate": 0.00015364270370625294, + "loss": 1.1248, + "step": 7198 + }, + { + "epoch": 1.2816951566951567, + "grad_norm": 0.7736039161682129, + "learning_rate": 
0.0001536308900603254, + "loss": 0.9832, + "step": 7199 + }, + { + "epoch": 1.2818732193732194, + "grad_norm": 0.6799852252006531, + "learning_rate": 0.00015361907536361194, + "loss": 1.0275, + "step": 7200 + }, + { + "epoch": 1.282051282051282, + "grad_norm": 0.5975812673568726, + "learning_rate": 0.00015360725961634407, + "loss": 1.0516, + "step": 7201 + }, + { + "epoch": 1.2822293447293447, + "grad_norm": 0.616307258605957, + "learning_rate": 0.00015359544281875337, + "loss": 0.8095, + "step": 7202 + }, + { + "epoch": 1.2824074074074074, + "grad_norm": 0.6357580423355103, + "learning_rate": 0.00015358362497107126, + "loss": 0.9186, + "step": 7203 + }, + { + "epoch": 1.2825854700854702, + "grad_norm": 0.679333508014679, + "learning_rate": 0.00015357180607352935, + "loss": 0.9433, + "step": 7204 + }, + { + "epoch": 1.2827635327635327, + "grad_norm": 0.6345439553260803, + "learning_rate": 0.00015355998612635914, + "loss": 0.9186, + "step": 7205 + }, + { + "epoch": 1.2829415954415955, + "grad_norm": 0.6256508827209473, + "learning_rate": 0.00015354816512979231, + "loss": 0.9984, + "step": 7206 + }, + { + "epoch": 1.283119658119658, + "grad_norm": 0.7973852753639221, + "learning_rate": 0.00015353634308406044, + "loss": 1.1145, + "step": 7207 + }, + { + "epoch": 1.2832977207977208, + "grad_norm": 0.711125910282135, + "learning_rate": 0.0001535245199893951, + "loss": 1.1947, + "step": 7208 + }, + { + "epoch": 1.2834757834757835, + "grad_norm": 0.6096055507659912, + "learning_rate": 0.00015351269584602798, + "loss": 1.0078, + "step": 7209 + }, + { + "epoch": 1.2836538461538463, + "grad_norm": 0.7089232802391052, + "learning_rate": 0.00015350087065419077, + "loss": 1.112, + "step": 7210 + }, + { + "epoch": 1.2838319088319088, + "grad_norm": 0.716199517250061, + "learning_rate": 0.00015348904441411508, + "loss": 1.1015, + "step": 7211 + }, + { + "epoch": 1.2840099715099715, + "grad_norm": 0.6374632716178894, + "learning_rate": 0.00015347721712603276, + "loss": 1.0519, + 
"step": 7212 + }, + { + "epoch": 1.284188034188034, + "grad_norm": 0.6500036716461182, + "learning_rate": 0.0001534653887901754, + "loss": 1.1719, + "step": 7213 + }, + { + "epoch": 1.2843660968660968, + "grad_norm": 0.7249937653541565, + "learning_rate": 0.00015345355940677485, + "loss": 1.0188, + "step": 7214 + }, + { + "epoch": 1.2845441595441596, + "grad_norm": 0.6645919680595398, + "learning_rate": 0.00015344172897606285, + "loss": 0.9788, + "step": 7215 + }, + { + "epoch": 1.2847222222222223, + "grad_norm": 0.7032710313796997, + "learning_rate": 0.00015342989749827113, + "loss": 1.1093, + "step": 7216 + }, + { + "epoch": 1.2849002849002849, + "grad_norm": 0.622767984867096, + "learning_rate": 0.0001534180649736316, + "loss": 0.8978, + "step": 7217 + }, + { + "epoch": 1.2850783475783476, + "grad_norm": 0.7499693036079407, + "learning_rate": 0.00015340623140237605, + "loss": 1.2232, + "step": 7218 + }, + { + "epoch": 1.2852564102564101, + "grad_norm": 0.6308625936508179, + "learning_rate": 0.00015339439678473636, + "loss": 0.8621, + "step": 7219 + }, + { + "epoch": 1.2854344729344729, + "grad_norm": 0.6513667106628418, + "learning_rate": 0.00015338256112094434, + "loss": 1.0541, + "step": 7220 + }, + { + "epoch": 1.2856125356125356, + "grad_norm": 0.6080937385559082, + "learning_rate": 0.00015337072441123193, + "loss": 0.8474, + "step": 7221 + }, + { + "epoch": 1.2857905982905984, + "grad_norm": 0.6742652058601379, + "learning_rate": 0.00015335888665583104, + "loss": 1.0172, + "step": 7222 + }, + { + "epoch": 1.285968660968661, + "grad_norm": 0.620810866355896, + "learning_rate": 0.00015334704785497364, + "loss": 1.049, + "step": 7223 + }, + { + "epoch": 1.2861467236467237, + "grad_norm": 0.5733018517494202, + "learning_rate": 0.00015333520800889165, + "loss": 0.7371, + "step": 7224 + }, + { + "epoch": 1.2863247863247862, + "grad_norm": 0.6447640061378479, + "learning_rate": 0.00015332336711781702, + "loss": 0.9925, + "step": 7225 + }, + { + "epoch": 
1.286502849002849, + "grad_norm": 0.6764999628067017, + "learning_rate": 0.00015331152518198183, + "loss": 0.9052, + "step": 7226 + }, + { + "epoch": 1.2866809116809117, + "grad_norm": 0.6492836475372314, + "learning_rate": 0.00015329968220161803, + "loss": 0.9493, + "step": 7227 + }, + { + "epoch": 1.2868589743589745, + "grad_norm": 0.666157603263855, + "learning_rate": 0.00015328783817695766, + "loss": 1.0626, + "step": 7228 + }, + { + "epoch": 1.287037037037037, + "grad_norm": 0.7098026871681213, + "learning_rate": 0.00015327599310823283, + "loss": 1.0461, + "step": 7229 + }, + { + "epoch": 1.2872150997150997, + "grad_norm": 0.637778103351593, + "learning_rate": 0.00015326414699567555, + "loss": 0.9383, + "step": 7230 + }, + { + "epoch": 1.2873931623931623, + "grad_norm": 0.6816399693489075, + "learning_rate": 0.00015325229983951798, + "loss": 1.0647, + "step": 7231 + }, + { + "epoch": 1.287571225071225, + "grad_norm": 0.668689489364624, + "learning_rate": 0.0001532404516399922, + "loss": 1.0479, + "step": 7232 + }, + { + "epoch": 1.2877492877492878, + "grad_norm": 0.6459103226661682, + "learning_rate": 0.0001532286023973304, + "loss": 1.1751, + "step": 7233 + }, + { + "epoch": 1.2879273504273505, + "grad_norm": 0.679999589920044, + "learning_rate": 0.00015321675211176468, + "loss": 0.7541, + "step": 7234 + }, + { + "epoch": 1.288105413105413, + "grad_norm": 0.5415067672729492, + "learning_rate": 0.00015320490078352724, + "loss": 0.822, + "step": 7235 + }, + { + "epoch": 1.2882834757834758, + "grad_norm": 0.6817963719367981, + "learning_rate": 0.00015319304841285032, + "loss": 0.9424, + "step": 7236 + }, + { + "epoch": 1.2884615384615383, + "grad_norm": 0.6187505125999451, + "learning_rate": 0.0001531811949999661, + "loss": 0.8596, + "step": 7237 + }, + { + "epoch": 1.288639601139601, + "grad_norm": 0.6737838387489319, + "learning_rate": 0.00015316934054510685, + "loss": 1.0046, + "step": 7238 + }, + { + "epoch": 1.2888176638176638, + "grad_norm": 
0.6445996761322021, + "learning_rate": 0.00015315748504850482, + "loss": 1.01, + "step": 7239 + }, + { + "epoch": 1.2889957264957266, + "grad_norm": 0.7279136180877686, + "learning_rate": 0.0001531456285103923, + "loss": 0.9066, + "step": 7240 + }, + { + "epoch": 1.289173789173789, + "grad_norm": 0.6619178652763367, + "learning_rate": 0.00015313377093100153, + "loss": 0.8977, + "step": 7241 + }, + { + "epoch": 1.2893518518518519, + "grad_norm": 0.7644323110580444, + "learning_rate": 0.000153121912310565, + "loss": 1.3085, + "step": 7242 + }, + { + "epoch": 1.2895299145299146, + "grad_norm": 0.645882248878479, + "learning_rate": 0.00015311005264931487, + "loss": 1.0337, + "step": 7243 + }, + { + "epoch": 1.2897079772079771, + "grad_norm": 0.6868017911911011, + "learning_rate": 0.0001530981919474836, + "loss": 0.9616, + "step": 7244 + }, + { + "epoch": 1.28988603988604, + "grad_norm": 0.7176693677902222, + "learning_rate": 0.00015308633020530362, + "loss": 1.1975, + "step": 7245 + }, + { + "epoch": 1.2900641025641026, + "grad_norm": 0.7358015775680542, + "learning_rate": 0.00015307446742300718, + "loss": 0.9308, + "step": 7246 + }, + { + "epoch": 1.2902421652421652, + "grad_norm": 0.7330248355865479, + "learning_rate": 0.00015306260360082688, + "loss": 0.9518, + "step": 7247 + }, + { + "epoch": 1.290420227920228, + "grad_norm": 0.6571981310844421, + "learning_rate": 0.00015305073873899503, + "loss": 0.9531, + "step": 7248 + }, + { + "epoch": 1.2905982905982907, + "grad_norm": 0.5968486666679382, + "learning_rate": 0.00015303887283774417, + "loss": 0.9245, + "step": 7249 + }, + { + "epoch": 1.2907763532763532, + "grad_norm": 0.6398176550865173, + "learning_rate": 0.0001530270058973068, + "loss": 1.0452, + "step": 7250 + }, + { + "epoch": 1.290954415954416, + "grad_norm": 0.5462267994880676, + "learning_rate": 0.00015301513791791542, + "loss": 0.8451, + "step": 7251 + }, + { + "epoch": 1.2911324786324787, + "grad_norm": 0.7536166906356812, + "learning_rate": 
0.00015300326889980252, + "loss": 1.0086, + "step": 7252 + }, + { + "epoch": 1.2913105413105412, + "grad_norm": 0.6208569407463074, + "learning_rate": 0.00015299139884320065, + "loss": 0.7437, + "step": 7253 + }, + { + "epoch": 1.291488603988604, + "grad_norm": 0.7025452852249146, + "learning_rate": 0.00015297952774834242, + "loss": 0.8874, + "step": 7254 + }, + { + "epoch": 1.2916666666666667, + "grad_norm": 0.6758308410644531, + "learning_rate": 0.00015296765561546041, + "loss": 1.0378, + "step": 7255 + }, + { + "epoch": 1.2918447293447293, + "grad_norm": 0.7170431613922119, + "learning_rate": 0.00015295578244478724, + "loss": 1.0111, + "step": 7256 + }, + { + "epoch": 1.292022792022792, + "grad_norm": 0.6263511180877686, + "learning_rate": 0.00015294390823655544, + "loss": 0.7836, + "step": 7257 + }, + { + "epoch": 1.2922008547008548, + "grad_norm": 0.5887803435325623, + "learning_rate": 0.0001529320329909978, + "loss": 1.068, + "step": 7258 + }, + { + "epoch": 1.2923789173789173, + "grad_norm": 0.5955889821052551, + "learning_rate": 0.00015292015670834692, + "loss": 0.8903, + "step": 7259 + }, + { + "epoch": 1.29255698005698, + "grad_norm": 0.630449652671814, + "learning_rate": 0.00015290827938883552, + "loss": 1.1096, + "step": 7260 + }, + { + "epoch": 1.2927350427350428, + "grad_norm": 0.7405480146408081, + "learning_rate": 0.00015289640103269625, + "loss": 1.0648, + "step": 7261 + }, + { + "epoch": 1.2929131054131053, + "grad_norm": 0.6082221865653992, + "learning_rate": 0.00015288452164016191, + "loss": 0.9266, + "step": 7262 + }, + { + "epoch": 1.293091168091168, + "grad_norm": 0.6211720108985901, + "learning_rate": 0.00015287264121146524, + "loss": 0.849, + "step": 7263 + }, + { + "epoch": 1.2932692307692308, + "grad_norm": 0.6481043100357056, + "learning_rate": 0.00015286075974683898, + "loss": 0.7761, + "step": 7264 + }, + { + "epoch": 1.2934472934472934, + "grad_norm": 0.5957167744636536, + "learning_rate": 0.00015284887724651593, + "loss": 0.8942, + 
"step": 7265 + }, + { + "epoch": 1.2936253561253561, + "grad_norm": 0.7272268533706665, + "learning_rate": 0.00015283699371072894, + "loss": 1.0913, + "step": 7266 + }, + { + "epoch": 1.2938034188034189, + "grad_norm": 0.5902758836746216, + "learning_rate": 0.0001528251091397108, + "loss": 1.1045, + "step": 7267 + }, + { + "epoch": 1.2939814814814814, + "grad_norm": 0.6382482051849365, + "learning_rate": 0.00015281322353369436, + "loss": 0.9265, + "step": 7268 + }, + { + "epoch": 1.2941595441595442, + "grad_norm": 0.6556048393249512, + "learning_rate": 0.00015280133689291256, + "loss": 1.0536, + "step": 7269 + }, + { + "epoch": 1.294337606837607, + "grad_norm": 0.680895209312439, + "learning_rate": 0.00015278944921759822, + "loss": 0.9996, + "step": 7270 + }, + { + "epoch": 1.2945156695156697, + "grad_norm": 0.670317530632019, + "learning_rate": 0.00015277756050798428, + "loss": 1.1402, + "step": 7271 + }, + { + "epoch": 1.2946937321937322, + "grad_norm": 0.6312688589096069, + "learning_rate": 0.0001527656707643037, + "loss": 1.0669, + "step": 7272 + }, + { + "epoch": 1.294871794871795, + "grad_norm": 0.6267009973526001, + "learning_rate": 0.0001527537799867894, + "loss": 0.8985, + "step": 7273 + }, + { + "epoch": 1.2950498575498575, + "grad_norm": 0.7069001197814941, + "learning_rate": 0.00015274188817567436, + "loss": 0.9478, + "step": 7274 + }, + { + "epoch": 1.2952279202279202, + "grad_norm": 0.7229067087173462, + "learning_rate": 0.00015272999533119162, + "loss": 0.9005, + "step": 7275 + }, + { + "epoch": 1.295405982905983, + "grad_norm": 0.6254632472991943, + "learning_rate": 0.00015271810145357412, + "loss": 0.9746, + "step": 7276 + }, + { + "epoch": 1.2955840455840457, + "grad_norm": 0.6772669553756714, + "learning_rate": 0.00015270620654305494, + "loss": 1.1714, + "step": 7277 + }, + { + "epoch": 1.2957621082621082, + "grad_norm": 0.605576753616333, + "learning_rate": 0.00015269431059986713, + "loss": 0.7735, + "step": 7278 + }, + { + "epoch": 
1.295940170940171, + "grad_norm": 0.7144771814346313, + "learning_rate": 0.00015268241362424378, + "loss": 0.9757, + "step": 7279 + }, + { + "epoch": 1.2961182336182335, + "grad_norm": 0.5275486707687378, + "learning_rate": 0.00015267051561641798, + "loss": 0.5669, + "step": 7280 + }, + { + "epoch": 1.2962962962962963, + "grad_norm": 0.6619452238082886, + "learning_rate": 0.00015265861657662284, + "loss": 0.9511, + "step": 7281 + }, + { + "epoch": 1.296474358974359, + "grad_norm": 0.6788223385810852, + "learning_rate": 0.00015264671650509147, + "loss": 1.2649, + "step": 7282 + }, + { + "epoch": 1.2966524216524218, + "grad_norm": 0.6198732852935791, + "learning_rate": 0.00015263481540205706, + "loss": 1.0659, + "step": 7283 + }, + { + "epoch": 1.2968304843304843, + "grad_norm": 0.6038815975189209, + "learning_rate": 0.0001526229132677528, + "loss": 1.0655, + "step": 7284 + }, + { + "epoch": 1.297008547008547, + "grad_norm": 0.7616196870803833, + "learning_rate": 0.00015261101010241186, + "loss": 1.131, + "step": 7285 + }, + { + "epoch": 1.2971866096866096, + "grad_norm": 0.7002527713775635, + "learning_rate": 0.00015259910590626746, + "loss": 1.1375, + "step": 7286 + }, + { + "epoch": 1.2973646723646723, + "grad_norm": 0.6067437529563904, + "learning_rate": 0.00015258720067955284, + "loss": 0.9306, + "step": 7287 + }, + { + "epoch": 1.297542735042735, + "grad_norm": 0.653232216835022, + "learning_rate": 0.00015257529442250128, + "loss": 1.107, + "step": 7288 + }, + { + "epoch": 1.2977207977207978, + "grad_norm": 0.6969175934791565, + "learning_rate": 0.00015256338713534603, + "loss": 0.8365, + "step": 7289 + }, + { + "epoch": 1.2978988603988604, + "grad_norm": 0.6176731586456299, + "learning_rate": 0.00015255147881832043, + "loss": 0.9707, + "step": 7290 + }, + { + "epoch": 1.2980769230769231, + "grad_norm": 0.6543741822242737, + "learning_rate": 0.00015253956947165772, + "loss": 0.7714, + "step": 7291 + }, + { + "epoch": 1.2982549857549857, + "grad_norm": 
0.5224920511245728, + "learning_rate": 0.00015252765909559135, + "loss": 0.7469, + "step": 7292 + }, + { + "epoch": 1.2984330484330484, + "grad_norm": 0.638708770275116, + "learning_rate": 0.00015251574769035455, + "loss": 1.0965, + "step": 7293 + }, + { + "epoch": 1.2986111111111112, + "grad_norm": 0.6742943525314331, + "learning_rate": 0.0001525038352561808, + "loss": 1.1286, + "step": 7294 + }, + { + "epoch": 1.298789173789174, + "grad_norm": 0.6027839183807373, + "learning_rate": 0.00015249192179330346, + "loss": 0.8824, + "step": 7295 + }, + { + "epoch": 1.2989672364672364, + "grad_norm": 0.7462167143821716, + "learning_rate": 0.00015248000730195597, + "loss": 0.94, + "step": 7296 + }, + { + "epoch": 1.2991452991452992, + "grad_norm": 0.6972534656524658, + "learning_rate": 0.00015246809178237172, + "loss": 1.0664, + "step": 7297 + }, + { + "epoch": 1.2993233618233617, + "grad_norm": 0.569949209690094, + "learning_rate": 0.0001524561752347842, + "loss": 0.691, + "step": 7298 + }, + { + "epoch": 1.2995014245014245, + "grad_norm": 0.6066586375236511, + "learning_rate": 0.00015244425765942695, + "loss": 1.083, + "step": 7299 + }, + { + "epoch": 1.2996794871794872, + "grad_norm": 0.6927483677864075, + "learning_rate": 0.00015243233905653337, + "loss": 1.0068, + "step": 7300 + }, + { + "epoch": 1.29985754985755, + "grad_norm": 0.752824604511261, + "learning_rate": 0.00015242041942633704, + "loss": 0.9946, + "step": 7301 + }, + { + "epoch": 1.3000356125356125, + "grad_norm": 0.6532080173492432, + "learning_rate": 0.0001524084987690715, + "loss": 1.2326, + "step": 7302 + }, + { + "epoch": 1.3002136752136753, + "grad_norm": 0.7954180836677551, + "learning_rate": 0.0001523965770849703, + "loss": 1.1105, + "step": 7303 + }, + { + "epoch": 1.3003917378917378, + "grad_norm": 0.5971781015396118, + "learning_rate": 0.000152384654374267, + "loss": 1.0984, + "step": 7304 + }, + { + "epoch": 1.3005698005698005, + "grad_norm": 0.7778682112693787, + "learning_rate": 
0.0001523727306371952, + "loss": 1.0795, + "step": 7305 + }, + { + "epoch": 1.3007478632478633, + "grad_norm": 0.6712004542350769, + "learning_rate": 0.00015236080587398856, + "loss": 1.0814, + "step": 7306 + }, + { + "epoch": 1.300925925925926, + "grad_norm": 0.581048846244812, + "learning_rate": 0.00015234888008488066, + "loss": 0.9868, + "step": 7307 + }, + { + "epoch": 1.3011039886039886, + "grad_norm": 0.697695791721344, + "learning_rate": 0.00015233695327010523, + "loss": 1.1045, + "step": 7308 + }, + { + "epoch": 1.3012820512820513, + "grad_norm": 0.6858421564102173, + "learning_rate": 0.00015232502542989593, + "loss": 1.0769, + "step": 7309 + }, + { + "epoch": 1.3014601139601139, + "grad_norm": 0.6312826871871948, + "learning_rate": 0.00015231309656448642, + "loss": 0.9523, + "step": 7310 + }, + { + "epoch": 1.3016381766381766, + "grad_norm": 0.9243300557136536, + "learning_rate": 0.0001523011666741105, + "loss": 0.947, + "step": 7311 + }, + { + "epoch": 1.3018162393162394, + "grad_norm": 0.6808217763900757, + "learning_rate": 0.00015228923575900184, + "loss": 0.8631, + "step": 7312 + }, + { + "epoch": 1.301994301994302, + "grad_norm": 0.6713891625404358, + "learning_rate": 0.00015227730381939424, + "loss": 0.9157, + "step": 7313 + }, + { + "epoch": 1.3021723646723646, + "grad_norm": 0.6802582740783691, + "learning_rate": 0.00015226537085552146, + "loss": 1.041, + "step": 7314 + }, + { + "epoch": 1.3023504273504274, + "grad_norm": 0.6543951034545898, + "learning_rate": 0.0001522534368676173, + "loss": 0.8709, + "step": 7315 + }, + { + "epoch": 1.30252849002849, + "grad_norm": 0.6290678381919861, + "learning_rate": 0.0001522415018559156, + "loss": 1.0568, + "step": 7316 + }, + { + "epoch": 1.3027065527065527, + "grad_norm": 0.6590015292167664, + "learning_rate": 0.0001522295658206502, + "loss": 0.9919, + "step": 7317 + }, + { + "epoch": 1.3028846153846154, + "grad_norm": 0.6374103426933289, + "learning_rate": 0.00015221762876205494, + "loss": 0.878, + 
"step": 7318 + }, + { + "epoch": 1.3030626780626782, + "grad_norm": 0.7247048616409302, + "learning_rate": 0.00015220569068036372, + "loss": 1.061, + "step": 7319 + }, + { + "epoch": 1.3032407407407407, + "grad_norm": 0.6450991630554199, + "learning_rate": 0.00015219375157581047, + "loss": 0.9389, + "step": 7320 + }, + { + "epoch": 1.3034188034188035, + "grad_norm": 0.8039840459823608, + "learning_rate": 0.00015218181144862903, + "loss": 1.0692, + "step": 7321 + }, + { + "epoch": 1.303596866096866, + "grad_norm": 0.6539456248283386, + "learning_rate": 0.00015216987029905346, + "loss": 1.0478, + "step": 7322 + }, + { + "epoch": 1.3037749287749287, + "grad_norm": 0.60880047082901, + "learning_rate": 0.00015215792812731758, + "loss": 0.8412, + "step": 7323 + }, + { + "epoch": 1.3039529914529915, + "grad_norm": 0.6757258176803589, + "learning_rate": 0.0001521459849336555, + "loss": 0.896, + "step": 7324 + }, + { + "epoch": 1.3041310541310542, + "grad_norm": 0.6735622882843018, + "learning_rate": 0.00015213404071830116, + "loss": 1.1078, + "step": 7325 + }, + { + "epoch": 1.3043091168091168, + "grad_norm": 0.7321233749389648, + "learning_rate": 0.00015212209548148858, + "loss": 1.1021, + "step": 7326 + }, + { + "epoch": 1.3044871794871795, + "grad_norm": 0.6678910851478577, + "learning_rate": 0.00015211014922345182, + "loss": 1.0043, + "step": 7327 + }, + { + "epoch": 1.304665242165242, + "grad_norm": 0.6876940727233887, + "learning_rate": 0.0001520982019444249, + "loss": 1.0376, + "step": 7328 + }, + { + "epoch": 1.3048433048433048, + "grad_norm": 0.6171853542327881, + "learning_rate": 0.00015208625364464195, + "loss": 0.839, + "step": 7329 + }, + { + "epoch": 1.3050213675213675, + "grad_norm": 0.6449569463729858, + "learning_rate": 0.0001520743043243371, + "loss": 1.0908, + "step": 7330 + }, + { + "epoch": 1.3051994301994303, + "grad_norm": 0.6894628405570984, + "learning_rate": 0.00015206235398374443, + "loss": 1.0263, + "step": 7331 + }, + { + "epoch": 
1.3053774928774928, + "grad_norm": 0.5853552222251892, + "learning_rate": 0.00015205040262309804, + "loss": 0.8342, + "step": 7332 + }, + { + "epoch": 1.3055555555555556, + "grad_norm": 0.5934799313545227, + "learning_rate": 0.00015203845024263214, + "loss": 0.9464, + "step": 7333 + }, + { + "epoch": 1.305733618233618, + "grad_norm": 0.668927788734436, + "learning_rate": 0.00015202649684258095, + "loss": 0.9018, + "step": 7334 + }, + { + "epoch": 1.3059116809116809, + "grad_norm": 0.676810085773468, + "learning_rate": 0.0001520145424231786, + "loss": 0.9284, + "step": 7335 + }, + { + "epoch": 1.3060897435897436, + "grad_norm": 0.6223878264427185, + "learning_rate": 0.00015200258698465935, + "loss": 1.0779, + "step": 7336 + }, + { + "epoch": 1.3062678062678064, + "grad_norm": 0.6092363595962524, + "learning_rate": 0.00015199063052725745, + "loss": 0.8602, + "step": 7337 + }, + { + "epoch": 1.306445868945869, + "grad_norm": 0.7668731212615967, + "learning_rate": 0.00015197867305120712, + "loss": 1.0756, + "step": 7338 + }, + { + "epoch": 1.3066239316239316, + "grad_norm": 0.6485331654548645, + "learning_rate": 0.00015196671455674268, + "loss": 1.0193, + "step": 7339 + }, + { + "epoch": 1.3068019943019942, + "grad_norm": 0.5661036372184753, + "learning_rate": 0.0001519547550440984, + "loss": 0.8321, + "step": 7340 + }, + { + "epoch": 1.306980056980057, + "grad_norm": 0.6270507574081421, + "learning_rate": 0.00015194279451350866, + "loss": 0.6403, + "step": 7341 + }, + { + "epoch": 1.3071581196581197, + "grad_norm": 0.7283764481544495, + "learning_rate": 0.00015193083296520773, + "loss": 1.0401, + "step": 7342 + }, + { + "epoch": 1.3073361823361824, + "grad_norm": 0.658835232257843, + "learning_rate": 0.00015191887039943, + "loss": 1.0172, + "step": 7343 + }, + { + "epoch": 1.307514245014245, + "grad_norm": 0.6288984417915344, + "learning_rate": 0.00015190690681640988, + "loss": 0.8649, + "step": 7344 + }, + { + "epoch": 1.3076923076923077, + "grad_norm": 
0.666442334651947, + "learning_rate": 0.00015189494221638176, + "loss": 1.0757, + "step": 7345 + }, + { + "epoch": 1.3078703703703702, + "grad_norm": 0.6116433143615723, + "learning_rate": 0.00015188297659958003, + "loss": 0.9244, + "step": 7346 + }, + { + "epoch": 1.308048433048433, + "grad_norm": 0.6378964185714722, + "learning_rate": 0.0001518710099662392, + "loss": 0.9629, + "step": 7347 + }, + { + "epoch": 1.3082264957264957, + "grad_norm": 0.6258945465087891, + "learning_rate": 0.00015185904231659357, + "loss": 0.8524, + "step": 7348 + }, + { + "epoch": 1.3084045584045585, + "grad_norm": 0.6498504877090454, + "learning_rate": 0.0001518470736508778, + "loss": 0.9685, + "step": 7349 + }, + { + "epoch": 1.308582621082621, + "grad_norm": 0.6928247809410095, + "learning_rate": 0.00015183510396932635, + "loss": 0.9054, + "step": 7350 + }, + { + "epoch": 1.3087606837606838, + "grad_norm": 0.6350936889648438, + "learning_rate": 0.0001518231332721737, + "loss": 1.0039, + "step": 7351 + }, + { + "epoch": 1.3089387464387463, + "grad_norm": 0.6652286648750305, + "learning_rate": 0.00015181116155965437, + "loss": 0.8946, + "step": 7352 + }, + { + "epoch": 1.309116809116809, + "grad_norm": 0.6554864048957825, + "learning_rate": 0.000151799188832003, + "loss": 0.9518, + "step": 7353 + }, + { + "epoch": 1.3092948717948718, + "grad_norm": 0.7523114085197449, + "learning_rate": 0.0001517872150894541, + "loss": 0.9462, + "step": 7354 + }, + { + "epoch": 1.3094729344729346, + "grad_norm": 0.7113336324691772, + "learning_rate": 0.0001517752403322423, + "loss": 1.2347, + "step": 7355 + }, + { + "epoch": 1.309650997150997, + "grad_norm": 0.6461622714996338, + "learning_rate": 0.00015176326456060223, + "loss": 0.8891, + "step": 7356 + }, + { + "epoch": 1.3098290598290598, + "grad_norm": 0.7429143190383911, + "learning_rate": 0.00015175128777476852, + "loss": 1.1944, + "step": 7357 + }, + { + "epoch": 1.3100071225071226, + "grad_norm": 0.6816306114196777, + "learning_rate": 
0.00015173930997497585, + "loss": 1.1445, + "step": 7358 + }, + { + "epoch": 1.3101851851851851, + "grad_norm": 0.6644450426101685, + "learning_rate": 0.00015172733116145884, + "loss": 0.9808, + "step": 7359 + }, + { + "epoch": 1.3103632478632479, + "grad_norm": 0.6921063661575317, + "learning_rate": 0.00015171535133445225, + "loss": 1.0162, + "step": 7360 + }, + { + "epoch": 1.3105413105413106, + "grad_norm": 0.6386187672615051, + "learning_rate": 0.00015170337049419082, + "loss": 0.9951, + "step": 7361 + }, + { + "epoch": 1.3107193732193732, + "grad_norm": 0.6505418419837952, + "learning_rate": 0.0001516913886409092, + "loss": 0.8872, + "step": 7362 + }, + { + "epoch": 1.310897435897436, + "grad_norm": 0.6415576934814453, + "learning_rate": 0.00015167940577484222, + "loss": 1.056, + "step": 7363 + }, + { + "epoch": 1.3110754985754987, + "grad_norm": 0.6691195964813232, + "learning_rate": 0.00015166742189622458, + "loss": 1.0561, + "step": 7364 + }, + { + "epoch": 1.3112535612535612, + "grad_norm": 0.6376257538795471, + "learning_rate": 0.00015165543700529122, + "loss": 0.8499, + "step": 7365 + }, + { + "epoch": 1.311431623931624, + "grad_norm": 0.6270790696144104, + "learning_rate": 0.00015164345110227684, + "loss": 1.0244, + "step": 7366 + }, + { + "epoch": 1.3116096866096867, + "grad_norm": 0.7120122313499451, + "learning_rate": 0.0001516314641874163, + "loss": 1.0476, + "step": 7367 + }, + { + "epoch": 1.3117877492877492, + "grad_norm": 0.6152660250663757, + "learning_rate": 0.0001516194762609445, + "loss": 0.897, + "step": 7368 + }, + { + "epoch": 1.311965811965812, + "grad_norm": 0.7578088045120239, + "learning_rate": 0.00015160748732309626, + "loss": 1.1609, + "step": 7369 + }, + { + "epoch": 1.3121438746438747, + "grad_norm": 0.6594924330711365, + "learning_rate": 0.00015159549737410656, + "loss": 1.1706, + "step": 7370 + }, + { + "epoch": 1.3123219373219372, + "grad_norm": 0.6559173464775085, + "learning_rate": 0.00015158350641421024, + "loss": 0.9452, + 
"step": 7371 + }, + { + "epoch": 1.3125, + "grad_norm": 0.6667516231536865, + "learning_rate": 0.00015157151444364226, + "loss": 0.8153, + "step": 7372 + }, + { + "epoch": 1.3126780626780628, + "grad_norm": 0.7054803371429443, + "learning_rate": 0.00015155952146263761, + "loss": 0.9887, + "step": 7373 + }, + { + "epoch": 1.3128561253561253, + "grad_norm": 0.7035902142524719, + "learning_rate": 0.00015154752747143123, + "loss": 1.1832, + "step": 7374 + }, + { + "epoch": 1.313034188034188, + "grad_norm": 0.6297488212585449, + "learning_rate": 0.00015153553247025813, + "loss": 0.9602, + "step": 7375 + }, + { + "epoch": 1.3132122507122508, + "grad_norm": 0.6851378083229065, + "learning_rate": 0.00015152353645935335, + "loss": 1.0743, + "step": 7376 + }, + { + "epoch": 1.3133903133903133, + "grad_norm": 0.6215537786483765, + "learning_rate": 0.00015151153943895187, + "loss": 0.9484, + "step": 7377 + }, + { + "epoch": 1.313568376068376, + "grad_norm": 0.6848666071891785, + "learning_rate": 0.0001514995414092888, + "loss": 1.0978, + "step": 7378 + }, + { + "epoch": 1.3137464387464388, + "grad_norm": 0.7527492642402649, + "learning_rate": 0.00015148754237059918, + "loss": 1.083, + "step": 7379 + }, + { + "epoch": 1.3139245014245013, + "grad_norm": 0.6264588236808777, + "learning_rate": 0.00015147554232311814, + "loss": 0.9995, + "step": 7380 + }, + { + "epoch": 1.314102564102564, + "grad_norm": 0.6666619181632996, + "learning_rate": 0.00015146354126708075, + "loss": 1.0156, + "step": 7381 + }, + { + "epoch": 1.3142806267806268, + "grad_norm": 0.6626597046852112, + "learning_rate": 0.00015145153920272222, + "loss": 1.0047, + "step": 7382 + }, + { + "epoch": 1.3144586894586894, + "grad_norm": 0.5975428223609924, + "learning_rate": 0.0001514395361302776, + "loss": 0.806, + "step": 7383 + }, + { + "epoch": 1.3146367521367521, + "grad_norm": 0.6509957909584045, + "learning_rate": 0.00015142753204998218, + "loss": 0.8871, + "step": 7384 + }, + { + "epoch": 1.3148148148148149, + 
"grad_norm": 0.6672926545143127, + "learning_rate": 0.00015141552696207108, + "loss": 0.9616, + "step": 7385 + }, + { + "epoch": 1.3149928774928774, + "grad_norm": 0.6965435147285461, + "learning_rate": 0.00015140352086677954, + "loss": 1.124, + "step": 7386 + }, + { + "epoch": 1.3151709401709402, + "grad_norm": 0.6559258103370667, + "learning_rate": 0.00015139151376434277, + "loss": 1.0271, + "step": 7387 + }, + { + "epoch": 1.315349002849003, + "grad_norm": 0.7613587379455566, + "learning_rate": 0.00015137950565499608, + "loss": 1.0349, + "step": 7388 + }, + { + "epoch": 1.3155270655270654, + "grad_norm": 0.7001944780349731, + "learning_rate": 0.0001513674965389747, + "loss": 0.8551, + "step": 7389 + }, + { + "epoch": 1.3157051282051282, + "grad_norm": 0.6087043285369873, + "learning_rate": 0.0001513554864165139, + "loss": 0.7118, + "step": 7390 + }, + { + "epoch": 1.315883190883191, + "grad_norm": 0.71526700258255, + "learning_rate": 0.00015134347528784908, + "loss": 1.0478, + "step": 7391 + }, + { + "epoch": 1.3160612535612537, + "grad_norm": 0.6182073950767517, + "learning_rate": 0.00015133146315321548, + "loss": 0.9474, + "step": 7392 + }, + { + "epoch": 1.3162393162393162, + "grad_norm": 0.7771387696266174, + "learning_rate": 0.0001513194500128485, + "loss": 1.0544, + "step": 7393 + }, + { + "epoch": 1.316417378917379, + "grad_norm": 0.7108260989189148, + "learning_rate": 0.00015130743586698353, + "loss": 0.8813, + "step": 7394 + }, + { + "epoch": 1.3165954415954415, + "grad_norm": 0.7057309150695801, + "learning_rate": 0.0001512954207158559, + "loss": 0.899, + "step": 7395 + }, + { + "epoch": 1.3167735042735043, + "grad_norm": 0.6139237880706787, + "learning_rate": 0.00015128340455970106, + "loss": 0.8885, + "step": 7396 + }, + { + "epoch": 1.316951566951567, + "grad_norm": 0.7166598439216614, + "learning_rate": 0.00015127138739875443, + "loss": 0.9792, + "step": 7397 + }, + { + "epoch": 1.3171296296296298, + "grad_norm": 0.6916186809539795, + 
"learning_rate": 0.00015125936923325153, + "loss": 0.8871, + "step": 7398 + }, + { + "epoch": 1.3173076923076923, + "grad_norm": 0.7189087271690369, + "learning_rate": 0.0001512473500634277, + "loss": 0.8302, + "step": 7399 + }, + { + "epoch": 1.317485754985755, + "grad_norm": 0.5739200115203857, + "learning_rate": 0.00015123532988951853, + "loss": 0.9137, + "step": 7400 + }, + { + "epoch": 1.3176638176638176, + "grad_norm": 0.7661057114601135, + "learning_rate": 0.00015122330871175952, + "loss": 1.1255, + "step": 7401 + }, + { + "epoch": 1.3178418803418803, + "grad_norm": 0.6487592458724976, + "learning_rate": 0.00015121128653038617, + "loss": 1.0519, + "step": 7402 + }, + { + "epoch": 1.318019943019943, + "grad_norm": 0.693134605884552, + "learning_rate": 0.00015119926334563406, + "loss": 0.9585, + "step": 7403 + }, + { + "epoch": 1.3181980056980058, + "grad_norm": 0.5895997285842896, + "learning_rate": 0.0001511872391577387, + "loss": 0.8033, + "step": 7404 + }, + { + "epoch": 1.3183760683760684, + "grad_norm": 0.654876172542572, + "learning_rate": 0.00015117521396693575, + "loss": 1.0082, + "step": 7405 + }, + { + "epoch": 1.318554131054131, + "grad_norm": 0.5877239108085632, + "learning_rate": 0.0001511631877734608, + "loss": 1.0147, + "step": 7406 + }, + { + "epoch": 1.3187321937321936, + "grad_norm": 0.6109837889671326, + "learning_rate": 0.00015115116057754944, + "loss": 0.7498, + "step": 7407 + }, + { + "epoch": 1.3189102564102564, + "grad_norm": 0.643856942653656, + "learning_rate": 0.00015113913237943736, + "loss": 1.0417, + "step": 7408 + }, + { + "epoch": 1.3190883190883191, + "grad_norm": 0.654077410697937, + "learning_rate": 0.00015112710317936022, + "loss": 1.1809, + "step": 7409 + }, + { + "epoch": 1.319266381766382, + "grad_norm": 0.6785375475883484, + "learning_rate": 0.00015111507297755367, + "loss": 0.9447, + "step": 7410 + }, + { + "epoch": 1.3194444444444444, + "grad_norm": 0.6513382196426392, + "learning_rate": 0.00015110304177425347, + 
"loss": 0.8286, + "step": 7411 + }, + { + "epoch": 1.3196225071225072, + "grad_norm": 0.6536405682563782, + "learning_rate": 0.00015109100956969533, + "loss": 1.1959, + "step": 7412 + }, + { + "epoch": 1.3198005698005697, + "grad_norm": 0.6633172035217285, + "learning_rate": 0.00015107897636411498, + "loss": 0.8839, + "step": 7413 + }, + { + "epoch": 1.3199786324786325, + "grad_norm": 0.5773791670799255, + "learning_rate": 0.00015106694215774821, + "loss": 0.9785, + "step": 7414 + }, + { + "epoch": 1.3201566951566952, + "grad_norm": 0.7005468010902405, + "learning_rate": 0.00015105490695083078, + "loss": 1.0752, + "step": 7415 + }, + { + "epoch": 1.320334757834758, + "grad_norm": 0.6509538888931274, + "learning_rate": 0.0001510428707435985, + "loss": 0.9886, + "step": 7416 + }, + { + "epoch": 1.3205128205128205, + "grad_norm": 0.6607788801193237, + "learning_rate": 0.0001510308335362872, + "loss": 0.9756, + "step": 7417 + }, + { + "epoch": 1.3206908831908832, + "grad_norm": 0.5977858304977417, + "learning_rate": 0.00015101879532913274, + "loss": 1.0574, + "step": 7418 + }, + { + "epoch": 1.3208689458689458, + "grad_norm": 0.6478607058525085, + "learning_rate": 0.00015100675612237096, + "loss": 1.0076, + "step": 7419 + }, + { + "epoch": 1.3210470085470085, + "grad_norm": 0.6386681199073792, + "learning_rate": 0.00015099471591623775, + "loss": 0.9639, + "step": 7420 + }, + { + "epoch": 1.3212250712250713, + "grad_norm": 0.6348143815994263, + "learning_rate": 0.000150982674710969, + "loss": 1.0226, + "step": 7421 + }, + { + "epoch": 1.321403133903134, + "grad_norm": 0.6737388372421265, + "learning_rate": 0.00015097063250680068, + "loss": 0.9985, + "step": 7422 + }, + { + "epoch": 1.3215811965811965, + "grad_norm": 0.7302656769752502, + "learning_rate": 0.00015095858930396866, + "loss": 0.9969, + "step": 7423 + }, + { + "epoch": 1.3217592592592593, + "grad_norm": 0.7062691450119019, + "learning_rate": 0.00015094654510270898, + "loss": 0.9137, + "step": 7424 + }, + { + 
"epoch": 1.3219373219373218, + "grad_norm": 0.6289888620376587, + "learning_rate": 0.00015093449990325754, + "loss": 0.9231, + "step": 7425 + }, + { + "epoch": 1.3221153846153846, + "grad_norm": 0.643284261226654, + "learning_rate": 0.0001509224537058504, + "loss": 0.8981, + "step": 7426 + }, + { + "epoch": 1.3222934472934473, + "grad_norm": 0.7019244432449341, + "learning_rate": 0.00015091040651072355, + "loss": 0.9994, + "step": 7427 + }, + { + "epoch": 1.32247150997151, + "grad_norm": 0.5982088446617126, + "learning_rate": 0.0001508983583181131, + "loss": 0.9365, + "step": 7428 + }, + { + "epoch": 1.3226495726495726, + "grad_norm": 0.6086063385009766, + "learning_rate": 0.00015088630912825498, + "loss": 0.8621, + "step": 7429 + }, + { + "epoch": 1.3228276353276354, + "grad_norm": 0.6829213500022888, + "learning_rate": 0.00015087425894138535, + "loss": 1.1959, + "step": 7430 + }, + { + "epoch": 1.323005698005698, + "grad_norm": 0.6538017392158508, + "learning_rate": 0.00015086220775774033, + "loss": 0.9412, + "step": 7431 + }, + { + "epoch": 1.3231837606837606, + "grad_norm": 0.6334070563316345, + "learning_rate": 0.00015085015557755597, + "loss": 0.9044, + "step": 7432 + }, + { + "epoch": 1.3233618233618234, + "grad_norm": 0.6514624357223511, + "learning_rate": 0.00015083810240106845, + "loss": 0.8859, + "step": 7433 + }, + { + "epoch": 1.3235398860398861, + "grad_norm": 0.7130434513092041, + "learning_rate": 0.00015082604822851397, + "loss": 1.2845, + "step": 7434 + }, + { + "epoch": 1.3237179487179487, + "grad_norm": 0.609419584274292, + "learning_rate": 0.00015081399306012862, + "loss": 1.0725, + "step": 7435 + }, + { + "epoch": 1.3238960113960114, + "grad_norm": 0.586807370185852, + "learning_rate": 0.0001508019368961486, + "loss": 0.9032, + "step": 7436 + }, + { + "epoch": 1.324074074074074, + "grad_norm": 0.6937291026115417, + "learning_rate": 0.0001507898797368102, + "loss": 0.7975, + "step": 7437 + }, + { + "epoch": 1.3242521367521367, + "grad_norm": 
0.6804966330528259, + "learning_rate": 0.00015077782158234962, + "loss": 1.1018, + "step": 7438 + }, + { + "epoch": 1.3244301994301995, + "grad_norm": 0.6110677123069763, + "learning_rate": 0.0001507657624330031, + "loss": 0.7988, + "step": 7439 + }, + { + "epoch": 1.3246082621082622, + "grad_norm": 0.6340961456298828, + "learning_rate": 0.0001507537022890069, + "loss": 0.844, + "step": 7440 + }, + { + "epoch": 1.3247863247863247, + "grad_norm": 0.7291021943092346, + "learning_rate": 0.00015074164115059735, + "loss": 0.9867, + "step": 7441 + }, + { + "epoch": 1.3249643874643875, + "grad_norm": 0.6818505525588989, + "learning_rate": 0.00015072957901801076, + "loss": 1.1541, + "step": 7442 + }, + { + "epoch": 1.32514245014245, + "grad_norm": 0.6174707412719727, + "learning_rate": 0.00015071751589148345, + "loss": 1.1679, + "step": 7443 + }, + { + "epoch": 1.3253205128205128, + "grad_norm": 0.6481367945671082, + "learning_rate": 0.00015070545177125176, + "loss": 1.0955, + "step": 7444 + }, + { + "epoch": 1.3254985754985755, + "grad_norm": 0.6752339005470276, + "learning_rate": 0.00015069338665755203, + "loss": 0.8651, + "step": 7445 + }, + { + "epoch": 1.3256766381766383, + "grad_norm": 0.6608055830001831, + "learning_rate": 0.00015068132055062077, + "loss": 0.9553, + "step": 7446 + }, + { + "epoch": 1.3258547008547008, + "grad_norm": 0.5933246612548828, + "learning_rate": 0.00015066925345069425, + "loss": 0.8584, + "step": 7447 + }, + { + "epoch": 1.3260327635327636, + "grad_norm": 0.6301844716072083, + "learning_rate": 0.000150657185358009, + "loss": 0.8583, + "step": 7448 + }, + { + "epoch": 1.326210826210826, + "grad_norm": 0.7359434962272644, + "learning_rate": 0.00015064511627280145, + "loss": 1.0905, + "step": 7449 + }, + { + "epoch": 1.3263888888888888, + "grad_norm": 0.6334579586982727, + "learning_rate": 0.00015063304619530806, + "loss": 0.9814, + "step": 7450 + }, + { + "epoch": 1.3265669515669516, + "grad_norm": 0.6974197626113892, + "learning_rate": 
0.00015062097512576528, + "loss": 0.9302, + "step": 7451 + }, + { + "epoch": 1.3267450142450143, + "grad_norm": 0.6895849704742432, + "learning_rate": 0.00015060890306440965, + "loss": 1.0175, + "step": 7452 + }, + { + "epoch": 1.3269230769230769, + "grad_norm": 0.5938003659248352, + "learning_rate": 0.00015059683001147767, + "loss": 0.8084, + "step": 7453 + }, + { + "epoch": 1.3271011396011396, + "grad_norm": 0.6821470856666565, + "learning_rate": 0.00015058475596720596, + "loss": 0.9897, + "step": 7454 + }, + { + "epoch": 1.3272792022792022, + "grad_norm": 0.5507164001464844, + "learning_rate": 0.00015057268093183104, + "loss": 0.7012, + "step": 7455 + }, + { + "epoch": 1.327457264957265, + "grad_norm": 0.6216199398040771, + "learning_rate": 0.00015056060490558945, + "loss": 1.0281, + "step": 7456 + }, + { + "epoch": 1.3276353276353277, + "grad_norm": 0.6674157977104187, + "learning_rate": 0.00015054852788871787, + "loss": 0.8776, + "step": 7457 + }, + { + "epoch": 1.3278133903133904, + "grad_norm": 0.666963517665863, + "learning_rate": 0.0001505364498814529, + "loss": 1.0742, + "step": 7458 + }, + { + "epoch": 1.327991452991453, + "grad_norm": 0.6205331683158875, + "learning_rate": 0.00015052437088403114, + "loss": 1.1109, + "step": 7459 + }, + { + "epoch": 1.3281695156695157, + "grad_norm": 0.6402750611305237, + "learning_rate": 0.00015051229089668933, + "loss": 1.0648, + "step": 7460 + }, + { + "epoch": 1.3283475783475782, + "grad_norm": 0.7445703744888306, + "learning_rate": 0.00015050020991966406, + "loss": 0.8989, + "step": 7461 + }, + { + "epoch": 1.328525641025641, + "grad_norm": 0.8131299614906311, + "learning_rate": 0.00015048812795319212, + "loss": 0.9552, + "step": 7462 + }, + { + "epoch": 1.3287037037037037, + "grad_norm": 0.7007313966751099, + "learning_rate": 0.00015047604499751017, + "loss": 0.9899, + "step": 7463 + }, + { + "epoch": 1.3288817663817665, + "grad_norm": 0.60536789894104, + "learning_rate": 0.000150463961052855, + "loss": 0.7694, + 
"step": 7464 + }, + { + "epoch": 1.329059829059829, + "grad_norm": 0.6910434365272522, + "learning_rate": 0.00015045187611946331, + "loss": 0.9575, + "step": 7465 + }, + { + "epoch": 1.3292378917378918, + "grad_norm": 0.7693352103233337, + "learning_rate": 0.00015043979019757194, + "loss": 1.1987, + "step": 7466 + }, + { + "epoch": 1.3294159544159543, + "grad_norm": 0.6675218939781189, + "learning_rate": 0.00015042770328741763, + "loss": 1.0099, + "step": 7467 + }, + { + "epoch": 1.329594017094017, + "grad_norm": 0.8040883541107178, + "learning_rate": 0.00015041561538923722, + "loss": 0.9493, + "step": 7468 + }, + { + "epoch": 1.3297720797720798, + "grad_norm": 0.6765826344490051, + "learning_rate": 0.00015040352650326762, + "loss": 1.1035, + "step": 7469 + }, + { + "epoch": 1.3299501424501425, + "grad_norm": 0.7099924087524414, + "learning_rate": 0.0001503914366297456, + "loss": 0.9198, + "step": 7470 + }, + { + "epoch": 1.330128205128205, + "grad_norm": 0.6673682928085327, + "learning_rate": 0.00015037934576890804, + "loss": 1.0234, + "step": 7471 + }, + { + "epoch": 1.3303062678062678, + "grad_norm": 0.7022300958633423, + "learning_rate": 0.00015036725392099184, + "loss": 1.3875, + "step": 7472 + }, + { + "epoch": 1.3304843304843303, + "grad_norm": 0.6997060179710388, + "learning_rate": 0.00015035516108623394, + "loss": 0.8114, + "step": 7473 + }, + { + "epoch": 1.330662393162393, + "grad_norm": 0.6262350678443909, + "learning_rate": 0.00015034306726487127, + "loss": 1.128, + "step": 7474 + }, + { + "epoch": 1.3308404558404558, + "grad_norm": 0.6330382227897644, + "learning_rate": 0.00015033097245714078, + "loss": 0.9032, + "step": 7475 + }, + { + "epoch": 1.3310185185185186, + "grad_norm": 0.6527551412582397, + "learning_rate": 0.00015031887666327944, + "loss": 0.9311, + "step": 7476 + }, + { + "epoch": 1.3311965811965811, + "grad_norm": 0.6754798889160156, + "learning_rate": 0.00015030677988352422, + "loss": 1.0626, + "step": 7477 + }, + { + "epoch": 
1.3313746438746439, + "grad_norm": 0.6397945284843445, + "learning_rate": 0.00015029468211811216, + "loss": 0.9222, + "step": 7478 + }, + { + "epoch": 1.3315527065527066, + "grad_norm": 0.8163481950759888, + "learning_rate": 0.0001502825833672803, + "loss": 1.1827, + "step": 7479 + }, + { + "epoch": 1.3317307692307692, + "grad_norm": 0.6645621657371521, + "learning_rate": 0.00015027048363126566, + "loss": 0.9744, + "step": 7480 + }, + { + "epoch": 1.331908831908832, + "grad_norm": 0.6943182349205017, + "learning_rate": 0.0001502583829103053, + "loss": 1.1597, + "step": 7481 + }, + { + "epoch": 1.3320868945868947, + "grad_norm": 0.6283710598945618, + "learning_rate": 0.00015024628120463636, + "loss": 0.9514, + "step": 7482 + }, + { + "epoch": 1.3322649572649572, + "grad_norm": 0.6159678101539612, + "learning_rate": 0.0001502341785144959, + "loss": 0.9752, + "step": 7483 + }, + { + "epoch": 1.33244301994302, + "grad_norm": 0.6259802579879761, + "learning_rate": 0.00015022207484012107, + "loss": 0.9356, + "step": 7484 + }, + { + "epoch": 1.3326210826210827, + "grad_norm": 0.7322365641593933, + "learning_rate": 0.00015020997018174904, + "loss": 1.2072, + "step": 7485 + }, + { + "epoch": 1.3327991452991452, + "grad_norm": 0.6323443651199341, + "learning_rate": 0.0001501978645396169, + "loss": 1.1661, + "step": 7486 + }, + { + "epoch": 1.332977207977208, + "grad_norm": 0.7811527848243713, + "learning_rate": 0.00015018575791396187, + "loss": 1.0304, + "step": 7487 + }, + { + "epoch": 1.3331552706552707, + "grad_norm": 0.7221232056617737, + "learning_rate": 0.0001501736503050212, + "loss": 0.8838, + "step": 7488 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.6980099081993103, + "learning_rate": 0.00015016154171303207, + "loss": 1.1841, + "step": 7489 + }, + { + "epoch": 1.333511396011396, + "grad_norm": 0.6802879571914673, + "learning_rate": 0.00015014943213823175, + "loss": 0.959, + "step": 7490 + }, + { + "epoch": 1.3336894586894588, + "grad_norm": 
0.637698233127594, + "learning_rate": 0.00015013732158085746, + "loss": 1.0517, + "step": 7491 + }, + { + "epoch": 1.3338675213675213, + "grad_norm": 0.6386787295341492, + "learning_rate": 0.0001501252100411465, + "loss": 0.7125, + "step": 7492 + }, + { + "epoch": 1.334045584045584, + "grad_norm": 0.6287358403205872, + "learning_rate": 0.0001501130975193362, + "loss": 0.8913, + "step": 7493 + }, + { + "epoch": 1.3342236467236468, + "grad_norm": 0.6142337322235107, + "learning_rate": 0.00015010098401566386, + "loss": 0.8149, + "step": 7494 + }, + { + "epoch": 1.3344017094017093, + "grad_norm": 0.6369916200637817, + "learning_rate": 0.0001500888695303668, + "loss": 1.0186, + "step": 7495 + }, + { + "epoch": 1.334579772079772, + "grad_norm": 0.7526934146881104, + "learning_rate": 0.0001500767540636824, + "loss": 1.2421, + "step": 7496 + }, + { + "epoch": 1.3347578347578348, + "grad_norm": 0.7278095483779907, + "learning_rate": 0.00015006463761584802, + "loss": 0.9856, + "step": 7497 + }, + { + "epoch": 1.3349358974358974, + "grad_norm": 0.6165127158164978, + "learning_rate": 0.00015005252018710104, + "loss": 1.0041, + "step": 7498 + }, + { + "epoch": 1.33511396011396, + "grad_norm": 0.637856662273407, + "learning_rate": 0.00015004040177767896, + "loss": 0.9134, + "step": 7499 + }, + { + "epoch": 1.3352920227920229, + "grad_norm": 0.661227285861969, + "learning_rate": 0.00015002828238781912, + "loss": 1.0393, + "step": 7500 + }, + { + "epoch": 1.3354700854700854, + "grad_norm": 0.6061869859695435, + "learning_rate": 0.000150016162017759, + "loss": 0.8453, + "step": 7501 + }, + { + "epoch": 1.3356481481481481, + "grad_norm": 0.6938419938087463, + "learning_rate": 0.0001500040406677361, + "loss": 1.0338, + "step": 7502 + }, + { + "epoch": 1.335826210826211, + "grad_norm": 0.6672863960266113, + "learning_rate": 0.0001499919183379879, + "loss": 0.8765, + "step": 7503 + }, + { + "epoch": 1.3360042735042734, + "grad_norm": 0.6200515031814575, + "learning_rate": 
0.00014997979502875193, + "loss": 0.8286, + "step": 7504 + }, + { + "epoch": 1.3361823361823362, + "grad_norm": 0.6287549138069153, + "learning_rate": 0.00014996767074026567, + "loss": 0.9761, + "step": 7505 + }, + { + "epoch": 1.336360398860399, + "grad_norm": 0.6036837100982666, + "learning_rate": 0.0001499555454727667, + "loss": 1.0506, + "step": 7506 + }, + { + "epoch": 1.3365384615384617, + "grad_norm": 0.6875260472297668, + "learning_rate": 0.0001499434192264926, + "loss": 1.001, + "step": 7507 + }, + { + "epoch": 1.3367165242165242, + "grad_norm": 0.6558469533920288, + "learning_rate": 0.00014993129200168096, + "loss": 0.6874, + "step": 7508 + }, + { + "epoch": 1.336894586894587, + "grad_norm": 0.604167103767395, + "learning_rate": 0.00014991916379856934, + "loss": 1.0173, + "step": 7509 + }, + { + "epoch": 1.3370726495726495, + "grad_norm": 0.5941442251205444, + "learning_rate": 0.00014990703461739544, + "loss": 0.8569, + "step": 7510 + }, + { + "epoch": 1.3372507122507122, + "grad_norm": 0.7645071148872375, + "learning_rate": 0.00014989490445839687, + "loss": 1.0172, + "step": 7511 + }, + { + "epoch": 1.337428774928775, + "grad_norm": 0.5491678714752197, + "learning_rate": 0.00014988277332181126, + "loss": 0.8018, + "step": 7512 + }, + { + "epoch": 1.3376068376068377, + "grad_norm": 0.583322286605835, + "learning_rate": 0.00014987064120787635, + "loss": 0.8704, + "step": 7513 + }, + { + "epoch": 1.3377849002849003, + "grad_norm": 0.7385724186897278, + "learning_rate": 0.00014985850811682984, + "loss": 1.1121, + "step": 7514 + }, + { + "epoch": 1.337962962962963, + "grad_norm": 0.6842585206031799, + "learning_rate": 0.00014984637404890941, + "loss": 0.914, + "step": 7515 + }, + { + "epoch": 1.3381410256410255, + "grad_norm": 0.6771186590194702, + "learning_rate": 0.00014983423900435285, + "loss": 1.0838, + "step": 7516 + }, + { + "epoch": 1.3383190883190883, + "grad_norm": 0.7562049031257629, + "learning_rate": 0.00014982210298339788, + "loss": 1.123, + 
"step": 7517 + }, + { + "epoch": 1.338497150997151, + "grad_norm": 0.7617804408073425, + "learning_rate": 0.0001498099659862823, + "loss": 0.9438, + "step": 7518 + }, + { + "epoch": 1.3386752136752138, + "grad_norm": 0.561958909034729, + "learning_rate": 0.00014979782801324392, + "loss": 0.8739, + "step": 7519 + }, + { + "epoch": 1.3388532763532763, + "grad_norm": 0.7726154923439026, + "learning_rate": 0.00014978568906452052, + "loss": 1.1306, + "step": 7520 + }, + { + "epoch": 1.339031339031339, + "grad_norm": 0.6658660173416138, + "learning_rate": 0.00014977354914035002, + "loss": 1.0214, + "step": 7521 + }, + { + "epoch": 1.3392094017094016, + "grad_norm": 0.6385402679443359, + "learning_rate": 0.00014976140824097015, + "loss": 0.8851, + "step": 7522 + }, + { + "epoch": 1.3393874643874644, + "grad_norm": 0.6315767168998718, + "learning_rate": 0.0001497492663666189, + "loss": 0.986, + "step": 7523 + }, + { + "epoch": 1.3395655270655271, + "grad_norm": 0.6379088759422302, + "learning_rate": 0.0001497371235175341, + "loss": 0.9322, + "step": 7524 + }, + { + "epoch": 1.3397435897435899, + "grad_norm": 0.6605859994888306, + "learning_rate": 0.0001497249796939537, + "loss": 1.1112, + "step": 7525 + }, + { + "epoch": 1.3399216524216524, + "grad_norm": 0.7342822551727295, + "learning_rate": 0.0001497128348961156, + "loss": 0.9798, + "step": 7526 + }, + { + "epoch": 1.3400997150997151, + "grad_norm": 0.5667192935943604, + "learning_rate": 0.0001497006891242578, + "loss": 0.7493, + "step": 7527 + }, + { + "epoch": 1.3402777777777777, + "grad_norm": 0.6106827855110168, + "learning_rate": 0.0001496885423786182, + "loss": 1.0924, + "step": 7528 + }, + { + "epoch": 1.3404558404558404, + "grad_norm": 0.6207202076911926, + "learning_rate": 0.00014967639465943486, + "loss": 1.1123, + "step": 7529 + }, + { + "epoch": 1.3406339031339032, + "grad_norm": 0.6272760033607483, + "learning_rate": 0.00014966424596694574, + "loss": 0.9275, + "step": 7530 + }, + { + "epoch": 
1.340811965811966, + "grad_norm": 0.6485986113548279, + "learning_rate": 0.0001496520963013889, + "loss": 1.1491, + "step": 7531 + }, + { + "epoch": 1.3409900284900285, + "grad_norm": 0.5743561387062073, + "learning_rate": 0.00014963994566300238, + "loss": 1.1101, + "step": 7532 + }, + { + "epoch": 1.3411680911680912, + "grad_norm": 0.6508657336235046, + "learning_rate": 0.00014962779405202424, + "loss": 1.0368, + "step": 7533 + }, + { + "epoch": 1.3413461538461537, + "grad_norm": 0.6598748564720154, + "learning_rate": 0.00014961564146869259, + "loss": 1.1064, + "step": 7534 + }, + { + "epoch": 1.3415242165242165, + "grad_norm": 0.6722840070724487, + "learning_rate": 0.00014960348791324547, + "loss": 0.9758, + "step": 7535 + }, + { + "epoch": 1.3417022792022792, + "grad_norm": 0.5807220935821533, + "learning_rate": 0.00014959133338592108, + "loss": 0.9936, + "step": 7536 + }, + { + "epoch": 1.341880341880342, + "grad_norm": 0.6318647265434265, + "learning_rate": 0.00014957917788695752, + "loss": 0.907, + "step": 7537 + }, + { + "epoch": 1.3420584045584045, + "grad_norm": 0.6725485324859619, + "learning_rate": 0.00014956702141659295, + "loss": 0.988, + "step": 7538 + }, + { + "epoch": 1.3422364672364673, + "grad_norm": 0.6675217747688293, + "learning_rate": 0.0001495548639750656, + "loss": 1.0194, + "step": 7539 + }, + { + "epoch": 1.3424145299145298, + "grad_norm": 0.6976884603500366, + "learning_rate": 0.0001495427055626136, + "loss": 1.2515, + "step": 7540 + }, + { + "epoch": 1.3425925925925926, + "grad_norm": 0.654941737651825, + "learning_rate": 0.0001495305461794752, + "loss": 1.2072, + "step": 7541 + }, + { + "epoch": 1.3427706552706553, + "grad_norm": 0.7085291743278503, + "learning_rate": 0.00014951838582588864, + "loss": 0.9772, + "step": 7542 + }, + { + "epoch": 1.342948717948718, + "grad_norm": 0.6319566965103149, + "learning_rate": 0.00014950622450209217, + "loss": 1.0162, + "step": 7543 + }, + { + "epoch": 1.3431267806267806, + "grad_norm": 
0.6272495985031128, + "learning_rate": 0.00014949406220832407, + "loss": 0.7985, + "step": 7544 + }, + { + "epoch": 1.3433048433048433, + "grad_norm": 0.6352069973945618, + "learning_rate": 0.00014948189894482266, + "loss": 1.0041, + "step": 7545 + }, + { + "epoch": 1.3434829059829059, + "grad_norm": 0.6071867346763611, + "learning_rate": 0.0001494697347118262, + "loss": 0.9486, + "step": 7546 + }, + { + "epoch": 1.3436609686609686, + "grad_norm": 0.6458829641342163, + "learning_rate": 0.00014945756950957308, + "loss": 0.9417, + "step": 7547 + }, + { + "epoch": 1.3438390313390314, + "grad_norm": 0.6472262740135193, + "learning_rate": 0.0001494454033383016, + "loss": 1.056, + "step": 7548 + }, + { + "epoch": 1.3440170940170941, + "grad_norm": 0.6985635161399841, + "learning_rate": 0.00014943323619825017, + "loss": 1.0483, + "step": 7549 + }, + { + "epoch": 1.3441951566951567, + "grad_norm": 0.6379460096359253, + "learning_rate": 0.00014942106808965718, + "loss": 0.9552, + "step": 7550 + }, + { + "epoch": 1.3443732193732194, + "grad_norm": 0.7036557793617249, + "learning_rate": 0.00014940889901276098, + "loss": 0.9647, + "step": 7551 + }, + { + "epoch": 1.344551282051282, + "grad_norm": 0.6697289943695068, + "learning_rate": 0.0001493967289678001, + "loss": 0.9029, + "step": 7552 + }, + { + "epoch": 1.3447293447293447, + "grad_norm": 0.6336250901222229, + "learning_rate": 0.00014938455795501286, + "loss": 0.9458, + "step": 7553 + }, + { + "epoch": 1.3449074074074074, + "grad_norm": 0.7279673218727112, + "learning_rate": 0.00014937238597463785, + "loss": 1.0228, + "step": 7554 + }, + { + "epoch": 1.3450854700854702, + "grad_norm": 0.6514406204223633, + "learning_rate": 0.00014936021302691349, + "loss": 0.8265, + "step": 7555 + }, + { + "epoch": 1.3452635327635327, + "grad_norm": 0.6405338644981384, + "learning_rate": 0.0001493480391120783, + "loss": 0.9516, + "step": 7556 + }, + { + "epoch": 1.3454415954415955, + "grad_norm": 0.6442672610282898, + "learning_rate": 
0.00014933586423037076, + "loss": 0.9279, + "step": 7557 + }, + { + "epoch": 1.345619658119658, + "grad_norm": 0.7588633894920349, + "learning_rate": 0.00014932368838202945, + "loss": 1.0976, + "step": 7558 + }, + { + "epoch": 1.3457977207977208, + "grad_norm": 0.5536739230155945, + "learning_rate": 0.00014931151156729296, + "loss": 0.713, + "step": 7559 + }, + { + "epoch": 1.3459757834757835, + "grad_norm": 0.6897570490837097, + "learning_rate": 0.00014929933378639981, + "loss": 0.9521, + "step": 7560 + }, + { + "epoch": 1.3461538461538463, + "grad_norm": 0.6654927134513855, + "learning_rate": 0.00014928715503958863, + "loss": 0.8506, + "step": 7561 + }, + { + "epoch": 1.3463319088319088, + "grad_norm": 0.655806839466095, + "learning_rate": 0.00014927497532709808, + "loss": 0.8636, + "step": 7562 + }, + { + "epoch": 1.3465099715099715, + "grad_norm": 0.6547064185142517, + "learning_rate": 0.00014926279464916667, + "loss": 0.9155, + "step": 7563 + }, + { + "epoch": 1.346688034188034, + "grad_norm": 0.7555415034294128, + "learning_rate": 0.00014925061300603316, + "loss": 0.8791, + "step": 7564 + }, + { + "epoch": 1.3468660968660968, + "grad_norm": 0.7439392805099487, + "learning_rate": 0.0001492384303979362, + "loss": 1.1669, + "step": 7565 + }, + { + "epoch": 1.3470441595441596, + "grad_norm": 0.6016925573348999, + "learning_rate": 0.0001492262468251145, + "loss": 0.9811, + "step": 7566 + }, + { + "epoch": 1.3472222222222223, + "grad_norm": 0.644652783870697, + "learning_rate": 0.00014921406228780675, + "loss": 0.7096, + "step": 7567 + }, + { + "epoch": 1.3474002849002849, + "grad_norm": 0.721814751625061, + "learning_rate": 0.00014920187678625166, + "loss": 0.9933, + "step": 7568 + }, + { + "epoch": 1.3475783475783476, + "grad_norm": 0.6212092638015747, + "learning_rate": 0.000149189690320688, + "loss": 0.8499, + "step": 7569 + }, + { + "epoch": 1.3477564102564101, + "grad_norm": 0.6235958337783813, + "learning_rate": 0.00014917750289135455, + "loss": 0.9189, + 
"step": 7570 + }, + { + "epoch": 1.3479344729344729, + "grad_norm": 0.6309674978256226, + "learning_rate": 0.0001491653144984901, + "loss": 0.9744, + "step": 7571 + }, + { + "epoch": 1.3481125356125356, + "grad_norm": 0.7606496214866638, + "learning_rate": 0.00014915312514233344, + "loss": 1.0181, + "step": 7572 + }, + { + "epoch": 1.3482905982905984, + "grad_norm": 0.6892654895782471, + "learning_rate": 0.00014914093482312342, + "loss": 0.9517, + "step": 7573 + }, + { + "epoch": 1.348468660968661, + "grad_norm": 0.6746503114700317, + "learning_rate": 0.0001491287435410988, + "loss": 1.056, + "step": 7574 + }, + { + "epoch": 1.3486467236467237, + "grad_norm": 0.5892919301986694, + "learning_rate": 0.00014911655129649858, + "loss": 1.0515, + "step": 7575 + }, + { + "epoch": 1.3488247863247862, + "grad_norm": 0.6278096437454224, + "learning_rate": 0.0001491043580895615, + "loss": 0.864, + "step": 7576 + }, + { + "epoch": 1.349002849002849, + "grad_norm": 0.7017706632614136, + "learning_rate": 0.0001490921639205266, + "loss": 1.0618, + "step": 7577 + }, + { + "epoch": 1.3491809116809117, + "grad_norm": 0.7318746447563171, + "learning_rate": 0.00014907996878963268, + "loss": 0.9905, + "step": 7578 + }, + { + "epoch": 1.3493589743589745, + "grad_norm": 0.6485885977745056, + "learning_rate": 0.00014906777269711873, + "loss": 1.0498, + "step": 7579 + }, + { + "epoch": 1.349537037037037, + "grad_norm": 0.644902229309082, + "learning_rate": 0.00014905557564322372, + "loss": 0.885, + "step": 7580 + }, + { + "epoch": 1.3497150997150997, + "grad_norm": 0.6567610502243042, + "learning_rate": 0.0001490433776281866, + "loss": 0.8938, + "step": 7581 + }, + { + "epoch": 1.3498931623931623, + "grad_norm": 0.6233102679252625, + "learning_rate": 0.0001490311786522464, + "loss": 0.9007, + "step": 7582 + }, + { + "epoch": 1.350071225071225, + "grad_norm": 0.6962146759033203, + "learning_rate": 0.00014901897871564206, + "loss": 0.9257, + "step": 7583 + }, + { + "epoch": 
1.3502492877492878, + "grad_norm": 0.6986933350563049, + "learning_rate": 0.00014900677781861266, + "loss": 1.0089, + "step": 7584 + }, + { + "epoch": 1.3504273504273505, + "grad_norm": 0.7527925968170166, + "learning_rate": 0.00014899457596139729, + "loss": 1.0762, + "step": 7585 + }, + { + "epoch": 1.350605413105413, + "grad_norm": 0.69191974401474, + "learning_rate": 0.00014898237314423494, + "loss": 0.9829, + "step": 7586 + }, + { + "epoch": 1.3507834757834758, + "grad_norm": 0.7866443395614624, + "learning_rate": 0.00014897016936736478, + "loss": 1.0911, + "step": 7587 + }, + { + "epoch": 1.3509615384615383, + "grad_norm": 0.7087522745132446, + "learning_rate": 0.00014895796463102587, + "loss": 1.0693, + "step": 7588 + }, + { + "epoch": 1.351139601139601, + "grad_norm": 0.704276442527771, + "learning_rate": 0.00014894575893545736, + "loss": 0.9082, + "step": 7589 + }, + { + "epoch": 1.3513176638176638, + "grad_norm": 0.7074487805366516, + "learning_rate": 0.00014893355228089833, + "loss": 0.8731, + "step": 7590 + }, + { + "epoch": 1.3514957264957266, + "grad_norm": 0.6542425155639648, + "learning_rate": 0.00014892134466758803, + "loss": 0.9325, + "step": 7591 + }, + { + "epoch": 1.351673789173789, + "grad_norm": 0.6577230095863342, + "learning_rate": 0.0001489091360957656, + "loss": 0.8468, + "step": 7592 + }, + { + "epoch": 1.3518518518518519, + "grad_norm": 0.638534426689148, + "learning_rate": 0.00014889692656567025, + "loss": 0.8598, + "step": 7593 + }, + { + "epoch": 1.3520299145299146, + "grad_norm": 0.751133918762207, + "learning_rate": 0.0001488847160775412, + "loss": 1.0006, + "step": 7594 + }, + { + "epoch": 1.3522079772079771, + "grad_norm": 0.6272708773612976, + "learning_rate": 0.00014887250463161767, + "loss": 0.8782, + "step": 7595 + }, + { + "epoch": 1.35238603988604, + "grad_norm": 0.7242439985275269, + "learning_rate": 0.00014886029222813897, + "loss": 1.2443, + "step": 7596 + }, + { + "epoch": 1.3525641025641026, + "grad_norm": 
0.6199275851249695, + "learning_rate": 0.0001488480788673443, + "loss": 0.9211, + "step": 7597 + }, + { + "epoch": 1.3527421652421652, + "grad_norm": 0.6401306986808777, + "learning_rate": 0.00014883586454947305, + "loss": 0.8808, + "step": 7598 + }, + { + "epoch": 1.352920227920228, + "grad_norm": 0.6340938806533813, + "learning_rate": 0.00014882364927476443, + "loss": 0.9406, + "step": 7599 + }, + { + "epoch": 1.3530982905982907, + "grad_norm": 0.6388604044914246, + "learning_rate": 0.00014881143304345783, + "loss": 1.0674, + "step": 7600 + }, + { + "epoch": 1.3532763532763532, + "grad_norm": 0.7562061548233032, + "learning_rate": 0.00014879921585579263, + "loss": 1.0959, + "step": 7601 + }, + { + "epoch": 1.353454415954416, + "grad_norm": 0.6303606033325195, + "learning_rate": 0.00014878699771200815, + "loss": 0.9641, + "step": 7602 + }, + { + "epoch": 1.3536324786324787, + "grad_norm": 0.8623232841491699, + "learning_rate": 0.00014877477861234382, + "loss": 1.1529, + "step": 7603 + }, + { + "epoch": 1.3538105413105412, + "grad_norm": 0.6607624888420105, + "learning_rate": 0.00014876255855703896, + "loss": 0.6291, + "step": 7604 + }, + { + "epoch": 1.353988603988604, + "grad_norm": 0.6226931214332581, + "learning_rate": 0.0001487503375463331, + "loss": 0.7485, + "step": 7605 + }, + { + "epoch": 1.3541666666666667, + "grad_norm": 0.7626705169677734, + "learning_rate": 0.00014873811558046565, + "loss": 0.9694, + "step": 7606 + }, + { + "epoch": 1.3543447293447293, + "grad_norm": 0.5436057448387146, + "learning_rate": 0.00014872589265967605, + "loss": 0.6173, + "step": 7607 + }, + { + "epoch": 1.354522792022792, + "grad_norm": 0.7822177410125732, + "learning_rate": 0.00014871366878420382, + "loss": 1.0048, + "step": 7608 + }, + { + "epoch": 1.3547008547008548, + "grad_norm": 0.6955201625823975, + "learning_rate": 0.00014870144395428848, + "loss": 0.9487, + "step": 7609 + }, + { + "epoch": 1.3548789173789173, + "grad_norm": 0.6625505685806274, + "learning_rate": 
0.00014868921817016943, + "loss": 0.9389, + "step": 7610 + }, + { + "epoch": 1.35505698005698, + "grad_norm": 0.6625354886054993, + "learning_rate": 0.00014867699143208634, + "loss": 0.9538, + "step": 7611 + }, + { + "epoch": 1.3552350427350428, + "grad_norm": 0.7426592707633972, + "learning_rate": 0.00014866476374027874, + "loss": 1.2566, + "step": 7612 + }, + { + "epoch": 1.3554131054131053, + "grad_norm": 0.6856544017791748, + "learning_rate": 0.00014865253509498616, + "loss": 0.9663, + "step": 7613 + }, + { + "epoch": 1.355591168091168, + "grad_norm": 0.6343915462493896, + "learning_rate": 0.00014864030549644825, + "loss": 0.9416, + "step": 7614 + }, + { + "epoch": 1.3557692307692308, + "grad_norm": 0.6319553256034851, + "learning_rate": 0.00014862807494490454, + "loss": 0.9335, + "step": 7615 + }, + { + "epoch": 1.3559472934472934, + "grad_norm": 0.6919772624969482, + "learning_rate": 0.00014861584344059476, + "loss": 0.8516, + "step": 7616 + }, + { + "epoch": 1.3561253561253561, + "grad_norm": 0.6405790448188782, + "learning_rate": 0.00014860361098375851, + "loss": 1.1278, + "step": 7617 + }, + { + "epoch": 1.3563034188034189, + "grad_norm": 0.7591732144355774, + "learning_rate": 0.00014859137757463548, + "loss": 1.0961, + "step": 7618 + }, + { + "epoch": 1.3564814814814814, + "grad_norm": 0.6166727542877197, + "learning_rate": 0.0001485791432134653, + "loss": 0.9358, + "step": 7619 + }, + { + "epoch": 1.3566595441595442, + "grad_norm": 0.7068707346916199, + "learning_rate": 0.00014856690790048777, + "loss": 0.8325, + "step": 7620 + }, + { + "epoch": 1.356837606837607, + "grad_norm": 0.8465402722358704, + "learning_rate": 0.00014855467163594257, + "loss": 1.0047, + "step": 7621 + }, + { + "epoch": 1.3570156695156697, + "grad_norm": 0.7403460741043091, + "learning_rate": 0.00014854243442006943, + "loss": 1.0907, + "step": 7622 + }, + { + "epoch": 1.3571937321937322, + "grad_norm": 0.6939566135406494, + "learning_rate": 0.00014853019625310813, + "loss": 0.9156, 
+ "step": 7623 + }, + { + "epoch": 1.357371794871795, + "grad_norm": 0.6425924897193909, + "learning_rate": 0.0001485179571352984, + "loss": 0.8156, + "step": 7624 + }, + { + "epoch": 1.3575498575498575, + "grad_norm": 0.7091902494430542, + "learning_rate": 0.00014850571706688013, + "loss": 1.0483, + "step": 7625 + }, + { + "epoch": 1.3577279202279202, + "grad_norm": 0.663342297077179, + "learning_rate": 0.00014849347604809312, + "loss": 1.0405, + "step": 7626 + }, + { + "epoch": 1.357905982905983, + "grad_norm": 0.6727671027183533, + "learning_rate": 0.00014848123407917716, + "loss": 1.0389, + "step": 7627 + }, + { + "epoch": 1.3580840455840457, + "grad_norm": 0.6572692394256592, + "learning_rate": 0.0001484689911603721, + "loss": 1.0489, + "step": 7628 + }, + { + "epoch": 1.3582621082621082, + "grad_norm": 0.7629066109657288, + "learning_rate": 0.0001484567472919179, + "loss": 1.0372, + "step": 7629 + }, + { + "epoch": 1.358440170940171, + "grad_norm": 0.7848913669586182, + "learning_rate": 0.00014844450247405435, + "loss": 0.9437, + "step": 7630 + }, + { + "epoch": 1.3586182336182335, + "grad_norm": 0.715949535369873, + "learning_rate": 0.00014843225670702143, + "loss": 1.1949, + "step": 7631 + }, + { + "epoch": 1.3587962962962963, + "grad_norm": 0.6498245596885681, + "learning_rate": 0.00014842000999105905, + "loss": 0.8845, + "step": 7632 + }, + { + "epoch": 1.358974358974359, + "grad_norm": 0.7251074910163879, + "learning_rate": 0.00014840776232640716, + "loss": 1.093, + "step": 7633 + }, + { + "epoch": 1.3591524216524218, + "grad_norm": 0.6223580837249756, + "learning_rate": 0.0001483955137133057, + "loss": 1.0344, + "step": 7634 + }, + { + "epoch": 1.3593304843304843, + "grad_norm": 0.6504943370819092, + "learning_rate": 0.00014838326415199472, + "loss": 1.109, + "step": 7635 + }, + { + "epoch": 1.359508547008547, + "grad_norm": 0.5912374258041382, + "learning_rate": 0.00014837101364271416, + "loss": 1.0756, + "step": 7636 + }, + { + "epoch": 
1.3596866096866096, + "grad_norm": 0.6116467714309692, + "learning_rate": 0.00014835876218570408, + "loss": 0.7871, + "step": 7637 + }, + { + "epoch": 1.3598646723646723, + "grad_norm": 0.7013412117958069, + "learning_rate": 0.0001483465097812045, + "loss": 1.0003, + "step": 7638 + }, + { + "epoch": 1.360042735042735, + "grad_norm": 0.5930750370025635, + "learning_rate": 0.00014833425642945552, + "loss": 0.9926, + "step": 7639 + }, + { + "epoch": 1.3602207977207978, + "grad_norm": 0.732955276966095, + "learning_rate": 0.00014832200213069717, + "loss": 1.2801, + "step": 7640 + }, + { + "epoch": 1.3603988603988604, + "grad_norm": 0.6836149096488953, + "learning_rate": 0.00014830974688516958, + "loss": 0.9292, + "step": 7641 + }, + { + "epoch": 1.3605769230769231, + "grad_norm": 0.6531919836997986, + "learning_rate": 0.00014829749069311283, + "loss": 0.9551, + "step": 7642 + }, + { + "epoch": 1.3607549857549857, + "grad_norm": 0.719093382358551, + "learning_rate": 0.0001482852335547671, + "loss": 0.8588, + "step": 7643 + }, + { + "epoch": 1.3609330484330484, + "grad_norm": 0.6144105792045593, + "learning_rate": 0.00014827297547037252, + "loss": 0.9033, + "step": 7644 + }, + { + "epoch": 1.3611111111111112, + "grad_norm": 0.789241373538971, + "learning_rate": 0.00014826071644016926, + "loss": 1.1916, + "step": 7645 + }, + { + "epoch": 1.361289173789174, + "grad_norm": 0.6137418746948242, + "learning_rate": 0.0001482484564643975, + "loss": 0.9648, + "step": 7646 + }, + { + "epoch": 1.3614672364672364, + "grad_norm": 0.6789261698722839, + "learning_rate": 0.00014823619554329745, + "loss": 0.829, + "step": 7647 + }, + { + "epoch": 1.3616452991452992, + "grad_norm": 0.6508790254592896, + "learning_rate": 0.0001482239336771094, + "loss": 0.942, + "step": 7648 + }, + { + "epoch": 1.3618233618233617, + "grad_norm": 0.6725571751594543, + "learning_rate": 0.00014821167086607353, + "loss": 0.8884, + "step": 7649 + }, + { + "epoch": 1.3620014245014245, + "grad_norm": 
0.6252003908157349, + "learning_rate": 0.00014819940711043012, + "loss": 0.9778, + "step": 7650 + }, + { + "epoch": 1.3621794871794872, + "grad_norm": 0.6950626969337463, + "learning_rate": 0.00014818714241041943, + "loss": 1.2104, + "step": 7651 + }, + { + "epoch": 1.36235754985755, + "grad_norm": 0.6527379155158997, + "learning_rate": 0.0001481748767662818, + "loss": 0.7845, + "step": 7652 + }, + { + "epoch": 1.3625356125356125, + "grad_norm": 0.7438235282897949, + "learning_rate": 0.00014816261017825755, + "loss": 0.9513, + "step": 7653 + }, + { + "epoch": 1.3627136752136753, + "grad_norm": 0.6412696838378906, + "learning_rate": 0.000148150342646587, + "loss": 0.8478, + "step": 7654 + }, + { + "epoch": 1.3628917378917378, + "grad_norm": 0.658481240272522, + "learning_rate": 0.00014813807417151046, + "loss": 0.6816, + "step": 7655 + }, + { + "epoch": 1.3630698005698005, + "grad_norm": 0.6170126795768738, + "learning_rate": 0.0001481258047532684, + "loss": 0.8862, + "step": 7656 + }, + { + "epoch": 1.3632478632478633, + "grad_norm": 0.7049173712730408, + "learning_rate": 0.0001481135343921012, + "loss": 1.0027, + "step": 7657 + }, + { + "epoch": 1.363425925925926, + "grad_norm": 0.7780741453170776, + "learning_rate": 0.0001481012630882492, + "loss": 1.0183, + "step": 7658 + }, + { + "epoch": 1.3636039886039886, + "grad_norm": 0.6658362746238708, + "learning_rate": 0.00014808899084195286, + "loss": 0.878, + "step": 7659 + }, + { + "epoch": 1.3637820512820513, + "grad_norm": 0.7192076444625854, + "learning_rate": 0.00014807671765345267, + "loss": 1.2269, + "step": 7660 + }, + { + "epoch": 1.3639601139601139, + "grad_norm": 0.7038660049438477, + "learning_rate": 0.00014806444352298903, + "loss": 0.889, + "step": 7661 + }, + { + "epoch": 1.3641381766381766, + "grad_norm": 0.622803270816803, + "learning_rate": 0.00014805216845080249, + "loss": 0.9623, + "step": 7662 + }, + { + "epoch": 1.3643162393162394, + "grad_norm": 0.9157076478004456, + "learning_rate": 
0.00014803989243713353, + "loss": 1.106, + "step": 7663 + }, + { + "epoch": 1.364494301994302, + "grad_norm": 0.6369999647140503, + "learning_rate": 0.00014802761548222268, + "loss": 0.9755, + "step": 7664 + }, + { + "epoch": 1.3646723646723646, + "grad_norm": 0.8318394422531128, + "learning_rate": 0.00014801533758631045, + "loss": 1.1786, + "step": 7665 + }, + { + "epoch": 1.3648504273504274, + "grad_norm": 0.7065796852111816, + "learning_rate": 0.00014800305874963744, + "loss": 1.2066, + "step": 7666 + }, + { + "epoch": 1.36502849002849, + "grad_norm": 0.6570265293121338, + "learning_rate": 0.0001479907789724442, + "loss": 1.0084, + "step": 7667 + }, + { + "epoch": 1.3652065527065527, + "grad_norm": 0.637321949005127, + "learning_rate": 0.00014797849825497135, + "loss": 0.9075, + "step": 7668 + }, + { + "epoch": 1.3653846153846154, + "grad_norm": 0.7656470537185669, + "learning_rate": 0.00014796621659745948, + "loss": 1.1497, + "step": 7669 + }, + { + "epoch": 1.3655626780626782, + "grad_norm": 0.6798120737075806, + "learning_rate": 0.0001479539340001493, + "loss": 0.8154, + "step": 7670 + }, + { + "epoch": 1.3657407407407407, + "grad_norm": 0.7004328966140747, + "learning_rate": 0.0001479416504632813, + "loss": 1.0513, + "step": 7671 + }, + { + "epoch": 1.3659188034188035, + "grad_norm": 0.6551713943481445, + "learning_rate": 0.0001479293659870963, + "loss": 0.8735, + "step": 7672 + }, + { + "epoch": 1.366096866096866, + "grad_norm": 0.7685719132423401, + "learning_rate": 0.00014791708057183494, + "loss": 1.111, + "step": 7673 + }, + { + "epoch": 1.3662749287749287, + "grad_norm": 0.673624575138092, + "learning_rate": 0.0001479047942177379, + "loss": 0.9418, + "step": 7674 + }, + { + "epoch": 1.3664529914529915, + "grad_norm": 0.6281047463417053, + "learning_rate": 0.00014789250692504597, + "loss": 1.0938, + "step": 7675 + }, + { + "epoch": 1.3666310541310542, + "grad_norm": 0.5846312642097473, + "learning_rate": 0.0001478802186939998, + "loss": 0.6352, + 
"step": 7676 + }, + { + "epoch": 1.3668091168091168, + "grad_norm": 0.7037251591682434, + "learning_rate": 0.00014786792952484025, + "loss": 1.1775, + "step": 7677 + }, + { + "epoch": 1.3669871794871795, + "grad_norm": 0.69822758436203, + "learning_rate": 0.00014785563941780808, + "loss": 1.0877, + "step": 7678 + }, + { + "epoch": 1.367165242165242, + "grad_norm": 0.7229313254356384, + "learning_rate": 0.000147843348373144, + "loss": 1.0305, + "step": 7679 + }, + { + "epoch": 1.3673433048433048, + "grad_norm": 0.665771484375, + "learning_rate": 0.00014783105639108897, + "loss": 0.9056, + "step": 7680 + }, + { + "epoch": 1.3675213675213675, + "grad_norm": 0.6418357491493225, + "learning_rate": 0.00014781876347188367, + "loss": 0.9374, + "step": 7681 + }, + { + "epoch": 1.3676994301994303, + "grad_norm": 0.7255483269691467, + "learning_rate": 0.0001478064696157691, + "loss": 0.8533, + "step": 7682 + }, + { + "epoch": 1.3678774928774928, + "grad_norm": 0.668064534664154, + "learning_rate": 0.00014779417482298603, + "loss": 0.9002, + "step": 7683 + }, + { + "epoch": 1.3680555555555556, + "grad_norm": 0.6797603368759155, + "learning_rate": 0.0001477818790937754, + "loss": 0.9733, + "step": 7684 + }, + { + "epoch": 1.368233618233618, + "grad_norm": 0.6905350685119629, + "learning_rate": 0.0001477695824283781, + "loss": 0.7985, + "step": 7685 + }, + { + "epoch": 1.3684116809116809, + "grad_norm": 0.6846137046813965, + "learning_rate": 0.00014775728482703507, + "loss": 0.9154, + "step": 7686 + }, + { + "epoch": 1.3685897435897436, + "grad_norm": 0.6686832904815674, + "learning_rate": 0.00014774498628998726, + "loss": 0.926, + "step": 7687 + }, + { + "epoch": 1.3687678062678064, + "grad_norm": 0.7050234079360962, + "learning_rate": 0.00014773268681747561, + "loss": 0.9386, + "step": 7688 + }, + { + "epoch": 1.368945868945869, + "grad_norm": 0.7048354744911194, + "learning_rate": 0.00014772038640974112, + "loss": 1.1483, + "step": 7689 + }, + { + "epoch": 1.3691239316239316, 
+ "grad_norm": 0.698192298412323, + "learning_rate": 0.0001477080850670248, + "loss": 1.1452, + "step": 7690 + }, + { + "epoch": 1.3693019943019942, + "grad_norm": 0.6838962435722351, + "learning_rate": 0.00014769578278956766, + "loss": 0.9789, + "step": 7691 + }, + { + "epoch": 1.369480056980057, + "grad_norm": 0.6636955142021179, + "learning_rate": 0.00014768347957761074, + "loss": 0.931, + "step": 7692 + }, + { + "epoch": 1.3696581196581197, + "grad_norm": 0.706030547618866, + "learning_rate": 0.0001476711754313951, + "loss": 1.1096, + "step": 7693 + }, + { + "epoch": 1.3698361823361824, + "grad_norm": 0.6771288514137268, + "learning_rate": 0.00014765887035116178, + "loss": 0.9641, + "step": 7694 + }, + { + "epoch": 1.370014245014245, + "grad_norm": 0.6805008053779602, + "learning_rate": 0.00014764656433715188, + "loss": 0.8724, + "step": 7695 + }, + { + "epoch": 1.3701923076923077, + "grad_norm": 0.6599233746528625, + "learning_rate": 0.00014763425738960657, + "loss": 0.8477, + "step": 7696 + }, + { + "epoch": 1.3703703703703702, + "grad_norm": 0.7036116123199463, + "learning_rate": 0.0001476219495087669, + "loss": 1.0991, + "step": 7697 + }, + { + "epoch": 1.370548433048433, + "grad_norm": 0.6677989363670349, + "learning_rate": 0.0001476096406948741, + "loss": 1.2397, + "step": 7698 + }, + { + "epoch": 1.3707264957264957, + "grad_norm": 0.5652269721031189, + "learning_rate": 0.00014759733094816928, + "loss": 0.9302, + "step": 7699 + }, + { + "epoch": 1.3709045584045585, + "grad_norm": 0.6670156121253967, + "learning_rate": 0.00014758502026889362, + "loss": 0.8362, + "step": 7700 + }, + { + "epoch": 1.371082621082621, + "grad_norm": 0.6705406904220581, + "learning_rate": 0.00014757270865728832, + "loss": 0.876, + "step": 7701 + }, + { + "epoch": 1.3712606837606838, + "grad_norm": 0.6020053625106812, + "learning_rate": 0.00014756039611359465, + "loss": 0.9182, + "step": 7702 + }, + { + "epoch": 1.3714387464387463, + "grad_norm": 0.6370134949684143, + 
"learning_rate": 0.0001475480826380538, + "loss": 1.1063, + "step": 7703 + }, + { + "epoch": 1.371616809116809, + "grad_norm": 0.6906460523605347, + "learning_rate": 0.00014753576823090705, + "loss": 0.988, + "step": 7704 + }, + { + "epoch": 1.3717948717948718, + "grad_norm": 0.6047569513320923, + "learning_rate": 0.00014752345289239567, + "loss": 1.15, + "step": 7705 + }, + { + "epoch": 1.3719729344729346, + "grad_norm": 0.7019868493080139, + "learning_rate": 0.00014751113662276095, + "loss": 1.1185, + "step": 7706 + }, + { + "epoch": 1.372150997150997, + "grad_norm": 0.6534035801887512, + "learning_rate": 0.00014749881942224417, + "loss": 0.9006, + "step": 7707 + }, + { + "epoch": 1.3723290598290598, + "grad_norm": 0.6111651659011841, + "learning_rate": 0.00014748650129108674, + "loss": 0.935, + "step": 7708 + }, + { + "epoch": 1.3725071225071226, + "grad_norm": 0.6678512096405029, + "learning_rate": 0.00014747418222952995, + "loss": 0.8771, + "step": 7709 + }, + { + "epoch": 1.3726851851851851, + "grad_norm": 0.607829749584198, + "learning_rate": 0.00014746186223781518, + "loss": 1.0509, + "step": 7710 + }, + { + "epoch": 1.3728632478632479, + "grad_norm": 0.7274412512779236, + "learning_rate": 0.00014744954131618382, + "loss": 0.9545, + "step": 7711 + }, + { + "epoch": 1.3730413105413106, + "grad_norm": 0.640333354473114, + "learning_rate": 0.00014743721946487723, + "loss": 1.018, + "step": 7712 + }, + { + "epoch": 1.3732193732193732, + "grad_norm": 0.6772079467773438, + "learning_rate": 0.0001474248966841369, + "loss": 1.0983, + "step": 7713 + }, + { + "epoch": 1.373397435897436, + "grad_norm": 0.49630534648895264, + "learning_rate": 0.00014741257297420422, + "loss": 0.5238, + "step": 7714 + }, + { + "epoch": 1.3735754985754987, + "grad_norm": 0.6316596269607544, + "learning_rate": 0.00014740024833532068, + "loss": 1.1342, + "step": 7715 + }, + { + "epoch": 1.3737535612535612, + "grad_norm": 0.5928404331207275, + "learning_rate": 0.00014738792276772775, + 
"loss": 0.7987, + "step": 7716 + }, + { + "epoch": 1.373931623931624, + "grad_norm": 0.6773418188095093, + "learning_rate": 0.00014737559627166688, + "loss": 0.934, + "step": 7717 + }, + { + "epoch": 1.3741096866096867, + "grad_norm": 0.7895028591156006, + "learning_rate": 0.00014736326884737963, + "loss": 0.984, + "step": 7718 + }, + { + "epoch": 1.3742877492877492, + "grad_norm": 0.7074753046035767, + "learning_rate": 0.00014735094049510752, + "loss": 1.0093, + "step": 7719 + }, + { + "epoch": 1.374465811965812, + "grad_norm": 0.5389847159385681, + "learning_rate": 0.00014733861121509208, + "loss": 0.8138, + "step": 7720 + }, + { + "epoch": 1.3746438746438747, + "grad_norm": 0.6138495206832886, + "learning_rate": 0.00014732628100757493, + "loss": 0.9282, + "step": 7721 + }, + { + "epoch": 1.3748219373219372, + "grad_norm": 0.7609560489654541, + "learning_rate": 0.00014731394987279757, + "loss": 0.9859, + "step": 7722 + }, + { + "epoch": 1.375, + "grad_norm": 0.6806198954582214, + "learning_rate": 0.00014730161781100165, + "loss": 0.8932, + "step": 7723 + }, + { + "epoch": 1.3751780626780628, + "grad_norm": 0.7229103446006775, + "learning_rate": 0.0001472892848224288, + "loss": 0.956, + "step": 7724 + }, + { + "epoch": 1.3753561253561253, + "grad_norm": 0.6157994866371155, + "learning_rate": 0.00014727695090732066, + "loss": 1.0285, + "step": 7725 + }, + { + "epoch": 1.375534188034188, + "grad_norm": 0.5885980129241943, + "learning_rate": 0.00014726461606591885, + "loss": 0.9174, + "step": 7726 + }, + { + "epoch": 1.3757122507122508, + "grad_norm": 0.6655769944190979, + "learning_rate": 0.0001472522802984651, + "loss": 0.9059, + "step": 7727 + }, + { + "epoch": 1.3758903133903133, + "grad_norm": 0.7075541019439697, + "learning_rate": 0.00014723994360520105, + "loss": 1.0055, + "step": 7728 + }, + { + "epoch": 1.376068376068376, + "grad_norm": 0.6947159171104431, + "learning_rate": 0.00014722760598636847, + "loss": 0.9782, + "step": 7729 + }, + { + "epoch": 
1.3762464387464388, + "grad_norm": 0.6629964709281921, + "learning_rate": 0.00014721526744220905, + "loss": 0.9427, + "step": 7730 + }, + { + "epoch": 1.3764245014245013, + "grad_norm": 0.7385284304618835, + "learning_rate": 0.00014720292797296453, + "loss": 0.9953, + "step": 7731 + }, + { + "epoch": 1.376602564102564, + "grad_norm": 0.6123563051223755, + "learning_rate": 0.0001471905875788767, + "loss": 1.0103, + "step": 7732 + }, + { + "epoch": 1.3767806267806268, + "grad_norm": 0.6457047462463379, + "learning_rate": 0.00014717824626018732, + "loss": 0.9779, + "step": 7733 + }, + { + "epoch": 1.3769586894586894, + "grad_norm": 0.6196442246437073, + "learning_rate": 0.00014716590401713824, + "loss": 0.8747, + "step": 7734 + }, + { + "epoch": 1.3771367521367521, + "grad_norm": 0.7932298183441162, + "learning_rate": 0.00014715356084997122, + "loss": 1.1617, + "step": 7735 + }, + { + "epoch": 1.3773148148148149, + "grad_norm": 0.787304699420929, + "learning_rate": 0.00014714121675892815, + "loss": 1.1383, + "step": 7736 + }, + { + "epoch": 1.3774928774928774, + "grad_norm": 0.672795295715332, + "learning_rate": 0.00014712887174425085, + "loss": 1.2563, + "step": 7737 + }, + { + "epoch": 1.3776709401709402, + "grad_norm": 0.6505744457244873, + "learning_rate": 0.00014711652580618123, + "loss": 0.9194, + "step": 7738 + }, + { + "epoch": 1.377849002849003, + "grad_norm": 0.8141193985939026, + "learning_rate": 0.00014710417894496115, + "loss": 1.1428, + "step": 7739 + }, + { + "epoch": 1.3780270655270654, + "grad_norm": 0.6269707679748535, + "learning_rate": 0.00014709183116083253, + "loss": 0.7164, + "step": 7740 + }, + { + "epoch": 1.3782051282051282, + "grad_norm": 0.6737076640129089, + "learning_rate": 0.0001470794824540373, + "loss": 0.9965, + "step": 7741 + }, + { + "epoch": 1.378383190883191, + "grad_norm": 0.6451728343963623, + "learning_rate": 0.0001470671328248174, + "loss": 1.0539, + "step": 7742 + }, + { + "epoch": 1.3785612535612537, + "grad_norm": 
0.6480295062065125, + "learning_rate": 0.00014705478227341486, + "loss": 0.9118, + "step": 7743 + }, + { + "epoch": 1.3787393162393162, + "grad_norm": 0.7429090738296509, + "learning_rate": 0.00014704243080007154, + "loss": 1.0031, + "step": 7744 + }, + { + "epoch": 1.378917378917379, + "grad_norm": 0.5601376891136169, + "learning_rate": 0.00014703007840502955, + "loss": 0.849, + "step": 7745 + }, + { + "epoch": 1.3790954415954415, + "grad_norm": 0.7067657113075256, + "learning_rate": 0.00014701772508853088, + "loss": 1.3067, + "step": 7746 + }, + { + "epoch": 1.3792735042735043, + "grad_norm": 0.7016390562057495, + "learning_rate": 0.00014700537085081755, + "loss": 1.0236, + "step": 7747 + }, + { + "epoch": 1.379451566951567, + "grad_norm": 0.6505000591278076, + "learning_rate": 0.0001469930156921316, + "loss": 1.0121, + "step": 7748 + }, + { + "epoch": 1.3796296296296298, + "grad_norm": 0.8515380620956421, + "learning_rate": 0.00014698065961271512, + "loss": 1.0413, + "step": 7749 + }, + { + "epoch": 1.3798076923076923, + "grad_norm": 0.6322008371353149, + "learning_rate": 0.00014696830261281025, + "loss": 0.8306, + "step": 7750 + }, + { + "epoch": 1.379985754985755, + "grad_norm": 0.7090431451797485, + "learning_rate": 0.00014695594469265902, + "loss": 1.1829, + "step": 7751 + }, + { + "epoch": 1.3801638176638176, + "grad_norm": 0.5913167595863342, + "learning_rate": 0.00014694358585250363, + "loss": 0.9769, + "step": 7752 + }, + { + "epoch": 1.3803418803418803, + "grad_norm": 0.7345432639122009, + "learning_rate": 0.00014693122609258616, + "loss": 0.9928, + "step": 7753 + }, + { + "epoch": 1.380519943019943, + "grad_norm": 0.6158214211463928, + "learning_rate": 0.00014691886541314884, + "loss": 1.1166, + "step": 7754 + }, + { + "epoch": 1.3806980056980058, + "grad_norm": 0.6874041557312012, + "learning_rate": 0.0001469065038144338, + "loss": 1.0808, + "step": 7755 + }, + { + "epoch": 1.3808760683760684, + "grad_norm": 0.8135195970535278, + "learning_rate": 
0.00014689414129668326, + "loss": 0.9482, + "step": 7756 + }, + { + "epoch": 1.381054131054131, + "grad_norm": 0.6389174461364746, + "learning_rate": 0.00014688177786013944, + "loss": 1.039, + "step": 7757 + }, + { + "epoch": 1.3812321937321936, + "grad_norm": 0.6953016519546509, + "learning_rate": 0.00014686941350504454, + "loss": 0.9426, + "step": 7758 + }, + { + "epoch": 1.3814102564102564, + "grad_norm": 0.8171859383583069, + "learning_rate": 0.00014685704823164087, + "loss": 1.0393, + "step": 7759 + }, + { + "epoch": 1.3815883190883191, + "grad_norm": 0.6968414783477783, + "learning_rate": 0.0001468446820401707, + "loss": 1.1167, + "step": 7760 + }, + { + "epoch": 1.381766381766382, + "grad_norm": 0.6916623711585999, + "learning_rate": 0.00014683231493087628, + "loss": 1.1886, + "step": 7761 + }, + { + "epoch": 1.3819444444444444, + "grad_norm": 0.7351683378219604, + "learning_rate": 0.00014681994690399992, + "loss": 0.9893, + "step": 7762 + }, + { + "epoch": 1.3821225071225072, + "grad_norm": 0.6617491245269775, + "learning_rate": 0.00014680757795978395, + "loss": 1.0505, + "step": 7763 + }, + { + "epoch": 1.3823005698005697, + "grad_norm": 0.6627485156059265, + "learning_rate": 0.00014679520809847074, + "loss": 0.9878, + "step": 7764 + }, + { + "epoch": 1.3824786324786325, + "grad_norm": 0.704636812210083, + "learning_rate": 0.00014678283732030264, + "loss": 0.8332, + "step": 7765 + }, + { + "epoch": 1.3826566951566952, + "grad_norm": 0.698853075504303, + "learning_rate": 0.00014677046562552203, + "loss": 1.0926, + "step": 7766 + }, + { + "epoch": 1.382834757834758, + "grad_norm": 0.6695869565010071, + "learning_rate": 0.0001467580930143713, + "loss": 1.0626, + "step": 7767 + }, + { + "epoch": 1.3830128205128205, + "grad_norm": 0.672173023223877, + "learning_rate": 0.00014674571948709286, + "loss": 0.8842, + "step": 7768 + }, + { + "epoch": 1.3831908831908832, + "grad_norm": 0.6735473871231079, + "learning_rate": 0.00014673334504392916, + "loss": 0.9382, + 
"step": 7769 + }, + { + "epoch": 1.3833689458689458, + "grad_norm": 0.6864013075828552, + "learning_rate": 0.00014672096968512265, + "loss": 1.1369, + "step": 7770 + }, + { + "epoch": 1.3835470085470085, + "grad_norm": 0.7154954075813293, + "learning_rate": 0.0001467085934109158, + "loss": 1.1447, + "step": 7771 + }, + { + "epoch": 1.3837250712250713, + "grad_norm": 0.5934487581253052, + "learning_rate": 0.0001466962162215511, + "loss": 0.8923, + "step": 7772 + }, + { + "epoch": 1.383903133903134, + "grad_norm": 0.8116832971572876, + "learning_rate": 0.00014668383811727097, + "loss": 1.0997, + "step": 7773 + }, + { + "epoch": 1.3840811965811965, + "grad_norm": 0.8661674857139587, + "learning_rate": 0.00014667145909831808, + "loss": 1.0112, + "step": 7774 + }, + { + "epoch": 1.3842592592592593, + "grad_norm": 0.5173856616020203, + "learning_rate": 0.00014665907916493488, + "loss": 0.6571, + "step": 7775 + }, + { + "epoch": 1.3844373219373218, + "grad_norm": 0.6165067553520203, + "learning_rate": 0.00014664669831736395, + "loss": 1.0992, + "step": 7776 + }, + { + "epoch": 1.3846153846153846, + "grad_norm": 0.6564429998397827, + "learning_rate": 0.00014663431655584787, + "loss": 0.9103, + "step": 7777 + }, + { + "epoch": 1.3847934472934473, + "grad_norm": 0.7162124514579773, + "learning_rate": 0.00014662193388062923, + "loss": 1.0645, + "step": 7778 + }, + { + "epoch": 1.38497150997151, + "grad_norm": 0.6391215920448303, + "learning_rate": 0.00014660955029195064, + "loss": 0.902, + "step": 7779 + }, + { + "epoch": 1.3851495726495726, + "grad_norm": 0.6876635551452637, + "learning_rate": 0.00014659716579005475, + "loss": 1.0924, + "step": 7780 + }, + { + "epoch": 1.3853276353276354, + "grad_norm": 0.7254653573036194, + "learning_rate": 0.00014658478037518418, + "loss": 1.0135, + "step": 7781 + }, + { + "epoch": 1.385505698005698, + "grad_norm": 0.6900535225868225, + "learning_rate": 0.00014657239404758162, + "loss": 0.983, + "step": 7782 + }, + { + "epoch": 
1.3856837606837606, + "grad_norm": 0.7477042078971863, + "learning_rate": 0.00014656000680748975, + "loss": 1.0707, + "step": 7783 + }, + { + "epoch": 1.3858618233618234, + "grad_norm": 0.5756927132606506, + "learning_rate": 0.00014654761865515124, + "loss": 0.8881, + "step": 7784 + }, + { + "epoch": 1.3860398860398861, + "grad_norm": 0.6736083626747131, + "learning_rate": 0.00014653522959080884, + "loss": 1.0193, + "step": 7785 + }, + { + "epoch": 1.3862179487179487, + "grad_norm": 0.616179883480072, + "learning_rate": 0.0001465228396147053, + "loss": 0.8676, + "step": 7786 + }, + { + "epoch": 1.3863960113960114, + "grad_norm": 0.7956456542015076, + "learning_rate": 0.00014651044872708338, + "loss": 0.9787, + "step": 7787 + }, + { + "epoch": 1.386574074074074, + "grad_norm": 0.6613463163375854, + "learning_rate": 0.00014649805692818578, + "loss": 1.0032, + "step": 7788 + }, + { + "epoch": 1.3867521367521367, + "grad_norm": 0.6215800642967224, + "learning_rate": 0.0001464856642182554, + "loss": 1.0123, + "step": 7789 + }, + { + "epoch": 1.3869301994301995, + "grad_norm": 0.6701171398162842, + "learning_rate": 0.00014647327059753496, + "loss": 0.9108, + "step": 7790 + }, + { + "epoch": 1.3871082621082622, + "grad_norm": 0.6213465929031372, + "learning_rate": 0.00014646087606626736, + "loss": 0.9313, + "step": 7791 + }, + { + "epoch": 1.3872863247863247, + "grad_norm": 0.7535304427146912, + "learning_rate": 0.00014644848062469535, + "loss": 1.0813, + "step": 7792 + }, + { + "epoch": 1.3874643874643875, + "grad_norm": 0.6778230667114258, + "learning_rate": 0.0001464360842730619, + "loss": 1.0405, + "step": 7793 + }, + { + "epoch": 1.38764245014245, + "grad_norm": 0.7816025614738464, + "learning_rate": 0.0001464236870116098, + "loss": 0.9228, + "step": 7794 + }, + { + "epoch": 1.3878205128205128, + "grad_norm": 0.6815229058265686, + "learning_rate": 0.00014641128884058203, + "loss": 0.9607, + "step": 7795 + }, + { + "epoch": 1.3879985754985755, + "grad_norm": 
0.7027714848518372, + "learning_rate": 0.00014639888976022145, + "loss": 0.9379, + "step": 7796 + }, + { + "epoch": 1.3881766381766383, + "grad_norm": 0.7636353373527527, + "learning_rate": 0.00014638648977077104, + "loss": 1.1186, + "step": 7797 + }, + { + "epoch": 1.3883547008547008, + "grad_norm": 0.6732974052429199, + "learning_rate": 0.00014637408887247365, + "loss": 1.1378, + "step": 7798 + }, + { + "epoch": 1.3885327635327636, + "grad_norm": 0.7539397478103638, + "learning_rate": 0.0001463616870655724, + "loss": 0.999, + "step": 7799 + }, + { + "epoch": 1.388710826210826, + "grad_norm": 0.6872972846031189, + "learning_rate": 0.00014634928435031013, + "loss": 0.9564, + "step": 7800 + }, + { + "epoch": 1.3888888888888888, + "grad_norm": 0.6823115348815918, + "learning_rate": 0.00014633688072693, + "loss": 0.9745, + "step": 7801 + }, + { + "epoch": 1.3890669515669516, + "grad_norm": 0.6462571620941162, + "learning_rate": 0.00014632447619567488, + "loss": 0.8314, + "step": 7802 + }, + { + "epoch": 1.3892450142450143, + "grad_norm": 0.7245402932167053, + "learning_rate": 0.0001463120707567879, + "loss": 0.8291, + "step": 7803 + }, + { + "epoch": 1.3894230769230769, + "grad_norm": 0.697179913520813, + "learning_rate": 0.00014629966441051208, + "loss": 1.017, + "step": 7804 + }, + { + "epoch": 1.3896011396011396, + "grad_norm": 0.6304250359535217, + "learning_rate": 0.00014628725715709053, + "loss": 0.9262, + "step": 7805 + }, + { + "epoch": 1.3897792022792022, + "grad_norm": 0.5780240297317505, + "learning_rate": 0.00014627484899676634, + "loss": 0.6596, + "step": 7806 + }, + { + "epoch": 1.389957264957265, + "grad_norm": 0.8030684590339661, + "learning_rate": 0.0001462624399297826, + "loss": 0.9977, + "step": 7807 + }, + { + "epoch": 1.3901353276353277, + "grad_norm": 0.7999774813652039, + "learning_rate": 0.00014625002995638246, + "loss": 1.1036, + "step": 7808 + }, + { + "epoch": 1.3903133903133904, + "grad_norm": 0.7054862976074219, + "learning_rate": 
0.00014623761907680904, + "loss": 1.1435, + "step": 7809 + }, + { + "epoch": 1.390491452991453, + "grad_norm": 0.6660647392272949, + "learning_rate": 0.00014622520729130556, + "loss": 0.703, + "step": 7810 + }, + { + "epoch": 1.3906695156695157, + "grad_norm": 0.6339690089225769, + "learning_rate": 0.00014621279460011515, + "loss": 1.0451, + "step": 7811 + }, + { + "epoch": 1.3908475783475782, + "grad_norm": 0.8568736910820007, + "learning_rate": 0.00014620038100348102, + "loss": 1.009, + "step": 7812 + }, + { + "epoch": 1.391025641025641, + "grad_norm": 0.7126797437667847, + "learning_rate": 0.00014618796650164642, + "loss": 0.9592, + "step": 7813 + }, + { + "epoch": 1.3912037037037037, + "grad_norm": 0.6768994331359863, + "learning_rate": 0.00014617555109485453, + "loss": 1.09, + "step": 7814 + }, + { + "epoch": 1.3913817663817665, + "grad_norm": 0.7609471678733826, + "learning_rate": 0.00014616313478334864, + "loss": 0.9781, + "step": 7815 + }, + { + "epoch": 1.391559829059829, + "grad_norm": 0.7107006907463074, + "learning_rate": 0.00014615071756737203, + "loss": 0.9769, + "step": 7816 + }, + { + "epoch": 1.3917378917378918, + "grad_norm": 0.6324763894081116, + "learning_rate": 0.00014613829944716802, + "loss": 1.089, + "step": 7817 + }, + { + "epoch": 1.3919159544159543, + "grad_norm": 0.6617186069488525, + "learning_rate": 0.00014612588042297984, + "loss": 1.0466, + "step": 7818 + }, + { + "epoch": 1.392094017094017, + "grad_norm": 0.7881436944007874, + "learning_rate": 0.00014611346049505083, + "loss": 1.003, + "step": 7819 + }, + { + "epoch": 1.3922720797720798, + "grad_norm": 0.7391049861907959, + "learning_rate": 0.00014610103966362437, + "loss": 1.0531, + "step": 7820 + }, + { + "epoch": 1.3924501424501425, + "grad_norm": 0.6299472451210022, + "learning_rate": 0.00014608861792894383, + "loss": 0.8433, + "step": 7821 + }, + { + "epoch": 1.392628205128205, + "grad_norm": 0.6053452491760254, + "learning_rate": 0.00014607619529125255, + "loss": 0.7945, + 
"step": 7822 + }, + { + "epoch": 1.3928062678062678, + "grad_norm": 0.7160114645957947, + "learning_rate": 0.0001460637717507939, + "loss": 1.1604, + "step": 7823 + }, + { + "epoch": 1.3929843304843303, + "grad_norm": 0.6308854222297668, + "learning_rate": 0.00014605134730781135, + "loss": 1.0918, + "step": 7824 + }, + { + "epoch": 1.393162393162393, + "grad_norm": 0.7187000513076782, + "learning_rate": 0.00014603892196254833, + "loss": 1.0594, + "step": 7825 + }, + { + "epoch": 1.3933404558404558, + "grad_norm": 0.7516581416130066, + "learning_rate": 0.00014602649571524826, + "loss": 0.9222, + "step": 7826 + }, + { + "epoch": 1.3935185185185186, + "grad_norm": 0.6340481638908386, + "learning_rate": 0.00014601406856615463, + "loss": 0.8131, + "step": 7827 + }, + { + "epoch": 1.3936965811965811, + "grad_norm": 0.8161744475364685, + "learning_rate": 0.0001460016405155109, + "loss": 0.8695, + "step": 7828 + }, + { + "epoch": 1.3938746438746439, + "grad_norm": 0.6926971077919006, + "learning_rate": 0.0001459892115635606, + "loss": 0.9548, + "step": 7829 + }, + { + "epoch": 1.3940527065527066, + "grad_norm": 0.6669796109199524, + "learning_rate": 0.0001459767817105472, + "loss": 0.9255, + "step": 7830 + }, + { + "epoch": 1.3942307692307692, + "grad_norm": 0.6626184582710266, + "learning_rate": 0.00014596435095671432, + "loss": 1.1141, + "step": 7831 + }, + { + "epoch": 1.394408831908832, + "grad_norm": 0.6755738854408264, + "learning_rate": 0.00014595191930230546, + "loss": 0.9596, + "step": 7832 + }, + { + "epoch": 1.3945868945868947, + "grad_norm": 0.6034863591194153, + "learning_rate": 0.00014593948674756417, + "loss": 0.8088, + "step": 7833 + }, + { + "epoch": 1.3947649572649572, + "grad_norm": 0.5638226866722107, + "learning_rate": 0.00014592705329273406, + "loss": 0.5828, + "step": 7834 + }, + { + "epoch": 1.39494301994302, + "grad_norm": 0.6902222633361816, + "learning_rate": 0.0001459146189380588, + "loss": 0.7954, + "step": 7835 + }, + { + "epoch": 
1.3951210826210827, + "grad_norm": 0.7579947710037231, + "learning_rate": 0.0001459021836837819, + "loss": 1.1301, + "step": 7836 + }, + { + "epoch": 1.3952991452991452, + "grad_norm": 0.6894911527633667, + "learning_rate": 0.00014588974753014712, + "loss": 1.082, + "step": 7837 + }, + { + "epoch": 1.395477207977208, + "grad_norm": 0.6330230832099915, + "learning_rate": 0.000145877310477398, + "loss": 0.7614, + "step": 7838 + }, + { + "epoch": 1.3956552706552707, + "grad_norm": 0.6164960265159607, + "learning_rate": 0.00014586487252577832, + "loss": 0.8981, + "step": 7839 + }, + { + "epoch": 1.3958333333333333, + "grad_norm": 0.6575061678886414, + "learning_rate": 0.0001458524336755317, + "loss": 0.9735, + "step": 7840 + }, + { + "epoch": 1.396011396011396, + "grad_norm": 0.687921941280365, + "learning_rate": 0.00014583999392690195, + "loss": 0.9207, + "step": 7841 + }, + { + "epoch": 1.3961894586894588, + "grad_norm": 0.6175212860107422, + "learning_rate": 0.00014582755328013274, + "loss": 1.0444, + "step": 7842 + }, + { + "epoch": 1.3963675213675213, + "grad_norm": 0.6351733207702637, + "learning_rate": 0.00014581511173546781, + "loss": 1.0143, + "step": 7843 + }, + { + "epoch": 1.396545584045584, + "grad_norm": 0.7235051989555359, + "learning_rate": 0.00014580266929315093, + "loss": 0.9108, + "step": 7844 + }, + { + "epoch": 1.3967236467236468, + "grad_norm": 0.6432043313980103, + "learning_rate": 0.00014579022595342586, + "loss": 0.8674, + "step": 7845 + }, + { + "epoch": 1.3969017094017093, + "grad_norm": 0.7775412797927856, + "learning_rate": 0.00014577778171653648, + "loss": 1.0637, + "step": 7846 + }, + { + "epoch": 1.397079772079772, + "grad_norm": 0.6748763918876648, + "learning_rate": 0.00014576533658272655, + "loss": 1.0356, + "step": 7847 + }, + { + "epoch": 1.3972578347578348, + "grad_norm": 0.6940401196479797, + "learning_rate": 0.00014575289055223994, + "loss": 0.9937, + "step": 7848 + }, + { + "epoch": 1.3974358974358974, + "grad_norm": 
0.6971304416656494, + "learning_rate": 0.00014574044362532045, + "loss": 0.9753, + "step": 7849 + }, + { + "epoch": 1.39761396011396, + "grad_norm": 0.6576017141342163, + "learning_rate": 0.00014572799580221197, + "loss": 1.1233, + "step": 7850 + }, + { + "epoch": 1.3977920227920229, + "grad_norm": 0.6270702481269836, + "learning_rate": 0.00014571554708315843, + "loss": 0.9771, + "step": 7851 + }, + { + "epoch": 1.3979700854700854, + "grad_norm": 0.6898425817489624, + "learning_rate": 0.00014570309746840372, + "loss": 0.9235, + "step": 7852 + }, + { + "epoch": 1.3981481481481481, + "grad_norm": 0.7017102241516113, + "learning_rate": 0.00014569064695819174, + "loss": 1.1056, + "step": 7853 + }, + { + "epoch": 1.398326210826211, + "grad_norm": 0.6298288702964783, + "learning_rate": 0.00014567819555276647, + "loss": 0.8635, + "step": 7854 + }, + { + "epoch": 1.3985042735042734, + "grad_norm": 0.7173134684562683, + "learning_rate": 0.00014566574325237182, + "loss": 1.0893, + "step": 7855 + }, + { + "epoch": 1.3986823361823362, + "grad_norm": 0.7541036605834961, + "learning_rate": 0.0001456532900572518, + "loss": 1.0996, + "step": 7856 + }, + { + "epoch": 1.398860398860399, + "grad_norm": 0.6204771399497986, + "learning_rate": 0.0001456408359676504, + "loss": 0.7601, + "step": 7857 + }, + { + "epoch": 1.3990384615384617, + "grad_norm": 0.629557192325592, + "learning_rate": 0.00014562838098381163, + "loss": 0.9239, + "step": 7858 + }, + { + "epoch": 1.3992165242165242, + "grad_norm": 0.6878390908241272, + "learning_rate": 0.00014561592510597954, + "loss": 0.9641, + "step": 7859 + }, + { + "epoch": 1.399394586894587, + "grad_norm": 0.7490049004554749, + "learning_rate": 0.00014560346833439813, + "loss": 1.0198, + "step": 7860 + }, + { + "epoch": 1.3995726495726495, + "grad_norm": 0.6337960958480835, + "learning_rate": 0.0001455910106693115, + "loss": 0.8709, + "step": 7861 + }, + { + "epoch": 1.3997507122507122, + "grad_norm": 0.6210524439811707, + "learning_rate": 
0.0001455785521109637, + "loss": 1.1049, + "step": 7862 + }, + { + "epoch": 1.399928774928775, + "grad_norm": 0.7894936203956604, + "learning_rate": 0.00014556609265959887, + "loss": 0.8933, + "step": 7863 + }, + { + "epoch": 1.4001068376068377, + "grad_norm": 0.6888098120689392, + "learning_rate": 0.00014555363231546112, + "loss": 0.9738, + "step": 7864 + }, + { + "epoch": 1.4002849002849003, + "grad_norm": 0.608799934387207, + "learning_rate": 0.00014554117107879456, + "loss": 0.9103, + "step": 7865 + }, + { + "epoch": 1.400462962962963, + "grad_norm": 0.7390474081039429, + "learning_rate": 0.00014552870894984335, + "loss": 1.2484, + "step": 7866 + }, + { + "epoch": 1.4006410256410255, + "grad_norm": 0.6513381600379944, + "learning_rate": 0.00014551624592885169, + "loss": 0.8523, + "step": 7867 + }, + { + "epoch": 1.4008190883190883, + "grad_norm": 0.6357464790344238, + "learning_rate": 0.00014550378201606373, + "loss": 0.9594, + "step": 7868 + }, + { + "epoch": 1.400997150997151, + "grad_norm": 0.6893286108970642, + "learning_rate": 0.0001454913172117237, + "loss": 0.9798, + "step": 7869 + }, + { + "epoch": 1.4011752136752138, + "grad_norm": 0.6566550731658936, + "learning_rate": 0.0001454788515160758, + "loss": 1.0532, + "step": 7870 + }, + { + "epoch": 1.4013532763532763, + "grad_norm": 0.6442158222198486, + "learning_rate": 0.00014546638492936425, + "loss": 1.0789, + "step": 7871 + }, + { + "epoch": 1.401531339031339, + "grad_norm": 0.7570971846580505, + "learning_rate": 0.0001454539174518334, + "loss": 0.9806, + "step": 7872 + }, + { + "epoch": 1.4017094017094016, + "grad_norm": 0.6180047392845154, + "learning_rate": 0.0001454414490837274, + "loss": 0.857, + "step": 7873 + }, + { + "epoch": 1.4018874643874644, + "grad_norm": 0.7143170237541199, + "learning_rate": 0.0001454289798252906, + "loss": 0.8815, + "step": 7874 + }, + { + "epoch": 1.4020655270655271, + "grad_norm": 0.6388922929763794, + "learning_rate": 0.00014541650967676736, + "loss": 0.95, + 
"step": 7875 + }, + { + "epoch": 1.4022435897435899, + "grad_norm": 0.7137351632118225, + "learning_rate": 0.00014540403863840193, + "loss": 0.8973, + "step": 7876 + }, + { + "epoch": 1.4024216524216524, + "grad_norm": 0.656315267086029, + "learning_rate": 0.0001453915667104387, + "loss": 1.149, + "step": 7877 + }, + { + "epoch": 1.4025997150997151, + "grad_norm": 0.7234711647033691, + "learning_rate": 0.000145379093893122, + "loss": 0.9798, + "step": 7878 + }, + { + "epoch": 1.4027777777777777, + "grad_norm": 0.6595289707183838, + "learning_rate": 0.00014536662018669623, + "loss": 1.2704, + "step": 7879 + }, + { + "epoch": 1.4029558404558404, + "grad_norm": 0.6760551333427429, + "learning_rate": 0.00014535414559140576, + "loss": 0.8672, + "step": 7880 + }, + { + "epoch": 1.4031339031339032, + "grad_norm": 0.5916706919670105, + "learning_rate": 0.000145341670107495, + "loss": 0.888, + "step": 7881 + }, + { + "epoch": 1.403311965811966, + "grad_norm": 0.7272133231163025, + "learning_rate": 0.00014532919373520846, + "loss": 1.0466, + "step": 7882 + }, + { + "epoch": 1.4034900284900285, + "grad_norm": 0.8512467741966248, + "learning_rate": 0.00014531671647479048, + "loss": 1.2482, + "step": 7883 + }, + { + "epoch": 1.4036680911680912, + "grad_norm": 0.5536492466926575, + "learning_rate": 0.0001453042383264856, + "loss": 0.7823, + "step": 7884 + }, + { + "epoch": 1.4038461538461537, + "grad_norm": 0.7262215614318848, + "learning_rate": 0.0001452917592905383, + "loss": 0.9713, + "step": 7885 + }, + { + "epoch": 1.4040242165242165, + "grad_norm": 0.7146059274673462, + "learning_rate": 0.00014527927936719304, + "loss": 1.1064, + "step": 7886 + }, + { + "epoch": 1.4042022792022792, + "grad_norm": 0.5915318131446838, + "learning_rate": 0.00014526679855669436, + "loss": 0.8567, + "step": 7887 + }, + { + "epoch": 1.404380341880342, + "grad_norm": 0.6548298001289368, + "learning_rate": 0.00014525431685928682, + "loss": 1.1359, + "step": 7888 + }, + { + "epoch": 
1.4045584045584045, + "grad_norm": 0.7482563853263855, + "learning_rate": 0.0001452418342752149, + "loss": 0.9095, + "step": 7889 + }, + { + "epoch": 1.4047364672364673, + "grad_norm": 0.6660130023956299, + "learning_rate": 0.0001452293508047233, + "loss": 1.2343, + "step": 7890 + }, + { + "epoch": 1.4049145299145298, + "grad_norm": 0.7457148432731628, + "learning_rate": 0.00014521686644805644, + "loss": 1.2086, + "step": 7891 + }, + { + "epoch": 1.4050925925925926, + "grad_norm": 0.5957929491996765, + "learning_rate": 0.00014520438120545906, + "loss": 0.9724, + "step": 7892 + }, + { + "epoch": 1.4052706552706553, + "grad_norm": 0.6832270622253418, + "learning_rate": 0.00014519189507717573, + "loss": 0.9903, + "step": 7893 + }, + { + "epoch": 1.405448717948718, + "grad_norm": 0.6202489733695984, + "learning_rate": 0.00014517940806345109, + "loss": 0.962, + "step": 7894 + }, + { + "epoch": 1.4056267806267806, + "grad_norm": 0.6419472694396973, + "learning_rate": 0.0001451669201645298, + "loss": 0.8147, + "step": 7895 + }, + { + "epoch": 1.4058048433048433, + "grad_norm": 0.61143958568573, + "learning_rate": 0.00014515443138065652, + "loss": 0.8674, + "step": 7896 + }, + { + "epoch": 1.4059829059829059, + "grad_norm": 0.7527356743812561, + "learning_rate": 0.00014514194171207597, + "loss": 1.0581, + "step": 7897 + }, + { + "epoch": 1.4061609686609686, + "grad_norm": 0.7195194363594055, + "learning_rate": 0.00014512945115903285, + "loss": 1.0268, + "step": 7898 + }, + { + "epoch": 1.4063390313390314, + "grad_norm": 0.7919661998748779, + "learning_rate": 0.00014511695972177187, + "loss": 1.0259, + "step": 7899 + }, + { + "epoch": 1.4065170940170941, + "grad_norm": 0.6774758696556091, + "learning_rate": 0.00014510446740053783, + "loss": 1.1214, + "step": 7900 + }, + { + "epoch": 1.4066951566951567, + "grad_norm": 0.6102406978607178, + "learning_rate": 0.0001450919741955754, + "loss": 1.1846, + "step": 7901 + }, + { + "epoch": 1.4068732193732194, + "grad_norm": 
0.7189443707466125, + "learning_rate": 0.00014507948010712942, + "loss": 0.7758, + "step": 7902 + }, + { + "epoch": 1.407051282051282, + "grad_norm": 0.654153048992157, + "learning_rate": 0.00014506698513544467, + "loss": 0.899, + "step": 7903 + }, + { + "epoch": 1.4072293447293447, + "grad_norm": 0.637934684753418, + "learning_rate": 0.00014505448928076598, + "loss": 0.8301, + "step": 7904 + }, + { + "epoch": 1.4074074074074074, + "grad_norm": 0.7504615783691406, + "learning_rate": 0.00014504199254333812, + "loss": 0.9883, + "step": 7905 + }, + { + "epoch": 1.4075854700854702, + "grad_norm": 0.7902522683143616, + "learning_rate": 0.00014502949492340602, + "loss": 0.9615, + "step": 7906 + }, + { + "epoch": 1.4077635327635327, + "grad_norm": 0.5832732319831848, + "learning_rate": 0.0001450169964212145, + "loss": 0.7136, + "step": 7907 + }, + { + "epoch": 1.4079415954415955, + "grad_norm": 0.6025400757789612, + "learning_rate": 0.00014500449703700846, + "loss": 0.8812, + "step": 7908 + }, + { + "epoch": 1.408119658119658, + "grad_norm": 0.6412411332130432, + "learning_rate": 0.0001449919967710328, + "loss": 0.9346, + "step": 7909 + }, + { + "epoch": 1.4082977207977208, + "grad_norm": 0.7546970844268799, + "learning_rate": 0.00014497949562353242, + "loss": 1.0794, + "step": 7910 + }, + { + "epoch": 1.4084757834757835, + "grad_norm": 0.6175593733787537, + "learning_rate": 0.00014496699359475222, + "loss": 0.8939, + "step": 7911 + }, + { + "epoch": 1.4086538461538463, + "grad_norm": 0.6571716666221619, + "learning_rate": 0.00014495449068493722, + "loss": 1.1003, + "step": 7912 + }, + { + "epoch": 1.4088319088319088, + "grad_norm": 0.7038990259170532, + "learning_rate": 0.00014494198689433236, + "loss": 0.8844, + "step": 7913 + }, + { + "epoch": 1.4090099715099715, + "grad_norm": 0.7007337212562561, + "learning_rate": 0.00014492948222318263, + "loss": 1.2038, + "step": 7914 + }, + { + "epoch": 1.409188034188034, + "grad_norm": 0.7318591475486755, + "learning_rate": 
0.00014491697667173302, + "loss": 1.0388, + "step": 7915 + }, + { + "epoch": 1.4093660968660968, + "grad_norm": 0.7010329961776733, + "learning_rate": 0.00014490447024022855, + "loss": 1.1485, + "step": 7916 + }, + { + "epoch": 1.4095441595441596, + "grad_norm": 0.7844831347465515, + "learning_rate": 0.0001448919629289143, + "loss": 1.1417, + "step": 7917 + }, + { + "epoch": 1.4097222222222223, + "grad_norm": 0.6953392624855042, + "learning_rate": 0.00014487945473803525, + "loss": 0.9546, + "step": 7918 + }, + { + "epoch": 1.4099002849002849, + "grad_norm": 0.6307587623596191, + "learning_rate": 0.00014486694566783655, + "loss": 0.9912, + "step": 7919 + }, + { + "epoch": 1.4100783475783476, + "grad_norm": 0.6200215816497803, + "learning_rate": 0.00014485443571856326, + "loss": 1.0998, + "step": 7920 + }, + { + "epoch": 1.4102564102564101, + "grad_norm": 0.7096502184867859, + "learning_rate": 0.00014484192489046043, + "loss": 0.9587, + "step": 7921 + }, + { + "epoch": 1.4104344729344729, + "grad_norm": 0.6965526342391968, + "learning_rate": 0.00014482941318377327, + "loss": 0.8791, + "step": 7922 + }, + { + "epoch": 1.4106125356125356, + "grad_norm": 0.7303466200828552, + "learning_rate": 0.00014481690059874687, + "loss": 1.084, + "step": 7923 + }, + { + "epoch": 1.4107905982905984, + "grad_norm": 0.6144066452980042, + "learning_rate": 0.00014480438713562638, + "loss": 0.9646, + "step": 7924 + }, + { + "epoch": 1.410968660968661, + "grad_norm": 0.645222008228302, + "learning_rate": 0.00014479187279465704, + "loss": 0.728, + "step": 7925 + }, + { + "epoch": 1.4111467236467237, + "grad_norm": 0.6069912314414978, + "learning_rate": 0.000144779357576084, + "loss": 0.842, + "step": 7926 + }, + { + "epoch": 1.4113247863247862, + "grad_norm": 0.6212135553359985, + "learning_rate": 0.00014476684148015243, + "loss": 0.9817, + "step": 7927 + }, + { + "epoch": 1.411502849002849, + "grad_norm": 0.6893343329429626, + "learning_rate": 0.00014475432450710763, + "loss": 1.0265, + 
"step": 7928 + }, + { + "epoch": 1.4116809116809117, + "grad_norm": 0.6842793822288513, + "learning_rate": 0.00014474180665719478, + "loss": 1.0593, + "step": 7929 + }, + { + "epoch": 1.4118589743589745, + "grad_norm": 0.74690842628479, + "learning_rate": 0.0001447292879306592, + "loss": 0.9096, + "step": 7930 + }, + { + "epoch": 1.412037037037037, + "grad_norm": 0.6624761819839478, + "learning_rate": 0.00014471676832774613, + "loss": 1.2244, + "step": 7931 + }, + { + "epoch": 1.4122150997150997, + "grad_norm": 0.6205778121948242, + "learning_rate": 0.00014470424784870088, + "loss": 1.1, + "step": 7932 + }, + { + "epoch": 1.4123931623931623, + "grad_norm": 0.7592337131500244, + "learning_rate": 0.00014469172649376875, + "loss": 0.963, + "step": 7933 + }, + { + "epoch": 1.412571225071225, + "grad_norm": 0.673328697681427, + "learning_rate": 0.00014467920426319508, + "loss": 0.8923, + "step": 7934 + }, + { + "epoch": 1.4127492877492878, + "grad_norm": 0.6064394116401672, + "learning_rate": 0.00014466668115722522, + "loss": 0.9679, + "step": 7935 + }, + { + "epoch": 1.4129273504273505, + "grad_norm": 0.7738677859306335, + "learning_rate": 0.00014465415717610454, + "loss": 1.0678, + "step": 7936 + }, + { + "epoch": 1.413105413105413, + "grad_norm": 0.7013397812843323, + "learning_rate": 0.00014464163232007836, + "loss": 0.9017, + "step": 7937 + }, + { + "epoch": 1.4132834757834758, + "grad_norm": 0.713291347026825, + "learning_rate": 0.0001446291065893922, + "loss": 1.1953, + "step": 7938 + }, + { + "epoch": 1.4134615384615383, + "grad_norm": 0.7538655996322632, + "learning_rate": 0.00014461657998429136, + "loss": 1.0571, + "step": 7939 + }, + { + "epoch": 1.413639601139601, + "grad_norm": 0.6358973383903503, + "learning_rate": 0.00014460405250502133, + "loss": 0.8552, + "step": 7940 + }, + { + "epoch": 1.4138176638176638, + "grad_norm": 0.67508864402771, + "learning_rate": 0.00014459152415182756, + "loss": 1.0293, + "step": 7941 + }, + { + "epoch": 1.4139957264957266, 
+ "grad_norm": 0.7074598670005798, + "learning_rate": 0.00014457899492495546, + "loss": 1.2102, + "step": 7942 + }, + { + "epoch": 1.414173789173789, + "grad_norm": 0.7157037854194641, + "learning_rate": 0.00014456646482465058, + "loss": 1.0566, + "step": 7943 + }, + { + "epoch": 1.4143518518518519, + "grad_norm": 0.7918477058410645, + "learning_rate": 0.00014455393385115844, + "loss": 1.3727, + "step": 7944 + }, + { + "epoch": 1.4145299145299146, + "grad_norm": 0.569144606590271, + "learning_rate": 0.0001445414020047245, + "loss": 0.7251, + "step": 7945 + }, + { + "epoch": 1.4147079772079771, + "grad_norm": 0.7589054107666016, + "learning_rate": 0.0001445288692855943, + "loss": 1.0155, + "step": 7946 + }, + { + "epoch": 1.41488603988604, + "grad_norm": 0.7531685829162598, + "learning_rate": 0.0001445163356940134, + "loss": 0.8404, + "step": 7947 + }, + { + "epoch": 1.4150641025641026, + "grad_norm": 0.5730917453765869, + "learning_rate": 0.0001445038012302274, + "loss": 0.8215, + "step": 7948 + }, + { + "epoch": 1.4152421652421652, + "grad_norm": 0.6960710883140564, + "learning_rate": 0.00014449126589448187, + "loss": 0.7902, + "step": 7949 + }, + { + "epoch": 1.415420227920228, + "grad_norm": 0.8207054138183594, + "learning_rate": 0.0001444787296870224, + "loss": 1.493, + "step": 7950 + }, + { + "epoch": 1.4155982905982907, + "grad_norm": 0.5854668617248535, + "learning_rate": 0.00014446619260809462, + "loss": 0.9262, + "step": 7951 + }, + { + "epoch": 1.4157763532763532, + "grad_norm": 0.5458414554595947, + "learning_rate": 0.00014445365465794413, + "loss": 0.8431, + "step": 7952 + }, + { + "epoch": 1.415954415954416, + "grad_norm": 0.6880569458007812, + "learning_rate": 0.00014444111583681666, + "loss": 1.0184, + "step": 7953 + }, + { + "epoch": 1.4161324786324787, + "grad_norm": 0.6391083598136902, + "learning_rate": 0.00014442857614495783, + "loss": 0.88, + "step": 7954 + }, + { + "epoch": 1.4163105413105412, + "grad_norm": 0.6246135234832764, + 
"learning_rate": 0.00014441603558261335, + "loss": 0.776, + "step": 7955 + }, + { + "epoch": 1.416488603988604, + "grad_norm": 0.6263493895530701, + "learning_rate": 0.00014440349415002893, + "loss": 0.9069, + "step": 7956 + }, + { + "epoch": 1.4166666666666667, + "grad_norm": 0.7123475670814514, + "learning_rate": 0.00014439095184745024, + "loss": 0.8339, + "step": 7957 + }, + { + "epoch": 1.4168447293447293, + "grad_norm": 0.7171050906181335, + "learning_rate": 0.00014437840867512309, + "loss": 1.0633, + "step": 7958 + }, + { + "epoch": 1.417022792022792, + "grad_norm": 0.7097769975662231, + "learning_rate": 0.00014436586463329322, + "loss": 1.0852, + "step": 7959 + }, + { + "epoch": 1.4172008547008548, + "grad_norm": 0.6889223456382751, + "learning_rate": 0.00014435331972220637, + "loss": 0.916, + "step": 7960 + }, + { + "epoch": 1.4173789173789173, + "grad_norm": 0.6674435138702393, + "learning_rate": 0.0001443407739421084, + "loss": 0.9307, + "step": 7961 + }, + { + "epoch": 1.41755698005698, + "grad_norm": 0.6578894853591919, + "learning_rate": 0.00014432822729324503, + "loss": 0.8767, + "step": 7962 + }, + { + "epoch": 1.4177350427350428, + "grad_norm": 0.7145379781723022, + "learning_rate": 0.00014431567977586212, + "loss": 0.9962, + "step": 7963 + }, + { + "epoch": 1.4179131054131053, + "grad_norm": 0.6916680335998535, + "learning_rate": 0.00014430313139020555, + "loss": 1.0464, + "step": 7964 + }, + { + "epoch": 1.418091168091168, + "grad_norm": 0.6296181678771973, + "learning_rate": 0.00014429058213652116, + "loss": 1.0699, + "step": 7965 + }, + { + "epoch": 1.4182692307692308, + "grad_norm": 0.5640227198600769, + "learning_rate": 0.00014427803201505482, + "loss": 0.7006, + "step": 7966 + }, + { + "epoch": 1.4184472934472934, + "grad_norm": 0.7181212306022644, + "learning_rate": 0.0001442654810260524, + "loss": 1.1648, + "step": 7967 + }, + { + "epoch": 1.4186253561253561, + "grad_norm": 0.6830772757530212, + "learning_rate": 0.00014425292916975984, + 
"loss": 1.0641, + "step": 7968 + }, + { + "epoch": 1.4188034188034189, + "grad_norm": 0.665716290473938, + "learning_rate": 0.00014424037644642307, + "loss": 0.8769, + "step": 7969 + }, + { + "epoch": 1.4189814814814814, + "grad_norm": 0.8088666796684265, + "learning_rate": 0.00014422782285628802, + "loss": 1.1496, + "step": 7970 + }, + { + "epoch": 1.4191595441595442, + "grad_norm": 0.7186072468757629, + "learning_rate": 0.00014421526839960064, + "loss": 0.7421, + "step": 7971 + }, + { + "epoch": 1.419337606837607, + "grad_norm": 0.6405926942825317, + "learning_rate": 0.00014420271307660694, + "loss": 1.0139, + "step": 7972 + }, + { + "epoch": 1.4195156695156697, + "grad_norm": 0.7097104787826538, + "learning_rate": 0.0001441901568875529, + "loss": 1.1582, + "step": 7973 + }, + { + "epoch": 1.4196937321937322, + "grad_norm": 0.7347947359085083, + "learning_rate": 0.00014417759983268452, + "loss": 0.9751, + "step": 7974 + }, + { + "epoch": 1.419871794871795, + "grad_norm": 0.6999621987342834, + "learning_rate": 0.00014416504191224787, + "loss": 0.9419, + "step": 7975 + }, + { + "epoch": 1.4200498575498575, + "grad_norm": 0.6500616073608398, + "learning_rate": 0.00014415248312648897, + "loss": 0.9407, + "step": 7976 + }, + { + "epoch": 1.4202279202279202, + "grad_norm": 0.6368781328201294, + "learning_rate": 0.00014413992347565383, + "loss": 1.1224, + "step": 7977 + }, + { + "epoch": 1.420405982905983, + "grad_norm": 0.6422648429870605, + "learning_rate": 0.00014412736295998864, + "loss": 0.9573, + "step": 7978 + }, + { + "epoch": 1.4205840455840457, + "grad_norm": 0.744057297706604, + "learning_rate": 0.00014411480157973942, + "loss": 1.1384, + "step": 7979 + }, + { + "epoch": 1.4207621082621082, + "grad_norm": 0.5905839204788208, + "learning_rate": 0.00014410223933515232, + "loss": 0.8212, + "step": 7980 + }, + { + "epoch": 1.420940170940171, + "grad_norm": 0.5905438661575317, + "learning_rate": 0.0001440896762264734, + "loss": 0.8281, + "step": 7981 + }, + { + 
"epoch": 1.4211182336182335, + "grad_norm": 0.7087140679359436, + "learning_rate": 0.00014407711225394892, + "loss": 1.0165, + "step": 7982 + }, + { + "epoch": 1.4212962962962963, + "grad_norm": 0.6173902153968811, + "learning_rate": 0.00014406454741782495, + "loss": 0.8823, + "step": 7983 + }, + { + "epoch": 1.421474358974359, + "grad_norm": 0.6649761199951172, + "learning_rate": 0.00014405198171834772, + "loss": 0.9489, + "step": 7984 + }, + { + "epoch": 1.4216524216524218, + "grad_norm": 0.619286835193634, + "learning_rate": 0.00014403941515576344, + "loss": 0.8149, + "step": 7985 + }, + { + "epoch": 1.4218304843304843, + "grad_norm": 0.6358469724655151, + "learning_rate": 0.0001440268477303183, + "loss": 1.0558, + "step": 7986 + }, + { + "epoch": 1.422008547008547, + "grad_norm": 0.7239769697189331, + "learning_rate": 0.0001440142794422585, + "loss": 1.0528, + "step": 7987 + }, + { + "epoch": 1.4221866096866096, + "grad_norm": 0.681168794631958, + "learning_rate": 0.00014400171029183036, + "loss": 1.0867, + "step": 7988 + }, + { + "epoch": 1.4223646723646723, + "grad_norm": 0.6741157174110413, + "learning_rate": 0.0001439891402792801, + "loss": 0.9153, + "step": 7989 + }, + { + "epoch": 1.422542735042735, + "grad_norm": 0.5881659984588623, + "learning_rate": 0.00014397656940485403, + "loss": 0.92, + "step": 7990 + }, + { + "epoch": 1.4227207977207978, + "grad_norm": 0.637093722820282, + "learning_rate": 0.00014396399766879842, + "loss": 0.921, + "step": 7991 + }, + { + "epoch": 1.4228988603988604, + "grad_norm": 0.7760605216026306, + "learning_rate": 0.0001439514250713596, + "loss": 1.1451, + "step": 7992 + }, + { + "epoch": 1.4230769230769231, + "grad_norm": 0.6619600653648376, + "learning_rate": 0.00014393885161278393, + "loss": 1.0365, + "step": 7993 + }, + { + "epoch": 1.4232549857549857, + "grad_norm": 0.5354374051094055, + "learning_rate": 0.0001439262772933177, + "loss": 0.8718, + "step": 7994 + }, + { + "epoch": 1.4234330484330484, + "grad_norm": 
0.7063560485839844, + "learning_rate": 0.00014391370211320735, + "loss": 0.8258, + "step": 7995 + }, + { + "epoch": 1.4236111111111112, + "grad_norm": 0.6876368522644043, + "learning_rate": 0.00014390112607269923, + "loss": 0.9579, + "step": 7996 + }, + { + "epoch": 1.423789173789174, + "grad_norm": 0.6976612210273743, + "learning_rate": 0.00014388854917203974, + "loss": 1.0376, + "step": 7997 + }, + { + "epoch": 1.4239672364672364, + "grad_norm": 0.6157355308532715, + "learning_rate": 0.00014387597141147525, + "loss": 0.8743, + "step": 7998 + }, + { + "epoch": 1.4241452991452992, + "grad_norm": 0.7273156046867371, + "learning_rate": 0.0001438633927912523, + "loss": 1.101, + "step": 7999 + }, + { + "epoch": 1.4243233618233617, + "grad_norm": 0.918380618095398, + "learning_rate": 0.0001438508133116173, + "loss": 0.9625, + "step": 8000 + }, + { + "epoch": 1.4245014245014245, + "grad_norm": 0.626040518283844, + "learning_rate": 0.00014383823297281666, + "loss": 0.9552, + "step": 8001 + }, + { + "epoch": 1.4246794871794872, + "grad_norm": 0.7320386171340942, + "learning_rate": 0.00014382565177509693, + "loss": 1.0719, + "step": 8002 + }, + { + "epoch": 1.42485754985755, + "grad_norm": 0.7283148169517517, + "learning_rate": 0.0001438130697187046, + "loss": 1.0455, + "step": 8003 + }, + { + "epoch": 1.4250356125356125, + "grad_norm": 0.6614177823066711, + "learning_rate": 0.00014380048680388613, + "loss": 0.9876, + "step": 8004 + }, + { + "epoch": 1.4252136752136753, + "grad_norm": 0.6726453900337219, + "learning_rate": 0.00014378790303088817, + "loss": 0.9861, + "step": 8005 + }, + { + "epoch": 1.4253917378917378, + "grad_norm": 0.7968725562095642, + "learning_rate": 0.00014377531839995718, + "loss": 1.1662, + "step": 8006 + }, + { + "epoch": 1.4255698005698005, + "grad_norm": 0.6510586738586426, + "learning_rate": 0.0001437627329113398, + "loss": 0.9452, + "step": 8007 + }, + { + "epoch": 1.4257478632478633, + "grad_norm": 0.6933155655860901, + "learning_rate": 
0.00014375014656528253, + "loss": 1.0149, + "step": 8008 + }, + { + "epoch": 1.425925925925926, + "grad_norm": 0.7141832113265991, + "learning_rate": 0.00014373755936203204, + "loss": 1.0667, + "step": 8009 + }, + { + "epoch": 1.4261039886039886, + "grad_norm": 0.6352181434631348, + "learning_rate": 0.00014372497130183494, + "loss": 0.8652, + "step": 8010 + }, + { + "epoch": 1.4262820512820513, + "grad_norm": 0.7494860291481018, + "learning_rate": 0.00014371238238493786, + "loss": 0.9592, + "step": 8011 + }, + { + "epoch": 1.4264601139601139, + "grad_norm": 0.610556423664093, + "learning_rate": 0.00014369979261158746, + "loss": 0.7015, + "step": 8012 + }, + { + "epoch": 1.4266381766381766, + "grad_norm": 0.7305756211280823, + "learning_rate": 0.00014368720198203037, + "loss": 0.9681, + "step": 8013 + }, + { + "epoch": 1.4268162393162394, + "grad_norm": 0.6964020133018494, + "learning_rate": 0.0001436746104965133, + "loss": 1.1166, + "step": 8014 + }, + { + "epoch": 1.426994301994302, + "grad_norm": 0.7449237108230591, + "learning_rate": 0.00014366201815528302, + "loss": 1.1331, + "step": 8015 + }, + { + "epoch": 1.4271723646723646, + "grad_norm": 0.625834047794342, + "learning_rate": 0.00014364942495858615, + "loss": 0.8796, + "step": 8016 + }, + { + "epoch": 1.4273504273504274, + "grad_norm": 0.664559006690979, + "learning_rate": 0.0001436368309066695, + "loss": 1.0263, + "step": 8017 + }, + { + "epoch": 1.42752849002849, + "grad_norm": Infinity, + "learning_rate": 0.0001436368309066695, + "loss": 1.0731, + "step": 8018 + }, + { + "epoch": 1.4277065527065527, + "grad_norm": 0.6714464426040649, + "learning_rate": 0.00014362423599977977, + "loss": 0.9345, + "step": 8019 + }, + { + "epoch": 1.4278846153846154, + "grad_norm": 0.7595751285552979, + "learning_rate": 0.00014361164023816376, + "loss": 0.9646, + "step": 8020 + }, + { + "epoch": 1.4280626780626782, + "grad_norm": 0.6413954496383667, + "learning_rate": 0.00014359904362206828, + "loss": 1.0471, + "step": 8021 
+ }, + { + "epoch": 1.4282407407407407, + "grad_norm": 0.7298843264579773, + "learning_rate": 0.00014358644615174008, + "loss": 0.8932, + "step": 8022 + }, + { + "epoch": 1.4284188034188035, + "grad_norm": 0.8022156953811646, + "learning_rate": 0.00014357384782742602, + "loss": 1.0437, + "step": 8023 + }, + { + "epoch": 1.428596866096866, + "grad_norm": 0.7264443635940552, + "learning_rate": 0.00014356124864937296, + "loss": 0.9368, + "step": 8024 + }, + { + "epoch": 1.4287749287749287, + "grad_norm": 0.6819384098052979, + "learning_rate": 0.00014354864861782768, + "loss": 1.0, + "step": 8025 + }, + { + "epoch": 1.4289529914529915, + "grad_norm": 0.5945104956626892, + "learning_rate": 0.0001435360477330371, + "loss": 0.8108, + "step": 8026 + }, + { + "epoch": 1.4291310541310542, + "grad_norm": 0.6497398018836975, + "learning_rate": 0.0001435234459952481, + "loss": 0.8712, + "step": 8027 + }, + { + "epoch": 1.4293091168091168, + "grad_norm": 0.6424077749252319, + "learning_rate": 0.0001435108434047076, + "loss": 0.9172, + "step": 8028 + }, + { + "epoch": 1.4294871794871795, + "grad_norm": 0.6806963086128235, + "learning_rate": 0.00014349823996166253, + "loss": 1.1648, + "step": 8029 + }, + { + "epoch": 1.429665242165242, + "grad_norm": 0.6601083874702454, + "learning_rate": 0.00014348563566635977, + "loss": 0.9453, + "step": 8030 + }, + { + "epoch": 1.4298433048433048, + "grad_norm": 0.7024385929107666, + "learning_rate": 0.00014347303051904636, + "loss": 1.074, + "step": 8031 + }, + { + "epoch": 1.4300213675213675, + "grad_norm": 0.7094005942344666, + "learning_rate": 0.00014346042451996918, + "loss": 0.9976, + "step": 8032 + }, + { + "epoch": 1.4301994301994303, + "grad_norm": 0.6775936484336853, + "learning_rate": 0.0001434478176693753, + "loss": 0.9039, + "step": 8033 + }, + { + "epoch": 1.4303774928774928, + "grad_norm": 0.6920986771583557, + "learning_rate": 0.00014343520996751166, + "loss": 0.9122, + "step": 8034 + }, + { + "epoch": 1.4305555555555556, + 
"grad_norm": 0.720690906047821, + "learning_rate": 0.00014342260141462528, + "loss": 1.1028, + "step": 8035 + }, + { + "epoch": 1.430733618233618, + "grad_norm": 0.624546229839325, + "learning_rate": 0.00014340999201096328, + "loss": 0.9083, + "step": 8036 + }, + { + "epoch": 1.4309116809116809, + "grad_norm": 0.6560490727424622, + "learning_rate": 0.00014339738175677265, + "loss": 0.8029, + "step": 8037 + }, + { + "epoch": 1.4310897435897436, + "grad_norm": 0.8266100883483887, + "learning_rate": 0.00014338477065230047, + "loss": 0.9655, + "step": 8038 + }, + { + "epoch": 1.4312678062678064, + "grad_norm": 0.6593570113182068, + "learning_rate": 0.00014337215869779385, + "loss": 1.0299, + "step": 8039 + }, + { + "epoch": 1.431445868945869, + "grad_norm": 0.6321794390678406, + "learning_rate": 0.00014335954589349986, + "loss": 0.8755, + "step": 8040 + }, + { + "epoch": 1.4316239316239316, + "grad_norm": 0.7030870318412781, + "learning_rate": 0.00014334693223966562, + "loss": 1.1226, + "step": 8041 + }, + { + "epoch": 1.4318019943019942, + "grad_norm": 0.7794312238693237, + "learning_rate": 0.0001433343177365383, + "loss": 1.1252, + "step": 8042 + }, + { + "epoch": 1.431980056980057, + "grad_norm": 0.6115018129348755, + "learning_rate": 0.00014332170238436507, + "loss": 0.8753, + "step": 8043 + }, + { + "epoch": 1.4321581196581197, + "grad_norm": 0.8525674939155579, + "learning_rate": 0.00014330908618339304, + "loss": 0.9135, + "step": 8044 + }, + { + "epoch": 1.4323361823361824, + "grad_norm": 0.6869912147521973, + "learning_rate": 0.00014329646913386948, + "loss": 0.868, + "step": 8045 + }, + { + "epoch": 1.432514245014245, + "grad_norm": 0.5877542495727539, + "learning_rate": 0.0001432838512360415, + "loss": 0.9051, + "step": 8046 + }, + { + "epoch": 1.4326923076923077, + "grad_norm": 0.6609327793121338, + "learning_rate": 0.0001432712324901564, + "loss": 0.9084, + "step": 8047 + }, + { + "epoch": 1.4328703703703702, + "grad_norm": 0.6318345069885254, + 
"learning_rate": 0.0001432586128964614, + "loss": 0.8291, + "step": 8048 + }, + { + "epoch": 1.433048433048433, + "grad_norm": 0.6973567008972168, + "learning_rate": 0.0001432459924552037, + "loss": 0.97, + "step": 8049 + }, + { + "epoch": 1.4332264957264957, + "grad_norm": 0.6838201284408569, + "learning_rate": 0.00014323337116663062, + "loss": 1.0957, + "step": 8050 + }, + { + "epoch": 1.4334045584045585, + "grad_norm": 0.7472857236862183, + "learning_rate": 0.00014322074903098944, + "loss": 1.0981, + "step": 8051 + }, + { + "epoch": 1.433582621082621, + "grad_norm": 0.7723061442375183, + "learning_rate": 0.0001432081260485275, + "loss": 1.2231, + "step": 8052 + }, + { + "epoch": 1.4337606837606838, + "grad_norm": 0.681834876537323, + "learning_rate": 0.00014319550221949208, + "loss": 1.073, + "step": 8053 + }, + { + "epoch": 1.4339387464387463, + "grad_norm": 0.6566045880317688, + "learning_rate": 0.00014318287754413051, + "loss": 1.1298, + "step": 8054 + }, + { + "epoch": 1.434116809116809, + "grad_norm": 0.6792440414428711, + "learning_rate": 0.00014317025202269015, + "loss": 1.2224, + "step": 8055 + }, + { + "epoch": 1.4342948717948718, + "grad_norm": 0.7946709394454956, + "learning_rate": 0.00014315762565541838, + "loss": 1.0728, + "step": 8056 + }, + { + "epoch": 1.4344729344729346, + "grad_norm": 0.633466899394989, + "learning_rate": 0.00014314499844256262, + "loss": 0.944, + "step": 8057 + }, + { + "epoch": 1.434650997150997, + "grad_norm": 0.7308502197265625, + "learning_rate": 0.00014313237038437023, + "loss": 1.0684, + "step": 8058 + }, + { + "epoch": 1.4348290598290598, + "grad_norm": 0.6483737230300903, + "learning_rate": 0.00014311974148108862, + "loss": 1.0843, + "step": 8059 + }, + { + "epoch": 1.4350071225071226, + "grad_norm": 0.6301209926605225, + "learning_rate": 0.00014310711173296526, + "loss": 1.0083, + "step": 8060 + }, + { + "epoch": 1.4351851851851851, + "grad_norm": 0.6674302816390991, + "learning_rate": 0.00014309448114024757, + 
"loss": 0.9877, + "step": 8061 + }, + { + "epoch": 1.4353632478632479, + "grad_norm": 0.6888732314109802, + "learning_rate": 0.00014308184970318307, + "loss": 0.9937, + "step": 8062 + }, + { + "epoch": 1.4355413105413106, + "grad_norm": 0.6922950148582458, + "learning_rate": 0.00014306921742201923, + "loss": 1.0149, + "step": 8063 + }, + { + "epoch": 1.4357193732193732, + "grad_norm": 0.6050686240196228, + "learning_rate": 0.00014305658429700352, + "loss": 0.7882, + "step": 8064 + }, + { + "epoch": 1.435897435897436, + "grad_norm": 0.5080767869949341, + "learning_rate": 0.00014304395032838348, + "loss": 0.7796, + "step": 8065 + }, + { + "epoch": 1.4360754985754987, + "grad_norm": 0.6382707953453064, + "learning_rate": 0.00014303131551640668, + "loss": 0.965, + "step": 8066 + }, + { + "epoch": 1.4362535612535612, + "grad_norm": 0.7153477668762207, + "learning_rate": 0.00014301867986132063, + "loss": 1.1277, + "step": 8067 + }, + { + "epoch": 1.436431623931624, + "grad_norm": 0.6208404898643494, + "learning_rate": 0.00014300604336337292, + "loss": 0.8246, + "step": 8068 + }, + { + "epoch": 1.4366096866096867, + "grad_norm": 0.719695508480072, + "learning_rate": 0.0001429934060228111, + "loss": 0.7681, + "step": 8069 + }, + { + "epoch": 1.4367877492877492, + "grad_norm": 0.6219030618667603, + "learning_rate": 0.0001429807678398828, + "loss": 1.0425, + "step": 8070 + }, + { + "epoch": 1.436965811965812, + "grad_norm": 0.6080238819122314, + "learning_rate": 0.00014296812881483566, + "loss": 0.8762, + "step": 8071 + }, + { + "epoch": 1.4371438746438747, + "grad_norm": 0.6264194846153259, + "learning_rate": 0.00014295548894791729, + "loss": 1.087, + "step": 8072 + }, + { + "epoch": 1.4373219373219372, + "grad_norm": 0.6503600478172302, + "learning_rate": 0.00014294284823937535, + "loss": 1.0583, + "step": 8073 + }, + { + "epoch": 1.4375, + "grad_norm": 0.7623817324638367, + "learning_rate": 0.0001429302066894575, + "loss": 1.2372, + "step": 8074 + }, + { + "epoch": 
1.4376780626780628, + "grad_norm": 0.7020344138145447, + "learning_rate": 0.00014291756429841144, + "loss": 1.2163, + "step": 8075 + }, + { + "epoch": 1.4378561253561253, + "grad_norm": 0.7070338129997253, + "learning_rate": 0.00014290492106648484, + "loss": 0.986, + "step": 8076 + }, + { + "epoch": 1.438034188034188, + "grad_norm": 0.6407621502876282, + "learning_rate": 0.00014289227699392545, + "loss": 0.9329, + "step": 8077 + }, + { + "epoch": 1.4382122507122508, + "grad_norm": 0.6836710572242737, + "learning_rate": 0.00014287963208098098, + "loss": 0.9252, + "step": 8078 + }, + { + "epoch": 1.4383903133903133, + "grad_norm": 0.648642897605896, + "learning_rate": 0.00014286698632789922, + "loss": 1.0457, + "step": 8079 + }, + { + "epoch": 1.438568376068376, + "grad_norm": 0.7015881538391113, + "learning_rate": 0.0001428543397349279, + "loss": 1.0516, + "step": 8080 + }, + { + "epoch": 1.4387464387464388, + "grad_norm": 0.6031532883644104, + "learning_rate": 0.0001428416923023148, + "loss": 0.9423, + "step": 8081 + }, + { + "epoch": 1.4389245014245013, + "grad_norm": 0.8235578536987305, + "learning_rate": 0.00014282904403030772, + "loss": 1.3433, + "step": 8082 + }, + { + "epoch": 1.439102564102564, + "grad_norm": 0.7355761528015137, + "learning_rate": 0.00014281639491915452, + "loss": 1.0128, + "step": 8083 + }, + { + "epoch": 1.4392806267806268, + "grad_norm": 0.7429629564285278, + "learning_rate": 0.00014280374496910303, + "loss": 0.8546, + "step": 8084 + }, + { + "epoch": 1.4394586894586894, + "grad_norm": 0.5831776857376099, + "learning_rate": 0.00014279109418040105, + "loss": 0.9021, + "step": 8085 + }, + { + "epoch": 1.4396367521367521, + "grad_norm": 0.6585184931755066, + "learning_rate": 0.00014277844255329645, + "loss": 0.9256, + "step": 8086 + }, + { + "epoch": 1.4398148148148149, + "grad_norm": 0.6412501931190491, + "learning_rate": 0.00014276579008803717, + "loss": 0.9305, + "step": 8087 + }, + { + "epoch": 1.4399928774928774, + "grad_norm": 
0.6305423378944397, + "learning_rate": 0.00014275313678487102, + "loss": 0.9471, + "step": 8088 + }, + { + "epoch": 1.4401709401709402, + "grad_norm": 0.7160914540290833, + "learning_rate": 0.00014274048264404602, + "loss": 0.8798, + "step": 8089 + }, + { + "epoch": 1.440349002849003, + "grad_norm": 0.6740858554840088, + "learning_rate": 0.00014272782766581004, + "loss": 0.9022, + "step": 8090 + }, + { + "epoch": 1.4405270655270654, + "grad_norm": 0.7554821968078613, + "learning_rate": 0.000142715171850411, + "loss": 1.0924, + "step": 8091 + }, + { + "epoch": 1.4407051282051282, + "grad_norm": 0.7361162304878235, + "learning_rate": 0.00014270251519809694, + "loss": 0.9907, + "step": 8092 + }, + { + "epoch": 1.440883190883191, + "grad_norm": 0.731813371181488, + "learning_rate": 0.0001426898577091158, + "loss": 1.1765, + "step": 8093 + }, + { + "epoch": 1.4410612535612537, + "grad_norm": 0.6877756714820862, + "learning_rate": 0.00014267719938371558, + "loss": 1.0536, + "step": 8094 + }, + { + "epoch": 1.4412393162393162, + "grad_norm": 0.6724407076835632, + "learning_rate": 0.00014266454022214426, + "loss": 1.1895, + "step": 8095 + }, + { + "epoch": 1.441417378917379, + "grad_norm": 0.6946671605110168, + "learning_rate": 0.0001426518802246499, + "loss": 1.0437, + "step": 8096 + }, + { + "epoch": 1.4415954415954415, + "grad_norm": 0.7032839059829712, + "learning_rate": 0.00014263921939148058, + "loss": 1.1363, + "step": 8097 + }, + { + "epoch": 1.4417735042735043, + "grad_norm": 0.6942192316055298, + "learning_rate": 0.00014262655772288434, + "loss": 1.315, + "step": 8098 + }, + { + "epoch": 1.441951566951567, + "grad_norm": 0.7002301812171936, + "learning_rate": 0.00014261389521910922, + "loss": 1.0546, + "step": 8099 + }, + { + "epoch": 1.4421296296296298, + "grad_norm": 0.7260788083076477, + "learning_rate": 0.00014260123188040335, + "loss": 0.9374, + "step": 8100 + }, + { + "epoch": 1.4423076923076923, + "grad_norm": 0.6629201173782349, + "learning_rate": 
0.00014258856770701486, + "loss": 0.8632, + "step": 8101 + }, + { + "epoch": 1.442485754985755, + "grad_norm": 0.6570318937301636, + "learning_rate": 0.0001425759026991918, + "loss": 1.0102, + "step": 8102 + }, + { + "epoch": 1.4426638176638176, + "grad_norm": 0.7696560621261597, + "learning_rate": 0.00014256323685718242, + "loss": 0.9703, + "step": 8103 + }, + { + "epoch": 1.4428418803418803, + "grad_norm": 0.7206611633300781, + "learning_rate": 0.00014255057018123482, + "loss": 1.1728, + "step": 8104 + }, + { + "epoch": 1.443019943019943, + "grad_norm": 0.6871611475944519, + "learning_rate": 0.0001425379026715972, + "loss": 0.9377, + "step": 8105 + }, + { + "epoch": 1.4431980056980058, + "grad_norm": 0.6027442812919617, + "learning_rate": 0.00014252523432851775, + "loss": 0.9212, + "step": 8106 + }, + { + "epoch": 1.4433760683760684, + "grad_norm": 0.7149752378463745, + "learning_rate": 0.00014251256515224463, + "loss": 0.9654, + "step": 8107 + }, + { + "epoch": 1.443554131054131, + "grad_norm": 0.5949522256851196, + "learning_rate": 0.00014249989514302614, + "loss": 1.0646, + "step": 8108 + }, + { + "epoch": 1.4437321937321936, + "grad_norm": 0.7345452904701233, + "learning_rate": 0.0001424872243011105, + "loss": 0.9801, + "step": 8109 + }, + { + "epoch": 1.4439102564102564, + "grad_norm": 0.8045009970664978, + "learning_rate": 0.00014247455262674592, + "loss": 1.3529, + "step": 8110 + }, + { + "epoch": 1.4440883190883191, + "grad_norm": 0.6712123155593872, + "learning_rate": 0.00014246188012018073, + "loss": 1.0416, + "step": 8111 + }, + { + "epoch": 1.444266381766382, + "grad_norm": 0.7811154127120972, + "learning_rate": 0.00014244920678166322, + "loss": 1.2019, + "step": 8112 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.6834486126899719, + "learning_rate": 0.00014243653261144167, + "loss": 0.986, + "step": 8113 + }, + { + "epoch": 1.4446225071225072, + "grad_norm": 0.6901041269302368, + "learning_rate": 0.00014242385760976443, + "loss": 1.0988, + 
"step": 8114 + }, + { + "epoch": 1.4448005698005697, + "grad_norm": 0.6233634948730469, + "learning_rate": 0.00014241118177687982, + "loss": 0.7748, + "step": 8115 + }, + { + "epoch": 1.4449786324786325, + "grad_norm": 0.6899837851524353, + "learning_rate": 0.00014239850511303624, + "loss": 0.9734, + "step": 8116 + }, + { + "epoch": 1.4451566951566952, + "grad_norm": 0.6316244006156921, + "learning_rate": 0.00014238582761848197, + "loss": 0.7888, + "step": 8117 + }, + { + "epoch": 1.445334757834758, + "grad_norm": 0.6074259877204895, + "learning_rate": 0.00014237314929346545, + "loss": 0.8843, + "step": 8118 + }, + { + "epoch": 1.4455128205128205, + "grad_norm": 0.6112192273139954, + "learning_rate": 0.00014236047013823516, + "loss": 0.8529, + "step": 8119 + }, + { + "epoch": 1.4456908831908832, + "grad_norm": 0.6883894801139832, + "learning_rate": 0.0001423477901530394, + "loss": 0.9506, + "step": 8120 + }, + { + "epoch": 1.4458689458689458, + "grad_norm": 0.7248309254646301, + "learning_rate": 0.00014233510933812666, + "loss": 0.9573, + "step": 8121 + }, + { + "epoch": 1.4460470085470085, + "grad_norm": 0.6853367686271667, + "learning_rate": 0.00014232242769374542, + "loss": 0.9903, + "step": 8122 + }, + { + "epoch": 1.4462250712250713, + "grad_norm": 0.7179274559020996, + "learning_rate": 0.0001423097452201441, + "loss": 0.9157, + "step": 8123 + }, + { + "epoch": 1.446403133903134, + "grad_norm": 0.6704817414283752, + "learning_rate": 0.00014229706191757127, + "loss": 1.1361, + "step": 8124 + }, + { + "epoch": 1.4465811965811965, + "grad_norm": 0.6380739212036133, + "learning_rate": 0.00014228437778627533, + "loss": 0.9336, + "step": 8125 + }, + { + "epoch": 1.4467592592592593, + "grad_norm": 0.6275362372398376, + "learning_rate": 0.00014227169282650487, + "loss": 0.9617, + "step": 8126 + }, + { + "epoch": 1.4469373219373218, + "grad_norm": 0.5644828677177429, + "learning_rate": 0.00014225900703850836, + "loss": 0.7384, + "step": 8127 + }, + { + "epoch": 
1.4471153846153846, + "grad_norm": 0.6522284150123596, + "learning_rate": 0.00014224632042253443, + "loss": 1.1098, + "step": 8128 + }, + { + "epoch": 1.4472934472934473, + "grad_norm": 0.6228049993515015, + "learning_rate": 0.0001422336329788316, + "loss": 1.1061, + "step": 8129 + }, + { + "epoch": 1.44747150997151, + "grad_norm": 0.6092000603675842, + "learning_rate": 0.00014222094470764848, + "loss": 0.808, + "step": 8130 + }, + { + "epoch": 1.4476495726495726, + "grad_norm": 0.667435348033905, + "learning_rate": 0.00014220825560923363, + "loss": 1.1223, + "step": 8131 + }, + { + "epoch": 1.4478276353276354, + "grad_norm": 0.6080766320228577, + "learning_rate": 0.0001421955656838357, + "loss": 1.0099, + "step": 8132 + }, + { + "epoch": 1.448005698005698, + "grad_norm": 0.7597638368606567, + "learning_rate": 0.00014218287493170332, + "loss": 0.9718, + "step": 8133 + }, + { + "epoch": 1.4481837606837606, + "grad_norm": 0.574130654335022, + "learning_rate": 0.0001421701833530851, + "loss": 0.7745, + "step": 8134 + }, + { + "epoch": 1.4483618233618234, + "grad_norm": 0.6372822523117065, + "learning_rate": 0.0001421574909482298, + "loss": 1.0088, + "step": 8135 + }, + { + "epoch": 1.4485398860398861, + "grad_norm": 0.6759644746780396, + "learning_rate": 0.000142144797717386, + "loss": 0.9684, + "step": 8136 + }, + { + "epoch": 1.4487179487179487, + "grad_norm": 0.706351637840271, + "learning_rate": 0.00014213210366080244, + "loss": 1.021, + "step": 8137 + }, + { + "epoch": 1.4488960113960114, + "grad_norm": 0.6976894736289978, + "learning_rate": 0.0001421194087787278, + "loss": 1.1038, + "step": 8138 + }, + { + "epoch": 1.449074074074074, + "grad_norm": 0.7322551012039185, + "learning_rate": 0.00014210671307141092, + "loss": 1.0213, + "step": 8139 + }, + { + "epoch": 1.4492521367521367, + "grad_norm": 0.5885626077651978, + "learning_rate": 0.0001420940165391004, + "loss": 0.821, + "step": 8140 + }, + { + "epoch": 1.4494301994301995, + "grad_norm": 0.7009791135787964, 
+ "learning_rate": 0.0001420813191820451, + "loss": 0.8647, + "step": 8141 + }, + { + "epoch": 1.4496082621082622, + "grad_norm": 0.5715423822402954, + "learning_rate": 0.00014206862100049375, + "loss": 0.873, + "step": 8142 + }, + { + "epoch": 1.4497863247863247, + "grad_norm": 1.1452178955078125, + "learning_rate": 0.00014205592199469514, + "loss": 1.2523, + "step": 8143 + }, + { + "epoch": 1.4499643874643875, + "grad_norm": 0.8076814413070679, + "learning_rate": 0.00014204322216489814, + "loss": 1.1071, + "step": 8144 + }, + { + "epoch": 1.45014245014245, + "grad_norm": 0.7325751185417175, + "learning_rate": 0.00014203052151135154, + "loss": 0.9846, + "step": 8145 + }, + { + "epoch": 1.4503205128205128, + "grad_norm": 0.7009061574935913, + "learning_rate": 0.00014201782003430417, + "loss": 0.8153, + "step": 8146 + }, + { + "epoch": 1.4504985754985755, + "grad_norm": 0.6502353549003601, + "learning_rate": 0.0001420051177340049, + "loss": 0.8959, + "step": 8147 + }, + { + "epoch": 1.4506766381766383, + "grad_norm": 0.6134430170059204, + "learning_rate": 0.00014199241461070261, + "loss": 0.9683, + "step": 8148 + }, + { + "epoch": 1.4508547008547008, + "grad_norm": 0.720160722732544, + "learning_rate": 0.0001419797106646462, + "loss": 0.9579, + "step": 8149 + }, + { + "epoch": 1.4510327635327636, + "grad_norm": 0.6141422986984253, + "learning_rate": 0.00014196700589608454, + "loss": 0.9427, + "step": 8150 + }, + { + "epoch": 1.451210826210826, + "grad_norm": 0.6835139393806458, + "learning_rate": 0.00014195430030526656, + "loss": 1.0374, + "step": 8151 + }, + { + "epoch": 1.4513888888888888, + "grad_norm": 0.6829691529273987, + "learning_rate": 0.00014194159389244128, + "loss": 0.9418, + "step": 8152 + }, + { + "epoch": 1.4515669515669516, + "grad_norm": 0.7142195701599121, + "learning_rate": 0.00014192888665785755, + "loss": 1.1876, + "step": 8153 + }, + { + "epoch": 1.4517450142450143, + "grad_norm": 0.6719943284988403, + "learning_rate": 0.0001419161786017644, + 
"loss": 1.1417, + "step": 8154 + }, + { + "epoch": 1.4519230769230769, + "grad_norm": 0.6478939652442932, + "learning_rate": 0.0001419034697244108, + "loss": 0.943, + "step": 8155 + }, + { + "epoch": 1.4521011396011396, + "grad_norm": 0.6308888792991638, + "learning_rate": 0.00014189076002604575, + "loss": 0.9842, + "step": 8156 + }, + { + "epoch": 1.4522792022792022, + "grad_norm": 0.673559844493866, + "learning_rate": 0.00014187804950691827, + "loss": 0.8108, + "step": 8157 + }, + { + "epoch": 1.452457264957265, + "grad_norm": 0.5895359516143799, + "learning_rate": 0.00014186533816727744, + "loss": 0.8187, + "step": 8158 + }, + { + "epoch": 1.4526353276353277, + "grad_norm": 0.6703287363052368, + "learning_rate": 0.00014185262600737225, + "loss": 0.9012, + "step": 8159 + }, + { + "epoch": 1.4528133903133904, + "grad_norm": 0.697728157043457, + "learning_rate": 0.00014183991302745182, + "loss": 1.2572, + "step": 8160 + }, + { + "epoch": 1.452991452991453, + "grad_norm": 0.599371075630188, + "learning_rate": 0.00014182719922776514, + "loss": 1.078, + "step": 8161 + }, + { + "epoch": 1.4531695156695157, + "grad_norm": 0.6774863600730896, + "learning_rate": 0.00014181448460856143, + "loss": 1.0607, + "step": 8162 + }, + { + "epoch": 1.4533475783475782, + "grad_norm": 0.6872009038925171, + "learning_rate": 0.00014180176917008976, + "loss": 1.0713, + "step": 8163 + }, + { + "epoch": 1.453525641025641, + "grad_norm": 0.7949981093406677, + "learning_rate": 0.00014178905291259926, + "loss": 1.0471, + "step": 8164 + }, + { + "epoch": 1.4537037037037037, + "grad_norm": 0.6592127084732056, + "learning_rate": 0.00014177633583633908, + "loss": 0.8409, + "step": 8165 + }, + { + "epoch": 1.4538817663817665, + "grad_norm": 0.6745635867118835, + "learning_rate": 0.00014176361794155837, + "loss": 1.0859, + "step": 8166 + }, + { + "epoch": 1.454059829059829, + "grad_norm": 0.6661605834960938, + "learning_rate": 0.00014175089922850633, + "loss": 1.0587, + "step": 8167 + }, + { + 
"epoch": 1.4542378917378918, + "grad_norm": 0.6697571873664856, + "learning_rate": 0.00014173817969743212, + "loss": 0.8876, + "step": 8168 + }, + { + "epoch": 1.4544159544159543, + "grad_norm": 0.6162588000297546, + "learning_rate": 0.000141725459348585, + "loss": 0.9575, + "step": 8169 + }, + { + "epoch": 1.454594017094017, + "grad_norm": 0.6235088109970093, + "learning_rate": 0.00014171273818221422, + "loss": 0.9209, + "step": 8170 + }, + { + "epoch": 1.4547720797720798, + "grad_norm": 0.6744212508201599, + "learning_rate": 0.00014170001619856896, + "loss": 0.9704, + "step": 8171 + }, + { + "epoch": 1.4549501424501425, + "grad_norm": 0.6781345009803772, + "learning_rate": 0.0001416872933978985, + "loss": 1.1507, + "step": 8172 + }, + { + "epoch": 1.455128205128205, + "grad_norm": 0.7160060405731201, + "learning_rate": 0.0001416745697804521, + "loss": 1.2529, + "step": 8173 + }, + { + "epoch": 1.4553062678062678, + "grad_norm": 0.6742389798164368, + "learning_rate": 0.00014166184534647913, + "loss": 1.0168, + "step": 8174 + }, + { + "epoch": 1.4554843304843303, + "grad_norm": 0.6685828566551208, + "learning_rate": 0.0001416491200962288, + "loss": 1.0807, + "step": 8175 + }, + { + "epoch": 1.455662393162393, + "grad_norm": 0.6998327374458313, + "learning_rate": 0.0001416363940299505, + "loss": 1.1711, + "step": 8176 + }, + { + "epoch": 1.4558404558404558, + "grad_norm": 0.7132518291473389, + "learning_rate": 0.00014162366714789358, + "loss": 1.1392, + "step": 8177 + }, + { + "epoch": 1.4560185185185186, + "grad_norm": 0.6995887160301208, + "learning_rate": 0.0001416109394503073, + "loss": 1.3335, + "step": 8178 + }, + { + "epoch": 1.4561965811965811, + "grad_norm": 0.7161234021186829, + "learning_rate": 0.00014159821093744115, + "loss": 0.9725, + "step": 8179 + }, + { + "epoch": 1.4563746438746439, + "grad_norm": 0.7678874135017395, + "learning_rate": 0.00014158548160954446, + "loss": 1.1578, + "step": 8180 + }, + { + "epoch": 1.4565527065527066, + "grad_norm": 
0.67372065782547, + "learning_rate": 0.00014157275146686662, + "loss": 1.0867, + "step": 8181 + }, + { + "epoch": 1.4567307692307692, + "grad_norm": 0.7757831811904907, + "learning_rate": 0.00014156002050965712, + "loss": 0.9768, + "step": 8182 + }, + { + "epoch": 1.456908831908832, + "grad_norm": 0.7174801230430603, + "learning_rate": 0.00014154728873816533, + "loss": 1.1712, + "step": 8183 + }, + { + "epoch": 1.4570868945868947, + "grad_norm": 0.5972673892974854, + "learning_rate": 0.0001415345561526407, + "loss": 0.9571, + "step": 8184 + }, + { + "epoch": 1.4572649572649572, + "grad_norm": 0.7999650835990906, + "learning_rate": 0.00014152182275333275, + "loss": 1.0583, + "step": 8185 + }, + { + "epoch": 1.45744301994302, + "grad_norm": 0.6737848520278931, + "learning_rate": 0.00014150908854049091, + "loss": 1.0562, + "step": 8186 + }, + { + "epoch": 1.4576210826210827, + "grad_norm": 0.7756418585777283, + "learning_rate": 0.00014149635351436474, + "loss": 1.2301, + "step": 8187 + }, + { + "epoch": 1.4577991452991452, + "grad_norm": 0.5633914470672607, + "learning_rate": 0.00014148361767520374, + "loss": 0.8847, + "step": 8188 + }, + { + "epoch": 1.457977207977208, + "grad_norm": 0.8462759256362915, + "learning_rate": 0.00014147088102325737, + "loss": 0.8046, + "step": 8189 + }, + { + "epoch": 1.4581552706552707, + "grad_norm": 0.7081632614135742, + "learning_rate": 0.00014145814355877526, + "loss": 1.0764, + "step": 8190 + }, + { + "epoch": 1.4583333333333333, + "grad_norm": 0.7357106804847717, + "learning_rate": 0.00014144540528200698, + "loss": 1.0202, + "step": 8191 + }, + { + "epoch": 1.458511396011396, + "grad_norm": 0.603566586971283, + "learning_rate": 0.00014143266619320204, + "loss": 0.8214, + "step": 8192 + }, + { + "epoch": 1.4586894586894588, + "grad_norm": 0.6829110383987427, + "learning_rate": 0.00014141992629261007, + "loss": 0.9479, + "step": 8193 + }, + { + "epoch": 1.4588675213675213, + "grad_norm": 0.6822739839553833, + "learning_rate": 
0.00014140718558048072, + "loss": 0.9117, + "step": 8194 + }, + { + "epoch": 1.459045584045584, + "grad_norm": 0.7383607029914856, + "learning_rate": 0.00014139444405706356, + "loss": 0.9819, + "step": 8195 + }, + { + "epoch": 1.4592236467236468, + "grad_norm": 0.6319897770881653, + "learning_rate": 0.00014138170172260826, + "loss": 1.0508, + "step": 8196 + }, + { + "epoch": 1.4594017094017093, + "grad_norm": 0.6804461479187012, + "learning_rate": 0.0001413689585773645, + "loss": 0.992, + "step": 8197 + }, + { + "epoch": 1.459579772079772, + "grad_norm": 0.6198720335960388, + "learning_rate": 0.0001413562146215819, + "loss": 1.0113, + "step": 8198 + }, + { + "epoch": 1.4597578347578348, + "grad_norm": 0.5968540906906128, + "learning_rate": 0.0001413434698555102, + "loss": 0.7562, + "step": 8199 + }, + { + "epoch": 1.4599358974358974, + "grad_norm": 0.5370334982872009, + "learning_rate": 0.00014133072427939913, + "loss": 0.9238, + "step": 8200 + }, + { + "epoch": 1.46011396011396, + "grad_norm": 0.6652548909187317, + "learning_rate": 0.00014131797789349832, + "loss": 0.9464, + "step": 8201 + }, + { + "epoch": 1.4602920227920229, + "grad_norm": 0.637852668762207, + "learning_rate": 0.00014130523069805757, + "loss": 1.0395, + "step": 8202 + }, + { + "epoch": 1.4604700854700854, + "grad_norm": 0.8186550140380859, + "learning_rate": 0.00014129248269332664, + "loss": 1.2116, + "step": 8203 + }, + { + "epoch": 1.4606481481481481, + "grad_norm": 0.5290196537971497, + "learning_rate": 0.00014127973387955528, + "loss": 0.7331, + "step": 8204 + }, + { + "epoch": 1.460826210826211, + "grad_norm": 0.6516342163085938, + "learning_rate": 0.00014126698425699332, + "loss": 0.9275, + "step": 8205 + }, + { + "epoch": 1.4610042735042734, + "grad_norm": 0.767254114151001, + "learning_rate": 0.00014125423382589048, + "loss": 0.9355, + "step": 8206 + }, + { + "epoch": 1.4611823361823362, + "grad_norm": 0.6476777195930481, + "learning_rate": 0.00014124148258649668, + "loss": 0.9263, + 
"step": 8207 + }, + { + "epoch": 1.461360398860399, + "grad_norm": 0.6737871766090393, + "learning_rate": 0.00014122873053906167, + "loss": 0.9815, + "step": 8208 + }, + { + "epoch": 1.4615384615384617, + "grad_norm": 0.6311159729957581, + "learning_rate": 0.00014121597768383532, + "loss": 0.9607, + "step": 8209 + }, + { + "epoch": 1.4617165242165242, + "grad_norm": 0.6061250567436218, + "learning_rate": 0.00014120322402106752, + "loss": 0.7428, + "step": 8210 + }, + { + "epoch": 1.461894586894587, + "grad_norm": 0.6916252970695496, + "learning_rate": 0.00014119046955100815, + "loss": 0.9664, + "step": 8211 + }, + { + "epoch": 1.4620726495726495, + "grad_norm": 0.6583660840988159, + "learning_rate": 0.00014117771427390706, + "loss": 1.0645, + "step": 8212 + }, + { + "epoch": 1.4622507122507122, + "grad_norm": 0.7034604549407959, + "learning_rate": 0.00014116495819001425, + "loss": 0.9223, + "step": 8213 + }, + { + "epoch": 1.462428774928775, + "grad_norm": 0.6378605961799622, + "learning_rate": 0.00014115220129957954, + "loss": 0.7963, + "step": 8214 + }, + { + "epoch": 1.4626068376068377, + "grad_norm": 0.6251596212387085, + "learning_rate": 0.00014113944360285297, + "loss": 0.9852, + "step": 8215 + }, + { + "epoch": 1.4627849002849003, + "grad_norm": 0.7055560946464539, + "learning_rate": 0.00014112668510008446, + "loss": 0.9342, + "step": 8216 + }, + { + "epoch": 1.462962962962963, + "grad_norm": 0.6250377893447876, + "learning_rate": 0.00014111392579152396, + "loss": 0.9886, + "step": 8217 + }, + { + "epoch": 1.4631410256410255, + "grad_norm": 0.6011185050010681, + "learning_rate": 0.00014110116567742152, + "loss": 0.8465, + "step": 8218 + }, + { + "epoch": 1.4633190883190883, + "grad_norm": 0.6632489562034607, + "learning_rate": 0.0001410884047580271, + "loss": 0.8619, + "step": 8219 + }, + { + "epoch": 1.463497150997151, + "grad_norm": 0.7194828987121582, + "learning_rate": 0.00014107564303359076, + "loss": 1.1231, + "step": 8220 + }, + { + "epoch": 
1.4636752136752138, + "grad_norm": 0.7640393376350403, + "learning_rate": 0.0001410628805043625, + "loss": 1.1955, + "step": 8221 + }, + { + "epoch": 1.4638532763532763, + "grad_norm": 0.9118906259536743, + "learning_rate": 0.0001410501171705924, + "loss": 1.0555, + "step": 8222 + }, + { + "epoch": 1.464031339031339, + "grad_norm": 0.7545066475868225, + "learning_rate": 0.00014103735303253053, + "loss": 0.9425, + "step": 8223 + }, + { + "epoch": 1.4642094017094016, + "grad_norm": 0.6848801970481873, + "learning_rate": 0.000141024588090427, + "loss": 1.0418, + "step": 8224 + }, + { + "epoch": 1.4643874643874644, + "grad_norm": 0.6825160384178162, + "learning_rate": 0.00014101182234453185, + "loss": 0.9615, + "step": 8225 + }, + { + "epoch": 1.4645655270655271, + "grad_norm": 0.8258556723594666, + "learning_rate": 0.00014099905579509527, + "loss": 1.1237, + "step": 8226 + }, + { + "epoch": 1.4647435897435899, + "grad_norm": 0.6427522897720337, + "learning_rate": 0.00014098628844236733, + "loss": 1.0853, + "step": 8227 + }, + { + "epoch": 1.4649216524216524, + "grad_norm": 0.6476351022720337, + "learning_rate": 0.00014097352028659825, + "loss": 1.1286, + "step": 8228 + }, + { + "epoch": 1.4650997150997151, + "grad_norm": 0.7621034383773804, + "learning_rate": 0.00014096075132803812, + "loss": 1.1402, + "step": 8229 + }, + { + "epoch": 1.4652777777777777, + "grad_norm": 0.6629892587661743, + "learning_rate": 0.00014094798156693718, + "loss": 0.7108, + "step": 8230 + }, + { + "epoch": 1.4654558404558404, + "grad_norm": 0.6902043223381042, + "learning_rate": 0.00014093521100354557, + "loss": 1.1761, + "step": 8231 + }, + { + "epoch": 1.4656339031339032, + "grad_norm": 0.7422910928726196, + "learning_rate": 0.00014092243963811357, + "loss": 0.867, + "step": 8232 + }, + { + "epoch": 1.465811965811966, + "grad_norm": 0.7424963712692261, + "learning_rate": 0.00014090966747089137, + "loss": 1.015, + "step": 8233 + }, + { + "epoch": 1.4659900284900285, + "grad_norm": 
0.6855891942977905, + "learning_rate": 0.0001408968945021292, + "loss": 0.9624, + "step": 8234 + }, + { + "epoch": 1.4661680911680912, + "grad_norm": 0.5968918204307556, + "learning_rate": 0.00014088412073207736, + "loss": 0.9243, + "step": 8235 + }, + { + "epoch": 1.4663461538461537, + "grad_norm": 0.6153344511985779, + "learning_rate": 0.0001408713461609861, + "loss": 1.0305, + "step": 8236 + }, + { + "epoch": 1.4665242165242165, + "grad_norm": 0.6627458333969116, + "learning_rate": 0.0001408585707891057, + "loss": 1.1102, + "step": 8237 + }, + { + "epoch": 1.4667022792022792, + "grad_norm": 0.6475233435630798, + "learning_rate": 0.0001408457946166865, + "loss": 1.0045, + "step": 8238 + }, + { + "epoch": 1.466880341880342, + "grad_norm": 0.6792858839035034, + "learning_rate": 0.00014083301764397876, + "loss": 1.0092, + "step": 8239 + }, + { + "epoch": 1.4670584045584045, + "grad_norm": 0.6916255354881287, + "learning_rate": 0.00014082023987123293, + "loss": 1.0761, + "step": 8240 + }, + { + "epoch": 1.4672364672364673, + "grad_norm": 0.7901251912117004, + "learning_rate": 0.00014080746129869923, + "loss": 0.8002, + "step": 8241 + }, + { + "epoch": 1.4674145299145298, + "grad_norm": 0.8078263401985168, + "learning_rate": 0.00014079468192662812, + "loss": 0.9738, + "step": 8242 + }, + { + "epoch": 1.4675925925925926, + "grad_norm": 0.6370784640312195, + "learning_rate": 0.00014078190175526996, + "loss": 1.0256, + "step": 8243 + }, + { + "epoch": 1.4677706552706553, + "grad_norm": 0.6087532639503479, + "learning_rate": 0.0001407691207848752, + "loss": 0.9747, + "step": 8244 + }, + { + "epoch": 1.467948717948718, + "grad_norm": 0.6333357691764832, + "learning_rate": 0.00014075633901569414, + "loss": 1.0135, + "step": 8245 + }, + { + "epoch": 1.4681267806267806, + "grad_norm": 0.6914255619049072, + "learning_rate": 0.00014074355644797733, + "loss": 1.0261, + "step": 8246 + }, + { + "epoch": 1.4683048433048433, + "grad_norm": 0.6374734044075012, + "learning_rate": 
0.00014073077308197513, + "loss": 0.9197, + "step": 8247 + }, + { + "epoch": 1.4684829059829059, + "grad_norm": 0.8023789525032043, + "learning_rate": 0.00014071798891793807, + "loss": 1.1085, + "step": 8248 + }, + { + "epoch": 1.4686609686609686, + "grad_norm": 0.7722933888435364, + "learning_rate": 0.0001407052039561166, + "loss": 1.2018, + "step": 8249 + }, + { + "epoch": 1.4688390313390314, + "grad_norm": 0.6823393106460571, + "learning_rate": 0.0001406924181967612, + "loss": 1.088, + "step": 8250 + }, + { + "epoch": 1.4690170940170941, + "grad_norm": 0.7037357687950134, + "learning_rate": 0.00014067963164012242, + "loss": 1.0324, + "step": 8251 + }, + { + "epoch": 1.4691951566951567, + "grad_norm": 0.6549737453460693, + "learning_rate": 0.00014066684428645074, + "loss": 1.152, + "step": 8252 + }, + { + "epoch": 1.4693732193732194, + "grad_norm": 0.5349790453910828, + "learning_rate": 0.00014065405613599674, + "loss": 0.6996, + "step": 8253 + }, + { + "epoch": 1.469551282051282, + "grad_norm": 0.6760679483413696, + "learning_rate": 0.00014064126718901096, + "loss": 0.9856, + "step": 8254 + }, + { + "epoch": 1.4697293447293447, + "grad_norm": 0.5912436842918396, + "learning_rate": 0.00014062847744574395, + "loss": 1.0076, + "step": 8255 + }, + { + "epoch": 1.4699074074074074, + "grad_norm": 0.75101637840271, + "learning_rate": 0.00014061568690644632, + "loss": 1.0033, + "step": 8256 + }, + { + "epoch": 1.4700854700854702, + "grad_norm": 0.6233504414558411, + "learning_rate": 0.00014060289557136873, + "loss": 0.8525, + "step": 8257 + }, + { + "epoch": 1.4702635327635327, + "grad_norm": 0.659570038318634, + "learning_rate": 0.00014059010344076171, + "loss": 0.855, + "step": 8258 + }, + { + "epoch": 1.4704415954415955, + "grad_norm": 0.8096539974212646, + "learning_rate": 0.00014057731051487593, + "loss": 0.9905, + "step": 8259 + }, + { + "epoch": 1.470619658119658, + "grad_norm": 0.5829728245735168, + "learning_rate": 0.00014056451679396204, + "loss": 0.7974, + 
"step": 8260 + }, + { + "epoch": 1.4707977207977208, + "grad_norm": 0.6176979541778564, + "learning_rate": 0.0001405517222782707, + "loss": 0.9556, + "step": 8261 + }, + { + "epoch": 1.4709757834757835, + "grad_norm": 0.6322479248046875, + "learning_rate": 0.00014053892696805264, + "loss": 0.8837, + "step": 8262 + }, + { + "epoch": 1.4711538461538463, + "grad_norm": 0.6886917948722839, + "learning_rate": 0.0001405261308635585, + "loss": 0.9242, + "step": 8263 + }, + { + "epoch": 1.4713319088319088, + "grad_norm": 0.7474521994590759, + "learning_rate": 0.00014051333396503901, + "loss": 0.9906, + "step": 8264 + }, + { + "epoch": 1.4715099715099715, + "grad_norm": 0.7120978832244873, + "learning_rate": 0.00014050053627274488, + "loss": 1.1074, + "step": 8265 + }, + { + "epoch": 1.471688034188034, + "grad_norm": 0.6778998374938965, + "learning_rate": 0.0001404877377869269, + "loss": 1.0027, + "step": 8266 + }, + { + "epoch": 1.4718660968660968, + "grad_norm": 0.6832901239395142, + "learning_rate": 0.0001404749385078358, + "loss": 0.9399, + "step": 8267 + }, + { + "epoch": 1.4720441595441596, + "grad_norm": 0.7428423762321472, + "learning_rate": 0.00014046213843572236, + "loss": 1.0591, + "step": 8268 + }, + { + "epoch": 1.4722222222222223, + "grad_norm": 0.7522720098495483, + "learning_rate": 0.00014044933757083737, + "loss": 1.1184, + "step": 8269 + }, + { + "epoch": 1.4724002849002849, + "grad_norm": 0.7714734673500061, + "learning_rate": 0.00014043653591343163, + "loss": 1.0783, + "step": 8270 + }, + { + "epoch": 1.4725783475783476, + "grad_norm": 0.5860890746116638, + "learning_rate": 0.00014042373346375597, + "loss": 0.8394, + "step": 8271 + }, + { + "epoch": 1.4727564102564101, + "grad_norm": 0.6400395035743713, + "learning_rate": 0.0001404109302220612, + "loss": 0.9153, + "step": 8272 + }, + { + "epoch": 1.4729344729344729, + "grad_norm": 0.7441139817237854, + "learning_rate": 0.00014039812618859827, + "loss": 0.9224, + "step": 8273 + }, + { + "epoch": 
1.4731125356125356, + "grad_norm": 0.6030932664871216, + "learning_rate": 0.00014038532136361793, + "loss": 1.0783, + "step": 8274 + }, + { + "epoch": 1.4732905982905984, + "grad_norm": 0.7243345975875854, + "learning_rate": 0.0001403725157473711, + "loss": 0.9894, + "step": 8275 + }, + { + "epoch": 1.473468660968661, + "grad_norm": 0.6880641579627991, + "learning_rate": 0.0001403597093401087, + "loss": 0.9459, + "step": 8276 + }, + { + "epoch": 1.4736467236467237, + "grad_norm": 0.6263882517814636, + "learning_rate": 0.00014034690214208165, + "loss": 0.8781, + "step": 8277 + }, + { + "epoch": 1.4738247863247862, + "grad_norm": 0.7159495949745178, + "learning_rate": 0.00014033409415354085, + "loss": 1.0511, + "step": 8278 + }, + { + "epoch": 1.474002849002849, + "grad_norm": 0.7182226181030273, + "learning_rate": 0.00014032128537473727, + "loss": 1.1196, + "step": 8279 + }, + { + "epoch": 1.4741809116809117, + "grad_norm": 0.744478166103363, + "learning_rate": 0.00014030847580592186, + "loss": 1.0747, + "step": 8280 + }, + { + "epoch": 1.4743589743589745, + "grad_norm": 0.6806797385215759, + "learning_rate": 0.00014029566544734558, + "loss": 1.1519, + "step": 8281 + }, + { + "epoch": 1.474537037037037, + "grad_norm": 0.6813502311706543, + "learning_rate": 0.00014028285429925946, + "loss": 0.968, + "step": 8282 + }, + { + "epoch": 1.4747150997150997, + "grad_norm": 0.639784574508667, + "learning_rate": 0.00014027004236191452, + "loss": 1.0685, + "step": 8283 + }, + { + "epoch": 1.4748931623931623, + "grad_norm": 0.6325878500938416, + "learning_rate": 0.00014025722963556173, + "loss": 1.0358, + "step": 8284 + }, + { + "epoch": 1.475071225071225, + "grad_norm": 0.7012955546379089, + "learning_rate": 0.00014024441612045215, + "loss": 1.1059, + "step": 8285 + }, + { + "epoch": 1.4752492877492878, + "grad_norm": 0.690380334854126, + "learning_rate": 0.00014023160181683684, + "loss": 0.9628, + "step": 8286 + }, + { + "epoch": 1.4754273504273505, + "grad_norm": 
0.7178516983985901, + "learning_rate": 0.00014021878672496686, + "loss": 0.963, + "step": 8287 + }, + { + "epoch": 1.475605413105413, + "grad_norm": 0.7049064636230469, + "learning_rate": 0.0001402059708450933, + "loss": 0.8996, + "step": 8288 + }, + { + "epoch": 1.4757834757834758, + "grad_norm": 0.6777819395065308, + "learning_rate": 0.00014019315417746728, + "loss": 1.0696, + "step": 8289 + }, + { + "epoch": 1.4759615384615383, + "grad_norm": 0.5948763489723206, + "learning_rate": 0.00014018033672233987, + "loss": 0.928, + "step": 8290 + }, + { + "epoch": 1.476139601139601, + "grad_norm": 0.7183942198753357, + "learning_rate": 0.00014016751847996224, + "loss": 1.1053, + "step": 8291 + }, + { + "epoch": 1.4763176638176638, + "grad_norm": 0.7426177263259888, + "learning_rate": 0.00014015469945058556, + "loss": 0.9504, + "step": 8292 + }, + { + "epoch": 1.4764957264957266, + "grad_norm": 0.6508159041404724, + "learning_rate": 0.0001401418796344609, + "loss": 1.1176, + "step": 8293 + }, + { + "epoch": 1.476673789173789, + "grad_norm": 0.6954567432403564, + "learning_rate": 0.00014012905903183954, + "loss": 0.9238, + "step": 8294 + }, + { + "epoch": 1.4768518518518519, + "grad_norm": 0.7023960947990417, + "learning_rate": 0.0001401162376429726, + "loss": 1.2032, + "step": 8295 + }, + { + "epoch": 1.4770299145299146, + "grad_norm": 0.7174739837646484, + "learning_rate": 0.00014010341546811134, + "loss": 0.9385, + "step": 8296 + }, + { + "epoch": 1.4772079772079771, + "grad_norm": 0.611980140209198, + "learning_rate": 0.00014009059250750695, + "loss": 0.9469, + "step": 8297 + }, + { + "epoch": 1.47738603988604, + "grad_norm": 0.6362917423248291, + "learning_rate": 0.0001400777687614107, + "loss": 1.1406, + "step": 8298 + }, + { + "epoch": 1.4775641025641026, + "grad_norm": 0.6884697675704956, + "learning_rate": 0.00014006494423007381, + "loss": 0.7915, + "step": 8299 + }, + { + "epoch": 1.4777421652421652, + "grad_norm": 0.6266025304794312, + "learning_rate": 
0.00014005211891374755, + "loss": 0.94, + "step": 8300 + }, + { + "epoch": 1.477920227920228, + "grad_norm": 0.6130280494689941, + "learning_rate": 0.00014003929281268323, + "loss": 0.9369, + "step": 8301 + }, + { + "epoch": 1.4780982905982907, + "grad_norm": 0.7244207859039307, + "learning_rate": 0.00014002646592713215, + "loss": 1.1449, + "step": 8302 + }, + { + "epoch": 1.4782763532763532, + "grad_norm": 0.6527345776557922, + "learning_rate": 0.0001400136382573456, + "loss": 0.7792, + "step": 8303 + }, + { + "epoch": 1.478454415954416, + "grad_norm": 0.7102689743041992, + "learning_rate": 0.00014000080980357496, + "loss": 0.9577, + "step": 8304 + }, + { + "epoch": 1.4786324786324787, + "grad_norm": 0.6179325580596924, + "learning_rate": 0.00013998798056607154, + "loss": 0.827, + "step": 8305 + }, + { + "epoch": 1.4788105413105412, + "grad_norm": 0.761234700679779, + "learning_rate": 0.00013997515054508668, + "loss": 1.0576, + "step": 8306 + }, + { + "epoch": 1.478988603988604, + "grad_norm": 0.6200914978981018, + "learning_rate": 0.0001399623197408718, + "loss": 1.0514, + "step": 8307 + }, + { + "epoch": 1.4791666666666667, + "grad_norm": 0.5961193442344666, + "learning_rate": 0.0001399494881536783, + "loss": 0.7846, + "step": 8308 + }, + { + "epoch": 1.4793447293447293, + "grad_norm": 0.645984411239624, + "learning_rate": 0.00013993665578375758, + "loss": 0.9927, + "step": 8309 + }, + { + "epoch": 1.479522792022792, + "grad_norm": 0.7258989810943604, + "learning_rate": 0.000139923822631361, + "loss": 0.7567, + "step": 8310 + }, + { + "epoch": 1.4797008547008548, + "grad_norm": 0.708882212638855, + "learning_rate": 0.00013991098869674007, + "loss": 1.1147, + "step": 8311 + }, + { + "epoch": 1.4798789173789173, + "grad_norm": 0.669262707233429, + "learning_rate": 0.00013989815398014624, + "loss": 0.7142, + "step": 8312 + }, + { + "epoch": 1.48005698005698, + "grad_norm": 0.7398767471313477, + "learning_rate": 0.00013988531848183096, + "loss": 1.043, + "step": 
8313 + }, + { + "epoch": 1.4802350427350428, + "grad_norm": 0.753197193145752, + "learning_rate": 0.0001398724822020457, + "loss": 1.058, + "step": 8314 + }, + { + "epoch": 1.4804131054131053, + "grad_norm": 0.663526177406311, + "learning_rate": 0.000139859645141042, + "loss": 1.1272, + "step": 8315 + }, + { + "epoch": 1.480591168091168, + "grad_norm": 0.6537514925003052, + "learning_rate": 0.00013984680729907135, + "loss": 1.011, + "step": 8316 + }, + { + "epoch": 1.4807692307692308, + "grad_norm": 0.707554817199707, + "learning_rate": 0.00013983396867638527, + "loss": 1.0593, + "step": 8317 + }, + { + "epoch": 1.4809472934472934, + "grad_norm": 0.6261475086212158, + "learning_rate": 0.00013982112927323533, + "loss": 1.0731, + "step": 8318 + }, + { + "epoch": 1.4811253561253561, + "grad_norm": 0.6694258451461792, + "learning_rate": 0.00013980828908987308, + "loss": 1.0703, + "step": 8319 + }, + { + "epoch": 1.4813034188034189, + "grad_norm": 0.7793164253234863, + "learning_rate": 0.00013979544812655012, + "loss": 1.0447, + "step": 8320 + }, + { + "epoch": 1.4814814814814814, + "grad_norm": 0.6496448516845703, + "learning_rate": 0.00013978260638351802, + "loss": 1.0208, + "step": 8321 + }, + { + "epoch": 1.4816595441595442, + "grad_norm": 0.5992059111595154, + "learning_rate": 0.00013976976386102834, + "loss": 0.9717, + "step": 8322 + }, + { + "epoch": 1.481837606837607, + "grad_norm": 0.7473567128181458, + "learning_rate": 0.0001397569205593328, + "loss": 0.9612, + "step": 8323 + }, + { + "epoch": 1.4820156695156697, + "grad_norm": 0.657558798789978, + "learning_rate": 0.00013974407647868297, + "loss": 1.2137, + "step": 8324 + }, + { + "epoch": 1.4821937321937322, + "grad_norm": 0.7040614485740662, + "learning_rate": 0.00013973123161933055, + "loss": 1.007, + "step": 8325 + }, + { + "epoch": 1.482371794871795, + "grad_norm": 0.6098681092262268, + "learning_rate": 0.00013971838598152717, + "loss": 1.0595, + "step": 8326 + }, + { + "epoch": 1.4825498575498575, + 
"grad_norm": 0.7194869518280029, + "learning_rate": 0.0001397055395655245, + "loss": 0.9632, + "step": 8327 + }, + { + "epoch": 1.4827279202279202, + "grad_norm": 0.645972728729248, + "learning_rate": 0.00013969269237157426, + "loss": 1.0712, + "step": 8328 + }, + { + "epoch": 1.482905982905983, + "grad_norm": 0.6580560207366943, + "learning_rate": 0.0001396798443999282, + "loss": 1.2117, + "step": 8329 + }, + { + "epoch": 1.4830840455840457, + "grad_norm": 0.6624418497085571, + "learning_rate": 0.00013966699565083802, + "loss": 0.8529, + "step": 8330 + }, + { + "epoch": 1.4832621082621082, + "grad_norm": 0.659896731376648, + "learning_rate": 0.00013965414612455545, + "loss": 0.9359, + "step": 8331 + }, + { + "epoch": 1.483440170940171, + "grad_norm": 0.6690883636474609, + "learning_rate": 0.00013964129582133222, + "loss": 0.971, + "step": 8332 + }, + { + "epoch": 1.4836182336182335, + "grad_norm": 0.6767334938049316, + "learning_rate": 0.00013962844474142022, + "loss": 1.0137, + "step": 8333 + }, + { + "epoch": 1.4837962962962963, + "grad_norm": 0.6412752270698547, + "learning_rate": 0.0001396155928850711, + "loss": 1.2812, + "step": 8334 + }, + { + "epoch": 1.483974358974359, + "grad_norm": 0.6731469035148621, + "learning_rate": 0.0001396027402525368, + "loss": 0.8723, + "step": 8335 + }, + { + "epoch": 1.4841524216524218, + "grad_norm": 0.7327923774719238, + "learning_rate": 0.000139589886844069, + "loss": 0.9606, + "step": 8336 + }, + { + "epoch": 1.4843304843304843, + "grad_norm": 0.6194515824317932, + "learning_rate": 0.00013957703265991963, + "loss": 0.8514, + "step": 8337 + }, + { + "epoch": 1.484508547008547, + "grad_norm": 0.7250012755393982, + "learning_rate": 0.00013956417770034053, + "loss": 0.9755, + "step": 8338 + }, + { + "epoch": 1.4846866096866096, + "grad_norm": 0.7484263181686401, + "learning_rate": 0.00013955132196558358, + "loss": 1.0376, + "step": 8339 + }, + { + "epoch": 1.4848646723646723, + "grad_norm": 0.7593362331390381, + 
"learning_rate": 0.00013953846545590058, + "loss": 1.3011, + "step": 8340 + }, + { + "epoch": 1.485042735042735, + "grad_norm": 0.6670466065406799, + "learning_rate": 0.00013952560817154352, + "loss": 0.9726, + "step": 8341 + }, + { + "epoch": 1.4852207977207978, + "grad_norm": 0.8001134395599365, + "learning_rate": 0.00013951275011276425, + "loss": 1.1447, + "step": 8342 + }, + { + "epoch": 1.4853988603988604, + "grad_norm": 0.741450309753418, + "learning_rate": 0.00013949989127981475, + "loss": 1.1101, + "step": 8343 + }, + { + "epoch": 1.4855769230769231, + "grad_norm": 0.6594467163085938, + "learning_rate": 0.00013948703167294694, + "loss": 1.0205, + "step": 8344 + }, + { + "epoch": 1.4857549857549857, + "grad_norm": 0.6303030252456665, + "learning_rate": 0.00013947417129241276, + "loss": 0.9179, + "step": 8345 + }, + { + "epoch": 1.4859330484330484, + "grad_norm": 0.6352720856666565, + "learning_rate": 0.00013946131013846418, + "loss": 1.158, + "step": 8346 + }, + { + "epoch": 1.4861111111111112, + "grad_norm": 0.6720923781394958, + "learning_rate": 0.0001394484482113532, + "loss": 0.8805, + "step": 8347 + }, + { + "epoch": 1.486289173789174, + "grad_norm": 0.7186421751976013, + "learning_rate": 0.00013943558551133186, + "loss": 0.8951, + "step": 8348 + }, + { + "epoch": 1.4864672364672364, + "grad_norm": 0.6038698554039001, + "learning_rate": 0.00013942272203865214, + "loss": 1.0079, + "step": 8349 + }, + { + "epoch": 1.4866452991452992, + "grad_norm": 0.665790319442749, + "learning_rate": 0.00013940985779356606, + "loss": 0.8853, + "step": 8350 + }, + { + "epoch": 1.4868233618233617, + "grad_norm": 0.6941595673561096, + "learning_rate": 0.00013939699277632568, + "loss": 1.1404, + "step": 8351 + }, + { + "epoch": 1.4870014245014245, + "grad_norm": 0.7943871021270752, + "learning_rate": 0.00013938412698718305, + "loss": 0.9961, + "step": 8352 + }, + { + "epoch": 1.4871794871794872, + "grad_norm": 0.6363818645477295, + "learning_rate": 0.00013937126042639028, + 
"loss": 0.8621, + "step": 8353 + }, + { + "epoch": 1.48735754985755, + "grad_norm": 0.7986421585083008, + "learning_rate": 0.00013935839309419943, + "loss": 1.0547, + "step": 8354 + }, + { + "epoch": 1.4875356125356125, + "grad_norm": 0.5890130400657654, + "learning_rate": 0.00013934552499086266, + "loss": 0.9863, + "step": 8355 + }, + { + "epoch": 1.4877136752136753, + "grad_norm": 0.7915370464324951, + "learning_rate": 0.00013933265611663207, + "loss": 1.0385, + "step": 8356 + }, + { + "epoch": 1.4878917378917378, + "grad_norm": 0.7062503695487976, + "learning_rate": 0.00013931978647175973, + "loss": 1.0984, + "step": 8357 + }, + { + "epoch": 1.4880698005698005, + "grad_norm": 0.6496769785881042, + "learning_rate": 0.00013930691605649792, + "loss": 1.0884, + "step": 8358 + }, + { + "epoch": 1.4882478632478633, + "grad_norm": 0.6527266502380371, + "learning_rate": 0.0001392940448710987, + "loss": 1.0366, + "step": 8359 + }, + { + "epoch": 1.488425925925926, + "grad_norm": 0.6269870400428772, + "learning_rate": 0.00013928117291581431, + "loss": 0.9097, + "step": 8360 + }, + { + "epoch": 1.4886039886039886, + "grad_norm": 0.6581160426139832, + "learning_rate": 0.00013926830019089694, + "loss": 0.8694, + "step": 8361 + }, + { + "epoch": 1.4887820512820513, + "grad_norm": 0.6196219325065613, + "learning_rate": 0.0001392554266965988, + "loss": 0.8054, + "step": 8362 + }, + { + "epoch": 1.4889601139601139, + "grad_norm": 0.6246176362037659, + "learning_rate": 0.0001392425524331721, + "loss": 0.9309, + "step": 8363 + }, + { + "epoch": 1.4891381766381766, + "grad_norm": 0.7293874025344849, + "learning_rate": 0.00013922967740086914, + "loss": 1.051, + "step": 8364 + }, + { + "epoch": 1.4893162393162394, + "grad_norm": 0.6581604480743408, + "learning_rate": 0.00013921680159994213, + "loss": 0.8475, + "step": 8365 + }, + { + "epoch": 1.489494301994302, + "grad_norm": 0.6294612288475037, + "learning_rate": 0.00013920392503064335, + "loss": 0.6946, + "step": 8366 + }, + { + 
"epoch": 1.4896723646723646, + "grad_norm": 0.5725370645523071, + "learning_rate": 0.00013919104769322512, + "loss": 0.7838, + "step": 8367 + }, + { + "epoch": 1.4898504273504274, + "grad_norm": 0.681520402431488, + "learning_rate": 0.00013917816958793967, + "loss": 0.99, + "step": 8368 + }, + { + "epoch": 1.49002849002849, + "grad_norm": 0.6660219430923462, + "learning_rate": 0.00013916529071503943, + "loss": 0.9113, + "step": 8369 + }, + { + "epoch": 1.4902065527065527, + "grad_norm": 0.7567862272262573, + "learning_rate": 0.00013915241107477665, + "loss": 1.2498, + "step": 8370 + }, + { + "epoch": 1.4903846153846154, + "grad_norm": 0.7366036176681519, + "learning_rate": 0.00013913953066740372, + "loss": 1.115, + "step": 8371 + }, + { + "epoch": 1.4905626780626782, + "grad_norm": 0.6201434135437012, + "learning_rate": 0.00013912664949317297, + "loss": 0.8447, + "step": 8372 + }, + { + "epoch": 1.4907407407407407, + "grad_norm": 0.7618655562400818, + "learning_rate": 0.00013911376755233683, + "loss": 0.9696, + "step": 8373 + }, + { + "epoch": 1.4909188034188035, + "grad_norm": 0.6716726422309875, + "learning_rate": 0.00013910088484514764, + "loss": 0.9753, + "step": 8374 + }, + { + "epoch": 1.491096866096866, + "grad_norm": 0.6745659112930298, + "learning_rate": 0.0001390880013718579, + "loss": 1.134, + "step": 8375 + }, + { + "epoch": 1.4912749287749287, + "grad_norm": 0.7524410486221313, + "learning_rate": 0.0001390751171327199, + "loss": 1.0235, + "step": 8376 + }, + { + "epoch": 1.4914529914529915, + "grad_norm": 0.7409411072731018, + "learning_rate": 0.00013906223212798615, + "loss": 0.752, + "step": 8377 + }, + { + "epoch": 1.4916310541310542, + "grad_norm": 0.7016384601593018, + "learning_rate": 0.00013904934635790913, + "loss": 1.1712, + "step": 8378 + }, + { + "epoch": 1.4918091168091168, + "grad_norm": 0.6537824869155884, + "learning_rate": 0.00013903645982274129, + "loss": 1.1162, + "step": 8379 + }, + { + "epoch": 1.4919871794871795, + "grad_norm": 
0.6460806727409363, + "learning_rate": 0.0001390235725227351, + "loss": 0.9389, + "step": 8380 + }, + { + "epoch": 1.492165242165242, + "grad_norm": 0.6405501365661621, + "learning_rate": 0.0001390106844581431, + "loss": 1.0508, + "step": 8381 + }, + { + "epoch": 1.4923433048433048, + "grad_norm": 0.6672594547271729, + "learning_rate": 0.00013899779562921775, + "loss": 1.0018, + "step": 8382 + }, + { + "epoch": 1.4925213675213675, + "grad_norm": 0.6303185820579529, + "learning_rate": 0.0001389849060362116, + "loss": 0.9964, + "step": 8383 + }, + { + "epoch": 1.4926994301994303, + "grad_norm": 0.6981508731842041, + "learning_rate": 0.00013897201567937719, + "loss": 1.174, + "step": 8384 + }, + { + "epoch": 1.4928774928774928, + "grad_norm": 0.6195989847183228, + "learning_rate": 0.0001389591245589671, + "loss": 0.9254, + "step": 8385 + }, + { + "epoch": 1.4930555555555556, + "grad_norm": 0.6232163310050964, + "learning_rate": 0.00013894623267523393, + "loss": 0.7151, + "step": 8386 + }, + { + "epoch": 1.493233618233618, + "grad_norm": 0.673067033290863, + "learning_rate": 0.0001389333400284302, + "loss": 1.0156, + "step": 8387 + }, + { + "epoch": 1.4934116809116809, + "grad_norm": 0.706266462802887, + "learning_rate": 0.00013892044661880856, + "loss": 0.9387, + "step": 8388 + }, + { + "epoch": 1.4935897435897436, + "grad_norm": 0.742640495300293, + "learning_rate": 0.00013890755244662161, + "loss": 1.1597, + "step": 8389 + }, + { + "epoch": 1.4937678062678064, + "grad_norm": 0.6856846809387207, + "learning_rate": 0.000138894657512122, + "loss": 0.9998, + "step": 8390 + }, + { + "epoch": 1.493945868945869, + "grad_norm": 0.7214110493659973, + "learning_rate": 0.0001388817618155624, + "loss": 1.1867, + "step": 8391 + }, + { + "epoch": 1.4941239316239316, + "grad_norm": 0.7346787452697754, + "learning_rate": 0.0001388688653571954, + "loss": 0.9071, + "step": 8392 + }, + { + "epoch": 1.4943019943019942, + "grad_norm": 0.7019181847572327, + "learning_rate": 
0.00013885596813727373, + "loss": 1.0472, + "step": 8393 + }, + { + "epoch": 1.494480056980057, + "grad_norm": 0.6780814528465271, + "learning_rate": 0.00013884307015605012, + "loss": 1.0031, + "step": 8394 + }, + { + "epoch": 1.4946581196581197, + "grad_norm": 0.6722873449325562, + "learning_rate": 0.0001388301714137772, + "loss": 0.8889, + "step": 8395 + }, + { + "epoch": 1.4948361823361824, + "grad_norm": 0.6736134886741638, + "learning_rate": 0.00013881727191070777, + "loss": 0.8695, + "step": 8396 + }, + { + "epoch": 1.495014245014245, + "grad_norm": 0.632648766040802, + "learning_rate": 0.00013880437164709452, + "loss": 0.9391, + "step": 8397 + }, + { + "epoch": 1.4951923076923077, + "grad_norm": 0.7004299163818359, + "learning_rate": 0.0001387914706231902, + "loss": 1.1423, + "step": 8398 + }, + { + "epoch": 1.4953703703703702, + "grad_norm": 0.5787134766578674, + "learning_rate": 0.0001387785688392476, + "loss": 0.9953, + "step": 8399 + }, + { + "epoch": 1.495548433048433, + "grad_norm": 0.6671785712242126, + "learning_rate": 0.0001387656662955195, + "loss": 0.9356, + "step": 8400 + }, + { + "epoch": 1.4957264957264957, + "grad_norm": 0.7216096520423889, + "learning_rate": 0.0001387527629922587, + "loss": 0.9065, + "step": 8401 + }, + { + "epoch": 1.4959045584045585, + "grad_norm": 0.6469849348068237, + "learning_rate": 0.00013873985892971801, + "loss": 1.0664, + "step": 8402 + }, + { + "epoch": 1.496082621082621, + "grad_norm": 0.5598217248916626, + "learning_rate": 0.00013872695410815027, + "loss": 0.8834, + "step": 8403 + }, + { + "epoch": 1.4962606837606838, + "grad_norm": 0.6860302686691284, + "learning_rate": 0.00013871404852780828, + "loss": 0.9061, + "step": 8404 + }, + { + "epoch": 1.4964387464387463, + "grad_norm": 0.7101688385009766, + "learning_rate": 0.00013870114218894497, + "loss": 1.0236, + "step": 8405 + }, + { + "epoch": 1.496616809116809, + "grad_norm": 0.6494225859642029, + "learning_rate": 0.00013868823509181313, + "loss": 0.9631, + 
"step": 8406 + }, + { + "epoch": 1.4967948717948718, + "grad_norm": 0.6804189085960388, + "learning_rate": 0.00013867532723666574, + "loss": 0.9341, + "step": 8407 + }, + { + "epoch": 1.4969729344729346, + "grad_norm": 0.8493942022323608, + "learning_rate": 0.00013866241862375562, + "loss": 1.1451, + "step": 8408 + }, + { + "epoch": 1.497150997150997, + "grad_norm": 0.6248497366905212, + "learning_rate": 0.00013864950925333576, + "loss": 0.8584, + "step": 8409 + }, + { + "epoch": 1.4973290598290598, + "grad_norm": 0.6238769292831421, + "learning_rate": 0.00013863659912565903, + "loss": 1.1612, + "step": 8410 + }, + { + "epoch": 1.4975071225071226, + "grad_norm": 0.8538609147071838, + "learning_rate": 0.0001386236882409784, + "loss": 1.0817, + "step": 8411 + }, + { + "epoch": 1.4976851851851851, + "grad_norm": 0.7301406264305115, + "learning_rate": 0.00013861077659954683, + "loss": 0.943, + "step": 8412 + }, + { + "epoch": 1.4978632478632479, + "grad_norm": 0.6573456525802612, + "learning_rate": 0.0001385978642016173, + "loss": 1.0154, + "step": 8413 + }, + { + "epoch": 1.4980413105413106, + "grad_norm": 0.7634185552597046, + "learning_rate": 0.0001385849510474428, + "loss": 1.0432, + "step": 8414 + }, + { + "epoch": 1.4982193732193732, + "grad_norm": 0.6156686544418335, + "learning_rate": 0.00013857203713727633, + "loss": 1.0442, + "step": 8415 + }, + { + "epoch": 1.498397435897436, + "grad_norm": 0.5386871695518494, + "learning_rate": 0.00013855912247137092, + "loss": 0.9055, + "step": 8416 + }, + { + "epoch": 1.4985754985754987, + "grad_norm": 0.7108574509620667, + "learning_rate": 0.00013854620704997962, + "loss": 0.9705, + "step": 8417 + }, + { + "epoch": 1.4987535612535612, + "grad_norm": 0.7313347458839417, + "learning_rate": 0.00013853329087335547, + "loss": 0.7541, + "step": 8418 + }, + { + "epoch": 1.498931623931624, + "grad_norm": 0.8369119167327881, + "learning_rate": 0.0001385203739417515, + "loss": 1.1317, + "step": 8419 + }, + { + "epoch": 
1.4991096866096867, + "grad_norm": 0.6763789057731628, + "learning_rate": 0.00013850745625542085, + "loss": 0.7909, + "step": 8420 + }, + { + "epoch": 1.4992877492877492, + "grad_norm": 0.7369635105133057, + "learning_rate": 0.00013849453781461656, + "loss": 1.1454, + "step": 8421 + }, + { + "epoch": 1.499465811965812, + "grad_norm": 0.7165971398353577, + "learning_rate": 0.0001384816186195918, + "loss": 1.1927, + "step": 8422 + }, + { + "epoch": 1.4996438746438747, + "grad_norm": 0.7502337694168091, + "learning_rate": 0.00013846869867059966, + "loss": 1.0592, + "step": 8423 + }, + { + "epoch": 1.4998219373219372, + "grad_norm": 0.7207813858985901, + "learning_rate": 0.00013845577796789326, + "loss": 1.1133, + "step": 8424 + }, + { + "epoch": 1.4998219373219372, + "eval_loss": 1.1057652235031128, + "eval_runtime": 24.7975, + "eval_samples_per_second": 41.98, + "eval_steps_per_second": 21.01, + "step": 8424 + }, + { + "epoch": 1.5, + "grad_norm": 0.6962727308273315, + "learning_rate": 0.00013844285651172576, + "loss": 1.0711, + "step": 8425 + }, + { + "epoch": 1.5001780626780628, + "grad_norm": 0.6585133075714111, + "learning_rate": 0.00013842993430235038, + "loss": 0.9793, + "step": 8426 + }, + { + "epoch": 1.5003561253561255, + "grad_norm": 0.7045056819915771, + "learning_rate": 0.00013841701134002029, + "loss": 1.0046, + "step": 8427 + }, + { + "epoch": 1.500534188034188, + "grad_norm": 0.6788702011108398, + "learning_rate": 0.00013840408762498863, + "loss": 0.9539, + "step": 8428 + }, + { + "epoch": 1.5007122507122506, + "grad_norm": 0.7253114581108093, + "learning_rate": 0.00013839116315750863, + "loss": 0.9446, + "step": 8429 + }, + { + "epoch": 1.5008903133903133, + "grad_norm": 0.6103765368461609, + "learning_rate": 0.0001383782379378336, + "loss": 0.7862, + "step": 8430 + }, + { + "epoch": 1.501068376068376, + "grad_norm": 0.6662353873252869, + "learning_rate": 0.00013836531196621666, + "loss": 1.2178, + "step": 8431 + }, + { + "epoch": 1.5012464387464388, 
+ "grad_norm": 0.6871803998947144, + "learning_rate": 0.00013835238524291117, + "loss": 0.9263, + "step": 8432 + }, + { + "epoch": 1.5014245014245016, + "grad_norm": 0.62713223695755, + "learning_rate": 0.00013833945776817034, + "loss": 0.8879, + "step": 8433 + }, + { + "epoch": 1.501602564102564, + "grad_norm": 0.6698164343833923, + "learning_rate": 0.00013832652954224748, + "loss": 0.9847, + "step": 8434 + }, + { + "epoch": 1.5017806267806266, + "grad_norm": 0.6855883002281189, + "learning_rate": 0.0001383136005653959, + "loss": 0.8614, + "step": 8435 + }, + { + "epoch": 1.5019586894586894, + "grad_norm": 0.7028802037239075, + "learning_rate": 0.0001383006708378689, + "loss": 1.0153, + "step": 8436 + }, + { + "epoch": 1.5021367521367521, + "grad_norm": 0.6710380911827087, + "learning_rate": 0.00013828774035991981, + "loss": 1.0163, + "step": 8437 + }, + { + "epoch": 1.5023148148148149, + "grad_norm": 0.618984580039978, + "learning_rate": 0.000138274809131802, + "loss": 1.0015, + "step": 8438 + }, + { + "epoch": 1.5024928774928776, + "grad_norm": 0.6881645321846008, + "learning_rate": 0.00013826187715376882, + "loss": 0.9776, + "step": 8439 + }, + { + "epoch": 1.5026709401709402, + "grad_norm": 0.6715859770774841, + "learning_rate": 0.00013824894442607358, + "loss": 0.9129, + "step": 8440 + }, + { + "epoch": 1.5028490028490027, + "grad_norm": 0.5940943360328674, + "learning_rate": 0.0001382360109489698, + "loss": 1.0724, + "step": 8441 + }, + { + "epoch": 1.5030270655270654, + "grad_norm": 0.6536458134651184, + "learning_rate": 0.0001382230767227108, + "loss": 1.0162, + "step": 8442 + }, + { + "epoch": 1.5032051282051282, + "grad_norm": 0.6163156628608704, + "learning_rate": 0.00013821014174755, + "loss": 1.0521, + "step": 8443 + }, + { + "epoch": 1.503383190883191, + "grad_norm": 0.7592282891273499, + "learning_rate": 0.00013819720602374082, + "loss": 0.9525, + "step": 8444 + }, + { + "epoch": 1.5035612535612537, + "grad_norm": 0.6672595143318176, + 
"learning_rate": 0.0001381842695515368, + "loss": 0.9359, + "step": 8445 + }, + { + "epoch": 1.5037393162393162, + "grad_norm": 0.6395034193992615, + "learning_rate": 0.0001381713323311913, + "loss": 1.166, + "step": 8446 + }, + { + "epoch": 1.5039173789173788, + "grad_norm": 0.5958148837089539, + "learning_rate": 0.00013815839436295783, + "loss": 0.9885, + "step": 8447 + }, + { + "epoch": 1.5040954415954415, + "grad_norm": 0.676555871963501, + "learning_rate": 0.0001381454556470899, + "loss": 1.0637, + "step": 8448 + }, + { + "epoch": 1.5042735042735043, + "grad_norm": 0.642428994178772, + "learning_rate": 0.00013813251618384102, + "loss": 0.9288, + "step": 8449 + }, + { + "epoch": 1.504451566951567, + "grad_norm": 0.6730920076370239, + "learning_rate": 0.00013811957597346467, + "loss": 1.1345, + "step": 8450 + }, + { + "epoch": 1.5046296296296298, + "grad_norm": 0.7824259996414185, + "learning_rate": 0.00013810663501621443, + "loss": 0.7532, + "step": 8451 + }, + { + "epoch": 1.5048076923076923, + "grad_norm": 0.8184825778007507, + "learning_rate": 0.00013809369331234386, + "loss": 1.2674, + "step": 8452 + }, + { + "epoch": 1.5049857549857548, + "grad_norm": 0.7369286417961121, + "learning_rate": 0.00013808075086210647, + "loss": 1.0978, + "step": 8453 + }, + { + "epoch": 1.5051638176638176, + "grad_norm": 0.6336679458618164, + "learning_rate": 0.00013806780766575588, + "loss": 1.0922, + "step": 8454 + }, + { + "epoch": 1.5053418803418803, + "grad_norm": 0.700219452381134, + "learning_rate": 0.0001380548637235457, + "loss": 1.0908, + "step": 8455 + }, + { + "epoch": 1.505519943019943, + "grad_norm": 0.6346127986907959, + "learning_rate": 0.0001380419190357295, + "loss": 1.1265, + "step": 8456 + }, + { + "epoch": 1.5056980056980058, + "grad_norm": 0.8653196096420288, + "learning_rate": 0.00013802897360256093, + "loss": 1.0466, + "step": 8457 + }, + { + "epoch": 1.5058760683760684, + "grad_norm": 0.6589069962501526, + "learning_rate": 0.0001380160274242936, + 
"loss": 1.245, + "step": 8458 + }, + { + "epoch": 1.506054131054131, + "grad_norm": 0.6527602076530457, + "learning_rate": 0.00013800308050118117, + "loss": 1.1539, + "step": 8459 + }, + { + "epoch": 1.5062321937321936, + "grad_norm": 0.6005436182022095, + "learning_rate": 0.00013799013283347734, + "loss": 0.899, + "step": 8460 + }, + { + "epoch": 1.5064102564102564, + "grad_norm": 0.6954274773597717, + "learning_rate": 0.0001379771844214358, + "loss": 1.1245, + "step": 8461 + }, + { + "epoch": 1.5065883190883191, + "grad_norm": 0.658764660358429, + "learning_rate": 0.00013796423526531019, + "loss": 0.9884, + "step": 8462 + }, + { + "epoch": 1.506766381766382, + "grad_norm": 0.652214527130127, + "learning_rate": 0.0001379512853653543, + "loss": 0.9711, + "step": 8463 + }, + { + "epoch": 1.5069444444444444, + "grad_norm": 0.5680044889450073, + "learning_rate": 0.00013793833472182176, + "loss": 0.9055, + "step": 8464 + }, + { + "epoch": 1.5071225071225072, + "grad_norm": 0.7524166703224182, + "learning_rate": 0.0001379253833349664, + "loss": 1.1163, + "step": 8465 + }, + { + "epoch": 1.5073005698005697, + "grad_norm": 0.692936897277832, + "learning_rate": 0.0001379124312050419, + "loss": 0.899, + "step": 8466 + }, + { + "epoch": 1.5074786324786325, + "grad_norm": 0.6871617436408997, + "learning_rate": 0.00013789947833230207, + "loss": 0.9416, + "step": 8467 + }, + { + "epoch": 1.5076566951566952, + "grad_norm": 0.5983462333679199, + "learning_rate": 0.0001378865247170007, + "loss": 0.9776, + "step": 8468 + }, + { + "epoch": 1.507834757834758, + "grad_norm": 0.6486790180206299, + "learning_rate": 0.0001378735703593916, + "loss": 0.9346, + "step": 8469 + }, + { + "epoch": 1.5080128205128205, + "grad_norm": 0.6843809485435486, + "learning_rate": 0.00013786061525972857, + "loss": 1.1276, + "step": 8470 + }, + { + "epoch": 1.5081908831908832, + "grad_norm": 0.5734516382217407, + "learning_rate": 0.00013784765941826538, + "loss": 0.6939, + "step": 8471 + }, + { + "epoch": 
1.5083689458689458, + "grad_norm": 0.6126381754875183, + "learning_rate": 0.00013783470283525596, + "loss": 0.8609, + "step": 8472 + }, + { + "epoch": 1.5085470085470085, + "grad_norm": 0.7570928335189819, + "learning_rate": 0.00013782174551095415, + "loss": 0.8809, + "step": 8473 + }, + { + "epoch": 1.5087250712250713, + "grad_norm": 0.6911360025405884, + "learning_rate": 0.00013780878744561377, + "loss": 0.9916, + "step": 8474 + }, + { + "epoch": 1.508903133903134, + "grad_norm": 0.6651954650878906, + "learning_rate": 0.00013779582863948878, + "loss": 1.0012, + "step": 8475 + }, + { + "epoch": 1.5090811965811965, + "grad_norm": 0.845396876335144, + "learning_rate": 0.000137782869092833, + "loss": 0.8455, + "step": 8476 + }, + { + "epoch": 1.5092592592592593, + "grad_norm": 0.6958050727844238, + "learning_rate": 0.00013776990880590042, + "loss": 1.0264, + "step": 8477 + }, + { + "epoch": 1.5094373219373218, + "grad_norm": 0.6950124502182007, + "learning_rate": 0.00013775694777894493, + "loss": 1.0547, + "step": 8478 + }, + { + "epoch": 1.5096153846153846, + "grad_norm": 0.7243088483810425, + "learning_rate": 0.00013774398601222045, + "loss": 1.0999, + "step": 8479 + }, + { + "epoch": 1.5097934472934473, + "grad_norm": 0.6820448040962219, + "learning_rate": 0.00013773102350598097, + "loss": 0.823, + "step": 8480 + }, + { + "epoch": 1.50997150997151, + "grad_norm": 0.689996063709259, + "learning_rate": 0.0001377180602604805, + "loss": 1.049, + "step": 8481 + }, + { + "epoch": 1.5101495726495726, + "grad_norm": 0.6763314604759216, + "learning_rate": 0.000137705096275973, + "loss": 0.9633, + "step": 8482 + }, + { + "epoch": 1.5103276353276354, + "grad_norm": 0.6760517358779907, + "learning_rate": 0.00013769213155271243, + "loss": 1.0326, + "step": 8483 + }, + { + "epoch": 1.510505698005698, + "grad_norm": 0.7181188464164734, + "learning_rate": 0.00013767916609095285, + "loss": 0.9629, + "step": 8484 + }, + { + "epoch": 1.5106837606837606, + "grad_norm": 
0.7102212905883789, + "learning_rate": 0.0001376661998909483, + "loss": 1.2714, + "step": 8485 + }, + { + "epoch": 1.5108618233618234, + "grad_norm": 0.6719805598258972, + "learning_rate": 0.00013765323295295278, + "loss": 0.7848, + "step": 8486 + }, + { + "epoch": 1.5110398860398861, + "grad_norm": 0.6592095494270325, + "learning_rate": 0.0001376402652772204, + "loss": 0.882, + "step": 8487 + }, + { + "epoch": 1.5112179487179487, + "grad_norm": 0.6858693361282349, + "learning_rate": 0.00013762729686400522, + "loss": 0.9418, + "step": 8488 + }, + { + "epoch": 1.5113960113960114, + "grad_norm": 0.7183199524879456, + "learning_rate": 0.0001376143277135613, + "loss": 1.0611, + "step": 8489 + }, + { + "epoch": 1.511574074074074, + "grad_norm": 0.6294263005256653, + "learning_rate": 0.00013760135782614277, + "loss": 0.864, + "step": 8490 + }, + { + "epoch": 1.5117521367521367, + "grad_norm": 0.6762619614601135, + "learning_rate": 0.00013758838720200376, + "loss": 1.0295, + "step": 8491 + }, + { + "epoch": 1.5119301994301995, + "grad_norm": 0.6919726133346558, + "learning_rate": 0.00013757541584139834, + "loss": 1.0803, + "step": 8492 + }, + { + "epoch": 1.5121082621082622, + "grad_norm": 0.6801241040229797, + "learning_rate": 0.00013756244374458075, + "loss": 1.1394, + "step": 8493 + }, + { + "epoch": 1.5122863247863247, + "grad_norm": 0.6758754253387451, + "learning_rate": 0.0001375494709118051, + "loss": 1.0053, + "step": 8494 + }, + { + "epoch": 1.5124643874643875, + "grad_norm": 0.6727001070976257, + "learning_rate": 0.00013753649734332555, + "loss": 1.1407, + "step": 8495 + }, + { + "epoch": 1.51264245014245, + "grad_norm": 0.693913459777832, + "learning_rate": 0.00013752352303939632, + "loss": 1.1804, + "step": 8496 + }, + { + "epoch": 1.5128205128205128, + "grad_norm": 0.6122510433197021, + "learning_rate": 0.0001375105480002716, + "loss": 0.917, + "step": 8497 + }, + { + "epoch": 1.5129985754985755, + "grad_norm": 0.6305009722709656, + "learning_rate": 
0.00013749757222620562, + "loss": 1.1075, + "step": 8498 + }, + { + "epoch": 1.5131766381766383, + "grad_norm": 0.7249642610549927, + "learning_rate": 0.0001374845957174526, + "loss": 0.9107, + "step": 8499 + }, + { + "epoch": 1.5133547008547008, + "grad_norm": 0.6922136545181274, + "learning_rate": 0.0001374716184742668, + "loss": 0.9974, + "step": 8500 + }, + { + "epoch": 1.5135327635327636, + "grad_norm": 0.6989904046058655, + "learning_rate": 0.00013745864049690245, + "loss": 0.9866, + "step": 8501 + }, + { + "epoch": 1.513710826210826, + "grad_norm": 0.6284058094024658, + "learning_rate": 0.0001374456617856139, + "loss": 0.8658, + "step": 8502 + }, + { + "epoch": 1.5138888888888888, + "grad_norm": 0.615388810634613, + "learning_rate": 0.00013743268234065535, + "loss": 0.7876, + "step": 8503 + }, + { + "epoch": 1.5140669515669516, + "grad_norm": 0.6212600469589233, + "learning_rate": 0.0001374197021622812, + "loss": 0.855, + "step": 8504 + }, + { + "epoch": 1.5142450142450143, + "grad_norm": 0.6312419772148132, + "learning_rate": 0.00013740672125074567, + "loss": 0.9252, + "step": 8505 + }, + { + "epoch": 1.5144230769230769, + "grad_norm": 0.7094576954841614, + "learning_rate": 0.00013739373960630315, + "loss": 0.7655, + "step": 8506 + }, + { + "epoch": 1.5146011396011396, + "grad_norm": 0.5583470463752747, + "learning_rate": 0.000137380757229208, + "loss": 0.7855, + "step": 8507 + }, + { + "epoch": 1.5147792022792022, + "grad_norm": 0.6798399686813354, + "learning_rate": 0.00013736777411971457, + "loss": 0.9935, + "step": 8508 + }, + { + "epoch": 1.514957264957265, + "grad_norm": 0.7835991978645325, + "learning_rate": 0.00013735479027807723, + "loss": 1.1603, + "step": 8509 + }, + { + "epoch": 1.5151353276353277, + "grad_norm": 0.6230790615081787, + "learning_rate": 0.00013734180570455033, + "loss": 1.1463, + "step": 8510 + }, + { + "epoch": 1.5153133903133904, + "grad_norm": 0.646603524684906, + "learning_rate": 0.00013732882039938835, + "loss": 0.9564, + 
"step": 8511 + }, + { + "epoch": 1.515491452991453, + "grad_norm": 0.6619647145271301, + "learning_rate": 0.0001373158343628457, + "loss": 0.8492, + "step": 8512 + }, + { + "epoch": 1.5156695156695157, + "grad_norm": 0.6458454132080078, + "learning_rate": 0.00013730284759517675, + "loss": 1.0049, + "step": 8513 + }, + { + "epoch": 1.5158475783475782, + "grad_norm": 0.7415743470191956, + "learning_rate": 0.00013728986009663602, + "loss": 0.872, + "step": 8514 + }, + { + "epoch": 1.516025641025641, + "grad_norm": 0.6198840141296387, + "learning_rate": 0.00013727687186747793, + "loss": 0.8645, + "step": 8515 + }, + { + "epoch": 1.5162037037037037, + "grad_norm": 0.7160853147506714, + "learning_rate": 0.00013726388290795697, + "loss": 1.0144, + "step": 8516 + }, + { + "epoch": 1.5163817663817665, + "grad_norm": 0.6604135632514954, + "learning_rate": 0.00013725089321832765, + "loss": 0.9827, + "step": 8517 + }, + { + "epoch": 1.5165598290598292, + "grad_norm": 0.6480790972709656, + "learning_rate": 0.00013723790279884443, + "loss": 1.0357, + "step": 8518 + }, + { + "epoch": 1.5167378917378918, + "grad_norm": 0.6207128167152405, + "learning_rate": 0.00013722491164976187, + "loss": 0.9467, + "step": 8519 + }, + { + "epoch": 1.5169159544159543, + "grad_norm": 0.6024298667907715, + "learning_rate": 0.00013721191977133452, + "loss": 0.8821, + "step": 8520 + }, + { + "epoch": 1.517094017094017, + "grad_norm": 0.684898316860199, + "learning_rate": 0.00013719892716381688, + "loss": 0.9823, + "step": 8521 + }, + { + "epoch": 1.5172720797720798, + "grad_norm": 0.7460635304450989, + "learning_rate": 0.00013718593382746355, + "loss": 1.2573, + "step": 8522 + }, + { + "epoch": 1.5174501424501425, + "grad_norm": 0.7193243503570557, + "learning_rate": 0.00013717293976252907, + "loss": 1.0162, + "step": 8523 + }, + { + "epoch": 1.5176282051282053, + "grad_norm": 0.6328752040863037, + "learning_rate": 0.0001371599449692681, + "loss": 0.8183, + "step": 8524 + }, + { + "epoch": 
1.5178062678062678, + "grad_norm": 0.658784806728363, + "learning_rate": 0.00013714694944793517, + "loss": 0.9315, + "step": 8525 + }, + { + "epoch": 1.5179843304843303, + "grad_norm": 0.7875827550888062, + "learning_rate": 0.00013713395319878493, + "loss": 1.0889, + "step": 8526 + }, + { + "epoch": 1.518162393162393, + "grad_norm": 0.6580079793930054, + "learning_rate": 0.00013712095622207203, + "loss": 1.0276, + "step": 8527 + }, + { + "epoch": 1.5183404558404558, + "grad_norm": 0.6214027404785156, + "learning_rate": 0.00013710795851805106, + "loss": 0.9692, + "step": 8528 + }, + { + "epoch": 1.5185185185185186, + "grad_norm": 0.7839403748512268, + "learning_rate": 0.0001370949600869768, + "loss": 0.7378, + "step": 8529 + }, + { + "epoch": 1.5186965811965814, + "grad_norm": 0.6632764339447021, + "learning_rate": 0.0001370819609291038, + "loss": 0.9431, + "step": 8530 + }, + { + "epoch": 1.5188746438746439, + "grad_norm": 0.7071712017059326, + "learning_rate": 0.00013706896104468682, + "loss": 0.7684, + "step": 8531 + }, + { + "epoch": 1.5190527065527064, + "grad_norm": 0.7494829297065735, + "learning_rate": 0.00013705596043398058, + "loss": 0.9709, + "step": 8532 + }, + { + "epoch": 1.5192307692307692, + "grad_norm": 0.6408106088638306, + "learning_rate": 0.00013704295909723973, + "loss": 0.8494, + "step": 8533 + }, + { + "epoch": 1.519408831908832, + "grad_norm": 0.6043150424957275, + "learning_rate": 0.0001370299570347191, + "loss": 0.7485, + "step": 8534 + }, + { + "epoch": 1.5195868945868947, + "grad_norm": 0.6944992542266846, + "learning_rate": 0.00013701695424667336, + "loss": 0.8403, + "step": 8535 + }, + { + "epoch": 1.5197649572649574, + "grad_norm": 0.7730217576026917, + "learning_rate": 0.00013700395073335726, + "loss": 0.9122, + "step": 8536 + }, + { + "epoch": 1.51994301994302, + "grad_norm": 0.6300255060195923, + "learning_rate": 0.00013699094649502564, + "loss": 0.9185, + "step": 8537 + }, + { + "epoch": 1.5201210826210825, + "grad_norm": 
0.648676335811615, + "learning_rate": 0.00013697794153193327, + "loss": 0.9897, + "step": 8538 + }, + { + "epoch": 1.5202991452991452, + "grad_norm": 0.7365788817405701, + "learning_rate": 0.00013696493584433494, + "loss": 0.958, + "step": 8539 + }, + { + "epoch": 1.520477207977208, + "grad_norm": 0.6634557247161865, + "learning_rate": 0.00013695192943248552, + "loss": 0.9389, + "step": 8540 + }, + { + "epoch": 1.5206552706552707, + "grad_norm": 0.6110827922821045, + "learning_rate": 0.00013693892229663977, + "loss": 0.9341, + "step": 8541 + }, + { + "epoch": 1.5208333333333335, + "grad_norm": 0.7207275032997131, + "learning_rate": 0.00013692591443705256, + "loss": 0.9526, + "step": 8542 + }, + { + "epoch": 1.521011396011396, + "grad_norm": 0.7071022391319275, + "learning_rate": 0.0001369129058539788, + "loss": 0.9572, + "step": 8543 + }, + { + "epoch": 1.5211894586894585, + "grad_norm": 0.5898227691650391, + "learning_rate": 0.0001368998965476733, + "loss": 0.921, + "step": 8544 + }, + { + "epoch": 1.5213675213675213, + "grad_norm": 0.7542559504508972, + "learning_rate": 0.000136886886518391, + "loss": 0.7799, + "step": 8545 + }, + { + "epoch": 1.521545584045584, + "grad_norm": 0.6904959678649902, + "learning_rate": 0.00013687387576638674, + "loss": 0.9601, + "step": 8546 + }, + { + "epoch": 1.5217236467236468, + "grad_norm": 0.763414204120636, + "learning_rate": 0.00013686086429191553, + "loss": 1.0046, + "step": 8547 + }, + { + "epoch": 1.5219017094017095, + "grad_norm": 0.6879960298538208, + "learning_rate": 0.00013684785209523224, + "loss": 0.9615, + "step": 8548 + }, + { + "epoch": 1.522079772079772, + "grad_norm": 0.7166057229042053, + "learning_rate": 0.00013683483917659186, + "loss": 0.9481, + "step": 8549 + }, + { + "epoch": 1.5222578347578346, + "grad_norm": 0.6384348273277283, + "learning_rate": 0.0001368218255362493, + "loss": 1.1037, + "step": 8550 + }, + { + "epoch": 1.5224358974358974, + "grad_norm": 0.6564528346061707, + "learning_rate": 
0.00013680881117445953, + "loss": 0.951, + "step": 8551 + }, + { + "epoch": 1.52261396011396, + "grad_norm": 0.749301016330719, + "learning_rate": 0.00013679579609147762, + "loss": 0.9324, + "step": 8552 + }, + { + "epoch": 1.5227920227920229, + "grad_norm": 0.8130472898483276, + "learning_rate": 0.00013678278028755848, + "loss": 1.0178, + "step": 8553 + }, + { + "epoch": 1.5229700854700856, + "grad_norm": 0.6763297319412231, + "learning_rate": 0.0001367697637629572, + "loss": 0.9224, + "step": 8554 + }, + { + "epoch": 1.5231481481481481, + "grad_norm": 0.6630885601043701, + "learning_rate": 0.00013675674651792878, + "loss": 1.0254, + "step": 8555 + }, + { + "epoch": 1.5233262108262107, + "grad_norm": 0.7377206087112427, + "learning_rate": 0.00013674372855272825, + "loss": 1.0413, + "step": 8556 + }, + { + "epoch": 1.5235042735042734, + "grad_norm": 0.5270320177078247, + "learning_rate": 0.00013673070986761068, + "loss": 0.7124, + "step": 8557 + }, + { + "epoch": 1.5236823361823362, + "grad_norm": 0.5941976308822632, + "learning_rate": 0.00013671769046283116, + "loss": 1.0281, + "step": 8558 + }, + { + "epoch": 1.523860398860399, + "grad_norm": 0.6131376028060913, + "learning_rate": 0.0001367046703386448, + "loss": 0.7593, + "step": 8559 + }, + { + "epoch": 1.5240384615384617, + "grad_norm": 0.7381763458251953, + "learning_rate": 0.00013669164949530664, + "loss": 1.148, + "step": 8560 + }, + { + "epoch": 1.5242165242165242, + "grad_norm": 0.683274507522583, + "learning_rate": 0.00013667862793307185, + "loss": 0.8354, + "step": 8561 + }, + { + "epoch": 1.5243945868945867, + "grad_norm": 0.6912649273872375, + "learning_rate": 0.0001366656056521955, + "loss": 0.9043, + "step": 8562 + }, + { + "epoch": 1.5245726495726495, + "grad_norm": 0.5999594330787659, + "learning_rate": 0.0001366525826529328, + "loss": 0.6138, + "step": 8563 + }, + { + "epoch": 1.5247507122507122, + "grad_norm": 0.7185927629470825, + "learning_rate": 0.00013663955893553892, + "loss": 0.895, + 
"step": 8564 + }, + { + "epoch": 1.524928774928775, + "grad_norm": 0.5967002511024475, + "learning_rate": 0.00013662653450026893, + "loss": 0.9636, + "step": 8565 + }, + { + "epoch": 1.5251068376068377, + "grad_norm": 0.7122953534126282, + "learning_rate": 0.00013661350934737813, + "loss": 0.9465, + "step": 8566 + }, + { + "epoch": 1.5252849002849003, + "grad_norm": 0.705326497554779, + "learning_rate": 0.00013660048347712163, + "loss": 1.121, + "step": 8567 + }, + { + "epoch": 1.5254629629629628, + "grad_norm": 0.6023733019828796, + "learning_rate": 0.0001365874568897547, + "loss": 0.9881, + "step": 8568 + }, + { + "epoch": 1.5256410256410255, + "grad_norm": 0.6883122324943542, + "learning_rate": 0.0001365744295855326, + "loss": 1.2372, + "step": 8569 + }, + { + "epoch": 1.5258190883190883, + "grad_norm": 0.718126654624939, + "learning_rate": 0.0001365614015647105, + "loss": 1.0888, + "step": 8570 + }, + { + "epoch": 1.525997150997151, + "grad_norm": 0.6649243831634521, + "learning_rate": 0.00013654837282754367, + "loss": 1.0458, + "step": 8571 + }, + { + "epoch": 1.5261752136752138, + "grad_norm": 0.6959797143936157, + "learning_rate": 0.00013653534337428738, + "loss": 0.9282, + "step": 8572 + }, + { + "epoch": 1.5263532763532763, + "grad_norm": 0.6069976687431335, + "learning_rate": 0.00013652231320519697, + "loss": 0.9706, + "step": 8573 + }, + { + "epoch": 1.526531339031339, + "grad_norm": 0.7085374593734741, + "learning_rate": 0.0001365092823205277, + "loss": 1.1241, + "step": 8574 + }, + { + "epoch": 1.5267094017094016, + "grad_norm": 0.575106143951416, + "learning_rate": 0.00013649625072053488, + "loss": 0.9814, + "step": 8575 + }, + { + "epoch": 1.5268874643874644, + "grad_norm": 0.6541273593902588, + "learning_rate": 0.00013648321840547384, + "loss": 1.0155, + "step": 8576 + }, + { + "epoch": 1.5270655270655271, + "grad_norm": 0.6754382848739624, + "learning_rate": 0.0001364701853755999, + "loss": 1.0284, + "step": 8577 + }, + { + "epoch": 
1.5272435897435899, + "grad_norm": 0.6219634413719177, + "learning_rate": 0.00013645715163116846, + "loss": 1.1539, + "step": 8578 + }, + { + "epoch": 1.5274216524216524, + "grad_norm": 0.7625157833099365, + "learning_rate": 0.00013644411717243486, + "loss": 1.1157, + "step": 8579 + }, + { + "epoch": 1.5275997150997151, + "grad_norm": 0.6944296956062317, + "learning_rate": 0.0001364310819996545, + "loss": 0.8309, + "step": 8580 + }, + { + "epoch": 1.5277777777777777, + "grad_norm": 0.7198494672775269, + "learning_rate": 0.00013641804611308277, + "loss": 1.0883, + "step": 8581 + }, + { + "epoch": 1.5279558404558404, + "grad_norm": 0.6398822069168091, + "learning_rate": 0.00013640500951297508, + "loss": 1.0173, + "step": 8582 + }, + { + "epoch": 1.5281339031339032, + "grad_norm": 0.7306683659553528, + "learning_rate": 0.00013639197219958682, + "loss": 0.9979, + "step": 8583 + }, + { + "epoch": 1.528311965811966, + "grad_norm": 0.6873512268066406, + "learning_rate": 0.00013637893417317348, + "loss": 0.7883, + "step": 8584 + }, + { + "epoch": 1.5284900284900285, + "grad_norm": 0.6482085585594177, + "learning_rate": 0.00013636589543399052, + "loss": 0.9367, + "step": 8585 + }, + { + "epoch": 1.5286680911680912, + "grad_norm": 0.8161232471466064, + "learning_rate": 0.00013635285598229336, + "loss": 1.0582, + "step": 8586 + }, + { + "epoch": 1.5288461538461537, + "grad_norm": 0.6722155809402466, + "learning_rate": 0.0001363398158183375, + "loss": 0.9805, + "step": 8587 + }, + { + "epoch": 1.5290242165242165, + "grad_norm": 0.7175397872924805, + "learning_rate": 0.00013632677494237845, + "loss": 1.0747, + "step": 8588 + }, + { + "epoch": 1.5292022792022792, + "grad_norm": 0.6665592789649963, + "learning_rate": 0.00013631373335467172, + "loss": 1.006, + "step": 8589 + }, + { + "epoch": 1.529380341880342, + "grad_norm": 0.7002299427986145, + "learning_rate": 0.0001363006910554728, + "loss": 1.0702, + "step": 8590 + }, + { + "epoch": 1.5295584045584045, + "grad_norm": 
0.7712168097496033, + "learning_rate": 0.00013628764804503725, + "loss": 1.0628, + "step": 8591 + }, + { + "epoch": 1.5297364672364673, + "grad_norm": 0.6620795130729675, + "learning_rate": 0.0001362746043236206, + "loss": 1.01, + "step": 8592 + }, + { + "epoch": 1.5299145299145298, + "grad_norm": 0.6374393701553345, + "learning_rate": 0.00013626155989147846, + "loss": 0.9106, + "step": 8593 + }, + { + "epoch": 1.5300925925925926, + "grad_norm": 0.6531631946563721, + "learning_rate": 0.00013624851474886636, + "loss": 1.0488, + "step": 8594 + }, + { + "epoch": 1.5302706552706553, + "grad_norm": 0.6843775510787964, + "learning_rate": 0.00013623546889603993, + "loss": 0.8599, + "step": 8595 + }, + { + "epoch": 1.530448717948718, + "grad_norm": 0.7232706546783447, + "learning_rate": 0.00013622242233325476, + "loss": 1.0875, + "step": 8596 + }, + { + "epoch": 1.5306267806267806, + "grad_norm": 0.695691704750061, + "learning_rate": 0.00013620937506076644, + "loss": 0.9835, + "step": 8597 + }, + { + "epoch": 1.5308048433048433, + "grad_norm": 0.6321248412132263, + "learning_rate": 0.00013619632707883065, + "loss": 0.9778, + "step": 8598 + }, + { + "epoch": 1.5309829059829059, + "grad_norm": 0.6469168663024902, + "learning_rate": 0.00013618327838770303, + "loss": 0.9968, + "step": 8599 + }, + { + "epoch": 1.5311609686609686, + "grad_norm": 0.6798683404922485, + "learning_rate": 0.00013617022898763925, + "loss": 0.78, + "step": 8600 + }, + { + "epoch": 1.5313390313390314, + "grad_norm": 0.6932336091995239, + "learning_rate": 0.00013615717887889496, + "loss": 0.9473, + "step": 8601 + }, + { + "epoch": 1.5315170940170941, + "grad_norm": 0.7304185628890991, + "learning_rate": 0.00013614412806172585, + "loss": 1.0478, + "step": 8602 + }, + { + "epoch": 1.5316951566951567, + "grad_norm": 0.6585272550582886, + "learning_rate": 0.00013613107653638763, + "loss": 0.8563, + "step": 8603 + }, + { + "epoch": 1.5318732193732194, + "grad_norm": 0.6804470419883728, + "learning_rate": 
0.00013611802430313604, + "loss": 0.9839, + "step": 8604 + }, + { + "epoch": 1.532051282051282, + "grad_norm": 0.7271378040313721, + "learning_rate": 0.0001361049713622268, + "loss": 1.0906, + "step": 8605 + }, + { + "epoch": 1.5322293447293447, + "grad_norm": 0.7731603980064392, + "learning_rate": 0.00013609191771391562, + "loss": 1.1318, + "step": 8606 + }, + { + "epoch": 1.5324074074074074, + "grad_norm": 0.6143709421157837, + "learning_rate": 0.0001360788633584583, + "loss": 0.8726, + "step": 8607 + }, + { + "epoch": 1.5325854700854702, + "grad_norm": 0.6847203373908997, + "learning_rate": 0.00013606580829611056, + "loss": 0.9963, + "step": 8608 + }, + { + "epoch": 1.5327635327635327, + "grad_norm": 0.7561219334602356, + "learning_rate": 0.0001360527525271283, + "loss": 0.8873, + "step": 8609 + }, + { + "epoch": 1.5329415954415955, + "grad_norm": 0.7997925281524658, + "learning_rate": 0.0001360396960517672, + "loss": 0.7675, + "step": 8610 + }, + { + "epoch": 1.533119658119658, + "grad_norm": 0.7206357717514038, + "learning_rate": 0.00013602663887028315, + "loss": 1.0084, + "step": 8611 + }, + { + "epoch": 1.5332977207977208, + "grad_norm": 0.6454238891601562, + "learning_rate": 0.00013601358098293194, + "loss": 0.8194, + "step": 8612 + }, + { + "epoch": 1.5334757834757835, + "grad_norm": 0.5531884431838989, + "learning_rate": 0.0001360005223899694, + "loss": 0.8596, + "step": 8613 + }, + { + "epoch": 1.5336538461538463, + "grad_norm": 0.659161388874054, + "learning_rate": 0.00013598746309165144, + "loss": 1.0363, + "step": 8614 + }, + { + "epoch": 1.5338319088319088, + "grad_norm": 0.6958948373794556, + "learning_rate": 0.00013597440308823385, + "loss": 0.9852, + "step": 8615 + }, + { + "epoch": 1.5340099715099715, + "grad_norm": 0.7147171497344971, + "learning_rate": 0.0001359613423799726, + "loss": 1.0506, + "step": 8616 + }, + { + "epoch": 1.534188034188034, + "grad_norm": 0.604450523853302, + "learning_rate": 0.00013594828096712353, + "loss": 0.9344, + 
"step": 8617 + }, + { + "epoch": 1.5343660968660968, + "grad_norm": 0.714547336101532, + "learning_rate": 0.00013593521884994257, + "loss": 1.1583, + "step": 8618 + }, + { + "epoch": 1.5345441595441596, + "grad_norm": 0.6864442825317383, + "learning_rate": 0.00013592215602868565, + "loss": 0.991, + "step": 8619 + }, + { + "epoch": 1.5347222222222223, + "grad_norm": 0.6384446620941162, + "learning_rate": 0.00013590909250360873, + "loss": 0.8799, + "step": 8620 + }, + { + "epoch": 1.5349002849002849, + "grad_norm": 0.7307949662208557, + "learning_rate": 0.00013589602827496772, + "loss": 1.0276, + "step": 8621 + }, + { + "epoch": 1.5350783475783476, + "grad_norm": 0.6620129942893982, + "learning_rate": 0.00013588296334301862, + "loss": 0.9378, + "step": 8622 + }, + { + "epoch": 1.5352564102564101, + "grad_norm": 0.7216851711273193, + "learning_rate": 0.00013586989770801735, + "loss": 0.8984, + "step": 8623 + }, + { + "epoch": 1.5354344729344729, + "grad_norm": 0.7319885492324829, + "learning_rate": 0.00013585683137022, + "loss": 1.0357, + "step": 8624 + }, + { + "epoch": 1.5356125356125356, + "grad_norm": 0.7455703616142273, + "learning_rate": 0.00013584376432988247, + "loss": 0.9727, + "step": 8625 + }, + { + "epoch": 1.5357905982905984, + "grad_norm": 0.7285277247428894, + "learning_rate": 0.0001358306965872609, + "loss": 1.1132, + "step": 8626 + }, + { + "epoch": 1.535968660968661, + "grad_norm": 0.6250096559524536, + "learning_rate": 0.00013581762814261124, + "loss": 0.8538, + "step": 8627 + }, + { + "epoch": 1.5361467236467237, + "grad_norm": 0.6252279281616211, + "learning_rate": 0.0001358045589961895, + "loss": 0.822, + "step": 8628 + }, + { + "epoch": 1.5363247863247862, + "grad_norm": 0.7723368406295776, + "learning_rate": 0.0001357914891482519, + "loss": 0.9841, + "step": 8629 + }, + { + "epoch": 1.536502849002849, + "grad_norm": 0.6855236887931824, + "learning_rate": 0.00013577841859905435, + "loss": 0.9512, + "step": 8630 + }, + { + "epoch": 
1.5366809116809117, + "grad_norm": 0.8320944309234619, + "learning_rate": 0.00013576534734885303, + "loss": 1.0324, + "step": 8631 + }, + { + "epoch": 1.5368589743589745, + "grad_norm": 0.6970052123069763, + "learning_rate": 0.00013575227539790405, + "loss": 0.9874, + "step": 8632 + }, + { + "epoch": 1.5370370370370372, + "grad_norm": 0.7774853110313416, + "learning_rate": 0.00013573920274646345, + "loss": 0.962, + "step": 8633 + }, + { + "epoch": 1.5372150997150997, + "grad_norm": 0.6479182839393616, + "learning_rate": 0.0001357261293947875, + "loss": 0.9438, + "step": 8634 + }, + { + "epoch": 1.5373931623931623, + "grad_norm": 0.6855679750442505, + "learning_rate": 0.00013571305534313218, + "loss": 1.0898, + "step": 8635 + }, + { + "epoch": 1.537571225071225, + "grad_norm": 0.6527835726737976, + "learning_rate": 0.00013569998059175377, + "loss": 0.954, + "step": 8636 + }, + { + "epoch": 1.5377492877492878, + "grad_norm": 0.6601176857948303, + "learning_rate": 0.00013568690514090837, + "loss": 1.0183, + "step": 8637 + }, + { + "epoch": 1.5379273504273505, + "grad_norm": 0.6628120541572571, + "learning_rate": 0.0001356738289908522, + "loss": 1.0651, + "step": 8638 + }, + { + "epoch": 1.5381054131054133, + "grad_norm": 0.7492203712463379, + "learning_rate": 0.00013566075214184147, + "loss": 1.2438, + "step": 8639 + }, + { + "epoch": 1.5382834757834758, + "grad_norm": 0.6781343817710876, + "learning_rate": 0.00013564767459413237, + "loss": 0.9413, + "step": 8640 + }, + { + "epoch": 1.5384615384615383, + "grad_norm": 0.6890891790390015, + "learning_rate": 0.00013563459634798115, + "loss": 0.9912, + "step": 8641 + }, + { + "epoch": 1.538639601139601, + "grad_norm": 0.722820520401001, + "learning_rate": 0.00013562151740364404, + "loss": 1.1799, + "step": 8642 + }, + { + "epoch": 1.5388176638176638, + "grad_norm": 0.738369882106781, + "learning_rate": 0.0001356084377613773, + "loss": 1.1313, + "step": 8643 + }, + { + "epoch": 1.5389957264957266, + "grad_norm": 
0.6232718229293823, + "learning_rate": 0.00013559535742143717, + "loss": 0.9035, + "step": 8644 + }, + { + "epoch": 1.5391737891737893, + "grad_norm": 0.7371624708175659, + "learning_rate": 0.00013558227638407996, + "loss": 1.3377, + "step": 8645 + }, + { + "epoch": 1.5393518518518519, + "grad_norm": 0.658353865146637, + "learning_rate": 0.00013556919464956197, + "loss": 0.9591, + "step": 8646 + }, + { + "epoch": 1.5395299145299144, + "grad_norm": 0.6205827593803406, + "learning_rate": 0.0001355561122181395, + "loss": 0.9217, + "step": 8647 + }, + { + "epoch": 1.5397079772079771, + "grad_norm": 0.5892502069473267, + "learning_rate": 0.00013554302909006888, + "loss": 0.8893, + "step": 8648 + }, + { + "epoch": 1.53988603988604, + "grad_norm": 1.224568486213684, + "learning_rate": 0.0001355299452656064, + "loss": 0.8237, + "step": 8649 + }, + { + "epoch": 1.5400641025641026, + "grad_norm": 0.7732635736465454, + "learning_rate": 0.0001355168607450085, + "loss": 1.1043, + "step": 8650 + }, + { + "epoch": 1.5402421652421654, + "grad_norm": 0.6365402340888977, + "learning_rate": 0.00013550377552853146, + "loss": 1.0345, + "step": 8651 + }, + { + "epoch": 1.540420227920228, + "grad_norm": 0.7046400904655457, + "learning_rate": 0.00013549068961643171, + "loss": 1.0361, + "step": 8652 + }, + { + "epoch": 1.5405982905982905, + "grad_norm": 0.6760256886482239, + "learning_rate": 0.0001354776030089656, + "loss": 0.9437, + "step": 8653 + }, + { + "epoch": 1.5407763532763532, + "grad_norm": 0.6180984973907471, + "learning_rate": 0.00013546451570638958, + "loss": 0.9737, + "step": 8654 + }, + { + "epoch": 1.540954415954416, + "grad_norm": 0.6221960186958313, + "learning_rate": 0.00013545142770896005, + "loss": 0.9313, + "step": 8655 + }, + { + "epoch": 1.5411324786324787, + "grad_norm": 0.6887816786766052, + "learning_rate": 0.0001354383390169334, + "loss": 1.1736, + "step": 8656 + }, + { + "epoch": 1.5413105413105415, + "grad_norm": 0.5840606093406677, + "learning_rate": 
0.00013542524963056614, + "loss": 0.9269, + "step": 8657 + }, + { + "epoch": 1.541488603988604, + "grad_norm": 0.7396654486656189, + "learning_rate": 0.00013541215955011472, + "loss": 1.1189, + "step": 8658 + }, + { + "epoch": 1.5416666666666665, + "grad_norm": 0.780616819858551, + "learning_rate": 0.00013539906877583555, + "loss": 1.1251, + "step": 8659 + }, + { + "epoch": 1.5418447293447293, + "grad_norm": 0.6975206732749939, + "learning_rate": 0.0001353859773079852, + "loss": 1.2134, + "step": 8660 + }, + { + "epoch": 1.542022792022792, + "grad_norm": 0.7572869658470154, + "learning_rate": 0.00013537288514682013, + "loss": 0.9396, + "step": 8661 + }, + { + "epoch": 1.5422008547008548, + "grad_norm": 0.6252159476280212, + "learning_rate": 0.00013535979229259686, + "loss": 0.8449, + "step": 8662 + }, + { + "epoch": 1.5423789173789175, + "grad_norm": 0.7321650981903076, + "learning_rate": 0.0001353466987455719, + "loss": 1.3263, + "step": 8663 + }, + { + "epoch": 1.54255698005698, + "grad_norm": 0.7168700695037842, + "learning_rate": 0.00013533360450600177, + "loss": 0.8923, + "step": 8664 + }, + { + "epoch": 1.5427350427350426, + "grad_norm": 0.5931934714317322, + "learning_rate": 0.00013532050957414313, + "loss": 0.8448, + "step": 8665 + }, + { + "epoch": 1.5429131054131053, + "grad_norm": 0.6621279120445251, + "learning_rate": 0.00013530741395025245, + "loss": 1.1023, + "step": 8666 + }, + { + "epoch": 1.543091168091168, + "grad_norm": 0.7133732438087463, + "learning_rate": 0.00013529431763458633, + "loss": 0.9986, + "step": 8667 + }, + { + "epoch": 1.5432692307692308, + "grad_norm": 0.7589015960693359, + "learning_rate": 0.0001352812206274014, + "loss": 1.0111, + "step": 8668 + }, + { + "epoch": 1.5434472934472936, + "grad_norm": 0.6958192586898804, + "learning_rate": 0.0001352681229289542, + "loss": 0.9466, + "step": 8669 + }, + { + "epoch": 1.5436253561253561, + "grad_norm": 0.7539750337600708, + "learning_rate": 0.0001352550245395014, + "loss": 1.0974, + 
"step": 8670 + }, + { + "epoch": 1.5438034188034186, + "grad_norm": 0.7003816366195679, + "learning_rate": 0.00013524192545929964, + "loss": 1.0354, + "step": 8671 + }, + { + "epoch": 1.5439814814814814, + "grad_norm": 0.6503025889396667, + "learning_rate": 0.00013522882568860558, + "loss": 1.0476, + "step": 8672 + }, + { + "epoch": 1.5441595441595442, + "grad_norm": 0.6757345199584961, + "learning_rate": 0.00013521572522767584, + "loss": 0.864, + "step": 8673 + }, + { + "epoch": 1.544337606837607, + "grad_norm": 0.6857611536979675, + "learning_rate": 0.0001352026240767671, + "loss": 1.1627, + "step": 8674 + }, + { + "epoch": 1.5445156695156697, + "grad_norm": 0.5775430798530579, + "learning_rate": 0.0001351895222361361, + "loss": 0.7444, + "step": 8675 + }, + { + "epoch": 1.5446937321937322, + "grad_norm": 0.7511499524116516, + "learning_rate": 0.00013517641970603952, + "loss": 1.1547, + "step": 8676 + }, + { + "epoch": 1.5448717948717947, + "grad_norm": 0.6727504730224609, + "learning_rate": 0.00013516331648673403, + "loss": 1.0829, + "step": 8677 + }, + { + "epoch": 1.5450498575498575, + "grad_norm": 0.6128812432289124, + "learning_rate": 0.00013515021257847642, + "loss": 0.9318, + "step": 8678 + }, + { + "epoch": 1.5452279202279202, + "grad_norm": 0.7309781312942505, + "learning_rate": 0.00013513710798152343, + "loss": 1.0844, + "step": 8679 + }, + { + "epoch": 1.545405982905983, + "grad_norm": 0.695655882358551, + "learning_rate": 0.00013512400269613176, + "loss": 1.113, + "step": 8680 + }, + { + "epoch": 1.5455840455840457, + "grad_norm": 0.696441650390625, + "learning_rate": 0.00013511089672255824, + "loss": 1.0499, + "step": 8681 + }, + { + "epoch": 1.5457621082621082, + "grad_norm": 0.6309961080551147, + "learning_rate": 0.00013509779006105964, + "loss": 0.8759, + "step": 8682 + }, + { + "epoch": 1.5459401709401708, + "grad_norm": 0.6155984401702881, + "learning_rate": 0.00013508468271189277, + "loss": 0.8967, + "step": 8683 + }, + { + "epoch": 
1.5461182336182335, + "grad_norm": 0.6786884665489197, + "learning_rate": 0.00013507157467531442, + "loss": 1.0806, + "step": 8684 + }, + { + "epoch": 1.5462962962962963, + "grad_norm": 0.6494075059890747, + "learning_rate": 0.00013505846595158138, + "loss": 1.0196, + "step": 8685 + }, + { + "epoch": 1.546474358974359, + "grad_norm": 0.7599824070930481, + "learning_rate": 0.00013504535654095055, + "loss": 0.8662, + "step": 8686 + }, + { + "epoch": 1.5466524216524218, + "grad_norm": 0.6017210483551025, + "learning_rate": 0.00013503224644367877, + "loss": 0.872, + "step": 8687 + }, + { + "epoch": 1.5468304843304843, + "grad_norm": 0.7972410321235657, + "learning_rate": 0.00013501913566002288, + "loss": 1.0958, + "step": 8688 + }, + { + "epoch": 1.547008547008547, + "grad_norm": 0.7572960257530212, + "learning_rate": 0.00013500602419023978, + "loss": 1.0219, + "step": 8689 + }, + { + "epoch": 1.5471866096866096, + "grad_norm": 0.6329224109649658, + "learning_rate": 0.00013499291203458635, + "loss": 0.8636, + "step": 8690 + }, + { + "epoch": 1.5473646723646723, + "grad_norm": 0.6777113080024719, + "learning_rate": 0.0001349797991933195, + "loss": 1.0297, + "step": 8691 + }, + { + "epoch": 1.547542735042735, + "grad_norm": 0.6449527144432068, + "learning_rate": 0.00013496668566669617, + "loss": 1.0296, + "step": 8692 + }, + { + "epoch": 1.5477207977207978, + "grad_norm": 0.8236973881721497, + "learning_rate": 0.00013495357145497326, + "loss": 0.8569, + "step": 8693 + }, + { + "epoch": 1.5478988603988604, + "grad_norm": 0.6753743290901184, + "learning_rate": 0.0001349404565584077, + "loss": 1.0733, + "step": 8694 + }, + { + "epoch": 1.5480769230769231, + "grad_norm": 0.6642967462539673, + "learning_rate": 0.0001349273409772565, + "loss": 0.9437, + "step": 8695 + }, + { + "epoch": 1.5482549857549857, + "grad_norm": 0.6470823884010315, + "learning_rate": 0.00013491422471177661, + "loss": 0.999, + "step": 8696 + }, + { + "epoch": 1.5484330484330484, + "grad_norm": 
0.7287036776542664, + "learning_rate": 0.000134901107762225, + "loss": 0.9396, + "step": 8697 + }, + { + "epoch": 1.5486111111111112, + "grad_norm": 0.6258324980735779, + "learning_rate": 0.00013488799012885872, + "loss": 1.045, + "step": 8698 + }, + { + "epoch": 1.548789173789174, + "grad_norm": 0.6540539860725403, + "learning_rate": 0.00013487487181193473, + "loss": 0.9939, + "step": 8699 + }, + { + "epoch": 1.5489672364672364, + "grad_norm": 0.7129563093185425, + "learning_rate": 0.00013486175281171003, + "loss": 1.2079, + "step": 8700 + }, + { + "epoch": 1.5491452991452992, + "grad_norm": 0.6383145451545715, + "learning_rate": 0.00013484863312844173, + "loss": 0.9999, + "step": 8701 + }, + { + "epoch": 1.5493233618233617, + "grad_norm": 0.6310200691223145, + "learning_rate": 0.0001348355127623869, + "loss": 1.1193, + "step": 8702 + }, + { + "epoch": 1.5495014245014245, + "grad_norm": 0.6370054483413696, + "learning_rate": 0.0001348223917138025, + "loss": 1.0213, + "step": 8703 + }, + { + "epoch": 1.5496794871794872, + "grad_norm": 0.7052688598632812, + "learning_rate": 0.00013480926998294573, + "loss": 0.8773, + "step": 8704 + }, + { + "epoch": 1.54985754985755, + "grad_norm": 0.6369579434394836, + "learning_rate": 0.00013479614757007355, + "loss": 1.0072, + "step": 8705 + }, + { + "epoch": 1.5500356125356125, + "grad_norm": 0.7152075171470642, + "learning_rate": 0.0001347830244754432, + "loss": 1.0409, + "step": 8706 + }, + { + "epoch": 1.5502136752136753, + "grad_norm": 0.654183566570282, + "learning_rate": 0.00013476990069931173, + "loss": 0.9363, + "step": 8707 + }, + { + "epoch": 1.5503917378917378, + "grad_norm": 0.6700537204742432, + "learning_rate": 0.00013475677624193627, + "loss": 0.985, + "step": 8708 + }, + { + "epoch": 1.5505698005698005, + "grad_norm": 0.7195445895195007, + "learning_rate": 0.00013474365110357402, + "loss": 0.988, + "step": 8709 + }, + { + "epoch": 1.5507478632478633, + "grad_norm": 0.6019890904426575, + "learning_rate": 
0.00013473052528448201, + "loss": 0.9915, + "step": 8710 + }, + { + "epoch": 1.550925925925926, + "grad_norm": 0.7787565588951111, + "learning_rate": 0.0001347173987849176, + "loss": 0.9676, + "step": 8711 + }, + { + "epoch": 1.5511039886039886, + "grad_norm": 0.6997103691101074, + "learning_rate": 0.00013470427160513782, + "loss": 1.1158, + "step": 8712 + }, + { + "epoch": 1.5512820512820513, + "grad_norm": 0.6259464025497437, + "learning_rate": 0.00013469114374539998, + "loss": 0.8784, + "step": 8713 + }, + { + "epoch": 1.5514601139601139, + "grad_norm": 0.6159056425094604, + "learning_rate": 0.00013467801520596122, + "loss": 0.9184, + "step": 8714 + }, + { + "epoch": 1.5516381766381766, + "grad_norm": 0.6823606491088867, + "learning_rate": 0.00013466488598707876, + "loss": 0.9542, + "step": 8715 + }, + { + "epoch": 1.5518162393162394, + "grad_norm": 0.6781585812568665, + "learning_rate": 0.0001346517560890099, + "loss": 1.1761, + "step": 8716 + }, + { + "epoch": 1.551994301994302, + "grad_norm": 0.6313831806182861, + "learning_rate": 0.00013463862551201184, + "loss": 0.8935, + "step": 8717 + }, + { + "epoch": 1.5521723646723646, + "grad_norm": 0.7466186881065369, + "learning_rate": 0.0001346254942563419, + "loss": 1.0583, + "step": 8718 + }, + { + "epoch": 1.5523504273504274, + "grad_norm": 0.7073680758476257, + "learning_rate": 0.0001346123623222573, + "loss": 0.9863, + "step": 8719 + }, + { + "epoch": 1.55252849002849, + "grad_norm": 0.6286870241165161, + "learning_rate": 0.00013459922971001536, + "loss": 0.9921, + "step": 8720 + }, + { + "epoch": 1.5527065527065527, + "grad_norm": 0.6047035455703735, + "learning_rate": 0.0001345860964198734, + "loss": 0.9155, + "step": 8721 + }, + { + "epoch": 1.5528846153846154, + "grad_norm": 0.5909964442253113, + "learning_rate": 0.00013457296245208874, + "loss": 0.9593, + "step": 8722 + }, + { + "epoch": 1.5530626780626782, + "grad_norm": 0.7838597893714905, + "learning_rate": 0.00013455982780691869, + "loss": 0.8872, + 
"step": 8723 + }, + { + "epoch": 1.5532407407407407, + "grad_norm": 0.6914706230163574, + "learning_rate": 0.00013454669248462063, + "loss": 0.9104, + "step": 8724 + }, + { + "epoch": 1.5534188034188035, + "grad_norm": 0.6777952909469604, + "learning_rate": 0.00013453355648545182, + "loss": 0.9839, + "step": 8725 + }, + { + "epoch": 1.553596866096866, + "grad_norm": 0.7482799291610718, + "learning_rate": 0.00013452041980966978, + "loss": 1.1164, + "step": 8726 + }, + { + "epoch": 1.5537749287749287, + "grad_norm": 0.6616327166557312, + "learning_rate": 0.0001345072824575318, + "loss": 0.9574, + "step": 8727 + }, + { + "epoch": 1.5539529914529915, + "grad_norm": 0.7193203568458557, + "learning_rate": 0.00013449414442929532, + "loss": 1.0609, + "step": 8728 + }, + { + "epoch": 1.5541310541310542, + "grad_norm": 0.6599446535110474, + "learning_rate": 0.0001344810057252177, + "loss": 0.9574, + "step": 8729 + }, + { + "epoch": 1.5543091168091168, + "grad_norm": 0.7221707105636597, + "learning_rate": 0.00013446786634555642, + "loss": 0.9819, + "step": 8730 + }, + { + "epoch": 1.5544871794871795, + "grad_norm": 0.6531312465667725, + "learning_rate": 0.0001344547262905689, + "loss": 0.9986, + "step": 8731 + }, + { + "epoch": 1.554665242165242, + "grad_norm": 0.6879804730415344, + "learning_rate": 0.0001344415855605126, + "loss": 1.1078, + "step": 8732 + }, + { + "epoch": 1.5548433048433048, + "grad_norm": 0.708907425403595, + "learning_rate": 0.00013442844415564498, + "loss": 1.0221, + "step": 8733 + }, + { + "epoch": 1.5550213675213675, + "grad_norm": 0.7957375645637512, + "learning_rate": 0.0001344153020762235, + "loss": 1.3101, + "step": 8734 + }, + { + "epoch": 1.5551994301994303, + "grad_norm": 0.7068197727203369, + "learning_rate": 0.00013440215932250567, + "loss": 0.8995, + "step": 8735 + }, + { + "epoch": 1.5553774928774928, + "grad_norm": 0.6455841064453125, + "learning_rate": 0.00013438901589474898, + "loss": 0.7244, + "step": 8736 + }, + { + "epoch": 
1.5555555555555556, + "grad_norm": 0.7500516772270203, + "learning_rate": 0.00013437587179321097, + "loss": 1.0161, + "step": 8737 + }, + { + "epoch": 1.555733618233618, + "grad_norm": 0.5983143448829651, + "learning_rate": 0.00013436272701814917, + "loss": 0.9922, + "step": 8738 + }, + { + "epoch": 1.5559116809116809, + "grad_norm": 0.8761729598045349, + "learning_rate": 0.0001343495815698211, + "loss": 1.022, + "step": 8739 + }, + { + "epoch": 1.5560897435897436, + "grad_norm": 0.6901857852935791, + "learning_rate": 0.00013433643544848438, + "loss": 1.0668, + "step": 8740 + }, + { + "epoch": 1.5562678062678064, + "grad_norm": 0.6770836114883423, + "learning_rate": 0.00013432328865439647, + "loss": 0.9516, + "step": 8741 + }, + { + "epoch": 1.556445868945869, + "grad_norm": 0.6138805150985718, + "learning_rate": 0.00013431014118781505, + "loss": 0.8682, + "step": 8742 + }, + { + "epoch": 1.5566239316239316, + "grad_norm": 0.6796693801879883, + "learning_rate": 0.00013429699304899772, + "loss": 1.1132, + "step": 8743 + }, + { + "epoch": 1.5568019943019942, + "grad_norm": 0.6626394987106323, + "learning_rate": 0.000134283844238202, + "loss": 0.9273, + "step": 8744 + }, + { + "epoch": 1.556980056980057, + "grad_norm": 0.7088519334793091, + "learning_rate": 0.00013427069475568563, + "loss": 0.8915, + "step": 8745 + }, + { + "epoch": 1.5571581196581197, + "grad_norm": 0.6244857311248779, + "learning_rate": 0.0001342575446017061, + "loss": 0.9466, + "step": 8746 + }, + { + "epoch": 1.5573361823361824, + "grad_norm": 0.6969038248062134, + "learning_rate": 0.00013424439377652123, + "loss": 1.2307, + "step": 8747 + }, + { + "epoch": 1.5575142450142452, + "grad_norm": 0.6636740565299988, + "learning_rate": 0.0001342312422803886, + "loss": 0.9456, + "step": 8748 + }, + { + "epoch": 1.5576923076923077, + "grad_norm": 0.7863389253616333, + "learning_rate": 0.00013421809011356586, + "loss": 1.1888, + "step": 8749 + }, + { + "epoch": 1.5578703703703702, + "grad_norm": 
0.7504058480262756, + "learning_rate": 0.00013420493727631073, + "loss": 1.2602, + "step": 8750 + }, + { + "epoch": 1.558048433048433, + "grad_norm": 0.7173139452934265, + "learning_rate": 0.00013419178376888085, + "loss": 1.0726, + "step": 8751 + }, + { + "epoch": 1.5582264957264957, + "grad_norm": 0.6517474055290222, + "learning_rate": 0.00013417862959153406, + "loss": 1.1299, + "step": 8752 + }, + { + "epoch": 1.5584045584045585, + "grad_norm": 0.8911739587783813, + "learning_rate": 0.00013416547474452803, + "loss": 1.105, + "step": 8753 + }, + { + "epoch": 1.5585826210826212, + "grad_norm": 0.7116649150848389, + "learning_rate": 0.00013415231922812049, + "loss": 0.8037, + "step": 8754 + }, + { + "epoch": 1.5587606837606838, + "grad_norm": 0.6935904026031494, + "learning_rate": 0.00013413916304256916, + "loss": 1.2778, + "step": 8755 + }, + { + "epoch": 1.5589387464387463, + "grad_norm": 0.652763843536377, + "learning_rate": 0.00013412600618813186, + "loss": 0.9188, + "step": 8756 + }, + { + "epoch": 1.559116809116809, + "grad_norm": 0.6545276641845703, + "learning_rate": 0.00013411284866506637, + "loss": 1.0116, + "step": 8757 + }, + { + "epoch": 1.5592948717948718, + "grad_norm": 0.632165253162384, + "learning_rate": 0.0001340996904736305, + "loss": 0.8538, + "step": 8758 + }, + { + "epoch": 1.5594729344729346, + "grad_norm": 0.6719664931297302, + "learning_rate": 0.000134086531614082, + "loss": 1.1877, + "step": 8759 + }, + { + "epoch": 1.5596509971509973, + "grad_norm": 0.6691158413887024, + "learning_rate": 0.00013407337208667873, + "loss": 1.0411, + "step": 8760 + }, + { + "epoch": 1.5598290598290598, + "grad_norm": 0.7711479067802429, + "learning_rate": 0.0001340602118916785, + "loss": 0.9995, + "step": 8761 + }, + { + "epoch": 1.5600071225071224, + "grad_norm": 0.7229881286621094, + "learning_rate": 0.0001340470510293392, + "loss": 1.1751, + "step": 8762 + }, + { + "epoch": 1.5601851851851851, + "grad_norm": 0.7183271646499634, + "learning_rate": 
0.00013403388949991864, + "loss": 0.9371, + "step": 8763 + }, + { + "epoch": 1.5603632478632479, + "grad_norm": 0.8142383098602295, + "learning_rate": 0.00013402072730367475, + "loss": 1.0199, + "step": 8764 + }, + { + "epoch": 1.5605413105413106, + "grad_norm": 0.6349362134933472, + "learning_rate": 0.00013400756444086534, + "loss": 0.8453, + "step": 8765 + }, + { + "epoch": 1.5607193732193734, + "grad_norm": 0.651900589466095, + "learning_rate": 0.00013399440091174834, + "loss": 0.8952, + "step": 8766 + }, + { + "epoch": 1.560897435897436, + "grad_norm": 0.6873346567153931, + "learning_rate": 0.00013398123671658172, + "loss": 0.9438, + "step": 8767 + }, + { + "epoch": 1.5610754985754984, + "grad_norm": 0.7404754757881165, + "learning_rate": 0.00013396807185562333, + "loss": 1.123, + "step": 8768 + }, + { + "epoch": 1.5612535612535612, + "grad_norm": 0.7449641227722168, + "learning_rate": 0.00013395490632913111, + "loss": 0.9407, + "step": 8769 + }, + { + "epoch": 1.561431623931624, + "grad_norm": 0.7393384575843811, + "learning_rate": 0.0001339417401373631, + "loss": 1.0209, + "step": 8770 + }, + { + "epoch": 1.5616096866096867, + "grad_norm": 0.6787426471710205, + "learning_rate": 0.00013392857328057713, + "loss": 0.9768, + "step": 8771 + }, + { + "epoch": 1.5617877492877494, + "grad_norm": 0.6295693516731262, + "learning_rate": 0.00013391540575903127, + "loss": 0.9011, + "step": 8772 + }, + { + "epoch": 1.561965811965812, + "grad_norm": 0.7114503979682922, + "learning_rate": 0.00013390223757298354, + "loss": 1.0696, + "step": 8773 + }, + { + "epoch": 1.5621438746438745, + "grad_norm": 0.7540110349655151, + "learning_rate": 0.00013388906872269184, + "loss": 1.0071, + "step": 8774 + }, + { + "epoch": 1.5623219373219372, + "grad_norm": 0.6472305059432983, + "learning_rate": 0.00013387589920841423, + "loss": 1.105, + "step": 8775 + }, + { + "epoch": 1.5625, + "grad_norm": 0.6936793327331543, + "learning_rate": 0.00013386272903040874, + "loss": 0.885, + "step": 8776 
+ }, + { + "epoch": 1.5626780626780628, + "grad_norm": 0.7487989068031311, + "learning_rate": 0.00013384955818893343, + "loss": 0.7842, + "step": 8777 + }, + { + "epoch": 1.5628561253561255, + "grad_norm": 0.6109505891799927, + "learning_rate": 0.00013383638668424633, + "loss": 0.9461, + "step": 8778 + }, + { + "epoch": 1.563034188034188, + "grad_norm": 0.6650055646896362, + "learning_rate": 0.00013382321451660558, + "loss": 1.0463, + "step": 8779 + }, + { + "epoch": 1.5632122507122506, + "grad_norm": 0.7147329449653625, + "learning_rate": 0.00013381004168626915, + "loss": 0.946, + "step": 8780 + }, + { + "epoch": 1.5633903133903133, + "grad_norm": 0.6919382810592651, + "learning_rate": 0.00013379686819349522, + "loss": 0.8946, + "step": 8781 + }, + { + "epoch": 1.563568376068376, + "grad_norm": 0.7339401245117188, + "learning_rate": 0.00013378369403854184, + "loss": 0.9625, + "step": 8782 + }, + { + "epoch": 1.5637464387464388, + "grad_norm": 0.6337129473686218, + "learning_rate": 0.00013377051922166717, + "loss": 1.0854, + "step": 8783 + }, + { + "epoch": 1.5639245014245016, + "grad_norm": 0.7301266193389893, + "learning_rate": 0.0001337573437431293, + "loss": 1.017, + "step": 8784 + }, + { + "epoch": 1.564102564102564, + "grad_norm": 0.689540684223175, + "learning_rate": 0.00013374416760318644, + "loss": 0.8734, + "step": 8785 + }, + { + "epoch": 1.5642806267806266, + "grad_norm": 0.7121307849884033, + "learning_rate": 0.0001337309908020967, + "loss": 1.0827, + "step": 8786 + }, + { + "epoch": 1.5644586894586894, + "grad_norm": 0.6715386509895325, + "learning_rate": 0.00013371781334011826, + "loss": 0.946, + "step": 8787 + }, + { + "epoch": 1.5646367521367521, + "grad_norm": 0.6895501613616943, + "learning_rate": 0.00013370463521750932, + "loss": 1.1113, + "step": 8788 + }, + { + "epoch": 1.5648148148148149, + "grad_norm": 0.6592531204223633, + "learning_rate": 0.00013369145643452805, + "loss": 0.9952, + "step": 8789 + }, + { + "epoch": 1.5649928774928776, + 
"grad_norm": 0.7495190501213074, + "learning_rate": 0.0001336782769914327, + "loss": 1.0936, + "step": 8790 + }, + { + "epoch": 1.5651709401709402, + "grad_norm": 0.7273977398872375, + "learning_rate": 0.00013366509688848147, + "loss": 1.1749, + "step": 8791 + }, + { + "epoch": 1.5653490028490027, + "grad_norm": 0.6447354555130005, + "learning_rate": 0.0001336519161259326, + "loss": 0.8638, + "step": 8792 + }, + { + "epoch": 1.5655270655270654, + "grad_norm": 0.6572020053863525, + "learning_rate": 0.00013363873470404432, + "loss": 0.8005, + "step": 8793 + }, + { + "epoch": 1.5657051282051282, + "grad_norm": 0.676418662071228, + "learning_rate": 0.00013362555262307491, + "loss": 0.7651, + "step": 8794 + }, + { + "epoch": 1.565883190883191, + "grad_norm": 0.6886745095252991, + "learning_rate": 0.0001336123698832827, + "loss": 1.0765, + "step": 8795 + }, + { + "epoch": 1.5660612535612537, + "grad_norm": 0.8134182095527649, + "learning_rate": 0.00013359918648492584, + "loss": 1.2228, + "step": 8796 + }, + { + "epoch": 1.5662393162393162, + "grad_norm": 0.7210384011268616, + "learning_rate": 0.00013358600242826277, + "loss": 0.8247, + "step": 8797 + }, + { + "epoch": 1.5664173789173788, + "grad_norm": 0.7086136341094971, + "learning_rate": 0.00013357281771355175, + "loss": 1.0323, + "step": 8798 + }, + { + "epoch": 1.5665954415954415, + "grad_norm": 0.7419785857200623, + "learning_rate": 0.0001335596323410511, + "loss": 1.213, + "step": 8799 + }, + { + "epoch": 1.5667735042735043, + "grad_norm": 0.6390291452407837, + "learning_rate": 0.0001335464463110192, + "loss": 1.0403, + "step": 8800 + }, + { + "epoch": 1.566951566951567, + "grad_norm": 0.6111941337585449, + "learning_rate": 0.00013353325962371434, + "loss": 0.9747, + "step": 8801 + }, + { + "epoch": 1.5671296296296298, + "grad_norm": 0.6792671084403992, + "learning_rate": 0.00013352007227939488, + "loss": 1.1179, + "step": 8802 + }, + { + "epoch": 1.5673076923076923, + "grad_norm": 0.6656535863876343, + 
"learning_rate": 0.0001335068842783193, + "loss": 0.9214, + "step": 8803 + }, + { + "epoch": 1.5674857549857548, + "grad_norm": 0.6910907626152039, + "learning_rate": 0.0001334936956207459, + "loss": 1.0609, + "step": 8804 + }, + { + "epoch": 1.5676638176638176, + "grad_norm": 0.65049147605896, + "learning_rate": 0.00013348050630693315, + "loss": 0.7189, + "step": 8805 + }, + { + "epoch": 1.5678418803418803, + "grad_norm": 0.6258065104484558, + "learning_rate": 0.0001334673163371394, + "loss": 1.0683, + "step": 8806 + }, + { + "epoch": 1.568019943019943, + "grad_norm": 0.7518934607505798, + "learning_rate": 0.00013345412571162305, + "loss": 1.2415, + "step": 8807 + }, + { + "epoch": 1.5681980056980058, + "grad_norm": 0.7395275235176086, + "learning_rate": 0.00013344093443064267, + "loss": 0.9153, + "step": 8808 + }, + { + "epoch": 1.5683760683760684, + "grad_norm": 0.6789839267730713, + "learning_rate": 0.00013342774249445663, + "loss": 0.8051, + "step": 8809 + }, + { + "epoch": 1.568554131054131, + "grad_norm": 0.786247193813324, + "learning_rate": 0.00013341454990332342, + "loss": 1.203, + "step": 8810 + }, + { + "epoch": 1.5687321937321936, + "grad_norm": 0.6858161687850952, + "learning_rate": 0.00013340135665750153, + "loss": 0.9494, + "step": 8811 + }, + { + "epoch": 1.5689102564102564, + "grad_norm": 0.7245797514915466, + "learning_rate": 0.0001333881627572494, + "loss": 1.0544, + "step": 8812 + }, + { + "epoch": 1.5690883190883191, + "grad_norm": 0.6176164150238037, + "learning_rate": 0.00013337496820282563, + "loss": 0.9084, + "step": 8813 + }, + { + "epoch": 1.569266381766382, + "grad_norm": 0.7342953681945801, + "learning_rate": 0.00013336177299448868, + "loss": 1.0006, + "step": 8814 + }, + { + "epoch": 1.5694444444444444, + "grad_norm": 0.5183523297309875, + "learning_rate": 0.00013334857713249708, + "loss": 0.6295, + "step": 8815 + }, + { + "epoch": 1.5696225071225072, + "grad_norm": 0.6664513349533081, + "learning_rate": 0.00013333538061710936, + 
"loss": 0.7569, + "step": 8816 + }, + { + "epoch": 1.5698005698005697, + "grad_norm": 0.7051160931587219, + "learning_rate": 0.0001333221834485841, + "loss": 0.9917, + "step": 8817 + }, + { + "epoch": 1.5699786324786325, + "grad_norm": 0.7888057231903076, + "learning_rate": 0.0001333089856271799, + "loss": 1.0337, + "step": 8818 + }, + { + "epoch": 1.5701566951566952, + "grad_norm": 0.6796144247055054, + "learning_rate": 0.00013329578715315534, + "loss": 1.0915, + "step": 8819 + }, + { + "epoch": 1.570334757834758, + "grad_norm": 0.7442883849143982, + "learning_rate": 0.000133282588026769, + "loss": 1.1695, + "step": 8820 + }, + { + "epoch": 1.5705128205128205, + "grad_norm": 0.6164735555648804, + "learning_rate": 0.00013326938824827946, + "loss": 1.0143, + "step": 8821 + }, + { + "epoch": 1.5706908831908832, + "grad_norm": 0.6526502966880798, + "learning_rate": 0.00013325618781794539, + "loss": 0.8402, + "step": 8822 + }, + { + "epoch": 1.5708689458689458, + "grad_norm": 0.6376087069511414, + "learning_rate": 0.00013324298673602535, + "loss": 0.7582, + "step": 8823 + }, + { + "epoch": 1.5710470085470085, + "grad_norm": 0.6888708472251892, + "learning_rate": 0.00013322978500277807, + "loss": 0.997, + "step": 8824 + }, + { + "epoch": 1.5712250712250713, + "grad_norm": 0.553656280040741, + "learning_rate": 0.0001332165826184622, + "loss": 0.6917, + "step": 8825 + }, + { + "epoch": 1.571403133903134, + "grad_norm": 0.643285870552063, + "learning_rate": 0.0001332033795833364, + "loss": 0.8689, + "step": 8826 + }, + { + "epoch": 1.5715811965811965, + "grad_norm": 0.6210280060768127, + "learning_rate": 0.00013319017589765933, + "loss": 0.9047, + "step": 8827 + }, + { + "epoch": 1.5717592592592593, + "grad_norm": 0.7612366676330566, + "learning_rate": 0.0001331769715616897, + "loss": 0.9818, + "step": 8828 + }, + { + "epoch": 1.5719373219373218, + "grad_norm": 0.5970702171325684, + "learning_rate": 0.00013316376657568628, + "loss": 0.82, + "step": 8829 + }, + { + "epoch": 
1.5721153846153846, + "grad_norm": 0.7182583808898926, + "learning_rate": 0.0001331505609399077, + "loss": 1.0633, + "step": 8830 + }, + { + "epoch": 1.5722934472934473, + "grad_norm": 0.7230739593505859, + "learning_rate": 0.00013313735465461278, + "loss": 0.977, + "step": 8831 + }, + { + "epoch": 1.57247150997151, + "grad_norm": 0.6752985119819641, + "learning_rate": 0.00013312414772006018, + "loss": 0.9666, + "step": 8832 + }, + { + "epoch": 1.5726495726495726, + "grad_norm": 0.7724275588989258, + "learning_rate": 0.00013311094013650877, + "loss": 1.148, + "step": 8833 + }, + { + "epoch": 1.5728276353276354, + "grad_norm": 0.7216386198997498, + "learning_rate": 0.00013309773190421724, + "loss": 0.9935, + "step": 8834 + }, + { + "epoch": 1.573005698005698, + "grad_norm": 0.6422320008277893, + "learning_rate": 0.0001330845230234444, + "loss": 0.9383, + "step": 8835 + }, + { + "epoch": 1.5731837606837606, + "grad_norm": 0.669538140296936, + "learning_rate": 0.00013307131349444906, + "loss": 1.0866, + "step": 8836 + }, + { + "epoch": 1.5733618233618234, + "grad_norm": 0.6994584798812866, + "learning_rate": 0.00013305810331749003, + "loss": 0.7882, + "step": 8837 + }, + { + "epoch": 1.5735398860398861, + "grad_norm": 0.8094269633293152, + "learning_rate": 0.00013304489249282617, + "loss": 1.2316, + "step": 8838 + }, + { + "epoch": 1.5737179487179487, + "grad_norm": 0.7180120348930359, + "learning_rate": 0.00013303168102071625, + "loss": 0.9795, + "step": 8839 + }, + { + "epoch": 1.5738960113960114, + "grad_norm": 0.6191438436508179, + "learning_rate": 0.00013301846890141918, + "loss": 0.8957, + "step": 8840 + }, + { + "epoch": 1.574074074074074, + "grad_norm": 0.671094536781311, + "learning_rate": 0.00013300525613519382, + "loss": 1.059, + "step": 8841 + }, + { + "epoch": 1.5742521367521367, + "grad_norm": 0.8062624931335449, + "learning_rate": 0.000132992042722299, + "loss": 0.9782, + "step": 8842 + }, + { + "epoch": 1.5744301994301995, + "grad_norm": 
0.6674807667732239, + "learning_rate": 0.00013297882866299362, + "loss": 0.7765, + "step": 8843 + }, + { + "epoch": 1.5746082621082622, + "grad_norm": 0.6369131803512573, + "learning_rate": 0.00013296561395753664, + "loss": 0.97, + "step": 8844 + }, + { + "epoch": 1.5747863247863247, + "grad_norm": 0.7913636565208435, + "learning_rate": 0.00013295239860618691, + "loss": 1.0458, + "step": 8845 + }, + { + "epoch": 1.5749643874643875, + "grad_norm": 0.6722261905670166, + "learning_rate": 0.0001329391826092034, + "loss": 1.1118, + "step": 8846 + }, + { + "epoch": 1.57514245014245, + "grad_norm": 0.6936299800872803, + "learning_rate": 0.00013292596596684502, + "loss": 1.009, + "step": 8847 + }, + { + "epoch": 1.5753205128205128, + "grad_norm": 0.7009961009025574, + "learning_rate": 0.00013291274867937073, + "loss": 0.9904, + "step": 8848 + }, + { + "epoch": 1.5754985754985755, + "grad_norm": 0.6900732517242432, + "learning_rate": 0.0001328995307470395, + "loss": 1.0488, + "step": 8849 + }, + { + "epoch": 1.5756766381766383, + "grad_norm": 0.6389018297195435, + "learning_rate": 0.00013288631217011032, + "loss": 0.9444, + "step": 8850 + }, + { + "epoch": 1.5758547008547008, + "grad_norm": 0.6370900869369507, + "learning_rate": 0.00013287309294884216, + "loss": 0.7465, + "step": 8851 + }, + { + "epoch": 1.5760327635327636, + "grad_norm": 0.6463848948478699, + "learning_rate": 0.00013285987308349405, + "loss": 0.896, + "step": 8852 + }, + { + "epoch": 1.576210826210826, + "grad_norm": 0.6022449731826782, + "learning_rate": 0.00013284665257432495, + "loss": 0.8822, + "step": 8853 + }, + { + "epoch": 1.5763888888888888, + "grad_norm": 0.768189013004303, + "learning_rate": 0.00013283343142159396, + "loss": 0.9862, + "step": 8854 + }, + { + "epoch": 1.5765669515669516, + "grad_norm": 0.6642358303070068, + "learning_rate": 0.00013282020962556007, + "loss": 1.0713, + "step": 8855 + }, + { + "epoch": 1.5767450142450143, + "grad_norm": 0.6883034706115723, + "learning_rate": 
0.00013280698718648234, + "loss": 1.0351, + "step": 8856 + }, + { + "epoch": 1.5769230769230769, + "grad_norm": 0.602808952331543, + "learning_rate": 0.00013279376410461988, + "loss": 0.7615, + "step": 8857 + }, + { + "epoch": 1.5771011396011396, + "grad_norm": 0.5968614220619202, + "learning_rate": 0.0001327805403802317, + "loss": 0.9443, + "step": 8858 + }, + { + "epoch": 1.5772792022792022, + "grad_norm": 0.7314837574958801, + "learning_rate": 0.00013276731601357696, + "loss": 0.8784, + "step": 8859 + }, + { + "epoch": 1.577457264957265, + "grad_norm": 0.619754433631897, + "learning_rate": 0.0001327540910049147, + "loss": 0.954, + "step": 8860 + }, + { + "epoch": 1.5776353276353277, + "grad_norm": 0.7195139527320862, + "learning_rate": 0.0001327408653545041, + "loss": 1.0227, + "step": 8861 + }, + { + "epoch": 1.5778133903133904, + "grad_norm": 0.6796214580535889, + "learning_rate": 0.0001327276390626042, + "loss": 1.0593, + "step": 8862 + }, + { + "epoch": 1.577991452991453, + "grad_norm": 0.6576255559921265, + "learning_rate": 0.00013271441212947427, + "loss": 0.7921, + "step": 8863 + }, + { + "epoch": 1.5781695156695157, + "grad_norm": 0.7222092151641846, + "learning_rate": 0.00013270118455537336, + "loss": 1.0545, + "step": 8864 + }, + { + "epoch": 1.5783475783475782, + "grad_norm": 0.7159737348556519, + "learning_rate": 0.00013268795634056066, + "loss": 0.9664, + "step": 8865 + }, + { + "epoch": 1.578525641025641, + "grad_norm": 0.7120481133460999, + "learning_rate": 0.00013267472748529536, + "loss": 1.0148, + "step": 8866 + }, + { + "epoch": 1.5787037037037037, + "grad_norm": 0.7353253364562988, + "learning_rate": 0.00013266149798983666, + "loss": 0.9288, + "step": 8867 + }, + { + "epoch": 1.5788817663817665, + "grad_norm": 0.6652441620826721, + "learning_rate": 0.00013264826785444375, + "loss": 0.8246, + "step": 8868 + }, + { + "epoch": 1.5790598290598292, + "grad_norm": 0.7254189252853394, + "learning_rate": 0.00013263503707937584, + "loss": 0.9892, + 
"step": 8869 + }, + { + "epoch": 1.5792378917378918, + "grad_norm": 0.6305747032165527, + "learning_rate": 0.00013262180566489223, + "loss": 0.8931, + "step": 8870 + }, + { + "epoch": 1.5794159544159543, + "grad_norm": 0.6560617089271545, + "learning_rate": 0.00013260857361125205, + "loss": 0.9245, + "step": 8871 + }, + { + "epoch": 1.579594017094017, + "grad_norm": 0.7304151654243469, + "learning_rate": 0.00013259534091871462, + "loss": 1.009, + "step": 8872 + }, + { + "epoch": 1.5797720797720798, + "grad_norm": 0.782636821269989, + "learning_rate": 0.00013258210758753918, + "loss": 1.1123, + "step": 8873 + }, + { + "epoch": 1.5799501424501425, + "grad_norm": 0.6992011070251465, + "learning_rate": 0.00013256887361798504, + "loss": 1.099, + "step": 8874 + }, + { + "epoch": 1.5801282051282053, + "grad_norm": 0.7159731984138489, + "learning_rate": 0.00013255563901031148, + "loss": 1.0257, + "step": 8875 + }, + { + "epoch": 1.5803062678062678, + "grad_norm": 0.6055454611778259, + "learning_rate": 0.0001325424037647778, + "loss": 0.9199, + "step": 8876 + }, + { + "epoch": 1.5804843304843303, + "grad_norm": 0.6838310360908508, + "learning_rate": 0.00013252916788164334, + "loss": 0.8644, + "step": 8877 + }, + { + "epoch": 1.580662393162393, + "grad_norm": 0.7067445516586304, + "learning_rate": 0.00013251593136116738, + "loss": 1.0285, + "step": 8878 + }, + { + "epoch": 1.5808404558404558, + "grad_norm": 0.7021774649620056, + "learning_rate": 0.00013250269420360928, + "loss": 1.1263, + "step": 8879 + }, + { + "epoch": 1.5810185185185186, + "grad_norm": 0.6586757302284241, + "learning_rate": 0.00013248945640922843, + "loss": 0.906, + "step": 8880 + }, + { + "epoch": 1.5811965811965814, + "grad_norm": 0.6673910021781921, + "learning_rate": 0.00013247621797828418, + "loss": 1.0652, + "step": 8881 + }, + { + "epoch": 1.5813746438746439, + "grad_norm": 0.6763964295387268, + "learning_rate": 0.00013246297891103588, + "loss": 1.0227, + "step": 8882 + }, + { + "epoch": 
1.5815527065527064, + "grad_norm": 0.6536892056465149, + "learning_rate": 0.00013244973920774298, + "loss": 0.9026, + "step": 8883 + }, + { + "epoch": 1.5817307692307692, + "grad_norm": 0.8010411858558655, + "learning_rate": 0.0001324364988686648, + "loss": 1.1167, + "step": 8884 + }, + { + "epoch": 1.581908831908832, + "grad_norm": 0.8159251809120178, + "learning_rate": 0.00013242325789406082, + "loss": 1.233, + "step": 8885 + }, + { + "epoch": 1.5820868945868947, + "grad_norm": 0.6487745046615601, + "learning_rate": 0.00013241001628419048, + "loss": 0.9888, + "step": 8886 + }, + { + "epoch": 1.5822649572649574, + "grad_norm": 0.6750285029411316, + "learning_rate": 0.00013239677403931318, + "loss": 0.8874, + "step": 8887 + }, + { + "epoch": 1.58244301994302, + "grad_norm": 0.7164602875709534, + "learning_rate": 0.0001323835311596884, + "loss": 1.2029, + "step": 8888 + }, + { + "epoch": 1.5826210826210825, + "grad_norm": 0.6081351041793823, + "learning_rate": 0.00013237028764557558, + "loss": 0.9593, + "step": 8889 + }, + { + "epoch": 1.5827991452991452, + "grad_norm": 0.7235409021377563, + "learning_rate": 0.00013235704349723424, + "loss": 1.5324, + "step": 8890 + }, + { + "epoch": 1.582977207977208, + "grad_norm": 0.6658480763435364, + "learning_rate": 0.0001323437987149238, + "loss": 0.9756, + "step": 8891 + }, + { + "epoch": 1.5831552706552707, + "grad_norm": 0.7924265265464783, + "learning_rate": 0.00013233055329890387, + "loss": 0.9329, + "step": 8892 + }, + { + "epoch": 1.5833333333333335, + "grad_norm": 0.6262093186378479, + "learning_rate": 0.0001323173072494339, + "loss": 0.8288, + "step": 8893 + }, + { + "epoch": 1.583511396011396, + "grad_norm": 0.6851989030838013, + "learning_rate": 0.0001323040605667734, + "loss": 0.9822, + "step": 8894 + }, + { + "epoch": 1.5836894586894585, + "grad_norm": 0.6963728666305542, + "learning_rate": 0.00013229081325118194, + "loss": 1.0416, + "step": 8895 + }, + { + "epoch": 1.5838675213675213, + "grad_norm": 
0.6017457842826843, + "learning_rate": 0.0001322775653029191, + "loss": 0.8123, + "step": 8896 + }, + { + "epoch": 1.584045584045584, + "grad_norm": 0.7396472096443176, + "learning_rate": 0.0001322643167222444, + "loss": 1.0339, + "step": 8897 + }, + { + "epoch": 1.5842236467236468, + "grad_norm": 0.6360299587249756, + "learning_rate": 0.00013225106750941744, + "loss": 0.9463, + "step": 8898 + }, + { + "epoch": 1.5844017094017095, + "grad_norm": 0.6297624111175537, + "learning_rate": 0.00013223781766469783, + "loss": 0.9921, + "step": 8899 + }, + { + "epoch": 1.584579772079772, + "grad_norm": 0.7722037434577942, + "learning_rate": 0.0001322245671883451, + "loss": 0.8394, + "step": 8900 + }, + { + "epoch": 1.5847578347578346, + "grad_norm": 0.677364706993103, + "learning_rate": 0.00013221131608061895, + "loss": 1.0954, + "step": 8901 + }, + { + "epoch": 1.5849358974358974, + "grad_norm": 0.6954908967018127, + "learning_rate": 0.00013219806434177899, + "loss": 1.0637, + "step": 8902 + }, + { + "epoch": 1.58511396011396, + "grad_norm": 0.7079192996025085, + "learning_rate": 0.00013218481197208484, + "loss": 1.039, + "step": 8903 + }, + { + "epoch": 1.5852920227920229, + "grad_norm": 0.7070451378822327, + "learning_rate": 0.00013217155897179611, + "loss": 1.0025, + "step": 8904 + }, + { + "epoch": 1.5854700854700856, + "grad_norm": 0.6940776705741882, + "learning_rate": 0.00013215830534117257, + "loss": 0.8039, + "step": 8905 + }, + { + "epoch": 1.5856481481481481, + "grad_norm": 0.6545892953872681, + "learning_rate": 0.00013214505108047382, + "loss": 0.9347, + "step": 8906 + }, + { + "epoch": 1.5858262108262107, + "grad_norm": 0.6769635081291199, + "learning_rate": 0.00013213179618995957, + "loss": 1.0321, + "step": 8907 + }, + { + "epoch": 1.5860042735042734, + "grad_norm": 0.6505448222160339, + "learning_rate": 0.00013211854066988953, + "loss": 1.0558, + "step": 8908 + }, + { + "epoch": 1.5861823361823362, + "grad_norm": 0.6764090061187744, + "learning_rate": 
0.00013210528452052336, + "loss": 0.8407, + "step": 8909 + }, + { + "epoch": 1.586360398860399, + "grad_norm": 0.6454851627349854, + "learning_rate": 0.00013209202774212088, + "loss": 0.7439, + "step": 8910 + }, + { + "epoch": 1.5865384615384617, + "grad_norm": 0.6911695599555969, + "learning_rate": 0.00013207877033494177, + "loss": 0.9625, + "step": 8911 + }, + { + "epoch": 1.5867165242165242, + "grad_norm": 0.7405226826667786, + "learning_rate": 0.0001320655122992458, + "loss": 1.054, + "step": 8912 + }, + { + "epoch": 1.5868945868945867, + "grad_norm": 0.7362869381904602, + "learning_rate": 0.00013205225363529274, + "loss": 1.0516, + "step": 8913 + }, + { + "epoch": 1.5870726495726495, + "grad_norm": 0.6923766136169434, + "learning_rate": 0.0001320389943433423, + "loss": 1.2323, + "step": 8914 + }, + { + "epoch": 1.5872507122507122, + "grad_norm": 0.7980395555496216, + "learning_rate": 0.00013202573442365435, + "loss": 1.0229, + "step": 8915 + }, + { + "epoch": 1.587428774928775, + "grad_norm": 0.7211610078811646, + "learning_rate": 0.00013201247387648868, + "loss": 1.0666, + "step": 8916 + }, + { + "epoch": 1.5876068376068377, + "grad_norm": 0.6728795766830444, + "learning_rate": 0.00013199921270210506, + "loss": 1.0322, + "step": 8917 + }, + { + "epoch": 1.5877849002849003, + "grad_norm": 0.6226436495780945, + "learning_rate": 0.00013198595090076337, + "loss": 1.0517, + "step": 8918 + }, + { + "epoch": 1.5879629629629628, + "grad_norm": 0.6396511197090149, + "learning_rate": 0.0001319726884727234, + "loss": 0.8662, + "step": 8919 + }, + { + "epoch": 1.5881410256410255, + "grad_norm": 0.5664374828338623, + "learning_rate": 0.00013195942541824497, + "loss": 0.6601, + "step": 8920 + }, + { + "epoch": 1.5883190883190883, + "grad_norm": 0.6556946039199829, + "learning_rate": 0.00013194616173758806, + "loss": 0.9662, + "step": 8921 + }, + { + "epoch": 1.588497150997151, + "grad_norm": 0.7332060933113098, + "learning_rate": 0.00013193289743101245, + "loss": 0.7687, + 
"step": 8922 + }, + { + "epoch": 1.5886752136752138, + "grad_norm": 0.6103306412696838, + "learning_rate": 0.00013191963249877805, + "loss": 0.8329, + "step": 8923 + }, + { + "epoch": 1.5888532763532763, + "grad_norm": 0.63165283203125, + "learning_rate": 0.00013190636694114475, + "loss": 0.8336, + "step": 8924 + }, + { + "epoch": 1.589031339031339, + "grad_norm": 0.6955820322036743, + "learning_rate": 0.00013189310075837246, + "loss": 1.0457, + "step": 8925 + }, + { + "epoch": 1.5892094017094016, + "grad_norm": 0.6911605596542358, + "learning_rate": 0.00013187983395072114, + "loss": 0.9389, + "step": 8926 + }, + { + "epoch": 1.5893874643874644, + "grad_norm": 0.6493414640426636, + "learning_rate": 0.00013186656651845068, + "loss": 0.9821, + "step": 8927 + }, + { + "epoch": 1.5895655270655271, + "grad_norm": 0.6168226599693298, + "learning_rate": 0.00013185329846182107, + "loss": 1.0259, + "step": 8928 + }, + { + "epoch": 1.5897435897435899, + "grad_norm": 0.6460188627243042, + "learning_rate": 0.0001318400297810922, + "loss": 0.9836, + "step": 8929 + }, + { + "epoch": 1.5899216524216524, + "grad_norm": 0.6630695462226868, + "learning_rate": 0.0001318267604765241, + "loss": 0.8936, + "step": 8930 + }, + { + "epoch": 1.5900997150997151, + "grad_norm": 0.6308651566505432, + "learning_rate": 0.00013181349054837676, + "loss": 0.9583, + "step": 8931 + }, + { + "epoch": 1.5902777777777777, + "grad_norm": 0.6508499979972839, + "learning_rate": 0.00013180021999691018, + "loss": 0.7647, + "step": 8932 + }, + { + "epoch": 1.5904558404558404, + "grad_norm": 0.6625795960426331, + "learning_rate": 0.00013178694882238432, + "loss": 1.0329, + "step": 8933 + }, + { + "epoch": 1.5906339031339032, + "grad_norm": 0.6721987128257751, + "learning_rate": 0.00013177367702505924, + "loss": 0.9377, + "step": 8934 + }, + { + "epoch": 1.590811965811966, + "grad_norm": 0.7295519709587097, + "learning_rate": 0.00013176040460519497, + "loss": 0.9396, + "step": 8935 + }, + { + "epoch": 
1.5909900284900285, + "grad_norm": 0.6673944592475891, + "learning_rate": 0.0001317471315630515, + "loss": 1.0284, + "step": 8936 + }, + { + "epoch": 1.5911680911680912, + "grad_norm": 0.6858960390090942, + "learning_rate": 0.00013173385789888898, + "loss": 1.2022, + "step": 8937 + }, + { + "epoch": 1.5913461538461537, + "grad_norm": 0.5836796164512634, + "learning_rate": 0.00013172058361296743, + "loss": 1.0078, + "step": 8938 + }, + { + "epoch": 1.5915242165242165, + "grad_norm": 0.7732513546943665, + "learning_rate": 0.00013170730870554694, + "loss": 1.0912, + "step": 8939 + }, + { + "epoch": 1.5917022792022792, + "grad_norm": 0.7095892429351807, + "learning_rate": 0.0001316940331768876, + "loss": 1.0506, + "step": 8940 + }, + { + "epoch": 1.591880341880342, + "grad_norm": 0.757534384727478, + "learning_rate": 0.00013168075702724952, + "loss": 1.036, + "step": 8941 + }, + { + "epoch": 1.5920584045584045, + "grad_norm": 0.6719361543655396, + "learning_rate": 0.00013166748025689282, + "loss": 0.9406, + "step": 8942 + }, + { + "epoch": 1.5922364672364673, + "grad_norm": 0.6955735087394714, + "learning_rate": 0.00013165420286607763, + "loss": 0.9325, + "step": 8943 + }, + { + "epoch": 1.5924145299145298, + "grad_norm": 0.6810322999954224, + "learning_rate": 0.00013164092485506407, + "loss": 1.0402, + "step": 8944 + }, + { + "epoch": 1.5925925925925926, + "grad_norm": 0.6346224546432495, + "learning_rate": 0.00013162764622411233, + "loss": 0.9725, + "step": 8945 + }, + { + "epoch": 1.5927706552706553, + "grad_norm": 0.728705883026123, + "learning_rate": 0.00013161436697348258, + "loss": 0.9665, + "step": 8946 + }, + { + "epoch": 1.592948717948718, + "grad_norm": 0.6838595271110535, + "learning_rate": 0.00013160108710343494, + "loss": 0.9771, + "step": 8947 + }, + { + "epoch": 1.5931267806267806, + "grad_norm": 0.7052602767944336, + "learning_rate": 0.00013158780661422966, + "loss": 0.8819, + "step": 8948 + }, + { + "epoch": 1.5933048433048433, + "grad_norm": 
0.7237630486488342, + "learning_rate": 0.00013157452550612697, + "loss": 1.0609, + "step": 8949 + }, + { + "epoch": 1.5934829059829059, + "grad_norm": 0.6554936766624451, + "learning_rate": 0.00013156124377938699, + "loss": 0.8592, + "step": 8950 + }, + { + "epoch": 1.5936609686609686, + "grad_norm": 0.6125665307044983, + "learning_rate": 0.00013154796143427, + "loss": 0.8399, + "step": 8951 + }, + { + "epoch": 1.5938390313390314, + "grad_norm": 0.6930897235870361, + "learning_rate": 0.0001315346784710363, + "loss": 0.9965, + "step": 8952 + }, + { + "epoch": 1.5940170940170941, + "grad_norm": 0.7808064818382263, + "learning_rate": 0.00013152139488994605, + "loss": 1.0527, + "step": 8953 + }, + { + "epoch": 1.5941951566951567, + "grad_norm": 0.6125522255897522, + "learning_rate": 0.0001315081106912595, + "loss": 1.1159, + "step": 8954 + }, + { + "epoch": 1.5943732193732194, + "grad_norm": 0.5863428711891174, + "learning_rate": 0.00013149482587523703, + "loss": 0.84, + "step": 8955 + }, + { + "epoch": 1.594551282051282, + "grad_norm": 0.7170202732086182, + "learning_rate": 0.00013148154044213882, + "loss": 1.0821, + "step": 8956 + }, + { + "epoch": 1.5947293447293447, + "grad_norm": 0.6409463882446289, + "learning_rate": 0.00013146825439222528, + "loss": 1.0097, + "step": 8957 + }, + { + "epoch": 1.5949074074074074, + "grad_norm": 0.7037690281867981, + "learning_rate": 0.00013145496772575666, + "loss": 1.1511, + "step": 8958 + }, + { + "epoch": 1.5950854700854702, + "grad_norm": 0.6400953531265259, + "learning_rate": 0.00013144168044299326, + "loss": 1.0809, + "step": 8959 + }, + { + "epoch": 1.5952635327635327, + "grad_norm": 0.6129940152168274, + "learning_rate": 0.00013142839254419545, + "loss": 0.8481, + "step": 8960 + }, + { + "epoch": 1.5954415954415955, + "grad_norm": 0.7452271580696106, + "learning_rate": 0.00013141510402962358, + "loss": 1.0649, + "step": 8961 + }, + { + "epoch": 1.595619658119658, + "grad_norm": 0.7407623529434204, + "learning_rate": 
0.000131401814899538, + "loss": 0.9084, + "step": 8962 + }, + { + "epoch": 1.5957977207977208, + "grad_norm": 0.7103050947189331, + "learning_rate": 0.0001313885251541991, + "loss": 0.946, + "step": 8963 + }, + { + "epoch": 1.5959757834757835, + "grad_norm": 0.5566636323928833, + "learning_rate": 0.00013137523479386727, + "loss": 0.6781, + "step": 8964 + }, + { + "epoch": 1.5961538461538463, + "grad_norm": 0.8137457966804504, + "learning_rate": 0.00013136194381880288, + "loss": 0.9273, + "step": 8965 + }, + { + "epoch": 1.5963319088319088, + "grad_norm": 0.779330849647522, + "learning_rate": 0.0001313486522292663, + "loss": 1.1105, + "step": 8966 + }, + { + "epoch": 1.5965099715099715, + "grad_norm": 0.6807126998901367, + "learning_rate": 0.00013133536002551808, + "loss": 1.0728, + "step": 8967 + }, + { + "epoch": 1.596688034188034, + "grad_norm": 0.7371507287025452, + "learning_rate": 0.00013132206720781853, + "loss": 0.979, + "step": 8968 + }, + { + "epoch": 1.5968660968660968, + "grad_norm": 0.6811465620994568, + "learning_rate": 0.00013130877377642814, + "loss": 0.9821, + "step": 8969 + }, + { + "epoch": 1.5970441595441596, + "grad_norm": 0.6732743978500366, + "learning_rate": 0.00013129547973160738, + "loss": 0.8511, + "step": 8970 + }, + { + "epoch": 1.5972222222222223, + "grad_norm": 0.594901978969574, + "learning_rate": 0.0001312821850736167, + "loss": 0.9674, + "step": 8971 + }, + { + "epoch": 1.5974002849002849, + "grad_norm": 0.6743764281272888, + "learning_rate": 0.00013126888980271657, + "loss": 0.9268, + "step": 8972 + }, + { + "epoch": 1.5975783475783476, + "grad_norm": 0.7532161474227905, + "learning_rate": 0.00013125559391916752, + "loss": 1.0474, + "step": 8973 + }, + { + "epoch": 1.5977564102564101, + "grad_norm": 0.6331499814987183, + "learning_rate": 0.00013124229742323, + "loss": 1.05, + "step": 8974 + }, + { + "epoch": 1.5979344729344729, + "grad_norm": 0.7418690323829651, + "learning_rate": 0.0001312290003151646, + "loss": 0.9475, + "step": 
8975 + }, + { + "epoch": 1.5981125356125356, + "grad_norm": 0.6511179804801941, + "learning_rate": 0.0001312157025952318, + "loss": 0.9206, + "step": 8976 + }, + { + "epoch": 1.5982905982905984, + "grad_norm": 0.6380775570869446, + "learning_rate": 0.00013120240426369215, + "loss": 0.9953, + "step": 8977 + }, + { + "epoch": 1.598468660968661, + "grad_norm": 0.8483675122261047, + "learning_rate": 0.00013118910532080623, + "loss": 0.9454, + "step": 8978 + }, + { + "epoch": 1.5986467236467237, + "grad_norm": 0.6700518727302551, + "learning_rate": 0.00013117580576683455, + "loss": 1.0413, + "step": 8979 + }, + { + "epoch": 1.5988247863247862, + "grad_norm": 0.7750083208084106, + "learning_rate": 0.00013116250560203774, + "loss": 1.1868, + "step": 8980 + }, + { + "epoch": 1.599002849002849, + "grad_norm": 0.7474972009658813, + "learning_rate": 0.00013114920482667635, + "loss": 1.0876, + "step": 8981 + }, + { + "epoch": 1.5991809116809117, + "grad_norm": 0.6920070052146912, + "learning_rate": 0.000131135903441011, + "loss": 1.0787, + "step": 8982 + }, + { + "epoch": 1.5993589743589745, + "grad_norm": 0.7572436928749084, + "learning_rate": 0.00013112260144530232, + "loss": 0.9798, + "step": 8983 + }, + { + "epoch": 1.5995370370370372, + "grad_norm": 0.6983019709587097, + "learning_rate": 0.00013110929883981088, + "loss": 1.1115, + "step": 8984 + }, + { + "epoch": 1.5997150997150997, + "grad_norm": 0.6352120041847229, + "learning_rate": 0.0001310959956247974, + "loss": 0.9962, + "step": 8985 + }, + { + "epoch": 1.5998931623931623, + "grad_norm": 0.596858561038971, + "learning_rate": 0.00013108269180052244, + "loss": 0.8686, + "step": 8986 + }, + { + "epoch": 1.600071225071225, + "grad_norm": 0.6237605214118958, + "learning_rate": 0.00013106938736724672, + "loss": 0.9166, + "step": 8987 + }, + { + "epoch": 1.6002492877492878, + "grad_norm": 0.6818585395812988, + "learning_rate": 0.0001310560823252309, + "loss": 0.9993, + "step": 8988 + }, + { + "epoch": 1.6004273504273505, 
+ "grad_norm": 0.6372287273406982, + "learning_rate": 0.00013104277667473564, + "loss": 0.8589, + "step": 8989 + }, + { + "epoch": 1.6006054131054133, + "grad_norm": 0.6057302355766296, + "learning_rate": 0.0001310294704160217, + "loss": 0.9325, + "step": 8990 + }, + { + "epoch": 1.6007834757834758, + "grad_norm": 0.6999384164810181, + "learning_rate": 0.0001310161635493497, + "loss": 0.8691, + "step": 8991 + }, + { + "epoch": 1.6009615384615383, + "grad_norm": 0.6182113289833069, + "learning_rate": 0.00013100285607498045, + "loss": 1.0271, + "step": 8992 + }, + { + "epoch": 1.601139601139601, + "grad_norm": 0.6681149005889893, + "learning_rate": 0.0001309895479931746, + "loss": 0.989, + "step": 8993 + }, + { + "epoch": 1.6013176638176638, + "grad_norm": 0.6187826991081238, + "learning_rate": 0.00013097623930419293, + "loss": 0.8051, + "step": 8994 + }, + { + "epoch": 1.6014957264957266, + "grad_norm": 0.698793888092041, + "learning_rate": 0.00013096293000829621, + "loss": 1.0762, + "step": 8995 + }, + { + "epoch": 1.6016737891737893, + "grad_norm": 0.693149745464325, + "learning_rate": 0.0001309496201057452, + "loss": 1.0894, + "step": 8996 + }, + { + "epoch": 1.6018518518518519, + "grad_norm": 0.6664052605628967, + "learning_rate": 0.00013093630959680068, + "loss": 0.9835, + "step": 8997 + }, + { + "epoch": 1.6020299145299144, + "grad_norm": 0.6919469833374023, + "learning_rate": 0.0001309229984817234, + "loss": 0.9062, + "step": 8998 + }, + { + "epoch": 1.6022079772079771, + "grad_norm": 0.704781174659729, + "learning_rate": 0.00013090968676077427, + "loss": 0.8582, + "step": 8999 + }, + { + "epoch": 1.60238603988604, + "grad_norm": 0.8055264949798584, + "learning_rate": 0.000130896374434214, + "loss": 0.9813, + "step": 9000 + }, + { + "epoch": 1.6025641025641026, + "grad_norm": 0.6301952004432678, + "learning_rate": 0.00013088306150230348, + "loss": 0.7056, + "step": 9001 + }, + { + "epoch": 1.6027421652421654, + "grad_norm": 0.698544442653656, + 
"learning_rate": 0.00013086974796530347, + "loss": 0.9806, + "step": 9002 + }, + { + "epoch": 1.602920227920228, + "grad_norm": 0.669548511505127, + "learning_rate": 0.00013085643382347491, + "loss": 1.0317, + "step": 9003 + }, + { + "epoch": 1.6030982905982905, + "grad_norm": 0.6404716372489929, + "learning_rate": 0.00013084311907707864, + "loss": 0.8885, + "step": 9004 + }, + { + "epoch": 1.6032763532763532, + "grad_norm": 0.6968616843223572, + "learning_rate": 0.0001308298037263755, + "loss": 1.0665, + "step": 9005 + }, + { + "epoch": 1.603454415954416, + "grad_norm": 0.849311113357544, + "learning_rate": 0.00013081648777162644, + "loss": 1.1404, + "step": 9006 + }, + { + "epoch": 1.6036324786324787, + "grad_norm": 0.6603094935417175, + "learning_rate": 0.00013080317121309223, + "loss": 0.8341, + "step": 9007 + }, + { + "epoch": 1.6038105413105415, + "grad_norm": 0.6777810454368591, + "learning_rate": 0.00013078985405103394, + "loss": 1.044, + "step": 9008 + }, + { + "epoch": 1.603988603988604, + "grad_norm": 0.6783546209335327, + "learning_rate": 0.0001307765362857124, + "loss": 1.042, + "step": 9009 + }, + { + "epoch": 1.6041666666666665, + "grad_norm": 0.7251788377761841, + "learning_rate": 0.00013076321791738858, + "loss": 0.9004, + "step": 9010 + }, + { + "epoch": 1.6043447293447293, + "grad_norm": 0.7885342240333557, + "learning_rate": 0.00013074989894632338, + "loss": 1.1966, + "step": 9011 + }, + { + "epoch": 1.604522792022792, + "grad_norm": 0.7171013355255127, + "learning_rate": 0.0001307365793727778, + "loss": 1.2242, + "step": 9012 + }, + { + "epoch": 1.6047008547008548, + "grad_norm": 0.6027249693870544, + "learning_rate": 0.00013072325919701283, + "loss": 0.917, + "step": 9013 + }, + { + "epoch": 1.6048789173789175, + "grad_norm": 0.5957151055335999, + "learning_rate": 0.00013070993841928936, + "loss": 0.9154, + "step": 9014 + }, + { + "epoch": 1.60505698005698, + "grad_norm": 0.6190659403800964, + "learning_rate": 0.00013069661703986847, + "loss": 
0.7071, + "step": 9015 + }, + { + "epoch": 1.6052350427350426, + "grad_norm": 0.6454868316650391, + "learning_rate": 0.00013068329505901117, + "loss": 0.8381, + "step": 9016 + }, + { + "epoch": 1.6054131054131053, + "grad_norm": 0.6255491375923157, + "learning_rate": 0.00013066997247697837, + "loss": 0.7515, + "step": 9017 + }, + { + "epoch": 1.605591168091168, + "grad_norm": 0.6214072108268738, + "learning_rate": 0.0001306566492940312, + "loss": 1.0101, + "step": 9018 + }, + { + "epoch": 1.6057692307692308, + "grad_norm": 0.7244150638580322, + "learning_rate": 0.0001306433255104307, + "loss": 1.2558, + "step": 9019 + }, + { + "epoch": 1.6059472934472936, + "grad_norm": 0.6162270903587341, + "learning_rate": 0.00013063000112643785, + "loss": 1.1009, + "step": 9020 + }, + { + "epoch": 1.6061253561253561, + "grad_norm": 0.7309414744377136, + "learning_rate": 0.0001306166761423138, + "loss": 1.1973, + "step": 9021 + }, + { + "epoch": 1.6063034188034186, + "grad_norm": 0.7150956392288208, + "learning_rate": 0.00013060335055831957, + "loss": 0.9136, + "step": 9022 + }, + { + "epoch": 1.6064814814814814, + "grad_norm": 0.8187742829322815, + "learning_rate": 0.00013059002437471623, + "loss": 1.0524, + "step": 9023 + }, + { + "epoch": 1.6066595441595442, + "grad_norm": 0.7928692698478699, + "learning_rate": 0.00013057669759176493, + "loss": 1.0249, + "step": 9024 + }, + { + "epoch": 1.606837606837607, + "grad_norm": 0.6929279565811157, + "learning_rate": 0.00013056337020972677, + "loss": 1.1804, + "step": 9025 + }, + { + "epoch": 1.6070156695156697, + "grad_norm": 0.6771654486656189, + "learning_rate": 0.00013055004222886285, + "loss": 1.0284, + "step": 9026 + }, + { + "epoch": 1.6071937321937322, + "grad_norm": 0.6689024567604065, + "learning_rate": 0.0001305367136494343, + "loss": 1.0431, + "step": 9027 + }, + { + "epoch": 1.6073717948717947, + "grad_norm": 0.71135413646698, + "learning_rate": 0.0001305233844717023, + "loss": 0.9692, + "step": 9028 + }, + { + "epoch": 
1.6075498575498575, + "grad_norm": 0.5459749698638916, + "learning_rate": 0.00013051005469592796, + "loss": 0.5643, + "step": 9029 + }, + { + "epoch": 1.6077279202279202, + "grad_norm": 0.7225865125656128, + "learning_rate": 0.00013049672432237253, + "loss": 1.0954, + "step": 9030 + }, + { + "epoch": 1.607905982905983, + "grad_norm": 0.6878093481063843, + "learning_rate": 0.0001304833933512971, + "loss": 0.894, + "step": 9031 + }, + { + "epoch": 1.6080840455840457, + "grad_norm": 0.6967248320579529, + "learning_rate": 0.00013047006178296288, + "loss": 1.0356, + "step": 9032 + }, + { + "epoch": 1.6082621082621082, + "grad_norm": 0.6404993534088135, + "learning_rate": 0.00013045672961763114, + "loss": 0.8528, + "step": 9033 + }, + { + "epoch": 1.6084401709401708, + "grad_norm": 0.5919156074523926, + "learning_rate": 0.000130443396855563, + "loss": 0.7196, + "step": 9034 + }, + { + "epoch": 1.6086182336182335, + "grad_norm": 0.6792302131652832, + "learning_rate": 0.00013043006349701977, + "loss": 0.9519, + "step": 9035 + }, + { + "epoch": 1.6087962962962963, + "grad_norm": 0.6263542175292969, + "learning_rate": 0.00013041672954226268, + "loss": 1.0483, + "step": 9036 + }, + { + "epoch": 1.608974358974359, + "grad_norm": 0.5865579843521118, + "learning_rate": 0.00013040339499155294, + "loss": 0.8794, + "step": 9037 + }, + { + "epoch": 1.6091524216524218, + "grad_norm": 0.8383142948150635, + "learning_rate": 0.00013039005984515181, + "loss": 0.8929, + "step": 9038 + }, + { + "epoch": 1.6093304843304843, + "grad_norm": 0.6438691020011902, + "learning_rate": 0.00013037672410332063, + "loss": 0.9957, + "step": 9039 + }, + { + "epoch": 1.609508547008547, + "grad_norm": 0.74748694896698, + "learning_rate": 0.0001303633877663206, + "loss": 0.9809, + "step": 9040 + }, + { + "epoch": 1.6096866096866096, + "grad_norm": 0.6697205901145935, + "learning_rate": 0.00013035005083441312, + "loss": 0.9556, + "step": 9041 + }, + { + "epoch": 1.6098646723646723, + "grad_norm": 
0.6577828526496887, + "learning_rate": 0.00013033671330785941, + "loss": 0.8956, + "step": 9042 + }, + { + "epoch": 1.610042735042735, + "grad_norm": 0.6423429846763611, + "learning_rate": 0.0001303233751869208, + "loss": 0.8467, + "step": 9043 + }, + { + "epoch": 1.6102207977207978, + "grad_norm": 0.6552175879478455, + "learning_rate": 0.00013031003647185867, + "loss": 0.8656, + "step": 9044 + }, + { + "epoch": 1.6103988603988604, + "grad_norm": 0.6755174398422241, + "learning_rate": 0.00013029669716293433, + "loss": 0.7836, + "step": 9045 + }, + { + "epoch": 1.6105769230769231, + "grad_norm": 0.6832906007766724, + "learning_rate": 0.00013028335726040914, + "loss": 1.1531, + "step": 9046 + }, + { + "epoch": 1.6107549857549857, + "grad_norm": 0.6498637795448303, + "learning_rate": 0.00013027001676454446, + "loss": 0.8637, + "step": 9047 + }, + { + "epoch": 1.6109330484330484, + "grad_norm": 0.6792944073677063, + "learning_rate": 0.0001302566756756017, + "loss": 1.0865, + "step": 9048 + }, + { + "epoch": 1.6111111111111112, + "grad_norm": 0.6801337003707886, + "learning_rate": 0.00013024333399384226, + "loss": 1.0738, + "step": 9049 + }, + { + "epoch": 1.611289173789174, + "grad_norm": 0.675216794013977, + "learning_rate": 0.0001302299917195275, + "loss": 1.1074, + "step": 9050 + }, + { + "epoch": 1.6114672364672364, + "grad_norm": 0.6418983340263367, + "learning_rate": 0.00013021664885291885, + "loss": 1.0025, + "step": 9051 + }, + { + "epoch": 1.6116452991452992, + "grad_norm": 0.7778789401054382, + "learning_rate": 0.0001302033053942777, + "loss": 1.0847, + "step": 9052 + }, + { + "epoch": 1.6118233618233617, + "grad_norm": 0.7672827243804932, + "learning_rate": 0.00013018996134386555, + "loss": 1.0565, + "step": 9053 + }, + { + "epoch": 1.6120014245014245, + "grad_norm": 0.6770617961883545, + "learning_rate": 0.00013017661670194382, + "loss": 0.9069, + "step": 9054 + }, + { + "epoch": 1.6121794871794872, + "grad_norm": 0.7161242961883545, + "learning_rate": 
0.00013016327146877393, + "loss": 1.1301, + "step": 9055 + }, + { + "epoch": 1.61235754985755, + "grad_norm": 0.6923251152038574, + "learning_rate": 0.00013014992564461746, + "loss": 0.9546, + "step": 9056 + }, + { + "epoch": 1.6125356125356125, + "grad_norm": 0.622953474521637, + "learning_rate": 0.0001301365792297358, + "loss": 0.8152, + "step": 9057 + }, + { + "epoch": 1.6127136752136753, + "grad_norm": 0.7477008104324341, + "learning_rate": 0.00013012323222439046, + "loss": 0.8428, + "step": 9058 + }, + { + "epoch": 1.6128917378917378, + "grad_norm": 0.6612883806228638, + "learning_rate": 0.000130109884628843, + "loss": 1.0678, + "step": 9059 + }, + { + "epoch": 1.6130698005698005, + "grad_norm": 0.6406781077384949, + "learning_rate": 0.00013009653644335486, + "loss": 0.6792, + "step": 9060 + }, + { + "epoch": 1.6132478632478633, + "grad_norm": 0.6279141902923584, + "learning_rate": 0.00013008318766818763, + "loss": 0.9826, + "step": 9061 + }, + { + "epoch": 1.613425925925926, + "grad_norm": 0.6616412401199341, + "learning_rate": 0.00013006983830360285, + "loss": 1.0691, + "step": 9062 + }, + { + "epoch": 1.6136039886039886, + "grad_norm": 0.6520406603813171, + "learning_rate": 0.000130056488349862, + "loss": 0.9487, + "step": 9063 + }, + { + "epoch": 1.6137820512820513, + "grad_norm": 0.6378647089004517, + "learning_rate": 0.00013004313780722672, + "loss": 0.8557, + "step": 9064 + }, + { + "epoch": 1.6139601139601139, + "grad_norm": 0.6547569036483765, + "learning_rate": 0.00013002978667595857, + "loss": 0.879, + "step": 9065 + }, + { + "epoch": 1.6141381766381766, + "grad_norm": 0.7347842454910278, + "learning_rate": 0.00013001643495631914, + "loss": 1.0757, + "step": 9066 + }, + { + "epoch": 1.6143162393162394, + "grad_norm": 0.5988406538963318, + "learning_rate": 0.00013000308264857002, + "loss": 0.6754, + "step": 9067 + }, + { + "epoch": 1.614494301994302, + "grad_norm": 0.6949366331100464, + "learning_rate": 0.00012998972975297282, + "loss": 1.1236, + 
"step": 9068 + }, + { + "epoch": 1.6146723646723646, + "grad_norm": 0.7095484137535095, + "learning_rate": 0.00012997637626978913, + "loss": 1.0124, + "step": 9069 + }, + { + "epoch": 1.6148504273504274, + "grad_norm": 0.6634095311164856, + "learning_rate": 0.00012996302219928064, + "loss": 1.2018, + "step": 9070 + }, + { + "epoch": 1.61502849002849, + "grad_norm": 0.6894524693489075, + "learning_rate": 0.000129949667541709, + "loss": 0.9959, + "step": 9071 + }, + { + "epoch": 1.6152065527065527, + "grad_norm": 0.672334611415863, + "learning_rate": 0.00012993631229733582, + "loss": 1.0369, + "step": 9072 + }, + { + "epoch": 1.6153846153846154, + "grad_norm": 0.725759744644165, + "learning_rate": 0.00012992295646642278, + "loss": 1.0079, + "step": 9073 + }, + { + "epoch": 1.6155626780626782, + "grad_norm": 0.7941585779190063, + "learning_rate": 0.00012990960004923154, + "loss": 0.9468, + "step": 9074 + }, + { + "epoch": 1.6157407407407407, + "grad_norm": 0.6556950807571411, + "learning_rate": 0.00012989624304602385, + "loss": 0.9915, + "step": 9075 + }, + { + "epoch": 1.6159188034188035, + "grad_norm": 0.7515892386436462, + "learning_rate": 0.0001298828854570614, + "loss": 1.0924, + "step": 9076 + }, + { + "epoch": 1.616096866096866, + "grad_norm": 0.6944101452827454, + "learning_rate": 0.00012986952728260586, + "loss": 0.9632, + "step": 9077 + }, + { + "epoch": 1.6162749287749287, + "grad_norm": 0.6286170482635498, + "learning_rate": 0.000129856168522919, + "loss": 1.0311, + "step": 9078 + }, + { + "epoch": 1.6164529914529915, + "grad_norm": 0.8362757563591003, + "learning_rate": 0.0001298428091782625, + "loss": 1.1232, + "step": 9079 + }, + { + "epoch": 1.6166310541310542, + "grad_norm": 0.6199851632118225, + "learning_rate": 0.0001298294492488982, + "loss": 0.9454, + "step": 9080 + }, + { + "epoch": 1.6168091168091168, + "grad_norm": 0.7541791796684265, + "learning_rate": 0.0001298160887350878, + "loss": 0.9759, + "step": 9081 + }, + { + "epoch": 
1.6169871794871795, + "grad_norm": 0.6940878033638, + "learning_rate": 0.00012980272763709304, + "loss": 0.9258, + "step": 9082 + }, + { + "epoch": 1.617165242165242, + "grad_norm": 0.6934045553207397, + "learning_rate": 0.00012978936595517575, + "loss": 1.0142, + "step": 9083 + }, + { + "epoch": 1.6173433048433048, + "grad_norm": 0.8147503733634949, + "learning_rate": 0.00012977600368959774, + "loss": 0.964, + "step": 9084 + }, + { + "epoch": 1.6175213675213675, + "grad_norm": 0.6583107709884644, + "learning_rate": 0.00012976264084062079, + "loss": 1.0315, + "step": 9085 + }, + { + "epoch": 1.6176994301994303, + "grad_norm": 0.7192013263702393, + "learning_rate": 0.0001297492774085067, + "loss": 0.9528, + "step": 9086 + }, + { + "epoch": 1.6178774928774928, + "grad_norm": 0.665888786315918, + "learning_rate": 0.00012973591339351733, + "loss": 1.0188, + "step": 9087 + }, + { + "epoch": 1.6180555555555556, + "grad_norm": 0.7170987725257874, + "learning_rate": 0.0001297225487959145, + "loss": 0.8969, + "step": 9088 + }, + { + "epoch": 1.618233618233618, + "grad_norm": 0.6768732070922852, + "learning_rate": 0.00012970918361596007, + "loss": 1.1951, + "step": 9089 + }, + { + "epoch": 1.6184116809116809, + "grad_norm": 0.6640290021896362, + "learning_rate": 0.00012969581785391592, + "loss": 0.9649, + "step": 9090 + }, + { + "epoch": 1.6185897435897436, + "grad_norm": 0.6200813055038452, + "learning_rate": 0.00012968245151004392, + "loss": 0.9446, + "step": 9091 + }, + { + "epoch": 1.6187678062678064, + "grad_norm": 0.6815837621688843, + "learning_rate": 0.0001296690845846059, + "loss": 1.0506, + "step": 9092 + }, + { + "epoch": 1.618945868945869, + "grad_norm": 0.7252637147903442, + "learning_rate": 0.0001296557170778638, + "loss": 1.1977, + "step": 9093 + }, + { + "epoch": 1.6191239316239316, + "grad_norm": 0.5609107613563538, + "learning_rate": 0.00012964234899007955, + "loss": 0.8009, + "step": 9094 + }, + { + "epoch": 1.6193019943019942, + "grad_norm": 
0.6539437770843506, + "learning_rate": 0.00012962898032151506, + "loss": 0.8482, + "step": 9095 + }, + { + "epoch": 1.619480056980057, + "grad_norm": 0.6993300914764404, + "learning_rate": 0.0001296156110724322, + "loss": 1.0725, + "step": 9096 + }, + { + "epoch": 1.6196581196581197, + "grad_norm": 0.6768273711204529, + "learning_rate": 0.000129602241243093, + "loss": 0.9247, + "step": 9097 + }, + { + "epoch": 1.6198361823361824, + "grad_norm": 0.6896265745162964, + "learning_rate": 0.00012958887083375939, + "loss": 0.9526, + "step": 9098 + }, + { + "epoch": 1.6200142450142452, + "grad_norm": 0.7475146651268005, + "learning_rate": 0.00012957549984469327, + "loss": 0.8302, + "step": 9099 + }, + { + "epoch": 1.6201923076923077, + "grad_norm": 0.6622769236564636, + "learning_rate": 0.00012956212827615674, + "loss": 0.9505, + "step": 9100 + }, + { + "epoch": 1.6203703703703702, + "grad_norm": 0.6938058137893677, + "learning_rate": 0.00012954875612841167, + "loss": 0.9757, + "step": 9101 + }, + { + "epoch": 1.620548433048433, + "grad_norm": 0.7453510761260986, + "learning_rate": 0.0001295353834017201, + "loss": 1.0919, + "step": 9102 + }, + { + "epoch": 1.6207264957264957, + "grad_norm": 0.7868932485580444, + "learning_rate": 0.0001295220100963441, + "loss": 0.9265, + "step": 9103 + }, + { + "epoch": 1.6209045584045585, + "grad_norm": 0.6779825091362, + "learning_rate": 0.00012950863621254558, + "loss": 0.98, + "step": 9104 + }, + { + "epoch": 1.6210826210826212, + "grad_norm": 0.6825897097587585, + "learning_rate": 0.00012949526175058662, + "loss": 0.9218, + "step": 9105 + }, + { + "epoch": 1.6212606837606838, + "grad_norm": 0.6686047911643982, + "learning_rate": 0.00012948188671072934, + "loss": 0.9546, + "step": 9106 + }, + { + "epoch": 1.6214387464387463, + "grad_norm": 0.7456090450286865, + "learning_rate": 0.0001294685110932357, + "loss": 1.0819, + "step": 9107 + }, + { + "epoch": 1.621616809116809, + "grad_norm": 0.7111441493034363, + "learning_rate": 
0.0001294551348983678, + "loss": 0.9916, + "step": 9108 + }, + { + "epoch": 1.6217948717948718, + "grad_norm": 0.6534699201583862, + "learning_rate": 0.00012944175812638773, + "loss": 1.0374, + "step": 9109 + }, + { + "epoch": 1.6219729344729346, + "grad_norm": 0.6046397089958191, + "learning_rate": 0.00012942838077755758, + "loss": 0.7922, + "step": 9110 + }, + { + "epoch": 1.6221509971509973, + "grad_norm": 0.7736679911613464, + "learning_rate": 0.00012941500285213942, + "loss": 1.0056, + "step": 9111 + }, + { + "epoch": 1.6223290598290598, + "grad_norm": 0.6850929260253906, + "learning_rate": 0.00012940162435039538, + "loss": 0.9538, + "step": 9112 + }, + { + "epoch": 1.6225071225071224, + "grad_norm": 0.6305751800537109, + "learning_rate": 0.00012938824527258756, + "loss": 0.9341, + "step": 9113 + }, + { + "epoch": 1.6226851851851851, + "grad_norm": 0.6740923523902893, + "learning_rate": 0.0001293748656189782, + "loss": 1.0037, + "step": 9114 + }, + { + "epoch": 1.6228632478632479, + "grad_norm": 0.6579762101173401, + "learning_rate": 0.00012936148538982928, + "loss": 1.0022, + "step": 9115 + }, + { + "epoch": 1.6230413105413106, + "grad_norm": 0.6500434279441833, + "learning_rate": 0.0001293481045854031, + "loss": 0.8589, + "step": 9116 + }, + { + "epoch": 1.6232193732193734, + "grad_norm": 0.7825912237167358, + "learning_rate": 0.00012933472320596177, + "loss": 1.0345, + "step": 9117 + }, + { + "epoch": 1.623397435897436, + "grad_norm": 0.8341414332389832, + "learning_rate": 0.0001293213412517675, + "loss": 1.0314, + "step": 9118 + }, + { + "epoch": 1.6235754985754984, + "grad_norm": 0.63664311170578, + "learning_rate": 0.00012930795872308242, + "loss": 0.819, + "step": 9119 + }, + { + "epoch": 1.6237535612535612, + "grad_norm": 0.6800840497016907, + "learning_rate": 0.00012929457562016878, + "loss": 0.95, + "step": 9120 + }, + { + "epoch": 1.623931623931624, + "grad_norm": 0.754165530204773, + "learning_rate": 0.0001292811919432888, + "loss": 1.1193, + 
"step": 9121 + }, + { + "epoch": 1.6241096866096867, + "grad_norm": 0.678871750831604, + "learning_rate": 0.00012926780769270465, + "loss": 0.9015, + "step": 9122 + }, + { + "epoch": 1.6242877492877494, + "grad_norm": 0.6642945408821106, + "learning_rate": 0.00012925442286867866, + "loss": 0.9095, + "step": 9123 + }, + { + "epoch": 1.624465811965812, + "grad_norm": 0.6089697480201721, + "learning_rate": 0.000129241037471473, + "loss": 0.8994, + "step": 9124 + }, + { + "epoch": 1.6246438746438745, + "grad_norm": 0.7320881485939026, + "learning_rate": 0.00012922765150134995, + "loss": 1.0518, + "step": 9125 + }, + { + "epoch": 1.6248219373219372, + "grad_norm": 0.7308032512664795, + "learning_rate": 0.0001292142649585718, + "loss": 1.0557, + "step": 9126 + }, + { + "epoch": 1.625, + "grad_norm": 0.6896602511405945, + "learning_rate": 0.0001292008778434008, + "loss": 1.145, + "step": 9127 + }, + { + "epoch": 1.6251780626780628, + "grad_norm": 0.6112532615661621, + "learning_rate": 0.00012918749015609926, + "loss": 0.9611, + "step": 9128 + }, + { + "epoch": 1.6253561253561255, + "grad_norm": 0.6856057643890381, + "learning_rate": 0.00012917410189692947, + "loss": 1.0124, + "step": 9129 + }, + { + "epoch": 1.625534188034188, + "grad_norm": 0.699252188205719, + "learning_rate": 0.00012916071306615378, + "loss": 0.8854, + "step": 9130 + }, + { + "epoch": 1.6257122507122506, + "grad_norm": 0.6306683421134949, + "learning_rate": 0.0001291473236640345, + "loss": 1.0722, + "step": 9131 + }, + { + "epoch": 1.6258903133903133, + "grad_norm": 0.6358118653297424, + "learning_rate": 0.00012913393369083393, + "loss": 0.889, + "step": 9132 + }, + { + "epoch": 1.626068376068376, + "grad_norm": 0.6953601837158203, + "learning_rate": 0.00012912054314681445, + "loss": 1.0168, + "step": 9133 + }, + { + "epoch": 1.6262464387464388, + "grad_norm": 0.6742331385612488, + "learning_rate": 0.00012910715203223844, + "loss": 0.8152, + "step": 9134 + }, + { + "epoch": 1.6264245014245016, + 
"grad_norm": 0.5872861742973328, + "learning_rate": 0.00012909376034736823, + "loss": 0.8702, + "step": 9135 + }, + { + "epoch": 1.626602564102564, + "grad_norm": 0.7580631971359253, + "learning_rate": 0.00012908036809246623, + "loss": 0.994, + "step": 9136 + }, + { + "epoch": 1.6267806267806266, + "grad_norm": 0.7544930577278137, + "learning_rate": 0.00012906697526779488, + "loss": 0.7475, + "step": 9137 + }, + { + "epoch": 1.6269586894586894, + "grad_norm": 0.6850766539573669, + "learning_rate": 0.00012905358187361647, + "loss": 1.0943, + "step": 9138 + }, + { + "epoch": 1.6271367521367521, + "grad_norm": 0.6821565628051758, + "learning_rate": 0.0001290401879101935, + "loss": 1.2928, + "step": 9139 + }, + { + "epoch": 1.6273148148148149, + "grad_norm": 0.6961034536361694, + "learning_rate": 0.00012902679337778835, + "loss": 0.8694, + "step": 9140 + }, + { + "epoch": 1.6274928774928776, + "grad_norm": 0.7159550786018372, + "learning_rate": 0.00012901339827666353, + "loss": 0.8827, + "step": 9141 + }, + { + "epoch": 1.6276709401709402, + "grad_norm": 0.7491081953048706, + "learning_rate": 0.0001290000026070814, + "loss": 0.8159, + "step": 9142 + }, + { + "epoch": 1.6278490028490027, + "grad_norm": 0.7107849717140198, + "learning_rate": 0.00012898660636930447, + "loss": 1.0625, + "step": 9143 + }, + { + "epoch": 1.6280270655270654, + "grad_norm": 0.7227210998535156, + "learning_rate": 0.0001289732095635952, + "loss": 0.9744, + "step": 9144 + }, + { + "epoch": 1.6282051282051282, + "grad_norm": 0.7141995429992676, + "learning_rate": 0.00012895981219021607, + "loss": 0.9836, + "step": 9145 + }, + { + "epoch": 1.628383190883191, + "grad_norm": 0.6445552706718445, + "learning_rate": 0.00012894641424942958, + "loss": 1.0183, + "step": 9146 + }, + { + "epoch": 1.6285612535612537, + "grad_norm": 0.698783278465271, + "learning_rate": 0.00012893301574149824, + "loss": 0.8392, + "step": 9147 + }, + { + "epoch": 1.6287393162393162, + "grad_norm": 0.6529116034507751, + 
"learning_rate": 0.00012891961666668458, + "loss": 0.9317, + "step": 9148 + }, + { + "epoch": 1.6289173789173788, + "grad_norm": 0.7780548930168152, + "learning_rate": 0.0001289062170252511, + "loss": 1.2406, + "step": 9149 + }, + { + "epoch": 1.6290954415954415, + "grad_norm": 0.6500990986824036, + "learning_rate": 0.0001288928168174603, + "loss": 1.0381, + "step": 9150 + }, + { + "epoch": 1.6292735042735043, + "grad_norm": 0.7098208665847778, + "learning_rate": 0.00012887941604357482, + "loss": 1.2126, + "step": 9151 + }, + { + "epoch": 1.629451566951567, + "grad_norm": 0.730648398399353, + "learning_rate": 0.0001288660147038572, + "loss": 0.8351, + "step": 9152 + }, + { + "epoch": 1.6296296296296298, + "grad_norm": 0.5520278215408325, + "learning_rate": 0.0001288526127985699, + "loss": 0.5877, + "step": 9153 + }, + { + "epoch": 1.6298076923076923, + "grad_norm": 0.7611770033836365, + "learning_rate": 0.00012883921032797563, + "loss": 1.2227, + "step": 9154 + }, + { + "epoch": 1.6299857549857548, + "grad_norm": 0.636820375919342, + "learning_rate": 0.00012882580729233696, + "loss": 0.8305, + "step": 9155 + }, + { + "epoch": 1.6301638176638176, + "grad_norm": 0.694492518901825, + "learning_rate": 0.00012881240369191644, + "loss": 1.0452, + "step": 9156 + }, + { + "epoch": 1.6303418803418803, + "grad_norm": 0.67826908826828, + "learning_rate": 0.00012879899952697677, + "loss": 0.8345, + "step": 9157 + }, + { + "epoch": 1.630519943019943, + "grad_norm": 0.5891323685646057, + "learning_rate": 0.00012878559479778052, + "loss": 0.8367, + "step": 9158 + }, + { + "epoch": 1.6306980056980058, + "grad_norm": 0.6766192317008972, + "learning_rate": 0.0001287721895045903, + "loss": 0.8319, + "step": 9159 + }, + { + "epoch": 1.6308760683760684, + "grad_norm": 0.5306392908096313, + "learning_rate": 0.0001287587836476688, + "loss": 0.7945, + "step": 9160 + }, + { + "epoch": 1.631054131054131, + "grad_norm": 0.6677970290184021, + "learning_rate": 0.0001287453772272787, + "loss": 
1.1228, + "step": 9161 + }, + { + "epoch": 1.6312321937321936, + "grad_norm": 0.810052752494812, + "learning_rate": 0.00012873197024368266, + "loss": 0.8395, + "step": 9162 + }, + { + "epoch": 1.6314102564102564, + "grad_norm": 0.7619220018386841, + "learning_rate": 0.00012871856269714333, + "loss": 1.3713, + "step": 9163 + }, + { + "epoch": 1.6315883190883191, + "grad_norm": 0.6564521193504333, + "learning_rate": 0.00012870515458792342, + "loss": 1.0513, + "step": 9164 + }, + { + "epoch": 1.631766381766382, + "grad_norm": 0.6874445676803589, + "learning_rate": 0.00012869174591628564, + "loss": 1.0255, + "step": 9165 + }, + { + "epoch": 1.6319444444444444, + "grad_norm": 0.6958737373352051, + "learning_rate": 0.0001286783366824927, + "loss": 0.9361, + "step": 9166 + }, + { + "epoch": 1.6321225071225072, + "grad_norm": 0.6909199357032776, + "learning_rate": 0.0001286649268868073, + "loss": 0.9855, + "step": 9167 + }, + { + "epoch": 1.6323005698005697, + "grad_norm": 0.7671375274658203, + "learning_rate": 0.00012865151652949225, + "loss": 1.084, + "step": 9168 + }, + { + "epoch": 1.6324786324786325, + "grad_norm": 0.750200092792511, + "learning_rate": 0.00012863810561081023, + "loss": 0.9341, + "step": 9169 + }, + { + "epoch": 1.6326566951566952, + "grad_norm": 0.6595860123634338, + "learning_rate": 0.00012862469413102402, + "loss": 0.9386, + "step": 9170 + }, + { + "epoch": 1.632834757834758, + "grad_norm": 0.622373640537262, + "learning_rate": 0.0001286112820903964, + "loss": 0.7697, + "step": 9171 + }, + { + "epoch": 1.6330128205128205, + "grad_norm": 0.9628498554229736, + "learning_rate": 0.00012859786948919014, + "loss": 1.2629, + "step": 9172 + }, + { + "epoch": 1.6331908831908832, + "grad_norm": 0.7610561847686768, + "learning_rate": 0.000128584456327668, + "loss": 0.9748, + "step": 9173 + }, + { + "epoch": 1.6333689458689458, + "grad_norm": 0.6585374474525452, + "learning_rate": 0.00012857104260609285, + "loss": 0.9049, + "step": 9174 + }, + { + "epoch": 
1.6335470085470085, + "grad_norm": 0.6996221542358398, + "learning_rate": 0.00012855762832472746, + "loss": 0.8893, + "step": 9175 + }, + { + "epoch": 1.6337250712250713, + "grad_norm": 0.6226270198822021, + "learning_rate": 0.00012854421348383466, + "loss": 0.8913, + "step": 9176 + }, + { + "epoch": 1.633903133903134, + "grad_norm": 0.6570866107940674, + "learning_rate": 0.00012853079808367731, + "loss": 0.8632, + "step": 9177 + }, + { + "epoch": 1.6340811965811965, + "grad_norm": 0.6899664402008057, + "learning_rate": 0.00012851738212451826, + "loss": 0.8177, + "step": 9178 + }, + { + "epoch": 1.6342592592592593, + "grad_norm": 0.75257807970047, + "learning_rate": 0.0001285039656066203, + "loss": 0.9096, + "step": 9179 + }, + { + "epoch": 1.6344373219373218, + "grad_norm": 0.6614963412284851, + "learning_rate": 0.00012849054853024638, + "loss": 0.9255, + "step": 9180 + }, + { + "epoch": 1.6346153846153846, + "grad_norm": 0.7245957851409912, + "learning_rate": 0.00012847713089565933, + "loss": 1.0122, + "step": 9181 + }, + { + "epoch": 1.6347934472934473, + "grad_norm": 0.7332839369773865, + "learning_rate": 0.00012846371270312204, + "loss": 0.8484, + "step": 9182 + }, + { + "epoch": 1.63497150997151, + "grad_norm": 0.628089189529419, + "learning_rate": 0.00012845029395289748, + "loss": 1.0171, + "step": 9183 + }, + { + "epoch": 1.6351495726495726, + "grad_norm": 0.7493528723716736, + "learning_rate": 0.00012843687464524848, + "loss": 1.1635, + "step": 9184 + }, + { + "epoch": 1.6353276353276354, + "grad_norm": 0.6328163146972656, + "learning_rate": 0.00012842345478043799, + "loss": 1.1254, + "step": 9185 + }, + { + "epoch": 1.635505698005698, + "grad_norm": 0.6720291376113892, + "learning_rate": 0.00012841003435872894, + "loss": 0.9729, + "step": 9186 + }, + { + "epoch": 1.6356837606837606, + "grad_norm": 0.6657332181930542, + "learning_rate": 0.00012839661338038427, + "loss": 1.1047, + "step": 9187 + }, + { + "epoch": 1.6358618233618234, + "grad_norm": 
0.7416180968284607, + "learning_rate": 0.000128383191845667, + "loss": 0.9505, + "step": 9188 + }, + { + "epoch": 1.6360398860398861, + "grad_norm": 0.8737816214561462, + "learning_rate": 0.00012836976975484, + "loss": 1.0518, + "step": 9189 + }, + { + "epoch": 1.6362179487179487, + "grad_norm": 0.7351877093315125, + "learning_rate": 0.0001283563471081663, + "loss": 1.1152, + "step": 9190 + }, + { + "epoch": 1.6363960113960114, + "grad_norm": 0.6442788243293762, + "learning_rate": 0.00012834292390590893, + "loss": 0.9432, + "step": 9191 + }, + { + "epoch": 1.636574074074074, + "grad_norm": 0.6848029494285583, + "learning_rate": 0.0001283295001483308, + "loss": 0.8528, + "step": 9192 + }, + { + "epoch": 1.6367521367521367, + "grad_norm": 0.6627060174942017, + "learning_rate": 0.00012831607583569497, + "loss": 1.0222, + "step": 9193 + }, + { + "epoch": 1.6369301994301995, + "grad_norm": 0.7319555878639221, + "learning_rate": 0.00012830265096826446, + "loss": 0.9392, + "step": 9194 + }, + { + "epoch": 1.6371082621082622, + "grad_norm": 0.6986424326896667, + "learning_rate": 0.0001282892255463023, + "loss": 1.2095, + "step": 9195 + }, + { + "epoch": 1.6372863247863247, + "grad_norm": 0.6649929881095886, + "learning_rate": 0.0001282757995700715, + "loss": 0.9426, + "step": 9196 + }, + { + "epoch": 1.6374643874643875, + "grad_norm": 0.6789031624794006, + "learning_rate": 0.0001282623730398352, + "loss": 0.9705, + "step": 9197 + }, + { + "epoch": 1.63764245014245, + "grad_norm": 0.6388779878616333, + "learning_rate": 0.00012824894595585637, + "loss": 1.0698, + "step": 9198 + }, + { + "epoch": 1.6378205128205128, + "grad_norm": 0.636832594871521, + "learning_rate": 0.00012823551831839814, + "loss": 0.9445, + "step": 9199 + }, + { + "epoch": 1.6379985754985755, + "grad_norm": 0.670190691947937, + "learning_rate": 0.0001282220901277236, + "loss": 0.9847, + "step": 9200 + }, + { + "epoch": 1.6381766381766383, + "grad_norm": 0.6020209193229675, + "learning_rate": 
0.0001282086613840958, + "loss": 1.0047, + "step": 9201 + }, + { + "epoch": 1.6383547008547008, + "grad_norm": 0.6648211479187012, + "learning_rate": 0.0001281952320877779, + "loss": 0.8717, + "step": 9202 + }, + { + "epoch": 1.6385327635327636, + "grad_norm": 0.7207710146903992, + "learning_rate": 0.000128181802239033, + "loss": 1.1232, + "step": 9203 + }, + { + "epoch": 1.638710826210826, + "grad_norm": 0.800992488861084, + "learning_rate": 0.0001281683718381242, + "loss": 1.0688, + "step": 9204 + }, + { + "epoch": 1.6388888888888888, + "grad_norm": 0.789398193359375, + "learning_rate": 0.0001281549408853147, + "loss": 1.1772, + "step": 9205 + }, + { + "epoch": 1.6390669515669516, + "grad_norm": 0.6514480710029602, + "learning_rate": 0.0001281415093808676, + "loss": 1.1685, + "step": 9206 + }, + { + "epoch": 1.6392450142450143, + "grad_norm": 0.6914686560630798, + "learning_rate": 0.00012812807732504608, + "loss": 1.1307, + "step": 9207 + }, + { + "epoch": 1.6394230769230769, + "grad_norm": 0.6788144111633301, + "learning_rate": 0.00012811464471811334, + "loss": 1.1735, + "step": 9208 + }, + { + "epoch": 1.6396011396011396, + "grad_norm": 0.7049870491027832, + "learning_rate": 0.00012810121156033252, + "loss": 1.0128, + "step": 9209 + }, + { + "epoch": 1.6397792022792022, + "grad_norm": 0.7156766057014465, + "learning_rate": 0.00012808777785196687, + "loss": 0.9503, + "step": 9210 + }, + { + "epoch": 1.639957264957265, + "grad_norm": 0.651716411113739, + "learning_rate": 0.0001280743435932795, + "loss": 1.1227, + "step": 9211 + }, + { + "epoch": 1.6401353276353277, + "grad_norm": 0.7276262044906616, + "learning_rate": 0.0001280609087845337, + "loss": 1.06, + "step": 9212 + }, + { + "epoch": 1.6403133903133904, + "grad_norm": 0.6591095924377441, + "learning_rate": 0.0001280474734259927, + "loss": 1.0861, + "step": 9213 + }, + { + "epoch": 1.640491452991453, + "grad_norm": 0.6675926446914673, + "learning_rate": 0.00012803403751791975, + "loss": 0.9815, + "step": 
9214 + }, + { + "epoch": 1.6406695156695157, + "grad_norm": 0.6391474008560181, + "learning_rate": 0.00012802060106057803, + "loss": 0.8027, + "step": 9215 + }, + { + "epoch": 1.6408475783475782, + "grad_norm": 0.6384556293487549, + "learning_rate": 0.00012800716405423086, + "loss": 0.7877, + "step": 9216 + }, + { + "epoch": 1.641025641025641, + "grad_norm": 0.661191463470459, + "learning_rate": 0.00012799372649914146, + "loss": 0.9725, + "step": 9217 + }, + { + "epoch": 1.6412037037037037, + "grad_norm": 0.7418332695960999, + "learning_rate": 0.0001279802883955732, + "loss": 1.1756, + "step": 9218 + }, + { + "epoch": 1.6413817663817665, + "grad_norm": 0.6588954329490662, + "learning_rate": 0.00012796684974378928, + "loss": 1.0428, + "step": 9219 + }, + { + "epoch": 1.6415598290598292, + "grad_norm": 0.7566093802452087, + "learning_rate": 0.000127953410544053, + "loss": 1.1254, + "step": 9220 + }, + { + "epoch": 1.6417378917378918, + "grad_norm": 0.6801039576530457, + "learning_rate": 0.00012793997079662777, + "loss": 1.0854, + "step": 9221 + }, + { + "epoch": 1.6419159544159543, + "grad_norm": 0.7262716889381409, + "learning_rate": 0.0001279265305017768, + "loss": 0.9343, + "step": 9222 + }, + { + "epoch": 1.642094017094017, + "grad_norm": 0.628625750541687, + "learning_rate": 0.0001279130896597635, + "loss": 0.8942, + "step": 9223 + }, + { + "epoch": 1.6422720797720798, + "grad_norm": 0.6183576583862305, + "learning_rate": 0.0001278996482708512, + "loss": 0.9284, + "step": 9224 + }, + { + "epoch": 1.6424501424501425, + "grad_norm": 0.7912000417709351, + "learning_rate": 0.00012788620633530327, + "loss": 1.3043, + "step": 9225 + }, + { + "epoch": 1.6426282051282053, + "grad_norm": 0.6982026100158691, + "learning_rate": 0.00012787276385338298, + "loss": 1.0224, + "step": 9226 + }, + { + "epoch": 1.6428062678062678, + "grad_norm": 0.6734985709190369, + "learning_rate": 0.00012785932082535386, + "loss": 0.8781, + "step": 9227 + }, + { + "epoch": 1.6429843304843303, + 
"grad_norm": 0.8799532055854797, + "learning_rate": 0.0001278458772514792, + "loss": 1.1482, + "step": 9228 + }, + { + "epoch": 1.643162393162393, + "grad_norm": 0.590295672416687, + "learning_rate": 0.0001278324331320224, + "loss": 0.9502, + "step": 9229 + }, + { + "epoch": 1.6433404558404558, + "grad_norm": 0.6562125086784363, + "learning_rate": 0.0001278189884672469, + "loss": 0.9834, + "step": 9230 + }, + { + "epoch": 1.6435185185185186, + "grad_norm": 0.6848936676979065, + "learning_rate": 0.00012780554325741612, + "loss": 1.0414, + "step": 9231 + }, + { + "epoch": 1.6436965811965814, + "grad_norm": 0.5985032320022583, + "learning_rate": 0.00012779209750279344, + "loss": 0.9469, + "step": 9232 + }, + { + "epoch": 1.6438746438746439, + "grad_norm": 0.7500917911529541, + "learning_rate": 0.00012777865120364238, + "loss": 0.9626, + "step": 9233 + }, + { + "epoch": 1.6440527065527064, + "grad_norm": 0.6565709114074707, + "learning_rate": 0.00012776520436022634, + "loss": 1.0594, + "step": 9234 + }, + { + "epoch": 1.6442307692307692, + "grad_norm": 0.8005441427230835, + "learning_rate": 0.00012775175697280882, + "loss": 1.2379, + "step": 9235 + }, + { + "epoch": 1.644408831908832, + "grad_norm": 0.6734150648117065, + "learning_rate": 0.00012773830904165326, + "loss": 0.9171, + "step": 9236 + }, + { + "epoch": 1.6445868945868947, + "grad_norm": 0.6950868368148804, + "learning_rate": 0.00012772486056702314, + "loss": 1.1782, + "step": 9237 + }, + { + "epoch": 1.6447649572649574, + "grad_norm": 0.8009599447250366, + "learning_rate": 0.000127711411549182, + "loss": 1.0288, + "step": 9238 + }, + { + "epoch": 1.64494301994302, + "grad_norm": 0.6227970719337463, + "learning_rate": 0.0001276979619883933, + "loss": 0.9327, + "step": 9239 + }, + { + "epoch": 1.6451210826210825, + "grad_norm": 0.6828190088272095, + "learning_rate": 0.00012768451188492058, + "loss": 0.9816, + "step": 9240 + }, + { + "epoch": 1.6452991452991452, + "grad_norm": 0.9689767360687256, + 
"learning_rate": 0.00012767106123902738, + "loss": 0.9049, + "step": 9241 + }, + { + "epoch": 1.645477207977208, + "grad_norm": 0.677061140537262, + "learning_rate": 0.00012765761005097717, + "loss": 0.9472, + "step": 9242 + }, + { + "epoch": 1.6456552706552707, + "grad_norm": 0.7227110862731934, + "learning_rate": 0.00012764415832103356, + "loss": 1.0384, + "step": 9243 + }, + { + "epoch": 1.6458333333333335, + "grad_norm": 0.6540094614028931, + "learning_rate": 0.0001276307060494601, + "loss": 0.8166, + "step": 9244 + }, + { + "epoch": 1.646011396011396, + "grad_norm": 0.6921904683113098, + "learning_rate": 0.00012761725323652033, + "loss": 0.9746, + "step": 9245 + }, + { + "epoch": 1.6461894586894585, + "grad_norm": 0.6742660999298096, + "learning_rate": 0.0001276037998824779, + "loss": 0.8441, + "step": 9246 + }, + { + "epoch": 1.6463675213675213, + "grad_norm": 0.6611103415489197, + "learning_rate": 0.0001275903459875963, + "loss": 1.087, + "step": 9247 + }, + { + "epoch": 1.646545584045584, + "grad_norm": 0.6805498003959656, + "learning_rate": 0.00012757689155213923, + "loss": 0.923, + "step": 9248 + }, + { + "epoch": 1.6467236467236468, + "grad_norm": 0.6598179340362549, + "learning_rate": 0.00012756343657637024, + "loss": 0.9371, + "step": 9249 + }, + { + "epoch": 1.6469017094017095, + "grad_norm": 0.7147273421287537, + "learning_rate": 0.00012754998106055297, + "loss": 1.053, + "step": 9250 + }, + { + "epoch": 1.647079772079772, + "grad_norm": 0.72414630651474, + "learning_rate": 0.00012753652500495103, + "loss": 1.0547, + "step": 9251 + }, + { + "epoch": 1.6472578347578346, + "grad_norm": 0.7784913182258606, + "learning_rate": 0.00012752306840982811, + "loss": 0.9012, + "step": 9252 + }, + { + "epoch": 1.6474358974358974, + "grad_norm": 0.644026517868042, + "learning_rate": 0.0001275096112754478, + "loss": 1.0911, + "step": 9253 + }, + { + "epoch": 1.64761396011396, + "grad_norm": 0.691124677658081, + "learning_rate": 0.00012749615360207382, + "loss": 
0.9918, + "step": 9254 + }, + { + "epoch": 1.6477920227920229, + "grad_norm": 0.6632972359657288, + "learning_rate": 0.00012748269538996986, + "loss": 0.9438, + "step": 9255 + }, + { + "epoch": 1.6479700854700856, + "grad_norm": 0.6548733115196228, + "learning_rate": 0.00012746923663939955, + "loss": 1.1082, + "step": 9256 + }, + { + "epoch": 1.6481481481481481, + "grad_norm": 0.6737542748451233, + "learning_rate": 0.00012745577735062664, + "loss": 0.9255, + "step": 9257 + }, + { + "epoch": 1.6483262108262107, + "grad_norm": 0.686862051486969, + "learning_rate": 0.00012744231752391479, + "loss": 0.9493, + "step": 9258 + }, + { + "epoch": 1.6485042735042734, + "grad_norm": 0.6096474528312683, + "learning_rate": 0.00012742885715952772, + "loss": 0.6849, + "step": 9259 + }, + { + "epoch": 1.6486823361823362, + "grad_norm": 0.702751636505127, + "learning_rate": 0.00012741539625772918, + "loss": 1.0335, + "step": 9260 + }, + { + "epoch": 1.648860398860399, + "grad_norm": 0.7470958232879639, + "learning_rate": 0.0001274019348187829, + "loss": 1.105, + "step": 9261 + }, + { + "epoch": 1.6490384615384617, + "grad_norm": 0.6642739176750183, + "learning_rate": 0.0001273884728429526, + "loss": 1.01, + "step": 9262 + }, + { + "epoch": 1.6492165242165242, + "grad_norm": 0.6470904350280762, + "learning_rate": 0.00012737501033050213, + "loss": 0.9009, + "step": 9263 + }, + { + "epoch": 1.6493945868945867, + "grad_norm": 0.7487246990203857, + "learning_rate": 0.00012736154728169518, + "loss": 0.9832, + "step": 9264 + }, + { + "epoch": 1.6495726495726495, + "grad_norm": 0.7370779514312744, + "learning_rate": 0.00012734808369679553, + "loss": 1.0464, + "step": 9265 + }, + { + "epoch": 1.6497507122507122, + "grad_norm": 0.7942814826965332, + "learning_rate": 0.00012733461957606702, + "loss": 1.102, + "step": 9266 + }, + { + "epoch": 1.649928774928775, + "grad_norm": 0.6535606980323792, + "learning_rate": 0.00012732115491977336, + "loss": 1.0655, + "step": 9267 + }, + { + "epoch": 
1.6501068376068377, + "grad_norm": 0.601716935634613, + "learning_rate": 0.00012730768972817847, + "loss": 0.8236, + "step": 9268 + }, + { + "epoch": 1.6502849002849003, + "grad_norm": 0.7375118732452393, + "learning_rate": 0.00012729422400154614, + "loss": 0.9313, + "step": 9269 + }, + { + "epoch": 1.6504629629629628, + "grad_norm": 0.7360411882400513, + "learning_rate": 0.00012728075774014018, + "loss": 0.9254, + "step": 9270 + }, + { + "epoch": 1.6506410256410255, + "grad_norm": 0.8453929424285889, + "learning_rate": 0.00012726729094422444, + "loss": 1.0975, + "step": 9271 + }, + { + "epoch": 1.6508190883190883, + "grad_norm": 0.5615501999855042, + "learning_rate": 0.00012725382361406274, + "loss": 0.8243, + "step": 9272 + }, + { + "epoch": 1.650997150997151, + "grad_norm": 0.6494898796081543, + "learning_rate": 0.000127240355749919, + "loss": 0.9766, + "step": 9273 + }, + { + "epoch": 1.6511752136752138, + "grad_norm": 0.6544778347015381, + "learning_rate": 0.0001272268873520571, + "loss": 0.9969, + "step": 9274 + }, + { + "epoch": 1.6513532763532763, + "grad_norm": 0.6937400698661804, + "learning_rate": 0.00012721341842074092, + "loss": 1.0626, + "step": 9275 + }, + { + "epoch": 1.651531339031339, + "grad_norm": 0.7068421244621277, + "learning_rate": 0.0001271999489562343, + "loss": 1.0068, + "step": 9276 + }, + { + "epoch": 1.6517094017094016, + "grad_norm": 0.6425052285194397, + "learning_rate": 0.0001271864789588012, + "loss": 0.8716, + "step": 9277 + }, + { + "epoch": 1.6518874643874644, + "grad_norm": 0.6895090341567993, + "learning_rate": 0.0001271730084287055, + "loss": 1.081, + "step": 9278 + }, + { + "epoch": 1.6520655270655271, + "grad_norm": 0.6773712038993835, + "learning_rate": 0.00012715953736621116, + "loss": 0.7586, + "step": 9279 + }, + { + "epoch": 1.6522435897435899, + "grad_norm": 0.6085716485977173, + "learning_rate": 0.0001271460657715821, + "loss": 0.8627, + "step": 9280 + }, + { + "epoch": 1.6524216524216524, + "grad_norm": 
0.6415461897850037, + "learning_rate": 0.00012713259364508227, + "loss": 0.9751, + "step": 9281 + }, + { + "epoch": 1.6525997150997151, + "grad_norm": 0.6460939645767212, + "learning_rate": 0.00012711912098697565, + "loss": 0.9578, + "step": 9282 + }, + { + "epoch": 1.6527777777777777, + "grad_norm": 0.6076797246932983, + "learning_rate": 0.00012710564779752615, + "loss": 0.9627, + "step": 9283 + }, + { + "epoch": 1.6529558404558404, + "grad_norm": 0.710782527923584, + "learning_rate": 0.00012709217407699783, + "loss": 0.8725, + "step": 9284 + }, + { + "epoch": 1.6531339031339032, + "grad_norm": 0.6793623566627502, + "learning_rate": 0.00012707869982565463, + "loss": 0.908, + "step": 9285 + }, + { + "epoch": 1.653311965811966, + "grad_norm": 0.6841681003570557, + "learning_rate": 0.00012706522504376055, + "loss": 0.8546, + "step": 9286 + }, + { + "epoch": 1.6534900284900285, + "grad_norm": 0.7908675670623779, + "learning_rate": 0.0001270517497315796, + "loss": 0.9409, + "step": 9287 + }, + { + "epoch": 1.6536680911680912, + "grad_norm": 0.6918683648109436, + "learning_rate": 0.0001270382738893758, + "loss": 1.0493, + "step": 9288 + }, + { + "epoch": 1.6538461538461537, + "grad_norm": 0.6891819834709167, + "learning_rate": 0.00012702479751741322, + "loss": 1.0675, + "step": 9289 + }, + { + "epoch": 1.6540242165242165, + "grad_norm": 0.6965166926383972, + "learning_rate": 0.00012701132061595586, + "loss": 0.8563, + "step": 9290 + }, + { + "epoch": 1.6542022792022792, + "grad_norm": 0.7549001574516296, + "learning_rate": 0.00012699784318526779, + "loss": 1.1572, + "step": 9291 + }, + { + "epoch": 1.654380341880342, + "grad_norm": 0.6100513339042664, + "learning_rate": 0.00012698436522561303, + "loss": 0.897, + "step": 9292 + }, + { + "epoch": 1.6545584045584045, + "grad_norm": 0.6477037668228149, + "learning_rate": 0.00012697088673725574, + "loss": 0.7961, + "step": 9293 + }, + { + "epoch": 1.6547364672364673, + "grad_norm": 0.7402619123458862, + "learning_rate": 
0.0001269574077204599, + "loss": 1.2001, + "step": 9294 + }, + { + "epoch": 1.6549145299145298, + "grad_norm": 0.7162346243858337, + "learning_rate": 0.0001269439281754897, + "loss": 0.9963, + "step": 9295 + }, + { + "epoch": 1.6550925925925926, + "grad_norm": 0.6757413744926453, + "learning_rate": 0.0001269304481026092, + "loss": 1.0476, + "step": 9296 + }, + { + "epoch": 1.6552706552706553, + "grad_norm": 0.6455655097961426, + "learning_rate": 0.0001269169675020825, + "loss": 0.9716, + "step": 9297 + }, + { + "epoch": 1.655448717948718, + "grad_norm": 0.7705031037330627, + "learning_rate": 0.0001269034863741737, + "loss": 0.9886, + "step": 9298 + }, + { + "epoch": 1.6556267806267806, + "grad_norm": 0.6084272861480713, + "learning_rate": 0.000126890004719147, + "loss": 0.8231, + "step": 9299 + }, + { + "epoch": 1.6558048433048433, + "grad_norm": 0.7051045298576355, + "learning_rate": 0.00012687652253726652, + "loss": 0.8673, + "step": 9300 + }, + { + "epoch": 1.6559829059829059, + "grad_norm": 0.731675386428833, + "learning_rate": 0.0001268630398287964, + "loss": 0.8609, + "step": 9301 + }, + { + "epoch": 1.6561609686609686, + "grad_norm": 0.6796799302101135, + "learning_rate": 0.00012684955659400087, + "loss": 1.0157, + "step": 9302 + }, + { + "epoch": 1.6563390313390314, + "grad_norm": 0.6270264983177185, + "learning_rate": 0.000126836072833144, + "loss": 0.8924, + "step": 9303 + }, + { + "epoch": 1.6565170940170941, + "grad_norm": 0.7235464453697205, + "learning_rate": 0.00012682258854649004, + "loss": 0.8904, + "step": 9304 + }, + { + "epoch": 1.6566951566951567, + "grad_norm": 0.7644724249839783, + "learning_rate": 0.00012680910373430318, + "loss": 0.9119, + "step": 9305 + }, + { + "epoch": 1.6568732193732194, + "grad_norm": 0.661411702632904, + "learning_rate": 0.00012679561839684764, + "loss": 1.0066, + "step": 9306 + }, + { + "epoch": 1.657051282051282, + "grad_norm": 0.6981723308563232, + "learning_rate": 0.0001267821325343876, + "loss": 1.2579, + "step": 
9307 + }, + { + "epoch": 1.6572293447293447, + "grad_norm": 0.6469807028770447, + "learning_rate": 0.0001267686461471873, + "loss": 0.8678, + "step": 9308 + }, + { + "epoch": 1.6574074074074074, + "grad_norm": 0.8255495429039001, + "learning_rate": 0.000126755159235511, + "loss": 0.9053, + "step": 9309 + }, + { + "epoch": 1.6575854700854702, + "grad_norm": 0.6882261037826538, + "learning_rate": 0.00012674167179962294, + "loss": 0.8364, + "step": 9310 + }, + { + "epoch": 1.6577635327635327, + "grad_norm": 0.6816701889038086, + "learning_rate": 0.00012672818383978733, + "loss": 0.9627, + "step": 9311 + }, + { + "epoch": 1.6579415954415955, + "grad_norm": 0.6993424892425537, + "learning_rate": 0.00012671469535626852, + "loss": 0.8337, + "step": 9312 + }, + { + "epoch": 1.658119658119658, + "grad_norm": 0.6271458864212036, + "learning_rate": 0.00012670120634933075, + "loss": 0.8322, + "step": 9313 + }, + { + "epoch": 1.6582977207977208, + "grad_norm": 0.7012003660202026, + "learning_rate": 0.00012668771681923827, + "loss": 0.8895, + "step": 9314 + }, + { + "epoch": 1.6584757834757835, + "grad_norm": 0.6704670190811157, + "learning_rate": 0.00012667422676625547, + "loss": 1.0544, + "step": 9315 + }, + { + "epoch": 1.6586538461538463, + "grad_norm": 0.6189491748809814, + "learning_rate": 0.0001266607361906466, + "loss": 0.9623, + "step": 9316 + }, + { + "epoch": 1.6588319088319088, + "grad_norm": 0.7065694332122803, + "learning_rate": 0.000126647245092676, + "loss": 0.8874, + "step": 9317 + }, + { + "epoch": 1.6590099715099715, + "grad_norm": 0.7473452687263489, + "learning_rate": 0.00012663375347260795, + "loss": 1.0576, + "step": 9318 + }, + { + "epoch": 1.659188034188034, + "grad_norm": 0.6839408874511719, + "learning_rate": 0.0001266202613307068, + "loss": 0.9127, + "step": 9319 + }, + { + "epoch": 1.6593660968660968, + "grad_norm": 0.7154020071029663, + "learning_rate": 0.00012660676866723699, + "loss": 1.1174, + "step": 9320 + }, + { + "epoch": 1.6595441595441596, 
+ "grad_norm": 0.7123729586601257, + "learning_rate": 0.0001265932754824628, + "loss": 0.9617, + "step": 9321 + }, + { + "epoch": 1.6597222222222223, + "grad_norm": 0.7537810802459717, + "learning_rate": 0.0001265797817766486, + "loss": 1.0333, + "step": 9322 + }, + { + "epoch": 1.6599002849002849, + "grad_norm": 0.706551730632782, + "learning_rate": 0.00012656628755005884, + "loss": 1.0838, + "step": 9323 + }, + { + "epoch": 1.6600783475783476, + "grad_norm": 0.8104004859924316, + "learning_rate": 0.0001265527928029578, + "loss": 0.9807, + "step": 9324 + }, + { + "epoch": 1.6602564102564101, + "grad_norm": 0.6892881989479065, + "learning_rate": 0.00012653929753560998, + "loss": 0.9941, + "step": 9325 + }, + { + "epoch": 1.6604344729344729, + "grad_norm": 0.5919203758239746, + "learning_rate": 0.00012652580174827974, + "loss": 0.9268, + "step": 9326 + }, + { + "epoch": 1.6606125356125356, + "grad_norm": 0.6715863347053528, + "learning_rate": 0.00012651230544123154, + "loss": 1.0912, + "step": 9327 + }, + { + "epoch": 1.6607905982905984, + "grad_norm": 0.6765137314796448, + "learning_rate": 0.0001264988086147298, + "loss": 1.1576, + "step": 9328 + }, + { + "epoch": 1.660968660968661, + "grad_norm": 0.6781638860702515, + "learning_rate": 0.00012648531126903888, + "loss": 1.1162, + "step": 9329 + }, + { + "epoch": 1.6611467236467237, + "grad_norm": 0.715871274471283, + "learning_rate": 0.00012647181340442337, + "loss": 0.714, + "step": 9330 + }, + { + "epoch": 1.6613247863247862, + "grad_norm": 0.6237258315086365, + "learning_rate": 0.00012645831502114762, + "loss": 0.8512, + "step": 9331 + }, + { + "epoch": 1.661502849002849, + "grad_norm": 0.6668339967727661, + "learning_rate": 0.0001264448161194762, + "loss": 1.0384, + "step": 9332 + }, + { + "epoch": 1.6616809116809117, + "grad_norm": 0.8316730260848999, + "learning_rate": 0.00012643131669967352, + "loss": 0.8931, + "step": 9333 + }, + { + "epoch": 1.6618589743589745, + "grad_norm": 0.7013183832168579, + 
"learning_rate": 0.00012641781676200406, + "loss": 1.0548, + "step": 9334 + }, + { + "epoch": 1.6620370370370372, + "grad_norm": 0.6980466842651367, + "learning_rate": 0.00012640431630673243, + "loss": 0.8988, + "step": 9335 + }, + { + "epoch": 1.6622150997150997, + "grad_norm": 0.7045995593070984, + "learning_rate": 0.000126390815334123, + "loss": 1.107, + "step": 9336 + }, + { + "epoch": 1.6623931623931623, + "grad_norm": 0.6699773669242859, + "learning_rate": 0.00012637731384444043, + "loss": 1.1757, + "step": 9337 + }, + { + "epoch": 1.662571225071225, + "grad_norm": 0.6489999294281006, + "learning_rate": 0.00012636381183794916, + "loss": 0.9282, + "step": 9338 + }, + { + "epoch": 1.6627492877492878, + "grad_norm": 0.7085952758789062, + "learning_rate": 0.00012635030931491375, + "loss": 1.0221, + "step": 9339 + }, + { + "epoch": 1.6629273504273505, + "grad_norm": 0.6893135905265808, + "learning_rate": 0.00012633680627559878, + "loss": 1.0517, + "step": 9340 + }, + { + "epoch": 1.6631054131054133, + "grad_norm": 0.5659682154655457, + "learning_rate": 0.00012632330272026882, + "loss": 0.6294, + "step": 9341 + }, + { + "epoch": 1.6632834757834758, + "grad_norm": 0.6889018416404724, + "learning_rate": 0.00012630979864918838, + "loss": 1.0735, + "step": 9342 + }, + { + "epoch": 1.6634615384615383, + "grad_norm": 0.7333424687385559, + "learning_rate": 0.00012629629406262212, + "loss": 0.9079, + "step": 9343 + }, + { + "epoch": 1.663639601139601, + "grad_norm": 0.6340580582618713, + "learning_rate": 0.00012628278896083462, + "loss": 0.9738, + "step": 9344 + }, + { + "epoch": 1.6638176638176638, + "grad_norm": 0.7042564749717712, + "learning_rate": 0.00012626928334409044, + "loss": 0.959, + "step": 9345 + }, + { + "epoch": 1.6639957264957266, + "grad_norm": 0.711757242679596, + "learning_rate": 0.00012625577721265424, + "loss": 0.8113, + "step": 9346 + }, + { + "epoch": 1.6641737891737893, + "grad_norm": 0.7723299264907837, + "learning_rate": 0.0001262422705667906, + 
"loss": 1.1724, + "step": 9347 + }, + { + "epoch": 1.6643518518518519, + "grad_norm": 0.711334228515625, + "learning_rate": 0.00012622876340676422, + "loss": 1.0121, + "step": 9348 + }, + { + "epoch": 1.6645299145299144, + "grad_norm": 0.6954590678215027, + "learning_rate": 0.0001262152557328397, + "loss": 1.2093, + "step": 9349 + }, + { + "epoch": 1.6647079772079771, + "grad_norm": 0.6341620087623596, + "learning_rate": 0.00012620174754528166, + "loss": 1.0535, + "step": 9350 + }, + { + "epoch": 1.66488603988604, + "grad_norm": 0.6434268355369568, + "learning_rate": 0.00012618823884435484, + "loss": 0.8964, + "step": 9351 + }, + { + "epoch": 1.6650641025641026, + "grad_norm": 0.7685084939002991, + "learning_rate": 0.00012617472963032385, + "loss": 1.0639, + "step": 9352 + }, + { + "epoch": 1.6652421652421654, + "grad_norm": 0.6347958445549011, + "learning_rate": 0.00012616121990345345, + "loss": 1.0252, + "step": 9353 + }, + { + "epoch": 1.665420227920228, + "grad_norm": 0.647722601890564, + "learning_rate": 0.0001261477096640083, + "loss": 0.9527, + "step": 9354 + }, + { + "epoch": 1.6655982905982905, + "grad_norm": 0.5942047834396362, + "learning_rate": 0.000126134198912253, + "loss": 1.0062, + "step": 9355 + }, + { + "epoch": 1.6657763532763532, + "grad_norm": 0.683555006980896, + "learning_rate": 0.00012612068764845247, + "loss": 0.8101, + "step": 9356 + }, + { + "epoch": 1.665954415954416, + "grad_norm": 0.6832289099693298, + "learning_rate": 0.00012610717587287128, + "loss": 1.1436, + "step": 9357 + }, + { + "epoch": 1.6661324786324787, + "grad_norm": 0.7035253047943115, + "learning_rate": 0.00012609366358577422, + "loss": 0.9724, + "step": 9358 + }, + { + "epoch": 1.6663105413105415, + "grad_norm": 0.6471409797668457, + "learning_rate": 0.00012608015078742604, + "loss": 0.776, + "step": 9359 + }, + { + "epoch": 1.666488603988604, + "grad_norm": 0.7069687247276306, + "learning_rate": 0.00012606663747809145, + "loss": 0.9667, + "step": 9360 + }, + { + 
"epoch": 1.6666666666666665, + "grad_norm": 0.6744135618209839, + "learning_rate": 0.00012605312365803525, + "loss": 1.1152, + "step": 9361 + }, + { + "epoch": 1.6668447293447293, + "grad_norm": 0.7212334275245667, + "learning_rate": 0.00012603960932752227, + "loss": 1.1543, + "step": 9362 + }, + { + "epoch": 1.667022792022792, + "grad_norm": 0.6501669883728027, + "learning_rate": 0.0001260260944868172, + "loss": 0.8595, + "step": 9363 + }, + { + "epoch": 1.6672008547008548, + "grad_norm": 0.6970864534378052, + "learning_rate": 0.00012601257913618486, + "loss": 0.9364, + "step": 9364 + }, + { + "epoch": 1.6673789173789175, + "grad_norm": 0.6802223324775696, + "learning_rate": 0.00012599906327589007, + "loss": 0.8429, + "step": 9365 + }, + { + "epoch": 1.66755698005698, + "grad_norm": 0.6842933893203735, + "learning_rate": 0.00012598554690619764, + "loss": 1.1255, + "step": 9366 + }, + { + "epoch": 1.6677350427350426, + "grad_norm": 0.6547088623046875, + "learning_rate": 0.0001259720300273724, + "loss": 0.983, + "step": 9367 + }, + { + "epoch": 1.6679131054131053, + "grad_norm": 0.620424211025238, + "learning_rate": 0.0001259585126396792, + "loss": 0.918, + "step": 9368 + }, + { + "epoch": 1.668091168091168, + "grad_norm": 0.5659816861152649, + "learning_rate": 0.00012594499474338287, + "loss": 0.7788, + "step": 9369 + }, + { + "epoch": 1.6682692307692308, + "grad_norm": 0.5904595255851746, + "learning_rate": 0.00012593147633874826, + "loss": 0.801, + "step": 9370 + }, + { + "epoch": 1.6684472934472936, + "grad_norm": 0.6444024443626404, + "learning_rate": 0.0001259179574260402, + "loss": 1.0997, + "step": 9371 + }, + { + "epoch": 1.6686253561253561, + "grad_norm": 0.6408827304840088, + "learning_rate": 0.00012590443800552365, + "loss": 0.9839, + "step": 9372 + }, + { + "epoch": 1.6688034188034186, + "grad_norm": 0.752391517162323, + "learning_rate": 0.00012589091807746345, + "loss": 1.0249, + "step": 9373 + }, + { + "epoch": 1.6689814814814814, + "grad_norm": 
0.8256397247314453, + "learning_rate": 0.00012587739764212448, + "loss": 0.9541, + "step": 9374 + }, + { + "epoch": 1.6691595441595442, + "grad_norm": 0.7878768444061279, + "learning_rate": 0.00012586387669977166, + "loss": 1.0071, + "step": 9375 + }, + { + "epoch": 1.669337606837607, + "grad_norm": 0.6179735660552979, + "learning_rate": 0.0001258503552506699, + "loss": 0.8495, + "step": 9376 + }, + { + "epoch": 1.6695156695156697, + "grad_norm": 0.6699580550193787, + "learning_rate": 0.00012583683329508413, + "loss": 0.8999, + "step": 9377 + }, + { + "epoch": 1.6696937321937322, + "grad_norm": 0.6542006731033325, + "learning_rate": 0.00012582331083327929, + "loss": 1.0357, + "step": 9378 + }, + { + "epoch": 1.6698717948717947, + "grad_norm": 0.7275210618972778, + "learning_rate": 0.0001258097878655203, + "loss": 1.0259, + "step": 9379 + }, + { + "epoch": 1.6700498575498575, + "grad_norm": 0.6836326122283936, + "learning_rate": 0.00012579626439207216, + "loss": 1.0428, + "step": 9380 + }, + { + "epoch": 1.6702279202279202, + "grad_norm": 0.760123610496521, + "learning_rate": 0.00012578274041319978, + "loss": 0.9716, + "step": 9381 + }, + { + "epoch": 1.670405982905983, + "grad_norm": 0.5525194406509399, + "learning_rate": 0.00012576921592916818, + "loss": 0.8253, + "step": 9382 + }, + { + "epoch": 1.6705840455840457, + "grad_norm": 0.6881270408630371, + "learning_rate": 0.00012575569094024232, + "loss": 1.0571, + "step": 9383 + }, + { + "epoch": 1.6707621082621082, + "grad_norm": 0.6776245832443237, + "learning_rate": 0.0001257421654466872, + "loss": 0.9119, + "step": 9384 + }, + { + "epoch": 1.6709401709401708, + "grad_norm": 0.7903014421463013, + "learning_rate": 0.0001257286394487678, + "loss": 1.0626, + "step": 9385 + }, + { + "epoch": 1.6711182336182335, + "grad_norm": 0.61158287525177, + "learning_rate": 0.0001257151129467492, + "loss": 0.9378, + "step": 9386 + }, + { + "epoch": 1.6712962962962963, + "grad_norm": 0.655189573764801, + "learning_rate": 
0.00012570158594089637, + "loss": 0.9334, + "step": 9387 + }, + { + "epoch": 1.671474358974359, + "grad_norm": 0.6707320809364319, + "learning_rate": 0.0001256880584314743, + "loss": 1.1802, + "step": 9388 + }, + { + "epoch": 1.6716524216524218, + "grad_norm": 0.847341775894165, + "learning_rate": 0.00012567453041874814, + "loss": 1.1169, + "step": 9389 + }, + { + "epoch": 1.6718304843304843, + "grad_norm": 0.6136410236358643, + "learning_rate": 0.00012566100190298287, + "loss": 0.8959, + "step": 9390 + }, + { + "epoch": 1.672008547008547, + "grad_norm": 0.7203437089920044, + "learning_rate": 0.00012564747288444357, + "loss": 0.9803, + "step": 9391 + }, + { + "epoch": 1.6721866096866096, + "grad_norm": 0.7832576632499695, + "learning_rate": 0.00012563394336339534, + "loss": 0.8696, + "step": 9392 + }, + { + "epoch": 1.6723646723646723, + "grad_norm": 0.6940804719924927, + "learning_rate": 0.00012562041334010323, + "loss": 1.0571, + "step": 9393 + }, + { + "epoch": 1.672542735042735, + "grad_norm": 0.6042298674583435, + "learning_rate": 0.00012560688281483234, + "loss": 0.8835, + "step": 9394 + }, + { + "epoch": 1.6727207977207978, + "grad_norm": 0.7870675921440125, + "learning_rate": 0.00012559335178784776, + "loss": 1.1585, + "step": 9395 + }, + { + "epoch": 1.6728988603988604, + "grad_norm": 0.7448568940162659, + "learning_rate": 0.00012557982025941463, + "loss": 0.9699, + "step": 9396 + }, + { + "epoch": 1.6730769230769231, + "grad_norm": 0.7226544618606567, + "learning_rate": 0.00012556628822979807, + "loss": 0.7817, + "step": 9397 + }, + { + "epoch": 1.6732549857549857, + "grad_norm": 0.5652043223381042, + "learning_rate": 0.0001255527556992632, + "loss": 0.8077, + "step": 9398 + }, + { + "epoch": 1.6734330484330484, + "grad_norm": 0.6459930539131165, + "learning_rate": 0.00012553922266807517, + "loss": 1.22, + "step": 9399 + }, + { + "epoch": 1.6736111111111112, + "grad_norm": 0.7568991780281067, + "learning_rate": 0.00012552568913649912, + "loss": 1.1559, + 
"step": 9400 + }, + { + "epoch": 1.673789173789174, + "grad_norm": 0.7462680339813232, + "learning_rate": 0.0001255121551048002, + "loss": 1.1438, + "step": 9401 + }, + { + "epoch": 1.6739672364672364, + "grad_norm": 0.6653871536254883, + "learning_rate": 0.0001254986205732436, + "loss": 0.9468, + "step": 9402 + }, + { + "epoch": 1.6741452991452992, + "grad_norm": 0.6261825561523438, + "learning_rate": 0.0001254850855420945, + "loss": 0.8558, + "step": 9403 + }, + { + "epoch": 1.6743233618233617, + "grad_norm": 0.6442354321479797, + "learning_rate": 0.0001254715500116181, + "loss": 0.8605, + "step": 9404 + }, + { + "epoch": 1.6745014245014245, + "grad_norm": 0.7483665943145752, + "learning_rate": 0.00012545801398207958, + "loss": 0.9089, + "step": 9405 + }, + { + "epoch": 1.6746794871794872, + "grad_norm": 0.7319819927215576, + "learning_rate": 0.00012544447745374416, + "loss": 0.9937, + "step": 9406 + }, + { + "epoch": 1.67485754985755, + "grad_norm": 0.703014075756073, + "learning_rate": 0.00012543094042687708, + "loss": 0.9597, + "step": 9407 + }, + { + "epoch": 1.6750356125356125, + "grad_norm": 0.6593887209892273, + "learning_rate": 0.00012541740290174353, + "loss": 0.844, + "step": 9408 + }, + { + "epoch": 1.6752136752136753, + "grad_norm": 0.6567463874816895, + "learning_rate": 0.00012540386487860879, + "loss": 1.0744, + "step": 9409 + }, + { + "epoch": 1.6753917378917378, + "grad_norm": 0.7784611582756042, + "learning_rate": 0.00012539032635773805, + "loss": 0.974, + "step": 9410 + }, + { + "epoch": 1.6755698005698005, + "grad_norm": 0.6760087609291077, + "learning_rate": 0.00012537678733939663, + "loss": 0.8948, + "step": 9411 + }, + { + "epoch": 1.6757478632478633, + "grad_norm": 0.825965940952301, + "learning_rate": 0.0001253632478238498, + "loss": 1.1196, + "step": 9412 + }, + { + "epoch": 1.675925925925926, + "grad_norm": 0.7215564250946045, + "learning_rate": 0.00012534970781136277, + "loss": 1.1774, + "step": 9413 + }, + { + "epoch": 
1.6761039886039886, + "grad_norm": 0.6548578143119812, + "learning_rate": 0.00012533616730220094, + "loss": 0.8671, + "step": 9414 + }, + { + "epoch": 1.6762820512820513, + "grad_norm": 0.7257684469223022, + "learning_rate": 0.00012532262629662947, + "loss": 1.105, + "step": 9415 + }, + { + "epoch": 1.6764601139601139, + "grad_norm": 0.6695847511291504, + "learning_rate": 0.00012530908479491378, + "loss": 0.9189, + "step": 9416 + }, + { + "epoch": 1.6766381766381766, + "grad_norm": 0.684695303440094, + "learning_rate": 0.00012529554279731915, + "loss": 1.066, + "step": 9417 + }, + { + "epoch": 1.6768162393162394, + "grad_norm": 0.7107276320457458, + "learning_rate": 0.0001252820003041109, + "loss": 0.9311, + "step": 9418 + }, + { + "epoch": 1.676994301994302, + "grad_norm": 0.6755440831184387, + "learning_rate": 0.0001252684573155544, + "loss": 1.1036, + "step": 9419 + }, + { + "epoch": 1.6771723646723646, + "grad_norm": 0.7571110725402832, + "learning_rate": 0.00012525491383191491, + "loss": 1.0244, + "step": 9420 + }, + { + "epoch": 1.6773504273504274, + "grad_norm": 0.6960614323616028, + "learning_rate": 0.0001252413698534579, + "loss": 0.9077, + "step": 9421 + }, + { + "epoch": 1.67752849002849, + "grad_norm": 0.6675550937652588, + "learning_rate": 0.00012522782538044867, + "loss": 1.0543, + "step": 9422 + }, + { + "epoch": 1.6777065527065527, + "grad_norm": 0.6637391448020935, + "learning_rate": 0.0001252142804131526, + "loss": 0.9471, + "step": 9423 + }, + { + "epoch": 1.6778846153846154, + "grad_norm": 0.6382880210876465, + "learning_rate": 0.00012520073495183508, + "loss": 0.9729, + "step": 9424 + }, + { + "epoch": 1.6780626780626782, + "grad_norm": 0.731922447681427, + "learning_rate": 0.0001251871889967615, + "loss": 1.0385, + "step": 9425 + }, + { + "epoch": 1.6782407407407407, + "grad_norm": 0.5868890285491943, + "learning_rate": 0.00012517364254819728, + "loss": 0.8466, + "step": 9426 + }, + { + "epoch": 1.6784188034188035, + "grad_norm": 
0.8535677790641785, + "learning_rate": 0.00012516009560640786, + "loss": 1.1009, + "step": 9427 + }, + { + "epoch": 1.678596866096866, + "grad_norm": 0.7044199705123901, + "learning_rate": 0.0001251465481716586, + "loss": 1.0862, + "step": 9428 + }, + { + "epoch": 1.6787749287749287, + "grad_norm": 0.7207323312759399, + "learning_rate": 0.00012513300024421498, + "loss": 1.064, + "step": 9429 + }, + { + "epoch": 1.6789529914529915, + "grad_norm": 0.7739703059196472, + "learning_rate": 0.0001251194518243424, + "loss": 1.1738, + "step": 9430 + }, + { + "epoch": 1.6791310541310542, + "grad_norm": 0.6829344630241394, + "learning_rate": 0.00012510590291230637, + "loss": 1.0555, + "step": 9431 + }, + { + "epoch": 1.6793091168091168, + "grad_norm": 0.6760238409042358, + "learning_rate": 0.0001250923535083723, + "loss": 1.2177, + "step": 9432 + }, + { + "epoch": 1.6794871794871795, + "grad_norm": 0.6666911840438843, + "learning_rate": 0.0001250788036128057, + "loss": 0.8957, + "step": 9433 + }, + { + "epoch": 1.679665242165242, + "grad_norm": 0.747797429561615, + "learning_rate": 0.00012506525322587207, + "loss": 0.9793, + "step": 9434 + }, + { + "epoch": 1.6798433048433048, + "grad_norm": 0.6261107325553894, + "learning_rate": 0.00012505170234783686, + "loss": 0.7781, + "step": 9435 + }, + { + "epoch": 1.6800213675213675, + "grad_norm": 0.7055163979530334, + "learning_rate": 0.00012503815097896555, + "loss": 1.0617, + "step": 9436 + }, + { + "epoch": 1.6801994301994303, + "grad_norm": 0.5567409992218018, + "learning_rate": 0.00012502459911952371, + "loss": 0.7911, + "step": 9437 + }, + { + "epoch": 1.6803774928774928, + "grad_norm": 0.7410423159599304, + "learning_rate": 0.0001250110467697768, + "loss": 1.1041, + "step": 9438 + }, + { + "epoch": 1.6805555555555556, + "grad_norm": 0.6185283064842224, + "learning_rate": 0.00012499749392999045, + "loss": 0.8101, + "step": 9439 + }, + { + "epoch": 1.680733618233618, + "grad_norm": 0.6988311409950256, + "learning_rate": 
0.0001249839406004301, + "loss": 0.8579, + "step": 9440 + }, + { + "epoch": 1.6809116809116809, + "grad_norm": 0.5588746070861816, + "learning_rate": 0.00012497038678136132, + "loss": 0.8035, + "step": 9441 + }, + { + "epoch": 1.6810897435897436, + "grad_norm": 0.6568905711174011, + "learning_rate": 0.0001249568324730497, + "loss": 0.7455, + "step": 9442 + }, + { + "epoch": 1.6812678062678064, + "grad_norm": 0.6924821138381958, + "learning_rate": 0.00012494327767576078, + "loss": 1.134, + "step": 9443 + }, + { + "epoch": 1.681445868945869, + "grad_norm": 0.6940170526504517, + "learning_rate": 0.00012492972238976018, + "loss": 0.9719, + "step": 9444 + }, + { + "epoch": 1.6816239316239316, + "grad_norm": 0.667465090751648, + "learning_rate": 0.00012491616661531343, + "loss": 0.953, + "step": 9445 + }, + { + "epoch": 1.6818019943019942, + "grad_norm": 0.7693275809288025, + "learning_rate": 0.00012490261035268612, + "loss": 1.1342, + "step": 9446 + }, + { + "epoch": 1.681980056980057, + "grad_norm": 0.7243115305900574, + "learning_rate": 0.00012488905360214393, + "loss": 1.1847, + "step": 9447 + }, + { + "epoch": 1.6821581196581197, + "grad_norm": 0.657357931137085, + "learning_rate": 0.00012487549636395245, + "loss": 0.8747, + "step": 9448 + }, + { + "epoch": 1.6823361823361824, + "grad_norm": 0.7471592426300049, + "learning_rate": 0.00012486193863837727, + "loss": 1.0472, + "step": 9449 + }, + { + "epoch": 1.6825142450142452, + "grad_norm": 0.7476530075073242, + "learning_rate": 0.00012484838042568406, + "loss": 1.0708, + "step": 9450 + }, + { + "epoch": 1.6826923076923077, + "grad_norm": 0.6031121611595154, + "learning_rate": 0.00012483482172613846, + "loss": 0.8243, + "step": 9451 + }, + { + "epoch": 1.6828703703703702, + "grad_norm": 0.6733492016792297, + "learning_rate": 0.00012482126254000607, + "loss": 0.7808, + "step": 9452 + }, + { + "epoch": 1.683048433048433, + "grad_norm": 0.5865318179130554, + "learning_rate": 0.00012480770286755265, + "loss": 0.829, + 
"step": 9453 + }, + { + "epoch": 1.6832264957264957, + "grad_norm": 0.6805713772773743, + "learning_rate": 0.0001247941427090438, + "loss": 0.7206, + "step": 9454 + }, + { + "epoch": 1.6834045584045585, + "grad_norm": 0.6514836549758911, + "learning_rate": 0.0001247805820647453, + "loss": 0.9499, + "step": 9455 + }, + { + "epoch": 1.6835826210826212, + "grad_norm": 0.7432990074157715, + "learning_rate": 0.0001247670209349227, + "loss": 1.1324, + "step": 9456 + }, + { + "epoch": 1.6837606837606838, + "grad_norm": 0.6348414421081543, + "learning_rate": 0.00012475345931984178, + "loss": 0.8246, + "step": 9457 + }, + { + "epoch": 1.6839387464387463, + "grad_norm": 0.7194374203681946, + "learning_rate": 0.00012473989721976825, + "loss": 0.9634, + "step": 9458 + }, + { + "epoch": 1.684116809116809, + "grad_norm": 0.7869647741317749, + "learning_rate": 0.00012472633463496785, + "loss": 1.2115, + "step": 9459 + }, + { + "epoch": 1.6842948717948718, + "grad_norm": 0.6672070026397705, + "learning_rate": 0.00012471277156570623, + "loss": 0.9842, + "step": 9460 + }, + { + "epoch": 1.6844729344729346, + "grad_norm": 0.6611466407775879, + "learning_rate": 0.00012469920801224925, + "loss": 0.9343, + "step": 9461 + }, + { + "epoch": 1.6846509971509973, + "grad_norm": 0.6715068221092224, + "learning_rate": 0.0001246856439748626, + "loss": 0.6852, + "step": 9462 + }, + { + "epoch": 1.6848290598290598, + "grad_norm": 0.641942024230957, + "learning_rate": 0.00012467207945381198, + "loss": 0.8863, + "step": 9463 + }, + { + "epoch": 1.6850071225071224, + "grad_norm": 0.8414762616157532, + "learning_rate": 0.00012465851444936325, + "loss": 1.3404, + "step": 9464 + }, + { + "epoch": 1.6851851851851851, + "grad_norm": 0.715752363204956, + "learning_rate": 0.00012464494896178216, + "loss": 1.123, + "step": 9465 + }, + { + "epoch": 1.6853632478632479, + "grad_norm": 0.6913973093032837, + "learning_rate": 0.00012463138299133447, + "loss": 1.0659, + "step": 9466 + }, + { + "epoch": 
1.6855413105413106, + "grad_norm": 0.6998484134674072, + "learning_rate": 0.000124617816538286, + "loss": 1.0555, + "step": 9467 + }, + { + "epoch": 1.6857193732193734, + "grad_norm": 0.7313308119773865, + "learning_rate": 0.00012460424960290256, + "loss": 1.0915, + "step": 9468 + }, + { + "epoch": 1.685897435897436, + "grad_norm": 0.6790569424629211, + "learning_rate": 0.00012459068218544995, + "loss": 1.0214, + "step": 9469 + }, + { + "epoch": 1.6860754985754984, + "grad_norm": 0.6494466662406921, + "learning_rate": 0.00012457711428619402, + "loss": 0.9476, + "step": 9470 + }, + { + "epoch": 1.6862535612535612, + "grad_norm": 0.8048526048660278, + "learning_rate": 0.0001245635459054006, + "loss": 1.1852, + "step": 9471 + }, + { + "epoch": 1.686431623931624, + "grad_norm": 0.6237879395484924, + "learning_rate": 0.0001245499770433355, + "loss": 1.0106, + "step": 9472 + }, + { + "epoch": 1.6866096866096867, + "grad_norm": 0.6282906532287598, + "learning_rate": 0.0001245364077002646, + "loss": 0.9858, + "step": 9473 + }, + { + "epoch": 1.6867877492877494, + "grad_norm": 0.7239370346069336, + "learning_rate": 0.00012452283787645375, + "loss": 0.9586, + "step": 9474 + }, + { + "epoch": 1.686965811965812, + "grad_norm": 0.6438776850700378, + "learning_rate": 0.00012450926757216887, + "loss": 0.9198, + "step": 9475 + }, + { + "epoch": 1.6871438746438745, + "grad_norm": 0.6451360583305359, + "learning_rate": 0.00012449569678767578, + "loss": 1.0183, + "step": 9476 + }, + { + "epoch": 1.6873219373219372, + "grad_norm": 0.6950216293334961, + "learning_rate": 0.0001244821255232404, + "loss": 0.9048, + "step": 9477 + }, + { + "epoch": 1.6875, + "grad_norm": 0.710489809513092, + "learning_rate": 0.00012446855377912865, + "loss": 1.1596, + "step": 9478 + }, + { + "epoch": 1.6876780626780628, + "grad_norm": 0.6819305419921875, + "learning_rate": 0.0001244549815556064, + "loss": 0.8486, + "step": 9479 + }, + { + "epoch": 1.6878561253561255, + "grad_norm": 0.7185879945755005, + 
"learning_rate": 0.00012444140885293958, + "loss": 0.9539, + "step": 9480 + }, + { + "epoch": 1.688034188034188, + "grad_norm": 0.8181464672088623, + "learning_rate": 0.00012442783567139415, + "loss": 1.0038, + "step": 9481 + }, + { + "epoch": 1.6882122507122506, + "grad_norm": 0.47161349654197693, + "learning_rate": 0.000124414262011236, + "loss": 0.67, + "step": 9482 + }, + { + "epoch": 1.6883903133903133, + "grad_norm": 0.7752482295036316, + "learning_rate": 0.00012440068787273112, + "loss": 0.9944, + "step": 9483 + }, + { + "epoch": 1.688568376068376, + "grad_norm": 0.7119397521018982, + "learning_rate": 0.00012438711325614543, + "loss": 0.9098, + "step": 9484 + }, + { + "epoch": 1.6887464387464388, + "grad_norm": 0.7161153554916382, + "learning_rate": 0.00012437353816174493, + "loss": 1.0003, + "step": 9485 + }, + { + "epoch": 1.6889245014245016, + "grad_norm": 0.5989507436752319, + "learning_rate": 0.0001243599625897956, + "loss": 1.0301, + "step": 9486 + }, + { + "epoch": 1.689102564102564, + "grad_norm": 0.7906841039657593, + "learning_rate": 0.00012434638654056334, + "loss": 1.0388, + "step": 9487 + }, + { + "epoch": 1.6892806267806266, + "grad_norm": 0.6679551601409912, + "learning_rate": 0.00012433281001431428, + "loss": 0.9505, + "step": 9488 + }, + { + "epoch": 1.6894586894586894, + "grad_norm": 0.7090578675270081, + "learning_rate": 0.0001243192330113143, + "loss": 0.8616, + "step": 9489 + }, + { + "epoch": 1.6896367521367521, + "grad_norm": 0.6401308178901672, + "learning_rate": 0.00012430565553182949, + "loss": 0.9099, + "step": 9490 + }, + { + "epoch": 1.6898148148148149, + "grad_norm": 0.7360149621963501, + "learning_rate": 0.00012429207757612586, + "loss": 1.0233, + "step": 9491 + }, + { + "epoch": 1.6899928774928776, + "grad_norm": 0.6736137270927429, + "learning_rate": 0.00012427849914446946, + "loss": 0.9803, + "step": 9492 + }, + { + "epoch": 1.6901709401709402, + "grad_norm": 0.7728668451309204, + "learning_rate": 0.00012426492023712623, + 
"loss": 1.2316, + "step": 9493 + }, + { + "epoch": 1.6903490028490027, + "grad_norm": 0.789718508720398, + "learning_rate": 0.00012425134085436234, + "loss": 1.1218, + "step": 9494 + }, + { + "epoch": 1.6905270655270654, + "grad_norm": 0.7314121723175049, + "learning_rate": 0.0001242377609964438, + "loss": 1.1294, + "step": 9495 + }, + { + "epoch": 1.6907051282051282, + "grad_norm": 0.7222046256065369, + "learning_rate": 0.0001242241806636367, + "loss": 1.0288, + "step": 9496 + }, + { + "epoch": 1.690883190883191, + "grad_norm": 0.7546363472938538, + "learning_rate": 0.00012421059985620708, + "loss": 0.8781, + "step": 9497 + }, + { + "epoch": 1.6910612535612537, + "grad_norm": 0.7502550482749939, + "learning_rate": 0.00012419701857442104, + "loss": 0.927, + "step": 9498 + }, + { + "epoch": 1.6912393162393162, + "grad_norm": 0.6244059205055237, + "learning_rate": 0.00012418343681854473, + "loss": 0.9689, + "step": 9499 + }, + { + "epoch": 1.6914173789173788, + "grad_norm": 0.7214263677597046, + "learning_rate": 0.00012416985458884417, + "loss": 1.0842, + "step": 9500 + }, + { + "epoch": 1.6915954415954415, + "grad_norm": 0.6960242390632629, + "learning_rate": 0.00012415627188558555, + "loss": 0.9766, + "step": 9501 + }, + { + "epoch": 1.6917735042735043, + "grad_norm": 0.6687830686569214, + "learning_rate": 0.00012414268870903494, + "loss": 1.0222, + "step": 9502 + }, + { + "epoch": 1.691951566951567, + "grad_norm": 0.8611155152320862, + "learning_rate": 0.00012412910505945848, + "loss": 1.1792, + "step": 9503 + }, + { + "epoch": 1.6921296296296298, + "grad_norm": 0.6655587553977966, + "learning_rate": 0.00012411552093712235, + "loss": 0.8763, + "step": 9504 + }, + { + "epoch": 1.6923076923076923, + "grad_norm": 0.7829837799072266, + "learning_rate": 0.00012410193634229268, + "loss": 1.0803, + "step": 9505 + }, + { + "epoch": 1.6924857549857548, + "grad_norm": 0.7951042652130127, + "learning_rate": 0.00012408835127523566, + "loss": 1.0925, + "step": 9506 + }, + { + 
"epoch": 1.6926638176638176, + "grad_norm": 0.715495228767395, + "learning_rate": 0.0001240747657362174, + "loss": 1.2411, + "step": 9507 + }, + { + "epoch": 1.6928418803418803, + "grad_norm": 0.6779513359069824, + "learning_rate": 0.00012406117972550414, + "loss": 0.8886, + "step": 9508 + }, + { + "epoch": 1.693019943019943, + "grad_norm": 0.647588312625885, + "learning_rate": 0.00012404759324336203, + "loss": 1.107, + "step": 9509 + }, + { + "epoch": 1.6931980056980058, + "grad_norm": 0.7398989796638489, + "learning_rate": 0.00012403400629005726, + "loss": 1.0256, + "step": 9510 + }, + { + "epoch": 1.6933760683760684, + "grad_norm": 0.7572638392448425, + "learning_rate": 0.0001240204188658561, + "loss": 0.9662, + "step": 9511 + }, + { + "epoch": 1.693554131054131, + "grad_norm": 0.7044163346290588, + "learning_rate": 0.00012400683097102473, + "loss": 1.1388, + "step": 9512 + }, + { + "epoch": 1.6937321937321936, + "grad_norm": 0.7889094948768616, + "learning_rate": 0.00012399324260582936, + "loss": 1.0453, + "step": 9513 + }, + { + "epoch": 1.6939102564102564, + "grad_norm": 0.7977854609489441, + "learning_rate": 0.00012397965377053627, + "loss": 1.015, + "step": 9514 + }, + { + "epoch": 1.6940883190883191, + "grad_norm": 0.6223814487457275, + "learning_rate": 0.00012396606446541165, + "loss": 0.7985, + "step": 9515 + }, + { + "epoch": 1.694266381766382, + "grad_norm": 0.8307462334632874, + "learning_rate": 0.0001239524746907218, + "loss": 0.8899, + "step": 9516 + }, + { + "epoch": 1.6944444444444444, + "grad_norm": 0.7780544757843018, + "learning_rate": 0.00012393888444673295, + "loss": 0.9406, + "step": 9517 + }, + { + "epoch": 1.6946225071225072, + "grad_norm": 0.6894499659538269, + "learning_rate": 0.0001239252937337114, + "loss": 0.9412, + "step": 9518 + }, + { + "epoch": 1.6948005698005697, + "grad_norm": 0.7000680565834045, + "learning_rate": 0.00012391170255192342, + "loss": 1.0314, + "step": 9519 + }, + { + "epoch": 1.6949786324786325, + "grad_norm": 
0.6772416830062866, + "learning_rate": 0.0001238981109016353, + "loss": 0.9153, + "step": 9520 + }, + { + "epoch": 1.6951566951566952, + "grad_norm": 0.7069609761238098, + "learning_rate": 0.00012388451878311333, + "loss": 1.1777, + "step": 9521 + }, + { + "epoch": 1.695334757834758, + "grad_norm": 0.6138432621955872, + "learning_rate": 0.00012387092619662386, + "loss": 0.8085, + "step": 9522 + }, + { + "epoch": 1.6955128205128205, + "grad_norm": 0.6122859716415405, + "learning_rate": 0.00012385733314243313, + "loss": 0.8534, + "step": 9523 + }, + { + "epoch": 1.6956908831908832, + "grad_norm": 0.7499903440475464, + "learning_rate": 0.00012384373962080755, + "loss": 0.9329, + "step": 9524 + }, + { + "epoch": 1.6958689458689458, + "grad_norm": 0.6413441896438599, + "learning_rate": 0.00012383014563201343, + "loss": 0.9609, + "step": 9525 + }, + { + "epoch": 1.6960470085470085, + "grad_norm": 0.7467969059944153, + "learning_rate": 0.0001238165511763171, + "loss": 0.9142, + "step": 9526 + }, + { + "epoch": 1.6962250712250713, + "grad_norm": 0.6540884375572205, + "learning_rate": 0.00012380295625398494, + "loss": 0.9503, + "step": 9527 + }, + { + "epoch": 1.696403133903134, + "grad_norm": 0.6298567652702332, + "learning_rate": 0.00012378936086528326, + "loss": 0.8853, + "step": 9528 + }, + { + "epoch": 1.6965811965811965, + "grad_norm": 0.8003417253494263, + "learning_rate": 0.00012377576501047845, + "loss": 0.969, + "step": 9529 + }, + { + "epoch": 1.6967592592592593, + "grad_norm": 0.8318493962287903, + "learning_rate": 0.00012376216868983697, + "loss": 1.1413, + "step": 9530 + }, + { + "epoch": 1.6969373219373218, + "grad_norm": 0.8294426202774048, + "learning_rate": 0.00012374857190362515, + "loss": 1.1885, + "step": 9531 + }, + { + "epoch": 1.6971153846153846, + "grad_norm": 0.7502955198287964, + "learning_rate": 0.0001237349746521094, + "loss": 1.233, + "step": 9532 + }, + { + "epoch": 1.6972934472934473, + "grad_norm": 0.6306588649749756, + "learning_rate": 
0.00012372137693555612, + "loss": 1.2255, + "step": 9533 + }, + { + "epoch": 1.69747150997151, + "grad_norm": 0.7802746891975403, + "learning_rate": 0.0001237077787542317, + "loss": 1.2054, + "step": 9534 + }, + { + "epoch": 1.6976495726495726, + "grad_norm": 0.685114860534668, + "learning_rate": 0.00012369418010840265, + "loss": 0.9865, + "step": 9535 + }, + { + "epoch": 1.6978276353276354, + "grad_norm": 0.6656857132911682, + "learning_rate": 0.00012368058099833536, + "loss": 1.1579, + "step": 9536 + }, + { + "epoch": 1.698005698005698, + "grad_norm": 0.6596674919128418, + "learning_rate": 0.00012366698142429625, + "loss": 0.9104, + "step": 9537 + }, + { + "epoch": 1.6981837606837606, + "grad_norm": 0.6025584936141968, + "learning_rate": 0.00012365338138655183, + "loss": 1.117, + "step": 9538 + }, + { + "epoch": 1.6983618233618234, + "grad_norm": 0.671585202217102, + "learning_rate": 0.0001236397808853685, + "loss": 1.0271, + "step": 9539 + }, + { + "epoch": 1.6985398860398861, + "grad_norm": 0.7467984557151794, + "learning_rate": 0.0001236261799210128, + "loss": 1.0411, + "step": 9540 + }, + { + "epoch": 1.6987179487179487, + "grad_norm": 0.6251640915870667, + "learning_rate": 0.0001236125784937512, + "loss": 0.7154, + "step": 9541 + }, + { + "epoch": 1.6988960113960114, + "grad_norm": 0.7560956478118896, + "learning_rate": 0.00012359897660385016, + "loss": 1.0048, + "step": 9542 + }, + { + "epoch": 1.699074074074074, + "grad_norm": 0.6144903302192688, + "learning_rate": 0.00012358537425157618, + "loss": 1.1294, + "step": 9543 + }, + { + "epoch": 1.6992521367521367, + "grad_norm": 0.7839425206184387, + "learning_rate": 0.00012357177143719578, + "loss": 1.0725, + "step": 9544 + }, + { + "epoch": 1.6994301994301995, + "grad_norm": 0.6488651037216187, + "learning_rate": 0.00012355816816097553, + "loss": 0.9267, + "step": 9545 + }, + { + "epoch": 1.6996082621082622, + "grad_norm": 0.6848782896995544, + "learning_rate": 0.00012354456442318187, + "loss": 1.0426, + 
"step": 9546 + }, + { + "epoch": 1.6997863247863247, + "grad_norm": 0.7164611220359802, + "learning_rate": 0.0001235309602240814, + "loss": 0.8208, + "step": 9547 + }, + { + "epoch": 1.6999643874643875, + "grad_norm": 0.6725530624389648, + "learning_rate": 0.0001235173555639406, + "loss": 0.9366, + "step": 9548 + }, + { + "epoch": 1.70014245014245, + "grad_norm": 0.6958004236221313, + "learning_rate": 0.00012350375044302612, + "loss": 1.0185, + "step": 9549 + }, + { + "epoch": 1.7003205128205128, + "grad_norm": 0.8035947680473328, + "learning_rate": 0.00012349014486160445, + "loss": 1.065, + "step": 9550 + }, + { + "epoch": 1.7004985754985755, + "grad_norm": 0.6705633997917175, + "learning_rate": 0.00012347653881994222, + "loss": 0.8381, + "step": 9551 + }, + { + "epoch": 1.7006766381766383, + "grad_norm": 0.6652300357818604, + "learning_rate": 0.00012346293231830596, + "loss": 1.1428, + "step": 9552 + }, + { + "epoch": 1.7008547008547008, + "grad_norm": 0.6719335913658142, + "learning_rate": 0.0001234493253569623, + "loss": 1.0138, + "step": 9553 + }, + { + "epoch": 1.7010327635327636, + "grad_norm": 0.746981680393219, + "learning_rate": 0.0001234357179361778, + "loss": 1.1169, + "step": 9554 + }, + { + "epoch": 1.701210826210826, + "grad_norm": 0.6768170595169067, + "learning_rate": 0.0001234221100562191, + "loss": 0.9065, + "step": 9555 + }, + { + "epoch": 1.7013888888888888, + "grad_norm": 0.7127171754837036, + "learning_rate": 0.00012340850171735278, + "loss": 0.9467, + "step": 9556 + }, + { + "epoch": 1.7015669515669516, + "grad_norm": 0.6802694797515869, + "learning_rate": 0.00012339489291984554, + "loss": 0.8938, + "step": 9557 + }, + { + "epoch": 1.7017450142450143, + "grad_norm": 0.7101455926895142, + "learning_rate": 0.00012338128366396394, + "loss": 1.1939, + "step": 9558 + }, + { + "epoch": 1.7019230769230769, + "grad_norm": 0.621223509311676, + "learning_rate": 0.00012336767394997467, + "loss": 0.7583, + "step": 9559 + }, + { + "epoch": 
1.7021011396011396, + "grad_norm": 0.7130763530731201, + "learning_rate": 0.00012335406377814439, + "loss": 0.8684, + "step": 9560 + }, + { + "epoch": 1.7022792022792022, + "grad_norm": 0.6761086583137512, + "learning_rate": 0.00012334045314873972, + "loss": 1.0197, + "step": 9561 + }, + { + "epoch": 1.702457264957265, + "grad_norm": 0.7030459642410278, + "learning_rate": 0.00012332684206202736, + "loss": 0.8627, + "step": 9562 + }, + { + "epoch": 1.7026353276353277, + "grad_norm": 0.6278037428855896, + "learning_rate": 0.000123313230518274, + "loss": 0.8953, + "step": 9563 + }, + { + "epoch": 1.7028133903133904, + "grad_norm": 0.6450623869895935, + "learning_rate": 0.00012329961851774627, + "loss": 0.8826, + "step": 9564 + }, + { + "epoch": 1.702991452991453, + "grad_norm": 0.7324244976043701, + "learning_rate": 0.00012328600606071097, + "loss": 1.0133, + "step": 9565 + }, + { + "epoch": 1.7031695156695157, + "grad_norm": 0.6560033559799194, + "learning_rate": 0.00012327239314743473, + "loss": 0.9601, + "step": 9566 + }, + { + "epoch": 1.7033475783475782, + "grad_norm": 0.6693514585494995, + "learning_rate": 0.0001232587797781843, + "loss": 0.9447, + "step": 9567 + }, + { + "epoch": 1.703525641025641, + "grad_norm": 0.6403199434280396, + "learning_rate": 0.00012324516595322638, + "loss": 0.8554, + "step": 9568 + }, + { + "epoch": 1.7037037037037037, + "grad_norm": 0.8290280103683472, + "learning_rate": 0.00012323155167282774, + "loss": 1.1877, + "step": 9569 + }, + { + "epoch": 1.7038817663817665, + "grad_norm": 0.7207778692245483, + "learning_rate": 0.00012321793693725509, + "loss": 1.0978, + "step": 9570 + }, + { + "epoch": 1.7040598290598292, + "grad_norm": 0.8794265985488892, + "learning_rate": 0.00012320432174677519, + "loss": 0.9387, + "step": 9571 + }, + { + "epoch": 1.7042378917378918, + "grad_norm": 0.6683359146118164, + "learning_rate": 0.00012319070610165484, + "loss": 0.9227, + "step": 9572 + }, + { + "epoch": 1.7044159544159543, + "grad_norm": 
0.7342001795768738, + "learning_rate": 0.00012317709000216076, + "loss": 0.9453, + "step": 9573 + }, + { + "epoch": 1.704594017094017, + "grad_norm": 0.6315770149230957, + "learning_rate": 0.00012316347344855973, + "loss": 0.8263, + "step": 9574 + }, + { + "epoch": 1.7047720797720798, + "grad_norm": 0.7697155475616455, + "learning_rate": 0.00012314985644111857, + "loss": 1.0238, + "step": 9575 + }, + { + "epoch": 1.7049501424501425, + "grad_norm": 0.6674068570137024, + "learning_rate": 0.00012313623898010408, + "loss": 1.0823, + "step": 9576 + }, + { + "epoch": 1.7051282051282053, + "grad_norm": 0.6995484232902527, + "learning_rate": 0.00012312262106578304, + "loss": 1.2001, + "step": 9577 + }, + { + "epoch": 1.7053062678062678, + "grad_norm": 0.7639257907867432, + "learning_rate": 0.00012310900269842226, + "loss": 1.3438, + "step": 9578 + }, + { + "epoch": 1.7054843304843303, + "grad_norm": 0.6486390233039856, + "learning_rate": 0.00012309538387828857, + "loss": 0.9924, + "step": 9579 + }, + { + "epoch": 1.705662393162393, + "grad_norm": 0.6737813949584961, + "learning_rate": 0.00012308176460564885, + "loss": 0.8722, + "step": 9580 + }, + { + "epoch": 1.7058404558404558, + "grad_norm": 0.6462090611457825, + "learning_rate": 0.00012306814488076987, + "loss": 1.1013, + "step": 9581 + }, + { + "epoch": 1.7060185185185186, + "grad_norm": 0.7887832522392273, + "learning_rate": 0.00012305452470391852, + "loss": 0.9998, + "step": 9582 + }, + { + "epoch": 1.7061965811965814, + "grad_norm": 0.6345070004463196, + "learning_rate": 0.00012304090407536165, + "loss": 1.0305, + "step": 9583 + }, + { + "epoch": 1.7063746438746439, + "grad_norm": 0.6398460268974304, + "learning_rate": 0.0001230272829953661, + "loss": 1.2243, + "step": 9584 + }, + { + "epoch": 1.7065527065527064, + "grad_norm": 0.6501944065093994, + "learning_rate": 0.00012301366146419879, + "loss": 0.9425, + "step": 9585 + }, + { + "epoch": 1.7067307692307692, + "grad_norm": 0.6406761407852173, + "learning_rate": 
0.00012300003948212661, + "loss": 0.948, + "step": 9586 + }, + { + "epoch": 1.706908831908832, + "grad_norm": 0.7114266157150269, + "learning_rate": 0.00012298641704941644, + "loss": 1.1291, + "step": 9587 + }, + { + "epoch": 1.7070868945868947, + "grad_norm": 0.6653099656105042, + "learning_rate": 0.00012297279416633515, + "loss": 1.0156, + "step": 9588 + }, + { + "epoch": 1.7072649572649574, + "grad_norm": 0.5970917344093323, + "learning_rate": 0.0001229591708331497, + "loss": 0.9424, + "step": 9589 + }, + { + "epoch": 1.70744301994302, + "grad_norm": 0.6861461400985718, + "learning_rate": 0.00012294554705012694, + "loss": 0.7581, + "step": 9590 + }, + { + "epoch": 1.7076210826210825, + "grad_norm": 0.6930568218231201, + "learning_rate": 0.00012293192281753393, + "loss": 1.0544, + "step": 9591 + }, + { + "epoch": 1.7077991452991452, + "grad_norm": 0.7420656085014343, + "learning_rate": 0.00012291829813563748, + "loss": 0.7092, + "step": 9592 + }, + { + "epoch": 1.707977207977208, + "grad_norm": 0.6607801914215088, + "learning_rate": 0.0001229046730047046, + "loss": 0.5544, + "step": 9593 + }, + { + "epoch": 1.7081552706552707, + "grad_norm": 0.8419139385223389, + "learning_rate": 0.00012289104742500224, + "loss": 1.0443, + "step": 9594 + }, + { + "epoch": 1.7083333333333335, + "grad_norm": 0.6774617433547974, + "learning_rate": 0.00012287742139679734, + "loss": 1.0098, + "step": 9595 + }, + { + "epoch": 1.708511396011396, + "grad_norm": 0.7517698407173157, + "learning_rate": 0.0001228637949203569, + "loss": 1.1145, + "step": 9596 + }, + { + "epoch": 1.7086894586894585, + "grad_norm": 0.6048635840415955, + "learning_rate": 0.00012285016799594791, + "loss": 0.7398, + "step": 9597 + }, + { + "epoch": 1.7088675213675213, + "grad_norm": 0.8054425716400146, + "learning_rate": 0.00012283654062383734, + "loss": 1.0893, + "step": 9598 + }, + { + "epoch": 1.709045584045584, + "grad_norm": 0.8694897294044495, + "learning_rate": 0.0001228229128042922, + "loss": 1.2366, + 
"step": 9599 + }, + { + "epoch": 1.7092236467236468, + "grad_norm": 0.7460638880729675, + "learning_rate": 0.00012280928453757946, + "loss": 1.1753, + "step": 9600 + }, + { + "epoch": 1.7094017094017095, + "grad_norm": 0.6714958548545837, + "learning_rate": 0.00012279565582396618, + "loss": 1.0473, + "step": 9601 + }, + { + "epoch": 1.709579772079772, + "grad_norm": 0.6893340945243835, + "learning_rate": 0.00012278202666371937, + "loss": 1.2761, + "step": 9602 + }, + { + "epoch": 1.7097578347578346, + "grad_norm": 0.6816153526306152, + "learning_rate": 0.00012276839705710612, + "loss": 0.991, + "step": 9603 + }, + { + "epoch": 1.7099358974358974, + "grad_norm": 0.6961633563041687, + "learning_rate": 0.0001227547670043934, + "loss": 1.0634, + "step": 9604 + }, + { + "epoch": 1.71011396011396, + "grad_norm": 0.643734872341156, + "learning_rate": 0.0001227411365058483, + "loss": 0.8672, + "step": 9605 + }, + { + "epoch": 1.7102920227920229, + "grad_norm": 0.7313315272331238, + "learning_rate": 0.00012272750556173784, + "loss": 1.1152, + "step": 9606 + }, + { + "epoch": 1.7104700854700856, + "grad_norm": 0.6464954614639282, + "learning_rate": 0.00012271387417232916, + "loss": 0.8798, + "step": 9607 + }, + { + "epoch": 1.7106481481481481, + "grad_norm": 0.8365204334259033, + "learning_rate": 0.00012270024233788929, + "loss": 1.213, + "step": 9608 + }, + { + "epoch": 1.7108262108262107, + "grad_norm": 0.6460705995559692, + "learning_rate": 0.0001226866100586853, + "loss": 0.9232, + "step": 9609 + }, + { + "epoch": 1.7110042735042734, + "grad_norm": 0.6446022987365723, + "learning_rate": 0.00012267297733498434, + "loss": 0.8295, + "step": 9610 + }, + { + "epoch": 1.7111823361823362, + "grad_norm": 0.7692012190818787, + "learning_rate": 0.00012265934416705345, + "loss": 1.0715, + "step": 9611 + }, + { + "epoch": 1.711360398860399, + "grad_norm": 0.671154260635376, + "learning_rate": 0.0001226457105551598, + "loss": 0.9752, + "step": 9612 + }, + { + "epoch": 
1.7115384615384617, + "grad_norm": 0.6525935530662537, + "learning_rate": 0.00012263207649957053, + "loss": 1.09, + "step": 9613 + }, + { + "epoch": 1.7117165242165242, + "grad_norm": 0.6984749436378479, + "learning_rate": 0.0001226184420005527, + "loss": 0.9956, + "step": 9614 + }, + { + "epoch": 1.7118945868945867, + "grad_norm": 0.6769809126853943, + "learning_rate": 0.0001226048070583735, + "loss": 1.0151, + "step": 9615 + }, + { + "epoch": 1.7120726495726495, + "grad_norm": 0.6085978746414185, + "learning_rate": 0.00012259117167330005, + "loss": 0.8706, + "step": 9616 + }, + { + "epoch": 1.7122507122507122, + "grad_norm": 0.7335749268531799, + "learning_rate": 0.00012257753584559952, + "loss": 1.0575, + "step": 9617 + }, + { + "epoch": 1.712428774928775, + "grad_norm": 0.7392038106918335, + "learning_rate": 0.0001225638995755391, + "loss": 0.8763, + "step": 9618 + }, + { + "epoch": 1.7126068376068377, + "grad_norm": 0.6708608865737915, + "learning_rate": 0.00012255026286338592, + "loss": 1.131, + "step": 9619 + }, + { + "epoch": 1.7127849002849003, + "grad_norm": 0.726657509803772, + "learning_rate": 0.0001225366257094072, + "loss": 1.0569, + "step": 9620 + }, + { + "epoch": 1.7129629629629628, + "grad_norm": 0.749098002910614, + "learning_rate": 0.0001225229881138701, + "loss": 0.9196, + "step": 9621 + }, + { + "epoch": 1.7131410256410255, + "grad_norm": 0.6550580263137817, + "learning_rate": 0.00012250935007704182, + "loss": 1.0244, + "step": 9622 + }, + { + "epoch": 1.7133190883190883, + "grad_norm": 0.7714282274246216, + "learning_rate": 0.00012249571159918962, + "loss": 1.1025, + "step": 9623 + }, + { + "epoch": 1.713497150997151, + "grad_norm": 0.7869850397109985, + "learning_rate": 0.00012248207268058064, + "loss": 0.9238, + "step": 9624 + }, + { + "epoch": 1.7136752136752138, + "grad_norm": 0.7187856435775757, + "learning_rate": 0.00012246843332148216, + "loss": 1.081, + "step": 9625 + }, + { + "epoch": 1.7138532763532763, + "grad_norm": 
0.6634210348129272, + "learning_rate": 0.00012245479352216142, + "loss": 1.1944, + "step": 9626 + }, + { + "epoch": 1.714031339031339, + "grad_norm": 0.6609212160110474, + "learning_rate": 0.00012244115328288567, + "loss": 0.9613, + "step": 9627 + }, + { + "epoch": 1.7142094017094016, + "grad_norm": 0.7906867861747742, + "learning_rate": 0.0001224275126039221, + "loss": 1.2692, + "step": 9628 + }, + { + "epoch": 1.7143874643874644, + "grad_norm": 0.8037096858024597, + "learning_rate": 0.000122413871485538, + "loss": 0.9823, + "step": 9629 + }, + { + "epoch": 1.7145655270655271, + "grad_norm": 0.7740145921707153, + "learning_rate": 0.00012240022992800068, + "loss": 1.1937, + "step": 9630 + }, + { + "epoch": 1.7147435897435899, + "grad_norm": 0.595372200012207, + "learning_rate": 0.00012238658793157738, + "loss": 0.9153, + "step": 9631 + }, + { + "epoch": 1.7149216524216524, + "grad_norm": 0.6671900749206543, + "learning_rate": 0.0001223729454965354, + "loss": 1.0895, + "step": 9632 + }, + { + "epoch": 1.7150997150997151, + "grad_norm": 0.5805774927139282, + "learning_rate": 0.000122359302623142, + "loss": 1.0001, + "step": 9633 + }, + { + "epoch": 1.7152777777777777, + "grad_norm": 0.8851602673530579, + "learning_rate": 0.00012234565931166456, + "loss": 1.2828, + "step": 9634 + }, + { + "epoch": 1.7154558404558404, + "grad_norm": 0.6960011720657349, + "learning_rate": 0.0001223320155623703, + "loss": 1.0622, + "step": 9635 + }, + { + "epoch": 1.7156339031339032, + "grad_norm": 0.5587009191513062, + "learning_rate": 0.0001223183713755266, + "loss": 0.83, + "step": 9636 + }, + { + "epoch": 1.715811965811966, + "grad_norm": 0.6892730593681335, + "learning_rate": 0.00012230472675140076, + "loss": 0.9214, + "step": 9637 + }, + { + "epoch": 1.7159900284900285, + "grad_norm": 0.6545090079307556, + "learning_rate": 0.00012229108169026017, + "loss": 0.829, + "step": 9638 + }, + { + "epoch": 1.7161680911680912, + "grad_norm": 0.6539101600646973, + "learning_rate": 
0.00012227743619237213, + "loss": 1.0686, + "step": 9639 + }, + { + "epoch": 1.7163461538461537, + "grad_norm": 0.5887274146080017, + "learning_rate": 0.000122263790258004, + "loss": 0.9285, + "step": 9640 + }, + { + "epoch": 1.7165242165242165, + "grad_norm": 0.6328918933868408, + "learning_rate": 0.00012225014388742313, + "loss": 0.9684, + "step": 9641 + }, + { + "epoch": 1.7167022792022792, + "grad_norm": 0.6377436518669128, + "learning_rate": 0.00012223649708089694, + "loss": 0.9425, + "step": 9642 + }, + { + "epoch": 1.716880341880342, + "grad_norm": 0.6967392563819885, + "learning_rate": 0.00012222284983869275, + "loss": 0.9342, + "step": 9643 + }, + { + "epoch": 1.7170584045584045, + "grad_norm": 0.7051317691802979, + "learning_rate": 0.00012220920216107802, + "loss": 1.1843, + "step": 9644 + }, + { + "epoch": 1.7172364672364673, + "grad_norm": 0.6864503622055054, + "learning_rate": 0.00012219555404832007, + "loss": 1.0371, + "step": 9645 + }, + { + "epoch": 1.7174145299145298, + "grad_norm": 0.583454430103302, + "learning_rate": 0.00012218190550068638, + "loss": 0.6774, + "step": 9646 + }, + { + "epoch": 1.7175925925925926, + "grad_norm": 0.6755677461624146, + "learning_rate": 0.0001221682565184443, + "loss": 0.9517, + "step": 9647 + }, + { + "epoch": 1.7177706552706553, + "grad_norm": 0.7230031490325928, + "learning_rate": 0.0001221546071018613, + "loss": 1.0385, + "step": 9648 + }, + { + "epoch": 1.717948717948718, + "grad_norm": 0.7381200194358826, + "learning_rate": 0.0001221409572512048, + "loss": 0.9893, + "step": 9649 + }, + { + "epoch": 1.7181267806267806, + "grad_norm": 0.7079094648361206, + "learning_rate": 0.0001221273069667422, + "loss": 0.7793, + "step": 9650 + }, + { + "epoch": 1.7183048433048433, + "grad_norm": 0.6666881442070007, + "learning_rate": 0.00012211365624874106, + "loss": 0.9752, + "step": 9651 + }, + { + "epoch": 1.7184829059829059, + "grad_norm": 0.6196922659873962, + "learning_rate": 0.00012210000509746868, + "loss": 0.922, + 
"step": 9652 + }, + { + "epoch": 1.7186609686609686, + "grad_norm": 0.657879650592804, + "learning_rate": 0.00012208635351319266, + "loss": 1.2583, + "step": 9653 + }, + { + "epoch": 1.7188390313390314, + "grad_norm": 0.7240459322929382, + "learning_rate": 0.00012207270149618043, + "loss": 0.8479, + "step": 9654 + }, + { + "epoch": 1.7190170940170941, + "grad_norm": 0.8293825387954712, + "learning_rate": 0.00012205904904669945, + "loss": 0.9092, + "step": 9655 + }, + { + "epoch": 1.7191951566951567, + "grad_norm": 0.6907553672790527, + "learning_rate": 0.0001220453961650172, + "loss": 1.0543, + "step": 9656 + }, + { + "epoch": 1.7193732193732194, + "grad_norm": 0.7178300023078918, + "learning_rate": 0.00012203174285140124, + "loss": 0.9147, + "step": 9657 + }, + { + "epoch": 1.719551282051282, + "grad_norm": 0.7037166357040405, + "learning_rate": 0.00012201808910611905, + "loss": 0.8685, + "step": 9658 + }, + { + "epoch": 1.7197293447293447, + "grad_norm": 0.5850751996040344, + "learning_rate": 0.00012200443492943813, + "loss": 0.72, + "step": 9659 + }, + { + "epoch": 1.7199074074074074, + "grad_norm": 0.744239330291748, + "learning_rate": 0.00012199078032162603, + "loss": 0.9717, + "step": 9660 + }, + { + "epoch": 1.7200854700854702, + "grad_norm": 0.6509126424789429, + "learning_rate": 0.00012197712528295025, + "loss": 0.9768, + "step": 9661 + }, + { + "epoch": 1.7202635327635327, + "grad_norm": 0.623220682144165, + "learning_rate": 0.00012196346981367837, + "loss": 0.9824, + "step": 9662 + }, + { + "epoch": 1.7204415954415955, + "grad_norm": 0.6376451849937439, + "learning_rate": 0.00012194981391407792, + "loss": 0.8228, + "step": 9663 + }, + { + "epoch": 1.720619658119658, + "grad_norm": 0.794830322265625, + "learning_rate": 0.00012193615758441648, + "loss": 0.9168, + "step": 9664 + }, + { + "epoch": 1.7207977207977208, + "grad_norm": 0.7812975645065308, + "learning_rate": 0.0001219225008249616, + "loss": 0.8625, + "step": 9665 + }, + { + "epoch": 
1.7209757834757835, + "grad_norm": 0.6843218207359314, + "learning_rate": 0.0001219088436359808, + "loss": 1.0176, + "step": 9666 + }, + { + "epoch": 1.7211538461538463, + "grad_norm": 0.6924905180931091, + "learning_rate": 0.00012189518601774178, + "loss": 0.855, + "step": 9667 + }, + { + "epoch": 1.7213319088319088, + "grad_norm": 0.6348826289176941, + "learning_rate": 0.00012188152797051202, + "loss": 1.1596, + "step": 9668 + }, + { + "epoch": 1.7215099715099715, + "grad_norm": 0.7170482873916626, + "learning_rate": 0.00012186786949455922, + "loss": 0.9811, + "step": 9669 + }, + { + "epoch": 1.721688034188034, + "grad_norm": 0.7471763491630554, + "learning_rate": 0.00012185421059015094, + "loss": 1.0925, + "step": 9670 + }, + { + "epoch": 1.7218660968660968, + "grad_norm": 0.6771119236946106, + "learning_rate": 0.00012184055125755481, + "loss": 0.9403, + "step": 9671 + }, + { + "epoch": 1.7220441595441596, + "grad_norm": 0.4335343539714813, + "learning_rate": 0.0001218268914970384, + "loss": 0.4925, + "step": 9672 + }, + { + "epoch": 1.7222222222222223, + "grad_norm": 0.6652585864067078, + "learning_rate": 0.00012181323130886943, + "loss": 0.7684, + "step": 9673 + }, + { + "epoch": 1.7224002849002849, + "grad_norm": 0.6465467810630798, + "learning_rate": 0.00012179957069331548, + "loss": 0.9011, + "step": 9674 + }, + { + "epoch": 1.7225783475783476, + "grad_norm": 0.6725688576698303, + "learning_rate": 0.00012178590965064427, + "loss": 0.9563, + "step": 9675 + }, + { + "epoch": 1.7227564102564101, + "grad_norm": 0.6223418712615967, + "learning_rate": 0.00012177224818112341, + "loss": 0.9099, + "step": 9676 + }, + { + "epoch": 1.7229344729344729, + "grad_norm": 0.79325270652771, + "learning_rate": 0.00012175858628502053, + "loss": 1.0318, + "step": 9677 + }, + { + "epoch": 1.7231125356125356, + "grad_norm": 0.6735602617263794, + "learning_rate": 0.0001217449239626034, + "loss": 1.0797, + "step": 9678 + }, + { + "epoch": 1.7232905982905984, + "grad_norm": 
0.7082492113113403, + "learning_rate": 0.00012173126121413962, + "loss": 1.1341, + "step": 9679 + }, + { + "epoch": 1.723468660968661, + "grad_norm": 0.6563859581947327, + "learning_rate": 0.00012171759803989696, + "loss": 0.8778, + "step": 9680 + }, + { + "epoch": 1.7236467236467237, + "grad_norm": 0.6867792010307312, + "learning_rate": 0.00012170393444014306, + "loss": 0.8301, + "step": 9681 + }, + { + "epoch": 1.7238247863247862, + "grad_norm": 0.7870511412620544, + "learning_rate": 0.00012169027041514562, + "loss": 0.9165, + "step": 9682 + }, + { + "epoch": 1.724002849002849, + "grad_norm": 0.8006493449211121, + "learning_rate": 0.00012167660596517241, + "loss": 1.0395, + "step": 9683 + }, + { + "epoch": 1.7241809116809117, + "grad_norm": 0.6936125159263611, + "learning_rate": 0.00012166294109049114, + "loss": 1.1037, + "step": 9684 + }, + { + "epoch": 1.7243589743589745, + "grad_norm": 0.8176514506340027, + "learning_rate": 0.00012164927579136956, + "loss": 0.8791, + "step": 9685 + }, + { + "epoch": 1.7245370370370372, + "grad_norm": 0.6948300004005432, + "learning_rate": 0.00012163561006807537, + "loss": 0.9292, + "step": 9686 + }, + { + "epoch": 1.7247150997150997, + "grad_norm": 0.6237453818321228, + "learning_rate": 0.00012162194392087634, + "loss": 0.8553, + "step": 9687 + }, + { + "epoch": 1.7248931623931623, + "grad_norm": 0.6198007464408875, + "learning_rate": 0.00012160827735004021, + "loss": 0.9599, + "step": 9688 + }, + { + "epoch": 1.725071225071225, + "grad_norm": 0.639838695526123, + "learning_rate": 0.00012159461035583482, + "loss": 0.9328, + "step": 9689 + }, + { + "epoch": 1.7252492877492878, + "grad_norm": 0.7264436483383179, + "learning_rate": 0.00012158094293852789, + "loss": 1.0247, + "step": 9690 + }, + { + "epoch": 1.7254273504273505, + "grad_norm": 0.6320534348487854, + "learning_rate": 0.00012156727509838721, + "loss": 1.1222, + "step": 9691 + }, + { + "epoch": 1.7256054131054133, + "grad_norm": 0.6204122304916382, + "learning_rate": 
0.00012155360683568056, + "loss": 0.9765, + "step": 9692 + }, + { + "epoch": 1.7257834757834758, + "grad_norm": 0.7026457190513611, + "learning_rate": 0.00012153993815067579, + "loss": 1.0178, + "step": 9693 + }, + { + "epoch": 1.7259615384615383, + "grad_norm": 0.6471006870269775, + "learning_rate": 0.00012152626904364067, + "loss": 1.0035, + "step": 9694 + }, + { + "epoch": 1.726139601139601, + "grad_norm": 0.6875706911087036, + "learning_rate": 0.00012151259951484301, + "loss": 0.7921, + "step": 9695 + }, + { + "epoch": 1.7263176638176638, + "grad_norm": 0.6963251233100891, + "learning_rate": 0.00012149892956455067, + "loss": 0.9677, + "step": 9696 + }, + { + "epoch": 1.7264957264957266, + "grad_norm": 0.9077282547950745, + "learning_rate": 0.00012148525919303142, + "loss": 0.9362, + "step": 9697 + }, + { + "epoch": 1.7266737891737893, + "grad_norm": 0.7347434163093567, + "learning_rate": 0.00012147158840055319, + "loss": 0.8712, + "step": 9698 + }, + { + "epoch": 1.7268518518518519, + "grad_norm": 0.7206630110740662, + "learning_rate": 0.00012145791718738377, + "loss": 1.032, + "step": 9699 + }, + { + "epoch": 1.7270299145299144, + "grad_norm": 0.7174662947654724, + "learning_rate": 0.00012144424555379106, + "loss": 0.954, + "step": 9700 + }, + { + "epoch": 1.7272079772079771, + "grad_norm": 0.7442345023155212, + "learning_rate": 0.0001214305735000429, + "loss": 1.0709, + "step": 9701 + }, + { + "epoch": 1.72738603988604, + "grad_norm": 0.6154376268386841, + "learning_rate": 0.00012141690102640715, + "loss": 0.9365, + "step": 9702 + }, + { + "epoch": 1.7275641025641026, + "grad_norm": 0.6213796734809875, + "learning_rate": 0.00012140322813315172, + "loss": 0.8337, + "step": 9703 + }, + { + "epoch": 1.7277421652421654, + "grad_norm": 0.7682011127471924, + "learning_rate": 0.0001213895548205445, + "loss": 1.1579, + "step": 9704 + }, + { + "epoch": 1.727920227920228, + "grad_norm": 0.6796970963478088, + "learning_rate": 0.0001213758810888534, + "loss": 0.8875, + 
"step": 9705 + }, + { + "epoch": 1.7280982905982905, + "grad_norm": 0.7203732132911682, + "learning_rate": 0.0001213622069383463, + "loss": 0.7827, + "step": 9706 + }, + { + "epoch": 1.7282763532763532, + "grad_norm": 0.6151877045631409, + "learning_rate": 0.00012134853236929111, + "loss": 1.0282, + "step": 9707 + }, + { + "epoch": 1.728454415954416, + "grad_norm": 0.6665124297142029, + "learning_rate": 0.0001213348573819558, + "loss": 1.0636, + "step": 9708 + }, + { + "epoch": 1.7286324786324787, + "grad_norm": 0.7334614396095276, + "learning_rate": 0.00012132118197660829, + "loss": 1.0889, + "step": 9709 + }, + { + "epoch": 1.7288105413105415, + "grad_norm": 0.7267759442329407, + "learning_rate": 0.00012130750615351649, + "loss": 1.096, + "step": 9710 + }, + { + "epoch": 1.728988603988604, + "grad_norm": 0.6542944312095642, + "learning_rate": 0.00012129382991294837, + "loss": 1.0855, + "step": 9711 + }, + { + "epoch": 1.7291666666666665, + "grad_norm": 0.694523274898529, + "learning_rate": 0.00012128015325517193, + "loss": 0.8482, + "step": 9712 + }, + { + "epoch": 1.7293447293447293, + "grad_norm": 0.7879082560539246, + "learning_rate": 0.00012126647618045504, + "loss": 1.2356, + "step": 9713 + }, + { + "epoch": 1.729522792022792, + "grad_norm": 0.7108420729637146, + "learning_rate": 0.00012125279868906574, + "loss": 1.0185, + "step": 9714 + }, + { + "epoch": 1.7297008547008548, + "grad_norm": 0.6928725838661194, + "learning_rate": 0.000121239120781272, + "loss": 1.1507, + "step": 9715 + }, + { + "epoch": 1.7298789173789175, + "grad_norm": 0.6195241212844849, + "learning_rate": 0.00012122544245734182, + "loss": 0.8656, + "step": 9716 + }, + { + "epoch": 1.73005698005698, + "grad_norm": 0.5962017774581909, + "learning_rate": 0.00012121176371754317, + "loss": 0.918, + "step": 9717 + }, + { + "epoch": 1.7302350427350426, + "grad_norm": 0.7409394979476929, + "learning_rate": 0.00012119808456214407, + "loss": 1.0283, + "step": 9718 + }, + { + "epoch": 
1.7304131054131053, + "grad_norm": 0.6571973562240601, + "learning_rate": 0.00012118440499141257, + "loss": 1.1015, + "step": 9719 + }, + { + "epoch": 1.730591168091168, + "grad_norm": 0.681394100189209, + "learning_rate": 0.00012117072500561664, + "loss": 0.8247, + "step": 9720 + }, + { + "epoch": 1.7307692307692308, + "grad_norm": 0.7278251647949219, + "learning_rate": 0.00012115704460502432, + "loss": 1.0693, + "step": 9721 + }, + { + "epoch": 1.7309472934472936, + "grad_norm": 0.6569405794143677, + "learning_rate": 0.0001211433637899037, + "loss": 0.8992, + "step": 9722 + }, + { + "epoch": 1.7311253561253561, + "grad_norm": 0.6305136680603027, + "learning_rate": 0.00012112968256052272, + "loss": 0.8543, + "step": 9723 + }, + { + "epoch": 1.7313034188034186, + "grad_norm": 0.6111339330673218, + "learning_rate": 0.00012111600091714956, + "loss": 0.991, + "step": 9724 + }, + { + "epoch": 1.7314814814814814, + "grad_norm": 0.646973192691803, + "learning_rate": 0.00012110231886005223, + "loss": 0.8855, + "step": 9725 + }, + { + "epoch": 1.7316595441595442, + "grad_norm": 0.7054407000541687, + "learning_rate": 0.00012108863638949879, + "loss": 1.0816, + "step": 9726 + }, + { + "epoch": 1.731837606837607, + "grad_norm": 0.6592162847518921, + "learning_rate": 0.00012107495350575729, + "loss": 1.0961, + "step": 9727 + }, + { + "epoch": 1.7320156695156697, + "grad_norm": 0.6615595817565918, + "learning_rate": 0.00012106127020909587, + "loss": 0.9669, + "step": 9728 + }, + { + "epoch": 1.7321937321937322, + "grad_norm": 0.9030881524085999, + "learning_rate": 0.00012104758649978263, + "loss": 0.9438, + "step": 9729 + }, + { + "epoch": 1.7323717948717947, + "grad_norm": 0.6776516437530518, + "learning_rate": 0.00012103390237808566, + "loss": 0.8967, + "step": 9730 + }, + { + "epoch": 1.7325498575498575, + "grad_norm": 0.6010605096817017, + "learning_rate": 0.00012102021784427306, + "loss": 0.8893, + "step": 9731 + }, + { + "epoch": 1.7327279202279202, + "grad_norm": 
0.6540384292602539, + "learning_rate": 0.00012100653289861295, + "loss": 0.9328, + "step": 9732 + }, + { + "epoch": 1.732905982905983, + "grad_norm": 0.6836950182914734, + "learning_rate": 0.00012099284754137345, + "loss": 0.9019, + "step": 9733 + }, + { + "epoch": 1.7330840455840457, + "grad_norm": 0.7597874402999878, + "learning_rate": 0.00012097916177282274, + "loss": 1.0093, + "step": 9734 + }, + { + "epoch": 1.7332621082621082, + "grad_norm": 0.7686513066291809, + "learning_rate": 0.00012096547559322892, + "loss": 0.8685, + "step": 9735 + }, + { + "epoch": 1.7334401709401708, + "grad_norm": 0.613777220249176, + "learning_rate": 0.0001209517890028602, + "loss": 0.8317, + "step": 9736 + }, + { + "epoch": 1.7336182336182335, + "grad_norm": 0.6788455843925476, + "learning_rate": 0.00012093810200198466, + "loss": 0.866, + "step": 9737 + }, + { + "epoch": 1.7337962962962963, + "grad_norm": 0.616801381111145, + "learning_rate": 0.00012092441459087047, + "loss": 0.8299, + "step": 9738 + }, + { + "epoch": 1.733974358974359, + "grad_norm": 0.731987476348877, + "learning_rate": 0.00012091072676978589, + "loss": 1.089, + "step": 9739 + }, + { + "epoch": 1.7341524216524218, + "grad_norm": 0.7042871117591858, + "learning_rate": 0.00012089703853899905, + "loss": 0.8667, + "step": 9740 + }, + { + "epoch": 1.7343304843304843, + "grad_norm": 0.62722247838974, + "learning_rate": 0.00012088334989877817, + "loss": 0.9185, + "step": 9741 + }, + { + "epoch": 1.734508547008547, + "grad_norm": 0.6354684829711914, + "learning_rate": 0.0001208696608493914, + "loss": 0.9951, + "step": 9742 + }, + { + "epoch": 1.7346866096866096, + "grad_norm": 0.658647894859314, + "learning_rate": 0.00012085597139110698, + "loss": 0.9324, + "step": 9743 + }, + { + "epoch": 1.7348646723646723, + "grad_norm": 0.84359210729599, + "learning_rate": 0.00012084228152419312, + "loss": 1.0861, + "step": 9744 + }, + { + "epoch": 1.735042735042735, + "grad_norm": 0.6293938755989075, + "learning_rate": 
0.00012082859124891807, + "loss": 0.9676, + "step": 9745 + }, + { + "epoch": 1.7352207977207978, + "grad_norm": 0.6398760676383972, + "learning_rate": 0.00012081490056555004, + "loss": 0.8502, + "step": 9746 + }, + { + "epoch": 1.7353988603988604, + "grad_norm": 0.6918041706085205, + "learning_rate": 0.00012080120947435726, + "loss": 1.0081, + "step": 9747 + }, + { + "epoch": 1.7355769230769231, + "grad_norm": 0.7374079823493958, + "learning_rate": 0.00012078751797560798, + "loss": 0.9485, + "step": 9748 + }, + { + "epoch": 1.7357549857549857, + "grad_norm": 0.7392128705978394, + "learning_rate": 0.00012077382606957049, + "loss": 0.9283, + "step": 9749 + }, + { + "epoch": 1.7359330484330484, + "grad_norm": 0.701320230960846, + "learning_rate": 0.00012076013375651303, + "loss": 1.0339, + "step": 9750 + }, + { + "epoch": 1.7361111111111112, + "grad_norm": 0.6316696405410767, + "learning_rate": 0.00012074644103670387, + "loss": 0.9097, + "step": 9751 + }, + { + "epoch": 1.736289173789174, + "grad_norm": 0.6892024278640747, + "learning_rate": 0.00012073274791041132, + "loss": 1.0863, + "step": 9752 + }, + { + "epoch": 1.7364672364672364, + "grad_norm": 0.6032847762107849, + "learning_rate": 0.00012071905437790361, + "loss": 0.9305, + "step": 9753 + }, + { + "epoch": 1.7366452991452992, + "grad_norm": 0.6659184098243713, + "learning_rate": 0.00012070536043944907, + "loss": 0.9793, + "step": 9754 + }, + { + "epoch": 1.7368233618233617, + "grad_norm": 0.7413665056228638, + "learning_rate": 0.00012069166609531602, + "loss": 1.0523, + "step": 9755 + }, + { + "epoch": 1.7370014245014245, + "grad_norm": 0.7814368009567261, + "learning_rate": 0.00012067797134577275, + "loss": 0.9988, + "step": 9756 + }, + { + "epoch": 1.7371794871794872, + "grad_norm": 0.6174948811531067, + "learning_rate": 0.00012066427619108757, + "loss": 0.9002, + "step": 9757 + }, + { + "epoch": 1.73735754985755, + "grad_norm": 0.6521819233894348, + "learning_rate": 0.00012065058063152885, + "loss": 
1.1307, + "step": 9758 + }, + { + "epoch": 1.7375356125356125, + "grad_norm": 0.6797493696212769, + "learning_rate": 0.00012063688466736489, + "loss": 0.84, + "step": 9759 + }, + { + "epoch": 1.7377136752136753, + "grad_norm": 0.6496474146842957, + "learning_rate": 0.00012062318829886404, + "loss": 0.86, + "step": 9760 + }, + { + "epoch": 1.7378917378917378, + "grad_norm": 0.6701306104660034, + "learning_rate": 0.00012060949152629467, + "loss": 0.9422, + "step": 9761 + }, + { + "epoch": 1.7380698005698005, + "grad_norm": 0.7331172823905945, + "learning_rate": 0.00012059579434992512, + "loss": 1.1648, + "step": 9762 + }, + { + "epoch": 1.7382478632478633, + "grad_norm": 0.63930743932724, + "learning_rate": 0.00012058209677002375, + "loss": 1.0617, + "step": 9763 + }, + { + "epoch": 1.738425925925926, + "grad_norm": 0.668851912021637, + "learning_rate": 0.00012056839878685895, + "loss": 0.8219, + "step": 9764 + }, + { + "epoch": 1.7386039886039886, + "grad_norm": 0.7305747270584106, + "learning_rate": 0.00012055470040069912, + "loss": 1.0416, + "step": 9765 + }, + { + "epoch": 1.7387820512820513, + "grad_norm": 0.6931866407394409, + "learning_rate": 0.00012054100161181264, + "loss": 1.0588, + "step": 9766 + }, + { + "epoch": 1.7389601139601139, + "grad_norm": 0.6565485000610352, + "learning_rate": 0.00012052730242046785, + "loss": 0.7885, + "step": 9767 + }, + { + "epoch": 1.7391381766381766, + "grad_norm": 0.739985466003418, + "learning_rate": 0.00012051360282693327, + "loss": 1.0973, + "step": 9768 + }, + { + "epoch": 1.7393162393162394, + "grad_norm": 0.6477079391479492, + "learning_rate": 0.00012049990283147723, + "loss": 0.9841, + "step": 9769 + }, + { + "epoch": 1.739494301994302, + "grad_norm": 0.7018330097198486, + "learning_rate": 0.00012048620243436819, + "loss": 1.0869, + "step": 9770 + }, + { + "epoch": 1.7396723646723646, + "grad_norm": 0.7087421417236328, + "learning_rate": 0.00012047250163587456, + "loss": 0.916, + "step": 9771 + }, + { + "epoch": 
1.7398504273504274, + "grad_norm": 0.8747151494026184, + "learning_rate": 0.00012045880043626481, + "loss": 0.8245, + "step": 9772 + }, + { + "epoch": 1.74002849002849, + "grad_norm": 0.777498722076416, + "learning_rate": 0.00012044509883580735, + "loss": 1.071, + "step": 9773 + }, + { + "epoch": 1.7402065527065527, + "grad_norm": 0.6668971180915833, + "learning_rate": 0.00012043139683477062, + "loss": 1.0447, + "step": 9774 + }, + { + "epoch": 1.7403846153846154, + "grad_norm": 0.6702026724815369, + "learning_rate": 0.00012041769443342317, + "loss": 0.8688, + "step": 9775 + }, + { + "epoch": 1.7405626780626782, + "grad_norm": 0.7866267561912537, + "learning_rate": 0.00012040399163203337, + "loss": 1.0842, + "step": 9776 + }, + { + "epoch": 1.7407407407407407, + "grad_norm": 0.7655110955238342, + "learning_rate": 0.00012039028843086977, + "loss": 1.2417, + "step": 9777 + }, + { + "epoch": 1.7409188034188035, + "grad_norm": 0.7084119915962219, + "learning_rate": 0.0001203765848302008, + "loss": 0.9844, + "step": 9778 + }, + { + "epoch": 1.741096866096866, + "grad_norm": 0.7135398983955383, + "learning_rate": 0.00012036288083029497, + "loss": 1.1102, + "step": 9779 + }, + { + "epoch": 1.7412749287749287, + "grad_norm": 0.6784615516662598, + "learning_rate": 0.0001203491764314208, + "loss": 1.0349, + "step": 9780 + }, + { + "epoch": 1.7414529914529915, + "grad_norm": 0.7170301079750061, + "learning_rate": 0.00012033547163384682, + "loss": 1.0899, + "step": 9781 + }, + { + "epoch": 1.7416310541310542, + "grad_norm": 0.6692060828208923, + "learning_rate": 0.0001203217664378415, + "loss": 1.0486, + "step": 9782 + }, + { + "epoch": 1.7418091168091168, + "grad_norm": 0.6730037927627563, + "learning_rate": 0.00012030806084367336, + "loss": 0.9684, + "step": 9783 + }, + { + "epoch": 1.7419871794871795, + "grad_norm": 0.5983504056930542, + "learning_rate": 0.00012029435485161096, + "loss": 0.7106, + "step": 9784 + }, + { + "epoch": 1.742165242165242, + "grad_norm": 
0.6834231615066528, + "learning_rate": 0.00012028064846192284, + "loss": 0.803, + "step": 9785 + }, + { + "epoch": 1.7423433048433048, + "grad_norm": 0.621046245098114, + "learning_rate": 0.00012026694167487755, + "loss": 0.9129, + "step": 9786 + }, + { + "epoch": 1.7425213675213675, + "grad_norm": 0.6348989605903625, + "learning_rate": 0.00012025323449074361, + "loss": 1.0076, + "step": 9787 + }, + { + "epoch": 1.7426994301994303, + "grad_norm": 0.6139974594116211, + "learning_rate": 0.00012023952690978966, + "loss": 1.0756, + "step": 9788 + }, + { + "epoch": 1.7428774928774928, + "grad_norm": 0.6473259925842285, + "learning_rate": 0.00012022581893228419, + "loss": 1.0568, + "step": 9789 + }, + { + "epoch": 1.7430555555555556, + "grad_norm": 0.6133778095245361, + "learning_rate": 0.00012021211055849581, + "loss": 0.8722, + "step": 9790 + }, + { + "epoch": 1.743233618233618, + "grad_norm": 0.6934139728546143, + "learning_rate": 0.00012019840178869315, + "loss": 1.0329, + "step": 9791 + }, + { + "epoch": 1.7434116809116809, + "grad_norm": 0.6730150580406189, + "learning_rate": 0.00012018469262314474, + "loss": 0.9326, + "step": 9792 + }, + { + "epoch": 1.7435897435897436, + "grad_norm": 0.6805521249771118, + "learning_rate": 0.0001201709830621192, + "loss": 1.0527, + "step": 9793 + }, + { + "epoch": 1.7437678062678064, + "grad_norm": 0.6972569823265076, + "learning_rate": 0.00012015727310588516, + "loss": 1.0024, + "step": 9794 + }, + { + "epoch": 1.743945868945869, + "grad_norm": 0.7329187989234924, + "learning_rate": 0.00012014356275471122, + "loss": 1.1864, + "step": 9795 + }, + { + "epoch": 1.7441239316239316, + "grad_norm": 0.7220240831375122, + "learning_rate": 0.00012012985200886602, + "loss": 0.8831, + "step": 9796 + }, + { + "epoch": 1.7443019943019942, + "grad_norm": 0.7829749584197998, + "learning_rate": 0.00012011614086861818, + "loss": 1.0365, + "step": 9797 + }, + { + "epoch": 1.744480056980057, + "grad_norm": 0.7148944735527039, + "learning_rate": 
0.00012010242933423637, + "loss": 1.0413, + "step": 9798 + }, + { + "epoch": 1.7446581196581197, + "grad_norm": 0.5607262253761292, + "learning_rate": 0.00012008871740598917, + "loss": 0.8154, + "step": 9799 + }, + { + "epoch": 1.7448361823361824, + "grad_norm": 0.754626452922821, + "learning_rate": 0.00012007500508414531, + "loss": 1.0569, + "step": 9800 + }, + { + "epoch": 1.7450142450142452, + "grad_norm": 0.7216293215751648, + "learning_rate": 0.00012006129236897343, + "loss": 1.1641, + "step": 9801 + }, + { + "epoch": 1.7451923076923077, + "grad_norm": 0.6575515270233154, + "learning_rate": 0.0001200475792607422, + "loss": 0.9063, + "step": 9802 + }, + { + "epoch": 1.7453703703703702, + "grad_norm": 0.7411505579948425, + "learning_rate": 0.00012003386575972031, + "loss": 0.9791, + "step": 9803 + }, + { + "epoch": 1.745548433048433, + "grad_norm": 0.6945903301239014, + "learning_rate": 0.0001200201518661764, + "loss": 0.8111, + "step": 9804 + }, + { + "epoch": 1.7457264957264957, + "grad_norm": 0.5760970115661621, + "learning_rate": 0.00012000643758037924, + "loss": 1.1054, + "step": 9805 + }, + { + "epoch": 1.7459045584045585, + "grad_norm": 0.6732224225997925, + "learning_rate": 0.00011999272290259748, + "loss": 0.8992, + "step": 9806 + }, + { + "epoch": 1.7460826210826212, + "grad_norm": 0.673270046710968, + "learning_rate": 0.00011997900783309983, + "loss": 1.0554, + "step": 9807 + }, + { + "epoch": 1.7462606837606838, + "grad_norm": 0.7233314514160156, + "learning_rate": 0.00011996529237215503, + "loss": 1.066, + "step": 9808 + }, + { + "epoch": 1.7464387464387463, + "grad_norm": 0.7016494274139404, + "learning_rate": 0.00011995157652003183, + "loss": 0.891, + "step": 9809 + }, + { + "epoch": 1.746616809116809, + "grad_norm": 0.9377092719078064, + "learning_rate": 0.00011993786027699889, + "loss": 0.8626, + "step": 9810 + }, + { + "epoch": 1.7467948717948718, + "grad_norm": 0.6825845241546631, + "learning_rate": 0.00011992414364332503, + "loss": 0.8996, + 
"step": 9811 + }, + { + "epoch": 1.7469729344729346, + "grad_norm": 0.6836053729057312, + "learning_rate": 0.00011991042661927896, + "loss": 0.9338, + "step": 9812 + }, + { + "epoch": 1.7471509971509973, + "grad_norm": 0.6462908387184143, + "learning_rate": 0.00011989670920512943, + "loss": 1.1185, + "step": 9813 + }, + { + "epoch": 1.7473290598290598, + "grad_norm": 0.7191921472549438, + "learning_rate": 0.00011988299140114522, + "loss": 0.9084, + "step": 9814 + }, + { + "epoch": 1.7475071225071224, + "grad_norm": 0.6951598525047302, + "learning_rate": 0.00011986927320759508, + "loss": 1.0653, + "step": 9815 + }, + { + "epoch": 1.7476851851851851, + "grad_norm": 0.7512598037719727, + "learning_rate": 0.00011985555462474784, + "loss": 1.0259, + "step": 9816 + }, + { + "epoch": 1.7478632478632479, + "grad_norm": 0.6885492205619812, + "learning_rate": 0.00011984183565287226, + "loss": 0.7148, + "step": 9817 + }, + { + "epoch": 1.7480413105413106, + "grad_norm": 0.6880139708518982, + "learning_rate": 0.00011982811629223709, + "loss": 1.1567, + "step": 9818 + }, + { + "epoch": 1.7482193732193734, + "grad_norm": 0.7381170392036438, + "learning_rate": 0.0001198143965431112, + "loss": 0.8483, + "step": 9819 + }, + { + "epoch": 1.748397435897436, + "grad_norm": 0.6761063933372498, + "learning_rate": 0.00011980067640576333, + "loss": 0.9498, + "step": 9820 + }, + { + "epoch": 1.7485754985754984, + "grad_norm": 0.6454669237136841, + "learning_rate": 0.00011978695588046238, + "loss": 0.7336, + "step": 9821 + }, + { + "epoch": 1.7487535612535612, + "grad_norm": 0.6026871800422668, + "learning_rate": 0.00011977323496747712, + "loss": 0.8618, + "step": 9822 + }, + { + "epoch": 1.748931623931624, + "grad_norm": 0.6877408027648926, + "learning_rate": 0.0001197595136670764, + "loss": 0.9146, + "step": 9823 + }, + { + "epoch": 1.7491096866096867, + "grad_norm": 0.6874892115592957, + "learning_rate": 0.00011974579197952906, + "loss": 1.1628, + "step": 9824 + }, + { + "epoch": 
1.7492877492877494, + "grad_norm": 0.7464384436607361, + "learning_rate": 0.00011973206990510393, + "loss": 1.007, + "step": 9825 + }, + { + "epoch": 1.749465811965812, + "grad_norm": 0.7281473278999329, + "learning_rate": 0.00011971834744406986, + "loss": 1.0776, + "step": 9826 + }, + { + "epoch": 1.7496438746438745, + "grad_norm": 0.6112284660339355, + "learning_rate": 0.00011970462459669575, + "loss": 0.7616, + "step": 9827 + }, + { + "epoch": 1.7498219373219372, + "grad_norm": 0.6498035192489624, + "learning_rate": 0.00011969090136325048, + "loss": 0.884, + "step": 9828 + }, + { + "epoch": 1.7498219373219372, + "eval_loss": 1.1018389463424683, + "eval_runtime": 24.5594, + "eval_samples_per_second": 42.387, + "eval_steps_per_second": 21.214, + "step": 9828 + }, + { + "epoch": 1.75, + "grad_norm": 0.6746426224708557, + "learning_rate": 0.00011967717774400289, + "loss": 0.9023, + "step": 9829 + }, + { + "epoch": 1.7501780626780628, + "grad_norm": 0.6513423323631287, + "learning_rate": 0.00011966345373922188, + "loss": 0.9786, + "step": 9830 + }, + { + "epoch": 1.7503561253561255, + "grad_norm": 0.7053804397583008, + "learning_rate": 0.00011964972934917632, + "loss": 1.0667, + "step": 9831 + }, + { + "epoch": 1.750534188034188, + "grad_norm": 0.6769008040428162, + "learning_rate": 0.00011963600457413513, + "loss": 0.8596, + "step": 9832 + }, + { + "epoch": 1.7507122507122506, + "grad_norm": 0.7162246108055115, + "learning_rate": 0.00011962227941436725, + "loss": 1.0746, + "step": 9833 + }, + { + "epoch": 1.7508903133903133, + "grad_norm": 0.7665811777114868, + "learning_rate": 0.00011960855387014156, + "loss": 1.0056, + "step": 9834 + }, + { + "epoch": 1.751068376068376, + "grad_norm": 0.6186950206756592, + "learning_rate": 0.00011959482794172696, + "loss": 0.9016, + "step": 9835 + }, + { + "epoch": 1.7512464387464388, + "grad_norm": 0.8018904328346252, + "learning_rate": 0.00011958110162939245, + "loss": 0.9534, + "step": 9836 + }, + { + "epoch": 
1.7514245014245016, + "grad_norm": 0.8239033818244934, + "learning_rate": 0.0001195673749334069, + "loss": 1.214, + "step": 9837 + }, + { + "epoch": 1.751602564102564, + "grad_norm": 0.7886297106742859, + "learning_rate": 0.00011955364785403931, + "loss": 0.9672, + "step": 9838 + }, + { + "epoch": 1.7517806267806266, + "grad_norm": 0.6463177800178528, + "learning_rate": 0.00011953992039155862, + "loss": 0.9184, + "step": 9839 + }, + { + "epoch": 1.7519586894586894, + "grad_norm": 0.7374706864356995, + "learning_rate": 0.00011952619254623374, + "loss": 0.9988, + "step": 9840 + }, + { + "epoch": 1.7521367521367521, + "grad_norm": 0.7456657886505127, + "learning_rate": 0.00011951246431833369, + "loss": 1.2197, + "step": 9841 + }, + { + "epoch": 1.7523148148148149, + "grad_norm": 0.6644248962402344, + "learning_rate": 0.00011949873570812746, + "loss": 0.9449, + "step": 9842 + }, + { + "epoch": 1.7524928774928776, + "grad_norm": 0.707919180393219, + "learning_rate": 0.000119485006715884, + "loss": 0.774, + "step": 9843 + }, + { + "epoch": 1.7526709401709402, + "grad_norm": 0.6273906826972961, + "learning_rate": 0.00011947127734187231, + "loss": 0.8682, + "step": 9844 + }, + { + "epoch": 1.7528490028490027, + "grad_norm": 0.8335350155830383, + "learning_rate": 0.00011945754758636136, + "loss": 1.2282, + "step": 9845 + }, + { + "epoch": 1.7530270655270654, + "grad_norm": 0.6849051117897034, + "learning_rate": 0.00011944381744962022, + "loss": 1.1091, + "step": 9846 + }, + { + "epoch": 1.7532051282051282, + "grad_norm": 0.8571760058403015, + "learning_rate": 0.00011943008693191781, + "loss": 0.9806, + "step": 9847 + }, + { + "epoch": 1.753383190883191, + "grad_norm": 0.7045019268989563, + "learning_rate": 0.00011941635603352328, + "loss": 0.9217, + "step": 9848 + }, + { + "epoch": 1.7535612535612537, + "grad_norm": 0.6820187568664551, + "learning_rate": 0.00011940262475470556, + "loss": 0.9983, + "step": 9849 + }, + { + "epoch": 1.7537393162393162, + "grad_norm": 
0.7400697469711304, + "learning_rate": 0.00011938889309573374, + "loss": 0.9521, + "step": 9850 + }, + { + "epoch": 1.7539173789173788, + "grad_norm": 0.7027658820152283, + "learning_rate": 0.00011937516105687678, + "loss": 1.0749, + "step": 9851 + }, + { + "epoch": 1.7540954415954415, + "grad_norm": 0.6778307557106018, + "learning_rate": 0.00011936142863840382, + "loss": 1.0249, + "step": 9852 + }, + { + "epoch": 1.7542735042735043, + "grad_norm": 0.6787961721420288, + "learning_rate": 0.00011934769584058389, + "loss": 1.0014, + "step": 9853 + }, + { + "epoch": 1.754451566951567, + "grad_norm": 0.7515636086463928, + "learning_rate": 0.00011933396266368606, + "loss": 1.0351, + "step": 9854 + }, + { + "epoch": 1.7546296296296298, + "grad_norm": 0.6620134115219116, + "learning_rate": 0.00011932022910797938, + "loss": 1.0294, + "step": 9855 + }, + { + "epoch": 1.7548076923076923, + "grad_norm": 0.8260951638221741, + "learning_rate": 0.00011930649517373294, + "loss": 0.9078, + "step": 9856 + }, + { + "epoch": 1.7549857549857548, + "grad_norm": 0.7680675983428955, + "learning_rate": 0.00011929276086121584, + "loss": 0.92, + "step": 9857 + }, + { + "epoch": 1.7551638176638176, + "grad_norm": 0.7104191184043884, + "learning_rate": 0.00011927902617069717, + "loss": 0.9937, + "step": 9858 + }, + { + "epoch": 1.7553418803418803, + "grad_norm": 0.7185840606689453, + "learning_rate": 0.00011926529110244603, + "loss": 0.9775, + "step": 9859 + }, + { + "epoch": 1.755519943019943, + "grad_norm": 0.7114652991294861, + "learning_rate": 0.00011925155565673151, + "loss": 0.883, + "step": 9860 + }, + { + "epoch": 1.7556980056980058, + "grad_norm": 0.6906639337539673, + "learning_rate": 0.00011923781983382276, + "loss": 0.9789, + "step": 9861 + }, + { + "epoch": 1.7558760683760684, + "grad_norm": 0.706908106803894, + "learning_rate": 0.00011922408363398892, + "loss": 1.1186, + "step": 9862 + }, + { + "epoch": 1.756054131054131, + "grad_norm": 0.7532939910888672, + "learning_rate": 
0.00011921034705749908, + "loss": 0.977, + "step": 9863 + }, + { + "epoch": 1.7562321937321936, + "grad_norm": 0.7397763729095459, + "learning_rate": 0.0001191966101046224, + "loss": 1.1121, + "step": 9864 + }, + { + "epoch": 1.7564102564102564, + "grad_norm": 0.6955398321151733, + "learning_rate": 0.00011918287277562801, + "loss": 1.0439, + "step": 9865 + }, + { + "epoch": 1.7565883190883191, + "grad_norm": 0.7485929727554321, + "learning_rate": 0.00011916913507078507, + "loss": 1.1644, + "step": 9866 + }, + { + "epoch": 1.756766381766382, + "grad_norm": 0.6337487101554871, + "learning_rate": 0.00011915539699036274, + "loss": 0.8216, + "step": 9867 + }, + { + "epoch": 1.7569444444444444, + "grad_norm": 0.6628872752189636, + "learning_rate": 0.00011914165853463022, + "loss": 0.9584, + "step": 9868 + }, + { + "epoch": 1.7571225071225072, + "grad_norm": 0.6577547788619995, + "learning_rate": 0.00011912791970385666, + "loss": 0.9484, + "step": 9869 + }, + { + "epoch": 1.7573005698005697, + "grad_norm": 0.6409304738044739, + "learning_rate": 0.00011911418049831127, + "loss": 1.1256, + "step": 9870 + }, + { + "epoch": 1.7574786324786325, + "grad_norm": 0.7499844431877136, + "learning_rate": 0.00011910044091826319, + "loss": 0.7991, + "step": 9871 + }, + { + "epoch": 1.7576566951566952, + "grad_norm": 0.6786715388298035, + "learning_rate": 0.00011908670096398165, + "loss": 1.0368, + "step": 9872 + }, + { + "epoch": 1.757834757834758, + "grad_norm": 0.6432101130485535, + "learning_rate": 0.00011907296063573585, + "loss": 0.9059, + "step": 9873 + }, + { + "epoch": 1.7580128205128205, + "grad_norm": 0.6542613506317139, + "learning_rate": 0.00011905921993379503, + "loss": 0.9866, + "step": 9874 + }, + { + "epoch": 1.7581908831908832, + "grad_norm": 0.6048218011856079, + "learning_rate": 0.00011904547885842838, + "loss": 0.9488, + "step": 9875 + }, + { + "epoch": 1.7583689458689458, + "grad_norm": 0.7694938778877258, + "learning_rate": 0.00011903173740990512, + "loss": 
1.1026, + "step": 9876 + }, + { + "epoch": 1.7585470085470085, + "grad_norm": 0.6621627807617188, + "learning_rate": 0.00011901799558849451, + "loss": 1.135, + "step": 9877 + }, + { + "epoch": 1.7587250712250713, + "grad_norm": 0.6561587452888489, + "learning_rate": 0.0001190042533944658, + "loss": 0.9322, + "step": 9878 + }, + { + "epoch": 1.758903133903134, + "grad_norm": 0.7846759557723999, + "learning_rate": 0.00011899051082808821, + "loss": 0.9324, + "step": 9879 + }, + { + "epoch": 1.7590811965811965, + "grad_norm": 0.6004071831703186, + "learning_rate": 0.00011897676788963101, + "loss": 0.9641, + "step": 9880 + }, + { + "epoch": 1.7592592592592593, + "grad_norm": 0.6731070280075073, + "learning_rate": 0.00011896302457936344, + "loss": 1.1437, + "step": 9881 + }, + { + "epoch": 1.7594373219373218, + "grad_norm": 0.6768675446510315, + "learning_rate": 0.00011894928089755481, + "loss": 1.0707, + "step": 9882 + }, + { + "epoch": 1.7596153846153846, + "grad_norm": 0.8368878960609436, + "learning_rate": 0.0001189355368444744, + "loss": 1.0435, + "step": 9883 + }, + { + "epoch": 1.7597934472934473, + "grad_norm": 0.6132324934005737, + "learning_rate": 0.00011892179242039149, + "loss": 0.8889, + "step": 9884 + }, + { + "epoch": 1.75997150997151, + "grad_norm": 0.7598093152046204, + "learning_rate": 0.00011890804762557535, + "loss": 1.151, + "step": 9885 + }, + { + "epoch": 1.7601495726495726, + "grad_norm": 0.7317715883255005, + "learning_rate": 0.00011889430246029527, + "loss": 0.9992, + "step": 9886 + }, + { + "epoch": 1.7603276353276354, + "grad_norm": 0.7664858102798462, + "learning_rate": 0.00011888055692482059, + "loss": 0.8398, + "step": 9887 + }, + { + "epoch": 1.760505698005698, + "grad_norm": 0.6916853189468384, + "learning_rate": 0.00011886681101942063, + "loss": 0.9507, + "step": 9888 + }, + { + "epoch": 1.7606837606837606, + "grad_norm": 0.7103399634361267, + "learning_rate": 0.0001188530647443647, + "loss": 0.915, + "step": 9889 + }, + { + "epoch": 
1.7608618233618234, + "grad_norm": 0.6177804470062256, + "learning_rate": 0.00011883931809992215, + "loss": 0.721, + "step": 9890 + }, + { + "epoch": 1.7610398860398861, + "grad_norm": 0.7523959279060364, + "learning_rate": 0.00011882557108636227, + "loss": 0.99, + "step": 9891 + }, + { + "epoch": 1.7612179487179487, + "grad_norm": 0.6211134791374207, + "learning_rate": 0.00011881182370395442, + "loss": 0.8089, + "step": 9892 + }, + { + "epoch": 1.7613960113960114, + "grad_norm": 0.6660307049751282, + "learning_rate": 0.00011879807595296802, + "loss": 1.1062, + "step": 9893 + }, + { + "epoch": 1.761574074074074, + "grad_norm": 0.7039240598678589, + "learning_rate": 0.00011878432783367232, + "loss": 0.9739, + "step": 9894 + }, + { + "epoch": 1.7617521367521367, + "grad_norm": 0.658064603805542, + "learning_rate": 0.00011877057934633675, + "loss": 0.9438, + "step": 9895 + }, + { + "epoch": 1.7619301994301995, + "grad_norm": 0.8227152228355408, + "learning_rate": 0.00011875683049123068, + "loss": 0.8385, + "step": 9896 + }, + { + "epoch": 1.7621082621082622, + "grad_norm": 0.6622483730316162, + "learning_rate": 0.00011874308126862346, + "loss": 0.9432, + "step": 9897 + }, + { + "epoch": 1.7622863247863247, + "grad_norm": 0.7211357951164246, + "learning_rate": 0.00011872933167878453, + "loss": 1.2471, + "step": 9898 + }, + { + "epoch": 1.7624643874643875, + "grad_norm": 0.6177424192428589, + "learning_rate": 0.00011871558172198322, + "loss": 0.8892, + "step": 9899 + }, + { + "epoch": 1.76264245014245, + "grad_norm": 0.6924285888671875, + "learning_rate": 0.00011870183139848898, + "loss": 1.021, + "step": 9900 + }, + { + "epoch": 1.7628205128205128, + "grad_norm": 0.6168648600578308, + "learning_rate": 0.0001186880807085712, + "loss": 0.9013, + "step": 9901 + }, + { + "epoch": 1.7629985754985755, + "grad_norm": 0.6410452723503113, + "learning_rate": 0.00011867432965249929, + "loss": 0.6686, + "step": 9902 + }, + { + "epoch": 1.7631766381766383, + "grad_norm": 
0.6959559917449951, + "learning_rate": 0.0001186605782305427, + "loss": 0.9814, + "step": 9903 + }, + { + "epoch": 1.7633547008547008, + "grad_norm": 0.7456178069114685, + "learning_rate": 0.00011864682644297085, + "loss": 1.0151, + "step": 9904 + }, + { + "epoch": 1.7635327635327636, + "grad_norm": 0.6499991416931152, + "learning_rate": 0.00011863307429005317, + "loss": 0.83, + "step": 9905 + }, + { + "epoch": 1.763710826210826, + "grad_norm": 0.643344521522522, + "learning_rate": 0.00011861932177205908, + "loss": 0.8853, + "step": 9906 + }, + { + "epoch": 1.7638888888888888, + "grad_norm": 0.6570441722869873, + "learning_rate": 0.00011860556888925804, + "loss": 0.9179, + "step": 9907 + }, + { + "epoch": 1.7640669515669516, + "grad_norm": 0.6892307996749878, + "learning_rate": 0.00011859181564191957, + "loss": 0.9657, + "step": 9908 + }, + { + "epoch": 1.7642450142450143, + "grad_norm": 0.648158073425293, + "learning_rate": 0.0001185780620303131, + "loss": 0.9179, + "step": 9909 + }, + { + "epoch": 1.7644230769230769, + "grad_norm": 0.5833603143692017, + "learning_rate": 0.00011856430805470808, + "loss": 0.8505, + "step": 9910 + }, + { + "epoch": 1.7646011396011396, + "grad_norm": 0.8302416205406189, + "learning_rate": 0.000118550553715374, + "loss": 0.8948, + "step": 9911 + }, + { + "epoch": 1.7647792022792022, + "grad_norm": 0.7075300216674805, + "learning_rate": 0.00011853679901258035, + "loss": 1.2467, + "step": 9912 + }, + { + "epoch": 1.764957264957265, + "grad_norm": 0.81916344165802, + "learning_rate": 0.00011852304394659666, + "loss": 0.9963, + "step": 9913 + }, + { + "epoch": 1.7651353276353277, + "grad_norm": 0.6492435932159424, + "learning_rate": 0.00011850928851769239, + "loss": 1.0704, + "step": 9914 + }, + { + "epoch": 1.7653133903133904, + "grad_norm": 0.7301090359687805, + "learning_rate": 0.00011849553272613704, + "loss": 1.0477, + "step": 9915 + }, + { + "epoch": 1.765491452991453, + "grad_norm": 0.7280275821685791, + "learning_rate": 
0.00011848177657220019, + "loss": 0.9124, + "step": 9916 + }, + { + "epoch": 1.7656695156695157, + "grad_norm": 0.6948845386505127, + "learning_rate": 0.00011846802005615127, + "loss": 1.2275, + "step": 9917 + }, + { + "epoch": 1.7658475783475782, + "grad_norm": 0.6553834676742554, + "learning_rate": 0.0001184542631782599, + "loss": 1.2311, + "step": 9918 + }, + { + "epoch": 1.766025641025641, + "grad_norm": 0.6899739503860474, + "learning_rate": 0.00011844050593879556, + "loss": 0.8936, + "step": 9919 + }, + { + "epoch": 1.7662037037037037, + "grad_norm": 0.6076815128326416, + "learning_rate": 0.00011842674833802782, + "loss": 0.8432, + "step": 9920 + }, + { + "epoch": 1.7663817663817665, + "grad_norm": 0.7650902271270752, + "learning_rate": 0.00011841299037622624, + "loss": 1.0447, + "step": 9921 + }, + { + "epoch": 1.7665598290598292, + "grad_norm": 0.6864938735961914, + "learning_rate": 0.00011839923205366032, + "loss": 0.936, + "step": 9922 + }, + { + "epoch": 1.7667378917378918, + "grad_norm": 0.7176852226257324, + "learning_rate": 0.0001183854733705997, + "loss": 0.9764, + "step": 9923 + }, + { + "epoch": 1.7669159544159543, + "grad_norm": 0.6513439416885376, + "learning_rate": 0.00011837171432731393, + "loss": 1.0095, + "step": 9924 + }, + { + "epoch": 1.767094017094017, + "grad_norm": 0.8031024932861328, + "learning_rate": 0.00011835795492407256, + "loss": 1.1348, + "step": 9925 + }, + { + "epoch": 1.7672720797720798, + "grad_norm": 0.7659830451011658, + "learning_rate": 0.00011834419516114518, + "loss": 0.9058, + "step": 9926 + }, + { + "epoch": 1.7674501424501425, + "grad_norm": 0.8864039778709412, + "learning_rate": 0.00011833043503880145, + "loss": 1.0342, + "step": 9927 + }, + { + "epoch": 1.7676282051282053, + "grad_norm": 0.6870512962341309, + "learning_rate": 0.00011831667455731088, + "loss": 0.9361, + "step": 9928 + }, + { + "epoch": 1.7678062678062678, + "grad_norm": 0.6458830833435059, + "learning_rate": 0.00011830291371694315, + "loss": 0.8215, 
+ "step": 9929 + }, + { + "epoch": 1.7679843304843303, + "grad_norm": 0.7456086874008179, + "learning_rate": 0.00011828915251796787, + "loss": 1.1243, + "step": 9930 + }, + { + "epoch": 1.768162393162393, + "grad_norm": 0.6834850311279297, + "learning_rate": 0.00011827539096065459, + "loss": 0.9536, + "step": 9931 + }, + { + "epoch": 1.7683404558404558, + "grad_norm": 0.643864631652832, + "learning_rate": 0.00011826162904527302, + "loss": 1.1707, + "step": 9932 + }, + { + "epoch": 1.7685185185185186, + "grad_norm": 0.6312864422798157, + "learning_rate": 0.00011824786677209275, + "loss": 0.7937, + "step": 9933 + }, + { + "epoch": 1.7686965811965814, + "grad_norm": 0.6092729568481445, + "learning_rate": 0.00011823410414138343, + "loss": 0.8787, + "step": 9934 + }, + { + "epoch": 1.7688746438746439, + "grad_norm": 0.6859988570213318, + "learning_rate": 0.00011822034115341474, + "loss": 0.9691, + "step": 9935 + }, + { + "epoch": 1.7690527065527064, + "grad_norm": 0.7219935059547424, + "learning_rate": 0.0001182065778084563, + "loss": 1.0606, + "step": 9936 + }, + { + "epoch": 1.7692307692307692, + "grad_norm": 0.6596202850341797, + "learning_rate": 0.00011819281410677778, + "loss": 1.0543, + "step": 9937 + }, + { + "epoch": 1.769408831908832, + "grad_norm": 0.6616338491439819, + "learning_rate": 0.00011817905004864887, + "loss": 0.9757, + "step": 9938 + }, + { + "epoch": 1.7695868945868947, + "grad_norm": 0.6637360453605652, + "learning_rate": 0.00011816528563433924, + "loss": 0.925, + "step": 9939 + }, + { + "epoch": 1.7697649572649574, + "grad_norm": 0.8422333002090454, + "learning_rate": 0.00011815152086411859, + "loss": 1.1343, + "step": 9940 + }, + { + "epoch": 1.76994301994302, + "grad_norm": 0.6638204455375671, + "learning_rate": 0.00011813775573825656, + "loss": 1.2136, + "step": 9941 + }, + { + "epoch": 1.7701210826210825, + "grad_norm": 0.7258831858634949, + "learning_rate": 0.0001181239902570229, + "loss": 0.7308, + "step": 9942 + }, + { + "epoch": 
1.7702991452991452, + "grad_norm": 0.730582594871521, + "learning_rate": 0.0001181102244206873, + "loss": 1.1097, + "step": 9943 + }, + { + "epoch": 1.770477207977208, + "grad_norm": 0.7324019074440002, + "learning_rate": 0.00011809645822951946, + "loss": 0.9802, + "step": 9944 + }, + { + "epoch": 1.7706552706552707, + "grad_norm": 0.5565997958183289, + "learning_rate": 0.00011808269168378914, + "loss": 0.7079, + "step": 9945 + }, + { + "epoch": 1.7708333333333335, + "grad_norm": 0.6395503282546997, + "learning_rate": 0.00011806892478376601, + "loss": 1.0048, + "step": 9946 + }, + { + "epoch": 1.771011396011396, + "grad_norm": 0.7670905590057373, + "learning_rate": 0.00011805515752971985, + "loss": 1.2509, + "step": 9947 + }, + { + "epoch": 1.7711894586894585, + "grad_norm": 0.5945813655853271, + "learning_rate": 0.00011804138992192037, + "loss": 0.8856, + "step": 9948 + }, + { + "epoch": 1.7713675213675213, + "grad_norm": 0.7355493307113647, + "learning_rate": 0.00011802762196063737, + "loss": 0.9629, + "step": 9949 + }, + { + "epoch": 1.771545584045584, + "grad_norm": 0.7024806141853333, + "learning_rate": 0.00011801385364614055, + "loss": 1.1351, + "step": 9950 + }, + { + "epoch": 1.7717236467236468, + "grad_norm": 0.6553003191947937, + "learning_rate": 0.00011800008497869968, + "loss": 0.911, + "step": 9951 + }, + { + "epoch": 1.7719017094017095, + "grad_norm": 0.6883971691131592, + "learning_rate": 0.00011798631595858454, + "loss": 1.0099, + "step": 9952 + }, + { + "epoch": 1.772079772079772, + "grad_norm": 0.7106832265853882, + "learning_rate": 0.00011797254658606489, + "loss": 1.0298, + "step": 9953 + }, + { + "epoch": 1.7722578347578346, + "grad_norm": 0.7902877926826477, + "learning_rate": 0.00011795877686141055, + "loss": 1.0572, + "step": 9954 + }, + { + "epoch": 1.7724358974358974, + "grad_norm": 0.7105007171630859, + "learning_rate": 0.00011794500678489126, + "loss": 1.1725, + "step": 9955 + }, + { + "epoch": 1.77261396011396, + "grad_norm": 
0.7314959764480591, + "learning_rate": 0.00011793123635677685, + "loss": 1.1074, + "step": 9956 + }, + { + "epoch": 1.7727920227920229, + "grad_norm": 0.6358618140220642, + "learning_rate": 0.00011791746557733712, + "loss": 0.8786, + "step": 9957 + }, + { + "epoch": 1.7729700854700856, + "grad_norm": 0.6441367864608765, + "learning_rate": 0.00011790369444684187, + "loss": 1.1332, + "step": 9958 + }, + { + "epoch": 1.7731481481481481, + "grad_norm": 0.686787486076355, + "learning_rate": 0.0001178899229655609, + "loss": 0.9566, + "step": 9959 + }, + { + "epoch": 1.7733262108262107, + "grad_norm": 0.653840184211731, + "learning_rate": 0.00011787615113376407, + "loss": 0.8763, + "step": 9960 + }, + { + "epoch": 1.7735042735042734, + "grad_norm": 0.7106643915176392, + "learning_rate": 0.00011786237895172119, + "loss": 0.9929, + "step": 9961 + }, + { + "epoch": 1.7736823361823362, + "grad_norm": 0.6634044051170349, + "learning_rate": 0.0001178486064197021, + "loss": 0.7467, + "step": 9962 + }, + { + "epoch": 1.773860398860399, + "grad_norm": 0.7087352871894836, + "learning_rate": 0.00011783483353797663, + "loss": 1.0104, + "step": 9963 + }, + { + "epoch": 1.7740384615384617, + "grad_norm": 0.8088061213493347, + "learning_rate": 0.00011782106030681466, + "loss": 1.0376, + "step": 9964 + }, + { + "epoch": 1.7742165242165242, + "grad_norm": 0.7204688787460327, + "learning_rate": 0.00011780728672648604, + "loss": 0.8556, + "step": 9965 + }, + { + "epoch": 1.7743945868945867, + "grad_norm": 0.7893314957618713, + "learning_rate": 0.0001177935127972606, + "loss": 0.9764, + "step": 9966 + }, + { + "epoch": 1.7745726495726495, + "grad_norm": 0.6098896265029907, + "learning_rate": 0.00011777973851940826, + "loss": 0.9407, + "step": 9967 + }, + { + "epoch": 1.7747507122507122, + "grad_norm": 0.6420868039131165, + "learning_rate": 0.0001177659638931989, + "loss": 1.1328, + "step": 9968 + }, + { + "epoch": 1.774928774928775, + "grad_norm": 0.7732378244400024, + "learning_rate": 
0.00011775218891890234, + "loss": 1.1236, + "step": 9969 + }, + { + "epoch": 1.7751068376068377, + "grad_norm": 0.6591582894325256, + "learning_rate": 0.00011773841359678855, + "loss": 1.1523, + "step": 9970 + }, + { + "epoch": 1.7752849002849003, + "grad_norm": 0.6337170004844666, + "learning_rate": 0.00011772463792712738, + "loss": 1.1998, + "step": 9971 + }, + { + "epoch": 1.7754629629629628, + "grad_norm": 0.6400532126426697, + "learning_rate": 0.00011771086191018874, + "loss": 0.9543, + "step": 9972 + }, + { + "epoch": 1.7756410256410255, + "grad_norm": 0.6431527733802795, + "learning_rate": 0.00011769708554624257, + "loss": 0.8164, + "step": 9973 + }, + { + "epoch": 1.7758190883190883, + "grad_norm": 0.7303599119186401, + "learning_rate": 0.00011768330883555876, + "loss": 0.9553, + "step": 9974 + }, + { + "epoch": 1.775997150997151, + "grad_norm": 0.7838605642318726, + "learning_rate": 0.00011766953177840725, + "loss": 0.9759, + "step": 9975 + }, + { + "epoch": 1.7761752136752138, + "grad_norm": 0.6505265831947327, + "learning_rate": 0.00011765575437505796, + "loss": 0.8527, + "step": 9976 + }, + { + "epoch": 1.7763532763532763, + "grad_norm": 0.7336180806159973, + "learning_rate": 0.00011764197662578086, + "loss": 1.1098, + "step": 9977 + }, + { + "epoch": 1.776531339031339, + "grad_norm": 0.7040138244628906, + "learning_rate": 0.00011762819853084586, + "loss": 1.1289, + "step": 9978 + }, + { + "epoch": 1.7767094017094016, + "grad_norm": 0.6414867043495178, + "learning_rate": 0.00011761442009052293, + "loss": 1.0826, + "step": 9979 + }, + { + "epoch": 1.7768874643874644, + "grad_norm": 0.6760666370391846, + "learning_rate": 0.00011760064130508204, + "loss": 1.0188, + "step": 9980 + }, + { + "epoch": 1.7770655270655271, + "grad_norm": 0.7864978909492493, + "learning_rate": 0.00011758686217479316, + "loss": 1.1938, + "step": 9981 + }, + { + "epoch": 1.7772435897435899, + "grad_norm": 0.7964870929718018, + "learning_rate": 0.00011757308269992622, + "loss": 
0.9876, + "step": 9982 + }, + { + "epoch": 1.7774216524216524, + "grad_norm": 0.5158692002296448, + "learning_rate": 0.00011755930288075123, + "loss": 0.6508, + "step": 9983 + }, + { + "epoch": 1.7775997150997151, + "grad_norm": 0.7208606600761414, + "learning_rate": 0.00011754552271753819, + "loss": 1.0738, + "step": 9984 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 0.6811334490776062, + "learning_rate": 0.00011753174221055705, + "loss": 1.1216, + "step": 9985 + }, + { + "epoch": 1.7779558404558404, + "grad_norm": 0.6389986276626587, + "learning_rate": 0.00011751796136007787, + "loss": 0.9664, + "step": 9986 + }, + { + "epoch": 1.7781339031339032, + "grad_norm": 0.7081875205039978, + "learning_rate": 0.00011750418016637064, + "loss": 0.9365, + "step": 9987 + }, + { + "epoch": 1.778311965811966, + "grad_norm": 0.7291778326034546, + "learning_rate": 0.00011749039862970535, + "loss": 1.3222, + "step": 9988 + }, + { + "epoch": 1.7784900284900285, + "grad_norm": 0.6790453791618347, + "learning_rate": 0.000117476616750352, + "loss": 0.9537, + "step": 9989 + }, + { + "epoch": 1.7786680911680912, + "grad_norm": 0.6271076202392578, + "learning_rate": 0.00011746283452858069, + "loss": 0.9842, + "step": 9990 + }, + { + "epoch": 1.7788461538461537, + "grad_norm": 0.675628662109375, + "learning_rate": 0.00011744905196466138, + "loss": 0.8675, + "step": 9991 + }, + { + "epoch": 1.7790242165242165, + "grad_norm": 0.7328314185142517, + "learning_rate": 0.00011743526905886417, + "loss": 0.9793, + "step": 9992 + }, + { + "epoch": 1.7792022792022792, + "grad_norm": 0.698764979839325, + "learning_rate": 0.00011742148581145908, + "loss": 0.9527, + "step": 9993 + }, + { + "epoch": 1.779380341880342, + "grad_norm": 0.6911364793777466, + "learning_rate": 0.00011740770222271616, + "loss": 1.1069, + "step": 9994 + }, + { + "epoch": 1.7795584045584045, + "grad_norm": 0.6990836262702942, + "learning_rate": 0.00011739391829290547, + "loss": 0.9132, + "step": 9995 + }, + { + "epoch": 
1.7797364672364673, + "grad_norm": 0.7056801319122314, + "learning_rate": 0.0001173801340222971, + "loss": 1.053, + "step": 9996 + }, + { + "epoch": 1.7799145299145298, + "grad_norm": 0.7453791499137878, + "learning_rate": 0.0001173663494111611, + "loss": 0.8806, + "step": 9997 + }, + { + "epoch": 1.7800925925925926, + "grad_norm": 0.7211771011352539, + "learning_rate": 0.00011735256445976757, + "loss": 0.9968, + "step": 9998 + }, + { + "epoch": 1.7802706552706553, + "grad_norm": 0.7259734272956848, + "learning_rate": 0.00011733877916838656, + "loss": 1.167, + "step": 9999 + }, + { + "epoch": 1.780448717948718, + "grad_norm": 0.6931926012039185, + "learning_rate": 0.00011732499353728821, + "loss": 1.0634, + "step": 10000 + }, + { + "epoch": 1.7806267806267806, + "grad_norm": 0.6900074481964111, + "learning_rate": 0.00011731120756674259, + "loss": 0.9718, + "step": 10001 + }, + { + "epoch": 1.7808048433048433, + "grad_norm": 0.6817582845687866, + "learning_rate": 0.00011729742125701984, + "loss": 1.0896, + "step": 10002 + }, + { + "epoch": 1.7809829059829059, + "grad_norm": 0.6901891231536865, + "learning_rate": 0.00011728363460839003, + "loss": 1.0163, + "step": 10003 + }, + { + "epoch": 1.7811609686609686, + "grad_norm": 0.9138323664665222, + "learning_rate": 0.00011726984762112328, + "loss": 1.1713, + "step": 10004 + }, + { + "epoch": 1.7813390313390314, + "grad_norm": 0.6105810403823853, + "learning_rate": 0.00011725606029548977, + "loss": 0.9331, + "step": 10005 + }, + { + "epoch": 1.7815170940170941, + "grad_norm": 0.5605259537696838, + "learning_rate": 0.0001172422726317596, + "loss": 0.7154, + "step": 10006 + }, + { + "epoch": 1.7816951566951567, + "grad_norm": 0.6950963735580444, + "learning_rate": 0.00011722848463020292, + "loss": 1.0093, + "step": 10007 + }, + { + "epoch": 1.7818732193732194, + "grad_norm": 0.6806309819221497, + "learning_rate": 0.00011721469629108988, + "loss": 0.8662, + "step": 10008 + }, + { + "epoch": 1.782051282051282, + "grad_norm": 
0.7528520226478577, + "learning_rate": 0.00011720090761469063, + "loss": 0.8567, + "step": 10009 + }, + { + "epoch": 1.7822293447293447, + "grad_norm": 0.6617229580879211, + "learning_rate": 0.00011718711860127529, + "loss": 1.0378, + "step": 10010 + }, + { + "epoch": 1.7824074074074074, + "grad_norm": 0.6468376517295837, + "learning_rate": 0.00011717332925111411, + "loss": 1.0658, + "step": 10011 + }, + { + "epoch": 1.7825854700854702, + "grad_norm": 0.7141897082328796, + "learning_rate": 0.00011715953956447721, + "loss": 1.023, + "step": 10012 + }, + { + "epoch": 1.7827635327635327, + "grad_norm": 0.5777570605278015, + "learning_rate": 0.00011714574954163475, + "loss": 0.9154, + "step": 10013 + }, + { + "epoch": 1.7829415954415955, + "grad_norm": 0.7536137700080872, + "learning_rate": 0.00011713195918285695, + "loss": 0.9651, + "step": 10014 + }, + { + "epoch": 1.783119658119658, + "grad_norm": 0.6977683305740356, + "learning_rate": 0.00011711816848841402, + "loss": 0.7977, + "step": 10015 + }, + { + "epoch": 1.7832977207977208, + "grad_norm": 0.6522472500801086, + "learning_rate": 0.00011710437745857614, + "loss": 0.8834, + "step": 10016 + }, + { + "epoch": 1.7834757834757835, + "grad_norm": 0.6263057589530945, + "learning_rate": 0.0001170905860936135, + "loss": 1.0576, + "step": 10017 + }, + { + "epoch": 1.7836538461538463, + "grad_norm": 0.6470699310302734, + "learning_rate": 0.00011707679439379635, + "loss": 0.9412, + "step": 10018 + }, + { + "epoch": 1.7838319088319088, + "grad_norm": "Infinity", + "learning_rate": 0.00011707679439379635, + "loss": 1.1746, + "step": 10019 + }, + { + "epoch": 1.7840099715099715, + "grad_norm": 0.6022017002105713, + "learning_rate": 0.00011706300235939485, + "loss": 0.8945, + "step": 10020 + }, + { + "epoch": 1.784188034188034, + "grad_norm": 0.637208104133606, + "learning_rate": 0.00011704920999067927, + "loss": 1.0215, + "step": 10021 + }, + { + "epoch": 1.7843660968660968, + "grad_norm": 0.7467851042747498, + "learning_rate": 
0.00011703541728791987, + "loss": 1.0341, + "step": 10022 + }, + { + "epoch": 1.7845441595441596, + "grad_norm": 0.7562711238861084, + "learning_rate": 0.00011702162425138683, + "loss": 0.9748, + "step": 10023 + }, + { + "epoch": 1.7847222222222223, + "grad_norm": 0.6480089426040649, + "learning_rate": 0.00011700783088135043, + "loss": 1.05, + "step": 10024 + }, + { + "epoch": 1.7849002849002849, + "grad_norm": 0.6293981671333313, + "learning_rate": 0.00011699403717808091, + "loss": 1.0376, + "step": 10025 + }, + { + "epoch": 1.7850783475783476, + "grad_norm": 0.6821253895759583, + "learning_rate": 0.00011698024314184853, + "loss": 1.0542, + "step": 10026 + }, + { + "epoch": 1.7852564102564101, + "grad_norm": 0.6681216359138489, + "learning_rate": 0.00011696644877292356, + "loss": 1.0018, + "step": 10027 + }, + { + "epoch": 1.7854344729344729, + "grad_norm": 0.6788804531097412, + "learning_rate": 0.00011695265407157628, + "loss": 1.1823, + "step": 10028 + }, + { + "epoch": 1.7856125356125356, + "grad_norm": 0.6147881150245667, + "learning_rate": 0.00011693885903807697, + "loss": 0.9246, + "step": 10029 + }, + { + "epoch": 1.7857905982905984, + "grad_norm": 0.7952296137809753, + "learning_rate": 0.00011692506367269588, + "loss": 1.0528, + "step": 10030 + }, + { + "epoch": 1.785968660968661, + "grad_norm": 0.6985954642295837, + "learning_rate": 0.00011691126797570333, + "loss": 0.9173, + "step": 10031 + }, + { + "epoch": 1.7861467236467237, + "grad_norm": 0.6211223602294922, + "learning_rate": 0.00011689747194736961, + "loss": 0.7527, + "step": 10032 + }, + { + "epoch": 1.7863247863247862, + "grad_norm": 0.7531208992004395, + "learning_rate": 0.00011688367558796507, + "loss": 1.1087, + "step": 10033 + }, + { + "epoch": 1.786502849002849, + "grad_norm": 0.7742924690246582, + "learning_rate": 0.00011686987889775996, + "loss": 1.1512, + "step": 10034 + }, + { + "epoch": 1.7866809116809117, + "grad_norm": 0.7046231627464294, + "learning_rate": 0.00011685608187702459, + 
"loss": 1.0516, + "step": 10035 + }, + { + "epoch": 1.7868589743589745, + "grad_norm": 0.6264076232910156, + "learning_rate": 0.00011684228452602933, + "loss": 0.8938, + "step": 10036 + }, + { + "epoch": 1.7870370370370372, + "grad_norm": 0.6342145800590515, + "learning_rate": 0.00011682848684504448, + "loss": 0.8177, + "step": 10037 + }, + { + "epoch": 1.7872150997150997, + "grad_norm": 0.6609861254692078, + "learning_rate": 0.00011681468883434041, + "loss": 0.9692, + "step": 10038 + }, + { + "epoch": 1.7873931623931623, + "grad_norm": 0.7918622493743896, + "learning_rate": 0.00011680089049418743, + "loss": 0.8246, + "step": 10039 + }, + { + "epoch": 1.787571225071225, + "grad_norm": 0.697712779045105, + "learning_rate": 0.00011678709182485592, + "loss": 0.8981, + "step": 10040 + }, + { + "epoch": 1.7877492877492878, + "grad_norm": 0.6747658252716064, + "learning_rate": 0.00011677329282661617, + "loss": 1.1243, + "step": 10041 + }, + { + "epoch": 1.7879273504273505, + "grad_norm": 0.6525771617889404, + "learning_rate": 0.00011675949349973863, + "loss": 0.852, + "step": 10042 + }, + { + "epoch": 1.7881054131054133, + "grad_norm": 0.7062464952468872, + "learning_rate": 0.00011674569384449363, + "loss": 1.2582, + "step": 10043 + }, + { + "epoch": 1.7882834757834758, + "grad_norm": 0.6453786492347717, + "learning_rate": 0.00011673189386115154, + "loss": 0.868, + "step": 10044 + }, + { + "epoch": 1.7884615384615383, + "grad_norm": 0.7939708232879639, + "learning_rate": 0.00011671809354998273, + "loss": 0.7553, + "step": 10045 + }, + { + "epoch": 1.788639601139601, + "grad_norm": 0.6466066837310791, + "learning_rate": 0.00011670429291125761, + "loss": 0.942, + "step": 10046 + }, + { + "epoch": 1.7888176638176638, + "grad_norm": 0.7380510568618774, + "learning_rate": 0.00011669049194524657, + "loss": 1.044, + "step": 10047 + }, + { + "epoch": 1.7889957264957266, + "grad_norm": 0.6719707250595093, + "learning_rate": 0.00011667669065222002, + "loss": 1.1624, + "step": 
10048 + }, + { + "epoch": 1.7891737891737893, + "grad_norm": 0.6996603012084961, + "learning_rate": 0.00011666288903244837, + "loss": 1.001, + "step": 10049 + }, + { + "epoch": 1.7893518518518519, + "grad_norm": 0.696590006351471, + "learning_rate": 0.00011664908708620202, + "loss": 1.17, + "step": 10050 + }, + { + "epoch": 1.7895299145299144, + "grad_norm": 0.7226764559745789, + "learning_rate": 0.00011663528481375137, + "loss": 1.0762, + "step": 10051 + }, + { + "epoch": 1.7897079772079771, + "grad_norm": 0.6117866635322571, + "learning_rate": 0.00011662148221536689, + "loss": 0.9199, + "step": 10052 + }, + { + "epoch": 1.78988603988604, + "grad_norm": 0.6424985527992249, + "learning_rate": 0.000116607679291319, + "loss": 1.1672, + "step": 10053 + }, + { + "epoch": 1.7900641025641026, + "grad_norm": 0.6390290856361389, + "learning_rate": 0.00011659387604187813, + "loss": 1.1895, + "step": 10054 + }, + { + "epoch": 1.7902421652421654, + "grad_norm": 0.6553205251693726, + "learning_rate": 0.00011658007246731473, + "loss": 1.0967, + "step": 10055 + }, + { + "epoch": 1.790420227920228, + "grad_norm": 0.7737570405006409, + "learning_rate": 0.00011656626856789922, + "loss": 0.9637, + "step": 10056 + }, + { + "epoch": 1.7905982905982905, + "grad_norm": 0.644296407699585, + "learning_rate": 0.00011655246434390212, + "loss": 0.9933, + "step": 10057 + }, + { + "epoch": 1.7907763532763532, + "grad_norm": 0.8154410123825073, + "learning_rate": 0.00011653865979559388, + "loss": 0.9623, + "step": 10058 + }, + { + "epoch": 1.790954415954416, + "grad_norm": 0.7181384563446045, + "learning_rate": 0.00011652485492324495, + "loss": 0.9113, + "step": 10059 + }, + { + "epoch": 1.7911324786324787, + "grad_norm": 0.7835097908973694, + "learning_rate": 0.00011651104972712582, + "loss": 1.0804, + "step": 10060 + }, + { + "epoch": 1.7913105413105415, + "grad_norm": 0.6843693852424622, + "learning_rate": 0.00011649724420750691, + "loss": 1.0242, + "step": 10061 + }, + { + "epoch": 
1.791488603988604, + "grad_norm": 0.8364703059196472, + "learning_rate": 0.00011648343836465885, + "loss": 0.8445, + "step": 10062 + }, + { + "epoch": 1.7916666666666665, + "grad_norm": 0.7122092843055725, + "learning_rate": 0.00011646963219885201, + "loss": 1.0453, + "step": 10063 + }, + { + "epoch": 1.7918447293447293, + "grad_norm": 0.7018755078315735, + "learning_rate": 0.00011645582571035696, + "loss": 0.9753, + "step": 10064 + }, + { + "epoch": 1.792022792022792, + "grad_norm": 0.6522594094276428, + "learning_rate": 0.00011644201889944419, + "loss": 1.0328, + "step": 10065 + }, + { + "epoch": 1.7922008547008548, + "grad_norm": 0.70301353931427, + "learning_rate": 0.00011642821176638419, + "loss": 0.9143, + "step": 10066 + }, + { + "epoch": 1.7923789173789175, + "grad_norm": 0.6255469918251038, + "learning_rate": 0.0001164144043114475, + "loss": 0.9527, + "step": 10067 + }, + { + "epoch": 1.79255698005698, + "grad_norm": 0.6780602931976318, + "learning_rate": 0.0001164005965349047, + "loss": 0.9192, + "step": 10068 + }, + { + "epoch": 1.7927350427350426, + "grad_norm": 0.6025984287261963, + "learning_rate": 0.00011638678843702626, + "loss": 0.9055, + "step": 10069 + }, + { + "epoch": 1.7929131054131053, + "grad_norm": 0.6430829763412476, + "learning_rate": 0.00011637298001808275, + "loss": 0.9359, + "step": 10070 + }, + { + "epoch": 1.793091168091168, + "grad_norm": 0.6388106942176819, + "learning_rate": 0.0001163591712783447, + "loss": 0.8847, + "step": 10071 + }, + { + "epoch": 1.7932692307692308, + "grad_norm": 0.706347644329071, + "learning_rate": 0.00011634536221808265, + "loss": 0.9055, + "step": 10072 + }, + { + "epoch": 1.7934472934472936, + "grad_norm": 0.661226749420166, + "learning_rate": 0.00011633155283756721, + "loss": 1.118, + "step": 10073 + }, + { + "epoch": 1.7936253561253561, + "grad_norm": 0.543207049369812, + "learning_rate": 0.00011631774313706891, + "loss": 0.8856, + "step": 10074 + }, + { + "epoch": 1.7938034188034186, + "grad_norm": 
0.6514154672622681, + "learning_rate": 0.00011630393311685835, + "loss": 0.8967, + "step": 10075 + }, + { + "epoch": 1.7939814814814814, + "grad_norm": 0.8669198155403137, + "learning_rate": 0.00011629012277720607, + "loss": 1.0362, + "step": 10076 + }, + { + "epoch": 1.7941595441595442, + "grad_norm": 0.7256068587303162, + "learning_rate": 0.00011627631211838266, + "loss": 1.1948, + "step": 10077 + }, + { + "epoch": 1.794337606837607, + "grad_norm": 0.6504935622215271, + "learning_rate": 0.00011626250114065875, + "loss": 0.8309, + "step": 10078 + }, + { + "epoch": 1.7945156695156697, + "grad_norm": 0.6964160799980164, + "learning_rate": 0.0001162486898443049, + "loss": 0.9593, + "step": 10079 + }, + { + "epoch": 1.7946937321937322, + "grad_norm": 0.668727695941925, + "learning_rate": 0.00011623487822959174, + "loss": 0.8897, + "step": 10080 + }, + { + "epoch": 1.7948717948717947, + "grad_norm": 0.6907223463058472, + "learning_rate": 0.00011622106629678986, + "loss": 0.897, + "step": 10081 + }, + { + "epoch": 1.7950498575498575, + "grad_norm": 0.6652865409851074, + "learning_rate": 0.00011620725404616985, + "loss": 0.9321, + "step": 10082 + }, + { + "epoch": 1.7952279202279202, + "grad_norm": 0.6523811221122742, + "learning_rate": 0.00011619344147800239, + "loss": 0.8991, + "step": 10083 + }, + { + "epoch": 1.795405982905983, + "grad_norm": 0.6162952184677124, + "learning_rate": 0.0001161796285925581, + "loss": 0.8061, + "step": 10084 + }, + { + "epoch": 1.7955840455840457, + "grad_norm": 0.670606791973114, + "learning_rate": 0.0001161658153901076, + "loss": 0.9341, + "step": 10085 + }, + { + "epoch": 1.7957621082621082, + "grad_norm": 0.6372489333152771, + "learning_rate": 0.00011615200187092148, + "loss": 1.1049, + "step": 10086 + }, + { + "epoch": 1.7959401709401708, + "grad_norm": 0.7311037182807922, + "learning_rate": 0.00011613818803527045, + "loss": 1.0881, + "step": 10087 + }, + { + "epoch": 1.7961182336182335, + "grad_norm": 0.7440751194953918, + 
"learning_rate": 0.00011612437388342518, + "loss": 0.9487, + "step": 10088 + }, + { + "epoch": 1.7962962962962963, + "grad_norm": 0.6605934500694275, + "learning_rate": 0.00011611055941565629, + "loss": 0.8757, + "step": 10089 + }, + { + "epoch": 1.796474358974359, + "grad_norm": 0.7546001076698303, + "learning_rate": 0.00011609674463223446, + "loss": 0.9368, + "step": 10090 + }, + { + "epoch": 1.7966524216524218, + "grad_norm": 0.7001389861106873, + "learning_rate": 0.00011608292953343036, + "loss": 0.9098, + "step": 10091 + }, + { + "epoch": 1.7968304843304843, + "grad_norm": 0.6898102760314941, + "learning_rate": 0.00011606911411951462, + "loss": 0.8821, + "step": 10092 + }, + { + "epoch": 1.797008547008547, + "grad_norm": 0.7020773887634277, + "learning_rate": 0.00011605529839075801, + "loss": 1.2775, + "step": 10093 + }, + { + "epoch": 1.7971866096866096, + "grad_norm": 0.6061446070671082, + "learning_rate": 0.0001160414823474312, + "loss": 1.0156, + "step": 10094 + }, + { + "epoch": 1.7973646723646723, + "grad_norm": 0.6746069192886353, + "learning_rate": 0.00011602766598980484, + "loss": 0.8223, + "step": 10095 + }, + { + "epoch": 1.797542735042735, + "grad_norm": 0.655829131603241, + "learning_rate": 0.00011601384931814967, + "loss": 0.9482, + "step": 10096 + }, + { + "epoch": 1.7977207977207978, + "grad_norm": 0.6762703061103821, + "learning_rate": 0.00011600003233273636, + "loss": 1.0191, + "step": 10097 + }, + { + "epoch": 1.7978988603988604, + "grad_norm": 0.7610527276992798, + "learning_rate": 0.00011598621503383566, + "loss": 1.0771, + "step": 10098 + }, + { + "epoch": 1.7980769230769231, + "grad_norm": 0.6857240200042725, + "learning_rate": 0.0001159723974217183, + "loss": 0.8325, + "step": 10099 + }, + { + "epoch": 1.7982549857549857, + "grad_norm": 0.6897954940795898, + "learning_rate": 0.00011595857949665501, + "loss": 1.0064, + "step": 10100 + }, + { + "epoch": 1.7984330484330484, + "grad_norm": 0.7023211717605591, + "learning_rate": 
0.00011594476125891649, + "loss": 1.1346, + "step": 10101 + }, + { + "epoch": 1.7986111111111112, + "grad_norm": 0.8131003975868225, + "learning_rate": 0.00011593094270877347, + "loss": 1.0384, + "step": 10102 + }, + { + "epoch": 1.798789173789174, + "grad_norm": 0.6504445672035217, + "learning_rate": 0.00011591712384649676, + "loss": 0.8172, + "step": 10103 + }, + { + "epoch": 1.7989672364672364, + "grad_norm": 0.7379748821258545, + "learning_rate": 0.00011590330467235704, + "loss": 1.0118, + "step": 10104 + }, + { + "epoch": 1.7991452991452992, + "grad_norm": 0.8867329955101013, + "learning_rate": 0.0001158894851866251, + "loss": 1.023, + "step": 10105 + }, + { + "epoch": 1.7993233618233617, + "grad_norm": 0.7057412266731262, + "learning_rate": 0.00011587566538957173, + "loss": 0.8415, + "step": 10106 + }, + { + "epoch": 1.7995014245014245, + "grad_norm": 0.7479654550552368, + "learning_rate": 0.00011586184528146769, + "loss": 0.9663, + "step": 10107 + }, + { + "epoch": 1.7996794871794872, + "grad_norm": 0.6280845403671265, + "learning_rate": 0.00011584802486258368, + "loss": 0.973, + "step": 10108 + }, + { + "epoch": 1.79985754985755, + "grad_norm": 0.6735749840736389, + "learning_rate": 0.00011583420413319059, + "loss": 0.8631, + "step": 10109 + }, + { + "epoch": 1.8000356125356125, + "grad_norm": 0.5940406918525696, + "learning_rate": 0.00011582038309355918, + "loss": 0.8533, + "step": 10110 + }, + { + "epoch": 1.8002136752136753, + "grad_norm": 0.6923874020576477, + "learning_rate": 0.00011580656174396021, + "loss": 1.1105, + "step": 10111 + }, + { + "epoch": 1.8003917378917378, + "grad_norm": 0.6996715664863586, + "learning_rate": 0.00011579274008466447, + "loss": 0.9952, + "step": 10112 + }, + { + "epoch": 1.8005698005698005, + "grad_norm": 0.656561553478241, + "learning_rate": 0.00011577891811594281, + "loss": 0.9621, + "step": 10113 + }, + { + "epoch": 1.8007478632478633, + "grad_norm": 0.7121242880821228, + "learning_rate": 0.00011576509583806605, + 
"loss": 0.8658, + "step": 10114 + }, + { + "epoch": 1.800925925925926, + "grad_norm": 0.7864459753036499, + "learning_rate": 0.00011575127325130498, + "loss": 0.9867, + "step": 10115 + }, + { + "epoch": 1.8011039886039886, + "grad_norm": 0.6086452007293701, + "learning_rate": 0.00011573745035593042, + "loss": 0.8625, + "step": 10116 + }, + { + "epoch": 1.8012820512820513, + "grad_norm": 0.6553642749786377, + "learning_rate": 0.00011572362715221321, + "loss": 0.8475, + "step": 10117 + }, + { + "epoch": 1.8014601139601139, + "grad_norm": 0.6677348017692566, + "learning_rate": 0.00011570980364042419, + "loss": 0.9672, + "step": 10118 + }, + { + "epoch": 1.8016381766381766, + "grad_norm": 0.6275015473365784, + "learning_rate": 0.0001156959798208342, + "loss": 0.8663, + "step": 10119 + }, + { + "epoch": 1.8018162393162394, + "grad_norm": 0.787568211555481, + "learning_rate": 0.0001156821556937141, + "loss": 1.0188, + "step": 10120 + }, + { + "epoch": 1.801994301994302, + "grad_norm": 0.6983163356781006, + "learning_rate": 0.00011566833125933473, + "loss": 1.0767, + "step": 10121 + }, + { + "epoch": 1.8021723646723646, + "grad_norm": 0.7008936405181885, + "learning_rate": 0.00011565450651796695, + "loss": 1.0116, + "step": 10122 + }, + { + "epoch": 1.8023504273504274, + "grad_norm": 0.7694976925849915, + "learning_rate": 0.00011564068146988163, + "loss": 1.0227, + "step": 10123 + }, + { + "epoch": 1.80252849002849, + "grad_norm": 0.9530014991760254, + "learning_rate": 0.00011562685611534967, + "loss": 0.907, + "step": 10124 + }, + { + "epoch": 1.8027065527065527, + "grad_norm": 0.6714984178543091, + "learning_rate": 0.00011561303045464189, + "loss": 0.9501, + "step": 10125 + }, + { + "epoch": 1.8028846153846154, + "grad_norm": 0.7233797311782837, + "learning_rate": 0.00011559920448802925, + "loss": 1.021, + "step": 10126 + }, + { + "epoch": 1.8030626780626782, + "grad_norm": 0.7600540518760681, + "learning_rate": 0.0001155853782157826, + "loss": 1.1056, + "step": 10127 + 
}, + { + "epoch": 1.8032407407407407, + "grad_norm": 0.7836297750473022, + "learning_rate": 0.00011557155163817281, + "loss": 0.9906, + "step": 10128 + }, + { + "epoch": 1.8034188034188035, + "grad_norm": 0.7161104083061218, + "learning_rate": 0.00011555772475547084, + "loss": 0.9541, + "step": 10129 + }, + { + "epoch": 1.803596866096866, + "grad_norm": 0.6613732576370239, + "learning_rate": 0.00011554389756794757, + "loss": 0.9188, + "step": 10130 + }, + { + "epoch": 1.8037749287749287, + "grad_norm": 0.6415915489196777, + "learning_rate": 0.00011553007007587391, + "loss": 0.9928, + "step": 10131 + }, + { + "epoch": 1.8039529914529915, + "grad_norm": 0.7730516195297241, + "learning_rate": 0.0001155162422795208, + "loss": 1.0654, + "step": 10132 + }, + { + "epoch": 1.8041310541310542, + "grad_norm": 0.6769654750823975, + "learning_rate": 0.00011550241417915913, + "loss": 1.0678, + "step": 10133 + }, + { + "epoch": 1.8043091168091168, + "grad_norm": 0.6542425751686096, + "learning_rate": 0.00011548858577505988, + "loss": 0.9796, + "step": 10134 + }, + { + "epoch": 1.8044871794871795, + "grad_norm": 0.7282404899597168, + "learning_rate": 0.00011547475706749395, + "loss": 1.0314, + "step": 10135 + }, + { + "epoch": 1.804665242165242, + "grad_norm": 0.6450245976448059, + "learning_rate": 0.00011546092805673232, + "loss": 0.9564, + "step": 10136 + }, + { + "epoch": 1.8048433048433048, + "grad_norm": 0.65577632188797, + "learning_rate": 0.0001154470987430459, + "loss": 1.0219, + "step": 10137 + }, + { + "epoch": 1.8050213675213675, + "grad_norm": 0.7151737809181213, + "learning_rate": 0.00011543326912670567, + "loss": 0.9245, + "step": 10138 + }, + { + "epoch": 1.8051994301994303, + "grad_norm": 0.6695905327796936, + "learning_rate": 0.00011541943920798259, + "loss": 0.9535, + "step": 10139 + }, + { + "epoch": 1.8053774928774928, + "grad_norm": 0.7443813681602478, + "learning_rate": 0.00011540560898714767, + "loss": 1.1697, + "step": 10140 + }, + { + "epoch": 
1.8055555555555556, + "grad_norm": 0.5701992511749268, + "learning_rate": 0.0001153917784644718, + "loss": 0.7868, + "step": 10141 + }, + { + "epoch": 1.805733618233618, + "grad_norm": 0.6992354989051819, + "learning_rate": 0.00011537794764022605, + "loss": 0.9856, + "step": 10142 + }, + { + "epoch": 1.8059116809116809, + "grad_norm": 0.6354477405548096, + "learning_rate": 0.00011536411651468131, + "loss": 0.8752, + "step": 10143 + }, + { + "epoch": 1.8060897435897436, + "grad_norm": 0.6952932476997375, + "learning_rate": 0.00011535028508810864, + "loss": 0.9446, + "step": 10144 + }, + { + "epoch": 1.8062678062678064, + "grad_norm": 0.5527541637420654, + "learning_rate": 0.00011533645336077901, + "loss": 0.5486, + "step": 10145 + }, + { + "epoch": 1.806445868945869, + "grad_norm": 0.685046374797821, + "learning_rate": 0.00011532262133296345, + "loss": 0.9529, + "step": 10146 + }, + { + "epoch": 1.8066239316239316, + "grad_norm": 0.6927558779716492, + "learning_rate": 0.00011530878900493296, + "loss": 1.1758, + "step": 10147 + }, + { + "epoch": 1.8068019943019942, + "grad_norm": 0.6758309602737427, + "learning_rate": 0.00011529495637695855, + "loss": 1.0076, + "step": 10148 + }, + { + "epoch": 1.806980056980057, + "grad_norm": 0.6739441156387329, + "learning_rate": 0.00011528112344931121, + "loss": 1.1914, + "step": 10149 + }, + { + "epoch": 1.8071581196581197, + "grad_norm": 0.7031944394111633, + "learning_rate": 0.00011526729022226204, + "loss": 0.783, + "step": 10150 + }, + { + "epoch": 1.8073361823361824, + "grad_norm": 0.6476930975914001, + "learning_rate": 0.00011525345669608202, + "loss": 0.9595, + "step": 10151 + }, + { + "epoch": 1.8075142450142452, + "grad_norm": 0.710498571395874, + "learning_rate": 0.00011523962287104222, + "loss": 0.8821, + "step": 10152 + }, + { + "epoch": 1.8076923076923077, + "grad_norm": 0.6664412617683411, + "learning_rate": 0.00011522578874741365, + "loss": 1.0182, + "step": 10153 + }, + { + "epoch": 1.8078703703703702, + 
"grad_norm": 0.8374263048171997, + "learning_rate": 0.00011521195432546737, + "loss": 0.9394, + "step": 10154 + }, + { + "epoch": 1.808048433048433, + "grad_norm": 0.6770764589309692, + "learning_rate": 0.00011519811960547447, + "loss": 1.0568, + "step": 10155 + }, + { + "epoch": 1.8082264957264957, + "grad_norm": 0.7014045715332031, + "learning_rate": 0.00011518428458770595, + "loss": 1.1705, + "step": 10156 + }, + { + "epoch": 1.8084045584045585, + "grad_norm": 0.6590061187744141, + "learning_rate": 0.00011517044927243295, + "loss": 1.1233, + "step": 10157 + }, + { + "epoch": 1.8085826210826212, + "grad_norm": 0.6093801856040955, + "learning_rate": 0.00011515661365992647, + "loss": 0.953, + "step": 10158 + }, + { + "epoch": 1.8087606837606838, + "grad_norm": 0.6197089552879333, + "learning_rate": 0.00011514277775045768, + "loss": 0.9414, + "step": 10159 + }, + { + "epoch": 1.8089387464387463, + "grad_norm": 0.7530463337898254, + "learning_rate": 0.00011512894154429759, + "loss": 0.9168, + "step": 10160 + }, + { + "epoch": 1.809116809116809, + "grad_norm": 0.6051347851753235, + "learning_rate": 0.00011511510504171735, + "loss": 0.9132, + "step": 10161 + }, + { + "epoch": 1.8092948717948718, + "grad_norm": 0.6388311982154846, + "learning_rate": 0.000115101268242988, + "loss": 0.6551, + "step": 10162 + }, + { + "epoch": 1.8094729344729346, + "grad_norm": 0.7040972709655762, + "learning_rate": 0.00011508743114838063, + "loss": 0.9409, + "step": 10163 + }, + { + "epoch": 1.8096509971509973, + "grad_norm": 0.7669548392295837, + "learning_rate": 0.00011507359375816644, + "loss": 1.0376, + "step": 10164 + }, + { + "epoch": 1.8098290598290598, + "grad_norm": 0.7309662699699402, + "learning_rate": 0.00011505975607261646, + "loss": 0.9071, + "step": 10165 + }, + { + "epoch": 1.8100071225071224, + "grad_norm": 0.6624547839164734, + "learning_rate": 0.00011504591809200187, + "loss": 1.0765, + "step": 10166 + }, + { + "epoch": 1.8101851851851851, + "grad_norm": 
0.7719045281410217, + "learning_rate": 0.00011503207981659376, + "loss": 0.9244, + "step": 10167 + }, + { + "epoch": 1.8103632478632479, + "grad_norm": 0.6701484322547913, + "learning_rate": 0.0001150182412466633, + "loss": 0.9475, + "step": 10168 + }, + { + "epoch": 1.8105413105413106, + "grad_norm": 0.5604981184005737, + "learning_rate": 0.00011500440238248154, + "loss": 0.6268, + "step": 10169 + }, + { + "epoch": 1.8107193732193734, + "grad_norm": 0.6736510992050171, + "learning_rate": 0.00011499056322431973, + "loss": 0.9088, + "step": 10170 + }, + { + "epoch": 1.810897435897436, + "grad_norm": 0.7428455948829651, + "learning_rate": 0.00011497672377244897, + "loss": 0.9298, + "step": 10171 + }, + { + "epoch": 1.8110754985754984, + "grad_norm": 0.6543142795562744, + "learning_rate": 0.00011496288402714042, + "loss": 0.8863, + "step": 10172 + }, + { + "epoch": 1.8112535612535612, + "grad_norm": 0.6809250712394714, + "learning_rate": 0.00011494904398866524, + "loss": 0.977, + "step": 10173 + }, + { + "epoch": 1.811431623931624, + "grad_norm": 0.8105120062828064, + "learning_rate": 0.00011493520365729456, + "loss": 1.2115, + "step": 10174 + }, + { + "epoch": 1.8116096866096867, + "grad_norm": 0.6985095143318176, + "learning_rate": 0.00011492136303329964, + "loss": 0.8233, + "step": 10175 + }, + { + "epoch": 1.8117877492877494, + "grad_norm": 0.7198361754417419, + "learning_rate": 0.00011490752211695158, + "loss": 1.0552, + "step": 10176 + }, + { + "epoch": 1.811965811965812, + "grad_norm": 0.7077036499977112, + "learning_rate": 0.0001148936809085216, + "loss": 0.9171, + "step": 10177 + }, + { + "epoch": 1.8121438746438745, + "grad_norm": 0.9362925887107849, + "learning_rate": 0.00011487983940828089, + "loss": 0.9042, + "step": 10178 + }, + { + "epoch": 1.8123219373219372, + "grad_norm": 0.6732819676399231, + "learning_rate": 0.0001148659976165006, + "loss": 1.1033, + "step": 10179 + }, + { + "epoch": 1.8125, + "grad_norm": 0.747702419757843, + "learning_rate": 
0.00011485215553345201, + "loss": 1.0692, + "step": 10180 + }, + { + "epoch": 1.8126780626780628, + "grad_norm": 0.7011259198188782, + "learning_rate": 0.00011483831315940627, + "loss": 0.9278, + "step": 10181 + }, + { + "epoch": 1.8128561253561255, + "grad_norm": 0.8542702198028564, + "learning_rate": 0.00011482447049463462, + "loss": 0.9476, + "step": 10182 + }, + { + "epoch": 1.813034188034188, + "grad_norm": 0.6975166201591492, + "learning_rate": 0.00011481062753940825, + "loss": 0.9486, + "step": 10183 + }, + { + "epoch": 1.8132122507122506, + "grad_norm": 0.8239036798477173, + "learning_rate": 0.0001147967842939984, + "loss": 1.0518, + "step": 10184 + }, + { + "epoch": 1.8133903133903133, + "grad_norm": 0.7559717297554016, + "learning_rate": 0.00011478294075867628, + "loss": 1.1877, + "step": 10185 + }, + { + "epoch": 1.813568376068376, + "grad_norm": 0.6755532026290894, + "learning_rate": 0.00011476909693371318, + "loss": 0.9287, + "step": 10186 + }, + { + "epoch": 1.8137464387464388, + "grad_norm": 0.6561332941055298, + "learning_rate": 0.0001147552528193803, + "loss": 0.83, + "step": 10187 + }, + { + "epoch": 1.8139245014245016, + "grad_norm": 0.7223508954048157, + "learning_rate": 0.00011474140841594887, + "loss": 1.1259, + "step": 10188 + }, + { + "epoch": 1.814102564102564, + "grad_norm": 0.7920593023300171, + "learning_rate": 0.0001147275637236902, + "loss": 1.0925, + "step": 10189 + }, + { + "epoch": 1.8142806267806266, + "grad_norm": 0.6896616816520691, + "learning_rate": 0.00011471371874287546, + "loss": 1.0204, + "step": 10190 + }, + { + "epoch": 1.8144586894586894, + "grad_norm": 0.6149865388870239, + "learning_rate": 0.00011469987347377602, + "loss": 1.1249, + "step": 10191 + }, + { + "epoch": 1.8146367521367521, + "grad_norm": 0.6650002598762512, + "learning_rate": 0.00011468602791666307, + "loss": 0.9723, + "step": 10192 + }, + { + "epoch": 1.8148148148148149, + "grad_norm": 0.7298738956451416, + "learning_rate": 0.00011467218207180792, + 
"loss": 1.0225, + "step": 10193 + }, + { + "epoch": 1.8149928774928776, + "grad_norm": 0.8075628876686096, + "learning_rate": 0.00011465833593948183, + "loss": 1.0429, + "step": 10194 + }, + { + "epoch": 1.8151709401709402, + "grad_norm": 0.8196593523025513, + "learning_rate": 0.0001146444895199561, + "loss": 0.9148, + "step": 10195 + }, + { + "epoch": 1.8153490028490027, + "grad_norm": 0.6394698023796082, + "learning_rate": 0.00011463064281350204, + "loss": 0.9781, + "step": 10196 + }, + { + "epoch": 1.8155270655270654, + "grad_norm": 0.7302836775779724, + "learning_rate": 0.00011461679582039091, + "loss": 1.0394, + "step": 10197 + }, + { + "epoch": 1.8157051282051282, + "grad_norm": 0.7066670060157776, + "learning_rate": 0.00011460294854089404, + "loss": 1.1153, + "step": 10198 + }, + { + "epoch": 1.815883190883191, + "grad_norm": 0.6471068263053894, + "learning_rate": 0.0001145891009752827, + "loss": 1.1533, + "step": 10199 + }, + { + "epoch": 1.8160612535612537, + "grad_norm": 0.6842355132102966, + "learning_rate": 0.00011457525312382826, + "loss": 0.953, + "step": 10200 + }, + { + "epoch": 1.8162393162393162, + "grad_norm": 0.6720319986343384, + "learning_rate": 0.00011456140498680202, + "loss": 1.003, + "step": 10201 + }, + { + "epoch": 1.8164173789173788, + "grad_norm": 0.632017970085144, + "learning_rate": 0.00011454755656447527, + "loss": 0.8148, + "step": 10202 + }, + { + "epoch": 1.8165954415954415, + "grad_norm": 0.7193828225135803, + "learning_rate": 0.00011453370785711939, + "loss": 1.0098, + "step": 10203 + }, + { + "epoch": 1.8167735042735043, + "grad_norm": 0.7098045349121094, + "learning_rate": 0.00011451985886500566, + "loss": 1.1276, + "step": 10204 + }, + { + "epoch": 1.816951566951567, + "grad_norm": 0.7076733708381653, + "learning_rate": 0.00011450600958840547, + "loss": 1.1216, + "step": 10205 + }, + { + "epoch": 1.8171296296296298, + "grad_norm": 0.6864610314369202, + "learning_rate": 0.00011449216002759018, + "loss": 0.9896, + "step": 
10206 + }, + { + "epoch": 1.8173076923076923, + "grad_norm": 0.737727701663971, + "learning_rate": 0.0001144783101828311, + "loss": 0.9447, + "step": 10207 + }, + { + "epoch": 1.8174857549857548, + "grad_norm": 0.6562525033950806, + "learning_rate": 0.00011446446005439964, + "loss": 1.1208, + "step": 10208 + }, + { + "epoch": 1.8176638176638176, + "grad_norm": 0.7203826308250427, + "learning_rate": 0.0001144506096425671, + "loss": 1.1339, + "step": 10209 + }, + { + "epoch": 1.8178418803418803, + "grad_norm": 0.6657233834266663, + "learning_rate": 0.00011443675894760489, + "loss": 0.8307, + "step": 10210 + }, + { + "epoch": 1.818019943019943, + "grad_norm": 0.7032586932182312, + "learning_rate": 0.00011442290796978437, + "loss": 0.8546, + "step": 10211 + }, + { + "epoch": 1.8181980056980058, + "grad_norm": 0.6989460587501526, + "learning_rate": 0.00011440905670937696, + "loss": 1.0749, + "step": 10212 + }, + { + "epoch": 1.8183760683760684, + "grad_norm": 0.6461085677146912, + "learning_rate": 0.00011439520516665399, + "loss": 0.984, + "step": 10213 + }, + { + "epoch": 1.818554131054131, + "grad_norm": 0.7077372670173645, + "learning_rate": 0.00011438135334188689, + "loss": 1.0813, + "step": 10214 + }, + { + "epoch": 1.8187321937321936, + "grad_norm": 0.6724075675010681, + "learning_rate": 0.00011436750123534704, + "loss": 0.9975, + "step": 10215 + }, + { + "epoch": 1.8189102564102564, + "grad_norm": 0.6205753684043884, + "learning_rate": 0.00011435364884730583, + "loss": 0.7414, + "step": 10216 + }, + { + "epoch": 1.8190883190883191, + "grad_norm": 0.6416093707084656, + "learning_rate": 0.00011433979617803472, + "loss": 1.0024, + "step": 10217 + }, + { + "epoch": 1.819266381766382, + "grad_norm": 0.7817183136940002, + "learning_rate": 0.00011432594322780508, + "loss": 1.0577, + "step": 10218 + }, + { + "epoch": 1.8194444444444444, + "grad_norm": 0.688220202922821, + "learning_rate": 0.00011431208999688835, + "loss": 1.0301, + "step": 10219 + }, + { + "epoch": 
1.8196225071225072, + "grad_norm": 0.6464754343032837, + "learning_rate": 0.0001142982364855559, + "loss": 1.0608, + "step": 10220 + }, + { + "epoch": 1.8198005698005697, + "grad_norm": 0.6607306599617004, + "learning_rate": 0.00011428438269407926, + "loss": 1.1203, + "step": 10221 + }, + { + "epoch": 1.8199786324786325, + "grad_norm": 0.5779942870140076, + "learning_rate": 0.00011427052862272982, + "loss": 0.7895, + "step": 10222 + }, + { + "epoch": 1.8201566951566952, + "grad_norm": 0.7599068880081177, + "learning_rate": 0.000114256674271779, + "loss": 0.883, + "step": 10223 + }, + { + "epoch": 1.820334757834758, + "grad_norm": 0.6578865051269531, + "learning_rate": 0.00011424281964149824, + "loss": 1.101, + "step": 10224 + }, + { + "epoch": 1.8205128205128205, + "grad_norm": 0.7090746760368347, + "learning_rate": 0.00011422896473215905, + "loss": 0.9514, + "step": 10225 + }, + { + "epoch": 1.8206908831908832, + "grad_norm": 0.7537758946418762, + "learning_rate": 0.00011421510954403281, + "loss": 1.2193, + "step": 10226 + }, + { + "epoch": 1.8208689458689458, + "grad_norm": 0.670183002948761, + "learning_rate": 0.00011420125407739106, + "loss": 1.1408, + "step": 10227 + }, + { + "epoch": 1.8210470085470085, + "grad_norm": 0.742520809173584, + "learning_rate": 0.00011418739833250524, + "loss": 0.8826, + "step": 10228 + }, + { + "epoch": 1.8212250712250713, + "grad_norm": 0.6542800664901733, + "learning_rate": 0.00011417354230964683, + "loss": 1.0039, + "step": 10229 + }, + { + "epoch": 1.821403133903134, + "grad_norm": 0.6713709235191345, + "learning_rate": 0.00011415968600908727, + "loss": 0.9351, + "step": 10230 + }, + { + "epoch": 1.8215811965811965, + "grad_norm": 0.6794951558113098, + "learning_rate": 0.0001141458294310981, + "loss": 0.9491, + "step": 10231 + }, + { + "epoch": 1.8217592592592593, + "grad_norm": 0.6921972632408142, + "learning_rate": 0.00011413197257595079, + "loss": 1.1342, + "step": 10232 + }, + { + "epoch": 1.8219373219373218, + 
"grad_norm": 0.702586829662323, + "learning_rate": 0.00011411811544391682, + "loss": 0.9992, + "step": 10233 + }, + { + "epoch": 1.8221153846153846, + "grad_norm": 0.8147975206375122, + "learning_rate": 0.00011410425803526772, + "loss": 1.0507, + "step": 10234 + }, + { + "epoch": 1.8222934472934473, + "grad_norm": 0.66419517993927, + "learning_rate": 0.00011409040035027496, + "loss": 1.0426, + "step": 10235 + }, + { + "epoch": 1.82247150997151, + "grad_norm": 0.6132485866546631, + "learning_rate": 0.00011407654238921011, + "loss": 0.9859, + "step": 10236 + }, + { + "epoch": 1.8226495726495726, + "grad_norm": 0.7522366046905518, + "learning_rate": 0.00011406268415234462, + "loss": 0.9379, + "step": 10237 + }, + { + "epoch": 1.8228276353276354, + "grad_norm": 0.6335554122924805, + "learning_rate": 0.00011404882563995007, + "loss": 0.9322, + "step": 10238 + }, + { + "epoch": 1.823005698005698, + "grad_norm": 0.7577497363090515, + "learning_rate": 0.00011403496685229797, + "loss": 1.1383, + "step": 10239 + }, + { + "epoch": 1.8231837606837606, + "grad_norm": 0.6796886920928955, + "learning_rate": 0.00011402110778965982, + "loss": 1.0092, + "step": 10240 + }, + { + "epoch": 1.8233618233618234, + "grad_norm": 0.7676617503166199, + "learning_rate": 0.0001140072484523072, + "loss": 1.0137, + "step": 10241 + }, + { + "epoch": 1.8235398860398861, + "grad_norm": 0.7807821035385132, + "learning_rate": 0.00011399338884051165, + "loss": 0.8987, + "step": 10242 + }, + { + "epoch": 1.8237179487179487, + "grad_norm": 0.7169568538665771, + "learning_rate": 0.00011397952895454473, + "loss": 0.8984, + "step": 10243 + }, + { + "epoch": 1.8238960113960114, + "grad_norm": 0.6564654111862183, + "learning_rate": 0.00011396566879467793, + "loss": 1.0255, + "step": 10244 + }, + { + "epoch": 1.824074074074074, + "grad_norm": 0.7290034294128418, + "learning_rate": 0.00011395180836118292, + "loss": 0.9962, + "step": 10245 + }, + { + "epoch": 1.8242521367521367, + "grad_norm": 
0.6610758900642395, + "learning_rate": 0.00011393794765433115, + "loss": 1.102, + "step": 10246 + }, + { + "epoch": 1.8244301994301995, + "grad_norm": 0.6875932216644287, + "learning_rate": 0.0001139240866743943, + "loss": 0.9963, + "step": 10247 + }, + { + "epoch": 1.8246082621082622, + "grad_norm": 0.7595645189285278, + "learning_rate": 0.00011391022542164387, + "loss": 1.1285, + "step": 10248 + }, + { + "epoch": 1.8247863247863247, + "grad_norm": 0.6752721667289734, + "learning_rate": 0.0001138963638963515, + "loss": 0.9447, + "step": 10249 + }, + { + "epoch": 1.8249643874643875, + "grad_norm": 0.6697955131530762, + "learning_rate": 0.00011388250209878873, + "loss": 1.0804, + "step": 10250 + }, + { + "epoch": 1.82514245014245, + "grad_norm": 0.6546956896781921, + "learning_rate": 0.00011386864002922713, + "loss": 0.9626, + "step": 10251 + }, + { + "epoch": 1.8253205128205128, + "grad_norm": 0.8002896904945374, + "learning_rate": 0.00011385477768793838, + "loss": 1.1933, + "step": 10252 + }, + { + "epoch": 1.8254985754985755, + "grad_norm": 0.6566781401634216, + "learning_rate": 0.00011384091507519403, + "loss": 0.9802, + "step": 10253 + }, + { + "epoch": 1.8256766381766383, + "grad_norm": 0.617420494556427, + "learning_rate": 0.00011382705219126572, + "loss": 1.1098, + "step": 10254 + }, + { + "epoch": 1.8258547008547008, + "grad_norm": 0.6558036208152771, + "learning_rate": 0.00011381318903642504, + "loss": 1.0291, + "step": 10255 + }, + { + "epoch": 1.8260327635327636, + "grad_norm": 0.6295637488365173, + "learning_rate": 0.00011379932561094358, + "loss": 1.0792, + "step": 10256 + }, + { + "epoch": 1.826210826210826, + "grad_norm": 0.7475154399871826, + "learning_rate": 0.00011378546191509303, + "loss": 1.1362, + "step": 10257 + }, + { + "epoch": 1.8263888888888888, + "grad_norm": 0.6814939379692078, + "learning_rate": 0.00011377159794914498, + "loss": 0.9131, + "step": 10258 + }, + { + "epoch": 1.8265669515669516, + "grad_norm": 0.6726876497268677, + 
"learning_rate": 0.00011375773371337111, + "loss": 0.9147, + "step": 10259 + }, + { + "epoch": 1.8267450142450143, + "grad_norm": 0.785943865776062, + "learning_rate": 0.00011374386920804298, + "loss": 1.0137, + "step": 10260 + }, + { + "epoch": 1.8269230769230769, + "grad_norm": 0.7614478468894958, + "learning_rate": 0.0001137300044334323, + "loss": 1.2118, + "step": 10261 + }, + { + "epoch": 1.8271011396011396, + "grad_norm": 0.7317564487457275, + "learning_rate": 0.00011371613938981072, + "loss": 1.0602, + "step": 10262 + }, + { + "epoch": 1.8272792022792022, + "grad_norm": 0.6716432571411133, + "learning_rate": 0.00011370227407744987, + "loss": 0.952, + "step": 10263 + }, + { + "epoch": 1.827457264957265, + "grad_norm": 0.6946425437927246, + "learning_rate": 0.00011368840849662139, + "loss": 1.0554, + "step": 10264 + }, + { + "epoch": 1.8276353276353277, + "grad_norm": 0.6692264080047607, + "learning_rate": 0.00011367454264759703, + "loss": 0.8944, + "step": 10265 + }, + { + "epoch": 1.8278133903133904, + "grad_norm": 0.6931505799293518, + "learning_rate": 0.00011366067653064838, + "loss": 0.9045, + "step": 10266 + }, + { + "epoch": 1.827991452991453, + "grad_norm": 0.7233194708824158, + "learning_rate": 0.00011364681014604716, + "loss": 0.9441, + "step": 10267 + }, + { + "epoch": 1.8281695156695157, + "grad_norm": 0.6451242566108704, + "learning_rate": 0.00011363294349406506, + "loss": 0.9948, + "step": 10268 + }, + { + "epoch": 1.8283475783475782, + "grad_norm": 0.6993351578712463, + "learning_rate": 0.00011361907657497375, + "loss": 1.1057, + "step": 10269 + }, + { + "epoch": 1.828525641025641, + "grad_norm": 0.7241137623786926, + "learning_rate": 0.00011360520938904493, + "loss": 0.974, + "step": 10270 + }, + { + "epoch": 1.8287037037037037, + "grad_norm": 0.6349480152130127, + "learning_rate": 0.00011359134193655027, + "loss": 0.9026, + "step": 10271 + }, + { + "epoch": 1.8288817663817665, + "grad_norm": 0.6916826963424683, + "learning_rate": 
0.00011357747421776151, + "loss": 0.9153, + "step": 10272 + }, + { + "epoch": 1.8290598290598292, + "grad_norm": 0.879770040512085, + "learning_rate": 0.00011356360623295037, + "loss": 1.0818, + "step": 10273 + }, + { + "epoch": 1.8292378917378918, + "grad_norm": 0.6293807029724121, + "learning_rate": 0.00011354973798238853, + "loss": 1.1164, + "step": 10274 + }, + { + "epoch": 1.8294159544159543, + "grad_norm": 0.7070622444152832, + "learning_rate": 0.0001135358694663477, + "loss": 0.8795, + "step": 10275 + }, + { + "epoch": 1.829594017094017, + "grad_norm": 0.6847673654556274, + "learning_rate": 0.00011352200068509962, + "loss": 0.9173, + "step": 10276 + }, + { + "epoch": 1.8297720797720798, + "grad_norm": 0.6552146077156067, + "learning_rate": 0.00011350813163891605, + "loss": 1.0425, + "step": 10277 + }, + { + "epoch": 1.8299501424501425, + "grad_norm": 0.6432808041572571, + "learning_rate": 0.0001134942623280687, + "loss": 0.9418, + "step": 10278 + }, + { + "epoch": 1.8301282051282053, + "grad_norm": 0.7412393093109131, + "learning_rate": 0.00011348039275282931, + "loss": 1.1212, + "step": 10279 + }, + { + "epoch": 1.8303062678062678, + "grad_norm": 0.6543423533439636, + "learning_rate": 0.00011346652291346965, + "loss": 1.0553, + "step": 10280 + }, + { + "epoch": 1.8304843304843303, + "grad_norm": 0.7159286141395569, + "learning_rate": 0.00011345265281026138, + "loss": 1.0582, + "step": 10281 + }, + { + "epoch": 1.830662393162393, + "grad_norm": 0.6443323493003845, + "learning_rate": 0.00011343878244347639, + "loss": 0.9462, + "step": 10282 + }, + { + "epoch": 1.8308404558404558, + "grad_norm": 0.7592014074325562, + "learning_rate": 0.00011342491181338634, + "loss": 1.2718, + "step": 10283 + }, + { + "epoch": 1.8310185185185186, + "grad_norm": 0.627109944820404, + "learning_rate": 0.00011341104092026302, + "loss": 1.0177, + "step": 10284 + }, + { + "epoch": 1.8311965811965814, + "grad_norm": 0.8061598539352417, + "learning_rate": 0.00011339716976437827, + 
"loss": 0.9416, + "step": 10285 + }, + { + "epoch": 1.8313746438746439, + "grad_norm": 0.6584261059761047, + "learning_rate": 0.00011338329834600377, + "loss": 0.8297, + "step": 10286 + }, + { + "epoch": 1.8315527065527064, + "grad_norm": 0.6329470276832581, + "learning_rate": 0.00011336942666541133, + "loss": 0.8386, + "step": 10287 + }, + { + "epoch": 1.8317307692307692, + "grad_norm": 0.6833979487419128, + "learning_rate": 0.00011335555472287275, + "loss": 0.9407, + "step": 10288 + }, + { + "epoch": 1.831908831908832, + "grad_norm": 0.7663840651512146, + "learning_rate": 0.00011334168251865985, + "loss": 1.0018, + "step": 10289 + }, + { + "epoch": 1.8320868945868947, + "grad_norm": 0.7751262784004211, + "learning_rate": 0.00011332781005304436, + "loss": 1.0576, + "step": 10290 + }, + { + "epoch": 1.8322649572649574, + "grad_norm": 0.6857370138168335, + "learning_rate": 0.00011331393732629814, + "loss": 0.9888, + "step": 10291 + }, + { + "epoch": 1.83244301994302, + "grad_norm": 0.7534535527229309, + "learning_rate": 0.00011330006433869296, + "loss": 1.0834, + "step": 10292 + }, + { + "epoch": 1.8326210826210825, + "grad_norm": 0.6785250306129456, + "learning_rate": 0.00011328619109050065, + "loss": 1.0471, + "step": 10293 + }, + { + "epoch": 1.8327991452991452, + "grad_norm": 0.7023689150810242, + "learning_rate": 0.00011327231758199303, + "loss": 1.0652, + "step": 10294 + }, + { + "epoch": 1.832977207977208, + "grad_norm": 0.6776610612869263, + "learning_rate": 0.00011325844381344192, + "loss": 0.9504, + "step": 10295 + }, + { + "epoch": 1.8331552706552707, + "grad_norm": 0.7704112529754639, + "learning_rate": 0.00011324456978511917, + "loss": 0.9712, + "step": 10296 + }, + { + "epoch": 1.8333333333333335, + "grad_norm": 0.601502537727356, + "learning_rate": 0.00011323069549729654, + "loss": 1.075, + "step": 10297 + }, + { + "epoch": 1.833511396011396, + "grad_norm": 0.6282439231872559, + "learning_rate": 0.00011321682095024596, + "loss": 0.9238, + "step": 
10298 + }, + { + "epoch": 1.8336894586894585, + "grad_norm": 0.6873499155044556, + "learning_rate": 0.00011320294614423921, + "loss": 1.0464, + "step": 10299 + }, + { + "epoch": 1.8338675213675213, + "grad_norm": 0.6063792705535889, + "learning_rate": 0.00011318907107954815, + "loss": 0.9732, + "step": 10300 + }, + { + "epoch": 1.834045584045584, + "grad_norm": 0.5830921530723572, + "learning_rate": 0.00011317519575644464, + "loss": 0.7568, + "step": 10301 + }, + { + "epoch": 1.8342236467236468, + "grad_norm": 0.6394222378730774, + "learning_rate": 0.00011316132017520053, + "loss": 0.9958, + "step": 10302 + }, + { + "epoch": 1.8344017094017095, + "grad_norm": 0.7052412033081055, + "learning_rate": 0.00011314744433608773, + "loss": 0.9129, + "step": 10303 + }, + { + "epoch": 1.834579772079772, + "grad_norm": 0.7287624478340149, + "learning_rate": 0.00011313356823937801, + "loss": 0.8608, + "step": 10304 + }, + { + "epoch": 1.8347578347578346, + "grad_norm": 0.702937662601471, + "learning_rate": 0.00011311969188534334, + "loss": 1.3074, + "step": 10305 + }, + { + "epoch": 1.8349358974358974, + "grad_norm": 0.6693850159645081, + "learning_rate": 0.00011310581527425557, + "loss": 0.928, + "step": 10306 + }, + { + "epoch": 1.83511396011396, + "grad_norm": 0.8153932094573975, + "learning_rate": 0.00011309193840638654, + "loss": 1.1771, + "step": 10307 + }, + { + "epoch": 1.8352920227920229, + "grad_norm": 0.6517418622970581, + "learning_rate": 0.00011307806128200821, + "loss": 0.9634, + "step": 10308 + }, + { + "epoch": 1.8354700854700856, + "grad_norm": 0.6626226305961609, + "learning_rate": 0.00011306418390139245, + "loss": 0.9371, + "step": 10309 + }, + { + "epoch": 1.8356481481481481, + "grad_norm": 0.7397477030754089, + "learning_rate": 0.0001130503062648111, + "loss": 0.9398, + "step": 10310 + }, + { + "epoch": 1.8358262108262107, + "grad_norm": 0.6790265440940857, + "learning_rate": 0.00011303642837253614, + "loss": 0.9728, + "step": 10311 + }, + { + "epoch": 
1.8360042735042734, + "grad_norm": 0.6266449093818665, + "learning_rate": 0.00011302255022483941, + "loss": 0.847, + "step": 10312 + }, + { + "epoch": 1.8361823361823362, + "grad_norm": 0.791657030582428, + "learning_rate": 0.00011300867182199288, + "loss": 0.8342, + "step": 10313 + }, + { + "epoch": 1.836360398860399, + "grad_norm": 0.7128583788871765, + "learning_rate": 0.00011299479316426846, + "loss": 0.9591, + "step": 10314 + }, + { + "epoch": 1.8365384615384617, + "grad_norm": 0.659928023815155, + "learning_rate": 0.00011298091425193806, + "loss": 1.0282, + "step": 10315 + }, + { + "epoch": 1.8367165242165242, + "grad_norm": 0.6641396284103394, + "learning_rate": 0.00011296703508527363, + "loss": 1.0161, + "step": 10316 + }, + { + "epoch": 1.8368945868945867, + "grad_norm": 0.7921316027641296, + "learning_rate": 0.00011295315566454702, + "loss": 0.8897, + "step": 10317 + }, + { + "epoch": 1.8370726495726495, + "grad_norm": 0.6900694966316223, + "learning_rate": 0.00011293927599003029, + "loss": 1.0094, + "step": 10318 + }, + { + "epoch": 1.8372507122507122, + "grad_norm": 0.8054366707801819, + "learning_rate": 0.0001129253960619953, + "loss": 0.9489, + "step": 10319 + }, + { + "epoch": 1.837428774928775, + "grad_norm": 0.6623767018318176, + "learning_rate": 0.00011291151588071405, + "loss": 0.92, + "step": 10320 + }, + { + "epoch": 1.8376068376068377, + "grad_norm": 0.6143901348114014, + "learning_rate": 0.00011289763544645846, + "loss": 0.8093, + "step": 10321 + }, + { + "epoch": 1.8377849002849003, + "grad_norm": 0.8207027316093445, + "learning_rate": 0.00011288375475950046, + "loss": 1.2402, + "step": 10322 + }, + { + "epoch": 1.8379629629629628, + "grad_norm": 0.6759985685348511, + "learning_rate": 0.00011286987382011209, + "loss": 0.9179, + "step": 10323 + }, + { + "epoch": 1.8381410256410255, + "grad_norm": 0.745439887046814, + "learning_rate": 0.00011285599262856523, + "loss": 0.8157, + "step": 10324 + }, + { + "epoch": 1.8383190883190883, + 
"grad_norm": 0.6873317360877991, + "learning_rate": 0.00011284211118513194, + "loss": 0.8681, + "step": 10325 + }, + { + "epoch": 1.838497150997151, + "grad_norm": 0.7060160040855408, + "learning_rate": 0.00011282822949008416, + "loss": 1.0833, + "step": 10326 + }, + { + "epoch": 1.8386752136752138, + "grad_norm": 0.8079642653465271, + "learning_rate": 0.00011281434754369389, + "loss": 0.8639, + "step": 10327 + }, + { + "epoch": 1.8388532763532763, + "grad_norm": 0.6434001922607422, + "learning_rate": 0.00011280046534623303, + "loss": 0.9269, + "step": 10328 + }, + { + "epoch": 1.839031339031339, + "grad_norm": 0.7005292773246765, + "learning_rate": 0.0001127865828979737, + "loss": 1.1475, + "step": 10329 + }, + { + "epoch": 1.8392094017094016, + "grad_norm": 0.7004852890968323, + "learning_rate": 0.00011277270019918784, + "loss": 0.9467, + "step": 10330 + }, + { + "epoch": 1.8393874643874644, + "grad_norm": 0.7542549967765808, + "learning_rate": 0.00011275881725014743, + "loss": 1.0371, + "step": 10331 + }, + { + "epoch": 1.8395655270655271, + "grad_norm": 0.674051821231842, + "learning_rate": 0.00011274493405112452, + "loss": 1.1097, + "step": 10332 + }, + { + "epoch": 1.8397435897435899, + "grad_norm": 0.8136405348777771, + "learning_rate": 0.00011273105060239107, + "loss": 0.9718, + "step": 10333 + }, + { + "epoch": 1.8399216524216524, + "grad_norm": 0.6524073481559753, + "learning_rate": 0.00011271716690421916, + "loss": 0.9953, + "step": 10334 + }, + { + "epoch": 1.8400997150997151, + "grad_norm": 0.7436625957489014, + "learning_rate": 0.00011270328295688077, + "loss": 1.0722, + "step": 10335 + }, + { + "epoch": 1.8402777777777777, + "grad_norm": 0.6815723180770874, + "learning_rate": 0.00011268939876064795, + "loss": 1.0924, + "step": 10336 + }, + { + "epoch": 1.8404558404558404, + "grad_norm": 0.6923388242721558, + "learning_rate": 0.0001126755143157927, + "loss": 0.921, + "step": 10337 + }, + { + "epoch": 1.8406339031339032, + "grad_norm": 
0.7464849948883057, + "learning_rate": 0.00011266162962258708, + "loss": 1.0549, + "step": 10338 + }, + { + "epoch": 1.840811965811966, + "grad_norm": 0.6621805429458618, + "learning_rate": 0.00011264774468130315, + "loss": 1.0764, + "step": 10339 + }, + { + "epoch": 1.8409900284900285, + "grad_norm": 0.7370132803916931, + "learning_rate": 0.00011263385949221295, + "loss": 0.7818, + "step": 10340 + }, + { + "epoch": 1.8411680911680912, + "grad_norm": 0.673100471496582, + "learning_rate": 0.00011261997405558848, + "loss": 1.04, + "step": 10341 + }, + { + "epoch": 1.8413461538461537, + "grad_norm": 0.5978201031684875, + "learning_rate": 0.00011260608837170183, + "loss": 0.9644, + "step": 10342 + }, + { + "epoch": 1.8415242165242165, + "grad_norm": 0.6868628263473511, + "learning_rate": 0.00011259220244082507, + "loss": 0.9533, + "step": 10343 + }, + { + "epoch": 1.8417022792022792, + "grad_norm": 0.6580314636230469, + "learning_rate": 0.0001125783162632303, + "loss": 0.9506, + "step": 10344 + }, + { + "epoch": 1.841880341880342, + "grad_norm": 0.7238291501998901, + "learning_rate": 0.00011256442983918951, + "loss": 0.8663, + "step": 10345 + }, + { + "epoch": 1.8420584045584045, + "grad_norm": 0.5838520526885986, + "learning_rate": 0.00011255054316897484, + "loss": 0.9606, + "step": 10346 + }, + { + "epoch": 1.8422364672364673, + "grad_norm": 0.7102842926979065, + "learning_rate": 0.00011253665625285836, + "loss": 0.801, + "step": 10347 + }, + { + "epoch": 1.8424145299145298, + "grad_norm": 0.6449147462844849, + "learning_rate": 0.0001125227690911121, + "loss": 1.0827, + "step": 10348 + }, + { + "epoch": 1.8425925925925926, + "grad_norm": 0.6355304718017578, + "learning_rate": 0.00011250888168400823, + "loss": 1.0369, + "step": 10349 + }, + { + "epoch": 1.8427706552706553, + "grad_norm": 0.678977906703949, + "learning_rate": 0.0001124949940318188, + "loss": 0.9491, + "step": 10350 + }, + { + "epoch": 1.842948717948718, + "grad_norm": 0.6366633772850037, + 
"learning_rate": 0.00011248110613481592, + "loss": 0.7272, + "step": 10351 + }, + { + "epoch": 1.8431267806267806, + "grad_norm": 0.6639098525047302, + "learning_rate": 0.00011246721799327171, + "loss": 1.0313, + "step": 10352 + }, + { + "epoch": 1.8433048433048433, + "grad_norm": 0.6034720540046692, + "learning_rate": 0.00011245332960745822, + "loss": 0.7141, + "step": 10353 + }, + { + "epoch": 1.8434829059829059, + "grad_norm": 0.8118346333503723, + "learning_rate": 0.00011243944097764763, + "loss": 1.171, + "step": 10354 + }, + { + "epoch": 1.8436609686609686, + "grad_norm": 0.6706618070602417, + "learning_rate": 0.00011242555210411203, + "loss": 0.9578, + "step": 10355 + }, + { + "epoch": 1.8438390313390314, + "grad_norm": 0.619562029838562, + "learning_rate": 0.00011241166298712355, + "loss": 0.9883, + "step": 10356 + }, + { + "epoch": 1.8440170940170941, + "grad_norm": 0.6471936106681824, + "learning_rate": 0.00011239777362695434, + "loss": 0.8897, + "step": 10357 + }, + { + "epoch": 1.8441951566951567, + "grad_norm": 0.7179005742073059, + "learning_rate": 0.00011238388402387645, + "loss": 0.9646, + "step": 10358 + }, + { + "epoch": 1.8443732193732194, + "grad_norm": 0.7726966738700867, + "learning_rate": 0.00011236999417816214, + "loss": 0.8855, + "step": 10359 + }, + { + "epoch": 1.844551282051282, + "grad_norm": 0.6733565330505371, + "learning_rate": 0.00011235610409008346, + "loss": 1.0379, + "step": 10360 + }, + { + "epoch": 1.8447293447293447, + "grad_norm": 0.7317814826965332, + "learning_rate": 0.0001123422137599126, + "loss": 0.8528, + "step": 10361 + }, + { + "epoch": 1.8449074074074074, + "grad_norm": 0.6727005839347839, + "learning_rate": 0.0001123283231879217, + "loss": 0.9612, + "step": 10362 + }, + { + "epoch": 1.8450854700854702, + "grad_norm": 0.6350542306900024, + "learning_rate": 0.00011231443237438289, + "loss": 0.9939, + "step": 10363 + }, + { + "epoch": 1.8452635327635327, + "grad_norm": 0.693148672580719, + "learning_rate": 
0.00011230054131956836, + "loss": 1.0149, + "step": 10364 + }, + { + "epoch": 1.8454415954415955, + "grad_norm": 0.7263579368591309, + "learning_rate": 0.0001122866500237503, + "loss": 1.1044, + "step": 10365 + }, + { + "epoch": 1.845619658119658, + "grad_norm": 0.7044230699539185, + "learning_rate": 0.00011227275848720085, + "loss": 1.0677, + "step": 10366 + }, + { + "epoch": 1.8457977207977208, + "grad_norm": 0.6895326972007751, + "learning_rate": 0.00011225886671019219, + "loss": 1.1025, + "step": 10367 + }, + { + "epoch": 1.8459757834757835, + "grad_norm": 0.6045145988464355, + "learning_rate": 0.00011224497469299651, + "loss": 0.8079, + "step": 10368 + }, + { + "epoch": 1.8461538461538463, + "grad_norm": 0.6613210439682007, + "learning_rate": 0.00011223108243588599, + "loss": 1.0345, + "step": 10369 + }, + { + "epoch": 1.8463319088319088, + "grad_norm": 0.6288960576057434, + "learning_rate": 0.0001122171899391328, + "loss": 1.0166, + "step": 10370 + }, + { + "epoch": 1.8465099715099715, + "grad_norm": 0.6158748865127563, + "learning_rate": 0.00011220329720300917, + "loss": 0.895, + "step": 10371 + }, + { + "epoch": 1.846688034188034, + "grad_norm": 0.6583057641983032, + "learning_rate": 0.00011218940422778728, + "loss": 0.8059, + "step": 10372 + }, + { + "epoch": 1.8468660968660968, + "grad_norm": 0.6761550903320312, + "learning_rate": 0.00011217551101373932, + "loss": 0.9253, + "step": 10373 + }, + { + "epoch": 1.8470441595441596, + "grad_norm": 0.5969263315200806, + "learning_rate": 0.0001121616175611375, + "loss": 0.8549, + "step": 10374 + }, + { + "epoch": 1.8472222222222223, + "grad_norm": 0.7994722723960876, + "learning_rate": 0.00011214772387025407, + "loss": 0.9918, + "step": 10375 + }, + { + "epoch": 1.8474002849002849, + "grad_norm": 0.6949167847633362, + "learning_rate": 0.00011213382994136123, + "loss": 1.1853, + "step": 10376 + }, + { + "epoch": 1.8475783475783476, + "grad_norm": 0.7356176376342773, + "learning_rate": 0.00011211993577473121, + 
"loss": 0.8809, + "step": 10377 + }, + { + "epoch": 1.8477564102564101, + "grad_norm": 0.7110268473625183, + "learning_rate": 0.0001121060413706362, + "loss": 0.9805, + "step": 10378 + }, + { + "epoch": 1.8479344729344729, + "grad_norm": 0.6509962677955627, + "learning_rate": 0.00011209214672934846, + "loss": 0.8899, + "step": 10379 + }, + { + "epoch": 1.8481125356125356, + "grad_norm": 0.6103082299232483, + "learning_rate": 0.00011207825185114025, + "loss": 0.8576, + "step": 10380 + }, + { + "epoch": 1.8482905982905984, + "grad_norm": 0.6261070966720581, + "learning_rate": 0.00011206435673628377, + "loss": 0.8884, + "step": 10381 + }, + { + "epoch": 1.848468660968661, + "grad_norm": 0.7629222273826599, + "learning_rate": 0.00011205046138505126, + "loss": 1.1714, + "step": 10382 + }, + { + "epoch": 1.8486467236467237, + "grad_norm": 0.617957353591919, + "learning_rate": 0.000112036565797715, + "loss": 0.9546, + "step": 10383 + }, + { + "epoch": 1.8488247863247862, + "grad_norm": 0.6926987171173096, + "learning_rate": 0.00011202266997454724, + "loss": 0.8842, + "step": 10384 + }, + { + "epoch": 1.849002849002849, + "grad_norm": 0.602758526802063, + "learning_rate": 0.00011200877391582025, + "loss": 0.9782, + "step": 10385 + }, + { + "epoch": 1.8491809116809117, + "grad_norm": 0.706731915473938, + "learning_rate": 0.00011199487762180627, + "loss": 0.8176, + "step": 10386 + }, + { + "epoch": 1.8493589743589745, + "grad_norm": 0.7135118842124939, + "learning_rate": 0.0001119809810927776, + "loss": 0.9277, + "step": 10387 + }, + { + "epoch": 1.8495370370370372, + "grad_norm": 0.7484592199325562, + "learning_rate": 0.00011196708432900647, + "loss": 1.0733, + "step": 10388 + }, + { + "epoch": 1.8497150997150997, + "grad_norm": 0.7087157964706421, + "learning_rate": 0.00011195318733076519, + "loss": 0.9443, + "step": 10389 + }, + { + "epoch": 1.8498931623931623, + "grad_norm": 0.6511468291282654, + "learning_rate": 0.00011193929009832602, + "loss": 0.955, + "step": 10390 + 
}, + { + "epoch": 1.850071225071225, + "grad_norm": 0.6386628746986389, + "learning_rate": 0.0001119253926319613, + "loss": 1.0357, + "step": 10391 + }, + { + "epoch": 1.8502492877492878, + "grad_norm": 0.6400021314620972, + "learning_rate": 0.00011191149493194327, + "loss": 0.8094, + "step": 10392 + }, + { + "epoch": 1.8504273504273505, + "grad_norm": 0.7942537069320679, + "learning_rate": 0.00011189759699854423, + "loss": 0.9717, + "step": 10393 + }, + { + "epoch": 1.8506054131054133, + "grad_norm": 0.7230474948883057, + "learning_rate": 0.00011188369883203647, + "loss": 0.9043, + "step": 10394 + }, + { + "epoch": 1.8507834757834758, + "grad_norm": 0.8837162852287292, + "learning_rate": 0.00011186980043269235, + "loss": 1.2821, + "step": 10395 + }, + { + "epoch": 1.8509615384615383, + "grad_norm": 0.7260291576385498, + "learning_rate": 0.00011185590180078413, + "loss": 1.1672, + "step": 10396 + }, + { + "epoch": 1.851139601139601, + "grad_norm": 0.6290066242218018, + "learning_rate": 0.00011184200293658415, + "loss": 0.8942, + "step": 10397 + }, + { + "epoch": 1.8513176638176638, + "grad_norm": 0.6571013331413269, + "learning_rate": 0.00011182810384036475, + "loss": 1.0753, + "step": 10398 + }, + { + "epoch": 1.8514957264957266, + "grad_norm": 0.6494737267494202, + "learning_rate": 0.00011181420451239817, + "loss": 0.8833, + "step": 10399 + }, + { + "epoch": 1.8516737891737893, + "grad_norm": 0.7383694648742676, + "learning_rate": 0.00011180030495295684, + "loss": 1.0094, + "step": 10400 + }, + { + "epoch": 1.8518518518518519, + "grad_norm": 0.6713876724243164, + "learning_rate": 0.00011178640516231302, + "loss": 0.975, + "step": 10401 + }, + { + "epoch": 1.8520299145299144, + "grad_norm": 0.8041042685508728, + "learning_rate": 0.00011177250514073912, + "loss": 1.1419, + "step": 10402 + }, + { + "epoch": 1.8522079772079771, + "grad_norm": 0.7035061120986938, + "learning_rate": 0.00011175860488850738, + "loss": 1.0921, + "step": 10403 + }, + { + "epoch": 
1.85238603988604, + "grad_norm": 0.6135673522949219, + "learning_rate": 0.00011174470440589022, + "loss": 0.9611, + "step": 10404 + }, + { + "epoch": 1.8525641025641026, + "grad_norm": 0.7868386507034302, + "learning_rate": 0.00011173080369315999, + "loss": 0.8561, + "step": 10405 + }, + { + "epoch": 1.8527421652421654, + "grad_norm": 0.6575735211372375, + "learning_rate": 0.00011171690275058902, + "loss": 1.0256, + "step": 10406 + }, + { + "epoch": 1.852920227920228, + "grad_norm": 0.7514392137527466, + "learning_rate": 0.00011170300157844969, + "loss": 1.0868, + "step": 10407 + }, + { + "epoch": 1.8530982905982905, + "grad_norm": 0.6915257573127747, + "learning_rate": 0.00011168910017701436, + "loss": 1.1223, + "step": 10408 + }, + { + "epoch": 1.8532763532763532, + "grad_norm": 0.7406772971153259, + "learning_rate": 0.00011167519854655535, + "loss": 1.0922, + "step": 10409 + }, + { + "epoch": 1.853454415954416, + "grad_norm": 0.6632742881774902, + "learning_rate": 0.0001116612966873451, + "loss": 0.9082, + "step": 10410 + }, + { + "epoch": 1.8536324786324787, + "grad_norm": 0.8154461979866028, + "learning_rate": 0.00011164739459965598, + "loss": 1.1126, + "step": 10411 + }, + { + "epoch": 1.8538105413105415, + "grad_norm": 0.895764172077179, + "learning_rate": 0.00011163349228376037, + "loss": 1.0589, + "step": 10412 + }, + { + "epoch": 1.853988603988604, + "grad_norm": 0.6746504902839661, + "learning_rate": 0.00011161958973993063, + "loss": 1.0184, + "step": 10413 + }, + { + "epoch": 1.8541666666666665, + "grad_norm": 0.7271263003349304, + "learning_rate": 0.00011160568696843916, + "loss": 0.9989, + "step": 10414 + }, + { + "epoch": 1.8543447293447293, + "grad_norm": 0.7503132820129395, + "learning_rate": 0.00011159178396955836, + "loss": 1.0783, + "step": 10415 + }, + { + "epoch": 1.854522792022792, + "grad_norm": 0.6768177151679993, + "learning_rate": 0.00011157788074356066, + "loss": 0.9916, + "step": 10416 + }, + { + "epoch": 1.8547008547008548, + 
"grad_norm": 0.6804978251457214, + "learning_rate": 0.00011156397729071842, + "loss": 0.9534, + "step": 10417 + }, + { + "epoch": 1.8548789173789175, + "grad_norm": 0.7144617438316345, + "learning_rate": 0.00011155007361130408, + "loss": 0.991, + "step": 10418 + }, + { + "epoch": 1.85505698005698, + "grad_norm": 0.6816750168800354, + "learning_rate": 0.00011153616970559, + "loss": 0.9551, + "step": 10419 + }, + { + "epoch": 1.8552350427350426, + "grad_norm": 0.6620030999183655, + "learning_rate": 0.00011152226557384866, + "loss": 0.8854, + "step": 10420 + }, + { + "epoch": 1.8554131054131053, + "grad_norm": 0.8400058746337891, + "learning_rate": 0.00011150836121635249, + "loss": 1.1593, + "step": 10421 + }, + { + "epoch": 1.855591168091168, + "grad_norm": 0.6666815280914307, + "learning_rate": 0.00011149445663337385, + "loss": 1.2112, + "step": 10422 + }, + { + "epoch": 1.8557692307692308, + "grad_norm": 0.7298431396484375, + "learning_rate": 0.00011148055182518522, + "loss": 0.9721, + "step": 10423 + }, + { + "epoch": 1.8559472934472936, + "grad_norm": 0.66816645860672, + "learning_rate": 0.00011146664679205903, + "loss": 1.0945, + "step": 10424 + }, + { + "epoch": 1.8561253561253561, + "grad_norm": 0.5979483127593994, + "learning_rate": 0.00011145274153426771, + "loss": 1.0176, + "step": 10425 + }, + { + "epoch": 1.8563034188034186, + "grad_norm": 0.6579445600509644, + "learning_rate": 0.00011143883605208372, + "loss": 0.9143, + "step": 10426 + }, + { + "epoch": 1.8564814814814814, + "grad_norm": 0.6871697902679443, + "learning_rate": 0.0001114249303457795, + "loss": 1.071, + "step": 10427 + }, + { + "epoch": 1.8566595441595442, + "grad_norm": 0.6683333516120911, + "learning_rate": 0.0001114110244156275, + "loss": 0.7809, + "step": 10428 + }, + { + "epoch": 1.856837606837607, + "grad_norm": 0.6122907996177673, + "learning_rate": 0.0001113971182619002, + "loss": 0.8329, + "step": 10429 + }, + { + "epoch": 1.8570156695156697, + "grad_norm": 0.6510575413703918, + 
"learning_rate": 0.00011138321188487, + "loss": 1.0068, + "step": 10430 + }, + { + "epoch": 1.8571937321937322, + "grad_norm": 0.6417793035507202, + "learning_rate": 0.00011136930528480945, + "loss": 1.0093, + "step": 10431 + }, + { + "epoch": 1.8573717948717947, + "grad_norm": 0.595824658870697, + "learning_rate": 0.00011135539846199096, + "loss": 0.9856, + "step": 10432 + }, + { + "epoch": 1.8575498575498575, + "grad_norm": 0.7594470381736755, + "learning_rate": 0.00011134149141668704, + "loss": 0.8173, + "step": 10433 + }, + { + "epoch": 1.8577279202279202, + "grad_norm": 0.7078324556350708, + "learning_rate": 0.00011132758414917016, + "loss": 1.0236, + "step": 10434 + }, + { + "epoch": 1.857905982905983, + "grad_norm": 0.6830437779426575, + "learning_rate": 0.00011131367665971275, + "loss": 0.8483, + "step": 10435 + }, + { + "epoch": 1.8580840455840457, + "grad_norm": 0.6856399774551392, + "learning_rate": 0.0001112997689485874, + "loss": 0.8729, + "step": 10436 + }, + { + "epoch": 1.8582621082621082, + "grad_norm": 0.6530426144599915, + "learning_rate": 0.00011128586101606653, + "loss": 0.8616, + "step": 10437 + }, + { + "epoch": 1.8584401709401708, + "grad_norm": 0.6341808438301086, + "learning_rate": 0.00011127195286242267, + "loss": 0.896, + "step": 10438 + }, + { + "epoch": 1.8586182336182335, + "grad_norm": 0.6278257966041565, + "learning_rate": 0.00011125804448792831, + "loss": 0.8309, + "step": 10439 + }, + { + "epoch": 1.8587962962962963, + "grad_norm": 0.708705723285675, + "learning_rate": 0.00011124413589285594, + "loss": 1.1065, + "step": 10440 + }, + { + "epoch": 1.858974358974359, + "grad_norm": 0.6845232248306274, + "learning_rate": 0.00011123022707747808, + "loss": 0.9292, + "step": 10441 + }, + { + "epoch": 1.8591524216524218, + "grad_norm": 0.749204695224762, + "learning_rate": 0.00011121631804206726, + "loss": 1.0487, + "step": 10442 + }, + { + "epoch": 1.8593304843304843, + "grad_norm": 0.7123128771781921, + "learning_rate": 
0.00011120240878689599, + "loss": 0.9138, + "step": 10443 + }, + { + "epoch": 1.859508547008547, + "grad_norm": 0.6862115263938904, + "learning_rate": 0.00011118849931223679, + "loss": 1.0675, + "step": 10444 + }, + { + "epoch": 1.8596866096866096, + "grad_norm": 0.7245760560035706, + "learning_rate": 0.00011117458961836215, + "loss": 0.9643, + "step": 10445 + }, + { + "epoch": 1.8598646723646723, + "grad_norm": 0.701574444770813, + "learning_rate": 0.0001111606797055447, + "loss": 1.0022, + "step": 10446 + }, + { + "epoch": 1.860042735042735, + "grad_norm": 0.7292088270187378, + "learning_rate": 0.0001111467695740569, + "loss": 0.9465, + "step": 10447 + }, + { + "epoch": 1.8602207977207978, + "grad_norm": 0.7045044302940369, + "learning_rate": 0.0001111328592241713, + "loss": 1.0942, + "step": 10448 + }, + { + "epoch": 1.8603988603988604, + "grad_norm": 0.7181426286697388, + "learning_rate": 0.00011111894865616046, + "loss": 1.2108, + "step": 10449 + }, + { + "epoch": 1.8605769230769231, + "grad_norm": 0.6083306074142456, + "learning_rate": 0.00011110503787029689, + "loss": 0.929, + "step": 10450 + }, + { + "epoch": 1.8607549857549857, + "grad_norm": 0.6847347617149353, + "learning_rate": 0.00011109112686685319, + "loss": 1.0911, + "step": 10451 + }, + { + "epoch": 1.8609330484330484, + "grad_norm": 0.7131744027137756, + "learning_rate": 0.0001110772156461019, + "loss": 0.9649, + "step": 10452 + }, + { + "epoch": 1.8611111111111112, + "grad_norm": 0.7920312881469727, + "learning_rate": 0.00011106330420831559, + "loss": 0.9965, + "step": 10453 + }, + { + "epoch": 1.861289173789174, + "grad_norm": 0.6640987992286682, + "learning_rate": 0.00011104939255376681, + "loss": 1.2346, + "step": 10454 + }, + { + "epoch": 1.8614672364672364, + "grad_norm": 0.5878208875656128, + "learning_rate": 0.00011103548068272811, + "loss": 0.8565, + "step": 10455 + }, + { + "epoch": 1.8616452991452992, + "grad_norm": 0.6636882424354553, + "learning_rate": 0.0001110215685954721, + "loss": 
0.8556, + "step": 10456 + }, + { + "epoch": 1.8618233618233617, + "grad_norm": 0.5985570549964905, + "learning_rate": 0.00011100765629227137, + "loss": 1.0291, + "step": 10457 + }, + { + "epoch": 1.8620014245014245, + "grad_norm": 0.7546643614768982, + "learning_rate": 0.00011099374377339846, + "loss": 1.0199, + "step": 10458 + }, + { + "epoch": 1.8621794871794872, + "grad_norm": 0.6529727578163147, + "learning_rate": 0.00011097983103912602, + "loss": 1.0826, + "step": 10459 + }, + { + "epoch": 1.86235754985755, + "grad_norm": 0.6394338607788086, + "learning_rate": 0.00011096591808972654, + "loss": 0.9896, + "step": 10460 + }, + { + "epoch": 1.8625356125356125, + "grad_norm": 0.6508805751800537, + "learning_rate": 0.00011095200492547271, + "loss": 0.9659, + "step": 10461 + }, + { + "epoch": 1.8627136752136753, + "grad_norm": 0.7085812091827393, + "learning_rate": 0.00011093809154663705, + "loss": 0.9998, + "step": 10462 + }, + { + "epoch": 1.8628917378917378, + "grad_norm": 0.6488457322120667, + "learning_rate": 0.00011092417795349226, + "loss": 0.9757, + "step": 10463 + }, + { + "epoch": 1.8630698005698005, + "grad_norm": 0.6405763626098633, + "learning_rate": 0.0001109102641463109, + "loss": 0.8188, + "step": 10464 + }, + { + "epoch": 1.8632478632478633, + "grad_norm": 0.713361918926239, + "learning_rate": 0.00011089635012536554, + "loss": 0.886, + "step": 10465 + }, + { + "epoch": 1.863425925925926, + "grad_norm": 0.5752255916595459, + "learning_rate": 0.00011088243589092886, + "loss": 1.0223, + "step": 10466 + }, + { + "epoch": 1.8636039886039886, + "grad_norm": 0.6722734570503235, + "learning_rate": 0.00011086852144327344, + "loss": 0.9499, + "step": 10467 + }, + { + "epoch": 1.8637820512820513, + "grad_norm": 0.5516420006752014, + "learning_rate": 0.00011085460678267194, + "loss": 0.7767, + "step": 10468 + }, + { + "epoch": 1.8639601139601139, + "grad_norm": 0.731257438659668, + "learning_rate": 0.00011084069190939697, + "loss": 1.2299, + "step": 10469 + }, + 
{ + "epoch": 1.8641381766381766, + "grad_norm": 0.7977055907249451, + "learning_rate": 0.00011082677682372114, + "loss": 0.9109, + "step": 10470 + }, + { + "epoch": 1.8643162393162394, + "grad_norm": 0.679900586605072, + "learning_rate": 0.0001108128615259171, + "loss": 0.9319, + "step": 10471 + }, + { + "epoch": 1.864494301994302, + "grad_norm": 0.7428545951843262, + "learning_rate": 0.00011079894601625754, + "loss": 0.8585, + "step": 10472 + }, + { + "epoch": 1.8646723646723646, + "grad_norm": 0.6560967564582825, + "learning_rate": 0.00011078503029501504, + "loss": 1.0069, + "step": 10473 + }, + { + "epoch": 1.8648504273504274, + "grad_norm": 0.636202871799469, + "learning_rate": 0.00011077111436246228, + "loss": 1.0329, + "step": 10474 + }, + { + "epoch": 1.86502849002849, + "grad_norm": 0.6666205525398254, + "learning_rate": 0.00011075719821887191, + "loss": 1.0123, + "step": 10475 + }, + { + "epoch": 1.8652065527065527, + "grad_norm": 0.7089471220970154, + "learning_rate": 0.00011074328186451657, + "loss": 0.7851, + "step": 10476 + }, + { + "epoch": 1.8653846153846154, + "grad_norm": 0.6054788827896118, + "learning_rate": 0.00011072936529966895, + "loss": 0.8224, + "step": 10477 + }, + { + "epoch": 1.8655626780626782, + "grad_norm": 0.6009029150009155, + "learning_rate": 0.00011071544852460172, + "loss": 0.865, + "step": 10478 + }, + { + "epoch": 1.8657407407407407, + "grad_norm": 0.6238716244697571, + "learning_rate": 0.00011070153153958753, + "loss": 0.8685, + "step": 10479 + }, + { + "epoch": 1.8659188034188035, + "grad_norm": 0.719985842704773, + "learning_rate": 0.00011068761434489903, + "loss": 1.2204, + "step": 10480 + }, + { + "epoch": 1.866096866096866, + "grad_norm": 0.72972172498703, + "learning_rate": 0.00011067369694080895, + "loss": 1.0454, + "step": 10481 + }, + { + "epoch": 1.8662749287749287, + "grad_norm": 0.6741998791694641, + "learning_rate": 0.00011065977932758995, + "loss": 0.9992, + "step": 10482 + }, + { + "epoch": 1.8664529914529915, + 
"grad_norm": 0.6150268912315369, + "learning_rate": 0.00011064586150551472, + "loss": 0.8866, + "step": 10483 + }, + { + "epoch": 1.8666310541310542, + "grad_norm": 0.8253782391548157, + "learning_rate": 0.00011063194347485597, + "loss": 1.1173, + "step": 10484 + }, + { + "epoch": 1.8668091168091168, + "grad_norm": 0.7176247835159302, + "learning_rate": 0.00011061802523588636, + "loss": 1.0414, + "step": 10485 + }, + { + "epoch": 1.8669871794871795, + "grad_norm": 0.6372736096382141, + "learning_rate": 0.00011060410678887858, + "loss": 1.0548, + "step": 10486 + }, + { + "epoch": 1.867165242165242, + "grad_norm": 0.7107454538345337, + "learning_rate": 0.00011059018813410538, + "loss": 1.2298, + "step": 10487 + }, + { + "epoch": 1.8673433048433048, + "grad_norm": 0.7113911509513855, + "learning_rate": 0.00011057626927183944, + "loss": 0.9598, + "step": 10488 + }, + { + "epoch": 1.8675213675213675, + "grad_norm": 0.6734410524368286, + "learning_rate": 0.00011056235020235346, + "loss": 0.9475, + "step": 10489 + }, + { + "epoch": 1.8676994301994303, + "grad_norm": 0.6875202655792236, + "learning_rate": 0.0001105484309259202, + "loss": 1.0735, + "step": 10490 + }, + { + "epoch": 1.8678774928774928, + "grad_norm": 0.6908353567123413, + "learning_rate": 0.0001105345114428123, + "loss": 1.0558, + "step": 10491 + }, + { + "epoch": 1.8680555555555556, + "grad_norm": 0.6283324360847473, + "learning_rate": 0.00011052059175330256, + "loss": 0.8872, + "step": 10492 + }, + { + "epoch": 1.868233618233618, + "grad_norm": 0.6422587633132935, + "learning_rate": 0.00011050667185766368, + "loss": 1.1022, + "step": 10493 + }, + { + "epoch": 1.8684116809116809, + "grad_norm": 0.7075859904289246, + "learning_rate": 0.0001104927517561684, + "loss": 1.1389, + "step": 10494 + }, + { + "epoch": 1.8685897435897436, + "grad_norm": 0.5896905064582825, + "learning_rate": 0.00011047883144908944, + "loss": 0.7732, + "step": 10495 + }, + { + "epoch": 1.8687678062678064, + "grad_norm": 
0.7647629976272583, + "learning_rate": 0.00011046491093669953, + "loss": 0.9983, + "step": 10496 + }, + { + "epoch": 1.868945868945869, + "grad_norm": 0.5864735841751099, + "learning_rate": 0.00011045099021927144, + "loss": 0.8427, + "step": 10497 + }, + { + "epoch": 1.8691239316239316, + "grad_norm": 0.6766837239265442, + "learning_rate": 0.00011043706929707791, + "loss": 0.9595, + "step": 10498 + }, + { + "epoch": 1.8693019943019942, + "grad_norm": 0.5480074286460876, + "learning_rate": 0.00011042314817039168, + "loss": 0.691, + "step": 10499 + }, + { + "epoch": 1.869480056980057, + "grad_norm": 0.6259615421295166, + "learning_rate": 0.00011040922683948553, + "loss": 0.9991, + "step": 10500 + }, + { + "epoch": 1.8696581196581197, + "grad_norm": 0.5950598120689392, + "learning_rate": 0.00011039530530463218, + "loss": 0.7413, + "step": 10501 + }, + { + "epoch": 1.8698361823361824, + "grad_norm": 0.8099377751350403, + "learning_rate": 0.00011038138356610441, + "loss": 1.1351, + "step": 10502 + }, + { + "epoch": 1.8700142450142452, + "grad_norm": 0.6716185212135315, + "learning_rate": 0.00011036746162417501, + "loss": 1.1057, + "step": 10503 + }, + { + "epoch": 1.8701923076923077, + "grad_norm": 0.7993219494819641, + "learning_rate": 0.00011035353947911675, + "loss": 1.2095, + "step": 10504 + }, + { + "epoch": 1.8703703703703702, + "grad_norm": 0.6381276249885559, + "learning_rate": 0.00011033961713120237, + "loss": 1.0261, + "step": 10505 + }, + { + "epoch": 1.870548433048433, + "grad_norm": 0.6326032280921936, + "learning_rate": 0.00011032569458070469, + "loss": 0.8664, + "step": 10506 + }, + { + "epoch": 1.8707264957264957, + "grad_norm": 0.6864820718765259, + "learning_rate": 0.00011031177182789644, + "loss": 0.9959, + "step": 10507 + }, + { + "epoch": 1.8709045584045585, + "grad_norm": 0.6341838240623474, + "learning_rate": 0.00011029784887305048, + "loss": 0.8029, + "step": 10508 + }, + { + "epoch": 1.8710826210826212, + "grad_norm": 0.6559172868728638, + 
"learning_rate": 0.00011028392571643957, + "loss": 0.9282, + "step": 10509 + }, + { + "epoch": 1.8712606837606838, + "grad_norm": 0.6976849436759949, + "learning_rate": 0.0001102700023583365, + "loss": 1.0198, + "step": 10510 + }, + { + "epoch": 1.8714387464387463, + "grad_norm": 0.7159395217895508, + "learning_rate": 0.00011025607879901402, + "loss": 1.1585, + "step": 10511 + }, + { + "epoch": 1.871616809116809, + "grad_norm": 0.7168624997138977, + "learning_rate": 0.000110242155038745, + "loss": 1.0558, + "step": 10512 + }, + { + "epoch": 1.8717948717948718, + "grad_norm": 0.5784319043159485, + "learning_rate": 0.00011022823107780224, + "loss": 0.9481, + "step": 10513 + }, + { + "epoch": 1.8719729344729346, + "grad_norm": 0.6602259874343872, + "learning_rate": 0.00011021430691645856, + "loss": 1.0538, + "step": 10514 + }, + { + "epoch": 1.8721509971509973, + "grad_norm": 0.6874588131904602, + "learning_rate": 0.00011020038255498672, + "loss": 1.1396, + "step": 10515 + }, + { + "epoch": 1.8723290598290598, + "grad_norm": 0.7311663031578064, + "learning_rate": 0.00011018645799365956, + "loss": 1.084, + "step": 10516 + }, + { + "epoch": 1.8725071225071224, + "grad_norm": 0.7097118496894836, + "learning_rate": 0.00011017253323274996, + "loss": 0.9872, + "step": 10517 + }, + { + "epoch": 1.8726851851851851, + "grad_norm": 0.6667875051498413, + "learning_rate": 0.00011015860827253068, + "loss": 1.105, + "step": 10518 + }, + { + "epoch": 1.8728632478632479, + "grad_norm": 0.6807677745819092, + "learning_rate": 0.0001101446831132746, + "loss": 0.9093, + "step": 10519 + }, + { + "epoch": 1.8730413105413106, + "grad_norm": 0.6885797381401062, + "learning_rate": 0.0001101307577552545, + "loss": 0.8479, + "step": 10520 + }, + { + "epoch": 1.8732193732193734, + "grad_norm": 0.6269213557243347, + "learning_rate": 0.00011011683219874323, + "loss": 0.9457, + "step": 10521 + }, + { + "epoch": 1.873397435897436, + "grad_norm": 0.7096766829490662, + "learning_rate": 
0.00011010290644401364, + "loss": 1.0971, + "step": 10522 + }, + { + "epoch": 1.8735754985754984, + "grad_norm": 0.6909209489822388, + "learning_rate": 0.00011008898049133863, + "loss": 0.9928, + "step": 10523 + }, + { + "epoch": 1.8737535612535612, + "grad_norm": 0.6586211323738098, + "learning_rate": 0.000110075054340991, + "loss": 0.818, + "step": 10524 + }, + { + "epoch": 1.873931623931624, + "grad_norm": 0.5934817790985107, + "learning_rate": 0.0001100611279932436, + "loss": 0.7698, + "step": 10525 + }, + { + "epoch": 1.8741096866096867, + "grad_norm": 0.6361709237098694, + "learning_rate": 0.00011004720144836931, + "loss": 0.9465, + "step": 10526 + }, + { + "epoch": 1.8742877492877494, + "grad_norm": 0.6742212176322937, + "learning_rate": 0.00011003327470664095, + "loss": 1.0998, + "step": 10527 + }, + { + "epoch": 1.874465811965812, + "grad_norm": 0.6634946465492249, + "learning_rate": 0.00011001934776833143, + "loss": 0.8328, + "step": 10528 + }, + { + "epoch": 1.8746438746438745, + "grad_norm": 0.6754063963890076, + "learning_rate": 0.0001100054206337136, + "loss": 1.147, + "step": 10529 + }, + { + "epoch": 1.8748219373219372, + "grad_norm": 0.5951135158538818, + "learning_rate": 0.00010999149330306036, + "loss": 0.8956, + "step": 10530 + }, + { + "epoch": 1.875, + "grad_norm": 0.6140317320823669, + "learning_rate": 0.00010997756577664455, + "loss": 0.9368, + "step": 10531 + }, + { + "epoch": 1.8751780626780628, + "grad_norm": 0.6419258713722229, + "learning_rate": 0.00010996363805473904, + "loss": 0.9817, + "step": 10532 + }, + { + "epoch": 1.8753561253561255, + "grad_norm": 0.7173396348953247, + "learning_rate": 0.00010994971013761677, + "loss": 0.9638, + "step": 10533 + }, + { + "epoch": 1.875534188034188, + "grad_norm": 0.8125925660133362, + "learning_rate": 0.0001099357820255506, + "loss": 1.0996, + "step": 10534 + }, + { + "epoch": 1.8757122507122506, + "grad_norm": 0.6191564798355103, + "learning_rate": 0.00010992185371881341, + "loss": 0.8266, + 
"step": 10535 + }, + { + "epoch": 1.8758903133903133, + "grad_norm": 0.6632885336875916, + "learning_rate": 0.0001099079252176781, + "loss": 1.1884, + "step": 10536 + }, + { + "epoch": 1.876068376068376, + "grad_norm": 0.7323372960090637, + "learning_rate": 0.00010989399652241759, + "loss": 1.0842, + "step": 10537 + }, + { + "epoch": 1.8762464387464388, + "grad_norm": 0.7553854584693909, + "learning_rate": 0.00010988006763330476, + "loss": 0.9948, + "step": 10538 + }, + { + "epoch": 1.8764245014245016, + "grad_norm": 0.5887658596038818, + "learning_rate": 0.00010986613855061255, + "loss": 0.7653, + "step": 10539 + }, + { + "epoch": 1.876602564102564, + "grad_norm": 0.6849574446678162, + "learning_rate": 0.00010985220927461384, + "loss": 1.152, + "step": 10540 + }, + { + "epoch": 1.8767806267806266, + "grad_norm": 0.6985000371932983, + "learning_rate": 0.00010983827980558155, + "loss": 0.9869, + "step": 10541 + }, + { + "epoch": 1.8769586894586894, + "grad_norm": 0.6885373592376709, + "learning_rate": 0.00010982435014378858, + "loss": 1.1803, + "step": 10542 + }, + { + "epoch": 1.8771367521367521, + "grad_norm": 0.7610142827033997, + "learning_rate": 0.00010981042028950788, + "loss": 0.9219, + "step": 10543 + }, + { + "epoch": 1.8773148148148149, + "grad_norm": 0.6545612215995789, + "learning_rate": 0.00010979649024301242, + "loss": 1.0337, + "step": 10544 + }, + { + "epoch": 1.8774928774928776, + "grad_norm": 0.7307698130607605, + "learning_rate": 0.00010978256000457505, + "loss": 0.9726, + "step": 10545 + }, + { + "epoch": 1.8776709401709402, + "grad_norm": 0.68310546875, + "learning_rate": 0.00010976862957446877, + "loss": 1.161, + "step": 10546 + }, + { + "epoch": 1.8778490028490027, + "grad_norm": 0.6114758253097534, + "learning_rate": 0.00010975469895296646, + "loss": 0.8863, + "step": 10547 + }, + { + "epoch": 1.8780270655270654, + "grad_norm": 0.732390820980072, + "learning_rate": 0.00010974076814034106, + "loss": 1.0339, + "step": 10548 + }, + { + "epoch": 
1.8782051282051282, + "grad_norm": 0.6741712689399719, + "learning_rate": 0.0001097268371368656, + "loss": 1.0024, + "step": 10549 + }, + { + "epoch": 1.878383190883191, + "grad_norm": 0.6374897360801697, + "learning_rate": 0.00010971290594281294, + "loss": 0.91, + "step": 10550 + }, + { + "epoch": 1.8785612535612537, + "grad_norm": 0.6434261202812195, + "learning_rate": 0.00010969897455845608, + "loss": 1.0048, + "step": 10551 + }, + { + "epoch": 1.8787393162393162, + "grad_norm": 0.6573047041893005, + "learning_rate": 0.00010968504298406794, + "loss": 1.118, + "step": 10552 + }, + { + "epoch": 1.8789173789173788, + "grad_norm": 0.6686552166938782, + "learning_rate": 0.00010967111121992152, + "loss": 1.089, + "step": 10553 + }, + { + "epoch": 1.8790954415954415, + "grad_norm": 0.7899606823921204, + "learning_rate": 0.00010965717926628976, + "loss": 1.059, + "step": 10554 + }, + { + "epoch": 1.8792735042735043, + "grad_norm": 0.5808879733085632, + "learning_rate": 0.00010964324712344564, + "loss": 0.9369, + "step": 10555 + }, + { + "epoch": 1.879451566951567, + "grad_norm": 0.6322834491729736, + "learning_rate": 0.00010962931479166211, + "loss": 0.8783, + "step": 10556 + }, + { + "epoch": 1.8796296296296298, + "grad_norm": 0.647002637386322, + "learning_rate": 0.00010961538227121218, + "loss": 0.9468, + "step": 10557 + }, + { + "epoch": 1.8798076923076923, + "grad_norm": 0.6581854820251465, + "learning_rate": 0.0001096014495623688, + "loss": 1.0077, + "step": 10558 + }, + { + "epoch": 1.8799857549857548, + "grad_norm": 0.6879259943962097, + "learning_rate": 0.00010958751666540496, + "loss": 0.976, + "step": 10559 + }, + { + "epoch": 1.8801638176638176, + "grad_norm": 0.7055090665817261, + "learning_rate": 0.00010957358358059364, + "loss": 0.8903, + "step": 10560 + }, + { + "epoch": 1.8803418803418803, + "grad_norm": 0.6865016222000122, + "learning_rate": 0.00010955965030820782, + "loss": 0.9872, + "step": 10561 + }, + { + "epoch": 1.880519943019943, + "grad_norm": 
0.663436770439148, + "learning_rate": 0.00010954571684852055, + "loss": 1.0485, + "step": 10562 + }, + { + "epoch": 1.8806980056980058, + "grad_norm": 0.6861656904220581, + "learning_rate": 0.00010953178320180475, + "loss": 1.0691, + "step": 10563 + }, + { + "epoch": 1.8808760683760684, + "grad_norm": 0.8045449256896973, + "learning_rate": 0.0001095178493683335, + "loss": 1.1534, + "step": 10564 + }, + { + "epoch": 1.881054131054131, + "grad_norm": 0.6493151187896729, + "learning_rate": 0.00010950391534837973, + "loss": 0.8756, + "step": 10565 + }, + { + "epoch": 1.8812321937321936, + "grad_norm": 0.7057121992111206, + "learning_rate": 0.00010948998114221651, + "loss": 1.1709, + "step": 10566 + }, + { + "epoch": 1.8814102564102564, + "grad_norm": 0.7708197236061096, + "learning_rate": 0.0001094760467501168, + "loss": 1.0037, + "step": 10567 + }, + { + "epoch": 1.8815883190883191, + "grad_norm": 0.7234642505645752, + "learning_rate": 0.00010946211217235364, + "loss": 1.0757, + "step": 10568 + }, + { + "epoch": 1.881766381766382, + "grad_norm": 0.6964395642280579, + "learning_rate": 0.00010944817740920006, + "loss": 1.0769, + "step": 10569 + }, + { + "epoch": 1.8819444444444444, + "grad_norm": 0.7465848922729492, + "learning_rate": 0.00010943424246092906, + "loss": 0.9772, + "step": 10570 + }, + { + "epoch": 1.8821225071225072, + "grad_norm": 0.7145788073539734, + "learning_rate": 0.0001094203073278137, + "loss": 0.9638, + "step": 10571 + }, + { + "epoch": 1.8823005698005697, + "grad_norm": 0.7421764135360718, + "learning_rate": 0.00010940637201012698, + "loss": 1.0324, + "step": 10572 + }, + { + "epoch": 1.8824786324786325, + "grad_norm": 0.7373253107070923, + "learning_rate": 0.0001093924365081419, + "loss": 1.1554, + "step": 10573 + }, + { + "epoch": 1.8826566951566952, + "grad_norm": 0.6861984729766846, + "learning_rate": 0.00010937850082213156, + "loss": 0.9899, + "step": 10574 + }, + { + "epoch": 1.882834757834758, + "grad_norm": 0.6173393130302429, + 
"learning_rate": 0.000109364564952369, + "loss": 0.8495, + "step": 10575 + }, + { + "epoch": 1.8830128205128205, + "grad_norm": 0.6871610879898071, + "learning_rate": 0.00010935062889912723, + "loss": 1.2164, + "step": 10576 + }, + { + "epoch": 1.8831908831908832, + "grad_norm": 0.7062903642654419, + "learning_rate": 0.00010933669266267931, + "loss": 1.1077, + "step": 10577 + }, + { + "epoch": 1.8833689458689458, + "grad_norm": 0.6574689745903015, + "learning_rate": 0.00010932275624329828, + "loss": 0.9326, + "step": 10578 + }, + { + "epoch": 1.8835470085470085, + "grad_norm": 0.636385440826416, + "learning_rate": 0.00010930881964125723, + "loss": 1.0581, + "step": 10579 + }, + { + "epoch": 1.8837250712250713, + "grad_norm": 0.6178432106971741, + "learning_rate": 0.0001092948828568292, + "loss": 1.1288, + "step": 10580 + }, + { + "epoch": 1.883903133903134, + "grad_norm": 0.6509431600570679, + "learning_rate": 0.00010928094589028721, + "loss": 1.0113, + "step": 10581 + }, + { + "epoch": 1.8840811965811965, + "grad_norm": 0.6543706059455872, + "learning_rate": 0.00010926700874190441, + "loss": 1.0041, + "step": 10582 + }, + { + "epoch": 1.8842592592592593, + "grad_norm": 0.6815463304519653, + "learning_rate": 0.0001092530714119538, + "loss": 1.0892, + "step": 10583 + }, + { + "epoch": 1.8844373219373218, + "grad_norm": 0.6787421107292175, + "learning_rate": 0.00010923913390070846, + "loss": 1.2693, + "step": 10584 + }, + { + "epoch": 1.8846153846153846, + "grad_norm": 0.6953850984573364, + "learning_rate": 0.00010922519620844151, + "loss": 0.9848, + "step": 10585 + }, + { + "epoch": 1.8847934472934473, + "grad_norm": 0.7061360478401184, + "learning_rate": 0.000109211258335426, + "loss": 0.949, + "step": 10586 + }, + { + "epoch": 1.88497150997151, + "grad_norm": 0.6845372915267944, + "learning_rate": 0.00010919732028193504, + "loss": 0.9554, + "step": 10587 + }, + { + "epoch": 1.8851495726495726, + "grad_norm": 0.6524720788002014, + "learning_rate": 
0.00010918338204824165, + "loss": 1.1037, + "step": 10588 + }, + { + "epoch": 1.8853276353276354, + "grad_norm": 0.6410523653030396, + "learning_rate": 0.00010916944363461899, + "loss": 0.9085, + "step": 10589 + }, + { + "epoch": 1.885505698005698, + "grad_norm": 0.7109059691429138, + "learning_rate": 0.00010915550504134014, + "loss": 1.0526, + "step": 10590 + }, + { + "epoch": 1.8856837606837606, + "grad_norm": 0.7781991362571716, + "learning_rate": 0.00010914156626867818, + "loss": 0.9737, + "step": 10591 + }, + { + "epoch": 1.8858618233618234, + "grad_norm": 0.7173767685890198, + "learning_rate": 0.00010912762731690623, + "loss": 0.8862, + "step": 10592 + }, + { + "epoch": 1.8860398860398861, + "grad_norm": 0.7650504112243652, + "learning_rate": 0.00010911368818629732, + "loss": 1.2175, + "step": 10593 + }, + { + "epoch": 1.8862179487179487, + "grad_norm": 0.6316116452217102, + "learning_rate": 0.00010909974887712468, + "loss": 0.8332, + "step": 10594 + }, + { + "epoch": 1.8863960113960114, + "grad_norm": 0.6504800319671631, + "learning_rate": 0.00010908580938966138, + "loss": 0.8864, + "step": 10595 + }, + { + "epoch": 1.886574074074074, + "grad_norm": 0.675507128238678, + "learning_rate": 0.00010907186972418049, + "loss": 0.8523, + "step": 10596 + }, + { + "epoch": 1.8867521367521367, + "grad_norm": 0.6535763144493103, + "learning_rate": 0.00010905792988095515, + "loss": 1.0786, + "step": 10597 + }, + { + "epoch": 1.8869301994301995, + "grad_norm": 0.7071853280067444, + "learning_rate": 0.0001090439898602585, + "loss": 0.9319, + "step": 10598 + }, + { + "epoch": 1.8871082621082622, + "grad_norm": 0.699466347694397, + "learning_rate": 0.00010903004966236365, + "loss": 0.9573, + "step": 10599 + }, + { + "epoch": 1.8872863247863247, + "grad_norm": 0.7099201083183289, + "learning_rate": 0.00010901610928754375, + "loss": 0.9447, + "step": 10600 + }, + { + "epoch": 1.8874643874643875, + "grad_norm": 0.6140450835227966, + "learning_rate": 0.00010900216873607189, + 
"loss": 1.0227, + "step": 10601 + }, + { + "epoch": 1.88764245014245, + "grad_norm": 0.6613629460334778, + "learning_rate": 0.00010898822800822127, + "loss": 1.0152, + "step": 10602 + }, + { + "epoch": 1.8878205128205128, + "grad_norm": 0.7334819436073303, + "learning_rate": 0.00010897428710426498, + "loss": 1.1452, + "step": 10603 + }, + { + "epoch": 1.8879985754985755, + "grad_norm": 0.6819368004798889, + "learning_rate": 0.00010896034602447616, + "loss": 1.0504, + "step": 10604 + }, + { + "epoch": 1.8881766381766383, + "grad_norm": 0.6781361103057861, + "learning_rate": 0.00010894640476912799, + "loss": 0.8719, + "step": 10605 + }, + { + "epoch": 1.8883547008547008, + "grad_norm": 0.621960461139679, + "learning_rate": 0.00010893246333849361, + "loss": 0.9264, + "step": 10606 + }, + { + "epoch": 1.8885327635327636, + "grad_norm": 0.6350592374801636, + "learning_rate": 0.00010891852173284615, + "loss": 1.0042, + "step": 10607 + }, + { + "epoch": 1.888710826210826, + "grad_norm": 0.6650694012641907, + "learning_rate": 0.00010890457995245879, + "loss": 1.1387, + "step": 10608 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 0.6515723466873169, + "learning_rate": 0.00010889063799760468, + "loss": 0.9508, + "step": 10609 + }, + { + "epoch": 1.8890669515669516, + "grad_norm": 0.6368890404701233, + "learning_rate": 0.000108876695868557, + "loss": 0.8051, + "step": 10610 + }, + { + "epoch": 1.8892450142450143, + "grad_norm": 0.7971013188362122, + "learning_rate": 0.00010886275356558888, + "loss": 0.8629, + "step": 10611 + }, + { + "epoch": 1.8894230769230769, + "grad_norm": 0.6739095449447632, + "learning_rate": 0.00010884881108897353, + "loss": 0.9606, + "step": 10612 + }, + { + "epoch": 1.8896011396011396, + "grad_norm": 0.7754076719284058, + "learning_rate": 0.00010883486843898412, + "loss": 1.0751, + "step": 10613 + }, + { + "epoch": 1.8897792022792022, + "grad_norm": 0.6538285613059998, + "learning_rate": 0.00010882092561589379, + "loss": 0.9288, + "step": 
10614 + }, + { + "epoch": 1.889957264957265, + "grad_norm": 0.7373257875442505, + "learning_rate": 0.00010880698261997577, + "loss": 0.9884, + "step": 10615 + }, + { + "epoch": 1.8901353276353277, + "grad_norm": 0.6575660109519958, + "learning_rate": 0.00010879303945150321, + "loss": 1.0307, + "step": 10616 + }, + { + "epoch": 1.8903133903133904, + "grad_norm": 0.7500179409980774, + "learning_rate": 0.00010877909611074932, + "loss": 1.0812, + "step": 10617 + }, + { + "epoch": 1.890491452991453, + "grad_norm": 0.7607308030128479, + "learning_rate": 0.00010876515259798727, + "loss": 0.9746, + "step": 10618 + }, + { + "epoch": 1.8906695156695157, + "grad_norm": 0.7930253744125366, + "learning_rate": 0.00010875120891349024, + "loss": 0.7911, + "step": 10619 + }, + { + "epoch": 1.8908475783475782, + "grad_norm": 0.635254979133606, + "learning_rate": 0.00010873726505753148, + "loss": 1.0468, + "step": 10620 + }, + { + "epoch": 1.891025641025641, + "grad_norm": 0.7579759359359741, + "learning_rate": 0.00010872332103038414, + "loss": 0.9558, + "step": 10621 + }, + { + "epoch": 1.8912037037037037, + "grad_norm": 0.5841903686523438, + "learning_rate": 0.00010870937683232146, + "loss": 0.913, + "step": 10622 + }, + { + "epoch": 1.8913817663817665, + "grad_norm": 0.7088860273361206, + "learning_rate": 0.00010869543246361664, + "loss": 1.0814, + "step": 10623 + }, + { + "epoch": 1.8915598290598292, + "grad_norm": 0.6713772416114807, + "learning_rate": 0.00010868148792454285, + "loss": 0.9972, + "step": 10624 + }, + { + "epoch": 1.8917378917378918, + "grad_norm": 0.6733243465423584, + "learning_rate": 0.00010866754321537338, + "loss": 0.9596, + "step": 10625 + }, + { + "epoch": 1.8919159544159543, + "grad_norm": 0.7747747898101807, + "learning_rate": 0.00010865359833638138, + "loss": 1.0871, + "step": 10626 + }, + { + "epoch": 1.892094017094017, + "grad_norm": 0.677175760269165, + "learning_rate": 0.00010863965328784011, + "loss": 0.9939, + "step": 10627 + }, + { + "epoch": 
1.8922720797720798, + "grad_norm": 0.7883930206298828, + "learning_rate": 0.00010862570807002279, + "loss": 1.0708, + "step": 10628 + }, + { + "epoch": 1.8924501424501425, + "grad_norm": 0.7003030180931091, + "learning_rate": 0.00010861176268320261, + "loss": 0.9791, + "step": 10629 + }, + { + "epoch": 1.8926282051282053, + "grad_norm": 0.7450358271598816, + "learning_rate": 0.00010859781712765284, + "loss": 0.9672, + "step": 10630 + }, + { + "epoch": 1.8928062678062678, + "grad_norm": 0.7776696085929871, + "learning_rate": 0.00010858387140364672, + "loss": 1.1037, + "step": 10631 + }, + { + "epoch": 1.8929843304843303, + "grad_norm": 0.6896173357963562, + "learning_rate": 0.00010856992551145745, + "loss": 1.0048, + "step": 10632 + }, + { + "epoch": 1.893162393162393, + "grad_norm": 0.5997697710990906, + "learning_rate": 0.00010855597945135834, + "loss": 0.8025, + "step": 10633 + }, + { + "epoch": 1.8933404558404558, + "grad_norm": 0.8781484365463257, + "learning_rate": 0.00010854203322362251, + "loss": 1.0014, + "step": 10634 + }, + { + "epoch": 1.8935185185185186, + "grad_norm": 0.6348843574523926, + "learning_rate": 0.00010852808682852334, + "loss": 0.9857, + "step": 10635 + }, + { + "epoch": 1.8936965811965814, + "grad_norm": 0.9704267978668213, + "learning_rate": 0.000108514140266334, + "loss": 1.0522, + "step": 10636 + }, + { + "epoch": 1.8938746438746439, + "grad_norm": 0.70372074842453, + "learning_rate": 0.00010850019353732779, + "loss": 1.1044, + "step": 10637 + }, + { + "epoch": 1.8940527065527064, + "grad_norm": 0.6528043150901794, + "learning_rate": 0.00010848624664177793, + "loss": 0.9328, + "step": 10638 + }, + { + "epoch": 1.8942307692307692, + "grad_norm": 0.6299768090248108, + "learning_rate": 0.00010847229957995768, + "loss": 1.0099, + "step": 10639 + }, + { + "epoch": 1.894408831908832, + "grad_norm": 0.6347038149833679, + "learning_rate": 0.00010845835235214034, + "loss": 1.1354, + "step": 10640 + }, + { + "epoch": 1.8945868945868947, + 
"grad_norm": 0.7087811827659607, + "learning_rate": 0.00010844440495859913, + "loss": 1.0543, + "step": 10641 + }, + { + "epoch": 1.8947649572649574, + "grad_norm": 0.7386305332183838, + "learning_rate": 0.00010843045739960738, + "loss": 0.9192, + "step": 10642 + }, + { + "epoch": 1.89494301994302, + "grad_norm": 0.6047097444534302, + "learning_rate": 0.00010841650967543833, + "loss": 0.8668, + "step": 10643 + }, + { + "epoch": 1.8951210826210825, + "grad_norm": 0.6779503226280212, + "learning_rate": 0.00010840256178636523, + "loss": 0.9263, + "step": 10644 + }, + { + "epoch": 1.8952991452991452, + "grad_norm": 0.7398194670677185, + "learning_rate": 0.00010838861373266138, + "loss": 0.9534, + "step": 10645 + }, + { + "epoch": 1.895477207977208, + "grad_norm": 0.8138558864593506, + "learning_rate": 0.00010837466551460011, + "loss": 0.9835, + "step": 10646 + }, + { + "epoch": 1.8956552706552707, + "grad_norm": 0.8847818374633789, + "learning_rate": 0.00010836071713245466, + "loss": 0.9769, + "step": 10647 + }, + { + "epoch": 1.8958333333333335, + "grad_norm": 0.6824164390563965, + "learning_rate": 0.0001083467685864983, + "loss": 0.9901, + "step": 10648 + }, + { + "epoch": 1.896011396011396, + "grad_norm": 0.6318182945251465, + "learning_rate": 0.00010833281987700436, + "loss": 0.7677, + "step": 10649 + }, + { + "epoch": 1.8961894586894585, + "grad_norm": 0.7372074127197266, + "learning_rate": 0.00010831887100424612, + "loss": 0.9858, + "step": 10650 + }, + { + "epoch": 1.8963675213675213, + "grad_norm": 0.7246516346931458, + "learning_rate": 0.00010830492196849688, + "loss": 0.9644, + "step": 10651 + }, + { + "epoch": 1.896545584045584, + "grad_norm": 0.6517095565795898, + "learning_rate": 0.00010829097277002997, + "loss": 1.1733, + "step": 10652 + }, + { + "epoch": 1.8967236467236468, + "grad_norm": 0.6931695342063904, + "learning_rate": 0.00010827702340911867, + "loss": 0.9923, + "step": 10653 + }, + { + "epoch": 1.8969017094017095, + "grad_norm": 
0.6210272312164307, + "learning_rate": 0.00010826307388603628, + "loss": 0.8757, + "step": 10654 + }, + { + "epoch": 1.897079772079772, + "grad_norm": 0.7011165618896484, + "learning_rate": 0.00010824912420105611, + "loss": 1.0011, + "step": 10655 + }, + { + "epoch": 1.8972578347578346, + "grad_norm": 0.7431246638298035, + "learning_rate": 0.0001082351743544515, + "loss": 1.1498, + "step": 10656 + }, + { + "epoch": 1.8974358974358974, + "grad_norm": 0.7099978923797607, + "learning_rate": 0.00010822122434649576, + "loss": 1.0673, + "step": 10657 + }, + { + "epoch": 1.89761396011396, + "grad_norm": 0.7375551462173462, + "learning_rate": 0.00010820727417746219, + "loss": 1.0157, + "step": 10658 + }, + { + "epoch": 1.8977920227920229, + "grad_norm": 0.8155642151832581, + "learning_rate": 0.00010819332384762413, + "loss": 1.229, + "step": 10659 + }, + { + "epoch": 1.8979700854700856, + "grad_norm": 0.6917914748191833, + "learning_rate": 0.00010817937335725493, + "loss": 0.9701, + "step": 10660 + }, + { + "epoch": 1.8981481481481481, + "grad_norm": 0.8498218059539795, + "learning_rate": 0.00010816542270662786, + "loss": 1.0123, + "step": 10661 + }, + { + "epoch": 1.8983262108262107, + "grad_norm": 0.7234359979629517, + "learning_rate": 0.00010815147189601634, + "loss": 1.0755, + "step": 10662 + }, + { + "epoch": 1.8985042735042734, + "grad_norm": 0.6997553110122681, + "learning_rate": 0.00010813752092569365, + "loss": 1.1594, + "step": 10663 + }, + { + "epoch": 1.8986823361823362, + "grad_norm": 0.6519457101821899, + "learning_rate": 0.00010812356979593314, + "loss": 0.9609, + "step": 10664 + }, + { + "epoch": 1.898860398860399, + "grad_norm": 0.7215374708175659, + "learning_rate": 0.00010810961850700813, + "loss": 1.1392, + "step": 10665 + }, + { + "epoch": 1.8990384615384617, + "grad_norm": 0.7766093611717224, + "learning_rate": 0.00010809566705919202, + "loss": 1.0256, + "step": 10666 + }, + { + "epoch": 1.8992165242165242, + "grad_norm": 0.6520358920097351, + 
"learning_rate": 0.00010808171545275814, + "loss": 1.0434, + "step": 10667 + }, + { + "epoch": 1.8993945868945867, + "grad_norm": 0.7454953193664551, + "learning_rate": 0.00010806776368797982, + "loss": 1.2323, + "step": 10668 + }, + { + "epoch": 1.8995726495726495, + "grad_norm": 0.6891530752182007, + "learning_rate": 0.00010805381176513043, + "loss": 1.1104, + "step": 10669 + }, + { + "epoch": 1.8997507122507122, + "grad_norm": 0.6609626412391663, + "learning_rate": 0.00010803985968448331, + "loss": 0.8565, + "step": 10670 + }, + { + "epoch": 1.899928774928775, + "grad_norm": 0.6650999188423157, + "learning_rate": 0.00010802590744631187, + "loss": 1.1003, + "step": 10671 + }, + { + "epoch": 1.9001068376068377, + "grad_norm": 0.5794292092323303, + "learning_rate": 0.00010801195505088945, + "loss": 0.528, + "step": 10672 + }, + { + "epoch": 1.9002849002849003, + "grad_norm": 1.0802743434906006, + "learning_rate": 0.00010799800249848939, + "loss": 0.8861, + "step": 10673 + }, + { + "epoch": 1.9004629629629628, + "grad_norm": 0.650833249092102, + "learning_rate": 0.00010798404978938513, + "loss": 0.9962, + "step": 10674 + }, + { + "epoch": 1.9006410256410255, + "grad_norm": 0.7290451526641846, + "learning_rate": 0.00010797009692384994, + "loss": 1.0764, + "step": 10675 + }, + { + "epoch": 1.9008190883190883, + "grad_norm": 0.6273928880691528, + "learning_rate": 0.00010795614390215727, + "loss": 0.9478, + "step": 10676 + }, + { + "epoch": 1.900997150997151, + "grad_norm": 0.6939455270767212, + "learning_rate": 0.00010794219072458052, + "loss": 0.8991, + "step": 10677 + }, + { + "epoch": 1.9011752136752138, + "grad_norm": 0.7455828189849854, + "learning_rate": 0.00010792823739139302, + "loss": 0.8902, + "step": 10678 + }, + { + "epoch": 1.9013532763532763, + "grad_norm": 0.6894607543945312, + "learning_rate": 0.00010791428390286817, + "loss": 0.9355, + "step": 10679 + }, + { + "epoch": 1.901531339031339, + "grad_norm": 0.6844658851623535, + "learning_rate": 
0.00010790033025927936, + "loss": 0.9835, + "step": 10680 + }, + { + "epoch": 1.9017094017094016, + "grad_norm": 0.6646730899810791, + "learning_rate": 0.00010788637646090001, + "loss": 0.9376, + "step": 10681 + }, + { + "epoch": 1.9018874643874644, + "grad_norm": 0.6494864225387573, + "learning_rate": 0.00010787242250800349, + "loss": 0.8533, + "step": 10682 + }, + { + "epoch": 1.9020655270655271, + "grad_norm": 0.686198353767395, + "learning_rate": 0.0001078584684008632, + "loss": 0.8075, + "step": 10683 + }, + { + "epoch": 1.9022435897435899, + "grad_norm": 0.7014855742454529, + "learning_rate": 0.00010784451413975256, + "loss": 1.0805, + "step": 10684 + }, + { + "epoch": 1.9024216524216524, + "grad_norm": 0.7191864252090454, + "learning_rate": 0.00010783055972494496, + "loss": 0.9375, + "step": 10685 + }, + { + "epoch": 1.9025997150997151, + "grad_norm": 0.8114212155342102, + "learning_rate": 0.00010781660515671379, + "loss": 0.9716, + "step": 10686 + }, + { + "epoch": 1.9027777777777777, + "grad_norm": 0.7423529028892517, + "learning_rate": 0.0001078026504353325, + "loss": 0.9066, + "step": 10687 + }, + { + "epoch": 1.9029558404558404, + "grad_norm": 0.6517882347106934, + "learning_rate": 0.00010778869556107447, + "loss": 0.9908, + "step": 10688 + }, + { + "epoch": 1.9031339031339032, + "grad_norm": 0.6983367800712585, + "learning_rate": 0.00010777474053421315, + "loss": 1.1048, + "step": 10689 + }, + { + "epoch": 1.903311965811966, + "grad_norm": 0.597766101360321, + "learning_rate": 0.00010776078535502193, + "loss": 0.84, + "step": 10690 + }, + { + "epoch": 1.9034900284900285, + "grad_norm": 0.7335455417633057, + "learning_rate": 0.00010774683002377422, + "loss": 1.0387, + "step": 10691 + }, + { + "epoch": 1.9036680911680912, + "grad_norm": 0.6742176413536072, + "learning_rate": 0.0001077328745407435, + "loss": 0.9743, + "step": 10692 + }, + { + "epoch": 1.9038461538461537, + "grad_norm": 0.7954961657524109, + "learning_rate": 0.00010771891890620316, + 
"loss": 1.1025, + "step": 10693 + }, + { + "epoch": 1.9040242165242165, + "grad_norm": 0.733351469039917, + "learning_rate": 0.00010770496312042664, + "loss": 1.028, + "step": 10694 + }, + { + "epoch": 1.9042022792022792, + "grad_norm": 0.7059772610664368, + "learning_rate": 0.00010769100718368734, + "loss": 1.0103, + "step": 10695 + }, + { + "epoch": 1.904380341880342, + "grad_norm": 0.6234813332557678, + "learning_rate": 0.00010767705109625877, + "loss": 0.6893, + "step": 10696 + }, + { + "epoch": 1.9045584045584045, + "grad_norm": 0.6670311689376831, + "learning_rate": 0.0001076630948584143, + "loss": 1.1386, + "step": 10697 + }, + { + "epoch": 1.9047364672364673, + "grad_norm": 0.7444894909858704, + "learning_rate": 0.00010764913847042744, + "loss": 0.8524, + "step": 10698 + }, + { + "epoch": 1.9049145299145298, + "grad_norm": 0.6252964735031128, + "learning_rate": 0.00010763518193257158, + "loss": 0.9407, + "step": 10699 + }, + { + "epoch": 1.9050925925925926, + "grad_norm": 0.7794382572174072, + "learning_rate": 0.0001076212252451202, + "loss": 1.05, + "step": 10700 + }, + { + "epoch": 1.9052706552706553, + "grad_norm": 0.6313693523406982, + "learning_rate": 0.00010760726840834671, + "loss": 0.8667, + "step": 10701 + }, + { + "epoch": 1.905448717948718, + "grad_norm": 0.6766461730003357, + "learning_rate": 0.00010759331142252462, + "loss": 0.9675, + "step": 10702 + }, + { + "epoch": 1.9056267806267806, + "grad_norm": 0.7457365393638611, + "learning_rate": 0.00010757935428792739, + "loss": 0.9177, + "step": 10703 + }, + { + "epoch": 1.9058048433048433, + "grad_norm": 0.6649872064590454, + "learning_rate": 0.00010756539700482844, + "loss": 0.8703, + "step": 10704 + }, + { + "epoch": 1.9059829059829059, + "grad_norm": 0.8418740034103394, + "learning_rate": 0.00010755143957350127, + "loss": 0.8993, + "step": 10705 + }, + { + "epoch": 1.9061609686609686, + "grad_norm": 0.6767167448997498, + "learning_rate": 0.00010753748199421929, + "loss": 1.0063, + "step": 10706 
+ }, + { + "epoch": 1.9063390313390314, + "grad_norm": 0.6959242820739746, + "learning_rate": 0.00010752352426725603, + "loss": 1.0516, + "step": 10707 + }, + { + "epoch": 1.9065170940170941, + "grad_norm": 0.7106529474258423, + "learning_rate": 0.00010750956639288493, + "loss": 0.9596, + "step": 10708 + }, + { + "epoch": 1.9066951566951567, + "grad_norm": 0.7611243724822998, + "learning_rate": 0.00010749560837137949, + "loss": 1.0739, + "step": 10709 + }, + { + "epoch": 1.9068732193732194, + "grad_norm": 0.6684338450431824, + "learning_rate": 0.00010748165020301317, + "loss": 1.1437, + "step": 10710 + }, + { + "epoch": 1.907051282051282, + "grad_norm": 0.5957385897636414, + "learning_rate": 0.00010746769188805945, + "loss": 0.8802, + "step": 10711 + }, + { + "epoch": 1.9072293447293447, + "grad_norm": 0.69919353723526, + "learning_rate": 0.00010745373342679184, + "loss": 1.1891, + "step": 10712 + }, + { + "epoch": 1.9074074074074074, + "grad_norm": 0.7562127709388733, + "learning_rate": 0.0001074397748194838, + "loss": 0.8717, + "step": 10713 + }, + { + "epoch": 1.9075854700854702, + "grad_norm": 0.6420038938522339, + "learning_rate": 0.00010742581606640882, + "loss": 1.1196, + "step": 10714 + }, + { + "epoch": 1.9077635327635327, + "grad_norm": 0.7545611262321472, + "learning_rate": 0.00010741185716784039, + "loss": 1.161, + "step": 10715 + }, + { + "epoch": 1.9079415954415955, + "grad_norm": 0.6467727422714233, + "learning_rate": 0.000107397898124052, + "loss": 0.8029, + "step": 10716 + }, + { + "epoch": 1.908119658119658, + "grad_norm": 0.6129235625267029, + "learning_rate": 0.00010738393893531722, + "loss": 0.8802, + "step": 10717 + }, + { + "epoch": 1.9082977207977208, + "grad_norm": 0.6416113376617432, + "learning_rate": 0.00010736997960190946, + "loss": 0.8465, + "step": 10718 + }, + { + "epoch": 1.9084757834757835, + "grad_norm": 0.6609050631523132, + "learning_rate": 0.00010735602012410229, + "loss": 0.9484, + "step": 10719 + }, + { + "epoch": 
1.9086538461538463, + "grad_norm": 0.6302639842033386, + "learning_rate": 0.00010734206050216913, + "loss": 0.898, + "step": 10720 + }, + { + "epoch": 1.9088319088319088, + "grad_norm": 0.7291215658187866, + "learning_rate": 0.00010732810073638358, + "loss": 0.9544, + "step": 10721 + }, + { + "epoch": 1.9090099715099715, + "grad_norm": 0.6436966061592102, + "learning_rate": 0.0001073141408270191, + "loss": 0.956, + "step": 10722 + }, + { + "epoch": 1.909188034188034, + "grad_norm": 0.6247875094413757, + "learning_rate": 0.00010730018077434924, + "loss": 0.8704, + "step": 10723 + }, + { + "epoch": 1.9093660968660968, + "grad_norm": 0.7599029541015625, + "learning_rate": 0.00010728622057864753, + "loss": 1.2024, + "step": 10724 + }, + { + "epoch": 1.9095441595441596, + "grad_norm": 0.6894544959068298, + "learning_rate": 0.00010727226024018744, + "loss": 1.1226, + "step": 10725 + }, + { + "epoch": 1.9097222222222223, + "grad_norm": 0.6920733451843262, + "learning_rate": 0.0001072582997592425, + "loss": 0.7682, + "step": 10726 + }, + { + "epoch": 1.9099002849002849, + "grad_norm": 0.6013005375862122, + "learning_rate": 0.00010724433913608627, + "loss": 0.9462, + "step": 10727 + }, + { + "epoch": 1.9100783475783476, + "grad_norm": 0.7466302514076233, + "learning_rate": 0.00010723037837099225, + "loss": 0.9507, + "step": 10728 + }, + { + "epoch": 1.9102564102564101, + "grad_norm": 0.7070091962814331, + "learning_rate": 0.00010721641746423401, + "loss": 1.0704, + "step": 10729 + }, + { + "epoch": 1.9104344729344729, + "grad_norm": 0.6747950315475464, + "learning_rate": 0.00010720245641608506, + "loss": 0.7899, + "step": 10730 + }, + { + "epoch": 1.9106125356125356, + "grad_norm": 0.7338371276855469, + "learning_rate": 0.00010718849522681891, + "loss": 0.9574, + "step": 10731 + }, + { + "epoch": 1.9107905982905984, + "grad_norm": 0.6923216581344604, + "learning_rate": 0.00010717453389670915, + "loss": 1.0725, + "step": 10732 + }, + { + "epoch": 1.910968660968661, + 
"grad_norm": 0.6050783395767212, + "learning_rate": 0.0001071605724260293, + "loss": 0.9224, + "step": 10733 + }, + { + "epoch": 1.9111467236467237, + "grad_norm": 0.6854597330093384, + "learning_rate": 0.00010714661081505291, + "loss": 0.9749, + "step": 10734 + }, + { + "epoch": 1.9113247863247862, + "grad_norm": 0.7661508321762085, + "learning_rate": 0.00010713264906405351, + "loss": 1.1564, + "step": 10735 + }, + { + "epoch": 1.911502849002849, + "grad_norm": 0.6389622688293457, + "learning_rate": 0.00010711868717330467, + "loss": 0.8148, + "step": 10736 + }, + { + "epoch": 1.9116809116809117, + "grad_norm": 0.6318161487579346, + "learning_rate": 0.00010710472514307996, + "loss": 0.7833, + "step": 10737 + }, + { + "epoch": 1.9118589743589745, + "grad_norm": 0.8646727800369263, + "learning_rate": 0.00010709076297365292, + "loss": 1.2682, + "step": 10738 + }, + { + "epoch": 1.9120370370370372, + "grad_norm": 0.6085501909255981, + "learning_rate": 0.0001070768006652971, + "loss": 0.8706, + "step": 10739 + }, + { + "epoch": 1.9122150997150997, + "grad_norm": 0.8259731531143188, + "learning_rate": 0.00010706283821828607, + "loss": 0.9014, + "step": 10740 + }, + { + "epoch": 1.9123931623931623, + "grad_norm": 0.6509148478507996, + "learning_rate": 0.0001070488756328934, + "loss": 0.8814, + "step": 10741 + }, + { + "epoch": 1.912571225071225, + "grad_norm": 0.7241966128349304, + "learning_rate": 0.00010703491290939264, + "loss": 0.9925, + "step": 10742 + }, + { + "epoch": 1.9127492877492878, + "grad_norm": 0.7736822366714478, + "learning_rate": 0.00010702095004805738, + "loss": 1.0881, + "step": 10743 + }, + { + "epoch": 1.9129273504273505, + "grad_norm": 0.6912824511528015, + "learning_rate": 0.00010700698704916123, + "loss": 1.2334, + "step": 10744 + }, + { + "epoch": 1.9131054131054133, + "grad_norm": 0.825065553188324, + "learning_rate": 0.0001069930239129777, + "loss": 0.9783, + "step": 10745 + }, + { + "epoch": 1.9132834757834758, + "grad_norm": 
0.7650560140609741, + "learning_rate": 0.00010697906063978038, + "loss": 0.9788, + "step": 10746 + }, + { + "epoch": 1.9134615384615383, + "grad_norm": 0.7368232607841492, + "learning_rate": 0.00010696509722984287, + "loss": 0.8704, + "step": 10747 + }, + { + "epoch": 1.913639601139601, + "grad_norm": 0.6630628108978271, + "learning_rate": 0.00010695113368343875, + "loss": 1.1993, + "step": 10748 + }, + { + "epoch": 1.9138176638176638, + "grad_norm": 0.6842190027236938, + "learning_rate": 0.0001069371700008416, + "loss": 0.9128, + "step": 10749 + }, + { + "epoch": 1.9139957264957266, + "grad_norm": 0.591655969619751, + "learning_rate": 0.00010692320618232503, + "loss": 1.0607, + "step": 10750 + }, + { + "epoch": 1.9141737891737893, + "grad_norm": 0.74644535779953, + "learning_rate": 0.0001069092422281626, + "loss": 1.0937, + "step": 10751 + }, + { + "epoch": 1.9143518518518519, + "grad_norm": 0.7123813629150391, + "learning_rate": 0.00010689527813862792, + "loss": 0.9043, + "step": 10752 + }, + { + "epoch": 1.9145299145299144, + "grad_norm": 0.6850089430809021, + "learning_rate": 0.0001068813139139946, + "loss": 1.0908, + "step": 10753 + }, + { + "epoch": 1.9147079772079771, + "grad_norm": 0.5882078409194946, + "learning_rate": 0.00010686734955453623, + "loss": 0.829, + "step": 10754 + }, + { + "epoch": 1.91488603988604, + "grad_norm": 0.6741717457771301, + "learning_rate": 0.00010685338506052642, + "loss": 0.9197, + "step": 10755 + }, + { + "epoch": 1.9150641025641026, + "grad_norm": 0.6597354412078857, + "learning_rate": 0.00010683942043223876, + "loss": 0.8778, + "step": 10756 + }, + { + "epoch": 1.9152421652421654, + "grad_norm": 0.6682151556015015, + "learning_rate": 0.00010682545566994684, + "loss": 0.9305, + "step": 10757 + }, + { + "epoch": 1.915420227920228, + "grad_norm": 0.8283176422119141, + "learning_rate": 0.00010681149077392431, + "loss": 1.0164, + "step": 10758 + }, + { + "epoch": 1.9155982905982905, + "grad_norm": 0.648845374584198, + 
"learning_rate": 0.00010679752574444477, + "loss": 1.0114, + "step": 10759 + }, + { + "epoch": 1.9157763532763532, + "grad_norm": 0.755913496017456, + "learning_rate": 0.00010678356058178182, + "loss": 1.1142, + "step": 10760 + }, + { + "epoch": 1.915954415954416, + "grad_norm": 0.7334780097007751, + "learning_rate": 0.00010676959528620911, + "loss": 0.8758, + "step": 10761 + }, + { + "epoch": 1.9161324786324787, + "grad_norm": 0.9132041335105896, + "learning_rate": 0.00010675562985800025, + "loss": 0.995, + "step": 10762 + }, + { + "epoch": 1.9163105413105415, + "grad_norm": 0.7070860266685486, + "learning_rate": 0.00010674166429742882, + "loss": 0.9856, + "step": 10763 + }, + { + "epoch": 1.916488603988604, + "grad_norm": 0.7143638134002686, + "learning_rate": 0.00010672769860476853, + "loss": 1.0612, + "step": 10764 + }, + { + "epoch": 1.9166666666666665, + "grad_norm": 0.815717339515686, + "learning_rate": 0.00010671373278029293, + "loss": 1.1539, + "step": 10765 + }, + { + "epoch": 1.9168447293447293, + "grad_norm": 0.6379499435424805, + "learning_rate": 0.0001066997668242757, + "loss": 0.8295, + "step": 10766 + }, + { + "epoch": 1.917022792022792, + "grad_norm": 0.6482511758804321, + "learning_rate": 0.00010668580073699044, + "loss": 1.0079, + "step": 10767 + }, + { + "epoch": 1.9172008547008548, + "grad_norm": 0.7382873296737671, + "learning_rate": 0.00010667183451871082, + "loss": 0.8973, + "step": 10768 + }, + { + "epoch": 1.9173789173789175, + "grad_norm": 0.7818579077720642, + "learning_rate": 0.00010665786816971044, + "loss": 1.2131, + "step": 10769 + }, + { + "epoch": 1.91755698005698, + "grad_norm": 0.6960901021957397, + "learning_rate": 0.000106643901690263, + "loss": 1.1466, + "step": 10770 + }, + { + "epoch": 1.9177350427350426, + "grad_norm": 0.696966826915741, + "learning_rate": 0.00010662993508064208, + "loss": 0.854, + "step": 10771 + }, + { + "epoch": 1.9179131054131053, + "grad_norm": 0.6745442152023315, + "learning_rate": 
0.00010661596834112133, + "loss": 0.9559, + "step": 10772 + }, + { + "epoch": 1.918091168091168, + "grad_norm": 0.7436230778694153, + "learning_rate": 0.00010660200147197447, + "loss": 1.1367, + "step": 10773 + }, + { + "epoch": 1.9182692307692308, + "grad_norm": 0.6051676869392395, + "learning_rate": 0.00010658803447347509, + "loss": 1.05, + "step": 10774 + }, + { + "epoch": 1.9184472934472936, + "grad_norm": 0.5662530660629272, + "learning_rate": 0.00010657406734589686, + "loss": 0.8697, + "step": 10775 + }, + { + "epoch": 1.9186253561253561, + "grad_norm": 0.6640757322311401, + "learning_rate": 0.00010656010008951344, + "loss": 1.0636, + "step": 10776 + }, + { + "epoch": 1.9188034188034186, + "grad_norm": 0.6994011998176575, + "learning_rate": 0.00010654613270459848, + "loss": 0.9326, + "step": 10777 + }, + { + "epoch": 1.9189814814814814, + "grad_norm": 0.6827420592308044, + "learning_rate": 0.00010653216519142563, + "loss": 0.8667, + "step": 10778 + }, + { + "epoch": 1.9191595441595442, + "grad_norm": 0.6814691424369812, + "learning_rate": 0.00010651819755026862, + "loss": 0.828, + "step": 10779 + }, + { + "epoch": 1.919337606837607, + "grad_norm": 0.7033611536026001, + "learning_rate": 0.00010650422978140103, + "loss": 1.0427, + "step": 10780 + }, + { + "epoch": 1.9195156695156697, + "grad_norm": 0.7098833322525024, + "learning_rate": 0.00010649026188509657, + "loss": 1.1723, + "step": 10781 + }, + { + "epoch": 1.9196937321937322, + "grad_norm": 0.7184767723083496, + "learning_rate": 0.00010647629386162893, + "loss": 0.852, + "step": 10782 + }, + { + "epoch": 1.9198717948717947, + "grad_norm": 0.6682565808296204, + "learning_rate": 0.00010646232571127175, + "loss": 0.8827, + "step": 10783 + }, + { + "epoch": 1.9200498575498575, + "grad_norm": 0.6699280142784119, + "learning_rate": 0.00010644835743429873, + "loss": 0.8346, + "step": 10784 + }, + { + "epoch": 1.9202279202279202, + "grad_norm": 0.8041857481002808, + "learning_rate": 0.00010643438903098355, + 
"loss": 0.9622, + "step": 10785 + }, + { + "epoch": 1.920405982905983, + "grad_norm": 0.7315110564231873, + "learning_rate": 0.00010642042050159986, + "loss": 1.0443, + "step": 10786 + }, + { + "epoch": 1.9205840455840457, + "grad_norm": 0.5850204229354858, + "learning_rate": 0.0001064064518464214, + "loss": 1.0155, + "step": 10787 + }, + { + "epoch": 1.9207621082621082, + "grad_norm": 0.7320640683174133, + "learning_rate": 0.00010639248306572178, + "loss": 1.1556, + "step": 10788 + }, + { + "epoch": 1.9209401709401708, + "grad_norm": 0.689804196357727, + "learning_rate": 0.00010637851415977478, + "loss": 1.1058, + "step": 10789 + }, + { + "epoch": 1.9211182336182335, + "grad_norm": 0.6433262228965759, + "learning_rate": 0.000106364545128854, + "loss": 1.0916, + "step": 10790 + }, + { + "epoch": 1.9212962962962963, + "grad_norm": 0.6802626252174377, + "learning_rate": 0.00010635057597323323, + "loss": 1.126, + "step": 10791 + }, + { + "epoch": 1.921474358974359, + "grad_norm": 0.7503384351730347, + "learning_rate": 0.00010633660669318608, + "loss": 0.9354, + "step": 10792 + }, + { + "epoch": 1.9216524216524218, + "grad_norm": 0.6370253562927246, + "learning_rate": 0.00010632263728898629, + "loss": 0.9976, + "step": 10793 + }, + { + "epoch": 1.9218304843304843, + "grad_norm": 0.7566042542457581, + "learning_rate": 0.00010630866776090755, + "loss": 1.0311, + "step": 10794 + }, + { + "epoch": 1.922008547008547, + "grad_norm": 0.7011943459510803, + "learning_rate": 0.0001062946981092236, + "loss": 0.8777, + "step": 10795 + }, + { + "epoch": 1.9221866096866096, + "grad_norm": 0.6621114015579224, + "learning_rate": 0.00010628072833420811, + "loss": 0.9615, + "step": 10796 + }, + { + "epoch": 1.9223646723646723, + "grad_norm": 0.6863150000572205, + "learning_rate": 0.00010626675843613478, + "loss": 1.071, + "step": 10797 + }, + { + "epoch": 1.922542735042735, + "grad_norm": 0.597970724105835, + "learning_rate": 0.00010625278841527733, + "loss": 0.8661, + "step": 10798 + 
}, + { + "epoch": 1.9227207977207978, + "grad_norm": 0.5958755612373352, + "learning_rate": 0.00010623881827190947, + "loss": 0.9075, + "step": 10799 + }, + { + "epoch": 1.9228988603988604, + "grad_norm": 0.7764523029327393, + "learning_rate": 0.00010622484800630494, + "loss": 1.0576, + "step": 10800 + }, + { + "epoch": 1.9230769230769231, + "grad_norm": 0.774156391620636, + "learning_rate": 0.00010621087761873748, + "loss": 0.9273, + "step": 10801 + }, + { + "epoch": 1.9232549857549857, + "grad_norm": 0.6321687698364258, + "learning_rate": 0.00010619690710948074, + "loss": 0.8805, + "step": 10802 + }, + { + "epoch": 1.9234330484330484, + "grad_norm": 0.659538984298706, + "learning_rate": 0.00010618293647880846, + "loss": 0.9845, + "step": 10803 + }, + { + "epoch": 1.9236111111111112, + "grad_norm": 0.6931299567222595, + "learning_rate": 0.00010616896572699442, + "loss": 1.2005, + "step": 10804 + }, + { + "epoch": 1.923789173789174, + "grad_norm": 0.6054762005805969, + "learning_rate": 0.00010615499485431228, + "loss": 0.825, + "step": 10805 + }, + { + "epoch": 1.9239672364672364, + "grad_norm": 0.6631526947021484, + "learning_rate": 0.00010614102386103584, + "loss": 0.9149, + "step": 10806 + }, + { + "epoch": 1.9241452991452992, + "grad_norm": 0.6667893528938293, + "learning_rate": 0.00010612705274743878, + "loss": 1.014, + "step": 10807 + }, + { + "epoch": 1.9243233618233617, + "grad_norm": 0.861302375793457, + "learning_rate": 0.00010611308151379482, + "loss": 1.0809, + "step": 10808 + }, + { + "epoch": 1.9245014245014245, + "grad_norm": 0.6997994780540466, + "learning_rate": 0.00010609911016037777, + "loss": 0.8897, + "step": 10809 + }, + { + "epoch": 1.9246794871794872, + "grad_norm": 0.5689206123352051, + "learning_rate": 0.00010608513868746131, + "loss": 0.7517, + "step": 10810 + }, + { + "epoch": 1.92485754985755, + "grad_norm": 0.5972287654876709, + "learning_rate": 0.00010607116709531918, + "loss": 0.9015, + "step": 10811 + }, + { + "epoch": 
1.9250356125356125, + "grad_norm": 0.7115643620491028, + "learning_rate": 0.00010605719538422519, + "loss": 0.6974, + "step": 10812 + }, + { + "epoch": 1.9252136752136753, + "grad_norm": 0.6548098921775818, + "learning_rate": 0.00010604322355445297, + "loss": 0.7075, + "step": 10813 + }, + { + "epoch": 1.9253917378917378, + "grad_norm": 0.6666337847709656, + "learning_rate": 0.00010602925160627639, + "loss": 1.0389, + "step": 10814 + }, + { + "epoch": 1.9255698005698005, + "grad_norm": 0.7754444479942322, + "learning_rate": 0.00010601527953996913, + "loss": 1.0674, + "step": 10815 + }, + { + "epoch": 1.9257478632478633, + "grad_norm": 0.6602712869644165, + "learning_rate": 0.00010600130735580498, + "loss": 1.2622, + "step": 10816 + }, + { + "epoch": 1.925925925925926, + "grad_norm": 0.6974020004272461, + "learning_rate": 0.00010598733505405767, + "loss": 0.9748, + "step": 10817 + }, + { + "epoch": 1.9261039886039886, + "grad_norm": 0.6236271858215332, + "learning_rate": 0.00010597336263500095, + "loss": 0.9463, + "step": 10818 + }, + { + "epoch": 1.9262820512820513, + "grad_norm": 0.6856079697608948, + "learning_rate": 0.00010595939009890859, + "loss": 0.9484, + "step": 10819 + }, + { + "epoch": 1.9264601139601139, + "grad_norm": 0.7300925850868225, + "learning_rate": 0.00010594541744605437, + "loss": 0.9702, + "step": 10820 + }, + { + "epoch": 1.9266381766381766, + "grad_norm": 0.6546478867530823, + "learning_rate": 0.00010593144467671208, + "loss": 0.8235, + "step": 10821 + }, + { + "epoch": 1.9268162393162394, + "grad_norm": 0.7215169072151184, + "learning_rate": 0.00010591747179115543, + "loss": 0.9986, + "step": 10822 + }, + { + "epoch": 1.926994301994302, + "grad_norm": 0.7304712533950806, + "learning_rate": 0.00010590349878965822, + "loss": 1.099, + "step": 10823 + }, + { + "epoch": 1.9271723646723646, + "grad_norm": 0.5883305668830872, + "learning_rate": 0.0001058895256724942, + "loss": 1.0647, + "step": 10824 + }, + { + "epoch": 1.9273504273504274, + 
"grad_norm": 0.8067272305488586, + "learning_rate": 0.00010587555243993716, + "loss": 1.0295, + "step": 10825 + }, + { + "epoch": 1.92752849002849, + "grad_norm": 0.6607550978660583, + "learning_rate": 0.00010586157909226089, + "loss": 0.8669, + "step": 10826 + }, + { + "epoch": 1.9277065527065527, + "grad_norm": 0.7256106734275818, + "learning_rate": 0.00010584760562973914, + "loss": 1.1674, + "step": 10827 + }, + { + "epoch": 1.9278846153846154, + "grad_norm": 0.6584621071815491, + "learning_rate": 0.00010583363205264574, + "loss": 0.8901, + "step": 10828 + }, + { + "epoch": 1.9280626780626782, + "grad_norm": 0.7200617790222168, + "learning_rate": 0.00010581965836125439, + "loss": 1.0463, + "step": 10829 + }, + { + "epoch": 1.9282407407407407, + "grad_norm": 0.7244223952293396, + "learning_rate": 0.00010580568455583894, + "loss": 1.0973, + "step": 10830 + }, + { + "epoch": 1.9284188034188035, + "grad_norm": 0.7678009867668152, + "learning_rate": 0.00010579171063667317, + "loss": 1.1753, + "step": 10831 + }, + { + "epoch": 1.928596866096866, + "grad_norm": 0.6455881595611572, + "learning_rate": 0.00010577773660403085, + "loss": 0.8988, + "step": 10832 + }, + { + "epoch": 1.9287749287749287, + "grad_norm": 0.6804864406585693, + "learning_rate": 0.0001057637624581858, + "loss": 0.8156, + "step": 10833 + }, + { + "epoch": 1.9289529914529915, + "grad_norm": 0.7874828577041626, + "learning_rate": 0.00010574978819941176, + "loss": 1.1876, + "step": 10834 + }, + { + "epoch": 1.9291310541310542, + "grad_norm": 0.7396490573883057, + "learning_rate": 0.00010573581382798261, + "loss": 0.8709, + "step": 10835 + }, + { + "epoch": 1.9293091168091168, + "grad_norm": 0.6800381541252136, + "learning_rate": 0.00010572183934417209, + "loss": 0.9906, + "step": 10836 + }, + { + "epoch": 1.9294871794871795, + "grad_norm": 0.7077754139900208, + "learning_rate": 0.000105707864748254, + "loss": 0.9785, + "step": 10837 + }, + { + "epoch": 1.929665242165242, + "grad_norm": 
0.693249523639679, + "learning_rate": 0.00010569389004050216, + "loss": 0.9515, + "step": 10838 + }, + { + "epoch": 1.9298433048433048, + "grad_norm": 0.706924319267273, + "learning_rate": 0.00010567991522119037, + "loss": 1.074, + "step": 10839 + }, + { + "epoch": 1.9300213675213675, + "grad_norm": 0.6504101157188416, + "learning_rate": 0.00010566594029059244, + "loss": 1.0635, + "step": 10840 + }, + { + "epoch": 1.9301994301994303, + "grad_norm": 0.7620238661766052, + "learning_rate": 0.00010565196524898219, + "loss": 0.944, + "step": 10841 + }, + { + "epoch": 1.9303774928774928, + "grad_norm": 0.6713484525680542, + "learning_rate": 0.00010563799009663344, + "loss": 0.749, + "step": 10842 + }, + { + "epoch": 1.9305555555555556, + "grad_norm": 0.9279242157936096, + "learning_rate": 0.00010562401483381997, + "loss": 0.961, + "step": 10843 + }, + { + "epoch": 1.930733618233618, + "grad_norm": 0.6710723638534546, + "learning_rate": 0.00010561003946081558, + "loss": 1.1288, + "step": 10844 + }, + { + "epoch": 1.9309116809116809, + "grad_norm": 0.7751701474189758, + "learning_rate": 0.00010559606397789416, + "loss": 0.9435, + "step": 10845 + }, + { + "epoch": 1.9310897435897436, + "grad_norm": 0.6741766929626465, + "learning_rate": 0.00010558208838532948, + "loss": 1.0299, + "step": 10846 + }, + { + "epoch": 1.9312678062678064, + "grad_norm": 0.6988041400909424, + "learning_rate": 0.00010556811268339539, + "loss": 1.0236, + "step": 10847 + }, + { + "epoch": 1.931445868945869, + "grad_norm": 0.6353505253791809, + "learning_rate": 0.00010555413687236568, + "loss": 1.0361, + "step": 10848 + }, + { + "epoch": 1.9316239316239316, + "grad_norm": 0.7162703275680542, + "learning_rate": 0.0001055401609525142, + "loss": 1.0931, + "step": 10849 + }, + { + "epoch": 1.9318019943019942, + "grad_norm": 0.61545330286026, + "learning_rate": 0.00010552618492411476, + "loss": 0.8829, + "step": 10850 + }, + { + "epoch": 1.931980056980057, + "grad_norm": 0.6304612159729004, + 
"learning_rate": 0.00010551220878744124, + "loss": 0.8574, + "step": 10851 + }, + { + "epoch": 1.9321581196581197, + "grad_norm": 0.6372067928314209, + "learning_rate": 0.00010549823254276743, + "loss": 1.0949, + "step": 10852 + }, + { + "epoch": 1.9323361823361824, + "grad_norm": 0.6952856779098511, + "learning_rate": 0.00010548425619036715, + "loss": 0.9232, + "step": 10853 + }, + { + "epoch": 1.9325142450142452, + "grad_norm": 0.6510106325149536, + "learning_rate": 0.00010547027973051427, + "loss": 1.0753, + "step": 10854 + }, + { + "epoch": 1.9326923076923077, + "grad_norm": 0.6377716064453125, + "learning_rate": 0.00010545630316348263, + "loss": 0.8466, + "step": 10855 + }, + { + "epoch": 1.9328703703703702, + "grad_norm": 0.7366968393325806, + "learning_rate": 0.00010544232648954606, + "loss": 0.9351, + "step": 10856 + }, + { + "epoch": 1.933048433048433, + "grad_norm": 0.703652024269104, + "learning_rate": 0.00010542834970897843, + "loss": 1.0032, + "step": 10857 + }, + { + "epoch": 1.9332264957264957, + "grad_norm": 0.6685494780540466, + "learning_rate": 0.00010541437282205355, + "loss": 0.8818, + "step": 10858 + }, + { + "epoch": 1.9334045584045585, + "grad_norm": 0.6594362854957581, + "learning_rate": 0.00010540039582904527, + "loss": 0.9535, + "step": 10859 + }, + { + "epoch": 1.9335826210826212, + "grad_norm": 0.8003259301185608, + "learning_rate": 0.00010538641873022744, + "loss": 0.8852, + "step": 10860 + }, + { + "epoch": 1.9337606837606838, + "grad_norm": 0.6567012071609497, + "learning_rate": 0.00010537244152587393, + "loss": 1.0832, + "step": 10861 + }, + { + "epoch": 1.9339387464387463, + "grad_norm": 0.6714941263198853, + "learning_rate": 0.00010535846421625862, + "loss": 1.1047, + "step": 10862 + }, + { + "epoch": 1.934116809116809, + "grad_norm": 0.6998924612998962, + "learning_rate": 0.00010534448680165531, + "loss": 0.8827, + "step": 10863 + }, + { + "epoch": 1.9342948717948718, + "grad_norm": 0.6065765619277954, + "learning_rate": 
0.0001053305092823379, + "loss": 0.5773, + "step": 10864 + }, + { + "epoch": 1.9344729344729346, + "grad_norm": 0.7678273916244507, + "learning_rate": 0.0001053165316585802, + "loss": 0.9199, + "step": 10865 + }, + { + "epoch": 1.9346509971509973, + "grad_norm": 0.7071540951728821, + "learning_rate": 0.00010530255393065613, + "loss": 1.0292, + "step": 10866 + }, + { + "epoch": 1.9348290598290598, + "grad_norm": 0.6329835057258606, + "learning_rate": 0.00010528857609883956, + "loss": 0.9915, + "step": 10867 + }, + { + "epoch": 1.9350071225071224, + "grad_norm": 0.6274038553237915, + "learning_rate": 0.00010527459816340427, + "loss": 0.8499, + "step": 10868 + }, + { + "epoch": 1.9351851851851851, + "grad_norm": 0.6564371585845947, + "learning_rate": 0.00010526062012462424, + "loss": 1.1707, + "step": 10869 + }, + { + "epoch": 1.9353632478632479, + "grad_norm": 0.8561269044876099, + "learning_rate": 0.00010524664198277326, + "loss": 1.148, + "step": 10870 + }, + { + "epoch": 1.9355413105413106, + "grad_norm": 0.6322671175003052, + "learning_rate": 0.00010523266373812521, + "loss": 0.9165, + "step": 10871 + }, + { + "epoch": 1.9357193732193734, + "grad_norm": 0.7602947354316711, + "learning_rate": 0.00010521868539095403, + "loss": 0.9647, + "step": 10872 + }, + { + "epoch": 1.935897435897436, + "grad_norm": 0.5962168574333191, + "learning_rate": 0.00010520470694153353, + "loss": 0.8585, + "step": 10873 + }, + { + "epoch": 1.9360754985754984, + "grad_norm": 0.7498637437820435, + "learning_rate": 0.00010519072839013757, + "loss": 0.9828, + "step": 10874 + }, + { + "epoch": 1.9362535612535612, + "grad_norm": 0.6841256022453308, + "learning_rate": 0.00010517674973704012, + "loss": 0.9991, + "step": 10875 + }, + { + "epoch": 1.936431623931624, + "grad_norm": 0.8281826972961426, + "learning_rate": 0.00010516277098251499, + "loss": 1.028, + "step": 10876 + }, + { + "epoch": 1.9366096866096867, + "grad_norm": 0.6673563718795776, + "learning_rate": 0.0001051487921268361, + 
"loss": 1.1594, + "step": 10877 + }, + { + "epoch": 1.9367877492877494, + "grad_norm": 0.7833667993545532, + "learning_rate": 0.00010513481317027733, + "loss": 0.7675, + "step": 10878 + }, + { + "epoch": 1.936965811965812, + "grad_norm": 0.6087225675582886, + "learning_rate": 0.00010512083411311253, + "loss": 0.7803, + "step": 10879 + }, + { + "epoch": 1.9371438746438745, + "grad_norm": 0.6758120656013489, + "learning_rate": 0.00010510685495561563, + "loss": 1.0621, + "step": 10880 + }, + { + "epoch": 1.9373219373219372, + "grad_norm": 0.6720096468925476, + "learning_rate": 0.00010509287569806055, + "loss": 0.8502, + "step": 10881 + }, + { + "epoch": 1.9375, + "grad_norm": 0.6233887672424316, + "learning_rate": 0.00010507889634072113, + "loss": 1.0127, + "step": 10882 + }, + { + "epoch": 1.9376780626780628, + "grad_norm": 0.667742908000946, + "learning_rate": 0.00010506491688387127, + "loss": 0.9086, + "step": 10883 + }, + { + "epoch": 1.9378561253561255, + "grad_norm": 0.6533677577972412, + "learning_rate": 0.00010505093732778492, + "loss": 0.9724, + "step": 10884 + }, + { + "epoch": 1.938034188034188, + "grad_norm": 0.7171359062194824, + "learning_rate": 0.00010503695767273591, + "loss": 0.9915, + "step": 10885 + }, + { + "epoch": 1.9382122507122506, + "grad_norm": 0.723655641078949, + "learning_rate": 0.0001050229779189982, + "loss": 0.8981, + "step": 10886 + }, + { + "epoch": 1.9383903133903133, + "grad_norm": 0.6863494515419006, + "learning_rate": 0.00010500899806684568, + "loss": 1.2577, + "step": 10887 + }, + { + "epoch": 1.938568376068376, + "grad_norm": 0.8174706697463989, + "learning_rate": 0.00010499501811655224, + "loss": 0.9848, + "step": 10888 + }, + { + "epoch": 1.9387464387464388, + "grad_norm": 0.6378024220466614, + "learning_rate": 0.00010498103806839179, + "loss": 0.8499, + "step": 10889 + }, + { + "epoch": 1.9389245014245016, + "grad_norm": 0.6734544634819031, + "learning_rate": 0.00010496705792263823, + "loss": 0.8446, + "step": 10890 + }, + { 
+ "epoch": 1.939102564102564, + "grad_norm": 0.6802361607551575, + "learning_rate": 0.00010495307767956551, + "loss": 0.9285, + "step": 10891 + }, + { + "epoch": 1.9392806267806266, + "grad_norm": 0.7821299433708191, + "learning_rate": 0.00010493909733944752, + "loss": 1.08, + "step": 10892 + }, + { + "epoch": 1.9394586894586894, + "grad_norm": 0.6204990148544312, + "learning_rate": 0.00010492511690255818, + "loss": 0.7861, + "step": 10893 + }, + { + "epoch": 1.9396367521367521, + "grad_norm": 0.6386391520500183, + "learning_rate": 0.0001049111363691714, + "loss": 0.9162, + "step": 10894 + }, + { + "epoch": 1.9398148148148149, + "grad_norm": 0.6885092854499817, + "learning_rate": 0.0001048971557395611, + "loss": 1.0026, + "step": 10895 + }, + { + "epoch": 1.9399928774928776, + "grad_norm": 0.6962558627128601, + "learning_rate": 0.00010488317501400122, + "loss": 1.146, + "step": 10896 + }, + { + "epoch": 1.9401709401709402, + "grad_norm": 0.6283716559410095, + "learning_rate": 0.00010486919419276566, + "loss": 1.0268, + "step": 10897 + }, + { + "epoch": 1.9403490028490027, + "grad_norm": 0.7183622717857361, + "learning_rate": 0.00010485521327612835, + "loss": 1.0123, + "step": 10898 + }, + { + "epoch": 1.9405270655270654, + "grad_norm": 0.6354197263717651, + "learning_rate": 0.00010484123226436321, + "loss": 0.871, + "step": 10899 + }, + { + "epoch": 1.9407051282051282, + "grad_norm": 0.804358184337616, + "learning_rate": 0.00010482725115774421, + "loss": 1.1001, + "step": 10900 + }, + { + "epoch": 1.940883190883191, + "grad_norm": 0.6896754503250122, + "learning_rate": 0.00010481326995654524, + "loss": 1.0976, + "step": 10901 + }, + { + "epoch": 1.9410612535612537, + "grad_norm": 0.9108015894889832, + "learning_rate": 0.00010479928866104023, + "loss": 0.8785, + "step": 10902 + }, + { + "epoch": 1.9412393162393162, + "grad_norm": 0.6963121294975281, + "learning_rate": 0.00010478530727150316, + "loss": 1.0458, + "step": 10903 + }, + { + "epoch": 1.9414173789173788, + 
"grad_norm": 0.6657114624977112, + "learning_rate": 0.00010477132578820792, + "loss": 0.8188, + "step": 10904 + }, + { + "epoch": 1.9415954415954415, + "grad_norm": 0.671716034412384, + "learning_rate": 0.00010475734421142847, + "loss": 1.0915, + "step": 10905 + }, + { + "epoch": 1.9417735042735043, + "grad_norm": 0.6790717244148254, + "learning_rate": 0.0001047433625414387, + "loss": 0.9688, + "step": 10906 + }, + { + "epoch": 1.941951566951567, + "grad_norm": 0.6411764621734619, + "learning_rate": 0.00010472938077851264, + "loss": 1.0387, + "step": 10907 + }, + { + "epoch": 1.9421296296296298, + "grad_norm": 0.8579615950584412, + "learning_rate": 0.00010471539892292417, + "loss": 1.1635, + "step": 10908 + }, + { + "epoch": 1.9423076923076923, + "grad_norm": 0.7031029462814331, + "learning_rate": 0.00010470141697494726, + "loss": 0.9813, + "step": 10909 + }, + { + "epoch": 1.9424857549857548, + "grad_norm": 0.6657388806343079, + "learning_rate": 0.00010468743493485584, + "loss": 0.7947, + "step": 10910 + }, + { + "epoch": 1.9426638176638176, + "grad_norm": 0.6364194750785828, + "learning_rate": 0.00010467345280292389, + "loss": 0.8554, + "step": 10911 + }, + { + "epoch": 1.9428418803418803, + "grad_norm": 0.7394127249717712, + "learning_rate": 0.00010465947057942534, + "loss": 0.822, + "step": 10912 + }, + { + "epoch": 1.943019943019943, + "grad_norm": 0.6557473540306091, + "learning_rate": 0.00010464548826463411, + "loss": 1.0025, + "step": 10913 + }, + { + "epoch": 1.9431980056980058, + "grad_norm": 0.6530601382255554, + "learning_rate": 0.00010463150585882422, + "loss": 1.0828, + "step": 10914 + }, + { + "epoch": 1.9433760683760684, + "grad_norm": 0.7376404404640198, + "learning_rate": 0.00010461752336226957, + "loss": 0.9413, + "step": 10915 + }, + { + "epoch": 1.943554131054131, + "grad_norm": 0.7110656499862671, + "learning_rate": 0.00010460354077524417, + "loss": 0.9162, + "step": 10916 + }, + { + "epoch": 1.9437321937321936, + "grad_norm": 
0.6515666246414185, + "learning_rate": 0.00010458955809802194, + "loss": 0.9211, + "step": 10917 + }, + { + "epoch": 1.9439102564102564, + "grad_norm": 0.6888720989227295, + "learning_rate": 0.00010457557533087683, + "loss": 1.0632, + "step": 10918 + }, + { + "epoch": 1.9440883190883191, + "grad_norm": 0.7246627807617188, + "learning_rate": 0.00010456159247408286, + "loss": 0.9807, + "step": 10919 + }, + { + "epoch": 1.944266381766382, + "grad_norm": 0.727834165096283, + "learning_rate": 0.00010454760952791394, + "loss": 1.0793, + "step": 10920 + }, + { + "epoch": 1.9444444444444444, + "grad_norm": 0.6365306377410889, + "learning_rate": 0.00010453362649264407, + "loss": 1.0415, + "step": 10921 + }, + { + "epoch": 1.9446225071225072, + "grad_norm": 0.7187839150428772, + "learning_rate": 0.0001045196433685472, + "loss": 1.007, + "step": 10922 + }, + { + "epoch": 1.9448005698005697, + "grad_norm": 0.5905138254165649, + "learning_rate": 0.00010450566015589732, + "loss": 0.9818, + "step": 10923 + }, + { + "epoch": 1.9449786324786325, + "grad_norm": 0.7008894085884094, + "learning_rate": 0.00010449167685496837, + "loss": 0.8444, + "step": 10924 + }, + { + "epoch": 1.9451566951566952, + "grad_norm": 0.6126312017440796, + "learning_rate": 0.00010447769346603435, + "loss": 0.7207, + "step": 10925 + }, + { + "epoch": 1.945334757834758, + "grad_norm": 0.7513176202774048, + "learning_rate": 0.00010446370998936922, + "loss": 0.8693, + "step": 10926 + }, + { + "epoch": 1.9455128205128205, + "grad_norm": 0.6382531523704529, + "learning_rate": 0.00010444972642524697, + "loss": 0.8379, + "step": 10927 + }, + { + "epoch": 1.9456908831908832, + "grad_norm": 0.7062170505523682, + "learning_rate": 0.0001044357427739416, + "loss": 1.0525, + "step": 10928 + }, + { + "epoch": 1.9458689458689458, + "grad_norm": 0.6954067349433899, + "learning_rate": 0.00010442175903572703, + "loss": 1.0238, + "step": 10929 + }, + { + "epoch": 1.9460470085470085, + "grad_norm": 0.7257117033004761, + 
"learning_rate": 0.00010440777521087731, + "loss": 1.1413, + "step": 10930 + }, + { + "epoch": 1.9462250712250713, + "grad_norm": 0.6617701053619385, + "learning_rate": 0.00010439379129966635, + "loss": 1.0089, + "step": 10931 + }, + { + "epoch": 1.946403133903134, + "grad_norm": 0.6860800385475159, + "learning_rate": 0.00010437980730236821, + "loss": 1.1778, + "step": 10932 + }, + { + "epoch": 1.9465811965811965, + "grad_norm": 0.846235454082489, + "learning_rate": 0.00010436582321925684, + "loss": 0.9851, + "step": 10933 + }, + { + "epoch": 1.9467592592592593, + "grad_norm": 0.6385617852210999, + "learning_rate": 0.00010435183905060623, + "loss": 0.9542, + "step": 10934 + }, + { + "epoch": 1.9469373219373218, + "grad_norm": 0.7137401700019836, + "learning_rate": 0.00010433785479669038, + "loss": 1.0499, + "step": 10935 + }, + { + "epoch": 1.9471153846153846, + "grad_norm": 0.6269308924674988, + "learning_rate": 0.00010432387045778324, + "loss": 0.8929, + "step": 10936 + }, + { + "epoch": 1.9472934472934473, + "grad_norm": 0.7903163433074951, + "learning_rate": 0.00010430988603415888, + "loss": 0.9812, + "step": 10937 + }, + { + "epoch": 1.94747150997151, + "grad_norm": 0.6006736159324646, + "learning_rate": 0.00010429590152609121, + "loss": 0.7959, + "step": 10938 + }, + { + "epoch": 1.9476495726495726, + "grad_norm": 0.6061521768569946, + "learning_rate": 0.00010428191693385431, + "loss": 0.8748, + "step": 10939 + }, + { + "epoch": 1.9478276353276354, + "grad_norm": 0.6637623906135559, + "learning_rate": 0.00010426793225772216, + "loss": 0.7047, + "step": 10940 + }, + { + "epoch": 1.948005698005698, + "grad_norm": 0.7650586366653442, + "learning_rate": 0.00010425394749796874, + "loss": 1.0018, + "step": 10941 + }, + { + "epoch": 1.9481837606837606, + "grad_norm": 0.6575125455856323, + "learning_rate": 0.000104239962654868, + "loss": 0.8915, + "step": 10942 + }, + { + "epoch": 1.9483618233618234, + "grad_norm": 0.6315393447875977, + "learning_rate": 
0.00010422597772869404, + "loss": 1.1884, + "step": 10943 + }, + { + "epoch": 1.9485398860398861, + "grad_norm": 0.7607148885726929, + "learning_rate": 0.00010421199271972083, + "loss": 0.9341, + "step": 10944 + }, + { + "epoch": 1.9487179487179487, + "grad_norm": 0.6491827964782715, + "learning_rate": 0.00010419800762822239, + "loss": 0.9991, + "step": 10945 + }, + { + "epoch": 1.9488960113960114, + "grad_norm": 0.6294243335723877, + "learning_rate": 0.00010418402245447265, + "loss": 0.9253, + "step": 10946 + }, + { + "epoch": 1.949074074074074, + "grad_norm": 0.6472215056419373, + "learning_rate": 0.00010417003719874571, + "loss": 1.0402, + "step": 10947 + }, + { + "epoch": 1.9492521367521367, + "grad_norm": 0.7377899885177612, + "learning_rate": 0.00010415605186131559, + "loss": 1.046, + "step": 10948 + }, + { + "epoch": 1.9494301994301995, + "grad_norm": 0.6391907334327698, + "learning_rate": 0.00010414206644245623, + "loss": 0.8529, + "step": 10949 + }, + { + "epoch": 1.9496082621082622, + "grad_norm": 0.7101355195045471, + "learning_rate": 0.0001041280809424417, + "loss": 0.925, + "step": 10950 + }, + { + "epoch": 1.9497863247863247, + "grad_norm": 0.7891978025436401, + "learning_rate": 0.00010411409536154597, + "loss": 1.0691, + "step": 10951 + }, + { + "epoch": 1.9499643874643875, + "grad_norm": 0.7225242853164673, + "learning_rate": 0.00010410010970004311, + "loss": 1.158, + "step": 10952 + }, + { + "epoch": 1.95014245014245, + "grad_norm": 0.6073256731033325, + "learning_rate": 0.00010408612395820714, + "loss": 0.9977, + "step": 10953 + }, + { + "epoch": 1.9503205128205128, + "grad_norm": 0.6373769044876099, + "learning_rate": 0.00010407213813631203, + "loss": 1.019, + "step": 10954 + }, + { + "epoch": 1.9504985754985755, + "grad_norm": 0.7451884746551514, + "learning_rate": 0.00010405815223463184, + "loss": 0.9497, + "step": 10955 + }, + { + "epoch": 1.9506766381766383, + "grad_norm": 0.7760418057441711, + "learning_rate": 0.00010404416625344058, + 
"loss": 1.0378, + "step": 10956 + }, + { + "epoch": 1.9508547008547008, + "grad_norm": 0.7057808041572571, + "learning_rate": 0.00010403018019301228, + "loss": 0.8953, + "step": 10957 + }, + { + "epoch": 1.9510327635327636, + "grad_norm": 0.6599584817886353, + "learning_rate": 0.00010401619405362095, + "loss": 0.8859, + "step": 10958 + }, + { + "epoch": 1.951210826210826, + "grad_norm": 0.6977253556251526, + "learning_rate": 0.00010400220783554069, + "loss": 0.9038, + "step": 10959 + }, + { + "epoch": 1.9513888888888888, + "grad_norm": 0.6930267810821533, + "learning_rate": 0.00010398822153904546, + "loss": 1.1547, + "step": 10960 + }, + { + "epoch": 1.9515669515669516, + "grad_norm": 0.6301694512367249, + "learning_rate": 0.00010397423516440931, + "loss": 0.8875, + "step": 10961 + }, + { + "epoch": 1.9517450142450143, + "grad_norm": 0.7447484135627747, + "learning_rate": 0.00010396024871190628, + "loss": 1.0454, + "step": 10962 + }, + { + "epoch": 1.9519230769230769, + "grad_norm": 0.8666765093803406, + "learning_rate": 0.00010394626218181041, + "loss": 1.2211, + "step": 10963 + }, + { + "epoch": 1.9521011396011396, + "grad_norm": 0.599354088306427, + "learning_rate": 0.00010393227557439573, + "loss": 1.0419, + "step": 10964 + }, + { + "epoch": 1.9522792022792022, + "grad_norm": 0.6991702914237976, + "learning_rate": 0.00010391828888993627, + "loss": 0.8217, + "step": 10965 + }, + { + "epoch": 1.952457264957265, + "grad_norm": 0.7467028498649597, + "learning_rate": 0.0001039043021287061, + "loss": 0.8708, + "step": 10966 + }, + { + "epoch": 1.9526353276353277, + "grad_norm": 0.6806215047836304, + "learning_rate": 0.0001038903152909792, + "loss": 1.218, + "step": 10967 + }, + { + "epoch": 1.9528133903133904, + "grad_norm": 0.6704212427139282, + "learning_rate": 0.00010387632837702968, + "loss": 0.8428, + "step": 10968 + }, + { + "epoch": 1.952991452991453, + "grad_norm": 0.6843154430389404, + "learning_rate": 0.00010386234138713155, + "loss": 0.9729, + "step": 
10969 + }, + { + "epoch": 1.9531695156695157, + "grad_norm": 0.6619821190834045, + "learning_rate": 0.00010384835432155888, + "loss": 1.021, + "step": 10970 + }, + { + "epoch": 1.9533475783475782, + "grad_norm": 0.6249803900718689, + "learning_rate": 0.0001038343671805857, + "loss": 0.9321, + "step": 10971 + }, + { + "epoch": 1.953525641025641, + "grad_norm": 0.7361689805984497, + "learning_rate": 0.00010382037996448604, + "loss": 0.9451, + "step": 10972 + }, + { + "epoch": 1.9537037037037037, + "grad_norm": 0.6464847922325134, + "learning_rate": 0.00010380639267353398, + "loss": 1.0188, + "step": 10973 + }, + { + "epoch": 1.9538817663817665, + "grad_norm": 0.5975635647773743, + "learning_rate": 0.00010379240530800356, + "loss": 0.9025, + "step": 10974 + }, + { + "epoch": 1.9540598290598292, + "grad_norm": 0.6734475493431091, + "learning_rate": 0.00010377841786816884, + "loss": 1.0742, + "step": 10975 + }, + { + "epoch": 1.9542378917378918, + "grad_norm": 0.7318592667579651, + "learning_rate": 0.00010376443035430386, + "loss": 1.1082, + "step": 10976 + }, + { + "epoch": 1.9544159544159543, + "grad_norm": 0.7696142792701721, + "learning_rate": 0.00010375044276668271, + "loss": 0.8421, + "step": 10977 + }, + { + "epoch": 1.954594017094017, + "grad_norm": 0.68442302942276, + "learning_rate": 0.00010373645510557939, + "loss": 1.0794, + "step": 10978 + }, + { + "epoch": 1.9547720797720798, + "grad_norm": 0.7582547068595886, + "learning_rate": 0.00010372246737126801, + "loss": 1.0332, + "step": 10979 + }, + { + "epoch": 1.9549501424501425, + "grad_norm": 0.6529998183250427, + "learning_rate": 0.00010370847956402262, + "loss": 1.1833, + "step": 10980 + }, + { + "epoch": 1.9551282051282053, + "grad_norm": 0.7565605044364929, + "learning_rate": 0.00010369449168411729, + "loss": 1.0494, + "step": 10981 + }, + { + "epoch": 1.9553062678062678, + "grad_norm": 0.6346915364265442, + "learning_rate": 0.00010368050373182605, + "loss": 1.0052, + "step": 10982 + }, + { + "epoch": 
1.9554843304843303, + "grad_norm": 0.7021830081939697, + "learning_rate": 0.00010366651570742298, + "loss": 0.9716, + "step": 10983 + }, + { + "epoch": 1.955662393162393, + "grad_norm": 0.6464530825614929, + "learning_rate": 0.00010365252761118218, + "loss": 0.9802, + "step": 10984 + }, + { + "epoch": 1.9558404558404558, + "grad_norm": 0.6845090985298157, + "learning_rate": 0.00010363853944337768, + "loss": 0.9529, + "step": 10985 + }, + { + "epoch": 1.9560185185185186, + "grad_norm": 0.7178115248680115, + "learning_rate": 0.00010362455120428356, + "loss": 0.9968, + "step": 10986 + }, + { + "epoch": 1.9561965811965814, + "grad_norm": 0.6131038069725037, + "learning_rate": 0.00010361056289417385, + "loss": 1.0559, + "step": 10987 + }, + { + "epoch": 1.9563746438746439, + "grad_norm": 0.6946909427642822, + "learning_rate": 0.0001035965745133227, + "loss": 1.0457, + "step": 10988 + }, + { + "epoch": 1.9565527065527064, + "grad_norm": 0.7376706600189209, + "learning_rate": 0.00010358258606200413, + "loss": 0.7775, + "step": 10989 + }, + { + "epoch": 1.9567307692307692, + "grad_norm": 0.6864920854568481, + "learning_rate": 0.00010356859754049225, + "loss": 0.8798, + "step": 10990 + }, + { + "epoch": 1.956908831908832, + "grad_norm": 0.6301153302192688, + "learning_rate": 0.0001035546089490611, + "loss": 0.8757, + "step": 10991 + }, + { + "epoch": 1.9570868945868947, + "grad_norm": 0.7184807062149048, + "learning_rate": 0.00010354062028798474, + "loss": 1.0783, + "step": 10992 + }, + { + "epoch": 1.9572649572649574, + "grad_norm": 0.7138563394546509, + "learning_rate": 0.00010352663155753732, + "loss": 1.0328, + "step": 10993 + }, + { + "epoch": 1.95744301994302, + "grad_norm": 0.6565547585487366, + "learning_rate": 0.00010351264275799286, + "loss": 1.1312, + "step": 10994 + }, + { + "epoch": 1.9576210826210825, + "grad_norm": 0.7055862545967102, + "learning_rate": 0.00010349865388962547, + "loss": 1.0787, + "step": 10995 + }, + { + "epoch": 1.9577991452991452, + 
"grad_norm": 0.6184022426605225, + "learning_rate": 0.00010348466495270926, + "loss": 0.9635, + "step": 10996 + }, + { + "epoch": 1.957977207977208, + "grad_norm": 0.6563652753829956, + "learning_rate": 0.0001034706759475182, + "loss": 0.772, + "step": 10997 + }, + { + "epoch": 1.9581552706552707, + "grad_norm": 0.6103591322898865, + "learning_rate": 0.00010345668687432651, + "loss": 0.8113, + "step": 10998 + }, + { + "epoch": 1.9583333333333335, + "grad_norm": 0.6715512275695801, + "learning_rate": 0.0001034426977334082, + "loss": 1.1841, + "step": 10999 + }, + { + "epoch": 1.958511396011396, + "grad_norm": 0.680092453956604, + "learning_rate": 0.00010342870852503739, + "loss": 0.9992, + "step": 11000 + }, + { + "epoch": 1.9586894586894585, + "grad_norm": 0.828472375869751, + "learning_rate": 0.00010341471924948816, + "loss": 1.0975, + "step": 11001 + }, + { + "epoch": 1.9588675213675213, + "grad_norm": 0.758441686630249, + "learning_rate": 0.00010340072990703463, + "loss": 1.0632, + "step": 11002 + }, + { + "epoch": 1.959045584045584, + "grad_norm": 0.6847560405731201, + "learning_rate": 0.00010338674049795079, + "loss": 1.0054, + "step": 11003 + }, + { + "epoch": 1.9592236467236468, + "grad_norm": 0.707626223564148, + "learning_rate": 0.00010337275102251085, + "loss": 0.9427, + "step": 11004 + }, + { + "epoch": 1.9594017094017095, + "grad_norm": 0.769036591053009, + "learning_rate": 0.00010335876148098887, + "loss": 1.0424, + "step": 11005 + }, + { + "epoch": 1.959579772079772, + "grad_norm": 0.822695791721344, + "learning_rate": 0.00010334477187365892, + "loss": 1.1573, + "step": 11006 + }, + { + "epoch": 1.9597578347578346, + "grad_norm": 0.6290286183357239, + "learning_rate": 0.00010333078220079513, + "loss": 0.936, + "step": 11007 + }, + { + "epoch": 1.9599358974358974, + "grad_norm": 0.6802252531051636, + "learning_rate": 0.00010331679246267155, + "loss": 0.8049, + "step": 11008 + }, + { + "epoch": 1.96011396011396, + "grad_norm": 0.6652607321739197, + 
"learning_rate": 0.00010330280265956232, + "loss": 0.926, + "step": 11009 + }, + { + "epoch": 1.9602920227920229, + "grad_norm": 0.7057216763496399, + "learning_rate": 0.00010328881279174154, + "loss": 0.9464, + "step": 11010 + }, + { + "epoch": 1.9604700854700856, + "grad_norm": 0.6951601505279541, + "learning_rate": 0.00010327482285948331, + "loss": 0.9882, + "step": 11011 + }, + { + "epoch": 1.9606481481481481, + "grad_norm": 0.6537632942199707, + "learning_rate": 0.00010326083286306174, + "loss": 0.8663, + "step": 11012 + }, + { + "epoch": 1.9608262108262107, + "grad_norm": 0.7252047657966614, + "learning_rate": 0.0001032468428027509, + "loss": 1.1377, + "step": 11013 + }, + { + "epoch": 1.9610042735042734, + "grad_norm": 0.6494104266166687, + "learning_rate": 0.00010323285267882492, + "loss": 0.8072, + "step": 11014 + }, + { + "epoch": 1.9611823361823362, + "grad_norm": 0.8463460206985474, + "learning_rate": 0.00010321886249155792, + "loss": 1.22, + "step": 11015 + }, + { + "epoch": 1.961360398860399, + "grad_norm": 0.6071396470069885, + "learning_rate": 0.00010320487224122401, + "loss": 0.7975, + "step": 11016 + }, + { + "epoch": 1.9615384615384617, + "grad_norm": 0.6546960473060608, + "learning_rate": 0.00010319088192809725, + "loss": 1.1729, + "step": 11017 + }, + { + "epoch": 1.9617165242165242, + "grad_norm": 0.7399442791938782, + "learning_rate": 0.00010317689155245178, + "loss": 1.092, + "step": 11018 + }, + { + "epoch": 1.9618945868945867, + "grad_norm": 0.7103837728500366, + "learning_rate": 0.00010316290111456175, + "loss": 0.8436, + "step": 11019 + }, + { + "epoch": 1.9620726495726495, + "grad_norm": 0.6990065574645996, + "learning_rate": 0.00010314891061470125, + "loss": 0.9003, + "step": 11020 + }, + { + "epoch": 1.9622507122507122, + "grad_norm": 0.7945666313171387, + "learning_rate": 0.00010313492005314438, + "loss": 0.8812, + "step": 11021 + }, + { + "epoch": 1.962428774928775, + "grad_norm": 0.6177538633346558, + "learning_rate": 
0.00010312092943016527, + "loss": 1.0091, + "step": 11022 + }, + { + "epoch": 1.9626068376068377, + "grad_norm": 0.7260771989822388, + "learning_rate": 0.000103106938746038, + "loss": 0.9376, + "step": 11023 + }, + { + "epoch": 1.9627849002849003, + "grad_norm": 0.6726518273353577, + "learning_rate": 0.00010309294800103674, + "loss": 0.8048, + "step": 11024 + }, + { + "epoch": 1.9629629629629628, + "grad_norm": 0.8759992122650146, + "learning_rate": 0.00010307895719543562, + "loss": 1.0248, + "step": 11025 + }, + { + "epoch": 1.9631410256410255, + "grad_norm": 0.683437168598175, + "learning_rate": 0.00010306496632950868, + "loss": 1.0314, + "step": 11026 + }, + { + "epoch": 1.9633190883190883, + "grad_norm": 0.7255756258964539, + "learning_rate": 0.00010305097540353012, + "loss": 0.9828, + "step": 11027 + }, + { + "epoch": 1.963497150997151, + "grad_norm": 0.6904804706573486, + "learning_rate": 0.000103036984417774, + "loss": 0.9054, + "step": 11028 + }, + { + "epoch": 1.9636752136752138, + "grad_norm": 0.6906846761703491, + "learning_rate": 0.00010302299337251451, + "loss": 1.0287, + "step": 11029 + }, + { + "epoch": 1.9638532763532763, + "grad_norm": 0.6677078008651733, + "learning_rate": 0.00010300900226802575, + "loss": 0.8742, + "step": 11030 + }, + { + "epoch": 1.964031339031339, + "grad_norm": 0.6144888997077942, + "learning_rate": 0.00010299501110458183, + "loss": 0.6942, + "step": 11031 + }, + { + "epoch": 1.9642094017094016, + "grad_norm": 0.753010094165802, + "learning_rate": 0.0001029810198824569, + "loss": 0.9018, + "step": 11032 + }, + { + "epoch": 1.9643874643874644, + "grad_norm": 0.6872276663780212, + "learning_rate": 0.00010296702860192505, + "loss": 1.1647, + "step": 11033 + }, + { + "epoch": 1.9645655270655271, + "grad_norm": 0.709000289440155, + "learning_rate": 0.00010295303726326047, + "loss": 0.9143, + "step": 11034 + }, + { + "epoch": 1.9647435897435899, + "grad_norm": 0.6507021188735962, + "learning_rate": 0.00010293904586673723, + "loss": 
1.006, + "step": 11035 + }, + { + "epoch": 1.9649216524216524, + "grad_norm": 0.6789946556091309, + "learning_rate": 0.00010292505441262952, + "loss": 0.9049, + "step": 11036 + }, + { + "epoch": 1.9650997150997151, + "grad_norm": 0.7156081795692444, + "learning_rate": 0.00010291106290121143, + "loss": 0.9195, + "step": 11037 + }, + { + "epoch": 1.9652777777777777, + "grad_norm": 0.6770932078361511, + "learning_rate": 0.0001028970713327571, + "loss": 0.9524, + "step": 11038 + }, + { + "epoch": 1.9654558404558404, + "grad_norm": 0.7304288148880005, + "learning_rate": 0.00010288307970754067, + "loss": 0.9276, + "step": 11039 + }, + { + "epoch": 1.9656339031339032, + "grad_norm": 0.7603645324707031, + "learning_rate": 0.0001028690880258363, + "loss": 1.2157, + "step": 11040 + }, + { + "epoch": 1.965811965811966, + "grad_norm": 0.6875246167182922, + "learning_rate": 0.00010285509628791811, + "loss": 1.0269, + "step": 11041 + }, + { + "epoch": 1.9659900284900285, + "grad_norm": 0.7234818935394287, + "learning_rate": 0.00010284110449406026, + "loss": 0.9695, + "step": 11042 + }, + { + "epoch": 1.9661680911680912, + "grad_norm": 0.7322804927825928, + "learning_rate": 0.00010282711264453684, + "loss": 0.9752, + "step": 11043 + }, + { + "epoch": 1.9663461538461537, + "grad_norm": 0.7524822950363159, + "learning_rate": 0.00010281312073962202, + "loss": 1.2144, + "step": 11044 + }, + { + "epoch": 1.9665242165242165, + "grad_norm": 0.6623101234436035, + "learning_rate": 0.00010279912877958995, + "loss": 1.1334, + "step": 11045 + }, + { + "epoch": 1.9667022792022792, + "grad_norm": 0.7814893126487732, + "learning_rate": 0.00010278513676471477, + "loss": 1.266, + "step": 11046 + }, + { + "epoch": 1.966880341880342, + "grad_norm": 0.7129884362220764, + "learning_rate": 0.00010277114469527063, + "loss": 1.0918, + "step": 11047 + }, + { + "epoch": 1.9670584045584045, + "grad_norm": 0.6996828317642212, + "learning_rate": 0.00010275715257153164, + "loss": 0.9269, + "step": 11048 + }, 
+ { + "epoch": 1.9672364672364673, + "grad_norm": 0.6439059972763062, + "learning_rate": 0.00010274316039377198, + "loss": 1.1998, + "step": 11049 + }, + { + "epoch": 1.9674145299145298, + "grad_norm": 0.6837672591209412, + "learning_rate": 0.00010272916816226581, + "loss": 0.8899, + "step": 11050 + }, + { + "epoch": 1.9675925925925926, + "grad_norm": 0.702583909034729, + "learning_rate": 0.00010271517587728726, + "loss": 1.1862, + "step": 11051 + }, + { + "epoch": 1.9677706552706553, + "grad_norm": 0.6627798676490784, + "learning_rate": 0.00010270118353911047, + "loss": 0.898, + "step": 11052 + }, + { + "epoch": 1.967948717948718, + "grad_norm": 0.7628579139709473, + "learning_rate": 0.00010268719114800957, + "loss": 1.006, + "step": 11053 + }, + { + "epoch": 1.9681267806267806, + "grad_norm": 0.6425395607948303, + "learning_rate": 0.00010267319870425877, + "loss": 0.962, + "step": 11054 + }, + { + "epoch": 1.9683048433048433, + "grad_norm": 0.7462666630744934, + "learning_rate": 0.00010265920620813219, + "loss": 1.0703, + "step": 11055 + }, + { + "epoch": 1.9684829059829059, + "grad_norm": 0.67641681432724, + "learning_rate": 0.00010264521365990401, + "loss": 1.1077, + "step": 11056 + }, + { + "epoch": 1.9686609686609686, + "grad_norm": 0.6716381311416626, + "learning_rate": 0.0001026312210598483, + "loss": 1.1048, + "step": 11057 + }, + { + "epoch": 1.9688390313390314, + "grad_norm": 0.7207448482513428, + "learning_rate": 0.00010261722840823935, + "loss": 0.9236, + "step": 11058 + }, + { + "epoch": 1.9690170940170941, + "grad_norm": 0.7208544015884399, + "learning_rate": 0.0001026032357053512, + "loss": 1.0814, + "step": 11059 + }, + { + "epoch": 1.9691951566951567, + "grad_norm": 0.6076363325119019, + "learning_rate": 0.00010258924295145807, + "loss": 0.9388, + "step": 11060 + }, + { + "epoch": 1.9693732193732194, + "grad_norm": 0.6460439562797546, + "learning_rate": 0.00010257525014683411, + "loss": 0.9506, + "step": 11061 + }, + { + "epoch": 
1.969551282051282, + "grad_norm": 0.7449939250946045, + "learning_rate": 0.00010256125729175348, + "loss": 1.0209, + "step": 11062 + }, + { + "epoch": 1.9697293447293447, + "grad_norm": 0.640885055065155, + "learning_rate": 0.00010254726438649031, + "loss": 1.0235, + "step": 11063 + }, + { + "epoch": 1.9699074074074074, + "grad_norm": 0.6872261166572571, + "learning_rate": 0.00010253327143131879, + "loss": 0.9217, + "step": 11064 + }, + { + "epoch": 1.9700854700854702, + "grad_norm": 0.6213285326957703, + "learning_rate": 0.0001025192784265131, + "loss": 0.8204, + "step": 11065 + }, + { + "epoch": 1.9702635327635327, + "grad_norm": 0.6594449281692505, + "learning_rate": 0.00010250528537234736, + "loss": 0.9789, + "step": 11066 + }, + { + "epoch": 1.9704415954415955, + "grad_norm": 0.7098729610443115, + "learning_rate": 0.00010249129226909577, + "loss": 1.2551, + "step": 11067 + }, + { + "epoch": 1.970619658119658, + "grad_norm": 0.7455953359603882, + "learning_rate": 0.0001024772991170325, + "loss": 1.0281, + "step": 11068 + }, + { + "epoch": 1.9707977207977208, + "grad_norm": 0.6657416224479675, + "learning_rate": 0.00010246330591643166, + "loss": 0.9421, + "step": 11069 + }, + { + "epoch": 1.9709757834757835, + "grad_norm": 0.6480659246444702, + "learning_rate": 0.00010244931266756748, + "loss": 0.9424, + "step": 11070 + }, + { + "epoch": 1.9711538461538463, + "grad_norm": 0.6440510749816895, + "learning_rate": 0.00010243531937071411, + "loss": 0.9651, + "step": 11071 + }, + { + "epoch": 1.9713319088319088, + "grad_norm": 0.6329794526100159, + "learning_rate": 0.00010242132602614571, + "loss": 0.9233, + "step": 11072 + }, + { + "epoch": 1.9715099715099715, + "grad_norm": 0.6694819927215576, + "learning_rate": 0.00010240733263413646, + "loss": 0.884, + "step": 11073 + }, + { + "epoch": 1.971688034188034, + "grad_norm": 0.7702556848526001, + "learning_rate": 0.0001023933391949605, + "loss": 1.216, + "step": 11074 + }, + { + "epoch": 1.9718660968660968, + 
"grad_norm": 0.6587536931037903, + "learning_rate": 0.00010237934570889207, + "loss": 0.9324, + "step": 11075 + }, + { + "epoch": 1.9720441595441596, + "grad_norm": 0.7919837832450867, + "learning_rate": 0.00010236535217620529, + "loss": 1.0011, + "step": 11076 + }, + { + "epoch": 1.9722222222222223, + "grad_norm": 0.6604606509208679, + "learning_rate": 0.00010235135859717433, + "loss": 0.929, + "step": 11077 + }, + { + "epoch": 1.9724002849002849, + "grad_norm": 0.7158446907997131, + "learning_rate": 0.0001023373649720734, + "loss": 0.8912, + "step": 11078 + }, + { + "epoch": 1.9725783475783476, + "grad_norm": 0.7450904846191406, + "learning_rate": 0.00010232337130117666, + "loss": 1.0782, + "step": 11079 + }, + { + "epoch": 1.9727564102564101, + "grad_norm": 0.6687077283859253, + "learning_rate": 0.00010230937758475827, + "loss": 1.0662, + "step": 11080 + }, + { + "epoch": 1.9729344729344729, + "grad_norm": 0.7188364267349243, + "learning_rate": 0.00010229538382309245, + "loss": 1.024, + "step": 11081 + }, + { + "epoch": 1.9731125356125356, + "grad_norm": 0.6787814497947693, + "learning_rate": 0.00010228139001645334, + "loss": 0.9559, + "step": 11082 + }, + { + "epoch": 1.9732905982905984, + "grad_norm": 0.6834072470664978, + "learning_rate": 0.00010226739616511513, + "loss": 0.8143, + "step": 11083 + }, + { + "epoch": 1.973468660968661, + "grad_norm": 0.6651090979576111, + "learning_rate": 0.00010225340226935201, + "loss": 1.05, + "step": 11084 + }, + { + "epoch": 1.9736467236467237, + "grad_norm": 0.7125018835067749, + "learning_rate": 0.00010223940832943813, + "loss": 1.0275, + "step": 11085 + }, + { + "epoch": 1.9738247863247862, + "grad_norm": 0.6886870861053467, + "learning_rate": 0.00010222541434564772, + "loss": 1.0972, + "step": 11086 + }, + { + "epoch": 1.974002849002849, + "grad_norm": 0.7068913578987122, + "learning_rate": 0.00010221142031825492, + "loss": 0.9248, + "step": 11087 + }, + { + "epoch": 1.9741809116809117, + "grad_norm": 
0.7752319574356079, + "learning_rate": 0.00010219742624753397, + "loss": 0.9754, + "step": 11088 + }, + { + "epoch": 1.9743589743589745, + "grad_norm": 0.7915459871292114, + "learning_rate": 0.00010218343213375896, + "loss": 1.2589, + "step": 11089 + }, + { + "epoch": 1.9745370370370372, + "grad_norm": 0.6597068309783936, + "learning_rate": 0.00010216943797720418, + "loss": 1.0004, + "step": 11090 + }, + { + "epoch": 1.9747150997150997, + "grad_norm": 0.7060620188713074, + "learning_rate": 0.00010215544377814375, + "loss": 0.9968, + "step": 11091 + }, + { + "epoch": 1.9748931623931623, + "grad_norm": 0.6815677881240845, + "learning_rate": 0.0001021414495368519, + "loss": 0.8889, + "step": 11092 + }, + { + "epoch": 1.975071225071225, + "grad_norm": 0.6872935891151428, + "learning_rate": 0.00010212745525360277, + "loss": 1.1582, + "step": 11093 + }, + { + "epoch": 1.9752492877492878, + "grad_norm": 0.6781140565872192, + "learning_rate": 0.00010211346092867056, + "loss": 0.9988, + "step": 11094 + }, + { + "epoch": 1.9754273504273505, + "grad_norm": 0.6959224343299866, + "learning_rate": 0.00010209946656232949, + "loss": 1.1097, + "step": 11095 + }, + { + "epoch": 1.9756054131054133, + "grad_norm": 0.7205058336257935, + "learning_rate": 0.00010208547215485376, + "loss": 0.9951, + "step": 11096 + }, + { + "epoch": 1.9757834757834758, + "grad_norm": 0.6968751549720764, + "learning_rate": 0.00010207147770651748, + "loss": 0.9313, + "step": 11097 + }, + { + "epoch": 1.9759615384615383, + "grad_norm": 0.6688823103904724, + "learning_rate": 0.00010205748321759494, + "loss": 0.9439, + "step": 11098 + }, + { + "epoch": 1.976139601139601, + "grad_norm": 0.6169568300247192, + "learning_rate": 0.00010204348868836028, + "loss": 1.123, + "step": 11099 + }, + { + "epoch": 1.9763176638176638, + "grad_norm": 0.6995537281036377, + "learning_rate": 0.00010202949411908768, + "loss": 1.1928, + "step": 11100 + }, + { + "epoch": 1.9764957264957266, + "grad_norm": 0.7102637887001038, + 
"learning_rate": 0.00010201549951005138, + "loss": 1.0265, + "step": 11101 + }, + { + "epoch": 1.9766737891737893, + "grad_norm": 0.6820045113563538, + "learning_rate": 0.00010200150486152558, + "loss": 0.9309, + "step": 11102 + }, + { + "epoch": 1.9768518518518519, + "grad_norm": 0.7050938010215759, + "learning_rate": 0.00010198751017378443, + "loss": 1.0047, + "step": 11103 + }, + { + "epoch": 1.9770299145299144, + "grad_norm": 0.6418201923370361, + "learning_rate": 0.00010197351544710214, + "loss": 1.1172, + "step": 11104 + }, + { + "epoch": 1.9772079772079771, + "grad_norm": 0.6681215763092041, + "learning_rate": 0.0001019595206817529, + "loss": 1.0621, + "step": 11105 + }, + { + "epoch": 1.97738603988604, + "grad_norm": 0.7725709676742554, + "learning_rate": 0.00010194552587801094, + "loss": 1.0044, + "step": 11106 + }, + { + "epoch": 1.9775641025641026, + "grad_norm": 0.6870455741882324, + "learning_rate": 0.00010193153103615045, + "loss": 1.2652, + "step": 11107 + }, + { + "epoch": 1.9777421652421654, + "grad_norm": 0.6352108120918274, + "learning_rate": 0.00010191753615644561, + "loss": 1.1081, + "step": 11108 + }, + { + "epoch": 1.977920227920228, + "grad_norm": 0.7322626113891602, + "learning_rate": 0.00010190354123917066, + "loss": 1.0003, + "step": 11109 + }, + { + "epoch": 1.9780982905982905, + "grad_norm": 0.6240935921669006, + "learning_rate": 0.00010188954628459972, + "loss": 0.8925, + "step": 11110 + }, + { + "epoch": 1.9782763532763532, + "grad_norm": 0.6648945212364197, + "learning_rate": 0.00010187555129300708, + "loss": 1.0882, + "step": 11111 + }, + { + "epoch": 1.978454415954416, + "grad_norm": 0.6704208850860596, + "learning_rate": 0.00010186155626466692, + "loss": 0.8873, + "step": 11112 + }, + { + "epoch": 1.9786324786324787, + "grad_norm": 0.6716459393501282, + "learning_rate": 0.00010184756119985341, + "loss": 1.0045, + "step": 11113 + }, + { + "epoch": 1.9788105413105415, + "grad_norm": 0.81277996301651, + "learning_rate": 
0.0001018335660988408, + "loss": 0.8867, + "step": 11114 + }, + { + "epoch": 1.978988603988604, + "grad_norm": 0.7008311748504639, + "learning_rate": 0.00010181957096190323, + "loss": 0.9391, + "step": 11115 + }, + { + "epoch": 1.9791666666666665, + "grad_norm": 0.727676272392273, + "learning_rate": 0.00010180557578931498, + "loss": 1.0157, + "step": 11116 + }, + { + "epoch": 1.9793447293447293, + "grad_norm": 0.7058015465736389, + "learning_rate": 0.00010179158058135018, + "loss": 1.0, + "step": 11117 + }, + { + "epoch": 1.979522792022792, + "grad_norm": 0.7770412564277649, + "learning_rate": 0.00010177758533828312, + "loss": 1.0428, + "step": 11118 + }, + { + "epoch": 1.9797008547008548, + "grad_norm": 0.6557414531707764, + "learning_rate": 0.00010176359006038798, + "loss": 0.8557, + "step": 11119 + }, + { + "epoch": 1.9798789173789175, + "grad_norm": 0.7681090235710144, + "learning_rate": 0.00010174959474793894, + "loss": 0.867, + "step": 11120 + }, + { + "epoch": 1.98005698005698, + "grad_norm": 0.7915860414505005, + "learning_rate": 0.0001017355994012102, + "loss": 0.9961, + "step": 11121 + }, + { + "epoch": 1.9802350427350426, + "grad_norm": 0.8039166927337646, + "learning_rate": 0.00010172160402047604, + "loss": 1.1378, + "step": 11122 + }, + { + "epoch": 1.9804131054131053, + "grad_norm": 0.6641189455986023, + "learning_rate": 0.0001017076086060106, + "loss": 0.8914, + "step": 11123 + }, + { + "epoch": 1.980591168091168, + "grad_norm": 0.7673811316490173, + "learning_rate": 0.00010169361315808812, + "loss": 1.018, + "step": 11124 + }, + { + "epoch": 1.9807692307692308, + "grad_norm": 0.7320558428764343, + "learning_rate": 0.00010167961767698279, + "loss": 1.0515, + "step": 11125 + }, + { + "epoch": 1.9809472934472936, + "grad_norm": 0.5717357993125916, + "learning_rate": 0.00010166562216296886, + "loss": 0.7619, + "step": 11126 + }, + { + "epoch": 1.9811253561253561, + "grad_norm": 0.6638465523719788, + "learning_rate": 0.00010165162661632052, + "loss": 
1.0161, + "step": 11127 + }, + { + "epoch": 1.9813034188034186, + "grad_norm": 0.7293243408203125, + "learning_rate": 0.00010163763103731201, + "loss": 1.063, + "step": 11128 + }, + { + "epoch": 1.9814814814814814, + "grad_norm": 0.634694516658783, + "learning_rate": 0.00010162363542621752, + "loss": 0.8945, + "step": 11129 + }, + { + "epoch": 1.9816595441595442, + "grad_norm": 0.7086902856826782, + "learning_rate": 0.00010160963978331122, + "loss": 1.0542, + "step": 11130 + }, + { + "epoch": 1.981837606837607, + "grad_norm": 0.5939825773239136, + "learning_rate": 0.00010159564410886742, + "loss": 0.7822, + "step": 11131 + }, + { + "epoch": 1.9820156695156697, + "grad_norm": 0.722183346748352, + "learning_rate": 0.00010158164840316027, + "loss": 1.0252, + "step": 11132 + }, + { + "epoch": 1.9821937321937322, + "grad_norm": 0.7300103306770325, + "learning_rate": 0.000101567652666464, + "loss": 0.9099, + "step": 11133 + }, + { + "epoch": 1.9823717948717947, + "grad_norm": 0.7148736119270325, + "learning_rate": 0.00010155365689905285, + "loss": 1.0149, + "step": 11134 + }, + { + "epoch": 1.9825498575498575, + "grad_norm": 0.8214462995529175, + "learning_rate": 0.000101539661101201, + "loss": 1.0127, + "step": 11135 + }, + { + "epoch": 1.9827279202279202, + "grad_norm": 0.7111126780509949, + "learning_rate": 0.00010152566527318265, + "loss": 1.045, + "step": 11136 + }, + { + "epoch": 1.982905982905983, + "grad_norm": 0.6640021800994873, + "learning_rate": 0.00010151166941527213, + "loss": 0.9618, + "step": 11137 + }, + { + "epoch": 1.9830840455840457, + "grad_norm": 0.7177722454071045, + "learning_rate": 0.00010149767352774358, + "loss": 1.0373, + "step": 11138 + }, + { + "epoch": 1.9832621082621082, + "grad_norm": 0.6728883981704712, + "learning_rate": 0.00010148367761087121, + "loss": 0.9886, + "step": 11139 + }, + { + "epoch": 1.9834401709401708, + "grad_norm": 0.7060428857803345, + "learning_rate": 0.00010146968166492926, + "loss": 1.042, + "step": 11140 + }, + { + 
"epoch": 1.9836182336182335, + "grad_norm": 0.706253707408905, + "learning_rate": 0.00010145568569019192, + "loss": 1.2249, + "step": 11141 + }, + { + "epoch": 1.9837962962962963, + "grad_norm": 0.618221640586853, + "learning_rate": 0.00010144168968693348, + "loss": 0.9223, + "step": 11142 + }, + { + "epoch": 1.983974358974359, + "grad_norm": 0.7005748748779297, + "learning_rate": 0.00010142769365542814, + "loss": 1.2735, + "step": 11143 + }, + { + "epoch": 1.9841524216524218, + "grad_norm": 0.6059799194335938, + "learning_rate": 0.0001014136975959501, + "loss": 0.7216, + "step": 11144 + }, + { + "epoch": 1.9843304843304843, + "grad_norm": 0.7169116735458374, + "learning_rate": 0.00010139970150877358, + "loss": 0.9541, + "step": 11145 + }, + { + "epoch": 1.984508547008547, + "grad_norm": 0.7402058839797974, + "learning_rate": 0.00010138570539417281, + "loss": 1.1268, + "step": 11146 + }, + { + "epoch": 1.9846866096866096, + "grad_norm": 0.7204117178916931, + "learning_rate": 0.00010137170925242201, + "loss": 1.1557, + "step": 11147 + }, + { + "epoch": 1.9848646723646723, + "grad_norm": 0.589163064956665, + "learning_rate": 0.00010135771308379545, + "loss": 0.9863, + "step": 11148 + }, + { + "epoch": 1.985042735042735, + "grad_norm": 0.6342785358428955, + "learning_rate": 0.00010134371688856732, + "loss": 0.9294, + "step": 11149 + }, + { + "epoch": 1.9852207977207978, + "grad_norm": 0.7144256234169006, + "learning_rate": 0.00010132972066701183, + "loss": 0.9428, + "step": 11150 + }, + { + "epoch": 1.9853988603988604, + "grad_norm": 0.658032238483429, + "learning_rate": 0.00010131572441940322, + "loss": 0.9749, + "step": 11151 + }, + { + "epoch": 1.9855769230769231, + "grad_norm": 0.7609163522720337, + "learning_rate": 0.00010130172814601576, + "loss": 1.1771, + "step": 11152 + }, + { + "epoch": 1.9857549857549857, + "grad_norm": 0.6531760692596436, + "learning_rate": 0.00010128773184712361, + "loss": 0.8529, + "step": 11153 + }, + { + "epoch": 1.9859330484330484, + 
"grad_norm": 0.6983599066734314, + "learning_rate": 0.00010127373552300103, + "loss": 1.0307, + "step": 11154 + }, + { + "epoch": 1.9861111111111112, + "grad_norm": 0.7121559381484985, + "learning_rate": 0.00010125973917392224, + "loss": 0.9426, + "step": 11155 + }, + { + "epoch": 1.986289173789174, + "grad_norm": 0.6282170414924622, + "learning_rate": 0.0001012457428001615, + "loss": 0.8983, + "step": 11156 + }, + { + "epoch": 1.9864672364672364, + "grad_norm": 0.6960387825965881, + "learning_rate": 0.000101231746401993, + "loss": 0.9001, + "step": 11157 + }, + { + "epoch": 1.9866452991452992, + "grad_norm": 0.7523152232170105, + "learning_rate": 0.000101217749979691, + "loss": 1.3462, + "step": 11158 + }, + { + "epoch": 1.9868233618233617, + "grad_norm": 0.71713787317276, + "learning_rate": 0.00010120375353352971, + "loss": 1.0147, + "step": 11159 + }, + { + "epoch": 1.9870014245014245, + "grad_norm": 0.7304390072822571, + "learning_rate": 0.00010118975706378339, + "loss": 0.8436, + "step": 11160 + }, + { + "epoch": 1.9871794871794872, + "grad_norm": 0.789968729019165, + "learning_rate": 0.00010117576057072622, + "loss": 1.1162, + "step": 11161 + }, + { + "epoch": 1.98735754985755, + "grad_norm": 0.6752170920372009, + "learning_rate": 0.00010116176405463249, + "loss": 1.0619, + "step": 11162 + }, + { + "epoch": 1.9875356125356125, + "grad_norm": 0.681398868560791, + "learning_rate": 0.0001011477675157764, + "loss": 0.8981, + "step": 11163 + }, + { + "epoch": 1.9877136752136753, + "grad_norm": 0.61469566822052, + "learning_rate": 0.0001011337709544322, + "loss": 1.0139, + "step": 11164 + }, + { + "epoch": 1.9878917378917378, + "grad_norm": 0.7524265050888062, + "learning_rate": 0.0001011197743708741, + "loss": 1.1571, + "step": 11165 + }, + { + "epoch": 1.9880698005698005, + "grad_norm": 0.6289594173431396, + "learning_rate": 0.00010110577776537633, + "loss": 0.93, + "step": 11166 + }, + { + "epoch": 1.9882478632478633, + "grad_norm": 0.6991903781890869, + 
"learning_rate": 0.00010109178113821318, + "loss": 1.1176, + "step": 11167 + }, + { + "epoch": 1.988425925925926, + "grad_norm": 0.7604053020477295, + "learning_rate": 0.00010107778448965883, + "loss": 1.0497, + "step": 11168 + }, + { + "epoch": 1.9886039886039886, + "grad_norm": 0.7166453003883362, + "learning_rate": 0.00010106378781998753, + "loss": 1.1237, + "step": 11169 + }, + { + "epoch": 1.9887820512820513, + "grad_norm": 0.6071686744689941, + "learning_rate": 0.00010104979112947352, + "loss": 0.8934, + "step": 11170 + }, + { + "epoch": 1.9889601139601139, + "grad_norm": 0.6618169546127319, + "learning_rate": 0.00010103579441839101, + "loss": 1.0596, + "step": 11171 + }, + { + "epoch": 1.9891381766381766, + "grad_norm": 0.6838458776473999, + "learning_rate": 0.0001010217976870143, + "loss": 1.0167, + "step": 11172 + }, + { + "epoch": 1.9893162393162394, + "grad_norm": 0.6369979381561279, + "learning_rate": 0.00010100780093561757, + "loss": 0.9001, + "step": 11173 + }, + { + "epoch": 1.989494301994302, + "grad_norm": 0.661313533782959, + "learning_rate": 0.00010099380416447508, + "loss": 0.8952, + "step": 11174 + }, + { + "epoch": 1.9896723646723646, + "grad_norm": 0.6991600394248962, + "learning_rate": 0.00010097980737386106, + "loss": 1.0083, + "step": 11175 + }, + { + "epoch": 1.9898504273504274, + "grad_norm": 0.618748664855957, + "learning_rate": 0.00010096581056404972, + "loss": 0.8797, + "step": 11176 + }, + { + "epoch": 1.99002849002849, + "grad_norm": 0.7039223909378052, + "learning_rate": 0.00010095181373531535, + "loss": 1.0385, + "step": 11177 + }, + { + "epoch": 1.9902065527065527, + "grad_norm": 0.7598999738693237, + "learning_rate": 0.00010093781688793216, + "loss": 0.9205, + "step": 11178 + }, + { + "epoch": 1.9903846153846154, + "grad_norm": 0.6355955600738525, + "learning_rate": 0.00010092382002217441, + "loss": 0.8646, + "step": 11179 + }, + { + "epoch": 1.9905626780626782, + "grad_norm": 0.8024569153785706, + "learning_rate": 
0.00010090982313831634, + "loss": 1.1678, + "step": 11180 + }, + { + "epoch": 1.9907407407407407, + "grad_norm": 0.5960529446601868, + "learning_rate": 0.00010089582623663216, + "loss": 0.8277, + "step": 11181 + }, + { + "epoch": 1.9909188034188035, + "grad_norm": 0.6323728561401367, + "learning_rate": 0.00010088182931739609, + "loss": 0.948, + "step": 11182 + }, + { + "epoch": 1.991096866096866, + "grad_norm": 0.7532381415367126, + "learning_rate": 0.00010086783238088244, + "loss": 1.2948, + "step": 11183 + }, + { + "epoch": 1.9912749287749287, + "grad_norm": 0.5740166306495667, + "learning_rate": 0.00010085383542736543, + "loss": 0.7019, + "step": 11184 + }, + { + "epoch": 1.9914529914529915, + "grad_norm": 0.616985559463501, + "learning_rate": 0.00010083983845711929, + "loss": 1.0802, + "step": 11185 + }, + { + "epoch": 1.9916310541310542, + "grad_norm": 0.7505929470062256, + "learning_rate": 0.00010082584147041824, + "loss": 1.0523, + "step": 11186 + }, + { + "epoch": 1.9918091168091168, + "grad_norm": 0.7147656679153442, + "learning_rate": 0.00010081184446753653, + "loss": 1.0019, + "step": 11187 + }, + { + "epoch": 1.9919871794871795, + "grad_norm": 0.7301992774009705, + "learning_rate": 0.00010079784744874845, + "loss": 1.0329, + "step": 11188 + }, + { + "epoch": 1.992165242165242, + "grad_norm": 0.6847206354141235, + "learning_rate": 0.00010078385041432819, + "loss": 1.0367, + "step": 11189 + }, + { + "epoch": 1.9923433048433048, + "grad_norm": 0.7310990691184998, + "learning_rate": 0.00010076985336455, + "loss": 1.1675, + "step": 11190 + }, + { + "epoch": 1.9925213675213675, + "grad_norm": 0.6916858553886414, + "learning_rate": 0.00010075585629968813, + "loss": 0.8615, + "step": 11191 + }, + { + "epoch": 1.9926994301994303, + "grad_norm": 0.6519390344619751, + "learning_rate": 0.00010074185922001685, + "loss": 0.8105, + "step": 11192 + }, + { + "epoch": 1.9928774928774928, + "grad_norm": 0.7437400817871094, + "learning_rate": 0.00010072786212581036, + 
"loss": 0.9993, + "step": 11193 + }, + { + "epoch": 1.9930555555555556, + "grad_norm": 0.5048928260803223, + "learning_rate": 0.00010071386501734292, + "loss": 0.7912, + "step": 11194 + }, + { + "epoch": 1.993233618233618, + "grad_norm": 0.8042343258857727, + "learning_rate": 0.00010069986789488882, + "loss": 0.9156, + "step": 11195 + }, + { + "epoch": 1.9934116809116809, + "grad_norm": 0.7188669443130493, + "learning_rate": 0.0001006858707587222, + "loss": 1.0474, + "step": 11196 + }, + { + "epoch": 1.9935897435897436, + "grad_norm": 0.7377660870552063, + "learning_rate": 0.00010067187360911738, + "loss": 0.7013, + "step": 11197 + }, + { + "epoch": 1.9937678062678064, + "grad_norm": 0.6684696078300476, + "learning_rate": 0.00010065787644634861, + "loss": 0.9199, + "step": 11198 + }, + { + "epoch": 1.993945868945869, + "grad_norm": 0.7341524958610535, + "learning_rate": 0.00010064387927069012, + "loss": 1.0925, + "step": 11199 + }, + { + "epoch": 1.9941239316239316, + "grad_norm": 0.685745120048523, + "learning_rate": 0.00010062988208241614, + "loss": 1.083, + "step": 11200 + }, + { + "epoch": 1.9943019943019942, + "grad_norm": 0.6923556327819824, + "learning_rate": 0.00010061588488180096, + "loss": 1.2728, + "step": 11201 + }, + { + "epoch": 1.994480056980057, + "grad_norm": 0.6663293242454529, + "learning_rate": 0.00010060188766911876, + "loss": 1.0937, + "step": 11202 + }, + { + "epoch": 1.9946581196581197, + "grad_norm": 0.7963639497756958, + "learning_rate": 0.00010058789044464383, + "loss": 1.0592, + "step": 11203 + }, + { + "epoch": 1.9948361823361824, + "grad_norm": 0.6362990140914917, + "learning_rate": 0.00010057389320865042, + "loss": 0.8872, + "step": 11204 + }, + { + "epoch": 1.9950142450142452, + "grad_norm": 0.7752974033355713, + "learning_rate": 0.00010055989596141278, + "loss": 1.043, + "step": 11205 + }, + { + "epoch": 1.9951923076923077, + "grad_norm": 0.7125133275985718, + "learning_rate": 0.00010054589870320512, + "loss": 1.0015, + "step": 
11206 + }, + { + "epoch": 1.9953703703703702, + "grad_norm": 0.7102736830711365, + "learning_rate": 0.00010053190143430169, + "loss": 1.0052, + "step": 11207 + }, + { + "epoch": 1.995548433048433, + "grad_norm": 0.8628628849983215, + "learning_rate": 0.00010051790415497677, + "loss": 1.2351, + "step": 11208 + }, + { + "epoch": 1.9957264957264957, + "grad_norm": 0.7233129739761353, + "learning_rate": 0.00010050390686550462, + "loss": 1.0848, + "step": 11209 + }, + { + "epoch": 1.9959045584045585, + "grad_norm": 0.5936228036880493, + "learning_rate": 0.00010048990956615944, + "loss": 0.7998, + "step": 11210 + }, + { + "epoch": 1.9960826210826212, + "grad_norm": 0.7345388531684875, + "learning_rate": 0.0001004759122572155, + "loss": 1.0329, + "step": 11211 + }, + { + "epoch": 1.9962606837606838, + "grad_norm": 0.7344130873680115, + "learning_rate": 0.00010046191493894703, + "loss": 1.1563, + "step": 11212 + }, + { + "epoch": 1.9964387464387463, + "grad_norm": 0.6979942321777344, + "learning_rate": 0.00010044791761162833, + "loss": 0.9269, + "step": 11213 + }, + { + "epoch": 1.996616809116809, + "grad_norm": 0.67514967918396, + "learning_rate": 0.0001004339202755336, + "loss": 0.9028, + "step": 11214 + }, + { + "epoch": 1.9967948717948718, + "grad_norm": 0.6379111409187317, + "learning_rate": 0.00010041992293093712, + "loss": 0.7816, + "step": 11215 + }, + { + "epoch": 1.9969729344729346, + "grad_norm": 0.693976104259491, + "learning_rate": 0.00010040592557811308, + "loss": 0.8411, + "step": 11216 + }, + { + "epoch": 1.9971509971509973, + "grad_norm": 0.5952646732330322, + "learning_rate": 0.0001003919282173358, + "loss": 0.8681, + "step": 11217 + }, + { + "epoch": 1.9973290598290598, + "grad_norm": 0.7452160716056824, + "learning_rate": 0.00010037793084887948, + "loss": 1.0198, + "step": 11218 + }, + { + "epoch": 1.9975071225071224, + "grad_norm": 0.6683938503265381, + "learning_rate": 0.00010036393347301841, + "loss": 0.8162, + "step": 11219 + }, + { + "epoch": 
1.9976851851851851, + "grad_norm": 0.6849120855331421, + "learning_rate": 0.00010034993609002683, + "loss": 1.0668, + "step": 11220 + }, + { + "epoch": 1.9978632478632479, + "grad_norm": 0.8782517910003662, + "learning_rate": 0.00010033593870017897, + "loss": 1.222, + "step": 11221 + }, + { + "epoch": 1.9980413105413106, + "grad_norm": 0.6482772827148438, + "learning_rate": 0.00010032194130374908, + "loss": 0.7722, + "step": 11222 + }, + { + "epoch": 1.9982193732193734, + "grad_norm": 0.8595399260520935, + "learning_rate": 0.00010030794390101142, + "loss": 1.3004, + "step": 11223 + }, + { + "epoch": 1.998397435897436, + "grad_norm": 0.7258931994438171, + "learning_rate": 0.00010029394649224024, + "loss": 0.8825, + "step": 11224 + }, + { + "epoch": 1.9985754985754984, + "grad_norm": 0.6291348934173584, + "learning_rate": 0.00010027994907770981, + "loss": 0.8681, + "step": 11225 + }, + { + "epoch": 1.9987535612535612, + "grad_norm": 0.7528844475746155, + "learning_rate": 0.00010026595165769434, + "loss": 1.1443, + "step": 11226 + }, + { + "epoch": 1.998931623931624, + "grad_norm": 0.654017984867096, + "learning_rate": 0.0001002519542324681, + "loss": 0.8585, + "step": 11227 + }, + { + "epoch": 1.9991096866096867, + "grad_norm": 0.6812533736228943, + "learning_rate": 0.00010023795680230532, + "loss": 0.8757, + "step": 11228 + }, + { + "epoch": 1.9992877492877494, + "grad_norm": 0.7120179533958435, + "learning_rate": 0.0001002239593674803, + "loss": 1.0159, + "step": 11229 + }, + { + "epoch": 1.999465811965812, + "grad_norm": 0.6943802237510681, + "learning_rate": 0.00010020996192826725, + "loss": 1.0193, + "step": 11230 + }, + { + "epoch": 1.9996438746438745, + "grad_norm": 0.7227906584739685, + "learning_rate": 0.00010019596448494047, + "loss": 1.1536, + "step": 11231 + }, + { + "epoch": 1.9998219373219372, + "grad_norm": 0.6233312487602234, + "learning_rate": 0.00010018196703777411, + "loss": 0.9117, + "step": 11232 + }, + { + "epoch": 1.9998219373219372, + 
"eval_loss": 1.0963108539581299, + "eval_runtime": 24.4478, + "eval_samples_per_second": 42.58, + "eval_steps_per_second": 21.311, + "step": 11232 + }, + { + "epoch": 2.0, + "grad_norm": 0.67911696434021, + "learning_rate": 0.00010016796958704254, + "loss": 0.9516, + "step": 11233 + }, + { + "epoch": 2.0001780626780628, + "grad_norm": 0.7372198700904846, + "learning_rate": 0.00010015397213301992, + "loss": 1.3066, + "step": 11234 + }, + { + "epoch": 2.0001780626780628, + "grad_norm": 0.7573498487472534, + "learning_rate": 0.00010013997467598055, + "loss": 1.0204, + "step": 11235 + }, + { + "epoch": 2.0003561253561255, + "grad_norm": 0.6862889528274536, + "learning_rate": 0.00010012597721619863, + "loss": 0.9447, + "step": 11236 + }, + { + "epoch": 2.0005341880341883, + "grad_norm": 0.5977628231048584, + "learning_rate": 0.00010011197975394851, + "loss": 0.9849, + "step": 11237 + }, + { + "epoch": 2.0007122507122506, + "grad_norm": 0.620206892490387, + "learning_rate": 0.00010009798228950431, + "loss": 0.7498, + "step": 11238 + }, + { + "epoch": 2.0008903133903133, + "grad_norm": 0.5694536566734314, + "learning_rate": 0.0001000839848231404, + "loss": 0.7092, + "step": 11239 + }, + { + "epoch": 2.001068376068376, + "grad_norm": 0.5880212783813477, + "learning_rate": 0.00010006998735513098, + "loss": 0.9057, + "step": 11240 + }, + { + "epoch": 2.001246438746439, + "grad_norm": 0.6152323484420776, + "learning_rate": 0.00010005598988575029, + "loss": 0.8356, + "step": 11241 + }, + { + "epoch": 2.0014245014245016, + "grad_norm": 0.6827659010887146, + "learning_rate": 0.00010004199241527261, + "loss": 0.8302, + "step": 11242 + }, + { + "epoch": 2.0016025641025643, + "grad_norm": 0.5883491635322571, + "learning_rate": 0.00010002799494397215, + "loss": 0.8616, + "step": 11243 + }, + { + "epoch": 2.0017806267806266, + "grad_norm": 0.7649462819099426, + "learning_rate": 0.00010001399747212322, + "loss": 1.1643, + "step": 11244 + }, + { + "epoch": 2.0019586894586894, + 
"grad_norm": 0.6435316205024719, + "learning_rate": 0.0001, + "loss": 0.9239, + "step": 11245 + }, + { + "epoch": 2.002136752136752, + "grad_norm": 0.5662951469421387, + "learning_rate": 9.99860025278768e-05, + "loss": 0.9111, + "step": 11246 + }, + { + "epoch": 2.002314814814815, + "grad_norm": 0.6234064102172852, + "learning_rate": 9.997200505602787e-05, + "loss": 0.4852, + "step": 11247 + }, + { + "epoch": 2.0024928774928776, + "grad_norm": 0.6322146058082581, + "learning_rate": 9.995800758472741e-05, + "loss": 0.8599, + "step": 11248 + }, + { + "epoch": 2.0026709401709404, + "grad_norm": 0.6131469011306763, + "learning_rate": 9.994401011424972e-05, + "loss": 0.8504, + "step": 11249 + }, + { + "epoch": 2.0028490028490027, + "grad_norm": 0.6809168457984924, + "learning_rate": 9.993001264486903e-05, + "loss": 0.761, + "step": 11250 + }, + { + "epoch": 2.0030270655270654, + "grad_norm": 0.6721677184104919, + "learning_rate": 9.991601517685962e-05, + "loss": 0.9146, + "step": 11251 + }, + { + "epoch": 2.003205128205128, + "grad_norm": 0.6395483016967773, + "learning_rate": 9.990201771049569e-05, + "loss": 0.8583, + "step": 11252 + }, + { + "epoch": 2.003383190883191, + "grad_norm": 0.8524805903434753, + "learning_rate": 9.988802024605153e-05, + "loss": 0.859, + "step": 11253 + }, + { + "epoch": 2.0035612535612537, + "grad_norm": 0.6186681389808655, + "learning_rate": 9.987402278380136e-05, + "loss": 0.6695, + "step": 11254 + }, + { + "epoch": 2.0037393162393164, + "grad_norm": 0.593245267868042, + "learning_rate": 9.98600253240195e-05, + "loss": 0.7104, + "step": 11255 + }, + { + "epoch": 2.0039173789173788, + "grad_norm": 0.6806482672691345, + "learning_rate": 9.98460278669801e-05, + "loss": 0.6208, + "step": 11256 + }, + { + "epoch": 2.0040954415954415, + "grad_norm": 0.7329097390174866, + "learning_rate": 9.983203041295753e-05, + "loss": 0.8264, + "step": 11257 + }, + { + "epoch": 2.0042735042735043, + "grad_norm": 0.7579078078269958, + "learning_rate": 
9.981803296222591e-05, + "loss": 0.7572, + "step": 11258 + }, + { + "epoch": 2.004451566951567, + "grad_norm": 0.7945193648338318, + "learning_rate": 9.980403551505958e-05, + "loss": 0.7916, + "step": 11259 + }, + { + "epoch": 2.0046296296296298, + "grad_norm": 0.5528121590614319, + "learning_rate": 9.979003807173276e-05, + "loss": 0.5609, + "step": 11260 + }, + { + "epoch": 2.0048076923076925, + "grad_norm": 0.7321668863296509, + "learning_rate": 9.977604063251973e-05, + "loss": 0.9041, + "step": 11261 + }, + { + "epoch": 2.004985754985755, + "grad_norm": 0.6553691029548645, + "learning_rate": 9.976204319769469e-05, + "loss": 0.853, + "step": 11262 + }, + { + "epoch": 2.0051638176638176, + "grad_norm": 0.789152979850769, + "learning_rate": 9.974804576753194e-05, + "loss": 0.9909, + "step": 11263 + }, + { + "epoch": 2.0053418803418803, + "grad_norm": 0.6342231631278992, + "learning_rate": 9.973404834230568e-05, + "loss": 0.8841, + "step": 11264 + }, + { + "epoch": 2.005519943019943, + "grad_norm": 0.671882688999176, + "learning_rate": 9.97200509222902e-05, + "loss": 0.8147, + "step": 11265 + }, + { + "epoch": 2.005698005698006, + "grad_norm": 0.8409315347671509, + "learning_rate": 9.970605350775978e-05, + "loss": 1.0466, + "step": 11266 + }, + { + "epoch": 2.0058760683760686, + "grad_norm": 0.6155081987380981, + "learning_rate": 9.969205609898858e-05, + "loss": 0.9461, + "step": 11267 + }, + { + "epoch": 2.006054131054131, + "grad_norm": 0.656370997428894, + "learning_rate": 9.967805869625093e-05, + "loss": 0.6152, + "step": 11268 + }, + { + "epoch": 2.0062321937321936, + "grad_norm": 0.6441524624824524, + "learning_rate": 9.966406129982103e-05, + "loss": 0.9023, + "step": 11269 + }, + { + "epoch": 2.0064102564102564, + "grad_norm": 0.7976031303405762, + "learning_rate": 9.96500639099732e-05, + "loss": 0.8886, + "step": 11270 + }, + { + "epoch": 2.006588319088319, + "grad_norm": 0.6888235807418823, + "learning_rate": 9.963606652698159e-05, + "loss": 0.7216, + 
"step": 11271 + }, + { + "epoch": 2.006766381766382, + "grad_norm": 0.8439735770225525, + "learning_rate": 9.962206915112054e-05, + "loss": 1.0161, + "step": 11272 + }, + { + "epoch": 2.0069444444444446, + "grad_norm": 0.6425265669822693, + "learning_rate": 9.960807178266423e-05, + "loss": 0.8718, + "step": 11273 + }, + { + "epoch": 2.007122507122507, + "grad_norm": 0.7393937110900879, + "learning_rate": 9.959407442188696e-05, + "loss": 0.7615, + "step": 11274 + }, + { + "epoch": 2.0073005698005697, + "grad_norm": 0.5919229984283447, + "learning_rate": 9.958007706906292e-05, + "loss": 0.6761, + "step": 11275 + }, + { + "epoch": 2.0074786324786325, + "grad_norm": 0.765935480594635, + "learning_rate": 9.956607972446644e-05, + "loss": 0.9057, + "step": 11276 + }, + { + "epoch": 2.007656695156695, + "grad_norm": 0.7091122269630432, + "learning_rate": 9.955208238837169e-05, + "loss": 0.8322, + "step": 11277 + }, + { + "epoch": 2.007834757834758, + "grad_norm": 0.850652813911438, + "learning_rate": 9.953808506105299e-05, + "loss": 0.9942, + "step": 11278 + }, + { + "epoch": 2.0080128205128207, + "grad_norm": 0.7341200113296509, + "learning_rate": 9.952408774278452e-05, + "loss": 0.7826, + "step": 11279 + }, + { + "epoch": 2.008190883190883, + "grad_norm": 0.6891999840736389, + "learning_rate": 9.95100904338406e-05, + "loss": 0.8939, + "step": 11280 + }, + { + "epoch": 2.0083689458689458, + "grad_norm": 0.800881028175354, + "learning_rate": 9.94960931344954e-05, + "loss": 0.8036, + "step": 11281 + }, + { + "epoch": 2.0085470085470085, + "grad_norm": 0.7483115792274475, + "learning_rate": 9.948209584502328e-05, + "loss": 0.7203, + "step": 11282 + }, + { + "epoch": 2.0087250712250713, + "grad_norm": 0.7314630150794983, + "learning_rate": 9.946809856569833e-05, + "loss": 0.8907, + "step": 11283 + }, + { + "epoch": 2.008903133903134, + "grad_norm": 0.7317429184913635, + "learning_rate": 9.945410129679493e-05, + "loss": 0.8971, + "step": 11284 + }, + { + "epoch": 
2.0090811965811968, + "grad_norm": 0.6968898177146912, + "learning_rate": 9.944010403858726e-05, + "loss": 0.8638, + "step": 11285 + }, + { + "epoch": 2.009259259259259, + "grad_norm": 0.6680058240890503, + "learning_rate": 9.942610679134957e-05, + "loss": 0.7524, + "step": 11286 + }, + { + "epoch": 2.009437321937322, + "grad_norm": 0.6863839030265808, + "learning_rate": 9.941210955535618e-05, + "loss": 0.9647, + "step": 11287 + }, + { + "epoch": 2.0096153846153846, + "grad_norm": 0.7137607336044312, + "learning_rate": 9.939811233088125e-05, + "loss": 0.7089, + "step": 11288 + }, + { + "epoch": 2.0097934472934473, + "grad_norm": 0.8341759443283081, + "learning_rate": 9.938411511819907e-05, + "loss": 0.9461, + "step": 11289 + }, + { + "epoch": 2.00997150997151, + "grad_norm": 0.7326228022575378, + "learning_rate": 9.937011791758384e-05, + "loss": 0.8795, + "step": 11290 + }, + { + "epoch": 2.010149572649573, + "grad_norm": 0.6795905232429504, + "learning_rate": 9.935612072930989e-05, + "loss": 0.8298, + "step": 11291 + }, + { + "epoch": 2.010327635327635, + "grad_norm": 0.7060360312461853, + "learning_rate": 9.934212355365139e-05, + "loss": 0.8483, + "step": 11292 + }, + { + "epoch": 2.010505698005698, + "grad_norm": 0.7532246112823486, + "learning_rate": 9.932812639088265e-05, + "loss": 0.9061, + "step": 11293 + }, + { + "epoch": 2.0106837606837606, + "grad_norm": 0.6563972234725952, + "learning_rate": 9.931412924127781e-05, + "loss": 0.8511, + "step": 11294 + }, + { + "epoch": 2.0108618233618234, + "grad_norm": 0.6672948002815247, + "learning_rate": 9.930013210511125e-05, + "loss": 0.7875, + "step": 11295 + }, + { + "epoch": 2.011039886039886, + "grad_norm": 0.7173593640327454, + "learning_rate": 9.928613498265709e-05, + "loss": 0.8602, + "step": 11296 + }, + { + "epoch": 2.011217948717949, + "grad_norm": 0.7399459481239319, + "learning_rate": 9.927213787418968e-05, + "loss": 0.8711, + "step": 11297 + }, + { + "epoch": 2.011396011396011, + "grad_norm": 
0.7693262696266174, + "learning_rate": 9.925814077998317e-05, + "loss": 0.9927, + "step": 11298 + }, + { + "epoch": 2.011574074074074, + "grad_norm": 0.7998616695404053, + "learning_rate": 9.92441437003119e-05, + "loss": 1.1585, + "step": 11299 + }, + { + "epoch": 2.0117521367521367, + "grad_norm": 0.7239874005317688, + "learning_rate": 9.923014663545002e-05, + "loss": 0.8736, + "step": 11300 + }, + { + "epoch": 2.0119301994301995, + "grad_norm": 0.8565806150436401, + "learning_rate": 9.921614958567186e-05, + "loss": 0.9768, + "step": 11301 + }, + { + "epoch": 2.012108262108262, + "grad_norm": 0.6341429948806763, + "learning_rate": 9.920215255125158e-05, + "loss": 0.6553, + "step": 11302 + }, + { + "epoch": 2.012286324786325, + "grad_norm": 0.824182391166687, + "learning_rate": 9.91881555324635e-05, + "loss": 1.0138, + "step": 11303 + }, + { + "epoch": 2.0124643874643873, + "grad_norm": 0.6309344172477722, + "learning_rate": 9.917415852958178e-05, + "loss": 0.619, + "step": 11304 + }, + { + "epoch": 2.01264245014245, + "grad_norm": 0.7469239830970764, + "learning_rate": 9.916016154288071e-05, + "loss": 0.8537, + "step": 11305 + }, + { + "epoch": 2.0128205128205128, + "grad_norm": 0.7433663606643677, + "learning_rate": 9.914616457263459e-05, + "loss": 0.8518, + "step": 11306 + }, + { + "epoch": 2.0129985754985755, + "grad_norm": 0.6550318002700806, + "learning_rate": 9.913216761911755e-05, + "loss": 0.8021, + "step": 11307 + }, + { + "epoch": 2.0131766381766383, + "grad_norm": 0.7360837459564209, + "learning_rate": 9.911817068260392e-05, + "loss": 0.7002, + "step": 11308 + }, + { + "epoch": 2.013354700854701, + "grad_norm": 0.7208407521247864, + "learning_rate": 9.910417376336786e-05, + "loss": 0.8633, + "step": 11309 + }, + { + "epoch": 2.0135327635327633, + "grad_norm": 0.7758026719093323, + "learning_rate": 9.909017686168369e-05, + "loss": 0.764, + "step": 11310 + }, + { + "epoch": 2.013710826210826, + "grad_norm": 0.8215547204017639, + "learning_rate": 
9.90761799778256e-05, + "loss": 0.7062, + "step": 11311 + }, + { + "epoch": 2.013888888888889, + "grad_norm": 0.6731052994728088, + "learning_rate": 9.906218311206786e-05, + "loss": 0.902, + "step": 11312 + }, + { + "epoch": 2.0140669515669516, + "grad_norm": 0.74113929271698, + "learning_rate": 9.904818626468466e-05, + "loss": 0.7229, + "step": 11313 + }, + { + "epoch": 2.0142450142450143, + "grad_norm": 0.6673575639724731, + "learning_rate": 9.90341894359503e-05, + "loss": 0.7299, + "step": 11314 + }, + { + "epoch": 2.014423076923077, + "grad_norm": 0.7665545344352722, + "learning_rate": 9.902019262613897e-05, + "loss": 0.6993, + "step": 11315 + }, + { + "epoch": 2.0146011396011394, + "grad_norm": 0.6423895359039307, + "learning_rate": 9.900619583552497e-05, + "loss": 0.7344, + "step": 11316 + }, + { + "epoch": 2.014779202279202, + "grad_norm": 0.7071038484573364, + "learning_rate": 9.899219906438245e-05, + "loss": 0.6951, + "step": 11317 + }, + { + "epoch": 2.014957264957265, + "grad_norm": 0.689984142780304, + "learning_rate": 9.897820231298574e-05, + "loss": 0.8496, + "step": 11318 + }, + { + "epoch": 2.0151353276353277, + "grad_norm": 0.8747256398200989, + "learning_rate": 9.896420558160901e-05, + "loss": 0.9752, + "step": 11319 + }, + { + "epoch": 2.0153133903133904, + "grad_norm": 0.6828433275222778, + "learning_rate": 9.895020887052651e-05, + "loss": 0.8369, + "step": 11320 + }, + { + "epoch": 2.015491452991453, + "grad_norm": 0.7334261536598206, + "learning_rate": 9.89362121800125e-05, + "loss": 0.7744, + "step": 11321 + }, + { + "epoch": 2.0156695156695155, + "grad_norm": 0.7896139621734619, + "learning_rate": 9.892221551034122e-05, + "loss": 0.8353, + "step": 11322 + }, + { + "epoch": 2.015847578347578, + "grad_norm": 0.6673476099967957, + "learning_rate": 9.890821886178684e-05, + "loss": 0.8644, + "step": 11323 + }, + { + "epoch": 2.016025641025641, + "grad_norm": 0.7475691437721252, + "learning_rate": 9.889422223462368e-05, + "loss": 0.8034, + "step": 
11324 + }, + { + "epoch": 2.0162037037037037, + "grad_norm": 0.9086315631866455, + "learning_rate": 9.888022562912593e-05, + "loss": 1.1878, + "step": 11325 + }, + { + "epoch": 2.0163817663817665, + "grad_norm": 0.6634678244590759, + "learning_rate": 9.88662290455678e-05, + "loss": 0.9655, + "step": 11326 + }, + { + "epoch": 2.0165598290598292, + "grad_norm": 0.7184932827949524, + "learning_rate": 9.885223248422361e-05, + "loss": 0.5964, + "step": 11327 + }, + { + "epoch": 2.0167378917378915, + "grad_norm": 0.6319148540496826, + "learning_rate": 9.883823594536751e-05, + "loss": 0.5692, + "step": 11328 + }, + { + "epoch": 2.0169159544159543, + "grad_norm": 0.6232550144195557, + "learning_rate": 9.88242394292738e-05, + "loss": 0.6492, + "step": 11329 + }, + { + "epoch": 2.017094017094017, + "grad_norm": 0.7149667143821716, + "learning_rate": 9.881024293621663e-05, + "loss": 0.7023, + "step": 11330 + }, + { + "epoch": 2.01727207977208, + "grad_norm": 0.8871679902076721, + "learning_rate": 9.879624646647031e-05, + "loss": 0.954, + "step": 11331 + }, + { + "epoch": 2.0174501424501425, + "grad_norm": 0.6905941367149353, + "learning_rate": 9.878225002030901e-05, + "loss": 0.8534, + "step": 11332 + }, + { + "epoch": 2.0176282051282053, + "grad_norm": 0.8891478776931763, + "learning_rate": 9.876825359800703e-05, + "loss": 0.8324, + "step": 11333 + }, + { + "epoch": 2.0178062678062676, + "grad_norm": 0.8125092387199402, + "learning_rate": 9.875425719983852e-05, + "loss": 0.9604, + "step": 11334 + }, + { + "epoch": 2.0179843304843303, + "grad_norm": 0.7362027764320374, + "learning_rate": 9.874026082607778e-05, + "loss": 0.7879, + "step": 11335 + }, + { + "epoch": 2.018162393162393, + "grad_norm": 0.6763492226600647, + "learning_rate": 9.872626447699899e-05, + "loss": 0.8839, + "step": 11336 + }, + { + "epoch": 2.018340455840456, + "grad_norm": 0.7350467443466187, + "learning_rate": 9.871226815287644e-05, + "loss": 0.834, + "step": 11337 + }, + { + "epoch": 2.0185185185185186, 
+ "grad_norm": 0.7768327593803406, + "learning_rate": 9.869827185398428e-05, + "loss": 1.1123, + "step": 11338 + }, + { + "epoch": 2.0186965811965814, + "grad_norm": 0.9218043088912964, + "learning_rate": 9.868427558059681e-05, + "loss": 0.9439, + "step": 11339 + }, + { + "epoch": 2.0188746438746437, + "grad_norm": 0.6613419651985168, + "learning_rate": 9.867027933298819e-05, + "loss": 0.836, + "step": 11340 + }, + { + "epoch": 2.0190527065527064, + "grad_norm": 0.7251055240631104, + "learning_rate": 9.865628311143273e-05, + "loss": 0.973, + "step": 11341 + }, + { + "epoch": 2.019230769230769, + "grad_norm": 0.6571859121322632, + "learning_rate": 9.864228691620458e-05, + "loss": 0.8811, + "step": 11342 + }, + { + "epoch": 2.019408831908832, + "grad_norm": 0.7552264928817749, + "learning_rate": 9.862829074757802e-05, + "loss": 0.9128, + "step": 11343 + }, + { + "epoch": 2.0195868945868947, + "grad_norm": 0.6724083423614502, + "learning_rate": 9.861429460582723e-05, + "loss": 0.8894, + "step": 11344 + }, + { + "epoch": 2.0197649572649574, + "grad_norm": 0.8309593200683594, + "learning_rate": 9.860029849122644e-05, + "loss": 0.9374, + "step": 11345 + }, + { + "epoch": 2.0199430199430197, + "grad_norm": 0.7709865570068359, + "learning_rate": 9.858630240404993e-05, + "loss": 0.8195, + "step": 11346 + }, + { + "epoch": 2.0201210826210825, + "grad_norm": 0.8163080811500549, + "learning_rate": 9.857230634457187e-05, + "loss": 0.9329, + "step": 11347 + }, + { + "epoch": 2.0202991452991452, + "grad_norm": 0.8424021005630493, + "learning_rate": 9.855831031306653e-05, + "loss": 0.8732, + "step": 11348 + }, + { + "epoch": 2.020477207977208, + "grad_norm": 0.7816365361213684, + "learning_rate": 9.854431430980808e-05, + "loss": 0.8858, + "step": 11349 + }, + { + "epoch": 2.0206552706552707, + "grad_norm": 0.7559000253677368, + "learning_rate": 9.853031833507075e-05, + "loss": 0.7146, + "step": 11350 + }, + { + "epoch": 2.0208333333333335, + "grad_norm": 0.6723140478134155, + 
"learning_rate": 9.85163223891288e-05, + "loss": 0.813, + "step": 11351 + }, + { + "epoch": 2.021011396011396, + "grad_norm": 0.757641077041626, + "learning_rate": 9.850232647225646e-05, + "loss": 0.794, + "step": 11352 + }, + { + "epoch": 2.0211894586894585, + "grad_norm": 0.8217115998268127, + "learning_rate": 9.848833058472787e-05, + "loss": 1.0407, + "step": 11353 + }, + { + "epoch": 2.0213675213675213, + "grad_norm": 0.8016467690467834, + "learning_rate": 9.847433472681736e-05, + "loss": 0.8967, + "step": 11354 + }, + { + "epoch": 2.021545584045584, + "grad_norm": 0.7703533172607422, + "learning_rate": 9.846033889879903e-05, + "loss": 0.9669, + "step": 11355 + }, + { + "epoch": 2.021723646723647, + "grad_norm": 0.7372044920921326, + "learning_rate": 9.84463431009472e-05, + "loss": 0.8581, + "step": 11356 + }, + { + "epoch": 2.0219017094017095, + "grad_norm": 0.7676188945770264, + "learning_rate": 9.8432347333536e-05, + "loss": 0.8498, + "step": 11357 + }, + { + "epoch": 2.0220797720797723, + "grad_norm": 0.7485190629959106, + "learning_rate": 9.841835159683977e-05, + "loss": 0.8492, + "step": 11358 + }, + { + "epoch": 2.0222578347578346, + "grad_norm": 0.7287883758544922, + "learning_rate": 9.840435589113262e-05, + "loss": 0.9072, + "step": 11359 + }, + { + "epoch": 2.0224358974358974, + "grad_norm": 0.7719354033470154, + "learning_rate": 9.83903602166888e-05, + "loss": 0.7657, + "step": 11360 + }, + { + "epoch": 2.02261396011396, + "grad_norm": 0.7679458260536194, + "learning_rate": 9.837636457378251e-05, + "loss": 0.7098, + "step": 11361 + }, + { + "epoch": 2.022792022792023, + "grad_norm": 0.7496665120124817, + "learning_rate": 9.836236896268803e-05, + "loss": 0.8459, + "step": 11362 + }, + { + "epoch": 2.0229700854700856, + "grad_norm": 0.8511863350868225, + "learning_rate": 9.834837338367949e-05, + "loss": 0.9782, + "step": 11363 + }, + { + "epoch": 2.0231481481481484, + "grad_norm": 0.5752342343330383, + "learning_rate": 9.833437783703114e-05, + "loss": 
0.4539, + "step": 11364 + }, + { + "epoch": 2.0233262108262107, + "grad_norm": 0.6654593348503113, + "learning_rate": 9.832038232301722e-05, + "loss": 0.8009, + "step": 11365 + }, + { + "epoch": 2.0235042735042734, + "grad_norm": 0.7296777963638306, + "learning_rate": 9.83063868419119e-05, + "loss": 0.7841, + "step": 11366 + }, + { + "epoch": 2.023682336182336, + "grad_norm": 0.8404465913772583, + "learning_rate": 9.829239139398943e-05, + "loss": 0.9152, + "step": 11367 + }, + { + "epoch": 2.023860398860399, + "grad_norm": 0.6407002806663513, + "learning_rate": 9.827839597952397e-05, + "loss": 0.6953, + "step": 11368 + }, + { + "epoch": 2.0240384615384617, + "grad_norm": 0.8107042908668518, + "learning_rate": 9.826440059878982e-05, + "loss": 0.8726, + "step": 11369 + }, + { + "epoch": 2.0242165242165244, + "grad_norm": 0.803804874420166, + "learning_rate": 9.825040525206108e-05, + "loss": 0.8906, + "step": 11370 + }, + { + "epoch": 2.0243945868945867, + "grad_norm": 0.7625358700752258, + "learning_rate": 9.823640993961205e-05, + "loss": 0.8938, + "step": 11371 + }, + { + "epoch": 2.0245726495726495, + "grad_norm": 0.690793514251709, + "learning_rate": 9.822241466171686e-05, + "loss": 0.7926, + "step": 11372 + }, + { + "epoch": 2.0247507122507122, + "grad_norm": 0.7006554007530212, + "learning_rate": 9.820841941864983e-05, + "loss": 0.793, + "step": 11373 + }, + { + "epoch": 2.024928774928775, + "grad_norm": 0.8029078841209412, + "learning_rate": 9.819442421068504e-05, + "loss": 0.867, + "step": 11374 + }, + { + "epoch": 2.0251068376068377, + "grad_norm": 0.6999112367630005, + "learning_rate": 9.818042903809678e-05, + "loss": 0.688, + "step": 11375 + }, + { + "epoch": 2.0252849002849005, + "grad_norm": 0.6848462224006653, + "learning_rate": 9.816643390115923e-05, + "loss": 0.7337, + "step": 11376 + }, + { + "epoch": 2.025462962962963, + "grad_norm": 0.7698155641555786, + "learning_rate": 9.815243880014663e-05, + "loss": 0.9712, + "step": 11377 + }, + { + "epoch": 
2.0256410256410255, + "grad_norm": 0.8449836373329163, + "learning_rate": 9.81384437353331e-05, + "loss": 0.9144, + "step": 11378 + }, + { + "epoch": 2.0258190883190883, + "grad_norm": 0.6340110301971436, + "learning_rate": 9.812444870699296e-05, + "loss": 0.6365, + "step": 11379 + }, + { + "epoch": 2.025997150997151, + "grad_norm": 0.7104073762893677, + "learning_rate": 9.81104537154003e-05, + "loss": 0.7781, + "step": 11380 + }, + { + "epoch": 2.026175213675214, + "grad_norm": 0.7287606000900269, + "learning_rate": 9.809645876082939e-05, + "loss": 0.9351, + "step": 11381 + }, + { + "epoch": 2.0263532763532766, + "grad_norm": 0.9640787243843079, + "learning_rate": 9.80824638435544e-05, + "loss": 0.8745, + "step": 11382 + }, + { + "epoch": 2.026531339031339, + "grad_norm": 0.5718010067939758, + "learning_rate": 9.806846896384959e-05, + "loss": 0.4711, + "step": 11383 + }, + { + "epoch": 2.0267094017094016, + "grad_norm": 0.7903527021408081, + "learning_rate": 9.805447412198907e-05, + "loss": 0.8241, + "step": 11384 + }, + { + "epoch": 2.0268874643874644, + "grad_norm": 0.8579357862472534, + "learning_rate": 9.80404793182471e-05, + "loss": 0.8621, + "step": 11385 + }, + { + "epoch": 2.027065527065527, + "grad_norm": 0.8466464877128601, + "learning_rate": 9.802648455289787e-05, + "loss": 0.8772, + "step": 11386 + }, + { + "epoch": 2.02724358974359, + "grad_norm": 0.7888286709785461, + "learning_rate": 9.801248982621557e-05, + "loss": 0.8352, + "step": 11387 + }, + { + "epoch": 2.0274216524216526, + "grad_norm": 0.6967005133628845, + "learning_rate": 9.799849513847444e-05, + "loss": 0.7936, + "step": 11388 + }, + { + "epoch": 2.027599715099715, + "grad_norm": 0.6987027525901794, + "learning_rate": 9.79845004899486e-05, + "loss": 0.7778, + "step": 11389 + }, + { + "epoch": 2.0277777777777777, + "grad_norm": 0.7414312362670898, + "learning_rate": 9.797050588091233e-05, + "loss": 0.9017, + "step": 11390 + }, + { + "epoch": 2.0279558404558404, + "grad_norm": 
0.7932028770446777, + "learning_rate": 9.795651131163974e-05, + "loss": 0.8662, + "step": 11391 + }, + { + "epoch": 2.028133903133903, + "grad_norm": 0.8166332244873047, + "learning_rate": 9.79425167824051e-05, + "loss": 1.0489, + "step": 11392 + }, + { + "epoch": 2.028311965811966, + "grad_norm": 0.7265253663063049, + "learning_rate": 9.792852229348251e-05, + "loss": 0.9458, + "step": 11393 + }, + { + "epoch": 2.0284900284900287, + "grad_norm": 0.7374703288078308, + "learning_rate": 9.791452784514629e-05, + "loss": 0.9203, + "step": 11394 + }, + { + "epoch": 2.028668091168091, + "grad_norm": 0.6912441253662109, + "learning_rate": 9.790053343767052e-05, + "loss": 0.8986, + "step": 11395 + }, + { + "epoch": 2.0288461538461537, + "grad_norm": 0.871231734752655, + "learning_rate": 9.788653907132946e-05, + "loss": 0.6811, + "step": 11396 + }, + { + "epoch": 2.0290242165242165, + "grad_norm": 0.7361812591552734, + "learning_rate": 9.787254474639726e-05, + "loss": 0.7868, + "step": 11397 + }, + { + "epoch": 2.0292022792022792, + "grad_norm": 0.6828895211219788, + "learning_rate": 9.785855046314815e-05, + "loss": 0.7739, + "step": 11398 + }, + { + "epoch": 2.029380341880342, + "grad_norm": 0.7203328609466553, + "learning_rate": 9.784455622185626e-05, + "loss": 0.6474, + "step": 11399 + }, + { + "epoch": 2.0295584045584047, + "grad_norm": 0.774886429309845, + "learning_rate": 9.783056202279587e-05, + "loss": 0.8073, + "step": 11400 + }, + { + "epoch": 2.029736467236467, + "grad_norm": 0.6479005813598633, + "learning_rate": 9.781656786624106e-05, + "loss": 0.7237, + "step": 11401 + }, + { + "epoch": 2.02991452991453, + "grad_norm": 0.7269866466522217, + "learning_rate": 9.78025737524661e-05, + "loss": 0.9089, + "step": 11402 + }, + { + "epoch": 2.0300925925925926, + "grad_norm": 0.7265415191650391, + "learning_rate": 9.778857968174509e-05, + "loss": 0.827, + "step": 11403 + }, + { + "epoch": 2.0302706552706553, + "grad_norm": 0.8174277544021606, + "learning_rate": 
9.777458565435227e-05, + "loss": 0.6752, + "step": 11404 + }, + { + "epoch": 2.030448717948718, + "grad_norm": 0.9333333969116211, + "learning_rate": 9.77605916705619e-05, + "loss": 0.9542, + "step": 11405 + }, + { + "epoch": 2.030626780626781, + "grad_norm": 0.6854027509689331, + "learning_rate": 9.774659773064801e-05, + "loss": 0.8526, + "step": 11406 + }, + { + "epoch": 2.030804843304843, + "grad_norm": 0.7711043357849121, + "learning_rate": 9.773260383488489e-05, + "loss": 0.9009, + "step": 11407 + }, + { + "epoch": 2.030982905982906, + "grad_norm": 0.6915287971496582, + "learning_rate": 9.771860998354667e-05, + "loss": 0.9635, + "step": 11408 + }, + { + "epoch": 2.0311609686609686, + "grad_norm": 0.7978841066360474, + "learning_rate": 9.770461617690758e-05, + "loss": 0.7563, + "step": 11409 + }, + { + "epoch": 2.0313390313390314, + "grad_norm": 0.6686414480209351, + "learning_rate": 9.769062241524172e-05, + "loss": 0.8282, + "step": 11410 + }, + { + "epoch": 2.031517094017094, + "grad_norm": 0.7024029493331909, + "learning_rate": 9.767662869882335e-05, + "loss": 0.9176, + "step": 11411 + }, + { + "epoch": 2.031695156695157, + "grad_norm": 0.6945844292640686, + "learning_rate": 9.766263502792659e-05, + "loss": 0.86, + "step": 11412 + }, + { + "epoch": 2.031873219373219, + "grad_norm": 0.7351676821708679, + "learning_rate": 9.764864140282569e-05, + "loss": 0.865, + "step": 11413 + }, + { + "epoch": 2.032051282051282, + "grad_norm": 0.7663825750350952, + "learning_rate": 9.763464782379472e-05, + "loss": 0.9309, + "step": 11414 + }, + { + "epoch": 2.0322293447293447, + "grad_norm": 0.7552894949913025, + "learning_rate": 9.762065429110798e-05, + "loss": 0.8366, + "step": 11415 + }, + { + "epoch": 2.0324074074074074, + "grad_norm": 0.6852208971977234, + "learning_rate": 9.760666080503951e-05, + "loss": 0.9095, + "step": 11416 + }, + { + "epoch": 2.03258547008547, + "grad_norm": 0.7759820222854614, + "learning_rate": 9.759266736586358e-05, + "loss": 0.7461, + "step": 
11417 + }, + { + "epoch": 2.032763532763533, + "grad_norm": 0.6514183878898621, + "learning_rate": 9.757867397385431e-05, + "loss": 0.5479, + "step": 11418 + }, + { + "epoch": 2.0329415954415953, + "grad_norm": 0.7703103423118591, + "learning_rate": 9.756468062928593e-05, + "loss": 0.9588, + "step": 11419 + }, + { + "epoch": 2.033119658119658, + "grad_norm": 0.6937198638916016, + "learning_rate": 9.755068733243255e-05, + "loss": 0.8661, + "step": 11420 + }, + { + "epoch": 2.0332977207977208, + "grad_norm": 0.6675645112991333, + "learning_rate": 9.753669408356835e-05, + "loss": 0.7484, + "step": 11421 + }, + { + "epoch": 2.0334757834757835, + "grad_norm": 0.6653266549110413, + "learning_rate": 9.752270088296753e-05, + "loss": 0.7217, + "step": 11422 + }, + { + "epoch": 2.0336538461538463, + "grad_norm": 0.7893908023834229, + "learning_rate": 9.750870773090425e-05, + "loss": 0.6346, + "step": 11423 + }, + { + "epoch": 2.033831908831909, + "grad_norm": 0.7442745566368103, + "learning_rate": 9.749471462765265e-05, + "loss": 0.9452, + "step": 11424 + }, + { + "epoch": 2.0340099715099713, + "grad_norm": 0.8270035982131958, + "learning_rate": 9.748072157348691e-05, + "loss": 0.9906, + "step": 11425 + }, + { + "epoch": 2.034188034188034, + "grad_norm": 0.7195143699645996, + "learning_rate": 9.746672856868123e-05, + "loss": 0.9564, + "step": 11426 + }, + { + "epoch": 2.034366096866097, + "grad_norm": 0.75486820936203, + "learning_rate": 9.745273561350971e-05, + "loss": 0.9052, + "step": 11427 + }, + { + "epoch": 2.0345441595441596, + "grad_norm": 0.6710293889045715, + "learning_rate": 9.743874270824655e-05, + "loss": 0.6801, + "step": 11428 + }, + { + "epoch": 2.0347222222222223, + "grad_norm": 0.704175591468811, + "learning_rate": 9.742474985316588e-05, + "loss": 0.8619, + "step": 11429 + }, + { + "epoch": 2.034900284900285, + "grad_norm": 0.7941717505455017, + "learning_rate": 9.741075704854196e-05, + "loss": 0.8318, + "step": 11430 + }, + { + "epoch": 2.0350783475783474, 
+ "grad_norm": 0.8592050671577454, + "learning_rate": 9.739676429464881e-05, + "loss": 0.8203, + "step": 11431 + }, + { + "epoch": 2.03525641025641, + "grad_norm": 0.9149407148361206, + "learning_rate": 9.738277159176068e-05, + "loss": 0.87, + "step": 11432 + }, + { + "epoch": 2.035434472934473, + "grad_norm": 0.780890941619873, + "learning_rate": 9.736877894015169e-05, + "loss": 0.6971, + "step": 11433 + }, + { + "epoch": 2.0356125356125356, + "grad_norm": 0.7540209293365479, + "learning_rate": 9.735478634009605e-05, + "loss": 0.8927, + "step": 11434 + }, + { + "epoch": 2.0357905982905984, + "grad_norm": 0.8556281924247742, + "learning_rate": 9.734079379186782e-05, + "loss": 0.7498, + "step": 11435 + }, + { + "epoch": 2.035968660968661, + "grad_norm": 0.8710931539535522, + "learning_rate": 9.732680129574128e-05, + "loss": 0.6009, + "step": 11436 + }, + { + "epoch": 2.0361467236467234, + "grad_norm": 0.6873082518577576, + "learning_rate": 9.731280885199045e-05, + "loss": 0.8441, + "step": 11437 + }, + { + "epoch": 2.036324786324786, + "grad_norm": 0.8333037495613098, + "learning_rate": 9.729881646088958e-05, + "loss": 0.888, + "step": 11438 + }, + { + "epoch": 2.036502849002849, + "grad_norm": 0.859365701675415, + "learning_rate": 9.728482412271277e-05, + "loss": 1.0272, + "step": 11439 + }, + { + "epoch": 2.0366809116809117, + "grad_norm": 0.7239334583282471, + "learning_rate": 9.727083183773423e-05, + "loss": 0.9428, + "step": 11440 + }, + { + "epoch": 2.0368589743589745, + "grad_norm": 0.8341524004936218, + "learning_rate": 9.725683960622804e-05, + "loss": 0.9275, + "step": 11441 + }, + { + "epoch": 2.037037037037037, + "grad_norm": 0.6992602348327637, + "learning_rate": 9.724284742846838e-05, + "loss": 0.8492, + "step": 11442 + }, + { + "epoch": 2.0372150997150995, + "grad_norm": 0.7429133057594299, + "learning_rate": 9.72288553047294e-05, + "loss": 0.8246, + "step": 11443 + }, + { + "epoch": 2.0373931623931623, + "grad_norm": 0.7765250205993652, + 
"learning_rate": 9.721486323528522e-05, + "loss": 0.8624, + "step": 11444 + }, + { + "epoch": 2.037571225071225, + "grad_norm": 0.9104889631271362, + "learning_rate": 9.720087122041007e-05, + "loss": 0.8369, + "step": 11445 + }, + { + "epoch": 2.0377492877492878, + "grad_norm": 0.6483191251754761, + "learning_rate": 9.718687926037798e-05, + "loss": 0.7347, + "step": 11446 + }, + { + "epoch": 2.0379273504273505, + "grad_norm": 0.7816178202629089, + "learning_rate": 9.717288735546317e-05, + "loss": 0.8607, + "step": 11447 + }, + { + "epoch": 2.0381054131054133, + "grad_norm": 0.6909009218215942, + "learning_rate": 9.715889550593975e-05, + "loss": 0.8764, + "step": 11448 + }, + { + "epoch": 2.0382834757834756, + "grad_norm": 0.8101255297660828, + "learning_rate": 9.71449037120819e-05, + "loss": 0.8858, + "step": 11449 + }, + { + "epoch": 2.0384615384615383, + "grad_norm": 0.7476511001586914, + "learning_rate": 9.71309119741637e-05, + "loss": 0.8765, + "step": 11450 + }, + { + "epoch": 2.038639601139601, + "grad_norm": 0.7514875531196594, + "learning_rate": 9.711692029245934e-05, + "loss": 0.942, + "step": 11451 + }, + { + "epoch": 2.038817663817664, + "grad_norm": 0.7400087118148804, + "learning_rate": 9.710292866724292e-05, + "loss": 0.8327, + "step": 11452 + }, + { + "epoch": 2.0389957264957266, + "grad_norm": 0.832979142665863, + "learning_rate": 9.70889370987886e-05, + "loss": 0.9714, + "step": 11453 + }, + { + "epoch": 2.0391737891737893, + "grad_norm": 0.6918326616287231, + "learning_rate": 9.70749455873705e-05, + "loss": 0.7765, + "step": 11454 + }, + { + "epoch": 2.0393518518518516, + "grad_norm": 0.8286036849021912, + "learning_rate": 9.70609541332628e-05, + "loss": 0.9138, + "step": 11455 + }, + { + "epoch": 2.0395299145299144, + "grad_norm": 0.6436729431152344, + "learning_rate": 9.704696273673955e-05, + "loss": 0.738, + "step": 11456 + }, + { + "epoch": 2.039707977207977, + "grad_norm": 0.7057681679725647, + "learning_rate": 9.703297139807496e-05, + 
"loss": 0.8107, + "step": 11457 + }, + { + "epoch": 2.03988603988604, + "grad_norm": 0.7444550395011902, + "learning_rate": 9.701898011754313e-05, + "loss": 0.8188, + "step": 11458 + }, + { + "epoch": 2.0400641025641026, + "grad_norm": 0.7622130513191223, + "learning_rate": 9.70049888954182e-05, + "loss": 0.8451, + "step": 11459 + }, + { + "epoch": 2.0402421652421654, + "grad_norm": 0.8166092038154602, + "learning_rate": 9.699099773197426e-05, + "loss": 0.9399, + "step": 11460 + }, + { + "epoch": 2.0404202279202277, + "grad_norm": 0.7235924601554871, + "learning_rate": 9.697700662748552e-05, + "loss": 0.7863, + "step": 11461 + }, + { + "epoch": 2.0405982905982905, + "grad_norm": 0.7150312662124634, + "learning_rate": 9.696301558222601e-05, + "loss": 0.8288, + "step": 11462 + }, + { + "epoch": 2.040776353276353, + "grad_norm": 0.8007016777992249, + "learning_rate": 9.694902459646993e-05, + "loss": 0.9203, + "step": 11463 + }, + { + "epoch": 2.040954415954416, + "grad_norm": 0.7665491700172424, + "learning_rate": 9.693503367049134e-05, + "loss": 0.7956, + "step": 11464 + }, + { + "epoch": 2.0411324786324787, + "grad_norm": 0.7499460577964783, + "learning_rate": 9.692104280456439e-05, + "loss": 0.9973, + "step": 11465 + }, + { + "epoch": 2.0413105413105415, + "grad_norm": 0.7598159909248352, + "learning_rate": 9.690705199896327e-05, + "loss": 0.95, + "step": 11466 + }, + { + "epoch": 2.041488603988604, + "grad_norm": 0.7699945569038391, + "learning_rate": 9.689306125396201e-05, + "loss": 0.8731, + "step": 11467 + }, + { + "epoch": 2.0416666666666665, + "grad_norm": 0.6724731922149658, + "learning_rate": 9.687907056983476e-05, + "loss": 0.906, + "step": 11468 + }, + { + "epoch": 2.0418447293447293, + "grad_norm": 0.9238275289535522, + "learning_rate": 9.686507994685562e-05, + "loss": 0.8397, + "step": 11469 + }, + { + "epoch": 2.042022792022792, + "grad_norm": 0.744969367980957, + "learning_rate": 9.685108938529876e-05, + "loss": 0.8436, + "step": 11470 + }, + { + 
"epoch": 2.0422008547008548, + "grad_norm": 0.6983298063278198, + "learning_rate": 9.683709888543824e-05, + "loss": 0.8235, + "step": 11471 + }, + { + "epoch": 2.0423789173789175, + "grad_norm": 0.7098708748817444, + "learning_rate": 9.682310844754824e-05, + "loss": 0.8235, + "step": 11472 + }, + { + "epoch": 2.04255698005698, + "grad_norm": 0.7492793798446655, + "learning_rate": 9.680911807190277e-05, + "loss": 0.7988, + "step": 11473 + }, + { + "epoch": 2.0427350427350426, + "grad_norm": 0.6952250003814697, + "learning_rate": 9.679512775877604e-05, + "loss": 0.7928, + "step": 11474 + }, + { + "epoch": 2.0429131054131053, + "grad_norm": 0.6442983150482178, + "learning_rate": 9.678113750844209e-05, + "loss": 0.8206, + "step": 11475 + }, + { + "epoch": 2.043091168091168, + "grad_norm": 0.7408245205879211, + "learning_rate": 9.67671473211751e-05, + "loss": 0.6941, + "step": 11476 + }, + { + "epoch": 2.043269230769231, + "grad_norm": 0.8277738094329834, + "learning_rate": 9.675315719724913e-05, + "loss": 1.3153, + "step": 11477 + }, + { + "epoch": 2.0434472934472936, + "grad_norm": 0.7535714507102966, + "learning_rate": 9.67391671369383e-05, + "loss": 0.9238, + "step": 11478 + }, + { + "epoch": 2.0436253561253563, + "grad_norm": 0.8341996073722839, + "learning_rate": 9.67251771405167e-05, + "loss": 0.8149, + "step": 11479 + }, + { + "epoch": 2.0438034188034186, + "grad_norm": 0.7365956902503967, + "learning_rate": 9.671118720825849e-05, + "loss": 0.799, + "step": 11480 + }, + { + "epoch": 2.0439814814814814, + "grad_norm": 0.7630738615989685, + "learning_rate": 9.669719734043769e-05, + "loss": 0.9284, + "step": 11481 + }, + { + "epoch": 2.044159544159544, + "grad_norm": 0.659172773361206, + "learning_rate": 9.668320753732848e-05, + "loss": 0.7594, + "step": 11482 + }, + { + "epoch": 2.044337606837607, + "grad_norm": 0.7724705934524536, + "learning_rate": 9.66692177992049e-05, + "loss": 0.8623, + "step": 11483 + }, + { + "epoch": 2.0445156695156697, + "grad_norm": 
0.7140040993690491, + "learning_rate": 9.665522812634108e-05, + "loss": 0.851, + "step": 11484 + }, + { + "epoch": 2.0446937321937324, + "grad_norm": 0.9072890877723694, + "learning_rate": 9.664123851901115e-05, + "loss": 0.9459, + "step": 11485 + }, + { + "epoch": 2.0448717948717947, + "grad_norm": 0.8145443201065063, + "learning_rate": 9.662724897748915e-05, + "loss": 0.9067, + "step": 11486 + }, + { + "epoch": 2.0450498575498575, + "grad_norm": 0.8471246957778931, + "learning_rate": 9.661325950204922e-05, + "loss": 0.7194, + "step": 11487 + }, + { + "epoch": 2.04522792022792, + "grad_norm": 0.8465375304222107, + "learning_rate": 9.659927009296541e-05, + "loss": 0.9495, + "step": 11488 + }, + { + "epoch": 2.045405982905983, + "grad_norm": 0.7597832083702087, + "learning_rate": 9.658528075051185e-05, + "loss": 0.7526, + "step": 11489 + }, + { + "epoch": 2.0455840455840457, + "grad_norm": 0.8013564944267273, + "learning_rate": 9.657129147496261e-05, + "loss": 1.0514, + "step": 11490 + }, + { + "epoch": 2.0457621082621085, + "grad_norm": 0.8695764541625977, + "learning_rate": 9.655730226659182e-05, + "loss": 0.9925, + "step": 11491 + }, + { + "epoch": 2.0459401709401708, + "grad_norm": 0.7295607328414917, + "learning_rate": 9.65433131256735e-05, + "loss": 0.8652, + "step": 11492 + }, + { + "epoch": 2.0461182336182335, + "grad_norm": 0.7819971442222595, + "learning_rate": 9.652932405248181e-05, + "loss": 0.8601, + "step": 11493 + }, + { + "epoch": 2.0462962962962963, + "grad_norm": 0.7244205474853516, + "learning_rate": 9.651533504729078e-05, + "loss": 0.752, + "step": 11494 + }, + { + "epoch": 2.046474358974359, + "grad_norm": 0.7774363160133362, + "learning_rate": 9.650134611037456e-05, + "loss": 0.8638, + "step": 11495 + }, + { + "epoch": 2.046652421652422, + "grad_norm": 0.7955372929573059, + "learning_rate": 9.648735724200715e-05, + "loss": 0.9662, + "step": 11496 + }, + { + "epoch": 2.0468304843304845, + "grad_norm": 0.7114127278327942, + "learning_rate": 
9.647336844246273e-05, + "loss": 0.9523, + "step": 11497 + }, + { + "epoch": 2.047008547008547, + "grad_norm": 0.7449100017547607, + "learning_rate": 9.645937971201527e-05, + "loss": 0.7898, + "step": 11498 + }, + { + "epoch": 2.0471866096866096, + "grad_norm": 0.7541512846946716, + "learning_rate": 9.644539105093895e-05, + "loss": 0.9286, + "step": 11499 + }, + { + "epoch": 2.0473646723646723, + "grad_norm": 0.6816682815551758, + "learning_rate": 9.643140245950778e-05, + "loss": 0.7757, + "step": 11500 + }, + { + "epoch": 2.047542735042735, + "grad_norm": 0.7222850918769836, + "learning_rate": 9.641741393799591e-05, + "loss": 0.8415, + "step": 11501 + }, + { + "epoch": 2.047720797720798, + "grad_norm": 0.7605552077293396, + "learning_rate": 9.640342548667732e-05, + "loss": 0.8875, + "step": 11502 + }, + { + "epoch": 2.0478988603988606, + "grad_norm": 0.7442240118980408, + "learning_rate": 9.638943710582615e-05, + "loss": 0.8755, + "step": 11503 + }, + { + "epoch": 2.048076923076923, + "grad_norm": 0.7065736651420593, + "learning_rate": 9.637544879571648e-05, + "loss": 0.6885, + "step": 11504 + }, + { + "epoch": 2.0482549857549857, + "grad_norm": 0.6400303244590759, + "learning_rate": 9.636146055662232e-05, + "loss": 0.5775, + "step": 11505 + }, + { + "epoch": 2.0484330484330484, + "grad_norm": 0.7955389022827148, + "learning_rate": 9.634747238881783e-05, + "loss": 1.0182, + "step": 11506 + }, + { + "epoch": 2.048611111111111, + "grad_norm": 0.8283255696296692, + "learning_rate": 9.6333484292577e-05, + "loss": 0.9247, + "step": 11507 + }, + { + "epoch": 2.048789173789174, + "grad_norm": 0.7619521617889404, + "learning_rate": 9.631949626817399e-05, + "loss": 0.8355, + "step": 11508 + }, + { + "epoch": 2.0489672364672367, + "grad_norm": 0.7204191088676453, + "learning_rate": 9.630550831588273e-05, + "loss": 0.8571, + "step": 11509 + }, + { + "epoch": 2.049145299145299, + "grad_norm": 0.7568399310112, + "learning_rate": 9.629152043597738e-05, + "loss": 0.7349, + 
"step": 11510 + }, + { + "epoch": 2.0493233618233617, + "grad_norm": 0.8594959378242493, + "learning_rate": 9.627753262873199e-05, + "loss": 0.6918, + "step": 11511 + }, + { + "epoch": 2.0495014245014245, + "grad_norm": 0.6345391869544983, + "learning_rate": 9.626354489442064e-05, + "loss": 0.871, + "step": 11512 + }, + { + "epoch": 2.0496794871794872, + "grad_norm": 0.7671827673912048, + "learning_rate": 9.624955723331732e-05, + "loss": 0.8518, + "step": 11513 + }, + { + "epoch": 2.04985754985755, + "grad_norm": 0.7182049751281738, + "learning_rate": 9.623556964569616e-05, + "loss": 0.889, + "step": 11514 + }, + { + "epoch": 2.0500356125356127, + "grad_norm": 0.8342016339302063, + "learning_rate": 9.622158213183118e-05, + "loss": 0.9712, + "step": 11515 + }, + { + "epoch": 2.050213675213675, + "grad_norm": 0.7007761001586914, + "learning_rate": 9.620759469199649e-05, + "loss": 0.8113, + "step": 11516 + }, + { + "epoch": 2.050391737891738, + "grad_norm": 0.7129531502723694, + "learning_rate": 9.619360732646605e-05, + "loss": 0.8666, + "step": 11517 + }, + { + "epoch": 2.0505698005698005, + "grad_norm": 0.7505812048912048, + "learning_rate": 9.6179620035514e-05, + "loss": 0.7321, + "step": 11518 + }, + { + "epoch": 2.0507478632478633, + "grad_norm": 0.7407607436180115, + "learning_rate": 9.616563281941433e-05, + "loss": 0.9275, + "step": 11519 + }, + { + "epoch": 2.050925925925926, + "grad_norm": 0.769345223903656, + "learning_rate": 9.615164567844116e-05, + "loss": 0.9731, + "step": 11520 + }, + { + "epoch": 2.051103988603989, + "grad_norm": 0.7782812118530273, + "learning_rate": 9.613765861286846e-05, + "loss": 0.9702, + "step": 11521 + }, + { + "epoch": 2.051282051282051, + "grad_norm": 0.7071413993835449, + "learning_rate": 9.612367162297037e-05, + "loss": 0.8451, + "step": 11522 + }, + { + "epoch": 2.051460113960114, + "grad_norm": 0.7598503232002258, + "learning_rate": 9.610968470902082e-05, + "loss": 0.8641, + "step": 11523 + }, + { + "epoch": 
2.0516381766381766, + "grad_norm": 0.7951003313064575, + "learning_rate": 9.609569787129394e-05, + "loss": 0.9131, + "step": 11524 + }, + { + "epoch": 2.0518162393162394, + "grad_norm": 0.8029175996780396, + "learning_rate": 9.608171111006374e-05, + "loss": 0.8618, + "step": 11525 + }, + { + "epoch": 2.051994301994302, + "grad_norm": 0.6993120908737183, + "learning_rate": 9.606772442560428e-05, + "loss": 0.8487, + "step": 11526 + }, + { + "epoch": 2.052172364672365, + "grad_norm": 0.8039231896400452, + "learning_rate": 9.605373781818961e-05, + "loss": 1.0102, + "step": 11527 + }, + { + "epoch": 2.052350427350427, + "grad_norm": 0.714849054813385, + "learning_rate": 9.603975128809373e-05, + "loss": 0.8977, + "step": 11528 + }, + { + "epoch": 2.05252849002849, + "grad_norm": 0.8728037476539612, + "learning_rate": 9.60257648355907e-05, + "loss": 0.8004, + "step": 11529 + }, + { + "epoch": 2.0527065527065527, + "grad_norm": 0.764776885509491, + "learning_rate": 9.601177846095454e-05, + "loss": 0.9205, + "step": 11530 + }, + { + "epoch": 2.0528846153846154, + "grad_norm": 0.6948725581169128, + "learning_rate": 9.599779216445934e-05, + "loss": 0.7864, + "step": 11531 + }, + { + "epoch": 2.053062678062678, + "grad_norm": 0.7663996815681458, + "learning_rate": 9.598380594637903e-05, + "loss": 0.8877, + "step": 11532 + }, + { + "epoch": 2.053240740740741, + "grad_norm": 0.7584146857261658, + "learning_rate": 9.596981980698776e-05, + "loss": 1.1328, + "step": 11533 + }, + { + "epoch": 2.0534188034188032, + "grad_norm": 0.7701094150543213, + "learning_rate": 9.595583374655945e-05, + "loss": 0.7551, + "step": 11534 + }, + { + "epoch": 2.053596866096866, + "grad_norm": 0.7745714783668518, + "learning_rate": 9.594184776536821e-05, + "loss": 0.8862, + "step": 11535 + }, + { + "epoch": 2.0537749287749287, + "grad_norm": 0.7832430005073547, + "learning_rate": 9.5927861863688e-05, + "loss": 0.8736, + "step": 11536 + }, + { + "epoch": 2.0539529914529915, + "grad_norm": 
0.7354840040206909, + "learning_rate": 9.591387604179291e-05, + "loss": 0.8183, + "step": 11537 + }, + { + "epoch": 2.0541310541310542, + "grad_norm": 0.7516480684280396, + "learning_rate": 9.589989029995691e-05, + "loss": 0.924, + "step": 11538 + }, + { + "epoch": 2.054309116809117, + "grad_norm": 0.7942310571670532, + "learning_rate": 9.588590463845405e-05, + "loss": 1.0283, + "step": 11539 + }, + { + "epoch": 2.0544871794871793, + "grad_norm": 0.7716572880744934, + "learning_rate": 9.587191905755832e-05, + "loss": 0.8686, + "step": 11540 + }, + { + "epoch": 2.054665242165242, + "grad_norm": 0.8075140118598938, + "learning_rate": 9.585793355754381e-05, + "loss": 0.8731, + "step": 11541 + }, + { + "epoch": 2.054843304843305, + "grad_norm": 0.8119283318519592, + "learning_rate": 9.584394813868444e-05, + "loss": 0.9543, + "step": 11542 + }, + { + "epoch": 2.0550213675213675, + "grad_norm": 0.6476314067840576, + "learning_rate": 9.582996280125427e-05, + "loss": 0.6943, + "step": 11543 + }, + { + "epoch": 2.0551994301994303, + "grad_norm": 0.7617185711860657, + "learning_rate": 9.581597754552737e-05, + "loss": 0.6942, + "step": 11544 + }, + { + "epoch": 2.055377492877493, + "grad_norm": 0.879355788230896, + "learning_rate": 9.580199237177765e-05, + "loss": 0.825, + "step": 11545 + }, + { + "epoch": 2.0555555555555554, + "grad_norm": 0.8229055404663086, + "learning_rate": 9.578800728027919e-05, + "loss": 0.9973, + "step": 11546 + }, + { + "epoch": 2.055733618233618, + "grad_norm": 0.7808930277824402, + "learning_rate": 9.577402227130596e-05, + "loss": 0.9525, + "step": 11547 + }, + { + "epoch": 2.055911680911681, + "grad_norm": 0.870499849319458, + "learning_rate": 9.576003734513201e-05, + "loss": 0.8874, + "step": 11548 + }, + { + "epoch": 2.0560897435897436, + "grad_norm": 0.8254318833351135, + "learning_rate": 9.57460525020313e-05, + "loss": 0.6656, + "step": 11549 + }, + { + "epoch": 2.0562678062678064, + "grad_norm": 0.8358132243156433, + "learning_rate": 
9.573206774227786e-05, + "loss": 0.7946, + "step": 11550 + }, + { + "epoch": 2.056445868945869, + "grad_norm": 0.636366605758667, + "learning_rate": 9.571808306614568e-05, + "loss": 0.6757, + "step": 11551 + }, + { + "epoch": 2.0566239316239314, + "grad_norm": 0.8884546160697937, + "learning_rate": 9.57040984739088e-05, + "loss": 0.6775, + "step": 11552 + }, + { + "epoch": 2.056801994301994, + "grad_norm": 0.7240797877311707, + "learning_rate": 9.569011396584115e-05, + "loss": 0.8033, + "step": 11553 + }, + { + "epoch": 2.056980056980057, + "grad_norm": 0.8730767965316772, + "learning_rate": 9.567612954221678e-05, + "loss": 0.9577, + "step": 11554 + }, + { + "epoch": 2.0571581196581197, + "grad_norm": 0.6785064339637756, + "learning_rate": 9.566214520330966e-05, + "loss": 0.6241, + "step": 11555 + }, + { + "epoch": 2.0573361823361824, + "grad_norm": 0.7757805585861206, + "learning_rate": 9.564816094939382e-05, + "loss": 0.7926, + "step": 11556 + }, + { + "epoch": 2.057514245014245, + "grad_norm": 0.7630164623260498, + "learning_rate": 9.563417678074319e-05, + "loss": 0.8547, + "step": 11557 + }, + { + "epoch": 2.0576923076923075, + "grad_norm": 0.7690725922584534, + "learning_rate": 9.562019269763184e-05, + "loss": 0.9172, + "step": 11558 + }, + { + "epoch": 2.0578703703703702, + "grad_norm": 0.81644207239151, + "learning_rate": 9.560620870033367e-05, + "loss": 0.811, + "step": 11559 + }, + { + "epoch": 2.058048433048433, + "grad_norm": 0.8240723013877869, + "learning_rate": 9.559222478912273e-05, + "loss": 0.9094, + "step": 11560 + }, + { + "epoch": 2.0582264957264957, + "grad_norm": 0.7168204188346863, + "learning_rate": 9.557824096427297e-05, + "loss": 1.0617, + "step": 11561 + }, + { + "epoch": 2.0584045584045585, + "grad_norm": 0.6648391485214233, + "learning_rate": 9.556425722605846e-05, + "loss": 0.6556, + "step": 11562 + }, + { + "epoch": 2.0585826210826212, + "grad_norm": 0.7291145324707031, + "learning_rate": 9.555027357475305e-05, + "loss": 0.784, + 
"step": 11563 + }, + { + "epoch": 2.0587606837606836, + "grad_norm": 0.6910824775695801, + "learning_rate": 9.553629001063079e-05, + "loss": 0.9332, + "step": 11564 + }, + { + "epoch": 2.0589387464387463, + "grad_norm": 0.757247805595398, + "learning_rate": 9.552230653396566e-05, + "loss": 0.6598, + "step": 11565 + }, + { + "epoch": 2.059116809116809, + "grad_norm": 0.7778435349464417, + "learning_rate": 9.550832314503163e-05, + "loss": 0.8899, + "step": 11566 + }, + { + "epoch": 2.059294871794872, + "grad_norm": 0.7827669978141785, + "learning_rate": 9.54943398441027e-05, + "loss": 0.8036, + "step": 11567 + }, + { + "epoch": 2.0594729344729346, + "grad_norm": 0.7462462186813354, + "learning_rate": 9.54803566314528e-05, + "loss": 0.9306, + "step": 11568 + }, + { + "epoch": 2.0596509971509973, + "grad_norm": 0.8088639974594116, + "learning_rate": 9.546637350735597e-05, + "loss": 0.8766, + "step": 11569 + }, + { + "epoch": 2.0598290598290596, + "grad_norm": 0.6477743983268738, + "learning_rate": 9.545239047208607e-05, + "loss": 0.7239, + "step": 11570 + }, + { + "epoch": 2.0600071225071224, + "grad_norm": 0.7535004615783691, + "learning_rate": 9.543840752591718e-05, + "loss": 0.8891, + "step": 11571 + }, + { + "epoch": 2.060185185185185, + "grad_norm": 0.7085242867469788, + "learning_rate": 9.542442466912316e-05, + "loss": 0.8105, + "step": 11572 + }, + { + "epoch": 2.060363247863248, + "grad_norm": 0.8129137754440308, + "learning_rate": 9.541044190197811e-05, + "loss": 0.6955, + "step": 11573 + }, + { + "epoch": 2.0605413105413106, + "grad_norm": 0.7160677909851074, + "learning_rate": 9.539645922475586e-05, + "loss": 0.833, + "step": 11574 + }, + { + "epoch": 2.0607193732193734, + "grad_norm": 0.6983035206794739, + "learning_rate": 9.538247663773044e-05, + "loss": 0.6439, + "step": 11575 + }, + { + "epoch": 2.0608974358974357, + "grad_norm": 0.8732622861862183, + "learning_rate": 9.536849414117578e-05, + "loss": 0.7763, + "step": 11576 + }, + { + "epoch": 
2.0610754985754984, + "grad_norm": 0.7745480537414551, + "learning_rate": 9.535451173536591e-05, + "loss": 0.8272, + "step": 11577 + }, + { + "epoch": 2.061253561253561, + "grad_norm": 0.8210037350654602, + "learning_rate": 9.53405294205747e-05, + "loss": 0.9539, + "step": 11578 + }, + { + "epoch": 2.061431623931624, + "grad_norm": 0.6742323637008667, + "learning_rate": 9.532654719707617e-05, + "loss": 0.6525, + "step": 11579 + }, + { + "epoch": 2.0616096866096867, + "grad_norm": 0.8312603831291199, + "learning_rate": 9.531256506514418e-05, + "loss": 0.7776, + "step": 11580 + }, + { + "epoch": 2.0617877492877494, + "grad_norm": 0.7817347049713135, + "learning_rate": 9.529858302505278e-05, + "loss": 1.0148, + "step": 11581 + }, + { + "epoch": 2.0619658119658117, + "grad_norm": 0.751153290271759, + "learning_rate": 9.528460107707584e-05, + "loss": 0.8064, + "step": 11582 + }, + { + "epoch": 2.0621438746438745, + "grad_norm": 0.7483627200126648, + "learning_rate": 9.527061922148737e-05, + "loss": 0.8706, + "step": 11583 + }, + { + "epoch": 2.0623219373219372, + "grad_norm": 0.7044979929924011, + "learning_rate": 9.525663745856132e-05, + "loss": 0.8008, + "step": 11584 + }, + { + "epoch": 2.0625, + "grad_norm": 0.8249054551124573, + "learning_rate": 9.524265578857157e-05, + "loss": 0.9339, + "step": 11585 + }, + { + "epoch": 2.0626780626780628, + "grad_norm": 0.7184668183326721, + "learning_rate": 9.522867421179211e-05, + "loss": 0.9191, + "step": 11586 + }, + { + "epoch": 2.0628561253561255, + "grad_norm": 0.8135001063346863, + "learning_rate": 9.521469272849685e-05, + "loss": 0.966, + "step": 11587 + }, + { + "epoch": 2.0630341880341883, + "grad_norm": 0.8151242733001709, + "learning_rate": 9.520071133895978e-05, + "loss": 0.7947, + "step": 11588 + }, + { + "epoch": 2.0632122507122506, + "grad_norm": 0.8044771552085876, + "learning_rate": 9.518673004345477e-05, + "loss": 0.893, + "step": 11589 + }, + { + "epoch": 2.0633903133903133, + "grad_norm": 0.730505645275116, 
+ "learning_rate": 9.517274884225581e-05, + "loss": 0.8691, + "step": 11590 + }, + { + "epoch": 2.063568376068376, + "grad_norm": 0.7419933676719666, + "learning_rate": 9.515876773563678e-05, + "loss": 0.809, + "step": 11591 + }, + { + "epoch": 2.063746438746439, + "grad_norm": 0.7809683084487915, + "learning_rate": 9.514478672387169e-05, + "loss": 0.6926, + "step": 11592 + }, + { + "epoch": 2.0639245014245016, + "grad_norm": 1.0065315961837769, + "learning_rate": 9.513080580723435e-05, + "loss": 0.7506, + "step": 11593 + }, + { + "epoch": 2.064102564102564, + "grad_norm": 0.7424543499946594, + "learning_rate": 9.511682498599883e-05, + "loss": 0.9158, + "step": 11594 + }, + { + "epoch": 2.0642806267806266, + "grad_norm": 0.6907097697257996, + "learning_rate": 9.510284426043893e-05, + "loss": 0.8795, + "step": 11595 + }, + { + "epoch": 2.0644586894586894, + "grad_norm": 0.7849169969558716, + "learning_rate": 9.508886363082864e-05, + "loss": 1.0065, + "step": 11596 + }, + { + "epoch": 2.064636752136752, + "grad_norm": 0.7421438694000244, + "learning_rate": 9.507488309744183e-05, + "loss": 0.7574, + "step": 11597 + }, + { + "epoch": 2.064814814814815, + "grad_norm": 0.6636283993721008, + "learning_rate": 9.506090266055252e-05, + "loss": 0.7489, + "step": 11598 + }, + { + "epoch": 2.0649928774928776, + "grad_norm": 0.7133244872093201, + "learning_rate": 9.504692232043452e-05, + "loss": 0.7652, + "step": 11599 + }, + { + "epoch": 2.0651709401709404, + "grad_norm": 0.7891597151756287, + "learning_rate": 9.50329420773618e-05, + "loss": 0.7268, + "step": 11600 + }, + { + "epoch": 2.0653490028490027, + "grad_norm": 0.8578699827194214, + "learning_rate": 9.501896193160822e-05, + "loss": 0.9872, + "step": 11601 + }, + { + "epoch": 2.0655270655270654, + "grad_norm": 0.7071980834007263, + "learning_rate": 9.500498188344777e-05, + "loss": 0.7278, + "step": 11602 + }, + { + "epoch": 2.065705128205128, + "grad_norm": 0.8434318900108337, + "learning_rate": 9.499100193315436e-05, + 
"loss": 0.8811, + "step": 11603 + }, + { + "epoch": 2.065883190883191, + "grad_norm": 0.7429414391517639, + "learning_rate": 9.49770220810018e-05, + "loss": 0.8256, + "step": 11604 + }, + { + "epoch": 2.0660612535612537, + "grad_norm": 0.7059712409973145, + "learning_rate": 9.496304232726412e-05, + "loss": 0.8012, + "step": 11605 + }, + { + "epoch": 2.0662393162393164, + "grad_norm": 0.7095850706100464, + "learning_rate": 9.49490626722151e-05, + "loss": 0.8767, + "step": 11606 + }, + { + "epoch": 2.0664173789173788, + "grad_norm": 0.8135038018226624, + "learning_rate": 9.493508311612874e-05, + "loss": 1.1402, + "step": 11607 + }, + { + "epoch": 2.0665954415954415, + "grad_norm": 0.9023036360740662, + "learning_rate": 9.492110365927888e-05, + "loss": 0.7307, + "step": 11608 + }, + { + "epoch": 2.0667735042735043, + "grad_norm": 0.6990833282470703, + "learning_rate": 9.490712430193949e-05, + "loss": 0.7996, + "step": 11609 + }, + { + "epoch": 2.066951566951567, + "grad_norm": 0.7765957713127136, + "learning_rate": 9.489314504438437e-05, + "loss": 0.9721, + "step": 11610 + }, + { + "epoch": 2.0671296296296298, + "grad_norm": 0.883575975894928, + "learning_rate": 9.487916588688749e-05, + "loss": 0.8778, + "step": 11611 + }, + { + "epoch": 2.0673076923076925, + "grad_norm": 0.7226536870002747, + "learning_rate": 9.48651868297227e-05, + "loss": 1.0438, + "step": 11612 + }, + { + "epoch": 2.067485754985755, + "grad_norm": 0.7399018406867981, + "learning_rate": 9.485120787316394e-05, + "loss": 0.8154, + "step": 11613 + }, + { + "epoch": 2.0676638176638176, + "grad_norm": 1.0130186080932617, + "learning_rate": 9.483722901748502e-05, + "loss": 0.9832, + "step": 11614 + }, + { + "epoch": 2.0678418803418803, + "grad_norm": 0.8163331151008606, + "learning_rate": 9.482325026295993e-05, + "loss": 0.8854, + "step": 11615 + }, + { + "epoch": 2.068019943019943, + "grad_norm": 0.7165096998214722, + "learning_rate": 9.480927160986244e-05, + "loss": 0.7636, + "step": 11616 + }, + { + 
"epoch": 2.068198005698006, + "grad_norm": 0.8579450845718384, + "learning_rate": 9.479529305846652e-05, + "loss": 0.9105, + "step": 11617 + }, + { + "epoch": 2.0683760683760686, + "grad_norm": 0.7062679529190063, + "learning_rate": 9.4781314609046e-05, + "loss": 0.7601, + "step": 11618 + }, + { + "epoch": 2.068554131054131, + "grad_norm": 0.8178739547729492, + "learning_rate": 9.476733626187483e-05, + "loss": 0.8393, + "step": 11619 + }, + { + "epoch": 2.0687321937321936, + "grad_norm": 0.6667241454124451, + "learning_rate": 9.475335801722678e-05, + "loss": 0.8791, + "step": 11620 + }, + { + "epoch": 2.0689102564102564, + "grad_norm": 0.6603145599365234, + "learning_rate": 9.47393798753758e-05, + "loss": 0.7073, + "step": 11621 + }, + { + "epoch": 2.069088319088319, + "grad_norm": 0.7719821333885193, + "learning_rate": 9.472540183659573e-05, + "loss": 0.9195, + "step": 11622 + }, + { + "epoch": 2.069266381766382, + "grad_norm": 0.8059320449829102, + "learning_rate": 9.471142390116045e-05, + "loss": 0.927, + "step": 11623 + }, + { + "epoch": 2.0694444444444446, + "grad_norm": 0.6513992547988892, + "learning_rate": 9.469744606934388e-05, + "loss": 0.857, + "step": 11624 + }, + { + "epoch": 2.069622507122507, + "grad_norm": 0.6948497295379639, + "learning_rate": 9.468346834141979e-05, + "loss": 0.65, + "step": 11625 + }, + { + "epoch": 2.0698005698005697, + "grad_norm": 0.8086618781089783, + "learning_rate": 9.466949071766213e-05, + "loss": 0.8328, + "step": 11626 + }, + { + "epoch": 2.0699786324786325, + "grad_norm": 0.794731616973877, + "learning_rate": 9.465551319834468e-05, + "loss": 0.8641, + "step": 11627 + }, + { + "epoch": 2.070156695156695, + "grad_norm": 0.7312739491462708, + "learning_rate": 9.46415357837414e-05, + "loss": 0.7878, + "step": 11628 + }, + { + "epoch": 2.070334757834758, + "grad_norm": 0.8025211691856384, + "learning_rate": 9.462755847412606e-05, + "loss": 0.8624, + "step": 11629 + }, + { + "epoch": 2.0705128205128207, + "grad_norm": 
0.7296801209449768, + "learning_rate": 9.461358126977259e-05, + "loss": 0.7299, + "step": 11630 + }, + { + "epoch": 2.070690883190883, + "grad_norm": 0.7176340222358704, + "learning_rate": 9.459960417095477e-05, + "loss": 0.7374, + "step": 11631 + }, + { + "epoch": 2.0708689458689458, + "grad_norm": 0.7656565308570862, + "learning_rate": 9.45856271779465e-05, + "loss": 0.7791, + "step": 11632 + }, + { + "epoch": 2.0710470085470085, + "grad_norm": 0.7232711315155029, + "learning_rate": 9.457165029102159e-05, + "loss": 0.85, + "step": 11633 + }, + { + "epoch": 2.0712250712250713, + "grad_norm": 0.7342440485954285, + "learning_rate": 9.455767351045397e-05, + "loss": 0.8423, + "step": 11634 + }, + { + "epoch": 2.071403133903134, + "grad_norm": 0.7844834923744202, + "learning_rate": 9.45436968365174e-05, + "loss": 0.9179, + "step": 11635 + }, + { + "epoch": 2.0715811965811968, + "grad_norm": 0.8880203366279602, + "learning_rate": 9.452972026948575e-05, + "loss": 0.7139, + "step": 11636 + }, + { + "epoch": 2.071759259259259, + "grad_norm": 0.7611206769943237, + "learning_rate": 9.451574380963286e-05, + "loss": 0.8915, + "step": 11637 + }, + { + "epoch": 2.071937321937322, + "grad_norm": 0.8123503923416138, + "learning_rate": 9.450176745723262e-05, + "loss": 0.7011, + "step": 11638 + }, + { + "epoch": 2.0721153846153846, + "grad_norm": 0.7703253030776978, + "learning_rate": 9.448779121255879e-05, + "loss": 0.8493, + "step": 11639 + }, + { + "epoch": 2.0722934472934473, + "grad_norm": 0.755836009979248, + "learning_rate": 9.447381507588527e-05, + "loss": 0.9145, + "step": 11640 + }, + { + "epoch": 2.07247150997151, + "grad_norm": 0.7879568338394165, + "learning_rate": 9.445983904748583e-05, + "loss": 0.7761, + "step": 11641 + }, + { + "epoch": 2.072649572649573, + "grad_norm": 0.6695574522018433, + "learning_rate": 9.444586312763434e-05, + "loss": 0.8594, + "step": 11642 + }, + { + "epoch": 2.072827635327635, + "grad_norm": 0.6734640002250671, + "learning_rate": 
9.443188731660462e-05, + "loss": 0.7324, + "step": 11643 + }, + { + "epoch": 2.073005698005698, + "grad_norm": 0.7823841571807861, + "learning_rate": 9.441791161467051e-05, + "loss": 1.0223, + "step": 11644 + }, + { + "epoch": 2.0731837606837606, + "grad_norm": 0.8152045011520386, + "learning_rate": 9.440393602210585e-05, + "loss": 1.0364, + "step": 11645 + }, + { + "epoch": 2.0733618233618234, + "grad_norm": 0.8664864897727966, + "learning_rate": 9.438996053918441e-05, + "loss": 0.7607, + "step": 11646 + }, + { + "epoch": 2.073539886039886, + "grad_norm": 0.7949544787406921, + "learning_rate": 9.437598516618006e-05, + "loss": 0.7644, + "step": 11647 + }, + { + "epoch": 2.073717948717949, + "grad_norm": 0.767045259475708, + "learning_rate": 9.436200990336657e-05, + "loss": 0.7563, + "step": 11648 + }, + { + "epoch": 2.073896011396011, + "grad_norm": 0.669129490852356, + "learning_rate": 9.434803475101782e-05, + "loss": 0.7644, + "step": 11649 + }, + { + "epoch": 2.074074074074074, + "grad_norm": 0.7969587445259094, + "learning_rate": 9.433405970940755e-05, + "loss": 1.0249, + "step": 11650 + }, + { + "epoch": 2.0742521367521367, + "grad_norm": 0.6744855642318726, + "learning_rate": 9.432008477880966e-05, + "loss": 0.7478, + "step": 11651 + }, + { + "epoch": 2.0744301994301995, + "grad_norm": 0.6236920356750488, + "learning_rate": 9.430610995949786e-05, + "loss": 0.7309, + "step": 11652 + }, + { + "epoch": 2.074608262108262, + "grad_norm": 0.7952008843421936, + "learning_rate": 9.429213525174603e-05, + "loss": 0.927, + "step": 11653 + }, + { + "epoch": 2.074786324786325, + "grad_norm": 0.7075965404510498, + "learning_rate": 9.427816065582792e-05, + "loss": 0.8494, + "step": 11654 + }, + { + "epoch": 2.0749643874643873, + "grad_norm": 0.8018102049827576, + "learning_rate": 9.426418617201744e-05, + "loss": 0.9261, + "step": 11655 + }, + { + "epoch": 2.07514245014245, + "grad_norm": 0.7155446410179138, + "learning_rate": 9.425021180058824e-05, + "loss": 0.8296, + 
"step": 11656 + }, + { + "epoch": 2.0753205128205128, + "grad_norm": 0.6611294150352478, + "learning_rate": 9.423623754181425e-05, + "loss": 0.687, + "step": 11657 + }, + { + "epoch": 2.0754985754985755, + "grad_norm": 0.706280529499054, + "learning_rate": 9.422226339596917e-05, + "loss": 0.8664, + "step": 11658 + }, + { + "epoch": 2.0756766381766383, + "grad_norm": 0.7512072324752808, + "learning_rate": 9.420828936332687e-05, + "loss": 0.9074, + "step": 11659 + }, + { + "epoch": 2.075854700854701, + "grad_norm": 0.8833743333816528, + "learning_rate": 9.419431544416108e-05, + "loss": 1.0541, + "step": 11660 + }, + { + "epoch": 2.0760327635327633, + "grad_norm": 0.6991413235664368, + "learning_rate": 9.418034163874564e-05, + "loss": 0.7543, + "step": 11661 + }, + { + "epoch": 2.076210826210826, + "grad_norm": 0.784294605255127, + "learning_rate": 9.41663679473543e-05, + "loss": 0.8156, + "step": 11662 + }, + { + "epoch": 2.076388888888889, + "grad_norm": 0.7716241478919983, + "learning_rate": 9.415239437026086e-05, + "loss": 0.9613, + "step": 11663 + }, + { + "epoch": 2.0765669515669516, + "grad_norm": 0.8247698545455933, + "learning_rate": 9.413842090773914e-05, + "loss": 0.8811, + "step": 11664 + }, + { + "epoch": 2.0767450142450143, + "grad_norm": 0.7988204956054688, + "learning_rate": 9.412444756006283e-05, + "loss": 1.017, + "step": 11665 + }, + { + "epoch": 2.076923076923077, + "grad_norm": 0.8069472908973694, + "learning_rate": 9.411047432750583e-05, + "loss": 0.8292, + "step": 11666 + }, + { + "epoch": 2.0771011396011394, + "grad_norm": 0.8177345991134644, + "learning_rate": 9.40965012103418e-05, + "loss": 0.8004, + "step": 11667 + }, + { + "epoch": 2.077279202279202, + "grad_norm": 0.8589172959327698, + "learning_rate": 9.40825282088446e-05, + "loss": 0.8274, + "step": 11668 + }, + { + "epoch": 2.077457264957265, + "grad_norm": 0.677379846572876, + "learning_rate": 9.406855532328792e-05, + "loss": 0.7044, + "step": 11669 + }, + { + "epoch": 
2.0776353276353277, + "grad_norm": 0.9417888522148132, + "learning_rate": 9.405458255394564e-05, + "loss": 0.8418, + "step": 11670 + }, + { + "epoch": 2.0778133903133904, + "grad_norm": 0.7226679921150208, + "learning_rate": 9.404060990109141e-05, + "loss": 0.8496, + "step": 11671 + }, + { + "epoch": 2.077991452991453, + "grad_norm": 0.7451614737510681, + "learning_rate": 9.402663736499909e-05, + "loss": 0.7569, + "step": 11672 + }, + { + "epoch": 2.0781695156695155, + "grad_norm": 0.6516944169998169, + "learning_rate": 9.401266494594235e-05, + "loss": 0.5591, + "step": 11673 + }, + { + "epoch": 2.078347578347578, + "grad_norm": 0.7473219633102417, + "learning_rate": 9.399869264419507e-05, + "loss": 1.0098, + "step": 11674 + }, + { + "epoch": 2.078525641025641, + "grad_norm": 0.8346691131591797, + "learning_rate": 9.398472046003088e-05, + "loss": 1.2654, + "step": 11675 + }, + { + "epoch": 2.0787037037037037, + "grad_norm": 0.8611979484558105, + "learning_rate": 9.397074839372366e-05, + "loss": 0.571, + "step": 11676 + }, + { + "epoch": 2.0788817663817665, + "grad_norm": 0.8093259334564209, + "learning_rate": 9.395677644554705e-05, + "loss": 1.0383, + "step": 11677 + }, + { + "epoch": 2.0790598290598292, + "grad_norm": 0.7954222559928894, + "learning_rate": 9.394280461577488e-05, + "loss": 0.8078, + "step": 11678 + }, + { + "epoch": 2.0792378917378915, + "grad_norm": 0.8380635380744934, + "learning_rate": 9.392883290468083e-05, + "loss": 1.0368, + "step": 11679 + }, + { + "epoch": 2.0794159544159543, + "grad_norm": 0.8427146077156067, + "learning_rate": 9.391486131253874e-05, + "loss": 0.8638, + "step": 11680 + }, + { + "epoch": 2.079594017094017, + "grad_norm": 0.7211564779281616, + "learning_rate": 9.390088983962227e-05, + "loss": 0.8211, + "step": 11681 + }, + { + "epoch": 2.07977207977208, + "grad_norm": 0.7480773329734802, + "learning_rate": 9.388691848620517e-05, + "loss": 0.9313, + "step": 11682 + }, + { + "epoch": 2.0799501424501425, + "grad_norm": 
0.8421902060508728, + "learning_rate": 9.387294725256123e-05, + "loss": 0.8808, + "step": 11683 + }, + { + "epoch": 2.0801282051282053, + "grad_norm": 0.7753815650939941, + "learning_rate": 9.385897613896416e-05, + "loss": 0.7738, + "step": 11684 + }, + { + "epoch": 2.0803062678062676, + "grad_norm": 0.8053030967712402, + "learning_rate": 9.384500514568773e-05, + "loss": 0.9194, + "step": 11685 + }, + { + "epoch": 2.0804843304843303, + "grad_norm": 0.7628602981567383, + "learning_rate": 9.383103427300559e-05, + "loss": 0.7866, + "step": 11686 + }, + { + "epoch": 2.080662393162393, + "grad_norm": 0.7087932825088501, + "learning_rate": 9.381706352119156e-05, + "loss": 0.8817, + "step": 11687 + }, + { + "epoch": 2.080840455840456, + "grad_norm": 0.8687152862548828, + "learning_rate": 9.380309289051929e-05, + "loss": 0.8902, + "step": 11688 + }, + { + "epoch": 2.0810185185185186, + "grad_norm": 0.8181152939796448, + "learning_rate": 9.378912238126256e-05, + "loss": 0.9244, + "step": 11689 + }, + { + "epoch": 2.0811965811965814, + "grad_norm": 0.7961983680725098, + "learning_rate": 9.377515199369506e-05, + "loss": 0.714, + "step": 11690 + }, + { + "epoch": 2.0813746438746437, + "grad_norm": 0.8307793736457825, + "learning_rate": 9.376118172809056e-05, + "loss": 0.9573, + "step": 11691 + }, + { + "epoch": 2.0815527065527064, + "grad_norm": 0.7349256277084351, + "learning_rate": 9.374721158472269e-05, + "loss": 0.7533, + "step": 11692 + }, + { + "epoch": 2.081730769230769, + "grad_norm": 0.7625117897987366, + "learning_rate": 9.373324156386526e-05, + "loss": 0.8387, + "step": 11693 + }, + { + "epoch": 2.081908831908832, + "grad_norm": 0.9537683129310608, + "learning_rate": 9.371927166579191e-05, + "loss": 0.9444, + "step": 11694 + }, + { + "epoch": 2.0820868945868947, + "grad_norm": 0.7170497179031372, + "learning_rate": 9.370530189077644e-05, + "loss": 0.9132, + "step": 11695 + }, + { + "epoch": 2.0822649572649574, + "grad_norm": 0.7750041484832764, + "learning_rate": 
9.369133223909246e-05, + "loss": 0.6635, + "step": 11696 + }, + { + "epoch": 2.08244301994302, + "grad_norm": 0.8990386128425598, + "learning_rate": 9.367736271101373e-05, + "loss": 0.8692, + "step": 11697 + }, + { + "epoch": 2.0826210826210825, + "grad_norm": 0.5909343361854553, + "learning_rate": 9.366339330681393e-05, + "loss": 0.6811, + "step": 11698 + }, + { + "epoch": 2.0827991452991452, + "grad_norm": 0.7783302068710327, + "learning_rate": 9.364942402676682e-05, + "loss": 1.1024, + "step": 11699 + }, + { + "epoch": 2.082977207977208, + "grad_norm": 0.8926466703414917, + "learning_rate": 9.3635454871146e-05, + "loss": 0.971, + "step": 11700 + }, + { + "epoch": 2.0831552706552707, + "grad_norm": 0.7374816536903381, + "learning_rate": 9.362148584022527e-05, + "loss": 0.7151, + "step": 11701 + }, + { + "epoch": 2.0833333333333335, + "grad_norm": 0.7491161227226257, + "learning_rate": 9.360751693427823e-05, + "loss": 0.9213, + "step": 11702 + }, + { + "epoch": 2.083511396011396, + "grad_norm": 0.726859986782074, + "learning_rate": 9.359354815357862e-05, + "loss": 0.8412, + "step": 11703 + }, + { + "epoch": 2.0836894586894585, + "grad_norm": 0.756703794002533, + "learning_rate": 9.357957949840015e-05, + "loss": 0.8074, + "step": 11704 + }, + { + "epoch": 2.0838675213675213, + "grad_norm": 0.8475984334945679, + "learning_rate": 9.356561096901646e-05, + "loss": 0.8926, + "step": 11705 + }, + { + "epoch": 2.084045584045584, + "grad_norm": 0.9776971936225891, + "learning_rate": 9.355164256570129e-05, + "loss": 0.8543, + "step": 11706 + }, + { + "epoch": 2.084223646723647, + "grad_norm": 0.7185834646224976, + "learning_rate": 9.353767428872826e-05, + "loss": 0.6946, + "step": 11707 + }, + { + "epoch": 2.0844017094017095, + "grad_norm": 0.7075535655021667, + "learning_rate": 9.352370613837109e-05, + "loss": 0.7171, + "step": 11708 + }, + { + "epoch": 2.0845797720797723, + "grad_norm": 0.8549726009368896, + "learning_rate": 9.350973811490343e-05, + "loss": 0.9028, + 
"step": 11709 + }, + { + "epoch": 2.0847578347578346, + "grad_norm": 0.731235682964325, + "learning_rate": 9.3495770218599e-05, + "loss": 0.7703, + "step": 11710 + }, + { + "epoch": 2.0849358974358974, + "grad_norm": 0.8660612106323242, + "learning_rate": 9.34818024497314e-05, + "loss": 0.8464, + "step": 11711 + }, + { + "epoch": 2.08511396011396, + "grad_norm": 0.7687711715698242, + "learning_rate": 9.346783480857439e-05, + "loss": 0.8199, + "step": 11712 + }, + { + "epoch": 2.085292022792023, + "grad_norm": 0.6802884936332703, + "learning_rate": 9.345386729540155e-05, + "loss": 0.7537, + "step": 11713 + }, + { + "epoch": 2.0854700854700856, + "grad_norm": 0.7688863277435303, + "learning_rate": 9.34398999104866e-05, + "loss": 0.8374, + "step": 11714 + }, + { + "epoch": 2.0856481481481484, + "grad_norm": 0.7872602939605713, + "learning_rate": 9.342593265410315e-05, + "loss": 0.8786, + "step": 11715 + }, + { + "epoch": 2.0858262108262107, + "grad_norm": 0.9752106666564941, + "learning_rate": 9.341196552652496e-05, + "loss": 0.9572, + "step": 11716 + }, + { + "epoch": 2.0860042735042734, + "grad_norm": 0.7023422718048096, + "learning_rate": 9.339799852802555e-05, + "loss": 0.7613, + "step": 11717 + }, + { + "epoch": 2.086182336182336, + "grad_norm": 0.8366875052452087, + "learning_rate": 9.338403165887868e-05, + "loss": 0.8206, + "step": 11718 + }, + { + "epoch": 2.086360398860399, + "grad_norm": 0.8534985184669495, + "learning_rate": 9.337006491935794e-05, + "loss": 0.8549, + "step": 11719 + }, + { + "epoch": 2.0865384615384617, + "grad_norm": 0.7902935743331909, + "learning_rate": 9.335609830973707e-05, + "loss": 0.8399, + "step": 11720 + }, + { + "epoch": 2.0867165242165244, + "grad_norm": 0.8064647316932678, + "learning_rate": 9.334213183028958e-05, + "loss": 0.7978, + "step": 11721 + }, + { + "epoch": 2.0868945868945867, + "grad_norm": 0.816412627696991, + "learning_rate": 9.332816548128919e-05, + "loss": 0.8814, + "step": 11722 + }, + { + "epoch": 
2.0870726495726495, + "grad_norm": 0.7778908610343933, + "learning_rate": 9.33141992630096e-05, + "loss": 0.9916, + "step": 11723 + }, + { + "epoch": 2.0872507122507122, + "grad_norm": 0.7899400591850281, + "learning_rate": 9.330023317572433e-05, + "loss": 0.5682, + "step": 11724 + }, + { + "epoch": 2.087428774928775, + "grad_norm": 0.6770033836364746, + "learning_rate": 9.32862672197071e-05, + "loss": 0.7327, + "step": 11725 + }, + { + "epoch": 2.0876068376068377, + "grad_norm": 0.8385946750640869, + "learning_rate": 9.327230139523148e-05, + "loss": 0.7793, + "step": 11726 + }, + { + "epoch": 2.0877849002849005, + "grad_norm": 0.708091139793396, + "learning_rate": 9.32583357025712e-05, + "loss": 0.6199, + "step": 11727 + }, + { + "epoch": 2.087962962962963, + "grad_norm": 0.8172122836112976, + "learning_rate": 9.324437014199978e-05, + "loss": 0.749, + "step": 11728 + }, + { + "epoch": 2.0881410256410255, + "grad_norm": 0.818324625492096, + "learning_rate": 9.323040471379091e-05, + "loss": 0.6605, + "step": 11729 + }, + { + "epoch": 2.0883190883190883, + "grad_norm": 0.9074803590774536, + "learning_rate": 9.321643941821819e-05, + "loss": 0.9231, + "step": 11730 + }, + { + "epoch": 2.088497150997151, + "grad_norm": 0.7559560537338257, + "learning_rate": 9.320247425555527e-05, + "loss": 0.7291, + "step": 11731 + }, + { + "epoch": 2.088675213675214, + "grad_norm": 0.8001563549041748, + "learning_rate": 9.318850922607571e-05, + "loss": 0.7781, + "step": 11732 + }, + { + "epoch": 2.0888532763532766, + "grad_norm": 0.7365888953208923, + "learning_rate": 9.31745443300532e-05, + "loss": 0.8243, + "step": 11733 + }, + { + "epoch": 2.089031339031339, + "grad_norm": 0.7861692309379578, + "learning_rate": 9.316057956776126e-05, + "loss": 0.7568, + "step": 11734 + }, + { + "epoch": 2.0892094017094016, + "grad_norm": 0.8399034738540649, + "learning_rate": 9.314661493947363e-05, + "loss": 0.8019, + "step": 11735 + }, + { + "epoch": 2.0893874643874644, + "grad_norm": 
0.7718507051467896, + "learning_rate": 9.313265044546378e-05, + "loss": 0.909, + "step": 11736 + }, + { + "epoch": 2.089565527065527, + "grad_norm": 0.8940733671188354, + "learning_rate": 9.311868608600543e-05, + "loss": 0.7154, + "step": 11737 + }, + { + "epoch": 2.08974358974359, + "grad_norm": 0.8506718873977661, + "learning_rate": 9.31047218613721e-05, + "loss": 0.8367, + "step": 11738 + }, + { + "epoch": 2.0899216524216526, + "grad_norm": 0.8431367874145508, + "learning_rate": 9.309075777183743e-05, + "loss": 0.9532, + "step": 11739 + }, + { + "epoch": 2.090099715099715, + "grad_norm": 0.7683414220809937, + "learning_rate": 9.307679381767499e-05, + "loss": 0.9301, + "step": 11740 + }, + { + "epoch": 2.0902777777777777, + "grad_norm": 0.7601380348205566, + "learning_rate": 9.306282999915839e-05, + "loss": 0.8462, + "step": 11741 + }, + { + "epoch": 2.0904558404558404, + "grad_norm": 0.7531782388687134, + "learning_rate": 9.304886631656127e-05, + "loss": 0.8012, + "step": 11742 + }, + { + "epoch": 2.090633903133903, + "grad_norm": 0.7869617938995361, + "learning_rate": 9.303490277015714e-05, + "loss": 0.6645, + "step": 11743 + }, + { + "epoch": 2.090811965811966, + "grad_norm": 0.8042751550674438, + "learning_rate": 9.302093936021964e-05, + "loss": 1.1078, + "step": 11744 + }, + { + "epoch": 2.0909900284900287, + "grad_norm": 0.750350296497345, + "learning_rate": 9.300697608702231e-05, + "loss": 0.8552, + "step": 11745 + }, + { + "epoch": 2.091168091168091, + "grad_norm": 0.7624406814575195, + "learning_rate": 9.29930129508388e-05, + "loss": 0.861, + "step": 11746 + }, + { + "epoch": 2.0913461538461537, + "grad_norm": 0.7634474635124207, + "learning_rate": 9.29790499519426e-05, + "loss": 0.9483, + "step": 11747 + }, + { + "epoch": 2.0915242165242165, + "grad_norm": 0.7312899231910706, + "learning_rate": 9.296508709060738e-05, + "loss": 0.655, + "step": 11748 + }, + { + "epoch": 2.0917022792022792, + "grad_norm": 0.8181857466697693, + "learning_rate": 
9.295112436710662e-05, + "loss": 0.7912, + "step": 11749 + }, + { + "epoch": 2.091880341880342, + "grad_norm": 0.6349542737007141, + "learning_rate": 9.293716178171396e-05, + "loss": 0.6268, + "step": 11750 + }, + { + "epoch": 2.0920584045584047, + "grad_norm": 0.8832548260688782, + "learning_rate": 9.292319933470291e-05, + "loss": 0.7805, + "step": 11751 + }, + { + "epoch": 2.092236467236467, + "grad_norm": 0.7251408100128174, + "learning_rate": 9.290923702634712e-05, + "loss": 0.7553, + "step": 11752 + }, + { + "epoch": 2.09241452991453, + "grad_norm": 0.8794457912445068, + "learning_rate": 9.289527485692006e-05, + "loss": 0.9187, + "step": 11753 + }, + { + "epoch": 2.0925925925925926, + "grad_norm": 0.7768839597702026, + "learning_rate": 9.288131282669534e-05, + "loss": 0.9267, + "step": 11754 + }, + { + "epoch": 2.0927706552706553, + "grad_norm": 0.744144856929779, + "learning_rate": 9.28673509359465e-05, + "loss": 0.714, + "step": 11755 + }, + { + "epoch": 2.092948717948718, + "grad_norm": 0.9117433428764343, + "learning_rate": 9.285338918494714e-05, + "loss": 0.9965, + "step": 11756 + }, + { + "epoch": 2.093126780626781, + "grad_norm": 0.8105267286300659, + "learning_rate": 9.283942757397073e-05, + "loss": 0.7517, + "step": 11757 + }, + { + "epoch": 2.093304843304843, + "grad_norm": 0.7348153591156006, + "learning_rate": 9.28254661032909e-05, + "loss": 0.7101, + "step": 11758 + }, + { + "epoch": 2.093482905982906, + "grad_norm": 0.7625702023506165, + "learning_rate": 9.281150477318113e-05, + "loss": 0.6863, + "step": 11759 + }, + { + "epoch": 2.0936609686609686, + "grad_norm": 0.7987569570541382, + "learning_rate": 9.2797543583915e-05, + "loss": 0.8848, + "step": 11760 + }, + { + "epoch": 2.0938390313390314, + "grad_norm": 0.706235408782959, + "learning_rate": 9.278358253576601e-05, + "loss": 0.7375, + "step": 11761 + }, + { + "epoch": 2.094017094017094, + "grad_norm": 0.9716742038726807, + "learning_rate": 9.276962162900774e-05, + "loss": 0.8602, + "step": 
11762 + }, + { + "epoch": 2.094195156695157, + "grad_norm": 0.7711777687072754, + "learning_rate": 9.275566086391377e-05, + "loss": 0.8553, + "step": 11763 + }, + { + "epoch": 2.094373219373219, + "grad_norm": 0.8542511463165283, + "learning_rate": 9.274170024075751e-05, + "loss": 0.8412, + "step": 11764 + }, + { + "epoch": 2.094551282051282, + "grad_norm": 0.8255360126495361, + "learning_rate": 9.272773975981259e-05, + "loss": 1.0245, + "step": 11765 + }, + { + "epoch": 2.0947293447293447, + "grad_norm": 0.7416045665740967, + "learning_rate": 9.271377942135248e-05, + "loss": 0.57, + "step": 11766 + }, + { + "epoch": 2.0949074074074074, + "grad_norm": 0.8805620670318604, + "learning_rate": 9.269981922565078e-05, + "loss": 1.0262, + "step": 11767 + }, + { + "epoch": 2.09508547008547, + "grad_norm": 0.7293491363525391, + "learning_rate": 9.26858591729809e-05, + "loss": 0.7945, + "step": 11768 + }, + { + "epoch": 2.095263532763533, + "grad_norm": 0.7949206233024597, + "learning_rate": 9.267189926361643e-05, + "loss": 0.7071, + "step": 11769 + }, + { + "epoch": 2.0954415954415953, + "grad_norm": 0.771806538105011, + "learning_rate": 9.265793949783087e-05, + "loss": 0.8125, + "step": 11770 + }, + { + "epoch": 2.095619658119658, + "grad_norm": 0.7256866693496704, + "learning_rate": 9.264397987589776e-05, + "loss": 0.7607, + "step": 11771 + }, + { + "epoch": 2.0957977207977208, + "grad_norm": 0.8175343871116638, + "learning_rate": 9.263002039809055e-05, + "loss": 0.8486, + "step": 11772 + }, + { + "epoch": 2.0959757834757835, + "grad_norm": 0.7618881464004517, + "learning_rate": 9.261606106468282e-05, + "loss": 0.8182, + "step": 11773 + }, + { + "epoch": 2.0961538461538463, + "grad_norm": 0.7574927806854248, + "learning_rate": 9.2602101875948e-05, + "loss": 0.8703, + "step": 11774 + }, + { + "epoch": 2.096331908831909, + "grad_norm": 0.8639108538627625, + "learning_rate": 9.258814283215964e-05, + "loss": 0.9044, + "step": 11775 + }, + { + "epoch": 2.0965099715099713, + 
"grad_norm": 0.7221997380256653, + "learning_rate": 9.25741839335912e-05, + "loss": 0.7599, + "step": 11776 + }, + { + "epoch": 2.096688034188034, + "grad_norm": 0.9379764795303345, + "learning_rate": 9.256022518051626e-05, + "loss": 1.0002, + "step": 11777 + }, + { + "epoch": 2.096866096866097, + "grad_norm": 0.8430935740470886, + "learning_rate": 9.25462665732082e-05, + "loss": 0.7711, + "step": 11778 + }, + { + "epoch": 2.0970441595441596, + "grad_norm": 0.8371061086654663, + "learning_rate": 9.253230811194057e-05, + "loss": 0.9028, + "step": 11779 + }, + { + "epoch": 2.0972222222222223, + "grad_norm": 0.6960258483886719, + "learning_rate": 9.251834979698684e-05, + "loss": 0.7491, + "step": 11780 + }, + { + "epoch": 2.097400284900285, + "grad_norm": 0.7736398577690125, + "learning_rate": 9.25043916286205e-05, + "loss": 0.8985, + "step": 11781 + }, + { + "epoch": 2.0975783475783474, + "grad_norm": 0.6901512145996094, + "learning_rate": 9.249043360711509e-05, + "loss": 0.5881, + "step": 11782 + }, + { + "epoch": 2.09775641025641, + "grad_norm": 0.6741603016853333, + "learning_rate": 9.247647573274397e-05, + "loss": 0.7641, + "step": 11783 + }, + { + "epoch": 2.097934472934473, + "grad_norm": 0.736657440662384, + "learning_rate": 9.246251800578074e-05, + "loss": 0.8286, + "step": 11784 + }, + { + "epoch": 2.0981125356125356, + "grad_norm": 0.8235752582550049, + "learning_rate": 9.244856042649877e-05, + "loss": 0.8835, + "step": 11785 + }, + { + "epoch": 2.0982905982905984, + "grad_norm": 0.8083409667015076, + "learning_rate": 9.243460299517158e-05, + "loss": 0.9032, + "step": 11786 + }, + { + "epoch": 2.098468660968661, + "grad_norm": 0.7650952339172363, + "learning_rate": 9.242064571207262e-05, + "loss": 0.775, + "step": 11787 + }, + { + "epoch": 2.0986467236467234, + "grad_norm": 0.7961280345916748, + "learning_rate": 9.24066885774754e-05, + "loss": 0.6308, + "step": 11788 + }, + { + "epoch": 2.098824786324786, + "grad_norm": 0.8032481670379639, + 
"learning_rate": 9.23927315916533e-05, + "loss": 0.7544, + "step": 11789 + }, + { + "epoch": 2.099002849002849, + "grad_norm": 0.7452995777130127, + "learning_rate": 9.237877475487984e-05, + "loss": 0.8573, + "step": 11790 + }, + { + "epoch": 2.0991809116809117, + "grad_norm": 0.8141751289367676, + "learning_rate": 9.236481806742844e-05, + "loss": 0.9055, + "step": 11791 + }, + { + "epoch": 2.0993589743589745, + "grad_norm": 0.7862252593040466, + "learning_rate": 9.235086152957261e-05, + "loss": 0.6967, + "step": 11792 + }, + { + "epoch": 2.099537037037037, + "grad_norm": 0.771587073802948, + "learning_rate": 9.233690514158571e-05, + "loss": 0.7544, + "step": 11793 + }, + { + "epoch": 2.0997150997150995, + "grad_norm": 0.851445198059082, + "learning_rate": 9.23229489037413e-05, + "loss": 0.9249, + "step": 11794 + }, + { + "epoch": 2.0998931623931623, + "grad_norm": 0.7483612895011902, + "learning_rate": 9.23089928163127e-05, + "loss": 0.747, + "step": 11795 + }, + { + "epoch": 2.100071225071225, + "grad_norm": 0.8493219017982483, + "learning_rate": 9.229503687957342e-05, + "loss": 0.8898, + "step": 11796 + }, + { + "epoch": 2.1002492877492878, + "grad_norm": 0.8331718444824219, + "learning_rate": 9.228108109379687e-05, + "loss": 0.8943, + "step": 11797 + }, + { + "epoch": 2.1004273504273505, + "grad_norm": 0.7756054997444153, + "learning_rate": 9.226712545925655e-05, + "loss": 0.8586, + "step": 11798 + }, + { + "epoch": 2.1006054131054133, + "grad_norm": 0.7292607426643372, + "learning_rate": 9.225316997622579e-05, + "loss": 0.7591, + "step": 11799 + }, + { + "epoch": 2.1007834757834756, + "grad_norm": 0.8575723767280579, + "learning_rate": 9.223921464497811e-05, + "loss": 1.0147, + "step": 11800 + }, + { + "epoch": 2.1009615384615383, + "grad_norm": 0.7882707118988037, + "learning_rate": 9.222525946578687e-05, + "loss": 0.8297, + "step": 11801 + }, + { + "epoch": 2.101139601139601, + "grad_norm": 0.7982630729675293, + "learning_rate": 9.221130443892551e-05, + 
"loss": 0.9308, + "step": 11802 + }, + { + "epoch": 2.101317663817664, + "grad_norm": 0.7577962279319763, + "learning_rate": 9.219734956466752e-05, + "loss": 0.8474, + "step": 11803 + }, + { + "epoch": 2.1014957264957266, + "grad_norm": 0.7103776335716248, + "learning_rate": 9.218339484328621e-05, + "loss": 0.7863, + "step": 11804 + }, + { + "epoch": 2.1016737891737893, + "grad_norm": 0.8307296633720398, + "learning_rate": 9.216944027505505e-05, + "loss": 0.7633, + "step": 11805 + }, + { + "epoch": 2.1018518518518516, + "grad_norm": 0.8197653293609619, + "learning_rate": 9.215548586024743e-05, + "loss": 0.8987, + "step": 11806 + }, + { + "epoch": 2.1020299145299144, + "grad_norm": 0.9192719459533691, + "learning_rate": 9.21415315991368e-05, + "loss": 0.7829, + "step": 11807 + }, + { + "epoch": 2.102207977207977, + "grad_norm": 0.7249892354011536, + "learning_rate": 9.21275774919965e-05, + "loss": 0.9143, + "step": 11808 + }, + { + "epoch": 2.10238603988604, + "grad_norm": 0.7942582368850708, + "learning_rate": 9.211362353910002e-05, + "loss": 0.8634, + "step": 11809 + }, + { + "epoch": 2.1025641025641026, + "grad_norm": 0.7773341536521912, + "learning_rate": 9.209966974072065e-05, + "loss": 0.7865, + "step": 11810 + }, + { + "epoch": 2.1027421652421654, + "grad_norm": 0.802175760269165, + "learning_rate": 9.208571609713185e-05, + "loss": 0.7473, + "step": 11811 + }, + { + "epoch": 2.1029202279202277, + "grad_norm": 1.0248547792434692, + "learning_rate": 9.207176260860701e-05, + "loss": 1.0097, + "step": 11812 + }, + { + "epoch": 2.1030982905982905, + "grad_norm": 0.5781275629997253, + "learning_rate": 9.205780927541954e-05, + "loss": 0.5813, + "step": 11813 + }, + { + "epoch": 2.103276353276353, + "grad_norm": 0.7252389192581177, + "learning_rate": 9.204385609784274e-05, + "loss": 0.7978, + "step": 11814 + }, + { + "epoch": 2.103454415954416, + "grad_norm": 0.8497771620750427, + "learning_rate": 9.20299030761501e-05, + "loss": 0.95, + "step": 11815 + }, + { + 
"epoch": 2.1036324786324787, + "grad_norm": 0.8420650362968445, + "learning_rate": 9.201595021061491e-05, + "loss": 0.9693, + "step": 11816 + }, + { + "epoch": 2.1038105413105415, + "grad_norm": 0.8286302089691162, + "learning_rate": 9.200199750151063e-05, + "loss": 0.9457, + "step": 11817 + }, + { + "epoch": 2.103988603988604, + "grad_norm": 0.877740204334259, + "learning_rate": 9.198804494911057e-05, + "loss": 0.9082, + "step": 11818 + }, + { + "epoch": 2.1041666666666665, + "grad_norm": 0.7579863667488098, + "learning_rate": 9.197409255368817e-05, + "loss": 0.7681, + "step": 11819 + }, + { + "epoch": 2.1043447293447293, + "grad_norm": 0.7141458988189697, + "learning_rate": 9.19601403155167e-05, + "loss": 0.659, + "step": 11820 + }, + { + "epoch": 2.104522792022792, + "grad_norm": 0.8493850827217102, + "learning_rate": 9.194618823486958e-05, + "loss": 0.8197, + "step": 11821 + }, + { + "epoch": 2.1047008547008548, + "grad_norm": 0.8319337368011475, + "learning_rate": 9.193223631202019e-05, + "loss": 0.8955, + "step": 11822 + }, + { + "epoch": 2.1048789173789175, + "grad_norm": 0.7180153727531433, + "learning_rate": 9.191828454724186e-05, + "loss": 0.8068, + "step": 11823 + }, + { + "epoch": 2.10505698005698, + "grad_norm": 0.6748450398445129, + "learning_rate": 9.190433294080799e-05, + "loss": 0.7469, + "step": 11824 + }, + { + "epoch": 2.1052350427350426, + "grad_norm": 0.7750198841094971, + "learning_rate": 9.189038149299186e-05, + "loss": 0.932, + "step": 11825 + }, + { + "epoch": 2.1054131054131053, + "grad_norm": 0.7763389945030212, + "learning_rate": 9.187643020406688e-05, + "loss": 0.8027, + "step": 11826 + }, + { + "epoch": 2.105591168091168, + "grad_norm": 0.8382455110549927, + "learning_rate": 9.186247907430636e-05, + "loss": 0.8288, + "step": 11827 + }, + { + "epoch": 2.105769230769231, + "grad_norm": 0.6744221448898315, + "learning_rate": 9.184852810398367e-05, + "loss": 0.6807, + "step": 11828 + }, + { + "epoch": 2.1059472934472936, + "grad_norm": 
0.7798452377319336, + "learning_rate": 9.183457729337212e-05, + "loss": 0.9853, + "step": 11829 + }, + { + "epoch": 2.1061253561253563, + "grad_norm": 0.7377058863639832, + "learning_rate": 9.182062664274513e-05, + "loss": 0.9043, + "step": 11830 + }, + { + "epoch": 2.1063034188034186, + "grad_norm": 0.8190791010856628, + "learning_rate": 9.180667615237589e-05, + "loss": 0.9786, + "step": 11831 + }, + { + "epoch": 2.1064814814814814, + "grad_norm": 0.7629963755607605, + "learning_rate": 9.179272582253785e-05, + "loss": 0.9168, + "step": 11832 + }, + { + "epoch": 2.106659544159544, + "grad_norm": 0.7753663063049316, + "learning_rate": 9.177877565350426e-05, + "loss": 0.963, + "step": 11833 + }, + { + "epoch": 2.106837606837607, + "grad_norm": 0.7842921614646912, + "learning_rate": 9.176482564554855e-05, + "loss": 0.8194, + "step": 11834 + }, + { + "epoch": 2.1070156695156697, + "grad_norm": 0.6640288233757019, + "learning_rate": 9.175087579894393e-05, + "loss": 0.6227, + "step": 11835 + }, + { + "epoch": 2.1071937321937324, + "grad_norm": 0.8474540710449219, + "learning_rate": 9.173692611396376e-05, + "loss": 0.7817, + "step": 11836 + }, + { + "epoch": 2.1073717948717947, + "grad_norm": 0.7123007774353027, + "learning_rate": 9.172297659088135e-05, + "loss": 0.9508, + "step": 11837 + }, + { + "epoch": 2.1075498575498575, + "grad_norm": 0.7418060898780823, + "learning_rate": 9.170902722997007e-05, + "loss": 0.8832, + "step": 11838 + }, + { + "epoch": 2.10772792022792, + "grad_norm": 0.7899464964866638, + "learning_rate": 9.169507803150313e-05, + "loss": 0.8474, + "step": 11839 + }, + { + "epoch": 2.107905982905983, + "grad_norm": 0.7543701529502869, + "learning_rate": 9.168112899575388e-05, + "loss": 0.8113, + "step": 11840 + }, + { + "epoch": 2.1080840455840457, + "grad_norm": 0.8057922720909119, + "learning_rate": 9.166718012299565e-05, + "loss": 0.998, + "step": 11841 + }, + { + "epoch": 2.1082621082621085, + "grad_norm": 0.7879176139831543, + "learning_rate": 
9.16532314135017e-05, + "loss": 1.0509, + "step": 11842 + }, + { + "epoch": 2.1084401709401708, + "grad_norm": 0.8796642422676086, + "learning_rate": 9.163928286754537e-05, + "loss": 1.0481, + "step": 11843 + }, + { + "epoch": 2.1086182336182335, + "grad_norm": 0.7158889174461365, + "learning_rate": 9.16253344853999e-05, + "loss": 0.796, + "step": 11844 + }, + { + "epoch": 2.1087962962962963, + "grad_norm": 0.8020899295806885, + "learning_rate": 9.161138626733863e-05, + "loss": 0.822, + "step": 11845 + }, + { + "epoch": 2.108974358974359, + "grad_norm": 0.7217469215393066, + "learning_rate": 9.159743821363478e-05, + "loss": 1.0037, + "step": 11846 + }, + { + "epoch": 2.109152421652422, + "grad_norm": 0.762450098991394, + "learning_rate": 9.158349032456171e-05, + "loss": 1.0047, + "step": 11847 + }, + { + "epoch": 2.1093304843304845, + "grad_norm": 0.7227019667625427, + "learning_rate": 9.156954260039263e-05, + "loss": 0.8034, + "step": 11848 + }, + { + "epoch": 2.109508547008547, + "grad_norm": 0.7358957529067993, + "learning_rate": 9.155559504140089e-05, + "loss": 0.9483, + "step": 11849 + }, + { + "epoch": 2.1096866096866096, + "grad_norm": 0.7039931416511536, + "learning_rate": 9.154164764785968e-05, + "loss": 0.9255, + "step": 11850 + }, + { + "epoch": 2.1098646723646723, + "grad_norm": 0.8479618430137634, + "learning_rate": 9.152770042004234e-05, + "loss": 0.7379, + "step": 11851 + }, + { + "epoch": 2.110042735042735, + "grad_norm": 0.8320785164833069, + "learning_rate": 9.151375335822208e-05, + "loss": 0.944, + "step": 11852 + }, + { + "epoch": 2.110220797720798, + "grad_norm": 0.8186322450637817, + "learning_rate": 9.149980646267225e-05, + "loss": 0.7757, + "step": 11853 + }, + { + "epoch": 2.1103988603988606, + "grad_norm": 0.7816671133041382, + "learning_rate": 9.148585973366601e-05, + "loss": 0.8592, + "step": 11854 + }, + { + "epoch": 2.110576923076923, + "grad_norm": 0.8747152090072632, + "learning_rate": 9.147191317147671e-05, + "loss": 1.0852, + 
"step": 11855 + }, + { + "epoch": 2.1107549857549857, + "grad_norm": 0.7762712240219116, + "learning_rate": 9.14579667763775e-05, + "loss": 0.8466, + "step": 11856 + }, + { + "epoch": 2.1109330484330484, + "grad_norm": 0.8426344394683838, + "learning_rate": 9.144402054864171e-05, + "loss": 0.9949, + "step": 11857 + }, + { + "epoch": 2.111111111111111, + "grad_norm": 0.7581121921539307, + "learning_rate": 9.143007448854256e-05, + "loss": 0.748, + "step": 11858 + }, + { + "epoch": 2.111289173789174, + "grad_norm": 0.837939977645874, + "learning_rate": 9.141612859635333e-05, + "loss": 0.9479, + "step": 11859 + }, + { + "epoch": 2.1114672364672367, + "grad_norm": 0.7402070760726929, + "learning_rate": 9.140218287234718e-05, + "loss": 0.7829, + "step": 11860 + }, + { + "epoch": 2.111645299145299, + "grad_norm": 0.7125605344772339, + "learning_rate": 9.13882373167974e-05, + "loss": 1.0175, + "step": 11861 + }, + { + "epoch": 2.1118233618233617, + "grad_norm": 0.8021374344825745, + "learning_rate": 9.137429192997723e-05, + "loss": 0.9258, + "step": 11862 + }, + { + "epoch": 2.1120014245014245, + "grad_norm": 0.7860891222953796, + "learning_rate": 9.136034671215988e-05, + "loss": 0.7351, + "step": 11863 + }, + { + "epoch": 2.1121794871794872, + "grad_norm": 0.8324207067489624, + "learning_rate": 9.134640166361864e-05, + "loss": 0.8933, + "step": 11864 + }, + { + "epoch": 2.11235754985755, + "grad_norm": 0.8209179639816284, + "learning_rate": 9.133245678462663e-05, + "loss": 0.6983, + "step": 11865 + }, + { + "epoch": 2.1125356125356127, + "grad_norm": 0.7071694731712341, + "learning_rate": 9.131851207545716e-05, + "loss": 0.7796, + "step": 11866 + }, + { + "epoch": 2.112713675213675, + "grad_norm": 0.8126310110092163, + "learning_rate": 9.130456753638339e-05, + "loss": 0.8887, + "step": 11867 + }, + { + "epoch": 2.112891737891738, + "grad_norm": 0.7713829874992371, + "learning_rate": 9.129062316767855e-05, + "loss": 0.8169, + "step": 11868 + }, + { + "epoch": 
2.1130698005698005, + "grad_norm": 0.8065944314002991, + "learning_rate": 9.127667896961585e-05, + "loss": 0.9295, + "step": 11869 + }, + { + "epoch": 2.1132478632478633, + "grad_norm": 0.7433435320854187, + "learning_rate": 9.126273494246856e-05, + "loss": 1.089, + "step": 11870 + }, + { + "epoch": 2.113425925925926, + "grad_norm": 0.8168141841888428, + "learning_rate": 9.124879108650978e-05, + "loss": 0.7914, + "step": 11871 + }, + { + "epoch": 2.113603988603989, + "grad_norm": 0.7703335285186768, + "learning_rate": 9.123484740201276e-05, + "loss": 1.0599, + "step": 11872 + }, + { + "epoch": 2.113782051282051, + "grad_norm": 0.810584545135498, + "learning_rate": 9.12209038892507e-05, + "loss": 0.886, + "step": 11873 + }, + { + "epoch": 2.113960113960114, + "grad_norm": 0.8441819548606873, + "learning_rate": 9.120696054849683e-05, + "loss": 0.9069, + "step": 11874 + }, + { + "epoch": 2.1141381766381766, + "grad_norm": 0.816067636013031, + "learning_rate": 9.119301738002425e-05, + "loss": 0.9084, + "step": 11875 + }, + { + "epoch": 2.1143162393162394, + "grad_norm": 0.8595525622367859, + "learning_rate": 9.117907438410622e-05, + "loss": 0.916, + "step": 11876 + }, + { + "epoch": 2.114494301994302, + "grad_norm": 0.8604792356491089, + "learning_rate": 9.116513156101589e-05, + "loss": 1.1207, + "step": 11877 + }, + { + "epoch": 2.114672364672365, + "grad_norm": 0.673664927482605, + "learning_rate": 9.115118891102649e-05, + "loss": 0.9767, + "step": 11878 + }, + { + "epoch": 2.114850427350427, + "grad_norm": 0.7064382433891296, + "learning_rate": 9.113724643441113e-05, + "loss": 0.91, + "step": 11879 + }, + { + "epoch": 2.11502849002849, + "grad_norm": 0.7256918549537659, + "learning_rate": 9.112330413144301e-05, + "loss": 0.9061, + "step": 11880 + }, + { + "epoch": 2.1152065527065527, + "grad_norm": 0.7914155721664429, + "learning_rate": 9.110936200239534e-05, + "loss": 0.6652, + "step": 11881 + }, + { + "epoch": 2.1153846153846154, + "grad_norm": 0.7484595775604248, 
+ "learning_rate": 9.109542004754122e-05, + "loss": 0.8049, + "step": 11882 + }, + { + "epoch": 2.115562678062678, + "grad_norm": 0.8062677979469299, + "learning_rate": 9.108147826715387e-05, + "loss": 0.8671, + "step": 11883 + }, + { + "epoch": 2.115740740740741, + "grad_norm": 0.9595313668251038, + "learning_rate": 9.10675366615064e-05, + "loss": 0.995, + "step": 11884 + }, + { + "epoch": 2.1159188034188032, + "grad_norm": 0.7263179421424866, + "learning_rate": 9.105359523087203e-05, + "loss": 0.9177, + "step": 11885 + }, + { + "epoch": 2.116096866096866, + "grad_norm": 0.900650680065155, + "learning_rate": 9.103965397552385e-05, + "loss": 0.8599, + "step": 11886 + }, + { + "epoch": 2.1162749287749287, + "grad_norm": 0.7682752013206482, + "learning_rate": 9.102571289573506e-05, + "loss": 0.8942, + "step": 11887 + }, + { + "epoch": 2.1164529914529915, + "grad_norm": 0.7076446413993835, + "learning_rate": 9.101177199177874e-05, + "loss": 0.7498, + "step": 11888 + }, + { + "epoch": 2.1166310541310542, + "grad_norm": 0.711475133895874, + "learning_rate": 9.099783126392813e-05, + "loss": 0.7035, + "step": 11889 + }, + { + "epoch": 2.116809116809117, + "grad_norm": 0.6720870137214661, + "learning_rate": 9.098389071245627e-05, + "loss": 0.7315, + "step": 11890 + }, + { + "epoch": 2.1169871794871793, + "grad_norm": 0.8207699656486511, + "learning_rate": 9.096995033763639e-05, + "loss": 0.7465, + "step": 11891 + }, + { + "epoch": 2.117165242165242, + "grad_norm": 0.9032317996025085, + "learning_rate": 9.095601013974153e-05, + "loss": 0.9209, + "step": 11892 + }, + { + "epoch": 2.117343304843305, + "grad_norm": 0.886545717716217, + "learning_rate": 9.094207011904489e-05, + "loss": 0.9411, + "step": 11893 + }, + { + "epoch": 2.1175213675213675, + "grad_norm": 0.8235130906105042, + "learning_rate": 9.092813027581953e-05, + "loss": 0.9264, + "step": 11894 + }, + { + "epoch": 2.1176994301994303, + "grad_norm": 0.7530205845832825, + "learning_rate": 9.091419061033867e-05, + 
"loss": 0.8926, + "step": 11895 + }, + { + "epoch": 2.117877492877493, + "grad_norm": 0.8329548835754395, + "learning_rate": 9.090025112287533e-05, + "loss": 0.9615, + "step": 11896 + }, + { + "epoch": 2.1180555555555554, + "grad_norm": 0.8184738755226135, + "learning_rate": 9.088631181370269e-05, + "loss": 0.9069, + "step": 11897 + }, + { + "epoch": 2.118233618233618, + "grad_norm": 0.8071370720863342, + "learning_rate": 9.087237268309381e-05, + "loss": 0.8721, + "step": 11898 + }, + { + "epoch": 2.118411680911681, + "grad_norm": 0.8995245695114136, + "learning_rate": 9.085843373132187e-05, + "loss": 0.8815, + "step": 11899 + }, + { + "epoch": 2.1185897435897436, + "grad_norm": 0.7601714730262756, + "learning_rate": 9.084449495865989e-05, + "loss": 0.6824, + "step": 11900 + }, + { + "epoch": 2.1187678062678064, + "grad_norm": 0.8499618172645569, + "learning_rate": 9.083055636538101e-05, + "loss": 0.9868, + "step": 11901 + }, + { + "epoch": 2.118945868945869, + "grad_norm": 0.8190310001373291, + "learning_rate": 9.081661795175837e-05, + "loss": 0.8156, + "step": 11902 + }, + { + "epoch": 2.1191239316239314, + "grad_norm": 0.8340418934822083, + "learning_rate": 9.080267971806498e-05, + "loss": 1.0153, + "step": 11903 + }, + { + "epoch": 2.119301994301994, + "grad_norm": 0.8460756540298462, + "learning_rate": 9.0788741664574e-05, + "loss": 0.8752, + "step": 11904 + }, + { + "epoch": 2.119480056980057, + "grad_norm": 0.7457373738288879, + "learning_rate": 9.077480379155848e-05, + "loss": 0.9105, + "step": 11905 + }, + { + "epoch": 2.1196581196581197, + "grad_norm": 0.7883822917938232, + "learning_rate": 9.076086609929155e-05, + "loss": 0.8782, + "step": 11906 + }, + { + "epoch": 2.1198361823361824, + "grad_norm": 0.912143886089325, + "learning_rate": 9.074692858804622e-05, + "loss": 0.9898, + "step": 11907 + }, + { + "epoch": 2.120014245014245, + "grad_norm": 0.7801905274391174, + "learning_rate": 9.073299125809562e-05, + "loss": 1.091, + "step": 11908 + }, + { + 
"epoch": 2.1201923076923075, + "grad_norm": 0.6836256384849548, + "learning_rate": 9.071905410971279e-05, + "loss": 0.7967, + "step": 11909 + }, + { + "epoch": 2.1203703703703702, + "grad_norm": 0.7656795382499695, + "learning_rate": 9.070511714317085e-05, + "loss": 0.9696, + "step": 11910 + }, + { + "epoch": 2.120548433048433, + "grad_norm": 0.7010015249252319, + "learning_rate": 9.06911803587428e-05, + "loss": 0.6501, + "step": 11911 + }, + { + "epoch": 2.1207264957264957, + "grad_norm": 0.6673064827919006, + "learning_rate": 9.067724375670174e-05, + "loss": 0.5663, + "step": 11912 + }, + { + "epoch": 2.1209045584045585, + "grad_norm": 0.8683220148086548, + "learning_rate": 9.06633073373207e-05, + "loss": 0.9722, + "step": 11913 + }, + { + "epoch": 2.1210826210826212, + "grad_norm": 0.7793976068496704, + "learning_rate": 9.06493711008728e-05, + "loss": 0.7595, + "step": 11914 + }, + { + "epoch": 2.1212606837606836, + "grad_norm": 0.7803528308868408, + "learning_rate": 9.0635435047631e-05, + "loss": 0.9262, + "step": 11915 + }, + { + "epoch": 2.1214387464387463, + "grad_norm": 0.8067244291305542, + "learning_rate": 9.062149917786846e-05, + "loss": 0.9376, + "step": 11916 + }, + { + "epoch": 2.121616809116809, + "grad_norm": 0.7389153838157654, + "learning_rate": 9.060756349185812e-05, + "loss": 0.7414, + "step": 11917 + }, + { + "epoch": 2.121794871794872, + "grad_norm": 0.7717151045799255, + "learning_rate": 9.059362798987308e-05, + "loss": 0.7261, + "step": 11918 + }, + { + "epoch": 2.1219729344729346, + "grad_norm": 0.7668650150299072, + "learning_rate": 9.057969267218632e-05, + "loss": 0.8145, + "step": 11919 + }, + { + "epoch": 2.1221509971509973, + "grad_norm": 1.0015910863876343, + "learning_rate": 9.056575753907093e-05, + "loss": 0.8997, + "step": 11920 + }, + { + "epoch": 2.1223290598290596, + "grad_norm": 0.8731024861335754, + "learning_rate": 9.055182259079997e-05, + "loss": 1.0101, + "step": 11921 + }, + { + "epoch": 2.1225071225071224, + "grad_norm": 
0.7662718892097473, + "learning_rate": 9.053788782764637e-05, + "loss": 0.697, + "step": 11922 + }, + { + "epoch": 2.122685185185185, + "grad_norm": 0.7783135771751404, + "learning_rate": 9.05239532498832e-05, + "loss": 0.8506, + "step": 11923 + }, + { + "epoch": 2.122863247863248, + "grad_norm": 0.8667652606964111, + "learning_rate": 9.05100188577835e-05, + "loss": 0.9851, + "step": 11924 + }, + { + "epoch": 2.1230413105413106, + "grad_norm": 0.7785412073135376, + "learning_rate": 9.049608465162028e-05, + "loss": 0.5924, + "step": 11925 + }, + { + "epoch": 2.1232193732193734, + "grad_norm": 0.7968559861183167, + "learning_rate": 9.04821506316665e-05, + "loss": 0.8114, + "step": 11926 + }, + { + "epoch": 2.123397435897436, + "grad_norm": 0.8065921068191528, + "learning_rate": 9.046821679819527e-05, + "loss": 0.9045, + "step": 11927 + }, + { + "epoch": 2.1235754985754984, + "grad_norm": 0.7509779930114746, + "learning_rate": 9.045428315147948e-05, + "loss": 0.7337, + "step": 11928 + }, + { + "epoch": 2.123753561253561, + "grad_norm": 0.8174976110458374, + "learning_rate": 9.044034969179219e-05, + "loss": 1.0113, + "step": 11929 + }, + { + "epoch": 2.123931623931624, + "grad_norm": 0.8723294734954834, + "learning_rate": 9.042641641940638e-05, + "loss": 0.9657, + "step": 11930 + }, + { + "epoch": 2.1241096866096867, + "grad_norm": 0.7412081360816956, + "learning_rate": 9.041248333459509e-05, + "loss": 0.9311, + "step": 11931 + }, + { + "epoch": 2.1242877492877494, + "grad_norm": 0.7376424670219421, + "learning_rate": 9.039855043763124e-05, + "loss": 0.7039, + "step": 11932 + }, + { + "epoch": 2.1244658119658117, + "grad_norm": 0.8002118468284607, + "learning_rate": 9.038461772878786e-05, + "loss": 0.9555, + "step": 11933 + }, + { + "epoch": 2.1246438746438745, + "grad_norm": 0.7221434712409973, + "learning_rate": 9.03706852083379e-05, + "loss": 0.8462, + "step": 11934 + }, + { + "epoch": 2.1248219373219372, + "grad_norm": 0.8506385684013367, + "learning_rate": 
9.035675287655441e-05, + "loss": 0.7977, + "step": 11935 + }, + { + "epoch": 2.125, + "grad_norm": 0.8088411688804626, + "learning_rate": 9.034282073371025e-05, + "loss": 1.0146, + "step": 11936 + }, + { + "epoch": 2.1251780626780628, + "grad_norm": 0.9231638312339783, + "learning_rate": 9.032888878007853e-05, + "loss": 0.7017, + "step": 11937 + }, + { + "epoch": 2.1253561253561255, + "grad_norm": 0.721066951751709, + "learning_rate": 9.03149570159321e-05, + "loss": 0.7662, + "step": 11938 + }, + { + "epoch": 2.1255341880341883, + "grad_norm": 0.7804762721061707, + "learning_rate": 9.030102544154395e-05, + "loss": 0.6835, + "step": 11939 + }, + { + "epoch": 2.1257122507122506, + "grad_norm": 0.9728445410728455, + "learning_rate": 9.028709405718707e-05, + "loss": 0.9161, + "step": 11940 + }, + { + "epoch": 2.1258903133903133, + "grad_norm": 0.8209855556488037, + "learning_rate": 9.02731628631344e-05, + "loss": 0.7492, + "step": 11941 + }, + { + "epoch": 2.126068376068376, + "grad_norm": 0.7054622769355774, + "learning_rate": 9.025923185965896e-05, + "loss": 0.7908, + "step": 11942 + }, + { + "epoch": 2.126246438746439, + "grad_norm": 0.73018878698349, + "learning_rate": 9.024530104703358e-05, + "loss": 0.7902, + "step": 11943 + }, + { + "epoch": 2.1264245014245016, + "grad_norm": 0.73788982629776, + "learning_rate": 9.023137042553127e-05, + "loss": 0.8473, + "step": 11944 + }, + { + "epoch": 2.126602564102564, + "grad_norm": 0.7733396291732788, + "learning_rate": 9.021743999542495e-05, + "loss": 0.9595, + "step": 11945 + }, + { + "epoch": 2.1267806267806266, + "grad_norm": 0.9066760540008545, + "learning_rate": 9.020350975698761e-05, + "loss": 0.8517, + "step": 11946 + }, + { + "epoch": 2.1269586894586894, + "grad_norm": 0.7552717328071594, + "learning_rate": 9.018957971049211e-05, + "loss": 0.6802, + "step": 11947 + }, + { + "epoch": 2.127136752136752, + "grad_norm": 0.7437541484832764, + "learning_rate": 9.017564985621144e-05, + "loss": 0.9365, + "step": 11948 + 
}, + { + "epoch": 2.127314814814815, + "grad_norm": 0.8216256499290466, + "learning_rate": 9.016172019441847e-05, + "loss": 0.9019, + "step": 11949 + }, + { + "epoch": 2.1274928774928776, + "grad_norm": 0.752247154712677, + "learning_rate": 9.014779072538621e-05, + "loss": 0.7771, + "step": 11950 + }, + { + "epoch": 2.1276709401709404, + "grad_norm": 0.7714348435401917, + "learning_rate": 9.013386144938748e-05, + "loss": 0.8495, + "step": 11951 + }, + { + "epoch": 2.1278490028490027, + "grad_norm": 0.8347537517547607, + "learning_rate": 9.011993236669529e-05, + "loss": 0.861, + "step": 11952 + }, + { + "epoch": 2.1280270655270654, + "grad_norm": 0.8180193901062012, + "learning_rate": 9.010600347758245e-05, + "loss": 0.9059, + "step": 11953 + }, + { + "epoch": 2.128205128205128, + "grad_norm": 0.7328528761863708, + "learning_rate": 9.009207478232193e-05, + "loss": 0.9144, + "step": 11954 + }, + { + "epoch": 2.128383190883191, + "grad_norm": 0.7590839862823486, + "learning_rate": 9.007814628118661e-05, + "loss": 0.8642, + "step": 11955 + }, + { + "epoch": 2.1285612535612537, + "grad_norm": 0.7962782382965088, + "learning_rate": 9.006421797444945e-05, + "loss": 0.8958, + "step": 11956 + }, + { + "epoch": 2.128739316239316, + "grad_norm": 0.7302426695823669, + "learning_rate": 9.005028986238325e-05, + "loss": 0.9419, + "step": 11957 + }, + { + "epoch": 2.1289173789173788, + "grad_norm": 0.9223780632019043, + "learning_rate": 9.003636194526098e-05, + "loss": 0.7631, + "step": 11958 + }, + { + "epoch": 2.1290954415954415, + "grad_norm": 0.728225588798523, + "learning_rate": 9.002243422335547e-05, + "loss": 0.7705, + "step": 11959 + }, + { + "epoch": 2.1292735042735043, + "grad_norm": 0.8519338369369507, + "learning_rate": 9.000850669693964e-05, + "loss": 0.8962, + "step": 11960 + }, + { + "epoch": 2.129451566951567, + "grad_norm": 0.8920532464981079, + "learning_rate": 8.999457936628641e-05, + "loss": 0.618, + "step": 11961 + }, + { + "epoch": 2.1296296296296298, + 
"grad_norm": 0.9719427824020386, + "learning_rate": 8.998065223166857e-05, + "loss": 0.9142, + "step": 11962 + }, + { + "epoch": 2.1298076923076925, + "grad_norm": 0.8130887150764465, + "learning_rate": 8.996672529335908e-05, + "loss": 1.0246, + "step": 11963 + }, + { + "epoch": 2.129985754985755, + "grad_norm": 0.7682677507400513, + "learning_rate": 8.995279855163073e-05, + "loss": 0.7964, + "step": 11964 + }, + { + "epoch": 2.1301638176638176, + "grad_norm": 0.8507778644561768, + "learning_rate": 8.993887200675641e-05, + "loss": 0.756, + "step": 11965 + }, + { + "epoch": 2.1303418803418803, + "grad_norm": 0.815487802028656, + "learning_rate": 8.992494565900901e-05, + "loss": 0.7596, + "step": 11966 + }, + { + "epoch": 2.130519943019943, + "grad_norm": 0.8560892939567566, + "learning_rate": 8.991101950866138e-05, + "loss": 0.9939, + "step": 11967 + }, + { + "epoch": 2.130698005698006, + "grad_norm": 0.8737899661064148, + "learning_rate": 8.989709355598635e-05, + "loss": 0.9235, + "step": 11968 + }, + { + "epoch": 2.1308760683760686, + "grad_norm": 0.8434267640113831, + "learning_rate": 8.98831678012568e-05, + "loss": 0.7832, + "step": 11969 + }, + { + "epoch": 2.131054131054131, + "grad_norm": 0.8286582827568054, + "learning_rate": 8.986924224474553e-05, + "loss": 1.0591, + "step": 11970 + }, + { + "epoch": 2.1312321937321936, + "grad_norm": 0.8023663759231567, + "learning_rate": 8.985531688672546e-05, + "loss": 0.935, + "step": 11971 + }, + { + "epoch": 2.1314102564102564, + "grad_norm": 0.6504420042037964, + "learning_rate": 8.984139172746933e-05, + "loss": 0.79, + "step": 11972 + }, + { + "epoch": 2.131588319088319, + "grad_norm": 0.8969349265098572, + "learning_rate": 8.982746676725009e-05, + "loss": 1.0531, + "step": 11973 + }, + { + "epoch": 2.131766381766382, + "grad_norm": 0.802094042301178, + "learning_rate": 8.981354200634046e-05, + "loss": 0.8873, + "step": 11974 + }, + { + "epoch": 2.1319444444444446, + "grad_norm": 0.7630797624588013, + 
"learning_rate": 8.979961744501332e-05, + "loss": 0.9299, + "step": 11975 + }, + { + "epoch": 2.132122507122507, + "grad_norm": 0.8395546674728394, + "learning_rate": 8.978569308354148e-05, + "loss": 0.922, + "step": 11976 + }, + { + "epoch": 2.1323005698005697, + "grad_norm": 0.9325534701347351, + "learning_rate": 8.97717689221978e-05, + "loss": 0.9156, + "step": 11977 + }, + { + "epoch": 2.1324786324786325, + "grad_norm": 0.8139503002166748, + "learning_rate": 8.975784496125502e-05, + "loss": 0.8882, + "step": 11978 + }, + { + "epoch": 2.132656695156695, + "grad_norm": 1.0311007499694824, + "learning_rate": 8.974392120098599e-05, + "loss": 1.0068, + "step": 11979 + }, + { + "epoch": 2.132834757834758, + "grad_norm": 0.9328663945198059, + "learning_rate": 8.972999764166354e-05, + "loss": 0.8313, + "step": 11980 + }, + { + "epoch": 2.1330128205128207, + "grad_norm": 0.747276782989502, + "learning_rate": 8.971607428356044e-05, + "loss": 0.9302, + "step": 11981 + }, + { + "epoch": 2.133190883190883, + "grad_norm": 0.7572789788246155, + "learning_rate": 8.970215112694953e-05, + "loss": 0.8016, + "step": 11982 + }, + { + "epoch": 2.1333689458689458, + "grad_norm": 0.8988085389137268, + "learning_rate": 8.968822817210354e-05, + "loss": 0.9307, + "step": 11983 + }, + { + "epoch": 2.1335470085470085, + "grad_norm": 0.7537818551063538, + "learning_rate": 8.967430541929532e-05, + "loss": 0.6423, + "step": 11984 + }, + { + "epoch": 2.1337250712250713, + "grad_norm": 0.7470884323120117, + "learning_rate": 8.966038286879763e-05, + "loss": 0.7753, + "step": 11985 + }, + { + "epoch": 2.133903133903134, + "grad_norm": 0.8670676946640015, + "learning_rate": 8.964646052088328e-05, + "loss": 1.0407, + "step": 11986 + }, + { + "epoch": 2.1340811965811968, + "grad_norm": 0.8322215676307678, + "learning_rate": 8.9632538375825e-05, + "loss": 0.6498, + "step": 11987 + }, + { + "epoch": 2.134259259259259, + "grad_norm": 0.7089048027992249, + "learning_rate": 8.961861643389562e-05, + 
"loss": 0.8778, + "step": 11988 + }, + { + "epoch": 2.134437321937322, + "grad_norm": 0.7980125546455383, + "learning_rate": 8.960469469536786e-05, + "loss": 0.7797, + "step": 11989 + }, + { + "epoch": 2.1346153846153846, + "grad_norm": 0.9979715943336487, + "learning_rate": 8.959077316051452e-05, + "loss": 0.7388, + "step": 11990 + }, + { + "epoch": 2.1347934472934473, + "grad_norm": 1.0040662288665771, + "learning_rate": 8.957685182960833e-05, + "loss": 0.954, + "step": 11991 + }, + { + "epoch": 2.13497150997151, + "grad_norm": 0.7885099053382874, + "learning_rate": 8.956293070292214e-05, + "loss": 0.9232, + "step": 11992 + }, + { + "epoch": 2.135149572649573, + "grad_norm": 0.7242771983146667, + "learning_rate": 8.954900978072859e-05, + "loss": 0.8614, + "step": 11993 + }, + { + "epoch": 2.135327635327635, + "grad_norm": 0.7970352172851562, + "learning_rate": 8.95350890633005e-05, + "loss": 0.8959, + "step": 11994 + }, + { + "epoch": 2.135505698005698, + "grad_norm": 0.8587128520011902, + "learning_rate": 8.952116855091059e-05, + "loss": 0.9981, + "step": 11995 + }, + { + "epoch": 2.1356837606837606, + "grad_norm": 0.8206220269203186, + "learning_rate": 8.950724824383164e-05, + "loss": 1.0271, + "step": 11996 + }, + { + "epoch": 2.1358618233618234, + "grad_norm": 0.8085001707077026, + "learning_rate": 8.949332814233635e-05, + "loss": 0.925, + "step": 11997 + }, + { + "epoch": 2.136039886039886, + "grad_norm": 0.8361417651176453, + "learning_rate": 8.947940824669748e-05, + "loss": 0.8744, + "step": 11998 + }, + { + "epoch": 2.136217948717949, + "grad_norm": 0.7548407316207886, + "learning_rate": 8.946548855718773e-05, + "loss": 0.7365, + "step": 11999 + }, + { + "epoch": 2.136396011396011, + "grad_norm": 0.8671223521232605, + "learning_rate": 8.945156907407983e-05, + "loss": 0.8958, + "step": 12000 + }, + { + "epoch": 2.136574074074074, + "grad_norm": 0.8007429838180542, + "learning_rate": 8.943764979764656e-05, + "loss": 0.955, + "step": 12001 + }, + { + 
"epoch": 2.1367521367521367, + "grad_norm": 0.7834315299987793, + "learning_rate": 8.942373072816057e-05, + "loss": 0.9226, + "step": 12002 + }, + { + "epoch": 2.1369301994301995, + "grad_norm": 0.896920919418335, + "learning_rate": 8.940981186589466e-05, + "loss": 0.8779, + "step": 12003 + }, + { + "epoch": 2.137108262108262, + "grad_norm": 0.7473411560058594, + "learning_rate": 8.939589321112143e-05, + "loss": 0.8993, + "step": 12004 + }, + { + "epoch": 2.137286324786325, + "grad_norm": 0.8071674704551697, + "learning_rate": 8.938197476411367e-05, + "loss": 0.9998, + "step": 12005 + }, + { + "epoch": 2.1374643874643873, + "grad_norm": 0.839290976524353, + "learning_rate": 8.936805652514404e-05, + "loss": 0.8311, + "step": 12006 + }, + { + "epoch": 2.13764245014245, + "grad_norm": 0.7217035293579102, + "learning_rate": 8.93541384944853e-05, + "loss": 0.8009, + "step": 12007 + }, + { + "epoch": 2.1378205128205128, + "grad_norm": 0.7392259836196899, + "learning_rate": 8.934022067241004e-05, + "loss": 0.9854, + "step": 12008 + }, + { + "epoch": 2.1379985754985755, + "grad_norm": 0.7470507621765137, + "learning_rate": 8.932630305919107e-05, + "loss": 0.8111, + "step": 12009 + }, + { + "epoch": 2.1381766381766383, + "grad_norm": 0.7988318204879761, + "learning_rate": 8.931238565510098e-05, + "loss": 0.8492, + "step": 12010 + }, + { + "epoch": 2.138354700854701, + "grad_norm": 0.9267526268959045, + "learning_rate": 8.929846846041251e-05, + "loss": 1.2238, + "step": 12011 + }, + { + "epoch": 2.1385327635327633, + "grad_norm": 0.8036465644836426, + "learning_rate": 8.92845514753983e-05, + "loss": 0.8837, + "step": 12012 + }, + { + "epoch": 2.138710826210826, + "grad_norm": 0.809256911277771, + "learning_rate": 8.927063470033109e-05, + "loss": 0.8836, + "step": 12013 + }, + { + "epoch": 2.138888888888889, + "grad_norm": 0.754692792892456, + "learning_rate": 8.925671813548345e-05, + "loss": 0.9469, + "step": 12014 + }, + { + "epoch": 2.1390669515669516, + "grad_norm": 
0.9183036088943481, + "learning_rate": 8.924280178112814e-05, + "loss": 0.7654, + "step": 12015 + }, + { + "epoch": 2.1392450142450143, + "grad_norm": 0.82411128282547, + "learning_rate": 8.922888563753775e-05, + "loss": 0.9132, + "step": 12016 + }, + { + "epoch": 2.139423076923077, + "grad_norm": 0.8455918431282043, + "learning_rate": 8.9214969704985e-05, + "loss": 1.0041, + "step": 12017 + }, + { + "epoch": 2.1396011396011394, + "grad_norm": 0.9235896468162537, + "learning_rate": 8.92010539837425e-05, + "loss": 0.7842, + "step": 12018 + }, + { + "epoch": 2.139779202279202, + "grad_norm": 0.8965059518814087, + "learning_rate": 8.918713847408289e-05, + "loss": 1.0665, + "step": 12019 + }, + { + "epoch": 2.139957264957265, + "grad_norm": Infinity, + "learning_rate": 8.918713847408289e-05, + "loss": 1.0773, + "step": 12020 + }, + { + "epoch": 2.1401353276353277, + "grad_norm": 0.8859738707542419, + "learning_rate": 8.917322317627887e-05, + "loss": 0.9175, + "step": 12021 + }, + { + "epoch": 2.1403133903133904, + "grad_norm": 0.7828214764595032, + "learning_rate": 8.915930809060304e-05, + "loss": 0.8433, + "step": 12022 + }, + { + "epoch": 2.140491452991453, + "grad_norm": 0.7705734372138977, + "learning_rate": 8.914539321732808e-05, + "loss": 0.8696, + "step": 12023 + }, + { + "epoch": 2.1406695156695155, + "grad_norm": 0.7999989986419678, + "learning_rate": 8.913147855672655e-05, + "loss": 1.0531, + "step": 12024 + }, + { + "epoch": 2.140847578347578, + "grad_norm": 0.7210655212402344, + "learning_rate": 8.911756410907118e-05, + "loss": 0.6703, + "step": 12025 + }, + { + "epoch": 2.141025641025641, + "grad_norm": 0.7153459191322327, + "learning_rate": 8.910364987463447e-05, + "loss": 0.7166, + "step": 12026 + }, + { + "epoch": 2.1412037037037037, + "grad_norm": 0.771530032157898, + "learning_rate": 8.908973585368913e-05, + "loss": 0.6881, + "step": 12027 + }, + { + "epoch": 2.1413817663817665, + "grad_norm": 0.9988116025924683, + "learning_rate": 
8.907582204650774e-05, + "loss": 0.8329, + "step": 12028 + }, + { + "epoch": 2.1415598290598292, + "grad_norm": 0.6992440819740295, + "learning_rate": 8.906190845336296e-05, + "loss": 0.6262, + "step": 12029 + }, + { + "epoch": 2.1417378917378915, + "grad_norm": 0.8061181902885437, + "learning_rate": 8.904799507452731e-05, + "loss": 0.8325, + "step": 12030 + }, + { + "epoch": 2.1419159544159543, + "grad_norm": 0.8372871279716492, + "learning_rate": 8.903408191027349e-05, + "loss": 0.8894, + "step": 12031 + }, + { + "epoch": 2.142094017094017, + "grad_norm": 0.803719162940979, + "learning_rate": 8.902016896087402e-05, + "loss": 0.9031, + "step": 12032 + }, + { + "epoch": 2.14227207977208, + "grad_norm": 0.8168890476226807, + "learning_rate": 8.900625622660158e-05, + "loss": 0.8174, + "step": 12033 + }, + { + "epoch": 2.1424501424501425, + "grad_norm": 0.8011388182640076, + "learning_rate": 8.899234370772865e-05, + "loss": 0.8267, + "step": 12034 + }, + { + "epoch": 2.1426282051282053, + "grad_norm": 0.8209220767021179, + "learning_rate": 8.897843140452795e-05, + "loss": 0.9303, + "step": 12035 + }, + { + "epoch": 2.142806267806268, + "grad_norm": 0.773525059223175, + "learning_rate": 8.896451931727192e-05, + "loss": 0.7037, + "step": 12036 + }, + { + "epoch": 2.1429843304843303, + "grad_norm": 0.7568892240524292, + "learning_rate": 8.895060744623324e-05, + "loss": 0.8568, + "step": 12037 + }, + { + "epoch": 2.143162393162393, + "grad_norm": 0.713636040687561, + "learning_rate": 8.893669579168444e-05, + "loss": 0.7838, + "step": 12038 + }, + { + "epoch": 2.143340455840456, + "grad_norm": 0.7462167739868164, + "learning_rate": 8.892278435389814e-05, + "loss": 0.6311, + "step": 12039 + }, + { + "epoch": 2.1435185185185186, + "grad_norm": 0.7164530158042908, + "learning_rate": 8.890887313314685e-05, + "loss": 1.0228, + "step": 12040 + }, + { + "epoch": 2.1436965811965814, + "grad_norm": 0.7540927529335022, + "learning_rate": 8.889496212970312e-05, + "loss": 0.8958, + 
"step": 12041 + }, + { + "epoch": 2.1438746438746437, + "grad_norm": 0.8119065761566162, + "learning_rate": 8.888105134383957e-05, + "loss": 0.8925, + "step": 12042 + }, + { + "epoch": 2.1440527065527064, + "grad_norm": 0.7905679941177368, + "learning_rate": 8.88671407758287e-05, + "loss": 0.7579, + "step": 12043 + }, + { + "epoch": 2.144230769230769, + "grad_norm": 0.8901177048683167, + "learning_rate": 8.885323042594312e-05, + "loss": 0.8849, + "step": 12044 + }, + { + "epoch": 2.144408831908832, + "grad_norm": 0.6958974599838257, + "learning_rate": 8.88393202944553e-05, + "loss": 0.8072, + "step": 12045 + }, + { + "epoch": 2.1445868945868947, + "grad_norm": 0.790036141872406, + "learning_rate": 8.882541038163786e-05, + "loss": 0.796, + "step": 12046 + }, + { + "epoch": 2.1447649572649574, + "grad_norm": 0.757655680179596, + "learning_rate": 8.881150068776324e-05, + "loss": 0.8094, + "step": 12047 + }, + { + "epoch": 2.14494301994302, + "grad_norm": 0.7525215148925781, + "learning_rate": 8.879759121310404e-05, + "loss": 0.6746, + "step": 12048 + }, + { + "epoch": 2.1451210826210825, + "grad_norm": 0.740566074848175, + "learning_rate": 8.878368195793276e-05, + "loss": 0.688, + "step": 12049 + }, + { + "epoch": 2.1452991452991452, + "grad_norm": 0.7771985530853271, + "learning_rate": 8.876977292252196e-05, + "loss": 1.0013, + "step": 12050 + }, + { + "epoch": 2.145477207977208, + "grad_norm": 0.8582369685173035, + "learning_rate": 8.875586410714409e-05, + "loss": 0.9185, + "step": 12051 + }, + { + "epoch": 2.1456552706552707, + "grad_norm": 0.7992526292800903, + "learning_rate": 8.874195551207174e-05, + "loss": 0.8388, + "step": 12052 + }, + { + "epoch": 2.1458333333333335, + "grad_norm": 0.795129120349884, + "learning_rate": 8.872804713757735e-05, + "loss": 0.88, + "step": 12053 + }, + { + "epoch": 2.146011396011396, + "grad_norm": 0.7467540502548218, + "learning_rate": 8.871413898393351e-05, + "loss": 0.8092, + "step": 12054 + }, + { + "epoch": 
2.1461894586894585, + "grad_norm": 0.9468266367912292, + "learning_rate": 8.870023105141264e-05, + "loss": 0.7759, + "step": 12055 + }, + { + "epoch": 2.1463675213675213, + "grad_norm": 0.7893772721290588, + "learning_rate": 8.868632334028727e-05, + "loss": 0.8508, + "step": 12056 + }, + { + "epoch": 2.146545584045584, + "grad_norm": 0.6931375861167908, + "learning_rate": 8.867241585082988e-05, + "loss": 0.5013, + "step": 12057 + }, + { + "epoch": 2.146723646723647, + "grad_norm": 0.8978447318077087, + "learning_rate": 8.865850858331301e-05, + "loss": 0.9518, + "step": 12058 + }, + { + "epoch": 2.1469017094017095, + "grad_norm": 0.7293453812599182, + "learning_rate": 8.864460153800906e-05, + "loss": 0.731, + "step": 12059 + }, + { + "epoch": 2.1470797720797723, + "grad_norm": 0.8537824749946594, + "learning_rate": 8.863069471519056e-05, + "loss": 0.7935, + "step": 12060 + }, + { + "epoch": 2.1472578347578346, + "grad_norm": 0.6527614593505859, + "learning_rate": 8.861678811513002e-05, + "loss": 0.6579, + "step": 12061 + }, + { + "epoch": 2.1474358974358974, + "grad_norm": 0.9407904148101807, + "learning_rate": 8.860288173809983e-05, + "loss": 0.9057, + "step": 12062 + }, + { + "epoch": 2.14761396011396, + "grad_norm": 0.9314194321632385, + "learning_rate": 8.858897558437251e-05, + "loss": 1.0826, + "step": 12063 + }, + { + "epoch": 2.147792022792023, + "grad_norm": 0.7872337102890015, + "learning_rate": 8.85750696542205e-05, + "loss": 0.8919, + "step": 12064 + }, + { + "epoch": 2.1479700854700856, + "grad_norm": 0.8379341959953308, + "learning_rate": 8.85611639479163e-05, + "loss": 0.8193, + "step": 12065 + }, + { + "epoch": 2.148148148148148, + "grad_norm": 0.801295280456543, + "learning_rate": 8.85472584657323e-05, + "loss": 0.9305, + "step": 12066 + }, + { + "epoch": 2.1483262108262107, + "grad_norm": 0.7625086903572083, + "learning_rate": 8.853335320794098e-05, + "loss": 0.8379, + "step": 12067 + }, + { + "epoch": 2.1485042735042734, + "grad_norm": 
0.8256231546401978, + "learning_rate": 8.851944817481478e-05, + "loss": 0.8901, + "step": 12068 + }, + { + "epoch": 2.148682336182336, + "grad_norm": 0.6940581202507019, + "learning_rate": 8.850554336662618e-05, + "loss": 0.6706, + "step": 12069 + }, + { + "epoch": 2.148860398860399, + "grad_norm": 0.910836398601532, + "learning_rate": 8.849163878364755e-05, + "loss": 0.9326, + "step": 12070 + }, + { + "epoch": 2.1490384615384617, + "grad_norm": 0.8550460934638977, + "learning_rate": 8.847773442615138e-05, + "loss": 0.8474, + "step": 12071 + }, + { + "epoch": 2.1492165242165244, + "grad_norm": 0.8178627490997314, + "learning_rate": 8.846383029441002e-05, + "loss": 0.8331, + "step": 12072 + }, + { + "epoch": 2.1493945868945867, + "grad_norm": 0.7606281638145447, + "learning_rate": 8.844992638869599e-05, + "loss": 0.6571, + "step": 12073 + }, + { + "epoch": 2.1495726495726495, + "grad_norm": 0.7166888117790222, + "learning_rate": 8.84360227092816e-05, + "loss": 0.8592, + "step": 12074 + }, + { + "epoch": 2.1497507122507122, + "grad_norm": 0.7688186764717102, + "learning_rate": 8.84221192564394e-05, + "loss": 0.691, + "step": 12075 + }, + { + "epoch": 2.149928774928775, + "grad_norm": 0.876740038394928, + "learning_rate": 8.840821603044166e-05, + "loss": 0.9962, + "step": 12076 + }, + { + "epoch": 2.1501068376068377, + "grad_norm": 0.7910363078117371, + "learning_rate": 8.839431303156087e-05, + "loss": 0.8061, + "step": 12077 + }, + { + "epoch": 2.1502849002849005, + "grad_norm": 0.6880493760108948, + "learning_rate": 8.83804102600694e-05, + "loss": 0.8078, + "step": 12078 + }, + { + "epoch": 2.150462962962963, + "grad_norm": 0.7795937061309814, + "learning_rate": 8.836650771623963e-05, + "loss": 0.8504, + "step": 12079 + }, + { + "epoch": 2.1506410256410255, + "grad_norm": 0.7761844992637634, + "learning_rate": 8.835260540034403e-05, + "loss": 0.7253, + "step": 12080 + }, + { + "epoch": 2.1508190883190883, + "grad_norm": 0.7070515751838684, + "learning_rate": 
8.83387033126549e-05, + "loss": 0.7156, + "step": 12081 + }, + { + "epoch": 2.150997150997151, + "grad_norm": 0.7666274309158325, + "learning_rate": 8.832480145344467e-05, + "loss": 0.813, + "step": 12082 + }, + { + "epoch": 2.151175213675214, + "grad_norm": 0.9145975708961487, + "learning_rate": 8.831089982298568e-05, + "loss": 0.8889, + "step": 12083 + }, + { + "epoch": 2.1513532763532766, + "grad_norm": 0.7735843062400818, + "learning_rate": 8.829699842155035e-05, + "loss": 0.7152, + "step": 12084 + }, + { + "epoch": 2.151531339031339, + "grad_norm": 0.7625414729118347, + "learning_rate": 8.828309724941099e-05, + "loss": 0.8752, + "step": 12085 + }, + { + "epoch": 2.1517094017094016, + "grad_norm": 0.8874264359474182, + "learning_rate": 8.826919630684005e-05, + "loss": 0.8175, + "step": 12086 + }, + { + "epoch": 2.1518874643874644, + "grad_norm": 0.7425693273544312, + "learning_rate": 8.82552955941098e-05, + "loss": 0.7505, + "step": 12087 + }, + { + "epoch": 2.152065527065527, + "grad_norm": 0.7098270058631897, + "learning_rate": 8.824139511149265e-05, + "loss": 0.7129, + "step": 12088 + }, + { + "epoch": 2.15224358974359, + "grad_norm": 0.8470510840415955, + "learning_rate": 8.822749485926092e-05, + "loss": 0.9656, + "step": 12089 + }, + { + "epoch": 2.1524216524216526, + "grad_norm": 0.7690402865409851, + "learning_rate": 8.8213594837687e-05, + "loss": 0.9436, + "step": 12090 + }, + { + "epoch": 2.152599715099715, + "grad_norm": 0.77431321144104, + "learning_rate": 8.819969504704318e-05, + "loss": 0.9912, + "step": 12091 + }, + { + "epoch": 2.1527777777777777, + "grad_norm": 0.7590892314910889, + "learning_rate": 8.818579548760184e-05, + "loss": 0.7412, + "step": 12092 + }, + { + "epoch": 2.1529558404558404, + "grad_norm": 0.870966374874115, + "learning_rate": 8.817189615963528e-05, + "loss": 1.0248, + "step": 12093 + }, + { + "epoch": 2.153133903133903, + "grad_norm": 0.7989356517791748, + "learning_rate": 8.815799706341587e-05, + "loss": 0.7104, + "step": 
12094 + }, + { + "epoch": 2.153311965811966, + "grad_norm": 0.9615582227706909, + "learning_rate": 8.814409819921589e-05, + "loss": 1.0191, + "step": 12095 + }, + { + "epoch": 2.1534900284900287, + "grad_norm": 0.7063159346580505, + "learning_rate": 8.81301995673077e-05, + "loss": 0.8209, + "step": 12096 + }, + { + "epoch": 2.153668091168091, + "grad_norm": 0.8179874420166016, + "learning_rate": 8.811630116796356e-05, + "loss": 0.9457, + "step": 12097 + }, + { + "epoch": 2.1538461538461537, + "grad_norm": 0.7227353453636169, + "learning_rate": 8.810240300145582e-05, + "loss": 0.8112, + "step": 12098 + }, + { + "epoch": 2.1540242165242165, + "grad_norm": 0.7480359077453613, + "learning_rate": 8.808850506805677e-05, + "loss": 0.6293, + "step": 12099 + }, + { + "epoch": 2.1542022792022792, + "grad_norm": 0.7610893845558167, + "learning_rate": 8.807460736803871e-05, + "loss": 0.911, + "step": 12100 + }, + { + "epoch": 2.154380341880342, + "grad_norm": 0.774640679359436, + "learning_rate": 8.806070990167399e-05, + "loss": 0.8144, + "step": 12101 + }, + { + "epoch": 2.1545584045584047, + "grad_norm": 0.7785552144050598, + "learning_rate": 8.804681266923482e-05, + "loss": 0.8841, + "step": 12102 + }, + { + "epoch": 2.154736467236467, + "grad_norm": 0.843715488910675, + "learning_rate": 8.803291567099354e-05, + "loss": 0.9056, + "step": 12103 + }, + { + "epoch": 2.15491452991453, + "grad_norm": 0.7996346354484558, + "learning_rate": 8.801901890722241e-05, + "loss": 0.8916, + "step": 12104 + }, + { + "epoch": 2.1550925925925926, + "grad_norm": 0.9159125685691833, + "learning_rate": 8.800512237819376e-05, + "loss": 0.8225, + "step": 12105 + }, + { + "epoch": 2.1552706552706553, + "grad_norm": 0.8341643810272217, + "learning_rate": 8.799122608417976e-05, + "loss": 0.8702, + "step": 12106 + }, + { + "epoch": 2.155448717948718, + "grad_norm": 0.8075932264328003, + "learning_rate": 8.797733002545278e-05, + "loss": 0.9167, + "step": 12107 + }, + { + "epoch": 2.155626780626781, + 
"grad_norm": 0.8370183706283569, + "learning_rate": 8.7963434202285e-05, + "loss": 0.9213, + "step": 12108 + }, + { + "epoch": 2.155804843304843, + "grad_norm": 0.7500374913215637, + "learning_rate": 8.794953861494877e-05, + "loss": 0.7702, + "step": 12109 + }, + { + "epoch": 2.155982905982906, + "grad_norm": 0.7347766160964966, + "learning_rate": 8.793564326371626e-05, + "loss": 0.7057, + "step": 12110 + }, + { + "epoch": 2.1561609686609686, + "grad_norm": 0.754917562007904, + "learning_rate": 8.79217481488598e-05, + "loss": 0.8725, + "step": 12111 + }, + { + "epoch": 2.1563390313390314, + "grad_norm": 0.6942774057388306, + "learning_rate": 8.790785327065155e-05, + "loss": 0.85, + "step": 12112 + }, + { + "epoch": 2.156517094017094, + "grad_norm": 0.8082157969474792, + "learning_rate": 8.789395862936383e-05, + "loss": 1.1462, + "step": 12113 + }, + { + "epoch": 2.156695156695157, + "grad_norm": 0.898435652256012, + "learning_rate": 8.788006422526881e-05, + "loss": 0.8044, + "step": 12114 + }, + { + "epoch": 2.156873219373219, + "grad_norm": 0.9474737048149109, + "learning_rate": 8.786617005863879e-05, + "loss": 0.9089, + "step": 12115 + }, + { + "epoch": 2.157051282051282, + "grad_norm": 0.7898718118667603, + "learning_rate": 8.785227612974594e-05, + "loss": 0.9758, + "step": 12116 + }, + { + "epoch": 2.1572293447293447, + "grad_norm": 0.6734052300453186, + "learning_rate": 8.783838243886253e-05, + "loss": 0.7835, + "step": 12117 + }, + { + "epoch": 2.1574074074074074, + "grad_norm": 0.9381069540977478, + "learning_rate": 8.782448898626072e-05, + "loss": 0.7666, + "step": 12118 + }, + { + "epoch": 2.15758547008547, + "grad_norm": 0.8677506446838379, + "learning_rate": 8.781059577221276e-05, + "loss": 0.7442, + "step": 12119 + }, + { + "epoch": 2.157763532763533, + "grad_norm": 0.8244445323944092, + "learning_rate": 8.779670279699086e-05, + "loss": 0.8104, + "step": 12120 + }, + { + "epoch": 2.1579415954415953, + "grad_norm": 0.7984805703163147, + "learning_rate": 
8.77828100608672e-05, + "loss": 1.029, + "step": 12121 + }, + { + "epoch": 2.158119658119658, + "grad_norm": 0.7817366123199463, + "learning_rate": 8.776891756411405e-05, + "loss": 0.6797, + "step": 12122 + }, + { + "epoch": 2.1582977207977208, + "grad_norm": 0.7084082365036011, + "learning_rate": 8.77550253070035e-05, + "loss": 0.688, + "step": 12123 + }, + { + "epoch": 2.1584757834757835, + "grad_norm": 0.7659782767295837, + "learning_rate": 8.774113328980782e-05, + "loss": 0.8691, + "step": 12124 + }, + { + "epoch": 2.1586538461538463, + "grad_norm": 0.7010130286216736, + "learning_rate": 8.772724151279913e-05, + "loss": 0.7587, + "step": 12125 + }, + { + "epoch": 2.158831908831909, + "grad_norm": 0.8183525800704956, + "learning_rate": 8.771334997624973e-05, + "loss": 0.8696, + "step": 12126 + }, + { + "epoch": 2.1590099715099713, + "grad_norm": 0.7944908142089844, + "learning_rate": 8.769945868043164e-05, + "loss": 0.8625, + "step": 12127 + }, + { + "epoch": 2.159188034188034, + "grad_norm": 0.7710323333740234, + "learning_rate": 8.768556762561713e-05, + "loss": 0.7765, + "step": 12128 + }, + { + "epoch": 2.159366096866097, + "grad_norm": 0.7416872382164001, + "learning_rate": 8.767167681207833e-05, + "loss": 0.9151, + "step": 12129 + }, + { + "epoch": 2.1595441595441596, + "grad_norm": 0.9230012893676758, + "learning_rate": 8.765778624008744e-05, + "loss": 0.914, + "step": 12130 + }, + { + "epoch": 2.1597222222222223, + "grad_norm": 0.7468557357788086, + "learning_rate": 8.764389590991657e-05, + "loss": 0.8624, + "step": 12131 + }, + { + "epoch": 2.159900284900285, + "grad_norm": 0.7746220827102661, + "learning_rate": 8.763000582183791e-05, + "loss": 0.9683, + "step": 12132 + }, + { + "epoch": 2.1600783475783474, + "grad_norm": 0.8429577350616455, + "learning_rate": 8.761611597612356e-05, + "loss": 0.8808, + "step": 12133 + }, + { + "epoch": 2.16025641025641, + "grad_norm": 0.8117298483848572, + "learning_rate": 8.760222637304572e-05, + "loss": 0.7067, + 
"step": 12134 + }, + { + "epoch": 2.160434472934473, + "grad_norm": 0.7717329859733582, + "learning_rate": 8.758833701287647e-05, + "loss": 1.0001, + "step": 12135 + }, + { + "epoch": 2.1606125356125356, + "grad_norm": 0.8493856191635132, + "learning_rate": 8.7574447895888e-05, + "loss": 0.9883, + "step": 12136 + }, + { + "epoch": 2.1607905982905984, + "grad_norm": 0.8592587113380432, + "learning_rate": 8.75605590223524e-05, + "loss": 0.7437, + "step": 12137 + }, + { + "epoch": 2.160968660968661, + "grad_norm": 0.6487032771110535, + "learning_rate": 8.75466703925418e-05, + "loss": 0.6841, + "step": 12138 + }, + { + "epoch": 2.1611467236467234, + "grad_norm": 0.8449310660362244, + "learning_rate": 8.753278200672832e-05, + "loss": 0.9221, + "step": 12139 + }, + { + "epoch": 2.161324786324786, + "grad_norm": 0.9603136777877808, + "learning_rate": 8.751889386518407e-05, + "loss": 0.8664, + "step": 12140 + }, + { + "epoch": 2.161502849002849, + "grad_norm": 0.7288493514060974, + "learning_rate": 8.750500596818121e-05, + "loss": 0.5745, + "step": 12141 + }, + { + "epoch": 2.1616809116809117, + "grad_norm": 0.8626441955566406, + "learning_rate": 8.749111831599178e-05, + "loss": 0.8346, + "step": 12142 + }, + { + "epoch": 2.1618589743589745, + "grad_norm": 0.7634188532829285, + "learning_rate": 8.74772309088879e-05, + "loss": 0.7503, + "step": 12143 + }, + { + "epoch": 2.162037037037037, + "grad_norm": 0.8641456365585327, + "learning_rate": 8.746334374714167e-05, + "loss": 0.9033, + "step": 12144 + }, + { + "epoch": 2.1622150997150995, + "grad_norm": 0.8103315234184265, + "learning_rate": 8.744945683102517e-05, + "loss": 0.8181, + "step": 12145 + }, + { + "epoch": 2.1623931623931623, + "grad_norm": 1.2493078708648682, + "learning_rate": 8.743557016081047e-05, + "loss": 1.0308, + "step": 12146 + }, + { + "epoch": 2.162571225071225, + "grad_norm": 0.6447771191596985, + "learning_rate": 8.742168373676973e-05, + "loss": 0.6886, + "step": 12147 + }, + { + "epoch": 
2.1627492877492878, + "grad_norm": 0.90229332447052, + "learning_rate": 8.740779755917492e-05, + "loss": 1.0361, + "step": 12148 + }, + { + "epoch": 2.1629273504273505, + "grad_norm": 0.7414017915725708, + "learning_rate": 8.739391162829818e-05, + "loss": 0.781, + "step": 12149 + }, + { + "epoch": 2.1631054131054133, + "grad_norm": 0.8897294998168945, + "learning_rate": 8.738002594441154e-05, + "loss": 0.7712, + "step": 12150 + }, + { + "epoch": 2.1632834757834756, + "grad_norm": 0.8515656590461731, + "learning_rate": 8.73661405077871e-05, + "loss": 0.8843, + "step": 12151 + }, + { + "epoch": 2.1634615384615383, + "grad_norm": 0.7901699542999268, + "learning_rate": 8.735225531869686e-05, + "loss": 0.8588, + "step": 12152 + }, + { + "epoch": 2.163639601139601, + "grad_norm": 0.7262305021286011, + "learning_rate": 8.733837037741295e-05, + "loss": 0.9257, + "step": 12153 + }, + { + "epoch": 2.163817663817664, + "grad_norm": 1.1076871156692505, + "learning_rate": 8.732448568420732e-05, + "loss": 0.9817, + "step": 12154 + }, + { + "epoch": 2.1639957264957266, + "grad_norm": 0.8384785652160645, + "learning_rate": 8.731060123935209e-05, + "loss": 0.8024, + "step": 12155 + }, + { + "epoch": 2.1641737891737893, + "grad_norm": 0.8376259803771973, + "learning_rate": 8.729671704311924e-05, + "loss": 1.0299, + "step": 12156 + }, + { + "epoch": 2.164351851851852, + "grad_norm": 0.8248558044433594, + "learning_rate": 8.728283309578089e-05, + "loss": 0.9557, + "step": 12157 + }, + { + "epoch": 2.1645299145299144, + "grad_norm": 0.7452875375747681, + "learning_rate": 8.726894939760894e-05, + "loss": 0.8267, + "step": 12158 + }, + { + "epoch": 2.164707977207977, + "grad_norm": 0.8329267501831055, + "learning_rate": 8.72550659488755e-05, + "loss": 0.9288, + "step": 12159 + }, + { + "epoch": 2.16488603988604, + "grad_norm": 0.8748268485069275, + "learning_rate": 8.724118274985259e-05, + "loss": 0.8663, + "step": 12160 + }, + { + "epoch": 2.1650641025641026, + "grad_norm": 
0.6839116811752319, + "learning_rate": 8.722729980081217e-05, + "loss": 0.6067, + "step": 12161 + }, + { + "epoch": 2.1652421652421654, + "grad_norm": 0.8343674540519714, + "learning_rate": 8.721341710202632e-05, + "loss": 0.9611, + "step": 12162 + }, + { + "epoch": 2.1654202279202277, + "grad_norm": 0.7783843874931335, + "learning_rate": 8.719953465376695e-05, + "loss": 0.8921, + "step": 12163 + }, + { + "epoch": 2.1655982905982905, + "grad_norm": 0.8357030749320984, + "learning_rate": 8.718565245630615e-05, + "loss": 0.9189, + "step": 12164 + }, + { + "epoch": 2.165776353276353, + "grad_norm": 0.8150131702423096, + "learning_rate": 8.717177050991582e-05, + "loss": 0.7486, + "step": 12165 + }, + { + "epoch": 2.165954415954416, + "grad_norm": 0.7282506823539734, + "learning_rate": 8.715788881486807e-05, + "loss": 0.8894, + "step": 12166 + }, + { + "epoch": 2.1661324786324787, + "grad_norm": 0.9958226680755615, + "learning_rate": 8.714400737143475e-05, + "loss": 1.0359, + "step": 12167 + }, + { + "epoch": 2.1663105413105415, + "grad_norm": 0.7162553071975708, + "learning_rate": 8.713012617988796e-05, + "loss": 0.7728, + "step": 12168 + }, + { + "epoch": 2.166488603988604, + "grad_norm": 0.6364821791648865, + "learning_rate": 8.711624524049955e-05, + "loss": 0.5881, + "step": 12169 + }, + { + "epoch": 2.1666666666666665, + "grad_norm": 0.9431148767471313, + "learning_rate": 8.710236455354159e-05, + "loss": 0.8804, + "step": 12170 + }, + { + "epoch": 2.1668447293447293, + "grad_norm": 0.7328855395317078, + "learning_rate": 8.708848411928598e-05, + "loss": 0.7762, + "step": 12171 + }, + { + "epoch": 2.167022792022792, + "grad_norm": 0.7855633497238159, + "learning_rate": 8.707460393800472e-05, + "loss": 0.7687, + "step": 12172 + }, + { + "epoch": 2.1672008547008548, + "grad_norm": 0.8694273233413696, + "learning_rate": 8.706072400996973e-05, + "loss": 0.7153, + "step": 12173 + }, + { + "epoch": 2.1673789173789175, + "grad_norm": 0.7371255159378052, + "learning_rate": 
8.704684433545299e-05, + "loss": 0.7901, + "step": 12174 + }, + { + "epoch": 2.16755698005698, + "grad_norm": 0.7719849944114685, + "learning_rate": 8.70329649147264e-05, + "loss": 0.7569, + "step": 12175 + }, + { + "epoch": 2.1677350427350426, + "grad_norm": 0.883618175983429, + "learning_rate": 8.701908574806197e-05, + "loss": 0.8941, + "step": 12176 + }, + { + "epoch": 2.1679131054131053, + "grad_norm": 0.9455791711807251, + "learning_rate": 8.700520683573155e-05, + "loss": 0.8596, + "step": 12177 + }, + { + "epoch": 2.168091168091168, + "grad_norm": 0.7487229108810425, + "learning_rate": 8.69913281780071e-05, + "loss": 0.7353, + "step": 12178 + }, + { + "epoch": 2.168269230769231, + "grad_norm": 0.8050364255905151, + "learning_rate": 8.697744977516062e-05, + "loss": 0.8564, + "step": 12179 + }, + { + "epoch": 2.1684472934472936, + "grad_norm": 0.759355902671814, + "learning_rate": 8.69635716274639e-05, + "loss": 0.7128, + "step": 12180 + }, + { + "epoch": 2.1686253561253563, + "grad_norm": 0.8730760216712952, + "learning_rate": 8.694969373518892e-05, + "loss": 0.9944, + "step": 12181 + }, + { + "epoch": 2.1688034188034186, + "grad_norm": 0.7761130332946777, + "learning_rate": 8.693581609860756e-05, + "loss": 0.6845, + "step": 12182 + }, + { + "epoch": 2.1689814814814814, + "grad_norm": 0.8118788003921509, + "learning_rate": 8.692193871799181e-05, + "loss": 0.798, + "step": 12183 + }, + { + "epoch": 2.169159544159544, + "grad_norm": 0.8340219855308533, + "learning_rate": 8.690806159361344e-05, + "loss": 0.9754, + "step": 12184 + }, + { + "epoch": 2.169337606837607, + "grad_norm": 0.7515831589698792, + "learning_rate": 8.689418472574444e-05, + "loss": 0.998, + "step": 12185 + }, + { + "epoch": 2.1695156695156697, + "grad_norm": 0.7781083583831787, + "learning_rate": 8.688030811465665e-05, + "loss": 1.0152, + "step": 12186 + }, + { + "epoch": 2.169693732193732, + "grad_norm": 0.775097131729126, + "learning_rate": 8.6866431760622e-05, + "loss": 0.808, + "step": 
12187 + }, + { + "epoch": 2.1698717948717947, + "grad_norm": 0.8288158178329468, + "learning_rate": 8.68525556639123e-05, + "loss": 0.9172, + "step": 12188 + }, + { + "epoch": 2.1700498575498575, + "grad_norm": 0.7754917740821838, + "learning_rate": 8.68386798247995e-05, + "loss": 0.719, + "step": 12189 + }, + { + "epoch": 2.17022792022792, + "grad_norm": 0.786685585975647, + "learning_rate": 8.682480424355539e-05, + "loss": 0.8883, + "step": 12190 + }, + { + "epoch": 2.170405982905983, + "grad_norm": 0.9970952272415161, + "learning_rate": 8.681092892045189e-05, + "loss": 0.9258, + "step": 12191 + }, + { + "epoch": 2.1705840455840457, + "grad_norm": 0.9732664227485657, + "learning_rate": 8.679705385576082e-05, + "loss": 0.8916, + "step": 12192 + }, + { + "epoch": 2.1707621082621085, + "grad_norm": 0.8557142615318298, + "learning_rate": 8.67831790497541e-05, + "loss": 0.8908, + "step": 12193 + }, + { + "epoch": 2.1709401709401708, + "grad_norm": 0.8564930558204651, + "learning_rate": 8.676930450270347e-05, + "loss": 1.054, + "step": 12194 + }, + { + "epoch": 2.1711182336182335, + "grad_norm": 0.785732090473175, + "learning_rate": 8.675543021488087e-05, + "loss": 0.7459, + "step": 12195 + }, + { + "epoch": 2.1712962962962963, + "grad_norm": 0.8739910125732422, + "learning_rate": 8.674155618655809e-05, + "loss": 0.8464, + "step": 12196 + }, + { + "epoch": 2.171474358974359, + "grad_norm": 0.8624834418296814, + "learning_rate": 8.672768241800699e-05, + "loss": 0.9405, + "step": 12197 + }, + { + "epoch": 2.171652421652422, + "grad_norm": 0.7948583364486694, + "learning_rate": 8.671380890949936e-05, + "loss": 1.0271, + "step": 12198 + }, + { + "epoch": 2.1718304843304845, + "grad_norm": 0.8078029155731201, + "learning_rate": 8.669993566130704e-05, + "loss": 0.6845, + "step": 12199 + }, + { + "epoch": 2.172008547008547, + "grad_norm": 0.7599586844444275, + "learning_rate": 8.668606267370187e-05, + "loss": 0.8438, + "step": 12200 + }, + { + "epoch": 2.1721866096866096, + 
"grad_norm": 0.8085161447525024, + "learning_rate": 8.667218994695562e-05, + "loss": 0.8398, + "step": 12201 + }, + { + "epoch": 2.1723646723646723, + "grad_norm": 0.9033090472221375, + "learning_rate": 8.665831748134019e-05, + "loss": 0.8591, + "step": 12202 + }, + { + "epoch": 2.172542735042735, + "grad_norm": 0.8638277649879456, + "learning_rate": 8.664444527712726e-05, + "loss": 0.8276, + "step": 12203 + }, + { + "epoch": 2.172720797720798, + "grad_norm": 0.8230745792388916, + "learning_rate": 8.663057333458871e-05, + "loss": 0.8663, + "step": 12204 + }, + { + "epoch": 2.1728988603988606, + "grad_norm": 0.8588439226150513, + "learning_rate": 8.661670165399626e-05, + "loss": 0.8543, + "step": 12205 + }, + { + "epoch": 2.173076923076923, + "grad_norm": 0.8461976051330566, + "learning_rate": 8.660283023562177e-05, + "loss": 0.8973, + "step": 12206 + }, + { + "epoch": 2.1732549857549857, + "grad_norm": 0.7768828868865967, + "learning_rate": 8.658895907973697e-05, + "loss": 0.7285, + "step": 12207 + }, + { + "epoch": 2.1734330484330484, + "grad_norm": 0.7384130954742432, + "learning_rate": 8.65750881866137e-05, + "loss": 0.7654, + "step": 12208 + }, + { + "epoch": 2.173611111111111, + "grad_norm": 0.8700957298278809, + "learning_rate": 8.656121755652365e-05, + "loss": 1.067, + "step": 12209 + }, + { + "epoch": 2.173789173789174, + "grad_norm": 0.8067826628684998, + "learning_rate": 8.654734718973863e-05, + "loss": 0.9863, + "step": 12210 + }, + { + "epoch": 2.1739672364672367, + "grad_norm": 0.7515989542007446, + "learning_rate": 8.653347708653039e-05, + "loss": 0.8434, + "step": 12211 + }, + { + "epoch": 2.174145299145299, + "grad_norm": 0.8280966877937317, + "learning_rate": 8.651960724717072e-05, + "loss": 1.0065, + "step": 12212 + }, + { + "epoch": 2.1743233618233617, + "grad_norm": 0.7988734841346741, + "learning_rate": 8.650573767193132e-05, + "loss": 0.7892, + "step": 12213 + }, + { + "epoch": 2.1745014245014245, + "grad_norm": 0.785323977470398, + 
"learning_rate": 8.649186836108399e-05, + "loss": 0.7563, + "step": 12214 + }, + { + "epoch": 2.1746794871794872, + "grad_norm": 0.7884892821311951, + "learning_rate": 8.64779993149004e-05, + "loss": 0.7225, + "step": 12215 + }, + { + "epoch": 2.17485754985755, + "grad_norm": 0.9376154541969299, + "learning_rate": 8.646413053365235e-05, + "loss": 0.8103, + "step": 12216 + }, + { + "epoch": 2.1750356125356127, + "grad_norm": 0.9680297374725342, + "learning_rate": 8.64502620176115e-05, + "loss": 0.8924, + "step": 12217 + }, + { + "epoch": 2.175213675213675, + "grad_norm": 1.016848087310791, + "learning_rate": 8.643639376704964e-05, + "loss": 0.9017, + "step": 12218 + }, + { + "epoch": 2.175391737891738, + "grad_norm": 0.790868878364563, + "learning_rate": 8.64225257822385e-05, + "loss": 0.7597, + "step": 12219 + }, + { + "epoch": 2.1755698005698005, + "grad_norm": 0.7539415955543518, + "learning_rate": 8.640865806344974e-05, + "loss": 0.7801, + "step": 12220 + }, + { + "epoch": 2.1757478632478633, + "grad_norm": 0.7288404703140259, + "learning_rate": 8.63947906109551e-05, + "loss": 0.6753, + "step": 12221 + }, + { + "epoch": 2.175925925925926, + "grad_norm": 0.8449869155883789, + "learning_rate": 8.638092342502623e-05, + "loss": 0.7165, + "step": 12222 + }, + { + "epoch": 2.176103988603989, + "grad_norm": 0.8210735321044922, + "learning_rate": 8.636705650593495e-05, + "loss": 0.8677, + "step": 12223 + }, + { + "epoch": 2.176282051282051, + "grad_norm": 0.7431774735450745, + "learning_rate": 8.635318985395284e-05, + "loss": 0.7914, + "step": 12224 + }, + { + "epoch": 2.176460113960114, + "grad_norm": 0.8223997354507446, + "learning_rate": 8.633932346935165e-05, + "loss": 0.7243, + "step": 12225 + }, + { + "epoch": 2.1766381766381766, + "grad_norm": 1.0101778507232666, + "learning_rate": 8.632545735240299e-05, + "loss": 0.8608, + "step": 12226 + }, + { + "epoch": 2.1768162393162394, + "grad_norm": 0.7270255088806152, + "learning_rate": 8.631159150337862e-05, + "loss": 
0.8699, + "step": 12227 + }, + { + "epoch": 2.176994301994302, + "grad_norm": 0.8687323331832886, + "learning_rate": 8.629772592255016e-05, + "loss": 0.9168, + "step": 12228 + }, + { + "epoch": 2.177172364672365, + "grad_norm": 0.7623698115348816, + "learning_rate": 8.628386061018934e-05, + "loss": 0.9012, + "step": 12229 + }, + { + "epoch": 2.177350427350427, + "grad_norm": 0.7458708882331848, + "learning_rate": 8.626999556656771e-05, + "loss": 0.9068, + "step": 12230 + }, + { + "epoch": 2.17752849002849, + "grad_norm": 0.8262876868247986, + "learning_rate": 8.625613079195704e-05, + "loss": 0.9425, + "step": 12231 + }, + { + "epoch": 2.1777065527065527, + "grad_norm": 0.8737035989761353, + "learning_rate": 8.624226628662893e-05, + "loss": 0.9943, + "step": 12232 + }, + { + "epoch": 2.1778846153846154, + "grad_norm": 0.8250965476036072, + "learning_rate": 8.622840205085505e-05, + "loss": 0.9237, + "step": 12233 + }, + { + "epoch": 2.178062678062678, + "grad_norm": 0.8689019680023193, + "learning_rate": 8.621453808490699e-05, + "loss": 0.8544, + "step": 12234 + }, + { + "epoch": 2.178240740740741, + "grad_norm": 0.8672708868980408, + "learning_rate": 8.620067438905643e-05, + "loss": 0.7623, + "step": 12235 + }, + { + "epoch": 2.1784188034188032, + "grad_norm": 0.7077436447143555, + "learning_rate": 8.6186810963575e-05, + "loss": 0.7468, + "step": 12236 + }, + { + "epoch": 2.178596866096866, + "grad_norm": 0.838474452495575, + "learning_rate": 8.617294780873433e-05, + "loss": 0.9207, + "step": 12237 + }, + { + "epoch": 2.1787749287749287, + "grad_norm": 0.7240039110183716, + "learning_rate": 8.615908492480598e-05, + "loss": 0.8981, + "step": 12238 + }, + { + "epoch": 2.1789529914529915, + "grad_norm": 0.6995998620986938, + "learning_rate": 8.614522231206162e-05, + "loss": 0.7131, + "step": 12239 + }, + { + "epoch": 2.1791310541310542, + "grad_norm": 0.7011054158210754, + "learning_rate": 8.613135997077288e-05, + "loss": 0.8138, + "step": 12240 + }, + { + "epoch": 
2.179309116809117, + "grad_norm": 0.9815019369125366, + "learning_rate": 8.611749790121131e-05, + "loss": 0.9637, + "step": 12241 + }, + { + "epoch": 2.1794871794871793, + "grad_norm": 0.7523870468139648, + "learning_rate": 8.610363610364853e-05, + "loss": 0.7555, + "step": 12242 + }, + { + "epoch": 2.179665242165242, + "grad_norm": 0.8286668658256531, + "learning_rate": 8.608977457835612e-05, + "loss": 0.7911, + "step": 12243 + }, + { + "epoch": 2.179843304843305, + "grad_norm": 0.8183441758155823, + "learning_rate": 8.607591332560573e-05, + "loss": 0.793, + "step": 12244 + }, + { + "epoch": 2.1800213675213675, + "grad_norm": 0.7104299068450928, + "learning_rate": 8.606205234566885e-05, + "loss": 0.6856, + "step": 12245 + }, + { + "epoch": 2.1801994301994303, + "grad_norm": 0.871588945388794, + "learning_rate": 8.60481916388171e-05, + "loss": 0.8342, + "step": 12246 + }, + { + "epoch": 2.180377492877493, + "grad_norm": 0.8002356290817261, + "learning_rate": 8.603433120532206e-05, + "loss": 0.9451, + "step": 12247 + }, + { + "epoch": 2.1805555555555554, + "grad_norm": 0.8223865032196045, + "learning_rate": 8.602047104545532e-05, + "loss": 0.8446, + "step": 12248 + }, + { + "epoch": 2.180733618233618, + "grad_norm": 0.7381762266159058, + "learning_rate": 8.600661115948836e-05, + "loss": 0.7575, + "step": 12249 + }, + { + "epoch": 2.180911680911681, + "grad_norm": 0.8717563152313232, + "learning_rate": 8.599275154769284e-05, + "loss": 0.9615, + "step": 12250 + }, + { + "epoch": 2.1810897435897436, + "grad_norm": 0.7935179471969604, + "learning_rate": 8.597889221034022e-05, + "loss": 0.8603, + "step": 12251 + }, + { + "epoch": 2.1812678062678064, + "grad_norm": 0.6350329518318176, + "learning_rate": 8.596503314770208e-05, + "loss": 0.4981, + "step": 12252 + }, + { + "epoch": 2.181445868945869, + "grad_norm": 0.8739648461341858, + "learning_rate": 8.595117436004995e-05, + "loss": 0.7872, + "step": 12253 + }, + { + "epoch": 2.1816239316239314, + "grad_norm": 
0.8199412822723389, + "learning_rate": 8.593731584765542e-05, + "loss": 0.7905, + "step": 12254 + }, + { + "epoch": 2.181801994301994, + "grad_norm": 0.7289649844169617, + "learning_rate": 8.592345761078993e-05, + "loss": 0.6981, + "step": 12255 + }, + { + "epoch": 2.181980056980057, + "grad_norm": 0.8234626650810242, + "learning_rate": 8.590959964972506e-05, + "loss": 1.0442, + "step": 12256 + }, + { + "epoch": 2.1821581196581197, + "grad_norm": 0.7804498076438904, + "learning_rate": 8.589574196473229e-05, + "loss": 0.9386, + "step": 12257 + }, + { + "epoch": 2.1823361823361824, + "grad_norm": 0.9459218382835388, + "learning_rate": 8.588188455608317e-05, + "loss": 0.8075, + "step": 12258 + }, + { + "epoch": 2.182514245014245, + "grad_norm": 0.8133191466331482, + "learning_rate": 8.586802742404924e-05, + "loss": 1.0275, + "step": 12259 + }, + { + "epoch": 2.1826923076923075, + "grad_norm": 0.8302663564682007, + "learning_rate": 8.58541705689019e-05, + "loss": 0.9887, + "step": 12260 + }, + { + "epoch": 2.1828703703703702, + "grad_norm": 0.7839202284812927, + "learning_rate": 8.584031399091274e-05, + "loss": 1.0256, + "step": 12261 + }, + { + "epoch": 2.183048433048433, + "grad_norm": 0.8050578236579895, + "learning_rate": 8.582645769035319e-05, + "loss": 0.843, + "step": 12262 + }, + { + "epoch": 2.1832264957264957, + "grad_norm": 0.749110221862793, + "learning_rate": 8.581260166749477e-05, + "loss": 0.7683, + "step": 12263 + }, + { + "epoch": 2.1834045584045585, + "grad_norm": 0.7982701659202576, + "learning_rate": 8.579874592260894e-05, + "loss": 0.942, + "step": 12264 + }, + { + "epoch": 2.1835826210826212, + "grad_norm": 0.7571866512298584, + "learning_rate": 8.57848904559672e-05, + "loss": 0.8828, + "step": 12265 + }, + { + "epoch": 2.183760683760684, + "grad_norm": 0.7445113658905029, + "learning_rate": 8.577103526784098e-05, + "loss": 0.8869, + "step": 12266 + }, + { + "epoch": 2.1839387464387463, + "grad_norm": 0.7999380230903625, + "learning_rate": 
8.575718035850177e-05, + "loss": 0.9476, + "step": 12267 + }, + { + "epoch": 2.184116809116809, + "grad_norm": 0.7188777923583984, + "learning_rate": 8.574332572822103e-05, + "loss": 0.7961, + "step": 12268 + }, + { + "epoch": 2.184294871794872, + "grad_norm": 0.9545742869377136, + "learning_rate": 8.572947137727023e-05, + "loss": 0.8629, + "step": 12269 + }, + { + "epoch": 2.1844729344729346, + "grad_norm": 0.8066838979721069, + "learning_rate": 8.571561730592075e-05, + "loss": 0.7728, + "step": 12270 + }, + { + "epoch": 2.1846509971509973, + "grad_norm": 0.7819525003433228, + "learning_rate": 8.57017635144441e-05, + "loss": 0.9897, + "step": 12271 + }, + { + "epoch": 2.1848290598290596, + "grad_norm": 0.9249349236488342, + "learning_rate": 8.568791000311166e-05, + "loss": 0.8562, + "step": 12272 + }, + { + "epoch": 2.1850071225071224, + "grad_norm": 0.8118993043899536, + "learning_rate": 8.567405677219497e-05, + "loss": 0.819, + "step": 12273 + }, + { + "epoch": 2.185185185185185, + "grad_norm": 0.7858524322509766, + "learning_rate": 8.566020382196532e-05, + "loss": 1.0708, + "step": 12274 + }, + { + "epoch": 2.185363247863248, + "grad_norm": 1.0223300457000732, + "learning_rate": 8.564635115269422e-05, + "loss": 0.9929, + "step": 12275 + }, + { + "epoch": 2.1855413105413106, + "grad_norm": 0.7749526500701904, + "learning_rate": 8.5632498764653e-05, + "loss": 0.7555, + "step": 12276 + }, + { + "epoch": 2.1857193732193734, + "grad_norm": 0.8443665504455566, + "learning_rate": 8.561864665811313e-05, + "loss": 0.8488, + "step": 12277 + }, + { + "epoch": 2.185897435897436, + "grad_norm": 0.7482786178588867, + "learning_rate": 8.560479483334603e-05, + "loss": 0.6535, + "step": 12278 + }, + { + "epoch": 2.1860754985754984, + "grad_norm": 0.7981070876121521, + "learning_rate": 8.559094329062305e-05, + "loss": 0.7698, + "step": 12279 + }, + { + "epoch": 2.186253561253561, + "grad_norm": 0.7612428665161133, + "learning_rate": 8.557709203021564e-05, + "loss": 0.9086, + 
"step": 12280 + }, + { + "epoch": 2.186431623931624, + "grad_norm": 0.8246445059776306, + "learning_rate": 8.556324105239512e-05, + "loss": 0.9927, + "step": 12281 + }, + { + "epoch": 2.1866096866096867, + "grad_norm": 0.8902820348739624, + "learning_rate": 8.554939035743292e-05, + "loss": 0.8474, + "step": 12282 + }, + { + "epoch": 2.1867877492877494, + "grad_norm": 0.9992623329162598, + "learning_rate": 8.553553994560037e-05, + "loss": 1.0898, + "step": 12283 + }, + { + "epoch": 2.1869658119658117, + "grad_norm": 0.9124125838279724, + "learning_rate": 8.552168981716892e-05, + "loss": 0.9542, + "step": 12284 + }, + { + "epoch": 2.1871438746438745, + "grad_norm": 0.6818730235099792, + "learning_rate": 8.550783997240983e-05, + "loss": 0.678, + "step": 12285 + }, + { + "epoch": 2.1873219373219372, + "grad_norm": 0.8302112817764282, + "learning_rate": 8.549399041159455e-05, + "loss": 0.9955, + "step": 12286 + }, + { + "epoch": 2.1875, + "grad_norm": 0.840419352054596, + "learning_rate": 8.548014113499436e-05, + "loss": 0.9299, + "step": 12287 + }, + { + "epoch": 2.1876780626780628, + "grad_norm": 0.8317474722862244, + "learning_rate": 8.546629214288067e-05, + "loss": 0.754, + "step": 12288 + }, + { + "epoch": 2.1878561253561255, + "grad_norm": 0.879148542881012, + "learning_rate": 8.545244343552476e-05, + "loss": 0.9238, + "step": 12289 + }, + { + "epoch": 2.1880341880341883, + "grad_norm": 0.7899607419967651, + "learning_rate": 8.543859501319805e-05, + "loss": 0.8277, + "step": 12290 + }, + { + "epoch": 2.1882122507122506, + "grad_norm": 0.837785542011261, + "learning_rate": 8.542474687617176e-05, + "loss": 0.8801, + "step": 12291 + }, + { + "epoch": 2.1883903133903133, + "grad_norm": 0.856213390827179, + "learning_rate": 8.541089902471733e-05, + "loss": 0.7589, + "step": 12292 + }, + { + "epoch": 2.188568376068376, + "grad_norm": 0.7915818095207214, + "learning_rate": 8.539705145910599e-05, + "loss": 0.8071, + "step": 12293 + }, + { + "epoch": 2.188746438746439, + 
"grad_norm": 0.8266519904136658, + "learning_rate": 8.538320417960914e-05, + "loss": 0.9482, + "step": 12294 + }, + { + "epoch": 2.1889245014245016, + "grad_norm": 0.8505687713623047, + "learning_rate": 8.536935718649799e-05, + "loss": 0.9272, + "step": 12295 + }, + { + "epoch": 2.189102564102564, + "grad_norm": 0.7530698776245117, + "learning_rate": 8.535551048004394e-05, + "loss": 0.7908, + "step": 12296 + }, + { + "epoch": 2.1892806267806266, + "grad_norm": 0.7904362678527832, + "learning_rate": 8.534166406051818e-05, + "loss": 0.9771, + "step": 12297 + }, + { + "epoch": 2.1894586894586894, + "grad_norm": 0.7860299944877625, + "learning_rate": 8.532781792819209e-05, + "loss": 0.605, + "step": 12298 + }, + { + "epoch": 2.189636752136752, + "grad_norm": 0.7718655467033386, + "learning_rate": 8.531397208333695e-05, + "loss": 0.8844, + "step": 12299 + }, + { + "epoch": 2.189814814814815, + "grad_norm": 0.8069637417793274, + "learning_rate": 8.530012652622397e-05, + "loss": 0.8571, + "step": 12300 + }, + { + "epoch": 2.1899928774928776, + "grad_norm": 0.8557140231132507, + "learning_rate": 8.528628125712455e-05, + "loss": 0.7396, + "step": 12301 + }, + { + "epoch": 2.1901709401709404, + "grad_norm": 0.8547600507736206, + "learning_rate": 8.527243627630983e-05, + "loss": 0.8073, + "step": 12302 + }, + { + "epoch": 2.1903490028490027, + "grad_norm": 0.8217329382896423, + "learning_rate": 8.525859158405114e-05, + "loss": 0.9723, + "step": 12303 + }, + { + "epoch": 2.1905270655270654, + "grad_norm": 0.896946132183075, + "learning_rate": 8.524474718061972e-05, + "loss": 0.7896, + "step": 12304 + }, + { + "epoch": 2.190705128205128, + "grad_norm": 1.0846823453903198, + "learning_rate": 8.523090306628685e-05, + "loss": 0.7689, + "step": 12305 + }, + { + "epoch": 2.190883190883191, + "grad_norm": 0.7265166640281677, + "learning_rate": 8.521705924132373e-05, + "loss": 0.8451, + "step": 12306 + }, + { + "epoch": 2.1910612535612537, + "grad_norm": 0.8806917071342468, + 
"learning_rate": 8.520321570600162e-05, + "loss": 0.8769, + "step": 12307 + }, + { + "epoch": 2.191239316239316, + "grad_norm": 0.7528414130210876, + "learning_rate": 8.518937246059176e-05, + "loss": 0.7137, + "step": 12308 + }, + { + "epoch": 2.1914173789173788, + "grad_norm": 0.9313900470733643, + "learning_rate": 8.517552950536543e-05, + "loss": 0.933, + "step": 12309 + }, + { + "epoch": 2.1915954415954415, + "grad_norm": 0.8363727331161499, + "learning_rate": 8.516168684059375e-05, + "loss": 0.899, + "step": 12310 + }, + { + "epoch": 2.1917735042735043, + "grad_norm": 0.7939122915267944, + "learning_rate": 8.514784446654803e-05, + "loss": 1.0323, + "step": 12311 + }, + { + "epoch": 2.191951566951567, + "grad_norm": 0.8744710087776184, + "learning_rate": 8.51340023834994e-05, + "loss": 0.8738, + "step": 12312 + }, + { + "epoch": 2.1921296296296298, + "grad_norm": 0.779353678226471, + "learning_rate": 8.512016059171916e-05, + "loss": 0.7692, + "step": 12313 + }, + { + "epoch": 2.1923076923076925, + "grad_norm": 0.8578362464904785, + "learning_rate": 8.510631909147841e-05, + "loss": 1.0636, + "step": 12314 + }, + { + "epoch": 2.192485754985755, + "grad_norm": 0.7210206985473633, + "learning_rate": 8.509247788304846e-05, + "loss": 0.6342, + "step": 12315 + }, + { + "epoch": 2.1926638176638176, + "grad_norm": 0.7221980690956116, + "learning_rate": 8.50786369667004e-05, + "loss": 0.7022, + "step": 12316 + }, + { + "epoch": 2.1928418803418803, + "grad_norm": 0.7871465086936951, + "learning_rate": 8.506479634270544e-05, + "loss": 0.9349, + "step": 12317 + }, + { + "epoch": 2.193019943019943, + "grad_norm": 0.7396262884140015, + "learning_rate": 8.505095601133479e-05, + "loss": 0.8644, + "step": 12318 + }, + { + "epoch": 2.193198005698006, + "grad_norm": 0.7513349652290344, + "learning_rate": 8.503711597285959e-05, + "loss": 0.7881, + "step": 12319 + }, + { + "epoch": 2.1933760683760686, + "grad_norm": 0.7280148863792419, + "learning_rate": 8.502327622755106e-05, + 
"loss": 0.7115, + "step": 12320 + }, + { + "epoch": 2.193554131054131, + "grad_norm": 0.792238712310791, + "learning_rate": 8.500943677568028e-05, + "loss": 0.8193, + "step": 12321 + }, + { + "epoch": 2.1937321937321936, + "grad_norm": 0.8709526062011719, + "learning_rate": 8.499559761751847e-05, + "loss": 0.8957, + "step": 12322 + }, + { + "epoch": 2.1939102564102564, + "grad_norm": 0.6865217685699463, + "learning_rate": 8.498175875333674e-05, + "loss": 0.6853, + "step": 12323 + }, + { + "epoch": 2.194088319088319, + "grad_norm": 0.7797526121139526, + "learning_rate": 8.496792018340625e-05, + "loss": 0.8885, + "step": 12324 + }, + { + "epoch": 2.194266381766382, + "grad_norm": 0.8806295394897461, + "learning_rate": 8.495408190799814e-05, + "loss": 0.9322, + "step": 12325 + }, + { + "epoch": 2.1944444444444446, + "grad_norm": 0.8566734790802002, + "learning_rate": 8.494024392738355e-05, + "loss": 0.9363, + "step": 12326 + }, + { + "epoch": 2.194622507122507, + "grad_norm": 0.8058465123176575, + "learning_rate": 8.49264062418336e-05, + "loss": 0.9007, + "step": 12327 + }, + { + "epoch": 2.1948005698005697, + "grad_norm": 0.7895804643630981, + "learning_rate": 8.491256885161938e-05, + "loss": 0.8486, + "step": 12328 + }, + { + "epoch": 2.1949786324786325, + "grad_norm": 0.7626506686210632, + "learning_rate": 8.489873175701204e-05, + "loss": 0.8208, + "step": 12329 + }, + { + "epoch": 2.195156695156695, + "grad_norm": 0.8917649388313293, + "learning_rate": 8.488489495828272e-05, + "loss": 1.1036, + "step": 12330 + }, + { + "epoch": 2.195334757834758, + "grad_norm": 0.7614438533782959, + "learning_rate": 8.487105845570242e-05, + "loss": 0.7124, + "step": 12331 + }, + { + "epoch": 2.1955128205128207, + "grad_norm": 0.7697421312332153, + "learning_rate": 8.485722224954237e-05, + "loss": 0.8831, + "step": 12332 + }, + { + "epoch": 2.195690883190883, + "grad_norm": 0.7449761629104614, + "learning_rate": 8.484338634007354e-05, + "loss": 0.9115, + "step": 12333 + }, + { + 
"epoch": 2.1958689458689458, + "grad_norm": 0.7099741101264954, + "learning_rate": 8.482955072756709e-05, + "loss": 0.6907, + "step": 12334 + }, + { + "epoch": 2.1960470085470085, + "grad_norm": 0.7856435775756836, + "learning_rate": 8.481571541229406e-05, + "loss": 1.0565, + "step": 12335 + }, + { + "epoch": 2.1962250712250713, + "grad_norm": 0.8374622464179993, + "learning_rate": 8.48018803945256e-05, + "loss": 0.9191, + "step": 12336 + }, + { + "epoch": 2.196403133903134, + "grad_norm": 0.7530848383903503, + "learning_rate": 8.478804567453265e-05, + "loss": 0.6576, + "step": 12337 + }, + { + "epoch": 2.1965811965811968, + "grad_norm": 0.774861216545105, + "learning_rate": 8.477421125258637e-05, + "loss": 1.0258, + "step": 12338 + }, + { + "epoch": 2.196759259259259, + "grad_norm": 0.9623909592628479, + "learning_rate": 8.47603771289578e-05, + "loss": 1.0192, + "step": 12339 + }, + { + "epoch": 2.196937321937322, + "grad_norm": 0.8253501653671265, + "learning_rate": 8.474654330391797e-05, + "loss": 0.7823, + "step": 12340 + }, + { + "epoch": 2.1971153846153846, + "grad_norm": 0.8683596849441528, + "learning_rate": 8.473270977773797e-05, + "loss": 0.8002, + "step": 12341 + }, + { + "epoch": 2.1972934472934473, + "grad_norm": 0.9093332886695862, + "learning_rate": 8.471887655068877e-05, + "loss": 1.0315, + "step": 12342 + }, + { + "epoch": 2.19747150997151, + "grad_norm": 0.7313206791877747, + "learning_rate": 8.470504362304147e-05, + "loss": 0.8238, + "step": 12343 + }, + { + "epoch": 2.197649572649573, + "grad_norm": 0.8464672565460205, + "learning_rate": 8.469121099506703e-05, + "loss": 0.8104, + "step": 12344 + }, + { + "epoch": 2.197827635327635, + "grad_norm": 0.9213936924934387, + "learning_rate": 8.467737866703657e-05, + "loss": 0.9963, + "step": 12345 + }, + { + "epoch": 2.198005698005698, + "grad_norm": 0.8033352494239807, + "learning_rate": 8.466354663922099e-05, + "loss": 0.9788, + "step": 12346 + }, + { + "epoch": 2.1981837606837606, + "grad_norm": 
0.7210986018180847, + "learning_rate": 8.464971491189141e-05, + "loss": 0.7597, + "step": 12347 + }, + { + "epoch": 2.1983618233618234, + "grad_norm": 0.8128374814987183, + "learning_rate": 8.463588348531872e-05, + "loss": 0.9575, + "step": 12348 + }, + { + "epoch": 2.198539886039886, + "grad_norm": 0.7276061773300171, + "learning_rate": 8.4622052359774e-05, + "loss": 0.815, + "step": 12349 + }, + { + "epoch": 2.198717948717949, + "grad_norm": 0.7463665008544922, + "learning_rate": 8.46082215355282e-05, + "loss": 0.9782, + "step": 12350 + }, + { + "epoch": 2.198896011396011, + "grad_norm": 0.8288317918777466, + "learning_rate": 8.459439101285238e-05, + "loss": 0.8206, + "step": 12351 + }, + { + "epoch": 2.199074074074074, + "grad_norm": 0.8286055326461792, + "learning_rate": 8.458056079201742e-05, + "loss": 0.9819, + "step": 12352 + }, + { + "epoch": 2.1992521367521367, + "grad_norm": 0.8138381242752075, + "learning_rate": 8.456673087329436e-05, + "loss": 0.8565, + "step": 12353 + }, + { + "epoch": 2.1994301994301995, + "grad_norm": 0.9059311747550964, + "learning_rate": 8.455290125695412e-05, + "loss": 0.8727, + "step": 12354 + }, + { + "epoch": 2.199608262108262, + "grad_norm": 0.6138933300971985, + "learning_rate": 8.453907194326773e-05, + "loss": 0.5635, + "step": 12355 + }, + { + "epoch": 2.199786324786325, + "grad_norm": 0.870585560798645, + "learning_rate": 8.452524293250608e-05, + "loss": 0.7401, + "step": 12356 + }, + { + "epoch": 2.1999643874643873, + "grad_norm": 0.8393024802207947, + "learning_rate": 8.451141422494013e-05, + "loss": 1.0083, + "step": 12357 + }, + { + "epoch": 2.20014245014245, + "grad_norm": 0.7667146325111389, + "learning_rate": 8.449758582084091e-05, + "loss": 0.8915, + "step": 12358 + }, + { + "epoch": 2.2003205128205128, + "grad_norm": 1.0229144096374512, + "learning_rate": 8.448375772047923e-05, + "loss": 0.8879, + "step": 12359 + }, + { + "epoch": 2.2004985754985755, + "grad_norm": 0.7670294046401978, + "learning_rate": 
8.446992992412611e-05, + "loss": 0.8233, + "step": 12360 + }, + { + "epoch": 2.2006766381766383, + "grad_norm": 0.7110083103179932, + "learning_rate": 8.445610243205244e-05, + "loss": 0.6315, + "step": 12361 + }, + { + "epoch": 2.200854700854701, + "grad_norm": 0.7801400423049927, + "learning_rate": 8.444227524452918e-05, + "loss": 0.7758, + "step": 12362 + }, + { + "epoch": 2.2010327635327633, + "grad_norm": 0.8762022852897644, + "learning_rate": 8.44284483618272e-05, + "loss": 0.9308, + "step": 12363 + }, + { + "epoch": 2.201210826210826, + "grad_norm": 0.811890184879303, + "learning_rate": 8.441462178421742e-05, + "loss": 1.0322, + "step": 12364 + }, + { + "epoch": 2.201388888888889, + "grad_norm": 0.8128690719604492, + "learning_rate": 8.440079551197076e-05, + "loss": 1.0669, + "step": 12365 + }, + { + "epoch": 2.2015669515669516, + "grad_norm": 0.8925766348838806, + "learning_rate": 8.438696954535812e-05, + "loss": 0.8848, + "step": 12366 + }, + { + "epoch": 2.2017450142450143, + "grad_norm": 0.9104064106941223, + "learning_rate": 8.437314388465036e-05, + "loss": 0.8227, + "step": 12367 + }, + { + "epoch": 2.201923076923077, + "grad_norm": 0.7956777215003967, + "learning_rate": 8.43593185301184e-05, + "loss": 0.7616, + "step": 12368 + }, + { + "epoch": 2.2021011396011394, + "grad_norm": 0.7658423185348511, + "learning_rate": 8.434549348203309e-05, + "loss": 0.9406, + "step": 12369 + }, + { + "epoch": 2.202279202279202, + "grad_norm": 0.7650682926177979, + "learning_rate": 8.433166874066532e-05, + "loss": 0.9031, + "step": 12370 + }, + { + "epoch": 2.202457264957265, + "grad_norm": 0.8613301515579224, + "learning_rate": 8.431784430628594e-05, + "loss": 0.9184, + "step": 12371 + }, + { + "epoch": 2.2026353276353277, + "grad_norm": 0.8446599245071411, + "learning_rate": 8.430402017916586e-05, + "loss": 0.8639, + "step": 12372 + }, + { + "epoch": 2.2028133903133904, + "grad_norm": 0.8082340955734253, + "learning_rate": 8.429019635957585e-05, + "loss": 0.7365, + 
"step": 12373 + }, + { + "epoch": 2.202991452991453, + "grad_norm": 0.8843092918395996, + "learning_rate": 8.427637284778683e-05, + "loss": 0.8679, + "step": 12374 + }, + { + "epoch": 2.2031695156695155, + "grad_norm": 0.8475705981254578, + "learning_rate": 8.426254964406961e-05, + "loss": 0.6614, + "step": 12375 + }, + { + "epoch": 2.203347578347578, + "grad_norm": 0.9980667233467102, + "learning_rate": 8.424872674869507e-05, + "loss": 0.9103, + "step": 12376 + }, + { + "epoch": 2.203525641025641, + "grad_norm": 0.8033170104026794, + "learning_rate": 8.423490416193398e-05, + "loss": 0.7668, + "step": 12377 + }, + { + "epoch": 2.2037037037037037, + "grad_norm": 0.8275265097618103, + "learning_rate": 8.422108188405718e-05, + "loss": 0.7448, + "step": 12378 + }, + { + "epoch": 2.2038817663817665, + "grad_norm": 0.7622979283332825, + "learning_rate": 8.420725991533554e-05, + "loss": 0.8121, + "step": 12379 + }, + { + "epoch": 2.2040598290598292, + "grad_norm": 0.8580977320671082, + "learning_rate": 8.41934382560398e-05, + "loss": 0.8437, + "step": 12380 + }, + { + "epoch": 2.2042378917378915, + "grad_norm": 0.8443751931190491, + "learning_rate": 8.417961690644086e-05, + "loss": 0.971, + "step": 12381 + }, + { + "epoch": 2.2044159544159543, + "grad_norm": 0.782430112361908, + "learning_rate": 8.416579586680939e-05, + "loss": 0.8367, + "step": 12382 + }, + { + "epoch": 2.204594017094017, + "grad_norm": 0.8664544820785522, + "learning_rate": 8.415197513741633e-05, + "loss": 0.8288, + "step": 12383 + }, + { + "epoch": 2.20477207977208, + "grad_norm": 0.7207586169242859, + "learning_rate": 8.413815471853235e-05, + "loss": 0.8038, + "step": 12384 + }, + { + "epoch": 2.2049501424501425, + "grad_norm": 0.743195652961731, + "learning_rate": 8.412433461042828e-05, + "loss": 0.705, + "step": 12385 + }, + { + "epoch": 2.2051282051282053, + "grad_norm": 0.7891412377357483, + "learning_rate": 8.411051481337488e-05, + "loss": 0.9729, + "step": 12386 + }, + { + "epoch": 
2.205306267806268, + "grad_norm": 0.838847815990448, + "learning_rate": 8.4096695327643e-05, + "loss": 0.9053, + "step": 12387 + }, + { + "epoch": 2.2054843304843303, + "grad_norm": 0.7717056274414062, + "learning_rate": 8.408287615350328e-05, + "loss": 0.7388, + "step": 12388 + }, + { + "epoch": 2.205662393162393, + "grad_norm": 0.7209389209747314, + "learning_rate": 8.406905729122654e-05, + "loss": 0.6411, + "step": 12389 + }, + { + "epoch": 2.205840455840456, + "grad_norm": 0.822475790977478, + "learning_rate": 8.405523874108354e-05, + "loss": 0.9574, + "step": 12390 + }, + { + "epoch": 2.2060185185185186, + "grad_norm": 0.9401286840438843, + "learning_rate": 8.404142050334504e-05, + "loss": 0.8915, + "step": 12391 + }, + { + "epoch": 2.2061965811965814, + "grad_norm": 0.8247103691101074, + "learning_rate": 8.40276025782817e-05, + "loss": 0.8369, + "step": 12392 + }, + { + "epoch": 2.2063746438746437, + "grad_norm": 0.8082301020622253, + "learning_rate": 8.401378496616437e-05, + "loss": 0.9321, + "step": 12393 + }, + { + "epoch": 2.2065527065527064, + "grad_norm": 0.8156028389930725, + "learning_rate": 8.399996766726367e-05, + "loss": 0.7599, + "step": 12394 + }, + { + "epoch": 2.206730769230769, + "grad_norm": 0.7941898107528687, + "learning_rate": 8.398615068185038e-05, + "loss": 0.812, + "step": 12395 + }, + { + "epoch": 2.206908831908832, + "grad_norm": 0.7013470530509949, + "learning_rate": 8.397233401019518e-05, + "loss": 0.7914, + "step": 12396 + }, + { + "epoch": 2.2070868945868947, + "grad_norm": 0.6028649210929871, + "learning_rate": 8.395851765256881e-05, + "loss": 0.5787, + "step": 12397 + }, + { + "epoch": 2.2072649572649574, + "grad_norm": 0.9031504392623901, + "learning_rate": 8.3944701609242e-05, + "loss": 0.8677, + "step": 12398 + }, + { + "epoch": 2.20744301994302, + "grad_norm": 0.7370864748954773, + "learning_rate": 8.393088588048536e-05, + "loss": 0.9025, + "step": 12399 + }, + { + "epoch": 2.2076210826210825, + "grad_norm": 
0.7764220237731934, + "learning_rate": 8.391707046656968e-05, + "loss": 0.8805, + "step": 12400 + }, + { + "epoch": 2.2077991452991452, + "grad_norm": 0.7456721663475037, + "learning_rate": 8.390325536776553e-05, + "loss": 0.7739, + "step": 12401 + }, + { + "epoch": 2.207977207977208, + "grad_norm": 0.8032360076904297, + "learning_rate": 8.388944058434373e-05, + "loss": 0.9765, + "step": 12402 + }, + { + "epoch": 2.2081552706552707, + "grad_norm": 0.8502830266952515, + "learning_rate": 8.387562611657483e-05, + "loss": 0.9356, + "step": 12403 + }, + { + "epoch": 2.2083333333333335, + "grad_norm": 0.812216579914093, + "learning_rate": 8.386181196472956e-05, + "loss": 0.8846, + "step": 12404 + }, + { + "epoch": 2.208511396011396, + "grad_norm": 0.6996115446090698, + "learning_rate": 8.384799812907853e-05, + "loss": 0.7035, + "step": 12405 + }, + { + "epoch": 2.2086894586894585, + "grad_norm": 0.7909261584281921, + "learning_rate": 8.383418460989245e-05, + "loss": 0.8025, + "step": 12406 + }, + { + "epoch": 2.2088675213675213, + "grad_norm": 0.8278310894966125, + "learning_rate": 8.382037140744192e-05, + "loss": 0.7982, + "step": 12407 + }, + { + "epoch": 2.209045584045584, + "grad_norm": 0.7558199167251587, + "learning_rate": 8.380655852199763e-05, + "loss": 0.854, + "step": 12408 + }, + { + "epoch": 2.209223646723647, + "grad_norm": 0.8516034483909607, + "learning_rate": 8.379274595383016e-05, + "loss": 0.7497, + "step": 12409 + }, + { + "epoch": 2.2094017094017095, + "grad_norm": 0.777004599571228, + "learning_rate": 8.377893370321018e-05, + "loss": 0.797, + "step": 12410 + }, + { + "epoch": 2.2095797720797723, + "grad_norm": 0.8820251822471619, + "learning_rate": 8.376512177040829e-05, + "loss": 0.9229, + "step": 12411 + }, + { + "epoch": 2.2097578347578346, + "grad_norm": 0.8623200058937073, + "learning_rate": 8.375131015569514e-05, + "loss": 1.011, + "step": 12412 + }, + { + "epoch": 2.2099358974358974, + "grad_norm": 0.9192054271697998, + "learning_rate": 
8.373749885934127e-05, + "loss": 0.8711, + "step": 12413 + }, + { + "epoch": 2.21011396011396, + "grad_norm": 0.7627860903739929, + "learning_rate": 8.372368788161736e-05, + "loss": 0.5937, + "step": 12414 + }, + { + "epoch": 2.210292022792023, + "grad_norm": 0.74603670835495, + "learning_rate": 8.370987722279395e-05, + "loss": 0.8238, + "step": 12415 + }, + { + "epoch": 2.2104700854700856, + "grad_norm": 0.884469211101532, + "learning_rate": 8.369606688314165e-05, + "loss": 1.1957, + "step": 12416 + }, + { + "epoch": 2.210648148148148, + "grad_norm": 0.8145224452018738, + "learning_rate": 8.36822568629311e-05, + "loss": 0.8517, + "step": 12417 + }, + { + "epoch": 2.2108262108262107, + "grad_norm": 0.8167604207992554, + "learning_rate": 8.366844716243279e-05, + "loss": 0.9701, + "step": 12418 + }, + { + "epoch": 2.2110042735042734, + "grad_norm": 0.7668562531471252, + "learning_rate": 8.365463778191736e-05, + "loss": 1.0281, + "step": 12419 + }, + { + "epoch": 2.211182336182336, + "grad_norm": 0.8455148339271545, + "learning_rate": 8.364082872165532e-05, + "loss": 0.7812, + "step": 12420 + }, + { + "epoch": 2.211360398860399, + "grad_norm": 0.8756504654884338, + "learning_rate": 8.362701998191728e-05, + "loss": 0.779, + "step": 12421 + }, + { + "epoch": 2.2115384615384617, + "grad_norm": 0.8239594101905823, + "learning_rate": 8.361321156297374e-05, + "loss": 0.8581, + "step": 12422 + }, + { + "epoch": 2.2117165242165244, + "grad_norm": 0.7719405889511108, + "learning_rate": 8.359940346509533e-05, + "loss": 0.7593, + "step": 12423 + }, + { + "epoch": 2.2118945868945867, + "grad_norm": 0.8607308268547058, + "learning_rate": 8.358559568855249e-05, + "loss": 1.0618, + "step": 12424 + }, + { + "epoch": 2.2120726495726495, + "grad_norm": 0.750431478023529, + "learning_rate": 8.357178823361582e-05, + "loss": 0.7779, + "step": 12425 + }, + { + "epoch": 2.2122507122507122, + "grad_norm": 0.7770674824714661, + "learning_rate": 8.355798110055583e-05, + "loss": 0.6837, + 
"step": 12426 + }, + { + "epoch": 2.212428774928775, + "grad_norm": 0.7924200296401978, + "learning_rate": 8.354417428964307e-05, + "loss": 0.8092, + "step": 12427 + }, + { + "epoch": 2.2126068376068377, + "grad_norm": 0.7784677743911743, + "learning_rate": 8.3530367801148e-05, + "loss": 0.7168, + "step": 12428 + }, + { + "epoch": 2.2127849002849005, + "grad_norm": 1.0548151731491089, + "learning_rate": 8.351656163534121e-05, + "loss": 0.9286, + "step": 12429 + }, + { + "epoch": 2.212962962962963, + "grad_norm": 0.8983006477355957, + "learning_rate": 8.35027557924931e-05, + "loss": 0.878, + "step": 12430 + }, + { + "epoch": 2.2131410256410255, + "grad_norm": 0.8136780261993408, + "learning_rate": 8.348895027287424e-05, + "loss": 0.7901, + "step": 12431 + }, + { + "epoch": 2.2133190883190883, + "grad_norm": 0.8186678290367126, + "learning_rate": 8.347514507675508e-05, + "loss": 0.8994, + "step": 12432 + }, + { + "epoch": 2.213497150997151, + "grad_norm": 0.880790650844574, + "learning_rate": 8.346134020440617e-05, + "loss": 1.0681, + "step": 12433 + }, + { + "epoch": 2.213675213675214, + "grad_norm": 0.8061994910240173, + "learning_rate": 8.344753565609789e-05, + "loss": 0.8466, + "step": 12434 + }, + { + "epoch": 2.2138532763532766, + "grad_norm": 0.8041423559188843, + "learning_rate": 8.34337314321008e-05, + "loss": 0.897, + "step": 12435 + }, + { + "epoch": 2.214031339031339, + "grad_norm": 0.5797891616821289, + "learning_rate": 8.34199275326853e-05, + "loss": 0.4827, + "step": 12436 + }, + { + "epoch": 2.2142094017094016, + "grad_norm": 0.7373392581939697, + "learning_rate": 8.340612395812188e-05, + "loss": 0.779, + "step": 12437 + }, + { + "epoch": 2.2143874643874644, + "grad_norm": 0.7852202653884888, + "learning_rate": 8.339232070868102e-05, + "loss": 0.8001, + "step": 12438 + }, + { + "epoch": 2.214565527065527, + "grad_norm": 0.8209689259529114, + "learning_rate": 8.337851778463311e-05, + "loss": 0.7492, + "step": 12439 + }, + { + "epoch": 2.21474358974359, 
+ "grad_norm": 0.9393492937088013, + "learning_rate": 8.336471518624867e-05, + "loss": 0.884, + "step": 12440 + }, + { + "epoch": 2.2149216524216526, + "grad_norm": 0.6966122984886169, + "learning_rate": 8.3350912913798e-05, + "loss": 0.7364, + "step": 12441 + }, + { + "epoch": 2.215099715099715, + "grad_norm": 0.7379066944122314, + "learning_rate": 8.333711096755165e-05, + "loss": 0.7345, + "step": 12442 + }, + { + "epoch": 2.2152777777777777, + "grad_norm": 0.9011021256446838, + "learning_rate": 8.332330934777999e-05, + "loss": 0.8392, + "step": 12443 + }, + { + "epoch": 2.2154558404558404, + "grad_norm": 0.7718381285667419, + "learning_rate": 8.330950805475346e-05, + "loss": 0.9062, + "step": 12444 + }, + { + "epoch": 2.215633903133903, + "grad_norm": 0.8584564328193665, + "learning_rate": 8.329570708874241e-05, + "loss": 0.9612, + "step": 12445 + }, + { + "epoch": 2.215811965811966, + "grad_norm": 0.7711616158485413, + "learning_rate": 8.32819064500173e-05, + "loss": 0.731, + "step": 12446 + }, + { + "epoch": 2.2159900284900287, + "grad_norm": 0.8014609217643738, + "learning_rate": 8.326810613884849e-05, + "loss": 1.0128, + "step": 12447 + }, + { + "epoch": 2.216168091168091, + "grad_norm": 0.7837486863136292, + "learning_rate": 8.325430615550642e-05, + "loss": 0.8271, + "step": 12448 + }, + { + "epoch": 2.2163461538461537, + "grad_norm": 0.9399738907814026, + "learning_rate": 8.324050650026139e-05, + "loss": 1.0433, + "step": 12449 + }, + { + "epoch": 2.2165242165242165, + "grad_norm": 0.8302193284034729, + "learning_rate": 8.322670717338385e-05, + "loss": 1.0259, + "step": 12450 + }, + { + "epoch": 2.2167022792022792, + "grad_norm": 0.7707721590995789, + "learning_rate": 8.321290817514411e-05, + "loss": 0.6972, + "step": 12451 + }, + { + "epoch": 2.216880341880342, + "grad_norm": 0.5814536809921265, + "learning_rate": 8.319910950581261e-05, + "loss": 0.5846, + "step": 12452 + }, + { + "epoch": 2.2170584045584047, + "grad_norm": 0.8249124884605408, + 
"learning_rate": 8.318531116565962e-05, + "loss": 0.7417, + "step": 12453 + }, + { + "epoch": 2.217236467236467, + "grad_norm": 0.7116015553474426, + "learning_rate": 8.317151315495556e-05, + "loss": 0.8698, + "step": 12454 + }, + { + "epoch": 2.21741452991453, + "grad_norm": 0.8025332689285278, + "learning_rate": 8.31577154739707e-05, + "loss": 0.825, + "step": 12455 + }, + { + "epoch": 2.2175925925925926, + "grad_norm": 0.8962773680686951, + "learning_rate": 8.314391812297542e-05, + "loss": 0.9987, + "step": 12456 + }, + { + "epoch": 2.2177706552706553, + "grad_norm": 0.8446899652481079, + "learning_rate": 8.313012110224008e-05, + "loss": 0.8554, + "step": 12457 + }, + { + "epoch": 2.217948717948718, + "grad_norm": 0.7759326696395874, + "learning_rate": 8.311632441203494e-05, + "loss": 0.8206, + "step": 12458 + }, + { + "epoch": 2.218126780626781, + "grad_norm": 0.9782015085220337, + "learning_rate": 8.31025280526304e-05, + "loss": 0.8183, + "step": 12459 + }, + { + "epoch": 2.218304843304843, + "grad_norm": 0.7445226907730103, + "learning_rate": 8.308873202429666e-05, + "loss": 0.6819, + "step": 12460 + }, + { + "epoch": 2.218482905982906, + "grad_norm": 0.7613980770111084, + "learning_rate": 8.307493632730413e-05, + "loss": 0.6283, + "step": 12461 + }, + { + "epoch": 2.2186609686609686, + "grad_norm": 0.7437549829483032, + "learning_rate": 8.306114096192304e-05, + "loss": 0.7511, + "step": 12462 + }, + { + "epoch": 2.2188390313390314, + "grad_norm": 0.7600140571594238, + "learning_rate": 8.304734592842373e-05, + "loss": 0.8784, + "step": 12463 + }, + { + "epoch": 2.219017094017094, + "grad_norm": 0.9086898565292358, + "learning_rate": 8.303355122707644e-05, + "loss": 1.0818, + "step": 12464 + }, + { + "epoch": 2.219195156695157, + "grad_norm": 0.8674180507659912, + "learning_rate": 8.30197568581515e-05, + "loss": 0.8925, + "step": 12465 + }, + { + "epoch": 2.219373219373219, + "grad_norm": 0.893606960773468, + "learning_rate": 8.300596282191911e-05, + "loss": 
0.9382, + "step": 12466 + }, + { + "epoch": 2.219551282051282, + "grad_norm": 0.7664543390274048, + "learning_rate": 8.29921691186496e-05, + "loss": 0.7893, + "step": 12467 + }, + { + "epoch": 2.2197293447293447, + "grad_norm": 0.8730209469795227, + "learning_rate": 8.297837574861318e-05, + "loss": 1.0509, + "step": 12468 + }, + { + "epoch": 2.2199074074074074, + "grad_norm": 0.8138112425804138, + "learning_rate": 8.296458271208018e-05, + "loss": 0.784, + "step": 12469 + }, + { + "epoch": 2.22008547008547, + "grad_norm": 0.8362413644790649, + "learning_rate": 8.295079000932073e-05, + "loss": 1.0236, + "step": 12470 + }, + { + "epoch": 2.220263532763533, + "grad_norm": 0.8422487378120422, + "learning_rate": 8.293699764060518e-05, + "loss": 0.9677, + "step": 12471 + }, + { + "epoch": 2.2204415954415953, + "grad_norm": 0.7290427088737488, + "learning_rate": 8.292320560620369e-05, + "loss": 0.7514, + "step": 12472 + }, + { + "epoch": 2.220619658119658, + "grad_norm": 0.8083370923995972, + "learning_rate": 8.290941390638653e-05, + "loss": 0.8136, + "step": 12473 + }, + { + "epoch": 2.2207977207977208, + "grad_norm": 0.8045510053634644, + "learning_rate": 8.289562254142389e-05, + "loss": 0.6753, + "step": 12474 + }, + { + "epoch": 2.2209757834757835, + "grad_norm": 0.8019934892654419, + "learning_rate": 8.288183151158602e-05, + "loss": 0.8147, + "step": 12475 + }, + { + "epoch": 2.2211538461538463, + "grad_norm": 0.8129584193229675, + "learning_rate": 8.286804081714306e-05, + "loss": 0.9137, + "step": 12476 + }, + { + "epoch": 2.221331908831909, + "grad_norm": 0.9729450345039368, + "learning_rate": 8.285425045836526e-05, + "loss": 0.7884, + "step": 12477 + }, + { + "epoch": 2.2215099715099713, + "grad_norm": 0.755081295967102, + "learning_rate": 8.284046043552282e-05, + "loss": 0.8496, + "step": 12478 + }, + { + "epoch": 2.221688034188034, + "grad_norm": 0.725267767906189, + "learning_rate": 8.282667074888589e-05, + "loss": 0.7054, + "step": 12479 + }, + { + "epoch": 
2.221866096866097, + "grad_norm": 0.832098662853241, + "learning_rate": 8.281288139872472e-05, + "loss": 0.8729, + "step": 12480 + }, + { + "epoch": 2.2220441595441596, + "grad_norm": 0.9908086657524109, + "learning_rate": 8.27990923853094e-05, + "loss": 0.9106, + "step": 12481 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.8001172542572021, + "learning_rate": 8.278530370891013e-05, + "loss": 0.906, + "step": 12482 + }, + { + "epoch": 2.222400284900285, + "grad_norm": 0.7607424259185791, + "learning_rate": 8.277151536979709e-05, + "loss": 0.8125, + "step": 12483 + }, + { + "epoch": 2.2225783475783474, + "grad_norm": 0.7850996255874634, + "learning_rate": 8.275772736824042e-05, + "loss": 0.7017, + "step": 12484 + }, + { + "epoch": 2.22275641025641, + "grad_norm": 0.8376613855361938, + "learning_rate": 8.274393970451024e-05, + "loss": 1.0453, + "step": 12485 + }, + { + "epoch": 2.222934472934473, + "grad_norm": 0.7973353266716003, + "learning_rate": 8.273015237887673e-05, + "loss": 0.8337, + "step": 12486 + }, + { + "epoch": 2.2231125356125356, + "grad_norm": 0.7622607350349426, + "learning_rate": 8.271636539161e-05, + "loss": 0.8574, + "step": 12487 + }, + { + "epoch": 2.2232905982905984, + "grad_norm": 0.7839400768280029, + "learning_rate": 8.270257874298022e-05, + "loss": 0.8857, + "step": 12488 + }, + { + "epoch": 2.223468660968661, + "grad_norm": 0.7730473875999451, + "learning_rate": 8.268879243325743e-05, + "loss": 0.9578, + "step": 12489 + }, + { + "epoch": 2.2236467236467234, + "grad_norm": 0.7811899185180664, + "learning_rate": 8.267500646271184e-05, + "loss": 0.9469, + "step": 12490 + }, + { + "epoch": 2.223824786324786, + "grad_norm": 0.8570041060447693, + "learning_rate": 8.266122083161347e-05, + "loss": 0.8853, + "step": 12491 + }, + { + "epoch": 2.224002849002849, + "grad_norm": 0.7989770174026489, + "learning_rate": 8.264743554023248e-05, + "loss": 0.7467, + "step": 12492 + }, + { + "epoch": 2.2241809116809117, + "grad_norm": 
0.8287475109100342, + "learning_rate": 8.263365058883891e-05, + "loss": 0.9987, + "step": 12493 + }, + { + "epoch": 2.2243589743589745, + "grad_norm": 0.8879026174545288, + "learning_rate": 8.261986597770295e-05, + "loss": 0.9503, + "step": 12494 + }, + { + "epoch": 2.224537037037037, + "grad_norm": 0.8153596520423889, + "learning_rate": 8.260608170709456e-05, + "loss": 0.9715, + "step": 12495 + }, + { + "epoch": 2.2247150997150995, + "grad_norm": 0.8294584155082703, + "learning_rate": 8.259229777728384e-05, + "loss": 0.958, + "step": 12496 + }, + { + "epoch": 2.2248931623931623, + "grad_norm": 0.76850426197052, + "learning_rate": 8.257851418854093e-05, + "loss": 0.7666, + "step": 12497 + }, + { + "epoch": 2.225071225071225, + "grad_norm": 0.743966817855835, + "learning_rate": 8.256473094113582e-05, + "loss": 0.8893, + "step": 12498 + }, + { + "epoch": 2.2252492877492878, + "grad_norm": 0.7339308857917786, + "learning_rate": 8.255094803533863e-05, + "loss": 0.7317, + "step": 12499 + }, + { + "epoch": 2.2254273504273505, + "grad_norm": 1.0800104141235352, + "learning_rate": 8.253716547141932e-05, + "loss": 1.0147, + "step": 12500 + }, + { + "epoch": 2.2256054131054133, + "grad_norm": 0.8518815636634827, + "learning_rate": 8.252338324964802e-05, + "loss": 0.9695, + "step": 12501 + }, + { + "epoch": 2.2257834757834756, + "grad_norm": 0.8706745505332947, + "learning_rate": 8.250960137029469e-05, + "loss": 0.7735, + "step": 12502 + }, + { + "epoch": 2.2259615384615383, + "grad_norm": 1.0482546091079712, + "learning_rate": 8.24958198336294e-05, + "loss": 1.0882, + "step": 12503 + }, + { + "epoch": 2.226139601139601, + "grad_norm": 0.8025278449058533, + "learning_rate": 8.248203863992213e-05, + "loss": 0.8573, + "step": 12504 + }, + { + "epoch": 2.226317663817664, + "grad_norm": 0.8267400860786438, + "learning_rate": 8.246825778944297e-05, + "loss": 0.8609, + "step": 12505 + }, + { + "epoch": 2.2264957264957266, + "grad_norm": 0.703681230545044, + "learning_rate": 
8.245447728246184e-05, + "loss": 0.6934, + "step": 12506 + }, + { + "epoch": 2.2266737891737893, + "grad_norm": 0.807736873626709, + "learning_rate": 8.24406971192488e-05, + "loss": 0.7258, + "step": 12507 + }, + { + "epoch": 2.226851851851852, + "grad_norm": 0.7663748860359192, + "learning_rate": 8.24269173000738e-05, + "loss": 0.7825, + "step": 12508 + }, + { + "epoch": 2.2270299145299144, + "grad_norm": 0.7799240946769714, + "learning_rate": 8.24131378252069e-05, + "loss": 0.7868, + "step": 12509 + }, + { + "epoch": 2.227207977207977, + "grad_norm": 0.8309668302536011, + "learning_rate": 8.239935869491799e-05, + "loss": 0.7697, + "step": 12510 + }, + { + "epoch": 2.22738603988604, + "grad_norm": 0.7257094979286194, + "learning_rate": 8.23855799094771e-05, + "loss": 0.8168, + "step": 12511 + }, + { + "epoch": 2.2275641025641026, + "grad_norm": 0.8902100920677185, + "learning_rate": 8.237180146915416e-05, + "loss": 0.8606, + "step": 12512 + }, + { + "epoch": 2.2277421652421654, + "grad_norm": 0.8100315928459167, + "learning_rate": 8.235802337421919e-05, + "loss": 0.9225, + "step": 12513 + }, + { + "epoch": 2.2279202279202277, + "grad_norm": 0.6804848909378052, + "learning_rate": 8.234424562494205e-05, + "loss": 0.7047, + "step": 12514 + }, + { + "epoch": 2.2280982905982905, + "grad_norm": 0.8664964437484741, + "learning_rate": 8.233046822159276e-05, + "loss": 1.0255, + "step": 12515 + }, + { + "epoch": 2.228276353276353, + "grad_norm": 0.836857795715332, + "learning_rate": 8.231669116444128e-05, + "loss": 0.9818, + "step": 12516 + }, + { + "epoch": 2.228454415954416, + "grad_norm": 0.6999024748802185, + "learning_rate": 8.230291445375744e-05, + "loss": 0.7298, + "step": 12517 + }, + { + "epoch": 2.2286324786324787, + "grad_norm": 0.8676811456680298, + "learning_rate": 8.228913808981127e-05, + "loss": 0.9592, + "step": 12518 + }, + { + "epoch": 2.2288105413105415, + "grad_norm": 0.8088808655738831, + "learning_rate": 8.227536207287263e-05, + "loss": 1.0021, + 
"step": 12519 + }, + { + "epoch": 2.228988603988604, + "grad_norm": 0.7802120447158813, + "learning_rate": 8.226158640321149e-05, + "loss": 0.8519, + "step": 12520 + }, + { + "epoch": 2.2291666666666665, + "grad_norm": 0.7560334801673889, + "learning_rate": 8.224781108109766e-05, + "loss": 0.7676, + "step": 12521 + }, + { + "epoch": 2.2293447293447293, + "grad_norm": 0.7806954383850098, + "learning_rate": 8.223403610680113e-05, + "loss": 0.9151, + "step": 12522 + }, + { + "epoch": 2.229522792022792, + "grad_norm": 0.7972870469093323, + "learning_rate": 8.222026148059173e-05, + "loss": 0.8785, + "step": 12523 + }, + { + "epoch": 2.2297008547008548, + "grad_norm": 0.7868863344192505, + "learning_rate": 8.220648720273941e-05, + "loss": 0.8981, + "step": 12524 + }, + { + "epoch": 2.2298789173789175, + "grad_norm": 0.7388648390769958, + "learning_rate": 8.219271327351397e-05, + "loss": 0.7361, + "step": 12525 + }, + { + "epoch": 2.23005698005698, + "grad_norm": 0.7367138862609863, + "learning_rate": 8.217893969318538e-05, + "loss": 0.7357, + "step": 12526 + }, + { + "epoch": 2.2302350427350426, + "grad_norm": 0.8345077037811279, + "learning_rate": 8.216516646202339e-05, + "loss": 0.9671, + "step": 12527 + }, + { + "epoch": 2.2304131054131053, + "grad_norm": 0.7875744104385376, + "learning_rate": 8.215139358029793e-05, + "loss": 0.7991, + "step": 12528 + }, + { + "epoch": 2.230591168091168, + "grad_norm": 0.7444638609886169, + "learning_rate": 8.213762104827882e-05, + "loss": 0.6524, + "step": 12529 + }, + { + "epoch": 2.230769230769231, + "grad_norm": 0.6670697927474976, + "learning_rate": 8.212384886623597e-05, + "loss": 0.639, + "step": 12530 + }, + { + "epoch": 2.2309472934472936, + "grad_norm": 0.8348705172538757, + "learning_rate": 8.211007703443913e-05, + "loss": 0.8904, + "step": 12531 + }, + { + "epoch": 2.2311253561253563, + "grad_norm": 0.8458212614059448, + "learning_rate": 8.209630555315817e-05, + "loss": 0.8398, + "step": 12532 + }, + { + "epoch": 
2.2313034188034186, + "grad_norm": 0.9043961763381958, + "learning_rate": 8.20825344226629e-05, + "loss": 0.904, + "step": 12533 + }, + { + "epoch": 2.2314814814814814, + "grad_norm": 0.8207734227180481, + "learning_rate": 8.206876364322319e-05, + "loss": 0.853, + "step": 12534 + }, + { + "epoch": 2.231659544159544, + "grad_norm": 0.9311240911483765, + "learning_rate": 8.205499321510876e-05, + "loss": 0.9807, + "step": 12535 + }, + { + "epoch": 2.231837606837607, + "grad_norm": 0.8379791378974915, + "learning_rate": 8.204122313858946e-05, + "loss": 0.8318, + "step": 12536 + }, + { + "epoch": 2.2320156695156697, + "grad_norm": 0.8078454732894897, + "learning_rate": 8.202745341393515e-05, + "loss": 0.8692, + "step": 12537 + }, + { + "epoch": 2.232193732193732, + "grad_norm": 0.7555927038192749, + "learning_rate": 8.201368404141547e-05, + "loss": 0.8514, + "step": 12538 + }, + { + "epoch": 2.2323717948717947, + "grad_norm": 0.7724241018295288, + "learning_rate": 8.199991502130035e-05, + "loss": 0.5758, + "step": 12539 + }, + { + "epoch": 2.2325498575498575, + "grad_norm": 0.7388870120048523, + "learning_rate": 8.198614635385946e-05, + "loss": 0.6265, + "step": 12540 + }, + { + "epoch": 2.23272792022792, + "grad_norm": 0.9006723761558533, + "learning_rate": 8.197237803936267e-05, + "loss": 0.8238, + "step": 12541 + }, + { + "epoch": 2.232905982905983, + "grad_norm": 0.917884349822998, + "learning_rate": 8.195861007807962e-05, + "loss": 0.9447, + "step": 12542 + }, + { + "epoch": 2.2330840455840457, + "grad_norm": 0.81849205493927, + "learning_rate": 8.194484247028016e-05, + "loss": 0.9071, + "step": 12543 + }, + { + "epoch": 2.2332621082621085, + "grad_norm": 0.8572089076042175, + "learning_rate": 8.193107521623398e-05, + "loss": 0.9068, + "step": 12544 + }, + { + "epoch": 2.2334401709401708, + "grad_norm": 0.7870976328849792, + "learning_rate": 8.19173083162109e-05, + "loss": 0.7595, + "step": 12545 + }, + { + "epoch": 2.2336182336182335, + "grad_norm": 
0.8728759288787842, + "learning_rate": 8.190354177048055e-05, + "loss": 1.0974, + "step": 12546 + }, + { + "epoch": 2.2337962962962963, + "grad_norm": 0.7679606080055237, + "learning_rate": 8.188977557931274e-05, + "loss": 0.7068, + "step": 12547 + }, + { + "epoch": 2.233974358974359, + "grad_norm": 0.7753520011901855, + "learning_rate": 8.187600974297714e-05, + "loss": 0.8008, + "step": 12548 + }, + { + "epoch": 2.234152421652422, + "grad_norm": 0.7785305976867676, + "learning_rate": 8.186224426174348e-05, + "loss": 0.8528, + "step": 12549 + }, + { + "epoch": 2.2343304843304845, + "grad_norm": 0.7762976288795471, + "learning_rate": 8.184847913588145e-05, + "loss": 0.9264, + "step": 12550 + }, + { + "epoch": 2.234508547008547, + "grad_norm": 1.0543726682662964, + "learning_rate": 8.18347143656608e-05, + "loss": 0.9201, + "step": 12551 + }, + { + "epoch": 2.2346866096866096, + "grad_norm": 0.815389096736908, + "learning_rate": 8.182094995135116e-05, + "loss": 0.8834, + "step": 12552 + }, + { + "epoch": 2.2348646723646723, + "grad_norm": 0.774773895740509, + "learning_rate": 8.180718589322225e-05, + "loss": 0.8864, + "step": 12553 + }, + { + "epoch": 2.235042735042735, + "grad_norm": 0.8139658570289612, + "learning_rate": 8.179342219154372e-05, + "loss": 0.8696, + "step": 12554 + }, + { + "epoch": 2.235220797720798, + "grad_norm": 0.7804924249649048, + "learning_rate": 8.177965884658527e-05, + "loss": 0.8854, + "step": 12555 + }, + { + "epoch": 2.2353988603988606, + "grad_norm": 0.8601226210594177, + "learning_rate": 8.176589585861659e-05, + "loss": 0.9115, + "step": 12556 + }, + { + "epoch": 2.235576923076923, + "grad_norm": 0.7518162727355957, + "learning_rate": 8.175213322790726e-05, + "loss": 0.7871, + "step": 12557 + }, + { + "epoch": 2.2357549857549857, + "grad_norm": 0.7595868110656738, + "learning_rate": 8.1738370954727e-05, + "loss": 0.7597, + "step": 12558 + }, + { + "epoch": 2.2359330484330484, + "grad_norm": 0.8191643357276917, + "learning_rate": 
8.17246090393454e-05, + "loss": 0.9443, + "step": 12559 + }, + { + "epoch": 2.236111111111111, + "grad_norm": 0.7854904532432556, + "learning_rate": 8.171084748203217e-05, + "loss": 0.8547, + "step": 12560 + }, + { + "epoch": 2.236289173789174, + "grad_norm": 0.8610023260116577, + "learning_rate": 8.169708628305684e-05, + "loss": 0.7846, + "step": 12561 + }, + { + "epoch": 2.2364672364672367, + "grad_norm": 0.8254715204238892, + "learning_rate": 8.168332544268914e-05, + "loss": 0.7493, + "step": 12562 + }, + { + "epoch": 2.236645299145299, + "grad_norm": 0.8390897512435913, + "learning_rate": 8.166956496119857e-05, + "loss": 0.9867, + "step": 12563 + }, + { + "epoch": 2.2368233618233617, + "grad_norm": 0.8179677128791809, + "learning_rate": 8.165580483885483e-05, + "loss": 0.8039, + "step": 12564 + }, + { + "epoch": 2.2370014245014245, + "grad_norm": 0.6722155809402466, + "learning_rate": 8.164204507592745e-05, + "loss": 0.695, + "step": 12565 + }, + { + "epoch": 2.2371794871794872, + "grad_norm": 0.8228170871734619, + "learning_rate": 8.162828567268612e-05, + "loss": 1.0414, + "step": 12566 + }, + { + "epoch": 2.23735754985755, + "grad_norm": 0.8676900267601013, + "learning_rate": 8.161452662940032e-05, + "loss": 1.0157, + "step": 12567 + }, + { + "epoch": 2.2375356125356127, + "grad_norm": 0.8174694180488586, + "learning_rate": 8.16007679463397e-05, + "loss": 0.691, + "step": 12568 + }, + { + "epoch": 2.237713675213675, + "grad_norm": 0.8137148022651672, + "learning_rate": 8.158700962377379e-05, + "loss": 1.0022, + "step": 12569 + }, + { + "epoch": 2.237891737891738, + "grad_norm": 0.970250129699707, + "learning_rate": 8.157325166197221e-05, + "loss": 0.7946, + "step": 12570 + }, + { + "epoch": 2.2380698005698005, + "grad_norm": 0.7366915941238403, + "learning_rate": 8.155949406120446e-05, + "loss": 0.9039, + "step": 12571 + }, + { + "epoch": 2.2382478632478633, + "grad_norm": 0.878358781337738, + "learning_rate": 8.154573682174014e-05, + "loss": 0.8172, + 
"step": 12572 + }, + { + "epoch": 2.238425925925926, + "grad_norm": 0.7552989721298218, + "learning_rate": 8.153197994384875e-05, + "loss": 0.9955, + "step": 12573 + }, + { + "epoch": 2.238603988603989, + "grad_norm": 0.8198257684707642, + "learning_rate": 8.151822342779985e-05, + "loss": 0.8677, + "step": 12574 + }, + { + "epoch": 2.238782051282051, + "grad_norm": 0.9128977656364441, + "learning_rate": 8.150446727386297e-05, + "loss": 0.9531, + "step": 12575 + }, + { + "epoch": 2.238960113960114, + "grad_norm": 0.867671549320221, + "learning_rate": 8.149071148230762e-05, + "loss": 1.0226, + "step": 12576 + }, + { + "epoch": 2.2391381766381766, + "grad_norm": 0.8640758395195007, + "learning_rate": 8.147695605340337e-05, + "loss": 1.1284, + "step": 12577 + }, + { + "epoch": 2.2393162393162394, + "grad_norm": 0.7453210353851318, + "learning_rate": 8.146320098741964e-05, + "loss": 0.7812, + "step": 12578 + }, + { + "epoch": 2.239494301994302, + "grad_norm": 0.9207521080970764, + "learning_rate": 8.144944628462602e-05, + "loss": 0.9955, + "step": 12579 + }, + { + "epoch": 2.239672364672365, + "grad_norm": 0.751732349395752, + "learning_rate": 8.143569194529193e-05, + "loss": 0.7858, + "step": 12580 + }, + { + "epoch": 2.239850427350427, + "grad_norm": 0.7955539226531982, + "learning_rate": 8.142193796968694e-05, + "loss": 0.8482, + "step": 12581 + }, + { + "epoch": 2.24002849002849, + "grad_norm": 0.8020164370536804, + "learning_rate": 8.140818435808043e-05, + "loss": 0.8069, + "step": 12582 + }, + { + "epoch": 2.2402065527065527, + "grad_norm": 0.7460235357284546, + "learning_rate": 8.139443111074198e-05, + "loss": 0.6478, + "step": 12583 + }, + { + "epoch": 2.2403846153846154, + "grad_norm": 0.7504379153251648, + "learning_rate": 8.138067822794096e-05, + "loss": 0.726, + "step": 12584 + }, + { + "epoch": 2.240562678062678, + "grad_norm": 0.8214267492294312, + "learning_rate": 8.136692570994688e-05, + "loss": 1.0114, + "step": 12585 + }, + { + "epoch": 
2.240740740740741, + "grad_norm": 0.9436941742897034, + "learning_rate": 8.135317355702917e-05, + "loss": 0.873, + "step": 12586 + }, + { + "epoch": 2.2409188034188032, + "grad_norm": 0.7541804909706116, + "learning_rate": 8.133942176945733e-05, + "loss": 0.8013, + "step": 12587 + }, + { + "epoch": 2.241096866096866, + "grad_norm": 0.8725557327270508, + "learning_rate": 8.132567034750073e-05, + "loss": 0.8506, + "step": 12588 + }, + { + "epoch": 2.2412749287749287, + "grad_norm": 0.7766169905662537, + "learning_rate": 8.131191929142882e-05, + "loss": 0.9076, + "step": 12589 + }, + { + "epoch": 2.2414529914529915, + "grad_norm": 0.8852736353874207, + "learning_rate": 8.129816860151104e-05, + "loss": 0.9278, + "step": 12590 + }, + { + "epoch": 2.2416310541310542, + "grad_norm": 0.6939527988433838, + "learning_rate": 8.128441827801681e-05, + "loss": 0.753, + "step": 12591 + }, + { + "epoch": 2.241809116809117, + "grad_norm": 0.8932832479476929, + "learning_rate": 8.127066832121551e-05, + "loss": 0.8089, + "step": 12592 + }, + { + "epoch": 2.2419871794871793, + "grad_norm": 0.7399743795394897, + "learning_rate": 8.125691873137656e-05, + "loss": 0.6905, + "step": 12593 + }, + { + "epoch": 2.242165242165242, + "grad_norm": 0.7664098143577576, + "learning_rate": 8.124316950876933e-05, + "loss": 0.8698, + "step": 12594 + }, + { + "epoch": 2.242343304843305, + "grad_norm": 0.8222574591636658, + "learning_rate": 8.122942065366323e-05, + "loss": 0.8922, + "step": 12595 + }, + { + "epoch": 2.2425213675213675, + "grad_norm": 0.8072433471679688, + "learning_rate": 8.121567216632771e-05, + "loss": 0.7613, + "step": 12596 + }, + { + "epoch": 2.2426994301994303, + "grad_norm": 0.7647300362586975, + "learning_rate": 8.120192404703199e-05, + "loss": 0.8736, + "step": 12597 + }, + { + "epoch": 2.242877492877493, + "grad_norm": 0.7536396980285645, + "learning_rate": 8.118817629604559e-05, + "loss": 0.7697, + "step": 12598 + }, + { + "epoch": 2.2430555555555554, + "grad_norm": 
0.7295291423797607, + "learning_rate": 8.117442891363774e-05, + "loss": 0.8477, + "step": 12599 + }, + { + "epoch": 2.243233618233618, + "grad_norm": 0.7677894830703735, + "learning_rate": 8.116068190007787e-05, + "loss": 0.8113, + "step": 12600 + }, + { + "epoch": 2.243411680911681, + "grad_norm": 0.825614869594574, + "learning_rate": 8.114693525563529e-05, + "loss": 0.915, + "step": 12601 + }, + { + "epoch": 2.2435897435897436, + "grad_norm": 0.7841798663139343, + "learning_rate": 8.113318898057939e-05, + "loss": 0.7028, + "step": 12602 + }, + { + "epoch": 2.2437678062678064, + "grad_norm": 1.085337519645691, + "learning_rate": 8.111944307517942e-05, + "loss": 0.8354, + "step": 12603 + }, + { + "epoch": 2.243945868945869, + "grad_norm": 0.7831527590751648, + "learning_rate": 8.110569753970475e-05, + "loss": 1.0275, + "step": 12604 + }, + { + "epoch": 2.2441239316239314, + "grad_norm": 0.800504744052887, + "learning_rate": 8.109195237442467e-05, + "loss": 0.7006, + "step": 12605 + }, + { + "epoch": 2.244301994301994, + "grad_norm": 0.8189738392829895, + "learning_rate": 8.107820757960856e-05, + "loss": 0.8036, + "step": 12606 + }, + { + "epoch": 2.244480056980057, + "grad_norm": 0.8892425298690796, + "learning_rate": 8.106446315552562e-05, + "loss": 0.8274, + "step": 12607 + }, + { + "epoch": 2.2446581196581197, + "grad_norm": 0.8144643306732178, + "learning_rate": 8.105071910244521e-05, + "loss": 1.0648, + "step": 12608 + }, + { + "epoch": 2.2448361823361824, + "grad_norm": 0.914513111114502, + "learning_rate": 8.103697542063657e-05, + "loss": 0.8999, + "step": 12609 + }, + { + "epoch": 2.245014245014245, + "grad_norm": 0.8273763656616211, + "learning_rate": 8.102323211036904e-05, + "loss": 0.8554, + "step": 12610 + }, + { + "epoch": 2.2451923076923075, + "grad_norm": 0.9459149837493896, + "learning_rate": 8.100948917191181e-05, + "loss": 1.2345, + "step": 12611 + }, + { + "epoch": 2.2453703703703702, + "grad_norm": 0.8377025723457336, + "learning_rate": 
8.099574660553425e-05, + "loss": 0.8096, + "step": 12612 + }, + { + "epoch": 2.245548433048433, + "grad_norm": 0.8639607429504395, + "learning_rate": 8.098200441150551e-05, + "loss": 0.8238, + "step": 12613 + }, + { + "epoch": 2.2457264957264957, + "grad_norm": 1.0107637643814087, + "learning_rate": 8.09682625900949e-05, + "loss": 0.8747, + "step": 12614 + }, + { + "epoch": 2.2459045584045585, + "grad_norm": 0.8153043985366821, + "learning_rate": 8.095452114157164e-05, + "loss": 0.9357, + "step": 12615 + }, + { + "epoch": 2.2460826210826212, + "grad_norm": 0.8948562741279602, + "learning_rate": 8.094078006620497e-05, + "loss": 0.8245, + "step": 12616 + }, + { + "epoch": 2.246260683760684, + "grad_norm": 0.7983259558677673, + "learning_rate": 8.092703936426416e-05, + "loss": 0.8936, + "step": 12617 + }, + { + "epoch": 2.2464387464387463, + "grad_norm": 0.9016979336738586, + "learning_rate": 8.091329903601835e-05, + "loss": 1.0685, + "step": 12618 + }, + { + "epoch": 2.246616809116809, + "grad_norm": 0.7192493677139282, + "learning_rate": 8.089955908173685e-05, + "loss": 0.8622, + "step": 12619 + }, + { + "epoch": 2.246794871794872, + "grad_norm": 0.78288334608078, + "learning_rate": 8.088581950168877e-05, + "loss": 0.7874, + "step": 12620 + }, + { + "epoch": 2.2469729344729346, + "grad_norm": 0.8438683152198792, + "learning_rate": 8.087208029614336e-05, + "loss": 0.9262, + "step": 12621 + }, + { + "epoch": 2.2471509971509973, + "grad_norm": 0.8384907245635986, + "learning_rate": 8.085834146536978e-05, + "loss": 0.9069, + "step": 12622 + }, + { + "epoch": 2.2473290598290596, + "grad_norm": 0.8209545016288757, + "learning_rate": 8.084460300963729e-05, + "loss": 0.9457, + "step": 12623 + }, + { + "epoch": 2.2475071225071224, + "grad_norm": 0.8220782279968262, + "learning_rate": 8.083086492921496e-05, + "loss": 0.9224, + "step": 12624 + }, + { + "epoch": 2.247685185185185, + "grad_norm": 0.8927256464958191, + "learning_rate": 8.081712722437204e-05, + "loss": 0.7091, + 
"step": 12625 + }, + { + "epoch": 2.247863247863248, + "grad_norm": 0.8878564238548279, + "learning_rate": 8.080338989537764e-05, + "loss": 0.8879, + "step": 12626 + }, + { + "epoch": 2.2480413105413106, + "grad_norm": 0.8380948305130005, + "learning_rate": 8.078965294250097e-05, + "loss": 0.8504, + "step": 12627 + }, + { + "epoch": 2.2482193732193734, + "grad_norm": 0.8005350828170776, + "learning_rate": 8.07759163660111e-05, + "loss": 1.2119, + "step": 12628 + }, + { + "epoch": 2.248397435897436, + "grad_norm": 0.7990152835845947, + "learning_rate": 8.076218016617726e-05, + "loss": 0.72, + "step": 12629 + }, + { + "epoch": 2.2485754985754984, + "grad_norm": 0.9264963269233704, + "learning_rate": 8.07484443432685e-05, + "loss": 0.8398, + "step": 12630 + }, + { + "epoch": 2.248753561253561, + "grad_norm": 0.9103235602378845, + "learning_rate": 8.073470889755402e-05, + "loss": 0.9122, + "step": 12631 + }, + { + "epoch": 2.248931623931624, + "grad_norm": 0.8042106032371521, + "learning_rate": 8.072097382930285e-05, + "loss": 0.8065, + "step": 12632 + }, + { + "epoch": 2.2491096866096867, + "grad_norm": 0.8464857935905457, + "learning_rate": 8.070723913878421e-05, + "loss": 0.9117, + "step": 12633 + }, + { + "epoch": 2.2492877492877494, + "grad_norm": 0.7476474642753601, + "learning_rate": 8.06935048262671e-05, + "loss": 1.0252, + "step": 12634 + }, + { + "epoch": 2.2494658119658117, + "grad_norm": 0.8098256587982178, + "learning_rate": 8.067977089202065e-05, + "loss": 0.911, + "step": 12635 + }, + { + "epoch": 2.2496438746438745, + "grad_norm": 0.9311509728431702, + "learning_rate": 8.066603733631398e-05, + "loss": 0.9594, + "step": 12636 + }, + { + "epoch": 2.2496438746438745, + "eval_loss": 1.1335573196411133, + "eval_runtime": 24.2688, + "eval_samples_per_second": 42.895, + "eval_steps_per_second": 21.468, + "step": 12636 + }, + { + "epoch": 2.2498219373219372, + "grad_norm": 0.7744980454444885, + "learning_rate": 8.065230415941612e-05, + "loss": 0.8983, + "step": 
12637 + }, + { + "epoch": 2.25, + "grad_norm": 0.9464056491851807, + "learning_rate": 8.06385713615962e-05, + "loss": 0.8646, + "step": 12638 + }, + { + "epoch": 2.2501780626780628, + "grad_norm": 0.8263896107673645, + "learning_rate": 8.062483894312323e-05, + "loss": 0.8557, + "step": 12639 + }, + { + "epoch": 2.2503561253561255, + "grad_norm": 0.8827885389328003, + "learning_rate": 8.06111069042663e-05, + "loss": 0.7632, + "step": 12640 + }, + { + "epoch": 2.2505341880341883, + "grad_norm": 0.8537881374359131, + "learning_rate": 8.059737524529443e-05, + "loss": 0.8004, + "step": 12641 + }, + { + "epoch": 2.2507122507122506, + "grad_norm": 0.8397842049598694, + "learning_rate": 8.058364396647674e-05, + "loss": 0.9487, + "step": 12642 + }, + { + "epoch": 2.2508903133903133, + "grad_norm": 1.071976661682129, + "learning_rate": 8.056991306808217e-05, + "loss": 1.0699, + "step": 12643 + }, + { + "epoch": 2.251068376068376, + "grad_norm": 0.8712023496627808, + "learning_rate": 8.055618255037983e-05, + "loss": 0.6518, + "step": 12644 + }, + { + "epoch": 2.251246438746439, + "grad_norm": 0.7885438799858093, + "learning_rate": 8.054245241363866e-05, + "loss": 0.8458, + "step": 12645 + }, + { + "epoch": 2.2514245014245016, + "grad_norm": 0.947169840335846, + "learning_rate": 8.052872265812774e-05, + "loss": 0.6631, + "step": 12646 + }, + { + "epoch": 2.251602564102564, + "grad_norm": 0.8554182052612305, + "learning_rate": 8.051499328411603e-05, + "loss": 0.8945, + "step": 12647 + }, + { + "epoch": 2.2517806267806266, + "grad_norm": 0.8081278800964355, + "learning_rate": 8.050126429187259e-05, + "loss": 0.8969, + "step": 12648 + }, + { + "epoch": 2.2519586894586894, + "grad_norm": 0.7826179265975952, + "learning_rate": 8.048753568166633e-05, + "loss": 0.6965, + "step": 12649 + }, + { + "epoch": 2.252136752136752, + "grad_norm": 0.9688517451286316, + "learning_rate": 8.04738074537663e-05, + "loss": 1.0044, + "step": 12650 + }, + { + "epoch": 2.252314814814815, + "grad_norm": 
0.7780970931053162, + "learning_rate": 8.04600796084414e-05, + "loss": 0.8712, + "step": 12651 + }, + { + "epoch": 2.2524928774928776, + "grad_norm": 0.8360016942024231, + "learning_rate": 8.044635214596073e-05, + "loss": 0.9522, + "step": 12652 + }, + { + "epoch": 2.2526709401709404, + "grad_norm": 0.8137710094451904, + "learning_rate": 8.043262506659311e-05, + "loss": 0.7953, + "step": 12653 + }, + { + "epoch": 2.2528490028490027, + "grad_norm": 0.8394312858581543, + "learning_rate": 8.041889837060755e-05, + "loss": 0.77, + "step": 12654 + }, + { + "epoch": 2.2530270655270654, + "grad_norm": 0.7245169878005981, + "learning_rate": 8.040517205827307e-05, + "loss": 0.7657, + "step": 12655 + }, + { + "epoch": 2.253205128205128, + "grad_norm": 0.8018792271614075, + "learning_rate": 8.039144612985846e-05, + "loss": 0.8974, + "step": 12656 + }, + { + "epoch": 2.253383190883191, + "grad_norm": 0.8204617500305176, + "learning_rate": 8.037772058563278e-05, + "loss": 0.8635, + "step": 12657 + }, + { + "epoch": 2.2535612535612537, + "grad_norm": 0.906288743019104, + "learning_rate": 8.036399542586485e-05, + "loss": 1.0498, + "step": 12658 + }, + { + "epoch": 2.253739316239316, + "grad_norm": 0.8674196600914001, + "learning_rate": 8.035027065082371e-05, + "loss": 0.8621, + "step": 12659 + }, + { + "epoch": 2.2539173789173788, + "grad_norm": 0.8112890124320984, + "learning_rate": 8.033654626077816e-05, + "loss": 0.9937, + "step": 12660 + }, + { + "epoch": 2.2540954415954415, + "grad_norm": 0.8072839975357056, + "learning_rate": 8.032282225599714e-05, + "loss": 0.8555, + "step": 12661 + }, + { + "epoch": 2.2542735042735043, + "grad_norm": 0.7853979468345642, + "learning_rate": 8.030909863674952e-05, + "loss": 0.8698, + "step": 12662 + }, + { + "epoch": 2.254451566951567, + "grad_norm": 0.7456761598587036, + "learning_rate": 8.029537540330426e-05, + "loss": 0.6214, + "step": 12663 + }, + { + "epoch": 2.2546296296296298, + "grad_norm": 0.7207663059234619, + "learning_rate": 
8.028165255593015e-05, + "loss": 0.7641, + "step": 12664 + }, + { + "epoch": 2.2548076923076925, + "grad_norm": 0.6541373133659363, + "learning_rate": 8.02679300948961e-05, + "loss": 0.7438, + "step": 12665 + }, + { + "epoch": 2.254985754985755, + "grad_norm": 0.7535310983657837, + "learning_rate": 8.025420802047096e-05, + "loss": 0.9417, + "step": 12666 + }, + { + "epoch": 2.2551638176638176, + "grad_norm": 0.88471919298172, + "learning_rate": 8.024048633292364e-05, + "loss": 0.9122, + "step": 12667 + }, + { + "epoch": 2.2553418803418803, + "grad_norm": 0.8621570467948914, + "learning_rate": 8.02267650325229e-05, + "loss": 0.7026, + "step": 12668 + }, + { + "epoch": 2.255519943019943, + "grad_norm": 0.8574202060699463, + "learning_rate": 8.021304411953767e-05, + "loss": 0.8997, + "step": 12669 + }, + { + "epoch": 2.255698005698006, + "grad_norm": 0.8038806915283203, + "learning_rate": 8.019932359423667e-05, + "loss": 0.9386, + "step": 12670 + }, + { + "epoch": 2.255876068376068, + "grad_norm": 0.7760711908340454, + "learning_rate": 8.018560345688883e-05, + "loss": 0.7777, + "step": 12671 + }, + { + "epoch": 2.256054131054131, + "grad_norm": 0.7433146834373474, + "learning_rate": 8.017188370776292e-05, + "loss": 0.7245, + "step": 12672 + }, + { + "epoch": 2.2562321937321936, + "grad_norm": 0.8710882067680359, + "learning_rate": 8.01581643471278e-05, + "loss": 0.9478, + "step": 12673 + }, + { + "epoch": 2.2564102564102564, + "grad_norm": 0.7726423740386963, + "learning_rate": 8.014444537525218e-05, + "loss": 0.8388, + "step": 12674 + }, + { + "epoch": 2.256588319088319, + "grad_norm": 0.7967063188552856, + "learning_rate": 8.01307267924049e-05, + "loss": 0.8226, + "step": 12675 + }, + { + "epoch": 2.256766381766382, + "grad_norm": 0.7524598836898804, + "learning_rate": 8.011700859885479e-05, + "loss": 0.7285, + "step": 12676 + }, + { + "epoch": 2.2569444444444446, + "grad_norm": 0.808729887008667, + "learning_rate": 8.010329079487055e-05, + "loss": 0.7498, + "step": 
12677 + }, + { + "epoch": 2.257122507122507, + "grad_norm": 0.7842788100242615, + "learning_rate": 8.008957338072106e-05, + "loss": 0.9216, + "step": 12678 + }, + { + "epoch": 2.2573005698005697, + "grad_norm": 0.8905709981918335, + "learning_rate": 8.007585635667497e-05, + "loss": 0.9254, + "step": 12679 + }, + { + "epoch": 2.2574786324786325, + "grad_norm": 0.7495295405387878, + "learning_rate": 8.006213972300112e-05, + "loss": 0.8407, + "step": 12680 + }, + { + "epoch": 2.257656695156695, + "grad_norm": 0.7425774335861206, + "learning_rate": 8.004842347996819e-05, + "loss": 0.7893, + "step": 12681 + }, + { + "epoch": 2.257834757834758, + "grad_norm": 0.8028583526611328, + "learning_rate": 8.003470762784498e-05, + "loss": 0.8106, + "step": 12682 + }, + { + "epoch": 2.2580128205128207, + "grad_norm": 0.8874917030334473, + "learning_rate": 8.002099216690017e-05, + "loss": 0.97, + "step": 12683 + }, + { + "epoch": 2.258190883190883, + "grad_norm": 0.8830558061599731, + "learning_rate": 8.000727709740257e-05, + "loss": 1.028, + "step": 12684 + }, + { + "epoch": 2.2583689458689458, + "grad_norm": 0.8720272779464722, + "learning_rate": 7.99935624196208e-05, + "loss": 0.9401, + "step": 12685 + }, + { + "epoch": 2.2585470085470085, + "grad_norm": 0.736709713935852, + "learning_rate": 7.997984813382362e-05, + "loss": 0.8479, + "step": 12686 + }, + { + "epoch": 2.2587250712250713, + "grad_norm": 0.8028469085693359, + "learning_rate": 7.996613424027973e-05, + "loss": 0.9291, + "step": 12687 + }, + { + "epoch": 2.258903133903134, + "grad_norm": 0.777618944644928, + "learning_rate": 7.995242073925784e-05, + "loss": 0.7021, + "step": 12688 + }, + { + "epoch": 2.2590811965811968, + "grad_norm": 0.8371155261993408, + "learning_rate": 7.993870763102659e-05, + "loss": 0.8309, + "step": 12689 + }, + { + "epoch": 2.259259259259259, + "grad_norm": 0.8853654861450195, + "learning_rate": 7.992499491585473e-05, + "loss": 0.762, + "step": 12690 + }, + { + "epoch": 2.259437321937322, + 
"grad_norm": 0.742594301700592, + "learning_rate": 7.991128259401086e-05, + "loss": 0.8025, + "step": 12691 + }, + { + "epoch": 2.2596153846153846, + "grad_norm": 1.0678842067718506, + "learning_rate": 7.989757066576369e-05, + "loss": 0.9127, + "step": 12692 + }, + { + "epoch": 2.2597934472934473, + "grad_norm": 0.7917565703392029, + "learning_rate": 7.988385913138183e-05, + "loss": 0.8078, + "step": 12693 + }, + { + "epoch": 2.25997150997151, + "grad_norm": 0.6907288432121277, + "learning_rate": 7.987014799113397e-05, + "loss": 0.6313, + "step": 12694 + }, + { + "epoch": 2.260149572649573, + "grad_norm": 0.9007455706596375, + "learning_rate": 7.98564372452888e-05, + "loss": 1.0734, + "step": 12695 + }, + { + "epoch": 2.260327635327635, + "grad_norm": 0.7732774615287781, + "learning_rate": 7.984272689411484e-05, + "loss": 0.9925, + "step": 12696 + }, + { + "epoch": 2.260505698005698, + "grad_norm": 0.7470823526382446, + "learning_rate": 7.982901693788082e-05, + "loss": 0.8518, + "step": 12697 + }, + { + "epoch": 2.2606837606837606, + "grad_norm": 0.8018864989280701, + "learning_rate": 7.981530737685526e-05, + "loss": 0.8668, + "step": 12698 + }, + { + "epoch": 2.2608618233618234, + "grad_norm": 0.8459745049476624, + "learning_rate": 7.980159821130688e-05, + "loss": 0.8972, + "step": 12699 + }, + { + "epoch": 2.261039886039886, + "grad_norm": 0.8255595564842224, + "learning_rate": 7.978788944150419e-05, + "loss": 0.9562, + "step": 12700 + }, + { + "epoch": 2.261217948717949, + "grad_norm": 0.8243128061294556, + "learning_rate": 7.977418106771582e-05, + "loss": 0.6634, + "step": 12701 + }, + { + "epoch": 2.261396011396011, + "grad_norm": 0.802949845790863, + "learning_rate": 7.976047309021034e-05, + "loss": 0.8155, + "step": 12702 + }, + { + "epoch": 2.261574074074074, + "grad_norm": 0.8480857014656067, + "learning_rate": 7.97467655092564e-05, + "loss": 0.8568, + "step": 12703 + }, + { + "epoch": 2.2617521367521367, + "grad_norm": 0.8777545690536499, + 
"learning_rate": 7.973305832512247e-05, + "loss": 0.8688, + "step": 12704 + }, + { + "epoch": 2.2619301994301995, + "grad_norm": 0.8334060907363892, + "learning_rate": 7.971935153807719e-05, + "loss": 0.932, + "step": 12705 + }, + { + "epoch": 2.262108262108262, + "grad_norm": 0.836976170539856, + "learning_rate": 7.970564514838907e-05, + "loss": 0.8205, + "step": 12706 + }, + { + "epoch": 2.262286324786325, + "grad_norm": 0.782866895198822, + "learning_rate": 7.969193915632667e-05, + "loss": 0.8362, + "step": 12707 + }, + { + "epoch": 2.2624643874643873, + "grad_norm": 0.9018504619598389, + "learning_rate": 7.967823356215854e-05, + "loss": 0.8354, + "step": 12708 + }, + { + "epoch": 2.26264245014245, + "grad_norm": 0.7974916696548462, + "learning_rate": 7.966452836615324e-05, + "loss": 0.8035, + "step": 12709 + }, + { + "epoch": 2.2628205128205128, + "grad_norm": 0.8745712637901306, + "learning_rate": 7.965082356857922e-05, + "loss": 0.8803, + "step": 12710 + }, + { + "epoch": 2.2629985754985755, + "grad_norm": 0.8667176365852356, + "learning_rate": 7.963711916970505e-05, + "loss": 0.8005, + "step": 12711 + }, + { + "epoch": 2.2631766381766383, + "grad_norm": 0.849998950958252, + "learning_rate": 7.962341516979922e-05, + "loss": 0.8208, + "step": 12712 + }, + { + "epoch": 2.263354700854701, + "grad_norm": 0.803727388381958, + "learning_rate": 7.960971156913028e-05, + "loss": 0.7232, + "step": 12713 + }, + { + "epoch": 2.263532763532764, + "grad_norm": 0.842913031578064, + "learning_rate": 7.959600836796664e-05, + "loss": 0.8182, + "step": 12714 + }, + { + "epoch": 2.263710826210826, + "grad_norm": 0.8191903829574585, + "learning_rate": 7.958230556657684e-05, + "loss": 0.8353, + "step": 12715 + }, + { + "epoch": 2.263888888888889, + "grad_norm": 0.8525017499923706, + "learning_rate": 7.95686031652294e-05, + "loss": 0.9824, + "step": 12716 + }, + { + "epoch": 2.2640669515669516, + "grad_norm": 0.7176641225814819, + "learning_rate": 7.955490116419267e-05, + "loss": 
0.7722, + "step": 12717 + }, + { + "epoch": 2.2642450142450143, + "grad_norm": 0.8740555047988892, + "learning_rate": 7.954119956373521e-05, + "loss": 0.8286, + "step": 12718 + }, + { + "epoch": 2.264423076923077, + "grad_norm": 0.7928949594497681, + "learning_rate": 7.952749836412543e-05, + "loss": 0.9183, + "step": 12719 + }, + { + "epoch": 2.2646011396011394, + "grad_norm": 0.787661612033844, + "learning_rate": 7.951379756563185e-05, + "loss": 0.7741, + "step": 12720 + }, + { + "epoch": 2.264779202279202, + "grad_norm": 0.8369856476783752, + "learning_rate": 7.950009716852277e-05, + "loss": 0.911, + "step": 12721 + }, + { + "epoch": 2.264957264957265, + "grad_norm": 0.7838568687438965, + "learning_rate": 7.948639717306675e-05, + "loss": 0.8532, + "step": 12722 + }, + { + "epoch": 2.2651353276353277, + "grad_norm": 0.8287179470062256, + "learning_rate": 7.947269757953213e-05, + "loss": 0.893, + "step": 12723 + }, + { + "epoch": 2.2653133903133904, + "grad_norm": 0.7754728198051453, + "learning_rate": 7.945899838818741e-05, + "loss": 0.9516, + "step": 12724 + }, + { + "epoch": 2.265491452991453, + "grad_norm": 0.7088906764984131, + "learning_rate": 7.94452995993009e-05, + "loss": 0.6797, + "step": 12725 + }, + { + "epoch": 2.265669515669516, + "grad_norm": 0.8004380464553833, + "learning_rate": 7.94316012131411e-05, + "loss": 0.8583, + "step": 12726 + }, + { + "epoch": 2.265847578347578, + "grad_norm": 0.8221408128738403, + "learning_rate": 7.941790322997629e-05, + "loss": 0.9575, + "step": 12727 + }, + { + "epoch": 2.266025641025641, + "grad_norm": 0.8640061020851135, + "learning_rate": 7.940420565007492e-05, + "loss": 0.9471, + "step": 12728 + }, + { + "epoch": 2.2662037037037037, + "grad_norm": 0.8151915669441223, + "learning_rate": 7.939050847370536e-05, + "loss": 0.7841, + "step": 12729 + }, + { + "epoch": 2.2663817663817665, + "grad_norm": 0.7910612225532532, + "learning_rate": 7.9376811701136e-05, + "loss": 0.8826, + "step": 12730 + }, + { + "epoch": 
2.2665598290598292, + "grad_norm": 0.7158875465393066, + "learning_rate": 7.936311533263514e-05, + "loss": 0.8598, + "step": 12731 + }, + { + "epoch": 2.2667378917378915, + "grad_norm": 0.6968050003051758, + "learning_rate": 7.934941936847119e-05, + "loss": 0.742, + "step": 12732 + }, + { + "epoch": 2.2669159544159543, + "grad_norm": 0.8630516529083252, + "learning_rate": 7.933572380891245e-05, + "loss": 0.789, + "step": 12733 + }, + { + "epoch": 2.267094017094017, + "grad_norm": 0.8060851097106934, + "learning_rate": 7.932202865422726e-05, + "loss": 0.8447, + "step": 12734 + }, + { + "epoch": 2.26727207977208, + "grad_norm": 0.9570813775062561, + "learning_rate": 7.930833390468402e-05, + "loss": 0.8948, + "step": 12735 + }, + { + "epoch": 2.2674501424501425, + "grad_norm": 0.7649935483932495, + "learning_rate": 7.929463956055094e-05, + "loss": 0.905, + "step": 12736 + }, + { + "epoch": 2.2676282051282053, + "grad_norm": 0.7498226165771484, + "learning_rate": 7.928094562209641e-05, + "loss": 0.9168, + "step": 12737 + }, + { + "epoch": 2.267806267806268, + "grad_norm": 0.7915979027748108, + "learning_rate": 7.926725208958869e-05, + "loss": 0.8628, + "step": 12738 + }, + { + "epoch": 2.2679843304843303, + "grad_norm": 0.7620252966880798, + "learning_rate": 7.925355896329615e-05, + "loss": 0.8768, + "step": 12739 + }, + { + "epoch": 2.268162393162393, + "grad_norm": 0.9785344004631042, + "learning_rate": 7.923986624348697e-05, + "loss": 0.9579, + "step": 12740 + }, + { + "epoch": 2.268340455840456, + "grad_norm": 0.9146337509155273, + "learning_rate": 7.922617393042954e-05, + "loss": 1.2241, + "step": 12741 + }, + { + "epoch": 2.2685185185185186, + "grad_norm": 0.7815660238265991, + "learning_rate": 7.921248202439203e-05, + "loss": 0.7367, + "step": 12742 + }, + { + "epoch": 2.2686965811965814, + "grad_norm": 0.8466008305549622, + "learning_rate": 7.919879052564276e-05, + "loss": 0.923, + "step": 12743 + }, + { + "epoch": 2.2688746438746437, + "grad_norm": 
0.742203950881958, + "learning_rate": 7.918509943444998e-05, + "loss": 0.8282, + "step": 12744 + }, + { + "epoch": 2.2690527065527064, + "grad_norm": 0.785446286201477, + "learning_rate": 7.917140875108196e-05, + "loss": 0.7689, + "step": 12745 + }, + { + "epoch": 2.269230769230769, + "grad_norm": 0.912765383720398, + "learning_rate": 7.915771847580689e-05, + "loss": 0.8259, + "step": 12746 + }, + { + "epoch": 2.269408831908832, + "grad_norm": 0.7319221496582031, + "learning_rate": 7.914402860889305e-05, + "loss": 0.8861, + "step": 12747 + }, + { + "epoch": 2.2695868945868947, + "grad_norm": 1.0215193033218384, + "learning_rate": 7.913033915060861e-05, + "loss": 0.8556, + "step": 12748 + }, + { + "epoch": 2.2697649572649574, + "grad_norm": 0.9348630905151367, + "learning_rate": 7.911665010122188e-05, + "loss": 1.0269, + "step": 12749 + }, + { + "epoch": 2.26994301994302, + "grad_norm": 0.7521753311157227, + "learning_rate": 7.910296146100096e-05, + "loss": 0.8492, + "step": 12750 + }, + { + "epoch": 2.2701210826210825, + "grad_norm": 0.7274978756904602, + "learning_rate": 7.908927323021414e-05, + "loss": 0.7029, + "step": 12751 + }, + { + "epoch": 2.2702991452991452, + "grad_norm": 0.8103266954421997, + "learning_rate": 7.907558540912954e-05, + "loss": 0.5268, + "step": 12752 + }, + { + "epoch": 2.270477207977208, + "grad_norm": 0.8645551800727844, + "learning_rate": 7.906189799801538e-05, + "loss": 0.8172, + "step": 12753 + }, + { + "epoch": 2.2706552706552707, + "grad_norm": 0.8652981519699097, + "learning_rate": 7.904821099713984e-05, + "loss": 0.8711, + "step": 12754 + }, + { + "epoch": 2.2708333333333335, + "grad_norm": 0.7020241618156433, + "learning_rate": 7.903452440677106e-05, + "loss": 0.7202, + "step": 12755 + }, + { + "epoch": 2.271011396011396, + "grad_norm": 0.9812583923339844, + "learning_rate": 7.902083822717727e-05, + "loss": 0.8274, + "step": 12756 + }, + { + "epoch": 2.2711894586894585, + "grad_norm": 0.9119269847869873, + "learning_rate": 
7.900715245862655e-05, + "loss": 0.8695, + "step": 12757 + }, + { + "epoch": 2.2713675213675213, + "grad_norm": 0.7336047291755676, + "learning_rate": 7.899346710138706e-05, + "loss": 0.8138, + "step": 12758 + }, + { + "epoch": 2.271545584045584, + "grad_norm": 0.900337278842926, + "learning_rate": 7.897978215572695e-05, + "loss": 0.8346, + "step": 12759 + }, + { + "epoch": 2.271723646723647, + "grad_norm": 0.736018717288971, + "learning_rate": 7.896609762191437e-05, + "loss": 0.7045, + "step": 12760 + }, + { + "epoch": 2.2719017094017095, + "grad_norm": 0.8484935760498047, + "learning_rate": 7.895241350021737e-05, + "loss": 1.05, + "step": 12761 + }, + { + "epoch": 2.2720797720797723, + "grad_norm": 0.8032337427139282, + "learning_rate": 7.893872979090415e-05, + "loss": 0.8024, + "step": 12762 + }, + { + "epoch": 2.2722578347578346, + "grad_norm": 0.8957629203796387, + "learning_rate": 7.892504649424272e-05, + "loss": 0.9593, + "step": 12763 + }, + { + "epoch": 2.2724358974358974, + "grad_norm": 0.9227191805839539, + "learning_rate": 7.891136361050126e-05, + "loss": 0.9978, + "step": 12764 + }, + { + "epoch": 2.27261396011396, + "grad_norm": 0.8649391531944275, + "learning_rate": 7.88976811399478e-05, + "loss": 0.8525, + "step": 12765 + }, + { + "epoch": 2.272792022792023, + "grad_norm": 0.8762859106063843, + "learning_rate": 7.888399908285046e-05, + "loss": 0.9526, + "step": 12766 + }, + { + "epoch": 2.2729700854700856, + "grad_norm": 0.8566350340843201, + "learning_rate": 7.887031743947729e-05, + "loss": 0.7886, + "step": 12767 + }, + { + "epoch": 2.273148148148148, + "grad_norm": 0.9285386800765991, + "learning_rate": 7.885663621009634e-05, + "loss": 1.013, + "step": 12768 + }, + { + "epoch": 2.2733262108262107, + "grad_norm": 0.9326284527778625, + "learning_rate": 7.884295539497569e-05, + "loss": 0.9908, + "step": 12769 + }, + { + "epoch": 2.2735042735042734, + "grad_norm": 0.8035810589790344, + "learning_rate": 7.882927499438341e-05, + "loss": 0.7452, + 
"step": 12770 + }, + { + "epoch": 2.273682336182336, + "grad_norm": 0.831741988658905, + "learning_rate": 7.881559500858747e-05, + "loss": 0.8782, + "step": 12771 + }, + { + "epoch": 2.273860398860399, + "grad_norm": 0.7790034413337708, + "learning_rate": 7.880191543785594e-05, + "loss": 0.9494, + "step": 12772 + }, + { + "epoch": 2.2740384615384617, + "grad_norm": 0.7070899605751038, + "learning_rate": 7.878823628245684e-05, + "loss": 0.7007, + "step": 12773 + }, + { + "epoch": 2.2742165242165244, + "grad_norm": 0.739573061466217, + "learning_rate": 7.877455754265818e-05, + "loss": 0.758, + "step": 12774 + }, + { + "epoch": 2.2743945868945867, + "grad_norm": 1.091391921043396, + "learning_rate": 7.876087921872803e-05, + "loss": 1.1333, + "step": 12775 + }, + { + "epoch": 2.2745726495726495, + "grad_norm": 0.623710036277771, + "learning_rate": 7.874720131093427e-05, + "loss": 0.7068, + "step": 12776 + }, + { + "epoch": 2.2747507122507122, + "grad_norm": 0.7989393472671509, + "learning_rate": 7.8733523819545e-05, + "loss": 0.77, + "step": 12777 + }, + { + "epoch": 2.274928774928775, + "grad_norm": 0.8401352167129517, + "learning_rate": 7.87198467448281e-05, + "loss": 0.8192, + "step": 12778 + }, + { + "epoch": 2.2751068376068377, + "grad_norm": 0.7962843179702759, + "learning_rate": 7.870617008705164e-05, + "loss": 0.8071, + "step": 12779 + }, + { + "epoch": 2.2752849002849, + "grad_norm": 0.9518889784812927, + "learning_rate": 7.869249384648351e-05, + "loss": 0.8956, + "step": 12780 + }, + { + "epoch": 2.275462962962963, + "grad_norm": 0.7469878792762756, + "learning_rate": 7.867881802339175e-05, + "loss": 0.6816, + "step": 12781 + }, + { + "epoch": 2.2756410256410255, + "grad_norm": 0.8888431191444397, + "learning_rate": 7.866514261804421e-05, + "loss": 0.7906, + "step": 12782 + }, + { + "epoch": 2.2758190883190883, + "grad_norm": 0.9856036305427551, + "learning_rate": 7.86514676307089e-05, + "loss": 0.9371, + "step": 12783 + }, + { + "epoch": 2.275997150997151, + 
"grad_norm": 0.9144912958145142, + "learning_rate": 7.863779306165371e-05, + "loss": 0.9613, + "step": 12784 + }, + { + "epoch": 2.276175213675214, + "grad_norm": 0.7898108959197998, + "learning_rate": 7.862411891114665e-05, + "loss": 0.8631, + "step": 12785 + }, + { + "epoch": 2.2763532763532766, + "grad_norm": 0.8524056077003479, + "learning_rate": 7.861044517945552e-05, + "loss": 0.7011, + "step": 12786 + }, + { + "epoch": 2.276531339031339, + "grad_norm": 0.8811307549476624, + "learning_rate": 7.859677186684831e-05, + "loss": 0.8138, + "step": 12787 + }, + { + "epoch": 2.2767094017094016, + "grad_norm": 0.9247646331787109, + "learning_rate": 7.858309897359289e-05, + "loss": 0.8971, + "step": 12788 + }, + { + "epoch": 2.2768874643874644, + "grad_norm": 0.8655884861946106, + "learning_rate": 7.856942649995715e-05, + "loss": 0.885, + "step": 12789 + }, + { + "epoch": 2.277065527065527, + "grad_norm": 0.9330910444259644, + "learning_rate": 7.855575444620897e-05, + "loss": 0.8493, + "step": 12790 + }, + { + "epoch": 2.27724358974359, + "grad_norm": 0.746694028377533, + "learning_rate": 7.854208281261626e-05, + "loss": 0.7169, + "step": 12791 + }, + { + "epoch": 2.277421652421652, + "grad_norm": 0.9785143136978149, + "learning_rate": 7.852841159944685e-05, + "loss": 1.023, + "step": 12792 + }, + { + "epoch": 2.277599715099715, + "grad_norm": 0.6107021570205688, + "learning_rate": 7.851474080696859e-05, + "loss": 0.652, + "step": 12793 + }, + { + "epoch": 2.2777777777777777, + "grad_norm": 0.9269224405288696, + "learning_rate": 7.850107043544937e-05, + "loss": 0.8024, + "step": 12794 + }, + { + "epoch": 2.2779558404558404, + "grad_norm": 0.8488328456878662, + "learning_rate": 7.8487400485157e-05, + "loss": 0.8626, + "step": 12795 + }, + { + "epoch": 2.278133903133903, + "grad_norm": 0.7187852263450623, + "learning_rate": 7.847373095635937e-05, + "loss": 0.7416, + "step": 12796 + }, + { + "epoch": 2.278311965811966, + "grad_norm": 1.00519859790802, + "learning_rate": 
7.846006184932422e-05, + "loss": 1.0577, + "step": 12797 + }, + { + "epoch": 2.2784900284900287, + "grad_norm": 0.8175216913223267, + "learning_rate": 7.844639316431945e-05, + "loss": 0.9685, + "step": 12798 + }, + { + "epoch": 2.278668091168091, + "grad_norm": 0.8239067792892456, + "learning_rate": 7.843272490161281e-05, + "loss": 0.8714, + "step": 12799 + }, + { + "epoch": 2.2788461538461537, + "grad_norm": 0.8089447617530823, + "learning_rate": 7.841905706147212e-05, + "loss": 0.8397, + "step": 12800 + }, + { + "epoch": 2.2790242165242165, + "grad_norm": 0.8505867719650269, + "learning_rate": 7.840538964416518e-05, + "loss": 0.6872, + "step": 12801 + }, + { + "epoch": 2.2792022792022792, + "grad_norm": 0.8512473702430725, + "learning_rate": 7.83917226499598e-05, + "loss": 1.0422, + "step": 12802 + }, + { + "epoch": 2.279380341880342, + "grad_norm": 0.8935198187828064, + "learning_rate": 7.837805607912369e-05, + "loss": 0.9874, + "step": 12803 + }, + { + "epoch": 2.2795584045584047, + "grad_norm": 0.6903477907180786, + "learning_rate": 7.836438993192466e-05, + "loss": 0.7301, + "step": 12804 + }, + { + "epoch": 2.279736467236467, + "grad_norm": 0.7140037417411804, + "learning_rate": 7.835072420863046e-05, + "loss": 0.8323, + "step": 12805 + }, + { + "epoch": 2.27991452991453, + "grad_norm": 0.7974498867988586, + "learning_rate": 7.833705890950888e-05, + "loss": 0.7784, + "step": 12806 + }, + { + "epoch": 2.2800925925925926, + "grad_norm": 0.8191199898719788, + "learning_rate": 7.83233940348276e-05, + "loss": 0.8584, + "step": 12807 + }, + { + "epoch": 2.2802706552706553, + "grad_norm": 0.843112587928772, + "learning_rate": 7.83097295848544e-05, + "loss": 1.043, + "step": 12808 + }, + { + "epoch": 2.280448717948718, + "grad_norm": 0.8029288053512573, + "learning_rate": 7.829606555985698e-05, + "loss": 0.6806, + "step": 12809 + }, + { + "epoch": 2.280626780626781, + "grad_norm": 0.712228536605835, + "learning_rate": 7.828240196010311e-05, + "loss": 0.7748, + 
"step": 12810 + }, + { + "epoch": 2.280804843304843, + "grad_norm": 0.801659882068634, + "learning_rate": 7.82687387858604e-05, + "loss": 0.9374, + "step": 12811 + }, + { + "epoch": 2.280982905982906, + "grad_norm": 0.8457205891609192, + "learning_rate": 7.825507603739666e-05, + "loss": 0.9453, + "step": 12812 + }, + { + "epoch": 2.2811609686609686, + "grad_norm": 0.9129060506820679, + "learning_rate": 7.824141371497948e-05, + "loss": 0.9324, + "step": 12813 + }, + { + "epoch": 2.2813390313390314, + "grad_norm": 0.947914183139801, + "learning_rate": 7.822775181887663e-05, + "loss": 0.8275, + "step": 12814 + }, + { + "epoch": 2.281517094017094, + "grad_norm": 0.7204358577728271, + "learning_rate": 7.821409034935576e-05, + "loss": 0.6538, + "step": 12815 + }, + { + "epoch": 2.281695156695157, + "grad_norm": 0.8021003603935242, + "learning_rate": 7.82004293066845e-05, + "loss": 1.0464, + "step": 12816 + }, + { + "epoch": 2.281873219373219, + "grad_norm": 0.9530314803123474, + "learning_rate": 7.818676869113059e-05, + "loss": 0.8854, + "step": 12817 + }, + { + "epoch": 2.282051282051282, + "grad_norm": 0.7932098507881165, + "learning_rate": 7.81731085029616e-05, + "loss": 0.8219, + "step": 12818 + }, + { + "epoch": 2.2822293447293447, + "grad_norm": 0.7955925464630127, + "learning_rate": 7.815944874244523e-05, + "loss": 0.801, + "step": 12819 + }, + { + "epoch": 2.2824074074074074, + "grad_norm": 0.8490158915519714, + "learning_rate": 7.814578940984907e-05, + "loss": 0.8666, + "step": 12820 + }, + { + "epoch": 2.28258547008547, + "grad_norm": 0.7325232028961182, + "learning_rate": 7.813213050544081e-05, + "loss": 0.9579, + "step": 12821 + }, + { + "epoch": 2.282763532763533, + "grad_norm": 0.9203488230705261, + "learning_rate": 7.811847202948798e-05, + "loss": 1.0581, + "step": 12822 + }, + { + "epoch": 2.2829415954415953, + "grad_norm": 0.8207429647445679, + "learning_rate": 7.810481398225827e-05, + "loss": 0.8613, + "step": 12823 + }, + { + "epoch": 
2.283119658119658, + "grad_norm": 0.872207522392273, + "learning_rate": 7.809115636401921e-05, + "loss": 0.9155, + "step": 12824 + }, + { + "epoch": 2.2832977207977208, + "grad_norm": 0.8032099604606628, + "learning_rate": 7.807749917503845e-05, + "loss": 0.8294, + "step": 12825 + }, + { + "epoch": 2.2834757834757835, + "grad_norm": 0.8824980854988098, + "learning_rate": 7.806384241558354e-05, + "loss": 0.8618, + "step": 12826 + }, + { + "epoch": 2.2836538461538463, + "grad_norm": 0.9057566523551941, + "learning_rate": 7.805018608592212e-05, + "loss": 0.826, + "step": 12827 + }, + { + "epoch": 2.283831908831909, + "grad_norm": 0.8092000484466553, + "learning_rate": 7.803653018632164e-05, + "loss": 0.8091, + "step": 12828 + }, + { + "epoch": 2.2840099715099713, + "grad_norm": 0.8372754454612732, + "learning_rate": 7.802287471704976e-05, + "loss": 1.108, + "step": 12829 + }, + { + "epoch": 2.284188034188034, + "grad_norm": 0.8702181577682495, + "learning_rate": 7.800921967837398e-05, + "loss": 0.9654, + "step": 12830 + }, + { + "epoch": 2.284366096866097, + "grad_norm": 0.9543859958648682, + "learning_rate": 7.79955650705619e-05, + "loss": 0.9268, + "step": 12831 + }, + { + "epoch": 2.2845441595441596, + "grad_norm": 0.7992038726806641, + "learning_rate": 7.798191089388096e-05, + "loss": 0.8299, + "step": 12832 + }, + { + "epoch": 2.2847222222222223, + "grad_norm": 0.8655165433883667, + "learning_rate": 7.796825714859874e-05, + "loss": 0.9656, + "step": 12833 + }, + { + "epoch": 2.284900284900285, + "grad_norm": 0.9013311862945557, + "learning_rate": 7.795460383498281e-05, + "loss": 0.9373, + "step": 12834 + }, + { + "epoch": 2.285078347578348, + "grad_norm": 0.8453806638717651, + "learning_rate": 7.794095095330058e-05, + "loss": 0.7711, + "step": 12835 + }, + { + "epoch": 2.28525641025641, + "grad_norm": 0.8016965985298157, + "learning_rate": 7.792729850381959e-05, + "loss": 0.7492, + "step": 12836 + }, + { + "epoch": 2.285434472934473, + "grad_norm": 
0.7191343307495117, + "learning_rate": 7.791364648680734e-05, + "loss": 0.7541, + "step": 12837 + }, + { + "epoch": 2.2856125356125356, + "grad_norm": 0.8210958242416382, + "learning_rate": 7.789999490253133e-05, + "loss": 0.7448, + "step": 12838 + }, + { + "epoch": 2.2857905982905984, + "grad_norm": 0.904022216796875, + "learning_rate": 7.788634375125898e-05, + "loss": 1.0329, + "step": 12839 + }, + { + "epoch": 2.285968660968661, + "grad_norm": 0.8934714794158936, + "learning_rate": 7.787269303325779e-05, + "loss": 0.8982, + "step": 12840 + }, + { + "epoch": 2.2861467236467234, + "grad_norm": 0.9424307942390442, + "learning_rate": 7.785904274879521e-05, + "loss": 1.0298, + "step": 12841 + }, + { + "epoch": 2.286324786324786, + "grad_norm": 0.7753969430923462, + "learning_rate": 7.784539289813873e-05, + "loss": 0.7811, + "step": 12842 + }, + { + "epoch": 2.286502849002849, + "grad_norm": 0.7851901054382324, + "learning_rate": 7.78317434815557e-05, + "loss": 0.8395, + "step": 12843 + }, + { + "epoch": 2.2866809116809117, + "grad_norm": 0.7734000086784363, + "learning_rate": 7.781809449931365e-05, + "loss": 0.6572, + "step": 12844 + }, + { + "epoch": 2.2868589743589745, + "grad_norm": 0.8322952389717102, + "learning_rate": 7.780444595167992e-05, + "loss": 0.9707, + "step": 12845 + }, + { + "epoch": 2.287037037037037, + "grad_norm": 0.8243176341056824, + "learning_rate": 7.779079783892203e-05, + "loss": 0.8413, + "step": 12846 + }, + { + "epoch": 2.2872150997151, + "grad_norm": 0.8600375056266785, + "learning_rate": 7.777715016130727e-05, + "loss": 0.8471, + "step": 12847 + }, + { + "epoch": 2.2873931623931623, + "grad_norm": 0.9846388101577759, + "learning_rate": 7.776350291910311e-05, + "loss": 1.0187, + "step": 12848 + }, + { + "epoch": 2.287571225071225, + "grad_norm": 0.8445034623146057, + "learning_rate": 7.774985611257688e-05, + "loss": 0.9113, + "step": 12849 + }, + { + "epoch": 2.2877492877492878, + "grad_norm": 0.804595947265625, + "learning_rate": 
7.773620974199604e-05, + "loss": 0.8331, + "step": 12850 + }, + { + "epoch": 2.2879273504273505, + "grad_norm": 0.7600802779197693, + "learning_rate": 7.772256380762789e-05, + "loss": 0.8448, + "step": 12851 + }, + { + "epoch": 2.2881054131054133, + "grad_norm": 0.7406377792358398, + "learning_rate": 7.770891830973984e-05, + "loss": 0.7904, + "step": 12852 + }, + { + "epoch": 2.2882834757834756, + "grad_norm": 0.7294487357139587, + "learning_rate": 7.769527324859924e-05, + "loss": 0.8799, + "step": 12853 + }, + { + "epoch": 2.2884615384615383, + "grad_norm": 0.8864750266075134, + "learning_rate": 7.768162862447342e-05, + "loss": 0.9038, + "step": 12854 + }, + { + "epoch": 2.288639601139601, + "grad_norm": 0.8933553099632263, + "learning_rate": 7.766798443762972e-05, + "loss": 0.929, + "step": 12855 + }, + { + "epoch": 2.288817663817664, + "grad_norm": 0.8065192103385925, + "learning_rate": 7.765434068833545e-05, + "loss": 0.9335, + "step": 12856 + }, + { + "epoch": 2.2889957264957266, + "grad_norm": 0.8644578456878662, + "learning_rate": 7.764069737685802e-05, + "loss": 0.7717, + "step": 12857 + }, + { + "epoch": 2.2891737891737893, + "grad_norm": 0.8957899212837219, + "learning_rate": 7.762705450346462e-05, + "loss": 0.8625, + "step": 12858 + }, + { + "epoch": 2.289351851851852, + "grad_norm": 0.7164827585220337, + "learning_rate": 7.761341206842265e-05, + "loss": 0.8018, + "step": 12859 + }, + { + "epoch": 2.2895299145299144, + "grad_norm": 0.8752971291542053, + "learning_rate": 7.759977007199933e-05, + "loss": 0.8517, + "step": 12860 + }, + { + "epoch": 2.289707977207977, + "grad_norm": 0.8448139429092407, + "learning_rate": 7.758612851446201e-05, + "loss": 1.0449, + "step": 12861 + }, + { + "epoch": 2.28988603988604, + "grad_norm": 0.81675785779953, + "learning_rate": 7.75724873960779e-05, + "loss": 1.0952, + "step": 12862 + }, + { + "epoch": 2.2900641025641026, + "grad_norm": 0.8215656876564026, + "learning_rate": 7.755884671711437e-05, + "loss": 0.8419, + 
"step": 12863 + }, + { + "epoch": 2.2902421652421654, + "grad_norm": 0.8270167708396912, + "learning_rate": 7.754520647783859e-05, + "loss": 0.9065, + "step": 12864 + }, + { + "epoch": 2.2904202279202277, + "grad_norm": 0.8222723603248596, + "learning_rate": 7.753156667851784e-05, + "loss": 0.8536, + "step": 12865 + }, + { + "epoch": 2.2905982905982905, + "grad_norm": 0.8383764028549194, + "learning_rate": 7.751792731941936e-05, + "loss": 0.8829, + "step": 12866 + }, + { + "epoch": 2.290776353276353, + "grad_norm": 0.8115772008895874, + "learning_rate": 7.750428840081043e-05, + "loss": 0.8969, + "step": 12867 + }, + { + "epoch": 2.290954415954416, + "grad_norm": 0.8721897602081299, + "learning_rate": 7.74906499229582e-05, + "loss": 1.031, + "step": 12868 + }, + { + "epoch": 2.2911324786324787, + "grad_norm": 0.6958467364311218, + "learning_rate": 7.747701188612996e-05, + "loss": 0.7528, + "step": 12869 + }, + { + "epoch": 2.2913105413105415, + "grad_norm": 0.8352338671684265, + "learning_rate": 7.746337429059285e-05, + "loss": 0.9297, + "step": 12870 + }, + { + "epoch": 2.291488603988604, + "grad_norm": 0.8407408595085144, + "learning_rate": 7.744973713661411e-05, + "loss": 0.8209, + "step": 12871 + }, + { + "epoch": 2.2916666666666665, + "grad_norm": 0.9509777426719666, + "learning_rate": 7.743610042446092e-05, + "loss": 0.9408, + "step": 12872 + }, + { + "epoch": 2.2918447293447293, + "grad_norm": 0.7913112640380859, + "learning_rate": 7.742246415440048e-05, + "loss": 0.9063, + "step": 12873 + }, + { + "epoch": 2.292022792022792, + "grad_norm": 0.90866619348526, + "learning_rate": 7.740882832669998e-05, + "loss": 1.0178, + "step": 12874 + }, + { + "epoch": 2.2922008547008548, + "grad_norm": 0.5832980871200562, + "learning_rate": 7.739519294162652e-05, + "loss": 0.4138, + "step": 12875 + }, + { + "epoch": 2.2923789173789175, + "grad_norm": 0.717993974685669, + "learning_rate": 7.738155799944732e-05, + "loss": 0.7303, + "step": 12876 + }, + { + "epoch": 
2.29255698005698, + "grad_norm": 0.7821396589279175, + "learning_rate": 7.736792350042948e-05, + "loss": 0.829, + "step": 12877 + }, + { + "epoch": 2.2927350427350426, + "grad_norm": 0.8877809047698975, + "learning_rate": 7.735428944484021e-05, + "loss": 0.8883, + "step": 12878 + }, + { + "epoch": 2.2929131054131053, + "grad_norm": 0.7754776477813721, + "learning_rate": 7.734065583294656e-05, + "loss": 0.807, + "step": 12879 + }, + { + "epoch": 2.293091168091168, + "grad_norm": 0.851157009601593, + "learning_rate": 7.73270226650157e-05, + "loss": 0.8859, + "step": 12880 + }, + { + "epoch": 2.293269230769231, + "grad_norm": 0.7635365128517151, + "learning_rate": 7.731338994131472e-05, + "loss": 0.9796, + "step": 12881 + }, + { + "epoch": 2.2934472934472936, + "grad_norm": 0.8386050462722778, + "learning_rate": 7.729975766211078e-05, + "loss": 0.788, + "step": 12882 + }, + { + "epoch": 2.2936253561253563, + "grad_norm": 0.7092825174331665, + "learning_rate": 7.728612582767088e-05, + "loss": 0.6855, + "step": 12883 + }, + { + "epoch": 2.2938034188034186, + "grad_norm": 0.8651222586631775, + "learning_rate": 7.72724944382622e-05, + "loss": 0.8875, + "step": 12884 + }, + { + "epoch": 2.2939814814814814, + "grad_norm": 0.89743572473526, + "learning_rate": 7.725886349415175e-05, + "loss": 0.9256, + "step": 12885 + }, + { + "epoch": 2.294159544159544, + "grad_norm": 0.8257600665092468, + "learning_rate": 7.724523299560664e-05, + "loss": 0.6703, + "step": 12886 + }, + { + "epoch": 2.294337606837607, + "grad_norm": 0.8133751153945923, + "learning_rate": 7.72316029428939e-05, + "loss": 0.8991, + "step": 12887 + }, + { + "epoch": 2.2945156695156697, + "grad_norm": 0.7874962687492371, + "learning_rate": 7.721797333628065e-05, + "loss": 0.8679, + "step": 12888 + }, + { + "epoch": 2.294693732193732, + "grad_norm": 0.8284404277801514, + "learning_rate": 7.720434417603384e-05, + "loss": 0.873, + "step": 12889 + }, + { + "epoch": 2.2948717948717947, + "grad_norm": 
0.8751698136329651, + "learning_rate": 7.719071546242058e-05, + "loss": 1.0671, + "step": 12890 + }, + { + "epoch": 2.2950498575498575, + "grad_norm": 0.9355120062828064, + "learning_rate": 7.717708719570784e-05, + "loss": 0.93, + "step": 12891 + }, + { + "epoch": 2.29522792022792, + "grad_norm": 0.8643141984939575, + "learning_rate": 7.716345937616267e-05, + "loss": 0.7635, + "step": 12892 + }, + { + "epoch": 2.295405982905983, + "grad_norm": 0.9343852996826172, + "learning_rate": 7.714983200405212e-05, + "loss": 1.0624, + "step": 12893 + }, + { + "epoch": 2.2955840455840457, + "grad_norm": 0.893825352191925, + "learning_rate": 7.71362050796431e-05, + "loss": 1.0843, + "step": 12894 + }, + { + "epoch": 2.2957621082621085, + "grad_norm": 0.920723021030426, + "learning_rate": 7.712257860320269e-05, + "loss": 0.9681, + "step": 12895 + }, + { + "epoch": 2.2959401709401708, + "grad_norm": 0.9275181293487549, + "learning_rate": 7.710895257499778e-05, + "loss": 0.8904, + "step": 12896 + }, + { + "epoch": 2.2961182336182335, + "grad_norm": 0.9343428611755371, + "learning_rate": 7.709532699529543e-05, + "loss": 0.9338, + "step": 12897 + }, + { + "epoch": 2.2962962962962963, + "grad_norm": 0.7457774877548218, + "learning_rate": 7.708170186436252e-05, + "loss": 0.6521, + "step": 12898 + }, + { + "epoch": 2.296474358974359, + "grad_norm": 0.7977834343910217, + "learning_rate": 7.706807718246611e-05, + "loss": 0.887, + "step": 12899 + }, + { + "epoch": 2.296652421652422, + "grad_norm": 0.774459719657898, + "learning_rate": 7.705445294987304e-05, + "loss": 0.914, + "step": 12900 + }, + { + "epoch": 2.296830484330484, + "grad_norm": 0.8464851379394531, + "learning_rate": 7.704082916685034e-05, + "loss": 1.0116, + "step": 12901 + }, + { + "epoch": 2.297008547008547, + "grad_norm": 0.8497290015220642, + "learning_rate": 7.702720583366486e-05, + "loss": 0.9242, + "step": 12902 + }, + { + "epoch": 2.2971866096866096, + "grad_norm": 0.8673670291900635, + "learning_rate": 
7.70135829505836e-05, + "loss": 0.8172, + "step": 12903 + }, + { + "epoch": 2.2973646723646723, + "grad_norm": 0.786389172077179, + "learning_rate": 7.699996051787341e-05, + "loss": 0.6713, + "step": 12904 + }, + { + "epoch": 2.297542735042735, + "grad_norm": 0.8441919088363647, + "learning_rate": 7.698633853580124e-05, + "loss": 0.7835, + "step": 12905 + }, + { + "epoch": 2.297720797720798, + "grad_norm": 0.8806493878364563, + "learning_rate": 7.697271700463392e-05, + "loss": 0.9103, + "step": 12906 + }, + { + "epoch": 2.2978988603988606, + "grad_norm": 0.7418580651283264, + "learning_rate": 7.69590959246384e-05, + "loss": 0.9052, + "step": 12907 + }, + { + "epoch": 2.298076923076923, + "grad_norm": 0.7883853316307068, + "learning_rate": 7.694547529608152e-05, + "loss": 0.7689, + "step": 12908 + }, + { + "epoch": 2.2982549857549857, + "grad_norm": 0.7842690944671631, + "learning_rate": 7.693185511923017e-05, + "loss": 0.9587, + "step": 12909 + }, + { + "epoch": 2.2984330484330484, + "grad_norm": 0.884484052658081, + "learning_rate": 7.691823539435119e-05, + "loss": 0.9562, + "step": 12910 + }, + { + "epoch": 2.298611111111111, + "grad_norm": 0.8152852058410645, + "learning_rate": 7.690461612171145e-05, + "loss": 0.9857, + "step": 12911 + }, + { + "epoch": 2.298789173789174, + "grad_norm": 0.8502064943313599, + "learning_rate": 7.689099730157776e-05, + "loss": 0.7806, + "step": 12912 + }, + { + "epoch": 2.298967236467236, + "grad_norm": 0.9655177593231201, + "learning_rate": 7.687737893421697e-05, + "loss": 0.9693, + "step": 12913 + }, + { + "epoch": 2.299145299145299, + "grad_norm": 0.7759003639221191, + "learning_rate": 7.686376101989596e-05, + "loss": 0.9137, + "step": 12914 + }, + { + "epoch": 2.2993233618233617, + "grad_norm": 0.6987054944038391, + "learning_rate": 7.685014355888143e-05, + "loss": 0.9026, + "step": 12915 + }, + { + "epoch": 2.2995014245014245, + "grad_norm": 0.762819230556488, + "learning_rate": 7.683652655144027e-05, + "loss": 0.8358, + 
"step": 12916 + }, + { + "epoch": 2.2996794871794872, + "grad_norm": 0.8233383893966675, + "learning_rate": 7.682290999783924e-05, + "loss": 0.8468, + "step": 12917 + }, + { + "epoch": 2.29985754985755, + "grad_norm": 0.8558689951896667, + "learning_rate": 7.68092938983452e-05, + "loss": 0.9018, + "step": 12918 + }, + { + "epoch": 2.3000356125356127, + "grad_norm": 0.741760790348053, + "learning_rate": 7.67956782532248e-05, + "loss": 0.7968, + "step": 12919 + }, + { + "epoch": 2.300213675213675, + "grad_norm": 0.9132583737373352, + "learning_rate": 7.678206306274495e-05, + "loss": 0.9952, + "step": 12920 + }, + { + "epoch": 2.300391737891738, + "grad_norm": 0.7656551003456116, + "learning_rate": 7.67684483271723e-05, + "loss": 0.8772, + "step": 12921 + }, + { + "epoch": 2.3005698005698005, + "grad_norm": 0.7407111525535583, + "learning_rate": 7.675483404677364e-05, + "loss": 0.8199, + "step": 12922 + }, + { + "epoch": 2.3007478632478633, + "grad_norm": 0.9602083563804626, + "learning_rate": 7.674122022181571e-05, + "loss": 1.0837, + "step": 12923 + }, + { + "epoch": 2.300925925925926, + "grad_norm": 0.7562392354011536, + "learning_rate": 7.672760685256531e-05, + "loss": 0.8148, + "step": 12924 + }, + { + "epoch": 2.301103988603989, + "grad_norm": 0.9260091185569763, + "learning_rate": 7.671399393928906e-05, + "loss": 0.9508, + "step": 12925 + }, + { + "epoch": 2.301282051282051, + "grad_norm": 0.8745924234390259, + "learning_rate": 7.670038148225374e-05, + "loss": 0.8688, + "step": 12926 + }, + { + "epoch": 2.301460113960114, + "grad_norm": 0.7802116274833679, + "learning_rate": 7.668676948172602e-05, + "loss": 0.7698, + "step": 12927 + }, + { + "epoch": 2.3016381766381766, + "grad_norm": 0.7701709866523743, + "learning_rate": 7.667315793797268e-05, + "loss": 0.7633, + "step": 12928 + }, + { + "epoch": 2.3018162393162394, + "grad_norm": 0.8084021806716919, + "learning_rate": 7.66595468512603e-05, + "loss": 0.8502, + "step": 12929 + }, + { + "epoch": 
2.301994301994302, + "grad_norm": 1.0485330820083618, + "learning_rate": 7.664593622185568e-05, + "loss": 0.8049, + "step": 12930 + }, + { + "epoch": 2.302172364672365, + "grad_norm": 0.7852743864059448, + "learning_rate": 7.663232605002535e-05, + "loss": 0.882, + "step": 12931 + }, + { + "epoch": 2.302350427350427, + "grad_norm": 0.7795702815055847, + "learning_rate": 7.661871633603607e-05, + "loss": 0.7841, + "step": 12932 + }, + { + "epoch": 2.30252849002849, + "grad_norm": 0.8882975578308105, + "learning_rate": 7.660510708015448e-05, + "loss": 1.117, + "step": 12933 + }, + { + "epoch": 2.3027065527065527, + "grad_norm": 0.7987662553787231, + "learning_rate": 7.65914982826472e-05, + "loss": 0.8552, + "step": 12934 + }, + { + "epoch": 2.3028846153846154, + "grad_norm": 0.8141679167747498, + "learning_rate": 7.657788994378095e-05, + "loss": 0.8288, + "step": 12935 + }, + { + "epoch": 2.303062678062678, + "grad_norm": 0.8506320118904114, + "learning_rate": 7.656428206382222e-05, + "loss": 0.7521, + "step": 12936 + }, + { + "epoch": 2.303240740740741, + "grad_norm": 0.7666227221488953, + "learning_rate": 7.655067464303773e-05, + "loss": 0.8394, + "step": 12937 + }, + { + "epoch": 2.3034188034188032, + "grad_norm": 0.8018062710762024, + "learning_rate": 7.653706768169405e-05, + "loss": 0.7566, + "step": 12938 + }, + { + "epoch": 2.303596866096866, + "grad_norm": 0.8054059743881226, + "learning_rate": 7.652346118005779e-05, + "loss": 0.8749, + "step": 12939 + }, + { + "epoch": 2.3037749287749287, + "grad_norm": 0.8663263320922852, + "learning_rate": 7.650985513839554e-05, + "loss": 0.7799, + "step": 12940 + }, + { + "epoch": 2.3039529914529915, + "grad_norm": 0.7591161727905273, + "learning_rate": 7.64962495569739e-05, + "loss": 0.7378, + "step": 12941 + }, + { + "epoch": 2.3041310541310542, + "grad_norm": 0.8118969202041626, + "learning_rate": 7.64826444360594e-05, + "loss": 0.7948, + "step": 12942 + }, + { + "epoch": 2.304309116809117, + "grad_norm": 
0.6880847811698914, + "learning_rate": 7.646903977591865e-05, + "loss": 0.9164, + "step": 12943 + }, + { + "epoch": 2.3044871794871793, + "grad_norm": 0.814386248588562, + "learning_rate": 7.645543557681816e-05, + "loss": 0.7998, + "step": 12944 + }, + { + "epoch": 2.304665242165242, + "grad_norm": 0.8295530676841736, + "learning_rate": 7.644183183902454e-05, + "loss": 0.812, + "step": 12945 + }, + { + "epoch": 2.304843304843305, + "grad_norm": 0.7872505187988281, + "learning_rate": 7.642822856280424e-05, + "loss": 0.9073, + "step": 12946 + }, + { + "epoch": 2.3050213675213675, + "grad_norm": 0.9217497110366821, + "learning_rate": 7.641462574842387e-05, + "loss": 0.7762, + "step": 12947 + }, + { + "epoch": 2.3051994301994303, + "grad_norm": 0.7502169609069824, + "learning_rate": 7.640102339614987e-05, + "loss": 0.9374, + "step": 12948 + }, + { + "epoch": 2.305377492877493, + "grad_norm": 0.8262767195701599, + "learning_rate": 7.638742150624886e-05, + "loss": 0.5363, + "step": 12949 + }, + { + "epoch": 2.3055555555555554, + "grad_norm": 0.7571384310722351, + "learning_rate": 7.637382007898722e-05, + "loss": 0.9548, + "step": 12950 + }, + { + "epoch": 2.305733618233618, + "grad_norm": 0.7899317145347595, + "learning_rate": 7.636021911463152e-05, + "loss": 0.7718, + "step": 12951 + }, + { + "epoch": 2.305911680911681, + "grad_norm": 0.7772458791732788, + "learning_rate": 7.634661861344819e-05, + "loss": 0.7158, + "step": 12952 + }, + { + "epoch": 2.3060897435897436, + "grad_norm": 0.8279168009757996, + "learning_rate": 7.633301857570374e-05, + "loss": 0.7835, + "step": 12953 + }, + { + "epoch": 2.3062678062678064, + "grad_norm": 0.751268208026886, + "learning_rate": 7.631941900166468e-05, + "loss": 0.8609, + "step": 12954 + }, + { + "epoch": 2.306445868945869, + "grad_norm": 0.8101294636726379, + "learning_rate": 7.630581989159736e-05, + "loss": 1.0242, + "step": 12955 + }, + { + "epoch": 2.306623931623932, + "grad_norm": 0.7707645297050476, + "learning_rate": 
7.629222124576831e-05, + "loss": 0.7969, + "step": 12956 + }, + { + "epoch": 2.306801994301994, + "grad_norm": 0.6519944667816162, + "learning_rate": 7.627862306444391e-05, + "loss": 0.5459, + "step": 12957 + }, + { + "epoch": 2.306980056980057, + "grad_norm": 0.7738897800445557, + "learning_rate": 7.626502534789063e-05, + "loss": 0.957, + "step": 12958 + }, + { + "epoch": 2.3071581196581197, + "grad_norm": 0.7059842944145203, + "learning_rate": 7.625142809637485e-05, + "loss": 0.6316, + "step": 12959 + }, + { + "epoch": 2.3073361823361824, + "grad_norm": 0.8380797505378723, + "learning_rate": 7.623783131016305e-05, + "loss": 0.9685, + "step": 12960 + }, + { + "epoch": 2.307514245014245, + "grad_norm": 0.8272121548652649, + "learning_rate": 7.622423498952154e-05, + "loss": 0.9425, + "step": 12961 + }, + { + "epoch": 2.3076923076923075, + "grad_norm": 0.763522744178772, + "learning_rate": 7.621063913471678e-05, + "loss": 0.7778, + "step": 12962 + }, + { + "epoch": 2.3078703703703702, + "grad_norm": 0.8345584273338318, + "learning_rate": 7.61970437460151e-05, + "loss": 0.9652, + "step": 12963 + }, + { + "epoch": 2.308048433048433, + "grad_norm": 0.943286657333374, + "learning_rate": 7.618344882368294e-05, + "loss": 0.9088, + "step": 12964 + }, + { + "epoch": 2.3082264957264957, + "grad_norm": 0.8568450212478638, + "learning_rate": 7.616985436798659e-05, + "loss": 0.7535, + "step": 12965 + }, + { + "epoch": 2.3084045584045585, + "grad_norm": 0.8722548484802246, + "learning_rate": 7.615626037919248e-05, + "loss": 0.9802, + "step": 12966 + }, + { + "epoch": 2.3085826210826212, + "grad_norm": 1.0332363843917847, + "learning_rate": 7.614266685756688e-05, + "loss": 0.9105, + "step": 12967 + }, + { + "epoch": 2.308760683760684, + "grad_norm": 0.7503480315208435, + "learning_rate": 7.612907380337619e-05, + "loss": 0.7345, + "step": 12968 + }, + { + "epoch": 2.3089387464387463, + "grad_norm": 0.7406014204025269, + "learning_rate": 7.611548121688668e-05, + "loss": 0.9222, + 
"step": 12969 + }, + { + "epoch": 2.309116809116809, + "grad_norm": 0.7574487328529358, + "learning_rate": 7.610188909836474e-05, + "loss": 0.7709, + "step": 12970 + }, + { + "epoch": 2.309294871794872, + "grad_norm": 0.8669037818908691, + "learning_rate": 7.608829744807661e-05, + "loss": 0.838, + "step": 12971 + }, + { + "epoch": 2.3094729344729346, + "grad_norm": 0.7544569373130798, + "learning_rate": 7.607470626628861e-05, + "loss": 0.6966, + "step": 12972 + }, + { + "epoch": 2.3096509971509973, + "grad_norm": 0.8201249241828918, + "learning_rate": 7.606111555326706e-05, + "loss": 0.9322, + "step": 12973 + }, + { + "epoch": 2.3098290598290596, + "grad_norm": 0.7935477495193481, + "learning_rate": 7.60475253092782e-05, + "loss": 0.7981, + "step": 12974 + }, + { + "epoch": 2.3100071225071224, + "grad_norm": 0.8775026798248291, + "learning_rate": 7.603393553458838e-05, + "loss": 0.8352, + "step": 12975 + }, + { + "epoch": 2.310185185185185, + "grad_norm": 0.8422450423240662, + "learning_rate": 7.602034622946374e-05, + "loss": 0.577, + "step": 12976 + }, + { + "epoch": 2.310363247863248, + "grad_norm": 0.8584204316139221, + "learning_rate": 7.600675739417067e-05, + "loss": 0.9767, + "step": 12977 + }, + { + "epoch": 2.3105413105413106, + "grad_norm": 0.7818547487258911, + "learning_rate": 7.599316902897528e-05, + "loss": 0.7675, + "step": 12978 + }, + { + "epoch": 2.3107193732193734, + "grad_norm": 0.93815016746521, + "learning_rate": 7.597958113414391e-05, + "loss": 0.8517, + "step": 12979 + }, + { + "epoch": 2.310897435897436, + "grad_norm": 0.8092408776283264, + "learning_rate": 7.596599370994272e-05, + "loss": 0.7266, + "step": 12980 + }, + { + "epoch": 2.3110754985754984, + "grad_norm": 0.8577243089675903, + "learning_rate": 7.595240675663802e-05, + "loss": 0.9138, + "step": 12981 + }, + { + "epoch": 2.311253561253561, + "grad_norm": 0.8600401878356934, + "learning_rate": 7.59388202744959e-05, + "loss": 0.8348, + "step": 12982 + }, + { + "epoch": 
2.311431623931624, + "grad_norm": 0.6399564743041992, + "learning_rate": 7.592523426378264e-05, + "loss": 0.6649, + "step": 12983 + }, + { + "epoch": 2.3116096866096867, + "grad_norm": 0.7916820049285889, + "learning_rate": 7.591164872476438e-05, + "loss": 0.8048, + "step": 12984 + }, + { + "epoch": 2.3117877492877494, + "grad_norm": 0.7748355269432068, + "learning_rate": 7.589806365770736e-05, + "loss": 1.0101, + "step": 12985 + }, + { + "epoch": 2.3119658119658117, + "grad_norm": 0.8463436365127563, + "learning_rate": 7.588447906287767e-05, + "loss": 0.7547, + "step": 12986 + }, + { + "epoch": 2.3121438746438745, + "grad_norm": 0.8257808685302734, + "learning_rate": 7.587089494054155e-05, + "loss": 0.8093, + "step": 12987 + }, + { + "epoch": 2.3123219373219372, + "grad_norm": 0.843781054019928, + "learning_rate": 7.58573112909651e-05, + "loss": 0.8379, + "step": 12988 + }, + { + "epoch": 2.3125, + "grad_norm": 0.8782341480255127, + "learning_rate": 7.584372811441452e-05, + "loss": 0.9372, + "step": 12989 + }, + { + "epoch": 2.3126780626780628, + "grad_norm": 0.8465158343315125, + "learning_rate": 7.583014541115585e-05, + "loss": 0.8427, + "step": 12990 + }, + { + "epoch": 2.3128561253561255, + "grad_norm": 0.7140238285064697, + "learning_rate": 7.58165631814553e-05, + "loss": 0.7896, + "step": 12991 + }, + { + "epoch": 2.3130341880341883, + "grad_norm": 0.9414699077606201, + "learning_rate": 7.580298142557898e-05, + "loss": 1.0464, + "step": 12992 + }, + { + "epoch": 2.3132122507122506, + "grad_norm": 0.7970326542854309, + "learning_rate": 7.578940014379293e-05, + "loss": 0.696, + "step": 12993 + }, + { + "epoch": 2.3133903133903133, + "grad_norm": 0.7377375960350037, + "learning_rate": 7.577581933636332e-05, + "loss": 0.7205, + "step": 12994 + }, + { + "epoch": 2.313568376068376, + "grad_norm": 0.7740316987037659, + "learning_rate": 7.576223900355619e-05, + "loss": 0.7448, + "step": 12995 + }, + { + "epoch": 2.313746438746439, + "grad_norm": 0.7820385098457336, 
+ "learning_rate": 7.574865914563767e-05, + "loss": 0.7289, + "step": 12996 + }, + { + "epoch": 2.3139245014245016, + "grad_norm": 0.8180822134017944, + "learning_rate": 7.573507976287376e-05, + "loss": 0.8709, + "step": 12997 + }, + { + "epoch": 2.314102564102564, + "grad_norm": 0.9008440971374512, + "learning_rate": 7.572150085553058e-05, + "loss": 0.7938, + "step": 12998 + }, + { + "epoch": 2.3142806267806266, + "grad_norm": 0.786400318145752, + "learning_rate": 7.570792242387414e-05, + "loss": 0.9866, + "step": 12999 + }, + { + "epoch": 2.3144586894586894, + "grad_norm": 0.872160792350769, + "learning_rate": 7.569434446817052e-05, + "loss": 0.7319, + "step": 13000 + }, + { + "epoch": 2.314636752136752, + "grad_norm": 0.7858988642692566, + "learning_rate": 7.56807669886857e-05, + "loss": 0.8786, + "step": 13001 + }, + { + "epoch": 2.314814814814815, + "grad_norm": 0.7090579271316528, + "learning_rate": 7.566718998568579e-05, + "loss": 0.7092, + "step": 13002 + }, + { + "epoch": 2.3149928774928776, + "grad_norm": 0.7881498336791992, + "learning_rate": 7.565361345943668e-05, + "loss": 0.876, + "step": 13003 + }, + { + "epoch": 2.3151709401709404, + "grad_norm": 0.8768819570541382, + "learning_rate": 7.564003741020447e-05, + "loss": 1.0374, + "step": 13004 + }, + { + "epoch": 2.3153490028490027, + "grad_norm": 0.7608295679092407, + "learning_rate": 7.56264618382551e-05, + "loss": 0.7551, + "step": 13005 + }, + { + "epoch": 2.3155270655270654, + "grad_norm": 0.6947942972183228, + "learning_rate": 7.561288674385462e-05, + "loss": 0.8132, + "step": 13006 + }, + { + "epoch": 2.315705128205128, + "grad_norm": 0.8722706437110901, + "learning_rate": 7.559931212726892e-05, + "loss": 0.7486, + "step": 13007 + }, + { + "epoch": 2.315883190883191, + "grad_norm": 0.9804681539535522, + "learning_rate": 7.558573798876404e-05, + "loss": 0.899, + "step": 13008 + }, + { + "epoch": 2.3160612535612537, + "grad_norm": 0.9246195554733276, + "learning_rate": 7.557216432860587e-05, + 
"loss": 0.7742, + "step": 13009 + }, + { + "epoch": 2.316239316239316, + "grad_norm": 0.8792895078659058, + "learning_rate": 7.555859114706046e-05, + "loss": 0.8299, + "step": 13010 + }, + { + "epoch": 2.3164173789173788, + "grad_norm": 0.8280500769615173, + "learning_rate": 7.554501844439362e-05, + "loss": 0.8708, + "step": 13011 + }, + { + "epoch": 2.3165954415954415, + "grad_norm": 0.8560570478439331, + "learning_rate": 7.553144622087136e-05, + "loss": 0.9571, + "step": 13012 + }, + { + "epoch": 2.3167735042735043, + "grad_norm": 0.8504697680473328, + "learning_rate": 7.551787447675962e-05, + "loss": 0.7609, + "step": 13013 + }, + { + "epoch": 2.316951566951567, + "grad_norm": 0.8199480772018433, + "learning_rate": 7.550430321232423e-05, + "loss": 0.7077, + "step": 13014 + }, + { + "epoch": 2.3171296296296298, + "grad_norm": 0.854341447353363, + "learning_rate": 7.549073242783115e-05, + "loss": 0.9602, + "step": 13015 + }, + { + "epoch": 2.3173076923076925, + "grad_norm": 0.7619566917419434, + "learning_rate": 7.547716212354623e-05, + "loss": 0.9967, + "step": 13016 + }, + { + "epoch": 2.317485754985755, + "grad_norm": 0.7371547222137451, + "learning_rate": 7.546359229973543e-05, + "loss": 0.7239, + "step": 13017 + }, + { + "epoch": 2.3176638176638176, + "grad_norm": 0.7797415852546692, + "learning_rate": 7.545002295666453e-05, + "loss": 0.7472, + "step": 13018 + }, + { + "epoch": 2.3178418803418803, + "grad_norm": 0.8549608588218689, + "learning_rate": 7.543645409459943e-05, + "loss": 0.8968, + "step": 13019 + }, + { + "epoch": 2.318019943019943, + "grad_norm": 0.7931239008903503, + "learning_rate": 7.542288571380598e-05, + "loss": 0.9853, + "step": 13020 + }, + { + "epoch": 2.318198005698006, + "grad_norm": 0.797726035118103, + "learning_rate": 7.540931781455008e-05, + "loss": 0.9366, + "step": 13021 + }, + { + "epoch": 2.318376068376068, + "grad_norm": 0.7382092475891113, + "learning_rate": 7.539575039709747e-05, + "loss": 0.6484, + "step": 13022 + }, + { + 
"epoch": 2.318554131054131, + "grad_norm": 0.83231121301651, + "learning_rate": 7.538218346171403e-05, + "loss": 1.0184, + "step": 13023 + }, + { + "epoch": 2.3187321937321936, + "grad_norm": 0.8613845109939575, + "learning_rate": 7.536861700866554e-05, + "loss": 0.8019, + "step": 13024 + }, + { + "epoch": 2.3189102564102564, + "grad_norm": 0.7736538648605347, + "learning_rate": 7.53550510382179e-05, + "loss": 0.8228, + "step": 13025 + }, + { + "epoch": 2.319088319088319, + "grad_norm": 0.7894346714019775, + "learning_rate": 7.534148555063678e-05, + "loss": 0.8189, + "step": 13026 + }, + { + "epoch": 2.319266381766382, + "grad_norm": 0.7333146333694458, + "learning_rate": 7.532792054618807e-05, + "loss": 0.8456, + "step": 13027 + }, + { + "epoch": 2.3194444444444446, + "grad_norm": 1.0321780443191528, + "learning_rate": 7.531435602513745e-05, + "loss": 0.8594, + "step": 13028 + }, + { + "epoch": 2.319622507122507, + "grad_norm": 0.8658601641654968, + "learning_rate": 7.530079198775079e-05, + "loss": 0.9638, + "step": 13029 + }, + { + "epoch": 2.3198005698005697, + "grad_norm": 0.7287920713424683, + "learning_rate": 7.528722843429376e-05, + "loss": 0.7001, + "step": 13030 + }, + { + "epoch": 2.3199786324786325, + "grad_norm": 0.7398431301116943, + "learning_rate": 7.527366536503218e-05, + "loss": 0.8595, + "step": 13031 + }, + { + "epoch": 2.320156695156695, + "grad_norm": 0.8127739429473877, + "learning_rate": 7.526010278023178e-05, + "loss": 0.7641, + "step": 13032 + }, + { + "epoch": 2.320334757834758, + "grad_norm": 0.776497483253479, + "learning_rate": 7.524654068015824e-05, + "loss": 0.8666, + "step": 13033 + }, + { + "epoch": 2.3205128205128207, + "grad_norm": 0.8524185419082642, + "learning_rate": 7.523297906507733e-05, + "loss": 0.9815, + "step": 13034 + }, + { + "epoch": 2.320690883190883, + "grad_norm": 0.7745016813278198, + "learning_rate": 7.521941793525474e-05, + "loss": 0.8527, + "step": 13035 + }, + { + "epoch": 2.3208689458689458, + "grad_norm": 
0.8695911169052124, + "learning_rate": 7.52058572909562e-05, + "loss": 0.8881, + "step": 13036 + }, + { + "epoch": 2.3210470085470085, + "grad_norm": 0.788969099521637, + "learning_rate": 7.519229713244736e-05, + "loss": 0.7886, + "step": 13037 + }, + { + "epoch": 2.3212250712250713, + "grad_norm": 0.776520311832428, + "learning_rate": 7.517873745999394e-05, + "loss": 0.5986, + "step": 13038 + }, + { + "epoch": 2.321403133903134, + "grad_norm": 0.8118561506271362, + "learning_rate": 7.516517827386158e-05, + "loss": 0.8805, + "step": 13039 + }, + { + "epoch": 2.3215811965811968, + "grad_norm": 0.8859134912490845, + "learning_rate": 7.515161957431596e-05, + "loss": 0.8861, + "step": 13040 + }, + { + "epoch": 2.321759259259259, + "grad_norm": 0.8181297779083252, + "learning_rate": 7.513806136162273e-05, + "loss": 0.9015, + "step": 13041 + }, + { + "epoch": 2.321937321937322, + "grad_norm": 0.8488339185714722, + "learning_rate": 7.512450363604759e-05, + "loss": 1.0423, + "step": 13042 + }, + { + "epoch": 2.3221153846153846, + "grad_norm": 0.7755734920501709, + "learning_rate": 7.511094639785607e-05, + "loss": 0.7595, + "step": 13043 + }, + { + "epoch": 2.3222934472934473, + "grad_norm": 0.8437283635139465, + "learning_rate": 7.509738964731389e-05, + "loss": 0.9011, + "step": 13044 + }, + { + "epoch": 2.32247150997151, + "grad_norm": 0.7508310675621033, + "learning_rate": 7.508383338468659e-05, + "loss": 0.8335, + "step": 13045 + }, + { + "epoch": 2.322649572649573, + "grad_norm": 0.8001464605331421, + "learning_rate": 7.507027761023987e-05, + "loss": 0.9785, + "step": 13046 + }, + { + "epoch": 2.322827635327635, + "grad_norm": 0.8142531514167786, + "learning_rate": 7.505672232423923e-05, + "loss": 0.8577, + "step": 13047 + }, + { + "epoch": 2.323005698005698, + "grad_norm": 0.7852125763893127, + "learning_rate": 7.504316752695035e-05, + "loss": 0.798, + "step": 13048 + }, + { + "epoch": 2.3231837606837606, + "grad_norm": 0.8998631238937378, + "learning_rate": 
7.502961321863871e-05, + "loss": 0.9291, + "step": 13049 + }, + { + "epoch": 2.3233618233618234, + "grad_norm": 0.8850175738334656, + "learning_rate": 7.501605939956995e-05, + "loss": 0.9536, + "step": 13050 + }, + { + "epoch": 2.323539886039886, + "grad_norm": 0.8305020928382874, + "learning_rate": 7.500250607000959e-05, + "loss": 0.8695, + "step": 13051 + }, + { + "epoch": 2.323717948717949, + "grad_norm": 0.8073359727859497, + "learning_rate": 7.498895323022317e-05, + "loss": 0.6831, + "step": 13052 + }, + { + "epoch": 2.323896011396011, + "grad_norm": 0.8435724973678589, + "learning_rate": 7.497540088047632e-05, + "loss": 0.9419, + "step": 13053 + }, + { + "epoch": 2.324074074074074, + "grad_norm": 0.927147388458252, + "learning_rate": 7.496184902103446e-05, + "loss": 0.957, + "step": 13054 + }, + { + "epoch": 2.3242521367521367, + "grad_norm": 0.7923009395599365, + "learning_rate": 7.494829765216319e-05, + "loss": 0.839, + "step": 13055 + }, + { + "epoch": 2.3244301994301995, + "grad_norm": 0.7830277681350708, + "learning_rate": 7.493474677412794e-05, + "loss": 1.0236, + "step": 13056 + }, + { + "epoch": 2.324608262108262, + "grad_norm": 0.8470967411994934, + "learning_rate": 7.492119638719432e-05, + "loss": 0.9144, + "step": 13057 + }, + { + "epoch": 2.324786324786325, + "grad_norm": 0.7469272613525391, + "learning_rate": 7.490764649162771e-05, + "loss": 0.7101, + "step": 13058 + }, + { + "epoch": 2.3249643874643873, + "grad_norm": 0.9236082434654236, + "learning_rate": 7.489409708769366e-05, + "loss": 0.6658, + "step": 13059 + }, + { + "epoch": 2.32514245014245, + "grad_norm": 0.8271692395210266, + "learning_rate": 7.48805481756576e-05, + "loss": 0.775, + "step": 13060 + }, + { + "epoch": 2.3253205128205128, + "grad_norm": 0.9878279566764832, + "learning_rate": 7.486699975578507e-05, + "loss": 0.6881, + "step": 13061 + }, + { + "epoch": 2.3254985754985755, + "grad_norm": 0.7332003712654114, + "learning_rate": 7.485345182834142e-05, + "loss": 0.8384, + 
"step": 13062 + }, + { + "epoch": 2.3256766381766383, + "grad_norm": 0.9525214433670044, + "learning_rate": 7.483990439359221e-05, + "loss": 0.9892, + "step": 13063 + }, + { + "epoch": 2.325854700854701, + "grad_norm": 0.7413233518600464, + "learning_rate": 7.482635745180273e-05, + "loss": 0.8749, + "step": 13064 + }, + { + "epoch": 2.326032763532764, + "grad_norm": 0.8286891579627991, + "learning_rate": 7.481281100323854e-05, + "loss": 1.0313, + "step": 13065 + }, + { + "epoch": 2.326210826210826, + "grad_norm": 0.868653416633606, + "learning_rate": 7.479926504816495e-05, + "loss": 0.7407, + "step": 13066 + }, + { + "epoch": 2.326388888888889, + "grad_norm": 0.790052056312561, + "learning_rate": 7.478571958684746e-05, + "loss": 0.7156, + "step": 13067 + }, + { + "epoch": 2.3265669515669516, + "grad_norm": 0.8799049854278564, + "learning_rate": 7.477217461955137e-05, + "loss": 0.7706, + "step": 13068 + }, + { + "epoch": 2.3267450142450143, + "grad_norm": 0.8246361017227173, + "learning_rate": 7.475863014654214e-05, + "loss": 0.9402, + "step": 13069 + }, + { + "epoch": 2.326923076923077, + "grad_norm": 0.870917022228241, + "learning_rate": 7.474508616808508e-05, + "loss": 0.962, + "step": 13070 + }, + { + "epoch": 2.3271011396011394, + "grad_norm": 0.8706079125404358, + "learning_rate": 7.473154268444563e-05, + "loss": 0.9094, + "step": 13071 + }, + { + "epoch": 2.327279202279202, + "grad_norm": 0.9031453728675842, + "learning_rate": 7.471799969588912e-05, + "loss": 0.8447, + "step": 13072 + }, + { + "epoch": 2.327457264957265, + "grad_norm": 0.9153435230255127, + "learning_rate": 7.470445720268086e-05, + "loss": 0.9935, + "step": 13073 + }, + { + "epoch": 2.3276353276353277, + "grad_norm": 0.8236302733421326, + "learning_rate": 7.469091520508624e-05, + "loss": 0.7911, + "step": 13074 + }, + { + "epoch": 2.3278133903133904, + "grad_norm": 0.7344710826873779, + "learning_rate": 7.467737370337054e-05, + "loss": 0.6565, + "step": 13075 + }, + { + "epoch": 
2.327991452991453, + "grad_norm": 0.8711966276168823, + "learning_rate": 7.466383269779911e-05, + "loss": 0.8332, + "step": 13076 + }, + { + "epoch": 2.328169515669516, + "grad_norm": 0.836825966835022, + "learning_rate": 7.465029218863723e-05, + "loss": 0.9359, + "step": 13077 + }, + { + "epoch": 2.328347578347578, + "grad_norm": 0.9791260361671448, + "learning_rate": 7.463675217615024e-05, + "loss": 0.8938, + "step": 13078 + }, + { + "epoch": 2.328525641025641, + "grad_norm": 0.7260454893112183, + "learning_rate": 7.46232126606034e-05, + "loss": 0.6871, + "step": 13079 + }, + { + "epoch": 2.3287037037037037, + "grad_norm": 0.7887428998947144, + "learning_rate": 7.460967364226197e-05, + "loss": 0.8098, + "step": 13080 + }, + { + "epoch": 2.3288817663817665, + "grad_norm": 0.8303743004798889, + "learning_rate": 7.459613512139124e-05, + "loss": 0.8897, + "step": 13081 + }, + { + "epoch": 2.3290598290598292, + "grad_norm": 0.7933324575424194, + "learning_rate": 7.458259709825652e-05, + "loss": 0.754, + "step": 13082 + }, + { + "epoch": 2.3292378917378915, + "grad_norm": 0.8998779058456421, + "learning_rate": 7.456905957312296e-05, + "loss": 0.866, + "step": 13083 + }, + { + "epoch": 2.3294159544159543, + "grad_norm": 0.8205044269561768, + "learning_rate": 7.455552254625588e-05, + "loss": 0.7455, + "step": 13084 + }, + { + "epoch": 2.329594017094017, + "grad_norm": 0.8731769323348999, + "learning_rate": 7.454198601792046e-05, + "loss": 0.8876, + "step": 13085 + }, + { + "epoch": 2.32977207977208, + "grad_norm": 0.9183599352836609, + "learning_rate": 7.452844998838194e-05, + "loss": 1.0991, + "step": 13086 + }, + { + "epoch": 2.3299501424501425, + "grad_norm": 0.8820931315422058, + "learning_rate": 7.451491445790553e-05, + "loss": 0.7591, + "step": 13087 + }, + { + "epoch": 2.3301282051282053, + "grad_norm": 0.7837240099906921, + "learning_rate": 7.450137942675646e-05, + "loss": 0.792, + "step": 13088 + }, + { + "epoch": 2.330306267806268, + "grad_norm": 
0.8960266709327698, + "learning_rate": 7.448784489519984e-05, + "loss": 0.9725, + "step": 13089 + }, + { + "epoch": 2.3304843304843303, + "grad_norm": 0.8010196685791016, + "learning_rate": 7.447431086350092e-05, + "loss": 0.8418, + "step": 13090 + }, + { + "epoch": 2.330662393162393, + "grad_norm": 0.8652680516242981, + "learning_rate": 7.446077733192486e-05, + "loss": 0.93, + "step": 13091 + }, + { + "epoch": 2.330840455840456, + "grad_norm": 0.9385902285575867, + "learning_rate": 7.44472443007368e-05, + "loss": 0.968, + "step": 13092 + }, + { + "epoch": 2.3310185185185186, + "grad_norm": 0.8097951412200928, + "learning_rate": 7.443371177020195e-05, + "loss": 0.8715, + "step": 13093 + }, + { + "epoch": 2.3311965811965814, + "grad_norm": 0.7931473255157471, + "learning_rate": 7.442017974058537e-05, + "loss": 0.865, + "step": 13094 + }, + { + "epoch": 2.3313746438746437, + "grad_norm": 0.7680486440658569, + "learning_rate": 7.440664821215224e-05, + "loss": 0.9155, + "step": 13095 + }, + { + "epoch": 2.3315527065527064, + "grad_norm": 0.8128345012664795, + "learning_rate": 7.439311718516766e-05, + "loss": 0.8707, + "step": 13096 + }, + { + "epoch": 2.331730769230769, + "grad_norm": 0.9534463286399841, + "learning_rate": 7.43795866598968e-05, + "loss": 1.1102, + "step": 13097 + }, + { + "epoch": 2.331908831908832, + "grad_norm": 0.9140331745147705, + "learning_rate": 7.436605663660468e-05, + "loss": 0.7744, + "step": 13098 + }, + { + "epoch": 2.3320868945868947, + "grad_norm": 0.8316463828086853, + "learning_rate": 7.435252711555645e-05, + "loss": 0.7201, + "step": 13099 + }, + { + "epoch": 2.3322649572649574, + "grad_norm": 0.7714298963546753, + "learning_rate": 7.433899809701714e-05, + "loss": 0.9111, + "step": 13100 + }, + { + "epoch": 2.33244301994302, + "grad_norm": 0.999081552028656, + "learning_rate": 7.432546958125188e-05, + "loss": 0.8839, + "step": 13101 + }, + { + "epoch": 2.3326210826210825, + "grad_norm": 0.8515602350234985, + "learning_rate": 
7.43119415685257e-05, + "loss": 0.7245, + "step": 13102 + }, + { + "epoch": 2.3327991452991452, + "grad_norm": 0.9441094398498535, + "learning_rate": 7.42984140591037e-05, + "loss": 0.8678, + "step": 13103 + }, + { + "epoch": 2.332977207977208, + "grad_norm": 0.8220996856689453, + "learning_rate": 7.428488705325084e-05, + "loss": 0.9737, + "step": 13104 + }, + { + "epoch": 2.3331552706552707, + "grad_norm": 0.8164090514183044, + "learning_rate": 7.427136055123222e-05, + "loss": 0.9138, + "step": 13105 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 0.9672707319259644, + "learning_rate": 7.425783455331281e-05, + "loss": 0.7723, + "step": 13106 + }, + { + "epoch": 2.333511396011396, + "grad_norm": 0.7953858971595764, + "learning_rate": 7.424430905975773e-05, + "loss": 0.8876, + "step": 13107 + }, + { + "epoch": 2.3336894586894585, + "grad_norm": 0.7809541821479797, + "learning_rate": 7.423078407083183e-05, + "loss": 1.0268, + "step": 13108 + }, + { + "epoch": 2.3338675213675213, + "grad_norm": 0.805270254611969, + "learning_rate": 7.421725958680025e-05, + "loss": 0.7515, + "step": 13109 + }, + { + "epoch": 2.334045584045584, + "grad_norm": 0.8066652417182922, + "learning_rate": 7.420373560792788e-05, + "loss": 1.004, + "step": 13110 + }, + { + "epoch": 2.334223646723647, + "grad_norm": 0.9382686018943787, + "learning_rate": 7.41902121344797e-05, + "loss": 0.8769, + "step": 13111 + }, + { + "epoch": 2.3344017094017095, + "grad_norm": 0.7908356785774231, + "learning_rate": 7.417668916672074e-05, + "loss": 0.8491, + "step": 13112 + }, + { + "epoch": 2.3345797720797723, + "grad_norm": 0.7188867330551147, + "learning_rate": 7.416316670491588e-05, + "loss": 0.7138, + "step": 13113 + }, + { + "epoch": 2.3347578347578346, + "grad_norm": 0.8477714657783508, + "learning_rate": 7.414964474933012e-05, + "loss": 0.9715, + "step": 13114 + }, + { + "epoch": 2.3349358974358974, + "grad_norm": 0.8769845366477966, + "learning_rate": 7.413612330022835e-05, + "loss": 0.9029, + 
"step": 13115 + }, + { + "epoch": 2.33511396011396, + "grad_norm": 0.9011028409004211, + "learning_rate": 7.412260235787554e-05, + "loss": 1.026, + "step": 13116 + }, + { + "epoch": 2.335292022792023, + "grad_norm": 0.7775689363479614, + "learning_rate": 7.410908192253656e-05, + "loss": 0.8492, + "step": 13117 + }, + { + "epoch": 2.3354700854700856, + "grad_norm": 0.9587660431861877, + "learning_rate": 7.409556199447637e-05, + "loss": 0.8731, + "step": 13118 + }, + { + "epoch": 2.335648148148148, + "grad_norm": 0.8117266297340393, + "learning_rate": 7.408204257395979e-05, + "loss": 0.8827, + "step": 13119 + }, + { + "epoch": 2.3358262108262107, + "grad_norm": 0.7382497787475586, + "learning_rate": 7.40685236612518e-05, + "loss": 0.6617, + "step": 13120 + }, + { + "epoch": 2.3360042735042734, + "grad_norm": 0.8630974888801575, + "learning_rate": 7.405500525661717e-05, + "loss": 0.843, + "step": 13121 + }, + { + "epoch": 2.336182336182336, + "grad_norm": 0.7496539950370789, + "learning_rate": 7.404148736032083e-05, + "loss": 1.0354, + "step": 13122 + }, + { + "epoch": 2.336360398860399, + "grad_norm": 0.8409397602081299, + "learning_rate": 7.402796997262761e-05, + "loss": 0.9848, + "step": 13123 + }, + { + "epoch": 2.3365384615384617, + "grad_norm": 0.8018865585327148, + "learning_rate": 7.40144530938024e-05, + "loss": 0.8929, + "step": 13124 + }, + { + "epoch": 2.3367165242165244, + "grad_norm": 0.7378625869750977, + "learning_rate": 7.400093672410996e-05, + "loss": 0.7749, + "step": 13125 + }, + { + "epoch": 2.3368945868945867, + "grad_norm": 0.8906251192092896, + "learning_rate": 7.398742086381519e-05, + "loss": 1.007, + "step": 13126 + }, + { + "epoch": 2.3370726495726495, + "grad_norm": 0.8324725031852722, + "learning_rate": 7.397390551318283e-05, + "loss": 0.8493, + "step": 13127 + }, + { + "epoch": 2.3372507122507122, + "grad_norm": 0.781080961227417, + "learning_rate": 7.39603906724778e-05, + "loss": 0.696, + "step": 13128 + }, + { + "epoch": 
2.337428774928775, + "grad_norm": 0.8068976402282715, + "learning_rate": 7.394687634196476e-05, + "loss": 0.7196, + "step": 13129 + }, + { + "epoch": 2.3376068376068377, + "grad_norm": 0.7588358521461487, + "learning_rate": 7.393336252190854e-05, + "loss": 0.9179, + "step": 13130 + }, + { + "epoch": 2.3377849002849, + "grad_norm": 0.8334088325500488, + "learning_rate": 7.391984921257398e-05, + "loss": 0.94, + "step": 13131 + }, + { + "epoch": 2.337962962962963, + "grad_norm": 0.9485353231430054, + "learning_rate": 7.390633641422578e-05, + "loss": 0.9253, + "step": 13132 + }, + { + "epoch": 2.3381410256410255, + "grad_norm": 0.9447978734970093, + "learning_rate": 7.389282412712874e-05, + "loss": 0.9112, + "step": 13133 + }, + { + "epoch": 2.3383190883190883, + "grad_norm": 0.7348376512527466, + "learning_rate": 7.387931235154754e-05, + "loss": 0.7817, + "step": 13134 + }, + { + "epoch": 2.338497150997151, + "grad_norm": 0.8610092401504517, + "learning_rate": 7.386580108774699e-05, + "loss": 0.8231, + "step": 13135 + }, + { + "epoch": 2.338675213675214, + "grad_norm": 0.8314286470413208, + "learning_rate": 7.385229033599175e-05, + "loss": 0.7323, + "step": 13136 + }, + { + "epoch": 2.3388532763532766, + "grad_norm": 0.7775855660438538, + "learning_rate": 7.383878009654657e-05, + "loss": 0.9897, + "step": 13137 + }, + { + "epoch": 2.339031339031339, + "grad_norm": 0.8140097260475159, + "learning_rate": 7.382527036967614e-05, + "loss": 0.7815, + "step": 13138 + }, + { + "epoch": 2.3392094017094016, + "grad_norm": 0.8154003620147705, + "learning_rate": 7.38117611556452e-05, + "loss": 0.7646, + "step": 13139 + }, + { + "epoch": 2.3393874643874644, + "grad_norm": 0.7705643177032471, + "learning_rate": 7.379825245471836e-05, + "loss": 0.7633, + "step": 13140 + }, + { + "epoch": 2.339565527065527, + "grad_norm": 0.7856985330581665, + "learning_rate": 7.378474426716035e-05, + "loss": 0.6803, + "step": 13141 + }, + { + "epoch": 2.33974358974359, + "grad_norm": 
0.8384547233581543, + "learning_rate": 7.377123659323579e-05, + "loss": 0.9092, + "step": 13142 + }, + { + "epoch": 2.339921652421652, + "grad_norm": 0.7456032633781433, + "learning_rate": 7.375772943320942e-05, + "loss": 0.8393, + "step": 13143 + }, + { + "epoch": 2.340099715099715, + "grad_norm": 0.9527342319488525, + "learning_rate": 7.374422278734579e-05, + "loss": 1.0272, + "step": 13144 + }, + { + "epoch": 2.3402777777777777, + "grad_norm": 0.8976300954818726, + "learning_rate": 7.37307166559096e-05, + "loss": 0.7184, + "step": 13145 + }, + { + "epoch": 2.3404558404558404, + "grad_norm": 0.7698291540145874, + "learning_rate": 7.371721103916542e-05, + "loss": 0.6783, + "step": 13146 + }, + { + "epoch": 2.340633903133903, + "grad_norm": 0.8646810054779053, + "learning_rate": 7.37037059373779e-05, + "loss": 0.8559, + "step": 13147 + }, + { + "epoch": 2.340811965811966, + "grad_norm": 0.7534750699996948, + "learning_rate": 7.369020135081161e-05, + "loss": 0.8087, + "step": 13148 + }, + { + "epoch": 2.3409900284900287, + "grad_norm": 0.7408546209335327, + "learning_rate": 7.367669727973123e-05, + "loss": 0.675, + "step": 13149 + }, + { + "epoch": 2.341168091168091, + "grad_norm": 0.8753145933151245, + "learning_rate": 7.366319372440124e-05, + "loss": 0.8163, + "step": 13150 + }, + { + "epoch": 2.3413461538461537, + "grad_norm": 0.7065265774726868, + "learning_rate": 7.364969068508624e-05, + "loss": 0.5786, + "step": 13151 + }, + { + "epoch": 2.3415242165242165, + "grad_norm": 0.7976117730140686, + "learning_rate": 7.363618816205087e-05, + "loss": 0.9053, + "step": 13152 + }, + { + "epoch": 2.3417022792022792, + "grad_norm": 0.7261707782745361, + "learning_rate": 7.362268615555958e-05, + "loss": 0.7677, + "step": 13153 + }, + { + "epoch": 2.341880341880342, + "grad_norm": 0.7868889570236206, + "learning_rate": 7.360918466587701e-05, + "loss": 0.8648, + "step": 13154 + }, + { + "epoch": 2.3420584045584047, + "grad_norm": 0.8473666310310364, + "learning_rate": 
7.35956836932676e-05, + "loss": 0.8533, + "step": 13155 + }, + { + "epoch": 2.342236467236467, + "grad_norm": 0.7456569671630859, + "learning_rate": 7.358218323799594e-05, + "loss": 0.7617, + "step": 13156 + }, + { + "epoch": 2.34241452991453, + "grad_norm": 0.8130928874015808, + "learning_rate": 7.356868330032652e-05, + "loss": 0.8667, + "step": 13157 + }, + { + "epoch": 2.3425925925925926, + "grad_norm": 0.8743309378623962, + "learning_rate": 7.355518388052384e-05, + "loss": 0.9196, + "step": 13158 + }, + { + "epoch": 2.3427706552706553, + "grad_norm": 0.8228809237480164, + "learning_rate": 7.354168497885237e-05, + "loss": 0.6509, + "step": 13159 + }, + { + "epoch": 2.342948717948718, + "grad_norm": 0.6998807191848755, + "learning_rate": 7.352818659557668e-05, + "loss": 0.5762, + "step": 13160 + }, + { + "epoch": 2.343126780626781, + "grad_norm": 0.8757675290107727, + "learning_rate": 7.351468873096114e-05, + "loss": 0.9094, + "step": 13161 + }, + { + "epoch": 2.343304843304843, + "grad_norm": 0.7495744824409485, + "learning_rate": 7.350119138527026e-05, + "loss": 0.653, + "step": 13162 + }, + { + "epoch": 2.343482905982906, + "grad_norm": 0.8229764103889465, + "learning_rate": 7.348769455876849e-05, + "loss": 0.8146, + "step": 13163 + }, + { + "epoch": 2.3436609686609686, + "grad_norm": 0.8317791819572449, + "learning_rate": 7.347419825172029e-05, + "loss": 0.7754, + "step": 13164 + }, + { + "epoch": 2.3438390313390314, + "grad_norm": 0.8210344910621643, + "learning_rate": 7.346070246439005e-05, + "loss": 0.876, + "step": 13165 + }, + { + "epoch": 2.344017094017094, + "grad_norm": 0.7711526155471802, + "learning_rate": 7.344720719704223e-05, + "loss": 0.7426, + "step": 13166 + }, + { + "epoch": 2.344195156695157, + "grad_norm": 0.8231741189956665, + "learning_rate": 7.343371244994119e-05, + "loss": 0.8992, + "step": 13167 + }, + { + "epoch": 2.344373219373219, + "grad_norm": 0.7145521640777588, + "learning_rate": 7.342021822335143e-05, + "loss": 0.8787, + 
"step": 13168 + }, + { + "epoch": 2.344551282051282, + "grad_norm": 0.8323171734809875, + "learning_rate": 7.340672451753723e-05, + "loss": 0.7035, + "step": 13169 + }, + { + "epoch": 2.3447293447293447, + "grad_norm": 0.7061881422996521, + "learning_rate": 7.339323133276301e-05, + "loss": 0.8077, + "step": 13170 + }, + { + "epoch": 2.3449074074074074, + "grad_norm": 0.8705938458442688, + "learning_rate": 7.33797386692932e-05, + "loss": 0.8616, + "step": 13171 + }, + { + "epoch": 2.34508547008547, + "grad_norm": 0.8777729868888855, + "learning_rate": 7.336624652739208e-05, + "loss": 0.9524, + "step": 13172 + }, + { + "epoch": 2.345263532763533, + "grad_norm": 0.9099276065826416, + "learning_rate": 7.335275490732406e-05, + "loss": 0.8248, + "step": 13173 + }, + { + "epoch": 2.3454415954415953, + "grad_norm": 0.7963444590568542, + "learning_rate": 7.333926380935341e-05, + "loss": 0.78, + "step": 13174 + }, + { + "epoch": 2.345619658119658, + "grad_norm": 0.9400636553764343, + "learning_rate": 7.332577323374454e-05, + "loss": 1.0062, + "step": 13175 + }, + { + "epoch": 2.3457977207977208, + "grad_norm": 0.7794054746627808, + "learning_rate": 7.331228318076171e-05, + "loss": 0.8564, + "step": 13176 + }, + { + "epoch": 2.3459757834757835, + "grad_norm": 0.7767263054847717, + "learning_rate": 7.329879365066927e-05, + "loss": 0.8191, + "step": 13177 + }, + { + "epoch": 2.3461538461538463, + "grad_norm": 0.7170942425727844, + "learning_rate": 7.328530464373148e-05, + "loss": 0.7018, + "step": 13178 + }, + { + "epoch": 2.346331908831909, + "grad_norm": 0.8246886134147644, + "learning_rate": 7.327181616021268e-05, + "loss": 0.8498, + "step": 13179 + }, + { + "epoch": 2.3465099715099713, + "grad_norm": 0.9531362652778625, + "learning_rate": 7.325832820037711e-05, + "loss": 0.7031, + "step": 13180 + }, + { + "epoch": 2.346688034188034, + "grad_norm": 0.8561878204345703, + "learning_rate": 7.324484076448905e-05, + "loss": 0.8627, + "step": 13181 + }, + { + "epoch": 
2.346866096866097, + "grad_norm": 0.7890949845314026, + "learning_rate": 7.323135385281274e-05, + "loss": 0.7675, + "step": 13182 + }, + { + "epoch": 2.3470441595441596, + "grad_norm": 0.72523033618927, + "learning_rate": 7.321786746561246e-05, + "loss": 0.8847, + "step": 13183 + }, + { + "epoch": 2.3472222222222223, + "grad_norm": 0.7866469025611877, + "learning_rate": 7.32043816031524e-05, + "loss": 0.9657, + "step": 13184 + }, + { + "epoch": 2.347400284900285, + "grad_norm": 0.8669828176498413, + "learning_rate": 7.319089626569687e-05, + "loss": 0.9098, + "step": 13185 + }, + { + "epoch": 2.347578347578348, + "grad_norm": 0.7874458432197571, + "learning_rate": 7.317741145351e-05, + "loss": 0.9545, + "step": 13186 + }, + { + "epoch": 2.34775641025641, + "grad_norm": 0.7924689054489136, + "learning_rate": 7.316392716685604e-05, + "loss": 0.8577, + "step": 13187 + }, + { + "epoch": 2.347934472934473, + "grad_norm": 0.731119692325592, + "learning_rate": 7.315044340599918e-05, + "loss": 0.9251, + "step": 13188 + }, + { + "epoch": 2.3481125356125356, + "grad_norm": 0.914900004863739, + "learning_rate": 7.313696017120361e-05, + "loss": 0.9224, + "step": 13189 + }, + { + "epoch": 2.3482905982905984, + "grad_norm": 0.7616490125656128, + "learning_rate": 7.312347746273349e-05, + "loss": 0.7263, + "step": 13190 + }, + { + "epoch": 2.348468660968661, + "grad_norm": 0.8357210159301758, + "learning_rate": 7.310999528085301e-05, + "loss": 0.8572, + "step": 13191 + }, + { + "epoch": 2.3486467236467234, + "grad_norm": 0.8404232263565063, + "learning_rate": 7.309651362582633e-05, + "loss": 0.6822, + "step": 13192 + }, + { + "epoch": 2.348824786324786, + "grad_norm": 0.8992070555686951, + "learning_rate": 7.308303249791754e-05, + "loss": 0.91, + "step": 13193 + }, + { + "epoch": 2.349002849002849, + "grad_norm": 0.8150524497032166, + "learning_rate": 7.306955189739084e-05, + "loss": 0.984, + "step": 13194 + }, + { + "epoch": 2.3491809116809117, + "grad_norm": 0.9042861461639404, + 
"learning_rate": 7.305607182451031e-05, + "loss": 1.0111, + "step": 13195 + }, + { + "epoch": 2.3493589743589745, + "grad_norm": 0.8402968049049377, + "learning_rate": 7.30425922795401e-05, + "loss": 0.801, + "step": 13196 + }, + { + "epoch": 2.349537037037037, + "grad_norm": 0.7742997407913208, + "learning_rate": 7.302911326274428e-05, + "loss": 0.659, + "step": 13197 + }, + { + "epoch": 2.3497150997151, + "grad_norm": 0.8005271553993225, + "learning_rate": 7.301563477438698e-05, + "loss": 0.8549, + "step": 13198 + }, + { + "epoch": 2.3498931623931623, + "grad_norm": 0.8253805637359619, + "learning_rate": 7.300215681473224e-05, + "loss": 0.9049, + "step": 13199 + }, + { + "epoch": 2.350071225071225, + "grad_norm": 0.8539033532142639, + "learning_rate": 7.29886793840442e-05, + "loss": 0.9359, + "step": 13200 + }, + { + "epoch": 2.3502492877492878, + "grad_norm": 0.827608048915863, + "learning_rate": 7.297520248258681e-05, + "loss": 1.0105, + "step": 13201 + }, + { + "epoch": 2.3504273504273505, + "grad_norm": 0.8418487310409546, + "learning_rate": 7.296172611062422e-05, + "loss": 0.8138, + "step": 13202 + }, + { + "epoch": 2.3506054131054133, + "grad_norm": 0.7853255867958069, + "learning_rate": 7.294825026842042e-05, + "loss": 0.9279, + "step": 13203 + }, + { + "epoch": 2.3507834757834756, + "grad_norm": 0.8454880714416504, + "learning_rate": 7.293477495623951e-05, + "loss": 0.7687, + "step": 13204 + }, + { + "epoch": 2.3509615384615383, + "grad_norm": 0.7620453238487244, + "learning_rate": 7.29213001743454e-05, + "loss": 0.7567, + "step": 13205 + }, + { + "epoch": 2.351139601139601, + "grad_norm": 0.8993792533874512, + "learning_rate": 7.290782592300223e-05, + "loss": 0.9716, + "step": 13206 + }, + { + "epoch": 2.351317663817664, + "grad_norm": 1.1063668727874756, + "learning_rate": 7.289435220247387e-05, + "loss": 0.9763, + "step": 13207 + }, + { + "epoch": 2.3514957264957266, + "grad_norm": 0.8205364346504211, + "learning_rate": 7.288087901302439e-05, + "loss": 
0.9395, + "step": 13208 + }, + { + "epoch": 2.3516737891737893, + "grad_norm": 0.680487871170044, + "learning_rate": 7.286740635491774e-05, + "loss": 0.6252, + "step": 13209 + }, + { + "epoch": 2.351851851851852, + "grad_norm": 0.8450767397880554, + "learning_rate": 7.285393422841791e-05, + "loss": 0.8707, + "step": 13210 + }, + { + "epoch": 2.3520299145299144, + "grad_norm": 0.6871187686920166, + "learning_rate": 7.284046263378888e-05, + "loss": 0.5695, + "step": 13211 + }, + { + "epoch": 2.352207977207977, + "grad_norm": 0.7968555688858032, + "learning_rate": 7.282699157129451e-05, + "loss": 0.7014, + "step": 13212 + }, + { + "epoch": 2.35238603988604, + "grad_norm": 0.863798201084137, + "learning_rate": 7.281352104119883e-05, + "loss": 0.9241, + "step": 13213 + }, + { + "epoch": 2.3525641025641026, + "grad_norm": 0.8848825693130493, + "learning_rate": 7.28000510437657e-05, + "loss": 0.8252, + "step": 13214 + }, + { + "epoch": 2.3527421652421654, + "grad_norm": 0.7528855800628662, + "learning_rate": 7.278658157925912e-05, + "loss": 0.7428, + "step": 13215 + }, + { + "epoch": 2.3529202279202277, + "grad_norm": 0.7636159062385559, + "learning_rate": 7.277311264794288e-05, + "loss": 0.8952, + "step": 13216 + }, + { + "epoch": 2.3530982905982905, + "grad_norm": 1.0585514307022095, + "learning_rate": 7.2759644250081e-05, + "loss": 0.9153, + "step": 13217 + }, + { + "epoch": 2.353276353276353, + "grad_norm": 0.7691277265548706, + "learning_rate": 7.274617638593725e-05, + "loss": 1.057, + "step": 13218 + }, + { + "epoch": 2.353454415954416, + "grad_norm": 0.8324813842773438, + "learning_rate": 7.273270905577561e-05, + "loss": 0.9253, + "step": 13219 + }, + { + "epoch": 2.3536324786324787, + "grad_norm": 0.835491418838501, + "learning_rate": 7.271924225985984e-05, + "loss": 1.0103, + "step": 13220 + }, + { + "epoch": 2.3538105413105415, + "grad_norm": 0.8318347930908203, + "learning_rate": 7.270577599845389e-05, + "loss": 0.8896, + "step": 13221 + }, + { + "epoch": 
2.353988603988604, + "grad_norm": 0.7801460027694702, + "learning_rate": 7.269231027182153e-05, + "loss": 0.9274, + "step": 13222 + }, + { + "epoch": 2.3541666666666665, + "grad_norm": 0.8195397257804871, + "learning_rate": 7.267884508022665e-05, + "loss": 0.8126, + "step": 13223 + }, + { + "epoch": 2.3543447293447293, + "grad_norm": 0.7978246212005615, + "learning_rate": 7.2665380423933e-05, + "loss": 0.8426, + "step": 13224 + }, + { + "epoch": 2.354522792022792, + "grad_norm": 0.7614684104919434, + "learning_rate": 7.265191630320452e-05, + "loss": 0.7277, + "step": 13225 + }, + { + "epoch": 2.3547008547008548, + "grad_norm": 0.8684967756271362, + "learning_rate": 7.263845271830485e-05, + "loss": 0.7054, + "step": 13226 + }, + { + "epoch": 2.3548789173789175, + "grad_norm": 0.878842830657959, + "learning_rate": 7.262498966949791e-05, + "loss": 1.0478, + "step": 13227 + }, + { + "epoch": 2.35505698005698, + "grad_norm": 0.8321235179901123, + "learning_rate": 7.26115271570474e-05, + "loss": 0.8119, + "step": 13228 + }, + { + "epoch": 2.3552350427350426, + "grad_norm": 0.9144030213356018, + "learning_rate": 7.259806518121713e-05, + "loss": 0.8626, + "step": 13229 + }, + { + "epoch": 2.3554131054131053, + "grad_norm": 0.8437082767486572, + "learning_rate": 7.258460374227085e-05, + "loss": 0.8439, + "step": 13230 + }, + { + "epoch": 2.355591168091168, + "grad_norm": 0.8405697345733643, + "learning_rate": 7.257114284047229e-05, + "loss": 0.863, + "step": 13231 + }, + { + "epoch": 2.355769230769231, + "grad_norm": 0.8674731850624084, + "learning_rate": 7.255768247608525e-05, + "loss": 0.9823, + "step": 13232 + }, + { + "epoch": 2.3559472934472936, + "grad_norm": 0.8844531178474426, + "learning_rate": 7.254422264937337e-05, + "loss": 0.9018, + "step": 13233 + }, + { + "epoch": 2.3561253561253563, + "grad_norm": 0.8394746780395508, + "learning_rate": 7.253076336060045e-05, + "loss": 0.8407, + "step": 13234 + }, + { + "epoch": 2.3563034188034186, + "grad_norm": 
0.8759872317314148, + "learning_rate": 7.251730461003012e-05, + "loss": 0.841, + "step": 13235 + }, + { + "epoch": 2.3564814814814814, + "grad_norm": 0.7240089774131775, + "learning_rate": 7.250384639792617e-05, + "loss": 0.7918, + "step": 13236 + }, + { + "epoch": 2.356659544159544, + "grad_norm": 0.8619599342346191, + "learning_rate": 7.24903887245522e-05, + "loss": 0.761, + "step": 13237 + }, + { + "epoch": 2.356837606837607, + "grad_norm": 0.7291443943977356, + "learning_rate": 7.247693159017192e-05, + "loss": 0.8189, + "step": 13238 + }, + { + "epoch": 2.3570156695156697, + "grad_norm": 0.8006066083908081, + "learning_rate": 7.246347499504898e-05, + "loss": 0.8924, + "step": 13239 + }, + { + "epoch": 2.357193732193732, + "grad_norm": 0.7774627208709717, + "learning_rate": 7.245001893944707e-05, + "loss": 0.946, + "step": 13240 + }, + { + "epoch": 2.3573717948717947, + "grad_norm": 0.7643784284591675, + "learning_rate": 7.243656342362978e-05, + "loss": 0.8717, + "step": 13241 + }, + { + "epoch": 2.3575498575498575, + "grad_norm": 0.7197792530059814, + "learning_rate": 7.242310844786082e-05, + "loss": 0.6792, + "step": 13242 + }, + { + "epoch": 2.35772792022792, + "grad_norm": 0.9124938249588013, + "learning_rate": 7.240965401240371e-05, + "loss": 0.841, + "step": 13243 + }, + { + "epoch": 2.357905982905983, + "grad_norm": 0.7350388765335083, + "learning_rate": 7.239620011752215e-05, + "loss": 0.8294, + "step": 13244 + }, + { + "epoch": 2.3580840455840457, + "grad_norm": 0.8814936280250549, + "learning_rate": 7.238274676347967e-05, + "loss": 0.9732, + "step": 13245 + }, + { + "epoch": 2.3582621082621085, + "grad_norm": 0.8379302024841309, + "learning_rate": 7.236929395053995e-05, + "loss": 0.8896, + "step": 13246 + }, + { + "epoch": 2.3584401709401708, + "grad_norm": 0.8200546503067017, + "learning_rate": 7.235584167896648e-05, + "loss": 0.7991, + "step": 13247 + }, + { + "epoch": 2.3586182336182335, + "grad_norm": 0.7842608690261841, + "learning_rate": 
7.234238994902287e-05, + "loss": 0.695, + "step": 13248 + }, + { + "epoch": 2.3587962962962963, + "grad_norm": 0.8872218132019043, + "learning_rate": 7.232893876097266e-05, + "loss": 0.8611, + "step": 13249 + }, + { + "epoch": 2.358974358974359, + "grad_norm": 0.8358500599861145, + "learning_rate": 7.231548811507942e-05, + "loss": 0.7829, + "step": 13250 + }, + { + "epoch": 2.359152421652422, + "grad_norm": 0.8269400000572205, + "learning_rate": 7.23020380116067e-05, + "loss": 0.6904, + "step": 13251 + }, + { + "epoch": 2.359330484330484, + "grad_norm": 0.8693541288375854, + "learning_rate": 7.2288588450818e-05, + "loss": 0.8659, + "step": 13252 + }, + { + "epoch": 2.359508547008547, + "grad_norm": 0.858076810836792, + "learning_rate": 7.227513943297688e-05, + "loss": 0.8824, + "step": 13253 + }, + { + "epoch": 2.3596866096866096, + "grad_norm": 0.796541154384613, + "learning_rate": 7.226169095834675e-05, + "loss": 0.8999, + "step": 13254 + }, + { + "epoch": 2.3598646723646723, + "grad_norm": 0.7692779898643494, + "learning_rate": 7.22482430271912e-05, + "loss": 0.9492, + "step": 13255 + }, + { + "epoch": 2.360042735042735, + "grad_norm": 0.9259434342384338, + "learning_rate": 7.223479563977364e-05, + "loss": 0.9115, + "step": 13256 + }, + { + "epoch": 2.360220797720798, + "grad_norm": 0.9048989415168762, + "learning_rate": 7.222134879635764e-05, + "loss": 0.8057, + "step": 13257 + }, + { + "epoch": 2.3603988603988606, + "grad_norm": 0.9342616200447083, + "learning_rate": 7.220790249720656e-05, + "loss": 0.8554, + "step": 13258 + }, + { + "epoch": 2.360576923076923, + "grad_norm": 0.7747787237167358, + "learning_rate": 7.219445674258392e-05, + "loss": 0.7555, + "step": 13259 + }, + { + "epoch": 2.3607549857549857, + "grad_norm": 0.805437445640564, + "learning_rate": 7.218101153275311e-05, + "loss": 0.7442, + "step": 13260 + }, + { + "epoch": 2.3609330484330484, + "grad_norm": 0.9797805547714233, + "learning_rate": 7.216756686797764e-05, + "loss": 1.0975, + "step": 
13261 + }, + { + "epoch": 2.361111111111111, + "grad_norm": 0.7361458539962769, + "learning_rate": 7.215412274852083e-05, + "loss": 0.6597, + "step": 13262 + }, + { + "epoch": 2.361289173789174, + "grad_norm": 0.8041569590568542, + "learning_rate": 7.21406791746462e-05, + "loss": 0.8343, + "step": 13263 + }, + { + "epoch": 2.361467236467236, + "grad_norm": 0.8364384770393372, + "learning_rate": 7.212723614661703e-05, + "loss": 0.9486, + "step": 13264 + }, + { + "epoch": 2.361645299145299, + "grad_norm": 0.714241623878479, + "learning_rate": 7.21137936646968e-05, + "loss": 0.5978, + "step": 13265 + }, + { + "epoch": 2.3618233618233617, + "grad_norm": 0.8830710053443909, + "learning_rate": 7.210035172914882e-05, + "loss": 0.9584, + "step": 13266 + }, + { + "epoch": 2.3620014245014245, + "grad_norm": 0.714112401008606, + "learning_rate": 7.208691034023653e-05, + "loss": 0.8878, + "step": 13267 + }, + { + "epoch": 2.3621794871794872, + "grad_norm": 0.7654083371162415, + "learning_rate": 7.207346949822322e-05, + "loss": 0.822, + "step": 13268 + }, + { + "epoch": 2.36235754985755, + "grad_norm": 0.772693395614624, + "learning_rate": 7.206002920337225e-05, + "loss": 0.7993, + "step": 13269 + }, + { + "epoch": 2.3625356125356127, + "grad_norm": 0.9678596258163452, + "learning_rate": 7.2046589455947e-05, + "loss": 0.948, + "step": 13270 + }, + { + "epoch": 2.362713675213675, + "grad_norm": 0.8254278302192688, + "learning_rate": 7.203315025621073e-05, + "loss": 0.8654, + "step": 13271 + }, + { + "epoch": 2.362891737891738, + "grad_norm": 0.7527315020561218, + "learning_rate": 7.201971160442685e-05, + "loss": 0.6881, + "step": 13272 + }, + { + "epoch": 2.3630698005698005, + "grad_norm": 0.7658267021179199, + "learning_rate": 7.200627350085853e-05, + "loss": 0.7332, + "step": 13273 + }, + { + "epoch": 2.3632478632478633, + "grad_norm": 0.8590806126594543, + "learning_rate": 7.199283594576916e-05, + "loss": 0.879, + "step": 13274 + }, + { + "epoch": 2.363425925925926, + 
"grad_norm": 0.7533347606658936, + "learning_rate": 7.197939893942197e-05, + "loss": 0.8738, + "step": 13275 + }, + { + "epoch": 2.363603988603989, + "grad_norm": Infinity, + "learning_rate": 7.197939893942197e-05, + "loss": 0.7641, + "step": 13276 + }, + { + "epoch": 2.363782051282051, + "grad_norm": 0.6873685121536255, + "learning_rate": 7.196596248208029e-05, + "loss": 0.4708, + "step": 13277 + }, + { + "epoch": 2.363960113960114, + "grad_norm": 0.7659112215042114, + "learning_rate": 7.195252657400729e-05, + "loss": 0.839, + "step": 13278 + }, + { + "epoch": 2.3641381766381766, + "grad_norm": 0.8355028629302979, + "learning_rate": 7.193909121546631e-05, + "loss": 0.9792, + "step": 13279 + }, + { + "epoch": 2.3643162393162394, + "grad_norm": 0.9633997678756714, + "learning_rate": 7.192565640672052e-05, + "loss": 0.9891, + "step": 13280 + }, + { + "epoch": 2.364494301994302, + "grad_norm": 0.7984298467636108, + "learning_rate": 7.191222214803318e-05, + "loss": 0.8343, + "step": 13281 + }, + { + "epoch": 2.364672364672365, + "grad_norm": 0.8239994645118713, + "learning_rate": 7.189878843966749e-05, + "loss": 0.8586, + "step": 13282 + }, + { + "epoch": 2.364850427350427, + "grad_norm": 0.8695420026779175, + "learning_rate": 7.188535528188671e-05, + "loss": 0.9161, + "step": 13283 + }, + { + "epoch": 2.36502849002849, + "grad_norm": 0.8272924423217773, + "learning_rate": 7.187192267495393e-05, + "loss": 0.8158, + "step": 13284 + }, + { + "epoch": 2.3652065527065527, + "grad_norm": 0.8217222690582275, + "learning_rate": 7.185849061913243e-05, + "loss": 0.892, + "step": 13285 + }, + { + "epoch": 2.3653846153846154, + "grad_norm": 0.9041243195533752, + "learning_rate": 7.184505911468532e-05, + "loss": 0.9093, + "step": 13286 + }, + { + "epoch": 2.365562678062678, + "grad_norm": 0.8325521349906921, + "learning_rate": 7.183162816187582e-05, + "loss": 0.7546, + "step": 13287 + }, + { + "epoch": 2.365740740740741, + "grad_norm": 0.9160267114639282, + "learning_rate": 
7.181819776096704e-05, + "loss": 0.9662, + "step": 13288 + }, + { + "epoch": 2.3659188034188032, + "grad_norm": 0.8771381974220276, + "learning_rate": 7.180476791222215e-05, + "loss": 1.0083, + "step": 13289 + }, + { + "epoch": 2.366096866096866, + "grad_norm": 0.8251327872276306, + "learning_rate": 7.179133861590421e-05, + "loss": 0.8209, + "step": 13290 + }, + { + "epoch": 2.3662749287749287, + "grad_norm": 0.8760706186294556, + "learning_rate": 7.177790987227641e-05, + "loss": 0.7479, + "step": 13291 + }, + { + "epoch": 2.3664529914529915, + "grad_norm": 0.7857288122177124, + "learning_rate": 7.176448168160187e-05, + "loss": 0.6511, + "step": 13292 + }, + { + "epoch": 2.3666310541310542, + "grad_norm": 0.9548102021217346, + "learning_rate": 7.175105404414362e-05, + "loss": 0.731, + "step": 13293 + }, + { + "epoch": 2.366809116809117, + "grad_norm": 0.7604304552078247, + "learning_rate": 7.173762696016484e-05, + "loss": 0.8212, + "step": 13294 + }, + { + "epoch": 2.3669871794871793, + "grad_norm": 0.9121061563491821, + "learning_rate": 7.172420042992849e-05, + "loss": 0.939, + "step": 13295 + }, + { + "epoch": 2.367165242165242, + "grad_norm": 0.8128613233566284, + "learning_rate": 7.171077445369772e-05, + "loss": 0.8908, + "step": 13296 + }, + { + "epoch": 2.367343304843305, + "grad_norm": 0.9184401035308838, + "learning_rate": 7.169734903173555e-05, + "loss": 0.957, + "step": 13297 + }, + { + "epoch": 2.3675213675213675, + "grad_norm": 0.9234427809715271, + "learning_rate": 7.168392416430507e-05, + "loss": 0.8403, + "step": 13298 + }, + { + "epoch": 2.3676994301994303, + "grad_norm": 0.8810806274414062, + "learning_rate": 7.167049985166922e-05, + "loss": 0.9754, + "step": 13299 + }, + { + "epoch": 2.367877492877493, + "grad_norm": 0.8208937048912048, + "learning_rate": 7.165707609409113e-05, + "loss": 0.9418, + "step": 13300 + }, + { + "epoch": 2.3680555555555554, + "grad_norm": 0.8666219711303711, + "learning_rate": 7.164365289183371e-05, + "loss": 0.8936, + 
"step": 13301 + }, + { + "epoch": 2.368233618233618, + "grad_norm": 0.9385154843330383, + "learning_rate": 7.163023024516002e-05, + "loss": 0.8158, + "step": 13302 + }, + { + "epoch": 2.368411680911681, + "grad_norm": 1.0415911674499512, + "learning_rate": 7.161680815433303e-05, + "loss": 1.0445, + "step": 13303 + }, + { + "epoch": 2.3685897435897436, + "grad_norm": 0.6882192492485046, + "learning_rate": 7.160338661961577e-05, + "loss": 0.4929, + "step": 13304 + }, + { + "epoch": 2.3687678062678064, + "grad_norm": 0.8695144653320312, + "learning_rate": 7.15899656412711e-05, + "loss": 0.8991, + "step": 13305 + }, + { + "epoch": 2.368945868945869, + "grad_norm": 0.8973569273948669, + "learning_rate": 7.157654521956206e-05, + "loss": 0.8423, + "step": 13306 + }, + { + "epoch": 2.369123931623932, + "grad_norm": 0.7656881213188171, + "learning_rate": 7.156312535475155e-05, + "loss": 0.7351, + "step": 13307 + }, + { + "epoch": 2.369301994301994, + "grad_norm": 0.8023402690887451, + "learning_rate": 7.154970604710258e-05, + "loss": 0.9943, + "step": 13308 + }, + { + "epoch": 2.369480056980057, + "grad_norm": 0.916946530342102, + "learning_rate": 7.153628729687797e-05, + "loss": 0.8649, + "step": 13309 + }, + { + "epoch": 2.3696581196581197, + "grad_norm": 0.8764750361442566, + "learning_rate": 7.152286910434068e-05, + "loss": 0.9799, + "step": 13310 + }, + { + "epoch": 2.3698361823361824, + "grad_norm": 0.8732671737670898, + "learning_rate": 7.150945146975364e-05, + "loss": 1.0431, + "step": 13311 + }, + { + "epoch": 2.370014245014245, + "grad_norm": 0.8447144031524658, + "learning_rate": 7.149603439337969e-05, + "loss": 0.7805, + "step": 13312 + }, + { + "epoch": 2.3701923076923075, + "grad_norm": 0.9017399549484253, + "learning_rate": 7.148261787548178e-05, + "loss": 0.8102, + "step": 13313 + }, + { + "epoch": 2.3703703703703702, + "grad_norm": 0.7187124490737915, + "learning_rate": 7.14692019163227e-05, + "loss": 0.7327, + "step": 13314 + }, + { + "epoch": 
2.370548433048433, + "grad_norm": 0.8579949736595154, + "learning_rate": 7.145578651616536e-05, + "loss": 0.8685, + "step": 13315 + }, + { + "epoch": 2.3707264957264957, + "grad_norm": 0.6088887453079224, + "learning_rate": 7.144237167527256e-05, + "loss": 0.7004, + "step": 13316 + }, + { + "epoch": 2.3709045584045585, + "grad_norm": 0.6400231719017029, + "learning_rate": 7.142895739390718e-05, + "loss": 0.7273, + "step": 13317 + }, + { + "epoch": 2.3710826210826212, + "grad_norm": 0.8680049180984497, + "learning_rate": 7.141554367233201e-05, + "loss": 0.7886, + "step": 13318 + }, + { + "epoch": 2.371260683760684, + "grad_norm": 0.8894832134246826, + "learning_rate": 7.140213051080991e-05, + "loss": 1.0597, + "step": 13319 + }, + { + "epoch": 2.3714387464387463, + "grad_norm": 0.7371698021888733, + "learning_rate": 7.138871790960365e-05, + "loss": 0.8344, + "step": 13320 + }, + { + "epoch": 2.371616809116809, + "grad_norm": 0.7396906018257141, + "learning_rate": 7.137530586897601e-05, + "loss": 0.7185, + "step": 13321 + }, + { + "epoch": 2.371794871794872, + "grad_norm": 0.7884365320205688, + "learning_rate": 7.136189438918978e-05, + "loss": 0.8311, + "step": 13322 + }, + { + "epoch": 2.3719729344729346, + "grad_norm": 0.8064826130867004, + "learning_rate": 7.13484834705078e-05, + "loss": 0.6933, + "step": 13323 + }, + { + "epoch": 2.3721509971509973, + "grad_norm": 0.8865584135055542, + "learning_rate": 7.13350731131927e-05, + "loss": 0.979, + "step": 13324 + }, + { + "epoch": 2.3723290598290596, + "grad_norm": 0.7782325148582458, + "learning_rate": 7.132166331750736e-05, + "loss": 0.8147, + "step": 13325 + }, + { + "epoch": 2.3725071225071224, + "grad_norm": 0.8515480160713196, + "learning_rate": 7.13082540837144e-05, + "loss": 0.7571, + "step": 13326 + }, + { + "epoch": 2.372685185185185, + "grad_norm": 0.8665108680725098, + "learning_rate": 7.129484541207662e-05, + "loss": 0.8171, + "step": 13327 + }, + { + "epoch": 2.372863247863248, + "grad_norm": 
0.7640653252601624, + "learning_rate": 7.128143730285668e-05, + "loss": 0.7118, + "step": 13328 + }, + { + "epoch": 2.3730413105413106, + "grad_norm": 0.844083309173584, + "learning_rate": 7.126802975631735e-05, + "loss": 0.8394, + "step": 13329 + }, + { + "epoch": 2.3732193732193734, + "grad_norm": 0.8718371391296387, + "learning_rate": 7.12546227727213e-05, + "loss": 0.8729, + "step": 13330 + }, + { + "epoch": 2.373397435897436, + "grad_norm": 0.7254782319068909, + "learning_rate": 7.124121635233118e-05, + "loss": 0.8178, + "step": 13331 + }, + { + "epoch": 2.3735754985754984, + "grad_norm": 0.7211804389953613, + "learning_rate": 7.12278104954097e-05, + "loss": 0.9415, + "step": 13332 + }, + { + "epoch": 2.373753561253561, + "grad_norm": 0.8538317680358887, + "learning_rate": 7.121440520221949e-05, + "loss": 0.8614, + "step": 13333 + }, + { + "epoch": 2.373931623931624, + "grad_norm": 0.8942680358886719, + "learning_rate": 7.120100047302324e-05, + "loss": 0.985, + "step": 13334 + }, + { + "epoch": 2.3741096866096867, + "grad_norm": 0.8282434344291687, + "learning_rate": 7.118759630808354e-05, + "loss": 0.94, + "step": 13335 + }, + { + "epoch": 2.3742877492877494, + "grad_norm": 0.8036409616470337, + "learning_rate": 7.117419270766308e-05, + "loss": 0.7145, + "step": 13336 + }, + { + "epoch": 2.3744658119658117, + "grad_norm": 0.9169675707817078, + "learning_rate": 7.116078967202437e-05, + "loss": 1.1078, + "step": 13337 + }, + { + "epoch": 2.3746438746438745, + "grad_norm": 0.7805418372154236, + "learning_rate": 7.114738720143011e-05, + "loss": 0.8216, + "step": 13338 + }, + { + "epoch": 2.3748219373219372, + "grad_norm": 1.0444506406784058, + "learning_rate": 7.113398529614285e-05, + "loss": 0.8153, + "step": 13339 + }, + { + "epoch": 2.375, + "grad_norm": 0.8254665732383728, + "learning_rate": 7.112058395642522e-05, + "loss": 0.8127, + "step": 13340 + }, + { + "epoch": 2.3751780626780628, + "grad_norm": 0.8327687382698059, + "learning_rate": 
7.11071831825397e-05, + "loss": 0.7014, + "step": 13341 + }, + { + "epoch": 2.3753561253561255, + "grad_norm": 0.7473437786102295, + "learning_rate": 7.109378297474894e-05, + "loss": 0.7621, + "step": 13342 + }, + { + "epoch": 2.3755341880341883, + "grad_norm": 0.8537931442260742, + "learning_rate": 7.108038333331544e-05, + "loss": 0.9302, + "step": 13343 + }, + { + "epoch": 2.3757122507122506, + "grad_norm": 0.81959468126297, + "learning_rate": 7.106698425850178e-05, + "loss": 0.9157, + "step": 13344 + }, + { + "epoch": 2.3758903133903133, + "grad_norm": 0.769257128238678, + "learning_rate": 7.105358575057043e-05, + "loss": 0.8739, + "step": 13345 + }, + { + "epoch": 2.376068376068376, + "grad_norm": 0.7428072690963745, + "learning_rate": 7.104018780978394e-05, + "loss": 0.7001, + "step": 13346 + }, + { + "epoch": 2.376246438746439, + "grad_norm": 0.8152543306350708, + "learning_rate": 7.102679043640481e-05, + "loss": 0.9866, + "step": 13347 + }, + { + "epoch": 2.3764245014245016, + "grad_norm": 0.8732424974441528, + "learning_rate": 7.101339363069556e-05, + "loss": 1.0207, + "step": 13348 + }, + { + "epoch": 2.376602564102564, + "grad_norm": 0.759279191493988, + "learning_rate": 7.099999739291862e-05, + "loss": 0.8703, + "step": 13349 + }, + { + "epoch": 2.3767806267806266, + "grad_norm": 0.8751664161682129, + "learning_rate": 7.098660172333648e-05, + "loss": 0.9805, + "step": 13350 + }, + { + "epoch": 2.3769586894586894, + "grad_norm": 0.9646390080451965, + "learning_rate": 7.097320662221168e-05, + "loss": 0.8623, + "step": 13351 + }, + { + "epoch": 2.377136752136752, + "grad_norm": 0.8626869320869446, + "learning_rate": 7.095981208980652e-05, + "loss": 0.7175, + "step": 13352 + }, + { + "epoch": 2.377314814814815, + "grad_norm": 0.8075738549232483, + "learning_rate": 7.094641812638354e-05, + "loss": 0.7741, + "step": 13353 + }, + { + "epoch": 2.3774928774928776, + "grad_norm": 0.7733559608459473, + "learning_rate": 7.093302473220513e-05, + "loss": 0.8553, + 
"step": 13354 + }, + { + "epoch": 2.3776709401709404, + "grad_norm": 0.7372797727584839, + "learning_rate": 7.091963190753376e-05, + "loss": 0.8554, + "step": 13355 + }, + { + "epoch": 2.3778490028490027, + "grad_norm": 0.804649293422699, + "learning_rate": 7.090623965263177e-05, + "loss": 0.8704, + "step": 13356 + }, + { + "epoch": 2.3780270655270654, + "grad_norm": 0.8370727300643921, + "learning_rate": 7.089284796776157e-05, + "loss": 0.9786, + "step": 13357 + }, + { + "epoch": 2.378205128205128, + "grad_norm": 0.7565299272537231, + "learning_rate": 7.087945685318554e-05, + "loss": 0.8096, + "step": 13358 + }, + { + "epoch": 2.378383190883191, + "grad_norm": 0.9046086072921753, + "learning_rate": 7.086606630916611e-05, + "loss": 0.8108, + "step": 13359 + }, + { + "epoch": 2.3785612535612537, + "grad_norm": 0.8453067541122437, + "learning_rate": 7.085267633596552e-05, + "loss": 0.8226, + "step": 13360 + }, + { + "epoch": 2.378739316239316, + "grad_norm": 0.8499273061752319, + "learning_rate": 7.083928693384628e-05, + "loss": 1.001, + "step": 13361 + }, + { + "epoch": 2.3789173789173788, + "grad_norm": 0.8358726501464844, + "learning_rate": 7.082589810307055e-05, + "loss": 0.7891, + "step": 13362 + }, + { + "epoch": 2.3790954415954415, + "grad_norm": 0.9156573414802551, + "learning_rate": 7.081250984390078e-05, + "loss": 0.9381, + "step": 13363 + }, + { + "epoch": 2.3792735042735043, + "grad_norm": 0.8704338669776917, + "learning_rate": 7.079912215659923e-05, + "loss": 0.9004, + "step": 13364 + }, + { + "epoch": 2.379451566951567, + "grad_norm": 0.8201949000358582, + "learning_rate": 7.078573504142824e-05, + "loss": 0.7501, + "step": 13365 + }, + { + "epoch": 2.3796296296296298, + "grad_norm": 0.9453420639038086, + "learning_rate": 7.077234849865008e-05, + "loss": 0.9658, + "step": 13366 + }, + { + "epoch": 2.3798076923076925, + "grad_norm": 0.8556796908378601, + "learning_rate": 7.075896252852703e-05, + "loss": 0.8054, + "step": 13367 + }, + { + "epoch": 
2.379985754985755, + "grad_norm": 0.7961027026176453, + "learning_rate": 7.074557713132136e-05, + "loss": 0.8065, + "step": 13368 + }, + { + "epoch": 2.3801638176638176, + "grad_norm": 0.8777903318405151, + "learning_rate": 7.073219230729533e-05, + "loss": 0.9399, + "step": 13369 + }, + { + "epoch": 2.3803418803418803, + "grad_norm": 0.8569813370704651, + "learning_rate": 7.071880805671123e-05, + "loss": 0.9424, + "step": 13370 + }, + { + "epoch": 2.380519943019943, + "grad_norm": 0.8810455203056335, + "learning_rate": 7.070542437983123e-05, + "loss": 1.1313, + "step": 13371 + }, + { + "epoch": 2.380698005698006, + "grad_norm": 0.8691363334655762, + "learning_rate": 7.069204127691761e-05, + "loss": 0.9114, + "step": 13372 + }, + { + "epoch": 2.380876068376068, + "grad_norm": 0.7922945618629456, + "learning_rate": 7.067865874823253e-05, + "loss": 0.9158, + "step": 13373 + }, + { + "epoch": 2.381054131054131, + "grad_norm": 0.7465389370918274, + "learning_rate": 7.066527679403825e-05, + "loss": 0.6597, + "step": 13374 + }, + { + "epoch": 2.3812321937321936, + "grad_norm": 0.8386009931564331, + "learning_rate": 7.065189541459689e-05, + "loss": 0.7194, + "step": 13375 + }, + { + "epoch": 2.3814102564102564, + "grad_norm": 0.8633689880371094, + "learning_rate": 7.063851461017073e-05, + "loss": 0.8877, + "step": 13376 + }, + { + "epoch": 2.381588319088319, + "grad_norm": 0.8689528107643127, + "learning_rate": 7.062513438102184e-05, + "loss": 0.8384, + "step": 13377 + }, + { + "epoch": 2.381766381766382, + "grad_norm": 0.7648544311523438, + "learning_rate": 7.061175472741243e-05, + "loss": 0.7669, + "step": 13378 + }, + { + "epoch": 2.3819444444444446, + "grad_norm": 0.8502510786056519, + "learning_rate": 7.059837564960465e-05, + "loss": 0.9379, + "step": 13379 + }, + { + "epoch": 2.382122507122507, + "grad_norm": 0.8277843594551086, + "learning_rate": 7.058499714786063e-05, + "loss": 0.7372, + "step": 13380 + }, + { + "epoch": 2.3823005698005697, + "grad_norm": 
0.7394976615905762, + "learning_rate": 7.057161922244246e-05, + "loss": 0.7628, + "step": 13381 + }, + { + "epoch": 2.3824786324786325, + "grad_norm": 0.7906123399734497, + "learning_rate": 7.05582418736123e-05, + "loss": 0.7645, + "step": 13382 + }, + { + "epoch": 2.382656695156695, + "grad_norm": 0.7889885902404785, + "learning_rate": 7.054486510163221e-05, + "loss": 0.8316, + "step": 13383 + }, + { + "epoch": 2.382834757834758, + "grad_norm": 0.7983359098434448, + "learning_rate": 7.053148890676434e-05, + "loss": 0.7925, + "step": 13384 + }, + { + "epoch": 2.3830128205128207, + "grad_norm": 0.9067932963371277, + "learning_rate": 7.051811328927067e-05, + "loss": 0.9385, + "step": 13385 + }, + { + "epoch": 2.383190883190883, + "grad_norm": 0.7210679650306702, + "learning_rate": 7.05047382494134e-05, + "loss": 0.591, + "step": 13386 + }, + { + "epoch": 2.3833689458689458, + "grad_norm": 0.9977821707725525, + "learning_rate": 7.049136378745445e-05, + "loss": 0.8362, + "step": 13387 + }, + { + "epoch": 2.3835470085470085, + "grad_norm": 0.9260198473930359, + "learning_rate": 7.047798990365595e-05, + "loss": 1.0051, + "step": 13388 + }, + { + "epoch": 2.3837250712250713, + "grad_norm": 0.8903454542160034, + "learning_rate": 7.04646165982799e-05, + "loss": 0.7055, + "step": 13389 + }, + { + "epoch": 2.383903133903134, + "grad_norm": 0.9634504914283752, + "learning_rate": 7.045124387158832e-05, + "loss": 0.7681, + "step": 13390 + }, + { + "epoch": 2.3840811965811968, + "grad_norm": 0.8645864129066467, + "learning_rate": 7.043787172384329e-05, + "loss": 0.9271, + "step": 13391 + }, + { + "epoch": 2.384259259259259, + "grad_norm": 0.8738446235656738, + "learning_rate": 7.04245001553067e-05, + "loss": 0.9, + "step": 13392 + }, + { + "epoch": 2.384437321937322, + "grad_norm": 0.7869822382926941, + "learning_rate": 7.041112916624062e-05, + "loss": 0.8639, + "step": 13393 + }, + { + "epoch": 2.3846153846153846, + "grad_norm": 0.8728111386299133, + "learning_rate": 
7.039775875690698e-05, + "loss": 1.0367, + "step": 13394 + }, + { + "epoch": 2.3847934472934473, + "grad_norm": 0.7883852124214172, + "learning_rate": 7.03843889275678e-05, + "loss": 0.8338, + "step": 13395 + }, + { + "epoch": 2.38497150997151, + "grad_norm": 0.9267113208770752, + "learning_rate": 7.037101967848496e-05, + "loss": 0.8931, + "step": 13396 + }, + { + "epoch": 2.385149572649573, + "grad_norm": 0.8940320611000061, + "learning_rate": 7.035765100992048e-05, + "loss": 0.8071, + "step": 13397 + }, + { + "epoch": 2.385327635327635, + "grad_norm": 0.8109263777732849, + "learning_rate": 7.03442829221362e-05, + "loss": 0.8083, + "step": 13398 + }, + { + "epoch": 2.385505698005698, + "grad_norm": 0.8223438262939453, + "learning_rate": 7.033091541539413e-05, + "loss": 0.9296, + "step": 13399 + }, + { + "epoch": 2.3856837606837606, + "grad_norm": 0.817894697189331, + "learning_rate": 7.031754848995612e-05, + "loss": 0.9168, + "step": 13400 + }, + { + "epoch": 2.3858618233618234, + "grad_norm": 0.831462562084198, + "learning_rate": 7.030418214608411e-05, + "loss": 0.8613, + "step": 13401 + }, + { + "epoch": 2.386039886039886, + "grad_norm": 0.8388770818710327, + "learning_rate": 7.029081638403994e-05, + "loss": 0.7477, + "step": 13402 + }, + { + "epoch": 2.386217948717949, + "grad_norm": 0.9557843804359436, + "learning_rate": 7.02774512040855e-05, + "loss": 0.8932, + "step": 13403 + }, + { + "epoch": 2.386396011396011, + "grad_norm": 0.8249707221984863, + "learning_rate": 7.026408660648268e-05, + "loss": 1.0301, + "step": 13404 + }, + { + "epoch": 2.386574074074074, + "grad_norm": 0.8355069160461426, + "learning_rate": 7.025072259149333e-05, + "loss": 0.8081, + "step": 13405 + }, + { + "epoch": 2.3867521367521367, + "grad_norm": 0.8373300433158875, + "learning_rate": 7.023735915937924e-05, + "loss": 0.9911, + "step": 13406 + }, + { + "epoch": 2.3869301994301995, + "grad_norm": 0.7177539467811584, + "learning_rate": 7.022399631040228e-05, + "loss": 0.6397, + "step": 
13407 + }, + { + "epoch": 2.387108262108262, + "grad_norm": 0.7371904253959656, + "learning_rate": 7.021063404482426e-05, + "loss": 0.8634, + "step": 13408 + }, + { + "epoch": 2.387286324786325, + "grad_norm": 0.8919385671615601, + "learning_rate": 7.019727236290696e-05, + "loss": 0.9514, + "step": 13409 + }, + { + "epoch": 2.3874643874643873, + "grad_norm": 0.7673050761222839, + "learning_rate": 7.018391126491225e-05, + "loss": 0.8957, + "step": 13410 + }, + { + "epoch": 2.38764245014245, + "grad_norm": 0.8401889801025391, + "learning_rate": 7.01705507511018e-05, + "loss": 0.908, + "step": 13411 + }, + { + "epoch": 2.3878205128205128, + "grad_norm": 0.822903037071228, + "learning_rate": 7.01571908217375e-05, + "loss": 0.911, + "step": 13412 + }, + { + "epoch": 2.3879985754985755, + "grad_norm": 0.9824740290641785, + "learning_rate": 7.014383147708102e-05, + "loss": 0.8314, + "step": 13413 + }, + { + "epoch": 2.3881766381766383, + "grad_norm": 0.9485064148902893, + "learning_rate": 7.013047271739414e-05, + "loss": 0.9819, + "step": 13414 + }, + { + "epoch": 2.388354700854701, + "grad_norm": 0.7565387487411499, + "learning_rate": 7.01171145429386e-05, + "loss": 0.9702, + "step": 13415 + }, + { + "epoch": 2.388532763532764, + "grad_norm": 0.8159620761871338, + "learning_rate": 7.010375695397615e-05, + "loss": 0.7302, + "step": 13416 + }, + { + "epoch": 2.388710826210826, + "grad_norm": 0.7818536162376404, + "learning_rate": 7.009039995076844e-05, + "loss": 0.6821, + "step": 13417 + }, + { + "epoch": 2.388888888888889, + "grad_norm": 0.7958348989486694, + "learning_rate": 7.007704353357724e-05, + "loss": 0.7996, + "step": 13418 + }, + { + "epoch": 2.3890669515669516, + "grad_norm": 0.8097305297851562, + "learning_rate": 7.006368770266421e-05, + "loss": 0.8177, + "step": 13419 + }, + { + "epoch": 2.3892450142450143, + "grad_norm": 0.9326507449150085, + "learning_rate": 7.005033245829105e-05, + "loss": 0.9307, + "step": 13420 + }, + { + "epoch": 2.389423076923077, + 
"grad_norm": 0.8954049944877625, + "learning_rate": 7.003697780071936e-05, + "loss": 0.8527, + "step": 13421 + }, + { + "epoch": 2.3896011396011394, + "grad_norm": 0.890548586845398, + "learning_rate": 7.00236237302109e-05, + "loss": 0.8203, + "step": 13422 + }, + { + "epoch": 2.389779202279202, + "grad_norm": 0.7508596181869507, + "learning_rate": 7.001027024702722e-05, + "loss": 0.7056, + "step": 13423 + }, + { + "epoch": 2.389957264957265, + "grad_norm": 0.9403550624847412, + "learning_rate": 6.999691735143002e-05, + "loss": 0.7336, + "step": 13424 + }, + { + "epoch": 2.3901353276353277, + "grad_norm": 0.8187662959098816, + "learning_rate": 6.998356504368087e-05, + "loss": 0.6897, + "step": 13425 + }, + { + "epoch": 2.3903133903133904, + "grad_norm": 0.8584417104721069, + "learning_rate": 6.997021332404145e-05, + "loss": 0.9143, + "step": 13426 + }, + { + "epoch": 2.390491452991453, + "grad_norm": 0.8739892840385437, + "learning_rate": 6.995686219277329e-05, + "loss": 0.8028, + "step": 13427 + }, + { + "epoch": 2.390669515669516, + "grad_norm": 0.9291013479232788, + "learning_rate": 6.994351165013799e-05, + "loss": 1.0305, + "step": 13428 + }, + { + "epoch": 2.390847578347578, + "grad_norm": 0.7937391400337219, + "learning_rate": 6.993016169639719e-05, + "loss": 0.8326, + "step": 13429 + }, + { + "epoch": 2.391025641025641, + "grad_norm": 0.655261754989624, + "learning_rate": 6.991681233181236e-05, + "loss": 0.7939, + "step": 13430 + }, + { + "epoch": 2.3912037037037037, + "grad_norm": 0.9606142640113831, + "learning_rate": 6.990346355664515e-05, + "loss": 1.1344, + "step": 13431 + }, + { + "epoch": 2.3913817663817665, + "grad_norm": 0.8111617565155029, + "learning_rate": 6.9890115371157e-05, + "loss": 0.8398, + "step": 13432 + }, + { + "epoch": 2.3915598290598292, + "grad_norm": 0.8111898899078369, + "learning_rate": 6.987676777560955e-05, + "loss": 0.9189, + "step": 13433 + }, + { + "epoch": 2.3917378917378915, + "grad_norm": 0.7850473523139954, + 
"learning_rate": 6.98634207702642e-05, + "loss": 0.9563, + "step": 13434 + }, + { + "epoch": 2.3919159544159543, + "grad_norm": 0.7740257978439331, + "learning_rate": 6.985007435538256e-05, + "loss": 0.7446, + "step": 13435 + }, + { + "epoch": 2.392094017094017, + "grad_norm": 0.9354606866836548, + "learning_rate": 6.983672853122604e-05, + "loss": 0.879, + "step": 13436 + }, + { + "epoch": 2.39227207977208, + "grad_norm": 0.8909385800361633, + "learning_rate": 6.982338329805622e-05, + "loss": 0.9381, + "step": 13437 + }, + { + "epoch": 2.3924501424501425, + "grad_norm": 0.7748416066169739, + "learning_rate": 6.981003865613448e-05, + "loss": 0.7169, + "step": 13438 + }, + { + "epoch": 2.3926282051282053, + "grad_norm": 0.7357833981513977, + "learning_rate": 6.979669460572234e-05, + "loss": 0.669, + "step": 13439 + }, + { + "epoch": 2.392806267806268, + "grad_norm": 0.8370460271835327, + "learning_rate": 6.978335114708119e-05, + "loss": 0.6215, + "step": 13440 + }, + { + "epoch": 2.3929843304843303, + "grad_norm": 0.7578476071357727, + "learning_rate": 6.977000828047256e-05, + "loss": 0.871, + "step": 13441 + }, + { + "epoch": 2.393162393162393, + "grad_norm": 0.8111903071403503, + "learning_rate": 6.975666600615776e-05, + "loss": 0.7888, + "step": 13442 + }, + { + "epoch": 2.393340455840456, + "grad_norm": 0.9584433436393738, + "learning_rate": 6.974332432439831e-05, + "loss": 1.0011, + "step": 13443 + }, + { + "epoch": 2.3935185185185186, + "grad_norm": 0.9105294942855835, + "learning_rate": 6.972998323545555e-05, + "loss": 1.0832, + "step": 13444 + }, + { + "epoch": 2.3936965811965814, + "grad_norm": 0.7990328669548035, + "learning_rate": 6.971664273959089e-05, + "loss": 0.9561, + "step": 13445 + }, + { + "epoch": 2.3938746438746437, + "grad_norm": 0.8575631976127625, + "learning_rate": 6.970330283706569e-05, + "loss": 0.7965, + "step": 13446 + }, + { + "epoch": 2.3940527065527064, + "grad_norm": 0.8147784471511841, + "learning_rate": 6.968996352814139e-05, + 
"loss": 0.806, + "step": 13447 + }, + { + "epoch": 2.394230769230769, + "grad_norm": 0.8284323215484619, + "learning_rate": 6.967662481307923e-05, + "loss": 0.942, + "step": 13448 + }, + { + "epoch": 2.394408831908832, + "grad_norm": 0.8238104581832886, + "learning_rate": 6.966328669214062e-05, + "loss": 0.9163, + "step": 13449 + }, + { + "epoch": 2.3945868945868947, + "grad_norm": 0.8855763673782349, + "learning_rate": 6.964994916558692e-05, + "loss": 0.7683, + "step": 13450 + }, + { + "epoch": 2.3947649572649574, + "grad_norm": 1.02780020236969, + "learning_rate": 6.963661223367937e-05, + "loss": 0.904, + "step": 13451 + }, + { + "epoch": 2.39494301994302, + "grad_norm": 0.8001773953437805, + "learning_rate": 6.96232758966794e-05, + "loss": 0.8459, + "step": 13452 + }, + { + "epoch": 2.3951210826210825, + "grad_norm": 0.755388617515564, + "learning_rate": 6.960994015484818e-05, + "loss": 0.7759, + "step": 13453 + }, + { + "epoch": 2.3952991452991452, + "grad_norm": 0.7774340510368347, + "learning_rate": 6.959660500844708e-05, + "loss": 0.7353, + "step": 13454 + }, + { + "epoch": 2.395477207977208, + "grad_norm": 0.8696026802062988, + "learning_rate": 6.958327045773733e-05, + "loss": 0.635, + "step": 13455 + }, + { + "epoch": 2.3956552706552707, + "grad_norm": 0.8419780731201172, + "learning_rate": 6.956993650298025e-05, + "loss": 0.8515, + "step": 13456 + }, + { + "epoch": 2.3958333333333335, + "grad_norm": 0.9125590324401855, + "learning_rate": 6.955660314443699e-05, + "loss": 0.9099, + "step": 13457 + }, + { + "epoch": 2.396011396011396, + "grad_norm": 0.6847489476203918, + "learning_rate": 6.954327038236891e-05, + "loss": 0.6652, + "step": 13458 + }, + { + "epoch": 2.3961894586894585, + "grad_norm": 0.8674905896186829, + "learning_rate": 6.952993821703713e-05, + "loss": 0.7049, + "step": 13459 + }, + { + "epoch": 2.3963675213675213, + "grad_norm": 0.7777035236358643, + "learning_rate": 6.951660664870296e-05, + "loss": 0.818, + "step": 13460 + }, + { + "epoch": 
2.396545584045584, + "grad_norm": 0.8349783420562744, + "learning_rate": 6.950327567762751e-05, + "loss": 0.9203, + "step": 13461 + }, + { + "epoch": 2.396723646723647, + "grad_norm": 0.7589834332466125, + "learning_rate": 6.948994530407206e-05, + "loss": 1.015, + "step": 13462 + }, + { + "epoch": 2.3969017094017095, + "grad_norm": 0.9340610504150391, + "learning_rate": 6.947661552829773e-05, + "loss": 1.0575, + "step": 13463 + }, + { + "epoch": 2.3970797720797723, + "grad_norm": 0.9100959300994873, + "learning_rate": 6.946328635056573e-05, + "loss": 0.8824, + "step": 13464 + }, + { + "epoch": 2.3972578347578346, + "grad_norm": 0.8255945444107056, + "learning_rate": 6.944995777113717e-05, + "loss": 0.7701, + "step": 13465 + }, + { + "epoch": 2.3974358974358974, + "grad_norm": 0.8572675585746765, + "learning_rate": 6.943662979027328e-05, + "loss": 0.9425, + "step": 13466 + }, + { + "epoch": 2.39761396011396, + "grad_norm": 0.8219536542892456, + "learning_rate": 6.94233024082351e-05, + "loss": 0.8184, + "step": 13467 + }, + { + "epoch": 2.397792022792023, + "grad_norm": 0.8260995149612427, + "learning_rate": 6.940997562528377e-05, + "loss": 0.8324, + "step": 13468 + }, + { + "epoch": 2.3979700854700856, + "grad_norm": 0.9707075357437134, + "learning_rate": 6.939664944168047e-05, + "loss": 0.9865, + "step": 13469 + }, + { + "epoch": 2.398148148148148, + "grad_norm": 0.9030438661575317, + "learning_rate": 6.938332385768622e-05, + "loss": 1.0244, + "step": 13470 + }, + { + "epoch": 2.3983262108262107, + "grad_norm": 0.8425108194351196, + "learning_rate": 6.936999887356214e-05, + "loss": 0.7053, + "step": 13471 + }, + { + "epoch": 2.3985042735042734, + "grad_norm": 1.0073270797729492, + "learning_rate": 6.93566744895693e-05, + "loss": 0.9324, + "step": 13472 + }, + { + "epoch": 2.398682336182336, + "grad_norm": 0.7647563219070435, + "learning_rate": 6.93433507059688e-05, + "loss": 0.7233, + "step": 13473 + }, + { + "epoch": 2.398860398860399, + "grad_norm": 
0.7632454633712769, + "learning_rate": 6.933002752302162e-05, + "loss": 0.8678, + "step": 13474 + }, + { + "epoch": 2.3990384615384617, + "grad_norm": 0.7943702936172485, + "learning_rate": 6.931670494098887e-05, + "loss": 0.8805, + "step": 13475 + }, + { + "epoch": 2.3992165242165244, + "grad_norm": 0.9440419673919678, + "learning_rate": 6.930338296013153e-05, + "loss": 1.0103, + "step": 13476 + }, + { + "epoch": 2.3993945868945867, + "grad_norm": 0.9119253754615784, + "learning_rate": 6.929006158071065e-05, + "loss": 1.0235, + "step": 13477 + }, + { + "epoch": 2.3995726495726495, + "grad_norm": 0.7750248908996582, + "learning_rate": 6.927674080298721e-05, + "loss": 0.957, + "step": 13478 + }, + { + "epoch": 2.3997507122507122, + "grad_norm": 0.8847192525863647, + "learning_rate": 6.926342062722223e-05, + "loss": 0.9066, + "step": 13479 + }, + { + "epoch": 2.399928774928775, + "grad_norm": 0.814396321773529, + "learning_rate": 6.925010105367665e-05, + "loss": 1.0001, + "step": 13480 + }, + { + "epoch": 2.4001068376068377, + "grad_norm": 0.8323664665222168, + "learning_rate": 6.923678208261147e-05, + "loss": 1.0027, + "step": 13481 + }, + { + "epoch": 2.4002849002849, + "grad_norm": 0.8351104259490967, + "learning_rate": 6.92234637142876e-05, + "loss": 0.8629, + "step": 13482 + }, + { + "epoch": 2.400462962962963, + "grad_norm": 0.9298360347747803, + "learning_rate": 6.92101459489661e-05, + "loss": 0.9161, + "step": 13483 + }, + { + "epoch": 2.4006410256410255, + "grad_norm": 0.9423344135284424, + "learning_rate": 6.919682878690777e-05, + "loss": 1.4416, + "step": 13484 + }, + { + "epoch": 2.4008190883190883, + "grad_norm": 0.8340599536895752, + "learning_rate": 6.918351222837363e-05, + "loss": 0.8696, + "step": 13485 + }, + { + "epoch": 2.400997150997151, + "grad_norm": 0.8533751368522644, + "learning_rate": 6.917019627362451e-05, + "loss": 1.1383, + "step": 13486 + }, + { + "epoch": 2.401175213675214, + "grad_norm": 0.8060563206672668, + "learning_rate": 
6.91568809229214e-05, + "loss": 0.8544, + "step": 13487 + }, + { + "epoch": 2.4013532763532766, + "grad_norm": 0.865485668182373, + "learning_rate": 6.914356617652511e-05, + "loss": 0.9286, + "step": 13488 + }, + { + "epoch": 2.401531339031339, + "grad_norm": 0.8785045742988586, + "learning_rate": 6.913025203469652e-05, + "loss": 0.7339, + "step": 13489 + }, + { + "epoch": 2.4017094017094016, + "grad_norm": 0.7718466520309448, + "learning_rate": 6.911693849769654e-05, + "loss": 0.8821, + "step": 13490 + }, + { + "epoch": 2.4018874643874644, + "grad_norm": 0.7274343371391296, + "learning_rate": 6.910362556578599e-05, + "loss": 0.6179, + "step": 13491 + }, + { + "epoch": 2.402065527065527, + "grad_norm": 0.8848530054092407, + "learning_rate": 6.909031323922574e-05, + "loss": 0.7848, + "step": 13492 + }, + { + "epoch": 2.40224358974359, + "grad_norm": 0.7384527325630188, + "learning_rate": 6.907700151827657e-05, + "loss": 0.5, + "step": 13493 + }, + { + "epoch": 2.402421652421652, + "grad_norm": 0.865505576133728, + "learning_rate": 6.906369040319936e-05, + "loss": 0.7127, + "step": 13494 + }, + { + "epoch": 2.402599715099715, + "grad_norm": 0.8588849902153015, + "learning_rate": 6.90503798942548e-05, + "loss": 0.8833, + "step": 13495 + }, + { + "epoch": 2.4027777777777777, + "grad_norm": 0.8570847511291504, + "learning_rate": 6.903706999170381e-05, + "loss": 0.9765, + "step": 13496 + }, + { + "epoch": 2.4029558404558404, + "grad_norm": 0.9193849563598633, + "learning_rate": 6.902376069580706e-05, + "loss": 0.8654, + "step": 13497 + }, + { + "epoch": 2.403133903133903, + "grad_norm": 0.8181582093238831, + "learning_rate": 6.901045200682545e-05, + "loss": 0.8815, + "step": 13498 + }, + { + "epoch": 2.403311965811966, + "grad_norm": 0.783163845539093, + "learning_rate": 6.89971439250196e-05, + "loss": 0.8383, + "step": 13499 + }, + { + "epoch": 2.4034900284900287, + "grad_norm": 1.0679216384887695, + "learning_rate": 6.898383645065032e-05, + "loss": 1.0525, + "step": 
13500 + }, + { + "epoch": 2.403668091168091, + "grad_norm": 0.7945899367332458, + "learning_rate": 6.897052958397831e-05, + "loss": 1.0091, + "step": 13501 + }, + { + "epoch": 2.4038461538461537, + "grad_norm": 0.8310369253158569, + "learning_rate": 6.895722332526438e-05, + "loss": 0.8909, + "step": 13502 + }, + { + "epoch": 2.4040242165242165, + "grad_norm": 0.8811371922492981, + "learning_rate": 6.894391767476911e-05, + "loss": 0.8354, + "step": 13503 + }, + { + "epoch": 2.4042022792022792, + "grad_norm": 1.011495590209961, + "learning_rate": 6.893061263275332e-05, + "loss": 0.8846, + "step": 13504 + }, + { + "epoch": 2.404380341880342, + "grad_norm": 0.7587227821350098, + "learning_rate": 6.891730819947758e-05, + "loss": 0.8886, + "step": 13505 + }, + { + "epoch": 2.4045584045584047, + "grad_norm": 0.8367353677749634, + "learning_rate": 6.890400437520265e-05, + "loss": 1.008, + "step": 13506 + }, + { + "epoch": 2.404736467236467, + "grad_norm": 0.7200010418891907, + "learning_rate": 6.889070116018911e-05, + "loss": 0.8405, + "step": 13507 + }, + { + "epoch": 2.40491452991453, + "grad_norm": 0.9391907453536987, + "learning_rate": 6.887739855469769e-05, + "loss": 0.8904, + "step": 13508 + }, + { + "epoch": 2.4050925925925926, + "grad_norm": 0.8687568306922913, + "learning_rate": 6.886409655898902e-05, + "loss": 0.7145, + "step": 13509 + }, + { + "epoch": 2.4052706552706553, + "grad_norm": 0.7382767796516418, + "learning_rate": 6.885079517332366e-05, + "loss": 0.7639, + "step": 13510 + }, + { + "epoch": 2.405448717948718, + "grad_norm": 0.8322962522506714, + "learning_rate": 6.883749439796227e-05, + "loss": 1.0002, + "step": 13511 + }, + { + "epoch": 2.405626780626781, + "grad_norm": 0.815183162689209, + "learning_rate": 6.882419423316544e-05, + "loss": 0.8628, + "step": 13512 + }, + { + "epoch": 2.405804843304843, + "grad_norm": 0.9304860234260559, + "learning_rate": 6.881089467919381e-05, + "loss": 0.9489, + "step": 13513 + }, + { + "epoch": 2.405982905982906, + 
"grad_norm": 0.9071274995803833, + "learning_rate": 6.879759573630784e-05, + "loss": 0.8117, + "step": 13514 + }, + { + "epoch": 2.4061609686609686, + "grad_norm": 0.9378795027732849, + "learning_rate": 6.878429740476822e-05, + "loss": 1.22, + "step": 13515 + }, + { + "epoch": 2.4063390313390314, + "grad_norm": 0.7354511618614197, + "learning_rate": 6.877099968483541e-05, + "loss": 0.6696, + "step": 13516 + }, + { + "epoch": 2.406517094017094, + "grad_norm": 0.8701893091201782, + "learning_rate": 6.875770257677002e-05, + "loss": 0.8691, + "step": 13517 + }, + { + "epoch": 2.406695156695157, + "grad_norm": 0.8819001913070679, + "learning_rate": 6.87444060808325e-05, + "loss": 0.7428, + "step": 13518 + }, + { + "epoch": 2.406873219373219, + "grad_norm": 0.7339609265327454, + "learning_rate": 6.873111019728347e-05, + "loss": 0.7959, + "step": 13519 + }, + { + "epoch": 2.407051282051282, + "grad_norm": 0.8365123867988586, + "learning_rate": 6.871781492638335e-05, + "loss": 0.8199, + "step": 13520 + }, + { + "epoch": 2.4072293447293447, + "grad_norm": 0.9667043685913086, + "learning_rate": 6.870452026839266e-05, + "loss": 0.8261, + "step": 13521 + }, + { + "epoch": 2.4074074074074074, + "grad_norm": 0.6979679465293884, + "learning_rate": 6.869122622357187e-05, + "loss": 0.5909, + "step": 13522 + }, + { + "epoch": 2.40758547008547, + "grad_norm": 0.7326778769493103, + "learning_rate": 6.867793279218152e-05, + "loss": 0.9297, + "step": 13523 + }, + { + "epoch": 2.407763532763533, + "grad_norm": 0.8808563351631165, + "learning_rate": 6.866463997448196e-05, + "loss": 0.7481, + "step": 13524 + }, + { + "epoch": 2.4079415954415953, + "grad_norm": 0.7830268740653992, + "learning_rate": 6.86513477707337e-05, + "loss": 0.7902, + "step": 13525 + }, + { + "epoch": 2.408119658119658, + "grad_norm": 0.9482602477073669, + "learning_rate": 6.863805618119713e-05, + "loss": 1.1541, + "step": 13526 + }, + { + "epoch": 2.4082977207977208, + "grad_norm": 0.8369114995002747, + 
"learning_rate": 6.862476520613276e-05, + "loss": 0.874, + "step": 13527 + }, + { + "epoch": 2.4084757834757835, + "grad_norm": 0.9107078909873962, + "learning_rate": 6.86114748458009e-05, + "loss": 0.9412, + "step": 13528 + }, + { + "epoch": 2.4086538461538463, + "grad_norm": 0.8086137771606445, + "learning_rate": 6.859818510046199e-05, + "loss": 0.8495, + "step": 13529 + }, + { + "epoch": 2.408831908831909, + "grad_norm": 0.8824704885482788, + "learning_rate": 6.858489597037646e-05, + "loss": 0.8967, + "step": 13530 + }, + { + "epoch": 2.4090099715099713, + "grad_norm": 0.8514662384986877, + "learning_rate": 6.857160745580455e-05, + "loss": 0.9171, + "step": 13531 + }, + { + "epoch": 2.409188034188034, + "grad_norm": 0.7788167595863342, + "learning_rate": 6.855831955700675e-05, + "loss": 0.904, + "step": 13532 + }, + { + "epoch": 2.409366096866097, + "grad_norm": 0.913113534450531, + "learning_rate": 6.854503227424337e-05, + "loss": 0.8696, + "step": 13533 + }, + { + "epoch": 2.4095441595441596, + "grad_norm": 0.8424487113952637, + "learning_rate": 6.853174560777475e-05, + "loss": 0.8388, + "step": 13534 + }, + { + "epoch": 2.4097222222222223, + "grad_norm": 0.8609711527824402, + "learning_rate": 6.851845955786116e-05, + "loss": 0.7142, + "step": 13535 + }, + { + "epoch": 2.409900284900285, + "grad_norm": 0.8141375184059143, + "learning_rate": 6.850517412476301e-05, + "loss": 0.7198, + "step": 13536 + }, + { + "epoch": 2.410078347578348, + "grad_norm": 0.8615440130233765, + "learning_rate": 6.84918893087405e-05, + "loss": 0.958, + "step": 13537 + }, + { + "epoch": 2.41025641025641, + "grad_norm": 0.7733060717582703, + "learning_rate": 6.847860511005401e-05, + "loss": 0.7639, + "step": 13538 + }, + { + "epoch": 2.410434472934473, + "grad_norm": 0.9519185423851013, + "learning_rate": 6.846532152896375e-05, + "loss": 0.8239, + "step": 13539 + }, + { + "epoch": 2.4106125356125356, + "grad_norm": 0.774053692817688, + "learning_rate": 6.845203856573002e-05, + "loss": 
0.891, + "step": 13540 + }, + { + "epoch": 2.4107905982905984, + "grad_norm": 0.8791571259498596, + "learning_rate": 6.843875622061304e-05, + "loss": 1.0107, + "step": 13541 + }, + { + "epoch": 2.410968660968661, + "grad_norm": 0.9431949853897095, + "learning_rate": 6.842547449387309e-05, + "loss": 0.8575, + "step": 13542 + }, + { + "epoch": 2.4111467236467234, + "grad_norm": 1.0521612167358398, + "learning_rate": 6.841219338577034e-05, + "loss": 0.9446, + "step": 13543 + }, + { + "epoch": 2.411324786324786, + "grad_norm": 0.7592857480049133, + "learning_rate": 6.83989128965651e-05, + "loss": 1.0595, + "step": 13544 + }, + { + "epoch": 2.411502849002849, + "grad_norm": 0.9002043604850769, + "learning_rate": 6.838563302651747e-05, + "loss": 0.9067, + "step": 13545 + }, + { + "epoch": 2.4116809116809117, + "grad_norm": 0.7144047021865845, + "learning_rate": 6.83723537758877e-05, + "loss": 0.6699, + "step": 13546 + }, + { + "epoch": 2.4118589743589745, + "grad_norm": 0.8226693868637085, + "learning_rate": 6.835907514493594e-05, + "loss": 0.987, + "step": 13547 + }, + { + "epoch": 2.412037037037037, + "grad_norm": 0.8507830500602722, + "learning_rate": 6.834579713392237e-05, + "loss": 0.7803, + "step": 13548 + }, + { + "epoch": 2.4122150997151, + "grad_norm": 0.727870762348175, + "learning_rate": 6.83325197431072e-05, + "loss": 0.8071, + "step": 13549 + }, + { + "epoch": 2.4123931623931623, + "grad_norm": 0.7601624727249146, + "learning_rate": 6.831924297275049e-05, + "loss": 0.7627, + "step": 13550 + }, + { + "epoch": 2.412571225071225, + "grad_norm": 0.8519877791404724, + "learning_rate": 6.830596682311243e-05, + "loss": 0.9271, + "step": 13551 + }, + { + "epoch": 2.4127492877492878, + "grad_norm": 1.0122307538986206, + "learning_rate": 6.829269129445307e-05, + "loss": 0.6424, + "step": 13552 + }, + { + "epoch": 2.4129273504273505, + "grad_norm": 0.8992687463760376, + "learning_rate": 6.827941638703258e-05, + "loss": 0.8034, + "step": 13553 + }, + { + "epoch": 
2.4131054131054133, + "grad_norm": 0.7740746140480042, + "learning_rate": 6.826614210111102e-05, + "loss": 0.955, + "step": 13554 + }, + { + "epoch": 2.4132834757834756, + "grad_norm": 0.8176493048667908, + "learning_rate": 6.825286843694852e-05, + "loss": 0.7844, + "step": 13555 + }, + { + "epoch": 2.4134615384615383, + "grad_norm": 0.8112488985061646, + "learning_rate": 6.823959539480507e-05, + "loss": 0.8495, + "step": 13556 + }, + { + "epoch": 2.413639601139601, + "grad_norm": 0.8186960220336914, + "learning_rate": 6.822632297494078e-05, + "loss": 0.8922, + "step": 13557 + }, + { + "epoch": 2.413817663817664, + "grad_norm": 0.9498438835144043, + "learning_rate": 6.821305117761569e-05, + "loss": 0.8862, + "step": 13558 + }, + { + "epoch": 2.4139957264957266, + "grad_norm": 0.8591099381446838, + "learning_rate": 6.819978000308987e-05, + "loss": 0.837, + "step": 13559 + }, + { + "epoch": 2.4141737891737893, + "grad_norm": 0.8130860328674316, + "learning_rate": 6.818650945162324e-05, + "loss": 1.0723, + "step": 13560 + }, + { + "epoch": 2.414351851851852, + "grad_norm": 0.8800109624862671, + "learning_rate": 6.81732395234759e-05, + "loss": 0.8067, + "step": 13561 + }, + { + "epoch": 2.4145299145299144, + "grad_norm": 0.7786064147949219, + "learning_rate": 6.81599702189078e-05, + "loss": 0.9, + "step": 13562 + }, + { + "epoch": 2.414707977207977, + "grad_norm": 0.8343027234077454, + "learning_rate": 6.814670153817898e-05, + "loss": 0.7487, + "step": 13563 + }, + { + "epoch": 2.41488603988604, + "grad_norm": 0.7904187440872192, + "learning_rate": 6.813343348154934e-05, + "loss": 0.7904, + "step": 13564 + }, + { + "epoch": 2.4150641025641026, + "grad_norm": 0.7609010934829712, + "learning_rate": 6.81201660492789e-05, + "loss": 0.6734, + "step": 13565 + }, + { + "epoch": 2.4152421652421654, + "grad_norm": 0.8402243256568909, + "learning_rate": 6.810689924162756e-05, + "loss": 0.9581, + "step": 13566 + }, + { + "epoch": 2.4154202279202277, + "grad_norm": 
0.8557454943656921, + "learning_rate": 6.809363305885527e-05, + "loss": 0.8387, + "step": 13567 + }, + { + "epoch": 2.4155982905982905, + "grad_norm": 0.8983132243156433, + "learning_rate": 6.808036750122197e-05, + "loss": 0.832, + "step": 13568 + }, + { + "epoch": 2.415776353276353, + "grad_norm": 0.8552190065383911, + "learning_rate": 6.806710256898755e-05, + "loss": 0.9257, + "step": 13569 + }, + { + "epoch": 2.415954415954416, + "grad_norm": 1.0639078617095947, + "learning_rate": 6.805383826241197e-05, + "loss": 0.9743, + "step": 13570 + }, + { + "epoch": 2.4161324786324787, + "grad_norm": 0.7951667904853821, + "learning_rate": 6.804057458175501e-05, + "loss": 0.9326, + "step": 13571 + }, + { + "epoch": 2.4163105413105415, + "grad_norm": 0.8652639985084534, + "learning_rate": 6.802731152727664e-05, + "loss": 1.0114, + "step": 13572 + }, + { + "epoch": 2.416488603988604, + "grad_norm": 0.8777487874031067, + "learning_rate": 6.801404909923664e-05, + "loss": 0.759, + "step": 13573 + }, + { + "epoch": 2.4166666666666665, + "grad_norm": 0.7922869920730591, + "learning_rate": 6.800078729789497e-05, + "loss": 0.8392, + "step": 13574 + }, + { + "epoch": 2.4168447293447293, + "grad_norm": 0.9189477562904358, + "learning_rate": 6.798752612351133e-05, + "loss": 0.9485, + "step": 13575 + }, + { + "epoch": 2.417022792022792, + "grad_norm": 0.8752175569534302, + "learning_rate": 6.797426557634567e-05, + "loss": 0.725, + "step": 13576 + }, + { + "epoch": 2.4172008547008548, + "grad_norm": 0.8646897077560425, + "learning_rate": 6.79610056566577e-05, + "loss": 0.9791, + "step": 13577 + }, + { + "epoch": 2.4173789173789175, + "grad_norm": 0.8749415278434753, + "learning_rate": 6.794774636470731e-05, + "loss": 0.9059, + "step": 13578 + }, + { + "epoch": 2.41755698005698, + "grad_norm": 0.9642252922058105, + "learning_rate": 6.793448770075422e-05, + "loss": 0.9972, + "step": 13579 + }, + { + "epoch": 2.4177350427350426, + "grad_norm": 0.8430541157722473, + "learning_rate": 
6.792122966505827e-05, + "loss": 0.7126, + "step": 13580 + }, + { + "epoch": 2.4179131054131053, + "grad_norm": 0.8478374481201172, + "learning_rate": 6.790797225787913e-05, + "loss": 0.8995, + "step": 13581 + }, + { + "epoch": 2.418091168091168, + "grad_norm": 0.7008727788925171, + "learning_rate": 6.789471547947665e-05, + "loss": 0.6694, + "step": 13582 + }, + { + "epoch": 2.418269230769231, + "grad_norm": 0.873543918132782, + "learning_rate": 6.78814593301105e-05, + "loss": 1.0418, + "step": 13583 + }, + { + "epoch": 2.4184472934472936, + "grad_norm": 0.7208766341209412, + "learning_rate": 6.786820381004047e-05, + "loss": 0.8095, + "step": 13584 + }, + { + "epoch": 2.4186253561253563, + "grad_norm": 0.7272628545761108, + "learning_rate": 6.78549489195262e-05, + "loss": 0.5801, + "step": 13585 + }, + { + "epoch": 2.4188034188034186, + "grad_norm": 0.7155343294143677, + "learning_rate": 6.784169465882747e-05, + "loss": 0.748, + "step": 13586 + }, + { + "epoch": 2.4189814814814814, + "grad_norm": 0.928404688835144, + "learning_rate": 6.78284410282039e-05, + "loss": 0.9175, + "step": 13587 + }, + { + "epoch": 2.419159544159544, + "grad_norm": 0.7239044308662415, + "learning_rate": 6.781518802791519e-05, + "loss": 0.5855, + "step": 13588 + }, + { + "epoch": 2.419337606837607, + "grad_norm": 0.8126311302185059, + "learning_rate": 6.780193565822104e-05, + "loss": 0.832, + "step": 13589 + }, + { + "epoch": 2.4195156695156697, + "grad_norm": 0.7470774054527283, + "learning_rate": 6.778868391938103e-05, + "loss": 0.6202, + "step": 13590 + }, + { + "epoch": 2.419693732193732, + "grad_norm": 0.9161462187767029, + "learning_rate": 6.77754328116549e-05, + "loss": 0.8674, + "step": 13591 + }, + { + "epoch": 2.4198717948717947, + "grad_norm": 0.7225745320320129, + "learning_rate": 6.77621823353022e-05, + "loss": 0.8745, + "step": 13592 + }, + { + "epoch": 2.4200498575498575, + "grad_norm": 0.8380082845687866, + "learning_rate": 6.774893249058257e-05, + "loss": 0.5501, + "step": 
13593 + }, + { + "epoch": 2.42022792022792, + "grad_norm": 0.8031942844390869, + "learning_rate": 6.77356832777556e-05, + "loss": 0.9925, + "step": 13594 + }, + { + "epoch": 2.420405982905983, + "grad_norm": 0.8278502821922302, + "learning_rate": 6.772243469708093e-05, + "loss": 0.6411, + "step": 13595 + }, + { + "epoch": 2.4205840455840457, + "grad_norm": 0.7655481100082397, + "learning_rate": 6.770918674881805e-05, + "loss": 0.7896, + "step": 13596 + }, + { + "epoch": 2.4207621082621085, + "grad_norm": 0.8260186314582825, + "learning_rate": 6.769593943322661e-05, + "loss": 0.8531, + "step": 13597 + }, + { + "epoch": 2.4209401709401708, + "grad_norm": 0.8293251395225525, + "learning_rate": 6.76826927505661e-05, + "loss": 0.8193, + "step": 13598 + }, + { + "epoch": 2.4211182336182335, + "grad_norm": 0.8868293762207031, + "learning_rate": 6.766944670109616e-05, + "loss": 0.8453, + "step": 13599 + }, + { + "epoch": 2.4212962962962963, + "grad_norm": 0.769124448299408, + "learning_rate": 6.765620128507619e-05, + "loss": 0.7412, + "step": 13600 + }, + { + "epoch": 2.421474358974359, + "grad_norm": 0.7727167010307312, + "learning_rate": 6.764295650276581e-05, + "loss": 0.8721, + "step": 13601 + }, + { + "epoch": 2.421652421652422, + "grad_norm": 0.9975818395614624, + "learning_rate": 6.762971235442444e-05, + "loss": 0.8128, + "step": 13602 + }, + { + "epoch": 2.421830484330484, + "grad_norm": 0.8000788688659668, + "learning_rate": 6.761646884031164e-05, + "loss": 0.8328, + "step": 13603 + }, + { + "epoch": 2.422008547008547, + "grad_norm": 0.7196731567382812, + "learning_rate": 6.760322596068684e-05, + "loss": 0.8912, + "step": 13604 + }, + { + "epoch": 2.4221866096866096, + "grad_norm": 0.8092321753501892, + "learning_rate": 6.758998371580955e-05, + "loss": 0.9066, + "step": 13605 + }, + { + "epoch": 2.4223646723646723, + "grad_norm": 0.7664031982421875, + "learning_rate": 6.757674210593918e-05, + "loss": 0.9854, + "step": 13606 + }, + { + "epoch": 2.422542735042735, + 
"grad_norm": 0.794507622718811, + "learning_rate": 6.75635011313352e-05, + "loss": 0.9296, + "step": 13607 + }, + { + "epoch": 2.422720797720798, + "grad_norm": 0.9127107858657837, + "learning_rate": 6.755026079225705e-05, + "loss": 0.9516, + "step": 13608 + }, + { + "epoch": 2.4228988603988606, + "grad_norm": 0.8025720715522766, + "learning_rate": 6.753702108896411e-05, + "loss": 1.0664, + "step": 13609 + }, + { + "epoch": 2.423076923076923, + "grad_norm": 0.7304871678352356, + "learning_rate": 6.752378202171585e-05, + "loss": 0.9071, + "step": 13610 + }, + { + "epoch": 2.4232549857549857, + "grad_norm": 0.9048241972923279, + "learning_rate": 6.751054359077157e-05, + "loss": 0.8649, + "step": 13611 + }, + { + "epoch": 2.4234330484330484, + "grad_norm": 0.8589995503425598, + "learning_rate": 6.749730579639074e-05, + "loss": 0.8895, + "step": 13612 + }, + { + "epoch": 2.423611111111111, + "grad_norm": 0.8098960518836975, + "learning_rate": 6.748406863883265e-05, + "loss": 0.6651, + "step": 13613 + }, + { + "epoch": 2.423789173789174, + "grad_norm": 0.9875120520591736, + "learning_rate": 6.74708321183567e-05, + "loss": 0.9605, + "step": 13614 + }, + { + "epoch": 2.423967236467236, + "grad_norm": 0.8211431503295898, + "learning_rate": 6.74575962352222e-05, + "loss": 0.8721, + "step": 13615 + }, + { + "epoch": 2.424145299145299, + "grad_norm": 0.9732884168624878, + "learning_rate": 6.744436098968855e-05, + "loss": 0.7501, + "step": 13616 + }, + { + "epoch": 2.4243233618233617, + "grad_norm": 0.9813733696937561, + "learning_rate": 6.743112638201496e-05, + "loss": 0.9823, + "step": 13617 + }, + { + "epoch": 2.4245014245014245, + "grad_norm": 0.8075012564659119, + "learning_rate": 6.741789241246083e-05, + "loss": 0.7018, + "step": 13618 + }, + { + "epoch": 2.4246794871794872, + "grad_norm": 0.7845864295959473, + "learning_rate": 6.740465908128539e-05, + "loss": 0.7423, + "step": 13619 + }, + { + "epoch": 2.42485754985755, + "grad_norm": 0.7754862308502197, + 
"learning_rate": 6.739142638874799e-05, + "loss": 0.8735, + "step": 13620 + }, + { + "epoch": 2.4250356125356127, + "grad_norm": 0.7971537709236145, + "learning_rate": 6.737819433510781e-05, + "loss": 0.7663, + "step": 13621 + }, + { + "epoch": 2.425213675213675, + "grad_norm": 0.8043563365936279, + "learning_rate": 6.736496292062416e-05, + "loss": 0.9311, + "step": 13622 + }, + { + "epoch": 2.425391737891738, + "grad_norm": 0.8150136470794678, + "learning_rate": 6.735173214555628e-05, + "loss": 0.9164, + "step": 13623 + }, + { + "epoch": 2.4255698005698005, + "grad_norm": 0.9853758811950684, + "learning_rate": 6.733850201016338e-05, + "loss": 0.6253, + "step": 13624 + }, + { + "epoch": 2.4257478632478633, + "grad_norm": 1.2138506174087524, + "learning_rate": 6.732527251470465e-05, + "loss": 1.0536, + "step": 13625 + }, + { + "epoch": 2.425925925925926, + "grad_norm": 0.9306546449661255, + "learning_rate": 6.73120436594394e-05, + "loss": 0.8607, + "step": 13626 + }, + { + "epoch": 2.426103988603989, + "grad_norm": 0.8536837697029114, + "learning_rate": 6.729881544462668e-05, + "loss": 0.9418, + "step": 13627 + }, + { + "epoch": 2.426282051282051, + "grad_norm": 0.8561417460441589, + "learning_rate": 6.728558787052574e-05, + "loss": 0.9556, + "step": 13628 + }, + { + "epoch": 2.426460113960114, + "grad_norm": 0.7499847412109375, + "learning_rate": 6.727236093739579e-05, + "loss": 0.7795, + "step": 13629 + }, + { + "epoch": 2.4266381766381766, + "grad_norm": 0.8541018962860107, + "learning_rate": 6.725913464549591e-05, + "loss": 1.0322, + "step": 13630 + }, + { + "epoch": 2.4268162393162394, + "grad_norm": 0.9659489989280701, + "learning_rate": 6.724590899508532e-05, + "loss": 1.1907, + "step": 13631 + }, + { + "epoch": 2.426994301994302, + "grad_norm": 0.9548102617263794, + "learning_rate": 6.723268398642307e-05, + "loss": 1.0545, + "step": 13632 + }, + { + "epoch": 2.427172364672365, + "grad_norm": 0.8543868660926819, + "learning_rate": 6.72194596197683e-05, + 
"loss": 0.8041, + "step": 13633 + }, + { + "epoch": 2.427350427350427, + "grad_norm": 0.838178277015686, + "learning_rate": 6.720623589538013e-05, + "loss": 0.7772, + "step": 13634 + }, + { + "epoch": 2.42752849002849, + "grad_norm": 0.8207933306694031, + "learning_rate": 6.719301281351768e-05, + "loss": 0.9222, + "step": 13635 + }, + { + "epoch": 2.4277065527065527, + "grad_norm": 0.7705093026161194, + "learning_rate": 6.717979037443996e-05, + "loss": 0.8989, + "step": 13636 + }, + { + "epoch": 2.4278846153846154, + "grad_norm": 0.8627061247825623, + "learning_rate": 6.716656857840609e-05, + "loss": 0.8834, + "step": 13637 + }, + { + "epoch": 2.428062678062678, + "grad_norm": 0.8404269218444824, + "learning_rate": 6.715334742567507e-05, + "loss": 0.8087, + "step": 13638 + }, + { + "epoch": 2.428240740740741, + "grad_norm": 0.9098958373069763, + "learning_rate": 6.7140126916506e-05, + "loss": 0.803, + "step": 13639 + }, + { + "epoch": 2.4284188034188032, + "grad_norm": 0.7482922673225403, + "learning_rate": 6.712690705115785e-05, + "loss": 0.8254, + "step": 13640 + }, + { + "epoch": 2.428596866096866, + "grad_norm": 0.8636375665664673, + "learning_rate": 6.711368782988972e-05, + "loss": 0.8788, + "step": 13641 + }, + { + "epoch": 2.4287749287749287, + "grad_norm": 0.8261808753013611, + "learning_rate": 6.710046925296052e-05, + "loss": 0.9135, + "step": 13642 + }, + { + "epoch": 2.4289529914529915, + "grad_norm": 1.060263752937317, + "learning_rate": 6.70872513206293e-05, + "loss": 0.9589, + "step": 13643 + }, + { + "epoch": 2.4291310541310542, + "grad_norm": 0.8128657341003418, + "learning_rate": 6.7074034033155e-05, + "loss": 0.7735, + "step": 13644 + }, + { + "epoch": 2.429309116809117, + "grad_norm": 0.9948938488960266, + "learning_rate": 6.706081739079663e-05, + "loss": 0.7242, + "step": 13645 + }, + { + "epoch": 2.4294871794871793, + "grad_norm": 0.8850025534629822, + "learning_rate": 6.704760139381311e-05, + "loss": 0.9393, + "step": 13646 + }, + { + "epoch": 
2.429665242165242, + "grad_norm": 0.833534300327301, + "learning_rate": 6.703438604246337e-05, + "loss": 0.7824, + "step": 13647 + }, + { + "epoch": 2.429843304843305, + "grad_norm": 0.7362738251686096, + "learning_rate": 6.70211713370064e-05, + "loss": 0.7594, + "step": 13648 + }, + { + "epoch": 2.4300213675213675, + "grad_norm": 0.97635817527771, + "learning_rate": 6.700795727770101e-05, + "loss": 1.0097, + "step": 13649 + }, + { + "epoch": 2.4301994301994303, + "grad_norm": 0.8434939980506897, + "learning_rate": 6.699474386480622e-05, + "loss": 0.6639, + "step": 13650 + }, + { + "epoch": 2.430377492877493, + "grad_norm": 0.7960709929466248, + "learning_rate": 6.69815310985808e-05, + "loss": 0.9557, + "step": 13651 + }, + { + "epoch": 2.4305555555555554, + "grad_norm": 0.8336359262466431, + "learning_rate": 6.696831897928376e-05, + "loss": 1.0112, + "step": 13652 + }, + { + "epoch": 2.430733618233618, + "grad_norm": 0.8353996872901917, + "learning_rate": 6.695510750717384e-05, + "loss": 0.8578, + "step": 13653 + }, + { + "epoch": 2.430911680911681, + "grad_norm": 0.8968163132667542, + "learning_rate": 6.694189668250996e-05, + "loss": 1.0412, + "step": 13654 + }, + { + "epoch": 2.4310897435897436, + "grad_norm": 0.8091850876808167, + "learning_rate": 6.692868650555093e-05, + "loss": 0.761, + "step": 13655 + }, + { + "epoch": 2.4312678062678064, + "grad_norm": 0.7735705375671387, + "learning_rate": 6.691547697655563e-05, + "loss": 1.0507, + "step": 13656 + }, + { + "epoch": 2.431445868945869, + "grad_norm": 0.7707101702690125, + "learning_rate": 6.690226809578279e-05, + "loss": 0.8883, + "step": 13657 + }, + { + "epoch": 2.431623931623932, + "grad_norm": 0.8384747505187988, + "learning_rate": 6.688905986349127e-05, + "loss": 0.7772, + "step": 13658 + }, + { + "epoch": 2.431801994301994, + "grad_norm": 0.7676185369491577, + "learning_rate": 6.687585227993985e-05, + "loss": 0.844, + "step": 13659 + }, + { + "epoch": 2.431980056980057, + "grad_norm": 
0.8745819926261902, + "learning_rate": 6.686264534538726e-05, + "loss": 0.9996, + "step": 13660 + }, + { + "epoch": 2.4321581196581197, + "grad_norm": 0.7455142736434937, + "learning_rate": 6.684943906009232e-05, + "loss": 0.7133, + "step": 13661 + }, + { + "epoch": 2.4323361823361824, + "grad_norm": 0.8742238879203796, + "learning_rate": 6.683623342431378e-05, + "loss": 0.8155, + "step": 13662 + }, + { + "epoch": 2.432514245014245, + "grad_norm": 0.7863791584968567, + "learning_rate": 6.68230284383103e-05, + "loss": 1.0017, + "step": 13663 + }, + { + "epoch": 2.4326923076923075, + "grad_norm": 0.9469232559204102, + "learning_rate": 6.68098241023407e-05, + "loss": 0.8172, + "step": 13664 + }, + { + "epoch": 2.4328703703703702, + "grad_norm": 0.808024525642395, + "learning_rate": 6.679662041666362e-05, + "loss": 0.8813, + "step": 13665 + }, + { + "epoch": 2.433048433048433, + "grad_norm": 0.834863543510437, + "learning_rate": 6.67834173815378e-05, + "loss": 0.9765, + "step": 13666 + }, + { + "epoch": 2.4332264957264957, + "grad_norm": 0.8903583288192749, + "learning_rate": 6.677021499722193e-05, + "loss": 0.9393, + "step": 13667 + }, + { + "epoch": 2.4334045584045585, + "grad_norm": 0.8341929912567139, + "learning_rate": 6.675701326397466e-05, + "loss": 0.8681, + "step": 13668 + }, + { + "epoch": 2.4335826210826212, + "grad_norm": 0.9348049163818359, + "learning_rate": 6.674381218205465e-05, + "loss": 0.7396, + "step": 13669 + }, + { + "epoch": 2.433760683760684, + "grad_norm": 0.8898159861564636, + "learning_rate": 6.673061175172055e-05, + "loss": 0.8638, + "step": 13670 + }, + { + "epoch": 2.4339387464387463, + "grad_norm": 0.8101391792297363, + "learning_rate": 6.671741197323105e-05, + "loss": 0.8064, + "step": 13671 + }, + { + "epoch": 2.434116809116809, + "grad_norm": 0.8756688237190247, + "learning_rate": 6.670421284684467e-05, + "loss": 0.7718, + "step": 13672 + }, + { + "epoch": 2.434294871794872, + "grad_norm": 0.8060923218727112, + "learning_rate": 
6.669101437282012e-05, + "loss": 0.8137, + "step": 13673 + }, + { + "epoch": 2.4344729344729346, + "grad_norm": 0.792891800403595, + "learning_rate": 6.667781655141589e-05, + "loss": 0.9104, + "step": 13674 + }, + { + "epoch": 2.4346509971509973, + "grad_norm": 0.8590527772903442, + "learning_rate": 6.666461938289068e-05, + "loss": 0.9578, + "step": 13675 + }, + { + "epoch": 2.4348290598290596, + "grad_norm": 0.8593253493309021, + "learning_rate": 6.665142286750297e-05, + "loss": 0.8083, + "step": 13676 + }, + { + "epoch": 2.4350071225071224, + "grad_norm": 0.8237900733947754, + "learning_rate": 6.663822700551137e-05, + "loss": 0.8096, + "step": 13677 + }, + { + "epoch": 2.435185185185185, + "grad_norm": 0.9017227292060852, + "learning_rate": 6.66250317971744e-05, + "loss": 0.9599, + "step": 13678 + }, + { + "epoch": 2.435363247863248, + "grad_norm": 0.7811765670776367, + "learning_rate": 6.661183724275061e-05, + "loss": 0.8392, + "step": 13679 + }, + { + "epoch": 2.4355413105413106, + "grad_norm": 0.8113176822662354, + "learning_rate": 6.659864334249848e-05, + "loss": 0.6788, + "step": 13680 + }, + { + "epoch": 2.4357193732193734, + "grad_norm": 0.795261561870575, + "learning_rate": 6.65854500966766e-05, + "loss": 0.783, + "step": 13681 + }, + { + "epoch": 2.435897435897436, + "grad_norm": 0.7738518714904785, + "learning_rate": 6.657225750554338e-05, + "loss": 0.676, + "step": 13682 + }, + { + "epoch": 2.4360754985754984, + "grad_norm": 0.8513518571853638, + "learning_rate": 6.655906556935737e-05, + "loss": 0.8743, + "step": 13683 + }, + { + "epoch": 2.436253561253561, + "grad_norm": 0.9595896005630493, + "learning_rate": 6.654587428837696e-05, + "loss": 0.9333, + "step": 13684 + }, + { + "epoch": 2.436431623931624, + "grad_norm": 0.7720373272895813, + "learning_rate": 6.653268366286066e-05, + "loss": 0.77, + "step": 13685 + }, + { + "epoch": 2.4366096866096867, + "grad_norm": 0.9022032022476196, + "learning_rate": 6.651949369306689e-05, + "loss": 0.8786, + 
"step": 13686 + }, + { + "epoch": 2.4367877492877494, + "grad_norm": 0.795092761516571, + "learning_rate": 6.650630437925409e-05, + "loss": 0.8557, + "step": 13687 + }, + { + "epoch": 2.4369658119658117, + "grad_norm": 0.8517789244651794, + "learning_rate": 6.649311572168072e-05, + "loss": 0.9781, + "step": 13688 + }, + { + "epoch": 2.4371438746438745, + "grad_norm": 1.0523463487625122, + "learning_rate": 6.64799277206051e-05, + "loss": 0.9046, + "step": 13689 + }, + { + "epoch": 2.4373219373219372, + "grad_norm": 0.9401832818984985, + "learning_rate": 6.646674037628568e-05, + "loss": 1.0966, + "step": 13690 + }, + { + "epoch": 2.4375, + "grad_norm": 0.9895738959312439, + "learning_rate": 6.645355368898082e-05, + "loss": 0.714, + "step": 13691 + }, + { + "epoch": 2.4376780626780628, + "grad_norm": 0.8458610773086548, + "learning_rate": 6.644036765894892e-05, + "loss": 0.6265, + "step": 13692 + }, + { + "epoch": 2.4378561253561255, + "grad_norm": 0.7908345460891724, + "learning_rate": 6.642718228644826e-05, + "loss": 0.7943, + "step": 13693 + }, + { + "epoch": 2.4380341880341883, + "grad_norm": 0.8119938373565674, + "learning_rate": 6.641399757173725e-05, + "loss": 0.5948, + "step": 13694 + }, + { + "epoch": 2.4382122507122506, + "grad_norm": 0.8175633549690247, + "learning_rate": 6.640081351507417e-05, + "loss": 0.9098, + "step": 13695 + }, + { + "epoch": 2.4383903133903133, + "grad_norm": 0.8546686768531799, + "learning_rate": 6.638763011671736e-05, + "loss": 1.0347, + "step": 13696 + }, + { + "epoch": 2.438568376068376, + "grad_norm": 0.812406599521637, + "learning_rate": 6.637444737692508e-05, + "loss": 0.8469, + "step": 13697 + }, + { + "epoch": 2.438746438746439, + "grad_norm": 0.7802549004554749, + "learning_rate": 6.636126529595572e-05, + "loss": 0.7024, + "step": 13698 + }, + { + "epoch": 2.4389245014245016, + "grad_norm": 0.8046648502349854, + "learning_rate": 6.634808387406744e-05, + "loss": 0.8292, + "step": 13699 + }, + { + "epoch": 2.439102564102564, + 
"grad_norm": 0.8544600009918213, + "learning_rate": 6.633490311151857e-05, + "loss": 0.8033, + "step": 13700 + }, + { + "epoch": 2.4392806267806266, + "grad_norm": 0.8327271938323975, + "learning_rate": 6.632172300856731e-05, + "loss": 0.8641, + "step": 13701 + }, + { + "epoch": 2.4394586894586894, + "grad_norm": 0.9563352465629578, + "learning_rate": 6.630854356547199e-05, + "loss": 0.8144, + "step": 13702 + }, + { + "epoch": 2.439636752136752, + "grad_norm": 0.8993256092071533, + "learning_rate": 6.629536478249071e-05, + "loss": 0.8688, + "step": 13703 + }, + { + "epoch": 2.439814814814815, + "grad_norm": 0.8741861581802368, + "learning_rate": 6.628218665988178e-05, + "loss": 0.9868, + "step": 13704 + }, + { + "epoch": 2.4399928774928776, + "grad_norm": 0.7898648381233215, + "learning_rate": 6.626900919790332e-05, + "loss": 0.7802, + "step": 13705 + }, + { + "epoch": 2.4401709401709404, + "grad_norm": 0.7651925086975098, + "learning_rate": 6.625583239681357e-05, + "loss": 0.8131, + "step": 13706 + }, + { + "epoch": 2.4403490028490027, + "grad_norm": 0.7917741537094116, + "learning_rate": 6.624265625687071e-05, + "loss": 0.8581, + "step": 13707 + }, + { + "epoch": 2.4405270655270654, + "grad_norm": 0.7631075978279114, + "learning_rate": 6.622948077833284e-05, + "loss": 0.6069, + "step": 13708 + }, + { + "epoch": 2.440705128205128, + "grad_norm": 0.920765221118927, + "learning_rate": 6.621630596145819e-05, + "loss": 0.6846, + "step": 13709 + }, + { + "epoch": 2.440883190883191, + "grad_norm": 0.822335422039032, + "learning_rate": 6.62031318065048e-05, + "loss": 1.0309, + "step": 13710 + }, + { + "epoch": 2.4410612535612537, + "grad_norm": 0.7978029251098633, + "learning_rate": 6.618995831373086e-05, + "loss": 0.9593, + "step": 13711 + }, + { + "epoch": 2.441239316239316, + "grad_norm": 0.8908950686454773, + "learning_rate": 6.617678548339443e-05, + "loss": 0.7147, + "step": 13712 + }, + { + "epoch": 2.4414173789173788, + "grad_norm": 0.7772884368896484, + 
"learning_rate": 6.616361331575368e-05, + "loss": 0.8839, + "step": 13713 + }, + { + "epoch": 2.4415954415954415, + "grad_norm": 0.8437771797180176, + "learning_rate": 6.615044181106658e-05, + "loss": 0.8388, + "step": 13714 + }, + { + "epoch": 2.4417735042735043, + "grad_norm": 0.8549850583076477, + "learning_rate": 6.613727096959128e-05, + "loss": 0.9251, + "step": 13715 + }, + { + "epoch": 2.441951566951567, + "grad_norm": 0.8402581810951233, + "learning_rate": 6.612410079158579e-05, + "loss": 0.8607, + "step": 13716 + }, + { + "epoch": 2.4421296296296298, + "grad_norm": 0.8181160688400269, + "learning_rate": 6.611093127730821e-05, + "loss": 0.6082, + "step": 13717 + }, + { + "epoch": 2.4423076923076925, + "grad_norm": 0.9006236791610718, + "learning_rate": 6.609776242701651e-05, + "loss": 0.9091, + "step": 13718 + }, + { + "epoch": 2.442485754985755, + "grad_norm": 0.7759920358657837, + "learning_rate": 6.608459424096876e-05, + "loss": 0.842, + "step": 13719 + }, + { + "epoch": 2.4426638176638176, + "grad_norm": 0.825701117515564, + "learning_rate": 6.60714267194229e-05, + "loss": 0.9325, + "step": 13720 + }, + { + "epoch": 2.4428418803418803, + "grad_norm": 0.7646961212158203, + "learning_rate": 6.605825986263697e-05, + "loss": 0.8124, + "step": 13721 + }, + { + "epoch": 2.443019943019943, + "grad_norm": 0.896112322807312, + "learning_rate": 6.604509367086888e-05, + "loss": 1.0962, + "step": 13722 + }, + { + "epoch": 2.443198005698006, + "grad_norm": 0.8079821467399597, + "learning_rate": 6.603192814437672e-05, + "loss": 0.8195, + "step": 13723 + }, + { + "epoch": 2.443376068376068, + "grad_norm": 0.8901529908180237, + "learning_rate": 6.601876328341831e-05, + "loss": 0.7886, + "step": 13724 + }, + { + "epoch": 2.443554131054131, + "grad_norm": 1.0454550981521606, + "learning_rate": 6.600559908825168e-05, + "loss": 0.9642, + "step": 13725 + }, + { + "epoch": 2.4437321937321936, + "grad_norm": 0.7995026707649231, + "learning_rate": 6.599243555913469e-05, + 
"loss": 0.6927, + "step": 13726 + }, + { + "epoch": 2.4439102564102564, + "grad_norm": 0.9235756397247314, + "learning_rate": 6.597927269632526e-05, + "loss": 0.8986, + "step": 13727 + }, + { + "epoch": 2.444088319088319, + "grad_norm": 0.7869365215301514, + "learning_rate": 6.596611050008137e-05, + "loss": 0.7592, + "step": 13728 + }, + { + "epoch": 2.444266381766382, + "grad_norm": 0.8172873258590698, + "learning_rate": 6.595294897066081e-05, + "loss": 0.8048, + "step": 13729 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 0.8021790981292725, + "learning_rate": 6.593978810832152e-05, + "loss": 0.767, + "step": 13730 + }, + { + "epoch": 2.444622507122507, + "grad_norm": 0.7781784534454346, + "learning_rate": 6.592662791332129e-05, + "loss": 0.8364, + "step": 13731 + }, + { + "epoch": 2.4448005698005697, + "grad_norm": 0.8227871656417847, + "learning_rate": 6.591346838591803e-05, + "loss": 0.8967, + "step": 13732 + }, + { + "epoch": 2.4449786324786325, + "grad_norm": 0.8349295854568481, + "learning_rate": 6.590030952636952e-05, + "loss": 0.8593, + "step": 13733 + }, + { + "epoch": 2.445156695156695, + "grad_norm": 1.0261762142181396, + "learning_rate": 6.588715133493365e-05, + "loss": 0.701, + "step": 13734 + }, + { + "epoch": 2.445334757834758, + "grad_norm": 0.8612635731697083, + "learning_rate": 6.587399381186814e-05, + "loss": 0.9803, + "step": 13735 + }, + { + "epoch": 2.4455128205128207, + "grad_norm": 0.7890039682388306, + "learning_rate": 6.586083695743086e-05, + "loss": 0.8001, + "step": 13736 + }, + { + "epoch": 2.445690883190883, + "grad_norm": 0.9934018850326538, + "learning_rate": 6.584768077187955e-05, + "loss": 1.0089, + "step": 13737 + }, + { + "epoch": 2.4458689458689458, + "grad_norm": 0.8232909440994263, + "learning_rate": 6.583452525547202e-05, + "loss": 0.8088, + "step": 13738 + }, + { + "epoch": 2.4460470085470085, + "grad_norm": 0.8635872006416321, + "learning_rate": 6.582137040846595e-05, + "loss": 1.0007, + "step": 13739 + }, + { + 
"epoch": 2.4462250712250713, + "grad_norm": 0.905575692653656, + "learning_rate": 6.580821623111914e-05, + "loss": 0.7577, + "step": 13740 + }, + { + "epoch": 2.446403133903134, + "grad_norm": 0.8264324069023132, + "learning_rate": 6.579506272368931e-05, + "loss": 0.9337, + "step": 13741 + }, + { + "epoch": 2.4465811965811968, + "grad_norm": 0.9100900292396545, + "learning_rate": 6.57819098864342e-05, + "loss": 0.8115, + "step": 13742 + }, + { + "epoch": 2.446759259259259, + "grad_norm": 0.8536351919174194, + "learning_rate": 6.576875771961145e-05, + "loss": 0.8612, + "step": 13743 + }, + { + "epoch": 2.446937321937322, + "grad_norm": 0.8968019485473633, + "learning_rate": 6.57556062234788e-05, + "loss": 0.8798, + "step": 13744 + }, + { + "epoch": 2.4471153846153846, + "grad_norm": 0.8745046854019165, + "learning_rate": 6.574245539829389e-05, + "loss": 0.8992, + "step": 13745 + }, + { + "epoch": 2.4472934472934473, + "grad_norm": 0.8336703777313232, + "learning_rate": 6.57293052443144e-05, + "loss": 0.7947, + "step": 13746 + }, + { + "epoch": 2.44747150997151, + "grad_norm": 0.8544902801513672, + "learning_rate": 6.571615576179801e-05, + "loss": 1.0315, + "step": 13747 + }, + { + "epoch": 2.447649572649573, + "grad_norm": 0.848242461681366, + "learning_rate": 6.570300695100229e-05, + "loss": 0.853, + "step": 13748 + }, + { + "epoch": 2.447827635327635, + "grad_norm": 0.7753778100013733, + "learning_rate": 6.568985881218496e-05, + "loss": 0.934, + "step": 13749 + }, + { + "epoch": 2.448005698005698, + "grad_norm": 0.8294853568077087, + "learning_rate": 6.567671134560351e-05, + "loss": 0.9328, + "step": 13750 + }, + { + "epoch": 2.4481837606837606, + "grad_norm": 0.8720992803573608, + "learning_rate": 6.566356455151565e-05, + "loss": 0.875, + "step": 13751 + }, + { + "epoch": 2.4483618233618234, + "grad_norm": 0.8204464316368103, + "learning_rate": 6.565041843017888e-05, + "loss": 0.923, + "step": 13752 + }, + { + "epoch": 2.448539886039886, + "grad_norm": 
0.940037190914154, + "learning_rate": 6.563727298185085e-05, + "loss": 1.1596, + "step": 13753 + }, + { + "epoch": 2.448717948717949, + "grad_norm": 0.8390263915061951, + "learning_rate": 6.562412820678902e-05, + "loss": 0.8256, + "step": 13754 + }, + { + "epoch": 2.448896011396011, + "grad_norm": 0.8572748303413391, + "learning_rate": 6.561098410525106e-05, + "loss": 0.7833, + "step": 13755 + }, + { + "epoch": 2.449074074074074, + "grad_norm": 0.7981020212173462, + "learning_rate": 6.559784067749436e-05, + "loss": 0.7609, + "step": 13756 + }, + { + "epoch": 2.4492521367521367, + "grad_norm": 0.8356930613517761, + "learning_rate": 6.558469792377653e-05, + "loss": 0.7542, + "step": 13757 + }, + { + "epoch": 2.4494301994301995, + "grad_norm": 0.9340906739234924, + "learning_rate": 6.557155584435504e-05, + "loss": 0.9898, + "step": 13758 + }, + { + "epoch": 2.449608262108262, + "grad_norm": 1.0551100969314575, + "learning_rate": 6.555841443948743e-05, + "loss": 0.7189, + "step": 13759 + }, + { + "epoch": 2.449786324786325, + "grad_norm": 0.9572125673294067, + "learning_rate": 6.554527370943111e-05, + "loss": 0.8878, + "step": 13760 + }, + { + "epoch": 2.4499643874643873, + "grad_norm": 0.8760324716567993, + "learning_rate": 6.55321336544436e-05, + "loss": 0.75, + "step": 13761 + }, + { + "epoch": 2.45014245014245, + "grad_norm": 0.7599226236343384, + "learning_rate": 6.55189942747823e-05, + "loss": 0.9222, + "step": 13762 + }, + { + "epoch": 2.4503205128205128, + "grad_norm": 0.7307319045066833, + "learning_rate": 6.550585557070473e-05, + "loss": 0.833, + "step": 13763 + }, + { + "epoch": 2.4504985754985755, + "grad_norm": 0.8022613525390625, + "learning_rate": 6.549271754246822e-05, + "loss": 0.9439, + "step": 13764 + }, + { + "epoch": 2.4506766381766383, + "grad_norm": 0.7447740435600281, + "learning_rate": 6.547958019033024e-05, + "loss": 0.7803, + "step": 13765 + }, + { + "epoch": 2.450854700854701, + "grad_norm": 0.9021183252334595, + "learning_rate": 
6.546644351454818e-05, + "loss": 0.8373, + "step": 13766 + }, + { + "epoch": 2.451032763532764, + "grad_norm": 0.8230152726173401, + "learning_rate": 6.545330751537941e-05, + "loss": 0.7023, + "step": 13767 + }, + { + "epoch": 2.451210826210826, + "grad_norm": 0.9581316113471985, + "learning_rate": 6.544017219308132e-05, + "loss": 1.0024, + "step": 13768 + }, + { + "epoch": 2.451388888888889, + "grad_norm": 0.7969945073127747, + "learning_rate": 6.542703754791127e-05, + "loss": 0.8996, + "step": 13769 + }, + { + "epoch": 2.4515669515669516, + "grad_norm": 0.864604115486145, + "learning_rate": 6.54139035801266e-05, + "loss": 0.9325, + "step": 13770 + }, + { + "epoch": 2.4517450142450143, + "grad_norm": 0.8156671524047852, + "learning_rate": 6.540077028998463e-05, + "loss": 0.7926, + "step": 13771 + }, + { + "epoch": 2.451923076923077, + "grad_norm": 0.8704202175140381, + "learning_rate": 6.538763767774272e-05, + "loss": 0.8855, + "step": 13772 + }, + { + "epoch": 2.4521011396011394, + "grad_norm": 0.7533015012741089, + "learning_rate": 6.537450574365811e-05, + "loss": 0.9322, + "step": 13773 + }, + { + "epoch": 2.452279202279202, + "grad_norm": 0.8272553086280823, + "learning_rate": 6.536137448798819e-05, + "loss": 0.7474, + "step": 13774 + }, + { + "epoch": 2.452457264957265, + "grad_norm": 0.7788257598876953, + "learning_rate": 6.534824391099013e-05, + "loss": 0.7163, + "step": 13775 + }, + { + "epoch": 2.4526353276353277, + "grad_norm": 0.8309275507926941, + "learning_rate": 6.533511401292125e-05, + "loss": 1.1595, + "step": 13776 + }, + { + "epoch": 2.4528133903133904, + "grad_norm": 0.8369085788726807, + "learning_rate": 6.53219847940388e-05, + "loss": 0.7211, + "step": 13777 + }, + { + "epoch": 2.452991452991453, + "grad_norm": 0.8571248054504395, + "learning_rate": 6.530885625460007e-05, + "loss": 0.729, + "step": 13778 + }, + { + "epoch": 2.453169515669516, + "grad_norm": 0.7579928040504456, + "learning_rate": 6.529572839486217e-05, + "loss": 0.8799, + 
"step": 13779 + }, + { + "epoch": 2.453347578347578, + "grad_norm": 0.822463870048523, + "learning_rate": 6.528260121508245e-05, + "loss": 0.7948, + "step": 13780 + }, + { + "epoch": 2.453525641025641, + "grad_norm": 0.7910317778587341, + "learning_rate": 6.526947471551798e-05, + "loss": 0.7727, + "step": 13781 + }, + { + "epoch": 2.4537037037037037, + "grad_norm": 0.9321692585945129, + "learning_rate": 6.525634889642605e-05, + "loss": 0.8754, + "step": 13782 + }, + { + "epoch": 2.4538817663817665, + "grad_norm": 1.0130813121795654, + "learning_rate": 6.524322375806374e-05, + "loss": 0.7845, + "step": 13783 + }, + { + "epoch": 2.4540598290598292, + "grad_norm": 0.7254214882850647, + "learning_rate": 6.52300993006883e-05, + "loss": 0.4665, + "step": 13784 + }, + { + "epoch": 2.4542378917378915, + "grad_norm": 0.7874964475631714, + "learning_rate": 6.521697552455683e-05, + "loss": 0.8535, + "step": 13785 + }, + { + "epoch": 2.4544159544159543, + "grad_norm": 0.8275010585784912, + "learning_rate": 6.520385242992644e-05, + "loss": 0.7744, + "step": 13786 + }, + { + "epoch": 2.454594017094017, + "grad_norm": 0.7972453236579895, + "learning_rate": 6.519073001705431e-05, + "loss": 0.9494, + "step": 13787 + }, + { + "epoch": 2.45477207977208, + "grad_norm": 0.8763988018035889, + "learning_rate": 6.517760828619748e-05, + "loss": 0.8043, + "step": 13788 + }, + { + "epoch": 2.4549501424501425, + "grad_norm": 0.7948910593986511, + "learning_rate": 6.516448723761315e-05, + "loss": 0.7218, + "step": 13789 + }, + { + "epoch": 2.4551282051282053, + "grad_norm": 0.9416671395301819, + "learning_rate": 6.515136687155825e-05, + "loss": 0.7866, + "step": 13790 + }, + { + "epoch": 2.455306267806268, + "grad_norm": 0.8702704906463623, + "learning_rate": 6.513824718828999e-05, + "loss": 1.1579, + "step": 13791 + }, + { + "epoch": 2.4554843304843303, + "grad_norm": 0.8148752450942993, + "learning_rate": 6.51251281880653e-05, + "loss": 0.897, + "step": 13792 + }, + { + "epoch": 
2.455662393162393, + "grad_norm": 0.8088299036026001, + "learning_rate": 6.511200987114132e-05, + "loss": 0.939, + "step": 13793 + }, + { + "epoch": 2.455840455840456, + "grad_norm": 0.9836809635162354, + "learning_rate": 6.509889223777499e-05, + "loss": 0.8841, + "step": 13794 + }, + { + "epoch": 2.4560185185185186, + "grad_norm": 0.7677251696586609, + "learning_rate": 6.508577528822342e-05, + "loss": 0.7816, + "step": 13795 + }, + { + "epoch": 2.4561965811965814, + "grad_norm": 0.835421085357666, + "learning_rate": 6.507265902274351e-05, + "loss": 0.9302, + "step": 13796 + }, + { + "epoch": 2.4563746438746437, + "grad_norm": 0.8892473578453064, + "learning_rate": 6.50595434415923e-05, + "loss": 0.8281, + "step": 13797 + }, + { + "epoch": 2.4565527065527064, + "grad_norm": 0.810459315776825, + "learning_rate": 6.504642854502676e-05, + "loss": 0.81, + "step": 13798 + }, + { + "epoch": 2.456730769230769, + "grad_norm": 0.9277065992355347, + "learning_rate": 6.503331433330386e-05, + "loss": 0.7956, + "step": 13799 + }, + { + "epoch": 2.456908831908832, + "grad_norm": 0.861725389957428, + "learning_rate": 6.502020080668051e-05, + "loss": 0.7788, + "step": 13800 + }, + { + "epoch": 2.4570868945868947, + "grad_norm": 1.0000818967819214, + "learning_rate": 6.500708796541366e-05, + "loss": 0.9197, + "step": 13801 + }, + { + "epoch": 2.4572649572649574, + "grad_norm": 0.920998215675354, + "learning_rate": 6.499397580976024e-05, + "loss": 0.7816, + "step": 13802 + }, + { + "epoch": 2.45744301994302, + "grad_norm": 0.7574821710586548, + "learning_rate": 6.498086433997715e-05, + "loss": 0.8982, + "step": 13803 + }, + { + "epoch": 2.4576210826210825, + "grad_norm": 1.026700496673584, + "learning_rate": 6.496775355632125e-05, + "loss": 1.131, + "step": 13804 + }, + { + "epoch": 2.4577991452991452, + "grad_norm": 0.7532633543014526, + "learning_rate": 6.495464345904945e-05, + "loss": 0.7998, + "step": 13805 + }, + { + "epoch": 2.457977207977208, + "grad_norm": 
0.7380105257034302, + "learning_rate": 6.494153404841865e-05, + "loss": 0.7656, + "step": 13806 + }, + { + "epoch": 2.4581552706552707, + "grad_norm": 0.7933080792427063, + "learning_rate": 6.492842532468561e-05, + "loss": 0.7419, + "step": 13807 + }, + { + "epoch": 2.4583333333333335, + "grad_norm": 0.7731907963752747, + "learning_rate": 6.491531728810724e-05, + "loss": 0.8334, + "step": 13808 + }, + { + "epoch": 2.458511396011396, + "grad_norm": 0.7368177771568298, + "learning_rate": 6.490220993894035e-05, + "loss": 0.6184, + "step": 13809 + }, + { + "epoch": 2.4586894586894585, + "grad_norm": 0.8381120562553406, + "learning_rate": 6.488910327744178e-05, + "loss": 0.7875, + "step": 13810 + }, + { + "epoch": 2.4588675213675213, + "grad_norm": 0.910142183303833, + "learning_rate": 6.487599730386824e-05, + "loss": 0.8216, + "step": 13811 + }, + { + "epoch": 2.459045584045584, + "grad_norm": 0.9005017876625061, + "learning_rate": 6.48628920184766e-05, + "loss": 0.8928, + "step": 13812 + }, + { + "epoch": 2.459223646723647, + "grad_norm": 0.8437321782112122, + "learning_rate": 6.484978742152358e-05, + "loss": 0.9243, + "step": 13813 + }, + { + "epoch": 2.4594017094017095, + "grad_norm": 0.9145610928535461, + "learning_rate": 6.483668351326599e-05, + "loss": 0.9759, + "step": 13814 + }, + { + "epoch": 2.4595797720797723, + "grad_norm": 0.8391930460929871, + "learning_rate": 6.48235802939605e-05, + "loss": 0.8021, + "step": 13815 + }, + { + "epoch": 2.4597578347578346, + "grad_norm": 0.8035653233528137, + "learning_rate": 6.481047776386394e-05, + "loss": 0.8622, + "step": 13816 + }, + { + "epoch": 2.4599358974358974, + "grad_norm": 0.7238573431968689, + "learning_rate": 6.479737592323291e-05, + "loss": 0.6864, + "step": 13817 + }, + { + "epoch": 2.46011396011396, + "grad_norm": 0.8629193902015686, + "learning_rate": 6.47842747723242e-05, + "loss": 0.6107, + "step": 13818 + }, + { + "epoch": 2.460292022792023, + "grad_norm": 0.9015333652496338, + "learning_rate": 
6.477117431139444e-05, + "loss": 0.845, + "step": 13819 + }, + { + "epoch": 2.4604700854700856, + "grad_norm": 0.8544989824295044, + "learning_rate": 6.47580745407004e-05, + "loss": 0.7103, + "step": 13820 + }, + { + "epoch": 2.460648148148148, + "grad_norm": 0.7665401101112366, + "learning_rate": 6.474497546049862e-05, + "loss": 0.6864, + "step": 13821 + }, + { + "epoch": 2.4608262108262107, + "grad_norm": 0.8640002012252808, + "learning_rate": 6.473187707104584e-05, + "loss": 0.953, + "step": 13822 + }, + { + "epoch": 2.4610042735042734, + "grad_norm": 0.8563477396965027, + "learning_rate": 6.471877937259864e-05, + "loss": 0.7776, + "step": 13823 + }, + { + "epoch": 2.461182336182336, + "grad_norm": 0.8089157938957214, + "learning_rate": 6.470568236541371e-05, + "loss": 0.8273, + "step": 13824 + }, + { + "epoch": 2.461360398860399, + "grad_norm": 0.8710005283355713, + "learning_rate": 6.469258604974757e-05, + "loss": 0.8827, + "step": 13825 + }, + { + "epoch": 2.4615384615384617, + "grad_norm": 0.7780489325523376, + "learning_rate": 6.467949042585688e-05, + "loss": 0.8035, + "step": 13826 + }, + { + "epoch": 2.4617165242165244, + "grad_norm": 1.010976791381836, + "learning_rate": 6.466639549399822e-05, + "loss": 0.7442, + "step": 13827 + }, + { + "epoch": 2.4618945868945867, + "grad_norm": 0.714077889919281, + "learning_rate": 6.465330125442812e-05, + "loss": 0.694, + "step": 13828 + }, + { + "epoch": 2.4620726495726495, + "grad_norm": 0.7408512234687805, + "learning_rate": 6.464020770740316e-05, + "loss": 0.6709, + "step": 13829 + }, + { + "epoch": 2.4622507122507122, + "grad_norm": 0.8433945178985596, + "learning_rate": 6.462711485317987e-05, + "loss": 0.9127, + "step": 13830 + }, + { + "epoch": 2.462428774928775, + "grad_norm": 1.031745195388794, + "learning_rate": 6.461402269201481e-05, + "loss": 0.9105, + "step": 13831 + }, + { + "epoch": 2.4626068376068377, + "grad_norm": 0.8884360790252686, + "learning_rate": 6.460093122416444e-05, + "loss": 0.9354, + 
"step": 13832 + }, + { + "epoch": 2.4627849002849, + "grad_norm": 0.8466372489929199, + "learning_rate": 6.45878404498853e-05, + "loss": 0.9554, + "step": 13833 + }, + { + "epoch": 2.462962962962963, + "grad_norm": 0.9026118516921997, + "learning_rate": 6.457475036943386e-05, + "loss": 0.928, + "step": 13834 + }, + { + "epoch": 2.4631410256410255, + "grad_norm": 0.9034590721130371, + "learning_rate": 6.456166098306661e-05, + "loss": 0.7694, + "step": 13835 + }, + { + "epoch": 2.4633190883190883, + "grad_norm": 0.8369483947753906, + "learning_rate": 6.454857229103998e-05, + "loss": 0.928, + "step": 13836 + }, + { + "epoch": 2.463497150997151, + "grad_norm": 0.8670645356178284, + "learning_rate": 6.453548429361045e-05, + "loss": 0.971, + "step": 13837 + }, + { + "epoch": 2.463675213675214, + "grad_norm": 0.8415539860725403, + "learning_rate": 6.452239699103442e-05, + "loss": 0.8461, + "step": 13838 + }, + { + "epoch": 2.4638532763532766, + "grad_norm": 0.7434490323066711, + "learning_rate": 6.450931038356834e-05, + "loss": 0.6677, + "step": 13839 + }, + { + "epoch": 2.464031339031339, + "grad_norm": 0.8113850355148315, + "learning_rate": 6.449622447146855e-05, + "loss": 0.8644, + "step": 13840 + }, + { + "epoch": 2.4642094017094016, + "grad_norm": 0.7424083352088928, + "learning_rate": 6.448313925499154e-05, + "loss": 0.7469, + "step": 13841 + }, + { + "epoch": 2.4643874643874644, + "grad_norm": 1.006949782371521, + "learning_rate": 6.44700547343936e-05, + "loss": 0.8426, + "step": 13842 + }, + { + "epoch": 2.464565527065527, + "grad_norm": 1.0643857717514038, + "learning_rate": 6.445697090993117e-05, + "loss": 0.7793, + "step": 13843 + }, + { + "epoch": 2.46474358974359, + "grad_norm": 0.8716835975646973, + "learning_rate": 6.444388778186051e-05, + "loss": 0.9684, + "step": 13844 + }, + { + "epoch": 2.464921652421652, + "grad_norm": 0.8677120804786682, + "learning_rate": 6.443080535043802e-05, + "loss": 0.9772, + "step": 13845 + }, + { + "epoch": 2.465099715099715, 
+ "grad_norm": 0.8955141305923462, + "learning_rate": 6.441772361592005e-05, + "loss": 0.881, + "step": 13846 + }, + { + "epoch": 2.4652777777777777, + "grad_norm": 0.806794285774231, + "learning_rate": 6.440464257856283e-05, + "loss": 0.6393, + "step": 13847 + }, + { + "epoch": 2.4654558404558404, + "grad_norm": 0.8438352346420288, + "learning_rate": 6.439156223862272e-05, + "loss": 0.8305, + "step": 13848 + }, + { + "epoch": 2.465633903133903, + "grad_norm": 0.828960120677948, + "learning_rate": 6.437848259635594e-05, + "loss": 0.8564, + "step": 13849 + }, + { + "epoch": 2.465811965811966, + "grad_norm": 0.790199339389801, + "learning_rate": 6.436540365201886e-05, + "loss": 0.6573, + "step": 13850 + }, + { + "epoch": 2.4659900284900287, + "grad_norm": 0.8476296663284302, + "learning_rate": 6.435232540586763e-05, + "loss": 0.6979, + "step": 13851 + }, + { + "epoch": 2.466168091168091, + "grad_norm": 0.6880464553833008, + "learning_rate": 6.433924785815857e-05, + "loss": 0.6925, + "step": 13852 + }, + { + "epoch": 2.4663461538461537, + "grad_norm": 0.9369434118270874, + "learning_rate": 6.432617100914782e-05, + "loss": 0.6891, + "step": 13853 + }, + { + "epoch": 2.4665242165242165, + "grad_norm": 0.8522159457206726, + "learning_rate": 6.431309485909166e-05, + "loss": 1.0786, + "step": 13854 + }, + { + "epoch": 2.4667022792022792, + "grad_norm": 0.8479002714157104, + "learning_rate": 6.430001940824625e-05, + "loss": 0.9293, + "step": 13855 + }, + { + "epoch": 2.466880341880342, + "grad_norm": 0.8382098078727722, + "learning_rate": 6.428694465686787e-05, + "loss": 1.1279, + "step": 13856 + }, + { + "epoch": 2.4670584045584047, + "grad_norm": 0.9641128182411194, + "learning_rate": 6.427387060521255e-05, + "loss": 0.7938, + "step": 13857 + }, + { + "epoch": 2.467236467236467, + "grad_norm": 0.8458924293518066, + "learning_rate": 6.426079725353656e-05, + "loss": 0.7804, + "step": 13858 + }, + { + "epoch": 2.46741452991453, + "grad_norm": 0.8023849725723267, + 
"learning_rate": 6.424772460209597e-05, + "loss": 0.718, + "step": 13859 + }, + { + "epoch": 2.4675925925925926, + "grad_norm": 0.800864040851593, + "learning_rate": 6.423465265114699e-05, + "loss": 0.7627, + "step": 13860 + }, + { + "epoch": 2.4677706552706553, + "grad_norm": 0.9159586429595947, + "learning_rate": 6.422158140094566e-05, + "loss": 0.8348, + "step": 13861 + }, + { + "epoch": 2.467948717948718, + "grad_norm": 0.7982872724533081, + "learning_rate": 6.420851085174817e-05, + "loss": 0.8092, + "step": 13862 + }, + { + "epoch": 2.468126780626781, + "grad_norm": 0.8847397565841675, + "learning_rate": 6.41954410038105e-05, + "loss": 0.9165, + "step": 13863 + }, + { + "epoch": 2.468304843304843, + "grad_norm": 0.7885190844535828, + "learning_rate": 6.418237185738882e-05, + "loss": 0.733, + "step": 13864 + }, + { + "epoch": 2.468482905982906, + "grad_norm": 0.902428150177002, + "learning_rate": 6.416930341273914e-05, + "loss": 0.8345, + "step": 13865 + }, + { + "epoch": 2.4686609686609686, + "grad_norm": 0.9344130158424377, + "learning_rate": 6.415623567011751e-05, + "loss": 0.8651, + "step": 13866 + }, + { + "epoch": 2.4688390313390314, + "grad_norm": 0.840679407119751, + "learning_rate": 6.414316862978003e-05, + "loss": 0.7534, + "step": 13867 + }, + { + "epoch": 2.469017094017094, + "grad_norm": 0.8799613118171692, + "learning_rate": 6.413010229198263e-05, + "loss": 0.9335, + "step": 13868 + }, + { + "epoch": 2.469195156695157, + "grad_norm": 1.1401816606521606, + "learning_rate": 6.411703665698142e-05, + "loss": 0.8902, + "step": 13869 + }, + { + "epoch": 2.469373219373219, + "grad_norm": 0.9733933210372925, + "learning_rate": 6.410397172503227e-05, + "loss": 0.9288, + "step": 13870 + }, + { + "epoch": 2.469551282051282, + "grad_norm": 0.9260223507881165, + "learning_rate": 6.409090749639128e-05, + "loss": 0.9664, + "step": 13871 + }, + { + "epoch": 2.4697293447293447, + "grad_norm": 1.0200423002243042, + "learning_rate": 6.407784397131433e-05, + "loss": 
0.9095, + "step": 13872 + }, + { + "epoch": 2.4699074074074074, + "grad_norm": 0.8015561103820801, + "learning_rate": 6.406478115005743e-05, + "loss": 0.8015, + "step": 13873 + }, + { + "epoch": 2.47008547008547, + "grad_norm": 0.8035915493965149, + "learning_rate": 6.40517190328765e-05, + "loss": 0.9275, + "step": 13874 + }, + { + "epoch": 2.470263532763533, + "grad_norm": 0.8248090147972107, + "learning_rate": 6.403865762002743e-05, + "loss": 0.8736, + "step": 13875 + }, + { + "epoch": 2.4704415954415953, + "grad_norm": 0.9310920834541321, + "learning_rate": 6.402559691176616e-05, + "loss": 0.794, + "step": 13876 + }, + { + "epoch": 2.470619658119658, + "grad_norm": 0.7796428799629211, + "learning_rate": 6.401253690834863e-05, + "loss": 0.832, + "step": 13877 + }, + { + "epoch": 2.4707977207977208, + "grad_norm": 0.9046199321746826, + "learning_rate": 6.399947761003063e-05, + "loss": 0.9988, + "step": 13878 + }, + { + "epoch": 2.4709757834757835, + "grad_norm": 0.8970019221305847, + "learning_rate": 6.398641901706812e-05, + "loss": 0.6805, + "step": 13879 + }, + { + "epoch": 2.4711538461538463, + "grad_norm": 0.935786247253418, + "learning_rate": 6.397336112971688e-05, + "loss": 0.9826, + "step": 13880 + }, + { + "epoch": 2.471331908831909, + "grad_norm": 0.8497617244720459, + "learning_rate": 6.396030394823285e-05, + "loss": 0.8842, + "step": 13881 + }, + { + "epoch": 2.4715099715099713, + "grad_norm": 0.9159898161888123, + "learning_rate": 6.394724747287173e-05, + "loss": 0.8592, + "step": 13882 + }, + { + "epoch": 2.471688034188034, + "grad_norm": 0.891951858997345, + "learning_rate": 6.393419170388943e-05, + "loss": 0.8197, + "step": 13883 + }, + { + "epoch": 2.471866096866097, + "grad_norm": 0.9038097858428955, + "learning_rate": 6.392113664154172e-05, + "loss": 0.7535, + "step": 13884 + }, + { + "epoch": 2.4720441595441596, + "grad_norm": 0.8502489924430847, + "learning_rate": 6.390808228608438e-05, + "loss": 0.9183, + "step": 13885 + }, + { + "epoch": 
2.4722222222222223, + "grad_norm": 0.7442654371261597, + "learning_rate": 6.389502863777323e-05, + "loss": 0.7741, + "step": 13886 + }, + { + "epoch": 2.472400284900285, + "grad_norm": 0.8811324238777161, + "learning_rate": 6.388197569686395e-05, + "loss": 0.7261, + "step": 13887 + }, + { + "epoch": 2.472578347578348, + "grad_norm": 0.9487552046775818, + "learning_rate": 6.386892346361239e-05, + "loss": 0.779, + "step": 13888 + }, + { + "epoch": 2.47275641025641, + "grad_norm": 0.8138917088508606, + "learning_rate": 6.385587193827416e-05, + "loss": 0.915, + "step": 13889 + }, + { + "epoch": 2.472934472934473, + "grad_norm": 0.7842695713043213, + "learning_rate": 6.384282112110506e-05, + "loss": 0.9458, + "step": 13890 + }, + { + "epoch": 2.4731125356125356, + "grad_norm": 0.7852116823196411, + "learning_rate": 6.382977101236074e-05, + "loss": 0.9515, + "step": 13891 + }, + { + "epoch": 2.4732905982905984, + "grad_norm": 0.8429296016693115, + "learning_rate": 6.381672161229698e-05, + "loss": 0.9466, + "step": 13892 + }, + { + "epoch": 2.473468660968661, + "grad_norm": 0.8713327050209045, + "learning_rate": 6.380367292116933e-05, + "loss": 0.7552, + "step": 13893 + }, + { + "epoch": 2.4736467236467234, + "grad_norm": 0.8153441548347473, + "learning_rate": 6.379062493923355e-05, + "loss": 0.7833, + "step": 13894 + }, + { + "epoch": 2.473824786324786, + "grad_norm": 0.8283601999282837, + "learning_rate": 6.377757766674526e-05, + "loss": 0.8986, + "step": 13895 + }, + { + "epoch": 2.474002849002849, + "grad_norm": 0.8116408586502075, + "learning_rate": 6.37645311039601e-05, + "loss": 0.8549, + "step": 13896 + }, + { + "epoch": 2.4741809116809117, + "grad_norm": 0.7999116778373718, + "learning_rate": 6.375148525113365e-05, + "loss": 0.8082, + "step": 13897 + }, + { + "epoch": 2.4743589743589745, + "grad_norm": 0.7307565808296204, + "learning_rate": 6.373844010852159e-05, + "loss": 0.7946, + "step": 13898 + }, + { + "epoch": 2.474537037037037, + "grad_norm": 
0.7519806027412415, + "learning_rate": 6.372539567637941e-05, + "loss": 0.6511, + "step": 13899 + }, + { + "epoch": 2.4747150997151, + "grad_norm": 0.8571820259094238, + "learning_rate": 6.371235195496279e-05, + "loss": 0.8266, + "step": 13900 + }, + { + "epoch": 2.4748931623931623, + "grad_norm": 0.8118062019348145, + "learning_rate": 6.369930894452723e-05, + "loss": 0.8573, + "step": 13901 + }, + { + "epoch": 2.475071225071225, + "grad_norm": 0.8729892373085022, + "learning_rate": 6.368626664532833e-05, + "loss": 0.812, + "step": 13902 + }, + { + "epoch": 2.4752492877492878, + "grad_norm": 0.7663209438323975, + "learning_rate": 6.367322505762157e-05, + "loss": 0.6648, + "step": 13903 + }, + { + "epoch": 2.4754273504273505, + "grad_norm": 0.7913058996200562, + "learning_rate": 6.366018418166251e-05, + "loss": 0.7486, + "step": 13904 + }, + { + "epoch": 2.4756054131054133, + "grad_norm": 0.7714928984642029, + "learning_rate": 6.364714401770666e-05, + "loss": 0.9134, + "step": 13905 + }, + { + "epoch": 2.4757834757834756, + "grad_norm": 0.8226378560066223, + "learning_rate": 6.363410456600949e-05, + "loss": 0.903, + "step": 13906 + }, + { + "epoch": 2.4759615384615383, + "grad_norm": 0.8643919825553894, + "learning_rate": 6.362106582682653e-05, + "loss": 0.9068, + "step": 13907 + }, + { + "epoch": 2.476139601139601, + "grad_norm": 0.8390868306159973, + "learning_rate": 6.360802780041317e-05, + "loss": 0.8938, + "step": 13908 + }, + { + "epoch": 2.476317663817664, + "grad_norm": 0.9549261927604675, + "learning_rate": 6.359499048702495e-05, + "loss": 0.8431, + "step": 13909 + }, + { + "epoch": 2.4764957264957266, + "grad_norm": 0.7509152889251709, + "learning_rate": 6.358195388691726e-05, + "loss": 0.8661, + "step": 13910 + }, + { + "epoch": 2.4766737891737893, + "grad_norm": 0.9447416067123413, + "learning_rate": 6.356891800034552e-05, + "loss": 0.8766, + "step": 13911 + }, + { + "epoch": 2.476851851851852, + "grad_norm": 0.8972395062446594, + "learning_rate": 
6.355588282756515e-05, + "loss": 0.8517, + "step": 13912 + }, + { + "epoch": 2.4770299145299144, + "grad_norm": 0.8645047545433044, + "learning_rate": 6.354284836883156e-05, + "loss": 0.8188, + "step": 13913 + }, + { + "epoch": 2.477207977207977, + "grad_norm": 0.7939230799674988, + "learning_rate": 6.35298146244001e-05, + "loss": 0.778, + "step": 13914 + }, + { + "epoch": 2.47738603988604, + "grad_norm": 0.7714613676071167, + "learning_rate": 6.351678159452618e-05, + "loss": 0.7964, + "step": 13915 + }, + { + "epoch": 2.4775641025641026, + "grad_norm": 0.9293754696846008, + "learning_rate": 6.350374927946512e-05, + "loss": 0.7965, + "step": 13916 + }, + { + "epoch": 2.4777421652421654, + "grad_norm": 0.8025050759315491, + "learning_rate": 6.349071767947233e-05, + "loss": 0.8222, + "step": 13917 + }, + { + "epoch": 2.4779202279202277, + "grad_norm": 0.7790399193763733, + "learning_rate": 6.347768679480304e-05, + "loss": 1.0294, + "step": 13918 + }, + { + "epoch": 2.4780982905982905, + "grad_norm": 1.0063512325286865, + "learning_rate": 6.346465662571261e-05, + "loss": 0.953, + "step": 13919 + }, + { + "epoch": 2.478276353276353, + "grad_norm": 0.8742708563804626, + "learning_rate": 6.345162717245634e-05, + "loss": 0.7119, + "step": 13920 + }, + { + "epoch": 2.478454415954416, + "grad_norm": 0.760497510433197, + "learning_rate": 6.343859843528955e-05, + "loss": 0.7446, + "step": 13921 + }, + { + "epoch": 2.4786324786324787, + "grad_norm": 0.7635362148284912, + "learning_rate": 6.342557041446743e-05, + "loss": 0.8286, + "step": 13922 + }, + { + "epoch": 2.4788105413105415, + "grad_norm": 0.7876720428466797, + "learning_rate": 6.341254311024532e-05, + "loss": 0.8007, + "step": 13923 + }, + { + "epoch": 2.478988603988604, + "grad_norm": 0.8144401907920837, + "learning_rate": 6.339951652287839e-05, + "loss": 0.9177, + "step": 13924 + }, + { + "epoch": 2.4791666666666665, + "grad_norm": 0.7400189638137817, + "learning_rate": 6.338649065262189e-05, + "loss": 0.8935, + 
"step": 13925 + }, + { + "epoch": 2.4793447293447293, + "grad_norm": 0.8412175178527832, + "learning_rate": 6.337346549973106e-05, + "loss": 0.6929, + "step": 13926 + }, + { + "epoch": 2.479522792022792, + "grad_norm": 1.0156967639923096, + "learning_rate": 6.336044106446108e-05, + "loss": 0.9843, + "step": 13927 + }, + { + "epoch": 2.4797008547008548, + "grad_norm": 0.8556809425354004, + "learning_rate": 6.33474173470672e-05, + "loss": 0.81, + "step": 13928 + }, + { + "epoch": 2.4798789173789175, + "grad_norm": 0.8103616237640381, + "learning_rate": 6.333439434780448e-05, + "loss": 0.925, + "step": 13929 + }, + { + "epoch": 2.48005698005698, + "grad_norm": 0.9460168480873108, + "learning_rate": 6.332137206692817e-05, + "loss": 0.7059, + "step": 13930 + }, + { + "epoch": 2.4802350427350426, + "grad_norm": 0.847226619720459, + "learning_rate": 6.330835050469334e-05, + "loss": 1.0139, + "step": 13931 + }, + { + "epoch": 2.4804131054131053, + "grad_norm": 0.7639240622520447, + "learning_rate": 6.329532966135523e-05, + "loss": 0.8141, + "step": 13932 + }, + { + "epoch": 2.480591168091168, + "grad_norm": 0.9273494482040405, + "learning_rate": 6.328230953716883e-05, + "loss": 0.9864, + "step": 13933 + }, + { + "epoch": 2.480769230769231, + "grad_norm": 0.7709840536117554, + "learning_rate": 6.326929013238934e-05, + "loss": 0.7275, + "step": 13934 + }, + { + "epoch": 2.4809472934472936, + "grad_norm": 0.755933940410614, + "learning_rate": 6.325627144727177e-05, + "loss": 0.7785, + "step": 13935 + }, + { + "epoch": 2.4811253561253563, + "grad_norm": 0.9058536291122437, + "learning_rate": 6.324325348207125e-05, + "loss": 0.9694, + "step": 13936 + }, + { + "epoch": 2.4813034188034186, + "grad_norm": 0.8490056395530701, + "learning_rate": 6.323023623704282e-05, + "loss": 0.8882, + "step": 13937 + }, + { + "epoch": 2.4814814814814814, + "grad_norm": 0.9559429883956909, + "learning_rate": 6.321721971244155e-05, + "loss": 0.9243, + "step": 13938 + }, + { + "epoch": 
2.481659544159544, + "grad_norm": 0.8607096076011658, + "learning_rate": 6.320420390852242e-05, + "loss": 0.9281, + "step": 13939 + }, + { + "epoch": 2.481837606837607, + "grad_norm": 1.1263439655303955, + "learning_rate": 6.319118882554049e-05, + "loss": 0.8772, + "step": 13940 + }, + { + "epoch": 2.4820156695156697, + "grad_norm": 0.9691354632377625, + "learning_rate": 6.317817446375074e-05, + "loss": 0.9349, + "step": 13941 + }, + { + "epoch": 2.482193732193732, + "grad_norm": 0.8636828064918518, + "learning_rate": 6.31651608234082e-05, + "loss": 0.9021, + "step": 13942 + }, + { + "epoch": 2.4823717948717947, + "grad_norm": 0.8405864238739014, + "learning_rate": 6.315214790476777e-05, + "loss": 0.8917, + "step": 13943 + }, + { + "epoch": 2.4825498575498575, + "grad_norm": 0.8082821369171143, + "learning_rate": 6.313913570808448e-05, + "loss": 0.914, + "step": 13944 + }, + { + "epoch": 2.48272792022792, + "grad_norm": 0.8734335899353027, + "learning_rate": 6.312612423361328e-05, + "loss": 0.6885, + "step": 13945 + }, + { + "epoch": 2.482905982905983, + "grad_norm": 0.943190336227417, + "learning_rate": 6.311311348160904e-05, + "loss": 0.894, + "step": 13946 + }, + { + "epoch": 2.4830840455840457, + "grad_norm": 0.9425446391105652, + "learning_rate": 6.310010345232673e-05, + "loss": 0.7893, + "step": 13947 + }, + { + "epoch": 2.4832621082621085, + "grad_norm": 0.9668664932250977, + "learning_rate": 6.308709414602123e-05, + "loss": 0.9715, + "step": 13948 + }, + { + "epoch": 2.4834401709401708, + "grad_norm": 0.7771579027175903, + "learning_rate": 6.307408556294747e-05, + "loss": 0.872, + "step": 13949 + }, + { + "epoch": 2.4836182336182335, + "grad_norm": 0.8183084726333618, + "learning_rate": 6.306107770336025e-05, + "loss": 1.0008, + "step": 13950 + }, + { + "epoch": 2.4837962962962963, + "grad_norm": 0.8284399509429932, + "learning_rate": 6.304807056751452e-05, + "loss": 0.687, + "step": 13951 + }, + { + "epoch": 2.483974358974359, + "grad_norm": 
0.8663082718849182, + "learning_rate": 6.303506415566504e-05, + "loss": 0.8792, + "step": 13952 + }, + { + "epoch": 2.484152421652422, + "grad_norm": 0.8272001147270203, + "learning_rate": 6.302205846806675e-05, + "loss": 0.9255, + "step": 13953 + }, + { + "epoch": 2.484330484330484, + "grad_norm": 0.9398671388626099, + "learning_rate": 6.300905350497437e-05, + "loss": 0.9605, + "step": 13954 + }, + { + "epoch": 2.484508547008547, + "grad_norm": 0.7368931770324707, + "learning_rate": 6.299604926664276e-05, + "loss": 0.6734, + "step": 13955 + }, + { + "epoch": 2.4846866096866096, + "grad_norm": 0.8315541744232178, + "learning_rate": 6.298304575332668e-05, + "loss": 0.9578, + "step": 13956 + }, + { + "epoch": 2.4848646723646723, + "grad_norm": 0.8656954169273376, + "learning_rate": 6.297004296528095e-05, + "loss": 0.8897, + "step": 13957 + }, + { + "epoch": 2.485042735042735, + "grad_norm": 0.9047118425369263, + "learning_rate": 6.295704090276026e-05, + "loss": 1.0308, + "step": 13958 + }, + { + "epoch": 2.485220797720798, + "grad_norm": 0.8771422505378723, + "learning_rate": 6.294403956601946e-05, + "loss": 0.9505, + "step": 13959 + }, + { + "epoch": 2.4853988603988606, + "grad_norm": 0.8935427069664001, + "learning_rate": 6.293103895531319e-05, + "loss": 1.0485, + "step": 13960 + }, + { + "epoch": 2.485576923076923, + "grad_norm": 0.7345624566078186, + "learning_rate": 6.291803907089621e-05, + "loss": 0.7485, + "step": 13961 + }, + { + "epoch": 2.4857549857549857, + "grad_norm": 0.6415224075317383, + "learning_rate": 6.290503991302324e-05, + "loss": 0.6237, + "step": 13962 + }, + { + "epoch": 2.4859330484330484, + "grad_norm": 0.8547754883766174, + "learning_rate": 6.289204148194896e-05, + "loss": 0.9152, + "step": 13963 + }, + { + "epoch": 2.486111111111111, + "grad_norm": 0.7434722185134888, + "learning_rate": 6.2879043777928e-05, + "loss": 0.8499, + "step": 13964 + }, + { + "epoch": 2.486289173789174, + "grad_norm": 0.8609980940818787, + "learning_rate": 
6.286604680121509e-05, + "loss": 0.8101, + "step": 13965 + }, + { + "epoch": 2.486467236467236, + "grad_norm": 0.8709290027618408, + "learning_rate": 6.285305055206486e-05, + "loss": 0.8335, + "step": 13966 + }, + { + "epoch": 2.486645299145299, + "grad_norm": 0.7758293747901917, + "learning_rate": 6.284005503073191e-05, + "loss": 0.8408, + "step": 13967 + }, + { + "epoch": 2.4868233618233617, + "grad_norm": 0.9778353571891785, + "learning_rate": 6.282706023747094e-05, + "loss": 0.8595, + "step": 13968 + }, + { + "epoch": 2.4870014245014245, + "grad_norm": 0.8438369631767273, + "learning_rate": 6.281406617253646e-05, + "loss": 0.8843, + "step": 13969 + }, + { + "epoch": 2.4871794871794872, + "grad_norm": 0.9704681634902954, + "learning_rate": 6.280107283618315e-05, + "loss": 0.62, + "step": 13970 + }, + { + "epoch": 2.48735754985755, + "grad_norm": 0.7774441242218018, + "learning_rate": 6.278808022866549e-05, + "loss": 0.9087, + "step": 13971 + }, + { + "epoch": 2.4875356125356127, + "grad_norm": 0.8387142419815063, + "learning_rate": 6.277508835023813e-05, + "loss": 0.854, + "step": 13972 + }, + { + "epoch": 2.487713675213675, + "grad_norm": 0.8483029007911682, + "learning_rate": 6.276209720115556e-05, + "loss": 0.8665, + "step": 13973 + }, + { + "epoch": 2.487891737891738, + "grad_norm": 0.8251432180404663, + "learning_rate": 6.274910678167239e-05, + "loss": 0.7816, + "step": 13974 + }, + { + "epoch": 2.4880698005698005, + "grad_norm": 0.8503836989402771, + "learning_rate": 6.273611709204304e-05, + "loss": 0.939, + "step": 13975 + }, + { + "epoch": 2.4882478632478633, + "grad_norm": 0.6545158624649048, + "learning_rate": 6.27231281325221e-05, + "loss": 0.564, + "step": 13976 + }, + { + "epoch": 2.488425925925926, + "grad_norm": 0.7353499531745911, + "learning_rate": 6.2710139903364e-05, + "loss": 0.7103, + "step": 13977 + }, + { + "epoch": 2.488603988603989, + "grad_norm": 0.9032405614852905, + "learning_rate": 6.269715240482327e-05, + "loss": 0.8802, + "step": 
13978 + }, + { + "epoch": 2.488782051282051, + "grad_norm": 0.8141019940376282, + "learning_rate": 6.268416563715434e-05, + "loss": 0.9836, + "step": 13979 + }, + { + "epoch": 2.488960113960114, + "grad_norm": 0.9087637066841125, + "learning_rate": 6.267117960061167e-05, + "loss": 0.8488, + "step": 13980 + }, + { + "epoch": 2.4891381766381766, + "grad_norm": 0.6649556756019592, + "learning_rate": 6.265819429544969e-05, + "loss": 0.604, + "step": 13981 + }, + { + "epoch": 2.4893162393162394, + "grad_norm": 0.7872918248176575, + "learning_rate": 6.264520972192283e-05, + "loss": 0.8537, + "step": 13982 + }, + { + "epoch": 2.489494301994302, + "grad_norm": 0.8821072578430176, + "learning_rate": 6.263222588028546e-05, + "loss": 0.7485, + "step": 13983 + }, + { + "epoch": 2.489672364672365, + "grad_norm": 0.8077933192253113, + "learning_rate": 6.2619242770792e-05, + "loss": 0.9174, + "step": 13984 + }, + { + "epoch": 2.489850427350427, + "grad_norm": 0.8087183237075806, + "learning_rate": 6.260626039369686e-05, + "loss": 0.9184, + "step": 13985 + }, + { + "epoch": 2.49002849002849, + "grad_norm": 0.8849205374717712, + "learning_rate": 6.259327874925434e-05, + "loss": 0.9095, + "step": 13986 + }, + { + "epoch": 2.4902065527065527, + "grad_norm": 0.7899976968765259, + "learning_rate": 6.258029783771884e-05, + "loss": 0.7286, + "step": 13987 + }, + { + "epoch": 2.4903846153846154, + "grad_norm": 0.7907543182373047, + "learning_rate": 6.256731765934464e-05, + "loss": 0.8433, + "step": 13988 + }, + { + "epoch": 2.490562678062678, + "grad_norm": 0.7794694304466248, + "learning_rate": 6.255433821438614e-05, + "loss": 0.7197, + "step": 13989 + }, + { + "epoch": 2.490740740740741, + "grad_norm": 0.8443161249160767, + "learning_rate": 6.254135950309753e-05, + "loss": 0.7982, + "step": 13990 + }, + { + "epoch": 2.4909188034188032, + "grad_norm": 0.972024142742157, + "learning_rate": 6.252838152573323e-05, + "loss": 0.9212, + "step": 13991 + }, + { + "epoch": 2.491096866096866, + 
"grad_norm": 0.8017764687538147, + "learning_rate": 6.25154042825474e-05, + "loss": 0.9544, + "step": 13992 + }, + { + "epoch": 2.4912749287749287, + "grad_norm": 0.8622884154319763, + "learning_rate": 6.250242777379442e-05, + "loss": 0.8411, + "step": 13993 + }, + { + "epoch": 2.4914529914529915, + "grad_norm": 0.7384446263313293, + "learning_rate": 6.248945199972842e-05, + "loss": 0.8357, + "step": 13994 + }, + { + "epoch": 2.4916310541310542, + "grad_norm": 0.7748960256576538, + "learning_rate": 6.247647696060372e-05, + "loss": 0.7739, + "step": 13995 + }, + { + "epoch": 2.491809116809117, + "grad_norm": 0.8295742273330688, + "learning_rate": 6.246350265667448e-05, + "loss": 0.8032, + "step": 13996 + }, + { + "epoch": 2.4919871794871793, + "grad_norm": 0.8604934811592102, + "learning_rate": 6.245052908819494e-05, + "loss": 0.8738, + "step": 13997 + }, + { + "epoch": 2.492165242165242, + "grad_norm": 0.8381406664848328, + "learning_rate": 6.243755625541926e-05, + "loss": 0.8351, + "step": 13998 + }, + { + "epoch": 2.492343304843305, + "grad_norm": 0.9238134026527405, + "learning_rate": 6.242458415860168e-05, + "loss": 0.9529, + "step": 13999 + }, + { + "epoch": 2.4925213675213675, + "grad_norm": 0.9234444499015808, + "learning_rate": 6.241161279799628e-05, + "loss": 1.1086, + "step": 14000 + }, + { + "epoch": 2.4926994301994303, + "grad_norm": 0.8056737780570984, + "learning_rate": 6.239864217385727e-05, + "loss": 0.7957, + "step": 14001 + }, + { + "epoch": 2.492877492877493, + "grad_norm": 0.7877696752548218, + "learning_rate": 6.238567228643872e-05, + "loss": 0.9577, + "step": 14002 + }, + { + "epoch": 2.4930555555555554, + "grad_norm": 0.7437340021133423, + "learning_rate": 6.237270313599479e-05, + "loss": 0.6171, + "step": 14003 + }, + { + "epoch": 2.493233618233618, + "grad_norm": 0.8503403067588806, + "learning_rate": 6.235973472277962e-05, + "loss": 0.7608, + "step": 14004 + }, + { + "epoch": 2.493411680911681, + "grad_norm": 0.8557562232017517, + 
"learning_rate": 6.234676704704722e-05, + "loss": 0.8414, + "step": 14005 + }, + { + "epoch": 2.4935897435897436, + "grad_norm": 0.9188289046287537, + "learning_rate": 6.233380010905174e-05, + "loss": 0.89, + "step": 14006 + }, + { + "epoch": 2.4937678062678064, + "grad_norm": 0.9433556199073792, + "learning_rate": 6.232083390904716e-05, + "loss": 0.9316, + "step": 14007 + }, + { + "epoch": 2.493945868945869, + "grad_norm": 0.9278882145881653, + "learning_rate": 6.230786844728759e-05, + "loss": 0.9211, + "step": 14008 + }, + { + "epoch": 2.494123931623932, + "grad_norm": 0.8365640640258789, + "learning_rate": 6.229490372402702e-05, + "loss": 0.9578, + "step": 14009 + }, + { + "epoch": 2.494301994301994, + "grad_norm": 0.7987647652626038, + "learning_rate": 6.228193973951953e-05, + "loss": 0.9279, + "step": 14010 + }, + { + "epoch": 2.494480056980057, + "grad_norm": 0.7707502841949463, + "learning_rate": 6.226897649401902e-05, + "loss": 0.8879, + "step": 14011 + }, + { + "epoch": 2.4946581196581197, + "grad_norm": 0.8623191118240356, + "learning_rate": 6.225601398777957e-05, + "loss": 0.7427, + "step": 14012 + }, + { + "epoch": 2.4948361823361824, + "grad_norm": 0.8470782041549683, + "learning_rate": 6.22430522210551e-05, + "loss": 0.9425, + "step": 14013 + }, + { + "epoch": 2.495014245014245, + "grad_norm": 0.9169524908065796, + "learning_rate": 6.223009119409963e-05, + "loss": 0.9595, + "step": 14014 + }, + { + "epoch": 2.4951923076923075, + "grad_norm": 0.8541738986968994, + "learning_rate": 6.221713090716701e-05, + "loss": 1.0726, + "step": 14015 + }, + { + "epoch": 2.4953703703703702, + "grad_norm": 0.8801444172859192, + "learning_rate": 6.220417136051126e-05, + "loss": 0.8323, + "step": 14016 + }, + { + "epoch": 2.495548433048433, + "grad_norm": 0.845448911190033, + "learning_rate": 6.219121255438624e-05, + "loss": 0.835, + "step": 14017 + }, + { + "epoch": 2.4957264957264957, + "grad_norm": 0.7653858661651611, + "learning_rate": 6.217825448904588e-05, + 
"loss": 0.7027, + "step": 14018 + }, + { + "epoch": 2.4959045584045585, + "grad_norm": 0.7779282927513123, + "learning_rate": 6.216529716474404e-05, + "loss": 0.7881, + "step": 14019 + }, + { + "epoch": 2.4960826210826212, + "grad_norm": 0.8739959597587585, + "learning_rate": 6.215234058173465e-05, + "loss": 0.9738, + "step": 14020 + }, + { + "epoch": 2.496260683760684, + "grad_norm": 0.8388087749481201, + "learning_rate": 6.213938474027148e-05, + "loss": 1.0128, + "step": 14021 + }, + { + "epoch": 2.4964387464387463, + "grad_norm": 0.8963341116905212, + "learning_rate": 6.212642964060843e-05, + "loss": 0.9669, + "step": 14022 + }, + { + "epoch": 2.496616809116809, + "grad_norm": 0.8959031701087952, + "learning_rate": 6.211347528299928e-05, + "loss": 0.9558, + "step": 14023 + }, + { + "epoch": 2.496794871794872, + "grad_norm": 0.8463472127914429, + "learning_rate": 6.210052166769791e-05, + "loss": 0.9835, + "step": 14024 + }, + { + "epoch": 2.4969729344729346, + "grad_norm": 0.7827564477920532, + "learning_rate": 6.208756879495812e-05, + "loss": 0.8411, + "step": 14025 + }, + { + "epoch": 2.4971509971509973, + "grad_norm": 0.8851028084754944, + "learning_rate": 6.207461666503363e-05, + "loss": 1.0409, + "step": 14026 + }, + { + "epoch": 2.4973290598290596, + "grad_norm": 1.0151652097702026, + "learning_rate": 6.206166527817825e-05, + "loss": 0.6671, + "step": 14027 + }, + { + "epoch": 2.4975071225071224, + "grad_norm": 0.7924346923828125, + "learning_rate": 6.204871463464572e-05, + "loss": 0.9971, + "step": 14028 + }, + { + "epoch": 2.497685185185185, + "grad_norm": 0.8524144887924194, + "learning_rate": 6.203576473468981e-05, + "loss": 0.9228, + "step": 14029 + }, + { + "epoch": 2.497863247863248, + "grad_norm": 0.7936401963233948, + "learning_rate": 6.20228155785642e-05, + "loss": 0.8128, + "step": 14030 + }, + { + "epoch": 2.4980413105413106, + "grad_norm": 1.0074050426483154, + "learning_rate": 6.200986716652267e-05, + "loss": 0.7846, + "step": 14031 + }, + { + 
"epoch": 2.4982193732193734, + "grad_norm": 0.7972239851951599, + "learning_rate": 6.199691949881882e-05, + "loss": 0.8689, + "step": 14032 + }, + { + "epoch": 2.498397435897436, + "grad_norm": 0.8810364007949829, + "learning_rate": 6.198397257570643e-05, + "loss": 0.7775, + "step": 14033 + }, + { + "epoch": 2.4985754985754984, + "grad_norm": 0.8819566965103149, + "learning_rate": 6.19710263974391e-05, + "loss": 0.8852, + "step": 14034 + }, + { + "epoch": 2.498753561253561, + "grad_norm": 0.8020595908164978, + "learning_rate": 6.195808096427054e-05, + "loss": 0.9691, + "step": 14035 + }, + { + "epoch": 2.498931623931624, + "grad_norm": 0.83958899974823, + "learning_rate": 6.194513627645433e-05, + "loss": 0.8072, + "step": 14036 + }, + { + "epoch": 2.4991096866096867, + "grad_norm": 0.7525333166122437, + "learning_rate": 6.193219233424414e-05, + "loss": 0.7839, + "step": 14037 + }, + { + "epoch": 2.4992877492877494, + "grad_norm": 0.8687964677810669, + "learning_rate": 6.191924913789353e-05, + "loss": 0.9512, + "step": 14038 + }, + { + "epoch": 2.4994658119658117, + "grad_norm": 0.9080697298049927, + "learning_rate": 6.190630668765617e-05, + "loss": 0.8635, + "step": 14039 + }, + { + "epoch": 2.4996438746438745, + "grad_norm": 0.8174137473106384, + "learning_rate": 6.189336498378557e-05, + "loss": 0.9034, + "step": 14040 + }, + { + "epoch": 2.4996438746438745, + "eval_loss": 1.1338438987731934, + "eval_runtime": 24.4013, + "eval_samples_per_second": 42.662, + "eval_steps_per_second": 21.351, + "step": 14040 + }, + { + "epoch": 2.4998219373219372, + "grad_norm": 0.9711320996284485, + "learning_rate": 6.188042402653536e-05, + "loss": 0.9892, + "step": 14041 + }, + { + "epoch": 2.5, + "grad_norm": 0.8726856112480164, + "learning_rate": 6.1867483816159e-05, + "loss": 0.7482, + "step": 14042 + }, + { + "epoch": 2.5001780626780628, + "grad_norm": 0.875801682472229, + "learning_rate": 6.18545443529101e-05, + "loss": 0.7599, + "step": 14043 + }, + { + "epoch": 
2.5003561253561255, + "grad_norm": 0.8867987990379333, + "learning_rate": 6.184160563704218e-05, + "loss": 0.805, + "step": 14044 + }, + { + "epoch": 2.5005341880341883, + "grad_norm": 0.8766322135925293, + "learning_rate": 6.18286676688087e-05, + "loss": 0.7343, + "step": 14045 + }, + { + "epoch": 2.5007122507122506, + "grad_norm": 0.8096646070480347, + "learning_rate": 6.181573044846323e-05, + "loss": 0.7957, + "step": 14046 + }, + { + "epoch": 2.5008903133903133, + "grad_norm": 1.0121821165084839, + "learning_rate": 6.180279397625917e-05, + "loss": 0.8775, + "step": 14047 + }, + { + "epoch": 2.501068376068376, + "grad_norm": 0.79291170835495, + "learning_rate": 6.178985825245003e-05, + "loss": 0.91, + "step": 14048 + }, + { + "epoch": 2.501246438746439, + "grad_norm": 0.83204185962677, + "learning_rate": 6.177692327728922e-05, + "loss": 0.799, + "step": 14049 + }, + { + "epoch": 2.5014245014245016, + "grad_norm": 0.8746328949928284, + "learning_rate": 6.176398905103023e-05, + "loss": 0.9595, + "step": 14050 + }, + { + "epoch": 2.501602564102564, + "grad_norm": 0.7665601968765259, + "learning_rate": 6.17510555739264e-05, + "loss": 0.7935, + "step": 14051 + }, + { + "epoch": 2.5017806267806266, + "grad_norm": 0.8761195540428162, + "learning_rate": 6.173812284623122e-05, + "loss": 1.017, + "step": 14052 + }, + { + "epoch": 2.5019586894586894, + "grad_norm": 0.8847656846046448, + "learning_rate": 6.172519086819802e-05, + "loss": 0.7684, + "step": 14053 + }, + { + "epoch": 2.502136752136752, + "grad_norm": 0.8320107460021973, + "learning_rate": 6.171225964008021e-05, + "loss": 0.8828, + "step": 14054 + }, + { + "epoch": 2.502314814814815, + "grad_norm": 1.0184354782104492, + "learning_rate": 6.169932916213111e-05, + "loss": 0.887, + "step": 14055 + }, + { + "epoch": 2.5024928774928776, + "grad_norm": 0.7870062589645386, + "learning_rate": 6.168639943460415e-05, + "loss": 0.8595, + "step": 14056 + }, + { + "epoch": 2.5026709401709404, + "grad_norm": 
0.8314430117607117, + "learning_rate": 6.167347045775254e-05, + "loss": 0.7822, + "step": 14057 + }, + { + "epoch": 2.5028490028490027, + "grad_norm": 0.7521854043006897, + "learning_rate": 6.166054223182968e-05, + "loss": 0.916, + "step": 14058 + }, + { + "epoch": 2.5030270655270654, + "grad_norm": 0.8240202069282532, + "learning_rate": 6.164761475708885e-05, + "loss": 0.9404, + "step": 14059 + }, + { + "epoch": 2.503205128205128, + "grad_norm": 0.8467113375663757, + "learning_rate": 6.163468803378338e-05, + "loss": 0.7554, + "step": 14060 + }, + { + "epoch": 2.503383190883191, + "grad_norm": 0.7639012336730957, + "learning_rate": 6.162176206216645e-05, + "loss": 0.8141, + "step": 14061 + }, + { + "epoch": 2.5035612535612537, + "grad_norm": 0.8578195571899414, + "learning_rate": 6.160883684249138e-05, + "loss": 0.821, + "step": 14062 + }, + { + "epoch": 2.503739316239316, + "grad_norm": 0.7478210926055908, + "learning_rate": 6.159591237501139e-05, + "loss": 0.5878, + "step": 14063 + }, + { + "epoch": 2.5039173789173788, + "grad_norm": 0.7936450242996216, + "learning_rate": 6.158298865997972e-05, + "loss": 0.9616, + "step": 14064 + }, + { + "epoch": 2.5040954415954415, + "grad_norm": 0.9196288585662842, + "learning_rate": 6.157006569764963e-05, + "loss": 0.8147, + "step": 14065 + }, + { + "epoch": 2.5042735042735043, + "grad_norm": 1.0488382577896118, + "learning_rate": 6.155714348827422e-05, + "loss": 0.7941, + "step": 14066 + }, + { + "epoch": 2.504451566951567, + "grad_norm": 0.9195658564567566, + "learning_rate": 6.154422203210676e-05, + "loss": 1.0186, + "step": 14067 + }, + { + "epoch": 2.5046296296296298, + "grad_norm": 0.9088640213012695, + "learning_rate": 6.153130132940037e-05, + "loss": 0.7611, + "step": 14068 + }, + { + "epoch": 2.5048076923076925, + "grad_norm": 0.8168773651123047, + "learning_rate": 6.151838138040821e-05, + "loss": 0.8466, + "step": 14069 + }, + { + "epoch": 2.504985754985755, + "grad_norm": 0.9976982474327087, + "learning_rate": 
6.150546218538342e-05, + "loss": 0.9438, + "step": 14070 + }, + { + "epoch": 2.5051638176638176, + "grad_norm": 0.9469537138938904, + "learning_rate": 6.149254374457917e-05, + "loss": 0.9485, + "step": 14071 + }, + { + "epoch": 2.5053418803418803, + "grad_norm": 0.8861194849014282, + "learning_rate": 6.147962605824851e-05, + "loss": 0.9676, + "step": 14072 + }, + { + "epoch": 2.505519943019943, + "grad_norm": 0.9008424878120422, + "learning_rate": 6.146670912664457e-05, + "loss": 0.7343, + "step": 14073 + }, + { + "epoch": 2.505698005698006, + "grad_norm": 0.8957796096801758, + "learning_rate": 6.145379295002038e-05, + "loss": 0.8538, + "step": 14074 + }, + { + "epoch": 2.505876068376068, + "grad_norm": 0.8739160895347595, + "learning_rate": 6.14408775286291e-05, + "loss": 0.9029, + "step": 14075 + }, + { + "epoch": 2.506054131054131, + "grad_norm": 0.7713274955749512, + "learning_rate": 6.142796286272368e-05, + "loss": 0.6962, + "step": 14076 + }, + { + "epoch": 2.5062321937321936, + "grad_norm": 0.8545170426368713, + "learning_rate": 6.141504895255725e-05, + "loss": 0.9208, + "step": 14077 + }, + { + "epoch": 2.5064102564102564, + "grad_norm": 0.8102772235870361, + "learning_rate": 6.140213579838274e-05, + "loss": 0.7785, + "step": 14078 + }, + { + "epoch": 2.506588319088319, + "grad_norm": 0.8055099844932556, + "learning_rate": 6.138922340045321e-05, + "loss": 0.8502, + "step": 14079 + }, + { + "epoch": 2.506766381766382, + "grad_norm": 0.8132893443107605, + "learning_rate": 6.137631175902164e-05, + "loss": 0.7559, + "step": 14080 + }, + { + "epoch": 2.5069444444444446, + "grad_norm": 0.8608863949775696, + "learning_rate": 6.136340087434102e-05, + "loss": 0.838, + "step": 14081 + }, + { + "epoch": 2.5071225071225074, + "grad_norm": 0.8480643630027771, + "learning_rate": 6.135049074666428e-05, + "loss": 0.9062, + "step": 14082 + }, + { + "epoch": 2.5073005698005697, + "grad_norm": 0.7107672691345215, + "learning_rate": 6.133758137624437e-05, + "loss": 0.7494, + 
"step": 14083 + }, + { + "epoch": 2.5074786324786325, + "grad_norm": 0.812416672706604, + "learning_rate": 6.132467276333427e-05, + "loss": 0.6428, + "step": 14084 + }, + { + "epoch": 2.507656695156695, + "grad_norm": 0.8304431438446045, + "learning_rate": 6.131176490818684e-05, + "loss": 0.9931, + "step": 14085 + }, + { + "epoch": 2.507834757834758, + "grad_norm": 0.8344886302947998, + "learning_rate": 6.129885781105507e-05, + "loss": 0.921, + "step": 14086 + }, + { + "epoch": 2.5080128205128203, + "grad_norm": 0.8137457966804504, + "learning_rate": 6.128595147219172e-05, + "loss": 0.9113, + "step": 14087 + }, + { + "epoch": 2.508190883190883, + "grad_norm": 0.7404686212539673, + "learning_rate": 6.127304589184976e-05, + "loss": 0.7625, + "step": 14088 + }, + { + "epoch": 2.5083689458689458, + "grad_norm": 0.8179733157157898, + "learning_rate": 6.126014107028202e-05, + "loss": 0.9049, + "step": 14089 + }, + { + "epoch": 2.5085470085470085, + "grad_norm": 0.7788520455360413, + "learning_rate": 6.124723700774133e-05, + "loss": 0.7391, + "step": 14090 + }, + { + "epoch": 2.5087250712250713, + "grad_norm": 0.8127198219299316, + "learning_rate": 6.123433370448052e-05, + "loss": 0.8551, + "step": 14091 + }, + { + "epoch": 2.508903133903134, + "grad_norm": 0.8134245276451111, + "learning_rate": 6.122143116075245e-05, + "loss": 0.7422, + "step": 14092 + }, + { + "epoch": 2.5090811965811968, + "grad_norm": 0.9117823243141174, + "learning_rate": 6.120852937680983e-05, + "loss": 0.8649, + "step": 14093 + }, + { + "epoch": 2.5092592592592595, + "grad_norm": 0.8417702913284302, + "learning_rate": 6.119562835290553e-05, + "loss": 0.6902, + "step": 14094 + }, + { + "epoch": 2.509437321937322, + "grad_norm": 0.8655431866645813, + "learning_rate": 6.118272808929225e-05, + "loss": 1.0778, + "step": 14095 + }, + { + "epoch": 2.5096153846153846, + "grad_norm": 0.9228867888450623, + "learning_rate": 6.116982858622282e-05, + "loss": 0.9991, + "step": 14096 + }, + { + "epoch": 
2.5097934472934473, + "grad_norm": 0.819505512714386, + "learning_rate": 6.115692984394992e-05, + "loss": 1.0351, + "step": 14097 + }, + { + "epoch": 2.50997150997151, + "grad_norm": 0.8419737219810486, + "learning_rate": 6.114403186272628e-05, + "loss": 0.92, + "step": 14098 + }, + { + "epoch": 2.5101495726495724, + "grad_norm": 0.8294256329536438, + "learning_rate": 6.11311346428046e-05, + "loss": 0.792, + "step": 14099 + }, + { + "epoch": 2.510327635327635, + "grad_norm": 0.8549113273620605, + "learning_rate": 6.111823818443765e-05, + "loss": 0.9453, + "step": 14100 + }, + { + "epoch": 2.510505698005698, + "grad_norm": 1.0194092988967896, + "learning_rate": 6.1105342487878e-05, + "loss": 0.9747, + "step": 14101 + }, + { + "epoch": 2.5106837606837606, + "grad_norm": 0.7633654475212097, + "learning_rate": 6.109244755337842e-05, + "loss": 0.8227, + "step": 14102 + }, + { + "epoch": 2.5108618233618234, + "grad_norm": 0.9679104685783386, + "learning_rate": 6.107955338119147e-05, + "loss": 1.0407, + "step": 14103 + }, + { + "epoch": 2.511039886039886, + "grad_norm": 0.8342793583869934, + "learning_rate": 6.10666599715698e-05, + "loss": 0.7868, + "step": 14104 + }, + { + "epoch": 2.511217948717949, + "grad_norm": 0.9264410734176636, + "learning_rate": 6.105376732476609e-05, + "loss": 0.802, + "step": 14105 + }, + { + "epoch": 2.5113960113960117, + "grad_norm": 0.7511885762214661, + "learning_rate": 6.104087544103287e-05, + "loss": 0.7561, + "step": 14106 + }, + { + "epoch": 2.511574074074074, + "grad_norm": 0.8330591320991516, + "learning_rate": 6.102798432062282e-05, + "loss": 0.8511, + "step": 14107 + }, + { + "epoch": 2.5117521367521367, + "grad_norm": 0.8971241116523743, + "learning_rate": 6.1015093963788415e-05, + "loss": 0.8854, + "step": 14108 + }, + { + "epoch": 2.5119301994301995, + "grad_norm": 0.7926762700080872, + "learning_rate": 6.100220437078228e-05, + "loss": 0.6803, + "step": 14109 + }, + { + "epoch": 2.512108262108262, + "grad_norm": 
0.7384431958198547, + "learning_rate": 6.098931554185692e-05, + "loss": 0.6162, + "step": 14110 + }, + { + "epoch": 2.5122863247863245, + "grad_norm": 0.9201281070709229, + "learning_rate": 6.097642747726491e-05, + "loss": 1.1095, + "step": 14111 + }, + { + "epoch": 2.5124643874643873, + "grad_norm": 0.8822020888328552, + "learning_rate": 6.0963540177258716e-05, + "loss": 0.87, + "step": 14112 + }, + { + "epoch": 2.51264245014245, + "grad_norm": 0.8243268728256226, + "learning_rate": 6.09506536420909e-05, + "loss": 1.0899, + "step": 14113 + }, + { + "epoch": 2.5128205128205128, + "grad_norm": 0.8657538294792175, + "learning_rate": 6.093776787201386e-05, + "loss": 0.8218, + "step": 14114 + }, + { + "epoch": 2.5129985754985755, + "grad_norm": 0.8651030659675598, + "learning_rate": 6.092488286728013e-05, + "loss": 0.7903, + "step": 14115 + }, + { + "epoch": 2.5131766381766383, + "grad_norm": 0.8341799378395081, + "learning_rate": 6.091199862814214e-05, + "loss": 0.8612, + "step": 14116 + }, + { + "epoch": 2.513354700854701, + "grad_norm": 0.7693229913711548, + "learning_rate": 6.0899115154852384e-05, + "loss": 0.819, + "step": 14117 + }, + { + "epoch": 2.513532763532764, + "grad_norm": 0.8883055448532104, + "learning_rate": 6.088623244766318e-05, + "loss": 0.9026, + "step": 14118 + }, + { + "epoch": 2.513710826210826, + "grad_norm": 0.7761621475219727, + "learning_rate": 6.087335050682703e-05, + "loss": 0.7505, + "step": 14119 + }, + { + "epoch": 2.513888888888889, + "grad_norm": 0.8152571320533752, + "learning_rate": 6.086046933259628e-05, + "loss": 0.7637, + "step": 14120 + }, + { + "epoch": 2.5140669515669516, + "grad_norm": 0.7990148663520813, + "learning_rate": 6.0847588925223376e-05, + "loss": 0.8615, + "step": 14121 + }, + { + "epoch": 2.5142450142450143, + "grad_norm": 0.844756007194519, + "learning_rate": 6.083470928496058e-05, + "loss": 0.8696, + "step": 14122 + }, + { + "epoch": 2.5144230769230766, + "grad_norm": 0.7533631324768066, + "learning_rate": 
6.082183041206031e-05, + "loss": 0.7674, + "step": 14123 + }, + { + "epoch": 2.5146011396011394, + "grad_norm": 0.7914009690284729, + "learning_rate": 6.0808952306774905e-05, + "loss": 0.7577, + "step": 14124 + }, + { + "epoch": 2.514779202279202, + "grad_norm": 0.8341572284698486, + "learning_rate": 6.079607496935666e-05, + "loss": 0.8899, + "step": 14125 + }, + { + "epoch": 2.514957264957265, + "grad_norm": 0.9185548424720764, + "learning_rate": 6.078319840005788e-05, + "loss": 0.9486, + "step": 14126 + }, + { + "epoch": 2.5151353276353277, + "grad_norm": 0.8611742854118347, + "learning_rate": 6.0770322599130856e-05, + "loss": 0.8267, + "step": 14127 + }, + { + "epoch": 2.5153133903133904, + "grad_norm": 0.899135410785675, + "learning_rate": 6.0757447566827906e-05, + "loss": 1.0829, + "step": 14128 + }, + { + "epoch": 2.515491452991453, + "grad_norm": 0.8016429543495178, + "learning_rate": 6.074457330340122e-05, + "loss": 0.8582, + "step": 14129 + }, + { + "epoch": 2.515669515669516, + "grad_norm": 0.7781331539154053, + "learning_rate": 6.073169980910307e-05, + "loss": 0.8435, + "step": 14130 + }, + { + "epoch": 2.515847578347578, + "grad_norm": 0.7605105042457581, + "learning_rate": 6.071882708418568e-05, + "loss": 0.6961, + "step": 14131 + }, + { + "epoch": 2.516025641025641, + "grad_norm": 0.9337655901908875, + "learning_rate": 6.0705955128901326e-05, + "loss": 0.8673, + "step": 14132 + }, + { + "epoch": 2.5162037037037037, + "grad_norm": 0.7868272662162781, + "learning_rate": 6.06930839435021e-05, + "loss": 0.7526, + "step": 14133 + }, + { + "epoch": 2.5163817663817665, + "grad_norm": 0.8722387552261353, + "learning_rate": 6.068021352824027e-05, + "loss": 0.9541, + "step": 14134 + }, + { + "epoch": 2.5165598290598292, + "grad_norm": 0.7682648301124573, + "learning_rate": 6.066734388336794e-05, + "loss": 0.7191, + "step": 14135 + }, + { + "epoch": 2.5167378917378915, + "grad_norm": 0.9540650844573975, + "learning_rate": 6.065447500913737e-05, + "loss": 1.0638, 
+ "step": 14136 + }, + { + "epoch": 2.5169159544159543, + "grad_norm": 0.8276218175888062, + "learning_rate": 6.064160690580056e-05, + "loss": 0.7967, + "step": 14137 + }, + { + "epoch": 2.517094017094017, + "grad_norm": 0.7966098785400391, + "learning_rate": 6.062873957360976e-05, + "loss": 0.8913, + "step": 14138 + }, + { + "epoch": 2.51727207977208, + "grad_norm": 0.9670028686523438, + "learning_rate": 6.0615873012816974e-05, + "loss": 0.8846, + "step": 14139 + }, + { + "epoch": 2.5174501424501425, + "grad_norm": 0.819952666759491, + "learning_rate": 6.0603007223674366e-05, + "loss": 0.8409, + "step": 14140 + }, + { + "epoch": 2.5176282051282053, + "grad_norm": 0.7746681571006775, + "learning_rate": 6.0590142206433973e-05, + "loss": 0.7382, + "step": 14141 + }, + { + "epoch": 2.517806267806268, + "grad_norm": 0.9452744722366333, + "learning_rate": 6.057727796134787e-05, + "loss": 0.8878, + "step": 14142 + }, + { + "epoch": 2.5179843304843303, + "grad_norm": 0.7940170168876648, + "learning_rate": 6.0564414488668165e-05, + "loss": 0.8289, + "step": 14143 + }, + { + "epoch": 2.518162393162393, + "grad_norm": 0.9046176671981812, + "learning_rate": 6.0551551788646774e-05, + "loss": 0.8596, + "step": 14144 + }, + { + "epoch": 2.518340455840456, + "grad_norm": 0.8460658192634583, + "learning_rate": 6.053868986153581e-05, + "loss": 0.7678, + "step": 14145 + }, + { + "epoch": 2.5185185185185186, + "grad_norm": 0.9131760597229004, + "learning_rate": 6.052582870758723e-05, + "loss": 0.8845, + "step": 14146 + }, + { + "epoch": 2.5186965811965814, + "grad_norm": 0.8375167846679688, + "learning_rate": 6.0512968327053076e-05, + "loss": 1.0082, + "step": 14147 + }, + { + "epoch": 2.5188746438746437, + "grad_norm": 0.8587140440940857, + "learning_rate": 6.050010872018523e-05, + "loss": 0.8745, + "step": 14148 + }, + { + "epoch": 2.5190527065527064, + "grad_norm": 0.8347265124320984, + "learning_rate": 6.048724988723575e-05, + "loss": 0.8345, + "step": 14149 + }, + { + "epoch": 
2.519230769230769, + "grad_norm": 1.0271183252334595, + "learning_rate": 6.047439182845649e-05, + "loss": 0.9863, + "step": 14150 + }, + { + "epoch": 2.519408831908832, + "grad_norm": 0.6951111555099487, + "learning_rate": 6.046153454409943e-05, + "loss": 0.6162, + "step": 14151 + }, + { + "epoch": 2.5195868945868947, + "grad_norm": 0.7702959179878235, + "learning_rate": 6.044867803441645e-05, + "loss": 0.8127, + "step": 14152 + }, + { + "epoch": 2.5197649572649574, + "grad_norm": 0.7997276186943054, + "learning_rate": 6.0435822299659496e-05, + "loss": 0.6777, + "step": 14153 + }, + { + "epoch": 2.51994301994302, + "grad_norm": 0.8006166815757751, + "learning_rate": 6.0422967340080385e-05, + "loss": 0.9122, + "step": 14154 + }, + { + "epoch": 2.5201210826210825, + "grad_norm": 0.888225793838501, + "learning_rate": 6.041011315593102e-05, + "loss": 0.7621, + "step": 14155 + }, + { + "epoch": 2.5202991452991452, + "grad_norm": 0.928814172744751, + "learning_rate": 6.039725974746324e-05, + "loss": 1.0245, + "step": 14156 + }, + { + "epoch": 2.520477207977208, + "grad_norm": 0.7914403676986694, + "learning_rate": 6.038440711492892e-05, + "loss": 0.6585, + "step": 14157 + }, + { + "epoch": 2.5206552706552707, + "grad_norm": 0.82389897108078, + "learning_rate": 6.0371555258579826e-05, + "loss": 0.7862, + "step": 14158 + }, + { + "epoch": 2.5208333333333335, + "grad_norm": 0.952135443687439, + "learning_rate": 6.035870417866778e-05, + "loss": 0.8952, + "step": 14159 + }, + { + "epoch": 2.521011396011396, + "grad_norm": 0.8626661896705627, + "learning_rate": 6.034585387544458e-05, + "loss": 0.9166, + "step": 14160 + }, + { + "epoch": 2.5211894586894585, + "grad_norm": 0.9641584157943726, + "learning_rate": 6.033300434916203e-05, + "loss": 0.8481, + "step": 14161 + }, + { + "epoch": 2.5213675213675213, + "grad_norm": 0.949110209941864, + "learning_rate": 6.0320155600071814e-05, + "loss": 0.9628, + "step": 14162 + }, + { + "epoch": 2.521545584045584, + "grad_norm": 
0.8198522329330444, + "learning_rate": 6.030730762842573e-05, + "loss": 0.817, + "step": 14163 + }, + { + "epoch": 2.521723646723647, + "grad_norm": 0.9209866523742676, + "learning_rate": 6.029446043447553e-05, + "loss": 0.925, + "step": 14164 + }, + { + "epoch": 2.5219017094017095, + "grad_norm": 0.8604369163513184, + "learning_rate": 6.0281614018472854e-05, + "loss": 0.7846, + "step": 14165 + }, + { + "epoch": 2.5220797720797723, + "grad_norm": 0.882255494594574, + "learning_rate": 6.026876838066948e-05, + "loss": 0.8715, + "step": 14166 + }, + { + "epoch": 2.5222578347578346, + "grad_norm": 0.8609021306037903, + "learning_rate": 6.0255923521317015e-05, + "loss": 0.8627, + "step": 14167 + }, + { + "epoch": 2.5224358974358974, + "grad_norm": 0.9782202243804932, + "learning_rate": 6.0243079440667226e-05, + "loss": 0.8499, + "step": 14168 + }, + { + "epoch": 2.52261396011396, + "grad_norm": 0.7932701706886292, + "learning_rate": 6.023023613897165e-05, + "loss": 0.9174, + "step": 14169 + }, + { + "epoch": 2.522792022792023, + "grad_norm": 0.8827422261238098, + "learning_rate": 6.021739361648202e-05, + "loss": 0.9384, + "step": 14170 + }, + { + "epoch": 2.5229700854700856, + "grad_norm": 0.9764171838760376, + "learning_rate": 6.020455187344989e-05, + "loss": 0.8806, + "step": 14171 + }, + { + "epoch": 2.523148148148148, + "grad_norm": 0.7635362148284912, + "learning_rate": 6.019171091012694e-05, + "loss": 0.7519, + "step": 14172 + }, + { + "epoch": 2.5233262108262107, + "grad_norm": 0.9925556182861328, + "learning_rate": 6.017887072676468e-05, + "loss": 0.8467, + "step": 14173 + }, + { + "epoch": 2.5235042735042734, + "grad_norm": 0.9624950289726257, + "learning_rate": 6.016603132361477e-05, + "loss": 0.9492, + "step": 14174 + }, + { + "epoch": 2.523682336182336, + "grad_norm": 0.7960891127586365, + "learning_rate": 6.0153192700928685e-05, + "loss": 1.0111, + "step": 14175 + }, + { + "epoch": 2.523860398860399, + "grad_norm": 0.8387307524681091, + "learning_rate": 
6.014035485895804e-05, + "loss": 0.8013, + "step": 14176 + }, + { + "epoch": 2.5240384615384617, + "grad_norm": 0.8488287925720215, + "learning_rate": 6.0127517797954316e-05, + "loss": 0.8508, + "step": 14177 + }, + { + "epoch": 2.5242165242165244, + "grad_norm": 0.7339358329772949, + "learning_rate": 6.011468151816908e-05, + "loss": 0.7225, + "step": 14178 + }, + { + "epoch": 2.5243945868945867, + "grad_norm": 0.9265308976173401, + "learning_rate": 6.010184601985378e-05, + "loss": 0.7993, + "step": 14179 + }, + { + "epoch": 2.5245726495726495, + "grad_norm": 0.7752045392990112, + "learning_rate": 6.0089011303259944e-05, + "loss": 0.8315, + "step": 14180 + }, + { + "epoch": 2.5247507122507122, + "grad_norm": 0.7794929146766663, + "learning_rate": 6.007617736863901e-05, + "loss": 0.9174, + "step": 14181 + }, + { + "epoch": 2.524928774928775, + "grad_norm": 0.9099361896514893, + "learning_rate": 6.0063344216242434e-05, + "loss": 0.8948, + "step": 14182 + }, + { + "epoch": 2.5251068376068377, + "grad_norm": 0.8161521553993225, + "learning_rate": 6.005051184632171e-05, + "loss": 0.8018, + "step": 14183 + }, + { + "epoch": 2.5252849002849, + "grad_norm": 0.9279208183288574, + "learning_rate": 6.003768025912819e-05, + "loss": 0.9032, + "step": 14184 + }, + { + "epoch": 2.525462962962963, + "grad_norm": 0.9689664840698242, + "learning_rate": 6.002484945491333e-05, + "loss": 0.9463, + "step": 14185 + }, + { + "epoch": 2.5256410256410255, + "grad_norm": 0.8367486596107483, + "learning_rate": 6.001201943392848e-05, + "loss": 0.7866, + "step": 14186 + }, + { + "epoch": 2.5258190883190883, + "grad_norm": 0.8383589386940002, + "learning_rate": 5.9999190196425056e-05, + "loss": 0.7642, + "step": 14187 + }, + { + "epoch": 2.525997150997151, + "grad_norm": 0.9113569855690002, + "learning_rate": 5.99863617426544e-05, + "loss": 1.0451, + "step": 14188 + }, + { + "epoch": 2.526175213675214, + "grad_norm": 0.896575391292572, + "learning_rate": 5.997353407286788e-05, + "loss": 0.8559, 
+ "step": 14189 + }, + { + "epoch": 2.5263532763532766, + "grad_norm": 0.899214506149292, + "learning_rate": 5.996070718731679e-05, + "loss": 1.0006, + "step": 14190 + }, + { + "epoch": 2.5265313390313393, + "grad_norm": 0.9739418625831604, + "learning_rate": 5.994788108625247e-05, + "loss": 0.9412, + "step": 14191 + }, + { + "epoch": 2.5267094017094016, + "grad_norm": 0.7940781712532043, + "learning_rate": 5.9935055769926215e-05, + "loss": 0.8802, + "step": 14192 + }, + { + "epoch": 2.5268874643874644, + "grad_norm": 0.802066445350647, + "learning_rate": 5.9922231238589346e-05, + "loss": 0.8459, + "step": 14193 + }, + { + "epoch": 2.527065527065527, + "grad_norm": 0.827560544013977, + "learning_rate": 5.990940749249306e-05, + "loss": 0.8732, + "step": 14194 + }, + { + "epoch": 2.52724358974359, + "grad_norm": 0.7782348394393921, + "learning_rate": 5.989658453188869e-05, + "loss": 0.6742, + "step": 14195 + }, + { + "epoch": 2.527421652421652, + "grad_norm": 0.8418310284614563, + "learning_rate": 5.9883762357027416e-05, + "loss": 0.8699, + "step": 14196 + }, + { + "epoch": 2.527599715099715, + "grad_norm": 0.7925812602043152, + "learning_rate": 5.987094096816051e-05, + "loss": 0.6807, + "step": 14197 + }, + { + "epoch": 2.5277777777777777, + "grad_norm": 0.8200794458389282, + "learning_rate": 5.9858120365539105e-05, + "loss": 0.8249, + "step": 14198 + }, + { + "epoch": 2.5279558404558404, + "grad_norm": 0.7137587070465088, + "learning_rate": 5.9845300549414505e-05, + "loss": 0.7882, + "step": 14199 + }, + { + "epoch": 2.528133903133903, + "grad_norm": 0.8084787726402283, + "learning_rate": 5.983248152003778e-05, + "loss": 1.0161, + "step": 14200 + }, + { + "epoch": 2.528311965811966, + "grad_norm": 0.7717064023017883, + "learning_rate": 5.9819663277660156e-05, + "loss": 0.798, + "step": 14201 + }, + { + "epoch": 2.5284900284900287, + "grad_norm": 0.7722328305244446, + "learning_rate": 5.980684582253275e-05, + "loss": 0.8324, + "step": 14202 + }, + { + "epoch": 
2.5286680911680914, + "grad_norm": 0.8357635140419006, + "learning_rate": 5.9794029154906696e-05, + "loss": 0.9224, + "step": 14203 + }, + { + "epoch": 2.5288461538461537, + "grad_norm": 0.8159863352775574, + "learning_rate": 5.978121327503317e-05, + "loss": 0.7529, + "step": 14204 + }, + { + "epoch": 2.5290242165242165, + "grad_norm": 0.8255389332771301, + "learning_rate": 5.976839818316317e-05, + "loss": 0.9674, + "step": 14205 + }, + { + "epoch": 2.5292022792022792, + "grad_norm": 0.8204228281974792, + "learning_rate": 5.975558387954787e-05, + "loss": 0.9138, + "step": 14206 + }, + { + "epoch": 2.529380341880342, + "grad_norm": 0.8232463598251343, + "learning_rate": 5.9742770364438275e-05, + "loss": 0.7949, + "step": 14207 + }, + { + "epoch": 2.5295584045584043, + "grad_norm": 0.8164107203483582, + "learning_rate": 5.972995763808551e-05, + "loss": 0.7087, + "step": 14208 + }, + { + "epoch": 2.529736467236467, + "grad_norm": 0.8100822567939758, + "learning_rate": 5.971714570074052e-05, + "loss": 0.9187, + "step": 14209 + }, + { + "epoch": 2.52991452991453, + "grad_norm": 0.7401103377342224, + "learning_rate": 5.970433455265443e-05, + "loss": 0.798, + "step": 14210 + }, + { + "epoch": 2.5300925925925926, + "grad_norm": 0.798327624797821, + "learning_rate": 5.9691524194078154e-05, + "loss": 0.7312, + "step": 14211 + }, + { + "epoch": 2.5302706552706553, + "grad_norm": 0.8566045165061951, + "learning_rate": 5.9678714625262754e-05, + "loss": 0.8555, + "step": 14212 + }, + { + "epoch": 2.530448717948718, + "grad_norm": 0.8005902767181396, + "learning_rate": 5.9665905846459155e-05, + "loss": 0.7979, + "step": 14213 + }, + { + "epoch": 2.530626780626781, + "grad_norm": 0.815990686416626, + "learning_rate": 5.9653097857918396e-05, + "loss": 0.8739, + "step": 14214 + }, + { + "epoch": 2.5308048433048436, + "grad_norm": 0.7694230079650879, + "learning_rate": 5.9640290659891316e-05, + "loss": 0.7249, + "step": 14215 + }, + { + "epoch": 2.530982905982906, + "grad_norm": 
0.8469253182411194, + "learning_rate": 5.962748425262892e-05, + "loss": 0.8505, + "step": 14216 + }, + { + "epoch": 2.5311609686609686, + "grad_norm": 0.8061797022819519, + "learning_rate": 5.961467863638209e-05, + "loss": 0.8979, + "step": 14217 + }, + { + "epoch": 2.5313390313390314, + "grad_norm": 1.0380569696426392, + "learning_rate": 5.960187381140179e-05, + "loss": 0.8664, + "step": 14218 + }, + { + "epoch": 2.531517094017094, + "grad_norm": 0.9435166716575623, + "learning_rate": 5.9589069777938786e-05, + "loss": 0.7566, + "step": 14219 + }, + { + "epoch": 2.5316951566951564, + "grad_norm": 0.8882613182067871, + "learning_rate": 5.957626653624407e-05, + "loss": 0.6999, + "step": 14220 + }, + { + "epoch": 2.531873219373219, + "grad_norm": 0.8544003963470459, + "learning_rate": 5.95634640865684e-05, + "loss": 0.8028, + "step": 14221 + }, + { + "epoch": 2.532051282051282, + "grad_norm": 0.8407679796218872, + "learning_rate": 5.9550662429162655e-05, + "loss": 0.6868, + "step": 14222 + }, + { + "epoch": 2.5322293447293447, + "grad_norm": 0.9049725532531738, + "learning_rate": 5.9537861564277654e-05, + "loss": 0.8177, + "step": 14223 + }, + { + "epoch": 2.5324074074074074, + "grad_norm": 0.938050389289856, + "learning_rate": 5.952506149216419e-05, + "loss": 0.9187, + "step": 14224 + }, + { + "epoch": 2.53258547008547, + "grad_norm": 0.9515482783317566, + "learning_rate": 5.951226221307312e-05, + "loss": 0.6735, + "step": 14225 + }, + { + "epoch": 2.532763532763533, + "grad_norm": 0.8545815348625183, + "learning_rate": 5.949946372725512e-05, + "loss": 0.7643, + "step": 14226 + }, + { + "epoch": 2.5329415954415957, + "grad_norm": 0.8388620615005493, + "learning_rate": 5.9486666034961e-05, + "loss": 0.9437, + "step": 14227 + }, + { + "epoch": 2.533119658119658, + "grad_norm": 0.7202512621879578, + "learning_rate": 5.9473869136441506e-05, + "loss": 0.718, + "step": 14228 + }, + { + "epoch": 2.5332977207977208, + "grad_norm": 0.8375558853149414, + "learning_rate": 
5.946107303194739e-05, + "loss": 0.852, + "step": 14229 + }, + { + "epoch": 2.5334757834757835, + "grad_norm": 0.8980572819709778, + "learning_rate": 5.94482777217293e-05, + "loss": 0.9684, + "step": 14230 + }, + { + "epoch": 2.5336538461538463, + "grad_norm": 0.7374732494354248, + "learning_rate": 5.9435483206037977e-05, + "loss": 0.6498, + "step": 14231 + }, + { + "epoch": 2.5338319088319086, + "grad_norm": 1.073758602142334, + "learning_rate": 5.942268948512409e-05, + "loss": 1.0315, + "step": 14232 + }, + { + "epoch": 2.5340099715099713, + "grad_norm": 0.9503611326217651, + "learning_rate": 5.940989655923832e-05, + "loss": 1.0644, + "step": 14233 + }, + { + "epoch": 2.534188034188034, + "grad_norm": 0.870490550994873, + "learning_rate": 5.939710442863129e-05, + "loss": 1.1, + "step": 14234 + }, + { + "epoch": 2.534366096866097, + "grad_norm": 0.8019965887069702, + "learning_rate": 5.93843130935537e-05, + "loss": 0.9169, + "step": 14235 + }, + { + "epoch": 2.5345441595441596, + "grad_norm": 0.8333065509796143, + "learning_rate": 5.9371522554256076e-05, + "loss": 0.7848, + "step": 14236 + }, + { + "epoch": 2.5347222222222223, + "grad_norm": 0.8606435656547546, + "learning_rate": 5.935873281098909e-05, + "loss": 0.9941, + "step": 14237 + }, + { + "epoch": 2.534900284900285, + "grad_norm": 0.7711295485496521, + "learning_rate": 5.934594386400328e-05, + "loss": 0.8495, + "step": 14238 + }, + { + "epoch": 2.535078347578348, + "grad_norm": 0.871533215045929, + "learning_rate": 5.93331557135493e-05, + "loss": 0.9071, + "step": 14239 + }, + { + "epoch": 2.53525641025641, + "grad_norm": 0.9828163981437683, + "learning_rate": 5.932036835987762e-05, + "loss": 0.9561, + "step": 14240 + }, + { + "epoch": 2.535434472934473, + "grad_norm": 0.8485092520713806, + "learning_rate": 5.930758180323881e-05, + "loss": 0.7278, + "step": 14241 + }, + { + "epoch": 2.5356125356125356, + "grad_norm": 0.7608986496925354, + "learning_rate": 5.929479604388342e-05, + "loss": 1.0449, + "step": 
14242 + }, + { + "epoch": 2.5357905982905984, + "grad_norm": 0.7852896451950073, + "learning_rate": 5.928201108206193e-05, + "loss": 0.8844, + "step": 14243 + }, + { + "epoch": 2.5359686609686607, + "grad_norm": 0.7636764645576477, + "learning_rate": 5.9269226918024875e-05, + "loss": 0.8259, + "step": 14244 + }, + { + "epoch": 2.5361467236467234, + "grad_norm": 0.9067455530166626, + "learning_rate": 5.925644355202269e-05, + "loss": 0.8742, + "step": 14245 + }, + { + "epoch": 2.536324786324786, + "grad_norm": 0.7911350727081299, + "learning_rate": 5.924366098430588e-05, + "loss": 0.8586, + "step": 14246 + }, + { + "epoch": 2.536502849002849, + "grad_norm": 0.8010593056678772, + "learning_rate": 5.923087921512483e-05, + "loss": 0.8524, + "step": 14247 + }, + { + "epoch": 2.5366809116809117, + "grad_norm": 1.232219934463501, + "learning_rate": 5.9218098244730034e-05, + "loss": 0.8302, + "step": 14248 + }, + { + "epoch": 2.5368589743589745, + "grad_norm": 0.8717244267463684, + "learning_rate": 5.9205318073371874e-05, + "loss": 0.8692, + "step": 14249 + }, + { + "epoch": 2.537037037037037, + "grad_norm": 0.9757453799247742, + "learning_rate": 5.919253870130079e-05, + "loss": 0.7986, + "step": 14250 + }, + { + "epoch": 2.5372150997151, + "grad_norm": 0.8183274865150452, + "learning_rate": 5.917976012876712e-05, + "loss": 0.8277, + "step": 14251 + }, + { + "epoch": 2.5373931623931623, + "grad_norm": 0.823930025100708, + "learning_rate": 5.916698235602125e-05, + "loss": 0.7972, + "step": 14252 + }, + { + "epoch": 2.537571225071225, + "grad_norm": 0.8480231761932373, + "learning_rate": 5.915420538331353e-05, + "loss": 0.8234, + "step": 14253 + }, + { + "epoch": 2.5377492877492878, + "grad_norm": 0.6718716621398926, + "learning_rate": 5.914142921089434e-05, + "loss": 0.5984, + "step": 14254 + }, + { + "epoch": 2.5379273504273505, + "grad_norm": 0.8506333827972412, + "learning_rate": 5.912865383901394e-05, + "loss": 0.8004, + "step": 14255 + }, + { + "epoch": 
2.5381054131054133, + "grad_norm": 0.755740225315094, + "learning_rate": 5.911587926792269e-05, + "loss": 0.7465, + "step": 14256 + }, + { + "epoch": 2.5382834757834756, + "grad_norm": 0.8908467888832092, + "learning_rate": 5.9103105497870815e-05, + "loss": 0.7822, + "step": 14257 + }, + { + "epoch": 2.5384615384615383, + "grad_norm": 0.9572851061820984, + "learning_rate": 5.909033252910867e-05, + "loss": 0.7891, + "step": 14258 + }, + { + "epoch": 2.538639601139601, + "grad_norm": 0.8606489896774292, + "learning_rate": 5.907756036188644e-05, + "loss": 0.9071, + "step": 14259 + }, + { + "epoch": 2.538817663817664, + "grad_norm": 0.837658166885376, + "learning_rate": 5.906478899645444e-05, + "loss": 0.829, + "step": 14260 + }, + { + "epoch": 2.5389957264957266, + "grad_norm": 0.8964337706565857, + "learning_rate": 5.905201843306285e-05, + "loss": 1.0385, + "step": 14261 + }, + { + "epoch": 2.5391737891737893, + "grad_norm": 0.7854750156402588, + "learning_rate": 5.903924867196189e-05, + "loss": 0.8905, + "step": 14262 + }, + { + "epoch": 2.539351851851852, + "grad_norm": 0.8828065991401672, + "learning_rate": 5.902647971340176e-05, + "loss": 0.8541, + "step": 14263 + }, + { + "epoch": 2.5395299145299144, + "grad_norm": 0.804121196269989, + "learning_rate": 5.9013711557632645e-05, + "loss": 0.8333, + "step": 14264 + }, + { + "epoch": 2.539707977207977, + "grad_norm": 0.8868918418884277, + "learning_rate": 5.900094420490475e-05, + "loss": 0.7959, + "step": 14265 + }, + { + "epoch": 2.53988603988604, + "grad_norm": 0.9231327176094055, + "learning_rate": 5.8988177655468134e-05, + "loss": 0.8007, + "step": 14266 + }, + { + "epoch": 2.5400641025641026, + "grad_norm": 0.9918177127838135, + "learning_rate": 5.897541190957301e-05, + "loss": 0.7766, + "step": 14267 + }, + { + "epoch": 2.5402421652421654, + "grad_norm": 0.8468625545501709, + "learning_rate": 5.896264696746947e-05, + "loss": 0.9209, + "step": 14268 + }, + { + "epoch": 2.5404202279202277, + "grad_norm": 
0.9160833358764648, + "learning_rate": 5.894988282940761e-05, + "loss": 0.8994, + "step": 14269 + }, + { + "epoch": 2.5405982905982905, + "grad_norm": 0.8029152154922485, + "learning_rate": 5.8937119495637515e-05, + "loss": 0.7936, + "step": 14270 + }, + { + "epoch": 2.540776353276353, + "grad_norm": 0.8829928636550903, + "learning_rate": 5.8924356966409286e-05, + "loss": 0.7368, + "step": 14271 + }, + { + "epoch": 2.540954415954416, + "grad_norm": 0.9698056578636169, + "learning_rate": 5.8911595241972925e-05, + "loss": 0.789, + "step": 14272 + }, + { + "epoch": 2.5411324786324787, + "grad_norm": 0.7949244379997253, + "learning_rate": 5.8898834322578524e-05, + "loss": 0.8885, + "step": 14273 + }, + { + "epoch": 2.5413105413105415, + "grad_norm": 1.2430917024612427, + "learning_rate": 5.888607420847605e-05, + "loss": 0.861, + "step": 14274 + }, + { + "epoch": 2.541488603988604, + "grad_norm": 0.7476705312728882, + "learning_rate": 5.887331489991559e-05, + "loss": 0.7942, + "step": 14275 + }, + { + "epoch": 2.5416666666666665, + "grad_norm": 0.9204338192939758, + "learning_rate": 5.886055639714706e-05, + "loss": 0.8633, + "step": 14276 + }, + { + "epoch": 2.5418447293447293, + "grad_norm": 0.8812162280082703, + "learning_rate": 5.884779870042047e-05, + "loss": 0.7162, + "step": 14277 + }, + { + "epoch": 2.542022792022792, + "grad_norm": 0.7859770655632019, + "learning_rate": 5.883504180998578e-05, + "loss": 0.7965, + "step": 14278 + }, + { + "epoch": 2.5422008547008548, + "grad_norm": 0.7732986211776733, + "learning_rate": 5.882228572609296e-05, + "loss": 0.9671, + "step": 14279 + }, + { + "epoch": 2.5423789173789175, + "grad_norm": 0.8555598855018616, + "learning_rate": 5.880953044899189e-05, + "loss": 0.8993, + "step": 14280 + }, + { + "epoch": 2.54255698005698, + "grad_norm": 0.7980908155441284, + "learning_rate": 5.879677597893248e-05, + "loss": 0.873, + "step": 14281 + }, + { + "epoch": 2.5427350427350426, + "grad_norm": 0.9244991540908813, + "learning_rate": 
5.878402231616471e-05, + "loss": 0.934, + "step": 14282 + }, + { + "epoch": 2.5429131054131053, + "grad_norm": 1.0128331184387207, + "learning_rate": 5.877126946093835e-05, + "loss": 0.9607, + "step": 14283 + }, + { + "epoch": 2.543091168091168, + "grad_norm": 0.7916569709777832, + "learning_rate": 5.875851741350334e-05, + "loss": 0.831, + "step": 14284 + }, + { + "epoch": 2.543269230769231, + "grad_norm": 0.705007791519165, + "learning_rate": 5.8745766174109495e-05, + "loss": 0.6399, + "step": 14285 + }, + { + "epoch": 2.5434472934472936, + "grad_norm": 0.8785403966903687, + "learning_rate": 5.873301574300671e-05, + "loss": 0.9336, + "step": 14286 + }, + { + "epoch": 2.5436253561253563, + "grad_norm": 0.8225776553153992, + "learning_rate": 5.872026612044471e-05, + "loss": 0.8252, + "step": 14287 + }, + { + "epoch": 2.5438034188034186, + "grad_norm": 0.9629518985748291, + "learning_rate": 5.870751730667337e-05, + "loss": 1.0213, + "step": 14288 + }, + { + "epoch": 2.5439814814814814, + "grad_norm": 0.8242672681808472, + "learning_rate": 5.869476930194242e-05, + "loss": 0.9642, + "step": 14289 + }, + { + "epoch": 2.544159544159544, + "grad_norm": 0.5798216462135315, + "learning_rate": 5.868202210650171e-05, + "loss": 0.4366, + "step": 14290 + }, + { + "epoch": 2.544337606837607, + "grad_norm": 0.7945725917816162, + "learning_rate": 5.86692757206009e-05, + "loss": 0.9252, + "step": 14291 + }, + { + "epoch": 2.5445156695156697, + "grad_norm": 0.9078665375709534, + "learning_rate": 5.865653014448982e-05, + "loss": 1.0551, + "step": 14292 + }, + { + "epoch": 2.544693732193732, + "grad_norm": 0.8044732809066772, + "learning_rate": 5.86437853784181e-05, + "loss": 0.7778, + "step": 14293 + }, + { + "epoch": 2.5448717948717947, + "grad_norm": 0.8317133784294128, + "learning_rate": 5.863104142263553e-05, + "loss": 1.0047, + "step": 14294 + }, + { + "epoch": 2.5450498575498575, + "grad_norm": 0.8330327272415161, + "learning_rate": 5.861829827739174e-05, + "loss": 0.8074, + 
"step": 14295 + }, + { + "epoch": 2.54522792022792, + "grad_norm": 0.8731801509857178, + "learning_rate": 5.8605555942936474e-05, + "loss": 0.9311, + "step": 14296 + }, + { + "epoch": 2.545405982905983, + "grad_norm": 0.8906812071800232, + "learning_rate": 5.85928144195193e-05, + "loss": 0.9084, + "step": 14297 + }, + { + "epoch": 2.5455840455840457, + "grad_norm": 0.948535144329071, + "learning_rate": 5.8580073707389935e-05, + "loss": 0.923, + "step": 14298 + }, + { + "epoch": 2.5457621082621085, + "grad_norm": 1.0418797731399536, + "learning_rate": 5.8567333806797975e-05, + "loss": 0.9786, + "step": 14299 + }, + { + "epoch": 2.5459401709401708, + "grad_norm": 0.8591430187225342, + "learning_rate": 5.8554594717993075e-05, + "loss": 0.8706, + "step": 14300 + }, + { + "epoch": 2.5461182336182335, + "grad_norm": 1.1056550741195679, + "learning_rate": 5.854185644122475e-05, + "loss": 0.891, + "step": 14301 + }, + { + "epoch": 2.5462962962962963, + "grad_norm": 0.8945133090019226, + "learning_rate": 5.8529118976742624e-05, + "loss": 0.9584, + "step": 14302 + }, + { + "epoch": 2.546474358974359, + "grad_norm": 0.8568279147148132, + "learning_rate": 5.851638232479629e-05, + "loss": 0.7462, + "step": 14303 + }, + { + "epoch": 2.546652421652422, + "grad_norm": 0.934648871421814, + "learning_rate": 5.850364648563527e-05, + "loss": 0.8977, + "step": 14304 + }, + { + "epoch": 2.546830484330484, + "grad_norm": 0.8074216842651367, + "learning_rate": 5.849091145950909e-05, + "loss": 0.8779, + "step": 14305 + }, + { + "epoch": 2.547008547008547, + "grad_norm": 0.8781399726867676, + "learning_rate": 5.8478177246667266e-05, + "loss": 0.8715, + "step": 14306 + }, + { + "epoch": 2.5471866096866096, + "grad_norm": 0.8237441182136536, + "learning_rate": 5.846544384735933e-05, + "loss": 0.8806, + "step": 14307 + }, + { + "epoch": 2.5473646723646723, + "grad_norm": 0.933709442615509, + "learning_rate": 5.8452711261834717e-05, + "loss": 0.8555, + "step": 14308 + }, + { + "epoch": 
2.547542735042735, + "grad_norm": 0.9045436978340149, + "learning_rate": 5.843997949034292e-05, + "loss": 1.0105, + "step": 14309 + }, + { + "epoch": 2.547720797720798, + "grad_norm": 0.8088112473487854, + "learning_rate": 5.842724853313337e-05, + "loss": 0.8667, + "step": 14310 + }, + { + "epoch": 2.5478988603988606, + "grad_norm": 1.1022162437438965, + "learning_rate": 5.841451839045559e-05, + "loss": 0.9818, + "step": 14311 + }, + { + "epoch": 2.5480769230769234, + "grad_norm": 0.8974189162254333, + "learning_rate": 5.8401789062558876e-05, + "loss": 0.843, + "step": 14312 + }, + { + "epoch": 2.5482549857549857, + "grad_norm": 0.7816309928894043, + "learning_rate": 5.838906054969272e-05, + "loss": 0.8665, + "step": 14313 + }, + { + "epoch": 2.5484330484330484, + "grad_norm": 0.8243623971939087, + "learning_rate": 5.8376332852106485e-05, + "loss": 0.7291, + "step": 14314 + }, + { + "epoch": 2.548611111111111, + "grad_norm": 0.8475931286811829, + "learning_rate": 5.8363605970049526e-05, + "loss": 0.7551, + "step": 14315 + }, + { + "epoch": 2.548789173789174, + "grad_norm": 0.8949251770973206, + "learning_rate": 5.835087990377124e-05, + "loss": 0.9138, + "step": 14316 + }, + { + "epoch": 2.548967236467236, + "grad_norm": 1.0444703102111816, + "learning_rate": 5.833815465352093e-05, + "loss": 0.8663, + "step": 14317 + }, + { + "epoch": 2.549145299145299, + "grad_norm": 0.8611619472503662, + "learning_rate": 5.8325430219547895e-05, + "loss": 0.795, + "step": 14318 + }, + { + "epoch": 2.5493233618233617, + "grad_norm": 0.7808047533035278, + "learning_rate": 5.8312706602101564e-05, + "loss": 0.905, + "step": 14319 + }, + { + "epoch": 2.5495014245014245, + "grad_norm": 0.9137473106384277, + "learning_rate": 5.8299983801431066e-05, + "loss": 0.8763, + "step": 14320 + }, + { + "epoch": 2.5496794871794872, + "grad_norm": 0.9511715769767761, + "learning_rate": 5.828726181778581e-05, + "loss": 0.8385, + "step": 14321 + }, + { + "epoch": 2.54985754985755, + "grad_norm": 
0.9250940084457397, + "learning_rate": 5.8274540651415e-05, + "loss": 0.7026, + "step": 14322 + }, + { + "epoch": 2.5500356125356127, + "grad_norm": 1.001017689704895, + "learning_rate": 5.826182030256786e-05, + "loss": 0.7952, + "step": 14323 + }, + { + "epoch": 2.5502136752136755, + "grad_norm": 0.7638011574745178, + "learning_rate": 5.824910077149371e-05, + "loss": 0.7178, + "step": 14324 + }, + { + "epoch": 2.550391737891738, + "grad_norm": 0.9289371967315674, + "learning_rate": 5.823638205844164e-05, + "loss": 0.8492, + "step": 14325 + }, + { + "epoch": 2.5505698005698005, + "grad_norm": 0.8494341969490051, + "learning_rate": 5.822366416366093e-05, + "loss": 0.9095, + "step": 14326 + }, + { + "epoch": 2.5507478632478633, + "grad_norm": 0.8686699867248535, + "learning_rate": 5.8210947087400746e-05, + "loss": 0.9548, + "step": 14327 + }, + { + "epoch": 2.550925925925926, + "grad_norm": 1.1318142414093018, + "learning_rate": 5.819823082991025e-05, + "loss": 1.0554, + "step": 14328 + }, + { + "epoch": 2.5511039886039883, + "grad_norm": 0.8405448198318481, + "learning_rate": 5.818551539143857e-05, + "loss": 0.704, + "step": 14329 + }, + { + "epoch": 2.551282051282051, + "grad_norm": 0.9133256673812866, + "learning_rate": 5.8172800772234856e-05, + "loss": 0.922, + "step": 14330 + }, + { + "epoch": 2.551460113960114, + "grad_norm": 0.8526531457901001, + "learning_rate": 5.816008697254824e-05, + "loss": 0.6452, + "step": 14331 + }, + { + "epoch": 2.5516381766381766, + "grad_norm": 0.7987905144691467, + "learning_rate": 5.81473739926278e-05, + "loss": 0.8159, + "step": 14332 + }, + { + "epoch": 2.5518162393162394, + "grad_norm": 0.8217538595199585, + "learning_rate": 5.813466183272257e-05, + "loss": 0.6703, + "step": 14333 + }, + { + "epoch": 2.551994301994302, + "grad_norm": 0.7654905915260315, + "learning_rate": 5.8121950493081765e-05, + "loss": 0.8711, + "step": 14334 + }, + { + "epoch": 2.552172364672365, + "grad_norm": 0.872327983379364, + "learning_rate": 
5.8109239973954264e-05, + "loss": 1.0079, + "step": 14335 + }, + { + "epoch": 2.5523504273504276, + "grad_norm": 0.7675468325614929, + "learning_rate": 5.809653027558922e-05, + "loss": 0.7541, + "step": 14336 + }, + { + "epoch": 2.55252849002849, + "grad_norm": 0.8367551565170288, + "learning_rate": 5.808382139823563e-05, + "loss": 0.9325, + "step": 14337 + }, + { + "epoch": 2.5527065527065527, + "grad_norm": 0.7946585416793823, + "learning_rate": 5.807111334214248e-05, + "loss": 0.6703, + "step": 14338 + }, + { + "epoch": 2.5528846153846154, + "grad_norm": 0.8752394318580627, + "learning_rate": 5.805840610755876e-05, + "loss": 0.82, + "step": 14339 + }, + { + "epoch": 2.553062678062678, + "grad_norm": 0.9394813776016235, + "learning_rate": 5.804569969473341e-05, + "loss": 1.0094, + "step": 14340 + }, + { + "epoch": 2.5532407407407405, + "grad_norm": 0.7763680219650269, + "learning_rate": 5.803299410391551e-05, + "loss": 0.8424, + "step": 14341 + }, + { + "epoch": 2.5534188034188032, + "grad_norm": 0.9148688912391663, + "learning_rate": 5.8020289335353816e-05, + "loss": 0.9344, + "step": 14342 + }, + { + "epoch": 2.553596866096866, + "grad_norm": 0.8666844367980957, + "learning_rate": 5.80075853892974e-05, + "loss": 0.8651, + "step": 14343 + }, + { + "epoch": 2.5537749287749287, + "grad_norm": 0.7240473628044128, + "learning_rate": 5.799488226599511e-05, + "loss": 0.6913, + "step": 14344 + }, + { + "epoch": 2.5539529914529915, + "grad_norm": 0.8949013352394104, + "learning_rate": 5.798217996569585e-05, + "loss": 0.7419, + "step": 14345 + }, + { + "epoch": 2.5541310541310542, + "grad_norm": 0.7760846614837646, + "learning_rate": 5.796947848864849e-05, + "loss": 0.8292, + "step": 14346 + }, + { + "epoch": 2.554309116809117, + "grad_norm": 0.8448507785797119, + "learning_rate": 5.795677783510187e-05, + "loss": 0.9953, + "step": 14347 + }, + { + "epoch": 2.5544871794871797, + "grad_norm": 0.834007203578949, + "learning_rate": 5.794407800530484e-05, + "loss": 0.8135, + 
"step": 14348 + }, + { + "epoch": 2.554665242165242, + "grad_norm": 0.8247915506362915, + "learning_rate": 5.793137899950629e-05, + "loss": 0.8607, + "step": 14349 + }, + { + "epoch": 2.554843304843305, + "grad_norm": 0.8796288967132568, + "learning_rate": 5.7918680817954906e-05, + "loss": 1.0479, + "step": 14350 + }, + { + "epoch": 2.5550213675213675, + "grad_norm": 0.8384763598442078, + "learning_rate": 5.790598346089964e-05, + "loss": 0.98, + "step": 14351 + }, + { + "epoch": 2.5551994301994303, + "grad_norm": 0.9394076466560364, + "learning_rate": 5.7893286928589107e-05, + "loss": 0.922, + "step": 14352 + }, + { + "epoch": 2.5553774928774926, + "grad_norm": 0.9548128843307495, + "learning_rate": 5.7880591221272184e-05, + "loss": 0.9877, + "step": 14353 + }, + { + "epoch": 2.5555555555555554, + "grad_norm": 0.7609717845916748, + "learning_rate": 5.786789633919758e-05, + "loss": 0.8115, + "step": 14354 + }, + { + "epoch": 2.555733618233618, + "grad_norm": 0.7415568232536316, + "learning_rate": 5.785520228261403e-05, + "loss": 0.6336, + "step": 14355 + }, + { + "epoch": 2.555911680911681, + "grad_norm": 0.8595952391624451, + "learning_rate": 5.7842509051770246e-05, + "loss": 0.8065, + "step": 14356 + }, + { + "epoch": 2.5560897435897436, + "grad_norm": 1.0075218677520752, + "learning_rate": 5.782981664691491e-05, + "loss": 0.9967, + "step": 14357 + }, + { + "epoch": 2.5562678062678064, + "grad_norm": 0.8405288457870483, + "learning_rate": 5.781712506829669e-05, + "loss": 0.7953, + "step": 14358 + }, + { + "epoch": 2.556445868945869, + "grad_norm": 0.8259321451187134, + "learning_rate": 5.780443431616435e-05, + "loss": 0.925, + "step": 14359 + }, + { + "epoch": 2.556623931623932, + "grad_norm": 0.8155162334442139, + "learning_rate": 5.7791744390766376e-05, + "loss": 0.9658, + "step": 14360 + }, + { + "epoch": 2.556801994301994, + "grad_norm": 0.8670404553413391, + "learning_rate": 5.7779055292351545e-05, + "loss": 1.0029, + "step": 14361 + }, + { + "epoch": 
2.556980056980057, + "grad_norm": 0.8574714660644531, + "learning_rate": 5.7766367021168423e-05, + "loss": 0.9208, + "step": 14362 + }, + { + "epoch": 2.5571581196581197, + "grad_norm": 1.0231248140335083, + "learning_rate": 5.775367957746556e-05, + "loss": 1.0422, + "step": 14363 + }, + { + "epoch": 2.5573361823361824, + "grad_norm": 0.8403676152229309, + "learning_rate": 5.7740992961491655e-05, + "loss": 0.8068, + "step": 14364 + }, + { + "epoch": 2.557514245014245, + "grad_norm": 0.8792767524719238, + "learning_rate": 5.7728307173495136e-05, + "loss": 1.0405, + "step": 14365 + }, + { + "epoch": 2.5576923076923075, + "grad_norm": 0.8546510934829712, + "learning_rate": 5.771562221372471e-05, + "loss": 0.8246, + "step": 14366 + }, + { + "epoch": 2.5578703703703702, + "grad_norm": 0.7620588541030884, + "learning_rate": 5.770293808242875e-05, + "loss": 0.7588, + "step": 14367 + }, + { + "epoch": 2.558048433048433, + "grad_norm": 0.8154500722885132, + "learning_rate": 5.769025477985588e-05, + "loss": 0.8217, + "step": 14368 + }, + { + "epoch": 2.5582264957264957, + "grad_norm": 0.8630158305168152, + "learning_rate": 5.767757230625459e-05, + "loss": 0.8486, + "step": 14369 + }, + { + "epoch": 2.5584045584045585, + "grad_norm": 0.8991047143936157, + "learning_rate": 5.766489066187335e-05, + "loss": 0.9012, + "step": 14370 + }, + { + "epoch": 2.5585826210826212, + "grad_norm": 1.056725263595581, + "learning_rate": 5.7652209846960626e-05, + "loss": 0.8764, + "step": 14371 + }, + { + "epoch": 2.558760683760684, + "grad_norm": 0.7467330694198608, + "learning_rate": 5.7639529861764885e-05, + "loss": 0.6614, + "step": 14372 + }, + { + "epoch": 2.5589387464387463, + "grad_norm": 0.7930710315704346, + "learning_rate": 5.762685070653453e-05, + "loss": 0.6866, + "step": 14373 + }, + { + "epoch": 2.559116809116809, + "grad_norm": 0.9234277606010437, + "learning_rate": 5.7614172381518085e-05, + "loss": 0.9158, + "step": 14374 + }, + { + "epoch": 2.559294871794872, + "grad_norm": 
1.0100786685943604, + "learning_rate": 5.7601494886963806e-05, + "loss": 0.9061, + "step": 14375 + }, + { + "epoch": 2.5594729344729346, + "grad_norm": 0.9864867925643921, + "learning_rate": 5.758881822312023e-05, + "loss": 0.9955, + "step": 14376 + }, + { + "epoch": 2.5596509971509973, + "grad_norm": 0.7328418493270874, + "learning_rate": 5.757614239023559e-05, + "loss": 0.874, + "step": 14377 + }, + { + "epoch": 2.5598290598290596, + "grad_norm": 0.8538700938224792, + "learning_rate": 5.7563467388558355e-05, + "loss": 0.8251, + "step": 14378 + }, + { + "epoch": 2.5600071225071224, + "grad_norm": 0.7603667378425598, + "learning_rate": 5.755079321833681e-05, + "loss": 0.8466, + "step": 14379 + }, + { + "epoch": 2.560185185185185, + "grad_norm": 0.8983954787254333, + "learning_rate": 5.753811987981925e-05, + "loss": 0.754, + "step": 14380 + }, + { + "epoch": 2.560363247863248, + "grad_norm": 0.8304823040962219, + "learning_rate": 5.752544737325411e-05, + "loss": 0.7057, + "step": 14381 + }, + { + "epoch": 2.5605413105413106, + "grad_norm": 0.8694877028465271, + "learning_rate": 5.751277569888952e-05, + "loss": 0.843, + "step": 14382 + }, + { + "epoch": 2.5607193732193734, + "grad_norm": 0.7965344786643982, + "learning_rate": 5.750010485697387e-05, + "loss": 0.7679, + "step": 14383 + }, + { + "epoch": 2.560897435897436, + "grad_norm": 0.8181809782981873, + "learning_rate": 5.7487434847755386e-05, + "loss": 0.8408, + "step": 14384 + }, + { + "epoch": 2.5610754985754984, + "grad_norm": 0.8492250442504883, + "learning_rate": 5.747476567148229e-05, + "loss": 0.6768, + "step": 14385 + }, + { + "epoch": 2.561253561253561, + "grad_norm": 0.9129379987716675, + "learning_rate": 5.746209732840282e-05, + "loss": 0.8804, + "step": 14386 + }, + { + "epoch": 2.561431623931624, + "grad_norm": 0.8701111078262329, + "learning_rate": 5.74494298187652e-05, + "loss": 1.0908, + "step": 14387 + }, + { + "epoch": 2.5616096866096867, + "grad_norm": 0.9152243733406067, + "learning_rate": 
5.7436763142817606e-05, + "loss": 0.8197, + "step": 14388 + }, + { + "epoch": 2.5617877492877494, + "grad_norm": 0.8663429617881775, + "learning_rate": 5.742409730080822e-05, + "loss": 0.8065, + "step": 14389 + }, + { + "epoch": 2.5619658119658117, + "grad_norm": 0.8722090721130371, + "learning_rate": 5.741143229298516e-05, + "loss": 0.8634, + "step": 14390 + }, + { + "epoch": 2.5621438746438745, + "grad_norm": 0.8126732707023621, + "learning_rate": 5.7398768119596704e-05, + "loss": 0.831, + "step": 14391 + }, + { + "epoch": 2.5623219373219372, + "grad_norm": 0.9060684442520142, + "learning_rate": 5.7386104780890794e-05, + "loss": 0.8757, + "step": 14392 + }, + { + "epoch": 2.5625, + "grad_norm": 0.947692334651947, + "learning_rate": 5.7373442277115696e-05, + "loss": 0.8606, + "step": 14393 + }, + { + "epoch": 2.5626780626780628, + "grad_norm": 0.8826618790626526, + "learning_rate": 5.736078060851944e-05, + "loss": 0.9942, + "step": 14394 + }, + { + "epoch": 2.5628561253561255, + "grad_norm": 0.915372908115387, + "learning_rate": 5.734811977535011e-05, + "loss": 0.7871, + "step": 14395 + }, + { + "epoch": 2.5630341880341883, + "grad_norm": 0.8202184438705444, + "learning_rate": 5.733545977785577e-05, + "loss": 0.8889, + "step": 14396 + }, + { + "epoch": 2.5632122507122506, + "grad_norm": 1.0160186290740967, + "learning_rate": 5.7322800616284475e-05, + "loss": 0.8973, + "step": 14397 + }, + { + "epoch": 2.5633903133903133, + "grad_norm": 0.848753809928894, + "learning_rate": 5.7310142290884206e-05, + "loss": 0.9517, + "step": 14398 + }, + { + "epoch": 2.563568376068376, + "grad_norm": 0.7473777532577515, + "learning_rate": 5.72974848019031e-05, + "loss": 0.6549, + "step": 14399 + }, + { + "epoch": 2.563746438746439, + "grad_norm": 0.7396529316902161, + "learning_rate": 5.728482814958899e-05, + "loss": 0.7814, + "step": 14400 + }, + { + "epoch": 2.5639245014245016, + "grad_norm": 0.8617672920227051, + "learning_rate": 5.727217233418998e-05, + "loss": 0.885, + "step": 
14401 + }, + { + "epoch": 2.564102564102564, + "grad_norm": 0.920477569103241, + "learning_rate": 5.7259517355953984e-05, + "loss": 1.0269, + "step": 14402 + }, + { + "epoch": 2.5642806267806266, + "grad_norm": 0.8460386991500854, + "learning_rate": 5.7246863215128975e-05, + "loss": 0.7418, + "step": 14403 + }, + { + "epoch": 2.5644586894586894, + "grad_norm": 0.8857694268226624, + "learning_rate": 5.723420991196287e-05, + "loss": 0.7771, + "step": 14404 + }, + { + "epoch": 2.564636752136752, + "grad_norm": 0.9912863969802856, + "learning_rate": 5.722155744670352e-05, + "loss": 1.101, + "step": 14405 + }, + { + "epoch": 2.564814814814815, + "grad_norm": 0.8062789440155029, + "learning_rate": 5.720890581959899e-05, + "loss": 0.8602, + "step": 14406 + }, + { + "epoch": 2.5649928774928776, + "grad_norm": 0.8620314598083496, + "learning_rate": 5.719625503089698e-05, + "loss": 0.9433, + "step": 14407 + }, + { + "epoch": 2.5651709401709404, + "grad_norm": 0.8119623064994812, + "learning_rate": 5.718360508084546e-05, + "loss": 0.884, + "step": 14408 + }, + { + "epoch": 2.5653490028490027, + "grad_norm": 0.7872169613838196, + "learning_rate": 5.7170955969692265e-05, + "loss": 0.8247, + "step": 14409 + }, + { + "epoch": 2.5655270655270654, + "grad_norm": 0.8314040303230286, + "learning_rate": 5.715830769768522e-05, + "loss": 0.8643, + "step": 14410 + }, + { + "epoch": 2.565705128205128, + "grad_norm": 0.9003102779388428, + "learning_rate": 5.7145660265072145e-05, + "loss": 0.9426, + "step": 14411 + }, + { + "epoch": 2.565883190883191, + "grad_norm": 0.6572127938270569, + "learning_rate": 5.713301367210082e-05, + "loss": 0.4852, + "step": 14412 + }, + { + "epoch": 2.5660612535612537, + "grad_norm": 0.9557960629463196, + "learning_rate": 5.7120367919019044e-05, + "loss": 0.9281, + "step": 14413 + }, + { + "epoch": 2.566239316239316, + "grad_norm": 0.9009736180305481, + "learning_rate": 5.71077230060746e-05, + "loss": 1.0154, + "step": 14414 + }, + { + "epoch": 
2.5664173789173788, + "grad_norm": 0.8672121167182922, + "learning_rate": 5.7095078933515175e-05, + "loss": 0.805, + "step": 14415 + }, + { + "epoch": 2.5665954415954415, + "grad_norm": 0.9077832698822021, + "learning_rate": 5.708243570158862e-05, + "loss": 0.7446, + "step": 14416 + }, + { + "epoch": 2.5667735042735043, + "grad_norm": 0.850246787071228, + "learning_rate": 5.706979331054252e-05, + "loss": 0.7773, + "step": 14417 + }, + { + "epoch": 2.566951566951567, + "grad_norm": 0.803983211517334, + "learning_rate": 5.705715176062467e-05, + "loss": 0.9361, + "step": 14418 + }, + { + "epoch": 2.5671296296296298, + "grad_norm": 0.8956922888755798, + "learning_rate": 5.704451105208273e-05, + "loss": 0.8962, + "step": 14419 + }, + { + "epoch": 2.5673076923076925, + "grad_norm": 0.8994067907333374, + "learning_rate": 5.703187118516433e-05, + "loss": 0.9902, + "step": 14420 + }, + { + "epoch": 2.567485754985755, + "grad_norm": 0.7383418679237366, + "learning_rate": 5.701923216011722e-05, + "loss": 0.8188, + "step": 14421 + }, + { + "epoch": 2.5676638176638176, + "grad_norm": 0.8397318720817566, + "learning_rate": 5.70065939771889e-05, + "loss": 0.8557, + "step": 14422 + }, + { + "epoch": 2.5678418803418803, + "grad_norm": 0.8804301023483276, + "learning_rate": 5.699395663662714e-05, + "loss": 0.7248, + "step": 14423 + }, + { + "epoch": 2.568019943019943, + "grad_norm": 0.8391412496566772, + "learning_rate": 5.698132013867938e-05, + "loss": 0.6986, + "step": 14424 + }, + { + "epoch": 2.568198005698006, + "grad_norm": 0.7337331771850586, + "learning_rate": 5.6968684483593334e-05, + "loss": 0.7911, + "step": 14425 + }, + { + "epoch": 2.568376068376068, + "grad_norm": 1.006412386894226, + "learning_rate": 5.695604967161652e-05, + "loss": 1.0131, + "step": 14426 + }, + { + "epoch": 2.568554131054131, + "grad_norm": 0.7777771353721619, + "learning_rate": 5.6943415702996494e-05, + "loss": 0.67, + "step": 14427 + }, + { + "epoch": 2.5687321937321936, + "grad_norm": 
0.8864775896072388, + "learning_rate": 5.6930782577980803e-05, + "loss": 0.9513, + "step": 14428 + }, + { + "epoch": 2.5689102564102564, + "grad_norm": 0.8505052328109741, + "learning_rate": 5.691815029681695e-05, + "loss": 0.7213, + "step": 14429 + }, + { + "epoch": 2.569088319088319, + "grad_norm": 0.705781877040863, + "learning_rate": 5.6905518859752416e-05, + "loss": 0.8273, + "step": 14430 + }, + { + "epoch": 2.569266381766382, + "grad_norm": 0.7157384753227234, + "learning_rate": 5.689288826703479e-05, + "loss": 0.6854, + "step": 14431 + }, + { + "epoch": 2.5694444444444446, + "grad_norm": 0.871244490146637, + "learning_rate": 5.68802585189114e-05, + "loss": 0.8786, + "step": 14432 + }, + { + "epoch": 2.5696225071225074, + "grad_norm": 0.8742622137069702, + "learning_rate": 5.686762961562981e-05, + "loss": 0.7253, + "step": 14433 + }, + { + "epoch": 2.5698005698005697, + "grad_norm": 0.8194206357002258, + "learning_rate": 5.685500155743742e-05, + "loss": 0.9028, + "step": 14434 + }, + { + "epoch": 2.5699786324786325, + "grad_norm": 0.7505850195884705, + "learning_rate": 5.684237434458164e-05, + "loss": 0.7711, + "step": 14435 + }, + { + "epoch": 2.570156695156695, + "grad_norm": 0.9128859639167786, + "learning_rate": 5.6829747977309885e-05, + "loss": 0.8495, + "step": 14436 + }, + { + "epoch": 2.570334757834758, + "grad_norm": 0.6996384263038635, + "learning_rate": 5.681712245586954e-05, + "loss": 0.6938, + "step": 14437 + }, + { + "epoch": 2.5705128205128203, + "grad_norm": 0.8720461130142212, + "learning_rate": 5.680449778050798e-05, + "loss": 1.0547, + "step": 14438 + }, + { + "epoch": 2.570690883190883, + "grad_norm": 0.7767693996429443, + "learning_rate": 5.6791873951472544e-05, + "loss": 0.8718, + "step": 14439 + }, + { + "epoch": 2.5708689458689458, + "grad_norm": 0.8596739768981934, + "learning_rate": 5.6779250969010554e-05, + "loss": 0.792, + "step": 14440 + }, + { + "epoch": 2.5710470085470085, + "grad_norm": 1.0065197944641113, + "learning_rate": 
5.676662883336939e-05, + "loss": 0.9199, + "step": 14441 + }, + { + "epoch": 2.5712250712250713, + "grad_norm": 0.8707680702209473, + "learning_rate": 5.6754007544796316e-05, + "loss": 0.9696, + "step": 14442 + }, + { + "epoch": 2.571403133903134, + "grad_norm": 0.8060235977172852, + "learning_rate": 5.674138710353865e-05, + "loss": 0.6626, + "step": 14443 + }, + { + "epoch": 2.5715811965811968, + "grad_norm": 0.7640239596366882, + "learning_rate": 5.6728767509843627e-05, + "loss": 0.8177, + "step": 14444 + }, + { + "epoch": 2.5717592592592595, + "grad_norm": 1.0821335315704346, + "learning_rate": 5.671614876395848e-05, + "loss": 1.0084, + "step": 14445 + }, + { + "epoch": 2.571937321937322, + "grad_norm": 0.874721884727478, + "learning_rate": 5.670353086613056e-05, + "loss": 0.9508, + "step": 14446 + }, + { + "epoch": 2.5721153846153846, + "grad_norm": 0.7837753891944885, + "learning_rate": 5.669091381660694e-05, + "loss": 0.6546, + "step": 14447 + }, + { + "epoch": 2.5722934472934473, + "grad_norm": 0.832924485206604, + "learning_rate": 5.6678297615634965e-05, + "loss": 0.9055, + "step": 14448 + }, + { + "epoch": 2.57247150997151, + "grad_norm": 0.8463562726974487, + "learning_rate": 5.6665682263461696e-05, + "loss": 0.8234, + "step": 14449 + }, + { + "epoch": 2.5726495726495724, + "grad_norm": 0.8785214424133301, + "learning_rate": 5.6653067760334386e-05, + "loss": 0.8478, + "step": 14450 + }, + { + "epoch": 2.572827635327635, + "grad_norm": 0.7375151515007019, + "learning_rate": 5.664045410650017e-05, + "loss": 0.8629, + "step": 14451 + }, + { + "epoch": 2.573005698005698, + "grad_norm": 0.7428547143936157, + "learning_rate": 5.6627841302206196e-05, + "loss": 0.9198, + "step": 14452 + }, + { + "epoch": 2.5731837606837606, + "grad_norm": 0.7373468279838562, + "learning_rate": 5.661522934769956e-05, + "loss": 0.6931, + "step": 14453 + }, + { + "epoch": 2.5733618233618234, + "grad_norm": 0.9162034392356873, + "learning_rate": 5.660261824322739e-05, + "loss": 
0.9971, + "step": 14454 + }, + { + "epoch": 2.573539886039886, + "grad_norm": 0.7816632390022278, + "learning_rate": 5.659000798903672e-05, + "loss": 1.0481, + "step": 14455 + }, + { + "epoch": 2.573717948717949, + "grad_norm": 0.8594158291816711, + "learning_rate": 5.657739858537474e-05, + "loss": 1.1846, + "step": 14456 + }, + { + "epoch": 2.5738960113960117, + "grad_norm": 0.8171747922897339, + "learning_rate": 5.656479003248836e-05, + "loss": 0.8435, + "step": 14457 + }, + { + "epoch": 2.574074074074074, + "grad_norm": 0.8568267822265625, + "learning_rate": 5.6552182330624784e-05, + "loss": 1.031, + "step": 14458 + }, + { + "epoch": 2.5742521367521367, + "grad_norm": 0.8238523602485657, + "learning_rate": 5.653957548003084e-05, + "loss": 0.8917, + "step": 14459 + }, + { + "epoch": 2.5744301994301995, + "grad_norm": 0.7226746082305908, + "learning_rate": 5.652696948095369e-05, + "loss": 0.832, + "step": 14460 + }, + { + "epoch": 2.574608262108262, + "grad_norm": 0.9448554515838623, + "learning_rate": 5.651436433364024e-05, + "loss": 0.9696, + "step": 14461 + }, + { + "epoch": 2.5747863247863245, + "grad_norm": 0.9404924511909485, + "learning_rate": 5.650176003833747e-05, + "loss": 0.9813, + "step": 14462 + }, + { + "epoch": 2.5749643874643873, + "grad_norm": 0.9445366859436035, + "learning_rate": 5.648915659529241e-05, + "loss": 0.7205, + "step": 14463 + }, + { + "epoch": 2.57514245014245, + "grad_norm": 0.9205772876739502, + "learning_rate": 5.647655400475189e-05, + "loss": 0.958, + "step": 14464 + }, + { + "epoch": 2.5753205128205128, + "grad_norm": 0.9025790691375732, + "learning_rate": 5.646395226696291e-05, + "loss": 0.9107, + "step": 14465 + }, + { + "epoch": 2.5754985754985755, + "grad_norm": 0.9562451839447021, + "learning_rate": 5.645135138217235e-05, + "loss": 0.7618, + "step": 14466 + }, + { + "epoch": 2.5756766381766383, + "grad_norm": 0.8896244764328003, + "learning_rate": 5.6438751350627085e-05, + "loss": 0.9696, + "step": 14467 + }, + { + "epoch": 
2.575854700854701, + "grad_norm": 0.9051744937896729, + "learning_rate": 5.6426152172574e-05, + "loss": 0.9537, + "step": 14468 + }, + { + "epoch": 2.576032763532764, + "grad_norm": 0.844556450843811, + "learning_rate": 5.641355384825995e-05, + "loss": 0.8686, + "step": 14469 + }, + { + "epoch": 2.576210826210826, + "grad_norm": 0.7751742601394653, + "learning_rate": 5.6400956377931726e-05, + "loss": 0.8373, + "step": 14470 + }, + { + "epoch": 2.576388888888889, + "grad_norm": 0.8988052010536194, + "learning_rate": 5.638835976183627e-05, + "loss": 0.8661, + "step": 14471 + }, + { + "epoch": 2.5765669515669516, + "grad_norm": 0.9114456176757812, + "learning_rate": 5.637576400022023e-05, + "loss": 1.0583, + "step": 14472 + }, + { + "epoch": 2.5767450142450143, + "grad_norm": 0.8742861151695251, + "learning_rate": 5.636316909333056e-05, + "loss": 0.8392, + "step": 14473 + }, + { + "epoch": 2.5769230769230766, + "grad_norm": 0.8418447375297546, + "learning_rate": 5.6350575041413854e-05, + "loss": 0.7494, + "step": 14474 + }, + { + "epoch": 2.5771011396011394, + "grad_norm": 0.9942673444747925, + "learning_rate": 5.633798184471701e-05, + "loss": 1.0183, + "step": 14475 + }, + { + "epoch": 2.577279202279202, + "grad_norm": 0.7663289308547974, + "learning_rate": 5.63253895034867e-05, + "loss": 0.7551, + "step": 14476 + }, + { + "epoch": 2.577457264957265, + "grad_norm": 0.8866778016090393, + "learning_rate": 5.631279801796966e-05, + "loss": 0.8623, + "step": 14477 + }, + { + "epoch": 2.5776353276353277, + "grad_norm": 0.9198449850082397, + "learning_rate": 5.6300207388412595e-05, + "loss": 1.0388, + "step": 14478 + }, + { + "epoch": 2.5778133903133904, + "grad_norm": 0.8202611804008484, + "learning_rate": 5.628761761506214e-05, + "loss": 0.7556, + "step": 14479 + }, + { + "epoch": 2.577991452991453, + "grad_norm": 0.751899003982544, + "learning_rate": 5.627502869816505e-05, + "loss": 0.8231, + "step": 14480 + }, + { + "epoch": 2.578169515669516, + "grad_norm": 
1.0094623565673828, + "learning_rate": 5.626244063796795e-05, + "loss": 0.9778, + "step": 14481 + }, + { + "epoch": 2.578347578347578, + "grad_norm": 0.8163259625434875, + "learning_rate": 5.624985343471747e-05, + "loss": 0.8355, + "step": 14482 + }, + { + "epoch": 2.578525641025641, + "grad_norm": 0.8190516829490662, + "learning_rate": 5.623726708866023e-05, + "loss": 0.7736, + "step": 14483 + }, + { + "epoch": 2.5787037037037037, + "grad_norm": 0.884303629398346, + "learning_rate": 5.622468160004283e-05, + "loss": 0.8618, + "step": 14484 + }, + { + "epoch": 2.5788817663817665, + "grad_norm": 0.8564121723175049, + "learning_rate": 5.621209696911185e-05, + "loss": 0.9691, + "step": 14485 + }, + { + "epoch": 2.5790598290598292, + "grad_norm": 0.8122418522834778, + "learning_rate": 5.619951319611388e-05, + "loss": 0.7539, + "step": 14486 + }, + { + "epoch": 2.5792378917378915, + "grad_norm": 0.764470636844635, + "learning_rate": 5.6186930281295425e-05, + "loss": 0.7065, + "step": 14487 + }, + { + "epoch": 2.5794159544159543, + "grad_norm": 0.7477477192878723, + "learning_rate": 5.617434822490313e-05, + "loss": 0.7456, + "step": 14488 + }, + { + "epoch": 2.579594017094017, + "grad_norm": 0.9460917711257935, + "learning_rate": 5.616176702718335e-05, + "loss": 0.8427, + "step": 14489 + }, + { + "epoch": 2.57977207977208, + "grad_norm": 0.858561098575592, + "learning_rate": 5.614918668838274e-05, + "loss": 0.8913, + "step": 14490 + }, + { + "epoch": 2.5799501424501425, + "grad_norm": 0.8664894104003906, + "learning_rate": 5.613660720874772e-05, + "loss": 0.9211, + "step": 14491 + }, + { + "epoch": 2.5801282051282053, + "grad_norm": 0.8475569486618042, + "learning_rate": 5.612402858852475e-05, + "loss": 0.8149, + "step": 14492 + }, + { + "epoch": 2.580306267806268, + "grad_norm": 0.9543033838272095, + "learning_rate": 5.6111450827960296e-05, + "loss": 0.679, + "step": 14493 + }, + { + "epoch": 2.5804843304843303, + "grad_norm": 0.8219496011734009, + "learning_rate": 
5.60988739273008e-05, + "loss": 0.9586, + "step": 14494 + }, + { + "epoch": 2.580662393162393, + "grad_norm": 0.8432445526123047, + "learning_rate": 5.6086297886792684e-05, + "loss": 0.8939, + "step": 14495 + }, + { + "epoch": 2.580840455840456, + "grad_norm": 0.9026654362678528, + "learning_rate": 5.607372270668232e-05, + "loss": 0.7422, + "step": 14496 + }, + { + "epoch": 2.5810185185185186, + "grad_norm": 1.0681802034378052, + "learning_rate": 5.606114838721608e-05, + "loss": 0.8208, + "step": 14497 + }, + { + "epoch": 2.5811965811965814, + "grad_norm": 0.8807427883148193, + "learning_rate": 5.604857492864044e-05, + "loss": 0.8463, + "step": 14498 + }, + { + "epoch": 2.5813746438746437, + "grad_norm": 0.7520862221717834, + "learning_rate": 5.603600233120159e-05, + "loss": 0.6691, + "step": 14499 + }, + { + "epoch": 2.5815527065527064, + "grad_norm": 0.8214079737663269, + "learning_rate": 5.602343059514599e-05, + "loss": 0.8416, + "step": 14500 + }, + { + "epoch": 2.581730769230769, + "grad_norm": 0.9263389110565186, + "learning_rate": 5.601085972071991e-05, + "loss": 1.1466, + "step": 14501 + }, + { + "epoch": 2.581908831908832, + "grad_norm": 0.8501101136207581, + "learning_rate": 5.5998289708169626e-05, + "loss": 0.855, + "step": 14502 + }, + { + "epoch": 2.5820868945868947, + "grad_norm": 0.8312939405441284, + "learning_rate": 5.598572055774152e-05, + "loss": 0.9843, + "step": 14503 + }, + { + "epoch": 2.5822649572649574, + "grad_norm": 0.7309035658836365, + "learning_rate": 5.5973152269681714e-05, + "loss": 0.813, + "step": 14504 + }, + { + "epoch": 2.58244301994302, + "grad_norm": 0.8962578177452087, + "learning_rate": 5.596058484423656e-05, + "loss": 0.7619, + "step": 14505 + }, + { + "epoch": 2.5826210826210825, + "grad_norm": 0.7805112600326538, + "learning_rate": 5.594801828165228e-05, + "loss": 1.1011, + "step": 14506 + }, + { + "epoch": 2.5827991452991452, + "grad_norm": 1.224509358406067, + "learning_rate": 5.593545258217505e-05, + "loss": 0.9764, + 
"step": 14507 + }, + { + "epoch": 2.582977207977208, + "grad_norm": 0.8085877895355225, + "learning_rate": 5.59228877460511e-05, + "loss": 0.9324, + "step": 14508 + }, + { + "epoch": 2.5831552706552707, + "grad_norm": 0.7962629199028015, + "learning_rate": 5.591032377352661e-05, + "loss": 0.6294, + "step": 14509 + }, + { + "epoch": 2.5833333333333335, + "grad_norm": 0.8638611435890198, + "learning_rate": 5.589776066484773e-05, + "loss": 0.7355, + "step": 14510 + }, + { + "epoch": 2.583511396011396, + "grad_norm": 0.8975821733474731, + "learning_rate": 5.588519842026061e-05, + "loss": 1.0264, + "step": 14511 + }, + { + "epoch": 2.5836894586894585, + "grad_norm": 0.8327218890190125, + "learning_rate": 5.5872637040011355e-05, + "loss": 0.8864, + "step": 14512 + }, + { + "epoch": 2.5838675213675213, + "grad_norm": 0.8141334652900696, + "learning_rate": 5.5860076524346197e-05, + "loss": 1.0277, + "step": 14513 + }, + { + "epoch": 2.584045584045584, + "grad_norm": 0.8557519316673279, + "learning_rate": 5.584751687351105e-05, + "loss": 0.9215, + "step": 14514 + }, + { + "epoch": 2.584223646723647, + "grad_norm": 0.902601957321167, + "learning_rate": 5.583495808775214e-05, + "loss": 0.8527, + "step": 14515 + }, + { + "epoch": 2.5844017094017095, + "grad_norm": 0.826359212398529, + "learning_rate": 5.582240016731548e-05, + "loss": 0.8524, + "step": 14516 + }, + { + "epoch": 2.5845797720797723, + "grad_norm": 0.7099179029464722, + "learning_rate": 5.580984311244713e-05, + "loss": 0.6923, + "step": 14517 + }, + { + "epoch": 2.5847578347578346, + "grad_norm": 0.829795777797699, + "learning_rate": 5.5797286923393086e-05, + "loss": 0.7211, + "step": 14518 + }, + { + "epoch": 2.5849358974358974, + "grad_norm": 0.8006768226623535, + "learning_rate": 5.5784731600399355e-05, + "loss": 0.7237, + "step": 14519 + }, + { + "epoch": 2.58511396011396, + "grad_norm": 0.7596119046211243, + "learning_rate": 5.577217714371203e-05, + "loss": 0.7651, + "step": 14520 + }, + { + "epoch": 
2.585292022792023, + "grad_norm": 0.7901585102081299, + "learning_rate": 5.575962355357694e-05, + "loss": 0.7672, + "step": 14521 + }, + { + "epoch": 2.5854700854700856, + "grad_norm": 0.8586403131484985, + "learning_rate": 5.574707083024018e-05, + "loss": 1.084, + "step": 14522 + }, + { + "epoch": 2.585648148148148, + "grad_norm": 0.7670607566833496, + "learning_rate": 5.5734518973947616e-05, + "loss": 0.7929, + "step": 14523 + }, + { + "epoch": 2.5858262108262107, + "grad_norm": 0.8114384412765503, + "learning_rate": 5.572196798494522e-05, + "loss": 0.8154, + "step": 14524 + }, + { + "epoch": 2.5860042735042734, + "grad_norm": 0.8050188422203064, + "learning_rate": 5.570941786347888e-05, + "loss": 0.7969, + "step": 14525 + }, + { + "epoch": 2.586182336182336, + "grad_norm": 0.8641461133956909, + "learning_rate": 5.569686860979447e-05, + "loss": 0.8469, + "step": 14526 + }, + { + "epoch": 2.586360398860399, + "grad_norm": 0.7644940614700317, + "learning_rate": 5.568432022413787e-05, + "loss": 0.563, + "step": 14527 + }, + { + "epoch": 2.5865384615384617, + "grad_norm": 0.7620565891265869, + "learning_rate": 5.567177270675503e-05, + "loss": 0.657, + "step": 14528 + }, + { + "epoch": 2.5867165242165244, + "grad_norm": 0.8371306657791138, + "learning_rate": 5.5659226057891634e-05, + "loss": 0.8862, + "step": 14529 + }, + { + "epoch": 2.5868945868945867, + "grad_norm": 0.8996389508247375, + "learning_rate": 5.564668027779367e-05, + "loss": 0.6031, + "step": 14530 + }, + { + "epoch": 2.5870726495726495, + "grad_norm": 0.8691734671592712, + "learning_rate": 5.5634135366706806e-05, + "loss": 0.9198, + "step": 14531 + }, + { + "epoch": 2.5872507122507122, + "grad_norm": 0.8926620483398438, + "learning_rate": 5.562159132487693e-05, + "loss": 0.8691, + "step": 14532 + }, + { + "epoch": 2.587428774928775, + "grad_norm": 1.0852068662643433, + "learning_rate": 5.5609048152549794e-05, + "loss": 1.2338, + "step": 14533 + }, + { + "epoch": 2.5876068376068377, + "grad_norm": 
0.7894790172576904, + "learning_rate": 5.5596505849971124e-05, + "loss": 0.907, + "step": 14534 + }, + { + "epoch": 2.5877849002849, + "grad_norm": 0.8084964156150818, + "learning_rate": 5.558396441738669e-05, + "loss": 0.9082, + "step": 14535 + }, + { + "epoch": 2.587962962962963, + "grad_norm": 1.0563920736312866, + "learning_rate": 5.557142385504222e-05, + "loss": 1.0364, + "step": 14536 + }, + { + "epoch": 2.5881410256410255, + "grad_norm": 0.7996996641159058, + "learning_rate": 5.5558884163183354e-05, + "loss": 0.925, + "step": 14537 + }, + { + "epoch": 2.5883190883190883, + "grad_norm": 0.7493244409561157, + "learning_rate": 5.5546345342055916e-05, + "loss": 0.9516, + "step": 14538 + }, + { + "epoch": 2.588497150997151, + "grad_norm": 0.8916776776313782, + "learning_rate": 5.553380739190541e-05, + "loss": 0.8164, + "step": 14539 + }, + { + "epoch": 2.588675213675214, + "grad_norm": 0.8178156614303589, + "learning_rate": 5.552127031297762e-05, + "loss": 0.905, + "step": 14540 + }, + { + "epoch": 2.5888532763532766, + "grad_norm": 0.8305806517601013, + "learning_rate": 5.550873410551816e-05, + "loss": 0.789, + "step": 14541 + }, + { + "epoch": 2.5890313390313393, + "grad_norm": 0.9307064414024353, + "learning_rate": 5.549619876977258e-05, + "loss": 0.8529, + "step": 14542 + }, + { + "epoch": 2.5892094017094016, + "grad_norm": 0.8526419401168823, + "learning_rate": 5.5483664305986614e-05, + "loss": 0.8314, + "step": 14543 + }, + { + "epoch": 2.5893874643874644, + "grad_norm": 0.884918212890625, + "learning_rate": 5.547113071440568e-05, + "loss": 0.7957, + "step": 14544 + }, + { + "epoch": 2.589565527065527, + "grad_norm": 0.7517948746681213, + "learning_rate": 5.5458597995275554e-05, + "loss": 0.7012, + "step": 14545 + }, + { + "epoch": 2.58974358974359, + "grad_norm": 0.8321232199668884, + "learning_rate": 5.5446066148841556e-05, + "loss": 1.0017, + "step": 14546 + }, + { + "epoch": 2.589921652421652, + "grad_norm": 0.8279885053634644, + "learning_rate": 
5.543353517534939e-05, + "loss": 0.987, + "step": 14547 + }, + { + "epoch": 2.590099715099715, + "grad_norm": 0.8651175498962402, + "learning_rate": 5.542100507504454e-05, + "loss": 0.8929, + "step": 14548 + }, + { + "epoch": 2.5902777777777777, + "grad_norm": 0.9273492097854614, + "learning_rate": 5.540847584817248e-05, + "loss": 0.9503, + "step": 14549 + }, + { + "epoch": 2.5904558404558404, + "grad_norm": 0.8779071569442749, + "learning_rate": 5.5395947494978696e-05, + "loss": 0.9099, + "step": 14550 + }, + { + "epoch": 2.590633903133903, + "grad_norm": 0.8860164880752563, + "learning_rate": 5.538342001570868e-05, + "loss": 0.9559, + "step": 14551 + }, + { + "epoch": 2.590811965811966, + "grad_norm": 0.9232339859008789, + "learning_rate": 5.5370893410607816e-05, + "loss": 0.9495, + "step": 14552 + }, + { + "epoch": 2.5909900284900287, + "grad_norm": 0.8176831007003784, + "learning_rate": 5.5358367679921666e-05, + "loss": 0.8897, + "step": 14553 + }, + { + "epoch": 2.5911680911680914, + "grad_norm": 0.7926605939865112, + "learning_rate": 5.5345842823895486e-05, + "loss": 0.7609, + "step": 14554 + }, + { + "epoch": 2.5913461538461537, + "grad_norm": 0.9837173819541931, + "learning_rate": 5.533331884277484e-05, + "loss": 0.7842, + "step": 14555 + }, + { + "epoch": 2.5915242165242165, + "grad_norm": 0.7303726673126221, + "learning_rate": 5.5320795736804945e-05, + "loss": 0.824, + "step": 14556 + }, + { + "epoch": 2.5917022792022792, + "grad_norm": 0.8379296660423279, + "learning_rate": 5.530827350623128e-05, + "loss": 0.8005, + "step": 14557 + }, + { + "epoch": 2.591880341880342, + "grad_norm": 0.8562047481536865, + "learning_rate": 5.529575215129916e-05, + "loss": 1.0048, + "step": 14558 + }, + { + "epoch": 2.5920584045584043, + "grad_norm": 0.7543022632598877, + "learning_rate": 5.528323167225386e-05, + "loss": 0.7543, + "step": 14559 + }, + { + "epoch": 2.592236467236467, + "grad_norm": 0.8205977082252502, + "learning_rate": 5.5270712069340847e-05, + "loss": 
0.997, + "step": 14560 + }, + { + "epoch": 2.59241452991453, + "grad_norm": 0.8566918969154358, + "learning_rate": 5.525819334280522e-05, + "loss": 0.9222, + "step": 14561 + }, + { + "epoch": 2.5925925925925926, + "grad_norm": 0.8513971567153931, + "learning_rate": 5.524567549289239e-05, + "loss": 0.7007, + "step": 14562 + }, + { + "epoch": 2.5927706552706553, + "grad_norm": 0.8939194679260254, + "learning_rate": 5.523315851984758e-05, + "loss": 0.8597, + "step": 14563 + }, + { + "epoch": 2.592948717948718, + "grad_norm": 0.7597625851631165, + "learning_rate": 5.5220642423916035e-05, + "loss": 0.7122, + "step": 14564 + }, + { + "epoch": 2.593126780626781, + "grad_norm": 0.9511955976486206, + "learning_rate": 5.5208127205342983e-05, + "loss": 1.0905, + "step": 14565 + }, + { + "epoch": 2.5933048433048436, + "grad_norm": 0.8359304070472717, + "learning_rate": 5.5195612864373626e-05, + "loss": 0.7132, + "step": 14566 + }, + { + "epoch": 2.593482905982906, + "grad_norm": 0.8302733302116394, + "learning_rate": 5.518309940125317e-05, + "loss": 0.9123, + "step": 14567 + }, + { + "epoch": 2.5936609686609686, + "grad_norm": 0.7923629283905029, + "learning_rate": 5.517058681622678e-05, + "loss": 0.8384, + "step": 14568 + }, + { + "epoch": 2.5938390313390314, + "grad_norm": 1.0625137090682983, + "learning_rate": 5.515807510953956e-05, + "loss": 1.0262, + "step": 14569 + }, + { + "epoch": 2.594017094017094, + "grad_norm": 1.0595879554748535, + "learning_rate": 5.5145564281436804e-05, + "loss": 0.9112, + "step": 14570 + }, + { + "epoch": 2.5941951566951564, + "grad_norm": 0.7307499647140503, + "learning_rate": 5.513305433216346e-05, + "loss": 0.8273, + "step": 14571 + }, + { + "epoch": 2.594373219373219, + "grad_norm": 0.9221912026405334, + "learning_rate": 5.512054526196475e-05, + "loss": 1.0679, + "step": 14572 + }, + { + "epoch": 2.594551282051282, + "grad_norm": 0.8098722100257874, + "learning_rate": 5.5108037071085725e-05, + "loss": 0.922, + "step": 14573 + }, + { + 
"epoch": 2.5947293447293447, + "grad_norm": 0.984785258769989, + "learning_rate": 5.509552975977146e-05, + "loss": 0.7525, + "step": 14574 + }, + { + "epoch": 2.5949074074074074, + "grad_norm": 0.8076850771903992, + "learning_rate": 5.5083023328267006e-05, + "loss": 1.008, + "step": 14575 + }, + { + "epoch": 2.59508547008547, + "grad_norm": 0.8375436067581177, + "learning_rate": 5.507051777681741e-05, + "loss": 0.8822, + "step": 14576 + }, + { + "epoch": 2.595263532763533, + "grad_norm": 0.779228687286377, + "learning_rate": 5.505801310566764e-05, + "loss": 0.8072, + "step": 14577 + }, + { + "epoch": 2.5954415954415957, + "grad_norm": 0.7347875833511353, + "learning_rate": 5.504550931506278e-05, + "loss": 0.7796, + "step": 14578 + }, + { + "epoch": 2.595619658119658, + "grad_norm": 0.8229580521583557, + "learning_rate": 5.503300640524779e-05, + "loss": 0.9337, + "step": 14579 + }, + { + "epoch": 2.5957977207977208, + "grad_norm": 0.8643096089363098, + "learning_rate": 5.502050437646762e-05, + "loss": 0.9101, + "step": 14580 + }, + { + "epoch": 2.5959757834757835, + "grad_norm": 0.769158661365509, + "learning_rate": 5.500800322896723e-05, + "loss": 0.8417, + "step": 14581 + }, + { + "epoch": 2.5961538461538463, + "grad_norm": 0.7792086005210876, + "learning_rate": 5.4995502962991566e-05, + "loss": 0.6965, + "step": 14582 + }, + { + "epoch": 2.5963319088319086, + "grad_norm": 0.7833219170570374, + "learning_rate": 5.498300357878552e-05, + "loss": 0.641, + "step": 14583 + }, + { + "epoch": 2.5965099715099713, + "grad_norm": 0.9491978287696838, + "learning_rate": 5.4970505076593956e-05, + "loss": 0.9229, + "step": 14584 + }, + { + "epoch": 2.596688034188034, + "grad_norm": 0.9128090739250183, + "learning_rate": 5.495800745666191e-05, + "loss": 0.8047, + "step": 14585 + }, + { + "epoch": 2.596866096866097, + "grad_norm": 0.9235281944274902, + "learning_rate": 5.494551071923404e-05, + "loss": 0.961, + "step": 14586 + }, + { + "epoch": 2.5970441595441596, + "grad_norm": 
0.8582631349563599, + "learning_rate": 5.493301486455536e-05, + "loss": 0.7203, + "step": 14587 + }, + { + "epoch": 2.5972222222222223, + "grad_norm": 0.9605505466461182, + "learning_rate": 5.4920519892870605e-05, + "loss": 0.8315, + "step": 14588 + }, + { + "epoch": 2.597400284900285, + "grad_norm": 0.9344304203987122, + "learning_rate": 5.490802580442462e-05, + "loss": 0.9031, + "step": 14589 + }, + { + "epoch": 2.597578347578348, + "grad_norm": 1.0027791261672974, + "learning_rate": 5.4895532599462216e-05, + "loss": 1.0361, + "step": 14590 + }, + { + "epoch": 2.59775641025641, + "grad_norm": 0.8774647116661072, + "learning_rate": 5.488304027822815e-05, + "loss": 0.9533, + "step": 14591 + }, + { + "epoch": 2.597934472934473, + "grad_norm": 0.886246919631958, + "learning_rate": 5.487054884096718e-05, + "loss": 0.8588, + "step": 14592 + }, + { + "epoch": 2.5981125356125356, + "grad_norm": 0.8963425755500793, + "learning_rate": 5.485805828792408e-05, + "loss": 0.8685, + "step": 14593 + }, + { + "epoch": 2.5982905982905984, + "grad_norm": 0.7650768756866455, + "learning_rate": 5.484556861934349e-05, + "loss": 0.7441, + "step": 14594 + }, + { + "epoch": 2.5984686609686607, + "grad_norm": 0.8266916871070862, + "learning_rate": 5.483307983547026e-05, + "loss": 0.9625, + "step": 14595 + }, + { + "epoch": 2.5986467236467234, + "grad_norm": 0.8243923783302307, + "learning_rate": 5.482059193654894e-05, + "loss": 0.8553, + "step": 14596 + }, + { + "epoch": 2.598824786324786, + "grad_norm": 0.8200470209121704, + "learning_rate": 5.48081049228243e-05, + "loss": 0.6682, + "step": 14597 + }, + { + "epoch": 2.599002849002849, + "grad_norm": 0.8360442519187927, + "learning_rate": 5.479561879454097e-05, + "loss": 0.8996, + "step": 14598 + }, + { + "epoch": 2.5991809116809117, + "grad_norm": 0.8326625227928162, + "learning_rate": 5.4783133551943546e-05, + "loss": 0.6532, + "step": 14599 + }, + { + "epoch": 2.5993589743589745, + "grad_norm": 0.8162251114845276, + "learning_rate": 
5.4770649195276766e-05, + "loss": 1.0514, + "step": 14600 + }, + { + "epoch": 2.599537037037037, + "grad_norm": 1.0407251119613647, + "learning_rate": 5.4758165724785084e-05, + "loss": 0.7991, + "step": 14601 + }, + { + "epoch": 2.5997150997151, + "grad_norm": 0.9161550998687744, + "learning_rate": 5.474568314071323e-05, + "loss": 0.8623, + "step": 14602 + }, + { + "epoch": 2.5998931623931623, + "grad_norm": 0.8405734896659851, + "learning_rate": 5.4733201443305646e-05, + "loss": 0.8406, + "step": 14603 + }, + { + "epoch": 2.600071225071225, + "grad_norm": 0.937198281288147, + "learning_rate": 5.472072063280698e-05, + "loss": 1.0887, + "step": 14604 + }, + { + "epoch": 2.6002492877492878, + "grad_norm": 0.8800520896911621, + "learning_rate": 5.470824070946172e-05, + "loss": 0.8738, + "step": 14605 + }, + { + "epoch": 2.6004273504273505, + "grad_norm": 0.9473027586936951, + "learning_rate": 5.4695761673514425e-05, + "loss": 0.8188, + "step": 14606 + }, + { + "epoch": 2.6006054131054133, + "grad_norm": 0.8547683954238892, + "learning_rate": 5.468328352520955e-05, + "loss": 0.7619, + "step": 14607 + }, + { + "epoch": 2.6007834757834756, + "grad_norm": 1.0138040781021118, + "learning_rate": 5.4670806264791595e-05, + "loss": 1.0805, + "step": 14608 + }, + { + "epoch": 2.6009615384615383, + "grad_norm": 0.8458215594291687, + "learning_rate": 5.465832989250499e-05, + "loss": 0.8386, + "step": 14609 + }, + { + "epoch": 2.601139601139601, + "grad_norm": 0.811152458190918, + "learning_rate": 5.464585440859431e-05, + "loss": 0.8158, + "step": 14610 + }, + { + "epoch": 2.601317663817664, + "grad_norm": 0.9584031701087952, + "learning_rate": 5.463337981330381e-05, + "loss": 0.8537, + "step": 14611 + }, + { + "epoch": 2.6014957264957266, + "grad_norm": 0.8734773397445679, + "learning_rate": 5.462090610687802e-05, + "loss": 1.0246, + "step": 14612 + }, + { + "epoch": 2.6016737891737893, + "grad_norm": 0.8463562726974487, + "learning_rate": 5.460843328956133e-05, + "loss": 0.8763, 
+ "step": 14613 + }, + { + "epoch": 2.601851851851852, + "grad_norm": 0.8010903000831604, + "learning_rate": 5.459596136159808e-05, + "loss": 0.8438, + "step": 14614 + }, + { + "epoch": 2.6020299145299144, + "grad_norm": 0.7927500009536743, + "learning_rate": 5.458349032323267e-05, + "loss": 0.7388, + "step": 14615 + }, + { + "epoch": 2.602207977207977, + "grad_norm": 0.784017026424408, + "learning_rate": 5.4571020174709407e-05, + "loss": 0.6981, + "step": 14616 + }, + { + "epoch": 2.60238603988604, + "grad_norm": 0.8732004761695862, + "learning_rate": 5.455855091627263e-05, + "loss": 1.043, + "step": 14617 + }, + { + "epoch": 2.6025641025641026, + "grad_norm": 0.7947654128074646, + "learning_rate": 5.454608254816662e-05, + "loss": 0.9487, + "step": 14618 + }, + { + "epoch": 2.6027421652421654, + "grad_norm": 0.8809077739715576, + "learning_rate": 5.4533615070635734e-05, + "loss": 0.8499, + "step": 14619 + }, + { + "epoch": 2.6029202279202277, + "grad_norm": 0.9094803333282471, + "learning_rate": 5.452114848392422e-05, + "loss": 0.9522, + "step": 14620 + }, + { + "epoch": 2.6030982905982905, + "grad_norm": 0.8943446278572083, + "learning_rate": 5.4508682788276324e-05, + "loss": 0.7328, + "step": 14621 + }, + { + "epoch": 2.603276353276353, + "grad_norm": 0.856849730014801, + "learning_rate": 5.449621798393628e-05, + "loss": 0.7536, + "step": 14622 + }, + { + "epoch": 2.603454415954416, + "grad_norm": 0.8199608325958252, + "learning_rate": 5.448375407114833e-05, + "loss": 0.6377, + "step": 14623 + }, + { + "epoch": 2.6036324786324787, + "grad_norm": 0.8981915712356567, + "learning_rate": 5.4471291050156626e-05, + "loss": 1.0372, + "step": 14624 + }, + { + "epoch": 2.6038105413105415, + "grad_norm": 0.8449446558952332, + "learning_rate": 5.4458828921205465e-05, + "loss": 0.9948, + "step": 14625 + }, + { + "epoch": 2.603988603988604, + "grad_norm": 0.8807474970817566, + "learning_rate": 5.444636768453888e-05, + "loss": 0.9752, + "step": 14626 + }, + { + "epoch": 
2.6041666666666665, + "grad_norm": 0.8212316036224365, + "learning_rate": 5.443390734040117e-05, + "loss": 0.9221, + "step": 14627 + }, + { + "epoch": 2.6043447293447293, + "grad_norm": 0.8049453496932983, + "learning_rate": 5.4421447889036304e-05, + "loss": 0.7726, + "step": 14628 + }, + { + "epoch": 2.604522792022792, + "grad_norm": 0.8091840744018555, + "learning_rate": 5.440898933068853e-05, + "loss": 0.9152, + "step": 14629 + }, + { + "epoch": 2.6047008547008548, + "grad_norm": 0.8409022688865662, + "learning_rate": 5.43965316656019e-05, + "loss": 0.8672, + "step": 14630 + }, + { + "epoch": 2.6048789173789175, + "grad_norm": 0.7622308731079102, + "learning_rate": 5.4384074894020496e-05, + "loss": 0.9021, + "step": 14631 + }, + { + "epoch": 2.60505698005698, + "grad_norm": 0.8272425532341003, + "learning_rate": 5.437161901618839e-05, + "loss": 0.7729, + "step": 14632 + }, + { + "epoch": 2.6052350427350426, + "grad_norm": 0.8699020743370056, + "learning_rate": 5.435916403234963e-05, + "loss": 0.8211, + "step": 14633 + }, + { + "epoch": 2.6054131054131053, + "grad_norm": 0.8145751357078552, + "learning_rate": 5.4346709942748196e-05, + "loss": 0.8996, + "step": 14634 + }, + { + "epoch": 2.605591168091168, + "grad_norm": 0.9398832321166992, + "learning_rate": 5.433425674762822e-05, + "loss": 0.8116, + "step": 14635 + }, + { + "epoch": 2.605769230769231, + "grad_norm": 0.9191767573356628, + "learning_rate": 5.4321804447233535e-05, + "loss": 0.8933, + "step": 14636 + }, + { + "epoch": 2.6059472934472936, + "grad_norm": 0.7511529326438904, + "learning_rate": 5.430935304180831e-05, + "loss": 0.7595, + "step": 14637 + }, + { + "epoch": 2.6061253561253563, + "grad_norm": 0.9087170362472534, + "learning_rate": 5.4296902531596296e-05, + "loss": 0.9781, + "step": 14638 + }, + { + "epoch": 2.6063034188034186, + "grad_norm": 0.8496448397636414, + "learning_rate": 5.4284452916841575e-05, + "loss": 0.9852, + "step": 14639 + }, + { + "epoch": 2.6064814814814814, + "grad_norm": 
0.868609607219696, + "learning_rate": 5.427200419778804e-05, + "loss": 0.781, + "step": 14640 + }, + { + "epoch": 2.606659544159544, + "grad_norm": 0.7752132415771484, + "learning_rate": 5.4259556374679553e-05, + "loss": 0.7319, + "step": 14641 + }, + { + "epoch": 2.606837606837607, + "grad_norm": 0.8950543999671936, + "learning_rate": 5.4247109447760124e-05, + "loss": 0.7637, + "step": 14642 + }, + { + "epoch": 2.6070156695156697, + "grad_norm": 0.892699658870697, + "learning_rate": 5.423466341727346e-05, + "loss": 0.8274, + "step": 14643 + }, + { + "epoch": 2.607193732193732, + "grad_norm": 0.9283786416053772, + "learning_rate": 5.422221828346352e-05, + "loss": 1.1009, + "step": 14644 + }, + { + "epoch": 2.6073717948717947, + "grad_norm": 0.7551446557044983, + "learning_rate": 5.420977404657413e-05, + "loss": 0.8105, + "step": 14645 + }, + { + "epoch": 2.6075498575498575, + "grad_norm": 0.8014101386070251, + "learning_rate": 5.41973307068491e-05, + "loss": 0.838, + "step": 14646 + }, + { + "epoch": 2.60772792022792, + "grad_norm": 0.8941731452941895, + "learning_rate": 5.418488826453223e-05, + "loss": 0.9557, + "step": 14647 + }, + { + "epoch": 2.607905982905983, + "grad_norm": 0.7990903258323669, + "learning_rate": 5.41724467198673e-05, + "loss": 0.9634, + "step": 14648 + }, + { + "epoch": 2.6080840455840457, + "grad_norm": 1.0688040256500244, + "learning_rate": 5.4160006073098035e-05, + "loss": 0.8976, + "step": 14649 + }, + { + "epoch": 2.6082621082621085, + "grad_norm": 0.8451266884803772, + "learning_rate": 5.4147566324468313e-05, + "loss": 0.8703, + "step": 14650 + }, + { + "epoch": 2.6084401709401708, + "grad_norm": 0.8196333050727844, + "learning_rate": 5.413512747422169e-05, + "loss": 0.9423, + "step": 14651 + }, + { + "epoch": 2.6086182336182335, + "grad_norm": 0.7639298439025879, + "learning_rate": 5.412268952260204e-05, + "loss": 0.9092, + "step": 14652 + }, + { + "epoch": 2.6087962962962963, + "grad_norm": 0.88963782787323, + "learning_rate": 
5.411025246985293e-05, + "loss": 1.2503, + "step": 14653 + }, + { + "epoch": 2.608974358974359, + "grad_norm": 0.831516683101654, + "learning_rate": 5.409781631621812e-05, + "loss": 0.8643, + "step": 14654 + }, + { + "epoch": 2.609152421652422, + "grad_norm": 0.7729721069335938, + "learning_rate": 5.408538106194125e-05, + "loss": 0.8289, + "step": 14655 + }, + { + "epoch": 2.609330484330484, + "grad_norm": 0.8360101580619812, + "learning_rate": 5.407294670726596e-05, + "loss": 0.8619, + "step": 14656 + }, + { + "epoch": 2.609508547008547, + "grad_norm": 0.7525733709335327, + "learning_rate": 5.406051325243586e-05, + "loss": 0.8353, + "step": 14657 + }, + { + "epoch": 2.6096866096866096, + "grad_norm": 0.8943357467651367, + "learning_rate": 5.404808069769456e-05, + "loss": 0.9291, + "step": 14658 + }, + { + "epoch": 2.6098646723646723, + "grad_norm": 1.024953007698059, + "learning_rate": 5.403564904328568e-05, + "loss": 1.0414, + "step": 14659 + }, + { + "epoch": 2.610042735042735, + "grad_norm": 0.8671780228614807, + "learning_rate": 5.402321828945278e-05, + "loss": 0.9309, + "step": 14660 + }, + { + "epoch": 2.610220797720798, + "grad_norm": 0.9765334725379944, + "learning_rate": 5.4010788436439406e-05, + "loss": 0.9399, + "step": 14661 + }, + { + "epoch": 2.6103988603988606, + "grad_norm": 0.8996732234954834, + "learning_rate": 5.3998359484489106e-05, + "loss": 0.9868, + "step": 14662 + }, + { + "epoch": 2.6105769230769234, + "grad_norm": 0.8597404956817627, + "learning_rate": 5.398593143384538e-05, + "loss": 1.0328, + "step": 14663 + }, + { + "epoch": 2.6107549857549857, + "grad_norm": 0.8909318447113037, + "learning_rate": 5.397350428475176e-05, + "loss": 0.9362, + "step": 14664 + }, + { + "epoch": 2.6109330484330484, + "grad_norm": 0.8874006867408752, + "learning_rate": 5.39610780374517e-05, + "loss": 0.9254, + "step": 14665 + }, + { + "epoch": 2.611111111111111, + "grad_norm": 0.8325822949409485, + "learning_rate": 5.3948652692188626e-05, + "loss": 0.8495, + 
"step": 14666 + }, + { + "epoch": 2.611289173789174, + "grad_norm": 0.847998857498169, + "learning_rate": 5.393622824920614e-05, + "loss": 0.8372, + "step": 14667 + }, + { + "epoch": 2.611467236467236, + "grad_norm": 0.8439756631851196, + "learning_rate": 5.392380470874749e-05, + "loss": 0.8934, + "step": 14668 + }, + { + "epoch": 2.611645299145299, + "grad_norm": 0.9563834071159363, + "learning_rate": 5.39113820710562e-05, + "loss": 1.1213, + "step": 14669 + }, + { + "epoch": 2.6118233618233617, + "grad_norm": 0.7761119604110718, + "learning_rate": 5.3898960336375646e-05, + "loss": 0.6104, + "step": 14670 + }, + { + "epoch": 2.6120014245014245, + "grad_norm": 0.8661524653434753, + "learning_rate": 5.38865395049492e-05, + "loss": 0.7562, + "step": 14671 + }, + { + "epoch": 2.6121794871794872, + "grad_norm": 0.854347825050354, + "learning_rate": 5.387411957702021e-05, + "loss": 0.8613, + "step": 14672 + }, + { + "epoch": 2.61235754985755, + "grad_norm": 0.7728402614593506, + "learning_rate": 5.386170055283204e-05, + "loss": 0.7879, + "step": 14673 + }, + { + "epoch": 2.6125356125356127, + "grad_norm": 0.8647109270095825, + "learning_rate": 5.384928243262799e-05, + "loss": 0.835, + "step": 14674 + }, + { + "epoch": 2.6127136752136755, + "grad_norm": 0.6764749884605408, + "learning_rate": 5.383686521665139e-05, + "loss": 0.7233, + "step": 14675 + }, + { + "epoch": 2.612891737891738, + "grad_norm": 0.8431640863418579, + "learning_rate": 5.382444890514548e-05, + "loss": 1.1699, + "step": 14676 + }, + { + "epoch": 2.6130698005698005, + "grad_norm": 0.9196193814277649, + "learning_rate": 5.381203349835364e-05, + "loss": 0.8668, + "step": 14677 + }, + { + "epoch": 2.6132478632478633, + "grad_norm": 0.9449048638343811, + "learning_rate": 5.3799618996519e-05, + "loss": 0.8353, + "step": 14678 + }, + { + "epoch": 2.613425925925926, + "grad_norm": 0.9835928678512573, + "learning_rate": 5.378720539988488e-05, + "loss": 1.1129, + "step": 14679 + }, + { + "epoch": 
2.6136039886039883, + "grad_norm": 0.763592004776001, + "learning_rate": 5.377479270869448e-05, + "loss": 0.7929, + "step": 14680 + }, + { + "epoch": 2.613782051282051, + "grad_norm": 0.8119748830795288, + "learning_rate": 5.376238092319094e-05, + "loss": 1.0257, + "step": 14681 + }, + { + "epoch": 2.613960113960114, + "grad_norm": 0.7605236172676086, + "learning_rate": 5.374997004361757e-05, + "loss": 0.7005, + "step": 14682 + }, + { + "epoch": 2.6141381766381766, + "grad_norm": 0.9077369570732117, + "learning_rate": 5.3737560070217394e-05, + "loss": 0.9208, + "step": 14683 + }, + { + "epoch": 2.6143162393162394, + "grad_norm": 0.9089310765266418, + "learning_rate": 5.3725151003233665e-05, + "loss": 0.6855, + "step": 14684 + }, + { + "epoch": 2.614494301994302, + "grad_norm": 0.8387685418128967, + "learning_rate": 5.371274284290947e-05, + "loss": 0.8682, + "step": 14685 + }, + { + "epoch": 2.614672364672365, + "grad_norm": 0.7626301050186157, + "learning_rate": 5.3700335589487925e-05, + "loss": 0.6928, + "step": 14686 + }, + { + "epoch": 2.6148504273504276, + "grad_norm": 1.2667319774627686, + "learning_rate": 5.368792924321213e-05, + "loss": 0.9288, + "step": 14687 + }, + { + "epoch": 2.61502849002849, + "grad_norm": 0.8570333123207092, + "learning_rate": 5.3675523804325154e-05, + "loss": 0.9916, + "step": 14688 + }, + { + "epoch": 2.6152065527065527, + "grad_norm": 0.9050240516662598, + "learning_rate": 5.366311927307006e-05, + "loss": 0.7734, + "step": 14689 + }, + { + "epoch": 2.6153846153846154, + "grad_norm": 1.000036358833313, + "learning_rate": 5.365071564968989e-05, + "loss": 0.7932, + "step": 14690 + }, + { + "epoch": 2.615562678062678, + "grad_norm": 0.8147441744804382, + "learning_rate": 5.363831293442763e-05, + "loss": 0.8867, + "step": 14691 + }, + { + "epoch": 2.6157407407407405, + "grad_norm": 0.8662015795707703, + "learning_rate": 5.3625911127526375e-05, + "loss": 0.6742, + "step": 14692 + }, + { + "epoch": 2.6159188034188032, + "grad_norm": 
0.8576271533966064, + "learning_rate": 5.3613510229229e-05, + "loss": 0.8161, + "step": 14693 + }, + { + "epoch": 2.616096866096866, + "grad_norm": 0.8862481713294983, + "learning_rate": 5.360111023977856e-05, + "loss": 0.8774, + "step": 14694 + }, + { + "epoch": 2.6162749287749287, + "grad_norm": 0.8384450674057007, + "learning_rate": 5.358871115941799e-05, + "loss": 0.9149, + "step": 14695 + }, + { + "epoch": 2.6164529914529915, + "grad_norm": 0.9055412411689758, + "learning_rate": 5.357631298839021e-05, + "loss": 0.8197, + "step": 14696 + }, + { + "epoch": 2.6166310541310542, + "grad_norm": 0.937764585018158, + "learning_rate": 5.356391572693813e-05, + "loss": 1.0392, + "step": 14697 + }, + { + "epoch": 2.616809116809117, + "grad_norm": 0.8917306661605835, + "learning_rate": 5.355151937530463e-05, + "loss": 0.868, + "step": 14698 + }, + { + "epoch": 2.6169871794871797, + "grad_norm": 0.7353024482727051, + "learning_rate": 5.3539123933732705e-05, + "loss": 0.7788, + "step": 14699 + }, + { + "epoch": 2.617165242165242, + "grad_norm": 0.8607454299926758, + "learning_rate": 5.352672940246504e-05, + "loss": 0.7746, + "step": 14700 + }, + { + "epoch": 2.617343304843305, + "grad_norm": 0.9775658249855042, + "learning_rate": 5.3514335781744616e-05, + "loss": 0.9438, + "step": 14701 + }, + { + "epoch": 2.6175213675213675, + "grad_norm": 0.9416237473487854, + "learning_rate": 5.350194307181422e-05, + "loss": 0.9581, + "step": 14702 + }, + { + "epoch": 2.6176994301994303, + "grad_norm": 0.8378105163574219, + "learning_rate": 5.348955127291666e-05, + "loss": 1.0038, + "step": 14703 + }, + { + "epoch": 2.6178774928774926, + "grad_norm": 0.8199161887168884, + "learning_rate": 5.347716038529471e-05, + "loss": 0.9492, + "step": 14704 + }, + { + "epoch": 2.6180555555555554, + "grad_norm": 0.9511042833328247, + "learning_rate": 5.3464770409191176e-05, + "loss": 1.2101, + "step": 14705 + }, + { + "epoch": 2.618233618233618, + "grad_norm": 0.8017105460166931, + "learning_rate": 
5.3452381344848754e-05, + "loss": 0.9524, + "step": 14706 + }, + { + "epoch": 2.618411680911681, + "grad_norm": 0.8174898624420166, + "learning_rate": 5.34399931925103e-05, + "loss": 0.911, + "step": 14707 + }, + { + "epoch": 2.6185897435897436, + "grad_norm": 0.8134239315986633, + "learning_rate": 5.342760595241838e-05, + "loss": 0.8971, + "step": 14708 + }, + { + "epoch": 2.6187678062678064, + "grad_norm": 0.817252516746521, + "learning_rate": 5.341521962481586e-05, + "loss": 0.8472, + "step": 14709 + }, + { + "epoch": 2.618945868945869, + "grad_norm": 0.8675270080566406, + "learning_rate": 5.3402834209945264e-05, + "loss": 0.9607, + "step": 14710 + }, + { + "epoch": 2.619123931623932, + "grad_norm": 1.0281410217285156, + "learning_rate": 5.339044970804936e-05, + "loss": 1.0487, + "step": 14711 + }, + { + "epoch": 2.619301994301994, + "grad_norm": 0.9276307225227356, + "learning_rate": 5.33780661193708e-05, + "loss": 0.8915, + "step": 14712 + }, + { + "epoch": 2.619480056980057, + "grad_norm": 0.8479217290878296, + "learning_rate": 5.336568344415216e-05, + "loss": 0.929, + "step": 14713 + }, + { + "epoch": 2.6196581196581197, + "grad_norm": 0.8695724010467529, + "learning_rate": 5.335330168263608e-05, + "loss": 0.8651, + "step": 14714 + }, + { + "epoch": 2.6198361823361824, + "grad_norm": 0.7740936875343323, + "learning_rate": 5.3340920835065155e-05, + "loss": 0.8572, + "step": 14715 + }, + { + "epoch": 2.620014245014245, + "grad_norm": 0.8619815111160278, + "learning_rate": 5.332854090168192e-05, + "loss": 0.6934, + "step": 14716 + }, + { + "epoch": 2.6201923076923075, + "grad_norm": 0.8866271376609802, + "learning_rate": 5.331616188272902e-05, + "loss": 1.038, + "step": 14717 + }, + { + "epoch": 2.6203703703703702, + "grad_norm": 0.7526047825813293, + "learning_rate": 5.330378377844896e-05, + "loss": 0.8534, + "step": 14718 + }, + { + "epoch": 2.620548433048433, + "grad_norm": 0.6914070248603821, + "learning_rate": 5.329140658908423e-05, + "loss": 0.5355, + 
"step": 14719 + }, + { + "epoch": 2.6207264957264957, + "grad_norm": 0.886074423789978, + "learning_rate": 5.3279030314877374e-05, + "loss": 0.8277, + "step": 14720 + }, + { + "epoch": 2.6209045584045585, + "grad_norm": 0.9101460576057434, + "learning_rate": 5.326665495607082e-05, + "loss": 0.8711, + "step": 14721 + }, + { + "epoch": 2.6210826210826212, + "grad_norm": 0.9744461178779602, + "learning_rate": 5.3254280512907175e-05, + "loss": 1.2376, + "step": 14722 + }, + { + "epoch": 2.621260683760684, + "grad_norm": 1.013480544090271, + "learning_rate": 5.32419069856287e-05, + "loss": 0.8946, + "step": 14723 + }, + { + "epoch": 2.6214387464387463, + "grad_norm": 0.82442706823349, + "learning_rate": 5.3229534374478005e-05, + "loss": 0.732, + "step": 14724 + }, + { + "epoch": 2.621616809116809, + "grad_norm": 0.7960239052772522, + "learning_rate": 5.3217162679697366e-05, + "loss": 0.7633, + "step": 14725 + }, + { + "epoch": 2.621794871794872, + "grad_norm": 0.819844126701355, + "learning_rate": 5.320479190152926e-05, + "loss": 0.974, + "step": 14726 + }, + { + "epoch": 2.6219729344729346, + "grad_norm": 0.8245221376419067, + "learning_rate": 5.319242204021606e-05, + "loss": 0.9122, + "step": 14727 + }, + { + "epoch": 2.6221509971509973, + "grad_norm": 0.7574561834335327, + "learning_rate": 5.318005309600011e-05, + "loss": 0.8427, + "step": 14728 + }, + { + "epoch": 2.6223290598290596, + "grad_norm": 1.0385704040527344, + "learning_rate": 5.316768506912377e-05, + "loss": 0.8214, + "step": 14729 + }, + { + "epoch": 2.6225071225071224, + "grad_norm": 0.8616722822189331, + "learning_rate": 5.3155317959829346e-05, + "loss": 0.8469, + "step": 14730 + }, + { + "epoch": 2.622685185185185, + "grad_norm": 0.909667432308197, + "learning_rate": 5.314295176835912e-05, + "loss": 0.9156, + "step": 14731 + }, + { + "epoch": 2.622863247863248, + "grad_norm": 0.9016293883323669, + "learning_rate": 5.3130586494955494e-05, + "loss": 0.9183, + "step": 14732 + }, + { + "epoch": 
2.6230413105413106, + "grad_norm": 0.8828284740447998, + "learning_rate": 5.311822213986057e-05, + "loss": 0.8338, + "step": 14733 + }, + { + "epoch": 2.6232193732193734, + "grad_norm": 0.8159047365188599, + "learning_rate": 5.3105858703316794e-05, + "loss": 0.7055, + "step": 14734 + }, + { + "epoch": 2.623397435897436, + "grad_norm": 0.9240905046463013, + "learning_rate": 5.309349618556623e-05, + "loss": 0.9078, + "step": 14735 + }, + { + "epoch": 2.6235754985754984, + "grad_norm": 0.8881595134735107, + "learning_rate": 5.308113458685118e-05, + "loss": 0.9946, + "step": 14736 + }, + { + "epoch": 2.623753561253561, + "grad_norm": 0.8781841397285461, + "learning_rate": 5.306877390741385e-05, + "loss": 0.8252, + "step": 14737 + }, + { + "epoch": 2.623931623931624, + "grad_norm": 0.8348106741905212, + "learning_rate": 5.3056414147496355e-05, + "loss": 0.8653, + "step": 14738 + }, + { + "epoch": 2.6241096866096867, + "grad_norm": 0.9692304134368896, + "learning_rate": 5.3044055307341e-05, + "loss": 0.7814, + "step": 14739 + }, + { + "epoch": 2.6242877492877494, + "grad_norm": 0.866179347038269, + "learning_rate": 5.303169738718976e-05, + "loss": 0.9255, + "step": 14740 + }, + { + "epoch": 2.6244658119658117, + "grad_norm": 0.9306690692901611, + "learning_rate": 5.301934038728487e-05, + "loss": 0.9123, + "step": 14741 + }, + { + "epoch": 2.6246438746438745, + "grad_norm": 0.949357271194458, + "learning_rate": 5.3006984307868415e-05, + "loss": 0.8452, + "step": 14742 + }, + { + "epoch": 2.6248219373219372, + "grad_norm": 0.8638128042221069, + "learning_rate": 5.299462914918249e-05, + "loss": 0.8026, + "step": 14743 + }, + { + "epoch": 2.625, + "grad_norm": 0.9075117707252502, + "learning_rate": 5.2982274911469154e-05, + "loss": 1.0644, + "step": 14744 + }, + { + "epoch": 2.6251780626780628, + "grad_norm": 0.8146225810050964, + "learning_rate": 5.296992159497047e-05, + "loss": 0.8494, + "step": 14745 + }, + { + "epoch": 2.6253561253561255, + "grad_norm": 
0.8887025713920593, + "learning_rate": 5.295756919992847e-05, + "loss": 0.8143, + "step": 14746 + }, + { + "epoch": 2.6255341880341883, + "grad_norm": 0.8262654542922974, + "learning_rate": 5.29452177265852e-05, + "loss": 0.7559, + "step": 14747 + }, + { + "epoch": 2.6257122507122506, + "grad_norm": 0.8126912117004395, + "learning_rate": 5.2932867175182574e-05, + "loss": 0.8528, + "step": 14748 + }, + { + "epoch": 2.6258903133903133, + "grad_norm": 0.8970595598220825, + "learning_rate": 5.2920517545962746e-05, + "loss": 0.8584, + "step": 14749 + }, + { + "epoch": 2.626068376068376, + "grad_norm": 0.8678651452064514, + "learning_rate": 5.290816883916748e-05, + "loss": 0.8686, + "step": 14750 + }, + { + "epoch": 2.626246438746439, + "grad_norm": 0.8069576621055603, + "learning_rate": 5.289582105503887e-05, + "loss": 0.868, + "step": 14751 + }, + { + "epoch": 2.6264245014245016, + "grad_norm": 1.0322144031524658, + "learning_rate": 5.28834741938188e-05, + "loss": 1.1537, + "step": 14752 + }, + { + "epoch": 2.626602564102564, + "grad_norm": 0.8274349570274353, + "learning_rate": 5.287112825574917e-05, + "loss": 1.0126, + "step": 14753 + }, + { + "epoch": 2.6267806267806266, + "grad_norm": 0.8820709586143494, + "learning_rate": 5.2858783241071875e-05, + "loss": 0.893, + "step": 14754 + }, + { + "epoch": 2.6269586894586894, + "grad_norm": 1.0102146863937378, + "learning_rate": 5.28464391500288e-05, + "loss": 0.8524, + "step": 14755 + }, + { + "epoch": 2.627136752136752, + "grad_norm": 0.875468373298645, + "learning_rate": 5.2834095982861764e-05, + "loss": 1.0991, + "step": 14756 + }, + { + "epoch": 2.627314814814815, + "grad_norm": 0.8155242800712585, + "learning_rate": 5.282175373981267e-05, + "loss": 0.666, + "step": 14757 + }, + { + "epoch": 2.6274928774928776, + "grad_norm": 0.8777057528495789, + "learning_rate": 5.280941242112332e-05, + "loss": 0.892, + "step": 14758 + }, + { + "epoch": 2.6276709401709404, + "grad_norm": 0.8357667922973633, + "learning_rate": 
5.279707202703549e-05, + "loss": 0.8118, + "step": 14759 + }, + { + "epoch": 2.6278490028490027, + "grad_norm": 0.7862337827682495, + "learning_rate": 5.278473255779097e-05, + "loss": 0.7287, + "step": 14760 + }, + { + "epoch": 2.6280270655270654, + "grad_norm": 0.8340336084365845, + "learning_rate": 5.277239401363155e-05, + "loss": 0.7697, + "step": 14761 + }, + { + "epoch": 2.628205128205128, + "grad_norm": 0.7986457943916321, + "learning_rate": 5.276005639479896e-05, + "loss": 0.9358, + "step": 14762 + }, + { + "epoch": 2.628383190883191, + "grad_norm": 0.7377769947052002, + "learning_rate": 5.2747719701534895e-05, + "loss": 0.8091, + "step": 14763 + }, + { + "epoch": 2.6285612535612537, + "grad_norm": 0.9749723672866821, + "learning_rate": 5.273538393408117e-05, + "loss": 0.8163, + "step": 14764 + }, + { + "epoch": 2.628739316239316, + "grad_norm": 0.8718321323394775, + "learning_rate": 5.2723049092679354e-05, + "loss": 1.1587, + "step": 14765 + }, + { + "epoch": 2.6289173789173788, + "grad_norm": 0.9394767880439758, + "learning_rate": 5.27107151775712e-05, + "loss": 0.9409, + "step": 14766 + }, + { + "epoch": 2.6290954415954415, + "grad_norm": 0.9763813614845276, + "learning_rate": 5.269838218899836e-05, + "loss": 1.0171, + "step": 14767 + }, + { + "epoch": 2.6292735042735043, + "grad_norm": 0.878968358039856, + "learning_rate": 5.268605012720247e-05, + "loss": 0.9117, + "step": 14768 + }, + { + "epoch": 2.629451566951567, + "grad_norm": 0.8240547776222229, + "learning_rate": 5.267371899242512e-05, + "loss": 0.9351, + "step": 14769 + }, + { + "epoch": 2.6296296296296298, + "grad_norm": 0.8048275709152222, + "learning_rate": 5.266138878490795e-05, + "loss": 0.9331, + "step": 14770 + }, + { + "epoch": 2.6298076923076925, + "grad_norm": 0.7176041007041931, + "learning_rate": 5.264905950489252e-05, + "loss": 0.6424, + "step": 14771 + }, + { + "epoch": 2.629985754985755, + "grad_norm": 0.973258912563324, + "learning_rate": 5.263673115262041e-05, + "loss": 0.9295, + 
"step": 14772 + }, + { + "epoch": 2.6301638176638176, + "grad_norm": 0.8955824971199036, + "learning_rate": 5.262440372833313e-05, + "loss": 0.9306, + "step": 14773 + }, + { + "epoch": 2.6303418803418803, + "grad_norm": 0.8430632948875427, + "learning_rate": 5.2612077232272305e-05, + "loss": 0.9343, + "step": 14774 + }, + { + "epoch": 2.630519943019943, + "grad_norm": 1.0231794118881226, + "learning_rate": 5.2599751664679334e-05, + "loss": 0.941, + "step": 14775 + }, + { + "epoch": 2.630698005698006, + "grad_norm": 0.9726024866104126, + "learning_rate": 5.258742702579579e-05, + "loss": 1.1726, + "step": 14776 + }, + { + "epoch": 2.630876068376068, + "grad_norm": 0.8575723171234131, + "learning_rate": 5.257510331586312e-05, + "loss": 0.5644, + "step": 14777 + }, + { + "epoch": 2.631054131054131, + "grad_norm": 0.853165864944458, + "learning_rate": 5.2562780535122744e-05, + "loss": 0.8555, + "step": 14778 + }, + { + "epoch": 2.6312321937321936, + "grad_norm": 0.861574649810791, + "learning_rate": 5.255045868381623e-05, + "loss": 0.8298, + "step": 14779 + }, + { + "epoch": 2.6314102564102564, + "grad_norm": 0.8744526505470276, + "learning_rate": 5.2538137762184816e-05, + "loss": 0.9889, + "step": 14780 + }, + { + "epoch": 2.631588319088319, + "grad_norm": 0.7891412973403931, + "learning_rate": 5.2525817770470084e-05, + "loss": 0.9765, + "step": 14781 + }, + { + "epoch": 2.631766381766382, + "grad_norm": 0.9155156016349792, + "learning_rate": 5.251349870891327e-05, + "loss": 0.8927, + "step": 14782 + }, + { + "epoch": 2.6319444444444446, + "grad_norm": 0.8547508120536804, + "learning_rate": 5.250118057775582e-05, + "loss": 0.8479, + "step": 14783 + }, + { + "epoch": 2.6321225071225074, + "grad_norm": 0.7606263756752014, + "learning_rate": 5.248886337723908e-05, + "loss": 0.7557, + "step": 14784 + }, + { + "epoch": 2.6323005698005697, + "grad_norm": 0.855315625667572, + "learning_rate": 5.247654710760437e-05, + "loss": 0.8527, + "step": 14785 + }, + { + "epoch": 
2.6324786324786325, + "grad_norm": 0.7656288743019104, + "learning_rate": 5.246423176909298e-05, + "loss": 0.8881, + "step": 14786 + }, + { + "epoch": 2.632656695156695, + "grad_norm": 0.817034125328064, + "learning_rate": 5.2451917361946236e-05, + "loss": 1.042, + "step": 14787 + }, + { + "epoch": 2.632834757834758, + "grad_norm": 0.8473303318023682, + "learning_rate": 5.2439603886405356e-05, + "loss": 0.8804, + "step": 14788 + }, + { + "epoch": 2.6330128205128203, + "grad_norm": 0.9563126564025879, + "learning_rate": 5.242729134271171e-05, + "loss": 0.8463, + "step": 14789 + }, + { + "epoch": 2.633190883190883, + "grad_norm": 0.8297066688537598, + "learning_rate": 5.241497973110641e-05, + "loss": 0.7776, + "step": 14790 + }, + { + "epoch": 2.6333689458689458, + "grad_norm": 0.8433563709259033, + "learning_rate": 5.240266905183075e-05, + "loss": 0.8712, + "step": 14791 + }, + { + "epoch": 2.6335470085470085, + "grad_norm": 0.814725935459137, + "learning_rate": 5.239035930512593e-05, + "loss": 0.9819, + "step": 14792 + }, + { + "epoch": 2.6337250712250713, + "grad_norm": 0.844292163848877, + "learning_rate": 5.23780504912331e-05, + "loss": 0.8693, + "step": 14793 + }, + { + "epoch": 2.633903133903134, + "grad_norm": 0.8194862008094788, + "learning_rate": 5.2365742610393464e-05, + "loss": 0.7878, + "step": 14794 + }, + { + "epoch": 2.6340811965811968, + "grad_norm": 0.8570502400398254, + "learning_rate": 5.2353435662848135e-05, + "loss": 0.815, + "step": 14795 + }, + { + "epoch": 2.6342592592592595, + "grad_norm": 0.9301772713661194, + "learning_rate": 5.2341129648838275e-05, + "loss": 0.9092, + "step": 14796 + }, + { + "epoch": 2.634437321937322, + "grad_norm": 0.7605858445167542, + "learning_rate": 5.232882456860493e-05, + "loss": 0.8753, + "step": 14797 + }, + { + "epoch": 2.6346153846153846, + "grad_norm": 0.8265452980995178, + "learning_rate": 5.231652042238927e-05, + "loss": 0.9134, + "step": 14798 + }, + { + "epoch": 2.6347934472934473, + "grad_norm": 
0.7440468072891235, + "learning_rate": 5.230421721043235e-05, + "loss": 0.7471, + "step": 14799 + }, + { + "epoch": 2.63497150997151, + "grad_norm": 0.9172230958938599, + "learning_rate": 5.2291914932975205e-05, + "loss": 1.0155, + "step": 14800 + }, + { + "epoch": 2.6351495726495724, + "grad_norm": 0.8364499807357788, + "learning_rate": 5.227961359025888e-05, + "loss": 0.9561, + "step": 14801 + }, + { + "epoch": 2.635327635327635, + "grad_norm": 0.7756382822990417, + "learning_rate": 5.22673131825244e-05, + "loss": 0.6893, + "step": 14802 + }, + { + "epoch": 2.635505698005698, + "grad_norm": 0.9042136669158936, + "learning_rate": 5.225501371001273e-05, + "loss": 0.7613, + "step": 14803 + }, + { + "epoch": 2.6356837606837606, + "grad_norm": 0.8989379405975342, + "learning_rate": 5.224271517296495e-05, + "loss": 0.8092, + "step": 14804 + }, + { + "epoch": 2.6358618233618234, + "grad_norm": 0.7999827265739441, + "learning_rate": 5.2230417571621906e-05, + "loss": 0.8115, + "step": 14805 + }, + { + "epoch": 2.636039886039886, + "grad_norm": 0.9071131348609924, + "learning_rate": 5.221812090622464e-05, + "loss": 0.9072, + "step": 14806 + }, + { + "epoch": 2.636217948717949, + "grad_norm": 0.7227704524993896, + "learning_rate": 5.220582517701398e-05, + "loss": 0.7598, + "step": 14807 + }, + { + "epoch": 2.6363960113960117, + "grad_norm": 0.8520537614822388, + "learning_rate": 5.219353038423094e-05, + "loss": 1.1072, + "step": 14808 + }, + { + "epoch": 2.636574074074074, + "grad_norm": 0.8690574765205383, + "learning_rate": 5.218123652811634e-05, + "loss": 0.773, + "step": 14809 + }, + { + "epoch": 2.6367521367521367, + "grad_norm": 0.7897602319717407, + "learning_rate": 5.216894360891109e-05, + "loss": 0.792, + "step": 14810 + }, + { + "epoch": 2.6369301994301995, + "grad_norm": 0.8746532201766968, + "learning_rate": 5.215665162685601e-05, + "loss": 0.8853, + "step": 14811 + }, + { + "epoch": 2.637108262108262, + "grad_norm": 0.8525128364562988, + "learning_rate": 
5.214436058219199e-05, + "loss": 0.7293, + "step": 14812 + }, + { + "epoch": 2.6372863247863245, + "grad_norm": 0.979969322681427, + "learning_rate": 5.213207047515975e-05, + "loss": 0.8485, + "step": 14813 + }, + { + "epoch": 2.6374643874643873, + "grad_norm": 0.8439529538154602, + "learning_rate": 5.211978130600024e-05, + "loss": 0.7492, + "step": 14814 + }, + { + "epoch": 2.63764245014245, + "grad_norm": 0.8356610536575317, + "learning_rate": 5.2107493074954064e-05, + "loss": 0.8255, + "step": 14815 + }, + { + "epoch": 2.6378205128205128, + "grad_norm": 0.7857736349105835, + "learning_rate": 5.2095205782262116e-05, + "loss": 0.766, + "step": 14816 + }, + { + "epoch": 2.6379985754985755, + "grad_norm": 0.919058084487915, + "learning_rate": 5.20829194281651e-05, + "loss": 1.0661, + "step": 14817 + }, + { + "epoch": 2.6381766381766383, + "grad_norm": 0.8793047070503235, + "learning_rate": 5.207063401290373e-05, + "loss": 0.8297, + "step": 14818 + }, + { + "epoch": 2.638354700854701, + "grad_norm": 0.7848390340805054, + "learning_rate": 5.205834953671873e-05, + "loss": 0.8051, + "step": 14819 + }, + { + "epoch": 2.638532763532764, + "grad_norm": 0.8391907215118408, + "learning_rate": 5.2046065999850736e-05, + "loss": 0.8444, + "step": 14820 + }, + { + "epoch": 2.638710826210826, + "grad_norm": 0.8137226700782776, + "learning_rate": 5.2033783402540546e-05, + "loss": 0.7908, + "step": 14821 + }, + { + "epoch": 2.638888888888889, + "grad_norm": 0.8440108299255371, + "learning_rate": 5.2021501745028645e-05, + "loss": 0.7985, + "step": 14822 + }, + { + "epoch": 2.6390669515669516, + "grad_norm": 0.7432600855827332, + "learning_rate": 5.200922102755581e-05, + "loss": 0.7816, + "step": 14823 + }, + { + "epoch": 2.6392450142450143, + "grad_norm": 0.9003379344940186, + "learning_rate": 5.199694125036257e-05, + "loss": 0.9171, + "step": 14824 + }, + { + "epoch": 2.6394230769230766, + "grad_norm": 0.8994988203048706, + "learning_rate": 5.198466241368957e-05, + "loss": 0.8333, 
+ "step": 14825 + }, + { + "epoch": 2.6396011396011394, + "grad_norm": 0.9042859077453613, + "learning_rate": 5.197238451777735e-05, + "loss": 0.7491, + "step": 14826 + }, + { + "epoch": 2.639779202279202, + "grad_norm": 0.8024145364761353, + "learning_rate": 5.196010756286649e-05, + "loss": 0.882, + "step": 14827 + }, + { + "epoch": 2.639957264957265, + "grad_norm": 0.73011714220047, + "learning_rate": 5.1947831549197504e-05, + "loss": 0.865, + "step": 14828 + }, + { + "epoch": 2.6401353276353277, + "grad_norm": 0.845160186290741, + "learning_rate": 5.1935556477011006e-05, + "loss": 0.8912, + "step": 14829 + }, + { + "epoch": 2.6403133903133904, + "grad_norm": 0.8264908194541931, + "learning_rate": 5.192328234654735e-05, + "loss": 0.8597, + "step": 14830 + }, + { + "epoch": 2.640491452991453, + "grad_norm": 0.8400609493255615, + "learning_rate": 5.191100915804718e-05, + "loss": 0.5906, + "step": 14831 + }, + { + "epoch": 2.640669515669516, + "grad_norm": 0.8633815050125122, + "learning_rate": 5.189873691175082e-05, + "loss": 1.031, + "step": 14832 + }, + { + "epoch": 2.640847578347578, + "grad_norm": 0.9047896862030029, + "learning_rate": 5.188646560789884e-05, + "loss": 0.6929, + "step": 14833 + }, + { + "epoch": 2.641025641025641, + "grad_norm": 0.8293144106864929, + "learning_rate": 5.18741952467316e-05, + "loss": 0.9608, + "step": 14834 + }, + { + "epoch": 2.6412037037037037, + "grad_norm": 0.7980968356132507, + "learning_rate": 5.186192582848955e-05, + "loss": 0.6021, + "step": 14835 + }, + { + "epoch": 2.6413817663817665, + "grad_norm": 0.7945372462272644, + "learning_rate": 5.184965735341305e-05, + "loss": 0.7069, + "step": 14836 + }, + { + "epoch": 2.6415598290598292, + "grad_norm": 0.8388827443122864, + "learning_rate": 5.183738982174246e-05, + "loss": 1.2404, + "step": 14837 + }, + { + "epoch": 2.6417378917378915, + "grad_norm": 0.8332177400588989, + "learning_rate": 5.18251232337182e-05, + "loss": 0.9353, + "step": 14838 + }, + { + "epoch": 
2.6419159544159543, + "grad_norm": 0.9658130407333374, + "learning_rate": 5.1812857589580565e-05, + "loss": 0.8, + "step": 14839 + }, + { + "epoch": 2.642094017094017, + "grad_norm": 0.9074252247810364, + "learning_rate": 5.180059288956991e-05, + "loss": 0.7567, + "step": 14840 + }, + { + "epoch": 2.64227207977208, + "grad_norm": 0.8543582558631897, + "learning_rate": 5.178832913392649e-05, + "loss": 0.9754, + "step": 14841 + }, + { + "epoch": 2.6424501424501425, + "grad_norm": 0.8235877156257629, + "learning_rate": 5.177606632289063e-05, + "loss": 0.825, + "step": 14842 + }, + { + "epoch": 2.6426282051282053, + "grad_norm": 0.8550012111663818, + "learning_rate": 5.1763804456702545e-05, + "loss": 1.0286, + "step": 14843 + }, + { + "epoch": 2.642806267806268, + "grad_norm": 0.8879600763320923, + "learning_rate": 5.175154353560254e-05, + "loss": 0.8935, + "step": 14844 + }, + { + "epoch": 2.6429843304843303, + "grad_norm": 0.8822683095932007, + "learning_rate": 5.1739283559830754e-05, + "loss": 0.8659, + "step": 14845 + }, + { + "epoch": 2.643162393162393, + "grad_norm": 1.0260087251663208, + "learning_rate": 5.1727024529627544e-05, + "loss": 0.8952, + "step": 14846 + }, + { + "epoch": 2.643340455840456, + "grad_norm": 0.8105470538139343, + "learning_rate": 5.171476644523292e-05, + "loss": 0.7987, + "step": 14847 + }, + { + "epoch": 2.6435185185185186, + "grad_norm": 0.8861166834831238, + "learning_rate": 5.170250930688719e-05, + "loss": 0.8476, + "step": 14848 + }, + { + "epoch": 2.6436965811965814, + "grad_norm": 0.8035899996757507, + "learning_rate": 5.169025311483047e-05, + "loss": 0.7366, + "step": 14849 + }, + { + "epoch": 2.6438746438746437, + "grad_norm": 0.8359752297401428, + "learning_rate": 5.1677997869302874e-05, + "loss": 0.8931, + "step": 14850 + }, + { + "epoch": 2.6440527065527064, + "grad_norm": 0.8483668565750122, + "learning_rate": 5.166574357054452e-05, + "loss": 0.7662, + "step": 14851 + }, + { + "epoch": 2.644230769230769, + "grad_norm": 
0.9865937829017639, + "learning_rate": 5.165349021879553e-05, + "loss": 1.101, + "step": 14852 + }, + { + "epoch": 2.644408831908832, + "grad_norm": 0.8491073250770569, + "learning_rate": 5.164123781429596e-05, + "loss": 0.9576, + "step": 14853 + }, + { + "epoch": 2.6445868945868947, + "grad_norm": 0.8185597062110901, + "learning_rate": 5.162898635728588e-05, + "loss": 0.6353, + "step": 14854 + }, + { + "epoch": 2.6447649572649574, + "grad_norm": 0.8583887815475464, + "learning_rate": 5.1616735848005306e-05, + "loss": 0.8715, + "step": 14855 + }, + { + "epoch": 2.64494301994302, + "grad_norm": 0.8107531666755676, + "learning_rate": 5.16044862866943e-05, + "loss": 0.7111, + "step": 14856 + }, + { + "epoch": 2.6451210826210825, + "grad_norm": 0.7675925493240356, + "learning_rate": 5.1592237673592867e-05, + "loss": 0.8145, + "step": 14857 + }, + { + "epoch": 2.6452991452991452, + "grad_norm": 0.9418326020240784, + "learning_rate": 5.157999000894098e-05, + "loss": 0.8454, + "step": 14858 + }, + { + "epoch": 2.645477207977208, + "grad_norm": 0.8420053720474243, + "learning_rate": 5.15677432929786e-05, + "loss": 0.7343, + "step": 14859 + }, + { + "epoch": 2.6456552706552707, + "grad_norm": 0.9815202951431274, + "learning_rate": 5.155549752594564e-05, + "loss": 0.9252, + "step": 14860 + }, + { + "epoch": 2.6458333333333335, + "grad_norm": 0.8282185792922974, + "learning_rate": 5.1543252708082146e-05, + "loss": 0.9935, + "step": 14861 + }, + { + "epoch": 2.646011396011396, + "grad_norm": 0.7398781180381775, + "learning_rate": 5.153100883962788e-05, + "loss": 0.5024, + "step": 14862 + }, + { + "epoch": 2.6461894586894585, + "grad_norm": 1.0273998975753784, + "learning_rate": 5.1518765920822856e-05, + "loss": 0.9023, + "step": 14863 + }, + { + "epoch": 2.6463675213675213, + "grad_norm": 0.8017948269844055, + "learning_rate": 5.150652395190689e-05, + "loss": 0.6755, + "step": 14864 + }, + { + "epoch": 2.646545584045584, + "grad_norm": 0.7470258474349976, + "learning_rate": 
5.1494282933119864e-05, + "loss": 0.5408, + "step": 14865 + }, + { + "epoch": 2.646723646723647, + "grad_norm": 0.8118627071380615, + "learning_rate": 5.1482042864701595e-05, + "loss": 0.8032, + "step": 14866 + }, + { + "epoch": 2.6469017094017095, + "grad_norm": 0.8302956223487854, + "learning_rate": 5.146980374689192e-05, + "loss": 0.7428, + "step": 14867 + }, + { + "epoch": 2.6470797720797723, + "grad_norm": 0.8660209774971008, + "learning_rate": 5.145756557993061e-05, + "loss": 0.8284, + "step": 14868 + }, + { + "epoch": 2.6472578347578346, + "grad_norm": 1.0153858661651611, + "learning_rate": 5.1445328364057475e-05, + "loss": 0.9766, + "step": 14869 + }, + { + "epoch": 2.6474358974358974, + "grad_norm": 0.9047706127166748, + "learning_rate": 5.143309209951223e-05, + "loss": 1.0099, + "step": 14870 + }, + { + "epoch": 2.64761396011396, + "grad_norm": 0.7924295663833618, + "learning_rate": 5.1420856786534724e-05, + "loss": 0.8385, + "step": 14871 + }, + { + "epoch": 2.647792022792023, + "grad_norm": 0.8885742425918579, + "learning_rate": 5.140862242536455e-05, + "loss": 1.0259, + "step": 14872 + }, + { + "epoch": 2.6479700854700856, + "grad_norm": 0.8826889991760254, + "learning_rate": 5.139638901624151e-05, + "loss": 1.0755, + "step": 14873 + }, + { + "epoch": 2.648148148148148, + "grad_norm": 0.7793754935264587, + "learning_rate": 5.138415655940526e-05, + "loss": 0.8289, + "step": 14874 + }, + { + "epoch": 2.6483262108262107, + "grad_norm": 0.8587870597839355, + "learning_rate": 5.137192505509547e-05, + "loss": 0.9185, + "step": 14875 + }, + { + "epoch": 2.6485042735042734, + "grad_norm": 0.8799259066581726, + "learning_rate": 5.13596945035518e-05, + "loss": 0.9493, + "step": 14876 + }, + { + "epoch": 2.648682336182336, + "grad_norm": 0.8108882308006287, + "learning_rate": 5.1347464905013834e-05, + "loss": 0.8892, + "step": 14877 + }, + { + "epoch": 2.648860398860399, + "grad_norm": 0.8387644290924072, + "learning_rate": 5.1335236259721296e-05, + "loss": 
0.8723, + "step": 14878 + }, + { + "epoch": 2.6490384615384617, + "grad_norm": 0.8750926852226257, + "learning_rate": 5.1323008567913655e-05, + "loss": 0.6978, + "step": 14879 + }, + { + "epoch": 2.6492165242165244, + "grad_norm": 0.7837518453598022, + "learning_rate": 5.131078182983055e-05, + "loss": 0.8747, + "step": 14880 + }, + { + "epoch": 2.6493945868945867, + "grad_norm": 0.8998439311981201, + "learning_rate": 5.1298556045711566e-05, + "loss": 0.9903, + "step": 14881 + }, + { + "epoch": 2.6495726495726495, + "grad_norm": 0.8012915253639221, + "learning_rate": 5.128633121579619e-05, + "loss": 0.9767, + "step": 14882 + }, + { + "epoch": 2.6497507122507122, + "grad_norm": 0.9051218032836914, + "learning_rate": 5.1274107340323964e-05, + "loss": 0.7454, + "step": 14883 + }, + { + "epoch": 2.649928774928775, + "grad_norm": 0.8373401761054993, + "learning_rate": 5.1261884419534376e-05, + "loss": 0.821, + "step": 14884 + }, + { + "epoch": 2.6501068376068377, + "grad_norm": 0.7482876181602478, + "learning_rate": 5.124966245366689e-05, + "loss": 0.7051, + "step": 14885 + }, + { + "epoch": 2.6502849002849, + "grad_norm": 0.8445764183998108, + "learning_rate": 5.1237441442961074e-05, + "loss": 0.7416, + "step": 14886 + }, + { + "epoch": 2.650462962962963, + "grad_norm": 0.887598991394043, + "learning_rate": 5.122522138765622e-05, + "loss": 0.9027, + "step": 14887 + }, + { + "epoch": 2.6506410256410255, + "grad_norm": 0.8089238405227661, + "learning_rate": 5.1213002287991905e-05, + "loss": 0.9294, + "step": 14888 + }, + { + "epoch": 2.6508190883190883, + "grad_norm": 0.8614209890365601, + "learning_rate": 5.120078414420739e-05, + "loss": 0.7716, + "step": 14889 + }, + { + "epoch": 2.650997150997151, + "grad_norm": 0.6805269718170166, + "learning_rate": 5.118856695654217e-05, + "loss": 0.6183, + "step": 14890 + }, + { + "epoch": 2.651175213675214, + "grad_norm": 0.9024596214294434, + "learning_rate": 5.117635072523559e-05, + "loss": 0.9516, + "step": 14891 + }, + { + 
"epoch": 2.6513532763532766, + "grad_norm": 0.906373143196106, + "learning_rate": 5.116413545052701e-05, + "loss": 0.7522, + "step": 14892 + }, + { + "epoch": 2.6515313390313393, + "grad_norm": 0.827235996723175, + "learning_rate": 5.1151921132655725e-05, + "loss": 0.6776, + "step": 14893 + }, + { + "epoch": 2.6517094017094016, + "grad_norm": 0.7769291996955872, + "learning_rate": 5.113970777186108e-05, + "loss": 0.6682, + "step": 14894 + }, + { + "epoch": 2.6518874643874644, + "grad_norm": 0.8420324921607971, + "learning_rate": 5.112749536838233e-05, + "loss": 0.8303, + "step": 14895 + }, + { + "epoch": 2.652065527065527, + "grad_norm": 0.789368748664856, + "learning_rate": 5.1115283922458814e-05, + "loss": 0.773, + "step": 14896 + }, + { + "epoch": 2.65224358974359, + "grad_norm": 0.9156190752983093, + "learning_rate": 5.1103073434329766e-05, + "loss": 1.0318, + "step": 14897 + }, + { + "epoch": 2.652421652421652, + "grad_norm": 1.0411027669906616, + "learning_rate": 5.109086390423441e-05, + "loss": 0.81, + "step": 14898 + }, + { + "epoch": 2.652599715099715, + "grad_norm": 0.9908538460731506, + "learning_rate": 5.107865533241198e-05, + "loss": 0.9386, + "step": 14899 + }, + { + "epoch": 2.6527777777777777, + "grad_norm": 0.7364035844802856, + "learning_rate": 5.106644771910165e-05, + "loss": 0.675, + "step": 14900 + }, + { + "epoch": 2.6529558404558404, + "grad_norm": 0.8409245014190674, + "learning_rate": 5.1054241064542686e-05, + "loss": 0.9446, + "step": 14901 + }, + { + "epoch": 2.653133903133903, + "grad_norm": 0.7731066942214966, + "learning_rate": 5.104203536897412e-05, + "loss": 0.4684, + "step": 14902 + }, + { + "epoch": 2.653311965811966, + "grad_norm": 0.9114529490470886, + "learning_rate": 5.102983063263525e-05, + "loss": 0.9551, + "step": 14903 + }, + { + "epoch": 2.6534900284900287, + "grad_norm": 0.7949321269989014, + "learning_rate": 5.101762685576503e-05, + "loss": 0.989, + "step": 14904 + }, + { + "epoch": 2.6536680911680914, + "grad_norm": 
0.940191924571991, + "learning_rate": 5.1005424038602724e-05, + "loss": 1.0377, + "step": 14905 + }, + { + "epoch": 2.6538461538461537, + "grad_norm": 0.7629654407501221, + "learning_rate": 5.0993222181387334e-05, + "loss": 0.7908, + "step": 14906 + }, + { + "epoch": 2.6540242165242165, + "grad_norm": 0.9712302684783936, + "learning_rate": 5.098102128435797e-05, + "loss": 1.1486, + "step": 14907 + }, + { + "epoch": 2.6542022792022792, + "grad_norm": 0.9054526686668396, + "learning_rate": 5.096882134775365e-05, + "loss": 0.8078, + "step": 14908 + }, + { + "epoch": 2.654380341880342, + "grad_norm": 0.824647068977356, + "learning_rate": 5.095662237181343e-05, + "loss": 0.9095, + "step": 14909 + }, + { + "epoch": 2.6545584045584043, + "grad_norm": 0.8760488033294678, + "learning_rate": 5.0944424356776287e-05, + "loss": 0.8538, + "step": 14910 + }, + { + "epoch": 2.654736467236467, + "grad_norm": 0.8012890219688416, + "learning_rate": 5.093222730288131e-05, + "loss": 0.7972, + "step": 14911 + }, + { + "epoch": 2.65491452991453, + "grad_norm": 0.9025147557258606, + "learning_rate": 5.0920031210367326e-05, + "loss": 0.8485, + "step": 14912 + }, + { + "epoch": 2.6550925925925926, + "grad_norm": 0.8621100783348083, + "learning_rate": 5.090783607947347e-05, + "loss": 1.1856, + "step": 14913 + }, + { + "epoch": 2.6552706552706553, + "grad_norm": 0.7914317846298218, + "learning_rate": 5.08956419104385e-05, + "loss": 0.78, + "step": 14914 + }, + { + "epoch": 2.655448717948718, + "grad_norm": 0.8691070675849915, + "learning_rate": 5.088344870350146e-05, + "loss": 0.8406, + "step": 14915 + }, + { + "epoch": 2.655626780626781, + "grad_norm": 0.8521141409873962, + "learning_rate": 5.087125645890121e-05, + "loss": 1.0077, + "step": 14916 + }, + { + "epoch": 2.6558048433048436, + "grad_norm": 0.7918437123298645, + "learning_rate": 5.08590651768766e-05, + "loss": 0.8367, + "step": 14917 + }, + { + "epoch": 2.655982905982906, + "grad_norm": 0.8580697178840637, + "learning_rate": 
5.084687485766659e-05, + "loss": 0.921, + "step": 14918 + }, + { + "epoch": 2.6561609686609686, + "grad_norm": 0.7943900227546692, + "learning_rate": 5.0834685501509894e-05, + "loss": 0.7934, + "step": 14919 + }, + { + "epoch": 2.6563390313390314, + "grad_norm": 0.7467655539512634, + "learning_rate": 5.082249710864544e-05, + "loss": 0.8625, + "step": 14920 + }, + { + "epoch": 2.656517094017094, + "grad_norm": 0.7654036283493042, + "learning_rate": 5.0810309679311996e-05, + "loss": 0.888, + "step": 14921 + }, + { + "epoch": 2.6566951566951564, + "grad_norm": 0.8428319692611694, + "learning_rate": 5.079812321374836e-05, + "loss": 0.858, + "step": 14922 + }, + { + "epoch": 2.656873219373219, + "grad_norm": 0.8273693323135376, + "learning_rate": 5.078593771219329e-05, + "loss": 0.8982, + "step": 14923 + }, + { + "epoch": 2.657051282051282, + "grad_norm": 0.9037185311317444, + "learning_rate": 5.077375317488553e-05, + "loss": 0.7022, + "step": 14924 + }, + { + "epoch": 2.6572293447293447, + "grad_norm": 0.916585385799408, + "learning_rate": 5.0761569602063816e-05, + "loss": 0.8058, + "step": 14925 + }, + { + "epoch": 2.6574074074074074, + "grad_norm": 0.8697561621665955, + "learning_rate": 5.074938699396687e-05, + "loss": 0.8142, + "step": 14926 + }, + { + "epoch": 2.65758547008547, + "grad_norm": 1.024512529373169, + "learning_rate": 5.073720535083334e-05, + "loss": 0.7462, + "step": 14927 + }, + { + "epoch": 2.657763532763533, + "grad_norm": 0.8258776664733887, + "learning_rate": 5.072502467290201e-05, + "loss": 0.7467, + "step": 14928 + }, + { + "epoch": 2.6579415954415957, + "grad_norm": 0.8279047012329102, + "learning_rate": 5.071284496041138e-05, + "loss": 0.9148, + "step": 14929 + }, + { + "epoch": 2.658119658119658, + "grad_norm": 0.8176717758178711, + "learning_rate": 5.070066621360021e-05, + "loss": 1.0971, + "step": 14930 + }, + { + "epoch": 2.6582977207977208, + "grad_norm": 0.7482925057411194, + "learning_rate": 5.0688488432707074e-05, + "loss": 0.8666, + 
"step": 14931 + }, + { + "epoch": 2.6584757834757835, + "grad_norm": 0.9302734136581421, + "learning_rate": 5.067631161797057e-05, + "loss": 0.9994, + "step": 14932 + }, + { + "epoch": 2.6586538461538463, + "grad_norm": 0.7811494469642639, + "learning_rate": 5.066413576962927e-05, + "loss": 0.5959, + "step": 14933 + }, + { + "epoch": 2.6588319088319086, + "grad_norm": 0.8109773993492126, + "learning_rate": 5.065196088792177e-05, + "loss": 0.7342, + "step": 14934 + }, + { + "epoch": 2.6590099715099713, + "grad_norm": 0.8351961374282837, + "learning_rate": 5.0639786973086525e-05, + "loss": 0.775, + "step": 14935 + }, + { + "epoch": 2.659188034188034, + "grad_norm": 0.8558792471885681, + "learning_rate": 5.062761402536216e-05, + "loss": 0.8819, + "step": 14936 + }, + { + "epoch": 2.659366096866097, + "grad_norm": 0.7928652167320251, + "learning_rate": 5.061544204498714e-05, + "loss": 0.8313, + "step": 14937 + }, + { + "epoch": 2.6595441595441596, + "grad_norm": 0.8388734459877014, + "learning_rate": 5.060327103219993e-05, + "loss": 0.7208, + "step": 14938 + }, + { + "epoch": 2.6597222222222223, + "grad_norm": 0.8921391367912292, + "learning_rate": 5.059110098723903e-05, + "loss": 0.8974, + "step": 14939 + }, + { + "epoch": 2.659900284900285, + "grad_norm": 0.8111342787742615, + "learning_rate": 5.057893191034286e-05, + "loss": 0.6879, + "step": 14940 + }, + { + "epoch": 2.660078347578348, + "grad_norm": 0.8677322864532471, + "learning_rate": 5.056676380174985e-05, + "loss": 0.8643, + "step": 14941 + }, + { + "epoch": 2.66025641025641, + "grad_norm": 0.7969355583190918, + "learning_rate": 5.055459666169839e-05, + "loss": 0.8462, + "step": 14942 + }, + { + "epoch": 2.660434472934473, + "grad_norm": 0.9927026629447937, + "learning_rate": 5.0542430490426975e-05, + "loss": 0.7954, + "step": 14943 + }, + { + "epoch": 2.6606125356125356, + "grad_norm": 1.0181084871292114, + "learning_rate": 5.053026528817379e-05, + "loss": 0.9597, + "step": 14944 + }, + { + "epoch": 
2.6607905982905984, + "grad_norm": 1.0274122953414917, + "learning_rate": 5.0518101055177355e-05, + "loss": 0.7321, + "step": 14945 + }, + { + "epoch": 2.6609686609686607, + "grad_norm": 1.056132197380066, + "learning_rate": 5.050593779167594e-05, + "loss": 0.8405, + "step": 14946 + }, + { + "epoch": 2.6611467236467234, + "grad_norm": 0.8586339950561523, + "learning_rate": 5.0493775497907846e-05, + "loss": 1.0238, + "step": 14947 + }, + { + "epoch": 2.661324786324786, + "grad_norm": 0.8103144764900208, + "learning_rate": 5.048161417411139e-05, + "loss": 0.5885, + "step": 14948 + }, + { + "epoch": 2.661502849002849, + "grad_norm": 0.7321345210075378, + "learning_rate": 5.0469453820524834e-05, + "loss": 0.7987, + "step": 14949 + }, + { + "epoch": 2.6616809116809117, + "grad_norm": 0.8244233727455139, + "learning_rate": 5.045729443738645e-05, + "loss": 0.8855, + "step": 14950 + }, + { + "epoch": 2.6618589743589745, + "grad_norm": 0.7888374924659729, + "learning_rate": 5.0445136024934456e-05, + "loss": 0.9192, + "step": 14951 + }, + { + "epoch": 2.662037037037037, + "grad_norm": 0.8414669036865234, + "learning_rate": 5.0432978583407044e-05, + "loss": 0.8152, + "step": 14952 + }, + { + "epoch": 2.6622150997151, + "grad_norm": 0.9176363348960876, + "learning_rate": 5.042082211304252e-05, + "loss": 0.8836, + "step": 14953 + }, + { + "epoch": 2.6623931623931623, + "grad_norm": 0.9827163219451904, + "learning_rate": 5.040866661407893e-05, + "loss": 0.963, + "step": 14954 + }, + { + "epoch": 2.662571225071225, + "grad_norm": 0.8765084743499756, + "learning_rate": 5.0396512086754535e-05, + "loss": 1.022, + "step": 14955 + }, + { + "epoch": 2.6627492877492878, + "grad_norm": 0.9236209392547607, + "learning_rate": 5.038435853130743e-05, + "loss": 0.9152, + "step": 14956 + }, + { + "epoch": 2.6629273504273505, + "grad_norm": 0.8300418853759766, + "learning_rate": 5.037220594797574e-05, + "loss": 0.8063, + "step": 14957 + }, + { + "epoch": 2.6631054131054133, + "grad_norm": 
0.9248050451278687, + "learning_rate": 5.036005433699764e-05, + "loss": 0.8799, + "step": 14958 + }, + { + "epoch": 2.6632834757834756, + "grad_norm": 0.9670597910881042, + "learning_rate": 5.0347903698611085e-05, + "loss": 0.9068, + "step": 14959 + }, + { + "epoch": 2.6634615384615383, + "grad_norm": 0.851403534412384, + "learning_rate": 5.033575403305428e-05, + "loss": 0.8058, + "step": 14960 + }, + { + "epoch": 2.663639601139601, + "grad_norm": 0.9643952250480652, + "learning_rate": 5.032360534056515e-05, + "loss": 1.076, + "step": 14961 + }, + { + "epoch": 2.663817663817664, + "grad_norm": 0.8473731279373169, + "learning_rate": 5.031145762138181e-05, + "loss": 0.9585, + "step": 14962 + }, + { + "epoch": 2.6639957264957266, + "grad_norm": 0.8265015482902527, + "learning_rate": 5.029931087574222e-05, + "loss": 0.8602, + "step": 14963 + }, + { + "epoch": 2.6641737891737893, + "grad_norm": 0.8004183173179626, + "learning_rate": 5.0287165103884416e-05, + "loss": 0.7293, + "step": 14964 + }, + { + "epoch": 2.664351851851852, + "grad_norm": 0.8410465121269226, + "learning_rate": 5.027502030604633e-05, + "loss": 0.9479, + "step": 14965 + }, + { + "epoch": 2.6645299145299144, + "grad_norm": 0.8365132808685303, + "learning_rate": 5.0262876482465925e-05, + "loss": 0.7373, + "step": 14966 + }, + { + "epoch": 2.664707977207977, + "grad_norm": 0.9017055630683899, + "learning_rate": 5.025073363338111e-05, + "loss": 0.9463, + "step": 14967 + }, + { + "epoch": 2.66488603988604, + "grad_norm": 0.7985300421714783, + "learning_rate": 5.023859175902988e-05, + "loss": 0.7074, + "step": 14968 + }, + { + "epoch": 2.6650641025641026, + "grad_norm": 0.8032601475715637, + "learning_rate": 5.022645085965001e-05, + "loss": 0.6796, + "step": 14969 + }, + { + "epoch": 2.6652421652421654, + "grad_norm": 0.7785899639129639, + "learning_rate": 5.021431093547948e-05, + "loss": 0.7256, + "step": 14970 + }, + { + "epoch": 2.6654202279202277, + "grad_norm": 0.8083044290542603, + "learning_rate": 
5.02021719867561e-05, + "loss": 0.9254, + "step": 14971 + }, + { + "epoch": 2.6655982905982905, + "grad_norm": 0.8896783590316772, + "learning_rate": 5.019003401371771e-05, + "loss": 0.9231, + "step": 14972 + }, + { + "epoch": 2.665776353276353, + "grad_norm": 0.9304720163345337, + "learning_rate": 5.017789701660215e-05, + "loss": 0.8915, + "step": 14973 + }, + { + "epoch": 2.665954415954416, + "grad_norm": 0.8683121204376221, + "learning_rate": 5.016576099564718e-05, + "loss": 0.8654, + "step": 14974 + }, + { + "epoch": 2.6661324786324787, + "grad_norm": 1.1082890033721924, + "learning_rate": 5.015362595109062e-05, + "loss": 1.0669, + "step": 14975 + }, + { + "epoch": 2.6663105413105415, + "grad_norm": 1.1696041822433472, + "learning_rate": 5.014149188317017e-05, + "loss": 0.9273, + "step": 14976 + }, + { + "epoch": 2.666488603988604, + "grad_norm": 0.8726202845573425, + "learning_rate": 5.0129358792123637e-05, + "loss": 0.6615, + "step": 14977 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.8246448636054993, + "learning_rate": 5.011722667818875e-05, + "loss": 0.8263, + "step": 14978 + }, + { + "epoch": 2.6668447293447293, + "grad_norm": 0.7201130390167236, + "learning_rate": 5.010509554160316e-05, + "loss": 0.7122, + "step": 14979 + }, + { + "epoch": 2.667022792022792, + "grad_norm": 0.8296586275100708, + "learning_rate": 5.009296538260457e-05, + "loss": 0.8816, + "step": 14980 + }, + { + "epoch": 2.6672008547008548, + "grad_norm": 0.8647085428237915, + "learning_rate": 5.008083620143067e-05, + "loss": 1.0, + "step": 14981 + }, + { + "epoch": 2.6673789173789175, + "grad_norm": 0.8175796270370483, + "learning_rate": 5.0068707998319045e-05, + "loss": 0.6727, + "step": 14982 + }, + { + "epoch": 2.66755698005698, + "grad_norm": 0.8537090420722961, + "learning_rate": 5.0056580773507434e-05, + "loss": 0.8034, + "step": 14983 + }, + { + "epoch": 2.6677350427350426, + "grad_norm": 0.7980232238769531, + "learning_rate": 5.00444545272333e-05, + "loss": 0.83, + 
"step": 14984 + }, + { + "epoch": 2.6679131054131053, + "grad_norm": 0.8231784701347351, + "learning_rate": 5.003232925973438e-05, + "loss": 0.6292, + "step": 14985 + }, + { + "epoch": 2.668091168091168, + "grad_norm": 0.9140519499778748, + "learning_rate": 5.0020204971248096e-05, + "loss": 0.893, + "step": 14986 + }, + { + "epoch": 2.668269230769231, + "grad_norm": 0.7462875247001648, + "learning_rate": 5.000808166201212e-05, + "loss": 0.7335, + "step": 14987 + }, + { + "epoch": 2.6684472934472936, + "grad_norm": 0.8201214671134949, + "learning_rate": 4.999595933226392e-05, + "loss": 0.7888, + "step": 14988 + }, + { + "epoch": 2.6686253561253563, + "grad_norm": 0.9165699481964111, + "learning_rate": 4.9983837982241024e-05, + "loss": 0.8808, + "step": 14989 + }, + { + "epoch": 2.6688034188034186, + "grad_norm": 0.9286229610443115, + "learning_rate": 4.997171761218092e-05, + "loss": 0.969, + "step": 14990 + }, + { + "epoch": 2.6689814814814814, + "grad_norm": 0.6710283160209656, + "learning_rate": 4.995959822232109e-05, + "loss": 0.6046, + "step": 14991 + }, + { + "epoch": 2.669159544159544, + "grad_norm": 0.9091618061065674, + "learning_rate": 4.994747981289895e-05, + "loss": 0.9747, + "step": 14992 + }, + { + "epoch": 2.669337606837607, + "grad_norm": 0.7992748618125916, + "learning_rate": 4.993536238415204e-05, + "loss": 0.8441, + "step": 14993 + }, + { + "epoch": 2.6695156695156697, + "grad_norm": 0.926811695098877, + "learning_rate": 4.992324593631762e-05, + "loss": 1.0308, + "step": 14994 + }, + { + "epoch": 2.669693732193732, + "grad_norm": 0.8966291546821594, + "learning_rate": 4.9911130469633216e-05, + "loss": 1.0689, + "step": 14995 + }, + { + "epoch": 2.6698717948717947, + "grad_norm": 0.8300046324729919, + "learning_rate": 4.989901598433616e-05, + "loss": 0.8539, + "step": 14996 + }, + { + "epoch": 2.6700498575498575, + "grad_norm": 0.9567606449127197, + "learning_rate": 4.988690248066381e-05, + "loss": 0.7707, + "step": 14997 + }, + { + "epoch": 
2.67022792022792, + "grad_norm": 0.7993598580360413, + "learning_rate": 4.987478995885351e-05, + "loss": 0.9241, + "step": 14998 + }, + { + "epoch": 2.670405982905983, + "grad_norm": 0.9573900103569031, + "learning_rate": 4.986267841914253e-05, + "loss": 0.8051, + "step": 14999 + }, + { + "epoch": 2.6705840455840457, + "grad_norm": 0.8562188148498535, + "learning_rate": 4.985056786176828e-05, + "loss": 0.8818, + "step": 15000 + }, + { + "epoch": 2.6707621082621085, + "grad_norm": 0.7997880578041077, + "learning_rate": 4.983845828696792e-05, + "loss": 0.8749, + "step": 15001 + }, + { + "epoch": 2.6709401709401708, + "grad_norm": 0.8442137837409973, + "learning_rate": 4.982634969497879e-05, + "loss": 1.0239, + "step": 15002 + }, + { + "epoch": 2.6711182336182335, + "grad_norm": 0.775762140750885, + "learning_rate": 4.981424208603812e-05, + "loss": 0.728, + "step": 15003 + }, + { + "epoch": 2.6712962962962963, + "grad_norm": 0.7570006251335144, + "learning_rate": 4.9802135460383126e-05, + "loss": 0.6964, + "step": 15004 + }, + { + "epoch": 2.671474358974359, + "grad_norm": 0.8406931161880493, + "learning_rate": 4.979002981825101e-05, + "loss": 0.783, + "step": 15005 + }, + { + "epoch": 2.671652421652422, + "grad_norm": 0.827357828617096, + "learning_rate": 4.977792515987896e-05, + "loss": 0.9294, + "step": 15006 + }, + { + "epoch": 2.671830484330484, + "grad_norm": 0.9244057536125183, + "learning_rate": 4.9765821485504094e-05, + "loss": 0.8993, + "step": 15007 + }, + { + "epoch": 2.672008547008547, + "grad_norm": 0.7569696307182312, + "learning_rate": 4.975371879536368e-05, + "loss": 0.8305, + "step": 15008 + }, + { + "epoch": 2.6721866096866096, + "grad_norm": 0.8337959051132202, + "learning_rate": 4.9741617089694695e-05, + "loss": 0.8793, + "step": 15009 + }, + { + "epoch": 2.6723646723646723, + "grad_norm": 0.7254770994186401, + "learning_rate": 4.97295163687344e-05, + "loss": 0.9325, + "step": 15010 + }, + { + "epoch": 2.672542735042735, + "grad_norm": 
0.7988013029098511, + "learning_rate": 4.971741663271972e-05, + "loss": 0.9787, + "step": 15011 + }, + { + "epoch": 2.672720797720798, + "grad_norm": 0.8326970338821411, + "learning_rate": 4.9705317881887845e-05, + "loss": 0.9164, + "step": 15012 + }, + { + "epoch": 2.6728988603988606, + "grad_norm": 0.7416687607765198, + "learning_rate": 4.96932201164758e-05, + "loss": 0.9041, + "step": 15013 + }, + { + "epoch": 2.6730769230769234, + "grad_norm": 0.868765652179718, + "learning_rate": 4.968112333672059e-05, + "loss": 0.646, + "step": 15014 + }, + { + "epoch": 2.6732549857549857, + "grad_norm": 0.7440044283866882, + "learning_rate": 4.966902754285925e-05, + "loss": 0.9147, + "step": 15015 + }, + { + "epoch": 2.6734330484330484, + "grad_norm": 0.8410077691078186, + "learning_rate": 4.9656932735128724e-05, + "loss": 0.8044, + "step": 15016 + }, + { + "epoch": 2.673611111111111, + "grad_norm": 0.8185286521911621, + "learning_rate": 4.964483891376606e-05, + "loss": 0.8057, + "step": 15017 + }, + { + "epoch": 2.673789173789174, + "grad_norm": 0.8550063967704773, + "learning_rate": 4.9632746079008166e-05, + "loss": 0.7841, + "step": 15018 + }, + { + "epoch": 2.673967236467236, + "grad_norm": 0.9171682000160217, + "learning_rate": 4.962065423109199e-05, + "loss": 0.8731, + "step": 15019 + }, + { + "epoch": 2.674145299145299, + "grad_norm": 0.8567686676979065, + "learning_rate": 4.9608563370254436e-05, + "loss": 0.9284, + "step": 15020 + }, + { + "epoch": 2.6743233618233617, + "grad_norm": 0.8641629219055176, + "learning_rate": 4.959647349673241e-05, + "loss": 1.0165, + "step": 15021 + }, + { + "epoch": 2.6745014245014245, + "grad_norm": 0.8058172464370728, + "learning_rate": 4.958438461076277e-05, + "loss": 0.9737, + "step": 15022 + }, + { + "epoch": 2.6746794871794872, + "grad_norm": 0.8329246640205383, + "learning_rate": 4.95722967125824e-05, + "loss": 0.7943, + "step": 15023 + }, + { + "epoch": 2.67485754985755, + "grad_norm": 0.9603211879730225, + "learning_rate": 
4.956020980242807e-05, + "loss": 0.9453, + "step": 15024 + }, + { + "epoch": 2.6750356125356127, + "grad_norm": 0.8635705709457397, + "learning_rate": 4.9548123880536736e-05, + "loss": 0.9028, + "step": 15025 + }, + { + "epoch": 2.6752136752136755, + "grad_norm": 0.8909839987754822, + "learning_rate": 4.9536038947145024e-05, + "loss": 1.0376, + "step": 15026 + }, + { + "epoch": 2.675391737891738, + "grad_norm": 0.7507481575012207, + "learning_rate": 4.952395500248984e-05, + "loss": 0.7151, + "step": 15027 + }, + { + "epoch": 2.6755698005698005, + "grad_norm": 0.9425675272941589, + "learning_rate": 4.951187204680791e-05, + "loss": 0.839, + "step": 15028 + }, + { + "epoch": 2.6757478632478633, + "grad_norm": 0.8826829195022583, + "learning_rate": 4.949979008033596e-05, + "loss": 1.0107, + "step": 15029 + }, + { + "epoch": 2.675925925925926, + "grad_norm": 0.9209766387939453, + "learning_rate": 4.948770910331072e-05, + "loss": 0.8685, + "step": 15030 + }, + { + "epoch": 2.6761039886039883, + "grad_norm": 0.8018497824668884, + "learning_rate": 4.947562911596889e-05, + "loss": 0.7417, + "step": 15031 + }, + { + "epoch": 2.676282051282051, + "grad_norm": 0.7865417003631592, + "learning_rate": 4.9463550118547155e-05, + "loss": 0.9332, + "step": 15032 + }, + { + "epoch": 2.676460113960114, + "grad_norm": 0.8146806955337524, + "learning_rate": 4.945147211128216e-05, + "loss": 0.8658, + "step": 15033 + }, + { + "epoch": 2.6766381766381766, + "grad_norm": 0.8176286816596985, + "learning_rate": 4.943939509441054e-05, + "loss": 1.0603, + "step": 15034 + }, + { + "epoch": 2.6768162393162394, + "grad_norm": 0.8441028594970703, + "learning_rate": 4.942731906816897e-05, + "loss": 0.8699, + "step": 15035 + }, + { + "epoch": 2.676994301994302, + "grad_norm": 1.0035977363586426, + "learning_rate": 4.941524403279405e-05, + "loss": 0.8149, + "step": 15036 + }, + { + "epoch": 2.677172364672365, + "grad_norm": 0.8316586017608643, + "learning_rate": 4.9403169988522324e-05, + "loss": 
0.9674, + "step": 15037 + }, + { + "epoch": 2.6773504273504276, + "grad_norm": 0.7379693388938904, + "learning_rate": 4.9391096935590375e-05, + "loss": 0.7097, + "step": 15038 + }, + { + "epoch": 2.67752849002849, + "grad_norm": 0.8861358165740967, + "learning_rate": 4.937902487423473e-05, + "loss": 0.9145, + "step": 15039 + }, + { + "epoch": 2.6777065527065527, + "grad_norm": 0.8769996166229248, + "learning_rate": 4.9366953804691994e-05, + "loss": 0.92, + "step": 15040 + }, + { + "epoch": 2.6778846153846154, + "grad_norm": 0.891703724861145, + "learning_rate": 4.9354883727198545e-05, + "loss": 0.8898, + "step": 15041 + }, + { + "epoch": 2.678062678062678, + "grad_norm": 0.8371208310127258, + "learning_rate": 4.934281464199099e-05, + "loss": 0.8868, + "step": 15042 + }, + { + "epoch": 2.6782407407407405, + "grad_norm": 0.8618297576904297, + "learning_rate": 4.933074654930574e-05, + "loss": 0.8577, + "step": 15043 + }, + { + "epoch": 2.6784188034188032, + "grad_norm": 0.7748361229896545, + "learning_rate": 4.931867944937926e-05, + "loss": 0.7273, + "step": 15044 + }, + { + "epoch": 2.678596866096866, + "grad_norm": 0.8320143222808838, + "learning_rate": 4.930661334244797e-05, + "loss": 0.8654, + "step": 15045 + }, + { + "epoch": 2.6787749287749287, + "grad_norm": 0.8370615243911743, + "learning_rate": 4.929454822874829e-05, + "loss": 0.751, + "step": 15046 + }, + { + "epoch": 2.6789529914529915, + "grad_norm": 0.9115342497825623, + "learning_rate": 4.9282484108516614e-05, + "loss": 0.823, + "step": 15047 + }, + { + "epoch": 2.6791310541310542, + "grad_norm": 0.9542914032936096, + "learning_rate": 4.9270420981989294e-05, + "loss": 0.9271, + "step": 15048 + }, + { + "epoch": 2.679309116809117, + "grad_norm": 0.765336275100708, + "learning_rate": 4.9258358849402655e-05, + "loss": 0.5523, + "step": 15049 + }, + { + "epoch": 2.6794871794871797, + "grad_norm": 0.8169335722923279, + "learning_rate": 4.924629771099315e-05, + "loss": 0.7437, + "step": 15050 + }, + { + 
"epoch": 2.679665242165242, + "grad_norm": 0.8192304968833923, + "learning_rate": 4.9234237566996935e-05, + "loss": 0.8888, + "step": 15051 + }, + { + "epoch": 2.679843304843305, + "grad_norm": 0.8657594919204712, + "learning_rate": 4.922217841765041e-05, + "loss": 0.9858, + "step": 15052 + }, + { + "epoch": 2.6800213675213675, + "grad_norm": 0.9291723370552063, + "learning_rate": 4.921012026318982e-05, + "loss": 0.9731, + "step": 15053 + }, + { + "epoch": 2.6801994301994303, + "grad_norm": 0.7988953590393066, + "learning_rate": 4.919806310385138e-05, + "loss": 0.8467, + "step": 15054 + }, + { + "epoch": 2.6803774928774926, + "grad_norm": 0.8022913336753845, + "learning_rate": 4.9186006939871434e-05, + "loss": 0.9009, + "step": 15055 + }, + { + "epoch": 2.6805555555555554, + "grad_norm": 0.8444825410842896, + "learning_rate": 4.917395177148605e-05, + "loss": 0.8851, + "step": 15056 + }, + { + "epoch": 2.680733618233618, + "grad_norm": 0.8054760694503784, + "learning_rate": 4.9161897598931575e-05, + "loss": 0.8679, + "step": 15057 + }, + { + "epoch": 2.680911680911681, + "grad_norm": 0.8291507959365845, + "learning_rate": 4.9149844422444023e-05, + "loss": 0.7229, + "step": 15058 + }, + { + "epoch": 2.6810897435897436, + "grad_norm": 0.9225491285324097, + "learning_rate": 4.91377922422597e-05, + "loss": 0.7584, + "step": 15059 + }, + { + "epoch": 2.6812678062678064, + "grad_norm": 0.9598490595817566, + "learning_rate": 4.912574105861466e-05, + "loss": 1.0548, + "step": 15060 + }, + { + "epoch": 2.681445868945869, + "grad_norm": 0.7480899691581726, + "learning_rate": 4.911369087174504e-05, + "loss": 0.8389, + "step": 15061 + }, + { + "epoch": 2.681623931623932, + "grad_norm": 1.0396811962127686, + "learning_rate": 4.910164168188696e-05, + "loss": 0.8776, + "step": 15062 + }, + { + "epoch": 2.681801994301994, + "grad_norm": 0.8191503882408142, + "learning_rate": 4.9089593489276465e-05, + "loss": 0.7601, + "step": 15063 + }, + { + "epoch": 2.681980056980057, + 
"grad_norm": 0.8405289053916931, + "learning_rate": 4.907754629414959e-05, + "loss": 1.0859, + "step": 15064 + }, + { + "epoch": 2.6821581196581197, + "grad_norm": 0.8369600176811218, + "learning_rate": 4.90655000967425e-05, + "loss": 0.9159, + "step": 15065 + }, + { + "epoch": 2.6823361823361824, + "grad_norm": 0.8304924368858337, + "learning_rate": 4.905345489729104e-05, + "loss": 0.743, + "step": 15066 + }, + { + "epoch": 2.682514245014245, + "grad_norm": 0.7378702163696289, + "learning_rate": 4.904141069603139e-05, + "loss": 0.9386, + "step": 15067 + }, + { + "epoch": 2.6826923076923075, + "grad_norm": 0.9135075807571411, + "learning_rate": 4.902936749319935e-05, + "loss": 0.7341, + "step": 15068 + }, + { + "epoch": 2.6828703703703702, + "grad_norm": 0.77586430311203, + "learning_rate": 4.901732528903101e-05, + "loss": 0.5586, + "step": 15069 + }, + { + "epoch": 2.683048433048433, + "grad_norm": 0.8733307719230652, + "learning_rate": 4.900528408376228e-05, + "loss": 0.8173, + "step": 15070 + }, + { + "epoch": 2.6832264957264957, + "grad_norm": 0.7499578595161438, + "learning_rate": 4.8993243877629066e-05, + "loss": 0.7355, + "step": 15071 + }, + { + "epoch": 2.6834045584045585, + "grad_norm": 0.8372282385826111, + "learning_rate": 4.8981204670867295e-05, + "loss": 0.8169, + "step": 15072 + }, + { + "epoch": 2.6835826210826212, + "grad_norm": 0.7705212235450745, + "learning_rate": 4.8969166463712834e-05, + "loss": 0.7382, + "step": 15073 + }, + { + "epoch": 2.683760683760684, + "grad_norm": 0.8367058038711548, + "learning_rate": 4.89571292564015e-05, + "loss": 0.7268, + "step": 15074 + }, + { + "epoch": 2.6839387464387463, + "grad_norm": 0.8421934843063354, + "learning_rate": 4.8945093049169233e-05, + "loss": 0.8319, + "step": 15075 + }, + { + "epoch": 2.684116809116809, + "grad_norm": 0.8927276730537415, + "learning_rate": 4.893305784225181e-05, + "loss": 0.8669, + "step": 15076 + }, + { + "epoch": 2.684294871794872, + "grad_norm": 0.8147335052490234, + 
"learning_rate": 4.892102363588503e-05, + "loss": 0.7722, + "step": 15077 + }, + { + "epoch": 2.6844729344729346, + "grad_norm": 0.9491320848464966, + "learning_rate": 4.890899043030469e-05, + "loss": 1.0213, + "step": 15078 + }, + { + "epoch": 2.6846509971509973, + "grad_norm": 0.8635398745536804, + "learning_rate": 4.889695822574651e-05, + "loss": 0.797, + "step": 15079 + }, + { + "epoch": 2.6848290598290596, + "grad_norm": 0.7290985584259033, + "learning_rate": 4.888492702244636e-05, + "loss": 1.0142, + "step": 15080 + }, + { + "epoch": 2.6850071225071224, + "grad_norm": 0.7667058110237122, + "learning_rate": 4.8872896820639794e-05, + "loss": 0.7547, + "step": 15081 + }, + { + "epoch": 2.685185185185185, + "grad_norm": 0.9096128344535828, + "learning_rate": 4.886086762056269e-05, + "loss": 0.7972, + "step": 15082 + }, + { + "epoch": 2.685363247863248, + "grad_norm": 0.7461803555488586, + "learning_rate": 4.884883942245057e-05, + "loss": 0.8994, + "step": 15083 + }, + { + "epoch": 2.6855413105413106, + "grad_norm": 0.7640016674995422, + "learning_rate": 4.883681222653923e-05, + "loss": 0.7607, + "step": 15084 + }, + { + "epoch": 2.6857193732193734, + "grad_norm": 0.7481253743171692, + "learning_rate": 4.882478603306427e-05, + "loss": 0.7089, + "step": 15085 + }, + { + "epoch": 2.685897435897436, + "grad_norm": 0.825998842716217, + "learning_rate": 4.881276084226132e-05, + "loss": 0.6617, + "step": 15086 + }, + { + "epoch": 2.6860754985754984, + "grad_norm": 0.9775291085243225, + "learning_rate": 4.8800736654365986e-05, + "loss": 0.9345, + "step": 15087 + }, + { + "epoch": 2.686253561253561, + "grad_norm": 0.8158339262008667, + "learning_rate": 4.878871346961387e-05, + "loss": 0.8198, + "step": 15088 + }, + { + "epoch": 2.686431623931624, + "grad_norm": 0.8778133988380432, + "learning_rate": 4.8776691288240486e-05, + "loss": 0.8323, + "step": 15089 + }, + { + "epoch": 2.6866096866096867, + "grad_norm": 0.9657309055328369, + "learning_rate": 4.8764670110481505e-05, 
+ "loss": 0.907, + "step": 15090 + }, + { + "epoch": 2.6867877492877494, + "grad_norm": 1.0467438697814941, + "learning_rate": 4.8752649936572304e-05, + "loss": 1.0128, + "step": 15091 + }, + { + "epoch": 2.6869658119658117, + "grad_norm": 0.7682142853736877, + "learning_rate": 4.874063076674854e-05, + "loss": 1.0164, + "step": 15092 + }, + { + "epoch": 2.6871438746438745, + "grad_norm": 0.8184331059455872, + "learning_rate": 4.8728612601245574e-05, + "loss": 0.6614, + "step": 15093 + }, + { + "epoch": 2.6873219373219372, + "grad_norm": 0.8372936248779297, + "learning_rate": 4.871659544029896e-05, + "loss": 0.9011, + "step": 15094 + }, + { + "epoch": 2.6875, + "grad_norm": 0.7872710824012756, + "learning_rate": 4.870457928414414e-05, + "loss": 0.6986, + "step": 15095 + }, + { + "epoch": 2.6876780626780628, + "grad_norm": 0.7297250628471375, + "learning_rate": 4.8692564133016485e-05, + "loss": 0.5399, + "step": 15096 + }, + { + "epoch": 2.6878561253561255, + "grad_norm": 0.8855645060539246, + "learning_rate": 4.868054998715153e-05, + "loss": 0.8992, + "step": 15097 + }, + { + "epoch": 2.6880341880341883, + "grad_norm": 0.9055765271186829, + "learning_rate": 4.866853684678452e-05, + "loss": 0.888, + "step": 15098 + }, + { + "epoch": 2.6882122507122506, + "grad_norm": 1.0414996147155762, + "learning_rate": 4.865652471215093e-05, + "loss": 1.0375, + "step": 15099 + }, + { + "epoch": 2.6883903133903133, + "grad_norm": 0.8606446385383606, + "learning_rate": 4.8644513583486086e-05, + "loss": 0.8906, + "step": 15100 + }, + { + "epoch": 2.688568376068376, + "grad_norm": 0.9065528512001038, + "learning_rate": 4.8632503461025316e-05, + "loss": 0.848, + "step": 15101 + }, + { + "epoch": 2.688746438746439, + "grad_norm": 0.7832834720611572, + "learning_rate": 4.862049434500393e-05, + "loss": 0.7028, + "step": 15102 + }, + { + "epoch": 2.6889245014245016, + "grad_norm": 0.7107385396957397, + "learning_rate": 4.860848623565723e-05, + "loss": 0.7249, + "step": 15103 + }, + { + 
"epoch": 2.689102564102564, + "grad_norm": 0.8936449289321899, + "learning_rate": 4.8596479133220485e-05, + "loss": 0.9651, + "step": 15104 + }, + { + "epoch": 2.6892806267806266, + "grad_norm": 0.9019163846969604, + "learning_rate": 4.8584473037928944e-05, + "loss": 0.7165, + "step": 15105 + }, + { + "epoch": 2.6894586894586894, + "grad_norm": 0.8838223218917847, + "learning_rate": 4.857246795001782e-05, + "loss": 0.8148, + "step": 15106 + }, + { + "epoch": 2.689636752136752, + "grad_norm": 0.8004612922668457, + "learning_rate": 4.856046386972243e-05, + "loss": 0.9109, + "step": 15107 + }, + { + "epoch": 2.689814814814815, + "grad_norm": 0.9337486028671265, + "learning_rate": 4.854846079727781e-05, + "loss": 1.0952, + "step": 15108 + }, + { + "epoch": 2.6899928774928776, + "grad_norm": 0.6513102650642395, + "learning_rate": 4.853645873291926e-05, + "loss": 0.5435, + "step": 15109 + }, + { + "epoch": 2.6901709401709404, + "grad_norm": 0.8750485181808472, + "learning_rate": 4.85244576768819e-05, + "loss": 0.8783, + "step": 15110 + }, + { + "epoch": 2.6903490028490027, + "grad_norm": 0.9513342976570129, + "learning_rate": 4.851245762940085e-05, + "loss": 0.8822, + "step": 15111 + }, + { + "epoch": 2.6905270655270654, + "grad_norm": 0.8832191824913025, + "learning_rate": 4.850045859071125e-05, + "loss": 0.9216, + "step": 15112 + }, + { + "epoch": 2.690705128205128, + "grad_norm": 0.875396728515625, + "learning_rate": 4.8488460561048175e-05, + "loss": 0.998, + "step": 15113 + }, + { + "epoch": 2.690883190883191, + "grad_norm": 0.8847890496253967, + "learning_rate": 4.847646354064668e-05, + "loss": 1.0916, + "step": 15114 + }, + { + "epoch": 2.6910612535612537, + "grad_norm": 0.8235226273536682, + "learning_rate": 4.846446752974187e-05, + "loss": 0.8154, + "step": 15115 + }, + { + "epoch": 2.691239316239316, + "grad_norm": 0.8099366426467896, + "learning_rate": 4.845247252856878e-05, + "loss": 0.9392, + "step": 15116 + }, + { + "epoch": 2.6914173789173788, + 
"grad_norm": 0.8525599837303162, + "learning_rate": 4.84404785373624e-05, + "loss": 0.6619, + "step": 15117 + }, + { + "epoch": 2.6915954415954415, + "grad_norm": 1.0223274230957031, + "learning_rate": 4.842848555635775e-05, + "loss": 0.9479, + "step": 15118 + }, + { + "epoch": 2.6917735042735043, + "grad_norm": 0.7834655046463013, + "learning_rate": 4.841649358578978e-05, + "loss": 0.6962, + "step": 15119 + }, + { + "epoch": 2.691951566951567, + "grad_norm": 0.787391185760498, + "learning_rate": 4.8404502625893474e-05, + "loss": 0.8598, + "step": 15120 + }, + { + "epoch": 2.6921296296296298, + "grad_norm": 0.907228410243988, + "learning_rate": 4.839251267690371e-05, + "loss": 0.9913, + "step": 15121 + }, + { + "epoch": 2.6923076923076925, + "grad_norm": 0.8313533663749695, + "learning_rate": 4.838052373905554e-05, + "loss": 0.9542, + "step": 15122 + }, + { + "epoch": 2.692485754985755, + "grad_norm": 0.8444675207138062, + "learning_rate": 4.83685358125837e-05, + "loss": 0.7437, + "step": 15123 + }, + { + "epoch": 2.6926638176638176, + "grad_norm": 0.8656189441680908, + "learning_rate": 4.835654889772319e-05, + "loss": 1.104, + "step": 15124 + }, + { + "epoch": 2.6928418803418803, + "grad_norm": 0.9181584715843201, + "learning_rate": 4.8344562994708805e-05, + "loss": 0.8533, + "step": 15125 + }, + { + "epoch": 2.693019943019943, + "grad_norm": 0.5977702140808105, + "learning_rate": 4.833257810377542e-05, + "loss": 0.495, + "step": 15126 + }, + { + "epoch": 2.693198005698006, + "grad_norm": 0.8839932084083557, + "learning_rate": 4.8320594225157834e-05, + "loss": 0.8026, + "step": 15127 + }, + { + "epoch": 2.693376068376068, + "grad_norm": 0.876559853553772, + "learning_rate": 4.8308611359090846e-05, + "loss": 0.893, + "step": 15128 + }, + { + "epoch": 2.693554131054131, + "grad_norm": 0.7847880721092224, + "learning_rate": 4.829662950580924e-05, + "loss": 0.7794, + "step": 15129 + }, + { + "epoch": 2.6937321937321936, + "grad_norm": 0.8713442087173462, + 
"learning_rate": 4.828464866554778e-05, + "loss": 1.0394, + "step": 15130 + }, + { + "epoch": 2.6939102564102564, + "grad_norm": 0.9720988869667053, + "learning_rate": 4.827266883854116e-05, + "loss": 0.7844, + "step": 15131 + }, + { + "epoch": 2.694088319088319, + "grad_norm": 0.8163195252418518, + "learning_rate": 4.82606900250242e-05, + "loss": 0.711, + "step": 15132 + }, + { + "epoch": 2.694266381766382, + "grad_norm": 0.7119855880737305, + "learning_rate": 4.8248712225231486e-05, + "loss": 0.6224, + "step": 15133 + }, + { + "epoch": 2.6944444444444446, + "grad_norm": 0.8176950812339783, + "learning_rate": 4.823673543939777e-05, + "loss": 0.8695, + "step": 15134 + }, + { + "epoch": 2.6946225071225074, + "grad_norm": 0.8138632774353027, + "learning_rate": 4.822475966775771e-05, + "loss": 0.7331, + "step": 15135 + }, + { + "epoch": 2.6948005698005697, + "grad_norm": 0.9323116540908813, + "learning_rate": 4.821278491054589e-05, + "loss": 0.8275, + "step": 15136 + }, + { + "epoch": 2.6949786324786325, + "grad_norm": 0.7593950033187866, + "learning_rate": 4.820081116799704e-05, + "loss": 0.9571, + "step": 15137 + }, + { + "epoch": 2.695156695156695, + "grad_norm": 0.9058876037597656, + "learning_rate": 4.818883844034563e-05, + "loss": 0.7676, + "step": 15138 + }, + { + "epoch": 2.695334757834758, + "grad_norm": 1.0943962335586548, + "learning_rate": 4.8176866727826365e-05, + "loss": 0.7542, + "step": 15139 + }, + { + "epoch": 2.6955128205128203, + "grad_norm": 0.9133912324905396, + "learning_rate": 4.8164896030673664e-05, + "loss": 0.8419, + "step": 15140 + }, + { + "epoch": 2.695690883190883, + "grad_norm": 0.8556821942329407, + "learning_rate": 4.8152926349122195e-05, + "loss": 0.8234, + "step": 15141 + }, + { + "epoch": 2.6958689458689458, + "grad_norm": 1.0329471826553345, + "learning_rate": 4.814095768340643e-05, + "loss": 0.8181, + "step": 15142 + }, + { + "epoch": 2.6960470085470085, + "grad_norm": 0.89934903383255, + "learning_rate": 4.812899003376087e-05, + 
"loss": 0.8392, + "step": 15143 + }, + { + "epoch": 2.6962250712250713, + "grad_norm": 0.7836576104164124, + "learning_rate": 4.811702340042e-05, + "loss": 0.9491, + "step": 15144 + }, + { + "epoch": 2.696403133903134, + "grad_norm": 0.9841184020042419, + "learning_rate": 4.810505778361828e-05, + "loss": 1.0763, + "step": 15145 + }, + { + "epoch": 2.6965811965811968, + "grad_norm": 1.0479893684387207, + "learning_rate": 4.80930931835901e-05, + "loss": 1.054, + "step": 15146 + }, + { + "epoch": 2.6967592592592595, + "grad_norm": 0.895803689956665, + "learning_rate": 4.808112960057002e-05, + "loss": 0.8769, + "step": 15147 + }, + { + "epoch": 2.696937321937322, + "grad_norm": 0.8467312455177307, + "learning_rate": 4.806916703479227e-05, + "loss": 0.8036, + "step": 15148 + }, + { + "epoch": 2.6971153846153846, + "grad_norm": 0.7371073365211487, + "learning_rate": 4.8057205486491366e-05, + "loss": 0.72, + "step": 15149 + }, + { + "epoch": 2.6972934472934473, + "grad_norm": 0.9631866812705994, + "learning_rate": 4.80452449559016e-05, + "loss": 0.8661, + "step": 15150 + }, + { + "epoch": 2.69747150997151, + "grad_norm": 0.8467531204223633, + "learning_rate": 4.803328544325735e-05, + "loss": 0.9359, + "step": 15151 + }, + { + "epoch": 2.6976495726495724, + "grad_norm": 0.8170605897903442, + "learning_rate": 4.802132694879291e-05, + "loss": 0.9086, + "step": 15152 + }, + { + "epoch": 2.697827635327635, + "grad_norm": 0.8378857970237732, + "learning_rate": 4.800936947274255e-05, + "loss": 0.6255, + "step": 15153 + }, + { + "epoch": 2.698005698005698, + "grad_norm": 0.8074176907539368, + "learning_rate": 4.799741301534067e-05, + "loss": 0.9129, + "step": 15154 + }, + { + "epoch": 2.6981837606837606, + "grad_norm": 0.862147331237793, + "learning_rate": 4.798545757682139e-05, + "loss": 0.8298, + "step": 15155 + }, + { + "epoch": 2.6983618233618234, + "grad_norm": 0.8020915985107422, + "learning_rate": 4.797350315741905e-05, + "loss": 0.8364, + "step": 15156 + }, + { + "epoch": 
2.698539886039886, + "grad_norm": 0.7929054498672485, + "learning_rate": 4.7961549757367854e-05, + "loss": 1.0302, + "step": 15157 + }, + { + "epoch": 2.698717948717949, + "grad_norm": 0.8528931140899658, + "learning_rate": 4.7949597376901964e-05, + "loss": 0.7891, + "step": 15158 + }, + { + "epoch": 2.6988960113960117, + "grad_norm": 0.8090588450431824, + "learning_rate": 4.793764601625561e-05, + "loss": 0.7905, + "step": 15159 + }, + { + "epoch": 2.699074074074074, + "grad_norm": 0.8221202492713928, + "learning_rate": 4.7925695675662916e-05, + "loss": 0.8156, + "step": 15160 + }, + { + "epoch": 2.6992521367521367, + "grad_norm": 0.8121498823165894, + "learning_rate": 4.791374635535802e-05, + "loss": 0.865, + "step": 15161 + }, + { + "epoch": 2.6994301994301995, + "grad_norm": 0.7626228928565979, + "learning_rate": 4.790179805557513e-05, + "loss": 0.8033, + "step": 15162 + }, + { + "epoch": 2.699608262108262, + "grad_norm": 0.8483169078826904, + "learning_rate": 4.7889850776548205e-05, + "loss": 0.9239, + "step": 15163 + }, + { + "epoch": 2.6997863247863245, + "grad_norm": 0.8302589058876038, + "learning_rate": 4.7877904518511485e-05, + "loss": 0.8445, + "step": 15164 + }, + { + "epoch": 2.6999643874643873, + "grad_norm": 0.9140453338623047, + "learning_rate": 4.786595928169887e-05, + "loss": 1.0492, + "step": 15165 + }, + { + "epoch": 2.70014245014245, + "grad_norm": 0.8046873807907104, + "learning_rate": 4.785401506634453e-05, + "loss": 1.0009, + "step": 15166 + }, + { + "epoch": 2.7003205128205128, + "grad_norm": 0.8879752159118652, + "learning_rate": 4.7842071872682434e-05, + "loss": 0.7788, + "step": 15167 + }, + { + "epoch": 2.7004985754985755, + "grad_norm": 0.8190163969993591, + "learning_rate": 4.783012970094659e-05, + "loss": 0.9063, + "step": 15168 + }, + { + "epoch": 2.7006766381766383, + "grad_norm": 0.9363130331039429, + "learning_rate": 4.781818855137099e-05, + "loss": 0.9723, + "step": 15169 + }, + { + "epoch": 2.700854700854701, + "grad_norm": 
0.8428171873092651, + "learning_rate": 4.780624842418958e-05, + "loss": 0.9173, + "step": 15170 + }, + { + "epoch": 2.701032763532764, + "grad_norm": 0.8089821934700012, + "learning_rate": 4.779430931963627e-05, + "loss": 0.8996, + "step": 15171 + }, + { + "epoch": 2.701210826210826, + "grad_norm": 0.8893290758132935, + "learning_rate": 4.77823712379451e-05, + "loss": 0.9483, + "step": 15172 + }, + { + "epoch": 2.701388888888889, + "grad_norm": 0.8589824438095093, + "learning_rate": 4.777043417934981e-05, + "loss": 0.8765, + "step": 15173 + }, + { + "epoch": 2.7015669515669516, + "grad_norm": 0.8665438294410706, + "learning_rate": 4.7758498144084405e-05, + "loss": 0.8546, + "step": 15174 + }, + { + "epoch": 2.7017450142450143, + "grad_norm": 0.743841826915741, + "learning_rate": 4.774656313238272e-05, + "loss": 0.6866, + "step": 15175 + }, + { + "epoch": 2.7019230769230766, + "grad_norm": 0.9317346811294556, + "learning_rate": 4.7734629144478574e-05, + "loss": 0.8004, + "step": 15176 + }, + { + "epoch": 2.7021011396011394, + "grad_norm": 0.8244655132293701, + "learning_rate": 4.77226961806058e-05, + "loss": 0.9302, + "step": 15177 + }, + { + "epoch": 2.702279202279202, + "grad_norm": 1.0759600400924683, + "learning_rate": 4.771076424099815e-05, + "loss": 0.9073, + "step": 15178 + }, + { + "epoch": 2.702457264957265, + "grad_norm": 0.8852303624153137, + "learning_rate": 4.769883332588954e-05, + "loss": 0.8084, + "step": 15179 + }, + { + "epoch": 2.7026353276353277, + "grad_norm": 0.8642051815986633, + "learning_rate": 4.7686903435513564e-05, + "loss": 1.0018, + "step": 15180 + }, + { + "epoch": 2.7028133903133904, + "grad_norm": 0.9442928433418274, + "learning_rate": 4.767497457010408e-05, + "loss": 0.8099, + "step": 15181 + }, + { + "epoch": 2.702991452991453, + "grad_norm": 0.8357751965522766, + "learning_rate": 4.7663046729894776e-05, + "loss": 0.8594, + "step": 15182 + }, + { + "epoch": 2.703169515669516, + "grad_norm": 1.0791765451431274, + "learning_rate": 
4.765111991511936e-05, + "loss": 1.1203, + "step": 15183 + }, + { + "epoch": 2.703347578347578, + "grad_norm": 0.7855654954910278, + "learning_rate": 4.7639194126011485e-05, + "loss": 0.7218, + "step": 15184 + }, + { + "epoch": 2.703525641025641, + "grad_norm": 0.8058420419692993, + "learning_rate": 4.762726936280485e-05, + "loss": 0.7885, + "step": 15185 + }, + { + "epoch": 2.7037037037037037, + "grad_norm": 0.7701787352561951, + "learning_rate": 4.761534562573302e-05, + "loss": 0.6378, + "step": 15186 + }, + { + "epoch": 2.7038817663817665, + "grad_norm": 0.9011744856834412, + "learning_rate": 4.760342291502976e-05, + "loss": 0.9106, + "step": 15187 + }, + { + "epoch": 2.7040598290598292, + "grad_norm": 0.7268012762069702, + "learning_rate": 4.759150123092851e-05, + "loss": 0.6303, + "step": 15188 + }, + { + "epoch": 2.7042378917378915, + "grad_norm": 0.8369283676147461, + "learning_rate": 4.7579580573663e-05, + "loss": 0.8013, + "step": 15189 + }, + { + "epoch": 2.7044159544159543, + "grad_norm": 0.9511098861694336, + "learning_rate": 4.756766094346663e-05, + "loss": 1.0211, + "step": 15190 + }, + { + "epoch": 2.704594017094017, + "grad_norm": 0.8408896327018738, + "learning_rate": 4.7555742340573074e-05, + "loss": 1.1018, + "step": 15191 + }, + { + "epoch": 2.70477207977208, + "grad_norm": 0.9166504740715027, + "learning_rate": 4.7543824765215795e-05, + "loss": 0.9222, + "step": 15192 + }, + { + "epoch": 2.7049501424501425, + "grad_norm": 0.8373738527297974, + "learning_rate": 4.753190821762826e-05, + "loss": 0.9735, + "step": 15193 + }, + { + "epoch": 2.7051282051282053, + "grad_norm": 0.8610605597496033, + "learning_rate": 4.751999269804408e-05, + "loss": 0.7942, + "step": 15194 + }, + { + "epoch": 2.705306267806268, + "grad_norm": 0.8778019547462463, + "learning_rate": 4.750807820669654e-05, + "loss": 0.8055, + "step": 15195 + }, + { + "epoch": 2.7054843304843303, + "grad_norm": 0.9997664093971252, + "learning_rate": 4.749616474381921e-05, + "loss": 0.8461, 
+ "step": 15196 + }, + { + "epoch": 2.705662393162393, + "grad_norm": 0.8362101316452026, + "learning_rate": 4.748425230964545e-05, + "loss": 1.0008, + "step": 15197 + }, + { + "epoch": 2.705840455840456, + "grad_norm": 0.870482861995697, + "learning_rate": 4.747234090440869e-05, + "loss": 0.9547, + "step": 15198 + }, + { + "epoch": 2.7060185185185186, + "grad_norm": 0.867431104183197, + "learning_rate": 4.746043052834228e-05, + "loss": 0.8533, + "step": 15199 + }, + { + "epoch": 2.7061965811965814, + "grad_norm": 0.842071533203125, + "learning_rate": 4.7448521181679604e-05, + "loss": 0.8919, + "step": 15200 + }, + { + "epoch": 2.7063746438746437, + "grad_norm": 0.9487791657447815, + "learning_rate": 4.743661286465398e-05, + "loss": 0.8072, + "step": 15201 + }, + { + "epoch": 2.7065527065527064, + "grad_norm": 0.8469042181968689, + "learning_rate": 4.742470557749874e-05, + "loss": 0.8792, + "step": 15202 + }, + { + "epoch": 2.706730769230769, + "grad_norm": 0.86415696144104, + "learning_rate": 4.7412799320447145e-05, + "loss": 0.9725, + "step": 15203 + }, + { + "epoch": 2.706908831908832, + "grad_norm": 0.9035004377365112, + "learning_rate": 4.740089409373257e-05, + "loss": 0.9915, + "step": 15204 + }, + { + "epoch": 2.7070868945868947, + "grad_norm": 0.8122807741165161, + "learning_rate": 4.7388989897588156e-05, + "loss": 0.946, + "step": 15205 + }, + { + "epoch": 2.7072649572649574, + "grad_norm": 0.9801422357559204, + "learning_rate": 4.737708673224721e-05, + "loss": 0.9357, + "step": 15206 + }, + { + "epoch": 2.70744301994302, + "grad_norm": 1.0265265703201294, + "learning_rate": 4.736518459794295e-05, + "loss": 0.7982, + "step": 15207 + }, + { + "epoch": 2.7076210826210825, + "grad_norm": 0.828814685344696, + "learning_rate": 4.735328349490855e-05, + "loss": 0.6864, + "step": 15208 + }, + { + "epoch": 2.7077991452991452, + "grad_norm": 0.7948212623596191, + "learning_rate": 4.7341383423377195e-05, + "loss": 0.8661, + "step": 15209 + }, + { + "epoch": 
2.707977207977208, + "grad_norm": 0.8372616767883301, + "learning_rate": 4.7329484383582046e-05, + "loss": 0.8818, + "step": 15210 + }, + { + "epoch": 2.7081552706552707, + "grad_norm": 0.8000285029411316, + "learning_rate": 4.731758637575624e-05, + "loss": 0.8006, + "step": 15211 + }, + { + "epoch": 2.7083333333333335, + "grad_norm": 0.7860875725746155, + "learning_rate": 4.730568940013289e-05, + "loss": 0.926, + "step": 15212 + }, + { + "epoch": 2.708511396011396, + "grad_norm": 0.9157412052154541, + "learning_rate": 4.7293793456945054e-05, + "loss": 0.7042, + "step": 15213 + }, + { + "epoch": 2.7086894586894585, + "grad_norm": 0.8802906274795532, + "learning_rate": 4.728189854642589e-05, + "loss": 0.8639, + "step": 15214 + }, + { + "epoch": 2.7088675213675213, + "grad_norm": 0.8047248721122742, + "learning_rate": 4.7270004668808397e-05, + "loss": 0.7603, + "step": 15215 + }, + { + "epoch": 2.709045584045584, + "grad_norm": 0.9848080277442932, + "learning_rate": 4.725811182432564e-05, + "loss": 0.8213, + "step": 15216 + }, + { + "epoch": 2.709223646723647, + "grad_norm": 0.8568090200424194, + "learning_rate": 4.724622001321062e-05, + "loss": 0.7663, + "step": 15217 + }, + { + "epoch": 2.7094017094017095, + "grad_norm": 0.7926214337348938, + "learning_rate": 4.7234329235696284e-05, + "loss": 0.874, + "step": 15218 + }, + { + "epoch": 2.7095797720797723, + "grad_norm": 0.8389978408813477, + "learning_rate": 4.7222439492015734e-05, + "loss": 0.623, + "step": 15219 + }, + { + "epoch": 2.7097578347578346, + "grad_norm": 0.8635174036026001, + "learning_rate": 4.7210550782401773e-05, + "loss": 0.822, + "step": 15220 + }, + { + "epoch": 2.7099358974358974, + "grad_norm": 0.8381666541099548, + "learning_rate": 4.7198663107087446e-05, + "loss": 1.0864, + "step": 15221 + }, + { + "epoch": 2.71011396011396, + "grad_norm": 1.0722376108169556, + "learning_rate": 4.718677646630564e-05, + "loss": 0.8527, + "step": 15222 + }, + { + "epoch": 2.710292022792023, + "grad_norm": 
0.9505516290664673, + "learning_rate": 4.7174890860289224e-05, + "loss": 1.0645, + "step": 15223 + }, + { + "epoch": 2.7104700854700856, + "grad_norm": 0.7757406234741211, + "learning_rate": 4.7163006289271095e-05, + "loss": 0.5924, + "step": 15224 + }, + { + "epoch": 2.710648148148148, + "grad_norm": 0.816387414932251, + "learning_rate": 4.71511227534841e-05, + "loss": 0.8337, + "step": 15225 + }, + { + "epoch": 2.7108262108262107, + "grad_norm": 0.7817156910896301, + "learning_rate": 4.7139240253161065e-05, + "loss": 0.8315, + "step": 15226 + }, + { + "epoch": 2.7110042735042734, + "grad_norm": 0.9753041863441467, + "learning_rate": 4.7127358788534816e-05, + "loss": 0.851, + "step": 15227 + }, + { + "epoch": 2.711182336182336, + "grad_norm": 0.7564638257026672, + "learning_rate": 4.7115478359838095e-05, + "loss": 0.8132, + "step": 15228 + }, + { + "epoch": 2.711360398860399, + "grad_norm": 0.8709259629249573, + "learning_rate": 4.710359896730379e-05, + "loss": 1.0277, + "step": 15229 + }, + { + "epoch": 2.7115384615384617, + "grad_norm": 0.9849836230278015, + "learning_rate": 4.7091720611164504e-05, + "loss": 0.9778, + "step": 15230 + }, + { + "epoch": 2.7117165242165244, + "grad_norm": 0.8330100178718567, + "learning_rate": 4.707984329165309e-05, + "loss": 0.7138, + "step": 15231 + }, + { + "epoch": 2.7118945868945867, + "grad_norm": 1.005644679069519, + "learning_rate": 4.706796700900221e-05, + "loss": 1.0089, + "step": 15232 + }, + { + "epoch": 2.7120726495726495, + "grad_norm": 0.8292263746261597, + "learning_rate": 4.705609176344452e-05, + "loss": 0.8323, + "step": 15233 + }, + { + "epoch": 2.7122507122507122, + "grad_norm": 0.860713005065918, + "learning_rate": 4.704421755521281e-05, + "loss": 0.821, + "step": 15234 + }, + { + "epoch": 2.712428774928775, + "grad_norm": 0.8316803574562073, + "learning_rate": 4.703234438453958e-05, + "loss": 0.9181, + "step": 15235 + }, + { + "epoch": 2.7126068376068377, + "grad_norm": 0.7368014454841614, + "learning_rate": 
4.70204722516576e-05, + "loss": 0.8206, + "step": 15236 + }, + { + "epoch": 2.7127849002849, + "grad_norm": 1.0202926397323608, + "learning_rate": 4.7008601156799336e-05, + "loss": 0.8101, + "step": 15237 + }, + { + "epoch": 2.712962962962963, + "grad_norm": 0.8069320917129517, + "learning_rate": 4.69967311001975e-05, + "loss": 0.9042, + "step": 15238 + }, + { + "epoch": 2.7131410256410255, + "grad_norm": 0.8426684737205505, + "learning_rate": 4.69848620820846e-05, + "loss": 0.7318, + "step": 15239 + }, + { + "epoch": 2.7133190883190883, + "grad_norm": 0.8863842487335205, + "learning_rate": 4.69729941026932e-05, + "loss": 1.0172, + "step": 15240 + }, + { + "epoch": 2.713497150997151, + "grad_norm": 0.7984182834625244, + "learning_rate": 4.696112716225582e-05, + "loss": 0.8298, + "step": 15241 + }, + { + "epoch": 2.713675213675214, + "grad_norm": 0.8328375220298767, + "learning_rate": 4.6949261261005e-05, + "loss": 0.7663, + "step": 15242 + }, + { + "epoch": 2.7138532763532766, + "grad_norm": 0.9197641015052795, + "learning_rate": 4.693739639917314e-05, + "loss": 0.8951, + "step": 15243 + }, + { + "epoch": 2.7140313390313393, + "grad_norm": 0.7421545386314392, + "learning_rate": 4.692553257699286e-05, + "loss": 0.7235, + "step": 15244 + }, + { + "epoch": 2.7142094017094016, + "grad_norm": 0.8033188581466675, + "learning_rate": 4.691366979469642e-05, + "loss": 0.9693, + "step": 15245 + }, + { + "epoch": 2.7143874643874644, + "grad_norm": 0.8765473365783691, + "learning_rate": 4.6901808052516436e-05, + "loss": 0.8851, + "step": 15246 + }, + { + "epoch": 2.714565527065527, + "grad_norm": 0.8351873755455017, + "learning_rate": 4.688994735068515e-05, + "loss": 1.0156, + "step": 15247 + }, + { + "epoch": 2.71474358974359, + "grad_norm": 0.8569470643997192, + "learning_rate": 4.6878087689435046e-05, + "loss": 0.7149, + "step": 15248 + }, + { + "epoch": 2.714921652421652, + "grad_norm": 0.8334367871284485, + "learning_rate": 4.686622906899847e-05, + "loss": 0.9218, + 
"step": 15249 + }, + { + "epoch": 2.715099715099715, + "grad_norm": 0.8889651298522949, + "learning_rate": 4.685437148960775e-05, + "loss": 0.8987, + "step": 15250 + }, + { + "epoch": 2.7152777777777777, + "grad_norm": 0.9381657838821411, + "learning_rate": 4.684251495149522e-05, + "loss": 0.7798, + "step": 15251 + }, + { + "epoch": 2.7154558404558404, + "grad_norm": 0.7698730826377869, + "learning_rate": 4.68306594548932e-05, + "loss": 0.8248, + "step": 15252 + }, + { + "epoch": 2.715633903133903, + "grad_norm": 0.8980026245117188, + "learning_rate": 4.681880500003391e-05, + "loss": 1.0156, + "step": 15253 + }, + { + "epoch": 2.715811965811966, + "grad_norm": 0.7872338891029358, + "learning_rate": 4.6806951587149694e-05, + "loss": 0.6389, + "step": 15254 + }, + { + "epoch": 2.7159900284900287, + "grad_norm": 0.8155974745750427, + "learning_rate": 4.6795099216472774e-05, + "loss": 0.9081, + "step": 15255 + }, + { + "epoch": 2.7161680911680914, + "grad_norm": 0.7678217887878418, + "learning_rate": 4.678324788823535e-05, + "loss": 0.6193, + "step": 15256 + }, + { + "epoch": 2.7163461538461537, + "grad_norm": 0.75429767370224, + "learning_rate": 4.6771397602669643e-05, + "loss": 0.9384, + "step": 15257 + }, + { + "epoch": 2.7165242165242165, + "grad_norm": 0.8755250573158264, + "learning_rate": 4.675954836000779e-05, + "loss": 0.8563, + "step": 15258 + }, + { + "epoch": 2.7167022792022792, + "grad_norm": 0.8393009305000305, + "learning_rate": 4.6747700160482053e-05, + "loss": 0.9407, + "step": 15259 + }, + { + "epoch": 2.716880341880342, + "grad_norm": 0.8478221297264099, + "learning_rate": 4.673585300432445e-05, + "loss": 0.7562, + "step": 15260 + }, + { + "epoch": 2.7170584045584043, + "grad_norm": 0.7497259974479675, + "learning_rate": 4.672400689176722e-05, + "loss": 0.8406, + "step": 15261 + }, + { + "epoch": 2.717236467236467, + "grad_norm": 0.9695250391960144, + "learning_rate": 4.671216182304234e-05, + "loss": 0.9505, + "step": 15262 + }, + { + "epoch": 
2.71741452991453, + "grad_norm": 0.9375512599945068, + "learning_rate": 4.6700317798382e-05, + "loss": 0.9024, + "step": 15263 + }, + { + "epoch": 2.7175925925925926, + "grad_norm": 0.7930737137794495, + "learning_rate": 4.6688474818018194e-05, + "loss": 0.8416, + "step": 15264 + }, + { + "epoch": 2.7177706552706553, + "grad_norm": 0.9707022309303284, + "learning_rate": 4.667663288218298e-05, + "loss": 1.1172, + "step": 15265 + }, + { + "epoch": 2.717948717948718, + "grad_norm": 0.7616816759109497, + "learning_rate": 4.666479199110838e-05, + "loss": 0.8557, + "step": 15266 + }, + { + "epoch": 2.718126780626781, + "grad_norm": 0.7836055159568787, + "learning_rate": 4.66529521450264e-05, + "loss": 0.7299, + "step": 15267 + }, + { + "epoch": 2.7183048433048436, + "grad_norm": 0.8313519954681396, + "learning_rate": 4.664111334416894e-05, + "loss": 0.8268, + "step": 15268 + }, + { + "epoch": 2.718482905982906, + "grad_norm": 0.9130576252937317, + "learning_rate": 4.662927558876812e-05, + "loss": 0.8913, + "step": 15269 + }, + { + "epoch": 2.7186609686609686, + "grad_norm": 0.8552213907241821, + "learning_rate": 4.661743887905569e-05, + "loss": 0.9396, + "step": 15270 + }, + { + "epoch": 2.7188390313390314, + "grad_norm": 0.7953839898109436, + "learning_rate": 4.660560321526373e-05, + "loss": 0.74, + "step": 15271 + }, + { + "epoch": 2.719017094017094, + "grad_norm": 0.9148657321929932, + "learning_rate": 4.6593768597623974e-05, + "loss": 0.7821, + "step": 15272 + }, + { + "epoch": 2.7191951566951564, + "grad_norm": 0.8587655425071716, + "learning_rate": 4.658193502636843e-05, + "loss": 0.9495, + "step": 15273 + }, + { + "epoch": 2.719373219373219, + "grad_norm": 0.8915669322013855, + "learning_rate": 4.6570102501728896e-05, + "loss": 0.8612, + "step": 15274 + }, + { + "epoch": 2.719551282051282, + "grad_norm": 0.957039475440979, + "learning_rate": 4.655827102393717e-05, + "loss": 0.8506, + "step": 15275 + }, + { + "epoch": 2.7197293447293447, + "grad_norm": 
0.7784267067909241, + "learning_rate": 4.654644059322519e-05, + "loss": 0.6864, + "step": 15276 + }, + { + "epoch": 2.7199074074074074, + "grad_norm": 0.9508241415023804, + "learning_rate": 4.65346112098246e-05, + "loss": 1.0097, + "step": 15277 + }, + { + "epoch": 2.72008547008547, + "grad_norm": 0.8316742777824402, + "learning_rate": 4.6522782873967265e-05, + "loss": 0.7444, + "step": 15278 + }, + { + "epoch": 2.720263532763533, + "grad_norm": 0.8781944513320923, + "learning_rate": 4.651095558588491e-05, + "loss": 0.8725, + "step": 15279 + }, + { + "epoch": 2.7204415954415957, + "grad_norm": 0.9407825469970703, + "learning_rate": 4.649912934580927e-05, + "loss": 0.9788, + "step": 15280 + }, + { + "epoch": 2.720619658119658, + "grad_norm": 0.9863289594650269, + "learning_rate": 4.6487304153972045e-05, + "loss": 0.7777, + "step": 15281 + }, + { + "epoch": 2.7207977207977208, + "grad_norm": 0.7580869793891907, + "learning_rate": 4.6475480010604945e-05, + "loss": 0.5835, + "step": 15282 + }, + { + "epoch": 2.7209757834757835, + "grad_norm": 0.7973836660385132, + "learning_rate": 4.646365691593961e-05, + "loss": 0.7633, + "step": 15283 + }, + { + "epoch": 2.7211538461538463, + "grad_norm": 0.8107978701591492, + "learning_rate": 4.645183487020772e-05, + "loss": 0.7149, + "step": 15284 + }, + { + "epoch": 2.7213319088319086, + "grad_norm": 0.8944578170776367, + "learning_rate": 4.644001387364084e-05, + "loss": 0.9227, + "step": 15285 + }, + { + "epoch": 2.7215099715099713, + "grad_norm": 0.7592978477478027, + "learning_rate": 4.642819392647071e-05, + "loss": 0.464, + "step": 15286 + }, + { + "epoch": 2.721688034188034, + "grad_norm": 0.8484344482421875, + "learning_rate": 4.641637502892876e-05, + "loss": 1.0439, + "step": 15287 + }, + { + "epoch": 2.721866096866097, + "grad_norm": 0.8766823410987854, + "learning_rate": 4.640455718124667e-05, + "loss": 0.7561, + "step": 15288 + }, + { + "epoch": 2.7220441595441596, + "grad_norm": 0.8039024472236633, + "learning_rate": 
4.639274038365594e-05, + "loss": 0.6774, + "step": 15289 + }, + { + "epoch": 2.7222222222222223, + "grad_norm": 0.8199611902236938, + "learning_rate": 4.63809246363881e-05, + "loss": 0.7721, + "step": 15290 + }, + { + "epoch": 2.722400284900285, + "grad_norm": 0.8209745287895203, + "learning_rate": 4.636910993967467e-05, + "loss": 0.7017, + "step": 15291 + }, + { + "epoch": 2.722578347578348, + "grad_norm": 0.8822476267814636, + "learning_rate": 4.6357296293747075e-05, + "loss": 0.8742, + "step": 15292 + }, + { + "epoch": 2.72275641025641, + "grad_norm": 0.8172603249549866, + "learning_rate": 4.634548369883687e-05, + "loss": 0.8165, + "step": 15293 + }, + { + "epoch": 2.722934472934473, + "grad_norm": 0.8601866960525513, + "learning_rate": 4.633367215517546e-05, + "loss": 0.7961, + "step": 15294 + }, + { + "epoch": 2.7231125356125356, + "grad_norm": 0.9346174001693726, + "learning_rate": 4.632186166299425e-05, + "loss": 0.9229, + "step": 15295 + }, + { + "epoch": 2.7232905982905984, + "grad_norm": 0.8956635594367981, + "learning_rate": 4.631005222252465e-05, + "loss": 0.7886, + "step": 15296 + }, + { + "epoch": 2.7234686609686607, + "grad_norm": 0.8453384637832642, + "learning_rate": 4.629824383399805e-05, + "loss": 0.8513, + "step": 15297 + }, + { + "epoch": 2.7236467236467234, + "grad_norm": 0.8931429982185364, + "learning_rate": 4.628643649764581e-05, + "loss": 1.0195, + "step": 15298 + }, + { + "epoch": 2.723824786324786, + "grad_norm": 0.7326723337173462, + "learning_rate": 4.6274630213699265e-05, + "loss": 0.7616, + "step": 15299 + }, + { + "epoch": 2.724002849002849, + "grad_norm": 0.8572023510932922, + "learning_rate": 4.6262824982389706e-05, + "loss": 0.8266, + "step": 15300 + }, + { + "epoch": 2.7241809116809117, + "grad_norm": 0.7753783464431763, + "learning_rate": 4.625102080394853e-05, + "loss": 0.6907, + "step": 15301 + }, + { + "epoch": 2.7243589743589745, + "grad_norm": 0.8758052587509155, + "learning_rate": 4.623921767860687e-05, + "loss": 0.6369, 
+ "step": 15302 + }, + { + "epoch": 2.724537037037037, + "grad_norm": 0.8508220314979553, + "learning_rate": 4.6227415606596104e-05, + "loss": 0.933, + "step": 15303 + }, + { + "epoch": 2.7247150997151, + "grad_norm": 0.7440072298049927, + "learning_rate": 4.621561458814743e-05, + "loss": 0.7172, + "step": 15304 + }, + { + "epoch": 2.7248931623931623, + "grad_norm": 0.9081870317459106, + "learning_rate": 4.6203814623492046e-05, + "loss": 0.8964, + "step": 15305 + }, + { + "epoch": 2.725071225071225, + "grad_norm": 0.9127907156944275, + "learning_rate": 4.619201571286117e-05, + "loss": 1.0081, + "step": 15306 + }, + { + "epoch": 2.7252492877492878, + "grad_norm": 0.9508554935455322, + "learning_rate": 4.618021785648597e-05, + "loss": 0.94, + "step": 15307 + }, + { + "epoch": 2.7254273504273505, + "grad_norm": 0.8726735711097717, + "learning_rate": 4.616842105459761e-05, + "loss": 0.9284, + "step": 15308 + }, + { + "epoch": 2.7256054131054133, + "grad_norm": 0.9266753792762756, + "learning_rate": 4.6156625307427206e-05, + "loss": 0.9069, + "step": 15309 + }, + { + "epoch": 2.7257834757834756, + "grad_norm": 0.952553391456604, + "learning_rate": 4.614483061520584e-05, + "loss": 0.8604, + "step": 15310 + }, + { + "epoch": 2.7259615384615383, + "grad_norm": 0.7702621817588806, + "learning_rate": 4.613303697816471e-05, + "loss": 0.808, + "step": 15311 + }, + { + "epoch": 2.726139601139601, + "grad_norm": 0.8052653670310974, + "learning_rate": 4.612124439653477e-05, + "loss": 0.8696, + "step": 15312 + }, + { + "epoch": 2.726317663817664, + "grad_norm": 0.8808547854423523, + "learning_rate": 4.610945287054714e-05, + "loss": 0.9595, + "step": 15313 + }, + { + "epoch": 2.7264957264957266, + "grad_norm": 1.0233266353607178, + "learning_rate": 4.609766240043284e-05, + "loss": 1.0691, + "step": 15314 + }, + { + "epoch": 2.7266737891737893, + "grad_norm": 0.8129898309707642, + "learning_rate": 4.6085872986422826e-05, + "loss": 0.9269, + "step": 15315 + }, + { + "epoch": 
2.726851851851852, + "grad_norm": 1.2745141983032227, + "learning_rate": 4.607408462874823e-05, + "loss": 0.937, + "step": 15316 + }, + { + "epoch": 2.7270299145299144, + "grad_norm": 0.808274507522583, + "learning_rate": 4.606229732763984e-05, + "loss": 0.789, + "step": 15317 + }, + { + "epoch": 2.727207977207977, + "grad_norm": 0.8849375247955322, + "learning_rate": 4.605051108332875e-05, + "loss": 0.8993, + "step": 15318 + }, + { + "epoch": 2.72738603988604, + "grad_norm": 0.8251593112945557, + "learning_rate": 4.603872589604576e-05, + "loss": 0.9057, + "step": 15319 + }, + { + "epoch": 2.7275641025641026, + "grad_norm": 0.8271582126617432, + "learning_rate": 4.602694176602188e-05, + "loss": 0.8378, + "step": 15320 + }, + { + "epoch": 2.7277421652421654, + "grad_norm": 0.8139070868492126, + "learning_rate": 4.6015158693487956e-05, + "loss": 0.8014, + "step": 15321 + }, + { + "epoch": 2.7279202279202277, + "grad_norm": 0.8873880505561829, + "learning_rate": 4.600337667867486e-05, + "loss": 0.8707, + "step": 15322 + }, + { + "epoch": 2.7280982905982905, + "grad_norm": 0.8616414666175842, + "learning_rate": 4.599159572181342e-05, + "loss": 0.8538, + "step": 15323 + }, + { + "epoch": 2.728276353276353, + "grad_norm": 0.8280995488166809, + "learning_rate": 4.5979815823134466e-05, + "loss": 0.8444, + "step": 15324 + }, + { + "epoch": 2.728454415954416, + "grad_norm": 0.8684375882148743, + "learning_rate": 4.596803698286878e-05, + "loss": 0.7562, + "step": 15325 + }, + { + "epoch": 2.7286324786324787, + "grad_norm": 0.8113002181053162, + "learning_rate": 4.595625920124723e-05, + "loss": 0.8331, + "step": 15326 + }, + { + "epoch": 2.7288105413105415, + "grad_norm": 0.8675588965415955, + "learning_rate": 4.5944482478500436e-05, + "loss": 1.1016, + "step": 15327 + }, + { + "epoch": 2.728988603988604, + "grad_norm": 0.9015034437179565, + "learning_rate": 4.593270681485927e-05, + "loss": 0.9002, + "step": 15328 + }, + { + "epoch": 2.7291666666666665, + "grad_norm": 
0.9215324521064758, + "learning_rate": 4.592093221055439e-05, + "loss": 0.9491, + "step": 15329 + }, + { + "epoch": 2.7293447293447293, + "grad_norm": 0.8969921469688416, + "learning_rate": 4.590915866581651e-05, + "loss": 0.8791, + "step": 15330 + }, + { + "epoch": 2.729522792022792, + "grad_norm": 0.9012344479560852, + "learning_rate": 4.5897386180876304e-05, + "loss": 0.9114, + "step": 15331 + }, + { + "epoch": 2.7297008547008548, + "grad_norm": 1.0024429559707642, + "learning_rate": 4.588561475596438e-05, + "loss": 0.8782, + "step": 15332 + }, + { + "epoch": 2.7298789173789175, + "grad_norm": 0.9079484343528748, + "learning_rate": 4.5873844391311496e-05, + "loss": 1.0012, + "step": 15333 + }, + { + "epoch": 2.73005698005698, + "grad_norm": 0.709800660610199, + "learning_rate": 4.5862075087148124e-05, + "loss": 0.7473, + "step": 15334 + }, + { + "epoch": 2.7302350427350426, + "grad_norm": 0.9776272773742676, + "learning_rate": 4.585030684370497e-05, + "loss": 1.1927, + "step": 15335 + }, + { + "epoch": 2.7304131054131053, + "grad_norm": 0.8624512553215027, + "learning_rate": 4.5838539661212565e-05, + "loss": 0.6661, + "step": 15336 + }, + { + "epoch": 2.730591168091168, + "grad_norm": 0.7901379466056824, + "learning_rate": 4.5826773539901456e-05, + "loss": 0.8244, + "step": 15337 + }, + { + "epoch": 2.730769230769231, + "grad_norm": 0.8546316027641296, + "learning_rate": 4.58150084800022e-05, + "loss": 0.8232, + "step": 15338 + }, + { + "epoch": 2.7309472934472936, + "grad_norm": 1.0038648843765259, + "learning_rate": 4.5803244481745275e-05, + "loss": 0.8363, + "step": 15339 + }, + { + "epoch": 2.7311253561253563, + "grad_norm": 0.7757763266563416, + "learning_rate": 4.579148154536117e-05, + "loss": 0.6935, + "step": 15340 + }, + { + "epoch": 2.7313034188034186, + "grad_norm": 0.8671833276748657, + "learning_rate": 4.5779719671080436e-05, + "loss": 0.8453, + "step": 15341 + }, + { + "epoch": 2.7314814814814814, + "grad_norm": 0.8507152795791626, + 
"learning_rate": 4.57679588591334e-05, + "loss": 0.8142, + "step": 15342 + }, + { + "epoch": 2.731659544159544, + "grad_norm": 0.8205499053001404, + "learning_rate": 4.575619910975062e-05, + "loss": 0.8442, + "step": 15343 + }, + { + "epoch": 2.731837606837607, + "grad_norm": 0.8809645771980286, + "learning_rate": 4.574444042316236e-05, + "loss": 1.024, + "step": 15344 + }, + { + "epoch": 2.7320156695156697, + "grad_norm": 0.825038492679596, + "learning_rate": 4.573268279959912e-05, + "loss": 0.9089, + "step": 15345 + }, + { + "epoch": 2.732193732193732, + "grad_norm": 0.7646815776824951, + "learning_rate": 4.572092623929124e-05, + "loss": 0.9239, + "step": 15346 + }, + { + "epoch": 2.7323717948717947, + "grad_norm": 0.8372252583503723, + "learning_rate": 4.570917074246905e-05, + "loss": 0.8687, + "step": 15347 + }, + { + "epoch": 2.7325498575498575, + "grad_norm": 0.9108861088752747, + "learning_rate": 4.5697416309362885e-05, + "loss": 0.9451, + "step": 15348 + }, + { + "epoch": 2.73272792022792, + "grad_norm": 0.8537670969963074, + "learning_rate": 4.568566294020303e-05, + "loss": 0.7437, + "step": 15349 + }, + { + "epoch": 2.732905982905983, + "grad_norm": 0.8660921454429626, + "learning_rate": 4.5673910635219766e-05, + "loss": 1.0617, + "step": 15350 + }, + { + "epoch": 2.7330840455840457, + "grad_norm": 0.8499423265457153, + "learning_rate": 4.5662159394643424e-05, + "loss": 0.9355, + "step": 15351 + }, + { + "epoch": 2.7332621082621085, + "grad_norm": 0.7776598930358887, + "learning_rate": 4.565040921870413e-05, + "loss": 0.8132, + "step": 15352 + }, + { + "epoch": 2.7334401709401708, + "grad_norm": 0.9197307229042053, + "learning_rate": 4.563866010763219e-05, + "loss": 0.8648, + "step": 15353 + }, + { + "epoch": 2.7336182336182335, + "grad_norm": 0.8543015122413635, + "learning_rate": 4.5626912061657786e-05, + "loss": 0.8453, + "step": 15354 + }, + { + "epoch": 2.7337962962962963, + "grad_norm": 0.9448479413986206, + "learning_rate": 4.561516508101109e-05, + 
"loss": 0.9515, + "step": 15355 + }, + { + "epoch": 2.733974358974359, + "grad_norm": 0.7448729276657104, + "learning_rate": 4.5603419165922265e-05, + "loss": 0.64, + "step": 15356 + }, + { + "epoch": 2.734152421652422, + "grad_norm": 0.8229237198829651, + "learning_rate": 4.5591674316621405e-05, + "loss": 0.7936, + "step": 15357 + }, + { + "epoch": 2.734330484330484, + "grad_norm": 0.8518769145011902, + "learning_rate": 4.557993053333873e-05, + "loss": 1.1976, + "step": 15358 + }, + { + "epoch": 2.734508547008547, + "grad_norm": 0.8680224418640137, + "learning_rate": 4.55681878163042e-05, + "loss": 0.8223, + "step": 15359 + }, + { + "epoch": 2.7346866096866096, + "grad_norm": 0.8199124336242676, + "learning_rate": 4.555644616574799e-05, + "loss": 0.634, + "step": 15360 + }, + { + "epoch": 2.7348646723646723, + "grad_norm": 0.8262977004051208, + "learning_rate": 4.554470558190013e-05, + "loss": 0.6373, + "step": 15361 + }, + { + "epoch": 2.735042735042735, + "grad_norm": 0.8114070296287537, + "learning_rate": 4.553296606499062e-05, + "loss": 0.6624, + "step": 15362 + }, + { + "epoch": 2.735220797720798, + "grad_norm": 0.9944671392440796, + "learning_rate": 4.552122761524952e-05, + "loss": 0.8246, + "step": 15363 + }, + { + "epoch": 2.7353988603988606, + "grad_norm": 0.8174465298652649, + "learning_rate": 4.550949023290678e-05, + "loss": 0.8431, + "step": 15364 + }, + { + "epoch": 2.7355769230769234, + "grad_norm": 0.8303970694541931, + "learning_rate": 4.5497753918192356e-05, + "loss": 0.417, + "step": 15365 + }, + { + "epoch": 2.7357549857549857, + "grad_norm": 0.8428391218185425, + "learning_rate": 4.548601867133629e-05, + "loss": 0.751, + "step": 15366 + }, + { + "epoch": 2.7359330484330484, + "grad_norm": 0.8769099116325378, + "learning_rate": 4.5474284492568384e-05, + "loss": 0.8984, + "step": 15367 + }, + { + "epoch": 2.736111111111111, + "grad_norm": 0.8389245867729187, + "learning_rate": 4.546255138211867e-05, + "loss": 0.8503, + "step": 15368 + }, + { + 
"epoch": 2.736289173789174, + "grad_norm": 0.8404824137687683, + "learning_rate": 4.5450819340216896e-05, + "loss": 0.836, + "step": 15369 + }, + { + "epoch": 2.736467236467236, + "grad_norm": 1.0007327795028687, + "learning_rate": 4.543908836709304e-05, + "loss": 0.9746, + "step": 15370 + }, + { + "epoch": 2.736645299145299, + "grad_norm": 0.7373863458633423, + "learning_rate": 4.542735846297691e-05, + "loss": 0.6752, + "step": 15371 + }, + { + "epoch": 2.7368233618233617, + "grad_norm": 0.8973239660263062, + "learning_rate": 4.541562962809829e-05, + "loss": 0.8988, + "step": 15372 + }, + { + "epoch": 2.7370014245014245, + "grad_norm": 0.8576705455780029, + "learning_rate": 4.5403901862687095e-05, + "loss": 0.9279, + "step": 15373 + }, + { + "epoch": 2.7371794871794872, + "grad_norm": 0.7462539076805115, + "learning_rate": 4.539217516697295e-05, + "loss": 0.7228, + "step": 15374 + }, + { + "epoch": 2.73735754985755, + "grad_norm": 0.8082219362258911, + "learning_rate": 4.538044954118573e-05, + "loss": 0.9562, + "step": 15375 + }, + { + "epoch": 2.7375356125356127, + "grad_norm": 0.7067760825157166, + "learning_rate": 4.5368724985555134e-05, + "loss": 0.5306, + "step": 15376 + }, + { + "epoch": 2.7377136752136755, + "grad_norm": 0.9178285598754883, + "learning_rate": 4.535700150031089e-05, + "loss": 0.8087, + "step": 15377 + }, + { + "epoch": 2.737891737891738, + "grad_norm": 0.803240180015564, + "learning_rate": 4.53452790856827e-05, + "loss": 0.7315, + "step": 15378 + }, + { + "epoch": 2.7380698005698005, + "grad_norm": 0.7200242877006531, + "learning_rate": 4.5333557741900226e-05, + "loss": 0.6329, + "step": 15379 + }, + { + "epoch": 2.7382478632478633, + "grad_norm": 0.8744874596595764, + "learning_rate": 4.5321837469193117e-05, + "loss": 0.8279, + "step": 15380 + }, + { + "epoch": 2.738425925925926, + "grad_norm": 0.7736984491348267, + "learning_rate": 4.531011826779103e-05, + "loss": 0.7267, + "step": 15381 + }, + { + "epoch": 2.7386039886039883, + 
"grad_norm": 0.893189549446106, + "learning_rate": 4.5298400137923527e-05, + "loss": 0.754, + "step": 15382 + }, + { + "epoch": 2.738782051282051, + "grad_norm": 0.7637171149253845, + "learning_rate": 4.5286683079820314e-05, + "loss": 0.6201, + "step": 15383 + }, + { + "epoch": 2.738960113960114, + "grad_norm": 0.8324360847473145, + "learning_rate": 4.527496709371082e-05, + "loss": 0.7617, + "step": 15384 + }, + { + "epoch": 2.7391381766381766, + "grad_norm": 0.7570679783821106, + "learning_rate": 4.52632521798247e-05, + "loss": 0.7275, + "step": 15385 + }, + { + "epoch": 2.7393162393162394, + "grad_norm": 0.7802938222885132, + "learning_rate": 4.525153833839144e-05, + "loss": 0.7423, + "step": 15386 + }, + { + "epoch": 2.739494301994302, + "grad_norm": 0.8645743727684021, + "learning_rate": 4.523982556964056e-05, + "loss": 0.8066, + "step": 15387 + }, + { + "epoch": 2.739672364672365, + "grad_norm": 0.8080064654350281, + "learning_rate": 4.522811387380155e-05, + "loss": 0.7418, + "step": 15388 + }, + { + "epoch": 2.7398504273504276, + "grad_norm": 0.925401508808136, + "learning_rate": 4.521640325110387e-05, + "loss": 0.9622, + "step": 15389 + }, + { + "epoch": 2.74002849002849, + "grad_norm": 0.8898165822029114, + "learning_rate": 4.520469370177696e-05, + "loss": 0.8543, + "step": 15390 + }, + { + "epoch": 2.7402065527065527, + "grad_norm": 0.8610122799873352, + "learning_rate": 4.519298522605021e-05, + "loss": 0.9502, + "step": 15391 + }, + { + "epoch": 2.7403846153846154, + "grad_norm": 0.8111294507980347, + "learning_rate": 4.51812778241531e-05, + "loss": 0.7526, + "step": 15392 + }, + { + "epoch": 2.740562678062678, + "grad_norm": 0.8465895056724548, + "learning_rate": 4.516957149631498e-05, + "loss": 0.9076, + "step": 15393 + }, + { + "epoch": 2.7407407407407405, + "grad_norm": 0.8541668057441711, + "learning_rate": 4.51578662427652e-05, + "loss": 0.8996, + "step": 15394 + }, + { + "epoch": 2.7409188034188032, + "grad_norm": 0.9113210439682007, + 
"learning_rate": 4.514616206373311e-05, + "loss": 0.9129, + "step": 15395 + }, + { + "epoch": 2.741096866096866, + "grad_norm": 0.7553523182868958, + "learning_rate": 4.513445895944802e-05, + "loss": 0.7265, + "step": 15396 + }, + { + "epoch": 2.7412749287749287, + "grad_norm": 0.8949921131134033, + "learning_rate": 4.5122756930139206e-05, + "loss": 0.9176, + "step": 15397 + }, + { + "epoch": 2.7414529914529915, + "grad_norm": 0.7957020401954651, + "learning_rate": 4.5111055976036044e-05, + "loss": 0.9384, + "step": 15398 + }, + { + "epoch": 2.7416310541310542, + "grad_norm": 0.759608805179596, + "learning_rate": 4.509935609736764e-05, + "loss": 0.7791, + "step": 15399 + }, + { + "epoch": 2.741809116809117, + "grad_norm": 0.927768886089325, + "learning_rate": 4.508765729436335e-05, + "loss": 0.9113, + "step": 15400 + }, + { + "epoch": 2.7419871794871797, + "grad_norm": 0.910513162612915, + "learning_rate": 4.5075959567252335e-05, + "loss": 0.9334, + "step": 15401 + }, + { + "epoch": 2.742165242165242, + "grad_norm": 0.9029644727706909, + "learning_rate": 4.5064262916263814e-05, + "loss": 0.9487, + "step": 15402 + }, + { + "epoch": 2.742343304843305, + "grad_norm": 0.8001708984375, + "learning_rate": 4.505256734162693e-05, + "loss": 0.8447, + "step": 15403 + }, + { + "epoch": 2.7425213675213675, + "grad_norm": 0.8000209927558899, + "learning_rate": 4.504087284357085e-05, + "loss": 0.6764, + "step": 15404 + }, + { + "epoch": 2.7426994301994303, + "grad_norm": 0.7368536591529846, + "learning_rate": 4.5029179422324686e-05, + "loss": 0.563, + "step": 15405 + }, + { + "epoch": 2.7428774928774926, + "grad_norm": 0.9035481214523315, + "learning_rate": 4.501748707811757e-05, + "loss": 0.8165, + "step": 15406 + }, + { + "epoch": 2.7430555555555554, + "grad_norm": 0.7985709309577942, + "learning_rate": 4.500579581117854e-05, + "loss": 1.0773, + "step": 15407 + }, + { + "epoch": 2.743233618233618, + "grad_norm": 0.7867546677589417, + "learning_rate": 4.499410562173678e-05, + 
"loss": 0.7416, + "step": 15408 + }, + { + "epoch": 2.743411680911681, + "grad_norm": 0.8863609433174133, + "learning_rate": 4.498241651002117e-05, + "loss": 0.8609, + "step": 15409 + }, + { + "epoch": 2.7435897435897436, + "grad_norm": 0.8197270631790161, + "learning_rate": 4.497072847626087e-05, + "loss": 1.0664, + "step": 15410 + }, + { + "epoch": 2.7437678062678064, + "grad_norm": 0.843718409538269, + "learning_rate": 4.495904152068483e-05, + "loss": 0.9831, + "step": 15411 + }, + { + "epoch": 2.743945868945869, + "grad_norm": 0.8311102986335754, + "learning_rate": 4.4947355643521985e-05, + "loss": 0.8035, + "step": 15412 + }, + { + "epoch": 2.744123931623932, + "grad_norm": 0.8396357297897339, + "learning_rate": 4.493567084500143e-05, + "loss": 1.0015, + "step": 15413 + }, + { + "epoch": 2.744301994301994, + "grad_norm": 0.7959007620811462, + "learning_rate": 4.492398712535194e-05, + "loss": 0.8414, + "step": 15414 + }, + { + "epoch": 2.744480056980057, + "grad_norm": 0.7720336318016052, + "learning_rate": 4.491230448480258e-05, + "loss": 0.8185, + "step": 15415 + }, + { + "epoch": 2.7446581196581197, + "grad_norm": 0.7999769449234009, + "learning_rate": 4.4900622923582115e-05, + "loss": 0.6807, + "step": 15416 + }, + { + "epoch": 2.7448361823361824, + "grad_norm": 0.9882165789604187, + "learning_rate": 4.488894244191951e-05, + "loss": 0.975, + "step": 15417 + }, + { + "epoch": 2.745014245014245, + "grad_norm": 0.8275474309921265, + "learning_rate": 4.48772630400436e-05, + "loss": 0.8459, + "step": 15418 + }, + { + "epoch": 2.7451923076923075, + "grad_norm": 0.8468943238258362, + "learning_rate": 4.486558471818322e-05, + "loss": 0.8217, + "step": 15419 + }, + { + "epoch": 2.7453703703703702, + "grad_norm": 0.8845008015632629, + "learning_rate": 4.485390747656717e-05, + "loss": 0.9811, + "step": 15420 + }, + { + "epoch": 2.745548433048433, + "grad_norm": 1.0010331869125366, + "learning_rate": 4.4842231315424255e-05, + "loss": 0.9437, + "step": 15421 + }, + { + 
"epoch": 2.7457264957264957, + "grad_norm": 0.7468565106391907, + "learning_rate": 4.483055623498319e-05, + "loss": 0.7205, + "step": 15422 + }, + { + "epoch": 2.7459045584045585, + "grad_norm": 0.9002050757408142, + "learning_rate": 4.4818882235472845e-05, + "loss": 0.8812, + "step": 15423 + }, + { + "epoch": 2.7460826210826212, + "grad_norm": 0.8684462904930115, + "learning_rate": 4.48072093171218e-05, + "loss": 0.7929, + "step": 15424 + }, + { + "epoch": 2.746260683760684, + "grad_norm": 0.8685877323150635, + "learning_rate": 4.479553748015891e-05, + "loss": 0.805, + "step": 15425 + }, + { + "epoch": 2.7464387464387463, + "grad_norm": 0.8292124271392822, + "learning_rate": 4.478386672481272e-05, + "loss": 0.9622, + "step": 15426 + }, + { + "epoch": 2.746616809116809, + "grad_norm": 0.8269517421722412, + "learning_rate": 4.477219705131199e-05, + "loss": 0.8011, + "step": 15427 + }, + { + "epoch": 2.746794871794872, + "grad_norm": 0.8913753628730774, + "learning_rate": 4.4760528459885334e-05, + "loss": 0.9794, + "step": 15428 + }, + { + "epoch": 2.7469729344729346, + "grad_norm": 0.8017858266830444, + "learning_rate": 4.474886095076137e-05, + "loss": 0.8593, + "step": 15429 + }, + { + "epoch": 2.7471509971509973, + "grad_norm": 1.0657325983047485, + "learning_rate": 4.47371945241687e-05, + "loss": 0.8043, + "step": 15430 + }, + { + "epoch": 2.7473290598290596, + "grad_norm": 0.8358477354049683, + "learning_rate": 4.472552918033588e-05, + "loss": 0.818, + "step": 15431 + }, + { + "epoch": 2.7475071225071224, + "grad_norm": 1.0436886548995972, + "learning_rate": 4.4713864919491514e-05, + "loss": 0.9246, + "step": 15432 + }, + { + "epoch": 2.747685185185185, + "grad_norm": 0.9838647246360779, + "learning_rate": 4.470220174186413e-05, + "loss": 1.0345, + "step": 15433 + }, + { + "epoch": 2.747863247863248, + "grad_norm": 0.8583347201347351, + "learning_rate": 4.469053964768222e-05, + "loss": 0.6242, + "step": 15434 + }, + { + "epoch": 2.7480413105413106, + 
"grad_norm": 0.832467794418335, + "learning_rate": 4.4678878637174304e-05, + "loss": 0.7988, + "step": 15435 + }, + { + "epoch": 2.7482193732193734, + "grad_norm": 0.7854242324829102, + "learning_rate": 4.4667218710568825e-05, + "loss": 0.9386, + "step": 15436 + }, + { + "epoch": 2.748397435897436, + "grad_norm": 0.7748091816902161, + "learning_rate": 4.465555986809423e-05, + "loss": 0.7322, + "step": 15437 + }, + { + "epoch": 2.7485754985754984, + "grad_norm": 0.818305492401123, + "learning_rate": 4.464390210997904e-05, + "loss": 0.822, + "step": 15438 + }, + { + "epoch": 2.748753561253561, + "grad_norm": 0.8253993391990662, + "learning_rate": 4.463224543645151e-05, + "loss": 0.7631, + "step": 15439 + }, + { + "epoch": 2.748931623931624, + "grad_norm": 0.804768443107605, + "learning_rate": 4.46205898477402e-05, + "loss": 0.8783, + "step": 15440 + }, + { + "epoch": 2.7491096866096867, + "grad_norm": 0.8612813949584961, + "learning_rate": 4.460893534407332e-05, + "loss": 0.944, + "step": 15441 + }, + { + "epoch": 2.7492877492877494, + "grad_norm": 0.8149600625038147, + "learning_rate": 4.459728192567932e-05, + "loss": 0.8592, + "step": 15442 + }, + { + "epoch": 2.7494658119658117, + "grad_norm": 0.996081531047821, + "learning_rate": 4.4585629592786496e-05, + "loss": 0.8648, + "step": 15443 + }, + { + "epoch": 2.7496438746438745, + "grad_norm": 0.8563137650489807, + "learning_rate": 4.457397834562314e-05, + "loss": 0.6645, + "step": 15444 + }, + { + "epoch": 2.7496438746438745, + "eval_loss": 1.1326396465301514, + "eval_runtime": 24.3757, + "eval_samples_per_second": 42.707, + "eval_steps_per_second": 21.374, + "step": 15444 + }, + { + "epoch": 2.7498219373219372, + "grad_norm": 0.7415599226951599, + "learning_rate": 4.4562328184417547e-05, + "loss": 0.7481, + "step": 15445 + }, + { + "epoch": 2.75, + "grad_norm": 0.8192741274833679, + "learning_rate": 4.455067910939796e-05, + "loss": 0.8367, + "step": 15446 + }, + { + "epoch": 2.7501780626780628, + "grad_norm": 
0.8514624834060669, + "learning_rate": 4.4539031120792604e-05, + "loss": 0.7107, + "step": 15447 + }, + { + "epoch": 2.7503561253561255, + "grad_norm": 0.8594211339950562, + "learning_rate": 4.4527384218829796e-05, + "loss": 0.9332, + "step": 15448 + }, + { + "epoch": 2.7505341880341883, + "grad_norm": 0.7828420996665955, + "learning_rate": 4.4515738403737585e-05, + "loss": 0.719, + "step": 15449 + }, + { + "epoch": 2.7507122507122506, + "grad_norm": 0.9195737242698669, + "learning_rate": 4.4504093675744285e-05, + "loss": 0.8581, + "step": 15450 + }, + { + "epoch": 2.7508903133903133, + "grad_norm": 0.8090249300003052, + "learning_rate": 4.449245003507793e-05, + "loss": 0.8012, + "step": 15451 + }, + { + "epoch": 2.751068376068376, + "grad_norm": 0.9179023504257202, + "learning_rate": 4.4480807481966736e-05, + "loss": 0.8968, + "step": 15452 + }, + { + "epoch": 2.751246438746439, + "grad_norm": 0.8140867352485657, + "learning_rate": 4.446916601663879e-05, + "loss": 0.791, + "step": 15453 + }, + { + "epoch": 2.7514245014245016, + "grad_norm": 0.8313645720481873, + "learning_rate": 4.445752563932214e-05, + "loss": 0.8658, + "step": 15454 + }, + { + "epoch": 2.751602564102564, + "grad_norm": 1.0235247611999512, + "learning_rate": 4.444588635024497e-05, + "loss": 0.9383, + "step": 15455 + }, + { + "epoch": 2.7517806267806266, + "grad_norm": 0.8270257711410522, + "learning_rate": 4.443424814963518e-05, + "loss": 0.7666, + "step": 15456 + }, + { + "epoch": 2.7519586894586894, + "grad_norm": 0.8711290955543518, + "learning_rate": 4.442261103772092e-05, + "loss": 1.0819, + "step": 15457 + }, + { + "epoch": 2.752136752136752, + "grad_norm": 0.7851848602294922, + "learning_rate": 4.441097501473013e-05, + "loss": 0.8788, + "step": 15458 + }, + { + "epoch": 2.752314814814815, + "grad_norm": 0.9703593850135803, + "learning_rate": 4.4399340080890816e-05, + "loss": 0.8661, + "step": 15459 + }, + { + "epoch": 2.7524928774928776, + "grad_norm": 0.8575040102005005, + 
"learning_rate": 4.438770623643093e-05, + "loss": 0.8318, + "step": 15460 + }, + { + "epoch": 2.7526709401709404, + "grad_norm": 0.9393935799598694, + "learning_rate": 4.43760734815784e-05, + "loss": 0.8899, + "step": 15461 + }, + { + "epoch": 2.7528490028490027, + "grad_norm": 0.9310712814331055, + "learning_rate": 4.4364441816561185e-05, + "loss": 0.9519, + "step": 15462 + }, + { + "epoch": 2.7530270655270654, + "grad_norm": 0.8066901564598083, + "learning_rate": 4.435281124160715e-05, + "loss": 0.8102, + "step": 15463 + }, + { + "epoch": 2.753205128205128, + "grad_norm": 0.8681934475898743, + "learning_rate": 4.434118175694415e-05, + "loss": 0.7745, + "step": 15464 + }, + { + "epoch": 2.753383190883191, + "grad_norm": 0.7921330332756042, + "learning_rate": 4.432955336280014e-05, + "loss": 0.8396, + "step": 15465 + }, + { + "epoch": 2.7535612535612537, + "grad_norm": 0.8818981051445007, + "learning_rate": 4.4317926059402816e-05, + "loss": 0.9268, + "step": 15466 + }, + { + "epoch": 2.753739316239316, + "grad_norm": 0.8018338084220886, + "learning_rate": 4.4306299846980096e-05, + "loss": 0.9018, + "step": 15467 + }, + { + "epoch": 2.7539173789173788, + "grad_norm": 0.8704143762588501, + "learning_rate": 4.4294674725759734e-05, + "loss": 0.9517, + "step": 15468 + }, + { + "epoch": 2.7540954415954415, + "grad_norm": 1.1460380554199219, + "learning_rate": 4.4283050695969506e-05, + "loss": 0.7642, + "step": 15469 + }, + { + "epoch": 2.7542735042735043, + "grad_norm": 0.8134510517120361, + "learning_rate": 4.427142775783716e-05, + "loss": 1.0405, + "step": 15470 + }, + { + "epoch": 2.754451566951567, + "grad_norm": 0.8054876327514648, + "learning_rate": 4.425980591159038e-05, + "loss": 0.8929, + "step": 15471 + }, + { + "epoch": 2.7546296296296298, + "grad_norm": 0.8607433438301086, + "learning_rate": 4.4248185157456953e-05, + "loss": 0.9187, + "step": 15472 + }, + { + "epoch": 2.7548076923076925, + "grad_norm": 0.7448357939720154, + "learning_rate": 
4.423656549566453e-05, + "loss": 0.7298, + "step": 15473 + }, + { + "epoch": 2.754985754985755, + "grad_norm": 0.9228075742721558, + "learning_rate": 4.422494692644076e-05, + "loss": 0.8704, + "step": 15474 + }, + { + "epoch": 2.7551638176638176, + "grad_norm": 0.8781694173812866, + "learning_rate": 4.421332945001329e-05, + "loss": 0.9555, + "step": 15475 + }, + { + "epoch": 2.7553418803418803, + "grad_norm": 0.8632338643074036, + "learning_rate": 4.420171306660975e-05, + "loss": 0.7322, + "step": 15476 + }, + { + "epoch": 2.755519943019943, + "grad_norm": 0.9808199405670166, + "learning_rate": 4.4190097776457716e-05, + "loss": 0.619, + "step": 15477 + }, + { + "epoch": 2.755698005698006, + "grad_norm": 1.025109052658081, + "learning_rate": 4.41784835797848e-05, + "loss": 1.0617, + "step": 15478 + }, + { + "epoch": 2.755876068376068, + "grad_norm": 0.8132767677307129, + "learning_rate": 4.416687047681849e-05, + "loss": 1.0045, + "step": 15479 + }, + { + "epoch": 2.756054131054131, + "grad_norm": 0.9630453586578369, + "learning_rate": 4.415525846778645e-05, + "loss": 0.9576, + "step": 15480 + }, + { + "epoch": 2.7562321937321936, + "grad_norm": 0.9891922473907471, + "learning_rate": 4.4143647552916034e-05, + "loss": 0.9333, + "step": 15481 + }, + { + "epoch": 2.7564102564102564, + "grad_norm": 0.7974509000778198, + "learning_rate": 4.413203773243486e-05, + "loss": 0.7809, + "step": 15482 + }, + { + "epoch": 2.756588319088319, + "grad_norm": 0.8173473477363586, + "learning_rate": 4.412042900657034e-05, + "loss": 1.1023, + "step": 15483 + }, + { + "epoch": 2.756766381766382, + "grad_norm": 0.8502877950668335, + "learning_rate": 4.410882137554994e-05, + "loss": 0.8705, + "step": 15484 + }, + { + "epoch": 2.7569444444444446, + "grad_norm": 0.8519158959388733, + "learning_rate": 4.4097214839601074e-05, + "loss": 0.8901, + "step": 15485 + }, + { + "epoch": 2.7571225071225074, + "grad_norm": 0.7851125001907349, + "learning_rate": 4.4085609398951164e-05, + "loss": 1.0612, + 
"step": 15486 + }, + { + "epoch": 2.7573005698005697, + "grad_norm": 0.9585029482841492, + "learning_rate": 4.407400505382758e-05, + "loss": 0.9229, + "step": 15487 + }, + { + "epoch": 2.7574786324786325, + "grad_norm": 0.775071918964386, + "learning_rate": 4.4062401804457686e-05, + "loss": 0.8246, + "step": 15488 + }, + { + "epoch": 2.757656695156695, + "grad_norm": 0.8049488067626953, + "learning_rate": 4.405079965106881e-05, + "loss": 0.9681, + "step": 15489 + }, + { + "epoch": 2.757834757834758, + "grad_norm": 0.9452522993087769, + "learning_rate": 4.4039198593888306e-05, + "loss": 0.7288, + "step": 15490 + }, + { + "epoch": 2.7580128205128203, + "grad_norm": 0.8296085596084595, + "learning_rate": 4.402759863314346e-05, + "loss": 0.8053, + "step": 15491 + }, + { + "epoch": 2.758190883190883, + "grad_norm": 0.8086248636245728, + "learning_rate": 4.4015999769061556e-05, + "loss": 0.8692, + "step": 15492 + }, + { + "epoch": 2.7583689458689458, + "grad_norm": 0.8784860372543335, + "learning_rate": 4.4004402001869836e-05, + "loss": 1.0503, + "step": 15493 + }, + { + "epoch": 2.7585470085470085, + "grad_norm": 0.82901930809021, + "learning_rate": 4.399280533179551e-05, + "loss": 0.8479, + "step": 15494 + }, + { + "epoch": 2.7587250712250713, + "grad_norm": 0.7654509544372559, + "learning_rate": 4.3981209759065875e-05, + "loss": 0.743, + "step": 15495 + }, + { + "epoch": 2.758903133903134, + "grad_norm": 0.8240879774093628, + "learning_rate": 4.3969615283908e-05, + "loss": 0.8303, + "step": 15496 + }, + { + "epoch": 2.7590811965811968, + "grad_norm": 0.9411282539367676, + "learning_rate": 4.3958021906549195e-05, + "loss": 0.8217, + "step": 15497 + }, + { + "epoch": 2.7592592592592595, + "grad_norm": 0.8222329616546631, + "learning_rate": 4.394642962721647e-05, + "loss": 0.9596, + "step": 15498 + }, + { + "epoch": 2.759437321937322, + "grad_norm": 0.8462044596672058, + "learning_rate": 4.393483844613704e-05, + "loss": 0.8029, + "step": 15499 + }, + { + "epoch": 
2.7596153846153846, + "grad_norm": 1.0385619401931763, + "learning_rate": 4.392324836353798e-05, + "loss": 1.0352, + "step": 15500 + }, + { + "epoch": 2.7597934472934473, + "grad_norm": 0.9049911499023438, + "learning_rate": 4.3911659379646384e-05, + "loss": 1.0761, + "step": 15501 + }, + { + "epoch": 2.75997150997151, + "grad_norm": 0.8253830671310425, + "learning_rate": 4.390007149468932e-05, + "loss": 0.7693, + "step": 15502 + }, + { + "epoch": 2.7601495726495724, + "grad_norm": 0.7939008474349976, + "learning_rate": 4.388848470889381e-05, + "loss": 0.8847, + "step": 15503 + }, + { + "epoch": 2.760327635327635, + "grad_norm": 1.048941969871521, + "learning_rate": 4.387689902248684e-05, + "loss": 0.7012, + "step": 15504 + }, + { + "epoch": 2.760505698005698, + "grad_norm": 0.8834842443466187, + "learning_rate": 4.386531443569553e-05, + "loss": 0.9561, + "step": 15505 + }, + { + "epoch": 2.7606837606837606, + "grad_norm": 0.9147583842277527, + "learning_rate": 4.385373094874669e-05, + "loss": 0.9736, + "step": 15506 + }, + { + "epoch": 2.7608618233618234, + "grad_norm": 0.6820386648178101, + "learning_rate": 4.38421485618674e-05, + "loss": 0.4449, + "step": 15507 + }, + { + "epoch": 2.761039886039886, + "grad_norm": 0.9519942402839661, + "learning_rate": 4.383056727528455e-05, + "loss": 1.0385, + "step": 15508 + }, + { + "epoch": 2.761217948717949, + "grad_norm": 0.8701474070549011, + "learning_rate": 4.381898708922505e-05, + "loss": 0.9896, + "step": 15509 + }, + { + "epoch": 2.7613960113960117, + "grad_norm": 0.8756018877029419, + "learning_rate": 4.38074080039158e-05, + "loss": 0.9378, + "step": 15510 + }, + { + "epoch": 2.761574074074074, + "grad_norm": 0.8670514225959778, + "learning_rate": 4.379583001958362e-05, + "loss": 0.9175, + "step": 15511 + }, + { + "epoch": 2.7617521367521367, + "grad_norm": 0.8227131366729736, + "learning_rate": 4.378425313645547e-05, + "loss": 0.7864, + "step": 15512 + }, + { + "epoch": 2.7619301994301995, + "grad_norm": 
0.9121497273445129, + "learning_rate": 4.377267735475802e-05, + "loss": 0.877, + "step": 15513 + }, + { + "epoch": 2.762108262108262, + "grad_norm": 0.8347102999687195, + "learning_rate": 4.3761102674718205e-05, + "loss": 0.8223, + "step": 15514 + }, + { + "epoch": 2.7622863247863245, + "grad_norm": 0.8657951951026917, + "learning_rate": 4.374952909656275e-05, + "loss": 0.7117, + "step": 15515 + }, + { + "epoch": 2.7624643874643873, + "grad_norm": 0.8934728503227234, + "learning_rate": 4.3737956620518414e-05, + "loss": 0.84, + "step": 15516 + }, + { + "epoch": 2.76264245014245, + "grad_norm": 0.7592045068740845, + "learning_rate": 4.3726385246811964e-05, + "loss": 0.7856, + "step": 15517 + }, + { + "epoch": 2.7628205128205128, + "grad_norm": 0.8480674028396606, + "learning_rate": 4.371481497567008e-05, + "loss": 0.7805, + "step": 15518 + }, + { + "epoch": 2.7629985754985755, + "grad_norm": 1.0231767892837524, + "learning_rate": 4.3703245807319437e-05, + "loss": 1.1517, + "step": 15519 + }, + { + "epoch": 2.7631766381766383, + "grad_norm": 0.9852092862129211, + "learning_rate": 4.369167774198684e-05, + "loss": 0.8735, + "step": 15520 + }, + { + "epoch": 2.763354700854701, + "grad_norm": 0.8751610517501831, + "learning_rate": 4.368011077989875e-05, + "loss": 0.9975, + "step": 15521 + }, + { + "epoch": 2.763532763532764, + "grad_norm": 0.8397828340530396, + "learning_rate": 4.3668544921281976e-05, + "loss": 1.0675, + "step": 15522 + }, + { + "epoch": 2.763710826210826, + "grad_norm": 0.7040372490882874, + "learning_rate": 4.3656980166362974e-05, + "loss": 0.7123, + "step": 15523 + }, + { + "epoch": 2.763888888888889, + "grad_norm": 1.0610599517822266, + "learning_rate": 4.364541651536844e-05, + "loss": 1.0854, + "step": 15524 + }, + { + "epoch": 2.7640669515669516, + "grad_norm": 0.78865647315979, + "learning_rate": 4.363385396852491e-05, + "loss": 0.924, + "step": 15525 + }, + { + "epoch": 2.7642450142450143, + "grad_norm": 0.87164705991745, + "learning_rate": 
4.362229252605891e-05, + "loss": 0.9739, + "step": 15526 + }, + { + "epoch": 2.7644230769230766, + "grad_norm": 0.9362281560897827, + "learning_rate": 4.361073218819698e-05, + "loss": 0.7751, + "step": 15527 + }, + { + "epoch": 2.7646011396011394, + "grad_norm": 0.7944566011428833, + "learning_rate": 4.3599172955165605e-05, + "loss": 0.8913, + "step": 15528 + }, + { + "epoch": 2.764779202279202, + "grad_norm": 0.9346068501472473, + "learning_rate": 4.358761482719125e-05, + "loss": 0.8286, + "step": 15529 + }, + { + "epoch": 2.764957264957265, + "grad_norm": 0.8570913076400757, + "learning_rate": 4.3576057804500414e-05, + "loss": 1.0334, + "step": 15530 + }, + { + "epoch": 2.7651353276353277, + "grad_norm": 0.801908552646637, + "learning_rate": 4.356450188731953e-05, + "loss": 0.9021, + "step": 15531 + }, + { + "epoch": 2.7653133903133904, + "grad_norm": 0.848849892616272, + "learning_rate": 4.355294707587499e-05, + "loss": 0.9132, + "step": 15532 + }, + { + "epoch": 2.765491452991453, + "grad_norm": 0.7961751818656921, + "learning_rate": 4.35413933703932e-05, + "loss": 0.841, + "step": 15533 + }, + { + "epoch": 2.765669515669516, + "grad_norm": 0.8609708547592163, + "learning_rate": 4.352984077110052e-05, + "loss": 0.8176, + "step": 15534 + }, + { + "epoch": 2.765847578347578, + "grad_norm": 0.8779369592666626, + "learning_rate": 4.35182892782233e-05, + "loss": 0.8252, + "step": 15535 + }, + { + "epoch": 2.766025641025641, + "grad_norm": 0.7878577709197998, + "learning_rate": 4.3506738891987844e-05, + "loss": 0.7498, + "step": 15536 + }, + { + "epoch": 2.7662037037037037, + "grad_norm": 0.9531580805778503, + "learning_rate": 4.3495189612620557e-05, + "loss": 0.8438, + "step": 15537 + }, + { + "epoch": 2.7663817663817665, + "grad_norm": 0.7791294455528259, + "learning_rate": 4.3483641440347564e-05, + "loss": 0.9188, + "step": 15538 + }, + { + "epoch": 2.7665598290598292, + "grad_norm": 0.8683488965034485, + "learning_rate": 4.347209437539527e-05, + "loss": 1.0422, + 
"step": 15539 + }, + { + "epoch": 2.7667378917378915, + "grad_norm": 0.8904309272766113, + "learning_rate": 4.346054841798984e-05, + "loss": 0.8072, + "step": 15540 + }, + { + "epoch": 2.7669159544159543, + "grad_norm": 0.7409844398498535, + "learning_rate": 4.344900356835753e-05, + "loss": 0.7179, + "step": 15541 + }, + { + "epoch": 2.767094017094017, + "grad_norm": 0.9663724899291992, + "learning_rate": 4.343745982672451e-05, + "loss": 1.0568, + "step": 15542 + }, + { + "epoch": 2.76727207977208, + "grad_norm": 0.8481591939926147, + "learning_rate": 4.342591719331698e-05, + "loss": 0.8678, + "step": 15543 + }, + { + "epoch": 2.7674501424501425, + "grad_norm": 0.7301938533782959, + "learning_rate": 4.341437566836103e-05, + "loss": 0.6357, + "step": 15544 + }, + { + "epoch": 2.7676282051282053, + "grad_norm": 0.8628479242324829, + "learning_rate": 4.340283525208292e-05, + "loss": 0.9622, + "step": 15545 + }, + { + "epoch": 2.767806267806268, + "grad_norm": 0.953744113445282, + "learning_rate": 4.339129594470861e-05, + "loss": 0.683, + "step": 15546 + }, + { + "epoch": 2.7679843304843303, + "grad_norm": 0.7589353322982788, + "learning_rate": 4.3379757746464336e-05, + "loss": 0.8343, + "step": 15547 + }, + { + "epoch": 2.768162393162393, + "grad_norm": 0.8304651379585266, + "learning_rate": 4.336822065757601e-05, + "loss": 0.9084, + "step": 15548 + }, + { + "epoch": 2.768340455840456, + "grad_norm": 0.8092817068099976, + "learning_rate": 4.33566846782698e-05, + "loss": 0.8371, + "step": 15549 + }, + { + "epoch": 2.7685185185185186, + "grad_norm": 0.8983978033065796, + "learning_rate": 4.334514980877169e-05, + "loss": 0.9631, + "step": 15550 + }, + { + "epoch": 2.7686965811965814, + "grad_norm": 0.766621470451355, + "learning_rate": 4.3333616049307636e-05, + "loss": 0.768, + "step": 15551 + }, + { + "epoch": 2.7688746438746437, + "grad_norm": 0.8753345012664795, + "learning_rate": 4.332208340010374e-05, + "loss": 0.8854, + "step": 15552 + }, + { + "epoch": 
2.7690527065527064, + "grad_norm": 0.8396589756011963, + "learning_rate": 4.331055186138581e-05, + "loss": 0.8322, + "step": 15553 + }, + { + "epoch": 2.769230769230769, + "grad_norm": 0.8134872317314148, + "learning_rate": 4.3299021433379885e-05, + "loss": 0.945, + "step": 15554 + }, + { + "epoch": 2.769408831908832, + "grad_norm": 0.8712667226791382, + "learning_rate": 4.3287492116311854e-05, + "loss": 0.8487, + "step": 15555 + }, + { + "epoch": 2.7695868945868947, + "grad_norm": 0.8938018083572388, + "learning_rate": 4.32759639104076e-05, + "loss": 0.854, + "step": 15556 + }, + { + "epoch": 2.7697649572649574, + "grad_norm": 0.8213987946510315, + "learning_rate": 4.3264436815893005e-05, + "loss": 0.9055, + "step": 15557 + }, + { + "epoch": 2.76994301994302, + "grad_norm": 0.9587214589118958, + "learning_rate": 4.3252910832993906e-05, + "loss": 0.9664, + "step": 15558 + }, + { + "epoch": 2.7701210826210825, + "grad_norm": 0.8746159076690674, + "learning_rate": 4.3241385961936146e-05, + "loss": 0.8394, + "step": 15559 + }, + { + "epoch": 2.7702991452991452, + "grad_norm": 0.8395819067955017, + "learning_rate": 4.3229862202945517e-05, + "loss": 0.7991, + "step": 15560 + }, + { + "epoch": 2.770477207977208, + "grad_norm": 0.856765627861023, + "learning_rate": 4.321833955624777e-05, + "loss": 0.8106, + "step": 15561 + }, + { + "epoch": 2.7706552706552707, + "grad_norm": 0.8630124926567078, + "learning_rate": 4.3206818022068776e-05, + "loss": 0.7293, + "step": 15562 + }, + { + "epoch": 2.7708333333333335, + "grad_norm": 0.8196776509284973, + "learning_rate": 4.319529760063414e-05, + "loss": 0.8831, + "step": 15563 + }, + { + "epoch": 2.771011396011396, + "grad_norm": 0.8283860683441162, + "learning_rate": 4.3183778292169674e-05, + "loss": 0.9249, + "step": 15564 + }, + { + "epoch": 2.7711894586894585, + "grad_norm": 0.8983619809150696, + "learning_rate": 4.3172260096901054e-05, + "loss": 1.2334, + "step": 15565 + }, + { + "epoch": 2.7713675213675213, + "grad_norm": 
0.8437079191207886, + "learning_rate": 4.316074301505395e-05, + "loss": 0.771, + "step": 15566 + }, + { + "epoch": 2.771545584045584, + "grad_norm": 0.9565808773040771, + "learning_rate": 4.314922704685401e-05, + "loss": 0.8927, + "step": 15567 + }, + { + "epoch": 2.771723646723647, + "grad_norm": 0.7943497896194458, + "learning_rate": 4.313771219252687e-05, + "loss": 0.656, + "step": 15568 + }, + { + "epoch": 2.7719017094017095, + "grad_norm": 0.862404465675354, + "learning_rate": 4.3126198452298126e-05, + "loss": 0.8783, + "step": 15569 + }, + { + "epoch": 2.7720797720797723, + "grad_norm": 0.7928122878074646, + "learning_rate": 4.3114685826393365e-05, + "loss": 0.9799, + "step": 15570 + }, + { + "epoch": 2.7722578347578346, + "grad_norm": 0.8270733952522278, + "learning_rate": 4.3103174315038184e-05, + "loss": 0.7878, + "step": 15571 + }, + { + "epoch": 2.7724358974358974, + "grad_norm": 0.8223987817764282, + "learning_rate": 4.309166391845811e-05, + "loss": 0.7222, + "step": 15572 + }, + { + "epoch": 2.77261396011396, + "grad_norm": 0.8159852027893066, + "learning_rate": 4.3080154636878675e-05, + "loss": 0.8082, + "step": 15573 + }, + { + "epoch": 2.772792022792023, + "grad_norm": 0.882792055606842, + "learning_rate": 4.306864647052537e-05, + "loss": 1.0659, + "step": 15574 + }, + { + "epoch": 2.7729700854700856, + "grad_norm": 0.8734562993049622, + "learning_rate": 4.305713941962366e-05, + "loss": 0.9301, + "step": 15575 + }, + { + "epoch": 2.773148148148148, + "grad_norm": 0.8544983267784119, + "learning_rate": 4.304563348439898e-05, + "loss": 0.7442, + "step": 15576 + }, + { + "epoch": 2.7733262108262107, + "grad_norm": 0.9045799374580383, + "learning_rate": 4.303412866507689e-05, + "loss": 1.1023, + "step": 15577 + }, + { + "epoch": 2.7735042735042734, + "grad_norm": 0.8132993578910828, + "learning_rate": 4.3022624961882615e-05, + "loss": 0.7032, + "step": 15578 + }, + { + "epoch": 2.773682336182336, + "grad_norm": 0.7072446942329407, + "learning_rate": 
4.30111223750417e-05, + "loss": 0.7176, + "step": 15579 + }, + { + "epoch": 2.773860398860399, + "grad_norm": 0.8212466239929199, + "learning_rate": 4.299962090477945e-05, + "loss": 0.7664, + "step": 15580 + }, + { + "epoch": 2.7740384615384617, + "grad_norm": 0.7781338095664978, + "learning_rate": 4.298812055132122e-05, + "loss": 0.7439, + "step": 15581 + }, + { + "epoch": 2.7742165242165244, + "grad_norm": 0.9289973378181458, + "learning_rate": 4.297662131489234e-05, + "loss": 0.9504, + "step": 15582 + }, + { + "epoch": 2.7743945868945867, + "grad_norm": 0.8571373224258423, + "learning_rate": 4.2965123195718105e-05, + "loss": 0.9959, + "step": 15583 + }, + { + "epoch": 2.7745726495726495, + "grad_norm": 0.8670883774757385, + "learning_rate": 4.29536261940238e-05, + "loss": 0.8207, + "step": 15584 + }, + { + "epoch": 2.7747507122507122, + "grad_norm": 0.8684807419776917, + "learning_rate": 4.294213031003469e-05, + "loss": 0.7508, + "step": 15585 + }, + { + "epoch": 2.774928774928775, + "grad_norm": 0.7746252417564392, + "learning_rate": 4.293063554397597e-05, + "loss": 0.909, + "step": 15586 + }, + { + "epoch": 2.7751068376068377, + "grad_norm": 0.8363521099090576, + "learning_rate": 4.291914189607297e-05, + "loss": 0.8564, + "step": 15587 + }, + { + "epoch": 2.7752849002849, + "grad_norm": 0.8843217492103577, + "learning_rate": 4.2907649366550726e-05, + "loss": 0.8187, + "step": 15588 + }, + { + "epoch": 2.775462962962963, + "grad_norm": 0.9330897331237793, + "learning_rate": 4.2896157955634545e-05, + "loss": 0.8179, + "step": 15589 + }, + { + "epoch": 2.7756410256410255, + "grad_norm": 0.7950356602668762, + "learning_rate": 4.288466766354953e-05, + "loss": 0.7091, + "step": 15590 + }, + { + "epoch": 2.7758190883190883, + "grad_norm": 0.9085933566093445, + "learning_rate": 4.287317849052075e-05, + "loss": 0.8015, + "step": 15591 + }, + { + "epoch": 2.775997150997151, + "grad_norm": 0.9285191893577576, + "learning_rate": 4.286169043677345e-05, + "loss": 0.9967, + 
"step": 15592 + }, + { + "epoch": 2.776175213675214, + "grad_norm": 0.8202041387557983, + "learning_rate": 4.285020350253256e-05, + "loss": 0.9286, + "step": 15593 + }, + { + "epoch": 2.7763532763532766, + "grad_norm": 1.0619434118270874, + "learning_rate": 4.283871768802328e-05, + "loss": 0.7863, + "step": 15594 + }, + { + "epoch": 2.7765313390313393, + "grad_norm": 0.8250051140785217, + "learning_rate": 4.282723299347052e-05, + "loss": 0.8531, + "step": 15595 + }, + { + "epoch": 2.7767094017094016, + "grad_norm": 0.8794218897819519, + "learning_rate": 4.281574941909939e-05, + "loss": 0.906, + "step": 15596 + }, + { + "epoch": 2.7768874643874644, + "grad_norm": 0.7725922465324402, + "learning_rate": 4.2804266965134866e-05, + "loss": 0.9084, + "step": 15597 + }, + { + "epoch": 2.777065527065527, + "grad_norm": 0.7845144867897034, + "learning_rate": 4.279278563180192e-05, + "loss": 0.9768, + "step": 15598 + }, + { + "epoch": 2.77724358974359, + "grad_norm": 0.9395498633384705, + "learning_rate": 4.27813054193255e-05, + "loss": 0.9055, + "step": 15599 + }, + { + "epoch": 2.777421652421652, + "grad_norm": 0.8043427467346191, + "learning_rate": 4.276982632793054e-05, + "loss": 0.8244, + "step": 15600 + }, + { + "epoch": 2.777599715099715, + "grad_norm": 0.7874096632003784, + "learning_rate": 4.27583483578419e-05, + "loss": 0.8861, + "step": 15601 + }, + { + "epoch": 2.7777777777777777, + "grad_norm": 0.8874611258506775, + "learning_rate": 4.27468715092846e-05, + "loss": 1.0457, + "step": 15602 + }, + { + "epoch": 2.7779558404558404, + "grad_norm": 1.0025757551193237, + "learning_rate": 4.273539578248334e-05, + "loss": 1.1114, + "step": 15603 + }, + { + "epoch": 2.778133903133903, + "grad_norm": 0.9982876777648926, + "learning_rate": 4.272392117766313e-05, + "loss": 0.9142, + "step": 15604 + }, + { + "epoch": 2.778311965811966, + "grad_norm": 0.8762221932411194, + "learning_rate": 4.2712447695048616e-05, + "loss": 1.0114, + "step": 15605 + }, + { + "epoch": 
2.7784900284900287, + "grad_norm": 0.9136927723884583, + "learning_rate": 4.2700975334864726e-05, + "loss": 0.8224, + "step": 15606 + }, + { + "epoch": 2.7786680911680914, + "grad_norm": 0.8845604062080383, + "learning_rate": 4.2689504097336184e-05, + "loss": 0.8135, + "step": 15607 + }, + { + "epoch": 2.7788461538461537, + "grad_norm": 0.8584510087966919, + "learning_rate": 4.267803398268777e-05, + "loss": 0.8928, + "step": 15608 + }, + { + "epoch": 2.7790242165242165, + "grad_norm": 0.711402177810669, + "learning_rate": 4.266656499114421e-05, + "loss": 0.792, + "step": 15609 + }, + { + "epoch": 2.7792022792022792, + "grad_norm": 0.9480760097503662, + "learning_rate": 4.2655097122930165e-05, + "loss": 0.8707, + "step": 15610 + }, + { + "epoch": 2.779380341880342, + "grad_norm": 0.863855242729187, + "learning_rate": 4.264363037827041e-05, + "loss": 1.0114, + "step": 15611 + }, + { + "epoch": 2.7795584045584043, + "grad_norm": 0.8010865449905396, + "learning_rate": 4.2632164757389556e-05, + "loss": 0.9221, + "step": 15612 + }, + { + "epoch": 2.779736467236467, + "grad_norm": 0.7950930595397949, + "learning_rate": 4.262070026051227e-05, + "loss": 0.6951, + "step": 15613 + }, + { + "epoch": 2.77991452991453, + "grad_norm": 0.8252870440483093, + "learning_rate": 4.260923688786317e-05, + "loss": 0.6976, + "step": 15614 + }, + { + "epoch": 2.7800925925925926, + "grad_norm": 0.7855920791625977, + "learning_rate": 4.259777463966686e-05, + "loss": 0.8277, + "step": 15615 + }, + { + "epoch": 2.7802706552706553, + "grad_norm": 0.8783130645751953, + "learning_rate": 4.258631351614786e-05, + "loss": 0.6995, + "step": 15616 + }, + { + "epoch": 2.780448717948718, + "grad_norm": 0.8904485106468201, + "learning_rate": 4.257485351753085e-05, + "loss": 0.8226, + "step": 15617 + }, + { + "epoch": 2.780626780626781, + "grad_norm": 0.8761011958122253, + "learning_rate": 4.2563394644040244e-05, + "loss": 0.9187, + "step": 15618 + }, + { + "epoch": 2.7808048433048436, + "grad_norm": 
0.897404670715332, + "learning_rate": 4.255193689590067e-05, + "loss": 1.0234, + "step": 15619 + }, + { + "epoch": 2.780982905982906, + "grad_norm": 0.8966960906982422, + "learning_rate": 4.254048027333648e-05, + "loss": 0.9, + "step": 15620 + }, + { + "epoch": 2.7811609686609686, + "grad_norm": 0.7506237030029297, + "learning_rate": 4.2529024776572245e-05, + "loss": 0.8939, + "step": 15621 + }, + { + "epoch": 2.7813390313390314, + "grad_norm": 0.8073886036872864, + "learning_rate": 4.2517570405832396e-05, + "loss": 0.7779, + "step": 15622 + }, + { + "epoch": 2.781517094017094, + "grad_norm": 0.7928911447525024, + "learning_rate": 4.250611716134134e-05, + "loss": 0.8278, + "step": 15623 + }, + { + "epoch": 2.7816951566951564, + "grad_norm": 0.7301982045173645, + "learning_rate": 4.249466504332349e-05, + "loss": 0.7515, + "step": 15624 + }, + { + "epoch": 2.781873219373219, + "grad_norm": 0.8215289115905762, + "learning_rate": 4.248321405200322e-05, + "loss": 0.9752, + "step": 15625 + }, + { + "epoch": 2.782051282051282, + "grad_norm": 0.8281431198120117, + "learning_rate": 4.247176418760486e-05, + "loss": 0.9625, + "step": 15626 + }, + { + "epoch": 2.7822293447293447, + "grad_norm": 0.9202759265899658, + "learning_rate": 4.246031545035283e-05, + "loss": 0.8757, + "step": 15627 + }, + { + "epoch": 2.7824074074074074, + "grad_norm": 0.8628471493721008, + "learning_rate": 4.244886784047133e-05, + "loss": 0.7626, + "step": 15628 + }, + { + "epoch": 2.78258547008547, + "grad_norm": 0.9345491528511047, + "learning_rate": 4.2437421358184747e-05, + "loss": 0.8714, + "step": 15629 + }, + { + "epoch": 2.782763532763533, + "grad_norm": 0.893713116645813, + "learning_rate": 4.2425976003717314e-05, + "loss": 0.9953, + "step": 15630 + }, + { + "epoch": 2.7829415954415957, + "grad_norm": 0.8794371485710144, + "learning_rate": 4.2414531777293286e-05, + "loss": 0.7899, + "step": 15631 + }, + { + "epoch": 2.783119658119658, + "grad_norm": 0.9003345370292664, + "learning_rate": 
4.240308867913688e-05, + "loss": 0.9607, + "step": 15632 + }, + { + "epoch": 2.7832977207977208, + "grad_norm": 0.8352270126342773, + "learning_rate": 4.239164670947228e-05, + "loss": 1.0315, + "step": 15633 + }, + { + "epoch": 2.7834757834757835, + "grad_norm": 0.825252890586853, + "learning_rate": 4.238020586852375e-05, + "loss": 1.0493, + "step": 15634 + }, + { + "epoch": 2.7836538461538463, + "grad_norm": 1.1185758113861084, + "learning_rate": 4.2368766156515324e-05, + "loss": 0.8962, + "step": 15635 + }, + { + "epoch": 2.7838319088319086, + "grad_norm": 0.840336799621582, + "learning_rate": 4.235732757367125e-05, + "loss": 0.8289, + "step": 15636 + }, + { + "epoch": 2.7840099715099713, + "grad_norm": 0.9251887202262878, + "learning_rate": 4.2345890120215595e-05, + "loss": 0.9306, + "step": 15637 + }, + { + "epoch": 2.784188034188034, + "grad_norm": 0.9645969867706299, + "learning_rate": 4.233445379637244e-05, + "loss": 0.8453, + "step": 15638 + }, + { + "epoch": 2.784366096866097, + "grad_norm": 0.9010009765625, + "learning_rate": 4.232301860236589e-05, + "loss": 0.9796, + "step": 15639 + }, + { + "epoch": 2.7845441595441596, + "grad_norm": 0.920427143573761, + "learning_rate": 4.231158453841998e-05, + "loss": 0.7905, + "step": 15640 + }, + { + "epoch": 2.7847222222222223, + "grad_norm": 0.8292316794395447, + "learning_rate": 4.2300151604758734e-05, + "loss": 0.938, + "step": 15641 + }, + { + "epoch": 2.784900284900285, + "grad_norm": 0.8550885319709778, + "learning_rate": 4.228871980160615e-05, + "loss": 0.7728, + "step": 15642 + }, + { + "epoch": 2.785078347578348, + "grad_norm": 0.8785567283630371, + "learning_rate": 4.227728912918617e-05, + "loss": 1.0367, + "step": 15643 + }, + { + "epoch": 2.78525641025641, + "grad_norm": 0.8732814788818359, + "learning_rate": 4.226585958772289e-05, + "loss": 0.9914, + "step": 15644 + }, + { + "epoch": 2.785434472934473, + "grad_norm": 0.9473167061805725, + "learning_rate": 4.225443117744008e-05, + "loss": 1.1311, + 
"step": 15645 + }, + { + "epoch": 2.7856125356125356, + "grad_norm": 0.8819913864135742, + "learning_rate": 4.224300389856177e-05, + "loss": 0.8798, + "step": 15646 + }, + { + "epoch": 2.7857905982905984, + "grad_norm": 0.835367739200592, + "learning_rate": 4.223157775131182e-05, + "loss": 0.6977, + "step": 15647 + }, + { + "epoch": 2.7859686609686607, + "grad_norm": 0.8122659921646118, + "learning_rate": 4.222015273591411e-05, + "loss": 0.9656, + "step": 15648 + }, + { + "epoch": 2.7861467236467234, + "grad_norm": 0.8085313439369202, + "learning_rate": 4.220872885259247e-05, + "loss": 0.7456, + "step": 15649 + }, + { + "epoch": 2.786324786324786, + "grad_norm": 0.681515097618103, + "learning_rate": 4.21973061015707e-05, + "loss": 0.5008, + "step": 15650 + }, + { + "epoch": 2.786502849002849, + "grad_norm": 0.8021831512451172, + "learning_rate": 4.2185884483072676e-05, + "loss": 0.8954, + "step": 15651 + }, + { + "epoch": 2.7866809116809117, + "grad_norm": 0.9254723787307739, + "learning_rate": 4.217446399732216e-05, + "loss": 0.7855, + "step": 15652 + }, + { + "epoch": 2.7868589743589745, + "grad_norm": 0.8415037989616394, + "learning_rate": 4.2163044644542894e-05, + "loss": 0.8835, + "step": 15653 + }, + { + "epoch": 2.787037037037037, + "grad_norm": 0.9031959772109985, + "learning_rate": 4.2151626424958614e-05, + "loss": 1.0048, + "step": 15654 + }, + { + "epoch": 2.7872150997151, + "grad_norm": Infinity, + "learning_rate": 4.2151626424958614e-05, + "loss": 0.9344, + "step": 15655 + }, + { + "epoch": 2.7873931623931623, + "grad_norm": 0.815680742263794, + "learning_rate": 4.214020933879306e-05, + "loss": 0.6897, + "step": 15656 + }, + { + "epoch": 2.787571225071225, + "grad_norm": 0.9080044627189636, + "learning_rate": 4.212879338626989e-05, + "loss": 1.0366, + "step": 15657 + }, + { + "epoch": 2.7877492877492878, + "grad_norm": 0.8387414813041687, + "learning_rate": 4.211737856761281e-05, + "loss": 0.9255, + "step": 15658 + }, + { + "epoch": 2.7879273504273505, 
+ "grad_norm": 0.9269571304321289, + "learning_rate": 4.210596488304542e-05, + "loss": 0.8971, + "step": 15659 + }, + { + "epoch": 2.7881054131054133, + "grad_norm": 0.7987017035484314, + "learning_rate": 4.2094552332791456e-05, + "loss": 0.8293, + "step": 15660 + }, + { + "epoch": 2.7882834757834756, + "grad_norm": 0.8481683731079102, + "learning_rate": 4.208314091707437e-05, + "loss": 0.9159, + "step": 15661 + }, + { + "epoch": 2.7884615384615383, + "grad_norm": 0.944736897945404, + "learning_rate": 4.207173063611788e-05, + "loss": 0.9398, + "step": 15662 + }, + { + "epoch": 2.788639601139601, + "grad_norm": 0.8471882343292236, + "learning_rate": 4.206032149014547e-05, + "loss": 0.7534, + "step": 15663 + }, + { + "epoch": 2.788817663817664, + "grad_norm": 0.8500807881355286, + "learning_rate": 4.2048913479380714e-05, + "loss": 0.8874, + "step": 15664 + }, + { + "epoch": 2.7889957264957266, + "grad_norm": 0.7949451804161072, + "learning_rate": 4.2037506604047115e-05, + "loss": 0.8691, + "step": 15665 + }, + { + "epoch": 2.7891737891737893, + "grad_norm": 0.8587945103645325, + "learning_rate": 4.202610086436817e-05, + "loss": 0.8288, + "step": 15666 + }, + { + "epoch": 2.789351851851852, + "grad_norm": 0.9155020117759705, + "learning_rate": 4.201469626056734e-05, + "loss": 0.8384, + "step": 15667 + }, + { + "epoch": 2.7895299145299144, + "grad_norm": 0.9402222037315369, + "learning_rate": 4.200329279286809e-05, + "loss": 0.8239, + "step": 15668 + }, + { + "epoch": 2.789707977207977, + "grad_norm": 0.9111437201499939, + "learning_rate": 4.19918904614938e-05, + "loss": 0.9251, + "step": 15669 + }, + { + "epoch": 2.78988603988604, + "grad_norm": 0.9434856176376343, + "learning_rate": 4.198048926666795e-05, + "loss": 1.0517, + "step": 15670 + }, + { + "epoch": 2.7900641025641026, + "grad_norm": 0.9518313407897949, + "learning_rate": 4.1969089208613896e-05, + "loss": 0.8893, + "step": 15671 + }, + { + "epoch": 2.7902421652421654, + "grad_norm": 0.8107752799987793, + 
"learning_rate": 4.1957690287554986e-05, + "loss": 0.6548, + "step": 15672 + }, + { + "epoch": 2.7904202279202277, + "grad_norm": 0.8361678719520569, + "learning_rate": 4.1946292503714556e-05, + "loss": 0.9224, + "step": 15673 + }, + { + "epoch": 2.7905982905982905, + "grad_norm": 0.7812657952308655, + "learning_rate": 4.1934895857315904e-05, + "loss": 0.7126, + "step": 15674 + }, + { + "epoch": 2.790776353276353, + "grad_norm": 0.9054265022277832, + "learning_rate": 4.192350034858241e-05, + "loss": 0.6891, + "step": 15675 + }, + { + "epoch": 2.790954415954416, + "grad_norm": 0.9675585627555847, + "learning_rate": 4.1912105977737214e-05, + "loss": 0.8429, + "step": 15676 + }, + { + "epoch": 2.7911324786324787, + "grad_norm": 0.9077114462852478, + "learning_rate": 4.19007127450037e-05, + "loss": 0.8864, + "step": 15677 + }, + { + "epoch": 2.7913105413105415, + "grad_norm": 0.9230541586875916, + "learning_rate": 4.188932065060497e-05, + "loss": 1.0065, + "step": 15678 + }, + { + "epoch": 2.791488603988604, + "grad_norm": 0.8667981028556824, + "learning_rate": 4.1877929694764315e-05, + "loss": 0.7584, + "step": 15679 + }, + { + "epoch": 2.7916666666666665, + "grad_norm": 0.8986212015151978, + "learning_rate": 4.1866539877704894e-05, + "loss": 0.9205, + "step": 15680 + }, + { + "epoch": 2.7918447293447293, + "grad_norm": 0.8524685502052307, + "learning_rate": 4.185515119964986e-05, + "loss": 0.8516, + "step": 15681 + }, + { + "epoch": 2.792022792022792, + "grad_norm": 0.8247089385986328, + "learning_rate": 4.184376366082234e-05, + "loss": 0.8733, + "step": 15682 + }, + { + "epoch": 2.7922008547008548, + "grad_norm": 0.8236528635025024, + "learning_rate": 4.183237726144549e-05, + "loss": 0.8715, + "step": 15683 + }, + { + "epoch": 2.7923789173789175, + "grad_norm": 0.8853272199630737, + "learning_rate": 4.182099200174232e-05, + "loss": 0.7741, + "step": 15684 + }, + { + "epoch": 2.79255698005698, + "grad_norm": 0.8243789672851562, + "learning_rate": 
4.180960788193603e-05, + "loss": 0.9196, + "step": 15685 + }, + { + "epoch": 2.7927350427350426, + "grad_norm": 0.9670386910438538, + "learning_rate": 4.1798224902249515e-05, + "loss": 0.828, + "step": 15686 + }, + { + "epoch": 2.7929131054131053, + "grad_norm": 0.7831283211708069, + "learning_rate": 4.178684306290592e-05, + "loss": 0.8389, + "step": 15687 + }, + { + "epoch": 2.793091168091168, + "grad_norm": 0.9372588396072388, + "learning_rate": 4.177546236412822e-05, + "loss": 1.2267, + "step": 15688 + }, + { + "epoch": 2.793269230769231, + "grad_norm": 0.9065600633621216, + "learning_rate": 4.176408280613937e-05, + "loss": 0.9674, + "step": 15689 + }, + { + "epoch": 2.7934472934472936, + "grad_norm": 0.8220530152320862, + "learning_rate": 4.1752704389162344e-05, + "loss": 0.8717, + "step": 15690 + }, + { + "epoch": 2.7936253561253563, + "grad_norm": 0.8952174782752991, + "learning_rate": 4.174132711342005e-05, + "loss": 0.8904, + "step": 15691 + }, + { + "epoch": 2.7938034188034186, + "grad_norm": 0.8454076647758484, + "learning_rate": 4.172995097913549e-05, + "loss": 0.9784, + "step": 15692 + }, + { + "epoch": 2.7939814814814814, + "grad_norm": 0.8697866797447205, + "learning_rate": 4.171857598653143e-05, + "loss": 1.0042, + "step": 15693 + }, + { + "epoch": 2.794159544159544, + "grad_norm": 0.8736211657524109, + "learning_rate": 4.170720213583084e-05, + "loss": 0.8787, + "step": 15694 + }, + { + "epoch": 2.794337606837607, + "grad_norm": 1.0082578659057617, + "learning_rate": 4.1695829427256525e-05, + "loss": 1.2508, + "step": 15695 + }, + { + "epoch": 2.7945156695156697, + "grad_norm": 0.8092042207717896, + "learning_rate": 4.1684457861031325e-05, + "loss": 0.8968, + "step": 15696 + }, + { + "epoch": 2.794693732193732, + "grad_norm": 0.847034752368927, + "learning_rate": 4.167308743737802e-05, + "loss": 0.8019, + "step": 15697 + }, + { + "epoch": 2.7948717948717947, + "grad_norm": 0.9059078097343445, + "learning_rate": 4.1661718156519414e-05, + "loss": 
1.0393, + "step": 15698 + }, + { + "epoch": 2.7950498575498575, + "grad_norm": 0.8907228112220764, + "learning_rate": 4.165035001867822e-05, + "loss": 0.7388, + "step": 15699 + }, + { + "epoch": 2.79522792022792, + "grad_norm": 0.8089052438735962, + "learning_rate": 4.163898302407727e-05, + "loss": 0.8052, + "step": 15700 + }, + { + "epoch": 2.795405982905983, + "grad_norm": 1.1158883571624756, + "learning_rate": 4.162761717293915e-05, + "loss": 0.6923, + "step": 15701 + }, + { + "epoch": 2.7955840455840457, + "grad_norm": 0.8157755732536316, + "learning_rate": 4.1616252465486684e-05, + "loss": 0.606, + "step": 15702 + }, + { + "epoch": 2.7957621082621085, + "grad_norm": 0.8905386328697205, + "learning_rate": 4.1604888901942386e-05, + "loss": 0.7798, + "step": 15703 + }, + { + "epoch": 2.7959401709401708, + "grad_norm": 0.7655990719795227, + "learning_rate": 4.1593526482529034e-05, + "loss": 0.7435, + "step": 15704 + }, + { + "epoch": 2.7961182336182335, + "grad_norm": 0.900643527507782, + "learning_rate": 4.1582165207469195e-05, + "loss": 0.8314, + "step": 15705 + }, + { + "epoch": 2.7962962962962963, + "grad_norm": 0.7737550735473633, + "learning_rate": 4.1570805076985475e-05, + "loss": 0.8281, + "step": 15706 + }, + { + "epoch": 2.796474358974359, + "grad_norm": 0.8385021090507507, + "learning_rate": 4.1559446091300455e-05, + "loss": 0.896, + "step": 15707 + }, + { + "epoch": 2.796652421652422, + "grad_norm": 0.8830214142799377, + "learning_rate": 4.1548088250636687e-05, + "loss": 0.8856, + "step": 15708 + }, + { + "epoch": 2.796830484330484, + "grad_norm": 0.9748533368110657, + "learning_rate": 4.1536731555216676e-05, + "loss": 0.8768, + "step": 15709 + }, + { + "epoch": 2.797008547008547, + "grad_norm": 0.8918380737304688, + "learning_rate": 4.1525376005263e-05, + "loss": 0.8332, + "step": 15710 + }, + { + "epoch": 2.7971866096866096, + "grad_norm": 0.9205654263496399, + "learning_rate": 4.15140216009981e-05, + "loss": 0.7698, + "step": 15711 + }, + { + 
"epoch": 2.7973646723646723, + "grad_norm": 0.9631472229957581, + "learning_rate": 4.1502668342644455e-05, + "loss": 0.9604, + "step": 15712 + }, + { + "epoch": 2.797542735042735, + "grad_norm": 0.8770546913146973, + "learning_rate": 4.1491316230424516e-05, + "loss": 0.7661, + "step": 15713 + }, + { + "epoch": 2.797720797720798, + "grad_norm": 0.8872628808021545, + "learning_rate": 4.147996526456069e-05, + "loss": 1.0847, + "step": 15714 + }, + { + "epoch": 2.7978988603988606, + "grad_norm": 0.8924010396003723, + "learning_rate": 4.146861544527538e-05, + "loss": 0.8159, + "step": 15715 + }, + { + "epoch": 2.7980769230769234, + "grad_norm": 0.9251703023910522, + "learning_rate": 4.1457266772790923e-05, + "loss": 0.7204, + "step": 15716 + }, + { + "epoch": 2.7982549857549857, + "grad_norm": 0.8891414403915405, + "learning_rate": 4.144591924732979e-05, + "loss": 0.8576, + "step": 15717 + }, + { + "epoch": 2.7984330484330484, + "grad_norm": 0.9676079154014587, + "learning_rate": 4.143457286911415e-05, + "loss": 1.1912, + "step": 15718 + }, + { + "epoch": 2.798611111111111, + "grad_norm": 0.8125061392784119, + "learning_rate": 4.142322763836645e-05, + "loss": 0.8111, + "step": 15719 + }, + { + "epoch": 2.798789173789174, + "grad_norm": 0.8612900972366333, + "learning_rate": 4.141188355530891e-05, + "loss": 0.7537, + "step": 15720 + }, + { + "epoch": 2.798967236467236, + "grad_norm": 1.0774086713790894, + "learning_rate": 4.14005406201638e-05, + "loss": 0.9364, + "step": 15721 + }, + { + "epoch": 2.799145299145299, + "grad_norm": 0.8296873569488525, + "learning_rate": 4.138919883315338e-05, + "loss": 0.9329, + "step": 15722 + }, + { + "epoch": 2.7993233618233617, + "grad_norm": 0.757978618144989, + "learning_rate": 4.137785819449984e-05, + "loss": 1.0353, + "step": 15723 + }, + { + "epoch": 2.7995014245014245, + "grad_norm": 0.8584328293800354, + "learning_rate": 4.136651870442536e-05, + "loss": 1.0577, + "step": 15724 + }, + { + "epoch": 2.7996794871794872, + 
"grad_norm": 0.7919153571128845, + "learning_rate": 4.135518036315222e-05, + "loss": 0.8013, + "step": 15725 + }, + { + "epoch": 2.79985754985755, + "grad_norm": 0.968519926071167, + "learning_rate": 4.134384317090243e-05, + "loss": 0.7684, + "step": 15726 + }, + { + "epoch": 2.8000356125356127, + "grad_norm": 0.8565614819526672, + "learning_rate": 4.133250712789826e-05, + "loss": 0.9177, + "step": 15727 + }, + { + "epoch": 2.8002136752136755, + "grad_norm": 0.8614934086799622, + "learning_rate": 4.1321172234361647e-05, + "loss": 0.9613, + "step": 15728 + }, + { + "epoch": 2.800391737891738, + "grad_norm": 0.8621053099632263, + "learning_rate": 4.130983849051483e-05, + "loss": 0.8254, + "step": 15729 + }, + { + "epoch": 2.8005698005698005, + "grad_norm": 0.8108318448066711, + "learning_rate": 4.12985058965798e-05, + "loss": 0.7577, + "step": 15730 + }, + { + "epoch": 2.8007478632478633, + "grad_norm": 0.9211961627006531, + "learning_rate": 4.1287174452778564e-05, + "loss": 0.8204, + "step": 15731 + }, + { + "epoch": 2.800925925925926, + "grad_norm": 0.8582359552383423, + "learning_rate": 4.127584415933326e-05, + "loss": 0.8915, + "step": 15732 + }, + { + "epoch": 2.8011039886039883, + "grad_norm": 0.9122742414474487, + "learning_rate": 4.126451501646573e-05, + "loss": 1.0896, + "step": 15733 + }, + { + "epoch": 2.801282051282051, + "grad_norm": 0.7392016649246216, + "learning_rate": 4.125318702439804e-05, + "loss": 0.6354, + "step": 15734 + }, + { + "epoch": 2.801460113960114, + "grad_norm": 0.8227471113204956, + "learning_rate": 4.124186018335213e-05, + "loss": 0.7903, + "step": 15735 + }, + { + "epoch": 2.8016381766381766, + "grad_norm": 0.8843638300895691, + "learning_rate": 4.12305344935499e-05, + "loss": 0.7545, + "step": 15736 + }, + { + "epoch": 2.8018162393162394, + "grad_norm": 0.9978471994400024, + "learning_rate": 4.121920995521327e-05, + "loss": 1.0127, + "step": 15737 + }, + { + "epoch": 2.801994301994302, + "grad_norm": 0.9969626069068909, + 
"learning_rate": 4.12078865685641e-05, + "loss": 0.8137, + "step": 15738 + }, + { + "epoch": 2.802172364672365, + "grad_norm": 1.0768957138061523, + "learning_rate": 4.119656433382428e-05, + "loss": 0.6866, + "step": 15739 + }, + { + "epoch": 2.8023504273504276, + "grad_norm": 0.7401831746101379, + "learning_rate": 4.1185243251215624e-05, + "loss": 0.9103, + "step": 15740 + }, + { + "epoch": 2.80252849002849, + "grad_norm": 0.9753470420837402, + "learning_rate": 4.1173923320959905e-05, + "loss": 0.9499, + "step": 15741 + }, + { + "epoch": 2.8027065527065527, + "grad_norm": 0.9174960851669312, + "learning_rate": 4.116260454327904e-05, + "loss": 1.0355, + "step": 15742 + }, + { + "epoch": 2.8028846153846154, + "grad_norm": 0.8292258381843567, + "learning_rate": 4.115128691839464e-05, + "loss": 0.8806, + "step": 15743 + }, + { + "epoch": 2.803062678062678, + "grad_norm": 0.9542452096939087, + "learning_rate": 4.1139970446528564e-05, + "loss": 0.8378, + "step": 15744 + }, + { + "epoch": 2.8032407407407405, + "grad_norm": 0.848686933517456, + "learning_rate": 4.1128655127902485e-05, + "loss": 0.7939, + "step": 15745 + }, + { + "epoch": 2.8034188034188032, + "grad_norm": 0.8277645111083984, + "learning_rate": 4.1117340962738125e-05, + "loss": 0.9277, + "step": 15746 + }, + { + "epoch": 2.803596866096866, + "grad_norm": 0.8613318204879761, + "learning_rate": 4.110602795125714e-05, + "loss": 0.7622, + "step": 15747 + }, + { + "epoch": 2.8037749287749287, + "grad_norm": 0.7106199860572815, + "learning_rate": 4.109471609368121e-05, + "loss": 0.787, + "step": 15748 + }, + { + "epoch": 2.8039529914529915, + "grad_norm": 0.8933543562889099, + "learning_rate": 4.108340539023194e-05, + "loss": 0.857, + "step": 15749 + }, + { + "epoch": 2.8041310541310542, + "grad_norm": 0.8682022094726562, + "learning_rate": 4.107209584113092e-05, + "loss": 0.7931, + "step": 15750 + }, + { + "epoch": 2.804309116809117, + "grad_norm": 0.828279435634613, + "learning_rate": 4.106078744659981e-05, + 
"loss": 0.8404, + "step": 15751 + }, + { + "epoch": 2.8044871794871797, + "grad_norm": 0.9503956437110901, + "learning_rate": 4.1049480206860136e-05, + "loss": 0.8588, + "step": 15752 + }, + { + "epoch": 2.804665242165242, + "grad_norm": 0.810714602470398, + "learning_rate": 4.1038174122133435e-05, + "loss": 0.8592, + "step": 15753 + }, + { + "epoch": 2.804843304843305, + "grad_norm": 0.9458156824111938, + "learning_rate": 4.1026869192641225e-05, + "loss": 0.7441, + "step": 15754 + }, + { + "epoch": 2.8050213675213675, + "grad_norm": 0.8267046809196472, + "learning_rate": 4.1015565418605016e-05, + "loss": 0.8574, + "step": 15755 + }, + { + "epoch": 2.8051994301994303, + "grad_norm": 0.8413352370262146, + "learning_rate": 4.100426280024623e-05, + "loss": 0.7598, + "step": 15756 + }, + { + "epoch": 2.8053774928774926, + "grad_norm": 0.9205049872398376, + "learning_rate": 4.099296133778644e-05, + "loss": 0.8346, + "step": 15757 + }, + { + "epoch": 2.8055555555555554, + "grad_norm": 0.7986966967582703, + "learning_rate": 4.098166103144692e-05, + "loss": 0.7607, + "step": 15758 + }, + { + "epoch": 2.805733618233618, + "grad_norm": 0.9400181174278259, + "learning_rate": 4.097036188144918e-05, + "loss": 0.8947, + "step": 15759 + }, + { + "epoch": 2.805911680911681, + "grad_norm": 0.9014864563941956, + "learning_rate": 4.0959063888014594e-05, + "loss": 0.7781, + "step": 15760 + }, + { + "epoch": 2.8060897435897436, + "grad_norm": 0.8163666725158691, + "learning_rate": 4.094776705136448e-05, + "loss": 0.9042, + "step": 15761 + }, + { + "epoch": 2.8062678062678064, + "grad_norm": 0.8435617685317993, + "learning_rate": 4.0936471371720217e-05, + "loss": 0.9229, + "step": 15762 + }, + { + "epoch": 2.806445868945869, + "grad_norm": 0.7995414137840271, + "learning_rate": 4.0925176849303104e-05, + "loss": 0.7294, + "step": 15763 + }, + { + "epoch": 2.806623931623932, + "grad_norm": 0.9147883057594299, + "learning_rate": 4.091388348433442e-05, + "loss": 0.955, + "step": 15764 + }, 
+ { + "epoch": 2.806801994301994, + "grad_norm": 0.7326688170433044, + "learning_rate": 4.0902591277035474e-05, + "loss": 0.5885, + "step": 15765 + }, + { + "epoch": 2.806980056980057, + "grad_norm": 0.8756957054138184, + "learning_rate": 4.0891300227627425e-05, + "loss": 0.9404, + "step": 15766 + }, + { + "epoch": 2.8071581196581197, + "grad_norm": 0.8897661566734314, + "learning_rate": 4.088001033633165e-05, + "loss": 1.0214, + "step": 15767 + }, + { + "epoch": 2.8073361823361824, + "grad_norm": 0.8007500171661377, + "learning_rate": 4.086872160336919e-05, + "loss": 0.6553, + "step": 15768 + }, + { + "epoch": 2.807514245014245, + "grad_norm": 0.8175814747810364, + "learning_rate": 4.0857434028961316e-05, + "loss": 0.8572, + "step": 15769 + }, + { + "epoch": 2.8076923076923075, + "grad_norm": 0.8290224671363831, + "learning_rate": 4.084614761332917e-05, + "loss": 0.9196, + "step": 15770 + }, + { + "epoch": 2.8078703703703702, + "grad_norm": 0.9355306625366211, + "learning_rate": 4.083486235669385e-05, + "loss": 0.8198, + "step": 15771 + }, + { + "epoch": 2.808048433048433, + "grad_norm": 0.942641019821167, + "learning_rate": 4.082357825927656e-05, + "loss": 0.7817, + "step": 15772 + }, + { + "epoch": 2.8082264957264957, + "grad_norm": 0.9115625023841858, + "learning_rate": 4.081229532129827e-05, + "loss": 1.046, + "step": 15773 + }, + { + "epoch": 2.8084045584045585, + "grad_norm": 0.8559226393699646, + "learning_rate": 4.080101354298016e-05, + "loss": 0.7085, + "step": 15774 + }, + { + "epoch": 2.8085826210826212, + "grad_norm": 0.8223599195480347, + "learning_rate": 4.0789732924543144e-05, + "loss": 0.9019, + "step": 15775 + }, + { + "epoch": 2.808760683760684, + "grad_norm": 0.8511637449264526, + "learning_rate": 4.0778453466208344e-05, + "loss": 0.6845, + "step": 15776 + }, + { + "epoch": 2.8089387464387463, + "grad_norm": 0.9633409976959229, + "learning_rate": 4.076717516819674e-05, + "loss": 1.0478, + "step": 15777 + }, + { + "epoch": 2.809116809116809, + 
"grad_norm": 0.8657141327857971, + "learning_rate": 4.075589803072928e-05, + "loss": 0.8694, + "step": 15778 + }, + { + "epoch": 2.809294871794872, + "grad_norm": 0.8126863241195679, + "learning_rate": 4.0744622054026936e-05, + "loss": 0.8529, + "step": 15779 + }, + { + "epoch": 2.8094729344729346, + "grad_norm": 0.8671838045120239, + "learning_rate": 4.0733347238310626e-05, + "loss": 0.81, + "step": 15780 + }, + { + "epoch": 2.8096509971509973, + "grad_norm": 0.8336054086685181, + "learning_rate": 4.0722073583801223e-05, + "loss": 0.7103, + "step": 15781 + }, + { + "epoch": 2.8098290598290596, + "grad_norm": 0.7833893299102783, + "learning_rate": 4.071080109071973e-05, + "loss": 0.875, + "step": 15782 + }, + { + "epoch": 2.8100071225071224, + "grad_norm": 0.9142106771469116, + "learning_rate": 4.0699529759286844e-05, + "loss": 0.9063, + "step": 15783 + }, + { + "epoch": 2.810185185185185, + "grad_norm": 0.7767373919487, + "learning_rate": 4.0688259589723565e-05, + "loss": 0.841, + "step": 15784 + }, + { + "epoch": 2.810363247863248, + "grad_norm": 0.8679327368736267, + "learning_rate": 4.067699058225056e-05, + "loss": 0.8581, + "step": 15785 + }, + { + "epoch": 2.8105413105413106, + "grad_norm": 0.9434911608695984, + "learning_rate": 4.066572273708873e-05, + "loss": 1.0166, + "step": 15786 + }, + { + "epoch": 2.8107193732193734, + "grad_norm": 0.8053399920463562, + "learning_rate": 4.06544560544588e-05, + "loss": 0.874, + "step": 15787 + }, + { + "epoch": 2.810897435897436, + "grad_norm": 1.0207599401474, + "learning_rate": 4.0643190534581524e-05, + "loss": 0.8296, + "step": 15788 + }, + { + "epoch": 2.8110754985754984, + "grad_norm": 0.8879590034484863, + "learning_rate": 4.0631926177677626e-05, + "loss": 0.8787, + "step": 15789 + }, + { + "epoch": 2.811253561253561, + "grad_norm": 1.0816758871078491, + "learning_rate": 4.062066298396778e-05, + "loss": 0.8129, + "step": 15790 + }, + { + "epoch": 2.811431623931624, + "grad_norm": 0.8332961797714233, + 
"learning_rate": 4.060940095367272e-05, + "loss": 0.8855, + "step": 15791 + }, + { + "epoch": 2.8116096866096867, + "grad_norm": 0.98028963804245, + "learning_rate": 4.059814008701308e-05, + "loss": 1.0065, + "step": 15792 + }, + { + "epoch": 2.8117877492877494, + "grad_norm": 0.8983020782470703, + "learning_rate": 4.058688038420949e-05, + "loss": 0.8259, + "step": 15793 + }, + { + "epoch": 2.8119658119658117, + "grad_norm": 0.8029065728187561, + "learning_rate": 4.057562184548255e-05, + "loss": 0.7639, + "step": 15794 + }, + { + "epoch": 2.8121438746438745, + "grad_norm": 0.8188722729682922, + "learning_rate": 4.056436447105286e-05, + "loss": 0.7179, + "step": 15795 + }, + { + "epoch": 2.8123219373219372, + "grad_norm": 0.8493495583534241, + "learning_rate": 4.055310826114095e-05, + "loss": 0.9479, + "step": 15796 + }, + { + "epoch": 2.8125, + "grad_norm": 0.7770833373069763, + "learning_rate": 4.0541853215967466e-05, + "loss": 0.6915, + "step": 15797 + }, + { + "epoch": 2.8126780626780628, + "grad_norm": 0.8238499760627747, + "learning_rate": 4.053059933575277e-05, + "loss": 1.0257, + "step": 15798 + }, + { + "epoch": 2.8128561253561255, + "grad_norm": 0.8537020683288574, + "learning_rate": 4.051934662071754e-05, + "loss": 0.7037, + "step": 15799 + }, + { + "epoch": 2.8130341880341883, + "grad_norm": 0.8120609521865845, + "learning_rate": 4.0508095071082055e-05, + "loss": 0.9533, + "step": 15800 + }, + { + "epoch": 2.8132122507122506, + "grad_norm": 0.8701691627502441, + "learning_rate": 4.0496844687066915e-05, + "loss": 0.7129, + "step": 15801 + }, + { + "epoch": 2.8133903133903133, + "grad_norm": 0.9007154107093811, + "learning_rate": 4.048559546889249e-05, + "loss": 0.8879, + "step": 15802 + }, + { + "epoch": 2.813568376068376, + "grad_norm": 0.8858364820480347, + "learning_rate": 4.047434741677919e-05, + "loss": 0.9391, + "step": 15803 + }, + { + "epoch": 2.813746438746439, + "grad_norm": 0.8597678542137146, + "learning_rate": 4.046310053094739e-05, + "loss": 
0.7897, + "step": 15804 + }, + { + "epoch": 2.8139245014245016, + "grad_norm": 0.8493016362190247, + "learning_rate": 4.0451854811617475e-05, + "loss": 0.914, + "step": 15805 + }, + { + "epoch": 2.814102564102564, + "grad_norm": 0.8347373008728027, + "learning_rate": 4.044061025900973e-05, + "loss": 0.671, + "step": 15806 + }, + { + "epoch": 2.8142806267806266, + "grad_norm": 0.7406242489814758, + "learning_rate": 4.042936687334455e-05, + "loss": 0.6592, + "step": 15807 + }, + { + "epoch": 2.8144586894586894, + "grad_norm": 0.93736732006073, + "learning_rate": 4.041812465484214e-05, + "loss": 0.8301, + "step": 15808 + }, + { + "epoch": 2.814636752136752, + "grad_norm": 0.8744384050369263, + "learning_rate": 4.040688360372282e-05, + "loss": 0.9104, + "step": 15809 + }, + { + "epoch": 2.814814814814815, + "grad_norm": 0.7417266964912415, + "learning_rate": 4.0395643720206834e-05, + "loss": 0.7698, + "step": 15810 + }, + { + "epoch": 2.8149928774928776, + "grad_norm": 0.8601716160774231, + "learning_rate": 4.038440500451438e-05, + "loss": 0.8459, + "step": 15811 + }, + { + "epoch": 2.8151709401709404, + "grad_norm": 0.9801309108734131, + "learning_rate": 4.037316745686568e-05, + "loss": 0.9555, + "step": 15812 + }, + { + "epoch": 2.8153490028490027, + "grad_norm": 0.8559770584106445, + "learning_rate": 4.036193107748085e-05, + "loss": 0.9912, + "step": 15813 + }, + { + "epoch": 2.8155270655270654, + "grad_norm": 0.8155802488327026, + "learning_rate": 4.035069586658018e-05, + "loss": 0.8471, + "step": 15814 + }, + { + "epoch": 2.815705128205128, + "grad_norm": 0.96893310546875, + "learning_rate": 4.033946182438364e-05, + "loss": 0.9329, + "step": 15815 + }, + { + "epoch": 2.815883190883191, + "grad_norm": 0.9981120228767395, + "learning_rate": 4.032822895111144e-05, + "loss": 1.0333, + "step": 15816 + }, + { + "epoch": 2.8160612535612537, + "grad_norm": 0.9491816163063049, + "learning_rate": 4.031699724698363e-05, + "loss": 0.8838, + "step": 15817 + }, + { + "epoch": 
2.816239316239316, + "grad_norm": 0.7149206399917603, + "learning_rate": 4.030576671222029e-05, + "loss": 0.8248, + "step": 15818 + }, + { + "epoch": 2.8164173789173788, + "grad_norm": 0.8366861343383789, + "learning_rate": 4.029453734704145e-05, + "loss": 0.8041, + "step": 15819 + }, + { + "epoch": 2.8165954415954415, + "grad_norm": 0.8147984147071838, + "learning_rate": 4.0283309151667116e-05, + "loss": 0.8702, + "step": 15820 + }, + { + "epoch": 2.8167735042735043, + "grad_norm": 0.8462722897529602, + "learning_rate": 4.027208212631729e-05, + "loss": 0.9899, + "step": 15821 + }, + { + "epoch": 2.816951566951567, + "grad_norm": 0.9278651475906372, + "learning_rate": 4.0260856271211946e-05, + "loss": 0.8684, + "step": 15822 + }, + { + "epoch": 2.8171296296296298, + "grad_norm": 0.8845569491386414, + "learning_rate": 4.0249631586570993e-05, + "loss": 0.6956, + "step": 15823 + }, + { + "epoch": 2.8173076923076925, + "grad_norm": 0.7803958654403687, + "learning_rate": 4.0238408072614453e-05, + "loss": 0.758, + "step": 15824 + }, + { + "epoch": 2.817485754985755, + "grad_norm": 0.8920331597328186, + "learning_rate": 4.022718572956209e-05, + "loss": 0.9131, + "step": 15825 + }, + { + "epoch": 2.8176638176638176, + "grad_norm": 0.9914098978042603, + "learning_rate": 4.021596455763389e-05, + "loss": 0.973, + "step": 15826 + }, + { + "epoch": 2.8178418803418803, + "grad_norm": 0.8329834938049316, + "learning_rate": 4.020474455704968e-05, + "loss": 0.8746, + "step": 15827 + }, + { + "epoch": 2.818019943019943, + "grad_norm": 0.7789189219474792, + "learning_rate": 4.019352572802928e-05, + "loss": 0.7359, + "step": 15828 + }, + { + "epoch": 2.818198005698006, + "grad_norm": 0.8405792713165283, + "learning_rate": 4.0182308070792505e-05, + "loss": 0.7897, + "step": 15829 + }, + { + "epoch": 2.818376068376068, + "grad_norm": 0.9215472936630249, + "learning_rate": 4.0171091585559116e-05, + "loss": 0.9017, + "step": 15830 + }, + { + "epoch": 2.818554131054131, + "grad_norm": 
0.8310655355453491, + "learning_rate": 4.0159876272548933e-05, + "loss": 0.7499, + "step": 15831 + }, + { + "epoch": 2.8187321937321936, + "grad_norm": 0.8645792603492737, + "learning_rate": 4.014866213198167e-05, + "loss": 0.9009, + "step": 15832 + }, + { + "epoch": 2.8189102564102564, + "grad_norm": 0.8059788942337036, + "learning_rate": 4.013744916407703e-05, + "loss": 0.7367, + "step": 15833 + }, + { + "epoch": 2.819088319088319, + "grad_norm": 0.7990314960479736, + "learning_rate": 4.0126237369054745e-05, + "loss": 1.0172, + "step": 15834 + }, + { + "epoch": 2.819266381766382, + "grad_norm": 0.9321257472038269, + "learning_rate": 4.0115026747134446e-05, + "loss": 1.1224, + "step": 15835 + }, + { + "epoch": 2.8194444444444446, + "grad_norm": 0.9674378037452698, + "learning_rate": 4.0103817298535794e-05, + "loss": 0.9881, + "step": 15836 + }, + { + "epoch": 2.8196225071225074, + "grad_norm": 0.8573030829429626, + "learning_rate": 4.009260902347842e-05, + "loss": 0.7515, + "step": 15837 + }, + { + "epoch": 2.8198005698005697, + "grad_norm": 0.9248984456062317, + "learning_rate": 4.00814019221819e-05, + "loss": 0.9265, + "step": 15838 + }, + { + "epoch": 2.8199786324786325, + "grad_norm": 0.8749010562896729, + "learning_rate": 4.00701959948659e-05, + "loss": 0.8659, + "step": 15839 + }, + { + "epoch": 2.820156695156695, + "grad_norm": 0.8070803880691528, + "learning_rate": 4.005899124174986e-05, + "loss": 0.8813, + "step": 15840 + }, + { + "epoch": 2.820334757834758, + "grad_norm": 1.0711981058120728, + "learning_rate": 4.004778766305339e-05, + "loss": 1.019, + "step": 15841 + }, + { + "epoch": 2.8205128205128203, + "grad_norm": 0.7646795511245728, + "learning_rate": 4.0036585258995985e-05, + "loss": 0.8326, + "step": 15842 + }, + { + "epoch": 2.820690883190883, + "grad_norm": 0.6720184683799744, + "learning_rate": 4.002538402979713e-05, + "loss": 0.6642, + "step": 15843 + }, + { + "epoch": 2.8208689458689458, + "grad_norm": 0.8062998056411743, + "learning_rate": 
4.001418397567629e-05, + "loss": 0.9585, + "step": 15844 + }, + { + "epoch": 2.8210470085470085, + "grad_norm": 0.835515558719635, + "learning_rate": 4.0002985096852893e-05, + "loss": 0.7992, + "step": 15845 + }, + { + "epoch": 2.8212250712250713, + "grad_norm": 0.8308731913566589, + "learning_rate": 3.9991787393546385e-05, + "loss": 0.9982, + "step": 15846 + }, + { + "epoch": 2.821403133903134, + "grad_norm": 0.9056837558746338, + "learning_rate": 3.998059086597614e-05, + "loss": 0.8283, + "step": 15847 + }, + { + "epoch": 2.8215811965811968, + "grad_norm": 0.853070080280304, + "learning_rate": 3.9969395514361506e-05, + "loss": 1.033, + "step": 15848 + }, + { + "epoch": 2.8217592592592595, + "grad_norm": 0.9179061055183411, + "learning_rate": 3.99582013389219e-05, + "loss": 0.929, + "step": 15849 + }, + { + "epoch": 2.821937321937322, + "grad_norm": 0.8702627420425415, + "learning_rate": 3.9947008339876616e-05, + "loss": 0.9994, + "step": 15850 + }, + { + "epoch": 2.8221153846153846, + "grad_norm": 0.9594024419784546, + "learning_rate": 3.9935816517444935e-05, + "loss": 0.88, + "step": 15851 + }, + { + "epoch": 2.8222934472934473, + "grad_norm": 0.8474575877189636, + "learning_rate": 3.992462587184618e-05, + "loss": 0.7817, + "step": 15852 + }, + { + "epoch": 2.82247150997151, + "grad_norm": 0.7588878870010376, + "learning_rate": 3.9913436403299533e-05, + "loss": 0.7911, + "step": 15853 + }, + { + "epoch": 2.8226495726495724, + "grad_norm": 0.8467457890510559, + "learning_rate": 3.9902248112024366e-05, + "loss": 0.7433, + "step": 15854 + }, + { + "epoch": 2.822827635327635, + "grad_norm": 0.839137077331543, + "learning_rate": 3.989106099823972e-05, + "loss": 0.7303, + "step": 15855 + }, + { + "epoch": 2.823005698005698, + "grad_norm": 0.8945586085319519, + "learning_rate": 3.987987506216495e-05, + "loss": 0.6697, + "step": 15856 + }, + { + "epoch": 2.8231837606837606, + "grad_norm": 0.749971330165863, + "learning_rate": 3.9868690304019064e-05, + "loss": 0.65, + 
"step": 15857 + }, + { + "epoch": 2.8233618233618234, + "grad_norm": 0.9841105341911316, + "learning_rate": 3.985750672402131e-05, + "loss": 0.8382, + "step": 15858 + }, + { + "epoch": 2.823539886039886, + "grad_norm": 0.8696077466011047, + "learning_rate": 3.984632432239078e-05, + "loss": 0.7895, + "step": 15859 + }, + { + "epoch": 2.823717948717949, + "grad_norm": 0.7845979928970337, + "learning_rate": 3.9835143099346575e-05, + "loss": 0.8673, + "step": 15860 + }, + { + "epoch": 2.8238960113960117, + "grad_norm": 0.8145211338996887, + "learning_rate": 3.982396305510775e-05, + "loss": 0.905, + "step": 15861 + }, + { + "epoch": 2.824074074074074, + "grad_norm": 0.8533337116241455, + "learning_rate": 3.981278418989336e-05, + "loss": 0.9597, + "step": 15862 + }, + { + "epoch": 2.8242521367521367, + "grad_norm": 0.9430350065231323, + "learning_rate": 3.980160650392241e-05, + "loss": 0.9043, + "step": 15863 + }, + { + "epoch": 2.8244301994301995, + "grad_norm": 0.7469115257263184, + "learning_rate": 3.9790429997414e-05, + "loss": 0.9074, + "step": 15864 + }, + { + "epoch": 2.824608262108262, + "grad_norm": 0.8558746576309204, + "learning_rate": 3.977925467058696e-05, + "loss": 0.8958, + "step": 15865 + }, + { + "epoch": 2.8247863247863245, + "grad_norm": 1.1557669639587402, + "learning_rate": 3.976808052366037e-05, + "loss": 0.7884, + "step": 15866 + }, + { + "epoch": 2.8249643874643873, + "grad_norm": 0.8448477983474731, + "learning_rate": 3.975690755685312e-05, + "loss": 0.9542, + "step": 15867 + }, + { + "epoch": 2.82514245014245, + "grad_norm": 0.833256721496582, + "learning_rate": 3.9745735770384086e-05, + "loss": 0.9196, + "step": 15868 + }, + { + "epoch": 2.8253205128205128, + "grad_norm": 0.9318852424621582, + "learning_rate": 3.973456516447226e-05, + "loss": 0.7792, + "step": 15869 + }, + { + "epoch": 2.8254985754985755, + "grad_norm": 0.832655131816864, + "learning_rate": 3.972339573933638e-05, + "loss": 0.9526, + "step": 15870 + }, + { + "epoch": 
2.8256766381766383, + "grad_norm": 0.7546842694282532, + "learning_rate": 3.9712227495195406e-05, + "loss": 0.5969, + "step": 15871 + }, + { + "epoch": 2.825854700854701, + "grad_norm": 0.8538267016410828, + "learning_rate": 3.970106043226802e-05, + "loss": 0.8369, + "step": 15872 + }, + { + "epoch": 2.826032763532764, + "grad_norm": 0.8023465871810913, + "learning_rate": 3.968989455077314e-05, + "loss": 0.7257, + "step": 15873 + }, + { + "epoch": 2.826210826210826, + "grad_norm": 0.7905409932136536, + "learning_rate": 3.96787298509295e-05, + "loss": 0.7965, + "step": 15874 + }, + { + "epoch": 2.826388888888889, + "grad_norm": 0.8316642642021179, + "learning_rate": 3.966756633295583e-05, + "loss": 0.808, + "step": 15875 + }, + { + "epoch": 2.8265669515669516, + "grad_norm": 0.9130233526229858, + "learning_rate": 3.965640399707088e-05, + "loss": 0.882, + "step": 15876 + }, + { + "epoch": 2.8267450142450143, + "grad_norm": 0.873147189617157, + "learning_rate": 3.9645242843493325e-05, + "loss": 0.8347, + "step": 15877 + }, + { + "epoch": 2.8269230769230766, + "grad_norm": 0.8806825280189514, + "learning_rate": 3.963408287244183e-05, + "loss": 1.0496, + "step": 15878 + }, + { + "epoch": 2.8271011396011394, + "grad_norm": 0.8899962306022644, + "learning_rate": 3.962292408413516e-05, + "loss": 1.0014, + "step": 15879 + }, + { + "epoch": 2.827279202279202, + "grad_norm": 0.900303065776825, + "learning_rate": 3.961176647879179e-05, + "loss": 0.7345, + "step": 15880 + }, + { + "epoch": 2.827457264957265, + "grad_norm": 1.1055848598480225, + "learning_rate": 3.960061005663049e-05, + "loss": 0.9697, + "step": 15881 + }, + { + "epoch": 2.8276353276353277, + "grad_norm": 0.891404390335083, + "learning_rate": 3.958945481786969e-05, + "loss": 0.8878, + "step": 15882 + }, + { + "epoch": 2.8278133903133904, + "grad_norm": 0.9403249025344849, + "learning_rate": 3.957830076272807e-05, + "loss": 0.9536, + "step": 15883 + }, + { + "epoch": 2.827991452991453, + "grad_norm": 
0.8735896944999695, + "learning_rate": 3.9567147891424126e-05, + "loss": 0.9113, + "step": 15884 + }, + { + "epoch": 2.828169515669516, + "grad_norm": 0.7758442759513855, + "learning_rate": 3.9555996204176385e-05, + "loss": 0.7336, + "step": 15885 + }, + { + "epoch": 2.828347578347578, + "grad_norm": 0.8632972836494446, + "learning_rate": 3.9544845701203335e-05, + "loss": 0.6883, + "step": 15886 + }, + { + "epoch": 2.828525641025641, + "grad_norm": 0.8639035224914551, + "learning_rate": 3.953369638272345e-05, + "loss": 0.8836, + "step": 15887 + }, + { + "epoch": 2.8287037037037037, + "grad_norm": 0.8131114840507507, + "learning_rate": 3.952254824895514e-05, + "loss": 0.8175, + "step": 15888 + }, + { + "epoch": 2.8288817663817665, + "grad_norm": 0.7421914935112, + "learning_rate": 3.9511401300116904e-05, + "loss": 0.7721, + "step": 15889 + }, + { + "epoch": 2.8290598290598292, + "grad_norm": 0.8358926177024841, + "learning_rate": 3.95002555364271e-05, + "loss": 0.7327, + "step": 15890 + }, + { + "epoch": 2.8292378917378915, + "grad_norm": 0.8913134932518005, + "learning_rate": 3.9489110958104115e-05, + "loss": 1.0415, + "step": 15891 + }, + { + "epoch": 2.8294159544159543, + "grad_norm": 0.9675887227058411, + "learning_rate": 3.94779675653663e-05, + "loss": 0.8295, + "step": 15892 + }, + { + "epoch": 2.829594017094017, + "grad_norm": 0.8618438839912415, + "learning_rate": 3.946682535843199e-05, + "loss": 1.0754, + "step": 15893 + }, + { + "epoch": 2.82977207977208, + "grad_norm": 0.820209801197052, + "learning_rate": 3.945568433751948e-05, + "loss": 0.9484, + "step": 15894 + }, + { + "epoch": 2.8299501424501425, + "grad_norm": 0.8641984462738037, + "learning_rate": 3.944454450284705e-05, + "loss": 0.964, + "step": 15895 + }, + { + "epoch": 2.8301282051282053, + "grad_norm": 0.8229194283485413, + "learning_rate": 3.943340585463303e-05, + "loss": 0.7069, + "step": 15896 + }, + { + "epoch": 2.830306267806268, + "grad_norm": 0.7874621748924255, + "learning_rate": 
3.942226839309554e-05, + "loss": 0.7815, + "step": 15897 + }, + { + "epoch": 2.8304843304843303, + "grad_norm": 0.8581945896148682, + "learning_rate": 3.9411132118452896e-05, + "loss": 0.9119, + "step": 15898 + }, + { + "epoch": 2.830662393162393, + "grad_norm": 0.9327018857002258, + "learning_rate": 3.939999703092326e-05, + "loss": 1.1002, + "step": 15899 + }, + { + "epoch": 2.830840455840456, + "grad_norm": 0.7793048024177551, + "learning_rate": 3.9388863130724794e-05, + "loss": 0.6529, + "step": 15900 + }, + { + "epoch": 2.8310185185185186, + "grad_norm": 0.9133790135383606, + "learning_rate": 3.9377730418075645e-05, + "loss": 0.7354, + "step": 15901 + }, + { + "epoch": 2.8311965811965814, + "grad_norm": 0.7800240516662598, + "learning_rate": 3.936659889319394e-05, + "loss": 0.8541, + "step": 15902 + }, + { + "epoch": 2.8313746438746437, + "grad_norm": 0.782433271408081, + "learning_rate": 3.9355468556297737e-05, + "loss": 0.9084, + "step": 15903 + }, + { + "epoch": 2.8315527065527064, + "grad_norm": 0.8926814198493958, + "learning_rate": 3.9344339407605226e-05, + "loss": 1.0252, + "step": 15904 + }, + { + "epoch": 2.831730769230769, + "grad_norm": 0.92144376039505, + "learning_rate": 3.93332114473343e-05, + "loss": 0.7859, + "step": 15905 + }, + { + "epoch": 2.831908831908832, + "grad_norm": 0.7403308749198914, + "learning_rate": 3.932208467570315e-05, + "loss": 0.796, + "step": 15906 + }, + { + "epoch": 2.8320868945868947, + "grad_norm": 0.939708411693573, + "learning_rate": 3.9310959092929636e-05, + "loss": 0.9622, + "step": 15907 + }, + { + "epoch": 2.8322649572649574, + "grad_norm": 0.7546647787094116, + "learning_rate": 3.929983469923184e-05, + "loss": 0.7551, + "step": 15908 + }, + { + "epoch": 2.83244301994302, + "grad_norm": 0.8301447033882141, + "learning_rate": 3.928871149482768e-05, + "loss": 0.9377, + "step": 15909 + }, + { + "epoch": 2.8326210826210825, + "grad_norm": 0.8961447477340698, + "learning_rate": 3.927758947993508e-05, + "loss": 0.8245, + 
"step": 15910 + }, + { + "epoch": 2.8327991452991452, + "grad_norm": 0.7845988869667053, + "learning_rate": 3.926646865477204e-05, + "loss": 0.8662, + "step": 15911 + }, + { + "epoch": 2.832977207977208, + "grad_norm": 0.9339789152145386, + "learning_rate": 3.925534901955631e-05, + "loss": 0.8533, + "step": 15912 + }, + { + "epoch": 2.8331552706552707, + "grad_norm": 0.936855137348175, + "learning_rate": 3.924423057450587e-05, + "loss": 0.8809, + "step": 15913 + }, + { + "epoch": 2.8333333333333335, + "grad_norm": 0.972535252571106, + "learning_rate": 3.923311331983852e-05, + "loss": 1.0103, + "step": 15914 + }, + { + "epoch": 2.833511396011396, + "grad_norm": 0.9115430116653442, + "learning_rate": 3.922199725577208e-05, + "loss": 1.0041, + "step": 15915 + }, + { + "epoch": 2.8336894586894585, + "grad_norm": 0.8378027081489563, + "learning_rate": 3.921088238252435e-05, + "loss": 0.8475, + "step": 15916 + }, + { + "epoch": 2.8338675213675213, + "grad_norm": 0.9383054375648499, + "learning_rate": 3.91997687003131e-05, + "loss": 0.7305, + "step": 15917 + }, + { + "epoch": 2.834045584045584, + "grad_norm": 0.7996332049369812, + "learning_rate": 3.918865620935609e-05, + "loss": 0.8749, + "step": 15918 + }, + { + "epoch": 2.834223646723647, + "grad_norm": 0.8876177668571472, + "learning_rate": 3.917754490987103e-05, + "loss": 0.8836, + "step": 15919 + }, + { + "epoch": 2.8344017094017095, + "grad_norm": 0.8004130125045776, + "learning_rate": 3.9166434802075594e-05, + "loss": 1.0383, + "step": 15920 + }, + { + "epoch": 2.8345797720797723, + "grad_norm": 0.76146000623703, + "learning_rate": 3.915532588618756e-05, + "loss": 0.9616, + "step": 15921 + }, + { + "epoch": 2.8347578347578346, + "grad_norm": 0.9333193898200989, + "learning_rate": 3.914421816242446e-05, + "loss": 0.7386, + "step": 15922 + }, + { + "epoch": 2.8349358974358974, + "grad_norm": 0.9340601563453674, + "learning_rate": 3.913311163100403e-05, + "loss": 0.8894, + "step": 15923 + }, + { + "epoch": 
2.83511396011396, + "grad_norm": 0.8401036858558655, + "learning_rate": 3.912200629214383e-05, + "loss": 0.8366, + "step": 15924 + }, + { + "epoch": 2.835292022792023, + "grad_norm": 0.9298731088638306, + "learning_rate": 3.911090214606146e-05, + "loss": 0.899, + "step": 15925 + }, + { + "epoch": 2.8354700854700856, + "grad_norm": 0.8085874915122986, + "learning_rate": 3.909979919297446e-05, + "loss": 0.7597, + "step": 15926 + }, + { + "epoch": 2.835648148148148, + "grad_norm": 0.7841027975082397, + "learning_rate": 3.9088697433100396e-05, + "loss": 0.897, + "step": 15927 + }, + { + "epoch": 2.8358262108262107, + "grad_norm": 1.0678621530532837, + "learning_rate": 3.907759686665677e-05, + "loss": 0.976, + "step": 15928 + }, + { + "epoch": 2.8360042735042734, + "grad_norm": 0.7748154997825623, + "learning_rate": 3.906649749386106e-05, + "loss": 0.6834, + "step": 15929 + }, + { + "epoch": 2.836182336182336, + "grad_norm": 0.8119567632675171, + "learning_rate": 3.905539931493076e-05, + "loss": 0.8076, + "step": 15930 + }, + { + "epoch": 2.836360398860399, + "grad_norm": 0.8723282814025879, + "learning_rate": 3.9044302330083326e-05, + "loss": 0.9057, + "step": 15931 + }, + { + "epoch": 2.8365384615384617, + "grad_norm": 0.7785065174102783, + "learning_rate": 3.903320653953616e-05, + "loss": 0.8899, + "step": 15932 + }, + { + "epoch": 2.8367165242165244, + "grad_norm": 0.9053105115890503, + "learning_rate": 3.902211194350667e-05, + "loss": 0.9038, + "step": 15933 + }, + { + "epoch": 2.8368945868945867, + "grad_norm": 0.8937689065933228, + "learning_rate": 3.9011018542212216e-05, + "loss": 0.736, + "step": 15934 + }, + { + "epoch": 2.8370726495726495, + "grad_norm": 0.7169269323348999, + "learning_rate": 3.899992633587014e-05, + "loss": 0.6632, + "step": 15935 + }, + { + "epoch": 2.8372507122507122, + "grad_norm": 0.8168412446975708, + "learning_rate": 3.898883532469785e-05, + "loss": 0.8482, + "step": 15936 + }, + { + "epoch": 2.837428774928775, + "grad_norm": 
0.7374065518379211, + "learning_rate": 3.897774550891252e-05, + "loss": 0.8141, + "step": 15937 + }, + { + "epoch": 2.8376068376068377, + "grad_norm": 0.8844853043556213, + "learning_rate": 3.8966656888731546e-05, + "loss": 0.8204, + "step": 15938 + }, + { + "epoch": 2.8377849002849, + "grad_norm": 0.9031739234924316, + "learning_rate": 3.895556946437213e-05, + "loss": 0.862, + "step": 15939 + }, + { + "epoch": 2.837962962962963, + "grad_norm": 0.8141549229621887, + "learning_rate": 3.894448323605154e-05, + "loss": 0.6687, + "step": 15940 + }, + { + "epoch": 2.8381410256410255, + "grad_norm": 0.796144962310791, + "learning_rate": 3.893339820398696e-05, + "loss": 0.9021, + "step": 15941 + }, + { + "epoch": 2.8383190883190883, + "grad_norm": 0.8840420246124268, + "learning_rate": 3.8922314368395584e-05, + "loss": 0.9608, + "step": 15942 + }, + { + "epoch": 2.838497150997151, + "grad_norm": 0.8297450542449951, + "learning_rate": 3.891123172949459e-05, + "loss": 0.9442, + "step": 15943 + }, + { + "epoch": 2.838675213675214, + "grad_norm": 0.7875503301620483, + "learning_rate": 3.89001502875011e-05, + "loss": 0.9269, + "step": 15944 + }, + { + "epoch": 2.8388532763532766, + "grad_norm": 0.9460122585296631, + "learning_rate": 3.8889070042632217e-05, + "loss": 0.9459, + "step": 15945 + }, + { + "epoch": 2.8390313390313393, + "grad_norm": 0.8144980669021606, + "learning_rate": 3.887799099510512e-05, + "loss": 0.8409, + "step": 15946 + }, + { + "epoch": 2.8392094017094016, + "grad_norm": 0.8182117342948914, + "learning_rate": 3.886691314513675e-05, + "loss": 0.8093, + "step": 15947 + }, + { + "epoch": 2.8393874643874644, + "grad_norm": 0.8287648558616638, + "learning_rate": 3.885583649294426e-05, + "loss": 0.7792, + "step": 15948 + }, + { + "epoch": 2.839565527065527, + "grad_norm": 0.9165690541267395, + "learning_rate": 3.884476103874464e-05, + "loss": 0.9865, + "step": 15949 + }, + { + "epoch": 2.83974358974359, + "grad_norm": 0.7819885015487671, + "learning_rate": 
3.883368678275485e-05, + "loss": 0.7245, + "step": 15950 + }, + { + "epoch": 2.839921652421652, + "grad_norm": 0.8354606628417969, + "learning_rate": 3.882261372519198e-05, + "loss": 1.0513, + "step": 15951 + }, + { + "epoch": 2.840099715099715, + "grad_norm": 0.7606815099716187, + "learning_rate": 3.881154186627284e-05, + "loss": 0.9357, + "step": 15952 + }, + { + "epoch": 2.8402777777777777, + "grad_norm": 0.9649691581726074, + "learning_rate": 3.88004712062145e-05, + "loss": 0.8756, + "step": 15953 + }, + { + "epoch": 2.8404558404558404, + "grad_norm": 0.8770344853401184, + "learning_rate": 3.878940174523371e-05, + "loss": 0.868, + "step": 15954 + }, + { + "epoch": 2.840633903133903, + "grad_norm": 0.898287832736969, + "learning_rate": 3.877833348354749e-05, + "loss": 0.8734, + "step": 15955 + }, + { + "epoch": 2.840811965811966, + "grad_norm": 0.84062260389328, + "learning_rate": 3.876726642137264e-05, + "loss": 0.9362, + "step": 15956 + }, + { + "epoch": 2.8409900284900287, + "grad_norm": 0.7898240685462952, + "learning_rate": 3.8756200558926013e-05, + "loss": 0.7788, + "step": 15957 + }, + { + "epoch": 2.8411680911680914, + "grad_norm": 0.7237298488616943, + "learning_rate": 3.874513589642441e-05, + "loss": 0.8426, + "step": 15958 + }, + { + "epoch": 2.8413461538461537, + "grad_norm": 0.9025090932846069, + "learning_rate": 3.873407243408462e-05, + "loss": 0.7135, + "step": 15959 + }, + { + "epoch": 2.8415242165242165, + "grad_norm": 0.807295560836792, + "learning_rate": 3.872301017212337e-05, + "loss": 0.6889, + "step": 15960 + }, + { + "epoch": 2.8417022792022792, + "grad_norm": 0.8537244200706482, + "learning_rate": 3.8711949110757525e-05, + "loss": 0.798, + "step": 15961 + }, + { + "epoch": 2.841880341880342, + "grad_norm": 0.8148910999298096, + "learning_rate": 3.870088925020366e-05, + "loss": 0.8783, + "step": 15962 + }, + { + "epoch": 2.8420584045584043, + "grad_norm": 0.8254446983337402, + "learning_rate": 3.868983059067859e-05, + "loss": 0.7043, + 
"step": 15963 + }, + { + "epoch": 2.842236467236467, + "grad_norm": 0.8392706513404846, + "learning_rate": 3.867877313239886e-05, + "loss": 0.9942, + "step": 15964 + }, + { + "epoch": 2.84241452991453, + "grad_norm": 0.8974948525428772, + "learning_rate": 3.8667716875581217e-05, + "loss": 0.7646, + "step": 15965 + }, + { + "epoch": 2.8425925925925926, + "grad_norm": 0.9764110445976257, + "learning_rate": 3.8656661820442264e-05, + "loss": 0.8803, + "step": 15966 + }, + { + "epoch": 2.8427706552706553, + "grad_norm": 0.9663669466972351, + "learning_rate": 3.864560796719855e-05, + "loss": 0.8764, + "step": 15967 + }, + { + "epoch": 2.842948717948718, + "grad_norm": 0.837733268737793, + "learning_rate": 3.863455531606677e-05, + "loss": 0.8992, + "step": 15968 + }, + { + "epoch": 2.843126780626781, + "grad_norm": 0.8458481431007385, + "learning_rate": 3.8623503867263335e-05, + "loss": 0.9025, + "step": 15969 + }, + { + "epoch": 2.8433048433048436, + "grad_norm": 0.901089072227478, + "learning_rate": 3.861245362100488e-05, + "loss": 0.8025, + "step": 15970 + }, + { + "epoch": 2.843482905982906, + "grad_norm": 0.9032089114189148, + "learning_rate": 3.860140457750786e-05, + "loss": 0.7217, + "step": 15971 + }, + { + "epoch": 2.8436609686609686, + "grad_norm": 0.7998839616775513, + "learning_rate": 3.859035673698879e-05, + "loss": 0.9127, + "step": 15972 + }, + { + "epoch": 2.8438390313390314, + "grad_norm": 0.8568583726882935, + "learning_rate": 3.85793100996641e-05, + "loss": 0.8847, + "step": 15973 + }, + { + "epoch": 2.844017094017094, + "grad_norm": 0.8720089793205261, + "learning_rate": 3.856826466575024e-05, + "loss": 0.7822, + "step": 15974 + }, + { + "epoch": 2.8441951566951564, + "grad_norm": 0.8872382640838623, + "learning_rate": 3.8557220435463594e-05, + "loss": 0.9601, + "step": 15975 + }, + { + "epoch": 2.844373219373219, + "grad_norm": 1.1950596570968628, + "learning_rate": 3.8546177409020634e-05, + "loss": 1.075, + "step": 15976 + }, + { + "epoch": 
2.844551282051282, + "grad_norm": 0.9111549854278564, + "learning_rate": 3.85351355866376e-05, + "loss": 1.0103, + "step": 15977 + }, + { + "epoch": 2.8447293447293447, + "grad_norm": 0.9310214519500732, + "learning_rate": 3.852409496853099e-05, + "loss": 1.0163, + "step": 15978 + }, + { + "epoch": 2.8449074074074074, + "grad_norm": 0.8177474737167358, + "learning_rate": 3.851305555491695e-05, + "loss": 0.7488, + "step": 15979 + }, + { + "epoch": 2.84508547008547, + "grad_norm": 0.9321249127388, + "learning_rate": 3.85020173460119e-05, + "loss": 0.9914, + "step": 15980 + }, + { + "epoch": 2.845263532763533, + "grad_norm": 0.7649266719818115, + "learning_rate": 3.849098034203206e-05, + "loss": 0.692, + "step": 15981 + }, + { + "epoch": 2.8454415954415957, + "grad_norm": 0.7714266777038574, + "learning_rate": 3.847994454319369e-05, + "loss": 0.8859, + "step": 15982 + }, + { + "epoch": 2.845619658119658, + "grad_norm": 0.9535303711891174, + "learning_rate": 3.846890994971302e-05, + "loss": 0.8992, + "step": 15983 + }, + { + "epoch": 2.8457977207977208, + "grad_norm": 0.8171879649162292, + "learning_rate": 3.845787656180623e-05, + "loss": 0.7125, + "step": 15984 + }, + { + "epoch": 2.8459757834757835, + "grad_norm": 0.8546884655952454, + "learning_rate": 3.8446844379689464e-05, + "loss": 0.9895, + "step": 15985 + }, + { + "epoch": 2.8461538461538463, + "grad_norm": 0.9110364317893982, + "learning_rate": 3.843581340357899e-05, + "loss": 1.0702, + "step": 15986 + }, + { + "epoch": 2.8463319088319086, + "grad_norm": 0.8862065672874451, + "learning_rate": 3.84247836336908e-05, + "loss": 0.9138, + "step": 15987 + }, + { + "epoch": 2.8465099715099713, + "grad_norm": 0.8485249876976013, + "learning_rate": 3.84137550702411e-05, + "loss": 1.2831, + "step": 15988 + }, + { + "epoch": 2.846688034188034, + "grad_norm": 0.8271495699882507, + "learning_rate": 3.840272771344593e-05, + "loss": 0.9497, + "step": 15989 + }, + { + "epoch": 2.846866096866097, + "grad_norm": 
0.7829293608665466, + "learning_rate": 3.839170156352135e-05, + "loss": 0.6503, + "step": 15990 + }, + { + "epoch": 2.8470441595441596, + "grad_norm": 0.9366582036018372, + "learning_rate": 3.838067662068341e-05, + "loss": 0.7805, + "step": 15991 + }, + { + "epoch": 2.8472222222222223, + "grad_norm": 0.8666117787361145, + "learning_rate": 3.836965288514807e-05, + "loss": 0.7721, + "step": 15992 + }, + { + "epoch": 2.847400284900285, + "grad_norm": 0.7855546474456787, + "learning_rate": 3.835863035713142e-05, + "loss": 0.7457, + "step": 15993 + }, + { + "epoch": 2.847578347578348, + "grad_norm": 0.8234511017799377, + "learning_rate": 3.8347609036849284e-05, + "loss": 0.8937, + "step": 15994 + }, + { + "epoch": 2.84775641025641, + "grad_norm": 0.8896345496177673, + "learning_rate": 3.833658892451773e-05, + "loss": 0.9146, + "step": 15995 + }, + { + "epoch": 2.847934472934473, + "grad_norm": 0.8099349737167358, + "learning_rate": 3.83255700203526e-05, + "loss": 0.8353, + "step": 15996 + }, + { + "epoch": 2.8481125356125356, + "grad_norm": 0.874100387096405, + "learning_rate": 3.831455232456982e-05, + "loss": 0.6829, + "step": 15997 + }, + { + "epoch": 2.8482905982905984, + "grad_norm": 0.9338345527648926, + "learning_rate": 3.830353583738524e-05, + "loss": 1.0345, + "step": 15998 + }, + { + "epoch": 2.8484686609686607, + "grad_norm": 0.7876978516578674, + "learning_rate": 3.829252055901472e-05, + "loss": 0.6703, + "step": 15999 + }, + { + "epoch": 2.8486467236467234, + "grad_norm": 0.8565872311592102, + "learning_rate": 3.828150648967408e-05, + "loss": 0.8227, + "step": 16000 + }, + { + "epoch": 2.848824786324786, + "grad_norm": 1.0180596113204956, + "learning_rate": 3.82704936295791e-05, + "loss": 0.8933, + "step": 16001 + }, + { + "epoch": 2.849002849002849, + "grad_norm": 0.8450096845626831, + "learning_rate": 3.825948197894553e-05, + "loss": 0.851, + "step": 16002 + }, + { + "epoch": 2.8491809116809117, + "grad_norm": 0.7936033010482788, + "learning_rate": 
3.824847153798923e-05, + "loss": 0.8573, + "step": 16003 + }, + { + "epoch": 2.8493589743589745, + "grad_norm": 0.9499372839927673, + "learning_rate": 3.8237462306925774e-05, + "loss": 0.8269, + "step": 16004 + }, + { + "epoch": 2.849537037037037, + "grad_norm": 0.874855101108551, + "learning_rate": 3.822645428597099e-05, + "loss": 0.9657, + "step": 16005 + }, + { + "epoch": 2.8497150997151, + "grad_norm": 0.8966119885444641, + "learning_rate": 3.8215447475340506e-05, + "loss": 0.9239, + "step": 16006 + }, + { + "epoch": 2.8498931623931623, + "grad_norm": 0.8341490030288696, + "learning_rate": 3.820444187524994e-05, + "loss": 1.0051, + "step": 16007 + }, + { + "epoch": 2.850071225071225, + "grad_norm": 0.7965613007545471, + "learning_rate": 3.8193437485915054e-05, + "loss": 0.8591, + "step": 16008 + }, + { + "epoch": 2.8502492877492878, + "grad_norm": 0.7846593856811523, + "learning_rate": 3.818243430755128e-05, + "loss": 0.8095, + "step": 16009 + }, + { + "epoch": 2.8504273504273505, + "grad_norm": 0.9422695636749268, + "learning_rate": 3.8171432340374334e-05, + "loss": 0.8902, + "step": 16010 + }, + { + "epoch": 2.8506054131054133, + "grad_norm": 0.8810960650444031, + "learning_rate": 3.8160431584599744e-05, + "loss": 0.8483, + "step": 16011 + }, + { + "epoch": 2.8507834757834756, + "grad_norm": 0.8513348698616028, + "learning_rate": 3.814943204044302e-05, + "loss": 0.832, + "step": 16012 + }, + { + "epoch": 2.8509615384615383, + "grad_norm": 0.8906846046447754, + "learning_rate": 3.8138433708119704e-05, + "loss": 0.9702, + "step": 16013 + }, + { + "epoch": 2.851139601139601, + "grad_norm": 0.9517511129379272, + "learning_rate": 3.812743658784526e-05, + "loss": 0.8138, + "step": 16014 + }, + { + "epoch": 2.851317663817664, + "grad_norm": 0.7989702820777893, + "learning_rate": 3.811644067983517e-05, + "loss": 0.8653, + "step": 16015 + }, + { + "epoch": 2.8514957264957266, + "grad_norm": 0.8255589008331299, + "learning_rate": 3.8105445984304874e-05, + "loss": 
0.9456, + "step": 16016 + }, + { + "epoch": 2.8516737891737893, + "grad_norm": 0.7919938564300537, + "learning_rate": 3.809445250146977e-05, + "loss": 0.6261, + "step": 16017 + }, + { + "epoch": 2.851851851851852, + "grad_norm": 0.866316020488739, + "learning_rate": 3.808346023154532e-05, + "loss": 0.8171, + "step": 16018 + }, + { + "epoch": 2.8520299145299144, + "grad_norm": 1.0050057172775269, + "learning_rate": 3.8072469174746794e-05, + "loss": 0.9094, + "step": 16019 + }, + { + "epoch": 2.852207977207977, + "grad_norm": 0.8405657410621643, + "learning_rate": 3.806147933128962e-05, + "loss": 0.7737, + "step": 16020 + }, + { + "epoch": 2.85238603988604, + "grad_norm": 0.8127378821372986, + "learning_rate": 3.8050490701389085e-05, + "loss": 0.9102, + "step": 16021 + }, + { + "epoch": 2.8525641025641026, + "grad_norm": 0.8622255921363831, + "learning_rate": 3.8039503285260506e-05, + "loss": 0.8815, + "step": 16022 + }, + { + "epoch": 2.8527421652421654, + "grad_norm": 0.8802367448806763, + "learning_rate": 3.802851708311913e-05, + "loss": 1.0123, + "step": 16023 + }, + { + "epoch": 2.8529202279202277, + "grad_norm": 0.908149778842926, + "learning_rate": 3.801753209518024e-05, + "loss": 0.6808, + "step": 16024 + }, + { + "epoch": 2.8530982905982905, + "grad_norm": 0.9346339702606201, + "learning_rate": 3.8006548321659055e-05, + "loss": 1.1107, + "step": 16025 + }, + { + "epoch": 2.853276353276353, + "grad_norm": 0.945125937461853, + "learning_rate": 3.799556576277077e-05, + "loss": 0.6578, + "step": 16026 + }, + { + "epoch": 2.853454415954416, + "grad_norm": 0.8294890522956848, + "learning_rate": 3.798458441873054e-05, + "loss": 0.869, + "step": 16027 + }, + { + "epoch": 2.8536324786324787, + "grad_norm": 0.7922961115837097, + "learning_rate": 3.797360428975358e-05, + "loss": 0.718, + "step": 16028 + }, + { + "epoch": 2.8538105413105415, + "grad_norm": 1.0540844202041626, + "learning_rate": 3.7962625376055005e-05, + "loss": 0.8287, + "step": 16029 + }, + { + 
"epoch": 2.853988603988604, + "grad_norm": 0.9409742951393127, + "learning_rate": 3.795164767784991e-05, + "loss": 1.0484, + "step": 16030 + }, + { + "epoch": 2.8541666666666665, + "grad_norm": 0.7328341603279114, + "learning_rate": 3.7940671195353385e-05, + "loss": 0.7603, + "step": 16031 + }, + { + "epoch": 2.8543447293447293, + "grad_norm": 0.9151208996772766, + "learning_rate": 3.792969592878045e-05, + "loss": 0.7523, + "step": 16032 + }, + { + "epoch": 2.854522792022792, + "grad_norm": 0.7935783267021179, + "learning_rate": 3.791872187834626e-05, + "loss": 0.7559, + "step": 16033 + }, + { + "epoch": 2.8547008547008548, + "grad_norm": 0.8030906915664673, + "learning_rate": 3.790774904426568e-05, + "loss": 0.7905, + "step": 16034 + }, + { + "epoch": 2.8548789173789175, + "grad_norm": 0.8756175637245178, + "learning_rate": 3.789677742675384e-05, + "loss": 0.9067, + "step": 16035 + }, + { + "epoch": 2.85505698005698, + "grad_norm": 0.7602807283401489, + "learning_rate": 3.788580702602558e-05, + "loss": 0.9192, + "step": 16036 + }, + { + "epoch": 2.8552350427350426, + "grad_norm": 0.9411010146141052, + "learning_rate": 3.787483784229592e-05, + "loss": 0.8192, + "step": 16037 + }, + { + "epoch": 2.8554131054131053, + "grad_norm": 0.9473391175270081, + "learning_rate": 3.786386987577976e-05, + "loss": 0.7845, + "step": 16038 + }, + { + "epoch": 2.855591168091168, + "grad_norm": 0.9226218461990356, + "learning_rate": 3.7852903126692e-05, + "loss": 0.8712, + "step": 16039 + }, + { + "epoch": 2.855769230769231, + "grad_norm": 0.9519350528717041, + "learning_rate": 3.78419375952475e-05, + "loss": 0.9584, + "step": 16040 + }, + { + "epoch": 2.8559472934472936, + "grad_norm": 0.825547456741333, + "learning_rate": 3.783097328166111e-05, + "loss": 0.9279, + "step": 16041 + }, + { + "epoch": 2.8561253561253563, + "grad_norm": 0.8645279407501221, + "learning_rate": 3.782001018614763e-05, + "loss": 0.83, + "step": 16042 + }, + { + "epoch": 2.8563034188034186, + "grad_norm": 
0.827126145362854, + "learning_rate": 3.7809048308921936e-05, + "loss": 0.7661, + "step": 16043 + }, + { + "epoch": 2.8564814814814814, + "grad_norm": 0.9441137909889221, + "learning_rate": 3.779808765019869e-05, + "loss": 0.8745, + "step": 16044 + }, + { + "epoch": 2.856659544159544, + "grad_norm": 0.8505343794822693, + "learning_rate": 3.7787128210192736e-05, + "loss": 0.8176, + "step": 16045 + }, + { + "epoch": 2.856837606837607, + "grad_norm": 0.8797150254249573, + "learning_rate": 3.777616998911876e-05, + "loss": 0.7018, + "step": 16046 + }, + { + "epoch": 2.8570156695156697, + "grad_norm": 0.8386834263801575, + "learning_rate": 3.776521298719144e-05, + "loss": 0.9805, + "step": 16047 + }, + { + "epoch": 2.857193732193732, + "grad_norm": 0.818373441696167, + "learning_rate": 3.775425720462558e-05, + "loss": 0.7782, + "step": 16048 + }, + { + "epoch": 2.8573717948717947, + "grad_norm": 0.760405957698822, + "learning_rate": 3.774330264163566e-05, + "loss": 0.7283, + "step": 16049 + }, + { + "epoch": 2.8575498575498575, + "grad_norm": 0.9082552194595337, + "learning_rate": 3.7732349298436465e-05, + "loss": 0.7594, + "step": 16050 + }, + { + "epoch": 2.85772792022792, + "grad_norm": 0.859868586063385, + "learning_rate": 3.7721397175242477e-05, + "loss": 0.7841, + "step": 16051 + }, + { + "epoch": 2.857905982905983, + "grad_norm": 1.031545877456665, + "learning_rate": 3.771044627226836e-05, + "loss": 1.2157, + "step": 16052 + }, + { + "epoch": 2.8580840455840457, + "grad_norm": 0.859491765499115, + "learning_rate": 3.769949658972867e-05, + "loss": 0.7942, + "step": 16053 + }, + { + "epoch": 2.8582621082621085, + "grad_norm": 0.775382936000824, + "learning_rate": 3.768854812783791e-05, + "loss": 0.7321, + "step": 16054 + }, + { + "epoch": 2.8584401709401708, + "grad_norm": 0.9268868565559387, + "learning_rate": 3.767760088681062e-05, + "loss": 1.0104, + "step": 16055 + }, + { + "epoch": 2.8586182336182335, + "grad_norm": 0.8408828973770142, + "learning_rate": 
3.7666654866861274e-05, + "loss": 0.5298, + "step": 16056 + }, + { + "epoch": 2.8587962962962963, + "grad_norm": 0.8417157530784607, + "learning_rate": 3.76557100682043e-05, + "loss": 0.9123, + "step": 16057 + }, + { + "epoch": 2.858974358974359, + "grad_norm": 0.8079593777656555, + "learning_rate": 3.764476649105425e-05, + "loss": 0.794, + "step": 16058 + }, + { + "epoch": 2.859152421652422, + "grad_norm": 0.9449031352996826, + "learning_rate": 3.763382413562541e-05, + "loss": 1.1281, + "step": 16059 + }, + { + "epoch": 2.859330484330484, + "grad_norm": 0.8985004425048828, + "learning_rate": 3.762288300213228e-05, + "loss": 0.929, + "step": 16060 + }, + { + "epoch": 2.859508547008547, + "grad_norm": 0.9850391149520874, + "learning_rate": 3.761194309078913e-05, + "loss": 0.9796, + "step": 16061 + }, + { + "epoch": 2.8596866096866096, + "grad_norm": 0.9231089949607849, + "learning_rate": 3.760100440181038e-05, + "loss": 0.6981, + "step": 16062 + }, + { + "epoch": 2.8598646723646723, + "grad_norm": 0.8458681702613831, + "learning_rate": 3.759006693541033e-05, + "loss": 0.9502, + "step": 16063 + }, + { + "epoch": 2.860042735042735, + "grad_norm": 0.8494541645050049, + "learning_rate": 3.7579130691803266e-05, + "loss": 0.7477, + "step": 16064 + }, + { + "epoch": 2.860220797720798, + "grad_norm": 0.879878580570221, + "learning_rate": 3.756819567120348e-05, + "loss": 0.8426, + "step": 16065 + }, + { + "epoch": 2.8603988603988606, + "grad_norm": 0.8161541223526001, + "learning_rate": 3.7557261873825155e-05, + "loss": 0.7411, + "step": 16066 + }, + { + "epoch": 2.8605769230769234, + "grad_norm": 0.9438506364822388, + "learning_rate": 3.754632929988262e-05, + "loss": 0.8494, + "step": 16067 + }, + { + "epoch": 2.8607549857549857, + "grad_norm": 0.8552418351173401, + "learning_rate": 3.753539794959002e-05, + "loss": 0.766, + "step": 16068 + }, + { + "epoch": 2.8609330484330484, + "grad_norm": 0.8670600056648254, + "learning_rate": 3.7524467823161546e-05, + "loss": 0.7462, + 
"step": 16069 + }, + { + "epoch": 2.861111111111111, + "grad_norm": 0.7906678318977356, + "learning_rate": 3.751353892081134e-05, + "loss": 0.791, + "step": 16070 + }, + { + "epoch": 2.861289173789174, + "grad_norm": 0.8461915254592896, + "learning_rate": 3.7502611242753536e-05, + "loss": 0.6979, + "step": 16071 + }, + { + "epoch": 2.861467236467236, + "grad_norm": 0.8197309970855713, + "learning_rate": 3.749168478920223e-05, + "loss": 0.8095, + "step": 16072 + }, + { + "epoch": 2.861645299145299, + "grad_norm": 0.9489047527313232, + "learning_rate": 3.7480759560371516e-05, + "loss": 0.9462, + "step": 16073 + }, + { + "epoch": 2.8618233618233617, + "grad_norm": 0.8539329767227173, + "learning_rate": 3.7469835556475405e-05, + "loss": 0.853, + "step": 16074 + }, + { + "epoch": 2.8620014245014245, + "grad_norm": 0.9104743003845215, + "learning_rate": 3.745891277772805e-05, + "loss": 0.757, + "step": 16075 + }, + { + "epoch": 2.8621794871794872, + "grad_norm": 0.8277523517608643, + "learning_rate": 3.744799122434332e-05, + "loss": 0.8073, + "step": 16076 + }, + { + "epoch": 2.86235754985755, + "grad_norm": 0.866422176361084, + "learning_rate": 3.743707089653527e-05, + "loss": 0.7201, + "step": 16077 + }, + { + "epoch": 2.8625356125356127, + "grad_norm": 0.8722748160362244, + "learning_rate": 3.742615179451787e-05, + "loss": 0.7901, + "step": 16078 + }, + { + "epoch": 2.8627136752136755, + "grad_norm": 0.74676513671875, + "learning_rate": 3.741523391850504e-05, + "loss": 0.7558, + "step": 16079 + }, + { + "epoch": 2.862891737891738, + "grad_norm": 0.7674166560173035, + "learning_rate": 3.740431726871069e-05, + "loss": 0.8699, + "step": 16080 + }, + { + "epoch": 2.8630698005698005, + "grad_norm": 0.9028998613357544, + "learning_rate": 3.739340184534871e-05, + "loss": 1.136, + "step": 16081 + }, + { + "epoch": 2.8632478632478633, + "grad_norm": 0.8240773677825928, + "learning_rate": 3.7382487648632936e-05, + "loss": 0.9357, + "step": 16082 + }, + { + "epoch": 
2.863425925925926, + "grad_norm": 0.8877659440040588, + "learning_rate": 3.737157467877731e-05, + "loss": 0.9167, + "step": 16083 + }, + { + "epoch": 2.8636039886039883, + "grad_norm": 0.9677366614341736, + "learning_rate": 3.73606629359955e-05, + "loss": 0.8342, + "step": 16084 + }, + { + "epoch": 2.863782051282051, + "grad_norm": 0.8721164464950562, + "learning_rate": 3.734975242050146e-05, + "loss": 0.9195, + "step": 16085 + }, + { + "epoch": 2.863960113960114, + "grad_norm": 0.9791151881217957, + "learning_rate": 3.733884313250879e-05, + "loss": 1.0011, + "step": 16086 + }, + { + "epoch": 2.8641381766381766, + "grad_norm": 0.7869369983673096, + "learning_rate": 3.7327935072231366e-05, + "loss": 0.7998, + "step": 16087 + }, + { + "epoch": 2.8643162393162394, + "grad_norm": 0.891656756401062, + "learning_rate": 3.731702823988287e-05, + "loss": 0.7287, + "step": 16088 + }, + { + "epoch": 2.864494301994302, + "grad_norm": 0.8720460534095764, + "learning_rate": 3.7306122635676955e-05, + "loss": 0.9492, + "step": 16089 + }, + { + "epoch": 2.864672364672365, + "grad_norm": 0.7878959774971008, + "learning_rate": 3.72952182598274e-05, + "loss": 0.7652, + "step": 16090 + }, + { + "epoch": 2.8648504273504276, + "grad_norm": 0.9350453019142151, + "learning_rate": 3.728431511254772e-05, + "loss": 0.8661, + "step": 16091 + }, + { + "epoch": 2.86502849002849, + "grad_norm": 0.7575289011001587, + "learning_rate": 3.727341319405163e-05, + "loss": 0.7943, + "step": 16092 + }, + { + "epoch": 2.8652065527065527, + "grad_norm": 0.8256776928901672, + "learning_rate": 3.7262512504552716e-05, + "loss": 1.0025, + "step": 16093 + }, + { + "epoch": 2.8653846153846154, + "grad_norm": 0.7274962067604065, + "learning_rate": 3.7251613044264536e-05, + "loss": 0.7628, + "step": 16094 + }, + { + "epoch": 2.865562678062678, + "grad_norm": 0.867734968662262, + "learning_rate": 3.7240714813400646e-05, + "loss": 0.8931, + "step": 16095 + }, + { + "epoch": 2.8657407407407405, + "grad_norm": 
0.8225845098495483, + "learning_rate": 3.722981781217458e-05, + "loss": 0.7934, + "step": 16096 + }, + { + "epoch": 2.8659188034188032, + "grad_norm": 0.8180573582649231, + "learning_rate": 3.721892204079985e-05, + "loss": 0.8119, + "step": 16097 + }, + { + "epoch": 2.866096866096866, + "grad_norm": 1.0235565900802612, + "learning_rate": 3.720802749948993e-05, + "loss": 0.8281, + "step": 16098 + }, + { + "epoch": 2.8662749287749287, + "grad_norm": 0.7290656566619873, + "learning_rate": 3.719713418845823e-05, + "loss": 0.7099, + "step": 16099 + }, + { + "epoch": 2.8664529914529915, + "grad_norm": 0.8408772349357605, + "learning_rate": 3.718624210791828e-05, + "loss": 0.8631, + "step": 16100 + }, + { + "epoch": 2.8666310541310542, + "grad_norm": 0.8182529807090759, + "learning_rate": 3.717535125808338e-05, + "loss": 0.7584, + "step": 16101 + }, + { + "epoch": 2.866809116809117, + "grad_norm": 0.8381599187850952, + "learning_rate": 3.716446163916699e-05, + "loss": 0.8735, + "step": 16102 + }, + { + "epoch": 2.8669871794871797, + "grad_norm": 0.8975555300712585, + "learning_rate": 3.715357325138245e-05, + "loss": 0.7564, + "step": 16103 + }, + { + "epoch": 2.867165242165242, + "grad_norm": 0.9531118869781494, + "learning_rate": 3.714268609494309e-05, + "loss": 0.7627, + "step": 16104 + }, + { + "epoch": 2.867343304843305, + "grad_norm": 0.853065550327301, + "learning_rate": 3.7131800170062216e-05, + "loss": 0.8001, + "step": 16105 + }, + { + "epoch": 2.8675213675213675, + "grad_norm": 0.788351833820343, + "learning_rate": 3.7120915476953085e-05, + "loss": 0.7935, + "step": 16106 + }, + { + "epoch": 2.8676994301994303, + "grad_norm": 0.9121149778366089, + "learning_rate": 3.711003201582908e-05, + "loss": 0.9212, + "step": 16107 + }, + { + "epoch": 2.8678774928774926, + "grad_norm": 0.8156226277351379, + "learning_rate": 3.7099149786903263e-05, + "loss": 0.86, + "step": 16108 + }, + { + "epoch": 2.8680555555555554, + "grad_norm": 0.8555662035942078, + "learning_rate": 
3.708826879038899e-05, + "loss": 0.8872, + "step": 16109 + }, + { + "epoch": 2.868233618233618, + "grad_norm": 1.0395163297653198, + "learning_rate": 3.70773890264994e-05, + "loss": 0.9794, + "step": 16110 + }, + { + "epoch": 2.868411680911681, + "grad_norm": 0.7535551190376282, + "learning_rate": 3.706651049544766e-05, + "loss": 0.91, + "step": 16111 + }, + { + "epoch": 2.8685897435897436, + "grad_norm": 1.0145034790039062, + "learning_rate": 3.705563319744691e-05, + "loss": 0.9386, + "step": 16112 + }, + { + "epoch": 2.8687678062678064, + "grad_norm": 0.8577025532722473, + "learning_rate": 3.704475713271029e-05, + "loss": 1.0491, + "step": 16113 + }, + { + "epoch": 2.868945868945869, + "grad_norm": 0.8291150331497192, + "learning_rate": 3.7033882301450815e-05, + "loss": 0.8418, + "step": 16114 + }, + { + "epoch": 2.869123931623932, + "grad_norm": 0.7628613710403442, + "learning_rate": 3.70230087038817e-05, + "loss": 0.681, + "step": 16115 + }, + { + "epoch": 2.869301994301994, + "grad_norm": 0.8664639592170715, + "learning_rate": 3.701213634021583e-05, + "loss": 1.0247, + "step": 16116 + }, + { + "epoch": 2.869480056980057, + "grad_norm": 0.9613258838653564, + "learning_rate": 3.700126521066635e-05, + "loss": 1.1058, + "step": 16117 + }, + { + "epoch": 2.8696581196581197, + "grad_norm": 0.8279051780700684, + "learning_rate": 3.699039531544619e-05, + "loss": 0.859, + "step": 16118 + }, + { + "epoch": 2.8698361823361824, + "grad_norm": 0.8285593390464783, + "learning_rate": 3.697952665476836e-05, + "loss": 0.9008, + "step": 16119 + }, + { + "epoch": 2.870014245014245, + "grad_norm": 0.9056670069694519, + "learning_rate": 3.696865922884578e-05, + "loss": 0.965, + "step": 16120 + }, + { + "epoch": 2.8701923076923075, + "grad_norm": 0.7092664837837219, + "learning_rate": 3.69577930378914e-05, + "loss": 0.7417, + "step": 16121 + }, + { + "epoch": 2.8703703703703702, + "grad_norm": 0.8726393580436707, + "learning_rate": 3.6946928082118096e-05, + "loss": 0.8582, + 
"step": 16122 + }, + { + "epoch": 2.870548433048433, + "grad_norm": 1.0046098232269287, + "learning_rate": 3.693606436173875e-05, + "loss": 0.9616, + "step": 16123 + }, + { + "epoch": 2.8707264957264957, + "grad_norm": 0.7739760875701904, + "learning_rate": 3.69252018769662e-05, + "loss": 0.8775, + "step": 16124 + }, + { + "epoch": 2.8709045584045585, + "grad_norm": 0.9054580926895142, + "learning_rate": 3.6914340628013344e-05, + "loss": 0.9205, + "step": 16125 + }, + { + "epoch": 2.8710826210826212, + "grad_norm": 0.8324142694473267, + "learning_rate": 3.690348061509288e-05, + "loss": 0.8712, + "step": 16126 + }, + { + "epoch": 2.871260683760684, + "grad_norm": 0.9162326455116272, + "learning_rate": 3.6892621838417664e-05, + "loss": 0.9102, + "step": 16127 + }, + { + "epoch": 2.8714387464387463, + "grad_norm": 0.8579963445663452, + "learning_rate": 3.688176429820044e-05, + "loss": 0.8337, + "step": 16128 + }, + { + "epoch": 2.871616809116809, + "grad_norm": 0.7649274468421936, + "learning_rate": 3.687090799465388e-05, + "loss": 0.6982, + "step": 16129 + }, + { + "epoch": 2.871794871794872, + "grad_norm": 0.9612696766853333, + "learning_rate": 3.6860052927990816e-05, + "loss": 0.8779, + "step": 16130 + }, + { + "epoch": 2.8719729344729346, + "grad_norm": 0.982455313205719, + "learning_rate": 3.6849199098423795e-05, + "loss": 0.8145, + "step": 16131 + }, + { + "epoch": 2.8721509971509973, + "grad_norm": 0.7292434573173523, + "learning_rate": 3.6838346506165587e-05, + "loss": 0.6711, + "step": 16132 + }, + { + "epoch": 2.8723290598290596, + "grad_norm": 0.877310574054718, + "learning_rate": 3.68274951514287e-05, + "loss": 0.7682, + "step": 16133 + }, + { + "epoch": 2.8725071225071224, + "grad_norm": 0.9633384943008423, + "learning_rate": 3.681664503442586e-05, + "loss": 1.0046, + "step": 16134 + }, + { + "epoch": 2.872685185185185, + "grad_norm": 0.940661609172821, + "learning_rate": 3.680579615536961e-05, + "loss": 0.897, + "step": 16135 + }, + { + "epoch": 
2.872863247863248, + "grad_norm": 1.0335214138031006, + "learning_rate": 3.6794948514472505e-05, + "loss": 0.8836, + "step": 16136 + }, + { + "epoch": 2.8730413105413106, + "grad_norm": 0.8682044148445129, + "learning_rate": 3.6784102111947084e-05, + "loss": 0.7733, + "step": 16137 + }, + { + "epoch": 2.8732193732193734, + "grad_norm": 0.8767847418785095, + "learning_rate": 3.677325694800586e-05, + "loss": 0.8828, + "step": 16138 + }, + { + "epoch": 2.873397435897436, + "grad_norm": 0.954585075378418, + "learning_rate": 3.6762413022861305e-05, + "loss": 0.9294, + "step": 16139 + }, + { + "epoch": 2.8735754985754984, + "grad_norm": 0.8497310876846313, + "learning_rate": 3.675157033672596e-05, + "loss": 0.9396, + "step": 16140 + }, + { + "epoch": 2.873753561253561, + "grad_norm": 0.7619023323059082, + "learning_rate": 3.674072888981214e-05, + "loss": 0.7467, + "step": 16141 + }, + { + "epoch": 2.873931623931624, + "grad_norm": 0.9939205646514893, + "learning_rate": 3.6729888682332394e-05, + "loss": 0.9122, + "step": 16142 + }, + { + "epoch": 2.8741096866096867, + "grad_norm": 0.943192183971405, + "learning_rate": 3.671904971449899e-05, + "loss": 0.8878, + "step": 16143 + }, + { + "epoch": 2.8742877492877494, + "grad_norm": 0.8002169728279114, + "learning_rate": 3.6708211986524365e-05, + "loss": 0.8337, + "step": 16144 + }, + { + "epoch": 2.8744658119658117, + "grad_norm": 0.7558008432388306, + "learning_rate": 3.669737549862087e-05, + "loss": 0.6592, + "step": 16145 + }, + { + "epoch": 2.8746438746438745, + "grad_norm": 0.8732983469963074, + "learning_rate": 3.6686540251000756e-05, + "loss": 0.9166, + "step": 16146 + }, + { + "epoch": 2.8748219373219372, + "grad_norm": 0.8272808194160461, + "learning_rate": 3.667570624387643e-05, + "loss": 0.6342, + "step": 16147 + }, + { + "epoch": 2.875, + "grad_norm": 0.8012139797210693, + "learning_rate": 3.666487347746004e-05, + "loss": 0.8436, + "step": 16148 + }, + { + "epoch": 2.8751780626780628, + "grad_norm": 
0.8791360259056091, + "learning_rate": 3.66540419519639e-05, + "loss": 0.8634, + "step": 16149 + }, + { + "epoch": 2.8753561253561255, + "grad_norm": 0.8274601101875305, + "learning_rate": 3.6643211667600244e-05, + "loss": 0.7488, + "step": 16150 + }, + { + "epoch": 2.8755341880341883, + "grad_norm": 0.8390287756919861, + "learning_rate": 3.663238262458122e-05, + "loss": 0.7084, + "step": 16151 + }, + { + "epoch": 2.8757122507122506, + "grad_norm": 0.921089768409729, + "learning_rate": 3.662155482311903e-05, + "loss": 0.8909, + "step": 16152 + }, + { + "epoch": 2.8758903133903133, + "grad_norm": 0.8790102601051331, + "learning_rate": 3.661072826342583e-05, + "loss": 0.8235, + "step": 16153 + }, + { + "epoch": 2.876068376068376, + "grad_norm": 0.8030495643615723, + "learning_rate": 3.659990294571368e-05, + "loss": 0.874, + "step": 16154 + }, + { + "epoch": 2.876246438746439, + "grad_norm": 0.9690510034561157, + "learning_rate": 3.6589078870194804e-05, + "loss": 1.0926, + "step": 16155 + }, + { + "epoch": 2.8764245014245016, + "grad_norm": 0.8150941133499146, + "learning_rate": 3.657825603708114e-05, + "loss": 0.861, + "step": 16156 + }, + { + "epoch": 2.876602564102564, + "grad_norm": 0.8865286111831665, + "learning_rate": 3.656743444658486e-05, + "loss": 0.8219, + "step": 16157 + }, + { + "epoch": 2.8767806267806266, + "grad_norm": 0.8591124415397644, + "learning_rate": 3.655661409891786e-05, + "loss": 0.889, + "step": 16158 + }, + { + "epoch": 2.8769586894586894, + "grad_norm": 0.8625560402870178, + "learning_rate": 3.6545794994292256e-05, + "loss": 0.9581, + "step": 16159 + }, + { + "epoch": 2.877136752136752, + "grad_norm": 0.8699239492416382, + "learning_rate": 3.653497713291999e-05, + "loss": 0.7275, + "step": 16160 + }, + { + "epoch": 2.877314814814815, + "grad_norm": 0.9005762338638306, + "learning_rate": 3.652416051501301e-05, + "loss": 0.8894, + "step": 16161 + }, + { + "epoch": 2.8774928774928776, + "grad_norm": 0.7541293501853943, + "learning_rate": 
3.651334514078323e-05, + "loss": 0.7265, + "step": 16162 + }, + { + "epoch": 2.8776709401709404, + "grad_norm": 0.9799004793167114, + "learning_rate": 3.650253101044258e-05, + "loss": 0.8817, + "step": 16163 + }, + { + "epoch": 2.8778490028490027, + "grad_norm": 0.7796139717102051, + "learning_rate": 3.64917181242029e-05, + "loss": 0.7105, + "step": 16164 + }, + { + "epoch": 2.8780270655270654, + "grad_norm": 0.8818691968917847, + "learning_rate": 3.648090648227613e-05, + "loss": 0.8868, + "step": 16165 + }, + { + "epoch": 2.878205128205128, + "grad_norm": 0.7982428073883057, + "learning_rate": 3.647009608487399e-05, + "loss": 0.9269, + "step": 16166 + }, + { + "epoch": 2.878383190883191, + "grad_norm": 0.9602195620536804, + "learning_rate": 3.645928693220838e-05, + "loss": 0.8264, + "step": 16167 + }, + { + "epoch": 2.8785612535612537, + "grad_norm": 0.8941618800163269, + "learning_rate": 3.6448479024491054e-05, + "loss": 0.8009, + "step": 16168 + }, + { + "epoch": 2.878739316239316, + "grad_norm": 0.7777221202850342, + "learning_rate": 3.643767236193375e-05, + "loss": 0.7734, + "step": 16169 + }, + { + "epoch": 2.8789173789173788, + "grad_norm": 0.8050239086151123, + "learning_rate": 3.642686694474823e-05, + "loss": 0.9732, + "step": 16170 + }, + { + "epoch": 2.8790954415954415, + "grad_norm": 0.8437817096710205, + "learning_rate": 3.6416062773146156e-05, + "loss": 0.8161, + "step": 16171 + }, + { + "epoch": 2.8792735042735043, + "grad_norm": 0.7887414693832397, + "learning_rate": 3.64052598473393e-05, + "loss": 0.9165, + "step": 16172 + }, + { + "epoch": 2.879451566951567, + "grad_norm": 0.8671287894248962, + "learning_rate": 3.639445816753921e-05, + "loss": 0.9519, + "step": 16173 + }, + { + "epoch": 2.8796296296296298, + "grad_norm": 0.8444932103157043, + "learning_rate": 3.638365773395763e-05, + "loss": 0.784, + "step": 16174 + }, + { + "epoch": 2.8798076923076925, + "grad_norm": 0.8580447435379028, + "learning_rate": 3.637285854680612e-05, + "loss": 0.797, + 
"step": 16175 + }, + { + "epoch": 2.879985754985755, + "grad_norm": 0.7977848052978516, + "learning_rate": 3.636206060629627e-05, + "loss": 0.8552, + "step": 16176 + }, + { + "epoch": 2.8801638176638176, + "grad_norm": 0.8350155353546143, + "learning_rate": 3.6351263912639644e-05, + "loss": 0.9481, + "step": 16177 + }, + { + "epoch": 2.8803418803418803, + "grad_norm": 0.8759897351264954, + "learning_rate": 3.634046846604778e-05, + "loss": 0.9972, + "step": 16178 + }, + { + "epoch": 2.880519943019943, + "grad_norm": 0.8259425163269043, + "learning_rate": 3.6329674266732194e-05, + "loss": 0.8947, + "step": 16179 + }, + { + "epoch": 2.880698005698006, + "grad_norm": 0.7992371320724487, + "learning_rate": 3.631888131490438e-05, + "loss": 0.8346, + "step": 16180 + }, + { + "epoch": 2.880876068376068, + "grad_norm": 0.9702637195587158, + "learning_rate": 3.6308089610775775e-05, + "loss": 0.9543, + "step": 16181 + }, + { + "epoch": 2.881054131054131, + "grad_norm": 0.8494347929954529, + "learning_rate": 3.6297299154557905e-05, + "loss": 0.8672, + "step": 16182 + }, + { + "epoch": 2.8812321937321936, + "grad_norm": 0.8098355531692505, + "learning_rate": 3.628650994646207e-05, + "loss": 0.8356, + "step": 16183 + }, + { + "epoch": 2.8814102564102564, + "grad_norm": 0.7736916542053223, + "learning_rate": 3.627572198669974e-05, + "loss": 0.849, + "step": 16184 + }, + { + "epoch": 2.881588319088319, + "grad_norm": 0.806710422039032, + "learning_rate": 3.626493527548226e-05, + "loss": 0.9479, + "step": 16185 + }, + { + "epoch": 2.881766381766382, + "grad_norm": 1.0013649463653564, + "learning_rate": 3.625414981302095e-05, + "loss": 0.967, + "step": 16186 + }, + { + "epoch": 2.8819444444444446, + "grad_norm": 0.8116905093193054, + "learning_rate": 3.624336559952723e-05, + "loss": 0.8608, + "step": 16187 + }, + { + "epoch": 2.8821225071225074, + "grad_norm": 0.7848439812660217, + "learning_rate": 3.6232582635212233e-05, + "loss": 0.9224, + "step": 16188 + }, + { + "epoch": 
2.8823005698005697, + "grad_norm": 0.9881102442741394, + "learning_rate": 3.622180092028735e-05, + "loss": 0.8089, + "step": 16189 + }, + { + "epoch": 2.8824786324786325, + "grad_norm": 0.790452778339386, + "learning_rate": 3.6211020454963796e-05, + "loss": 0.8312, + "step": 16190 + }, + { + "epoch": 2.882656695156695, + "grad_norm": 0.8228929042816162, + "learning_rate": 3.6200241239452784e-05, + "loss": 0.8277, + "step": 16191 + }, + { + "epoch": 2.882834757834758, + "grad_norm": 0.871133029460907, + "learning_rate": 3.618946327396552e-05, + "loss": 0.8861, + "step": 16192 + }, + { + "epoch": 2.8830128205128203, + "grad_norm": 0.8964230418205261, + "learning_rate": 3.617868655871316e-05, + "loss": 0.9987, + "step": 16193 + }, + { + "epoch": 2.883190883190883, + "grad_norm": 0.8292636275291443, + "learning_rate": 3.6167911093906856e-05, + "loss": 0.9823, + "step": 16194 + }, + { + "epoch": 2.8833689458689458, + "grad_norm": 0.9594070315361023, + "learning_rate": 3.615713687975774e-05, + "loss": 0.7963, + "step": 16195 + }, + { + "epoch": 2.8835470085470085, + "grad_norm": 1.014891266822815, + "learning_rate": 3.6146363916476864e-05, + "loss": 0.9203, + "step": 16196 + }, + { + "epoch": 2.8837250712250713, + "grad_norm": 0.8205485343933105, + "learning_rate": 3.6135592204275424e-05, + "loss": 0.894, + "step": 16197 + }, + { + "epoch": 2.883903133903134, + "grad_norm": 0.9057072997093201, + "learning_rate": 3.6124821743364315e-05, + "loss": 0.9131, + "step": 16198 + }, + { + "epoch": 2.8840811965811968, + "grad_norm": 0.8080529570579529, + "learning_rate": 3.6114052533954665e-05, + "loss": 0.9995, + "step": 16199 + }, + { + "epoch": 2.8842592592592595, + "grad_norm": 0.762264609336853, + "learning_rate": 3.6103284576257446e-05, + "loss": 0.915, + "step": 16200 + }, + { + "epoch": 2.884437321937322, + "grad_norm": 0.8386275172233582, + "learning_rate": 3.609251787048363e-05, + "loss": 0.9016, + "step": 16201 + }, + { + "epoch": 2.8846153846153846, + "grad_norm": 
0.9676657319068909, + "learning_rate": 3.608175241684417e-05, + "loss": 1.1004, + "step": 16202 + }, + { + "epoch": 2.8847934472934473, + "grad_norm": 0.8155630826950073, + "learning_rate": 3.607098821554999e-05, + "loss": 0.8579, + "step": 16203 + }, + { + "epoch": 2.88497150997151, + "grad_norm": 0.8426685333251953, + "learning_rate": 3.606022526681201e-05, + "loss": 0.8722, + "step": 16204 + }, + { + "epoch": 2.8851495726495724, + "grad_norm": 0.7646408081054688, + "learning_rate": 3.604946357084105e-05, + "loss": 0.7806, + "step": 16205 + }, + { + "epoch": 2.885327635327635, + "grad_norm": 0.808560848236084, + "learning_rate": 3.603870312784803e-05, + "loss": 0.8419, + "step": 16206 + }, + { + "epoch": 2.885505698005698, + "grad_norm": 0.7197920083999634, + "learning_rate": 3.602794393804376e-05, + "loss": 0.7356, + "step": 16207 + }, + { + "epoch": 2.8856837606837606, + "grad_norm": 0.9405228495597839, + "learning_rate": 3.6017186001639036e-05, + "loss": 0.7279, + "step": 16208 + }, + { + "epoch": 2.8858618233618234, + "grad_norm": 0.8910958170890808, + "learning_rate": 3.600642931884465e-05, + "loss": 0.7653, + "step": 16209 + }, + { + "epoch": 2.886039886039886, + "grad_norm": 0.7354677319526672, + "learning_rate": 3.599567388987134e-05, + "loss": 0.6884, + "step": 16210 + }, + { + "epoch": 2.886217948717949, + "grad_norm": 0.75583815574646, + "learning_rate": 3.598491971492981e-05, + "loss": 0.7555, + "step": 16211 + }, + { + "epoch": 2.8863960113960117, + "grad_norm": 0.839042603969574, + "learning_rate": 3.597416679423086e-05, + "loss": 1.0522, + "step": 16212 + }, + { + "epoch": 2.886574074074074, + "grad_norm": 0.7965270280838013, + "learning_rate": 3.596341512798505e-05, + "loss": 0.9101, + "step": 16213 + }, + { + "epoch": 2.8867521367521367, + "grad_norm": 0.9170811772346497, + "learning_rate": 3.5952664716403154e-05, + "loss": 0.9078, + "step": 16214 + }, + { + "epoch": 2.8869301994301995, + "grad_norm": 0.8612267374992371, + "learning_rate": 
3.5941915559695685e-05, + "loss": 0.8615, + "step": 16215 + }, + { + "epoch": 2.887108262108262, + "grad_norm": 0.9182412028312683, + "learning_rate": 3.593116765807335e-05, + "loss": 0.8918, + "step": 16216 + }, + { + "epoch": 2.8872863247863245, + "grad_norm": 0.9452505707740784, + "learning_rate": 3.592042101174669e-05, + "loss": 1.0386, + "step": 16217 + }, + { + "epoch": 2.8874643874643873, + "grad_norm": 0.8544769287109375, + "learning_rate": 3.5909675620926255e-05, + "loss": 1.0305, + "step": 16218 + }, + { + "epoch": 2.88764245014245, + "grad_norm": 0.8184592127799988, + "learning_rate": 3.5898931485822605e-05, + "loss": 0.6815, + "step": 16219 + }, + { + "epoch": 2.8878205128205128, + "grad_norm": 0.8263654708862305, + "learning_rate": 3.5888188606646224e-05, + "loss": 1.0097, + "step": 16220 + }, + { + "epoch": 2.8879985754985755, + "grad_norm": 0.8290582299232483, + "learning_rate": 3.587744698360758e-05, + "loss": 0.6539, + "step": 16221 + }, + { + "epoch": 2.8881766381766383, + "grad_norm": 0.7936849594116211, + "learning_rate": 3.5866706616917226e-05, + "loss": 0.9524, + "step": 16222 + }, + { + "epoch": 2.888354700854701, + "grad_norm": 0.9449033737182617, + "learning_rate": 3.585596750678546e-05, + "loss": 0.9243, + "step": 16223 + }, + { + "epoch": 2.888532763532764, + "grad_norm": 0.7599559426307678, + "learning_rate": 3.58452296534228e-05, + "loss": 0.8597, + "step": 16224 + }, + { + "epoch": 2.888710826210826, + "grad_norm": 0.8485760688781738, + "learning_rate": 3.583449305703959e-05, + "loss": 0.7236, + "step": 16225 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.8510624170303345, + "learning_rate": 3.582375771784616e-05, + "loss": 0.9081, + "step": 16226 + }, + { + "epoch": 2.8890669515669516, + "grad_norm": 0.815827488899231, + "learning_rate": 3.581302363605296e-05, + "loss": 0.8259, + "step": 16227 + }, + { + "epoch": 2.8892450142450143, + "grad_norm": 0.7588803768157959, + "learning_rate": 3.580229081187016e-05, + "loss": 0.8722, 
+ "step": 16228 + }, + { + "epoch": 2.8894230769230766, + "grad_norm": 1.0699365139007568, + "learning_rate": 3.579155924550817e-05, + "loss": 1.0261, + "step": 16229 + }, + { + "epoch": 2.8896011396011394, + "grad_norm": 0.9127700924873352, + "learning_rate": 3.5780828937177126e-05, + "loss": 0.8993, + "step": 16230 + }, + { + "epoch": 2.889779202279202, + "grad_norm": 0.8101344108581543, + "learning_rate": 3.577009988708737e-05, + "loss": 1.0002, + "step": 16231 + }, + { + "epoch": 2.889957264957265, + "grad_norm": 0.8259516358375549, + "learning_rate": 3.5759372095449085e-05, + "loss": 0.9223, + "step": 16232 + }, + { + "epoch": 2.8901353276353277, + "grad_norm": 0.8366032242774963, + "learning_rate": 3.574864556247246e-05, + "loss": 0.9635, + "step": 16233 + }, + { + "epoch": 2.8903133903133904, + "grad_norm": 0.8667864203453064, + "learning_rate": 3.573792028836764e-05, + "loss": 0.9923, + "step": 16234 + }, + { + "epoch": 2.890491452991453, + "grad_norm": 0.8982881903648376, + "learning_rate": 3.5727196273344784e-05, + "loss": 0.8735, + "step": 16235 + }, + { + "epoch": 2.890669515669516, + "grad_norm": 0.8208550214767456, + "learning_rate": 3.571647351761398e-05, + "loss": 0.809, + "step": 16236 + }, + { + "epoch": 2.890847578347578, + "grad_norm": 1.0483866930007935, + "learning_rate": 3.5705752021385395e-05, + "loss": 0.9106, + "step": 16237 + }, + { + "epoch": 2.891025641025641, + "grad_norm": 0.8783283829689026, + "learning_rate": 3.5695031784868984e-05, + "loss": 0.8061, + "step": 16238 + }, + { + "epoch": 2.8912037037037037, + "grad_norm": 0.7530997395515442, + "learning_rate": 3.56843128082749e-05, + "loss": 0.58, + "step": 16239 + }, + { + "epoch": 2.8913817663817665, + "grad_norm": 1.0263420343399048, + "learning_rate": 3.567359509181304e-05, + "loss": 0.9946, + "step": 16240 + }, + { + "epoch": 2.8915598290598292, + "grad_norm": 0.7237655520439148, + "learning_rate": 3.5662878635693484e-05, + "loss": 0.6941, + "step": 16241 + }, + { + "epoch": 
2.8917378917378915, + "grad_norm": 0.8415992856025696, + "learning_rate": 3.565216344012618e-05, + "loss": 0.7908, + "step": 16242 + }, + { + "epoch": 2.8919159544159543, + "grad_norm": 0.8285344243049622, + "learning_rate": 3.564144950532107e-05, + "loss": 0.8044, + "step": 16243 + }, + { + "epoch": 2.892094017094017, + "grad_norm": 0.9715448617935181, + "learning_rate": 3.5630736831488046e-05, + "loss": 0.8236, + "step": 16244 + }, + { + "epoch": 2.89227207977208, + "grad_norm": 0.8461307287216187, + "learning_rate": 3.5620025418836985e-05, + "loss": 0.9222, + "step": 16245 + }, + { + "epoch": 2.8924501424501425, + "grad_norm": 0.7191013097763062, + "learning_rate": 3.5609315267577836e-05, + "loss": 0.6089, + "step": 16246 + }, + { + "epoch": 2.8926282051282053, + "grad_norm": 0.8537501692771912, + "learning_rate": 3.559860637792038e-05, + "loss": 0.8771, + "step": 16247 + }, + { + "epoch": 2.892806267806268, + "grad_norm": 0.8684942126274109, + "learning_rate": 3.558789875007447e-05, + "loss": 0.9009, + "step": 16248 + }, + { + "epoch": 2.8929843304843303, + "grad_norm": 1.0619043111801147, + "learning_rate": 3.5577192384249856e-05, + "loss": 0.7283, + "step": 16249 + }, + { + "epoch": 2.893162393162393, + "grad_norm": 0.8889201879501343, + "learning_rate": 3.556648728065635e-05, + "loss": 0.7538, + "step": 16250 + }, + { + "epoch": 2.893340455840456, + "grad_norm": 0.8162542581558228, + "learning_rate": 3.555578343950367e-05, + "loss": 0.8338, + "step": 16251 + }, + { + "epoch": 2.8935185185185186, + "grad_norm": 0.8750036954879761, + "learning_rate": 3.5545080861001535e-05, + "loss": 0.6956, + "step": 16252 + }, + { + "epoch": 2.8936965811965814, + "grad_norm": 0.911232054233551, + "learning_rate": 3.553437954535962e-05, + "loss": 0.7559, + "step": 16253 + }, + { + "epoch": 2.8938746438746437, + "grad_norm": 0.889566957950592, + "learning_rate": 3.5523679492787685e-05, + "loss": 0.969, + "step": 16254 + }, + { + "epoch": 2.8940527065527064, + "grad_norm": 
0.933595597743988, + "learning_rate": 3.551298070349525e-05, + "loss": 0.9767, + "step": 16255 + }, + { + "epoch": 2.894230769230769, + "grad_norm": 0.8633596897125244, + "learning_rate": 3.550228317769203e-05, + "loss": 0.7823, + "step": 16256 + }, + { + "epoch": 2.894408831908832, + "grad_norm": 0.8595561981201172, + "learning_rate": 3.5491586915587585e-05, + "loss": 0.9583, + "step": 16257 + }, + { + "epoch": 2.8945868945868947, + "grad_norm": 0.7595796585083008, + "learning_rate": 3.548089191739149e-05, + "loss": 0.7167, + "step": 16258 + }, + { + "epoch": 2.8947649572649574, + "grad_norm": 0.8662711381912231, + "learning_rate": 3.54701981833133e-05, + "loss": 0.9439, + "step": 16259 + }, + { + "epoch": 2.89494301994302, + "grad_norm": 0.8028330206871033, + "learning_rate": 3.5459505713562525e-05, + "loss": 0.7442, + "step": 16260 + }, + { + "epoch": 2.8951210826210825, + "grad_norm": 0.9413794279098511, + "learning_rate": 3.5448814508348616e-05, + "loss": 0.97, + "step": 16261 + }, + { + "epoch": 2.8952991452991452, + "grad_norm": 0.9300761222839355, + "learning_rate": 3.543812456788118e-05, + "loss": 0.9294, + "step": 16262 + }, + { + "epoch": 2.895477207977208, + "grad_norm": 0.7364256381988525, + "learning_rate": 3.54274358923695e-05, + "loss": 0.5352, + "step": 16263 + }, + { + "epoch": 2.8956552706552707, + "grad_norm": 0.8452964425086975, + "learning_rate": 3.541674848202314e-05, + "loss": 0.7362, + "step": 16264 + }, + { + "epoch": 2.8958333333333335, + "grad_norm": 0.8745927214622498, + "learning_rate": 3.540606233705137e-05, + "loss": 1.0222, + "step": 16265 + }, + { + "epoch": 2.896011396011396, + "grad_norm": 0.9202282428741455, + "learning_rate": 3.539537745766367e-05, + "loss": 1.0853, + "step": 16266 + }, + { + "epoch": 2.8961894586894585, + "grad_norm": 0.8450053930282593, + "learning_rate": 3.538469384406933e-05, + "loss": 0.9295, + "step": 16267 + }, + { + "epoch": 2.8963675213675213, + "grad_norm": 0.8761671781539917, + "learning_rate": 
3.5374011496477656e-05, + "loss": 0.9273, + "step": 16268 + }, + { + "epoch": 2.896545584045584, + "grad_norm": 0.8562198281288147, + "learning_rate": 3.536333041509805e-05, + "loss": 0.7654, + "step": 16269 + }, + { + "epoch": 2.896723646723647, + "grad_norm": 0.9013510942459106, + "learning_rate": 3.535265060013965e-05, + "loss": 0.7432, + "step": 16270 + }, + { + "epoch": 2.8969017094017095, + "grad_norm": 1.1163274049758911, + "learning_rate": 3.534197205181179e-05, + "loss": 1.1223, + "step": 16271 + }, + { + "epoch": 2.8970797720797723, + "grad_norm": 0.869686484336853, + "learning_rate": 3.5331294770323674e-05, + "loss": 0.9255, + "step": 16272 + }, + { + "epoch": 2.8972578347578346, + "grad_norm": 0.8521125316619873, + "learning_rate": 3.53206187558845e-05, + "loss": 1.0221, + "step": 16273 + }, + { + "epoch": 2.8974358974358974, + "grad_norm": 0.9807026982307434, + "learning_rate": 3.530994400870345e-05, + "loss": 1.0133, + "step": 16274 + }, + { + "epoch": 2.89761396011396, + "grad_norm": 0.9236428141593933, + "learning_rate": 3.529927052898967e-05, + "loss": 0.8711, + "step": 16275 + }, + { + "epoch": 2.897792022792023, + "grad_norm": 0.8108885884284973, + "learning_rate": 3.528859831695227e-05, + "loss": 0.917, + "step": 16276 + }, + { + "epoch": 2.8979700854700856, + "grad_norm": 0.7522720098495483, + "learning_rate": 3.527792737280036e-05, + "loss": 0.6776, + "step": 16277 + }, + { + "epoch": 2.898148148148148, + "grad_norm": 0.9261712431907654, + "learning_rate": 3.526725769674297e-05, + "loss": 1.0728, + "step": 16278 + }, + { + "epoch": 2.8983262108262107, + "grad_norm": 0.849559485912323, + "learning_rate": 3.5256589288989285e-05, + "loss": 0.9886, + "step": 16279 + }, + { + "epoch": 2.8985042735042734, + "grad_norm": 0.9024273157119751, + "learning_rate": 3.5245922149748155e-05, + "loss": 0.9296, + "step": 16280 + }, + { + "epoch": 2.898682336182336, + "grad_norm": 0.8285173773765564, + "learning_rate": 3.52352562792287e-05, + "loss": 0.8776, + 
"step": 16281 + }, + { + "epoch": 2.898860398860399, + "grad_norm": 0.895517885684967, + "learning_rate": 3.522459167763987e-05, + "loss": 0.8975, + "step": 16282 + }, + { + "epoch": 2.8990384615384617, + "grad_norm": 0.8449265956878662, + "learning_rate": 3.521392834519061e-05, + "loss": 0.6736, + "step": 16283 + }, + { + "epoch": 2.8992165242165244, + "grad_norm": 0.8652997612953186, + "learning_rate": 3.520326628208983e-05, + "loss": 0.7834, + "step": 16284 + }, + { + "epoch": 2.8993945868945867, + "grad_norm": 0.9012393951416016, + "learning_rate": 3.519260548854642e-05, + "loss": 0.8082, + "step": 16285 + }, + { + "epoch": 2.8995726495726495, + "grad_norm": 0.9048463106155396, + "learning_rate": 3.5181945964769333e-05, + "loss": 0.8078, + "step": 16286 + }, + { + "epoch": 2.8997507122507122, + "grad_norm": 0.8788473010063171, + "learning_rate": 3.5171287710967314e-05, + "loss": 0.8022, + "step": 16287 + }, + { + "epoch": 2.899928774928775, + "grad_norm": 0.8322813510894775, + "learning_rate": 3.516063072734928e-05, + "loss": 0.8855, + "step": 16288 + }, + { + "epoch": 2.9001068376068377, + "grad_norm": 0.8762373328208923, + "learning_rate": 3.514997501412398e-05, + "loss": 0.7858, + "step": 16289 + }, + { + "epoch": 2.9002849002849, + "grad_norm": 0.7718746066093445, + "learning_rate": 3.513932057150021e-05, + "loss": 0.6881, + "step": 16290 + }, + { + "epoch": 2.900462962962963, + "grad_norm": 1.0138204097747803, + "learning_rate": 3.5128667399686724e-05, + "loss": 0.9378, + "step": 16291 + }, + { + "epoch": 2.9006410256410255, + "grad_norm": 0.6968120336532593, + "learning_rate": 3.5118015498892234e-05, + "loss": 0.7135, + "step": 16292 + }, + { + "epoch": 2.9008190883190883, + "grad_norm": 0.7925532460212708, + "learning_rate": 3.510736486932542e-05, + "loss": 0.7414, + "step": 16293 + }, + { + "epoch": 2.900997150997151, + "grad_norm": 1.0432425737380981, + "learning_rate": 3.5096715511195056e-05, + "loss": 0.8957, + "step": 16294 + }, + { + "epoch": 
2.901175213675214, + "grad_norm": 0.8664390444755554, + "learning_rate": 3.508606742470966e-05, + "loss": 0.7832, + "step": 16295 + }, + { + "epoch": 2.9013532763532766, + "grad_norm": 0.8470353484153748, + "learning_rate": 3.507542061007795e-05, + "loss": 0.8133, + "step": 16296 + }, + { + "epoch": 2.9015313390313393, + "grad_norm": 0.8339848518371582, + "learning_rate": 3.5064775067508514e-05, + "loss": 0.9275, + "step": 16297 + }, + { + "epoch": 2.9017094017094016, + "grad_norm": 0.8686776757240295, + "learning_rate": 3.5054130797209916e-05, + "loss": 0.9238, + "step": 16298 + }, + { + "epoch": 2.9018874643874644, + "grad_norm": 0.8178901076316833, + "learning_rate": 3.504348779939071e-05, + "loss": 1.0776, + "step": 16299 + }, + { + "epoch": 2.902065527065527, + "grad_norm": 0.8446379899978638, + "learning_rate": 3.5032846074259426e-05, + "loss": 0.715, + "step": 16300 + }, + { + "epoch": 2.90224358974359, + "grad_norm": 0.8320762515068054, + "learning_rate": 3.502220562202457e-05, + "loss": 0.8543, + "step": 16301 + }, + { + "epoch": 2.902421652421652, + "grad_norm": 0.9112939238548279, + "learning_rate": 3.501156644289462e-05, + "loss": 0.8239, + "step": 16302 + }, + { + "epoch": 2.902599715099715, + "grad_norm": 0.8749213218688965, + "learning_rate": 3.500092853707797e-05, + "loss": 0.8057, + "step": 16303 + }, + { + "epoch": 2.9027777777777777, + "grad_norm": 0.8245106339454651, + "learning_rate": 3.4990291904783143e-05, + "loss": 0.8955, + "step": 16304 + }, + { + "epoch": 2.9029558404558404, + "grad_norm": 0.7982145547866821, + "learning_rate": 3.4979656546218506e-05, + "loss": 0.6641, + "step": 16305 + }, + { + "epoch": 2.903133903133903, + "grad_norm": 0.8777986168861389, + "learning_rate": 3.496902246159244e-05, + "loss": 0.7365, + "step": 16306 + }, + { + "epoch": 2.903311965811966, + "grad_norm": 0.8463431596755981, + "learning_rate": 3.4958389651113275e-05, + "loss": 1.0434, + "step": 16307 + }, + { + "epoch": 2.9034900284900287, + "grad_norm": 
0.849039614200592, + "learning_rate": 3.494775811498931e-05, + "loss": 0.8813, + "step": 16308 + }, + { + "epoch": 2.9036680911680914, + "grad_norm": 0.7352656126022339, + "learning_rate": 3.4937127853428976e-05, + "loss": 0.7936, + "step": 16309 + }, + { + "epoch": 2.9038461538461537, + "grad_norm": 0.764543354511261, + "learning_rate": 3.49264988666404e-05, + "loss": 0.7253, + "step": 16310 + }, + { + "epoch": 2.9040242165242165, + "grad_norm": 0.7932603359222412, + "learning_rate": 3.491587115483196e-05, + "loss": 0.9217, + "step": 16311 + }, + { + "epoch": 2.9042022792022792, + "grad_norm": 0.9001819491386414, + "learning_rate": 3.490524471821175e-05, + "loss": 0.8628, + "step": 16312 + }, + { + "epoch": 2.904380341880342, + "grad_norm": 0.7983253002166748, + "learning_rate": 3.4894619556988085e-05, + "loss": 0.9858, + "step": 16313 + }, + { + "epoch": 2.9045584045584043, + "grad_norm": 0.747611403465271, + "learning_rate": 3.488399567136911e-05, + "loss": 0.8656, + "step": 16314 + }, + { + "epoch": 2.904736467236467, + "grad_norm": 0.8770463466644287, + "learning_rate": 3.487337306156296e-05, + "loss": 1.0154, + "step": 16315 + }, + { + "epoch": 2.90491452991453, + "grad_norm": 0.8757193088531494, + "learning_rate": 3.4862751727777797e-05, + "loss": 0.7931, + "step": 16316 + }, + { + "epoch": 2.9050925925925926, + "grad_norm": 0.8232926726341248, + "learning_rate": 3.485213167022169e-05, + "loss": 0.8477, + "step": 16317 + }, + { + "epoch": 2.9052706552706553, + "grad_norm": 0.8445250391960144, + "learning_rate": 3.48415128891027e-05, + "loss": 1.0618, + "step": 16318 + }, + { + "epoch": 2.905448717948718, + "grad_norm": 0.8172810673713684, + "learning_rate": 3.483089538462897e-05, + "loss": 0.7882, + "step": 16319 + }, + { + "epoch": 2.905626780626781, + "grad_norm": 0.910757303237915, + "learning_rate": 3.4820279157008404e-05, + "loss": 0.9383, + "step": 16320 + }, + { + "epoch": 2.9058048433048436, + "grad_norm": 0.8837474584579468, + "learning_rate": 
3.480966420644911e-05, + "loss": 1.1006, + "step": 16321 + }, + { + "epoch": 2.905982905982906, + "grad_norm": 0.7739782333374023, + "learning_rate": 3.4799050533159014e-05, + "loss": 0.6885, + "step": 16322 + }, + { + "epoch": 2.9061609686609686, + "grad_norm": 0.911738932132721, + "learning_rate": 3.478843813734609e-05, + "loss": 0.993, + "step": 16323 + }, + { + "epoch": 2.9063390313390314, + "grad_norm": 0.8834345936775208, + "learning_rate": 3.477782701921825e-05, + "loss": 0.8724, + "step": 16324 + }, + { + "epoch": 2.906517094017094, + "grad_norm": 0.8103434443473816, + "learning_rate": 3.476721717898337e-05, + "loss": 1.0517, + "step": 16325 + }, + { + "epoch": 2.9066951566951564, + "grad_norm": 0.8648924827575684, + "learning_rate": 3.475660861684943e-05, + "loss": 0.7223, + "step": 16326 + }, + { + "epoch": 2.906873219373219, + "grad_norm": 0.8762979507446289, + "learning_rate": 3.4746001333024134e-05, + "loss": 0.8421, + "step": 16327 + }, + { + "epoch": 2.907051282051282, + "grad_norm": 0.9596083164215088, + "learning_rate": 3.4735395327715434e-05, + "loss": 1.0206, + "step": 16328 + }, + { + "epoch": 2.9072293447293447, + "grad_norm": 0.8210311532020569, + "learning_rate": 3.472479060113107e-05, + "loss": 0.836, + "step": 16329 + }, + { + "epoch": 2.9074074074074074, + "grad_norm": 0.8436611890792847, + "learning_rate": 3.471418715347886e-05, + "loss": 0.8459, + "step": 16330 + }, + { + "epoch": 2.90758547008547, + "grad_norm": 0.9176212549209595, + "learning_rate": 3.470358498496652e-05, + "loss": 0.8227, + "step": 16331 + }, + { + "epoch": 2.907763532763533, + "grad_norm": 0.7709631323814392, + "learning_rate": 3.4692984095801796e-05, + "loss": 0.8096, + "step": 16332 + }, + { + "epoch": 2.9079415954415957, + "grad_norm": 0.7727495431900024, + "learning_rate": 3.4682384486192346e-05, + "loss": 0.6843, + "step": 16333 + }, + { + "epoch": 2.908119658119658, + "grad_norm": 0.9743461608886719, + "learning_rate": 3.4671786156345955e-05, + "loss": 0.7542, 
+ "step": 16334 + }, + { + "epoch": 2.9082977207977208, + "grad_norm": 0.9035171270370483, + "learning_rate": 3.466118910647014e-05, + "loss": 0.8827, + "step": 16335 + }, + { + "epoch": 2.9084757834757835, + "grad_norm": 1.1434134244918823, + "learning_rate": 3.465059333677266e-05, + "loss": 0.8026, + "step": 16336 + }, + { + "epoch": 2.9086538461538463, + "grad_norm": 0.8229905962944031, + "learning_rate": 3.4639998847461e-05, + "loss": 0.7415, + "step": 16337 + }, + { + "epoch": 2.9088319088319086, + "grad_norm": 0.9193732738494873, + "learning_rate": 3.462940563874281e-05, + "loss": 0.9773, + "step": 16338 + }, + { + "epoch": 2.9090099715099713, + "grad_norm": 0.8461189270019531, + "learning_rate": 3.4618813710825614e-05, + "loss": 0.9169, + "step": 16339 + }, + { + "epoch": 2.909188034188034, + "grad_norm": 0.9471017718315125, + "learning_rate": 3.460822306391696e-05, + "loss": 0.9941, + "step": 16340 + }, + { + "epoch": 2.909366096866097, + "grad_norm": 0.8515542149543762, + "learning_rate": 3.459763369822432e-05, + "loss": 0.8209, + "step": 16341 + }, + { + "epoch": 2.9095441595441596, + "grad_norm": 0.8520704507827759, + "learning_rate": 3.458704561395519e-05, + "loss": 0.8443, + "step": 16342 + }, + { + "epoch": 2.9097222222222223, + "grad_norm": 0.9236885905265808, + "learning_rate": 3.457645881131699e-05, + "loss": 0.8407, + "step": 16343 + }, + { + "epoch": 2.909900284900285, + "grad_norm": 0.9255889654159546, + "learning_rate": 3.4565873290517203e-05, + "loss": 0.9754, + "step": 16344 + }, + { + "epoch": 2.910078347578348, + "grad_norm": 0.7904002666473389, + "learning_rate": 3.455528905176321e-05, + "loss": 0.7123, + "step": 16345 + }, + { + "epoch": 2.91025641025641, + "grad_norm": 0.821877658367157, + "learning_rate": 3.454470609526237e-05, + "loss": 0.7024, + "step": 16346 + }, + { + "epoch": 2.910434472934473, + "grad_norm": 0.84690260887146, + "learning_rate": 3.453412442122205e-05, + "loss": 0.7551, + "step": 16347 + }, + { + "epoch": 
2.9106125356125356, + "grad_norm": 1.0308923721313477, + "learning_rate": 3.452354402984955e-05, + "loss": 1.0316, + "step": 16348 + }, + { + "epoch": 2.9107905982905984, + "grad_norm": 0.7016192674636841, + "learning_rate": 3.451296492135221e-05, + "loss": 0.6816, + "step": 16349 + }, + { + "epoch": 2.9109686609686607, + "grad_norm": 0.942915141582489, + "learning_rate": 3.4502387095937237e-05, + "loss": 0.9029, + "step": 16350 + }, + { + "epoch": 2.9111467236467234, + "grad_norm": 0.7511810660362244, + "learning_rate": 3.449181055381201e-05, + "loss": 0.641, + "step": 16351 + }, + { + "epoch": 2.911324786324786, + "grad_norm": 0.8904419541358948, + "learning_rate": 3.44812352951836e-05, + "loss": 0.8176, + "step": 16352 + }, + { + "epoch": 2.911502849002849, + "grad_norm": 0.7585623264312744, + "learning_rate": 3.447066132025931e-05, + "loss": 0.7815, + "step": 16353 + }, + { + "epoch": 2.9116809116809117, + "grad_norm": 0.8587543964385986, + "learning_rate": 3.446008862924629e-05, + "loss": 0.951, + "step": 16354 + }, + { + "epoch": 2.9118589743589745, + "grad_norm": 0.7638232707977295, + "learning_rate": 3.444951722235169e-05, + "loss": 0.6784, + "step": 16355 + }, + { + "epoch": 2.912037037037037, + "grad_norm": 0.7942266464233398, + "learning_rate": 3.4438947099782624e-05, + "loss": 0.7211, + "step": 16356 + }, + { + "epoch": 2.9122150997151, + "grad_norm": 0.7207983732223511, + "learning_rate": 3.4428378261746195e-05, + "loss": 0.7515, + "step": 16357 + }, + { + "epoch": 2.9123931623931623, + "grad_norm": 0.8683337569236755, + "learning_rate": 3.4417810708449495e-05, + "loss": 0.8724, + "step": 16358 + }, + { + "epoch": 2.912571225071225, + "grad_norm": 0.8052859902381897, + "learning_rate": 3.440724444009955e-05, + "loss": 0.8331, + "step": 16359 + }, + { + "epoch": 2.9127492877492878, + "grad_norm": 0.7425459027290344, + "learning_rate": 3.439667945690336e-05, + "loss": 0.5711, + "step": 16360 + }, + { + "epoch": 2.9129273504273505, + "grad_norm": 
0.8537404537200928, + "learning_rate": 3.438611575906803e-05, + "loss": 0.8283, + "step": 16361 + }, + { + "epoch": 2.9131054131054133, + "grad_norm": 0.794684648513794, + "learning_rate": 3.437555334680038e-05, + "loss": 0.679, + "step": 16362 + }, + { + "epoch": 2.9132834757834756, + "grad_norm": 0.8330501914024353, + "learning_rate": 3.436499222030748e-05, + "loss": 0.6892, + "step": 16363 + }, + { + "epoch": 2.9134615384615383, + "grad_norm": 0.9036495089530945, + "learning_rate": 3.435443237979621e-05, + "loss": 0.8349, + "step": 16364 + }, + { + "epoch": 2.913639601139601, + "grad_norm": 0.776745080947876, + "learning_rate": 3.434387382547344e-05, + "loss": 0.6691, + "step": 16365 + }, + { + "epoch": 2.913817663817664, + "grad_norm": 0.8921083807945251, + "learning_rate": 3.4333316557546145e-05, + "loss": 0.9497, + "step": 16366 + }, + { + "epoch": 2.9139957264957266, + "grad_norm": 0.8435728549957275, + "learning_rate": 3.4322760576221023e-05, + "loss": 0.787, + "step": 16367 + }, + { + "epoch": 2.9141737891737893, + "grad_norm": 0.7281554341316223, + "learning_rate": 3.4312205881705015e-05, + "loss": 0.7604, + "step": 16368 + }, + { + "epoch": 2.914351851851852, + "grad_norm": 0.8474677801132202, + "learning_rate": 3.430165247420488e-05, + "loss": 0.9052, + "step": 16369 + }, + { + "epoch": 2.9145299145299144, + "grad_norm": 0.9927265048027039, + "learning_rate": 3.42911003539274e-05, + "loss": 0.7486, + "step": 16370 + }, + { + "epoch": 2.914707977207977, + "grad_norm": 0.7754966020584106, + "learning_rate": 3.4280549521079286e-05, + "loss": 0.8368, + "step": 16371 + }, + { + "epoch": 2.91488603988604, + "grad_norm": 0.7774887084960938, + "learning_rate": 3.4269999975867295e-05, + "loss": 0.7014, + "step": 16372 + }, + { + "epoch": 2.9150641025641026, + "grad_norm": 0.9286267161369324, + "learning_rate": 3.4259451718498115e-05, + "loss": 1.0414, + "step": 16373 + }, + { + "epoch": 2.9152421652421654, + "grad_norm": 0.8269815444946289, + "learning_rate": 
3.424890474917841e-05, + "loss": 0.9731, + "step": 16374 + }, + { + "epoch": 2.9154202279202277, + "grad_norm": 0.8319926857948303, + "learning_rate": 3.42383590681148e-05, + "loss": 0.9047, + "step": 16375 + }, + { + "epoch": 2.9155982905982905, + "grad_norm": 0.8976882696151733, + "learning_rate": 3.4227814675514e-05, + "loss": 0.8608, + "step": 16376 + }, + { + "epoch": 2.915776353276353, + "grad_norm": 0.80831378698349, + "learning_rate": 3.421727157158248e-05, + "loss": 0.9213, + "step": 16377 + }, + { + "epoch": 2.915954415954416, + "grad_norm": 0.881201982498169, + "learning_rate": 3.42067297565269e-05, + "loss": 0.9015, + "step": 16378 + }, + { + "epoch": 2.9161324786324787, + "grad_norm": 0.7797132134437561, + "learning_rate": 3.4196189230553775e-05, + "loss": 0.9002, + "step": 16379 + }, + { + "epoch": 2.9163105413105415, + "grad_norm": 1.0163915157318115, + "learning_rate": 3.4185649993869626e-05, + "loss": 1.0185, + "step": 16380 + }, + { + "epoch": 2.916488603988604, + "grad_norm": 0.8868339657783508, + "learning_rate": 3.417511204668096e-05, + "loss": 0.7316, + "step": 16381 + }, + { + "epoch": 2.9166666666666665, + "grad_norm": 0.8215135931968689, + "learning_rate": 3.416457538919422e-05, + "loss": 0.8693, + "step": 16382 + }, + { + "epoch": 2.9168447293447293, + "grad_norm": 0.8070623278617859, + "learning_rate": 3.4154040021615876e-05, + "loss": 0.8532, + "step": 16383 + }, + { + "epoch": 2.917022792022792, + "grad_norm": 0.8857349157333374, + "learning_rate": 3.4143505944152284e-05, + "loss": 0.961, + "step": 16384 + }, + { + "epoch": 2.9172008547008548, + "grad_norm": 0.8146688342094421, + "learning_rate": 3.4132973157009936e-05, + "loss": 0.8597, + "step": 16385 + }, + { + "epoch": 2.9173789173789175, + "grad_norm": 0.8757439851760864, + "learning_rate": 3.4122441660395156e-05, + "loss": 0.7683, + "step": 16386 + }, + { + "epoch": 2.91755698005698, + "grad_norm": 0.8319665789604187, + "learning_rate": 3.4111911454514266e-05, + "loss": 0.8034, + 
"step": 16387 + }, + { + "epoch": 2.9177350427350426, + "grad_norm": 0.8103782534599304, + "learning_rate": 3.410138253957361e-05, + "loss": 0.8469, + "step": 16388 + }, + { + "epoch": 2.9179131054131053, + "grad_norm": 0.8783053755760193, + "learning_rate": 3.4090854915779466e-05, + "loss": 0.7997, + "step": 16389 + }, + { + "epoch": 2.918091168091168, + "grad_norm": 0.9793184995651245, + "learning_rate": 3.408032858333808e-05, + "loss": 0.8934, + "step": 16390 + }, + { + "epoch": 2.918269230769231, + "grad_norm": 0.8603301048278809, + "learning_rate": 3.406980354245578e-05, + "loss": 0.847, + "step": 16391 + }, + { + "epoch": 2.9184472934472936, + "grad_norm": 0.9763472080230713, + "learning_rate": 3.405927979333866e-05, + "loss": 0.9491, + "step": 16392 + }, + { + "epoch": 2.9186253561253563, + "grad_norm": 0.9340085983276367, + "learning_rate": 3.4048757336193036e-05, + "loss": 0.8913, + "step": 16393 + }, + { + "epoch": 2.9188034188034186, + "grad_norm": 1.051066517829895, + "learning_rate": 3.4038236171224946e-05, + "loss": 0.9903, + "step": 16394 + }, + { + "epoch": 2.9189814814814814, + "grad_norm": 0.8462432026863098, + "learning_rate": 3.402771629864062e-05, + "loss": 0.868, + "step": 16395 + }, + { + "epoch": 2.919159544159544, + "grad_norm": 0.9149147868156433, + "learning_rate": 3.401719771864615e-05, + "loss": 0.9215, + "step": 16396 + }, + { + "epoch": 2.919337606837607, + "grad_norm": 0.7974846959114075, + "learning_rate": 3.4006680431447636e-05, + "loss": 0.9577, + "step": 16397 + }, + { + "epoch": 2.9195156695156697, + "grad_norm": 0.9298149943351746, + "learning_rate": 3.399616443725112e-05, + "loss": 0.8977, + "step": 16398 + }, + { + "epoch": 2.919693732193732, + "grad_norm": 0.8799013495445251, + "learning_rate": 3.398564973626265e-05, + "loss": 1.0161, + "step": 16399 + }, + { + "epoch": 2.9198717948717947, + "grad_norm": 0.8730150461196899, + "learning_rate": 3.3975136328688206e-05, + "loss": 0.7953, + "step": 16400 + }, + { + "epoch": 
2.9200498575498575, + "grad_norm": 1.0036572217941284, + "learning_rate": 3.396462421473387e-05, + "loss": 0.5941, + "step": 16401 + }, + { + "epoch": 2.92022792022792, + "grad_norm": 0.8903291821479797, + "learning_rate": 3.39541133946055e-05, + "loss": 0.7715, + "step": 16402 + }, + { + "epoch": 2.920405982905983, + "grad_norm": 0.8254680633544922, + "learning_rate": 3.39436038685091e-05, + "loss": 0.7268, + "step": 16403 + }, + { + "epoch": 2.9205840455840457, + "grad_norm": 0.8490110039710999, + "learning_rate": 3.393309563665057e-05, + "loss": 0.8684, + "step": 16404 + }, + { + "epoch": 2.9207621082621085, + "grad_norm": 0.8137654066085815, + "learning_rate": 3.392258869923575e-05, + "loss": 0.7068, + "step": 16405 + }, + { + "epoch": 2.9209401709401708, + "grad_norm": 0.8084680438041687, + "learning_rate": 3.39120830564706e-05, + "loss": 0.8214, + "step": 16406 + }, + { + "epoch": 2.9211182336182335, + "grad_norm": 0.9359310269355774, + "learning_rate": 3.3901578708560835e-05, + "loss": 1.0247, + "step": 16407 + }, + { + "epoch": 2.9212962962962963, + "grad_norm": 0.826842725276947, + "learning_rate": 3.38910756557124e-05, + "loss": 0.8018, + "step": 16408 + }, + { + "epoch": 2.921474358974359, + "grad_norm": 0.9049538373947144, + "learning_rate": 3.388057389813093e-05, + "loss": 0.8825, + "step": 16409 + }, + { + "epoch": 2.921652421652422, + "grad_norm": 0.7966105937957764, + "learning_rate": 3.387007343602231e-05, + "loss": 0.8716, + "step": 16410 + }, + { + "epoch": 2.921830484330484, + "grad_norm": 0.8647517561912537, + "learning_rate": 3.385957426959222e-05, + "loss": 0.9947, + "step": 16411 + }, + { + "epoch": 2.922008547008547, + "grad_norm": 0.8983362317085266, + "learning_rate": 3.384907639904638e-05, + "loss": 0.8701, + "step": 16412 + }, + { + "epoch": 2.9221866096866096, + "grad_norm": 0.7970262765884399, + "learning_rate": 3.3838579824590465e-05, + "loss": 0.8921, + "step": 16413 + }, + { + "epoch": 2.9223646723646723, + "grad_norm": 
0.8115224242210388, + "learning_rate": 3.382808454643015e-05, + "loss": 0.5948, + "step": 16414 + }, + { + "epoch": 2.922542735042735, + "grad_norm": 0.832318902015686, + "learning_rate": 3.381759056477102e-05, + "loss": 0.8399, + "step": 16415 + }, + { + "epoch": 2.922720797720798, + "grad_norm": 0.9396497011184692, + "learning_rate": 3.380709787981878e-05, + "loss": 0.7322, + "step": 16416 + }, + { + "epoch": 2.9228988603988606, + "grad_norm": 0.7956545352935791, + "learning_rate": 3.3796606491778904e-05, + "loss": 0.8566, + "step": 16417 + }, + { + "epoch": 2.9230769230769234, + "grad_norm": 0.8257092833518982, + "learning_rate": 3.378611640085705e-05, + "loss": 0.8682, + "step": 16418 + }, + { + "epoch": 2.9232549857549857, + "grad_norm": 0.7565430402755737, + "learning_rate": 3.377562760725863e-05, + "loss": 0.8513, + "step": 16419 + }, + { + "epoch": 2.9234330484330484, + "grad_norm": 0.769944965839386, + "learning_rate": 3.3765140111189265e-05, + "loss": 0.8869, + "step": 16420 + }, + { + "epoch": 2.923611111111111, + "grad_norm": 0.8117483854293823, + "learning_rate": 3.375465391285438e-05, + "loss": 0.8718, + "step": 16421 + }, + { + "epoch": 2.923789173789174, + "grad_norm": 0.8011773824691772, + "learning_rate": 3.374416901245944e-05, + "loss": 0.9326, + "step": 16422 + }, + { + "epoch": 2.923967236467236, + "grad_norm": 0.8096779584884644, + "learning_rate": 3.373368541020987e-05, + "loss": 0.9317, + "step": 16423 + }, + { + "epoch": 2.924145299145299, + "grad_norm": 0.8383152484893799, + "learning_rate": 3.3723203106311055e-05, + "loss": 0.9399, + "step": 16424 + }, + { + "epoch": 2.9243233618233617, + "grad_norm": 0.9268670678138733, + "learning_rate": 3.371272210096842e-05, + "loss": 0.6867, + "step": 16425 + }, + { + "epoch": 2.9245014245014245, + "grad_norm": 0.941338837146759, + "learning_rate": 3.3702242394387294e-05, + "loss": 0.9164, + "step": 16426 + }, + { + "epoch": 2.9246794871794872, + "grad_norm": 0.7108882665634155, + "learning_rate": 
3.3691763986773014e-05, + "loss": 0.6956, + "step": 16427 + }, + { + "epoch": 2.92485754985755, + "grad_norm": 0.9144331812858582, + "learning_rate": 3.3681286878330876e-05, + "loss": 0.8148, + "step": 16428 + }, + { + "epoch": 2.9250356125356127, + "grad_norm": 0.882851243019104, + "learning_rate": 3.367081106926615e-05, + "loss": 0.846, + "step": 16429 + }, + { + "epoch": 2.9252136752136755, + "grad_norm": 0.9508523941040039, + "learning_rate": 3.366033655978409e-05, + "loss": 0.9637, + "step": 16430 + }, + { + "epoch": 2.925391737891738, + "grad_norm": 0.7770804166793823, + "learning_rate": 3.3649863350089935e-05, + "loss": 0.6673, + "step": 16431 + }, + { + "epoch": 2.9255698005698005, + "grad_norm": 0.8342770934104919, + "learning_rate": 3.3639391440388845e-05, + "loss": 1.0366, + "step": 16432 + }, + { + "epoch": 2.9257478632478633, + "grad_norm": 0.8854461908340454, + "learning_rate": 3.362892083088609e-05, + "loss": 0.7885, + "step": 16433 + }, + { + "epoch": 2.925925925925926, + "grad_norm": 0.8054807186126709, + "learning_rate": 3.36184515217867e-05, + "loss": 0.7705, + "step": 16434 + }, + { + "epoch": 2.9261039886039883, + "grad_norm": 0.9037294983863831, + "learning_rate": 3.360798351329587e-05, + "loss": 1.077, + "step": 16435 + }, + { + "epoch": 2.926282051282051, + "grad_norm": 0.8743478059768677, + "learning_rate": 3.35975168056187e-05, + "loss": 0.8633, + "step": 16436 + }, + { + "epoch": 2.926460113960114, + "grad_norm": 0.8879369497299194, + "learning_rate": 3.3587051398960245e-05, + "loss": 0.8059, + "step": 16437 + }, + { + "epoch": 2.9266381766381766, + "grad_norm": 0.9445768594741821, + "learning_rate": 3.3576587293525564e-05, + "loss": 0.861, + "step": 16438 + }, + { + "epoch": 2.9268162393162394, + "grad_norm": 0.8131009340286255, + "learning_rate": 3.356612448951967e-05, + "loss": 0.8299, + "step": 16439 + }, + { + "epoch": 2.926994301994302, + "grad_norm": 0.8781847953796387, + "learning_rate": 3.3555662987147515e-05, + "loss": 0.8447, + 
"step": 16440 + }, + { + "epoch": 2.927172364672365, + "grad_norm": 0.9173591732978821, + "learning_rate": 3.3545202786614206e-05, + "loss": 0.9297, + "step": 16441 + }, + { + "epoch": 2.9273504273504276, + "grad_norm": 0.9596586227416992, + "learning_rate": 3.353474388812452e-05, + "loss": 0.9109, + "step": 16442 + }, + { + "epoch": 2.92752849002849, + "grad_norm": 0.8325486183166504, + "learning_rate": 3.352428629188349e-05, + "loss": 0.6822, + "step": 16443 + }, + { + "epoch": 2.9277065527065527, + "grad_norm": 0.8758354187011719, + "learning_rate": 3.351382999809599e-05, + "loss": 0.9522, + "step": 16444 + }, + { + "epoch": 2.9278846153846154, + "grad_norm": 0.909718930721283, + "learning_rate": 3.3503375006966866e-05, + "loss": 0.8226, + "step": 16445 + }, + { + "epoch": 2.928062678062678, + "grad_norm": 0.9259094595909119, + "learning_rate": 3.3492921318700974e-05, + "loss": 0.9895, + "step": 16446 + }, + { + "epoch": 2.9282407407407405, + "grad_norm": 0.7815300226211548, + "learning_rate": 3.348246893350311e-05, + "loss": 0.8854, + "step": 16447 + }, + { + "epoch": 2.9284188034188032, + "grad_norm": 0.8576910495758057, + "learning_rate": 3.3472017851578154e-05, + "loss": 0.9379, + "step": 16448 + }, + { + "epoch": 2.928596866096866, + "grad_norm": 0.9139176607131958, + "learning_rate": 3.3461568073130735e-05, + "loss": 0.9372, + "step": 16449 + }, + { + "epoch": 2.9287749287749287, + "grad_norm": 0.8304639458656311, + "learning_rate": 3.34511195983657e-05, + "loss": 0.7105, + "step": 16450 + }, + { + "epoch": 2.9289529914529915, + "grad_norm": 0.8689056634902954, + "learning_rate": 3.344067242748774e-05, + "loss": 0.9518, + "step": 16451 + }, + { + "epoch": 2.9291310541310542, + "grad_norm": 0.9180546998977661, + "learning_rate": 3.343022656070154e-05, + "loss": 0.982, + "step": 16452 + }, + { + "epoch": 2.929309116809117, + "grad_norm": 0.9291700124740601, + "learning_rate": 3.341978199821175e-05, + "loss": 1.0003, + "step": 16453 + }, + { + "epoch": 
2.9294871794871797, + "grad_norm": 0.970675528049469, + "learning_rate": 3.340933874022304e-05, + "loss": 0.8941, + "step": 16454 + }, + { + "epoch": 2.929665242165242, + "grad_norm": 0.8425672650337219, + "learning_rate": 3.339889678693999e-05, + "loss": 0.7173, + "step": 16455 + }, + { + "epoch": 2.929843304843305, + "grad_norm": 0.9666314721107483, + "learning_rate": 3.3388456138567225e-05, + "loss": 1.0403, + "step": 16456 + }, + { + "epoch": 2.9300213675213675, + "grad_norm": 0.9586226344108582, + "learning_rate": 3.337801679530924e-05, + "loss": 0.7691, + "step": 16457 + }, + { + "epoch": 2.9301994301994303, + "grad_norm": 0.888327419757843, + "learning_rate": 3.33675787573707e-05, + "loss": 0.7019, + "step": 16458 + }, + { + "epoch": 2.9303774928774926, + "grad_norm": 0.7410684823989868, + "learning_rate": 3.335714202495596e-05, + "loss": 0.9935, + "step": 16459 + }, + { + "epoch": 2.9305555555555554, + "grad_norm": 1.0408822298049927, + "learning_rate": 3.3346706598269617e-05, + "loss": 0.9454, + "step": 16460 + }, + { + "epoch": 2.930733618233618, + "grad_norm": 0.9065089821815491, + "learning_rate": 3.333627247751611e-05, + "loss": 0.9665, + "step": 16461 + }, + { + "epoch": 2.930911680911681, + "grad_norm": 0.8904961943626404, + "learning_rate": 3.332583966289985e-05, + "loss": 0.9402, + "step": 16462 + }, + { + "epoch": 2.9310897435897436, + "grad_norm": 0.920364260673523, + "learning_rate": 3.331540815462526e-05, + "loss": 0.8524, + "step": 16463 + }, + { + "epoch": 2.9312678062678064, + "grad_norm": 0.9185073375701904, + "learning_rate": 3.330497795289669e-05, + "loss": 0.8982, + "step": 16464 + }, + { + "epoch": 2.931445868945869, + "grad_norm": 0.9365581274032593, + "learning_rate": 3.32945490579186e-05, + "loss": 0.9272, + "step": 16465 + }, + { + "epoch": 2.931623931623932, + "grad_norm": 0.9139016270637512, + "learning_rate": 3.328412146989518e-05, + "loss": 1.0019, + "step": 16466 + }, + { + "epoch": 2.931801994301994, + "grad_norm": 
0.9021140336990356, + "learning_rate": 3.327369518903085e-05, + "loss": 0.866, + "step": 16467 + }, + { + "epoch": 2.931980056980057, + "grad_norm": 0.8053449988365173, + "learning_rate": 3.326327021552984e-05, + "loss": 0.8915, + "step": 16468 + }, + { + "epoch": 2.9321581196581197, + "grad_norm": 1.0013985633850098, + "learning_rate": 3.325284654959643e-05, + "loss": 0.9063, + "step": 16469 + }, + { + "epoch": 2.9323361823361824, + "grad_norm": 0.8236168622970581, + "learning_rate": 3.324242419143483e-05, + "loss": 0.7113, + "step": 16470 + }, + { + "epoch": 2.932514245014245, + "grad_norm": 0.7256088852882385, + "learning_rate": 3.323200314124925e-05, + "loss": 0.6009, + "step": 16471 + }, + { + "epoch": 2.9326923076923075, + "grad_norm": 0.9991055727005005, + "learning_rate": 3.322158339924384e-05, + "loss": 1.039, + "step": 16472 + }, + { + "epoch": 2.9328703703703702, + "grad_norm": 0.8769686818122864, + "learning_rate": 3.3211164965622845e-05, + "loss": 0.989, + "step": 16473 + }, + { + "epoch": 2.933048433048433, + "grad_norm": 0.8226488828659058, + "learning_rate": 3.320074784059026e-05, + "loss": 0.9269, + "step": 16474 + }, + { + "epoch": 2.9332264957264957, + "grad_norm": 0.7763178944587708, + "learning_rate": 3.31903320243503e-05, + "loss": 0.9245, + "step": 16475 + }, + { + "epoch": 2.9334045584045585, + "grad_norm": 0.9204390645027161, + "learning_rate": 3.3179917517107e-05, + "loss": 1.2374, + "step": 16476 + }, + { + "epoch": 2.9335826210826212, + "grad_norm": 0.9970325827598572, + "learning_rate": 3.31695043190644e-05, + "loss": 0.7862, + "step": 16477 + }, + { + "epoch": 2.933760683760684, + "grad_norm": 0.8539462685585022, + "learning_rate": 3.315909243042654e-05, + "loss": 0.8015, + "step": 16478 + }, + { + "epoch": 2.9339387464387463, + "grad_norm": 0.8208832144737244, + "learning_rate": 3.314868185139742e-05, + "loss": 0.8571, + "step": 16479 + }, + { + "epoch": 2.934116809116809, + "grad_norm": 0.8628700375556946, + "learning_rate": 
3.313827258218101e-05, + "loss": 0.8274, + "step": 16480 + }, + { + "epoch": 2.934294871794872, + "grad_norm": 1.0472661256790161, + "learning_rate": 3.312786462298124e-05, + "loss": 0.9252, + "step": 16481 + }, + { + "epoch": 2.9344729344729346, + "grad_norm": 0.8292158842086792, + "learning_rate": 3.311745797400202e-05, + "loss": 0.8548, + "step": 16482 + }, + { + "epoch": 2.9346509971509973, + "grad_norm": 0.85657799243927, + "learning_rate": 3.310705263544731e-05, + "loss": 0.7657, + "step": 16483 + }, + { + "epoch": 2.9348290598290596, + "grad_norm": 0.7560283541679382, + "learning_rate": 3.309664860752095e-05, + "loss": 0.8063, + "step": 16484 + }, + { + "epoch": 2.9350071225071224, + "grad_norm": 0.7933059334754944, + "learning_rate": 3.308624589042677e-05, + "loss": 0.7102, + "step": 16485 + }, + { + "epoch": 2.935185185185185, + "grad_norm": 0.8430653810501099, + "learning_rate": 3.3075844484368615e-05, + "loss": 0.8815, + "step": 16486 + }, + { + "epoch": 2.935363247863248, + "grad_norm": 0.8113032579421997, + "learning_rate": 3.306544438955021e-05, + "loss": 0.8144, + "step": 16487 + }, + { + "epoch": 2.9355413105413106, + "grad_norm": 0.8344797492027283, + "learning_rate": 3.3055045606175474e-05, + "loss": 0.7164, + "step": 16488 + }, + { + "epoch": 2.9357193732193734, + "grad_norm": 1.0109050273895264, + "learning_rate": 3.3044648134447964e-05, + "loss": 0.6619, + "step": 16489 + }, + { + "epoch": 2.935897435897436, + "grad_norm": 0.8746094703674316, + "learning_rate": 3.303425197457156e-05, + "loss": 0.73, + "step": 16490 + }, + { + "epoch": 2.9360754985754984, + "grad_norm": 0.919924795627594, + "learning_rate": 3.302385712674981e-05, + "loss": 1.1008, + "step": 16491 + }, + { + "epoch": 2.936253561253561, + "grad_norm": 0.8407595753669739, + "learning_rate": 3.301346359118648e-05, + "loss": 1.0963, + "step": 16492 + }, + { + "epoch": 2.936431623931624, + "grad_norm": 0.8145756721496582, + "learning_rate": 3.3003071368085184e-05, + "loss": 0.805, + 
"step": 16493 + }, + { + "epoch": 2.9366096866096867, + "grad_norm": 0.8298826813697815, + "learning_rate": 3.299268045764953e-05, + "loss": 0.7075, + "step": 16494 + }, + { + "epoch": 2.9367877492877494, + "grad_norm": 1.1027754545211792, + "learning_rate": 3.2982290860083106e-05, + "loss": 1.0381, + "step": 16495 + }, + { + "epoch": 2.9369658119658117, + "grad_norm": 0.8431075215339661, + "learning_rate": 3.2971902575589476e-05, + "loss": 0.9013, + "step": 16496 + }, + { + "epoch": 2.9371438746438745, + "grad_norm": 1.0045086145401, + "learning_rate": 3.296151560437214e-05, + "loss": 0.9054, + "step": 16497 + }, + { + "epoch": 2.9373219373219372, + "grad_norm": 0.8290889263153076, + "learning_rate": 3.295112994663471e-05, + "loss": 0.8054, + "step": 16498 + }, + { + "epoch": 2.9375, + "grad_norm": 0.7854097485542297, + "learning_rate": 3.2940745602580544e-05, + "loss": 0.7585, + "step": 16499 + }, + { + "epoch": 2.9376780626780628, + "grad_norm": 0.7470735907554626, + "learning_rate": 3.29303625724132e-05, + "loss": 0.7122, + "step": 16500 + }, + { + "epoch": 2.9378561253561255, + "grad_norm": 0.847463071346283, + "learning_rate": 3.2919980856336075e-05, + "loss": 0.7949, + "step": 16501 + }, + { + "epoch": 2.9380341880341883, + "grad_norm": 0.9595068097114563, + "learning_rate": 3.290960045455257e-05, + "loss": 0.8335, + "step": 16502 + }, + { + "epoch": 2.9382122507122506, + "grad_norm": 0.9636897444725037, + "learning_rate": 3.289922136726609e-05, + "loss": 0.7454, + "step": 16503 + }, + { + "epoch": 2.9383903133903133, + "grad_norm": 0.9030486941337585, + "learning_rate": 3.288884359467993e-05, + "loss": 0.8346, + "step": 16504 + }, + { + "epoch": 2.938568376068376, + "grad_norm": 0.8722931146621704, + "learning_rate": 3.287846713699755e-05, + "loss": 1.0182, + "step": 16505 + }, + { + "epoch": 2.938746438746439, + "grad_norm": 0.7612178325653076, + "learning_rate": 3.286809199442209e-05, + "loss": 0.8759, + "step": 16506 + }, + { + "epoch": 
2.9389245014245016, + "grad_norm": 0.951334536075592, + "learning_rate": 3.2857718167156956e-05, + "loss": 0.8035, + "step": 16507 + }, + { + "epoch": 2.939102564102564, + "grad_norm": 0.8043029308319092, + "learning_rate": 3.284734565540536e-05, + "loss": 0.7652, + "step": 16508 + }, + { + "epoch": 2.9392806267806266, + "grad_norm": 0.8762648701667786, + "learning_rate": 3.283697445937053e-05, + "loss": 0.9362, + "step": 16509 + }, + { + "epoch": 2.9394586894586894, + "grad_norm": 0.8046880960464478, + "learning_rate": 3.282660457925566e-05, + "loss": 0.7757, + "step": 16510 + }, + { + "epoch": 2.939636752136752, + "grad_norm": 0.7703250050544739, + "learning_rate": 3.281623601526394e-05, + "loss": 0.6699, + "step": 16511 + }, + { + "epoch": 2.939814814814815, + "grad_norm": 0.9165888428688049, + "learning_rate": 3.280586876759847e-05, + "loss": 0.8321, + "step": 16512 + }, + { + "epoch": 2.9399928774928776, + "grad_norm": 0.7389699816703796, + "learning_rate": 3.279550283646249e-05, + "loss": 0.7621, + "step": 16513 + }, + { + "epoch": 2.9401709401709404, + "grad_norm": 0.9256302714347839, + "learning_rate": 3.278513822205897e-05, + "loss": 0.8634, + "step": 16514 + }, + { + "epoch": 2.9403490028490027, + "grad_norm": 0.8384902477264404, + "learning_rate": 3.277477492459109e-05, + "loss": 0.7645, + "step": 16515 + }, + { + "epoch": 2.9405270655270654, + "grad_norm": 0.9446337819099426, + "learning_rate": 3.276441294426178e-05, + "loss": 0.9729, + "step": 16516 + }, + { + "epoch": 2.940705128205128, + "grad_norm": 0.920237123966217, + "learning_rate": 3.275405228127417e-05, + "loss": 0.7834, + "step": 16517 + }, + { + "epoch": 2.940883190883191, + "grad_norm": 0.8432943224906921, + "learning_rate": 3.274369293583121e-05, + "loss": 0.8246, + "step": 16518 + }, + { + "epoch": 2.9410612535612537, + "grad_norm": 0.8046762943267822, + "learning_rate": 3.2733334908135885e-05, + "loss": 0.9363, + "step": 16519 + }, + { + "epoch": 2.941239316239316, + "grad_norm": 
0.9555963277816772, + "learning_rate": 3.2722978198391106e-05, + "loss": 0.8699, + "step": 16520 + }, + { + "epoch": 2.9414173789173788, + "grad_norm": 0.8619177937507629, + "learning_rate": 3.2712622806799834e-05, + "loss": 0.8482, + "step": 16521 + }, + { + "epoch": 2.9415954415954415, + "grad_norm": 0.8801655769348145, + "learning_rate": 3.27022687335649e-05, + "loss": 0.9601, + "step": 16522 + }, + { + "epoch": 2.9417735042735043, + "grad_norm": 0.9054547548294067, + "learning_rate": 3.2691915978889244e-05, + "loss": 0.8752, + "step": 16523 + }, + { + "epoch": 2.941951566951567, + "grad_norm": 0.9078481197357178, + "learning_rate": 3.2681564542975675e-05, + "loss": 0.8225, + "step": 16524 + }, + { + "epoch": 2.9421296296296298, + "grad_norm": 0.9574032425880432, + "learning_rate": 3.267121442602701e-05, + "loss": 0.9133, + "step": 16525 + }, + { + "epoch": 2.9423076923076925, + "grad_norm": 0.8808075189590454, + "learning_rate": 3.2660865628246026e-05, + "loss": 0.8544, + "step": 16526 + }, + { + "epoch": 2.942485754985755, + "grad_norm": 0.8007816076278687, + "learning_rate": 3.26505181498355e-05, + "loss": 0.7846, + "step": 16527 + }, + { + "epoch": 2.9426638176638176, + "grad_norm": 0.8623418211936951, + "learning_rate": 3.264017199099816e-05, + "loss": 0.9056, + "step": 16528 + }, + { + "epoch": 2.9428418803418803, + "grad_norm": 0.870961606502533, + "learning_rate": 3.2629827151936695e-05, + "loss": 0.9883, + "step": 16529 + }, + { + "epoch": 2.943019943019943, + "grad_norm": 0.9122142791748047, + "learning_rate": 3.2619483632853885e-05, + "loss": 0.8012, + "step": 16530 + }, + { + "epoch": 2.943198005698006, + "grad_norm": 0.9072979688644409, + "learning_rate": 3.260914143395225e-05, + "loss": 0.9999, + "step": 16531 + }, + { + "epoch": 2.943376068376068, + "grad_norm": 0.8746095299720764, + "learning_rate": 3.259880055543454e-05, + "loss": 0.8022, + "step": 16532 + }, + { + "epoch": 2.943554131054131, + "grad_norm": 1.0012668371200562, + "learning_rate": 
3.2588460997503314e-05, + "loss": 0.8909, + "step": 16533 + }, + { + "epoch": 2.9437321937321936, + "grad_norm": 0.913070023059845, + "learning_rate": 3.2578122760361154e-05, + "loss": 0.7729, + "step": 16534 + }, + { + "epoch": 2.9439102564102564, + "grad_norm": 0.791753888130188, + "learning_rate": 3.2567785844210616e-05, + "loss": 0.9731, + "step": 16535 + }, + { + "epoch": 2.944088319088319, + "grad_norm": 0.9673776030540466, + "learning_rate": 3.255745024925425e-05, + "loss": 0.8008, + "step": 16536 + }, + { + "epoch": 2.944266381766382, + "grad_norm": 0.9382752180099487, + "learning_rate": 3.254711597569454e-05, + "loss": 0.9611, + "step": 16537 + }, + { + "epoch": 2.9444444444444446, + "grad_norm": 0.8816630840301514, + "learning_rate": 3.2536783023733975e-05, + "loss": 0.7565, + "step": 16538 + }, + { + "epoch": 2.9446225071225074, + "grad_norm": 0.8474457859992981, + "learning_rate": 3.2526451393574964e-05, + "loss": 0.7766, + "step": 16539 + }, + { + "epoch": 2.9448005698005697, + "grad_norm": 0.9510074257850647, + "learning_rate": 3.251612108542005e-05, + "loss": 0.9899, + "step": 16540 + }, + { + "epoch": 2.9449786324786325, + "grad_norm": 0.776924192905426, + "learning_rate": 3.250579209947149e-05, + "loss": 0.7845, + "step": 16541 + }, + { + "epoch": 2.945156695156695, + "grad_norm": 0.9543585181236267, + "learning_rate": 3.2495464435931756e-05, + "loss": 1.1313, + "step": 16542 + }, + { + "epoch": 2.945334757834758, + "grad_norm": 0.9087918400764465, + "learning_rate": 3.2485138095003164e-05, + "loss": 0.774, + "step": 16543 + }, + { + "epoch": 2.9455128205128203, + "grad_norm": 0.7719675302505493, + "learning_rate": 3.247481307688801e-05, + "loss": 0.7755, + "step": 16544 + }, + { + "epoch": 2.945690883190883, + "grad_norm": 0.8550716638565063, + "learning_rate": 3.246448938178869e-05, + "loss": 0.8455, + "step": 16545 + }, + { + "epoch": 2.9458689458689458, + "grad_norm": 0.8585572838783264, + "learning_rate": 3.2454167009907346e-05, + "loss": 
0.7048, + "step": 16546 + }, + { + "epoch": 2.9460470085470085, + "grad_norm": 0.819939136505127, + "learning_rate": 3.2443845961446315e-05, + "loss": 0.7671, + "step": 16547 + }, + { + "epoch": 2.9462250712250713, + "grad_norm": 0.8811594247817993, + "learning_rate": 3.243352623660778e-05, + "loss": 0.9244, + "step": 16548 + }, + { + "epoch": 2.946403133903134, + "grad_norm": 0.8128607869148254, + "learning_rate": 3.242320783559395e-05, + "loss": 0.8103, + "step": 16549 + }, + { + "epoch": 2.9465811965811968, + "grad_norm": 0.778759241104126, + "learning_rate": 3.2412890758606985e-05, + "loss": 0.6261, + "step": 16550 + }, + { + "epoch": 2.9467592592592595, + "grad_norm": 1.0277715921401978, + "learning_rate": 3.240257500584901e-05, + "loss": 0.8728, + "step": 16551 + }, + { + "epoch": 2.946937321937322, + "grad_norm": 0.7647507786750793, + "learning_rate": 3.239226057752217e-05, + "loss": 0.7142, + "step": 16552 + }, + { + "epoch": 2.9471153846153846, + "grad_norm": 0.8381546139717102, + "learning_rate": 3.238194747382855e-05, + "loss": 0.737, + "step": 16553 + }, + { + "epoch": 2.9472934472934473, + "grad_norm": 0.7928317189216614, + "learning_rate": 3.237163569497016e-05, + "loss": 0.8926, + "step": 16554 + }, + { + "epoch": 2.94747150997151, + "grad_norm": 0.7086058259010315, + "learning_rate": 3.236132524114914e-05, + "loss": 0.6845, + "step": 16555 + }, + { + "epoch": 2.9476495726495724, + "grad_norm": 0.83002769947052, + "learning_rate": 3.235101611256739e-05, + "loss": 0.7365, + "step": 16556 + }, + { + "epoch": 2.947827635327635, + "grad_norm": 0.9012778401374817, + "learning_rate": 3.234070830942698e-05, + "loss": 0.9999, + "step": 16557 + }, + { + "epoch": 2.948005698005698, + "grad_norm": 0.8554810881614685, + "learning_rate": 3.233040183192985e-05, + "loss": 0.6699, + "step": 16558 + }, + { + "epoch": 2.9481837606837606, + "grad_norm": 0.9322055578231812, + "learning_rate": 3.2320096680277915e-05, + "loss": 0.8969, + "step": 16559 + }, + { + "epoch": 
2.9483618233618234, + "grad_norm": 0.8755966424942017, + "learning_rate": 3.2309792854673095e-05, + "loss": 0.8398, + "step": 16560 + }, + { + "epoch": 2.948539886039886, + "grad_norm": 0.8203766345977783, + "learning_rate": 3.229949035531726e-05, + "loss": 0.8354, + "step": 16561 + }, + { + "epoch": 2.948717948717949, + "grad_norm": 0.8970799446105957, + "learning_rate": 3.228918918241229e-05, + "loss": 0.9074, + "step": 16562 + }, + { + "epoch": 2.9488960113960117, + "grad_norm": 0.8263736963272095, + "learning_rate": 3.227888933615997e-05, + "loss": 0.8896, + "step": 16563 + }, + { + "epoch": 2.949074074074074, + "grad_norm": 1.0277043581008911, + "learning_rate": 3.2268590816762155e-05, + "loss": 0.8706, + "step": 16564 + }, + { + "epoch": 2.9492521367521367, + "grad_norm": 0.8965407013893127, + "learning_rate": 3.225829362442061e-05, + "loss": 0.6796, + "step": 16565 + }, + { + "epoch": 2.9494301994301995, + "grad_norm": 0.8175839185714722, + "learning_rate": 3.224799775933708e-05, + "loss": 0.9243, + "step": 16566 + }, + { + "epoch": 2.949608262108262, + "grad_norm": 0.7979576587677002, + "learning_rate": 3.2237703221713286e-05, + "loss": 0.8008, + "step": 16567 + }, + { + "epoch": 2.9497863247863245, + "grad_norm": 1.054843783378601, + "learning_rate": 3.2227410011750945e-05, + "loss": 1.0279, + "step": 16568 + }, + { + "epoch": 2.9499643874643873, + "grad_norm": 0.7947831749916077, + "learning_rate": 3.221711812965168e-05, + "loss": 0.6767, + "step": 16569 + }, + { + "epoch": 2.95014245014245, + "grad_norm": 0.8702623844146729, + "learning_rate": 3.220682757561725e-05, + "loss": 0.9844, + "step": 16570 + }, + { + "epoch": 2.9503205128205128, + "grad_norm": 0.8653056621551514, + "learning_rate": 3.2196538349849123e-05, + "loss": 1.0747, + "step": 16571 + }, + { + "epoch": 2.9504985754985755, + "grad_norm": 0.9718163013458252, + "learning_rate": 3.2186250452549026e-05, + "loss": 0.7793, + "step": 16572 + }, + { + "epoch": 2.9506766381766383, + "grad_norm": 
0.8626788854598999, + "learning_rate": 3.217596388391848e-05, + "loss": 0.9145, + "step": 16573 + }, + { + "epoch": 2.950854700854701, + "grad_norm": 0.9753466844558716, + "learning_rate": 3.2165678644159025e-05, + "loss": 0.9784, + "step": 16574 + }, + { + "epoch": 2.951032763532764, + "grad_norm": 0.795011043548584, + "learning_rate": 3.2155394733472186e-05, + "loss": 0.7457, + "step": 16575 + }, + { + "epoch": 2.951210826210826, + "grad_norm": 0.842991828918457, + "learning_rate": 3.2145112152059454e-05, + "loss": 0.9126, + "step": 16576 + }, + { + "epoch": 2.951388888888889, + "grad_norm": 0.8642476797103882, + "learning_rate": 3.213483090012228e-05, + "loss": 0.7213, + "step": 16577 + }, + { + "epoch": 2.9515669515669516, + "grad_norm": 0.9114034175872803, + "learning_rate": 3.212455097786214e-05, + "loss": 0.651, + "step": 16578 + }, + { + "epoch": 2.9517450142450143, + "grad_norm": 1.0575958490371704, + "learning_rate": 3.211427238548037e-05, + "loss": 0.9727, + "step": 16579 + }, + { + "epoch": 2.9519230769230766, + "grad_norm": 0.8618924617767334, + "learning_rate": 3.210399512317849e-05, + "loss": 0.8593, + "step": 16580 + }, + { + "epoch": 2.9521011396011394, + "grad_norm": 0.768313467502594, + "learning_rate": 3.209371919115771e-05, + "loss": 0.8305, + "step": 16581 + }, + { + "epoch": 2.952279202279202, + "grad_norm": 0.8160355091094971, + "learning_rate": 3.208344458961947e-05, + "loss": 0.7266, + "step": 16582 + }, + { + "epoch": 2.952457264957265, + "grad_norm": 0.908545196056366, + "learning_rate": 3.207317131876506e-05, + "loss": 0.8022, + "step": 16583 + }, + { + "epoch": 2.9526353276353277, + "grad_norm": 1.2593516111373901, + "learning_rate": 3.206289937879571e-05, + "loss": 1.0274, + "step": 16584 + }, + { + "epoch": 2.9528133903133904, + "grad_norm": 0.7789214849472046, + "learning_rate": 3.2052628769912795e-05, + "loss": 0.8993, + "step": 16585 + }, + { + "epoch": 2.952991452991453, + "grad_norm": 0.8475270867347717, + "learning_rate": 
3.20423594923174e-05, + "loss": 1.0145, + "step": 16586 + }, + { + "epoch": 2.953169515669516, + "grad_norm": 1.0083874464035034, + "learning_rate": 3.203209154621086e-05, + "loss": 1.02, + "step": 16587 + }, + { + "epoch": 2.953347578347578, + "grad_norm": 0.7013131380081177, + "learning_rate": 3.2021824931794245e-05, + "loss": 0.5449, + "step": 16588 + }, + { + "epoch": 2.953525641025641, + "grad_norm": 0.8298764824867249, + "learning_rate": 3.201155964926878e-05, + "loss": 0.9914, + "step": 16589 + }, + { + "epoch": 2.9537037037037037, + "grad_norm": 0.9371963143348694, + "learning_rate": 3.200129569883556e-05, + "loss": 0.7349, + "step": 16590 + }, + { + "epoch": 2.9538817663817665, + "grad_norm": 0.7932438850402832, + "learning_rate": 3.199103308069571e-05, + "loss": 0.7911, + "step": 16591 + }, + { + "epoch": 2.9540598290598292, + "grad_norm": 0.8415567278862, + "learning_rate": 3.198077179505029e-05, + "loss": 0.766, + "step": 16592 + }, + { + "epoch": 2.9542378917378915, + "grad_norm": 0.9155336618423462, + "learning_rate": 3.197051184210035e-05, + "loss": 0.9622, + "step": 16593 + }, + { + "epoch": 2.9544159544159543, + "grad_norm": 0.8972458839416504, + "learning_rate": 3.196025322204688e-05, + "loss": 1.0202, + "step": 16594 + }, + { + "epoch": 2.954594017094017, + "grad_norm": 0.9620199799537659, + "learning_rate": 3.194999593509096e-05, + "loss": 0.8114, + "step": 16595 + }, + { + "epoch": 2.95477207977208, + "grad_norm": 0.819244921207428, + "learning_rate": 3.1939739981433456e-05, + "loss": 0.7858, + "step": 16596 + }, + { + "epoch": 2.9549501424501425, + "grad_norm": 0.8560270667076111, + "learning_rate": 3.192948536127542e-05, + "loss": 0.9163, + "step": 16597 + }, + { + "epoch": 2.9551282051282053, + "grad_norm": 0.9105932116508484, + "learning_rate": 3.191923207481765e-05, + "loss": 0.7372, + "step": 16598 + }, + { + "epoch": 2.955306267806268, + "grad_norm": 0.888846218585968, + "learning_rate": 3.190898012226114e-05, + "loss": 0.8624, + "step": 
16599 + }, + { + "epoch": 2.9554843304843303, + "grad_norm": 0.8116851449012756, + "learning_rate": 3.1898729503806726e-05, + "loss": 0.8549, + "step": 16600 + }, + { + "epoch": 2.955662393162393, + "grad_norm": 0.8970577120780945, + "learning_rate": 3.188848021965522e-05, + "loss": 0.8228, + "step": 16601 + }, + { + "epoch": 2.955840455840456, + "grad_norm": 0.8820711374282837, + "learning_rate": 3.187823227000747e-05, + "loss": 0.7976, + "step": 16602 + }, + { + "epoch": 2.9560185185185186, + "grad_norm": 0.9349139928817749, + "learning_rate": 3.1867985655064205e-05, + "loss": 0.8965, + "step": 16603 + }, + { + "epoch": 2.9561965811965814, + "grad_norm": 0.939132571220398, + "learning_rate": 3.185774037502627e-05, + "loss": 0.9152, + "step": 16604 + }, + { + "epoch": 2.9563746438746437, + "grad_norm": 1.029159665107727, + "learning_rate": 3.184749643009435e-05, + "loss": 1.0904, + "step": 16605 + }, + { + "epoch": 2.9565527065527064, + "grad_norm": 0.966548502445221, + "learning_rate": 3.183725382046917e-05, + "loss": 0.9508, + "step": 16606 + }, + { + "epoch": 2.956730769230769, + "grad_norm": 0.8782771229743958, + "learning_rate": 3.18270125463514e-05, + "loss": 0.9929, + "step": 16607 + }, + { + "epoch": 2.956908831908832, + "grad_norm": 0.9193231463432312, + "learning_rate": 3.18167726079417e-05, + "loss": 0.9942, + "step": 16608 + }, + { + "epoch": 2.9570868945868947, + "grad_norm": 0.8940062522888184, + "learning_rate": 3.180653400544071e-05, + "loss": 1.0594, + "step": 16609 + }, + { + "epoch": 2.9572649572649574, + "grad_norm": 0.8237268328666687, + "learning_rate": 3.179629673904903e-05, + "loss": 0.8211, + "step": 16610 + }, + { + "epoch": 2.95744301994302, + "grad_norm": 0.92745041847229, + "learning_rate": 3.17860608089672e-05, + "loss": 0.8287, + "step": 16611 + }, + { + "epoch": 2.9576210826210825, + "grad_norm": 0.8203856348991394, + "learning_rate": 3.177582621539586e-05, + "loss": 0.9457, + "step": 16612 + }, + { + "epoch": 2.9577991452991452, + 
"grad_norm": 0.8719314932823181, + "learning_rate": 3.176559295853543e-05, + "loss": 0.8022, + "step": 16613 + }, + { + "epoch": 2.957977207977208, + "grad_norm": 0.8742199540138245, + "learning_rate": 3.175536103858648e-05, + "loss": 0.976, + "step": 16614 + }, + { + "epoch": 2.9581552706552707, + "grad_norm": 0.8677577972412109, + "learning_rate": 3.174513045574947e-05, + "loss": 0.9513, + "step": 16615 + }, + { + "epoch": 2.9583333333333335, + "grad_norm": 0.809138298034668, + "learning_rate": 3.173490121022485e-05, + "loss": 0.7001, + "step": 16616 + }, + { + "epoch": 2.958511396011396, + "grad_norm": 0.9672527313232422, + "learning_rate": 3.1724673302213025e-05, + "loss": 0.9932, + "step": 16617 + }, + { + "epoch": 2.9586894586894585, + "grad_norm": 0.8101853728294373, + "learning_rate": 3.17144467319144e-05, + "loss": 0.8236, + "step": 16618 + }, + { + "epoch": 2.9588675213675213, + "grad_norm": 0.8965981006622314, + "learning_rate": 3.170422149952931e-05, + "loss": 0.9353, + "step": 16619 + }, + { + "epoch": 2.959045584045584, + "grad_norm": 0.8267533183097839, + "learning_rate": 3.1693997605258184e-05, + "loss": 0.6445, + "step": 16620 + }, + { + "epoch": 2.959223646723647, + "grad_norm": 0.9084979891777039, + "learning_rate": 3.168377504930122e-05, + "loss": 0.9311, + "step": 16621 + }, + { + "epoch": 2.9594017094017095, + "grad_norm": 0.8414687514305115, + "learning_rate": 3.1673553831858805e-05, + "loss": 0.7663, + "step": 16622 + }, + { + "epoch": 2.9595797720797723, + "grad_norm": 0.8619266748428345, + "learning_rate": 3.166333395313116e-05, + "loss": 0.8524, + "step": 16623 + }, + { + "epoch": 2.9597578347578346, + "grad_norm": 0.9963071346282959, + "learning_rate": 3.1653115413318534e-05, + "loss": 0.813, + "step": 16624 + }, + { + "epoch": 2.9599358974358974, + "grad_norm": 0.8264978528022766, + "learning_rate": 3.164289821262113e-05, + "loss": 0.9716, + "step": 16625 + }, + { + "epoch": 2.96011396011396, + "grad_norm": 0.8166584372520447, + 
"learning_rate": 3.163268235123911e-05, + "loss": 0.7492, + "step": 16626 + }, + { + "epoch": 2.960292022792023, + "grad_norm": 0.809241533279419, + "learning_rate": 3.1622467829372724e-05, + "loss": 0.9033, + "step": 16627 + }, + { + "epoch": 2.9604700854700856, + "grad_norm": 1.0441617965698242, + "learning_rate": 3.161225464722197e-05, + "loss": 0.7305, + "step": 16628 + }, + { + "epoch": 2.960648148148148, + "grad_norm": 0.9226490259170532, + "learning_rate": 3.160204280498705e-05, + "loss": 0.6896, + "step": 16629 + }, + { + "epoch": 2.9608262108262107, + "grad_norm": 0.7890266180038452, + "learning_rate": 3.159183230286803e-05, + "loss": 0.873, + "step": 16630 + }, + { + "epoch": 2.9610042735042734, + "grad_norm": 0.8491777181625366, + "learning_rate": 3.1581623141064934e-05, + "loss": 0.8828, + "step": 16631 + }, + { + "epoch": 2.961182336182336, + "grad_norm": 0.7522078156471252, + "learning_rate": 3.157141531977782e-05, + "loss": 0.7717, + "step": 16632 + }, + { + "epoch": 2.961360398860399, + "grad_norm": 0.9260183572769165, + "learning_rate": 3.156120883920667e-05, + "loss": 0.9715, + "step": 16633 + }, + { + "epoch": 2.9615384615384617, + "grad_norm": 0.6620128154754639, + "learning_rate": 3.1551003699551465e-05, + "loss": 0.5624, + "step": 16634 + }, + { + "epoch": 2.9617165242165244, + "grad_norm": 0.7571594715118408, + "learning_rate": 3.154079990101214e-05, + "loss": 0.6911, + "step": 16635 + }, + { + "epoch": 2.9618945868945867, + "grad_norm": 0.8571279644966125, + "learning_rate": 3.153059744378861e-05, + "loss": 0.9057, + "step": 16636 + }, + { + "epoch": 2.9620726495726495, + "grad_norm": 0.8895478248596191, + "learning_rate": 3.152039632808085e-05, + "loss": 0.7097, + "step": 16637 + }, + { + "epoch": 2.9622507122507122, + "grad_norm": 0.8340024352073669, + "learning_rate": 3.15101965540886e-05, + "loss": 0.7719, + "step": 16638 + }, + { + "epoch": 2.962428774928775, + "grad_norm": 0.8263829946517944, + "learning_rate": 3.149999812201182e-05, + 
"loss": 0.8561, + "step": 16639 + }, + { + "epoch": 2.9626068376068377, + "grad_norm": 0.9083819389343262, + "learning_rate": 3.148980103205027e-05, + "loss": 0.9319, + "step": 16640 + }, + { + "epoch": 2.9627849002849, + "grad_norm": 0.9346078038215637, + "learning_rate": 3.147960528440372e-05, + "loss": 0.8036, + "step": 16641 + }, + { + "epoch": 2.962962962962963, + "grad_norm": 0.907319188117981, + "learning_rate": 3.146941087927203e-05, + "loss": 0.9228, + "step": 16642 + }, + { + "epoch": 2.9631410256410255, + "grad_norm": 0.7912126183509827, + "learning_rate": 3.1459217816854815e-05, + "loss": 0.7996, + "step": 16643 + }, + { + "epoch": 2.9633190883190883, + "grad_norm": 0.8844919800758362, + "learning_rate": 3.1449026097351896e-05, + "loss": 1.0235, + "step": 16644 + }, + { + "epoch": 2.963497150997151, + "grad_norm": 0.7468230128288269, + "learning_rate": 3.143883572096286e-05, + "loss": 0.7292, + "step": 16645 + }, + { + "epoch": 2.963675213675214, + "grad_norm": 0.8521941900253296, + "learning_rate": 3.142864668788744e-05, + "loss": 0.9012, + "step": 16646 + }, + { + "epoch": 2.9638532763532766, + "grad_norm": 0.9340695738792419, + "learning_rate": 3.141845899832524e-05, + "loss": 0.8736, + "step": 16647 + }, + { + "epoch": 2.9640313390313393, + "grad_norm": 0.859395444393158, + "learning_rate": 3.140827265247588e-05, + "loss": 0.796, + "step": 16648 + }, + { + "epoch": 2.9642094017094016, + "grad_norm": 0.8320850729942322, + "learning_rate": 3.139808765053892e-05, + "loss": 0.86, + "step": 16649 + }, + { + "epoch": 2.9643874643874644, + "grad_norm": 0.8911257386207581, + "learning_rate": 3.138790399271393e-05, + "loss": 0.966, + "step": 16650 + }, + { + "epoch": 2.964565527065527, + "grad_norm": 0.8817025423049927, + "learning_rate": 3.13777216792004e-05, + "loss": 0.983, + "step": 16651 + }, + { + "epoch": 2.96474358974359, + "grad_norm": 0.7765538692474365, + "learning_rate": 3.136754071019793e-05, + "loss": 0.643, + "step": 16652 + }, + { + "epoch": 
2.964921652421652, + "grad_norm": 0.7961843609809875, + "learning_rate": 3.135736108590586e-05, + "loss": 0.733, + "step": 16653 + }, + { + "epoch": 2.965099715099715, + "grad_norm": 0.7910877466201782, + "learning_rate": 3.134718280652373e-05, + "loss": 0.7291, + "step": 16654 + }, + { + "epoch": 2.9652777777777777, + "grad_norm": 0.9326547384262085, + "learning_rate": 3.1337005872250956e-05, + "loss": 0.8834, + "step": 16655 + }, + { + "epoch": 2.9654558404558404, + "grad_norm": 0.9362995624542236, + "learning_rate": 3.132683028328691e-05, + "loss": 0.9693, + "step": 16656 + }, + { + "epoch": 2.965633903133903, + "grad_norm": 0.8322434425354004, + "learning_rate": 3.131665603983096e-05, + "loss": 0.9782, + "step": 16657 + }, + { + "epoch": 2.965811965811966, + "grad_norm": 0.8336097598075867, + "learning_rate": 3.130648314208247e-05, + "loss": 0.9995, + "step": 16658 + }, + { + "epoch": 2.9659900284900287, + "grad_norm": 0.8637044429779053, + "learning_rate": 3.129631159024074e-05, + "loss": 0.8291, + "step": 16659 + }, + { + "epoch": 2.9661680911680914, + "grad_norm": 0.7853943109512329, + "learning_rate": 3.128614138450506e-05, + "loss": 0.9016, + "step": 16660 + }, + { + "epoch": 2.9663461538461537, + "grad_norm": 0.9506492614746094, + "learning_rate": 3.1275972525074674e-05, + "loss": 0.9762, + "step": 16661 + }, + { + "epoch": 2.9665242165242165, + "grad_norm": 0.9375113248825073, + "learning_rate": 3.126580501214887e-05, + "loss": 0.8981, + "step": 16662 + }, + { + "epoch": 2.9667022792022792, + "grad_norm": 0.9404717683792114, + "learning_rate": 3.125563884592684e-05, + "loss": 0.7506, + "step": 16663 + }, + { + "epoch": 2.966880341880342, + "grad_norm": 0.9678782820701599, + "learning_rate": 3.124547402660776e-05, + "loss": 0.896, + "step": 16664 + }, + { + "epoch": 2.9670584045584043, + "grad_norm": 0.8530639410018921, + "learning_rate": 3.12353105543908e-05, + "loss": 0.8792, + "step": 16665 + }, + { + "epoch": 2.967236467236467, + "grad_norm": 
0.8015365600585938, + "learning_rate": 3.122514842947504e-05, + "loss": 0.7052, + "step": 16666 + }, + { + "epoch": 2.96741452991453, + "grad_norm": 0.8753054141998291, + "learning_rate": 3.121498765205969e-05, + "loss": 0.9032, + "step": 16667 + }, + { + "epoch": 2.9675925925925926, + "grad_norm": 1.0053389072418213, + "learning_rate": 3.1204828222343716e-05, + "loss": 1.0236, + "step": 16668 + }, + { + "epoch": 2.9677706552706553, + "grad_norm": 0.8506449460983276, + "learning_rate": 3.119467014052628e-05, + "loss": 0.6762, + "step": 16669 + }, + { + "epoch": 2.967948717948718, + "grad_norm": 0.9376404881477356, + "learning_rate": 3.118451340680629e-05, + "loss": 0.9391, + "step": 16670 + }, + { + "epoch": 2.968126780626781, + "grad_norm": 0.8381097912788391, + "learning_rate": 3.117435802138284e-05, + "loss": 0.7343, + "step": 16671 + }, + { + "epoch": 2.9683048433048436, + "grad_norm": 0.8525682091712952, + "learning_rate": 3.116420398445488e-05, + "loss": 0.7928, + "step": 16672 + }, + { + "epoch": 2.968482905982906, + "grad_norm": 0.9399489760398865, + "learning_rate": 3.115405129622133e-05, + "loss": 0.975, + "step": 16673 + }, + { + "epoch": 2.9686609686609686, + "grad_norm": 0.8394346237182617, + "learning_rate": 3.114389995688114e-05, + "loss": 0.7591, + "step": 16674 + }, + { + "epoch": 2.9688390313390314, + "grad_norm": 0.8935068845748901, + "learning_rate": 3.11337499666332e-05, + "loss": 0.8899, + "step": 16675 + }, + { + "epoch": 2.969017094017094, + "grad_norm": 0.8111040592193604, + "learning_rate": 3.112360132567633e-05, + "loss": 0.795, + "step": 16676 + }, + { + "epoch": 2.9691951566951564, + "grad_norm": 0.8854177594184875, + "learning_rate": 3.1113454034209486e-05, + "loss": 1.0677, + "step": 16677 + }, + { + "epoch": 2.969373219373219, + "grad_norm": 0.9821479916572571, + "learning_rate": 3.110330809243134e-05, + "loss": 0.7859, + "step": 16678 + }, + { + "epoch": 2.969551282051282, + "grad_norm": 0.9066275954246521, + "learning_rate": 
3.109316350054079e-05, + "loss": 1.1727, + "step": 16679 + }, + { + "epoch": 2.9697293447293447, + "grad_norm": 0.981238603591919, + "learning_rate": 3.108302025873656e-05, + "loss": 1.0036, + "step": 16680 + }, + { + "epoch": 2.9699074074074074, + "grad_norm": 0.8290690779685974, + "learning_rate": 3.107287836721737e-05, + "loss": 0.911, + "step": 16681 + }, + { + "epoch": 2.97008547008547, + "grad_norm": 0.8419190049171448, + "learning_rate": 3.106273782618196e-05, + "loss": 0.688, + "step": 16682 + }, + { + "epoch": 2.970263532763533, + "grad_norm": 0.9250679612159729, + "learning_rate": 3.1052598635828964e-05, + "loss": 0.8506, + "step": 16683 + }, + { + "epoch": 2.9704415954415957, + "grad_norm": 0.9414278864860535, + "learning_rate": 3.104246079635713e-05, + "loss": 0.8501, + "step": 16684 + }, + { + "epoch": 2.970619658119658, + "grad_norm": 0.9107208847999573, + "learning_rate": 3.1032324307964974e-05, + "loss": 0.8234, + "step": 16685 + }, + { + "epoch": 2.9707977207977208, + "grad_norm": 0.8691245317459106, + "learning_rate": 3.102218917085119e-05, + "loss": 0.9341, + "step": 16686 + }, + { + "epoch": 2.9709757834757835, + "grad_norm": 0.8816282153129578, + "learning_rate": 3.101205538521431e-05, + "loss": 0.9412, + "step": 16687 + }, + { + "epoch": 2.9711538461538463, + "grad_norm": 0.8969736695289612, + "learning_rate": 3.100192295125289e-05, + "loss": 0.9468, + "step": 16688 + }, + { + "epoch": 2.9713319088319086, + "grad_norm": 1.0070735216140747, + "learning_rate": 3.099179186916548e-05, + "loss": 0.8587, + "step": 16689 + }, + { + "epoch": 2.9715099715099713, + "grad_norm": 0.9801154136657715, + "learning_rate": 3.0981662139150537e-05, + "loss": 0.8999, + "step": 16690 + }, + { + "epoch": 2.971688034188034, + "grad_norm": 0.9949473142623901, + "learning_rate": 3.0971533761406526e-05, + "loss": 1.0552, + "step": 16691 + }, + { + "epoch": 2.971866096866097, + "grad_norm": 0.9654440879821777, + "learning_rate": 3.096140673613198e-05, + "loss": 0.8618, 
+ "step": 16692 + }, + { + "epoch": 2.9720441595441596, + "grad_norm": 0.9247317910194397, + "learning_rate": 3.0951281063525185e-05, + "loss": 0.8915, + "step": 16693 + }, + { + "epoch": 2.9722222222222223, + "grad_norm": 1.0254271030426025, + "learning_rate": 3.094115674378467e-05, + "loss": 0.9098, + "step": 16694 + }, + { + "epoch": 2.972400284900285, + "grad_norm": 0.7822396159172058, + "learning_rate": 3.0931033777108666e-05, + "loss": 0.8497, + "step": 16695 + }, + { + "epoch": 2.972578347578348, + "grad_norm": 0.9368909001350403, + "learning_rate": 3.092091216369561e-05, + "loss": 0.8978, + "step": 16696 + }, + { + "epoch": 2.97275641025641, + "grad_norm": 1.0181694030761719, + "learning_rate": 3.0910791903743786e-05, + "loss": 1.0605, + "step": 16697 + }, + { + "epoch": 2.972934472934473, + "grad_norm": 0.8718426823616028, + "learning_rate": 3.090067299745146e-05, + "loss": 0.9844, + "step": 16698 + }, + { + "epoch": 2.9731125356125356, + "grad_norm": 0.8327153325080872, + "learning_rate": 3.089055544501692e-05, + "loss": 0.8795, + "step": 16699 + }, + { + "epoch": 2.9732905982905984, + "grad_norm": 0.7970293164253235, + "learning_rate": 3.088043924663838e-05, + "loss": 0.794, + "step": 16700 + }, + { + "epoch": 2.9734686609686607, + "grad_norm": 0.8885688185691833, + "learning_rate": 3.0870324402514014e-05, + "loss": 0.984, + "step": 16701 + }, + { + "epoch": 2.9736467236467234, + "grad_norm": 0.9126071333885193, + "learning_rate": 3.086021091284207e-05, + "loss": 0.8962, + "step": 16702 + }, + { + "epoch": 2.973824786324786, + "grad_norm": 0.9220738410949707, + "learning_rate": 3.085009877782067e-05, + "loss": 0.9129, + "step": 16703 + }, + { + "epoch": 2.974002849002849, + "grad_norm": 0.8738197684288025, + "learning_rate": 3.0839987997647935e-05, + "loss": 0.871, + "step": 16704 + }, + { + "epoch": 2.9741809116809117, + "grad_norm": 0.7505079507827759, + "learning_rate": 3.0829878572521964e-05, + "loss": 0.7934, + "step": 16705 + }, + { + "epoch": 
2.9743589743589745, + "grad_norm": 0.8328925371170044, + "learning_rate": 3.081977050264084e-05, + "loss": 0.8431, + "step": 16706 + }, + { + "epoch": 2.974537037037037, + "grad_norm": 0.8248029947280884, + "learning_rate": 3.08096637882026e-05, + "loss": 0.7877, + "step": 16707 + }, + { + "epoch": 2.9747150997151, + "grad_norm": 1.1086161136627197, + "learning_rate": 3.079955842940524e-05, + "loss": 1.1196, + "step": 16708 + }, + { + "epoch": 2.9748931623931623, + "grad_norm": 0.8271856307983398, + "learning_rate": 3.078945442644684e-05, + "loss": 0.946, + "step": 16709 + }, + { + "epoch": 2.975071225071225, + "grad_norm": 0.9062432050704956, + "learning_rate": 3.0779351779525246e-05, + "loss": 0.9369, + "step": 16710 + }, + { + "epoch": 2.9752492877492878, + "grad_norm": 0.9849454760551453, + "learning_rate": 3.0769250488838496e-05, + "loss": 0.7748, + "step": 16711 + }, + { + "epoch": 2.9754273504273505, + "grad_norm": 0.8420920372009277, + "learning_rate": 3.075915055458447e-05, + "loss": 0.9954, + "step": 16712 + }, + { + "epoch": 2.9756054131054133, + "grad_norm": 1.027130126953125, + "learning_rate": 3.074905197696104e-05, + "loss": 1.1053, + "step": 16713 + }, + { + "epoch": 2.9757834757834756, + "grad_norm": 0.8968556523323059, + "learning_rate": 3.073895475616609e-05, + "loss": 0.7487, + "step": 16714 + }, + { + "epoch": 2.9759615384615383, + "grad_norm": 0.9237984418869019, + "learning_rate": 3.072885889239745e-05, + "loss": 1.2853, + "step": 16715 + }, + { + "epoch": 2.976139601139601, + "grad_norm": 0.7972691059112549, + "learning_rate": 3.071876438585292e-05, + "loss": 0.6724, + "step": 16716 + }, + { + "epoch": 2.976317663817664, + "grad_norm": 0.8158389925956726, + "learning_rate": 3.0708671236730266e-05, + "loss": 0.8302, + "step": 16717 + }, + { + "epoch": 2.9764957264957266, + "grad_norm": 0.7947682738304138, + "learning_rate": 3.0698579445227236e-05, + "loss": 0.7817, + "step": 16718 + }, + { + "epoch": 2.9766737891737893, + "grad_norm": 
0.847747802734375, + "learning_rate": 3.068848901154166e-05, + "loss": 0.8391, + "step": 16719 + }, + { + "epoch": 2.976851851851852, + "grad_norm": 0.7906867265701294, + "learning_rate": 3.067839993587107e-05, + "loss": 0.8935, + "step": 16720 + }, + { + "epoch": 2.9770299145299144, + "grad_norm": 0.9028356075286865, + "learning_rate": 3.066831221841328e-05, + "loss": 0.821, + "step": 16721 + }, + { + "epoch": 2.977207977207977, + "grad_norm": 0.8177092671394348, + "learning_rate": 3.065822585936589e-05, + "loss": 0.7956, + "step": 16722 + }, + { + "epoch": 2.97738603988604, + "grad_norm": 0.771271824836731, + "learning_rate": 3.064814085892647e-05, + "loss": 0.8675, + "step": 16723 + }, + { + "epoch": 2.9775641025641026, + "grad_norm": 0.8575150966644287, + "learning_rate": 3.063805721729274e-05, + "loss": 0.6999, + "step": 16724 + }, + { + "epoch": 2.9777421652421654, + "grad_norm": 0.8909936547279358, + "learning_rate": 3.062797493466212e-05, + "loss": 0.9325, + "step": 16725 + }, + { + "epoch": 2.9779202279202277, + "grad_norm": 0.836875319480896, + "learning_rate": 3.061789401123226e-05, + "loss": 0.9276, + "step": 16726 + }, + { + "epoch": 2.9780982905982905, + "grad_norm": 0.8965426683425903, + "learning_rate": 3.060781444720065e-05, + "loss": 0.7728, + "step": 16727 + }, + { + "epoch": 2.978276353276353, + "grad_norm": 0.8298100829124451, + "learning_rate": 3.059773624276475e-05, + "loss": 0.926, + "step": 16728 + }, + { + "epoch": 2.978454415954416, + "grad_norm": 0.9369875192642212, + "learning_rate": 3.058765939812204e-05, + "loss": 0.8598, + "step": 16729 + }, + { + "epoch": 2.9786324786324787, + "grad_norm": 1.019013524055481, + "learning_rate": 3.057758391346995e-05, + "loss": 0.9075, + "step": 16730 + }, + { + "epoch": 2.9788105413105415, + "grad_norm": 0.8597549200057983, + "learning_rate": 3.05675097890059e-05, + "loss": 0.8279, + "step": 16731 + }, + { + "epoch": 2.978988603988604, + "grad_norm": 0.8331323862075806, + "learning_rate": 
3.055743702492726e-05, + "loss": 0.8366, + "step": 16732 + }, + { + "epoch": 2.9791666666666665, + "grad_norm": 0.7987640500068665, + "learning_rate": 3.054736562143135e-05, + "loss": 0.7618, + "step": 16733 + }, + { + "epoch": 2.9793447293447293, + "grad_norm": 0.84430330991745, + "learning_rate": 3.0537295578715606e-05, + "loss": 0.9791, + "step": 16734 + }, + { + "epoch": 2.979522792022792, + "grad_norm": 0.7811259627342224, + "learning_rate": 3.052722689697719e-05, + "loss": 0.7342, + "step": 16735 + }, + { + "epoch": 2.9797008547008548, + "grad_norm": 0.8311488032341003, + "learning_rate": 3.0517159576413477e-05, + "loss": 0.64, + "step": 16736 + }, + { + "epoch": 2.9798789173789175, + "grad_norm": 0.8865106105804443, + "learning_rate": 3.0507093617221683e-05, + "loss": 0.9196, + "step": 16737 + }, + { + "epoch": 2.98005698005698, + "grad_norm": 0.7360345125198364, + "learning_rate": 3.0497029019599033e-05, + "loss": 0.8709, + "step": 16738 + }, + { + "epoch": 2.9802350427350426, + "grad_norm": 0.8705546855926514, + "learning_rate": 3.0486965783742726e-05, + "loss": 0.7567, + "step": 16739 + }, + { + "epoch": 2.9804131054131053, + "grad_norm": 0.8622585535049438, + "learning_rate": 3.0476903909849908e-05, + "loss": 0.7909, + "step": 16740 + }, + { + "epoch": 2.980591168091168, + "grad_norm": 0.7877208590507507, + "learning_rate": 3.046684339811775e-05, + "loss": 0.7046, + "step": 16741 + }, + { + "epoch": 2.980769230769231, + "grad_norm": 0.915133535861969, + "learning_rate": 3.045678424874332e-05, + "loss": 0.9142, + "step": 16742 + }, + { + "epoch": 2.9809472934472936, + "grad_norm": 0.8539748191833496, + "learning_rate": 3.044672646192377e-05, + "loss": 0.9333, + "step": 16743 + }, + { + "epoch": 2.9811253561253563, + "grad_norm": 1.1402504444122314, + "learning_rate": 3.043667003785612e-05, + "loss": 0.8041, + "step": 16744 + }, + { + "epoch": 2.9813034188034186, + "grad_norm": 0.7355749607086182, + "learning_rate": 3.042661497673742e-05, + "loss": 0.8057, 
+ "step": 16745 + }, + { + "epoch": 2.9814814814814814, + "grad_norm": 0.8524863719940186, + "learning_rate": 3.041656127876469e-05, + "loss": 0.7118, + "step": 16746 + }, + { + "epoch": 2.981659544159544, + "grad_norm": 0.9434519410133362, + "learning_rate": 3.040650894413487e-05, + "loss": 0.8104, + "step": 16747 + }, + { + "epoch": 2.981837606837607, + "grad_norm": 0.8716524839401245, + "learning_rate": 3.0396457973044923e-05, + "loss": 0.7502, + "step": 16748 + }, + { + "epoch": 2.9820156695156697, + "grad_norm": 0.942481279373169, + "learning_rate": 3.0386408365691855e-05, + "loss": 0.8506, + "step": 16749 + }, + { + "epoch": 2.982193732193732, + "grad_norm": 0.7921879291534424, + "learning_rate": 3.037636012227244e-05, + "loss": 0.7005, + "step": 16750 + }, + { + "epoch": 2.9823717948717947, + "grad_norm": 0.8415821194648743, + "learning_rate": 3.0366313242983645e-05, + "loss": 0.987, + "step": 16751 + }, + { + "epoch": 2.9825498575498575, + "grad_norm": 0.9556678533554077, + "learning_rate": 3.0356267728022293e-05, + "loss": 0.9198, + "step": 16752 + }, + { + "epoch": 2.98272792022792, + "grad_norm": 0.856810450553894, + "learning_rate": 3.0346223577585208e-05, + "loss": 0.9634, + "step": 16753 + }, + { + "epoch": 2.982905982905983, + "grad_norm": 0.8425500392913818, + "learning_rate": 3.0336180791869172e-05, + "loss": 0.8725, + "step": 16754 + }, + { + "epoch": 2.9830840455840457, + "grad_norm": 0.773210883140564, + "learning_rate": 3.032613937107096e-05, + "loss": 0.8363, + "step": 16755 + }, + { + "epoch": 2.9832621082621085, + "grad_norm": 0.7771822214126587, + "learning_rate": 3.0316099315387324e-05, + "loss": 0.8586, + "step": 16756 + }, + { + "epoch": 2.9834401709401708, + "grad_norm": 0.8261004686355591, + "learning_rate": 3.0306060625014954e-05, + "loss": 0.8185, + "step": 16757 + }, + { + "epoch": 2.9836182336182335, + "grad_norm": 1.074456810951233, + "learning_rate": 3.029602330015052e-05, + "loss": 0.9008, + "step": 16758 + }, + { + "epoch": 
2.9837962962962963, + "grad_norm": 0.8160609006881714, + "learning_rate": 3.0285987340990774e-05, + "loss": 0.8494, + "step": 16759 + }, + { + "epoch": 2.983974358974359, + "grad_norm": 0.8296660780906677, + "learning_rate": 3.0275952747732227e-05, + "loss": 0.9089, + "step": 16760 + }, + { + "epoch": 2.984152421652422, + "grad_norm": 0.815726637840271, + "learning_rate": 3.0265919520571572e-05, + "loss": 0.8605, + "step": 16761 + }, + { + "epoch": 2.984330484330484, + "grad_norm": 0.7600420117378235, + "learning_rate": 3.0255887659705374e-05, + "loss": 0.6833, + "step": 16762 + }, + { + "epoch": 2.984508547008547, + "grad_norm": 0.8039379119873047, + "learning_rate": 3.024585716533014e-05, + "loss": 0.8156, + "step": 16763 + }, + { + "epoch": 2.9846866096866096, + "grad_norm": 0.8797104358673096, + "learning_rate": 3.02358280376425e-05, + "loss": 0.9965, + "step": 16764 + }, + { + "epoch": 2.9848646723646723, + "grad_norm": 0.8681818246841431, + "learning_rate": 3.0225800276838822e-05, + "loss": 0.8967, + "step": 16765 + }, + { + "epoch": 2.985042735042735, + "grad_norm": 0.804710328578949, + "learning_rate": 3.0215773883115706e-05, + "loss": 0.7424, + "step": 16766 + }, + { + "epoch": 2.985220797720798, + "grad_norm": 0.7346279621124268, + "learning_rate": 3.0205748856669467e-05, + "loss": 0.6986, + "step": 16767 + }, + { + "epoch": 2.9853988603988606, + "grad_norm": 0.8896076083183289, + "learning_rate": 3.0195725197696634e-05, + "loss": 0.8472, + "step": 16768 + }, + { + "epoch": 2.9855769230769234, + "grad_norm": 0.8281870484352112, + "learning_rate": 3.0185702906393555e-05, + "loss": 0.9873, + "step": 16769 + }, + { + "epoch": 2.9857549857549857, + "grad_norm": 0.7376837134361267, + "learning_rate": 3.0175681982956607e-05, + "loss": 1.0171, + "step": 16770 + }, + { + "epoch": 2.9859330484330484, + "grad_norm": 0.9346923828125, + "learning_rate": 3.016566242758212e-05, + "loss": 0.8742, + "step": 16771 + }, + { + "epoch": 2.986111111111111, + "grad_norm": 
0.918557345867157, + "learning_rate": 3.0155644240466406e-05, + "loss": 1.0504, + "step": 16772 + }, + { + "epoch": 2.986289173789174, + "grad_norm": 0.8885583281517029, + "learning_rate": 3.0145627421805733e-05, + "loss": 0.7462, + "step": 16773 + }, + { + "epoch": 2.986467236467236, + "grad_norm": 0.8475548028945923, + "learning_rate": 3.013561197179644e-05, + "loss": 0.7353, + "step": 16774 + }, + { + "epoch": 2.986645299145299, + "grad_norm": 0.8925435543060303, + "learning_rate": 3.0125597890634626e-05, + "loss": 0.8462, + "step": 16775 + }, + { + "epoch": 2.9868233618233617, + "grad_norm": 0.923281729221344, + "learning_rate": 3.0115585178516648e-05, + "loss": 0.9725, + "step": 16776 + }, + { + "epoch": 2.9870014245014245, + "grad_norm": 0.9134986996650696, + "learning_rate": 3.0105573835638533e-05, + "loss": 0.9428, + "step": 16777 + }, + { + "epoch": 2.9871794871794872, + "grad_norm": 0.8284963369369507, + "learning_rate": 3.009556386219654e-05, + "loss": 0.9134, + "step": 16778 + }, + { + "epoch": 2.98735754985755, + "grad_norm": 0.9571327567100525, + "learning_rate": 3.0085555258386756e-05, + "loss": 0.8407, + "step": 16779 + }, + { + "epoch": 2.9875356125356127, + "grad_norm": 0.8699741363525391, + "learning_rate": 3.0075548024405254e-05, + "loss": 0.862, + "step": 16780 + }, + { + "epoch": 2.9877136752136755, + "grad_norm": 0.9462196826934814, + "learning_rate": 3.006554216044819e-05, + "loss": 0.8936, + "step": 16781 + }, + { + "epoch": 2.987891737891738, + "grad_norm": 0.9415904879570007, + "learning_rate": 3.0055537666711496e-05, + "loss": 1.0089, + "step": 16782 + }, + { + "epoch": 2.9880698005698005, + "grad_norm": 0.8529258370399475, + "learning_rate": 3.0045534543391275e-05, + "loss": 0.7896, + "step": 16783 + }, + { + "epoch": 2.9882478632478633, + "grad_norm": 0.7967036962509155, + "learning_rate": 3.0035532790683486e-05, + "loss": 0.7661, + "step": 16784 + }, + { + "epoch": 2.988425925925926, + "grad_norm": 1.087546944618225, + 
"learning_rate": 3.0025532408784097e-05, + "loss": 0.7232, + "step": 16785 + }, + { + "epoch": 2.9886039886039883, + "grad_norm": 1.0032312870025635, + "learning_rate": 3.001553339788903e-05, + "loss": 0.8462, + "step": 16786 + }, + { + "epoch": 2.988782051282051, + "grad_norm": 0.7726916074752808, + "learning_rate": 3.0005535758194216e-05, + "loss": 0.8318, + "step": 16787 + }, + { + "epoch": 2.988960113960114, + "grad_norm": 0.8814677000045776, + "learning_rate": 2.999553948989552e-05, + "loss": 0.8058, + "step": 16788 + }, + { + "epoch": 2.9891381766381766, + "grad_norm": 0.8866276741027832, + "learning_rate": 2.9985544593188818e-05, + "loss": 0.8576, + "step": 16789 + }, + { + "epoch": 2.9893162393162394, + "grad_norm": 0.9877329468727112, + "learning_rate": 2.997555106826988e-05, + "loss": 0.8675, + "step": 16790 + }, + { + "epoch": 2.989494301994302, + "grad_norm": 0.842682421207428, + "learning_rate": 2.9965558915334636e-05, + "loss": 1.094, + "step": 16791 + }, + { + "epoch": 2.989672364672365, + "grad_norm": 0.8740942478179932, + "learning_rate": 2.9955568134578703e-05, + "loss": 0.904, + "step": 16792 + }, + { + "epoch": 2.9898504273504276, + "grad_norm": 0.9166956543922424, + "learning_rate": 2.9945578726197944e-05, + "loss": 1.0695, + "step": 16793 + }, + { + "epoch": 2.99002849002849, + "grad_norm": 0.7124375104904175, + "learning_rate": 2.993559069038805e-05, + "loss": 0.6606, + "step": 16794 + }, + { + "epoch": 2.9902065527065527, + "grad_norm": 0.8711931109428406, + "learning_rate": 2.99256040273447e-05, + "loss": 0.9049, + "step": 16795 + }, + { + "epoch": 2.9903846153846154, + "grad_norm": 0.8475961089134216, + "learning_rate": 2.9915618737263584e-05, + "loss": 0.8487, + "step": 16796 + }, + { + "epoch": 2.990562678062678, + "grad_norm": 0.8962427377700806, + "learning_rate": 2.9905634820340324e-05, + "loss": 0.9318, + "step": 16797 + }, + { + "epoch": 2.9907407407407405, + "grad_norm": 0.9120275378227234, + "learning_rate": 
2.9895652276770514e-05, + "loss": 0.8787, + "step": 16798 + }, + { + "epoch": 2.9909188034188032, + "grad_norm": 0.8273578882217407, + "learning_rate": 2.9885671106749822e-05, + "loss": 0.8566, + "step": 16799 + }, + { + "epoch": 2.991096866096866, + "grad_norm": 0.8050937056541443, + "learning_rate": 2.9875691310473697e-05, + "loss": 1.1276, + "step": 16800 + }, + { + "epoch": 2.9912749287749287, + "grad_norm": 0.8434747457504272, + "learning_rate": 2.9865712888137766e-05, + "loss": 0.6286, + "step": 16801 + }, + { + "epoch": 2.9914529914529915, + "grad_norm": 0.7851191759109497, + "learning_rate": 2.9855735839937493e-05, + "loss": 0.8116, + "step": 16802 + }, + { + "epoch": 2.9916310541310542, + "grad_norm": 0.8858240842819214, + "learning_rate": 2.984576016606837e-05, + "loss": 0.8663, + "step": 16803 + }, + { + "epoch": 2.991809116809117, + "grad_norm": 0.8688439726829529, + "learning_rate": 2.9835785866725842e-05, + "loss": 0.9229, + "step": 16804 + }, + { + "epoch": 2.9919871794871797, + "grad_norm": 0.7873746752738953, + "learning_rate": 2.9825812942105302e-05, + "loss": 0.779, + "step": 16805 + }, + { + "epoch": 2.992165242165242, + "grad_norm": 0.9577028751373291, + "learning_rate": 2.9815841392402255e-05, + "loss": 0.7068, + "step": 16806 + }, + { + "epoch": 2.992343304843305, + "grad_norm": 0.8219005465507507, + "learning_rate": 2.980587121781193e-05, + "loss": 0.7712, + "step": 16807 + }, + { + "epoch": 2.9925213675213675, + "grad_norm": 0.9321123957633972, + "learning_rate": 2.9795902418529776e-05, + "loss": 0.9309, + "step": 16808 + }, + { + "epoch": 2.9926994301994303, + "grad_norm": 0.8010317087173462, + "learning_rate": 2.9785934994751074e-05, + "loss": 0.8973, + "step": 16809 + }, + { + "epoch": 2.9928774928774926, + "grad_norm": 0.8819838762283325, + "learning_rate": 2.9775968946671117e-05, + "loss": 0.6899, + "step": 16810 + }, + { + "epoch": 2.9930555555555554, + "grad_norm": 0.8337511420249939, + "learning_rate": 2.976600427448518e-05, + 
"loss": 0.8005, + "step": 16811 + }, + { + "epoch": 2.993233618233618, + "grad_norm": 1.0560593605041504, + "learning_rate": 2.975604097838849e-05, + "loss": 0.8804, + "step": 16812 + }, + { + "epoch": 2.993411680911681, + "grad_norm": 0.8482444882392883, + "learning_rate": 2.9746079058576247e-05, + "loss": 0.8606, + "step": 16813 + }, + { + "epoch": 2.9935897435897436, + "grad_norm": 0.8289902210235596, + "learning_rate": 2.973611851524366e-05, + "loss": 0.8163, + "step": 16814 + }, + { + "epoch": 2.9937678062678064, + "grad_norm": 0.780939519405365, + "learning_rate": 2.9726159348585826e-05, + "loss": 0.7041, + "step": 16815 + }, + { + "epoch": 2.993945868945869, + "grad_norm": 0.8907291889190674, + "learning_rate": 2.9716201558797984e-05, + "loss": 0.6484, + "step": 16816 + }, + { + "epoch": 2.994123931623932, + "grad_norm": 0.9351929426193237, + "learning_rate": 2.9706245146075118e-05, + "loss": 0.7232, + "step": 16817 + }, + { + "epoch": 2.994301994301994, + "grad_norm": 0.934117317199707, + "learning_rate": 2.969629011061238e-05, + "loss": 0.9189, + "step": 16818 + }, + { + "epoch": 2.994480056980057, + "grad_norm": 0.8751780390739441, + "learning_rate": 2.968633645260479e-05, + "loss": 0.9695, + "step": 16819 + }, + { + "epoch": 2.9946581196581197, + "grad_norm": 0.921024739742279, + "learning_rate": 2.967638417224734e-05, + "loss": 0.9011, + "step": 16820 + }, + { + "epoch": 2.9948361823361824, + "grad_norm": 0.7881745100021362, + "learning_rate": 2.9666433269735126e-05, + "loss": 0.8001, + "step": 16821 + }, + { + "epoch": 2.995014245014245, + "grad_norm": 0.8107823729515076, + "learning_rate": 2.9656483745262985e-05, + "loss": 0.9239, + "step": 16822 + }, + { + "epoch": 2.9951923076923075, + "grad_norm": 0.8491915464401245, + "learning_rate": 2.964653559902595e-05, + "loss": 0.8193, + "step": 16823 + }, + { + "epoch": 2.9953703703703702, + "grad_norm": 0.9521864056587219, + "learning_rate": 2.9636588831218904e-05, + "loss": 1.0745, + "step": 16824 + }, + 
{ + "epoch": 2.995548433048433, + "grad_norm": 0.9784126281738281, + "learning_rate": 2.9626643442036727e-05, + "loss": 0.9899, + "step": 16825 + }, + { + "epoch": 2.9957264957264957, + "grad_norm": 0.8035039305686951, + "learning_rate": 2.961669943167429e-05, + "loss": 0.9172, + "step": 16826 + }, + { + "epoch": 2.9959045584045585, + "grad_norm": 0.9292920827865601, + "learning_rate": 2.9606756800326408e-05, + "loss": 0.9228, + "step": 16827 + }, + { + "epoch": 2.9960826210826212, + "grad_norm": 0.9243139028549194, + "learning_rate": 2.9596815548187908e-05, + "loss": 0.8698, + "step": 16828 + }, + { + "epoch": 2.996260683760684, + "grad_norm": 0.9847014546394348, + "learning_rate": 2.958687567545355e-05, + "loss": 0.8885, + "step": 16829 + }, + { + "epoch": 2.9964387464387463, + "grad_norm": 0.9667131900787354, + "learning_rate": 2.9576937182318078e-05, + "loss": 0.7951, + "step": 16830 + }, + { + "epoch": 2.996616809116809, + "grad_norm": 0.8431822061538696, + "learning_rate": 2.956700006897628e-05, + "loss": 1.0645, + "step": 16831 + }, + { + "epoch": 2.996794871794872, + "grad_norm": 0.7381225228309631, + "learning_rate": 2.955706433562274e-05, + "loss": 0.5631, + "step": 16832 + }, + { + "epoch": 2.9969729344729346, + "grad_norm": 0.7975440621376038, + "learning_rate": 2.9547129982452228e-05, + "loss": 0.8324, + "step": 16833 + }, + { + "epoch": 2.9971509971509973, + "grad_norm": 0.9004024863243103, + "learning_rate": 2.9537197009659344e-05, + "loss": 1.045, + "step": 16834 + }, + { + "epoch": 2.9973290598290596, + "grad_norm": 0.786651074886322, + "learning_rate": 2.952726541743871e-05, + "loss": 0.7569, + "step": 16835 + }, + { + "epoch": 2.9975071225071224, + "grad_norm": 0.8053869605064392, + "learning_rate": 2.951733520598492e-05, + "loss": 0.8327, + "step": 16836 + }, + { + "epoch": 2.997685185185185, + "grad_norm": 0.8725607991218567, + "learning_rate": 2.9507406375492543e-05, + "loss": 0.7702, + "step": 16837 + }, + { + "epoch": 2.997863247863248, + 
"grad_norm": 0.9187145233154297, + "learning_rate": 2.9497478926156087e-05, + "loss": 0.8946, + "step": 16838 + }, + { + "epoch": 2.9980413105413106, + "grad_norm": 0.9324785470962524, + "learning_rate": 2.9487552858170076e-05, + "loss": 0.8669, + "step": 16839 + }, + { + "epoch": 2.9982193732193734, + "grad_norm": 0.8439409136772156, + "learning_rate": 2.9477628171728964e-05, + "loss": 0.7187, + "step": 16840 + }, + { + "epoch": 2.998397435897436, + "grad_norm": 0.7890669703483582, + "learning_rate": 2.9467704867027258e-05, + "loss": 0.7856, + "step": 16841 + }, + { + "epoch": 2.9985754985754984, + "grad_norm": 0.7931693196296692, + "learning_rate": 2.9457782944259362e-05, + "loss": 0.7212, + "step": 16842 + }, + { + "epoch": 2.998753561253561, + "grad_norm": 0.8256354331970215, + "learning_rate": 2.9447862403619665e-05, + "loss": 0.8255, + "step": 16843 + }, + { + "epoch": 2.998931623931624, + "grad_norm": 0.7902492880821228, + "learning_rate": 2.9437943245302547e-05, + "loss": 0.8298, + "step": 16844 + }, + { + "epoch": 2.9991096866096867, + "grad_norm": 1.0743845701217651, + "learning_rate": 2.9428025469502318e-05, + "loss": 0.92, + "step": 16845 + }, + { + "epoch": 2.9992877492877494, + "grad_norm": 0.9262487292289734, + "learning_rate": 2.9418109076413404e-05, + "loss": 0.9108, + "step": 16846 + }, + { + "epoch": 2.9994658119658117, + "grad_norm": 0.855722963809967, + "learning_rate": 2.9408194066229944e-05, + "loss": 0.8596, + "step": 16847 + }, + { + "epoch": 2.9996438746438745, + "grad_norm": 0.8279969692230225, + "learning_rate": 2.939828043914634e-05, + "loss": 0.8913, + "step": 16848 + }, + { + "epoch": 2.9996438746438745, + "eval_loss": 1.1308976411819458, + "eval_runtime": 24.7916, + "eval_samples_per_second": 41.99, + "eval_steps_per_second": 21.015, + "step": 16848 + } + ], + "logging_steps": 1, + "max_steps": 22464, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 5616, + "stateful_callbacks": { + "TrainerControl": { + "args": 
{ + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.50817528379392e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-16848/training_args.bin b/checkpoint-16848/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..1245f6a2afbe9a6eefbb6d141231d555e0b0bf84 --- /dev/null +++ b/checkpoint-16848/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86de370014ed2be86ea27c820b434ceec5e097da2b5f9b08d0eac9aa564d8961 +size 6200 diff --git a/checkpoint-22464/README.md b/checkpoint-22464/README.md new file mode 100644 index 0000000000000000000000000000000000000000..719b4726992f7d0707a4253e9123dec35e4de390 --- /dev/null +++ b/checkpoint-22464/README.md @@ -0,0 +1,202 @@ +--- +base_model: openlm-research/open_llama_3b_v2 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, 
biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/checkpoint-22464/adapter_config.json b/checkpoint-22464/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6b6f20a570fc808390da3f2e001093ac1e56c1da --- /dev/null +++ b/checkpoint-22464/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "openlm-research/open_llama_3b_v2", + "bias": "none", + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "down_proj", + "o_proj", + "q_proj", + "up_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git 
a/checkpoint-22464/adapter_model.safetensors b/checkpoint-22464/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d0a93a30f0bf40bec09a58b6ccc07491c3553cfe --- /dev/null +++ b/checkpoint-22464/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:930ff2e666a5de6fb8ff8ade97c1215a559ff9fb63379555eedaffda38295a26 +size 50899792 diff --git a/checkpoint-22464/optimizer.pt b/checkpoint-22464/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..4cef3c803b0c82cbbc48b1f436cdf6413d841409 --- /dev/null +++ b/checkpoint-22464/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f1587d9d9345add3c41e800393eca55316e9950b5cf74334903865c353653cc +size 26231684 diff --git a/checkpoint-22464/rng_state.pth b/checkpoint-22464/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..d726c03142c02ea5fd2cae01a48760edaf4acd07 --- /dev/null +++ b/checkpoint-22464/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68c0a7bb5c807ad5bc1c4dff28401d8ba88a8d5d80ef889d2be2ca17beb56b13 +size 14244 diff --git a/checkpoint-22464/scheduler.pt b/checkpoint-22464/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..a7dc01f8cd8f408ff40f1bf6a8019e6ed38eec82 --- /dev/null +++ b/checkpoint-22464/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90ea528e0afe11e8605def0efb5c9acd5db48044014557508bbd87d9badb7309 +size 1064 diff --git a/checkpoint-22464/special_tokens_map.json b/checkpoint-22464/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..72ecfeeb7e14d244c936169d2ed139eeae235ef1 --- /dev/null +++ b/checkpoint-22464/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { 
+ "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-22464/tokenizer.model b/checkpoint-22464/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..98866ff8ae3631f331c57923c921a0c9ad22b97d --- /dev/null +++ b/checkpoint-22464/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91b289e85fa20fd375d8b33dc12f77616f18abc6359804471d1fafcb425fecb8 +size 511574 diff --git a/checkpoint-22464/tokenizer_config.json b/checkpoint-22464/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c218d1b7228e3ad6055bdcf0ec15c4f188dc7d79 --- /dev/null +++ b/checkpoint-22464/tokenizer_config.json @@ -0,0 +1,43 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": true, + "model_max_length": 2048, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false, + "use_fast": true +} diff --git a/checkpoint-22464/trainer_state.json b/checkpoint-22464/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d921e7eefc054473118d8e98b9155471da13c5d8 --- /dev/null +++ 
b/checkpoint-22464/trainer_state.json @@ -0,0 +1,157417 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.9994658119658117, + "eval_steps": 1404, + "global_step": 22464, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00017806267806267807, + "grad_norm": 0.2854898273944855, + "learning_rate": 1e-05, + "loss": 1.1997, + "step": 1 + }, + { + "epoch": 0.00017806267806267807, + "eval_loss": 1.3698358535766602, + "eval_runtime": 24.1591, + "eval_samples_per_second": 43.089, + "eval_steps_per_second": 21.565, + "step": 1 + }, + { + "epoch": 0.00035612535612535614, + "grad_norm": 0.3508087396621704, + "learning_rate": 2e-05, + "loss": 1.4134, + "step": 2 + }, + { + "epoch": 0.0005341880341880342, + "grad_norm": 0.27050870656967163, + "learning_rate": 3e-05, + "loss": 1.3447, + "step": 3 + }, + { + "epoch": 0.0007122507122507123, + "grad_norm": 0.27706292271614075, + "learning_rate": 4e-05, + "loss": 1.0354, + "step": 4 + }, + { + "epoch": 0.0008903133903133903, + "grad_norm": 0.30398961901664734, + "learning_rate": 5e-05, + "loss": 1.1441, + "step": 5 + }, + { + "epoch": 0.0010683760683760685, + "grad_norm": 0.3103881776332855, + "learning_rate": 6e-05, + "loss": 1.341, + "step": 6 + }, + { + "epoch": 0.0012464387464387464, + "grad_norm": 0.5191189646720886, + "learning_rate": 7e-05, + "loss": 1.3457, + "step": 7 + }, + { + "epoch": 0.0014245014245014246, + "grad_norm": 0.4449467360973358, + "learning_rate": 8e-05, + "loss": 1.5051, + "step": 8 + }, + { + "epoch": 0.0016025641025641025, + "grad_norm": 0.3914581537246704, + "learning_rate": 9e-05, + "loss": 1.5525, + "step": 9 + }, + { + "epoch": 0.0017806267806267807, + "grad_norm": 0.37746086716651917, + "learning_rate": 0.0001, + "loss": 1.3266, + "step": 10 + }, + { + "epoch": 0.001958689458689459, + "grad_norm": 0.35226109623908997, + "learning_rate": 0.00011000000000000002, + "loss": 1.5416, + "step": 11 + 
}, + { + "epoch": 0.002136752136752137, + "grad_norm": 0.3343672454357147, + "learning_rate": 0.00012, + "loss": 1.3221, + "step": 12 + }, + { + "epoch": 0.0023148148148148147, + "grad_norm": 0.47298333048820496, + "learning_rate": 0.00013000000000000002, + "loss": 1.2999, + "step": 13 + }, + { + "epoch": 0.002492877492877493, + "grad_norm": 0.377814918756485, + "learning_rate": 0.00014, + "loss": 1.1688, + "step": 14 + }, + { + "epoch": 0.002670940170940171, + "grad_norm": 0.46344801783561707, + "learning_rate": 0.00015000000000000001, + "loss": 1.3565, + "step": 15 + }, + { + "epoch": 0.002849002849002849, + "grad_norm": 0.49615249037742615, + "learning_rate": 0.00016, + "loss": 1.5692, + "step": 16 + }, + { + "epoch": 0.003027065527065527, + "grad_norm": 0.5109946131706238, + "learning_rate": 0.00017, + "loss": 1.2991, + "step": 17 + }, + { + "epoch": 0.003205128205128205, + "grad_norm": 0.5125070214271545, + "learning_rate": 0.00018, + "loss": 1.3309, + "step": 18 + }, + { + "epoch": 0.003383190883190883, + "grad_norm": 0.4517767131328583, + "learning_rate": 0.00019, + "loss": 1.357, + "step": 19 + }, + { + "epoch": 0.0035612535612535613, + "grad_norm": 0.47267794609069824, + "learning_rate": 0.0002, + "loss": 1.1301, + "step": 20 + }, + { + "epoch": 0.0037393162393162395, + "grad_norm": 0.46823424100875854, + "learning_rate": 0.00019999999902035388, + "loss": 1.1195, + "step": 21 + }, + { + "epoch": 0.003917378917378918, + "grad_norm": 0.440036803483963, + "learning_rate": 0.00019999999608141548, + "loss": 1.2822, + "step": 22 + }, + { + "epoch": 0.004095441595441595, + "grad_norm": 0.371101975440979, + "learning_rate": 0.00019999999118318492, + "loss": 1.132, + "step": 23 + }, + { + "epoch": 0.004273504273504274, + "grad_norm": 0.44691094756126404, + "learning_rate": 0.00019999998432566226, + "loss": 1.2968, + "step": 24 + }, + { + "epoch": 0.004451566951566952, + "grad_norm": 0.5462725162506104, + "learning_rate": 0.0001999999755088476, + "loss": 1.1714, + 
"step": 25 + }, + { + "epoch": 0.004629629629629629, + "grad_norm": 0.39860013127326965, + "learning_rate": 0.0001999999647327412, + "loss": 1.0407, + "step": 26 + }, + { + "epoch": 0.004807692307692308, + "grad_norm": 0.5031934380531311, + "learning_rate": 0.0001999999519973432, + "loss": 1.2773, + "step": 27 + }, + { + "epoch": 0.004985754985754986, + "grad_norm": 0.42162764072418213, + "learning_rate": 0.0001999999373026539, + "loss": 1.2824, + "step": 28 + }, + { + "epoch": 0.005163817663817663, + "grad_norm": 0.40964868664741516, + "learning_rate": 0.00019999992064867353, + "loss": 1.226, + "step": 29 + }, + { + "epoch": 0.005341880341880342, + "grad_norm": 0.41650915145874023, + "learning_rate": 0.00019999990203540245, + "loss": 1.2677, + "step": 30 + }, + { + "epoch": 0.00551994301994302, + "grad_norm": 0.40052226185798645, + "learning_rate": 0.00019999988146284103, + "loss": 0.9443, + "step": 31 + }, + { + "epoch": 0.005698005698005698, + "grad_norm": 0.5198387503623962, + "learning_rate": 0.00019999985893098964, + "loss": 1.3043, + "step": 32 + }, + { + "epoch": 0.005876068376068376, + "grad_norm": 0.50941002368927, + "learning_rate": 0.00019999983443984878, + "loss": 1.2002, + "step": 33 + }, + { + "epoch": 0.006054131054131054, + "grad_norm": 0.30082932114601135, + "learning_rate": 0.00019999980798941888, + "loss": 0.9904, + "step": 34 + }, + { + "epoch": 0.006232193732193732, + "grad_norm": 0.4228935241699219, + "learning_rate": 0.00019999977957970048, + "loss": 1.1137, + "step": 35 + }, + { + "epoch": 0.00641025641025641, + "grad_norm": 0.41294750571250916, + "learning_rate": 0.0001999997492106941, + "loss": 1.3385, + "step": 36 + }, + { + "epoch": 0.006588319088319089, + "grad_norm": 0.4415493905544281, + "learning_rate": 0.00019999971688240041, + "loss": 1.1695, + "step": 37 + }, + { + "epoch": 0.006766381766381766, + "grad_norm": 0.3726460933685303, + "learning_rate": 0.00019999968259482, + "loss": 1.1734, + "step": 38 + }, + { + "epoch": 
0.006944444444444444, + "grad_norm": 0.3969627320766449, + "learning_rate": 0.0001999996463479535, + "loss": 1.1209, + "step": 39 + }, + { + "epoch": 0.007122507122507123, + "grad_norm": 0.3779667913913727, + "learning_rate": 0.0001999996081418017, + "loss": 1.1635, + "step": 40 + }, + { + "epoch": 0.0073005698005698, + "grad_norm": 0.3933636546134949, + "learning_rate": 0.0001999995679763653, + "loss": 1.1514, + "step": 41 + }, + { + "epoch": 0.007478632478632479, + "grad_norm": 0.3567957282066345, + "learning_rate": 0.00019999952585164507, + "loss": 1.2488, + "step": 42 + }, + { + "epoch": 0.007656695156695157, + "grad_norm": 0.32506081461906433, + "learning_rate": 0.00019999948176764186, + "loss": 1.149, + "step": 43 + }, + { + "epoch": 0.007834757834757835, + "grad_norm": 0.46588361263275146, + "learning_rate": 0.0001999994357243566, + "loss": 1.4263, + "step": 44 + }, + { + "epoch": 0.008012820512820512, + "grad_norm": 0.5070307850837708, + "learning_rate": 0.00019999938772179005, + "loss": 1.0698, + "step": 45 + }, + { + "epoch": 0.00819088319088319, + "grad_norm": 0.38199326395988464, + "learning_rate": 0.00019999933775994327, + "loss": 0.9907, + "step": 46 + }, + { + "epoch": 0.00836894586894587, + "grad_norm": 0.43684661388397217, + "learning_rate": 0.0001999992858388172, + "loss": 1.2905, + "step": 47 + }, + { + "epoch": 0.008547008547008548, + "grad_norm": 0.44482162594795227, + "learning_rate": 0.00019999923195841284, + "loss": 1.2153, + "step": 48 + }, + { + "epoch": 0.008725071225071225, + "grad_norm": 0.4259667694568634, + "learning_rate": 0.0001999991761187313, + "loss": 1.1582, + "step": 49 + }, + { + "epoch": 0.008903133903133903, + "grad_norm": 0.41649091243743896, + "learning_rate": 0.00019999911831977357, + "loss": 1.0185, + "step": 50 + }, + { + "epoch": 0.009081196581196582, + "grad_norm": 0.4179716110229492, + "learning_rate": 0.0001999990585615409, + "loss": 1.3579, + "step": 51 + }, + { + "epoch": 0.009259259259259259, + "grad_norm": 
0.3372558355331421, + "learning_rate": 0.00019999899684403438, + "loss": 1.0638, + "step": 52 + }, + { + "epoch": 0.009437321937321937, + "grad_norm": 0.41294020414352417, + "learning_rate": 0.00019999893316725525, + "loss": 1.1932, + "step": 53 + }, + { + "epoch": 0.009615384615384616, + "grad_norm": 0.4407919645309448, + "learning_rate": 0.00019999886753120473, + "loss": 1.4129, + "step": 54 + }, + { + "epoch": 0.009793447293447293, + "grad_norm": 0.47948843240737915, + "learning_rate": 0.00019999879993588414, + "loss": 1.2424, + "step": 55 + }, + { + "epoch": 0.009971509971509971, + "grad_norm": 0.3535355031490326, + "learning_rate": 0.00019999873038129484, + "loss": 1.0145, + "step": 56 + }, + { + "epoch": 0.01014957264957265, + "grad_norm": 0.5067078471183777, + "learning_rate": 0.00019999865886743813, + "loss": 1.4708, + "step": 57 + }, + { + "epoch": 0.010327635327635327, + "grad_norm": 0.42862898111343384, + "learning_rate": 0.0001999985853943154, + "loss": 1.0399, + "step": 58 + }, + { + "epoch": 0.010505698005698005, + "grad_norm": 0.4769059419631958, + "learning_rate": 0.00019999850996192816, + "loss": 1.1258, + "step": 59 + }, + { + "epoch": 0.010683760683760684, + "grad_norm": 0.4065442383289337, + "learning_rate": 0.0001999984325702778, + "loss": 1.2077, + "step": 60 + }, + { + "epoch": 0.010861823361823363, + "grad_norm": 0.5318329930305481, + "learning_rate": 0.0001999983532193659, + "loss": 1.2298, + "step": 61 + }, + { + "epoch": 0.01103988603988604, + "grad_norm": 0.4777173101902008, + "learning_rate": 0.000199998271909194, + "loss": 1.3195, + "step": 62 + }, + { + "epoch": 0.011217948717948718, + "grad_norm": 0.37553808093070984, + "learning_rate": 0.0001999981886397637, + "loss": 1.1188, + "step": 63 + }, + { + "epoch": 0.011396011396011397, + "grad_norm": 0.3920556902885437, + "learning_rate": 0.0001999981034110766, + "loss": 1.1448, + "step": 64 + }, + { + "epoch": 0.011574074074074073, + "grad_norm": 0.454272598028183, + "learning_rate": 
0.0001999980162231344, + "loss": 1.0812, + "step": 65 + }, + { + "epoch": 0.011752136752136752, + "grad_norm": 0.4354456663131714, + "learning_rate": 0.00019999792707593882, + "loss": 1.1174, + "step": 66 + }, + { + "epoch": 0.01193019943019943, + "grad_norm": 0.5030252933502197, + "learning_rate": 0.00019999783596949156, + "loss": 1.2925, + "step": 67 + }, + { + "epoch": 0.012108262108262107, + "grad_norm": 0.5141571164131165, + "learning_rate": 0.00019999774290379446, + "loss": 1.6193, + "step": 68 + }, + { + "epoch": 0.012286324786324786, + "grad_norm": 0.417298287153244, + "learning_rate": 0.0001999976478788493, + "loss": 1.1875, + "step": 69 + }, + { + "epoch": 0.012464387464387465, + "grad_norm": 0.4642415940761566, + "learning_rate": 0.00019999755089465795, + "loss": 1.4138, + "step": 70 + }, + { + "epoch": 0.012642450142450143, + "grad_norm": 0.43184754252433777, + "learning_rate": 0.0001999974519512223, + "loss": 1.0697, + "step": 71 + }, + { + "epoch": 0.01282051282051282, + "grad_norm": 0.46698349714279175, + "learning_rate": 0.00019999735104854436, + "loss": 0.709, + "step": 72 + }, + { + "epoch": 0.012998575498575499, + "grad_norm": 0.37253814935684204, + "learning_rate": 0.000199997248186626, + "loss": 1.2084, + "step": 73 + }, + { + "epoch": 0.013176638176638177, + "grad_norm": 0.3851388692855835, + "learning_rate": 0.0001999971433654693, + "loss": 1.0548, + "step": 74 + }, + { + "epoch": 0.013354700854700854, + "grad_norm": 0.4434688985347748, + "learning_rate": 0.00019999703658507635, + "loss": 1.4084, + "step": 75 + }, + { + "epoch": 0.013532763532763533, + "grad_norm": 0.43164482712745667, + "learning_rate": 0.00019999692784544913, + "loss": 1.4872, + "step": 76 + }, + { + "epoch": 0.013710826210826211, + "grad_norm": 0.4224303364753723, + "learning_rate": 0.00019999681714658984, + "loss": 1.2221, + "step": 77 + }, + { + "epoch": 0.013888888888888888, + "grad_norm": 0.35588955879211426, + "learning_rate": 0.00019999670448850069, + "loss": 0.84, + 
"step": 78 + }, + { + "epoch": 0.014066951566951567, + "grad_norm": 0.3970590829849243, + "learning_rate": 0.0001999965898711838, + "loss": 1.1886, + "step": 79 + }, + { + "epoch": 0.014245014245014245, + "grad_norm": 0.4331924319267273, + "learning_rate": 0.00019999647329464146, + "loss": 1.179, + "step": 80 + }, + { + "epoch": 0.014423076923076924, + "grad_norm": 0.4226946234703064, + "learning_rate": 0.00019999635475887598, + "loss": 1.1496, + "step": 81 + }, + { + "epoch": 0.0146011396011396, + "grad_norm": 0.381592720746994, + "learning_rate": 0.00019999623426388962, + "loss": 1.1774, + "step": 82 + }, + { + "epoch": 0.01477920227920228, + "grad_norm": 0.4190855622291565, + "learning_rate": 0.00019999611180968478, + "loss": 1.1491, + "step": 83 + }, + { + "epoch": 0.014957264957264958, + "grad_norm": 0.3904292583465576, + "learning_rate": 0.00019999598739626389, + "loss": 1.1275, + "step": 84 + }, + { + "epoch": 0.015135327635327635, + "grad_norm": 0.4515478014945984, + "learning_rate": 0.0001999958610236293, + "loss": 1.2404, + "step": 85 + }, + { + "epoch": 0.015313390313390313, + "grad_norm": 0.48341724276542664, + "learning_rate": 0.00019999573269178359, + "loss": 1.3572, + "step": 86 + }, + { + "epoch": 0.015491452991452992, + "grad_norm": 0.42150333523750305, + "learning_rate": 0.00019999560240072914, + "loss": 1.0203, + "step": 87 + }, + { + "epoch": 0.01566951566951567, + "grad_norm": 0.45445525646209717, + "learning_rate": 0.00019999547015046867, + "loss": 1.0677, + "step": 88 + }, + { + "epoch": 0.01584757834757835, + "grad_norm": 0.3581015467643738, + "learning_rate": 0.00019999533594100463, + "loss": 1.0693, + "step": 89 + }, + { + "epoch": 0.016025641025641024, + "grad_norm": 0.4430878758430481, + "learning_rate": 0.00019999519977233971, + "loss": 1.1591, + "step": 90 + }, + { + "epoch": 0.016203703703703703, + "grad_norm": 0.3940352201461792, + "learning_rate": 0.0001999950616444766, + "loss": 1.1325, + "step": 91 + }, + { + "epoch": 
0.01638176638176638, + "grad_norm": 0.4521673321723938, + "learning_rate": 0.00019999492155741794, + "loss": 1.3288, + "step": 92 + }, + { + "epoch": 0.01655982905982906, + "grad_norm": 0.3988296687602997, + "learning_rate": 0.00019999477951116658, + "loss": 1.0023, + "step": 93 + }, + { + "epoch": 0.01673789173789174, + "grad_norm": 0.38709723949432373, + "learning_rate": 0.00019999463550572516, + "loss": 1.2623, + "step": 94 + }, + { + "epoch": 0.016915954415954417, + "grad_norm": 0.35376182198524475, + "learning_rate": 0.00019999448954109662, + "loss": 1.0643, + "step": 95 + }, + { + "epoch": 0.017094017094017096, + "grad_norm": 0.49547120928764343, + "learning_rate": 0.00019999434161728377, + "loss": 1.2121, + "step": 96 + }, + { + "epoch": 0.01727207977207977, + "grad_norm": 0.49593672156333923, + "learning_rate": 0.00019999419173428952, + "loss": 1.1635, + "step": 97 + }, + { + "epoch": 0.01745014245014245, + "grad_norm": 0.4146541953086853, + "learning_rate": 0.0001999940398921168, + "loss": 1.1452, + "step": 98 + }, + { + "epoch": 0.017628205128205128, + "grad_norm": 0.5177254676818848, + "learning_rate": 0.00019999388609076858, + "loss": 1.2178, + "step": 99 + }, + { + "epoch": 0.017806267806267807, + "grad_norm": 0.4012768864631653, + "learning_rate": 0.0001999937303302479, + "loss": 0.9222, + "step": 100 + }, + { + "epoch": 0.017984330484330485, + "grad_norm": 0.4597131907939911, + "learning_rate": 0.00019999357261055777, + "loss": 0.979, + "step": 101 + }, + { + "epoch": 0.018162393162393164, + "grad_norm": 0.6190966963768005, + "learning_rate": 0.00019999341293170132, + "loss": 1.3909, + "step": 102 + }, + { + "epoch": 0.01834045584045584, + "grad_norm": 0.4576462209224701, + "learning_rate": 0.00019999325129368164, + "loss": 1.073, + "step": 103 + }, + { + "epoch": 0.018518518518518517, + "grad_norm": 0.4036749005317688, + "learning_rate": 0.00019999308769650192, + "loss": 1.1354, + "step": 104 + }, + { + "epoch": 0.018696581196581196, + "grad_norm": 
0.4722452759742737, + "learning_rate": 0.00019999292214016538, + "loss": 1.2039, + "step": 105 + }, + { + "epoch": 0.018874643874643875, + "grad_norm": 0.5338274240493774, + "learning_rate": 0.00019999275462467527, + "loss": 1.225, + "step": 106 + }, + { + "epoch": 0.019052706552706553, + "grad_norm": 0.4301491677761078, + "learning_rate": 0.00019999258515003484, + "loss": 1.0601, + "step": 107 + }, + { + "epoch": 0.019230769230769232, + "grad_norm": 0.33271175622940063, + "learning_rate": 0.0001999924137162474, + "loss": 0.8441, + "step": 108 + }, + { + "epoch": 0.01940883190883191, + "grad_norm": 0.4648784399032593, + "learning_rate": 0.0001999922403233163, + "loss": 1.2038, + "step": 109 + }, + { + "epoch": 0.019586894586894586, + "grad_norm": 0.37915176153182983, + "learning_rate": 0.00019999206497124504, + "loss": 1.0923, + "step": 110 + }, + { + "epoch": 0.019764957264957264, + "grad_norm": 0.3865506052970886, + "learning_rate": 0.00019999188766003695, + "loss": 0.9535, + "step": 111 + }, + { + "epoch": 0.019943019943019943, + "grad_norm": 0.35739636421203613, + "learning_rate": 0.0001999917083896955, + "loss": 1.2688, + "step": 112 + }, + { + "epoch": 0.02012108262108262, + "grad_norm": 0.3943796157836914, + "learning_rate": 0.0001999915271602243, + "loss": 1.1097, + "step": 113 + }, + { + "epoch": 0.0202991452991453, + "grad_norm": 0.44758161902427673, + "learning_rate": 0.0001999913439716268, + "loss": 1.2698, + "step": 114 + }, + { + "epoch": 0.02047720797720798, + "grad_norm": 0.3749747574329376, + "learning_rate": 0.00019999115882390664, + "loss": 1.1091, + "step": 115 + }, + { + "epoch": 0.020655270655270654, + "grad_norm": 0.3479487895965576, + "learning_rate": 0.00019999097171706745, + "loss": 1.0049, + "step": 116 + }, + { + "epoch": 0.020833333333333332, + "grad_norm": 0.4491243064403534, + "learning_rate": 0.00019999078265111285, + "loss": 1.1857, + "step": 117 + }, + { + "epoch": 0.02101139601139601, + "grad_norm": 0.345289021730423, + 
"learning_rate": 0.00019999059162604662, + "loss": 1.1397, + "step": 118 + }, + { + "epoch": 0.02118945868945869, + "grad_norm": 0.5467649698257446, + "learning_rate": 0.00019999039864187243, + "loss": 1.2196, + "step": 119 + }, + { + "epoch": 0.021367521367521368, + "grad_norm": 0.36446481943130493, + "learning_rate": 0.00019999020369859409, + "loss": 0.796, + "step": 120 + }, + { + "epoch": 0.021545584045584047, + "grad_norm": 0.4225841760635376, + "learning_rate": 0.00019999000679621543, + "loss": 0.9684, + "step": 121 + }, + { + "epoch": 0.021723646723646725, + "grad_norm": 0.4205594062805176, + "learning_rate": 0.0001999898079347403, + "loss": 1.2762, + "step": 122 + }, + { + "epoch": 0.0219017094017094, + "grad_norm": 0.43773892521858215, + "learning_rate": 0.00019998960711417257, + "loss": 1.117, + "step": 123 + }, + { + "epoch": 0.02207977207977208, + "grad_norm": 0.41279685497283936, + "learning_rate": 0.00019998940433451623, + "loss": 1.1502, + "step": 124 + }, + { + "epoch": 0.022257834757834757, + "grad_norm": 0.4090803563594818, + "learning_rate": 0.0001999891995957752, + "loss": 1.2591, + "step": 125 + }, + { + "epoch": 0.022435897435897436, + "grad_norm": 0.6000410914421082, + "learning_rate": 0.0001999889928979535, + "loss": 1.4321, + "step": 126 + }, + { + "epoch": 0.022613960113960115, + "grad_norm": 0.524264395236969, + "learning_rate": 0.00019998878424105524, + "loss": 1.1849, + "step": 127 + }, + { + "epoch": 0.022792022792022793, + "grad_norm": 0.4581047296524048, + "learning_rate": 0.00019998857362508443, + "loss": 1.0598, + "step": 128 + }, + { + "epoch": 0.022970085470085472, + "grad_norm": 0.42663446068763733, + "learning_rate": 0.00019998836105004526, + "loss": 1.1909, + "step": 129 + }, + { + "epoch": 0.023148148148148147, + "grad_norm": 0.45709118247032166, + "learning_rate": 0.00019998814651594183, + "loss": 1.2104, + "step": 130 + }, + { + "epoch": 0.023326210826210825, + "grad_norm": 0.39528369903564453, + "learning_rate": 
0.0001999879300227784, + "loss": 1.3073, + "step": 131 + }, + { + "epoch": 0.023504273504273504, + "grad_norm": 0.46896448731422424, + "learning_rate": 0.00019998771157055914, + "loss": 1.3202, + "step": 132 + }, + { + "epoch": 0.023682336182336183, + "grad_norm": 0.4386129677295685, + "learning_rate": 0.00019998749115928842, + "loss": 1.2196, + "step": 133 + }, + { + "epoch": 0.02386039886039886, + "grad_norm": 0.45920488238334656, + "learning_rate": 0.00019998726878897051, + "loss": 1.3668, + "step": 134 + }, + { + "epoch": 0.02403846153846154, + "grad_norm": 0.4115797281265259, + "learning_rate": 0.0001999870444596098, + "loss": 1.1052, + "step": 135 + }, + { + "epoch": 0.024216524216524215, + "grad_norm": 0.3860839903354645, + "learning_rate": 0.0001999868181712106, + "loss": 1.0344, + "step": 136 + }, + { + "epoch": 0.024394586894586893, + "grad_norm": 0.42514732480049133, + "learning_rate": 0.00019998658992377742, + "loss": 1.1979, + "step": 137 + }, + { + "epoch": 0.024572649572649572, + "grad_norm": 0.36001840233802795, + "learning_rate": 0.00019998635971731475, + "loss": 1.4536, + "step": 138 + }, + { + "epoch": 0.02475071225071225, + "grad_norm": 0.3739112317562103, + "learning_rate": 0.00019998612755182707, + "loss": 1.0097, + "step": 139 + }, + { + "epoch": 0.02492877492877493, + "grad_norm": 0.37545472383499146, + "learning_rate": 0.00019998589342731888, + "loss": 0.829, + "step": 140 + }, + { + "epoch": 0.025106837606837608, + "grad_norm": 0.38660728931427, + "learning_rate": 0.0001999856573437948, + "loss": 1.1324, + "step": 141 + }, + { + "epoch": 0.025284900284900286, + "grad_norm": 0.3741356432437897, + "learning_rate": 0.00019998541930125953, + "loss": 1.0934, + "step": 142 + }, + { + "epoch": 0.02546296296296296, + "grad_norm": 0.41900336742401123, + "learning_rate": 0.00019998517929971764, + "loss": 1.0336, + "step": 143 + }, + { + "epoch": 0.02564102564102564, + "grad_norm": 0.4167572259902954, + "learning_rate": 0.00019998493733917384, + 
"loss": 1.2571, + "step": 144 + }, + { + "epoch": 0.02581908831908832, + "grad_norm": 0.39437636733055115, + "learning_rate": 0.0001999846934196329, + "loss": 1.2283, + "step": 145 + }, + { + "epoch": 0.025997150997150997, + "grad_norm": 0.39129480719566345, + "learning_rate": 0.00019998444754109964, + "loss": 0.9893, + "step": 146 + }, + { + "epoch": 0.026175213675213676, + "grad_norm": 0.45533549785614014, + "learning_rate": 0.0001999841997035788, + "loss": 1.0793, + "step": 147 + }, + { + "epoch": 0.026353276353276354, + "grad_norm": 0.3741768002510071, + "learning_rate": 0.00019998394990707524, + "loss": 1.2179, + "step": 148 + }, + { + "epoch": 0.026531339031339033, + "grad_norm": 0.4066533148288727, + "learning_rate": 0.0001999836981515939, + "loss": 1.1443, + "step": 149 + }, + { + "epoch": 0.026709401709401708, + "grad_norm": 0.4851688742637634, + "learning_rate": 0.0001999834444371397, + "loss": 1.1668, + "step": 150 + }, + { + "epoch": 0.026887464387464387, + "grad_norm": 0.428091436624527, + "learning_rate": 0.0001999831887637176, + "loss": 1.2676, + "step": 151 + }, + { + "epoch": 0.027065527065527065, + "grad_norm": 0.4024655222892761, + "learning_rate": 0.0001999829311313326, + "loss": 1.3115, + "step": 152 + }, + { + "epoch": 0.027243589743589744, + "grad_norm": 0.43983033299446106, + "learning_rate": 0.00019998267153998976, + "loss": 1.1019, + "step": 153 + }, + { + "epoch": 0.027421652421652423, + "grad_norm": 0.4317505359649658, + "learning_rate": 0.0001999824099896942, + "loss": 1.3129, + "step": 154 + }, + { + "epoch": 0.0275997150997151, + "grad_norm": 0.43107882142066956, + "learning_rate": 0.000199982146480451, + "loss": 1.2134, + "step": 155 + }, + { + "epoch": 0.027777777777777776, + "grad_norm": 0.3939448297023773, + "learning_rate": 0.00019998188101226532, + "loss": 1.0321, + "step": 156 + }, + { + "epoch": 0.027955840455840455, + "grad_norm": 0.4641847610473633, + "learning_rate": 0.00019998161358514237, + "loss": 1.2369, + "step": 157 + 
}, + { + "epoch": 0.028133903133903133, + "grad_norm": 0.3538529872894287, + "learning_rate": 0.0001999813441990874, + "loss": 1.2061, + "step": 158 + }, + { + "epoch": 0.028311965811965812, + "grad_norm": 0.3277950584888458, + "learning_rate": 0.0001999810728541057, + "loss": 0.9419, + "step": 159 + }, + { + "epoch": 0.02849002849002849, + "grad_norm": 0.424710750579834, + "learning_rate": 0.00019998079955020254, + "loss": 1.3302, + "step": 160 + }, + { + "epoch": 0.02866809116809117, + "grad_norm": 0.4120834469795227, + "learning_rate": 0.00019998052428738333, + "loss": 1.079, + "step": 161 + }, + { + "epoch": 0.028846153846153848, + "grad_norm": 0.45811930298805237, + "learning_rate": 0.00019998024706565346, + "loss": 1.1259, + "step": 162 + }, + { + "epoch": 0.029024216524216523, + "grad_norm": 0.3873266875743866, + "learning_rate": 0.0001999799678850183, + "loss": 1.2124, + "step": 163 + }, + { + "epoch": 0.0292022792022792, + "grad_norm": 0.5806412696838379, + "learning_rate": 0.00019997968674548337, + "loss": 1.3467, + "step": 164 + }, + { + "epoch": 0.02938034188034188, + "grad_norm": 0.3906802833080292, + "learning_rate": 0.00019997940364705418, + "loss": 1.1438, + "step": 165 + }, + { + "epoch": 0.02955840455840456, + "grad_norm": 0.45201995968818665, + "learning_rate": 0.00019997911858973626, + "loss": 1.1469, + "step": 166 + }, + { + "epoch": 0.029736467236467237, + "grad_norm": 0.4965892732143402, + "learning_rate": 0.0001999788315735352, + "loss": 1.0829, + "step": 167 + }, + { + "epoch": 0.029914529914529916, + "grad_norm": 0.32578057050704956, + "learning_rate": 0.0001999785425984566, + "loss": 1.0432, + "step": 168 + }, + { + "epoch": 0.03009259259259259, + "grad_norm": 0.4146028161048889, + "learning_rate": 0.00019997825166450617, + "loss": 1.1657, + "step": 169 + }, + { + "epoch": 0.03027065527065527, + "grad_norm": 0.4342964291572571, + "learning_rate": 0.0001999779587716896, + "loss": 1.2038, + "step": 170 + }, + { + "epoch": 
0.030448717948717948, + "grad_norm": 0.40128546953201294, + "learning_rate": 0.00019997766392001258, + "loss": 1.3044, + "step": 171 + }, + { + "epoch": 0.030626780626780627, + "grad_norm": 0.4357539117336273, + "learning_rate": 0.00019997736710948094, + "loss": 1.2143, + "step": 172 + }, + { + "epoch": 0.030804843304843305, + "grad_norm": 0.4821035861968994, + "learning_rate": 0.00019997706834010045, + "loss": 1.0469, + "step": 173 + }, + { + "epoch": 0.030982905982905984, + "grad_norm": 0.3966675102710724, + "learning_rate": 0.000199976767611877, + "loss": 1.2122, + "step": 174 + }, + { + "epoch": 0.031160968660968662, + "grad_norm": 0.4265064299106598, + "learning_rate": 0.00019997646492481648, + "loss": 1.0871, + "step": 175 + }, + { + "epoch": 0.03133903133903134, + "grad_norm": 0.3445652723312378, + "learning_rate": 0.00019997616027892485, + "loss": 1.0412, + "step": 176 + }, + { + "epoch": 0.031517094017094016, + "grad_norm": 0.47187718749046326, + "learning_rate": 0.000199975853674208, + "loss": 1.0822, + "step": 177 + }, + { + "epoch": 0.0316951566951567, + "grad_norm": 0.37751707434654236, + "learning_rate": 0.000199975545110672, + "loss": 1.1439, + "step": 178 + }, + { + "epoch": 0.03187321937321937, + "grad_norm": 0.38792455196380615, + "learning_rate": 0.00019997523458832286, + "loss": 0.8604, + "step": 179 + }, + { + "epoch": 0.03205128205128205, + "grad_norm": 0.35199594497680664, + "learning_rate": 0.00019997492210716667, + "loss": 1.0819, + "step": 180 + }, + { + "epoch": 0.03222934472934473, + "grad_norm": 0.4828922748565674, + "learning_rate": 0.00019997460766720958, + "loss": 1.1879, + "step": 181 + }, + { + "epoch": 0.032407407407407406, + "grad_norm": 0.46153363585472107, + "learning_rate": 0.00019997429126845774, + "loss": 1.1592, + "step": 182 + }, + { + "epoch": 0.03258547008547009, + "grad_norm": 0.4844890832901001, + "learning_rate": 0.0001999739729109173, + "loss": 1.1334, + "step": 183 + }, + { + "epoch": 0.03276353276353276, + 
"grad_norm": 0.414617121219635, + "learning_rate": 0.00019997365259459457, + "loss": 1.0547, + "step": 184 + }, + { + "epoch": 0.032941595441595445, + "grad_norm": 0.46544626355171204, + "learning_rate": 0.00019997333031949581, + "loss": 1.4067, + "step": 185 + }, + { + "epoch": 0.03311965811965812, + "grad_norm": 0.48489415645599365, + "learning_rate": 0.0001999730060856273, + "loss": 1.4027, + "step": 186 + }, + { + "epoch": 0.033297720797720795, + "grad_norm": 0.3963346481323242, + "learning_rate": 0.0001999726798929954, + "loss": 1.1327, + "step": 187 + }, + { + "epoch": 0.03347578347578348, + "grad_norm": 0.3809385895729065, + "learning_rate": 0.00019997235174160652, + "loss": 1.3475, + "step": 188 + }, + { + "epoch": 0.03365384615384615, + "grad_norm": 0.3866960406303406, + "learning_rate": 0.0001999720216314671, + "loss": 1.1576, + "step": 189 + }, + { + "epoch": 0.033831908831908834, + "grad_norm": 0.34976935386657715, + "learning_rate": 0.00019997168956258356, + "loss": 0.9361, + "step": 190 + }, + { + "epoch": 0.03400997150997151, + "grad_norm": 0.38681939244270325, + "learning_rate": 0.00019997135553496243, + "loss": 1.1796, + "step": 191 + }, + { + "epoch": 0.03418803418803419, + "grad_norm": 0.41905197501182556, + "learning_rate": 0.0001999710195486103, + "loss": 1.1714, + "step": 192 + }, + { + "epoch": 0.03436609686609687, + "grad_norm": 0.42356589436531067, + "learning_rate": 0.0001999706816035337, + "loss": 1.0022, + "step": 193 + }, + { + "epoch": 0.03454415954415954, + "grad_norm": 0.3929740786552429, + "learning_rate": 0.00019997034169973925, + "loss": 1.3769, + "step": 194 + }, + { + "epoch": 0.034722222222222224, + "grad_norm": 0.4325186312198639, + "learning_rate": 0.00019996999983723366, + "loss": 1.3057, + "step": 195 + }, + { + "epoch": 0.0349002849002849, + "grad_norm": 0.3954029381275177, + "learning_rate": 0.00019996965601602355, + "loss": 1.1958, + "step": 196 + }, + { + "epoch": 0.03507834757834758, + "grad_norm": 0.34454262256622314, 
+ "learning_rate": 0.00019996931023611572, + "loss": 1.0972, + "step": 197 + }, + { + "epoch": 0.035256410256410256, + "grad_norm": 0.48900291323661804, + "learning_rate": 0.0001999689624975169, + "loss": 1.213, + "step": 198 + }, + { + "epoch": 0.03543447293447293, + "grad_norm": 0.35214388370513916, + "learning_rate": 0.00019996861280023397, + "loss": 1.0285, + "step": 199 + }, + { + "epoch": 0.03561253561253561, + "grad_norm": 0.49393126368522644, + "learning_rate": 0.00019996826114427373, + "loss": 1.2313, + "step": 200 + }, + { + "epoch": 0.03579059829059829, + "grad_norm": 0.3994458019733429, + "learning_rate": 0.00019996790752964305, + "loss": 1.0474, + "step": 201 + }, + { + "epoch": 0.03596866096866097, + "grad_norm": 0.5387318730354309, + "learning_rate": 0.0001999675519563489, + "loss": 1.3067, + "step": 202 + }, + { + "epoch": 0.036146723646723646, + "grad_norm": 0.4976751208305359, + "learning_rate": 0.00019996719442439824, + "loss": 1.2593, + "step": 203 + }, + { + "epoch": 0.03632478632478633, + "grad_norm": 0.47052907943725586, + "learning_rate": 0.0001999668349337981, + "loss": 1.1036, + "step": 204 + }, + { + "epoch": 0.036502849002849, + "grad_norm": 0.39616644382476807, + "learning_rate": 0.00019996647348455543, + "loss": 1.0481, + "step": 205 + }, + { + "epoch": 0.03668091168091168, + "grad_norm": 0.42987677454948425, + "learning_rate": 0.00019996611007667742, + "loss": 1.0923, + "step": 206 + }, + { + "epoch": 0.03685897435897436, + "grad_norm": 0.47065848112106323, + "learning_rate": 0.00019996574471017113, + "loss": 1.1403, + "step": 207 + }, + { + "epoch": 0.037037037037037035, + "grad_norm": 0.4363015592098236, + "learning_rate": 0.00019996537738504373, + "loss": 1.253, + "step": 208 + }, + { + "epoch": 0.03721509971509972, + "grad_norm": 0.4038296937942505, + "learning_rate": 0.00019996500810130243, + "loss": 1.1679, + "step": 209 + }, + { + "epoch": 0.03739316239316239, + "grad_norm": 0.5038532018661499, + "learning_rate": 
0.00019996463685895445, + "loss": 1.1182, + "step": 210 + }, + { + "epoch": 0.037571225071225074, + "grad_norm": 0.37740692496299744, + "learning_rate": 0.00019996426365800706, + "loss": 1.0465, + "step": 211 + }, + { + "epoch": 0.03774928774928775, + "grad_norm": 0.47794604301452637, + "learning_rate": 0.00019996388849846759, + "loss": 1.2836, + "step": 212 + }, + { + "epoch": 0.037927350427350424, + "grad_norm": 0.38460609316825867, + "learning_rate": 0.0001999635113803434, + "loss": 1.2099, + "step": 213 + }, + { + "epoch": 0.038105413105413107, + "grad_norm": 0.42016157507896423, + "learning_rate": 0.0001999631323036418, + "loss": 1.152, + "step": 214 + }, + { + "epoch": 0.03828347578347578, + "grad_norm": 0.4024946391582489, + "learning_rate": 0.00019996275126837033, + "loss": 1.1534, + "step": 215 + }, + { + "epoch": 0.038461538461538464, + "grad_norm": 0.4573793411254883, + "learning_rate": 0.00019996236827453642, + "loss": 1.2019, + "step": 216 + }, + { + "epoch": 0.03863960113960114, + "grad_norm": 0.3642503321170807, + "learning_rate": 0.0001999619833221475, + "loss": 1.0541, + "step": 217 + }, + { + "epoch": 0.03881766381766382, + "grad_norm": 0.38492897152900696, + "learning_rate": 0.0001999615964112112, + "loss": 1.1269, + "step": 218 + }, + { + "epoch": 0.038995726495726496, + "grad_norm": 0.427219420671463, + "learning_rate": 0.0001999612075417351, + "loss": 1.1126, + "step": 219 + }, + { + "epoch": 0.03917378917378917, + "grad_norm": 0.40781742334365845, + "learning_rate": 0.00019996081671372676, + "loss": 1.2207, + "step": 220 + }, + { + "epoch": 0.03935185185185185, + "grad_norm": 0.39229512214660645, + "learning_rate": 0.00019996042392719386, + "loss": 1.0403, + "step": 221 + }, + { + "epoch": 0.03952991452991453, + "grad_norm": 0.42038577795028687, + "learning_rate": 0.0001999600291821441, + "loss": 1.2157, + "step": 222 + }, + { + "epoch": 0.03970797720797721, + "grad_norm": 0.3963491916656494, + "learning_rate": 0.00019995963247858525, + 
"loss": 1.0532, + "step": 223 + }, + { + "epoch": 0.039886039886039885, + "grad_norm": 0.4389874041080475, + "learning_rate": 0.00019995923381652502, + "loss": 1.4279, + "step": 224 + }, + { + "epoch": 0.04006410256410257, + "grad_norm": 0.357312947511673, + "learning_rate": 0.00019995883319597123, + "loss": 0.9871, + "step": 225 + }, + { + "epoch": 0.04024216524216524, + "grad_norm": 0.3644427955150604, + "learning_rate": 0.00019995843061693181, + "loss": 1.0879, + "step": 226 + }, + { + "epoch": 0.04042022792022792, + "grad_norm": 0.4074651002883911, + "learning_rate": 0.00019995802607941453, + "loss": 1.2138, + "step": 227 + }, + { + "epoch": 0.0405982905982906, + "grad_norm": 0.40709465742111206, + "learning_rate": 0.0001999576195834274, + "loss": 1.1905, + "step": 228 + }, + { + "epoch": 0.040776353276353275, + "grad_norm": 0.4280182719230652, + "learning_rate": 0.00019995721112897838, + "loss": 1.2331, + "step": 229 + }, + { + "epoch": 0.04095441595441596, + "grad_norm": 0.37846076488494873, + "learning_rate": 0.00019995680071607544, + "loss": 1.078, + "step": 230 + }, + { + "epoch": 0.04113247863247863, + "grad_norm": 0.3877260088920593, + "learning_rate": 0.0001999563883447266, + "loss": 1.0309, + "step": 231 + }, + { + "epoch": 0.04131054131054131, + "grad_norm": 0.42886826395988464, + "learning_rate": 0.00019995597401494, + "loss": 1.0403, + "step": 232 + }, + { + "epoch": 0.04148860398860399, + "grad_norm": 0.4316534101963043, + "learning_rate": 0.00019995555772672372, + "loss": 1.2418, + "step": 233 + }, + { + "epoch": 0.041666666666666664, + "grad_norm": 0.45768865942955017, + "learning_rate": 0.00019995513948008593, + "loss": 1.233, + "step": 234 + }, + { + "epoch": 0.041844729344729346, + "grad_norm": 0.5647913813591003, + "learning_rate": 0.00019995471927503481, + "loss": 1.1346, + "step": 235 + }, + { + "epoch": 0.04202279202279202, + "grad_norm": 0.3797492980957031, + "learning_rate": 0.00019995429711157863, + "loss": 1.1574, + "step": 236 + }, + 
{ + "epoch": 0.042200854700854704, + "grad_norm": 0.4392767548561096, + "learning_rate": 0.00019995387298972562, + "loss": 0.8988, + "step": 237 + }, + { + "epoch": 0.04237891737891738, + "grad_norm": 0.37331557273864746, + "learning_rate": 0.0001999534469094841, + "loss": 1.0439, + "step": 238 + }, + { + "epoch": 0.042556980056980054, + "grad_norm": 0.3785935938358307, + "learning_rate": 0.00019995301887086245, + "loss": 0.9839, + "step": 239 + }, + { + "epoch": 0.042735042735042736, + "grad_norm": 0.4351862668991089, + "learning_rate": 0.00019995258887386898, + "loss": 1.2653, + "step": 240 + }, + { + "epoch": 0.04291310541310541, + "grad_norm": 0.399475634098053, + "learning_rate": 0.0001999521569185122, + "loss": 0.9877, + "step": 241 + }, + { + "epoch": 0.04309116809116809, + "grad_norm": 0.42332810163497925, + "learning_rate": 0.00019995172300480053, + "loss": 1.2403, + "step": 242 + }, + { + "epoch": 0.04326923076923077, + "grad_norm": 0.4397708475589752, + "learning_rate": 0.00019995128713274247, + "loss": 0.9316, + "step": 243 + }, + { + "epoch": 0.04344729344729345, + "grad_norm": 0.3614110052585602, + "learning_rate": 0.00019995084930234658, + "loss": 1.1088, + "step": 244 + }, + { + "epoch": 0.043625356125356125, + "grad_norm": 0.39433717727661133, + "learning_rate": 0.0001999504095136214, + "loss": 1.2002, + "step": 245 + }, + { + "epoch": 0.0438034188034188, + "grad_norm": 0.33088216185569763, + "learning_rate": 0.0001999499677665756, + "loss": 0.8796, + "step": 246 + }, + { + "epoch": 0.04398148148148148, + "grad_norm": 0.5239143967628479, + "learning_rate": 0.00019994952406121784, + "loss": 1.2808, + "step": 247 + }, + { + "epoch": 0.04415954415954416, + "grad_norm": 0.42156723141670227, + "learning_rate": 0.00019994907839755675, + "loss": 1.1775, + "step": 248 + }, + { + "epoch": 0.04433760683760684, + "grad_norm": 0.42569902539253235, + "learning_rate": 0.0001999486307756011, + "loss": 1.001, + "step": 249 + }, + { + "epoch": 0.044515669515669515, 
+ "grad_norm": 0.38241544365882874, + "learning_rate": 0.00019994818119535964, + "loss": 1.1064, + "step": 250 + }, + { + "epoch": 0.0446937321937322, + "grad_norm": 0.4185071885585785, + "learning_rate": 0.0001999477296568412, + "loss": 1.2109, + "step": 251 + }, + { + "epoch": 0.04487179487179487, + "grad_norm": 0.4189644157886505, + "learning_rate": 0.00019994727616005464, + "loss": 1.2902, + "step": 252 + }, + { + "epoch": 0.04504985754985755, + "grad_norm": 0.34671884775161743, + "learning_rate": 0.0001999468207050088, + "loss": 0.9429, + "step": 253 + }, + { + "epoch": 0.04522792022792023, + "grad_norm": 0.42391687631607056, + "learning_rate": 0.00019994636329171266, + "loss": 0.7179, + "step": 254 + }, + { + "epoch": 0.045405982905982904, + "grad_norm": 0.3803195655345917, + "learning_rate": 0.00019994590392017513, + "loss": 1.0318, + "step": 255 + }, + { + "epoch": 0.045584045584045586, + "grad_norm": 0.3389956057071686, + "learning_rate": 0.00019994544259040525, + "loss": 1.0485, + "step": 256 + }, + { + "epoch": 0.04576210826210826, + "grad_norm": 0.4927038550376892, + "learning_rate": 0.000199944979302412, + "loss": 1.3426, + "step": 257 + }, + { + "epoch": 0.045940170940170943, + "grad_norm": 0.33200421929359436, + "learning_rate": 0.00019994451405620453, + "loss": 1.0071, + "step": 258 + }, + { + "epoch": 0.04611823361823362, + "grad_norm": 0.38028615713119507, + "learning_rate": 0.00019994404685179195, + "loss": 1.0985, + "step": 259 + }, + { + "epoch": 0.046296296296296294, + "grad_norm": 0.3752151429653168, + "learning_rate": 0.00019994357768918333, + "loss": 0.9209, + "step": 260 + }, + { + "epoch": 0.046474358974358976, + "grad_norm": 0.43030866980552673, + "learning_rate": 0.00019994310656838796, + "loss": 0.9921, + "step": 261 + }, + { + "epoch": 0.04665242165242165, + "grad_norm": 0.4402460753917694, + "learning_rate": 0.00019994263348941502, + "loss": 1.1051, + "step": 262 + }, + { + "epoch": 0.04683048433048433, + "grad_norm": 
0.43012720346450806, + "learning_rate": 0.0001999421584522738, + "loss": 1.1839, + "step": 263 + }, + { + "epoch": 0.04700854700854701, + "grad_norm": 0.4195305407047272, + "learning_rate": 0.0001999416814569736, + "loss": 1.1749, + "step": 264 + }, + { + "epoch": 0.04718660968660968, + "grad_norm": 0.45623287558555603, + "learning_rate": 0.00019994120250352372, + "loss": 1.2433, + "step": 265 + }, + { + "epoch": 0.047364672364672365, + "grad_norm": 0.4736156761646271, + "learning_rate": 0.00019994072159193363, + "loss": 1.2882, + "step": 266 + }, + { + "epoch": 0.04754273504273504, + "grad_norm": 0.36698561906814575, + "learning_rate": 0.0001999402387222127, + "loss": 1.1486, + "step": 267 + }, + { + "epoch": 0.04772079772079772, + "grad_norm": 0.3854144215583801, + "learning_rate": 0.00019993975389437038, + "loss": 0.8115, + "step": 268 + }, + { + "epoch": 0.0478988603988604, + "grad_norm": 0.41512808203697205, + "learning_rate": 0.0001999392671084162, + "loss": 1.0959, + "step": 269 + }, + { + "epoch": 0.04807692307692308, + "grad_norm": 0.3869563341140747, + "learning_rate": 0.0001999387783643597, + "loss": 1.087, + "step": 270 + }, + { + "epoch": 0.048254985754985755, + "grad_norm": 0.4649744927883148, + "learning_rate": 0.00019993828766221044, + "loss": 1.0011, + "step": 271 + }, + { + "epoch": 0.04843304843304843, + "grad_norm": 0.40331923961639404, + "learning_rate": 0.00019993779500197803, + "loss": 1.1463, + "step": 272 + }, + { + "epoch": 0.04861111111111111, + "grad_norm": 0.3826279938220978, + "learning_rate": 0.0001999373003836721, + "loss": 1.1491, + "step": 273 + }, + { + "epoch": 0.04878917378917379, + "grad_norm": 0.3967166543006897, + "learning_rate": 0.00019993680380730243, + "loss": 1.1462, + "step": 274 + }, + { + "epoch": 0.04896723646723647, + "grad_norm": 0.4298507869243622, + "learning_rate": 0.00019993630527287865, + "loss": 1.2471, + "step": 275 + }, + { + "epoch": 0.049145299145299144, + "grad_norm": 0.41486215591430664, + 
"learning_rate": 0.0001999358047804106, + "loss": 1.287, + "step": 276 + }, + { + "epoch": 0.049323361823361826, + "grad_norm": 0.3914124369621277, + "learning_rate": 0.00019993530232990803, + "loss": 1.0935, + "step": 277 + }, + { + "epoch": 0.0495014245014245, + "grad_norm": 0.39888378977775574, + "learning_rate": 0.00019993479792138082, + "loss": 1.2347, + "step": 278 + }, + { + "epoch": 0.049679487179487176, + "grad_norm": 0.3911665678024292, + "learning_rate": 0.00019993429155483884, + "loss": 1.0917, + "step": 279 + }, + { + "epoch": 0.04985754985754986, + "grad_norm": 0.42871445417404175, + "learning_rate": 0.00019993378323029197, + "loss": 1.0277, + "step": 280 + }, + { + "epoch": 0.050035612535612534, + "grad_norm": 0.35397860407829285, + "learning_rate": 0.00019993327294775027, + "loss": 0.9549, + "step": 281 + }, + { + "epoch": 0.050213675213675216, + "grad_norm": 0.4528059959411621, + "learning_rate": 0.00019993276070722364, + "loss": 1.2338, + "step": 282 + }, + { + "epoch": 0.05039173789173789, + "grad_norm": 0.354735791683197, + "learning_rate": 0.00019993224650872218, + "loss": 1.1892, + "step": 283 + }, + { + "epoch": 0.05056980056980057, + "grad_norm": 0.44407567381858826, + "learning_rate": 0.00019993173035225592, + "loss": 1.1621, + "step": 284 + }, + { + "epoch": 0.05074786324786325, + "grad_norm": 0.4177244305610657, + "learning_rate": 0.000199931212237835, + "loss": 1.1184, + "step": 285 + }, + { + "epoch": 0.05092592592592592, + "grad_norm": 0.5627759695053101, + "learning_rate": 0.0001999306921654696, + "loss": 1.0755, + "step": 286 + }, + { + "epoch": 0.051103988603988605, + "grad_norm": 0.46767523884773254, + "learning_rate": 0.00019993017013516986, + "loss": 1.2654, + "step": 287 + }, + { + "epoch": 0.05128205128205128, + "grad_norm": 0.4163128733634949, + "learning_rate": 0.000199929646146946, + "loss": 1.1307, + "step": 288 + }, + { + "epoch": 0.05146011396011396, + "grad_norm": 0.36954161524772644, + "learning_rate": 
0.00019992912020080832, + "loss": 0.8274, + "step": 289 + }, + { + "epoch": 0.05163817663817664, + "grad_norm": 0.4770594835281372, + "learning_rate": 0.00019992859229676712, + "loss": 1.2235, + "step": 290 + }, + { + "epoch": 0.05181623931623932, + "grad_norm": 0.4174608290195465, + "learning_rate": 0.00019992806243483274, + "loss": 1.2893, + "step": 291 + }, + { + "epoch": 0.051994301994301995, + "grad_norm": 0.3794898986816406, + "learning_rate": 0.00019992753061501555, + "loss": 1.104, + "step": 292 + }, + { + "epoch": 0.05217236467236467, + "grad_norm": 0.3912592828273773, + "learning_rate": 0.000199926996837326, + "loss": 1.0043, + "step": 293 + }, + { + "epoch": 0.05235042735042735, + "grad_norm": 0.39641159772872925, + "learning_rate": 0.00019992646110177448, + "loss": 1.083, + "step": 294 + }, + { + "epoch": 0.05252849002849003, + "grad_norm": 0.3518857955932617, + "learning_rate": 0.00019992592340837157, + "loss": 0.9275, + "step": 295 + }, + { + "epoch": 0.05270655270655271, + "grad_norm": 0.3955721855163574, + "learning_rate": 0.00019992538375712777, + "loss": 1.0153, + "step": 296 + }, + { + "epoch": 0.052884615384615384, + "grad_norm": 0.3837333023548126, + "learning_rate": 0.00019992484214805364, + "loss": 1.1664, + "step": 297 + }, + { + "epoch": 0.053062678062678066, + "grad_norm": 0.39400920271873474, + "learning_rate": 0.0001999242985811598, + "loss": 1.0532, + "step": 298 + }, + { + "epoch": 0.05324074074074074, + "grad_norm": 0.39258649945259094, + "learning_rate": 0.00019992375305645692, + "loss": 1.0081, + "step": 299 + }, + { + "epoch": 0.053418803418803416, + "grad_norm": 0.49768248200416565, + "learning_rate": 0.00019992320557395566, + "loss": 1.2553, + "step": 300 + }, + { + "epoch": 0.0535968660968661, + "grad_norm": 0.364776074886322, + "learning_rate": 0.00019992265613366677, + "loss": 1.0582, + "step": 301 + }, + { + "epoch": 0.053774928774928774, + "grad_norm": 0.47317907214164734, + "learning_rate": 0.00019992210473560097, + "loss": 
1.3114, + "step": 302 + }, + { + "epoch": 0.053952991452991456, + "grad_norm": 0.3706119656562805, + "learning_rate": 0.00019992155137976917, + "loss": 0.9554, + "step": 303 + }, + { + "epoch": 0.05413105413105413, + "grad_norm": 0.42809563875198364, + "learning_rate": 0.0001999209960661821, + "loss": 1.306, + "step": 304 + }, + { + "epoch": 0.054309116809116806, + "grad_norm": 0.4514487385749817, + "learning_rate": 0.00019992043879485066, + "loss": 1.0147, + "step": 305 + }, + { + "epoch": 0.05448717948717949, + "grad_norm": 0.36672836542129517, + "learning_rate": 0.0001999198795657858, + "loss": 1.1392, + "step": 306 + }, + { + "epoch": 0.05466524216524216, + "grad_norm": 0.4206554889678955, + "learning_rate": 0.00019991931837899847, + "loss": 1.2405, + "step": 307 + }, + { + "epoch": 0.054843304843304845, + "grad_norm": 0.46168261766433716, + "learning_rate": 0.00019991875523449966, + "loss": 1.2707, + "step": 308 + }, + { + "epoch": 0.05502136752136752, + "grad_norm": 0.39503365755081177, + "learning_rate": 0.00019991819013230039, + "loss": 1.0776, + "step": 309 + }, + { + "epoch": 0.0551994301994302, + "grad_norm": 0.35244834423065186, + "learning_rate": 0.00019991762307241178, + "loss": 1.0864, + "step": 310 + }, + { + "epoch": 0.05537749287749288, + "grad_norm": 0.3865319490432739, + "learning_rate": 0.0001999170540548449, + "loss": 1.3659, + "step": 311 + }, + { + "epoch": 0.05555555555555555, + "grad_norm": 0.3666876554489136, + "learning_rate": 0.0001999164830796109, + "loss": 0.9884, + "step": 312 + }, + { + "epoch": 0.055733618233618235, + "grad_norm": 0.4278281629085541, + "learning_rate": 0.00019991591014672096, + "loss": 1.1522, + "step": 313 + }, + { + "epoch": 0.05591168091168091, + "grad_norm": 0.4172627031803131, + "learning_rate": 0.0001999153352561863, + "loss": 1.2527, + "step": 314 + }, + { + "epoch": 0.05608974358974359, + "grad_norm": 0.38872212171554565, + "learning_rate": 0.00019991475840801823, + "loss": 1.2985, + "step": 315 + }, + { + 
"epoch": 0.05626780626780627, + "grad_norm": 0.4160458445549011, + "learning_rate": 0.00019991417960222804, + "loss": 1.1347, + "step": 316 + }, + { + "epoch": 0.05644586894586895, + "grad_norm": 0.5169723033905029, + "learning_rate": 0.00019991359883882705, + "loss": 1.0819, + "step": 317 + }, + { + "epoch": 0.056623931623931624, + "grad_norm": 0.42306259274482727, + "learning_rate": 0.0001999130161178266, + "loss": 1.3139, + "step": 318 + }, + { + "epoch": 0.0568019943019943, + "grad_norm": 0.41975873708724976, + "learning_rate": 0.00019991243143923816, + "loss": 1.2277, + "step": 319 + }, + { + "epoch": 0.05698005698005698, + "grad_norm": 0.3873472511768341, + "learning_rate": 0.00019991184480307324, + "loss": 1.156, + "step": 320 + }, + { + "epoch": 0.057158119658119656, + "grad_norm": 0.43656104803085327, + "learning_rate": 0.0001999112562093432, + "loss": 1.2344, + "step": 321 + }, + { + "epoch": 0.05733618233618234, + "grad_norm": 0.3738791048526764, + "learning_rate": 0.00019991066565805968, + "loss": 0.9573, + "step": 322 + }, + { + "epoch": 0.05751424501424501, + "grad_norm": 0.3838156461715698, + "learning_rate": 0.00019991007314923418, + "loss": 0.9274, + "step": 323 + }, + { + "epoch": 0.057692307692307696, + "grad_norm": 0.4564770758152008, + "learning_rate": 0.00019990947868287837, + "loss": 1.0756, + "step": 324 + }, + { + "epoch": 0.05787037037037037, + "grad_norm": 0.4560079872608185, + "learning_rate": 0.00019990888225900386, + "loss": 1.1508, + "step": 325 + }, + { + "epoch": 0.058048433048433046, + "grad_norm": 0.44356057047843933, + "learning_rate": 0.00019990828387762236, + "loss": 1.2323, + "step": 326 + }, + { + "epoch": 0.05822649572649573, + "grad_norm": 0.46390119194984436, + "learning_rate": 0.00019990768353874553, + "loss": 1.0031, + "step": 327 + }, + { + "epoch": 0.0584045584045584, + "grad_norm": 0.4502357244491577, + "learning_rate": 0.00019990708124238525, + "loss": 1.3454, + "step": 328 + }, + { + "epoch": 0.058582621082621085, + 
"grad_norm": 0.3979945182800293, + "learning_rate": 0.0001999064769885532, + "loss": 1.2833, + "step": 329 + }, + { + "epoch": 0.05876068376068376, + "grad_norm": 0.3899286687374115, + "learning_rate": 0.00019990587077726128, + "loss": 1.0175, + "step": 330 + }, + { + "epoch": 0.05893874643874644, + "grad_norm": 0.41422948241233826, + "learning_rate": 0.00019990526260852139, + "loss": 1.1151, + "step": 331 + }, + { + "epoch": 0.05911680911680912, + "grad_norm": 0.4266608953475952, + "learning_rate": 0.0001999046524823454, + "loss": 1.1119, + "step": 332 + }, + { + "epoch": 0.05929487179487179, + "grad_norm": 0.46563324332237244, + "learning_rate": 0.00019990404039874524, + "loss": 1.2358, + "step": 333 + }, + { + "epoch": 0.059472934472934474, + "grad_norm": 0.4404347240924835, + "learning_rate": 0.00019990342635773297, + "loss": 1.1748, + "step": 334 + }, + { + "epoch": 0.05965099715099715, + "grad_norm": 0.5133237838745117, + "learning_rate": 0.00019990281035932062, + "loss": 1.1649, + "step": 335 + }, + { + "epoch": 0.05982905982905983, + "grad_norm": 0.3593895435333252, + "learning_rate": 0.00019990219240352018, + "loss": 1.0318, + "step": 336 + }, + { + "epoch": 0.06000712250712251, + "grad_norm": 0.40554583072662354, + "learning_rate": 0.00019990157249034384, + "loss": 1.1202, + "step": 337 + }, + { + "epoch": 0.06018518518518518, + "grad_norm": 0.3770706057548523, + "learning_rate": 0.00019990095061980372, + "loss": 0.9908, + "step": 338 + }, + { + "epoch": 0.060363247863247864, + "grad_norm": 0.39676955342292786, + "learning_rate": 0.000199900326791912, + "loss": 0.8176, + "step": 339 + }, + { + "epoch": 0.06054131054131054, + "grad_norm": 0.41448578238487244, + "learning_rate": 0.00019989970100668086, + "loss": 1.2877, + "step": 340 + }, + { + "epoch": 0.06071937321937322, + "grad_norm": 0.4200015068054199, + "learning_rate": 0.00019989907326412265, + "loss": 1.2293, + "step": 341 + }, + { + "epoch": 0.060897435897435896, + "grad_norm": 
0.47350621223449707, + "learning_rate": 0.0001998984435642496, + "loss": 1.2331, + "step": 342 + }, + { + "epoch": 0.06107549857549858, + "grad_norm": 0.47050634026527405, + "learning_rate": 0.00019989781190707406, + "loss": 0.8888, + "step": 343 + }, + { + "epoch": 0.06125356125356125, + "grad_norm": 0.4994896948337555, + "learning_rate": 0.00019989717829260842, + "loss": 1.0921, + "step": 344 + }, + { + "epoch": 0.06143162393162393, + "grad_norm": 0.36340200901031494, + "learning_rate": 0.0001998965427208651, + "loss": 0.9777, + "step": 345 + }, + { + "epoch": 0.06160968660968661, + "grad_norm": 0.3538152873516083, + "learning_rate": 0.00019989590519185654, + "loss": 1.0055, + "step": 346 + }, + { + "epoch": 0.061787749287749286, + "grad_norm": 0.5388944149017334, + "learning_rate": 0.00019989526570559526, + "loss": 1.1001, + "step": 347 + }, + { + "epoch": 0.06196581196581197, + "grad_norm": 0.4411574602127075, + "learning_rate": 0.00019989462426209373, + "loss": 1.0038, + "step": 348 + }, + { + "epoch": 0.06214387464387464, + "grad_norm": 0.3930876851081848, + "learning_rate": 0.00019989398086136455, + "loss": 1.1534, + "step": 349 + }, + { + "epoch": 0.062321937321937325, + "grad_norm": 0.47357070446014404, + "learning_rate": 0.00019989333550342033, + "loss": 1.2687, + "step": 350 + }, + { + "epoch": 0.0625, + "grad_norm": 0.40302303433418274, + "learning_rate": 0.00019989268818827372, + "loss": 1.1894, + "step": 351 + }, + { + "epoch": 0.06267806267806268, + "grad_norm": 0.4470510184764862, + "learning_rate": 0.00019989203891593738, + "loss": 1.2207, + "step": 352 + }, + { + "epoch": 0.06285612535612535, + "grad_norm": 0.42235100269317627, + "learning_rate": 0.00019989138768642406, + "loss": 1.2086, + "step": 353 + }, + { + "epoch": 0.06303418803418803, + "grad_norm": 0.38305309414863586, + "learning_rate": 0.0001998907344997465, + "loss": 1.0473, + "step": 354 + }, + { + "epoch": 0.06321225071225071, + "grad_norm": 0.3893027901649475, + "learning_rate": 
0.0001998900793559175, + "loss": 1.1746, + "step": 355 + }, + { + "epoch": 0.0633903133903134, + "grad_norm": 0.41206735372543335, + "learning_rate": 0.0001998894222549499, + "loss": 1.188, + "step": 356 + }, + { + "epoch": 0.06356837606837606, + "grad_norm": 0.3700513243675232, + "learning_rate": 0.00019988876319685658, + "loss": 0.9862, + "step": 357 + }, + { + "epoch": 0.06374643874643875, + "grad_norm": 0.3708794116973877, + "learning_rate": 0.0001998881021816504, + "loss": 1.2003, + "step": 358 + }, + { + "epoch": 0.06392450142450143, + "grad_norm": 0.4058014154434204, + "learning_rate": 0.00019988743920934442, + "loss": 1.2311, + "step": 359 + }, + { + "epoch": 0.0641025641025641, + "grad_norm": 0.39134132862091064, + "learning_rate": 0.00019988677427995155, + "loss": 1.001, + "step": 360 + }, + { + "epoch": 0.06428062678062678, + "grad_norm": 0.3853437602519989, + "learning_rate": 0.00019988610739348484, + "loss": 1.0725, + "step": 361 + }, + { + "epoch": 0.06445868945868946, + "grad_norm": 0.47114330530166626, + "learning_rate": 0.00019988543854995735, + "loss": 1.2196, + "step": 362 + }, + { + "epoch": 0.06463675213675214, + "grad_norm": 0.40465688705444336, + "learning_rate": 0.00019988476774938216, + "loss": 1.1869, + "step": 363 + }, + { + "epoch": 0.06481481481481481, + "grad_norm": 0.40301886200904846, + "learning_rate": 0.00019988409499177245, + "loss": 1.1765, + "step": 364 + }, + { + "epoch": 0.0649928774928775, + "grad_norm": 0.43443185091018677, + "learning_rate": 0.0001998834202771414, + "loss": 1.2022, + "step": 365 + }, + { + "epoch": 0.06517094017094018, + "grad_norm": 0.4712986350059509, + "learning_rate": 0.00019988274360550217, + "loss": 1.156, + "step": 366 + }, + { + "epoch": 0.06534900284900284, + "grad_norm": 0.4524450898170471, + "learning_rate": 0.00019988206497686815, + "loss": 1.2917, + "step": 367 + }, + { + "epoch": 0.06552706552706553, + "grad_norm": 0.40302205085754395, + "learning_rate": 0.0001998813843912525, + "loss": 
0.9993, + "step": 368 + }, + { + "epoch": 0.06570512820512821, + "grad_norm": 0.39435216784477234, + "learning_rate": 0.00019988070184866864, + "loss": 1.0914, + "step": 369 + }, + { + "epoch": 0.06588319088319089, + "grad_norm": 0.39267390966415405, + "learning_rate": 0.00019988001734912988, + "loss": 1.3138, + "step": 370 + }, + { + "epoch": 0.06606125356125356, + "grad_norm": 0.38351675868034363, + "learning_rate": 0.00019987933089264968, + "loss": 1.0997, + "step": 371 + }, + { + "epoch": 0.06623931623931624, + "grad_norm": 0.3294839859008789, + "learning_rate": 0.00019987864247924145, + "loss": 0.9656, + "step": 372 + }, + { + "epoch": 0.06641737891737892, + "grad_norm": 0.45333364605903625, + "learning_rate": 0.00019987795210891872, + "loss": 1.095, + "step": 373 + }, + { + "epoch": 0.06659544159544159, + "grad_norm": 0.4362282454967499, + "learning_rate": 0.00019987725978169501, + "loss": 1.2103, + "step": 374 + }, + { + "epoch": 0.06677350427350427, + "grad_norm": 0.41314780712127686, + "learning_rate": 0.00019987656549758385, + "loss": 1.2115, + "step": 375 + }, + { + "epoch": 0.06695156695156695, + "grad_norm": 0.4230864644050598, + "learning_rate": 0.00019987586925659888, + "loss": 1.17, + "step": 376 + }, + { + "epoch": 0.06712962962962964, + "grad_norm": 0.4703855812549591, + "learning_rate": 0.00019987517105875372, + "loss": 1.367, + "step": 377 + }, + { + "epoch": 0.0673076923076923, + "grad_norm": 0.4671297073364258, + "learning_rate": 0.00019987447090406206, + "loss": 1.2543, + "step": 378 + }, + { + "epoch": 0.06748575498575499, + "grad_norm": 0.43746981024742126, + "learning_rate": 0.0001998737687925376, + "loss": 1.214, + "step": 379 + }, + { + "epoch": 0.06766381766381767, + "grad_norm": 0.40889596939086914, + "learning_rate": 0.00019987306472419412, + "loss": 1.0496, + "step": 380 + }, + { + "epoch": 0.06784188034188034, + "grad_norm": 0.3677358627319336, + "learning_rate": 0.0001998723586990454, + "loss": 1.1242, + "step": 381 + }, + { + 
"epoch": 0.06801994301994302, + "grad_norm": 0.3892628848552704, + "learning_rate": 0.00019987165071710527, + "loss": 1.0246, + "step": 382 + }, + { + "epoch": 0.0681980056980057, + "grad_norm": 0.4281293749809265, + "learning_rate": 0.00019987094077838764, + "loss": 1.2817, + "step": 383 + }, + { + "epoch": 0.06837606837606838, + "grad_norm": 0.45030340552330017, + "learning_rate": 0.00019987022888290636, + "loss": 1.159, + "step": 384 + }, + { + "epoch": 0.06855413105413105, + "grad_norm": 0.6327905058860779, + "learning_rate": 0.00019986951503067545, + "loss": 0.9577, + "step": 385 + }, + { + "epoch": 0.06873219373219373, + "grad_norm": 0.40339627861976624, + "learning_rate": 0.0001998687992217088, + "loss": 1.138, + "step": 386 + }, + { + "epoch": 0.06891025641025642, + "grad_norm": 0.4018291234970093, + "learning_rate": 0.00019986808145602052, + "loss": 0.9109, + "step": 387 + }, + { + "epoch": 0.06908831908831908, + "grad_norm": 0.41566264629364014, + "learning_rate": 0.00019986736173362464, + "loss": 1.1516, + "step": 388 + }, + { + "epoch": 0.06926638176638177, + "grad_norm": 0.3569067418575287, + "learning_rate": 0.00019986664005453527, + "loss": 1.2329, + "step": 389 + }, + { + "epoch": 0.06944444444444445, + "grad_norm": 0.3959648907184601, + "learning_rate": 0.0001998659164187665, + "loss": 1.1041, + "step": 390 + }, + { + "epoch": 0.06962250712250712, + "grad_norm": 0.42853206396102905, + "learning_rate": 0.00019986519082633257, + "loss": 1.0859, + "step": 391 + }, + { + "epoch": 0.0698005698005698, + "grad_norm": 0.42005518078804016, + "learning_rate": 0.0001998644632772477, + "loss": 1.2017, + "step": 392 + }, + { + "epoch": 0.06997863247863248, + "grad_norm": 0.4296947419643402, + "learning_rate": 0.00019986373377152612, + "loss": 1.1464, + "step": 393 + }, + { + "epoch": 0.07015669515669516, + "grad_norm": 0.394747793674469, + "learning_rate": 0.0001998630023091821, + "loss": 1.0316, + "step": 394 + }, + { + "epoch": 0.07033475783475783, + 
"grad_norm": 0.3779357969760895, + "learning_rate": 0.00019986226889023002, + "loss": 1.1081, + "step": 395 + }, + { + "epoch": 0.07051282051282051, + "grad_norm": 0.4271804690361023, + "learning_rate": 0.00019986153351468424, + "loss": 0.985, + "step": 396 + }, + { + "epoch": 0.0706908831908832, + "grad_norm": 0.49412235617637634, + "learning_rate": 0.00019986079618255912, + "loss": 1.2606, + "step": 397 + }, + { + "epoch": 0.07086894586894586, + "grad_norm": 0.43657439947128296, + "learning_rate": 0.00019986005689386915, + "loss": 1.2266, + "step": 398 + }, + { + "epoch": 0.07104700854700854, + "grad_norm": 0.4060729444026947, + "learning_rate": 0.0001998593156486288, + "loss": 1.1787, + "step": 399 + }, + { + "epoch": 0.07122507122507123, + "grad_norm": 0.387046217918396, + "learning_rate": 0.00019985857244685264, + "loss": 0.9411, + "step": 400 + }, + { + "epoch": 0.07140313390313391, + "grad_norm": 0.4243999123573303, + "learning_rate": 0.00019985782728855516, + "loss": 1.2024, + "step": 401 + }, + { + "epoch": 0.07158119658119658, + "grad_norm": 0.43113812804222107, + "learning_rate": 0.000199857080173751, + "loss": 1.1246, + "step": 402 + }, + { + "epoch": 0.07175925925925926, + "grad_norm": 0.4653271436691284, + "learning_rate": 0.0001998563311024548, + "loss": 1.2343, + "step": 403 + }, + { + "epoch": 0.07193732193732194, + "grad_norm": 0.43260812759399414, + "learning_rate": 0.0001998555800746812, + "loss": 0.9543, + "step": 404 + }, + { + "epoch": 0.07211538461538461, + "grad_norm": 0.4635484516620636, + "learning_rate": 0.00019985482709044495, + "loss": 1.1091, + "step": 405 + }, + { + "epoch": 0.07229344729344729, + "grad_norm": 0.38362643122673035, + "learning_rate": 0.00019985407214976076, + "loss": 1.2584, + "step": 406 + }, + { + "epoch": 0.07247150997150997, + "grad_norm": 0.4068310558795929, + "learning_rate": 0.00019985331525264351, + "loss": 1.1944, + "step": 407 + }, + { + "epoch": 0.07264957264957266, + "grad_norm": 0.43909943103790283, + 
"learning_rate": 0.00019985255639910795, + "loss": 1.3748, + "step": 408 + }, + { + "epoch": 0.07282763532763532, + "grad_norm": 0.48674601316452026, + "learning_rate": 0.000199851795589169, + "loss": 1.2684, + "step": 409 + }, + { + "epoch": 0.073005698005698, + "grad_norm": 0.4218580722808838, + "learning_rate": 0.0001998510328228415, + "loss": 1.168, + "step": 410 + }, + { + "epoch": 0.07318376068376069, + "grad_norm": 0.4688236117362976, + "learning_rate": 0.00019985026810014046, + "loss": 1.3088, + "step": 411 + }, + { + "epoch": 0.07336182336182336, + "grad_norm": 0.3863612711429596, + "learning_rate": 0.00019984950142108083, + "loss": 1.0261, + "step": 412 + }, + { + "epoch": 0.07353988603988604, + "grad_norm": 0.4177640378475189, + "learning_rate": 0.00019984873278567765, + "loss": 1.1985, + "step": 413 + }, + { + "epoch": 0.07371794871794872, + "grad_norm": 0.4645586311817169, + "learning_rate": 0.00019984796219394592, + "loss": 1.2463, + "step": 414 + }, + { + "epoch": 0.0738960113960114, + "grad_norm": 0.5051766633987427, + "learning_rate": 0.00019984718964590083, + "loss": 1.3031, + "step": 415 + }, + { + "epoch": 0.07407407407407407, + "grad_norm": 0.4200040400028229, + "learning_rate": 0.0001998464151415575, + "loss": 1.0842, + "step": 416 + }, + { + "epoch": 0.07425213675213675, + "grad_norm": 0.34211036562919617, + "learning_rate": 0.000199845638680931, + "loss": 0.9659, + "step": 417 + }, + { + "epoch": 0.07443019943019943, + "grad_norm": 0.3553323447704315, + "learning_rate": 0.00019984486026403668, + "loss": 1.0102, + "step": 418 + }, + { + "epoch": 0.0746082621082621, + "grad_norm": 0.4967300295829773, + "learning_rate": 0.00019984407989088974, + "loss": 1.3125, + "step": 419 + }, + { + "epoch": 0.07478632478632478, + "grad_norm": 0.41649797558784485, + "learning_rate": 0.00019984329756150544, + "loss": 1.3092, + "step": 420 + }, + { + "epoch": 0.07496438746438747, + "grad_norm": 0.43825802206993103, + "learning_rate": 0.00019984251327589912, + 
"loss": 1.3678, + "step": 421 + }, + { + "epoch": 0.07514245014245015, + "grad_norm": 0.363394170999527, + "learning_rate": 0.00019984172703408617, + "loss": 1.305, + "step": 422 + }, + { + "epoch": 0.07532051282051282, + "grad_norm": 0.411563903093338, + "learning_rate": 0.000199840938836082, + "loss": 1.4248, + "step": 423 + }, + { + "epoch": 0.0754985754985755, + "grad_norm": 0.40548190474510193, + "learning_rate": 0.000199840148681902, + "loss": 1.1081, + "step": 424 + }, + { + "epoch": 0.07567663817663818, + "grad_norm": 0.3781099021434784, + "learning_rate": 0.00019983935657156171, + "loss": 1.185, + "step": 425 + }, + { + "epoch": 0.07585470085470085, + "grad_norm": 0.46597573161125183, + "learning_rate": 0.00019983856250507662, + "loss": 1.119, + "step": 426 + }, + { + "epoch": 0.07603276353276353, + "grad_norm": 0.3988197147846222, + "learning_rate": 0.00019983776648246232, + "loss": 1.206, + "step": 427 + }, + { + "epoch": 0.07621082621082621, + "grad_norm": 0.41210901737213135, + "learning_rate": 0.00019983696850373433, + "loss": 1.1843, + "step": 428 + }, + { + "epoch": 0.0763888888888889, + "grad_norm": 0.41870948672294617, + "learning_rate": 0.00019983616856890837, + "loss": 1.2248, + "step": 429 + }, + { + "epoch": 0.07656695156695156, + "grad_norm": 0.4320056140422821, + "learning_rate": 0.00019983536667800007, + "loss": 0.9743, + "step": 430 + }, + { + "epoch": 0.07674501424501425, + "grad_norm": 0.48455503582954407, + "learning_rate": 0.00019983456283102517, + "loss": 1.0438, + "step": 431 + }, + { + "epoch": 0.07692307692307693, + "grad_norm": 0.38712427020072937, + "learning_rate": 0.00019983375702799935, + "loss": 1.2041, + "step": 432 + }, + { + "epoch": 0.0771011396011396, + "grad_norm": 0.3578857481479645, + "learning_rate": 0.0001998329492689385, + "loss": 1.1623, + "step": 433 + }, + { + "epoch": 0.07727920227920228, + "grad_norm": 0.43065932393074036, + "learning_rate": 0.00019983213955385834, + "loss": 1.3033, + "step": 434 + }, + { + 
"epoch": 0.07745726495726496, + "grad_norm": 0.4882095754146576, + "learning_rate": 0.00019983132788277484, + "loss": 1.1635, + "step": 435 + }, + { + "epoch": 0.07763532763532764, + "grad_norm": 0.3429015874862671, + "learning_rate": 0.00019983051425570382, + "loss": 0.7289, + "step": 436 + }, + { + "epoch": 0.07781339031339031, + "grad_norm": 0.4320310056209564, + "learning_rate": 0.00019982969867266128, + "loss": 1.3685, + "step": 437 + }, + { + "epoch": 0.07799145299145299, + "grad_norm": 0.39891982078552246, + "learning_rate": 0.00019982888113366314, + "loss": 1.0444, + "step": 438 + }, + { + "epoch": 0.07816951566951567, + "grad_norm": 0.3675695061683655, + "learning_rate": 0.00019982806163872547, + "loss": 1.0527, + "step": 439 + }, + { + "epoch": 0.07834757834757834, + "grad_norm": 0.42824694514274597, + "learning_rate": 0.0001998272401878643, + "loss": 1.166, + "step": 440 + }, + { + "epoch": 0.07852564102564102, + "grad_norm": 0.3721694350242615, + "learning_rate": 0.00019982641678109575, + "loss": 1.1328, + "step": 441 + }, + { + "epoch": 0.0787037037037037, + "grad_norm": 0.33899208903312683, + "learning_rate": 0.00019982559141843592, + "loss": 1.016, + "step": 442 + }, + { + "epoch": 0.07888176638176639, + "grad_norm": 0.4029340147972107, + "learning_rate": 0.000199824764099901, + "loss": 1.0076, + "step": 443 + }, + { + "epoch": 0.07905982905982906, + "grad_norm": 0.4169132113456726, + "learning_rate": 0.0001998239348255072, + "loss": 1.208, + "step": 444 + }, + { + "epoch": 0.07923789173789174, + "grad_norm": 0.3865824043750763, + "learning_rate": 0.00019982310359527075, + "loss": 1.067, + "step": 445 + }, + { + "epoch": 0.07941595441595442, + "grad_norm": 0.4218919277191162, + "learning_rate": 0.00019982227040920796, + "loss": 1.195, + "step": 446 + }, + { + "epoch": 0.07959401709401709, + "grad_norm": 0.40504586696624756, + "learning_rate": 0.00019982143526733512, + "loss": 1.0188, + "step": 447 + }, + { + "epoch": 0.07977207977207977, + 
"grad_norm": 0.38330578804016113, + "learning_rate": 0.00019982059816966863, + "loss": 1.0484, + "step": 448 + }, + { + "epoch": 0.07995014245014245, + "grad_norm": 0.43731689453125, + "learning_rate": 0.00019981975911622488, + "loss": 1.074, + "step": 449 + }, + { + "epoch": 0.08012820512820513, + "grad_norm": 0.40858447551727295, + "learning_rate": 0.00019981891810702033, + "loss": 1.0008, + "step": 450 + }, + { + "epoch": 0.0803062678062678, + "grad_norm": 0.4031754732131958, + "learning_rate": 0.00019981807514207143, + "loss": 1.2179, + "step": 451 + }, + { + "epoch": 0.08048433048433049, + "grad_norm": 0.41920867562294006, + "learning_rate": 0.00019981723022139466, + "loss": 1.1406, + "step": 452 + }, + { + "epoch": 0.08066239316239317, + "grad_norm": 0.40305474400520325, + "learning_rate": 0.00019981638334500668, + "loss": 1.098, + "step": 453 + }, + { + "epoch": 0.08084045584045584, + "grad_norm": 0.4564182460308075, + "learning_rate": 0.00019981553451292396, + "loss": 1.419, + "step": 454 + }, + { + "epoch": 0.08101851851851852, + "grad_norm": 0.3832945227622986, + "learning_rate": 0.00019981468372516322, + "loss": 1.0919, + "step": 455 + }, + { + "epoch": 0.0811965811965812, + "grad_norm": 0.43062624335289, + "learning_rate": 0.0001998138309817411, + "loss": 1.0458, + "step": 456 + }, + { + "epoch": 0.08137464387464387, + "grad_norm": 0.3871173560619354, + "learning_rate": 0.0001998129762826743, + "loss": 1.1391, + "step": 457 + }, + { + "epoch": 0.08155270655270655, + "grad_norm": 0.43423157930374146, + "learning_rate": 0.0001998121196279796, + "loss": 1.1132, + "step": 458 + }, + { + "epoch": 0.08173076923076923, + "grad_norm": 0.4341012239456177, + "learning_rate": 0.00019981126101767372, + "loss": 1.113, + "step": 459 + }, + { + "epoch": 0.08190883190883191, + "grad_norm": 0.36748576164245605, + "learning_rate": 0.00019981040045177352, + "loss": 0.8108, + "step": 460 + }, + { + "epoch": 0.08208689458689458, + "grad_norm": 0.43133220076560974, + 
"learning_rate": 0.00019980953793029586, + "loss": 1.1861, + "step": 461 + }, + { + "epoch": 0.08226495726495726, + "grad_norm": 0.37204909324645996, + "learning_rate": 0.00019980867345325767, + "loss": 0.9222, + "step": 462 + }, + { + "epoch": 0.08244301994301995, + "grad_norm": 0.43370047211647034, + "learning_rate": 0.00019980780702067582, + "loss": 1.2984, + "step": 463 + }, + { + "epoch": 0.08262108262108261, + "grad_norm": 0.4991510808467865, + "learning_rate": 0.00019980693863256736, + "loss": 1.2222, + "step": 464 + }, + { + "epoch": 0.0827991452991453, + "grad_norm": 0.44318175315856934, + "learning_rate": 0.00019980606828894927, + "loss": 1.2262, + "step": 465 + }, + { + "epoch": 0.08297720797720798, + "grad_norm": 0.380231648683548, + "learning_rate": 0.0001998051959898386, + "loss": 1.0274, + "step": 466 + }, + { + "epoch": 0.08315527065527066, + "grad_norm": 0.39519667625427246, + "learning_rate": 0.0001998043217352524, + "loss": 1.2499, + "step": 467 + }, + { + "epoch": 0.08333333333333333, + "grad_norm": 0.457499235868454, + "learning_rate": 0.0001998034455252079, + "loss": 1.0751, + "step": 468 + }, + { + "epoch": 0.08351139601139601, + "grad_norm": 0.368522584438324, + "learning_rate": 0.00019980256735972215, + "loss": 1.0776, + "step": 469 + }, + { + "epoch": 0.08368945868945869, + "grad_norm": 0.3768427073955536, + "learning_rate": 0.00019980168723881243, + "loss": 1.2198, + "step": 470 + }, + { + "epoch": 0.08386752136752136, + "grad_norm": 0.37045565247535706, + "learning_rate": 0.000199800805162496, + "loss": 1.1816, + "step": 471 + }, + { + "epoch": 0.08404558404558404, + "grad_norm": 0.4219281077384949, + "learning_rate": 0.0001997999211307901, + "loss": 1.0515, + "step": 472 + }, + { + "epoch": 0.08422364672364673, + "grad_norm": 0.3815271258354187, + "learning_rate": 0.00019979903514371207, + "loss": 1.1709, + "step": 473 + }, + { + "epoch": 0.08440170940170941, + "grad_norm": 0.4566493630409241, + "learning_rate": 0.00019979814720127924, 
+ "loss": 1.3063, + "step": 474 + }, + { + "epoch": 0.08457977207977208, + "grad_norm": 0.4043879806995392, + "learning_rate": 0.000199797257303509, + "loss": 1.0549, + "step": 475 + }, + { + "epoch": 0.08475783475783476, + "grad_norm": 0.3897830545902252, + "learning_rate": 0.00019979636545041886, + "loss": 1.1483, + "step": 476 + }, + { + "epoch": 0.08493589743589744, + "grad_norm": 0.36097025871276855, + "learning_rate": 0.00019979547164202622, + "loss": 1.1196, + "step": 477 + }, + { + "epoch": 0.08511396011396011, + "grad_norm": 0.3766986131668091, + "learning_rate": 0.00019979457587834863, + "loss": 1.0131, + "step": 478 + }, + { + "epoch": 0.08529202279202279, + "grad_norm": 0.39460286498069763, + "learning_rate": 0.00019979367815940364, + "loss": 1.1729, + "step": 479 + }, + { + "epoch": 0.08547008547008547, + "grad_norm": 0.4137469232082367, + "learning_rate": 0.00019979277848520885, + "loss": 1.2569, + "step": 480 + }, + { + "epoch": 0.08564814814814815, + "grad_norm": 0.464688777923584, + "learning_rate": 0.00019979187685578183, + "loss": 1.2064, + "step": 481 + }, + { + "epoch": 0.08582621082621082, + "grad_norm": 0.4245518147945404, + "learning_rate": 0.0001997909732711403, + "loss": 0.9812, + "step": 482 + }, + { + "epoch": 0.0860042735042735, + "grad_norm": 0.43368837237358093, + "learning_rate": 0.00019979006773130197, + "loss": 1.2822, + "step": 483 + }, + { + "epoch": 0.08618233618233619, + "grad_norm": 0.4232824444770813, + "learning_rate": 0.00019978916023628452, + "loss": 1.1446, + "step": 484 + }, + { + "epoch": 0.08636039886039885, + "grad_norm": 0.4183506369590759, + "learning_rate": 0.00019978825078610578, + "loss": 1.2605, + "step": 485 + }, + { + "epoch": 0.08653846153846154, + "grad_norm": 0.4391268491744995, + "learning_rate": 0.00019978733938078356, + "loss": 1.2165, + "step": 486 + }, + { + "epoch": 0.08671652421652422, + "grad_norm": 0.4139612317085266, + "learning_rate": 0.0001997864260203357, + "loss": 0.9389, + "step": 487 + }, + 
{ + "epoch": 0.0868945868945869, + "grad_norm": 0.4058656096458435, + "learning_rate": 0.00019978551070478013, + "loss": 1.0652, + "step": 488 + }, + { + "epoch": 0.08707264957264957, + "grad_norm": 0.42333099246025085, + "learning_rate": 0.00019978459343413473, + "loss": 1.119, + "step": 489 + }, + { + "epoch": 0.08725071225071225, + "grad_norm": 0.4573031961917877, + "learning_rate": 0.00019978367420841754, + "loss": 1.1546, + "step": 490 + }, + { + "epoch": 0.08742877492877493, + "grad_norm": 0.4161617159843445, + "learning_rate": 0.00019978275302764655, + "loss": 1.0836, + "step": 491 + }, + { + "epoch": 0.0876068376068376, + "grad_norm": 0.422145277261734, + "learning_rate": 0.00019978182989183977, + "loss": 1.1908, + "step": 492 + }, + { + "epoch": 0.08778490028490028, + "grad_norm": 0.4588126838207245, + "learning_rate": 0.00019978090480101532, + "loss": 1.1758, + "step": 493 + }, + { + "epoch": 0.08796296296296297, + "grad_norm": 0.4425722062587738, + "learning_rate": 0.00019977997775519132, + "loss": 1.088, + "step": 494 + }, + { + "epoch": 0.08814102564102565, + "grad_norm": 0.37860307097435, + "learning_rate": 0.00019977904875438594, + "loss": 1.1532, + "step": 495 + }, + { + "epoch": 0.08831908831908832, + "grad_norm": 0.40435823798179626, + "learning_rate": 0.00019977811779861733, + "loss": 1.1271, + "step": 496 + }, + { + "epoch": 0.088497150997151, + "grad_norm": 0.42578884959220886, + "learning_rate": 0.0001997771848879038, + "loss": 0.9889, + "step": 497 + }, + { + "epoch": 0.08867521367521368, + "grad_norm": 0.3439478874206543, + "learning_rate": 0.00019977625002226361, + "loss": 1.1273, + "step": 498 + }, + { + "epoch": 0.08885327635327635, + "grad_norm": 0.362341970205307, + "learning_rate": 0.00019977531320171504, + "loss": 1.0214, + "step": 499 + }, + { + "epoch": 0.08903133903133903, + "grad_norm": 0.4305768609046936, + "learning_rate": 0.0001997743744262765, + "loss": 1.2648, + "step": 500 + }, + { + "epoch": 0.08920940170940171, + 
"grad_norm": 0.35900023579597473, + "learning_rate": 0.00019977343369596636, + "loss": 1.0274, + "step": 501 + }, + { + "epoch": 0.0893874643874644, + "grad_norm": 0.4950818717479706, + "learning_rate": 0.00019977249101080306, + "loss": 1.1483, + "step": 502 + }, + { + "epoch": 0.08956552706552706, + "grad_norm": 0.3800346553325653, + "learning_rate": 0.00019977154637080503, + "loss": 1.0636, + "step": 503 + }, + { + "epoch": 0.08974358974358974, + "grad_norm": 0.46202352643013, + "learning_rate": 0.0001997705997759908, + "loss": 1.1544, + "step": 504 + }, + { + "epoch": 0.08992165242165243, + "grad_norm": 0.36818403005599976, + "learning_rate": 0.00019976965122637895, + "loss": 0.9824, + "step": 505 + }, + { + "epoch": 0.0900997150997151, + "grad_norm": 0.40248095989227295, + "learning_rate": 0.00019976870072198805, + "loss": 1.1002, + "step": 506 + }, + { + "epoch": 0.09027777777777778, + "grad_norm": 0.3841850459575653, + "learning_rate": 0.00019976774826283667, + "loss": 1.2433, + "step": 507 + }, + { + "epoch": 0.09045584045584046, + "grad_norm": 0.46892330050468445, + "learning_rate": 0.0001997667938489435, + "loss": 1.3194, + "step": 508 + }, + { + "epoch": 0.09063390313390314, + "grad_norm": 0.39059561491012573, + "learning_rate": 0.0001997658374803273, + "loss": 1.1778, + "step": 509 + }, + { + "epoch": 0.09081196581196581, + "grad_norm": 0.3793235421180725, + "learning_rate": 0.00019976487915700672, + "loss": 1.0659, + "step": 510 + }, + { + "epoch": 0.09099002849002849, + "grad_norm": 0.39067742228507996, + "learning_rate": 0.00019976391887900058, + "loss": 1.107, + "step": 511 + }, + { + "epoch": 0.09116809116809117, + "grad_norm": 0.40121713280677795, + "learning_rate": 0.00019976295664632772, + "loss": 1.102, + "step": 512 + }, + { + "epoch": 0.09134615384615384, + "grad_norm": 0.49830010533332825, + "learning_rate": 0.00019976199245900697, + "loss": 1.1701, + "step": 513 + }, + { + "epoch": 0.09152421652421652, + "grad_norm": 0.4536968171596527, + 
"learning_rate": 0.0001997610263170572, + "loss": 1.1067, + "step": 514 + }, + { + "epoch": 0.0917022792022792, + "grad_norm": 0.3832971453666687, + "learning_rate": 0.00019976005822049735, + "loss": 1.0991, + "step": 515 + }, + { + "epoch": 0.09188034188034189, + "grad_norm": 0.4093509614467621, + "learning_rate": 0.0001997590881693464, + "loss": 1.0565, + "step": 516 + }, + { + "epoch": 0.09205840455840456, + "grad_norm": 0.46073687076568604, + "learning_rate": 0.0001997581161636233, + "loss": 1.0057, + "step": 517 + }, + { + "epoch": 0.09223646723646724, + "grad_norm": 0.5001922845840454, + "learning_rate": 0.0001997571422033472, + "loss": 1.2639, + "step": 518 + }, + { + "epoch": 0.09241452991452992, + "grad_norm": 0.4620618224143982, + "learning_rate": 0.00019975616628853713, + "loss": 1.0966, + "step": 519 + }, + { + "epoch": 0.09259259259259259, + "grad_norm": 0.3788183927536011, + "learning_rate": 0.0001997551884192122, + "loss": 0.9783, + "step": 520 + }, + { + "epoch": 0.09277065527065527, + "grad_norm": 0.45589539408683777, + "learning_rate": 0.00019975420859539154, + "loss": 1.2194, + "step": 521 + }, + { + "epoch": 0.09294871794871795, + "grad_norm": 0.40747523307800293, + "learning_rate": 0.00019975322681709443, + "loss": 1.0349, + "step": 522 + }, + { + "epoch": 0.09312678062678063, + "grad_norm": 0.5045142769813538, + "learning_rate": 0.00019975224308434002, + "loss": 1.1373, + "step": 523 + }, + { + "epoch": 0.0933048433048433, + "grad_norm": 0.40352702140808105, + "learning_rate": 0.00019975125739714767, + "loss": 1.1236, + "step": 524 + }, + { + "epoch": 0.09348290598290598, + "grad_norm": 0.4301735758781433, + "learning_rate": 0.0001997502697555366, + "loss": 1.2932, + "step": 525 + }, + { + "epoch": 0.09366096866096867, + "grad_norm": 0.36800238490104675, + "learning_rate": 0.00019974928015952624, + "loss": 1.0734, + "step": 526 + }, + { + "epoch": 0.09383903133903133, + "grad_norm": 0.4027230143547058, + "learning_rate": 
0.00019974828860913594, + "loss": 1.2776, + "step": 527 + }, + { + "epoch": 0.09401709401709402, + "grad_norm": 0.42497140169143677, + "learning_rate": 0.0001997472951043851, + "loss": 1.248, + "step": 528 + }, + { + "epoch": 0.0941951566951567, + "grad_norm": 0.3888593018054962, + "learning_rate": 0.00019974629964529325, + "loss": 1.0231, + "step": 529 + }, + { + "epoch": 0.09437321937321937, + "grad_norm": 0.3761361241340637, + "learning_rate": 0.00019974530223187986, + "loss": 1.0216, + "step": 530 + }, + { + "epoch": 0.09455128205128205, + "grad_norm": 0.42192980647087097, + "learning_rate": 0.00019974430286416448, + "loss": 1.0731, + "step": 531 + }, + { + "epoch": 0.09472934472934473, + "grad_norm": 0.44244512915611267, + "learning_rate": 0.00019974330154216667, + "loss": 1.2793, + "step": 532 + }, + { + "epoch": 0.09490740740740741, + "grad_norm": 0.378252774477005, + "learning_rate": 0.0001997422982659061, + "loss": 1.0462, + "step": 533 + }, + { + "epoch": 0.09508547008547008, + "grad_norm": 0.45589110255241394, + "learning_rate": 0.00019974129303540236, + "loss": 1.1884, + "step": 534 + }, + { + "epoch": 0.09526353276353276, + "grad_norm": 0.33930808305740356, + "learning_rate": 0.0001997402858506752, + "loss": 0.8381, + "step": 535 + }, + { + "epoch": 0.09544159544159544, + "grad_norm": 0.45408427715301514, + "learning_rate": 0.0001997392767117443, + "loss": 1.2379, + "step": 536 + }, + { + "epoch": 0.09561965811965811, + "grad_norm": 0.44125741720199585, + "learning_rate": 0.0001997382656186295, + "loss": 1.1941, + "step": 537 + }, + { + "epoch": 0.0957977207977208, + "grad_norm": 0.4075697660446167, + "learning_rate": 0.00019973725257135054, + "loss": 1.0142, + "step": 538 + }, + { + "epoch": 0.09597578347578348, + "grad_norm": 0.4258415102958679, + "learning_rate": 0.00019973623756992733, + "loss": 1.0447, + "step": 539 + }, + { + "epoch": 0.09615384615384616, + "grad_norm": 0.2738485038280487, + "learning_rate": 0.0001997352206143797, + "loss": 
0.5521, + "step": 540 + }, + { + "epoch": 0.09633190883190883, + "grad_norm": 0.38815587759017944, + "learning_rate": 0.00019973420170472762, + "loss": 1.1052, + "step": 541 + }, + { + "epoch": 0.09650997150997151, + "grad_norm": 0.3909834027290344, + "learning_rate": 0.00019973318084099106, + "loss": 1.0494, + "step": 542 + }, + { + "epoch": 0.09668803418803419, + "grad_norm": 0.4517597258090973, + "learning_rate": 0.00019973215802318996, + "loss": 1.0611, + "step": 543 + }, + { + "epoch": 0.09686609686609686, + "grad_norm": 0.48659002780914307, + "learning_rate": 0.00019973113325134442, + "loss": 0.9967, + "step": 544 + }, + { + "epoch": 0.09704415954415954, + "grad_norm": 0.4039791524410248, + "learning_rate": 0.0001997301065254745, + "loss": 1.251, + "step": 545 + }, + { + "epoch": 0.09722222222222222, + "grad_norm": 0.3985383212566376, + "learning_rate": 0.0001997290778456003, + "loss": 1.2263, + "step": 546 + }, + { + "epoch": 0.0974002849002849, + "grad_norm": 0.4540637731552124, + "learning_rate": 0.00019972804721174199, + "loss": 1.2084, + "step": 547 + }, + { + "epoch": 0.09757834757834757, + "grad_norm": 0.36867982149124146, + "learning_rate": 0.00019972701462391977, + "loss": 0.9704, + "step": 548 + }, + { + "epoch": 0.09775641025641026, + "grad_norm": 0.40199780464172363, + "learning_rate": 0.00019972598008215385, + "loss": 1.1121, + "step": 549 + }, + { + "epoch": 0.09793447293447294, + "grad_norm": 0.42728984355926514, + "learning_rate": 0.00019972494358646455, + "loss": 1.1606, + "step": 550 + }, + { + "epoch": 0.0981125356125356, + "grad_norm": 0.4212374687194824, + "learning_rate": 0.0001997239051368721, + "loss": 1.3093, + "step": 551 + }, + { + "epoch": 0.09829059829059829, + "grad_norm": 0.3972226083278656, + "learning_rate": 0.0001997228647333969, + "loss": 1.1218, + "step": 552 + }, + { + "epoch": 0.09846866096866097, + "grad_norm": 0.43649932742118835, + "learning_rate": 0.00019972182237605935, + "loss": 1.2532, + "step": 553 + }, + { + 
"epoch": 0.09864672364672365, + "grad_norm": 0.3812280595302582, + "learning_rate": 0.0001997207780648798, + "loss": 1.0409, + "step": 554 + }, + { + "epoch": 0.09882478632478632, + "grad_norm": 0.41684821248054504, + "learning_rate": 0.00019971973179987878, + "loss": 0.9569, + "step": 555 + }, + { + "epoch": 0.099002849002849, + "grad_norm": 0.38081470131874084, + "learning_rate": 0.00019971868358107674, + "loss": 1.1615, + "step": 556 + }, + { + "epoch": 0.09918091168091168, + "grad_norm": 0.3702073097229004, + "learning_rate": 0.0001997176334084943, + "loss": 1.3907, + "step": 557 + }, + { + "epoch": 0.09935897435897435, + "grad_norm": 0.3625728189945221, + "learning_rate": 0.00019971658128215193, + "loss": 1.1897, + "step": 558 + }, + { + "epoch": 0.09953703703703703, + "grad_norm": 0.3815405070781708, + "learning_rate": 0.0001997155272020703, + "loss": 1.1473, + "step": 559 + }, + { + "epoch": 0.09971509971509972, + "grad_norm": 0.48664286732673645, + "learning_rate": 0.00019971447116827004, + "loss": 1.2462, + "step": 560 + }, + { + "epoch": 0.0998931623931624, + "grad_norm": 0.3708696663379669, + "learning_rate": 0.0001997134131807719, + "loss": 1.0979, + "step": 561 + }, + { + "epoch": 0.10007122507122507, + "grad_norm": 0.44511324167251587, + "learning_rate": 0.00019971235323959654, + "loss": 1.2313, + "step": 562 + }, + { + "epoch": 0.10024928774928775, + "grad_norm": 0.3687448799610138, + "learning_rate": 0.00019971129134476473, + "loss": 1.1526, + "step": 563 + }, + { + "epoch": 0.10042735042735043, + "grad_norm": 0.4506866931915283, + "learning_rate": 0.00019971022749629735, + "loss": 1.0003, + "step": 564 + }, + { + "epoch": 0.1006054131054131, + "grad_norm": 0.41910406947135925, + "learning_rate": 0.00019970916169421515, + "loss": 1.013, + "step": 565 + }, + { + "epoch": 0.10078347578347578, + "grad_norm": 0.39728936553001404, + "learning_rate": 0.0001997080939385391, + "loss": 1.0501, + "step": 566 + }, + { + "epoch": 0.10096153846153846, + 
"grad_norm": 0.41415902972221375, + "learning_rate": 0.00019970702422929005, + "loss": 1.0791, + "step": 567 + }, + { + "epoch": 0.10113960113960115, + "grad_norm": 0.45630788803100586, + "learning_rate": 0.00019970595256648896, + "loss": 1.2884, + "step": 568 + }, + { + "epoch": 0.10131766381766381, + "grad_norm": 0.4371698796749115, + "learning_rate": 0.00019970487895015686, + "loss": 1.0684, + "step": 569 + }, + { + "epoch": 0.1014957264957265, + "grad_norm": 0.4350591003894806, + "learning_rate": 0.00019970380338031477, + "loss": 1.2415, + "step": 570 + }, + { + "epoch": 0.10167378917378918, + "grad_norm": 0.4232708215713501, + "learning_rate": 0.00019970272585698382, + "loss": 1.2656, + "step": 571 + }, + { + "epoch": 0.10185185185185185, + "grad_norm": 0.3917689919471741, + "learning_rate": 0.00019970164638018502, + "loss": 1.0178, + "step": 572 + }, + { + "epoch": 0.10202991452991453, + "grad_norm": 0.4262804388999939, + "learning_rate": 0.0001997005649499396, + "loss": 1.1805, + "step": 573 + }, + { + "epoch": 0.10220797720797721, + "grad_norm": 0.5217884182929993, + "learning_rate": 0.0001996994815662687, + "loss": 1.2392, + "step": 574 + }, + { + "epoch": 0.10238603988603989, + "grad_norm": 0.4273875057697296, + "learning_rate": 0.00019969839622919358, + "loss": 1.0844, + "step": 575 + }, + { + "epoch": 0.10256410256410256, + "grad_norm": 0.41588085889816284, + "learning_rate": 0.00019969730893873547, + "loss": 1.2437, + "step": 576 + }, + { + "epoch": 0.10274216524216524, + "grad_norm": 0.41617709398269653, + "learning_rate": 0.0001996962196949157, + "loss": 0.9519, + "step": 577 + }, + { + "epoch": 0.10292022792022792, + "grad_norm": 0.4832979142665863, + "learning_rate": 0.00019969512849775565, + "loss": 1.1889, + "step": 578 + }, + { + "epoch": 0.10309829059829059, + "grad_norm": 0.3936060965061188, + "learning_rate": 0.0001996940353472766, + "loss": 0.9888, + "step": 579 + }, + { + "epoch": 0.10327635327635327, + "grad_norm": 0.4147680997848511, + 
"learning_rate": 0.00019969294024350004, + "loss": 1.0733, + "step": 580 + }, + { + "epoch": 0.10345441595441596, + "grad_norm": 0.37791356444358826, + "learning_rate": 0.00019969184318644742, + "loss": 1.212, + "step": 581 + }, + { + "epoch": 0.10363247863247864, + "grad_norm": 0.44297221302986145, + "learning_rate": 0.00019969074417614023, + "loss": 1.0535, + "step": 582 + }, + { + "epoch": 0.10381054131054131, + "grad_norm": 0.4032835066318512, + "learning_rate": 0.0001996896432126, + "loss": 1.1869, + "step": 583 + }, + { + "epoch": 0.10398860398860399, + "grad_norm": 0.49271953105926514, + "learning_rate": 0.00019968854029584827, + "loss": 1.1661, + "step": 584 + }, + { + "epoch": 0.10416666666666667, + "grad_norm": 0.362699031829834, + "learning_rate": 0.0001996874354259067, + "loss": 0.868, + "step": 585 + }, + { + "epoch": 0.10434472934472934, + "grad_norm": 0.401795357465744, + "learning_rate": 0.0001996863286027969, + "loss": 1.1045, + "step": 586 + }, + { + "epoch": 0.10452279202279202, + "grad_norm": 0.45380479097366333, + "learning_rate": 0.00019968521982654058, + "loss": 0.8503, + "step": 587 + }, + { + "epoch": 0.1047008547008547, + "grad_norm": 0.49759066104888916, + "learning_rate": 0.00019968410909715947, + "loss": 1.4073, + "step": 588 + }, + { + "epoch": 0.10487891737891739, + "grad_norm": 0.4421198070049286, + "learning_rate": 0.0001996829964146753, + "loss": 1.1512, + "step": 589 + }, + { + "epoch": 0.10505698005698005, + "grad_norm": 0.46675658226013184, + "learning_rate": 0.00019968188177910988, + "loss": 1.0132, + "step": 590 + }, + { + "epoch": 0.10523504273504274, + "grad_norm": 0.5710657238960266, + "learning_rate": 0.00019968076519048507, + "loss": 1.267, + "step": 591 + }, + { + "epoch": 0.10541310541310542, + "grad_norm": 0.4655563235282898, + "learning_rate": 0.00019967964664882276, + "loss": 1.1204, + "step": 592 + }, + { + "epoch": 0.10559116809116809, + "grad_norm": 0.3895256519317627, + "learning_rate": 0.00019967852615414478, + 
"loss": 1.0814, + "step": 593 + }, + { + "epoch": 0.10576923076923077, + "grad_norm": 0.424216091632843, + "learning_rate": 0.00019967740370647322, + "loss": 1.1663, + "step": 594 + }, + { + "epoch": 0.10594729344729345, + "grad_norm": 0.3978985846042633, + "learning_rate": 0.00019967627930582996, + "loss": 0.909, + "step": 595 + }, + { + "epoch": 0.10612535612535613, + "grad_norm": 0.47064995765686035, + "learning_rate": 0.00019967515295223705, + "loss": 1.2351, + "step": 596 + }, + { + "epoch": 0.1063034188034188, + "grad_norm": 0.42449644207954407, + "learning_rate": 0.0001996740246457166, + "loss": 0.9739, + "step": 597 + }, + { + "epoch": 0.10648148148148148, + "grad_norm": 0.39033401012420654, + "learning_rate": 0.00019967289438629066, + "loss": 1.0933, + "step": 598 + }, + { + "epoch": 0.10665954415954416, + "grad_norm": 0.4398612678050995, + "learning_rate": 0.00019967176217398143, + "loss": 1.2479, + "step": 599 + }, + { + "epoch": 0.10683760683760683, + "grad_norm": 0.3946632742881775, + "learning_rate": 0.00019967062800881107, + "loss": 1.0417, + "step": 600 + }, + { + "epoch": 0.10701566951566951, + "grad_norm": 0.5083445906639099, + "learning_rate": 0.0001996694918908018, + "loss": 1.1109, + "step": 601 + }, + { + "epoch": 0.1071937321937322, + "grad_norm": 0.477724552154541, + "learning_rate": 0.00019966835381997585, + "loss": 1.2891, + "step": 602 + }, + { + "epoch": 0.10737179487179487, + "grad_norm": 0.4110167920589447, + "learning_rate": 0.0001996672137963556, + "loss": 1.0555, + "step": 603 + }, + { + "epoch": 0.10754985754985755, + "grad_norm": 0.44078320264816284, + "learning_rate": 0.00019966607181996334, + "loss": 0.9188, + "step": 604 + }, + { + "epoch": 0.10772792022792023, + "grad_norm": 0.41251105070114136, + "learning_rate": 0.00019966492789082142, + "loss": 1.2592, + "step": 605 + }, + { + "epoch": 0.10790598290598291, + "grad_norm": 0.37701505422592163, + "learning_rate": 0.00019966378200895227, + "loss": 1.0233, + "step": 606 + }, + { 
+ "epoch": 0.10808404558404558, + "grad_norm": 0.44624966382980347, + "learning_rate": 0.00019966263417437835, + "loss": 1.2273, + "step": 607 + }, + { + "epoch": 0.10826210826210826, + "grad_norm": 0.3618549108505249, + "learning_rate": 0.00019966148438712214, + "loss": 0.9101, + "step": 608 + }, + { + "epoch": 0.10844017094017094, + "grad_norm": 0.384574294090271, + "learning_rate": 0.00019966033264720616, + "loss": 1.1769, + "step": 609 + }, + { + "epoch": 0.10861823361823361, + "grad_norm": 0.50872403383255, + "learning_rate": 0.000199659178954653, + "loss": 1.1213, + "step": 610 + }, + { + "epoch": 0.1087962962962963, + "grad_norm": 0.39736685156822205, + "learning_rate": 0.00019965802330948527, + "loss": 1.275, + "step": 611 + }, + { + "epoch": 0.10897435897435898, + "grad_norm": 0.484660267829895, + "learning_rate": 0.00019965686571172557, + "loss": 1.1671, + "step": 612 + }, + { + "epoch": 0.10915242165242166, + "grad_norm": 0.41420218348503113, + "learning_rate": 0.0001996557061613966, + "loss": 0.9541, + "step": 613 + }, + { + "epoch": 0.10933048433048433, + "grad_norm": 0.4057196080684662, + "learning_rate": 0.00019965454465852112, + "loss": 1.0145, + "step": 614 + }, + { + "epoch": 0.10950854700854701, + "grad_norm": 0.4559510052204132, + "learning_rate": 0.00019965338120312182, + "loss": 1.0889, + "step": 615 + }, + { + "epoch": 0.10968660968660969, + "grad_norm": 0.40960055589675903, + "learning_rate": 0.00019965221579522154, + "loss": 1.1447, + "step": 616 + }, + { + "epoch": 0.10986467236467236, + "grad_norm": 0.4701732099056244, + "learning_rate": 0.0001996510484348431, + "loss": 1.2871, + "step": 617 + }, + { + "epoch": 0.11004273504273504, + "grad_norm": 0.38420796394348145, + "learning_rate": 0.0001996498791220094, + "loss": 1.058, + "step": 618 + }, + { + "epoch": 0.11022079772079772, + "grad_norm": 0.4014730453491211, + "learning_rate": 0.00019964870785674327, + "loss": 1.023, + "step": 619 + }, + { + "epoch": 0.1103988603988604, + 
"grad_norm": 0.38846179842948914, + "learning_rate": 0.00019964753463906773, + "loss": 0.9834, + "step": 620 + }, + { + "epoch": 0.11057692307692307, + "grad_norm": 0.5120236277580261, + "learning_rate": 0.00019964635946900577, + "loss": 1.2347, + "step": 621 + }, + { + "epoch": 0.11075498575498575, + "grad_norm": 0.40483301877975464, + "learning_rate": 0.00019964518234658038, + "loss": 1.131, + "step": 622 + }, + { + "epoch": 0.11093304843304844, + "grad_norm": 0.445782870054245, + "learning_rate": 0.00019964400327181464, + "loss": 0.9349, + "step": 623 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 0.490460604429245, + "learning_rate": 0.00019964282224473165, + "loss": 1.0257, + "step": 624 + }, + { + "epoch": 0.11128917378917379, + "grad_norm": 0.37585243582725525, + "learning_rate": 0.00019964163926535454, + "loss": 0.9724, + "step": 625 + }, + { + "epoch": 0.11146723646723647, + "grad_norm": 0.4160473346710205, + "learning_rate": 0.00019964045433370651, + "loss": 0.874, + "step": 626 + }, + { + "epoch": 0.11164529914529915, + "grad_norm": 0.442425012588501, + "learning_rate": 0.00019963926744981074, + "loss": 1.064, + "step": 627 + }, + { + "epoch": 0.11182336182336182, + "grad_norm": 0.4451471269130707, + "learning_rate": 0.00019963807861369054, + "loss": 1.2343, + "step": 628 + }, + { + "epoch": 0.1120014245014245, + "grad_norm": 0.5018183588981628, + "learning_rate": 0.00019963688782536913, + "loss": 1.1226, + "step": 629 + }, + { + "epoch": 0.11217948717948718, + "grad_norm": 0.43723925948143005, + "learning_rate": 0.0001996356950848699, + "loss": 1.0178, + "step": 630 + }, + { + "epoch": 0.11235754985754985, + "grad_norm": 0.4794611930847168, + "learning_rate": 0.0001996345003922162, + "loss": 0.9695, + "step": 631 + }, + { + "epoch": 0.11253561253561253, + "grad_norm": 0.5021790266036987, + "learning_rate": 0.00019963330374743143, + "loss": 1.1748, + "step": 632 + }, + { + "epoch": 0.11271367521367522, + "grad_norm": 0.47228625416755676, + 
"learning_rate": 0.00019963210515053906, + "loss": 1.2138, + "step": 633 + }, + { + "epoch": 0.1128917378917379, + "grad_norm": 0.4261155128479004, + "learning_rate": 0.00019963090460156256, + "loss": 0.9428, + "step": 634 + }, + { + "epoch": 0.11306980056980057, + "grad_norm": 0.3279525339603424, + "learning_rate": 0.00019962970210052542, + "loss": 0.7803, + "step": 635 + }, + { + "epoch": 0.11324786324786325, + "grad_norm": 0.5106086730957031, + "learning_rate": 0.00019962849764745125, + "loss": 1.113, + "step": 636 + }, + { + "epoch": 0.11342592592592593, + "grad_norm": 0.38272222876548767, + "learning_rate": 0.00019962729124236363, + "loss": 0.896, + "step": 637 + }, + { + "epoch": 0.1136039886039886, + "grad_norm": 0.39532098174095154, + "learning_rate": 0.0001996260828852862, + "loss": 0.9308, + "step": 638 + }, + { + "epoch": 0.11378205128205128, + "grad_norm": 0.44947221875190735, + "learning_rate": 0.00019962487257624262, + "loss": 1.207, + "step": 639 + }, + { + "epoch": 0.11396011396011396, + "grad_norm": 0.40684598684310913, + "learning_rate": 0.00019962366031525664, + "loss": 1.11, + "step": 640 + }, + { + "epoch": 0.11413817663817664, + "grad_norm": 0.4296625852584839, + "learning_rate": 0.00019962244610235194, + "loss": 1.2784, + "step": 641 + }, + { + "epoch": 0.11431623931623931, + "grad_norm": 0.4560794532299042, + "learning_rate": 0.0001996212299375524, + "loss": 1.1191, + "step": 642 + }, + { + "epoch": 0.114494301994302, + "grad_norm": 0.40246087312698364, + "learning_rate": 0.00019962001182088177, + "loss": 1.1401, + "step": 643 + }, + { + "epoch": 0.11467236467236468, + "grad_norm": 0.3938910663127899, + "learning_rate": 0.000199618791752364, + "loss": 1.0959, + "step": 644 + }, + { + "epoch": 0.11485042735042734, + "grad_norm": 0.4123380184173584, + "learning_rate": 0.00019961756973202287, + "loss": 1.2824, + "step": 645 + }, + { + "epoch": 0.11502849002849003, + "grad_norm": 0.41085442900657654, + "learning_rate": 0.00019961634575988243, + 
"loss": 1.1137, + "step": 646 + }, + { + "epoch": 0.11520655270655271, + "grad_norm": 0.38276201486587524, + "learning_rate": 0.0001996151198359667, + "loss": 1.0747, + "step": 647 + }, + { + "epoch": 0.11538461538461539, + "grad_norm": 0.49269407987594604, + "learning_rate": 0.00019961389196029953, + "loss": 1.1731, + "step": 648 + }, + { + "epoch": 0.11556267806267806, + "grad_norm": 0.5152469277381897, + "learning_rate": 0.00019961266213290512, + "loss": 1.3574, + "step": 649 + }, + { + "epoch": 0.11574074074074074, + "grad_norm": 0.4835714101791382, + "learning_rate": 0.0001996114303538075, + "loss": 1.2859, + "step": 650 + }, + { + "epoch": 0.11591880341880342, + "grad_norm": 0.4284524917602539, + "learning_rate": 0.00019961019662303087, + "loss": 1.1103, + "step": 651 + }, + { + "epoch": 0.11609686609686609, + "grad_norm": 0.3933276832103729, + "learning_rate": 0.00019960896094059933, + "loss": 1.2647, + "step": 652 + }, + { + "epoch": 0.11627492877492877, + "grad_norm": 0.33749741315841675, + "learning_rate": 0.00019960772330653712, + "loss": 0.819, + "step": 653 + }, + { + "epoch": 0.11645299145299146, + "grad_norm": 0.48122069239616394, + "learning_rate": 0.00019960648372086852, + "loss": 1.2781, + "step": 654 + }, + { + "epoch": 0.11663105413105414, + "grad_norm": 0.4681607186794281, + "learning_rate": 0.00019960524218361775, + "loss": 0.9723, + "step": 655 + }, + { + "epoch": 0.1168091168091168, + "grad_norm": 0.3974960148334503, + "learning_rate": 0.0001996039986948092, + "loss": 1.0302, + "step": 656 + }, + { + "epoch": 0.11698717948717949, + "grad_norm": 0.43180662393569946, + "learning_rate": 0.0001996027532544672, + "loss": 1.3265, + "step": 657 + }, + { + "epoch": 0.11716524216524217, + "grad_norm": 0.4481917917728424, + "learning_rate": 0.00019960150586261613, + "loss": 1.136, + "step": 658 + }, + { + "epoch": 0.11734330484330484, + "grad_norm": 0.43428945541381836, + "learning_rate": 0.00019960025651928045, + "loss": 1.2412, + "step": 659 + }, + 
{ + "epoch": 0.11752136752136752, + "grad_norm": 0.36211395263671875, + "learning_rate": 0.00019959900522448467, + "loss": 0.9563, + "step": 660 + }, + { + "epoch": 0.1176994301994302, + "grad_norm": 0.43585848808288574, + "learning_rate": 0.0001995977519782533, + "loss": 1.1677, + "step": 661 + }, + { + "epoch": 0.11787749287749288, + "grad_norm": 0.4232597351074219, + "learning_rate": 0.00019959649678061086, + "loss": 1.1187, + "step": 662 + }, + { + "epoch": 0.11805555555555555, + "grad_norm": 0.3304753303527832, + "learning_rate": 0.00019959523963158194, + "loss": 0.8473, + "step": 663 + }, + { + "epoch": 0.11823361823361823, + "grad_norm": 0.37600061297416687, + "learning_rate": 0.0001995939805311912, + "loss": 1.1227, + "step": 664 + }, + { + "epoch": 0.11841168091168092, + "grad_norm": 0.33417847752571106, + "learning_rate": 0.0001995927194794633, + "loss": 1.0315, + "step": 665 + }, + { + "epoch": 0.11858974358974358, + "grad_norm": 0.46799129247665405, + "learning_rate": 0.00019959145647642298, + "loss": 1.135, + "step": 666 + }, + { + "epoch": 0.11876780626780627, + "grad_norm": 0.4141576886177063, + "learning_rate": 0.0001995901915220949, + "loss": 1.0956, + "step": 667 + }, + { + "epoch": 0.11894586894586895, + "grad_norm": 0.3824596405029297, + "learning_rate": 0.0001995889246165039, + "loss": 1.1782, + "step": 668 + }, + { + "epoch": 0.11912393162393162, + "grad_norm": 0.4087786376476288, + "learning_rate": 0.00019958765575967484, + "loss": 0.9704, + "step": 669 + }, + { + "epoch": 0.1193019943019943, + "grad_norm": 0.5161317586898804, + "learning_rate": 0.00019958638495163252, + "loss": 1.2207, + "step": 670 + }, + { + "epoch": 0.11948005698005698, + "grad_norm": 0.4782274067401886, + "learning_rate": 0.0001995851121924019, + "loss": 1.1257, + "step": 671 + }, + { + "epoch": 0.11965811965811966, + "grad_norm": 0.40617331862449646, + "learning_rate": 0.00019958383748200782, + "loss": 1.1153, + "step": 672 + }, + { + "epoch": 0.11983618233618233, + 
"grad_norm": 0.40149980783462524, + "learning_rate": 0.00019958256082047533, + "loss": 0.9785, + "step": 673 + }, + { + "epoch": 0.12001424501424501, + "grad_norm": 0.4378886818885803, + "learning_rate": 0.00019958128220782942, + "loss": 1.1355, + "step": 674 + }, + { + "epoch": 0.1201923076923077, + "grad_norm": 0.4449596703052521, + "learning_rate": 0.0001995800016440952, + "loss": 1.0325, + "step": 675 + }, + { + "epoch": 0.12037037037037036, + "grad_norm": 0.4268079698085785, + "learning_rate": 0.00019957871912929765, + "loss": 1.1901, + "step": 676 + }, + { + "epoch": 0.12054843304843305, + "grad_norm": 0.4250091016292572, + "learning_rate": 0.00019957743466346198, + "loss": 1.0084, + "step": 677 + }, + { + "epoch": 0.12072649572649573, + "grad_norm": 0.40724286437034607, + "learning_rate": 0.0001995761482466133, + "loss": 1.0866, + "step": 678 + }, + { + "epoch": 0.12090455840455841, + "grad_norm": 0.42478349804878235, + "learning_rate": 0.00019957485987877688, + "loss": 1.1909, + "step": 679 + }, + { + "epoch": 0.12108262108262108, + "grad_norm": 0.371362566947937, + "learning_rate": 0.0001995735695599779, + "loss": 1.083, + "step": 680 + }, + { + "epoch": 0.12126068376068376, + "grad_norm": 0.4715283513069153, + "learning_rate": 0.0001995722772902417, + "loss": 1.2942, + "step": 681 + }, + { + "epoch": 0.12143874643874644, + "grad_norm": 0.3611983060836792, + "learning_rate": 0.00019957098306959355, + "loss": 0.9878, + "step": 682 + }, + { + "epoch": 0.12161680911680911, + "grad_norm": 0.4764883816242218, + "learning_rate": 0.00019956968689805883, + "loss": 1.0082, + "step": 683 + }, + { + "epoch": 0.12179487179487179, + "grad_norm": 0.33170604705810547, + "learning_rate": 0.00019956838877566293, + "loss": 0.8529, + "step": 684 + }, + { + "epoch": 0.12197293447293447, + "grad_norm": 0.46896886825561523, + "learning_rate": 0.00019956708870243133, + "loss": 1.0745, + "step": 685 + }, + { + "epoch": 0.12215099715099716, + "grad_norm": 0.4120674431324005, + 
"learning_rate": 0.00019956578667838941, + "loss": 1.1828, + "step": 686 + }, + { + "epoch": 0.12232905982905982, + "grad_norm": 0.45671191811561584, + "learning_rate": 0.00019956448270356275, + "loss": 1.3484, + "step": 687 + }, + { + "epoch": 0.1225071225071225, + "grad_norm": 0.4023838937282562, + "learning_rate": 0.00019956317677797687, + "loss": 0.9623, + "step": 688 + }, + { + "epoch": 0.12268518518518519, + "grad_norm": 0.5205856561660767, + "learning_rate": 0.00019956186890165737, + "loss": 1.2221, + "step": 689 + }, + { + "epoch": 0.12286324786324786, + "grad_norm": 0.43956050276756287, + "learning_rate": 0.00019956055907462987, + "loss": 1.1051, + "step": 690 + }, + { + "epoch": 0.12304131054131054, + "grad_norm": 0.4341758191585541, + "learning_rate": 0.00019955924729692003, + "loss": 0.8972, + "step": 691 + }, + { + "epoch": 0.12321937321937322, + "grad_norm": 0.42025020718574524, + "learning_rate": 0.00019955793356855357, + "loss": 1.1137, + "step": 692 + }, + { + "epoch": 0.1233974358974359, + "grad_norm": 0.44375079870224, + "learning_rate": 0.0001995566178895562, + "loss": 1.2783, + "step": 693 + }, + { + "epoch": 0.12357549857549857, + "grad_norm": 0.4703320264816284, + "learning_rate": 0.00019955530025995372, + "loss": 1.1991, + "step": 694 + }, + { + "epoch": 0.12375356125356125, + "grad_norm": 0.43781620264053345, + "learning_rate": 0.00019955398067977195, + "loss": 1.2316, + "step": 695 + }, + { + "epoch": 0.12393162393162394, + "grad_norm": 0.4362877607345581, + "learning_rate": 0.0001995526591490367, + "loss": 1.1374, + "step": 696 + }, + { + "epoch": 0.1241096866096866, + "grad_norm": 0.4434499442577362, + "learning_rate": 0.00019955133566777392, + "loss": 1.1034, + "step": 697 + }, + { + "epoch": 0.12428774928774929, + "grad_norm": 0.46613508462905884, + "learning_rate": 0.00019955001023600955, + "loss": 1.2252, + "step": 698 + }, + { + "epoch": 0.12446581196581197, + "grad_norm": 0.46226736903190613, + "learning_rate": 
0.00019954868285376945, + "loss": 1.0296, + "step": 699 + }, + { + "epoch": 0.12464387464387465, + "grad_norm": 0.4460904002189636, + "learning_rate": 0.00019954735352107977, + "loss": 1.0553, + "step": 700 + }, + { + "epoch": 0.12482193732193732, + "grad_norm": 0.36708924174308777, + "learning_rate": 0.00019954602223796648, + "loss": 0.9384, + "step": 701 + }, + { + "epoch": 0.125, + "grad_norm": 0.3780093491077423, + "learning_rate": 0.00019954468900445566, + "loss": 0.9062, + "step": 702 + }, + { + "epoch": 0.12517806267806267, + "grad_norm": 0.41797417402267456, + "learning_rate": 0.00019954335382057345, + "loss": 1.0344, + "step": 703 + }, + { + "epoch": 0.12535612535612536, + "grad_norm": 0.43710798025131226, + "learning_rate": 0.00019954201668634597, + "loss": 1.1324, + "step": 704 + }, + { + "epoch": 0.12553418803418803, + "grad_norm": 0.4732789695262909, + "learning_rate": 0.00019954067760179952, + "loss": 1.1419, + "step": 705 + }, + { + "epoch": 0.1257122507122507, + "grad_norm": 0.43248575925827026, + "learning_rate": 0.00019953933656696022, + "loss": 1.5112, + "step": 706 + }, + { + "epoch": 0.1258903133903134, + "grad_norm": 0.4074753522872925, + "learning_rate": 0.00019953799358185442, + "loss": 0.9751, + "step": 707 + }, + { + "epoch": 0.12606837606837606, + "grad_norm": 0.4586823880672455, + "learning_rate": 0.0001995366486465084, + "loss": 1.267, + "step": 708 + }, + { + "epoch": 0.12624643874643873, + "grad_norm": 0.4716857075691223, + "learning_rate": 0.0001995353017609485, + "loss": 1.1636, + "step": 709 + }, + { + "epoch": 0.12642450142450143, + "grad_norm": 0.5214398503303528, + "learning_rate": 0.00019953395292520115, + "loss": 1.2317, + "step": 710 + }, + { + "epoch": 0.1266025641025641, + "grad_norm": 0.42961129546165466, + "learning_rate": 0.00019953260213929276, + "loss": 1.0271, + "step": 711 + }, + { + "epoch": 0.1267806267806268, + "grad_norm": 0.4764653444290161, + "learning_rate": 0.00019953124940324979, + "loss": 1.1747, + "step": 
712 + }, + { + "epoch": 0.12695868945868946, + "grad_norm": 0.4420304000377655, + "learning_rate": 0.00019952989471709874, + "loss": 0.9783, + "step": 713 + }, + { + "epoch": 0.12713675213675213, + "grad_norm": 0.44114625453948975, + "learning_rate": 0.00019952853808086616, + "loss": 1.1953, + "step": 714 + }, + { + "epoch": 0.12731481481481483, + "grad_norm": 0.501923143863678, + "learning_rate": 0.0001995271794945786, + "loss": 0.9886, + "step": 715 + }, + { + "epoch": 0.1274928774928775, + "grad_norm": 0.42266538739204407, + "learning_rate": 0.00019952581895826276, + "loss": 1.2033, + "step": 716 + }, + { + "epoch": 0.12767094017094016, + "grad_norm": 0.37770554423332214, + "learning_rate": 0.00019952445647194523, + "loss": 1.0164, + "step": 717 + }, + { + "epoch": 0.12784900284900286, + "grad_norm": 0.369266152381897, + "learning_rate": 0.00019952309203565268, + "loss": 0.9186, + "step": 718 + }, + { + "epoch": 0.12802706552706553, + "grad_norm": 0.40446221828460693, + "learning_rate": 0.00019952172564941193, + "loss": 1.1576, + "step": 719 + }, + { + "epoch": 0.1282051282051282, + "grad_norm": 0.504172146320343, + "learning_rate": 0.00019952035731324967, + "loss": 1.2695, + "step": 720 + }, + { + "epoch": 0.1283831908831909, + "grad_norm": 0.37284108996391296, + "learning_rate": 0.0001995189870271928, + "loss": 1.0288, + "step": 721 + }, + { + "epoch": 0.12856125356125356, + "grad_norm": 0.41811618208885193, + "learning_rate": 0.00019951761479126805, + "loss": 1.2241, + "step": 722 + }, + { + "epoch": 0.12873931623931623, + "grad_norm": 0.44706249237060547, + "learning_rate": 0.0001995162406055024, + "loss": 1.0831, + "step": 723 + }, + { + "epoch": 0.12891737891737892, + "grad_norm": 0.426572322845459, + "learning_rate": 0.00019951486446992273, + "loss": 1.0047, + "step": 724 + }, + { + "epoch": 0.1290954415954416, + "grad_norm": 0.4446277618408203, + "learning_rate": 0.00019951348638455602, + "loss": 1.0827, + "step": 725 + }, + { + "epoch": 
0.12927350427350429, + "grad_norm": 0.3934919834136963, + "learning_rate": 0.00019951210634942926, + "loss": 0.9808, + "step": 726 + }, + { + "epoch": 0.12945156695156695, + "grad_norm": 0.4316558241844177, + "learning_rate": 0.0001995107243645695, + "loss": 1.3341, + "step": 727 + }, + { + "epoch": 0.12962962962962962, + "grad_norm": 0.43074217438697815, + "learning_rate": 0.00019950934043000382, + "loss": 1.007, + "step": 728 + }, + { + "epoch": 0.12980769230769232, + "grad_norm": 0.5212171673774719, + "learning_rate": 0.0001995079545457593, + "loss": 1.1822, + "step": 729 + }, + { + "epoch": 0.129985754985755, + "grad_norm": 0.3749600946903229, + "learning_rate": 0.00019950656671186313, + "loss": 0.9657, + "step": 730 + }, + { + "epoch": 0.13016381766381765, + "grad_norm": 0.36626043915748596, + "learning_rate": 0.00019950517692834252, + "loss": 1.1274, + "step": 731 + }, + { + "epoch": 0.13034188034188035, + "grad_norm": 0.4635467529296875, + "learning_rate": 0.00019950378519522467, + "loss": 1.2305, + "step": 732 + }, + { + "epoch": 0.13051994301994302, + "grad_norm": 0.4077455699443817, + "learning_rate": 0.00019950239151253683, + "loss": 0.9485, + "step": 733 + }, + { + "epoch": 0.1306980056980057, + "grad_norm": 0.4222758114337921, + "learning_rate": 0.0001995009958803063, + "loss": 1.0376, + "step": 734 + }, + { + "epoch": 0.13087606837606838, + "grad_norm": 0.4330402612686157, + "learning_rate": 0.0001994995982985605, + "loss": 1.1774, + "step": 735 + }, + { + "epoch": 0.13105413105413105, + "grad_norm": 0.42275673151016235, + "learning_rate": 0.00019949819876732673, + "loss": 1.1238, + "step": 736 + }, + { + "epoch": 0.13123219373219372, + "grad_norm": 0.45576968789100647, + "learning_rate": 0.00019949679728663246, + "loss": 1.0428, + "step": 737 + }, + { + "epoch": 0.13141025641025642, + "grad_norm": 0.5508752465248108, + "learning_rate": 0.00019949539385650514, + "loss": 1.3221, + "step": 738 + }, + { + "epoch": 0.13158831908831908, + "grad_norm": 
0.4115872383117676, + "learning_rate": 0.00019949398847697225, + "loss": 1.0301, + "step": 739 + }, + { + "epoch": 0.13176638176638178, + "grad_norm": 0.4662442207336426, + "learning_rate": 0.00019949258114806132, + "loss": 1.3263, + "step": 740 + }, + { + "epoch": 0.13194444444444445, + "grad_norm": 0.6077266931533813, + "learning_rate": 0.00019949117186979999, + "loss": 1.0269, + "step": 741 + }, + { + "epoch": 0.13212250712250712, + "grad_norm": 0.47039318084716797, + "learning_rate": 0.00019948976064221579, + "loss": 1.3782, + "step": 742 + }, + { + "epoch": 0.1323005698005698, + "grad_norm": 0.4773450493812561, + "learning_rate": 0.0001994883474653364, + "loss": 1.289, + "step": 743 + }, + { + "epoch": 0.13247863247863248, + "grad_norm": 0.40180155634880066, + "learning_rate": 0.00019948693233918952, + "loss": 0.8691, + "step": 744 + }, + { + "epoch": 0.13265669515669515, + "grad_norm": 0.45216289162635803, + "learning_rate": 0.00019948551526380288, + "loss": 1.071, + "step": 745 + }, + { + "epoch": 0.13283475783475784, + "grad_norm": 0.4289272427558899, + "learning_rate": 0.0001994840962392042, + "loss": 1.0422, + "step": 746 + }, + { + "epoch": 0.1330128205128205, + "grad_norm": 0.4617730379104614, + "learning_rate": 0.00019948267526542134, + "loss": 1.0835, + "step": 747 + }, + { + "epoch": 0.13319088319088318, + "grad_norm": 0.42710617184638977, + "learning_rate": 0.00019948125234248208, + "loss": 1.0535, + "step": 748 + }, + { + "epoch": 0.13336894586894588, + "grad_norm": 0.43433234095573425, + "learning_rate": 0.0001994798274704144, + "loss": 0.9313, + "step": 749 + }, + { + "epoch": 0.13354700854700854, + "grad_norm": 0.46270284056663513, + "learning_rate": 0.0001994784006492461, + "loss": 1.0903, + "step": 750 + }, + { + "epoch": 0.1337250712250712, + "grad_norm": 0.5319814682006836, + "learning_rate": 0.00019947697187900517, + "loss": 1.2329, + "step": 751 + }, + { + "epoch": 0.1339031339031339, + "grad_norm": 0.3511372208595276, + "learning_rate": 
0.00019947554115971967, + "loss": 0.7116, + "step": 752 + }, + { + "epoch": 0.13408119658119658, + "grad_norm": 0.4103890359401703, + "learning_rate": 0.00019947410849141756, + "loss": 1.1527, + "step": 753 + }, + { + "epoch": 0.13425925925925927, + "grad_norm": 0.5390757322311401, + "learning_rate": 0.00019947267387412695, + "loss": 1.1682, + "step": 754 + }, + { + "epoch": 0.13443732193732194, + "grad_norm": 0.29939723014831543, + "learning_rate": 0.0001994712373078759, + "loss": 0.5848, + "step": 755 + }, + { + "epoch": 0.1346153846153846, + "grad_norm": 0.4605920612812042, + "learning_rate": 0.0001994697987926926, + "loss": 0.9448, + "step": 756 + }, + { + "epoch": 0.1347934472934473, + "grad_norm": 0.426213800907135, + "learning_rate": 0.00019946835832860527, + "loss": 1.0487, + "step": 757 + }, + { + "epoch": 0.13497150997150997, + "grad_norm": 0.4209515154361725, + "learning_rate": 0.00019946691591564203, + "loss": 1.0951, + "step": 758 + }, + { + "epoch": 0.13514957264957264, + "grad_norm": 0.39555591344833374, + "learning_rate": 0.0001994654715538312, + "loss": 0.8754, + "step": 759 + }, + { + "epoch": 0.13532763532763534, + "grad_norm": 0.4065483510494232, + "learning_rate": 0.0001994640252432011, + "loss": 0.9451, + "step": 760 + }, + { + "epoch": 0.135505698005698, + "grad_norm": 0.4489104151725769, + "learning_rate": 0.00019946257698378003, + "loss": 1.2031, + "step": 761 + }, + { + "epoch": 0.13568376068376067, + "grad_norm": 0.39928409457206726, + "learning_rate": 0.0001994611267755964, + "loss": 1.1124, + "step": 762 + }, + { + "epoch": 0.13586182336182337, + "grad_norm": 0.4145409166812897, + "learning_rate": 0.00019945967461867858, + "loss": 1.083, + "step": 763 + }, + { + "epoch": 0.13603988603988604, + "grad_norm": 0.43508613109588623, + "learning_rate": 0.00019945822051305507, + "loss": 1.1119, + "step": 764 + }, + { + "epoch": 0.1362179487179487, + "grad_norm": 0.5186598300933838, + "learning_rate": 0.0001994567644587543, + "loss": 1.3256, + 
"step": 765 + }, + { + "epoch": 0.1363960113960114, + "grad_norm": 0.4615778625011444, + "learning_rate": 0.00019945530645580487, + "loss": 1.3906, + "step": 766 + }, + { + "epoch": 0.13657407407407407, + "grad_norm": 0.4838152527809143, + "learning_rate": 0.00019945384650423532, + "loss": 0.8169, + "step": 767 + }, + { + "epoch": 0.13675213675213677, + "grad_norm": 0.49253368377685547, + "learning_rate": 0.0001994523846040742, + "loss": 1.1613, + "step": 768 + }, + { + "epoch": 0.13693019943019943, + "grad_norm": 0.4697009325027466, + "learning_rate": 0.00019945092075535024, + "loss": 1.1722, + "step": 769 + }, + { + "epoch": 0.1371082621082621, + "grad_norm": 0.47162383794784546, + "learning_rate": 0.00019944945495809204, + "loss": 1.054, + "step": 770 + }, + { + "epoch": 0.1372863247863248, + "grad_norm": 0.4653547704219818, + "learning_rate": 0.00019944798721232835, + "loss": 1.1791, + "step": 771 + }, + { + "epoch": 0.13746438746438747, + "grad_norm": 0.4244011640548706, + "learning_rate": 0.000199446517518088, + "loss": 1.1557, + "step": 772 + }, + { + "epoch": 0.13764245014245013, + "grad_norm": 0.43812859058380127, + "learning_rate": 0.00019944504587539967, + "loss": 1.1567, + "step": 773 + }, + { + "epoch": 0.13782051282051283, + "grad_norm": 0.3984275162220001, + "learning_rate": 0.00019944357228429227, + "loss": 1.0715, + "step": 774 + }, + { + "epoch": 0.1379985754985755, + "grad_norm": 0.3794248104095459, + "learning_rate": 0.0001994420967447946, + "loss": 0.9377, + "step": 775 + }, + { + "epoch": 0.13817663817663817, + "grad_norm": 0.4214578866958618, + "learning_rate": 0.00019944061925693566, + "loss": 1.0112, + "step": 776 + }, + { + "epoch": 0.13835470085470086, + "grad_norm": 0.4738999605178833, + "learning_rate": 0.00019943913982074435, + "loss": 0.8718, + "step": 777 + }, + { + "epoch": 0.13853276353276353, + "grad_norm": 0.43455326557159424, + "learning_rate": 0.00019943765843624965, + "loss": 1.1343, + "step": 778 + }, + { + "epoch": 
0.1387108262108262, + "grad_norm": 0.44973456859588623, + "learning_rate": 0.00019943617510348062, + "loss": 1.0487, + "step": 779 + }, + { + "epoch": 0.1388888888888889, + "grad_norm": 0.4216597080230713, + "learning_rate": 0.00019943468982246628, + "loss": 1.0765, + "step": 780 + }, + { + "epoch": 0.13906695156695156, + "grad_norm": 0.5089883208274841, + "learning_rate": 0.00019943320259323578, + "loss": 1.3137, + "step": 781 + }, + { + "epoch": 0.13924501424501423, + "grad_norm": 0.4358222782611847, + "learning_rate": 0.00019943171341581822, + "loss": 1.1891, + "step": 782 + }, + { + "epoch": 0.13942307692307693, + "grad_norm": 0.40918609499931335, + "learning_rate": 0.00019943022229024275, + "loss": 1.279, + "step": 783 + }, + { + "epoch": 0.1396011396011396, + "grad_norm": 0.4614863395690918, + "learning_rate": 0.00019942872921653866, + "loss": 1.2477, + "step": 784 + }, + { + "epoch": 0.1397792022792023, + "grad_norm": 0.4141528904438019, + "learning_rate": 0.00019942723419473515, + "loss": 0.9622, + "step": 785 + }, + { + "epoch": 0.13995726495726496, + "grad_norm": 0.536139726638794, + "learning_rate": 0.00019942573722486154, + "loss": 1.2127, + "step": 786 + }, + { + "epoch": 0.14013532763532763, + "grad_norm": 0.4968845546245575, + "learning_rate": 0.0001994242383069471, + "loss": 1.2965, + "step": 787 + }, + { + "epoch": 0.14031339031339032, + "grad_norm": 0.3897174894809723, + "learning_rate": 0.00019942273744102132, + "loss": 0.9907, + "step": 788 + }, + { + "epoch": 0.140491452991453, + "grad_norm": 0.466307669878006, + "learning_rate": 0.0001994212346271135, + "loss": 1.2021, + "step": 789 + }, + { + "epoch": 0.14066951566951566, + "grad_norm": 0.49283576011657715, + "learning_rate": 0.0001994197298652531, + "loss": 1.0969, + "step": 790 + }, + { + "epoch": 0.14084757834757836, + "grad_norm": 0.4686102271080017, + "learning_rate": 0.00019941822315546964, + "loss": 1.0125, + "step": 791 + }, + { + "epoch": 0.14102564102564102, + "grad_norm": 
0.4389997124671936, + "learning_rate": 0.0001994167144977926, + "loss": 1.1294, + "step": 792 + }, + { + "epoch": 0.1412037037037037, + "grad_norm": 0.38539355993270874, + "learning_rate": 0.00019941520389225162, + "loss": 1.1231, + "step": 793 + }, + { + "epoch": 0.1413817663817664, + "grad_norm": 0.4860847592353821, + "learning_rate": 0.00019941369133887618, + "loss": 1.2268, + "step": 794 + }, + { + "epoch": 0.14155982905982906, + "grad_norm": 0.4567467272281647, + "learning_rate": 0.00019941217683769598, + "loss": 1.1482, + "step": 795 + }, + { + "epoch": 0.14173789173789172, + "grad_norm": 0.5549420714378357, + "learning_rate": 0.00019941066038874067, + "loss": 1.1899, + "step": 796 + }, + { + "epoch": 0.14191595441595442, + "grad_norm": 0.3950003385543823, + "learning_rate": 0.00019940914199204, + "loss": 0.96, + "step": 797 + }, + { + "epoch": 0.1420940170940171, + "grad_norm": 0.43845999240875244, + "learning_rate": 0.00019940762164762373, + "loss": 1.0338, + "step": 798 + }, + { + "epoch": 0.14227207977207978, + "grad_norm": 0.468537300825119, + "learning_rate": 0.00019940609935552157, + "loss": 1.2416, + "step": 799 + }, + { + "epoch": 0.14245014245014245, + "grad_norm": 0.4292038679122925, + "learning_rate": 0.0001994045751157634, + "loss": 1.1397, + "step": 800 + }, + { + "epoch": 0.14262820512820512, + "grad_norm": 0.3800995647907257, + "learning_rate": 0.00019940304892837908, + "loss": 0.939, + "step": 801 + }, + { + "epoch": 0.14280626780626782, + "grad_norm": 0.38004353642463684, + "learning_rate": 0.00019940152079339852, + "loss": 1.0485, + "step": 802 + }, + { + "epoch": 0.14298433048433049, + "grad_norm": 0.4658142924308777, + "learning_rate": 0.00019939999071085163, + "loss": 1.1561, + "step": 803 + }, + { + "epoch": 0.14316239316239315, + "grad_norm": 0.4235048294067383, + "learning_rate": 0.0001993984586807684, + "loss": 1.0516, + "step": 804 + }, + { + "epoch": 0.14334045584045585, + "grad_norm": 0.42925819754600525, + "learning_rate": 
0.00019939692470317887, + "loss": 1.2238, + "step": 805 + }, + { + "epoch": 0.14351851851851852, + "grad_norm": 0.43701639771461487, + "learning_rate": 0.00019939538877811308, + "loss": 1.0129, + "step": 806 + }, + { + "epoch": 0.14369658119658119, + "grad_norm": 0.42786353826522827, + "learning_rate": 0.00019939385090560113, + "loss": 1.1355, + "step": 807 + }, + { + "epoch": 0.14387464387464388, + "grad_norm": 0.371218740940094, + "learning_rate": 0.00019939231108567312, + "loss": 0.9712, + "step": 808 + }, + { + "epoch": 0.14405270655270655, + "grad_norm": 0.4834294617176056, + "learning_rate": 0.00019939076931835926, + "loss": 1.1375, + "step": 809 + }, + { + "epoch": 0.14423076923076922, + "grad_norm": 0.4700150191783905, + "learning_rate": 0.00019938922560368974, + "loss": 1.1943, + "step": 810 + }, + { + "epoch": 0.14440883190883191, + "grad_norm": 0.4430996775627136, + "learning_rate": 0.0001993876799416948, + "loss": 1.1976, + "step": 811 + }, + { + "epoch": 0.14458689458689458, + "grad_norm": 0.4161672592163086, + "learning_rate": 0.00019938613233240476, + "loss": 1.0291, + "step": 812 + }, + { + "epoch": 0.14476495726495728, + "grad_norm": 0.39838850498199463, + "learning_rate": 0.0001993845827758499, + "loss": 1.2103, + "step": 813 + }, + { + "epoch": 0.14494301994301995, + "grad_norm": 0.429198294878006, + "learning_rate": 0.00019938303127206057, + "loss": 0.9971, + "step": 814 + }, + { + "epoch": 0.14512108262108261, + "grad_norm": 0.4589254856109619, + "learning_rate": 0.00019938147782106719, + "loss": 1.2392, + "step": 815 + }, + { + "epoch": 0.1452991452991453, + "grad_norm": 0.42506635189056396, + "learning_rate": 0.00019937992242290023, + "loss": 1.0827, + "step": 816 + }, + { + "epoch": 0.14547720797720798, + "grad_norm": 0.3778113126754761, + "learning_rate": 0.00019937836507759012, + "loss": 1.021, + "step": 817 + }, + { + "epoch": 0.14565527065527065, + "grad_norm": 0.43071216344833374, + "learning_rate": 0.0001993768057851674, + "loss": 
1.273, + "step": 818 + }, + { + "epoch": 0.14583333333333334, + "grad_norm": 0.4944681227207184, + "learning_rate": 0.00019937524454566262, + "loss": 1.3037, + "step": 819 + }, + { + "epoch": 0.146011396011396, + "grad_norm": 0.4438824951648712, + "learning_rate": 0.00019937368135910632, + "loss": 1.1383, + "step": 820 + }, + { + "epoch": 0.14618945868945868, + "grad_norm": 0.400215744972229, + "learning_rate": 0.0001993721162255292, + "loss": 1.0669, + "step": 821 + }, + { + "epoch": 0.14636752136752137, + "grad_norm": 0.4341452121734619, + "learning_rate": 0.00019937054914496185, + "loss": 1.1431, + "step": 822 + }, + { + "epoch": 0.14654558404558404, + "grad_norm": 0.3941744267940521, + "learning_rate": 0.00019936898011743503, + "loss": 1.1593, + "step": 823 + }, + { + "epoch": 0.1467236467236467, + "grad_norm": 0.4318541884422302, + "learning_rate": 0.00019936740914297947, + "loss": 1.2814, + "step": 824 + }, + { + "epoch": 0.1469017094017094, + "grad_norm": 0.44488632678985596, + "learning_rate": 0.00019936583622162595, + "loss": 1.1054, + "step": 825 + }, + { + "epoch": 0.14707977207977208, + "grad_norm": 0.38701096177101135, + "learning_rate": 0.00019936426135340528, + "loss": 1.1086, + "step": 826 + }, + { + "epoch": 0.14725783475783477, + "grad_norm": 0.45794424414634705, + "learning_rate": 0.0001993626845383483, + "loss": 1.2395, + "step": 827 + }, + { + "epoch": 0.14743589743589744, + "grad_norm": 0.49237680435180664, + "learning_rate": 0.00019936110577648596, + "loss": 1.3483, + "step": 828 + }, + { + "epoch": 0.1476139601139601, + "grad_norm": 0.481666624546051, + "learning_rate": 0.00019935952506784914, + "loss": 1.1848, + "step": 829 + }, + { + "epoch": 0.1477920227920228, + "grad_norm": 0.4015209376811981, + "learning_rate": 0.00019935794241246883, + "loss": 1.0624, + "step": 830 + }, + { + "epoch": 0.14797008547008547, + "grad_norm": 0.47975999116897583, + "learning_rate": 0.00019935635781037606, + "loss": 1.1595, + "step": 831 + }, + { + "epoch": 
0.14814814814814814, + "grad_norm": 0.4440356492996216, + "learning_rate": 0.00019935477126160181, + "loss": 1.1325, + "step": 832 + }, + { + "epoch": 0.14832621082621084, + "grad_norm": 0.4167410731315613, + "learning_rate": 0.00019935318276617723, + "loss": 1.0662, + "step": 833 + }, + { + "epoch": 0.1485042735042735, + "grad_norm": 0.4107447862625122, + "learning_rate": 0.0001993515923241334, + "loss": 0.8816, + "step": 834 + }, + { + "epoch": 0.14868233618233617, + "grad_norm": 0.4020158648490906, + "learning_rate": 0.00019934999993550154, + "loss": 0.9797, + "step": 835 + }, + { + "epoch": 0.14886039886039887, + "grad_norm": 0.4186473786830902, + "learning_rate": 0.0001993484056003128, + "loss": 1.1243, + "step": 836 + }, + { + "epoch": 0.14903846153846154, + "grad_norm": 0.5534794926643372, + "learning_rate": 0.00019934680931859842, + "loss": 1.1189, + "step": 837 + }, + { + "epoch": 0.1492165242165242, + "grad_norm": 0.37901270389556885, + "learning_rate": 0.0001993452110903897, + "loss": 0.9241, + "step": 838 + }, + { + "epoch": 0.1493945868945869, + "grad_norm": 0.41773587465286255, + "learning_rate": 0.00019934361091571793, + "loss": 0.9467, + "step": 839 + }, + { + "epoch": 0.14957264957264957, + "grad_norm": 0.4962073564529419, + "learning_rate": 0.00019934200879461448, + "loss": 1.2423, + "step": 840 + }, + { + "epoch": 0.14975071225071226, + "grad_norm": 0.38565897941589355, + "learning_rate": 0.00019934040472711074, + "loss": 1.1545, + "step": 841 + }, + { + "epoch": 0.14992877492877493, + "grad_norm": 0.4295346736907959, + "learning_rate": 0.0001993387987132381, + "loss": 1.2482, + "step": 842 + }, + { + "epoch": 0.1501068376068376, + "grad_norm": 0.4279189705848694, + "learning_rate": 0.0001993371907530281, + "loss": 1.1135, + "step": 843 + }, + { + "epoch": 0.1502849002849003, + "grad_norm": 0.44649168848991394, + "learning_rate": 0.0001993355808465122, + "loss": 1.0734, + "step": 844 + }, + { + "epoch": 0.15046296296296297, + "grad_norm": 
0.453707218170166, + "learning_rate": 0.0001993339689937219, + "loss": 1.0992, + "step": 845 + }, + { + "epoch": 0.15064102564102563, + "grad_norm": 0.5113263726234436, + "learning_rate": 0.00019933235519468886, + "loss": 1.1792, + "step": 846 + }, + { + "epoch": 0.15081908831908833, + "grad_norm": 0.5822970271110535, + "learning_rate": 0.00019933073944944466, + "loss": 1.367, + "step": 847 + }, + { + "epoch": 0.150997150997151, + "grad_norm": 0.3946528732776642, + "learning_rate": 0.00019932912175802097, + "loss": 0.9781, + "step": 848 + }, + { + "epoch": 0.15117521367521367, + "grad_norm": 0.5429860949516296, + "learning_rate": 0.00019932750212044945, + "loss": 0.9783, + "step": 849 + }, + { + "epoch": 0.15135327635327636, + "grad_norm": 0.45847952365875244, + "learning_rate": 0.0001993258805367619, + "loss": 1.1352, + "step": 850 + }, + { + "epoch": 0.15153133903133903, + "grad_norm": 0.42770692706108093, + "learning_rate": 0.00019932425700699004, + "loss": 1.2365, + "step": 851 + }, + { + "epoch": 0.1517094017094017, + "grad_norm": 0.41845405101776123, + "learning_rate": 0.00019932263153116565, + "loss": 1.2642, + "step": 852 + }, + { + "epoch": 0.1518874643874644, + "grad_norm": 0.4641731083393097, + "learning_rate": 0.00019932100410932066, + "loss": 1.2009, + "step": 853 + }, + { + "epoch": 0.15206552706552706, + "grad_norm": 0.4128672778606415, + "learning_rate": 0.00019931937474148689, + "loss": 1.1981, + "step": 854 + }, + { + "epoch": 0.15224358974358973, + "grad_norm": 0.4730764925479889, + "learning_rate": 0.00019931774342769632, + "loss": 1.2145, + "step": 855 + }, + { + "epoch": 0.15242165242165243, + "grad_norm": 0.36611825227737427, + "learning_rate": 0.00019931611016798089, + "loss": 0.8504, + "step": 856 + }, + { + "epoch": 0.1525997150997151, + "grad_norm": 0.40944692492485046, + "learning_rate": 0.00019931447496237254, + "loss": 1.2853, + "step": 857 + }, + { + "epoch": 0.1527777777777778, + "grad_norm": 0.4521993398666382, + "learning_rate": 
0.0001993128378109034, + "loss": 1.0198, + "step": 858 + }, + { + "epoch": 0.15295584045584046, + "grad_norm": 0.42113015055656433, + "learning_rate": 0.0001993111987136055, + "loss": 1.1284, + "step": 859 + }, + { + "epoch": 0.15313390313390313, + "grad_norm": 0.4117624759674072, + "learning_rate": 0.00019930955767051098, + "loss": 1.0445, + "step": 860 + }, + { + "epoch": 0.15331196581196582, + "grad_norm": 0.4807964265346527, + "learning_rate": 0.00019930791468165197, + "loss": 1.1378, + "step": 861 + }, + { + "epoch": 0.1534900284900285, + "grad_norm": 0.4186483323574066, + "learning_rate": 0.00019930626974706063, + "loss": 1.1636, + "step": 862 + }, + { + "epoch": 0.15366809116809116, + "grad_norm": 0.3764737844467163, + "learning_rate": 0.00019930462286676926, + "loss": 0.9523, + "step": 863 + }, + { + "epoch": 0.15384615384615385, + "grad_norm": 0.4283556044101715, + "learning_rate": 0.00019930297404081008, + "loss": 1.1008, + "step": 864 + }, + { + "epoch": 0.15402421652421652, + "grad_norm": 0.4485796093940735, + "learning_rate": 0.00019930132326921541, + "loss": 1.0834, + "step": 865 + }, + { + "epoch": 0.1542022792022792, + "grad_norm": 0.3882720172405243, + "learning_rate": 0.0001992996705520176, + "loss": 1.1086, + "step": 866 + }, + { + "epoch": 0.1543803418803419, + "grad_norm": 0.44698455929756165, + "learning_rate": 0.00019929801588924902, + "loss": 1.1437, + "step": 867 + }, + { + "epoch": 0.15455840455840456, + "grad_norm": 0.46978411078453064, + "learning_rate": 0.00019929635928094208, + "loss": 1.091, + "step": 868 + }, + { + "epoch": 0.15473646723646722, + "grad_norm": 0.4717854857444763, + "learning_rate": 0.00019929470072712927, + "loss": 1.1959, + "step": 869 + }, + { + "epoch": 0.15491452991452992, + "grad_norm": 0.4324854016304016, + "learning_rate": 0.00019929304022784305, + "loss": 1.2062, + "step": 870 + }, + { + "epoch": 0.1550925925925926, + "grad_norm": 0.3948180675506592, + "learning_rate": 0.00019929137778311597, + "loss": 1.1101, 
+ "step": 871 + }, + { + "epoch": 0.15527065527065528, + "grad_norm": 0.40345287322998047, + "learning_rate": 0.0001992897133929806, + "loss": 0.8894, + "step": 872 + }, + { + "epoch": 0.15544871794871795, + "grad_norm": 0.44931963086128235, + "learning_rate": 0.00019928804705746957, + "loss": 0.9389, + "step": 873 + }, + { + "epoch": 0.15562678062678062, + "grad_norm": 0.529196560382843, + "learning_rate": 0.0001992863787766155, + "loss": 1.3362, + "step": 874 + }, + { + "epoch": 0.15580484330484332, + "grad_norm": 0.41218671202659607, + "learning_rate": 0.0001992847085504511, + "loss": 1.0727, + "step": 875 + }, + { + "epoch": 0.15598290598290598, + "grad_norm": 0.44074541330337524, + "learning_rate": 0.00019928303637900907, + "loss": 1.1091, + "step": 876 + }, + { + "epoch": 0.15616096866096865, + "grad_norm": 0.5264310240745544, + "learning_rate": 0.00019928136226232218, + "loss": 1.201, + "step": 877 + }, + { + "epoch": 0.15633903133903135, + "grad_norm": 0.4255099594593048, + "learning_rate": 0.00019927968620042324, + "loss": 1.2514, + "step": 878 + }, + { + "epoch": 0.15651709401709402, + "grad_norm": 0.4030280113220215, + "learning_rate": 0.0001992780081933451, + "loss": 1.0422, + "step": 879 + }, + { + "epoch": 0.15669515669515668, + "grad_norm": 0.5270203948020935, + "learning_rate": 0.00019927632824112058, + "loss": 1.2476, + "step": 880 + }, + { + "epoch": 0.15687321937321938, + "grad_norm": 0.37767237424850464, + "learning_rate": 0.00019927464634378268, + "loss": 1.0768, + "step": 881 + }, + { + "epoch": 0.15705128205128205, + "grad_norm": 0.4535936415195465, + "learning_rate": 0.0001992729625013643, + "loss": 1.2097, + "step": 882 + }, + { + "epoch": 0.15722934472934472, + "grad_norm": 0.4282119870185852, + "learning_rate": 0.00019927127671389843, + "loss": 1.0904, + "step": 883 + }, + { + "epoch": 0.1574074074074074, + "grad_norm": 0.3924157917499542, + "learning_rate": 0.0001992695889814181, + "loss": 0.9692, + "step": 884 + }, + { + "epoch": 
0.15758547008547008, + "grad_norm": 0.525075376033783, + "learning_rate": 0.0001992678993039564, + "loss": 1.0292, + "step": 885 + }, + { + "epoch": 0.15776353276353278, + "grad_norm": 0.4388505518436432, + "learning_rate": 0.00019926620768154644, + "loss": 1.1944, + "step": 886 + }, + { + "epoch": 0.15794159544159544, + "grad_norm": 0.4362235963344574, + "learning_rate": 0.00019926451411422132, + "loss": 0.97, + "step": 887 + }, + { + "epoch": 0.1581196581196581, + "grad_norm": 0.4265296459197998, + "learning_rate": 0.0001992628186020143, + "loss": 0.9196, + "step": 888 + }, + { + "epoch": 0.1582977207977208, + "grad_norm": 0.4019876718521118, + "learning_rate": 0.0001992611211449585, + "loss": 1.1368, + "step": 889 + }, + { + "epoch": 0.15847578347578348, + "grad_norm": 0.5003397464752197, + "learning_rate": 0.00019925942174308726, + "loss": 1.2582, + "step": 890 + }, + { + "epoch": 0.15865384615384615, + "grad_norm": 0.4774404466152191, + "learning_rate": 0.00019925772039643382, + "loss": 1.2277, + "step": 891 + }, + { + "epoch": 0.15883190883190884, + "grad_norm": 0.4590449333190918, + "learning_rate": 0.00019925601710503153, + "loss": 1.1679, + "step": 892 + }, + { + "epoch": 0.1590099715099715, + "grad_norm": 0.4221442639827728, + "learning_rate": 0.0001992543118689138, + "loss": 1.1626, + "step": 893 + }, + { + "epoch": 0.15918803418803418, + "grad_norm": 0.47613003849983215, + "learning_rate": 0.00019925260468811403, + "loss": 1.1509, + "step": 894 + }, + { + "epoch": 0.15936609686609687, + "grad_norm": 0.41706812381744385, + "learning_rate": 0.0001992508955626656, + "loss": 1.0366, + "step": 895 + }, + { + "epoch": 0.15954415954415954, + "grad_norm": 0.5064654350280762, + "learning_rate": 0.00019924918449260205, + "loss": 1.0729, + "step": 896 + }, + { + "epoch": 0.1597222222222222, + "grad_norm": 0.5019610524177551, + "learning_rate": 0.00019924747147795696, + "loss": 1.0642, + "step": 897 + }, + { + "epoch": 0.1599002849002849, + "grad_norm": 
0.4345671534538269, + "learning_rate": 0.00019924575651876378, + "loss": 1.1747, + "step": 898 + }, + { + "epoch": 0.16007834757834757, + "grad_norm": 0.4397568702697754, + "learning_rate": 0.0001992440396150562, + "loss": 1.282, + "step": 899 + }, + { + "epoch": 0.16025641025641027, + "grad_norm": 0.520187497138977, + "learning_rate": 0.0001992423207668678, + "loss": 0.976, + "step": 900 + }, + { + "epoch": 0.16043447293447294, + "grad_norm": 0.39329993724823, + "learning_rate": 0.0001992405999742323, + "loss": 0.9829, + "step": 901 + }, + { + "epoch": 0.1606125356125356, + "grad_norm": 0.42361345887184143, + "learning_rate": 0.00019923887723718339, + "loss": 1.139, + "step": 902 + }, + { + "epoch": 0.1607905982905983, + "grad_norm": 0.3846314251422882, + "learning_rate": 0.00019923715255575482, + "loss": 0.8262, + "step": 903 + }, + { + "epoch": 0.16096866096866097, + "grad_norm": 0.39258381724357605, + "learning_rate": 0.0001992354259299804, + "loss": 0.9638, + "step": 904 + }, + { + "epoch": 0.16114672364672364, + "grad_norm": 0.4000850319862366, + "learning_rate": 0.00019923369735989397, + "loss": 0.91, + "step": 905 + }, + { + "epoch": 0.16132478632478633, + "grad_norm": 0.46303513646125793, + "learning_rate": 0.00019923196684552936, + "loss": 1.1447, + "step": 906 + }, + { + "epoch": 0.161502849002849, + "grad_norm": 0.38437438011169434, + "learning_rate": 0.0001992302343869205, + "loss": 1.0212, + "step": 907 + }, + { + "epoch": 0.16168091168091167, + "grad_norm": 0.44585472345352173, + "learning_rate": 0.00019922849998410135, + "loss": 1.1964, + "step": 908 + }, + { + "epoch": 0.16185897435897437, + "grad_norm": 0.41959813237190247, + "learning_rate": 0.00019922676363710583, + "loss": 0.9925, + "step": 909 + }, + { + "epoch": 0.16203703703703703, + "grad_norm": 0.47442761063575745, + "learning_rate": 0.00019922502534596803, + "loss": 0.9237, + "step": 910 + }, + { + "epoch": 0.1622150997150997, + "grad_norm": 0.5065128207206726, + "learning_rate": 
0.00019922328511072198, + "loss": 1.2573, + "step": 911 + }, + { + "epoch": 0.1623931623931624, + "grad_norm": 0.4739879369735718, + "learning_rate": 0.0001992215429314018, + "loss": 1.4416, + "step": 912 + }, + { + "epoch": 0.16257122507122507, + "grad_norm": 0.48763832449913025, + "learning_rate": 0.00019921979880804157, + "loss": 1.0408, + "step": 913 + }, + { + "epoch": 0.16274928774928774, + "grad_norm": 0.4841614067554474, + "learning_rate": 0.0001992180527406755, + "loss": 1.1826, + "step": 914 + }, + { + "epoch": 0.16292735042735043, + "grad_norm": 0.49433308839797974, + "learning_rate": 0.0001992163047293378, + "loss": 1.3552, + "step": 915 + }, + { + "epoch": 0.1631054131054131, + "grad_norm": 0.4985002875328064, + "learning_rate": 0.0001992145547740627, + "loss": 1.2639, + "step": 916 + }, + { + "epoch": 0.1632834757834758, + "grad_norm": 0.40348032116889954, + "learning_rate": 0.00019921280287488448, + "loss": 1.1731, + "step": 917 + }, + { + "epoch": 0.16346153846153846, + "grad_norm": 0.5166002511978149, + "learning_rate": 0.0001992110490318375, + "loss": 1.0692, + "step": 918 + }, + { + "epoch": 0.16363960113960113, + "grad_norm": 0.44233468174934387, + "learning_rate": 0.00019920929324495615, + "loss": 1.0488, + "step": 919 + }, + { + "epoch": 0.16381766381766383, + "grad_norm": 0.43709903955459595, + "learning_rate": 0.00019920753551427476, + "loss": 0.8884, + "step": 920 + }, + { + "epoch": 0.1639957264957265, + "grad_norm": 0.4054167568683624, + "learning_rate": 0.00019920577583982778, + "loss": 0.9872, + "step": 921 + }, + { + "epoch": 0.16417378917378916, + "grad_norm": 0.4657362997531891, + "learning_rate": 0.0001992040142216497, + "loss": 1.4402, + "step": 922 + }, + { + "epoch": 0.16435185185185186, + "grad_norm": 0.42550426721572876, + "learning_rate": 0.0001992022506597751, + "loss": 1.0456, + "step": 923 + }, + { + "epoch": 0.16452991452991453, + "grad_norm": 0.49346762895584106, + "learning_rate": 0.00019920048515423842, + "loss": 1.527, 
+ "step": 924 + }, + { + "epoch": 0.1647079772079772, + "grad_norm": 0.3970337510108948, + "learning_rate": 0.0001991987177050743, + "loss": 1.0363, + "step": 925 + }, + { + "epoch": 0.1648860398860399, + "grad_norm": 0.4027378559112549, + "learning_rate": 0.0001991969483123174, + "loss": 0.8416, + "step": 926 + }, + { + "epoch": 0.16506410256410256, + "grad_norm": 0.4181644916534424, + "learning_rate": 0.00019919517697600237, + "loss": 1.2253, + "step": 927 + }, + { + "epoch": 0.16524216524216523, + "grad_norm": 0.43686383962631226, + "learning_rate": 0.0001991934036961639, + "loss": 1.0808, + "step": 928 + }, + { + "epoch": 0.16542022792022792, + "grad_norm": 0.4242876172065735, + "learning_rate": 0.0001991916284728367, + "loss": 0.9483, + "step": 929 + }, + { + "epoch": 0.1655982905982906, + "grad_norm": 0.3690609037876129, + "learning_rate": 0.00019918985130605563, + "loss": 0.9495, + "step": 930 + }, + { + "epoch": 0.1657763532763533, + "grad_norm": 0.42184555530548096, + "learning_rate": 0.00019918807219585546, + "loss": 1.0966, + "step": 931 + }, + { + "epoch": 0.16595441595441596, + "grad_norm": 0.4342746138572693, + "learning_rate": 0.00019918629114227106, + "loss": 1.0875, + "step": 932 + }, + { + "epoch": 0.16613247863247863, + "grad_norm": 0.4191494286060333, + "learning_rate": 0.00019918450814533737, + "loss": 1.0777, + "step": 933 + }, + { + "epoch": 0.16631054131054132, + "grad_norm": 0.37124550342559814, + "learning_rate": 0.00019918272320508922, + "loss": 1.0131, + "step": 934 + }, + { + "epoch": 0.166488603988604, + "grad_norm": 0.4475722014904022, + "learning_rate": 0.00019918093632156168, + "loss": 1.1185, + "step": 935 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 0.4629058241844177, + "learning_rate": 0.0001991791474947897, + "loss": 1.0353, + "step": 936 + }, + { + "epoch": 0.16684472934472935, + "grad_norm": 0.48192909359931946, + "learning_rate": 0.00019917735672480834, + "loss": 1.1628, + "step": 937 + }, + { + "epoch": 
0.16702279202279202, + "grad_norm": 0.5542252063751221, + "learning_rate": 0.00019917556401165273, + "loss": 1.3133, + "step": 938 + }, + { + "epoch": 0.1672008547008547, + "grad_norm": 0.4172651171684265, + "learning_rate": 0.00019917376935535796, + "loss": 1.1733, + "step": 939 + }, + { + "epoch": 0.16737891737891739, + "grad_norm": 0.4424920380115509, + "learning_rate": 0.0001991719727559592, + "loss": 1.0262, + "step": 940 + }, + { + "epoch": 0.16755698005698005, + "grad_norm": 0.4551742970943451, + "learning_rate": 0.00019917017421349162, + "loss": 1.0883, + "step": 941 + }, + { + "epoch": 0.16773504273504272, + "grad_norm": 0.45929640531539917, + "learning_rate": 0.00019916837372799048, + "loss": 1.1836, + "step": 942 + }, + { + "epoch": 0.16791310541310542, + "grad_norm": 0.4609353542327881, + "learning_rate": 0.0001991665712994911, + "loss": 1.0682, + "step": 943 + }, + { + "epoch": 0.16809116809116809, + "grad_norm": 0.42617303133010864, + "learning_rate": 0.00019916476692802873, + "loss": 1.074, + "step": 944 + }, + { + "epoch": 0.16826923076923078, + "grad_norm": 0.41919493675231934, + "learning_rate": 0.00019916296061363875, + "loss": 1.0969, + "step": 945 + }, + { + "epoch": 0.16844729344729345, + "grad_norm": 0.450979083776474, + "learning_rate": 0.00019916115235635656, + "loss": 1.1686, + "step": 946 + }, + { + "epoch": 0.16862535612535612, + "grad_norm": 0.42166751623153687, + "learning_rate": 0.00019915934215621758, + "loss": 0.9273, + "step": 947 + }, + { + "epoch": 0.16880341880341881, + "grad_norm": 0.4404160976409912, + "learning_rate": 0.00019915753001325729, + "loss": 1.1663, + "step": 948 + }, + { + "epoch": 0.16898148148148148, + "grad_norm": 0.42025226354599, + "learning_rate": 0.0001991557159275111, + "loss": 0.9433, + "step": 949 + }, + { + "epoch": 0.16915954415954415, + "grad_norm": 0.4277796745300293, + "learning_rate": 0.00019915389989901474, + "loss": 0.8475, + "step": 950 + }, + { + "epoch": 0.16933760683760685, + "grad_norm": 
0.5162755250930786, + "learning_rate": 0.00019915208192780365, + "loss": 1.1155, + "step": 951 + }, + { + "epoch": 0.16951566951566951, + "grad_norm": 0.4214856028556824, + "learning_rate": 0.00019915026201391346, + "loss": 1.173, + "step": 952 + }, + { + "epoch": 0.16969373219373218, + "grad_norm": 0.4713292419910431, + "learning_rate": 0.00019914844015737985, + "loss": 1.1615, + "step": 953 + }, + { + "epoch": 0.16987179487179488, + "grad_norm": 0.461179256439209, + "learning_rate": 0.00019914661635823854, + "loss": 1.1169, + "step": 954 + }, + { + "epoch": 0.17004985754985755, + "grad_norm": 0.46200552582740784, + "learning_rate": 0.00019914479061652527, + "loss": 1.0274, + "step": 955 + }, + { + "epoch": 0.17022792022792022, + "grad_norm": 0.40968334674835205, + "learning_rate": 0.00019914296293227572, + "loss": 1.066, + "step": 956 + }, + { + "epoch": 0.1704059829059829, + "grad_norm": 0.40877434611320496, + "learning_rate": 0.0001991411333055258, + "loss": 1.1595, + "step": 957 + }, + { + "epoch": 0.17058404558404558, + "grad_norm": 0.42940187454223633, + "learning_rate": 0.00019913930173631132, + "loss": 1.0364, + "step": 958 + }, + { + "epoch": 0.17076210826210828, + "grad_norm": 0.49648910760879517, + "learning_rate": 0.00019913746822466819, + "loss": 1.0763, + "step": 959 + }, + { + "epoch": 0.17094017094017094, + "grad_norm": 0.4353426396846771, + "learning_rate": 0.00019913563277063228, + "loss": 0.9698, + "step": 960 + }, + { + "epoch": 0.1711182336182336, + "grad_norm": 0.45079681277275085, + "learning_rate": 0.00019913379537423958, + "loss": 1.2244, + "step": 961 + }, + { + "epoch": 0.1712962962962963, + "grad_norm": 0.4276828467845917, + "learning_rate": 0.00019913195603552607, + "loss": 0.9976, + "step": 962 + }, + { + "epoch": 0.17147435897435898, + "grad_norm": 0.41122403740882874, + "learning_rate": 0.00019913011475452785, + "loss": 1.0077, + "step": 963 + }, + { + "epoch": 0.17165242165242164, + "grad_norm": 0.43170276284217834, + 
"learning_rate": 0.00019912827153128096, + "loss": 1.1402, + "step": 964 + }, + { + "epoch": 0.17183048433048434, + "grad_norm": 0.37950268387794495, + "learning_rate": 0.0001991264263658215, + "loss": 0.9818, + "step": 965 + }, + { + "epoch": 0.172008547008547, + "grad_norm": 0.477333128452301, + "learning_rate": 0.00019912457925818562, + "loss": 1.1756, + "step": 966 + }, + { + "epoch": 0.17218660968660968, + "grad_norm": 0.4326401352882385, + "learning_rate": 0.00019912273020840954, + "loss": 1.3718, + "step": 967 + }, + { + "epoch": 0.17236467236467237, + "grad_norm": 0.37711042165756226, + "learning_rate": 0.00019912087921652945, + "loss": 0.9011, + "step": 968 + }, + { + "epoch": 0.17254273504273504, + "grad_norm": 0.50013667345047, + "learning_rate": 0.00019911902628258162, + "loss": 1.1163, + "step": 969 + }, + { + "epoch": 0.1727207977207977, + "grad_norm": 0.41913339495658875, + "learning_rate": 0.0001991171714066024, + "loss": 1.2614, + "step": 970 + }, + { + "epoch": 0.1728988603988604, + "grad_norm": 0.4075855612754822, + "learning_rate": 0.00019911531458862813, + "loss": 0.8984, + "step": 971 + }, + { + "epoch": 0.17307692307692307, + "grad_norm": 0.40277954936027527, + "learning_rate": 0.00019911345582869513, + "loss": 1.0851, + "step": 972 + }, + { + "epoch": 0.17325498575498577, + "grad_norm": 0.4312847852706909, + "learning_rate": 0.00019911159512683987, + "loss": 1.1273, + "step": 973 + }, + { + "epoch": 0.17343304843304844, + "grad_norm": 0.40303611755371094, + "learning_rate": 0.0001991097324830988, + "loss": 0.9645, + "step": 974 + }, + { + "epoch": 0.1736111111111111, + "grad_norm": 0.45560577511787415, + "learning_rate": 0.00019910786789750838, + "loss": 1.0864, + "step": 975 + }, + { + "epoch": 0.1737891737891738, + "grad_norm": 0.43775680661201477, + "learning_rate": 0.00019910600137010517, + "loss": 1.028, + "step": 976 + }, + { + "epoch": 0.17396723646723647, + "grad_norm": 0.3917224407196045, + "learning_rate": 0.00019910413290092572, + 
"loss": 1.0491, + "step": 977 + }, + { + "epoch": 0.17414529914529914, + "grad_norm": 0.4068751037120819, + "learning_rate": 0.0001991022624900067, + "loss": 1.0476, + "step": 978 + }, + { + "epoch": 0.17432336182336183, + "grad_norm": 0.4463370144367218, + "learning_rate": 0.0001991003901373847, + "loss": 1.0612, + "step": 979 + }, + { + "epoch": 0.1745014245014245, + "grad_norm": 0.46949052810668945, + "learning_rate": 0.0001990985158430964, + "loss": 1.3099, + "step": 980 + }, + { + "epoch": 0.17467948717948717, + "grad_norm": 0.4250012934207916, + "learning_rate": 0.00019909663960717856, + "loss": 0.9903, + "step": 981 + }, + { + "epoch": 0.17485754985754987, + "grad_norm": 0.5293903946876526, + "learning_rate": 0.0001990947614296679, + "loss": 0.9908, + "step": 982 + }, + { + "epoch": 0.17503561253561253, + "grad_norm": 0.3838284909725189, + "learning_rate": 0.0001990928813106013, + "loss": 0.716, + "step": 983 + }, + { + "epoch": 0.1752136752136752, + "grad_norm": 0.4597751200199127, + "learning_rate": 0.0001990909992500155, + "loss": 1.0126, + "step": 984 + }, + { + "epoch": 0.1753917378917379, + "grad_norm": 0.4844081699848175, + "learning_rate": 0.0001990891152479474, + "loss": 1.1043, + "step": 985 + }, + { + "epoch": 0.17556980056980057, + "grad_norm": 0.4763399660587311, + "learning_rate": 0.00019908722930443392, + "loss": 1.019, + "step": 986 + }, + { + "epoch": 0.17574786324786323, + "grad_norm": 0.4670077860355377, + "learning_rate": 0.00019908534141951204, + "loss": 1.1382, + "step": 987 + }, + { + "epoch": 0.17592592592592593, + "grad_norm": 0.39372730255126953, + "learning_rate": 0.00019908345159321873, + "loss": 1.1219, + "step": 988 + }, + { + "epoch": 0.1761039886039886, + "grad_norm": 0.41869843006134033, + "learning_rate": 0.00019908155982559098, + "loss": 0.9461, + "step": 989 + }, + { + "epoch": 0.1762820512820513, + "grad_norm": 0.4398406147956848, + "learning_rate": 0.00019907966611666593, + "loss": 1.1328, + "step": 990 + }, + { + 
"epoch": 0.17646011396011396, + "grad_norm": 0.4315733015537262, + "learning_rate": 0.0001990777704664806, + "loss": 1.0974, + "step": 991 + }, + { + "epoch": 0.17663817663817663, + "grad_norm": 0.42859575152397156, + "learning_rate": 0.00019907587287507222, + "loss": 1.2637, + "step": 992 + }, + { + "epoch": 0.17681623931623933, + "grad_norm": 0.47928622364997864, + "learning_rate": 0.0001990739733424779, + "loss": 1.0699, + "step": 993 + }, + { + "epoch": 0.176994301994302, + "grad_norm": 0.4443826973438263, + "learning_rate": 0.00019907207186873488, + "loss": 1.0547, + "step": 994 + }, + { + "epoch": 0.17717236467236466, + "grad_norm": 0.4108099937438965, + "learning_rate": 0.00019907016845388043, + "loss": 1.1401, + "step": 995 + }, + { + "epoch": 0.17735042735042736, + "grad_norm": 0.4474675953388214, + "learning_rate": 0.00019906826309795182, + "loss": 1.0712, + "step": 996 + }, + { + "epoch": 0.17752849002849003, + "grad_norm": 0.4149756133556366, + "learning_rate": 0.00019906635580098638, + "loss": 0.9585, + "step": 997 + }, + { + "epoch": 0.1777065527065527, + "grad_norm": 0.4875968098640442, + "learning_rate": 0.00019906444656302152, + "loss": 1.0659, + "step": 998 + }, + { + "epoch": 0.1778846153846154, + "grad_norm": 0.5494784116744995, + "learning_rate": 0.0001990625353840946, + "loss": 1.2858, + "step": 999 + }, + { + "epoch": 0.17806267806267806, + "grad_norm": 0.425062358379364, + "learning_rate": 0.0001990606222642431, + "loss": 1.1826, + "step": 1000 + }, + { + "epoch": 0.17824074074074073, + "grad_norm": 0.3890725374221802, + "learning_rate": 0.00019905870720350445, + "loss": 0.9568, + "step": 1001 + }, + { + "epoch": 0.17841880341880342, + "grad_norm": 0.3884070813655853, + "learning_rate": 0.00019905679020191624, + "loss": 0.9674, + "step": 1002 + }, + { + "epoch": 0.1785968660968661, + "grad_norm": 0.49496129155158997, + "learning_rate": 0.00019905487125951597, + "loss": 0.9143, + "step": 1003 + }, + { + "epoch": 0.1787749287749288, + 
"grad_norm": 0.43448135256767273, + "learning_rate": 0.00019905295037634128, + "loss": 1.2677, + "step": 1004 + }, + { + "epoch": 0.17895299145299146, + "grad_norm": 0.47327905893325806, + "learning_rate": 0.00019905102755242982, + "loss": 0.9089, + "step": 1005 + }, + { + "epoch": 0.17913105413105412, + "grad_norm": 0.4962378442287445, + "learning_rate": 0.00019904910278781922, + "loss": 1.1748, + "step": 1006 + }, + { + "epoch": 0.17930911680911682, + "grad_norm": 0.4343934655189514, + "learning_rate": 0.0001990471760825472, + "loss": 1.2176, + "step": 1007 + }, + { + "epoch": 0.1794871794871795, + "grad_norm": 0.4695793092250824, + "learning_rate": 0.0001990452474366515, + "loss": 1.1822, + "step": 1008 + }, + { + "epoch": 0.17966524216524216, + "grad_norm": 0.4156060516834259, + "learning_rate": 0.00019904331685016995, + "loss": 0.8231, + "step": 1009 + }, + { + "epoch": 0.17984330484330485, + "grad_norm": 0.5068191885948181, + "learning_rate": 0.00019904138432314035, + "loss": 1.1363, + "step": 1010 + }, + { + "epoch": 0.18002136752136752, + "grad_norm": 0.5189786553382874, + "learning_rate": 0.00019903944985560058, + "loss": 1.3131, + "step": 1011 + }, + { + "epoch": 0.1801994301994302, + "grad_norm": 0.5126828551292419, + "learning_rate": 0.00019903751344758848, + "loss": 1.0305, + "step": 1012 + }, + { + "epoch": 0.18037749287749288, + "grad_norm": 0.41045933961868286, + "learning_rate": 0.00019903557509914205, + "loss": 1.2726, + "step": 1013 + }, + { + "epoch": 0.18055555555555555, + "grad_norm": 0.4141713082790375, + "learning_rate": 0.0001990336348102993, + "loss": 0.9606, + "step": 1014 + }, + { + "epoch": 0.18073361823361822, + "grad_norm": 0.42652079463005066, + "learning_rate": 0.00019903169258109812, + "loss": 1.0235, + "step": 1015 + }, + { + "epoch": 0.18091168091168092, + "grad_norm": 0.42098379135131836, + "learning_rate": 0.0001990297484115767, + "loss": 1.0602, + "step": 1016 + }, + { + "epoch": 0.18108974358974358, + "grad_norm": 
0.49920013546943665, + "learning_rate": 0.0001990278023017731, + "loss": 1.3322, + "step": 1017 + }, + { + "epoch": 0.18126780626780628, + "grad_norm": 0.412304550409317, + "learning_rate": 0.00019902585425172537, + "loss": 1.1011, + "step": 1018 + }, + { + "epoch": 0.18144586894586895, + "grad_norm": 0.44226935505867004, + "learning_rate": 0.00019902390426147177, + "loss": 0.9777, + "step": 1019 + }, + { + "epoch": 0.18162393162393162, + "grad_norm": 0.4685269594192505, + "learning_rate": 0.00019902195233105046, + "loss": 1.3587, + "step": 1020 + }, + { + "epoch": 0.1818019943019943, + "grad_norm": 0.4500584304332733, + "learning_rate": 0.00019901999846049968, + "loss": 0.9888, + "step": 1021 + }, + { + "epoch": 0.18198005698005698, + "grad_norm": 0.48566994071006775, + "learning_rate": 0.00019901804264985774, + "loss": 1.2364, + "step": 1022 + }, + { + "epoch": 0.18215811965811965, + "grad_norm": 0.4063156247138977, + "learning_rate": 0.00019901608489916294, + "loss": 1.2224, + "step": 1023 + }, + { + "epoch": 0.18233618233618235, + "grad_norm": 0.471276193857193, + "learning_rate": 0.00019901412520845367, + "loss": 0.9926, + "step": 1024 + }, + { + "epoch": 0.182514245014245, + "grad_norm": 0.5165421366691589, + "learning_rate": 0.00019901216357776829, + "loss": 0.9595, + "step": 1025 + }, + { + "epoch": 0.18269230769230768, + "grad_norm": 0.4746754467487335, + "learning_rate": 0.0001990102000071452, + "loss": 1.2057, + "step": 1026 + }, + { + "epoch": 0.18287037037037038, + "grad_norm": 0.44803035259246826, + "learning_rate": 0.00019900823449662297, + "loss": 1.2114, + "step": 1027 + }, + { + "epoch": 0.18304843304843305, + "grad_norm": 0.47256240248680115, + "learning_rate": 0.00019900626704624005, + "loss": 1.112, + "step": 1028 + }, + { + "epoch": 0.18322649572649571, + "grad_norm": 0.4253387153148651, + "learning_rate": 0.000199004297656035, + "loss": 0.9899, + "step": 1029 + }, + { + "epoch": 0.1834045584045584, + "grad_norm": 0.44958099722862244, + 
"learning_rate": 0.00019900232632604636, + "loss": 1.1445, + "step": 1030 + }, + { + "epoch": 0.18358262108262108, + "grad_norm": 0.5296537280082703, + "learning_rate": 0.00019900035305631285, + "loss": 1.2502, + "step": 1031 + }, + { + "epoch": 0.18376068376068377, + "grad_norm": 0.5057148933410645, + "learning_rate": 0.00019899837784687302, + "loss": 1.1426, + "step": 1032 + }, + { + "epoch": 0.18393874643874644, + "grad_norm": 0.41463762521743774, + "learning_rate": 0.00019899640069776566, + "loss": 1.1854, + "step": 1033 + }, + { + "epoch": 0.1841168091168091, + "grad_norm": 0.45800045132637024, + "learning_rate": 0.00019899442160902945, + "loss": 1.2438, + "step": 1034 + }, + { + "epoch": 0.1842948717948718, + "grad_norm": 0.43450453877449036, + "learning_rate": 0.00019899244058070324, + "loss": 1.0598, + "step": 1035 + }, + { + "epoch": 0.18447293447293447, + "grad_norm": 0.4141148626804352, + "learning_rate": 0.00019899045761282577, + "loss": 1.0465, + "step": 1036 + }, + { + "epoch": 0.18465099715099714, + "grad_norm": 0.3938458263874054, + "learning_rate": 0.0001989884727054359, + "loss": 1.0142, + "step": 1037 + }, + { + "epoch": 0.18482905982905984, + "grad_norm": 0.43898263573646545, + "learning_rate": 0.00019898648585857257, + "loss": 0.9212, + "step": 1038 + }, + { + "epoch": 0.1850071225071225, + "grad_norm": 0.4425487816333771, + "learning_rate": 0.00019898449707227465, + "loss": 1.2987, + "step": 1039 + }, + { + "epoch": 0.18518518518518517, + "grad_norm": 0.4537975490093231, + "learning_rate": 0.00019898250634658115, + "loss": 1.2023, + "step": 1040 + }, + { + "epoch": 0.18536324786324787, + "grad_norm": 0.4107198119163513, + "learning_rate": 0.00019898051368153104, + "loss": 0.8443, + "step": 1041 + }, + { + "epoch": 0.18554131054131054, + "grad_norm": 0.4389404058456421, + "learning_rate": 0.0001989785190771634, + "loss": 1.0502, + "step": 1042 + }, + { + "epoch": 0.1857193732193732, + "grad_norm": 0.4288824796676636, + "learning_rate": 
0.00019897652253351726, + "loss": 1.01, + "step": 1043 + }, + { + "epoch": 0.1858974358974359, + "grad_norm": 0.50815349817276, + "learning_rate": 0.00019897452405063178, + "loss": 1.0308, + "step": 1044 + }, + { + "epoch": 0.18607549857549857, + "grad_norm": 0.45252710580825806, + "learning_rate": 0.0001989725236285461, + "loss": 1.0967, + "step": 1045 + }, + { + "epoch": 0.18625356125356127, + "grad_norm": 0.45049402117729187, + "learning_rate": 0.00019897052126729943, + "loss": 1.0141, + "step": 1046 + }, + { + "epoch": 0.18643162393162394, + "grad_norm": 0.49637508392333984, + "learning_rate": 0.00019896851696693098, + "loss": 1.0997, + "step": 1047 + }, + { + "epoch": 0.1866096866096866, + "grad_norm": 0.4465886056423187, + "learning_rate": 0.00019896651072748005, + "loss": 1.1415, + "step": 1048 + }, + { + "epoch": 0.1867877492877493, + "grad_norm": 0.5309500694274902, + "learning_rate": 0.00019896450254898592, + "loss": 1.1028, + "step": 1049 + }, + { + "epoch": 0.18696581196581197, + "grad_norm": 0.3516653776168823, + "learning_rate": 0.00019896249243148793, + "loss": 0.9841, + "step": 1050 + }, + { + "epoch": 0.18714387464387464, + "grad_norm": 0.4529176950454712, + "learning_rate": 0.0001989604803750255, + "loss": 1.1335, + "step": 1051 + }, + { + "epoch": 0.18732193732193733, + "grad_norm": 0.47694942355155945, + "learning_rate": 0.000198958466379638, + "loss": 1.2383, + "step": 1052 + }, + { + "epoch": 0.1875, + "grad_norm": 0.5524206757545471, + "learning_rate": 0.0001989564504453649, + "loss": 1.3668, + "step": 1053 + }, + { + "epoch": 0.18767806267806267, + "grad_norm": 0.39203691482543945, + "learning_rate": 0.00019895443257224576, + "loss": 1.2203, + "step": 1054 + }, + { + "epoch": 0.18785612535612536, + "grad_norm": 0.4164120852947235, + "learning_rate": 0.00019895241276032005, + "loss": 0.8954, + "step": 1055 + }, + { + "epoch": 0.18803418803418803, + "grad_norm": 0.41217970848083496, + "learning_rate": 0.0001989503910096274, + "loss": 1.0238, + 
"step": 1056 + }, + { + "epoch": 0.1882122507122507, + "grad_norm": 0.44038307666778564, + "learning_rate": 0.00019894836732020735, + "loss": 0.8159, + "step": 1057 + }, + { + "epoch": 0.1883903133903134, + "grad_norm": 0.45780670642852783, + "learning_rate": 0.0001989463416920996, + "loss": 1.2864, + "step": 1058 + }, + { + "epoch": 0.18856837606837606, + "grad_norm": 0.5197559595108032, + "learning_rate": 0.00019894431412534384, + "loss": 1.0756, + "step": 1059 + }, + { + "epoch": 0.18874643874643873, + "grad_norm": 0.43283385038375854, + "learning_rate": 0.00019894228461997979, + "loss": 1.0642, + "step": 1060 + }, + { + "epoch": 0.18892450142450143, + "grad_norm": 0.4657376706600189, + "learning_rate": 0.00019894025317604717, + "loss": 1.1159, + "step": 1061 + }, + { + "epoch": 0.1891025641025641, + "grad_norm": 0.4474908113479614, + "learning_rate": 0.00019893821979358588, + "loss": 1.2006, + "step": 1062 + }, + { + "epoch": 0.1892806267806268, + "grad_norm": 0.43878164887428284, + "learning_rate": 0.00019893618447263566, + "loss": 1.1599, + "step": 1063 + }, + { + "epoch": 0.18945868945868946, + "grad_norm": 0.4598735272884369, + "learning_rate": 0.00019893414721323645, + "loss": 1.3346, + "step": 1064 + }, + { + "epoch": 0.18963675213675213, + "grad_norm": 0.3947420120239258, + "learning_rate": 0.00019893210801542812, + "loss": 1.1201, + "step": 1065 + }, + { + "epoch": 0.18981481481481483, + "grad_norm": 0.3401558995246887, + "learning_rate": 0.00019893006687925064, + "loss": 0.7568, + "step": 1066 + }, + { + "epoch": 0.1899928774928775, + "grad_norm": 0.4400341808795929, + "learning_rate": 0.00019892802380474405, + "loss": 1.1706, + "step": 1067 + }, + { + "epoch": 0.19017094017094016, + "grad_norm": 0.42394164204597473, + "learning_rate": 0.00019892597879194829, + "loss": 1.0163, + "step": 1068 + }, + { + "epoch": 0.19034900284900286, + "grad_norm": 0.42904096841812134, + "learning_rate": 0.00019892393184090353, + "loss": 0.9193, + "step": 1069 + }, + { + 
"epoch": 0.19052706552706553, + "grad_norm": 0.497601181268692, + "learning_rate": 0.00019892188295164977, + "loss": 1.0377, + "step": 1070 + }, + { + "epoch": 0.1907051282051282, + "grad_norm": 0.4536020755767822, + "learning_rate": 0.00019891983212422723, + "loss": 1.0946, + "step": 1071 + }, + { + "epoch": 0.1908831908831909, + "grad_norm": 0.44916942715644836, + "learning_rate": 0.00019891777935867607, + "loss": 1.0563, + "step": 1072 + }, + { + "epoch": 0.19106125356125356, + "grad_norm": 0.4256889820098877, + "learning_rate": 0.0001989157246550365, + "loss": 1.0988, + "step": 1073 + }, + { + "epoch": 0.19123931623931623, + "grad_norm": 0.5559163689613342, + "learning_rate": 0.0001989136680133488, + "loss": 0.9155, + "step": 1074 + }, + { + "epoch": 0.19141737891737892, + "grad_norm": 0.391804963350296, + "learning_rate": 0.00019891160943365322, + "loss": 0.9314, + "step": 1075 + }, + { + "epoch": 0.1915954415954416, + "grad_norm": 0.4535716474056244, + "learning_rate": 0.00019890954891599015, + "loss": 1.0768, + "step": 1076 + }, + { + "epoch": 0.19177350427350429, + "grad_norm": 0.46770521998405457, + "learning_rate": 0.00019890748646039991, + "loss": 0.8406, + "step": 1077 + }, + { + "epoch": 0.19195156695156695, + "grad_norm": 0.4875394403934479, + "learning_rate": 0.00019890542206692295, + "loss": 1.1055, + "step": 1078 + }, + { + "epoch": 0.19212962962962962, + "grad_norm": 0.5072727203369141, + "learning_rate": 0.0001989033557355997, + "loss": 1.3093, + "step": 1079 + }, + { + "epoch": 0.19230769230769232, + "grad_norm": 0.4419287443161011, + "learning_rate": 0.00019890128746647068, + "loss": 1.1916, + "step": 1080 + }, + { + "epoch": 0.192485754985755, + "grad_norm": 0.45803651213645935, + "learning_rate": 0.00019889921725957637, + "loss": 1.2579, + "step": 1081 + }, + { + "epoch": 0.19266381766381765, + "grad_norm": 0.4832262098789215, + "learning_rate": 0.0001988971451149573, + "loss": 1.3217, + "step": 1082 + }, + { + "epoch": 0.19284188034188035, + 
"grad_norm": 0.4819786250591278, + "learning_rate": 0.00019889507103265416, + "loss": 1.0979, + "step": 1083 + }, + { + "epoch": 0.19301994301994302, + "grad_norm": 0.49360713362693787, + "learning_rate": 0.0001988929950127075, + "loss": 1.0987, + "step": 1084 + }, + { + "epoch": 0.1931980056980057, + "grad_norm": 0.44209200143814087, + "learning_rate": 0.00019889091705515806, + "loss": 1.2616, + "step": 1085 + }, + { + "epoch": 0.19337606837606838, + "grad_norm": 0.41626206040382385, + "learning_rate": 0.00019888883716004654, + "loss": 1.0922, + "step": 1086 + }, + { + "epoch": 0.19355413105413105, + "grad_norm": 0.4916635751724243, + "learning_rate": 0.00019888675532741366, + "loss": 0.9331, + "step": 1087 + }, + { + "epoch": 0.19373219373219372, + "grad_norm": 0.4493125379085541, + "learning_rate": 0.00019888467155730025, + "loss": 1.1261, + "step": 1088 + }, + { + "epoch": 0.19391025641025642, + "grad_norm": 0.3755671977996826, + "learning_rate": 0.00019888258584974708, + "loss": 0.9821, + "step": 1089 + }, + { + "epoch": 0.19408831908831908, + "grad_norm": 0.41917556524276733, + "learning_rate": 0.00019888049820479507, + "loss": 1.251, + "step": 1090 + }, + { + "epoch": 0.19426638176638178, + "grad_norm": 0.46184420585632324, + "learning_rate": 0.0001988784086224851, + "loss": 1.1731, + "step": 1091 + }, + { + "epoch": 0.19444444444444445, + "grad_norm": 0.4783691465854645, + "learning_rate": 0.00019887631710285812, + "loss": 1.1635, + "step": 1092 + }, + { + "epoch": 0.19462250712250712, + "grad_norm": 0.4710482060909271, + "learning_rate": 0.00019887422364595512, + "loss": 1.0229, + "step": 1093 + }, + { + "epoch": 0.1948005698005698, + "grad_norm": 0.4738706648349762, + "learning_rate": 0.00019887212825181707, + "loss": 1.128, + "step": 1094 + }, + { + "epoch": 0.19497863247863248, + "grad_norm": 0.45665010809898376, + "learning_rate": 0.00019887003092048508, + "loss": 1.0425, + "step": 1095 + }, + { + "epoch": 0.19515669515669515, + "grad_norm": 
0.42740485072135925, + "learning_rate": 0.0001988679316520002, + "loss": 1.0738, + "step": 1096 + }, + { + "epoch": 0.19533475783475784, + "grad_norm": 0.5977092385292053, + "learning_rate": 0.0001988658304464036, + "loss": 1.2687, + "step": 1097 + }, + { + "epoch": 0.1955128205128205, + "grad_norm": 0.4411074221134186, + "learning_rate": 0.0001988637273037364, + "loss": 1.287, + "step": 1098 + }, + { + "epoch": 0.19569088319088318, + "grad_norm": 0.4409518539905548, + "learning_rate": 0.00019886162222403986, + "loss": 1.0515, + "step": 1099 + }, + { + "epoch": 0.19586894586894588, + "grad_norm": 0.4926736652851105, + "learning_rate": 0.0001988595152073552, + "loss": 1.1388, + "step": 1100 + }, + { + "epoch": 0.19604700854700854, + "grad_norm": 0.4607115387916565, + "learning_rate": 0.00019885740625372368, + "loss": 0.9803, + "step": 1101 + }, + { + "epoch": 0.1962250712250712, + "grad_norm": 0.4725342094898224, + "learning_rate": 0.0001988552953631867, + "loss": 1.199, + "step": 1102 + }, + { + "epoch": 0.1964031339031339, + "grad_norm": 0.48014503717422485, + "learning_rate": 0.00019885318253578548, + "loss": 1.1868, + "step": 1103 + }, + { + "epoch": 0.19658119658119658, + "grad_norm": 0.3872644603252411, + "learning_rate": 0.00019885106777156155, + "loss": 0.9182, + "step": 1104 + }, + { + "epoch": 0.19675925925925927, + "grad_norm": 0.4737720787525177, + "learning_rate": 0.00019884895107055627, + "loss": 1.1513, + "step": 1105 + }, + { + "epoch": 0.19693732193732194, + "grad_norm": 0.4144562780857086, + "learning_rate": 0.00019884683243281116, + "loss": 1.1711, + "step": 1106 + }, + { + "epoch": 0.1971153846153846, + "grad_norm": 0.4672079384326935, + "learning_rate": 0.00019884471185836769, + "loss": 1.0386, + "step": 1107 + }, + { + "epoch": 0.1972934472934473, + "grad_norm": 0.4558824598789215, + "learning_rate": 0.0001988425893472674, + "loss": 1.0535, + "step": 1108 + }, + { + "epoch": 0.19747150997150997, + "grad_norm": 0.5149834752082825, + 
"learning_rate": 0.00019884046489955192, + "loss": 1.0296, + "step": 1109 + }, + { + "epoch": 0.19764957264957264, + "grad_norm": 0.43444496393203735, + "learning_rate": 0.00019883833851526287, + "loss": 1.1475, + "step": 1110 + }, + { + "epoch": 0.19782763532763534, + "grad_norm": 0.46062374114990234, + "learning_rate": 0.00019883621019444188, + "loss": 1.183, + "step": 1111 + }, + { + "epoch": 0.198005698005698, + "grad_norm": 0.4893282949924469, + "learning_rate": 0.00019883407993713065, + "loss": 1.3733, + "step": 1112 + }, + { + "epoch": 0.19818376068376067, + "grad_norm": 0.5434843897819519, + "learning_rate": 0.00019883194774337096, + "loss": 1.2505, + "step": 1113 + }, + { + "epoch": 0.19836182336182337, + "grad_norm": 0.4698035418987274, + "learning_rate": 0.00019882981361320456, + "loss": 1.0152, + "step": 1114 + }, + { + "epoch": 0.19853988603988604, + "grad_norm": 0.4582163989543915, + "learning_rate": 0.00019882767754667325, + "loss": 1.1718, + "step": 1115 + }, + { + "epoch": 0.1987179487179487, + "grad_norm": 0.48744696378707886, + "learning_rate": 0.0001988255395438189, + "loss": 1.2923, + "step": 1116 + }, + { + "epoch": 0.1988960113960114, + "grad_norm": 0.4172030985355377, + "learning_rate": 0.0001988233996046834, + "loss": 0.8098, + "step": 1117 + }, + { + "epoch": 0.19907407407407407, + "grad_norm": 0.4556557834148407, + "learning_rate": 0.00019882125772930867, + "loss": 0.9654, + "step": 1118 + }, + { + "epoch": 0.19925213675213677, + "grad_norm": 0.4363219141960144, + "learning_rate": 0.00019881911391773666, + "loss": 1.0333, + "step": 1119 + }, + { + "epoch": 0.19943019943019943, + "grad_norm": 0.4336536228656769, + "learning_rate": 0.0001988169681700094, + "loss": 1.091, + "step": 1120 + }, + { + "epoch": 0.1996082621082621, + "grad_norm": 0.42073166370391846, + "learning_rate": 0.00019881482048616893, + "loss": 0.9687, + "step": 1121 + }, + { + "epoch": 0.1997863247863248, + "grad_norm": 0.4330587685108185, + "learning_rate": 
0.00019881267086625733, + "loss": 1.0512, + "step": 1122 + }, + { + "epoch": 0.19996438746438747, + "grad_norm": 0.4602276682853699, + "learning_rate": 0.0001988105193103167, + "loss": 1.1806, + "step": 1123 + }, + { + "epoch": 0.20014245014245013, + "grad_norm": 0.4271257817745209, + "learning_rate": 0.0001988083658183892, + "loss": 1.1079, + "step": 1124 + }, + { + "epoch": 0.20032051282051283, + "grad_norm": 0.35446426272392273, + "learning_rate": 0.00019880621039051707, + "loss": 0.6769, + "step": 1125 + }, + { + "epoch": 0.2004985754985755, + "grad_norm": 0.413753479719162, + "learning_rate": 0.00019880405302674244, + "loss": 1.1088, + "step": 1126 + }, + { + "epoch": 0.20067663817663817, + "grad_norm": 0.4423675835132599, + "learning_rate": 0.00019880189372710767, + "loss": 1.1371, + "step": 1127 + }, + { + "epoch": 0.20085470085470086, + "grad_norm": 0.41865605115890503, + "learning_rate": 0.00019879973249165502, + "loss": 1.0027, + "step": 1128 + }, + { + "epoch": 0.20103276353276353, + "grad_norm": 0.4109594225883484, + "learning_rate": 0.00019879756932042686, + "loss": 0.8734, + "step": 1129 + }, + { + "epoch": 0.2012108262108262, + "grad_norm": 0.42326363921165466, + "learning_rate": 0.00019879540421346555, + "loss": 0.9722, + "step": 1130 + }, + { + "epoch": 0.2013888888888889, + "grad_norm": 0.4601542055606842, + "learning_rate": 0.00019879323717081354, + "loss": 1.1251, + "step": 1131 + }, + { + "epoch": 0.20156695156695156, + "grad_norm": 0.4704367518424988, + "learning_rate": 0.00019879106819251327, + "loss": 0.9457, + "step": 1132 + }, + { + "epoch": 0.20174501424501423, + "grad_norm": 0.465023934841156, + "learning_rate": 0.00019878889727860724, + "loss": 0.9633, + "step": 1133 + }, + { + "epoch": 0.20192307692307693, + "grad_norm": 0.4572450518608093, + "learning_rate": 0.00019878672442913796, + "loss": 1.1965, + "step": 1134 + }, + { + "epoch": 0.2021011396011396, + "grad_norm": 0.4323410391807556, + "learning_rate": 0.00019878454964414807, + 
"loss": 1.1296, + "step": 1135 + }, + { + "epoch": 0.2022792022792023, + "grad_norm": 0.4513751268386841, + "learning_rate": 0.00019878237292368013, + "loss": 1.0571, + "step": 1136 + }, + { + "epoch": 0.20245726495726496, + "grad_norm": 0.45504096150398254, + "learning_rate": 0.00019878019426777677, + "loss": 1.0316, + "step": 1137 + }, + { + "epoch": 0.20263532763532763, + "grad_norm": 0.45715275406837463, + "learning_rate": 0.0001987780136764807, + "loss": 1.0528, + "step": 1138 + }, + { + "epoch": 0.20281339031339032, + "grad_norm": 0.4934465289115906, + "learning_rate": 0.00019877583114983466, + "loss": 1.3238, + "step": 1139 + }, + { + "epoch": 0.202991452991453, + "grad_norm": 0.4304082989692688, + "learning_rate": 0.0001987736466878814, + "loss": 1.1774, + "step": 1140 + }, + { + "epoch": 0.20316951566951566, + "grad_norm": 0.49721968173980713, + "learning_rate": 0.00019877146029066372, + "loss": 1.1767, + "step": 1141 + }, + { + "epoch": 0.20334757834757836, + "grad_norm": 0.3629468083381653, + "learning_rate": 0.00019876927195822445, + "loss": 0.8588, + "step": 1142 + }, + { + "epoch": 0.20352564102564102, + "grad_norm": 0.49310383200645447, + "learning_rate": 0.00019876708169060648, + "loss": 1.0588, + "step": 1143 + }, + { + "epoch": 0.2037037037037037, + "grad_norm": 0.4270328879356384, + "learning_rate": 0.00019876488948785271, + "loss": 1.1523, + "step": 1144 + }, + { + "epoch": 0.2038817663817664, + "grad_norm": 0.4559730887413025, + "learning_rate": 0.0001987626953500061, + "loss": 1.1736, + "step": 1145 + }, + { + "epoch": 0.20405982905982906, + "grad_norm": 0.5335259437561035, + "learning_rate": 0.00019876049927710962, + "loss": 0.991, + "step": 1146 + }, + { + "epoch": 0.20423789173789172, + "grad_norm": 0.43500083684921265, + "learning_rate": 0.0001987583012692063, + "loss": 1.0631, + "step": 1147 + }, + { + "epoch": 0.20441595441595442, + "grad_norm": 0.4135417938232422, + "learning_rate": 0.00019875610132633927, + "loss": 1.0896, + "step": 
1148 + }, + { + "epoch": 0.2045940170940171, + "grad_norm": 0.4078896641731262, + "learning_rate": 0.00019875389944855153, + "loss": 1.0395, + "step": 1149 + }, + { + "epoch": 0.20477207977207978, + "grad_norm": 0.46612194180488586, + "learning_rate": 0.00019875169563588632, + "loss": 1.0541, + "step": 1150 + }, + { + "epoch": 0.20495014245014245, + "grad_norm": 0.5093224048614502, + "learning_rate": 0.00019874948988838674, + "loss": 1.1486, + "step": 1151 + }, + { + "epoch": 0.20512820512820512, + "grad_norm": 0.5079755187034607, + "learning_rate": 0.00019874728220609607, + "loss": 1.2614, + "step": 1152 + }, + { + "epoch": 0.20530626780626782, + "grad_norm": 0.43663498759269714, + "learning_rate": 0.0001987450725890575, + "loss": 1.0683, + "step": 1153 + }, + { + "epoch": 0.20548433048433049, + "grad_norm": 0.5029327273368835, + "learning_rate": 0.00019874286103731435, + "loss": 1.1934, + "step": 1154 + }, + { + "epoch": 0.20566239316239315, + "grad_norm": 0.48770397901535034, + "learning_rate": 0.00019874064755090999, + "loss": 1.1634, + "step": 1155 + }, + { + "epoch": 0.20584045584045585, + "grad_norm": 0.46826690435409546, + "learning_rate": 0.00019873843212988776, + "loss": 1.0621, + "step": 1156 + }, + { + "epoch": 0.20601851851851852, + "grad_norm": 0.4810047149658203, + "learning_rate": 0.00019873621477429105, + "loss": 1.0879, + "step": 1157 + }, + { + "epoch": 0.20619658119658119, + "grad_norm": 0.4769522249698639, + "learning_rate": 0.00019873399548416335, + "loss": 1.1365, + "step": 1158 + }, + { + "epoch": 0.20637464387464388, + "grad_norm": 0.4221782982349396, + "learning_rate": 0.00019873177425954806, + "loss": 1.1168, + "step": 1159 + }, + { + "epoch": 0.20655270655270655, + "grad_norm": 0.4084923565387726, + "learning_rate": 0.00019872955110048876, + "loss": 1.2364, + "step": 1160 + }, + { + "epoch": 0.20673076923076922, + "grad_norm": 0.4781704545021057, + "learning_rate": 0.00019872732600702904, + "loss": 1.19, + "step": 1161 + }, + { + 
"epoch": 0.20690883190883191, + "grad_norm": 0.3984242081642151, + "learning_rate": 0.0001987250989792124, + "loss": 1.0568, + "step": 1162 + }, + { + "epoch": 0.20708689458689458, + "grad_norm": 0.4601972997188568, + "learning_rate": 0.00019872287001708257, + "loss": 1.1625, + "step": 1163 + }, + { + "epoch": 0.20726495726495728, + "grad_norm": 0.4853581190109253, + "learning_rate": 0.00019872063912068316, + "loss": 1.2304, + "step": 1164 + }, + { + "epoch": 0.20744301994301995, + "grad_norm": 0.41779839992523193, + "learning_rate": 0.0001987184062900579, + "loss": 0.9807, + "step": 1165 + }, + { + "epoch": 0.20762108262108261, + "grad_norm": 0.4945356249809265, + "learning_rate": 0.00019871617152525056, + "loss": 1.1861, + "step": 1166 + }, + { + "epoch": 0.2077991452991453, + "grad_norm": 0.47432294487953186, + "learning_rate": 0.00019871393482630487, + "loss": 1.1448, + "step": 1167 + }, + { + "epoch": 0.20797720797720798, + "grad_norm": 0.44647398591041565, + "learning_rate": 0.00019871169619326473, + "loss": 1.096, + "step": 1168 + }, + { + "epoch": 0.20815527065527065, + "grad_norm": 0.4643072783946991, + "learning_rate": 0.00019870945562617393, + "loss": 1.1561, + "step": 1169 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.4544340968132019, + "learning_rate": 0.0001987072131250764, + "loss": 1.0764, + "step": 1170 + }, + { + "epoch": 0.208511396011396, + "grad_norm": 0.6036561727523804, + "learning_rate": 0.00019870496869001607, + "loss": 1.3961, + "step": 1171 + }, + { + "epoch": 0.20868945868945868, + "grad_norm": 0.41348758339881897, + "learning_rate": 0.00019870272232103695, + "loss": 1.2219, + "step": 1172 + }, + { + "epoch": 0.20886752136752137, + "grad_norm": 0.4184056222438812, + "learning_rate": 0.000198700474018183, + "loss": 1.1115, + "step": 1173 + }, + { + "epoch": 0.20904558404558404, + "grad_norm": 0.41920599341392517, + "learning_rate": 0.0001986982237814983, + "loss": 0.9207, + "step": 1174 + }, + { + "epoch": 0.2092236467236467, 
+ "grad_norm": 0.4710249602794647, + "learning_rate": 0.00019869597161102694, + "loss": 1.1342, + "step": 1175 + }, + { + "epoch": 0.2094017094017094, + "grad_norm": 0.46897777915000916, + "learning_rate": 0.000198693717506813, + "loss": 0.983, + "step": 1176 + }, + { + "epoch": 0.20957977207977208, + "grad_norm": 0.4817039370536804, + "learning_rate": 0.00019869146146890074, + "loss": 1.0923, + "step": 1177 + }, + { + "epoch": 0.20975783475783477, + "grad_norm": 0.4806751012802124, + "learning_rate": 0.00019868920349733427, + "loss": 1.2296, + "step": 1178 + }, + { + "epoch": 0.20993589743589744, + "grad_norm": 0.44182994961738586, + "learning_rate": 0.0001986869435921579, + "loss": 1.1856, + "step": 1179 + }, + { + "epoch": 0.2101139601139601, + "grad_norm": 0.4282805621623993, + "learning_rate": 0.00019868468175341584, + "loss": 1.0046, + "step": 1180 + }, + { + "epoch": 0.2102920227920228, + "grad_norm": 0.5011838674545288, + "learning_rate": 0.00019868241798115242, + "loss": 1.2401, + "step": 1181 + }, + { + "epoch": 0.21047008547008547, + "grad_norm": 0.4282447397708893, + "learning_rate": 0.00019868015227541208, + "loss": 0.9338, + "step": 1182 + }, + { + "epoch": 0.21064814814814814, + "grad_norm": 0.4348810911178589, + "learning_rate": 0.00019867788463623912, + "loss": 0.926, + "step": 1183 + }, + { + "epoch": 0.21082621082621084, + "grad_norm": 0.41518425941467285, + "learning_rate": 0.00019867561506367799, + "loss": 1.2723, + "step": 1184 + }, + { + "epoch": 0.2110042735042735, + "grad_norm": 0.47346001863479614, + "learning_rate": 0.00019867334355777315, + "loss": 1.1931, + "step": 1185 + }, + { + "epoch": 0.21118233618233617, + "grad_norm": 0.4071715474128723, + "learning_rate": 0.00019867107011856914, + "loss": 0.9619, + "step": 1186 + }, + { + "epoch": 0.21136039886039887, + "grad_norm": 0.4803447425365448, + "learning_rate": 0.00019866879474611046, + "loss": 1.2, + "step": 1187 + }, + { + "epoch": 0.21153846153846154, + "grad_norm": 
0.4827699661254883, + "learning_rate": 0.00019866651744044172, + "loss": 1.0938, + "step": 1188 + }, + { + "epoch": 0.2117165242165242, + "grad_norm": 0.4528424143791199, + "learning_rate": 0.00019866423820160756, + "loss": 0.9721, + "step": 1189 + }, + { + "epoch": 0.2118945868945869, + "grad_norm": 0.43566834926605225, + "learning_rate": 0.0001986619570296526, + "loss": 1.0352, + "step": 1190 + }, + { + "epoch": 0.21207264957264957, + "grad_norm": 0.4516540467739105, + "learning_rate": 0.0001986596739246215, + "loss": 1.1333, + "step": 1191 + }, + { + "epoch": 0.21225071225071226, + "grad_norm": 0.4456641376018524, + "learning_rate": 0.00019865738888655908, + "loss": 1.2813, + "step": 1192 + }, + { + "epoch": 0.21242877492877493, + "grad_norm": 0.47048309445381165, + "learning_rate": 0.00019865510191551008, + "loss": 1.1067, + "step": 1193 + }, + { + "epoch": 0.2126068376068376, + "grad_norm": 0.4604061543941498, + "learning_rate": 0.00019865281301151928, + "loss": 0.925, + "step": 1194 + }, + { + "epoch": 0.2127849002849003, + "grad_norm": 0.49341437220573425, + "learning_rate": 0.00019865052217463153, + "loss": 1.2319, + "step": 1195 + }, + { + "epoch": 0.21296296296296297, + "grad_norm": 0.5099014639854431, + "learning_rate": 0.00019864822940489173, + "loss": 1.139, + "step": 1196 + }, + { + "epoch": 0.21314102564102563, + "grad_norm": 0.41396936774253845, + "learning_rate": 0.0001986459347023448, + "loss": 1.0594, + "step": 1197 + }, + { + "epoch": 0.21331908831908833, + "grad_norm": 0.46071869134902954, + "learning_rate": 0.0001986436380670357, + "loss": 1.0815, + "step": 1198 + }, + { + "epoch": 0.213497150997151, + "grad_norm": 0.507882297039032, + "learning_rate": 0.00019864133949900942, + "loss": 1.3841, + "step": 1199 + }, + { + "epoch": 0.21367521367521367, + "grad_norm": 0.45680439472198486, + "learning_rate": 0.00019863903899831103, + "loss": 1.0945, + "step": 1200 + }, + { + "epoch": 0.21385327635327636, + "grad_norm": 0.44277429580688477, + 
"learning_rate": 0.00019863673656498555, + "loss": 1.1655, + "step": 1201 + }, + { + "epoch": 0.21403133903133903, + "grad_norm": 0.43890756368637085, + "learning_rate": 0.00019863443219907812, + "loss": 1.1186, + "step": 1202 + }, + { + "epoch": 0.2142094017094017, + "grad_norm": 0.3910178542137146, + "learning_rate": 0.0001986321259006339, + "loss": 1.0817, + "step": 1203 + }, + { + "epoch": 0.2143874643874644, + "grad_norm": 0.3803878128528595, + "learning_rate": 0.00019862981766969803, + "loss": 0.8022, + "step": 1204 + }, + { + "epoch": 0.21456552706552706, + "grad_norm": 0.4495108425617218, + "learning_rate": 0.0001986275075063158, + "loss": 1.2212, + "step": 1205 + }, + { + "epoch": 0.21474358974358973, + "grad_norm": 0.5211976766586304, + "learning_rate": 0.00019862519541053244, + "loss": 1.2771, + "step": 1206 + }, + { + "epoch": 0.21492165242165243, + "grad_norm": 0.4313061535358429, + "learning_rate": 0.00019862288138239325, + "loss": 1.1205, + "step": 1207 + }, + { + "epoch": 0.2150997150997151, + "grad_norm": 0.47110888361930847, + "learning_rate": 0.00019862056542194355, + "loss": 1.1835, + "step": 1208 + }, + { + "epoch": 0.2152777777777778, + "grad_norm": 0.5129403471946716, + "learning_rate": 0.00019861824752922876, + "loss": 1.1655, + "step": 1209 + }, + { + "epoch": 0.21545584045584046, + "grad_norm": 0.4353938102722168, + "learning_rate": 0.00019861592770429427, + "loss": 1.2794, + "step": 1210 + }, + { + "epoch": 0.21563390313390313, + "grad_norm": 0.48590636253356934, + "learning_rate": 0.0001986136059471855, + "loss": 1.2003, + "step": 1211 + }, + { + "epoch": 0.21581196581196582, + "grad_norm": 0.4738406836986542, + "learning_rate": 0.00019861128225794804, + "loss": 1.2271, + "step": 1212 + }, + { + "epoch": 0.2159900284900285, + "grad_norm": 0.45983126759529114, + "learning_rate": 0.0001986089566366273, + "loss": 1.1896, + "step": 1213 + }, + { + "epoch": 0.21616809116809116, + "grad_norm": 0.37296006083488464, + "learning_rate": 
0.00019860662908326892, + "loss": 1.079, + "step": 1214 + }, + { + "epoch": 0.21634615384615385, + "grad_norm": 0.4442676305770874, + "learning_rate": 0.00019860429959791845, + "loss": 1.1754, + "step": 1215 + }, + { + "epoch": 0.21652421652421652, + "grad_norm": 0.4950128495693207, + "learning_rate": 0.0001986019681806216, + "loss": 1.1571, + "step": 1216 + }, + { + "epoch": 0.2167022792022792, + "grad_norm": 0.4374556541442871, + "learning_rate": 0.000198599634831424, + "loss": 1.1003, + "step": 1217 + }, + { + "epoch": 0.2168803418803419, + "grad_norm": 0.47301414608955383, + "learning_rate": 0.00019859729955037136, + "loss": 1.1426, + "step": 1218 + }, + { + "epoch": 0.21705840455840456, + "grad_norm": 0.41213178634643555, + "learning_rate": 0.00019859496233750947, + "loss": 1.0659, + "step": 1219 + }, + { + "epoch": 0.21723646723646722, + "grad_norm": 0.41601964831352234, + "learning_rate": 0.0001985926231928841, + "loss": 1.0248, + "step": 1220 + }, + { + "epoch": 0.21741452991452992, + "grad_norm": 0.46328839659690857, + "learning_rate": 0.0001985902821165411, + "loss": 1.0405, + "step": 1221 + }, + { + "epoch": 0.2175925925925926, + "grad_norm": 0.43287959694862366, + "learning_rate": 0.0001985879391085263, + "loss": 0.9202, + "step": 1222 + }, + { + "epoch": 0.21777065527065528, + "grad_norm": 0.4770444631576538, + "learning_rate": 0.00019858559416888568, + "loss": 1.0911, + "step": 1223 + }, + { + "epoch": 0.21794871794871795, + "grad_norm": 0.4756585955619812, + "learning_rate": 0.00019858324729766507, + "loss": 1.1566, + "step": 1224 + }, + { + "epoch": 0.21812678062678062, + "grad_norm": 0.4337233006954193, + "learning_rate": 0.00019858089849491054, + "loss": 0.9084, + "step": 1225 + }, + { + "epoch": 0.21830484330484332, + "grad_norm": 0.5165579319000244, + "learning_rate": 0.00019857854776066813, + "loss": 1.4154, + "step": 1226 + }, + { + "epoch": 0.21848290598290598, + "grad_norm": 0.4280378520488739, + "learning_rate": 0.00019857619509498382, + 
"loss": 1.1291, + "step": 1227 + }, + { + "epoch": 0.21866096866096865, + "grad_norm": 0.5375089049339294, + "learning_rate": 0.00019857384049790376, + "loss": 1.2985, + "step": 1228 + }, + { + "epoch": 0.21883903133903135, + "grad_norm": 0.4708811640739441, + "learning_rate": 0.00019857148396947401, + "loss": 1.0589, + "step": 1229 + }, + { + "epoch": 0.21901709401709402, + "grad_norm": 0.4744570255279541, + "learning_rate": 0.00019856912550974084, + "loss": 1.1269, + "step": 1230 + }, + { + "epoch": 0.21919515669515668, + "grad_norm": 0.5355265736579895, + "learning_rate": 0.00019856676511875043, + "loss": 1.1441, + "step": 1231 + }, + { + "epoch": 0.21937321937321938, + "grad_norm": 0.42718183994293213, + "learning_rate": 0.00019856440279654897, + "loss": 1.0244, + "step": 1232 + }, + { + "epoch": 0.21955128205128205, + "grad_norm": 0.5162127614021301, + "learning_rate": 0.00019856203854318283, + "loss": 1.2674, + "step": 1233 + }, + { + "epoch": 0.21972934472934472, + "grad_norm": 0.5180695652961731, + "learning_rate": 0.00019855967235869827, + "loss": 1.2472, + "step": 1234 + }, + { + "epoch": 0.2199074074074074, + "grad_norm": 0.4290023744106293, + "learning_rate": 0.00019855730424314167, + "loss": 1.0502, + "step": 1235 + }, + { + "epoch": 0.22008547008547008, + "grad_norm": 0.4418254792690277, + "learning_rate": 0.00019855493419655945, + "loss": 1.0589, + "step": 1236 + }, + { + "epoch": 0.22026353276353278, + "grad_norm": 0.4074663817882538, + "learning_rate": 0.000198552562218998, + "loss": 0.9197, + "step": 1237 + }, + { + "epoch": 0.22044159544159544, + "grad_norm": 0.4526660740375519, + "learning_rate": 0.00019855018831050383, + "loss": 1.2578, + "step": 1238 + }, + { + "epoch": 0.2206196581196581, + "grad_norm": 0.4747827649116516, + "learning_rate": 0.00019854781247112343, + "loss": 1.0841, + "step": 1239 + }, + { + "epoch": 0.2207977207977208, + "grad_norm": 0.41567128896713257, + "learning_rate": 0.00019854543470090334, + "loss": 1.0737, + "step": 
1240 + }, + { + "epoch": 0.22097578347578348, + "grad_norm": 0.4793100953102112, + "learning_rate": 0.00019854305499989022, + "loss": 1.1972, + "step": 1241 + }, + { + "epoch": 0.22115384615384615, + "grad_norm": 0.41755473613739014, + "learning_rate": 0.00019854067336813058, + "loss": 1.2529, + "step": 1242 + }, + { + "epoch": 0.22133190883190884, + "grad_norm": 0.40421152114868164, + "learning_rate": 0.0001985382898056712, + "loss": 1.0549, + "step": 1243 + }, + { + "epoch": 0.2215099715099715, + "grad_norm": 0.45779645442962646, + "learning_rate": 0.0001985359043125587, + "loss": 1.1586, + "step": 1244 + }, + { + "epoch": 0.22168803418803418, + "grad_norm": 0.4380546808242798, + "learning_rate": 0.00019853351688883987, + "loss": 1.1024, + "step": 1245 + }, + { + "epoch": 0.22186609686609687, + "grad_norm": 0.39917269349098206, + "learning_rate": 0.00019853112753456142, + "loss": 0.9823, + "step": 1246 + }, + { + "epoch": 0.22204415954415954, + "grad_norm": 0.4228038489818573, + "learning_rate": 0.00019852873624977022, + "loss": 1.1684, + "step": 1247 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 0.4462146759033203, + "learning_rate": 0.00019852634303451315, + "loss": 0.9027, + "step": 1248 + }, + { + "epoch": 0.2224002849002849, + "grad_norm": 0.5682163834571838, + "learning_rate": 0.000198523947888837, + "loss": 1.141, + "step": 1249 + }, + { + "epoch": 0.22257834757834757, + "grad_norm": 0.44866830110549927, + "learning_rate": 0.0001985215508127888, + "loss": 1.0759, + "step": 1250 + }, + { + "epoch": 0.22275641025641027, + "grad_norm": 0.4034106135368347, + "learning_rate": 0.00019851915180641548, + "loss": 1.0675, + "step": 1251 + }, + { + "epoch": 0.22293447293447294, + "grad_norm": 0.4780726432800293, + "learning_rate": 0.00019851675086976397, + "loss": 1.0283, + "step": 1252 + }, + { + "epoch": 0.2231125356125356, + "grad_norm": 0.48892372846603394, + "learning_rate": 0.00019851434800288145, + "loss": 1.1159, + "step": 1253 + }, + { + "epoch": 
0.2232905982905983, + "grad_norm": 0.42629215121269226, + "learning_rate": 0.0001985119432058149, + "loss": 1.0292, + "step": 1254 + }, + { + "epoch": 0.22346866096866097, + "grad_norm": 0.4496444761753082, + "learning_rate": 0.00019850953647861146, + "loss": 1.0252, + "step": 1255 + }, + { + "epoch": 0.22364672364672364, + "grad_norm": 0.4371408224105835, + "learning_rate": 0.00019850712782131828, + "loss": 1.1104, + "step": 1256 + }, + { + "epoch": 0.22382478632478633, + "grad_norm": 0.4910794496536255, + "learning_rate": 0.00019850471723398258, + "loss": 1.1928, + "step": 1257 + }, + { + "epoch": 0.224002849002849, + "grad_norm": 0.41235068440437317, + "learning_rate": 0.00019850230471665157, + "loss": 1.1261, + "step": 1258 + }, + { + "epoch": 0.22418091168091167, + "grad_norm": 0.4507700502872467, + "learning_rate": 0.0001984998902693725, + "loss": 1.0602, + "step": 1259 + }, + { + "epoch": 0.22435897435897437, + "grad_norm": 0.4654198884963989, + "learning_rate": 0.00019849747389219272, + "loss": 1.1258, + "step": 1260 + }, + { + "epoch": 0.22453703703703703, + "grad_norm": 0.439807653427124, + "learning_rate": 0.00019849505558515952, + "loss": 1.2312, + "step": 1261 + }, + { + "epoch": 0.2247150997150997, + "grad_norm": 0.4309258759021759, + "learning_rate": 0.00019849263534832035, + "loss": 1.0083, + "step": 1262 + }, + { + "epoch": 0.2248931623931624, + "grad_norm": 0.4920141100883484, + "learning_rate": 0.00019849021318172255, + "loss": 1.0254, + "step": 1263 + }, + { + "epoch": 0.22507122507122507, + "grad_norm": 0.5333457589149475, + "learning_rate": 0.00019848778908541367, + "loss": 1.3017, + "step": 1264 + }, + { + "epoch": 0.22524928774928774, + "grad_norm": 0.4096757769584656, + "learning_rate": 0.0001984853630594411, + "loss": 0.9531, + "step": 1265 + }, + { + "epoch": 0.22542735042735043, + "grad_norm": 0.5744075775146484, + "learning_rate": 0.00019848293510385244, + "loss": 1.1414, + "step": 1266 + }, + { + "epoch": 0.2256054131054131, + 
"grad_norm": 0.44707193970680237, + "learning_rate": 0.00019848050521869529, + "loss": 1.1926, + "step": 1267 + }, + { + "epoch": 0.2257834757834758, + "grad_norm": 0.4162999391555786, + "learning_rate": 0.00019847807340401716, + "loss": 1.1354, + "step": 1268 + }, + { + "epoch": 0.22596153846153846, + "grad_norm": 0.4273204207420349, + "learning_rate": 0.0001984756396598658, + "loss": 0.9956, + "step": 1269 + }, + { + "epoch": 0.22613960113960113, + "grad_norm": 0.5670466423034668, + "learning_rate": 0.00019847320398628878, + "loss": 1.2384, + "step": 1270 + }, + { + "epoch": 0.22631766381766383, + "grad_norm": 0.424544095993042, + "learning_rate": 0.00019847076638333395, + "loss": 0.9963, + "step": 1271 + }, + { + "epoch": 0.2264957264957265, + "grad_norm": 0.3716120719909668, + "learning_rate": 0.000198468326851049, + "loss": 0.865, + "step": 1272 + }, + { + "epoch": 0.22667378917378916, + "grad_norm": 0.4472847282886505, + "learning_rate": 0.00019846588538948172, + "loss": 1.174, + "step": 1273 + }, + { + "epoch": 0.22685185185185186, + "grad_norm": 0.4599195718765259, + "learning_rate": 0.00019846344199867994, + "loss": 1.289, + "step": 1274 + }, + { + "epoch": 0.22702991452991453, + "grad_norm": 0.4303213357925415, + "learning_rate": 0.0001984609966786916, + "loss": 1.1606, + "step": 1275 + }, + { + "epoch": 0.2272079772079772, + "grad_norm": 0.44893527030944824, + "learning_rate": 0.00019845854942956455, + "loss": 1.1043, + "step": 1276 + }, + { + "epoch": 0.2273860398860399, + "grad_norm": 0.40033379197120667, + "learning_rate": 0.00019845610025134676, + "loss": 1.1434, + "step": 1277 + }, + { + "epoch": 0.22756410256410256, + "grad_norm": 0.4385402202606201, + "learning_rate": 0.00019845364914408616, + "loss": 0.9943, + "step": 1278 + }, + { + "epoch": 0.22774216524216523, + "grad_norm": 0.42123618721961975, + "learning_rate": 0.0001984511961078309, + "loss": 1.0911, + "step": 1279 + }, + { + "epoch": 0.22792022792022792, + "grad_norm": 0.5558577179908752, 
+ "learning_rate": 0.00019844874114262893, + "loss": 1.3893, + "step": 1280 + }, + { + "epoch": 0.2280982905982906, + "grad_norm": 0.3996453583240509, + "learning_rate": 0.00019844628424852835, + "loss": 0.8951, + "step": 1281 + }, + { + "epoch": 0.2282763532763533, + "grad_norm": 0.3943425714969635, + "learning_rate": 0.0001984438254255774, + "loss": 1.0595, + "step": 1282 + }, + { + "epoch": 0.22845441595441596, + "grad_norm": 0.4429021179676056, + "learning_rate": 0.00019844136467382414, + "loss": 1.0853, + "step": 1283 + }, + { + "epoch": 0.22863247863247863, + "grad_norm": 0.4515686631202698, + "learning_rate": 0.00019843890199331687, + "loss": 1.0829, + "step": 1284 + }, + { + "epoch": 0.22881054131054132, + "grad_norm": 0.5157768726348877, + "learning_rate": 0.00019843643738410378, + "loss": 1.334, + "step": 1285 + }, + { + "epoch": 0.228988603988604, + "grad_norm": 0.45833173394203186, + "learning_rate": 0.0001984339708462332, + "loss": 1.1353, + "step": 1286 + }, + { + "epoch": 0.22916666666666666, + "grad_norm": 0.46610337495803833, + "learning_rate": 0.00019843150237975344, + "loss": 1.1338, + "step": 1287 + }, + { + "epoch": 0.22934472934472935, + "grad_norm": 0.5076978802680969, + "learning_rate": 0.00019842903198471286, + "loss": 1.1811, + "step": 1288 + }, + { + "epoch": 0.22952279202279202, + "grad_norm": 0.4297824800014496, + "learning_rate": 0.00019842655966115986, + "loss": 1.1799, + "step": 1289 + }, + { + "epoch": 0.2297008547008547, + "grad_norm": 0.5304586291313171, + "learning_rate": 0.0001984240854091429, + "loss": 1.1315, + "step": 1290 + }, + { + "epoch": 0.22987891737891739, + "grad_norm": 0.45359212160110474, + "learning_rate": 0.00019842160922871042, + "loss": 1.1037, + "step": 1291 + }, + { + "epoch": 0.23005698005698005, + "grad_norm": 0.4416881203651428, + "learning_rate": 0.00019841913111991096, + "loss": 1.122, + "step": 1292 + }, + { + "epoch": 0.23023504273504272, + "grad_norm": 0.46682995557785034, + "learning_rate": 
0.0001984166510827931, + "loss": 0.9808, + "step": 1293 + }, + { + "epoch": 0.23041310541310542, + "grad_norm": 0.44172337651252747, + "learning_rate": 0.00019841416911740538, + "loss": 0.9167, + "step": 1294 + }, + { + "epoch": 0.23059116809116809, + "grad_norm": 0.40562742948532104, + "learning_rate": 0.0001984116852237965, + "loss": 0.9547, + "step": 1295 + }, + { + "epoch": 0.23076923076923078, + "grad_norm": 0.4040384888648987, + "learning_rate": 0.00019840919940201503, + "loss": 1.1039, + "step": 1296 + }, + { + "epoch": 0.23094729344729345, + "grad_norm": 0.5094077587127686, + "learning_rate": 0.00019840671165210973, + "loss": 1.2283, + "step": 1297 + }, + { + "epoch": 0.23112535612535612, + "grad_norm": 0.48553213477134705, + "learning_rate": 0.00019840422197412938, + "loss": 1.0927, + "step": 1298 + }, + { + "epoch": 0.23130341880341881, + "grad_norm": 0.5197509527206421, + "learning_rate": 0.00019840173036812266, + "loss": 1.2154, + "step": 1299 + }, + { + "epoch": 0.23148148148148148, + "grad_norm": 0.42069005966186523, + "learning_rate": 0.0001983992368341385, + "loss": 1.0076, + "step": 1300 + }, + { + "epoch": 0.23165954415954415, + "grad_norm": 0.475204735994339, + "learning_rate": 0.00019839674137222567, + "loss": 1.1682, + "step": 1301 + }, + { + "epoch": 0.23183760683760685, + "grad_norm": 0.55730140209198, + "learning_rate": 0.0001983942439824331, + "loss": 1.2948, + "step": 1302 + }, + { + "epoch": 0.23201566951566951, + "grad_norm": 0.4533313512802124, + "learning_rate": 0.00019839174466480973, + "loss": 1.2691, + "step": 1303 + }, + { + "epoch": 0.23219373219373218, + "grad_norm": 0.4733520746231079, + "learning_rate": 0.0001983892434194045, + "loss": 1.2232, + "step": 1304 + }, + { + "epoch": 0.23237179487179488, + "grad_norm": 0.5085756182670593, + "learning_rate": 0.00019838674024626643, + "loss": 1.1347, + "step": 1305 + }, + { + "epoch": 0.23254985754985755, + "grad_norm": 0.4679976999759674, + "learning_rate": 0.00019838423514544456, + 
"loss": 1.0018, + "step": 1306 + }, + { + "epoch": 0.23272792022792022, + "grad_norm": 0.4234481751918793, + "learning_rate": 0.00019838172811698795, + "loss": 1.0472, + "step": 1307 + }, + { + "epoch": 0.2329059829059829, + "grad_norm": 0.5749204158782959, + "learning_rate": 0.00019837921916094579, + "loss": 1.2239, + "step": 1308 + }, + { + "epoch": 0.23308404558404558, + "grad_norm": 0.46715882420539856, + "learning_rate": 0.0001983767082773672, + "loss": 1.1924, + "step": 1309 + }, + { + "epoch": 0.23326210826210828, + "grad_norm": 0.5079745054244995, + "learning_rate": 0.00019837419546630137, + "loss": 1.1086, + "step": 1310 + }, + { + "epoch": 0.23344017094017094, + "grad_norm": 0.4419243037700653, + "learning_rate": 0.0001983716807277975, + "loss": 1.1911, + "step": 1311 + }, + { + "epoch": 0.2336182336182336, + "grad_norm": 0.5107570290565491, + "learning_rate": 0.00019836916406190493, + "loss": 1.1071, + "step": 1312 + }, + { + "epoch": 0.2337962962962963, + "grad_norm": 0.5295659303665161, + "learning_rate": 0.00019836664546867293, + "loss": 1.2905, + "step": 1313 + }, + { + "epoch": 0.23397435897435898, + "grad_norm": 0.4844837784767151, + "learning_rate": 0.00019836412494815084, + "loss": 1.3507, + "step": 1314 + }, + { + "epoch": 0.23415242165242164, + "grad_norm": 0.6166049242019653, + "learning_rate": 0.00019836160250038808, + "loss": 1.2822, + "step": 1315 + }, + { + "epoch": 0.23433048433048434, + "grad_norm": 0.3229198753833771, + "learning_rate": 0.00019835907812543402, + "loss": 0.4959, + "step": 1316 + }, + { + "epoch": 0.234508547008547, + "grad_norm": 0.5788772702217102, + "learning_rate": 0.00019835655182333815, + "loss": 1.0832, + "step": 1317 + }, + { + "epoch": 0.23468660968660968, + "grad_norm": 0.525705099105835, + "learning_rate": 0.00019835402359414997, + "loss": 1.0968, + "step": 1318 + }, + { + "epoch": 0.23486467236467237, + "grad_norm": 0.5007779002189636, + "learning_rate": 0.000198351493437919, + "loss": 1.2788, + "step": 1319 + 
}, + { + "epoch": 0.23504273504273504, + "grad_norm": 0.4276871383190155, + "learning_rate": 0.00019834896135469484, + "loss": 1.0419, + "step": 1320 + }, + { + "epoch": 0.2352207977207977, + "grad_norm": 0.5359070301055908, + "learning_rate": 0.00019834642734452708, + "loss": 1.1308, + "step": 1321 + }, + { + "epoch": 0.2353988603988604, + "grad_norm": 0.4854908883571625, + "learning_rate": 0.0001983438914074654, + "loss": 1.1211, + "step": 1322 + }, + { + "epoch": 0.23557692307692307, + "grad_norm": 0.4913707375526428, + "learning_rate": 0.0001983413535435594, + "loss": 1.2392, + "step": 1323 + }, + { + "epoch": 0.23575498575498577, + "grad_norm": 0.46755748987197876, + "learning_rate": 0.0001983388137528589, + "loss": 0.9348, + "step": 1324 + }, + { + "epoch": 0.23593304843304844, + "grad_norm": 0.4592570960521698, + "learning_rate": 0.0001983362720354136, + "loss": 1.1339, + "step": 1325 + }, + { + "epoch": 0.2361111111111111, + "grad_norm": 0.5121711492538452, + "learning_rate": 0.00019833372839127335, + "loss": 1.2973, + "step": 1326 + }, + { + "epoch": 0.2362891737891738, + "grad_norm": 0.4809017479419708, + "learning_rate": 0.000198331182820488, + "loss": 0.9849, + "step": 1327 + }, + { + "epoch": 0.23646723646723647, + "grad_norm": 0.42340895533561707, + "learning_rate": 0.00019832863532310733, + "loss": 1.0731, + "step": 1328 + }, + { + "epoch": 0.23664529914529914, + "grad_norm": 0.5388045310974121, + "learning_rate": 0.00019832608589918135, + "loss": 1.0729, + "step": 1329 + }, + { + "epoch": 0.23682336182336183, + "grad_norm": 0.43075770139694214, + "learning_rate": 0.00019832353454875992, + "loss": 1.1684, + "step": 1330 + }, + { + "epoch": 0.2370014245014245, + "grad_norm": 0.554927408695221, + "learning_rate": 0.00019832098127189313, + "loss": 1.0842, + "step": 1331 + }, + { + "epoch": 0.23717948717948717, + "grad_norm": 0.5359260439872742, + "learning_rate": 0.0001983184260686309, + "loss": 1.2399, + "step": 1332 + }, + { + "epoch": 
0.23735754985754987, + "grad_norm": 0.5141251087188721, + "learning_rate": 0.0001983158689390234, + "loss": 1.3752, + "step": 1333 + }, + { + "epoch": 0.23753561253561253, + "grad_norm": 0.4578750431537628, + "learning_rate": 0.00019831330988312067, + "loss": 1.0965, + "step": 1334 + }, + { + "epoch": 0.2377136752136752, + "grad_norm": 0.47974497079849243, + "learning_rate": 0.00019831074890097286, + "loss": 1.3379, + "step": 1335 + }, + { + "epoch": 0.2378917378917379, + "grad_norm": 0.4618176817893982, + "learning_rate": 0.00019830818599263014, + "loss": 1.274, + "step": 1336 + }, + { + "epoch": 0.23806980056980057, + "grad_norm": 0.4279816448688507, + "learning_rate": 0.00019830562115814276, + "loss": 0.996, + "step": 1337 + }, + { + "epoch": 0.23824786324786323, + "grad_norm": 0.4255026876926422, + "learning_rate": 0.0001983030543975609, + "loss": 0.969, + "step": 1338 + }, + { + "epoch": 0.23842592592592593, + "grad_norm": 0.4551412761211395, + "learning_rate": 0.00019830048571093493, + "loss": 1.0204, + "step": 1339 + }, + { + "epoch": 0.2386039886039886, + "grad_norm": 0.4747903048992157, + "learning_rate": 0.00019829791509831513, + "loss": 1.1816, + "step": 1340 + }, + { + "epoch": 0.2387820512820513, + "grad_norm": 0.47187140583992004, + "learning_rate": 0.00019829534255975188, + "loss": 1.1205, + "step": 1341 + }, + { + "epoch": 0.23896011396011396, + "grad_norm": 0.49332180619239807, + "learning_rate": 0.0001982927680952956, + "loss": 1.2657, + "step": 1342 + }, + { + "epoch": 0.23913817663817663, + "grad_norm": 0.5162837505340576, + "learning_rate": 0.0001982901917049967, + "loss": 1.2247, + "step": 1343 + }, + { + "epoch": 0.23931623931623933, + "grad_norm": 0.43407055735588074, + "learning_rate": 0.0001982876133889057, + "loss": 1.0038, + "step": 1344 + }, + { + "epoch": 0.239494301994302, + "grad_norm": 0.5132251977920532, + "learning_rate": 0.00019828503314707306, + "loss": 1.0678, + "step": 1345 + }, + { + "epoch": 0.23967236467236466, + 
"grad_norm": 0.46295464038848877, + "learning_rate": 0.00019828245097954937, + "loss": 1.1802, + "step": 1346 + }, + { + "epoch": 0.23985042735042736, + "grad_norm": 0.4682658314704895, + "learning_rate": 0.00019827986688638523, + "loss": 1.0249, + "step": 1347 + }, + { + "epoch": 0.24002849002849003, + "grad_norm": 0.49990561604499817, + "learning_rate": 0.00019827728086763125, + "loss": 1.0691, + "step": 1348 + }, + { + "epoch": 0.2402065527065527, + "grad_norm": 0.39090847969055176, + "learning_rate": 0.00019827469292333806, + "loss": 0.8367, + "step": 1349 + }, + { + "epoch": 0.2403846153846154, + "grad_norm": 0.5023905634880066, + "learning_rate": 0.00019827210305355645, + "loss": 1.0675, + "step": 1350 + }, + { + "epoch": 0.24056267806267806, + "grad_norm": 0.4744076430797577, + "learning_rate": 0.00019826951125833715, + "loss": 1.3166, + "step": 1351 + }, + { + "epoch": 0.24074074074074073, + "grad_norm": 0.44914689660072327, + "learning_rate": 0.00019826691753773088, + "loss": 0.9818, + "step": 1352 + }, + { + "epoch": 0.24091880341880342, + "grad_norm": 0.44391971826553345, + "learning_rate": 0.00019826432189178853, + "loss": 1.0448, + "step": 1353 + }, + { + "epoch": 0.2410968660968661, + "grad_norm": 0.46102839708328247, + "learning_rate": 0.00019826172432056086, + "loss": 0.9952, + "step": 1354 + }, + { + "epoch": 0.2412749287749288, + "grad_norm": 0.4796878695487976, + "learning_rate": 0.00019825912482409884, + "loss": 1.0977, + "step": 1355 + }, + { + "epoch": 0.24145299145299146, + "grad_norm": 0.5003768801689148, + "learning_rate": 0.0001982565234024534, + "loss": 1.3149, + "step": 1356 + }, + { + "epoch": 0.24163105413105412, + "grad_norm": 0.43475663661956787, + "learning_rate": 0.00019825392005567551, + "loss": 1.0527, + "step": 1357 + }, + { + "epoch": 0.24180911680911682, + "grad_norm": 0.46120527386665344, + "learning_rate": 0.00019825131478381613, + "loss": 1.2333, + "step": 1358 + }, + { + "epoch": 0.2419871794871795, + "grad_norm": 
0.43748101592063904, + "learning_rate": 0.00019824870758692638, + "loss": 0.9788, + "step": 1359 + }, + { + "epoch": 0.24216524216524216, + "grad_norm": 0.5275192856788635, + "learning_rate": 0.00019824609846505727, + "loss": 1.1473, + "step": 1360 + }, + { + "epoch": 0.24234330484330485, + "grad_norm": 0.346463143825531, + "learning_rate": 0.00019824348741825993, + "loss": 0.6824, + "step": 1361 + }, + { + "epoch": 0.24252136752136752, + "grad_norm": 0.5004115700721741, + "learning_rate": 0.00019824087444658556, + "loss": 1.1853, + "step": 1362 + }, + { + "epoch": 0.2426994301994302, + "grad_norm": 0.42746666073799133, + "learning_rate": 0.00019823825955008533, + "loss": 0.9355, + "step": 1363 + }, + { + "epoch": 0.24287749287749288, + "grad_norm": 0.4099743068218231, + "learning_rate": 0.00019823564272881047, + "loss": 1.0753, + "step": 1364 + }, + { + "epoch": 0.24305555555555555, + "grad_norm": 0.5262967944145203, + "learning_rate": 0.00019823302398281226, + "loss": 1.2324, + "step": 1365 + }, + { + "epoch": 0.24323361823361822, + "grad_norm": 0.436069518327713, + "learning_rate": 0.000198230403312142, + "loss": 1.1887, + "step": 1366 + }, + { + "epoch": 0.24341168091168092, + "grad_norm": 0.38252368569374084, + "learning_rate": 0.00019822778071685107, + "loss": 1.0211, + "step": 1367 + }, + { + "epoch": 0.24358974358974358, + "grad_norm": 0.48024141788482666, + "learning_rate": 0.00019822515619699081, + "loss": 1.065, + "step": 1368 + }, + { + "epoch": 0.24376780626780628, + "grad_norm": 0.47421589493751526, + "learning_rate": 0.00019822252975261267, + "loss": 1.0433, + "step": 1369 + }, + { + "epoch": 0.24394586894586895, + "grad_norm": 0.46094807982444763, + "learning_rate": 0.00019821990138376808, + "loss": 1.1427, + "step": 1370 + }, + { + "epoch": 0.24412393162393162, + "grad_norm": 0.5093680620193481, + "learning_rate": 0.00019821727109050856, + "loss": 1.1086, + "step": 1371 + }, + { + "epoch": 0.2443019943019943, + "grad_norm": 0.41084879636764526, + 
"learning_rate": 0.00019821463887288566, + "loss": 1.0068, + "step": 1372 + }, + { + "epoch": 0.24448005698005698, + "grad_norm": 0.4991084635257721, + "learning_rate": 0.0001982120047309509, + "loss": 1.1884, + "step": 1373 + }, + { + "epoch": 0.24465811965811965, + "grad_norm": 0.39198383688926697, + "learning_rate": 0.00019820936866475595, + "loss": 0.9776, + "step": 1374 + }, + { + "epoch": 0.24483618233618235, + "grad_norm": 0.4517424702644348, + "learning_rate": 0.00019820673067435244, + "loss": 1.1491, + "step": 1375 + }, + { + "epoch": 0.245014245014245, + "grad_norm": 0.45881983637809753, + "learning_rate": 0.00019820409075979202, + "loss": 1.1198, + "step": 1376 + }, + { + "epoch": 0.24519230769230768, + "grad_norm": 0.4498792290687561, + "learning_rate": 0.00019820144892112646, + "loss": 1.0897, + "step": 1377 + }, + { + "epoch": 0.24537037037037038, + "grad_norm": 0.4128037393093109, + "learning_rate": 0.00019819880515840752, + "loss": 0.9415, + "step": 1378 + }, + { + "epoch": 0.24554843304843305, + "grad_norm": 0.4340885281562805, + "learning_rate": 0.00019819615947168698, + "loss": 1.201, + "step": 1379 + }, + { + "epoch": 0.24572649572649571, + "grad_norm": 0.43814027309417725, + "learning_rate": 0.00019819351186101667, + "loss": 1.1039, + "step": 1380 + }, + { + "epoch": 0.2459045584045584, + "grad_norm": 0.40115082263946533, + "learning_rate": 0.00019819086232644845, + "loss": 1.2599, + "step": 1381 + }, + { + "epoch": 0.24608262108262108, + "grad_norm": 0.4947351813316345, + "learning_rate": 0.00019818821086803426, + "loss": 1.252, + "step": 1382 + }, + { + "epoch": 0.24626068376068377, + "grad_norm": 0.45179441571235657, + "learning_rate": 0.0001981855574858261, + "loss": 1.1323, + "step": 1383 + }, + { + "epoch": 0.24643874643874644, + "grad_norm": 0.47159844636917114, + "learning_rate": 0.00019818290217987587, + "loss": 1.2053, + "step": 1384 + }, + { + "epoch": 0.2466168091168091, + "grad_norm": 0.4358448386192322, + "learning_rate": 
0.0001981802449502356, + "loss": 1.1174, + "step": 1385 + }, + { + "epoch": 0.2467948717948718, + "grad_norm": 0.4588233530521393, + "learning_rate": 0.00019817758579695745, + "loss": 1.1098, + "step": 1386 + }, + { + "epoch": 0.24697293447293447, + "grad_norm": 0.4955112636089325, + "learning_rate": 0.00019817492472009338, + "loss": 1.258, + "step": 1387 + }, + { + "epoch": 0.24715099715099714, + "grad_norm": 0.4226941764354706, + "learning_rate": 0.00019817226171969565, + "loss": 1.0976, + "step": 1388 + }, + { + "epoch": 0.24732905982905984, + "grad_norm": 0.4076840579509735, + "learning_rate": 0.00019816959679581637, + "loss": 1.0121, + "step": 1389 + }, + { + "epoch": 0.2475071225071225, + "grad_norm": 0.4395063519477844, + "learning_rate": 0.0001981669299485078, + "loss": 1.3153, + "step": 1390 + }, + { + "epoch": 0.24768518518518517, + "grad_norm": 0.41010400652885437, + "learning_rate": 0.0001981642611778221, + "loss": 1.0717, + "step": 1391 + }, + { + "epoch": 0.24786324786324787, + "grad_norm": 0.43459352850914, + "learning_rate": 0.00019816159048381167, + "loss": 1.1077, + "step": 1392 + }, + { + "epoch": 0.24804131054131054, + "grad_norm": 0.46291449666023254, + "learning_rate": 0.00019815891786652875, + "loss": 1.0257, + "step": 1393 + }, + { + "epoch": 0.2482193732193732, + "grad_norm": 0.46408146619796753, + "learning_rate": 0.00019815624332602578, + "loss": 0.7899, + "step": 1394 + }, + { + "epoch": 0.2483974358974359, + "grad_norm": 0.4763357937335968, + "learning_rate": 0.00019815356686235508, + "loss": 0.9857, + "step": 1395 + }, + { + "epoch": 0.24857549857549857, + "grad_norm": 0.4766457676887512, + "learning_rate": 0.00019815088847556918, + "loss": 1.0589, + "step": 1396 + }, + { + "epoch": 0.24875356125356127, + "grad_norm": 0.4486583173274994, + "learning_rate": 0.0001981482081657205, + "loss": 1.2572, + "step": 1397 + }, + { + "epoch": 0.24893162393162394, + "grad_norm": 0.468878835439682, + "learning_rate": 0.00019814552593286155, + 
"loss": 1.101, + "step": 1398 + }, + { + "epoch": 0.2491096866096866, + "grad_norm": 0.4230278730392456, + "learning_rate": 0.0001981428417770449, + "loss": 0.9457, + "step": 1399 + }, + { + "epoch": 0.2492877492877493, + "grad_norm": 0.45630761981010437, + "learning_rate": 0.00019814015569832315, + "loss": 1.0665, + "step": 1400 + }, + { + "epoch": 0.24946581196581197, + "grad_norm": 0.5780113935470581, + "learning_rate": 0.00019813746769674893, + "loss": 1.1064, + "step": 1401 + }, + { + "epoch": 0.24964387464387464, + "grad_norm": 0.4343436658382416, + "learning_rate": 0.0001981347777723749, + "loss": 1.1132, + "step": 1402 + }, + { + "epoch": 0.24982193732193733, + "grad_norm": 0.4879056513309479, + "learning_rate": 0.0001981320859252537, + "loss": 1.1301, + "step": 1403 + }, + { + "epoch": 0.25, + "grad_norm": 0.5248328447341919, + "learning_rate": 0.00019812939215543818, + "loss": 1.1468, + "step": 1404 + }, + { + "epoch": 0.25, + "eval_loss": 1.115895390510559, + "eval_runtime": 25.0474, + "eval_samples_per_second": 41.561, + "eval_steps_per_second": 20.801, + "step": 1404 + }, + { + "epoch": 0.2501780626780627, + "grad_norm": 0.5076769590377808, + "learning_rate": 0.00019812669646298106, + "loss": 1.1428, + "step": 1405 + }, + { + "epoch": 0.25035612535612534, + "grad_norm": 0.5510252714157104, + "learning_rate": 0.00019812399884793514, + "loss": 1.3383, + "step": 1406 + }, + { + "epoch": 0.25053418803418803, + "grad_norm": 0.48918986320495605, + "learning_rate": 0.0001981212993103533, + "loss": 1.1507, + "step": 1407 + }, + { + "epoch": 0.25071225071225073, + "grad_norm": 0.4678935110569, + "learning_rate": 0.00019811859785028846, + "loss": 1.13, + "step": 1408 + }, + { + "epoch": 0.25089031339031337, + "grad_norm": 0.5155254602432251, + "learning_rate": 0.0001981158944677935, + "loss": 1.1194, + "step": 1409 + }, + { + "epoch": 0.25106837606837606, + "grad_norm": 0.4533839523792267, + "learning_rate": 0.00019811318916292142, + "loss": 0.9464, + "step": 
1410 + }, + { + "epoch": 0.25124643874643876, + "grad_norm": 0.5142433047294617, + "learning_rate": 0.00019811048193572517, + "loss": 1.0837, + "step": 1411 + }, + { + "epoch": 0.2514245014245014, + "grad_norm": 0.4330446124076843, + "learning_rate": 0.00019810777278625788, + "loss": 0.9117, + "step": 1412 + }, + { + "epoch": 0.2516025641025641, + "grad_norm": 0.44806256890296936, + "learning_rate": 0.00019810506171457254, + "loss": 1.1643, + "step": 1413 + }, + { + "epoch": 0.2517806267806268, + "grad_norm": 0.43526285886764526, + "learning_rate": 0.00019810234872072235, + "loss": 0.9776, + "step": 1414 + }, + { + "epoch": 0.25195868945868943, + "grad_norm": 0.47394511103630066, + "learning_rate": 0.00019809963380476039, + "loss": 1.0935, + "step": 1415 + }, + { + "epoch": 0.25213675213675213, + "grad_norm": 0.48961278796195984, + "learning_rate": 0.00019809691696673993, + "loss": 1.179, + "step": 1416 + }, + { + "epoch": 0.2523148148148148, + "grad_norm": 0.43153589963912964, + "learning_rate": 0.00019809419820671412, + "loss": 0.906, + "step": 1417 + }, + { + "epoch": 0.25249287749287747, + "grad_norm": 0.41187527775764465, + "learning_rate": 0.00019809147752473632, + "loss": 0.899, + "step": 1418 + }, + { + "epoch": 0.25267094017094016, + "grad_norm": 0.5003183484077454, + "learning_rate": 0.00019808875492085973, + "loss": 1.0606, + "step": 1419 + }, + { + "epoch": 0.25284900284900286, + "grad_norm": 0.4430316984653473, + "learning_rate": 0.00019808603039513778, + "loss": 0.9167, + "step": 1420 + }, + { + "epoch": 0.25302706552706555, + "grad_norm": 0.4577699601650238, + "learning_rate": 0.00019808330394762382, + "loss": 1.1184, + "step": 1421 + }, + { + "epoch": 0.2532051282051282, + "grad_norm": 0.42656826972961426, + "learning_rate": 0.0001980805755783713, + "loss": 0.9335, + "step": 1422 + }, + { + "epoch": 0.2533831908831909, + "grad_norm": 0.40980881452560425, + "learning_rate": 0.0001980778452874336, + "loss": 0.9756, + "step": 1423 + }, + { + "epoch": 
0.2535612535612536, + "grad_norm": 0.5752090811729431, + "learning_rate": 0.00019807511307486423, + "loss": 1.1694, + "step": 1424 + }, + { + "epoch": 0.2537393162393162, + "grad_norm": 0.5000349283218384, + "learning_rate": 0.00019807237894071681, + "loss": 0.9515, + "step": 1425 + }, + { + "epoch": 0.2539173789173789, + "grad_norm": 0.5159069299697876, + "learning_rate": 0.00019806964288504483, + "loss": 1.4014, + "step": 1426 + }, + { + "epoch": 0.2540954415954416, + "grad_norm": 0.5377941131591797, + "learning_rate": 0.00019806690490790194, + "loss": 1.2832, + "step": 1427 + }, + { + "epoch": 0.25427350427350426, + "grad_norm": 0.4565938711166382, + "learning_rate": 0.00019806416500934174, + "loss": 1.0629, + "step": 1428 + }, + { + "epoch": 0.25445156695156695, + "grad_norm": 0.49867144227027893, + "learning_rate": 0.00019806142318941797, + "loss": 1.2011, + "step": 1429 + }, + { + "epoch": 0.25462962962962965, + "grad_norm": 0.5111994743347168, + "learning_rate": 0.00019805867944818427, + "loss": 0.8925, + "step": 1430 + }, + { + "epoch": 0.2548076923076923, + "grad_norm": 0.5204268097877502, + "learning_rate": 0.00019805593378569448, + "loss": 1.2956, + "step": 1431 + }, + { + "epoch": 0.254985754985755, + "grad_norm": 0.3889026939868927, + "learning_rate": 0.00019805318620200234, + "loss": 1.0355, + "step": 1432 + }, + { + "epoch": 0.2551638176638177, + "grad_norm": 0.46825656294822693, + "learning_rate": 0.00019805043669716174, + "loss": 1.0444, + "step": 1433 + }, + { + "epoch": 0.2553418803418803, + "grad_norm": 0.4509420394897461, + "learning_rate": 0.00019804768527122648, + "loss": 1.0423, + "step": 1434 + }, + { + "epoch": 0.255519943019943, + "grad_norm": 0.4514774978160858, + "learning_rate": 0.0001980449319242505, + "loss": 1.1588, + "step": 1435 + }, + { + "epoch": 0.2556980056980057, + "grad_norm": 0.43019044399261475, + "learning_rate": 0.0001980421766562878, + "loss": 0.9939, + "step": 1436 + }, + { + "epoch": 0.25587606837606836, + 
"grad_norm": 0.5056091547012329, + "learning_rate": 0.00019803941946739228, + "loss": 1.1238, + "step": 1437 + }, + { + "epoch": 0.25605413105413105, + "grad_norm": 0.48664605617523193, + "learning_rate": 0.000198036660357618, + "loss": 1.0702, + "step": 1438 + }, + { + "epoch": 0.25623219373219375, + "grad_norm": 0.4500972032546997, + "learning_rate": 0.000198033899327019, + "loss": 0.9365, + "step": 1439 + }, + { + "epoch": 0.2564102564102564, + "grad_norm": 0.4800589382648468, + "learning_rate": 0.0001980311363756494, + "loss": 1.1159, + "step": 1440 + }, + { + "epoch": 0.2565883190883191, + "grad_norm": 0.3486495316028595, + "learning_rate": 0.0001980283715035633, + "loss": 0.6029, + "step": 1441 + }, + { + "epoch": 0.2567663817663818, + "grad_norm": 0.46258702874183655, + "learning_rate": 0.00019802560471081493, + "loss": 1.025, + "step": 1442 + }, + { + "epoch": 0.2569444444444444, + "grad_norm": 0.4846673607826233, + "learning_rate": 0.00019802283599745844, + "loss": 1.1105, + "step": 1443 + }, + { + "epoch": 0.2571225071225071, + "grad_norm": 0.4586990475654602, + "learning_rate": 0.00019802006536354813, + "loss": 0.9897, + "step": 1444 + }, + { + "epoch": 0.2573005698005698, + "grad_norm": 0.5177786350250244, + "learning_rate": 0.00019801729280913825, + "loss": 1.2558, + "step": 1445 + }, + { + "epoch": 0.25747863247863245, + "grad_norm": 0.43213751912117004, + "learning_rate": 0.00019801451833428312, + "loss": 1.0961, + "step": 1446 + }, + { + "epoch": 0.25765669515669515, + "grad_norm": 0.42974478006362915, + "learning_rate": 0.00019801174193903714, + "loss": 1.0659, + "step": 1447 + }, + { + "epoch": 0.25783475783475784, + "grad_norm": 0.4424504339694977, + "learning_rate": 0.00019800896362345464, + "loss": 0.9805, + "step": 1448 + }, + { + "epoch": 0.25801282051282054, + "grad_norm": 0.4734833836555481, + "learning_rate": 0.0001980061833875901, + "loss": 1.255, + "step": 1449 + }, + { + "epoch": 0.2581908831908832, + "grad_norm": 0.41024845838546753, + 
"learning_rate": 0.000198003401231498, + "loss": 1.0908, + "step": 1450 + }, + { + "epoch": 0.2583689458689459, + "grad_norm": 0.43603816628456116, + "learning_rate": 0.00019800061715523283, + "loss": 1.0611, + "step": 1451 + }, + { + "epoch": 0.25854700854700857, + "grad_norm": 0.4871339499950409, + "learning_rate": 0.00019799783115884915, + "loss": 1.1851, + "step": 1452 + }, + { + "epoch": 0.2587250712250712, + "grad_norm": 0.49758270382881165, + "learning_rate": 0.00019799504324240157, + "loss": 1.1936, + "step": 1453 + }, + { + "epoch": 0.2589031339031339, + "grad_norm": 0.4201010763645172, + "learning_rate": 0.00019799225340594466, + "loss": 1.1567, + "step": 1454 + }, + { + "epoch": 0.2590811965811966, + "grad_norm": 0.4200313091278076, + "learning_rate": 0.00019798946164953309, + "loss": 0.9666, + "step": 1455 + }, + { + "epoch": 0.25925925925925924, + "grad_norm": 0.43001702427864075, + "learning_rate": 0.0001979866679732216, + "loss": 1.0104, + "step": 1456 + }, + { + "epoch": 0.25943732193732194, + "grad_norm": 0.46733465790748596, + "learning_rate": 0.0001979838723770649, + "loss": 1.0927, + "step": 1457 + }, + { + "epoch": 0.25961538461538464, + "grad_norm": 0.4513280391693115, + "learning_rate": 0.00019798107486111773, + "loss": 1.0282, + "step": 1458 + }, + { + "epoch": 0.2597934472934473, + "grad_norm": 0.40411749482154846, + "learning_rate": 0.00019797827542543495, + "loss": 1.0789, + "step": 1459 + }, + { + "epoch": 0.25997150997151, + "grad_norm": 0.4359099268913269, + "learning_rate": 0.0001979754740700714, + "loss": 1.0616, + "step": 1460 + }, + { + "epoch": 0.26014957264957267, + "grad_norm": 0.4979047477245331, + "learning_rate": 0.00019797267079508198, + "loss": 1.2948, + "step": 1461 + }, + { + "epoch": 0.2603276353276353, + "grad_norm": 0.44698619842529297, + "learning_rate": 0.0001979698656005216, + "loss": 0.9198, + "step": 1462 + }, + { + "epoch": 0.260505698005698, + "grad_norm": 0.48437631130218506, + "learning_rate": 
0.00019796705848644516, + "loss": 1.3207, + "step": 1463 + }, + { + "epoch": 0.2606837606837607, + "grad_norm": 0.4382587671279907, + "learning_rate": 0.00019796424945290778, + "loss": 1.1315, + "step": 1464 + }, + { + "epoch": 0.26086182336182334, + "grad_norm": 0.4565944969654083, + "learning_rate": 0.0001979614384999644, + "loss": 1.1893, + "step": 1465 + }, + { + "epoch": 0.26103988603988604, + "grad_norm": 0.4705163836479187, + "learning_rate": 0.00019795862562767017, + "loss": 1.1132, + "step": 1466 + }, + { + "epoch": 0.26121794871794873, + "grad_norm": 0.525184690952301, + "learning_rate": 0.00019795581083608012, + "loss": 1.2111, + "step": 1467 + }, + { + "epoch": 0.2613960113960114, + "grad_norm": 0.45215457677841187, + "learning_rate": 0.00019795299412524945, + "loss": 1.1851, + "step": 1468 + }, + { + "epoch": 0.26157407407407407, + "grad_norm": 0.4336663484573364, + "learning_rate": 0.00019795017549523335, + "loss": 1.0147, + "step": 1469 + }, + { + "epoch": 0.26175213675213677, + "grad_norm": 0.5327649712562561, + "learning_rate": 0.00019794735494608703, + "loss": 1.1743, + "step": 1470 + }, + { + "epoch": 0.2619301994301994, + "grad_norm": 0.49972307682037354, + "learning_rate": 0.00019794453247786578, + "loss": 1.1624, + "step": 1471 + }, + { + "epoch": 0.2621082621082621, + "grad_norm": 0.43475785851478577, + "learning_rate": 0.00019794170809062485, + "loss": 0.9888, + "step": 1472 + }, + { + "epoch": 0.2622863247863248, + "grad_norm": 0.428838849067688, + "learning_rate": 0.0001979388817844196, + "loss": 0.9154, + "step": 1473 + }, + { + "epoch": 0.26246438746438744, + "grad_norm": 0.508568286895752, + "learning_rate": 0.00019793605355930544, + "loss": 1.1679, + "step": 1474 + }, + { + "epoch": 0.26264245014245013, + "grad_norm": 0.47791770100593567, + "learning_rate": 0.00019793322341533776, + "loss": 1.1375, + "step": 1475 + }, + { + "epoch": 0.26282051282051283, + "grad_norm": 0.41909220814704895, + "learning_rate": 0.00019793039135257196, + 
"loss": 1.0235, + "step": 1476 + }, + { + "epoch": 0.26299857549857547, + "grad_norm": 0.5564408302307129, + "learning_rate": 0.00019792755737106361, + "loss": 1.0756, + "step": 1477 + }, + { + "epoch": 0.26317663817663817, + "grad_norm": 0.42813625931739807, + "learning_rate": 0.0001979247214708682, + "loss": 0.8213, + "step": 1478 + }, + { + "epoch": 0.26335470085470086, + "grad_norm": 0.44495970010757446, + "learning_rate": 0.00019792188365204126, + "loss": 0.9654, + "step": 1479 + }, + { + "epoch": 0.26353276353276356, + "grad_norm": 0.47473424673080444, + "learning_rate": 0.00019791904391463846, + "loss": 1.1643, + "step": 1480 + }, + { + "epoch": 0.2637108262108262, + "grad_norm": 0.40189051628112793, + "learning_rate": 0.0001979162022587154, + "loss": 0.8687, + "step": 1481 + }, + { + "epoch": 0.2638888888888889, + "grad_norm": 0.44629937410354614, + "learning_rate": 0.00019791335868432776, + "loss": 1.0284, + "step": 1482 + }, + { + "epoch": 0.2640669515669516, + "grad_norm": 0.511275053024292, + "learning_rate": 0.00019791051319153124, + "loss": 1.2217, + "step": 1483 + }, + { + "epoch": 0.26424501424501423, + "grad_norm": 0.5136445164680481, + "learning_rate": 0.00019790766578038163, + "loss": 1.1129, + "step": 1484 + }, + { + "epoch": 0.2644230769230769, + "grad_norm": 0.4450451135635376, + "learning_rate": 0.00019790481645093469, + "loss": 0.9912, + "step": 1485 + }, + { + "epoch": 0.2646011396011396, + "grad_norm": 0.39455199241638184, + "learning_rate": 0.00019790196520324621, + "loss": 1.0887, + "step": 1486 + }, + { + "epoch": 0.26477920227920226, + "grad_norm": 0.4444045126438141, + "learning_rate": 0.00019789911203737216, + "loss": 1.1559, + "step": 1487 + }, + { + "epoch": 0.26495726495726496, + "grad_norm": 0.4769677221775055, + "learning_rate": 0.0001978962569533683, + "loss": 1.147, + "step": 1488 + }, + { + "epoch": 0.26513532763532766, + "grad_norm": 0.40226617455482483, + "learning_rate": 0.0001978933999512907, + "loss": 1.0966, + "step": 
1489 + }, + { + "epoch": 0.2653133903133903, + "grad_norm": 0.4640974700450897, + "learning_rate": 0.00019789054103119526, + "loss": 1.1002, + "step": 1490 + }, + { + "epoch": 0.265491452991453, + "grad_norm": 0.48251107335090637, + "learning_rate": 0.00019788768019313806, + "loss": 1.07, + "step": 1491 + }, + { + "epoch": 0.2656695156695157, + "grad_norm": 0.4836949408054352, + "learning_rate": 0.00019788481743717506, + "loss": 1.2992, + "step": 1492 + }, + { + "epoch": 0.26584757834757833, + "grad_norm": 0.4253857135772705, + "learning_rate": 0.00019788195276336244, + "loss": 1.1326, + "step": 1493 + }, + { + "epoch": 0.266025641025641, + "grad_norm": 0.5161862373352051, + "learning_rate": 0.0001978790861717563, + "loss": 1.2131, + "step": 1494 + }, + { + "epoch": 0.2662037037037037, + "grad_norm": 0.5223346948623657, + "learning_rate": 0.00019787621766241274, + "loss": 1.0933, + "step": 1495 + }, + { + "epoch": 0.26638176638176636, + "grad_norm": 0.37622541189193726, + "learning_rate": 0.000197873347235388, + "loss": 0.8919, + "step": 1496 + }, + { + "epoch": 0.26655982905982906, + "grad_norm": 0.4425419569015503, + "learning_rate": 0.0001978704748907384, + "loss": 1.0411, + "step": 1497 + }, + { + "epoch": 0.26673789173789175, + "grad_norm": 0.4536985456943512, + "learning_rate": 0.00019786760062852015, + "loss": 1.2747, + "step": 1498 + }, + { + "epoch": 0.2669159544159544, + "grad_norm": 0.4998049736022949, + "learning_rate": 0.00019786472444878955, + "loss": 1.3214, + "step": 1499 + }, + { + "epoch": 0.2670940170940171, + "grad_norm": 0.42104312777519226, + "learning_rate": 0.00019786184635160295, + "loss": 0.7878, + "step": 1500 + }, + { + "epoch": 0.2672720797720798, + "grad_norm": 0.5354288220405579, + "learning_rate": 0.00019785896633701678, + "loss": 1.0642, + "step": 1501 + }, + { + "epoch": 0.2674501424501424, + "grad_norm": 0.4681485891342163, + "learning_rate": 0.00019785608440508744, + "loss": 1.1737, + "step": 1502 + }, + { + "epoch": 
0.2676282051282051, + "grad_norm": 0.49107062816619873, + "learning_rate": 0.0001978532005558714, + "loss": 1.1507, + "step": 1503 + }, + { + "epoch": 0.2678062678062678, + "grad_norm": 0.4173283576965332, + "learning_rate": 0.0001978503147894252, + "loss": 1.0538, + "step": 1504 + }, + { + "epoch": 0.26798433048433046, + "grad_norm": 0.49354055523872375, + "learning_rate": 0.0001978474271058053, + "loss": 1.1043, + "step": 1505 + }, + { + "epoch": 0.26816239316239315, + "grad_norm": 0.5787215232849121, + "learning_rate": 0.00019784453750506834, + "loss": 0.9245, + "step": 1506 + }, + { + "epoch": 0.26834045584045585, + "grad_norm": 0.48982590436935425, + "learning_rate": 0.00019784164598727095, + "loss": 1.2007, + "step": 1507 + }, + { + "epoch": 0.26851851851851855, + "grad_norm": 0.4971007704734802, + "learning_rate": 0.00019783875255246973, + "loss": 1.1174, + "step": 1508 + }, + { + "epoch": 0.2686965811965812, + "grad_norm": 0.5200340151786804, + "learning_rate": 0.00019783585720072142, + "loss": 1.1967, + "step": 1509 + }, + { + "epoch": 0.2688746438746439, + "grad_norm": 0.47911885380744934, + "learning_rate": 0.00019783295993208271, + "loss": 1.162, + "step": 1510 + }, + { + "epoch": 0.2690527065527066, + "grad_norm": 0.4764275848865509, + "learning_rate": 0.00019783006074661037, + "loss": 1.1358, + "step": 1511 + }, + { + "epoch": 0.2692307692307692, + "grad_norm": 0.478545606136322, + "learning_rate": 0.00019782715964436124, + "loss": 1.0096, + "step": 1512 + }, + { + "epoch": 0.2694088319088319, + "grad_norm": 0.5512787699699402, + "learning_rate": 0.00019782425662539212, + "loss": 1.1799, + "step": 1513 + }, + { + "epoch": 0.2695868945868946, + "grad_norm": 0.5495108962059021, + "learning_rate": 0.00019782135168975988, + "loss": 1.0959, + "step": 1514 + }, + { + "epoch": 0.26976495726495725, + "grad_norm": 0.42052868008613586, + "learning_rate": 0.0001978184448375215, + "loss": 1.1872, + "step": 1515 + }, + { + "epoch": 0.26994301994301995, + 
"grad_norm": 0.4994426965713501, + "learning_rate": 0.0001978155360687339, + "loss": 1.0568, + "step": 1516 + }, + { + "epoch": 0.27012108262108264, + "grad_norm": 0.459577351808548, + "learning_rate": 0.00019781262538345402, + "loss": 1.0315, + "step": 1517 + }, + { + "epoch": 0.2702991452991453, + "grad_norm": 0.4792841374874115, + "learning_rate": 0.00019780971278173895, + "loss": 1.2055, + "step": 1518 + }, + { + "epoch": 0.270477207977208, + "grad_norm": 0.5017708539962769, + "learning_rate": 0.00019780679826364575, + "loss": 1.157, + "step": 1519 + }, + { + "epoch": 0.2706552706552707, + "grad_norm": 0.5197349786758423, + "learning_rate": 0.00019780388182923152, + "loss": 0.9101, + "step": 1520 + }, + { + "epoch": 0.2708333333333333, + "grad_norm": 0.4226742684841156, + "learning_rate": 0.00019780096347855338, + "loss": 1.0525, + "step": 1521 + }, + { + "epoch": 0.271011396011396, + "grad_norm": 0.5058164596557617, + "learning_rate": 0.00019779804321166852, + "loss": 0.931, + "step": 1522 + }, + { + "epoch": 0.2711894586894587, + "grad_norm": 0.44492244720458984, + "learning_rate": 0.00019779512102863418, + "loss": 1.0641, + "step": 1523 + }, + { + "epoch": 0.27136752136752135, + "grad_norm": 0.5348989963531494, + "learning_rate": 0.00019779219692950758, + "loss": 1.1692, + "step": 1524 + }, + { + "epoch": 0.27154558404558404, + "grad_norm": 0.4631774425506592, + "learning_rate": 0.00019778927091434602, + "loss": 1.0876, + "step": 1525 + }, + { + "epoch": 0.27172364672364674, + "grad_norm": 0.45957499742507935, + "learning_rate": 0.00019778634298320684, + "loss": 0.9527, + "step": 1526 + }, + { + "epoch": 0.2719017094017094, + "grad_norm": 0.4506755769252777, + "learning_rate": 0.00019778341313614743, + "loss": 1.086, + "step": 1527 + }, + { + "epoch": 0.2720797720797721, + "grad_norm": 0.4900587797164917, + "learning_rate": 0.00019778048137322513, + "loss": 0.9911, + "step": 1528 + }, + { + "epoch": 0.27225783475783477, + "grad_norm": 0.478127658367157, + 
"learning_rate": 0.00019777754769449745, + "loss": 1.2083, + "step": 1529 + }, + { + "epoch": 0.2724358974358974, + "grad_norm": 0.47220897674560547, + "learning_rate": 0.00019777461210002183, + "loss": 1.0313, + "step": 1530 + }, + { + "epoch": 0.2726139601139601, + "grad_norm": 0.4526277184486389, + "learning_rate": 0.0001977716745898558, + "loss": 1.2648, + "step": 1531 + }, + { + "epoch": 0.2727920227920228, + "grad_norm": 0.42907601594924927, + "learning_rate": 0.00019776873516405688, + "loss": 0.8645, + "step": 1532 + }, + { + "epoch": 0.27297008547008544, + "grad_norm": 0.43440163135528564, + "learning_rate": 0.00019776579382268272, + "loss": 0.9702, + "step": 1533 + }, + { + "epoch": 0.27314814814814814, + "grad_norm": 0.48213550448417664, + "learning_rate": 0.0001977628505657909, + "loss": 0.998, + "step": 1534 + }, + { + "epoch": 0.27332621082621084, + "grad_norm": 0.43385565280914307, + "learning_rate": 0.00019775990539343914, + "loss": 1.0575, + "step": 1535 + }, + { + "epoch": 0.27350427350427353, + "grad_norm": 0.45706847310066223, + "learning_rate": 0.00019775695830568507, + "loss": 1.3024, + "step": 1536 + }, + { + "epoch": 0.27368233618233617, + "grad_norm": 0.45769137144088745, + "learning_rate": 0.00019775400930258652, + "loss": 1.0987, + "step": 1537 + }, + { + "epoch": 0.27386039886039887, + "grad_norm": 0.44682395458221436, + "learning_rate": 0.00019775105838420117, + "loss": 1.1327, + "step": 1538 + }, + { + "epoch": 0.27403846153846156, + "grad_norm": 0.5923072099685669, + "learning_rate": 0.00019774810555058694, + "loss": 1.4766, + "step": 1539 + }, + { + "epoch": 0.2742165242165242, + "grad_norm": 0.4327206015586853, + "learning_rate": 0.0001977451508018016, + "loss": 1.1175, + "step": 1540 + }, + { + "epoch": 0.2743945868945869, + "grad_norm": 0.48036691546440125, + "learning_rate": 0.00019774219413790315, + "loss": 1.1189, + "step": 1541 + }, + { + "epoch": 0.2745726495726496, + "grad_norm": 0.41371914744377136, + "learning_rate": 
0.00019773923555894935, + "loss": 1.1366, + "step": 1542 + }, + { + "epoch": 0.27475071225071224, + "grad_norm": 0.4452378749847412, + "learning_rate": 0.00019773627506499832, + "loss": 0.9517, + "step": 1543 + }, + { + "epoch": 0.27492877492877493, + "grad_norm": 0.469098299741745, + "learning_rate": 0.00019773331265610802, + "loss": 1.0848, + "step": 1544 + }, + { + "epoch": 0.27510683760683763, + "grad_norm": 0.5390294790267944, + "learning_rate": 0.00019773034833233646, + "loss": 0.8589, + "step": 1545 + }, + { + "epoch": 0.27528490028490027, + "grad_norm": 0.5368238091468811, + "learning_rate": 0.00019772738209374174, + "loss": 1.2954, + "step": 1546 + }, + { + "epoch": 0.27546296296296297, + "grad_norm": 0.4705318510532379, + "learning_rate": 0.00019772441394038198, + "loss": 1.2252, + "step": 1547 + }, + { + "epoch": 0.27564102564102566, + "grad_norm": 0.4682813286781311, + "learning_rate": 0.00019772144387231533, + "loss": 1.0855, + "step": 1548 + }, + { + "epoch": 0.2758190883190883, + "grad_norm": 0.46876460313796997, + "learning_rate": 0.0001977184718896, + "loss": 1.1959, + "step": 1549 + }, + { + "epoch": 0.275997150997151, + "grad_norm": 0.4172806441783905, + "learning_rate": 0.00019771549799229416, + "loss": 1.2166, + "step": 1550 + }, + { + "epoch": 0.2761752136752137, + "grad_norm": 0.5088075399398804, + "learning_rate": 0.0001977125221804562, + "loss": 1.1285, + "step": 1551 + }, + { + "epoch": 0.27635327635327633, + "grad_norm": 0.4728628396987915, + "learning_rate": 0.0001977095444541443, + "loss": 1.2985, + "step": 1552 + }, + { + "epoch": 0.27653133903133903, + "grad_norm": 0.4431236684322357, + "learning_rate": 0.00019770656481341684, + "loss": 1.1298, + "step": 1553 + }, + { + "epoch": 0.2767094017094017, + "grad_norm": 0.474065363407135, + "learning_rate": 0.00019770358325833223, + "loss": 1.1915, + "step": 1554 + }, + { + "epoch": 0.27688746438746437, + "grad_norm": 0.45718875527381897, + "learning_rate": 0.00019770059978894885, + "loss": 
1.0626, + "step": 1555 + }, + { + "epoch": 0.27706552706552706, + "grad_norm": 0.49300211668014526, + "learning_rate": 0.00019769761440532522, + "loss": 1.0134, + "step": 1556 + }, + { + "epoch": 0.27724358974358976, + "grad_norm": 0.4389498829841614, + "learning_rate": 0.00019769462710751974, + "loss": 1.0292, + "step": 1557 + }, + { + "epoch": 0.2774216524216524, + "grad_norm": 0.47330448031425476, + "learning_rate": 0.000197691637895591, + "loss": 1.1273, + "step": 1558 + }, + { + "epoch": 0.2775997150997151, + "grad_norm": 0.5322058200836182, + "learning_rate": 0.00019768864676959755, + "loss": 1.059, + "step": 1559 + }, + { + "epoch": 0.2777777777777778, + "grad_norm": 0.4714536964893341, + "learning_rate": 0.000197685653729598, + "loss": 1.1987, + "step": 1560 + }, + { + "epoch": 0.27795584045584043, + "grad_norm": 0.48687809705734253, + "learning_rate": 0.00019768265877565097, + "loss": 1.3206, + "step": 1561 + }, + { + "epoch": 0.2781339031339031, + "grad_norm": 0.46066713333129883, + "learning_rate": 0.00019767966190781518, + "loss": 1.0845, + "step": 1562 + }, + { + "epoch": 0.2783119658119658, + "grad_norm": 0.44372090697288513, + "learning_rate": 0.00019767666312614935, + "loss": 1.0942, + "step": 1563 + }, + { + "epoch": 0.27849002849002846, + "grad_norm": 0.4615907073020935, + "learning_rate": 0.00019767366243071216, + "loss": 1.071, + "step": 1564 + }, + { + "epoch": 0.27866809116809116, + "grad_norm": 0.502097487449646, + "learning_rate": 0.0001976706598215625, + "loss": 1.1164, + "step": 1565 + }, + { + "epoch": 0.27884615384615385, + "grad_norm": 0.4371815621852875, + "learning_rate": 0.00019766765529875913, + "loss": 1.0252, + "step": 1566 + }, + { + "epoch": 0.27902421652421655, + "grad_norm": 0.43035808205604553, + "learning_rate": 0.00019766464886236093, + "loss": 1.073, + "step": 1567 + }, + { + "epoch": 0.2792022792022792, + "grad_norm": 0.49721601605415344, + "learning_rate": 0.00019766164051242683, + "loss": 1.0316, + "step": 1568 + }, + { 
+ "epoch": 0.2793803418803419, + "grad_norm": 0.44866231083869934, + "learning_rate": 0.00019765863024901576, + "loss": 1.0951, + "step": 1569 + }, + { + "epoch": 0.2795584045584046, + "grad_norm": 0.46318337321281433, + "learning_rate": 0.0001976556180721867, + "loss": 0.9836, + "step": 1570 + }, + { + "epoch": 0.2797364672364672, + "grad_norm": 0.4227696657180786, + "learning_rate": 0.00019765260398199868, + "loss": 1.0414, + "step": 1571 + }, + { + "epoch": 0.2799145299145299, + "grad_norm": 0.6062980890274048, + "learning_rate": 0.00019764958797851073, + "loss": 1.137, + "step": 1572 + }, + { + "epoch": 0.2800925925925926, + "grad_norm": 0.4856833219528198, + "learning_rate": 0.00019764657006178196, + "loss": 1.1361, + "step": 1573 + }, + { + "epoch": 0.28027065527065526, + "grad_norm": 0.45612895488739014, + "learning_rate": 0.00019764355023187146, + "loss": 1.0005, + "step": 1574 + }, + { + "epoch": 0.28044871794871795, + "grad_norm": 0.4143696129322052, + "learning_rate": 0.00019764052848883845, + "loss": 1.051, + "step": 1575 + }, + { + "epoch": 0.28062678062678065, + "grad_norm": 0.4532071352005005, + "learning_rate": 0.00019763750483274212, + "loss": 1.0595, + "step": 1576 + }, + { + "epoch": 0.2808048433048433, + "grad_norm": 0.4940357208251953, + "learning_rate": 0.0001976344792636417, + "loss": 1.0983, + "step": 1577 + }, + { + "epoch": 0.280982905982906, + "grad_norm": 0.44405099749565125, + "learning_rate": 0.0001976314517815965, + "loss": 1.0846, + "step": 1578 + }, + { + "epoch": 0.2811609686609687, + "grad_norm": 0.5508625507354736, + "learning_rate": 0.00019762842238666578, + "loss": 1.1722, + "step": 1579 + }, + { + "epoch": 0.2813390313390313, + "grad_norm": 0.5241084694862366, + "learning_rate": 0.00019762539107890894, + "loss": 1.351, + "step": 1580 + }, + { + "epoch": 0.281517094017094, + "grad_norm": 0.5307353734970093, + "learning_rate": 0.00019762235785838537, + "loss": 1.1868, + "step": 1581 + }, + { + "epoch": 0.2816951566951567, + 
"grad_norm": 0.45697924494743347, + "learning_rate": 0.00019761932272515447, + "loss": 1.1982, + "step": 1582 + }, + { + "epoch": 0.28187321937321935, + "grad_norm": 0.412483811378479, + "learning_rate": 0.00019761628567927574, + "loss": 1.0433, + "step": 1583 + }, + { + "epoch": 0.28205128205128205, + "grad_norm": 0.4614165425300598, + "learning_rate": 0.00019761324672080868, + "loss": 1.104, + "step": 1584 + }, + { + "epoch": 0.28222934472934474, + "grad_norm": 0.47644901275634766, + "learning_rate": 0.00019761020584981284, + "loss": 1.1037, + "step": 1585 + }, + { + "epoch": 0.2824074074074074, + "grad_norm": 0.4985184669494629, + "learning_rate": 0.00019760716306634773, + "loss": 1.2213, + "step": 1586 + }, + { + "epoch": 0.2825854700854701, + "grad_norm": 0.508301317691803, + "learning_rate": 0.00019760411837047305, + "loss": 1.1315, + "step": 1587 + }, + { + "epoch": 0.2827635327635328, + "grad_norm": 0.5346587300300598, + "learning_rate": 0.00019760107176224845, + "loss": 1.2281, + "step": 1588 + }, + { + "epoch": 0.2829415954415954, + "grad_norm": 0.5106825232505798, + "learning_rate": 0.00019759802324173357, + "loss": 1.2904, + "step": 1589 + }, + { + "epoch": 0.2831196581196581, + "grad_norm": 0.46458688378334045, + "learning_rate": 0.00019759497280898817, + "loss": 1.0861, + "step": 1590 + }, + { + "epoch": 0.2832977207977208, + "grad_norm": 0.49115365743637085, + "learning_rate": 0.00019759192046407201, + "loss": 1.0529, + "step": 1591 + }, + { + "epoch": 0.28347578347578345, + "grad_norm": 0.5114167332649231, + "learning_rate": 0.0001975888662070449, + "loss": 1.2555, + "step": 1592 + }, + { + "epoch": 0.28365384615384615, + "grad_norm": 0.45844775438308716, + "learning_rate": 0.0001975858100379667, + "loss": 1.0662, + "step": 1593 + }, + { + "epoch": 0.28383190883190884, + "grad_norm": 0.4684161841869354, + "learning_rate": 0.00019758275195689727, + "loss": 1.0537, + "step": 1594 + }, + { + "epoch": 0.28400997150997154, + "grad_norm": 
0.4816220998764038, + "learning_rate": 0.0001975796919638965, + "loss": 1.126, + "step": 1595 + }, + { + "epoch": 0.2841880341880342, + "grad_norm": 0.46578118205070496, + "learning_rate": 0.0001975766300590244, + "loss": 0.9651, + "step": 1596 + }, + { + "epoch": 0.2843660968660969, + "grad_norm": 0.4181675612926483, + "learning_rate": 0.0001975735662423409, + "loss": 1.0888, + "step": 1597 + }, + { + "epoch": 0.28454415954415957, + "grad_norm": 0.49417954683303833, + "learning_rate": 0.00019757050051390609, + "loss": 1.1878, + "step": 1598 + }, + { + "epoch": 0.2847222222222222, + "grad_norm": 0.47264960408210754, + "learning_rate": 0.00019756743287377998, + "loss": 1.027, + "step": 1599 + }, + { + "epoch": 0.2849002849002849, + "grad_norm": 0.47686338424682617, + "learning_rate": 0.0001975643633220227, + "loss": 1.1307, + "step": 1600 + }, + { + "epoch": 0.2850783475783476, + "grad_norm": 0.5571266412734985, + "learning_rate": 0.00019756129185869443, + "loss": 0.984, + "step": 1601 + }, + { + "epoch": 0.28525641025641024, + "grad_norm": 0.46942809224128723, + "learning_rate": 0.00019755821848385527, + "loss": 1.0397, + "step": 1602 + }, + { + "epoch": 0.28543447293447294, + "grad_norm": 0.6325890421867371, + "learning_rate": 0.00019755514319756551, + "loss": 1.0918, + "step": 1603 + }, + { + "epoch": 0.28561253561253563, + "grad_norm": 0.5297608375549316, + "learning_rate": 0.00019755206599988533, + "loss": 0.9911, + "step": 1604 + }, + { + "epoch": 0.2857905982905983, + "grad_norm": 0.4736945331096649, + "learning_rate": 0.00019754898689087512, + "loss": 1.0786, + "step": 1605 + }, + { + "epoch": 0.28596866096866097, + "grad_norm": 0.5048685669898987, + "learning_rate": 0.00019754590587059512, + "loss": 0.9834, + "step": 1606 + }, + { + "epoch": 0.28614672364672367, + "grad_norm": 0.3823149502277374, + "learning_rate": 0.00019754282293910574, + "loss": 0.8341, + "step": 1607 + }, + { + "epoch": 0.2863247863247863, + "grad_norm": 0.44071945548057556, + 
"learning_rate": 0.00019753973809646738, + "loss": 1.131, + "step": 1608 + }, + { + "epoch": 0.286502849002849, + "grad_norm": 0.44182759523391724, + "learning_rate": 0.00019753665134274043, + "loss": 1.0321, + "step": 1609 + }, + { + "epoch": 0.2866809116809117, + "grad_norm": 0.4486250877380371, + "learning_rate": 0.00019753356267798546, + "loss": 0.9941, + "step": 1610 + }, + { + "epoch": 0.28685897435897434, + "grad_norm": 0.42796584963798523, + "learning_rate": 0.00019753047210226292, + "loss": 1.0235, + "step": 1611 + }, + { + "epoch": 0.28703703703703703, + "grad_norm": 0.47294023633003235, + "learning_rate": 0.00019752737961563336, + "loss": 1.11, + "step": 1612 + }, + { + "epoch": 0.28721509971509973, + "grad_norm": 0.44550734758377075, + "learning_rate": 0.00019752428521815742, + "loss": 1.0849, + "step": 1613 + }, + { + "epoch": 0.28739316239316237, + "grad_norm": 0.44189929962158203, + "learning_rate": 0.0001975211889098957, + "loss": 0.8904, + "step": 1614 + }, + { + "epoch": 0.28757122507122507, + "grad_norm": 0.5302733182907104, + "learning_rate": 0.00019751809069090885, + "loss": 1.2348, + "step": 1615 + }, + { + "epoch": 0.28774928774928776, + "grad_norm": 0.5951390862464905, + "learning_rate": 0.00019751499056125762, + "loss": 1.3035, + "step": 1616 + }, + { + "epoch": 0.2879273504273504, + "grad_norm": 0.5431534647941589, + "learning_rate": 0.0001975118885210027, + "loss": 1.0016, + "step": 1617 + }, + { + "epoch": 0.2881054131054131, + "grad_norm": 0.47301986813545227, + "learning_rate": 0.00019750878457020489, + "loss": 1.2245, + "step": 1618 + }, + { + "epoch": 0.2882834757834758, + "grad_norm": 0.44785359501838684, + "learning_rate": 0.00019750567870892497, + "loss": 1.122, + "step": 1619 + }, + { + "epoch": 0.28846153846153844, + "grad_norm": 0.49494361877441406, + "learning_rate": 0.00019750257093722383, + "loss": 0.9421, + "step": 1620 + }, + { + "epoch": 0.28863960113960113, + "grad_norm": 0.4484521150588989, + "learning_rate": 
0.00019749946125516242, + "loss": 1.2146, + "step": 1621 + }, + { + "epoch": 0.28881766381766383, + "grad_norm": 0.4635269343852997, + "learning_rate": 0.00019749634966280156, + "loss": 0.976, + "step": 1622 + }, + { + "epoch": 0.28899572649572647, + "grad_norm": 0.5532249808311462, + "learning_rate": 0.00019749323616020226, + "loss": 1.1818, + "step": 1623 + }, + { + "epoch": 0.28917378917378916, + "grad_norm": 0.4730629622936249, + "learning_rate": 0.00019749012074742552, + "loss": 1.0321, + "step": 1624 + }, + { + "epoch": 0.28935185185185186, + "grad_norm": 0.47437289357185364, + "learning_rate": 0.0001974870034245324, + "loss": 1.1572, + "step": 1625 + }, + { + "epoch": 0.28952991452991456, + "grad_norm": 0.4796304404735565, + "learning_rate": 0.00019748388419158394, + "loss": 1.1667, + "step": 1626 + }, + { + "epoch": 0.2897079772079772, + "grad_norm": 0.42686304450035095, + "learning_rate": 0.0001974807630486413, + "loss": 0.9824, + "step": 1627 + }, + { + "epoch": 0.2898860398860399, + "grad_norm": 0.4444865584373474, + "learning_rate": 0.00019747763999576558, + "loss": 1.2789, + "step": 1628 + }, + { + "epoch": 0.2900641025641026, + "grad_norm": 0.5039985179901123, + "learning_rate": 0.000197474515033018, + "loss": 1.1488, + "step": 1629 + }, + { + "epoch": 0.29024216524216523, + "grad_norm": 0.581479549407959, + "learning_rate": 0.00019747138816045978, + "loss": 1.1232, + "step": 1630 + }, + { + "epoch": 0.2904202279202279, + "grad_norm": 0.5415821075439453, + "learning_rate": 0.00019746825937815222, + "loss": 1.2326, + "step": 1631 + }, + { + "epoch": 0.2905982905982906, + "grad_norm": 0.45528364181518555, + "learning_rate": 0.00019746512868615656, + "loss": 1.0246, + "step": 1632 + }, + { + "epoch": 0.29077635327635326, + "grad_norm": 0.5255574584007263, + "learning_rate": 0.00019746199608453418, + "loss": 1.0592, + "step": 1633 + }, + { + "epoch": 0.29095441595441596, + "grad_norm": 0.5064096450805664, + "learning_rate": 0.00019745886157334646, + 
"loss": 1.3439, + "step": 1634 + }, + { + "epoch": 0.29113247863247865, + "grad_norm": 0.500848650932312, + "learning_rate": 0.00019745572515265475, + "loss": 1.1212, + "step": 1635 + }, + { + "epoch": 0.2913105413105413, + "grad_norm": 0.5229088068008423, + "learning_rate": 0.00019745258682252062, + "loss": 1.1019, + "step": 1636 + }, + { + "epoch": 0.291488603988604, + "grad_norm": 0.4494398832321167, + "learning_rate": 0.00019744944658300545, + "loss": 1.1298, + "step": 1637 + }, + { + "epoch": 0.2916666666666667, + "grad_norm": 0.48383277654647827, + "learning_rate": 0.00019744630443417082, + "loss": 1.206, + "step": 1638 + }, + { + "epoch": 0.2918447293447293, + "grad_norm": 0.4870131313800812, + "learning_rate": 0.00019744316037607828, + "loss": 1.2096, + "step": 1639 + }, + { + "epoch": 0.292022792022792, + "grad_norm": 0.4153090715408325, + "learning_rate": 0.00019744001440878944, + "loss": 1.0478, + "step": 1640 + }, + { + "epoch": 0.2922008547008547, + "grad_norm": 0.4262249171733856, + "learning_rate": 0.0001974368665323659, + "loss": 1.0393, + "step": 1641 + }, + { + "epoch": 0.29237891737891736, + "grad_norm": 0.46131134033203125, + "learning_rate": 0.00019743371674686938, + "loss": 1.0908, + "step": 1642 + }, + { + "epoch": 0.29255698005698005, + "grad_norm": 0.44877463579177856, + "learning_rate": 0.0001974305650523616, + "loss": 1.1906, + "step": 1643 + }, + { + "epoch": 0.29273504273504275, + "grad_norm": 0.5199326276779175, + "learning_rate": 0.00019742741144890432, + "loss": 1.1147, + "step": 1644 + }, + { + "epoch": 0.2929131054131054, + "grad_norm": 0.48142504692077637, + "learning_rate": 0.00019742425593655924, + "loss": 1.1951, + "step": 1645 + }, + { + "epoch": 0.2930911680911681, + "grad_norm": 0.5672988891601562, + "learning_rate": 0.0001974210985153883, + "loss": 1.1817, + "step": 1646 + }, + { + "epoch": 0.2932692307692308, + "grad_norm": 0.38135233521461487, + "learning_rate": 0.00019741793918545326, + "loss": 0.8567, + "step": 1647 + 
}, + { + "epoch": 0.2934472934472934, + "grad_norm": 0.6153588891029358, + "learning_rate": 0.0001974147779468161, + "loss": 1.0593, + "step": 1648 + }, + { + "epoch": 0.2936253561253561, + "grad_norm": 0.38935527205467224, + "learning_rate": 0.0001974116147995387, + "loss": 0.9907, + "step": 1649 + }, + { + "epoch": 0.2938034188034188, + "grad_norm": 0.467351496219635, + "learning_rate": 0.0001974084497436831, + "loss": 1.091, + "step": 1650 + }, + { + "epoch": 0.29398148148148145, + "grad_norm": 0.45613420009613037, + "learning_rate": 0.00019740528277931128, + "loss": 0.6789, + "step": 1651 + }, + { + "epoch": 0.29415954415954415, + "grad_norm": 0.4045158326625824, + "learning_rate": 0.00019740211390648524, + "loss": 1.0727, + "step": 1652 + }, + { + "epoch": 0.29433760683760685, + "grad_norm": 0.5122803449630737, + "learning_rate": 0.00019739894312526714, + "loss": 1.2297, + "step": 1653 + }, + { + "epoch": 0.29451566951566954, + "grad_norm": 0.44304123520851135, + "learning_rate": 0.00019739577043571908, + "loss": 0.9562, + "step": 1654 + }, + { + "epoch": 0.2946937321937322, + "grad_norm": 0.6070618629455566, + "learning_rate": 0.00019739259583790322, + "loss": 1.2745, + "step": 1655 + }, + { + "epoch": 0.2948717948717949, + "grad_norm": 0.48815637826919556, + "learning_rate": 0.00019738941933188176, + "loss": 1.0574, + "step": 1656 + }, + { + "epoch": 0.2950498575498576, + "grad_norm": 0.5067802667617798, + "learning_rate": 0.00019738624091771693, + "loss": 1.1874, + "step": 1657 + }, + { + "epoch": 0.2952279202279202, + "grad_norm": 0.4956928491592407, + "learning_rate": 0.000197383060595471, + "loss": 1.1085, + "step": 1658 + }, + { + "epoch": 0.2954059829059829, + "grad_norm": 0.46313008666038513, + "learning_rate": 0.00019737987836520633, + "loss": 1.0548, + "step": 1659 + }, + { + "epoch": 0.2955840455840456, + "grad_norm": 0.49944064021110535, + "learning_rate": 0.0001973766942269852, + "loss": 1.1485, + "step": 1660 + }, + { + "epoch": 
0.29576210826210825, + "grad_norm": 0.4743517339229584, + "learning_rate": 0.00019737350818087003, + "loss": 0.9279, + "step": 1661 + }, + { + "epoch": 0.29594017094017094, + "grad_norm": 0.45935431122779846, + "learning_rate": 0.00019737032022692326, + "loss": 0.9574, + "step": 1662 + }, + { + "epoch": 0.29611823361823364, + "grad_norm": 0.4550873637199402, + "learning_rate": 0.00019736713036520734, + "loss": 1.1642, + "step": 1663 + }, + { + "epoch": 0.2962962962962963, + "grad_norm": 0.45252951979637146, + "learning_rate": 0.00019736393859578474, + "loss": 1.0113, + "step": 1664 + }, + { + "epoch": 0.296474358974359, + "grad_norm": 0.5147238969802856, + "learning_rate": 0.00019736074491871804, + "loss": 1.1604, + "step": 1665 + }, + { + "epoch": 0.29665242165242167, + "grad_norm": 0.5122934579849243, + "learning_rate": 0.00019735754933406977, + "loss": 0.9525, + "step": 1666 + }, + { + "epoch": 0.2968304843304843, + "grad_norm": 0.438620001077652, + "learning_rate": 0.00019735435184190257, + "loss": 1.0728, + "step": 1667 + }, + { + "epoch": 0.297008547008547, + "grad_norm": 0.41970670223236084, + "learning_rate": 0.00019735115244227908, + "loss": 0.9782, + "step": 1668 + }, + { + "epoch": 0.2971866096866097, + "grad_norm": 0.5447152256965637, + "learning_rate": 0.000197347951135262, + "loss": 1.0633, + "step": 1669 + }, + { + "epoch": 0.29736467236467234, + "grad_norm": 0.4846996068954468, + "learning_rate": 0.00019734474792091407, + "loss": 0.9019, + "step": 1670 + }, + { + "epoch": 0.29754273504273504, + "grad_norm": 0.4721437990665436, + "learning_rate": 0.00019734154279929796, + "loss": 1.1793, + "step": 1671 + }, + { + "epoch": 0.29772079772079774, + "grad_norm": 0.4659852385520935, + "learning_rate": 0.00019733833577047655, + "loss": 1.1503, + "step": 1672 + }, + { + "epoch": 0.2978988603988604, + "grad_norm": 0.3733183443546295, + "learning_rate": 0.00019733512683451268, + "loss": 0.7763, + "step": 1673 + }, + { + "epoch": 0.2980769230769231, + 
"grad_norm": 0.4898292124271393, + "learning_rate": 0.0001973319159914692, + "loss": 1.3146, + "step": 1674 + }, + { + "epoch": 0.29825498575498577, + "grad_norm": 0.41774725914001465, + "learning_rate": 0.00019732870324140899, + "loss": 1.2069, + "step": 1675 + }, + { + "epoch": 0.2984330484330484, + "grad_norm": 0.4607912003993988, + "learning_rate": 0.000197325488584395, + "loss": 1.2255, + "step": 1676 + }, + { + "epoch": 0.2986111111111111, + "grad_norm": 0.4692424237728119, + "learning_rate": 0.00019732227202049025, + "loss": 1.0793, + "step": 1677 + }, + { + "epoch": 0.2987891737891738, + "grad_norm": 0.5925022959709167, + "learning_rate": 0.00019731905354975778, + "loss": 1.0297, + "step": 1678 + }, + { + "epoch": 0.29896723646723644, + "grad_norm": 0.44047990441322327, + "learning_rate": 0.00019731583317226056, + "loss": 1.0982, + "step": 1679 + }, + { + "epoch": 0.29914529914529914, + "grad_norm": 0.5863066911697388, + "learning_rate": 0.0001973126108880618, + "loss": 1.0284, + "step": 1680 + }, + { + "epoch": 0.29932336182336183, + "grad_norm": 0.48962152004241943, + "learning_rate": 0.00019730938669722457, + "loss": 1.1861, + "step": 1681 + }, + { + "epoch": 0.29950142450142453, + "grad_norm": 0.5445577502250671, + "learning_rate": 0.00019730616059981205, + "loss": 1.2574, + "step": 1682 + }, + { + "epoch": 0.29967948717948717, + "grad_norm": 0.49327564239501953, + "learning_rate": 0.00019730293259588743, + "loss": 0.9578, + "step": 1683 + }, + { + "epoch": 0.29985754985754987, + "grad_norm": 0.4252840578556061, + "learning_rate": 0.00019729970268551398, + "loss": 1.0083, + "step": 1684 + }, + { + "epoch": 0.30003561253561256, + "grad_norm": 0.5140926241874695, + "learning_rate": 0.000197296470868755, + "loss": 1.3263, + "step": 1685 + }, + { + "epoch": 0.3002136752136752, + "grad_norm": 0.5143948197364807, + "learning_rate": 0.00019729323714567375, + "loss": 1.0424, + "step": 1686 + }, + { + "epoch": 0.3003917378917379, + "grad_norm": 
0.3811354339122772, + "learning_rate": 0.00019729000151633367, + "loss": 0.6319, + "step": 1687 + }, + { + "epoch": 0.3005698005698006, + "grad_norm": 0.5249716639518738, + "learning_rate": 0.0001972867639807981, + "loss": 1.0173, + "step": 1688 + }, + { + "epoch": 0.30074786324786323, + "grad_norm": 0.41832098364830017, + "learning_rate": 0.00019728352453913048, + "loss": 1.0503, + "step": 1689 + }, + { + "epoch": 0.30092592592592593, + "grad_norm": 0.5961149334907532, + "learning_rate": 0.00019728028319139428, + "loss": 1.1843, + "step": 1690 + }, + { + "epoch": 0.3011039886039886, + "grad_norm": 0.44083690643310547, + "learning_rate": 0.00019727703993765303, + "loss": 1.1311, + "step": 1691 + }, + { + "epoch": 0.30128205128205127, + "grad_norm": 0.4368111491203308, + "learning_rate": 0.00019727379477797022, + "loss": 0.9463, + "step": 1692 + }, + { + "epoch": 0.30146011396011396, + "grad_norm": 0.5289376974105835, + "learning_rate": 0.00019727054771240954, + "loss": 0.9836, + "step": 1693 + }, + { + "epoch": 0.30163817663817666, + "grad_norm": 0.4132843613624573, + "learning_rate": 0.00019726729874103448, + "loss": 1.1052, + "step": 1694 + }, + { + "epoch": 0.3018162393162393, + "grad_norm": 0.4919086992740631, + "learning_rate": 0.00019726404786390877, + "loss": 1.2219, + "step": 1695 + }, + { + "epoch": 0.301994301994302, + "grad_norm": 0.42561691999435425, + "learning_rate": 0.0001972607950810961, + "loss": 1.0756, + "step": 1696 + }, + { + "epoch": 0.3021723646723647, + "grad_norm": 0.5030396580696106, + "learning_rate": 0.0001972575403926602, + "loss": 1.2207, + "step": 1697 + }, + { + "epoch": 0.30235042735042733, + "grad_norm": 0.4779801666736603, + "learning_rate": 0.0001972542837986648, + "loss": 1.194, + "step": 1698 + }, + { + "epoch": 0.30252849002849, + "grad_norm": 0.45395568013191223, + "learning_rate": 0.00019725102529917377, + "loss": 1.0775, + "step": 1699 + }, + { + "epoch": 0.3027065527065527, + "grad_norm": 0.6540699005126953, + 
"learning_rate": 0.0001972477648942509, + "loss": 1.181, + "step": 1700 + }, + { + "epoch": 0.30288461538461536, + "grad_norm": 0.46281275153160095, + "learning_rate": 0.00019724450258396008, + "loss": 0.629, + "step": 1701 + }, + { + "epoch": 0.30306267806267806, + "grad_norm": 0.3452845811843872, + "learning_rate": 0.00019724123836836527, + "loss": 0.51, + "step": 1702 + }, + { + "epoch": 0.30324074074074076, + "grad_norm": 0.4507991671562195, + "learning_rate": 0.00019723797224753038, + "loss": 1.0258, + "step": 1703 + }, + { + "epoch": 0.3034188034188034, + "grad_norm": 0.5385412573814392, + "learning_rate": 0.0001972347042215194, + "loss": 1.0232, + "step": 1704 + }, + { + "epoch": 0.3035968660968661, + "grad_norm": 0.4460466504096985, + "learning_rate": 0.00019723143429039642, + "loss": 1.1307, + "step": 1705 + }, + { + "epoch": 0.3037749287749288, + "grad_norm": 0.5229718685150146, + "learning_rate": 0.00019722816245422545, + "loss": 1.0964, + "step": 1706 + }, + { + "epoch": 0.30395299145299143, + "grad_norm": 0.4776979088783264, + "learning_rate": 0.00019722488871307058, + "loss": 1.2678, + "step": 1707 + }, + { + "epoch": 0.3041310541310541, + "grad_norm": 0.5371831655502319, + "learning_rate": 0.00019722161306699601, + "loss": 1.2808, + "step": 1708 + }, + { + "epoch": 0.3043091168091168, + "grad_norm": 0.45322108268737793, + "learning_rate": 0.0001972183355160659, + "loss": 1.0775, + "step": 1709 + }, + { + "epoch": 0.30448717948717946, + "grad_norm": 0.5036569833755493, + "learning_rate": 0.00019721505606034448, + "loss": 1.1859, + "step": 1710 + }, + { + "epoch": 0.30466524216524216, + "grad_norm": 0.5425969958305359, + "learning_rate": 0.00019721177469989593, + "loss": 1.0173, + "step": 1711 + }, + { + "epoch": 0.30484330484330485, + "grad_norm": 0.5638980269432068, + "learning_rate": 0.00019720849143478462, + "loss": 1.182, + "step": 1712 + }, + { + "epoch": 0.30502136752136755, + "grad_norm": 0.5160546898841858, + "learning_rate": 
0.00019720520626507486, + "loss": 0.9853, + "step": 1713 + }, + { + "epoch": 0.3051994301994302, + "grad_norm": 0.5079004168510437, + "learning_rate": 0.000197201919190831, + "loss": 1.3154, + "step": 1714 + }, + { + "epoch": 0.3053774928774929, + "grad_norm": 0.4590355455875397, + "learning_rate": 0.00019719863021211745, + "loss": 1.007, + "step": 1715 + }, + { + "epoch": 0.3055555555555556, + "grad_norm": 0.49656423926353455, + "learning_rate": 0.00019719533932899865, + "loss": 1.2187, + "step": 1716 + }, + { + "epoch": 0.3057336182336182, + "grad_norm": 0.46426209807395935, + "learning_rate": 0.0001971920465415391, + "loss": 1.3007, + "step": 1717 + }, + { + "epoch": 0.3059116809116809, + "grad_norm": 0.5211917757987976, + "learning_rate": 0.00019718875184980328, + "loss": 1.2256, + "step": 1718 + }, + { + "epoch": 0.3060897435897436, + "grad_norm": 0.42953309416770935, + "learning_rate": 0.00019718545525385578, + "loss": 1.2838, + "step": 1719 + }, + { + "epoch": 0.30626780626780625, + "grad_norm": 0.4893105924129486, + "learning_rate": 0.00019718215675376116, + "loss": 1.052, + "step": 1720 + }, + { + "epoch": 0.30644586894586895, + "grad_norm": 0.4833602011203766, + "learning_rate": 0.00019717885634958405, + "loss": 1.069, + "step": 1721 + }, + { + "epoch": 0.30662393162393164, + "grad_norm": 0.502176821231842, + "learning_rate": 0.0001971755540413891, + "loss": 1.1659, + "step": 1722 + }, + { + "epoch": 0.3068019943019943, + "grad_norm": 0.4648856818675995, + "learning_rate": 0.00019717224982924108, + "loss": 1.1873, + "step": 1723 + }, + { + "epoch": 0.306980056980057, + "grad_norm": 0.405429869890213, + "learning_rate": 0.00019716894371320465, + "loss": 0.99, + "step": 1724 + }, + { + "epoch": 0.3071581196581197, + "grad_norm": 0.4306945204734802, + "learning_rate": 0.00019716563569334463, + "loss": 0.8751, + "step": 1725 + }, + { + "epoch": 0.3073361823361823, + "grad_norm": 0.49424824118614197, + "learning_rate": 0.00019716232576972583, + "loss": 0.9205, 
+ "step": 1726 + }, + { + "epoch": 0.307514245014245, + "grad_norm": 0.5044034123420715, + "learning_rate": 0.00019715901394241306, + "loss": 1.2042, + "step": 1727 + }, + { + "epoch": 0.3076923076923077, + "grad_norm": 0.512180507183075, + "learning_rate": 0.00019715570021147126, + "loss": 1.1644, + "step": 1728 + }, + { + "epoch": 0.30787037037037035, + "grad_norm": 0.4377981126308441, + "learning_rate": 0.00019715238457696538, + "loss": 1.1625, + "step": 1729 + }, + { + "epoch": 0.30804843304843305, + "grad_norm": 0.49107855558395386, + "learning_rate": 0.00019714906703896027, + "loss": 1.1037, + "step": 1730 + }, + { + "epoch": 0.30822649572649574, + "grad_norm": 0.47342559695243835, + "learning_rate": 0.00019714574759752105, + "loss": 1.3186, + "step": 1731 + }, + { + "epoch": 0.3084045584045584, + "grad_norm": 0.487177312374115, + "learning_rate": 0.0001971424262527127, + "loss": 1.1196, + "step": 1732 + }, + { + "epoch": 0.3085826210826211, + "grad_norm": 0.5290025472640991, + "learning_rate": 0.0001971391030046003, + "loss": 1.2103, + "step": 1733 + }, + { + "epoch": 0.3087606837606838, + "grad_norm": 0.4587760269641876, + "learning_rate": 0.00019713577785324896, + "loss": 1.1017, + "step": 1734 + }, + { + "epoch": 0.3089387464387464, + "grad_norm": 0.45323294401168823, + "learning_rate": 0.00019713245079872388, + "loss": 1.0, + "step": 1735 + }, + { + "epoch": 0.3091168091168091, + "grad_norm": 0.43414804339408875, + "learning_rate": 0.00019712912184109013, + "loss": 1.0341, + "step": 1736 + }, + { + "epoch": 0.3092948717948718, + "grad_norm": 0.49604663252830505, + "learning_rate": 0.00019712579098041304, + "loss": 0.9437, + "step": 1737 + }, + { + "epoch": 0.30947293447293445, + "grad_norm": 0.48580703139305115, + "learning_rate": 0.00019712245821675785, + "loss": 1.2622, + "step": 1738 + }, + { + "epoch": 0.30965099715099714, + "grad_norm": 0.45333603024482727, + "learning_rate": 0.00019711912355018982, + "loss": 1.2063, + "step": 1739 + }, + { + 
"epoch": 0.30982905982905984, + "grad_norm": 0.5990764498710632, + "learning_rate": 0.00019711578698077432, + "loss": 1.5097, + "step": 1740 + }, + { + "epoch": 0.31000712250712253, + "grad_norm": 0.4386102259159088, + "learning_rate": 0.0001971124485085767, + "loss": 1.1283, + "step": 1741 + }, + { + "epoch": 0.3101851851851852, + "grad_norm": 0.4476035237312317, + "learning_rate": 0.00019710910813366242, + "loss": 0.8922, + "step": 1742 + }, + { + "epoch": 0.31036324786324787, + "grad_norm": 0.5276228785514832, + "learning_rate": 0.00019710576585609685, + "loss": 1.2373, + "step": 1743 + }, + { + "epoch": 0.31054131054131057, + "grad_norm": 0.4885637164115906, + "learning_rate": 0.00019710242167594557, + "loss": 1.0881, + "step": 1744 + }, + { + "epoch": 0.3107193732193732, + "grad_norm": 0.421132355928421, + "learning_rate": 0.000197099075593274, + "loss": 1.0544, + "step": 1745 + }, + { + "epoch": 0.3108974358974359, + "grad_norm": 0.5257927179336548, + "learning_rate": 0.00019709572760814777, + "loss": 1.265, + "step": 1746 + }, + { + "epoch": 0.3110754985754986, + "grad_norm": 0.5164850950241089, + "learning_rate": 0.00019709237772063247, + "loss": 0.9593, + "step": 1747 + }, + { + "epoch": 0.31125356125356124, + "grad_norm": 0.5176383256912231, + "learning_rate": 0.00019708902593079374, + "loss": 1.0194, + "step": 1748 + }, + { + "epoch": 0.31143162393162394, + "grad_norm": 0.4620790481567383, + "learning_rate": 0.00019708567223869716, + "loss": 0.9241, + "step": 1749 + }, + { + "epoch": 0.31160968660968663, + "grad_norm": 0.48307979106903076, + "learning_rate": 0.00019708231664440854, + "loss": 1.2314, + "step": 1750 + }, + { + "epoch": 0.31178774928774927, + "grad_norm": 0.4931468069553375, + "learning_rate": 0.00019707895914799364, + "loss": 1.2065, + "step": 1751 + }, + { + "epoch": 0.31196581196581197, + "grad_norm": 0.5035979747772217, + "learning_rate": 0.00019707559974951818, + "loss": 1.1867, + "step": 1752 + }, + { + "epoch": 0.31214387464387466, + 
"grad_norm": 0.47543632984161377, + "learning_rate": 0.00019707223844904795, + "loss": 1.0603, + "step": 1753 + }, + { + "epoch": 0.3123219373219373, + "grad_norm": 0.49929797649383545, + "learning_rate": 0.00019706887524664892, + "loss": 1.0597, + "step": 1754 + }, + { + "epoch": 0.3125, + "grad_norm": 0.5075222253799438, + "learning_rate": 0.00019706551014238687, + "loss": 1.1398, + "step": 1755 + }, + { + "epoch": 0.3126780626780627, + "grad_norm": 0.5096884369850159, + "learning_rate": 0.00019706214313632784, + "loss": 1.1382, + "step": 1756 + }, + { + "epoch": 0.31285612535612534, + "grad_norm": 0.4629988372325897, + "learning_rate": 0.0001970587742285377, + "loss": 1.0009, + "step": 1757 + }, + { + "epoch": 0.31303418803418803, + "grad_norm": 0.5244084596633911, + "learning_rate": 0.00019705540341908253, + "loss": 1.047, + "step": 1758 + }, + { + "epoch": 0.31321225071225073, + "grad_norm": 0.5136716961860657, + "learning_rate": 0.00019705203070802832, + "loss": 1.29, + "step": 1759 + }, + { + "epoch": 0.31339031339031337, + "grad_norm": 0.43991541862487793, + "learning_rate": 0.0001970486560954412, + "loss": 0.9605, + "step": 1760 + }, + { + "epoch": 0.31356837606837606, + "grad_norm": 0.4633477032184601, + "learning_rate": 0.00019704527958138725, + "loss": 1.1507, + "step": 1761 + }, + { + "epoch": 0.31374643874643876, + "grad_norm": 0.4419999420642853, + "learning_rate": 0.00019704190116593266, + "loss": 0.9262, + "step": 1762 + }, + { + "epoch": 0.3139245014245014, + "grad_norm": 0.49359434843063354, + "learning_rate": 0.00019703852084914357, + "loss": 0.9348, + "step": 1763 + }, + { + "epoch": 0.3141025641025641, + "grad_norm": 0.5072139501571655, + "learning_rate": 0.00019703513863108627, + "loss": 1.1592, + "step": 1764 + }, + { + "epoch": 0.3142806267806268, + "grad_norm": 0.45969831943511963, + "learning_rate": 0.00019703175451182698, + "loss": 1.1519, + "step": 1765 + }, + { + "epoch": 0.31445868945868943, + "grad_norm": 0.5148758292198181, + 
"learning_rate": 0.00019702836849143208, + "loss": 1.1673, + "step": 1766 + }, + { + "epoch": 0.31463675213675213, + "grad_norm": 0.43033209443092346, + "learning_rate": 0.0001970249805699678, + "loss": 0.9256, + "step": 1767 + }, + { + "epoch": 0.3148148148148148, + "grad_norm": 0.48143425583839417, + "learning_rate": 0.00019702159074750058, + "loss": 1.08, + "step": 1768 + }, + { + "epoch": 0.31499287749287747, + "grad_norm": 0.4780619740486145, + "learning_rate": 0.00019701819902409685, + "loss": 1.1198, + "step": 1769 + }, + { + "epoch": 0.31517094017094016, + "grad_norm": 0.4662075936794281, + "learning_rate": 0.00019701480539982305, + "loss": 0.8424, + "step": 1770 + }, + { + "epoch": 0.31534900284900286, + "grad_norm": 0.503901481628418, + "learning_rate": 0.00019701140987474566, + "loss": 1.1026, + "step": 1771 + }, + { + "epoch": 0.31552706552706555, + "grad_norm": 0.5197132229804993, + "learning_rate": 0.00019700801244893124, + "loss": 1.2148, + "step": 1772 + }, + { + "epoch": 0.3157051282051282, + "grad_norm": 0.4746309220790863, + "learning_rate": 0.00019700461312244634, + "loss": 1.0906, + "step": 1773 + }, + { + "epoch": 0.3158831908831909, + "grad_norm": 0.5277339816093445, + "learning_rate": 0.00019700121189535752, + "loss": 1.0588, + "step": 1774 + }, + { + "epoch": 0.3160612535612536, + "grad_norm": 0.436002254486084, + "learning_rate": 0.00019699780876773147, + "loss": 1.0341, + "step": 1775 + }, + { + "epoch": 0.3162393162393162, + "grad_norm": 0.5171145796775818, + "learning_rate": 0.00019699440373963486, + "loss": 1.282, + "step": 1776 + }, + { + "epoch": 0.3164173789173789, + "grad_norm": 0.38382846117019653, + "learning_rate": 0.00019699099681113436, + "loss": 0.8908, + "step": 1777 + }, + { + "epoch": 0.3165954415954416, + "grad_norm": 0.4621630609035492, + "learning_rate": 0.0001969875879822968, + "loss": 1.1074, + "step": 1778 + }, + { + "epoch": 0.31677350427350426, + "grad_norm": 0.5543130040168762, + "learning_rate": 
0.00019698417725318892, + "loss": 0.9682, + "step": 1779 + }, + { + "epoch": 0.31695156695156695, + "grad_norm": 0.49534836411476135, + "learning_rate": 0.00019698076462387753, + "loss": 1.107, + "step": 1780 + }, + { + "epoch": 0.31712962962962965, + "grad_norm": 0.48844948410987854, + "learning_rate": 0.00019697735009442956, + "loss": 1.1295, + "step": 1781 + }, + { + "epoch": 0.3173076923076923, + "grad_norm": 0.5070686936378479, + "learning_rate": 0.00019697393366491185, + "loss": 1.083, + "step": 1782 + }, + { + "epoch": 0.317485754985755, + "grad_norm": 0.47817620635032654, + "learning_rate": 0.00019697051533539134, + "loss": 1.3014, + "step": 1783 + }, + { + "epoch": 0.3176638176638177, + "grad_norm": 0.538488507270813, + "learning_rate": 0.00019696709510593502, + "loss": 1.0354, + "step": 1784 + }, + { + "epoch": 0.3178418803418803, + "grad_norm": 0.5141439437866211, + "learning_rate": 0.0001969636729766099, + "loss": 1.2912, + "step": 1785 + }, + { + "epoch": 0.318019943019943, + "grad_norm": 0.5009665489196777, + "learning_rate": 0.00019696024894748306, + "loss": 0.9014, + "step": 1786 + }, + { + "epoch": 0.3181980056980057, + "grad_norm": 0.46199744939804077, + "learning_rate": 0.00019695682301862155, + "loss": 1.0532, + "step": 1787 + }, + { + "epoch": 0.31837606837606836, + "grad_norm": 0.4649423062801361, + "learning_rate": 0.0001969533951900925, + "loss": 0.8608, + "step": 1788 + }, + { + "epoch": 0.31855413105413105, + "grad_norm": 0.516909658908844, + "learning_rate": 0.0001969499654619631, + "loss": 1.1385, + "step": 1789 + }, + { + "epoch": 0.31873219373219375, + "grad_norm": 0.46016669273376465, + "learning_rate": 0.00019694653383430048, + "loss": 0.9168, + "step": 1790 + }, + { + "epoch": 0.3189102564102564, + "grad_norm": 0.4794938564300537, + "learning_rate": 0.00019694310030717193, + "loss": 1.0244, + "step": 1791 + }, + { + "epoch": 0.3190883190883191, + "grad_norm": 0.46577662229537964, + "learning_rate": 0.00019693966488064471, + "loss": 
1.0954, + "step": 1792 + }, + { + "epoch": 0.3192663817663818, + "grad_norm": 0.4866746962070465, + "learning_rate": 0.00019693622755478614, + "loss": 1.2925, + "step": 1793 + }, + { + "epoch": 0.3194444444444444, + "grad_norm": 0.4841702878475189, + "learning_rate": 0.00019693278832966357, + "loss": 1.119, + "step": 1794 + }, + { + "epoch": 0.3196225071225071, + "grad_norm": 0.4835243821144104, + "learning_rate": 0.00019692934720534435, + "loss": 1.1702, + "step": 1795 + }, + { + "epoch": 0.3198005698005698, + "grad_norm": 0.5200608968734741, + "learning_rate": 0.00019692590418189594, + "loss": 1.1989, + "step": 1796 + }, + { + "epoch": 0.31997863247863245, + "grad_norm": 0.5147821307182312, + "learning_rate": 0.00019692245925938577, + "loss": 1.1417, + "step": 1797 + }, + { + "epoch": 0.32015669515669515, + "grad_norm": 0.5145614743232727, + "learning_rate": 0.00019691901243788136, + "loss": 1.0571, + "step": 1798 + }, + { + "epoch": 0.32033475783475784, + "grad_norm": 0.5416026711463928, + "learning_rate": 0.00019691556371745022, + "loss": 1.188, + "step": 1799 + }, + { + "epoch": 0.32051282051282054, + "grad_norm": 0.5140644311904907, + "learning_rate": 0.00019691211309815995, + "loss": 1.1795, + "step": 1800 + }, + { + "epoch": 0.3206908831908832, + "grad_norm": 0.44219106435775757, + "learning_rate": 0.00019690866058007817, + "loss": 0.9215, + "step": 1801 + }, + { + "epoch": 0.3208689458689459, + "grad_norm": 0.49523603916168213, + "learning_rate": 0.00019690520616327245, + "loss": 1.1117, + "step": 1802 + }, + { + "epoch": 0.32104700854700857, + "grad_norm": 0.5818293690681458, + "learning_rate": 0.0001969017498478105, + "loss": 1.16, + "step": 1803 + }, + { + "epoch": 0.3212250712250712, + "grad_norm": 0.5175749659538269, + "learning_rate": 0.0001968982916337601, + "loss": 1.1999, + "step": 1804 + }, + { + "epoch": 0.3214031339031339, + "grad_norm": 0.49916017055511475, + "learning_rate": 0.00019689483152118898, + "loss": 0.9505, + "step": 1805 + }, + { + 
"epoch": 0.3215811965811966, + "grad_norm": 0.46849536895751953, + "learning_rate": 0.00019689136951016488, + "loss": 0.9627, + "step": 1806 + }, + { + "epoch": 0.32175925925925924, + "grad_norm": 0.4226818382740021, + "learning_rate": 0.00019688790560075568, + "loss": 1.037, + "step": 1807 + }, + { + "epoch": 0.32193732193732194, + "grad_norm": 0.4697103798389435, + "learning_rate": 0.00019688443979302923, + "loss": 1.1431, + "step": 1808 + }, + { + "epoch": 0.32211538461538464, + "grad_norm": 0.4999365508556366, + "learning_rate": 0.00019688097208705343, + "loss": 1.171, + "step": 1809 + }, + { + "epoch": 0.3222934472934473, + "grad_norm": 0.5229731798171997, + "learning_rate": 0.00019687750248289625, + "loss": 1.3395, + "step": 1810 + }, + { + "epoch": 0.32247150997151, + "grad_norm": 0.512525737285614, + "learning_rate": 0.00019687403098062566, + "loss": 1.1438, + "step": 1811 + }, + { + "epoch": 0.32264957264957267, + "grad_norm": 0.4558548927307129, + "learning_rate": 0.00019687055758030967, + "loss": 1.0012, + "step": 1812 + }, + { + "epoch": 0.3228276353276353, + "grad_norm": 0.45195743441581726, + "learning_rate": 0.00019686708228201636, + "loss": 1.0222, + "step": 1813 + }, + { + "epoch": 0.323005698005698, + "grad_norm": 0.5023126602172852, + "learning_rate": 0.00019686360508581373, + "loss": 1.2128, + "step": 1814 + }, + { + "epoch": 0.3231837606837607, + "grad_norm": 0.46516045928001404, + "learning_rate": 0.00019686012599177003, + "loss": 0.989, + "step": 1815 + }, + { + "epoch": 0.32336182336182334, + "grad_norm": 0.4142672121524811, + "learning_rate": 0.00019685664499995338, + "loss": 1.0144, + "step": 1816 + }, + { + "epoch": 0.32353988603988604, + "grad_norm": 0.4511009752750397, + "learning_rate": 0.0001968531621104319, + "loss": 0.885, + "step": 1817 + }, + { + "epoch": 0.32371794871794873, + "grad_norm": 0.49583545327186584, + "learning_rate": 0.00019684967732327396, + "loss": 1.0986, + "step": 1818 + }, + { + "epoch": 0.3238960113960114, + 
"grad_norm": 0.5872161388397217, + "learning_rate": 0.0001968461906385478, + "loss": 1.1482, + "step": 1819 + }, + { + "epoch": 0.32407407407407407, + "grad_norm": 0.4509563148021698, + "learning_rate": 0.00019684270205632168, + "loss": 1.0578, + "step": 1820 + }, + { + "epoch": 0.32425213675213677, + "grad_norm": 0.501345157623291, + "learning_rate": 0.00019683921157666402, + "loss": 1.1792, + "step": 1821 + }, + { + "epoch": 0.3244301994301994, + "grad_norm": 0.48257577419281006, + "learning_rate": 0.00019683571919964314, + "loss": 1.0448, + "step": 1822 + }, + { + "epoch": 0.3246082621082621, + "grad_norm": 0.5399422645568848, + "learning_rate": 0.00019683222492532752, + "loss": 1.0579, + "step": 1823 + }, + { + "epoch": 0.3247863247863248, + "grad_norm": 0.4382506012916565, + "learning_rate": 0.0001968287287537856, + "loss": 1.0246, + "step": 1824 + }, + { + "epoch": 0.32496438746438744, + "grad_norm": 0.49247491359710693, + "learning_rate": 0.00019682523068508586, + "loss": 1.318, + "step": 1825 + }, + { + "epoch": 0.32514245014245013, + "grad_norm": 0.49067625403404236, + "learning_rate": 0.0001968217307192969, + "loss": 1.1028, + "step": 1826 + }, + { + "epoch": 0.32532051282051283, + "grad_norm": 0.4832286238670349, + "learning_rate": 0.00019681822885648723, + "loss": 1.0996, + "step": 1827 + }, + { + "epoch": 0.32549857549857547, + "grad_norm": 0.47144386172294617, + "learning_rate": 0.0001968147250967255, + "loss": 1.0707, + "step": 1828 + }, + { + "epoch": 0.32567663817663817, + "grad_norm": 0.46299225091934204, + "learning_rate": 0.0001968112194400803, + "loss": 1.0461, + "step": 1829 + }, + { + "epoch": 0.32585470085470086, + "grad_norm": 0.4880816340446472, + "learning_rate": 0.00019680771188662044, + "loss": 1.1198, + "step": 1830 + }, + { + "epoch": 0.32603276353276356, + "grad_norm": 0.43837276101112366, + "learning_rate": 0.00019680420243641452, + "loss": 1.0599, + "step": 1831 + }, + { + "epoch": 0.3262108262108262, + "grad_norm": 
0.453168660402298, + "learning_rate": 0.0001968006910895314, + "loss": 1.0327, + "step": 1832 + }, + { + "epoch": 0.3263888888888889, + "grad_norm": 0.45183828473091125, + "learning_rate": 0.00019679717784603975, + "loss": 1.1381, + "step": 1833 + }, + { + "epoch": 0.3265669515669516, + "grad_norm": 0.5326765775680542, + "learning_rate": 0.00019679366270600852, + "loss": 1.3169, + "step": 1834 + }, + { + "epoch": 0.32674501424501423, + "grad_norm": 0.47468429803848267, + "learning_rate": 0.00019679014566950653, + "loss": 1.1816, + "step": 1835 + }, + { + "epoch": 0.3269230769230769, + "grad_norm": 0.5096879005432129, + "learning_rate": 0.0001967866267366027, + "loss": 1.1162, + "step": 1836 + }, + { + "epoch": 0.3271011396011396, + "grad_norm": 0.491514652967453, + "learning_rate": 0.00019678310590736598, + "loss": 1.2793, + "step": 1837 + }, + { + "epoch": 0.32727920227920226, + "grad_norm": 0.601439356803894, + "learning_rate": 0.00019677958318186533, + "loss": 0.9851, + "step": 1838 + }, + { + "epoch": 0.32745726495726496, + "grad_norm": 0.45270970463752747, + "learning_rate": 0.0001967760585601698, + "loss": 1.0042, + "step": 1839 + }, + { + "epoch": 0.32763532763532766, + "grad_norm": 0.48864325881004333, + "learning_rate": 0.00019677253204234847, + "loss": 1.0835, + "step": 1840 + }, + { + "epoch": 0.3278133903133903, + "grad_norm": 0.5855685472488403, + "learning_rate": 0.00019676900362847037, + "loss": 1.193, + "step": 1841 + }, + { + "epoch": 0.327991452991453, + "grad_norm": 0.7181013822555542, + "learning_rate": 0.00019676547331860466, + "loss": 1.2028, + "step": 1842 + }, + { + "epoch": 0.3281695156695157, + "grad_norm": 0.4517378807067871, + "learning_rate": 0.00019676194111282054, + "loss": 1.013, + "step": 1843 + }, + { + "epoch": 0.32834757834757833, + "grad_norm": 0.5477756857872009, + "learning_rate": 0.00019675840701118718, + "loss": 1.2311, + "step": 1844 + }, + { + "epoch": 0.328525641025641, + "grad_norm": 0.5194997191429138, + 
"learning_rate": 0.00019675487101377382, + "loss": 1.0953, + "step": 1845 + }, + { + "epoch": 0.3287037037037037, + "grad_norm": 0.44454067945480347, + "learning_rate": 0.00019675133312064977, + "loss": 0.8505, + "step": 1846 + }, + { + "epoch": 0.32888176638176636, + "grad_norm": 0.3938713073730469, + "learning_rate": 0.00019674779333188428, + "loss": 0.8525, + "step": 1847 + }, + { + "epoch": 0.32905982905982906, + "grad_norm": 0.4927884340286255, + "learning_rate": 0.00019674425164754682, + "loss": 1.2477, + "step": 1848 + }, + { + "epoch": 0.32923789173789175, + "grad_norm": 0.4516635239124298, + "learning_rate": 0.0001967407080677067, + "loss": 0.8333, + "step": 1849 + }, + { + "epoch": 0.3294159544159544, + "grad_norm": 0.47105780243873596, + "learning_rate": 0.00019673716259243336, + "loss": 1.0989, + "step": 1850 + }, + { + "epoch": 0.3295940170940171, + "grad_norm": 0.5192127823829651, + "learning_rate": 0.00019673361522179627, + "loss": 1.1164, + "step": 1851 + }, + { + "epoch": 0.3297720797720798, + "grad_norm": 0.5222696661949158, + "learning_rate": 0.00019673006595586495, + "loss": 1.3191, + "step": 1852 + }, + { + "epoch": 0.3299501424501424, + "grad_norm": 0.6046679019927979, + "learning_rate": 0.0001967265147947089, + "loss": 0.9782, + "step": 1853 + }, + { + "epoch": 0.3301282051282051, + "grad_norm": 0.47928622364997864, + "learning_rate": 0.00019672296173839775, + "loss": 1.2247, + "step": 1854 + }, + { + "epoch": 0.3303062678062678, + "grad_norm": 0.5435982346534729, + "learning_rate": 0.00019671940678700107, + "loss": 1.1647, + "step": 1855 + }, + { + "epoch": 0.33048433048433046, + "grad_norm": 0.46878984570503235, + "learning_rate": 0.00019671584994058856, + "loss": 1.132, + "step": 1856 + }, + { + "epoch": 0.33066239316239315, + "grad_norm": 0.5336877107620239, + "learning_rate": 0.00019671229119922986, + "loss": 1.0583, + "step": 1857 + }, + { + "epoch": 0.33084045584045585, + "grad_norm": 0.4811093807220459, + "learning_rate": 
0.0001967087305629947, + "loss": 1.0089, + "step": 1858 + }, + { + "epoch": 0.33101851851851855, + "grad_norm": 0.5140184760093689, + "learning_rate": 0.0001967051680319529, + "loss": 1.2335, + "step": 1859 + }, + { + "epoch": 0.3311965811965812, + "grad_norm": 0.5855883955955505, + "learning_rate": 0.00019670160360617418, + "loss": 1.1107, + "step": 1860 + }, + { + "epoch": 0.3313746438746439, + "grad_norm": 0.5081531405448914, + "learning_rate": 0.00019669803728572844, + "loss": 1.0669, + "step": 1861 + }, + { + "epoch": 0.3315527065527066, + "grad_norm": 0.48749417066574097, + "learning_rate": 0.0001966944690706855, + "loss": 1.1465, + "step": 1862 + }, + { + "epoch": 0.3317307692307692, + "grad_norm": 0.5175687670707703, + "learning_rate": 0.00019669089896111536, + "loss": 1.254, + "step": 1863 + }, + { + "epoch": 0.3319088319088319, + "grad_norm": 0.4198860824108124, + "learning_rate": 0.0001966873269570879, + "loss": 0.9811, + "step": 1864 + }, + { + "epoch": 0.3320868945868946, + "grad_norm": 0.5220273733139038, + "learning_rate": 0.0001966837530586731, + "loss": 1.277, + "step": 1865 + }, + { + "epoch": 0.33226495726495725, + "grad_norm": 0.551954448223114, + "learning_rate": 0.00019668017726594101, + "loss": 1.0627, + "step": 1866 + }, + { + "epoch": 0.33244301994301995, + "grad_norm": 0.5289301872253418, + "learning_rate": 0.00019667659957896166, + "loss": 1.4525, + "step": 1867 + }, + { + "epoch": 0.33262108262108264, + "grad_norm": 0.5190161466598511, + "learning_rate": 0.00019667301999780522, + "loss": 1.1064, + "step": 1868 + }, + { + "epoch": 0.3327991452991453, + "grad_norm": 0.437637060880661, + "learning_rate": 0.00019666943852254172, + "loss": 1.1304, + "step": 1869 + }, + { + "epoch": 0.332977207977208, + "grad_norm": 0.4801286458969116, + "learning_rate": 0.00019666585515324138, + "loss": 1.032, + "step": 1870 + }, + { + "epoch": 0.3331552706552707, + "grad_norm": 0.5041908621788025, + "learning_rate": 0.00019666226988997445, + "loss": 1.2611, 
+ "step": 1871 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.4529375731945038, + "learning_rate": 0.00019665868273281115, + "loss": 1.1346, + "step": 1872 + }, + { + "epoch": 0.333511396011396, + "grad_norm": 0.4797019064426422, + "learning_rate": 0.00019665509368182172, + "loss": 1.1716, + "step": 1873 + }, + { + "epoch": 0.3336894586894587, + "grad_norm": 0.5505055785179138, + "learning_rate": 0.00019665150273707652, + "loss": 0.9729, + "step": 1874 + }, + { + "epoch": 0.33386752136752135, + "grad_norm": 0.4228051006793976, + "learning_rate": 0.00019664790989864592, + "loss": 0.9023, + "step": 1875 + }, + { + "epoch": 0.33404558404558404, + "grad_norm": 0.4926959276199341, + "learning_rate": 0.00019664431516660028, + "loss": 1.0999, + "step": 1876 + }, + { + "epoch": 0.33422364672364674, + "grad_norm": 0.4273219704627991, + "learning_rate": 0.00019664071854101005, + "loss": 1.1039, + "step": 1877 + }, + { + "epoch": 0.3344017094017094, + "grad_norm": 0.48438936471939087, + "learning_rate": 0.00019663712002194566, + "loss": 1.1308, + "step": 1878 + }, + { + "epoch": 0.3345797720797721, + "grad_norm": 0.5102053284645081, + "learning_rate": 0.0001966335196094777, + "loss": 1.0618, + "step": 1879 + }, + { + "epoch": 0.33475783475783477, + "grad_norm": 0.4357300400733948, + "learning_rate": 0.00019662991730367663, + "loss": 1.0521, + "step": 1880 + }, + { + "epoch": 0.3349358974358974, + "grad_norm": 0.5052695870399475, + "learning_rate": 0.00019662631310461308, + "loss": 0.9579, + "step": 1881 + }, + { + "epoch": 0.3351139601139601, + "grad_norm": 0.4889117181301117, + "learning_rate": 0.00019662270701235762, + "loss": 1.0304, + "step": 1882 + }, + { + "epoch": 0.3352920227920228, + "grad_norm": 0.4671195149421692, + "learning_rate": 0.000196619099026981, + "loss": 1.2228, + "step": 1883 + }, + { + "epoch": 0.33547008547008544, + "grad_norm": 0.4700174331665039, + "learning_rate": 0.0001966154891485538, + "loss": 0.9634, + "step": 1884 + }, + { + "epoch": 
0.33564814814814814, + "grad_norm": 0.488817423582077, + "learning_rate": 0.00019661187737714676, + "loss": 1.2499, + "step": 1885 + }, + { + "epoch": 0.33582621082621084, + "grad_norm": 0.5336169600486755, + "learning_rate": 0.00019660826371283073, + "loss": 1.251, + "step": 1886 + }, + { + "epoch": 0.33600427350427353, + "grad_norm": 0.5054540038108826, + "learning_rate": 0.00019660464815567642, + "loss": 1.221, + "step": 1887 + }, + { + "epoch": 0.33618233618233617, + "grad_norm": 0.5078747868537903, + "learning_rate": 0.00019660103070575472, + "loss": 0.9792, + "step": 1888 + }, + { + "epoch": 0.33636039886039887, + "grad_norm": 0.498571515083313, + "learning_rate": 0.0001965974113631365, + "loss": 1.1682, + "step": 1889 + }, + { + "epoch": 0.33653846153846156, + "grad_norm": 0.49969518184661865, + "learning_rate": 0.00019659379012789264, + "loss": 1.0012, + "step": 1890 + }, + { + "epoch": 0.3367165242165242, + "grad_norm": 0.4238094687461853, + "learning_rate": 0.00019659016700009416, + "loss": 1.0455, + "step": 1891 + }, + { + "epoch": 0.3368945868945869, + "grad_norm": 0.5139104723930359, + "learning_rate": 0.000196586541979812, + "loss": 0.9979, + "step": 1892 + }, + { + "epoch": 0.3370726495726496, + "grad_norm": 0.5446547269821167, + "learning_rate": 0.00019658291506711715, + "loss": 0.9271, + "step": 1893 + }, + { + "epoch": 0.33725071225071224, + "grad_norm": 0.5284572839736938, + "learning_rate": 0.00019657928626208077, + "loss": 1.0356, + "step": 1894 + }, + { + "epoch": 0.33742877492877493, + "grad_norm": 0.49936217069625854, + "learning_rate": 0.00019657565556477387, + "loss": 0.9785, + "step": 1895 + }, + { + "epoch": 0.33760683760683763, + "grad_norm": 0.4678729772567749, + "learning_rate": 0.00019657202297526763, + "loss": 1.2135, + "step": 1896 + }, + { + "epoch": 0.33778490028490027, + "grad_norm": 0.46844249963760376, + "learning_rate": 0.0001965683884936332, + "loss": 0.9369, + "step": 1897 + }, + { + "epoch": 0.33796296296296297, + 
"grad_norm": 0.4307389557361603, + "learning_rate": 0.0001965647521199418, + "loss": 0.9301, + "step": 1898 + }, + { + "epoch": 0.33814102564102566, + "grad_norm": 0.48227834701538086, + "learning_rate": 0.00019656111385426468, + "loss": 1.3169, + "step": 1899 + }, + { + "epoch": 0.3383190883190883, + "grad_norm": 0.45860713720321655, + "learning_rate": 0.00019655747369667315, + "loss": 0.9835, + "step": 1900 + }, + { + "epoch": 0.338497150997151, + "grad_norm": 0.5522414445877075, + "learning_rate": 0.00019655383164723846, + "loss": 1.363, + "step": 1901 + }, + { + "epoch": 0.3386752136752137, + "grad_norm": 0.5283710360527039, + "learning_rate": 0.000196550187706032, + "loss": 1.1499, + "step": 1902 + }, + { + "epoch": 0.33885327635327633, + "grad_norm": 0.4419134259223938, + "learning_rate": 0.00019654654187312525, + "loss": 1.2039, + "step": 1903 + }, + { + "epoch": 0.33903133903133903, + "grad_norm": 0.49066096544265747, + "learning_rate": 0.00019654289414858952, + "loss": 0.9707, + "step": 1904 + }, + { + "epoch": 0.3392094017094017, + "grad_norm": 0.4619338810443878, + "learning_rate": 0.00019653924453249633, + "loss": 1.0849, + "step": 1905 + }, + { + "epoch": 0.33938746438746437, + "grad_norm": 0.5191119313240051, + "learning_rate": 0.0001965355930249172, + "loss": 1.1387, + "step": 1906 + }, + { + "epoch": 0.33956552706552706, + "grad_norm": 0.5245711207389832, + "learning_rate": 0.00019653193962592368, + "loss": 1.3435, + "step": 1907 + }, + { + "epoch": 0.33974358974358976, + "grad_norm": 0.49562904238700867, + "learning_rate": 0.0001965282843355873, + "loss": 1.2781, + "step": 1908 + }, + { + "epoch": 0.3399216524216524, + "grad_norm": 0.4661353826522827, + "learning_rate": 0.0001965246271539797, + "loss": 0.9317, + "step": 1909 + }, + { + "epoch": 0.3400997150997151, + "grad_norm": 0.4723222851753235, + "learning_rate": 0.00019652096808117254, + "loss": 1.0733, + "step": 1910 + }, + { + "epoch": 0.3402777777777778, + "grad_norm": 0.4358505308628082, + 
"learning_rate": 0.00019651730711723754, + "loss": 1.1461, + "step": 1911 + }, + { + "epoch": 0.34045584045584043, + "grad_norm": 0.462422251701355, + "learning_rate": 0.00019651364426224638, + "loss": 1.0914, + "step": 1912 + }, + { + "epoch": 0.3406339031339031, + "grad_norm": 0.47952914237976074, + "learning_rate": 0.0001965099795162709, + "loss": 1.0392, + "step": 1913 + }, + { + "epoch": 0.3408119658119658, + "grad_norm": 0.5036373734474182, + "learning_rate": 0.00019650631287938282, + "loss": 1.4002, + "step": 1914 + }, + { + "epoch": 0.34099002849002846, + "grad_norm": 0.5130090713500977, + "learning_rate": 0.000196502644351654, + "loss": 1.3499, + "step": 1915 + }, + { + "epoch": 0.34116809116809116, + "grad_norm": 0.4426332414150238, + "learning_rate": 0.00019649897393315635, + "loss": 1.0726, + "step": 1916 + }, + { + "epoch": 0.34134615384615385, + "grad_norm": 0.5580727458000183, + "learning_rate": 0.00019649530162396176, + "loss": 1.1164, + "step": 1917 + }, + { + "epoch": 0.34152421652421655, + "grad_norm": 0.545001745223999, + "learning_rate": 0.00019649162742414218, + "loss": 0.962, + "step": 1918 + }, + { + "epoch": 0.3417022792022792, + "grad_norm": 0.5225808024406433, + "learning_rate": 0.00019648795133376962, + "loss": 1.1415, + "step": 1919 + }, + { + "epoch": 0.3418803418803419, + "grad_norm": 0.48210129141807556, + "learning_rate": 0.0001964842733529161, + "loss": 1.1188, + "step": 1920 + }, + { + "epoch": 0.3420584045584046, + "grad_norm": 0.4515395164489746, + "learning_rate": 0.00019648059348165365, + "loss": 1.0828, + "step": 1921 + }, + { + "epoch": 0.3422364672364672, + "grad_norm": 0.5802633166313171, + "learning_rate": 0.0001964769117200544, + "loss": 1.3137, + "step": 1922 + }, + { + "epoch": 0.3424145299145299, + "grad_norm": 0.4432032108306885, + "learning_rate": 0.00019647322806819046, + "loss": 1.0523, + "step": 1923 + }, + { + "epoch": 0.3425925925925926, + "grad_norm": 0.4697614908218384, + "learning_rate": 
0.00019646954252613402, + "loss": 0.8426, + "step": 1924 + }, + { + "epoch": 0.34277065527065526, + "grad_norm": 0.4610968232154846, + "learning_rate": 0.0001964658550939573, + "loss": 0.9826, + "step": 1925 + }, + { + "epoch": 0.34294871794871795, + "grad_norm": 0.5278257727622986, + "learning_rate": 0.00019646216577173258, + "loss": 1.1064, + "step": 1926 + }, + { + "epoch": 0.34312678062678065, + "grad_norm": 0.5686144232749939, + "learning_rate": 0.00019645847455953205, + "loss": 0.9138, + "step": 1927 + }, + { + "epoch": 0.3433048433048433, + "grad_norm": 0.42894792556762695, + "learning_rate": 0.0001964547814574281, + "loss": 1.0461, + "step": 1928 + }, + { + "epoch": 0.343482905982906, + "grad_norm": 0.5567317605018616, + "learning_rate": 0.0001964510864654931, + "loss": 0.8787, + "step": 1929 + }, + { + "epoch": 0.3436609686609687, + "grad_norm": 0.5015586614608765, + "learning_rate": 0.0001964473895837994, + "loss": 1.1406, + "step": 1930 + }, + { + "epoch": 0.3438390313390313, + "grad_norm": 0.47391530871391296, + "learning_rate": 0.00019644369081241948, + "loss": 1.0685, + "step": 1931 + }, + { + "epoch": 0.344017094017094, + "grad_norm": 0.546037495136261, + "learning_rate": 0.00019643999015142574, + "loss": 1.2349, + "step": 1932 + }, + { + "epoch": 0.3441951566951567, + "grad_norm": 0.4724953770637512, + "learning_rate": 0.00019643628760089078, + "loss": 1.0621, + "step": 1933 + }, + { + "epoch": 0.34437321937321935, + "grad_norm": 0.5644593834877014, + "learning_rate": 0.00019643258316088703, + "loss": 1.2559, + "step": 1934 + }, + { + "epoch": 0.34455128205128205, + "grad_norm": 0.500815749168396, + "learning_rate": 0.00019642887683148718, + "loss": 1.0439, + "step": 1935 + }, + { + "epoch": 0.34472934472934474, + "grad_norm": 0.4932316541671753, + "learning_rate": 0.0001964251686127638, + "loss": 1.0404, + "step": 1936 + }, + { + "epoch": 0.3449074074074074, + "grad_norm": 0.48494651913642883, + "learning_rate": 0.00019642145850478954, + "loss": 
0.9951, + "step": 1937 + }, + { + "epoch": 0.3450854700854701, + "grad_norm": 0.5191963315010071, + "learning_rate": 0.00019641774650763706, + "loss": 1.1258, + "step": 1938 + }, + { + "epoch": 0.3452635327635328, + "grad_norm": 0.4439312815666199, + "learning_rate": 0.00019641403262137918, + "loss": 1.1158, + "step": 1939 + }, + { + "epoch": 0.3454415954415954, + "grad_norm": 0.4829137921333313, + "learning_rate": 0.0001964103168460886, + "loss": 1.0531, + "step": 1940 + }, + { + "epoch": 0.3456196581196581, + "grad_norm": 0.49433329701423645, + "learning_rate": 0.00019640659918183811, + "loss": 1.1295, + "step": 1941 + }, + { + "epoch": 0.3457977207977208, + "grad_norm": 0.5351347923278809, + "learning_rate": 0.00019640287962870062, + "loss": 1.2379, + "step": 1942 + }, + { + "epoch": 0.34597578347578345, + "grad_norm": 0.4845680892467499, + "learning_rate": 0.00019639915818674895, + "loss": 1.0197, + "step": 1943 + }, + { + "epoch": 0.34615384615384615, + "grad_norm": 0.5312514901161194, + "learning_rate": 0.00019639543485605604, + "loss": 0.9734, + "step": 1944 + }, + { + "epoch": 0.34633190883190884, + "grad_norm": 0.4571874737739563, + "learning_rate": 0.00019639170963669478, + "loss": 1.1012, + "step": 1945 + }, + { + "epoch": 0.34650997150997154, + "grad_norm": 0.4449031949043274, + "learning_rate": 0.00019638798252873824, + "loss": 1.1393, + "step": 1946 + }, + { + "epoch": 0.3466880341880342, + "grad_norm": 0.47470834851264954, + "learning_rate": 0.0001963842535322594, + "loss": 0.981, + "step": 1947 + }, + { + "epoch": 0.3468660968660969, + "grad_norm": 0.5386981964111328, + "learning_rate": 0.00019638052264733132, + "loss": 1.1247, + "step": 1948 + }, + { + "epoch": 0.34704415954415957, + "grad_norm": 0.535589873790741, + "learning_rate": 0.00019637678987402714, + "loss": 1.3157, + "step": 1949 + }, + { + "epoch": 0.3472222222222222, + "grad_norm": 0.49338245391845703, + "learning_rate": 0.00019637305521242, + "loss": 1.1066, + "step": 1950 + }, + { + 
"epoch": 0.3474002849002849, + "grad_norm": 0.4247688353061676, + "learning_rate": 0.00019636931866258298, + "loss": 1.0039, + "step": 1951 + }, + { + "epoch": 0.3475783475783476, + "grad_norm": 0.5351517200469971, + "learning_rate": 0.00019636558022458934, + "loss": 1.0344, + "step": 1952 + }, + { + "epoch": 0.34775641025641024, + "grad_norm": 0.4633362889289856, + "learning_rate": 0.00019636183989851238, + "loss": 1.1383, + "step": 1953 + }, + { + "epoch": 0.34793447293447294, + "grad_norm": 0.553709089756012, + "learning_rate": 0.00019635809768442535, + "loss": 1.0389, + "step": 1954 + }, + { + "epoch": 0.34811253561253563, + "grad_norm": 0.479374498128891, + "learning_rate": 0.00019635435358240154, + "loss": 1.1774, + "step": 1955 + }, + { + "epoch": 0.3482905982905983, + "grad_norm": 0.5274081230163574, + "learning_rate": 0.0001963506075925143, + "loss": 1.1809, + "step": 1956 + }, + { + "epoch": 0.34846866096866097, + "grad_norm": 0.45398542284965515, + "learning_rate": 0.0001963468597148371, + "loss": 1.0502, + "step": 1957 + }, + { + "epoch": 0.34864672364672367, + "grad_norm": 0.48201611638069153, + "learning_rate": 0.00019634310994944332, + "loss": 1.0557, + "step": 1958 + }, + { + "epoch": 0.3488247863247863, + "grad_norm": 0.6407544016838074, + "learning_rate": 0.00019633935829640642, + "loss": 1.2138, + "step": 1959 + }, + { + "epoch": 0.349002849002849, + "grad_norm": 0.5385687351226807, + "learning_rate": 0.00019633560475579995, + "loss": 1.3496, + "step": 1960 + }, + { + "epoch": 0.3491809116809117, + "grad_norm": 0.5260964035987854, + "learning_rate": 0.0001963318493276974, + "loss": 1.0253, + "step": 1961 + }, + { + "epoch": 0.34935897435897434, + "grad_norm": 0.48478585481643677, + "learning_rate": 0.00019632809201217238, + "loss": 1.137, + "step": 1962 + }, + { + "epoch": 0.34953703703703703, + "grad_norm": 0.620033860206604, + "learning_rate": 0.0001963243328092985, + "loss": 1.3445, + "step": 1963 + }, + { + "epoch": 0.34971509971509973, + 
"grad_norm": 0.5149700045585632, + "learning_rate": 0.00019632057171914942, + "loss": 1.1042, + "step": 1964 + }, + { + "epoch": 0.34989316239316237, + "grad_norm": 0.42695048451423645, + "learning_rate": 0.0001963168087417988, + "loss": 0.8789, + "step": 1965 + }, + { + "epoch": 0.35007122507122507, + "grad_norm": 0.5281283855438232, + "learning_rate": 0.00019631304387732044, + "loss": 1.1155, + "step": 1966 + }, + { + "epoch": 0.35024928774928776, + "grad_norm": 0.4994089901447296, + "learning_rate": 0.00019630927712578804, + "loss": 1.1226, + "step": 1967 + }, + { + "epoch": 0.3504273504273504, + "grad_norm": 0.4433288276195526, + "learning_rate": 0.0001963055084872754, + "loss": 1.0262, + "step": 1968 + }, + { + "epoch": 0.3506054131054131, + "grad_norm": 0.46541857719421387, + "learning_rate": 0.0001963017379618564, + "loss": 1.1438, + "step": 1969 + }, + { + "epoch": 0.3507834757834758, + "grad_norm": 0.5097604393959045, + "learning_rate": 0.00019629796554960488, + "loss": 0.9641, + "step": 1970 + }, + { + "epoch": 0.35096153846153844, + "grad_norm": 0.49461981654167175, + "learning_rate": 0.00019629419125059478, + "loss": 1.1765, + "step": 1971 + }, + { + "epoch": 0.35113960113960113, + "grad_norm": 0.4763339161872864, + "learning_rate": 0.00019629041506490005, + "loss": 1.0527, + "step": 1972 + }, + { + "epoch": 0.35131766381766383, + "grad_norm": 0.4528443217277527, + "learning_rate": 0.00019628663699259463, + "loss": 1.1409, + "step": 1973 + }, + { + "epoch": 0.35149572649572647, + "grad_norm": 0.4436309039592743, + "learning_rate": 0.00019628285703375258, + "loss": 1.0459, + "step": 1974 + }, + { + "epoch": 0.35167378917378916, + "grad_norm": 0.5146129727363586, + "learning_rate": 0.00019627907518844797, + "loss": 1.2527, + "step": 1975 + }, + { + "epoch": 0.35185185185185186, + "grad_norm": 0.5202171802520752, + "learning_rate": 0.0001962752914567549, + "loss": 1.226, + "step": 1976 + }, + { + "epoch": 0.35202991452991456, + "grad_norm": 
0.5267411470413208, + "learning_rate": 0.00019627150583874747, + "loss": 1.0898, + "step": 1977 + }, + { + "epoch": 0.3522079772079772, + "grad_norm": 0.546840250492096, + "learning_rate": 0.00019626771833449987, + "loss": 1.1716, + "step": 1978 + }, + { + "epoch": 0.3523860398860399, + "grad_norm": 0.5525290966033936, + "learning_rate": 0.0001962639289440863, + "loss": 1.1762, + "step": 1979 + }, + { + "epoch": 0.3525641025641026, + "grad_norm": 0.48967215418815613, + "learning_rate": 0.000196260137667581, + "loss": 1.1884, + "step": 1980 + }, + { + "epoch": 0.35274216524216523, + "grad_norm": 0.5908235907554626, + "learning_rate": 0.0001962563445050583, + "loss": 1.1887, + "step": 1981 + }, + { + "epoch": 0.3529202279202279, + "grad_norm": 0.46708086133003235, + "learning_rate": 0.00019625254945659245, + "loss": 0.8842, + "step": 1982 + }, + { + "epoch": 0.3530982905982906, + "grad_norm": 0.41652458906173706, + "learning_rate": 0.00019624875252225788, + "loss": 1.0268, + "step": 1983 + }, + { + "epoch": 0.35327635327635326, + "grad_norm": 0.5084529519081116, + "learning_rate": 0.00019624495370212892, + "loss": 1.0547, + "step": 1984 + }, + { + "epoch": 0.35345441595441596, + "grad_norm": 0.5667507648468018, + "learning_rate": 0.00019624115299628003, + "loss": 1.0656, + "step": 1985 + }, + { + "epoch": 0.35363247863247865, + "grad_norm": 0.5022873282432556, + "learning_rate": 0.00019623735040478568, + "loss": 1.0627, + "step": 1986 + }, + { + "epoch": 0.3538105413105413, + "grad_norm": 0.48342058062553406, + "learning_rate": 0.00019623354592772035, + "loss": 1.0976, + "step": 1987 + }, + { + "epoch": 0.353988603988604, + "grad_norm": 0.48117366433143616, + "learning_rate": 0.0001962297395651586, + "loss": 1.0515, + "step": 1988 + }, + { + "epoch": 0.3541666666666667, + "grad_norm": 0.492564857006073, + "learning_rate": 0.000196225931317175, + "loss": 1.1957, + "step": 1989 + }, + { + "epoch": 0.3543447293447293, + "grad_norm": 0.4756208658218384, + 
"learning_rate": 0.00019622212118384417, + "loss": 1.007, + "step": 1990 + }, + { + "epoch": 0.354522792022792, + "grad_norm": 0.581930935382843, + "learning_rate": 0.00019621830916524076, + "loss": 1.232, + "step": 1991 + }, + { + "epoch": 0.3547008547008547, + "grad_norm": 0.480064332485199, + "learning_rate": 0.00019621449526143947, + "loss": 1.2693, + "step": 1992 + }, + { + "epoch": 0.35487891737891736, + "grad_norm": 0.5679123401641846, + "learning_rate": 0.000196210679472515, + "loss": 1.2985, + "step": 1993 + }, + { + "epoch": 0.35505698005698005, + "grad_norm": 0.43757280707359314, + "learning_rate": 0.00019620686179854213, + "loss": 1.1387, + "step": 1994 + }, + { + "epoch": 0.35523504273504275, + "grad_norm": 0.4950634837150574, + "learning_rate": 0.00019620304223959566, + "loss": 1.1809, + "step": 1995 + }, + { + "epoch": 0.3554131054131054, + "grad_norm": 0.5574113726615906, + "learning_rate": 0.00019619922079575043, + "loss": 1.2434, + "step": 1996 + }, + { + "epoch": 0.3555911680911681, + "grad_norm": 0.5154930949211121, + "learning_rate": 0.00019619539746708128, + "loss": 1.1747, + "step": 1997 + }, + { + "epoch": 0.3557692307692308, + "grad_norm": 0.4377825856208801, + "learning_rate": 0.00019619157225366315, + "loss": 0.9547, + "step": 1998 + }, + { + "epoch": 0.3559472934472934, + "grad_norm": 0.530714213848114, + "learning_rate": 0.00019618774515557097, + "loss": 1.2057, + "step": 1999 + }, + { + "epoch": 0.3561253561253561, + "grad_norm": 0.5703464150428772, + "learning_rate": 0.00019618391617287978, + "loss": 1.3068, + "step": 2000 + }, + { + "epoch": 0.3563034188034188, + "grad_norm": 0.4862228333950043, + "learning_rate": 0.0001961800853056645, + "loss": 1.0077, + "step": 2001 + }, + { + "epoch": 0.35648148148148145, + "grad_norm": 0.5575395822525024, + "learning_rate": 0.00019617625255400028, + "loss": 1.03, + "step": 2002 + }, + { + "epoch": 0.35665954415954415, + "grad_norm": 0.4826279580593109, + "learning_rate": 0.0001961724179179622, + 
"loss": 1.268, + "step": 2003 + }, + { + "epoch": 0.35683760683760685, + "grad_norm": 0.49423274397850037, + "learning_rate": 0.00019616858139762534, + "loss": 1.1305, + "step": 2004 + }, + { + "epoch": 0.35701566951566954, + "grad_norm": 0.5208541750907898, + "learning_rate": 0.00019616474299306491, + "loss": 1.1651, + "step": 2005 + }, + { + "epoch": 0.3571937321937322, + "grad_norm": 0.5324164032936096, + "learning_rate": 0.0001961609027043561, + "loss": 1.1406, + "step": 2006 + }, + { + "epoch": 0.3573717948717949, + "grad_norm": 0.45385462045669556, + "learning_rate": 0.00019615706053157416, + "loss": 1.0716, + "step": 2007 + }, + { + "epoch": 0.3575498575498576, + "grad_norm": 0.5016173720359802, + "learning_rate": 0.00019615321647479438, + "loss": 1.0878, + "step": 2008 + }, + { + "epoch": 0.3577279202279202, + "grad_norm": 0.5073097348213196, + "learning_rate": 0.00019614937053409205, + "loss": 1.237, + "step": 2009 + }, + { + "epoch": 0.3579059829059829, + "grad_norm": 0.48880141973495483, + "learning_rate": 0.00019614552270954256, + "loss": 0.8794, + "step": 2010 + }, + { + "epoch": 0.3580840455840456, + "grad_norm": 0.43902209401130676, + "learning_rate": 0.00019614167300122126, + "loss": 0.912, + "step": 2011 + }, + { + "epoch": 0.35826210826210825, + "grad_norm": 0.42809322476387024, + "learning_rate": 0.0001961378214092036, + "loss": 0.7804, + "step": 2012 + }, + { + "epoch": 0.35844017094017094, + "grad_norm": 0.4464281499385834, + "learning_rate": 0.00019613396793356503, + "loss": 1.0004, + "step": 2013 + }, + { + "epoch": 0.35861823361823364, + "grad_norm": 0.49085676670074463, + "learning_rate": 0.00019613011257438109, + "loss": 1.1087, + "step": 2014 + }, + { + "epoch": 0.3587962962962963, + "grad_norm": 0.4997732937335968, + "learning_rate": 0.00019612625533172725, + "loss": 0.9591, + "step": 2015 + }, + { + "epoch": 0.358974358974359, + "grad_norm": 0.48442545533180237, + "learning_rate": 0.00019612239620567912, + "loss": 0.9744, + "step": 2016 
+ }, + { + "epoch": 0.35915242165242167, + "grad_norm": 0.4989205002784729, + "learning_rate": 0.00019611853519631233, + "loss": 0.9844, + "step": 2017 + }, + { + "epoch": 0.3593304843304843, + "grad_norm": 0.6107521653175354, + "learning_rate": 0.00019611467230370248, + "loss": 1.147, + "step": 2018 + }, + { + "epoch": 0.359508547008547, + "grad_norm": 0.5594844818115234, + "learning_rate": 0.00019611080752792535, + "loss": 1.3195, + "step": 2019 + }, + { + "epoch": 0.3596866096866097, + "grad_norm": 0.4786946475505829, + "learning_rate": 0.00019610694086905656, + "loss": 1.2108, + "step": 2020 + }, + { + "epoch": 0.35986467236467234, + "grad_norm": 0.5186030268669128, + "learning_rate": 0.0001961030723271719, + "loss": 1.0008, + "step": 2021 + }, + { + "epoch": 0.36004273504273504, + "grad_norm": 0.4520573318004608, + "learning_rate": 0.0001960992019023472, + "loss": 1.1307, + "step": 2022 + }, + { + "epoch": 0.36022079772079774, + "grad_norm": 0.4983210563659668, + "learning_rate": 0.00019609532959465823, + "loss": 1.1486, + "step": 2023 + }, + { + "epoch": 0.3603988603988604, + "grad_norm": 0.6209200024604797, + "learning_rate": 0.00019609145540418094, + "loss": 1.2566, + "step": 2024 + }, + { + "epoch": 0.3605769230769231, + "grad_norm": 0.47047603130340576, + "learning_rate": 0.00019608757933099117, + "loss": 1.1588, + "step": 2025 + }, + { + "epoch": 0.36075498575498577, + "grad_norm": 0.5147389769554138, + "learning_rate": 0.0001960837013751649, + "loss": 1.2113, + "step": 2026 + }, + { + "epoch": 0.3609330484330484, + "grad_norm": 0.45826098322868347, + "learning_rate": 0.00019607982153677808, + "loss": 1.13, + "step": 2027 + }, + { + "epoch": 0.3611111111111111, + "grad_norm": 0.5699561834335327, + "learning_rate": 0.00019607593981590675, + "loss": 1.2476, + "step": 2028 + }, + { + "epoch": 0.3612891737891738, + "grad_norm": 0.5349239110946655, + "learning_rate": 0.000196072056212627, + "loss": 1.2295, + "step": 2029 + }, + { + "epoch": 
0.36146723646723644, + "grad_norm": 0.6212165355682373, + "learning_rate": 0.00019606817072701484, + "loss": 1.1965, + "step": 2030 + }, + { + "epoch": 0.36164529914529914, + "grad_norm": 0.4870990216732025, + "learning_rate": 0.00019606428335914645, + "loss": 1.4464, + "step": 2031 + }, + { + "epoch": 0.36182336182336183, + "grad_norm": 0.42427828907966614, + "learning_rate": 0.00019606039410909797, + "loss": 1.1546, + "step": 2032 + }, + { + "epoch": 0.36200142450142453, + "grad_norm": 0.5081788301467896, + "learning_rate": 0.0001960565029769456, + "loss": 1.1867, + "step": 2033 + }, + { + "epoch": 0.36217948717948717, + "grad_norm": 0.4813104271888733, + "learning_rate": 0.00019605260996276565, + "loss": 1.3726, + "step": 2034 + }, + { + "epoch": 0.36235754985754987, + "grad_norm": 0.4648851156234741, + "learning_rate": 0.0001960487150666343, + "loss": 1.2434, + "step": 2035 + }, + { + "epoch": 0.36253561253561256, + "grad_norm": 0.484161913394928, + "learning_rate": 0.00019604481828862792, + "loss": 1.1309, + "step": 2036 + }, + { + "epoch": 0.3627136752136752, + "grad_norm": 0.4929439127445221, + "learning_rate": 0.00019604091962882283, + "loss": 1.1007, + "step": 2037 + }, + { + "epoch": 0.3628917378917379, + "grad_norm": 0.45599642395973206, + "learning_rate": 0.00019603701908729544, + "loss": 1.2628, + "step": 2038 + }, + { + "epoch": 0.3630698005698006, + "grad_norm": 0.45295149087905884, + "learning_rate": 0.00019603311666412213, + "loss": 0.9808, + "step": 2039 + }, + { + "epoch": 0.36324786324786323, + "grad_norm": 0.48681163787841797, + "learning_rate": 0.00019602921235937942, + "loss": 1.0574, + "step": 2040 + }, + { + "epoch": 0.36342592592592593, + "grad_norm": 0.41232365369796753, + "learning_rate": 0.00019602530617314378, + "loss": 1.0454, + "step": 2041 + }, + { + "epoch": 0.3636039886039886, + "grad_norm": 0.46214723587036133, + "learning_rate": 0.00019602139810549174, + "loss": 0.9985, + "step": 2042 + }, + { + "epoch": 0.36378205128205127, + 
"grad_norm": 0.44307878613471985, + "learning_rate": 0.00019601748815649989, + "loss": 0.9683, + "step": 2043 + }, + { + "epoch": 0.36396011396011396, + "grad_norm": 0.4809451401233673, + "learning_rate": 0.00019601357632624477, + "loss": 1.028, + "step": 2044 + }, + { + "epoch": 0.36413817663817666, + "grad_norm": 0.4638497531414032, + "learning_rate": 0.0001960096626148031, + "loss": 0.9851, + "step": 2045 + }, + { + "epoch": 0.3643162393162393, + "grad_norm": 0.5942164063453674, + "learning_rate": 0.00019600574702225153, + "loss": 1.1606, + "step": 2046 + }, + { + "epoch": 0.364494301994302, + "grad_norm": 0.5171293616294861, + "learning_rate": 0.00019600182954866675, + "loss": 1.2335, + "step": 2047 + }, + { + "epoch": 0.3646723646723647, + "grad_norm": 0.5294404625892639, + "learning_rate": 0.00019599791019412558, + "loss": 1.0966, + "step": 2048 + }, + { + "epoch": 0.36485042735042733, + "grad_norm": 0.46117448806762695, + "learning_rate": 0.00019599398895870477, + "loss": 1.0565, + "step": 2049 + }, + { + "epoch": 0.36502849002849, + "grad_norm": 0.5385118126869202, + "learning_rate": 0.00019599006584248118, + "loss": 1.0076, + "step": 2050 + }, + { + "epoch": 0.3652065527065527, + "grad_norm": 0.4915166199207306, + "learning_rate": 0.00019598614084553165, + "loss": 0.9686, + "step": 2051 + }, + { + "epoch": 0.36538461538461536, + "grad_norm": 0.46769094467163086, + "learning_rate": 0.00019598221396793303, + "loss": 1.1217, + "step": 2052 + }, + { + "epoch": 0.36556267806267806, + "grad_norm": 0.5440493822097778, + "learning_rate": 0.00019597828520976236, + "loss": 1.2344, + "step": 2053 + }, + { + "epoch": 0.36574074074074076, + "grad_norm": 0.616727352142334, + "learning_rate": 0.00019597435457109657, + "loss": 1.2953, + "step": 2054 + }, + { + "epoch": 0.3659188034188034, + "grad_norm": 0.4859183430671692, + "learning_rate": 0.00019597042205201265, + "loss": 1.16, + "step": 2055 + }, + { + "epoch": 0.3660968660968661, + "grad_norm": 0.47056329250335693, + 
"learning_rate": 0.0001959664876525877, + "loss": 0.9982, + "step": 2056 + }, + { + "epoch": 0.3662749287749288, + "grad_norm": 0.48347967863082886, + "learning_rate": 0.00019596255137289875, + "loss": 1.0966, + "step": 2057 + }, + { + "epoch": 0.36645299145299143, + "grad_norm": 0.5068454742431641, + "learning_rate": 0.00019595861321302296, + "loss": 1.2891, + "step": 2058 + }, + { + "epoch": 0.3666310541310541, + "grad_norm": 0.5702359080314636, + "learning_rate": 0.00019595467317303747, + "loss": 1.1394, + "step": 2059 + }, + { + "epoch": 0.3668091168091168, + "grad_norm": 0.5028812885284424, + "learning_rate": 0.0001959507312530195, + "loss": 1.2324, + "step": 2060 + }, + { + "epoch": 0.36698717948717946, + "grad_norm": 0.4672880172729492, + "learning_rate": 0.00019594678745304628, + "loss": 1.0581, + "step": 2061 + }, + { + "epoch": 0.36716524216524216, + "grad_norm": 0.5233900547027588, + "learning_rate": 0.00019594284177319504, + "loss": 1.138, + "step": 2062 + }, + { + "epoch": 0.36734330484330485, + "grad_norm": 0.46871712803840637, + "learning_rate": 0.00019593889421354316, + "loss": 1.2159, + "step": 2063 + }, + { + "epoch": 0.36752136752136755, + "grad_norm": 0.5180533528327942, + "learning_rate": 0.00019593494477416793, + "loss": 1.1116, + "step": 2064 + }, + { + "epoch": 0.3676994301994302, + "grad_norm": 0.5398494005203247, + "learning_rate": 0.0001959309934551467, + "loss": 1.2038, + "step": 2065 + }, + { + "epoch": 0.3678774928774929, + "grad_norm": 0.4850373864173889, + "learning_rate": 0.000195927040256557, + "loss": 1.4315, + "step": 2066 + }, + { + "epoch": 0.3680555555555556, + "grad_norm": 0.49190905690193176, + "learning_rate": 0.0001959230851784762, + "loss": 0.9993, + "step": 2067 + }, + { + "epoch": 0.3682336182336182, + "grad_norm": 0.4546903073787689, + "learning_rate": 0.00019591912822098178, + "loss": 1.0979, + "step": 2068 + }, + { + "epoch": 0.3684116809116809, + "grad_norm": 0.4726468622684479, + "learning_rate": 
0.00019591516938415133, + "loss": 1.1629, + "step": 2069 + }, + { + "epoch": 0.3685897435897436, + "grad_norm": 0.47856009006500244, + "learning_rate": 0.00019591120866806235, + "loss": 1.2048, + "step": 2070 + }, + { + "epoch": 0.36876780626780625, + "grad_norm": 0.46847718954086304, + "learning_rate": 0.0001959072460727925, + "loss": 1.0958, + "step": 2071 + }, + { + "epoch": 0.36894586894586895, + "grad_norm": 0.47164350748062134, + "learning_rate": 0.0001959032815984194, + "loss": 1.1912, + "step": 2072 + }, + { + "epoch": 0.36912393162393164, + "grad_norm": 0.4838213324546814, + "learning_rate": 0.0001958993152450207, + "loss": 1.1466, + "step": 2073 + }, + { + "epoch": 0.3693019943019943, + "grad_norm": 0.47234636545181274, + "learning_rate": 0.00019589534701267412, + "loss": 0.9475, + "step": 2074 + }, + { + "epoch": 0.369480056980057, + "grad_norm": 0.4913126826286316, + "learning_rate": 0.00019589137690145746, + "loss": 1.1571, + "step": 2075 + }, + { + "epoch": 0.3696581196581197, + "grad_norm": 0.4696233570575714, + "learning_rate": 0.00019588740491144842, + "loss": 0.9797, + "step": 2076 + }, + { + "epoch": 0.3698361823361823, + "grad_norm": 0.46146106719970703, + "learning_rate": 0.00019588343104272492, + "loss": 1.027, + "step": 2077 + }, + { + "epoch": 0.370014245014245, + "grad_norm": 0.4920627176761627, + "learning_rate": 0.00019587945529536474, + "loss": 1.1008, + "step": 2078 + }, + { + "epoch": 0.3701923076923077, + "grad_norm": 0.4854249954223633, + "learning_rate": 0.0001958754776694458, + "loss": 1.0759, + "step": 2079 + }, + { + "epoch": 0.37037037037037035, + "grad_norm": 0.4884897768497467, + "learning_rate": 0.00019587149816504608, + "loss": 1.1403, + "step": 2080 + }, + { + "epoch": 0.37054843304843305, + "grad_norm": 0.5062584280967712, + "learning_rate": 0.00019586751678224345, + "loss": 1.0185, + "step": 2081 + }, + { + "epoch": 0.37072649572649574, + "grad_norm": 0.44697675108909607, + "learning_rate": 0.000195863533521116, + "loss": 
1.0462, + "step": 2082 + }, + { + "epoch": 0.3709045584045584, + "grad_norm": 0.5122885704040527, + "learning_rate": 0.00019585954838174176, + "loss": 1.108, + "step": 2083 + }, + { + "epoch": 0.3710826210826211, + "grad_norm": 0.486650288105011, + "learning_rate": 0.0001958555613641988, + "loss": 1.126, + "step": 2084 + }, + { + "epoch": 0.3712606837606838, + "grad_norm": 0.5296297669410706, + "learning_rate": 0.00019585157246856523, + "loss": 1.1757, + "step": 2085 + }, + { + "epoch": 0.3714387464387464, + "grad_norm": 0.4935721457004547, + "learning_rate": 0.0001958475816949192, + "loss": 1.1654, + "step": 2086 + }, + { + "epoch": 0.3716168091168091, + "grad_norm": 0.6226509213447571, + "learning_rate": 0.00019584358904333891, + "loss": 1.1981, + "step": 2087 + }, + { + "epoch": 0.3717948717948718, + "grad_norm": 0.44094228744506836, + "learning_rate": 0.0001958395945139026, + "loss": 0.8468, + "step": 2088 + }, + { + "epoch": 0.37197293447293445, + "grad_norm": 0.5335884690284729, + "learning_rate": 0.00019583559810668858, + "loss": 1.1597, + "step": 2089 + }, + { + "epoch": 0.37215099715099714, + "grad_norm": 0.4585414528846741, + "learning_rate": 0.000195831599821775, + "loss": 0.9343, + "step": 2090 + }, + { + "epoch": 0.37232905982905984, + "grad_norm": 0.533087432384491, + "learning_rate": 0.00019582759965924035, + "loss": 1.1209, + "step": 2091 + }, + { + "epoch": 0.37250712250712253, + "grad_norm": 0.5302683711051941, + "learning_rate": 0.00019582359761916295, + "loss": 1.236, + "step": 2092 + }, + { + "epoch": 0.3726851851851852, + "grad_norm": 0.4522508382797241, + "learning_rate": 0.00019581959370162122, + "loss": 1.0196, + "step": 2093 + }, + { + "epoch": 0.37286324786324787, + "grad_norm": 0.52391517162323, + "learning_rate": 0.00019581558790669358, + "loss": 1.0077, + "step": 2094 + }, + { + "epoch": 0.37304131054131057, + "grad_norm": 0.47144797444343567, + "learning_rate": 0.00019581158023445854, + "loss": 1.0956, + "step": 2095 + }, + { + 
"epoch": 0.3732193732193732, + "grad_norm": 0.4486723244190216, + "learning_rate": 0.00019580757068499459, + "loss": 0.8697, + "step": 2096 + }, + { + "epoch": 0.3733974358974359, + "grad_norm": 0.4626580476760864, + "learning_rate": 0.00019580355925838034, + "loss": 0.8489, + "step": 2097 + }, + { + "epoch": 0.3735754985754986, + "grad_norm": 0.5647920370101929, + "learning_rate": 0.00019579954595469438, + "loss": 1.1458, + "step": 2098 + }, + { + "epoch": 0.37375356125356124, + "grad_norm": 0.4734349846839905, + "learning_rate": 0.00019579553077401528, + "loss": 1.1036, + "step": 2099 + }, + { + "epoch": 0.37393162393162394, + "grad_norm": 0.5624295473098755, + "learning_rate": 0.00019579151371642176, + "loss": 0.9793, + "step": 2100 + }, + { + "epoch": 0.37410968660968663, + "grad_norm": 0.47507283091545105, + "learning_rate": 0.00019578749478199256, + "loss": 1.0371, + "step": 2101 + }, + { + "epoch": 0.37428774928774927, + "grad_norm": 0.550865113735199, + "learning_rate": 0.00019578347397080633, + "loss": 1.046, + "step": 2102 + }, + { + "epoch": 0.37446581196581197, + "grad_norm": 0.5249403715133667, + "learning_rate": 0.00019577945128294193, + "loss": 1.3185, + "step": 2103 + }, + { + "epoch": 0.37464387464387466, + "grad_norm": 0.4921024739742279, + "learning_rate": 0.00019577542671847815, + "loss": 1.0758, + "step": 2104 + }, + { + "epoch": 0.3748219373219373, + "grad_norm": 0.5351784825325012, + "learning_rate": 0.00019577140027749384, + "loss": 1.067, + "step": 2105 + }, + { + "epoch": 0.375, + "grad_norm": 0.44420507550239563, + "learning_rate": 0.00019576737196006787, + "loss": 1.1065, + "step": 2106 + }, + { + "epoch": 0.3751780626780627, + "grad_norm": 0.531384289264679, + "learning_rate": 0.0001957633417662792, + "loss": 1.1634, + "step": 2107 + }, + { + "epoch": 0.37535612535612534, + "grad_norm": 0.5167618989944458, + "learning_rate": 0.00019575930969620677, + "loss": 1.1646, + "step": 2108 + }, + { + "epoch": 0.37553418803418803, + "grad_norm": 
0.41487228870391846, + "learning_rate": 0.0001957552757499296, + "loss": 0.793, + "step": 2109 + }, + { + "epoch": 0.37571225071225073, + "grad_norm": 0.5110787153244019, + "learning_rate": 0.00019575123992752672, + "loss": 1.1752, + "step": 2110 + }, + { + "epoch": 0.37589031339031337, + "grad_norm": 0.4422051012516022, + "learning_rate": 0.00019574720222907717, + "loss": 1.0102, + "step": 2111 + }, + { + "epoch": 0.37606837606837606, + "grad_norm": 0.4757538139820099, + "learning_rate": 0.0001957431626546601, + "loss": 1.0467, + "step": 2112 + }, + { + "epoch": 0.37624643874643876, + "grad_norm": 0.4736764430999756, + "learning_rate": 0.00019573912120435466, + "loss": 1.3048, + "step": 2113 + }, + { + "epoch": 0.3764245014245014, + "grad_norm": 0.49894335865974426, + "learning_rate": 0.00019573507787824004, + "loss": 1.0502, + "step": 2114 + }, + { + "epoch": 0.3766025641025641, + "grad_norm": 0.48120981454849243, + "learning_rate": 0.00019573103267639543, + "loss": 1.2405, + "step": 2115 + }, + { + "epoch": 0.3767806267806268, + "grad_norm": 0.4826737642288208, + "learning_rate": 0.0001957269855989001, + "loss": 1.1189, + "step": 2116 + }, + { + "epoch": 0.37695868945868943, + "grad_norm": 0.4736921489238739, + "learning_rate": 0.0001957229366458333, + "loss": 1.2862, + "step": 2117 + }, + { + "epoch": 0.37713675213675213, + "grad_norm": 0.3895208537578583, + "learning_rate": 0.00019571888581727446, + "loss": 1.0573, + "step": 2118 + }, + { + "epoch": 0.3773148148148148, + "grad_norm": 0.5107510089874268, + "learning_rate": 0.00019571483311330284, + "loss": 1.2913, + "step": 2119 + }, + { + "epoch": 0.37749287749287747, + "grad_norm": 0.4543241262435913, + "learning_rate": 0.00019571077853399794, + "loss": 0.949, + "step": 2120 + }, + { + "epoch": 0.37767094017094016, + "grad_norm": 0.46897491812705994, + "learning_rate": 0.00019570672207943913, + "loss": 1.2235, + "step": 2121 + }, + { + "epoch": 0.37784900284900286, + "grad_norm": 0.4812130630016327, + 
"learning_rate": 0.0001957026637497059, + "loss": 0.8857, + "step": 2122 + }, + { + "epoch": 0.37802706552706555, + "grad_norm": 0.47452476620674133, + "learning_rate": 0.00019569860354487782, + "loss": 1.0549, + "step": 2123 + }, + { + "epoch": 0.3782051282051282, + "grad_norm": 0.49879950284957886, + "learning_rate": 0.00019569454146503438, + "loss": 1.0475, + "step": 2124 + }, + { + "epoch": 0.3783831908831909, + "grad_norm": 0.4246445894241333, + "learning_rate": 0.00019569047751025518, + "loss": 0.8788, + "step": 2125 + }, + { + "epoch": 0.3785612535612536, + "grad_norm": 0.4868565499782562, + "learning_rate": 0.00019568641168061986, + "loss": 1.1801, + "step": 2126 + }, + { + "epoch": 0.3787393162393162, + "grad_norm": 0.46723654866218567, + "learning_rate": 0.0001956823439762081, + "loss": 1.1661, + "step": 2127 + }, + { + "epoch": 0.3789173789173789, + "grad_norm": 0.4989059269428253, + "learning_rate": 0.00019567827439709954, + "loss": 1.3037, + "step": 2128 + }, + { + "epoch": 0.3790954415954416, + "grad_norm": 0.441307932138443, + "learning_rate": 0.00019567420294337395, + "loss": 1.0197, + "step": 2129 + }, + { + "epoch": 0.37927350427350426, + "grad_norm": 0.5200160145759583, + "learning_rate": 0.0001956701296151111, + "loss": 1.3366, + "step": 2130 + }, + { + "epoch": 0.37945156695156695, + "grad_norm": 0.43610256910324097, + "learning_rate": 0.00019566605441239082, + "loss": 1.0148, + "step": 2131 + }, + { + "epoch": 0.37962962962962965, + "grad_norm": 0.4160982370376587, + "learning_rate": 0.00019566197733529293, + "loss": 1.0758, + "step": 2132 + }, + { + "epoch": 0.3798076923076923, + "grad_norm": 0.5007950663566589, + "learning_rate": 0.00019565789838389726, + "loss": 1.1937, + "step": 2133 + }, + { + "epoch": 0.379985754985755, + "grad_norm": 0.4991525113582611, + "learning_rate": 0.00019565381755828385, + "loss": 1.1788, + "step": 2134 + }, + { + "epoch": 0.3801638176638177, + "grad_norm": 0.6313113570213318, + "learning_rate": 
0.00019564973485853258, + "loss": 1.1241, + "step": 2135 + }, + { + "epoch": 0.3803418803418803, + "grad_norm": 0.49736538529396057, + "learning_rate": 0.0001956456502847234, + "loss": 1.0299, + "step": 2136 + }, + { + "epoch": 0.380519943019943, + "grad_norm": 0.4384380578994751, + "learning_rate": 0.00019564156383693643, + "loss": 1.132, + "step": 2137 + }, + { + "epoch": 0.3806980056980057, + "grad_norm": 0.4696183502674103, + "learning_rate": 0.00019563747551525168, + "loss": 1.1145, + "step": 2138 + }, + { + "epoch": 0.38087606837606836, + "grad_norm": 0.42039749026298523, + "learning_rate": 0.0001956333853197493, + "loss": 0.9549, + "step": 2139 + }, + { + "epoch": 0.38105413105413105, + "grad_norm": 0.5547221899032593, + "learning_rate": 0.00019562929325050936, + "loss": 1.0476, + "step": 2140 + }, + { + "epoch": 0.38123219373219375, + "grad_norm": 0.4803301692008972, + "learning_rate": 0.0001956251993076121, + "loss": 1.1285, + "step": 2141 + }, + { + "epoch": 0.3814102564102564, + "grad_norm": 0.609501838684082, + "learning_rate": 0.00019562110349113766, + "loss": 1.2375, + "step": 2142 + }, + { + "epoch": 0.3815883190883191, + "grad_norm": 0.5134759545326233, + "learning_rate": 0.00019561700580116639, + "loss": 1.0895, + "step": 2143 + }, + { + "epoch": 0.3817663817663818, + "grad_norm": 0.5086711049079895, + "learning_rate": 0.00019561290623777846, + "loss": 1.1139, + "step": 2144 + }, + { + "epoch": 0.3819444444444444, + "grad_norm": 0.5371596813201904, + "learning_rate": 0.00019560880480105428, + "loss": 0.9302, + "step": 2145 + }, + { + "epoch": 0.3821225071225071, + "grad_norm": 0.4966319799423218, + "learning_rate": 0.00019560470149107418, + "loss": 1.2485, + "step": 2146 + }, + { + "epoch": 0.3823005698005698, + "grad_norm": 0.5296950340270996, + "learning_rate": 0.00019560059630791855, + "loss": 1.4449, + "step": 2147 + }, + { + "epoch": 0.38247863247863245, + "grad_norm": 0.5564194321632385, + "learning_rate": 0.00019559648925166783, + "loss": 
1.0817, + "step": 2148 + }, + { + "epoch": 0.38265669515669515, + "grad_norm": 0.5763841867446899, + "learning_rate": 0.0001955923803224025, + "loss": 1.1915, + "step": 2149 + }, + { + "epoch": 0.38283475783475784, + "grad_norm": 0.4782295823097229, + "learning_rate": 0.00019558826952020304, + "loss": 1.1317, + "step": 2150 + }, + { + "epoch": 0.38301282051282054, + "grad_norm": 0.4876856207847595, + "learning_rate": 0.00019558415684515002, + "loss": 1.2113, + "step": 2151 + }, + { + "epoch": 0.3831908831908832, + "grad_norm": 0.4894421398639679, + "learning_rate": 0.00019558004229732398, + "loss": 1.0761, + "step": 2152 + }, + { + "epoch": 0.3833689458689459, + "grad_norm": 0.47914227843284607, + "learning_rate": 0.0001955759258768056, + "loss": 1.0869, + "step": 2153 + }, + { + "epoch": 0.38354700854700857, + "grad_norm": 0.43933629989624023, + "learning_rate": 0.00019557180758367543, + "loss": 1.0581, + "step": 2154 + }, + { + "epoch": 0.3837250712250712, + "grad_norm": 0.4078103005886078, + "learning_rate": 0.00019556768741801428, + "loss": 1.065, + "step": 2155 + }, + { + "epoch": 0.3839031339031339, + "grad_norm": 0.5112793445587158, + "learning_rate": 0.00019556356537990278, + "loss": 1.2023, + "step": 2156 + }, + { + "epoch": 0.3840811965811966, + "grad_norm": 0.4699678122997284, + "learning_rate": 0.00019555944146942177, + "loss": 1.2459, + "step": 2157 + }, + { + "epoch": 0.38425925925925924, + "grad_norm": 0.4723528027534485, + "learning_rate": 0.00019555531568665198, + "loss": 1.2204, + "step": 2158 + }, + { + "epoch": 0.38443732193732194, + "grad_norm": 0.4648225009441376, + "learning_rate": 0.00019555118803167432, + "loss": 1.1355, + "step": 2159 + }, + { + "epoch": 0.38461538461538464, + "grad_norm": 0.49861815571784973, + "learning_rate": 0.00019554705850456961, + "loss": 1.1301, + "step": 2160 + }, + { + "epoch": 0.3847934472934473, + "grad_norm": 0.4076344966888428, + "learning_rate": 0.00019554292710541874, + "loss": 0.8997, + "step": 2161 + }, + 
{ + "epoch": 0.38497150997151, + "grad_norm": 0.5510796308517456, + "learning_rate": 0.00019553879383430272, + "loss": 1.0594, + "step": 2162 + }, + { + "epoch": 0.38514957264957267, + "grad_norm": 0.55793696641922, + "learning_rate": 0.00019553465869130249, + "loss": 1.1284, + "step": 2163 + }, + { + "epoch": 0.3853276353276353, + "grad_norm": 0.5096491575241089, + "learning_rate": 0.00019553052167649906, + "loss": 1.0419, + "step": 2164 + }, + { + "epoch": 0.385505698005698, + "grad_norm": 0.49077361822128296, + "learning_rate": 0.0001955263827899735, + "loss": 1.1632, + "step": 2165 + }, + { + "epoch": 0.3856837606837607, + "grad_norm": 0.5546894073486328, + "learning_rate": 0.00019552224203180693, + "loss": 1.1487, + "step": 2166 + }, + { + "epoch": 0.38586182336182334, + "grad_norm": 0.4930037260055542, + "learning_rate": 0.00019551809940208047, + "loss": 1.2668, + "step": 2167 + }, + { + "epoch": 0.38603988603988604, + "grad_norm": 0.5600671172142029, + "learning_rate": 0.00019551395490087525, + "loss": 1.3988, + "step": 2168 + }, + { + "epoch": 0.38621794871794873, + "grad_norm": 0.45897629857063293, + "learning_rate": 0.0001955098085282725, + "loss": 0.7792, + "step": 2169 + }, + { + "epoch": 0.3863960113960114, + "grad_norm": 0.46138936281204224, + "learning_rate": 0.00019550566028435346, + "loss": 1.1749, + "step": 2170 + }, + { + "epoch": 0.38657407407407407, + "grad_norm": 0.5136167407035828, + "learning_rate": 0.0001955015101691994, + "loss": 1.0153, + "step": 2171 + }, + { + "epoch": 0.38675213675213677, + "grad_norm": 0.4886440336704254, + "learning_rate": 0.00019549735818289165, + "loss": 1.0006, + "step": 2172 + }, + { + "epoch": 0.3869301994301994, + "grad_norm": 0.4339776635169983, + "learning_rate": 0.00019549320432551154, + "loss": 1.0109, + "step": 2173 + }, + { + "epoch": 0.3871082621082621, + "grad_norm": 0.48729443550109863, + "learning_rate": 0.00019548904859714044, + "loss": 1.2016, + "step": 2174 + }, + { + "epoch": 0.3872863247863248, + 
"grad_norm": 0.5128757357597351, + "learning_rate": 0.0001954848909978598, + "loss": 1.085, + "step": 2175 + }, + { + "epoch": 0.38746438746438744, + "grad_norm": 0.49636292457580566, + "learning_rate": 0.0001954807315277511, + "loss": 1.0671, + "step": 2176 + }, + { + "epoch": 0.38764245014245013, + "grad_norm": 0.4946988821029663, + "learning_rate": 0.00019547657018689578, + "loss": 1.2091, + "step": 2177 + }, + { + "epoch": 0.38782051282051283, + "grad_norm": 0.49004554748535156, + "learning_rate": 0.00019547240697537544, + "loss": 1.0241, + "step": 2178 + }, + { + "epoch": 0.38799857549857547, + "grad_norm": 0.48750075697898865, + "learning_rate": 0.00019546824189327157, + "loss": 1.1082, + "step": 2179 + }, + { + "epoch": 0.38817663817663817, + "grad_norm": 0.47726166248321533, + "learning_rate": 0.00019546407494066585, + "loss": 1.1275, + "step": 2180 + }, + { + "epoch": 0.38835470085470086, + "grad_norm": 0.5253444910049438, + "learning_rate": 0.00019545990611763986, + "loss": 1.0164, + "step": 2181 + }, + { + "epoch": 0.38853276353276356, + "grad_norm": 0.4470371603965759, + "learning_rate": 0.00019545573542427533, + "loss": 1.0138, + "step": 2182 + }, + { + "epoch": 0.3887108262108262, + "grad_norm": 0.6645087599754333, + "learning_rate": 0.00019545156286065397, + "loss": 1.0884, + "step": 2183 + }, + { + "epoch": 0.3888888888888889, + "grad_norm": 0.498775839805603, + "learning_rate": 0.0001954473884268575, + "loss": 1.1035, + "step": 2184 + }, + { + "epoch": 0.3890669515669516, + "grad_norm": 0.5830566883087158, + "learning_rate": 0.00019544321212296772, + "loss": 1.1665, + "step": 2185 + }, + { + "epoch": 0.38924501424501423, + "grad_norm": 0.48162809014320374, + "learning_rate": 0.00019543903394906646, + "loss": 1.1035, + "step": 2186 + }, + { + "epoch": 0.3894230769230769, + "grad_norm": 0.46334075927734375, + "learning_rate": 0.0001954348539052356, + "loss": 0.9764, + "step": 2187 + }, + { + "epoch": 0.3896011396011396, + "grad_norm": 
0.6343515515327454, + "learning_rate": 0.00019543067199155704, + "loss": 0.9474, + "step": 2188 + }, + { + "epoch": 0.38977920227920226, + "grad_norm": 0.4867806136608124, + "learning_rate": 0.0001954264882081127, + "loss": 1.1161, + "step": 2189 + }, + { + "epoch": 0.38995726495726496, + "grad_norm": 0.49305734038352966, + "learning_rate": 0.00019542230255498454, + "loss": 1.1825, + "step": 2190 + }, + { + "epoch": 0.39013532763532766, + "grad_norm": 0.518465518951416, + "learning_rate": 0.00019541811503225457, + "loss": 1.0695, + "step": 2191 + }, + { + "epoch": 0.3903133903133903, + "grad_norm": 0.4892457127571106, + "learning_rate": 0.00019541392564000488, + "loss": 1.3113, + "step": 2192 + }, + { + "epoch": 0.390491452991453, + "grad_norm": 0.5150920152664185, + "learning_rate": 0.00019540973437831753, + "loss": 1.0735, + "step": 2193 + }, + { + "epoch": 0.3906695156695157, + "grad_norm": 0.5414708256721497, + "learning_rate": 0.00019540554124727462, + "loss": 1.0773, + "step": 2194 + }, + { + "epoch": 0.39084757834757833, + "grad_norm": 0.49826398491859436, + "learning_rate": 0.0001954013462469583, + "loss": 1.0542, + "step": 2195 + }, + { + "epoch": 0.391025641025641, + "grad_norm": 0.5203596949577332, + "learning_rate": 0.0001953971493774508, + "loss": 1.178, + "step": 2196 + }, + { + "epoch": 0.3912037037037037, + "grad_norm": 0.45095738768577576, + "learning_rate": 0.00019539295063883432, + "loss": 1.1254, + "step": 2197 + }, + { + "epoch": 0.39138176638176636, + "grad_norm": 0.4938857853412628, + "learning_rate": 0.00019538875003119113, + "loss": 1.1061, + "step": 2198 + }, + { + "epoch": 0.39155982905982906, + "grad_norm": 0.5260919332504272, + "learning_rate": 0.00019538454755460354, + "loss": 1.3292, + "step": 2199 + }, + { + "epoch": 0.39173789173789175, + "grad_norm": 0.46527108550071716, + "learning_rate": 0.00019538034320915388, + "loss": 1.2074, + "step": 2200 + }, + { + "epoch": 0.3919159544159544, + "grad_norm": 0.5608304738998413, + 
"learning_rate": 0.00019537613699492453, + "loss": 1.0385, + "step": 2201 + }, + { + "epoch": 0.3920940170940171, + "grad_norm": 0.5056684613227844, + "learning_rate": 0.00019537192891199792, + "loss": 1.1513, + "step": 2202 + }, + { + "epoch": 0.3922720797720798, + "grad_norm": 0.3764426112174988, + "learning_rate": 0.00019536771896045644, + "loss": 0.8966, + "step": 2203 + }, + { + "epoch": 0.3924501424501424, + "grad_norm": 0.4983638823032379, + "learning_rate": 0.0001953635071403827, + "loss": 1.097, + "step": 2204 + }, + { + "epoch": 0.3926282051282051, + "grad_norm": 0.5733919739723206, + "learning_rate": 0.00019535929345185904, + "loss": 1.4992, + "step": 2205 + }, + { + "epoch": 0.3928062678062678, + "grad_norm": 0.632064163684845, + "learning_rate": 0.00019535507789496817, + "loss": 1.0611, + "step": 2206 + }, + { + "epoch": 0.39298433048433046, + "grad_norm": 0.409978449344635, + "learning_rate": 0.00019535086046979262, + "loss": 0.7172, + "step": 2207 + }, + { + "epoch": 0.39316239316239315, + "grad_norm": 0.40910813212394714, + "learning_rate": 0.00019534664117641502, + "loss": 0.8803, + "step": 2208 + }, + { + "epoch": 0.39334045584045585, + "grad_norm": 0.4696179926395416, + "learning_rate": 0.00019534242001491807, + "loss": 1.1551, + "step": 2209 + }, + { + "epoch": 0.39351851851851855, + "grad_norm": 0.538425862789154, + "learning_rate": 0.00019533819698538444, + "loss": 1.1296, + "step": 2210 + }, + { + "epoch": 0.3936965811965812, + "grad_norm": 0.5913630723953247, + "learning_rate": 0.00019533397208789692, + "loss": 0.9757, + "step": 2211 + }, + { + "epoch": 0.3938746438746439, + "grad_norm": 0.5649870038032532, + "learning_rate": 0.00019532974532253822, + "loss": 0.9976, + "step": 2212 + }, + { + "epoch": 0.3940527065527066, + "grad_norm": 0.5012063980102539, + "learning_rate": 0.00019532551668939121, + "loss": 0.9969, + "step": 2213 + }, + { + "epoch": 0.3942307692307692, + "grad_norm": 0.5098594427108765, + "learning_rate": 
0.00019532128618853872, + "loss": 1.1229, + "step": 2214 + }, + { + "epoch": 0.3944088319088319, + "grad_norm": 0.4753342568874359, + "learning_rate": 0.0001953170538200636, + "loss": 1.0808, + "step": 2215 + }, + { + "epoch": 0.3945868945868946, + "grad_norm": 0.4770098626613617, + "learning_rate": 0.00019531281958404888, + "loss": 1.0656, + "step": 2216 + }, + { + "epoch": 0.39476495726495725, + "grad_norm": 0.6007979512214661, + "learning_rate": 0.00019530858348057746, + "loss": 1.0093, + "step": 2217 + }, + { + "epoch": 0.39494301994301995, + "grad_norm": 0.4501650929450989, + "learning_rate": 0.00019530434550973227, + "loss": 0.8557, + "step": 2218 + }, + { + "epoch": 0.39512108262108264, + "grad_norm": 0.5123980641365051, + "learning_rate": 0.00019530010567159645, + "loss": 0.9833, + "step": 2219 + }, + { + "epoch": 0.3952991452991453, + "grad_norm": 0.4623969495296478, + "learning_rate": 0.000195295863966253, + "loss": 0.913, + "step": 2220 + }, + { + "epoch": 0.395477207977208, + "grad_norm": 0.4341880679130554, + "learning_rate": 0.0001952916203937851, + "loss": 1.0234, + "step": 2221 + }, + { + "epoch": 0.3956552706552707, + "grad_norm": 0.5935006141662598, + "learning_rate": 0.00019528737495427581, + "loss": 1.061, + "step": 2222 + }, + { + "epoch": 0.3958333333333333, + "grad_norm": 0.44835174083709717, + "learning_rate": 0.00019528312764780837, + "loss": 1.1567, + "step": 2223 + }, + { + "epoch": 0.396011396011396, + "grad_norm": 0.5476976633071899, + "learning_rate": 0.00019527887847446595, + "loss": 1.2304, + "step": 2224 + }, + { + "epoch": 0.3961894586894587, + "grad_norm": 0.4487939774990082, + "learning_rate": 0.00019527462743433187, + "loss": 1.1813, + "step": 2225 + }, + { + "epoch": 0.39636752136752135, + "grad_norm": 0.4053241014480591, + "learning_rate": 0.00019527037452748936, + "loss": 0.7899, + "step": 2226 + }, + { + "epoch": 0.39654558404558404, + "grad_norm": 0.534570574760437, + "learning_rate": 0.00019526611975402176, + "loss": 
1.0681, + "step": 2227 + }, + { + "epoch": 0.39672364672364674, + "grad_norm": 0.46096158027648926, + "learning_rate": 0.00019526186311401246, + "loss": 0.9234, + "step": 2228 + }, + { + "epoch": 0.3969017094017094, + "grad_norm": 0.47363516688346863, + "learning_rate": 0.00019525760460754483, + "loss": 1.0197, + "step": 2229 + }, + { + "epoch": 0.3970797720797721, + "grad_norm": 0.46317258477211, + "learning_rate": 0.00019525334423470234, + "loss": 1.2103, + "step": 2230 + }, + { + "epoch": 0.39725783475783477, + "grad_norm": 0.4924237132072449, + "learning_rate": 0.0001952490819955684, + "loss": 1.3299, + "step": 2231 + }, + { + "epoch": 0.3974358974358974, + "grad_norm": 0.5419978499412537, + "learning_rate": 0.0001952448178902266, + "loss": 1.2526, + "step": 2232 + }, + { + "epoch": 0.3976139601139601, + "grad_norm": 0.5003267526626587, + "learning_rate": 0.00019524055191876043, + "loss": 1.1073, + "step": 2233 + }, + { + "epoch": 0.3977920227920228, + "grad_norm": 0.621789276599884, + "learning_rate": 0.00019523628408125347, + "loss": 1.3409, + "step": 2234 + }, + { + "epoch": 0.39797008547008544, + "grad_norm": 0.44235602021217346, + "learning_rate": 0.0001952320143777894, + "loss": 0.9799, + "step": 2235 + }, + { + "epoch": 0.39814814814814814, + "grad_norm": 0.49954718351364136, + "learning_rate": 0.0001952277428084518, + "loss": 1.2227, + "step": 2236 + }, + { + "epoch": 0.39832621082621084, + "grad_norm": 0.5113739967346191, + "learning_rate": 0.00019522346937332443, + "loss": 1.1644, + "step": 2237 + }, + { + "epoch": 0.39850427350427353, + "grad_norm": 0.5026139616966248, + "learning_rate": 0.00019521919407249096, + "loss": 1.0823, + "step": 2238 + }, + { + "epoch": 0.39868233618233617, + "grad_norm": 0.4943205714225769, + "learning_rate": 0.0001952149169060352, + "loss": 1.0961, + "step": 2239 + }, + { + "epoch": 0.39886039886039887, + "grad_norm": 0.4680631458759308, + "learning_rate": 0.00019521063787404094, + "loss": 0.9787, + "step": 2240 + }, + { 
+ "epoch": 0.39903846153846156, + "grad_norm": 0.5511566400527954, + "learning_rate": 0.00019520635697659202, + "loss": 1.2543, + "step": 2241 + }, + { + "epoch": 0.3992165242165242, + "grad_norm": 0.5494263172149658, + "learning_rate": 0.00019520207421377229, + "loss": 1.1978, + "step": 2242 + }, + { + "epoch": 0.3993945868945869, + "grad_norm": 0.4850340485572815, + "learning_rate": 0.00019519778958566568, + "loss": 0.8531, + "step": 2243 + }, + { + "epoch": 0.3995726495726496, + "grad_norm": 0.47168150544166565, + "learning_rate": 0.00019519350309235613, + "loss": 1.0746, + "step": 2244 + }, + { + "epoch": 0.39975071225071224, + "grad_norm": 0.571133553981781, + "learning_rate": 0.00019518921473392765, + "loss": 1.2984, + "step": 2245 + }, + { + "epoch": 0.39992877492877493, + "grad_norm": 0.4636089503765106, + "learning_rate": 0.00019518492451046427, + "loss": 1.019, + "step": 2246 + }, + { + "epoch": 0.40010683760683763, + "grad_norm": 0.4573518931865692, + "learning_rate": 0.00019518063242205, + "loss": 1.1042, + "step": 2247 + }, + { + "epoch": 0.40028490028490027, + "grad_norm": 0.49098989367485046, + "learning_rate": 0.00019517633846876894, + "loss": 1.1224, + "step": 2248 + }, + { + "epoch": 0.40046296296296297, + "grad_norm": 0.5475491881370544, + "learning_rate": 0.00019517204265070523, + "loss": 1.0984, + "step": 2249 + }, + { + "epoch": 0.40064102564102566, + "grad_norm": 0.45498281717300415, + "learning_rate": 0.00019516774496794307, + "loss": 0.8883, + "step": 2250 + }, + { + "epoch": 0.4008190883190883, + "grad_norm": 0.4908423125743866, + "learning_rate": 0.00019516344542056666, + "loss": 1.328, + "step": 2251 + }, + { + "epoch": 0.400997150997151, + "grad_norm": 0.5474920272827148, + "learning_rate": 0.0001951591440086602, + "loss": 1.3825, + "step": 2252 + }, + { + "epoch": 0.4011752136752137, + "grad_norm": 0.5165615081787109, + "learning_rate": 0.000195154840732308, + "loss": 1.33, + "step": 2253 + }, + { + "epoch": 0.40135327635327633, + 
"grad_norm": 0.5185585021972656, + "learning_rate": 0.00019515053559159435, + "loss": 1.1689, + "step": 2254 + }, + { + "epoch": 0.40153133903133903, + "grad_norm": 0.5468854904174805, + "learning_rate": 0.00019514622858660363, + "loss": 1.2708, + "step": 2255 + }, + { + "epoch": 0.4017094017094017, + "grad_norm": 0.47556906938552856, + "learning_rate": 0.0001951419197174202, + "loss": 1.0488, + "step": 2256 + }, + { + "epoch": 0.40188746438746437, + "grad_norm": 0.5521323084831238, + "learning_rate": 0.0001951376089841285, + "loss": 1.0868, + "step": 2257 + }, + { + "epoch": 0.40206552706552706, + "grad_norm": 0.6029638051986694, + "learning_rate": 0.00019513329638681296, + "loss": 1.1735, + "step": 2258 + }, + { + "epoch": 0.40224358974358976, + "grad_norm": 0.4897766411304474, + "learning_rate": 0.00019512898192555812, + "loss": 1.1687, + "step": 2259 + }, + { + "epoch": 0.4024216524216524, + "grad_norm": 0.45527184009552, + "learning_rate": 0.00019512466560044848, + "loss": 1.0352, + "step": 2260 + }, + { + "epoch": 0.4025997150997151, + "grad_norm": 0.5025625824928284, + "learning_rate": 0.00019512034741156863, + "loss": 1.2503, + "step": 2261 + }, + { + "epoch": 0.4027777777777778, + "grad_norm": 0.46415451169013977, + "learning_rate": 0.00019511602735900317, + "loss": 1.032, + "step": 2262 + }, + { + "epoch": 0.40295584045584043, + "grad_norm": 0.4812934398651123, + "learning_rate": 0.00019511170544283678, + "loss": 1.0523, + "step": 2263 + }, + { + "epoch": 0.4031339031339031, + "grad_norm": 0.49937039613723755, + "learning_rate": 0.00019510738166315404, + "loss": 1.2238, + "step": 2264 + }, + { + "epoch": 0.4033119658119658, + "grad_norm": 0.5428698062896729, + "learning_rate": 0.00019510305602003975, + "loss": 1.0361, + "step": 2265 + }, + { + "epoch": 0.40349002849002846, + "grad_norm": 0.44836854934692383, + "learning_rate": 0.0001950987285135786, + "loss": 1.169, + "step": 2266 + }, + { + "epoch": 0.40366809116809116, + "grad_norm": 0.5071489214897156, 
+ "learning_rate": 0.00019509439914385549, + "loss": 1.1567, + "step": 2267 + }, + { + "epoch": 0.40384615384615385, + "grad_norm": 0.5204613208770752, + "learning_rate": 0.00019509006791095513, + "loss": 0.9949, + "step": 2268 + }, + { + "epoch": 0.40402421652421655, + "grad_norm": 0.4583234488964081, + "learning_rate": 0.00019508573481496238, + "loss": 0.9051, + "step": 2269 + }, + { + "epoch": 0.4042022792022792, + "grad_norm": 0.5436791181564331, + "learning_rate": 0.00019508139985596222, + "loss": 1.3239, + "step": 2270 + }, + { + "epoch": 0.4043803418803419, + "grad_norm": 0.48774269223213196, + "learning_rate": 0.00019507706303403954, + "loss": 1.2102, + "step": 2271 + }, + { + "epoch": 0.4045584045584046, + "grad_norm": 0.4742540717124939, + "learning_rate": 0.00019507272434927933, + "loss": 1.1137, + "step": 2272 + }, + { + "epoch": 0.4047364672364672, + "grad_norm": 0.531148374080658, + "learning_rate": 0.00019506838380176658, + "loss": 1.3162, + "step": 2273 + }, + { + "epoch": 0.4049145299145299, + "grad_norm": 0.5002314448356628, + "learning_rate": 0.0001950640413915863, + "loss": 1.0743, + "step": 2274 + }, + { + "epoch": 0.4050925925925926, + "grad_norm": 0.39826446771621704, + "learning_rate": 0.00019505969711882366, + "loss": 0.7698, + "step": 2275 + }, + { + "epoch": 0.40527065527065526, + "grad_norm": 0.5177471041679382, + "learning_rate": 0.00019505535098356371, + "loss": 1.1821, + "step": 2276 + }, + { + "epoch": 0.40544871794871795, + "grad_norm": 0.467241108417511, + "learning_rate": 0.00019505100298589158, + "loss": 0.8036, + "step": 2277 + }, + { + "epoch": 0.40562678062678065, + "grad_norm": 0.43711844086647034, + "learning_rate": 0.00019504665312589255, + "loss": 0.8667, + "step": 2278 + }, + { + "epoch": 0.4058048433048433, + "grad_norm": 0.4929116368293762, + "learning_rate": 0.00019504230140365177, + "loss": 1.1279, + "step": 2279 + }, + { + "epoch": 0.405982905982906, + "grad_norm": 0.5279183983802795, + "learning_rate": 
0.00019503794781925452, + "loss": 1.1318, + "step": 2280 + }, + { + "epoch": 0.4061609686609687, + "grad_norm": 0.549217939376831, + "learning_rate": 0.00019503359237278608, + "loss": 1.2007, + "step": 2281 + }, + { + "epoch": 0.4063390313390313, + "grad_norm": 0.5485880374908447, + "learning_rate": 0.00019502923506433187, + "loss": 1.1079, + "step": 2282 + }, + { + "epoch": 0.406517094017094, + "grad_norm": 0.48379644751548767, + "learning_rate": 0.0001950248758939772, + "loss": 0.9978, + "step": 2283 + }, + { + "epoch": 0.4066951566951567, + "grad_norm": 0.5943657755851746, + "learning_rate": 0.00019502051486180744, + "loss": 1.0466, + "step": 2284 + }, + { + "epoch": 0.40687321937321935, + "grad_norm": 0.5721273422241211, + "learning_rate": 0.00019501615196790812, + "loss": 1.2674, + "step": 2285 + }, + { + "epoch": 0.40705128205128205, + "grad_norm": 0.47624221444129944, + "learning_rate": 0.00019501178721236464, + "loss": 1.089, + "step": 2286 + }, + { + "epoch": 0.40722934472934474, + "grad_norm": 0.5091297030448914, + "learning_rate": 0.0001950074205952626, + "loss": 1.2035, + "step": 2287 + }, + { + "epoch": 0.4074074074074074, + "grad_norm": 0.45206236839294434, + "learning_rate": 0.0001950030521166875, + "loss": 0.9188, + "step": 2288 + }, + { + "epoch": 0.4075854700854701, + "grad_norm": 0.5563844442367554, + "learning_rate": 0.00019499868177672497, + "loss": 1.3444, + "step": 2289 + }, + { + "epoch": 0.4077635327635328, + "grad_norm": 0.4971138536930084, + "learning_rate": 0.00019499430957546055, + "loss": 1.1615, + "step": 2290 + }, + { + "epoch": 0.4079415954415954, + "grad_norm": 0.49355944991111755, + "learning_rate": 0.00019498993551298, + "loss": 1.1528, + "step": 2291 + }, + { + "epoch": 0.4081196581196581, + "grad_norm": 0.534705638885498, + "learning_rate": 0.000194985559589369, + "loss": 1.197, + "step": 2292 + }, + { + "epoch": 0.4082977207977208, + "grad_norm": 0.5113020539283752, + "learning_rate": 0.0001949811818047133, + "loss": 1.109, + 
"step": 2293 + }, + { + "epoch": 0.40847578347578345, + "grad_norm": 0.4823366701602936, + "learning_rate": 0.00019497680215909858, + "loss": 1.168, + "step": 2294 + }, + { + "epoch": 0.40865384615384615, + "grad_norm": 0.500792920589447, + "learning_rate": 0.00019497242065261077, + "loss": 1.1567, + "step": 2295 + }, + { + "epoch": 0.40883190883190884, + "grad_norm": 0.5047918558120728, + "learning_rate": 0.00019496803728533566, + "loss": 1.0515, + "step": 2296 + }, + { + "epoch": 0.40900997150997154, + "grad_norm": 0.474624365568161, + "learning_rate": 0.00019496365205735913, + "loss": 1.1747, + "step": 2297 + }, + { + "epoch": 0.4091880341880342, + "grad_norm": 0.5522183179855347, + "learning_rate": 0.0001949592649687671, + "loss": 1.1506, + "step": 2298 + }, + { + "epoch": 0.4093660968660969, + "grad_norm": 0.4526083767414093, + "learning_rate": 0.00019495487601964553, + "loss": 0.9968, + "step": 2299 + }, + { + "epoch": 0.40954415954415957, + "grad_norm": 0.545845091342926, + "learning_rate": 0.00019495048521008044, + "loss": 1.146, + "step": 2300 + }, + { + "epoch": 0.4097222222222222, + "grad_norm": 0.5475544333457947, + "learning_rate": 0.00019494609254015784, + "loss": 1.0101, + "step": 2301 + }, + { + "epoch": 0.4099002849002849, + "grad_norm": 0.43419042229652405, + "learning_rate": 0.00019494169800996373, + "loss": 1.065, + "step": 2302 + }, + { + "epoch": 0.4100783475783476, + "grad_norm": 0.44998374581336975, + "learning_rate": 0.00019493730161958435, + "loss": 0.9948, + "step": 2303 + }, + { + "epoch": 0.41025641025641024, + "grad_norm": 0.5401661992073059, + "learning_rate": 0.0001949329033691057, + "loss": 1.0473, + "step": 2304 + }, + { + "epoch": 0.41043447293447294, + "grad_norm": 0.48064103722572327, + "learning_rate": 0.00019492850325861404, + "loss": 1.0486, + "step": 2305 + }, + { + "epoch": 0.41061253561253563, + "grad_norm": 0.5398300290107727, + "learning_rate": 0.00019492410128819557, + "loss": 1.0314, + "step": 2306 + }, + { + "epoch": 
0.4107905982905983, + "grad_norm": 0.4771125912666321, + "learning_rate": 0.0001949196974579365, + "loss": 0.9855, + "step": 2307 + }, + { + "epoch": 0.41096866096866097, + "grad_norm": 0.5375809669494629, + "learning_rate": 0.00019491529176792315, + "loss": 1.0777, + "step": 2308 + }, + { + "epoch": 0.41114672364672367, + "grad_norm": 0.48424094915390015, + "learning_rate": 0.00019491088421824183, + "loss": 1.0751, + "step": 2309 + }, + { + "epoch": 0.4113247863247863, + "grad_norm": 0.5054880380630493, + "learning_rate": 0.00019490647480897887, + "loss": 1.2457, + "step": 2310 + }, + { + "epoch": 0.411502849002849, + "grad_norm": 0.47118356823921204, + "learning_rate": 0.0001949020635402207, + "loss": 1.0445, + "step": 2311 + }, + { + "epoch": 0.4116809116809117, + "grad_norm": 0.47171851992607117, + "learning_rate": 0.00019489765041205375, + "loss": 1.0062, + "step": 2312 + }, + { + "epoch": 0.41185897435897434, + "grad_norm": 0.5703238844871521, + "learning_rate": 0.00019489323542456447, + "loss": 1.5639, + "step": 2313 + }, + { + "epoch": 0.41203703703703703, + "grad_norm": 0.5045075416564941, + "learning_rate": 0.00019488881857783935, + "loss": 1.1665, + "step": 2314 + }, + { + "epoch": 0.41221509971509973, + "grad_norm": 0.46835362911224365, + "learning_rate": 0.00019488439987196495, + "loss": 1.2078, + "step": 2315 + }, + { + "epoch": 0.41239316239316237, + "grad_norm": 0.5187196731567383, + "learning_rate": 0.00019487997930702785, + "loss": 1.1049, + "step": 2316 + }, + { + "epoch": 0.41257122507122507, + "grad_norm": 0.5190554857254028, + "learning_rate": 0.00019487555688311463, + "loss": 1.331, + "step": 2317 + }, + { + "epoch": 0.41274928774928776, + "grad_norm": 0.7394969463348389, + "learning_rate": 0.00019487113260031197, + "loss": 0.9646, + "step": 2318 + }, + { + "epoch": 0.4129273504273504, + "grad_norm": 0.532982349395752, + "learning_rate": 0.00019486670645870656, + "loss": 1.166, + "step": 2319 + }, + { + "epoch": 0.4131054131054131, + 
"grad_norm": 0.48659515380859375, + "learning_rate": 0.00019486227845838509, + "loss": 1.0016, + "step": 2320 + }, + { + "epoch": 0.4132834757834758, + "grad_norm": 0.5364453196525574, + "learning_rate": 0.00019485784859943434, + "loss": 1.3877, + "step": 2321 + }, + { + "epoch": 0.41346153846153844, + "grad_norm": 0.49788740277290344, + "learning_rate": 0.0001948534168819411, + "loss": 1.2949, + "step": 2322 + }, + { + "epoch": 0.41363960113960113, + "grad_norm": 0.5125377774238586, + "learning_rate": 0.00019484898330599217, + "loss": 0.9769, + "step": 2323 + }, + { + "epoch": 0.41381766381766383, + "grad_norm": 0.5434861779212952, + "learning_rate": 0.00019484454787167447, + "loss": 1.254, + "step": 2324 + }, + { + "epoch": 0.41399572649572647, + "grad_norm": 0.5324583053588867, + "learning_rate": 0.00019484011057907487, + "loss": 0.9788, + "step": 2325 + }, + { + "epoch": 0.41417378917378916, + "grad_norm": 0.4806961715221405, + "learning_rate": 0.00019483567142828033, + "loss": 1.0089, + "step": 2326 + }, + { + "epoch": 0.41435185185185186, + "grad_norm": 0.5152947306632996, + "learning_rate": 0.0001948312304193778, + "loss": 1.15, + "step": 2327 + }, + { + "epoch": 0.41452991452991456, + "grad_norm": 0.6030138731002808, + "learning_rate": 0.0001948267875524543, + "loss": 1.196, + "step": 2328 + }, + { + "epoch": 0.4147079772079772, + "grad_norm": 0.4504946768283844, + "learning_rate": 0.0001948223428275969, + "loss": 0.8742, + "step": 2329 + }, + { + "epoch": 0.4148860398860399, + "grad_norm": 0.5195745825767517, + "learning_rate": 0.00019481789624489263, + "loss": 1.0104, + "step": 2330 + }, + { + "epoch": 0.4150641025641026, + "grad_norm": 0.5269250869750977, + "learning_rate": 0.0001948134478044287, + "loss": 1.2284, + "step": 2331 + }, + { + "epoch": 0.41524216524216523, + "grad_norm": 0.5302315354347229, + "learning_rate": 0.00019480899750629218, + "loss": 1.1374, + "step": 2332 + }, + { + "epoch": 0.4154202279202279, + "grad_norm": 0.5501471161842346, + 
"learning_rate": 0.0001948045453505703, + "loss": 1.214, + "step": 2333 + }, + { + "epoch": 0.4155982905982906, + "grad_norm": 0.4674588739871979, + "learning_rate": 0.0001948000913373503, + "loss": 1.0568, + "step": 2334 + }, + { + "epoch": 0.41577635327635326, + "grad_norm": 0.5262266993522644, + "learning_rate": 0.0001947956354667195, + "loss": 1.111, + "step": 2335 + }, + { + "epoch": 0.41595441595441596, + "grad_norm": 0.4549071788787842, + "learning_rate": 0.00019479117773876507, + "loss": 1.2655, + "step": 2336 + }, + { + "epoch": 0.41613247863247865, + "grad_norm": 0.48897311091423035, + "learning_rate": 0.00019478671815357447, + "loss": 1.0543, + "step": 2337 + }, + { + "epoch": 0.4163105413105413, + "grad_norm": 0.5544867515563965, + "learning_rate": 0.000194782256711235, + "loss": 1.2276, + "step": 2338 + }, + { + "epoch": 0.416488603988604, + "grad_norm": 0.5050773024559021, + "learning_rate": 0.0001947777934118341, + "loss": 0.9781, + "step": 2339 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 0.4831899106502533, + "learning_rate": 0.00019477332825545925, + "loss": 1.0213, + "step": 2340 + }, + { + "epoch": 0.4168447293447293, + "grad_norm": 0.5392552614212036, + "learning_rate": 0.0001947688612421979, + "loss": 1.3251, + "step": 2341 + }, + { + "epoch": 0.417022792022792, + "grad_norm": 0.5003608465194702, + "learning_rate": 0.00019476439237213754, + "loss": 1.0714, + "step": 2342 + }, + { + "epoch": 0.4172008547008547, + "grad_norm": 0.5016986727714539, + "learning_rate": 0.00019475992164536582, + "loss": 1.0656, + "step": 2343 + }, + { + "epoch": 0.41737891737891736, + "grad_norm": 0.5139234066009521, + "learning_rate": 0.00019475544906197024, + "loss": 1.1317, + "step": 2344 + }, + { + "epoch": 0.41755698005698005, + "grad_norm": 0.582478940486908, + "learning_rate": 0.00019475097462203847, + "loss": 1.4209, + "step": 2345 + }, + { + "epoch": 0.41773504273504275, + "grad_norm": 0.5248767137527466, + "learning_rate": 0.00019474649832565823, 
+ "loss": 1.2965, + "step": 2346 + }, + { + "epoch": 0.4179131054131054, + "grad_norm": 0.4977390170097351, + "learning_rate": 0.00019474202017291713, + "loss": 1.3319, + "step": 2347 + }, + { + "epoch": 0.4180911680911681, + "grad_norm": 0.4868984818458557, + "learning_rate": 0.00019473754016390298, + "loss": 1.0595, + "step": 2348 + }, + { + "epoch": 0.4182692307692308, + "grad_norm": 0.5965346693992615, + "learning_rate": 0.00019473305829870353, + "loss": 1.2289, + "step": 2349 + }, + { + "epoch": 0.4184472934472934, + "grad_norm": 0.46590209007263184, + "learning_rate": 0.0001947285745774066, + "loss": 1.0468, + "step": 2350 + }, + { + "epoch": 0.4186253561253561, + "grad_norm": 0.497811883687973, + "learning_rate": 0.0001947240890001, + "loss": 1.1247, + "step": 2351 + }, + { + "epoch": 0.4188034188034188, + "grad_norm": 0.5348289012908936, + "learning_rate": 0.0001947196015668717, + "loss": 0.9496, + "step": 2352 + }, + { + "epoch": 0.41898148148148145, + "grad_norm": 0.5086174607276917, + "learning_rate": 0.0001947151122778095, + "loss": 0.8869, + "step": 2353 + }, + { + "epoch": 0.41915954415954415, + "grad_norm": 0.4844677150249481, + "learning_rate": 0.00019471062113300146, + "loss": 0.847, + "step": 2354 + }, + { + "epoch": 0.41933760683760685, + "grad_norm": 0.5395866632461548, + "learning_rate": 0.00019470612813253556, + "loss": 0.9684, + "step": 2355 + }, + { + "epoch": 0.41951566951566954, + "grad_norm": 0.479403018951416, + "learning_rate": 0.0001947016332764998, + "loss": 1.0532, + "step": 2356 + }, + { + "epoch": 0.4196937321937322, + "grad_norm": 0.5499961376190186, + "learning_rate": 0.00019469713656498227, + "loss": 1.2565, + "step": 2357 + }, + { + "epoch": 0.4198717948717949, + "grad_norm": 0.5865352153778076, + "learning_rate": 0.00019469263799807104, + "loss": 1.1349, + "step": 2358 + }, + { + "epoch": 0.4200498575498576, + "grad_norm": 0.4454309046268463, + "learning_rate": 0.00019468813757585432, + "loss": 0.9631, + "step": 2359 + }, + { 
+ "epoch": 0.4202279202279202, + "grad_norm": 0.48426875472068787, + "learning_rate": 0.00019468363529842023, + "loss": 0.9795, + "step": 2360 + }, + { + "epoch": 0.4204059829059829, + "grad_norm": 0.47428226470947266, + "learning_rate": 0.00019467913116585697, + "loss": 0.9316, + "step": 2361 + }, + { + "epoch": 0.4205840455840456, + "grad_norm": 0.5193758010864258, + "learning_rate": 0.00019467462517825282, + "loss": 1.235, + "step": 2362 + }, + { + "epoch": 0.42076210826210825, + "grad_norm": 0.49845513701438904, + "learning_rate": 0.00019467011733569607, + "loss": 1.2413, + "step": 2363 + }, + { + "epoch": 0.42094017094017094, + "grad_norm": 0.45483845472335815, + "learning_rate": 0.00019466560763827502, + "loss": 1.2817, + "step": 2364 + }, + { + "epoch": 0.42111823361823364, + "grad_norm": 0.43345287442207336, + "learning_rate": 0.00019466109608607806, + "loss": 0.8568, + "step": 2365 + }, + { + "epoch": 0.4212962962962963, + "grad_norm": 0.4467088282108307, + "learning_rate": 0.00019465658267919352, + "loss": 1.1408, + "step": 2366 + }, + { + "epoch": 0.421474358974359, + "grad_norm": 0.6705610156059265, + "learning_rate": 0.00019465206741770992, + "loss": 1.445, + "step": 2367 + }, + { + "epoch": 0.42165242165242167, + "grad_norm": 0.5037859678268433, + "learning_rate": 0.00019464755030171565, + "loss": 0.8682, + "step": 2368 + }, + { + "epoch": 0.4218304843304843, + "grad_norm": 0.49576324224472046, + "learning_rate": 0.00019464303133129928, + "loss": 0.8387, + "step": 2369 + }, + { + "epoch": 0.422008547008547, + "grad_norm": 0.5222806334495544, + "learning_rate": 0.00019463851050654927, + "loss": 1.1443, + "step": 2370 + }, + { + "epoch": 0.4221866096866097, + "grad_norm": 0.4966863989830017, + "learning_rate": 0.00019463398782755426, + "loss": 1.1555, + "step": 2371 + }, + { + "epoch": 0.42236467236467234, + "grad_norm": 0.6140168309211731, + "learning_rate": 0.00019462946329440285, + "loss": 1.2264, + "step": 2372 + }, + { + "epoch": 
0.42254273504273504, + "grad_norm": 0.4906651973724365, + "learning_rate": 0.0001946249369071837, + "loss": 1.2459, + "step": 2373 + }, + { + "epoch": 0.42272079772079774, + "grad_norm": 0.5956700444221497, + "learning_rate": 0.00019462040866598544, + "loss": 1.1521, + "step": 2374 + }, + { + "epoch": 0.4228988603988604, + "grad_norm": 0.46044886112213135, + "learning_rate": 0.00019461587857089687, + "loss": 1.2084, + "step": 2375 + }, + { + "epoch": 0.4230769230769231, + "grad_norm": 0.5109430551528931, + "learning_rate": 0.00019461134662200668, + "loss": 1.2684, + "step": 2376 + }, + { + "epoch": 0.42325498575498577, + "grad_norm": 0.4373733103275299, + "learning_rate": 0.0001946068128194037, + "loss": 1.0451, + "step": 2377 + }, + { + "epoch": 0.4234330484330484, + "grad_norm": 0.553817868232727, + "learning_rate": 0.00019460227716317673, + "loss": 1.1052, + "step": 2378 + }, + { + "epoch": 0.4236111111111111, + "grad_norm": 0.5742647647857666, + "learning_rate": 0.00019459773965341468, + "loss": 1.1647, + "step": 2379 + }, + { + "epoch": 0.4237891737891738, + "grad_norm": 0.5461940169334412, + "learning_rate": 0.00019459320029020642, + "loss": 1.0953, + "step": 2380 + }, + { + "epoch": 0.42396723646723644, + "grad_norm": 0.5837802290916443, + "learning_rate": 0.0001945886590736409, + "loss": 1.1303, + "step": 2381 + }, + { + "epoch": 0.42414529914529914, + "grad_norm": 0.5316985249519348, + "learning_rate": 0.0001945841160038071, + "loss": 1.1204, + "step": 2382 + }, + { + "epoch": 0.42432336182336183, + "grad_norm": 0.5846191048622131, + "learning_rate": 0.00019457957108079404, + "loss": 1.2622, + "step": 2383 + }, + { + "epoch": 0.42450142450142453, + "grad_norm": 0.43266957998275757, + "learning_rate": 0.00019457502430469075, + "loss": 0.9834, + "step": 2384 + }, + { + "epoch": 0.42467948717948717, + "grad_norm": 0.514081597328186, + "learning_rate": 0.00019457047567558632, + "loss": 0.8413, + "step": 2385 + }, + { + "epoch": 0.42485754985754987, + 
"grad_norm": 0.4831700325012207, + "learning_rate": 0.00019456592519356987, + "loss": 0.9244, + "step": 2386 + }, + { + "epoch": 0.42503561253561256, + "grad_norm": 0.5612850785255432, + "learning_rate": 0.00019456137285873057, + "loss": 0.9438, + "step": 2387 + }, + { + "epoch": 0.4252136752136752, + "grad_norm": 0.5197352766990662, + "learning_rate": 0.00019455681867115758, + "loss": 1.1095, + "step": 2388 + }, + { + "epoch": 0.4253917378917379, + "grad_norm": 0.5045261979103088, + "learning_rate": 0.00019455226263094018, + "loss": 1.0007, + "step": 2389 + }, + { + "epoch": 0.4255698005698006, + "grad_norm": 0.5167570114135742, + "learning_rate": 0.00019454770473816758, + "loss": 1.1335, + "step": 2390 + }, + { + "epoch": 0.42574786324786323, + "grad_norm": 0.49262070655822754, + "learning_rate": 0.00019454314499292913, + "loss": 1.0436, + "step": 2391 + }, + { + "epoch": 0.42592592592592593, + "grad_norm": 0.4489207863807678, + "learning_rate": 0.00019453858339531417, + "loss": 1.0138, + "step": 2392 + }, + { + "epoch": 0.4261039886039886, + "grad_norm": 0.6024920344352722, + "learning_rate": 0.00019453401994541203, + "loss": 1.1921, + "step": 2393 + }, + { + "epoch": 0.42628205128205127, + "grad_norm": 0.46807861328125, + "learning_rate": 0.00019452945464331215, + "loss": 1.0947, + "step": 2394 + }, + { + "epoch": 0.42646011396011396, + "grad_norm": 0.48776543140411377, + "learning_rate": 0.00019452488748910397, + "loss": 1.0029, + "step": 2395 + }, + { + "epoch": 0.42663817663817666, + "grad_norm": 0.4798663556575775, + "learning_rate": 0.000194520318482877, + "loss": 0.7863, + "step": 2396 + }, + { + "epoch": 0.4268162393162393, + "grad_norm": 0.5067816972732544, + "learning_rate": 0.0001945157476247207, + "loss": 1.0049, + "step": 2397 + }, + { + "epoch": 0.426994301994302, + "grad_norm": 0.5179638266563416, + "learning_rate": 0.00019451117491472468, + "loss": 1.1851, + "step": 2398 + }, + { + "epoch": 0.4271723646723647, + "grad_norm": 0.4782430827617645, + 
"learning_rate": 0.00019450660035297854, + "loss": 1.125, + "step": 2399 + }, + { + "epoch": 0.42735042735042733, + "grad_norm": 0.560077965259552, + "learning_rate": 0.00019450202393957186, + "loss": 1.1843, + "step": 2400 + }, + { + "epoch": 0.42752849002849, + "grad_norm": 0.5247970223426819, + "learning_rate": 0.00019449744567459436, + "loss": 1.1576, + "step": 2401 + }, + { + "epoch": 0.4277065527065527, + "grad_norm": 0.6414062976837158, + "learning_rate": 0.00019449286555813568, + "loss": 1.1833, + "step": 2402 + }, + { + "epoch": 0.42788461538461536, + "grad_norm": 0.5006586909294128, + "learning_rate": 0.00019448828359028563, + "loss": 1.1778, + "step": 2403 + }, + { + "epoch": 0.42806267806267806, + "grad_norm": 0.4946450889110565, + "learning_rate": 0.0001944836997711339, + "loss": 1.1611, + "step": 2404 + }, + { + "epoch": 0.42824074074074076, + "grad_norm": 0.4601200222969055, + "learning_rate": 0.00019447911410077037, + "loss": 1.2456, + "step": 2405 + }, + { + "epoch": 0.4284188034188034, + "grad_norm": 0.4653947651386261, + "learning_rate": 0.00019447452657928485, + "loss": 1.0941, + "step": 2406 + }, + { + "epoch": 0.4285968660968661, + "grad_norm": 0.5015713572502136, + "learning_rate": 0.00019446993720676726, + "loss": 1.3113, + "step": 2407 + }, + { + "epoch": 0.4287749287749288, + "grad_norm": 0.5803143978118896, + "learning_rate": 0.0001944653459833075, + "loss": 1.0568, + "step": 2408 + }, + { + "epoch": 0.42895299145299143, + "grad_norm": 0.5259647965431213, + "learning_rate": 0.0001944607529089955, + "loss": 1.1243, + "step": 2409 + }, + { + "epoch": 0.4291310541310541, + "grad_norm": 0.5150414109230042, + "learning_rate": 0.00019445615798392124, + "loss": 1.0676, + "step": 2410 + }, + { + "epoch": 0.4293091168091168, + "grad_norm": 0.5848649740219116, + "learning_rate": 0.0001944515612081748, + "loss": 1.0671, + "step": 2411 + }, + { + "epoch": 0.42948717948717946, + "grad_norm": 0.5696990489959717, + "learning_rate": 
0.00019444696258184626, + "loss": 1.3323, + "step": 2412 + }, + { + "epoch": 0.42966524216524216, + "grad_norm": 0.49822330474853516, + "learning_rate": 0.00019444236210502567, + "loss": 1.1004, + "step": 2413 + }, + { + "epoch": 0.42984330484330485, + "grad_norm": 0.4683490991592407, + "learning_rate": 0.00019443775977780317, + "loss": 0.9768, + "step": 2414 + }, + { + "epoch": 0.43002136752136755, + "grad_norm": 0.5703811049461365, + "learning_rate": 0.00019443315560026893, + "loss": 1.154, + "step": 2415 + }, + { + "epoch": 0.4301994301994302, + "grad_norm": 0.5121861100196838, + "learning_rate": 0.0001944285495725132, + "loss": 1.1388, + "step": 2416 + }, + { + "epoch": 0.4303774928774929, + "grad_norm": 0.4864094853401184, + "learning_rate": 0.00019442394169462619, + "loss": 0.9214, + "step": 2417 + }, + { + "epoch": 0.4305555555555556, + "grad_norm": 0.5234864354133606, + "learning_rate": 0.0001944193319666982, + "loss": 1.2787, + "step": 2418 + }, + { + "epoch": 0.4307336182336182, + "grad_norm": 0.5137650370597839, + "learning_rate": 0.00019441472038881955, + "loss": 1.1406, + "step": 2419 + }, + { + "epoch": 0.4309116809116809, + "grad_norm": 0.49687784910202026, + "learning_rate": 0.00019441010696108054, + "loss": 0.93, + "step": 2420 + }, + { + "epoch": 0.4310897435897436, + "grad_norm": 0.5078722834587097, + "learning_rate": 0.00019440549168357163, + "loss": 1.1417, + "step": 2421 + }, + { + "epoch": 0.43126780626780625, + "grad_norm": 0.4483391046524048, + "learning_rate": 0.00019440087455638324, + "loss": 0.9016, + "step": 2422 + }, + { + "epoch": 0.43144586894586895, + "grad_norm": 0.5963045954704285, + "learning_rate": 0.00019439625557960576, + "loss": 1.1567, + "step": 2423 + }, + { + "epoch": 0.43162393162393164, + "grad_norm": 0.5534471273422241, + "learning_rate": 0.0001943916347533298, + "loss": 1.1409, + "step": 2424 + }, + { + "epoch": 0.4318019943019943, + "grad_norm": 0.6400241851806641, + "learning_rate": 0.0001943870120776458, + "loss": 
1.2041, + "step": 2425 + }, + { + "epoch": 0.431980056980057, + "grad_norm": 0.4599420726299286, + "learning_rate": 0.0001943823875526444, + "loss": 1.023, + "step": 2426 + }, + { + "epoch": 0.4321581196581197, + "grad_norm": 0.4799708425998688, + "learning_rate": 0.00019437776117841614, + "loss": 1.0872, + "step": 2427 + }, + { + "epoch": 0.4323361823361823, + "grad_norm": 0.5138532519340515, + "learning_rate": 0.00019437313295505172, + "loss": 1.1175, + "step": 2428 + }, + { + "epoch": 0.432514245014245, + "grad_norm": 0.538223147392273, + "learning_rate": 0.00019436850288264183, + "loss": 1.1203, + "step": 2429 + }, + { + "epoch": 0.4326923076923077, + "grad_norm": 0.458044171333313, + "learning_rate": 0.00019436387096127713, + "loss": 1.0383, + "step": 2430 + }, + { + "epoch": 0.43287037037037035, + "grad_norm": 0.5928303599357605, + "learning_rate": 0.00019435923719104842, + "loss": 1.1191, + "step": 2431 + }, + { + "epoch": 0.43304843304843305, + "grad_norm": 0.5818437933921814, + "learning_rate": 0.00019435460157204645, + "loss": 1.0352, + "step": 2432 + }, + { + "epoch": 0.43322649572649574, + "grad_norm": 0.487341046333313, + "learning_rate": 0.0001943499641043621, + "loss": 1.2608, + "step": 2433 + }, + { + "epoch": 0.4334045584045584, + "grad_norm": 0.4737292230129242, + "learning_rate": 0.0001943453247880862, + "loss": 1.0084, + "step": 2434 + }, + { + "epoch": 0.4335826210826211, + "grad_norm": 0.4251207709312439, + "learning_rate": 0.0001943406836233096, + "loss": 0.9163, + "step": 2435 + }, + { + "epoch": 0.4337606837606838, + "grad_norm": 0.49468478560447693, + "learning_rate": 0.00019433604061012331, + "loss": 1.0293, + "step": 2436 + }, + { + "epoch": 0.4339387464387464, + "grad_norm": 0.47120022773742676, + "learning_rate": 0.00019433139574861826, + "loss": 1.0097, + "step": 2437 + }, + { + "epoch": 0.4341168091168091, + "grad_norm": 0.5060358047485352, + "learning_rate": 0.00019432674903888548, + "loss": 1.0683, + "step": 2438 + }, + { + 
"epoch": 0.4342948717948718, + "grad_norm": 0.5455917119979858, + "learning_rate": 0.00019432210048101598, + "loss": 0.8886, + "step": 2439 + }, + { + "epoch": 0.43447293447293445, + "grad_norm": 0.7960546612739563, + "learning_rate": 0.00019431745007510086, + "loss": 0.8648, + "step": 2440 + }, + { + "epoch": 0.43465099715099714, + "grad_norm": 0.5069689154624939, + "learning_rate": 0.00019431279782123126, + "loss": 1.1315, + "step": 2441 + }, + { + "epoch": 0.43482905982905984, + "grad_norm": 0.5597776174545288, + "learning_rate": 0.0001943081437194983, + "loss": 1.2281, + "step": 2442 + }, + { + "epoch": 0.43500712250712253, + "grad_norm": 0.4527420997619629, + "learning_rate": 0.00019430348776999315, + "loss": 0.7576, + "step": 2443 + }, + { + "epoch": 0.4351851851851852, + "grad_norm": 0.5625936388969421, + "learning_rate": 0.00019429882997280706, + "loss": 1.0302, + "step": 2444 + }, + { + "epoch": 0.43536324786324787, + "grad_norm": 0.5173513293266296, + "learning_rate": 0.0001942941703280313, + "loss": 1.2255, + "step": 2445 + }, + { + "epoch": 0.43554131054131057, + "grad_norm": 0.45889151096343994, + "learning_rate": 0.00019428950883575714, + "loss": 0.9322, + "step": 2446 + }, + { + "epoch": 0.4357193732193732, + "grad_norm": 0.5288477540016174, + "learning_rate": 0.00019428484549607593, + "loss": 1.0572, + "step": 2447 + }, + { + "epoch": 0.4358974358974359, + "grad_norm": 0.48328033089637756, + "learning_rate": 0.00019428018030907902, + "loss": 1.1213, + "step": 2448 + }, + { + "epoch": 0.4360754985754986, + "grad_norm": 0.5146737098693848, + "learning_rate": 0.00019427551327485786, + "loss": 0.9633, + "step": 2449 + }, + { + "epoch": 0.43625356125356124, + "grad_norm": 0.5138360261917114, + "learning_rate": 0.00019427084439350382, + "loss": 1.0561, + "step": 2450 + }, + { + "epoch": 0.43643162393162394, + "grad_norm": 0.5192533135414124, + "learning_rate": 0.00019426617366510843, + "loss": 1.1704, + "step": 2451 + }, + { + "epoch": 
0.43660968660968663, + "grad_norm": 0.4819495379924774, + "learning_rate": 0.00019426150108976318, + "loss": 1.0958, + "step": 2452 + }, + { + "epoch": 0.43678774928774927, + "grad_norm": 0.4626680910587311, + "learning_rate": 0.00019425682666755965, + "loss": 1.1872, + "step": 2453 + }, + { + "epoch": 0.43696581196581197, + "grad_norm": 0.5773931741714478, + "learning_rate": 0.00019425215039858937, + "loss": 1.0722, + "step": 2454 + }, + { + "epoch": 0.43714387464387466, + "grad_norm": 0.5003872513771057, + "learning_rate": 0.00019424747228294402, + "loss": 1.0561, + "step": 2455 + }, + { + "epoch": 0.4373219373219373, + "grad_norm": 0.47370314598083496, + "learning_rate": 0.0001942427923207152, + "loss": 1.1619, + "step": 2456 + }, + { + "epoch": 0.4375, + "grad_norm": 0.466421514749527, + "learning_rate": 0.00019423811051199466, + "loss": 1.1311, + "step": 2457 + }, + { + "epoch": 0.4376780626780627, + "grad_norm": 0.44564682245254517, + "learning_rate": 0.00019423342685687413, + "loss": 1.1889, + "step": 2458 + }, + { + "epoch": 0.43785612535612534, + "grad_norm": 0.40986698865890503, + "learning_rate": 0.00019422874135544533, + "loss": 0.7312, + "step": 2459 + }, + { + "epoch": 0.43803418803418803, + "grad_norm": 0.4714358448982239, + "learning_rate": 0.0001942240540078001, + "loss": 0.9273, + "step": 2460 + }, + { + "epoch": 0.43821225071225073, + "grad_norm": 0.5298398733139038, + "learning_rate": 0.00019421936481403025, + "loss": 1.3377, + "step": 2461 + }, + { + "epoch": 0.43839031339031337, + "grad_norm": 0.6326695680618286, + "learning_rate": 0.0001942146737742277, + "loss": 1.0258, + "step": 2462 + }, + { + "epoch": 0.43856837606837606, + "grad_norm": 0.5087653994560242, + "learning_rate": 0.00019420998088848427, + "loss": 1.0007, + "step": 2463 + }, + { + "epoch": 0.43874643874643876, + "grad_norm": 0.4895429313182831, + "learning_rate": 0.00019420528615689202, + "loss": 1.0032, + "step": 2464 + }, + { + "epoch": 0.4389245014245014, + "grad_norm": 
0.5029937028884888, + "learning_rate": 0.00019420058957954285, + "loss": 1.2877, + "step": 2465 + }, + { + "epoch": 0.4391025641025641, + "grad_norm": 0.4953192174434662, + "learning_rate": 0.00019419589115652884, + "loss": 1.0759, + "step": 2466 + }, + { + "epoch": 0.4392806267806268, + "grad_norm": 0.5081778168678284, + "learning_rate": 0.000194191190887942, + "loss": 0.8816, + "step": 2467 + }, + { + "epoch": 0.43945868945868943, + "grad_norm": 0.5065913200378418, + "learning_rate": 0.00019418648877387446, + "loss": 1.0362, + "step": 2468 + }, + { + "epoch": 0.43963675213675213, + "grad_norm": 0.540600061416626, + "learning_rate": 0.00019418178481441832, + "loss": 1.0911, + "step": 2469 + }, + { + "epoch": 0.4398148148148148, + "grad_norm": 0.5122954845428467, + "learning_rate": 0.00019417707900966572, + "loss": 0.9866, + "step": 2470 + }, + { + "epoch": 0.43999287749287747, + "grad_norm": 0.5380190014839172, + "learning_rate": 0.00019417237135970893, + "loss": 1.2775, + "step": 2471 + }, + { + "epoch": 0.44017094017094016, + "grad_norm": 1.2977570295333862, + "learning_rate": 0.00019416766186464016, + "loss": 1.3993, + "step": 2472 + }, + { + "epoch": 0.44034900284900286, + "grad_norm": 0.48105308413505554, + "learning_rate": 0.00019416295052455165, + "loss": 0.9369, + "step": 2473 + }, + { + "epoch": 0.44052706552706555, + "grad_norm": 0.4742157459259033, + "learning_rate": 0.00019415823733953574, + "loss": 1.101, + "step": 2474 + }, + { + "epoch": 0.4407051282051282, + "grad_norm": 0.4958631694316864, + "learning_rate": 0.00019415352230968473, + "loss": 0.9906, + "step": 2475 + }, + { + "epoch": 0.4408831908831909, + "grad_norm": 0.5808146595954895, + "learning_rate": 0.00019414880543509107, + "loss": 1.2315, + "step": 2476 + }, + { + "epoch": 0.4410612535612536, + "grad_norm": 0.4294755160808563, + "learning_rate": 0.00019414408671584714, + "loss": 0.8275, + "step": 2477 + }, + { + "epoch": 0.4412393162393162, + "grad_norm": 0.5346055626869202, + 
"learning_rate": 0.0001941393661520454, + "loss": 1.2432, + "step": 2478 + }, + { + "epoch": 0.4414173789173789, + "grad_norm": 0.5827590227127075, + "learning_rate": 0.00019413464374377833, + "loss": 1.3204, + "step": 2479 + }, + { + "epoch": 0.4415954415954416, + "grad_norm": 0.45688143372535706, + "learning_rate": 0.00019412991949113847, + "loss": 0.9307, + "step": 2480 + }, + { + "epoch": 0.44177350427350426, + "grad_norm": 0.512999415397644, + "learning_rate": 0.0001941251933942184, + "loss": 1.2808, + "step": 2481 + }, + { + "epoch": 0.44195156695156695, + "grad_norm": 0.4546334445476532, + "learning_rate": 0.00019412046545311064, + "loss": 1.0156, + "step": 2482 + }, + { + "epoch": 0.44212962962962965, + "grad_norm": 0.48552581667900085, + "learning_rate": 0.00019411573566790793, + "loss": 1.3798, + "step": 2483 + }, + { + "epoch": 0.4423076923076923, + "grad_norm": 0.511970579624176, + "learning_rate": 0.00019411100403870287, + "loss": 1.065, + "step": 2484 + }, + { + "epoch": 0.442485754985755, + "grad_norm": 0.6367824077606201, + "learning_rate": 0.00019410627056558815, + "loss": 1.3242, + "step": 2485 + }, + { + "epoch": 0.4426638176638177, + "grad_norm": 0.48913368582725525, + "learning_rate": 0.00019410153524865659, + "loss": 0.9761, + "step": 2486 + }, + { + "epoch": 0.4428418803418803, + "grad_norm": 0.5077710151672363, + "learning_rate": 0.0001940967980880009, + "loss": 1.1023, + "step": 2487 + }, + { + "epoch": 0.443019943019943, + "grad_norm": 0.4956335723400116, + "learning_rate": 0.00019409205908371395, + "loss": 1.1788, + "step": 2488 + }, + { + "epoch": 0.4431980056980057, + "grad_norm": 0.4726616442203522, + "learning_rate": 0.00019408731823588853, + "loss": 1.1445, + "step": 2489 + }, + { + "epoch": 0.44337606837606836, + "grad_norm": 0.5676438212394714, + "learning_rate": 0.00019408257554461757, + "loss": 1.0344, + "step": 2490 + }, + { + "epoch": 0.44355413105413105, + "grad_norm": 0.537656843662262, + "learning_rate": 
0.000194077831009994, + "loss": 0.9876, + "step": 2491 + }, + { + "epoch": 0.44373219373219375, + "grad_norm": 0.517905592918396, + "learning_rate": 0.00019407308463211074, + "loss": 1.1389, + "step": 2492 + }, + { + "epoch": 0.4439102564102564, + "grad_norm": 0.49227026104927063, + "learning_rate": 0.0001940683364110608, + "loss": 1.0351, + "step": 2493 + }, + { + "epoch": 0.4440883190883191, + "grad_norm": 0.5131173729896545, + "learning_rate": 0.00019406358634693725, + "loss": 1.0351, + "step": 2494 + }, + { + "epoch": 0.4442663817663818, + "grad_norm": 0.5064495205879211, + "learning_rate": 0.0001940588344398331, + "loss": 1.0248, + "step": 2495 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.44107526540756226, + "learning_rate": 0.00019405408068984148, + "loss": 0.8068, + "step": 2496 + }, + { + "epoch": 0.4446225071225071, + "grad_norm": 0.6711848378181458, + "learning_rate": 0.00019404932509705554, + "loss": 1.059, + "step": 2497 + }, + { + "epoch": 0.4448005698005698, + "grad_norm": 0.5862596035003662, + "learning_rate": 0.00019404456766156845, + "loss": 1.2012, + "step": 2498 + }, + { + "epoch": 0.44497863247863245, + "grad_norm": 0.5528512001037598, + "learning_rate": 0.0001940398083834734, + "loss": 1.1121, + "step": 2499 + }, + { + "epoch": 0.44515669515669515, + "grad_norm": 0.5326655507087708, + "learning_rate": 0.0001940350472628637, + "loss": 1.166, + "step": 2500 + }, + { + "epoch": 0.44533475783475784, + "grad_norm": 0.5384873747825623, + "learning_rate": 0.00019403028429983252, + "loss": 1.4111, + "step": 2501 + }, + { + "epoch": 0.44551282051282054, + "grad_norm": 0.5142310857772827, + "learning_rate": 0.0001940255194944733, + "loss": 1.3353, + "step": 2502 + }, + { + "epoch": 0.4456908831908832, + "grad_norm": 0.49124231934547424, + "learning_rate": 0.0001940207528468793, + "loss": 1.1443, + "step": 2503 + }, + { + "epoch": 0.4458689458689459, + "grad_norm": 0.509713888168335, + "learning_rate": 0.000194015984357144, + "loss": 1.1857, 
+ "step": 2504 + }, + { + "epoch": 0.44604700854700857, + "grad_norm": 0.5211275219917297, + "learning_rate": 0.00019401121402536078, + "loss": 0.9911, + "step": 2505 + }, + { + "epoch": 0.4462250712250712, + "grad_norm": 0.480340838432312, + "learning_rate": 0.00019400644185162312, + "loss": 1.1018, + "step": 2506 + }, + { + "epoch": 0.4464031339031339, + "grad_norm": 0.4212559163570404, + "learning_rate": 0.00019400166783602448, + "loss": 0.7501, + "step": 2507 + }, + { + "epoch": 0.4465811965811966, + "grad_norm": 0.5110511183738708, + "learning_rate": 0.00019399689197865846, + "loss": 1.1244, + "step": 2508 + }, + { + "epoch": 0.44675925925925924, + "grad_norm": 0.5604230165481567, + "learning_rate": 0.0001939921142796186, + "loss": 1.1066, + "step": 2509 + }, + { + "epoch": 0.44693732193732194, + "grad_norm": 0.5578675270080566, + "learning_rate": 0.0001939873347389985, + "loss": 1.0514, + "step": 2510 + }, + { + "epoch": 0.44711538461538464, + "grad_norm": 0.520908772945404, + "learning_rate": 0.00019398255335689184, + "loss": 1.1217, + "step": 2511 + }, + { + "epoch": 0.4472934472934473, + "grad_norm": 0.4405131936073303, + "learning_rate": 0.00019397777013339224, + "loss": 1.043, + "step": 2512 + }, + { + "epoch": 0.44747150997151, + "grad_norm": 0.5217751860618591, + "learning_rate": 0.0001939729850685935, + "loss": 1.1301, + "step": 2513 + }, + { + "epoch": 0.44764957264957267, + "grad_norm": 0.6151493191719055, + "learning_rate": 0.00019396819816258932, + "loss": 1.3498, + "step": 2514 + }, + { + "epoch": 0.4478276353276353, + "grad_norm": 0.5622836947441101, + "learning_rate": 0.0001939634094154735, + "loss": 1.146, + "step": 2515 + }, + { + "epoch": 0.448005698005698, + "grad_norm": 0.4671688973903656, + "learning_rate": 0.00019395861882733984, + "loss": 0.9456, + "step": 2516 + }, + { + "epoch": 0.4481837606837607, + "grad_norm": 0.453951358795166, + "learning_rate": 0.00019395382639828223, + "loss": 1.0042, + "step": 2517 + }, + { + "epoch": 
0.44836182336182334, + "grad_norm": 0.5150699615478516, + "learning_rate": 0.0001939490321283946, + "loss": 1.166, + "step": 2518 + }, + { + "epoch": 0.44853988603988604, + "grad_norm": 0.5718298554420471, + "learning_rate": 0.0001939442360177708, + "loss": 1.2033, + "step": 2519 + }, + { + "epoch": 0.44871794871794873, + "grad_norm": 0.5306782126426697, + "learning_rate": 0.00019393943806650488, + "loss": 1.0765, + "step": 2520 + }, + { + "epoch": 0.4488960113960114, + "grad_norm": 0.47633033990859985, + "learning_rate": 0.0001939346382746908, + "loss": 0.9957, + "step": 2521 + }, + { + "epoch": 0.44907407407407407, + "grad_norm": 0.496441513299942, + "learning_rate": 0.00019392983664242262, + "loss": 1.2016, + "step": 2522 + }, + { + "epoch": 0.44925213675213677, + "grad_norm": 0.45956477522850037, + "learning_rate": 0.00019392503316979442, + "loss": 1.026, + "step": 2523 + }, + { + "epoch": 0.4494301994301994, + "grad_norm": 0.5400575995445251, + "learning_rate": 0.0001939202278569003, + "loss": 1.0785, + "step": 2524 + }, + { + "epoch": 0.4496082621082621, + "grad_norm": 0.4847868084907532, + "learning_rate": 0.00019391542070383442, + "loss": 1.013, + "step": 2525 + }, + { + "epoch": 0.4497863247863248, + "grad_norm": 0.4694063663482666, + "learning_rate": 0.00019391061171069094, + "loss": 0.8793, + "step": 2526 + }, + { + "epoch": 0.44996438746438744, + "grad_norm": 0.5158169269561768, + "learning_rate": 0.00019390580087756413, + "loss": 0.9602, + "step": 2527 + }, + { + "epoch": 0.45014245014245013, + "grad_norm": 0.5404585003852844, + "learning_rate": 0.00019390098820454822, + "loss": 1.2247, + "step": 2528 + }, + { + "epoch": 0.45032051282051283, + "grad_norm": 0.5302738547325134, + "learning_rate": 0.00019389617369173752, + "loss": 0.918, + "step": 2529 + }, + { + "epoch": 0.45049857549857547, + "grad_norm": 0.5065485835075378, + "learning_rate": 0.00019389135733922634, + "loss": 1.0934, + "step": 2530 + }, + { + "epoch": 0.45067663817663817, + 
"grad_norm": 0.5491471886634827, + "learning_rate": 0.00019388653914710903, + "loss": 1.0736, + "step": 2531 + }, + { + "epoch": 0.45085470085470086, + "grad_norm": 0.4850206971168518, + "learning_rate": 0.00019388171911548005, + "loss": 1.2401, + "step": 2532 + }, + { + "epoch": 0.45103276353276356, + "grad_norm": 0.5419789552688599, + "learning_rate": 0.0001938768972444338, + "loss": 1.269, + "step": 2533 + }, + { + "epoch": 0.4512108262108262, + "grad_norm": 0.4209023714065552, + "learning_rate": 0.00019387207353406476, + "loss": 1.0544, + "step": 2534 + }, + { + "epoch": 0.4513888888888889, + "grad_norm": 0.578588604927063, + "learning_rate": 0.00019386724798446743, + "loss": 1.0564, + "step": 2535 + }, + { + "epoch": 0.4515669515669516, + "grad_norm": 0.5277524590492249, + "learning_rate": 0.00019386242059573638, + "loss": 1.1497, + "step": 2536 + }, + { + "epoch": 0.45174501424501423, + "grad_norm": 0.5536073446273804, + "learning_rate": 0.0001938575913679662, + "loss": 1.2213, + "step": 2537 + }, + { + "epoch": 0.4519230769230769, + "grad_norm": 0.5572254657745361, + "learning_rate": 0.00019385276030125143, + "loss": 1.0231, + "step": 2538 + }, + { + "epoch": 0.4521011396011396, + "grad_norm": 0.493847131729126, + "learning_rate": 0.00019384792739568686, + "loss": 0.9385, + "step": 2539 + }, + { + "epoch": 0.45227920227920226, + "grad_norm": 0.4641396403312683, + "learning_rate": 0.00019384309265136707, + "loss": 0.9332, + "step": 2540 + }, + { + "epoch": 0.45245726495726496, + "grad_norm": 0.5439442992210388, + "learning_rate": 0.00019383825606838681, + "loss": 1.317, + "step": 2541 + }, + { + "epoch": 0.45263532763532766, + "grad_norm": 0.7050970792770386, + "learning_rate": 0.00019383341764684086, + "loss": 0.9508, + "step": 2542 + }, + { + "epoch": 0.4528133903133903, + "grad_norm": 0.5013265013694763, + "learning_rate": 0.000193828577386824, + "loss": 1.2704, + "step": 2543 + }, + { + "epoch": 0.452991452991453, + "grad_norm": 0.47641924023628235, + 
"learning_rate": 0.0001938237352884311, + "loss": 1.0101, + "step": 2544 + }, + { + "epoch": 0.4531695156695157, + "grad_norm": 0.5223637819290161, + "learning_rate": 0.000193818891351757, + "loss": 1.0548, + "step": 2545 + }, + { + "epoch": 0.45334757834757833, + "grad_norm": 0.49065667390823364, + "learning_rate": 0.0001938140455768966, + "loss": 1.0927, + "step": 2546 + }, + { + "epoch": 0.453525641025641, + "grad_norm": 0.4808312654495239, + "learning_rate": 0.0001938091979639449, + "loss": 1.0599, + "step": 2547 + }, + { + "epoch": 0.4537037037037037, + "grad_norm": 0.5157489776611328, + "learning_rate": 0.0001938043485129968, + "loss": 1.2596, + "step": 2548 + }, + { + "epoch": 0.45388176638176636, + "grad_norm": 0.5983387231826782, + "learning_rate": 0.0001937994972241474, + "loss": 1.2276, + "step": 2549 + }, + { + "epoch": 0.45405982905982906, + "grad_norm": 0.49776506423950195, + "learning_rate": 0.00019379464409749163, + "loss": 1.3666, + "step": 2550 + }, + { + "epoch": 0.45423789173789175, + "grad_norm": 0.4693490266799927, + "learning_rate": 0.00019378978913312471, + "loss": 1.087, + "step": 2551 + }, + { + "epoch": 0.4544159544159544, + "grad_norm": 0.4754335880279541, + "learning_rate": 0.00019378493233114167, + "loss": 1.1282, + "step": 2552 + }, + { + "epoch": 0.4545940170940171, + "grad_norm": 0.5852862000465393, + "learning_rate": 0.00019378007369163776, + "loss": 1.1113, + "step": 2553 + }, + { + "epoch": 0.4547720797720798, + "grad_norm": 0.47442635893821716, + "learning_rate": 0.00019377521321470805, + "loss": 0.983, + "step": 2554 + }, + { + "epoch": 0.4549501424501424, + "grad_norm": 0.47432273626327515, + "learning_rate": 0.00019377035090044787, + "loss": 1.0169, + "step": 2555 + }, + { + "epoch": 0.4551282051282051, + "grad_norm": 0.4929196834564209, + "learning_rate": 0.00019376548674895246, + "loss": 1.0182, + "step": 2556 + }, + { + "epoch": 0.4553062678062678, + "grad_norm": 0.5433184504508972, + "learning_rate": 
0.00019376062076031708, + "loss": 1.1339, + "step": 2557 + }, + { + "epoch": 0.45548433048433046, + "grad_norm": 0.47430408000946045, + "learning_rate": 0.00019375575293463715, + "loss": 1.1589, + "step": 2558 + }, + { + "epoch": 0.45566239316239315, + "grad_norm": 0.46641045808792114, + "learning_rate": 0.000193750883272008, + "loss": 1.029, + "step": 2559 + }, + { + "epoch": 0.45584045584045585, + "grad_norm": 0.44476228952407837, + "learning_rate": 0.00019374601177252502, + "loss": 0.8494, + "step": 2560 + }, + { + "epoch": 0.45601851851851855, + "grad_norm": 0.4886183440685272, + "learning_rate": 0.00019374113843628366, + "loss": 1.1374, + "step": 2561 + }, + { + "epoch": 0.4561965811965812, + "grad_norm": 0.4786703288555145, + "learning_rate": 0.00019373626326337946, + "loss": 1.2861, + "step": 2562 + }, + { + "epoch": 0.4563746438746439, + "grad_norm": 0.5752716660499573, + "learning_rate": 0.0001937313862539079, + "loss": 1.2365, + "step": 2563 + }, + { + "epoch": 0.4565527065527066, + "grad_norm": 0.519176185131073, + "learning_rate": 0.00019372650740796452, + "loss": 1.2264, + "step": 2564 + }, + { + "epoch": 0.4567307692307692, + "grad_norm": 0.5927292704582214, + "learning_rate": 0.00019372162672564493, + "loss": 0.8979, + "step": 2565 + }, + { + "epoch": 0.4569088319088319, + "grad_norm": 0.5467435121536255, + "learning_rate": 0.00019371674420704478, + "loss": 1.1016, + "step": 2566 + }, + { + "epoch": 0.4570868945868946, + "grad_norm": 0.49593284726142883, + "learning_rate": 0.00019371185985225968, + "loss": 0.982, + "step": 2567 + }, + { + "epoch": 0.45726495726495725, + "grad_norm": 0.5696587562561035, + "learning_rate": 0.00019370697366138538, + "loss": 0.979, + "step": 2568 + }, + { + "epoch": 0.45744301994301995, + "grad_norm": 0.4455752968788147, + "learning_rate": 0.00019370208563451757, + "loss": 0.8832, + "step": 2569 + }, + { + "epoch": 0.45762108262108264, + "grad_norm": 0.5072923302650452, + "learning_rate": 0.00019369719577175203, + 
"loss": 1.1046, + "step": 2570 + }, + { + "epoch": 0.4577991452991453, + "grad_norm": 0.45119982957839966, + "learning_rate": 0.0001936923040731846, + "loss": 1.0083, + "step": 2571 + }, + { + "epoch": 0.457977207977208, + "grad_norm": 0.5062251091003418, + "learning_rate": 0.00019368741053891108, + "loss": 1.2771, + "step": 2572 + }, + { + "epoch": 0.4581552706552707, + "grad_norm": 0.5511104464530945, + "learning_rate": 0.0001936825151690274, + "loss": 1.0039, + "step": 2573 + }, + { + "epoch": 0.4583333333333333, + "grad_norm": 0.4721006453037262, + "learning_rate": 0.0001936776179636294, + "loss": 1.3246, + "step": 2574 + }, + { + "epoch": 0.458511396011396, + "grad_norm": 0.5021488666534424, + "learning_rate": 0.0001936727189228131, + "loss": 1.1733, + "step": 2575 + }, + { + "epoch": 0.4586894586894587, + "grad_norm": 0.5755292177200317, + "learning_rate": 0.0001936678180466745, + "loss": 1.2241, + "step": 2576 + }, + { + "epoch": 0.45886752136752135, + "grad_norm": 0.4501610994338989, + "learning_rate": 0.00019366291533530952, + "loss": 1.0503, + "step": 2577 + }, + { + "epoch": 0.45904558404558404, + "grad_norm": 0.4067458212375641, + "learning_rate": 0.00019365801078881432, + "loss": 0.8259, + "step": 2578 + }, + { + "epoch": 0.45922364672364674, + "grad_norm": 0.539730429649353, + "learning_rate": 0.0001936531044072849, + "loss": 1.1964, + "step": 2579 + }, + { + "epoch": 0.4594017094017094, + "grad_norm": 0.5624797344207764, + "learning_rate": 0.0001936481961908175, + "loss": 1.2059, + "step": 2580 + }, + { + "epoch": 0.4595797720797721, + "grad_norm": 0.43679240345954895, + "learning_rate": 0.00019364328613950824, + "loss": 1.1371, + "step": 2581 + }, + { + "epoch": 0.45975783475783477, + "grad_norm": 0.5214769244194031, + "learning_rate": 0.00019363837425345328, + "loss": 1.109, + "step": 2582 + }, + { + "epoch": 0.4599358974358974, + "grad_norm": 0.4522894024848938, + "learning_rate": 0.00019363346053274892, + "loss": 1.0532, + "step": 2583 + }, + { + 
"epoch": 0.4601139601139601, + "grad_norm": 0.44980281591415405, + "learning_rate": 0.0001936285449774914, + "loss": 0.9352, + "step": 2584 + }, + { + "epoch": 0.4602920227920228, + "grad_norm": 0.5697414875030518, + "learning_rate": 0.00019362362758777705, + "loss": 1.2171, + "step": 2585 + }, + { + "epoch": 0.46047008547008544, + "grad_norm": 0.4636315107345581, + "learning_rate": 0.00019361870836370217, + "loss": 1.0662, + "step": 2586 + }, + { + "epoch": 0.46064814814814814, + "grad_norm": 0.5144017338752747, + "learning_rate": 0.00019361378730536321, + "loss": 1.0681, + "step": 2587 + }, + { + "epoch": 0.46082621082621084, + "grad_norm": 0.5007636547088623, + "learning_rate": 0.00019360886441285654, + "loss": 1.2058, + "step": 2588 + }, + { + "epoch": 0.46100427350427353, + "grad_norm": 0.5024117231369019, + "learning_rate": 0.00019360393968627864, + "loss": 1.065, + "step": 2589 + }, + { + "epoch": 0.46118233618233617, + "grad_norm": 0.48105588555336, + "learning_rate": 0.00019359901312572596, + "loss": 1.0887, + "step": 2590 + }, + { + "epoch": 0.46136039886039887, + "grad_norm": 0.5381982326507568, + "learning_rate": 0.00019359408473129506, + "loss": 1.2754, + "step": 2591 + }, + { + "epoch": 0.46153846153846156, + "grad_norm": 0.5051333904266357, + "learning_rate": 0.0001935891545030825, + "loss": 0.9334, + "step": 2592 + }, + { + "epoch": 0.4617165242165242, + "grad_norm": 0.43818601965904236, + "learning_rate": 0.0001935842224411849, + "loss": 1.0967, + "step": 2593 + }, + { + "epoch": 0.4618945868945869, + "grad_norm": 0.4727257490158081, + "learning_rate": 0.0001935792885456988, + "loss": 0.8136, + "step": 2594 + }, + { + "epoch": 0.4620726495726496, + "grad_norm": 0.5505291223526001, + "learning_rate": 0.00019357435281672098, + "loss": 1.3113, + "step": 2595 + }, + { + "epoch": 0.46225071225071224, + "grad_norm": 0.4705682396888733, + "learning_rate": 0.0001935694152543481, + "loss": 0.9863, + "step": 2596 + }, + { + "epoch": 0.46242877492877493, + 
"grad_norm": 0.49653419852256775, + "learning_rate": 0.0001935644758586769, + "loss": 1.035, + "step": 2597 + }, + { + "epoch": 0.46260683760683763, + "grad_norm": 0.4788367748260498, + "learning_rate": 0.00019355953462980415, + "loss": 1.1253, + "step": 2598 + }, + { + "epoch": 0.46278490028490027, + "grad_norm": 0.5295125842094421, + "learning_rate": 0.00019355459156782668, + "loss": 1.0853, + "step": 2599 + }, + { + "epoch": 0.46296296296296297, + "grad_norm": 0.4878056049346924, + "learning_rate": 0.00019354964667284133, + "loss": 1.1381, + "step": 2600 + }, + { + "epoch": 0.46314102564102566, + "grad_norm": 0.5442031025886536, + "learning_rate": 0.00019354469994494497, + "loss": 1.1349, + "step": 2601 + }, + { + "epoch": 0.4633190883190883, + "grad_norm": 0.4845225214958191, + "learning_rate": 0.00019353975138423457, + "loss": 1.0538, + "step": 2602 + }, + { + "epoch": 0.463497150997151, + "grad_norm": 0.4957871437072754, + "learning_rate": 0.00019353480099080703, + "loss": 1.2765, + "step": 2603 + }, + { + "epoch": 0.4636752136752137, + "grad_norm": 0.5414339303970337, + "learning_rate": 0.00019352984876475936, + "loss": 1.1015, + "step": 2604 + }, + { + "epoch": 0.46385327635327633, + "grad_norm": 0.5171043872833252, + "learning_rate": 0.0001935248947061886, + "loss": 0.9995, + "step": 2605 + }, + { + "epoch": 0.46403133903133903, + "grad_norm": 0.46040529012680054, + "learning_rate": 0.0001935199388151918, + "loss": 1.1126, + "step": 2606 + }, + { + "epoch": 0.4642094017094017, + "grad_norm": 0.5327033400535583, + "learning_rate": 0.00019351498109186613, + "loss": 1.1983, + "step": 2607 + }, + { + "epoch": 0.46438746438746437, + "grad_norm": 0.4451361298561096, + "learning_rate": 0.0001935100215363086, + "loss": 0.9689, + "step": 2608 + }, + { + "epoch": 0.46456552706552706, + "grad_norm": 0.5462809801101685, + "learning_rate": 0.00019350506014861646, + "loss": 1.036, + "step": 2609 + }, + { + "epoch": 0.46474358974358976, + "grad_norm": 0.4907000958919525, 
+ "learning_rate": 0.00019350009692888694, + "loss": 1.0724, + "step": 2610 + }, + { + "epoch": 0.4649216524216524, + "grad_norm": 0.47523510456085205, + "learning_rate": 0.00019349513187721723, + "loss": 0.9214, + "step": 2611 + }, + { + "epoch": 0.4650997150997151, + "grad_norm": 0.539732813835144, + "learning_rate": 0.0001934901649937046, + "loss": 1.1166, + "step": 2612 + }, + { + "epoch": 0.4652777777777778, + "grad_norm": 0.4827860891819, + "learning_rate": 0.00019348519627844643, + "loss": 1.1613, + "step": 2613 + }, + { + "epoch": 0.46545584045584043, + "grad_norm": 0.5385223031044006, + "learning_rate": 0.00019348022573154, + "loss": 1.0105, + "step": 2614 + }, + { + "epoch": 0.4656339031339031, + "grad_norm": 0.4629383087158203, + "learning_rate": 0.0001934752533530828, + "loss": 1.0298, + "step": 2615 + }, + { + "epoch": 0.4658119658119658, + "grad_norm": 0.599371075630188, + "learning_rate": 0.00019347027914317212, + "loss": 1.3158, + "step": 2616 + }, + { + "epoch": 0.46599002849002846, + "grad_norm": 0.5954698324203491, + "learning_rate": 0.00019346530310190553, + "loss": 1.1882, + "step": 2617 + }, + { + "epoch": 0.46616809116809116, + "grad_norm": 0.49185171723365784, + "learning_rate": 0.00019346032522938046, + "loss": 1.0977, + "step": 2618 + }, + { + "epoch": 0.46634615384615385, + "grad_norm": 0.5145422220230103, + "learning_rate": 0.0001934553455256945, + "loss": 0.9948, + "step": 2619 + }, + { + "epoch": 0.46652421652421655, + "grad_norm": 0.6809412837028503, + "learning_rate": 0.00019345036399094517, + "loss": 1.5798, + "step": 2620 + }, + { + "epoch": 0.4667022792022792, + "grad_norm": 0.4606841206550598, + "learning_rate": 0.00019344538062523005, + "loss": 0.7357, + "step": 2621 + }, + { + "epoch": 0.4668803418803419, + "grad_norm": 0.49036628007888794, + "learning_rate": 0.00019344039542864685, + "loss": 1.1518, + "step": 2622 + }, + { + "epoch": 0.4670584045584046, + "grad_norm": 0.47904539108276367, + "learning_rate": 
0.0001934354084012932, + "loss": 0.9929, + "step": 2623 + }, + { + "epoch": 0.4672364672364672, + "grad_norm": 0.5224666595458984, + "learning_rate": 0.0001934304195432668, + "loss": 1.2544, + "step": 2624 + }, + { + "epoch": 0.4674145299145299, + "grad_norm": 0.4902483820915222, + "learning_rate": 0.00019342542885466543, + "loss": 1.0301, + "step": 2625 + }, + { + "epoch": 0.4675925925925926, + "grad_norm": 0.46824702620506287, + "learning_rate": 0.00019342043633558683, + "loss": 0.9364, + "step": 2626 + }, + { + "epoch": 0.46777065527065526, + "grad_norm": 0.46272051334381104, + "learning_rate": 0.00019341544198612888, + "loss": 1.056, + "step": 2627 + }, + { + "epoch": 0.46794871794871795, + "grad_norm": 0.6216606497764587, + "learning_rate": 0.0001934104458063894, + "loss": 1.0825, + "step": 2628 + }, + { + "epoch": 0.46812678062678065, + "grad_norm": 0.5024014115333557, + "learning_rate": 0.00019340544779646623, + "loss": 1.1832, + "step": 2629 + }, + { + "epoch": 0.4683048433048433, + "grad_norm": 0.5547130107879639, + "learning_rate": 0.00019340044795645737, + "loss": 1.1335, + "step": 2630 + }, + { + "epoch": 0.468482905982906, + "grad_norm": 0.5439161658287048, + "learning_rate": 0.0001933954462864608, + "loss": 1.0229, + "step": 2631 + }, + { + "epoch": 0.4686609686609687, + "grad_norm": 0.4782990515232086, + "learning_rate": 0.0001933904427865744, + "loss": 1.2318, + "step": 2632 + }, + { + "epoch": 0.4688390313390313, + "grad_norm": 0.5872140526771545, + "learning_rate": 0.00019338543745689633, + "loss": 1.0132, + "step": 2633 + }, + { + "epoch": 0.469017094017094, + "grad_norm": 0.44163307547569275, + "learning_rate": 0.00019338043029752458, + "loss": 1.0091, + "step": 2634 + }, + { + "epoch": 0.4691951566951567, + "grad_norm": 0.541081428527832, + "learning_rate": 0.0001933754213085573, + "loss": 1.2155, + "step": 2635 + }, + { + "epoch": 0.46937321937321935, + "grad_norm": 0.4761527478694916, + "learning_rate": 0.00019337041049009255, + "loss": 
1.1138, + "step": 2636 + }, + { + "epoch": 0.46955128205128205, + "grad_norm": 0.46414369344711304, + "learning_rate": 0.0001933653978422286, + "loss": 0.9903, + "step": 2637 + }, + { + "epoch": 0.46972934472934474, + "grad_norm": 0.5337086915969849, + "learning_rate": 0.00019336038336506363, + "loss": 1.2873, + "step": 2638 + }, + { + "epoch": 0.4699074074074074, + "grad_norm": 0.5065379738807678, + "learning_rate": 0.00019335536705869592, + "loss": 1.1436, + "step": 2639 + }, + { + "epoch": 0.4700854700854701, + "grad_norm": 0.5539217591285706, + "learning_rate": 0.0001933503489232237, + "loss": 1.2881, + "step": 2640 + }, + { + "epoch": 0.4702635327635328, + "grad_norm": 0.48303213715553284, + "learning_rate": 0.0001933453289587453, + "loss": 1.0209, + "step": 2641 + }, + { + "epoch": 0.4704415954415954, + "grad_norm": 0.6986871957778931, + "learning_rate": 0.00019334030716535908, + "loss": 1.1979, + "step": 2642 + }, + { + "epoch": 0.4706196581196581, + "grad_norm": 0.46137234568595886, + "learning_rate": 0.00019333528354316347, + "loss": 1.0682, + "step": 2643 + }, + { + "epoch": 0.4707977207977208, + "grad_norm": 0.4726654291152954, + "learning_rate": 0.00019333025809225684, + "loss": 1.1712, + "step": 2644 + }, + { + "epoch": 0.47097578347578345, + "grad_norm": 0.46188637614250183, + "learning_rate": 0.0001933252308127377, + "loss": 1.0183, + "step": 2645 + }, + { + "epoch": 0.47115384615384615, + "grad_norm": 0.5323259830474854, + "learning_rate": 0.0001933202017047045, + "loss": 0.935, + "step": 2646 + }, + { + "epoch": 0.47133190883190884, + "grad_norm": 0.5004189014434814, + "learning_rate": 0.00019331517076825582, + "loss": 1.1331, + "step": 2647 + }, + { + "epoch": 0.47150997150997154, + "grad_norm": 0.5443634986877441, + "learning_rate": 0.0001933101380034902, + "loss": 1.0514, + "step": 2648 + }, + { + "epoch": 0.4716880341880342, + "grad_norm": 0.504180371761322, + "learning_rate": 0.0001933051034105063, + "loss": 1.3099, + "step": 2649 + }, + { + 
"epoch": 0.4718660968660969, + "grad_norm": 0.5092344284057617, + "learning_rate": 0.0001933000669894027, + "loss": 1.0716, + "step": 2650 + }, + { + "epoch": 0.47204415954415957, + "grad_norm": 0.5236422419548035, + "learning_rate": 0.0001932950287402781, + "loss": 1.0981, + "step": 2651 + }, + { + "epoch": 0.4722222222222222, + "grad_norm": 0.6228063702583313, + "learning_rate": 0.0001932899886632312, + "loss": 1.3398, + "step": 2652 + }, + { + "epoch": 0.4724002849002849, + "grad_norm": 0.5112748146057129, + "learning_rate": 0.00019328494675836078, + "loss": 1.0151, + "step": 2653 + }, + { + "epoch": 0.4725783475783476, + "grad_norm": 0.5554201602935791, + "learning_rate": 0.00019327990302576563, + "loss": 1.404, + "step": 2654 + }, + { + "epoch": 0.47275641025641024, + "grad_norm": 0.5050725340843201, + "learning_rate": 0.0001932748574655445, + "loss": 0.951, + "step": 2655 + }, + { + "epoch": 0.47293447293447294, + "grad_norm": 0.5161749720573425, + "learning_rate": 0.00019326981007779636, + "loss": 1.2425, + "step": 2656 + }, + { + "epoch": 0.47311253561253563, + "grad_norm": 0.4865442216396332, + "learning_rate": 0.00019326476086262002, + "loss": 1.1175, + "step": 2657 + }, + { + "epoch": 0.4732905982905983, + "grad_norm": 0.5276186466217041, + "learning_rate": 0.0001932597098201144, + "loss": 1.3687, + "step": 2658 + }, + { + "epoch": 0.47346866096866097, + "grad_norm": 0.509139358997345, + "learning_rate": 0.00019325465695037855, + "loss": 1.0546, + "step": 2659 + }, + { + "epoch": 0.47364672364672367, + "grad_norm": 0.49815434217453003, + "learning_rate": 0.00019324960225351138, + "loss": 1.0807, + "step": 2660 + }, + { + "epoch": 0.4738247863247863, + "grad_norm": 0.5059618353843689, + "learning_rate": 0.00019324454572961197, + "loss": 1.0827, + "step": 2661 + }, + { + "epoch": 0.474002849002849, + "grad_norm": 0.5698565244674683, + "learning_rate": 0.00019323948737877942, + "loss": 1.2019, + "step": 2662 + }, + { + "epoch": 0.4741809116809117, + 
"grad_norm": 0.49661511182785034, + "learning_rate": 0.00019323442720111276, + "loss": 1.1447, + "step": 2663 + }, + { + "epoch": 0.47435897435897434, + "grad_norm": 0.46442747116088867, + "learning_rate": 0.0001932293651967112, + "loss": 0.8796, + "step": 2664 + }, + { + "epoch": 0.47453703703703703, + "grad_norm": 0.48306044936180115, + "learning_rate": 0.00019322430136567388, + "loss": 1.1358, + "step": 2665 + }, + { + "epoch": 0.47471509971509973, + "grad_norm": 0.5677350759506226, + "learning_rate": 0.00019321923570810005, + "loss": 1.1026, + "step": 2666 + }, + { + "epoch": 0.47489316239316237, + "grad_norm": 0.3700144588947296, + "learning_rate": 0.0001932141682240889, + "loss": 0.7514, + "step": 2667 + }, + { + "epoch": 0.47507122507122507, + "grad_norm": 0.6003054976463318, + "learning_rate": 0.0001932090989137398, + "loss": 1.1591, + "step": 2668 + }, + { + "epoch": 0.47524928774928776, + "grad_norm": 0.520298421382904, + "learning_rate": 0.00019320402777715204, + "loss": 1.339, + "step": 2669 + }, + { + "epoch": 0.4754273504273504, + "grad_norm": 0.46453598141670227, + "learning_rate": 0.00019319895481442493, + "loss": 0.9879, + "step": 2670 + }, + { + "epoch": 0.4756054131054131, + "grad_norm": 0.5247363448143005, + "learning_rate": 0.00019319388002565793, + "loss": 0.9862, + "step": 2671 + }, + { + "epoch": 0.4757834757834758, + "grad_norm": 0.5498613715171814, + "learning_rate": 0.00019318880341095046, + "loss": 1.2224, + "step": 2672 + }, + { + "epoch": 0.47596153846153844, + "grad_norm": 0.565838098526001, + "learning_rate": 0.00019318372497040192, + "loss": 1.0712, + "step": 2673 + }, + { + "epoch": 0.47613960113960113, + "grad_norm": 0.5797489881515503, + "learning_rate": 0.00019317864470411191, + "loss": 1.0176, + "step": 2674 + }, + { + "epoch": 0.47631766381766383, + "grad_norm": 0.5114326477050781, + "learning_rate": 0.0001931735626121799, + "loss": 1.1027, + "step": 2675 + }, + { + "epoch": 0.47649572649572647, + "grad_norm": 
0.5396515727043152, + "learning_rate": 0.00019316847869470547, + "loss": 1.1782, + "step": 2676 + }, + { + "epoch": 0.47667378917378916, + "grad_norm": 0.4812076985836029, + "learning_rate": 0.00019316339295178824, + "loss": 1.1196, + "step": 2677 + }, + { + "epoch": 0.47685185185185186, + "grad_norm": 0.4875647723674774, + "learning_rate": 0.00019315830538352787, + "loss": 1.1407, + "step": 2678 + }, + { + "epoch": 0.47702991452991456, + "grad_norm": 0.5036377906799316, + "learning_rate": 0.00019315321599002404, + "loss": 0.9842, + "step": 2679 + }, + { + "epoch": 0.4772079772079772, + "grad_norm": 0.5054177641868591, + "learning_rate": 0.00019314812477137645, + "loss": 0.8196, + "step": 2680 + }, + { + "epoch": 0.4773860398860399, + "grad_norm": 0.5050665736198425, + "learning_rate": 0.00019314303172768483, + "loss": 0.8463, + "step": 2681 + }, + { + "epoch": 0.4775641025641026, + "grad_norm": 0.5179004669189453, + "learning_rate": 0.000193137936859049, + "loss": 1.2485, + "step": 2682 + }, + { + "epoch": 0.47774216524216523, + "grad_norm": 0.44986143708229065, + "learning_rate": 0.00019313284016556876, + "loss": 0.9855, + "step": 2683 + }, + { + "epoch": 0.4779202279202279, + "grad_norm": 0.5594347715377808, + "learning_rate": 0.00019312774164734398, + "loss": 1.0987, + "step": 2684 + }, + { + "epoch": 0.4780982905982906, + "grad_norm": 0.4837244749069214, + "learning_rate": 0.0001931226413044746, + "loss": 1.1119, + "step": 2685 + }, + { + "epoch": 0.47827635327635326, + "grad_norm": 0.489145427942276, + "learning_rate": 0.0001931175391370605, + "loss": 1.1962, + "step": 2686 + }, + { + "epoch": 0.47845441595441596, + "grad_norm": 0.503568708896637, + "learning_rate": 0.00019311243514520164, + "loss": 0.9668, + "step": 2687 + }, + { + "epoch": 0.47863247863247865, + "grad_norm": 0.5401005744934082, + "learning_rate": 0.00019310732932899805, + "loss": 1.3072, + "step": 2688 + }, + { + "epoch": 0.4788105413105413, + "grad_norm": 0.526523768901825, + 
"learning_rate": 0.00019310222168854971, + "loss": 1.1387, + "step": 2689 + }, + { + "epoch": 0.478988603988604, + "grad_norm": 0.5223183631896973, + "learning_rate": 0.00019309711222395678, + "loss": 1.1391, + "step": 2690 + }, + { + "epoch": 0.4791666666666667, + "grad_norm": 0.5840879082679749, + "learning_rate": 0.00019309200093531933, + "loss": 1.1543, + "step": 2691 + }, + { + "epoch": 0.4793447293447293, + "grad_norm": 0.5173699259757996, + "learning_rate": 0.00019308688782273753, + "loss": 1.1889, + "step": 2692 + }, + { + "epoch": 0.479522792022792, + "grad_norm": 0.5417894124984741, + "learning_rate": 0.00019308177288631146, + "loss": 1.299, + "step": 2693 + }, + { + "epoch": 0.4797008547008547, + "grad_norm": 0.4890797734260559, + "learning_rate": 0.0001930766561261415, + "loss": 1.1516, + "step": 2694 + }, + { + "epoch": 0.47987891737891736, + "grad_norm": 0.5422119498252869, + "learning_rate": 0.00019307153754232772, + "loss": 1.0301, + "step": 2695 + }, + { + "epoch": 0.48005698005698005, + "grad_norm": 0.5838702917098999, + "learning_rate": 0.00019306641713497057, + "loss": 1.265, + "step": 2696 + }, + { + "epoch": 0.48023504273504275, + "grad_norm": 0.5020943284034729, + "learning_rate": 0.00019306129490417027, + "loss": 1.1119, + "step": 2697 + }, + { + "epoch": 0.4804131054131054, + "grad_norm": 0.412993460893631, + "learning_rate": 0.00019305617085002723, + "loss": 0.8083, + "step": 2698 + }, + { + "epoch": 0.4805911680911681, + "grad_norm": 0.6270101070404053, + "learning_rate": 0.00019305104497264184, + "loss": 1.3355, + "step": 2699 + }, + { + "epoch": 0.4807692307692308, + "grad_norm": 0.45256730914115906, + "learning_rate": 0.0001930459172721145, + "loss": 1.0368, + "step": 2700 + }, + { + "epoch": 0.4809472934472934, + "grad_norm": 0.5351749658584595, + "learning_rate": 0.0001930407877485457, + "loss": 1.135, + "step": 2701 + }, + { + "epoch": 0.4811253561253561, + "grad_norm": 0.49324163794517517, + "learning_rate": 0.00019303565640203593, 
+ "loss": 0.9383, + "step": 2702 + }, + { + "epoch": 0.4813034188034188, + "grad_norm": 0.5434361100196838, + "learning_rate": 0.00019303052323268576, + "loss": 1.2605, + "step": 2703 + }, + { + "epoch": 0.48148148148148145, + "grad_norm": 0.5858064889907837, + "learning_rate": 0.00019302538824059572, + "loss": 1.0846, + "step": 2704 + }, + { + "epoch": 0.48165954415954415, + "grad_norm": 0.5753700733184814, + "learning_rate": 0.00019302025142586647, + "loss": 1.0371, + "step": 2705 + }, + { + "epoch": 0.48183760683760685, + "grad_norm": 0.43102699518203735, + "learning_rate": 0.00019301511278859858, + "loss": 0.9189, + "step": 2706 + }, + { + "epoch": 0.48201566951566954, + "grad_norm": 0.4731025993824005, + "learning_rate": 0.0001930099723288928, + "loss": 1.1291, + "step": 2707 + }, + { + "epoch": 0.4821937321937322, + "grad_norm": 0.5685615539550781, + "learning_rate": 0.00019300483004684987, + "loss": 1.1006, + "step": 2708 + }, + { + "epoch": 0.4823717948717949, + "grad_norm": 0.4368155896663666, + "learning_rate": 0.00019299968594257044, + "loss": 0.9959, + "step": 2709 + }, + { + "epoch": 0.4825498575498576, + "grad_norm": 0.5594738125801086, + "learning_rate": 0.00019299454001615537, + "loss": 1.0826, + "step": 2710 + }, + { + "epoch": 0.4827279202279202, + "grad_norm": 0.48876598477363586, + "learning_rate": 0.00019298939226770548, + "loss": 1.1556, + "step": 2711 + }, + { + "epoch": 0.4829059829059829, + "grad_norm": 0.548039436340332, + "learning_rate": 0.00019298424269732157, + "loss": 1.158, + "step": 2712 + }, + { + "epoch": 0.4830840455840456, + "grad_norm": 0.4957645535469055, + "learning_rate": 0.00019297909130510464, + "loss": 0.9824, + "step": 2713 + }, + { + "epoch": 0.48326210826210825, + "grad_norm": 0.5197011232376099, + "learning_rate": 0.00019297393809115555, + "loss": 1.1074, + "step": 2714 + }, + { + "epoch": 0.48344017094017094, + "grad_norm": 0.5742064118385315, + "learning_rate": 0.00019296878305557526, + "loss": 1.0431, + "step": 
2715 + }, + { + "epoch": 0.48361823361823364, + "grad_norm": 0.5698413252830505, + "learning_rate": 0.0001929636261984648, + "loss": 1.0713, + "step": 2716 + }, + { + "epoch": 0.4837962962962963, + "grad_norm": 0.48126333951950073, + "learning_rate": 0.0001929584675199252, + "loss": 0.9274, + "step": 2717 + }, + { + "epoch": 0.483974358974359, + "grad_norm": 0.49299830198287964, + "learning_rate": 0.00019295330702005754, + "loss": 0.9392, + "step": 2718 + }, + { + "epoch": 0.48415242165242167, + "grad_norm": 0.4780774414539337, + "learning_rate": 0.0001929481446989629, + "loss": 1.1459, + "step": 2719 + }, + { + "epoch": 0.4843304843304843, + "grad_norm": 0.5462654829025269, + "learning_rate": 0.00019294298055674248, + "loss": 1.0635, + "step": 2720 + }, + { + "epoch": 0.484508547008547, + "grad_norm": 0.5371061563491821, + "learning_rate": 0.00019293781459349743, + "loss": 1.3578, + "step": 2721 + }, + { + "epoch": 0.4846866096866097, + "grad_norm": 0.46308520436286926, + "learning_rate": 0.00019293264680932893, + "loss": 0.9001, + "step": 2722 + }, + { + "epoch": 0.48486467236467234, + "grad_norm": 0.5149807929992676, + "learning_rate": 0.0001929274772043383, + "loss": 0.6908, + "step": 2723 + }, + { + "epoch": 0.48504273504273504, + "grad_norm": 0.5435031056404114, + "learning_rate": 0.00019292230577862678, + "loss": 1.2143, + "step": 2724 + }, + { + "epoch": 0.48522079772079774, + "grad_norm": 0.44217726588249207, + "learning_rate": 0.00019291713253229568, + "loss": 0.9303, + "step": 2725 + }, + { + "epoch": 0.4853988603988604, + "grad_norm": 0.6120226383209229, + "learning_rate": 0.00019291195746544643, + "loss": 1.3801, + "step": 2726 + }, + { + "epoch": 0.4855769230769231, + "grad_norm": 0.5014316439628601, + "learning_rate": 0.00019290678057818037, + "loss": 1.0631, + "step": 2727 + }, + { + "epoch": 0.48575498575498577, + "grad_norm": 0.5667829513549805, + "learning_rate": 0.00019290160187059895, + "loss": 1.3166, + "step": 2728 + }, + { + "epoch": 
0.4859330484330484, + "grad_norm": 0.5011509656906128, + "learning_rate": 0.0001928964213428036, + "loss": 1.1887, + "step": 2729 + }, + { + "epoch": 0.4861111111111111, + "grad_norm": 0.48317405581474304, + "learning_rate": 0.00019289123899489586, + "loss": 1.1125, + "step": 2730 + }, + { + "epoch": 0.4862891737891738, + "grad_norm": 0.4669005870819092, + "learning_rate": 0.00019288605482697726, + "loss": 1.0091, + "step": 2731 + }, + { + "epoch": 0.48646723646723644, + "grad_norm": 0.4330739974975586, + "learning_rate": 0.00019288086883914937, + "loss": 0.9789, + "step": 2732 + }, + { + "epoch": 0.48664529914529914, + "grad_norm": 0.48482781648635864, + "learning_rate": 0.0001928756810315138, + "loss": 1.1922, + "step": 2733 + }, + { + "epoch": 0.48682336182336183, + "grad_norm": 0.5781838297843933, + "learning_rate": 0.0001928704914041722, + "loss": 1.1793, + "step": 2734 + }, + { + "epoch": 0.48700142450142453, + "grad_norm": 0.5955413579940796, + "learning_rate": 0.00019286529995722623, + "loss": 1.1001, + "step": 2735 + }, + { + "epoch": 0.48717948717948717, + "grad_norm": 0.49204322695732117, + "learning_rate": 0.00019286010669077763, + "loss": 0.9219, + "step": 2736 + }, + { + "epoch": 0.48735754985754987, + "grad_norm": 0.5853500962257385, + "learning_rate": 0.00019285491160492813, + "loss": 1.1133, + "step": 2737 + }, + { + "epoch": 0.48753561253561256, + "grad_norm": 0.5555846095085144, + "learning_rate": 0.0001928497146997795, + "loss": 1.0915, + "step": 2738 + }, + { + "epoch": 0.4877136752136752, + "grad_norm": 0.5166759490966797, + "learning_rate": 0.00019284451597543364, + "loss": 0.9349, + "step": 2739 + }, + { + "epoch": 0.4878917378917379, + "grad_norm": 0.47816506028175354, + "learning_rate": 0.00019283931543199234, + "loss": 0.8978, + "step": 2740 + }, + { + "epoch": 0.4880698005698006, + "grad_norm": 0.5632442831993103, + "learning_rate": 0.0001928341130695575, + "loss": 1.0491, + "step": 2741 + }, + { + "epoch": 0.48824786324786323, + 
"grad_norm": 0.6532769799232483, + "learning_rate": 0.00019282890888823107, + "loss": 1.2779, + "step": 2742 + }, + { + "epoch": 0.48842592592592593, + "grad_norm": 0.5733640789985657, + "learning_rate": 0.000192823702888115, + "loss": 1.4127, + "step": 2743 + }, + { + "epoch": 0.4886039886039886, + "grad_norm": 0.5701746344566345, + "learning_rate": 0.00019281849506931132, + "loss": 1.138, + "step": 2744 + }, + { + "epoch": 0.48878205128205127, + "grad_norm": 0.5227449536323547, + "learning_rate": 0.000192813285431922, + "loss": 1.1831, + "step": 2745 + }, + { + "epoch": 0.48896011396011396, + "grad_norm": 0.48457080125808716, + "learning_rate": 0.00019280807397604915, + "loss": 1.2468, + "step": 2746 + }, + { + "epoch": 0.48913817663817666, + "grad_norm": 0.4596176743507385, + "learning_rate": 0.0001928028607017949, + "loss": 1.1098, + "step": 2747 + }, + { + "epoch": 0.4893162393162393, + "grad_norm": 0.5204966068267822, + "learning_rate": 0.00019279764560926142, + "loss": 1.1501, + "step": 2748 + }, + { + "epoch": 0.489494301994302, + "grad_norm": 0.5179490447044373, + "learning_rate": 0.0001927924286985508, + "loss": 1.2601, + "step": 2749 + }, + { + "epoch": 0.4896723646723647, + "grad_norm": 0.4563423693180084, + "learning_rate": 0.00019278720996976533, + "loss": 1.081, + "step": 2750 + }, + { + "epoch": 0.48985042735042733, + "grad_norm": 0.4906339943408966, + "learning_rate": 0.00019278198942300717, + "loss": 1.157, + "step": 2751 + }, + { + "epoch": 0.49002849002849, + "grad_norm": 0.42241403460502625, + "learning_rate": 0.00019277676705837873, + "loss": 1.0333, + "step": 2752 + }, + { + "epoch": 0.4902065527065527, + "grad_norm": 0.6310175657272339, + "learning_rate": 0.00019277154287598226, + "loss": 1.1225, + "step": 2753 + }, + { + "epoch": 0.49038461538461536, + "grad_norm": 0.5109034776687622, + "learning_rate": 0.0001927663168759201, + "loss": 1.1619, + "step": 2754 + }, + { + "epoch": 0.49056267806267806, + "grad_norm": 0.4809598922729492, + 
"learning_rate": 0.00019276108905829465, + "loss": 1.0423, + "step": 2755 + }, + { + "epoch": 0.49074074074074076, + "grad_norm": 0.557502806186676, + "learning_rate": 0.00019275585942320837, + "loss": 0.8783, + "step": 2756 + }, + { + "epoch": 0.4909188034188034, + "grad_norm": 0.5434393882751465, + "learning_rate": 0.0001927506279707637, + "loss": 1.1701, + "step": 2757 + }, + { + "epoch": 0.4910968660968661, + "grad_norm": 0.49278944730758667, + "learning_rate": 0.00019274539470106317, + "loss": 1.0447, + "step": 2758 + }, + { + "epoch": 0.4912749287749288, + "grad_norm": 0.5634264349937439, + "learning_rate": 0.00019274015961420927, + "loss": 1.0639, + "step": 2759 + }, + { + "epoch": 0.49145299145299143, + "grad_norm": 0.5632645487785339, + "learning_rate": 0.00019273492271030464, + "loss": 0.9223, + "step": 2760 + }, + { + "epoch": 0.4916310541310541, + "grad_norm": 0.5949172377586365, + "learning_rate": 0.00019272968398945177, + "loss": 0.894, + "step": 2761 + }, + { + "epoch": 0.4918091168091168, + "grad_norm": 0.5375374555587769, + "learning_rate": 0.00019272444345175342, + "loss": 1.0311, + "step": 2762 + }, + { + "epoch": 0.49198717948717946, + "grad_norm": 0.5211305022239685, + "learning_rate": 0.00019271920109731222, + "loss": 1.1531, + "step": 2763 + }, + { + "epoch": 0.49216524216524216, + "grad_norm": 0.44022253155708313, + "learning_rate": 0.00019271395692623084, + "loss": 0.9147, + "step": 2764 + }, + { + "epoch": 0.49234330484330485, + "grad_norm": 0.4682174623012543, + "learning_rate": 0.0001927087109386121, + "loss": 1.081, + "step": 2765 + }, + { + "epoch": 0.49252136752136755, + "grad_norm": 0.4971517324447632, + "learning_rate": 0.0001927034631345588, + "loss": 1.1017, + "step": 2766 + }, + { + "epoch": 0.4926994301994302, + "grad_norm": 0.5015294551849365, + "learning_rate": 0.00019269821351417364, + "loss": 1.1093, + "step": 2767 + }, + { + "epoch": 0.4928774928774929, + "grad_norm": 0.5512694716453552, + "learning_rate": 
0.00019269296207755958, + "loss": 0.9657, + "step": 2768 + }, + { + "epoch": 0.4930555555555556, + "grad_norm": 0.4914868474006653, + "learning_rate": 0.00019268770882481948, + "loss": 1.0379, + "step": 2769 + }, + { + "epoch": 0.4932336182336182, + "grad_norm": 0.567337691783905, + "learning_rate": 0.00019268245375605626, + "loss": 1.004, + "step": 2770 + }, + { + "epoch": 0.4934116809116809, + "grad_norm": 0.518489420413971, + "learning_rate": 0.0001926771968713729, + "loss": 1.0734, + "step": 2771 + }, + { + "epoch": 0.4935897435897436, + "grad_norm": 0.567742109298706, + "learning_rate": 0.00019267193817087237, + "loss": 1.1276, + "step": 2772 + }, + { + "epoch": 0.49376780626780625, + "grad_norm": 0.5287964344024658, + "learning_rate": 0.00019266667765465773, + "loss": 1.1429, + "step": 2773 + }, + { + "epoch": 0.49394586894586895, + "grad_norm": 0.5302085876464844, + "learning_rate": 0.00019266141532283207, + "loss": 1.0934, + "step": 2774 + }, + { + "epoch": 0.49412393162393164, + "grad_norm": 0.5569987297058105, + "learning_rate": 0.00019265615117549842, + "loss": 1.1453, + "step": 2775 + }, + { + "epoch": 0.4943019943019943, + "grad_norm": 0.519695520401001, + "learning_rate": 0.00019265088521275997, + "loss": 1.1255, + "step": 2776 + }, + { + "epoch": 0.494480056980057, + "grad_norm": 0.5073211193084717, + "learning_rate": 0.0001926456174347199, + "loss": 1.0609, + "step": 2777 + }, + { + "epoch": 0.4946581196581197, + "grad_norm": 0.45028239488601685, + "learning_rate": 0.00019264034784148142, + "loss": 0.9098, + "step": 2778 + }, + { + "epoch": 0.4948361823361823, + "grad_norm": 0.6641215682029724, + "learning_rate": 0.00019263507643314776, + "loss": 0.8903, + "step": 2779 + }, + { + "epoch": 0.495014245014245, + "grad_norm": 0.5281413793563843, + "learning_rate": 0.00019262980320982224, + "loss": 1.2906, + "step": 2780 + }, + { + "epoch": 0.4951923076923077, + "grad_norm": 0.6256437301635742, + "learning_rate": 0.0001926245281716081, + "loss": 1.4142, 
+ "step": 2781 + }, + { + "epoch": 0.49537037037037035, + "grad_norm": 0.5422517657279968, + "learning_rate": 0.00019261925131860877, + "loss": 1.1606, + "step": 2782 + }, + { + "epoch": 0.49554843304843305, + "grad_norm": 0.46938949823379517, + "learning_rate": 0.0001926139726509276, + "loss": 1.0333, + "step": 2783 + }, + { + "epoch": 0.49572649572649574, + "grad_norm": 0.5799683928489685, + "learning_rate": 0.000192608692168668, + "loss": 1.0333, + "step": 2784 + }, + { + "epoch": 0.4959045584045584, + "grad_norm": 0.5231602787971497, + "learning_rate": 0.0001926034098719335, + "loss": 1.1847, + "step": 2785 + }, + { + "epoch": 0.4960826210826211, + "grad_norm": 0.477845698595047, + "learning_rate": 0.00019259812576082752, + "loss": 1.0746, + "step": 2786 + }, + { + "epoch": 0.4962606837606838, + "grad_norm": 0.5490350723266602, + "learning_rate": 0.00019259283983545365, + "loss": 1.2462, + "step": 2787 + }, + { + "epoch": 0.4964387464387464, + "grad_norm": 0.5788847208023071, + "learning_rate": 0.0001925875520959154, + "loss": 1.3485, + "step": 2788 + }, + { + "epoch": 0.4966168091168091, + "grad_norm": 0.46184736490249634, + "learning_rate": 0.00019258226254231643, + "loss": 0.8673, + "step": 2789 + }, + { + "epoch": 0.4967948717948718, + "grad_norm": 0.4890633225440979, + "learning_rate": 0.0001925769711747603, + "loss": 0.9474, + "step": 2790 + }, + { + "epoch": 0.49697293447293445, + "grad_norm": 0.5719282627105713, + "learning_rate": 0.00019257167799335078, + "loss": 1.2532, + "step": 2791 + }, + { + "epoch": 0.49715099715099714, + "grad_norm": 0.5385584235191345, + "learning_rate": 0.0001925663829981915, + "loss": 1.1326, + "step": 2792 + }, + { + "epoch": 0.49732905982905984, + "grad_norm": 0.5339545011520386, + "learning_rate": 0.00019256108618938625, + "loss": 1.1362, + "step": 2793 + }, + { + "epoch": 0.49750712250712253, + "grad_norm": 0.5017803907394409, + "learning_rate": 0.00019255578756703878, + "loss": 1.0449, + "step": 2794 + }, + { + "epoch": 
0.4976851851851852, + "grad_norm": 0.6004226803779602, + "learning_rate": 0.00019255048713125294, + "loss": 0.9346, + "step": 2795 + }, + { + "epoch": 0.49786324786324787, + "grad_norm": 0.44581490755081177, + "learning_rate": 0.00019254518488213255, + "loss": 1.038, + "step": 2796 + }, + { + "epoch": 0.49804131054131057, + "grad_norm": 0.5180951356887817, + "learning_rate": 0.00019253988081978151, + "loss": 1.0479, + "step": 2797 + }, + { + "epoch": 0.4982193732193732, + "grad_norm": 0.53944993019104, + "learning_rate": 0.00019253457494430376, + "loss": 1.2598, + "step": 2798 + }, + { + "epoch": 0.4983974358974359, + "grad_norm": 0.5633010268211365, + "learning_rate": 0.00019252926725580322, + "loss": 1.205, + "step": 2799 + }, + { + "epoch": 0.4985754985754986, + "grad_norm": 0.6653175950050354, + "learning_rate": 0.0001925239577543839, + "loss": 1.2383, + "step": 2800 + }, + { + "epoch": 0.49875356125356124, + "grad_norm": 0.5083333849906921, + "learning_rate": 0.00019251864644014984, + "loss": 1.0649, + "step": 2801 + }, + { + "epoch": 0.49893162393162394, + "grad_norm": 0.4842020571231842, + "learning_rate": 0.00019251333331320506, + "loss": 1.1991, + "step": 2802 + }, + { + "epoch": 0.49910968660968663, + "grad_norm": 0.47987112402915955, + "learning_rate": 0.00019250801837365373, + "loss": 1.1686, + "step": 2803 + }, + { + "epoch": 0.49928774928774927, + "grad_norm": 0.5316333770751953, + "learning_rate": 0.00019250270162159992, + "loss": 1.1759, + "step": 2804 + }, + { + "epoch": 0.49946581196581197, + "grad_norm": 0.5015079379081726, + "learning_rate": 0.00019249738305714787, + "loss": 0.9424, + "step": 2805 + }, + { + "epoch": 0.49964387464387466, + "grad_norm": 0.6488274931907654, + "learning_rate": 0.00019249206268040172, + "loss": 1.066, + "step": 2806 + }, + { + "epoch": 0.4998219373219373, + "grad_norm": 0.40364864468574524, + "learning_rate": 0.00019248674049146574, + "loss": 0.6998, + "step": 2807 + }, + { + "epoch": 0.5, + "grad_norm": 
0.5535672903060913, + "learning_rate": 0.00019248141649044423, + "loss": 1.2207, + "step": 2808 + }, + { + "epoch": 0.5, + "eval_loss": 1.1072274446487427, + "eval_runtime": 28.6913, + "eval_samples_per_second": 36.283, + "eval_steps_per_second": 18.159, + "step": 2808 + }, + { + "epoch": 0.5001780626780626, + "grad_norm": 0.4834389090538025, + "learning_rate": 0.00019247609067744143, + "loss": 1.1686, + "step": 2809 + }, + { + "epoch": 0.5003561253561254, + "grad_norm": 0.5007249712944031, + "learning_rate": 0.00019247076305256176, + "loss": 1.1343, + "step": 2810 + }, + { + "epoch": 0.500534188034188, + "grad_norm": 0.4773348271846771, + "learning_rate": 0.00019246543361590957, + "loss": 0.9324, + "step": 2811 + }, + { + "epoch": 0.5007122507122507, + "grad_norm": 0.47324609756469727, + "learning_rate": 0.0001924601023675893, + "loss": 1.0223, + "step": 2812 + }, + { + "epoch": 0.5008903133903134, + "grad_norm": 0.5583845973014832, + "learning_rate": 0.00019245476930770537, + "loss": 1.1328, + "step": 2813 + }, + { + "epoch": 0.5010683760683761, + "grad_norm": 0.4814579486846924, + "learning_rate": 0.00019244943443636232, + "loss": 1.0528, + "step": 2814 + }, + { + "epoch": 0.5012464387464387, + "grad_norm": 0.4996104836463928, + "learning_rate": 0.00019244409775366465, + "loss": 1.2482, + "step": 2815 + }, + { + "epoch": 0.5014245014245015, + "grad_norm": 0.47870904207229614, + "learning_rate": 0.0001924387592597169, + "loss": 0.9452, + "step": 2816 + }, + { + "epoch": 0.5016025641025641, + "grad_norm": 0.5617441534996033, + "learning_rate": 0.0001924334189546237, + "loss": 1.378, + "step": 2817 + }, + { + "epoch": 0.5017806267806267, + "grad_norm": 0.4872083365917206, + "learning_rate": 0.00019242807683848967, + "loss": 1.1571, + "step": 2818 + }, + { + "epoch": 0.5019586894586895, + "grad_norm": 0.5147804021835327, + "learning_rate": 0.00019242273291141947, + "loss": 1.1086, + "step": 2819 + }, + { + "epoch": 0.5021367521367521, + "grad_norm": 
0.4698995351791382, + "learning_rate": 0.00019241738717351784, + "loss": 1.1579, + "step": 2820 + }, + { + "epoch": 0.5023148148148148, + "grad_norm": 0.5158926844596863, + "learning_rate": 0.00019241203962488946, + "loss": 1.2763, + "step": 2821 + }, + { + "epoch": 0.5024928774928775, + "grad_norm": 0.5218976736068726, + "learning_rate": 0.00019240669026563914, + "loss": 1.0633, + "step": 2822 + }, + { + "epoch": 0.5026709401709402, + "grad_norm": 0.5511452555656433, + "learning_rate": 0.0001924013390958717, + "loss": 0.9939, + "step": 2823 + }, + { + "epoch": 0.5028490028490028, + "grad_norm": 0.5227555632591248, + "learning_rate": 0.00019239598611569191, + "loss": 1.2478, + "step": 2824 + }, + { + "epoch": 0.5030270655270656, + "grad_norm": 0.5444719791412354, + "learning_rate": 0.00019239063132520475, + "loss": 1.1574, + "step": 2825 + }, + { + "epoch": 0.5032051282051282, + "grad_norm": 0.4752781093120575, + "learning_rate": 0.0001923852747245151, + "loss": 0.9034, + "step": 2826 + }, + { + "epoch": 0.5033831908831908, + "grad_norm": 0.5286496877670288, + "learning_rate": 0.00019237991631372792, + "loss": 1.1391, + "step": 2827 + }, + { + "epoch": 0.5035612535612536, + "grad_norm": 0.5009933710098267, + "learning_rate": 0.00019237455609294815, + "loss": 1.2178, + "step": 2828 + }, + { + "epoch": 0.5037393162393162, + "grad_norm": 0.5012276768684387, + "learning_rate": 0.00019236919406228085, + "loss": 0.9877, + "step": 2829 + }, + { + "epoch": 0.5039173789173789, + "grad_norm": 0.576508104801178, + "learning_rate": 0.00019236383022183106, + "loss": 1.1299, + "step": 2830 + }, + { + "epoch": 0.5040954415954416, + "grad_norm": 0.4716590642929077, + "learning_rate": 0.0001923584645717039, + "loss": 1.0451, + "step": 2831 + }, + { + "epoch": 0.5042735042735043, + "grad_norm": 0.5817418098449707, + "learning_rate": 0.00019235309711200448, + "loss": 1.0911, + "step": 2832 + }, + { + "epoch": 0.5044515669515669, + "grad_norm": 0.5695745944976807, + "learning_rate": 
0.000192347727842838, + "loss": 1.0229, + "step": 2833 + }, + { + "epoch": 0.5046296296296297, + "grad_norm": 0.49127066135406494, + "learning_rate": 0.00019234235676430958, + "loss": 1.1377, + "step": 2834 + }, + { + "epoch": 0.5048076923076923, + "grad_norm": 0.5426172614097595, + "learning_rate": 0.00019233698387652453, + "loss": 1.2427, + "step": 2835 + }, + { + "epoch": 0.5049857549857549, + "grad_norm": 0.5342385172843933, + "learning_rate": 0.0001923316091795881, + "loss": 1.1427, + "step": 2836 + }, + { + "epoch": 0.5051638176638177, + "grad_norm": 0.5480486750602722, + "learning_rate": 0.00019232623267360558, + "loss": 1.0647, + "step": 2837 + }, + { + "epoch": 0.5053418803418803, + "grad_norm": 0.4584530293941498, + "learning_rate": 0.00019232085435868235, + "loss": 1.0461, + "step": 2838 + }, + { + "epoch": 0.5055199430199431, + "grad_norm": 0.5992119908332825, + "learning_rate": 0.00019231547423492371, + "loss": 1.1456, + "step": 2839 + }, + { + "epoch": 0.5056980056980057, + "grad_norm": 0.514018177986145, + "learning_rate": 0.00019231009230243515, + "loss": 1.2559, + "step": 2840 + }, + { + "epoch": 0.5058760683760684, + "grad_norm": 0.5392283797264099, + "learning_rate": 0.0001923047085613221, + "loss": 1.044, + "step": 2841 + }, + { + "epoch": 0.5060541310541311, + "grad_norm": 0.4486566483974457, + "learning_rate": 0.00019229932301169, + "loss": 1.0679, + "step": 2842 + }, + { + "epoch": 0.5062321937321937, + "grad_norm": 0.4523460566997528, + "learning_rate": 0.00019229393565364442, + "loss": 1.1651, + "step": 2843 + }, + { + "epoch": 0.5064102564102564, + "grad_norm": 0.6032688021659851, + "learning_rate": 0.0001922885464872909, + "loss": 1.15, + "step": 2844 + }, + { + "epoch": 0.5065883190883191, + "grad_norm": 0.5883688926696777, + "learning_rate": 0.000192283155512735, + "loss": 1.2179, + "step": 2845 + }, + { + "epoch": 0.5067663817663818, + "grad_norm": 0.5534378886222839, + "learning_rate": 0.00019227776273008238, + "loss": 1.0387, + 
"step": 2846 + }, + { + "epoch": 0.5069444444444444, + "grad_norm": 0.5899033546447754, + "learning_rate": 0.00019227236813943872, + "loss": 1.0812, + "step": 2847 + }, + { + "epoch": 0.5071225071225072, + "grad_norm": 0.5718855261802673, + "learning_rate": 0.00019226697174090965, + "loss": 1.1375, + "step": 2848 + }, + { + "epoch": 0.5073005698005698, + "grad_norm": 0.5080967545509338, + "learning_rate": 0.00019226157353460094, + "loss": 1.1421, + "step": 2849 + }, + { + "epoch": 0.5074786324786325, + "grad_norm": 0.5253677368164062, + "learning_rate": 0.0001922561735206184, + "loss": 1.0166, + "step": 2850 + }, + { + "epoch": 0.5076566951566952, + "grad_norm": 0.47797444462776184, + "learning_rate": 0.00019225077169906772, + "loss": 1.0504, + "step": 2851 + }, + { + "epoch": 0.5078347578347578, + "grad_norm": 0.4911690652370453, + "learning_rate": 0.0001922453680700548, + "loss": 1.0629, + "step": 2852 + }, + { + "epoch": 0.5080128205128205, + "grad_norm": 0.49678200483322144, + "learning_rate": 0.00019223996263368557, + "loss": 1.1672, + "step": 2853 + }, + { + "epoch": 0.5081908831908832, + "grad_norm": 0.5451810359954834, + "learning_rate": 0.00019223455539006586, + "loss": 1.3031, + "step": 2854 + }, + { + "epoch": 0.5083689458689459, + "grad_norm": 0.5708984136581421, + "learning_rate": 0.00019222914633930166, + "loss": 1.0986, + "step": 2855 + }, + { + "epoch": 0.5085470085470085, + "grad_norm": 0.47232356667518616, + "learning_rate": 0.00019222373548149888, + "loss": 1.0449, + "step": 2856 + }, + { + "epoch": 0.5087250712250713, + "grad_norm": 0.6027610898017883, + "learning_rate": 0.0001922183228167636, + "loss": 0.862, + "step": 2857 + }, + { + "epoch": 0.5089031339031339, + "grad_norm": 0.5211802124977112, + "learning_rate": 0.00019221290834520188, + "loss": 1.1048, + "step": 2858 + }, + { + "epoch": 0.5090811965811965, + "grad_norm": 0.45101237297058105, + "learning_rate": 0.00019220749206691972, + "loss": 1.0046, + "step": 2859 + }, + { + "epoch": 
0.5092592592592593, + "grad_norm": 0.5526158213615417, + "learning_rate": 0.00019220207398202335, + "loss": 1.2275, + "step": 2860 + }, + { + "epoch": 0.5094373219373219, + "grad_norm": 0.48322010040283203, + "learning_rate": 0.00019219665409061885, + "loss": 0.9974, + "step": 2861 + }, + { + "epoch": 0.5096153846153846, + "grad_norm": 0.4775219261646271, + "learning_rate": 0.00019219123239281244, + "loss": 1.1852, + "step": 2862 + }, + { + "epoch": 0.5097934472934473, + "grad_norm": 0.46184200048446655, + "learning_rate": 0.00019218580888871034, + "loss": 0.9393, + "step": 2863 + }, + { + "epoch": 0.50997150997151, + "grad_norm": 0.47495174407958984, + "learning_rate": 0.00019218038357841883, + "loss": 0.9631, + "step": 2864 + }, + { + "epoch": 0.5101495726495726, + "grad_norm": 0.48600029945373535, + "learning_rate": 0.00019217495646204418, + "loss": 1.0498, + "step": 2865 + }, + { + "epoch": 0.5103276353276354, + "grad_norm": 0.5801547169685364, + "learning_rate": 0.00019216952753969274, + "loss": 1.2181, + "step": 2866 + }, + { + "epoch": 0.510505698005698, + "grad_norm": 0.5082106590270996, + "learning_rate": 0.00019216409681147085, + "loss": 1.2009, + "step": 2867 + }, + { + "epoch": 0.5106837606837606, + "grad_norm": 0.4184330701828003, + "learning_rate": 0.00019215866427748493, + "loss": 0.8462, + "step": 2868 + }, + { + "epoch": 0.5108618233618234, + "grad_norm": 0.518099844455719, + "learning_rate": 0.00019215322993784147, + "loss": 1.2091, + "step": 2869 + }, + { + "epoch": 0.511039886039886, + "grad_norm": 0.569464921951294, + "learning_rate": 0.0001921477937926469, + "loss": 1.0264, + "step": 2870 + }, + { + "epoch": 0.5112179487179487, + "grad_norm": 0.526767909526825, + "learning_rate": 0.00019214235584200768, + "loss": 1.1192, + "step": 2871 + }, + { + "epoch": 0.5113960113960114, + "grad_norm": 0.6511057019233704, + "learning_rate": 0.00019213691608603047, + "loss": 1.3193, + "step": 2872 + }, + { + "epoch": 0.5115740740740741, + "grad_norm": 
0.48536401987075806, + "learning_rate": 0.00019213147452482173, + "loss": 1.1671, + "step": 2873 + }, + { + "epoch": 0.5117521367521367, + "grad_norm": 0.7972469329833984, + "learning_rate": 0.00019212603115848818, + "loss": 1.1393, + "step": 2874 + }, + { + "epoch": 0.5119301994301995, + "grad_norm": 0.5543264746665955, + "learning_rate": 0.00019212058598713642, + "loss": 1.1436, + "step": 2875 + }, + { + "epoch": 0.5121082621082621, + "grad_norm": 0.49688720703125, + "learning_rate": 0.0001921151390108731, + "loss": 1.0897, + "step": 2876 + }, + { + "epoch": 0.5122863247863247, + "grad_norm": 0.4928736090660095, + "learning_rate": 0.000192109690229805, + "loss": 1.2426, + "step": 2877 + }, + { + "epoch": 0.5124643874643875, + "grad_norm": 0.4917896091938019, + "learning_rate": 0.0001921042396440389, + "loss": 1.0047, + "step": 2878 + }, + { + "epoch": 0.5126424501424501, + "grad_norm": 0.5485204458236694, + "learning_rate": 0.00019209878725368152, + "loss": 1.2615, + "step": 2879 + }, + { + "epoch": 0.5128205128205128, + "grad_norm": 0.5229470133781433, + "learning_rate": 0.0001920933330588397, + "loss": 1.3249, + "step": 2880 + }, + { + "epoch": 0.5129985754985755, + "grad_norm": 0.4783077538013458, + "learning_rate": 0.00019208787705962037, + "loss": 1.2004, + "step": 2881 + }, + { + "epoch": 0.5131766381766382, + "grad_norm": 0.5106910467147827, + "learning_rate": 0.00019208241925613035, + "loss": 1.1745, + "step": 2882 + }, + { + "epoch": 0.5133547008547008, + "grad_norm": 0.5308730006217957, + "learning_rate": 0.00019207695964847666, + "loss": 0.9706, + "step": 2883 + }, + { + "epoch": 0.5135327635327636, + "grad_norm": 0.5489775538444519, + "learning_rate": 0.00019207149823676617, + "loss": 1.0073, + "step": 2884 + }, + { + "epoch": 0.5137108262108262, + "grad_norm": 0.4992835521697998, + "learning_rate": 0.00019206603502110596, + "loss": 1.1053, + "step": 2885 + }, + { + "epoch": 0.5138888888888888, + "grad_norm": 0.5304922461509705, + "learning_rate": 
0.00019206057000160302, + "loss": 1.0565, + "step": 2886 + }, + { + "epoch": 0.5140669515669516, + "grad_norm": 0.46411609649658203, + "learning_rate": 0.00019205510317836448, + "loss": 0.9202, + "step": 2887 + }, + { + "epoch": 0.5142450142450142, + "grad_norm": 0.5236835479736328, + "learning_rate": 0.0001920496345514974, + "loss": 0.9075, + "step": 2888 + }, + { + "epoch": 0.5144230769230769, + "grad_norm": 0.4416964054107666, + "learning_rate": 0.00019204416412110895, + "loss": 0.9225, + "step": 2889 + }, + { + "epoch": 0.5146011396011396, + "grad_norm": 0.5470940470695496, + "learning_rate": 0.00019203869188730633, + "loss": 1.2195, + "step": 2890 + }, + { + "epoch": 0.5147792022792023, + "grad_norm": 0.5380414128303528, + "learning_rate": 0.0001920332178501967, + "loss": 1.0731, + "step": 2891 + }, + { + "epoch": 0.5149572649572649, + "grad_norm": 0.4405716359615326, + "learning_rate": 0.00019202774200988737, + "loss": 0.8739, + "step": 2892 + }, + { + "epoch": 0.5151353276353277, + "grad_norm": 0.5222984552383423, + "learning_rate": 0.0001920222643664856, + "loss": 1.1806, + "step": 2893 + }, + { + "epoch": 0.5153133903133903, + "grad_norm": 0.48545539379119873, + "learning_rate": 0.0001920167849200987, + "loss": 0.9939, + "step": 2894 + }, + { + "epoch": 0.5154914529914529, + "grad_norm": 0.45078009366989136, + "learning_rate": 0.0001920113036708341, + "loss": 1.0085, + "step": 2895 + }, + { + "epoch": 0.5156695156695157, + "grad_norm": 0.5029830932617188, + "learning_rate": 0.00019200582061879913, + "loss": 1.1095, + "step": 2896 + }, + { + "epoch": 0.5158475783475783, + "grad_norm": 0.5316143035888672, + "learning_rate": 0.00019200033576410118, + "loss": 0.9883, + "step": 2897 + }, + { + "epoch": 0.5160256410256411, + "grad_norm": 0.5282100439071655, + "learning_rate": 0.0001919948491068478, + "loss": 1.1441, + "step": 2898 + }, + { + "epoch": 0.5162037037037037, + "grad_norm": 0.5145367980003357, + "learning_rate": 0.00019198936064714647, + "loss": 
1.1999, + "step": 2899 + }, + { + "epoch": 0.5163817663817664, + "grad_norm": 0.5385651588439941, + "learning_rate": 0.00019198387038510468, + "loss": 1.1831, + "step": 2900 + }, + { + "epoch": 0.5165598290598291, + "grad_norm": 0.4971916377544403, + "learning_rate": 0.00019197837832083002, + "loss": 1.2518, + "step": 2901 + }, + { + "epoch": 0.5167378917378918, + "grad_norm": 0.5253807306289673, + "learning_rate": 0.00019197288445443016, + "loss": 1.0788, + "step": 2902 + }, + { + "epoch": 0.5169159544159544, + "grad_norm": 0.49724945425987244, + "learning_rate": 0.00019196738878601263, + "loss": 1.0985, + "step": 2903 + }, + { + "epoch": 0.5170940170940171, + "grad_norm": 0.5327325463294983, + "learning_rate": 0.0001919618913156852, + "loss": 1.2862, + "step": 2904 + }, + { + "epoch": 0.5172720797720798, + "grad_norm": 0.639999270439148, + "learning_rate": 0.00019195639204355554, + "loss": 1.2052, + "step": 2905 + }, + { + "epoch": 0.5174501424501424, + "grad_norm": 0.4630785584449768, + "learning_rate": 0.0001919508909697314, + "loss": 1.1157, + "step": 2906 + }, + { + "epoch": 0.5176282051282052, + "grad_norm": 0.513949990272522, + "learning_rate": 0.00019194538809432055, + "loss": 1.0047, + "step": 2907 + }, + { + "epoch": 0.5178062678062678, + "grad_norm": 0.488034725189209, + "learning_rate": 0.0001919398834174308, + "loss": 0.9008, + "step": 2908 + }, + { + "epoch": 0.5179843304843305, + "grad_norm": 0.4892788529396057, + "learning_rate": 0.00019193437693917006, + "loss": 1.1024, + "step": 2909 + }, + { + "epoch": 0.5181623931623932, + "grad_norm": 0.5503842830657959, + "learning_rate": 0.00019192886865964618, + "loss": 1.2283, + "step": 2910 + }, + { + "epoch": 0.5183404558404558, + "grad_norm": 0.48885393142700195, + "learning_rate": 0.00019192335857896707, + "loss": 0.9522, + "step": 2911 + }, + { + "epoch": 0.5185185185185185, + "grad_norm": 0.5479527115821838, + "learning_rate": 0.00019191784669724072, + "loss": 1.1616, + "step": 2912 + }, + { + 
"epoch": 0.5186965811965812, + "grad_norm": 0.42701148986816406, + "learning_rate": 0.00019191233301457506, + "loss": 0.8434, + "step": 2913 + }, + { + "epoch": 0.5188746438746439, + "grad_norm": 0.4273422658443451, + "learning_rate": 0.00019190681753107822, + "loss": 0.8316, + "step": 2914 + }, + { + "epoch": 0.5190527065527065, + "grad_norm": 0.5047736763954163, + "learning_rate": 0.00019190130024685818, + "loss": 1.171, + "step": 2915 + }, + { + "epoch": 0.5192307692307693, + "grad_norm": 0.5221177935600281, + "learning_rate": 0.00019189578116202307, + "loss": 1.0256, + "step": 2916 + }, + { + "epoch": 0.5194088319088319, + "grad_norm": 0.4782322943210602, + "learning_rate": 0.00019189026027668105, + "loss": 0.8598, + "step": 2917 + }, + { + "epoch": 0.5195868945868946, + "grad_norm": 0.5627185702323914, + "learning_rate": 0.00019188473759094022, + "loss": 1.1825, + "step": 2918 + }, + { + "epoch": 0.5197649572649573, + "grad_norm": 0.5036423206329346, + "learning_rate": 0.00019187921310490888, + "loss": 1.0881, + "step": 2919 + }, + { + "epoch": 0.51994301994302, + "grad_norm": 0.4271143972873688, + "learning_rate": 0.0001918736868186952, + "loss": 0.9265, + "step": 2920 + }, + { + "epoch": 0.5201210826210826, + "grad_norm": 0.5427432656288147, + "learning_rate": 0.00019186815873240747, + "loss": 1.196, + "step": 2921 + }, + { + "epoch": 0.5202991452991453, + "grad_norm": 0.5494198203086853, + "learning_rate": 0.00019186262884615402, + "loss": 1.1207, + "step": 2922 + }, + { + "epoch": 0.520477207977208, + "grad_norm": 0.5305119752883911, + "learning_rate": 0.0001918570971600432, + "loss": 1.0393, + "step": 2923 + }, + { + "epoch": 0.5206552706552706, + "grad_norm": 0.46713170409202576, + "learning_rate": 0.00019185156367418333, + "loss": 0.9583, + "step": 2924 + }, + { + "epoch": 0.5208333333333334, + "grad_norm": 0.597776472568512, + "learning_rate": 0.00019184602838868292, + "loss": 1.2978, + "step": 2925 + }, + { + "epoch": 0.521011396011396, + "grad_norm": 
0.520976722240448, + "learning_rate": 0.00019184049130365036, + "loss": 1.0515, + "step": 2926 + }, + { + "epoch": 0.5211894586894587, + "grad_norm": 0.5266290307044983, + "learning_rate": 0.00019183495241919415, + "loss": 1.0437, + "step": 2927 + }, + { + "epoch": 0.5213675213675214, + "grad_norm": 0.50911545753479, + "learning_rate": 0.00019182941173542285, + "loss": 0.9977, + "step": 2928 + }, + { + "epoch": 0.521545584045584, + "grad_norm": 0.4924670457839966, + "learning_rate": 0.00019182386925244496, + "loss": 0.9309, + "step": 2929 + }, + { + "epoch": 0.5217236467236467, + "grad_norm": 0.4979301393032074, + "learning_rate": 0.00019181832497036912, + "loss": 0.87, + "step": 2930 + }, + { + "epoch": 0.5219017094017094, + "grad_norm": 0.6307916045188904, + "learning_rate": 0.0001918127788893039, + "loss": 1.2159, + "step": 2931 + }, + { + "epoch": 0.5220797720797721, + "grad_norm": 0.4915660619735718, + "learning_rate": 0.00019180723100935802, + "loss": 1.0828, + "step": 2932 + }, + { + "epoch": 0.5222578347578347, + "grad_norm": 0.4312742352485657, + "learning_rate": 0.00019180168133064017, + "loss": 1.0496, + "step": 2933 + }, + { + "epoch": 0.5224358974358975, + "grad_norm": 0.6006124019622803, + "learning_rate": 0.00019179612985325908, + "loss": 1.0751, + "step": 2934 + }, + { + "epoch": 0.5226139601139601, + "grad_norm": 0.5332220196723938, + "learning_rate": 0.0001917905765773235, + "loss": 1.2601, + "step": 2935 + }, + { + "epoch": 0.5227920227920227, + "grad_norm": 0.4877954423427582, + "learning_rate": 0.00019178502150294223, + "loss": 1.2279, + "step": 2936 + }, + { + "epoch": 0.5229700854700855, + "grad_norm": 0.5975968837738037, + "learning_rate": 0.00019177946463022418, + "loss": 1.3371, + "step": 2937 + }, + { + "epoch": 0.5231481481481481, + "grad_norm": 0.5363923907279968, + "learning_rate": 0.00019177390595927815, + "loss": 1.0705, + "step": 2938 + }, + { + "epoch": 0.5233262108262108, + "grad_norm": 0.4314909875392914, + "learning_rate": 
0.0001917683454902131, + "loss": 0.9172, + "step": 2939 + }, + { + "epoch": 0.5235042735042735, + "grad_norm": 0.46187883615493774, + "learning_rate": 0.0001917627832231379, + "loss": 1.1201, + "step": 2940 + }, + { + "epoch": 0.5236823361823362, + "grad_norm": 0.4648260772228241, + "learning_rate": 0.00019175721915816162, + "loss": 1.1307, + "step": 2941 + }, + { + "epoch": 0.5238603988603988, + "grad_norm": 0.4427165687084198, + "learning_rate": 0.00019175165329539325, + "loss": 0.9459, + "step": 2942 + }, + { + "epoch": 0.5240384615384616, + "grad_norm": 0.4645056128501892, + "learning_rate": 0.0001917460856349418, + "loss": 0.9176, + "step": 2943 + }, + { + "epoch": 0.5242165242165242, + "grad_norm": 0.4939568042755127, + "learning_rate": 0.0001917405161769164, + "loss": 1.1056, + "step": 2944 + }, + { + "epoch": 0.5243945868945868, + "grad_norm": 0.6057310104370117, + "learning_rate": 0.00019173494492142617, + "loss": 1.2714, + "step": 2945 + }, + { + "epoch": 0.5245726495726496, + "grad_norm": 0.5038546323776245, + "learning_rate": 0.00019172937186858025, + "loss": 0.911, + "step": 2946 + }, + { + "epoch": 0.5247507122507122, + "grad_norm": 0.5521321296691895, + "learning_rate": 0.00019172379701848784, + "loss": 1.0781, + "step": 2947 + }, + { + "epoch": 0.5249287749287749, + "grad_norm": 0.516979455947876, + "learning_rate": 0.00019171822037125817, + "loss": 1.1051, + "step": 2948 + }, + { + "epoch": 0.5251068376068376, + "grad_norm": 0.5443150997161865, + "learning_rate": 0.0001917126419270005, + "loss": 1.0802, + "step": 2949 + }, + { + "epoch": 0.5252849002849003, + "grad_norm": 0.5373311042785645, + "learning_rate": 0.00019170706168582412, + "loss": 0.9313, + "step": 2950 + }, + { + "epoch": 0.5254629629629629, + "grad_norm": 0.7511917948722839, + "learning_rate": 0.0001917014796478384, + "loss": 1.1958, + "step": 2951 + }, + { + "epoch": 0.5256410256410257, + "grad_norm": 0.49893468618392944, + "learning_rate": 0.00019169589581315263, + "loss": 0.9387, 
+ "step": 2952 + }, + { + "epoch": 0.5258190883190883, + "grad_norm": 0.48010289669036865, + "learning_rate": 0.00019169031018187628, + "loss": 1.2459, + "step": 2953 + }, + { + "epoch": 0.5259971509971509, + "grad_norm": 0.48768678307533264, + "learning_rate": 0.0001916847227541188, + "loss": 1.0127, + "step": 2954 + }, + { + "epoch": 0.5261752136752137, + "grad_norm": 0.5973068475723267, + "learning_rate": 0.00019167913352998963, + "loss": 1.1685, + "step": 2955 + }, + { + "epoch": 0.5263532763532763, + "grad_norm": 0.5567806959152222, + "learning_rate": 0.00019167354250959826, + "loss": 1.142, + "step": 2956 + }, + { + "epoch": 0.5265313390313391, + "grad_norm": 0.47819700837135315, + "learning_rate": 0.00019166794969305428, + "loss": 0.712, + "step": 2957 + }, + { + "epoch": 0.5267094017094017, + "grad_norm": 0.5191744565963745, + "learning_rate": 0.00019166235508046725, + "loss": 1.2208, + "step": 2958 + }, + { + "epoch": 0.5268874643874644, + "grad_norm": 0.4987856149673462, + "learning_rate": 0.00019165675867194675, + "loss": 1.0466, + "step": 2959 + }, + { + "epoch": 0.5270655270655271, + "grad_norm": 0.5017665028572083, + "learning_rate": 0.0001916511604676025, + "loss": 1.1236, + "step": 2960 + }, + { + "epoch": 0.5272435897435898, + "grad_norm": 0.5115348696708679, + "learning_rate": 0.00019164556046754415, + "loss": 1.1497, + "step": 2961 + }, + { + "epoch": 0.5274216524216524, + "grad_norm": 0.4934345781803131, + "learning_rate": 0.0001916399586718814, + "loss": 1.0183, + "step": 2962 + }, + { + "epoch": 0.5275997150997151, + "grad_norm": 0.5033719539642334, + "learning_rate": 0.00019163435508072404, + "loss": 1.0256, + "step": 2963 + }, + { + "epoch": 0.5277777777777778, + "grad_norm": 0.5325372219085693, + "learning_rate": 0.00019162874969418184, + "loss": 1.1384, + "step": 2964 + }, + { + "epoch": 0.5279558404558404, + "grad_norm": 0.4901772141456604, + "learning_rate": 0.00019162314251236465, + "loss": 1.0831, + "step": 2965 + }, + { + "epoch": 
0.5281339031339032, + "grad_norm": 0.4743805229663849, + "learning_rate": 0.0001916175335353823, + "loss": 1.1894, + "step": 2966 + }, + { + "epoch": 0.5283119658119658, + "grad_norm": 0.5439450740814209, + "learning_rate": 0.00019161192276334466, + "loss": 1.2066, + "step": 2967 + }, + { + "epoch": 0.5284900284900285, + "grad_norm": 0.5123090744018555, + "learning_rate": 0.00019160631019636174, + "loss": 1.1829, + "step": 2968 + }, + { + "epoch": 0.5286680911680912, + "grad_norm": 0.5995343923568726, + "learning_rate": 0.00019160069583454346, + "loss": 1.4872, + "step": 2969 + }, + { + "epoch": 0.5288461538461539, + "grad_norm": 0.4596657156944275, + "learning_rate": 0.00019159507967799985, + "loss": 0.8948, + "step": 2970 + }, + { + "epoch": 0.5290242165242165, + "grad_norm": 0.5533682107925415, + "learning_rate": 0.0001915894617268409, + "loss": 1.1779, + "step": 2971 + }, + { + "epoch": 0.5292022792022792, + "grad_norm": 0.3860718309879303, + "learning_rate": 0.00019158384198117673, + "loss": 0.6424, + "step": 2972 + }, + { + "epoch": 0.5293803418803419, + "grad_norm": 0.47424063086509705, + "learning_rate": 0.0001915782204411174, + "loss": 1.1592, + "step": 2973 + }, + { + "epoch": 0.5295584045584045, + "grad_norm": 0.5050228834152222, + "learning_rate": 0.00019157259710677309, + "loss": 1.1971, + "step": 2974 + }, + { + "epoch": 0.5297364672364673, + "grad_norm": 0.6080113649368286, + "learning_rate": 0.00019156697197825396, + "loss": 1.1511, + "step": 2975 + }, + { + "epoch": 0.5299145299145299, + "grad_norm": 0.4805932641029358, + "learning_rate": 0.00019156134505567024, + "loss": 1.1033, + "step": 2976 + }, + { + "epoch": 0.5300925925925926, + "grad_norm": 0.4835345447063446, + "learning_rate": 0.00019155571633913215, + "loss": 1.1832, + "step": 2977 + }, + { + "epoch": 0.5302706552706553, + "grad_norm": 0.5183725953102112, + "learning_rate": 0.00019155008582875, + "loss": 0.9221, + "step": 2978 + }, + { + "epoch": 0.530448717948718, + "grad_norm": 
0.48015761375427246, + "learning_rate": 0.00019154445352463412, + "loss": 1.045, + "step": 2979 + }, + { + "epoch": 0.5306267806267806, + "grad_norm": 0.4670043885707855, + "learning_rate": 0.0001915388194268948, + "loss": 0.9025, + "step": 2980 + }, + { + "epoch": 0.5308048433048433, + "grad_norm": 0.5048824548721313, + "learning_rate": 0.0001915331835356425, + "loss": 1.0681, + "step": 2981 + }, + { + "epoch": 0.530982905982906, + "grad_norm": 0.4785633981227875, + "learning_rate": 0.00019152754585098758, + "loss": 1.0097, + "step": 2982 + }, + { + "epoch": 0.5311609686609686, + "grad_norm": 0.4829573333263397, + "learning_rate": 0.00019152190637304056, + "loss": 1.0856, + "step": 2983 + }, + { + "epoch": 0.5313390313390314, + "grad_norm": 0.5425563454627991, + "learning_rate": 0.00019151626510191189, + "loss": 1.2313, + "step": 2984 + }, + { + "epoch": 0.531517094017094, + "grad_norm": 0.5532251596450806, + "learning_rate": 0.0001915106220377121, + "loss": 1.0328, + "step": 2985 + }, + { + "epoch": 0.5316951566951567, + "grad_norm": 0.47016972303390503, + "learning_rate": 0.0001915049771805518, + "loss": 1.2003, + "step": 2986 + }, + { + "epoch": 0.5318732193732194, + "grad_norm": 0.5241743326187134, + "learning_rate": 0.00019149933053054153, + "loss": 1.046, + "step": 2987 + }, + { + "epoch": 0.532051282051282, + "grad_norm": 0.5043526887893677, + "learning_rate": 0.00019149368208779197, + "loss": 1.0022, + "step": 2988 + }, + { + "epoch": 0.5322293447293447, + "grad_norm": 0.5563312768936157, + "learning_rate": 0.00019148803185241374, + "loss": 1.1017, + "step": 2989 + }, + { + "epoch": 0.5324074074074074, + "grad_norm": 0.5414231419563293, + "learning_rate": 0.00019148237982451763, + "loss": 0.9649, + "step": 2990 + }, + { + "epoch": 0.5325854700854701, + "grad_norm": 0.5452231764793396, + "learning_rate": 0.0001914767260042143, + "loss": 1.2281, + "step": 2991 + }, + { + "epoch": 0.5327635327635327, + "grad_norm": 0.5500698685646057, + "learning_rate": 
0.00019147107039161454, + "loss": 1.2865, + "step": 2992 + }, + { + "epoch": 0.5329415954415955, + "grad_norm": 0.49747416377067566, + "learning_rate": 0.00019146541298682918, + "loss": 1.1296, + "step": 2993 + }, + { + "epoch": 0.5331196581196581, + "grad_norm": 0.5684167742729187, + "learning_rate": 0.00019145975378996903, + "loss": 1.0685, + "step": 2994 + }, + { + "epoch": 0.5332977207977208, + "grad_norm": 0.5411235690116882, + "learning_rate": 0.00019145409280114502, + "loss": 1.1372, + "step": 2995 + }, + { + "epoch": 0.5334757834757835, + "grad_norm": 0.5006675720214844, + "learning_rate": 0.00019144843002046806, + "loss": 1.0688, + "step": 2996 + }, + { + "epoch": 0.5336538461538461, + "grad_norm": 0.4591315686702728, + "learning_rate": 0.00019144276544804908, + "loss": 1.1071, + "step": 2997 + }, + { + "epoch": 0.5338319088319088, + "grad_norm": 0.5615306496620178, + "learning_rate": 0.000191437099083999, + "loss": 1.1033, + "step": 2998 + }, + { + "epoch": 0.5340099715099715, + "grad_norm": 0.4986817240715027, + "learning_rate": 0.00019143143092842897, + "loss": 1.176, + "step": 2999 + }, + { + "epoch": 0.5341880341880342, + "grad_norm": 0.5017120242118835, + "learning_rate": 0.00019142576098144995, + "loss": 1.0174, + "step": 3000 + }, + { + "epoch": 0.5343660968660968, + "grad_norm": 0.508298397064209, + "learning_rate": 0.0001914200892431731, + "loss": 1.164, + "step": 3001 + }, + { + "epoch": 0.5345441595441596, + "grad_norm": 0.48068809509277344, + "learning_rate": 0.0001914144157137095, + "loss": 0.7959, + "step": 3002 + }, + { + "epoch": 0.5347222222222222, + "grad_norm": 0.6347028017044067, + "learning_rate": 0.0001914087403931703, + "loss": 1.1727, + "step": 3003 + }, + { + "epoch": 0.5349002849002849, + "grad_norm": 0.5558401942253113, + "learning_rate": 0.00019140306328166676, + "loss": 1.2282, + "step": 3004 + }, + { + "epoch": 0.5350783475783476, + "grad_norm": 0.5093596577644348, + "learning_rate": 0.00019139738437931004, + "loss": 1.3258, 
+ "step": 3005 + }, + { + "epoch": 0.5352564102564102, + "grad_norm": 0.4653106927871704, + "learning_rate": 0.0001913917036862114, + "loss": 1.1062, + "step": 3006 + }, + { + "epoch": 0.5354344729344729, + "grad_norm": 0.48085781931877136, + "learning_rate": 0.00019138602120248222, + "loss": 0.9019, + "step": 3007 + }, + { + "epoch": 0.5356125356125356, + "grad_norm": 0.5174745321273804, + "learning_rate": 0.0001913803369282338, + "loss": 1.044, + "step": 3008 + }, + { + "epoch": 0.5357905982905983, + "grad_norm": 0.5359669327735901, + "learning_rate": 0.00019137465086357746, + "loss": 1.0723, + "step": 3009 + }, + { + "epoch": 0.5359686609686609, + "grad_norm": 0.5583470463752747, + "learning_rate": 0.00019136896300862467, + "loss": 1.2192, + "step": 3010 + }, + { + "epoch": 0.5361467236467237, + "grad_norm": 0.4905693829059601, + "learning_rate": 0.00019136327336348688, + "loss": 1.2372, + "step": 3011 + }, + { + "epoch": 0.5363247863247863, + "grad_norm": 0.5741264820098877, + "learning_rate": 0.0001913575819282755, + "loss": 1.1703, + "step": 3012 + }, + { + "epoch": 0.5365028490028491, + "grad_norm": 0.577033281326294, + "learning_rate": 0.0001913518887031021, + "loss": 1.1555, + "step": 3013 + }, + { + "epoch": 0.5366809116809117, + "grad_norm": 0.46795153617858887, + "learning_rate": 0.00019134619368807822, + "loss": 0.8583, + "step": 3014 + }, + { + "epoch": 0.5368589743589743, + "grad_norm": 0.5973345637321472, + "learning_rate": 0.0001913404968833154, + "loss": 1.1509, + "step": 3015 + }, + { + "epoch": 0.5370370370370371, + "grad_norm": 0.62020343542099, + "learning_rate": 0.00019133479828892531, + "loss": 1.0781, + "step": 3016 + }, + { + "epoch": 0.5372150997150997, + "grad_norm": 0.5342286229133606, + "learning_rate": 0.00019132909790501958, + "loss": 1.1556, + "step": 3017 + }, + { + "epoch": 0.5373931623931624, + "grad_norm": 0.49612846970558167, + "learning_rate": 0.0001913233957317099, + "loss": 0.9027, + "step": 3018 + }, + { + "epoch": 
0.5375712250712251, + "grad_norm": 0.5403908491134644, + "learning_rate": 0.00019131769176910796, + "loss": 1.1125, + "step": 3019 + }, + { + "epoch": 0.5377492877492878, + "grad_norm": 0.4952050447463989, + "learning_rate": 0.0001913119860173256, + "loss": 1.2329, + "step": 3020 + }, + { + "epoch": 0.5379273504273504, + "grad_norm": 0.5877819657325745, + "learning_rate": 0.0001913062784764745, + "loss": 1.2855, + "step": 3021 + }, + { + "epoch": 0.5381054131054132, + "grad_norm": 0.49312907457351685, + "learning_rate": 0.00019130056914666655, + "loss": 1.0212, + "step": 3022 + }, + { + "epoch": 0.5382834757834758, + "grad_norm": 0.45544490218162537, + "learning_rate": 0.00019129485802801366, + "loss": 0.9748, + "step": 3023 + }, + { + "epoch": 0.5384615384615384, + "grad_norm": 0.5535242557525635, + "learning_rate": 0.00019128914512062762, + "loss": 1.2134, + "step": 3024 + }, + { + "epoch": 0.5386396011396012, + "grad_norm": 0.45369696617126465, + "learning_rate": 0.00019128343042462044, + "loss": 0.9964, + "step": 3025 + }, + { + "epoch": 0.5388176638176638, + "grad_norm": 0.6240725517272949, + "learning_rate": 0.00019127771394010406, + "loss": 1.425, + "step": 3026 + }, + { + "epoch": 0.5389957264957265, + "grad_norm": 0.4859573245048523, + "learning_rate": 0.0001912719956671905, + "loss": 1.087, + "step": 3027 + }, + { + "epoch": 0.5391737891737892, + "grad_norm": 0.47529762983322144, + "learning_rate": 0.0001912662756059918, + "loss": 0.9517, + "step": 3028 + }, + { + "epoch": 0.5393518518518519, + "grad_norm": 0.5317288637161255, + "learning_rate": 0.00019126055375661997, + "loss": 1.0945, + "step": 3029 + }, + { + "epoch": 0.5395299145299145, + "grad_norm": 0.55974280834198, + "learning_rate": 0.00019125483011918722, + "loss": 1.0794, + "step": 3030 + }, + { + "epoch": 0.5397079772079773, + "grad_norm": 0.48579123616218567, + "learning_rate": 0.0001912491046938056, + "loss": 1.1421, + "step": 3031 + }, + { + "epoch": 0.5398860398860399, + "grad_norm": 
0.4917181134223938, + "learning_rate": 0.00019124337748058733, + "loss": 0.9708, + "step": 3032 + }, + { + "epoch": 0.5400641025641025, + "grad_norm": 0.525291383266449, + "learning_rate": 0.00019123764847964466, + "loss": 1.064, + "step": 3033 + }, + { + "epoch": 0.5402421652421653, + "grad_norm": 0.5733301639556885, + "learning_rate": 0.00019123191769108977, + "loss": 1.2142, + "step": 3034 + }, + { + "epoch": 0.5404202279202279, + "grad_norm": 0.5400987863540649, + "learning_rate": 0.00019122618511503494, + "loss": 1.1309, + "step": 3035 + }, + { + "epoch": 0.5405982905982906, + "grad_norm": 0.6261051893234253, + "learning_rate": 0.00019122045075159257, + "loss": 1.2112, + "step": 3036 + }, + { + "epoch": 0.5407763532763533, + "grad_norm": 0.5483576059341431, + "learning_rate": 0.0001912147146008749, + "loss": 1.2705, + "step": 3037 + }, + { + "epoch": 0.540954415954416, + "grad_norm": 0.5442137122154236, + "learning_rate": 0.00019120897666299443, + "loss": 1.2512, + "step": 3038 + }, + { + "epoch": 0.5411324786324786, + "grad_norm": 0.5680811405181885, + "learning_rate": 0.00019120323693806355, + "loss": 1.392, + "step": 3039 + }, + { + "epoch": 0.5413105413105413, + "grad_norm": 0.5237287878990173, + "learning_rate": 0.00019119749542619466, + "loss": 1.1599, + "step": 3040 + }, + { + "epoch": 0.541488603988604, + "grad_norm": 0.48119300603866577, + "learning_rate": 0.00019119175212750032, + "loss": 1.0976, + "step": 3041 + }, + { + "epoch": 0.5416666666666666, + "grad_norm": 0.507033109664917, + "learning_rate": 0.00019118600704209302, + "loss": 1.0181, + "step": 3042 + }, + { + "epoch": 0.5418447293447294, + "grad_norm": 0.484672874212265, + "learning_rate": 0.00019118026017008531, + "loss": 1.1636, + "step": 3043 + }, + { + "epoch": 0.542022792022792, + "grad_norm": 0.4923502206802368, + "learning_rate": 0.00019117451151158985, + "loss": 1.0388, + "step": 3044 + }, + { + "epoch": 0.5422008547008547, + "grad_norm": 0.4882057309150696, + "learning_rate": 
0.00019116876106671922, + "loss": 1.131, + "step": 3045 + }, + { + "epoch": 0.5423789173789174, + "grad_norm": 0.6068355441093445, + "learning_rate": 0.0001911630088355861, + "loss": 1.3218, + "step": 3046 + }, + { + "epoch": 0.54255698005698, + "grad_norm": 0.5012881755828857, + "learning_rate": 0.0001911572548183032, + "loss": 1.0514, + "step": 3047 + }, + { + "epoch": 0.5427350427350427, + "grad_norm": 0.49849793314933777, + "learning_rate": 0.00019115149901498328, + "loss": 1.0003, + "step": 3048 + }, + { + "epoch": 0.5429131054131054, + "grad_norm": 0.4934251010417938, + "learning_rate": 0.00019114574142573904, + "loss": 1.0319, + "step": 3049 + }, + { + "epoch": 0.5430911680911681, + "grad_norm": 0.4947762191295624, + "learning_rate": 0.00019113998205068334, + "loss": 1.0906, + "step": 3050 + }, + { + "epoch": 0.5432692307692307, + "grad_norm": 0.5449416041374207, + "learning_rate": 0.00019113422088992907, + "loss": 0.9093, + "step": 3051 + }, + { + "epoch": 0.5434472934472935, + "grad_norm": 0.49395284056663513, + "learning_rate": 0.00019112845794358902, + "loss": 1.0071, + "step": 3052 + }, + { + "epoch": 0.5436253561253561, + "grad_norm": 0.5478728413581848, + "learning_rate": 0.00019112269321177613, + "loss": 1.2124, + "step": 3053 + }, + { + "epoch": 0.5438034188034188, + "grad_norm": 0.6205173134803772, + "learning_rate": 0.0001911169266946034, + "loss": 1.021, + "step": 3054 + }, + { + "epoch": 0.5439814814814815, + "grad_norm": 0.4777783751487732, + "learning_rate": 0.00019111115839218372, + "loss": 0.9192, + "step": 3055 + }, + { + "epoch": 0.5441595441595442, + "grad_norm": 0.5541689991950989, + "learning_rate": 0.00019110538830463018, + "loss": 1.1248, + "step": 3056 + }, + { + "epoch": 0.5443376068376068, + "grad_norm": 0.4750942289829254, + "learning_rate": 0.0001910996164320558, + "loss": 1.3147, + "step": 3057 + }, + { + "epoch": 0.5445156695156695, + "grad_norm": 0.6283948421478271, + "learning_rate": 0.0001910938427745737, + "loss": 1.0919, + 
"step": 3058 + }, + { + "epoch": 0.5446937321937322, + "grad_norm": 0.552725076675415, + "learning_rate": 0.00019108806733229698, + "loss": 1.3807, + "step": 3059 + }, + { + "epoch": 0.5448717948717948, + "grad_norm": 0.4832848310470581, + "learning_rate": 0.0001910822901053388, + "loss": 1.0705, + "step": 3060 + }, + { + "epoch": 0.5450498575498576, + "grad_norm": 0.6468375325202942, + "learning_rate": 0.00019107651109381233, + "loss": 1.0766, + "step": 3061 + }, + { + "epoch": 0.5452279202279202, + "grad_norm": 0.5464920401573181, + "learning_rate": 0.00019107073029783083, + "loss": 1.0453, + "step": 3062 + }, + { + "epoch": 0.5454059829059829, + "grad_norm": 0.5321210026741028, + "learning_rate": 0.0001910649477175076, + "loss": 1.2326, + "step": 3063 + }, + { + "epoch": 0.5455840455840456, + "grad_norm": 0.5572962164878845, + "learning_rate": 0.00019105916335295582, + "loss": 1.0673, + "step": 3064 + }, + { + "epoch": 0.5457621082621082, + "grad_norm": 0.5239177942276001, + "learning_rate": 0.00019105337720428894, + "loss": 1.04, + "step": 3065 + }, + { + "epoch": 0.5459401709401709, + "grad_norm": 0.5633319616317749, + "learning_rate": 0.00019104758927162023, + "loss": 0.9606, + "step": 3066 + }, + { + "epoch": 0.5461182336182336, + "grad_norm": 0.5317914485931396, + "learning_rate": 0.0001910417995550632, + "loss": 1.0651, + "step": 3067 + }, + { + "epoch": 0.5462962962962963, + "grad_norm": 0.5126453638076782, + "learning_rate": 0.00019103600805473118, + "loss": 1.0316, + "step": 3068 + }, + { + "epoch": 0.5464743589743589, + "grad_norm": 0.5262107253074646, + "learning_rate": 0.00019103021477073773, + "loss": 1.0752, + "step": 3069 + }, + { + "epoch": 0.5466524216524217, + "grad_norm": 0.5384877324104309, + "learning_rate": 0.0001910244197031963, + "loss": 1.1731, + "step": 3070 + }, + { + "epoch": 0.5468304843304843, + "grad_norm": 0.5126553773880005, + "learning_rate": 0.00019101862285222048, + "loss": 1.2229, + "step": 3071 + }, + { + "epoch": 
0.5470085470085471, + "grad_norm": 0.4841194450855255, + "learning_rate": 0.0001910128242179238, + "loss": 0.9955, + "step": 3072 + }, + { + "epoch": 0.5471866096866097, + "grad_norm": 0.526546061038971, + "learning_rate": 0.00019100702380041987, + "loss": 1.2436, + "step": 3073 + }, + { + "epoch": 0.5473646723646723, + "grad_norm": 0.5085833072662354, + "learning_rate": 0.0001910012215998224, + "loss": 1.011, + "step": 3074 + }, + { + "epoch": 0.5475427350427351, + "grad_norm": 0.5149994492530823, + "learning_rate": 0.000190995417616245, + "loss": 0.8632, + "step": 3075 + }, + { + "epoch": 0.5477207977207977, + "grad_norm": 0.48079630732536316, + "learning_rate": 0.00019098961184980145, + "loss": 1.1115, + "step": 3076 + }, + { + "epoch": 0.5478988603988604, + "grad_norm": 0.5769477486610413, + "learning_rate": 0.00019098380430060546, + "loss": 0.9544, + "step": 3077 + }, + { + "epoch": 0.5480769230769231, + "grad_norm": 0.5260093808174133, + "learning_rate": 0.0001909779949687708, + "loss": 1.2354, + "step": 3078 + }, + { + "epoch": 0.5482549857549858, + "grad_norm": 0.5518734455108643, + "learning_rate": 0.00019097218385441135, + "loss": 1.1944, + "step": 3079 + }, + { + "epoch": 0.5484330484330484, + "grad_norm": 0.5436808466911316, + "learning_rate": 0.00019096637095764095, + "loss": 1.0717, + "step": 3080 + }, + { + "epoch": 0.5486111111111112, + "grad_norm": 0.4749584197998047, + "learning_rate": 0.00019096055627857344, + "loss": 1.0417, + "step": 3081 + }, + { + "epoch": 0.5487891737891738, + "grad_norm": 0.5485591292381287, + "learning_rate": 0.0001909547398173228, + "loss": 1.2515, + "step": 3082 + }, + { + "epoch": 0.5489672364672364, + "grad_norm": 0.5751016736030579, + "learning_rate": 0.00019094892157400296, + "loss": 1.2112, + "step": 3083 + }, + { + "epoch": 0.5491452991452992, + "grad_norm": 0.5404475331306458, + "learning_rate": 0.00019094310154872795, + "loss": 0.4334, + "step": 3084 + }, + { + "epoch": 0.5493233618233618, + "grad_norm": 
0.5198020935058594, + "learning_rate": 0.00019093727974161178, + "loss": 0.9759, + "step": 3085 + }, + { + "epoch": 0.5495014245014245, + "grad_norm": 0.4893439710140228, + "learning_rate": 0.0001909314561527685, + "loss": 1.1287, + "step": 3086 + }, + { + "epoch": 0.5496794871794872, + "grad_norm": 0.5675956606864929, + "learning_rate": 0.00019092563078231228, + "loss": 1.234, + "step": 3087 + }, + { + "epoch": 0.5498575498575499, + "grad_norm": 0.5539132356643677, + "learning_rate": 0.00019091980363035714, + "loss": 1.2378, + "step": 3088 + }, + { + "epoch": 0.5500356125356125, + "grad_norm": 0.5194353461265564, + "learning_rate": 0.00019091397469701735, + "loss": 1.1338, + "step": 3089 + }, + { + "epoch": 0.5502136752136753, + "grad_norm": 0.5143756866455078, + "learning_rate": 0.0001909081439824071, + "loss": 0.9118, + "step": 3090 + }, + { + "epoch": 0.5503917378917379, + "grad_norm": 0.5624327659606934, + "learning_rate": 0.0001909023114866406, + "loss": 1.035, + "step": 3091 + }, + { + "epoch": 0.5505698005698005, + "grad_norm": 0.5285067558288574, + "learning_rate": 0.0001908964772098321, + "loss": 1.0451, + "step": 3092 + }, + { + "epoch": 0.5507478632478633, + "grad_norm": 0.5730587244033813, + "learning_rate": 0.000190890641152096, + "loss": 1.0672, + "step": 3093 + }, + { + "epoch": 0.5509259259259259, + "grad_norm": 0.5822951197624207, + "learning_rate": 0.0001908848033135466, + "loss": 1.1791, + "step": 3094 + }, + { + "epoch": 0.5511039886039886, + "grad_norm": 0.596161961555481, + "learning_rate": 0.00019087896369429826, + "loss": 1.0954, + "step": 3095 + }, + { + "epoch": 0.5512820512820513, + "grad_norm": 0.5138190984725952, + "learning_rate": 0.00019087312229446542, + "loss": 0.896, + "step": 3096 + }, + { + "epoch": 0.551460113960114, + "grad_norm": 0.5061872601509094, + "learning_rate": 0.0001908672791141625, + "loss": 1.1017, + "step": 3097 + }, + { + "epoch": 0.5516381766381766, + "grad_norm": 0.5189547538757324, + "learning_rate": 
0.00019086143415350404, + "loss": 1.2906, + "step": 3098 + }, + { + "epoch": 0.5518162393162394, + "grad_norm": 0.5640039443969727, + "learning_rate": 0.00019085558741260448, + "loss": 1.1001, + "step": 3099 + }, + { + "epoch": 0.551994301994302, + "grad_norm": 0.453867107629776, + "learning_rate": 0.00019084973889157844, + "loss": 0.9731, + "step": 3100 + }, + { + "epoch": 0.5521723646723646, + "grad_norm": 0.5431303977966309, + "learning_rate": 0.0001908438885905405, + "loss": 1.3511, + "step": 3101 + }, + { + "epoch": 0.5523504273504274, + "grad_norm": 0.47693368792533875, + "learning_rate": 0.00019083803650960527, + "loss": 1.0426, + "step": 3102 + }, + { + "epoch": 0.55252849002849, + "grad_norm": 0.4663422703742981, + "learning_rate": 0.00019083218264888743, + "loss": 1.05, + "step": 3103 + }, + { + "epoch": 0.5527065527065527, + "grad_norm": 0.561354398727417, + "learning_rate": 0.00019082632700850164, + "loss": 0.9608, + "step": 3104 + }, + { + "epoch": 0.5528846153846154, + "grad_norm": 0.4981916844844818, + "learning_rate": 0.00019082046958856266, + "loss": 1.1935, + "step": 3105 + }, + { + "epoch": 0.5530626780626781, + "grad_norm": 0.5301326513290405, + "learning_rate": 0.0001908146103891852, + "loss": 1.0646, + "step": 3106 + }, + { + "epoch": 0.5532407407407407, + "grad_norm": 0.5023610591888428, + "learning_rate": 0.00019080874941048416, + "loss": 1.127, + "step": 3107 + }, + { + "epoch": 0.5534188034188035, + "grad_norm": 0.5172514319419861, + "learning_rate": 0.00019080288665257426, + "loss": 1.0435, + "step": 3108 + }, + { + "epoch": 0.5535968660968661, + "grad_norm": 0.6340598464012146, + "learning_rate": 0.00019079702211557048, + "loss": 1.3528, + "step": 3109 + }, + { + "epoch": 0.5537749287749287, + "grad_norm": 0.46882256865501404, + "learning_rate": 0.0001907911557995876, + "loss": 1.1361, + "step": 3110 + }, + { + "epoch": 0.5539529914529915, + "grad_norm": 0.6401382088661194, + "learning_rate": 0.00019078528770474068, + "loss": 1.2415, + 
"step": 3111 + }, + { + "epoch": 0.5541310541310541, + "grad_norm": 0.5141328573226929, + "learning_rate": 0.00019077941783114463, + "loss": 1.0505, + "step": 3112 + }, + { + "epoch": 0.5543091168091168, + "grad_norm": 0.522318959236145, + "learning_rate": 0.00019077354617891444, + "loss": 1.0964, + "step": 3113 + }, + { + "epoch": 0.5544871794871795, + "grad_norm": 0.539551854133606, + "learning_rate": 0.00019076767274816517, + "loss": 1.0735, + "step": 3114 + }, + { + "epoch": 0.5546652421652422, + "grad_norm": 0.495320200920105, + "learning_rate": 0.00019076179753901195, + "loss": 0.9754, + "step": 3115 + }, + { + "epoch": 0.5548433048433048, + "grad_norm": 0.5499199628829956, + "learning_rate": 0.00019075592055156984, + "loss": 1.0043, + "step": 3116 + }, + { + "epoch": 0.5550213675213675, + "grad_norm": 0.5352509617805481, + "learning_rate": 0.00019075004178595396, + "loss": 1.1701, + "step": 3117 + }, + { + "epoch": 0.5551994301994302, + "grad_norm": 0.5392300486564636, + "learning_rate": 0.00019074416124227953, + "loss": 1.1612, + "step": 3118 + }, + { + "epoch": 0.5553774928774928, + "grad_norm": 0.5195050835609436, + "learning_rate": 0.0001907382789206618, + "loss": 1.0934, + "step": 3119 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 0.5276884436607361, + "learning_rate": 0.000190732394821216, + "loss": 0.9011, + "step": 3120 + }, + { + "epoch": 0.5557336182336182, + "grad_norm": 0.6115903258323669, + "learning_rate": 0.00019072650894405734, + "loss": 1.3065, + "step": 3121 + }, + { + "epoch": 0.5559116809116809, + "grad_norm": 0.5752483010292053, + "learning_rate": 0.00019072062128930127, + "loss": 1.0063, + "step": 3122 + }, + { + "epoch": 0.5560897435897436, + "grad_norm": 0.5508273243904114, + "learning_rate": 0.00019071473185706302, + "loss": 1.2598, + "step": 3123 + }, + { + "epoch": 0.5562678062678063, + "grad_norm": 0.49712198972702026, + "learning_rate": 0.00019070884064745808, + "loss": 0.924, + "step": 3124 + }, + { + "epoch": 
0.5564458689458689, + "grad_norm": 0.572849452495575, + "learning_rate": 0.00019070294766060185, + "loss": 0.9683, + "step": 3125 + }, + { + "epoch": 0.5566239316239316, + "grad_norm": 0.4807920753955841, + "learning_rate": 0.00019069705289660976, + "loss": 1.0998, + "step": 3126 + }, + { + "epoch": 0.5568019943019943, + "grad_norm": 0.5543031096458435, + "learning_rate": 0.0001906911563555973, + "loss": 1.0878, + "step": 3127 + }, + { + "epoch": 0.5569800569800569, + "grad_norm": 0.5710418820381165, + "learning_rate": 0.00019068525803768007, + "loss": 1.0381, + "step": 3128 + }, + { + "epoch": 0.5571581196581197, + "grad_norm": 0.5169163346290588, + "learning_rate": 0.00019067935794297357, + "loss": 1.1149, + "step": 3129 + }, + { + "epoch": 0.5573361823361823, + "grad_norm": 0.6474376916885376, + "learning_rate": 0.00019067345607159345, + "loss": 0.9828, + "step": 3130 + }, + { + "epoch": 0.5575142450142451, + "grad_norm": 0.5029847621917725, + "learning_rate": 0.0001906675524236553, + "loss": 0.797, + "step": 3131 + }, + { + "epoch": 0.5576923076923077, + "grad_norm": 0.5681431293487549, + "learning_rate": 0.00019066164699927478, + "loss": 1.1565, + "step": 3132 + }, + { + "epoch": 0.5578703703703703, + "grad_norm": 0.5654549598693848, + "learning_rate": 0.00019065573979856764, + "loss": 1.2488, + "step": 3133 + }, + { + "epoch": 0.5580484330484331, + "grad_norm": 0.47653043270111084, + "learning_rate": 0.0001906498308216496, + "loss": 1.0428, + "step": 3134 + }, + { + "epoch": 0.5582264957264957, + "grad_norm": 0.5068467259407043, + "learning_rate": 0.00019064392006863643, + "loss": 0.9659, + "step": 3135 + }, + { + "epoch": 0.5584045584045584, + "grad_norm": 0.7076661586761475, + "learning_rate": 0.00019063800753964393, + "loss": 1.1289, + "step": 3136 + }, + { + "epoch": 0.5585826210826211, + "grad_norm": 0.551456868648529, + "learning_rate": 0.000190632093234788, + "loss": 1.1925, + "step": 3137 + }, + { + "epoch": 0.5587606837606838, + "grad_norm": 
0.518276035785675, + "learning_rate": 0.00019062617715418442, + "loss": 0.8681, + "step": 3138 + }, + { + "epoch": 0.5589387464387464, + "grad_norm": 0.5272278785705566, + "learning_rate": 0.0001906202592979492, + "loss": 1.0865, + "step": 3139 + }, + { + "epoch": 0.5591168091168092, + "grad_norm": 0.5344942212104797, + "learning_rate": 0.00019061433966619822, + "loss": 1.1647, + "step": 3140 + }, + { + "epoch": 0.5592948717948718, + "grad_norm": 0.5833460092544556, + "learning_rate": 0.00019060841825904753, + "loss": 1.3403, + "step": 3141 + }, + { + "epoch": 0.5594729344729344, + "grad_norm": 0.5707054734230042, + "learning_rate": 0.00019060249507661306, + "loss": 1.1236, + "step": 3142 + }, + { + "epoch": 0.5596509971509972, + "grad_norm": 0.5446065664291382, + "learning_rate": 0.00019059657011901094, + "loss": 1.017, + "step": 3143 + }, + { + "epoch": 0.5598290598290598, + "grad_norm": 0.5285109281539917, + "learning_rate": 0.0001905906433863572, + "loss": 1.3186, + "step": 3144 + }, + { + "epoch": 0.5600071225071225, + "grad_norm": 0.5308659672737122, + "learning_rate": 0.00019058471487876802, + "loss": 0.8464, + "step": 3145 + }, + { + "epoch": 0.5601851851851852, + "grad_norm": 0.5218054056167603, + "learning_rate": 0.00019057878459635948, + "loss": 1.0219, + "step": 3146 + }, + { + "epoch": 0.5603632478632479, + "grad_norm": 0.45067787170410156, + "learning_rate": 0.00019057285253924785, + "loss": 1.0364, + "step": 3147 + }, + { + "epoch": 0.5605413105413105, + "grad_norm": 0.4856041669845581, + "learning_rate": 0.0001905669187075493, + "loss": 1.1928, + "step": 3148 + }, + { + "epoch": 0.5607193732193733, + "grad_norm": 0.506912112236023, + "learning_rate": 0.00019056098310138016, + "loss": 1.119, + "step": 3149 + }, + { + "epoch": 0.5608974358974359, + "grad_norm": 0.49049463868141174, + "learning_rate": 0.00019055504572085662, + "loss": 1.2165, + "step": 3150 + }, + { + "epoch": 0.5610754985754985, + "grad_norm": 0.5250293612480164, + "learning_rate": 
0.0001905491065660951, + "loss": 1.1427, + "step": 3151 + }, + { + "epoch": 0.5612535612535613, + "grad_norm": 0.43438446521759033, + "learning_rate": 0.00019054316563721195, + "loss": 0.884, + "step": 3152 + }, + { + "epoch": 0.5614316239316239, + "grad_norm": 0.5386807918548584, + "learning_rate": 0.00019053722293432354, + "loss": 1.1494, + "step": 3153 + }, + { + "epoch": 0.5616096866096866, + "grad_norm": 0.5403809547424316, + "learning_rate": 0.00019053127845754632, + "loss": 1.1743, + "step": 3154 + }, + { + "epoch": 0.5617877492877493, + "grad_norm": 0.4759823977947235, + "learning_rate": 0.00019052533220699678, + "loss": 1.0716, + "step": 3155 + }, + { + "epoch": 0.561965811965812, + "grad_norm": 0.45332327485084534, + "learning_rate": 0.0001905193841827914, + "loss": 0.8405, + "step": 3156 + }, + { + "epoch": 0.5621438746438746, + "grad_norm": 0.5617053508758545, + "learning_rate": 0.00019051343438504671, + "loss": 1.0422, + "step": 3157 + }, + { + "epoch": 0.5623219373219374, + "grad_norm": 0.5088049173355103, + "learning_rate": 0.00019050748281387931, + "loss": 1.0067, + "step": 3158 + }, + { + "epoch": 0.5625, + "grad_norm": 0.5174484848976135, + "learning_rate": 0.00019050152946940578, + "loss": 1.0623, + "step": 3159 + }, + { + "epoch": 0.5626780626780626, + "grad_norm": 0.6093568801879883, + "learning_rate": 0.0001904955743517428, + "loss": 1.24, + "step": 3160 + }, + { + "epoch": 0.5628561253561254, + "grad_norm": 0.49063584208488464, + "learning_rate": 0.00019048961746100703, + "loss": 0.8563, + "step": 3161 + }, + { + "epoch": 0.563034188034188, + "grad_norm": 0.583940863609314, + "learning_rate": 0.00019048365879731517, + "loss": 1.0695, + "step": 3162 + }, + { + "epoch": 0.5632122507122507, + "grad_norm": 0.4943268597126007, + "learning_rate": 0.000190477698360784, + "loss": 0.8606, + "step": 3163 + }, + { + "epoch": 0.5633903133903134, + "grad_norm": 0.5050932168960571, + "learning_rate": 0.00019047173615153028, + "loss": 1.1591, + "step": 3164 
+ }, + { + "epoch": 0.5635683760683761, + "grad_norm": 0.5445677638053894, + "learning_rate": 0.0001904657721696708, + "loss": 1.262, + "step": 3165 + }, + { + "epoch": 0.5637464387464387, + "grad_norm": 0.5445297360420227, + "learning_rate": 0.00019045980641532246, + "loss": 1.223, + "step": 3166 + }, + { + "epoch": 0.5639245014245015, + "grad_norm": 0.5098413228988647, + "learning_rate": 0.00019045383888860213, + "loss": 1.0829, + "step": 3167 + }, + { + "epoch": 0.5641025641025641, + "grad_norm": 0.484998881816864, + "learning_rate": 0.0001904478695896267, + "loss": 1.0711, + "step": 3168 + }, + { + "epoch": 0.5642806267806267, + "grad_norm": 0.5515334010124207, + "learning_rate": 0.0001904418985185132, + "loss": 1.1583, + "step": 3169 + }, + { + "epoch": 0.5644586894586895, + "grad_norm": 0.545460045337677, + "learning_rate": 0.00019043592567537853, + "loss": 1.2321, + "step": 3170 + }, + { + "epoch": 0.5646367521367521, + "grad_norm": 0.5463964343070984, + "learning_rate": 0.0001904299510603398, + "loss": 1.1019, + "step": 3171 + }, + { + "epoch": 0.5648148148148148, + "grad_norm": 0.5619220733642578, + "learning_rate": 0.000190423974673514, + "loss": 1.1001, + "step": 3172 + }, + { + "epoch": 0.5649928774928775, + "grad_norm": 0.4448916018009186, + "learning_rate": 0.00019041799651501825, + "loss": 1.057, + "step": 3173 + }, + { + "epoch": 0.5651709401709402, + "grad_norm": 0.6073006987571716, + "learning_rate": 0.00019041201658496975, + "loss": 1.0306, + "step": 3174 + }, + { + "epoch": 0.5653490028490028, + "grad_norm": 0.5342072248458862, + "learning_rate": 0.0001904060348834855, + "loss": 0.9231, + "step": 3175 + }, + { + "epoch": 0.5655270655270656, + "grad_norm": 0.4505697786808014, + "learning_rate": 0.0001904000514106829, + "loss": 1.1134, + "step": 3176 + }, + { + "epoch": 0.5657051282051282, + "grad_norm": 0.5627852082252502, + "learning_rate": 0.00019039406616667902, + "loss": 1.2138, + "step": 3177 + }, + { + "epoch": 0.5658831908831908, + 
"grad_norm": 0.499734103679657, + "learning_rate": 0.0001903880791515912, + "loss": 1.1074, + "step": 3178 + }, + { + "epoch": 0.5660612535612536, + "grad_norm": 0.4768189489841461, + "learning_rate": 0.00019038209036553676, + "loss": 0.9442, + "step": 3179 + }, + { + "epoch": 0.5662393162393162, + "grad_norm": 0.5265373587608337, + "learning_rate": 0.00019037609980863298, + "loss": 1.0907, + "step": 3180 + }, + { + "epoch": 0.5664173789173789, + "grad_norm": 0.5506128072738647, + "learning_rate": 0.00019037010748099728, + "loss": 1.2541, + "step": 3181 + }, + { + "epoch": 0.5665954415954416, + "grad_norm": 0.44860872626304626, + "learning_rate": 0.00019036411338274703, + "loss": 0.893, + "step": 3182 + }, + { + "epoch": 0.5667735042735043, + "grad_norm": 0.4901522994041443, + "learning_rate": 0.00019035811751399973, + "loss": 1.0469, + "step": 3183 + }, + { + "epoch": 0.5669515669515669, + "grad_norm": 0.500868022441864, + "learning_rate": 0.0001903521198748728, + "loss": 1.0527, + "step": 3184 + }, + { + "epoch": 0.5671296296296297, + "grad_norm": 0.5508102774620056, + "learning_rate": 0.00019034612046548376, + "loss": 1.283, + "step": 3185 + }, + { + "epoch": 0.5673076923076923, + "grad_norm": 0.5079495906829834, + "learning_rate": 0.0001903401192859502, + "loss": 1.0808, + "step": 3186 + }, + { + "epoch": 0.5674857549857549, + "grad_norm": 0.5758788585662842, + "learning_rate": 0.00019033411633638964, + "loss": 1.1301, + "step": 3187 + }, + { + "epoch": 0.5676638176638177, + "grad_norm": 0.46557924151420593, + "learning_rate": 0.00019032811161691972, + "loss": 1.0205, + "step": 3188 + }, + { + "epoch": 0.5678418803418803, + "grad_norm": 0.5665056109428406, + "learning_rate": 0.0001903221051276581, + "loss": 1.1926, + "step": 3189 + }, + { + "epoch": 0.5680199430199431, + "grad_norm": 0.5948992967605591, + "learning_rate": 0.00019031609686872246, + "loss": 1.2724, + "step": 3190 + }, + { + "epoch": 0.5681980056980057, + "grad_norm": 0.6189367771148682, + 
"learning_rate": 0.00019031008684023055, + "loss": 1.2762, + "step": 3191 + }, + { + "epoch": 0.5683760683760684, + "grad_norm": 0.49511992931365967, + "learning_rate": 0.00019030407504230006, + "loss": 1.0117, + "step": 3192 + }, + { + "epoch": 0.5685541310541311, + "grad_norm": 0.5358837842941284, + "learning_rate": 0.00019029806147504878, + "loss": 0.944, + "step": 3193 + }, + { + "epoch": 0.5687321937321937, + "grad_norm": 0.458636999130249, + "learning_rate": 0.00019029204613859463, + "loss": 0.8174, + "step": 3194 + }, + { + "epoch": 0.5689102564102564, + "grad_norm": 0.5168304443359375, + "learning_rate": 0.00019028602903305535, + "loss": 1.1533, + "step": 3195 + }, + { + "epoch": 0.5690883190883191, + "grad_norm": 0.5334134697914124, + "learning_rate": 0.00019028001015854892, + "loss": 1.1868, + "step": 3196 + }, + { + "epoch": 0.5692663817663818, + "grad_norm": 0.5649123191833496, + "learning_rate": 0.0001902739895151932, + "loss": 0.9876, + "step": 3197 + }, + { + "epoch": 0.5694444444444444, + "grad_norm": 0.5647651553153992, + "learning_rate": 0.0001902679671031062, + "loss": 1.0805, + "step": 3198 + }, + { + "epoch": 0.5696225071225072, + "grad_norm": 0.5251876711845398, + "learning_rate": 0.00019026194292240587, + "loss": 1.2335, + "step": 3199 + }, + { + "epoch": 0.5698005698005698, + "grad_norm": 0.5268014669418335, + "learning_rate": 0.0001902559169732103, + "loss": 1.19, + "step": 3200 + }, + { + "epoch": 0.5699786324786325, + "grad_norm": 0.5301041007041931, + "learning_rate": 0.00019024988925563752, + "loss": 1.1173, + "step": 3201 + }, + { + "epoch": 0.5701566951566952, + "grad_norm": 0.4531562030315399, + "learning_rate": 0.00019024385976980566, + "loss": 0.7576, + "step": 3202 + }, + { + "epoch": 0.5703347578347578, + "grad_norm": 0.5779716372489929, + "learning_rate": 0.00019023782851583282, + "loss": 1.1719, + "step": 3203 + }, + { + "epoch": 0.5705128205128205, + "grad_norm": 0.4886093735694885, + "learning_rate": 0.00019023179549383716, + 
"loss": 1.085, + "step": 3204 + }, + { + "epoch": 0.5706908831908832, + "grad_norm": 0.510117769241333, + "learning_rate": 0.0001902257607039369, + "loss": 0.8931, + "step": 3205 + }, + { + "epoch": 0.5708689458689459, + "grad_norm": 0.5195479393005371, + "learning_rate": 0.00019021972414625036, + "loss": 0.9922, + "step": 3206 + }, + { + "epoch": 0.5710470085470085, + "grad_norm": 0.5791407227516174, + "learning_rate": 0.00019021368582089568, + "loss": 1.112, + "step": 3207 + }, + { + "epoch": 0.5712250712250713, + "grad_norm": 0.5056005716323853, + "learning_rate": 0.00019020764572799122, + "loss": 0.8474, + "step": 3208 + }, + { + "epoch": 0.5714031339031339, + "grad_norm": 0.5060068964958191, + "learning_rate": 0.00019020160386765537, + "loss": 1.071, + "step": 3209 + }, + { + "epoch": 0.5715811965811965, + "grad_norm": 0.5396568775177002, + "learning_rate": 0.00019019556024000648, + "loss": 1.0436, + "step": 3210 + }, + { + "epoch": 0.5717592592592593, + "grad_norm": 0.6552190780639648, + "learning_rate": 0.0001901895148451629, + "loss": 0.9869, + "step": 3211 + }, + { + "epoch": 0.5719373219373219, + "grad_norm": 0.5177004337310791, + "learning_rate": 0.00019018346768324314, + "loss": 1.0193, + "step": 3212 + }, + { + "epoch": 0.5721153846153846, + "grad_norm": 0.5192117094993591, + "learning_rate": 0.0001901774187543657, + "loss": 1.1263, + "step": 3213 + }, + { + "epoch": 0.5722934472934473, + "grad_norm": 0.4857729971408844, + "learning_rate": 0.00019017136805864906, + "loss": 0.9808, + "step": 3214 + }, + { + "epoch": 0.57247150997151, + "grad_norm": 0.5800918936729431, + "learning_rate": 0.00019016531559621177, + "loss": 1.2334, + "step": 3215 + }, + { + "epoch": 0.5726495726495726, + "grad_norm": 0.4812086522579193, + "learning_rate": 0.00019015926136717242, + "loss": 1.2409, + "step": 3216 + }, + { + "epoch": 0.5728276353276354, + "grad_norm": 0.5128398537635803, + "learning_rate": 0.00019015320537164963, + "loss": 0.9036, + "step": 3217 + }, + { + 
"epoch": 0.573005698005698, + "grad_norm": 0.4761141538619995, + "learning_rate": 0.00019014714760976205, + "loss": 1.1058, + "step": 3218 + }, + { + "epoch": 0.5731837606837606, + "grad_norm": 0.5850459933280945, + "learning_rate": 0.0001901410880816284, + "loss": 1.1011, + "step": 3219 + }, + { + "epoch": 0.5733618233618234, + "grad_norm": 0.5648714303970337, + "learning_rate": 0.00019013502678736738, + "loss": 1.0479, + "step": 3220 + }, + { + "epoch": 0.573539886039886, + "grad_norm": 0.5835902094841003, + "learning_rate": 0.00019012896372709774, + "loss": 1.0555, + "step": 3221 + }, + { + "epoch": 0.5737179487179487, + "grad_norm": 0.5155113935470581, + "learning_rate": 0.00019012289890093828, + "loss": 0.9488, + "step": 3222 + }, + { + "epoch": 0.5738960113960114, + "grad_norm": 0.5064889788627625, + "learning_rate": 0.00019011683230900784, + "loss": 0.9144, + "step": 3223 + }, + { + "epoch": 0.5740740740740741, + "grad_norm": 0.53825843334198, + "learning_rate": 0.00019011076395142527, + "loss": 1.0713, + "step": 3224 + }, + { + "epoch": 0.5742521367521367, + "grad_norm": 0.5341386198997498, + "learning_rate": 0.00019010469382830947, + "loss": 1.1438, + "step": 3225 + }, + { + "epoch": 0.5744301994301995, + "grad_norm": 0.5300050973892212, + "learning_rate": 0.00019009862193977936, + "loss": 1.0114, + "step": 3226 + }, + { + "epoch": 0.5746082621082621, + "grad_norm": 0.6033682823181152, + "learning_rate": 0.0001900925482859539, + "loss": 1.0458, + "step": 3227 + }, + { + "epoch": 0.5747863247863247, + "grad_norm": 0.5108983516693115, + "learning_rate": 0.00019008647286695215, + "loss": 1.1211, + "step": 3228 + }, + { + "epoch": 0.5749643874643875, + "grad_norm": 0.5263782739639282, + "learning_rate": 0.00019008039568289308, + "loss": 0.8647, + "step": 3229 + }, + { + "epoch": 0.5751424501424501, + "grad_norm": 0.47119566798210144, + "learning_rate": 0.0001900743167338958, + "loss": 1.019, + "step": 3230 + }, + { + "epoch": 0.5753205128205128, + "grad_norm": 
0.56391841173172, + "learning_rate": 0.00019006823602007937, + "loss": 0.9791, + "step": 3231 + }, + { + "epoch": 0.5754985754985755, + "grad_norm": 0.5364985466003418, + "learning_rate": 0.000190062153541563, + "loss": 1.1355, + "step": 3232 + }, + { + "epoch": 0.5756766381766382, + "grad_norm": 0.5098565220832825, + "learning_rate": 0.00019005606929846578, + "loss": 0.987, + "step": 3233 + }, + { + "epoch": 0.5758547008547008, + "grad_norm": 0.6640968918800354, + "learning_rate": 0.00019004998329090692, + "loss": 1.1165, + "step": 3234 + }, + { + "epoch": 0.5760327635327636, + "grad_norm": 0.5044721961021423, + "learning_rate": 0.00019004389551900578, + "loss": 0.8643, + "step": 3235 + }, + { + "epoch": 0.5762108262108262, + "grad_norm": 0.4822785258293152, + "learning_rate": 0.00019003780598288153, + "loss": 1.0735, + "step": 3236 + }, + { + "epoch": 0.5763888888888888, + "grad_norm": 0.505261242389679, + "learning_rate": 0.00019003171468265348, + "loss": 1.0001, + "step": 3237 + }, + { + "epoch": 0.5765669515669516, + "grad_norm": 0.5020412802696228, + "learning_rate": 0.00019002562161844102, + "loss": 0.9601, + "step": 3238 + }, + { + "epoch": 0.5767450142450142, + "grad_norm": 0.4920475482940674, + "learning_rate": 0.00019001952679036354, + "loss": 1.0111, + "step": 3239 + }, + { + "epoch": 0.5769230769230769, + "grad_norm": 0.5638813376426697, + "learning_rate": 0.00019001343019854042, + "loss": 1.1456, + "step": 3240 + }, + { + "epoch": 0.5771011396011396, + "grad_norm": 0.5519235134124756, + "learning_rate": 0.0001900073318430911, + "loss": 0.9258, + "step": 3241 + }, + { + "epoch": 0.5772792022792023, + "grad_norm": 0.5207770466804504, + "learning_rate": 0.0001900012317241351, + "loss": 0.9859, + "step": 3242 + }, + { + "epoch": 0.5774572649572649, + "grad_norm": 0.5493707656860352, + "learning_rate": 0.00018999512984179195, + "loss": 1.1183, + "step": 3243 + }, + { + "epoch": 0.5776353276353277, + "grad_norm": 0.4504764676094055, + "learning_rate": 
0.00018998902619618116, + "loss": 0.9363, + "step": 3244 + }, + { + "epoch": 0.5778133903133903, + "grad_norm": 0.5232836604118347, + "learning_rate": 0.00018998292078742233, + "loss": 1.1887, + "step": 3245 + }, + { + "epoch": 0.5779914529914529, + "grad_norm": 0.5715088248252869, + "learning_rate": 0.0001899768136156351, + "loss": 1.4524, + "step": 3246 + }, + { + "epoch": 0.5781695156695157, + "grad_norm": 0.59555584192276, + "learning_rate": 0.0001899707046809391, + "loss": 1.0922, + "step": 3247 + }, + { + "epoch": 0.5783475783475783, + "grad_norm": 0.4500894546508789, + "learning_rate": 0.00018996459398345404, + "loss": 1.0087, + "step": 3248 + }, + { + "epoch": 0.5785256410256411, + "grad_norm": 0.49126625061035156, + "learning_rate": 0.00018995848152329967, + "loss": 1.1512, + "step": 3249 + }, + { + "epoch": 0.5787037037037037, + "grad_norm": 0.4096335172653198, + "learning_rate": 0.00018995236730059574, + "loss": 0.7633, + "step": 3250 + }, + { + "epoch": 0.5788817663817664, + "grad_norm": 0.5364313721656799, + "learning_rate": 0.00018994625131546199, + "loss": 1.295, + "step": 3251 + }, + { + "epoch": 0.5790598290598291, + "grad_norm": 0.4897502660751343, + "learning_rate": 0.00018994013356801834, + "loss": 1.2197, + "step": 3252 + }, + { + "epoch": 0.5792378917378918, + "grad_norm": 0.5101368427276611, + "learning_rate": 0.00018993401405838456, + "loss": 1.1129, + "step": 3253 + }, + { + "epoch": 0.5794159544159544, + "grad_norm": 0.5426377654075623, + "learning_rate": 0.00018992789278668063, + "loss": 1.188, + "step": 3254 + }, + { + "epoch": 0.5795940170940171, + "grad_norm": 0.5066362023353577, + "learning_rate": 0.00018992176975302644, + "loss": 1.2802, + "step": 3255 + }, + { + "epoch": 0.5797720797720798, + "grad_norm": 0.5418947339057922, + "learning_rate": 0.00018991564495754196, + "loss": 1.1675, + "step": 3256 + }, + { + "epoch": 0.5799501424501424, + "grad_norm": 0.5139963626861572, + "learning_rate": 0.0001899095184003472, + "loss": 0.9717, 
+ "step": 3257 + }, + { + "epoch": 0.5801282051282052, + "grad_norm": 0.5167285799980164, + "learning_rate": 0.00018990339008156219, + "loss": 1.1529, + "step": 3258 + }, + { + "epoch": 0.5803062678062678, + "grad_norm": 0.53471440076828, + "learning_rate": 0.00018989726000130704, + "loss": 1.0711, + "step": 3259 + }, + { + "epoch": 0.5804843304843305, + "grad_norm": 0.49875229597091675, + "learning_rate": 0.0001898911281597018, + "loss": 1.1095, + "step": 3260 + }, + { + "epoch": 0.5806623931623932, + "grad_norm": 0.4473155438899994, + "learning_rate": 0.00018988499455686663, + "loss": 0.836, + "step": 3261 + }, + { + "epoch": 0.5808404558404558, + "grad_norm": 0.6181996464729309, + "learning_rate": 0.00018987885919292174, + "loss": 1.2787, + "step": 3262 + }, + { + "epoch": 0.5810185185185185, + "grad_norm": 0.4996899664402008, + "learning_rate": 0.00018987272206798733, + "loss": 1.2132, + "step": 3263 + }, + { + "epoch": 0.5811965811965812, + "grad_norm": 0.49979713559150696, + "learning_rate": 0.00018986658318218358, + "loss": 0.8388, + "step": 3264 + }, + { + "epoch": 0.5813746438746439, + "grad_norm": 0.5288876295089722, + "learning_rate": 0.00018986044253563084, + "loss": 1.1871, + "step": 3265 + }, + { + "epoch": 0.5815527065527065, + "grad_norm": 0.534063458442688, + "learning_rate": 0.00018985430012844937, + "loss": 0.96, + "step": 3266 + }, + { + "epoch": 0.5817307692307693, + "grad_norm": 0.5081285834312439, + "learning_rate": 0.00018984815596075953, + "loss": 1.1577, + "step": 3267 + }, + { + "epoch": 0.5819088319088319, + "grad_norm": 0.5648202896118164, + "learning_rate": 0.00018984201003268176, + "loss": 1.2235, + "step": 3268 + }, + { + "epoch": 0.5820868945868946, + "grad_norm": 0.495061993598938, + "learning_rate": 0.00018983586234433642, + "loss": 1.056, + "step": 3269 + }, + { + "epoch": 0.5822649572649573, + "grad_norm": 0.47149857878685, + "learning_rate": 0.000189829712895844, + "loss": 1.0844, + "step": 3270 + }, + { + "epoch": 
0.58244301994302, + "grad_norm": 0.6107062697410583, + "learning_rate": 0.00018982356168732492, + "loss": 0.9868, + "step": 3271 + }, + { + "epoch": 0.5826210826210826, + "grad_norm": 0.7355940341949463, + "learning_rate": 0.00018981740871889974, + "loss": 1.1448, + "step": 3272 + }, + { + "epoch": 0.5827991452991453, + "grad_norm": 0.5950441956520081, + "learning_rate": 0.00018981125399068907, + "loss": 0.9618, + "step": 3273 + }, + { + "epoch": 0.582977207977208, + "grad_norm": 0.47607290744781494, + "learning_rate": 0.0001898050975028134, + "loss": 0.957, + "step": 3274 + }, + { + "epoch": 0.5831552706552706, + "grad_norm": 0.541164755821228, + "learning_rate": 0.00018979893925539338, + "loss": 1.1426, + "step": 3275 + }, + { + "epoch": 0.5833333333333334, + "grad_norm": 0.5240640044212341, + "learning_rate": 0.00018979277924854974, + "loss": 1.1421, + "step": 3276 + }, + { + "epoch": 0.583511396011396, + "grad_norm": 0.48155727982521057, + "learning_rate": 0.00018978661748240307, + "loss": 1.0069, + "step": 3277 + }, + { + "epoch": 0.5836894586894587, + "grad_norm": 0.5559938549995422, + "learning_rate": 0.00018978045395707418, + "loss": 1.1227, + "step": 3278 + }, + { + "epoch": 0.5838675213675214, + "grad_norm": 0.5244291424751282, + "learning_rate": 0.0001897742886726838, + "loss": 1.1103, + "step": 3279 + }, + { + "epoch": 0.584045584045584, + "grad_norm": 0.5277758240699768, + "learning_rate": 0.00018976812162935268, + "loss": 1.2125, + "step": 3280 + }, + { + "epoch": 0.5842236467236467, + "grad_norm": 0.5415039658546448, + "learning_rate": 0.00018976195282720173, + "loss": 1.146, + "step": 3281 + }, + { + "epoch": 0.5844017094017094, + "grad_norm": 0.5152051448822021, + "learning_rate": 0.00018975578226635177, + "loss": 1.0092, + "step": 3282 + }, + { + "epoch": 0.5845797720797721, + "grad_norm": 0.5489452481269836, + "learning_rate": 0.00018974960994692371, + "loss": 1.2425, + "step": 3283 + }, + { + "epoch": 0.5847578347578347, + "grad_norm": 
0.491274356842041, + "learning_rate": 0.00018974343586903848, + "loss": 0.9559, + "step": 3284 + }, + { + "epoch": 0.5849358974358975, + "grad_norm": 0.5783739686012268, + "learning_rate": 0.00018973726003281707, + "loss": 1.1971, + "step": 3285 + }, + { + "epoch": 0.5851139601139601, + "grad_norm": 0.5056472420692444, + "learning_rate": 0.00018973108243838045, + "loss": 1.0313, + "step": 3286 + }, + { + "epoch": 0.5852920227920227, + "grad_norm": 0.4939729571342468, + "learning_rate": 0.00018972490308584962, + "loss": 1.1061, + "step": 3287 + }, + { + "epoch": 0.5854700854700855, + "grad_norm": 0.4889580011367798, + "learning_rate": 0.00018971872197534576, + "loss": 0.9157, + "step": 3288 + }, + { + "epoch": 0.5856481481481481, + "grad_norm": 0.40889349579811096, + "learning_rate": 0.00018971253910698993, + "loss": 0.8083, + "step": 3289 + }, + { + "epoch": 0.5858262108262108, + "grad_norm": 0.5221503973007202, + "learning_rate": 0.00018970635448090322, + "loss": 0.9995, + "step": 3290 + }, + { + "epoch": 0.5860042735042735, + "grad_norm": 0.47060561180114746, + "learning_rate": 0.00018970016809720687, + "loss": 0.9738, + "step": 3291 + }, + { + "epoch": 0.5861823361823362, + "grad_norm": 0.6083170771598816, + "learning_rate": 0.000189693979956022, + "loss": 1.188, + "step": 3292 + }, + { + "epoch": 0.5863603988603988, + "grad_norm": 0.4696751534938812, + "learning_rate": 0.00018968779005746998, + "loss": 1.089, + "step": 3293 + }, + { + "epoch": 0.5865384615384616, + "grad_norm": 0.5081014633178711, + "learning_rate": 0.00018968159840167202, + "loss": 1.1869, + "step": 3294 + }, + { + "epoch": 0.5867165242165242, + "grad_norm": 0.48042431473731995, + "learning_rate": 0.0001896754049887494, + "loss": 0.964, + "step": 3295 + }, + { + "epoch": 0.5868945868945868, + "grad_norm": 0.5075193643569946, + "learning_rate": 0.00018966920981882353, + "loss": 1.1884, + "step": 3296 + }, + { + "epoch": 0.5870726495726496, + "grad_norm": 0.5734842419624329, + "learning_rate": 
0.00018966301289201576, + "loss": 1.1475, + "step": 3297 + }, + { + "epoch": 0.5872507122507122, + "grad_norm": 0.5525311231613159, + "learning_rate": 0.00018965681420844753, + "loss": 1.241, + "step": 3298 + }, + { + "epoch": 0.5874287749287749, + "grad_norm": 0.48142680525779724, + "learning_rate": 0.00018965061376824025, + "loss": 1.0871, + "step": 3299 + }, + { + "epoch": 0.5876068376068376, + "grad_norm": 0.5360350608825684, + "learning_rate": 0.00018964441157151544, + "loss": 1.1895, + "step": 3300 + }, + { + "epoch": 0.5877849002849003, + "grad_norm": 0.5207685232162476, + "learning_rate": 0.00018963820761839457, + "loss": 0.9323, + "step": 3301 + }, + { + "epoch": 0.5879629629629629, + "grad_norm": 0.453620970249176, + "learning_rate": 0.00018963200190899926, + "loss": 0.802, + "step": 3302 + }, + { + "epoch": 0.5881410256410257, + "grad_norm": 0.5198796391487122, + "learning_rate": 0.00018962579444345106, + "loss": 1.0243, + "step": 3303 + }, + { + "epoch": 0.5883190883190883, + "grad_norm": 0.5597525835037231, + "learning_rate": 0.0001896195852218716, + "loss": 0.9351, + "step": 3304 + }, + { + "epoch": 0.5884971509971509, + "grad_norm": 0.5738299489021301, + "learning_rate": 0.00018961337424438254, + "loss": 1.3737, + "step": 3305 + }, + { + "epoch": 0.5886752136752137, + "grad_norm": 0.5569949150085449, + "learning_rate": 0.00018960716151110554, + "loss": 1.0469, + "step": 3306 + }, + { + "epoch": 0.5888532763532763, + "grad_norm": 0.5088010430335999, + "learning_rate": 0.00018960094702216238, + "loss": 1.0982, + "step": 3307 + }, + { + "epoch": 0.5890313390313391, + "grad_norm": 0.5127636790275574, + "learning_rate": 0.0001895947307776748, + "loss": 0.9986, + "step": 3308 + }, + { + "epoch": 0.5892094017094017, + "grad_norm": 0.5160682797431946, + "learning_rate": 0.00018958851277776456, + "loss": 1.0219, + "step": 3309 + }, + { + "epoch": 0.5893874643874644, + "grad_norm": 0.5380711555480957, + "learning_rate": 0.00018958229302255356, + "loss": 1.118, 
+ "step": 3310 + }, + { + "epoch": 0.5895655270655271, + "grad_norm": 0.5571228861808777, + "learning_rate": 0.0001895760715121636, + "loss": 1.0302, + "step": 3311 + }, + { + "epoch": 0.5897435897435898, + "grad_norm": 0.542266309261322, + "learning_rate": 0.00018956984824671657, + "loss": 1.0372, + "step": 3312 + }, + { + "epoch": 0.5899216524216524, + "grad_norm": 0.48350459337234497, + "learning_rate": 0.00018956362322633446, + "loss": 1.2, + "step": 3313 + }, + { + "epoch": 0.5900997150997151, + "grad_norm": 0.5001645088195801, + "learning_rate": 0.0001895573964511392, + "loss": 0.9749, + "step": 3314 + }, + { + "epoch": 0.5902777777777778, + "grad_norm": 0.5227531790733337, + "learning_rate": 0.00018955116792125276, + "loss": 1.025, + "step": 3315 + }, + { + "epoch": 0.5904558404558404, + "grad_norm": 0.522251546382904, + "learning_rate": 0.00018954493763679727, + "loss": 1.0821, + "step": 3316 + }, + { + "epoch": 0.5906339031339032, + "grad_norm": 0.5423251390457153, + "learning_rate": 0.00018953870559789467, + "loss": 1.0961, + "step": 3317 + }, + { + "epoch": 0.5908119658119658, + "grad_norm": 0.5615720748901367, + "learning_rate": 0.0001895324718046672, + "loss": 1.1209, + "step": 3318 + }, + { + "epoch": 0.5909900284900285, + "grad_norm": 0.44746771454811096, + "learning_rate": 0.00018952623625723692, + "loss": 0.9935, + "step": 3319 + }, + { + "epoch": 0.5911680911680912, + "grad_norm": 0.5993229150772095, + "learning_rate": 0.00018951999895572597, + "loss": 1.1409, + "step": 3320 + }, + { + "epoch": 0.5913461538461539, + "grad_norm": 0.4969801902770996, + "learning_rate": 0.00018951375990025666, + "loss": 1.1568, + "step": 3321 + }, + { + "epoch": 0.5915242165242165, + "grad_norm": 0.6001267433166504, + "learning_rate": 0.00018950751909095116, + "loss": 1.1135, + "step": 3322 + }, + { + "epoch": 0.5917022792022792, + "grad_norm": 0.5386021733283997, + "learning_rate": 0.00018950127652793172, + "loss": 0.947, + "step": 3323 + }, + { + "epoch": 
0.5918803418803419, + "grad_norm": 0.49043843150138855, + "learning_rate": 0.00018949503221132074, + "loss": 0.9581, + "step": 3324 + }, + { + "epoch": 0.5920584045584045, + "grad_norm": 0.5241141319274902, + "learning_rate": 0.00018948878614124048, + "loss": 1.0797, + "step": 3325 + }, + { + "epoch": 0.5922364672364673, + "grad_norm": 0.5755026340484619, + "learning_rate": 0.00018948253831781338, + "loss": 1.1046, + "step": 3326 + }, + { + "epoch": 0.5924145299145299, + "grad_norm": 0.5004449486732483, + "learning_rate": 0.00018947628874116179, + "loss": 1.1416, + "step": 3327 + }, + { + "epoch": 0.5925925925925926, + "grad_norm": 0.53347247838974, + "learning_rate": 0.00018947003741140821, + "loss": 1.2718, + "step": 3328 + }, + { + "epoch": 0.5927706552706553, + "grad_norm": 0.6473469138145447, + "learning_rate": 0.0001894637843286751, + "loss": 1.2255, + "step": 3329 + }, + { + "epoch": 0.592948717948718, + "grad_norm": 0.4750518798828125, + "learning_rate": 0.00018945752949308498, + "loss": 1.0537, + "step": 3330 + }, + { + "epoch": 0.5931267806267806, + "grad_norm": 0.5636306405067444, + "learning_rate": 0.00018945127290476043, + "loss": 0.9906, + "step": 3331 + }, + { + "epoch": 0.5933048433048433, + "grad_norm": 0.4871736466884613, + "learning_rate": 0.00018944501456382397, + "loss": 1.0549, + "step": 3332 + }, + { + "epoch": 0.593482905982906, + "grad_norm": 0.5554637312889099, + "learning_rate": 0.0001894387544703983, + "loss": 1.1587, + "step": 3333 + }, + { + "epoch": 0.5936609686609686, + "grad_norm": 0.5385799407958984, + "learning_rate": 0.000189432492624606, + "loss": 0.9565, + "step": 3334 + }, + { + "epoch": 0.5938390313390314, + "grad_norm": 0.4996553063392639, + "learning_rate": 0.00018942622902656976, + "loss": 1.0456, + "step": 3335 + }, + { + "epoch": 0.594017094017094, + "grad_norm": 0.46810707449913025, + "learning_rate": 0.00018941996367641237, + "loss": 1.119, + "step": 3336 + }, + { + "epoch": 0.5941951566951567, + "grad_norm": 
0.5672653913497925, + "learning_rate": 0.0001894136965742565, + "loss": 1.1317, + "step": 3337 + }, + { + "epoch": 0.5943732193732194, + "grad_norm": 0.4790053367614746, + "learning_rate": 0.00018940742772022504, + "loss": 1.0967, + "step": 3338 + }, + { + "epoch": 0.594551282051282, + "grad_norm": 0.5935906171798706, + "learning_rate": 0.00018940115711444072, + "loss": 1.3044, + "step": 3339 + }, + { + "epoch": 0.5947293447293447, + "grad_norm": 0.4790516793727875, + "learning_rate": 0.00018939488475702647, + "loss": 1.074, + "step": 3340 + }, + { + "epoch": 0.5949074074074074, + "grad_norm": 0.474588006734848, + "learning_rate": 0.00018938861064810516, + "loss": 1.1476, + "step": 3341 + }, + { + "epoch": 0.5950854700854701, + "grad_norm": 0.4908665120601654, + "learning_rate": 0.0001893823347877997, + "loss": 1.216, + "step": 3342 + }, + { + "epoch": 0.5952635327635327, + "grad_norm": 0.531650960445404, + "learning_rate": 0.00018937605717623307, + "loss": 1.1057, + "step": 3343 + }, + { + "epoch": 0.5954415954415955, + "grad_norm": 0.5581082105636597, + "learning_rate": 0.00018936977781352823, + "loss": 0.7972, + "step": 3344 + }, + { + "epoch": 0.5956196581196581, + "grad_norm": 0.42370662093162537, + "learning_rate": 0.00018936349669980827, + "loss": 0.8888, + "step": 3345 + }, + { + "epoch": 0.5957977207977208, + "grad_norm": 0.5817318558692932, + "learning_rate": 0.00018935721383519624, + "loss": 1.2801, + "step": 3346 + }, + { + "epoch": 0.5959757834757835, + "grad_norm": 0.4766376316547394, + "learning_rate": 0.00018935092921981524, + "loss": 1.0918, + "step": 3347 + }, + { + "epoch": 0.5961538461538461, + "grad_norm": 0.5567346811294556, + "learning_rate": 0.00018934464285378836, + "loss": 1.0269, + "step": 3348 + }, + { + "epoch": 0.5963319088319088, + "grad_norm": 0.5285565257072449, + "learning_rate": 0.0001893383547372388, + "loss": 1.1887, + "step": 3349 + }, + { + "epoch": 0.5965099715099715, + "grad_norm": 0.49052694439888, + "learning_rate": 
0.00018933206487028979, + "loss": 1.0773, + "step": 3350 + }, + { + "epoch": 0.5966880341880342, + "grad_norm": 0.6175199151039124, + "learning_rate": 0.0001893257732530645, + "loss": 1.0192, + "step": 3351 + }, + { + "epoch": 0.5968660968660968, + "grad_norm": 0.56049644947052, + "learning_rate": 0.00018931947988568628, + "loss": 0.9516, + "step": 3352 + }, + { + "epoch": 0.5970441595441596, + "grad_norm": 0.47873660922050476, + "learning_rate": 0.00018931318476827838, + "loss": 0.8174, + "step": 3353 + }, + { + "epoch": 0.5972222222222222, + "grad_norm": 0.4748854339122772, + "learning_rate": 0.00018930688790096416, + "loss": 1.0238, + "step": 3354 + }, + { + "epoch": 0.5974002849002849, + "grad_norm": 0.5382232666015625, + "learning_rate": 0.00018930058928386698, + "loss": 1.0815, + "step": 3355 + }, + { + "epoch": 0.5975783475783476, + "grad_norm": 0.5038299560546875, + "learning_rate": 0.00018929428891711027, + "loss": 1.0472, + "step": 3356 + }, + { + "epoch": 0.5977564102564102, + "grad_norm": 0.5185908079147339, + "learning_rate": 0.00018928798680081744, + "loss": 1.0435, + "step": 3357 + }, + { + "epoch": 0.5979344729344729, + "grad_norm": 0.5169877409934998, + "learning_rate": 0.00018928168293511202, + "loss": 1.0437, + "step": 3358 + }, + { + "epoch": 0.5981125356125356, + "grad_norm": 0.5218369960784912, + "learning_rate": 0.00018927537732011749, + "loss": 1.082, + "step": 3359 + }, + { + "epoch": 0.5982905982905983, + "grad_norm": 0.5358219742774963, + "learning_rate": 0.0001892690699559574, + "loss": 1.2523, + "step": 3360 + }, + { + "epoch": 0.5984686609686609, + "grad_norm": 0.47716647386550903, + "learning_rate": 0.0001892627608427553, + "loss": 1.2069, + "step": 3361 + }, + { + "epoch": 0.5986467236467237, + "grad_norm": 0.5484169125556946, + "learning_rate": 0.00018925644998063482, + "loss": 1.2016, + "step": 3362 + }, + { + "epoch": 0.5988247863247863, + "grad_norm": 0.46814846992492676, + "learning_rate": 0.00018925013736971965, + "loss": 
0.7989, + "step": 3363 + }, + { + "epoch": 0.5990028490028491, + "grad_norm": 0.5391258001327515, + "learning_rate": 0.0001892438230101334, + "loss": 1.224, + "step": 3364 + }, + { + "epoch": 0.5991809116809117, + "grad_norm": 0.5248384475708008, + "learning_rate": 0.00018923750690199987, + "loss": 1.1532, + "step": 3365 + }, + { + "epoch": 0.5993589743589743, + "grad_norm": 0.5074637532234192, + "learning_rate": 0.00018923118904544273, + "loss": 1.0968, + "step": 3366 + }, + { + "epoch": 0.5995370370370371, + "grad_norm": 0.5260029435157776, + "learning_rate": 0.00018922486944058581, + "loss": 1.1311, + "step": 3367 + }, + { + "epoch": 0.5997150997150997, + "grad_norm": 0.48497965931892395, + "learning_rate": 0.00018921854808755294, + "loss": 1.1208, + "step": 3368 + }, + { + "epoch": 0.5998931623931624, + "grad_norm": 0.5108651518821716, + "learning_rate": 0.00018921222498646792, + "loss": 1.147, + "step": 3369 + }, + { + "epoch": 0.6000712250712251, + "grad_norm": 0.5243437886238098, + "learning_rate": 0.00018920590013745471, + "loss": 0.9614, + "step": 3370 + }, + { + "epoch": 0.6002492877492878, + "grad_norm": 0.47022634744644165, + "learning_rate": 0.00018919957354063719, + "loss": 1.0579, + "step": 3371 + }, + { + "epoch": 0.6004273504273504, + "grad_norm": 0.6461413502693176, + "learning_rate": 0.00018919324519613931, + "loss": 1.2126, + "step": 3372 + }, + { + "epoch": 0.6006054131054132, + "grad_norm": 0.4654616713523865, + "learning_rate": 0.00018918691510408508, + "loss": 1.1476, + "step": 3373 + }, + { + "epoch": 0.6007834757834758, + "grad_norm": 0.48571303486824036, + "learning_rate": 0.00018918058326459854, + "loss": 1.2093, + "step": 3374 + }, + { + "epoch": 0.6009615384615384, + "grad_norm": 0.5255016684532166, + "learning_rate": 0.00018917424967780368, + "loss": 1.1538, + "step": 3375 + }, + { + "epoch": 0.6011396011396012, + "grad_norm": 0.5059894323348999, + "learning_rate": 0.00018916791434382468, + "loss": 1.0556, + "step": 3376 + }, + { + 
"epoch": 0.6013176638176638, + "grad_norm": 0.4581229090690613, + "learning_rate": 0.00018916157726278561, + "loss": 1.1468, + "step": 3377 + }, + { + "epoch": 0.6014957264957265, + "grad_norm": 0.5701818466186523, + "learning_rate": 0.00018915523843481067, + "loss": 1.3641, + "step": 3378 + }, + { + "epoch": 0.6016737891737892, + "grad_norm": 0.5007243752479553, + "learning_rate": 0.00018914889786002403, + "loss": 1.2705, + "step": 3379 + }, + { + "epoch": 0.6018518518518519, + "grad_norm": 0.5192995071411133, + "learning_rate": 0.0001891425555385499, + "loss": 0.9922, + "step": 3380 + }, + { + "epoch": 0.6020299145299145, + "grad_norm": 0.5880612134933472, + "learning_rate": 0.00018913621147051258, + "loss": 0.8783, + "step": 3381 + }, + { + "epoch": 0.6022079772079773, + "grad_norm": 0.5161563158035278, + "learning_rate": 0.0001891298656560364, + "loss": 0.9634, + "step": 3382 + }, + { + "epoch": 0.6023860398860399, + "grad_norm": 0.48450782895088196, + "learning_rate": 0.00018912351809524563, + "loss": 0.809, + "step": 3383 + }, + { + "epoch": 0.6025641025641025, + "grad_norm": 0.621537983417511, + "learning_rate": 0.00018911716878826465, + "loss": 1.2031, + "step": 3384 + }, + { + "epoch": 0.6027421652421653, + "grad_norm": 0.6014544367790222, + "learning_rate": 0.00018911081773521787, + "loss": 1.1552, + "step": 3385 + }, + { + "epoch": 0.6029202279202279, + "grad_norm": 0.49995481967926025, + "learning_rate": 0.00018910446493622976, + "loss": 0.8569, + "step": 3386 + }, + { + "epoch": 0.6030982905982906, + "grad_norm": 0.5157307386398315, + "learning_rate": 0.00018909811039142472, + "loss": 0.9515, + "step": 3387 + }, + { + "epoch": 0.6032763532763533, + "grad_norm": 0.5164140462875366, + "learning_rate": 0.0001890917541009273, + "loss": 0.9803, + "step": 3388 + }, + { + "epoch": 0.603454415954416, + "grad_norm": 0.5555596947669983, + "learning_rate": 0.00018908539606486206, + "loss": 1.2994, + "step": 3389 + }, + { + "epoch": 0.6036324786324786, + 
"grad_norm": 0.605697512626648, + "learning_rate": 0.00018907903628335353, + "loss": 1.2865, + "step": 3390 + }, + { + "epoch": 0.6038105413105413, + "grad_norm": 0.5700713992118835, + "learning_rate": 0.0001890726747565263, + "loss": 1.2493, + "step": 3391 + }, + { + "epoch": 0.603988603988604, + "grad_norm": 0.5516746044158936, + "learning_rate": 0.0001890663114845051, + "loss": 1.2743, + "step": 3392 + }, + { + "epoch": 0.6041666666666666, + "grad_norm": 0.5233162641525269, + "learning_rate": 0.0001890599464674145, + "loss": 0.9237, + "step": 3393 + }, + { + "epoch": 0.6043447293447294, + "grad_norm": 0.5709942579269409, + "learning_rate": 0.00018905357970537925, + "loss": 0.9922, + "step": 3394 + }, + { + "epoch": 0.604522792022792, + "grad_norm": 0.48403796553611755, + "learning_rate": 0.0001890472111985241, + "loss": 1.1255, + "step": 3395 + }, + { + "epoch": 0.6047008547008547, + "grad_norm": 0.628718376159668, + "learning_rate": 0.00018904084094697386, + "loss": 1.1458, + "step": 3396 + }, + { + "epoch": 0.6048789173789174, + "grad_norm": 0.46822869777679443, + "learning_rate": 0.00018903446895085328, + "loss": 0.8727, + "step": 3397 + }, + { + "epoch": 0.60505698005698, + "grad_norm": 0.505584180355072, + "learning_rate": 0.00018902809521028724, + "loss": 1.1595, + "step": 3398 + }, + { + "epoch": 0.6052350427350427, + "grad_norm": 0.4494974911212921, + "learning_rate": 0.00018902171972540058, + "loss": 0.6685, + "step": 3399 + }, + { + "epoch": 0.6054131054131054, + "grad_norm": 0.5101519227027893, + "learning_rate": 0.0001890153424963183, + "loss": 0.9313, + "step": 3400 + }, + { + "epoch": 0.6055911680911681, + "grad_norm": 0.5081079602241516, + "learning_rate": 0.00018900896352316528, + "loss": 1.2588, + "step": 3401 + }, + { + "epoch": 0.6057692307692307, + "grad_norm": 0.5784309506416321, + "learning_rate": 0.00018900258280606653, + "loss": 1.2077, + "step": 3402 + }, + { + "epoch": 0.6059472934472935, + "grad_norm": 0.4506312608718872, + 
"learning_rate": 0.00018899620034514705, + "loss": 1.05, + "step": 3403 + }, + { + "epoch": 0.6061253561253561, + "grad_norm": 0.5243048071861267, + "learning_rate": 0.0001889898161405319, + "loss": 1.2295, + "step": 3404 + }, + { + "epoch": 0.6063034188034188, + "grad_norm": 0.5447196364402771, + "learning_rate": 0.00018898343019234615, + "loss": 1.1476, + "step": 3405 + }, + { + "epoch": 0.6064814814814815, + "grad_norm": 0.46813663840293884, + "learning_rate": 0.00018897704250071492, + "loss": 1.2113, + "step": 3406 + }, + { + "epoch": 0.6066595441595442, + "grad_norm": 0.5340631604194641, + "learning_rate": 0.00018897065306576342, + "loss": 1.1656, + "step": 3407 + }, + { + "epoch": 0.6068376068376068, + "grad_norm": 0.513708233833313, + "learning_rate": 0.00018896426188761675, + "loss": 1.1616, + "step": 3408 + }, + { + "epoch": 0.6070156695156695, + "grad_norm": 0.594601035118103, + "learning_rate": 0.00018895786896640023, + "loss": 1.2564, + "step": 3409 + }, + { + "epoch": 0.6071937321937322, + "grad_norm": 0.45067599415779114, + "learning_rate": 0.000188951474302239, + "loss": 1.0107, + "step": 3410 + }, + { + "epoch": 0.6073717948717948, + "grad_norm": 0.5394250750541687, + "learning_rate": 0.00018894507789525843, + "loss": 1.4081, + "step": 3411 + }, + { + "epoch": 0.6075498575498576, + "grad_norm": 0.5612049102783203, + "learning_rate": 0.00018893867974558383, + "loss": 1.1015, + "step": 3412 + }, + { + "epoch": 0.6077279202279202, + "grad_norm": 0.4794061779975891, + "learning_rate": 0.00018893227985334056, + "loss": 1.2103, + "step": 3413 + }, + { + "epoch": 0.6079059829059829, + "grad_norm": 0.6060562133789062, + "learning_rate": 0.00018892587821865402, + "loss": 1.3693, + "step": 3414 + }, + { + "epoch": 0.6080840455840456, + "grad_norm": 0.44624534249305725, + "learning_rate": 0.00018891947484164963, + "loss": 0.8209, + "step": 3415 + }, + { + "epoch": 0.6082621082621082, + "grad_norm": 0.49297213554382324, + "learning_rate": 0.0001889130697224528, 
+ "loss": 1.2027, + "step": 3416 + }, + { + "epoch": 0.6084401709401709, + "grad_norm": 0.4431746304035187, + "learning_rate": 0.0001889066628611891, + "loss": 1.0347, + "step": 3417 + }, + { + "epoch": 0.6086182336182336, + "grad_norm": 0.5425933599472046, + "learning_rate": 0.00018890025425798404, + "loss": 1.0556, + "step": 3418 + }, + { + "epoch": 0.6087962962962963, + "grad_norm": 0.5502763390541077, + "learning_rate": 0.00018889384391296315, + "loss": 1.2362, + "step": 3419 + }, + { + "epoch": 0.6089743589743589, + "grad_norm": 0.5442292094230652, + "learning_rate": 0.00018888743182625203, + "loss": 1.1306, + "step": 3420 + }, + { + "epoch": 0.6091524216524217, + "grad_norm": 0.4651123583316803, + "learning_rate": 0.00018888101799797636, + "loss": 0.9305, + "step": 3421 + }, + { + "epoch": 0.6093304843304843, + "grad_norm": 0.4713892340660095, + "learning_rate": 0.00018887460242826177, + "loss": 1.0789, + "step": 3422 + }, + { + "epoch": 0.6095085470085471, + "grad_norm": 0.5283244848251343, + "learning_rate": 0.00018886818511723398, + "loss": 1.345, + "step": 3423 + }, + { + "epoch": 0.6096866096866097, + "grad_norm": 0.5527324080467224, + "learning_rate": 0.0001888617660650187, + "loss": 1.1297, + "step": 3424 + }, + { + "epoch": 0.6098646723646723, + "grad_norm": 0.5412901043891907, + "learning_rate": 0.00018885534527174168, + "loss": 1.1213, + "step": 3425 + }, + { + "epoch": 0.6100427350427351, + "grad_norm": 0.5295354127883911, + "learning_rate": 0.00018884892273752878, + "loss": 1.1217, + "step": 3426 + }, + { + "epoch": 0.6102207977207977, + "grad_norm": 0.461900532245636, + "learning_rate": 0.0001888424984625058, + "loss": 0.827, + "step": 3427 + }, + { + "epoch": 0.6103988603988604, + "grad_norm": 0.4922671616077423, + "learning_rate": 0.00018883607244679865, + "loss": 1.2216, + "step": 3428 + }, + { + "epoch": 0.6105769230769231, + "grad_norm": 0.5080927014350891, + "learning_rate": 0.00018882964469053317, + "loss": 1.2446, + "step": 3429 + }, + { 
+ "epoch": 0.6107549857549858, + "grad_norm": 0.5523943901062012, + "learning_rate": 0.00018882321519383534, + "loss": 1.3346, + "step": 3430 + }, + { + "epoch": 0.6109330484330484, + "grad_norm": 0.5105271935462952, + "learning_rate": 0.0001888167839568311, + "loss": 1.1311, + "step": 3431 + }, + { + "epoch": 0.6111111111111112, + "grad_norm": 0.5635872483253479, + "learning_rate": 0.0001888103509796465, + "loss": 1.1875, + "step": 3432 + }, + { + "epoch": 0.6112891737891738, + "grad_norm": 0.4619547426700592, + "learning_rate": 0.00018880391626240755, + "loss": 0.9176, + "step": 3433 + }, + { + "epoch": 0.6114672364672364, + "grad_norm": 0.5896356105804443, + "learning_rate": 0.00018879747980524034, + "loss": 1.0251, + "step": 3434 + }, + { + "epoch": 0.6116452991452992, + "grad_norm": 0.49062737822532654, + "learning_rate": 0.000188791041608271, + "loss": 1.1598, + "step": 3435 + }, + { + "epoch": 0.6118233618233618, + "grad_norm": 0.45717164874076843, + "learning_rate": 0.00018878460167162558, + "loss": 0.8647, + "step": 3436 + }, + { + "epoch": 0.6120014245014245, + "grad_norm": 0.5903525352478027, + "learning_rate": 0.00018877815999543038, + "loss": 0.9671, + "step": 3437 + }, + { + "epoch": 0.6121794871794872, + "grad_norm": 0.5315384268760681, + "learning_rate": 0.00018877171657981153, + "loss": 1.1759, + "step": 3438 + }, + { + "epoch": 0.6123575498575499, + "grad_norm": 0.5650150775909424, + "learning_rate": 0.0001887652714248953, + "loss": 1.0128, + "step": 3439 + }, + { + "epoch": 0.6125356125356125, + "grad_norm": 0.49841752648353577, + "learning_rate": 0.000188758824530808, + "loss": 1.1259, + "step": 3440 + }, + { + "epoch": 0.6127136752136753, + "grad_norm": 0.4985620975494385, + "learning_rate": 0.00018875237589767593, + "loss": 1.0158, + "step": 3441 + }, + { + "epoch": 0.6128917378917379, + "grad_norm": 0.45266565680503845, + "learning_rate": 0.00018874592552562536, + "loss": 0.93, + "step": 3442 + }, + { + "epoch": 0.6130698005698005, + 
"grad_norm": 0.5696130990982056, + "learning_rate": 0.00018873947341478274, + "loss": 1.1432, + "step": 3443 + }, + { + "epoch": 0.6132478632478633, + "grad_norm": 0.5211645364761353, + "learning_rate": 0.00018873301956527451, + "loss": 1.1317, + "step": 3444 + }, + { + "epoch": 0.6134259259259259, + "grad_norm": 0.4991866946220398, + "learning_rate": 0.00018872656397722707, + "loss": 1.0362, + "step": 3445 + }, + { + "epoch": 0.6136039886039886, + "grad_norm": 0.5109508037567139, + "learning_rate": 0.00018872010665076694, + "loss": 1.2728, + "step": 3446 + }, + { + "epoch": 0.6137820512820513, + "grad_norm": 0.5838373899459839, + "learning_rate": 0.00018871364758602058, + "loss": 1.1131, + "step": 3447 + }, + { + "epoch": 0.613960113960114, + "grad_norm": 0.5139824151992798, + "learning_rate": 0.00018870718678311462, + "loss": 1.238, + "step": 3448 + }, + { + "epoch": 0.6141381766381766, + "grad_norm": 0.4852082431316376, + "learning_rate": 0.00018870072424217562, + "loss": 1.0677, + "step": 3449 + }, + { + "epoch": 0.6143162393162394, + "grad_norm": 0.5312315225601196, + "learning_rate": 0.00018869425996333018, + "loss": 1.178, + "step": 3450 + }, + { + "epoch": 0.614494301994302, + "grad_norm": 0.6343565583229065, + "learning_rate": 0.00018868779394670492, + "loss": 0.8839, + "step": 3451 + }, + { + "epoch": 0.6146723646723646, + "grad_norm": 0.6029773950576782, + "learning_rate": 0.00018868132619242662, + "loss": 1.1188, + "step": 3452 + }, + { + "epoch": 0.6148504273504274, + "grad_norm": 0.5246016383171082, + "learning_rate": 0.00018867485670062193, + "loss": 1.0797, + "step": 3453 + }, + { + "epoch": 0.61502849002849, + "grad_norm": 0.49307698011398315, + "learning_rate": 0.00018866838547141763, + "loss": 0.9749, + "step": 3454 + }, + { + "epoch": 0.6152065527065527, + "grad_norm": 0.5232903361320496, + "learning_rate": 0.00018866191250494052, + "loss": 1.0785, + "step": 3455 + }, + { + "epoch": 0.6153846153846154, + "grad_norm": 0.5545645356178284, + 
"learning_rate": 0.0001886554378013174, + "loss": 1.0496, + "step": 3456 + }, + { + "epoch": 0.6155626780626781, + "grad_norm": 0.493945837020874, + "learning_rate": 0.00018864896136067515, + "loss": 0.9248, + "step": 3457 + }, + { + "epoch": 0.6157407407407407, + "grad_norm": 0.5223548412322998, + "learning_rate": 0.00018864248318314065, + "loss": 1.0617, + "step": 3458 + }, + { + "epoch": 0.6159188034188035, + "grad_norm": 0.5666514039039612, + "learning_rate": 0.00018863600326884082, + "loss": 0.9981, + "step": 3459 + }, + { + "epoch": 0.6160968660968661, + "grad_norm": 0.4648127257823944, + "learning_rate": 0.00018862952161790265, + "loss": 0.917, + "step": 3460 + }, + { + "epoch": 0.6162749287749287, + "grad_norm": 0.590326189994812, + "learning_rate": 0.0001886230382304531, + "loss": 1.044, + "step": 3461 + }, + { + "epoch": 0.6164529914529915, + "grad_norm": 0.5511625409126282, + "learning_rate": 0.00018861655310661925, + "loss": 1.0988, + "step": 3462 + }, + { + "epoch": 0.6166310541310541, + "grad_norm": 0.567182183265686, + "learning_rate": 0.0001886100662465281, + "loss": 1.3017, + "step": 3463 + }, + { + "epoch": 0.6168091168091168, + "grad_norm": 0.5708897709846497, + "learning_rate": 0.0001886035776503068, + "loss": 0.9123, + "step": 3464 + }, + { + "epoch": 0.6169871794871795, + "grad_norm": 0.4945180416107178, + "learning_rate": 0.0001885970873180824, + "loss": 1.1645, + "step": 3465 + }, + { + "epoch": 0.6171652421652422, + "grad_norm": 0.4713336229324341, + "learning_rate": 0.00018859059524998215, + "loss": 1.0546, + "step": 3466 + }, + { + "epoch": 0.6173433048433048, + "grad_norm": 0.532859206199646, + "learning_rate": 0.0001885841014461332, + "loss": 1.0795, + "step": 3467 + }, + { + "epoch": 0.6175213675213675, + "grad_norm": 0.5165733695030212, + "learning_rate": 0.00018857760590666284, + "loss": 1.1284, + "step": 3468 + }, + { + "epoch": 0.6176994301994302, + "grad_norm": 0.48623126745224, + "learning_rate": 0.00018857110863169826, + "loss": 
0.8618, + "step": 3469 + }, + { + "epoch": 0.6178774928774928, + "grad_norm": 0.628559947013855, + "learning_rate": 0.0001885646096213668, + "loss": 1.1089, + "step": 3470 + }, + { + "epoch": 0.6180555555555556, + "grad_norm": 0.503545880317688, + "learning_rate": 0.0001885581088757958, + "loss": 1.2311, + "step": 3471 + }, + { + "epoch": 0.6182336182336182, + "grad_norm": 0.6172101497650146, + "learning_rate": 0.00018855160639511264, + "loss": 1.2651, + "step": 3472 + }, + { + "epoch": 0.6184116809116809, + "grad_norm": 0.49572527408599854, + "learning_rate": 0.00018854510217944465, + "loss": 1.1026, + "step": 3473 + }, + { + "epoch": 0.6185897435897436, + "grad_norm": 0.5373549461364746, + "learning_rate": 0.00018853859622891938, + "loss": 1.2562, + "step": 3474 + }, + { + "epoch": 0.6187678062678063, + "grad_norm": 0.5272396206855774, + "learning_rate": 0.0001885320885436642, + "loss": 1.1763, + "step": 3475 + }, + { + "epoch": 0.6189458689458689, + "grad_norm": 0.46584269404411316, + "learning_rate": 0.00018852557912380665, + "loss": 1.1762, + "step": 3476 + }, + { + "epoch": 0.6191239316239316, + "grad_norm": 0.4798245131969452, + "learning_rate": 0.0001885190679694743, + "loss": 0.9229, + "step": 3477 + }, + { + "epoch": 0.6193019943019943, + "grad_norm": 0.5221366286277771, + "learning_rate": 0.0001885125550807947, + "loss": 1.1078, + "step": 3478 + }, + { + "epoch": 0.6194800569800569, + "grad_norm": 0.5051897168159485, + "learning_rate": 0.0001885060404578954, + "loss": 1.0055, + "step": 3479 + }, + { + "epoch": 0.6196581196581197, + "grad_norm": 0.492662250995636, + "learning_rate": 0.00018849952410090413, + "loss": 1.1172, + "step": 3480 + }, + { + "epoch": 0.6198361823361823, + "grad_norm": 0.4906775951385498, + "learning_rate": 0.00018849300600994853, + "loss": 1.1223, + "step": 3481 + }, + { + "epoch": 0.6200142450142451, + "grad_norm": 0.5032641291618347, + "learning_rate": 0.0001884864861851563, + "loss": 0.9541, + "step": 3482 + }, + { + "epoch": 
0.6201923076923077, + "grad_norm": 0.5262296795845032, + "learning_rate": 0.00018847996462665521, + "loss": 1.021, + "step": 3483 + }, + { + "epoch": 0.6203703703703703, + "grad_norm": 0.5253522992134094, + "learning_rate": 0.00018847344133457295, + "loss": 0.9075, + "step": 3484 + }, + { + "epoch": 0.6205484330484331, + "grad_norm": 0.4204299747943878, + "learning_rate": 0.00018846691630903744, + "loss": 0.895, + "step": 3485 + }, + { + "epoch": 0.6207264957264957, + "grad_norm": 0.557604193687439, + "learning_rate": 0.0001884603895501765, + "loss": 1.1758, + "step": 3486 + }, + { + "epoch": 0.6209045584045584, + "grad_norm": 0.5981321930885315, + "learning_rate": 0.00018845386105811795, + "loss": 1.1087, + "step": 3487 + }, + { + "epoch": 0.6210826210826211, + "grad_norm": 0.5285581946372986, + "learning_rate": 0.00018844733083298975, + "loss": 1.0692, + "step": 3488 + }, + { + "epoch": 0.6212606837606838, + "grad_norm": 0.5403170585632324, + "learning_rate": 0.00018844079887491986, + "loss": 1.1998, + "step": 3489 + }, + { + "epoch": 0.6214387464387464, + "grad_norm": 0.5471615791320801, + "learning_rate": 0.0001884342651840362, + "loss": 0.9556, + "step": 3490 + }, + { + "epoch": 0.6216168091168092, + "grad_norm": 0.6126871705055237, + "learning_rate": 0.00018842772976046686, + "loss": 1.2629, + "step": 3491 + }, + { + "epoch": 0.6217948717948718, + "grad_norm": 0.45669353008270264, + "learning_rate": 0.00018842119260433982, + "loss": 1.0203, + "step": 3492 + }, + { + "epoch": 0.6219729344729344, + "grad_norm": 0.4998520612716675, + "learning_rate": 0.0001884146537157832, + "loss": 1.0271, + "step": 3493 + }, + { + "epoch": 0.6221509971509972, + "grad_norm": 0.5820242166519165, + "learning_rate": 0.00018840811309492507, + "loss": 1.0321, + "step": 3494 + }, + { + "epoch": 0.6223290598290598, + "grad_norm": 0.581676185131073, + "learning_rate": 0.00018840157074189367, + "loss": 0.9219, + "step": 3495 + }, + { + "epoch": 0.6225071225071225, + "grad_norm": 
0.6044120788574219, + "learning_rate": 0.0001883950266568171, + "loss": 1.1621, + "step": 3496 + }, + { + "epoch": 0.6226851851851852, + "grad_norm": 0.5448858737945557, + "learning_rate": 0.0001883884808398236, + "loss": 1.0686, + "step": 3497 + }, + { + "epoch": 0.6228632478632479, + "grad_norm": 0.4921551048755646, + "learning_rate": 0.00018838193329104143, + "loss": 1.2259, + "step": 3498 + }, + { + "epoch": 0.6230413105413105, + "grad_norm": 0.5374335646629333, + "learning_rate": 0.00018837538401059888, + "loss": 1.2608, + "step": 3499 + }, + { + "epoch": 0.6232193732193733, + "grad_norm": 0.5123008489608765, + "learning_rate": 0.0001883688329986243, + "loss": 0.8682, + "step": 3500 + }, + { + "epoch": 0.6233974358974359, + "grad_norm": 0.566145122051239, + "learning_rate": 0.00018836228025524595, + "loss": 1.1807, + "step": 3501 + }, + { + "epoch": 0.6235754985754985, + "grad_norm": 0.6658587455749512, + "learning_rate": 0.00018835572578059233, + "loss": 1.1641, + "step": 3502 + }, + { + "epoch": 0.6237535612535613, + "grad_norm": 0.4992465078830719, + "learning_rate": 0.00018834916957479177, + "loss": 0.9125, + "step": 3503 + }, + { + "epoch": 0.6239316239316239, + "grad_norm": 0.5081812739372253, + "learning_rate": 0.00018834261163797278, + "loss": 1.0939, + "step": 3504 + }, + { + "epoch": 0.6241096866096866, + "grad_norm": 0.5168607234954834, + "learning_rate": 0.0001883360519702638, + "loss": 1.2382, + "step": 3505 + }, + { + "epoch": 0.6242877492877493, + "grad_norm": 0.5517697334289551, + "learning_rate": 0.00018832949057179344, + "loss": 1.206, + "step": 3506 + }, + { + "epoch": 0.624465811965812, + "grad_norm": 0.4505497217178345, + "learning_rate": 0.00018832292744269013, + "loss": 0.8485, + "step": 3507 + }, + { + "epoch": 0.6246438746438746, + "grad_norm": 0.5230690240859985, + "learning_rate": 0.0001883163625830826, + "loss": 1.1701, + "step": 3508 + }, + { + "epoch": 0.6248219373219374, + "grad_norm": 0.5062205195426941, + "learning_rate": 
0.00018830979599309937, + "loss": 1.0602, + "step": 3509 + }, + { + "epoch": 0.625, + "grad_norm": 0.49922460317611694, + "learning_rate": 0.00018830322767286913, + "loss": 1.1937, + "step": 3510 + }, + { + "epoch": 0.6251780626780626, + "grad_norm": 0.4637366831302643, + "learning_rate": 0.0001882966576225206, + "loss": 1.038, + "step": 3511 + }, + { + "epoch": 0.6253561253561254, + "grad_norm": 0.5330080389976501, + "learning_rate": 0.00018829008584218246, + "loss": 0.9308, + "step": 3512 + }, + { + "epoch": 0.625534188034188, + "grad_norm": 0.5443428754806519, + "learning_rate": 0.0001882835123319835, + "loss": 1.0006, + "step": 3513 + }, + { + "epoch": 0.6257122507122507, + "grad_norm": 0.5534018874168396, + "learning_rate": 0.00018827693709205253, + "loss": 1.2383, + "step": 3514 + }, + { + "epoch": 0.6258903133903134, + "grad_norm": 0.49207547307014465, + "learning_rate": 0.00018827036012251832, + "loss": 0.9804, + "step": 3515 + }, + { + "epoch": 0.6260683760683761, + "grad_norm": 0.4900086224079132, + "learning_rate": 0.0001882637814235098, + "loss": 1.012, + "step": 3516 + }, + { + "epoch": 0.6262464387464387, + "grad_norm": 0.5267475247383118, + "learning_rate": 0.00018825720099515585, + "loss": 1.1104, + "step": 3517 + }, + { + "epoch": 0.6264245014245015, + "grad_norm": 0.5711902379989624, + "learning_rate": 0.00018825061883758534, + "loss": 1.0616, + "step": 3518 + }, + { + "epoch": 0.6266025641025641, + "grad_norm": 0.5007771849632263, + "learning_rate": 0.0001882440349509273, + "loss": 0.9578, + "step": 3519 + }, + { + "epoch": 0.6267806267806267, + "grad_norm": 0.5657192468643188, + "learning_rate": 0.00018823744933531075, + "loss": 1.2768, + "step": 3520 + }, + { + "epoch": 0.6269586894586895, + "grad_norm": 0.6077173352241516, + "learning_rate": 0.00018823086199086462, + "loss": 1.147, + "step": 3521 + }, + { + "epoch": 0.6271367521367521, + "grad_norm": 0.5114718079566956, + "learning_rate": 0.000188224272917718, + "loss": 1.1176, + "step": 3522 
+ }, + { + "epoch": 0.6273148148148148, + "grad_norm": 0.4831676185131073, + "learning_rate": 0.0001882176821160001, + "loss": 0.8021, + "step": 3523 + }, + { + "epoch": 0.6274928774928775, + "grad_norm": 0.6327390670776367, + "learning_rate": 0.00018821108958583994, + "loss": 0.9449, + "step": 3524 + }, + { + "epoch": 0.6276709401709402, + "grad_norm": 0.5541796684265137, + "learning_rate": 0.00018820449532736672, + "loss": 1.2018, + "step": 3525 + }, + { + "epoch": 0.6278490028490028, + "grad_norm": 0.5224639773368835, + "learning_rate": 0.00018819789934070968, + "loss": 1.0138, + "step": 3526 + }, + { + "epoch": 0.6280270655270656, + "grad_norm": 0.49359360337257385, + "learning_rate": 0.00018819130162599798, + "loss": 1.0768, + "step": 3527 + }, + { + "epoch": 0.6282051282051282, + "grad_norm": 0.5525050759315491, + "learning_rate": 0.00018818470218336092, + "loss": 1.0883, + "step": 3528 + }, + { + "epoch": 0.6283831908831908, + "grad_norm": 0.5563427209854126, + "learning_rate": 0.00018817810101292787, + "loss": 1.1491, + "step": 3529 + }, + { + "epoch": 0.6285612535612536, + "grad_norm": 0.49363306164741516, + "learning_rate": 0.00018817149811482803, + "loss": 1.1409, + "step": 3530 + }, + { + "epoch": 0.6287393162393162, + "grad_norm": 0.5102340579032898, + "learning_rate": 0.00018816489348919086, + "loss": 1.1914, + "step": 3531 + }, + { + "epoch": 0.6289173789173789, + "grad_norm": 0.5173332691192627, + "learning_rate": 0.00018815828713614576, + "loss": 0.9308, + "step": 3532 + }, + { + "epoch": 0.6290954415954416, + "grad_norm": 0.5093010067939758, + "learning_rate": 0.00018815167905582216, + "loss": 0.9429, + "step": 3533 + }, + { + "epoch": 0.6292735042735043, + "grad_norm": 0.5453153848648071, + "learning_rate": 0.00018814506924834954, + "loss": 1.0147, + "step": 3534 + }, + { + "epoch": 0.6294515669515669, + "grad_norm": 0.5850773453712463, + "learning_rate": 0.00018813845771385737, + "loss": 1.3372, + "step": 3535 + }, + { + "epoch": 
0.6296296296296297, + "grad_norm": 0.5095621943473816, + "learning_rate": 0.00018813184445247525, + "loss": 1.0515, + "step": 3536 + }, + { + "epoch": 0.6298076923076923, + "grad_norm": 0.6216054558753967, + "learning_rate": 0.00018812522946433266, + "loss": 0.8703, + "step": 3537 + }, + { + "epoch": 0.6299857549857549, + "grad_norm": 0.4945531189441681, + "learning_rate": 0.00018811861274955932, + "loss": 1.1485, + "step": 3538 + }, + { + "epoch": 0.6301638176638177, + "grad_norm": 0.47882601618766785, + "learning_rate": 0.00018811199430828477, + "loss": 1.1107, + "step": 3539 + }, + { + "epoch": 0.6303418803418803, + "grad_norm": 0.5005326867103577, + "learning_rate": 0.00018810537414063876, + "loss": 1.0237, + "step": 3540 + }, + { + "epoch": 0.6305199430199431, + "grad_norm": 0.5382370352745056, + "learning_rate": 0.00018809875224675093, + "loss": 0.9965, + "step": 3541 + }, + { + "epoch": 0.6306980056980057, + "grad_norm": 0.47002625465393066, + "learning_rate": 0.0001880921286267511, + "loss": 1.065, + "step": 3542 + }, + { + "epoch": 0.6308760683760684, + "grad_norm": 0.4519105851650238, + "learning_rate": 0.00018808550328076897, + "loss": 0.9312, + "step": 3543 + }, + { + "epoch": 0.6310541310541311, + "grad_norm": 0.45360881090164185, + "learning_rate": 0.0001880788762089344, + "loss": 1.0739, + "step": 3544 + }, + { + "epoch": 0.6312321937321937, + "grad_norm": 0.5578218698501587, + "learning_rate": 0.00018807224741137723, + "loss": 1.2478, + "step": 3545 + }, + { + "epoch": 0.6314102564102564, + "grad_norm": 0.4838615655899048, + "learning_rate": 0.0001880656168882273, + "loss": 1.0221, + "step": 3546 + }, + { + "epoch": 0.6315883190883191, + "grad_norm": 0.5733556747436523, + "learning_rate": 0.0001880589846396146, + "loss": 1.1249, + "step": 3547 + }, + { + "epoch": 0.6317663817663818, + "grad_norm": 0.4939686954021454, + "learning_rate": 0.00018805235066566894, + "loss": 0.8559, + "step": 3548 + }, + { + "epoch": 0.6319444444444444, + "grad_norm": 
0.5072234869003296, + "learning_rate": 0.00018804571496652044, + "loss": 1.0842, + "step": 3549 + }, + { + "epoch": 0.6321225071225072, + "grad_norm": 0.4640493392944336, + "learning_rate": 0.00018803907754229903, + "loss": 1.0728, + "step": 3550 + }, + { + "epoch": 0.6323005698005698, + "grad_norm": 0.5314788818359375, + "learning_rate": 0.00018803243839313481, + "loss": 1.0752, + "step": 3551 + }, + { + "epoch": 0.6324786324786325, + "grad_norm": 0.5511462092399597, + "learning_rate": 0.0001880257975191578, + "loss": 1.0238, + "step": 3552 + }, + { + "epoch": 0.6326566951566952, + "grad_norm": 0.4980711042881012, + "learning_rate": 0.00018801915492049816, + "loss": 1.0981, + "step": 3553 + }, + { + "epoch": 0.6328347578347578, + "grad_norm": 0.7746123671531677, + "learning_rate": 0.00018801251059728604, + "loss": 1.0968, + "step": 3554 + }, + { + "epoch": 0.6330128205128205, + "grad_norm": 0.5006106495857239, + "learning_rate": 0.00018800586454965155, + "loss": 1.1802, + "step": 3555 + }, + { + "epoch": 0.6331908831908832, + "grad_norm": 0.49427780508995056, + "learning_rate": 0.000187999216777725, + "loss": 1.1257, + "step": 3556 + }, + { + "epoch": 0.6333689458689459, + "grad_norm": 0.5484146475791931, + "learning_rate": 0.00018799256728163662, + "loss": 1.1344, + "step": 3557 + }, + { + "epoch": 0.6335470085470085, + "grad_norm": 0.5007877349853516, + "learning_rate": 0.00018798591606151662, + "loss": 1.1328, + "step": 3558 + }, + { + "epoch": 0.6337250712250713, + "grad_norm": 0.5068148970603943, + "learning_rate": 0.00018797926311749544, + "loss": 0.976, + "step": 3559 + }, + { + "epoch": 0.6339031339031339, + "grad_norm": 0.44936859607696533, + "learning_rate": 0.00018797260844970334, + "loss": 0.9735, + "step": 3560 + }, + { + "epoch": 0.6340811965811965, + "grad_norm": 0.4592931866645813, + "learning_rate": 0.0001879659520582707, + "loss": 1.1306, + "step": 3561 + }, + { + "epoch": 0.6342592592592593, + "grad_norm": 0.4664020836353302, + "learning_rate": 
0.00018795929394332795, + "loss": 1.0577, + "step": 3562 + }, + { + "epoch": 0.6344373219373219, + "grad_norm": 0.5638116002082825, + "learning_rate": 0.00018795263410500556, + "loss": 1.1747, + "step": 3563 + }, + { + "epoch": 0.6346153846153846, + "grad_norm": 0.524736225605011, + "learning_rate": 0.00018794597254343401, + "loss": 0.8964, + "step": 3564 + }, + { + "epoch": 0.6347934472934473, + "grad_norm": 0.4645404517650604, + "learning_rate": 0.00018793930925874386, + "loss": 0.8673, + "step": 3565 + }, + { + "epoch": 0.63497150997151, + "grad_norm": 0.4800064265727997, + "learning_rate": 0.00018793264425106558, + "loss": 1.0334, + "step": 3566 + }, + { + "epoch": 0.6351495726495726, + "grad_norm": 0.6202501058578491, + "learning_rate": 0.0001879259775205298, + "loss": 1.1061, + "step": 3567 + }, + { + "epoch": 0.6353276353276354, + "grad_norm": 0.503383457660675, + "learning_rate": 0.00018791930906726718, + "loss": 0.8545, + "step": 3568 + }, + { + "epoch": 0.635505698005698, + "grad_norm": 0.5256780982017517, + "learning_rate": 0.00018791263889140832, + "loss": 1.0785, + "step": 3569 + }, + { + "epoch": 0.6356837606837606, + "grad_norm": 0.47562023997306824, + "learning_rate": 0.00018790596699308392, + "loss": 1.0041, + "step": 3570 + }, + { + "epoch": 0.6358618233618234, + "grad_norm": 0.5103238224983215, + "learning_rate": 0.00018789929337242469, + "loss": 1.1488, + "step": 3571 + }, + { + "epoch": 0.636039886039886, + "grad_norm": 0.5023695826530457, + "learning_rate": 0.0001878926180295614, + "loss": 1.0696, + "step": 3572 + }, + { + "epoch": 0.6362179487179487, + "grad_norm": 0.5302290916442871, + "learning_rate": 0.00018788594096462487, + "loss": 1.0554, + "step": 3573 + }, + { + "epoch": 0.6363960113960114, + "grad_norm": 0.4798361361026764, + "learning_rate": 0.00018787926217774588, + "loss": 0.8872, + "step": 3574 + }, + { + "epoch": 0.6365740740740741, + "grad_norm": 0.5529209971427917, + "learning_rate": 0.00018787258166905527, + "loss": 1.0976, + 
"step": 3575 + }, + { + "epoch": 0.6367521367521367, + "grad_norm": 0.49757125973701477, + "learning_rate": 0.00018786589943868402, + "loss": 1.0049, + "step": 3576 + }, + { + "epoch": 0.6369301994301995, + "grad_norm": 0.5497848391532898, + "learning_rate": 0.00018785921548676295, + "loss": 1.2272, + "step": 3577 + }, + { + "epoch": 0.6371082621082621, + "grad_norm": 0.5061752200126648, + "learning_rate": 0.0001878525298134231, + "loss": 1.0307, + "step": 3578 + }, + { + "epoch": 0.6372863247863247, + "grad_norm": 0.5427432656288147, + "learning_rate": 0.00018784584241879538, + "loss": 1.1064, + "step": 3579 + }, + { + "epoch": 0.6374643874643875, + "grad_norm": 0.48312774300575256, + "learning_rate": 0.0001878391533030109, + "loss": 1.078, + "step": 3580 + }, + { + "epoch": 0.6376424501424501, + "grad_norm": 0.5059898495674133, + "learning_rate": 0.00018783246246620067, + "loss": 1.0922, + "step": 3581 + }, + { + "epoch": 0.6378205128205128, + "grad_norm": 0.5144124031066895, + "learning_rate": 0.00018782576990849581, + "loss": 1.0909, + "step": 3582 + }, + { + "epoch": 0.6379985754985755, + "grad_norm": 0.5535032153129578, + "learning_rate": 0.0001878190756300274, + "loss": 1.2579, + "step": 3583 + }, + { + "epoch": 0.6381766381766382, + "grad_norm": 0.49145692586898804, + "learning_rate": 0.00018781237963092667, + "loss": 1.0823, + "step": 3584 + }, + { + "epoch": 0.6383547008547008, + "grad_norm": 0.5245576500892639, + "learning_rate": 0.00018780568191132472, + "loss": 0.9595, + "step": 3585 + }, + { + "epoch": 0.6385327635327636, + "grad_norm": 0.5026637315750122, + "learning_rate": 0.00018779898247135287, + "loss": 1.153, + "step": 3586 + }, + { + "epoch": 0.6387108262108262, + "grad_norm": 0.5092771053314209, + "learning_rate": 0.00018779228131114234, + "loss": 1.0661, + "step": 3587 + }, + { + "epoch": 0.6388888888888888, + "grad_norm": 0.517387330532074, + "learning_rate": 0.00018778557843082444, + "loss": 1.0113, + "step": 3588 + }, + { + "epoch": 
0.6390669515669516, + "grad_norm": 0.5149948000907898, + "learning_rate": 0.00018777887383053047, + "loss": 0.9483, + "step": 3589 + }, + { + "epoch": 0.6392450142450142, + "grad_norm": 0.4854544997215271, + "learning_rate": 0.00018777216751039185, + "loss": 1.22, + "step": 3590 + }, + { + "epoch": 0.6394230769230769, + "grad_norm": 0.5317271947860718, + "learning_rate": 0.0001877654594705399, + "loss": 1.2483, + "step": 3591 + }, + { + "epoch": 0.6396011396011396, + "grad_norm": 0.4554755687713623, + "learning_rate": 0.0001877587497111061, + "loss": 0.9864, + "step": 3592 + }, + { + "epoch": 0.6397792022792023, + "grad_norm": 0.4833736717700958, + "learning_rate": 0.0001877520382322219, + "loss": 0.8895, + "step": 3593 + }, + { + "epoch": 0.6399572649572649, + "grad_norm": 0.5018072724342346, + "learning_rate": 0.00018774532503401878, + "loss": 1.2523, + "step": 3594 + }, + { + "epoch": 0.6401353276353277, + "grad_norm": 0.4478762447834015, + "learning_rate": 0.00018773861011662832, + "loss": 0.8833, + "step": 3595 + }, + { + "epoch": 0.6403133903133903, + "grad_norm": 0.5686985850334167, + "learning_rate": 0.00018773189348018205, + "loss": 0.9934, + "step": 3596 + }, + { + "epoch": 0.6404914529914529, + "grad_norm": 0.5144175291061401, + "learning_rate": 0.00018772517512481157, + "loss": 0.8149, + "step": 3597 + }, + { + "epoch": 0.6406695156695157, + "grad_norm": 0.5359936356544495, + "learning_rate": 0.00018771845505064852, + "loss": 1.1822, + "step": 3598 + }, + { + "epoch": 0.6408475783475783, + "grad_norm": 0.532573938369751, + "learning_rate": 0.00018771173325782457, + "loss": 1.0361, + "step": 3599 + }, + { + "epoch": 0.6410256410256411, + "grad_norm": 0.46121537685394287, + "learning_rate": 0.00018770500974647138, + "loss": 1.0792, + "step": 3600 + }, + { + "epoch": 0.6412037037037037, + "grad_norm": 0.4804821312427521, + "learning_rate": 0.00018769828451672076, + "loss": 1.1119, + "step": 3601 + }, + { + "epoch": 0.6413817663817664, + "grad_norm": 
0.4955114722251892, + "learning_rate": 0.00018769155756870443, + "loss": 0.9312, + "step": 3602 + }, + { + "epoch": 0.6415598290598291, + "grad_norm": 0.4987298250198364, + "learning_rate": 0.00018768482890255415, + "loss": 1.2326, + "step": 3603 + }, + { + "epoch": 0.6417378917378918, + "grad_norm": 0.47216179966926575, + "learning_rate": 0.0001876780985184018, + "loss": 1.0114, + "step": 3604 + }, + { + "epoch": 0.6419159544159544, + "grad_norm": 0.5891931653022766, + "learning_rate": 0.0001876713664163793, + "loss": 1.2963, + "step": 3605 + }, + { + "epoch": 0.6420940170940171, + "grad_norm": 0.4645081162452698, + "learning_rate": 0.00018766463259661846, + "loss": 1.0874, + "step": 3606 + }, + { + "epoch": 0.6422720797720798, + "grad_norm": 0.5275476574897766, + "learning_rate": 0.00018765789705925125, + "loss": 0.9453, + "step": 3607 + }, + { + "epoch": 0.6424501424501424, + "grad_norm": 0.5884957313537598, + "learning_rate": 0.00018765115980440964, + "loss": 1.0796, + "step": 3608 + }, + { + "epoch": 0.6426282051282052, + "grad_norm": 0.4843178987503052, + "learning_rate": 0.00018764442083222567, + "loss": 1.1657, + "step": 3609 + }, + { + "epoch": 0.6428062678062678, + "grad_norm": 0.5188381671905518, + "learning_rate": 0.00018763768014283126, + "loss": 1.1109, + "step": 3610 + }, + { + "epoch": 0.6429843304843305, + "grad_norm": 0.4101468324661255, + "learning_rate": 0.00018763093773635863, + "loss": 0.895, + "step": 3611 + }, + { + "epoch": 0.6431623931623932, + "grad_norm": 0.4552084505558014, + "learning_rate": 0.00018762419361293979, + "loss": 0.9418, + "step": 3612 + }, + { + "epoch": 0.6433404558404558, + "grad_norm": 0.5924661159515381, + "learning_rate": 0.0001876174477727069, + "loss": 1.2562, + "step": 3613 + }, + { + "epoch": 0.6435185185185185, + "grad_norm": 0.5072348713874817, + "learning_rate": 0.00018761070021579212, + "loss": 1.1501, + "step": 3614 + }, + { + "epoch": 0.6436965811965812, + "grad_norm": 0.5312697887420654, + "learning_rate": 
0.0001876039509423277, + "loss": 1.0751, + "step": 3615 + }, + { + "epoch": 0.6438746438746439, + "grad_norm": 0.6046462059020996, + "learning_rate": 0.0001875971999524458, + "loss": 1.0927, + "step": 3616 + }, + { + "epoch": 0.6440527065527065, + "grad_norm": 0.4992375373840332, + "learning_rate": 0.00018759044724627876, + "loss": 0.96, + "step": 3617 + }, + { + "epoch": 0.6442307692307693, + "grad_norm": 0.4983134865760803, + "learning_rate": 0.00018758369282395886, + "loss": 1.0599, + "step": 3618 + }, + { + "epoch": 0.6444088319088319, + "grad_norm": 0.5655683279037476, + "learning_rate": 0.00018757693668561843, + "loss": 1.2372, + "step": 3619 + }, + { + "epoch": 0.6445868945868946, + "grad_norm": 0.4968827962875366, + "learning_rate": 0.00018757017883138985, + "loss": 1.1639, + "step": 3620 + }, + { + "epoch": 0.6447649572649573, + "grad_norm": 0.5831420421600342, + "learning_rate": 0.00018756341926140553, + "loss": 0.9002, + "step": 3621 + }, + { + "epoch": 0.64494301994302, + "grad_norm": 0.4828467071056366, + "learning_rate": 0.0001875566579757979, + "loss": 0.9201, + "step": 3622 + }, + { + "epoch": 0.6451210826210826, + "grad_norm": 0.5067087411880493, + "learning_rate": 0.00018754989497469943, + "loss": 0.9874, + "step": 3623 + }, + { + "epoch": 0.6452991452991453, + "grad_norm": 0.5182318091392517, + "learning_rate": 0.00018754313025824267, + "loss": 1.1291, + "step": 3624 + }, + { + "epoch": 0.645477207977208, + "grad_norm": 0.472200483083725, + "learning_rate": 0.0001875363638265601, + "loss": 1.0286, + "step": 3625 + }, + { + "epoch": 0.6456552706552706, + "grad_norm": 0.4597308039665222, + "learning_rate": 0.0001875295956797843, + "loss": 0.7517, + "step": 3626 + }, + { + "epoch": 0.6458333333333334, + "grad_norm": 0.5358221530914307, + "learning_rate": 0.00018752282581804798, + "loss": 1.2264, + "step": 3627 + }, + { + "epoch": 0.646011396011396, + "grad_norm": 0.5268992781639099, + "learning_rate": 0.00018751605424148363, + "loss": 1.0801, + 
"step": 3628 + }, + { + "epoch": 0.6461894586894587, + "grad_norm": 0.5917379260063171, + "learning_rate": 0.00018750928095022403, + "loss": 0.9538, + "step": 3629 + }, + { + "epoch": 0.6463675213675214, + "grad_norm": 0.44506707787513733, + "learning_rate": 0.00018750250594440183, + "loss": 0.9818, + "step": 3630 + }, + { + "epoch": 0.646545584045584, + "grad_norm": 0.5578880906105042, + "learning_rate": 0.00018749572922414982, + "loss": 0.9958, + "step": 3631 + }, + { + "epoch": 0.6467236467236467, + "grad_norm": 0.5155318975448608, + "learning_rate": 0.00018748895078960076, + "loss": 1.2888, + "step": 3632 + }, + { + "epoch": 0.6469017094017094, + "grad_norm": 0.5117297768592834, + "learning_rate": 0.0001874821706408874, + "loss": 1.0452, + "step": 3633 + }, + { + "epoch": 0.6470797720797721, + "grad_norm": 0.5169841647148132, + "learning_rate": 0.00018747538877814267, + "loss": 1.1649, + "step": 3634 + }, + { + "epoch": 0.6472578347578347, + "grad_norm": 0.5001181960105896, + "learning_rate": 0.00018746860520149942, + "loss": 1.1472, + "step": 3635 + }, + { + "epoch": 0.6474358974358975, + "grad_norm": 0.6289856433868408, + "learning_rate": 0.00018746181991109056, + "loss": 1.0351, + "step": 3636 + }, + { + "epoch": 0.6476139601139601, + "grad_norm": 0.5490612983703613, + "learning_rate": 0.00018745503290704897, + "loss": 0.8938, + "step": 3637 + }, + { + "epoch": 0.6477920227920227, + "grad_norm": 0.47378283739089966, + "learning_rate": 0.00018744824418950775, + "loss": 0.937, + "step": 3638 + }, + { + "epoch": 0.6479700854700855, + "grad_norm": 0.6079059839248657, + "learning_rate": 0.0001874414537585998, + "loss": 1.0486, + "step": 3639 + }, + { + "epoch": 0.6481481481481481, + "grad_norm": 0.5351769924163818, + "learning_rate": 0.00018743466161445823, + "loss": 1.0316, + "step": 3640 + }, + { + "epoch": 0.6483262108262108, + "grad_norm": 0.5516425967216492, + "learning_rate": 0.0001874278677572161, + "loss": 1.1552, + "step": 3641 + }, + { + "epoch": 
0.6485042735042735, + "grad_norm": 0.5027523636817932, + "learning_rate": 0.0001874210721870065, + "loss": 1.0491, + "step": 3642 + }, + { + "epoch": 0.6486823361823362, + "grad_norm": 0.5596168041229248, + "learning_rate": 0.00018741427490396258, + "loss": 1.0256, + "step": 3643 + }, + { + "epoch": 0.6488603988603988, + "grad_norm": 0.5601046681404114, + "learning_rate": 0.00018740747590821751, + "loss": 1.1604, + "step": 3644 + }, + { + "epoch": 0.6490384615384616, + "grad_norm": 0.49749523401260376, + "learning_rate": 0.0001874006751999046, + "loss": 1.0532, + "step": 3645 + }, + { + "epoch": 0.6492165242165242, + "grad_norm": 0.6226113438606262, + "learning_rate": 0.00018739387277915697, + "loss": 1.1402, + "step": 3646 + }, + { + "epoch": 0.6493945868945868, + "grad_norm": 0.6142009496688843, + "learning_rate": 0.00018738706864610794, + "loss": 1.2437, + "step": 3647 + }, + { + "epoch": 0.6495726495726496, + "grad_norm": 0.48814916610717773, + "learning_rate": 0.00018738026280089084, + "loss": 0.8429, + "step": 3648 + }, + { + "epoch": 0.6497507122507122, + "grad_norm": 0.5717982053756714, + "learning_rate": 0.00018737345524363902, + "loss": 1.1095, + "step": 3649 + }, + { + "epoch": 0.6499287749287749, + "grad_norm": 0.5150009989738464, + "learning_rate": 0.00018736664597448582, + "loss": 1.199, + "step": 3650 + }, + { + "epoch": 0.6501068376068376, + "grad_norm": 0.58461594581604, + "learning_rate": 0.00018735983499356472, + "loss": 1.0704, + "step": 3651 + }, + { + "epoch": 0.6502849002849003, + "grad_norm": 0.5108643770217896, + "learning_rate": 0.0001873530223010091, + "loss": 1.2039, + "step": 3652 + }, + { + "epoch": 0.6504629629629629, + "grad_norm": 0.513306736946106, + "learning_rate": 0.00018734620789695247, + "loss": 1.1448, + "step": 3653 + }, + { + "epoch": 0.6506410256410257, + "grad_norm": 0.5139986872673035, + "learning_rate": 0.00018733939178152835, + "loss": 1.0023, + "step": 3654 + }, + { + "epoch": 0.6508190883190883, + "grad_norm": 
0.5187703967094421, + "learning_rate": 0.00018733257395487027, + "loss": 1.1304, + "step": 3655 + }, + { + "epoch": 0.6509971509971509, + "grad_norm": 0.5470501184463501, + "learning_rate": 0.00018732575441711183, + "loss": 1.0272, + "step": 3656 + }, + { + "epoch": 0.6511752136752137, + "grad_norm": 0.537309467792511, + "learning_rate": 0.00018731893316838665, + "loss": 1.0806, + "step": 3657 + }, + { + "epoch": 0.6513532763532763, + "grad_norm": 0.5187864899635315, + "learning_rate": 0.00018731211020882836, + "loss": 1.0154, + "step": 3658 + }, + { + "epoch": 0.6515313390313391, + "grad_norm": 0.48373252153396606, + "learning_rate": 0.00018730528553857062, + "loss": 1.0135, + "step": 3659 + }, + { + "epoch": 0.6517094017094017, + "grad_norm": 0.5645000338554382, + "learning_rate": 0.00018729845915774716, + "loss": 0.8924, + "step": 3660 + }, + { + "epoch": 0.6518874643874644, + "grad_norm": 0.5722129940986633, + "learning_rate": 0.00018729163106649178, + "loss": 1.2416, + "step": 3661 + }, + { + "epoch": 0.6520655270655271, + "grad_norm": 0.5904877185821533, + "learning_rate": 0.00018728480126493823, + "loss": 0.9792, + "step": 3662 + }, + { + "epoch": 0.6522435897435898, + "grad_norm": 0.5224713087081909, + "learning_rate": 0.00018727796975322026, + "loss": 1.079, + "step": 3663 + }, + { + "epoch": 0.6524216524216524, + "grad_norm": 0.5667217969894409, + "learning_rate": 0.00018727113653147184, + "loss": 1.1397, + "step": 3664 + }, + { + "epoch": 0.6525997150997151, + "grad_norm": 0.5274622440338135, + "learning_rate": 0.00018726430159982677, + "loss": 1.0569, + "step": 3665 + }, + { + "epoch": 0.6527777777777778, + "grad_norm": 0.5745310187339783, + "learning_rate": 0.00018725746495841896, + "loss": 1.2129, + "step": 3666 + }, + { + "epoch": 0.6529558404558404, + "grad_norm": 0.6123398542404175, + "learning_rate": 0.0001872506266073824, + "loss": 1.186, + "step": 3667 + }, + { + "epoch": 0.6531339031339032, + "grad_norm": 0.4983387291431427, + "learning_rate": 
0.00018724378654685106, + "loss": 1.1957, + "step": 3668 + }, + { + "epoch": 0.6533119658119658, + "grad_norm": 0.5584192276000977, + "learning_rate": 0.00018723694477695897, + "loss": 1.0939, + "step": 3669 + }, + { + "epoch": 0.6534900284900285, + "grad_norm": 0.5318745374679565, + "learning_rate": 0.00018723010129784016, + "loss": 1.1869, + "step": 3670 + }, + { + "epoch": 0.6536680911680912, + "grad_norm": 0.4607617259025574, + "learning_rate": 0.0001872232561096287, + "loss": 0.8447, + "step": 3671 + }, + { + "epoch": 0.6538461538461539, + "grad_norm": 0.5312213897705078, + "learning_rate": 0.00018721640921245874, + "loss": 1.0623, + "step": 3672 + }, + { + "epoch": 0.6540242165242165, + "grad_norm": 0.5099136233329773, + "learning_rate": 0.0001872095606064644, + "loss": 0.7174, + "step": 3673 + }, + { + "epoch": 0.6542022792022792, + "grad_norm": 0.6894404888153076, + "learning_rate": 0.0001872027102917799, + "loss": 1.0251, + "step": 3674 + }, + { + "epoch": 0.6543803418803419, + "grad_norm": 0.5758535861968994, + "learning_rate": 0.00018719585826853944, + "loss": 1.1655, + "step": 3675 + }, + { + "epoch": 0.6545584045584045, + "grad_norm": 0.521824061870575, + "learning_rate": 0.0001871890045368773, + "loss": 1.1653, + "step": 3676 + }, + { + "epoch": 0.6547364672364673, + "grad_norm": 0.5370712280273438, + "learning_rate": 0.00018718214909692771, + "loss": 1.3152, + "step": 3677 + }, + { + "epoch": 0.6549145299145299, + "grad_norm": 0.4459827244281769, + "learning_rate": 0.000187175291948825, + "loss": 1.0953, + "step": 3678 + }, + { + "epoch": 0.6550925925925926, + "grad_norm": 0.44131460785865784, + "learning_rate": 0.00018716843309270353, + "loss": 0.8568, + "step": 3679 + }, + { + "epoch": 0.6552706552706553, + "grad_norm": 0.5529624819755554, + "learning_rate": 0.00018716157252869772, + "loss": 1.2085, + "step": 3680 + }, + { + "epoch": 0.655448717948718, + "grad_norm": 0.44604751467704773, + "learning_rate": 0.00018715471025694194, + "loss": 0.9605, 
+ "step": 3681 + }, + { + "epoch": 0.6556267806267806, + "grad_norm": 0.4662449359893799, + "learning_rate": 0.0001871478462775707, + "loss": 1.2092, + "step": 3682 + }, + { + "epoch": 0.6558048433048433, + "grad_norm": 0.42632922530174255, + "learning_rate": 0.0001871409805907184, + "loss": 0.9141, + "step": 3683 + }, + { + "epoch": 0.655982905982906, + "grad_norm": 0.534009575843811, + "learning_rate": 0.00018713411319651958, + "loss": 1.0147, + "step": 3684 + }, + { + "epoch": 0.6561609686609686, + "grad_norm": 0.5433241724967957, + "learning_rate": 0.00018712724409510888, + "loss": 1.1998, + "step": 3685 + }, + { + "epoch": 0.6563390313390314, + "grad_norm": 0.4771319627761841, + "learning_rate": 0.0001871203732866208, + "loss": 1.0384, + "step": 3686 + }, + { + "epoch": 0.656517094017094, + "grad_norm": 0.507641077041626, + "learning_rate": 0.00018711350077119, + "loss": 0.9608, + "step": 3687 + }, + { + "epoch": 0.6566951566951567, + "grad_norm": 0.5069413185119629, + "learning_rate": 0.00018710662654895108, + "loss": 1.055, + "step": 3688 + }, + { + "epoch": 0.6568732193732194, + "grad_norm": 0.512340247631073, + "learning_rate": 0.00018709975062003876, + "loss": 0.9506, + "step": 3689 + }, + { + "epoch": 0.657051282051282, + "grad_norm": 0.5156390070915222, + "learning_rate": 0.00018709287298458778, + "loss": 1.0089, + "step": 3690 + }, + { + "epoch": 0.6572293447293447, + "grad_norm": 0.5101696252822876, + "learning_rate": 0.0001870859936427329, + "loss": 1.0441, + "step": 3691 + }, + { + "epoch": 0.6574074074074074, + "grad_norm": 0.4394689202308655, + "learning_rate": 0.00018707911259460884, + "loss": 0.9124, + "step": 3692 + }, + { + "epoch": 0.6575854700854701, + "grad_norm": 0.4842554032802582, + "learning_rate": 0.00018707222984035043, + "loss": 1.0051, + "step": 3693 + }, + { + "epoch": 0.6577635327635327, + "grad_norm": 0.6418108344078064, + "learning_rate": 0.00018706534538009262, + "loss": 1.1165, + "step": 3694 + }, + { + "epoch": 
0.6579415954415955, + "grad_norm": 0.5596832036972046, + "learning_rate": 0.00018705845921397022, + "loss": 1.1127, + "step": 3695 + }, + { + "epoch": 0.6581196581196581, + "grad_norm": 0.6692909002304077, + "learning_rate": 0.00018705157134211813, + "loss": 1.2403, + "step": 3696 + }, + { + "epoch": 0.6582977207977208, + "grad_norm": 0.5046468377113342, + "learning_rate": 0.00018704468176467134, + "loss": 1.1016, + "step": 3697 + }, + { + "epoch": 0.6584757834757835, + "grad_norm": 0.6723586320877075, + "learning_rate": 0.00018703779048176485, + "loss": 1.1777, + "step": 3698 + }, + { + "epoch": 0.6586538461538461, + "grad_norm": 0.5269754528999329, + "learning_rate": 0.00018703089749353365, + "loss": 1.1441, + "step": 3699 + }, + { + "epoch": 0.6588319088319088, + "grad_norm": 0.5303323268890381, + "learning_rate": 0.0001870240028001128, + "loss": 1.07, + "step": 3700 + }, + { + "epoch": 0.6590099715099715, + "grad_norm": 0.4795511066913605, + "learning_rate": 0.00018701710640163738, + "loss": 1.0189, + "step": 3701 + }, + { + "epoch": 0.6591880341880342, + "grad_norm": 0.514659583568573, + "learning_rate": 0.00018701020829824255, + "loss": 1.0792, + "step": 3702 + }, + { + "epoch": 0.6593660968660968, + "grad_norm": 0.5407463312149048, + "learning_rate": 0.0001870033084900634, + "loss": 0.9346, + "step": 3703 + }, + { + "epoch": 0.6595441595441596, + "grad_norm": 0.5358424186706543, + "learning_rate": 0.0001869964069772352, + "loss": 1.1242, + "step": 3704 + }, + { + "epoch": 0.6597222222222222, + "grad_norm": 0.470825731754303, + "learning_rate": 0.00018698950375989307, + "loss": 0.9952, + "step": 3705 + }, + { + "epoch": 0.6599002849002849, + "grad_norm": 0.5711592435836792, + "learning_rate": 0.00018698259883817236, + "loss": 1.1678, + "step": 3706 + }, + { + "epoch": 0.6600783475783476, + "grad_norm": 0.5298995971679688, + "learning_rate": 0.00018697569221220832, + "loss": 0.869, + "step": 3707 + }, + { + "epoch": 0.6602564102564102, + "grad_norm": 
0.5453875064849854, + "learning_rate": 0.00018696878388213626, + "loss": 0.9706, + "step": 3708 + }, + { + "epoch": 0.6604344729344729, + "grad_norm": 0.6219926476478577, + "learning_rate": 0.00018696187384809154, + "loss": 1.1902, + "step": 3709 + }, + { + "epoch": 0.6606125356125356, + "grad_norm": 0.5972491502761841, + "learning_rate": 0.00018695496211020953, + "loss": 1.2054, + "step": 3710 + }, + { + "epoch": 0.6607905982905983, + "grad_norm": 0.5048904418945312, + "learning_rate": 0.0001869480486686257, + "loss": 1.0405, + "step": 3711 + }, + { + "epoch": 0.6609686609686609, + "grad_norm": 0.5474200248718262, + "learning_rate": 0.00018694113352347546, + "loss": 1.09, + "step": 3712 + }, + { + "epoch": 0.6611467236467237, + "grad_norm": 0.5073318481445312, + "learning_rate": 0.00018693421667489432, + "loss": 1.0698, + "step": 3713 + }, + { + "epoch": 0.6613247863247863, + "grad_norm": 0.5693208575248718, + "learning_rate": 0.0001869272981230178, + "loss": 0.9664, + "step": 3714 + }, + { + "epoch": 0.6615028490028491, + "grad_norm": 0.5678503513336182, + "learning_rate": 0.00018692037786798143, + "loss": 1.0895, + "step": 3715 + }, + { + "epoch": 0.6616809116809117, + "grad_norm": 0.4950976073741913, + "learning_rate": 0.00018691345590992082, + "loss": 0.9584, + "step": 3716 + }, + { + "epoch": 0.6618589743589743, + "grad_norm": 0.4944666624069214, + "learning_rate": 0.0001869065322489716, + "loss": 0.8607, + "step": 3717 + }, + { + "epoch": 0.6620370370370371, + "grad_norm": 0.5197804570198059, + "learning_rate": 0.0001868996068852694, + "loss": 1.2335, + "step": 3718 + }, + { + "epoch": 0.6622150997150997, + "grad_norm": 0.6550365686416626, + "learning_rate": 0.00018689267981894994, + "loss": 1.0441, + "step": 3719 + }, + { + "epoch": 0.6623931623931624, + "grad_norm": 0.5331503748893738, + "learning_rate": 0.00018688575105014888, + "loss": 1.1696, + "step": 3720 + }, + { + "epoch": 0.6625712250712251, + "grad_norm": 0.47304239869117737, + "learning_rate": 
0.00018687882057900207, + "loss": 0.9695, + "step": 3721 + }, + { + "epoch": 0.6627492877492878, + "grad_norm": 0.5653772354125977, + "learning_rate": 0.00018687188840564524, + "loss": 1.2082, + "step": 3722 + }, + { + "epoch": 0.6629273504273504, + "grad_norm": 0.5323491096496582, + "learning_rate": 0.00018686495453021417, + "loss": 0.9106, + "step": 3723 + }, + { + "epoch": 0.6631054131054132, + "grad_norm": 0.5612817406654358, + "learning_rate": 0.00018685801895284483, + "loss": 1.1302, + "step": 3724 + }, + { + "epoch": 0.6632834757834758, + "grad_norm": 0.4562164545059204, + "learning_rate": 0.000186851081673673, + "loss": 0.8886, + "step": 3725 + }, + { + "epoch": 0.6634615384615384, + "grad_norm": 0.5006430745124817, + "learning_rate": 0.00018684414269283463, + "loss": 0.9128, + "step": 3726 + }, + { + "epoch": 0.6636396011396012, + "grad_norm": 0.5305442810058594, + "learning_rate": 0.0001868372020104657, + "loss": 1.1766, + "step": 3727 + }, + { + "epoch": 0.6638176638176638, + "grad_norm": 0.6129274368286133, + "learning_rate": 0.0001868302596267022, + "loss": 1.04, + "step": 3728 + }, + { + "epoch": 0.6639957264957265, + "grad_norm": 0.5530399084091187, + "learning_rate": 0.00018682331554168013, + "loss": 1.4114, + "step": 3729 + }, + { + "epoch": 0.6641737891737892, + "grad_norm": 0.5397193431854248, + "learning_rate": 0.00018681636975553557, + "loss": 1.1945, + "step": 3730 + }, + { + "epoch": 0.6643518518518519, + "grad_norm": 0.5510205030441284, + "learning_rate": 0.00018680942226840456, + "loss": 1.0489, + "step": 3731 + }, + { + "epoch": 0.6645299145299145, + "grad_norm": 0.5519221425056458, + "learning_rate": 0.00018680247308042324, + "loss": 1.1633, + "step": 3732 + }, + { + "epoch": 0.6647079772079773, + "grad_norm": 0.4848768711090088, + "learning_rate": 0.00018679552219172784, + "loss": 0.8716, + "step": 3733 + }, + { + "epoch": 0.6648860398860399, + "grad_norm": 0.5490246415138245, + "learning_rate": 0.0001867885696024544, + "loss": 1.1347, + 
"step": 3734 + }, + { + "epoch": 0.6650641025641025, + "grad_norm": 0.5281458497047424, + "learning_rate": 0.00018678161531273928, + "loss": 1.0987, + "step": 3735 + }, + { + "epoch": 0.6652421652421653, + "grad_norm": 0.5313079953193665, + "learning_rate": 0.00018677465932271867, + "loss": 0.9705, + "step": 3736 + }, + { + "epoch": 0.6654202279202279, + "grad_norm": 0.5425750017166138, + "learning_rate": 0.0001867677016325289, + "loss": 1.1847, + "step": 3737 + }, + { + "epoch": 0.6655982905982906, + "grad_norm": 0.5796298980712891, + "learning_rate": 0.0001867607422423062, + "loss": 1.2639, + "step": 3738 + }, + { + "epoch": 0.6657763532763533, + "grad_norm": 0.49738675355911255, + "learning_rate": 0.00018675378115218702, + "loss": 1.0536, + "step": 3739 + }, + { + "epoch": 0.665954415954416, + "grad_norm": 0.665250301361084, + "learning_rate": 0.0001867468183623077, + "loss": 1.2836, + "step": 3740 + }, + { + "epoch": 0.6661324786324786, + "grad_norm": 0.5184717178344727, + "learning_rate": 0.00018673985387280469, + "loss": 1.0497, + "step": 3741 + }, + { + "epoch": 0.6663105413105413, + "grad_norm": 0.5129656791687012, + "learning_rate": 0.00018673288768381442, + "loss": 1.2041, + "step": 3742 + }, + { + "epoch": 0.666488603988604, + "grad_norm": 0.5308768153190613, + "learning_rate": 0.00018672591979547337, + "loss": 1.2092, + "step": 3743 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.5059141516685486, + "learning_rate": 0.00018671895020791812, + "loss": 1.1929, + "step": 3744 + }, + { + "epoch": 0.6668447293447294, + "grad_norm": 0.5237857103347778, + "learning_rate": 0.00018671197892128517, + "loss": 1.2538, + "step": 3745 + }, + { + "epoch": 0.667022792022792, + "grad_norm": 0.450000137090683, + "learning_rate": 0.0001867050059357111, + "loss": 0.7138, + "step": 3746 + }, + { + "epoch": 0.6672008547008547, + "grad_norm": 0.5413795709609985, + "learning_rate": 0.00018669803125133258, + "loss": 1.1383, + "step": 3747 + }, + { + "epoch": 
0.6673789173789174, + "grad_norm": 0.4657825529575348, + "learning_rate": 0.00018669105486828622, + "loss": 1.0518, + "step": 3748 + }, + { + "epoch": 0.66755698005698, + "grad_norm": 0.6198551654815674, + "learning_rate": 0.00018668407678670875, + "loss": 1.2697, + "step": 3749 + }, + { + "epoch": 0.6677350427350427, + "grad_norm": 0.5112186074256897, + "learning_rate": 0.00018667709700673685, + "loss": 0.9907, + "step": 3750 + }, + { + "epoch": 0.6679131054131054, + "grad_norm": 0.5446593761444092, + "learning_rate": 0.00018667011552850728, + "loss": 1.0708, + "step": 3751 + }, + { + "epoch": 0.6680911680911681, + "grad_norm": 0.5673866271972656, + "learning_rate": 0.00018666313235215682, + "loss": 1.05, + "step": 3752 + }, + { + "epoch": 0.6682692307692307, + "grad_norm": 0.4821988046169281, + "learning_rate": 0.00018665614747782235, + "loss": 1.0543, + "step": 3753 + }, + { + "epoch": 0.6684472934472935, + "grad_norm": 0.5158842206001282, + "learning_rate": 0.00018664916090564067, + "loss": 1.0331, + "step": 3754 + }, + { + "epoch": 0.6686253561253561, + "grad_norm": 0.45486921072006226, + "learning_rate": 0.00018664217263574865, + "loss": 0.9262, + "step": 3755 + }, + { + "epoch": 0.6688034188034188, + "grad_norm": 0.46193036437034607, + "learning_rate": 0.00018663518266828327, + "loss": 0.9858, + "step": 3756 + }, + { + "epoch": 0.6689814814814815, + "grad_norm": 0.5144094824790955, + "learning_rate": 0.00018662819100338148, + "loss": 1.0302, + "step": 3757 + }, + { + "epoch": 0.6691595441595442, + "grad_norm": 0.5246134400367737, + "learning_rate": 0.0001866211976411802, + "loss": 1.064, + "step": 3758 + }, + { + "epoch": 0.6693376068376068, + "grad_norm": 0.4853166937828064, + "learning_rate": 0.0001866142025818165, + "loss": 0.9481, + "step": 3759 + }, + { + "epoch": 0.6695156695156695, + "grad_norm": 0.5029586553573608, + "learning_rate": 0.00018660720582542743, + "loss": 0.9443, + "step": 3760 + }, + { + "epoch": 0.6696937321937322, + "grad_norm": 
0.5373172163963318, + "learning_rate": 0.0001866002073721501, + "loss": 1.1401, + "step": 3761 + }, + { + "epoch": 0.6698717948717948, + "grad_norm": 0.6236287951469421, + "learning_rate": 0.00018659320722212158, + "loss": 1.1255, + "step": 3762 + }, + { + "epoch": 0.6700498575498576, + "grad_norm": 0.5470684766769409, + "learning_rate": 0.00018658620537547903, + "loss": 1.0622, + "step": 3763 + }, + { + "epoch": 0.6702279202279202, + "grad_norm": 0.63177090883255, + "learning_rate": 0.00018657920183235964, + "loss": 0.9736, + "step": 3764 + }, + { + "epoch": 0.6704059829059829, + "grad_norm": 0.5456309914588928, + "learning_rate": 0.00018657219659290068, + "loss": 1.027, + "step": 3765 + }, + { + "epoch": 0.6705840455840456, + "grad_norm": 0.4816138744354248, + "learning_rate": 0.00018656518965723935, + "loss": 0.7801, + "step": 3766 + }, + { + "epoch": 0.6707621082621082, + "grad_norm": 0.4811640679836273, + "learning_rate": 0.00018655818102551294, + "loss": 1.0535, + "step": 3767 + }, + { + "epoch": 0.6709401709401709, + "grad_norm": 0.4677673280239105, + "learning_rate": 0.00018655117069785884, + "loss": 1.1043, + "step": 3768 + }, + { + "epoch": 0.6711182336182336, + "grad_norm": 0.5628635883331299, + "learning_rate": 0.0001865441586744143, + "loss": 1.0392, + "step": 3769 + }, + { + "epoch": 0.6712962962962963, + "grad_norm": 0.5484504103660583, + "learning_rate": 0.00018653714495531673, + "loss": 1.1533, + "step": 3770 + }, + { + "epoch": 0.6714743589743589, + "grad_norm": 0.5830571055412292, + "learning_rate": 0.0001865301295407036, + "loss": 1.2479, + "step": 3771 + }, + { + "epoch": 0.6716524216524217, + "grad_norm": 0.5516841411590576, + "learning_rate": 0.00018652311243071235, + "loss": 1.2152, + "step": 3772 + }, + { + "epoch": 0.6718304843304843, + "grad_norm": 0.6360766291618347, + "learning_rate": 0.0001865160936254804, + "loss": 1.0752, + "step": 3773 + }, + { + "epoch": 0.6720085470085471, + "grad_norm": 0.6038610935211182, + "learning_rate": 
0.00018650907312514533, + "loss": 1.2425, + "step": 3774 + }, + { + "epoch": 0.6721866096866097, + "grad_norm": 0.49572908878326416, + "learning_rate": 0.0001865020509298447, + "loss": 1.0057, + "step": 3775 + }, + { + "epoch": 0.6723646723646723, + "grad_norm": 0.4551616311073303, + "learning_rate": 0.00018649502703971607, + "loss": 1.0763, + "step": 3776 + }, + { + "epoch": 0.6725427350427351, + "grad_norm": 0.6621482372283936, + "learning_rate": 0.00018648800145489706, + "loss": 1.0306, + "step": 3777 + }, + { + "epoch": 0.6727207977207977, + "grad_norm": 0.5523806810379028, + "learning_rate": 0.0001864809741755253, + "loss": 0.9906, + "step": 3778 + }, + { + "epoch": 0.6728988603988604, + "grad_norm": 0.5527048110961914, + "learning_rate": 0.00018647394520173856, + "loss": 1.0734, + "step": 3779 + }, + { + "epoch": 0.6730769230769231, + "grad_norm": 0.573573887348175, + "learning_rate": 0.00018646691453367444, + "loss": 1.1409, + "step": 3780 + }, + { + "epoch": 0.6732549857549858, + "grad_norm": 0.6273239254951477, + "learning_rate": 0.00018645988217147079, + "loss": 0.9682, + "step": 3781 + }, + { + "epoch": 0.6734330484330484, + "grad_norm": 0.4917762279510498, + "learning_rate": 0.00018645284811526534, + "loss": 0.9681, + "step": 3782 + }, + { + "epoch": 0.6736111111111112, + "grad_norm": 0.4901154339313507, + "learning_rate": 0.0001864458123651959, + "loss": 1.1828, + "step": 3783 + }, + { + "epoch": 0.6737891737891738, + "grad_norm": 0.6292546391487122, + "learning_rate": 0.00018643877492140036, + "loss": 1.1987, + "step": 3784 + }, + { + "epoch": 0.6739672364672364, + "grad_norm": 0.5334137678146362, + "learning_rate": 0.0001864317357840166, + "loss": 1.0347, + "step": 3785 + }, + { + "epoch": 0.6741452991452992, + "grad_norm": 0.6064338684082031, + "learning_rate": 0.0001864246949531825, + "loss": 1.4154, + "step": 3786 + }, + { + "epoch": 0.6743233618233618, + "grad_norm": 0.5442034602165222, + "learning_rate": 0.000186417652429036, + "loss": 1.2604, + 
"step": 3787 + }, + { + "epoch": 0.6745014245014245, + "grad_norm": 0.490858793258667, + "learning_rate": 0.00018641060821171518, + "loss": 1.1511, + "step": 3788 + }, + { + "epoch": 0.6746794871794872, + "grad_norm": 0.571116030216217, + "learning_rate": 0.00018640356230135798, + "loss": 1.1479, + "step": 3789 + }, + { + "epoch": 0.6748575498575499, + "grad_norm": 0.4857785105705261, + "learning_rate": 0.00018639651469810247, + "loss": 0.9, + "step": 3790 + }, + { + "epoch": 0.6750356125356125, + "grad_norm": 0.5320703983306885, + "learning_rate": 0.0001863894654020867, + "loss": 1.2284, + "step": 3791 + }, + { + "epoch": 0.6752136752136753, + "grad_norm": 0.5586925745010376, + "learning_rate": 0.0001863824144134488, + "loss": 1.1183, + "step": 3792 + }, + { + "epoch": 0.6753917378917379, + "grad_norm": 0.47740885615348816, + "learning_rate": 0.000186375361732327, + "loss": 1.1512, + "step": 3793 + }, + { + "epoch": 0.6755698005698005, + "grad_norm": 0.5867732167243958, + "learning_rate": 0.00018636830735885935, + "loss": 1.1903, + "step": 3794 + }, + { + "epoch": 0.6757478632478633, + "grad_norm": 0.5013887882232666, + "learning_rate": 0.0001863612512931842, + "loss": 0.8581, + "step": 3795 + }, + { + "epoch": 0.6759259259259259, + "grad_norm": 0.6026871204376221, + "learning_rate": 0.0001863541935354397, + "loss": 0.9581, + "step": 3796 + }, + { + "epoch": 0.6761039886039886, + "grad_norm": 0.5238468647003174, + "learning_rate": 0.00018634713408576415, + "loss": 1.0949, + "step": 3797 + }, + { + "epoch": 0.6762820512820513, + "grad_norm": 0.5128598213195801, + "learning_rate": 0.00018634007294429585, + "loss": 0.8992, + "step": 3798 + }, + { + "epoch": 0.676460113960114, + "grad_norm": 0.5092771053314209, + "learning_rate": 0.00018633301011117324, + "loss": 1.0793, + "step": 3799 + }, + { + "epoch": 0.6766381766381766, + "grad_norm": 0.592566728591919, + "learning_rate": 0.00018632594558653457, + "loss": 1.3242, + "step": 3800 + }, + { + "epoch": 
0.6768162393162394, + "grad_norm": 0.4953067898750305, + "learning_rate": 0.0001863188793705184, + "loss": 0.9925, + "step": 3801 + }, + { + "epoch": 0.676994301994302, + "grad_norm": 0.4989747107028961, + "learning_rate": 0.00018631181146326305, + "loss": 1.0677, + "step": 3802 + }, + { + "epoch": 0.6771723646723646, + "grad_norm": 0.5375261902809143, + "learning_rate": 0.00018630474186490705, + "loss": 1.0556, + "step": 3803 + }, + { + "epoch": 0.6773504273504274, + "grad_norm": 0.6512624025344849, + "learning_rate": 0.00018629767057558894, + "loss": 1.2041, + "step": 3804 + }, + { + "epoch": 0.67752849002849, + "grad_norm": 0.5428260564804077, + "learning_rate": 0.00018629059759544723, + "loss": 0.9645, + "step": 3805 + }, + { + "epoch": 0.6777065527065527, + "grad_norm": 0.5598662495613098, + "learning_rate": 0.00018628352292462052, + "loss": 1.1683, + "step": 3806 + }, + { + "epoch": 0.6778846153846154, + "grad_norm": 0.49351340532302856, + "learning_rate": 0.0001862764465632474, + "loss": 1.1622, + "step": 3807 + }, + { + "epoch": 0.6780626780626781, + "grad_norm": 0.4796701669692993, + "learning_rate": 0.00018626936851146657, + "loss": 1.0017, + "step": 3808 + }, + { + "epoch": 0.6782407407407407, + "grad_norm": 0.444533109664917, + "learning_rate": 0.00018626228876941664, + "loss": 0.9145, + "step": 3809 + }, + { + "epoch": 0.6784188034188035, + "grad_norm": 0.5197392702102661, + "learning_rate": 0.00018625520733723635, + "loss": 1.283, + "step": 3810 + }, + { + "epoch": 0.6785968660968661, + "grad_norm": 0.48785829544067383, + "learning_rate": 0.00018624812421506447, + "loss": 1.1084, + "step": 3811 + }, + { + "epoch": 0.6787749287749287, + "grad_norm": 0.5083680152893066, + "learning_rate": 0.00018624103940303974, + "loss": 0.9071, + "step": 3812 + }, + { + "epoch": 0.6789529914529915, + "grad_norm": 0.553819477558136, + "learning_rate": 0.00018623395290130103, + "loss": 0.9986, + "step": 3813 + }, + { + "epoch": 0.6791310541310541, + "grad_norm": 
0.5347508788108826, + "learning_rate": 0.00018622686470998713, + "loss": 1.0148, + "step": 3814 + }, + { + "epoch": 0.6793091168091168, + "grad_norm": 0.5080769062042236, + "learning_rate": 0.00018621977482923693, + "loss": 1.0169, + "step": 3815 + }, + { + "epoch": 0.6794871794871795, + "grad_norm": 0.5444077849388123, + "learning_rate": 0.00018621268325918938, + "loss": 1.172, + "step": 3816 + }, + { + "epoch": 0.6796652421652422, + "grad_norm": 0.521946132183075, + "learning_rate": 0.00018620558999998335, + "loss": 1.0247, + "step": 3817 + }, + { + "epoch": 0.6798433048433048, + "grad_norm": 0.5257413983345032, + "learning_rate": 0.00018619849505175786, + "loss": 1.1574, + "step": 3818 + }, + { + "epoch": 0.6800213675213675, + "grad_norm": 0.5473007559776306, + "learning_rate": 0.00018619139841465193, + "loss": 1.1254, + "step": 3819 + }, + { + "epoch": 0.6801994301994302, + "grad_norm": 0.5479872226715088, + "learning_rate": 0.00018618430008880463, + "loss": 1.0196, + "step": 3820 + }, + { + "epoch": 0.6803774928774928, + "grad_norm": 0.5918973088264465, + "learning_rate": 0.00018617720007435497, + "loss": 1.082, + "step": 3821 + }, + { + "epoch": 0.6805555555555556, + "grad_norm": 0.5411791801452637, + "learning_rate": 0.0001861700983714421, + "loss": 0.7723, + "step": 3822 + }, + { + "epoch": 0.6807336182336182, + "grad_norm": 0.5466326475143433, + "learning_rate": 0.00018616299498020516, + "loss": 1.0979, + "step": 3823 + }, + { + "epoch": 0.6809116809116809, + "grad_norm": 0.5405182838439941, + "learning_rate": 0.00018615588990078332, + "loss": 0.8891, + "step": 3824 + }, + { + "epoch": 0.6810897435897436, + "grad_norm": 0.5415780544281006, + "learning_rate": 0.00018614878313331579, + "loss": 1.0927, + "step": 3825 + }, + { + "epoch": 0.6812678062678063, + "grad_norm": 0.5284909605979919, + "learning_rate": 0.00018614167467794182, + "loss": 1.0684, + "step": 3826 + }, + { + "epoch": 0.6814458689458689, + "grad_norm": 0.4873995780944824, + "learning_rate": 
0.00018613456453480062, + "loss": 1.1653, + "step": 3827 + }, + { + "epoch": 0.6816239316239316, + "grad_norm": 0.5506551265716553, + "learning_rate": 0.0001861274527040316, + "loss": 0.9876, + "step": 3828 + }, + { + "epoch": 0.6818019943019943, + "grad_norm": 0.5031297206878662, + "learning_rate": 0.0001861203391857741, + "loss": 1.067, + "step": 3829 + }, + { + "epoch": 0.6819800569800569, + "grad_norm": 0.622346043586731, + "learning_rate": 0.0001861132239801674, + "loss": 1.1514, + "step": 3830 + }, + { + "epoch": 0.6821581196581197, + "grad_norm": 0.47706183791160583, + "learning_rate": 0.000186106107087351, + "loss": 0.9857, + "step": 3831 + }, + { + "epoch": 0.6823361823361823, + "grad_norm": 0.5082845091819763, + "learning_rate": 0.00018609898850746424, + "loss": 1.123, + "step": 3832 + }, + { + "epoch": 0.6825142450142451, + "grad_norm": 0.5119805932044983, + "learning_rate": 0.00018609186824064671, + "loss": 1.1386, + "step": 3833 + }, + { + "epoch": 0.6826923076923077, + "grad_norm": 0.5247541069984436, + "learning_rate": 0.00018608474628703788, + "loss": 0.9433, + "step": 3834 + }, + { + "epoch": 0.6828703703703703, + "grad_norm": 0.4618282616138458, + "learning_rate": 0.00018607762264677722, + "loss": 0.8727, + "step": 3835 + }, + { + "epoch": 0.6830484330484331, + "grad_norm": 0.6014040112495422, + "learning_rate": 0.00018607049732000436, + "loss": 1.1823, + "step": 3836 + }, + { + "epoch": 0.6832264957264957, + "grad_norm": 0.6489043831825256, + "learning_rate": 0.00018606337030685892, + "loss": 1.1466, + "step": 3837 + }, + { + "epoch": 0.6834045584045584, + "grad_norm": 0.5527763366699219, + "learning_rate": 0.00018605624160748053, + "loss": 1.3015, + "step": 3838 + }, + { + "epoch": 0.6835826210826211, + "grad_norm": 0.5628284215927124, + "learning_rate": 0.0001860491112220088, + "loss": 1.1504, + "step": 3839 + }, + { + "epoch": 0.6837606837606838, + "grad_norm": 0.5414566993713379, + "learning_rate": 0.00018604197915058355, + "loss": 1.0155, + 
"step": 3840 + }, + { + "epoch": 0.6839387464387464, + "grad_norm": 0.5378929376602173, + "learning_rate": 0.00018603484539334443, + "loss": 0.8917, + "step": 3841 + }, + { + "epoch": 0.6841168091168092, + "grad_norm": 0.5953748822212219, + "learning_rate": 0.00018602770995043125, + "loss": 1.1971, + "step": 3842 + }, + { + "epoch": 0.6842948717948718, + "grad_norm": 0.511813759803772, + "learning_rate": 0.00018602057282198376, + "loss": 1.1345, + "step": 3843 + }, + { + "epoch": 0.6844729344729344, + "grad_norm": 0.5145484209060669, + "learning_rate": 0.00018601343400814185, + "loss": 1.0786, + "step": 3844 + }, + { + "epoch": 0.6846509971509972, + "grad_norm": 0.5199604034423828, + "learning_rate": 0.00018600629350904542, + "loss": 1.2063, + "step": 3845 + }, + { + "epoch": 0.6848290598290598, + "grad_norm": 0.5653825998306274, + "learning_rate": 0.0001859991513248343, + "loss": 1.0314, + "step": 3846 + }, + { + "epoch": 0.6850071225071225, + "grad_norm": 0.5660843849182129, + "learning_rate": 0.00018599200745564843, + "loss": 1.2754, + "step": 3847 + }, + { + "epoch": 0.6851851851851852, + "grad_norm": 0.5225719809532166, + "learning_rate": 0.00018598486190162788, + "loss": 1.0837, + "step": 3848 + }, + { + "epoch": 0.6853632478632479, + "grad_norm": 0.5011669397354126, + "learning_rate": 0.00018597771466291252, + "loss": 1.1, + "step": 3849 + }, + { + "epoch": 0.6855413105413105, + "grad_norm": 0.5923115015029907, + "learning_rate": 0.00018597056573964245, + "loss": 1.1875, + "step": 3850 + }, + { + "epoch": 0.6857193732193733, + "grad_norm": 0.5666482448577881, + "learning_rate": 0.00018596341513195776, + "loss": 1.1663, + "step": 3851 + }, + { + "epoch": 0.6858974358974359, + "grad_norm": 0.5396790504455566, + "learning_rate": 0.0001859562628399985, + "loss": 1.1179, + "step": 3852 + }, + { + "epoch": 0.6860754985754985, + "grad_norm": 0.5709532499313354, + "learning_rate": 0.00018594910886390485, + "loss": 1.0369, + "step": 3853 + }, + { + "epoch": 
0.6862535612535613, + "grad_norm": 0.45524322986602783, + "learning_rate": 0.00018594195320381692, + "loss": 1.0171, + "step": 3854 + }, + { + "epoch": 0.6864316239316239, + "grad_norm": 0.6130724549293518, + "learning_rate": 0.00018593479585987498, + "loss": 1.1944, + "step": 3855 + }, + { + "epoch": 0.6866096866096866, + "grad_norm": 0.5079745054244995, + "learning_rate": 0.0001859276368322192, + "loss": 1.2567, + "step": 3856 + }, + { + "epoch": 0.6867877492877493, + "grad_norm": 0.49919846653938293, + "learning_rate": 0.00018592047612098992, + "loss": 0.9459, + "step": 3857 + }, + { + "epoch": 0.686965811965812, + "grad_norm": 0.5776857733726501, + "learning_rate": 0.00018591331372632734, + "loss": 1.2456, + "step": 3858 + }, + { + "epoch": 0.6871438746438746, + "grad_norm": 0.4740692377090454, + "learning_rate": 0.00018590614964837188, + "loss": 1.0401, + "step": 3859 + }, + { + "epoch": 0.6873219373219374, + "grad_norm": 0.5015742182731628, + "learning_rate": 0.00018589898388726389, + "loss": 1.2052, + "step": 3860 + }, + { + "epoch": 0.6875, + "grad_norm": 0.4819730818271637, + "learning_rate": 0.0001858918164431437, + "loss": 1.007, + "step": 3861 + }, + { + "epoch": 0.6876780626780626, + "grad_norm": 0.5510426163673401, + "learning_rate": 0.00018588464731615184, + "loss": 1.0123, + "step": 3862 + }, + { + "epoch": 0.6878561253561254, + "grad_norm": 0.4950829744338989, + "learning_rate": 0.00018587747650642867, + "loss": 1.033, + "step": 3863 + }, + { + "epoch": 0.688034188034188, + "grad_norm": 0.5278680920600891, + "learning_rate": 0.0001858703040141148, + "loss": 1.0912, + "step": 3864 + }, + { + "epoch": 0.6882122507122507, + "grad_norm": 0.6359158158302307, + "learning_rate": 0.00018586312983935068, + "loss": 1.2868, + "step": 3865 + }, + { + "epoch": 0.6883903133903134, + "grad_norm": 0.5098239183425903, + "learning_rate": 0.0001858559539822769, + "loss": 0.8364, + "step": 3866 + }, + { + "epoch": 0.6885683760683761, + "grad_norm": 0.5651038289070129, 
+ "learning_rate": 0.000185848776443034, + "loss": 1.1983, + "step": 3867 + }, + { + "epoch": 0.6887464387464387, + "grad_norm": 0.5305678248405457, + "learning_rate": 0.00018584159722176272, + "loss": 1.32, + "step": 3868 + }, + { + "epoch": 0.6889245014245015, + "grad_norm": 0.5481845140457153, + "learning_rate": 0.00018583441631860368, + "loss": 1.013, + "step": 3869 + }, + { + "epoch": 0.6891025641025641, + "grad_norm": 0.5214795470237732, + "learning_rate": 0.00018582723373369753, + "loss": 1.172, + "step": 3870 + }, + { + "epoch": 0.6892806267806267, + "grad_norm": 0.6282780766487122, + "learning_rate": 0.00018582004946718502, + "loss": 1.7304, + "step": 3871 + }, + { + "epoch": 0.6894586894586895, + "grad_norm": 0.5266988277435303, + "learning_rate": 0.0001858128635192069, + "loss": 1.1418, + "step": 3872 + }, + { + "epoch": 0.6896367521367521, + "grad_norm": 0.4761001467704773, + "learning_rate": 0.000185805675889904, + "loss": 0.8585, + "step": 3873 + }, + { + "epoch": 0.6898148148148148, + "grad_norm": 0.528779923915863, + "learning_rate": 0.00018579848657941715, + "loss": 1.0036, + "step": 3874 + }, + { + "epoch": 0.6899928774928775, + "grad_norm": 0.5427684783935547, + "learning_rate": 0.00018579129558788716, + "loss": 0.9769, + "step": 3875 + }, + { + "epoch": 0.6901709401709402, + "grad_norm": 0.6229544281959534, + "learning_rate": 0.00018578410291545495, + "loss": 1.2848, + "step": 3876 + }, + { + "epoch": 0.6903490028490028, + "grad_norm": 0.6602693200111389, + "learning_rate": 0.00018577690856226147, + "loss": 1.2713, + "step": 3877 + }, + { + "epoch": 0.6905270655270656, + "grad_norm": 0.45884042978286743, + "learning_rate": 0.0001857697125284476, + "loss": 0.9143, + "step": 3878 + }, + { + "epoch": 0.6907051282051282, + "grad_norm": 0.4956444203853607, + "learning_rate": 0.00018576251481415443, + "loss": 0.9646, + "step": 3879 + }, + { + "epoch": 0.6908831908831908, + "grad_norm": 0.473561555147171, + "learning_rate": 0.00018575531541952292, + 
"loss": 0.843, + "step": 3880 + }, + { + "epoch": 0.6910612535612536, + "grad_norm": 0.4676312506198883, + "learning_rate": 0.00018574811434469415, + "loss": 0.9464, + "step": 3881 + }, + { + "epoch": 0.6912393162393162, + "grad_norm": 0.5452045202255249, + "learning_rate": 0.00018574091158980922, + "loss": 0.985, + "step": 3882 + }, + { + "epoch": 0.6914173789173789, + "grad_norm": 0.6274946331977844, + "learning_rate": 0.0001857337071550092, + "loss": 1.0357, + "step": 3883 + }, + { + "epoch": 0.6915954415954416, + "grad_norm": 0.5533788800239563, + "learning_rate": 0.00018572650104043531, + "loss": 1.2636, + "step": 3884 + }, + { + "epoch": 0.6917735042735043, + "grad_norm": 0.48312318325042725, + "learning_rate": 0.00018571929324622872, + "loss": 1.2402, + "step": 3885 + }, + { + "epoch": 0.6919515669515669, + "grad_norm": 0.6087453961372375, + "learning_rate": 0.00018571208377253062, + "loss": 1.2961, + "step": 3886 + }, + { + "epoch": 0.6921296296296297, + "grad_norm": 0.49156486988067627, + "learning_rate": 0.00018570487261948234, + "loss": 0.9585, + "step": 3887 + }, + { + "epoch": 0.6923076923076923, + "grad_norm": 0.5200015902519226, + "learning_rate": 0.0001856976597872251, + "loss": 0.9274, + "step": 3888 + }, + { + "epoch": 0.6924857549857549, + "grad_norm": 0.5185118913650513, + "learning_rate": 0.0001856904452759002, + "loss": 1.0015, + "step": 3889 + }, + { + "epoch": 0.6926638176638177, + "grad_norm": 0.5859049558639526, + "learning_rate": 0.00018568322908564904, + "loss": 1.0959, + "step": 3890 + }, + { + "epoch": 0.6928418803418803, + "grad_norm": 0.5882301926612854, + "learning_rate": 0.00018567601121661302, + "loss": 1.3214, + "step": 3891 + }, + { + "epoch": 0.6930199430199431, + "grad_norm": 0.6475503444671631, + "learning_rate": 0.0001856687916689335, + "loss": 1.3265, + "step": 3892 + }, + { + "epoch": 0.6931980056980057, + "grad_norm": 0.46175432205200195, + "learning_rate": 0.000185661570442752, + "loss": 0.8547, + "step": 3893 + }, + { + 
"epoch": 0.6933760683760684, + "grad_norm": 0.5362716913223267, + "learning_rate": 0.00018565434753820998, + "loss": 0.974, + "step": 3894 + }, + { + "epoch": 0.6935541310541311, + "grad_norm": 0.4317963719367981, + "learning_rate": 0.00018564712295544896, + "loss": 0.7653, + "step": 3895 + }, + { + "epoch": 0.6937321937321937, + "grad_norm": 0.5679717659950256, + "learning_rate": 0.00018563989669461047, + "loss": 1.0691, + "step": 3896 + }, + { + "epoch": 0.6939102564102564, + "grad_norm": 0.5058363676071167, + "learning_rate": 0.00018563266875583608, + "loss": 1.0665, + "step": 3897 + }, + { + "epoch": 0.6940883190883191, + "grad_norm": 0.5365496277809143, + "learning_rate": 0.00018562543913926746, + "loss": 0.9963, + "step": 3898 + }, + { + "epoch": 0.6942663817663818, + "grad_norm": 0.49945300817489624, + "learning_rate": 0.0001856182078450462, + "loss": 0.8668, + "step": 3899 + }, + { + "epoch": 0.6944444444444444, + "grad_norm": 0.5869430899620056, + "learning_rate": 0.00018561097487331405, + "loss": 1.1942, + "step": 3900 + }, + { + "epoch": 0.6946225071225072, + "grad_norm": 0.5188950300216675, + "learning_rate": 0.0001856037402242127, + "loss": 0.9493, + "step": 3901 + }, + { + "epoch": 0.6948005698005698, + "grad_norm": 0.510788083076477, + "learning_rate": 0.00018559650389788384, + "loss": 0.9989, + "step": 3902 + }, + { + "epoch": 0.6949786324786325, + "grad_norm": 0.5360601544380188, + "learning_rate": 0.0001855892658944693, + "loss": 1.2766, + "step": 3903 + }, + { + "epoch": 0.6951566951566952, + "grad_norm": 0.522502601146698, + "learning_rate": 0.00018558202621411093, + "loss": 0.8774, + "step": 3904 + }, + { + "epoch": 0.6953347578347578, + "grad_norm": 0.5330635905265808, + "learning_rate": 0.00018557478485695052, + "loss": 0.972, + "step": 3905 + }, + { + "epoch": 0.6955128205128205, + "grad_norm": 0.5387479066848755, + "learning_rate": 0.00018556754182312996, + "loss": 1.0574, + "step": 3906 + }, + { + "epoch": 0.6956908831908832, + 
"grad_norm": 0.5357984900474548, + "learning_rate": 0.00018556029711279116, + "loss": 1.396, + "step": 3907 + }, + { + "epoch": 0.6958689458689459, + "grad_norm": 0.5647178292274475, + "learning_rate": 0.00018555305072607612, + "loss": 1.3304, + "step": 3908 + }, + { + "epoch": 0.6960470085470085, + "grad_norm": 0.46460914611816406, + "learning_rate": 0.00018554580266312673, + "loss": 0.9574, + "step": 3909 + }, + { + "epoch": 0.6962250712250713, + "grad_norm": 0.6206206679344177, + "learning_rate": 0.00018553855292408503, + "loss": 1.1637, + "step": 3910 + }, + { + "epoch": 0.6964031339031339, + "grad_norm": 0.5899842977523804, + "learning_rate": 0.00018553130150909312, + "loss": 1.1067, + "step": 3911 + }, + { + "epoch": 0.6965811965811965, + "grad_norm": 0.47294262051582336, + "learning_rate": 0.000185524048418293, + "loss": 1.1516, + "step": 3912 + }, + { + "epoch": 0.6967592592592593, + "grad_norm": 0.5791197419166565, + "learning_rate": 0.00018551679365182684, + "loss": 1.0007, + "step": 3913 + }, + { + "epoch": 0.6969373219373219, + "grad_norm": 0.5678651332855225, + "learning_rate": 0.00018550953720983672, + "loss": 1.2698, + "step": 3914 + }, + { + "epoch": 0.6971153846153846, + "grad_norm": 0.6509683728218079, + "learning_rate": 0.0001855022790924649, + "loss": 1.0354, + "step": 3915 + }, + { + "epoch": 0.6972934472934473, + "grad_norm": 0.5176648497581482, + "learning_rate": 0.0001854950192998535, + "loss": 1.1243, + "step": 3916 + }, + { + "epoch": 0.69747150997151, + "grad_norm": 0.520631730556488, + "learning_rate": 0.00018548775783214477, + "loss": 1.1371, + "step": 3917 + }, + { + "epoch": 0.6976495726495726, + "grad_norm": 0.5408333539962769, + "learning_rate": 0.00018548049468948108, + "loss": 1.1185, + "step": 3918 + }, + { + "epoch": 0.6978276353276354, + "grad_norm": 0.5423790216445923, + "learning_rate": 0.00018547322987200461, + "loss": 1.1539, + "step": 3919 + }, + { + "epoch": 0.698005698005698, + "grad_norm": 0.5422113537788391, + 
"learning_rate": 0.0001854659633798578, + "loss": 1.171, + "step": 3920 + }, + { + "epoch": 0.6981837606837606, + "grad_norm": 0.5113416314125061, + "learning_rate": 0.00018545869521318292, + "loss": 1.0597, + "step": 3921 + }, + { + "epoch": 0.6983618233618234, + "grad_norm": 0.49901214241981506, + "learning_rate": 0.00018545142537212248, + "loss": 1.1043, + "step": 3922 + }, + { + "epoch": 0.698539886039886, + "grad_norm": 0.6606622338294983, + "learning_rate": 0.00018544415385681885, + "loss": 1.1797, + "step": 3923 + }, + { + "epoch": 0.6987179487179487, + "grad_norm": 0.4786234498023987, + "learning_rate": 0.00018543688066741454, + "loss": 0.9532, + "step": 3924 + }, + { + "epoch": 0.6988960113960114, + "grad_norm": 0.5900700688362122, + "learning_rate": 0.00018542960580405203, + "loss": 1.1171, + "step": 3925 + }, + { + "epoch": 0.6990740740740741, + "grad_norm": 0.53485506772995, + "learning_rate": 0.00018542232926687383, + "loss": 1.1535, + "step": 3926 + }, + { + "epoch": 0.6992521367521367, + "grad_norm": 0.5269177556037903, + "learning_rate": 0.00018541505105602255, + "loss": 1.0287, + "step": 3927 + }, + { + "epoch": 0.6994301994301995, + "grad_norm": 0.5185505151748657, + "learning_rate": 0.0001854077711716408, + "loss": 1.2526, + "step": 3928 + }, + { + "epoch": 0.6996082621082621, + "grad_norm": 0.5615512132644653, + "learning_rate": 0.00018540048961387115, + "loss": 1.0189, + "step": 3929 + }, + { + "epoch": 0.6997863247863247, + "grad_norm": 0.4492493271827698, + "learning_rate": 0.00018539320638285637, + "loss": 0.8917, + "step": 3930 + }, + { + "epoch": 0.6999643874643875, + "grad_norm": 0.5062302947044373, + "learning_rate": 0.00018538592147873906, + "loss": 1.053, + "step": 3931 + }, + { + "epoch": 0.7001424501424501, + "grad_norm": 0.5508798956871033, + "learning_rate": 0.000185378634901662, + "loss": 0.9638, + "step": 3932 + }, + { + "epoch": 0.7003205128205128, + "grad_norm": 0.463980108499527, + "learning_rate": 0.00018537134665176793, + 
"loss": 1.0945, + "step": 3933 + }, + { + "epoch": 0.7004985754985755, + "grad_norm": 0.5027088522911072, + "learning_rate": 0.0001853640567291997, + "loss": 1.1745, + "step": 3934 + }, + { + "epoch": 0.7006766381766382, + "grad_norm": 0.5006551146507263, + "learning_rate": 0.00018535676513410009, + "loss": 0.8521, + "step": 3935 + }, + { + "epoch": 0.7008547008547008, + "grad_norm": 0.5870724320411682, + "learning_rate": 0.000185349471866612, + "loss": 0.9197, + "step": 3936 + }, + { + "epoch": 0.7010327635327636, + "grad_norm": 0.5030696392059326, + "learning_rate": 0.00018534217692687825, + "loss": 1.1049, + "step": 3937 + }, + { + "epoch": 0.7012108262108262, + "grad_norm": 0.5212681889533997, + "learning_rate": 0.00018533488031504186, + "loss": 1.3397, + "step": 3938 + }, + { + "epoch": 0.7013888888888888, + "grad_norm": 0.5649709105491638, + "learning_rate": 0.0001853275820312458, + "loss": 1.1994, + "step": 3939 + }, + { + "epoch": 0.7015669515669516, + "grad_norm": 0.4892779290676117, + "learning_rate": 0.00018532028207563297, + "loss": 1.1511, + "step": 3940 + }, + { + "epoch": 0.7017450142450142, + "grad_norm": 0.4929407835006714, + "learning_rate": 0.00018531298044834643, + "loss": 1.0792, + "step": 3941 + }, + { + "epoch": 0.7019230769230769, + "grad_norm": 0.5645940899848938, + "learning_rate": 0.00018530567714952932, + "loss": 1.0937, + "step": 3942 + }, + { + "epoch": 0.7021011396011396, + "grad_norm": 0.5471178293228149, + "learning_rate": 0.00018529837217932466, + "loss": 1.193, + "step": 3943 + }, + { + "epoch": 0.7022792022792023, + "grad_norm": 0.576627790927887, + "learning_rate": 0.00018529106553787558, + "loss": 1.1032, + "step": 3944 + }, + { + "epoch": 0.7024572649572649, + "grad_norm": 0.5015735626220703, + "learning_rate": 0.00018528375722532526, + "loss": 1.066, + "step": 3945 + }, + { + "epoch": 0.7026353276353277, + "grad_norm": 0.5315404534339905, + "learning_rate": 0.00018527644724181683, + "loss": 1.2059, + "step": 3946 + }, + { + 
"epoch": 0.7028133903133903, + "grad_norm": 0.5516065955162048, + "learning_rate": 0.0001852691355874936, + "loss": 1.161, + "step": 3947 + }, + { + "epoch": 0.7029914529914529, + "grad_norm": 0.5026212930679321, + "learning_rate": 0.0001852618222624988, + "loss": 1.2616, + "step": 3948 + }, + { + "epoch": 0.7031695156695157, + "grad_norm": 0.49874603748321533, + "learning_rate": 0.0001852545072669757, + "loss": 0.805, + "step": 3949 + }, + { + "epoch": 0.7033475783475783, + "grad_norm": 0.47698748111724854, + "learning_rate": 0.00018524719060106763, + "loss": 1.2321, + "step": 3950 + }, + { + "epoch": 0.7035256410256411, + "grad_norm": 0.5201322436332703, + "learning_rate": 0.00018523987226491792, + "loss": 1.1577, + "step": 3951 + }, + { + "epoch": 0.7037037037037037, + "grad_norm": 0.5506543517112732, + "learning_rate": 0.00018523255225867002, + "loss": 1.2289, + "step": 3952 + }, + { + "epoch": 0.7038817663817664, + "grad_norm": 0.5691256523132324, + "learning_rate": 0.0001852252305824673, + "loss": 1.1945, + "step": 3953 + }, + { + "epoch": 0.7040598290598291, + "grad_norm": 0.5324838757514954, + "learning_rate": 0.00018521790723645322, + "loss": 1.1037, + "step": 3954 + }, + { + "epoch": 0.7042378917378918, + "grad_norm": 0.5238786339759827, + "learning_rate": 0.00018521058222077127, + "loss": 1.2075, + "step": 3955 + }, + { + "epoch": 0.7044159544159544, + "grad_norm": 0.4936453402042389, + "learning_rate": 0.00018520325553556498, + "loss": 1.0537, + "step": 3956 + }, + { + "epoch": 0.7045940170940171, + "grad_norm": 0.6198282837867737, + "learning_rate": 0.00018519592718097791, + "loss": 1.0728, + "step": 3957 + }, + { + "epoch": 0.7047720797720798, + "grad_norm": 0.44729140400886536, + "learning_rate": 0.0001851885971571536, + "loss": 0.8432, + "step": 3958 + }, + { + "epoch": 0.7049501424501424, + "grad_norm": 0.5884211659431458, + "learning_rate": 0.00018518126546423572, + "loss": 0.9515, + "step": 3959 + }, + { + "epoch": 0.7051282051282052, + 
"grad_norm": 0.5293807983398438, + "learning_rate": 0.00018517393210236788, + "loss": 1.1178, + "step": 3960 + }, + { + "epoch": 0.7053062678062678, + "grad_norm": 0.6036825180053711, + "learning_rate": 0.00018516659707169374, + "loss": 1.0408, + "step": 3961 + }, + { + "epoch": 0.7054843304843305, + "grad_norm": 0.5157122015953064, + "learning_rate": 0.0001851592603723571, + "loss": 1.2136, + "step": 3962 + }, + { + "epoch": 0.7056623931623932, + "grad_norm": 0.5354781150817871, + "learning_rate": 0.00018515192200450163, + "loss": 0.7165, + "step": 3963 + }, + { + "epoch": 0.7058404558404558, + "grad_norm": 0.6073734760284424, + "learning_rate": 0.00018514458196827111, + "loss": 1.3079, + "step": 3964 + }, + { + "epoch": 0.7060185185185185, + "grad_norm": 0.4324839413166046, + "learning_rate": 0.0001851372402638094, + "loss": 0.7903, + "step": 3965 + }, + { + "epoch": 0.7061965811965812, + "grad_norm": 0.6530333161354065, + "learning_rate": 0.00018512989689126034, + "loss": 1.3179, + "step": 3966 + }, + { + "epoch": 0.7063746438746439, + "grad_norm": 0.5500404238700867, + "learning_rate": 0.00018512255185076782, + "loss": 1.0624, + "step": 3967 + }, + { + "epoch": 0.7065527065527065, + "grad_norm": 0.6277863383293152, + "learning_rate": 0.00018511520514247567, + "loss": 1.1056, + "step": 3968 + }, + { + "epoch": 0.7067307692307693, + "grad_norm": 0.580544650554657, + "learning_rate": 0.0001851078567665279, + "loss": 0.9849, + "step": 3969 + }, + { + "epoch": 0.7069088319088319, + "grad_norm": 0.4880999028682709, + "learning_rate": 0.00018510050672306848, + "loss": 1.0185, + "step": 3970 + }, + { + "epoch": 0.7070868945868946, + "grad_norm": 0.4919959306716919, + "learning_rate": 0.0001850931550122414, + "loss": 1.0334, + "step": 3971 + }, + { + "epoch": 0.7072649572649573, + "grad_norm": 0.6001213192939758, + "learning_rate": 0.0001850858016341907, + "loss": 1.0729, + "step": 3972 + }, + { + "epoch": 0.70744301994302, + "grad_norm": 0.538690447807312, + 
"learning_rate": 0.00018507844658906052, + "loss": 1.0733, + "step": 3973 + }, + { + "epoch": 0.7076210826210826, + "grad_norm": 0.5427643656730652, + "learning_rate": 0.00018507108987699487, + "loss": 1.1207, + "step": 3974 + }, + { + "epoch": 0.7077991452991453, + "grad_norm": 0.43014347553253174, + "learning_rate": 0.00018506373149813795, + "loss": 0.7958, + "step": 3975 + }, + { + "epoch": 0.707977207977208, + "grad_norm": 0.56591796875, + "learning_rate": 0.00018505637145263394, + "loss": 1.2199, + "step": 3976 + }, + { + "epoch": 0.7081552706552706, + "grad_norm": 0.59147047996521, + "learning_rate": 0.000185049009740627, + "loss": 1.2354, + "step": 3977 + }, + { + "epoch": 0.7083333333333334, + "grad_norm": 0.5078346133232117, + "learning_rate": 0.00018504164636226137, + "loss": 0.976, + "step": 3978 + }, + { + "epoch": 0.708511396011396, + "grad_norm": 0.533302366733551, + "learning_rate": 0.00018503428131768135, + "loss": 0.9653, + "step": 3979 + }, + { + "epoch": 0.7086894586894587, + "grad_norm": 0.4985341727733612, + "learning_rate": 0.00018502691460703122, + "loss": 1.1485, + "step": 3980 + }, + { + "epoch": 0.7088675213675214, + "grad_norm": 0.5143141150474548, + "learning_rate": 0.00018501954623045532, + "loss": 1.148, + "step": 3981 + }, + { + "epoch": 0.709045584045584, + "grad_norm": 0.507189154624939, + "learning_rate": 0.00018501217618809804, + "loss": 0.9306, + "step": 3982 + }, + { + "epoch": 0.7092236467236467, + "grad_norm": 0.5246604084968567, + "learning_rate": 0.00018500480448010377, + "loss": 0.9116, + "step": 3983 + }, + { + "epoch": 0.7094017094017094, + "grad_norm": 0.5321049094200134, + "learning_rate": 0.00018499743110661693, + "loss": 0.9607, + "step": 3984 + }, + { + "epoch": 0.7095797720797721, + "grad_norm": 0.62645423412323, + "learning_rate": 0.000184990056067782, + "loss": 1.5834, + "step": 3985 + }, + { + "epoch": 0.7097578347578347, + "grad_norm": 0.486557275056839, + "learning_rate": 0.0001849826793637435, + "loss": 
1.0598, + "step": 3986 + }, + { + "epoch": 0.7099358974358975, + "grad_norm": 0.5122783184051514, + "learning_rate": 0.0001849753009946459, + "loss": 1.2213, + "step": 3987 + }, + { + "epoch": 0.7101139601139601, + "grad_norm": 0.4864068627357483, + "learning_rate": 0.0001849679209606338, + "loss": 1.2708, + "step": 3988 + }, + { + "epoch": 0.7102920227920227, + "grad_norm": 0.5860990881919861, + "learning_rate": 0.00018496053926185183, + "loss": 1.2421, + "step": 3989 + }, + { + "epoch": 0.7104700854700855, + "grad_norm": 0.471194326877594, + "learning_rate": 0.00018495315589844453, + "loss": 0.879, + "step": 3990 + }, + { + "epoch": 0.7106481481481481, + "grad_norm": 0.5626323819160461, + "learning_rate": 0.00018494577087055662, + "loss": 1.1297, + "step": 3991 + }, + { + "epoch": 0.7108262108262108, + "grad_norm": 0.4706762135028839, + "learning_rate": 0.0001849383841783328, + "loss": 1.0444, + "step": 3992 + }, + { + "epoch": 0.7110042735042735, + "grad_norm": 0.5776444673538208, + "learning_rate": 0.00018493099582191783, + "loss": 1.1773, + "step": 3993 + }, + { + "epoch": 0.7111823361823362, + "grad_norm": 0.5493253469467163, + "learning_rate": 0.00018492360580145637, + "loss": 1.0354, + "step": 3994 + }, + { + "epoch": 0.7113603988603988, + "grad_norm": 0.5328514575958252, + "learning_rate": 0.0001849162141170933, + "loss": 0.9251, + "step": 3995 + }, + { + "epoch": 0.7115384615384616, + "grad_norm": 0.5814893841743469, + "learning_rate": 0.0001849088207689734, + "loss": 1.1066, + "step": 3996 + }, + { + "epoch": 0.7117165242165242, + "grad_norm": 0.5476071834564209, + "learning_rate": 0.00018490142575724154, + "loss": 1.1613, + "step": 3997 + }, + { + "epoch": 0.7118945868945868, + "grad_norm": 0.5216463208198547, + "learning_rate": 0.00018489402908204258, + "loss": 1.2574, + "step": 3998 + }, + { + "epoch": 0.7120726495726496, + "grad_norm": 0.5110020637512207, + "learning_rate": 0.00018488663074352153, + "loss": 1.0663, + "step": 3999 + }, + { + "epoch": 
0.7122507122507122, + "grad_norm": 0.448090523481369, + "learning_rate": 0.00018487923074182326, + "loss": 0.6687, + "step": 4000 + }, + { + "epoch": 0.7124287749287749, + "grad_norm": 0.4980565011501312, + "learning_rate": 0.00018487182907709279, + "loss": 1.2365, + "step": 4001 + }, + { + "epoch": 0.7126068376068376, + "grad_norm": 0.485831081867218, + "learning_rate": 0.00018486442574947511, + "loss": 1.0941, + "step": 4002 + }, + { + "epoch": 0.7127849002849003, + "grad_norm": 0.4955040216445923, + "learning_rate": 0.00018485702075911534, + "loss": 1.248, + "step": 4003 + }, + { + "epoch": 0.7129629629629629, + "grad_norm": 0.5168375968933105, + "learning_rate": 0.00018484961410615845, + "loss": 1.1118, + "step": 4004 + }, + { + "epoch": 0.7131410256410257, + "grad_norm": 0.5255687832832336, + "learning_rate": 0.00018484220579074968, + "loss": 1.0558, + "step": 4005 + }, + { + "epoch": 0.7133190883190883, + "grad_norm": 0.5502219796180725, + "learning_rate": 0.00018483479581303416, + "loss": 1.1604, + "step": 4006 + }, + { + "epoch": 0.7134971509971509, + "grad_norm": 0.5155881643295288, + "learning_rate": 0.000184827384173157, + "loss": 0.8246, + "step": 4007 + }, + { + "epoch": 0.7136752136752137, + "grad_norm": 0.5321542024612427, + "learning_rate": 0.0001848199708712635, + "loss": 1.2058, + "step": 4008 + }, + { + "epoch": 0.7138532763532763, + "grad_norm": 0.4929848313331604, + "learning_rate": 0.00018481255590749884, + "loss": 1.4023, + "step": 4009 + }, + { + "epoch": 0.7140313390313391, + "grad_norm": 0.5070937871932983, + "learning_rate": 0.00018480513928200836, + "loss": 1.0561, + "step": 4010 + }, + { + "epoch": 0.7142094017094017, + "grad_norm": 0.5750083327293396, + "learning_rate": 0.00018479772099493728, + "loss": 1.0276, + "step": 4011 + }, + { + "epoch": 0.7143874643874644, + "grad_norm": 0.5265933275222778, + "learning_rate": 0.00018479030104643108, + "loss": 1.0295, + "step": 4012 + }, + { + "epoch": 0.7145655270655271, + "grad_norm": 
0.526830792427063, + "learning_rate": 0.00018478287943663504, + "loss": 1.0157, + "step": 4013 + }, + { + "epoch": 0.7147435897435898, + "grad_norm": 0.5344091653823853, + "learning_rate": 0.00018477545616569458, + "loss": 1.1997, + "step": 4014 + }, + { + "epoch": 0.7149216524216524, + "grad_norm": 0.4935445189476013, + "learning_rate": 0.0001847680312337552, + "loss": 1.1858, + "step": 4015 + }, + { + "epoch": 0.7150997150997151, + "grad_norm": 0.5291212797164917, + "learning_rate": 0.0001847606046409623, + "loss": 0.926, + "step": 4016 + }, + { + "epoch": 0.7152777777777778, + "grad_norm": 0.559050977230072, + "learning_rate": 0.00018475317638746142, + "loss": 1.0947, + "step": 4017 + }, + { + "epoch": 0.7154558404558404, + "grad_norm": 0.4566570222377777, + "learning_rate": 0.00018474574647339814, + "loss": 1.0334, + "step": 4018 + }, + { + "epoch": 0.7156339031339032, + "grad_norm": 0.5156155824661255, + "learning_rate": 0.000184738314898918, + "loss": 1.0076, + "step": 4019 + }, + { + "epoch": 0.7158119658119658, + "grad_norm": 0.5008716583251953, + "learning_rate": 0.00018473088166416662, + "loss": 1.0378, + "step": 4020 + }, + { + "epoch": 0.7159900284900285, + "grad_norm": 0.49556368589401245, + "learning_rate": 0.0001847234467692896, + "loss": 1.15, + "step": 4021 + }, + { + "epoch": 0.7161680911680912, + "grad_norm": 0.5464680790901184, + "learning_rate": 0.00018471601021443265, + "loss": 1.2975, + "step": 4022 + }, + { + "epoch": 0.7163461538461539, + "grad_norm": 0.6291980147361755, + "learning_rate": 0.00018470857199974144, + "loss": 1.05, + "step": 4023 + }, + { + "epoch": 0.7165242165242165, + "grad_norm": 0.5566631555557251, + "learning_rate": 0.00018470113212536176, + "loss": 1.1296, + "step": 4024 + }, + { + "epoch": 0.7167022792022792, + "grad_norm": 0.5569562911987305, + "learning_rate": 0.00018469369059143933, + "loss": 1.2484, + "step": 4025 + }, + { + "epoch": 0.7168803418803419, + "grad_norm": 0.5804716944694519, + "learning_rate": 
0.00018468624739812, + "loss": 1.0547, + "step": 4026 + }, + { + "epoch": 0.7170584045584045, + "grad_norm": 0.6316802501678467, + "learning_rate": 0.00018467880254554952, + "loss": 1.1188, + "step": 4027 + }, + { + "epoch": 0.7172364672364673, + "grad_norm": 0.6131419539451599, + "learning_rate": 0.00018467135603387385, + "loss": 1.1662, + "step": 4028 + }, + { + "epoch": 0.7174145299145299, + "grad_norm": 0.4703124761581421, + "learning_rate": 0.00018466390786323883, + "loss": 1.038, + "step": 4029 + }, + { + "epoch": 0.7175925925925926, + "grad_norm": 0.5718469023704529, + "learning_rate": 0.0001846564580337904, + "loss": 1.0786, + "step": 4030 + }, + { + "epoch": 0.7177706552706553, + "grad_norm": 0.5227612853050232, + "learning_rate": 0.00018464900654567457, + "loss": 1.0561, + "step": 4031 + }, + { + "epoch": 0.717948717948718, + "grad_norm": 0.5800358057022095, + "learning_rate": 0.00018464155339903727, + "loss": 1.0944, + "step": 4032 + }, + { + "epoch": 0.7181267806267806, + "grad_norm": 0.5562314987182617, + "learning_rate": 0.00018463409859402455, + "loss": 0.8573, + "step": 4033 + }, + { + "epoch": 0.7183048433048433, + "grad_norm": 0.6420153379440308, + "learning_rate": 0.0001846266421307825, + "loss": 1.088, + "step": 4034 + }, + { + "epoch": 0.718482905982906, + "grad_norm": 0.4745902717113495, + "learning_rate": 0.00018461918400945718, + "loss": 1.1679, + "step": 4035 + }, + { + "epoch": 0.7186609686609686, + "grad_norm": 0.5070300102233887, + "learning_rate": 0.00018461172423019475, + "loss": 1.1984, + "step": 4036 + }, + { + "epoch": 0.7188390313390314, + "grad_norm": 0.5339375138282776, + "learning_rate": 0.00018460426279314133, + "loss": 1.3038, + "step": 4037 + }, + { + "epoch": 0.719017094017094, + "grad_norm": 0.5947147607803345, + "learning_rate": 0.00018459679969844313, + "loss": 1.0103, + "step": 4038 + }, + { + "epoch": 0.7191951566951567, + "grad_norm": 0.5493791699409485, + "learning_rate": 0.00018458933494624642, + "loss": 1.1001, + 
"step": 4039 + }, + { + "epoch": 0.7193732193732194, + "grad_norm": 0.5700310468673706, + "learning_rate": 0.00018458186853669736, + "loss": 0.9006, + "step": 4040 + }, + { + "epoch": 0.719551282051282, + "grad_norm": 0.60371994972229, + "learning_rate": 0.0001845744004699423, + "loss": 1.3001, + "step": 4041 + }, + { + "epoch": 0.7197293447293447, + "grad_norm": 0.5469261407852173, + "learning_rate": 0.00018456693074612757, + "loss": 1.1745, + "step": 4042 + }, + { + "epoch": 0.7199074074074074, + "grad_norm": 0.5179165601730347, + "learning_rate": 0.00018455945936539947, + "loss": 0.9883, + "step": 4043 + }, + { + "epoch": 0.7200854700854701, + "grad_norm": 0.5396696329116821, + "learning_rate": 0.00018455198632790447, + "loss": 1.1277, + "step": 4044 + }, + { + "epoch": 0.7202635327635327, + "grad_norm": 0.4559909403324127, + "learning_rate": 0.00018454451163378888, + "loss": 0.9644, + "step": 4045 + }, + { + "epoch": 0.7204415954415955, + "grad_norm": 0.49863892793655396, + "learning_rate": 0.00018453703528319927, + "loss": 1.1276, + "step": 4046 + }, + { + "epoch": 0.7206196581196581, + "grad_norm": 0.4790710508823395, + "learning_rate": 0.000184529557276282, + "loss": 0.9443, + "step": 4047 + }, + { + "epoch": 0.7207977207977208, + "grad_norm": 0.541999876499176, + "learning_rate": 0.0001845220776131837, + "loss": 1.0681, + "step": 4048 + }, + { + "epoch": 0.7209757834757835, + "grad_norm": 0.5119109153747559, + "learning_rate": 0.00018451459629405088, + "loss": 1.2078, + "step": 4049 + }, + { + "epoch": 0.7211538461538461, + "grad_norm": 0.6141307353973389, + "learning_rate": 0.00018450711331903006, + "loss": 1.1071, + "step": 4050 + }, + { + "epoch": 0.7213319088319088, + "grad_norm": 0.48679864406585693, + "learning_rate": 0.00018449962868826795, + "loss": 0.9713, + "step": 4051 + }, + { + "epoch": 0.7215099715099715, + "grad_norm": 0.5548661947250366, + "learning_rate": 0.0001844921424019111, + "loss": 1.2099, + "step": 4052 + }, + { + "epoch": 
0.7216880341880342, + "grad_norm": 0.5000107884407043, + "learning_rate": 0.00018448465446010626, + "loss": 1.0184, + "step": 4053 + }, + { + "epoch": 0.7218660968660968, + "grad_norm": 0.6131454110145569, + "learning_rate": 0.00018447716486300013, + "loss": 1.2581, + "step": 4054 + }, + { + "epoch": 0.7220441595441596, + "grad_norm": 0.5145987868309021, + "learning_rate": 0.0001844696736107394, + "loss": 1.1646, + "step": 4055 + }, + { + "epoch": 0.7222222222222222, + "grad_norm": 0.4361337125301361, + "learning_rate": 0.00018446218070347094, + "loss": 0.8239, + "step": 4056 + }, + { + "epoch": 0.7224002849002849, + "grad_norm": 0.5549173355102539, + "learning_rate": 0.00018445468614134146, + "loss": 1.1935, + "step": 4057 + }, + { + "epoch": 0.7225783475783476, + "grad_norm": 0.5569297671318054, + "learning_rate": 0.00018444718992449789, + "loss": 1.0137, + "step": 4058 + }, + { + "epoch": 0.7227564102564102, + "grad_norm": 0.44866305589675903, + "learning_rate": 0.00018443969205308704, + "loss": 0.987, + "step": 4059 + }, + { + "epoch": 0.7229344729344729, + "grad_norm": 0.5142943263053894, + "learning_rate": 0.0001844321925272558, + "loss": 1.0837, + "step": 4060 + }, + { + "epoch": 0.7231125356125356, + "grad_norm": 0.4922119379043579, + "learning_rate": 0.0001844246913471512, + "loss": 0.8477, + "step": 4061 + }, + { + "epoch": 0.7232905982905983, + "grad_norm": 0.5245375633239746, + "learning_rate": 0.0001844171885129201, + "loss": 0.9985, + "step": 4062 + }, + { + "epoch": 0.7234686609686609, + "grad_norm": 0.45562678575515747, + "learning_rate": 0.00018440968402470956, + "loss": 0.8678, + "step": 4063 + }, + { + "epoch": 0.7236467236467237, + "grad_norm": 0.5388376712799072, + "learning_rate": 0.0001844021778826666, + "loss": 1.0586, + "step": 4064 + }, + { + "epoch": 0.7238247863247863, + "grad_norm": 0.48945263028144836, + "learning_rate": 0.00018439467008693833, + "loss": 1.0547, + "step": 4065 + }, + { + "epoch": 0.7240028490028491, + "grad_norm": 
0.5202330350875854, + "learning_rate": 0.00018438716063767178, + "loss": 1.3142, + "step": 4066 + }, + { + "epoch": 0.7241809116809117, + "grad_norm": 0.5432567000389099, + "learning_rate": 0.00018437964953501413, + "loss": 1.0192, + "step": 4067 + }, + { + "epoch": 0.7243589743589743, + "grad_norm": 0.5220325589179993, + "learning_rate": 0.00018437213677911253, + "loss": 1.0904, + "step": 4068 + }, + { + "epoch": 0.7245370370370371, + "grad_norm": 0.45711690187454224, + "learning_rate": 0.00018436462237011417, + "loss": 1.0417, + "step": 4069 + }, + { + "epoch": 0.7247150997150997, + "grad_norm": 0.560778021812439, + "learning_rate": 0.0001843571063081663, + "loss": 1.2316, + "step": 4070 + }, + { + "epoch": 0.7248931623931624, + "grad_norm": 0.591533362865448, + "learning_rate": 0.0001843495885934162, + "loss": 1.0294, + "step": 4071 + }, + { + "epoch": 0.7250712250712251, + "grad_norm": 0.5550443530082703, + "learning_rate": 0.00018434206922601106, + "loss": 1.0162, + "step": 4072 + }, + { + "epoch": 0.7252492877492878, + "grad_norm": 0.5744053721427917, + "learning_rate": 0.00018433454820609833, + "loss": 1.2774, + "step": 4073 + }, + { + "epoch": 0.7254273504273504, + "grad_norm": 0.6210703253746033, + "learning_rate": 0.0001843270255338253, + "loss": 1.2526, + "step": 4074 + }, + { + "epoch": 0.7256054131054132, + "grad_norm": 0.49684277176856995, + "learning_rate": 0.0001843195012093394, + "loss": 1.0786, + "step": 4075 + }, + { + "epoch": 0.7257834757834758, + "grad_norm": 0.5851606130599976, + "learning_rate": 0.00018431197523278802, + "loss": 1.14, + "step": 4076 + }, + { + "epoch": 0.7259615384615384, + "grad_norm": 0.5494425296783447, + "learning_rate": 0.00018430444760431862, + "loss": 1.211, + "step": 4077 + }, + { + "epoch": 0.7261396011396012, + "grad_norm": 0.5247658491134644, + "learning_rate": 0.00018429691832407867, + "loss": 0.8031, + "step": 4078 + }, + { + "epoch": 0.7263176638176638, + "grad_norm": 0.5012249946594238, + "learning_rate": 
0.00018428938739221574, + "loss": 1.1258, + "step": 4079 + }, + { + "epoch": 0.7264957264957265, + "grad_norm": 0.5226427912712097, + "learning_rate": 0.0001842818548088774, + "loss": 1.0029, + "step": 4080 + }, + { + "epoch": 0.7266737891737892, + "grad_norm": 0.45008543133735657, + "learning_rate": 0.00018427432057421114, + "loss": 1.0681, + "step": 4081 + }, + { + "epoch": 0.7268518518518519, + "grad_norm": 0.5127285122871399, + "learning_rate": 0.00018426678468836467, + "loss": 1.1069, + "step": 4082 + }, + { + "epoch": 0.7270299145299145, + "grad_norm": 0.5406150221824646, + "learning_rate": 0.0001842592471514856, + "loss": 1.052, + "step": 4083 + }, + { + "epoch": 0.7272079772079773, + "grad_norm": 0.5001157522201538, + "learning_rate": 0.0001842517079637216, + "loss": 0.9157, + "step": 4084 + }, + { + "epoch": 0.7273860398860399, + "grad_norm": 0.6169779300689697, + "learning_rate": 0.00018424416712522042, + "loss": 1.3133, + "step": 4085 + }, + { + "epoch": 0.7275641025641025, + "grad_norm": 0.4891316890716553, + "learning_rate": 0.00018423662463612974, + "loss": 0.9505, + "step": 4086 + }, + { + "epoch": 0.7277421652421653, + "grad_norm": 0.5883708596229553, + "learning_rate": 0.00018422908049659743, + "loss": 1.2797, + "step": 4087 + }, + { + "epoch": 0.7279202279202279, + "grad_norm": 0.6679072976112366, + "learning_rate": 0.00018422153470677125, + "loss": 1.1096, + "step": 4088 + }, + { + "epoch": 0.7280982905982906, + "grad_norm": 0.5178479552268982, + "learning_rate": 0.00018421398726679904, + "loss": 1.0299, + "step": 4089 + }, + { + "epoch": 0.7282763532763533, + "grad_norm": 0.6343900561332703, + "learning_rate": 0.0001842064381768287, + "loss": 1.2983, + "step": 4090 + }, + { + "epoch": 0.728454415954416, + "grad_norm": 0.43816515803337097, + "learning_rate": 0.0001841988874370081, + "loss": 0.9452, + "step": 4091 + }, + { + "epoch": 0.7286324786324786, + "grad_norm": 0.579790472984314, + "learning_rate": 0.00018419133504748528, + "loss": 1.1037, 
+ "step": 4092 + }, + { + "epoch": 0.7288105413105413, + "grad_norm": 0.571374773979187, + "learning_rate": 0.00018418378100840807, + "loss": 1.1655, + "step": 4093 + }, + { + "epoch": 0.728988603988604, + "grad_norm": 0.5163514018058777, + "learning_rate": 0.0001841762253199246, + "loss": 1.1579, + "step": 4094 + }, + { + "epoch": 0.7291666666666666, + "grad_norm": 0.6553022265434265, + "learning_rate": 0.0001841686679821828, + "loss": 0.9664, + "step": 4095 + }, + { + "epoch": 0.7293447293447294, + "grad_norm": 0.5072969198226929, + "learning_rate": 0.00018416110899533084, + "loss": 0.9416, + "step": 4096 + }, + { + "epoch": 0.729522792022792, + "grad_norm": 0.5103251338005066, + "learning_rate": 0.00018415354835951675, + "loss": 1.0715, + "step": 4097 + }, + { + "epoch": 0.7297008547008547, + "grad_norm": 0.49752289056777954, + "learning_rate": 0.00018414598607488874, + "loss": 1.1848, + "step": 4098 + }, + { + "epoch": 0.7298789173789174, + "grad_norm": 0.5361882448196411, + "learning_rate": 0.00018413842214159488, + "loss": 1.1035, + "step": 4099 + }, + { + "epoch": 0.73005698005698, + "grad_norm": 0.5167670249938965, + "learning_rate": 0.00018413085655978343, + "loss": 1.0015, + "step": 4100 + }, + { + "epoch": 0.7302350427350427, + "grad_norm": 0.5930629372596741, + "learning_rate": 0.00018412328932960263, + "loss": 0.9766, + "step": 4101 + }, + { + "epoch": 0.7304131054131054, + "grad_norm": 0.5234778523445129, + "learning_rate": 0.00018411572045120073, + "loss": 1.0317, + "step": 4102 + }, + { + "epoch": 0.7305911680911681, + "grad_norm": 0.5361374020576477, + "learning_rate": 0.000184108149924726, + "loss": 1.1228, + "step": 4103 + }, + { + "epoch": 0.7307692307692307, + "grad_norm": 0.5845770239830017, + "learning_rate": 0.0001841005777503268, + "loss": 0.9541, + "step": 4104 + }, + { + "epoch": 0.7309472934472935, + "grad_norm": 0.49320483207702637, + "learning_rate": 0.0001840930039281515, + "loss": 0.9445, + "step": 4105 + }, + { + "epoch": 
0.7311253561253561, + "grad_norm": 0.5391250252723694, + "learning_rate": 0.00018408542845834845, + "loss": 1.1983, + "step": 4106 + }, + { + "epoch": 0.7313034188034188, + "grad_norm": 0.4890393316745758, + "learning_rate": 0.00018407785134106613, + "loss": 0.8353, + "step": 4107 + }, + { + "epoch": 0.7314814814814815, + "grad_norm": 0.5839747190475464, + "learning_rate": 0.00018407027257645296, + "loss": 1.4074, + "step": 4108 + }, + { + "epoch": 0.7316595441595442, + "grad_norm": 0.5957708358764648, + "learning_rate": 0.0001840626921646574, + "loss": 1.1032, + "step": 4109 + }, + { + "epoch": 0.7318376068376068, + "grad_norm": 0.5029017925262451, + "learning_rate": 0.00018405511010582805, + "loss": 1.095, + "step": 4110 + }, + { + "epoch": 0.7320156695156695, + "grad_norm": 0.6054347157478333, + "learning_rate": 0.00018404752640011345, + "loss": 1.0366, + "step": 4111 + }, + { + "epoch": 0.7321937321937322, + "grad_norm": 0.5476830005645752, + "learning_rate": 0.00018403994104766212, + "loss": 1.0976, + "step": 4112 + }, + { + "epoch": 0.7323717948717948, + "grad_norm": 0.5000962615013123, + "learning_rate": 0.00018403235404862277, + "loss": 1.0809, + "step": 4113 + }, + { + "epoch": 0.7325498575498576, + "grad_norm": 0.5119251012802124, + "learning_rate": 0.00018402476540314394, + "loss": 1.0176, + "step": 4114 + }, + { + "epoch": 0.7327279202279202, + "grad_norm": 0.5825830698013306, + "learning_rate": 0.00018401717511137445, + "loss": 1.2357, + "step": 4115 + }, + { + "epoch": 0.7329059829059829, + "grad_norm": 0.5702941417694092, + "learning_rate": 0.0001840095831734629, + "loss": 1.1549, + "step": 4116 + }, + { + "epoch": 0.7330840455840456, + "grad_norm": 0.5660699605941772, + "learning_rate": 0.00018400198958955807, + "loss": 1.1778, + "step": 4117 + }, + { + "epoch": 0.7332621082621082, + "grad_norm": 0.5241161584854126, + "learning_rate": 0.0001839943943598088, + "loss": 0.8587, + "step": 4118 + }, + { + "epoch": 0.7334401709401709, + "grad_norm": 
0.581194281578064, + "learning_rate": 0.0001839867974843638, + "loss": 1.2169, + "step": 4119 + }, + { + "epoch": 0.7336182336182336, + "grad_norm": 0.4342379570007324, + "learning_rate": 0.00018397919896337198, + "loss": 0.9182, + "step": 4120 + }, + { + "epoch": 0.7337962962962963, + "grad_norm": 0.5708567500114441, + "learning_rate": 0.00018397159879698224, + "loss": 1.1781, + "step": 4121 + }, + { + "epoch": 0.7339743589743589, + "grad_norm": 0.5827265977859497, + "learning_rate": 0.00018396399698534344, + "loss": 1.2905, + "step": 4122 + }, + { + "epoch": 0.7341524216524217, + "grad_norm": 0.5274056792259216, + "learning_rate": 0.00018395639352860457, + "loss": 1.1786, + "step": 4123 + }, + { + "epoch": 0.7343304843304843, + "grad_norm": 0.5094266533851624, + "learning_rate": 0.00018394878842691452, + "loss": 1.2016, + "step": 4124 + }, + { + "epoch": 0.7345085470085471, + "grad_norm": 0.48779475688934326, + "learning_rate": 0.0001839411816804224, + "loss": 1.0562, + "step": 4125 + }, + { + "epoch": 0.7346866096866097, + "grad_norm": 0.5805709958076477, + "learning_rate": 0.00018393357328927716, + "loss": 1.1705, + "step": 4126 + }, + { + "epoch": 0.7348646723646723, + "grad_norm": 0.4910700023174286, + "learning_rate": 0.00018392596325362791, + "loss": 1.0682, + "step": 4127 + }, + { + "epoch": 0.7350427350427351, + "grad_norm": 0.5297428369522095, + "learning_rate": 0.0001839183515736238, + "loss": 0.9505, + "step": 4128 + }, + { + "epoch": 0.7352207977207977, + "grad_norm": 0.45442086458206177, + "learning_rate": 0.00018391073824941385, + "loss": 0.9548, + "step": 4129 + }, + { + "epoch": 0.7353988603988604, + "grad_norm": 0.49299946427345276, + "learning_rate": 0.00018390312328114733, + "loss": 1.0868, + "step": 4130 + }, + { + "epoch": 0.7355769230769231, + "grad_norm": 0.4839940369129181, + "learning_rate": 0.0001838955066689734, + "loss": 0.9565, + "step": 4131 + }, + { + "epoch": 0.7357549857549858, + "grad_norm": 0.48600608110427856, + 
"learning_rate": 0.00018388788841304128, + "loss": 1.2353, + "step": 4132 + }, + { + "epoch": 0.7359330484330484, + "grad_norm": 0.4893583357334137, + "learning_rate": 0.0001838802685135003, + "loss": 0.9595, + "step": 4133 + }, + { + "epoch": 0.7361111111111112, + "grad_norm": 0.4587398171424866, + "learning_rate": 0.00018387264697049963, + "loss": 1.1222, + "step": 4134 + }, + { + "epoch": 0.7362891737891738, + "grad_norm": 0.5361055731773376, + "learning_rate": 0.00018386502378418872, + "loss": 1.3304, + "step": 4135 + }, + { + "epoch": 0.7364672364672364, + "grad_norm": 0.5556629300117493, + "learning_rate": 0.00018385739895471686, + "loss": 1.0358, + "step": 4136 + }, + { + "epoch": 0.7366452991452992, + "grad_norm": 0.45555856823921204, + "learning_rate": 0.00018384977248223346, + "loss": 1.0081, + "step": 4137 + }, + { + "epoch": 0.7368233618233618, + "grad_norm": 0.5606052875518799, + "learning_rate": 0.00018384214436688797, + "loss": 0.9367, + "step": 4138 + }, + { + "epoch": 0.7370014245014245, + "grad_norm": 0.5428356528282166, + "learning_rate": 0.00018383451460882982, + "loss": 1.1391, + "step": 4139 + }, + { + "epoch": 0.7371794871794872, + "grad_norm": 0.4891330897808075, + "learning_rate": 0.00018382688320820853, + "loss": 0.9805, + "step": 4140 + }, + { + "epoch": 0.7373575498575499, + "grad_norm": 0.5407996773719788, + "learning_rate": 0.0001838192501651736, + "loss": 1.0532, + "step": 4141 + }, + { + "epoch": 0.7375356125356125, + "grad_norm": 0.5241971611976624, + "learning_rate": 0.00018381161547987454, + "loss": 0.9509, + "step": 4142 + }, + { + "epoch": 0.7377136752136753, + "grad_norm": 0.5370210409164429, + "learning_rate": 0.000183803979152461, + "loss": 1.2342, + "step": 4143 + }, + { + "epoch": 0.7378917378917379, + "grad_norm": 0.5470060706138611, + "learning_rate": 0.00018379634118308259, + "loss": 0.9621, + "step": 4144 + }, + { + "epoch": 0.7380698005698005, + "grad_norm": 0.546313464641571, + "learning_rate": 0.00018378870157188893, 
+ "loss": 1.1253, + "step": 4145 + }, + { + "epoch": 0.7382478632478633, + "grad_norm": 0.502027153968811, + "learning_rate": 0.00018378106031902974, + "loss": 1.1919, + "step": 4146 + }, + { + "epoch": 0.7384259259259259, + "grad_norm": 0.5282283425331116, + "learning_rate": 0.0001837734174246547, + "loss": 1.0088, + "step": 4147 + }, + { + "epoch": 0.7386039886039886, + "grad_norm": 0.5152897238731384, + "learning_rate": 0.00018376577288891355, + "loss": 1.0813, + "step": 4148 + }, + { + "epoch": 0.7387820512820513, + "grad_norm": 0.5002804398536682, + "learning_rate": 0.0001837581267119561, + "loss": 0.9797, + "step": 4149 + }, + { + "epoch": 0.738960113960114, + "grad_norm": 0.5698176026344299, + "learning_rate": 0.00018375047889393215, + "loss": 1.1099, + "step": 4150 + }, + { + "epoch": 0.7391381766381766, + "grad_norm": 0.5384604930877686, + "learning_rate": 0.00018374282943499156, + "loss": 1.1944, + "step": 4151 + }, + { + "epoch": 0.7393162393162394, + "grad_norm": 0.5483044385910034, + "learning_rate": 0.00018373517833528418, + "loss": 1.1734, + "step": 4152 + }, + { + "epoch": 0.739494301994302, + "grad_norm": 0.4824066162109375, + "learning_rate": 0.0001837275255949599, + "loss": 0.9515, + "step": 4153 + }, + { + "epoch": 0.7396723646723646, + "grad_norm": 0.45413634181022644, + "learning_rate": 0.00018371987121416873, + "loss": 0.7534, + "step": 4154 + }, + { + "epoch": 0.7398504273504274, + "grad_norm": 0.5874246954917908, + "learning_rate": 0.00018371221519306055, + "loss": 0.9464, + "step": 4155 + }, + { + "epoch": 0.74002849002849, + "grad_norm": 0.5219913125038147, + "learning_rate": 0.00018370455753178544, + "loss": 1.0494, + "step": 4156 + }, + { + "epoch": 0.7402065527065527, + "grad_norm": 0.5937709212303162, + "learning_rate": 0.00018369689823049341, + "loss": 1.0529, + "step": 4157 + }, + { + "epoch": 0.7403846153846154, + "grad_norm": 0.5204295516014099, + "learning_rate": 0.00018368923728933449, + "loss": 1.0602, + "step": 4158 + }, + { + 
"epoch": 0.7405626780626781, + "grad_norm": 0.5422890186309814, + "learning_rate": 0.00018368157470845885, + "loss": 0.9261, + "step": 4159 + }, + { + "epoch": 0.7407407407407407, + "grad_norm": 0.6163852214813232, + "learning_rate": 0.00018367391048801655, + "loss": 1.2771, + "step": 4160 + }, + { + "epoch": 0.7409188034188035, + "grad_norm": 0.5070751309394836, + "learning_rate": 0.00018366624462815785, + "loss": 1.0401, + "step": 4161 + }, + { + "epoch": 0.7410968660968661, + "grad_norm": 0.4477100968360901, + "learning_rate": 0.00018365857712903283, + "loss": 1.1463, + "step": 4162 + }, + { + "epoch": 0.7412749287749287, + "grad_norm": 0.5421462655067444, + "learning_rate": 0.0001836509079907918, + "loss": 0.9373, + "step": 4163 + }, + { + "epoch": 0.7414529914529915, + "grad_norm": 0.6162141561508179, + "learning_rate": 0.000183643237213585, + "loss": 1.1827, + "step": 4164 + }, + { + "epoch": 0.7416310541310541, + "grad_norm": 0.5653836131095886, + "learning_rate": 0.00018363556479756272, + "loss": 1.0689, + "step": 4165 + }, + { + "epoch": 0.7418091168091168, + "grad_norm": 0.57053542137146, + "learning_rate": 0.00018362789074287527, + "loss": 1.0289, + "step": 4166 + }, + { + "epoch": 0.7419871794871795, + "grad_norm": 0.5603055953979492, + "learning_rate": 0.00018362021504967304, + "loss": 1.1926, + "step": 4167 + }, + { + "epoch": 0.7421652421652422, + "grad_norm": 0.5460166335105896, + "learning_rate": 0.0001836125377181064, + "loss": 1.1488, + "step": 4168 + }, + { + "epoch": 0.7423433048433048, + "grad_norm": 0.5097107887268066, + "learning_rate": 0.00018360485874832579, + "loss": 1.0781, + "step": 4169 + }, + { + "epoch": 0.7425213675213675, + "grad_norm": 0.6280624270439148, + "learning_rate": 0.00018359717814048164, + "loss": 1.3625, + "step": 4170 + }, + { + "epoch": 0.7426994301994302, + "grad_norm": 0.4528210759162903, + "learning_rate": 0.0001835894958947244, + "loss": 0.8417, + "step": 4171 + }, + { + "epoch": 0.7428774928774928, + "grad_norm": 
0.48735132813453674, + "learning_rate": 0.00018358181201120468, + "loss": 0.9544, + "step": 4172 + }, + { + "epoch": 0.7430555555555556, + "grad_norm": 0.48388174176216125, + "learning_rate": 0.00018357412649007296, + "loss": 1.0663, + "step": 4173 + }, + { + "epoch": 0.7432336182336182, + "grad_norm": 0.5435357689857483, + "learning_rate": 0.00018356643933147986, + "loss": 1.2074, + "step": 4174 + }, + { + "epoch": 0.7434116809116809, + "grad_norm": 0.49890074133872986, + "learning_rate": 0.00018355875053557594, + "loss": 1.1322, + "step": 4175 + }, + { + "epoch": 0.7435897435897436, + "grad_norm": 0.5680708885192871, + "learning_rate": 0.0001835510601025119, + "loss": 1.1964, + "step": 4176 + }, + { + "epoch": 0.7437678062678063, + "grad_norm": 0.5002360939979553, + "learning_rate": 0.00018354336803243842, + "loss": 1.1396, + "step": 4177 + }, + { + "epoch": 0.7439458689458689, + "grad_norm": 0.5202965140342712, + "learning_rate": 0.00018353567432550616, + "loss": 1.1498, + "step": 4178 + }, + { + "epoch": 0.7441239316239316, + "grad_norm": 0.514492928981781, + "learning_rate": 0.00018352797898186588, + "loss": 1.0959, + "step": 4179 + }, + { + "epoch": 0.7443019943019943, + "grad_norm": 0.6395383477210999, + "learning_rate": 0.0001835202820016684, + "loss": 1.2867, + "step": 4180 + }, + { + "epoch": 0.7444800569800569, + "grad_norm": 0.5489062070846558, + "learning_rate": 0.00018351258338506447, + "loss": 1.1638, + "step": 4181 + }, + { + "epoch": 0.7446581196581197, + "grad_norm": 0.5705671906471252, + "learning_rate": 0.00018350488313220498, + "loss": 0.9493, + "step": 4182 + }, + { + "epoch": 0.7448361823361823, + "grad_norm": 0.5404297709465027, + "learning_rate": 0.00018349718124324076, + "loss": 0.9876, + "step": 4183 + }, + { + "epoch": 0.7450142450142451, + "grad_norm": 0.5841003060340881, + "learning_rate": 0.0001834894777183227, + "loss": 1.1225, + "step": 4184 + }, + { + "epoch": 0.7451923076923077, + "grad_norm": 0.49774688482284546, + 
"learning_rate": 0.00018348177255760178, + "loss": 1.1442, + "step": 4185 + }, + { + "epoch": 0.7453703703703703, + "grad_norm": 0.5212422609329224, + "learning_rate": 0.00018347406576122894, + "loss": 1.101, + "step": 4186 + }, + { + "epoch": 0.7455484330484331, + "grad_norm": 0.615024983882904, + "learning_rate": 0.00018346635732935517, + "loss": 1.4188, + "step": 4187 + }, + { + "epoch": 0.7457264957264957, + "grad_norm": 0.46818843483924866, + "learning_rate": 0.00018345864726213154, + "loss": 1.0071, + "step": 4188 + }, + { + "epoch": 0.7459045584045584, + "grad_norm": 0.4921121895313263, + "learning_rate": 0.00018345093555970906, + "loss": 1.015, + "step": 4189 + }, + { + "epoch": 0.7460826210826211, + "grad_norm": 0.5042136311531067, + "learning_rate": 0.00018344322222223889, + "loss": 0.9974, + "step": 4190 + }, + { + "epoch": 0.7462606837606838, + "grad_norm": 0.5872490406036377, + "learning_rate": 0.0001834355072498721, + "loss": 1.3166, + "step": 4191 + }, + { + "epoch": 0.7464387464387464, + "grad_norm": 0.559117317199707, + "learning_rate": 0.00018342779064275984, + "loss": 1.2227, + "step": 4192 + }, + { + "epoch": 0.7466168091168092, + "grad_norm": 0.5269635319709778, + "learning_rate": 0.00018342007240105336, + "loss": 1.0281, + "step": 4193 + }, + { + "epoch": 0.7467948717948718, + "grad_norm": 0.4608335793018341, + "learning_rate": 0.00018341235252490387, + "loss": 0.98, + "step": 4194 + }, + { + "epoch": 0.7469729344729344, + "grad_norm": 0.5818259119987488, + "learning_rate": 0.00018340463101446255, + "loss": 1.1544, + "step": 4195 + }, + { + "epoch": 0.7471509971509972, + "grad_norm": 0.5577529668807983, + "learning_rate": 0.00018339690786988079, + "loss": 1.3059, + "step": 4196 + }, + { + "epoch": 0.7473290598290598, + "grad_norm": 0.5430468320846558, + "learning_rate": 0.00018338918309130983, + "loss": 1.2766, + "step": 4197 + }, + { + "epoch": 0.7475071225071225, + "grad_norm": 0.4941701591014862, + "learning_rate": 0.0001833814566789011, + 
"loss": 1.193, + "step": 4198 + }, + { + "epoch": 0.7476851851851852, + "grad_norm": 0.5471884608268738, + "learning_rate": 0.00018337372863280589, + "loss": 1.2261, + "step": 4199 + }, + { + "epoch": 0.7478632478632479, + "grad_norm": 0.4641438126564026, + "learning_rate": 0.0001833659989531757, + "loss": 0.7953, + "step": 4200 + }, + { + "epoch": 0.7480413105413105, + "grad_norm": 0.5244714617729187, + "learning_rate": 0.0001833582676401619, + "loss": 0.9344, + "step": 4201 + }, + { + "epoch": 0.7482193732193733, + "grad_norm": 0.5964360237121582, + "learning_rate": 0.00018335053469391603, + "loss": 1.2072, + "step": 4202 + }, + { + "epoch": 0.7483974358974359, + "grad_norm": 0.4929158091545105, + "learning_rate": 0.00018334280011458954, + "loss": 1.2183, + "step": 4203 + }, + { + "epoch": 0.7485754985754985, + "grad_norm": 0.46221864223480225, + "learning_rate": 0.00018333506390233405, + "loss": 1.1957, + "step": 4204 + }, + { + "epoch": 0.7487535612535613, + "grad_norm": 0.6301732659339905, + "learning_rate": 0.0001833273260573011, + "loss": 1.0582, + "step": 4205 + }, + { + "epoch": 0.7489316239316239, + "grad_norm": 0.5606021881103516, + "learning_rate": 0.0001833195865796423, + "loss": 1.4034, + "step": 4206 + }, + { + "epoch": 0.7491096866096866, + "grad_norm": 0.44856077432632446, + "learning_rate": 0.00018331184546950926, + "loss": 0.8421, + "step": 4207 + }, + { + "epoch": 0.7492877492877493, + "grad_norm": 0.5487226247787476, + "learning_rate": 0.00018330410272705366, + "loss": 1.238, + "step": 4208 + }, + { + "epoch": 0.749465811965812, + "grad_norm": 0.6043636798858643, + "learning_rate": 0.00018329635835242724, + "loss": 1.1215, + "step": 4209 + }, + { + "epoch": 0.7496438746438746, + "grad_norm": 0.5145319104194641, + "learning_rate": 0.00018328861234578173, + "loss": 1.1002, + "step": 4210 + }, + { + "epoch": 0.7498219373219374, + "grad_norm": 0.5667078495025635, + "learning_rate": 0.00018328086470726884, + "loss": 1.2994, + "step": 4211 + }, + { + 
"epoch": 0.75, + "grad_norm": 0.5117634534835815, + "learning_rate": 0.00018327311543704043, + "loss": 0.9448, + "step": 4212 + }, + { + "epoch": 0.75, + "eval_loss": 1.0982474088668823, + "eval_runtime": 24.6617, + "eval_samples_per_second": 42.211, + "eval_steps_per_second": 21.126, + "step": 4212 + }, + { + "epoch": 0.7501780626780626, + "grad_norm": 0.5451585054397583, + "learning_rate": 0.00018326536453524826, + "loss": 0.9023, + "step": 4213 + }, + { + "epoch": 0.7503561253561254, + "grad_norm": 0.6585208773612976, + "learning_rate": 0.0001832576120020443, + "loss": 1.2798, + "step": 4214 + }, + { + "epoch": 0.750534188034188, + "grad_norm": 0.6444812417030334, + "learning_rate": 0.00018324985783758037, + "loss": 1.3999, + "step": 4215 + }, + { + "epoch": 0.7507122507122507, + "grad_norm": 0.6178330779075623, + "learning_rate": 0.0001832421020420084, + "loss": 1.1846, + "step": 4216 + }, + { + "epoch": 0.7508903133903134, + "grad_norm": 0.509969174861908, + "learning_rate": 0.00018323434461548036, + "loss": 1.1831, + "step": 4217 + }, + { + "epoch": 0.7510683760683761, + "grad_norm": 0.5558911561965942, + "learning_rate": 0.00018322658555814826, + "loss": 1.1599, + "step": 4218 + }, + { + "epoch": 0.7512464387464387, + "grad_norm": 0.5714917778968811, + "learning_rate": 0.0001832188248701641, + "loss": 0.9702, + "step": 4219 + }, + { + "epoch": 0.7514245014245015, + "grad_norm": 0.6136442422866821, + "learning_rate": 0.00018321106255167995, + "loss": 0.9376, + "step": 4220 + }, + { + "epoch": 0.7516025641025641, + "grad_norm": 0.5832077264785767, + "learning_rate": 0.00018320329860284785, + "loss": 1.2564, + "step": 4221 + }, + { + "epoch": 0.7517806267806267, + "grad_norm": 0.45330923795700073, + "learning_rate": 0.00018319553302381997, + "loss": 0.9321, + "step": 4222 + }, + { + "epoch": 0.7519586894586895, + "grad_norm": 0.5278468132019043, + "learning_rate": 0.00018318776581474847, + "loss": 1.1334, + "step": 4223 + }, + { + "epoch": 0.7521367521367521, + 
"grad_norm": 0.49267473816871643, + "learning_rate": 0.00018317999697578549, + "loss": 1.1577, + "step": 4224 + }, + { + "epoch": 0.7523148148148148, + "grad_norm": 0.5372124314308167, + "learning_rate": 0.00018317222650708325, + "loss": 1.037, + "step": 4225 + }, + { + "epoch": 0.7524928774928775, + "grad_norm": 0.5879829525947571, + "learning_rate": 0.000183164454408794, + "loss": 1.1312, + "step": 4226 + }, + { + "epoch": 0.7526709401709402, + "grad_norm": 0.5363932251930237, + "learning_rate": 0.00018315668068107004, + "loss": 1.174, + "step": 4227 + }, + { + "epoch": 0.7528490028490028, + "grad_norm": 0.5585991740226746, + "learning_rate": 0.00018314890532406366, + "loss": 1.2106, + "step": 4228 + }, + { + "epoch": 0.7530270655270656, + "grad_norm": 0.49395787715911865, + "learning_rate": 0.0001831411283379272, + "loss": 1.1163, + "step": 4229 + }, + { + "epoch": 0.7532051282051282, + "grad_norm": 0.5081066489219666, + "learning_rate": 0.00018313334972281306, + "loss": 1.184, + "step": 4230 + }, + { + "epoch": 0.7533831908831908, + "grad_norm": 0.40304034948349, + "learning_rate": 0.0001831255694788736, + "loss": 0.7548, + "step": 4231 + }, + { + "epoch": 0.7535612535612536, + "grad_norm": 0.4999815821647644, + "learning_rate": 0.0001831177876062613, + "loss": 1.0092, + "step": 4232 + }, + { + "epoch": 0.7537393162393162, + "grad_norm": 0.48917025327682495, + "learning_rate": 0.00018311000410512862, + "loss": 1.0354, + "step": 4233 + }, + { + "epoch": 0.7539173789173789, + "grad_norm": 0.475606769323349, + "learning_rate": 0.00018310221897562806, + "loss": 0.8728, + "step": 4234 + }, + { + "epoch": 0.7540954415954416, + "grad_norm": 0.630439817905426, + "learning_rate": 0.00018309443221791214, + "loss": 1.1436, + "step": 4235 + }, + { + "epoch": 0.7542735042735043, + "grad_norm": 0.524740993976593, + "learning_rate": 0.00018308664383213344, + "loss": 1.0487, + "step": 4236 + }, + { + "epoch": 0.7544515669515669, + "grad_norm": 0.4734523892402649, + 
"learning_rate": 0.0001830788538184445, + "loss": 1.0681, + "step": 4237 + }, + { + "epoch": 0.7546296296296297, + "grad_norm": 0.5767266750335693, + "learning_rate": 0.00018307106217699807, + "loss": 1.0599, + "step": 4238 + }, + { + "epoch": 0.7548076923076923, + "grad_norm": 0.6276642084121704, + "learning_rate": 0.0001830632689079467, + "loss": 1.2837, + "step": 4239 + }, + { + "epoch": 0.7549857549857549, + "grad_norm": 0.5539988279342651, + "learning_rate": 0.00018305547401144316, + "loss": 0.9072, + "step": 4240 + }, + { + "epoch": 0.7551638176638177, + "grad_norm": 0.4551292061805725, + "learning_rate": 0.00018304767748764014, + "loss": 1.0204, + "step": 4241 + }, + { + "epoch": 0.7553418803418803, + "grad_norm": 0.47344550490379333, + "learning_rate": 0.00018303987933669034, + "loss": 1.0473, + "step": 4242 + }, + { + "epoch": 0.7555199430199431, + "grad_norm": 0.6050213575363159, + "learning_rate": 0.00018303207955874665, + "loss": 1.1552, + "step": 4243 + }, + { + "epoch": 0.7556980056980057, + "grad_norm": 0.48943889141082764, + "learning_rate": 0.00018302427815396186, + "loss": 1.0002, + "step": 4244 + }, + { + "epoch": 0.7558760683760684, + "grad_norm": 0.5664682984352112, + "learning_rate": 0.00018301647512248878, + "loss": 1.1865, + "step": 4245 + }, + { + "epoch": 0.7560541310541311, + "grad_norm": 0.5702242255210876, + "learning_rate": 0.00018300867046448034, + "loss": 1.3029, + "step": 4246 + }, + { + "epoch": 0.7562321937321937, + "grad_norm": 0.593207836151123, + "learning_rate": 0.00018300086418008942, + "loss": 1.109, + "step": 4247 + }, + { + "epoch": 0.7564102564102564, + "grad_norm": 0.5887887477874756, + "learning_rate": 0.000182993056269469, + "loss": 1.3022, + "step": 4248 + }, + { + "epoch": 0.7565883190883191, + "grad_norm": 0.5277966260910034, + "learning_rate": 0.00018298524673277203, + "loss": 1.1738, + "step": 4249 + }, + { + "epoch": 0.7567663817663818, + "grad_norm": 0.589347779750824, + "learning_rate": 0.00018297743557015155, 
+ "loss": 1.0185, + "step": 4250 + }, + { + "epoch": 0.7569444444444444, + "grad_norm": 0.49920859932899475, + "learning_rate": 0.0001829696227817606, + "loss": 1.118, + "step": 4251 + }, + { + "epoch": 0.7571225071225072, + "grad_norm": 0.502565324306488, + "learning_rate": 0.0001829618083677522, + "loss": 1.1856, + "step": 4252 + }, + { + "epoch": 0.7573005698005698, + "grad_norm": 0.49814435839653015, + "learning_rate": 0.00018295399232827955, + "loss": 1.0432, + "step": 4253 + }, + { + "epoch": 0.7574786324786325, + "grad_norm": 0.5087502598762512, + "learning_rate": 0.00018294617466349574, + "loss": 1.2325, + "step": 4254 + }, + { + "epoch": 0.7576566951566952, + "grad_norm": 0.5107288956642151, + "learning_rate": 0.00018293835537355394, + "loss": 1.0487, + "step": 4255 + }, + { + "epoch": 0.7578347578347578, + "grad_norm": 0.524725615978241, + "learning_rate": 0.00018293053445860732, + "loss": 1.1821, + "step": 4256 + }, + { + "epoch": 0.7580128205128205, + "grad_norm": 0.5234082937240601, + "learning_rate": 0.0001829227119188092, + "loss": 0.8896, + "step": 4257 + }, + { + "epoch": 0.7581908831908832, + "grad_norm": 0.5102918744087219, + "learning_rate": 0.00018291488775431275, + "loss": 1.0246, + "step": 4258 + }, + { + "epoch": 0.7583689458689459, + "grad_norm": 0.5552714467048645, + "learning_rate": 0.00018290706196527135, + "loss": 1.0193, + "step": 4259 + }, + { + "epoch": 0.7585470085470085, + "grad_norm": 0.5395022630691528, + "learning_rate": 0.00018289923455183825, + "loss": 1.3203, + "step": 4260 + }, + { + "epoch": 0.7587250712250713, + "grad_norm": 0.7474865913391113, + "learning_rate": 0.00018289140551416692, + "loss": 1.182, + "step": 4261 + }, + { + "epoch": 0.7589031339031339, + "grad_norm": 0.4892016649246216, + "learning_rate": 0.00018288357485241066, + "loss": 0.968, + "step": 4262 + }, + { + "epoch": 0.7590811965811965, + "grad_norm": 0.4627816081047058, + "learning_rate": 0.00018287574256672291, + "loss": 0.6895, + "step": 4263 + }, + { 
+ "epoch": 0.7592592592592593, + "grad_norm": 0.6221280097961426, + "learning_rate": 0.00018286790865725715, + "loss": 0.9691, + "step": 4264 + }, + { + "epoch": 0.7594373219373219, + "grad_norm": 0.5542295575141907, + "learning_rate": 0.0001828600731241669, + "loss": 0.9996, + "step": 4265 + }, + { + "epoch": 0.7596153846153846, + "grad_norm": 0.5570770502090454, + "learning_rate": 0.00018285223596760562, + "loss": 1.1996, + "step": 4266 + }, + { + "epoch": 0.7597934472934473, + "grad_norm": 0.5495262742042542, + "learning_rate": 0.00018284439718772687, + "loss": 1.1572, + "step": 4267 + }, + { + "epoch": 0.75997150997151, + "grad_norm": 0.5006741881370544, + "learning_rate": 0.00018283655678468427, + "loss": 1.1215, + "step": 4268 + }, + { + "epoch": 0.7601495726495726, + "grad_norm": 0.4682157635688782, + "learning_rate": 0.00018282871475863144, + "loss": 1.0547, + "step": 4269 + }, + { + "epoch": 0.7603276353276354, + "grad_norm": 0.6275840997695923, + "learning_rate": 0.00018282087110972197, + "loss": 1.3855, + "step": 4270 + }, + { + "epoch": 0.760505698005698, + "grad_norm": 0.5341474413871765, + "learning_rate": 0.0001828130258381096, + "loss": 1.2024, + "step": 4271 + }, + { + "epoch": 0.7606837606837606, + "grad_norm": 0.4330833852291107, + "learning_rate": 0.000182805178943948, + "loss": 1.0508, + "step": 4272 + }, + { + "epoch": 0.7608618233618234, + "grad_norm": 0.6276537179946899, + "learning_rate": 0.00018279733042739094, + "loss": 1.1635, + "step": 4273 + }, + { + "epoch": 0.761039886039886, + "grad_norm": 0.5370199084281921, + "learning_rate": 0.00018278948028859217, + "loss": 1.0579, + "step": 4274 + }, + { + "epoch": 0.7612179487179487, + "grad_norm": 0.524959921836853, + "learning_rate": 0.00018278162852770552, + "loss": 1.0972, + "step": 4275 + }, + { + "epoch": 0.7613960113960114, + "grad_norm": 0.5029389262199402, + "learning_rate": 0.00018277377514488486, + "loss": 0.959, + "step": 4276 + }, + { + "epoch": 0.7615740740740741, + "grad_norm": 
0.49772894382476807, + "learning_rate": 0.00018276592014028397, + "loss": 1.2773, + "step": 4277 + }, + { + "epoch": 0.7617521367521367, + "grad_norm": 0.5195719003677368, + "learning_rate": 0.00018275806351405685, + "loss": 1.0676, + "step": 4278 + }, + { + "epoch": 0.7619301994301995, + "grad_norm": 0.5167942643165588, + "learning_rate": 0.00018275020526635735, + "loss": 1.0615, + "step": 4279 + }, + { + "epoch": 0.7621082621082621, + "grad_norm": 0.4958035945892334, + "learning_rate": 0.0001827423453973395, + "loss": 0.9605, + "step": 4280 + }, + { + "epoch": 0.7622863247863247, + "grad_norm": 0.6256808042526245, + "learning_rate": 0.00018273448390715728, + "loss": 1.2526, + "step": 4281 + }, + { + "epoch": 0.7624643874643875, + "grad_norm": 0.5062580108642578, + "learning_rate": 0.0001827266207959647, + "loss": 1.0604, + "step": 4282 + }, + { + "epoch": 0.7626424501424501, + "grad_norm": 0.5080778002738953, + "learning_rate": 0.00018271875606391583, + "loss": 1.1246, + "step": 4283 + }, + { + "epoch": 0.7628205128205128, + "grad_norm": 0.5069389939308167, + "learning_rate": 0.00018271088971116479, + "loss": 1.3158, + "step": 4284 + }, + { + "epoch": 0.7629985754985755, + "grad_norm": 0.7280121445655823, + "learning_rate": 0.00018270302173786567, + "loss": 1.2066, + "step": 4285 + }, + { + "epoch": 0.7631766381766382, + "grad_norm": 0.6523470282554626, + "learning_rate": 0.00018269515214417267, + "loss": 1.3236, + "step": 4286 + }, + { + "epoch": 0.7633547008547008, + "grad_norm": 0.5799322724342346, + "learning_rate": 0.00018268728093023988, + "loss": 0.9786, + "step": 4287 + }, + { + "epoch": 0.7635327635327636, + "grad_norm": 0.46675166487693787, + "learning_rate": 0.00018267940809622163, + "loss": 0.8131, + "step": 4288 + }, + { + "epoch": 0.7637108262108262, + "grad_norm": 0.5566182732582092, + "learning_rate": 0.00018267153364227214, + "loss": 1.0565, + "step": 4289 + }, + { + "epoch": 0.7638888888888888, + "grad_norm": 0.532028079032898, + 
"learning_rate": 0.00018266365756854566, + "loss": 0.952, + "step": 4290 + }, + { + "epoch": 0.7640669515669516, + "grad_norm": 0.5082666873931885, + "learning_rate": 0.00018265577987519653, + "loss": 1.0704, + "step": 4291 + }, + { + "epoch": 0.7642450142450142, + "grad_norm": 0.5223562717437744, + "learning_rate": 0.00018264790056237912, + "loss": 1.1161, + "step": 4292 + }, + { + "epoch": 0.7644230769230769, + "grad_norm": 0.48472318053245544, + "learning_rate": 0.00018264001963024778, + "loss": 0.8784, + "step": 4293 + }, + { + "epoch": 0.7646011396011396, + "grad_norm": 0.5901281833648682, + "learning_rate": 0.0001826321370789569, + "loss": 1.1031, + "step": 4294 + }, + { + "epoch": 0.7647792022792023, + "grad_norm": 0.570350706577301, + "learning_rate": 0.000182624252908661, + "loss": 0.9047, + "step": 4295 + }, + { + "epoch": 0.7649572649572649, + "grad_norm": 0.568373441696167, + "learning_rate": 0.00018261636711951445, + "loss": 1.0106, + "step": 4296 + }, + { + "epoch": 0.7651353276353277, + "grad_norm": 0.6175880432128906, + "learning_rate": 0.00018260847971167182, + "loss": 1.3531, + "step": 4297 + }, + { + "epoch": 0.7653133903133903, + "grad_norm": 0.5682594776153564, + "learning_rate": 0.00018260059068528762, + "loss": 1.1261, + "step": 4298 + }, + { + "epoch": 0.7654914529914529, + "grad_norm": 0.5050225257873535, + "learning_rate": 0.00018259270004051644, + "loss": 1.0921, + "step": 4299 + }, + { + "epoch": 0.7656695156695157, + "grad_norm": 0.5416565537452698, + "learning_rate": 0.0001825848077775129, + "loss": 1.0881, + "step": 4300 + }, + { + "epoch": 0.7658475783475783, + "grad_norm": 0.5418867468833923, + "learning_rate": 0.0001825769138964316, + "loss": 1.2069, + "step": 4301 + }, + { + "epoch": 0.7660256410256411, + "grad_norm": 0.5447866320610046, + "learning_rate": 0.00018256901839742718, + "loss": 1.1827, + "step": 4302 + }, + { + "epoch": 0.7662037037037037, + "grad_norm": 0.5482802987098694, + "learning_rate": 0.00018256112128065439, + 
"loss": 1.0492, + "step": 4303 + }, + { + "epoch": 0.7663817663817664, + "grad_norm": 0.5059601664543152, + "learning_rate": 0.0001825532225462679, + "loss": 1.0996, + "step": 4304 + }, + { + "epoch": 0.7665598290598291, + "grad_norm": 0.5153701901435852, + "learning_rate": 0.00018254532219442258, + "loss": 1.3237, + "step": 4305 + }, + { + "epoch": 0.7667378917378918, + "grad_norm": 0.5370768904685974, + "learning_rate": 0.0001825374202252731, + "loss": 0.9925, + "step": 4306 + }, + { + "epoch": 0.7669159544159544, + "grad_norm": 0.4516580402851105, + "learning_rate": 0.00018252951663897432, + "loss": 1.0749, + "step": 4307 + }, + { + "epoch": 0.7670940170940171, + "grad_norm": 0.5565171837806702, + "learning_rate": 0.0001825216114356811, + "loss": 1.1617, + "step": 4308 + }, + { + "epoch": 0.7672720797720798, + "grad_norm": 0.5212662220001221, + "learning_rate": 0.00018251370461554834, + "loss": 1.1108, + "step": 4309 + }, + { + "epoch": 0.7674501424501424, + "grad_norm": 0.49061715602874756, + "learning_rate": 0.00018250579617873095, + "loss": 1.0881, + "step": 4310 + }, + { + "epoch": 0.7676282051282052, + "grad_norm": 0.5535751581192017, + "learning_rate": 0.00018249788612538387, + "loss": 0.9341, + "step": 4311 + }, + { + "epoch": 0.7678062678062678, + "grad_norm": 0.5425209403038025, + "learning_rate": 0.00018248997445566208, + "loss": 1.1858, + "step": 4312 + }, + { + "epoch": 0.7679843304843305, + "grad_norm": 0.6224395036697388, + "learning_rate": 0.0001824820611697206, + "loss": 1.0836, + "step": 4313 + }, + { + "epoch": 0.7681623931623932, + "grad_norm": 0.4895690977573395, + "learning_rate": 0.00018247414626771445, + "loss": 0.8598, + "step": 4314 + }, + { + "epoch": 0.7683404558404558, + "grad_norm": 0.5279615521430969, + "learning_rate": 0.00018246622974979877, + "loss": 1.1742, + "step": 4315 + }, + { + "epoch": 0.7685185185185185, + "grad_norm": 0.45300471782684326, + "learning_rate": 0.0001824583116161286, + "loss": 0.8872, + "step": 4316 + }, + { 
+ "epoch": 0.7686965811965812, + "grad_norm": 0.6499692797660828, + "learning_rate": 0.00018245039186685916, + "loss": 1.2495, + "step": 4317 + }, + { + "epoch": 0.7688746438746439, + "grad_norm": 0.48151278495788574, + "learning_rate": 0.00018244247050214552, + "loss": 1.2382, + "step": 4318 + }, + { + "epoch": 0.7690527065527065, + "grad_norm": 0.6597028374671936, + "learning_rate": 0.0001824345475221429, + "loss": 1.3453, + "step": 4319 + }, + { + "epoch": 0.7692307692307693, + "grad_norm": 0.4536992609500885, + "learning_rate": 0.0001824266229270066, + "loss": 1.1141, + "step": 4320 + }, + { + "epoch": 0.7694088319088319, + "grad_norm": 0.5489405393600464, + "learning_rate": 0.00018241869671689184, + "loss": 1.0333, + "step": 4321 + }, + { + "epoch": 0.7695868945868946, + "grad_norm": 0.5741586089134216, + "learning_rate": 0.00018241076889195394, + "loss": 0.9939, + "step": 4322 + }, + { + "epoch": 0.7697649572649573, + "grad_norm": 0.47170960903167725, + "learning_rate": 0.00018240283945234823, + "loss": 0.9878, + "step": 4323 + }, + { + "epoch": 0.76994301994302, + "grad_norm": 0.4729093313217163, + "learning_rate": 0.00018239490839823004, + "loss": 1.0087, + "step": 4324 + }, + { + "epoch": 0.7701210826210826, + "grad_norm": 0.49869823455810547, + "learning_rate": 0.0001823869757297548, + "loss": 1.169, + "step": 4325 + }, + { + "epoch": 0.7702991452991453, + "grad_norm": 0.5118468403816223, + "learning_rate": 0.0001823790414470779, + "loss": 1.1092, + "step": 4326 + }, + { + "epoch": 0.770477207977208, + "grad_norm": 0.5076048970222473, + "learning_rate": 0.0001823711055503548, + "loss": 1.1028, + "step": 4327 + }, + { + "epoch": 0.7706552706552706, + "grad_norm": 0.5661569237709045, + "learning_rate": 0.00018236316803974098, + "loss": 1.1114, + "step": 4328 + }, + { + "epoch": 0.7708333333333334, + "grad_norm": 0.5542354583740234, + "learning_rate": 0.000182355228915392, + "loss": 1.0931, + "step": 4329 + }, + { + "epoch": 0.771011396011396, + "grad_norm": 
0.5476680994033813, + "learning_rate": 0.0001823472881774634, + "loss": 1.036, + "step": 4330 + }, + { + "epoch": 0.7711894586894587, + "grad_norm": 0.5449798703193665, + "learning_rate": 0.00018233934582611073, + "loss": 1.0682, + "step": 4331 + }, + { + "epoch": 0.7713675213675214, + "grad_norm": 0.61089026927948, + "learning_rate": 0.00018233140186148963, + "loss": 1.0748, + "step": 4332 + }, + { + "epoch": 0.771545584045584, + "grad_norm": 0.5015206336975098, + "learning_rate": 0.00018232345628375576, + "loss": 1.2032, + "step": 4333 + }, + { + "epoch": 0.7717236467236467, + "grad_norm": 0.579289972782135, + "learning_rate": 0.00018231550909306475, + "loss": 1.0764, + "step": 4334 + }, + { + "epoch": 0.7719017094017094, + "grad_norm": 0.5889299511909485, + "learning_rate": 0.00018230756028957235, + "loss": 1.1768, + "step": 4335 + }, + { + "epoch": 0.7720797720797721, + "grad_norm": 0.5328249335289001, + "learning_rate": 0.00018229960987343428, + "loss": 1.0055, + "step": 4336 + }, + { + "epoch": 0.7722578347578347, + "grad_norm": 0.5766382217407227, + "learning_rate": 0.0001822916578448063, + "loss": 0.9923, + "step": 4337 + }, + { + "epoch": 0.7724358974358975, + "grad_norm": 0.6448187828063965, + "learning_rate": 0.00018228370420384423, + "loss": 1.1135, + "step": 4338 + }, + { + "epoch": 0.7726139601139601, + "grad_norm": 0.5505210757255554, + "learning_rate": 0.00018227574895070394, + "loss": 1.2048, + "step": 4339 + }, + { + "epoch": 0.7727920227920227, + "grad_norm": 0.6278925538063049, + "learning_rate": 0.00018226779208554126, + "loss": 1.1045, + "step": 4340 + }, + { + "epoch": 0.7729700854700855, + "grad_norm": 0.5345009565353394, + "learning_rate": 0.00018225983360851207, + "loss": 1.0102, + "step": 4341 + }, + { + "epoch": 0.7731481481481481, + "grad_norm": 0.566633403301239, + "learning_rate": 0.00018225187351977233, + "loss": 1.0038, + "step": 4342 + }, + { + "epoch": 0.7733262108262108, + "grad_norm": 0.5066078901290894, + "learning_rate": 
0.000182243911819478, + "loss": 1.0339, + "step": 4343 + }, + { + "epoch": 0.7735042735042735, + "grad_norm": 0.5614920258522034, + "learning_rate": 0.00018223594850778503, + "loss": 1.1021, + "step": 4344 + }, + { + "epoch": 0.7736823361823362, + "grad_norm": 0.7747337818145752, + "learning_rate": 0.0001822279835848495, + "loss": 1.1129, + "step": 4345 + }, + { + "epoch": 0.7738603988603988, + "grad_norm": 0.7066529989242554, + "learning_rate": 0.00018222001705082744, + "loss": 1.3234, + "step": 4346 + }, + { + "epoch": 0.7740384615384616, + "grad_norm": 0.6340884566307068, + "learning_rate": 0.00018221204890587497, + "loss": 1.0726, + "step": 4347 + }, + { + "epoch": 0.7742165242165242, + "grad_norm": 0.5401145815849304, + "learning_rate": 0.00018220407915014818, + "loss": 0.9904, + "step": 4348 + }, + { + "epoch": 0.7743945868945868, + "grad_norm": 0.5069159269332886, + "learning_rate": 0.00018219610778380315, + "loss": 1.0654, + "step": 4349 + }, + { + "epoch": 0.7745726495726496, + "grad_norm": 0.5422839522361755, + "learning_rate": 0.00018218813480699623, + "loss": 1.1741, + "step": 4350 + }, + { + "epoch": 0.7747507122507122, + "grad_norm": 0.5550300478935242, + "learning_rate": 0.0001821801602198835, + "loss": 1.0033, + "step": 4351 + }, + { + "epoch": 0.7749287749287749, + "grad_norm": 0.5987736582756042, + "learning_rate": 0.00018217218402262123, + "loss": 0.935, + "step": 4352 + }, + { + "epoch": 0.7751068376068376, + "grad_norm": 0.6137008666992188, + "learning_rate": 0.00018216420621536573, + "loss": 1.17, + "step": 4353 + }, + { + "epoch": 0.7752849002849003, + "grad_norm": 0.47124359011650085, + "learning_rate": 0.0001821562267982733, + "loss": 0.8316, + "step": 4354 + }, + { + "epoch": 0.7754629629629629, + "grad_norm": 0.5057868361473083, + "learning_rate": 0.00018214824577150024, + "loss": 1.0246, + "step": 4355 + }, + { + "epoch": 0.7756410256410257, + "grad_norm": 0.604055643081665, + "learning_rate": 0.00018214026313520299, + "loss": 1.1272, + 
"step": 4356 + }, + { + "epoch": 0.7758190883190883, + "grad_norm": 0.6690384149551392, + "learning_rate": 0.0001821322788895379, + "loss": 1.0464, + "step": 4357 + }, + { + "epoch": 0.7759971509971509, + "grad_norm": 0.5458958745002747, + "learning_rate": 0.0001821242930346614, + "loss": 1.1712, + "step": 4358 + }, + { + "epoch": 0.7761752136752137, + "grad_norm": 0.6448663473129272, + "learning_rate": 0.00018211630557073, + "loss": 1.1125, + "step": 4359 + }, + { + "epoch": 0.7763532763532763, + "grad_norm": 0.49889448285102844, + "learning_rate": 0.00018210831649790018, + "loss": 1.097, + "step": 4360 + }, + { + "epoch": 0.7765313390313391, + "grad_norm": 0.5118046998977661, + "learning_rate": 0.00018210032581632843, + "loss": 1.009, + "step": 4361 + }, + { + "epoch": 0.7767094017094017, + "grad_norm": 0.5450068116188049, + "learning_rate": 0.00018209233352617135, + "loss": 1.1138, + "step": 4362 + }, + { + "epoch": 0.7768874643874644, + "grad_norm": 0.6147481203079224, + "learning_rate": 0.00018208433962758558, + "loss": 1.212, + "step": 4363 + }, + { + "epoch": 0.7770655270655271, + "grad_norm": 0.554176926612854, + "learning_rate": 0.00018207634412072764, + "loss": 1.1271, + "step": 4364 + }, + { + "epoch": 0.7772435897435898, + "grad_norm": 0.5872851014137268, + "learning_rate": 0.00018206834700575426, + "loss": 1.2793, + "step": 4365 + }, + { + "epoch": 0.7774216524216524, + "grad_norm": 0.5135685205459595, + "learning_rate": 0.00018206034828282207, + "loss": 0.9642, + "step": 4366 + }, + { + "epoch": 0.7775997150997151, + "grad_norm": 0.5699490308761597, + "learning_rate": 0.00018205234795208786, + "loss": 0.9086, + "step": 4367 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 0.5908057689666748, + "learning_rate": 0.00018204434601370832, + "loss": 1.1973, + "step": 4368 + }, + { + "epoch": 0.7779558404558404, + "grad_norm": 0.5777581334114075, + "learning_rate": 0.00018203634246784025, + "loss": 1.0447, + "step": 4369 + }, + { + "epoch": 
0.7781339031339032, + "grad_norm": 0.4822927713394165, + "learning_rate": 0.00018202833731464048, + "loss": 0.814, + "step": 4370 + }, + { + "epoch": 0.7783119658119658, + "grad_norm": 0.5343610644340515, + "learning_rate": 0.0001820203305542658, + "loss": 1.2785, + "step": 4371 + }, + { + "epoch": 0.7784900284900285, + "grad_norm": 0.5462222695350647, + "learning_rate": 0.00018201232218687316, + "loss": 1.1785, + "step": 4372 + }, + { + "epoch": 0.7786680911680912, + "grad_norm": 0.5177609324455261, + "learning_rate": 0.00018200431221261943, + "loss": 1.111, + "step": 4373 + }, + { + "epoch": 0.7788461538461539, + "grad_norm": 0.5324625968933105, + "learning_rate": 0.00018199630063166157, + "loss": 1.0738, + "step": 4374 + }, + { + "epoch": 0.7790242165242165, + "grad_norm": 0.6392876505851746, + "learning_rate": 0.0001819882874441565, + "loss": 1.1758, + "step": 4375 + }, + { + "epoch": 0.7792022792022792, + "grad_norm": 0.49964696168899536, + "learning_rate": 0.00018198027265026127, + "loss": 1.0556, + "step": 4376 + }, + { + "epoch": 0.7793803418803419, + "grad_norm": 0.6090660691261292, + "learning_rate": 0.00018197225625013287, + "loss": 1.0102, + "step": 4377 + }, + { + "epoch": 0.7795584045584045, + "grad_norm": 0.5242345929145813, + "learning_rate": 0.00018196423824392842, + "loss": 0.8335, + "step": 4378 + }, + { + "epoch": 0.7797364672364673, + "grad_norm": 0.5265036225318909, + "learning_rate": 0.00018195621863180498, + "loss": 1.0781, + "step": 4379 + }, + { + "epoch": 0.7799145299145299, + "grad_norm": 0.5115378499031067, + "learning_rate": 0.0001819481974139197, + "loss": 1.1658, + "step": 4380 + }, + { + "epoch": 0.7800925925925926, + "grad_norm": 0.6489549875259399, + "learning_rate": 0.00018194017459042972, + "loss": 1.0572, + "step": 4381 + }, + { + "epoch": 0.7802706552706553, + "grad_norm": 0.5800202488899231, + "learning_rate": 0.0001819321501614922, + "loss": 0.9593, + "step": 4382 + }, + { + "epoch": 0.780448717948718, + "grad_norm": 
0.5608528256416321, + "learning_rate": 0.00018192412412726443, + "loss": 1.0324, + "step": 4383 + }, + { + "epoch": 0.7806267806267806, + "grad_norm": 0.5596401691436768, + "learning_rate": 0.00018191609648790362, + "loss": 1.071, + "step": 4384 + }, + { + "epoch": 0.7808048433048433, + "grad_norm": 0.5712903141975403, + "learning_rate": 0.00018190806724356707, + "loss": 0.9011, + "step": 4385 + }, + { + "epoch": 0.780982905982906, + "grad_norm": 0.5079438090324402, + "learning_rate": 0.0001819000363944121, + "loss": 1.1194, + "step": 4386 + }, + { + "epoch": 0.7811609686609686, + "grad_norm": 0.5785079598426819, + "learning_rate": 0.00018189200394059602, + "loss": 1.1703, + "step": 4387 + }, + { + "epoch": 0.7813390313390314, + "grad_norm": 0.6901816129684448, + "learning_rate": 0.00018188396988227625, + "loss": 1.6689, + "step": 4388 + }, + { + "epoch": 0.781517094017094, + "grad_norm": 0.48107922077178955, + "learning_rate": 0.00018187593421961022, + "loss": 1.0116, + "step": 4389 + }, + { + "epoch": 0.7816951566951567, + "grad_norm": 0.5843084454536438, + "learning_rate": 0.0001818678969527553, + "loss": 1.1172, + "step": 4390 + }, + { + "epoch": 0.7818732193732194, + "grad_norm": 0.479034423828125, + "learning_rate": 0.00018185985808186902, + "loss": 0.811, + "step": 4391 + }, + { + "epoch": 0.782051282051282, + "grad_norm": 0.5864158272743225, + "learning_rate": 0.00018185181760710888, + "loss": 0.9522, + "step": 4392 + }, + { + "epoch": 0.7822293447293447, + "grad_norm": 0.4824625551700592, + "learning_rate": 0.00018184377552863242, + "loss": 0.9039, + "step": 4393 + }, + { + "epoch": 0.7824074074074074, + "grad_norm": 0.580102801322937, + "learning_rate": 0.00018183573184659717, + "loss": 1.2382, + "step": 4394 + }, + { + "epoch": 0.7825854700854701, + "grad_norm": 0.5300056338310242, + "learning_rate": 0.00018182768656116073, + "loss": 1.2268, + "step": 4395 + }, + { + "epoch": 0.7827635327635327, + "grad_norm": 0.5548123121261597, + "learning_rate": 
0.00018181963967248078, + "loss": 1.0628, + "step": 4396 + }, + { + "epoch": 0.7829415954415955, + "grad_norm": 0.5485070943832397, + "learning_rate": 0.00018181159118071496, + "loss": 0.9628, + "step": 4397 + }, + { + "epoch": 0.7831196581196581, + "grad_norm": 0.47405415773391724, + "learning_rate": 0.00018180354108602095, + "loss": 1.1413, + "step": 4398 + }, + { + "epoch": 0.7832977207977208, + "grad_norm": 0.5545752644538879, + "learning_rate": 0.0001817954893885565, + "loss": 1.3807, + "step": 4399 + }, + { + "epoch": 0.7834757834757835, + "grad_norm": 0.5339497327804565, + "learning_rate": 0.00018178743608847933, + "loss": 0.9978, + "step": 4400 + }, + { + "epoch": 0.7836538461538461, + "grad_norm": 0.5006352663040161, + "learning_rate": 0.00018177938118594725, + "loss": 0.8873, + "step": 4401 + }, + { + "epoch": 0.7838319088319088, + "grad_norm": 0.4845179319381714, + "learning_rate": 0.00018177132468111812, + "loss": 0.8866, + "step": 4402 + }, + { + "epoch": 0.7840099715099715, + "grad_norm": 0.5240967869758606, + "learning_rate": 0.0001817632665741497, + "loss": 1.0347, + "step": 4403 + }, + { + "epoch": 0.7841880341880342, + "grad_norm": 0.5311884880065918, + "learning_rate": 0.00018175520686519993, + "loss": 1.2065, + "step": 4404 + }, + { + "epoch": 0.7843660968660968, + "grad_norm": 0.5562815070152283, + "learning_rate": 0.00018174714555442673, + "loss": 1.1272, + "step": 4405 + }, + { + "epoch": 0.7845441595441596, + "grad_norm": 0.5524366497993469, + "learning_rate": 0.00018173908264198802, + "loss": 1.2337, + "step": 4406 + }, + { + "epoch": 0.7847222222222222, + "grad_norm": 0.5612216591835022, + "learning_rate": 0.0001817310181280418, + "loss": 1.1809, + "step": 4407 + }, + { + "epoch": 0.7849002849002849, + "grad_norm": 0.5315343737602234, + "learning_rate": 0.000181722952012746, + "loss": 1.0491, + "step": 4408 + }, + { + "epoch": 0.7850783475783476, + "grad_norm": 0.5233435034751892, + "learning_rate": 0.00018171488429625878, + "loss": 
1.0457, + "step": 4409 + }, + { + "epoch": 0.7852564102564102, + "grad_norm": 0.7809093594551086, + "learning_rate": 0.00018170681497873813, + "loss": 1.1578, + "step": 4410 + }, + { + "epoch": 0.7854344729344729, + "grad_norm": 0.49659839272499084, + "learning_rate": 0.00018169874406034217, + "loss": 1.0815, + "step": 4411 + }, + { + "epoch": 0.7856125356125356, + "grad_norm": 0.5020765066146851, + "learning_rate": 0.00018169067154122904, + "loss": 1.1985, + "step": 4412 + }, + { + "epoch": 0.7857905982905983, + "grad_norm": 0.6408432126045227, + "learning_rate": 0.0001816825974215569, + "loss": 1.2272, + "step": 4413 + }, + { + "epoch": 0.7859686609686609, + "grad_norm": 0.5062605142593384, + "learning_rate": 0.00018167452170148396, + "loss": 0.9663, + "step": 4414 + }, + { + "epoch": 0.7861467236467237, + "grad_norm": 0.5100119113922119, + "learning_rate": 0.0001816664443811684, + "loss": 1.0256, + "step": 4415 + }, + { + "epoch": 0.7863247863247863, + "grad_norm": 0.5277643799781799, + "learning_rate": 0.00018165836546076854, + "loss": 1.2885, + "step": 4416 + }, + { + "epoch": 0.7865028490028491, + "grad_norm": 0.5568150281906128, + "learning_rate": 0.0001816502849404426, + "loss": 1.2673, + "step": 4417 + }, + { + "epoch": 0.7866809116809117, + "grad_norm": 0.5061392188072205, + "learning_rate": 0.00018164220282034896, + "loss": 1.072, + "step": 4418 + }, + { + "epoch": 0.7868589743589743, + "grad_norm": 0.5383077263832092, + "learning_rate": 0.00018163411910064597, + "loss": 1.0621, + "step": 4419 + }, + { + "epoch": 0.7870370370370371, + "grad_norm": 0.5167948007583618, + "learning_rate": 0.00018162603378149198, + "loss": 1.099, + "step": 4420 + }, + { + "epoch": 0.7872150997150997, + "grad_norm": 0.5084534287452698, + "learning_rate": 0.0001816179468630454, + "loss": 1.3984, + "step": 4421 + }, + { + "epoch": 0.7873931623931624, + "grad_norm": 0.608762264251709, + "learning_rate": 0.00018160985834546475, + "loss": 1.3553, + "step": 4422 + }, + { + "epoch": 
0.7875712250712251, + "grad_norm": 0.4900866746902466, + "learning_rate": 0.00018160176822890842, + "loss": 1.0009, + "step": 4423 + }, + { + "epoch": 0.7877492877492878, + "grad_norm": 0.5928917527198792, + "learning_rate": 0.00018159367651353496, + "loss": 1.0523, + "step": 4424 + }, + { + "epoch": 0.7879273504273504, + "grad_norm": 0.624422013759613, + "learning_rate": 0.0001815855831995029, + "loss": 1.0519, + "step": 4425 + }, + { + "epoch": 0.7881054131054132, + "grad_norm": 0.5140150785446167, + "learning_rate": 0.00018157748828697082, + "loss": 1.048, + "step": 4426 + }, + { + "epoch": 0.7882834757834758, + "grad_norm": 0.47006943821907043, + "learning_rate": 0.00018156939177609732, + "loss": 1.0067, + "step": 4427 + }, + { + "epoch": 0.7884615384615384, + "grad_norm": 0.5178864002227783, + "learning_rate": 0.00018156129366704105, + "loss": 1.0583, + "step": 4428 + }, + { + "epoch": 0.7886396011396012, + "grad_norm": 0.5279985666275024, + "learning_rate": 0.00018155319395996066, + "loss": 1.3023, + "step": 4429 + }, + { + "epoch": 0.7888176638176638, + "grad_norm": 0.5238787531852722, + "learning_rate": 0.00018154509265501482, + "loss": 1.0851, + "step": 4430 + }, + { + "epoch": 0.7889957264957265, + "grad_norm": 0.5914917588233948, + "learning_rate": 0.00018153698975236228, + "loss": 0.9291, + "step": 4431 + }, + { + "epoch": 0.7891737891737892, + "grad_norm": 0.5046082735061646, + "learning_rate": 0.00018152888525216183, + "loss": 0.9951, + "step": 4432 + }, + { + "epoch": 0.7893518518518519, + "grad_norm": 0.5042256116867065, + "learning_rate": 0.00018152077915457225, + "loss": 1.0243, + "step": 4433 + }, + { + "epoch": 0.7895299145299145, + "grad_norm": 0.5950339436531067, + "learning_rate": 0.0001815126714597523, + "loss": 0.9803, + "step": 4434 + }, + { + "epoch": 0.7897079772079773, + "grad_norm": 0.5163764953613281, + "learning_rate": 0.0001815045621678609, + "loss": 1.0353, + "step": 4435 + }, + { + "epoch": 0.7898860398860399, + "grad_norm": 
0.5166211128234863, + "learning_rate": 0.00018149645127905691, + "loss": 0.9649, + "step": 4436 + }, + { + "epoch": 0.7900641025641025, + "grad_norm": 0.5239769220352173, + "learning_rate": 0.00018148833879349927, + "loss": 0.9747, + "step": 4437 + }, + { + "epoch": 0.7902421652421653, + "grad_norm": 0.5803237557411194, + "learning_rate": 0.00018148022471134692, + "loss": 1.315, + "step": 4438 + }, + { + "epoch": 0.7904202279202279, + "grad_norm": 0.5141370296478271, + "learning_rate": 0.00018147210903275877, + "loss": 1.0547, + "step": 4439 + }, + { + "epoch": 0.7905982905982906, + "grad_norm": 0.545788586139679, + "learning_rate": 0.00018146399175789394, + "loss": 1.0797, + "step": 4440 + }, + { + "epoch": 0.7907763532763533, + "grad_norm": 0.5273314714431763, + "learning_rate": 0.0001814558728869114, + "loss": 0.7928, + "step": 4441 + }, + { + "epoch": 0.790954415954416, + "grad_norm": 0.4614652693271637, + "learning_rate": 0.00018144775241997024, + "loss": 0.8826, + "step": 4442 + }, + { + "epoch": 0.7911324786324786, + "grad_norm": 0.6203590631484985, + "learning_rate": 0.00018143963035722958, + "loss": 1.2891, + "step": 4443 + }, + { + "epoch": 0.7913105413105413, + "grad_norm": 0.4870408773422241, + "learning_rate": 0.0001814315066988485, + "loss": 1.0717, + "step": 4444 + }, + { + "epoch": 0.791488603988604, + "grad_norm": 0.6468982696533203, + "learning_rate": 0.00018142338144498625, + "loss": 1.3398, + "step": 4445 + }, + { + "epoch": 0.7916666666666666, + "grad_norm": 0.4727918207645416, + "learning_rate": 0.00018141525459580197, + "loss": 1.0195, + "step": 4446 + }, + { + "epoch": 0.7918447293447294, + "grad_norm": 0.5080479979515076, + "learning_rate": 0.0001814071261514549, + "loss": 1.0163, + "step": 4447 + }, + { + "epoch": 0.792022792022792, + "grad_norm": 0.5380908250808716, + "learning_rate": 0.0001813989961121043, + "loss": 1.1673, + "step": 4448 + }, + { + "epoch": 0.7922008547008547, + "grad_norm": 0.5020384192466736, + "learning_rate": 
0.00018139086447790945, + "loss": 0.8591, + "step": 4449 + }, + { + "epoch": 0.7923789173789174, + "grad_norm": 0.5279949903488159, + "learning_rate": 0.0001813827312490297, + "loss": 1.1221, + "step": 4450 + }, + { + "epoch": 0.79255698005698, + "grad_norm": 0.6739233732223511, + "learning_rate": 0.00018137459642562437, + "loss": 1.2704, + "step": 4451 + }, + { + "epoch": 0.7927350427350427, + "grad_norm": 0.5112259984016418, + "learning_rate": 0.00018136646000785288, + "loss": 1.1161, + "step": 4452 + }, + { + "epoch": 0.7929131054131054, + "grad_norm": 0.5244031548500061, + "learning_rate": 0.00018135832199587463, + "loss": 0.7866, + "step": 4453 + }, + { + "epoch": 0.7930911680911681, + "grad_norm": 0.5803347229957581, + "learning_rate": 0.0001813501823898491, + "loss": 0.994, + "step": 4454 + }, + { + "epoch": 0.7932692307692307, + "grad_norm": 0.6191152930259705, + "learning_rate": 0.00018134204118993568, + "loss": 1.0725, + "step": 4455 + }, + { + "epoch": 0.7934472934472935, + "grad_norm": 0.549735963344574, + "learning_rate": 0.00018133389839629396, + "loss": 0.9915, + "step": 4456 + }, + { + "epoch": 0.7936253561253561, + "grad_norm": 0.4940381646156311, + "learning_rate": 0.00018132575400908347, + "loss": 1.1815, + "step": 4457 + }, + { + "epoch": 0.7938034188034188, + "grad_norm": 0.5009099245071411, + "learning_rate": 0.00018131760802846377, + "loss": 1.0833, + "step": 4458 + }, + { + "epoch": 0.7939814814814815, + "grad_norm": 0.595853865146637, + "learning_rate": 0.00018130946045459445, + "loss": 1.2774, + "step": 4459 + }, + { + "epoch": 0.7941595441595442, + "grad_norm": 0.534794807434082, + "learning_rate": 0.00018130131128763513, + "loss": 1.0891, + "step": 4460 + }, + { + "epoch": 0.7943376068376068, + "grad_norm": 0.5828582048416138, + "learning_rate": 0.00018129316052774557, + "loss": 1.0786, + "step": 4461 + }, + { + "epoch": 0.7945156695156695, + "grad_norm": 0.4750654697418213, + "learning_rate": 0.00018128500817508533, + "loss": 1.0818, + 
"step": 4462 + }, + { + "epoch": 0.7946937321937322, + "grad_norm": 0.5626576542854309, + "learning_rate": 0.00018127685422981426, + "loss": 1.0807, + "step": 4463 + }, + { + "epoch": 0.7948717948717948, + "grad_norm": 0.6434760093688965, + "learning_rate": 0.00018126869869209203, + "loss": 1.0908, + "step": 4464 + }, + { + "epoch": 0.7950498575498576, + "grad_norm": 0.5577414631843567, + "learning_rate": 0.00018126054156207853, + "loss": 1.0281, + "step": 4465 + }, + { + "epoch": 0.7952279202279202, + "grad_norm": 0.5001249313354492, + "learning_rate": 0.00018125238283993347, + "loss": 0.9083, + "step": 4466 + }, + { + "epoch": 0.7954059829059829, + "grad_norm": 0.5298314690589905, + "learning_rate": 0.00018124422252581676, + "loss": 0.971, + "step": 4467 + }, + { + "epoch": 0.7955840455840456, + "grad_norm": 0.4872737228870392, + "learning_rate": 0.00018123606061988832, + "loss": 1.0515, + "step": 4468 + }, + { + "epoch": 0.7957621082621082, + "grad_norm": 0.5895398259162903, + "learning_rate": 0.00018122789712230798, + "loss": 1.0771, + "step": 4469 + }, + { + "epoch": 0.7959401709401709, + "grad_norm": 0.5212514996528625, + "learning_rate": 0.00018121973203323577, + "loss": 1.0365, + "step": 4470 + }, + { + "epoch": 0.7961182336182336, + "grad_norm": 0.4679451584815979, + "learning_rate": 0.0001812115653528316, + "loss": 0.9445, + "step": 4471 + }, + { + "epoch": 0.7962962962962963, + "grad_norm": 0.5852653980255127, + "learning_rate": 0.00018120339708125552, + "loss": 1.1781, + "step": 4472 + }, + { + "epoch": 0.7964743589743589, + "grad_norm": 0.6081342697143555, + "learning_rate": 0.00018119522721866756, + "loss": 1.3881, + "step": 4473 + }, + { + "epoch": 0.7966524216524217, + "grad_norm": 0.5254155993461609, + "learning_rate": 0.00018118705576522777, + "loss": 1.2198, + "step": 4474 + }, + { + "epoch": 0.7968304843304843, + "grad_norm": 0.5959419012069702, + "learning_rate": 0.00018117888272109632, + "loss": 1.0922, + "step": 4475 + }, + { + "epoch": 
0.7970085470085471, + "grad_norm": 0.6243147253990173, + "learning_rate": 0.0001811707080864333, + "loss": 1.1782, + "step": 4476 + }, + { + "epoch": 0.7971866096866097, + "grad_norm": 0.5336906909942627, + "learning_rate": 0.0001811625318613988, + "loss": 1.167, + "step": 4477 + }, + { + "epoch": 0.7973646723646723, + "grad_norm": 0.5287907719612122, + "learning_rate": 0.00018115435404615315, + "loss": 0.9923, + "step": 4478 + }, + { + "epoch": 0.7975427350427351, + "grad_norm": 0.48941442370414734, + "learning_rate": 0.0001811461746408565, + "loss": 0.863, + "step": 4479 + }, + { + "epoch": 0.7977207977207977, + "grad_norm": 0.48465651273727417, + "learning_rate": 0.0001811379936456691, + "loss": 1.147, + "step": 4480 + }, + { + "epoch": 0.7978988603988604, + "grad_norm": 0.5676067471504211, + "learning_rate": 0.0001811298110607513, + "loss": 1.3121, + "step": 4481 + }, + { + "epoch": 0.7980769230769231, + "grad_norm": 0.4894018769264221, + "learning_rate": 0.00018112162688626337, + "loss": 1.1831, + "step": 4482 + }, + { + "epoch": 0.7982549857549858, + "grad_norm": 0.5626382827758789, + "learning_rate": 0.0001811134411223657, + "loss": 1.1977, + "step": 4483 + }, + { + "epoch": 0.7984330484330484, + "grad_norm": 0.564119815826416, + "learning_rate": 0.00018110525376921862, + "loss": 1.2686, + "step": 4484 + }, + { + "epoch": 0.7986111111111112, + "grad_norm": 0.6385740041732788, + "learning_rate": 0.00018109706482698256, + "loss": 1.2418, + "step": 4485 + }, + { + "epoch": 0.7987891737891738, + "grad_norm": 0.5550164580345154, + "learning_rate": 0.00018108887429581802, + "loss": 1.081, + "step": 4486 + }, + { + "epoch": 0.7989672364672364, + "grad_norm": 0.5583973526954651, + "learning_rate": 0.00018108068217588544, + "loss": 1.1757, + "step": 4487 + }, + { + "epoch": 0.7991452991452992, + "grad_norm": 0.5533342957496643, + "learning_rate": 0.00018107248846734527, + "loss": 1.1947, + "step": 4488 + }, + { + "epoch": 0.7993233618233618, + "grad_norm": 
0.5291479229927063, + "learning_rate": 0.00018106429317035815, + "loss": 1.2769, + "step": 4489 + }, + { + "epoch": 0.7995014245014245, + "grad_norm": 0.4680160582065582, + "learning_rate": 0.00018105609628508458, + "loss": 0.7059, + "step": 4490 + }, + { + "epoch": 0.7996794871794872, + "grad_norm": 0.5364881157875061, + "learning_rate": 0.00018104789781168517, + "loss": 1.0566, + "step": 4491 + }, + { + "epoch": 0.7998575498575499, + "grad_norm": 0.5917307734489441, + "learning_rate": 0.0001810396977503206, + "loss": 1.2263, + "step": 4492 + }, + { + "epoch": 0.8000356125356125, + "grad_norm": 0.6013199090957642, + "learning_rate": 0.0001810314961011515, + "loss": 1.2053, + "step": 4493 + }, + { + "epoch": 0.8002136752136753, + "grad_norm": 0.6005663275718689, + "learning_rate": 0.0001810232928643385, + "loss": 1.2241, + "step": 4494 + }, + { + "epoch": 0.8003917378917379, + "grad_norm": 0.49207603931427, + "learning_rate": 0.00018101508804004246, + "loss": 1.0661, + "step": 4495 + }, + { + "epoch": 0.8005698005698005, + "grad_norm": 0.4834063947200775, + "learning_rate": 0.00018100688162842401, + "loss": 1.1745, + "step": 4496 + }, + { + "epoch": 0.8007478632478633, + "grad_norm": 0.5347156524658203, + "learning_rate": 0.000180998673629644, + "loss": 1.0679, + "step": 4497 + }, + { + "epoch": 0.8009259259259259, + "grad_norm": 0.5815600156784058, + "learning_rate": 0.00018099046404386327, + "loss": 1.2652, + "step": 4498 + }, + { + "epoch": 0.8011039886039886, + "grad_norm": 0.5291135311126709, + "learning_rate": 0.00018098225287124263, + "loss": 1.2072, + "step": 4499 + }, + { + "epoch": 0.8012820512820513, + "grad_norm": 0.5779497027397156, + "learning_rate": 0.000180974040111943, + "loss": 1.3277, + "step": 4500 + }, + { + "epoch": 0.801460113960114, + "grad_norm": 0.44566696882247925, + "learning_rate": 0.0001809658257661252, + "loss": 0.7702, + "step": 4501 + }, + { + "epoch": 0.8016381766381766, + "grad_norm": 0.5407577753067017, + "learning_rate": 
0.00018095760983395027, + "loss": 1.2894, + "step": 4502 + }, + { + "epoch": 0.8018162393162394, + "grad_norm": 0.4771903455257416, + "learning_rate": 0.00018094939231557916, + "loss": 1.045, + "step": 4503 + }, + { + "epoch": 0.801994301994302, + "grad_norm": 0.5970945358276367, + "learning_rate": 0.00018094117321117286, + "loss": 1.2059, + "step": 4504 + }, + { + "epoch": 0.8021723646723646, + "grad_norm": 0.4959338903427124, + "learning_rate": 0.0001809329525208924, + "loss": 1.155, + "step": 4505 + }, + { + "epoch": 0.8023504273504274, + "grad_norm": 0.5142548084259033, + "learning_rate": 0.00018092473024489887, + "loss": 0.9413, + "step": 4506 + }, + { + "epoch": 0.80252849002849, + "grad_norm": 0.5336433053016663, + "learning_rate": 0.00018091650638335334, + "loss": 1.0699, + "step": 4507 + }, + { + "epoch": 0.8027065527065527, + "grad_norm": 0.47770628333091736, + "learning_rate": 0.00018090828093641698, + "loss": 1.1515, + "step": 4508 + }, + { + "epoch": 0.8028846153846154, + "grad_norm": 0.5443438291549683, + "learning_rate": 0.00018090005390425091, + "loss": 1.189, + "step": 4509 + }, + { + "epoch": 0.8030626780626781, + "grad_norm": 0.523179829120636, + "learning_rate": 0.00018089182528701632, + "loss": 1.1272, + "step": 4510 + }, + { + "epoch": 0.8032407407407407, + "grad_norm": 0.49628451466560364, + "learning_rate": 0.00018088359508487448, + "loss": 0.9754, + "step": 4511 + }, + { + "epoch": 0.8034188034188035, + "grad_norm": 0.5933086276054382, + "learning_rate": 0.00018087536329798663, + "loss": 1.2111, + "step": 4512 + }, + { + "epoch": 0.8035968660968661, + "grad_norm": 0.4565310776233673, + "learning_rate": 0.00018086712992651402, + "loss": 0.7729, + "step": 4513 + }, + { + "epoch": 0.8037749287749287, + "grad_norm": 0.5013461112976074, + "learning_rate": 0.00018085889497061798, + "loss": 1.2178, + "step": 4514 + }, + { + "epoch": 0.8039529914529915, + "grad_norm": 0.5170024633407593, + "learning_rate": 0.00018085065843045987, + "loss": 0.9181, 
+ "step": 4515 + }, + { + "epoch": 0.8041310541310541, + "grad_norm": 0.583363950252533, + "learning_rate": 0.00018084242030620104, + "loss": 1.1542, + "step": 4516 + }, + { + "epoch": 0.8043091168091168, + "grad_norm": 0.46835777163505554, + "learning_rate": 0.00018083418059800297, + "loss": 0.8954, + "step": 4517 + }, + { + "epoch": 0.8044871794871795, + "grad_norm": 0.5145657062530518, + "learning_rate": 0.000180825939306027, + "loss": 1.0417, + "step": 4518 + }, + { + "epoch": 0.8046652421652422, + "grad_norm": 0.47216105461120605, + "learning_rate": 0.00018081769643043467, + "loss": 0.9516, + "step": 4519 + }, + { + "epoch": 0.8048433048433048, + "grad_norm": 0.5059915781021118, + "learning_rate": 0.0001808094519713875, + "loss": 1.1643, + "step": 4520 + }, + { + "epoch": 0.8050213675213675, + "grad_norm": 0.5406439900398254, + "learning_rate": 0.00018080120592904692, + "loss": 1.2038, + "step": 4521 + }, + { + "epoch": 0.8051994301994302, + "grad_norm": 0.6123420000076294, + "learning_rate": 0.0001807929583035746, + "loss": 1.4004, + "step": 4522 + }, + { + "epoch": 0.8053774928774928, + "grad_norm": 0.49699845910072327, + "learning_rate": 0.00018078470909513208, + "loss": 1.0347, + "step": 4523 + }, + { + "epoch": 0.8055555555555556, + "grad_norm": 0.5369421243667603, + "learning_rate": 0.000180776458303881, + "loss": 1.0418, + "step": 4524 + }, + { + "epoch": 0.8057336182336182, + "grad_norm": 0.5407396554946899, + "learning_rate": 0.00018076820592998301, + "loss": 0.9546, + "step": 4525 + }, + { + "epoch": 0.8059116809116809, + "grad_norm": 0.5749752521514893, + "learning_rate": 0.00018075995197359984, + "loss": 1.1438, + "step": 4526 + }, + { + "epoch": 0.8060897435897436, + "grad_norm": 0.5523102283477783, + "learning_rate": 0.00018075169643489317, + "loss": 1.1312, + "step": 4527 + }, + { + "epoch": 0.8062678062678063, + "grad_norm": 0.5767508149147034, + "learning_rate": 0.00018074343931402472, + "loss": 1.1951, + "step": 4528 + }, + { + "epoch": 
0.8064458689458689, + "grad_norm": 0.5262924432754517, + "learning_rate": 0.00018073518061115633, + "loss": 1.1985, + "step": 4529 + }, + { + "epoch": 0.8066239316239316, + "grad_norm": 0.4742378294467926, + "learning_rate": 0.0001807269203264498, + "loss": 1.0126, + "step": 4530 + }, + { + "epoch": 0.8068019943019943, + "grad_norm": 0.5190158486366272, + "learning_rate": 0.00018071865846006692, + "loss": 0.9985, + "step": 4531 + }, + { + "epoch": 0.8069800569800569, + "grad_norm": 0.5910618305206299, + "learning_rate": 0.00018071039501216964, + "loss": 1.2776, + "step": 4532 + }, + { + "epoch": 0.8071581196581197, + "grad_norm": 0.5363098382949829, + "learning_rate": 0.00018070212998291983, + "loss": 1.3346, + "step": 4533 + }, + { + "epoch": 0.8073361823361823, + "grad_norm": 0.47711408138275146, + "learning_rate": 0.0001806938633724794, + "loss": 1.04, + "step": 4534 + }, + { + "epoch": 0.8075142450142451, + "grad_norm": 0.5092964172363281, + "learning_rate": 0.0001806855951810104, + "loss": 1.1409, + "step": 4535 + }, + { + "epoch": 0.8076923076923077, + "grad_norm": 0.5828777551651001, + "learning_rate": 0.00018067732540867472, + "loss": 1.3048, + "step": 4536 + }, + { + "epoch": 0.8078703703703703, + "grad_norm": 0.5779826045036316, + "learning_rate": 0.00018066905405563445, + "loss": 1.1599, + "step": 4537 + }, + { + "epoch": 0.8080484330484331, + "grad_norm": 0.49908435344696045, + "learning_rate": 0.00018066078112205167, + "loss": 1.1502, + "step": 4538 + }, + { + "epoch": 0.8082264957264957, + "grad_norm": 0.4772704839706421, + "learning_rate": 0.0001806525066080884, + "loss": 0.7925, + "step": 4539 + }, + { + "epoch": 0.8084045584045584, + "grad_norm": 0.4298383295536041, + "learning_rate": 0.00018064423051390683, + "loss": 0.7322, + "step": 4540 + }, + { + "epoch": 0.8085826210826211, + "grad_norm": 0.49349579215049744, + "learning_rate": 0.0001806359528396691, + "loss": 1.0021, + "step": 4541 + }, + { + "epoch": 0.8087606837606838, + "grad_norm": 
0.4698609411716461, + "learning_rate": 0.00018062767358553735, + "loss": 0.9751, + "step": 4542 + }, + { + "epoch": 0.8089387464387464, + "grad_norm": 0.4949014186859131, + "learning_rate": 0.00018061939275167385, + "loss": 0.9553, + "step": 4543 + }, + { + "epoch": 0.8091168091168092, + "grad_norm": 0.5604463815689087, + "learning_rate": 0.0001806111103382408, + "loss": 0.9894, + "step": 4544 + }, + { + "epoch": 0.8092948717948718, + "grad_norm": 0.5761561989784241, + "learning_rate": 0.00018060282634540053, + "loss": 1.258, + "step": 4545 + }, + { + "epoch": 0.8094729344729344, + "grad_norm": 0.5239115357398987, + "learning_rate": 0.00018059454077331527, + "loss": 0.9189, + "step": 4546 + }, + { + "epoch": 0.8096509971509972, + "grad_norm": 0.47902220487594604, + "learning_rate": 0.00018058625362214742, + "loss": 1.0389, + "step": 4547 + }, + { + "epoch": 0.8098290598290598, + "grad_norm": 0.6274173259735107, + "learning_rate": 0.00018057796489205936, + "loss": 1.3368, + "step": 4548 + }, + { + "epoch": 0.8100071225071225, + "grad_norm": 0.5789401531219482, + "learning_rate": 0.00018056967458321345, + "loss": 1.1473, + "step": 4549 + }, + { + "epoch": 0.8101851851851852, + "grad_norm": 0.5850043296813965, + "learning_rate": 0.0001805613826957721, + "loss": 1.2224, + "step": 4550 + }, + { + "epoch": 0.8103632478632479, + "grad_norm": 0.6310738921165466, + "learning_rate": 0.00018055308922989788, + "loss": 1.0707, + "step": 4551 + }, + { + "epoch": 0.8105413105413105, + "grad_norm": 0.5198429822921753, + "learning_rate": 0.00018054479418575317, + "loss": 0.8984, + "step": 4552 + }, + { + "epoch": 0.8107193732193733, + "grad_norm": 0.5757743120193481, + "learning_rate": 0.00018053649756350054, + "loss": 1.2007, + "step": 4553 + }, + { + "epoch": 0.8108974358974359, + "grad_norm": 0.5109567642211914, + "learning_rate": 0.0001805281993633025, + "loss": 1.0696, + "step": 4554 + }, + { + "epoch": 0.8110754985754985, + "grad_norm": 0.5030225515365601, + "learning_rate": 
0.00018051989958532173, + "loss": 0.9667, + "step": 4555 + }, + { + "epoch": 0.8112535612535613, + "grad_norm": 0.5291743874549866, + "learning_rate": 0.00018051159822972079, + "loss": 1.0219, + "step": 4556 + }, + { + "epoch": 0.8114316239316239, + "grad_norm": 0.5874896049499512, + "learning_rate": 0.00018050329529666233, + "loss": 0.8589, + "step": 4557 + }, + { + "epoch": 0.8116096866096866, + "grad_norm": 0.673284113407135, + "learning_rate": 0.000180494990786309, + "loss": 1.1902, + "step": 4558 + }, + { + "epoch": 0.8117877492877493, + "grad_norm": 0.4742524027824402, + "learning_rate": 0.00018048668469882354, + "loss": 1.0578, + "step": 4559 + }, + { + "epoch": 0.811965811965812, + "grad_norm": 0.5519167184829712, + "learning_rate": 0.0001804783770343687, + "loss": 1.083, + "step": 4560 + }, + { + "epoch": 0.8121438746438746, + "grad_norm": 0.5669941306114197, + "learning_rate": 0.00018047006779310727, + "loss": 1.0784, + "step": 4561 + }, + { + "epoch": 0.8123219373219374, + "grad_norm": 0.512759804725647, + "learning_rate": 0.000180461756975202, + "loss": 1.0361, + "step": 4562 + }, + { + "epoch": 0.8125, + "grad_norm": 0.5721749067306519, + "learning_rate": 0.00018045344458081575, + "loss": 1.0246, + "step": 4563 + }, + { + "epoch": 0.8126780626780626, + "grad_norm": 0.566430389881134, + "learning_rate": 0.00018044513061011137, + "loss": 1.1452, + "step": 4564 + }, + { + "epoch": 0.8128561253561254, + "grad_norm": 0.49391916394233704, + "learning_rate": 0.00018043681506325177, + "loss": 0.89, + "step": 4565 + }, + { + "epoch": 0.813034188034188, + "grad_norm": 0.5379437804222107, + "learning_rate": 0.00018042849794039988, + "loss": 1.1289, + "step": 4566 + }, + { + "epoch": 0.8132122507122507, + "grad_norm": 0.5667982697486877, + "learning_rate": 0.00018042017924171865, + "loss": 1.1596, + "step": 4567 + }, + { + "epoch": 0.8133903133903134, + "grad_norm": 0.6214209794998169, + "learning_rate": 0.00018041185896737109, + "loss": 1.0622, + "step": 4568 + 
}, + { + "epoch": 0.8135683760683761, + "grad_norm": 0.5442491173744202, + "learning_rate": 0.00018040353711752015, + "loss": 1.0536, + "step": 4569 + }, + { + "epoch": 0.8137464387464387, + "grad_norm": 0.5266172885894775, + "learning_rate": 0.00018039521369232894, + "loss": 1.0576, + "step": 4570 + }, + { + "epoch": 0.8139245014245015, + "grad_norm": 0.6057912111282349, + "learning_rate": 0.00018038688869196053, + "loss": 1.3067, + "step": 4571 + }, + { + "epoch": 0.8141025641025641, + "grad_norm": 0.489869087934494, + "learning_rate": 0.00018037856211657803, + "loss": 1.0279, + "step": 4572 + }, + { + "epoch": 0.8142806267806267, + "grad_norm": 0.5497978329658508, + "learning_rate": 0.00018037023396634457, + "loss": 1.1568, + "step": 4573 + }, + { + "epoch": 0.8144586894586895, + "grad_norm": 0.5243251919746399, + "learning_rate": 0.0001803619042414233, + "loss": 0.9767, + "step": 4574 + }, + { + "epoch": 0.8146367521367521, + "grad_norm": 0.503032922744751, + "learning_rate": 0.0001803535729419775, + "loss": 1.065, + "step": 4575 + }, + { + "epoch": 0.8148148148148148, + "grad_norm": 0.49955418705940247, + "learning_rate": 0.00018034524006817034, + "loss": 1.2752, + "step": 4576 + }, + { + "epoch": 0.8149928774928775, + "grad_norm": 0.5746406316757202, + "learning_rate": 0.00018033690562016508, + "loss": 1.098, + "step": 4577 + }, + { + "epoch": 0.8151709401709402, + "grad_norm": 0.5224192142486572, + "learning_rate": 0.00018032856959812507, + "loss": 1.1284, + "step": 4578 + }, + { + "epoch": 0.8153490028490028, + "grad_norm": 0.5484535694122314, + "learning_rate": 0.00018032023200221362, + "loss": 0.9182, + "step": 4579 + }, + { + "epoch": 0.8155270655270656, + "grad_norm": 0.5003355741500854, + "learning_rate": 0.00018031189283259405, + "loss": 1.136, + "step": 4580 + }, + { + "epoch": 0.8157051282051282, + "grad_norm": 0.5395768284797668, + "learning_rate": 0.00018030355208942977, + "loss": 1.2349, + "step": 4581 + }, + { + "epoch": 0.8158831908831908, + 
"grad_norm": 0.561966598033905, + "learning_rate": 0.0001802952097728842, + "loss": 0.999, + "step": 4582 + }, + { + "epoch": 0.8160612535612536, + "grad_norm": 0.4886479675769806, + "learning_rate": 0.00018028686588312083, + "loss": 0.9165, + "step": 4583 + }, + { + "epoch": 0.8162393162393162, + "grad_norm": 0.4769509732723236, + "learning_rate": 0.00018027852042030307, + "loss": 1.1377, + "step": 4584 + }, + { + "epoch": 0.8164173789173789, + "grad_norm": 0.4723633825778961, + "learning_rate": 0.00018027017338459448, + "loss": 1.0274, + "step": 4585 + }, + { + "epoch": 0.8165954415954416, + "grad_norm": 0.5773285627365112, + "learning_rate": 0.00018026182477615859, + "loss": 1.1468, + "step": 4586 + }, + { + "epoch": 0.8167735042735043, + "grad_norm": 0.5529203414916992, + "learning_rate": 0.00018025347459515895, + "loss": 1.0815, + "step": 4587 + }, + { + "epoch": 0.8169515669515669, + "grad_norm": 0.5449469685554504, + "learning_rate": 0.00018024512284175922, + "loss": 1.1637, + "step": 4588 + }, + { + "epoch": 0.8171296296296297, + "grad_norm": 0.5155341625213623, + "learning_rate": 0.00018023676951612298, + "loss": 1.1842, + "step": 4589 + }, + { + "epoch": 0.8173076923076923, + "grad_norm": 0.5569564700126648, + "learning_rate": 0.00018022841461841393, + "loss": 0.9254, + "step": 4590 + }, + { + "epoch": 0.8174857549857549, + "grad_norm": 0.45203131437301636, + "learning_rate": 0.00018022005814879573, + "loss": 0.9561, + "step": 4591 + }, + { + "epoch": 0.8176638176638177, + "grad_norm": 0.5735056400299072, + "learning_rate": 0.00018021170010743218, + "loss": 1.1402, + "step": 4592 + }, + { + "epoch": 0.8178418803418803, + "grad_norm": 0.6075260043144226, + "learning_rate": 0.00018020334049448697, + "loss": 0.8601, + "step": 4593 + }, + { + "epoch": 0.8180199430199431, + "grad_norm": 0.522682785987854, + "learning_rate": 0.0001801949793101239, + "loss": 1.0088, + "step": 4594 + }, + { + "epoch": 0.8181980056980057, + "grad_norm": 0.5648437142372131, + 
"learning_rate": 0.00018018661655450682, + "loss": 0.8359, + "step": 4595 + }, + { + "epoch": 0.8183760683760684, + "grad_norm": 0.5406472086906433, + "learning_rate": 0.00018017825222779954, + "loss": 1.1553, + "step": 4596 + }, + { + "epoch": 0.8185541310541311, + "grad_norm": 0.4917788803577423, + "learning_rate": 0.000180169886330166, + "loss": 1.2198, + "step": 4597 + }, + { + "epoch": 0.8187321937321937, + "grad_norm": 0.6293069124221802, + "learning_rate": 0.00018016151886177004, + "loss": 1.0245, + "step": 4598 + }, + { + "epoch": 0.8189102564102564, + "grad_norm": 0.47277843952178955, + "learning_rate": 0.00018015314982277564, + "loss": 1.1141, + "step": 4599 + }, + { + "epoch": 0.8190883190883191, + "grad_norm": 0.6132395267486572, + "learning_rate": 0.0001801447792133468, + "loss": 1.1227, + "step": 4600 + }, + { + "epoch": 0.8192663817663818, + "grad_norm": 0.46839597821235657, + "learning_rate": 0.00018013640703364747, + "loss": 0.9239, + "step": 4601 + }, + { + "epoch": 0.8194444444444444, + "grad_norm": 0.5055009722709656, + "learning_rate": 0.00018012803328384171, + "loss": 0.8486, + "step": 4602 + }, + { + "epoch": 0.8196225071225072, + "grad_norm": 0.5094841718673706, + "learning_rate": 0.00018011965796409362, + "loss": 0.9969, + "step": 4603 + }, + { + "epoch": 0.8198005698005698, + "grad_norm": 0.6177363395690918, + "learning_rate": 0.00018011128107456726, + "loss": 1.242, + "step": 4604 + }, + { + "epoch": 0.8199786324786325, + "grad_norm": 0.5280042290687561, + "learning_rate": 0.00018010290261542676, + "loss": 1.1569, + "step": 4605 + }, + { + "epoch": 0.8201566951566952, + "grad_norm": 0.5259367227554321, + "learning_rate": 0.00018009452258683625, + "loss": 0.9993, + "step": 4606 + }, + { + "epoch": 0.8203347578347578, + "grad_norm": 0.464469850063324, + "learning_rate": 0.00018008614098896, + "loss": 1.0288, + "step": 4607 + }, + { + "epoch": 0.8205128205128205, + "grad_norm": 0.6136324405670166, + "learning_rate": 0.00018007775782196214, + 
"loss": 1.1541, + "step": 4608 + }, + { + "epoch": 0.8206908831908832, + "grad_norm": 0.5376590490341187, + "learning_rate": 0.000180069373086007, + "loss": 1.0624, + "step": 4609 + }, + { + "epoch": 0.8208689458689459, + "grad_norm": 0.662916362285614, + "learning_rate": 0.0001800609867812588, + "loss": 1.1502, + "step": 4610 + }, + { + "epoch": 0.8210470085470085, + "grad_norm": 0.5153383612632751, + "learning_rate": 0.00018005259890788188, + "loss": 0.9789, + "step": 4611 + }, + { + "epoch": 0.8212250712250713, + "grad_norm": 0.5042359232902527, + "learning_rate": 0.00018004420946604057, + "loss": 0.9585, + "step": 4612 + }, + { + "epoch": 0.8214031339031339, + "grad_norm": 0.5395993590354919, + "learning_rate": 0.00018003581845589927, + "loss": 1.159, + "step": 4613 + }, + { + "epoch": 0.8215811965811965, + "grad_norm": 0.5561928749084473, + "learning_rate": 0.00018002742587762237, + "loss": 1.1604, + "step": 4614 + }, + { + "epoch": 0.8217592592592593, + "grad_norm": 0.5602710843086243, + "learning_rate": 0.00018001903173137432, + "loss": 0.9922, + "step": 4615 + }, + { + "epoch": 0.8219373219373219, + "grad_norm": 0.5529088377952576, + "learning_rate": 0.00018001063601731955, + "loss": 1.0943, + "step": 4616 + }, + { + "epoch": 0.8221153846153846, + "grad_norm": 0.5156456828117371, + "learning_rate": 0.00018000223873562254, + "loss": 1.1399, + "step": 4617 + }, + { + "epoch": 0.8222934472934473, + "grad_norm": 0.4868306517601013, + "learning_rate": 0.0001799938398864479, + "loss": 1.0692, + "step": 4618 + }, + { + "epoch": 0.82247150997151, + "grad_norm": 0.5372915267944336, + "learning_rate": 0.0001799854394699601, + "loss": 1.2675, + "step": 4619 + }, + { + "epoch": 0.8226495726495726, + "grad_norm": 0.6101839542388916, + "learning_rate": 0.0001799770374863238, + "loss": 0.9586, + "step": 4620 + }, + { + "epoch": 0.8228276353276354, + "grad_norm": 0.5034586787223816, + "learning_rate": 0.00017996863393570357, + "loss": 1.0885, + "step": 4621 + }, + { + 
"epoch": 0.823005698005698, + "grad_norm": 0.5608823299407959, + "learning_rate": 0.0001799602288182641, + "loss": 1.0002, + "step": 4622 + }, + { + "epoch": 0.8231837606837606, + "grad_norm": 0.5700048208236694, + "learning_rate": 0.00017995182213417, + "loss": 1.1484, + "step": 4623 + }, + { + "epoch": 0.8233618233618234, + "grad_norm": 0.5283229351043701, + "learning_rate": 0.00017994341388358608, + "loss": 1.0744, + "step": 4624 + }, + { + "epoch": 0.823539886039886, + "grad_norm": 0.5215758681297302, + "learning_rate": 0.00017993500406667703, + "loss": 1.2686, + "step": 4625 + }, + { + "epoch": 0.8237179487179487, + "grad_norm": 0.528883159160614, + "learning_rate": 0.0001799265926836076, + "loss": 1.1393, + "step": 4626 + }, + { + "epoch": 0.8238960113960114, + "grad_norm": 0.5589834451675415, + "learning_rate": 0.00017991817973454265, + "loss": 1.1744, + "step": 4627 + }, + { + "epoch": 0.8240740740740741, + "grad_norm": 0.49817174673080444, + "learning_rate": 0.00017990976521964697, + "loss": 1.0544, + "step": 4628 + }, + { + "epoch": 0.8242521367521367, + "grad_norm": 0.613961398601532, + "learning_rate": 0.00017990134913908542, + "loss": 1.0951, + "step": 4629 + }, + { + "epoch": 0.8244301994301995, + "grad_norm": 0.47278255224227905, + "learning_rate": 0.00017989293149302295, + "loss": 0.9742, + "step": 4630 + }, + { + "epoch": 0.8246082621082621, + "grad_norm": 0.49807092547416687, + "learning_rate": 0.00017988451228162443, + "loss": 1.0985, + "step": 4631 + }, + { + "epoch": 0.8247863247863247, + "grad_norm": 0.5624374747276306, + "learning_rate": 0.00017987609150505485, + "loss": 1.2446, + "step": 4632 + }, + { + "epoch": 0.8249643874643875, + "grad_norm": 0.4863535761833191, + "learning_rate": 0.00017986766916347916, + "loss": 1.0239, + "step": 4633 + }, + { + "epoch": 0.8251424501424501, + "grad_norm": 0.679585874080658, + "learning_rate": 0.00017985924525706245, + "loss": 1.1698, + "step": 4634 + }, + { + "epoch": 0.8253205128205128, + "grad_norm": 
0.5545455813407898, + "learning_rate": 0.00017985081978596967, + "loss": 1.0926, + "step": 4635 + }, + { + "epoch": 0.8254985754985755, + "grad_norm": 0.5303109288215637, + "learning_rate": 0.000179842392750366, + "loss": 1.0978, + "step": 4636 + }, + { + "epoch": 0.8256766381766382, + "grad_norm": 0.6053299307823181, + "learning_rate": 0.00017983396415041644, + "loss": 1.0596, + "step": 4637 + }, + { + "epoch": 0.8258547008547008, + "grad_norm": 0.5241885185241699, + "learning_rate": 0.00017982553398628625, + "loss": 0.8541, + "step": 4638 + }, + { + "epoch": 0.8260327635327636, + "grad_norm": 0.5934443473815918, + "learning_rate": 0.00017981710225814052, + "loss": 1.145, + "step": 4639 + }, + { + "epoch": 0.8262108262108262, + "grad_norm": 0.5341619849205017, + "learning_rate": 0.00017980866896614447, + "loss": 1.0745, + "step": 4640 + }, + { + "epoch": 0.8263888888888888, + "grad_norm": 0.6732913851737976, + "learning_rate": 0.00017980023411046336, + "loss": 1.0775, + "step": 4641 + }, + { + "epoch": 0.8265669515669516, + "grad_norm": 0.5134359002113342, + "learning_rate": 0.0001797917976912624, + "loss": 1.0298, + "step": 4642 + }, + { + "epoch": 0.8267450142450142, + "grad_norm": 0.5234783887863159, + "learning_rate": 0.00017978335970870698, + "loss": 1.1069, + "step": 4643 + }, + { + "epoch": 0.8269230769230769, + "grad_norm": 0.4776439964771271, + "learning_rate": 0.00017977492016296232, + "loss": 0.6367, + "step": 4644 + }, + { + "epoch": 0.8271011396011396, + "grad_norm": 0.53763347864151, + "learning_rate": 0.0001797664790541938, + "loss": 1.1356, + "step": 4645 + }, + { + "epoch": 0.8272792022792023, + "grad_norm": 0.5082212686538696, + "learning_rate": 0.00017975803638256682, + "loss": 0.7873, + "step": 4646 + }, + { + "epoch": 0.8274572649572649, + "grad_norm": 0.5156424641609192, + "learning_rate": 0.00017974959214824685, + "loss": 1.084, + "step": 4647 + }, + { + "epoch": 0.8276353276353277, + "grad_norm": 0.5275198817253113, + "learning_rate": 
0.00017974114635139926, + "loss": 1.1219, + "step": 4648 + }, + { + "epoch": 0.8278133903133903, + "grad_norm": 0.5548223257064819, + "learning_rate": 0.00017973269899218956, + "loss": 1.0808, + "step": 4649 + }, + { + "epoch": 0.8279914529914529, + "grad_norm": 0.535347580909729, + "learning_rate": 0.00017972425007078323, + "loss": 1.1211, + "step": 4650 + }, + { + "epoch": 0.8281695156695157, + "grad_norm": 0.5299580693244934, + "learning_rate": 0.00017971579958734587, + "loss": 0.9911, + "step": 4651 + }, + { + "epoch": 0.8283475783475783, + "grad_norm": 0.4863550066947937, + "learning_rate": 0.000179707347542043, + "loss": 0.9122, + "step": 4652 + }, + { + "epoch": 0.8285256410256411, + "grad_norm": 0.5284972190856934, + "learning_rate": 0.00017969889393504022, + "loss": 1.0424, + "step": 4653 + }, + { + "epoch": 0.8287037037037037, + "grad_norm": 0.5305661559104919, + "learning_rate": 0.00017969043876650317, + "loss": 1.1122, + "step": 4654 + }, + { + "epoch": 0.8288817663817664, + "grad_norm": 0.5645657777786255, + "learning_rate": 0.00017968198203659755, + "loss": 1.2195, + "step": 4655 + }, + { + "epoch": 0.8290598290598291, + "grad_norm": 0.521649181842804, + "learning_rate": 0.000179673523745489, + "loss": 1.2684, + "step": 4656 + }, + { + "epoch": 0.8292378917378918, + "grad_norm": 0.5984422564506531, + "learning_rate": 0.00017966506389334322, + "loss": 0.9894, + "step": 4657 + }, + { + "epoch": 0.8294159544159544, + "grad_norm": 0.5318729281425476, + "learning_rate": 0.00017965660248032603, + "loss": 1.2929, + "step": 4658 + }, + { + "epoch": 0.8295940170940171, + "grad_norm": 0.4666081368923187, + "learning_rate": 0.0001796481395066032, + "loss": 0.9646, + "step": 4659 + }, + { + "epoch": 0.8297720797720798, + "grad_norm": 0.5780388116836548, + "learning_rate": 0.00017963967497234054, + "loss": 1.1043, + "step": 4660 + }, + { + "epoch": 0.8299501424501424, + "grad_norm": 0.44089245796203613, + "learning_rate": 0.00017963120887770387, + "loss": 0.8932, 
+ "step": 4661 + }, + { + "epoch": 0.8301282051282052, + "grad_norm": 0.5198349356651306, + "learning_rate": 0.0001796227412228591, + "loss": 0.9378, + "step": 4662 + }, + { + "epoch": 0.8303062678062678, + "grad_norm": 0.5298343896865845, + "learning_rate": 0.00017961427200797206, + "loss": 1.0272, + "step": 4663 + }, + { + "epoch": 0.8304843304843305, + "grad_norm": 0.5087099671363831, + "learning_rate": 0.0001796058012332088, + "loss": 0.989, + "step": 4664 + }, + { + "epoch": 0.8306623931623932, + "grad_norm": 0.504228949546814, + "learning_rate": 0.0001795973288987352, + "loss": 1.0134, + "step": 4665 + }, + { + "epoch": 0.8308404558404558, + "grad_norm": 0.6788033843040466, + "learning_rate": 0.00017958885500471728, + "loss": 0.8856, + "step": 4666 + }, + { + "epoch": 0.8310185185185185, + "grad_norm": 0.5166172385215759, + "learning_rate": 0.00017958037955132113, + "loss": 0.8711, + "step": 4667 + }, + { + "epoch": 0.8311965811965812, + "grad_norm": 0.5712400078773499, + "learning_rate": 0.00017957190253871272, + "loss": 1.0418, + "step": 4668 + }, + { + "epoch": 0.8313746438746439, + "grad_norm": 0.5531231164932251, + "learning_rate": 0.0001795634239670582, + "loss": 0.9021, + "step": 4669 + }, + { + "epoch": 0.8315527065527065, + "grad_norm": 0.6165615916252136, + "learning_rate": 0.00017955494383652365, + "loss": 1.0927, + "step": 4670 + }, + { + "epoch": 0.8317307692307693, + "grad_norm": 0.5920368432998657, + "learning_rate": 0.00017954646214727525, + "loss": 1.231, + "step": 4671 + }, + { + "epoch": 0.8319088319088319, + "grad_norm": 0.5037244558334351, + "learning_rate": 0.00017953797889947915, + "loss": 0.85, + "step": 4672 + }, + { + "epoch": 0.8320868945868946, + "grad_norm": 0.5618211627006531, + "learning_rate": 0.0001795294940933016, + "loss": 1.145, + "step": 4673 + }, + { + "epoch": 0.8322649572649573, + "grad_norm": 0.6275593042373657, + "learning_rate": 0.00017952100772890877, + "loss": 0.9061, + "step": 4674 + }, + { + "epoch": 
0.83244301994302, + "grad_norm": 0.5376096367835999, + "learning_rate": 0.00017951251980646702, + "loss": 1.1948, + "step": 4675 + }, + { + "epoch": 0.8326210826210826, + "grad_norm": 0.5162268877029419, + "learning_rate": 0.0001795040303261426, + "loss": 1.2158, + "step": 4676 + }, + { + "epoch": 0.8327991452991453, + "grad_norm": 0.5730512142181396, + "learning_rate": 0.0001794955392881019, + "loss": 0.9962, + "step": 4677 + }, + { + "epoch": 0.832977207977208, + "grad_norm": 0.5128712058067322, + "learning_rate": 0.00017948704669251122, + "loss": 1.2797, + "step": 4678 + }, + { + "epoch": 0.8331552706552706, + "grad_norm": 0.5173979997634888, + "learning_rate": 0.00017947855253953697, + "loss": 1.1093, + "step": 4679 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.504646897315979, + "learning_rate": 0.0001794700568293456, + "loss": 1.3171, + "step": 4680 + }, + { + "epoch": 0.833511396011396, + "grad_norm": 0.5638105869293213, + "learning_rate": 0.00017946155956210356, + "loss": 0.9224, + "step": 4681 + }, + { + "epoch": 0.8336894586894587, + "grad_norm": 0.5289680361747742, + "learning_rate": 0.00017945306073797733, + "loss": 0.8919, + "step": 4682 + }, + { + "epoch": 0.8338675213675214, + "grad_norm": 0.5224629044532776, + "learning_rate": 0.0001794445603571334, + "loss": 1.0345, + "step": 4683 + }, + { + "epoch": 0.834045584045584, + "grad_norm": 0.5342282056808472, + "learning_rate": 0.00017943605841973836, + "loss": 1.2305, + "step": 4684 + }, + { + "epoch": 0.8342236467236467, + "grad_norm": 0.6118032336235046, + "learning_rate": 0.00017942755492595874, + "loss": 1.0316, + "step": 4685 + }, + { + "epoch": 0.8344017094017094, + "grad_norm": 0.49112311005592346, + "learning_rate": 0.00017941904987596121, + "loss": 0.9809, + "step": 4686 + }, + { + "epoch": 0.8345797720797721, + "grad_norm": 0.5044063925743103, + "learning_rate": 0.0001794105432699124, + "loss": 0.834, + "step": 4687 + }, + { + "epoch": 0.8347578347578347, + "grad_norm": 
0.4849987328052521, + "learning_rate": 0.00017940203510797892, + "loss": 0.9971, + "step": 4688 + }, + { + "epoch": 0.8349358974358975, + "grad_norm": 0.5539469122886658, + "learning_rate": 0.00017939352539032748, + "loss": 1.1599, + "step": 4689 + }, + { + "epoch": 0.8351139601139601, + "grad_norm": 0.5474258065223694, + "learning_rate": 0.00017938501411712485, + "loss": 1.25, + "step": 4690 + }, + { + "epoch": 0.8352920227920227, + "grad_norm": 0.4880213737487793, + "learning_rate": 0.0001793765012885378, + "loss": 1.1471, + "step": 4691 + }, + { + "epoch": 0.8354700854700855, + "grad_norm": 0.5602759718894958, + "learning_rate": 0.00017936798690473309, + "loss": 1.0723, + "step": 4692 + }, + { + "epoch": 0.8356481481481481, + "grad_norm": 0.627775251865387, + "learning_rate": 0.00017935947096587755, + "loss": 1.3768, + "step": 4693 + }, + { + "epoch": 0.8358262108262108, + "grad_norm": 0.5324847102165222, + "learning_rate": 0.00017935095347213804, + "loss": 0.9945, + "step": 4694 + }, + { + "epoch": 0.8360042735042735, + "grad_norm": 0.5244048237800598, + "learning_rate": 0.0001793424344236814, + "loss": 1.1725, + "step": 4695 + }, + { + "epoch": 0.8361823361823362, + "grad_norm": 0.5420708656311035, + "learning_rate": 0.00017933391382067462, + "loss": 1.1267, + "step": 4696 + }, + { + "epoch": 0.8363603988603988, + "grad_norm": 0.5285456776618958, + "learning_rate": 0.00017932539166328458, + "loss": 1.0368, + "step": 4697 + }, + { + "epoch": 0.8365384615384616, + "grad_norm": 0.5330373048782349, + "learning_rate": 0.00017931686795167825, + "loss": 1.1082, + "step": 4698 + }, + { + "epoch": 0.8367165242165242, + "grad_norm": 0.5516682267189026, + "learning_rate": 0.0001793083426860227, + "loss": 1.1833, + "step": 4699 + }, + { + "epoch": 0.8368945868945868, + "grad_norm": 0.5229935646057129, + "learning_rate": 0.0001792998158664849, + "loss": 0.8527, + "step": 4700 + }, + { + "epoch": 0.8370726495726496, + "grad_norm": 0.4821490943431854, + "learning_rate": 
0.00017929128749323195, + "loss": 1.1201, + "step": 4701 + }, + { + "epoch": 0.8372507122507122, + "grad_norm": 0.6276404857635498, + "learning_rate": 0.0001792827575664309, + "loss": 1.0986, + "step": 4702 + }, + { + "epoch": 0.8374287749287749, + "grad_norm": 0.5681334733963013, + "learning_rate": 0.00017927422608624897, + "loss": 1.3821, + "step": 4703 + }, + { + "epoch": 0.8376068376068376, + "grad_norm": 0.5257087349891663, + "learning_rate": 0.00017926569305285324, + "loss": 1.1033, + "step": 4704 + }, + { + "epoch": 0.8377849002849003, + "grad_norm": 0.5665168166160583, + "learning_rate": 0.0001792571584664109, + "loss": 1.104, + "step": 4705 + }, + { + "epoch": 0.8379629629629629, + "grad_norm": 0.5202076435089111, + "learning_rate": 0.00017924862232708918, + "loss": 1.052, + "step": 4706 + }, + { + "epoch": 0.8381410256410257, + "grad_norm": 0.5103010535240173, + "learning_rate": 0.00017924008463505534, + "loss": 1.1348, + "step": 4707 + }, + { + "epoch": 0.8383190883190883, + "grad_norm": 0.6811865568161011, + "learning_rate": 0.00017923154539047667, + "loss": 1.2804, + "step": 4708 + }, + { + "epoch": 0.8384971509971509, + "grad_norm": 0.46808311343193054, + "learning_rate": 0.00017922300459352042, + "loss": 0.9302, + "step": 4709 + }, + { + "epoch": 0.8386752136752137, + "grad_norm": 0.47713059186935425, + "learning_rate": 0.00017921446224435398, + "loss": 0.78, + "step": 4710 + }, + { + "epoch": 0.8388532763532763, + "grad_norm": 0.7579890489578247, + "learning_rate": 0.0001792059183431447, + "loss": 1.4776, + "step": 4711 + }, + { + "epoch": 0.8390313390313391, + "grad_norm": 0.6009423136711121, + "learning_rate": 0.00017919737289006, + "loss": 1.2679, + "step": 4712 + }, + { + "epoch": 0.8392094017094017, + "grad_norm": 0.56390780210495, + "learning_rate": 0.00017918882588526729, + "loss": 1.0402, + "step": 4713 + }, + { + "epoch": 0.8393874643874644, + "grad_norm": 0.5698862075805664, + "learning_rate": 0.00017918027732893404, + "loss": 1.2336, + 
"step": 4714 + }, + { + "epoch": 0.8395655270655271, + "grad_norm": 0.5016305446624756, + "learning_rate": 0.0001791717272212277, + "loss": 1.0373, + "step": 4715 + }, + { + "epoch": 0.8397435897435898, + "grad_norm": 0.5886971950531006, + "learning_rate": 0.0001791631755623159, + "loss": 1.1062, + "step": 4716 + }, + { + "epoch": 0.8399216524216524, + "grad_norm": 0.647833526134491, + "learning_rate": 0.00017915462235236607, + "loss": 1.0464, + "step": 4717 + }, + { + "epoch": 0.8400997150997151, + "grad_norm": 0.4961194396018982, + "learning_rate": 0.00017914606759154587, + "loss": 1.0763, + "step": 4718 + }, + { + "epoch": 0.8402777777777778, + "grad_norm": 0.47041359543800354, + "learning_rate": 0.00017913751128002288, + "loss": 1.0685, + "step": 4719 + }, + { + "epoch": 0.8404558404558404, + "grad_norm": 0.5752858519554138, + "learning_rate": 0.00017912895341796475, + "loss": 1.0577, + "step": 4720 + }, + { + "epoch": 0.8406339031339032, + "grad_norm": 0.5233224034309387, + "learning_rate": 0.00017912039400553914, + "loss": 1.1484, + "step": 4721 + }, + { + "epoch": 0.8408119658119658, + "grad_norm": 0.5327485203742981, + "learning_rate": 0.00017911183304291378, + "loss": 1.0028, + "step": 4722 + }, + { + "epoch": 0.8409900284900285, + "grad_norm": 0.5320752263069153, + "learning_rate": 0.00017910327053025638, + "loss": 1.1247, + "step": 4723 + }, + { + "epoch": 0.8411680911680912, + "grad_norm": 0.529617965221405, + "learning_rate": 0.00017909470646773477, + "loss": 1.1698, + "step": 4724 + }, + { + "epoch": 0.8413461538461539, + "grad_norm": 0.5055609345436096, + "learning_rate": 0.00017908614085551664, + "loss": 1.0925, + "step": 4725 + }, + { + "epoch": 0.8415242165242165, + "grad_norm": 0.5356255769729614, + "learning_rate": 0.00017907757369376985, + "loss": 1.0354, + "step": 4726 + }, + { + "epoch": 0.8417022792022792, + "grad_norm": 0.582834780216217, + "learning_rate": 0.00017906900498266233, + "loss": 1.1248, + "step": 4727 + }, + { + "epoch": 
0.8418803418803419, + "grad_norm": 0.5750834941864014, + "learning_rate": 0.00017906043472236188, + "loss": 1.0119, + "step": 4728 + }, + { + "epoch": 0.8420584045584045, + "grad_norm": 0.5923320055007935, + "learning_rate": 0.00017905186291303644, + "loss": 1.0662, + "step": 4729 + }, + { + "epoch": 0.8422364672364673, + "grad_norm": 0.4767811894416809, + "learning_rate": 0.00017904328955485396, + "loss": 1.0911, + "step": 4730 + }, + { + "epoch": 0.8424145299145299, + "grad_norm": 0.5294556021690369, + "learning_rate": 0.00017903471464798245, + "loss": 1.2861, + "step": 4731 + }, + { + "epoch": 0.8425925925925926, + "grad_norm": 0.599117636680603, + "learning_rate": 0.00017902613819258985, + "loss": 1.1707, + "step": 4732 + }, + { + "epoch": 0.8427706552706553, + "grad_norm": 0.5912977457046509, + "learning_rate": 0.00017901756018884424, + "loss": 1.1884, + "step": 4733 + }, + { + "epoch": 0.842948717948718, + "grad_norm": 0.587676465511322, + "learning_rate": 0.0001790089806369137, + "loss": 1.1054, + "step": 4734 + }, + { + "epoch": 0.8431267806267806, + "grad_norm": 0.6271800398826599, + "learning_rate": 0.0001790003995369663, + "loss": 1.2094, + "step": 4735 + }, + { + "epoch": 0.8433048433048433, + "grad_norm": 0.47198590636253357, + "learning_rate": 0.00017899181688917017, + "loss": 0.9561, + "step": 4736 + }, + { + "epoch": 0.843482905982906, + "grad_norm": 0.690732479095459, + "learning_rate": 0.00017898323269369351, + "loss": 1.1629, + "step": 4737 + }, + { + "epoch": 0.8436609686609686, + "grad_norm": 0.4926888048648834, + "learning_rate": 0.00017897464695070445, + "loss": 1.1097, + "step": 4738 + }, + { + "epoch": 0.8438390313390314, + "grad_norm": 0.7071278691291809, + "learning_rate": 0.00017896605966037128, + "loss": 1.195, + "step": 4739 + }, + { + "epoch": 0.844017094017094, + "grad_norm": 0.5650486350059509, + "learning_rate": 0.00017895747082286216, + "loss": 1.0107, + "step": 4740 + }, + { + "epoch": 0.8441951566951567, + "grad_norm": 
0.5291931629180908, + "learning_rate": 0.00017894888043834545, + "loss": 1.0104, + "step": 4741 + }, + { + "epoch": 0.8443732193732194, + "grad_norm": 0.5751241445541382, + "learning_rate": 0.00017894028850698942, + "loss": 1.2482, + "step": 4742 + }, + { + "epoch": 0.844551282051282, + "grad_norm": 0.5833632349967957, + "learning_rate": 0.0001789316950289624, + "loss": 1.0552, + "step": 4743 + }, + { + "epoch": 0.8447293447293447, + "grad_norm": 0.543729841709137, + "learning_rate": 0.00017892310000443282, + "loss": 1.1453, + "step": 4744 + }, + { + "epoch": 0.8449074074074074, + "grad_norm": 0.5674204230308533, + "learning_rate": 0.00017891450343356902, + "loss": 1.0757, + "step": 4745 + }, + { + "epoch": 0.8450854700854701, + "grad_norm": 0.5161892771720886, + "learning_rate": 0.00017890590531653946, + "loss": 1.1163, + "step": 4746 + }, + { + "epoch": 0.8452635327635327, + "grad_norm": 0.49907612800598145, + "learning_rate": 0.00017889730565351258, + "loss": 1.0356, + "step": 4747 + }, + { + "epoch": 0.8454415954415955, + "grad_norm": 0.4994732439517975, + "learning_rate": 0.00017888870444465692, + "loss": 1.026, + "step": 4748 + }, + { + "epoch": 0.8456196581196581, + "grad_norm": 0.6397520303726196, + "learning_rate": 0.00017888010169014095, + "loss": 0.957, + "step": 4749 + }, + { + "epoch": 0.8457977207977208, + "grad_norm": 0.5379729270935059, + "learning_rate": 0.00017887149739013327, + "loss": 1.1664, + "step": 4750 + }, + { + "epoch": 0.8459757834757835, + "grad_norm": 0.4487382769584656, + "learning_rate": 0.00017886289154480246, + "loss": 0.9377, + "step": 4751 + }, + { + "epoch": 0.8461538461538461, + "grad_norm": 0.5645943880081177, + "learning_rate": 0.00017885428415431707, + "loss": 1.273, + "step": 4752 + }, + { + "epoch": 0.8463319088319088, + "grad_norm": 0.5535289645195007, + "learning_rate": 0.00017884567521884577, + "loss": 1.1779, + "step": 4753 + }, + { + "epoch": 0.8465099715099715, + "grad_norm": 0.5039721131324768, + "learning_rate": 
0.0001788370647385573, + "loss": 1.0237, + "step": 4754 + }, + { + "epoch": 0.8466880341880342, + "grad_norm": 0.4543854892253876, + "learning_rate": 0.00017882845271362032, + "loss": 0.8149, + "step": 4755 + }, + { + "epoch": 0.8468660968660968, + "grad_norm": 0.5095639824867249, + "learning_rate": 0.00017881983914420352, + "loss": 1.0141, + "step": 4756 + }, + { + "epoch": 0.8470441595441596, + "grad_norm": 0.5341798663139343, + "learning_rate": 0.00017881122403047575, + "loss": 1.1885, + "step": 4757 + }, + { + "epoch": 0.8472222222222222, + "grad_norm": 0.5595062971115112, + "learning_rate": 0.00017880260737260573, + "loss": 0.8939, + "step": 4758 + }, + { + "epoch": 0.8474002849002849, + "grad_norm": 0.5355880260467529, + "learning_rate": 0.00017879398917076232, + "loss": 1.2434, + "step": 4759 + }, + { + "epoch": 0.8475783475783476, + "grad_norm": 0.49477261304855347, + "learning_rate": 0.0001787853694251144, + "loss": 0.979, + "step": 4760 + }, + { + "epoch": 0.8477564102564102, + "grad_norm": 0.5154359340667725, + "learning_rate": 0.00017877674813583078, + "loss": 1.0957, + "step": 4761 + }, + { + "epoch": 0.8479344729344729, + "grad_norm": 0.5651070475578308, + "learning_rate": 0.00017876812530308046, + "loss": 1.1884, + "step": 4762 + }, + { + "epoch": 0.8481125356125356, + "grad_norm": 0.537277340888977, + "learning_rate": 0.00017875950092703232, + "loss": 1.0272, + "step": 4763 + }, + { + "epoch": 0.8482905982905983, + "grad_norm": 0.5259691476821899, + "learning_rate": 0.00017875087500785538, + "loss": 1.1493, + "step": 4764 + }, + { + "epoch": 0.8484686609686609, + "grad_norm": 0.5491300225257874, + "learning_rate": 0.00017874224754571867, + "loss": 0.8316, + "step": 4765 + }, + { + "epoch": 0.8486467236467237, + "grad_norm": 0.5493744611740112, + "learning_rate": 0.00017873361854079116, + "loss": 1.2328, + "step": 4766 + }, + { + "epoch": 0.8488247863247863, + "grad_norm": 0.571002185344696, + "learning_rate": 0.00017872498799324197, + "loss": 
1.1384, + "step": 4767 + }, + { + "epoch": 0.8490028490028491, + "grad_norm": 0.538152813911438, + "learning_rate": 0.00017871635590324013, + "loss": 1.0581, + "step": 4768 + }, + { + "epoch": 0.8491809116809117, + "grad_norm": 0.5214923620223999, + "learning_rate": 0.00017870772227095486, + "loss": 1.0612, + "step": 4769 + }, + { + "epoch": 0.8493589743589743, + "grad_norm": 0.5714883804321289, + "learning_rate": 0.0001786990870965553, + "loss": 0.9076, + "step": 4770 + }, + { + "epoch": 0.8495370370370371, + "grad_norm": 0.4181775450706482, + "learning_rate": 0.00017869045038021054, + "loss": 0.8366, + "step": 4771 + }, + { + "epoch": 0.8497150997150997, + "grad_norm": 0.6266027688980103, + "learning_rate": 0.00017868181212208993, + "loss": 1.2047, + "step": 4772 + }, + { + "epoch": 0.8498931623931624, + "grad_norm": 0.5423732399940491, + "learning_rate": 0.0001786731723223626, + "loss": 1.3878, + "step": 4773 + }, + { + "epoch": 0.8500712250712251, + "grad_norm": 0.5512300133705139, + "learning_rate": 0.00017866453098119793, + "loss": 1.1132, + "step": 4774 + }, + { + "epoch": 0.8502492877492878, + "grad_norm": 0.5767185688018799, + "learning_rate": 0.00017865588809876519, + "loss": 0.97, + "step": 4775 + }, + { + "epoch": 0.8504273504273504, + "grad_norm": 0.5305790305137634, + "learning_rate": 0.00017864724367523368, + "loss": 1.1158, + "step": 4776 + }, + { + "epoch": 0.8506054131054132, + "grad_norm": 0.49702391028404236, + "learning_rate": 0.00017863859771077284, + "loss": 0.9669, + "step": 4777 + }, + { + "epoch": 0.8507834757834758, + "grad_norm": 0.5490063428878784, + "learning_rate": 0.00017862995020555205, + "loss": 1.0646, + "step": 4778 + }, + { + "epoch": 0.8509615384615384, + "grad_norm": 0.5308689475059509, + "learning_rate": 0.00017862130115974068, + "loss": 0.8922, + "step": 4779 + }, + { + "epoch": 0.8511396011396012, + "grad_norm": 0.5412983894348145, + "learning_rate": 0.00017861265057350826, + "loss": 1.1444, + "step": 4780 + }, + { + 
"epoch": 0.8513176638176638, + "grad_norm": 0.5857377052307129, + "learning_rate": 0.00017860399844702425, + "loss": 1.1643, + "step": 4781 + }, + { + "epoch": 0.8514957264957265, + "grad_norm": 0.599273681640625, + "learning_rate": 0.00017859534478045815, + "loss": 1.169, + "step": 4782 + }, + { + "epoch": 0.8516737891737892, + "grad_norm": 0.5677087903022766, + "learning_rate": 0.00017858668957397957, + "loss": 1.0793, + "step": 4783 + }, + { + "epoch": 0.8518518518518519, + "grad_norm": 0.5648362636566162, + "learning_rate": 0.00017857803282775807, + "loss": 1.1932, + "step": 4784 + }, + { + "epoch": 0.8520299145299145, + "grad_norm": 0.5138826966285706, + "learning_rate": 0.00017856937454196323, + "loss": 1.0011, + "step": 4785 + }, + { + "epoch": 0.8522079772079773, + "grad_norm": 0.5951429009437561, + "learning_rate": 0.0001785607147167647, + "loss": 1.3198, + "step": 4786 + }, + { + "epoch": 0.8523860398860399, + "grad_norm": 0.5341953039169312, + "learning_rate": 0.00017855205335233216, + "loss": 0.9094, + "step": 4787 + }, + { + "epoch": 0.8525641025641025, + "grad_norm": 0.5193579196929932, + "learning_rate": 0.00017854339044883535, + "loss": 0.892, + "step": 4788 + }, + { + "epoch": 0.8527421652421653, + "grad_norm": 0.5053097009658813, + "learning_rate": 0.00017853472600644392, + "loss": 1.0589, + "step": 4789 + }, + { + "epoch": 0.8529202279202279, + "grad_norm": 0.5819617509841919, + "learning_rate": 0.0001785260600253277, + "loss": 1.2646, + "step": 4790 + }, + { + "epoch": 0.8530982905982906, + "grad_norm": 0.5327470302581787, + "learning_rate": 0.00017851739250565645, + "loss": 1.056, + "step": 4791 + }, + { + "epoch": 0.8532763532763533, + "grad_norm": 0.5131269097328186, + "learning_rate": 0.0001785087234476, + "loss": 1.1192, + "step": 4792 + }, + { + "epoch": 0.853454415954416, + "grad_norm": 0.4698086977005005, + "learning_rate": 0.00017850005285132821, + "loss": 0.9849, + "step": 4793 + }, + { + "epoch": 0.8536324786324786, + "grad_norm": 
0.5503947734832764, + "learning_rate": 0.00017849138071701092, + "loss": 1.1139, + "step": 4794 + }, + { + "epoch": 0.8538105413105413, + "grad_norm": 0.5120903849601746, + "learning_rate": 0.0001784827070448181, + "loss": 0.9801, + "step": 4795 + }, + { + "epoch": 0.853988603988604, + "grad_norm": 0.47650405764579773, + "learning_rate": 0.00017847403183491968, + "loss": 1.0268, + "step": 4796 + }, + { + "epoch": 0.8541666666666666, + "grad_norm": 0.5773387551307678, + "learning_rate": 0.0001784653550874856, + "loss": 1.0336, + "step": 4797 + }, + { + "epoch": 0.8543447293447294, + "grad_norm": 0.545531153678894, + "learning_rate": 0.00017845667680268593, + "loss": 1.0532, + "step": 4798 + }, + { + "epoch": 0.854522792022792, + "grad_norm": 0.533161461353302, + "learning_rate": 0.0001784479969806906, + "loss": 1.1964, + "step": 4799 + }, + { + "epoch": 0.8547008547008547, + "grad_norm": 0.5880789160728455, + "learning_rate": 0.00017843931562166977, + "loss": 1.1588, + "step": 4800 + }, + { + "epoch": 0.8548789173789174, + "grad_norm": 0.5381524562835693, + "learning_rate": 0.00017843063272579346, + "loss": 1.1533, + "step": 4801 + }, + { + "epoch": 0.85505698005698, + "grad_norm": 0.6280176639556885, + "learning_rate": 0.00017842194829323187, + "loss": 1.0084, + "step": 4802 + }, + { + "epoch": 0.8552350427350427, + "grad_norm": 0.5098552703857422, + "learning_rate": 0.0001784132623241551, + "loss": 1.0804, + "step": 4803 + }, + { + "epoch": 0.8554131054131054, + "grad_norm": 0.5406526923179626, + "learning_rate": 0.00017840457481873328, + "loss": 1.2571, + "step": 4804 + }, + { + "epoch": 0.8555911680911681, + "grad_norm": 0.5859003663063049, + "learning_rate": 0.00017839588577713678, + "loss": 1.2462, + "step": 4805 + }, + { + "epoch": 0.8557692307692307, + "grad_norm": 0.6209002137184143, + "learning_rate": 0.00017838719519953572, + "loss": 1.307, + "step": 4806 + }, + { + "epoch": 0.8559472934472935, + "grad_norm": 0.525753915309906, + "learning_rate": 
0.00017837850308610037, + "loss": 1.2957, + "step": 4807 + }, + { + "epoch": 0.8561253561253561, + "grad_norm": 0.5096195340156555, + "learning_rate": 0.0001783698094370011, + "loss": 1.1433, + "step": 4808 + }, + { + "epoch": 0.8563034188034188, + "grad_norm": 0.5873076915740967, + "learning_rate": 0.0001783611142524082, + "loss": 1.2271, + "step": 4809 + }, + { + "epoch": 0.8564814814814815, + "grad_norm": 0.5093944668769836, + "learning_rate": 0.0001783524175324921, + "loss": 0.8788, + "step": 4810 + }, + { + "epoch": 0.8566595441595442, + "grad_norm": 0.5485084652900696, + "learning_rate": 0.00017834371927742307, + "loss": 1.256, + "step": 4811 + }, + { + "epoch": 0.8568376068376068, + "grad_norm": 0.5808873772621155, + "learning_rate": 0.00017833501948737163, + "loss": 0.9287, + "step": 4812 + }, + { + "epoch": 0.8570156695156695, + "grad_norm": 0.5113978385925293, + "learning_rate": 0.00017832631816250822, + "loss": 1.0372, + "step": 4813 + }, + { + "epoch": 0.8571937321937322, + "grad_norm": 0.5877016186714172, + "learning_rate": 0.0001783176153030033, + "loss": 1.3023, + "step": 4814 + }, + { + "epoch": 0.8573717948717948, + "grad_norm": 0.534328043460846, + "learning_rate": 0.00017830891090902742, + "loss": 1.1023, + "step": 4815 + }, + { + "epoch": 0.8575498575498576, + "grad_norm": 0.5781638026237488, + "learning_rate": 0.0001783002049807511, + "loss": 0.9562, + "step": 4816 + }, + { + "epoch": 0.8577279202279202, + "grad_norm": 0.5760263204574585, + "learning_rate": 0.00017829149751834487, + "loss": 0.8733, + "step": 4817 + }, + { + "epoch": 0.8579059829059829, + "grad_norm": 0.3887255787849426, + "learning_rate": 0.00017828278852197944, + "loss": 0.5949, + "step": 4818 + }, + { + "epoch": 0.8580840455840456, + "grad_norm": 0.47814446687698364, + "learning_rate": 0.00017827407799182537, + "loss": 1.0698, + "step": 4819 + }, + { + "epoch": 0.8582621082621082, + "grad_norm": 0.5520272254943848, + "learning_rate": 0.00017826536592805334, + "loss": 1.1314, 
+ "step": 4820 + }, + { + "epoch": 0.8584401709401709, + "grad_norm": 0.5285319685935974, + "learning_rate": 0.00017825665233083405, + "loss": 1.1618, + "step": 4821 + }, + { + "epoch": 0.8586182336182336, + "grad_norm": 0.6080102324485779, + "learning_rate": 0.0001782479372003382, + "loss": 1.3817, + "step": 4822 + }, + { + "epoch": 0.8587962962962963, + "grad_norm": 0.7474410533905029, + "learning_rate": 0.00017823922053673662, + "loss": 1.1321, + "step": 4823 + }, + { + "epoch": 0.8589743589743589, + "grad_norm": 0.559283435344696, + "learning_rate": 0.0001782305023402, + "loss": 1.1894, + "step": 4824 + }, + { + "epoch": 0.8591524216524217, + "grad_norm": 0.5620571374893188, + "learning_rate": 0.00017822178261089918, + "loss": 1.134, + "step": 4825 + }, + { + "epoch": 0.8593304843304843, + "grad_norm": 0.5553044676780701, + "learning_rate": 0.00017821306134900504, + "loss": 1.3222, + "step": 4826 + }, + { + "epoch": 0.8595085470085471, + "grad_norm": 0.6177778244018555, + "learning_rate": 0.00017820433855468846, + "loss": 1.2545, + "step": 4827 + }, + { + "epoch": 0.8596866096866097, + "grad_norm": 0.656233012676239, + "learning_rate": 0.0001781956142281203, + "loss": 1.1346, + "step": 4828 + }, + { + "epoch": 0.8598646723646723, + "grad_norm": 0.6710973381996155, + "learning_rate": 0.0001781868883694715, + "loss": 1.1361, + "step": 4829 + }, + { + "epoch": 0.8600427350427351, + "grad_norm": 0.5093601942062378, + "learning_rate": 0.0001781781609789131, + "loss": 1.0509, + "step": 4830 + }, + { + "epoch": 0.8602207977207977, + "grad_norm": 0.5707578063011169, + "learning_rate": 0.00017816943205661598, + "loss": 1.0964, + "step": 4831 + }, + { + "epoch": 0.8603988603988604, + "grad_norm": 0.6159597635269165, + "learning_rate": 0.00017816070160275125, + "loss": 1.0322, + "step": 4832 + }, + { + "epoch": 0.8605769230769231, + "grad_norm": 0.5430580377578735, + "learning_rate": 0.0001781519696174899, + "loss": 1.2464, + "step": 4833 + }, + { + "epoch": 
0.8607549857549858, + "grad_norm": 0.48104700446128845, + "learning_rate": 0.0001781432361010031, + "loss": 1.1031, + "step": 4834 + }, + { + "epoch": 0.8609330484330484, + "grad_norm": 0.5304946303367615, + "learning_rate": 0.0001781345010534619, + "loss": 1.0281, + "step": 4835 + }, + { + "epoch": 0.8611111111111112, + "grad_norm": 0.5230711698532104, + "learning_rate": 0.00017812576447503742, + "loss": 0.9499, + "step": 4836 + }, + { + "epoch": 0.8612891737891738, + "grad_norm": 0.5363606214523315, + "learning_rate": 0.00017811702636590093, + "loss": 1.1358, + "step": 4837 + }, + { + "epoch": 0.8614672364672364, + "grad_norm": 0.5880044102668762, + "learning_rate": 0.00017810828672622358, + "loss": 1.1765, + "step": 4838 + }, + { + "epoch": 0.8616452991452992, + "grad_norm": 0.5194395184516907, + "learning_rate": 0.0001780995455561766, + "loss": 1.1622, + "step": 4839 + }, + { + "epoch": 0.8618233618233618, + "grad_norm": 0.5114264488220215, + "learning_rate": 0.00017809080285593126, + "loss": 1.0081, + "step": 4840 + }, + { + "epoch": 0.8620014245014245, + "grad_norm": 0.6174240112304688, + "learning_rate": 0.00017808205862565886, + "loss": 1.0745, + "step": 4841 + }, + { + "epoch": 0.8621794871794872, + "grad_norm": 0.5662630200386047, + "learning_rate": 0.0001780733128655307, + "loss": 1.3369, + "step": 4842 + }, + { + "epoch": 0.8623575498575499, + "grad_norm": 0.5917882919311523, + "learning_rate": 0.00017806456557571817, + "loss": 1.1631, + "step": 4843 + }, + { + "epoch": 0.8625356125356125, + "grad_norm": 0.5305736660957336, + "learning_rate": 0.00017805581675639265, + "loss": 0.9875, + "step": 4844 + }, + { + "epoch": 0.8627136752136753, + "grad_norm": 0.5181219577789307, + "learning_rate": 0.00017804706640772556, + "loss": 0.9918, + "step": 4845 + }, + { + "epoch": 0.8628917378917379, + "grad_norm": 0.5467997789382935, + "learning_rate": 0.00017803831452988832, + "loss": 1.1395, + "step": 4846 + }, + { + "epoch": 0.8630698005698005, + "grad_norm": 
0.5494031310081482, + "learning_rate": 0.00017802956112305241, + "loss": 1.0312, + "step": 4847 + }, + { + "epoch": 0.8632478632478633, + "grad_norm": 0.5804065465927124, + "learning_rate": 0.00017802080618738931, + "loss": 1.1555, + "step": 4848 + }, + { + "epoch": 0.8634259259259259, + "grad_norm": 0.5424801111221313, + "learning_rate": 0.00017801204972307067, + "loss": 1.0215, + "step": 4849 + }, + { + "epoch": 0.8636039886039886, + "grad_norm": 0.5321891903877258, + "learning_rate": 0.0001780032917302679, + "loss": 1.0187, + "step": 4850 + }, + { + "epoch": 0.8637820512820513, + "grad_norm": 0.5543400049209595, + "learning_rate": 0.0001779945322091527, + "loss": 1.1972, + "step": 4851 + }, + { + "epoch": 0.863960113960114, + "grad_norm": 0.566649317741394, + "learning_rate": 0.00017798577115989668, + "loss": 1.0758, + "step": 4852 + }, + { + "epoch": 0.8641381766381766, + "grad_norm": 0.5538444519042969, + "learning_rate": 0.00017797700858267145, + "loss": 1.1338, + "step": 4853 + }, + { + "epoch": 0.8643162393162394, + "grad_norm": 0.5641313791275024, + "learning_rate": 0.0001779682444776487, + "loss": 1.256, + "step": 4854 + }, + { + "epoch": 0.864494301994302, + "grad_norm": 0.6377350091934204, + "learning_rate": 0.00017795947884500016, + "loss": 1.144, + "step": 4855 + }, + { + "epoch": 0.8646723646723646, + "grad_norm": 0.5581876039505005, + "learning_rate": 0.0001779507116848976, + "loss": 1.3163, + "step": 4856 + }, + { + "epoch": 0.8648504273504274, + "grad_norm": 0.5416772365570068, + "learning_rate": 0.0001779419429975128, + "loss": 1.0219, + "step": 4857 + }, + { + "epoch": 0.86502849002849, + "grad_norm": 0.5450608730316162, + "learning_rate": 0.0001779331727830175, + "loss": 1.0093, + "step": 4858 + }, + { + "epoch": 0.8652065527065527, + "grad_norm": 0.5151242017745972, + "learning_rate": 0.00017792440104158358, + "loss": 1.067, + "step": 4859 + }, + { + "epoch": 0.8653846153846154, + "grad_norm": 0.5225046873092651, + "learning_rate": 
0.0001779156277733829, + "loss": 1.0432, + "step": 4860 + }, + { + "epoch": 0.8655626780626781, + "grad_norm": 0.5168602466583252, + "learning_rate": 0.00017790685297858737, + "loss": 0.9665, + "step": 4861 + }, + { + "epoch": 0.8657407407407407, + "grad_norm": 0.5749059319496155, + "learning_rate": 0.00017789807665736889, + "loss": 1.1607, + "step": 4862 + }, + { + "epoch": 0.8659188034188035, + "grad_norm": 0.45656394958496094, + "learning_rate": 0.00017788929880989938, + "loss": 0.8362, + "step": 4863 + }, + { + "epoch": 0.8660968660968661, + "grad_norm": 0.5090615749359131, + "learning_rate": 0.00017788051943635086, + "loss": 0.9553, + "step": 4864 + }, + { + "epoch": 0.8662749287749287, + "grad_norm": 0.5381240248680115, + "learning_rate": 0.0001778717385368954, + "loss": 1.1391, + "step": 4865 + }, + { + "epoch": 0.8664529914529915, + "grad_norm": 0.522720456123352, + "learning_rate": 0.00017786295611170493, + "loss": 1.1869, + "step": 4866 + }, + { + "epoch": 0.8666310541310541, + "grad_norm": 0.530986487865448, + "learning_rate": 0.0001778541721609516, + "loss": 1.1046, + "step": 4867 + }, + { + "epoch": 0.8668091168091168, + "grad_norm": 0.5065864324569702, + "learning_rate": 0.0001778453866848075, + "loss": 1.008, + "step": 4868 + }, + { + "epoch": 0.8669871794871795, + "grad_norm": 0.5541394352912903, + "learning_rate": 0.00017783659968344476, + "loss": 1.0004, + "step": 4869 + }, + { + "epoch": 0.8671652421652422, + "grad_norm": 0.5059576630592346, + "learning_rate": 0.00017782781115703556, + "loss": 1.128, + "step": 4870 + }, + { + "epoch": 0.8673433048433048, + "grad_norm": 0.5052187442779541, + "learning_rate": 0.00017781902110575203, + "loss": 0.8544, + "step": 4871 + }, + { + "epoch": 0.8675213675213675, + "grad_norm": 0.5383397340774536, + "learning_rate": 0.00017781022952976646, + "loss": 1.1411, + "step": 4872 + }, + { + "epoch": 0.8676994301994302, + "grad_norm": 0.4760429859161377, + "learning_rate": 0.00017780143642925106, + "loss": 0.8246, + 
"step": 4873 + }, + { + "epoch": 0.8678774928774928, + "grad_norm": 0.5480535626411438, + "learning_rate": 0.00017779264180437817, + "loss": 1.013, + "step": 4874 + }, + { + "epoch": 0.8680555555555556, + "grad_norm": 0.5303317904472351, + "learning_rate": 0.00017778384565532004, + "loss": 1.0201, + "step": 4875 + }, + { + "epoch": 0.8682336182336182, + "grad_norm": 0.5365355014801025, + "learning_rate": 0.00017777504798224903, + "loss": 1.1107, + "step": 4876 + }, + { + "epoch": 0.8684116809116809, + "grad_norm": 0.5173360705375671, + "learning_rate": 0.00017776624878533754, + "loss": 1.0808, + "step": 4877 + }, + { + "epoch": 0.8685897435897436, + "grad_norm": 0.5088842511177063, + "learning_rate": 0.00017775744806475792, + "loss": 0.995, + "step": 4878 + }, + { + "epoch": 0.8687678062678063, + "grad_norm": 0.5796698927879333, + "learning_rate": 0.00017774864582068264, + "loss": 1.1485, + "step": 4879 + }, + { + "epoch": 0.8689458689458689, + "grad_norm": 0.5719375610351562, + "learning_rate": 0.00017773984205328417, + "loss": 1.0133, + "step": 4880 + }, + { + "epoch": 0.8691239316239316, + "grad_norm": 0.6396418213844299, + "learning_rate": 0.00017773103676273498, + "loss": 1.0932, + "step": 4881 + }, + { + "epoch": 0.8693019943019943, + "grad_norm": 0.5602468252182007, + "learning_rate": 0.00017772222994920763, + "loss": 0.9702, + "step": 4882 + }, + { + "epoch": 0.8694800569800569, + "grad_norm": 0.5167748332023621, + "learning_rate": 0.00017771342161287457, + "loss": 1.0528, + "step": 4883 + }, + { + "epoch": 0.8696581196581197, + "grad_norm": 0.5572916865348816, + "learning_rate": 0.00017770461175390848, + "loss": 1.1341, + "step": 4884 + }, + { + "epoch": 0.8698361823361823, + "grad_norm": 0.6666276454925537, + "learning_rate": 0.00017769580037248195, + "loss": 1.1948, + "step": 4885 + }, + { + "epoch": 0.8700142450142451, + "grad_norm": 0.5348601937294006, + "learning_rate": 0.0001776869874687676, + "loss": 1.0562, + "step": 4886 + }, + { + "epoch": 
0.8701923076923077, + "grad_norm": 0.5449648499488831, + "learning_rate": 0.00017767817304293812, + "loss": 0.988, + "step": 4887 + }, + { + "epoch": 0.8703703703703703, + "grad_norm": 0.5995045304298401, + "learning_rate": 0.0001776693570951662, + "loss": 1.2526, + "step": 4888 + }, + { + "epoch": 0.8705484330484331, + "grad_norm": 0.6575320959091187, + "learning_rate": 0.00017766053962562457, + "loss": 1.1717, + "step": 4889 + }, + { + "epoch": 0.8707264957264957, + "grad_norm": 0.5882139801979065, + "learning_rate": 0.00017765172063448597, + "loss": 1.238, + "step": 4890 + }, + { + "epoch": 0.8709045584045584, + "grad_norm": 0.5908389091491699, + "learning_rate": 0.00017764290012192325, + "loss": 1.0606, + "step": 4891 + }, + { + "epoch": 0.8710826210826211, + "grad_norm": 0.6169339418411255, + "learning_rate": 0.00017763407808810917, + "loss": 1.1456, + "step": 4892 + }, + { + "epoch": 0.8712606837606838, + "grad_norm": 0.5916035771369934, + "learning_rate": 0.0001776252545332166, + "loss": 1.0026, + "step": 4893 + }, + { + "epoch": 0.8714387464387464, + "grad_norm": 0.539995551109314, + "learning_rate": 0.00017761642945741843, + "loss": 1.2397, + "step": 4894 + }, + { + "epoch": 0.8716168091168092, + "grad_norm": 0.5346137881278992, + "learning_rate": 0.00017760760286088755, + "loss": 1.1232, + "step": 4895 + }, + { + "epoch": 0.8717948717948718, + "grad_norm": 0.570202112197876, + "learning_rate": 0.00017759877474379692, + "loss": 1.0708, + "step": 4896 + }, + { + "epoch": 0.8719729344729344, + "grad_norm": 0.5023398399353027, + "learning_rate": 0.00017758994510631948, + "loss": 1.1056, + "step": 4897 + }, + { + "epoch": 0.8721509971509972, + "grad_norm": 0.5447137951850891, + "learning_rate": 0.00017758111394862826, + "loss": 0.8776, + "step": 4898 + }, + { + "epoch": 0.8723290598290598, + "grad_norm": 0.5193906426429749, + "learning_rate": 0.00017757228127089625, + "loss": 0.9959, + "step": 4899 + }, + { + "epoch": 0.8725071225071225, + "grad_norm": 
0.5958787798881531, + "learning_rate": 0.00017756344707329656, + "loss": 1.092, + "step": 4900 + }, + { + "epoch": 0.8726851851851852, + "grad_norm": 0.521045982837677, + "learning_rate": 0.00017755461135600221, + "loss": 0.9864, + "step": 4901 + }, + { + "epoch": 0.8728632478632479, + "grad_norm": 0.5257635116577148, + "learning_rate": 0.00017754577411918638, + "loss": 1.216, + "step": 4902 + }, + { + "epoch": 0.8730413105413105, + "grad_norm": 0.5425964593887329, + "learning_rate": 0.0001775369353630222, + "loss": 1.1432, + "step": 4903 + }, + { + "epoch": 0.8732193732193733, + "grad_norm": 0.47995322942733765, + "learning_rate": 0.00017752809508768286, + "loss": 1.0227, + "step": 4904 + }, + { + "epoch": 0.8733974358974359, + "grad_norm": 0.5747429728507996, + "learning_rate": 0.0001775192532933415, + "loss": 0.9984, + "step": 4905 + }, + { + "epoch": 0.8735754985754985, + "grad_norm": 0.5745723247528076, + "learning_rate": 0.00017751040998017142, + "loss": 1.2559, + "step": 4906 + }, + { + "epoch": 0.8737535612535613, + "grad_norm": 0.6114141941070557, + "learning_rate": 0.0001775015651483459, + "loss": 1.3224, + "step": 4907 + }, + { + "epoch": 0.8739316239316239, + "grad_norm": 0.4757187068462372, + "learning_rate": 0.00017749271879803817, + "loss": 1.0352, + "step": 4908 + }, + { + "epoch": 0.8741096866096866, + "grad_norm": 0.48644450306892395, + "learning_rate": 0.0001774838709294216, + "loss": 1.0876, + "step": 4909 + }, + { + "epoch": 0.8742877492877493, + "grad_norm": 0.5652037262916565, + "learning_rate": 0.00017747502154266955, + "loss": 0.9189, + "step": 4910 + }, + { + "epoch": 0.874465811965812, + "grad_norm": 0.5289644002914429, + "learning_rate": 0.00017746617063795538, + "loss": 0.9431, + "step": 4911 + }, + { + "epoch": 0.8746438746438746, + "grad_norm": 0.594656229019165, + "learning_rate": 0.00017745731821545253, + "loss": 1.2408, + "step": 4912 + }, + { + "epoch": 0.8748219373219374, + "grad_norm": 0.5693240165710449, + "learning_rate": 
0.0001774484642753344, + "loss": 1.347, + "step": 4913 + }, + { + "epoch": 0.875, + "grad_norm": 0.5291008949279785, + "learning_rate": 0.00017743960881777456, + "loss": 1.161, + "step": 4914 + }, + { + "epoch": 0.8751780626780626, + "grad_norm": 0.5958300232887268, + "learning_rate": 0.00017743075184294642, + "loss": 1.2058, + "step": 4915 + }, + { + "epoch": 0.8753561253561254, + "grad_norm": 0.513884425163269, + "learning_rate": 0.00017742189335102354, + "loss": 1.0952, + "step": 4916 + }, + { + "epoch": 0.875534188034188, + "grad_norm": 0.5860681533813477, + "learning_rate": 0.00017741303334217948, + "loss": 1.1801, + "step": 4917 + }, + { + "epoch": 0.8757122507122507, + "grad_norm": 0.47962820529937744, + "learning_rate": 0.00017740417181658788, + "loss": 1.0785, + "step": 4918 + }, + { + "epoch": 0.8758903133903134, + "grad_norm": 0.5110440254211426, + "learning_rate": 0.00017739530877442227, + "loss": 1.1385, + "step": 4919 + }, + { + "epoch": 0.8760683760683761, + "grad_norm": 0.5106285214424133, + "learning_rate": 0.00017738644421585643, + "loss": 1.1204, + "step": 4920 + }, + { + "epoch": 0.8762464387464387, + "grad_norm": 0.5709205865859985, + "learning_rate": 0.00017737757814106393, + "loss": 1.0108, + "step": 4921 + }, + { + "epoch": 0.8764245014245015, + "grad_norm": 0.5850250124931335, + "learning_rate": 0.0001773687105502185, + "loss": 1.0059, + "step": 4922 + }, + { + "epoch": 0.8766025641025641, + "grad_norm": 0.5194727778434753, + "learning_rate": 0.00017735984144349396, + "loss": 0.9466, + "step": 4923 + }, + { + "epoch": 0.8767806267806267, + "grad_norm": 0.5246787667274475, + "learning_rate": 0.000177350970821064, + "loss": 1.1336, + "step": 4924 + }, + { + "epoch": 0.8769586894586895, + "grad_norm": 0.5798323154449463, + "learning_rate": 0.00017734209868310244, + "loss": 1.1641, + "step": 4925 + }, + { + "epoch": 0.8771367521367521, + "grad_norm": 0.5188565850257874, + "learning_rate": 0.00017733322502978314, + "loss": 0.9959, + "step": 4926 
+ }, + { + "epoch": 0.8773148148148148, + "grad_norm": 0.5969653725624084, + "learning_rate": 0.00017732434986127995, + "loss": 1.2162, + "step": 4927 + }, + { + "epoch": 0.8774928774928775, + "grad_norm": 0.5520089268684387, + "learning_rate": 0.00017731547317776674, + "loss": 1.0163, + "step": 4928 + }, + { + "epoch": 0.8776709401709402, + "grad_norm": 0.48789507150650024, + "learning_rate": 0.00017730659497941745, + "loss": 0.9757, + "step": 4929 + }, + { + "epoch": 0.8778490028490028, + "grad_norm": 0.6034960746765137, + "learning_rate": 0.000177297715266406, + "loss": 1.1278, + "step": 4930 + }, + { + "epoch": 0.8780270655270656, + "grad_norm": 0.53016597032547, + "learning_rate": 0.00017728883403890638, + "loss": 1.0637, + "step": 4931 + }, + { + "epoch": 0.8782051282051282, + "grad_norm": 0.5073726177215576, + "learning_rate": 0.00017727995129709266, + "loss": 1.1491, + "step": 4932 + }, + { + "epoch": 0.8783831908831908, + "grad_norm": 0.540605366230011, + "learning_rate": 0.00017727106704113878, + "loss": 1.0133, + "step": 4933 + }, + { + "epoch": 0.8785612535612536, + "grad_norm": 0.5346775054931641, + "learning_rate": 0.0001772621812712189, + "loss": 1.1781, + "step": 4934 + }, + { + "epoch": 0.8787393162393162, + "grad_norm": 0.5659036040306091, + "learning_rate": 0.00017725329398750702, + "loss": 1.1023, + "step": 4935 + }, + { + "epoch": 0.8789173789173789, + "grad_norm": 0.591063380241394, + "learning_rate": 0.00017724440519017738, + "loss": 1.0298, + "step": 4936 + }, + { + "epoch": 0.8790954415954416, + "grad_norm": 0.5173781514167786, + "learning_rate": 0.0001772355148794041, + "loss": 1.0483, + "step": 4937 + }, + { + "epoch": 0.8792735042735043, + "grad_norm": 0.5405352711677551, + "learning_rate": 0.0001772266230553613, + "loss": 1.0716, + "step": 4938 + }, + { + "epoch": 0.8794515669515669, + "grad_norm": 0.518442690372467, + "learning_rate": 0.00017721772971822323, + "loss": 1.1373, + "step": 4939 + }, + { + "epoch": 0.8796296296296297, + 
"grad_norm": 0.533673107624054, + "learning_rate": 0.0001772088348681642, + "loss": 1.0489, + "step": 4940 + }, + { + "epoch": 0.8798076923076923, + "grad_norm": 0.46117857098579407, + "learning_rate": 0.0001771999385053584, + "loss": 1.0297, + "step": 4941 + }, + { + "epoch": 0.8799857549857549, + "grad_norm": 0.4687997102737427, + "learning_rate": 0.0001771910406299802, + "loss": 1.071, + "step": 4942 + }, + { + "epoch": 0.8801638176638177, + "grad_norm": 0.5064153075218201, + "learning_rate": 0.0001771821412422039, + "loss": 0.9518, + "step": 4943 + }, + { + "epoch": 0.8803418803418803, + "grad_norm": 0.6561978459358215, + "learning_rate": 0.00017717324034220385, + "loss": 1.11, + "step": 4944 + }, + { + "epoch": 0.8805199430199431, + "grad_norm": 0.5551498532295227, + "learning_rate": 0.00017716433793015454, + "loss": 0.9719, + "step": 4945 + }, + { + "epoch": 0.8806980056980057, + "grad_norm": 0.47059500217437744, + "learning_rate": 0.00017715543400623025, + "loss": 0.8891, + "step": 4946 + }, + { + "epoch": 0.8808760683760684, + "grad_norm": 0.5035740733146667, + "learning_rate": 0.00017714652857060554, + "loss": 0.9671, + "step": 4947 + }, + { + "epoch": 0.8810541310541311, + "grad_norm": 0.4599960446357727, + "learning_rate": 0.00017713762162345487, + "loss": 0.9588, + "step": 4948 + }, + { + "epoch": 0.8812321937321937, + "grad_norm": 0.5087231397628784, + "learning_rate": 0.0001771287131649527, + "loss": 1.1433, + "step": 4949 + }, + { + "epoch": 0.8814102564102564, + "grad_norm": 0.5609854459762573, + "learning_rate": 0.00017711980319527366, + "loss": 1.2022, + "step": 4950 + }, + { + "epoch": 0.8815883190883191, + "grad_norm": 0.49460700154304504, + "learning_rate": 0.00017711089171459227, + "loss": 1.019, + "step": 4951 + }, + { + "epoch": 0.8817663817663818, + "grad_norm": 0.5047259330749512, + "learning_rate": 0.00017710197872308314, + "loss": 0.8301, + "step": 4952 + }, + { + "epoch": 0.8819444444444444, + "grad_norm": 0.5784406065940857, + 
"learning_rate": 0.0001770930642209209, + "loss": 0.9336, + "step": 4953 + }, + { + "epoch": 0.8821225071225072, + "grad_norm": 0.5037121772766113, + "learning_rate": 0.00017708414820828022, + "loss": 1.0199, + "step": 4954 + }, + { + "epoch": 0.8823005698005698, + "grad_norm": 0.5683804750442505, + "learning_rate": 0.00017707523068533575, + "loss": 0.9758, + "step": 4955 + }, + { + "epoch": 0.8824786324786325, + "grad_norm": 0.5167922973632812, + "learning_rate": 0.0001770663116522623, + "loss": 1.0389, + "step": 4956 + }, + { + "epoch": 0.8826566951566952, + "grad_norm": 0.5813606381416321, + "learning_rate": 0.0001770573911092345, + "loss": 1.3998, + "step": 4957 + }, + { + "epoch": 0.8828347578347578, + "grad_norm": 0.5280475616455078, + "learning_rate": 0.00017704846905642723, + "loss": 1.0545, + "step": 4958 + }, + { + "epoch": 0.8830128205128205, + "grad_norm": 0.5421732068061829, + "learning_rate": 0.00017703954549401528, + "loss": 0.899, + "step": 4959 + }, + { + "epoch": 0.8831908831908832, + "grad_norm": 0.5177720189094543, + "learning_rate": 0.00017703062042217344, + "loss": 0.975, + "step": 4960 + }, + { + "epoch": 0.8833689458689459, + "grad_norm": 0.639327883720398, + "learning_rate": 0.00017702169384107666, + "loss": 1.1936, + "step": 4961 + }, + { + "epoch": 0.8835470085470085, + "grad_norm": 0.5201572179794312, + "learning_rate": 0.00017701276575089975, + "loss": 0.9891, + "step": 4962 + }, + { + "epoch": 0.8837250712250713, + "grad_norm": 0.5304145216941833, + "learning_rate": 0.00017700383615181767, + "loss": 1.0569, + "step": 4963 + }, + { + "epoch": 0.8839031339031339, + "grad_norm": 0.6068132519721985, + "learning_rate": 0.00017699490504400538, + "loss": 1.2653, + "step": 4964 + }, + { + "epoch": 0.8840811965811965, + "grad_norm": 0.597895085811615, + "learning_rate": 0.00017698597242763787, + "loss": 1.2577, + "step": 4965 + }, + { + "epoch": 0.8842592592592593, + "grad_norm": 0.5356902480125427, + "learning_rate": 0.00017697703830289017, + 
"loss": 1.1056, + "step": 4966 + }, + { + "epoch": 0.8844373219373219, + "grad_norm": 0.5429540872573853, + "learning_rate": 0.0001769681026699373, + "loss": 1.0951, + "step": 4967 + }, + { + "epoch": 0.8846153846153846, + "grad_norm": 0.5789309144020081, + "learning_rate": 0.00017695916552895436, + "loss": 1.0786, + "step": 4968 + }, + { + "epoch": 0.8847934472934473, + "grad_norm": 0.5621341466903687, + "learning_rate": 0.0001769502268801164, + "loss": 1.0645, + "step": 4969 + }, + { + "epoch": 0.88497150997151, + "grad_norm": 0.5879453420639038, + "learning_rate": 0.00017694128672359865, + "loss": 1.2171, + "step": 4970 + }, + { + "epoch": 0.8851495726495726, + "grad_norm": 0.5005951523780823, + "learning_rate": 0.0001769323450595762, + "loss": 1.0725, + "step": 4971 + }, + { + "epoch": 0.8853276353276354, + "grad_norm": 0.5439660549163818, + "learning_rate": 0.00017692340188822425, + "loss": 1.162, + "step": 4972 + }, + { + "epoch": 0.885505698005698, + "grad_norm": 0.6309837698936462, + "learning_rate": 0.00017691445720971802, + "loss": 1.2861, + "step": 4973 + }, + { + "epoch": 0.8856837606837606, + "grad_norm": 0.4997463822364807, + "learning_rate": 0.00017690551102423282, + "loss": 1.1887, + "step": 4974 + }, + { + "epoch": 0.8858618233618234, + "grad_norm": 0.5430852174758911, + "learning_rate": 0.00017689656333194385, + "loss": 1.1231, + "step": 4975 + }, + { + "epoch": 0.886039886039886, + "grad_norm": 0.5414215922355652, + "learning_rate": 0.00017688761413302644, + "loss": 1.2345, + "step": 4976 + }, + { + "epoch": 0.8862179487179487, + "grad_norm": 0.5594443082809448, + "learning_rate": 0.00017687866342765601, + "loss": 1.0775, + "step": 4977 + }, + { + "epoch": 0.8863960113960114, + "grad_norm": 0.5827134847640991, + "learning_rate": 0.00017686971121600787, + "loss": 1.0609, + "step": 4978 + }, + { + "epoch": 0.8865740740740741, + "grad_norm": 0.5075414776802063, + "learning_rate": 0.00017686075749825738, + "loss": 0.796, + "step": 4979 + }, + { + 
"epoch": 0.8867521367521367, + "grad_norm": 0.6007544994354248, + "learning_rate": 0.00017685180227458003, + "loss": 1.1716, + "step": 4980 + }, + { + "epoch": 0.8869301994301995, + "grad_norm": 0.6458030343055725, + "learning_rate": 0.00017684284554515128, + "loss": 1.1945, + "step": 4981 + }, + { + "epoch": 0.8871082621082621, + "grad_norm": 0.5519212484359741, + "learning_rate": 0.00017683388731014657, + "loss": 1.2571, + "step": 4982 + }, + { + "epoch": 0.8872863247863247, + "grad_norm": 0.5079960227012634, + "learning_rate": 0.00017682492756974146, + "loss": 1.1186, + "step": 4983 + }, + { + "epoch": 0.8874643874643875, + "grad_norm": 0.63576740026474, + "learning_rate": 0.00017681596632411147, + "loss": 1.389, + "step": 4984 + }, + { + "epoch": 0.8876424501424501, + "grad_norm": 0.43325698375701904, + "learning_rate": 0.0001768070035734322, + "loss": 0.7757, + "step": 4985 + }, + { + "epoch": 0.8878205128205128, + "grad_norm": 0.49492064118385315, + "learning_rate": 0.00017679803931787923, + "loss": 1.0096, + "step": 4986 + }, + { + "epoch": 0.8879985754985755, + "grad_norm": 0.5561224222183228, + "learning_rate": 0.00017678907355762825, + "loss": 0.952, + "step": 4987 + }, + { + "epoch": 0.8881766381766382, + "grad_norm": 0.5392457246780396, + "learning_rate": 0.00017678010629285486, + "loss": 1.0442, + "step": 4988 + }, + { + "epoch": 0.8883547008547008, + "grad_norm": 0.4659234881401062, + "learning_rate": 0.00017677113752373482, + "loss": 0.8668, + "step": 4989 + }, + { + "epoch": 0.8885327635327636, + "grad_norm": 0.5139175057411194, + "learning_rate": 0.0001767621672504438, + "loss": 0.8386, + "step": 4990 + }, + { + "epoch": 0.8887108262108262, + "grad_norm": 0.5395823121070862, + "learning_rate": 0.00017675319547315755, + "loss": 0.9754, + "step": 4991 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.4751867949962616, + "learning_rate": 0.0001767442221920519, + "loss": 0.8775, + "step": 4992 + }, + { + "epoch": 0.8890669515669516, + 
"grad_norm": 0.5728281736373901, + "learning_rate": 0.00017673524740730265, + "loss": 1.2807, + "step": 4993 + }, + { + "epoch": 0.8892450142450142, + "grad_norm": 0.5545622110366821, + "learning_rate": 0.00017672627111908558, + "loss": 1.0039, + "step": 4994 + }, + { + "epoch": 0.8894230769230769, + "grad_norm": 0.5127374529838562, + "learning_rate": 0.00017671729332757665, + "loss": 1.0505, + "step": 4995 + }, + { + "epoch": 0.8896011396011396, + "grad_norm": 0.5238714218139648, + "learning_rate": 0.00017670831403295175, + "loss": 1.1775, + "step": 4996 + }, + { + "epoch": 0.8897792022792023, + "grad_norm": 0.5610160827636719, + "learning_rate": 0.00017669933323538674, + "loss": 1.0555, + "step": 4997 + }, + { + "epoch": 0.8899572649572649, + "grad_norm": 0.5481634736061096, + "learning_rate": 0.00017669035093505762, + "loss": 1.0802, + "step": 4998 + }, + { + "epoch": 0.8901353276353277, + "grad_norm": 0.4725174307823181, + "learning_rate": 0.0001766813671321404, + "loss": 0.9611, + "step": 4999 + }, + { + "epoch": 0.8903133903133903, + "grad_norm": 0.5184635519981384, + "learning_rate": 0.0001766723818268111, + "loss": 1.1659, + "step": 5000 + }, + { + "epoch": 0.8904914529914529, + "grad_norm": 0.5503578186035156, + "learning_rate": 0.00017666339501924575, + "loss": 1.2165, + "step": 5001 + }, + { + "epoch": 0.8906695156695157, + "grad_norm": 0.5299594402313232, + "learning_rate": 0.0001766544067096204, + "loss": 1.0196, + "step": 5002 + }, + { + "epoch": 0.8908475783475783, + "grad_norm": 0.5673944354057312, + "learning_rate": 0.00017664541689811118, + "loss": 1.2058, + "step": 5003 + }, + { + "epoch": 0.8910256410256411, + "grad_norm": 0.6057320833206177, + "learning_rate": 0.00017663642558489426, + "loss": 1.0136, + "step": 5004 + }, + { + "epoch": 0.8912037037037037, + "grad_norm": 0.4767026901245117, + "learning_rate": 0.00017662743277014578, + "loss": 0.8522, + "step": 5005 + }, + { + "epoch": 0.8913817663817664, + "grad_norm": 0.5346270203590393, + 
"learning_rate": 0.00017661843845404192, + "loss": 1.1568, + "step": 5006 + }, + { + "epoch": 0.8915598290598291, + "grad_norm": 0.5365738868713379, + "learning_rate": 0.00017660944263675891, + "loss": 1.0488, + "step": 5007 + }, + { + "epoch": 0.8917378917378918, + "grad_norm": 0.5536269545555115, + "learning_rate": 0.00017660044531847305, + "loss": 1.1216, + "step": 5008 + }, + { + "epoch": 0.8919159544159544, + "grad_norm": 0.6325978636741638, + "learning_rate": 0.00017659144649936055, + "loss": 1.2843, + "step": 5009 + }, + { + "epoch": 0.8920940170940171, + "grad_norm": 0.5890641212463379, + "learning_rate": 0.00017658244617959777, + "loss": 1.1976, + "step": 5010 + }, + { + "epoch": 0.8922720797720798, + "grad_norm": 0.604870080947876, + "learning_rate": 0.00017657344435936107, + "loss": 1.2881, + "step": 5011 + }, + { + "epoch": 0.8924501424501424, + "grad_norm": 0.49805206060409546, + "learning_rate": 0.00017656444103882676, + "loss": 0.8998, + "step": 5012 + }, + { + "epoch": 0.8926282051282052, + "grad_norm": 0.506926953792572, + "learning_rate": 0.0001765554362181713, + "loss": 1.0731, + "step": 5013 + }, + { + "epoch": 0.8928062678062678, + "grad_norm": 0.5353260636329651, + "learning_rate": 0.0001765464298975711, + "loss": 1.0676, + "step": 5014 + }, + { + "epoch": 0.8929843304843305, + "grad_norm": 0.5641853213310242, + "learning_rate": 0.0001765374220772026, + "loss": 0.9606, + "step": 5015 + }, + { + "epoch": 0.8931623931623932, + "grad_norm": 0.5049327611923218, + "learning_rate": 0.00017652841275724233, + "loss": 1.009, + "step": 5016 + }, + { + "epoch": 0.8933404558404558, + "grad_norm": 0.6255155205726624, + "learning_rate": 0.0001765194019378668, + "loss": 1.138, + "step": 5017 + }, + { + "epoch": 0.8935185185185185, + "grad_norm": 0.5816851854324341, + "learning_rate": 0.00017651038961925247, + "loss": 1.3398, + "step": 5018 + }, + { + "epoch": 0.8936965811965812, + "grad_norm": 0.5188020467758179, + "learning_rate": 0.00017650137580157605, + 
"loss": 1.0126, + "step": 5019 + }, + { + "epoch": 0.8938746438746439, + "grad_norm": 0.5231554508209229, + "learning_rate": 0.00017649236048501406, + "loss": 1.0328, + "step": 5020 + }, + { + "epoch": 0.8940527065527065, + "grad_norm": 0.7638634443283081, + "learning_rate": 0.0001764833436697432, + "loss": 1.3016, + "step": 5021 + }, + { + "epoch": 0.8942307692307693, + "grad_norm": 0.5354094505310059, + "learning_rate": 0.00017647432535594008, + "loss": 1.0646, + "step": 5022 + }, + { + "epoch": 0.8944088319088319, + "grad_norm": 0.6938086748123169, + "learning_rate": 0.0001764653055437814, + "loss": 1.2051, + "step": 5023 + }, + { + "epoch": 0.8945868945868946, + "grad_norm": 0.5546849370002747, + "learning_rate": 0.00017645628423344393, + "loss": 1.0671, + "step": 5024 + }, + { + "epoch": 0.8947649572649573, + "grad_norm": 0.49294665455818176, + "learning_rate": 0.0001764472614251044, + "loss": 1.0328, + "step": 5025 + }, + { + "epoch": 0.89494301994302, + "grad_norm": 0.5965796113014221, + "learning_rate": 0.00017643823711893956, + "loss": 1.0741, + "step": 5026 + }, + { + "epoch": 0.8951210826210826, + "grad_norm": 0.4846448302268982, + "learning_rate": 0.00017642921131512626, + "loss": 1.0409, + "step": 5027 + }, + { + "epoch": 0.8952991452991453, + "grad_norm": 0.5767390131950378, + "learning_rate": 0.00017642018401384135, + "loss": 1.018, + "step": 5028 + }, + { + "epoch": 0.895477207977208, + "grad_norm": 0.503027617931366, + "learning_rate": 0.00017641115521526167, + "loss": 1.0002, + "step": 5029 + }, + { + "epoch": 0.8956552706552706, + "grad_norm": 0.6668619513511658, + "learning_rate": 0.00017640212491956412, + "loss": 1.2154, + "step": 5030 + }, + { + "epoch": 0.8958333333333334, + "grad_norm": 0.5544148683547974, + "learning_rate": 0.00017639309312692566, + "loss": 1.2701, + "step": 5031 + }, + { + "epoch": 0.896011396011396, + "grad_norm": 0.6026872992515564, + "learning_rate": 0.00017638405983752323, + "loss": 0.9335, + "step": 5032 + }, + { + 
"epoch": 0.8961894586894587, + "grad_norm": 0.6288694143295288, + "learning_rate": 0.00017637502505153384, + "loss": 0.9075, + "step": 5033 + }, + { + "epoch": 0.8963675213675214, + "grad_norm": 0.4890204966068268, + "learning_rate": 0.00017636598876913446, + "loss": 0.8492, + "step": 5034 + }, + { + "epoch": 0.896545584045584, + "grad_norm": 0.5746598243713379, + "learning_rate": 0.00017635695099050218, + "loss": 1.1557, + "step": 5035 + }, + { + "epoch": 0.8967236467236467, + "grad_norm": 0.5165683031082153, + "learning_rate": 0.00017634791171581405, + "loss": 1.0899, + "step": 5036 + }, + { + "epoch": 0.8969017094017094, + "grad_norm": 0.4621037244796753, + "learning_rate": 0.0001763388709452472, + "loss": 1.0457, + "step": 5037 + }, + { + "epoch": 0.8970797720797721, + "grad_norm": 0.532358705997467, + "learning_rate": 0.00017632982867897876, + "loss": 1.139, + "step": 5038 + }, + { + "epoch": 0.8972578347578347, + "grad_norm": 0.5794399976730347, + "learning_rate": 0.00017632078491718587, + "loss": 1.031, + "step": 5039 + }, + { + "epoch": 0.8974358974358975, + "grad_norm": 0.5031905174255371, + "learning_rate": 0.00017631173966004576, + "loss": 0.9508, + "step": 5040 + }, + { + "epoch": 0.8976139601139601, + "grad_norm": 0.6528840065002441, + "learning_rate": 0.00017630269290773564, + "loss": 0.9974, + "step": 5041 + }, + { + "epoch": 0.8977920227920227, + "grad_norm": 0.6007558703422546, + "learning_rate": 0.00017629364466043273, + "loss": 1.0993, + "step": 5042 + }, + { + "epoch": 0.8979700854700855, + "grad_norm": 0.5104095339775085, + "learning_rate": 0.00017628459491831437, + "loss": 0.9175, + "step": 5043 + }, + { + "epoch": 0.8981481481481481, + "grad_norm": 0.5285516977310181, + "learning_rate": 0.00017627554368155782, + "loss": 0.998, + "step": 5044 + }, + { + "epoch": 0.8983262108262108, + "grad_norm": 0.5629046559333801, + "learning_rate": 0.00017626649095034045, + "loss": 1.2021, + "step": 5045 + }, + { + "epoch": 0.8985042735042735, + 
"grad_norm": 0.57548987865448, + "learning_rate": 0.00017625743672483962, + "loss": 1.2076, + "step": 5046 + }, + { + "epoch": 0.8986823361823362, + "grad_norm": 0.4883024990558624, + "learning_rate": 0.0001762483810052327, + "loss": 0.9761, + "step": 5047 + }, + { + "epoch": 0.8988603988603988, + "grad_norm": 0.6378034949302673, + "learning_rate": 0.0001762393237916972, + "loss": 1.2266, + "step": 5048 + }, + { + "epoch": 0.8990384615384616, + "grad_norm": 0.5201624035835266, + "learning_rate": 0.0001762302650844105, + "loss": 1.247, + "step": 5049 + }, + { + "epoch": 0.8992165242165242, + "grad_norm": 0.5438048243522644, + "learning_rate": 0.0001762212048835501, + "loss": 0.993, + "step": 5050 + }, + { + "epoch": 0.8993945868945868, + "grad_norm": 0.5928253531455994, + "learning_rate": 0.00017621214318929354, + "loss": 1.0469, + "step": 5051 + }, + { + "epoch": 0.8995726495726496, + "grad_norm": 0.6437996625900269, + "learning_rate": 0.00017620308000181831, + "loss": 1.3136, + "step": 5052 + }, + { + "epoch": 0.8997507122507122, + "grad_norm": 0.5961456298828125, + "learning_rate": 0.00017619401532130208, + "loss": 1.1495, + "step": 5053 + }, + { + "epoch": 0.8999287749287749, + "grad_norm": 0.497388631105423, + "learning_rate": 0.0001761849491479224, + "loss": 0.7783, + "step": 5054 + }, + { + "epoch": 0.9001068376068376, + "grad_norm": 0.5984451174736023, + "learning_rate": 0.00017617588148185687, + "loss": 1.3115, + "step": 5055 + }, + { + "epoch": 0.9002849002849003, + "grad_norm": 0.549163818359375, + "learning_rate": 0.0001761668123232832, + "loss": 1.1649, + "step": 5056 + }, + { + "epoch": 0.9004629629629629, + "grad_norm": 0.5831968188285828, + "learning_rate": 0.00017615774167237903, + "loss": 1.1749, + "step": 5057 + }, + { + "epoch": 0.9006410256410257, + "grad_norm": 0.5111076235771179, + "learning_rate": 0.00017614866952932214, + "loss": 0.8936, + "step": 5058 + }, + { + "epoch": 0.9008190883190883, + "grad_norm": 0.5740947723388672, + 
"learning_rate": 0.00017613959589429028, + "loss": 1.2606, + "step": 5059 + }, + { + "epoch": 0.9009971509971509, + "grad_norm": 0.5881099700927734, + "learning_rate": 0.0001761305207674612, + "loss": 1.3682, + "step": 5060 + }, + { + "epoch": 0.9011752136752137, + "grad_norm": 0.5007091760635376, + "learning_rate": 0.00017612144414901268, + "loss": 0.7788, + "step": 5061 + }, + { + "epoch": 0.9013532763532763, + "grad_norm": 0.5127760171890259, + "learning_rate": 0.00017611236603912262, + "loss": 1.0519, + "step": 5062 + }, + { + "epoch": 0.9015313390313391, + "grad_norm": 0.6185184121131897, + "learning_rate": 0.00017610328643796882, + "loss": 1.1672, + "step": 5063 + }, + { + "epoch": 0.9017094017094017, + "grad_norm": 0.49707287549972534, + "learning_rate": 0.00017609420534572926, + "loss": 1.1865, + "step": 5064 + }, + { + "epoch": 0.9018874643874644, + "grad_norm": 0.5667552351951599, + "learning_rate": 0.0001760851227625818, + "loss": 1.1388, + "step": 5065 + }, + { + "epoch": 0.9020655270655271, + "grad_norm": 0.50298011302948, + "learning_rate": 0.00017607603868870442, + "loss": 0.9552, + "step": 5066 + }, + { + "epoch": 0.9022435897435898, + "grad_norm": 0.5709219574928284, + "learning_rate": 0.0001760669531242751, + "loss": 1.2636, + "step": 5067 + }, + { + "epoch": 0.9024216524216524, + "grad_norm": 0.4943496286869049, + "learning_rate": 0.0001760578660694718, + "loss": 0.8951, + "step": 5068 + }, + { + "epoch": 0.9025997150997151, + "grad_norm": 0.5475931167602539, + "learning_rate": 0.00017604877752447267, + "loss": 1.1442, + "step": 5069 + }, + { + "epoch": 0.9027777777777778, + "grad_norm": 0.5280239582061768, + "learning_rate": 0.0001760396874894557, + "loss": 0.9537, + "step": 5070 + }, + { + "epoch": 0.9029558404558404, + "grad_norm": 0.5480797290802002, + "learning_rate": 0.000176030595964599, + "loss": 1.1557, + "step": 5071 + }, + { + "epoch": 0.9031339031339032, + "grad_norm": 0.5232734680175781, + "learning_rate": 0.00017602150295008073, + 
"loss": 1.0219, + "step": 5072 + }, + { + "epoch": 0.9033119658119658, + "grad_norm": 0.5448359251022339, + "learning_rate": 0.000176012408446079, + "loss": 1.1964, + "step": 5073 + }, + { + "epoch": 0.9034900284900285, + "grad_norm": 0.4841914474964142, + "learning_rate": 0.00017600331245277206, + "loss": 1.0667, + "step": 5074 + }, + { + "epoch": 0.9036680911680912, + "grad_norm": 0.5407083630561829, + "learning_rate": 0.0001759942149703381, + "loss": 1.1895, + "step": 5075 + }, + { + "epoch": 0.9038461538461539, + "grad_norm": 0.5140416026115417, + "learning_rate": 0.00017598511599895534, + "loss": 0.9402, + "step": 5076 + }, + { + "epoch": 0.9040242165242165, + "grad_norm": 0.6333765983581543, + "learning_rate": 0.00017597601553880207, + "loss": 1.239, + "step": 5077 + }, + { + "epoch": 0.9042022792022792, + "grad_norm": 0.4996028244495392, + "learning_rate": 0.00017596691359005664, + "loss": 1.0259, + "step": 5078 + }, + { + "epoch": 0.9043803418803419, + "grad_norm": 0.591892421245575, + "learning_rate": 0.00017595781015289732, + "loss": 1.2148, + "step": 5079 + }, + { + "epoch": 0.9045584045584045, + "grad_norm": 0.736499011516571, + "learning_rate": 0.0001759487052275025, + "loss": 1.1373, + "step": 5080 + }, + { + "epoch": 0.9047364672364673, + "grad_norm": 0.5951572060585022, + "learning_rate": 0.00017593959881405057, + "loss": 1.1833, + "step": 5081 + }, + { + "epoch": 0.9049145299145299, + "grad_norm": 0.5092006325721741, + "learning_rate": 0.00017593049091271996, + "loss": 0.8841, + "step": 5082 + }, + { + "epoch": 0.9050925925925926, + "grad_norm": 0.5679013729095459, + "learning_rate": 0.0001759213815236891, + "loss": 1.1056, + "step": 5083 + }, + { + "epoch": 0.9052706552706553, + "grad_norm": 0.5708174109458923, + "learning_rate": 0.0001759122706471365, + "loss": 1.1952, + "step": 5084 + }, + { + "epoch": 0.905448717948718, + "grad_norm": 0.5726733803749084, + "learning_rate": 0.00017590315828324067, + "loss": 1.1013, + "step": 5085 + }, + { + 
"epoch": 0.9056267806267806, + "grad_norm": 0.5821273326873779, + "learning_rate": 0.00017589404443218008, + "loss": 1.2323, + "step": 5086 + }, + { + "epoch": 0.9058048433048433, + "grad_norm": 0.5811445713043213, + "learning_rate": 0.00017588492909413337, + "loss": 1.2241, + "step": 5087 + }, + { + "epoch": 0.905982905982906, + "grad_norm": 0.5377545952796936, + "learning_rate": 0.0001758758122692791, + "loss": 0.9777, + "step": 5088 + }, + { + "epoch": 0.9061609686609686, + "grad_norm": 0.5985640287399292, + "learning_rate": 0.0001758666939577959, + "loss": 0.9737, + "step": 5089 + }, + { + "epoch": 0.9063390313390314, + "grad_norm": 0.6038222908973694, + "learning_rate": 0.00017585757415986247, + "loss": 1.2116, + "step": 5090 + }, + { + "epoch": 0.906517094017094, + "grad_norm": 0.6752246022224426, + "learning_rate": 0.00017584845287565743, + "loss": 1.1975, + "step": 5091 + }, + { + "epoch": 0.9066951566951567, + "grad_norm": 0.5400625467300415, + "learning_rate": 0.0001758393301053595, + "loss": 0.9669, + "step": 5092 + }, + { + "epoch": 0.9068732193732194, + "grad_norm": 0.5637784004211426, + "learning_rate": 0.00017583020584914746, + "loss": 1.2672, + "step": 5093 + }, + { + "epoch": 0.907051282051282, + "grad_norm": 0.4825877249240875, + "learning_rate": 0.00017582108010720006, + "loss": 0.9719, + "step": 5094 + }, + { + "epoch": 0.9072293447293447, + "grad_norm": 0.49902790784835815, + "learning_rate": 0.00017581195287969613, + "loss": 0.7941, + "step": 5095 + }, + { + "epoch": 0.9074074074074074, + "grad_norm": 0.5991541743278503, + "learning_rate": 0.0001758028241668144, + "loss": 1.049, + "step": 5096 + }, + { + "epoch": 0.9075854700854701, + "grad_norm": 0.5788859724998474, + "learning_rate": 0.00017579369396873384, + "loss": 1.0318, + "step": 5097 + }, + { + "epoch": 0.9077635327635327, + "grad_norm": 0.5914160013198853, + "learning_rate": 0.0001757845622856333, + "loss": 1.1007, + "step": 5098 + }, + { + "epoch": 0.9079415954415955, + "grad_norm": 
0.5361711382865906, + "learning_rate": 0.00017577542911769166, + "loss": 1.0694, + "step": 5099 + }, + { + "epoch": 0.9081196581196581, + "grad_norm": 0.5752849578857422, + "learning_rate": 0.00017576629446508792, + "loss": 1.1184, + "step": 5100 + }, + { + "epoch": 0.9082977207977208, + "grad_norm": 0.6042249798774719, + "learning_rate": 0.000175757158328001, + "loss": 1.2808, + "step": 5101 + }, + { + "epoch": 0.9084757834757835, + "grad_norm": 0.508352518081665, + "learning_rate": 0.00017574802070661, + "loss": 1.0038, + "step": 5102 + }, + { + "epoch": 0.9086538461538461, + "grad_norm": 0.5667358040809631, + "learning_rate": 0.00017573888160109385, + "loss": 1.0208, + "step": 5103 + }, + { + "epoch": 0.9088319088319088, + "grad_norm": 0.653619647026062, + "learning_rate": 0.00017572974101163165, + "loss": 1.2053, + "step": 5104 + }, + { + "epoch": 0.9090099715099715, + "grad_norm": 0.5069597363471985, + "learning_rate": 0.00017572059893840246, + "loss": 0.8634, + "step": 5105 + }, + { + "epoch": 0.9091880341880342, + "grad_norm": 0.6160602569580078, + "learning_rate": 0.00017571145538158547, + "loss": 1.2626, + "step": 5106 + }, + { + "epoch": 0.9093660968660968, + "grad_norm": 0.6335833668708801, + "learning_rate": 0.00017570231034135978, + "loss": 1.3381, + "step": 5107 + }, + { + "epoch": 0.9095441595441596, + "grad_norm": 0.5140398740768433, + "learning_rate": 0.00017569316381790454, + "loss": 1.1258, + "step": 5108 + }, + { + "epoch": 0.9097222222222222, + "grad_norm": 0.5682975649833679, + "learning_rate": 0.00017568401581139905, + "loss": 1.3367, + "step": 5109 + }, + { + "epoch": 0.9099002849002849, + "grad_norm": 0.49765729904174805, + "learning_rate": 0.00017567486632202246, + "loss": 1.1891, + "step": 5110 + }, + { + "epoch": 0.9100783475783476, + "grad_norm": 0.5139224529266357, + "learning_rate": 0.00017566571534995406, + "loss": 0.9768, + "step": 5111 + }, + { + "epoch": 0.9102564102564102, + "grad_norm": 0.5510922074317932, + "learning_rate": 
0.00017565656289537316, + "loss": 1.1552, + "step": 5112 + }, + { + "epoch": 0.9104344729344729, + "grad_norm": 0.6243364810943604, + "learning_rate": 0.00017564740895845908, + "loss": 1.1341, + "step": 5113 + }, + { + "epoch": 0.9106125356125356, + "grad_norm": 0.5334977507591248, + "learning_rate": 0.00017563825353939116, + "loss": 1.0894, + "step": 5114 + }, + { + "epoch": 0.9107905982905983, + "grad_norm": 0.5195826292037964, + "learning_rate": 0.00017562909663834878, + "loss": 1.1011, + "step": 5115 + }, + { + "epoch": 0.9109686609686609, + "grad_norm": 0.5298168063163757, + "learning_rate": 0.00017561993825551138, + "loss": 1.0079, + "step": 5116 + }, + { + "epoch": 0.9111467236467237, + "grad_norm": 0.5858965516090393, + "learning_rate": 0.00017561077839105835, + "loss": 1.2746, + "step": 5117 + }, + { + "epoch": 0.9113247863247863, + "grad_norm": 0.5572476387023926, + "learning_rate": 0.0001756016170451692, + "loss": 0.8169, + "step": 5118 + }, + { + "epoch": 0.9115028490028491, + "grad_norm": 0.5247095823287964, + "learning_rate": 0.0001755924542180234, + "loss": 1.1206, + "step": 5119 + }, + { + "epoch": 0.9116809116809117, + "grad_norm": 0.5605118274688721, + "learning_rate": 0.0001755832899098005, + "loss": 1.371, + "step": 5120 + }, + { + "epoch": 0.9118589743589743, + "grad_norm": 0.5732316970825195, + "learning_rate": 0.00017557412412068005, + "loss": 1.1248, + "step": 5121 + }, + { + "epoch": 0.9120370370370371, + "grad_norm": 0.6167279481887817, + "learning_rate": 0.0001755649568508416, + "loss": 0.94, + "step": 5122 + }, + { + "epoch": 0.9122150997150997, + "grad_norm": 0.5497499108314514, + "learning_rate": 0.00017555578810046483, + "loss": 1.0112, + "step": 5123 + }, + { + "epoch": 0.9123931623931624, + "grad_norm": 0.540762186050415, + "learning_rate": 0.00017554661786972931, + "loss": 1.1058, + "step": 5124 + }, + { + "epoch": 0.9125712250712251, + "grad_norm": 0.5943556427955627, + "learning_rate": 0.0001755374461588148, + "loss": 0.9086, + 
"step": 5125 + }, + { + "epoch": 0.9127492877492878, + "grad_norm": 0.5300756692886353, + "learning_rate": 0.0001755282729679009, + "loss": 1.1566, + "step": 5126 + }, + { + "epoch": 0.9129273504273504, + "grad_norm": 0.5390434861183167, + "learning_rate": 0.00017551909829716743, + "loss": 1.1395, + "step": 5127 + }, + { + "epoch": 0.9131054131054132, + "grad_norm": 0.627434492111206, + "learning_rate": 0.00017550992214679405, + "loss": 1.1537, + "step": 5128 + }, + { + "epoch": 0.9132834757834758, + "grad_norm": 0.4806903302669525, + "learning_rate": 0.00017550074451696063, + "loss": 0.7905, + "step": 5129 + }, + { + "epoch": 0.9134615384615384, + "grad_norm": 0.5714817047119141, + "learning_rate": 0.00017549156540784696, + "loss": 1.1042, + "step": 5130 + }, + { + "epoch": 0.9136396011396012, + "grad_norm": 0.5839236378669739, + "learning_rate": 0.0001754823848196329, + "loss": 1.0383, + "step": 5131 + }, + { + "epoch": 0.9138176638176638, + "grad_norm": 0.6089872717857361, + "learning_rate": 0.0001754732027524983, + "loss": 0.9399, + "step": 5132 + }, + { + "epoch": 0.9139957264957265, + "grad_norm": 0.4937956631183624, + "learning_rate": 0.00017546401920662307, + "loss": 0.7382, + "step": 5133 + }, + { + "epoch": 0.9141737891737892, + "grad_norm": 0.5918676257133484, + "learning_rate": 0.00017545483418218716, + "loss": 1.2207, + "step": 5134 + }, + { + "epoch": 0.9143518518518519, + "grad_norm": 0.5825346112251282, + "learning_rate": 0.0001754456476793705, + "loss": 0.9669, + "step": 5135 + }, + { + "epoch": 0.9145299145299145, + "grad_norm": 0.49829617142677307, + "learning_rate": 0.0001754364596983531, + "loss": 1.2247, + "step": 5136 + }, + { + "epoch": 0.9147079772079773, + "grad_norm": 0.5128271579742432, + "learning_rate": 0.00017542727023931497, + "loss": 0.9563, + "step": 5137 + }, + { + "epoch": 0.9148860398860399, + "grad_norm": 0.5789414644241333, + "learning_rate": 0.00017541807930243622, + "loss": 1.22, + "step": 5138 + }, + { + "epoch": 
0.9150641025641025, + "grad_norm": 0.44155433773994446, + "learning_rate": 0.00017540888688789683, + "loss": 0.9897, + "step": 5139 + }, + { + "epoch": 0.9152421652421653, + "grad_norm": 0.550464391708374, + "learning_rate": 0.00017539969299587696, + "loss": 1.0624, + "step": 5140 + }, + { + "epoch": 0.9154202279202279, + "grad_norm": 0.5019831657409668, + "learning_rate": 0.0001753904976265567, + "loss": 0.9045, + "step": 5141 + }, + { + "epoch": 0.9155982905982906, + "grad_norm": 0.589658796787262, + "learning_rate": 0.0001753813007801163, + "loss": 1.0454, + "step": 5142 + }, + { + "epoch": 0.9157763532763533, + "grad_norm": 0.5945459008216858, + "learning_rate": 0.00017537210245673586, + "loss": 1.0042, + "step": 5143 + }, + { + "epoch": 0.915954415954416, + "grad_norm": 0.5409809947013855, + "learning_rate": 0.00017536290265659566, + "loss": 1.0609, + "step": 5144 + }, + { + "epoch": 0.9161324786324786, + "grad_norm": 0.5302975177764893, + "learning_rate": 0.00017535370137987597, + "loss": 1.1394, + "step": 5145 + }, + { + "epoch": 0.9163105413105413, + "grad_norm": 0.5253351330757141, + "learning_rate": 0.00017534449862675698, + "loss": 1.2249, + "step": 5146 + }, + { + "epoch": 0.916488603988604, + "grad_norm": 0.6363829970359802, + "learning_rate": 0.00017533529439741908, + "loss": 1.1333, + "step": 5147 + }, + { + "epoch": 0.9166666666666666, + "grad_norm": 0.4703354835510254, + "learning_rate": 0.0001753260886920426, + "loss": 0.9971, + "step": 5148 + }, + { + "epoch": 0.9168447293447294, + "grad_norm": 0.6394907236099243, + "learning_rate": 0.00017531688151080786, + "loss": 1.5942, + "step": 5149 + }, + { + "epoch": 0.917022792022792, + "grad_norm": 0.5573459267616272, + "learning_rate": 0.00017530767285389527, + "loss": 0.9669, + "step": 5150 + }, + { + "epoch": 0.9172008547008547, + "grad_norm": 0.5000962615013123, + "learning_rate": 0.00017529846272148532, + "loss": 1.2151, + "step": 5151 + }, + { + "epoch": 0.9173789173789174, + "grad_norm": 
0.5550395846366882, + "learning_rate": 0.0001752892511137584, + "loss": 1.1765, + "step": 5152 + }, + { + "epoch": 0.91755698005698, + "grad_norm": 0.5461394786834717, + "learning_rate": 0.00017528003803089496, + "loss": 1.1136, + "step": 5153 + }, + { + "epoch": 0.9177350427350427, + "grad_norm": 0.5512672662734985, + "learning_rate": 0.00017527082347307558, + "loss": 1.1727, + "step": 5154 + }, + { + "epoch": 0.9179131054131054, + "grad_norm": 0.5210778713226318, + "learning_rate": 0.0001752616074404808, + "loss": 1.09, + "step": 5155 + }, + { + "epoch": 0.9180911680911681, + "grad_norm": 0.5214943289756775, + "learning_rate": 0.00017525238993329115, + "loss": 0.9654, + "step": 5156 + }, + { + "epoch": 0.9182692307692307, + "grad_norm": 0.5822862386703491, + "learning_rate": 0.00017524317095168724, + "loss": 1.0951, + "step": 5157 + }, + { + "epoch": 0.9184472934472935, + "grad_norm": 0.43948012590408325, + "learning_rate": 0.0001752339504958497, + "loss": 0.6984, + "step": 5158 + }, + { + "epoch": 0.9186253561253561, + "grad_norm": 0.5024449229240417, + "learning_rate": 0.00017522472856595916, + "loss": 0.983, + "step": 5159 + }, + { + "epoch": 0.9188034188034188, + "grad_norm": 0.5815144181251526, + "learning_rate": 0.00017521550516219636, + "loss": 0.9784, + "step": 5160 + }, + { + "epoch": 0.9189814814814815, + "grad_norm": 0.5519825220108032, + "learning_rate": 0.00017520628028474197, + "loss": 1.064, + "step": 5161 + }, + { + "epoch": 0.9191595441595442, + "grad_norm": 0.5615749955177307, + "learning_rate": 0.00017519705393377675, + "loss": 1.1284, + "step": 5162 + }, + { + "epoch": 0.9193376068376068, + "grad_norm": 0.5929917693138123, + "learning_rate": 0.00017518782610948148, + "loss": 1.1221, + "step": 5163 + }, + { + "epoch": 0.9195156695156695, + "grad_norm": 0.7116361856460571, + "learning_rate": 0.00017517859681203692, + "loss": 1.0188, + "step": 5164 + }, + { + "epoch": 0.9196937321937322, + "grad_norm": 0.5095893740653992, + "learning_rate": 
0.00017516936604162396, + "loss": 1.0724, + "step": 5165 + }, + { + "epoch": 0.9198717948717948, + "grad_norm": 0.5701385736465454, + "learning_rate": 0.00017516013379842337, + "loss": 1.0572, + "step": 5166 + }, + { + "epoch": 0.9200498575498576, + "grad_norm": 0.518412709236145, + "learning_rate": 0.00017515090008261613, + "loss": 1.0514, + "step": 5167 + }, + { + "epoch": 0.9202279202279202, + "grad_norm": 0.5324261784553528, + "learning_rate": 0.00017514166489438312, + "loss": 1.1708, + "step": 5168 + }, + { + "epoch": 0.9204059829059829, + "grad_norm": 0.5640990138053894, + "learning_rate": 0.00017513242823390525, + "loss": 1.2846, + "step": 5169 + }, + { + "epoch": 0.9205840455840456, + "grad_norm": 0.510352373123169, + "learning_rate": 0.00017512319010136356, + "loss": 1.0763, + "step": 5170 + }, + { + "epoch": 0.9207621082621082, + "grad_norm": 0.4994175136089325, + "learning_rate": 0.00017511395049693898, + "loss": 0.9665, + "step": 5171 + }, + { + "epoch": 0.9209401709401709, + "grad_norm": 0.43196994066238403, + "learning_rate": 0.00017510470942081258, + "loss": 0.761, + "step": 5172 + }, + { + "epoch": 0.9211182336182336, + "grad_norm": 0.558977484703064, + "learning_rate": 0.00017509546687316543, + "loss": 1.0758, + "step": 5173 + }, + { + "epoch": 0.9212962962962963, + "grad_norm": 0.573302149772644, + "learning_rate": 0.0001750862228541786, + "loss": 0.9635, + "step": 5174 + }, + { + "epoch": 0.9214743589743589, + "grad_norm": 0.5083786845207214, + "learning_rate": 0.00017507697736403321, + "loss": 1.0311, + "step": 5175 + }, + { + "epoch": 0.9216524216524217, + "grad_norm": 0.5478954911231995, + "learning_rate": 0.00017506773040291043, + "loss": 1.074, + "step": 5176 + }, + { + "epoch": 0.9218304843304843, + "grad_norm": 0.522376537322998, + "learning_rate": 0.00017505848197099137, + "loss": 1.1162, + "step": 5177 + }, + { + "epoch": 0.9220085470085471, + "grad_norm": 0.5946292281150818, + "learning_rate": 0.0001750492320684573, + "loss": 0.9494, + 
"step": 5178 + }, + { + "epoch": 0.9221866096866097, + "grad_norm": 0.5423247814178467, + "learning_rate": 0.00017503998069548943, + "loss": 1.0558, + "step": 5179 + }, + { + "epoch": 0.9223646723646723, + "grad_norm": 0.49960651993751526, + "learning_rate": 0.000175030727852269, + "loss": 1.0748, + "step": 5180 + }, + { + "epoch": 0.9225427350427351, + "grad_norm": 0.6066586375236511, + "learning_rate": 0.00017502147353897732, + "loss": 1.2066, + "step": 5181 + }, + { + "epoch": 0.9227207977207977, + "grad_norm": 0.57244473695755, + "learning_rate": 0.00017501221775579576, + "loss": 1.048, + "step": 5182 + }, + { + "epoch": 0.9228988603988604, + "grad_norm": 0.512464165687561, + "learning_rate": 0.00017500296050290557, + "loss": 1.1405, + "step": 5183 + }, + { + "epoch": 0.9230769230769231, + "grad_norm": 0.5380734801292419, + "learning_rate": 0.00017499370178048818, + "loss": 1.0641, + "step": 5184 + }, + { + "epoch": 0.9232549857549858, + "grad_norm": 0.47102874517440796, + "learning_rate": 0.000174984441588725, + "loss": 0.7948, + "step": 5185 + }, + { + "epoch": 0.9234330484330484, + "grad_norm": 0.6702211499214172, + "learning_rate": 0.00017497517992779747, + "loss": 1.3009, + "step": 5186 + }, + { + "epoch": 0.9236111111111112, + "grad_norm": 0.4685834050178528, + "learning_rate": 0.000174965916797887, + "loss": 0.8136, + "step": 5187 + }, + { + "epoch": 0.9237891737891738, + "grad_norm": 0.5414277911186218, + "learning_rate": 0.00017495665219917513, + "loss": 0.9708, + "step": 5188 + }, + { + "epoch": 0.9239672364672364, + "grad_norm": 0.5253050923347473, + "learning_rate": 0.0001749473861318434, + "loss": 1.0691, + "step": 5189 + }, + { + "epoch": 0.9241452991452992, + "grad_norm": 0.6009906530380249, + "learning_rate": 0.00017493811859607328, + "loss": 1.2023, + "step": 5190 + }, + { + "epoch": 0.9243233618233618, + "grad_norm": 0.5519336462020874, + "learning_rate": 0.00017492884959204643, + "loss": 1.189, + "step": 5191 + }, + { + "epoch": 
0.9245014245014245, + "grad_norm": 0.5024857521057129, + "learning_rate": 0.0001749195791199444, + "loss": 0.8685, + "step": 5192 + }, + { + "epoch": 0.9246794871794872, + "grad_norm": 0.5735679864883423, + "learning_rate": 0.00017491030717994887, + "loss": 1.1903, + "step": 5193 + }, + { + "epoch": 0.9248575498575499, + "grad_norm": 0.5338658094406128, + "learning_rate": 0.00017490103377224147, + "loss": 1.0442, + "step": 5194 + }, + { + "epoch": 0.9250356125356125, + "grad_norm": 0.46669119596481323, + "learning_rate": 0.0001748917588970039, + "loss": 0.6343, + "step": 5195 + }, + { + "epoch": 0.9252136752136753, + "grad_norm": 0.510910153388977, + "learning_rate": 0.00017488248255441793, + "loss": 0.9334, + "step": 5196 + }, + { + "epoch": 0.9253917378917379, + "grad_norm": 0.5732216238975525, + "learning_rate": 0.00017487320474466524, + "loss": 1.0483, + "step": 5197 + }, + { + "epoch": 0.9255698005698005, + "grad_norm": 0.5864318609237671, + "learning_rate": 0.00017486392546792762, + "loss": 1.0669, + "step": 5198 + }, + { + "epoch": 0.9257478632478633, + "grad_norm": 0.5074281096458435, + "learning_rate": 0.00017485464472438692, + "loss": 1.0636, + "step": 5199 + }, + { + "epoch": 0.9259259259259259, + "grad_norm": 0.5833215117454529, + "learning_rate": 0.00017484536251422496, + "loss": 1.2005, + "step": 5200 + }, + { + "epoch": 0.9261039886039886, + "grad_norm": 0.5624990463256836, + "learning_rate": 0.0001748360788376236, + "loss": 1.1623, + "step": 5201 + }, + { + "epoch": 0.9262820512820513, + "grad_norm": 0.5618230104446411, + "learning_rate": 0.00017482679369476472, + "loss": 1.0495, + "step": 5202 + }, + { + "epoch": 0.926460113960114, + "grad_norm": 0.6254985332489014, + "learning_rate": 0.00017481750708583024, + "loss": 0.9521, + "step": 5203 + }, + { + "epoch": 0.9266381766381766, + "grad_norm": 0.5488203763961792, + "learning_rate": 0.00017480821901100216, + "loss": 1.0689, + "step": 5204 + }, + { + "epoch": 0.9268162393162394, + "grad_norm": 
0.6157993674278259, + "learning_rate": 0.00017479892947046245, + "loss": 1.2852, + "step": 5205 + }, + { + "epoch": 0.926994301994302, + "grad_norm": 0.49653390049934387, + "learning_rate": 0.00017478963846439305, + "loss": 0.8616, + "step": 5206 + }, + { + "epoch": 0.9271723646723646, + "grad_norm": 0.5079081058502197, + "learning_rate": 0.00017478034599297603, + "loss": 1.0192, + "step": 5207 + }, + { + "epoch": 0.9273504273504274, + "grad_norm": 0.5392495393753052, + "learning_rate": 0.00017477105205639354, + "loss": 1.115, + "step": 5208 + }, + { + "epoch": 0.92752849002849, + "grad_norm": 0.5336191654205322, + "learning_rate": 0.00017476175665482756, + "loss": 1.1892, + "step": 5209 + }, + { + "epoch": 0.9277065527065527, + "grad_norm": 0.631712019443512, + "learning_rate": 0.00017475245978846026, + "loss": 0.9619, + "step": 5210 + }, + { + "epoch": 0.9278846153846154, + "grad_norm": 0.5123951435089111, + "learning_rate": 0.0001747431614574738, + "loss": 1.1477, + "step": 5211 + }, + { + "epoch": 0.9280626780626781, + "grad_norm": 0.5045743584632874, + "learning_rate": 0.00017473386166205038, + "loss": 0.9749, + "step": 5212 + }, + { + "epoch": 0.9282407407407407, + "grad_norm": 0.5296525359153748, + "learning_rate": 0.00017472456040237217, + "loss": 1.0736, + "step": 5213 + }, + { + "epoch": 0.9284188034188035, + "grad_norm": 0.6304933428764343, + "learning_rate": 0.00017471525767862145, + "loss": 1.2444, + "step": 5214 + }, + { + "epoch": 0.9285968660968661, + "grad_norm": 0.4851958155632019, + "learning_rate": 0.00017470595349098044, + "loss": 0.9049, + "step": 5215 + }, + { + "epoch": 0.9287749287749287, + "grad_norm": 0.5730679631233215, + "learning_rate": 0.00017469664783963148, + "loss": 1.0773, + "step": 5216 + }, + { + "epoch": 0.9289529914529915, + "grad_norm": 0.6020415425300598, + "learning_rate": 0.00017468734072475684, + "loss": 1.3247, + "step": 5217 + }, + { + "epoch": 0.9291310541310541, + "grad_norm": 0.47981077432632446, + "learning_rate": 
0.00017467803214653893, + "loss": 1.0009, + "step": 5218 + }, + { + "epoch": 0.9293091168091168, + "grad_norm": 0.5787527561187744, + "learning_rate": 0.0001746687221051601, + "loss": 1.2523, + "step": 5219 + }, + { + "epoch": 0.9294871794871795, + "grad_norm": 0.4495891332626343, + "learning_rate": 0.00017465941060080278, + "loss": 0.7364, + "step": 5220 + }, + { + "epoch": 0.9296652421652422, + "grad_norm": 0.5721768140792847, + "learning_rate": 0.0001746500976336494, + "loss": 1.015, + "step": 5221 + }, + { + "epoch": 0.9298433048433048, + "grad_norm": 0.5500208735466003, + "learning_rate": 0.0001746407832038824, + "loss": 1.053, + "step": 5222 + }, + { + "epoch": 0.9300213675213675, + "grad_norm": 0.5784386992454529, + "learning_rate": 0.00017463146731168437, + "loss": 0.9784, + "step": 5223 + }, + { + "epoch": 0.9301994301994302, + "grad_norm": 0.4960322082042694, + "learning_rate": 0.00017462214995723772, + "loss": 0.8674, + "step": 5224 + }, + { + "epoch": 0.9303774928774928, + "grad_norm": 0.5005537271499634, + "learning_rate": 0.00017461283114072508, + "loss": 1.0486, + "step": 5225 + }, + { + "epoch": 0.9305555555555556, + "grad_norm": 0.5064167380332947, + "learning_rate": 0.000174603510862329, + "loss": 0.9722, + "step": 5226 + }, + { + "epoch": 0.9307336182336182, + "grad_norm": 0.583558976650238, + "learning_rate": 0.0001745941891222321, + "loss": 0.9957, + "step": 5227 + }, + { + "epoch": 0.9309116809116809, + "grad_norm": 0.4982515871524811, + "learning_rate": 0.00017458486592061704, + "loss": 0.958, + "step": 5228 + }, + { + "epoch": 0.9310897435897436, + "grad_norm": 0.526549756526947, + "learning_rate": 0.0001745755412576664, + "loss": 1.1172, + "step": 5229 + }, + { + "epoch": 0.9312678062678063, + "grad_norm": 0.6129719018936157, + "learning_rate": 0.000174566215133563, + "loss": 1.2524, + "step": 5230 + }, + { + "epoch": 0.9314458689458689, + "grad_norm": 0.5385653972625732, + "learning_rate": 0.00017455688754848948, + "loss": 1.1655, + 
"step": 5231 + }, + { + "epoch": 0.9316239316239316, + "grad_norm": 0.5646410584449768, + "learning_rate": 0.0001745475585026287, + "loss": 0.9026, + "step": 5232 + }, + { + "epoch": 0.9318019943019943, + "grad_norm": 0.549223780632019, + "learning_rate": 0.0001745382279961633, + "loss": 0.804, + "step": 5233 + }, + { + "epoch": 0.9319800569800569, + "grad_norm": 0.48547953367233276, + "learning_rate": 0.0001745288960292762, + "loss": 1.0224, + "step": 5234 + }, + { + "epoch": 0.9321581196581197, + "grad_norm": 0.5260967016220093, + "learning_rate": 0.00017451956260215016, + "loss": 0.9688, + "step": 5235 + }, + { + "epoch": 0.9323361823361823, + "grad_norm": 0.6261999011039734, + "learning_rate": 0.00017451022771496812, + "loss": 1.2539, + "step": 5236 + }, + { + "epoch": 0.9325142450142451, + "grad_norm": 0.5801421999931335, + "learning_rate": 0.00017450089136791298, + "loss": 1.11, + "step": 5237 + }, + { + "epoch": 0.9326923076923077, + "grad_norm": 0.5833573937416077, + "learning_rate": 0.0001744915535611676, + "loss": 0.9328, + "step": 5238 + }, + { + "epoch": 0.9328703703703703, + "grad_norm": 0.5422634482383728, + "learning_rate": 0.00017448221429491496, + "loss": 1.034, + "step": 5239 + }, + { + "epoch": 0.9330484330484331, + "grad_norm": 0.5105658769607544, + "learning_rate": 0.00017447287356933808, + "loss": 0.8924, + "step": 5240 + }, + { + "epoch": 0.9332264957264957, + "grad_norm": 0.5114831924438477, + "learning_rate": 0.00017446353138461995, + "loss": 0.9328, + "step": 5241 + }, + { + "epoch": 0.9334045584045584, + "grad_norm": 0.5105039477348328, + "learning_rate": 0.00017445418774094358, + "loss": 1.0468, + "step": 5242 + }, + { + "epoch": 0.9335826210826211, + "grad_norm": 0.593250036239624, + "learning_rate": 0.00017444484263849208, + "loss": 1.0603, + "step": 5243 + }, + { + "epoch": 0.9337606837606838, + "grad_norm": 0.600788414478302, + "learning_rate": 0.00017443549607744853, + "loss": 1.1506, + "step": 5244 + }, + { + "epoch": 
0.9339387464387464, + "grad_norm": 0.5394418239593506, + "learning_rate": 0.00017442614805799605, + "loss": 1.038, + "step": 5245 + }, + { + "epoch": 0.9341168091168092, + "grad_norm": 0.5446375608444214, + "learning_rate": 0.00017441679858031786, + "loss": 1.079, + "step": 5246 + }, + { + "epoch": 0.9342948717948718, + "grad_norm": 0.5859794616699219, + "learning_rate": 0.00017440744764459702, + "loss": 1.1453, + "step": 5247 + }, + { + "epoch": 0.9344729344729344, + "grad_norm": 0.4899081289768219, + "learning_rate": 0.00017439809525101688, + "loss": 1.163, + "step": 5248 + }, + { + "epoch": 0.9346509971509972, + "grad_norm": 0.652846097946167, + "learning_rate": 0.00017438874139976055, + "loss": 1.1819, + "step": 5249 + }, + { + "epoch": 0.9348290598290598, + "grad_norm": 0.5402514934539795, + "learning_rate": 0.00017437938609101138, + "loss": 1.0159, + "step": 5250 + }, + { + "epoch": 0.9350071225071225, + "grad_norm": 0.565864086151123, + "learning_rate": 0.00017437002932495265, + "loss": 1.1121, + "step": 5251 + }, + { + "epoch": 0.9351851851851852, + "grad_norm": 0.611786425113678, + "learning_rate": 0.0001743606711017677, + "loss": 1.2511, + "step": 5252 + }, + { + "epoch": 0.9353632478632479, + "grad_norm": 0.5706882476806641, + "learning_rate": 0.00017435131142163988, + "loss": 1.128, + "step": 5253 + }, + { + "epoch": 0.9355413105413105, + "grad_norm": 0.5369367003440857, + "learning_rate": 0.00017434195028475253, + "loss": 1.0562, + "step": 5254 + }, + { + "epoch": 0.9357193732193733, + "grad_norm": 0.49957552552223206, + "learning_rate": 0.0001743325876912891, + "loss": 1.0568, + "step": 5255 + }, + { + "epoch": 0.9358974358974359, + "grad_norm": 0.5398106575012207, + "learning_rate": 0.00017432322364143305, + "loss": 1.1502, + "step": 5256 + }, + { + "epoch": 0.9360754985754985, + "grad_norm": 0.6522027254104614, + "learning_rate": 0.00017431385813536783, + "loss": 1.0591, + "step": 5257 + }, + { + "epoch": 0.9362535612535613, + "grad_norm": 
0.5872012972831726, + "learning_rate": 0.00017430449117327693, + "loss": 1.3737, + "step": 5258 + }, + { + "epoch": 0.9364316239316239, + "grad_norm": 0.5124474167823792, + "learning_rate": 0.00017429512275534382, + "loss": 1.0727, + "step": 5259 + }, + { + "epoch": 0.9366096866096866, + "grad_norm": 0.5103365778923035, + "learning_rate": 0.00017428575288175218, + "loss": 1.0339, + "step": 5260 + }, + { + "epoch": 0.9367877492877493, + "grad_norm": 0.585483729839325, + "learning_rate": 0.0001742763815526855, + "loss": 1.1844, + "step": 5261 + }, + { + "epoch": 0.936965811965812, + "grad_norm": 0.5855562090873718, + "learning_rate": 0.00017426700876832746, + "loss": 1.3234, + "step": 5262 + }, + { + "epoch": 0.9371438746438746, + "grad_norm": 0.5774588584899902, + "learning_rate": 0.00017425763452886162, + "loss": 1.0937, + "step": 5263 + }, + { + "epoch": 0.9373219373219374, + "grad_norm": 0.5718343257904053, + "learning_rate": 0.00017424825883447168, + "loss": 1.0783, + "step": 5264 + }, + { + "epoch": 0.9375, + "grad_norm": 0.5414558053016663, + "learning_rate": 0.00017423888168534136, + "loss": 1.1244, + "step": 5265 + }, + { + "epoch": 0.9376780626780626, + "grad_norm": 0.5818275809288025, + "learning_rate": 0.00017422950308165438, + "loss": 1.247, + "step": 5266 + }, + { + "epoch": 0.9378561253561254, + "grad_norm": 0.586398184299469, + "learning_rate": 0.00017422012302359448, + "loss": 1.0515, + "step": 5267 + }, + { + "epoch": 0.938034188034188, + "grad_norm": 0.5236606001853943, + "learning_rate": 0.00017421074151134544, + "loss": 1.1907, + "step": 5268 + }, + { + "epoch": 0.9382122507122507, + "grad_norm": 0.5108010172843933, + "learning_rate": 0.0001742013585450911, + "loss": 1.1125, + "step": 5269 + }, + { + "epoch": 0.9383903133903134, + "grad_norm": 0.4956454038619995, + "learning_rate": 0.00017419197412501527, + "loss": 1.0305, + "step": 5270 + }, + { + "epoch": 0.9385683760683761, + "grad_norm": 0.5432302951812744, + "learning_rate": 
0.0001741825882513018, + "loss": 1.1946, + "step": 5271 + }, + { + "epoch": 0.9387464387464387, + "grad_norm": 0.5119295716285706, + "learning_rate": 0.00017417320092413463, + "loss": 0.875, + "step": 5272 + }, + { + "epoch": 0.9389245014245015, + "grad_norm": 0.49740248918533325, + "learning_rate": 0.0001741638121436977, + "loss": 1.1093, + "step": 5273 + }, + { + "epoch": 0.9391025641025641, + "grad_norm": 0.5069027543067932, + "learning_rate": 0.00017415442191017491, + "loss": 1.2498, + "step": 5274 + }, + { + "epoch": 0.9392806267806267, + "grad_norm": 0.570264995098114, + "learning_rate": 0.00017414503022375027, + "loss": 1.0192, + "step": 5275 + }, + { + "epoch": 0.9394586894586895, + "grad_norm": 0.48129352927207947, + "learning_rate": 0.00017413563708460776, + "loss": 0.8467, + "step": 5276 + }, + { + "epoch": 0.9396367521367521, + "grad_norm": 0.5214534401893616, + "learning_rate": 0.00017412624249293148, + "loss": 0.9723, + "step": 5277 + }, + { + "epoch": 0.9398148148148148, + "grad_norm": 0.5150161385536194, + "learning_rate": 0.00017411684644890544, + "loss": 1.0906, + "step": 5278 + }, + { + "epoch": 0.9399928774928775, + "grad_norm": 0.5695852637290955, + "learning_rate": 0.00017410744895271377, + "loss": 1.2891, + "step": 5279 + }, + { + "epoch": 0.9401709401709402, + "grad_norm": 0.5613594651222229, + "learning_rate": 0.00017409805000454055, + "loss": 1.1373, + "step": 5280 + }, + { + "epoch": 0.9403490028490028, + "grad_norm": 0.5134239196777344, + "learning_rate": 0.00017408864960457004, + "loss": 1.1081, + "step": 5281 + }, + { + "epoch": 0.9405270655270656, + "grad_norm": 0.5256397724151611, + "learning_rate": 0.00017407924775298628, + "loss": 1.058, + "step": 5282 + }, + { + "epoch": 0.9407051282051282, + "grad_norm": 0.5145402550697327, + "learning_rate": 0.00017406984444997357, + "loss": 1.0667, + "step": 5283 + }, + { + "epoch": 0.9408831908831908, + "grad_norm": 0.5435704588890076, + "learning_rate": 0.0001740604396957161, + "loss": 
1.2275, + "step": 5284 + }, + { + "epoch": 0.9410612535612536, + "grad_norm": 0.5798762440681458, + "learning_rate": 0.0001740510334903982, + "loss": 1.2061, + "step": 5285 + }, + { + "epoch": 0.9412393162393162, + "grad_norm": 0.5461057424545288, + "learning_rate": 0.00017404162583420414, + "loss": 1.1585, + "step": 5286 + }, + { + "epoch": 0.9414173789173789, + "grad_norm": 0.5090487003326416, + "learning_rate": 0.00017403221672731818, + "loss": 1.2496, + "step": 5287 + }, + { + "epoch": 0.9415954415954416, + "grad_norm": 0.5171035528182983, + "learning_rate": 0.00017402280616992476, + "loss": 1.1947, + "step": 5288 + }, + { + "epoch": 0.9417735042735043, + "grad_norm": 0.5292364358901978, + "learning_rate": 0.00017401339416220818, + "loss": 1.0182, + "step": 5289 + }, + { + "epoch": 0.9419515669515669, + "grad_norm": 0.5011499524116516, + "learning_rate": 0.00017400398070435293, + "loss": 1.3363, + "step": 5290 + }, + { + "epoch": 0.9421296296296297, + "grad_norm": 0.4821554720401764, + "learning_rate": 0.0001739945657965434, + "loss": 0.9077, + "step": 5291 + }, + { + "epoch": 0.9423076923076923, + "grad_norm": 0.5849515199661255, + "learning_rate": 0.00017398514943896403, + "loss": 1.1582, + "step": 5292 + }, + { + "epoch": 0.9424857549857549, + "grad_norm": 0.49826139211654663, + "learning_rate": 0.00017397573163179937, + "loss": 1.1025, + "step": 5293 + }, + { + "epoch": 0.9426638176638177, + "grad_norm": 0.6031842827796936, + "learning_rate": 0.00017396631237523392, + "loss": 1.1932, + "step": 5294 + }, + { + "epoch": 0.9428418803418803, + "grad_norm": 0.6013330221176147, + "learning_rate": 0.00017395689166945224, + "loss": 1.2078, + "step": 5295 + }, + { + "epoch": 0.9430199430199431, + "grad_norm": 0.5147021412849426, + "learning_rate": 0.00017394746951463893, + "loss": 0.9988, + "step": 5296 + }, + { + "epoch": 0.9431980056980057, + "grad_norm": 0.5721762776374817, + "learning_rate": 0.0001739380459109785, + "loss": 1.1442, + "step": 5297 + }, + { + 
"epoch": 0.9433760683760684, + "grad_norm": 0.49272531270980835, + "learning_rate": 0.0001739286208586557, + "loss": 1.0481, + "step": 5298 + }, + { + "epoch": 0.9435541310541311, + "grad_norm": 0.6545688509941101, + "learning_rate": 0.00017391919435785514, + "loss": 1.1393, + "step": 5299 + }, + { + "epoch": 0.9437321937321937, + "grad_norm": 0.617756724357605, + "learning_rate": 0.00017390976640876152, + "loss": 1.1108, + "step": 5300 + }, + { + "epoch": 0.9439102564102564, + "grad_norm": 0.4870470464229584, + "learning_rate": 0.00017390033701155955, + "loss": 0.9028, + "step": 5301 + }, + { + "epoch": 0.9440883190883191, + "grad_norm": 0.5250138640403748, + "learning_rate": 0.000173890906166434, + "loss": 1.0326, + "step": 5302 + }, + { + "epoch": 0.9442663817663818, + "grad_norm": 0.5879467129707336, + "learning_rate": 0.00017388147387356964, + "loss": 1.1569, + "step": 5303 + }, + { + "epoch": 0.9444444444444444, + "grad_norm": 0.4790486991405487, + "learning_rate": 0.00017387204013315127, + "loss": 0.967, + "step": 5304 + }, + { + "epoch": 0.9446225071225072, + "grad_norm": 0.5884372591972351, + "learning_rate": 0.0001738626049453637, + "loss": 1.1342, + "step": 5305 + }, + { + "epoch": 0.9448005698005698, + "grad_norm": 0.4633975028991699, + "learning_rate": 0.00017385316831039187, + "loss": 0.8942, + "step": 5306 + }, + { + "epoch": 0.9449786324786325, + "grad_norm": 0.5301823019981384, + "learning_rate": 0.0001738437302284206, + "loss": 1.1683, + "step": 5307 + }, + { + "epoch": 0.9451566951566952, + "grad_norm": 0.5476770997047424, + "learning_rate": 0.00017383429069963484, + "loss": 1.1574, + "step": 5308 + }, + { + "epoch": 0.9453347578347578, + "grad_norm": 0.47689101099967957, + "learning_rate": 0.00017382484972421953, + "loss": 1.0792, + "step": 5309 + }, + { + "epoch": 0.9455128205128205, + "grad_norm": 0.526063084602356, + "learning_rate": 0.00017381540730235963, + "loss": 0.9012, + "step": 5310 + }, + { + "epoch": 0.9456908831908832, + 
"grad_norm": 0.5667058229446411, + "learning_rate": 0.0001738059634342402, + "loss": 1.0908, + "step": 5311 + }, + { + "epoch": 0.9458689458689459, + "grad_norm": 0.5402196645736694, + "learning_rate": 0.00017379651812004623, + "loss": 0.943, + "step": 5312 + }, + { + "epoch": 0.9460470085470085, + "grad_norm": 0.5288932919502258, + "learning_rate": 0.00017378707135996276, + "loss": 1.0055, + "step": 5313 + }, + { + "epoch": 0.9462250712250713, + "grad_norm": 0.5607456564903259, + "learning_rate": 0.00017377762315417492, + "loss": 1.2073, + "step": 5314 + }, + { + "epoch": 0.9464031339031339, + "grad_norm": 0.5737698674201965, + "learning_rate": 0.00017376817350286781, + "loss": 1.0001, + "step": 5315 + }, + { + "epoch": 0.9465811965811965, + "grad_norm": 0.6562079787254333, + "learning_rate": 0.00017375872240622657, + "loss": 1.1503, + "step": 5316 + }, + { + "epoch": 0.9467592592592593, + "grad_norm": 0.5407183170318604, + "learning_rate": 0.0001737492698644364, + "loss": 1.1169, + "step": 5317 + }, + { + "epoch": 0.9469373219373219, + "grad_norm": 0.5504152178764343, + "learning_rate": 0.00017373981587768248, + "loss": 1.0468, + "step": 5318 + }, + { + "epoch": 0.9471153846153846, + "grad_norm": 0.4813530743122101, + "learning_rate": 0.00017373036044615006, + "loss": 0.9707, + "step": 5319 + }, + { + "epoch": 0.9472934472934473, + "grad_norm": 0.5810509920120239, + "learning_rate": 0.00017372090357002437, + "loss": 1.4949, + "step": 5320 + }, + { + "epoch": 0.94747150997151, + "grad_norm": 0.5250222086906433, + "learning_rate": 0.00017371144524949074, + "loss": 1.0818, + "step": 5321 + }, + { + "epoch": 0.9476495726495726, + "grad_norm": 0.4852280914783478, + "learning_rate": 0.00017370198548473444, + "loss": 1.1793, + "step": 5322 + }, + { + "epoch": 0.9478276353276354, + "grad_norm": 0.5392420291900635, + "learning_rate": 0.00017369252427594086, + "loss": 1.153, + "step": 5323 + }, + { + "epoch": 0.948005698005698, + "grad_norm": 0.521294116973877, + 
"learning_rate": 0.00017368306162329533, + "loss": 0.8572, + "step": 5324 + }, + { + "epoch": 0.9481837606837606, + "grad_norm": 0.5579673647880554, + "learning_rate": 0.0001736735975269833, + "loss": 1.0452, + "step": 5325 + }, + { + "epoch": 0.9483618233618234, + "grad_norm": 0.6027318835258484, + "learning_rate": 0.0001736641319871901, + "loss": 1.3475, + "step": 5326 + }, + { + "epoch": 0.948539886039886, + "grad_norm": 0.5600738525390625, + "learning_rate": 0.00017365466500410132, + "loss": 1.0338, + "step": 5327 + }, + { + "epoch": 0.9487179487179487, + "grad_norm": 0.5691532492637634, + "learning_rate": 0.00017364519657790236, + "loss": 1.129, + "step": 5328 + }, + { + "epoch": 0.9488960113960114, + "grad_norm": 0.5161463022232056, + "learning_rate": 0.0001736357267087788, + "loss": 1.0438, + "step": 5329 + }, + { + "epoch": 0.9490740740740741, + "grad_norm": 0.5049656629562378, + "learning_rate": 0.0001736262553969161, + "loss": 0.9484, + "step": 5330 + }, + { + "epoch": 0.9492521367521367, + "grad_norm": 0.5477150678634644, + "learning_rate": 0.00017361678264249988, + "loss": 0.8995, + "step": 5331 + }, + { + "epoch": 0.9494301994301995, + "grad_norm": 0.5679608583450317, + "learning_rate": 0.0001736073084457157, + "loss": 1.241, + "step": 5332 + }, + { + "epoch": 0.9496082621082621, + "grad_norm": 0.5748196840286255, + "learning_rate": 0.00017359783280674926, + "loss": 1.0046, + "step": 5333 + }, + { + "epoch": 0.9497863247863247, + "grad_norm": 0.5677094459533691, + "learning_rate": 0.00017358835572578617, + "loss": 1.2913, + "step": 5334 + }, + { + "epoch": 0.9499643874643875, + "grad_norm": 0.49663659930229187, + "learning_rate": 0.0001735788772030121, + "loss": 1.0388, + "step": 5335 + }, + { + "epoch": 0.9501424501424501, + "grad_norm": 0.5687218904495239, + "learning_rate": 0.0001735693972386128, + "loss": 1.1631, + "step": 5336 + }, + { + "epoch": 0.9503205128205128, + "grad_norm": 0.520708441734314, + "learning_rate": 0.00017355991583277395, + 
"loss": 1.0744, + "step": 5337 + }, + { + "epoch": 0.9504985754985755, + "grad_norm": 0.5738952159881592, + "learning_rate": 0.00017355043298568137, + "loss": 1.318, + "step": 5338 + }, + { + "epoch": 0.9506766381766382, + "grad_norm": 0.5378455519676208, + "learning_rate": 0.00017354094869752085, + "loss": 0.9827, + "step": 5339 + }, + { + "epoch": 0.9508547008547008, + "grad_norm": 0.5047366619110107, + "learning_rate": 0.0001735314629684782, + "loss": 1.0966, + "step": 5340 + }, + { + "epoch": 0.9510327635327636, + "grad_norm": 0.5526043772697449, + "learning_rate": 0.0001735219757987393, + "loss": 1.059, + "step": 5341 + }, + { + "epoch": 0.9512108262108262, + "grad_norm": 0.5741400718688965, + "learning_rate": 0.00017351248718849003, + "loss": 1.1232, + "step": 5342 + }, + { + "epoch": 0.9513888888888888, + "grad_norm": 0.5421118140220642, + "learning_rate": 0.00017350299713791626, + "loss": 1.0427, + "step": 5343 + }, + { + "epoch": 0.9515669515669516, + "grad_norm": 0.4857081472873688, + "learning_rate": 0.00017349350564720392, + "loss": 0.8663, + "step": 5344 + }, + { + "epoch": 0.9517450142450142, + "grad_norm": 0.5411618947982788, + "learning_rate": 0.00017348401271653904, + "loss": 1.0317, + "step": 5345 + }, + { + "epoch": 0.9519230769230769, + "grad_norm": 0.5246246457099915, + "learning_rate": 0.00017347451834610756, + "loss": 1.0076, + "step": 5346 + }, + { + "epoch": 0.9521011396011396, + "grad_norm": 0.5278927683830261, + "learning_rate": 0.00017346502253609556, + "loss": 0.931, + "step": 5347 + }, + { + "epoch": 0.9522792022792023, + "grad_norm": 0.5934548377990723, + "learning_rate": 0.00017345552528668902, + "loss": 1.3205, + "step": 5348 + }, + { + "epoch": 0.9524572649572649, + "grad_norm": 0.5466100573539734, + "learning_rate": 0.00017344602659807406, + "loss": 0.8725, + "step": 5349 + }, + { + "epoch": 0.9526353276353277, + "grad_norm": 0.5220118761062622, + "learning_rate": 0.00017343652647043678, + "loss": 1.1642, + "step": 5350 + }, + { + 
"epoch": 0.9528133903133903, + "grad_norm": 0.6166301965713501, + "learning_rate": 0.0001734270249039633, + "loss": 0.8152, + "step": 5351 + }, + { + "epoch": 0.9529914529914529, + "grad_norm": 0.5173428058624268, + "learning_rate": 0.00017341752189883983, + "loss": 0.9296, + "step": 5352 + }, + { + "epoch": 0.9531695156695157, + "grad_norm": 0.5363461375236511, + "learning_rate": 0.0001734080174552525, + "loss": 1.3546, + "step": 5353 + }, + { + "epoch": 0.9533475783475783, + "grad_norm": 0.5333831906318665, + "learning_rate": 0.0001733985115733876, + "loss": 1.0401, + "step": 5354 + }, + { + "epoch": 0.9535256410256411, + "grad_norm": 0.5179334878921509, + "learning_rate": 0.00017338900425343132, + "loss": 1.1254, + "step": 5355 + }, + { + "epoch": 0.9537037037037037, + "grad_norm": 0.5171303153038025, + "learning_rate": 0.00017337949549556993, + "loss": 1.0518, + "step": 5356 + }, + { + "epoch": 0.9538817663817664, + "grad_norm": 0.5164596438407898, + "learning_rate": 0.00017336998529998978, + "loss": 0.8732, + "step": 5357 + }, + { + "epoch": 0.9540598290598291, + "grad_norm": 0.5555717349052429, + "learning_rate": 0.00017336047366687719, + "loss": 1.2312, + "step": 5358 + }, + { + "epoch": 0.9542378917378918, + "grad_norm": 0.45685622096061707, + "learning_rate": 0.00017335096059641847, + "loss": 0.8882, + "step": 5359 + }, + { + "epoch": 0.9544159544159544, + "grad_norm": 0.5260133743286133, + "learning_rate": 0.0001733414460888001, + "loss": 1.0952, + "step": 5360 + }, + { + "epoch": 0.9545940170940171, + "grad_norm": 0.4597703814506531, + "learning_rate": 0.0001733319301442084, + "loss": 1.0835, + "step": 5361 + }, + { + "epoch": 0.9547720797720798, + "grad_norm": 0.5279495120048523, + "learning_rate": 0.0001733224127628299, + "loss": 1.0295, + "step": 5362 + }, + { + "epoch": 0.9549501424501424, + "grad_norm": 0.48919400572776794, + "learning_rate": 0.00017331289394485104, + "loss": 0.9693, + "step": 5363 + }, + { + "epoch": 0.9551282051282052, + 
"grad_norm": 0.5639515519142151, + "learning_rate": 0.0001733033736904583, + "loss": 1.0893, + "step": 5364 + }, + { + "epoch": 0.9553062678062678, + "grad_norm": 0.49761319160461426, + "learning_rate": 0.00017329385199983823, + "loss": 1.038, + "step": 5365 + }, + { + "epoch": 0.9554843304843305, + "grad_norm": 0.5503305792808533, + "learning_rate": 0.0001732843288731774, + "loss": 0.9976, + "step": 5366 + }, + { + "epoch": 0.9556623931623932, + "grad_norm": 0.5633028745651245, + "learning_rate": 0.00017327480431066235, + "loss": 1.0602, + "step": 5367 + }, + { + "epoch": 0.9558404558404558, + "grad_norm": 0.48074454069137573, + "learning_rate": 0.00017326527831247973, + "loss": 1.0286, + "step": 5368 + }, + { + "epoch": 0.9560185185185185, + "grad_norm": 0.506597638130188, + "learning_rate": 0.0001732557508788162, + "loss": 0.9061, + "step": 5369 + }, + { + "epoch": 0.9561965811965812, + "grad_norm": 0.6570749282836914, + "learning_rate": 0.0001732462220098584, + "loss": 1.0852, + "step": 5370 + }, + { + "epoch": 0.9563746438746439, + "grad_norm": 0.5607653856277466, + "learning_rate": 0.00017323669170579302, + "loss": 1.0486, + "step": 5371 + }, + { + "epoch": 0.9565527065527065, + "grad_norm": 0.6047050356864929, + "learning_rate": 0.0001732271599668068, + "loss": 1.2175, + "step": 5372 + }, + { + "epoch": 0.9567307692307693, + "grad_norm": 0.5506869554519653, + "learning_rate": 0.00017321762679308651, + "loss": 1.0114, + "step": 5373 + }, + { + "epoch": 0.9569088319088319, + "grad_norm": 0.5868638157844543, + "learning_rate": 0.00017320809218481891, + "loss": 1.2983, + "step": 5374 + }, + { + "epoch": 0.9570868945868946, + "grad_norm": 0.539619505405426, + "learning_rate": 0.00017319855614219084, + "loss": 1.2361, + "step": 5375 + }, + { + "epoch": 0.9572649572649573, + "grad_norm": 0.5525495409965515, + "learning_rate": 0.0001731890186653891, + "loss": 1.1316, + "step": 5376 + }, + { + "epoch": 0.95744301994302, + "grad_norm": 0.5549767017364502, + 
"learning_rate": 0.0001731794797546006, + "loss": 1.0547, + "step": 5377 + }, + { + "epoch": 0.9576210826210826, + "grad_norm": 0.5356076955795288, + "learning_rate": 0.00017316993941001222, + "loss": 0.9942, + "step": 5378 + }, + { + "epoch": 0.9577991452991453, + "grad_norm": 0.5365784168243408, + "learning_rate": 0.00017316039763181084, + "loss": 1.226, + "step": 5379 + }, + { + "epoch": 0.957977207977208, + "grad_norm": 0.5190927386283875, + "learning_rate": 0.00017315085442018343, + "loss": 1.1704, + "step": 5380 + }, + { + "epoch": 0.9581552706552706, + "grad_norm": 0.526658833026886, + "learning_rate": 0.00017314130977531705, + "loss": 1.109, + "step": 5381 + }, + { + "epoch": 0.9583333333333334, + "grad_norm": 0.5373684763908386, + "learning_rate": 0.0001731317636973986, + "loss": 1.0018, + "step": 5382 + }, + { + "epoch": 0.958511396011396, + "grad_norm": 0.5714904069900513, + "learning_rate": 0.00017312221618661516, + "loss": 1.1855, + "step": 5383 + }, + { + "epoch": 0.9586894586894587, + "grad_norm": 0.5707863569259644, + "learning_rate": 0.00017311266724315377, + "loss": 0.9482, + "step": 5384 + }, + { + "epoch": 0.9588675213675214, + "grad_norm": 0.5856872797012329, + "learning_rate": 0.00017310311686720157, + "loss": 0.9543, + "step": 5385 + }, + { + "epoch": 0.959045584045584, + "grad_norm": 0.5041963458061218, + "learning_rate": 0.00017309356505894568, + "loss": 1.1427, + "step": 5386 + }, + { + "epoch": 0.9592236467236467, + "grad_norm": 0.5409179925918579, + "learning_rate": 0.00017308401181857316, + "loss": 0.8432, + "step": 5387 + }, + { + "epoch": 0.9594017094017094, + "grad_norm": 0.5248702764511108, + "learning_rate": 0.00017307445714627128, + "loss": 1.1403, + "step": 5388 + }, + { + "epoch": 0.9595797720797721, + "grad_norm": 0.50718092918396, + "learning_rate": 0.00017306490104222722, + "loss": 0.9066, + "step": 5389 + }, + { + "epoch": 0.9597578347578347, + "grad_norm": 0.5563821196556091, + "learning_rate": 0.0001730553435066282, + 
"loss": 1.0204, + "step": 5390 + }, + { + "epoch": 0.9599358974358975, + "grad_norm": 0.5696987509727478, + "learning_rate": 0.00017304578453966146, + "loss": 1.1405, + "step": 5391 + }, + { + "epoch": 0.9601139601139601, + "grad_norm": 0.5927395224571228, + "learning_rate": 0.00017303622414151435, + "loss": 1.0398, + "step": 5392 + }, + { + "epoch": 0.9602920227920227, + "grad_norm": 0.5375707745552063, + "learning_rate": 0.0001730266623123741, + "loss": 0.9519, + "step": 5393 + }, + { + "epoch": 0.9604700854700855, + "grad_norm": 0.457998126745224, + "learning_rate": 0.00017301709905242815, + "loss": 0.8743, + "step": 5394 + }, + { + "epoch": 0.9606481481481481, + "grad_norm": 0.5427796244621277, + "learning_rate": 0.00017300753436186382, + "loss": 1.078, + "step": 5395 + }, + { + "epoch": 0.9608262108262108, + "grad_norm": 0.5458595752716064, + "learning_rate": 0.0001729979682408685, + "loss": 1.1081, + "step": 5396 + }, + { + "epoch": 0.9610042735042735, + "grad_norm": 0.5495280027389526, + "learning_rate": 0.00017298840068962962, + "loss": 1.0141, + "step": 5397 + }, + { + "epoch": 0.9611823361823362, + "grad_norm": 0.5878560543060303, + "learning_rate": 0.00017297883170833465, + "loss": 1.302, + "step": 5398 + }, + { + "epoch": 0.9613603988603988, + "grad_norm": 0.5452881455421448, + "learning_rate": 0.00017296926129717108, + "loss": 0.9929, + "step": 5399 + }, + { + "epoch": 0.9615384615384616, + "grad_norm": 0.6021811366081238, + "learning_rate": 0.0001729596894563264, + "loss": 1.2629, + "step": 5400 + }, + { + "epoch": 0.9617165242165242, + "grad_norm": 0.5820204615592957, + "learning_rate": 0.0001729501161859882, + "loss": 1.0662, + "step": 5401 + }, + { + "epoch": 0.9618945868945868, + "grad_norm": 0.4953218102455139, + "learning_rate": 0.000172940541486344, + "loss": 1.047, + "step": 5402 + }, + { + "epoch": 0.9620726495726496, + "grad_norm": 0.5409793853759766, + "learning_rate": 0.00017293096535758143, + "loss": 1.1993, + "step": 5403 + }, + { + 
"epoch": 0.9622507122507122, + "grad_norm": 0.49702873826026917, + "learning_rate": 0.00017292138779988805, + "loss": 1.2471, + "step": 5404 + }, + { + "epoch": 0.9624287749287749, + "grad_norm": 0.5743489861488342, + "learning_rate": 0.00017291180881345158, + "loss": 1.0816, + "step": 5405 + }, + { + "epoch": 0.9626068376068376, + "grad_norm": 0.5747945308685303, + "learning_rate": 0.00017290222839845968, + "loss": 1.3548, + "step": 5406 + }, + { + "epoch": 0.9627849002849003, + "grad_norm": 0.5341345071792603, + "learning_rate": 0.00017289264655510005, + "loss": 1.0435, + "step": 5407 + }, + { + "epoch": 0.9629629629629629, + "grad_norm": 0.5719689130783081, + "learning_rate": 0.00017288306328356044, + "loss": 1.2319, + "step": 5408 + }, + { + "epoch": 0.9631410256410257, + "grad_norm": 0.4783279597759247, + "learning_rate": 0.0001728734785840286, + "loss": 0.9397, + "step": 5409 + }, + { + "epoch": 0.9633190883190883, + "grad_norm": 0.4730507731437683, + "learning_rate": 0.00017286389245669233, + "loss": 0.9384, + "step": 5410 + }, + { + "epoch": 0.9634971509971509, + "grad_norm": 0.5309939384460449, + "learning_rate": 0.00017285430490173944, + "loss": 1.098, + "step": 5411 + }, + { + "epoch": 0.9636752136752137, + "grad_norm": 0.5177853107452393, + "learning_rate": 0.0001728447159193578, + "loss": 1.2777, + "step": 5412 + }, + { + "epoch": 0.9638532763532763, + "grad_norm": 0.6437913775444031, + "learning_rate": 0.00017283512550973526, + "loss": 1.2661, + "step": 5413 + }, + { + "epoch": 0.9640313390313391, + "grad_norm": 0.6096072196960449, + "learning_rate": 0.00017282553367305975, + "loss": 0.9569, + "step": 5414 + }, + { + "epoch": 0.9642094017094017, + "grad_norm": 0.5104934573173523, + "learning_rate": 0.00017281594040951918, + "loss": 0.9666, + "step": 5415 + }, + { + "epoch": 0.9643874643874644, + "grad_norm": 0.6178240776062012, + "learning_rate": 0.00017280634571930153, + "loss": 1.1277, + "step": 5416 + }, + { + "epoch": 0.9645655270655271, + 
"grad_norm": 0.5749034881591797, + "learning_rate": 0.0001727967496025948, + "loss": 1.245, + "step": 5417 + }, + { + "epoch": 0.9647435897435898, + "grad_norm": 0.5036978721618652, + "learning_rate": 0.00017278715205958694, + "loss": 1.3049, + "step": 5418 + }, + { + "epoch": 0.9649216524216524, + "grad_norm": 0.5593041777610779, + "learning_rate": 0.00017277755309046605, + "loss": 1.2304, + "step": 5419 + }, + { + "epoch": 0.9650997150997151, + "grad_norm": 0.5446555614471436, + "learning_rate": 0.0001727679526954202, + "loss": 0.732, + "step": 5420 + }, + { + "epoch": 0.9652777777777778, + "grad_norm": 0.6063070297241211, + "learning_rate": 0.00017275835087463747, + "loss": 1.3723, + "step": 5421 + }, + { + "epoch": 0.9654558404558404, + "grad_norm": 0.4994211792945862, + "learning_rate": 0.00017274874762830602, + "loss": 1.0505, + "step": 5422 + }, + { + "epoch": 0.9656339031339032, + "grad_norm": 0.49396973848342896, + "learning_rate": 0.00017273914295661395, + "loss": 0.8691, + "step": 5423 + }, + { + "epoch": 0.9658119658119658, + "grad_norm": 0.5067027807235718, + "learning_rate": 0.0001727295368597495, + "loss": 0.9744, + "step": 5424 + }, + { + "epoch": 0.9659900284900285, + "grad_norm": 0.6720643043518066, + "learning_rate": 0.00017271992933790085, + "loss": 1.1513, + "step": 5425 + }, + { + "epoch": 0.9661680911680912, + "grad_norm": 0.5494341254234314, + "learning_rate": 0.00017271032039125624, + "loss": 0.8295, + "step": 5426 + }, + { + "epoch": 0.9663461538461539, + "grad_norm": 0.644332230091095, + "learning_rate": 0.00017270071002000394, + "loss": 1.0043, + "step": 5427 + }, + { + "epoch": 0.9665242165242165, + "grad_norm": 0.5658500790596008, + "learning_rate": 0.00017269109822433225, + "loss": 1.2575, + "step": 5428 + }, + { + "epoch": 0.9667022792022792, + "grad_norm": 0.5163155794143677, + "learning_rate": 0.00017268148500442952, + "loss": 1.1391, + "step": 5429 + }, + { + "epoch": 0.9668803418803419, + "grad_norm": 0.5113703608512878, + 
"learning_rate": 0.00017267187036048404, + "loss": 1.0819, + "step": 5430 + }, + { + "epoch": 0.9670584045584045, + "grad_norm": 0.6339422464370728, + "learning_rate": 0.00017266225429268426, + "loss": 1.0733, + "step": 5431 + }, + { + "epoch": 0.9672364672364673, + "grad_norm": 0.5158288478851318, + "learning_rate": 0.0001726526368012185, + "loss": 0.9518, + "step": 5432 + }, + { + "epoch": 0.9674145299145299, + "grad_norm": 0.593717634677887, + "learning_rate": 0.00017264301788627527, + "loss": 0.9416, + "step": 5433 + }, + { + "epoch": 0.9675925925925926, + "grad_norm": 0.49593186378479004, + "learning_rate": 0.00017263339754804301, + "loss": 1.0307, + "step": 5434 + }, + { + "epoch": 0.9677706552706553, + "grad_norm": 0.44032949209213257, + "learning_rate": 0.00017262377578671024, + "loss": 0.7884, + "step": 5435 + }, + { + "epoch": 0.967948717948718, + "grad_norm": 0.513073742389679, + "learning_rate": 0.00017261415260246538, + "loss": 0.9797, + "step": 5436 + }, + { + "epoch": 0.9681267806267806, + "grad_norm": 0.5737422108650208, + "learning_rate": 0.0001726045279954971, + "loss": 1.0487, + "step": 5437 + }, + { + "epoch": 0.9683048433048433, + "grad_norm": 0.5385867953300476, + "learning_rate": 0.0001725949019659939, + "loss": 1.4166, + "step": 5438 + }, + { + "epoch": 0.968482905982906, + "grad_norm": 0.5224326848983765, + "learning_rate": 0.00017258527451414438, + "loss": 1.195, + "step": 5439 + }, + { + "epoch": 0.9686609686609686, + "grad_norm": 0.5305148363113403, + "learning_rate": 0.0001725756456401372, + "loss": 1.0301, + "step": 5440 + }, + { + "epoch": 0.9688390313390314, + "grad_norm": 0.532588005065918, + "learning_rate": 0.000172566015344161, + "loss": 1.1269, + "step": 5441 + }, + { + "epoch": 0.969017094017094, + "grad_norm": 0.5812515020370483, + "learning_rate": 0.0001725563836264045, + "loss": 1.1787, + "step": 5442 + }, + { + "epoch": 0.9691951566951567, + "grad_norm": 0.4962109327316284, + "learning_rate": 0.00017254675048705638, + 
"loss": 1.0639, + "step": 5443 + }, + { + "epoch": 0.9693732193732194, + "grad_norm": 0.5094883441925049, + "learning_rate": 0.00017253711592630534, + "loss": 1.0922, + "step": 5444 + }, + { + "epoch": 0.969551282051282, + "grad_norm": 0.5728049874305725, + "learning_rate": 0.00017252747994434025, + "loss": 1.1237, + "step": 5445 + }, + { + "epoch": 0.9697293447293447, + "grad_norm": 0.5406180620193481, + "learning_rate": 0.00017251784254134983, + "loss": 1.1161, + "step": 5446 + }, + { + "epoch": 0.9699074074074074, + "grad_norm": 0.5724552869796753, + "learning_rate": 0.00017250820371752292, + "loss": 1.2205, + "step": 5447 + }, + { + "epoch": 0.9700854700854701, + "grad_norm": 0.5698846578598022, + "learning_rate": 0.0001724985634730484, + "loss": 1.1472, + "step": 5448 + }, + { + "epoch": 0.9702635327635327, + "grad_norm": 0.5315805673599243, + "learning_rate": 0.0001724889218081151, + "loss": 1.0253, + "step": 5449 + }, + { + "epoch": 0.9704415954415955, + "grad_norm": 0.5970377326011658, + "learning_rate": 0.000172479278722912, + "loss": 1.3033, + "step": 5450 + }, + { + "epoch": 0.9706196581196581, + "grad_norm": 0.6149488687515259, + "learning_rate": 0.00017246963421762798, + "loss": 1.0689, + "step": 5451 + }, + { + "epoch": 0.9707977207977208, + "grad_norm": 0.4848574995994568, + "learning_rate": 0.00017245998829245202, + "loss": 0.8829, + "step": 5452 + }, + { + "epoch": 0.9709757834757835, + "grad_norm": 0.6073294281959534, + "learning_rate": 0.00017245034094757312, + "loss": 1.2378, + "step": 5453 + }, + { + "epoch": 0.9711538461538461, + "grad_norm": 0.6362034678459167, + "learning_rate": 0.00017244069218318026, + "loss": 1.3606, + "step": 5454 + }, + { + "epoch": 0.9713319088319088, + "grad_norm": 0.5353880524635315, + "learning_rate": 0.00017243104199946257, + "loss": 1.1288, + "step": 5455 + }, + { + "epoch": 0.9715099715099715, + "grad_norm": 0.5096352100372314, + "learning_rate": 0.00017242139039660902, + "loss": 1.0056, + "step": 5456 + }, + { + 
"epoch": 0.9716880341880342, + "grad_norm": 0.5086682438850403, + "learning_rate": 0.00017241173737480884, + "loss": 1.091, + "step": 5457 + }, + { + "epoch": 0.9718660968660968, + "grad_norm": 0.5034295320510864, + "learning_rate": 0.000172402082934251, + "loss": 0.9749, + "step": 5458 + }, + { + "epoch": 0.9720441595441596, + "grad_norm": 0.5205379724502563, + "learning_rate": 0.0001723924270751248, + "loss": 1.1068, + "step": 5459 + }, + { + "epoch": 0.9722222222222222, + "grad_norm": 0.5904826521873474, + "learning_rate": 0.00017238276979761937, + "loss": 1.0613, + "step": 5460 + }, + { + "epoch": 0.9724002849002849, + "grad_norm": 0.6415045261383057, + "learning_rate": 0.0001723731111019239, + "loss": 1.2126, + "step": 5461 + }, + { + "epoch": 0.9725783475783476, + "grad_norm": 0.5769147872924805, + "learning_rate": 0.0001723634509882277, + "loss": 1.337, + "step": 5462 + }, + { + "epoch": 0.9727564102564102, + "grad_norm": 0.5585111975669861, + "learning_rate": 0.00017235378945671998, + "loss": 1.3922, + "step": 5463 + }, + { + "epoch": 0.9729344729344729, + "grad_norm": 0.5788411498069763, + "learning_rate": 0.00017234412650759008, + "loss": 0.8532, + "step": 5464 + }, + { + "epoch": 0.9731125356125356, + "grad_norm": 0.5617673397064209, + "learning_rate": 0.00017233446214102728, + "loss": 1.2575, + "step": 5465 + }, + { + "epoch": 0.9732905982905983, + "grad_norm": 0.4227815568447113, + "learning_rate": 0.00017232479635722093, + "loss": 1.0618, + "step": 5466 + }, + { + "epoch": 0.9734686609686609, + "grad_norm": 0.49751797318458557, + "learning_rate": 0.00017231512915636047, + "loss": 0.7714, + "step": 5467 + }, + { + "epoch": 0.9736467236467237, + "grad_norm": 0.5983800292015076, + "learning_rate": 0.0001723054605386353, + "loss": 1.2297, + "step": 5468 + }, + { + "epoch": 0.9738247863247863, + "grad_norm": 0.543394923210144, + "learning_rate": 0.0001722957905042348, + "loss": 1.0078, + "step": 5469 + }, + { + "epoch": 0.9740028490028491, + "grad_norm": 
0.5633566975593567, + "learning_rate": 0.00017228611905334846, + "loss": 1.0938, + "step": 5470 + }, + { + "epoch": 0.9741809116809117, + "grad_norm": 0.49377235770225525, + "learning_rate": 0.00017227644618616578, + "loss": 1.096, + "step": 5471 + }, + { + "epoch": 0.9743589743589743, + "grad_norm": 0.4963362216949463, + "learning_rate": 0.00017226677190287627, + "loss": 1.0003, + "step": 5472 + }, + { + "epoch": 0.9745370370370371, + "grad_norm": 0.4483006000518799, + "learning_rate": 0.00017225709620366953, + "loss": 0.8623, + "step": 5473 + }, + { + "epoch": 0.9747150997150997, + "grad_norm": 0.5429352521896362, + "learning_rate": 0.00017224741908873506, + "loss": 1.1383, + "step": 5474 + }, + { + "epoch": 0.9748931623931624, + "grad_norm": 0.5871657729148865, + "learning_rate": 0.0001722377405582625, + "loss": 1.2005, + "step": 5475 + }, + { + "epoch": 0.9750712250712251, + "grad_norm": 0.6002383828163147, + "learning_rate": 0.0001722280606124415, + "loss": 1.0696, + "step": 5476 + }, + { + "epoch": 0.9752492877492878, + "grad_norm": 0.5351617336273193, + "learning_rate": 0.00017221837925146164, + "loss": 1.243, + "step": 5477 + }, + { + "epoch": 0.9754273504273504, + "grad_norm": 0.46613118052482605, + "learning_rate": 0.00017220869647551268, + "loss": 1.0344, + "step": 5478 + }, + { + "epoch": 0.9756054131054132, + "grad_norm": 0.6015593409538269, + "learning_rate": 0.00017219901228478432, + "loss": 1.082, + "step": 5479 + }, + { + "epoch": 0.9757834757834758, + "grad_norm": 0.5829521417617798, + "learning_rate": 0.0001721893266794663, + "loss": 0.8683, + "step": 5480 + }, + { + "epoch": 0.9759615384615384, + "grad_norm": 0.6344960927963257, + "learning_rate": 0.00017217963965974838, + "loss": 1.1048, + "step": 5481 + }, + { + "epoch": 0.9761396011396012, + "grad_norm": 0.5586308240890503, + "learning_rate": 0.00017216995122582034, + "loss": 0.9657, + "step": 5482 + }, + { + "epoch": 0.9763176638176638, + "grad_norm": 0.48625239729881287, + "learning_rate": 
0.00017216026137787204, + "loss": 1.1026, + "step": 5483 + }, + { + "epoch": 0.9764957264957265, + "grad_norm": 0.5625223517417908, + "learning_rate": 0.00017215057011609332, + "loss": 1.1579, + "step": 5484 + }, + { + "epoch": 0.9766737891737892, + "grad_norm": 0.6016653776168823, + "learning_rate": 0.0001721408774406741, + "loss": 1.1777, + "step": 5485 + }, + { + "epoch": 0.9768518518518519, + "grad_norm": 0.5444921851158142, + "learning_rate": 0.00017213118335180418, + "loss": 1.119, + "step": 5486 + }, + { + "epoch": 0.9770299145299145, + "grad_norm": 0.5574755668640137, + "learning_rate": 0.0001721214878496736, + "loss": 1.1128, + "step": 5487 + }, + { + "epoch": 0.9772079772079773, + "grad_norm": 0.5486113429069519, + "learning_rate": 0.00017211179093447226, + "loss": 1.1673, + "step": 5488 + }, + { + "epoch": 0.9773860398860399, + "grad_norm": 0.5545483231544495, + "learning_rate": 0.00017210209260639018, + "loss": 1.1748, + "step": 5489 + }, + { + "epoch": 0.9775641025641025, + "grad_norm": 0.5756667256355286, + "learning_rate": 0.0001720923928656174, + "loss": 1.2377, + "step": 5490 + }, + { + "epoch": 0.9777421652421653, + "grad_norm": 0.5744972229003906, + "learning_rate": 0.00017208269171234392, + "loss": 1.1242, + "step": 5491 + }, + { + "epoch": 0.9779202279202279, + "grad_norm": 0.6109468340873718, + "learning_rate": 0.00017207298914675984, + "loss": 1.1948, + "step": 5492 + }, + { + "epoch": 0.9780982905982906, + "grad_norm": 0.5195167660713196, + "learning_rate": 0.00017206328516905525, + "loss": 1.0941, + "step": 5493 + }, + { + "epoch": 0.9782763532763533, + "grad_norm": 0.5549042224884033, + "learning_rate": 0.0001720535797794203, + "loss": 1.1503, + "step": 5494 + }, + { + "epoch": 0.978454415954416, + "grad_norm": 0.6317743062973022, + "learning_rate": 0.0001720438729780451, + "loss": 1.3468, + "step": 5495 + }, + { + "epoch": 0.9786324786324786, + "grad_norm": 0.5932528972625732, + "learning_rate": 0.0001720341647651199, + "loss": 1.105, + 
"step": 5496 + }, + { + "epoch": 0.9788105413105413, + "grad_norm": 0.607880175113678, + "learning_rate": 0.00017202445514083488, + "loss": 1.1465, + "step": 5497 + }, + { + "epoch": 0.978988603988604, + "grad_norm": 0.49227309226989746, + "learning_rate": 0.00017201474410538027, + "loss": 0.9075, + "step": 5498 + }, + { + "epoch": 0.9791666666666666, + "grad_norm": 0.5059443116188049, + "learning_rate": 0.00017200503165894636, + "loss": 1.0483, + "step": 5499 + }, + { + "epoch": 0.9793447293447294, + "grad_norm": 0.5792799592018127, + "learning_rate": 0.0001719953178017234, + "loss": 1.0987, + "step": 5500 + }, + { + "epoch": 0.979522792022792, + "grad_norm": 0.5010457038879395, + "learning_rate": 0.00017198560253390177, + "loss": 1.1051, + "step": 5501 + }, + { + "epoch": 0.9797008547008547, + "grad_norm": 0.5866543054580688, + "learning_rate": 0.0001719758858556718, + "loss": 1.2824, + "step": 5502 + }, + { + "epoch": 0.9798789173789174, + "grad_norm": 0.5392137169837952, + "learning_rate": 0.00017196616776722382, + "loss": 0.886, + "step": 5503 + }, + { + "epoch": 0.98005698005698, + "grad_norm": 0.5200899839401245, + "learning_rate": 0.00017195644826874834, + "loss": 1.1504, + "step": 5504 + }, + { + "epoch": 0.9802350427350427, + "grad_norm": 0.533159077167511, + "learning_rate": 0.00017194672736043569, + "loss": 1.1216, + "step": 5505 + }, + { + "epoch": 0.9804131054131054, + "grad_norm": 0.5543524622917175, + "learning_rate": 0.0001719370050424764, + "loss": 1.0161, + "step": 5506 + }, + { + "epoch": 0.9805911680911681, + "grad_norm": 0.5315365195274353, + "learning_rate": 0.00017192728131506092, + "loss": 1.0509, + "step": 5507 + }, + { + "epoch": 0.9807692307692307, + "grad_norm": 0.5406147837638855, + "learning_rate": 0.00017191755617837977, + "loss": 1.0695, + "step": 5508 + }, + { + "epoch": 0.9809472934472935, + "grad_norm": 0.4563386142253876, + "learning_rate": 0.00017190782963262354, + "loss": 0.995, + "step": 5509 + }, + { + "epoch": 
0.9811253561253561, + "grad_norm": 0.5456405282020569, + "learning_rate": 0.00017189810167798274, + "loss": 1.0546, + "step": 5510 + }, + { + "epoch": 0.9813034188034188, + "grad_norm": 0.6275575160980225, + "learning_rate": 0.00017188837231464795, + "loss": 1.0432, + "step": 5511 + }, + { + "epoch": 0.9814814814814815, + "grad_norm": 0.49735602736473083, + "learning_rate": 0.0001718786415428099, + "loss": 1.035, + "step": 5512 + }, + { + "epoch": 0.9816595441595442, + "grad_norm": 0.5234259963035583, + "learning_rate": 0.00017186890936265916, + "loss": 1.0918, + "step": 5513 + }, + { + "epoch": 0.9818376068376068, + "grad_norm": 0.5091170072555542, + "learning_rate": 0.00017185917577438643, + "loss": 1.0239, + "step": 5514 + }, + { + "epoch": 0.9820156695156695, + "grad_norm": 0.6155703067779541, + "learning_rate": 0.00017184944077818244, + "loss": 1.2366, + "step": 5515 + }, + { + "epoch": 0.9821937321937322, + "grad_norm": 0.5074070692062378, + "learning_rate": 0.0001718397043742379, + "loss": 1.0318, + "step": 5516 + }, + { + "epoch": 0.9823717948717948, + "grad_norm": 0.5234423279762268, + "learning_rate": 0.0001718299665627436, + "loss": 1.0322, + "step": 5517 + }, + { + "epoch": 0.9825498575498576, + "grad_norm": 0.5783474445343018, + "learning_rate": 0.0001718202273438903, + "loss": 0.9486, + "step": 5518 + }, + { + "epoch": 0.9827279202279202, + "grad_norm": 0.5708683133125305, + "learning_rate": 0.00017181048671786886, + "loss": 1.0785, + "step": 5519 + }, + { + "epoch": 0.9829059829059829, + "grad_norm": 0.5985961556434631, + "learning_rate": 0.00017180074468487009, + "loss": 1.198, + "step": 5520 + }, + { + "epoch": 0.9830840455840456, + "grad_norm": 0.5711352229118347, + "learning_rate": 0.0001717910012450849, + "loss": 1.0386, + "step": 5521 + }, + { + "epoch": 0.9832621082621082, + "grad_norm": 0.5338063836097717, + "learning_rate": 0.00017178125639870416, + "loss": 1.1594, + "step": 5522 + }, + { + "epoch": 0.9834401709401709, + "grad_norm": 
0.6144943237304688, + "learning_rate": 0.00017177151014591881, + "loss": 1.1083, + "step": 5523 + }, + { + "epoch": 0.9836182336182336, + "grad_norm": 0.547285795211792, + "learning_rate": 0.00017176176248691983, + "loss": 1.1507, + "step": 5524 + }, + { + "epoch": 0.9837962962962963, + "grad_norm": 0.5807644724845886, + "learning_rate": 0.00017175201342189817, + "loss": 1.3044, + "step": 5525 + }, + { + "epoch": 0.9839743589743589, + "grad_norm": 0.5229477882385254, + "learning_rate": 0.00017174226295104485, + "loss": 1.2622, + "step": 5526 + }, + { + "epoch": 0.9841524216524217, + "grad_norm": 0.6100695133209229, + "learning_rate": 0.00017173251107455094, + "loss": 1.2026, + "step": 5527 + }, + { + "epoch": 0.9843304843304843, + "grad_norm": 0.5410884618759155, + "learning_rate": 0.00017172275779260744, + "loss": 1.2964, + "step": 5528 + }, + { + "epoch": 0.9845085470085471, + "grad_norm": 0.5937406420707703, + "learning_rate": 0.00017171300310540554, + "loss": 1.1435, + "step": 5529 + }, + { + "epoch": 0.9846866096866097, + "grad_norm": 0.56817227602005, + "learning_rate": 0.00017170324701313634, + "loss": 1.0099, + "step": 5530 + }, + { + "epoch": 0.9848646723646723, + "grad_norm": 0.5776323080062866, + "learning_rate": 0.00017169348951599092, + "loss": 1.3539, + "step": 5531 + }, + { + "epoch": 0.9850427350427351, + "grad_norm": 0.5208535194396973, + "learning_rate": 0.0001716837306141605, + "loss": 1.2306, + "step": 5532 + }, + { + "epoch": 0.9852207977207977, + "grad_norm": 0.552173376083374, + "learning_rate": 0.0001716739703078363, + "loss": 1.0551, + "step": 5533 + }, + { + "epoch": 0.9853988603988604, + "grad_norm": 0.5327515602111816, + "learning_rate": 0.00017166420859720955, + "loss": 1.2443, + "step": 5534 + }, + { + "epoch": 0.9855769230769231, + "grad_norm": 0.5255244374275208, + "learning_rate": 0.0001716544454824715, + "loss": 1.005, + "step": 5535 + }, + { + "epoch": 0.9857549857549858, + "grad_norm": 0.4753847122192383, + "learning_rate": 
0.00017164468096381343, + "loss": 1.0081, + "step": 5536 + }, + { + "epoch": 0.9859330484330484, + "grad_norm": 0.5261829495429993, + "learning_rate": 0.00017163491504142665, + "loss": 1.2249, + "step": 5537 + }, + { + "epoch": 0.9861111111111112, + "grad_norm": 0.46499499678611755, + "learning_rate": 0.00017162514771550255, + "loss": 0.8759, + "step": 5538 + }, + { + "epoch": 0.9862891737891738, + "grad_norm": 0.5233004689216614, + "learning_rate": 0.00017161537898623247, + "loss": 1.0474, + "step": 5539 + }, + { + "epoch": 0.9864672364672364, + "grad_norm": 0.46905553340911865, + "learning_rate": 0.00017160560885380778, + "loss": 0.9033, + "step": 5540 + }, + { + "epoch": 0.9866452991452992, + "grad_norm": 0.5816231369972229, + "learning_rate": 0.00017159583731841998, + "loss": 1.0628, + "step": 5541 + }, + { + "epoch": 0.9868233618233618, + "grad_norm": 0.4575413167476654, + "learning_rate": 0.00017158606438026045, + "loss": 1.0446, + "step": 5542 + }, + { + "epoch": 0.9870014245014245, + "grad_norm": 0.5968109965324402, + "learning_rate": 0.00017157629003952067, + "loss": 1.032, + "step": 5543 + }, + { + "epoch": 0.9871794871794872, + "grad_norm": 0.5316148400306702, + "learning_rate": 0.00017156651429639218, + "loss": 0.9167, + "step": 5544 + }, + { + "epoch": 0.9873575498575499, + "grad_norm": 0.5185125470161438, + "learning_rate": 0.00017155673715106651, + "loss": 1.1527, + "step": 5545 + }, + { + "epoch": 0.9875356125356125, + "grad_norm": 0.5167772769927979, + "learning_rate": 0.00017154695860373525, + "loss": 0.9954, + "step": 5546 + }, + { + "epoch": 0.9877136752136753, + "grad_norm": 0.6406680345535278, + "learning_rate": 0.00017153717865458994, + "loss": 1.2758, + "step": 5547 + }, + { + "epoch": 0.9878917378917379, + "grad_norm": 0.5223956108093262, + "learning_rate": 0.00017152739730382223, + "loss": 1.1526, + "step": 5548 + }, + { + "epoch": 0.9880698005698005, + "grad_norm": 0.6131790280342102, + "learning_rate": 0.00017151761455162375, + "loss": 
1.1024, + "step": 5549 + }, + { + "epoch": 0.9882478632478633, + "grad_norm": 0.5574753880500793, + "learning_rate": 0.00017150783039818616, + "loss": 0.9733, + "step": 5550 + }, + { + "epoch": 0.9884259259259259, + "grad_norm": 0.5417882800102234, + "learning_rate": 0.0001714980448437012, + "loss": 1.2244, + "step": 5551 + }, + { + "epoch": 0.9886039886039886, + "grad_norm": 0.6217474341392517, + "learning_rate": 0.0001714882578883606, + "loss": 0.9224, + "step": 5552 + }, + { + "epoch": 0.9887820512820513, + "grad_norm": 0.5846285223960876, + "learning_rate": 0.00017147846953235606, + "loss": 1.2429, + "step": 5553 + }, + { + "epoch": 0.988960113960114, + "grad_norm": 0.5924782752990723, + "learning_rate": 0.00017146867977587936, + "loss": 0.9907, + "step": 5554 + }, + { + "epoch": 0.9891381766381766, + "grad_norm": 0.5756853818893433, + "learning_rate": 0.00017145888861912242, + "loss": 1.1266, + "step": 5555 + }, + { + "epoch": 0.9893162393162394, + "grad_norm": 0.5277376770973206, + "learning_rate": 0.00017144909606227693, + "loss": 1.1676, + "step": 5556 + }, + { + "epoch": 0.989494301994302, + "grad_norm": 0.5138902068138123, + "learning_rate": 0.00017143930210553485, + "loss": 0.9864, + "step": 5557 + }, + { + "epoch": 0.9896723646723646, + "grad_norm": 0.8072507977485657, + "learning_rate": 0.00017142950674908805, + "loss": 1.111, + "step": 5558 + }, + { + "epoch": 0.9898504273504274, + "grad_norm": 0.5641721487045288, + "learning_rate": 0.00017141970999312844, + "loss": 0.9106, + "step": 5559 + }, + { + "epoch": 0.99002849002849, + "grad_norm": 0.5260798931121826, + "learning_rate": 0.000171409911837848, + "loss": 1.1609, + "step": 5560 + }, + { + "epoch": 0.9902065527065527, + "grad_norm": 0.5398530960083008, + "learning_rate": 0.00017140011228343864, + "loss": 1.0368, + "step": 5561 + }, + { + "epoch": 0.9903846153846154, + "grad_norm": 0.6011313199996948, + "learning_rate": 0.00017139031133009245, + "loss": 1.1314, + "step": 5562 + }, + { + "epoch": 
0.9905626780626781, + "grad_norm": 0.6194971203804016, + "learning_rate": 0.00017138050897800135, + "loss": 1.3493, + "step": 5563 + }, + { + "epoch": 0.9907407407407407, + "grad_norm": 0.5779356956481934, + "learning_rate": 0.0001713707052273575, + "loss": 0.943, + "step": 5564 + }, + { + "epoch": 0.9909188034188035, + "grad_norm": 0.5321127772331238, + "learning_rate": 0.00017136090007835293, + "loss": 0.7914, + "step": 5565 + }, + { + "epoch": 0.9910968660968661, + "grad_norm": 0.5470426678657532, + "learning_rate": 0.00017135109353117977, + "loss": 1.2113, + "step": 5566 + }, + { + "epoch": 0.9912749287749287, + "grad_norm": 0.5551436543464661, + "learning_rate": 0.00017134128558603012, + "loss": 0.8932, + "step": 5567 + }, + { + "epoch": 0.9914529914529915, + "grad_norm": 0.45770928263664246, + "learning_rate": 0.0001713314762430962, + "loss": 1.0061, + "step": 5568 + }, + { + "epoch": 0.9916310541310541, + "grad_norm": 0.5578967332839966, + "learning_rate": 0.00017132166550257017, + "loss": 1.148, + "step": 5569 + }, + { + "epoch": 0.9918091168091168, + "grad_norm": 0.5086452960968018, + "learning_rate": 0.0001713118533646443, + "loss": 0.9803, + "step": 5570 + }, + { + "epoch": 0.9919871794871795, + "grad_norm": 0.4714745879173279, + "learning_rate": 0.00017130203982951078, + "loss": 1.0176, + "step": 5571 + }, + { + "epoch": 0.9921652421652422, + "grad_norm": 0.6254406571388245, + "learning_rate": 0.0001712922248973619, + "loss": 1.0932, + "step": 5572 + }, + { + "epoch": 0.9923433048433048, + "grad_norm": 0.5005003809928894, + "learning_rate": 0.00017128240856838998, + "loss": 1.0783, + "step": 5573 + }, + { + "epoch": 0.9925213675213675, + "grad_norm": 0.5668206214904785, + "learning_rate": 0.00017127259084278733, + "loss": 1.0404, + "step": 5574 + }, + { + "epoch": 0.9926994301994302, + "grad_norm": 0.4976036250591278, + "learning_rate": 0.00017126277172074632, + "loss": 1.1437, + "step": 5575 + }, + { + "epoch": 0.9928774928774928, + "grad_norm": 
0.567546546459198, + "learning_rate": 0.00017125295120245935, + "loss": 1.2188, + "step": 5576 + }, + { + "epoch": 0.9930555555555556, + "grad_norm": 0.5614372491836548, + "learning_rate": 0.0001712431292881188, + "loss": 0.9187, + "step": 5577 + }, + { + "epoch": 0.9932336182336182, + "grad_norm": 0.6117973327636719, + "learning_rate": 0.00017123330597791712, + "loss": 1.1285, + "step": 5578 + }, + { + "epoch": 0.9934116809116809, + "grad_norm": 0.6000342965126038, + "learning_rate": 0.00017122348127204676, + "loss": 0.9837, + "step": 5579 + }, + { + "epoch": 0.9935897435897436, + "grad_norm": 0.5453050136566162, + "learning_rate": 0.0001712136551707003, + "loss": 0.8771, + "step": 5580 + }, + { + "epoch": 0.9937678062678063, + "grad_norm": 0.49603891372680664, + "learning_rate": 0.00017120382767407018, + "loss": 1.0754, + "step": 5581 + }, + { + "epoch": 0.9939458689458689, + "grad_norm": 0.48031488060951233, + "learning_rate": 0.00017119399878234894, + "loss": 0.6933, + "step": 5582 + }, + { + "epoch": 0.9941239316239316, + "grad_norm": 0.6048742532730103, + "learning_rate": 0.0001711841684957292, + "loss": 0.9696, + "step": 5583 + }, + { + "epoch": 0.9943019943019943, + "grad_norm": 0.5183123350143433, + "learning_rate": 0.00017117433681440355, + "loss": 1.1313, + "step": 5584 + }, + { + "epoch": 0.9944800569800569, + "grad_norm": 0.504916250705719, + "learning_rate": 0.00017116450373856466, + "loss": 1.0273, + "step": 5585 + }, + { + "epoch": 0.9946581196581197, + "grad_norm": 0.5804886817932129, + "learning_rate": 0.0001711546692684051, + "loss": 1.1162, + "step": 5586 + }, + { + "epoch": 0.9948361823361823, + "grad_norm": 0.5531938672065735, + "learning_rate": 0.0001711448334041176, + "loss": 1.2893, + "step": 5587 + }, + { + "epoch": 0.9950142450142451, + "grad_norm": 0.5079928636550903, + "learning_rate": 0.00017113499614589492, + "loss": 1.0393, + "step": 5588 + }, + { + "epoch": 0.9951923076923077, + "grad_norm": 0.5421964526176453, + "learning_rate": 
0.00017112515749392973, + "loss": 0.8844, + "step": 5589 + }, + { + "epoch": 0.9953703703703703, + "grad_norm": 0.4834558367729187, + "learning_rate": 0.00017111531744841486, + "loss": 1.0187, + "step": 5590 + }, + { + "epoch": 0.9955484330484331, + "grad_norm": 0.6704340577125549, + "learning_rate": 0.00017110547600954307, + "loss": 0.8524, + "step": 5591 + }, + { + "epoch": 0.9957264957264957, + "grad_norm": 0.4578927159309387, + "learning_rate": 0.00017109563317750718, + "loss": 1.059, + "step": 5592 + }, + { + "epoch": 0.9959045584045584, + "grad_norm": 0.5563494563102722, + "learning_rate": 0.00017108578895250006, + "loss": 1.1211, + "step": 5593 + }, + { + "epoch": 0.9960826210826211, + "grad_norm": 0.5272170901298523, + "learning_rate": 0.00017107594333471454, + "loss": 0.9224, + "step": 5594 + }, + { + "epoch": 0.9962606837606838, + "grad_norm": 0.5697501301765442, + "learning_rate": 0.00017106609632434357, + "loss": 1.2223, + "step": 5595 + }, + { + "epoch": 0.9964387464387464, + "grad_norm": 0.5385653376579285, + "learning_rate": 0.00017105624792158007, + "loss": 1.0809, + "step": 5596 + }, + { + "epoch": 0.9966168091168092, + "grad_norm": 0.5608006119728088, + "learning_rate": 0.000171046398126617, + "loss": 1.3936, + "step": 5597 + }, + { + "epoch": 0.9967948717948718, + "grad_norm": 0.5063132643699646, + "learning_rate": 0.00017103654693964736, + "loss": 1.2086, + "step": 5598 + }, + { + "epoch": 0.9969729344729344, + "grad_norm": 0.6014235019683838, + "learning_rate": 0.00017102669436086415, + "loss": 1.1231, + "step": 5599 + }, + { + "epoch": 0.9971509971509972, + "grad_norm": 0.49549567699432373, + "learning_rate": 0.00017101684039046036, + "loss": 1.0013, + "step": 5600 + }, + { + "epoch": 0.9973290598290598, + "grad_norm": 0.517464816570282, + "learning_rate": 0.00017100698502862916, + "loss": 1.1143, + "step": 5601 + }, + { + "epoch": 0.9975071225071225, + "grad_norm": 0.514281153678894, + "learning_rate": 0.00017099712827556358, + "loss": 
1.0336, + "step": 5602 + }, + { + "epoch": 0.9976851851851852, + "grad_norm": 0.5378567576408386, + "learning_rate": 0.00017098727013145672, + "loss": 0.8278, + "step": 5603 + }, + { + "epoch": 0.9978632478632479, + "grad_norm": 0.5098404884338379, + "learning_rate": 0.0001709774105965018, + "loss": 0.9902, + "step": 5604 + }, + { + "epoch": 0.9980413105413105, + "grad_norm": 0.6231759190559387, + "learning_rate": 0.00017096754967089198, + "loss": 1.0564, + "step": 5605 + }, + { + "epoch": 0.9982193732193733, + "grad_norm": 0.47434380650520325, + "learning_rate": 0.00017095768735482042, + "loss": 0.7457, + "step": 5606 + }, + { + "epoch": 0.9983974358974359, + "grad_norm": 0.5771013498306274, + "learning_rate": 0.00017094782364848035, + "loss": 1.1191, + "step": 5607 + }, + { + "epoch": 0.9985754985754985, + "grad_norm": 0.5617234706878662, + "learning_rate": 0.00017093795855206508, + "loss": 1.0779, + "step": 5608 + }, + { + "epoch": 0.9987535612535613, + "grad_norm": 0.6573554873466492, + "learning_rate": 0.00017092809206576792, + "loss": 1.0191, + "step": 5609 + }, + { + "epoch": 0.9989316239316239, + "grad_norm": 0.482834130525589, + "learning_rate": 0.00017091822418978207, + "loss": 1.0119, + "step": 5610 + }, + { + "epoch": 0.9991096866096866, + "grad_norm": 0.47496405243873596, + "learning_rate": 0.000170908354924301, + "loss": 0.8297, + "step": 5611 + }, + { + "epoch": 0.9992877492877493, + "grad_norm": 0.5013265013694763, + "learning_rate": 0.00017089848426951796, + "loss": 1.1511, + "step": 5612 + }, + { + "epoch": 0.999465811965812, + "grad_norm": 0.5402522683143616, + "learning_rate": 0.00017088861222562643, + "loss": 1.1401, + "step": 5613 + }, + { + "epoch": 0.9996438746438746, + "grad_norm": 0.546302318572998, + "learning_rate": 0.00017087873879281977, + "loss": 0.8611, + "step": 5614 + }, + { + "epoch": 0.9998219373219374, + "grad_norm": 0.44279807806015015, + "learning_rate": 0.0001708688639712915, + "loss": 0.79, + "step": 5615 + }, + { + "epoch": 
1.0, + "grad_norm": 0.5514659285545349, + "learning_rate": 0.00017085898776123502, + "loss": 1.0709, + "step": 5616 + }, + { + "epoch": 1.0, + "eval_loss": 1.093075156211853, + "eval_runtime": 24.6155, + "eval_samples_per_second": 42.29, + "eval_steps_per_second": 21.166, + "step": 5616 + }, + { + "epoch": 1.0001780626780628, + "grad_norm": 0.6290156841278076, + "learning_rate": 0.0001708491101628439, + "loss": 1.1786, + "step": 5617 + }, + { + "epoch": 1.0001780626780628, + "grad_norm": 0.4703841209411621, + "learning_rate": 0.00017083923117631162, + "loss": 0.9548, + "step": 5618 + }, + { + "epoch": 1.0003561253561253, + "grad_norm": 0.4518105089664459, + "learning_rate": 0.0001708293508018318, + "loss": 1.0089, + "step": 5619 + }, + { + "epoch": 1.000534188034188, + "grad_norm": 0.5658619403839111, + "learning_rate": 0.00017081946903959794, + "loss": 0.9466, + "step": 5620 + }, + { + "epoch": 1.0007122507122508, + "grad_norm": 0.6153838634490967, + "learning_rate": 0.00017080958588980372, + "loss": 1.2898, + "step": 5621 + }, + { + "epoch": 1.0008903133903133, + "grad_norm": 0.5245628952980042, + "learning_rate": 0.00017079970135264275, + "loss": 1.1702, + "step": 5622 + }, + { + "epoch": 1.001068376068376, + "grad_norm": 0.5291880965232849, + "learning_rate": 0.00017078981542830875, + "loss": 1.0779, + "step": 5623 + }, + { + "epoch": 1.0012464387464388, + "grad_norm": 0.500579297542572, + "learning_rate": 0.0001707799281169953, + "loss": 0.9587, + "step": 5624 + }, + { + "epoch": 1.0014245014245013, + "grad_norm": 0.45739707350730896, + "learning_rate": 0.00017077003941889625, + "loss": 0.9373, + "step": 5625 + }, + { + "epoch": 1.001602564102564, + "grad_norm": 0.5513401031494141, + "learning_rate": 0.00017076014933420526, + "loss": 1.0368, + "step": 5626 + }, + { + "epoch": 1.0017806267806268, + "grad_norm": 0.46513232588768005, + "learning_rate": 0.00017075025786311612, + "loss": 0.9422, + "step": 5627 + }, + { + "epoch": 1.0019586894586894, + "grad_norm": 
0.4530394673347473, + "learning_rate": 0.00017074036500582267, + "loss": 0.8211, + "step": 5628 + }, + { + "epoch": 1.0021367521367521, + "grad_norm": 0.5612013339996338, + "learning_rate": 0.00017073047076251872, + "loss": 0.9466, + "step": 5629 + }, + { + "epoch": 1.0023148148148149, + "grad_norm": 0.4976879954338074, + "learning_rate": 0.00017072057513339812, + "loss": 0.8059, + "step": 5630 + }, + { + "epoch": 1.0024928774928774, + "grad_norm": 0.4842833876609802, + "learning_rate": 0.00017071067811865476, + "loss": 0.6554, + "step": 5631 + }, + { + "epoch": 1.0026709401709402, + "grad_norm": 0.5446373224258423, + "learning_rate": 0.00017070077971848257, + "loss": 1.1001, + "step": 5632 + }, + { + "epoch": 1.002849002849003, + "grad_norm": 0.5996584892272949, + "learning_rate": 0.00017069087993307544, + "loss": 1.0317, + "step": 5633 + }, + { + "epoch": 1.0030270655270654, + "grad_norm": 0.5369443297386169, + "learning_rate": 0.00017068097876262738, + "loss": 0.8019, + "step": 5634 + }, + { + "epoch": 1.0032051282051282, + "grad_norm": 0.4985966682434082, + "learning_rate": 0.00017067107620733236, + "loss": 1.0121, + "step": 5635 + }, + { + "epoch": 1.003383190883191, + "grad_norm": 0.5262824892997742, + "learning_rate": 0.0001706611722673844, + "loss": 1.0157, + "step": 5636 + }, + { + "epoch": 1.0035612535612535, + "grad_norm": 0.5912795066833496, + "learning_rate": 0.00017065126694297756, + "loss": 1.0327, + "step": 5637 + }, + { + "epoch": 1.0037393162393162, + "grad_norm": 0.5866343379020691, + "learning_rate": 0.00017064136023430595, + "loss": 1.1194, + "step": 5638 + }, + { + "epoch": 1.003917378917379, + "grad_norm": 0.5009918808937073, + "learning_rate": 0.0001706314521415636, + "loss": 1.0467, + "step": 5639 + }, + { + "epoch": 1.0040954415954415, + "grad_norm": 0.5455304384231567, + "learning_rate": 0.00017062154266494464, + "loss": 0.8749, + "step": 5640 + }, + { + "epoch": 1.0042735042735043, + "grad_norm": 0.5648258328437805, + "learning_rate": 
0.00017061163180464328, + "loss": 0.9408, + "step": 5641 + }, + { + "epoch": 1.004451566951567, + "grad_norm": 0.5276365876197815, + "learning_rate": 0.00017060171956085368, + "loss": 0.9681, + "step": 5642 + }, + { + "epoch": 1.0046296296296295, + "grad_norm": 0.5212745070457458, + "learning_rate": 0.00017059180593377007, + "loss": 0.9188, + "step": 5643 + }, + { + "epoch": 1.0048076923076923, + "grad_norm": 0.540626585483551, + "learning_rate": 0.00017058189092358664, + "loss": 1.0809, + "step": 5644 + }, + { + "epoch": 1.004985754985755, + "grad_norm": 0.5592377781867981, + "learning_rate": 0.00017057197453049767, + "loss": 0.8589, + "step": 5645 + }, + { + "epoch": 1.0051638176638176, + "grad_norm": 0.5115051865577698, + "learning_rate": 0.00017056205675469746, + "loss": 0.8006, + "step": 5646 + }, + { + "epoch": 1.0053418803418803, + "grad_norm": 0.5031117796897888, + "learning_rate": 0.00017055213759638034, + "loss": 0.9242, + "step": 5647 + }, + { + "epoch": 1.005519943019943, + "grad_norm": 0.5342774987220764, + "learning_rate": 0.00017054221705574066, + "loss": 0.8268, + "step": 5648 + }, + { + "epoch": 1.0056980056980056, + "grad_norm": 0.44480493664741516, + "learning_rate": 0.00017053229513297276, + "loss": 0.6892, + "step": 5649 + }, + { + "epoch": 1.0058760683760684, + "grad_norm": 0.5032621622085571, + "learning_rate": 0.00017052237182827105, + "loss": 0.971, + "step": 5650 + }, + { + "epoch": 1.006054131054131, + "grad_norm": 0.5611015558242798, + "learning_rate": 0.00017051244714182996, + "loss": 0.9403, + "step": 5651 + }, + { + "epoch": 1.0062321937321936, + "grad_norm": 0.5064613223075867, + "learning_rate": 0.00017050252107384393, + "loss": 0.9718, + "step": 5652 + }, + { + "epoch": 1.0064102564102564, + "grad_norm": 0.6458395719528198, + "learning_rate": 0.0001704925936245075, + "loss": 1.1161, + "step": 5653 + }, + { + "epoch": 1.0065883190883191, + "grad_norm": 0.527418315410614, + "learning_rate": 0.00017048266479401512, + "loss": 0.9315, + 
"step": 5654 + }, + { + "epoch": 1.0067663817663817, + "grad_norm": 0.5127941370010376, + "learning_rate": 0.00017047273458256133, + "loss": 0.8206, + "step": 5655 + }, + { + "epoch": 1.0069444444444444, + "grad_norm": 0.6257100105285645, + "learning_rate": 0.00017046280299034067, + "loss": 0.9854, + "step": 5656 + }, + { + "epoch": 1.0071225071225072, + "grad_norm": 0.5081700682640076, + "learning_rate": 0.0001704528700175478, + "loss": 0.9478, + "step": 5657 + }, + { + "epoch": 1.0073005698005697, + "grad_norm": 0.598127543926239, + "learning_rate": 0.00017044293566437725, + "loss": 1.0721, + "step": 5658 + }, + { + "epoch": 1.0074786324786325, + "grad_norm": 0.5429877638816833, + "learning_rate": 0.00017043299993102376, + "loss": 0.9732, + "step": 5659 + }, + { + "epoch": 1.0076566951566952, + "grad_norm": 0.6006619334220886, + "learning_rate": 0.00017042306281768194, + "loss": 1.1262, + "step": 5660 + }, + { + "epoch": 1.0078347578347577, + "grad_norm": 0.48933324217796326, + "learning_rate": 0.00017041312432454646, + "loss": 0.8596, + "step": 5661 + }, + { + "epoch": 1.0080128205128205, + "grad_norm": 0.5902166366577148, + "learning_rate": 0.0001704031844518121, + "loss": 1.1035, + "step": 5662 + }, + { + "epoch": 1.0081908831908832, + "grad_norm": 0.523597776889801, + "learning_rate": 0.0001703932431996736, + "loss": 0.7117, + "step": 5663 + }, + { + "epoch": 1.0083689458689458, + "grad_norm": 0.6313928365707397, + "learning_rate": 0.00017038330056832573, + "loss": 1.0204, + "step": 5664 + }, + { + "epoch": 1.0085470085470085, + "grad_norm": 0.5627471804618835, + "learning_rate": 0.00017037335655796328, + "loss": 0.7648, + "step": 5665 + }, + { + "epoch": 1.0087250712250713, + "grad_norm": 0.5817851424217224, + "learning_rate": 0.0001703634111687811, + "loss": 1.0452, + "step": 5666 + }, + { + "epoch": 1.0089031339031338, + "grad_norm": 0.5143535137176514, + "learning_rate": 0.00017035346440097407, + "loss": 0.9788, + "step": 5667 + }, + { + "epoch": 
1.0090811965811965, + "grad_norm": 0.5331187844276428, + "learning_rate": 0.000170343516254737, + "loss": 0.7584, + "step": 5668 + }, + { + "epoch": 1.0092592592592593, + "grad_norm": 0.5723634362220764, + "learning_rate": 0.00017033356673026487, + "loss": 0.9435, + "step": 5669 + }, + { + "epoch": 1.0094373219373218, + "grad_norm": 0.6012297868728638, + "learning_rate": 0.00017032361582775265, + "loss": 1.142, + "step": 5670 + }, + { + "epoch": 1.0096153846153846, + "grad_norm": 0.6161282658576965, + "learning_rate": 0.00017031366354739523, + "loss": 1.2823, + "step": 5671 + }, + { + "epoch": 1.0097934472934473, + "grad_norm": 0.5088054537773132, + "learning_rate": 0.00017030370988938763, + "loss": 0.9743, + "step": 5672 + }, + { + "epoch": 1.0099715099715099, + "grad_norm": 0.512003481388092, + "learning_rate": 0.0001702937548539249, + "loss": 0.9112, + "step": 5673 + }, + { + "epoch": 1.0101495726495726, + "grad_norm": 0.5565149784088135, + "learning_rate": 0.00017028379844120207, + "loss": 1.0074, + "step": 5674 + }, + { + "epoch": 1.0103276353276354, + "grad_norm": 0.6463099718093872, + "learning_rate": 0.00017027384065141418, + "loss": 1.175, + "step": 5675 + }, + { + "epoch": 1.010505698005698, + "grad_norm": 0.46999064087867737, + "learning_rate": 0.00017026388148475637, + "loss": 0.8429, + "step": 5676 + }, + { + "epoch": 1.0106837606837606, + "grad_norm": 0.5617384910583496, + "learning_rate": 0.00017025392094142377, + "loss": 1.045, + "step": 5677 + }, + { + "epoch": 1.0108618233618234, + "grad_norm": 0.5156623721122742, + "learning_rate": 0.00017024395902161154, + "loss": 1.016, + "step": 5678 + }, + { + "epoch": 1.0110398860398861, + "grad_norm": 0.5693390369415283, + "learning_rate": 0.00017023399572551484, + "loss": 0.8616, + "step": 5679 + }, + { + "epoch": 1.0112179487179487, + "grad_norm": 0.5234879851341248, + "learning_rate": 0.00017022403105332892, + "loss": 0.9244, + "step": 5680 + }, + { + "epoch": 1.0113960113960114, + "grad_norm": 
0.6513097286224365, + "learning_rate": 0.00017021406500524893, + "loss": 0.9565, + "step": 5681 + }, + { + "epoch": 1.0115740740740742, + "grad_norm": 0.5788878202438354, + "learning_rate": 0.00017020409758147022, + "loss": 0.8994, + "step": 5682 + }, + { + "epoch": 1.0117521367521367, + "grad_norm": 0.5495247840881348, + "learning_rate": 0.00017019412878218807, + "loss": 0.9371, + "step": 5683 + }, + { + "epoch": 1.0119301994301995, + "grad_norm": 0.639045238494873, + "learning_rate": 0.00017018415860759777, + "loss": 1.0297, + "step": 5684 + }, + { + "epoch": 1.0121082621082622, + "grad_norm": 0.5167784690856934, + "learning_rate": 0.0001701741870578947, + "loss": 0.8974, + "step": 5685 + }, + { + "epoch": 1.0122863247863247, + "grad_norm": 0.6131011247634888, + "learning_rate": 0.00017016421413327417, + "loss": 1.13, + "step": 5686 + }, + { + "epoch": 1.0124643874643875, + "grad_norm": 0.4804688096046448, + "learning_rate": 0.00017015423983393166, + "loss": 1.0098, + "step": 5687 + }, + { + "epoch": 1.0126424501424502, + "grad_norm": 0.6605221629142761, + "learning_rate": 0.00017014426416006253, + "loss": 1.1123, + "step": 5688 + }, + { + "epoch": 1.0128205128205128, + "grad_norm": 0.5523666739463806, + "learning_rate": 0.00017013428711186226, + "loss": 0.8226, + "step": 5689 + }, + { + "epoch": 1.0129985754985755, + "grad_norm": 0.6012941598892212, + "learning_rate": 0.00017012430868952632, + "loss": 0.8915, + "step": 5690 + }, + { + "epoch": 1.0131766381766383, + "grad_norm": 0.5830875039100647, + "learning_rate": 0.00017011432889325022, + "loss": 1.021, + "step": 5691 + }, + { + "epoch": 1.0133547008547008, + "grad_norm": 0.5546056032180786, + "learning_rate": 0.0001701043477232295, + "loss": 0.7656, + "step": 5692 + }, + { + "epoch": 1.0135327635327636, + "grad_norm": 0.5592601299285889, + "learning_rate": 0.0001700943651796597, + "loss": 1.0172, + "step": 5693 + }, + { + "epoch": 1.0137108262108263, + "grad_norm": 0.5708866715431213, + "learning_rate": 
0.00017008438126273645, + "loss": 1.0012, + "step": 5694 + }, + { + "epoch": 1.0138888888888888, + "grad_norm": 0.6856338381767273, + "learning_rate": 0.0001700743959726553, + "loss": 1.1278, + "step": 5695 + }, + { + "epoch": 1.0140669515669516, + "grad_norm": 0.6523802876472473, + "learning_rate": 0.000170064409309612, + "loss": 1.0406, + "step": 5696 + }, + { + "epoch": 1.0142450142450143, + "grad_norm": 0.6653079986572266, + "learning_rate": 0.00017005442127380208, + "loss": 1.1086, + "step": 5697 + }, + { + "epoch": 1.0144230769230769, + "grad_norm": 0.5841104388237, + "learning_rate": 0.00017004443186542133, + "loss": 0.9335, + "step": 5698 + }, + { + "epoch": 1.0146011396011396, + "grad_norm": 0.5696784257888794, + "learning_rate": 0.0001700344410846654, + "loss": 1.0247, + "step": 5699 + }, + { + "epoch": 1.0147792022792024, + "grad_norm": 0.7135653495788574, + "learning_rate": 0.00017002444893173013, + "loss": 1.0259, + "step": 5700 + }, + { + "epoch": 1.014957264957265, + "grad_norm": 0.5806999802589417, + "learning_rate": 0.00017001445540681124, + "loss": 1.0053, + "step": 5701 + }, + { + "epoch": 1.0151353276353277, + "grad_norm": 0.5298715829849243, + "learning_rate": 0.0001700044605101045, + "loss": 0.9415, + "step": 5702 + }, + { + "epoch": 1.0153133903133904, + "grad_norm": 0.5817379951477051, + "learning_rate": 0.0001699944642418058, + "loss": 1.0906, + "step": 5703 + }, + { + "epoch": 1.015491452991453, + "grad_norm": 0.6564923524856567, + "learning_rate": 0.00016998446660211098, + "loss": 0.9933, + "step": 5704 + }, + { + "epoch": 1.0156695156695157, + "grad_norm": 0.6547308564186096, + "learning_rate": 0.00016997446759121592, + "loss": 1.0045, + "step": 5705 + }, + { + "epoch": 1.0158475783475784, + "grad_norm": 0.5763013958930969, + "learning_rate": 0.00016996446720931652, + "loss": 1.0898, + "step": 5706 + }, + { + "epoch": 1.016025641025641, + "grad_norm": 0.6118074059486389, + "learning_rate": 0.00016995446545660871, + "loss": 0.9398, + 
"step": 5707 + }, + { + "epoch": 1.0162037037037037, + "grad_norm": 0.6810526251792908, + "learning_rate": 0.0001699444623332885, + "loss": 1.0968, + "step": 5708 + }, + { + "epoch": 1.0163817663817665, + "grad_norm": 0.5292752981185913, + "learning_rate": 0.00016993445783955184, + "loss": 0.7549, + "step": 5709 + }, + { + "epoch": 1.016559829059829, + "grad_norm": 0.6014277935028076, + "learning_rate": 0.00016992445197559474, + "loss": 1.1711, + "step": 5710 + }, + { + "epoch": 1.0167378917378918, + "grad_norm": 0.5089772343635559, + "learning_rate": 0.00016991444474161326, + "loss": 0.9188, + "step": 5711 + }, + { + "epoch": 1.0169159544159545, + "grad_norm": 0.567193865776062, + "learning_rate": 0.0001699044361378035, + "loss": 0.7462, + "step": 5712 + }, + { + "epoch": 1.017094017094017, + "grad_norm": 0.5638598799705505, + "learning_rate": 0.00016989442616436147, + "loss": 0.9643, + "step": 5713 + }, + { + "epoch": 1.0172720797720798, + "grad_norm": 0.5634039640426636, + "learning_rate": 0.0001698844148214834, + "loss": 1.0141, + "step": 5714 + }, + { + "epoch": 1.0174501424501425, + "grad_norm": 0.5326652526855469, + "learning_rate": 0.00016987440210936537, + "loss": 0.865, + "step": 5715 + }, + { + "epoch": 1.017628205128205, + "grad_norm": 0.5858046412467957, + "learning_rate": 0.0001698643880282036, + "loss": 0.9561, + "step": 5716 + }, + { + "epoch": 1.0178062678062678, + "grad_norm": 0.6424698829650879, + "learning_rate": 0.00016985437257819428, + "loss": 1.0169, + "step": 5717 + }, + { + "epoch": 1.0179843304843306, + "grad_norm": 0.6294280290603638, + "learning_rate": 0.00016984435575953364, + "loss": 1.0438, + "step": 5718 + }, + { + "epoch": 1.018162393162393, + "grad_norm": 0.5533088445663452, + "learning_rate": 0.00016983433757241788, + "loss": 0.8901, + "step": 5719 + }, + { + "epoch": 1.0183404558404558, + "grad_norm": 0.5148718953132629, + "learning_rate": 0.00016982431801704342, + "loss": 0.9201, + "step": 5720 + }, + { + "epoch": 
1.0185185185185186, + "grad_norm": 0.5609371662139893, + "learning_rate": 0.00016981429709360645, + "loss": 0.9347, + "step": 5721 + }, + { + "epoch": 1.0186965811965811, + "grad_norm": 0.5502731204032898, + "learning_rate": 0.00016980427480230338, + "loss": 1.0508, + "step": 5722 + }, + { + "epoch": 1.0188746438746439, + "grad_norm": 0.5880394577980042, + "learning_rate": 0.00016979425114333055, + "loss": 1.1258, + "step": 5723 + }, + { + "epoch": 1.0190527065527066, + "grad_norm": 0.5569866895675659, + "learning_rate": 0.0001697842261168843, + "loss": 0.9186, + "step": 5724 + }, + { + "epoch": 1.0192307692307692, + "grad_norm": 0.7468093037605286, + "learning_rate": 0.00016977419972316116, + "loss": 1.2066, + "step": 5725 + }, + { + "epoch": 1.019408831908832, + "grad_norm": 0.6041515469551086, + "learning_rate": 0.00016976417196235753, + "loss": 0.939, + "step": 5726 + }, + { + "epoch": 1.0195868945868947, + "grad_norm": 0.6102641224861145, + "learning_rate": 0.00016975414283466983, + "loss": 0.8334, + "step": 5727 + }, + { + "epoch": 1.0197649572649572, + "grad_norm": 0.5418640375137329, + "learning_rate": 0.00016974411234029467, + "loss": 0.8072, + "step": 5728 + }, + { + "epoch": 1.01994301994302, + "grad_norm": 0.6569705605506897, + "learning_rate": 0.00016973408047942843, + "loss": 1.103, + "step": 5729 + }, + { + "epoch": 1.0201210826210827, + "grad_norm": 0.5778102278709412, + "learning_rate": 0.00016972404725226778, + "loss": 0.9353, + "step": 5730 + }, + { + "epoch": 1.0202991452991452, + "grad_norm": 0.5474382638931274, + "learning_rate": 0.0001697140126590093, + "loss": 1.0009, + "step": 5731 + }, + { + "epoch": 1.020477207977208, + "grad_norm": 0.5869506597518921, + "learning_rate": 0.00016970397669984947, + "loss": 1.0027, + "step": 5732 + }, + { + "epoch": 1.0206552706552707, + "grad_norm": 0.5078117251396179, + "learning_rate": 0.00016969393937498508, + "loss": 0.8316, + "step": 5733 + }, + { + "epoch": 1.0208333333333333, + "grad_norm": 
0.5488452911376953, + "learning_rate": 0.0001696839006846127, + "loss": 0.8438, + "step": 5734 + }, + { + "epoch": 1.021011396011396, + "grad_norm": 0.5921052098274231, + "learning_rate": 0.00016967386062892908, + "loss": 0.9147, + "step": 5735 + }, + { + "epoch": 1.0211894586894588, + "grad_norm": 0.5486881136894226, + "learning_rate": 0.00016966381920813085, + "loss": 0.7619, + "step": 5736 + }, + { + "epoch": 1.0213675213675213, + "grad_norm": 0.5250689387321472, + "learning_rate": 0.00016965377642241483, + "loss": 0.9192, + "step": 5737 + }, + { + "epoch": 1.021545584045584, + "grad_norm": 0.5355087518692017, + "learning_rate": 0.00016964373227197773, + "loss": 0.954, + "step": 5738 + }, + { + "epoch": 1.0217236467236468, + "grad_norm": 0.6758780479431152, + "learning_rate": 0.0001696336867570164, + "loss": 1.1257, + "step": 5739 + }, + { + "epoch": 1.0219017094017093, + "grad_norm": 0.6361044049263, + "learning_rate": 0.00016962363987772756, + "loss": 1.0889, + "step": 5740 + }, + { + "epoch": 1.022079772079772, + "grad_norm": 0.5802326798439026, + "learning_rate": 0.00016961359163430819, + "loss": 0.8966, + "step": 5741 + }, + { + "epoch": 1.0222578347578348, + "grad_norm": 0.5535712242126465, + "learning_rate": 0.00016960354202695508, + "loss": 1.0007, + "step": 5742 + }, + { + "epoch": 1.0224358974358974, + "grad_norm": 0.5469220280647278, + "learning_rate": 0.00016959349105586516, + "loss": 0.8202, + "step": 5743 + }, + { + "epoch": 1.02261396011396, + "grad_norm": 0.5533008575439453, + "learning_rate": 0.00016958343872123534, + "loss": 0.9576, + "step": 5744 + }, + { + "epoch": 1.0227920227920229, + "grad_norm": 0.615132749080658, + "learning_rate": 0.00016957338502326258, + "loss": 0.8719, + "step": 5745 + }, + { + "epoch": 1.0229700854700854, + "grad_norm": 0.519075334072113, + "learning_rate": 0.0001695633299621439, + "loss": 0.8309, + "step": 5746 + }, + { + "epoch": 1.0231481481481481, + "grad_norm": 0.6249759197235107, + "learning_rate": 
0.00016955327353807624, + "loss": 1.151, + "step": 5747 + }, + { + "epoch": 1.023326210826211, + "grad_norm": 0.560299277305603, + "learning_rate": 0.00016954321575125668, + "loss": 0.7889, + "step": 5748 + }, + { + "epoch": 1.0235042735042734, + "grad_norm": 0.5735262036323547, + "learning_rate": 0.0001695331566018823, + "loss": 0.8794, + "step": 5749 + }, + { + "epoch": 1.0236823361823362, + "grad_norm": 0.5893994569778442, + "learning_rate": 0.00016952309609015012, + "loss": 0.9696, + "step": 5750 + }, + { + "epoch": 1.023860398860399, + "grad_norm": 0.6064512133598328, + "learning_rate": 0.0001695130342162573, + "loss": 0.9771, + "step": 5751 + }, + { + "epoch": 1.0240384615384615, + "grad_norm": 0.5833427309989929, + "learning_rate": 0.00016950297098040099, + "loss": 1.1768, + "step": 5752 + }, + { + "epoch": 1.0242165242165242, + "grad_norm": 0.5940282344818115, + "learning_rate": 0.00016949290638277833, + "loss": 1.0758, + "step": 5753 + }, + { + "epoch": 1.024394586894587, + "grad_norm": 0.5267124772071838, + "learning_rate": 0.00016948284042358656, + "loss": 0.772, + "step": 5754 + }, + { + "epoch": 1.0245726495726495, + "grad_norm": 0.6217982172966003, + "learning_rate": 0.00016947277310302284, + "loss": 0.8583, + "step": 5755 + }, + { + "epoch": 1.0247507122507122, + "grad_norm": 0.6192215085029602, + "learning_rate": 0.00016946270442128443, + "loss": 0.9148, + "step": 5756 + }, + { + "epoch": 1.024928774928775, + "grad_norm": 0.5337123870849609, + "learning_rate": 0.00016945263437856867, + "loss": 1.0054, + "step": 5757 + }, + { + "epoch": 1.0251068376068375, + "grad_norm": 0.5462040901184082, + "learning_rate": 0.00016944256297507276, + "loss": 1.1097, + "step": 5758 + }, + { + "epoch": 1.0252849002849003, + "grad_norm": 0.5606170892715454, + "learning_rate": 0.00016943249021099415, + "loss": 1.0192, + "step": 5759 + }, + { + "epoch": 1.025462962962963, + "grad_norm": 0.636974573135376, + "learning_rate": 0.00016942241608653008, + "loss": 1.0241, + 
"step": 5760 + }, + { + "epoch": 1.0256410256410255, + "grad_norm": 0.4895164966583252, + "learning_rate": 0.00016941234060187797, + "loss": 0.9057, + "step": 5761 + }, + { + "epoch": 1.0258190883190883, + "grad_norm": 0.5810303092002869, + "learning_rate": 0.00016940226375723527, + "loss": 1.0809, + "step": 5762 + }, + { + "epoch": 1.025997150997151, + "grad_norm": 0.6043853163719177, + "learning_rate": 0.00016939218555279937, + "loss": 1.0685, + "step": 5763 + }, + { + "epoch": 1.0261752136752136, + "grad_norm": 0.5827188491821289, + "learning_rate": 0.00016938210598876774, + "loss": 1.0236, + "step": 5764 + }, + { + "epoch": 1.0263532763532763, + "grad_norm": 0.6677887439727783, + "learning_rate": 0.0001693720250653379, + "loss": 1.0586, + "step": 5765 + }, + { + "epoch": 1.026531339031339, + "grad_norm": 0.558051347732544, + "learning_rate": 0.0001693619427827073, + "loss": 0.745, + "step": 5766 + }, + { + "epoch": 1.0267094017094016, + "grad_norm": 0.6336706280708313, + "learning_rate": 0.0001693518591410735, + "loss": 1.0658, + "step": 5767 + }, + { + "epoch": 1.0268874643874644, + "grad_norm": 0.7077126502990723, + "learning_rate": 0.00016934177414063416, + "loss": 1.18, + "step": 5768 + }, + { + "epoch": 1.0270655270655271, + "grad_norm": 0.5342326760292053, + "learning_rate": 0.00016933168778158675, + "loss": 0.8347, + "step": 5769 + }, + { + "epoch": 1.0272435897435896, + "grad_norm": 0.6116416454315186, + "learning_rate": 0.00016932160006412895, + "loss": 1.0648, + "step": 5770 + }, + { + "epoch": 1.0274216524216524, + "grad_norm": 0.5411320924758911, + "learning_rate": 0.0001693115109884584, + "loss": 1.0756, + "step": 5771 + }, + { + "epoch": 1.0275997150997151, + "grad_norm": 0.5549847483634949, + "learning_rate": 0.00016930142055477277, + "loss": 0.7259, + "step": 5772 + }, + { + "epoch": 1.0277777777777777, + "grad_norm": 0.549010694026947, + "learning_rate": 0.00016929132876326977, + "loss": 0.9488, + "step": 5773 + }, + { + "epoch": 
1.0279558404558404, + "grad_norm": 0.6302017569541931, + "learning_rate": 0.00016928123561414714, + "loss": 0.8851, + "step": 5774 + }, + { + "epoch": 1.0281339031339032, + "grad_norm": 0.5831273198127747, + "learning_rate": 0.00016927114110760257, + "loss": 0.7841, + "step": 5775 + }, + { + "epoch": 1.0283119658119657, + "grad_norm": 0.5528474450111389, + "learning_rate": 0.00016926104524383394, + "loss": 1.0108, + "step": 5776 + }, + { + "epoch": 1.0284900284900285, + "grad_norm": 0.6279126405715942, + "learning_rate": 0.00016925094802303897, + "loss": 0.8632, + "step": 5777 + }, + { + "epoch": 1.0286680911680912, + "grad_norm": 0.6783218383789062, + "learning_rate": 0.00016924084944541554, + "loss": 1.0746, + "step": 5778 + }, + { + "epoch": 1.0288461538461537, + "grad_norm": 0.5823925137519836, + "learning_rate": 0.00016923074951116153, + "loss": 1.0486, + "step": 5779 + }, + { + "epoch": 1.0290242165242165, + "grad_norm": 0.6095981597900391, + "learning_rate": 0.00016922064822047473, + "loss": 0.8113, + "step": 5780 + }, + { + "epoch": 1.0292022792022792, + "grad_norm": 0.7887664437294006, + "learning_rate": 0.00016921054557355317, + "loss": 1.2411, + "step": 5781 + }, + { + "epoch": 1.0293803418803418, + "grad_norm": 0.6511263251304626, + "learning_rate": 0.00016920044157059475, + "loss": 0.924, + "step": 5782 + }, + { + "epoch": 1.0295584045584045, + "grad_norm": 0.6045661568641663, + "learning_rate": 0.00016919033621179744, + "loss": 0.8373, + "step": 5783 + }, + { + "epoch": 1.0297364672364673, + "grad_norm": 0.6914188861846924, + "learning_rate": 0.0001691802294973592, + "loss": 0.9589, + "step": 5784 + }, + { + "epoch": 1.0299145299145298, + "grad_norm": 0.6483730673789978, + "learning_rate": 0.00016917012142747805, + "loss": 0.9871, + "step": 5785 + }, + { + "epoch": 1.0300925925925926, + "grad_norm": 0.5775033235549927, + "learning_rate": 0.0001691600120023521, + "loss": 1.0591, + "step": 5786 + }, + { + "epoch": 1.0302706552706553, + "grad_norm": 
0.6206814646720886, + "learning_rate": 0.00016914990122217932, + "loss": 0.9126, + "step": 5787 + }, + { + "epoch": 1.0304487179487178, + "grad_norm": 0.5422028303146362, + "learning_rate": 0.00016913978908715796, + "loss": 0.8227, + "step": 5788 + }, + { + "epoch": 1.0306267806267806, + "grad_norm": 0.5824416875839233, + "learning_rate": 0.000169129675597486, + "loss": 1.111, + "step": 5789 + }, + { + "epoch": 1.0308048433048433, + "grad_norm": 0.5419015884399414, + "learning_rate": 0.00016911956075336165, + "loss": 0.8941, + "step": 5790 + }, + { + "epoch": 1.0309829059829059, + "grad_norm": 0.6171557903289795, + "learning_rate": 0.0001691094445549831, + "loss": 0.8679, + "step": 5791 + }, + { + "epoch": 1.0311609686609686, + "grad_norm": 0.6136980056762695, + "learning_rate": 0.00016909932700254855, + "loss": 0.9266, + "step": 5792 + }, + { + "epoch": 1.0313390313390314, + "grad_norm": 0.6275020241737366, + "learning_rate": 0.00016908920809625624, + "loss": 1.0828, + "step": 5793 + }, + { + "epoch": 1.0315170940170941, + "grad_norm": 0.6538251638412476, + "learning_rate": 0.0001690790878363044, + "loss": 0.8413, + "step": 5794 + }, + { + "epoch": 1.0316951566951567, + "grad_norm": 0.5981295108795166, + "learning_rate": 0.00016906896622289136, + "loss": 0.9845, + "step": 5795 + }, + { + "epoch": 1.0318732193732194, + "grad_norm": 0.5390967130661011, + "learning_rate": 0.00016905884325621538, + "loss": 0.8755, + "step": 5796 + }, + { + "epoch": 1.032051282051282, + "grad_norm": 0.5534448623657227, + "learning_rate": 0.00016904871893647482, + "loss": 1.1868, + "step": 5797 + }, + { + "epoch": 1.0322293447293447, + "grad_norm": 0.664556086063385, + "learning_rate": 0.00016903859326386806, + "loss": 1.1418, + "step": 5798 + }, + { + "epoch": 1.0324074074074074, + "grad_norm": 0.5737143158912659, + "learning_rate": 0.00016902846623859346, + "loss": 1.124, + "step": 5799 + }, + { + "epoch": 1.0325854700854702, + "grad_norm": 0.6499935388565063, + "learning_rate": 
0.0001690183378608495, + "loss": 1.0331, + "step": 5800 + }, + { + "epoch": 1.0327635327635327, + "grad_norm": 0.5721518993377686, + "learning_rate": 0.00016900820813083454, + "loss": 0.8664, + "step": 5801 + }, + { + "epoch": 1.0329415954415955, + "grad_norm": 0.5651140809059143, + "learning_rate": 0.0001689980770487471, + "loss": 1.1661, + "step": 5802 + }, + { + "epoch": 1.0331196581196582, + "grad_norm": 0.5935871005058289, + "learning_rate": 0.0001689879446147857, + "loss": 0.8722, + "step": 5803 + }, + { + "epoch": 1.0332977207977208, + "grad_norm": 0.5627842545509338, + "learning_rate": 0.00016897781082914884, + "loss": 1.0036, + "step": 5804 + }, + { + "epoch": 1.0334757834757835, + "grad_norm": 0.5866895914077759, + "learning_rate": 0.00016896767569203502, + "loss": 0.9739, + "step": 5805 + }, + { + "epoch": 1.0336538461538463, + "grad_norm": 0.5568059682846069, + "learning_rate": 0.0001689575392036429, + "loss": 0.7081, + "step": 5806 + }, + { + "epoch": 1.0338319088319088, + "grad_norm": 0.6054235100746155, + "learning_rate": 0.00016894740136417103, + "loss": 1.1168, + "step": 5807 + }, + { + "epoch": 1.0340099715099715, + "grad_norm": 0.5215454697608948, + "learning_rate": 0.00016893726217381805, + "loss": 0.9172, + "step": 5808 + }, + { + "epoch": 1.0341880341880343, + "grad_norm": 0.5415732860565186, + "learning_rate": 0.00016892712163278263, + "loss": 0.7812, + "step": 5809 + }, + { + "epoch": 1.0343660968660968, + "grad_norm": 0.6341692805290222, + "learning_rate": 0.00016891697974126345, + "loss": 1.0658, + "step": 5810 + }, + { + "epoch": 1.0345441595441596, + "grad_norm": 0.6326245665550232, + "learning_rate": 0.00016890683649945922, + "loss": 1.0134, + "step": 5811 + }, + { + "epoch": 1.0347222222222223, + "grad_norm": 0.5729571580886841, + "learning_rate": 0.00016889669190756868, + "loss": 0.9139, + "step": 5812 + }, + { + "epoch": 1.0349002849002849, + "grad_norm": 0.5912853479385376, + "learning_rate": 0.00016888654596579054, + "loss": 1.122, 
+ "step": 5813 + }, + { + "epoch": 1.0350783475783476, + "grad_norm": 0.8410450220108032, + "learning_rate": 0.00016887639867432368, + "loss": 1.3009, + "step": 5814 + }, + { + "epoch": 1.0352564102564104, + "grad_norm": 0.5416620969772339, + "learning_rate": 0.00016886625003336683, + "loss": 0.8751, + "step": 5815 + }, + { + "epoch": 1.0354344729344729, + "grad_norm": 0.6367851495742798, + "learning_rate": 0.0001688561000431189, + "loss": 0.956, + "step": 5816 + }, + { + "epoch": 1.0356125356125356, + "grad_norm": 0.4618827700614929, + "learning_rate": 0.0001688459487037787, + "loss": 0.5313, + "step": 5817 + }, + { + "epoch": 1.0357905982905984, + "grad_norm": 0.7139244079589844, + "learning_rate": 0.00016883579601554516, + "loss": 1.0787, + "step": 5818 + }, + { + "epoch": 1.035968660968661, + "grad_norm": 0.6896135210990906, + "learning_rate": 0.00016882564197861715, + "loss": 0.932, + "step": 5819 + }, + { + "epoch": 1.0361467236467237, + "grad_norm": 0.5889739394187927, + "learning_rate": 0.00016881548659319372, + "loss": 0.8852, + "step": 5820 + }, + { + "epoch": 1.0363247863247864, + "grad_norm": 0.5954701900482178, + "learning_rate": 0.00016880532985947375, + "loss": 0.8192, + "step": 5821 + }, + { + "epoch": 1.036502849002849, + "grad_norm": 0.6665091514587402, + "learning_rate": 0.00016879517177765627, + "loss": 0.9578, + "step": 5822 + }, + { + "epoch": 1.0366809116809117, + "grad_norm": 0.5990539789199829, + "learning_rate": 0.00016878501234794034, + "loss": 0.9797, + "step": 5823 + }, + { + "epoch": 1.0368589743589745, + "grad_norm": 0.596755862236023, + "learning_rate": 0.00016877485157052496, + "loss": 1.173, + "step": 5824 + }, + { + "epoch": 1.037037037037037, + "grad_norm": 0.544658362865448, + "learning_rate": 0.00016876468944560923, + "loss": 1.0742, + "step": 5825 + }, + { + "epoch": 1.0372150997150997, + "grad_norm": 0.5841910243034363, + "learning_rate": 0.00016875452597339225, + "loss": 1.029, + "step": 5826 + }, + { + "epoch": 
1.0373931623931625, + "grad_norm": 0.6508592963218689, + "learning_rate": 0.00016874436115407317, + "loss": 0.9883, + "step": 5827 + }, + { + "epoch": 1.037571225071225, + "grad_norm": 0.590050458908081, + "learning_rate": 0.00016873419498785114, + "loss": 1.0713, + "step": 5828 + }, + { + "epoch": 1.0377492877492878, + "grad_norm": 0.5386307239532471, + "learning_rate": 0.00016872402747492534, + "loss": 1.0159, + "step": 5829 + }, + { + "epoch": 1.0379273504273505, + "grad_norm": 0.6173896193504333, + "learning_rate": 0.00016871385861549497, + "loss": 1.0056, + "step": 5830 + }, + { + "epoch": 1.038105413105413, + "grad_norm": 0.5377787351608276, + "learning_rate": 0.0001687036884097593, + "loss": 0.8708, + "step": 5831 + }, + { + "epoch": 1.0382834757834758, + "grad_norm": 0.5753569006919861, + "learning_rate": 0.00016869351685791756, + "loss": 1.0529, + "step": 5832 + }, + { + "epoch": 1.0384615384615385, + "grad_norm": 0.6085895299911499, + "learning_rate": 0.00016868334396016906, + "loss": 1.1017, + "step": 5833 + }, + { + "epoch": 1.038639601139601, + "grad_norm": 0.6320509910583496, + "learning_rate": 0.0001686731697167131, + "loss": 1.0543, + "step": 5834 + }, + { + "epoch": 1.0388176638176638, + "grad_norm": 0.5691760778427124, + "learning_rate": 0.00016866299412774907, + "loss": 0.9975, + "step": 5835 + }, + { + "epoch": 1.0389957264957266, + "grad_norm": 0.5990765690803528, + "learning_rate": 0.0001686528171934763, + "loss": 0.8776, + "step": 5836 + }, + { + "epoch": 1.039173789173789, + "grad_norm": 0.6650477647781372, + "learning_rate": 0.00016864263891409415, + "loss": 1.0652, + "step": 5837 + }, + { + "epoch": 1.0393518518518519, + "grad_norm": 0.6050353646278381, + "learning_rate": 0.00016863245928980212, + "loss": 0.9313, + "step": 5838 + }, + { + "epoch": 1.0395299145299146, + "grad_norm": 0.587505578994751, + "learning_rate": 0.0001686222783207996, + "loss": 0.9892, + "step": 5839 + }, + { + "epoch": 1.0397079772079771, + "grad_norm": 
0.6310170292854309, + "learning_rate": 0.00016861209600728608, + "loss": 1.1045, + "step": 5840 + }, + { + "epoch": 1.03988603988604, + "grad_norm": 0.5683430433273315, + "learning_rate": 0.0001686019123494611, + "loss": 1.0507, + "step": 5841 + }, + { + "epoch": 1.0400641025641026, + "grad_norm": 0.6621488332748413, + "learning_rate": 0.00016859172734752414, + "loss": 0.9255, + "step": 5842 + }, + { + "epoch": 1.0402421652421652, + "grad_norm": 0.6197706460952759, + "learning_rate": 0.00016858154100167475, + "loss": 1.0031, + "step": 5843 + }, + { + "epoch": 1.040420227920228, + "grad_norm": 0.6805898547172546, + "learning_rate": 0.00016857135331211257, + "loss": 0.9901, + "step": 5844 + }, + { + "epoch": 1.0405982905982907, + "grad_norm": 0.5512405633926392, + "learning_rate": 0.00016856116427903714, + "loss": 1.0033, + "step": 5845 + }, + { + "epoch": 1.0407763532763532, + "grad_norm": 0.5643384456634521, + "learning_rate": 0.00016855097390264815, + "loss": 0.9136, + "step": 5846 + }, + { + "epoch": 1.040954415954416, + "grad_norm": 0.48351922631263733, + "learning_rate": 0.0001685407821831452, + "loss": 0.6163, + "step": 5847 + }, + { + "epoch": 1.0411324786324787, + "grad_norm": 0.6256039142608643, + "learning_rate": 0.00016853058912072802, + "loss": 0.9409, + "step": 5848 + }, + { + "epoch": 1.0413105413105412, + "grad_norm": 0.6539996862411499, + "learning_rate": 0.00016852039471559627, + "loss": 0.9367, + "step": 5849 + }, + { + "epoch": 1.041488603988604, + "grad_norm": 0.6192609667778015, + "learning_rate": 0.00016851019896794975, + "loss": 0.9631, + "step": 5850 + }, + { + "epoch": 1.0416666666666667, + "grad_norm": 0.613563060760498, + "learning_rate": 0.0001685000018779882, + "loss": 0.9132, + "step": 5851 + }, + { + "epoch": 1.0418447293447293, + "grad_norm": 0.6004200577735901, + "learning_rate": 0.0001684898034459114, + "loss": 1.1313, + "step": 5852 + }, + { + "epoch": 1.042022792022792, + "grad_norm": 0.6158567070960999, + "learning_rate": 
0.0001684796036719192, + "loss": 1.0253, + "step": 5853 + }, + { + "epoch": 1.0422008547008548, + "grad_norm": 0.6362335085868835, + "learning_rate": 0.00016846940255621143, + "loss": 0.93, + "step": 5854 + }, + { + "epoch": 1.0423789173789173, + "grad_norm": 0.6148427128791809, + "learning_rate": 0.00016845920009898787, + "loss": 0.9122, + "step": 5855 + }, + { + "epoch": 1.04255698005698, + "grad_norm": 0.5119984149932861, + "learning_rate": 0.00016844899630044858, + "loss": 0.7954, + "step": 5856 + }, + { + "epoch": 1.0427350427350428, + "grad_norm": 0.571849524974823, + "learning_rate": 0.00016843879116079338, + "loss": 0.8588, + "step": 5857 + }, + { + "epoch": 1.0429131054131053, + "grad_norm": 0.6173384785652161, + "learning_rate": 0.00016842858468022221, + "loss": 1.0475, + "step": 5858 + }, + { + "epoch": 1.043091168091168, + "grad_norm": 0.566114068031311, + "learning_rate": 0.0001684183768589351, + "loss": 0.8485, + "step": 5859 + }, + { + "epoch": 1.0432692307692308, + "grad_norm": 0.653134286403656, + "learning_rate": 0.000168408167697132, + "loss": 0.9976, + "step": 5860 + }, + { + "epoch": 1.0434472934472934, + "grad_norm": 0.63815838098526, + "learning_rate": 0.00016839795719501296, + "loss": 0.7091, + "step": 5861 + }, + { + "epoch": 1.0436253561253561, + "grad_norm": 0.5109001994132996, + "learning_rate": 0.00016838774535277805, + "loss": 0.7668, + "step": 5862 + }, + { + "epoch": 1.0438034188034189, + "grad_norm": 0.6741907596588135, + "learning_rate": 0.0001683775321706273, + "loss": 1.0493, + "step": 5863 + }, + { + "epoch": 1.0439814814814814, + "grad_norm": 0.6006115674972534, + "learning_rate": 0.0001683673176487609, + "loss": 0.9784, + "step": 5864 + }, + { + "epoch": 1.0441595441595442, + "grad_norm": 0.5504778027534485, + "learning_rate": 0.0001683571017873789, + "loss": 0.9718, + "step": 5865 + }, + { + "epoch": 1.044337606837607, + "grad_norm": 0.5713102221488953, + "learning_rate": 0.00016834688458668148, + "loss": 1.12, + "step": 5866 
+ }, + { + "epoch": 1.0445156695156694, + "grad_norm": 0.7878454923629761, + "learning_rate": 0.00016833666604686886, + "loss": 1.1803, + "step": 5867 + }, + { + "epoch": 1.0446937321937322, + "grad_norm": 0.582697331905365, + "learning_rate": 0.00016832644616814122, + "loss": 0.943, + "step": 5868 + }, + { + "epoch": 1.044871794871795, + "grad_norm": 0.5300645232200623, + "learning_rate": 0.00016831622495069878, + "loss": 0.9087, + "step": 5869 + }, + { + "epoch": 1.0450498575498575, + "grad_norm": 0.5627666115760803, + "learning_rate": 0.00016830600239474186, + "loss": 1.081, + "step": 5870 + }, + { + "epoch": 1.0452279202279202, + "grad_norm": 0.6760496497154236, + "learning_rate": 0.0001682957785004707, + "loss": 1.1098, + "step": 5871 + }, + { + "epoch": 1.045405982905983, + "grad_norm": 0.6424084901809692, + "learning_rate": 0.00016828555326808565, + "loss": 0.9657, + "step": 5872 + }, + { + "epoch": 1.0455840455840455, + "grad_norm": 0.5523313283920288, + "learning_rate": 0.000168275326697787, + "loss": 1.0163, + "step": 5873 + }, + { + "epoch": 1.0457621082621082, + "grad_norm": 0.5582337975502014, + "learning_rate": 0.00016826509878977518, + "loss": 0.8825, + "step": 5874 + }, + { + "epoch": 1.045940170940171, + "grad_norm": 0.5603214502334595, + "learning_rate": 0.00016825486954425055, + "loss": 0.9032, + "step": 5875 + }, + { + "epoch": 1.0461182336182335, + "grad_norm": 0.5944222807884216, + "learning_rate": 0.00016824463896141355, + "loss": 0.9384, + "step": 5876 + }, + { + "epoch": 1.0462962962962963, + "grad_norm": 0.6220229268074036, + "learning_rate": 0.00016823440704146457, + "loss": 0.8962, + "step": 5877 + }, + { + "epoch": 1.046474358974359, + "grad_norm": 0.5607972145080566, + "learning_rate": 0.0001682241737846042, + "loss": 0.9385, + "step": 5878 + }, + { + "epoch": 1.0466524216524216, + "grad_norm": 0.6206870079040527, + "learning_rate": 0.00016821393919103282, + "loss": 1.0597, + "step": 5879 + }, + { + "epoch": 1.0468304843304843, + 
"grad_norm": 0.5126399993896484, + "learning_rate": 0.000168203703260951, + "loss": 0.9403, + "step": 5880 + }, + { + "epoch": 1.047008547008547, + "grad_norm": 0.6569282412528992, + "learning_rate": 0.00016819346599455929, + "loss": 0.8124, + "step": 5881 + }, + { + "epoch": 1.0471866096866096, + "grad_norm": 0.6670137047767639, + "learning_rate": 0.0001681832273920583, + "loss": 1.1927, + "step": 5882 + }, + { + "epoch": 1.0473646723646723, + "grad_norm": 0.5403243899345398, + "learning_rate": 0.00016817298745364862, + "loss": 0.8539, + "step": 5883 + }, + { + "epoch": 1.047542735042735, + "grad_norm": 0.5500505566596985, + "learning_rate": 0.00016816274617953086, + "loss": 1.1064, + "step": 5884 + }, + { + "epoch": 1.0477207977207976, + "grad_norm": 0.5482703447341919, + "learning_rate": 0.00016815250356990566, + "loss": 0.7276, + "step": 5885 + }, + { + "epoch": 1.0478988603988604, + "grad_norm": 0.6290771961212158, + "learning_rate": 0.00016814225962497373, + "loss": 0.9018, + "step": 5886 + }, + { + "epoch": 1.0480769230769231, + "grad_norm": 0.6404094696044922, + "learning_rate": 0.00016813201434493578, + "loss": 1.0638, + "step": 5887 + }, + { + "epoch": 1.0482549857549857, + "grad_norm": 0.5484994053840637, + "learning_rate": 0.0001681217677299926, + "loss": 1.0033, + "step": 5888 + }, + { + "epoch": 1.0484330484330484, + "grad_norm": 0.6474852561950684, + "learning_rate": 0.0001681115197803448, + "loss": 1.1017, + "step": 5889 + }, + { + "epoch": 1.0486111111111112, + "grad_norm": 0.6186243295669556, + "learning_rate": 0.0001681012704961933, + "loss": 0.9978, + "step": 5890 + }, + { + "epoch": 1.0487891737891737, + "grad_norm": 0.6244034767150879, + "learning_rate": 0.00016809101987773887, + "loss": 0.9906, + "step": 5891 + }, + { + "epoch": 1.0489672364672364, + "grad_norm": 0.5893426537513733, + "learning_rate": 0.00016808076792518235, + "loss": 0.9345, + "step": 5892 + }, + { + "epoch": 1.0491452991452992, + "grad_norm": 0.6283876299858093, + 
"learning_rate": 0.0001680705146387246, + "loss": 1.0041, + "step": 5893 + }, + { + "epoch": 1.0493233618233617, + "grad_norm": 0.6075255870819092, + "learning_rate": 0.00016806026001856656, + "loss": 1.0661, + "step": 5894 + }, + { + "epoch": 1.0495014245014245, + "grad_norm": 0.5350496768951416, + "learning_rate": 0.00016805000406490907, + "loss": 0.6789, + "step": 5895 + }, + { + "epoch": 1.0496794871794872, + "grad_norm": 0.5380373597145081, + "learning_rate": 0.00016803974677795312, + "loss": 0.8889, + "step": 5896 + }, + { + "epoch": 1.0498575498575498, + "grad_norm": 0.6145668029785156, + "learning_rate": 0.0001680294881578997, + "loss": 0.8952, + "step": 5897 + }, + { + "epoch": 1.0500356125356125, + "grad_norm": 0.5666532516479492, + "learning_rate": 0.00016801922820494972, + "loss": 0.9697, + "step": 5898 + }, + { + "epoch": 1.0502136752136753, + "grad_norm": 0.5352747440338135, + "learning_rate": 0.0001680089669193043, + "loss": 0.9619, + "step": 5899 + }, + { + "epoch": 1.0503917378917378, + "grad_norm": 0.5405527949333191, + "learning_rate": 0.00016799870430116444, + "loss": 0.8733, + "step": 5900 + }, + { + "epoch": 1.0505698005698005, + "grad_norm": 0.5936748385429382, + "learning_rate": 0.00016798844035073124, + "loss": 0.8746, + "step": 5901 + }, + { + "epoch": 1.0507478632478633, + "grad_norm": 0.539652943611145, + "learning_rate": 0.00016797817506820578, + "loss": 0.8743, + "step": 5902 + }, + { + "epoch": 1.0509259259259258, + "grad_norm": 0.644528865814209, + "learning_rate": 0.00016796790845378915, + "loss": 0.9251, + "step": 5903 + }, + { + "epoch": 1.0511039886039886, + "grad_norm": 0.5429201126098633, + "learning_rate": 0.00016795764050768258, + "loss": 0.747, + "step": 5904 + }, + { + "epoch": 1.0512820512820513, + "grad_norm": 0.6432006359100342, + "learning_rate": 0.00016794737123008725, + "loss": 0.9166, + "step": 5905 + }, + { + "epoch": 1.0514601139601139, + "grad_norm": 0.6084117293357849, + "learning_rate": 0.00016793710062120427, + 
"loss": 1.0778, + "step": 5906 + }, + { + "epoch": 1.0516381766381766, + "grad_norm": 0.5351580381393433, + "learning_rate": 0.00016792682868123495, + "loss": 0.9124, + "step": 5907 + }, + { + "epoch": 1.0518162393162394, + "grad_norm": 0.7078854441642761, + "learning_rate": 0.00016791655541038053, + "loss": 1.1209, + "step": 5908 + }, + { + "epoch": 1.051994301994302, + "grad_norm": 0.5943832993507385, + "learning_rate": 0.0001679062808088423, + "loss": 0.9077, + "step": 5909 + }, + { + "epoch": 1.0521723646723646, + "grad_norm": 0.5216894745826721, + "learning_rate": 0.00016789600487682156, + "loss": 0.9866, + "step": 5910 + }, + { + "epoch": 1.0523504273504274, + "grad_norm": 0.738451361656189, + "learning_rate": 0.00016788572761451963, + "loss": 1.1611, + "step": 5911 + }, + { + "epoch": 1.05252849002849, + "grad_norm": 0.6411251425743103, + "learning_rate": 0.00016787544902213791, + "loss": 1.1481, + "step": 5912 + }, + { + "epoch": 1.0527065527065527, + "grad_norm": 0.6768319010734558, + "learning_rate": 0.00016786516909987774, + "loss": 0.8614, + "step": 5913 + }, + { + "epoch": 1.0528846153846154, + "grad_norm": 0.5838070511817932, + "learning_rate": 0.0001678548878479406, + "loss": 0.9719, + "step": 5914 + }, + { + "epoch": 1.0530626780626782, + "grad_norm": 0.541522741317749, + "learning_rate": 0.00016784460526652784, + "loss": 0.767, + "step": 5915 + }, + { + "epoch": 1.0532407407407407, + "grad_norm": 0.6064762473106384, + "learning_rate": 0.000167834321355841, + "loss": 1.0792, + "step": 5916 + }, + { + "epoch": 1.0534188034188035, + "grad_norm": 0.5515492558479309, + "learning_rate": 0.00016782403611608152, + "loss": 0.7897, + "step": 5917 + }, + { + "epoch": 1.0535968660968662, + "grad_norm": 0.6326262950897217, + "learning_rate": 0.000167813749547451, + "loss": 0.9279, + "step": 5918 + }, + { + "epoch": 1.0537749287749287, + "grad_norm": 0.6262009739875793, + "learning_rate": 0.0001678034616501509, + "loss": 0.9752, + "step": 5919 + }, + { + 
"epoch": 1.0539529914529915, + "grad_norm": 0.6049023270606995, + "learning_rate": 0.00016779317242438278, + "loss": 0.9167, + "step": 5920 + }, + { + "epoch": 1.0541310541310542, + "grad_norm": 0.6286031007766724, + "learning_rate": 0.0001677828818703483, + "loss": 1.1277, + "step": 5921 + }, + { + "epoch": 1.0543091168091168, + "grad_norm": 0.662086009979248, + "learning_rate": 0.00016777258998824907, + "loss": 1.0824, + "step": 5922 + }, + { + "epoch": 1.0544871794871795, + "grad_norm": 0.5358783006668091, + "learning_rate": 0.00016776229677828672, + "loss": 0.825, + "step": 5923 + }, + { + "epoch": 1.0546652421652423, + "grad_norm": 0.490326464176178, + "learning_rate": 0.00016775200224066294, + "loss": 0.7916, + "step": 5924 + }, + { + "epoch": 1.0548433048433048, + "grad_norm": 0.5940443277359009, + "learning_rate": 0.0001677417063755794, + "loss": 1.0121, + "step": 5925 + }, + { + "epoch": 1.0550213675213675, + "grad_norm": 0.5974507927894592, + "learning_rate": 0.00016773140918323787, + "loss": 0.7629, + "step": 5926 + }, + { + "epoch": 1.0551994301994303, + "grad_norm": 0.5747174024581909, + "learning_rate": 0.00016772111066384003, + "loss": 0.9373, + "step": 5927 + }, + { + "epoch": 1.0553774928774928, + "grad_norm": 0.5998024940490723, + "learning_rate": 0.00016771081081758772, + "loss": 0.8543, + "step": 5928 + }, + { + "epoch": 1.0555555555555556, + "grad_norm": 0.5771155953407288, + "learning_rate": 0.00016770050964468275, + "loss": 0.9108, + "step": 5929 + }, + { + "epoch": 1.0557336182336183, + "grad_norm": 0.5695661306381226, + "learning_rate": 0.00016769020714532692, + "loss": 0.8055, + "step": 5930 + }, + { + "epoch": 1.0559116809116809, + "grad_norm": 0.6164212226867676, + "learning_rate": 0.0001676799033197221, + "loss": 1.0917, + "step": 5931 + }, + { + "epoch": 1.0560897435897436, + "grad_norm": 0.6092487573623657, + "learning_rate": 0.00016766959816807018, + "loss": 0.9276, + "step": 5932 + }, + { + "epoch": 1.0562678062678064, + 
"grad_norm": 0.5595401525497437, + "learning_rate": 0.00016765929169057305, + "loss": 0.9435, + "step": 5933 + }, + { + "epoch": 1.056445868945869, + "grad_norm": 0.5875109434127808, + "learning_rate": 0.00016764898388743263, + "loss": 0.959, + "step": 5934 + }, + { + "epoch": 1.0566239316239316, + "grad_norm": 0.6045668721199036, + "learning_rate": 0.00016763867475885088, + "loss": 0.8636, + "step": 5935 + }, + { + "epoch": 1.0568019943019944, + "grad_norm": 0.6088171005249023, + "learning_rate": 0.00016762836430502987, + "loss": 0.6807, + "step": 5936 + }, + { + "epoch": 1.056980056980057, + "grad_norm": 0.6293274760246277, + "learning_rate": 0.00016761805252617148, + "loss": 1.042, + "step": 5937 + }, + { + "epoch": 1.0571581196581197, + "grad_norm": 0.588472843170166, + "learning_rate": 0.00016760773942247785, + "loss": 0.8896, + "step": 5938 + }, + { + "epoch": 1.0573361823361824, + "grad_norm": 0.4412326216697693, + "learning_rate": 0.000167597424994151, + "loss": 0.6727, + "step": 5939 + }, + { + "epoch": 1.057514245014245, + "grad_norm": 0.6086825132369995, + "learning_rate": 0.00016758710924139302, + "loss": 0.9908, + "step": 5940 + }, + { + "epoch": 1.0576923076923077, + "grad_norm": 0.6424705386161804, + "learning_rate": 0.00016757679216440608, + "loss": 1.0182, + "step": 5941 + }, + { + "epoch": 1.0578703703703705, + "grad_norm": 0.6610676050186157, + "learning_rate": 0.00016756647376339222, + "loss": 0.9645, + "step": 5942 + }, + { + "epoch": 1.058048433048433, + "grad_norm": 0.598292887210846, + "learning_rate": 0.0001675561540385537, + "loss": 0.9694, + "step": 5943 + }, + { + "epoch": 1.0582264957264957, + "grad_norm": 0.6941167116165161, + "learning_rate": 0.00016754583299009266, + "loss": 1.0786, + "step": 5944 + }, + { + "epoch": 1.0584045584045585, + "grad_norm": 0.6543232798576355, + "learning_rate": 0.00016753551061821133, + "loss": 1.0488, + "step": 5945 + }, + { + "epoch": 1.058582621082621, + "grad_norm": 0.606159508228302, + 
"learning_rate": 0.000167525186923112, + "loss": 0.9448, + "step": 5946 + }, + { + "epoch": 1.0587606837606838, + "grad_norm": 0.5051791071891785, + "learning_rate": 0.00016751486190499685, + "loss": 0.7485, + "step": 5947 + }, + { + "epoch": 1.0589387464387465, + "grad_norm": 0.6459367275238037, + "learning_rate": 0.00016750453556406826, + "loss": 1.0055, + "step": 5948 + }, + { + "epoch": 1.059116809116809, + "grad_norm": 0.551591157913208, + "learning_rate": 0.00016749420790052852, + "loss": 0.9717, + "step": 5949 + }, + { + "epoch": 1.0592948717948718, + "grad_norm": 0.5899214148521423, + "learning_rate": 0.00016748387891458, + "loss": 0.7774, + "step": 5950 + }, + { + "epoch": 1.0594729344729346, + "grad_norm": 0.582379162311554, + "learning_rate": 0.00016747354860642503, + "loss": 0.953, + "step": 5951 + }, + { + "epoch": 1.059650997150997, + "grad_norm": 0.6035816073417664, + "learning_rate": 0.00016746321697626605, + "loss": 1.1175, + "step": 5952 + }, + { + "epoch": 1.0598290598290598, + "grad_norm": 0.6476401686668396, + "learning_rate": 0.00016745288402430548, + "loss": 0.9448, + "step": 5953 + }, + { + "epoch": 1.0600071225071226, + "grad_norm": 0.6126405596733093, + "learning_rate": 0.00016744254975074578, + "loss": 0.882, + "step": 5954 + }, + { + "epoch": 1.0601851851851851, + "grad_norm": 0.5333579182624817, + "learning_rate": 0.0001674322141557894, + "loss": 0.9539, + "step": 5955 + }, + { + "epoch": 1.0603632478632479, + "grad_norm": 0.6085022687911987, + "learning_rate": 0.0001674218772396389, + "loss": 1.0028, + "step": 5956 + }, + { + "epoch": 1.0605413105413106, + "grad_norm": 0.5809528827667236, + "learning_rate": 0.0001674115390024967, + "loss": 0.84, + "step": 5957 + }, + { + "epoch": 1.0607193732193732, + "grad_norm": 0.5820229649543762, + "learning_rate": 0.00016740119944456548, + "loss": 0.9563, + "step": 5958 + }, + { + "epoch": 1.060897435897436, + "grad_norm": 0.6349015831947327, + "learning_rate": 0.00016739085856604775, + "loss": 
0.9739, + "step": 5959 + }, + { + "epoch": 1.0610754985754987, + "grad_norm": 0.6346020102500916, + "learning_rate": 0.00016738051636714616, + "loss": 0.907, + "step": 5960 + }, + { + "epoch": 1.0612535612535612, + "grad_norm": 0.5850573778152466, + "learning_rate": 0.0001673701728480633, + "loss": 1.0688, + "step": 5961 + }, + { + "epoch": 1.061431623931624, + "grad_norm": 0.6258122324943542, + "learning_rate": 0.00016735982800900184, + "loss": 0.9997, + "step": 5962 + }, + { + "epoch": 1.0616096866096867, + "grad_norm": 0.6744239330291748, + "learning_rate": 0.00016734948185016452, + "loss": 0.9431, + "step": 5963 + }, + { + "epoch": 1.0617877492877492, + "grad_norm": 0.5769457817077637, + "learning_rate": 0.000167339134371754, + "loss": 0.9658, + "step": 5964 + }, + { + "epoch": 1.061965811965812, + "grad_norm": 0.6385112404823303, + "learning_rate": 0.000167328785573973, + "loss": 1.0199, + "step": 5965 + }, + { + "epoch": 1.0621438746438747, + "grad_norm": 0.536522388458252, + "learning_rate": 0.00016731843545702435, + "loss": 0.8496, + "step": 5966 + }, + { + "epoch": 1.0623219373219372, + "grad_norm": 0.5978497862815857, + "learning_rate": 0.00016730808402111075, + "loss": 0.8536, + "step": 5967 + }, + { + "epoch": 1.0625, + "grad_norm": 0.6091681122779846, + "learning_rate": 0.0001672977312664351, + "loss": 1.0241, + "step": 5968 + }, + { + "epoch": 1.0626780626780628, + "grad_norm": 0.5807273387908936, + "learning_rate": 0.0001672873771932002, + "loss": 1.0522, + "step": 5969 + }, + { + "epoch": 1.0628561253561253, + "grad_norm": 0.6511965990066528, + "learning_rate": 0.0001672770218016089, + "loss": 0.8908, + "step": 5970 + }, + { + "epoch": 1.063034188034188, + "grad_norm": 0.6241721510887146, + "learning_rate": 0.00016726666509186416, + "loss": 0.9854, + "step": 5971 + }, + { + "epoch": 1.0632122507122508, + "grad_norm": 0.6112468242645264, + "learning_rate": 0.0001672563070641688, + "loss": 1.0091, + "step": 5972 + }, + { + "epoch": 1.0633903133903133, 
+ "grad_norm": 0.6135509014129639, + "learning_rate": 0.00016724594771872587, + "loss": 0.8891, + "step": 5973 + }, + { + "epoch": 1.063568376068376, + "grad_norm": 0.608384370803833, + "learning_rate": 0.00016723558705573823, + "loss": 1.017, + "step": 5974 + }, + { + "epoch": 1.0637464387464388, + "grad_norm": 0.6578485369682312, + "learning_rate": 0.00016722522507540895, + "loss": 0.9165, + "step": 5975 + }, + { + "epoch": 1.0639245014245013, + "grad_norm": 0.562588095664978, + "learning_rate": 0.00016721486177794106, + "loss": 0.7989, + "step": 5976 + }, + { + "epoch": 1.064102564102564, + "grad_norm": 0.5541409254074097, + "learning_rate": 0.00016720449716353753, + "loss": 0.8917, + "step": 5977 + }, + { + "epoch": 1.0642806267806268, + "grad_norm": 0.551167905330658, + "learning_rate": 0.0001671941312324015, + "loss": 0.824, + "step": 5978 + }, + { + "epoch": 1.0644586894586894, + "grad_norm": 0.6280582547187805, + "learning_rate": 0.0001671837639847361, + "loss": 0.9708, + "step": 5979 + }, + { + "epoch": 1.0646367521367521, + "grad_norm": 0.6389226913452148, + "learning_rate": 0.00016717339542074436, + "loss": 1.0081, + "step": 5980 + }, + { + "epoch": 1.0648148148148149, + "grad_norm": 0.6677889823913574, + "learning_rate": 0.0001671630255406295, + "loss": 1.2709, + "step": 5981 + }, + { + "epoch": 1.0649928774928774, + "grad_norm": 0.5748161673545837, + "learning_rate": 0.00016715265434459465, + "loss": 0.9157, + "step": 5982 + }, + { + "epoch": 1.0651709401709402, + "grad_norm": 0.6677651405334473, + "learning_rate": 0.00016714228183284304, + "loss": 1.1097, + "step": 5983 + }, + { + "epoch": 1.065349002849003, + "grad_norm": 0.6253604292869568, + "learning_rate": 0.0001671319080055779, + "loss": 0.9819, + "step": 5984 + }, + { + "epoch": 1.0655270655270654, + "grad_norm": 0.5548844337463379, + "learning_rate": 0.0001671215328630025, + "loss": 0.9324, + "step": 5985 + }, + { + "epoch": 1.0657051282051282, + "grad_norm": 0.622062623500824, + 
"learning_rate": 0.00016711115640532004, + "loss": 0.8749, + "step": 5986 + }, + { + "epoch": 1.065883190883191, + "grad_norm": 0.6496043801307678, + "learning_rate": 0.00016710077863273394, + "loss": 1.0642, + "step": 5987 + }, + { + "epoch": 1.0660612535612535, + "grad_norm": 0.6140534281730652, + "learning_rate": 0.00016709039954544746, + "loss": 0.8928, + "step": 5988 + }, + { + "epoch": 1.0662393162393162, + "grad_norm": 0.6387218236923218, + "learning_rate": 0.00016708001914366393, + "loss": 0.9525, + "step": 5989 + }, + { + "epoch": 1.066417378917379, + "grad_norm": 0.6119858026504517, + "learning_rate": 0.0001670696374275868, + "loss": 0.8663, + "step": 5990 + }, + { + "epoch": 1.0665954415954415, + "grad_norm": 0.6722040772438049, + "learning_rate": 0.00016705925439741947, + "loss": 1.1173, + "step": 5991 + }, + { + "epoch": 1.0667735042735043, + "grad_norm": 0.8226081132888794, + "learning_rate": 0.00016704887005336534, + "loss": 1.0572, + "step": 5992 + }, + { + "epoch": 1.066951566951567, + "grad_norm": 0.7248596549034119, + "learning_rate": 0.00016703848439562785, + "loss": 1.0493, + "step": 5993 + }, + { + "epoch": 1.0671296296296295, + "grad_norm": 0.7185787558555603, + "learning_rate": 0.00016702809742441058, + "loss": 1.1366, + "step": 5994 + }, + { + "epoch": 1.0673076923076923, + "grad_norm": 0.6118780970573425, + "learning_rate": 0.00016701770913991694, + "loss": 0.9557, + "step": 5995 + }, + { + "epoch": 1.067485754985755, + "grad_norm": 0.6472596526145935, + "learning_rate": 0.0001670073195423505, + "loss": 0.9977, + "step": 5996 + }, + { + "epoch": 1.0676638176638176, + "grad_norm": 0.7110133767127991, + "learning_rate": 0.00016699692863191484, + "loss": 1.1932, + "step": 5997 + }, + { + "epoch": 1.0678418803418803, + "grad_norm": 0.5827305912971497, + "learning_rate": 0.00016698653640881354, + "loss": 0.7641, + "step": 5998 + }, + { + "epoch": 1.068019943019943, + "grad_norm": 0.527208149433136, + "learning_rate": 0.00016697614287325017, + 
"loss": 0.7683, + "step": 5999 + }, + { + "epoch": 1.0681980056980056, + "grad_norm": 0.6680626273155212, + "learning_rate": 0.00016696574802542848, + "loss": 1.1748, + "step": 6000 + }, + { + "epoch": 1.0683760683760684, + "grad_norm": 0.5947227478027344, + "learning_rate": 0.00016695535186555204, + "loss": 1.0894, + "step": 6001 + }, + { + "epoch": 1.068554131054131, + "grad_norm": 0.5828250646591187, + "learning_rate": 0.00016694495439382456, + "loss": 0.9895, + "step": 6002 + }, + { + "epoch": 1.0687321937321936, + "grad_norm": 0.5897728204727173, + "learning_rate": 0.00016693455561044978, + "loss": 0.9686, + "step": 6003 + }, + { + "epoch": 1.0689102564102564, + "grad_norm": 0.5441751480102539, + "learning_rate": 0.0001669241555156314, + "loss": 0.8948, + "step": 6004 + }, + { + "epoch": 1.0690883190883191, + "grad_norm": 0.694199800491333, + "learning_rate": 0.00016691375410957324, + "loss": 1.0824, + "step": 6005 + }, + { + "epoch": 1.0692663817663817, + "grad_norm": 0.6077630519866943, + "learning_rate": 0.00016690335139247906, + "loss": 1.0931, + "step": 6006 + }, + { + "epoch": 1.0694444444444444, + "grad_norm": 0.6558539867401123, + "learning_rate": 0.0001668929473645527, + "loss": 1.0099, + "step": 6007 + }, + { + "epoch": 1.0696225071225072, + "grad_norm": 0.5722812414169312, + "learning_rate": 0.00016688254202599798, + "loss": 0.7999, + "step": 6008 + }, + { + "epoch": 1.0698005698005697, + "grad_norm": 0.5915400981903076, + "learning_rate": 0.0001668721353770188, + "loss": 0.7866, + "step": 6009 + }, + { + "epoch": 1.0699786324786325, + "grad_norm": 0.5290952324867249, + "learning_rate": 0.00016686172741781901, + "loss": 0.793, + "step": 6010 + }, + { + "epoch": 1.0701566951566952, + "grad_norm": 0.5501774549484253, + "learning_rate": 0.00016685131814860263, + "loss": 0.8775, + "step": 6011 + }, + { + "epoch": 1.0703347578347577, + "grad_norm": 0.6192594766616821, + "learning_rate": 0.00016684090756957347, + "loss": 1.1686, + "step": 6012 + }, + { + 
"epoch": 1.0705128205128205, + "grad_norm": 0.6640267968177795, + "learning_rate": 0.00016683049568093561, + "loss": 1.1789, + "step": 6013 + }, + { + "epoch": 1.0706908831908832, + "grad_norm": 0.552893877029419, + "learning_rate": 0.00016682008248289303, + "loss": 0.7957, + "step": 6014 + }, + { + "epoch": 1.0708689458689458, + "grad_norm": 0.6406302452087402, + "learning_rate": 0.00016680966797564972, + "loss": 1.1174, + "step": 6015 + }, + { + "epoch": 1.0710470085470085, + "grad_norm": Infinity, + "learning_rate": 0.00016680966797564972, + "loss": 0.9168, + "step": 6016 + }, + { + "epoch": 1.0712250712250713, + "grad_norm": 0.6384762525558472, + "learning_rate": 0.00016679925215940975, + "loss": 0.9831, + "step": 6017 + }, + { + "epoch": 1.071403133903134, + "grad_norm": 0.5906224846839905, + "learning_rate": 0.0001667888350343772, + "loss": 0.9167, + "step": 6018 + }, + { + "epoch": 1.0715811965811965, + "grad_norm": 0.658044695854187, + "learning_rate": 0.00016677841660075617, + "loss": 1.0075, + "step": 6019 + }, + { + "epoch": 1.0717592592592593, + "grad_norm": 0.6313242316246033, + "learning_rate": 0.00016676799685875078, + "loss": 0.8551, + "step": 6020 + }, + { + "epoch": 1.0719373219373218, + "grad_norm": 0.5891841053962708, + "learning_rate": 0.00016675757580856518, + "loss": 0.8475, + "step": 6021 + }, + { + "epoch": 1.0721153846153846, + "grad_norm": 0.581317126750946, + "learning_rate": 0.00016674715345040358, + "loss": 0.9308, + "step": 6022 + }, + { + "epoch": 1.0722934472934473, + "grad_norm": 0.5952537655830383, + "learning_rate": 0.00016673672978447017, + "loss": 0.9104, + "step": 6023 + }, + { + "epoch": 1.07247150997151, + "grad_norm": 0.5934227705001831, + "learning_rate": 0.00016672630481096915, + "loss": 0.9882, + "step": 6024 + }, + { + "epoch": 1.0726495726495726, + "grad_norm": 0.5867539048194885, + "learning_rate": 0.00016671587853010482, + "loss": 1.0186, + "step": 6025 + }, + { + "epoch": 1.0728276353276354, + "grad_norm": 
0.6002280116081238, + "learning_rate": 0.00016670545094208143, + "loss": 0.92, + "step": 6026 + }, + { + "epoch": 1.073005698005698, + "grad_norm": 0.6261683702468872, + "learning_rate": 0.0001666950220471033, + "loss": 0.9293, + "step": 6027 + }, + { + "epoch": 1.0731837606837606, + "grad_norm": 0.6128147840499878, + "learning_rate": 0.00016668459184537477, + "loss": 1.0787, + "step": 6028 + }, + { + "epoch": 1.0733618233618234, + "grad_norm": 0.62148118019104, + "learning_rate": 0.00016667416033710016, + "loss": 0.8843, + "step": 6029 + }, + { + "epoch": 1.0735398860398861, + "grad_norm": 0.7166166305541992, + "learning_rate": 0.0001666637275224839, + "loss": 0.8877, + "step": 6030 + }, + { + "epoch": 1.0737179487179487, + "grad_norm": 0.5275574922561646, + "learning_rate": 0.0001666532934017304, + "loss": 0.9604, + "step": 6031 + }, + { + "epoch": 1.0738960113960114, + "grad_norm": 0.8132784962654114, + "learning_rate": 0.00016664285797504406, + "loss": 1.0203, + "step": 6032 + }, + { + "epoch": 1.074074074074074, + "grad_norm": 0.5887695550918579, + "learning_rate": 0.00016663242124262935, + "loss": 0.8819, + "step": 6033 + }, + { + "epoch": 1.0742521367521367, + "grad_norm": 0.5552900433540344, + "learning_rate": 0.00016662198320469078, + "loss": 0.7542, + "step": 6034 + }, + { + "epoch": 1.0744301994301995, + "grad_norm": 0.6228970885276794, + "learning_rate": 0.0001666115438614328, + "loss": 1.0362, + "step": 6035 + }, + { + "epoch": 1.0746082621082622, + "grad_norm": 0.7193471789360046, + "learning_rate": 0.00016660110321306003, + "loss": 1.3073, + "step": 6036 + }, + { + "epoch": 1.0747863247863247, + "grad_norm": 0.6167412996292114, + "learning_rate": 0.000166590661259777, + "loss": 0.941, + "step": 6037 + }, + { + "epoch": 1.0749643874643875, + "grad_norm": 0.5716922879219055, + "learning_rate": 0.00016658021800178827, + "loss": 0.83, + "step": 6038 + }, + { + "epoch": 1.0751424501424502, + "grad_norm": 0.6404047012329102, + "learning_rate": 
0.00016656977343929848, + "loss": 1.0617, + "step": 6039 + }, + { + "epoch": 1.0753205128205128, + "grad_norm": 0.531395435333252, + "learning_rate": 0.00016655932757251226, + "loss": 0.7785, + "step": 6040 + }, + { + "epoch": 1.0754985754985755, + "grad_norm": 0.6468462347984314, + "learning_rate": 0.0001665488804016343, + "loss": 0.7893, + "step": 6041 + }, + { + "epoch": 1.0756766381766383, + "grad_norm": 0.6539653539657593, + "learning_rate": 0.00016653843192686925, + "loss": 1.1011, + "step": 6042 + }, + { + "epoch": 1.0758547008547008, + "grad_norm": 0.630107045173645, + "learning_rate": 0.0001665279821484219, + "loss": 0.9262, + "step": 6043 + }, + { + "epoch": 1.0760327635327636, + "grad_norm": 0.5875992774963379, + "learning_rate": 0.00016651753106649688, + "loss": 1.0501, + "step": 6044 + }, + { + "epoch": 1.0762108262108263, + "grad_norm": 0.573428750038147, + "learning_rate": 0.00016650707868129904, + "loss": 1.0672, + "step": 6045 + }, + { + "epoch": 1.0763888888888888, + "grad_norm": 0.6215469241142273, + "learning_rate": 0.00016649662499303316, + "loss": 0.868, + "step": 6046 + }, + { + "epoch": 1.0765669515669516, + "grad_norm": 0.6666893362998962, + "learning_rate": 0.00016648617000190402, + "loss": 1.0965, + "step": 6047 + }, + { + "epoch": 1.0767450142450143, + "grad_norm": 0.8343498706817627, + "learning_rate": 0.00016647571370811653, + "loss": 1.2302, + "step": 6048 + }, + { + "epoch": 1.0769230769230769, + "grad_norm": 0.591147780418396, + "learning_rate": 0.0001664652561118755, + "loss": 0.9698, + "step": 6049 + }, + { + "epoch": 1.0771011396011396, + "grad_norm": 0.573375940322876, + "learning_rate": 0.00016645479721338584, + "loss": 0.8798, + "step": 6050 + }, + { + "epoch": 1.0772792022792024, + "grad_norm": 0.4956737160682678, + "learning_rate": 0.00016644433701285246, + "loss": 0.6523, + "step": 6051 + }, + { + "epoch": 1.077457264957265, + "grad_norm": 0.6896619200706482, + "learning_rate": 0.00016643387551048034, + "loss": 0.8911, + 
"step": 6052 + }, + { + "epoch": 1.0776353276353277, + "grad_norm": 0.5820416808128357, + "learning_rate": 0.00016642341270647445, + "loss": 1.1486, + "step": 6053 + }, + { + "epoch": 1.0778133903133904, + "grad_norm": 0.611132025718689, + "learning_rate": 0.00016641294860103976, + "loss": 1.0705, + "step": 6054 + }, + { + "epoch": 1.077991452991453, + "grad_norm": 0.6705698370933533, + "learning_rate": 0.00016640248319438133, + "loss": 0.9826, + "step": 6055 + }, + { + "epoch": 1.0781695156695157, + "grad_norm": 0.5987013578414917, + "learning_rate": 0.00016639201648670416, + "loss": 1.0409, + "step": 6056 + }, + { + "epoch": 1.0783475783475784, + "grad_norm": 0.6707149744033813, + "learning_rate": 0.00016638154847821332, + "loss": 1.1332, + "step": 6057 + }, + { + "epoch": 1.078525641025641, + "grad_norm": 0.6400678157806396, + "learning_rate": 0.00016637107916911393, + "loss": 1.2559, + "step": 6058 + }, + { + "epoch": 1.0787037037037037, + "grad_norm": 0.6370311379432678, + "learning_rate": 0.00016636060855961115, + "loss": 0.9752, + "step": 6059 + }, + { + "epoch": 1.0788817663817665, + "grad_norm": 0.6116052269935608, + "learning_rate": 0.00016635013664991012, + "loss": 0.8364, + "step": 6060 + }, + { + "epoch": 1.079059829059829, + "grad_norm": 0.7932127714157104, + "learning_rate": 0.00016633966344021593, + "loss": 0.939, + "step": 6061 + }, + { + "epoch": 1.0792378917378918, + "grad_norm": 0.576249897480011, + "learning_rate": 0.00016632918893073385, + "loss": 0.8911, + "step": 6062 + }, + { + "epoch": 1.0794159544159545, + "grad_norm": 0.5456888675689697, + "learning_rate": 0.00016631871312166915, + "loss": 0.8646, + "step": 6063 + }, + { + "epoch": 1.079594017094017, + "grad_norm": 0.717522919178009, + "learning_rate": 0.000166308236013227, + "loss": 1.0814, + "step": 6064 + }, + { + "epoch": 1.0797720797720798, + "grad_norm": 0.6637256145477295, + "learning_rate": 0.0001662977576056127, + "loss": 1.22, + "step": 6065 + }, + { + "epoch": 
1.0799501424501425, + "grad_norm": 0.5846666693687439, + "learning_rate": 0.0001662872778990316, + "loss": 1.1745, + "step": 6066 + }, + { + "epoch": 1.080128205128205, + "grad_norm": 0.6611326336860657, + "learning_rate": 0.00016627679689368895, + "loss": 1.1262, + "step": 6067 + }, + { + "epoch": 1.0803062678062678, + "grad_norm": 0.6022892594337463, + "learning_rate": 0.00016626631458979015, + "loss": 0.9741, + "step": 6068 + }, + { + "epoch": 1.0804843304843306, + "grad_norm": 0.5862685441970825, + "learning_rate": 0.00016625583098754058, + "loss": 0.914, + "step": 6069 + }, + { + "epoch": 1.080662393162393, + "grad_norm": 0.7089241147041321, + "learning_rate": 0.00016624534608714563, + "loss": 1.0614, + "step": 6070 + }, + { + "epoch": 1.0808404558404558, + "grad_norm": 0.5286028981208801, + "learning_rate": 0.00016623485988881076, + "loss": 0.8756, + "step": 6071 + }, + { + "epoch": 1.0810185185185186, + "grad_norm": 0.6437101364135742, + "learning_rate": 0.00016622437239274137, + "loss": 0.7222, + "step": 6072 + }, + { + "epoch": 1.0811965811965811, + "grad_norm": 0.6197740435600281, + "learning_rate": 0.000166213883599143, + "loss": 0.7876, + "step": 6073 + }, + { + "epoch": 1.0813746438746439, + "grad_norm": 0.5889328122138977, + "learning_rate": 0.0001662033935082211, + "loss": 0.9587, + "step": 6074 + }, + { + "epoch": 1.0815527065527066, + "grad_norm": 0.5353847742080688, + "learning_rate": 0.00016619290212018125, + "loss": 0.8664, + "step": 6075 + }, + { + "epoch": 1.0817307692307692, + "grad_norm": 0.7202061414718628, + "learning_rate": 0.00016618240943522898, + "loss": 1.0429, + "step": 6076 + }, + { + "epoch": 1.081908831908832, + "grad_norm": 0.5831515192985535, + "learning_rate": 0.0001661719154535699, + "loss": 1.0323, + "step": 6077 + }, + { + "epoch": 1.0820868945868947, + "grad_norm": 0.6270500421524048, + "learning_rate": 0.00016616142017540953, + "loss": 0.9272, + "step": 6078 + }, + { + "epoch": 1.0822649572649572, + "grad_norm": 
0.6064695119857788, + "learning_rate": 0.00016615092360095364, + "loss": 1.0629, + "step": 6079 + }, + { + "epoch": 1.08244301994302, + "grad_norm": 0.5578122138977051, + "learning_rate": 0.00016614042573040777, + "loss": 0.8601, + "step": 6080 + }, + { + "epoch": 1.0826210826210827, + "grad_norm": 0.5920688509941101, + "learning_rate": 0.0001661299265639777, + "loss": 1.0082, + "step": 6081 + }, + { + "epoch": 1.0827991452991452, + "grad_norm": 0.6191682815551758, + "learning_rate": 0.0001661194261018691, + "loss": 0.9645, + "step": 6082 + }, + { + "epoch": 1.082977207977208, + "grad_norm": 0.6403279304504395, + "learning_rate": 0.00016610892434428765, + "loss": 0.9263, + "step": 6083 + }, + { + "epoch": 1.0831552706552707, + "grad_norm": 0.579502284526825, + "learning_rate": 0.00016609842129143915, + "loss": 0.8997, + "step": 6084 + }, + { + "epoch": 1.0833333333333333, + "grad_norm": 0.5831437706947327, + "learning_rate": 0.00016608791694352944, + "loss": 1.0703, + "step": 6085 + }, + { + "epoch": 1.083511396011396, + "grad_norm": 0.6188452243804932, + "learning_rate": 0.00016607741130076424, + "loss": 0.8856, + "step": 6086 + }, + { + "epoch": 1.0836894586894588, + "grad_norm": 0.7413692474365234, + "learning_rate": 0.00016606690436334946, + "loss": 1.1995, + "step": 6087 + }, + { + "epoch": 1.0838675213675213, + "grad_norm": 0.5552099347114563, + "learning_rate": 0.00016605639613149093, + "loss": 0.8514, + "step": 6088 + }, + { + "epoch": 1.084045584045584, + "grad_norm": 0.5906503200531006, + "learning_rate": 0.00016604588660539452, + "loss": 0.9431, + "step": 6089 + }, + { + "epoch": 1.0842236467236468, + "grad_norm": 0.5326111316680908, + "learning_rate": 0.0001660353757852662, + "loss": 0.8306, + "step": 6090 + }, + { + "epoch": 1.0844017094017093, + "grad_norm": 0.7273091673851013, + "learning_rate": 0.0001660248636713118, + "loss": 1.1109, + "step": 6091 + }, + { + "epoch": 1.084579772079772, + "grad_norm": 0.66513592004776, + "learning_rate": 
0.00016601435026373737, + "loss": 1.0621, + "step": 6092 + }, + { + "epoch": 1.0847578347578348, + "grad_norm": 0.6470831632614136, + "learning_rate": 0.00016600383556274892, + "loss": 1.1075, + "step": 6093 + }, + { + "epoch": 1.0849358974358974, + "grad_norm": 0.6308658719062805, + "learning_rate": 0.0001659933195685524, + "loss": 0.9832, + "step": 6094 + }, + { + "epoch": 1.08511396011396, + "grad_norm": 0.6569336652755737, + "learning_rate": 0.00016598280228135388, + "loss": 0.9754, + "step": 6095 + }, + { + "epoch": 1.0852920227920229, + "grad_norm": 0.5672318339347839, + "learning_rate": 0.0001659722837013594, + "loss": 0.9075, + "step": 6096 + }, + { + "epoch": 1.0854700854700854, + "grad_norm": 0.6397247314453125, + "learning_rate": 0.00016596176382877506, + "loss": 1.0358, + "step": 6097 + }, + { + "epoch": 1.0856481481481481, + "grad_norm": 0.6046154499053955, + "learning_rate": 0.000165951242663807, + "loss": 0.9036, + "step": 6098 + }, + { + "epoch": 1.085826210826211, + "grad_norm": 0.7190790176391602, + "learning_rate": 0.00016594072020666134, + "loss": 1.05, + "step": 6099 + }, + { + "epoch": 1.0860042735042734, + "grad_norm": 0.636986255645752, + "learning_rate": 0.00016593019645754425, + "loss": 1.0648, + "step": 6100 + }, + { + "epoch": 1.0861823361823362, + "grad_norm": 0.7239426374435425, + "learning_rate": 0.00016591967141666193, + "loss": 1.3332, + "step": 6101 + }, + { + "epoch": 1.086360398860399, + "grad_norm": 0.5623281002044678, + "learning_rate": 0.00016590914508422054, + "loss": 0.997, + "step": 6102 + }, + { + "epoch": 1.0865384615384615, + "grad_norm": 0.5559574365615845, + "learning_rate": 0.00016589861746042642, + "loss": 0.9309, + "step": 6103 + }, + { + "epoch": 1.0867165242165242, + "grad_norm": 0.6056998372077942, + "learning_rate": 0.00016588808854548574, + "loss": 1.05, + "step": 6104 + }, + { + "epoch": 1.086894586894587, + "grad_norm": 0.6419603228569031, + "learning_rate": 0.00016587755833960487, + "loss": 0.8933, + "step": 
6105 + }, + { + "epoch": 1.0870726495726495, + "grad_norm": 0.5236496329307556, + "learning_rate": 0.00016586702684299006, + "loss": 1.0061, + "step": 6106 + }, + { + "epoch": 1.0872507122507122, + "grad_norm": 0.5764613747596741, + "learning_rate": 0.0001658564940558477, + "loss": 1.0218, + "step": 6107 + }, + { + "epoch": 1.087428774928775, + "grad_norm": 0.6049391627311707, + "learning_rate": 0.00016584595997838416, + "loss": 0.8157, + "step": 6108 + }, + { + "epoch": 1.0876068376068375, + "grad_norm": 0.585422933101654, + "learning_rate": 0.0001658354246108058, + "loss": 1.2761, + "step": 6109 + }, + { + "epoch": 1.0877849002849003, + "grad_norm": 0.6420125365257263, + "learning_rate": 0.00016582488795331907, + "loss": 1.1978, + "step": 6110 + }, + { + "epoch": 1.087962962962963, + "grad_norm": 0.646091878414154, + "learning_rate": 0.00016581435000613038, + "loss": 0.8946, + "step": 6111 + }, + { + "epoch": 1.0881410256410255, + "grad_norm": 0.6563934087753296, + "learning_rate": 0.00016580381076944625, + "loss": 1.0625, + "step": 6112 + }, + { + "epoch": 1.0883190883190883, + "grad_norm": 0.6796613931655884, + "learning_rate": 0.0001657932702434731, + "loss": 0.9401, + "step": 6113 + }, + { + "epoch": 1.088497150997151, + "grad_norm": 0.6248648762702942, + "learning_rate": 0.00016578272842841753, + "loss": 0.8558, + "step": 6114 + }, + { + "epoch": 1.0886752136752136, + "grad_norm": 0.5136269330978394, + "learning_rate": 0.00016577218532448605, + "loss": 0.6424, + "step": 6115 + }, + { + "epoch": 1.0888532763532763, + "grad_norm": 0.5581641793251038, + "learning_rate": 0.00016576164093188523, + "loss": 0.7923, + "step": 6116 + }, + { + "epoch": 1.089031339031339, + "grad_norm": 0.630352258682251, + "learning_rate": 0.0001657510952508216, + "loss": 0.9115, + "step": 6117 + }, + { + "epoch": 1.0892094017094016, + "grad_norm": 0.6167593002319336, + "learning_rate": 0.0001657405482815019, + "loss": 1.1112, + "step": 6118 + }, + { + "epoch": 1.0893874643874644, + 
"grad_norm": 0.5908578634262085, + "learning_rate": 0.00016573000002413271, + "loss": 1.0359, + "step": 6119 + }, + { + "epoch": 1.0895655270655271, + "grad_norm": 0.6326140761375427, + "learning_rate": 0.00016571945047892073, + "loss": 1.0459, + "step": 6120 + }, + { + "epoch": 1.0897435897435896, + "grad_norm": 0.7273572683334351, + "learning_rate": 0.00016570889964607262, + "loss": 1.0901, + "step": 6121 + }, + { + "epoch": 1.0899216524216524, + "grad_norm": 0.6168062090873718, + "learning_rate": 0.00016569834752579513, + "loss": 0.8739, + "step": 6122 + }, + { + "epoch": 1.0900997150997151, + "grad_norm": 0.5620378255844116, + "learning_rate": 0.00016568779411829497, + "loss": 0.9614, + "step": 6123 + }, + { + "epoch": 1.0902777777777777, + "grad_norm": 0.6319156885147095, + "learning_rate": 0.00016567723942377899, + "loss": 1.1031, + "step": 6124 + }, + { + "epoch": 1.0904558404558404, + "grad_norm": 0.6590072512626648, + "learning_rate": 0.00016566668344245388, + "loss": 1.0086, + "step": 6125 + }, + { + "epoch": 1.0906339031339032, + "grad_norm": 0.5823387503623962, + "learning_rate": 0.00016565612617452656, + "loss": 0.8886, + "step": 6126 + }, + { + "epoch": 1.0908119658119657, + "grad_norm": 0.5795989632606506, + "learning_rate": 0.00016564556762020381, + "loss": 0.7683, + "step": 6127 + }, + { + "epoch": 1.0909900284900285, + "grad_norm": 0.5940101742744446, + "learning_rate": 0.00016563500777969255, + "loss": 0.8873, + "step": 6128 + }, + { + "epoch": 1.0911680911680912, + "grad_norm": 0.5708247423171997, + "learning_rate": 0.00016562444665319963, + "loss": 0.7382, + "step": 6129 + }, + { + "epoch": 1.0913461538461537, + "grad_norm": 0.6339239478111267, + "learning_rate": 0.00016561388424093202, + "loss": 0.9323, + "step": 6130 + }, + { + "epoch": 1.0915242165242165, + "grad_norm": 0.720000147819519, + "learning_rate": 0.00016560332054309663, + "loss": 1.0437, + "step": 6131 + }, + { + "epoch": 1.0917022792022792, + "grad_norm": 0.686580240726471, + 
"learning_rate": 0.00016559275555990048, + "loss": 0.9841, + "step": 6132 + }, + { + "epoch": 1.091880341880342, + "grad_norm": 0.6067900061607361, + "learning_rate": 0.00016558218929155053, + "loss": 1.0862, + "step": 6133 + }, + { + "epoch": 1.0920584045584045, + "grad_norm": 0.6678896546363831, + "learning_rate": 0.00016557162173825384, + "loss": 0.8509, + "step": 6134 + }, + { + "epoch": 1.0922364672364673, + "grad_norm": 0.53044193983078, + "learning_rate": 0.0001655610529002174, + "loss": 0.9227, + "step": 6135 + }, + { + "epoch": 1.0924145299145298, + "grad_norm": 0.6499412655830383, + "learning_rate": 0.00016555048277764836, + "loss": 1.0867, + "step": 6136 + }, + { + "epoch": 1.0925925925925926, + "grad_norm": 0.6543099284172058, + "learning_rate": 0.00016553991137075374, + "loss": 0.849, + "step": 6137 + }, + { + "epoch": 1.0927706552706553, + "grad_norm": 0.5772737860679626, + "learning_rate": 0.0001655293386797407, + "loss": 0.8475, + "step": 6138 + }, + { + "epoch": 1.092948717948718, + "grad_norm": 0.616348385810852, + "learning_rate": 0.00016551876470481642, + "loss": 0.9205, + "step": 6139 + }, + { + "epoch": 1.0931267806267806, + "grad_norm": 0.7151142954826355, + "learning_rate": 0.00016550818944618801, + "loss": 1.1389, + "step": 6140 + }, + { + "epoch": 1.0933048433048433, + "grad_norm": 0.6566469669342041, + "learning_rate": 0.00016549761290406275, + "loss": 0.8216, + "step": 6141 + }, + { + "epoch": 1.0934829059829059, + "grad_norm": 0.7075428366661072, + "learning_rate": 0.00016548703507864783, + "loss": 1.065, + "step": 6142 + }, + { + "epoch": 1.0936609686609686, + "grad_norm": 0.6589360237121582, + "learning_rate": 0.00016547645597015046, + "loss": 0.9899, + "step": 6143 + }, + { + "epoch": 1.0938390313390314, + "grad_norm": 0.6445585489273071, + "learning_rate": 0.00016546587557877797, + "loss": 1.1629, + "step": 6144 + }, + { + "epoch": 1.0940170940170941, + "grad_norm": 0.6216462850570679, + "learning_rate": 0.00016545529390473763, + 
"loss": 0.9685, + "step": 6145 + }, + { + "epoch": 1.0941951566951567, + "grad_norm": 0.6195303797721863, + "learning_rate": 0.0001654447109482368, + "loss": 1.144, + "step": 6146 + }, + { + "epoch": 1.0943732193732194, + "grad_norm": 0.6625444293022156, + "learning_rate": 0.0001654341267094828, + "loss": 0.9886, + "step": 6147 + }, + { + "epoch": 1.094551282051282, + "grad_norm": 0.6449851393699646, + "learning_rate": 0.000165423541188683, + "loss": 0.9568, + "step": 6148 + }, + { + "epoch": 1.0947293447293447, + "grad_norm": 0.6490375995635986, + "learning_rate": 0.00016541295438604484, + "loss": 1.1304, + "step": 6149 + }, + { + "epoch": 1.0949074074074074, + "grad_norm": 0.6771987676620483, + "learning_rate": 0.00016540236630177574, + "loss": 1.0426, + "step": 6150 + }, + { + "epoch": 1.0950854700854702, + "grad_norm": 0.5214568376541138, + "learning_rate": 0.00016539177693608307, + "loss": 0.6742, + "step": 6151 + }, + { + "epoch": 1.0952635327635327, + "grad_norm": 0.6005097031593323, + "learning_rate": 0.00016538118628917442, + "loss": 0.9901, + "step": 6152 + }, + { + "epoch": 1.0954415954415955, + "grad_norm": 0.6449539065361023, + "learning_rate": 0.0001653705943612572, + "loss": 0.9654, + "step": 6153 + }, + { + "epoch": 1.095619658119658, + "grad_norm": 0.6443646550178528, + "learning_rate": 0.00016536000115253903, + "loss": 0.9084, + "step": 6154 + }, + { + "epoch": 1.0957977207977208, + "grad_norm": 0.6072495579719543, + "learning_rate": 0.0001653494066632274, + "loss": 0.6308, + "step": 6155 + }, + { + "epoch": 1.0959757834757835, + "grad_norm": 0.5751157999038696, + "learning_rate": 0.00016533881089352988, + "loss": 0.96, + "step": 6156 + }, + { + "epoch": 1.0961538461538463, + "grad_norm": 0.6310713291168213, + "learning_rate": 0.0001653282138436541, + "loss": 1.0997, + "step": 6157 + }, + { + "epoch": 1.0963319088319088, + "grad_norm": 0.5573651790618896, + "learning_rate": 0.00016531761551380765, + "loss": 0.9738, + "step": 6158 + }, + { + 
"epoch": 1.0965099715099715, + "grad_norm": 0.5615308880805969, + "learning_rate": 0.00016530701590419824, + "loss": 0.9658, + "step": 6159 + }, + { + "epoch": 1.0966880341880343, + "grad_norm": 0.6471942663192749, + "learning_rate": 0.0001652964150150335, + "loss": 1.0763, + "step": 6160 + }, + { + "epoch": 1.0968660968660968, + "grad_norm": 0.6305427551269531, + "learning_rate": 0.00016528581284652117, + "loss": 1.112, + "step": 6161 + }, + { + "epoch": 1.0970441595441596, + "grad_norm": 0.6881145238876343, + "learning_rate": 0.00016527520939886892, + "loss": 0.8476, + "step": 6162 + }, + { + "epoch": 1.0972222222222223, + "grad_norm": 0.6507891416549683, + "learning_rate": 0.00016526460467228458, + "loss": 1.1097, + "step": 6163 + }, + { + "epoch": 1.0974002849002849, + "grad_norm": 0.5960137844085693, + "learning_rate": 0.00016525399866697586, + "loss": 0.9934, + "step": 6164 + }, + { + "epoch": 1.0975783475783476, + "grad_norm": 0.6001808643341064, + "learning_rate": 0.0001652433913831506, + "loss": 1.0782, + "step": 6165 + }, + { + "epoch": 1.0977564102564104, + "grad_norm": 0.5639005303382874, + "learning_rate": 0.00016523278282101663, + "loss": 1.0929, + "step": 6166 + }, + { + "epoch": 1.0979344729344729, + "grad_norm": 0.5962058305740356, + "learning_rate": 0.00016522217298078177, + "loss": 1.0315, + "step": 6167 + }, + { + "epoch": 1.0981125356125356, + "grad_norm": 0.6920329928398132, + "learning_rate": 0.0001652115618626539, + "loss": 0.9176, + "step": 6168 + }, + { + "epoch": 1.0982905982905984, + "grad_norm": 0.6963527202606201, + "learning_rate": 0.00016520094946684098, + "loss": 1.2136, + "step": 6169 + }, + { + "epoch": 1.098468660968661, + "grad_norm": 0.5855711102485657, + "learning_rate": 0.00016519033579355093, + "loss": 0.8453, + "step": 6170 + }, + { + "epoch": 1.0986467236467237, + "grad_norm": 0.6454927325248718, + "learning_rate": 0.0001651797208429916, + "loss": 1.0747, + "step": 6171 + }, + { + "epoch": 1.0988247863247864, + 
"grad_norm": 0.644585907459259, + "learning_rate": 0.00016516910461537108, + "loss": 0.8165, + "step": 6172 + }, + { + "epoch": 1.099002849002849, + "grad_norm": 0.6488069891929626, + "learning_rate": 0.00016515848711089732, + "loss": 1.1048, + "step": 6173 + }, + { + "epoch": 1.0991809116809117, + "grad_norm": 0.5867953896522522, + "learning_rate": 0.00016514786832977834, + "loss": 0.63, + "step": 6174 + }, + { + "epoch": 1.0993589743589745, + "grad_norm": 0.560591459274292, + "learning_rate": 0.00016513724827222227, + "loss": 0.9255, + "step": 6175 + }, + { + "epoch": 1.099537037037037, + "grad_norm": 0.675262451171875, + "learning_rate": 0.00016512662693843707, + "loss": 0.7637, + "step": 6176 + }, + { + "epoch": 1.0997150997150997, + "grad_norm": 0.6515669822692871, + "learning_rate": 0.00016511600432863091, + "loss": 0.7579, + "step": 6177 + }, + { + "epoch": 1.0998931623931625, + "grad_norm": 0.683409571647644, + "learning_rate": 0.00016510538044301192, + "loss": 0.9183, + "step": 6178 + }, + { + "epoch": 1.100071225071225, + "grad_norm": 0.6194507479667664, + "learning_rate": 0.00016509475528178827, + "loss": 1.16, + "step": 6179 + }, + { + "epoch": 1.1002492877492878, + "grad_norm": 0.6192209720611572, + "learning_rate": 0.0001650841288451681, + "loss": 1.1392, + "step": 6180 + }, + { + "epoch": 1.1004273504273505, + "grad_norm": 0.6029189825057983, + "learning_rate": 0.0001650735011333596, + "loss": 1.1453, + "step": 6181 + }, + { + "epoch": 1.100605413105413, + "grad_norm": 0.7040731310844421, + "learning_rate": 0.00016506287214657105, + "loss": 0.9367, + "step": 6182 + }, + { + "epoch": 1.1007834757834758, + "grad_norm": 0.5909842252731323, + "learning_rate": 0.00016505224188501067, + "loss": 0.6463, + "step": 6183 + }, + { + "epoch": 1.1009615384615385, + "grad_norm": 0.6129698157310486, + "learning_rate": 0.00016504161034888674, + "loss": 0.9432, + "step": 6184 + }, + { + "epoch": 1.101139601139601, + "grad_norm": 0.6181607842445374, + "learning_rate": 
0.00016503097753840757, + "loss": 0.9934, + "step": 6185 + }, + { + "epoch": 1.1013176638176638, + "grad_norm": 0.6463226675987244, + "learning_rate": 0.0001650203434537815, + "loss": 0.8471, + "step": 6186 + }, + { + "epoch": 1.1014957264957266, + "grad_norm": 0.5999348163604736, + "learning_rate": 0.00016500970809521688, + "loss": 0.9418, + "step": 6187 + }, + { + "epoch": 1.101673789173789, + "grad_norm": 0.629504919052124, + "learning_rate": 0.00016499907146292204, + "loss": 0.9699, + "step": 6188 + }, + { + "epoch": 1.1018518518518519, + "grad_norm": 0.694767951965332, + "learning_rate": 0.00016498843355710542, + "loss": 0.8793, + "step": 6189 + }, + { + "epoch": 1.1020299145299146, + "grad_norm": 0.6205509901046753, + "learning_rate": 0.00016497779437797547, + "loss": 0.8384, + "step": 6190 + }, + { + "epoch": 1.1022079772079771, + "grad_norm": 0.6256579756736755, + "learning_rate": 0.0001649671539257406, + "loss": 0.9275, + "step": 6191 + }, + { + "epoch": 1.10238603988604, + "grad_norm": 0.6593793034553528, + "learning_rate": 0.00016495651220060933, + "loss": 1.0495, + "step": 6192 + }, + { + "epoch": 1.1025641025641026, + "grad_norm": 0.7809221148490906, + "learning_rate": 0.00016494586920279012, + "loss": 1.0485, + "step": 6193 + }, + { + "epoch": 1.1027421652421652, + "grad_norm": 0.6147717833518982, + "learning_rate": 0.0001649352249324915, + "loss": 0.8739, + "step": 6194 + }, + { + "epoch": 1.102920227920228, + "grad_norm": 0.565411388874054, + "learning_rate": 0.00016492457938992208, + "loss": 0.9759, + "step": 6195 + }, + { + "epoch": 1.1030982905982907, + "grad_norm": 0.596370279788971, + "learning_rate": 0.00016491393257529036, + "loss": 0.9658, + "step": 6196 + }, + { + "epoch": 1.1032763532763532, + "grad_norm": 0.6334326863288879, + "learning_rate": 0.00016490328448880498, + "loss": 0.8785, + "step": 6197 + }, + { + "epoch": 1.103454415954416, + "grad_norm": 0.5538334846496582, + "learning_rate": 0.0001648926351306746, + "loss": 0.7174, + 
"step": 6198 + }, + { + "epoch": 1.1036324786324787, + "grad_norm": 0.6249658465385437, + "learning_rate": 0.00016488198450110778, + "loss": 0.8579, + "step": 6199 + }, + { + "epoch": 1.1038105413105412, + "grad_norm": 0.6128895878791809, + "learning_rate": 0.00016487133260031329, + "loss": 0.8538, + "step": 6200 + }, + { + "epoch": 1.103988603988604, + "grad_norm": 0.5808702707290649, + "learning_rate": 0.0001648606794284998, + "loss": 0.8143, + "step": 6201 + }, + { + "epoch": 1.1041666666666667, + "grad_norm": 0.671419084072113, + "learning_rate": 0.00016485002498587602, + "loss": 1.1268, + "step": 6202 + }, + { + "epoch": 1.1043447293447293, + "grad_norm": 0.5706788897514343, + "learning_rate": 0.00016483936927265075, + "loss": 0.9558, + "step": 6203 + }, + { + "epoch": 1.104522792022792, + "grad_norm": 0.5700307488441467, + "learning_rate": 0.00016482871228903266, + "loss": 0.9616, + "step": 6204 + }, + { + "epoch": 1.1047008547008548, + "grad_norm": 0.5764816403388977, + "learning_rate": 0.0001648180540352307, + "loss": 0.8692, + "step": 6205 + }, + { + "epoch": 1.1048789173789173, + "grad_norm": 0.5786563754081726, + "learning_rate": 0.00016480739451145358, + "loss": 0.9406, + "step": 6206 + }, + { + "epoch": 1.10505698005698, + "grad_norm": 0.6112591624259949, + "learning_rate": 0.0001647967337179102, + "loss": 0.8999, + "step": 6207 + }, + { + "epoch": 1.1052350427350428, + "grad_norm": 0.5708907246589661, + "learning_rate": 0.00016478607165480944, + "loss": 0.9236, + "step": 6208 + }, + { + "epoch": 1.1054131054131053, + "grad_norm": 0.6742013692855835, + "learning_rate": 0.00016477540832236014, + "loss": 1.0911, + "step": 6209 + }, + { + "epoch": 1.105591168091168, + "grad_norm": 0.6382617354393005, + "learning_rate": 0.0001647647437207713, + "loss": 0.7901, + "step": 6210 + }, + { + "epoch": 1.1057692307692308, + "grad_norm": 0.6241547465324402, + "learning_rate": 0.00016475407785025188, + "loss": 1.0048, + "step": 6211 + }, + { + "epoch": 
1.1059472934472934, + "grad_norm": 0.6452877521514893, + "learning_rate": 0.00016474341071101077, + "loss": 0.8902, + "step": 6212 + }, + { + "epoch": 1.1061253561253561, + "grad_norm": 0.6212326288223267, + "learning_rate": 0.00016473274230325704, + "loss": 1.078, + "step": 6213 + }, + { + "epoch": 1.1063034188034189, + "grad_norm": 0.6870912909507751, + "learning_rate": 0.00016472207262719968, + "loss": 0.9127, + "step": 6214 + }, + { + "epoch": 1.1064814814814814, + "grad_norm": 0.6286750435829163, + "learning_rate": 0.00016471140168304777, + "loss": 1.0271, + "step": 6215 + }, + { + "epoch": 1.1066595441595442, + "grad_norm": 0.645806074142456, + "learning_rate": 0.00016470072947101036, + "loss": 1.1514, + "step": 6216 + }, + { + "epoch": 1.106837606837607, + "grad_norm": 0.6800320148468018, + "learning_rate": 0.00016469005599129653, + "loss": 0.9322, + "step": 6217 + }, + { + "epoch": 1.1070156695156694, + "grad_norm": 0.5898309946060181, + "learning_rate": 0.0001646793812441155, + "loss": 1.065, + "step": 6218 + }, + { + "epoch": 1.1071937321937322, + "grad_norm": 0.6000019907951355, + "learning_rate": 0.00016466870522967634, + "loss": 0.911, + "step": 6219 + }, + { + "epoch": 1.107371794871795, + "grad_norm": 0.6164331436157227, + "learning_rate": 0.0001646580279481882, + "loss": 0.8421, + "step": 6220 + }, + { + "epoch": 1.1075498575498575, + "grad_norm": 0.6410242319107056, + "learning_rate": 0.00016464734939986036, + "loss": 0.9688, + "step": 6221 + }, + { + "epoch": 1.1077279202279202, + "grad_norm": 0.7153300046920776, + "learning_rate": 0.00016463666958490197, + "loss": 1.0722, + "step": 6222 + }, + { + "epoch": 1.107905982905983, + "grad_norm": 0.6977026462554932, + "learning_rate": 0.00016462598850352234, + "loss": 1.0192, + "step": 6223 + }, + { + "epoch": 1.1080840455840455, + "grad_norm": 0.6379461884498596, + "learning_rate": 0.0001646153061559307, + "loss": 1.0474, + "step": 6224 + }, + { + "epoch": 1.1082621082621082, + "grad_norm": 
0.6135090589523315, + "learning_rate": 0.00016460462254233634, + "loss": 1.0082, + "step": 6225 + }, + { + "epoch": 1.108440170940171, + "grad_norm": 0.6326230764389038, + "learning_rate": 0.00016459393766294866, + "loss": 1.1097, + "step": 6226 + }, + { + "epoch": 1.1086182336182335, + "grad_norm": 0.6636839509010315, + "learning_rate": 0.0001645832515179769, + "loss": 0.9689, + "step": 6227 + }, + { + "epoch": 1.1087962962962963, + "grad_norm": 0.5713129043579102, + "learning_rate": 0.00016457256410763052, + "loss": 0.8642, + "step": 6228 + }, + { + "epoch": 1.108974358974359, + "grad_norm": 0.584204912185669, + "learning_rate": 0.00016456187543211888, + "loss": 0.9957, + "step": 6229 + }, + { + "epoch": 1.1091524216524216, + "grad_norm": 0.5920230746269226, + "learning_rate": 0.0001645511854916514, + "loss": 0.7297, + "step": 6230 + }, + { + "epoch": 1.1093304843304843, + "grad_norm": 0.6207385063171387, + "learning_rate": 0.0001645404942864375, + "loss": 0.868, + "step": 6231 + }, + { + "epoch": 1.109508547008547, + "grad_norm": 0.7267234921455383, + "learning_rate": 0.00016452980181668673, + "loss": 1.0248, + "step": 6232 + }, + { + "epoch": 1.1096866096866096, + "grad_norm": 0.5925650596618652, + "learning_rate": 0.00016451910808260852, + "loss": 1.1075, + "step": 6233 + }, + { + "epoch": 1.1098646723646723, + "grad_norm": 0.5632196664810181, + "learning_rate": 0.00016450841308441244, + "loss": 0.9865, + "step": 6234 + }, + { + "epoch": 1.110042735042735, + "grad_norm": 0.6115161180496216, + "learning_rate": 0.000164497716822308, + "loss": 1.1343, + "step": 6235 + }, + { + "epoch": 1.1102207977207976, + "grad_norm": 0.634398341178894, + "learning_rate": 0.00016448701929650477, + "loss": 1.1039, + "step": 6236 + }, + { + "epoch": 1.1103988603988604, + "grad_norm": 0.5843468308448792, + "learning_rate": 0.00016447632050721237, + "loss": 0.8462, + "step": 6237 + }, + { + "epoch": 1.1105769230769231, + "grad_norm": 0.799375593662262, + "learning_rate": 
0.0001644656204546404, + "loss": 0.9861, + "step": 6238 + }, + { + "epoch": 1.1107549857549857, + "grad_norm": 0.600289523601532, + "learning_rate": 0.0001644549191389985, + "loss": 1.0323, + "step": 6239 + }, + { + "epoch": 1.1109330484330484, + "grad_norm": 0.6154919266700745, + "learning_rate": 0.00016444421656049637, + "loss": 0.9158, + "step": 6240 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 0.6685689687728882, + "learning_rate": 0.00016443351271934367, + "loss": 1.0429, + "step": 6241 + }, + { + "epoch": 1.1112891737891737, + "grad_norm": 0.699978232383728, + "learning_rate": 0.00016442280761575016, + "loss": 1.072, + "step": 6242 + }, + { + "epoch": 1.1114672364672364, + "grad_norm": 0.6461396217346191, + "learning_rate": 0.00016441210124992556, + "loss": 0.9758, + "step": 6243 + }, + { + "epoch": 1.1116452991452992, + "grad_norm": 0.6463284492492676, + "learning_rate": 0.00016440139362207962, + "loss": 0.9205, + "step": 6244 + }, + { + "epoch": 1.1118233618233617, + "grad_norm": 0.6587556004524231, + "learning_rate": 0.00016439068473242217, + "loss": 1.0027, + "step": 6245 + }, + { + "epoch": 1.1120014245014245, + "grad_norm": 0.6896520256996155, + "learning_rate": 0.000164379974581163, + "loss": 0.9788, + "step": 6246 + }, + { + "epoch": 1.1121794871794872, + "grad_norm": 0.6766142845153809, + "learning_rate": 0.000164369263168512, + "loss": 0.9647, + "step": 6247 + }, + { + "epoch": 1.1123575498575498, + "grad_norm": 0.7024297118186951, + "learning_rate": 0.00016435855049467898, + "loss": 1.1163, + "step": 6248 + }, + { + "epoch": 1.1125356125356125, + "grad_norm": 0.6654963493347168, + "learning_rate": 0.00016434783655987385, + "loss": 0.9302, + "step": 6249 + }, + { + "epoch": 1.1127136752136753, + "grad_norm": 0.6973692774772644, + "learning_rate": 0.0001643371213643065, + "loss": 0.9585, + "step": 6250 + }, + { + "epoch": 1.1128917378917378, + "grad_norm": 0.7153545022010803, + "learning_rate": 0.000164326404908187, + "loss": 1.0485, + 
"step": 6251 + }, + { + "epoch": 1.1130698005698005, + "grad_norm": 0.6114685535430908, + "learning_rate": 0.00016431568719172516, + "loss": 0.8881, + "step": 6252 + }, + { + "epoch": 1.1132478632478633, + "grad_norm": 0.6500731706619263, + "learning_rate": 0.00016430496821513103, + "loss": 1.0658, + "step": 6253 + }, + { + "epoch": 1.113425925925926, + "grad_norm": 0.5800092220306396, + "learning_rate": 0.00016429424797861466, + "loss": 0.9158, + "step": 6254 + }, + { + "epoch": 1.1136039886039886, + "grad_norm": 0.6653759479522705, + "learning_rate": 0.00016428352648238602, + "loss": 0.9762, + "step": 6255 + }, + { + "epoch": 1.1137820512820513, + "grad_norm": 0.649208128452301, + "learning_rate": 0.00016427280372665525, + "loss": 1.1184, + "step": 6256 + }, + { + "epoch": 1.1139601139601139, + "grad_norm": 0.6665199398994446, + "learning_rate": 0.00016426207971163238, + "loss": 0.9417, + "step": 6257 + }, + { + "epoch": 1.1141381766381766, + "grad_norm": 0.6110978126525879, + "learning_rate": 0.00016425135443752758, + "loss": 1.1531, + "step": 6258 + }, + { + "epoch": 1.1143162393162394, + "grad_norm": 0.6517077088356018, + "learning_rate": 0.00016424062790455093, + "loss": 0.9055, + "step": 6259 + }, + { + "epoch": 1.114494301994302, + "grad_norm": 0.6278966665267944, + "learning_rate": 0.00016422990011291265, + "loss": 1.0087, + "step": 6260 + }, + { + "epoch": 1.1146723646723646, + "grad_norm": 0.5818809270858765, + "learning_rate": 0.00016421917106282288, + "loss": 1.0202, + "step": 6261 + }, + { + "epoch": 1.1148504273504274, + "grad_norm": 0.5670005679130554, + "learning_rate": 0.00016420844075449187, + "loss": 0.841, + "step": 6262 + }, + { + "epoch": 1.11502849002849, + "grad_norm": 0.6584762334823608, + "learning_rate": 0.00016419770918812984, + "loss": 1.0322, + "step": 6263 + }, + { + "epoch": 1.1152065527065527, + "grad_norm": 0.6023790836334229, + "learning_rate": 0.00016418697636394705, + "loss": 0.9152, + "step": 6264 + }, + { + "epoch": 
1.1153846153846154, + "grad_norm": 0.6234691739082336, + "learning_rate": 0.00016417624228215382, + "loss": 0.9555, + "step": 6265 + }, + { + "epoch": 1.1155626780626782, + "grad_norm": 0.6690816879272461, + "learning_rate": 0.00016416550694296045, + "loss": 0.9341, + "step": 6266 + }, + { + "epoch": 1.1157407407407407, + "grad_norm": 0.6030237078666687, + "learning_rate": 0.00016415477034657723, + "loss": 1.0442, + "step": 6267 + }, + { + "epoch": 1.1159188034188035, + "grad_norm": 0.5954633951187134, + "learning_rate": 0.00016414403249321455, + "loss": 0.9132, + "step": 6268 + }, + { + "epoch": 1.116096866096866, + "grad_norm": 0.7876830101013184, + "learning_rate": 0.0001641332933830828, + "loss": 0.9456, + "step": 6269 + }, + { + "epoch": 1.1162749287749287, + "grad_norm": 0.6776009798049927, + "learning_rate": 0.00016412255301639244, + "loss": 0.9022, + "step": 6270 + }, + { + "epoch": 1.1164529914529915, + "grad_norm": 0.6094426512718201, + "learning_rate": 0.0001641118113933538, + "loss": 0.9629, + "step": 6271 + }, + { + "epoch": 1.1166310541310542, + "grad_norm": 0.5818213820457458, + "learning_rate": 0.00016410106851417742, + "loss": 0.9049, + "step": 6272 + }, + { + "epoch": 1.1168091168091168, + "grad_norm": 0.5668078064918518, + "learning_rate": 0.00016409032437907377, + "loss": 1.0011, + "step": 6273 + }, + { + "epoch": 1.1169871794871795, + "grad_norm": 0.6984922289848328, + "learning_rate": 0.00016407957898825334, + "loss": 0.9454, + "step": 6274 + }, + { + "epoch": 1.1171652421652423, + "grad_norm": 0.5509830117225647, + "learning_rate": 0.00016406883234192668, + "loss": 0.9132, + "step": 6275 + }, + { + "epoch": 1.1173433048433048, + "grad_norm": 0.5117461681365967, + "learning_rate": 0.00016405808444030435, + "loss": 0.7675, + "step": 6276 + }, + { + "epoch": 1.1175213675213675, + "grad_norm": 0.6358339786529541, + "learning_rate": 0.00016404733528359688, + "loss": 0.9777, + "step": 6277 + }, + { + "epoch": 1.1176994301994303, + "grad_norm": 
0.5870591402053833, + "learning_rate": 0.00016403658487201494, + "loss": 0.8576, + "step": 6278 + }, + { + "epoch": 1.1178774928774928, + "grad_norm": 0.6532407999038696, + "learning_rate": 0.00016402583320576915, + "loss": 1.1787, + "step": 6279 + }, + { + "epoch": 1.1180555555555556, + "grad_norm": 0.6374639272689819, + "learning_rate": 0.00016401508028507017, + "loss": 0.9298, + "step": 6280 + }, + { + "epoch": 1.1182336182336183, + "grad_norm": 0.7280316352844238, + "learning_rate": 0.00016400432611012869, + "loss": 1.1081, + "step": 6281 + }, + { + "epoch": 1.1184116809116809, + "grad_norm": 0.6070699095726013, + "learning_rate": 0.00016399357068115538, + "loss": 0.9107, + "step": 6282 + }, + { + "epoch": 1.1185897435897436, + "grad_norm": 0.6701489686965942, + "learning_rate": 0.00016398281399836097, + "loss": 1.0879, + "step": 6283 + }, + { + "epoch": 1.1187678062678064, + "grad_norm": 0.6343162655830383, + "learning_rate": 0.00016397205606195626, + "loss": 0.8552, + "step": 6284 + }, + { + "epoch": 1.118945868945869, + "grad_norm": 0.6450608968734741, + "learning_rate": 0.00016396129687215198, + "loss": 1.1119, + "step": 6285 + }, + { + "epoch": 1.1191239316239316, + "grad_norm": 0.7219904661178589, + "learning_rate": 0.00016395053642915896, + "loss": 0.9081, + "step": 6286 + }, + { + "epoch": 1.1193019943019944, + "grad_norm": 0.6189733147621155, + "learning_rate": 0.00016393977473318802, + "loss": 0.9818, + "step": 6287 + }, + { + "epoch": 1.119480056980057, + "grad_norm": 0.6310907602310181, + "learning_rate": 0.00016392901178445004, + "loss": 1.0334, + "step": 6288 + }, + { + "epoch": 1.1196581196581197, + "grad_norm": 0.6556720733642578, + "learning_rate": 0.00016391824758315587, + "loss": 1.0452, + "step": 6289 + }, + { + "epoch": 1.1198361823361824, + "grad_norm": 0.6697782278060913, + "learning_rate": 0.00016390748212951638, + "loss": 0.9627, + "step": 6290 + }, + { + "epoch": 1.120014245014245, + "grad_norm": 0.6341549754142761, + "learning_rate": 
0.00016389671542374256, + "loss": 1.112, + "step": 6291 + }, + { + "epoch": 1.1201923076923077, + "grad_norm": 0.6913946270942688, + "learning_rate": 0.00016388594746604535, + "loss": 0.9622, + "step": 6292 + }, + { + "epoch": 1.1203703703703705, + "grad_norm": 0.695488691329956, + "learning_rate": 0.0001638751782566357, + "loss": 1.0951, + "step": 6293 + }, + { + "epoch": 1.120548433048433, + "grad_norm": 0.6965359449386597, + "learning_rate": 0.00016386440779572463, + "loss": 1.1742, + "step": 6294 + }, + { + "epoch": 1.1207264957264957, + "grad_norm": 0.624679684638977, + "learning_rate": 0.00016385363608352314, + "loss": 0.9756, + "step": 6295 + }, + { + "epoch": 1.1209045584045585, + "grad_norm": 0.7511318922042847, + "learning_rate": 0.0001638428631202423, + "loss": 0.907, + "step": 6296 + }, + { + "epoch": 1.121082621082621, + "grad_norm": 0.5334641337394714, + "learning_rate": 0.00016383208890609317, + "loss": 0.7932, + "step": 6297 + }, + { + "epoch": 1.1212606837606838, + "grad_norm": 0.7518552541732788, + "learning_rate": 0.00016382131344128687, + "loss": 1.1556, + "step": 6298 + }, + { + "epoch": 1.1214387464387465, + "grad_norm": 0.618618369102478, + "learning_rate": 0.00016381053672603449, + "loss": 1.1027, + "step": 6299 + }, + { + "epoch": 1.121616809116809, + "grad_norm": 0.638956606388092, + "learning_rate": 0.00016379975876054724, + "loss": 1.0377, + "step": 6300 + }, + { + "epoch": 1.1217948717948718, + "grad_norm": 0.8031370639801025, + "learning_rate": 0.0001637889795450362, + "loss": 1.0821, + "step": 6301 + }, + { + "epoch": 1.1219729344729346, + "grad_norm": 0.6710168123245239, + "learning_rate": 0.00016377819907971265, + "loss": 1.2896, + "step": 6302 + }, + { + "epoch": 1.122150997150997, + "grad_norm": 0.5850739479064941, + "learning_rate": 0.00016376741736478777, + "loss": 1.0836, + "step": 6303 + }, + { + "epoch": 1.1223290598290598, + "grad_norm": 0.6410611271858215, + "learning_rate": 0.0001637566344004728, + "loss": 1.0395, + 
"step": 6304 + }, + { + "epoch": 1.1225071225071226, + "grad_norm": 0.6884660720825195, + "learning_rate": 0.00016374585018697903, + "loss": 0.871, + "step": 6305 + }, + { + "epoch": 1.1226851851851851, + "grad_norm": 0.622207522392273, + "learning_rate": 0.00016373506472451777, + "loss": 0.9897, + "step": 6306 + }, + { + "epoch": 1.1228632478632479, + "grad_norm": 0.6018275618553162, + "learning_rate": 0.00016372427801330028, + "loss": 0.8398, + "step": 6307 + }, + { + "epoch": 1.1230413105413106, + "grad_norm": 0.6451539993286133, + "learning_rate": 0.00016371349005353796, + "loss": 0.9878, + "step": 6308 + }, + { + "epoch": 1.1232193732193732, + "grad_norm": 0.5549424886703491, + "learning_rate": 0.00016370270084544215, + "loss": 0.844, + "step": 6309 + }, + { + "epoch": 1.123397435897436, + "grad_norm": 0.6082940697669983, + "learning_rate": 0.00016369191038922423, + "loss": 1.0704, + "step": 6310 + }, + { + "epoch": 1.1235754985754987, + "grad_norm": 0.6423100829124451, + "learning_rate": 0.00016368111868509563, + "loss": 1.0639, + "step": 6311 + }, + { + "epoch": 1.1237535612535612, + "grad_norm": 0.6274200081825256, + "learning_rate": 0.00016367032573326784, + "loss": 0.9996, + "step": 6312 + }, + { + "epoch": 1.123931623931624, + "grad_norm": 0.6618558168411255, + "learning_rate": 0.00016365953153395227, + "loss": 0.8074, + "step": 6313 + }, + { + "epoch": 1.1241096866096867, + "grad_norm": 0.7624069452285767, + "learning_rate": 0.00016364873608736038, + "loss": 0.9741, + "step": 6314 + }, + { + "epoch": 1.1242877492877492, + "grad_norm": 0.5391361117362976, + "learning_rate": 0.00016363793939370375, + "loss": 0.6992, + "step": 6315 + }, + { + "epoch": 1.124465811965812, + "grad_norm": 0.7564396858215332, + "learning_rate": 0.0001636271414531939, + "loss": 1.1971, + "step": 6316 + }, + { + "epoch": 1.1246438746438747, + "grad_norm": 0.6584066152572632, + "learning_rate": 0.00016361634226604239, + "loss": 1.0842, + "step": 6317 + }, + { + "epoch": 
1.1248219373219372, + "grad_norm": 0.6851227283477783, + "learning_rate": 0.00016360554183246078, + "loss": 1.0879, + "step": 6318 + }, + { + "epoch": 1.125, + "grad_norm": 0.5699417591094971, + "learning_rate": 0.00016359474015266074, + "loss": 0.782, + "step": 6319 + }, + { + "epoch": 1.1251780626780628, + "grad_norm": 0.5495570302009583, + "learning_rate": 0.00016358393722685385, + "loss": 1.076, + "step": 6320 + }, + { + "epoch": 1.1253561253561253, + "grad_norm": 0.5872206091880798, + "learning_rate": 0.0001635731330552518, + "loss": 0.8601, + "step": 6321 + }, + { + "epoch": 1.125534188034188, + "grad_norm": 0.7012827396392822, + "learning_rate": 0.00016356232763806627, + "loss": 1.0443, + "step": 6322 + }, + { + "epoch": 1.1257122507122508, + "grad_norm": 0.6645881533622742, + "learning_rate": 0.00016355152097550897, + "loss": 1.0027, + "step": 6323 + }, + { + "epoch": 1.1258903133903133, + "grad_norm": 0.7376120090484619, + "learning_rate": 0.00016354071306779163, + "loss": 1.1941, + "step": 6324 + }, + { + "epoch": 1.126068376068376, + "grad_norm": 0.648932695388794, + "learning_rate": 0.000163529903915126, + "loss": 1.096, + "step": 6325 + }, + { + "epoch": 1.1262464387464388, + "grad_norm": 0.6186314821243286, + "learning_rate": 0.0001635190935177239, + "loss": 1.011, + "step": 6326 + }, + { + "epoch": 1.1264245014245013, + "grad_norm": 0.5964710116386414, + "learning_rate": 0.0001635082818757971, + "loss": 0.8893, + "step": 6327 + }, + { + "epoch": 1.126602564102564, + "grad_norm": 0.5264934301376343, + "learning_rate": 0.00016349746898955747, + "loss": 0.7325, + "step": 6328 + }, + { + "epoch": 1.1267806267806268, + "grad_norm": 0.6523048877716064, + "learning_rate": 0.00016348665485921678, + "loss": 1.0488, + "step": 6329 + }, + { + "epoch": 1.1269586894586894, + "grad_norm": 0.6878600120544434, + "learning_rate": 0.00016347583948498703, + "loss": 1.0926, + "step": 6330 + }, + { + "epoch": 1.1271367521367521, + "grad_norm": 0.592656672000885, + 
"learning_rate": 0.00016346502286708004, + "loss": 0.978, + "step": 6331 + }, + { + "epoch": 1.1273148148148149, + "grad_norm": 0.6338315606117249, + "learning_rate": 0.00016345420500570777, + "loss": 1.1048, + "step": 6332 + }, + { + "epoch": 1.1274928774928774, + "grad_norm": 0.5955204367637634, + "learning_rate": 0.00016344338590108218, + "loss": 0.88, + "step": 6333 + }, + { + "epoch": 1.1276709401709402, + "grad_norm": 0.690448522567749, + "learning_rate": 0.0001634325655534152, + "loss": 1.0564, + "step": 6334 + }, + { + "epoch": 1.127849002849003, + "grad_norm": 0.6125795841217041, + "learning_rate": 0.00016342174396291888, + "loss": 1.0608, + "step": 6335 + }, + { + "epoch": 1.1280270655270654, + "grad_norm": 0.6387807726860046, + "learning_rate": 0.00016341092112980523, + "loss": 0.9581, + "step": 6336 + }, + { + "epoch": 1.1282051282051282, + "grad_norm": 0.6247823238372803, + "learning_rate": 0.0001634000970542863, + "loss": 0.932, + "step": 6337 + }, + { + "epoch": 1.128383190883191, + "grad_norm": 0.5928077697753906, + "learning_rate": 0.0001633892717365742, + "loss": 0.8963, + "step": 6338 + }, + { + "epoch": 1.1285612535612535, + "grad_norm": 0.5922074913978577, + "learning_rate": 0.000163378445176881, + "loss": 0.9772, + "step": 6339 + }, + { + "epoch": 1.1287393162393162, + "grad_norm": 0.6573056578636169, + "learning_rate": 0.00016336761737541878, + "loss": 0.8233, + "step": 6340 + }, + { + "epoch": 1.128917378917379, + "grad_norm": 0.627772867679596, + "learning_rate": 0.0001633567883323998, + "loss": 0.9618, + "step": 6341 + }, + { + "epoch": 1.1290954415954415, + "grad_norm": 0.6066579818725586, + "learning_rate": 0.0001633459580480361, + "loss": 0.9066, + "step": 6342 + }, + { + "epoch": 1.1292735042735043, + "grad_norm": 0.670295000076294, + "learning_rate": 0.00016333512652253997, + "loss": 0.8003, + "step": 6343 + }, + { + "epoch": 1.129451566951567, + "grad_norm": 0.6402488946914673, + "learning_rate": 0.0001633242937561236, + "loss": 
0.998, + "step": 6344 + }, + { + "epoch": 1.1296296296296295, + "grad_norm": 0.7224995493888855, + "learning_rate": 0.00016331345974899923, + "loss": 1.0308, + "step": 6345 + }, + { + "epoch": 1.1298076923076923, + "grad_norm": 0.5019716620445251, + "learning_rate": 0.00016330262450137917, + "loss": 0.6874, + "step": 6346 + }, + { + "epoch": 1.129985754985755, + "grad_norm": 0.5774167776107788, + "learning_rate": 0.00016329178801347566, + "loss": 0.8287, + "step": 6347 + }, + { + "epoch": 1.1301638176638176, + "grad_norm": 0.7797795534133911, + "learning_rate": 0.00016328095028550103, + "loss": 1.2145, + "step": 6348 + }, + { + "epoch": 1.1303418803418803, + "grad_norm": 0.5384017825126648, + "learning_rate": 0.00016327011131766765, + "loss": 0.8022, + "step": 6349 + }, + { + "epoch": 1.130519943019943, + "grad_norm": 0.6350888609886169, + "learning_rate": 0.00016325927111018786, + "loss": 1.1178, + "step": 6350 + }, + { + "epoch": 1.1306980056980056, + "grad_norm": 0.6386831998825073, + "learning_rate": 0.0001632484296632741, + "loss": 0.967, + "step": 6351 + }, + { + "epoch": 1.1308760683760684, + "grad_norm": 0.6214167475700378, + "learning_rate": 0.0001632375869771387, + "loss": 0.9416, + "step": 6352 + }, + { + "epoch": 1.131054131054131, + "grad_norm": 0.6145567297935486, + "learning_rate": 0.00016322674305199416, + "loss": 0.9175, + "step": 6353 + }, + { + "epoch": 1.1312321937321936, + "grad_norm": 0.7027857303619385, + "learning_rate": 0.00016321589788805297, + "loss": 1.0063, + "step": 6354 + }, + { + "epoch": 1.1314102564102564, + "grad_norm": 0.6942669153213501, + "learning_rate": 0.00016320505148552755, + "loss": 0.9191, + "step": 6355 + }, + { + "epoch": 1.1315883190883191, + "grad_norm": 0.6388658285140991, + "learning_rate": 0.0001631942038446304, + "loss": 0.993, + "step": 6356 + }, + { + "epoch": 1.131766381766382, + "grad_norm": 0.6627292633056641, + "learning_rate": 0.00016318335496557415, + "loss": 1.0055, + "step": 6357 + }, + { + "epoch": 
1.1319444444444444, + "grad_norm": 0.7997342944145203, + "learning_rate": 0.0001631725048485713, + "loss": 0.9019, + "step": 6358 + }, + { + "epoch": 1.1321225071225072, + "grad_norm": 0.8817830681800842, + "learning_rate": 0.00016316165349383445, + "loss": 0.9793, + "step": 6359 + }, + { + "epoch": 1.1323005698005697, + "grad_norm": 0.5629408955574036, + "learning_rate": 0.00016315080090157621, + "loss": 0.6139, + "step": 6360 + }, + { + "epoch": 1.1324786324786325, + "grad_norm": 0.647220253944397, + "learning_rate": 0.0001631399470720092, + "loss": 0.9776, + "step": 6361 + }, + { + "epoch": 1.1326566951566952, + "grad_norm": 0.6762630939483643, + "learning_rate": 0.0001631290920053461, + "loss": 1.1027, + "step": 6362 + }, + { + "epoch": 1.132834757834758, + "grad_norm": 0.5862727761268616, + "learning_rate": 0.00016311823570179957, + "loss": 1.1359, + "step": 6363 + }, + { + "epoch": 1.1330128205128205, + "grad_norm": 0.7042981386184692, + "learning_rate": 0.00016310737816158235, + "loss": 1.142, + "step": 6364 + }, + { + "epoch": 1.1331908831908832, + "grad_norm": 0.5990639328956604, + "learning_rate": 0.00016309651938490712, + "loss": 0.9306, + "step": 6365 + }, + { + "epoch": 1.1333689458689458, + "grad_norm": 0.5894871950149536, + "learning_rate": 0.00016308565937198669, + "loss": 0.8343, + "step": 6366 + }, + { + "epoch": 1.1335470085470085, + "grad_norm": 0.6863628029823303, + "learning_rate": 0.0001630747981230338, + "loss": 0.9552, + "step": 6367 + }, + { + "epoch": 1.1337250712250713, + "grad_norm": 0.7438958287239075, + "learning_rate": 0.00016306393563826128, + "loss": 1.0422, + "step": 6368 + }, + { + "epoch": 1.133903133903134, + "grad_norm": 0.5695775747299194, + "learning_rate": 0.00016305307191788194, + "loss": 0.8633, + "step": 6369 + }, + { + "epoch": 1.1340811965811965, + "grad_norm": 0.6257741451263428, + "learning_rate": 0.00016304220696210863, + "loss": 1.0333, + "step": 6370 + }, + { + "epoch": 1.1342592592592593, + "grad_norm": 
0.6366072297096252, + "learning_rate": 0.00016303134077115425, + "loss": 1.1452, + "step": 6371 + }, + { + "epoch": 1.1344373219373218, + "grad_norm": 0.624569296836853, + "learning_rate": 0.00016302047334523168, + "loss": 1.0569, + "step": 6372 + }, + { + "epoch": 1.1346153846153846, + "grad_norm": 0.5585938096046448, + "learning_rate": 0.00016300960468455382, + "loss": 0.9612, + "step": 6373 + }, + { + "epoch": 1.1347934472934473, + "grad_norm": 0.5738831162452698, + "learning_rate": 0.00016299873478933368, + "loss": 0.9206, + "step": 6374 + }, + { + "epoch": 1.13497150997151, + "grad_norm": 0.6797143220901489, + "learning_rate": 0.00016298786365978417, + "loss": 1.0748, + "step": 6375 + }, + { + "epoch": 1.1351495726495726, + "grad_norm": 0.6341326832771301, + "learning_rate": 0.00016297699129611833, + "loss": 0.9901, + "step": 6376 + }, + { + "epoch": 1.1353276353276354, + "grad_norm": 0.6568490862846375, + "learning_rate": 0.00016296611769854916, + "loss": 1.0598, + "step": 6377 + }, + { + "epoch": 1.135505698005698, + "grad_norm": 0.6151928901672363, + "learning_rate": 0.00016295524286728973, + "loss": 0.8352, + "step": 6378 + }, + { + "epoch": 1.1356837606837606, + "grad_norm": 0.7209593057632446, + "learning_rate": 0.0001629443668025531, + "loss": 0.9945, + "step": 6379 + }, + { + "epoch": 1.1358618233618234, + "grad_norm": 0.6600689888000488, + "learning_rate": 0.00016293348950455235, + "loss": 1.0572, + "step": 6380 + }, + { + "epoch": 1.1360398860398861, + "grad_norm": 0.5587523579597473, + "learning_rate": 0.0001629226109735006, + "loss": 0.8526, + "step": 6381 + }, + { + "epoch": 1.1362179487179487, + "grad_norm": 0.6184542775154114, + "learning_rate": 0.00016291173120961102, + "loss": 0.8246, + "step": 6382 + }, + { + "epoch": 1.1363960113960114, + "grad_norm": 0.6604713797569275, + "learning_rate": 0.00016290085021309673, + "loss": 1.0349, + "step": 6383 + }, + { + "epoch": 1.136574074074074, + "grad_norm": 0.5880835056304932, + "learning_rate": 
0.00016288996798417097, + "loss": 0.8726, + "step": 6384 + }, + { + "epoch": 1.1367521367521367, + "grad_norm": 0.5770880579948425, + "learning_rate": 0.00016287908452304692, + "loss": 0.7639, + "step": 6385 + }, + { + "epoch": 1.1369301994301995, + "grad_norm": 0.5719713568687439, + "learning_rate": 0.00016286819982993782, + "loss": 0.9717, + "step": 6386 + }, + { + "epoch": 1.1371082621082622, + "grad_norm": 0.7028461694717407, + "learning_rate": 0.00016285731390505695, + "loss": 1.0147, + "step": 6387 + }, + { + "epoch": 1.1372863247863247, + "grad_norm": 0.5396828651428223, + "learning_rate": 0.00016284642674861756, + "loss": 0.8119, + "step": 6388 + }, + { + "epoch": 1.1374643874643875, + "grad_norm": 0.592580258846283, + "learning_rate": 0.00016283553836083303, + "loss": 1.0914, + "step": 6389 + }, + { + "epoch": 1.13764245014245, + "grad_norm": 0.634596586227417, + "learning_rate": 0.00016282464874191663, + "loss": 1.1037, + "step": 6390 + }, + { + "epoch": 1.1378205128205128, + "grad_norm": 0.6462705731391907, + "learning_rate": 0.00016281375789208176, + "loss": 1.1523, + "step": 6391 + }, + { + "epoch": 1.1379985754985755, + "grad_norm": 0.6527917385101318, + "learning_rate": 0.0001628028658115418, + "loss": 1.0415, + "step": 6392 + }, + { + "epoch": 1.1381766381766383, + "grad_norm": 0.6309964060783386, + "learning_rate": 0.00016279197250051013, + "loss": 0.9747, + "step": 6393 + }, + { + "epoch": 1.1383547008547008, + "grad_norm": 0.6342993974685669, + "learning_rate": 0.00016278107795920018, + "loss": 0.9897, + "step": 6394 + }, + { + "epoch": 1.1385327635327636, + "grad_norm": 0.7149887084960938, + "learning_rate": 0.00016277018218782544, + "loss": 0.9659, + "step": 6395 + }, + { + "epoch": 1.138710826210826, + "grad_norm": 0.7219462394714355, + "learning_rate": 0.00016275928518659938, + "loss": 0.9301, + "step": 6396 + }, + { + "epoch": 1.1388888888888888, + "grad_norm": 0.6649485230445862, + "learning_rate": 0.0001627483869557355, + "loss": 0.9012, + 
"step": 6397 + }, + { + "epoch": 1.1390669515669516, + "grad_norm": 0.6910027861595154, + "learning_rate": 0.00016273748749544731, + "loss": 0.956, + "step": 6398 + }, + { + "epoch": 1.1392450142450143, + "grad_norm": 0.6369016766548157, + "learning_rate": 0.00016272658680594837, + "loss": 0.8027, + "step": 6399 + }, + { + "epoch": 1.1394230769230769, + "grad_norm": 0.6540524959564209, + "learning_rate": 0.00016271568488745227, + "loss": 1.2397, + "step": 6400 + }, + { + "epoch": 1.1396011396011396, + "grad_norm": 0.5912376046180725, + "learning_rate": 0.00016270478174017263, + "loss": 0.8453, + "step": 6401 + }, + { + "epoch": 1.1397792022792024, + "grad_norm": 0.6847240924835205, + "learning_rate": 0.00016269387736432303, + "loss": 0.9776, + "step": 6402 + }, + { + "epoch": 1.139957264957265, + "grad_norm": 0.6465024352073669, + "learning_rate": 0.00016268297176011716, + "loss": 0.8971, + "step": 6403 + }, + { + "epoch": 1.1401353276353277, + "grad_norm": 0.6639063954353333, + "learning_rate": 0.00016267206492776866, + "loss": 0.9756, + "step": 6404 + }, + { + "epoch": 1.1403133903133904, + "grad_norm": 0.6343763470649719, + "learning_rate": 0.00016266115686749123, + "loss": 0.9368, + "step": 6405 + }, + { + "epoch": 1.140491452991453, + "grad_norm": 0.7144993543624878, + "learning_rate": 0.0001626502475794986, + "loss": 0.9285, + "step": 6406 + }, + { + "epoch": 1.1406695156695157, + "grad_norm": 0.6217414736747742, + "learning_rate": 0.00016263933706400451, + "loss": 0.8867, + "step": 6407 + }, + { + "epoch": 1.1408475783475784, + "grad_norm": 0.6843730807304382, + "learning_rate": 0.00016262842532122274, + "loss": 0.9863, + "step": 6408 + }, + { + "epoch": 1.141025641025641, + "grad_norm": 0.6866166591644287, + "learning_rate": 0.00016261751235136705, + "loss": 1.0517, + "step": 6409 + }, + { + "epoch": 1.1412037037037037, + "grad_norm": 0.6650584936141968, + "learning_rate": 0.0001626065981546513, + "loss": 1.0629, + "step": 6410 + }, + { + "epoch": 
1.1413817663817665, + "grad_norm": 0.5805012583732605, + "learning_rate": 0.00016259568273128933, + "loss": 0.8175, + "step": 6411 + }, + { + "epoch": 1.141559829059829, + "grad_norm": 0.7005903124809265, + "learning_rate": 0.00016258476608149497, + "loss": 1.0267, + "step": 6412 + }, + { + "epoch": 1.1417378917378918, + "grad_norm": 0.6293461322784424, + "learning_rate": 0.00016257384820548217, + "loss": 1.1034, + "step": 6413 + }, + { + "epoch": 1.1419159544159545, + "grad_norm": 0.6281774640083313, + "learning_rate": 0.00016256292910346476, + "loss": 1.0775, + "step": 6414 + }, + { + "epoch": 1.142094017094017, + "grad_norm": 0.5912862420082092, + "learning_rate": 0.0001625520087756567, + "loss": 0.9589, + "step": 6415 + }, + { + "epoch": 1.1422720797720798, + "grad_norm": 0.5813978314399719, + "learning_rate": 0.00016254108722227198, + "loss": 0.9195, + "step": 6416 + }, + { + "epoch": 1.1424501424501425, + "grad_norm": 0.650805652141571, + "learning_rate": 0.00016253016444352458, + "loss": 1.0207, + "step": 6417 + }, + { + "epoch": 1.142628205128205, + "grad_norm": 0.6909520030021667, + "learning_rate": 0.00016251924043962851, + "loss": 0.9854, + "step": 6418 + }, + { + "epoch": 1.1428062678062678, + "grad_norm": 0.6054595112800598, + "learning_rate": 0.0001625083152107978, + "loss": 0.852, + "step": 6419 + }, + { + "epoch": 1.1429843304843306, + "grad_norm": 0.601078987121582, + "learning_rate": 0.00016249738875724647, + "loss": 0.9609, + "step": 6420 + }, + { + "epoch": 1.143162393162393, + "grad_norm": 0.5340180397033691, + "learning_rate": 0.00016248646107918868, + "loss": 0.8364, + "step": 6421 + }, + { + "epoch": 1.1433404558404558, + "grad_norm": 0.6687821745872498, + "learning_rate": 0.00016247553217683846, + "loss": 1.005, + "step": 6422 + }, + { + "epoch": 1.1435185185185186, + "grad_norm": 0.6347902417182922, + "learning_rate": 0.00016246460205040998, + "loss": 1.026, + "step": 6423 + }, + { + "epoch": 1.1436965811965811, + "grad_norm": 
0.6136734485626221, + "learning_rate": 0.00016245367070011736, + "loss": 0.7811, + "step": 6424 + }, + { + "epoch": 1.1438746438746439, + "grad_norm": 0.6591334342956543, + "learning_rate": 0.00016244273812617482, + "loss": 0.991, + "step": 6425 + }, + { + "epoch": 1.1440527065527066, + "grad_norm": 0.6062475442886353, + "learning_rate": 0.00016243180432879656, + "loss": 0.9879, + "step": 6426 + }, + { + "epoch": 1.1442307692307692, + "grad_norm": 0.5941380858421326, + "learning_rate": 0.00016242086930819678, + "loss": 0.9771, + "step": 6427 + }, + { + "epoch": 1.144408831908832, + "grad_norm": 0.7320533990859985, + "learning_rate": 0.00016240993306458973, + "loss": 1.0919, + "step": 6428 + }, + { + "epoch": 1.1445868945868947, + "grad_norm": 0.6998075246810913, + "learning_rate": 0.00016239899559818962, + "loss": 1.0721, + "step": 6429 + }, + { + "epoch": 1.1447649572649572, + "grad_norm": 0.847931444644928, + "learning_rate": 0.0001623880569092109, + "loss": 0.8759, + "step": 6430 + }, + { + "epoch": 1.14494301994302, + "grad_norm": 0.6670104265213013, + "learning_rate": 0.00016237711699786775, + "loss": 1.0515, + "step": 6431 + }, + { + "epoch": 1.1451210826210827, + "grad_norm": 0.601759672164917, + "learning_rate": 0.00016236617586437463, + "loss": 0.7298, + "step": 6432 + }, + { + "epoch": 1.1452991452991452, + "grad_norm": 0.6411594152450562, + "learning_rate": 0.00016235523350894578, + "loss": 0.9336, + "step": 6433 + }, + { + "epoch": 1.145477207977208, + "grad_norm": 0.6485120058059692, + "learning_rate": 0.0001623442899317957, + "loss": 1.1215, + "step": 6434 + }, + { + "epoch": 1.1456552706552707, + "grad_norm": 0.6041508316993713, + "learning_rate": 0.00016233334513313875, + "loss": 0.8917, + "step": 6435 + }, + { + "epoch": 1.1458333333333333, + "grad_norm": 0.6292745471000671, + "learning_rate": 0.0001623223991131894, + "loss": 0.9976, + "step": 6436 + }, + { + "epoch": 1.146011396011396, + "grad_norm": 0.5442200303077698, + "learning_rate": 
0.0001623114518721621, + "loss": 0.8072, + "step": 6437 + }, + { + "epoch": 1.1461894586894588, + "grad_norm": 0.6668170094490051, + "learning_rate": 0.00016230050341027136, + "loss": 0.9641, + "step": 6438 + }, + { + "epoch": 1.1463675213675213, + "grad_norm": 0.644186794757843, + "learning_rate": 0.00016228955372773164, + "loss": 0.9248, + "step": 6439 + }, + { + "epoch": 1.146545584045584, + "grad_norm": 0.6661991477012634, + "learning_rate": 0.00016227860282475753, + "loss": 0.8719, + "step": 6440 + }, + { + "epoch": 1.1467236467236468, + "grad_norm": 0.5232062935829163, + "learning_rate": 0.00016226765070156355, + "loss": 0.5418, + "step": 6441 + }, + { + "epoch": 1.1469017094017093, + "grad_norm": 0.573176383972168, + "learning_rate": 0.00016225669735836436, + "loss": 1.0858, + "step": 6442 + }, + { + "epoch": 1.147079772079772, + "grad_norm": 0.6137439608573914, + "learning_rate": 0.00016224574279537446, + "loss": 1.1205, + "step": 6443 + }, + { + "epoch": 1.1472578347578348, + "grad_norm": 0.6328136920928955, + "learning_rate": 0.00016223478701280855, + "loss": 0.8957, + "step": 6444 + }, + { + "epoch": 1.1474358974358974, + "grad_norm": 0.6687374114990234, + "learning_rate": 0.00016222383001088126, + "loss": 1.0318, + "step": 6445 + }, + { + "epoch": 1.14761396011396, + "grad_norm": 0.6057115793228149, + "learning_rate": 0.0001622128717898073, + "loss": 0.9575, + "step": 6446 + }, + { + "epoch": 1.1477920227920229, + "grad_norm": 0.6758735775947571, + "learning_rate": 0.0001622019123498013, + "loss": 1.2273, + "step": 6447 + }, + { + "epoch": 1.1479700854700854, + "grad_norm": 0.6233550310134888, + "learning_rate": 0.0001621909516910781, + "loss": 0.7875, + "step": 6448 + }, + { + "epoch": 1.1481481481481481, + "grad_norm": 0.6371827721595764, + "learning_rate": 0.0001621799898138524, + "loss": 1.0488, + "step": 6449 + }, + { + "epoch": 1.148326210826211, + "grad_norm": 0.6179831624031067, + "learning_rate": 0.00016216902671833892, + "loss": 0.9792, + 
"step": 6450 + }, + { + "epoch": 1.1485042735042734, + "grad_norm": 0.6234193444252014, + "learning_rate": 0.00016215806240475256, + "loss": 0.927, + "step": 6451 + }, + { + "epoch": 1.1486823361823362, + "grad_norm": 0.6940563917160034, + "learning_rate": 0.00016214709687330803, + "loss": 1.047, + "step": 6452 + }, + { + "epoch": 1.148860398860399, + "grad_norm": 0.6567606925964355, + "learning_rate": 0.00016213613012422027, + "loss": 0.9695, + "step": 6453 + }, + { + "epoch": 1.1490384615384615, + "grad_norm": 0.7374183535575867, + "learning_rate": 0.0001621251621577041, + "loss": 1.0443, + "step": 6454 + }, + { + "epoch": 1.1492165242165242, + "grad_norm": 0.6789869666099548, + "learning_rate": 0.00016211419297397443, + "loss": 1.0319, + "step": 6455 + }, + { + "epoch": 1.149394586894587, + "grad_norm": 0.6225521564483643, + "learning_rate": 0.00016210322257324619, + "loss": 1.0529, + "step": 6456 + }, + { + "epoch": 1.1495726495726495, + "grad_norm": 0.619701623916626, + "learning_rate": 0.00016209225095573432, + "loss": 0.962, + "step": 6457 + }, + { + "epoch": 1.1497507122507122, + "grad_norm": 0.6132834553718567, + "learning_rate": 0.00016208127812165375, + "loss": 0.9588, + "step": 6458 + }, + { + "epoch": 1.149928774928775, + "grad_norm": 0.6005367040634155, + "learning_rate": 0.00016207030407121954, + "loss": 0.9497, + "step": 6459 + }, + { + "epoch": 1.1501068376068375, + "grad_norm": 0.575309157371521, + "learning_rate": 0.00016205932880464664, + "loss": 1.0035, + "step": 6460 + }, + { + "epoch": 1.1502849002849003, + "grad_norm": 0.5958710312843323, + "learning_rate": 0.0001620483523221501, + "loss": 1.0004, + "step": 6461 + }, + { + "epoch": 1.150462962962963, + "grad_norm": 0.5934719443321228, + "learning_rate": 0.000162037374623945, + "loss": 0.8694, + "step": 6462 + }, + { + "epoch": 1.1506410256410255, + "grad_norm": 0.6042510271072388, + "learning_rate": 0.00016202639571024643, + "loss": 0.8598, + "step": 6463 + }, + { + "epoch": 
1.1508190883190883, + "grad_norm": 0.6206158399581909, + "learning_rate": 0.00016201541558126946, + "loss": 0.961, + "step": 6464 + }, + { + "epoch": 1.150997150997151, + "grad_norm": 0.5997715592384338, + "learning_rate": 0.00016200443423722925, + "loss": 0.8686, + "step": 6465 + }, + { + "epoch": 1.1511752136752136, + "grad_norm": 0.742457926273346, + "learning_rate": 0.00016199345167834098, + "loss": 1.1113, + "step": 6466 + }, + { + "epoch": 1.1513532763532763, + "grad_norm": 0.6772766709327698, + "learning_rate": 0.00016198246790481976, + "loss": 1.0717, + "step": 6467 + }, + { + "epoch": 1.151531339031339, + "grad_norm": 0.6127712726593018, + "learning_rate": 0.0001619714829168809, + "loss": 0.8887, + "step": 6468 + }, + { + "epoch": 1.1517094017094016, + "grad_norm": 0.5585067272186279, + "learning_rate": 0.00016196049671473954, + "loss": 1.0144, + "step": 6469 + }, + { + "epoch": 1.1518874643874644, + "grad_norm": 0.6269431710243225, + "learning_rate": 0.00016194950929861092, + "loss": 1.0206, + "step": 6470 + }, + { + "epoch": 1.1520655270655271, + "grad_norm": 0.6270785331726074, + "learning_rate": 0.0001619385206687104, + "loss": 1.0517, + "step": 6471 + }, + { + "epoch": 1.1522435897435896, + "grad_norm": 0.744712233543396, + "learning_rate": 0.00016192753082525322, + "loss": 1.0699, + "step": 6472 + }, + { + "epoch": 1.1524216524216524, + "grad_norm": 0.7025929689407349, + "learning_rate": 0.00016191653976845474, + "loss": 0.951, + "step": 6473 + }, + { + "epoch": 1.1525997150997151, + "grad_norm": 0.6175379753112793, + "learning_rate": 0.00016190554749853024, + "loss": 1.2153, + "step": 6474 + }, + { + "epoch": 1.1527777777777777, + "grad_norm": 0.6212149858474731, + "learning_rate": 0.00016189455401569513, + "loss": 1.0428, + "step": 6475 + }, + { + "epoch": 1.1529558404558404, + "grad_norm": 0.6716817617416382, + "learning_rate": 0.00016188355932016484, + "loss": 1.179, + "step": 6476 + }, + { + "epoch": 1.1531339031339032, + "grad_norm": 
0.6247739791870117, + "learning_rate": 0.00016187256341215476, + "loss": 0.9451, + "step": 6477 + }, + { + "epoch": 1.153311965811966, + "grad_norm": 0.6223008036613464, + "learning_rate": 0.00016186156629188032, + "loss": 0.9915, + "step": 6478 + }, + { + "epoch": 1.1534900284900285, + "grad_norm": 0.5610866546630859, + "learning_rate": 0.000161850567959557, + "loss": 0.7741, + "step": 6479 + }, + { + "epoch": 1.1536680911680912, + "grad_norm": 0.6241226196289062, + "learning_rate": 0.0001618395684154003, + "loss": 1.2193, + "step": 6480 + }, + { + "epoch": 1.1538461538461537, + "grad_norm": 0.703789472579956, + "learning_rate": 0.00016182856765962567, + "loss": 1.0725, + "step": 6481 + }, + { + "epoch": 1.1540242165242165, + "grad_norm": 0.6802006959915161, + "learning_rate": 0.00016181756569244872, + "loss": 1.0908, + "step": 6482 + }, + { + "epoch": 1.1542022792022792, + "grad_norm": 0.6504136919975281, + "learning_rate": 0.000161806562514085, + "loss": 0.9706, + "step": 6483 + }, + { + "epoch": 1.154380341880342, + "grad_norm": 0.7217034101486206, + "learning_rate": 0.00016179555812475003, + "loss": 0.9084, + "step": 6484 + }, + { + "epoch": 1.1545584045584045, + "grad_norm": 0.5919039249420166, + "learning_rate": 0.0001617845525246595, + "loss": 0.949, + "step": 6485 + }, + { + "epoch": 1.1547364672364673, + "grad_norm": 0.6160184741020203, + "learning_rate": 0.00016177354571402902, + "loss": 0.8144, + "step": 6486 + }, + { + "epoch": 1.1549145299145298, + "grad_norm": 0.7323806285858154, + "learning_rate": 0.00016176253769307426, + "loss": 1.0528, + "step": 6487 + }, + { + "epoch": 1.1550925925925926, + "grad_norm": 0.6051317453384399, + "learning_rate": 0.0001617515284620108, + "loss": 0.9558, + "step": 6488 + }, + { + "epoch": 1.1552706552706553, + "grad_norm": 0.6418905258178711, + "learning_rate": 0.00016174051802105447, + "loss": 1.062, + "step": 6489 + }, + { + "epoch": 1.155448717948718, + "grad_norm": 0.6914883852005005, + "learning_rate": 
0.00016172950637042096, + "loss": 0.9999, + "step": 6490 + }, + { + "epoch": 1.1556267806267806, + "grad_norm": 0.5558316707611084, + "learning_rate": 0.000161718493510326, + "loss": 0.9561, + "step": 6491 + }, + { + "epoch": 1.1558048433048433, + "grad_norm": 0.6632496118545532, + "learning_rate": 0.00016170747944098531, + "loss": 1.0133, + "step": 6492 + }, + { + "epoch": 1.1559829059829059, + "grad_norm": 0.6407149434089661, + "learning_rate": 0.00016169646416261478, + "loss": 1.0563, + "step": 6493 + }, + { + "epoch": 1.1561609686609686, + "grad_norm": 0.8128494024276733, + "learning_rate": 0.0001616854476754302, + "loss": 1.1559, + "step": 6494 + }, + { + "epoch": 1.1563390313390314, + "grad_norm": 0.6403429508209229, + "learning_rate": 0.00016167442997964742, + "loss": 1.0983, + "step": 6495 + }, + { + "epoch": 1.1565170940170941, + "grad_norm": 0.76612788438797, + "learning_rate": 0.0001616634110754823, + "loss": 0.973, + "step": 6496 + }, + { + "epoch": 1.1566951566951567, + "grad_norm": 0.6914355754852295, + "learning_rate": 0.0001616523909631507, + "loss": 0.9307, + "step": 6497 + }, + { + "epoch": 1.1568732193732194, + "grad_norm": 0.546602725982666, + "learning_rate": 0.00016164136964286863, + "loss": 1.0328, + "step": 6498 + }, + { + "epoch": 1.157051282051282, + "grad_norm": 0.5695818662643433, + "learning_rate": 0.00016163034711485193, + "loss": 0.9607, + "step": 6499 + }, + { + "epoch": 1.1572293447293447, + "grad_norm": 0.5649738311767578, + "learning_rate": 0.00016161932337931662, + "loss": 1.1521, + "step": 6500 + }, + { + "epoch": 1.1574074074074074, + "grad_norm": 0.6437582969665527, + "learning_rate": 0.00016160829843647867, + "loss": 0.9613, + "step": 6501 + }, + { + "epoch": 1.1575854700854702, + "grad_norm": 0.5841929316520691, + "learning_rate": 0.0001615972722865541, + "loss": 0.8187, + "step": 6502 + }, + { + "epoch": 1.1577635327635327, + "grad_norm": 0.6481246948242188, + "learning_rate": 0.00016158624492975892, + "loss": 1.0447, + 
"step": 6503 + }, + { + "epoch": 1.1579415954415955, + "grad_norm": 0.629804790019989, + "learning_rate": 0.0001615752163663092, + "loss": 0.9034, + "step": 6504 + }, + { + "epoch": 1.158119658119658, + "grad_norm": 0.5797054171562195, + "learning_rate": 0.00016156418659642104, + "loss": 0.8168, + "step": 6505 + }, + { + "epoch": 1.1582977207977208, + "grad_norm": 0.588424563407898, + "learning_rate": 0.00016155315562031052, + "loss": 0.828, + "step": 6506 + }, + { + "epoch": 1.1584757834757835, + "grad_norm": 0.7120068669319153, + "learning_rate": 0.0001615421234381938, + "loss": 1.0637, + "step": 6507 + }, + { + "epoch": 1.1586538461538463, + "grad_norm": 0.6635081768035889, + "learning_rate": 0.00016153109005028702, + "loss": 0.9838, + "step": 6508 + }, + { + "epoch": 1.1588319088319088, + "grad_norm": 0.6080414056777954, + "learning_rate": 0.00016152005545680634, + "loss": 0.983, + "step": 6509 + }, + { + "epoch": 1.1590099715099715, + "grad_norm": 0.7131237983703613, + "learning_rate": 0.00016150901965796796, + "loss": 1.1053, + "step": 6510 + }, + { + "epoch": 1.159188034188034, + "grad_norm": 0.6051005125045776, + "learning_rate": 0.00016149798265398813, + "loss": 0.9903, + "step": 6511 + }, + { + "epoch": 1.1593660968660968, + "grad_norm": 0.6193733811378479, + "learning_rate": 0.00016148694444508306, + "loss": 1.0478, + "step": 6512 + }, + { + "epoch": 1.1595441595441596, + "grad_norm": 0.567888081073761, + "learning_rate": 0.00016147590503146905, + "loss": 0.7995, + "step": 6513 + }, + { + "epoch": 1.1597222222222223, + "grad_norm": 0.6889783143997192, + "learning_rate": 0.00016146486441336242, + "loss": 0.9684, + "step": 6514 + }, + { + "epoch": 1.1599002849002849, + "grad_norm": 0.6470308303833008, + "learning_rate": 0.0001614538225909794, + "loss": 0.9824, + "step": 6515 + }, + { + "epoch": 1.1600783475783476, + "grad_norm": 0.6833886504173279, + "learning_rate": 0.00016144277956453638, + "loss": 0.9845, + "step": 6516 + }, + { + "epoch": 
1.1602564102564104, + "grad_norm": 0.5827815532684326, + "learning_rate": 0.00016143173533424978, + "loss": 0.9476, + "step": 6517 + }, + { + "epoch": 1.1604344729344729, + "grad_norm": 0.6701242327690125, + "learning_rate": 0.00016142068990033593, + "loss": 1.0839, + "step": 6518 + }, + { + "epoch": 1.1606125356125356, + "grad_norm": 0.5844996571540833, + "learning_rate": 0.00016140964326301122, + "loss": 0.8861, + "step": 6519 + }, + { + "epoch": 1.1607905982905984, + "grad_norm": 0.5831994414329529, + "learning_rate": 0.00016139859542249214, + "loss": 0.9817, + "step": 6520 + }, + { + "epoch": 1.160968660968661, + "grad_norm": 0.6830124855041504, + "learning_rate": 0.0001613875463789951, + "loss": 0.8749, + "step": 6521 + }, + { + "epoch": 1.1611467236467237, + "grad_norm": 0.6003018021583557, + "learning_rate": 0.00016137649613273667, + "loss": 0.9593, + "step": 6522 + }, + { + "epoch": 1.1613247863247864, + "grad_norm": 0.5973994731903076, + "learning_rate": 0.00016136544468393327, + "loss": 1.0384, + "step": 6523 + }, + { + "epoch": 1.161502849002849, + "grad_norm": 0.6702523827552795, + "learning_rate": 0.00016135439203280143, + "loss": 1.0431, + "step": 6524 + }, + { + "epoch": 1.1616809116809117, + "grad_norm": 0.6160697937011719, + "learning_rate": 0.00016134333817955775, + "loss": 1.0339, + "step": 6525 + }, + { + "epoch": 1.1618589743589745, + "grad_norm": 0.7078264355659485, + "learning_rate": 0.0001613322831244188, + "loss": 1.0285, + "step": 6526 + }, + { + "epoch": 1.162037037037037, + "grad_norm": 0.5744216442108154, + "learning_rate": 0.00016132122686760117, + "loss": 0.6589, + "step": 6527 + }, + { + "epoch": 1.1622150997150997, + "grad_norm": 0.6802098155021667, + "learning_rate": 0.00016131016940932146, + "loss": 0.9532, + "step": 6528 + }, + { + "epoch": 1.1623931623931625, + "grad_norm": 0.6523237228393555, + "learning_rate": 0.00016129911074979635, + "loss": 0.9409, + "step": 6529 + }, + { + "epoch": 1.162571225071225, + "grad_norm": 
0.710307776927948, + "learning_rate": 0.00016128805088924252, + "loss": 1.2536, + "step": 6530 + }, + { + "epoch": 1.1627492877492878, + "grad_norm": 0.6349819898605347, + "learning_rate": 0.0001612769898278766, + "loss": 1.0857, + "step": 6531 + }, + { + "epoch": 1.1629273504273505, + "grad_norm": 0.5348139405250549, + "learning_rate": 0.00016126592756591542, + "loss": 0.5969, + "step": 6532 + }, + { + "epoch": 1.163105413105413, + "grad_norm": 0.635619580745697, + "learning_rate": 0.00016125486410357564, + "loss": 0.9885, + "step": 6533 + }, + { + "epoch": 1.1632834757834758, + "grad_norm": 0.6434559226036072, + "learning_rate": 0.000161243799441074, + "loss": 0.8377, + "step": 6534 + }, + { + "epoch": 1.1634615384615385, + "grad_norm": 0.6509647369384766, + "learning_rate": 0.00016123273357862737, + "loss": 0.8393, + "step": 6535 + }, + { + "epoch": 1.163639601139601, + "grad_norm": 0.6179081797599792, + "learning_rate": 0.0001612216665164525, + "loss": 0.9143, + "step": 6536 + }, + { + "epoch": 1.1638176638176638, + "grad_norm": 0.5923223495483398, + "learning_rate": 0.0001612105982547663, + "loss": 1.0185, + "step": 6537 + }, + { + "epoch": 1.1639957264957266, + "grad_norm": 0.702150285243988, + "learning_rate": 0.00016119952879378556, + "loss": 0.863, + "step": 6538 + }, + { + "epoch": 1.164173789173789, + "grad_norm": 0.6596643328666687, + "learning_rate": 0.00016118845813372715, + "loss": 1.0089, + "step": 6539 + }, + { + "epoch": 1.1643518518518519, + "grad_norm": 0.7675769329071045, + "learning_rate": 0.00016117738627480804, + "loss": 1.0179, + "step": 6540 + }, + { + "epoch": 1.1645299145299146, + "grad_norm": 0.6742541193962097, + "learning_rate": 0.00016116631321724513, + "loss": 1.0663, + "step": 6541 + }, + { + "epoch": 1.1647079772079771, + "grad_norm": 0.7379785776138306, + "learning_rate": 0.0001611552389612554, + "loss": 1.0162, + "step": 6542 + }, + { + "epoch": 1.16488603988604, + "grad_norm": 0.5729365944862366, + "learning_rate": 
0.00016114416350705577, + "loss": 0.8146, + "step": 6543 + }, + { + "epoch": 1.1650641025641026, + "grad_norm": 0.6481349468231201, + "learning_rate": 0.00016113308685486327, + "loss": 1.0748, + "step": 6544 + }, + { + "epoch": 1.1652421652421652, + "grad_norm": 0.5588181018829346, + "learning_rate": 0.00016112200900489493, + "loss": 0.7511, + "step": 6545 + }, + { + "epoch": 1.165420227920228, + "grad_norm": 0.674363911151886, + "learning_rate": 0.0001611109299573678, + "loss": 0.9852, + "step": 6546 + }, + { + "epoch": 1.1655982905982907, + "grad_norm": 0.6712620854377747, + "learning_rate": 0.00016109984971249893, + "loss": 0.9558, + "step": 6547 + }, + { + "epoch": 1.1657763532763532, + "grad_norm": 0.5260626077651978, + "learning_rate": 0.00016108876827050544, + "loss": 0.7008, + "step": 6548 + }, + { + "epoch": 1.165954415954416, + "grad_norm": 0.6056292057037354, + "learning_rate": 0.00016107768563160445, + "loss": 0.7756, + "step": 6549 + }, + { + "epoch": 1.1661324786324787, + "grad_norm": 0.5725821256637573, + "learning_rate": 0.00016106660179601308, + "loss": 0.8228, + "step": 6550 + }, + { + "epoch": 1.1663105413105412, + "grad_norm": 0.6708397269248962, + "learning_rate": 0.00016105551676394848, + "loss": 1.0711, + "step": 6551 + }, + { + "epoch": 1.166488603988604, + "grad_norm": 0.645453155040741, + "learning_rate": 0.00016104443053562787, + "loss": 0.9299, + "step": 6552 + }, + { + "epoch": 1.1666666666666667, + "grad_norm": 0.6743524074554443, + "learning_rate": 0.00016103334311126847, + "loss": 0.8977, + "step": 6553 + }, + { + "epoch": 1.1668447293447293, + "grad_norm": 0.7248545289039612, + "learning_rate": 0.0001610222544910875, + "loss": 1.2135, + "step": 6554 + }, + { + "epoch": 1.167022792022792, + "grad_norm": 0.5798853635787964, + "learning_rate": 0.00016101116467530217, + "loss": 0.857, + "step": 6555 + }, + { + "epoch": 1.1672008547008548, + "grad_norm": 0.6828082799911499, + "learning_rate": 0.00016100007366412985, + "loss": 0.9405, + 
"step": 6556 + }, + { + "epoch": 1.1673789173789173, + "grad_norm": 0.6820163130760193, + "learning_rate": 0.0001609889814577878, + "loss": 0.9144, + "step": 6557 + }, + { + "epoch": 1.16755698005698, + "grad_norm": 0.6482275128364563, + "learning_rate": 0.00016097788805649333, + "loss": 0.8586, + "step": 6558 + }, + { + "epoch": 1.1677350427350428, + "grad_norm": 0.6404715180397034, + "learning_rate": 0.00016096679346046385, + "loss": 0.7018, + "step": 6559 + }, + { + "epoch": 1.1679131054131053, + "grad_norm": 0.6315203309059143, + "learning_rate": 0.0001609556976699167, + "loss": 0.9602, + "step": 6560 + }, + { + "epoch": 1.168091168091168, + "grad_norm": 0.5521387457847595, + "learning_rate": 0.00016094460068506925, + "loss": 0.9294, + "step": 6561 + }, + { + "epoch": 1.1682692307692308, + "grad_norm": 0.583372175693512, + "learning_rate": 0.00016093350250613895, + "loss": 1.077, + "step": 6562 + }, + { + "epoch": 1.1684472934472934, + "grad_norm": 0.5990512371063232, + "learning_rate": 0.00016092240313334325, + "loss": 1.0102, + "step": 6563 + }, + { + "epoch": 1.1686253561253561, + "grad_norm": 0.675128161907196, + "learning_rate": 0.00016091130256689964, + "loss": 1.0407, + "step": 6564 + }, + { + "epoch": 1.1688034188034189, + "grad_norm": 0.48797324299812317, + "learning_rate": 0.00016090020080702556, + "loss": 0.7821, + "step": 6565 + }, + { + "epoch": 1.1689814814814814, + "grad_norm": 0.7487484216690063, + "learning_rate": 0.00016088909785393857, + "loss": 1.0444, + "step": 6566 + }, + { + "epoch": 1.1691595441595442, + "grad_norm": 0.6288858652114868, + "learning_rate": 0.00016087799370785618, + "loss": 1.1854, + "step": 6567 + }, + { + "epoch": 1.169337606837607, + "grad_norm": 0.6639021635055542, + "learning_rate": 0.000160866888368996, + "loss": 0.9632, + "step": 6568 + }, + { + "epoch": 1.1695156695156694, + "grad_norm": 0.6553738713264465, + "learning_rate": 0.00016085578183757556, + "loss": 1.2765, + "step": 6569 + }, + { + "epoch": 
1.1696937321937322, + "grad_norm": 0.7489066123962402, + "learning_rate": 0.00016084467411381248, + "loss": 1.0705, + "step": 6570 + }, + { + "epoch": 1.169871794871795, + "grad_norm": 0.7079828381538391, + "learning_rate": 0.00016083356519792444, + "loss": 0.8256, + "step": 6571 + }, + { + "epoch": 1.1700498575498575, + "grad_norm": 0.7065926790237427, + "learning_rate": 0.00016082245509012902, + "loss": 1.0439, + "step": 6572 + }, + { + "epoch": 1.1702279202279202, + "grad_norm": 0.6113346815109253, + "learning_rate": 0.00016081134379064395, + "loss": 0.9153, + "step": 6573 + }, + { + "epoch": 1.170405982905983, + "grad_norm": 0.6094171404838562, + "learning_rate": 0.0001608002312996869, + "loss": 0.9723, + "step": 6574 + }, + { + "epoch": 1.1705840455840455, + "grad_norm": 0.6208072900772095, + "learning_rate": 0.00016078911761747565, + "loss": 0.948, + "step": 6575 + }, + { + "epoch": 1.1707621082621082, + "grad_norm": 0.5736680626869202, + "learning_rate": 0.00016077800274422792, + "loss": 0.9155, + "step": 6576 + }, + { + "epoch": 1.170940170940171, + "grad_norm": 0.6793957948684692, + "learning_rate": 0.0001607668866801615, + "loss": 0.9574, + "step": 6577 + }, + { + "epoch": 1.1711182336182335, + "grad_norm": 0.6251805424690247, + "learning_rate": 0.00016075576942549413, + "loss": 1.0319, + "step": 6578 + }, + { + "epoch": 1.1712962962962963, + "grad_norm": 0.628882110118866, + "learning_rate": 0.0001607446509804437, + "loss": 0.9336, + "step": 6579 + }, + { + "epoch": 1.171474358974359, + "grad_norm": 0.6712356805801392, + "learning_rate": 0.000160733531345228, + "loss": 1.0958, + "step": 6580 + }, + { + "epoch": 1.1716524216524216, + "grad_norm": 0.599365770816803, + "learning_rate": 0.0001607224105200649, + "loss": 0.9814, + "step": 6581 + }, + { + "epoch": 1.1718304843304843, + "grad_norm": 0.5798245668411255, + "learning_rate": 0.00016071128850517235, + "loss": 1.0355, + "step": 6582 + }, + { + "epoch": 1.172008547008547, + "grad_norm": 
0.7646229863166809, + "learning_rate": 0.00016070016530076817, + "loss": 0.9976, + "step": 6583 + }, + { + "epoch": 1.1721866096866096, + "grad_norm": 0.6371127367019653, + "learning_rate": 0.0001606890409070704, + "loss": 0.9588, + "step": 6584 + }, + { + "epoch": 1.1723646723646723, + "grad_norm": 0.6497066617012024, + "learning_rate": 0.0001606779153242969, + "loss": 0.8817, + "step": 6585 + }, + { + "epoch": 1.172542735042735, + "grad_norm": 0.7255781888961792, + "learning_rate": 0.0001606667885526657, + "loss": 1.1319, + "step": 6586 + }, + { + "epoch": 1.1727207977207976, + "grad_norm": 0.67711341381073, + "learning_rate": 0.00016065566059239483, + "loss": 1.0755, + "step": 6587 + }, + { + "epoch": 1.1728988603988604, + "grad_norm": 0.6159650087356567, + "learning_rate": 0.00016064453144370227, + "loss": 0.9892, + "step": 6588 + }, + { + "epoch": 1.1730769230769231, + "grad_norm": 0.658938467502594, + "learning_rate": 0.00016063340110680609, + "loss": 0.9131, + "step": 6589 + }, + { + "epoch": 1.1732549857549857, + "grad_norm": 0.6754795908927917, + "learning_rate": 0.00016062226958192438, + "loss": 1.0119, + "step": 6590 + }, + { + "epoch": 1.1734330484330484, + "grad_norm": 0.6453405022621155, + "learning_rate": 0.00016061113686927523, + "loss": 0.997, + "step": 6591 + }, + { + "epoch": 1.1736111111111112, + "grad_norm": 0.6580284237861633, + "learning_rate": 0.00016060000296907675, + "loss": 0.8432, + "step": 6592 + }, + { + "epoch": 1.173789173789174, + "grad_norm": 0.6588153839111328, + "learning_rate": 0.00016058886788154712, + "loss": 1.0725, + "step": 6593 + }, + { + "epoch": 1.1739672364672364, + "grad_norm": 0.6247910857200623, + "learning_rate": 0.00016057773160690447, + "loss": 0.8736, + "step": 6594 + }, + { + "epoch": 1.1741452991452992, + "grad_norm": 0.579594075679779, + "learning_rate": 0.000160566594145367, + "loss": 0.8809, + "step": 6595 + }, + { + "epoch": 1.1743233618233617, + "grad_norm": 0.6738116145133972, + "learning_rate": 
0.00016055545549715293, + "loss": 0.825, + "step": 6596 + }, + { + "epoch": 1.1745014245014245, + "grad_norm": 0.6658982634544373, + "learning_rate": 0.00016054431566248054, + "loss": 1.0809, + "step": 6597 + }, + { + "epoch": 1.1746794871794872, + "grad_norm": 0.5367915630340576, + "learning_rate": 0.00016053317464156803, + "loss": 0.9005, + "step": 6598 + }, + { + "epoch": 1.17485754985755, + "grad_norm": 0.7243228554725647, + "learning_rate": 0.00016052203243463372, + "loss": 1.0573, + "step": 6599 + }, + { + "epoch": 1.1750356125356125, + "grad_norm": 0.6359432935714722, + "learning_rate": 0.0001605108890418959, + "loss": 0.8569, + "step": 6600 + }, + { + "epoch": 1.1752136752136753, + "grad_norm": 0.6565225720405579, + "learning_rate": 0.0001604997444635729, + "loss": 0.9748, + "step": 6601 + }, + { + "epoch": 1.1753917378917378, + "grad_norm": 0.7124663591384888, + "learning_rate": 0.0001604885986998831, + "loss": 1.0271, + "step": 6602 + }, + { + "epoch": 1.1755698005698005, + "grad_norm": 0.659766435623169, + "learning_rate": 0.00016047745175104487, + "loss": 1.0635, + "step": 6603 + }, + { + "epoch": 1.1757478632478633, + "grad_norm": 0.5874318480491638, + "learning_rate": 0.00016046630361727656, + "loss": 0.9257, + "step": 6604 + }, + { + "epoch": 1.175925925925926, + "grad_norm": 0.587345540523529, + "learning_rate": 0.0001604551542987967, + "loss": 1.0759, + "step": 6605 + }, + { + "epoch": 1.1761039886039886, + "grad_norm": 0.733567476272583, + "learning_rate": 0.00016044400379582364, + "loss": 0.9877, + "step": 6606 + }, + { + "epoch": 1.1762820512820513, + "grad_norm": 0.6538317203521729, + "learning_rate": 0.0001604328521085759, + "loss": 1.0094, + "step": 6607 + }, + { + "epoch": 1.1764601139601139, + "grad_norm": 0.6279696822166443, + "learning_rate": 0.00016042169923727195, + "loss": 1.1049, + "step": 6608 + }, + { + "epoch": 1.1766381766381766, + "grad_norm": 0.6949752569198608, + "learning_rate": 0.00016041054518213033, + "loss": 1.1418, + 
"step": 6609 + }, + { + "epoch": 1.1768162393162394, + "grad_norm": 0.6144010424613953, + "learning_rate": 0.00016039938994336957, + "loss": 1.0306, + "step": 6610 + }, + { + "epoch": 1.176994301994302, + "grad_norm": 0.5868683457374573, + "learning_rate": 0.00016038823352120823, + "loss": 0.9894, + "step": 6611 + }, + { + "epoch": 1.1771723646723646, + "grad_norm": 0.7181115746498108, + "learning_rate": 0.0001603770759158649, + "loss": 1.1674, + "step": 6612 + }, + { + "epoch": 1.1773504273504274, + "grad_norm": 0.6271308064460754, + "learning_rate": 0.00016036591712755818, + "loss": 0.9726, + "step": 6613 + }, + { + "epoch": 1.17752849002849, + "grad_norm": 0.6922675371170044, + "learning_rate": 0.00016035475715650668, + "loss": 0.9142, + "step": 6614 + }, + { + "epoch": 1.1777065527065527, + "grad_norm": 0.6838833689689636, + "learning_rate": 0.00016034359600292913, + "loss": 1.1627, + "step": 6615 + }, + { + "epoch": 1.1778846153846154, + "grad_norm": 0.6628252267837524, + "learning_rate": 0.00016033243366704418, + "loss": 0.739, + "step": 6616 + }, + { + "epoch": 1.1780626780626782, + "grad_norm": 0.6367576122283936, + "learning_rate": 0.0001603212701490705, + "loss": 0.9015, + "step": 6617 + }, + { + "epoch": 1.1782407407407407, + "grad_norm": 0.6498967409133911, + "learning_rate": 0.00016031010544922687, + "loss": 0.9645, + "step": 6618 + }, + { + "epoch": 1.1784188034188035, + "grad_norm": 0.468795508146286, + "learning_rate": 0.00016029893956773198, + "loss": 0.7305, + "step": 6619 + }, + { + "epoch": 1.178596866096866, + "grad_norm": 0.6355500817298889, + "learning_rate": 0.00016028777250480465, + "loss": 0.9183, + "step": 6620 + }, + { + "epoch": 1.1787749287749287, + "grad_norm": 0.7582615613937378, + "learning_rate": 0.0001602766042606636, + "loss": 1.1641, + "step": 6621 + }, + { + "epoch": 1.1789529914529915, + "grad_norm": 0.580035924911499, + "learning_rate": 0.00016026543483552776, + "loss": 0.9164, + "step": 6622 + }, + { + "epoch": 
1.1791310541310542, + "grad_norm": 0.6198559999465942, + "learning_rate": 0.00016025426422961592, + "loss": 0.9803, + "step": 6623 + }, + { + "epoch": 1.1793091168091168, + "grad_norm": 0.59112149477005, + "learning_rate": 0.0001602430924431469, + "loss": 0.8645, + "step": 6624 + }, + { + "epoch": 1.1794871794871795, + "grad_norm": 0.6200533509254456, + "learning_rate": 0.00016023191947633965, + "loss": 1.068, + "step": 6625 + }, + { + "epoch": 1.179665242165242, + "grad_norm": 0.6077516078948975, + "learning_rate": 0.00016022074532941305, + "loss": 1.0017, + "step": 6626 + }, + { + "epoch": 1.1798433048433048, + "grad_norm": 0.6770145893096924, + "learning_rate": 0.00016020957000258606, + "loss": 0.9022, + "step": 6627 + }, + { + "epoch": 1.1800213675213675, + "grad_norm": 0.6478054523468018, + "learning_rate": 0.0001601983934960776, + "loss": 0.8615, + "step": 6628 + }, + { + "epoch": 1.1801994301994303, + "grad_norm": 0.6528988480567932, + "learning_rate": 0.00016018721581010666, + "loss": 1.0015, + "step": 6629 + }, + { + "epoch": 1.1803774928774928, + "grad_norm": 0.6160712242126465, + "learning_rate": 0.0001601760369448923, + "loss": 0.9382, + "step": 6630 + }, + { + "epoch": 1.1805555555555556, + "grad_norm": 0.5755789875984192, + "learning_rate": 0.00016016485690065345, + "loss": 1.0551, + "step": 6631 + }, + { + "epoch": 1.180733618233618, + "grad_norm": 0.8495022654533386, + "learning_rate": 0.00016015367567760925, + "loss": 0.9295, + "step": 6632 + }, + { + "epoch": 1.1809116809116809, + "grad_norm": 0.6010929346084595, + "learning_rate": 0.0001601424932759787, + "loss": 1.0413, + "step": 6633 + }, + { + "epoch": 1.1810897435897436, + "grad_norm": 0.6953579187393188, + "learning_rate": 0.00016013130969598093, + "loss": 1.0149, + "step": 6634 + }, + { + "epoch": 1.1812678062678064, + "grad_norm": 0.6949529647827148, + "learning_rate": 0.0001601201249378351, + "loss": 0.9992, + "step": 6635 + }, + { + "epoch": 1.181445868945869, + "grad_norm": 
0.6471893787384033, + "learning_rate": 0.00016010893900176028, + "loss": 0.7985, + "step": 6636 + }, + { + "epoch": 1.1816239316239316, + "grad_norm": 0.6524858474731445, + "learning_rate": 0.00016009775188797568, + "loss": 0.9517, + "step": 6637 + }, + { + "epoch": 1.1818019943019944, + "grad_norm": 0.639214038848877, + "learning_rate": 0.00016008656359670046, + "loss": 1.0357, + "step": 6638 + }, + { + "epoch": 1.181980056980057, + "grad_norm": 0.6039628386497498, + "learning_rate": 0.00016007537412815386, + "loss": 1.0536, + "step": 6639 + }, + { + "epoch": 1.1821581196581197, + "grad_norm": 0.653540313243866, + "learning_rate": 0.00016006418348255507, + "loss": 0.9414, + "step": 6640 + }, + { + "epoch": 1.1823361823361824, + "grad_norm": 0.6331741809844971, + "learning_rate": 0.0001600529916601234, + "loss": 1.0352, + "step": 6641 + }, + { + "epoch": 1.182514245014245, + "grad_norm": 0.7552719712257385, + "learning_rate": 0.00016004179866107812, + "loss": 1.1103, + "step": 6642 + }, + { + "epoch": 1.1826923076923077, + "grad_norm": 0.6795875430107117, + "learning_rate": 0.00016003060448563852, + "loss": 1.1246, + "step": 6643 + }, + { + "epoch": 1.1828703703703705, + "grad_norm": 0.6308842301368713, + "learning_rate": 0.0001600194091340239, + "loss": 0.9532, + "step": 6644 + }, + { + "epoch": 1.183048433048433, + "grad_norm": 0.5640553832054138, + "learning_rate": 0.00016000821260645366, + "loss": 0.7491, + "step": 6645 + }, + { + "epoch": 1.1832264957264957, + "grad_norm": 0.5611832141876221, + "learning_rate": 0.00015999701490314712, + "loss": 0.9239, + "step": 6646 + }, + { + "epoch": 1.1834045584045585, + "grad_norm": 0.5881187915802002, + "learning_rate": 0.00015998581602432374, + "loss": 0.9246, + "step": 6647 + }, + { + "epoch": 1.183582621082621, + "grad_norm": 0.7291010022163391, + "learning_rate": 0.00015997461597020291, + "loss": 1.0314, + "step": 6648 + }, + { + "epoch": 1.1837606837606838, + "grad_norm": 0.6784794926643372, + "learning_rate": 
0.00015996341474100402, + "loss": 1.0011, + "step": 6649 + }, + { + "epoch": 1.1839387464387465, + "grad_norm": 0.7083746194839478, + "learning_rate": 0.00015995221233694663, + "loss": 1.0336, + "step": 6650 + }, + { + "epoch": 1.184116809116809, + "grad_norm": 0.7081790566444397, + "learning_rate": 0.00015994100875825015, + "loss": 1.2386, + "step": 6651 + }, + { + "epoch": 1.1842948717948718, + "grad_norm": 0.5938812494277954, + "learning_rate": 0.00015992980400513415, + "loss": 0.7549, + "step": 6652 + }, + { + "epoch": 1.1844729344729346, + "grad_norm": 0.7084267139434814, + "learning_rate": 0.00015991859807781811, + "loss": 1.1194, + "step": 6653 + }, + { + "epoch": 1.184650997150997, + "grad_norm": 0.6391362547874451, + "learning_rate": 0.0001599073909765216, + "loss": 1.0857, + "step": 6654 + }, + { + "epoch": 1.1848290598290598, + "grad_norm": 0.8074106574058533, + "learning_rate": 0.00015989618270146423, + "loss": 1.1715, + "step": 6655 + }, + { + "epoch": 1.1850071225071226, + "grad_norm": 0.5778565406799316, + "learning_rate": 0.0001598849732528656, + "loss": 0.8843, + "step": 6656 + }, + { + "epoch": 1.1851851851851851, + "grad_norm": 0.6955079436302185, + "learning_rate": 0.00015987376263094526, + "loss": 1.0281, + "step": 6657 + }, + { + "epoch": 1.1853632478632479, + "grad_norm": 0.6789296269416809, + "learning_rate": 0.00015986255083592297, + "loss": 0.9739, + "step": 6658 + }, + { + "epoch": 1.1855413105413106, + "grad_norm": 0.6294292211532593, + "learning_rate": 0.00015985133786801834, + "loss": 1.0692, + "step": 6659 + }, + { + "epoch": 1.1857193732193732, + "grad_norm": 0.5604581832885742, + "learning_rate": 0.00015984012372745107, + "loss": 0.9059, + "step": 6660 + }, + { + "epoch": 1.185897435897436, + "grad_norm": 0.6727550625801086, + "learning_rate": 0.00015982890841444088, + "loss": 1.049, + "step": 6661 + }, + { + "epoch": 1.1860754985754987, + "grad_norm": 0.620914101600647, + "learning_rate": 0.0001598176919292075, + "loss": 1.1021, + 
"step": 6662 + }, + { + "epoch": 1.1862535612535612, + "grad_norm": 0.6696683168411255, + "learning_rate": 0.00015980647427197076, + "loss": 0.9053, + "step": 6663 + }, + { + "epoch": 1.186431623931624, + "grad_norm": 0.6713385581970215, + "learning_rate": 0.00015979525544295036, + "loss": 0.9596, + "step": 6664 + }, + { + "epoch": 1.1866096866096867, + "grad_norm": 0.7643477320671082, + "learning_rate": 0.00015978403544236614, + "loss": 0.882, + "step": 6665 + }, + { + "epoch": 1.1867877492877492, + "grad_norm": 0.5890966057777405, + "learning_rate": 0.00015977281427043794, + "loss": 1.0215, + "step": 6666 + }, + { + "epoch": 1.186965811965812, + "grad_norm": 0.7287502288818359, + "learning_rate": 0.0001597615919273856, + "loss": 1.0111, + "step": 6667 + }, + { + "epoch": 1.1871438746438747, + "grad_norm": 0.5713803172111511, + "learning_rate": 0.00015975036841342903, + "loss": 1.0068, + "step": 6668 + }, + { + "epoch": 1.1873219373219372, + "grad_norm": 0.5113094449043274, + "learning_rate": 0.0001597391437287881, + "loss": 0.9018, + "step": 6669 + }, + { + "epoch": 1.1875, + "grad_norm": 0.585640013217926, + "learning_rate": 0.00015972791787368276, + "loss": 1.0375, + "step": 6670 + }, + { + "epoch": 1.1876780626780628, + "grad_norm": 0.5778326392173767, + "learning_rate": 0.00015971669084833293, + "loss": 0.9975, + "step": 6671 + }, + { + "epoch": 1.1878561253561253, + "grad_norm": 0.6707763075828552, + "learning_rate": 0.0001597054626529586, + "loss": 1.0048, + "step": 6672 + }, + { + "epoch": 1.188034188034188, + "grad_norm": 0.6113292574882507, + "learning_rate": 0.00015969423328777974, + "loss": 1.1447, + "step": 6673 + }, + { + "epoch": 1.1882122507122508, + "grad_norm": 0.6075651049613953, + "learning_rate": 0.00015968300275301638, + "loss": 0.9212, + "step": 6674 + }, + { + "epoch": 1.1883903133903133, + "grad_norm": 0.6990494132041931, + "learning_rate": 0.00015967177104888857, + "loss": 0.9952, + "step": 6675 + }, + { + "epoch": 1.188568376068376, + 
"grad_norm": 0.6228706240653992, + "learning_rate": 0.00015966053817561638, + "loss": 1.0187, + "step": 6676 + }, + { + "epoch": 1.1887464387464388, + "grad_norm": 0.6387844085693359, + "learning_rate": 0.00015964930413341985, + "loss": 1.1614, + "step": 6677 + }, + { + "epoch": 1.1889245014245013, + "grad_norm": 0.6501925587654114, + "learning_rate": 0.00015963806892251915, + "loss": 1.0366, + "step": 6678 + }, + { + "epoch": 1.189102564102564, + "grad_norm": 0.6923739910125732, + "learning_rate": 0.00015962683254313435, + "loss": 1.1992, + "step": 6679 + }, + { + "epoch": 1.1892806267806268, + "grad_norm": 0.6640275120735168, + "learning_rate": 0.00015961559499548563, + "loss": 0.8883, + "step": 6680 + }, + { + "epoch": 1.1894586894586894, + "grad_norm": 0.6493857502937317, + "learning_rate": 0.00015960435627979317, + "loss": 1.1368, + "step": 6681 + }, + { + "epoch": 1.1896367521367521, + "grad_norm": 0.6357189416885376, + "learning_rate": 0.0001595931163962772, + "loss": 1.0502, + "step": 6682 + }, + { + "epoch": 1.1898148148148149, + "grad_norm": 0.5756343007087708, + "learning_rate": 0.0001595818753451579, + "loss": 0.9871, + "step": 6683 + }, + { + "epoch": 1.1899928774928774, + "grad_norm": 0.7369210124015808, + "learning_rate": 0.0001595706331266555, + "loss": 1.3229, + "step": 6684 + }, + { + "epoch": 1.1901709401709402, + "grad_norm": 0.7140820622444153, + "learning_rate": 0.0001595593897409903, + "loss": 1.1154, + "step": 6685 + }, + { + "epoch": 1.190349002849003, + "grad_norm": 0.696973443031311, + "learning_rate": 0.00015954814518838255, + "loss": 0.9806, + "step": 6686 + }, + { + "epoch": 1.1905270655270654, + "grad_norm": 0.5299260020256042, + "learning_rate": 0.00015953689946905262, + "loss": 0.771, + "step": 6687 + }, + { + "epoch": 1.1907051282051282, + "grad_norm": 0.6814879775047302, + "learning_rate": 0.00015952565258322085, + "loss": 0.8444, + "step": 6688 + }, + { + "epoch": 1.190883190883191, + "grad_norm": 0.6215870976448059, + 
"learning_rate": 0.00015951440453110754, + "loss": 1.0743, + "step": 6689 + }, + { + "epoch": 1.1910612535612535, + "grad_norm": 0.7017203569412231, + "learning_rate": 0.00015950315531293308, + "loss": 1.185, + "step": 6690 + }, + { + "epoch": 1.1912393162393162, + "grad_norm": 0.7147250175476074, + "learning_rate": 0.00015949190492891795, + "loss": 1.0646, + "step": 6691 + }, + { + "epoch": 1.191417378917379, + "grad_norm": 0.5867117047309875, + "learning_rate": 0.00015948065337928252, + "loss": 1.0554, + "step": 6692 + }, + { + "epoch": 1.1915954415954415, + "grad_norm": 0.6813527345657349, + "learning_rate": 0.0001594694006642472, + "loss": 1.1451, + "step": 6693 + }, + { + "epoch": 1.1917735042735043, + "grad_norm": 0.5192593932151794, + "learning_rate": 0.00015945814678403256, + "loss": 0.7886, + "step": 6694 + }, + { + "epoch": 1.191951566951567, + "grad_norm": 0.6537744402885437, + "learning_rate": 0.00015944689173885904, + "loss": 0.9905, + "step": 6695 + }, + { + "epoch": 1.1921296296296295, + "grad_norm": 0.7350276112556458, + "learning_rate": 0.00015943563552894716, + "loss": 0.9009, + "step": 6696 + }, + { + "epoch": 1.1923076923076923, + "grad_norm": 0.7086381316184998, + "learning_rate": 0.00015942437815451746, + "loss": 0.9117, + "step": 6697 + }, + { + "epoch": 1.192485754985755, + "grad_norm": 0.6774969696998596, + "learning_rate": 0.00015941311961579054, + "loss": 1.1172, + "step": 6698 + }, + { + "epoch": 1.1926638176638176, + "grad_norm": 0.7034362554550171, + "learning_rate": 0.00015940185991298694, + "loss": 0.8054, + "step": 6699 + }, + { + "epoch": 1.1928418803418803, + "grad_norm": 0.66145920753479, + "learning_rate": 0.00015939059904632728, + "loss": 0.7417, + "step": 6700 + }, + { + "epoch": 1.193019943019943, + "grad_norm": 0.6590890884399414, + "learning_rate": 0.00015937933701603223, + "loss": 0.9169, + "step": 6701 + }, + { + "epoch": 1.1931980056980056, + "grad_norm": 0.7492850422859192, + "learning_rate": 0.0001593680738223224, + 
"loss": 1.0529, + "step": 6702 + }, + { + "epoch": 1.1933760683760684, + "grad_norm": 0.7103236317634583, + "learning_rate": 0.00015935680946541848, + "loss": 1.1377, + "step": 6703 + }, + { + "epoch": 1.193554131054131, + "grad_norm": 0.6164175868034363, + "learning_rate": 0.00015934554394554122, + "loss": 0.8636, + "step": 6704 + }, + { + "epoch": 1.1937321937321936, + "grad_norm": 0.6667410135269165, + "learning_rate": 0.0001593342772629113, + "loss": 1.0073, + "step": 6705 + }, + { + "epoch": 1.1939102564102564, + "grad_norm": 0.6785695552825928, + "learning_rate": 0.00015932300941774944, + "loss": 1.0752, + "step": 6706 + }, + { + "epoch": 1.1940883190883191, + "grad_norm": 0.6446872353553772, + "learning_rate": 0.0001593117404102765, + "loss": 0.9509, + "step": 6707 + }, + { + "epoch": 1.194266381766382, + "grad_norm": 0.6607686877250671, + "learning_rate": 0.00015930047024071317, + "loss": 1.0902, + "step": 6708 + }, + { + "epoch": 1.1944444444444444, + "grad_norm": 0.664804995059967, + "learning_rate": 0.0001592891989092803, + "loss": 0.9783, + "step": 6709 + }, + { + "epoch": 1.1946225071225072, + "grad_norm": 0.7147907018661499, + "learning_rate": 0.00015927792641619876, + "loss": 1.0558, + "step": 6710 + }, + { + "epoch": 1.1948005698005697, + "grad_norm": 0.6858944296836853, + "learning_rate": 0.0001592666527616894, + "loss": 1.0514, + "step": 6711 + }, + { + "epoch": 1.1949786324786325, + "grad_norm": 0.598463773727417, + "learning_rate": 0.0001592553779459731, + "loss": 0.8927, + "step": 6712 + }, + { + "epoch": 1.1951566951566952, + "grad_norm": 0.6872668862342834, + "learning_rate": 0.00015924410196927076, + "loss": 1.016, + "step": 6713 + }, + { + "epoch": 1.195334757834758, + "grad_norm": 0.6547996401786804, + "learning_rate": 0.00015923282483180326, + "loss": 1.1573, + "step": 6714 + }, + { + "epoch": 1.1955128205128205, + "grad_norm": 0.6254705786705017, + "learning_rate": 0.00015922154653379167, + "loss": 1.0179, + "step": 6715 + }, + { + 
"epoch": 1.1956908831908832, + "grad_norm": 0.6049207448959351, + "learning_rate": 0.00015921026707545684, + "loss": 1.0713, + "step": 6716 + }, + { + "epoch": 1.1958689458689458, + "grad_norm": 0.6042858958244324, + "learning_rate": 0.0001591989864570199, + "loss": 0.919, + "step": 6717 + }, + { + "epoch": 1.1960470085470085, + "grad_norm": 0.6521187424659729, + "learning_rate": 0.0001591877046787017, + "loss": 1.0112, + "step": 6718 + }, + { + "epoch": 1.1962250712250713, + "grad_norm": 0.766260027885437, + "learning_rate": 0.00015917642174072348, + "loss": 0.9774, + "step": 6719 + }, + { + "epoch": 1.196403133903134, + "grad_norm": 0.7066532373428345, + "learning_rate": 0.00015916513764330613, + "loss": 1.1112, + "step": 6720 + }, + { + "epoch": 1.1965811965811965, + "grad_norm": 0.7351508140563965, + "learning_rate": 0.00015915385238667083, + "loss": 0.9841, + "step": 6721 + }, + { + "epoch": 1.1967592592592593, + "grad_norm": 0.6133812069892883, + "learning_rate": 0.0001591425659710387, + "loss": 0.8629, + "step": 6722 + }, + { + "epoch": 1.1969373219373218, + "grad_norm": 0.7244157791137695, + "learning_rate": 0.00015913127839663083, + "loss": 1.1584, + "step": 6723 + }, + { + "epoch": 1.1971153846153846, + "grad_norm": 0.5986210107803345, + "learning_rate": 0.00015911998966366842, + "loss": 0.8507, + "step": 6724 + }, + { + "epoch": 1.1972934472934473, + "grad_norm": 0.6087439060211182, + "learning_rate": 0.00015910869977237257, + "loss": 0.884, + "step": 6725 + }, + { + "epoch": 1.19747150997151, + "grad_norm": 0.7546007633209229, + "learning_rate": 0.00015909740872296457, + "loss": 1.1449, + "step": 6726 + }, + { + "epoch": 1.1976495726495726, + "grad_norm": 0.6437731385231018, + "learning_rate": 0.0001590861165156656, + "loss": 0.7845, + "step": 6727 + }, + { + "epoch": 1.1978276353276354, + "grad_norm": 0.6281737089157104, + "learning_rate": 0.00015907482315069693, + "loss": 0.8969, + "step": 6728 + }, + { + "epoch": 1.198005698005698, + "grad_norm": 
0.6196113228797913, + "learning_rate": 0.00015906352862827983, + "loss": 1.0264, + "step": 6729 + }, + { + "epoch": 1.1981837606837606, + "grad_norm": 0.5990965962409973, + "learning_rate": 0.00015905223294863553, + "loss": 1.0017, + "step": 6730 + }, + { + "epoch": 1.1983618233618234, + "grad_norm": 0.6509191393852234, + "learning_rate": 0.00015904093611198542, + "loss": 1.1066, + "step": 6731 + }, + { + "epoch": 1.1985398860398861, + "grad_norm": 0.6648043990135193, + "learning_rate": 0.00015902963811855085, + "loss": 1.077, + "step": 6732 + }, + { + "epoch": 1.1987179487179487, + "grad_norm": 0.7071963548660278, + "learning_rate": 0.00015901833896855307, + "loss": 1.1346, + "step": 6733 + }, + { + "epoch": 1.1988960113960114, + "grad_norm": 0.5889959335327148, + "learning_rate": 0.0001590070386622136, + "loss": 0.9525, + "step": 6734 + }, + { + "epoch": 1.199074074074074, + "grad_norm": 0.6233037710189819, + "learning_rate": 0.00015899573719975376, + "loss": 1.0513, + "step": 6735 + }, + { + "epoch": 1.1992521367521367, + "grad_norm": 0.7912302613258362, + "learning_rate": 0.000158984434581395, + "loss": 0.8749, + "step": 6736 + }, + { + "epoch": 1.1994301994301995, + "grad_norm": 0.5783160924911499, + "learning_rate": 0.0001589731308073588, + "loss": 0.7173, + "step": 6737 + }, + { + "epoch": 1.1996082621082622, + "grad_norm": 0.718950092792511, + "learning_rate": 0.00015896182587786658, + "loss": 1.0815, + "step": 6738 + }, + { + "epoch": 1.1997863247863247, + "grad_norm": 0.6700926423072815, + "learning_rate": 0.0001589505197931399, + "loss": 1.0817, + "step": 6739 + }, + { + "epoch": 1.1999643874643875, + "grad_norm": 0.7614455223083496, + "learning_rate": 0.0001589392125534002, + "loss": 0.9707, + "step": 6740 + }, + { + "epoch": 1.20014245014245, + "grad_norm": 0.6998619437217712, + "learning_rate": 0.00015892790415886906, + "loss": 1.0541, + "step": 6741 + }, + { + "epoch": 1.2003205128205128, + "grad_norm": 0.6127668619155884, + "learning_rate": 
0.0001589165946097681, + "loss": 0.9147, + "step": 6742 + }, + { + "epoch": 1.2004985754985755, + "grad_norm": 0.7112005352973938, + "learning_rate": 0.00015890528390631885, + "loss": 0.868, + "step": 6743 + }, + { + "epoch": 1.2006766381766383, + "grad_norm": 0.6631024479866028, + "learning_rate": 0.0001588939720487429, + "loss": 0.9277, + "step": 6744 + }, + { + "epoch": 1.2008547008547008, + "grad_norm": 0.6106321215629578, + "learning_rate": 0.00015888265903726188, + "loss": 1.0223, + "step": 6745 + }, + { + "epoch": 1.2010327635327636, + "grad_norm": 0.6400851607322693, + "learning_rate": 0.00015887134487209753, + "loss": 1.1279, + "step": 6746 + }, + { + "epoch": 1.201210826210826, + "grad_norm": 0.6298650503158569, + "learning_rate": 0.00015886002955347147, + "loss": 0.9481, + "step": 6747 + }, + { + "epoch": 1.2013888888888888, + "grad_norm": 0.647974967956543, + "learning_rate": 0.00015884871308160538, + "loss": 1.1513, + "step": 6748 + }, + { + "epoch": 1.2015669515669516, + "grad_norm": 0.6770651936531067, + "learning_rate": 0.000158837395456721, + "loss": 0.9914, + "step": 6749 + }, + { + "epoch": 1.2017450142450143, + "grad_norm": 0.6708947420120239, + "learning_rate": 0.0001588260766790401, + "loss": 1.1848, + "step": 6750 + }, + { + "epoch": 1.2019230769230769, + "grad_norm": 0.5624440908432007, + "learning_rate": 0.00015881475674878442, + "loss": 0.9848, + "step": 6751 + }, + { + "epoch": 1.2021011396011396, + "grad_norm": 0.5512633919715881, + "learning_rate": 0.00015880343566617575, + "loss": 1.0308, + "step": 6752 + }, + { + "epoch": 1.2022792022792024, + "grad_norm": 0.5621042251586914, + "learning_rate": 0.0001587921134314359, + "loss": 0.8724, + "step": 6753 + }, + { + "epoch": 1.202457264957265, + "grad_norm": 0.6881251931190491, + "learning_rate": 0.00015878079004478675, + "loss": 0.9771, + "step": 6754 + }, + { + "epoch": 1.2026353276353277, + "grad_norm": 0.729998767375946, + "learning_rate": 0.0001587694655064501, + "loss": 1.002, + 
"step": 6755 + }, + { + "epoch": 1.2028133903133904, + "grad_norm": 0.5972567200660706, + "learning_rate": 0.00015875813981664787, + "loss": 1.0571, + "step": 6756 + }, + { + "epoch": 1.202991452991453, + "grad_norm": 0.6319229006767273, + "learning_rate": 0.00015874681297560196, + "loss": 0.9294, + "step": 6757 + }, + { + "epoch": 1.2031695156695157, + "grad_norm": 0.6751521825790405, + "learning_rate": 0.00015873548498353428, + "loss": 0.783, + "step": 6758 + }, + { + "epoch": 1.2033475783475784, + "grad_norm": 0.6476554870605469, + "learning_rate": 0.00015872415584066677, + "loss": 0.8939, + "step": 6759 + }, + { + "epoch": 1.203525641025641, + "grad_norm": 0.6530960202217102, + "learning_rate": 0.0001587128255472214, + "loss": 0.9828, + "step": 6760 + }, + { + "epoch": 1.2037037037037037, + "grad_norm": 0.6708502173423767, + "learning_rate": 0.00015870149410342023, + "loss": 0.9285, + "step": 6761 + }, + { + "epoch": 1.2038817663817665, + "grad_norm": 0.7749543190002441, + "learning_rate": 0.0001586901615094852, + "loss": 1.1295, + "step": 6762 + }, + { + "epoch": 1.204059829059829, + "grad_norm": 0.6750495433807373, + "learning_rate": 0.00015867882776563836, + "loss": 1.0562, + "step": 6763 + }, + { + "epoch": 1.2042378917378918, + "grad_norm": 0.6892416477203369, + "learning_rate": 0.00015866749287210178, + "loss": 0.7207, + "step": 6764 + }, + { + "epoch": 1.2044159544159545, + "grad_norm": 0.7066485285758972, + "learning_rate": 0.00015865615682909758, + "loss": 1.0489, + "step": 6765 + }, + { + "epoch": 1.204594017094017, + "grad_norm": 0.5669938325881958, + "learning_rate": 0.00015864481963684783, + "loss": 0.8149, + "step": 6766 + }, + { + "epoch": 1.2047720797720798, + "grad_norm": 0.6467341780662537, + "learning_rate": 0.0001586334812955746, + "loss": 0.9595, + "step": 6767 + }, + { + "epoch": 1.2049501424501425, + "grad_norm": 0.6026045680046082, + "learning_rate": 0.0001586221418055002, + "loss": 0.9832, + "step": 6768 + }, + { + "epoch": 
1.205128205128205, + "grad_norm": 0.7655174732208252, + "learning_rate": 0.00015861080116684665, + "loss": 0.9796, + "step": 6769 + }, + { + "epoch": 1.2053062678062678, + "grad_norm": 0.6386621594429016, + "learning_rate": 0.00015859945937983624, + "loss": 0.9368, + "step": 6770 + }, + { + "epoch": 1.2054843304843306, + "grad_norm": 0.7088032364845276, + "learning_rate": 0.0001585881164446911, + "loss": 1.0167, + "step": 6771 + }, + { + "epoch": 1.205662393162393, + "grad_norm": 0.6015275716781616, + "learning_rate": 0.0001585767723616336, + "loss": 0.8551, + "step": 6772 + }, + { + "epoch": 1.2058404558404558, + "grad_norm": 0.7013260722160339, + "learning_rate": 0.00015856542713088583, + "loss": 0.8009, + "step": 6773 + }, + { + "epoch": 1.2060185185185186, + "grad_norm": 0.6931240558624268, + "learning_rate": 0.00015855408075267024, + "loss": 0.9964, + "step": 6774 + }, + { + "epoch": 1.2061965811965811, + "grad_norm": 0.7274388670921326, + "learning_rate": 0.00015854273322720908, + "loss": 1.0991, + "step": 6775 + }, + { + "epoch": 1.2063746438746439, + "grad_norm": 0.6353716254234314, + "learning_rate": 0.00015853138455472466, + "loss": 1.0893, + "step": 6776 + }, + { + "epoch": 1.2065527065527066, + "grad_norm": 0.6958979368209839, + "learning_rate": 0.00015852003473543932, + "loss": 1.0238, + "step": 6777 + }, + { + "epoch": 1.2067307692307692, + "grad_norm": 0.626838743686676, + "learning_rate": 0.00015850868376957551, + "loss": 0.9384, + "step": 6778 + }, + { + "epoch": 1.206908831908832, + "grad_norm": 0.5455024242401123, + "learning_rate": 0.00015849733165735556, + "loss": 0.8068, + "step": 6779 + }, + { + "epoch": 1.2070868945868947, + "grad_norm": 0.6337353587150574, + "learning_rate": 0.0001584859783990019, + "loss": 1.1341, + "step": 6780 + }, + { + "epoch": 1.2072649572649572, + "grad_norm": 0.6318019032478333, + "learning_rate": 0.000158474623994737, + "loss": 1.1095, + "step": 6781 + }, + { + "epoch": 1.20744301994302, + "grad_norm": 
0.8183810710906982, + "learning_rate": 0.00015846326844478332, + "loss": 1.1471, + "step": 6782 + }, + { + "epoch": 1.2076210826210827, + "grad_norm": 0.6140483021736145, + "learning_rate": 0.00015845191174936334, + "loss": 0.8538, + "step": 6783 + }, + { + "epoch": 1.2077991452991452, + "grad_norm": 0.7570197582244873, + "learning_rate": 0.0001584405539086996, + "loss": 1.427, + "step": 6784 + }, + { + "epoch": 1.207977207977208, + "grad_norm": 0.7616991996765137, + "learning_rate": 0.00015842919492301455, + "loss": 1.2214, + "step": 6785 + }, + { + "epoch": 1.2081552706552707, + "grad_norm": 0.561996579170227, + "learning_rate": 0.00015841783479253084, + "loss": 0.8916, + "step": 6786 + }, + { + "epoch": 1.2083333333333333, + "grad_norm": 0.6124222874641418, + "learning_rate": 0.000158406473517471, + "loss": 0.9637, + "step": 6787 + }, + { + "epoch": 1.208511396011396, + "grad_norm": 0.6053098440170288, + "learning_rate": 0.00015839511109805762, + "loss": 1.0365, + "step": 6788 + }, + { + "epoch": 1.2086894586894588, + "grad_norm": 0.6451675295829773, + "learning_rate": 0.00015838374753451338, + "loss": 1.0497, + "step": 6789 + }, + { + "epoch": 1.2088675213675213, + "grad_norm": 0.6789399981498718, + "learning_rate": 0.00015837238282706087, + "loss": 0.9286, + "step": 6790 + }, + { + "epoch": 1.209045584045584, + "grad_norm": 0.5742998123168945, + "learning_rate": 0.0001583610169759228, + "loss": 1.082, + "step": 6791 + }, + { + "epoch": 1.2092236467236468, + "grad_norm": 0.6813693642616272, + "learning_rate": 0.0001583496499813218, + "loss": 0.9785, + "step": 6792 + }, + { + "epoch": 1.2094017094017093, + "grad_norm": 0.6150603890419006, + "learning_rate": 0.0001583382818434806, + "loss": 0.9533, + "step": 6793 + }, + { + "epoch": 1.209579772079772, + "grad_norm": 0.6905919909477234, + "learning_rate": 0.000158326912562622, + "loss": 1.0132, + "step": 6794 + }, + { + "epoch": 1.2097578347578348, + "grad_norm": 0.5861411094665527, + "learning_rate": 
0.0001583155421389687, + "loss": 0.7071, + "step": 6795 + }, + { + "epoch": 1.2099358974358974, + "grad_norm": 0.6822740435600281, + "learning_rate": 0.0001583041705727435, + "loss": 1.1366, + "step": 6796 + }, + { + "epoch": 1.21011396011396, + "grad_norm": 0.6013675928115845, + "learning_rate": 0.00015829279786416916, + "loss": 0.9232, + "step": 6797 + }, + { + "epoch": 1.2102920227920229, + "grad_norm": 0.650675356388092, + "learning_rate": 0.00015828142401346857, + "loss": 0.887, + "step": 6798 + }, + { + "epoch": 1.2104700854700854, + "grad_norm": 0.6764078736305237, + "learning_rate": 0.00015827004902086456, + "loss": 0.8423, + "step": 6799 + }, + { + "epoch": 1.2106481481481481, + "grad_norm": 0.6460821628570557, + "learning_rate": 0.00015825867288657994, + "loss": 1.0074, + "step": 6800 + }, + { + "epoch": 1.210826210826211, + "grad_norm": 0.692562997341156, + "learning_rate": 0.00015824729561083768, + "loss": 0.7978, + "step": 6801 + }, + { + "epoch": 1.2110042735042734, + "grad_norm": 0.7255034446716309, + "learning_rate": 0.00015823591719386066, + "loss": 1.071, + "step": 6802 + }, + { + "epoch": 1.2111823361823362, + "grad_norm": 0.6598904728889465, + "learning_rate": 0.0001582245376358718, + "loss": 0.9736, + "step": 6803 + }, + { + "epoch": 1.211360398860399, + "grad_norm": 0.6372483968734741, + "learning_rate": 0.0001582131569370941, + "loss": 0.9029, + "step": 6804 + }, + { + "epoch": 1.2115384615384615, + "grad_norm": 0.5907173156738281, + "learning_rate": 0.00015820177509775048, + "loss": 0.918, + "step": 6805 + }, + { + "epoch": 1.2117165242165242, + "grad_norm": 0.6252630949020386, + "learning_rate": 0.00015819039211806404, + "loss": 0.7801, + "step": 6806 + }, + { + "epoch": 1.211894586894587, + "grad_norm": 0.5793096423149109, + "learning_rate": 0.0001581790079982577, + "loss": 0.5769, + "step": 6807 + }, + { + "epoch": 1.2120726495726495, + "grad_norm": 0.7267270684242249, + "learning_rate": 0.00015816762273855454, + "loss": 1.1428, + "step": 
6808 + }, + { + "epoch": 1.2122507122507122, + "grad_norm": 0.7481234073638916, + "learning_rate": 0.00015815623633917767, + "loss": 1.0209, + "step": 6809 + }, + { + "epoch": 1.212428774928775, + "grad_norm": 0.6114386916160583, + "learning_rate": 0.00015814484880035017, + "loss": 0.9073, + "step": 6810 + }, + { + "epoch": 1.2126068376068375, + "grad_norm": 0.6871182322502136, + "learning_rate": 0.00015813346012229516, + "loss": 1.151, + "step": 6811 + }, + { + "epoch": 1.2127849002849003, + "grad_norm": 0.6380293965339661, + "learning_rate": 0.0001581220703052357, + "loss": 1.0981, + "step": 6812 + }, + { + "epoch": 1.212962962962963, + "grad_norm": 0.6013718247413635, + "learning_rate": 0.00015811067934939503, + "loss": 0.8832, + "step": 6813 + }, + { + "epoch": 1.2131410256410255, + "grad_norm": 0.5816897749900818, + "learning_rate": 0.00015809928725499632, + "loss": 1.063, + "step": 6814 + }, + { + "epoch": 1.2133190883190883, + "grad_norm": 0.5970914363861084, + "learning_rate": 0.00015808789402226278, + "loss": 1.1177, + "step": 6815 + }, + { + "epoch": 1.213497150997151, + "grad_norm": 0.7624936103820801, + "learning_rate": 0.00015807649965141762, + "loss": 1.048, + "step": 6816 + }, + { + "epoch": 1.2136752136752136, + "grad_norm": 0.636263906955719, + "learning_rate": 0.0001580651041426841, + "loss": 0.9743, + "step": 6817 + }, + { + "epoch": 1.2138532763532763, + "grad_norm": 0.641090452671051, + "learning_rate": 0.00015805370749628547, + "loss": 1.0227, + "step": 6818 + }, + { + "epoch": 1.214031339031339, + "grad_norm": 0.6484021544456482, + "learning_rate": 0.00015804230971244504, + "loss": 0.9615, + "step": 6819 + }, + { + "epoch": 1.2142094017094016, + "grad_norm": 0.6473353505134583, + "learning_rate": 0.00015803091079138613, + "loss": 1.0507, + "step": 6820 + }, + { + "epoch": 1.2143874643874644, + "grad_norm": 0.5477129220962524, + "learning_rate": 0.00015801951073333206, + "loss": 0.7928, + "step": 6821 + }, + { + "epoch": 1.2145655270655271, + 
"grad_norm": 0.7256210446357727, + "learning_rate": 0.0001580081095385062, + "loss": 1.0172, + "step": 6822 + }, + { + "epoch": 1.2147435897435896, + "grad_norm": 0.5785418748855591, + "learning_rate": 0.00015799670720713195, + "loss": 0.8478, + "step": 6823 + }, + { + "epoch": 1.2149216524216524, + "grad_norm": 0.6782996654510498, + "learning_rate": 0.00015798530373943267, + "loss": 1.1819, + "step": 6824 + }, + { + "epoch": 1.2150997150997151, + "grad_norm": 0.6513699293136597, + "learning_rate": 0.00015797389913563186, + "loss": 0.9626, + "step": 6825 + }, + { + "epoch": 1.2152777777777777, + "grad_norm": 0.6503037214279175, + "learning_rate": 0.0001579624933959529, + "loss": 1.0282, + "step": 6826 + }, + { + "epoch": 1.2154558404558404, + "grad_norm": 0.581501841545105, + "learning_rate": 0.0001579510865206193, + "loss": 0.8976, + "step": 6827 + }, + { + "epoch": 1.2156339031339032, + "grad_norm": 0.6696721911430359, + "learning_rate": 0.00015793967850985454, + "loss": 0.6418, + "step": 6828 + }, + { + "epoch": 1.215811965811966, + "grad_norm": 0.6577274203300476, + "learning_rate": 0.00015792826936388213, + "loss": 1.0615, + "step": 6829 + }, + { + "epoch": 1.2159900284900285, + "grad_norm": 0.66291743516922, + "learning_rate": 0.00015791685908292564, + "loss": 0.8582, + "step": 6830 + }, + { + "epoch": 1.2161680911680912, + "grad_norm": 0.6548362374305725, + "learning_rate": 0.0001579054476672086, + "loss": 1.0343, + "step": 6831 + }, + { + "epoch": 1.2163461538461537, + "grad_norm": 0.6381218433380127, + "learning_rate": 0.00015789403511695457, + "loss": 0.8133, + "step": 6832 + }, + { + "epoch": 1.2165242165242165, + "grad_norm": 0.7217492461204529, + "learning_rate": 0.00015788262143238722, + "loss": 0.9183, + "step": 6833 + }, + { + "epoch": 1.2167022792022792, + "grad_norm": 0.610454797744751, + "learning_rate": 0.00015787120661373013, + "loss": 0.8488, + "step": 6834 + }, + { + "epoch": 1.216880341880342, + "grad_norm": 0.592771053314209, + 
"learning_rate": 0.00015785979066120696, + "loss": 0.8673, + "step": 6835 + }, + { + "epoch": 1.2170584045584045, + "grad_norm": 0.5787834525108337, + "learning_rate": 0.00015784837357504138, + "loss": 0.7945, + "step": 6836 + }, + { + "epoch": 1.2172364672364673, + "grad_norm": 0.6814196109771729, + "learning_rate": 0.0001578369553554571, + "loss": 0.8906, + "step": 6837 + }, + { + "epoch": 1.2174145299145298, + "grad_norm": 0.6383981108665466, + "learning_rate": 0.00015782553600267787, + "loss": 0.8962, + "step": 6838 + }, + { + "epoch": 1.2175925925925926, + "grad_norm": 0.6733864545822144, + "learning_rate": 0.0001578141155169273, + "loss": 1.2077, + "step": 6839 + }, + { + "epoch": 1.2177706552706553, + "grad_norm": 0.5891284346580505, + "learning_rate": 0.0001578026938984293, + "loss": 0.9477, + "step": 6840 + }, + { + "epoch": 1.217948717948718, + "grad_norm": 0.7220266461372375, + "learning_rate": 0.00015779127114740757, + "loss": 1.0343, + "step": 6841 + }, + { + "epoch": 1.2181267806267806, + "grad_norm": 0.6566546559333801, + "learning_rate": 0.0001577798472640859, + "loss": 0.9576, + "step": 6842 + }, + { + "epoch": 1.2183048433048433, + "grad_norm": 0.6428449153900146, + "learning_rate": 0.0001577684222486882, + "loss": 0.8957, + "step": 6843 + }, + { + "epoch": 1.2184829059829059, + "grad_norm": 0.6542909741401672, + "learning_rate": 0.00015775699610143823, + "loss": 0.9942, + "step": 6844 + }, + { + "epoch": 1.2186609686609686, + "grad_norm": 0.7101675868034363, + "learning_rate": 0.00015774556882255992, + "loss": 1.015, + "step": 6845 + }, + { + "epoch": 1.2188390313390314, + "grad_norm": 0.6606267094612122, + "learning_rate": 0.00015773414041227713, + "loss": 1.1406, + "step": 6846 + }, + { + "epoch": 1.2190170940170941, + "grad_norm": 0.67124342918396, + "learning_rate": 0.00015772271087081383, + "loss": 1.2392, + "step": 6847 + }, + { + "epoch": 1.2191951566951567, + "grad_norm": 0.6615056991577148, + "learning_rate": 0.0001577112801983939, + 
"loss": 1.1583, + "step": 6848 + }, + { + "epoch": 1.2193732193732194, + "grad_norm": 0.6941317319869995, + "learning_rate": 0.0001576998483952413, + "loss": 1.0255, + "step": 6849 + }, + { + "epoch": 1.219551282051282, + "grad_norm": 0.5740683674812317, + "learning_rate": 0.00015768841546158005, + "loss": 1.0393, + "step": 6850 + }, + { + "epoch": 1.2197293447293447, + "grad_norm": 0.7143667340278625, + "learning_rate": 0.00015767698139763415, + "loss": 0.7564, + "step": 6851 + }, + { + "epoch": 1.2199074074074074, + "grad_norm": 0.6730484366416931, + "learning_rate": 0.00015766554620362758, + "loss": 1.2221, + "step": 6852 + }, + { + "epoch": 1.2200854700854702, + "grad_norm": 0.6883087754249573, + "learning_rate": 0.00015765410987978444, + "loss": 1.0156, + "step": 6853 + }, + { + "epoch": 1.2202635327635327, + "grad_norm": 0.6585961580276489, + "learning_rate": 0.00015764267242632875, + "loss": 1.0888, + "step": 6854 + }, + { + "epoch": 1.2204415954415955, + "grad_norm": 0.6325246691703796, + "learning_rate": 0.00015763123384348465, + "loss": 0.973, + "step": 6855 + }, + { + "epoch": 1.220619658119658, + "grad_norm": 0.5930588245391846, + "learning_rate": 0.00015761979413147627, + "loss": 0.8551, + "step": 6856 + }, + { + "epoch": 1.2207977207977208, + "grad_norm": 0.6440611481666565, + "learning_rate": 0.0001576083532905277, + "loss": 0.8396, + "step": 6857 + }, + { + "epoch": 1.2209757834757835, + "grad_norm": 0.6796659231185913, + "learning_rate": 0.00015759691132086315, + "loss": 1.0662, + "step": 6858 + }, + { + "epoch": 1.2211538461538463, + "grad_norm": 0.6813400983810425, + "learning_rate": 0.00015758546822270674, + "loss": 1.0457, + "step": 6859 + }, + { + "epoch": 1.2213319088319088, + "grad_norm": 0.6871716976165771, + "learning_rate": 0.00015757402399628272, + "loss": 1.1675, + "step": 6860 + }, + { + "epoch": 1.2215099715099715, + "grad_norm": 0.6431481838226318, + "learning_rate": 0.00015756257864181524, + "loss": 0.9366, + "step": 6861 + }, + { + 
"epoch": 1.221688034188034, + "grad_norm": 0.6061800718307495, + "learning_rate": 0.00015755113215952868, + "loss": 0.9267, + "step": 6862 + }, + { + "epoch": 1.2218660968660968, + "grad_norm": 0.5755770206451416, + "learning_rate": 0.00015753968454964722, + "loss": 0.7342, + "step": 6863 + }, + { + "epoch": 1.2220441595441596, + "grad_norm": 0.571345329284668, + "learning_rate": 0.00015752823581239515, + "loss": 0.8943, + "step": 6864 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 0.6925615668296814, + "learning_rate": 0.0001575167859479968, + "loss": 0.8801, + "step": 6865 + }, + { + "epoch": 1.2224002849002849, + "grad_norm": 0.6812975406646729, + "learning_rate": 0.00015750533495667655, + "loss": 0.9567, + "step": 6866 + }, + { + "epoch": 1.2225783475783476, + "grad_norm": 0.8216777443885803, + "learning_rate": 0.00015749388283865868, + "loss": 1.0908, + "step": 6867 + }, + { + "epoch": 1.2227564102564104, + "grad_norm": 0.6051010489463806, + "learning_rate": 0.00015748242959416763, + "loss": 0.8851, + "step": 6868 + }, + { + "epoch": 1.2229344729344729, + "grad_norm": 0.7750816345214844, + "learning_rate": 0.00015747097522342775, + "loss": 1.1526, + "step": 6869 + }, + { + "epoch": 1.2231125356125356, + "grad_norm": 0.6240930557250977, + "learning_rate": 0.00015745951972666355, + "loss": 1.0603, + "step": 6870 + }, + { + "epoch": 1.2232905982905984, + "grad_norm": 0.7228875160217285, + "learning_rate": 0.00015744806310409937, + "loss": 1.1028, + "step": 6871 + }, + { + "epoch": 1.223468660968661, + "grad_norm": 0.724075436592102, + "learning_rate": 0.00015743660535595978, + "loss": 0.8983, + "step": 6872 + }, + { + "epoch": 1.2236467236467237, + "grad_norm": 0.6398203372955322, + "learning_rate": 0.00015742514648246916, + "loss": 1.0548, + "step": 6873 + }, + { + "epoch": 1.2238247863247864, + "grad_norm": 0.7024285793304443, + "learning_rate": 0.00015741368648385212, + "loss": 1.0172, + "step": 6874 + }, + { + "epoch": 1.224002849002849, + 
"grad_norm": 0.6717609763145447, + "learning_rate": 0.00015740222536033316, + "loss": 0.9002, + "step": 6875 + }, + { + "epoch": 1.2241809116809117, + "grad_norm": 0.5886133313179016, + "learning_rate": 0.00015739076311213686, + "loss": 0.8614, + "step": 6876 + }, + { + "epoch": 1.2243589743589745, + "grad_norm": 0.6856684684753418, + "learning_rate": 0.00015737929973948776, + "loss": 1.1633, + "step": 6877 + }, + { + "epoch": 1.224537037037037, + "grad_norm": 0.6771421432495117, + "learning_rate": 0.00015736783524261045, + "loss": 1.0921, + "step": 6878 + }, + { + "epoch": 1.2247150997150997, + "grad_norm": 0.5016412138938904, + "learning_rate": 0.0001573563696217296, + "loss": 0.6732, + "step": 6879 + }, + { + "epoch": 1.2248931623931625, + "grad_norm": 0.7595276236534119, + "learning_rate": 0.00015734490287706984, + "loss": 1.0427, + "step": 6880 + }, + { + "epoch": 1.225071225071225, + "grad_norm": 0.6664281487464905, + "learning_rate": 0.00015733343500885582, + "loss": 1.2836, + "step": 6881 + }, + { + "epoch": 1.2252492877492878, + "grad_norm": 0.6662577390670776, + "learning_rate": 0.00015732196601731224, + "loss": 1.1288, + "step": 6882 + }, + { + "epoch": 1.2254273504273505, + "grad_norm": 0.6238988041877747, + "learning_rate": 0.00015731049590266385, + "loss": 1.0809, + "step": 6883 + }, + { + "epoch": 1.225605413105413, + "grad_norm": 0.6483062505722046, + "learning_rate": 0.00015729902466513532, + "loss": 0.9992, + "step": 6884 + }, + { + "epoch": 1.2257834757834758, + "grad_norm": 0.6890861988067627, + "learning_rate": 0.0001572875523049514, + "loss": 1.1844, + "step": 6885 + }, + { + "epoch": 1.2259615384615385, + "grad_norm": 0.7087607383728027, + "learning_rate": 0.00015727607882233695, + "loss": 1.013, + "step": 6886 + }, + { + "epoch": 1.226139601139601, + "grad_norm": 0.709048867225647, + "learning_rate": 0.00015726460421751668, + "loss": 0.9748, + "step": 6887 + }, + { + "epoch": 1.2263176638176638, + "grad_norm": 0.5918150544166565, + 
"learning_rate": 0.00015725312849071546, + "loss": 0.9978, + "step": 6888 + }, + { + "epoch": 1.2264957264957266, + "grad_norm": 0.4343377947807312, + "learning_rate": 0.0001572416516421581, + "loss": 0.6233, + "step": 6889 + }, + { + "epoch": 1.226673789173789, + "grad_norm": 0.6360403895378113, + "learning_rate": 0.00015723017367206952, + "loss": 0.9698, + "step": 6890 + }, + { + "epoch": 1.2268518518518519, + "grad_norm": 0.7261984944343567, + "learning_rate": 0.00015721869458067454, + "loss": 1.0426, + "step": 6891 + }, + { + "epoch": 1.2270299145299146, + "grad_norm": 0.6806774139404297, + "learning_rate": 0.0001572072143681981, + "loss": 0.9692, + "step": 6892 + }, + { + "epoch": 1.2272079772079771, + "grad_norm": 0.7140612006187439, + "learning_rate": 0.00015719573303486515, + "loss": 1.0828, + "step": 6893 + }, + { + "epoch": 1.22738603988604, + "grad_norm": 0.5383326411247253, + "learning_rate": 0.0001571842505809006, + "loss": 1.012, + "step": 6894 + }, + { + "epoch": 1.2275641025641026, + "grad_norm": 0.5992259383201599, + "learning_rate": 0.0001571727670065295, + "loss": 0.876, + "step": 6895 + }, + { + "epoch": 1.2277421652421652, + "grad_norm": 0.636696457862854, + "learning_rate": 0.00015716128231197676, + "loss": 1.1001, + "step": 6896 + }, + { + "epoch": 1.227920227920228, + "grad_norm": 0.5980371236801147, + "learning_rate": 0.00015714979649746744, + "loss": 0.937, + "step": 6897 + }, + { + "epoch": 1.2280982905982907, + "grad_norm": 0.7678794860839844, + "learning_rate": 0.00015713830956322656, + "loss": 1.1965, + "step": 6898 + }, + { + "epoch": 1.2282763532763532, + "grad_norm": 0.6918835639953613, + "learning_rate": 0.00015712682150947923, + "loss": 0.8578, + "step": 6899 + }, + { + "epoch": 1.228454415954416, + "grad_norm": 0.6463451385498047, + "learning_rate": 0.00015711533233645048, + "loss": 1.009, + "step": 6900 + }, + { + "epoch": 1.2286324786324787, + "grad_norm": 0.6720646023750305, + "learning_rate": 0.00015710384204436549, + "loss": 
1.0031, + "step": 6901 + }, + { + "epoch": 1.2288105413105412, + "grad_norm": 0.6618736982345581, + "learning_rate": 0.00015709235063344926, + "loss": 0.9017, + "step": 6902 + }, + { + "epoch": 1.228988603988604, + "grad_norm": 0.6789427399635315, + "learning_rate": 0.0001570808581039271, + "loss": 1.1289, + "step": 6903 + }, + { + "epoch": 1.2291666666666667, + "grad_norm": 0.6395950317382812, + "learning_rate": 0.00015706936445602403, + "loss": 1.1051, + "step": 6904 + }, + { + "epoch": 1.2293447293447293, + "grad_norm": 0.7023917436599731, + "learning_rate": 0.00015705786968996533, + "loss": 1.2876, + "step": 6905 + }, + { + "epoch": 1.229522792022792, + "grad_norm": 0.7473352551460266, + "learning_rate": 0.00015704637380597623, + "loss": 1.237, + "step": 6906 + }, + { + "epoch": 1.2297008547008548, + "grad_norm": 0.6952672004699707, + "learning_rate": 0.00015703487680428192, + "loss": 1.0674, + "step": 6907 + }, + { + "epoch": 1.2298789173789173, + "grad_norm": 0.5968644022941589, + "learning_rate": 0.0001570233786851077, + "loss": 0.9169, + "step": 6908 + }, + { + "epoch": 1.23005698005698, + "grad_norm": 0.7219798564910889, + "learning_rate": 0.0001570118794486788, + "loss": 1.0556, + "step": 6909 + }, + { + "epoch": 1.2302350427350428, + "grad_norm": 0.6603400707244873, + "learning_rate": 0.0001570003790952206, + "loss": 0.9596, + "step": 6910 + }, + { + "epoch": 1.2304131054131053, + "grad_norm": 0.5972838401794434, + "learning_rate": 0.0001569888776249583, + "loss": 0.9168, + "step": 6911 + }, + { + "epoch": 1.230591168091168, + "grad_norm": 0.792585551738739, + "learning_rate": 0.00015697737503811738, + "loss": 1.1074, + "step": 6912 + }, + { + "epoch": 1.2307692307692308, + "grad_norm": 0.5845609903335571, + "learning_rate": 0.00015696587133492314, + "loss": 0.8413, + "step": 6913 + }, + { + "epoch": 1.2309472934472934, + "grad_norm": 0.6603896021842957, + "learning_rate": 0.000156954366515601, + "loss": 0.9109, + "step": 6914 + }, + { + "epoch": 
1.2311253561253561, + "grad_norm": 0.6367142796516418, + "learning_rate": 0.00015694286058037636, + "loss": 1.0119, + "step": 6915 + }, + { + "epoch": 1.2313034188034189, + "grad_norm": 0.693854570388794, + "learning_rate": 0.00015693135352947465, + "loss": 1.0925, + "step": 6916 + }, + { + "epoch": 1.2314814814814814, + "grad_norm": 0.6570404171943665, + "learning_rate": 0.00015691984536312135, + "loss": 0.9731, + "step": 6917 + }, + { + "epoch": 1.2316595441595442, + "grad_norm": 0.6778639554977417, + "learning_rate": 0.0001569083360815419, + "loss": 1.1415, + "step": 6918 + }, + { + "epoch": 1.231837606837607, + "grad_norm": 0.6656233668327332, + "learning_rate": 0.00015689682568496182, + "loss": 0.8603, + "step": 6919 + }, + { + "epoch": 1.2320156695156694, + "grad_norm": 0.6569861173629761, + "learning_rate": 0.00015688531417360665, + "loss": 0.8374, + "step": 6920 + }, + { + "epoch": 1.2321937321937322, + "grad_norm": 0.6746888160705566, + "learning_rate": 0.0001568738015477019, + "loss": 1.1395, + "step": 6921 + }, + { + "epoch": 1.232371794871795, + "grad_norm": 0.6180813908576965, + "learning_rate": 0.00015686228780747316, + "loss": 1.0049, + "step": 6922 + }, + { + "epoch": 1.2325498575498575, + "grad_norm": 0.7326146960258484, + "learning_rate": 0.000156850772953146, + "loss": 1.2389, + "step": 6923 + }, + { + "epoch": 1.2327279202279202, + "grad_norm": 0.5912215709686279, + "learning_rate": 0.00015683925698494608, + "loss": 1.0174, + "step": 6924 + }, + { + "epoch": 1.232905982905983, + "grad_norm": 0.5214745402336121, + "learning_rate": 0.00015682773990309895, + "loss": 0.5778, + "step": 6925 + }, + { + "epoch": 1.2330840455840455, + "grad_norm": 0.6862079501152039, + "learning_rate": 0.00015681622170783034, + "loss": 0.896, + "step": 6926 + }, + { + "epoch": 1.2332621082621082, + "grad_norm": 0.7858926057815552, + "learning_rate": 0.00015680470239936586, + "loss": 1.0714, + "step": 6927 + }, + { + "epoch": 1.233440170940171, + "grad_norm": 
0.6706146597862244, + "learning_rate": 0.00015679318197793127, + "loss": 1.0157, + "step": 6928 + }, + { + "epoch": 1.2336182336182335, + "grad_norm": 0.6657105088233948, + "learning_rate": 0.00015678166044375225, + "loss": 0.9674, + "step": 6929 + }, + { + "epoch": 1.2337962962962963, + "grad_norm": 0.6790838837623596, + "learning_rate": 0.0001567701377970545, + "loss": 0.9744, + "step": 6930 + }, + { + "epoch": 1.233974358974359, + "grad_norm": 0.6469771862030029, + "learning_rate": 0.00015675861403806386, + "loss": 1.0205, + "step": 6931 + }, + { + "epoch": 1.2341524216524216, + "grad_norm": 0.4926300346851349, + "learning_rate": 0.0001567470891670061, + "loss": 0.6336, + "step": 6932 + }, + { + "epoch": 1.2343304843304843, + "grad_norm": 0.6762157082557678, + "learning_rate": 0.000156735563184107, + "loss": 1.059, + "step": 6933 + }, + { + "epoch": 1.234508547008547, + "grad_norm": 0.6998521685600281, + "learning_rate": 0.0001567240360895924, + "loss": 1.0586, + "step": 6934 + }, + { + "epoch": 1.2346866096866096, + "grad_norm": 0.5947706699371338, + "learning_rate": 0.00015671250788368814, + "loss": 0.8815, + "step": 6935 + }, + { + "epoch": 1.2348646723646723, + "grad_norm": 0.6966122984886169, + "learning_rate": 0.0001567009785666201, + "loss": 1.0105, + "step": 6936 + }, + { + "epoch": 1.235042735042735, + "grad_norm": 0.6747866272926331, + "learning_rate": 0.0001566894481386142, + "loss": 0.8783, + "step": 6937 + }, + { + "epoch": 1.2352207977207976, + "grad_norm": 0.6348921060562134, + "learning_rate": 0.0001566779165998963, + "loss": 0.7813, + "step": 6938 + }, + { + "epoch": 1.2353988603988604, + "grad_norm": 0.596466600894928, + "learning_rate": 0.00015666638395069236, + "loss": 0.8689, + "step": 6939 + }, + { + "epoch": 1.2355769230769231, + "grad_norm": 0.6926795244216919, + "learning_rate": 0.00015665485019122834, + "loss": 1.0266, + "step": 6940 + }, + { + "epoch": 1.2357549857549857, + "grad_norm": 0.6590100526809692, + "learning_rate": 
0.00015664331532173022, + "loss": 1.128, + "step": 6941 + }, + { + "epoch": 1.2359330484330484, + "grad_norm": 0.7422109246253967, + "learning_rate": 0.00015663177934242402, + "loss": 0.8495, + "step": 6942 + }, + { + "epoch": 1.2361111111111112, + "grad_norm": 0.6463228464126587, + "learning_rate": 0.0001566202422535357, + "loss": 1.0941, + "step": 6943 + }, + { + "epoch": 1.236289173789174, + "grad_norm": 0.7278686761856079, + "learning_rate": 0.0001566087040552914, + "loss": 1.2039, + "step": 6944 + }, + { + "epoch": 1.2364672364672364, + "grad_norm": 0.6917086839675903, + "learning_rate": 0.00015659716474791712, + "loss": 1.042, + "step": 6945 + }, + { + "epoch": 1.2366452991452992, + "grad_norm": 0.637205183506012, + "learning_rate": 0.00015658562433163898, + "loss": 1.0379, + "step": 6946 + }, + { + "epoch": 1.2368233618233617, + "grad_norm": 0.6706623435020447, + "learning_rate": 0.00015657408280668307, + "loss": 1.0347, + "step": 6947 + }, + { + "epoch": 1.2370014245014245, + "grad_norm": 0.6435480713844299, + "learning_rate": 0.00015656254017327553, + "loss": 0.7708, + "step": 6948 + }, + { + "epoch": 1.2371794871794872, + "grad_norm": 0.5703113675117493, + "learning_rate": 0.0001565509964316425, + "loss": 0.8786, + "step": 6949 + }, + { + "epoch": 1.23735754985755, + "grad_norm": 0.6438127160072327, + "learning_rate": 0.00015653945158201018, + "loss": 0.9435, + "step": 6950 + }, + { + "epoch": 1.2375356125356125, + "grad_norm": 0.68101966381073, + "learning_rate": 0.00015652790562460474, + "loss": 1.1062, + "step": 6951 + }, + { + "epoch": 1.2377136752136753, + "grad_norm": 0.661230206489563, + "learning_rate": 0.00015651635855965242, + "loss": 1.0113, + "step": 6952 + }, + { + "epoch": 1.2378917378917378, + "grad_norm": 0.6399117708206177, + "learning_rate": 0.0001565048103873795, + "loss": 1.1423, + "step": 6953 + }, + { + "epoch": 1.2380698005698005, + "grad_norm": 0.7614672780036926, + "learning_rate": 0.00015649326110801215, + "loss": 1.0359, + 
"step": 6954 + }, + { + "epoch": 1.2382478632478633, + "grad_norm": 0.6461986303329468, + "learning_rate": 0.00015648171072177674, + "loss": 1.0145, + "step": 6955 + }, + { + "epoch": 1.238425925925926, + "grad_norm": 0.5902668833732605, + "learning_rate": 0.0001564701592288995, + "loss": 0.9451, + "step": 6956 + }, + { + "epoch": 1.2386039886039886, + "grad_norm": 0.5686020255088806, + "learning_rate": 0.00015645860662960682, + "loss": 0.7512, + "step": 6957 + }, + { + "epoch": 1.2387820512820513, + "grad_norm": 0.6640077829360962, + "learning_rate": 0.00015644705292412503, + "loss": 0.7133, + "step": 6958 + }, + { + "epoch": 1.2389601139601139, + "grad_norm": 0.7402132749557495, + "learning_rate": 0.00015643549811268049, + "loss": 1.0903, + "step": 6959 + }, + { + "epoch": 1.2391381766381766, + "grad_norm": 0.62332683801651, + "learning_rate": 0.00015642394219549962, + "loss": 0.9378, + "step": 6960 + }, + { + "epoch": 1.2393162393162394, + "grad_norm": 0.6374901533126831, + "learning_rate": 0.00015641238517280877, + "loss": 1.0746, + "step": 6961 + }, + { + "epoch": 1.239494301994302, + "grad_norm": 0.5939112901687622, + "learning_rate": 0.00015640082704483443, + "loss": 0.7185, + "step": 6962 + }, + { + "epoch": 1.2396723646723646, + "grad_norm": 0.8378096222877502, + "learning_rate": 0.00015638926781180306, + "loss": 1.1932, + "step": 6963 + }, + { + "epoch": 1.2398504273504274, + "grad_norm": 0.5707982778549194, + "learning_rate": 0.0001563777074739411, + "loss": 0.9834, + "step": 6964 + }, + { + "epoch": 1.24002849002849, + "grad_norm": 0.6339748501777649, + "learning_rate": 0.00015636614603147512, + "loss": 1.0307, + "step": 6965 + }, + { + "epoch": 1.2402065527065527, + "grad_norm": 0.7353155016899109, + "learning_rate": 0.00015635458348463156, + "loss": 1.0311, + "step": 6966 + }, + { + "epoch": 1.2403846153846154, + "grad_norm": 0.8307726979255676, + "learning_rate": 0.00015634301983363704, + "loss": 1.0673, + "step": 6967 + }, + { + "epoch": 
1.2405626780626782, + "grad_norm": 0.5299199819564819, + "learning_rate": 0.00015633145507871807, + "loss": 0.6649, + "step": 6968 + }, + { + "epoch": 1.2407407407407407, + "grad_norm": 0.6162533760070801, + "learning_rate": 0.00015631988922010126, + "loss": 0.8096, + "step": 6969 + }, + { + "epoch": 1.2409188034188035, + "grad_norm": 0.6212689876556396, + "learning_rate": 0.0001563083222580132, + "loss": 1.0371, + "step": 6970 + }, + { + "epoch": 1.241096866096866, + "grad_norm": 0.6148123145103455, + "learning_rate": 0.00015629675419268055, + "loss": 1.0439, + "step": 6971 + }, + { + "epoch": 1.2412749287749287, + "grad_norm": 0.6163684129714966, + "learning_rate": 0.00015628518502432994, + "loss": 0.9075, + "step": 6972 + }, + { + "epoch": 1.2414529914529915, + "grad_norm": 0.5127472877502441, + "learning_rate": 0.00015627361475318807, + "loss": 0.6138, + "step": 6973 + }, + { + "epoch": 1.2416310541310542, + "grad_norm": 0.6508103013038635, + "learning_rate": 0.0001562620433794816, + "loss": 0.9608, + "step": 6974 + }, + { + "epoch": 1.2418091168091168, + "grad_norm": 0.6711046695709229, + "learning_rate": 0.0001562504709034373, + "loss": 1.1494, + "step": 6975 + }, + { + "epoch": 1.2419871794871795, + "grad_norm": 0.6831514835357666, + "learning_rate": 0.00015623889732528182, + "loss": 0.9664, + "step": 6976 + }, + { + "epoch": 1.242165242165242, + "grad_norm": 0.693732738494873, + "learning_rate": 0.00015622732264524198, + "loss": 0.9055, + "step": 6977 + }, + { + "epoch": 1.2423433048433048, + "grad_norm": 0.8475173711776733, + "learning_rate": 0.00015621574686354456, + "loss": 1.2014, + "step": 6978 + }, + { + "epoch": 1.2425213675213675, + "grad_norm": 0.6342347264289856, + "learning_rate": 0.0001562041699804164, + "loss": 1.0691, + "step": 6979 + }, + { + "epoch": 1.2426994301994303, + "grad_norm": 0.620517373085022, + "learning_rate": 0.00015619259199608422, + "loss": 0.7318, + "step": 6980 + }, + { + "epoch": 1.2428774928774928, + "grad_norm": 
0.589567244052887, + "learning_rate": 0.000156181012910775, + "loss": 1.0656, + "step": 6981 + }, + { + "epoch": 1.2430555555555556, + "grad_norm": 0.7570258975028992, + "learning_rate": 0.00015616943272471546, + "loss": 1.0517, + "step": 6982 + }, + { + "epoch": 1.243233618233618, + "grad_norm": 0.6232032775878906, + "learning_rate": 0.00015615785143813262, + "loss": 0.8867, + "step": 6983 + }, + { + "epoch": 1.2434116809116809, + "grad_norm": 0.630095899105072, + "learning_rate": 0.0001561462690512533, + "loss": 0.9287, + "step": 6984 + }, + { + "epoch": 1.2435897435897436, + "grad_norm": 0.7410848140716553, + "learning_rate": 0.00015613468556430454, + "loss": 1.162, + "step": 6985 + }, + { + "epoch": 1.2437678062678064, + "grad_norm": 0.7574684023857117, + "learning_rate": 0.00015612310097751317, + "loss": 1.2118, + "step": 6986 + }, + { + "epoch": 1.243945868945869, + "grad_norm": 0.580760657787323, + "learning_rate": 0.0001561115152911062, + "loss": 1.0612, + "step": 6987 + }, + { + "epoch": 1.2441239316239316, + "grad_norm": 0.6105104088783264, + "learning_rate": 0.00015609992850531073, + "loss": 0.9262, + "step": 6988 + }, + { + "epoch": 1.2443019943019944, + "grad_norm": 0.669435441493988, + "learning_rate": 0.00015608834062035362, + "loss": 0.9595, + "step": 6989 + }, + { + "epoch": 1.244480056980057, + "grad_norm": 0.6530314683914185, + "learning_rate": 0.00015607675163646206, + "loss": 0.7987, + "step": 6990 + }, + { + "epoch": 1.2446581196581197, + "grad_norm": 0.5801477432250977, + "learning_rate": 0.00015606516155386297, + "loss": 0.7667, + "step": 6991 + }, + { + "epoch": 1.2448361823361824, + "grad_norm": 0.5773885250091553, + "learning_rate": 0.00015605357037278355, + "loss": 0.847, + "step": 6992 + }, + { + "epoch": 1.245014245014245, + "grad_norm": 0.5399810075759888, + "learning_rate": 0.00015604197809345082, + "loss": 0.9284, + "step": 6993 + }, + { + "epoch": 1.2451923076923077, + "grad_norm": 0.5910452604293823, + "learning_rate": 
0.000156030384716092, + "loss": 1.0004, + "step": 6994 + }, + { + "epoch": 1.2453703703703705, + "grad_norm": 0.5979224443435669, + "learning_rate": 0.00015601879024093414, + "loss": 0.9027, + "step": 6995 + }, + { + "epoch": 1.245548433048433, + "grad_norm": 0.6092126369476318, + "learning_rate": 0.0001560071946682045, + "loss": 0.9755, + "step": 6996 + }, + { + "epoch": 1.2457264957264957, + "grad_norm": 0.6536708474159241, + "learning_rate": 0.0001559955979981302, + "loss": 1.1828, + "step": 6997 + }, + { + "epoch": 1.2459045584045585, + "grad_norm": 0.6602030992507935, + "learning_rate": 0.00015598400023093847, + "loss": 1.0395, + "step": 6998 + }, + { + "epoch": 1.246082621082621, + "grad_norm": 0.6864825487136841, + "learning_rate": 0.00015597240136685657, + "loss": 1.083, + "step": 6999 + }, + { + "epoch": 1.2462606837606838, + "grad_norm": 0.6194674968719482, + "learning_rate": 0.0001559608014061117, + "loss": 1.0461, + "step": 7000 + }, + { + "epoch": 1.2464387464387465, + "grad_norm": 0.5879074335098267, + "learning_rate": 0.00015594920034893122, + "loss": 1.076, + "step": 7001 + }, + { + "epoch": 1.246616809116809, + "grad_norm": 0.6514387726783752, + "learning_rate": 0.00015593759819554234, + "loss": 1.0396, + "step": 7002 + }, + { + "epoch": 1.2467948717948718, + "grad_norm": 0.5988301634788513, + "learning_rate": 0.00015592599494617247, + "loss": 0.9501, + "step": 7003 + }, + { + "epoch": 1.2469729344729346, + "grad_norm": 0.6282773613929749, + "learning_rate": 0.00015591439060104887, + "loss": 1.1002, + "step": 7004 + }, + { + "epoch": 1.247150997150997, + "grad_norm": 0.6910465955734253, + "learning_rate": 0.00015590278516039896, + "loss": 1.1771, + "step": 7005 + }, + { + "epoch": 1.2473290598290598, + "grad_norm": 0.6097282767295837, + "learning_rate": 0.00015589117862445007, + "loss": 1.0707, + "step": 7006 + }, + { + "epoch": 1.2475071225071226, + "grad_norm": 0.7076875567436218, + "learning_rate": 0.00015587957099342967, + "loss": 1.0078, + 
"step": 7007 + }, + { + "epoch": 1.2476851851851851, + "grad_norm": 0.6776556372642517, + "learning_rate": 0.00015586796226756518, + "loss": 0.8971, + "step": 7008 + }, + { + "epoch": 1.2478632478632479, + "grad_norm": 0.6506341695785522, + "learning_rate": 0.00015585635244708398, + "loss": 0.9727, + "step": 7009 + }, + { + "epoch": 1.2480413105413106, + "grad_norm": 0.624724805355072, + "learning_rate": 0.00015584474153221357, + "loss": 0.9858, + "step": 7010 + }, + { + "epoch": 1.2482193732193732, + "grad_norm": 0.6070096492767334, + "learning_rate": 0.0001558331295231815, + "loss": 0.9385, + "step": 7011 + }, + { + "epoch": 1.248397435897436, + "grad_norm": 0.6948656439781189, + "learning_rate": 0.00015582151642021524, + "loss": 0.9425, + "step": 7012 + }, + { + "epoch": 1.2485754985754987, + "grad_norm": 0.6559088230133057, + "learning_rate": 0.0001558099022235423, + "loss": 1.0002, + "step": 7013 + }, + { + "epoch": 1.2487535612535612, + "grad_norm": 0.6097117066383362, + "learning_rate": 0.00015579828693339026, + "loss": 1.0234, + "step": 7014 + }, + { + "epoch": 1.248931623931624, + "grad_norm": 0.6612260341644287, + "learning_rate": 0.00015578667054998673, + "loss": 1.1376, + "step": 7015 + }, + { + "epoch": 1.2491096866096867, + "grad_norm": 0.6305607557296753, + "learning_rate": 0.00015577505307355925, + "loss": 0.9127, + "step": 7016 + }, + { + "epoch": 1.2492877492877492, + "grad_norm": 0.6648319959640503, + "learning_rate": 0.00015576343450433549, + "loss": 0.8697, + "step": 7017 + }, + { + "epoch": 1.249465811965812, + "grad_norm": 0.7642946839332581, + "learning_rate": 0.00015575181484254303, + "loss": 1.0998, + "step": 7018 + }, + { + "epoch": 1.2496438746438747, + "grad_norm": 0.6775243282318115, + "learning_rate": 0.00015574019408840962, + "loss": 1.0186, + "step": 7019 + }, + { + "epoch": 1.2498219373219372, + "grad_norm": 0.6075591444969177, + "learning_rate": 0.00015572857224216286, + "loss": 0.9592, + "step": 7020 + }, + { + "epoch": 
1.2498219373219372, + "eval_loss": 1.105136752128601, + "eval_runtime": 24.4793, + "eval_samples_per_second": 42.526, + "eval_steps_per_second": 21.283, + "step": 7020 + }, + { + "epoch": 1.25, + "grad_norm": 0.5856962203979492, + "learning_rate": 0.0001557169493040305, + "loss": 0.8336, + "step": 7021 + }, + { + "epoch": 1.2501780626780628, + "grad_norm": 0.6451364159584045, + "learning_rate": 0.00015570532527424028, + "loss": 0.8805, + "step": 7022 + }, + { + "epoch": 1.2503561253561253, + "grad_norm": 0.6266474723815918, + "learning_rate": 0.00015569370015301991, + "loss": 1.0023, + "step": 7023 + }, + { + "epoch": 1.250534188034188, + "grad_norm": 0.5547378063201904, + "learning_rate": 0.00015568207394059722, + "loss": 0.7385, + "step": 7024 + }, + { + "epoch": 1.2507122507122508, + "grad_norm": 0.604169487953186, + "learning_rate": 0.0001556704466371999, + "loss": 0.9194, + "step": 7025 + }, + { + "epoch": 1.2508903133903133, + "grad_norm": 0.7054405212402344, + "learning_rate": 0.00015565881824305586, + "loss": 1.1864, + "step": 7026 + }, + { + "epoch": 1.251068376068376, + "grad_norm": 0.6429929733276367, + "learning_rate": 0.0001556471887583929, + "loss": 1.0129, + "step": 7027 + }, + { + "epoch": 1.2512464387464388, + "grad_norm": 0.695957362651825, + "learning_rate": 0.00015563555818343887, + "loss": 1.2994, + "step": 7028 + }, + { + "epoch": 1.2514245014245013, + "grad_norm": 0.5889938473701477, + "learning_rate": 0.0001556239265184216, + "loss": 1.0109, + "step": 7029 + }, + { + "epoch": 1.251602564102564, + "grad_norm": 0.6424569487571716, + "learning_rate": 0.0001556122937635691, + "loss": 0.8585, + "step": 7030 + }, + { + "epoch": 1.2517806267806268, + "grad_norm": 0.5561244487762451, + "learning_rate": 0.0001556006599191092, + "loss": 0.9994, + "step": 7031 + }, + { + "epoch": 1.2519586894586894, + "grad_norm": 0.6355302333831787, + "learning_rate": 0.00015558902498526988, + "loss": 0.9495, + "step": 7032 + }, + { + "epoch": 1.2521367521367521, + 
"grad_norm": 0.6272686719894409, + "learning_rate": 0.00015557738896227908, + "loss": 0.7611, + "step": 7033 + }, + { + "epoch": 1.2523148148148149, + "grad_norm": 0.7069199085235596, + "learning_rate": 0.00015556575185036482, + "loss": 1.0612, + "step": 7034 + }, + { + "epoch": 1.2524928774928774, + "grad_norm": 0.6635094285011292, + "learning_rate": 0.00015555411364975505, + "loss": 1.1182, + "step": 7035 + }, + { + "epoch": 1.2526709401709402, + "grad_norm": 0.6112014651298523, + "learning_rate": 0.00015554247436067785, + "loss": 0.8677, + "step": 7036 + }, + { + "epoch": 1.252849002849003, + "grad_norm": 0.678963303565979, + "learning_rate": 0.00015553083398336126, + "loss": 1.1421, + "step": 7037 + }, + { + "epoch": 1.2530270655270654, + "grad_norm": 0.6291939616203308, + "learning_rate": 0.0001555191925180333, + "loss": 0.9157, + "step": 7038 + }, + { + "epoch": 1.2532051282051282, + "grad_norm": 0.6519795656204224, + "learning_rate": 0.0001555075499649221, + "loss": 1.0074, + "step": 7039 + }, + { + "epoch": 1.253383190883191, + "grad_norm": 0.6063529849052429, + "learning_rate": 0.00015549590632425576, + "loss": 1.0205, + "step": 7040 + }, + { + "epoch": 1.2535612535612537, + "grad_norm": 0.7055633664131165, + "learning_rate": 0.00015548426159626242, + "loss": 1.0254, + "step": 7041 + }, + { + "epoch": 1.2537393162393162, + "grad_norm": 0.6783022880554199, + "learning_rate": 0.00015547261578117025, + "loss": 1.1017, + "step": 7042 + }, + { + "epoch": 1.253917378917379, + "grad_norm": 0.7055003643035889, + "learning_rate": 0.0001554609688792074, + "loss": 1.0269, + "step": 7043 + }, + { + "epoch": 1.2540954415954415, + "grad_norm": 0.6465007662773132, + "learning_rate": 0.0001554493208906021, + "loss": 1.0492, + "step": 7044 + }, + { + "epoch": 1.2542735042735043, + "grad_norm": 0.6443775296211243, + "learning_rate": 0.0001554376718155825, + "loss": 0.9778, + "step": 7045 + }, + { + "epoch": 1.254451566951567, + "grad_norm": 0.695214569568634, + 
"learning_rate": 0.0001554260216543769, + "loss": 0.8792, + "step": 7046 + }, + { + "epoch": 1.2546296296296298, + "grad_norm": 0.6777814626693726, + "learning_rate": 0.00015541437040721354, + "loss": 0.8944, + "step": 7047 + }, + { + "epoch": 1.2548076923076923, + "grad_norm": 0.6269369721412659, + "learning_rate": 0.0001554027180743207, + "loss": 0.8825, + "step": 7048 + }, + { + "epoch": 1.254985754985755, + "grad_norm": 0.6197061538696289, + "learning_rate": 0.0001553910646559267, + "loss": 0.9823, + "step": 7049 + }, + { + "epoch": 1.2551638176638176, + "grad_norm": 0.681347131729126, + "learning_rate": 0.00015537941015225984, + "loss": 0.995, + "step": 7050 + }, + { + "epoch": 1.2553418803418803, + "grad_norm": 0.6224286556243896, + "learning_rate": 0.00015536775456354848, + "loss": 0.7714, + "step": 7051 + }, + { + "epoch": 1.255519943019943, + "grad_norm": 0.6113278269767761, + "learning_rate": 0.00015535609789002098, + "loss": 0.9859, + "step": 7052 + }, + { + "epoch": 1.2556980056980058, + "grad_norm": 0.6985422372817993, + "learning_rate": 0.00015534444013190577, + "loss": 0.8785, + "step": 7053 + }, + { + "epoch": 1.2558760683760684, + "grad_norm": 0.5602933168411255, + "learning_rate": 0.00015533278128943118, + "loss": 0.8341, + "step": 7054 + }, + { + "epoch": 1.256054131054131, + "grad_norm": 0.587684690952301, + "learning_rate": 0.0001553211213628257, + "loss": 0.7933, + "step": 7055 + }, + { + "epoch": 1.2562321937321936, + "grad_norm": 0.692997932434082, + "learning_rate": 0.0001553094603523178, + "loss": 1.0957, + "step": 7056 + }, + { + "epoch": 1.2564102564102564, + "grad_norm": 0.6925587058067322, + "learning_rate": 0.00015529779825813588, + "loss": 0.8602, + "step": 7057 + }, + { + "epoch": 1.2565883190883191, + "grad_norm": 0.6383063197135925, + "learning_rate": 0.0001552861350805085, + "loss": 0.9933, + "step": 7058 + }, + { + "epoch": 1.256766381766382, + "grad_norm": 0.6520544290542603, + "learning_rate": 0.00015527447081966413, + "loss": 
0.9498, + "step": 7059 + }, + { + "epoch": 1.2569444444444444, + "grad_norm": 0.7353914380073547, + "learning_rate": 0.00015526280547583133, + "loss": 1.1071, + "step": 7060 + }, + { + "epoch": 1.2571225071225072, + "grad_norm": 0.7141618132591248, + "learning_rate": 0.00015525113904923864, + "loss": 0.8333, + "step": 7061 + }, + { + "epoch": 1.2573005698005697, + "grad_norm": 0.6194499731063843, + "learning_rate": 0.00015523947154011468, + "loss": 0.9421, + "step": 7062 + }, + { + "epoch": 1.2574786324786325, + "grad_norm": 0.7514514327049255, + "learning_rate": 0.00015522780294868803, + "loss": 1.226, + "step": 7063 + }, + { + "epoch": 1.2576566951566952, + "grad_norm": 0.762923538684845, + "learning_rate": 0.0001552161332751873, + "loss": 1.1893, + "step": 7064 + }, + { + "epoch": 1.257834757834758, + "grad_norm": 0.6265730261802673, + "learning_rate": 0.00015520446251984113, + "loss": 0.6604, + "step": 7065 + }, + { + "epoch": 1.2580128205128205, + "grad_norm": 0.6447750329971313, + "learning_rate": 0.0001551927906828782, + "loss": 0.9814, + "step": 7066 + }, + { + "epoch": 1.2581908831908832, + "grad_norm": 0.5791042447090149, + "learning_rate": 0.00015518111776452722, + "loss": 0.8283, + "step": 7067 + }, + { + "epoch": 1.2583689458689458, + "grad_norm": 0.5267777442932129, + "learning_rate": 0.00015516944376501682, + "loss": 0.5748, + "step": 7068 + }, + { + "epoch": 1.2585470085470085, + "grad_norm": 0.7343912720680237, + "learning_rate": 0.0001551577686845758, + "loss": 1.1777, + "step": 7069 + }, + { + "epoch": 1.2587250712250713, + "grad_norm": 0.645746111869812, + "learning_rate": 0.00015514609252343284, + "loss": 0.9356, + "step": 7070 + }, + { + "epoch": 1.258903133903134, + "grad_norm": 0.6993104219436646, + "learning_rate": 0.0001551344152818168, + "loss": 1.06, + "step": 7071 + }, + { + "epoch": 1.2590811965811965, + "grad_norm": 0.6661365628242493, + "learning_rate": 0.0001551227369599564, + "loss": 1.061, + "step": 7072 + }, + { + "epoch": 
1.2592592592592593, + "grad_norm": 0.7833736538887024, + "learning_rate": 0.0001551110575580805, + "loss": 0.9674, + "step": 7073 + }, + { + "epoch": 1.2594373219373218, + "grad_norm": 0.5878575444221497, + "learning_rate": 0.00015509937707641787, + "loss": 0.9002, + "step": 7074 + }, + { + "epoch": 1.2596153846153846, + "grad_norm": 0.6402907371520996, + "learning_rate": 0.00015508769551519745, + "loss": 1.0157, + "step": 7075 + }, + { + "epoch": 1.2597934472934473, + "grad_norm": 0.6794611215591431, + "learning_rate": 0.00015507601287464805, + "loss": 1.052, + "step": 7076 + }, + { + "epoch": 1.25997150997151, + "grad_norm": 0.706922173500061, + "learning_rate": 0.0001550643291549986, + "loss": 1.0814, + "step": 7077 + }, + { + "epoch": 1.2601495726495726, + "grad_norm": 0.6722953915596008, + "learning_rate": 0.000155052644356478, + "loss": 1.1402, + "step": 7078 + }, + { + "epoch": 1.2603276353276354, + "grad_norm": 0.6619611978530884, + "learning_rate": 0.00015504095847931518, + "loss": 0.9583, + "step": 7079 + }, + { + "epoch": 1.260505698005698, + "grad_norm": 0.5645583271980286, + "learning_rate": 0.00015502927152373914, + "loss": 0.6746, + "step": 7080 + }, + { + "epoch": 1.2606837606837606, + "grad_norm": 0.6634977459907532, + "learning_rate": 0.00015501758348997882, + "loss": 1.0451, + "step": 7081 + }, + { + "epoch": 1.2608618233618234, + "grad_norm": 0.7167651057243347, + "learning_rate": 0.00015500589437826326, + "loss": 0.931, + "step": 7082 + }, + { + "epoch": 1.2610398860398861, + "grad_norm": 0.6179340481758118, + "learning_rate": 0.00015499420418882146, + "loss": 1.0953, + "step": 7083 + }, + { + "epoch": 1.2612179487179487, + "grad_norm": 0.6948468685150146, + "learning_rate": 0.00015498251292188247, + "loss": 1.0277, + "step": 7084 + }, + { + "epoch": 1.2613960113960114, + "grad_norm": 0.6256045699119568, + "learning_rate": 0.00015497082057767532, + "loss": 1.0154, + "step": 7085 + }, + { + "epoch": 1.261574074074074, + "grad_norm": 
0.6457428336143494, + "learning_rate": 0.0001549591271564292, + "loss": 0.9693, + "step": 7086 + }, + { + "epoch": 1.2617521367521367, + "grad_norm": 0.722259521484375, + "learning_rate": 0.0001549474326583731, + "loss": 0.9176, + "step": 7087 + }, + { + "epoch": 1.2619301994301995, + "grad_norm": 0.742477297782898, + "learning_rate": 0.0001549357370837362, + "loss": 0.9813, + "step": 7088 + }, + { + "epoch": 1.2621082621082622, + "grad_norm": 0.5981723666191101, + "learning_rate": 0.0001549240404327477, + "loss": 0.8943, + "step": 7089 + }, + { + "epoch": 1.2622863247863247, + "grad_norm": 0.6266574859619141, + "learning_rate": 0.00015491234270563665, + "loss": 0.8439, + "step": 7090 + }, + { + "epoch": 1.2624643874643875, + "grad_norm": 0.6723998188972473, + "learning_rate": 0.00015490064390263238, + "loss": 1.2278, + "step": 7091 + }, + { + "epoch": 1.26264245014245, + "grad_norm": 0.6628100275993347, + "learning_rate": 0.00015488894402396398, + "loss": 0.9526, + "step": 7092 + }, + { + "epoch": 1.2628205128205128, + "grad_norm": 0.6661350727081299, + "learning_rate": 0.0001548772430698608, + "loss": 0.974, + "step": 7093 + }, + { + "epoch": 1.2629985754985755, + "grad_norm": 0.8210669755935669, + "learning_rate": 0.000154865541040552, + "loss": 1.1142, + "step": 7094 + }, + { + "epoch": 1.2631766381766383, + "grad_norm": 0.6329003572463989, + "learning_rate": 0.0001548538379362669, + "loss": 0.8485, + "step": 7095 + }, + { + "epoch": 1.2633547008547008, + "grad_norm": 0.6288384795188904, + "learning_rate": 0.0001548421337572348, + "loss": 0.816, + "step": 7096 + }, + { + "epoch": 1.2635327635327636, + "grad_norm": 0.631060004234314, + "learning_rate": 0.00015483042850368504, + "loss": 0.8237, + "step": 7097 + }, + { + "epoch": 1.263710826210826, + "grad_norm": 0.7343839406967163, + "learning_rate": 0.0001548187221758469, + "loss": 1.1507, + "step": 7098 + }, + { + "epoch": 1.2638888888888888, + "grad_norm": 0.6313042640686035, + "learning_rate": 
0.0001548070147739498, + "loss": 0.7762, + "step": 7099 + }, + { + "epoch": 1.2640669515669516, + "grad_norm": 0.6449850797653198, + "learning_rate": 0.00015479530629822308, + "loss": 0.9225, + "step": 7100 + }, + { + "epoch": 1.2642450142450143, + "grad_norm": 0.6371589303016663, + "learning_rate": 0.00015478359674889617, + "loss": 1.0088, + "step": 7101 + }, + { + "epoch": 1.2644230769230769, + "grad_norm": 0.6483678221702576, + "learning_rate": 0.00015477188612619849, + "loss": 0.6234, + "step": 7102 + }, + { + "epoch": 1.2646011396011396, + "grad_norm": 0.6945441365242004, + "learning_rate": 0.00015476017443035947, + "loss": 1.123, + "step": 7103 + }, + { + "epoch": 1.2647792022792022, + "grad_norm": 0.6356340050697327, + "learning_rate": 0.00015474846166160856, + "loss": 0.9923, + "step": 7104 + }, + { + "epoch": 1.264957264957265, + "grad_norm": 0.6774702668190002, + "learning_rate": 0.00015473674782017532, + "loss": 0.9694, + "step": 7105 + }, + { + "epoch": 1.2651353276353277, + "grad_norm": 0.6332793831825256, + "learning_rate": 0.0001547250329062892, + "loss": 1.0633, + "step": 7106 + }, + { + "epoch": 1.2653133903133904, + "grad_norm": 0.6563684344291687, + "learning_rate": 0.00015471331692017972, + "loss": 1.0893, + "step": 7107 + }, + { + "epoch": 1.265491452991453, + "grad_norm": 0.7318371534347534, + "learning_rate": 0.0001547015998620765, + "loss": 1.1777, + "step": 7108 + }, + { + "epoch": 1.2656695156695157, + "grad_norm": 0.7099173069000244, + "learning_rate": 0.000154689881732209, + "loss": 1.1717, + "step": 7109 + }, + { + "epoch": 1.2658475783475782, + "grad_norm": 0.661078691482544, + "learning_rate": 0.00015467816253080693, + "loss": 1.0448, + "step": 7110 + }, + { + "epoch": 1.266025641025641, + "grad_norm": 0.6206802129745483, + "learning_rate": 0.0001546664422580998, + "loss": 0.9334, + "step": 7111 + }, + { + "epoch": 1.2662037037037037, + "grad_norm": 0.6514355540275574, + "learning_rate": 0.00015465472091431728, + "loss": 0.9533, + 
"step": 7112 + }, + { + "epoch": 1.2663817663817665, + "grad_norm": 0.6090209484100342, + "learning_rate": 0.0001546429984996891, + "loss": 0.9206, + "step": 7113 + }, + { + "epoch": 1.266559829059829, + "grad_norm": 0.6345987915992737, + "learning_rate": 0.00015463127501444488, + "loss": 1.0537, + "step": 7114 + }, + { + "epoch": 1.2667378917378918, + "grad_norm": 0.6095160245895386, + "learning_rate": 0.0001546195504588143, + "loss": 0.8652, + "step": 7115 + }, + { + "epoch": 1.2669159544159543, + "grad_norm": 0.6751621961593628, + "learning_rate": 0.00015460782483302707, + "loss": 0.9001, + "step": 7116 + }, + { + "epoch": 1.267094017094017, + "grad_norm": 0.6261575222015381, + "learning_rate": 0.00015459609813731295, + "loss": 0.929, + "step": 7117 + }, + { + "epoch": 1.2672720797720798, + "grad_norm": 0.589495837688446, + "learning_rate": 0.0001545843703719017, + "loss": 0.9023, + "step": 7118 + }, + { + "epoch": 1.2674501424501425, + "grad_norm": 0.6364617943763733, + "learning_rate": 0.00015457264153702311, + "loss": 0.8261, + "step": 7119 + }, + { + "epoch": 1.267628205128205, + "grad_norm": 0.6685599684715271, + "learning_rate": 0.00015456091163290698, + "loss": 1.1267, + "step": 7120 + }, + { + "epoch": 1.2678062678062678, + "grad_norm": 0.6440932750701904, + "learning_rate": 0.0001545491806597831, + "loss": 0.9643, + "step": 7121 + }, + { + "epoch": 1.2679843304843303, + "grad_norm": 0.7641597390174866, + "learning_rate": 0.00015453744861788137, + "loss": 1.1577, + "step": 7122 + }, + { + "epoch": 1.268162393162393, + "grad_norm": 0.6965937614440918, + "learning_rate": 0.00015452571550743163, + "loss": 0.7835, + "step": 7123 + }, + { + "epoch": 1.2683404558404558, + "grad_norm": 0.6332844495773315, + "learning_rate": 0.00015451398132866376, + "loss": 0.9794, + "step": 7124 + }, + { + "epoch": 1.2685185185185186, + "grad_norm": 0.6719903349876404, + "learning_rate": 0.00015450224608180765, + "loss": 0.9795, + "step": 7125 + }, + { + "epoch": 
1.2686965811965811, + "grad_norm": 0.567414402961731, + "learning_rate": 0.00015449050976709328, + "loss": 0.9737, + "step": 7126 + }, + { + "epoch": 1.2688746438746439, + "grad_norm": 0.6810645461082458, + "learning_rate": 0.0001544787723847505, + "loss": 1.2358, + "step": 7127 + }, + { + "epoch": 1.2690527065527066, + "grad_norm": 0.6693191528320312, + "learning_rate": 0.00015446703393500938, + "loss": 0.9475, + "step": 7128 + }, + { + "epoch": 1.2692307692307692, + "grad_norm": 0.7077522277832031, + "learning_rate": 0.00015445529441809988, + "loss": 1.013, + "step": 7129 + }, + { + "epoch": 1.269408831908832, + "grad_norm": 0.6596258878707886, + "learning_rate": 0.000154443553834252, + "loss": 1.1506, + "step": 7130 + }, + { + "epoch": 1.2695868945868947, + "grad_norm": 0.6721500754356384, + "learning_rate": 0.0001544318121836958, + "loss": 0.8848, + "step": 7131 + }, + { + "epoch": 1.2697649572649572, + "grad_norm": 0.6943998336791992, + "learning_rate": 0.00015442006946666132, + "loss": 1.1118, + "step": 7132 + }, + { + "epoch": 1.26994301994302, + "grad_norm": 0.6132234930992126, + "learning_rate": 0.0001544083256833786, + "loss": 0.9932, + "step": 7133 + }, + { + "epoch": 1.2701210826210827, + "grad_norm": 0.7337939739227295, + "learning_rate": 0.00015439658083407775, + "loss": 1.0973, + "step": 7134 + }, + { + "epoch": 1.2702991452991452, + "grad_norm": 0.6551772356033325, + "learning_rate": 0.00015438483491898893, + "loss": 1.0006, + "step": 7135 + }, + { + "epoch": 1.270477207977208, + "grad_norm": 0.660068929195404, + "learning_rate": 0.00015437308793834223, + "loss": 0.9291, + "step": 7136 + }, + { + "epoch": 1.2706552706552707, + "grad_norm": 0.7622788548469543, + "learning_rate": 0.00015436133989236783, + "loss": 1.0782, + "step": 7137 + }, + { + "epoch": 1.2708333333333333, + "grad_norm": 0.848494291305542, + "learning_rate": 0.00015434959078129587, + "loss": 1.2001, + "step": 7138 + }, + { + "epoch": 1.271011396011396, + "grad_norm": 
0.6222602725028992, + "learning_rate": 0.0001543378406053566, + "loss": 1.011, + "step": 7139 + }, + { + "epoch": 1.2711894586894588, + "grad_norm": 0.6164663434028625, + "learning_rate": 0.00015432608936478026, + "loss": 1.0282, + "step": 7140 + }, + { + "epoch": 1.2713675213675213, + "grad_norm": 0.7236546277999878, + "learning_rate": 0.000154314337059797, + "loss": 1.0112, + "step": 7141 + }, + { + "epoch": 1.271545584045584, + "grad_norm": 0.6891111135482788, + "learning_rate": 0.00015430258369063715, + "loss": 1.1191, + "step": 7142 + }, + { + "epoch": 1.2717236467236468, + "grad_norm": 0.6600295901298523, + "learning_rate": 0.00015429082925753099, + "loss": 0.9561, + "step": 7143 + }, + { + "epoch": 1.2719017094017093, + "grad_norm": 0.6819902062416077, + "learning_rate": 0.0001542790737607088, + "loss": 1.0631, + "step": 7144 + }, + { + "epoch": 1.272079772079772, + "grad_norm": 0.6518470644950867, + "learning_rate": 0.0001542673172004009, + "loss": 1.0806, + "step": 7145 + }, + { + "epoch": 1.2722578347578348, + "grad_norm": 0.737501859664917, + "learning_rate": 0.00015425555957683767, + "loss": 1.0144, + "step": 7146 + }, + { + "epoch": 1.2724358974358974, + "grad_norm": 0.6245740652084351, + "learning_rate": 0.00015424380089024944, + "loss": 1.0612, + "step": 7147 + }, + { + "epoch": 1.27261396011396, + "grad_norm": 0.7118125557899475, + "learning_rate": 0.0001542320411408666, + "loss": 1.1458, + "step": 7148 + }, + { + "epoch": 1.2727920227920229, + "grad_norm": 0.6965761780738831, + "learning_rate": 0.00015422028032891958, + "loss": 0.8052, + "step": 7149 + }, + { + "epoch": 1.2729700854700854, + "grad_norm": 0.7661466598510742, + "learning_rate": 0.0001542085184546388, + "loss": 1.1245, + "step": 7150 + }, + { + "epoch": 1.2731481481481481, + "grad_norm": 0.7238876223564148, + "learning_rate": 0.00015419675551825475, + "loss": 0.9346, + "step": 7151 + }, + { + "epoch": 1.273326210826211, + "grad_norm": 0.669562041759491, + "learning_rate": 
0.0001541849915199978, + "loss": 0.7816, + "step": 7152 + }, + { + "epoch": 1.2735042735042734, + "grad_norm": 0.6799174547195435, + "learning_rate": 0.00015417322646009855, + "loss": 1.047, + "step": 7153 + }, + { + "epoch": 1.2736823361823362, + "grad_norm": 0.6012796759605408, + "learning_rate": 0.00015416146033878745, + "loss": 1.0101, + "step": 7154 + }, + { + "epoch": 1.273860398860399, + "grad_norm": 0.7008427977561951, + "learning_rate": 0.00015414969315629505, + "loss": 1.1321, + "step": 7155 + }, + { + "epoch": 1.2740384615384617, + "grad_norm": 0.6555556058883667, + "learning_rate": 0.0001541379249128519, + "loss": 0.9926, + "step": 7156 + }, + { + "epoch": 1.2742165242165242, + "grad_norm": 0.6324251294136047, + "learning_rate": 0.00015412615560868854, + "loss": 0.9051, + "step": 7157 + }, + { + "epoch": 1.274394586894587, + "grad_norm": 0.6035568714141846, + "learning_rate": 0.0001541143852440356, + "loss": 0.8248, + "step": 7158 + }, + { + "epoch": 1.2745726495726495, + "grad_norm": 0.6733569502830505, + "learning_rate": 0.0001541026138191237, + "loss": 0.9149, + "step": 7159 + }, + { + "epoch": 1.2747507122507122, + "grad_norm": 0.8306798338890076, + "learning_rate": 0.0001540908413341835, + "loss": 1.0694, + "step": 7160 + }, + { + "epoch": 1.274928774928775, + "grad_norm": 0.6649713516235352, + "learning_rate": 0.00015407906778944563, + "loss": 1.1358, + "step": 7161 + }, + { + "epoch": 1.2751068376068377, + "grad_norm": 0.6889697909355164, + "learning_rate": 0.00015406729318514074, + "loss": 1.0096, + "step": 7162 + }, + { + "epoch": 1.2752849002849003, + "grad_norm": 0.6948645114898682, + "learning_rate": 0.0001540555175214996, + "loss": 1.0649, + "step": 7163 + }, + { + "epoch": 1.275462962962963, + "grad_norm": 0.6844844818115234, + "learning_rate": 0.0001540437407987528, + "loss": 0.884, + "step": 7164 + }, + { + "epoch": 1.2756410256410255, + "grad_norm": 0.7124526500701904, + "learning_rate": 0.00015403196301713124, + "loss": 1.1307, + 
"step": 7165 + }, + { + "epoch": 1.2758190883190883, + "grad_norm": 0.7328375577926636, + "learning_rate": 0.00015402018417686556, + "loss": 1.0348, + "step": 7166 + }, + { + "epoch": 1.275997150997151, + "grad_norm": 0.5872696042060852, + "learning_rate": 0.00015400840427818663, + "loss": 0.9827, + "step": 7167 + }, + { + "epoch": 1.2761752136752138, + "grad_norm": 0.6370702385902405, + "learning_rate": 0.00015399662332132519, + "loss": 0.9171, + "step": 7168 + }, + { + "epoch": 1.2763532763532763, + "grad_norm": 0.6481866240501404, + "learning_rate": 0.00015398484130651205, + "loss": 0.8704, + "step": 7169 + }, + { + "epoch": 1.276531339031339, + "grad_norm": 0.598739743232727, + "learning_rate": 0.00015397305823397812, + "loss": 0.8097, + "step": 7170 + }, + { + "epoch": 1.2767094017094016, + "grad_norm": 0.5941228270530701, + "learning_rate": 0.00015396127410395423, + "loss": 0.8853, + "step": 7171 + }, + { + "epoch": 1.2768874643874644, + "grad_norm": 0.6485885381698608, + "learning_rate": 0.00015394948891667127, + "loss": 0.702, + "step": 7172 + }, + { + "epoch": 1.2770655270655271, + "grad_norm": 0.5314942598342896, + "learning_rate": 0.00015393770267236017, + "loss": 0.7899, + "step": 7173 + }, + { + "epoch": 1.2772435897435899, + "grad_norm": 0.6113781929016113, + "learning_rate": 0.00015392591537125182, + "loss": 0.9871, + "step": 7174 + }, + { + "epoch": 1.2774216524216524, + "grad_norm": 0.5625866651535034, + "learning_rate": 0.00015391412701357715, + "loss": 0.8246, + "step": 7175 + }, + { + "epoch": 1.2775997150997151, + "grad_norm": 0.6006998419761658, + "learning_rate": 0.00015390233759956718, + "loss": 0.899, + "step": 7176 + }, + { + "epoch": 1.2777777777777777, + "grad_norm": 0.6916918158531189, + "learning_rate": 0.0001538905471294529, + "loss": 1.0443, + "step": 7177 + }, + { + "epoch": 1.2779558404558404, + "grad_norm": 0.6263536810874939, + "learning_rate": 0.00015387875560346525, + "loss": 0.9159, + "step": 7178 + }, + { + "epoch": 
1.2781339031339032, + "grad_norm": 0.6563085913658142, + "learning_rate": 0.00015386696302183535, + "loss": 0.994, + "step": 7179 + }, + { + "epoch": 1.278311965811966, + "grad_norm": 0.6312007904052734, + "learning_rate": 0.00015385516938479416, + "loss": 0.9148, + "step": 7180 + }, + { + "epoch": 1.2784900284900285, + "grad_norm": 0.6408209204673767, + "learning_rate": 0.00015384337469257284, + "loss": 1.0508, + "step": 7181 + }, + { + "epoch": 1.2786680911680912, + "grad_norm": 0.656234085559845, + "learning_rate": 0.00015383157894540244, + "loss": 0.9952, + "step": 7182 + }, + { + "epoch": 1.2788461538461537, + "grad_norm": 0.7401639819145203, + "learning_rate": 0.00015381978214351407, + "loss": 1.1615, + "step": 7183 + }, + { + "epoch": 1.2790242165242165, + "grad_norm": 0.5746055841445923, + "learning_rate": 0.00015380798428713885, + "loss": 0.9142, + "step": 7184 + }, + { + "epoch": 1.2792022792022792, + "grad_norm": 0.8061720728874207, + "learning_rate": 0.00015379618537650797, + "loss": 1.13, + "step": 7185 + }, + { + "epoch": 1.279380341880342, + "grad_norm": 0.6336073875427246, + "learning_rate": 0.0001537843854118526, + "loss": 1.0581, + "step": 7186 + }, + { + "epoch": 1.2795584045584045, + "grad_norm": 0.6549856066703796, + "learning_rate": 0.0001537725843934039, + "loss": 1.09, + "step": 7187 + }, + { + "epoch": 1.2797364672364673, + "grad_norm": 0.5759010910987854, + "learning_rate": 0.00015376078232139315, + "loss": 0.8441, + "step": 7188 + }, + { + "epoch": 1.2799145299145298, + "grad_norm": 0.5733884572982788, + "learning_rate": 0.00015374897919605152, + "loss": 0.9086, + "step": 7189 + }, + { + "epoch": 1.2800925925925926, + "grad_norm": 0.6505870819091797, + "learning_rate": 0.0001537371750176103, + "loss": 1.1683, + "step": 7190 + }, + { + "epoch": 1.2802706552706553, + "grad_norm": 0.6744688153266907, + "learning_rate": 0.00015372536978630077, + "loss": 0.9483, + "step": 7191 + }, + { + "epoch": 1.280448717948718, + "grad_norm": 
0.598098874092102, + "learning_rate": 0.0001537135635023542, + "loss": 0.7747, + "step": 7192 + }, + { + "epoch": 1.2806267806267806, + "grad_norm": 0.6711761951446533, + "learning_rate": 0.00015370175616600195, + "loss": 1.1897, + "step": 7193 + }, + { + "epoch": 1.2808048433048433, + "grad_norm": 0.6207453608512878, + "learning_rate": 0.00015368994777747536, + "loss": 1.0063, + "step": 7194 + }, + { + "epoch": 1.2809829059829059, + "grad_norm": 0.6701686382293701, + "learning_rate": 0.00015367813833700575, + "loss": 1.0864, + "step": 7195 + }, + { + "epoch": 1.2811609686609686, + "grad_norm": 0.5916469693183899, + "learning_rate": 0.00015366632784482456, + "loss": 0.8786, + "step": 7196 + }, + { + "epoch": 1.2813390313390314, + "grad_norm": 0.6567547917366028, + "learning_rate": 0.00015365451630116312, + "loss": 0.9977, + "step": 7197 + }, + { + "epoch": 1.2815170940170941, + "grad_norm": 0.7287433743476868, + "learning_rate": 0.00015364270370625294, + "loss": 1.1248, + "step": 7198 + }, + { + "epoch": 1.2816951566951567, + "grad_norm": 0.7736039161682129, + "learning_rate": 0.0001536308900603254, + "loss": 0.9832, + "step": 7199 + }, + { + "epoch": 1.2818732193732194, + "grad_norm": 0.6799852252006531, + "learning_rate": 0.00015361907536361194, + "loss": 1.0275, + "step": 7200 + }, + { + "epoch": 1.282051282051282, + "grad_norm": 0.5975812673568726, + "learning_rate": 0.00015360725961634407, + "loss": 1.0516, + "step": 7201 + }, + { + "epoch": 1.2822293447293447, + "grad_norm": 0.616307258605957, + "learning_rate": 0.00015359544281875337, + "loss": 0.8095, + "step": 7202 + }, + { + "epoch": 1.2824074074074074, + "grad_norm": 0.6357580423355103, + "learning_rate": 0.00015358362497107126, + "loss": 0.9186, + "step": 7203 + }, + { + "epoch": 1.2825854700854702, + "grad_norm": 0.679333508014679, + "learning_rate": 0.00015357180607352935, + "loss": 0.9433, + "step": 7204 + }, + { + "epoch": 1.2827635327635327, + "grad_norm": 0.6345439553260803, + "learning_rate": 
0.00015355998612635914, + "loss": 0.9186, + "step": 7205 + }, + { + "epoch": 1.2829415954415955, + "grad_norm": 0.6256508827209473, + "learning_rate": 0.00015354816512979231, + "loss": 0.9984, + "step": 7206 + }, + { + "epoch": 1.283119658119658, + "grad_norm": 0.7973852753639221, + "learning_rate": 0.00015353634308406044, + "loss": 1.1145, + "step": 7207 + }, + { + "epoch": 1.2832977207977208, + "grad_norm": 0.711125910282135, + "learning_rate": 0.0001535245199893951, + "loss": 1.1947, + "step": 7208 + }, + { + "epoch": 1.2834757834757835, + "grad_norm": 0.6096055507659912, + "learning_rate": 0.00015351269584602798, + "loss": 1.0078, + "step": 7209 + }, + { + "epoch": 1.2836538461538463, + "grad_norm": 0.7089232802391052, + "learning_rate": 0.00015350087065419077, + "loss": 1.112, + "step": 7210 + }, + { + "epoch": 1.2838319088319088, + "grad_norm": 0.716199517250061, + "learning_rate": 0.00015348904441411508, + "loss": 1.1015, + "step": 7211 + }, + { + "epoch": 1.2840099715099715, + "grad_norm": 0.6374632716178894, + "learning_rate": 0.00015347721712603276, + "loss": 1.0519, + "step": 7212 + }, + { + "epoch": 1.284188034188034, + "grad_norm": 0.6500036716461182, + "learning_rate": 0.0001534653887901754, + "loss": 1.1719, + "step": 7213 + }, + { + "epoch": 1.2843660968660968, + "grad_norm": 0.7249937653541565, + "learning_rate": 0.00015345355940677485, + "loss": 1.0188, + "step": 7214 + }, + { + "epoch": 1.2845441595441596, + "grad_norm": 0.6645919680595398, + "learning_rate": 0.00015344172897606285, + "loss": 0.9788, + "step": 7215 + }, + { + "epoch": 1.2847222222222223, + "grad_norm": 0.7032710313796997, + "learning_rate": 0.00015342989749827113, + "loss": 1.1093, + "step": 7216 + }, + { + "epoch": 1.2849002849002849, + "grad_norm": 0.622767984867096, + "learning_rate": 0.0001534180649736316, + "loss": 0.8978, + "step": 7217 + }, + { + "epoch": 1.2850783475783476, + "grad_norm": 0.7499693036079407, + "learning_rate": 0.00015340623140237605, + "loss": 1.2232, + 
"step": 7218 + }, + { + "epoch": 1.2852564102564101, + "grad_norm": 0.6308625936508179, + "learning_rate": 0.00015339439678473636, + "loss": 0.8621, + "step": 7219 + }, + { + "epoch": 1.2854344729344729, + "grad_norm": 0.6513667106628418, + "learning_rate": 0.00015338256112094434, + "loss": 1.0541, + "step": 7220 + }, + { + "epoch": 1.2856125356125356, + "grad_norm": 0.6080937385559082, + "learning_rate": 0.00015337072441123193, + "loss": 0.8474, + "step": 7221 + }, + { + "epoch": 1.2857905982905984, + "grad_norm": 0.6742652058601379, + "learning_rate": 0.00015335888665583104, + "loss": 1.0172, + "step": 7222 + }, + { + "epoch": 1.285968660968661, + "grad_norm": 0.620810866355896, + "learning_rate": 0.00015334704785497364, + "loss": 1.049, + "step": 7223 + }, + { + "epoch": 1.2861467236467237, + "grad_norm": 0.5733018517494202, + "learning_rate": 0.00015333520800889165, + "loss": 0.7371, + "step": 7224 + }, + { + "epoch": 1.2863247863247862, + "grad_norm": 0.6447640061378479, + "learning_rate": 0.00015332336711781702, + "loss": 0.9925, + "step": 7225 + }, + { + "epoch": 1.286502849002849, + "grad_norm": 0.6764999628067017, + "learning_rate": 0.00015331152518198183, + "loss": 0.9052, + "step": 7226 + }, + { + "epoch": 1.2866809116809117, + "grad_norm": 0.6492836475372314, + "learning_rate": 0.00015329968220161803, + "loss": 0.9493, + "step": 7227 + }, + { + "epoch": 1.2868589743589745, + "grad_norm": 0.666157603263855, + "learning_rate": 0.00015328783817695766, + "loss": 1.0626, + "step": 7228 + }, + { + "epoch": 1.287037037037037, + "grad_norm": 0.7098026871681213, + "learning_rate": 0.00015327599310823283, + "loss": 1.0461, + "step": 7229 + }, + { + "epoch": 1.2872150997150997, + "grad_norm": 0.637778103351593, + "learning_rate": 0.00015326414699567555, + "loss": 0.9383, + "step": 7230 + }, + { + "epoch": 1.2873931623931623, + "grad_norm": 0.6816399693489075, + "learning_rate": 0.00015325229983951798, + "loss": 1.0647, + "step": 7231 + }, + { + "epoch": 
1.287571225071225, + "grad_norm": 0.668689489364624, + "learning_rate": 0.0001532404516399922, + "loss": 1.0479, + "step": 7232 + }, + { + "epoch": 1.2877492877492878, + "grad_norm": 0.6459103226661682, + "learning_rate": 0.0001532286023973304, + "loss": 1.1751, + "step": 7233 + }, + { + "epoch": 1.2879273504273505, + "grad_norm": 0.679999589920044, + "learning_rate": 0.00015321675211176468, + "loss": 0.7541, + "step": 7234 + }, + { + "epoch": 1.288105413105413, + "grad_norm": 0.5415067672729492, + "learning_rate": 0.00015320490078352724, + "loss": 0.822, + "step": 7235 + }, + { + "epoch": 1.2882834757834758, + "grad_norm": 0.6817963719367981, + "learning_rate": 0.00015319304841285032, + "loss": 0.9424, + "step": 7236 + }, + { + "epoch": 1.2884615384615383, + "grad_norm": 0.6187505125999451, + "learning_rate": 0.0001531811949999661, + "loss": 0.8596, + "step": 7237 + }, + { + "epoch": 1.288639601139601, + "grad_norm": 0.6737838387489319, + "learning_rate": 0.00015316934054510685, + "loss": 1.0046, + "step": 7238 + }, + { + "epoch": 1.2888176638176638, + "grad_norm": 0.6445996761322021, + "learning_rate": 0.00015315748504850482, + "loss": 1.01, + "step": 7239 + }, + { + "epoch": 1.2889957264957266, + "grad_norm": 0.7279136180877686, + "learning_rate": 0.0001531456285103923, + "loss": 0.9066, + "step": 7240 + }, + { + "epoch": 1.289173789173789, + "grad_norm": 0.6619178652763367, + "learning_rate": 0.00015313377093100153, + "loss": 0.8977, + "step": 7241 + }, + { + "epoch": 1.2893518518518519, + "grad_norm": 0.7644323110580444, + "learning_rate": 0.000153121912310565, + "loss": 1.3085, + "step": 7242 + }, + { + "epoch": 1.2895299145299146, + "grad_norm": 0.645882248878479, + "learning_rate": 0.00015311005264931487, + "loss": 1.0337, + "step": 7243 + }, + { + "epoch": 1.2897079772079771, + "grad_norm": 0.6868017911911011, + "learning_rate": 0.0001530981919474836, + "loss": 0.9616, + "step": 7244 + }, + { + "epoch": 1.28988603988604, + "grad_norm": 0.7176693677902222, 
+ "learning_rate": 0.00015308633020530362, + "loss": 1.1975, + "step": 7245 + }, + { + "epoch": 1.2900641025641026, + "grad_norm": 0.7358015775680542, + "learning_rate": 0.00015307446742300718, + "loss": 0.9308, + "step": 7246 + }, + { + "epoch": 1.2902421652421652, + "grad_norm": 0.7330248355865479, + "learning_rate": 0.00015306260360082688, + "loss": 0.9518, + "step": 7247 + }, + { + "epoch": 1.290420227920228, + "grad_norm": 0.6571981310844421, + "learning_rate": 0.00015305073873899503, + "loss": 0.9531, + "step": 7248 + }, + { + "epoch": 1.2905982905982907, + "grad_norm": 0.5968486666679382, + "learning_rate": 0.00015303887283774417, + "loss": 0.9245, + "step": 7249 + }, + { + "epoch": 1.2907763532763532, + "grad_norm": 0.6398176550865173, + "learning_rate": 0.0001530270058973068, + "loss": 1.0452, + "step": 7250 + }, + { + "epoch": 1.290954415954416, + "grad_norm": 0.5462267994880676, + "learning_rate": 0.00015301513791791542, + "loss": 0.8451, + "step": 7251 + }, + { + "epoch": 1.2911324786324787, + "grad_norm": 0.7536166906356812, + "learning_rate": 0.00015300326889980252, + "loss": 1.0086, + "step": 7252 + }, + { + "epoch": 1.2913105413105412, + "grad_norm": 0.6208569407463074, + "learning_rate": 0.00015299139884320065, + "loss": 0.7437, + "step": 7253 + }, + { + "epoch": 1.291488603988604, + "grad_norm": 0.7025452852249146, + "learning_rate": 0.00015297952774834242, + "loss": 0.8874, + "step": 7254 + }, + { + "epoch": 1.2916666666666667, + "grad_norm": 0.6758308410644531, + "learning_rate": 0.00015296765561546041, + "loss": 1.0378, + "step": 7255 + }, + { + "epoch": 1.2918447293447293, + "grad_norm": 0.7170431613922119, + "learning_rate": 0.00015295578244478724, + "loss": 1.0111, + "step": 7256 + }, + { + "epoch": 1.292022792022792, + "grad_norm": 0.6263511180877686, + "learning_rate": 0.00015294390823655544, + "loss": 0.7836, + "step": 7257 + }, + { + "epoch": 1.2922008547008548, + "grad_norm": 0.5887803435325623, + "learning_rate": 0.0001529320329909978, 
+ "loss": 1.068, + "step": 7258 + }, + { + "epoch": 1.2923789173789173, + "grad_norm": 0.5955889821052551, + "learning_rate": 0.00015292015670834692, + "loss": 0.8903, + "step": 7259 + }, + { + "epoch": 1.29255698005698, + "grad_norm": 0.630449652671814, + "learning_rate": 0.00015290827938883552, + "loss": 1.1096, + "step": 7260 + }, + { + "epoch": 1.2927350427350428, + "grad_norm": 0.7405480146408081, + "learning_rate": 0.00015289640103269625, + "loss": 1.0648, + "step": 7261 + }, + { + "epoch": 1.2929131054131053, + "grad_norm": 0.6082221865653992, + "learning_rate": 0.00015288452164016191, + "loss": 0.9266, + "step": 7262 + }, + { + "epoch": 1.293091168091168, + "grad_norm": 0.6211720108985901, + "learning_rate": 0.00015287264121146524, + "loss": 0.849, + "step": 7263 + }, + { + "epoch": 1.2932692307692308, + "grad_norm": 0.6481043100357056, + "learning_rate": 0.00015286075974683898, + "loss": 0.7761, + "step": 7264 + }, + { + "epoch": 1.2934472934472934, + "grad_norm": 0.5957167744636536, + "learning_rate": 0.00015284887724651593, + "loss": 0.8942, + "step": 7265 + }, + { + "epoch": 1.2936253561253561, + "grad_norm": 0.7272268533706665, + "learning_rate": 0.00015283699371072894, + "loss": 1.0913, + "step": 7266 + }, + { + "epoch": 1.2938034188034189, + "grad_norm": 0.5902758836746216, + "learning_rate": 0.0001528251091397108, + "loss": 1.1045, + "step": 7267 + }, + { + "epoch": 1.2939814814814814, + "grad_norm": 0.6382482051849365, + "learning_rate": 0.00015281322353369436, + "loss": 0.9265, + "step": 7268 + }, + { + "epoch": 1.2941595441595442, + "grad_norm": 0.6556048393249512, + "learning_rate": 0.00015280133689291256, + "loss": 1.0536, + "step": 7269 + }, + { + "epoch": 1.294337606837607, + "grad_norm": 0.680895209312439, + "learning_rate": 0.00015278944921759822, + "loss": 0.9996, + "step": 7270 + }, + { + "epoch": 1.2945156695156697, + "grad_norm": 0.670317530632019, + "learning_rate": 0.00015277756050798428, + "loss": 1.1402, + "step": 7271 + }, + { + 
"epoch": 1.2946937321937322, + "grad_norm": 0.6312688589096069, + "learning_rate": 0.0001527656707643037, + "loss": 1.0669, + "step": 7272 + }, + { + "epoch": 1.294871794871795, + "grad_norm": 0.6267009973526001, + "learning_rate": 0.0001527537799867894, + "loss": 0.8985, + "step": 7273 + }, + { + "epoch": 1.2950498575498575, + "grad_norm": 0.7069001197814941, + "learning_rate": 0.00015274188817567436, + "loss": 0.9478, + "step": 7274 + }, + { + "epoch": 1.2952279202279202, + "grad_norm": 0.7229067087173462, + "learning_rate": 0.00015272999533119162, + "loss": 0.9005, + "step": 7275 + }, + { + "epoch": 1.295405982905983, + "grad_norm": 0.6254632472991943, + "learning_rate": 0.00015271810145357412, + "loss": 0.9746, + "step": 7276 + }, + { + "epoch": 1.2955840455840457, + "grad_norm": 0.6772669553756714, + "learning_rate": 0.00015270620654305494, + "loss": 1.1714, + "step": 7277 + }, + { + "epoch": 1.2957621082621082, + "grad_norm": 0.605576753616333, + "learning_rate": 0.00015269431059986713, + "loss": 0.7735, + "step": 7278 + }, + { + "epoch": 1.295940170940171, + "grad_norm": 0.7144771814346313, + "learning_rate": 0.00015268241362424378, + "loss": 0.9757, + "step": 7279 + }, + { + "epoch": 1.2961182336182335, + "grad_norm": 0.5275486707687378, + "learning_rate": 0.00015267051561641798, + "loss": 0.5669, + "step": 7280 + }, + { + "epoch": 1.2962962962962963, + "grad_norm": 0.6619452238082886, + "learning_rate": 0.00015265861657662284, + "loss": 0.9511, + "step": 7281 + }, + { + "epoch": 1.296474358974359, + "grad_norm": 0.6788223385810852, + "learning_rate": 0.00015264671650509147, + "loss": 1.2649, + "step": 7282 + }, + { + "epoch": 1.2966524216524218, + "grad_norm": 0.6198732852935791, + "learning_rate": 0.00015263481540205706, + "loss": 1.0659, + "step": 7283 + }, + { + "epoch": 1.2968304843304843, + "grad_norm": 0.6038815975189209, + "learning_rate": 0.0001526229132677528, + "loss": 1.0655, + "step": 7284 + }, + { + "epoch": 1.297008547008547, + "grad_norm": 
0.7616196870803833, + "learning_rate": 0.00015261101010241186, + "loss": 1.131, + "step": 7285 + }, + { + "epoch": 1.2971866096866096, + "grad_norm": 0.7002527713775635, + "learning_rate": 0.00015259910590626746, + "loss": 1.1375, + "step": 7286 + }, + { + "epoch": 1.2973646723646723, + "grad_norm": 0.6067437529563904, + "learning_rate": 0.00015258720067955284, + "loss": 0.9306, + "step": 7287 + }, + { + "epoch": 1.297542735042735, + "grad_norm": 0.653232216835022, + "learning_rate": 0.00015257529442250128, + "loss": 1.107, + "step": 7288 + }, + { + "epoch": 1.2977207977207978, + "grad_norm": 0.6969175934791565, + "learning_rate": 0.00015256338713534603, + "loss": 0.8365, + "step": 7289 + }, + { + "epoch": 1.2978988603988604, + "grad_norm": 0.6176731586456299, + "learning_rate": 0.00015255147881832043, + "loss": 0.9707, + "step": 7290 + }, + { + "epoch": 1.2980769230769231, + "grad_norm": 0.6543741822242737, + "learning_rate": 0.00015253956947165772, + "loss": 0.7714, + "step": 7291 + }, + { + "epoch": 1.2982549857549857, + "grad_norm": 0.5224920511245728, + "learning_rate": 0.00015252765909559135, + "loss": 0.7469, + "step": 7292 + }, + { + "epoch": 1.2984330484330484, + "grad_norm": 0.638708770275116, + "learning_rate": 0.00015251574769035455, + "loss": 1.0965, + "step": 7293 + }, + { + "epoch": 1.2986111111111112, + "grad_norm": 0.6742943525314331, + "learning_rate": 0.0001525038352561808, + "loss": 1.1286, + "step": 7294 + }, + { + "epoch": 1.298789173789174, + "grad_norm": 0.6027839183807373, + "learning_rate": 0.00015249192179330346, + "loss": 0.8824, + "step": 7295 + }, + { + "epoch": 1.2989672364672364, + "grad_norm": 0.7462167143821716, + "learning_rate": 0.00015248000730195597, + "loss": 0.94, + "step": 7296 + }, + { + "epoch": 1.2991452991452992, + "grad_norm": 0.6972534656524658, + "learning_rate": 0.00015246809178237172, + "loss": 1.0664, + "step": 7297 + }, + { + "epoch": 1.2993233618233617, + "grad_norm": 0.569949209690094, + "learning_rate": 
0.0001524561752347842, + "loss": 0.691, + "step": 7298 + }, + { + "epoch": 1.2995014245014245, + "grad_norm": 0.6066586375236511, + "learning_rate": 0.00015244425765942695, + "loss": 1.083, + "step": 7299 + }, + { + "epoch": 1.2996794871794872, + "grad_norm": 0.6927483677864075, + "learning_rate": 0.00015243233905653337, + "loss": 1.0068, + "step": 7300 + }, + { + "epoch": 1.29985754985755, + "grad_norm": 0.752824604511261, + "learning_rate": 0.00015242041942633704, + "loss": 0.9946, + "step": 7301 + }, + { + "epoch": 1.3000356125356125, + "grad_norm": 0.6532080173492432, + "learning_rate": 0.0001524084987690715, + "loss": 1.2326, + "step": 7302 + }, + { + "epoch": 1.3002136752136753, + "grad_norm": 0.7954180836677551, + "learning_rate": 0.0001523965770849703, + "loss": 1.1105, + "step": 7303 + }, + { + "epoch": 1.3003917378917378, + "grad_norm": 0.5971781015396118, + "learning_rate": 0.000152384654374267, + "loss": 1.0984, + "step": 7304 + }, + { + "epoch": 1.3005698005698005, + "grad_norm": 0.7778682112693787, + "learning_rate": 0.0001523727306371952, + "loss": 1.0795, + "step": 7305 + }, + { + "epoch": 1.3007478632478633, + "grad_norm": 0.6712004542350769, + "learning_rate": 0.00015236080587398856, + "loss": 1.0814, + "step": 7306 + }, + { + "epoch": 1.300925925925926, + "grad_norm": 0.581048846244812, + "learning_rate": 0.00015234888008488066, + "loss": 0.9868, + "step": 7307 + }, + { + "epoch": 1.3011039886039886, + "grad_norm": 0.697695791721344, + "learning_rate": 0.00015233695327010523, + "loss": 1.1045, + "step": 7308 + }, + { + "epoch": 1.3012820512820513, + "grad_norm": 0.6858421564102173, + "learning_rate": 0.00015232502542989593, + "loss": 1.0769, + "step": 7309 + }, + { + "epoch": 1.3014601139601139, + "grad_norm": 0.6312826871871948, + "learning_rate": 0.00015231309656448642, + "loss": 0.9523, + "step": 7310 + }, + { + "epoch": 1.3016381766381766, + "grad_norm": 0.9243300557136536, + "learning_rate": 0.0001523011666741105, + "loss": 0.947, + "step": 
7311 + }, + { + "epoch": 1.3018162393162394, + "grad_norm": 0.6808217763900757, + "learning_rate": 0.00015228923575900184, + "loss": 0.8631, + "step": 7312 + }, + { + "epoch": 1.301994301994302, + "grad_norm": 0.6713891625404358, + "learning_rate": 0.00015227730381939424, + "loss": 0.9157, + "step": 7313 + }, + { + "epoch": 1.3021723646723646, + "grad_norm": 0.6802582740783691, + "learning_rate": 0.00015226537085552146, + "loss": 1.041, + "step": 7314 + }, + { + "epoch": 1.3023504273504274, + "grad_norm": 0.6543951034545898, + "learning_rate": 0.0001522534368676173, + "loss": 0.8709, + "step": 7315 + }, + { + "epoch": 1.30252849002849, + "grad_norm": 0.6290678381919861, + "learning_rate": 0.0001522415018559156, + "loss": 1.0568, + "step": 7316 + }, + { + "epoch": 1.3027065527065527, + "grad_norm": 0.6590015292167664, + "learning_rate": 0.0001522295658206502, + "loss": 0.9919, + "step": 7317 + }, + { + "epoch": 1.3028846153846154, + "grad_norm": 0.6374103426933289, + "learning_rate": 0.00015221762876205494, + "loss": 0.878, + "step": 7318 + }, + { + "epoch": 1.3030626780626782, + "grad_norm": 0.7247048616409302, + "learning_rate": 0.00015220569068036372, + "loss": 1.061, + "step": 7319 + }, + { + "epoch": 1.3032407407407407, + "grad_norm": 0.6450991630554199, + "learning_rate": 0.00015219375157581047, + "loss": 0.9389, + "step": 7320 + }, + { + "epoch": 1.3034188034188035, + "grad_norm": 0.8039840459823608, + "learning_rate": 0.00015218181144862903, + "loss": 1.0692, + "step": 7321 + }, + { + "epoch": 1.303596866096866, + "grad_norm": 0.6539456248283386, + "learning_rate": 0.00015216987029905346, + "loss": 1.0478, + "step": 7322 + }, + { + "epoch": 1.3037749287749287, + "grad_norm": 0.60880047082901, + "learning_rate": 0.00015215792812731758, + "loss": 0.8412, + "step": 7323 + }, + { + "epoch": 1.3039529914529915, + "grad_norm": 0.6757258176803589, + "learning_rate": 0.0001521459849336555, + "loss": 0.896, + "step": 7324 + }, + { + "epoch": 1.3041310541310542, + 
"grad_norm": 0.6735622882843018, + "learning_rate": 0.00015213404071830116, + "loss": 1.1078, + "step": 7325 + }, + { + "epoch": 1.3043091168091168, + "grad_norm": 0.7321233749389648, + "learning_rate": 0.00015212209548148858, + "loss": 1.1021, + "step": 7326 + }, + { + "epoch": 1.3044871794871795, + "grad_norm": 0.6678910851478577, + "learning_rate": 0.00015211014922345182, + "loss": 1.0043, + "step": 7327 + }, + { + "epoch": 1.304665242165242, + "grad_norm": 0.6876940727233887, + "learning_rate": 0.0001520982019444249, + "loss": 1.0376, + "step": 7328 + }, + { + "epoch": 1.3048433048433048, + "grad_norm": 0.6171853542327881, + "learning_rate": 0.00015208625364464195, + "loss": 0.839, + "step": 7329 + }, + { + "epoch": 1.3050213675213675, + "grad_norm": 0.6449569463729858, + "learning_rate": 0.0001520743043243371, + "loss": 1.0908, + "step": 7330 + }, + { + "epoch": 1.3051994301994303, + "grad_norm": 0.6894628405570984, + "learning_rate": 0.00015206235398374443, + "loss": 1.0263, + "step": 7331 + }, + { + "epoch": 1.3053774928774928, + "grad_norm": 0.5853552222251892, + "learning_rate": 0.00015205040262309804, + "loss": 0.8342, + "step": 7332 + }, + { + "epoch": 1.3055555555555556, + "grad_norm": 0.5934799313545227, + "learning_rate": 0.00015203845024263214, + "loss": 0.9464, + "step": 7333 + }, + { + "epoch": 1.305733618233618, + "grad_norm": 0.668927788734436, + "learning_rate": 0.00015202649684258095, + "loss": 0.9018, + "step": 7334 + }, + { + "epoch": 1.3059116809116809, + "grad_norm": 0.676810085773468, + "learning_rate": 0.0001520145424231786, + "loss": 0.9284, + "step": 7335 + }, + { + "epoch": 1.3060897435897436, + "grad_norm": 0.6223878264427185, + "learning_rate": 0.00015200258698465935, + "loss": 1.0779, + "step": 7336 + }, + { + "epoch": 1.3062678062678064, + "grad_norm": 0.6092363595962524, + "learning_rate": 0.00015199063052725745, + "loss": 0.8602, + "step": 7337 + }, + { + "epoch": 1.306445868945869, + "grad_norm": 0.7668731212615967, + 
"learning_rate": 0.00015197867305120712, + "loss": 1.0756, + "step": 7338 + }, + { + "epoch": 1.3066239316239316, + "grad_norm": 0.6485331654548645, + "learning_rate": 0.00015196671455674268, + "loss": 1.0193, + "step": 7339 + }, + { + "epoch": 1.3068019943019942, + "grad_norm": 0.5661036372184753, + "learning_rate": 0.0001519547550440984, + "loss": 0.8321, + "step": 7340 + }, + { + "epoch": 1.306980056980057, + "grad_norm": 0.6270507574081421, + "learning_rate": 0.00015194279451350866, + "loss": 0.6403, + "step": 7341 + }, + { + "epoch": 1.3071581196581197, + "grad_norm": 0.7283764481544495, + "learning_rate": 0.00015193083296520773, + "loss": 1.0401, + "step": 7342 + }, + { + "epoch": 1.3073361823361824, + "grad_norm": 0.658835232257843, + "learning_rate": 0.00015191887039943, + "loss": 1.0172, + "step": 7343 + }, + { + "epoch": 1.307514245014245, + "grad_norm": 0.6288984417915344, + "learning_rate": 0.00015190690681640988, + "loss": 0.8649, + "step": 7344 + }, + { + "epoch": 1.3076923076923077, + "grad_norm": 0.666442334651947, + "learning_rate": 0.00015189494221638176, + "loss": 1.0757, + "step": 7345 + }, + { + "epoch": 1.3078703703703702, + "grad_norm": 0.6116433143615723, + "learning_rate": 0.00015188297659958003, + "loss": 0.9244, + "step": 7346 + }, + { + "epoch": 1.308048433048433, + "grad_norm": 0.6378964185714722, + "learning_rate": 0.0001518710099662392, + "loss": 0.9629, + "step": 7347 + }, + { + "epoch": 1.3082264957264957, + "grad_norm": 0.6258945465087891, + "learning_rate": 0.00015185904231659357, + "loss": 0.8524, + "step": 7348 + }, + { + "epoch": 1.3084045584045585, + "grad_norm": 0.6498504877090454, + "learning_rate": 0.0001518470736508778, + "loss": 0.9685, + "step": 7349 + }, + { + "epoch": 1.308582621082621, + "grad_norm": 0.6928247809410095, + "learning_rate": 0.00015183510396932635, + "loss": 0.9054, + "step": 7350 + }, + { + "epoch": 1.3087606837606838, + "grad_norm": 0.6350936889648438, + "learning_rate": 0.0001518231332721737, + 
"loss": 1.0039, + "step": 7351 + }, + { + "epoch": 1.3089387464387463, + "grad_norm": 0.6652286648750305, + "learning_rate": 0.00015181116155965437, + "loss": 0.8946, + "step": 7352 + }, + { + "epoch": 1.309116809116809, + "grad_norm": 0.6554864048957825, + "learning_rate": 0.000151799188832003, + "loss": 0.9518, + "step": 7353 + }, + { + "epoch": 1.3092948717948718, + "grad_norm": 0.7523114085197449, + "learning_rate": 0.0001517872150894541, + "loss": 0.9462, + "step": 7354 + }, + { + "epoch": 1.3094729344729346, + "grad_norm": 0.7113336324691772, + "learning_rate": 0.0001517752403322423, + "loss": 1.2347, + "step": 7355 + }, + { + "epoch": 1.309650997150997, + "grad_norm": 0.6461622714996338, + "learning_rate": 0.00015176326456060223, + "loss": 0.8891, + "step": 7356 + }, + { + "epoch": 1.3098290598290598, + "grad_norm": 0.7429143190383911, + "learning_rate": 0.00015175128777476852, + "loss": 1.1944, + "step": 7357 + }, + { + "epoch": 1.3100071225071226, + "grad_norm": 0.6816306114196777, + "learning_rate": 0.00015173930997497585, + "loss": 1.1445, + "step": 7358 + }, + { + "epoch": 1.3101851851851851, + "grad_norm": 0.6644450426101685, + "learning_rate": 0.00015172733116145884, + "loss": 0.9808, + "step": 7359 + }, + { + "epoch": 1.3103632478632479, + "grad_norm": 0.6921063661575317, + "learning_rate": 0.00015171535133445225, + "loss": 1.0162, + "step": 7360 + }, + { + "epoch": 1.3105413105413106, + "grad_norm": 0.6386187672615051, + "learning_rate": 0.00015170337049419082, + "loss": 0.9951, + "step": 7361 + }, + { + "epoch": 1.3107193732193732, + "grad_norm": 0.6505418419837952, + "learning_rate": 0.0001516913886409092, + "loss": 0.8872, + "step": 7362 + }, + { + "epoch": 1.310897435897436, + "grad_norm": 0.6415576934814453, + "learning_rate": 0.00015167940577484222, + "loss": 1.056, + "step": 7363 + }, + { + "epoch": 1.3110754985754987, + "grad_norm": 0.6691195964813232, + "learning_rate": 0.00015166742189622458, + "loss": 1.0561, + "step": 7364 + }, + { + 
"epoch": 1.3112535612535612, + "grad_norm": 0.6376257538795471, + "learning_rate": 0.00015165543700529122, + "loss": 0.8499, + "step": 7365 + }, + { + "epoch": 1.311431623931624, + "grad_norm": 0.6270790696144104, + "learning_rate": 0.00015164345110227684, + "loss": 1.0244, + "step": 7366 + }, + { + "epoch": 1.3116096866096867, + "grad_norm": 0.7120122313499451, + "learning_rate": 0.0001516314641874163, + "loss": 1.0476, + "step": 7367 + }, + { + "epoch": 1.3117877492877492, + "grad_norm": 0.6152660250663757, + "learning_rate": 0.0001516194762609445, + "loss": 0.897, + "step": 7368 + }, + { + "epoch": 1.311965811965812, + "grad_norm": 0.7578088045120239, + "learning_rate": 0.00015160748732309626, + "loss": 1.1609, + "step": 7369 + }, + { + "epoch": 1.3121438746438747, + "grad_norm": 0.6594924330711365, + "learning_rate": 0.00015159549737410656, + "loss": 1.1706, + "step": 7370 + }, + { + "epoch": 1.3123219373219372, + "grad_norm": 0.6559173464775085, + "learning_rate": 0.00015158350641421024, + "loss": 0.9452, + "step": 7371 + }, + { + "epoch": 1.3125, + "grad_norm": 0.6667516231536865, + "learning_rate": 0.00015157151444364226, + "loss": 0.8153, + "step": 7372 + }, + { + "epoch": 1.3126780626780628, + "grad_norm": 0.7054803371429443, + "learning_rate": 0.00015155952146263761, + "loss": 0.9887, + "step": 7373 + }, + { + "epoch": 1.3128561253561253, + "grad_norm": 0.7035902142524719, + "learning_rate": 0.00015154752747143123, + "loss": 1.1832, + "step": 7374 + }, + { + "epoch": 1.313034188034188, + "grad_norm": 0.6297488212585449, + "learning_rate": 0.00015153553247025813, + "loss": 0.9602, + "step": 7375 + }, + { + "epoch": 1.3132122507122508, + "grad_norm": 0.6851378083229065, + "learning_rate": 0.00015152353645935335, + "loss": 1.0743, + "step": 7376 + }, + { + "epoch": 1.3133903133903133, + "grad_norm": 0.6215537786483765, + "learning_rate": 0.00015151153943895187, + "loss": 0.9484, + "step": 7377 + }, + { + "epoch": 1.313568376068376, + "grad_norm": 
0.6848666071891785, + "learning_rate": 0.0001514995414092888, + "loss": 1.0978, + "step": 7378 + }, + { + "epoch": 1.3137464387464388, + "grad_norm": 0.7527492642402649, + "learning_rate": 0.00015148754237059918, + "loss": 1.083, + "step": 7379 + }, + { + "epoch": 1.3139245014245013, + "grad_norm": 0.6264588236808777, + "learning_rate": 0.00015147554232311814, + "loss": 0.9995, + "step": 7380 + }, + { + "epoch": 1.314102564102564, + "grad_norm": 0.6666619181632996, + "learning_rate": 0.00015146354126708075, + "loss": 1.0156, + "step": 7381 + }, + { + "epoch": 1.3142806267806268, + "grad_norm": 0.6626597046852112, + "learning_rate": 0.00015145153920272222, + "loss": 1.0047, + "step": 7382 + }, + { + "epoch": 1.3144586894586894, + "grad_norm": 0.5975428223609924, + "learning_rate": 0.0001514395361302776, + "loss": 0.806, + "step": 7383 + }, + { + "epoch": 1.3146367521367521, + "grad_norm": 0.6509957909584045, + "learning_rate": 0.00015142753204998218, + "loss": 0.8871, + "step": 7384 + }, + { + "epoch": 1.3148148148148149, + "grad_norm": 0.6672926545143127, + "learning_rate": 0.00015141552696207108, + "loss": 0.9616, + "step": 7385 + }, + { + "epoch": 1.3149928774928774, + "grad_norm": 0.6965435147285461, + "learning_rate": 0.00015140352086677954, + "loss": 1.124, + "step": 7386 + }, + { + "epoch": 1.3151709401709402, + "grad_norm": 0.6559258103370667, + "learning_rate": 0.00015139151376434277, + "loss": 1.0271, + "step": 7387 + }, + { + "epoch": 1.315349002849003, + "grad_norm": 0.7613587379455566, + "learning_rate": 0.00015137950565499608, + "loss": 1.0349, + "step": 7388 + }, + { + "epoch": 1.3155270655270654, + "grad_norm": 0.7001944780349731, + "learning_rate": 0.0001513674965389747, + "loss": 0.8551, + "step": 7389 + }, + { + "epoch": 1.3157051282051282, + "grad_norm": 0.6087043285369873, + "learning_rate": 0.0001513554864165139, + "loss": 0.7118, + "step": 7390 + }, + { + "epoch": 1.315883190883191, + "grad_norm": 0.71526700258255, + "learning_rate": 
0.00015134347528784908, + "loss": 1.0478, + "step": 7391 + }, + { + "epoch": 1.3160612535612537, + "grad_norm": 0.6182073950767517, + "learning_rate": 0.00015133146315321548, + "loss": 0.9474, + "step": 7392 + }, + { + "epoch": 1.3162393162393162, + "grad_norm": 0.7771387696266174, + "learning_rate": 0.0001513194500128485, + "loss": 1.0544, + "step": 7393 + }, + { + "epoch": 1.316417378917379, + "grad_norm": 0.7108260989189148, + "learning_rate": 0.00015130743586698353, + "loss": 0.8813, + "step": 7394 + }, + { + "epoch": 1.3165954415954415, + "grad_norm": 0.7057309150695801, + "learning_rate": 0.0001512954207158559, + "loss": 0.899, + "step": 7395 + }, + { + "epoch": 1.3167735042735043, + "grad_norm": 0.6139237880706787, + "learning_rate": 0.00015128340455970106, + "loss": 0.8885, + "step": 7396 + }, + { + "epoch": 1.316951566951567, + "grad_norm": 0.7166598439216614, + "learning_rate": 0.00015127138739875443, + "loss": 0.9792, + "step": 7397 + }, + { + "epoch": 1.3171296296296298, + "grad_norm": 0.6916186809539795, + "learning_rate": 0.00015125936923325153, + "loss": 0.8871, + "step": 7398 + }, + { + "epoch": 1.3173076923076923, + "grad_norm": 0.7189087271690369, + "learning_rate": 0.0001512473500634277, + "loss": 0.8302, + "step": 7399 + }, + { + "epoch": 1.317485754985755, + "grad_norm": 0.5739200115203857, + "learning_rate": 0.00015123532988951853, + "loss": 0.9137, + "step": 7400 + }, + { + "epoch": 1.3176638176638176, + "grad_norm": 0.7661057114601135, + "learning_rate": 0.00015122330871175952, + "loss": 1.1255, + "step": 7401 + }, + { + "epoch": 1.3178418803418803, + "grad_norm": 0.6487592458724976, + "learning_rate": 0.00015121128653038617, + "loss": 1.0519, + "step": 7402 + }, + { + "epoch": 1.318019943019943, + "grad_norm": 0.693134605884552, + "learning_rate": 0.00015119926334563406, + "loss": 0.9585, + "step": 7403 + }, + { + "epoch": 1.3181980056980058, + "grad_norm": 0.5895997285842896, + "learning_rate": 0.0001511872391577387, + "loss": 0.8033, + 
"step": 7404 + }, + { + "epoch": 1.3183760683760684, + "grad_norm": 0.654876172542572, + "learning_rate": 0.00015117521396693575, + "loss": 1.0082, + "step": 7405 + }, + { + "epoch": 1.318554131054131, + "grad_norm": 0.5877239108085632, + "learning_rate": 0.0001511631877734608, + "loss": 1.0147, + "step": 7406 + }, + { + "epoch": 1.3187321937321936, + "grad_norm": 0.6109837889671326, + "learning_rate": 0.00015115116057754944, + "loss": 0.7498, + "step": 7407 + }, + { + "epoch": 1.3189102564102564, + "grad_norm": 0.643856942653656, + "learning_rate": 0.00015113913237943736, + "loss": 1.0417, + "step": 7408 + }, + { + "epoch": 1.3190883190883191, + "grad_norm": 0.654077410697937, + "learning_rate": 0.00015112710317936022, + "loss": 1.1809, + "step": 7409 + }, + { + "epoch": 1.319266381766382, + "grad_norm": 0.6785375475883484, + "learning_rate": 0.00015111507297755367, + "loss": 0.9447, + "step": 7410 + }, + { + "epoch": 1.3194444444444444, + "grad_norm": 0.6513382196426392, + "learning_rate": 0.00015110304177425347, + "loss": 0.8286, + "step": 7411 + }, + { + "epoch": 1.3196225071225072, + "grad_norm": 0.6536405682563782, + "learning_rate": 0.00015109100956969533, + "loss": 1.1959, + "step": 7412 + }, + { + "epoch": 1.3198005698005697, + "grad_norm": 0.6633172035217285, + "learning_rate": 0.00015107897636411498, + "loss": 0.8839, + "step": 7413 + }, + { + "epoch": 1.3199786324786325, + "grad_norm": 0.5773791670799255, + "learning_rate": 0.00015106694215774821, + "loss": 0.9785, + "step": 7414 + }, + { + "epoch": 1.3201566951566952, + "grad_norm": 0.7005468010902405, + "learning_rate": 0.00015105490695083078, + "loss": 1.0752, + "step": 7415 + }, + { + "epoch": 1.320334757834758, + "grad_norm": 0.6509538888931274, + "learning_rate": 0.0001510428707435985, + "loss": 0.9886, + "step": 7416 + }, + { + "epoch": 1.3205128205128205, + "grad_norm": 0.6607788801193237, + "learning_rate": 0.0001510308335362872, + "loss": 0.9756, + "step": 7417 + }, + { + "epoch": 
1.3206908831908832, + "grad_norm": 0.5977858304977417, + "learning_rate": 0.00015101879532913274, + "loss": 1.0574, + "step": 7418 + }, + { + "epoch": 1.3208689458689458, + "grad_norm": 0.6478607058525085, + "learning_rate": 0.00015100675612237096, + "loss": 1.0076, + "step": 7419 + }, + { + "epoch": 1.3210470085470085, + "grad_norm": 0.6386681199073792, + "learning_rate": 0.00015099471591623775, + "loss": 0.9639, + "step": 7420 + }, + { + "epoch": 1.3212250712250713, + "grad_norm": 0.6348143815994263, + "learning_rate": 0.000150982674710969, + "loss": 1.0226, + "step": 7421 + }, + { + "epoch": 1.321403133903134, + "grad_norm": 0.6737388372421265, + "learning_rate": 0.00015097063250680068, + "loss": 0.9985, + "step": 7422 + }, + { + "epoch": 1.3215811965811965, + "grad_norm": 0.7302656769752502, + "learning_rate": 0.00015095858930396866, + "loss": 0.9969, + "step": 7423 + }, + { + "epoch": 1.3217592592592593, + "grad_norm": 0.7062691450119019, + "learning_rate": 0.00015094654510270898, + "loss": 0.9137, + "step": 7424 + }, + { + "epoch": 1.3219373219373218, + "grad_norm": 0.6289888620376587, + "learning_rate": 0.00015093449990325754, + "loss": 0.9231, + "step": 7425 + }, + { + "epoch": 1.3221153846153846, + "grad_norm": 0.643284261226654, + "learning_rate": 0.0001509224537058504, + "loss": 0.8981, + "step": 7426 + }, + { + "epoch": 1.3222934472934473, + "grad_norm": 0.7019244432449341, + "learning_rate": 0.00015091040651072355, + "loss": 0.9994, + "step": 7427 + }, + { + "epoch": 1.32247150997151, + "grad_norm": 0.5982088446617126, + "learning_rate": 0.0001508983583181131, + "loss": 0.9365, + "step": 7428 + }, + { + "epoch": 1.3226495726495726, + "grad_norm": 0.6086063385009766, + "learning_rate": 0.00015088630912825498, + "loss": 0.8621, + "step": 7429 + }, + { + "epoch": 1.3228276353276354, + "grad_norm": 0.6829213500022888, + "learning_rate": 0.00015087425894138535, + "loss": 1.1959, + "step": 7430 + }, + { + "epoch": 1.323005698005698, + "grad_norm": 
0.6538017392158508, + "learning_rate": 0.00015086220775774033, + "loss": 0.9412, + "step": 7431 + }, + { + "epoch": 1.3231837606837606, + "grad_norm": 0.6334070563316345, + "learning_rate": 0.00015085015557755597, + "loss": 0.9044, + "step": 7432 + }, + { + "epoch": 1.3233618233618234, + "grad_norm": 0.6514624357223511, + "learning_rate": 0.00015083810240106845, + "loss": 0.8859, + "step": 7433 + }, + { + "epoch": 1.3235398860398861, + "grad_norm": 0.7130434513092041, + "learning_rate": 0.00015082604822851397, + "loss": 1.2845, + "step": 7434 + }, + { + "epoch": 1.3237179487179487, + "grad_norm": 0.609419584274292, + "learning_rate": 0.00015081399306012862, + "loss": 1.0725, + "step": 7435 + }, + { + "epoch": 1.3238960113960114, + "grad_norm": 0.586807370185852, + "learning_rate": 0.0001508019368961486, + "loss": 0.9032, + "step": 7436 + }, + { + "epoch": 1.324074074074074, + "grad_norm": 0.6937291026115417, + "learning_rate": 0.0001507898797368102, + "loss": 0.7975, + "step": 7437 + }, + { + "epoch": 1.3242521367521367, + "grad_norm": 0.6804966330528259, + "learning_rate": 0.00015077782158234962, + "loss": 1.1018, + "step": 7438 + }, + { + "epoch": 1.3244301994301995, + "grad_norm": 0.6110677123069763, + "learning_rate": 0.0001507657624330031, + "loss": 0.7988, + "step": 7439 + }, + { + "epoch": 1.3246082621082622, + "grad_norm": 0.6340961456298828, + "learning_rate": 0.0001507537022890069, + "loss": 0.844, + "step": 7440 + }, + { + "epoch": 1.3247863247863247, + "grad_norm": 0.7291021943092346, + "learning_rate": 0.00015074164115059735, + "loss": 0.9867, + "step": 7441 + }, + { + "epoch": 1.3249643874643875, + "grad_norm": 0.6818505525588989, + "learning_rate": 0.00015072957901801076, + "loss": 1.1541, + "step": 7442 + }, + { + "epoch": 1.32514245014245, + "grad_norm": 0.6174707412719727, + "learning_rate": 0.00015071751589148345, + "loss": 1.1679, + "step": 7443 + }, + { + "epoch": 1.3253205128205128, + "grad_norm": 0.6481367945671082, + "learning_rate": 
0.00015070545177125176, + "loss": 1.0955, + "step": 7444 + }, + { + "epoch": 1.3254985754985755, + "grad_norm": 0.6752339005470276, + "learning_rate": 0.00015069338665755203, + "loss": 0.8651, + "step": 7445 + }, + { + "epoch": 1.3256766381766383, + "grad_norm": 0.6608055830001831, + "learning_rate": 0.00015068132055062077, + "loss": 0.9553, + "step": 7446 + }, + { + "epoch": 1.3258547008547008, + "grad_norm": 0.5933246612548828, + "learning_rate": 0.00015066925345069425, + "loss": 0.8584, + "step": 7447 + }, + { + "epoch": 1.3260327635327636, + "grad_norm": 0.6301844716072083, + "learning_rate": 0.000150657185358009, + "loss": 0.8583, + "step": 7448 + }, + { + "epoch": 1.326210826210826, + "grad_norm": 0.7359434962272644, + "learning_rate": 0.00015064511627280145, + "loss": 1.0905, + "step": 7449 + }, + { + "epoch": 1.3263888888888888, + "grad_norm": 0.6334579586982727, + "learning_rate": 0.00015063304619530806, + "loss": 0.9814, + "step": 7450 + }, + { + "epoch": 1.3265669515669516, + "grad_norm": 0.6974197626113892, + "learning_rate": 0.00015062097512576528, + "loss": 0.9302, + "step": 7451 + }, + { + "epoch": 1.3267450142450143, + "grad_norm": 0.6895849704742432, + "learning_rate": 0.00015060890306440965, + "loss": 1.0175, + "step": 7452 + }, + { + "epoch": 1.3269230769230769, + "grad_norm": 0.5938003659248352, + "learning_rate": 0.00015059683001147767, + "loss": 0.8084, + "step": 7453 + }, + { + "epoch": 1.3271011396011396, + "grad_norm": 0.6821470856666565, + "learning_rate": 0.00015058475596720596, + "loss": 0.9897, + "step": 7454 + }, + { + "epoch": 1.3272792022792022, + "grad_norm": 0.5507164001464844, + "learning_rate": 0.00015057268093183104, + "loss": 0.7012, + "step": 7455 + }, + { + "epoch": 1.327457264957265, + "grad_norm": 0.6216199398040771, + "learning_rate": 0.00015056060490558945, + "loss": 1.0281, + "step": 7456 + }, + { + "epoch": 1.3276353276353277, + "grad_norm": 0.6674157977104187, + "learning_rate": 0.00015054852788871787, + "loss": 
0.8776, + "step": 7457 + }, + { + "epoch": 1.3278133903133904, + "grad_norm": 0.666963517665863, + "learning_rate": 0.0001505364498814529, + "loss": 1.0742, + "step": 7458 + }, + { + "epoch": 1.327991452991453, + "grad_norm": 0.6205331683158875, + "learning_rate": 0.00015052437088403114, + "loss": 1.1109, + "step": 7459 + }, + { + "epoch": 1.3281695156695157, + "grad_norm": 0.6402750611305237, + "learning_rate": 0.00015051229089668933, + "loss": 1.0648, + "step": 7460 + }, + { + "epoch": 1.3283475783475782, + "grad_norm": 0.7445703744888306, + "learning_rate": 0.00015050020991966406, + "loss": 0.8989, + "step": 7461 + }, + { + "epoch": 1.328525641025641, + "grad_norm": 0.8131299614906311, + "learning_rate": 0.00015048812795319212, + "loss": 0.9552, + "step": 7462 + }, + { + "epoch": 1.3287037037037037, + "grad_norm": 0.7007313966751099, + "learning_rate": 0.00015047604499751017, + "loss": 0.9899, + "step": 7463 + }, + { + "epoch": 1.3288817663817665, + "grad_norm": 0.60536789894104, + "learning_rate": 0.000150463961052855, + "loss": 0.7694, + "step": 7464 + }, + { + "epoch": 1.329059829059829, + "grad_norm": 0.6910434365272522, + "learning_rate": 0.00015045187611946331, + "loss": 0.9575, + "step": 7465 + }, + { + "epoch": 1.3292378917378918, + "grad_norm": 0.7693352103233337, + "learning_rate": 0.00015043979019757194, + "loss": 1.1987, + "step": 7466 + }, + { + "epoch": 1.3294159544159543, + "grad_norm": 0.6675218939781189, + "learning_rate": 0.00015042770328741763, + "loss": 1.0099, + "step": 7467 + }, + { + "epoch": 1.329594017094017, + "grad_norm": 0.8040883541107178, + "learning_rate": 0.00015041561538923722, + "loss": 0.9493, + "step": 7468 + }, + { + "epoch": 1.3297720797720798, + "grad_norm": 0.6765826344490051, + "learning_rate": 0.00015040352650326762, + "loss": 1.1035, + "step": 7469 + }, + { + "epoch": 1.3299501424501425, + "grad_norm": 0.7099924087524414, + "learning_rate": 0.0001503914366297456, + "loss": 0.9198, + "step": 7470 + }, + { + "epoch": 
1.330128205128205, + "grad_norm": 0.6673682928085327, + "learning_rate": 0.00015037934576890804, + "loss": 1.0234, + "step": 7471 + }, + { + "epoch": 1.3303062678062678, + "grad_norm": 0.7022300958633423, + "learning_rate": 0.00015036725392099184, + "loss": 1.3875, + "step": 7472 + }, + { + "epoch": 1.3304843304843303, + "grad_norm": 0.6997060179710388, + "learning_rate": 0.00015035516108623394, + "loss": 0.8114, + "step": 7473 + }, + { + "epoch": 1.330662393162393, + "grad_norm": 0.6262350678443909, + "learning_rate": 0.00015034306726487127, + "loss": 1.128, + "step": 7474 + }, + { + "epoch": 1.3308404558404558, + "grad_norm": 0.6330382227897644, + "learning_rate": 0.00015033097245714078, + "loss": 0.9032, + "step": 7475 + }, + { + "epoch": 1.3310185185185186, + "grad_norm": 0.6527551412582397, + "learning_rate": 0.00015031887666327944, + "loss": 0.9311, + "step": 7476 + }, + { + "epoch": 1.3311965811965811, + "grad_norm": 0.6754798889160156, + "learning_rate": 0.00015030677988352422, + "loss": 1.0626, + "step": 7477 + }, + { + "epoch": 1.3313746438746439, + "grad_norm": 0.6397945284843445, + "learning_rate": 0.00015029468211811216, + "loss": 0.9222, + "step": 7478 + }, + { + "epoch": 1.3315527065527066, + "grad_norm": 0.8163481950759888, + "learning_rate": 0.0001502825833672803, + "loss": 1.1827, + "step": 7479 + }, + { + "epoch": 1.3317307692307692, + "grad_norm": 0.6645621657371521, + "learning_rate": 0.00015027048363126566, + "loss": 0.9744, + "step": 7480 + }, + { + "epoch": 1.331908831908832, + "grad_norm": 0.6943182349205017, + "learning_rate": 0.0001502583829103053, + "loss": 1.1597, + "step": 7481 + }, + { + "epoch": 1.3320868945868947, + "grad_norm": 0.6283710598945618, + "learning_rate": 0.00015024628120463636, + "loss": 0.9514, + "step": 7482 + }, + { + "epoch": 1.3322649572649572, + "grad_norm": 0.6159678101539612, + "learning_rate": 0.0001502341785144959, + "loss": 0.9752, + "step": 7483 + }, + { + "epoch": 1.33244301994302, + "grad_norm": 
0.6259802579879761, + "learning_rate": 0.00015022207484012107, + "loss": 0.9356, + "step": 7484 + }, + { + "epoch": 1.3326210826210827, + "grad_norm": 0.7322365641593933, + "learning_rate": 0.00015020997018174904, + "loss": 1.2072, + "step": 7485 + }, + { + "epoch": 1.3327991452991452, + "grad_norm": 0.6323443651199341, + "learning_rate": 0.0001501978645396169, + "loss": 1.1661, + "step": 7486 + }, + { + "epoch": 1.332977207977208, + "grad_norm": 0.7811527848243713, + "learning_rate": 0.00015018575791396187, + "loss": 1.0304, + "step": 7487 + }, + { + "epoch": 1.3331552706552707, + "grad_norm": 0.7221232056617737, + "learning_rate": 0.0001501736503050212, + "loss": 0.8838, + "step": 7488 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.6980099081993103, + "learning_rate": 0.00015016154171303207, + "loss": 1.1841, + "step": 7489 + }, + { + "epoch": 1.333511396011396, + "grad_norm": 0.6802879571914673, + "learning_rate": 0.00015014943213823175, + "loss": 0.959, + "step": 7490 + }, + { + "epoch": 1.3336894586894588, + "grad_norm": 0.637698233127594, + "learning_rate": 0.00015013732158085746, + "loss": 1.0517, + "step": 7491 + }, + { + "epoch": 1.3338675213675213, + "grad_norm": 0.6386787295341492, + "learning_rate": 0.0001501252100411465, + "loss": 0.7125, + "step": 7492 + }, + { + "epoch": 1.334045584045584, + "grad_norm": 0.6287358403205872, + "learning_rate": 0.0001501130975193362, + "loss": 0.8913, + "step": 7493 + }, + { + "epoch": 1.3342236467236468, + "grad_norm": 0.6142337322235107, + "learning_rate": 0.00015010098401566386, + "loss": 0.8149, + "step": 7494 + }, + { + "epoch": 1.3344017094017093, + "grad_norm": 0.6369916200637817, + "learning_rate": 0.0001500888695303668, + "loss": 1.0186, + "step": 7495 + }, + { + "epoch": 1.334579772079772, + "grad_norm": 0.7526934146881104, + "learning_rate": 0.0001500767540636824, + "loss": 1.2421, + "step": 7496 + }, + { + "epoch": 1.3347578347578348, + "grad_norm": 0.7278095483779907, + "learning_rate": 
0.00015006463761584802, + "loss": 0.9856, + "step": 7497 + }, + { + "epoch": 1.3349358974358974, + "grad_norm": 0.6165127158164978, + "learning_rate": 0.00015005252018710104, + "loss": 1.0041, + "step": 7498 + }, + { + "epoch": 1.33511396011396, + "grad_norm": 0.637856662273407, + "learning_rate": 0.00015004040177767896, + "loss": 0.9134, + "step": 7499 + }, + { + "epoch": 1.3352920227920229, + "grad_norm": 0.661227285861969, + "learning_rate": 0.00015002828238781912, + "loss": 1.0393, + "step": 7500 + }, + { + "epoch": 1.3354700854700854, + "grad_norm": 0.6061869859695435, + "learning_rate": 0.000150016162017759, + "loss": 0.8453, + "step": 7501 + }, + { + "epoch": 1.3356481481481481, + "grad_norm": 0.6938419938087463, + "learning_rate": 0.0001500040406677361, + "loss": 1.0338, + "step": 7502 + }, + { + "epoch": 1.335826210826211, + "grad_norm": 0.6672863960266113, + "learning_rate": 0.0001499919183379879, + "loss": 0.8765, + "step": 7503 + }, + { + "epoch": 1.3360042735042734, + "grad_norm": 0.6200515031814575, + "learning_rate": 0.00014997979502875193, + "loss": 0.8286, + "step": 7504 + }, + { + "epoch": 1.3361823361823362, + "grad_norm": 0.6287549138069153, + "learning_rate": 0.00014996767074026567, + "loss": 0.9761, + "step": 7505 + }, + { + "epoch": 1.336360398860399, + "grad_norm": 0.6036837100982666, + "learning_rate": 0.0001499555454727667, + "loss": 1.0506, + "step": 7506 + }, + { + "epoch": 1.3365384615384617, + "grad_norm": 0.6875260472297668, + "learning_rate": 0.0001499434192264926, + "loss": 1.001, + "step": 7507 + }, + { + "epoch": 1.3367165242165242, + "grad_norm": 0.6558469533920288, + "learning_rate": 0.00014993129200168096, + "loss": 0.6874, + "step": 7508 + }, + { + "epoch": 1.336894586894587, + "grad_norm": 0.604167103767395, + "learning_rate": 0.00014991916379856934, + "loss": 1.0173, + "step": 7509 + }, + { + "epoch": 1.3370726495726495, + "grad_norm": 0.5941442251205444, + "learning_rate": 0.00014990703461739544, + "loss": 0.8569, + "step": 
7510 + }, + { + "epoch": 1.3372507122507122, + "grad_norm": 0.7645071148872375, + "learning_rate": 0.00014989490445839687, + "loss": 1.0172, + "step": 7511 + }, + { + "epoch": 1.337428774928775, + "grad_norm": 0.5491678714752197, + "learning_rate": 0.00014988277332181126, + "loss": 0.8018, + "step": 7512 + }, + { + "epoch": 1.3376068376068377, + "grad_norm": 0.583322286605835, + "learning_rate": 0.00014987064120787635, + "loss": 0.8704, + "step": 7513 + }, + { + "epoch": 1.3377849002849003, + "grad_norm": 0.7385724186897278, + "learning_rate": 0.00014985850811682984, + "loss": 1.1121, + "step": 7514 + }, + { + "epoch": 1.337962962962963, + "grad_norm": 0.6842585206031799, + "learning_rate": 0.00014984637404890941, + "loss": 0.914, + "step": 7515 + }, + { + "epoch": 1.3381410256410255, + "grad_norm": 0.6771186590194702, + "learning_rate": 0.00014983423900435285, + "loss": 1.0838, + "step": 7516 + }, + { + "epoch": 1.3383190883190883, + "grad_norm": 0.7562049031257629, + "learning_rate": 0.00014982210298339788, + "loss": 1.123, + "step": 7517 + }, + { + "epoch": 1.338497150997151, + "grad_norm": 0.7617804408073425, + "learning_rate": 0.0001498099659862823, + "loss": 0.9438, + "step": 7518 + }, + { + "epoch": 1.3386752136752138, + "grad_norm": 0.561958909034729, + "learning_rate": 0.00014979782801324392, + "loss": 0.8739, + "step": 7519 + }, + { + "epoch": 1.3388532763532763, + "grad_norm": 0.7726154923439026, + "learning_rate": 0.00014978568906452052, + "loss": 1.1306, + "step": 7520 + }, + { + "epoch": 1.339031339031339, + "grad_norm": 0.6658660173416138, + "learning_rate": 0.00014977354914035002, + "loss": 1.0214, + "step": 7521 + }, + { + "epoch": 1.3392094017094016, + "grad_norm": 0.6385402679443359, + "learning_rate": 0.00014976140824097015, + "loss": 0.8851, + "step": 7522 + }, + { + "epoch": 1.3393874643874644, + "grad_norm": 0.6315767168998718, + "learning_rate": 0.0001497492663666189, + "loss": 0.986, + "step": 7523 + }, + { + "epoch": 1.3395655270655271, + 
"grad_norm": 0.6379088759422302, + "learning_rate": 0.0001497371235175341, + "loss": 0.9322, + "step": 7524 + }, + { + "epoch": 1.3397435897435899, + "grad_norm": 0.6605859994888306, + "learning_rate": 0.0001497249796939537, + "loss": 1.1112, + "step": 7525 + }, + { + "epoch": 1.3399216524216524, + "grad_norm": 0.7342822551727295, + "learning_rate": 0.0001497128348961156, + "loss": 0.9798, + "step": 7526 + }, + { + "epoch": 1.3400997150997151, + "grad_norm": 0.5667192935943604, + "learning_rate": 0.0001497006891242578, + "loss": 0.7493, + "step": 7527 + }, + { + "epoch": 1.3402777777777777, + "grad_norm": 0.6106827855110168, + "learning_rate": 0.0001496885423786182, + "loss": 1.0924, + "step": 7528 + }, + { + "epoch": 1.3404558404558404, + "grad_norm": 0.6207202076911926, + "learning_rate": 0.00014967639465943486, + "loss": 1.1123, + "step": 7529 + }, + { + "epoch": 1.3406339031339032, + "grad_norm": 0.6272760033607483, + "learning_rate": 0.00014966424596694574, + "loss": 0.9275, + "step": 7530 + }, + { + "epoch": 1.340811965811966, + "grad_norm": 0.6485986113548279, + "learning_rate": 0.0001496520963013889, + "loss": 1.1491, + "step": 7531 + }, + { + "epoch": 1.3409900284900285, + "grad_norm": 0.5743561387062073, + "learning_rate": 0.00014963994566300238, + "loss": 1.1101, + "step": 7532 + }, + { + "epoch": 1.3411680911680912, + "grad_norm": 0.6508657336235046, + "learning_rate": 0.00014962779405202424, + "loss": 1.0368, + "step": 7533 + }, + { + "epoch": 1.3413461538461537, + "grad_norm": 0.6598748564720154, + "learning_rate": 0.00014961564146869259, + "loss": 1.1064, + "step": 7534 + }, + { + "epoch": 1.3415242165242165, + "grad_norm": 0.6722840070724487, + "learning_rate": 0.00014960348791324547, + "loss": 0.9758, + "step": 7535 + }, + { + "epoch": 1.3417022792022792, + "grad_norm": 0.5807220935821533, + "learning_rate": 0.00014959133338592108, + "loss": 0.9936, + "step": 7536 + }, + { + "epoch": 1.341880341880342, + "grad_norm": 0.6318647265434265, + 
"learning_rate": 0.00014957917788695752, + "loss": 0.907, + "step": 7537 + }, + { + "epoch": 1.3420584045584045, + "grad_norm": 0.6725485324859619, + "learning_rate": 0.00014956702141659295, + "loss": 0.988, + "step": 7538 + }, + { + "epoch": 1.3422364672364673, + "grad_norm": 0.6675217747688293, + "learning_rate": 0.0001495548639750656, + "loss": 1.0194, + "step": 7539 + }, + { + "epoch": 1.3424145299145298, + "grad_norm": 0.6976884603500366, + "learning_rate": 0.0001495427055626136, + "loss": 1.2515, + "step": 7540 + }, + { + "epoch": 1.3425925925925926, + "grad_norm": 0.654941737651825, + "learning_rate": 0.0001495305461794752, + "loss": 1.2072, + "step": 7541 + }, + { + "epoch": 1.3427706552706553, + "grad_norm": 0.7085291743278503, + "learning_rate": 0.00014951838582588864, + "loss": 0.9772, + "step": 7542 + }, + { + "epoch": 1.342948717948718, + "grad_norm": 0.6319566965103149, + "learning_rate": 0.00014950622450209217, + "loss": 1.0162, + "step": 7543 + }, + { + "epoch": 1.3431267806267806, + "grad_norm": 0.6272495985031128, + "learning_rate": 0.00014949406220832407, + "loss": 0.7985, + "step": 7544 + }, + { + "epoch": 1.3433048433048433, + "grad_norm": 0.6352069973945618, + "learning_rate": 0.00014948189894482266, + "loss": 1.0041, + "step": 7545 + }, + { + "epoch": 1.3434829059829059, + "grad_norm": 0.6071867346763611, + "learning_rate": 0.0001494697347118262, + "loss": 0.9486, + "step": 7546 + }, + { + "epoch": 1.3436609686609686, + "grad_norm": 0.6458829641342163, + "learning_rate": 0.00014945756950957308, + "loss": 0.9417, + "step": 7547 + }, + { + "epoch": 1.3438390313390314, + "grad_norm": 0.6472262740135193, + "learning_rate": 0.0001494454033383016, + "loss": 1.056, + "step": 7548 + }, + { + "epoch": 1.3440170940170941, + "grad_norm": 0.6985635161399841, + "learning_rate": 0.00014943323619825017, + "loss": 1.0483, + "step": 7549 + }, + { + "epoch": 1.3441951566951567, + "grad_norm": 0.6379460096359253, + "learning_rate": 0.00014942106808965718, + 
"loss": 0.9552, + "step": 7550 + }, + { + "epoch": 1.3443732193732194, + "grad_norm": 0.7036557793617249, + "learning_rate": 0.00014940889901276098, + "loss": 0.9647, + "step": 7551 + }, + { + "epoch": 1.344551282051282, + "grad_norm": 0.6697289943695068, + "learning_rate": 0.0001493967289678001, + "loss": 0.9029, + "step": 7552 + }, + { + "epoch": 1.3447293447293447, + "grad_norm": 0.6336250901222229, + "learning_rate": 0.00014938455795501286, + "loss": 0.9458, + "step": 7553 + }, + { + "epoch": 1.3449074074074074, + "grad_norm": 0.7279673218727112, + "learning_rate": 0.00014937238597463785, + "loss": 1.0228, + "step": 7554 + }, + { + "epoch": 1.3450854700854702, + "grad_norm": 0.6514406204223633, + "learning_rate": 0.00014936021302691349, + "loss": 0.8265, + "step": 7555 + }, + { + "epoch": 1.3452635327635327, + "grad_norm": 0.6405338644981384, + "learning_rate": 0.0001493480391120783, + "loss": 0.9516, + "step": 7556 + }, + { + "epoch": 1.3454415954415955, + "grad_norm": 0.6442672610282898, + "learning_rate": 0.00014933586423037076, + "loss": 0.9279, + "step": 7557 + }, + { + "epoch": 1.345619658119658, + "grad_norm": 0.7588633894920349, + "learning_rate": 0.00014932368838202945, + "loss": 1.0976, + "step": 7558 + }, + { + "epoch": 1.3457977207977208, + "grad_norm": 0.5536739230155945, + "learning_rate": 0.00014931151156729296, + "loss": 0.713, + "step": 7559 + }, + { + "epoch": 1.3459757834757835, + "grad_norm": 0.6897570490837097, + "learning_rate": 0.00014929933378639981, + "loss": 0.9521, + "step": 7560 + }, + { + "epoch": 1.3461538461538463, + "grad_norm": 0.6654927134513855, + "learning_rate": 0.00014928715503958863, + "loss": 0.8506, + "step": 7561 + }, + { + "epoch": 1.3463319088319088, + "grad_norm": 0.655806839466095, + "learning_rate": 0.00014927497532709808, + "loss": 0.8636, + "step": 7562 + }, + { + "epoch": 1.3465099715099715, + "grad_norm": 0.6547064185142517, + "learning_rate": 0.00014926279464916667, + "loss": 0.9155, + "step": 7563 + }, + { + 
"epoch": 1.346688034188034, + "grad_norm": 0.7555415034294128, + "learning_rate": 0.00014925061300603316, + "loss": 0.8791, + "step": 7564 + }, + { + "epoch": 1.3468660968660968, + "grad_norm": 0.7439392805099487, + "learning_rate": 0.0001492384303979362, + "loss": 1.1669, + "step": 7565 + }, + { + "epoch": 1.3470441595441596, + "grad_norm": 0.6016925573348999, + "learning_rate": 0.0001492262468251145, + "loss": 0.9811, + "step": 7566 + }, + { + "epoch": 1.3472222222222223, + "grad_norm": 0.644652783870697, + "learning_rate": 0.00014921406228780675, + "loss": 0.7096, + "step": 7567 + }, + { + "epoch": 1.3474002849002849, + "grad_norm": 0.721814751625061, + "learning_rate": 0.00014920187678625166, + "loss": 0.9933, + "step": 7568 + }, + { + "epoch": 1.3475783475783476, + "grad_norm": 0.6212092638015747, + "learning_rate": 0.000149189690320688, + "loss": 0.8499, + "step": 7569 + }, + { + "epoch": 1.3477564102564101, + "grad_norm": 0.6235958337783813, + "learning_rate": 0.00014917750289135455, + "loss": 0.9189, + "step": 7570 + }, + { + "epoch": 1.3479344729344729, + "grad_norm": 0.6309674978256226, + "learning_rate": 0.0001491653144984901, + "loss": 0.9744, + "step": 7571 + }, + { + "epoch": 1.3481125356125356, + "grad_norm": 0.7606496214866638, + "learning_rate": 0.00014915312514233344, + "loss": 1.0181, + "step": 7572 + }, + { + "epoch": 1.3482905982905984, + "grad_norm": 0.6892654895782471, + "learning_rate": 0.00014914093482312342, + "loss": 0.9517, + "step": 7573 + }, + { + "epoch": 1.348468660968661, + "grad_norm": 0.6746503114700317, + "learning_rate": 0.0001491287435410988, + "loss": 1.056, + "step": 7574 + }, + { + "epoch": 1.3486467236467237, + "grad_norm": 0.5892919301986694, + "learning_rate": 0.00014911655129649858, + "loss": 1.0515, + "step": 7575 + }, + { + "epoch": 1.3488247863247862, + "grad_norm": 0.6278096437454224, + "learning_rate": 0.0001491043580895615, + "loss": 0.864, + "step": 7576 + }, + { + "epoch": 1.349002849002849, + "grad_norm": 
0.7017706632614136, + "learning_rate": 0.0001490921639205266, + "loss": 1.0618, + "step": 7577 + }, + { + "epoch": 1.3491809116809117, + "grad_norm": 0.7318746447563171, + "learning_rate": 0.00014907996878963268, + "loss": 0.9905, + "step": 7578 + }, + { + "epoch": 1.3493589743589745, + "grad_norm": 0.6485885977745056, + "learning_rate": 0.00014906777269711873, + "loss": 1.0498, + "step": 7579 + }, + { + "epoch": 1.349537037037037, + "grad_norm": 0.644902229309082, + "learning_rate": 0.00014905557564322372, + "loss": 0.885, + "step": 7580 + }, + { + "epoch": 1.3497150997150997, + "grad_norm": 0.6567610502243042, + "learning_rate": 0.0001490433776281866, + "loss": 0.8938, + "step": 7581 + }, + { + "epoch": 1.3498931623931623, + "grad_norm": 0.6233102679252625, + "learning_rate": 0.0001490311786522464, + "loss": 0.9007, + "step": 7582 + }, + { + "epoch": 1.350071225071225, + "grad_norm": 0.6962146759033203, + "learning_rate": 0.00014901897871564206, + "loss": 0.9257, + "step": 7583 + }, + { + "epoch": 1.3502492877492878, + "grad_norm": 0.6986933350563049, + "learning_rate": 0.00014900677781861266, + "loss": 1.0089, + "step": 7584 + }, + { + "epoch": 1.3504273504273505, + "grad_norm": 0.7527925968170166, + "learning_rate": 0.00014899457596139729, + "loss": 1.0762, + "step": 7585 + }, + { + "epoch": 1.350605413105413, + "grad_norm": 0.69191974401474, + "learning_rate": 0.00014898237314423494, + "loss": 0.9829, + "step": 7586 + }, + { + "epoch": 1.3507834757834758, + "grad_norm": 0.7866443395614624, + "learning_rate": 0.00014897016936736478, + "loss": 1.0911, + "step": 7587 + }, + { + "epoch": 1.3509615384615383, + "grad_norm": 0.7087522745132446, + "learning_rate": 0.00014895796463102587, + "loss": 1.0693, + "step": 7588 + }, + { + "epoch": 1.351139601139601, + "grad_norm": 0.704276442527771, + "learning_rate": 0.00014894575893545736, + "loss": 0.9082, + "step": 7589 + }, + { + "epoch": 1.3513176638176638, + "grad_norm": 0.7074487805366516, + "learning_rate": 
0.00014893355228089833, + "loss": 0.8731, + "step": 7590 + }, + { + "epoch": 1.3514957264957266, + "grad_norm": 0.6542425155639648, + "learning_rate": 0.00014892134466758803, + "loss": 0.9325, + "step": 7591 + }, + { + "epoch": 1.351673789173789, + "grad_norm": 0.6577230095863342, + "learning_rate": 0.0001489091360957656, + "loss": 0.8468, + "step": 7592 + }, + { + "epoch": 1.3518518518518519, + "grad_norm": 0.638534426689148, + "learning_rate": 0.00014889692656567025, + "loss": 0.8598, + "step": 7593 + }, + { + "epoch": 1.3520299145299146, + "grad_norm": 0.751133918762207, + "learning_rate": 0.0001488847160775412, + "loss": 1.0006, + "step": 7594 + }, + { + "epoch": 1.3522079772079771, + "grad_norm": 0.6272708773612976, + "learning_rate": 0.00014887250463161767, + "loss": 0.8782, + "step": 7595 + }, + { + "epoch": 1.35238603988604, + "grad_norm": 0.7242439985275269, + "learning_rate": 0.00014886029222813897, + "loss": 1.2443, + "step": 7596 + }, + { + "epoch": 1.3525641025641026, + "grad_norm": 0.6199275851249695, + "learning_rate": 0.0001488480788673443, + "loss": 0.9211, + "step": 7597 + }, + { + "epoch": 1.3527421652421652, + "grad_norm": 0.6401306986808777, + "learning_rate": 0.00014883586454947305, + "loss": 0.8808, + "step": 7598 + }, + { + "epoch": 1.352920227920228, + "grad_norm": 0.6340938806533813, + "learning_rate": 0.00014882364927476443, + "loss": 0.9406, + "step": 7599 + }, + { + "epoch": 1.3530982905982907, + "grad_norm": 0.6388604044914246, + "learning_rate": 0.00014881143304345783, + "loss": 1.0674, + "step": 7600 + }, + { + "epoch": 1.3532763532763532, + "grad_norm": 0.7562061548233032, + "learning_rate": 0.00014879921585579263, + "loss": 1.0959, + "step": 7601 + }, + { + "epoch": 1.353454415954416, + "grad_norm": 0.6303606033325195, + "learning_rate": 0.00014878699771200815, + "loss": 0.9641, + "step": 7602 + }, + { + "epoch": 1.3536324786324787, + "grad_norm": 0.8623232841491699, + "learning_rate": 0.00014877477861234382, + "loss": 1.1529, + 
"step": 7603 + }, + { + "epoch": 1.3538105413105412, + "grad_norm": 0.6607624888420105, + "learning_rate": 0.00014876255855703896, + "loss": 0.6291, + "step": 7604 + }, + { + "epoch": 1.353988603988604, + "grad_norm": 0.6226931214332581, + "learning_rate": 0.0001487503375463331, + "loss": 0.7485, + "step": 7605 + }, + { + "epoch": 1.3541666666666667, + "grad_norm": 0.7626705169677734, + "learning_rate": 0.00014873811558046565, + "loss": 0.9694, + "step": 7606 + }, + { + "epoch": 1.3543447293447293, + "grad_norm": 0.5436057448387146, + "learning_rate": 0.00014872589265967605, + "loss": 0.6173, + "step": 7607 + }, + { + "epoch": 1.354522792022792, + "grad_norm": 0.7822177410125732, + "learning_rate": 0.00014871366878420382, + "loss": 1.0048, + "step": 7608 + }, + { + "epoch": 1.3547008547008548, + "grad_norm": 0.6955201625823975, + "learning_rate": 0.00014870144395428848, + "loss": 0.9487, + "step": 7609 + }, + { + "epoch": 1.3548789173789173, + "grad_norm": 0.6625505685806274, + "learning_rate": 0.00014868921817016943, + "loss": 0.9389, + "step": 7610 + }, + { + "epoch": 1.35505698005698, + "grad_norm": 0.6625354886054993, + "learning_rate": 0.00014867699143208634, + "loss": 0.9538, + "step": 7611 + }, + { + "epoch": 1.3552350427350428, + "grad_norm": 0.7426592707633972, + "learning_rate": 0.00014866476374027874, + "loss": 1.2566, + "step": 7612 + }, + { + "epoch": 1.3554131054131053, + "grad_norm": 0.6856544017791748, + "learning_rate": 0.00014865253509498616, + "loss": 0.9663, + "step": 7613 + }, + { + "epoch": 1.355591168091168, + "grad_norm": 0.6343915462493896, + "learning_rate": 0.00014864030549644825, + "loss": 0.9416, + "step": 7614 + }, + { + "epoch": 1.3557692307692308, + "grad_norm": 0.6319553256034851, + "learning_rate": 0.00014862807494490454, + "loss": 0.9335, + "step": 7615 + }, + { + "epoch": 1.3559472934472934, + "grad_norm": 0.6919772624969482, + "learning_rate": 0.00014861584344059476, + "loss": 0.8516, + "step": 7616 + }, + { + "epoch": 
1.3561253561253561, + "grad_norm": 0.6405790448188782, + "learning_rate": 0.00014860361098375851, + "loss": 1.1278, + "step": 7617 + }, + { + "epoch": 1.3563034188034189, + "grad_norm": 0.7591732144355774, + "learning_rate": 0.00014859137757463548, + "loss": 1.0961, + "step": 7618 + }, + { + "epoch": 1.3564814814814814, + "grad_norm": 0.6166727542877197, + "learning_rate": 0.0001485791432134653, + "loss": 0.9358, + "step": 7619 + }, + { + "epoch": 1.3566595441595442, + "grad_norm": 0.7068707346916199, + "learning_rate": 0.00014856690790048777, + "loss": 0.8325, + "step": 7620 + }, + { + "epoch": 1.356837606837607, + "grad_norm": 0.8465402722358704, + "learning_rate": 0.00014855467163594257, + "loss": 1.0047, + "step": 7621 + }, + { + "epoch": 1.3570156695156697, + "grad_norm": 0.7403460741043091, + "learning_rate": 0.00014854243442006943, + "loss": 1.0907, + "step": 7622 + }, + { + "epoch": 1.3571937321937322, + "grad_norm": 0.6939566135406494, + "learning_rate": 0.00014853019625310813, + "loss": 0.9156, + "step": 7623 + }, + { + "epoch": 1.357371794871795, + "grad_norm": 0.6425924897193909, + "learning_rate": 0.0001485179571352984, + "loss": 0.8156, + "step": 7624 + }, + { + "epoch": 1.3575498575498575, + "grad_norm": 0.7091902494430542, + "learning_rate": 0.00014850571706688013, + "loss": 1.0483, + "step": 7625 + }, + { + "epoch": 1.3577279202279202, + "grad_norm": 0.663342297077179, + "learning_rate": 0.00014849347604809312, + "loss": 1.0405, + "step": 7626 + }, + { + "epoch": 1.357905982905983, + "grad_norm": 0.6727671027183533, + "learning_rate": 0.00014848123407917716, + "loss": 1.0389, + "step": 7627 + }, + { + "epoch": 1.3580840455840457, + "grad_norm": 0.6572692394256592, + "learning_rate": 0.0001484689911603721, + "loss": 1.0489, + "step": 7628 + }, + { + "epoch": 1.3582621082621082, + "grad_norm": 0.7629066109657288, + "learning_rate": 0.0001484567472919179, + "loss": 1.0372, + "step": 7629 + }, + { + "epoch": 1.358440170940171, + "grad_norm": 
0.7848913669586182, + "learning_rate": 0.00014844450247405435, + "loss": 0.9437, + "step": 7630 + }, + { + "epoch": 1.3586182336182335, + "grad_norm": 0.715949535369873, + "learning_rate": 0.00014843225670702143, + "loss": 1.1949, + "step": 7631 + }, + { + "epoch": 1.3587962962962963, + "grad_norm": 0.6498245596885681, + "learning_rate": 0.00014842000999105905, + "loss": 0.8845, + "step": 7632 + }, + { + "epoch": 1.358974358974359, + "grad_norm": 0.7251074910163879, + "learning_rate": 0.00014840776232640716, + "loss": 1.093, + "step": 7633 + }, + { + "epoch": 1.3591524216524218, + "grad_norm": 0.6223580837249756, + "learning_rate": 0.0001483955137133057, + "loss": 1.0344, + "step": 7634 + }, + { + "epoch": 1.3593304843304843, + "grad_norm": 0.6504943370819092, + "learning_rate": 0.00014838326415199472, + "loss": 1.109, + "step": 7635 + }, + { + "epoch": 1.359508547008547, + "grad_norm": 0.5912374258041382, + "learning_rate": 0.00014837101364271416, + "loss": 1.0756, + "step": 7636 + }, + { + "epoch": 1.3596866096866096, + "grad_norm": 0.6116467714309692, + "learning_rate": 0.00014835876218570408, + "loss": 0.7871, + "step": 7637 + }, + { + "epoch": 1.3598646723646723, + "grad_norm": 0.7013412117958069, + "learning_rate": 0.0001483465097812045, + "loss": 1.0003, + "step": 7638 + }, + { + "epoch": 1.360042735042735, + "grad_norm": 0.5930750370025635, + "learning_rate": 0.00014833425642945552, + "loss": 0.9926, + "step": 7639 + }, + { + "epoch": 1.3602207977207978, + "grad_norm": 0.732955276966095, + "learning_rate": 0.00014832200213069717, + "loss": 1.2801, + "step": 7640 + }, + { + "epoch": 1.3603988603988604, + "grad_norm": 0.6836149096488953, + "learning_rate": 0.00014830974688516958, + "loss": 0.9292, + "step": 7641 + }, + { + "epoch": 1.3605769230769231, + "grad_norm": 0.6531919836997986, + "learning_rate": 0.00014829749069311283, + "loss": 0.9551, + "step": 7642 + }, + { + "epoch": 1.3607549857549857, + "grad_norm": 0.719093382358551, + "learning_rate": 
0.0001482852335547671, + "loss": 0.8588, + "step": 7643 + }, + { + "epoch": 1.3609330484330484, + "grad_norm": 0.6144105792045593, + "learning_rate": 0.00014827297547037252, + "loss": 0.9033, + "step": 7644 + }, + { + "epoch": 1.3611111111111112, + "grad_norm": 0.789241373538971, + "learning_rate": 0.00014826071644016926, + "loss": 1.1916, + "step": 7645 + }, + { + "epoch": 1.361289173789174, + "grad_norm": 0.6137418746948242, + "learning_rate": 0.0001482484564643975, + "loss": 0.9648, + "step": 7646 + }, + { + "epoch": 1.3614672364672364, + "grad_norm": 0.6789261698722839, + "learning_rate": 0.00014823619554329745, + "loss": 0.829, + "step": 7647 + }, + { + "epoch": 1.3616452991452992, + "grad_norm": 0.6508790254592896, + "learning_rate": 0.0001482239336771094, + "loss": 0.942, + "step": 7648 + }, + { + "epoch": 1.3618233618233617, + "grad_norm": 0.6725571751594543, + "learning_rate": 0.00014821167086607353, + "loss": 0.8884, + "step": 7649 + }, + { + "epoch": 1.3620014245014245, + "grad_norm": 0.6252003908157349, + "learning_rate": 0.00014819940711043012, + "loss": 0.9778, + "step": 7650 + }, + { + "epoch": 1.3621794871794872, + "grad_norm": 0.6950626969337463, + "learning_rate": 0.00014818714241041943, + "loss": 1.2104, + "step": 7651 + }, + { + "epoch": 1.36235754985755, + "grad_norm": 0.6527379155158997, + "learning_rate": 0.0001481748767662818, + "loss": 0.7845, + "step": 7652 + }, + { + "epoch": 1.3625356125356125, + "grad_norm": 0.7438235282897949, + "learning_rate": 0.00014816261017825755, + "loss": 0.9513, + "step": 7653 + }, + { + "epoch": 1.3627136752136753, + "grad_norm": 0.6412696838378906, + "learning_rate": 0.000148150342646587, + "loss": 0.8478, + "step": 7654 + }, + { + "epoch": 1.3628917378917378, + "grad_norm": 0.658481240272522, + "learning_rate": 0.00014813807417151046, + "loss": 0.6816, + "step": 7655 + }, + { + "epoch": 1.3630698005698005, + "grad_norm": 0.6170126795768738, + "learning_rate": 0.0001481258047532684, + "loss": 0.8862, + 
"step": 7656 + }, + { + "epoch": 1.3632478632478633, + "grad_norm": 0.7049173712730408, + "learning_rate": 0.0001481135343921012, + "loss": 1.0027, + "step": 7657 + }, + { + "epoch": 1.363425925925926, + "grad_norm": 0.7780741453170776, + "learning_rate": 0.0001481012630882492, + "loss": 1.0183, + "step": 7658 + }, + { + "epoch": 1.3636039886039886, + "grad_norm": 0.6658362746238708, + "learning_rate": 0.00014808899084195286, + "loss": 0.878, + "step": 7659 + }, + { + "epoch": 1.3637820512820513, + "grad_norm": 0.7192076444625854, + "learning_rate": 0.00014807671765345267, + "loss": 1.2269, + "step": 7660 + }, + { + "epoch": 1.3639601139601139, + "grad_norm": 0.7038660049438477, + "learning_rate": 0.00014806444352298903, + "loss": 0.889, + "step": 7661 + }, + { + "epoch": 1.3641381766381766, + "grad_norm": 0.622803270816803, + "learning_rate": 0.00014805216845080249, + "loss": 0.9623, + "step": 7662 + }, + { + "epoch": 1.3643162393162394, + "grad_norm": 0.9157076478004456, + "learning_rate": 0.00014803989243713353, + "loss": 1.106, + "step": 7663 + }, + { + "epoch": 1.364494301994302, + "grad_norm": 0.6369999647140503, + "learning_rate": 0.00014802761548222268, + "loss": 0.9755, + "step": 7664 + }, + { + "epoch": 1.3646723646723646, + "grad_norm": 0.8318394422531128, + "learning_rate": 0.00014801533758631045, + "loss": 1.1786, + "step": 7665 + }, + { + "epoch": 1.3648504273504274, + "grad_norm": 0.7065796852111816, + "learning_rate": 0.00014800305874963744, + "loss": 1.2066, + "step": 7666 + }, + { + "epoch": 1.36502849002849, + "grad_norm": 0.6570265293121338, + "learning_rate": 0.0001479907789724442, + "loss": 1.0084, + "step": 7667 + }, + { + "epoch": 1.3652065527065527, + "grad_norm": 0.637321949005127, + "learning_rate": 0.00014797849825497135, + "loss": 0.9075, + "step": 7668 + }, + { + "epoch": 1.3653846153846154, + "grad_norm": 0.7656470537185669, + "learning_rate": 0.00014796621659745948, + "loss": 1.1497, + "step": 7669 + }, + { + "epoch": 
1.3655626780626782, + "grad_norm": 0.6798120737075806, + "learning_rate": 0.0001479539340001493, + "loss": 0.8154, + "step": 7670 + }, + { + "epoch": 1.3657407407407407, + "grad_norm": 0.7004328966140747, + "learning_rate": 0.0001479416504632813, + "loss": 1.0513, + "step": 7671 + }, + { + "epoch": 1.3659188034188035, + "grad_norm": 0.6551713943481445, + "learning_rate": 0.0001479293659870963, + "loss": 0.8735, + "step": 7672 + }, + { + "epoch": 1.366096866096866, + "grad_norm": 0.7685719132423401, + "learning_rate": 0.00014791708057183494, + "loss": 1.111, + "step": 7673 + }, + { + "epoch": 1.3662749287749287, + "grad_norm": 0.673624575138092, + "learning_rate": 0.0001479047942177379, + "loss": 0.9418, + "step": 7674 + }, + { + "epoch": 1.3664529914529915, + "grad_norm": 0.6281047463417053, + "learning_rate": 0.00014789250692504597, + "loss": 1.0938, + "step": 7675 + }, + { + "epoch": 1.3666310541310542, + "grad_norm": 0.5846312642097473, + "learning_rate": 0.0001478802186939998, + "loss": 0.6352, + "step": 7676 + }, + { + "epoch": 1.3668091168091168, + "grad_norm": 0.7037251591682434, + "learning_rate": 0.00014786792952484025, + "loss": 1.1775, + "step": 7677 + }, + { + "epoch": 1.3669871794871795, + "grad_norm": 0.69822758436203, + "learning_rate": 0.00014785563941780808, + "loss": 1.0877, + "step": 7678 + }, + { + "epoch": 1.367165242165242, + "grad_norm": 0.7229313254356384, + "learning_rate": 0.000147843348373144, + "loss": 1.0305, + "step": 7679 + }, + { + "epoch": 1.3673433048433048, + "grad_norm": 0.665771484375, + "learning_rate": 0.00014783105639108897, + "loss": 0.9056, + "step": 7680 + }, + { + "epoch": 1.3675213675213675, + "grad_norm": 0.6418357491493225, + "learning_rate": 0.00014781876347188367, + "loss": 0.9374, + "step": 7681 + }, + { + "epoch": 1.3676994301994303, + "grad_norm": 0.7255483269691467, + "learning_rate": 0.0001478064696157691, + "loss": 0.8533, + "step": 7682 + }, + { + "epoch": 1.3678774928774928, + "grad_norm": 0.668064534664154, 
+ "learning_rate": 0.00014779417482298603, + "loss": 0.9002, + "step": 7683 + }, + { + "epoch": 1.3680555555555556, + "grad_norm": 0.6797603368759155, + "learning_rate": 0.0001477818790937754, + "loss": 0.9733, + "step": 7684 + }, + { + "epoch": 1.368233618233618, + "grad_norm": 0.6905350685119629, + "learning_rate": 0.0001477695824283781, + "loss": 0.7985, + "step": 7685 + }, + { + "epoch": 1.3684116809116809, + "grad_norm": 0.6846137046813965, + "learning_rate": 0.00014775728482703507, + "loss": 0.9154, + "step": 7686 + }, + { + "epoch": 1.3685897435897436, + "grad_norm": 0.6686832904815674, + "learning_rate": 0.00014774498628998726, + "loss": 0.926, + "step": 7687 + }, + { + "epoch": 1.3687678062678064, + "grad_norm": 0.7050234079360962, + "learning_rate": 0.00014773268681747561, + "loss": 0.9386, + "step": 7688 + }, + { + "epoch": 1.368945868945869, + "grad_norm": 0.7048354744911194, + "learning_rate": 0.00014772038640974112, + "loss": 1.1483, + "step": 7689 + }, + { + "epoch": 1.3691239316239316, + "grad_norm": 0.698192298412323, + "learning_rate": 0.0001477080850670248, + "loss": 1.1452, + "step": 7690 + }, + { + "epoch": 1.3693019943019942, + "grad_norm": 0.6838962435722351, + "learning_rate": 0.00014769578278956766, + "loss": 0.9789, + "step": 7691 + }, + { + "epoch": 1.369480056980057, + "grad_norm": 0.6636955142021179, + "learning_rate": 0.00014768347957761074, + "loss": 0.931, + "step": 7692 + }, + { + "epoch": 1.3696581196581197, + "grad_norm": 0.706030547618866, + "learning_rate": 0.0001476711754313951, + "loss": 1.1096, + "step": 7693 + }, + { + "epoch": 1.3698361823361824, + "grad_norm": 0.6771288514137268, + "learning_rate": 0.00014765887035116178, + "loss": 0.9641, + "step": 7694 + }, + { + "epoch": 1.370014245014245, + "grad_norm": 0.6805008053779602, + "learning_rate": 0.00014764656433715188, + "loss": 0.8724, + "step": 7695 + }, + { + "epoch": 1.3701923076923077, + "grad_norm": 0.6599233746528625, + "learning_rate": 0.00014763425738960657, + 
"loss": 0.8477, + "step": 7696 + }, + { + "epoch": 1.3703703703703702, + "grad_norm": 0.7036116123199463, + "learning_rate": 0.0001476219495087669, + "loss": 1.0991, + "step": 7697 + }, + { + "epoch": 1.370548433048433, + "grad_norm": 0.6677989363670349, + "learning_rate": 0.0001476096406948741, + "loss": 1.2397, + "step": 7698 + }, + { + "epoch": 1.3707264957264957, + "grad_norm": 0.5652269721031189, + "learning_rate": 0.00014759733094816928, + "loss": 0.9302, + "step": 7699 + }, + { + "epoch": 1.3709045584045585, + "grad_norm": 0.6670156121253967, + "learning_rate": 0.00014758502026889362, + "loss": 0.8362, + "step": 7700 + }, + { + "epoch": 1.371082621082621, + "grad_norm": 0.6705406904220581, + "learning_rate": 0.00014757270865728832, + "loss": 0.876, + "step": 7701 + }, + { + "epoch": 1.3712606837606838, + "grad_norm": 0.6020053625106812, + "learning_rate": 0.00014756039611359465, + "loss": 0.9182, + "step": 7702 + }, + { + "epoch": 1.3714387464387463, + "grad_norm": 0.6370134949684143, + "learning_rate": 0.0001475480826380538, + "loss": 1.1063, + "step": 7703 + }, + { + "epoch": 1.371616809116809, + "grad_norm": 0.6906460523605347, + "learning_rate": 0.00014753576823090705, + "loss": 0.988, + "step": 7704 + }, + { + "epoch": 1.3717948717948718, + "grad_norm": 0.6047569513320923, + "learning_rate": 0.00014752345289239567, + "loss": 1.15, + "step": 7705 + }, + { + "epoch": 1.3719729344729346, + "grad_norm": 0.7019868493080139, + "learning_rate": 0.00014751113662276095, + "loss": 1.1185, + "step": 7706 + }, + { + "epoch": 1.372150997150997, + "grad_norm": 0.6534035801887512, + "learning_rate": 0.00014749881942224417, + "loss": 0.9006, + "step": 7707 + }, + { + "epoch": 1.3723290598290598, + "grad_norm": 0.6111651659011841, + "learning_rate": 0.00014748650129108674, + "loss": 0.935, + "step": 7708 + }, + { + "epoch": 1.3725071225071226, + "grad_norm": 0.6678512096405029, + "learning_rate": 0.00014747418222952995, + "loss": 0.8771, + "step": 7709 + }, + { + 
"epoch": 1.3726851851851851, + "grad_norm": 0.607829749584198, + "learning_rate": 0.00014746186223781518, + "loss": 1.0509, + "step": 7710 + }, + { + "epoch": 1.3728632478632479, + "grad_norm": 0.7274412512779236, + "learning_rate": 0.00014744954131618382, + "loss": 0.9545, + "step": 7711 + }, + { + "epoch": 1.3730413105413106, + "grad_norm": 0.640333354473114, + "learning_rate": 0.00014743721946487723, + "loss": 1.018, + "step": 7712 + }, + { + "epoch": 1.3732193732193732, + "grad_norm": 0.6772079467773438, + "learning_rate": 0.0001474248966841369, + "loss": 1.0983, + "step": 7713 + }, + { + "epoch": 1.373397435897436, + "grad_norm": 0.49630534648895264, + "learning_rate": 0.00014741257297420422, + "loss": 0.5238, + "step": 7714 + }, + { + "epoch": 1.3735754985754987, + "grad_norm": 0.6316596269607544, + "learning_rate": 0.00014740024833532068, + "loss": 1.1342, + "step": 7715 + }, + { + "epoch": 1.3737535612535612, + "grad_norm": 0.5928404331207275, + "learning_rate": 0.00014738792276772775, + "loss": 0.7987, + "step": 7716 + }, + { + "epoch": 1.373931623931624, + "grad_norm": 0.6773418188095093, + "learning_rate": 0.00014737559627166688, + "loss": 0.934, + "step": 7717 + }, + { + "epoch": 1.3741096866096867, + "grad_norm": 0.7895028591156006, + "learning_rate": 0.00014736326884737963, + "loss": 0.984, + "step": 7718 + }, + { + "epoch": 1.3742877492877492, + "grad_norm": 0.7074753046035767, + "learning_rate": 0.00014735094049510752, + "loss": 1.0093, + "step": 7719 + }, + { + "epoch": 1.374465811965812, + "grad_norm": 0.5389847159385681, + "learning_rate": 0.00014733861121509208, + "loss": 0.8138, + "step": 7720 + }, + { + "epoch": 1.3746438746438747, + "grad_norm": 0.6138495206832886, + "learning_rate": 0.00014732628100757493, + "loss": 0.9282, + "step": 7721 + }, + { + "epoch": 1.3748219373219372, + "grad_norm": 0.7609560489654541, + "learning_rate": 0.00014731394987279757, + "loss": 0.9859, + "step": 7722 + }, + { + "epoch": 1.375, + "grad_norm": 
0.6806198954582214, + "learning_rate": 0.00014730161781100165, + "loss": 0.8932, + "step": 7723 + }, + { + "epoch": 1.3751780626780628, + "grad_norm": 0.7229103446006775, + "learning_rate": 0.0001472892848224288, + "loss": 0.956, + "step": 7724 + }, + { + "epoch": 1.3753561253561253, + "grad_norm": 0.6157994866371155, + "learning_rate": 0.00014727695090732066, + "loss": 1.0285, + "step": 7725 + }, + { + "epoch": 1.375534188034188, + "grad_norm": 0.5885980129241943, + "learning_rate": 0.00014726461606591885, + "loss": 0.9174, + "step": 7726 + }, + { + "epoch": 1.3757122507122508, + "grad_norm": 0.6655769944190979, + "learning_rate": 0.0001472522802984651, + "loss": 0.9059, + "step": 7727 + }, + { + "epoch": 1.3758903133903133, + "grad_norm": 0.7075541019439697, + "learning_rate": 0.00014723994360520105, + "loss": 1.0055, + "step": 7728 + }, + { + "epoch": 1.376068376068376, + "grad_norm": 0.6947159171104431, + "learning_rate": 0.00014722760598636847, + "loss": 0.9782, + "step": 7729 + }, + { + "epoch": 1.3762464387464388, + "grad_norm": 0.6629964709281921, + "learning_rate": 0.00014721526744220905, + "loss": 0.9427, + "step": 7730 + }, + { + "epoch": 1.3764245014245013, + "grad_norm": 0.7385284304618835, + "learning_rate": 0.00014720292797296453, + "loss": 0.9953, + "step": 7731 + }, + { + "epoch": 1.376602564102564, + "grad_norm": 0.6123563051223755, + "learning_rate": 0.0001471905875788767, + "loss": 1.0103, + "step": 7732 + }, + { + "epoch": 1.3767806267806268, + "grad_norm": 0.6457047462463379, + "learning_rate": 0.00014717824626018732, + "loss": 0.9779, + "step": 7733 + }, + { + "epoch": 1.3769586894586894, + "grad_norm": 0.6196442246437073, + "learning_rate": 0.00014716590401713824, + "loss": 0.8747, + "step": 7734 + }, + { + "epoch": 1.3771367521367521, + "grad_norm": 0.7932298183441162, + "learning_rate": 0.00014715356084997122, + "loss": 1.1617, + "step": 7735 + }, + { + "epoch": 1.3773148148148149, + "grad_norm": 0.787304699420929, + "learning_rate": 
0.00014714121675892815, + "loss": 1.1383, + "step": 7736 + }, + { + "epoch": 1.3774928774928774, + "grad_norm": 0.672795295715332, + "learning_rate": 0.00014712887174425085, + "loss": 1.2563, + "step": 7737 + }, + { + "epoch": 1.3776709401709402, + "grad_norm": 0.6505744457244873, + "learning_rate": 0.00014711652580618123, + "loss": 0.9194, + "step": 7738 + }, + { + "epoch": 1.377849002849003, + "grad_norm": 0.8141193985939026, + "learning_rate": 0.00014710417894496115, + "loss": 1.1428, + "step": 7739 + }, + { + "epoch": 1.3780270655270654, + "grad_norm": 0.6269707679748535, + "learning_rate": 0.00014709183116083253, + "loss": 0.7164, + "step": 7740 + }, + { + "epoch": 1.3782051282051282, + "grad_norm": 0.6737076640129089, + "learning_rate": 0.0001470794824540373, + "loss": 0.9965, + "step": 7741 + }, + { + "epoch": 1.378383190883191, + "grad_norm": 0.6451728343963623, + "learning_rate": 0.0001470671328248174, + "loss": 1.0539, + "step": 7742 + }, + { + "epoch": 1.3785612535612537, + "grad_norm": 0.6480295062065125, + "learning_rate": 0.00014705478227341486, + "loss": 0.9118, + "step": 7743 + }, + { + "epoch": 1.3787393162393162, + "grad_norm": 0.7429090738296509, + "learning_rate": 0.00014704243080007154, + "loss": 1.0031, + "step": 7744 + }, + { + "epoch": 1.378917378917379, + "grad_norm": 0.5601376891136169, + "learning_rate": 0.00014703007840502955, + "loss": 0.849, + "step": 7745 + }, + { + "epoch": 1.3790954415954415, + "grad_norm": 0.7067657113075256, + "learning_rate": 0.00014701772508853088, + "loss": 1.3067, + "step": 7746 + }, + { + "epoch": 1.3792735042735043, + "grad_norm": 0.7016390562057495, + "learning_rate": 0.00014700537085081755, + "loss": 1.0236, + "step": 7747 + }, + { + "epoch": 1.379451566951567, + "grad_norm": 0.6505000591278076, + "learning_rate": 0.0001469930156921316, + "loss": 1.0121, + "step": 7748 + }, + { + "epoch": 1.3796296296296298, + "grad_norm": 0.8515380620956421, + "learning_rate": 0.00014698065961271512, + "loss": 1.0413, + 
"step": 7749 + }, + { + "epoch": 1.3798076923076923, + "grad_norm": 0.6322008371353149, + "learning_rate": 0.00014696830261281025, + "loss": 0.8306, + "step": 7750 + }, + { + "epoch": 1.379985754985755, + "grad_norm": 0.7090431451797485, + "learning_rate": 0.00014695594469265902, + "loss": 1.1829, + "step": 7751 + }, + { + "epoch": 1.3801638176638176, + "grad_norm": 0.5913167595863342, + "learning_rate": 0.00014694358585250363, + "loss": 0.9769, + "step": 7752 + }, + { + "epoch": 1.3803418803418803, + "grad_norm": 0.7345432639122009, + "learning_rate": 0.00014693122609258616, + "loss": 0.9928, + "step": 7753 + }, + { + "epoch": 1.380519943019943, + "grad_norm": 0.6158214211463928, + "learning_rate": 0.00014691886541314884, + "loss": 1.1166, + "step": 7754 + }, + { + "epoch": 1.3806980056980058, + "grad_norm": 0.6874041557312012, + "learning_rate": 0.0001469065038144338, + "loss": 1.0808, + "step": 7755 + }, + { + "epoch": 1.3808760683760684, + "grad_norm": 0.8135195970535278, + "learning_rate": 0.00014689414129668326, + "loss": 0.9482, + "step": 7756 + }, + { + "epoch": 1.381054131054131, + "grad_norm": 0.6389174461364746, + "learning_rate": 0.00014688177786013944, + "loss": 1.039, + "step": 7757 + }, + { + "epoch": 1.3812321937321936, + "grad_norm": 0.6953016519546509, + "learning_rate": 0.00014686941350504454, + "loss": 0.9426, + "step": 7758 + }, + { + "epoch": 1.3814102564102564, + "grad_norm": 0.8171859383583069, + "learning_rate": 0.00014685704823164087, + "loss": 1.0393, + "step": 7759 + }, + { + "epoch": 1.3815883190883191, + "grad_norm": 0.6968414783477783, + "learning_rate": 0.0001468446820401707, + "loss": 1.1167, + "step": 7760 + }, + { + "epoch": 1.381766381766382, + "grad_norm": 0.6916623711585999, + "learning_rate": 0.00014683231493087628, + "loss": 1.1886, + "step": 7761 + }, + { + "epoch": 1.3819444444444444, + "grad_norm": 0.7351683378219604, + "learning_rate": 0.00014681994690399992, + "loss": 0.9893, + "step": 7762 + }, + { + "epoch": 
1.3821225071225072, + "grad_norm": 0.6617491245269775, + "learning_rate": 0.00014680757795978395, + "loss": 1.0505, + "step": 7763 + }, + { + "epoch": 1.3823005698005697, + "grad_norm": 0.6627485156059265, + "learning_rate": 0.00014679520809847074, + "loss": 0.9878, + "step": 7764 + }, + { + "epoch": 1.3824786324786325, + "grad_norm": 0.704636812210083, + "learning_rate": 0.00014678283732030264, + "loss": 0.8332, + "step": 7765 + }, + { + "epoch": 1.3826566951566952, + "grad_norm": 0.698853075504303, + "learning_rate": 0.00014677046562552203, + "loss": 1.0926, + "step": 7766 + }, + { + "epoch": 1.382834757834758, + "grad_norm": 0.6695869565010071, + "learning_rate": 0.0001467580930143713, + "loss": 1.0626, + "step": 7767 + }, + { + "epoch": 1.3830128205128205, + "grad_norm": 0.672173023223877, + "learning_rate": 0.00014674571948709286, + "loss": 0.8842, + "step": 7768 + }, + { + "epoch": 1.3831908831908832, + "grad_norm": 0.6735473871231079, + "learning_rate": 0.00014673334504392916, + "loss": 0.9382, + "step": 7769 + }, + { + "epoch": 1.3833689458689458, + "grad_norm": 0.6864013075828552, + "learning_rate": 0.00014672096968512265, + "loss": 1.1369, + "step": 7770 + }, + { + "epoch": 1.3835470085470085, + "grad_norm": 0.7154954075813293, + "learning_rate": 0.0001467085934109158, + "loss": 1.1447, + "step": 7771 + }, + { + "epoch": 1.3837250712250713, + "grad_norm": 0.5934487581253052, + "learning_rate": 0.0001466962162215511, + "loss": 0.8923, + "step": 7772 + }, + { + "epoch": 1.383903133903134, + "grad_norm": 0.8116832971572876, + "learning_rate": 0.00014668383811727097, + "loss": 1.0997, + "step": 7773 + }, + { + "epoch": 1.3840811965811965, + "grad_norm": 0.8661674857139587, + "learning_rate": 0.00014667145909831808, + "loss": 1.0112, + "step": 7774 + }, + { + "epoch": 1.3842592592592593, + "grad_norm": 0.5173856616020203, + "learning_rate": 0.00014665907916493488, + "loss": 0.6571, + "step": 7775 + }, + { + "epoch": 1.3844373219373218, + "grad_norm": 
0.6165067553520203, + "learning_rate": 0.00014664669831736395, + "loss": 1.0992, + "step": 7776 + }, + { + "epoch": 1.3846153846153846, + "grad_norm": 0.6564429998397827, + "learning_rate": 0.00014663431655584787, + "loss": 0.9103, + "step": 7777 + }, + { + "epoch": 1.3847934472934473, + "grad_norm": 0.7162124514579773, + "learning_rate": 0.00014662193388062923, + "loss": 1.0645, + "step": 7778 + }, + { + "epoch": 1.38497150997151, + "grad_norm": 0.6391215920448303, + "learning_rate": 0.00014660955029195064, + "loss": 0.902, + "step": 7779 + }, + { + "epoch": 1.3851495726495726, + "grad_norm": 0.6876635551452637, + "learning_rate": 0.00014659716579005475, + "loss": 1.0924, + "step": 7780 + }, + { + "epoch": 1.3853276353276354, + "grad_norm": 0.7254653573036194, + "learning_rate": 0.00014658478037518418, + "loss": 1.0135, + "step": 7781 + }, + { + "epoch": 1.385505698005698, + "grad_norm": 0.6900535225868225, + "learning_rate": 0.00014657239404758162, + "loss": 0.983, + "step": 7782 + }, + { + "epoch": 1.3856837606837606, + "grad_norm": 0.7477042078971863, + "learning_rate": 0.00014656000680748975, + "loss": 1.0707, + "step": 7783 + }, + { + "epoch": 1.3858618233618234, + "grad_norm": 0.5756927132606506, + "learning_rate": 0.00014654761865515124, + "loss": 0.8881, + "step": 7784 + }, + { + "epoch": 1.3860398860398861, + "grad_norm": 0.6736083626747131, + "learning_rate": 0.00014653522959080884, + "loss": 1.0193, + "step": 7785 + }, + { + "epoch": 1.3862179487179487, + "grad_norm": 0.616179883480072, + "learning_rate": 0.0001465228396147053, + "loss": 0.8676, + "step": 7786 + }, + { + "epoch": 1.3863960113960114, + "grad_norm": 0.7956456542015076, + "learning_rate": 0.00014651044872708338, + "loss": 0.9787, + "step": 7787 + }, + { + "epoch": 1.386574074074074, + "grad_norm": 0.6613463163375854, + "learning_rate": 0.00014649805692818578, + "loss": 1.0032, + "step": 7788 + }, + { + "epoch": 1.3867521367521367, + "grad_norm": 0.6215800642967224, + "learning_rate": 
0.0001464856642182554, + "loss": 1.0123, + "step": 7789 + }, + { + "epoch": 1.3869301994301995, + "grad_norm": 0.6701171398162842, + "learning_rate": 0.00014647327059753496, + "loss": 0.9108, + "step": 7790 + }, + { + "epoch": 1.3871082621082622, + "grad_norm": 0.6213465929031372, + "learning_rate": 0.00014646087606626736, + "loss": 0.9313, + "step": 7791 + }, + { + "epoch": 1.3872863247863247, + "grad_norm": 0.7535304427146912, + "learning_rate": 0.00014644848062469535, + "loss": 1.0813, + "step": 7792 + }, + { + "epoch": 1.3874643874643875, + "grad_norm": 0.6778230667114258, + "learning_rate": 0.0001464360842730619, + "loss": 1.0405, + "step": 7793 + }, + { + "epoch": 1.38764245014245, + "grad_norm": 0.7816025614738464, + "learning_rate": 0.0001464236870116098, + "loss": 0.9228, + "step": 7794 + }, + { + "epoch": 1.3878205128205128, + "grad_norm": 0.6815229058265686, + "learning_rate": 0.00014641128884058203, + "loss": 0.9607, + "step": 7795 + }, + { + "epoch": 1.3879985754985755, + "grad_norm": 0.7027714848518372, + "learning_rate": 0.00014639888976022145, + "loss": 0.9379, + "step": 7796 + }, + { + "epoch": 1.3881766381766383, + "grad_norm": 0.7636353373527527, + "learning_rate": 0.00014638648977077104, + "loss": 1.1186, + "step": 7797 + }, + { + "epoch": 1.3883547008547008, + "grad_norm": 0.6732974052429199, + "learning_rate": 0.00014637408887247365, + "loss": 1.1378, + "step": 7798 + }, + { + "epoch": 1.3885327635327636, + "grad_norm": 0.7539397478103638, + "learning_rate": 0.0001463616870655724, + "loss": 0.999, + "step": 7799 + }, + { + "epoch": 1.388710826210826, + "grad_norm": 0.6872972846031189, + "learning_rate": 0.00014634928435031013, + "loss": 0.9564, + "step": 7800 + }, + { + "epoch": 1.3888888888888888, + "grad_norm": 0.6823115348815918, + "learning_rate": 0.00014633688072693, + "loss": 0.9745, + "step": 7801 + }, + { + "epoch": 1.3890669515669516, + "grad_norm": 0.6462571620941162, + "learning_rate": 0.00014632447619567488, + "loss": 0.8314, + 
"step": 7802 + }, + { + "epoch": 1.3892450142450143, + "grad_norm": 0.7245402932167053, + "learning_rate": 0.0001463120707567879, + "loss": 0.8291, + "step": 7803 + }, + { + "epoch": 1.3894230769230769, + "grad_norm": 0.697179913520813, + "learning_rate": 0.00014629966441051208, + "loss": 1.017, + "step": 7804 + }, + { + "epoch": 1.3896011396011396, + "grad_norm": 0.6304250359535217, + "learning_rate": 0.00014628725715709053, + "loss": 0.9262, + "step": 7805 + }, + { + "epoch": 1.3897792022792022, + "grad_norm": 0.5780240297317505, + "learning_rate": 0.00014627484899676634, + "loss": 0.6596, + "step": 7806 + }, + { + "epoch": 1.389957264957265, + "grad_norm": 0.8030684590339661, + "learning_rate": 0.0001462624399297826, + "loss": 0.9977, + "step": 7807 + }, + { + "epoch": 1.3901353276353277, + "grad_norm": 0.7999774813652039, + "learning_rate": 0.00014625002995638246, + "loss": 1.1036, + "step": 7808 + }, + { + "epoch": 1.3903133903133904, + "grad_norm": 0.7054862976074219, + "learning_rate": 0.00014623761907680904, + "loss": 1.1435, + "step": 7809 + }, + { + "epoch": 1.390491452991453, + "grad_norm": 0.6660647392272949, + "learning_rate": 0.00014622520729130556, + "loss": 0.703, + "step": 7810 + }, + { + "epoch": 1.3906695156695157, + "grad_norm": 0.6339690089225769, + "learning_rate": 0.00014621279460011515, + "loss": 1.0451, + "step": 7811 + }, + { + "epoch": 1.3908475783475782, + "grad_norm": 0.8568736910820007, + "learning_rate": 0.00014620038100348102, + "loss": 1.009, + "step": 7812 + }, + { + "epoch": 1.391025641025641, + "grad_norm": 0.7126797437667847, + "learning_rate": 0.00014618796650164642, + "loss": 0.9592, + "step": 7813 + }, + { + "epoch": 1.3912037037037037, + "grad_norm": 0.6768994331359863, + "learning_rate": 0.00014617555109485453, + "loss": 1.09, + "step": 7814 + }, + { + "epoch": 1.3913817663817665, + "grad_norm": 0.7609471678733826, + "learning_rate": 0.00014616313478334864, + "loss": 0.9781, + "step": 7815 + }, + { + "epoch": 
1.391559829059829, + "grad_norm": 0.7107006907463074, + "learning_rate": 0.00014615071756737203, + "loss": 0.9769, + "step": 7816 + }, + { + "epoch": 1.3917378917378918, + "grad_norm": 0.6324763894081116, + "learning_rate": 0.00014613829944716802, + "loss": 1.089, + "step": 7817 + }, + { + "epoch": 1.3919159544159543, + "grad_norm": 0.6617186069488525, + "learning_rate": 0.00014612588042297984, + "loss": 1.0466, + "step": 7818 + }, + { + "epoch": 1.392094017094017, + "grad_norm": 0.7881436944007874, + "learning_rate": 0.00014611346049505083, + "loss": 1.003, + "step": 7819 + }, + { + "epoch": 1.3922720797720798, + "grad_norm": 0.7391049861907959, + "learning_rate": 0.00014610103966362437, + "loss": 1.0531, + "step": 7820 + }, + { + "epoch": 1.3924501424501425, + "grad_norm": 0.6299472451210022, + "learning_rate": 0.00014608861792894383, + "loss": 0.8433, + "step": 7821 + }, + { + "epoch": 1.392628205128205, + "grad_norm": 0.6053452491760254, + "learning_rate": 0.00014607619529125255, + "loss": 0.7945, + "step": 7822 + }, + { + "epoch": 1.3928062678062678, + "grad_norm": 0.7160114645957947, + "learning_rate": 0.0001460637717507939, + "loss": 1.1604, + "step": 7823 + }, + { + "epoch": 1.3929843304843303, + "grad_norm": 0.6308854222297668, + "learning_rate": 0.00014605134730781135, + "loss": 1.0918, + "step": 7824 + }, + { + "epoch": 1.393162393162393, + "grad_norm": 0.7187000513076782, + "learning_rate": 0.00014603892196254833, + "loss": 1.0594, + "step": 7825 + }, + { + "epoch": 1.3933404558404558, + "grad_norm": 0.7516581416130066, + "learning_rate": 0.00014602649571524826, + "loss": 0.9222, + "step": 7826 + }, + { + "epoch": 1.3935185185185186, + "grad_norm": 0.6340481638908386, + "learning_rate": 0.00014601406856615463, + "loss": 0.8131, + "step": 7827 + }, + { + "epoch": 1.3936965811965811, + "grad_norm": 0.8161744475364685, + "learning_rate": 0.0001460016405155109, + "loss": 0.8695, + "step": 7828 + }, + { + "epoch": 1.3938746438746439, + "grad_norm": 
0.6926971077919006, + "learning_rate": 0.0001459892115635606, + "loss": 0.9548, + "step": 7829 + }, + { + "epoch": 1.3940527065527066, + "grad_norm": 0.6669796109199524, + "learning_rate": 0.0001459767817105472, + "loss": 0.9255, + "step": 7830 + }, + { + "epoch": 1.3942307692307692, + "grad_norm": 0.6626184582710266, + "learning_rate": 0.00014596435095671432, + "loss": 1.1141, + "step": 7831 + }, + { + "epoch": 1.394408831908832, + "grad_norm": 0.6755738854408264, + "learning_rate": 0.00014595191930230546, + "loss": 0.9596, + "step": 7832 + }, + { + "epoch": 1.3945868945868947, + "grad_norm": 0.6034863591194153, + "learning_rate": 0.00014593948674756417, + "loss": 0.8088, + "step": 7833 + }, + { + "epoch": 1.3947649572649572, + "grad_norm": 0.5638226866722107, + "learning_rate": 0.00014592705329273406, + "loss": 0.5828, + "step": 7834 + }, + { + "epoch": 1.39494301994302, + "grad_norm": 0.6902222633361816, + "learning_rate": 0.0001459146189380588, + "loss": 0.7954, + "step": 7835 + }, + { + "epoch": 1.3951210826210827, + "grad_norm": 0.7579947710037231, + "learning_rate": 0.0001459021836837819, + "loss": 1.1301, + "step": 7836 + }, + { + "epoch": 1.3952991452991452, + "grad_norm": 0.6894911527633667, + "learning_rate": 0.00014588974753014712, + "loss": 1.082, + "step": 7837 + }, + { + "epoch": 1.395477207977208, + "grad_norm": 0.6330230832099915, + "learning_rate": 0.000145877310477398, + "loss": 0.7614, + "step": 7838 + }, + { + "epoch": 1.3956552706552707, + "grad_norm": 0.6164960265159607, + "learning_rate": 0.00014586487252577832, + "loss": 0.8981, + "step": 7839 + }, + { + "epoch": 1.3958333333333333, + "grad_norm": 0.6575061678886414, + "learning_rate": 0.0001458524336755317, + "loss": 0.9735, + "step": 7840 + }, + { + "epoch": 1.396011396011396, + "grad_norm": 0.687921941280365, + "learning_rate": 0.00014583999392690195, + "loss": 0.9207, + "step": 7841 + }, + { + "epoch": 1.3961894586894588, + "grad_norm": 0.6175212860107422, + "learning_rate": 
0.00014582755328013274, + "loss": 1.0444, + "step": 7842 + }, + { + "epoch": 1.3963675213675213, + "grad_norm": 0.6351733207702637, + "learning_rate": 0.00014581511173546781, + "loss": 1.0143, + "step": 7843 + }, + { + "epoch": 1.396545584045584, + "grad_norm": 0.7235051989555359, + "learning_rate": 0.00014580266929315093, + "loss": 0.9108, + "step": 7844 + }, + { + "epoch": 1.3967236467236468, + "grad_norm": 0.6432043313980103, + "learning_rate": 0.00014579022595342586, + "loss": 0.8674, + "step": 7845 + }, + { + "epoch": 1.3969017094017093, + "grad_norm": 0.7775412797927856, + "learning_rate": 0.00014577778171653648, + "loss": 1.0637, + "step": 7846 + }, + { + "epoch": 1.397079772079772, + "grad_norm": 0.6748763918876648, + "learning_rate": 0.00014576533658272655, + "loss": 1.0356, + "step": 7847 + }, + { + "epoch": 1.3972578347578348, + "grad_norm": 0.6940401196479797, + "learning_rate": 0.00014575289055223994, + "loss": 0.9937, + "step": 7848 + }, + { + "epoch": 1.3974358974358974, + "grad_norm": 0.6971304416656494, + "learning_rate": 0.00014574044362532045, + "loss": 0.9753, + "step": 7849 + }, + { + "epoch": 1.39761396011396, + "grad_norm": 0.6576017141342163, + "learning_rate": 0.00014572799580221197, + "loss": 1.1233, + "step": 7850 + }, + { + "epoch": 1.3977920227920229, + "grad_norm": 0.6270702481269836, + "learning_rate": 0.00014571554708315843, + "loss": 0.9771, + "step": 7851 + }, + { + "epoch": 1.3979700854700854, + "grad_norm": 0.6898425817489624, + "learning_rate": 0.00014570309746840372, + "loss": 0.9235, + "step": 7852 + }, + { + "epoch": 1.3981481481481481, + "grad_norm": 0.7017102241516113, + "learning_rate": 0.00014569064695819174, + "loss": 1.1056, + "step": 7853 + }, + { + "epoch": 1.398326210826211, + "grad_norm": 0.6298288702964783, + "learning_rate": 0.00014567819555276647, + "loss": 0.8635, + "step": 7854 + }, + { + "epoch": 1.3985042735042734, + "grad_norm": 0.7173134684562683, + "learning_rate": 0.00014566574325237182, + "loss": 1.0893, 
+ "step": 7855 + }, + { + "epoch": 1.3986823361823362, + "grad_norm": 0.7541036605834961, + "learning_rate": 0.0001456532900572518, + "loss": 1.0996, + "step": 7856 + }, + { + "epoch": 1.398860398860399, + "grad_norm": 0.6204771399497986, + "learning_rate": 0.0001456408359676504, + "loss": 0.7601, + "step": 7857 + }, + { + "epoch": 1.3990384615384617, + "grad_norm": 0.629557192325592, + "learning_rate": 0.00014562838098381163, + "loss": 0.9239, + "step": 7858 + }, + { + "epoch": 1.3992165242165242, + "grad_norm": 0.6878390908241272, + "learning_rate": 0.00014561592510597954, + "loss": 0.9641, + "step": 7859 + }, + { + "epoch": 1.399394586894587, + "grad_norm": 0.7490049004554749, + "learning_rate": 0.00014560346833439813, + "loss": 1.0198, + "step": 7860 + }, + { + "epoch": 1.3995726495726495, + "grad_norm": 0.6337960958480835, + "learning_rate": 0.0001455910106693115, + "loss": 0.8709, + "step": 7861 + }, + { + "epoch": 1.3997507122507122, + "grad_norm": 0.6210524439811707, + "learning_rate": 0.0001455785521109637, + "loss": 1.1049, + "step": 7862 + }, + { + "epoch": 1.399928774928775, + "grad_norm": 0.7894936203956604, + "learning_rate": 0.00014556609265959887, + "loss": 0.8933, + "step": 7863 + }, + { + "epoch": 1.4001068376068377, + "grad_norm": 0.6888098120689392, + "learning_rate": 0.00014555363231546112, + "loss": 0.9738, + "step": 7864 + }, + { + "epoch": 1.4002849002849003, + "grad_norm": 0.608799934387207, + "learning_rate": 0.00014554117107879456, + "loss": 0.9103, + "step": 7865 + }, + { + "epoch": 1.400462962962963, + "grad_norm": 0.7390474081039429, + "learning_rate": 0.00014552870894984335, + "loss": 1.2484, + "step": 7866 + }, + { + "epoch": 1.4006410256410255, + "grad_norm": 0.6513381600379944, + "learning_rate": 0.00014551624592885169, + "loss": 0.8523, + "step": 7867 + }, + { + "epoch": 1.4008190883190883, + "grad_norm": 0.6357464790344238, + "learning_rate": 0.00014550378201606373, + "loss": 0.9594, + "step": 7868 + }, + { + "epoch": 
1.400997150997151, + "grad_norm": 0.6893286108970642, + "learning_rate": 0.0001454913172117237, + "loss": 0.9798, + "step": 7869 + }, + { + "epoch": 1.4011752136752138, + "grad_norm": 0.6566550731658936, + "learning_rate": 0.0001454788515160758, + "loss": 1.0532, + "step": 7870 + }, + { + "epoch": 1.4013532763532763, + "grad_norm": 0.6442158222198486, + "learning_rate": 0.00014546638492936425, + "loss": 1.0789, + "step": 7871 + }, + { + "epoch": 1.401531339031339, + "grad_norm": 0.7570971846580505, + "learning_rate": 0.0001454539174518334, + "loss": 0.9806, + "step": 7872 + }, + { + "epoch": 1.4017094017094016, + "grad_norm": 0.6180047392845154, + "learning_rate": 0.0001454414490837274, + "loss": 0.857, + "step": 7873 + }, + { + "epoch": 1.4018874643874644, + "grad_norm": 0.7143170237541199, + "learning_rate": 0.0001454289798252906, + "loss": 0.8815, + "step": 7874 + }, + { + "epoch": 1.4020655270655271, + "grad_norm": 0.6388922929763794, + "learning_rate": 0.00014541650967676736, + "loss": 0.95, + "step": 7875 + }, + { + "epoch": 1.4022435897435899, + "grad_norm": 0.7137351632118225, + "learning_rate": 0.00014540403863840193, + "loss": 0.8973, + "step": 7876 + }, + { + "epoch": 1.4024216524216524, + "grad_norm": 0.656315267086029, + "learning_rate": 0.0001453915667104387, + "loss": 1.149, + "step": 7877 + }, + { + "epoch": 1.4025997150997151, + "grad_norm": 0.7234711647033691, + "learning_rate": 0.000145379093893122, + "loss": 0.9798, + "step": 7878 + }, + { + "epoch": 1.4027777777777777, + "grad_norm": 0.6595289707183838, + "learning_rate": 0.00014536662018669623, + "loss": 1.2704, + "step": 7879 + }, + { + "epoch": 1.4029558404558404, + "grad_norm": 0.6760551333427429, + "learning_rate": 0.00014535414559140576, + "loss": 0.8672, + "step": 7880 + }, + { + "epoch": 1.4031339031339032, + "grad_norm": 0.5916706919670105, + "learning_rate": 0.000145341670107495, + "loss": 0.888, + "step": 7881 + }, + { + "epoch": 1.403311965811966, + "grad_norm": 0.7272133231163025, 
+ "learning_rate": 0.00014532919373520846, + "loss": 1.0466, + "step": 7882 + }, + { + "epoch": 1.4034900284900285, + "grad_norm": 0.8512467741966248, + "learning_rate": 0.00014531671647479048, + "loss": 1.2482, + "step": 7883 + }, + { + "epoch": 1.4036680911680912, + "grad_norm": 0.5536492466926575, + "learning_rate": 0.0001453042383264856, + "loss": 0.7823, + "step": 7884 + }, + { + "epoch": 1.4038461538461537, + "grad_norm": 0.7262215614318848, + "learning_rate": 0.0001452917592905383, + "loss": 0.9713, + "step": 7885 + }, + { + "epoch": 1.4040242165242165, + "grad_norm": 0.7146059274673462, + "learning_rate": 0.00014527927936719304, + "loss": 1.1064, + "step": 7886 + }, + { + "epoch": 1.4042022792022792, + "grad_norm": 0.5915318131446838, + "learning_rate": 0.00014526679855669436, + "loss": 0.8567, + "step": 7887 + }, + { + "epoch": 1.404380341880342, + "grad_norm": 0.6548298001289368, + "learning_rate": 0.00014525431685928682, + "loss": 1.1359, + "step": 7888 + }, + { + "epoch": 1.4045584045584045, + "grad_norm": 0.7482563853263855, + "learning_rate": 0.0001452418342752149, + "loss": 0.9095, + "step": 7889 + }, + { + "epoch": 1.4047364672364673, + "grad_norm": 0.6660130023956299, + "learning_rate": 0.0001452293508047233, + "loss": 1.2343, + "step": 7890 + }, + { + "epoch": 1.4049145299145298, + "grad_norm": 0.7457148432731628, + "learning_rate": 0.00014521686644805644, + "loss": 1.2086, + "step": 7891 + }, + { + "epoch": 1.4050925925925926, + "grad_norm": 0.5957929491996765, + "learning_rate": 0.00014520438120545906, + "loss": 0.9724, + "step": 7892 + }, + { + "epoch": 1.4052706552706553, + "grad_norm": 0.6832270622253418, + "learning_rate": 0.00014519189507717573, + "loss": 0.9903, + "step": 7893 + }, + { + "epoch": 1.405448717948718, + "grad_norm": 0.6202489733695984, + "learning_rate": 0.00014517940806345109, + "loss": 0.962, + "step": 7894 + }, + { + "epoch": 1.4056267806267806, + "grad_norm": 0.6419472694396973, + "learning_rate": 0.0001451669201645298, + 
"loss": 0.8147, + "step": 7895 + }, + { + "epoch": 1.4058048433048433, + "grad_norm": 0.61143958568573, + "learning_rate": 0.00014515443138065652, + "loss": 0.8674, + "step": 7896 + }, + { + "epoch": 1.4059829059829059, + "grad_norm": 0.7527356743812561, + "learning_rate": 0.00014514194171207597, + "loss": 1.0581, + "step": 7897 + }, + { + "epoch": 1.4061609686609686, + "grad_norm": 0.7195194363594055, + "learning_rate": 0.00014512945115903285, + "loss": 1.0268, + "step": 7898 + }, + { + "epoch": 1.4063390313390314, + "grad_norm": 0.7919661998748779, + "learning_rate": 0.00014511695972177187, + "loss": 1.0259, + "step": 7899 + }, + { + "epoch": 1.4065170940170941, + "grad_norm": 0.6774758696556091, + "learning_rate": 0.00014510446740053783, + "loss": 1.1214, + "step": 7900 + }, + { + "epoch": 1.4066951566951567, + "grad_norm": 0.6102406978607178, + "learning_rate": 0.0001450919741955754, + "loss": 1.1846, + "step": 7901 + }, + { + "epoch": 1.4068732193732194, + "grad_norm": 0.7189443707466125, + "learning_rate": 0.00014507948010712942, + "loss": 0.7758, + "step": 7902 + }, + { + "epoch": 1.407051282051282, + "grad_norm": 0.654153048992157, + "learning_rate": 0.00014506698513544467, + "loss": 0.899, + "step": 7903 + }, + { + "epoch": 1.4072293447293447, + "grad_norm": 0.637934684753418, + "learning_rate": 0.00014505448928076598, + "loss": 0.8301, + "step": 7904 + }, + { + "epoch": 1.4074074074074074, + "grad_norm": 0.7504615783691406, + "learning_rate": 0.00014504199254333812, + "loss": 0.9883, + "step": 7905 + }, + { + "epoch": 1.4075854700854702, + "grad_norm": 0.7902522683143616, + "learning_rate": 0.00014502949492340602, + "loss": 0.9615, + "step": 7906 + }, + { + "epoch": 1.4077635327635327, + "grad_norm": 0.5832732319831848, + "learning_rate": 0.0001450169964212145, + "loss": 0.7136, + "step": 7907 + }, + { + "epoch": 1.4079415954415955, + "grad_norm": 0.6025400757789612, + "learning_rate": 0.00014500449703700846, + "loss": 0.8812, + "step": 7908 + }, + { + 
"epoch": 1.408119658119658, + "grad_norm": 0.6412411332130432, + "learning_rate": 0.0001449919967710328, + "loss": 0.9346, + "step": 7909 + }, + { + "epoch": 1.4082977207977208, + "grad_norm": 0.7546970844268799, + "learning_rate": 0.00014497949562353242, + "loss": 1.0794, + "step": 7910 + }, + { + "epoch": 1.4084757834757835, + "grad_norm": 0.6175593733787537, + "learning_rate": 0.00014496699359475222, + "loss": 0.8939, + "step": 7911 + }, + { + "epoch": 1.4086538461538463, + "grad_norm": 0.6571716666221619, + "learning_rate": 0.00014495449068493722, + "loss": 1.1003, + "step": 7912 + }, + { + "epoch": 1.4088319088319088, + "grad_norm": 0.7038990259170532, + "learning_rate": 0.00014494198689433236, + "loss": 0.8844, + "step": 7913 + }, + { + "epoch": 1.4090099715099715, + "grad_norm": 0.7007337212562561, + "learning_rate": 0.00014492948222318263, + "loss": 1.2038, + "step": 7914 + }, + { + "epoch": 1.409188034188034, + "grad_norm": 0.7318591475486755, + "learning_rate": 0.00014491697667173302, + "loss": 1.0388, + "step": 7915 + }, + { + "epoch": 1.4093660968660968, + "grad_norm": 0.7010329961776733, + "learning_rate": 0.00014490447024022855, + "loss": 1.1485, + "step": 7916 + }, + { + "epoch": 1.4095441595441596, + "grad_norm": 0.7844831347465515, + "learning_rate": 0.0001448919629289143, + "loss": 1.1417, + "step": 7917 + }, + { + "epoch": 1.4097222222222223, + "grad_norm": 0.6953392624855042, + "learning_rate": 0.00014487945473803525, + "loss": 0.9546, + "step": 7918 + }, + { + "epoch": 1.4099002849002849, + "grad_norm": 0.6307587623596191, + "learning_rate": 0.00014486694566783655, + "loss": 0.9912, + "step": 7919 + }, + { + "epoch": 1.4100783475783476, + "grad_norm": 0.6200215816497803, + "learning_rate": 0.00014485443571856326, + "loss": 1.0998, + "step": 7920 + }, + { + "epoch": 1.4102564102564101, + "grad_norm": 0.7096502184867859, + "learning_rate": 0.00014484192489046043, + "loss": 0.9587, + "step": 7921 + }, + { + "epoch": 1.4104344729344729, + 
"grad_norm": 0.6965526342391968, + "learning_rate": 0.00014482941318377327, + "loss": 0.8791, + "step": 7922 + }, + { + "epoch": 1.4106125356125356, + "grad_norm": 0.7303466200828552, + "learning_rate": 0.00014481690059874687, + "loss": 1.084, + "step": 7923 + }, + { + "epoch": 1.4107905982905984, + "grad_norm": 0.6144066452980042, + "learning_rate": 0.00014480438713562638, + "loss": 0.9646, + "step": 7924 + }, + { + "epoch": 1.410968660968661, + "grad_norm": 0.645222008228302, + "learning_rate": 0.00014479187279465704, + "loss": 0.728, + "step": 7925 + }, + { + "epoch": 1.4111467236467237, + "grad_norm": 0.6069912314414978, + "learning_rate": 0.000144779357576084, + "loss": 0.842, + "step": 7926 + }, + { + "epoch": 1.4113247863247862, + "grad_norm": 0.6212135553359985, + "learning_rate": 0.00014476684148015243, + "loss": 0.9817, + "step": 7927 + }, + { + "epoch": 1.411502849002849, + "grad_norm": 0.6893343329429626, + "learning_rate": 0.00014475432450710763, + "loss": 1.0265, + "step": 7928 + }, + { + "epoch": 1.4116809116809117, + "grad_norm": 0.6842793822288513, + "learning_rate": 0.00014474180665719478, + "loss": 1.0593, + "step": 7929 + }, + { + "epoch": 1.4118589743589745, + "grad_norm": 0.74690842628479, + "learning_rate": 0.0001447292879306592, + "loss": 0.9096, + "step": 7930 + }, + { + "epoch": 1.412037037037037, + "grad_norm": 0.6624761819839478, + "learning_rate": 0.00014471676832774613, + "loss": 1.2244, + "step": 7931 + }, + { + "epoch": 1.4122150997150997, + "grad_norm": 0.6205778121948242, + "learning_rate": 0.00014470424784870088, + "loss": 1.1, + "step": 7932 + }, + { + "epoch": 1.4123931623931623, + "grad_norm": 0.7592337131500244, + "learning_rate": 0.00014469172649376875, + "loss": 0.963, + "step": 7933 + }, + { + "epoch": 1.412571225071225, + "grad_norm": 0.673328697681427, + "learning_rate": 0.00014467920426319508, + "loss": 0.8923, + "step": 7934 + }, + { + "epoch": 1.4127492877492878, + "grad_norm": 0.6064394116401672, + "learning_rate": 
0.00014466668115722522, + "loss": 0.9679, + "step": 7935 + }, + { + "epoch": 1.4129273504273505, + "grad_norm": 0.7738677859306335, + "learning_rate": 0.00014465415717610454, + "loss": 1.0678, + "step": 7936 + }, + { + "epoch": 1.413105413105413, + "grad_norm": 0.7013397812843323, + "learning_rate": 0.00014464163232007836, + "loss": 0.9017, + "step": 7937 + }, + { + "epoch": 1.4132834757834758, + "grad_norm": 0.713291347026825, + "learning_rate": 0.0001446291065893922, + "loss": 1.1953, + "step": 7938 + }, + { + "epoch": 1.4134615384615383, + "grad_norm": 0.7538655996322632, + "learning_rate": 0.00014461657998429136, + "loss": 1.0571, + "step": 7939 + }, + { + "epoch": 1.413639601139601, + "grad_norm": 0.6358973383903503, + "learning_rate": 0.00014460405250502133, + "loss": 0.8552, + "step": 7940 + }, + { + "epoch": 1.4138176638176638, + "grad_norm": 0.67508864402771, + "learning_rate": 0.00014459152415182756, + "loss": 1.0293, + "step": 7941 + }, + { + "epoch": 1.4139957264957266, + "grad_norm": 0.7074598670005798, + "learning_rate": 0.00014457899492495546, + "loss": 1.2102, + "step": 7942 + }, + { + "epoch": 1.414173789173789, + "grad_norm": 0.7157037854194641, + "learning_rate": 0.00014456646482465058, + "loss": 1.0566, + "step": 7943 + }, + { + "epoch": 1.4143518518518519, + "grad_norm": 0.7918477058410645, + "learning_rate": 0.00014455393385115844, + "loss": 1.3727, + "step": 7944 + }, + { + "epoch": 1.4145299145299146, + "grad_norm": 0.569144606590271, + "learning_rate": 0.0001445414020047245, + "loss": 0.7251, + "step": 7945 + }, + { + "epoch": 1.4147079772079771, + "grad_norm": 0.7589054107666016, + "learning_rate": 0.0001445288692855943, + "loss": 1.0155, + "step": 7946 + }, + { + "epoch": 1.41488603988604, + "grad_norm": 0.7531685829162598, + "learning_rate": 0.0001445163356940134, + "loss": 0.8404, + "step": 7947 + }, + { + "epoch": 1.4150641025641026, + "grad_norm": 0.5730917453765869, + "learning_rate": 0.0001445038012302274, + "loss": 0.8215, + 
"step": 7948 + }, + { + "epoch": 1.4152421652421652, + "grad_norm": 0.6960710883140564, + "learning_rate": 0.00014449126589448187, + "loss": 0.7902, + "step": 7949 + }, + { + "epoch": 1.415420227920228, + "grad_norm": 0.8207054138183594, + "learning_rate": 0.0001444787296870224, + "loss": 1.493, + "step": 7950 + }, + { + "epoch": 1.4155982905982907, + "grad_norm": 0.5854668617248535, + "learning_rate": 0.00014446619260809462, + "loss": 0.9262, + "step": 7951 + }, + { + "epoch": 1.4157763532763532, + "grad_norm": 0.5458414554595947, + "learning_rate": 0.00014445365465794413, + "loss": 0.8431, + "step": 7952 + }, + { + "epoch": 1.415954415954416, + "grad_norm": 0.6880569458007812, + "learning_rate": 0.00014444111583681666, + "loss": 1.0184, + "step": 7953 + }, + { + "epoch": 1.4161324786324787, + "grad_norm": 0.6391083598136902, + "learning_rate": 0.00014442857614495783, + "loss": 0.88, + "step": 7954 + }, + { + "epoch": 1.4163105413105412, + "grad_norm": 0.6246135234832764, + "learning_rate": 0.00014441603558261335, + "loss": 0.776, + "step": 7955 + }, + { + "epoch": 1.416488603988604, + "grad_norm": 0.6263493895530701, + "learning_rate": 0.00014440349415002893, + "loss": 0.9069, + "step": 7956 + }, + { + "epoch": 1.4166666666666667, + "grad_norm": 0.7123475670814514, + "learning_rate": 0.00014439095184745024, + "loss": 0.8339, + "step": 7957 + }, + { + "epoch": 1.4168447293447293, + "grad_norm": 0.7171050906181335, + "learning_rate": 0.00014437840867512309, + "loss": 1.0633, + "step": 7958 + }, + { + "epoch": 1.417022792022792, + "grad_norm": 0.7097769975662231, + "learning_rate": 0.00014436586463329322, + "loss": 1.0852, + "step": 7959 + }, + { + "epoch": 1.4172008547008548, + "grad_norm": 0.6889223456382751, + "learning_rate": 0.00014435331972220637, + "loss": 0.916, + "step": 7960 + }, + { + "epoch": 1.4173789173789173, + "grad_norm": 0.6674435138702393, + "learning_rate": 0.0001443407739421084, + "loss": 0.9307, + "step": 7961 + }, + { + "epoch": 
1.41755698005698, + "grad_norm": 0.6578894853591919, + "learning_rate": 0.00014432822729324503, + "loss": 0.8767, + "step": 7962 + }, + { + "epoch": 1.4177350427350428, + "grad_norm": 0.7145379781723022, + "learning_rate": 0.00014431567977586212, + "loss": 0.9962, + "step": 7963 + }, + { + "epoch": 1.4179131054131053, + "grad_norm": 0.6916680335998535, + "learning_rate": 0.00014430313139020555, + "loss": 1.0464, + "step": 7964 + }, + { + "epoch": 1.418091168091168, + "grad_norm": 0.6296181678771973, + "learning_rate": 0.00014429058213652116, + "loss": 1.0699, + "step": 7965 + }, + { + "epoch": 1.4182692307692308, + "grad_norm": 0.5640227198600769, + "learning_rate": 0.00014427803201505482, + "loss": 0.7006, + "step": 7966 + }, + { + "epoch": 1.4184472934472934, + "grad_norm": 0.7181212306022644, + "learning_rate": 0.0001442654810260524, + "loss": 1.1648, + "step": 7967 + }, + { + "epoch": 1.4186253561253561, + "grad_norm": 0.6830772757530212, + "learning_rate": 0.00014425292916975984, + "loss": 1.0641, + "step": 7968 + }, + { + "epoch": 1.4188034188034189, + "grad_norm": 0.665716290473938, + "learning_rate": 0.00014424037644642307, + "loss": 0.8769, + "step": 7969 + }, + { + "epoch": 1.4189814814814814, + "grad_norm": 0.8088666796684265, + "learning_rate": 0.00014422782285628802, + "loss": 1.1496, + "step": 7970 + }, + { + "epoch": 1.4191595441595442, + "grad_norm": 0.7186072468757629, + "learning_rate": 0.00014421526839960064, + "loss": 0.7421, + "step": 7971 + }, + { + "epoch": 1.419337606837607, + "grad_norm": 0.6405926942825317, + "learning_rate": 0.00014420271307660694, + "loss": 1.0139, + "step": 7972 + }, + { + "epoch": 1.4195156695156697, + "grad_norm": 0.7097104787826538, + "learning_rate": 0.0001441901568875529, + "loss": 1.1582, + "step": 7973 + }, + { + "epoch": 1.4196937321937322, + "grad_norm": 0.7347947359085083, + "learning_rate": 0.00014417759983268452, + "loss": 0.9751, + "step": 7974 + }, + { + "epoch": 1.419871794871795, + "grad_norm": 
0.6999621987342834, + "learning_rate": 0.00014416504191224787, + "loss": 0.9419, + "step": 7975 + }, + { + "epoch": 1.4200498575498575, + "grad_norm": 0.6500616073608398, + "learning_rate": 0.00014415248312648897, + "loss": 0.9407, + "step": 7976 + }, + { + "epoch": 1.4202279202279202, + "grad_norm": 0.6368781328201294, + "learning_rate": 0.00014413992347565383, + "loss": 1.1224, + "step": 7977 + }, + { + "epoch": 1.420405982905983, + "grad_norm": 0.6422648429870605, + "learning_rate": 0.00014412736295998864, + "loss": 0.9573, + "step": 7978 + }, + { + "epoch": 1.4205840455840457, + "grad_norm": 0.744057297706604, + "learning_rate": 0.00014411480157973942, + "loss": 1.1384, + "step": 7979 + }, + { + "epoch": 1.4207621082621082, + "grad_norm": 0.5905839204788208, + "learning_rate": 0.00014410223933515232, + "loss": 0.8212, + "step": 7980 + }, + { + "epoch": 1.420940170940171, + "grad_norm": 0.5905438661575317, + "learning_rate": 0.0001440896762264734, + "loss": 0.8281, + "step": 7981 + }, + { + "epoch": 1.4211182336182335, + "grad_norm": 0.7087140679359436, + "learning_rate": 0.00014407711225394892, + "loss": 1.0165, + "step": 7982 + }, + { + "epoch": 1.4212962962962963, + "grad_norm": 0.6173902153968811, + "learning_rate": 0.00014406454741782495, + "loss": 0.8823, + "step": 7983 + }, + { + "epoch": 1.421474358974359, + "grad_norm": 0.6649761199951172, + "learning_rate": 0.00014405198171834772, + "loss": 0.9489, + "step": 7984 + }, + { + "epoch": 1.4216524216524218, + "grad_norm": 0.619286835193634, + "learning_rate": 0.00014403941515576344, + "loss": 0.8149, + "step": 7985 + }, + { + "epoch": 1.4218304843304843, + "grad_norm": 0.6358469724655151, + "learning_rate": 0.0001440268477303183, + "loss": 1.0558, + "step": 7986 + }, + { + "epoch": 1.422008547008547, + "grad_norm": 0.7239769697189331, + "learning_rate": 0.0001440142794422585, + "loss": 1.0528, + "step": 7987 + }, + { + "epoch": 1.4221866096866096, + "grad_norm": 0.681168794631958, + "learning_rate": 
0.00014400171029183036, + "loss": 1.0867, + "step": 7988 + }, + { + "epoch": 1.4223646723646723, + "grad_norm": 0.6741157174110413, + "learning_rate": 0.0001439891402792801, + "loss": 0.9153, + "step": 7989 + }, + { + "epoch": 1.422542735042735, + "grad_norm": 0.5881659984588623, + "learning_rate": 0.00014397656940485403, + "loss": 0.92, + "step": 7990 + }, + { + "epoch": 1.4227207977207978, + "grad_norm": 0.637093722820282, + "learning_rate": 0.00014396399766879842, + "loss": 0.921, + "step": 7991 + }, + { + "epoch": 1.4228988603988604, + "grad_norm": 0.7760605216026306, + "learning_rate": 0.0001439514250713596, + "loss": 1.1451, + "step": 7992 + }, + { + "epoch": 1.4230769230769231, + "grad_norm": 0.6619600653648376, + "learning_rate": 0.00014393885161278393, + "loss": 1.0365, + "step": 7993 + }, + { + "epoch": 1.4232549857549857, + "grad_norm": 0.5354374051094055, + "learning_rate": 0.0001439262772933177, + "loss": 0.8718, + "step": 7994 + }, + { + "epoch": 1.4234330484330484, + "grad_norm": 0.7063560485839844, + "learning_rate": 0.00014391370211320735, + "loss": 0.8258, + "step": 7995 + }, + { + "epoch": 1.4236111111111112, + "grad_norm": 0.6876368522644043, + "learning_rate": 0.00014390112607269923, + "loss": 0.9579, + "step": 7996 + }, + { + "epoch": 1.423789173789174, + "grad_norm": 0.6976612210273743, + "learning_rate": 0.00014388854917203974, + "loss": 1.0376, + "step": 7997 + }, + { + "epoch": 1.4239672364672364, + "grad_norm": 0.6157355308532715, + "learning_rate": 0.00014387597141147525, + "loss": 0.8743, + "step": 7998 + }, + { + "epoch": 1.4241452991452992, + "grad_norm": 0.7273156046867371, + "learning_rate": 0.0001438633927912523, + "loss": 1.101, + "step": 7999 + }, + { + "epoch": 1.4243233618233617, + "grad_norm": 0.918380618095398, + "learning_rate": 0.0001438508133116173, + "loss": 0.9625, + "step": 8000 + }, + { + "epoch": 1.4245014245014245, + "grad_norm": 0.626040518283844, + "learning_rate": 0.00014383823297281666, + "loss": 0.9552, + 
"step": 8001 + }, + { + "epoch": 1.4246794871794872, + "grad_norm": 0.7320386171340942, + "learning_rate": 0.00014382565177509693, + "loss": 1.0719, + "step": 8002 + }, + { + "epoch": 1.42485754985755, + "grad_norm": 0.7283148169517517, + "learning_rate": 0.0001438130697187046, + "loss": 1.0455, + "step": 8003 + }, + { + "epoch": 1.4250356125356125, + "grad_norm": 0.6614177823066711, + "learning_rate": 0.00014380048680388613, + "loss": 0.9876, + "step": 8004 + }, + { + "epoch": 1.4252136752136753, + "grad_norm": 0.6726453900337219, + "learning_rate": 0.00014378790303088817, + "loss": 0.9861, + "step": 8005 + }, + { + "epoch": 1.4253917378917378, + "grad_norm": 0.7968725562095642, + "learning_rate": 0.00014377531839995718, + "loss": 1.1662, + "step": 8006 + }, + { + "epoch": 1.4255698005698005, + "grad_norm": 0.6510586738586426, + "learning_rate": 0.0001437627329113398, + "loss": 0.9452, + "step": 8007 + }, + { + "epoch": 1.4257478632478633, + "grad_norm": 0.6933155655860901, + "learning_rate": 0.00014375014656528253, + "loss": 1.0149, + "step": 8008 + }, + { + "epoch": 1.425925925925926, + "grad_norm": 0.7141832113265991, + "learning_rate": 0.00014373755936203204, + "loss": 1.0667, + "step": 8009 + }, + { + "epoch": 1.4261039886039886, + "grad_norm": 0.6352181434631348, + "learning_rate": 0.00014372497130183494, + "loss": 0.8652, + "step": 8010 + }, + { + "epoch": 1.4262820512820513, + "grad_norm": 0.7494860291481018, + "learning_rate": 0.00014371238238493786, + "loss": 0.9592, + "step": 8011 + }, + { + "epoch": 1.4264601139601139, + "grad_norm": 0.610556423664093, + "learning_rate": 0.00014369979261158746, + "loss": 0.7015, + "step": 8012 + }, + { + "epoch": 1.4266381766381766, + "grad_norm": 0.7305756211280823, + "learning_rate": 0.00014368720198203037, + "loss": 0.9681, + "step": 8013 + }, + { + "epoch": 1.4268162393162394, + "grad_norm": 0.6964020133018494, + "learning_rate": 0.0001436746104965133, + "loss": 1.1166, + "step": 8014 + }, + { + "epoch": 
1.426994301994302, + "grad_norm": 0.7449237108230591, + "learning_rate": 0.00014366201815528302, + "loss": 1.1331, + "step": 8015 + }, + { + "epoch": 1.4271723646723646, + "grad_norm": 0.625834047794342, + "learning_rate": 0.00014364942495858615, + "loss": 0.8796, + "step": 8016 + }, + { + "epoch": 1.4273504273504274, + "grad_norm": 0.664559006690979, + "learning_rate": 0.0001436368309066695, + "loss": 1.0263, + "step": 8017 + }, + { + "epoch": 1.42752849002849, + "grad_norm": Infinity, + "learning_rate": 0.0001436368309066695, + "loss": 1.0731, + "step": 8018 + }, + { + "epoch": 1.4277065527065527, + "grad_norm": 0.6714464426040649, + "learning_rate": 0.00014362423599977977, + "loss": 0.9345, + "step": 8019 + }, + { + "epoch": 1.4278846153846154, + "grad_norm": 0.7595751285552979, + "learning_rate": 0.00014361164023816376, + "loss": 0.9646, + "step": 8020 + }, + { + "epoch": 1.4280626780626782, + "grad_norm": 0.6413954496383667, + "learning_rate": 0.00014359904362206828, + "loss": 1.0471, + "step": 8021 + }, + { + "epoch": 1.4282407407407407, + "grad_norm": 0.7298843264579773, + "learning_rate": 0.00014358644615174008, + "loss": 0.8932, + "step": 8022 + }, + { + "epoch": 1.4284188034188035, + "grad_norm": 0.8022156953811646, + "learning_rate": 0.00014357384782742602, + "loss": 1.0437, + "step": 8023 + }, + { + "epoch": 1.428596866096866, + "grad_norm": 0.7264443635940552, + "learning_rate": 0.00014356124864937296, + "loss": 0.9368, + "step": 8024 + }, + { + "epoch": 1.4287749287749287, + "grad_norm": 0.6819384098052979, + "learning_rate": 0.00014354864861782768, + "loss": 1.0, + "step": 8025 + }, + { + "epoch": 1.4289529914529915, + "grad_norm": 0.5945104956626892, + "learning_rate": 0.0001435360477330371, + "loss": 0.8108, + "step": 8026 + }, + { + "epoch": 1.4291310541310542, + "grad_norm": 0.6497398018836975, + "learning_rate": 0.0001435234459952481, + "loss": 0.8712, + "step": 8027 + }, + { + "epoch": 1.4293091168091168, + "grad_norm": 0.6424077749252319, + 
"learning_rate": 0.0001435108434047076, + "loss": 0.9172, + "step": 8028 + }, + { + "epoch": 1.4294871794871795, + "grad_norm": 0.6806963086128235, + "learning_rate": 0.00014349823996166253, + "loss": 1.1648, + "step": 8029 + }, + { + "epoch": 1.429665242165242, + "grad_norm": 0.6601083874702454, + "learning_rate": 0.00014348563566635977, + "loss": 0.9453, + "step": 8030 + }, + { + "epoch": 1.4298433048433048, + "grad_norm": 0.7024385929107666, + "learning_rate": 0.00014347303051904636, + "loss": 1.074, + "step": 8031 + }, + { + "epoch": 1.4300213675213675, + "grad_norm": 0.7094005942344666, + "learning_rate": 0.00014346042451996918, + "loss": 0.9976, + "step": 8032 + }, + { + "epoch": 1.4301994301994303, + "grad_norm": 0.6775936484336853, + "learning_rate": 0.0001434478176693753, + "loss": 0.9039, + "step": 8033 + }, + { + "epoch": 1.4303774928774928, + "grad_norm": 0.6920986771583557, + "learning_rate": 0.00014343520996751166, + "loss": 0.9122, + "step": 8034 + }, + { + "epoch": 1.4305555555555556, + "grad_norm": 0.720690906047821, + "learning_rate": 0.00014342260141462528, + "loss": 1.1028, + "step": 8035 + }, + { + "epoch": 1.430733618233618, + "grad_norm": 0.624546229839325, + "learning_rate": 0.00014340999201096328, + "loss": 0.9083, + "step": 8036 + }, + { + "epoch": 1.4309116809116809, + "grad_norm": 0.6560490727424622, + "learning_rate": 0.00014339738175677265, + "loss": 0.8029, + "step": 8037 + }, + { + "epoch": 1.4310897435897436, + "grad_norm": 0.8266100883483887, + "learning_rate": 0.00014338477065230047, + "loss": 0.9655, + "step": 8038 + }, + { + "epoch": 1.4312678062678064, + "grad_norm": 0.6593570113182068, + "learning_rate": 0.00014337215869779385, + "loss": 1.0299, + "step": 8039 + }, + { + "epoch": 1.431445868945869, + "grad_norm": 0.6321794390678406, + "learning_rate": 0.00014335954589349986, + "loss": 0.8755, + "step": 8040 + }, + { + "epoch": 1.4316239316239316, + "grad_norm": 0.7030870318412781, + "learning_rate": 0.00014334693223966562, + 
"loss": 1.1226, + "step": 8041 + }, + { + "epoch": 1.4318019943019942, + "grad_norm": 0.7794312238693237, + "learning_rate": 0.0001433343177365383, + "loss": 1.1252, + "step": 8042 + }, + { + "epoch": 1.431980056980057, + "grad_norm": 0.6115018129348755, + "learning_rate": 0.00014332170238436507, + "loss": 0.8753, + "step": 8043 + }, + { + "epoch": 1.4321581196581197, + "grad_norm": 0.8525674939155579, + "learning_rate": 0.00014330908618339304, + "loss": 0.9135, + "step": 8044 + }, + { + "epoch": 1.4323361823361824, + "grad_norm": 0.6869912147521973, + "learning_rate": 0.00014329646913386948, + "loss": 0.868, + "step": 8045 + }, + { + "epoch": 1.432514245014245, + "grad_norm": 0.5877542495727539, + "learning_rate": 0.0001432838512360415, + "loss": 0.9051, + "step": 8046 + }, + { + "epoch": 1.4326923076923077, + "grad_norm": 0.6609327793121338, + "learning_rate": 0.0001432712324901564, + "loss": 0.9084, + "step": 8047 + }, + { + "epoch": 1.4328703703703702, + "grad_norm": 0.6318345069885254, + "learning_rate": 0.0001432586128964614, + "loss": 0.8291, + "step": 8048 + }, + { + "epoch": 1.433048433048433, + "grad_norm": 0.6973567008972168, + "learning_rate": 0.0001432459924552037, + "loss": 0.97, + "step": 8049 + }, + { + "epoch": 1.4332264957264957, + "grad_norm": 0.6838201284408569, + "learning_rate": 0.00014323337116663062, + "loss": 1.0957, + "step": 8050 + }, + { + "epoch": 1.4334045584045585, + "grad_norm": 0.7472857236862183, + "learning_rate": 0.00014322074903098944, + "loss": 1.0981, + "step": 8051 + }, + { + "epoch": 1.433582621082621, + "grad_norm": 0.7723061442375183, + "learning_rate": 0.0001432081260485275, + "loss": 1.2231, + "step": 8052 + }, + { + "epoch": 1.4337606837606838, + "grad_norm": 0.681834876537323, + "learning_rate": 0.00014319550221949208, + "loss": 1.073, + "step": 8053 + }, + { + "epoch": 1.4339387464387463, + "grad_norm": 0.6566045880317688, + "learning_rate": 0.00014318287754413051, + "loss": 1.1298, + "step": 8054 + }, + { + "epoch": 
1.434116809116809, + "grad_norm": 0.6792440414428711, + "learning_rate": 0.00014317025202269015, + "loss": 1.2224, + "step": 8055 + }, + { + "epoch": 1.4342948717948718, + "grad_norm": 0.7946709394454956, + "learning_rate": 0.00014315762565541838, + "loss": 1.0728, + "step": 8056 + }, + { + "epoch": 1.4344729344729346, + "grad_norm": 0.633466899394989, + "learning_rate": 0.00014314499844256262, + "loss": 0.944, + "step": 8057 + }, + { + "epoch": 1.434650997150997, + "grad_norm": 0.7308502197265625, + "learning_rate": 0.00014313237038437023, + "loss": 1.0684, + "step": 8058 + }, + { + "epoch": 1.4348290598290598, + "grad_norm": 0.6483737230300903, + "learning_rate": 0.00014311974148108862, + "loss": 1.0843, + "step": 8059 + }, + { + "epoch": 1.4350071225071226, + "grad_norm": 0.6301209926605225, + "learning_rate": 0.00014310711173296526, + "loss": 1.0083, + "step": 8060 + }, + { + "epoch": 1.4351851851851851, + "grad_norm": 0.6674302816390991, + "learning_rate": 0.00014309448114024757, + "loss": 0.9877, + "step": 8061 + }, + { + "epoch": 1.4353632478632479, + "grad_norm": 0.6888732314109802, + "learning_rate": 0.00014308184970318307, + "loss": 0.9937, + "step": 8062 + }, + { + "epoch": 1.4355413105413106, + "grad_norm": 0.6922950148582458, + "learning_rate": 0.00014306921742201923, + "loss": 1.0149, + "step": 8063 + }, + { + "epoch": 1.4357193732193732, + "grad_norm": 0.6050686240196228, + "learning_rate": 0.00014305658429700352, + "loss": 0.7882, + "step": 8064 + }, + { + "epoch": 1.435897435897436, + "grad_norm": 0.5080767869949341, + "learning_rate": 0.00014304395032838348, + "loss": 0.7796, + "step": 8065 + }, + { + "epoch": 1.4360754985754987, + "grad_norm": 0.6382707953453064, + "learning_rate": 0.00014303131551640668, + "loss": 0.965, + "step": 8066 + }, + { + "epoch": 1.4362535612535612, + "grad_norm": 0.7153477668762207, + "learning_rate": 0.00014301867986132063, + "loss": 1.1277, + "step": 8067 + }, + { + "epoch": 1.436431623931624, + "grad_norm": 
0.6208404898643494, + "learning_rate": 0.00014300604336337292, + "loss": 0.8246, + "step": 8068 + }, + { + "epoch": 1.4366096866096867, + "grad_norm": 0.719695508480072, + "learning_rate": 0.0001429934060228111, + "loss": 0.7681, + "step": 8069 + }, + { + "epoch": 1.4367877492877492, + "grad_norm": 0.6219030618667603, + "learning_rate": 0.0001429807678398828, + "loss": 1.0425, + "step": 8070 + }, + { + "epoch": 1.436965811965812, + "grad_norm": 0.6080238819122314, + "learning_rate": 0.00014296812881483566, + "loss": 0.8762, + "step": 8071 + }, + { + "epoch": 1.4371438746438747, + "grad_norm": 0.6264194846153259, + "learning_rate": 0.00014295548894791729, + "loss": 1.087, + "step": 8072 + }, + { + "epoch": 1.4373219373219372, + "grad_norm": 0.6503600478172302, + "learning_rate": 0.00014294284823937535, + "loss": 1.0583, + "step": 8073 + }, + { + "epoch": 1.4375, + "grad_norm": 0.7623817324638367, + "learning_rate": 0.0001429302066894575, + "loss": 1.2372, + "step": 8074 + }, + { + "epoch": 1.4376780626780628, + "grad_norm": 0.7020344138145447, + "learning_rate": 0.00014291756429841144, + "loss": 1.2163, + "step": 8075 + }, + { + "epoch": 1.4378561253561253, + "grad_norm": 0.7070338129997253, + "learning_rate": 0.00014290492106648484, + "loss": 0.986, + "step": 8076 + }, + { + "epoch": 1.438034188034188, + "grad_norm": 0.6407621502876282, + "learning_rate": 0.00014289227699392545, + "loss": 0.9329, + "step": 8077 + }, + { + "epoch": 1.4382122507122508, + "grad_norm": 0.6836710572242737, + "learning_rate": 0.00014287963208098098, + "loss": 0.9252, + "step": 8078 + }, + { + "epoch": 1.4383903133903133, + "grad_norm": 0.648642897605896, + "learning_rate": 0.00014286698632789922, + "loss": 1.0457, + "step": 8079 + }, + { + "epoch": 1.438568376068376, + "grad_norm": 0.7015881538391113, + "learning_rate": 0.0001428543397349279, + "loss": 1.0516, + "step": 8080 + }, + { + "epoch": 1.4387464387464388, + "grad_norm": 0.6031532883644104, + "learning_rate": 
0.0001428416923023148, + "loss": 0.9423, + "step": 8081 + }, + { + "epoch": 1.4389245014245013, + "grad_norm": 0.8235578536987305, + "learning_rate": 0.00014282904403030772, + "loss": 1.3433, + "step": 8082 + }, + { + "epoch": 1.439102564102564, + "grad_norm": 0.7355761528015137, + "learning_rate": 0.00014281639491915452, + "loss": 1.0128, + "step": 8083 + }, + { + "epoch": 1.4392806267806268, + "grad_norm": 0.7429629564285278, + "learning_rate": 0.00014280374496910303, + "loss": 0.8546, + "step": 8084 + }, + { + "epoch": 1.4394586894586894, + "grad_norm": 0.5831776857376099, + "learning_rate": 0.00014279109418040105, + "loss": 0.9021, + "step": 8085 + }, + { + "epoch": 1.4396367521367521, + "grad_norm": 0.6585184931755066, + "learning_rate": 0.00014277844255329645, + "loss": 0.9256, + "step": 8086 + }, + { + "epoch": 1.4398148148148149, + "grad_norm": 0.6412501931190491, + "learning_rate": 0.00014276579008803717, + "loss": 0.9305, + "step": 8087 + }, + { + "epoch": 1.4399928774928774, + "grad_norm": 0.6305423378944397, + "learning_rate": 0.00014275313678487102, + "loss": 0.9471, + "step": 8088 + }, + { + "epoch": 1.4401709401709402, + "grad_norm": 0.7160914540290833, + "learning_rate": 0.00014274048264404602, + "loss": 0.8798, + "step": 8089 + }, + { + "epoch": 1.440349002849003, + "grad_norm": 0.6740858554840088, + "learning_rate": 0.00014272782766581004, + "loss": 0.9022, + "step": 8090 + }, + { + "epoch": 1.4405270655270654, + "grad_norm": 0.7554821968078613, + "learning_rate": 0.000142715171850411, + "loss": 1.0924, + "step": 8091 + }, + { + "epoch": 1.4407051282051282, + "grad_norm": 0.7361162304878235, + "learning_rate": 0.00014270251519809694, + "loss": 0.9907, + "step": 8092 + }, + { + "epoch": 1.440883190883191, + "grad_norm": 0.731813371181488, + "learning_rate": 0.0001426898577091158, + "loss": 1.1765, + "step": 8093 + }, + { + "epoch": 1.4410612535612537, + "grad_norm": 0.6877756714820862, + "learning_rate": 0.00014267719938371558, + "loss": 1.0536, + 
"step": 8094 + }, + { + "epoch": 1.4412393162393162, + "grad_norm": 0.6724407076835632, + "learning_rate": 0.00014266454022214426, + "loss": 1.1895, + "step": 8095 + }, + { + "epoch": 1.441417378917379, + "grad_norm": 0.6946671605110168, + "learning_rate": 0.0001426518802246499, + "loss": 1.0437, + "step": 8096 + }, + { + "epoch": 1.4415954415954415, + "grad_norm": 0.7032839059829712, + "learning_rate": 0.00014263921939148058, + "loss": 1.1363, + "step": 8097 + }, + { + "epoch": 1.4417735042735043, + "grad_norm": 0.6942192316055298, + "learning_rate": 0.00014262655772288434, + "loss": 1.315, + "step": 8098 + }, + { + "epoch": 1.441951566951567, + "grad_norm": 0.7002301812171936, + "learning_rate": 0.00014261389521910922, + "loss": 1.0546, + "step": 8099 + }, + { + "epoch": 1.4421296296296298, + "grad_norm": 0.7260788083076477, + "learning_rate": 0.00014260123188040335, + "loss": 0.9374, + "step": 8100 + }, + { + "epoch": 1.4423076923076923, + "grad_norm": 0.6629201173782349, + "learning_rate": 0.00014258856770701486, + "loss": 0.8632, + "step": 8101 + }, + { + "epoch": 1.442485754985755, + "grad_norm": 0.6570318937301636, + "learning_rate": 0.0001425759026991918, + "loss": 1.0102, + "step": 8102 + }, + { + "epoch": 1.4426638176638176, + "grad_norm": 0.7696560621261597, + "learning_rate": 0.00014256323685718242, + "loss": 0.9703, + "step": 8103 + }, + { + "epoch": 1.4428418803418803, + "grad_norm": 0.7206611633300781, + "learning_rate": 0.00014255057018123482, + "loss": 1.1728, + "step": 8104 + }, + { + "epoch": 1.443019943019943, + "grad_norm": 0.6871611475944519, + "learning_rate": 0.0001425379026715972, + "loss": 0.9377, + "step": 8105 + }, + { + "epoch": 1.4431980056980058, + "grad_norm": 0.6027442812919617, + "learning_rate": 0.00014252523432851775, + "loss": 0.9212, + "step": 8106 + }, + { + "epoch": 1.4433760683760684, + "grad_norm": 0.7149752378463745, + "learning_rate": 0.00014251256515224463, + "loss": 0.9654, + "step": 8107 + }, + { + "epoch": 
1.443554131054131, + "grad_norm": 0.5949522256851196, + "learning_rate": 0.00014249989514302614, + "loss": 1.0646, + "step": 8108 + }, + { + "epoch": 1.4437321937321936, + "grad_norm": 0.7345452904701233, + "learning_rate": 0.0001424872243011105, + "loss": 0.9801, + "step": 8109 + }, + { + "epoch": 1.4439102564102564, + "grad_norm": 0.8045009970664978, + "learning_rate": 0.00014247455262674592, + "loss": 1.3529, + "step": 8110 + }, + { + "epoch": 1.4440883190883191, + "grad_norm": 0.6712123155593872, + "learning_rate": 0.00014246188012018073, + "loss": 1.0416, + "step": 8111 + }, + { + "epoch": 1.444266381766382, + "grad_norm": 0.7811154127120972, + "learning_rate": 0.00014244920678166322, + "loss": 1.2019, + "step": 8112 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.6834486126899719, + "learning_rate": 0.00014243653261144167, + "loss": 0.986, + "step": 8113 + }, + { + "epoch": 1.4446225071225072, + "grad_norm": 0.6901041269302368, + "learning_rate": 0.00014242385760976443, + "loss": 1.0988, + "step": 8114 + }, + { + "epoch": 1.4448005698005697, + "grad_norm": 0.6233634948730469, + "learning_rate": 0.00014241118177687982, + "loss": 0.7748, + "step": 8115 + }, + { + "epoch": 1.4449786324786325, + "grad_norm": 0.6899837851524353, + "learning_rate": 0.00014239850511303624, + "loss": 0.9734, + "step": 8116 + }, + { + "epoch": 1.4451566951566952, + "grad_norm": 0.6316244006156921, + "learning_rate": 0.00014238582761848197, + "loss": 0.7888, + "step": 8117 + }, + { + "epoch": 1.445334757834758, + "grad_norm": 0.6074259877204895, + "learning_rate": 0.00014237314929346545, + "loss": 0.8843, + "step": 8118 + }, + { + "epoch": 1.4455128205128205, + "grad_norm": 0.6112192273139954, + "learning_rate": 0.00014236047013823516, + "loss": 0.8529, + "step": 8119 + }, + { + "epoch": 1.4456908831908832, + "grad_norm": 0.6883894801139832, + "learning_rate": 0.0001423477901530394, + "loss": 0.9506, + "step": 8120 + }, + { + "epoch": 1.4458689458689458, + "grad_norm": 
0.7248309254646301, + "learning_rate": 0.00014233510933812666, + "loss": 0.9573, + "step": 8121 + }, + { + "epoch": 1.4460470085470085, + "grad_norm": 0.6853367686271667, + "learning_rate": 0.00014232242769374542, + "loss": 0.9903, + "step": 8122 + }, + { + "epoch": 1.4462250712250713, + "grad_norm": 0.7179274559020996, + "learning_rate": 0.0001423097452201441, + "loss": 0.9157, + "step": 8123 + }, + { + "epoch": 1.446403133903134, + "grad_norm": 0.6704817414283752, + "learning_rate": 0.00014229706191757127, + "loss": 1.1361, + "step": 8124 + }, + { + "epoch": 1.4465811965811965, + "grad_norm": 0.6380739212036133, + "learning_rate": 0.00014228437778627533, + "loss": 0.9336, + "step": 8125 + }, + { + "epoch": 1.4467592592592593, + "grad_norm": 0.6275362372398376, + "learning_rate": 0.00014227169282650487, + "loss": 0.9617, + "step": 8126 + }, + { + "epoch": 1.4469373219373218, + "grad_norm": 0.5644828677177429, + "learning_rate": 0.00014225900703850836, + "loss": 0.7384, + "step": 8127 + }, + { + "epoch": 1.4471153846153846, + "grad_norm": 0.6522284150123596, + "learning_rate": 0.00014224632042253443, + "loss": 1.1098, + "step": 8128 + }, + { + "epoch": 1.4472934472934473, + "grad_norm": 0.6228049993515015, + "learning_rate": 0.0001422336329788316, + "loss": 1.1061, + "step": 8129 + }, + { + "epoch": 1.44747150997151, + "grad_norm": 0.6092000603675842, + "learning_rate": 0.00014222094470764848, + "loss": 0.808, + "step": 8130 + }, + { + "epoch": 1.4476495726495726, + "grad_norm": 0.667435348033905, + "learning_rate": 0.00014220825560923363, + "loss": 1.1223, + "step": 8131 + }, + { + "epoch": 1.4478276353276354, + "grad_norm": 0.6080766320228577, + "learning_rate": 0.0001421955656838357, + "loss": 1.0099, + "step": 8132 + }, + { + "epoch": 1.448005698005698, + "grad_norm": 0.7597638368606567, + "learning_rate": 0.00014218287493170332, + "loss": 0.9718, + "step": 8133 + }, + { + "epoch": 1.4481837606837606, + "grad_norm": 0.574130654335022, + "learning_rate": 
0.0001421701833530851, + "loss": 0.7745, + "step": 8134 + }, + { + "epoch": 1.4483618233618234, + "grad_norm": 0.6372822523117065, + "learning_rate": 0.0001421574909482298, + "loss": 1.0088, + "step": 8135 + }, + { + "epoch": 1.4485398860398861, + "grad_norm": 0.6759644746780396, + "learning_rate": 0.000142144797717386, + "loss": 0.9684, + "step": 8136 + }, + { + "epoch": 1.4487179487179487, + "grad_norm": 0.706351637840271, + "learning_rate": 0.00014213210366080244, + "loss": 1.021, + "step": 8137 + }, + { + "epoch": 1.4488960113960114, + "grad_norm": 0.6976894736289978, + "learning_rate": 0.0001421194087787278, + "loss": 1.1038, + "step": 8138 + }, + { + "epoch": 1.449074074074074, + "grad_norm": 0.7322551012039185, + "learning_rate": 0.00014210671307141092, + "loss": 1.0213, + "step": 8139 + }, + { + "epoch": 1.4492521367521367, + "grad_norm": 0.5885626077651978, + "learning_rate": 0.0001420940165391004, + "loss": 0.821, + "step": 8140 + }, + { + "epoch": 1.4494301994301995, + "grad_norm": 0.7009791135787964, + "learning_rate": 0.0001420813191820451, + "loss": 0.8647, + "step": 8141 + }, + { + "epoch": 1.4496082621082622, + "grad_norm": 0.5715423822402954, + "learning_rate": 0.00014206862100049375, + "loss": 0.873, + "step": 8142 + }, + { + "epoch": 1.4497863247863247, + "grad_norm": 1.1452178955078125, + "learning_rate": 0.00014205592199469514, + "loss": 1.2523, + "step": 8143 + }, + { + "epoch": 1.4499643874643875, + "grad_norm": 0.8076814413070679, + "learning_rate": 0.00014204322216489814, + "loss": 1.1071, + "step": 8144 + }, + { + "epoch": 1.45014245014245, + "grad_norm": 0.7325751185417175, + "learning_rate": 0.00014203052151135154, + "loss": 0.9846, + "step": 8145 + }, + { + "epoch": 1.4503205128205128, + "grad_norm": 0.7009061574935913, + "learning_rate": 0.00014201782003430417, + "loss": 0.8153, + "step": 8146 + }, + { + "epoch": 1.4504985754985755, + "grad_norm": 0.6502353549003601, + "learning_rate": 0.0001420051177340049, + "loss": 0.8959, + "step": 
8147 + }, + { + "epoch": 1.4506766381766383, + "grad_norm": 0.6134430170059204, + "learning_rate": 0.00014199241461070261, + "loss": 0.9683, + "step": 8148 + }, + { + "epoch": 1.4508547008547008, + "grad_norm": 0.720160722732544, + "learning_rate": 0.0001419797106646462, + "loss": 0.9579, + "step": 8149 + }, + { + "epoch": 1.4510327635327636, + "grad_norm": 0.6141422986984253, + "learning_rate": 0.00014196700589608454, + "loss": 0.9427, + "step": 8150 + }, + { + "epoch": 1.451210826210826, + "grad_norm": 0.6835139393806458, + "learning_rate": 0.00014195430030526656, + "loss": 1.0374, + "step": 8151 + }, + { + "epoch": 1.4513888888888888, + "grad_norm": 0.6829691529273987, + "learning_rate": 0.00014194159389244128, + "loss": 0.9418, + "step": 8152 + }, + { + "epoch": 1.4515669515669516, + "grad_norm": 0.7142195701599121, + "learning_rate": 0.00014192888665785755, + "loss": 1.1876, + "step": 8153 + }, + { + "epoch": 1.4517450142450143, + "grad_norm": 0.6719943284988403, + "learning_rate": 0.0001419161786017644, + "loss": 1.1417, + "step": 8154 + }, + { + "epoch": 1.4519230769230769, + "grad_norm": 0.6478939652442932, + "learning_rate": 0.0001419034697244108, + "loss": 0.943, + "step": 8155 + }, + { + "epoch": 1.4521011396011396, + "grad_norm": 0.6308888792991638, + "learning_rate": 0.00014189076002604575, + "loss": 0.9842, + "step": 8156 + }, + { + "epoch": 1.4522792022792022, + "grad_norm": 0.673559844493866, + "learning_rate": 0.00014187804950691827, + "loss": 0.8108, + "step": 8157 + }, + { + "epoch": 1.452457264957265, + "grad_norm": 0.5895359516143799, + "learning_rate": 0.00014186533816727744, + "loss": 0.8187, + "step": 8158 + }, + { + "epoch": 1.4526353276353277, + "grad_norm": 0.6703287363052368, + "learning_rate": 0.00014185262600737225, + "loss": 0.9012, + "step": 8159 + }, + { + "epoch": 1.4528133903133904, + "grad_norm": 0.697728157043457, + "learning_rate": 0.00014183991302745182, + "loss": 1.2572, + "step": 8160 + }, + { + "epoch": 1.452991452991453, + 
"grad_norm": 0.599371075630188, + "learning_rate": 0.00014182719922776514, + "loss": 1.078, + "step": 8161 + }, + { + "epoch": 1.4531695156695157, + "grad_norm": 0.6774863600730896, + "learning_rate": 0.00014181448460856143, + "loss": 1.0607, + "step": 8162 + }, + { + "epoch": 1.4533475783475782, + "grad_norm": 0.6872009038925171, + "learning_rate": 0.00014180176917008976, + "loss": 1.0713, + "step": 8163 + }, + { + "epoch": 1.453525641025641, + "grad_norm": 0.7949981093406677, + "learning_rate": 0.00014178905291259926, + "loss": 1.0471, + "step": 8164 + }, + { + "epoch": 1.4537037037037037, + "grad_norm": 0.6592127084732056, + "learning_rate": 0.00014177633583633908, + "loss": 0.8409, + "step": 8165 + }, + { + "epoch": 1.4538817663817665, + "grad_norm": 0.6745635867118835, + "learning_rate": 0.00014176361794155837, + "loss": 1.0859, + "step": 8166 + }, + { + "epoch": 1.454059829059829, + "grad_norm": 0.6661605834960938, + "learning_rate": 0.00014175089922850633, + "loss": 1.0587, + "step": 8167 + }, + { + "epoch": 1.4542378917378918, + "grad_norm": 0.6697571873664856, + "learning_rate": 0.00014173817969743212, + "loss": 0.8876, + "step": 8168 + }, + { + "epoch": 1.4544159544159543, + "grad_norm": 0.6162588000297546, + "learning_rate": 0.000141725459348585, + "loss": 0.9575, + "step": 8169 + }, + { + "epoch": 1.454594017094017, + "grad_norm": 0.6235088109970093, + "learning_rate": 0.00014171273818221422, + "loss": 0.9209, + "step": 8170 + }, + { + "epoch": 1.4547720797720798, + "grad_norm": 0.6744212508201599, + "learning_rate": 0.00014170001619856896, + "loss": 0.9704, + "step": 8171 + }, + { + "epoch": 1.4549501424501425, + "grad_norm": 0.6781345009803772, + "learning_rate": 0.0001416872933978985, + "loss": 1.1507, + "step": 8172 + }, + { + "epoch": 1.455128205128205, + "grad_norm": 0.7160060405731201, + "learning_rate": 0.0001416745697804521, + "loss": 1.2529, + "step": 8173 + }, + { + "epoch": 1.4553062678062678, + "grad_norm": 0.6742389798164368, + 
"learning_rate": 0.00014166184534647913, + "loss": 1.0168, + "step": 8174 + }, + { + "epoch": 1.4554843304843303, + "grad_norm": 0.6685828566551208, + "learning_rate": 0.0001416491200962288, + "loss": 1.0807, + "step": 8175 + }, + { + "epoch": 1.455662393162393, + "grad_norm": 0.6998327374458313, + "learning_rate": 0.0001416363940299505, + "loss": 1.1711, + "step": 8176 + }, + { + "epoch": 1.4558404558404558, + "grad_norm": 0.7132518291473389, + "learning_rate": 0.00014162366714789358, + "loss": 1.1392, + "step": 8177 + }, + { + "epoch": 1.4560185185185186, + "grad_norm": 0.6995887160301208, + "learning_rate": 0.0001416109394503073, + "loss": 1.3335, + "step": 8178 + }, + { + "epoch": 1.4561965811965811, + "grad_norm": 0.7161234021186829, + "learning_rate": 0.00014159821093744115, + "loss": 0.9725, + "step": 8179 + }, + { + "epoch": 1.4563746438746439, + "grad_norm": 0.7678874135017395, + "learning_rate": 0.00014158548160954446, + "loss": 1.1578, + "step": 8180 + }, + { + "epoch": 1.4565527065527066, + "grad_norm": 0.67372065782547, + "learning_rate": 0.00014157275146686662, + "loss": 1.0867, + "step": 8181 + }, + { + "epoch": 1.4567307692307692, + "grad_norm": 0.7757831811904907, + "learning_rate": 0.00014156002050965712, + "loss": 0.9768, + "step": 8182 + }, + { + "epoch": 1.456908831908832, + "grad_norm": 0.7174801230430603, + "learning_rate": 0.00014154728873816533, + "loss": 1.1712, + "step": 8183 + }, + { + "epoch": 1.4570868945868947, + "grad_norm": 0.5972673892974854, + "learning_rate": 0.0001415345561526407, + "loss": 0.9571, + "step": 8184 + }, + { + "epoch": 1.4572649572649572, + "grad_norm": 0.7999650835990906, + "learning_rate": 0.00014152182275333275, + "loss": 1.0583, + "step": 8185 + }, + { + "epoch": 1.45744301994302, + "grad_norm": 0.6737848520278931, + "learning_rate": 0.00014150908854049091, + "loss": 1.0562, + "step": 8186 + }, + { + "epoch": 1.4576210826210827, + "grad_norm": 0.7756418585777283, + "learning_rate": 0.00014149635351436474, + 
"loss": 1.2301, + "step": 8187 + }, + { + "epoch": 1.4577991452991452, + "grad_norm": 0.5633914470672607, + "learning_rate": 0.00014148361767520374, + "loss": 0.8847, + "step": 8188 + }, + { + "epoch": 1.457977207977208, + "grad_norm": 0.8462759256362915, + "learning_rate": 0.00014147088102325737, + "loss": 0.8046, + "step": 8189 + }, + { + "epoch": 1.4581552706552707, + "grad_norm": 0.7081632614135742, + "learning_rate": 0.00014145814355877526, + "loss": 1.0764, + "step": 8190 + }, + { + "epoch": 1.4583333333333333, + "grad_norm": 0.7357106804847717, + "learning_rate": 0.00014144540528200698, + "loss": 1.0202, + "step": 8191 + }, + { + "epoch": 1.458511396011396, + "grad_norm": 0.603566586971283, + "learning_rate": 0.00014143266619320204, + "loss": 0.8214, + "step": 8192 + }, + { + "epoch": 1.4586894586894588, + "grad_norm": 0.6829110383987427, + "learning_rate": 0.00014141992629261007, + "loss": 0.9479, + "step": 8193 + }, + { + "epoch": 1.4588675213675213, + "grad_norm": 0.6822739839553833, + "learning_rate": 0.00014140718558048072, + "loss": 0.9117, + "step": 8194 + }, + { + "epoch": 1.459045584045584, + "grad_norm": 0.7383607029914856, + "learning_rate": 0.00014139444405706356, + "loss": 0.9819, + "step": 8195 + }, + { + "epoch": 1.4592236467236468, + "grad_norm": 0.6319897770881653, + "learning_rate": 0.00014138170172260826, + "loss": 1.0508, + "step": 8196 + }, + { + "epoch": 1.4594017094017093, + "grad_norm": 0.6804461479187012, + "learning_rate": 0.0001413689585773645, + "loss": 0.992, + "step": 8197 + }, + { + "epoch": 1.459579772079772, + "grad_norm": 0.6198720335960388, + "learning_rate": 0.0001413562146215819, + "loss": 1.0113, + "step": 8198 + }, + { + "epoch": 1.4597578347578348, + "grad_norm": 0.5968540906906128, + "learning_rate": 0.0001413434698555102, + "loss": 0.7562, + "step": 8199 + }, + { + "epoch": 1.4599358974358974, + "grad_norm": 0.5370334982872009, + "learning_rate": 0.00014133072427939913, + "loss": 0.9238, + "step": 8200 + }, + { + 
"epoch": 1.46011396011396, + "grad_norm": 0.6652548909187317, + "learning_rate": 0.00014131797789349832, + "loss": 0.9464, + "step": 8201 + }, + { + "epoch": 1.4602920227920229, + "grad_norm": 0.637852668762207, + "learning_rate": 0.00014130523069805757, + "loss": 1.0395, + "step": 8202 + }, + { + "epoch": 1.4604700854700854, + "grad_norm": 0.8186550140380859, + "learning_rate": 0.00014129248269332664, + "loss": 1.2116, + "step": 8203 + }, + { + "epoch": 1.4606481481481481, + "grad_norm": 0.5290196537971497, + "learning_rate": 0.00014127973387955528, + "loss": 0.7331, + "step": 8204 + }, + { + "epoch": 1.460826210826211, + "grad_norm": 0.6516342163085938, + "learning_rate": 0.00014126698425699332, + "loss": 0.9275, + "step": 8205 + }, + { + "epoch": 1.4610042735042734, + "grad_norm": 0.767254114151001, + "learning_rate": 0.00014125423382589048, + "loss": 0.9355, + "step": 8206 + }, + { + "epoch": 1.4611823361823362, + "grad_norm": 0.6476777195930481, + "learning_rate": 0.00014124148258649668, + "loss": 0.9263, + "step": 8207 + }, + { + "epoch": 1.461360398860399, + "grad_norm": 0.6737871766090393, + "learning_rate": 0.00014122873053906167, + "loss": 0.9815, + "step": 8208 + }, + { + "epoch": 1.4615384615384617, + "grad_norm": 0.6311159729957581, + "learning_rate": 0.00014121597768383532, + "loss": 0.9607, + "step": 8209 + }, + { + "epoch": 1.4617165242165242, + "grad_norm": 0.6061250567436218, + "learning_rate": 0.00014120322402106752, + "loss": 0.7428, + "step": 8210 + }, + { + "epoch": 1.461894586894587, + "grad_norm": 0.6916252970695496, + "learning_rate": 0.00014119046955100815, + "loss": 0.9664, + "step": 8211 + }, + { + "epoch": 1.4620726495726495, + "grad_norm": 0.6583660840988159, + "learning_rate": 0.00014117771427390706, + "loss": 1.0645, + "step": 8212 + }, + { + "epoch": 1.4622507122507122, + "grad_norm": 0.7034604549407959, + "learning_rate": 0.00014116495819001425, + "loss": 0.9223, + "step": 8213 + }, + { + "epoch": 1.462428774928775, + "grad_norm": 
0.6378605961799622, + "learning_rate": 0.00014115220129957954, + "loss": 0.7963, + "step": 8214 + }, + { + "epoch": 1.4626068376068377, + "grad_norm": 0.6251596212387085, + "learning_rate": 0.00014113944360285297, + "loss": 0.9852, + "step": 8215 + }, + { + "epoch": 1.4627849002849003, + "grad_norm": 0.7055560946464539, + "learning_rate": 0.00014112668510008446, + "loss": 0.9342, + "step": 8216 + }, + { + "epoch": 1.462962962962963, + "grad_norm": 0.6250377893447876, + "learning_rate": 0.00014111392579152396, + "loss": 0.9886, + "step": 8217 + }, + { + "epoch": 1.4631410256410255, + "grad_norm": 0.6011185050010681, + "learning_rate": 0.00014110116567742152, + "loss": 0.8465, + "step": 8218 + }, + { + "epoch": 1.4633190883190883, + "grad_norm": 0.6632489562034607, + "learning_rate": 0.0001410884047580271, + "loss": 0.8619, + "step": 8219 + }, + { + "epoch": 1.463497150997151, + "grad_norm": 0.7194828987121582, + "learning_rate": 0.00014107564303359076, + "loss": 1.1231, + "step": 8220 + }, + { + "epoch": 1.4636752136752138, + "grad_norm": 0.7640393376350403, + "learning_rate": 0.0001410628805043625, + "loss": 1.1955, + "step": 8221 + }, + { + "epoch": 1.4638532763532763, + "grad_norm": 0.9118906259536743, + "learning_rate": 0.0001410501171705924, + "loss": 1.0555, + "step": 8222 + }, + { + "epoch": 1.464031339031339, + "grad_norm": 0.7545066475868225, + "learning_rate": 0.00014103735303253053, + "loss": 0.9425, + "step": 8223 + }, + { + "epoch": 1.4642094017094016, + "grad_norm": 0.6848801970481873, + "learning_rate": 0.000141024588090427, + "loss": 1.0418, + "step": 8224 + }, + { + "epoch": 1.4643874643874644, + "grad_norm": 0.6825160384178162, + "learning_rate": 0.00014101182234453185, + "loss": 0.9615, + "step": 8225 + }, + { + "epoch": 1.4645655270655271, + "grad_norm": 0.8258556723594666, + "learning_rate": 0.00014099905579509527, + "loss": 1.1237, + "step": 8226 + }, + { + "epoch": 1.4647435897435899, + "grad_norm": 0.6427522897720337, + "learning_rate": 
0.00014098628844236733, + "loss": 1.0853, + "step": 8227 + }, + { + "epoch": 1.4649216524216524, + "grad_norm": 0.6476351022720337, + "learning_rate": 0.00014097352028659825, + "loss": 1.1286, + "step": 8228 + }, + { + "epoch": 1.4650997150997151, + "grad_norm": 0.7621034383773804, + "learning_rate": 0.00014096075132803812, + "loss": 1.1402, + "step": 8229 + }, + { + "epoch": 1.4652777777777777, + "grad_norm": 0.6629892587661743, + "learning_rate": 0.00014094798156693718, + "loss": 0.7108, + "step": 8230 + }, + { + "epoch": 1.4654558404558404, + "grad_norm": 0.6902043223381042, + "learning_rate": 0.00014093521100354557, + "loss": 1.1761, + "step": 8231 + }, + { + "epoch": 1.4656339031339032, + "grad_norm": 0.7422910928726196, + "learning_rate": 0.00014092243963811357, + "loss": 0.867, + "step": 8232 + }, + { + "epoch": 1.465811965811966, + "grad_norm": 0.7424963712692261, + "learning_rate": 0.00014090966747089137, + "loss": 1.015, + "step": 8233 + }, + { + "epoch": 1.4659900284900285, + "grad_norm": 0.6855891942977905, + "learning_rate": 0.0001408968945021292, + "loss": 0.9624, + "step": 8234 + }, + { + "epoch": 1.4661680911680912, + "grad_norm": 0.5968918204307556, + "learning_rate": 0.00014088412073207736, + "loss": 0.9243, + "step": 8235 + }, + { + "epoch": 1.4663461538461537, + "grad_norm": 0.6153344511985779, + "learning_rate": 0.0001408713461609861, + "loss": 1.0305, + "step": 8236 + }, + { + "epoch": 1.4665242165242165, + "grad_norm": 0.6627458333969116, + "learning_rate": 0.0001408585707891057, + "loss": 1.1102, + "step": 8237 + }, + { + "epoch": 1.4667022792022792, + "grad_norm": 0.6475233435630798, + "learning_rate": 0.0001408457946166865, + "loss": 1.0045, + "step": 8238 + }, + { + "epoch": 1.466880341880342, + "grad_norm": 0.6792858839035034, + "learning_rate": 0.00014083301764397876, + "loss": 1.0092, + "step": 8239 + }, + { + "epoch": 1.4670584045584045, + "grad_norm": 0.6916255354881287, + "learning_rate": 0.00014082023987123293, + "loss": 1.0761, + 
"step": 8240 + }, + { + "epoch": 1.4672364672364673, + "grad_norm": 0.7901251912117004, + "learning_rate": 0.00014080746129869923, + "loss": 0.8002, + "step": 8241 + }, + { + "epoch": 1.4674145299145298, + "grad_norm": 0.8078263401985168, + "learning_rate": 0.00014079468192662812, + "loss": 0.9738, + "step": 8242 + }, + { + "epoch": 1.4675925925925926, + "grad_norm": 0.6370784640312195, + "learning_rate": 0.00014078190175526996, + "loss": 1.0256, + "step": 8243 + }, + { + "epoch": 1.4677706552706553, + "grad_norm": 0.6087532639503479, + "learning_rate": 0.0001407691207848752, + "loss": 0.9747, + "step": 8244 + }, + { + "epoch": 1.467948717948718, + "grad_norm": 0.6333357691764832, + "learning_rate": 0.00014075633901569414, + "loss": 1.0135, + "step": 8245 + }, + { + "epoch": 1.4681267806267806, + "grad_norm": 0.6914255619049072, + "learning_rate": 0.00014074355644797733, + "loss": 1.0261, + "step": 8246 + }, + { + "epoch": 1.4683048433048433, + "grad_norm": 0.6374734044075012, + "learning_rate": 0.00014073077308197513, + "loss": 0.9197, + "step": 8247 + }, + { + "epoch": 1.4684829059829059, + "grad_norm": 0.8023789525032043, + "learning_rate": 0.00014071798891793807, + "loss": 1.1085, + "step": 8248 + }, + { + "epoch": 1.4686609686609686, + "grad_norm": 0.7722933888435364, + "learning_rate": 0.0001407052039561166, + "loss": 1.2018, + "step": 8249 + }, + { + "epoch": 1.4688390313390314, + "grad_norm": 0.6823393106460571, + "learning_rate": 0.0001406924181967612, + "loss": 1.088, + "step": 8250 + }, + { + "epoch": 1.4690170940170941, + "grad_norm": 0.7037357687950134, + "learning_rate": 0.00014067963164012242, + "loss": 1.0324, + "step": 8251 + }, + { + "epoch": 1.4691951566951567, + "grad_norm": 0.6549737453460693, + "learning_rate": 0.00014066684428645074, + "loss": 1.152, + "step": 8252 + }, + { + "epoch": 1.4693732193732194, + "grad_norm": 0.5349790453910828, + "learning_rate": 0.00014065405613599674, + "loss": 0.6996, + "step": 8253 + }, + { + "epoch": 
1.469551282051282, + "grad_norm": 0.6760679483413696, + "learning_rate": 0.00014064126718901096, + "loss": 0.9856, + "step": 8254 + }, + { + "epoch": 1.4697293447293447, + "grad_norm": 0.5912436842918396, + "learning_rate": 0.00014062847744574395, + "loss": 1.0076, + "step": 8255 + }, + { + "epoch": 1.4699074074074074, + "grad_norm": 0.75101637840271, + "learning_rate": 0.00014061568690644632, + "loss": 1.0033, + "step": 8256 + }, + { + "epoch": 1.4700854700854702, + "grad_norm": 0.6233504414558411, + "learning_rate": 0.00014060289557136873, + "loss": 0.8525, + "step": 8257 + }, + { + "epoch": 1.4702635327635327, + "grad_norm": 0.659570038318634, + "learning_rate": 0.00014059010344076171, + "loss": 0.855, + "step": 8258 + }, + { + "epoch": 1.4704415954415955, + "grad_norm": 0.8096539974212646, + "learning_rate": 0.00014057731051487593, + "loss": 0.9905, + "step": 8259 + }, + { + "epoch": 1.470619658119658, + "grad_norm": 0.5829728245735168, + "learning_rate": 0.00014056451679396204, + "loss": 0.7974, + "step": 8260 + }, + { + "epoch": 1.4707977207977208, + "grad_norm": 0.6176979541778564, + "learning_rate": 0.0001405517222782707, + "loss": 0.9556, + "step": 8261 + }, + { + "epoch": 1.4709757834757835, + "grad_norm": 0.6322479248046875, + "learning_rate": 0.00014053892696805264, + "loss": 0.8837, + "step": 8262 + }, + { + "epoch": 1.4711538461538463, + "grad_norm": 0.6886917948722839, + "learning_rate": 0.0001405261308635585, + "loss": 0.9242, + "step": 8263 + }, + { + "epoch": 1.4713319088319088, + "grad_norm": 0.7474521994590759, + "learning_rate": 0.00014051333396503901, + "loss": 0.9906, + "step": 8264 + }, + { + "epoch": 1.4715099715099715, + "grad_norm": 0.7120978832244873, + "learning_rate": 0.00014050053627274488, + "loss": 1.1074, + "step": 8265 + }, + { + "epoch": 1.471688034188034, + "grad_norm": 0.6778998374938965, + "learning_rate": 0.0001404877377869269, + "loss": 1.0027, + "step": 8266 + }, + { + "epoch": 1.4718660968660968, + "grad_norm": 
0.6832901239395142, + "learning_rate": 0.0001404749385078358, + "loss": 0.9399, + "step": 8267 + }, + { + "epoch": 1.4720441595441596, + "grad_norm": 0.7428423762321472, + "learning_rate": 0.00014046213843572236, + "loss": 1.0591, + "step": 8268 + }, + { + "epoch": 1.4722222222222223, + "grad_norm": 0.7522720098495483, + "learning_rate": 0.00014044933757083737, + "loss": 1.1184, + "step": 8269 + }, + { + "epoch": 1.4724002849002849, + "grad_norm": 0.7714734673500061, + "learning_rate": 0.00014043653591343163, + "loss": 1.0783, + "step": 8270 + }, + { + "epoch": 1.4725783475783476, + "grad_norm": 0.5860890746116638, + "learning_rate": 0.00014042373346375597, + "loss": 0.8394, + "step": 8271 + }, + { + "epoch": 1.4727564102564101, + "grad_norm": 0.6400395035743713, + "learning_rate": 0.0001404109302220612, + "loss": 0.9153, + "step": 8272 + }, + { + "epoch": 1.4729344729344729, + "grad_norm": 0.7441139817237854, + "learning_rate": 0.00014039812618859827, + "loss": 0.9224, + "step": 8273 + }, + { + "epoch": 1.4731125356125356, + "grad_norm": 0.6030932664871216, + "learning_rate": 0.00014038532136361793, + "loss": 1.0783, + "step": 8274 + }, + { + "epoch": 1.4732905982905984, + "grad_norm": 0.7243345975875854, + "learning_rate": 0.0001403725157473711, + "loss": 0.9894, + "step": 8275 + }, + { + "epoch": 1.473468660968661, + "grad_norm": 0.6880641579627991, + "learning_rate": 0.0001403597093401087, + "loss": 0.9459, + "step": 8276 + }, + { + "epoch": 1.4736467236467237, + "grad_norm": 0.6263882517814636, + "learning_rate": 0.00014034690214208165, + "loss": 0.8781, + "step": 8277 + }, + { + "epoch": 1.4738247863247862, + "grad_norm": 0.7159495949745178, + "learning_rate": 0.00014033409415354085, + "loss": 1.0511, + "step": 8278 + }, + { + "epoch": 1.474002849002849, + "grad_norm": 0.7182226181030273, + "learning_rate": 0.00014032128537473727, + "loss": 1.1196, + "step": 8279 + }, + { + "epoch": 1.4741809116809117, + "grad_norm": 0.744478166103363, + "learning_rate": 
0.00014030847580592186, + "loss": 1.0747, + "step": 8280 + }, + { + "epoch": 1.4743589743589745, + "grad_norm": 0.6806797385215759, + "learning_rate": 0.00014029566544734558, + "loss": 1.1519, + "step": 8281 + }, + { + "epoch": 1.474537037037037, + "grad_norm": 0.6813502311706543, + "learning_rate": 0.00014028285429925946, + "loss": 0.968, + "step": 8282 + }, + { + "epoch": 1.4747150997150997, + "grad_norm": 0.639784574508667, + "learning_rate": 0.00014027004236191452, + "loss": 1.0685, + "step": 8283 + }, + { + "epoch": 1.4748931623931623, + "grad_norm": 0.6325878500938416, + "learning_rate": 0.00014025722963556173, + "loss": 1.0358, + "step": 8284 + }, + { + "epoch": 1.475071225071225, + "grad_norm": 0.7012955546379089, + "learning_rate": 0.00014024441612045215, + "loss": 1.1059, + "step": 8285 + }, + { + "epoch": 1.4752492877492878, + "grad_norm": 0.690380334854126, + "learning_rate": 0.00014023160181683684, + "loss": 0.9628, + "step": 8286 + }, + { + "epoch": 1.4754273504273505, + "grad_norm": 0.7178516983985901, + "learning_rate": 0.00014021878672496686, + "loss": 0.963, + "step": 8287 + }, + { + "epoch": 1.475605413105413, + "grad_norm": 0.7049064636230469, + "learning_rate": 0.0001402059708450933, + "loss": 0.8996, + "step": 8288 + }, + { + "epoch": 1.4757834757834758, + "grad_norm": 0.6777819395065308, + "learning_rate": 0.00014019315417746728, + "loss": 1.0696, + "step": 8289 + }, + { + "epoch": 1.4759615384615383, + "grad_norm": 0.5948763489723206, + "learning_rate": 0.00014018033672233987, + "loss": 0.928, + "step": 8290 + }, + { + "epoch": 1.476139601139601, + "grad_norm": 0.7183942198753357, + "learning_rate": 0.00014016751847996224, + "loss": 1.1053, + "step": 8291 + }, + { + "epoch": 1.4763176638176638, + "grad_norm": 0.7426177263259888, + "learning_rate": 0.00014015469945058556, + "loss": 0.9504, + "step": 8292 + }, + { + "epoch": 1.4764957264957266, + "grad_norm": 0.6508159041404724, + "learning_rate": 0.0001401418796344609, + "loss": 1.1176, + 
"step": 8293 + }, + { + "epoch": 1.476673789173789, + "grad_norm": 0.6954567432403564, + "learning_rate": 0.00014012905903183954, + "loss": 0.9238, + "step": 8294 + }, + { + "epoch": 1.4768518518518519, + "grad_norm": 0.7023960947990417, + "learning_rate": 0.0001401162376429726, + "loss": 1.2032, + "step": 8295 + }, + { + "epoch": 1.4770299145299146, + "grad_norm": 0.7174739837646484, + "learning_rate": 0.00014010341546811134, + "loss": 0.9385, + "step": 8296 + }, + { + "epoch": 1.4772079772079771, + "grad_norm": 0.611980140209198, + "learning_rate": 0.00014009059250750695, + "loss": 0.9469, + "step": 8297 + }, + { + "epoch": 1.47738603988604, + "grad_norm": 0.6362917423248291, + "learning_rate": 0.0001400777687614107, + "loss": 1.1406, + "step": 8298 + }, + { + "epoch": 1.4775641025641026, + "grad_norm": 0.6884697675704956, + "learning_rate": 0.00014006494423007381, + "loss": 0.7915, + "step": 8299 + }, + { + "epoch": 1.4777421652421652, + "grad_norm": 0.6266025304794312, + "learning_rate": 0.00014005211891374755, + "loss": 0.94, + "step": 8300 + }, + { + "epoch": 1.477920227920228, + "grad_norm": 0.6130280494689941, + "learning_rate": 0.00014003929281268323, + "loss": 0.9369, + "step": 8301 + }, + { + "epoch": 1.4780982905982907, + "grad_norm": 0.7244207859039307, + "learning_rate": 0.00014002646592713215, + "loss": 1.1449, + "step": 8302 + }, + { + "epoch": 1.4782763532763532, + "grad_norm": 0.6527345776557922, + "learning_rate": 0.0001400136382573456, + "loss": 0.7792, + "step": 8303 + }, + { + "epoch": 1.478454415954416, + "grad_norm": 0.7102689743041992, + "learning_rate": 0.00014000080980357496, + "loss": 0.9577, + "step": 8304 + }, + { + "epoch": 1.4786324786324787, + "grad_norm": 0.6179325580596924, + "learning_rate": 0.00013998798056607154, + "loss": 0.827, + "step": 8305 + }, + { + "epoch": 1.4788105413105412, + "grad_norm": 0.761234700679779, + "learning_rate": 0.00013997515054508668, + "loss": 1.0576, + "step": 8306 + }, + { + "epoch": 
1.478988603988604, + "grad_norm": 0.6200914978981018, + "learning_rate": 0.0001399623197408718, + "loss": 1.0514, + "step": 8307 + }, + { + "epoch": 1.4791666666666667, + "grad_norm": 0.5961193442344666, + "learning_rate": 0.0001399494881536783, + "loss": 0.7846, + "step": 8308 + }, + { + "epoch": 1.4793447293447293, + "grad_norm": 0.645984411239624, + "learning_rate": 0.00013993665578375758, + "loss": 0.9927, + "step": 8309 + }, + { + "epoch": 1.479522792022792, + "grad_norm": 0.7258989810943604, + "learning_rate": 0.000139923822631361, + "loss": 0.7567, + "step": 8310 + }, + { + "epoch": 1.4797008547008548, + "grad_norm": 0.708882212638855, + "learning_rate": 0.00013991098869674007, + "loss": 1.1147, + "step": 8311 + }, + { + "epoch": 1.4798789173789173, + "grad_norm": 0.669262707233429, + "learning_rate": 0.00013989815398014624, + "loss": 0.7142, + "step": 8312 + }, + { + "epoch": 1.48005698005698, + "grad_norm": 0.7398767471313477, + "learning_rate": 0.00013988531848183096, + "loss": 1.043, + "step": 8313 + }, + { + "epoch": 1.4802350427350428, + "grad_norm": 0.753197193145752, + "learning_rate": 0.0001398724822020457, + "loss": 1.058, + "step": 8314 + }, + { + "epoch": 1.4804131054131053, + "grad_norm": 0.663526177406311, + "learning_rate": 0.000139859645141042, + "loss": 1.1272, + "step": 8315 + }, + { + "epoch": 1.480591168091168, + "grad_norm": 0.6537514925003052, + "learning_rate": 0.00013984680729907135, + "loss": 1.011, + "step": 8316 + }, + { + "epoch": 1.4807692307692308, + "grad_norm": 0.707554817199707, + "learning_rate": 0.00013983396867638527, + "loss": 1.0593, + "step": 8317 + }, + { + "epoch": 1.4809472934472934, + "grad_norm": 0.6261475086212158, + "learning_rate": 0.00013982112927323533, + "loss": 1.0731, + "step": 8318 + }, + { + "epoch": 1.4811253561253561, + "grad_norm": 0.6694258451461792, + "learning_rate": 0.00013980828908987308, + "loss": 1.0703, + "step": 8319 + }, + { + "epoch": 1.4813034188034189, + "grad_norm": 0.7793164253234863, + 
"learning_rate": 0.00013979544812655012, + "loss": 1.0447, + "step": 8320 + }, + { + "epoch": 1.4814814814814814, + "grad_norm": 0.6496448516845703, + "learning_rate": 0.00013978260638351802, + "loss": 1.0208, + "step": 8321 + }, + { + "epoch": 1.4816595441595442, + "grad_norm": 0.5992059111595154, + "learning_rate": 0.00013976976386102834, + "loss": 0.9717, + "step": 8322 + }, + { + "epoch": 1.481837606837607, + "grad_norm": 0.7473567128181458, + "learning_rate": 0.0001397569205593328, + "loss": 0.9612, + "step": 8323 + }, + { + "epoch": 1.4820156695156697, + "grad_norm": 0.657558798789978, + "learning_rate": 0.00013974407647868297, + "loss": 1.2137, + "step": 8324 + }, + { + "epoch": 1.4821937321937322, + "grad_norm": 0.7040614485740662, + "learning_rate": 0.00013973123161933055, + "loss": 1.007, + "step": 8325 + }, + { + "epoch": 1.482371794871795, + "grad_norm": 0.6098681092262268, + "learning_rate": 0.00013971838598152717, + "loss": 1.0595, + "step": 8326 + }, + { + "epoch": 1.4825498575498575, + "grad_norm": 0.7194869518280029, + "learning_rate": 0.0001397055395655245, + "loss": 0.9632, + "step": 8327 + }, + { + "epoch": 1.4827279202279202, + "grad_norm": 0.645972728729248, + "learning_rate": 0.00013969269237157426, + "loss": 1.0712, + "step": 8328 + }, + { + "epoch": 1.482905982905983, + "grad_norm": 0.6580560207366943, + "learning_rate": 0.0001396798443999282, + "loss": 1.2117, + "step": 8329 + }, + { + "epoch": 1.4830840455840457, + "grad_norm": 0.6624418497085571, + "learning_rate": 0.00013966699565083802, + "loss": 0.8529, + "step": 8330 + }, + { + "epoch": 1.4832621082621082, + "grad_norm": 0.659896731376648, + "learning_rate": 0.00013965414612455545, + "loss": 0.9359, + "step": 8331 + }, + { + "epoch": 1.483440170940171, + "grad_norm": 0.6690883636474609, + "learning_rate": 0.00013964129582133222, + "loss": 0.971, + "step": 8332 + }, + { + "epoch": 1.4836182336182335, + "grad_norm": 0.6767334938049316, + "learning_rate": 0.00013962844474142022, + 
"loss": 1.0137, + "step": 8333 + }, + { + "epoch": 1.4837962962962963, + "grad_norm": 0.6412752270698547, + "learning_rate": 0.0001396155928850711, + "loss": 1.2812, + "step": 8334 + }, + { + "epoch": 1.483974358974359, + "grad_norm": 0.6731469035148621, + "learning_rate": 0.0001396027402525368, + "loss": 0.8723, + "step": 8335 + }, + { + "epoch": 1.4841524216524218, + "grad_norm": 0.7327923774719238, + "learning_rate": 0.000139589886844069, + "loss": 0.9606, + "step": 8336 + }, + { + "epoch": 1.4843304843304843, + "grad_norm": 0.6194515824317932, + "learning_rate": 0.00013957703265991963, + "loss": 0.8514, + "step": 8337 + }, + { + "epoch": 1.484508547008547, + "grad_norm": 0.7250012755393982, + "learning_rate": 0.00013956417770034053, + "loss": 0.9755, + "step": 8338 + }, + { + "epoch": 1.4846866096866096, + "grad_norm": 0.7484263181686401, + "learning_rate": 0.00013955132196558358, + "loss": 1.0376, + "step": 8339 + }, + { + "epoch": 1.4848646723646723, + "grad_norm": 0.7593362331390381, + "learning_rate": 0.00013953846545590058, + "loss": 1.3011, + "step": 8340 + }, + { + "epoch": 1.485042735042735, + "grad_norm": 0.6670466065406799, + "learning_rate": 0.00013952560817154352, + "loss": 0.9726, + "step": 8341 + }, + { + "epoch": 1.4852207977207978, + "grad_norm": 0.8001134395599365, + "learning_rate": 0.00013951275011276425, + "loss": 1.1447, + "step": 8342 + }, + { + "epoch": 1.4853988603988604, + "grad_norm": 0.741450309753418, + "learning_rate": 0.00013949989127981475, + "loss": 1.1101, + "step": 8343 + }, + { + "epoch": 1.4855769230769231, + "grad_norm": 0.6594467163085938, + "learning_rate": 0.00013948703167294694, + "loss": 1.0205, + "step": 8344 + }, + { + "epoch": 1.4857549857549857, + "grad_norm": 0.6303030252456665, + "learning_rate": 0.00013947417129241276, + "loss": 0.9179, + "step": 8345 + }, + { + "epoch": 1.4859330484330484, + "grad_norm": 0.6352720856666565, + "learning_rate": 0.00013946131013846418, + "loss": 1.158, + "step": 8346 + }, + { + 
"epoch": 1.4861111111111112, + "grad_norm": 0.6720923781394958, + "learning_rate": 0.0001394484482113532, + "loss": 0.8805, + "step": 8347 + }, + { + "epoch": 1.486289173789174, + "grad_norm": 0.7186421751976013, + "learning_rate": 0.00013943558551133186, + "loss": 0.8951, + "step": 8348 + }, + { + "epoch": 1.4864672364672364, + "grad_norm": 0.6038698554039001, + "learning_rate": 0.00013942272203865214, + "loss": 1.0079, + "step": 8349 + }, + { + "epoch": 1.4866452991452992, + "grad_norm": 0.665790319442749, + "learning_rate": 0.00013940985779356606, + "loss": 0.8853, + "step": 8350 + }, + { + "epoch": 1.4868233618233617, + "grad_norm": 0.6941595673561096, + "learning_rate": 0.00013939699277632568, + "loss": 1.1404, + "step": 8351 + }, + { + "epoch": 1.4870014245014245, + "grad_norm": 0.7943871021270752, + "learning_rate": 0.00013938412698718305, + "loss": 0.9961, + "step": 8352 + }, + { + "epoch": 1.4871794871794872, + "grad_norm": 0.6363818645477295, + "learning_rate": 0.00013937126042639028, + "loss": 0.8621, + "step": 8353 + }, + { + "epoch": 1.48735754985755, + "grad_norm": 0.7986421585083008, + "learning_rate": 0.00013935839309419943, + "loss": 1.0547, + "step": 8354 + }, + { + "epoch": 1.4875356125356125, + "grad_norm": 0.5890130400657654, + "learning_rate": 0.00013934552499086266, + "loss": 0.9863, + "step": 8355 + }, + { + "epoch": 1.4877136752136753, + "grad_norm": 0.7915370464324951, + "learning_rate": 0.00013933265611663207, + "loss": 1.0385, + "step": 8356 + }, + { + "epoch": 1.4878917378917378, + "grad_norm": 0.7062503695487976, + "learning_rate": 0.00013931978647175973, + "loss": 1.0984, + "step": 8357 + }, + { + "epoch": 1.4880698005698005, + "grad_norm": 0.6496769785881042, + "learning_rate": 0.00013930691605649792, + "loss": 1.0884, + "step": 8358 + }, + { + "epoch": 1.4882478632478633, + "grad_norm": 0.6527266502380371, + "learning_rate": 0.0001392940448710987, + "loss": 1.0366, + "step": 8359 + }, + { + "epoch": 1.488425925925926, + "grad_norm": 
0.6269870400428772, + "learning_rate": 0.00013928117291581431, + "loss": 0.9097, + "step": 8360 + }, + { + "epoch": 1.4886039886039886, + "grad_norm": 0.6581160426139832, + "learning_rate": 0.00013926830019089694, + "loss": 0.8694, + "step": 8361 + }, + { + "epoch": 1.4887820512820513, + "grad_norm": 0.6196219325065613, + "learning_rate": 0.0001392554266965988, + "loss": 0.8054, + "step": 8362 + }, + { + "epoch": 1.4889601139601139, + "grad_norm": 0.6246176362037659, + "learning_rate": 0.0001392425524331721, + "loss": 0.9309, + "step": 8363 + }, + { + "epoch": 1.4891381766381766, + "grad_norm": 0.7293874025344849, + "learning_rate": 0.00013922967740086914, + "loss": 1.051, + "step": 8364 + }, + { + "epoch": 1.4893162393162394, + "grad_norm": 0.6581604480743408, + "learning_rate": 0.00013921680159994213, + "loss": 0.8475, + "step": 8365 + }, + { + "epoch": 1.489494301994302, + "grad_norm": 0.6294612288475037, + "learning_rate": 0.00013920392503064335, + "loss": 0.6946, + "step": 8366 + }, + { + "epoch": 1.4896723646723646, + "grad_norm": 0.5725370645523071, + "learning_rate": 0.00013919104769322512, + "loss": 0.7838, + "step": 8367 + }, + { + "epoch": 1.4898504273504274, + "grad_norm": 0.681520402431488, + "learning_rate": 0.00013917816958793967, + "loss": 0.99, + "step": 8368 + }, + { + "epoch": 1.49002849002849, + "grad_norm": 0.6660219430923462, + "learning_rate": 0.00013916529071503943, + "loss": 0.9113, + "step": 8369 + }, + { + "epoch": 1.4902065527065527, + "grad_norm": 0.7567862272262573, + "learning_rate": 0.00013915241107477665, + "loss": 1.2498, + "step": 8370 + }, + { + "epoch": 1.4903846153846154, + "grad_norm": 0.7366036176681519, + "learning_rate": 0.00013913953066740372, + "loss": 1.115, + "step": 8371 + }, + { + "epoch": 1.4905626780626782, + "grad_norm": 0.6201434135437012, + "learning_rate": 0.00013912664949317297, + "loss": 0.8447, + "step": 8372 + }, + { + "epoch": 1.4907407407407407, + "grad_norm": 0.7618655562400818, + "learning_rate": 
0.00013911376755233683, + "loss": 0.9696, + "step": 8373 + }, + { + "epoch": 1.4909188034188035, + "grad_norm": 0.6716726422309875, + "learning_rate": 0.00013910088484514764, + "loss": 0.9753, + "step": 8374 + }, + { + "epoch": 1.491096866096866, + "grad_norm": 0.6745659112930298, + "learning_rate": 0.0001390880013718579, + "loss": 1.134, + "step": 8375 + }, + { + "epoch": 1.4912749287749287, + "grad_norm": 0.7524410486221313, + "learning_rate": 0.0001390751171327199, + "loss": 1.0235, + "step": 8376 + }, + { + "epoch": 1.4914529914529915, + "grad_norm": 0.7409411072731018, + "learning_rate": 0.00013906223212798615, + "loss": 0.752, + "step": 8377 + }, + { + "epoch": 1.4916310541310542, + "grad_norm": 0.7016384601593018, + "learning_rate": 0.00013904934635790913, + "loss": 1.1712, + "step": 8378 + }, + { + "epoch": 1.4918091168091168, + "grad_norm": 0.6537824869155884, + "learning_rate": 0.00013903645982274129, + "loss": 1.1162, + "step": 8379 + }, + { + "epoch": 1.4919871794871795, + "grad_norm": 0.6460806727409363, + "learning_rate": 0.0001390235725227351, + "loss": 0.9389, + "step": 8380 + }, + { + "epoch": 1.492165242165242, + "grad_norm": 0.6405501365661621, + "learning_rate": 0.0001390106844581431, + "loss": 1.0508, + "step": 8381 + }, + { + "epoch": 1.4923433048433048, + "grad_norm": 0.6672594547271729, + "learning_rate": 0.00013899779562921775, + "loss": 1.0018, + "step": 8382 + }, + { + "epoch": 1.4925213675213675, + "grad_norm": 0.6303185820579529, + "learning_rate": 0.0001389849060362116, + "loss": 0.9964, + "step": 8383 + }, + { + "epoch": 1.4926994301994303, + "grad_norm": 0.6981508731842041, + "learning_rate": 0.00013897201567937719, + "loss": 1.174, + "step": 8384 + }, + { + "epoch": 1.4928774928774928, + "grad_norm": 0.6195989847183228, + "learning_rate": 0.0001389591245589671, + "loss": 0.9254, + "step": 8385 + }, + { + "epoch": 1.4930555555555556, + "grad_norm": 0.6232163310050964, + "learning_rate": 0.00013894623267523393, + "loss": 0.7151, + 
"step": 8386 + }, + { + "epoch": 1.493233618233618, + "grad_norm": 0.673067033290863, + "learning_rate": 0.0001389333400284302, + "loss": 1.0156, + "step": 8387 + }, + { + "epoch": 1.4934116809116809, + "grad_norm": 0.706266462802887, + "learning_rate": 0.00013892044661880856, + "loss": 0.9387, + "step": 8388 + }, + { + "epoch": 1.4935897435897436, + "grad_norm": 0.742640495300293, + "learning_rate": 0.00013890755244662161, + "loss": 1.1597, + "step": 8389 + }, + { + "epoch": 1.4937678062678064, + "grad_norm": 0.6856846809387207, + "learning_rate": 0.000138894657512122, + "loss": 0.9998, + "step": 8390 + }, + { + "epoch": 1.493945868945869, + "grad_norm": 0.7214110493659973, + "learning_rate": 0.0001388817618155624, + "loss": 1.1867, + "step": 8391 + }, + { + "epoch": 1.4941239316239316, + "grad_norm": 0.7346787452697754, + "learning_rate": 0.0001388688653571954, + "loss": 0.9071, + "step": 8392 + }, + { + "epoch": 1.4943019943019942, + "grad_norm": 0.7019181847572327, + "learning_rate": 0.00013885596813727373, + "loss": 1.0472, + "step": 8393 + }, + { + "epoch": 1.494480056980057, + "grad_norm": 0.6780814528465271, + "learning_rate": 0.00013884307015605012, + "loss": 1.0031, + "step": 8394 + }, + { + "epoch": 1.4946581196581197, + "grad_norm": 0.6722873449325562, + "learning_rate": 0.0001388301714137772, + "loss": 0.8889, + "step": 8395 + }, + { + "epoch": 1.4948361823361824, + "grad_norm": 0.6736134886741638, + "learning_rate": 0.00013881727191070777, + "loss": 0.8695, + "step": 8396 + }, + { + "epoch": 1.495014245014245, + "grad_norm": 0.632648766040802, + "learning_rate": 0.00013880437164709452, + "loss": 0.9391, + "step": 8397 + }, + { + "epoch": 1.4951923076923077, + "grad_norm": 0.7004299163818359, + "learning_rate": 0.0001387914706231902, + "loss": 1.1423, + "step": 8398 + }, + { + "epoch": 1.4953703703703702, + "grad_norm": 0.5787134766578674, + "learning_rate": 0.0001387785688392476, + "loss": 0.9953, + "step": 8399 + }, + { + "epoch": 1.495548433048433, 
+ "grad_norm": 0.6671785712242126, + "learning_rate": 0.0001387656662955195, + "loss": 0.9356, + "step": 8400 + }, + { + "epoch": 1.4957264957264957, + "grad_norm": 0.7216096520423889, + "learning_rate": 0.0001387527629922587, + "loss": 0.9065, + "step": 8401 + }, + { + "epoch": 1.4959045584045585, + "grad_norm": 0.6469849348068237, + "learning_rate": 0.00013873985892971801, + "loss": 1.0664, + "step": 8402 + }, + { + "epoch": 1.496082621082621, + "grad_norm": 0.5598217248916626, + "learning_rate": 0.00013872695410815027, + "loss": 0.8834, + "step": 8403 + }, + { + "epoch": 1.4962606837606838, + "grad_norm": 0.6860302686691284, + "learning_rate": 0.00013871404852780828, + "loss": 0.9061, + "step": 8404 + }, + { + "epoch": 1.4964387464387463, + "grad_norm": 0.7101688385009766, + "learning_rate": 0.00013870114218894497, + "loss": 1.0236, + "step": 8405 + }, + { + "epoch": 1.496616809116809, + "grad_norm": 0.6494225859642029, + "learning_rate": 0.00013868823509181313, + "loss": 0.9631, + "step": 8406 + }, + { + "epoch": 1.4967948717948718, + "grad_norm": 0.6804189085960388, + "learning_rate": 0.00013867532723666574, + "loss": 0.9341, + "step": 8407 + }, + { + "epoch": 1.4969729344729346, + "grad_norm": 0.8493942022323608, + "learning_rate": 0.00013866241862375562, + "loss": 1.1451, + "step": 8408 + }, + { + "epoch": 1.497150997150997, + "grad_norm": 0.6248497366905212, + "learning_rate": 0.00013864950925333576, + "loss": 0.8584, + "step": 8409 + }, + { + "epoch": 1.4973290598290598, + "grad_norm": 0.6238769292831421, + "learning_rate": 0.00013863659912565903, + "loss": 1.1612, + "step": 8410 + }, + { + "epoch": 1.4975071225071226, + "grad_norm": 0.8538609147071838, + "learning_rate": 0.0001386236882409784, + "loss": 1.0817, + "step": 8411 + }, + { + "epoch": 1.4976851851851851, + "grad_norm": 0.7301406264305115, + "learning_rate": 0.00013861077659954683, + "loss": 0.943, + "step": 8412 + }, + { + "epoch": 1.4978632478632479, + "grad_norm": 0.6573456525802612, + 
"learning_rate": 0.0001385978642016173, + "loss": 1.0154, + "step": 8413 + }, + { + "epoch": 1.4980413105413106, + "grad_norm": 0.7634185552597046, + "learning_rate": 0.0001385849510474428, + "loss": 1.0432, + "step": 8414 + }, + { + "epoch": 1.4982193732193732, + "grad_norm": 0.6156686544418335, + "learning_rate": 0.00013857203713727633, + "loss": 1.0442, + "step": 8415 + }, + { + "epoch": 1.498397435897436, + "grad_norm": 0.5386871695518494, + "learning_rate": 0.00013855912247137092, + "loss": 0.9055, + "step": 8416 + }, + { + "epoch": 1.4985754985754987, + "grad_norm": 0.7108574509620667, + "learning_rate": 0.00013854620704997962, + "loss": 0.9705, + "step": 8417 + }, + { + "epoch": 1.4987535612535612, + "grad_norm": 0.7313347458839417, + "learning_rate": 0.00013853329087335547, + "loss": 0.7541, + "step": 8418 + }, + { + "epoch": 1.498931623931624, + "grad_norm": 0.8369119167327881, + "learning_rate": 0.0001385203739417515, + "loss": 1.1317, + "step": 8419 + }, + { + "epoch": 1.4991096866096867, + "grad_norm": 0.6763789057731628, + "learning_rate": 0.00013850745625542085, + "loss": 0.7909, + "step": 8420 + }, + { + "epoch": 1.4992877492877492, + "grad_norm": 0.7369635105133057, + "learning_rate": 0.00013849453781461656, + "loss": 1.1454, + "step": 8421 + }, + { + "epoch": 1.499465811965812, + "grad_norm": 0.7165971398353577, + "learning_rate": 0.0001384816186195918, + "loss": 1.1927, + "step": 8422 + }, + { + "epoch": 1.4996438746438747, + "grad_norm": 0.7502337694168091, + "learning_rate": 0.00013846869867059966, + "loss": 1.0592, + "step": 8423 + }, + { + "epoch": 1.4998219373219372, + "grad_norm": 0.7207813858985901, + "learning_rate": 0.00013845577796789326, + "loss": 1.1133, + "step": 8424 + }, + { + "epoch": 1.4998219373219372, + "eval_loss": 1.1057652235031128, + "eval_runtime": 24.7975, + "eval_samples_per_second": 41.98, + "eval_steps_per_second": 21.01, + "step": 8424 + }, + { + "epoch": 1.5, + "grad_norm": 0.6962727308273315, + "learning_rate": 
0.00013844285651172576, + "loss": 1.0711, + "step": 8425 + }, + { + "epoch": 1.5001780626780628, + "grad_norm": 0.6585133075714111, + "learning_rate": 0.00013842993430235038, + "loss": 0.9793, + "step": 8426 + }, + { + "epoch": 1.5003561253561255, + "grad_norm": 0.7045056819915771, + "learning_rate": 0.00013841701134002029, + "loss": 1.0046, + "step": 8427 + }, + { + "epoch": 1.500534188034188, + "grad_norm": 0.6788702011108398, + "learning_rate": 0.00013840408762498863, + "loss": 0.9539, + "step": 8428 + }, + { + "epoch": 1.5007122507122506, + "grad_norm": 0.7253114581108093, + "learning_rate": 0.00013839116315750863, + "loss": 0.9446, + "step": 8429 + }, + { + "epoch": 1.5008903133903133, + "grad_norm": 0.6103765368461609, + "learning_rate": 0.0001383782379378336, + "loss": 0.7862, + "step": 8430 + }, + { + "epoch": 1.501068376068376, + "grad_norm": 0.6662353873252869, + "learning_rate": 0.00013836531196621666, + "loss": 1.2178, + "step": 8431 + }, + { + "epoch": 1.5012464387464388, + "grad_norm": 0.6871803998947144, + "learning_rate": 0.00013835238524291117, + "loss": 0.9263, + "step": 8432 + }, + { + "epoch": 1.5014245014245016, + "grad_norm": 0.62713223695755, + "learning_rate": 0.00013833945776817034, + "loss": 0.8879, + "step": 8433 + }, + { + "epoch": 1.501602564102564, + "grad_norm": 0.6698164343833923, + "learning_rate": 0.00013832652954224748, + "loss": 0.9847, + "step": 8434 + }, + { + "epoch": 1.5017806267806266, + "grad_norm": 0.6855883002281189, + "learning_rate": 0.0001383136005653959, + "loss": 0.8614, + "step": 8435 + }, + { + "epoch": 1.5019586894586894, + "grad_norm": 0.7028802037239075, + "learning_rate": 0.0001383006708378689, + "loss": 1.0153, + "step": 8436 + }, + { + "epoch": 1.5021367521367521, + "grad_norm": 0.6710380911827087, + "learning_rate": 0.00013828774035991981, + "loss": 1.0163, + "step": 8437 + }, + { + "epoch": 1.5023148148148149, + "grad_norm": 0.618984580039978, + "learning_rate": 0.000138274809131802, + "loss": 1.0015, + 
"step": 8438 + }, + { + "epoch": 1.5024928774928776, + "grad_norm": 0.6881645321846008, + "learning_rate": 0.00013826187715376882, + "loss": 0.9776, + "step": 8439 + }, + { + "epoch": 1.5026709401709402, + "grad_norm": 0.6715859770774841, + "learning_rate": 0.00013824894442607358, + "loss": 0.9129, + "step": 8440 + }, + { + "epoch": 1.5028490028490027, + "grad_norm": 0.5940943360328674, + "learning_rate": 0.0001382360109489698, + "loss": 1.0724, + "step": 8441 + }, + { + "epoch": 1.5030270655270654, + "grad_norm": 0.6536458134651184, + "learning_rate": 0.0001382230767227108, + "loss": 1.0162, + "step": 8442 + }, + { + "epoch": 1.5032051282051282, + "grad_norm": 0.6163156628608704, + "learning_rate": 0.00013821014174755, + "loss": 1.0521, + "step": 8443 + }, + { + "epoch": 1.503383190883191, + "grad_norm": 0.7592282891273499, + "learning_rate": 0.00013819720602374082, + "loss": 0.9525, + "step": 8444 + }, + { + "epoch": 1.5035612535612537, + "grad_norm": 0.6672595143318176, + "learning_rate": 0.0001381842695515368, + "loss": 0.9359, + "step": 8445 + }, + { + "epoch": 1.5037393162393162, + "grad_norm": 0.6395034193992615, + "learning_rate": 0.0001381713323311913, + "loss": 1.166, + "step": 8446 + }, + { + "epoch": 1.5039173789173788, + "grad_norm": 0.5958148837089539, + "learning_rate": 0.00013815839436295783, + "loss": 0.9885, + "step": 8447 + }, + { + "epoch": 1.5040954415954415, + "grad_norm": 0.676555871963501, + "learning_rate": 0.0001381454556470899, + "loss": 1.0637, + "step": 8448 + }, + { + "epoch": 1.5042735042735043, + "grad_norm": 0.642428994178772, + "learning_rate": 0.00013813251618384102, + "loss": 0.9288, + "step": 8449 + }, + { + "epoch": 1.504451566951567, + "grad_norm": 0.6730920076370239, + "learning_rate": 0.00013811957597346467, + "loss": 1.1345, + "step": 8450 + }, + { + "epoch": 1.5046296296296298, + "grad_norm": 0.7824259996414185, + "learning_rate": 0.00013810663501621443, + "loss": 0.7532, + "step": 8451 + }, + { + "epoch": 
1.5048076923076923, + "grad_norm": 0.8184825778007507, + "learning_rate": 0.00013809369331234386, + "loss": 1.2674, + "step": 8452 + }, + { + "epoch": 1.5049857549857548, + "grad_norm": 0.7369286417961121, + "learning_rate": 0.00013808075086210647, + "loss": 1.0978, + "step": 8453 + }, + { + "epoch": 1.5051638176638176, + "grad_norm": 0.6336679458618164, + "learning_rate": 0.00013806780766575588, + "loss": 1.0922, + "step": 8454 + }, + { + "epoch": 1.5053418803418803, + "grad_norm": 0.700219452381134, + "learning_rate": 0.0001380548637235457, + "loss": 1.0908, + "step": 8455 + }, + { + "epoch": 1.505519943019943, + "grad_norm": 0.6346127986907959, + "learning_rate": 0.0001380419190357295, + "loss": 1.1265, + "step": 8456 + }, + { + "epoch": 1.5056980056980058, + "grad_norm": 0.8653196096420288, + "learning_rate": 0.00013802897360256093, + "loss": 1.0466, + "step": 8457 + }, + { + "epoch": 1.5058760683760684, + "grad_norm": 0.6589069962501526, + "learning_rate": 0.0001380160274242936, + "loss": 1.245, + "step": 8458 + }, + { + "epoch": 1.506054131054131, + "grad_norm": 0.6527602076530457, + "learning_rate": 0.00013800308050118117, + "loss": 1.1539, + "step": 8459 + }, + { + "epoch": 1.5062321937321936, + "grad_norm": 0.6005436182022095, + "learning_rate": 0.00013799013283347734, + "loss": 0.899, + "step": 8460 + }, + { + "epoch": 1.5064102564102564, + "grad_norm": 0.6954274773597717, + "learning_rate": 0.0001379771844214358, + "loss": 1.1245, + "step": 8461 + }, + { + "epoch": 1.5065883190883191, + "grad_norm": 0.658764660358429, + "learning_rate": 0.00013796423526531019, + "loss": 0.9884, + "step": 8462 + }, + { + "epoch": 1.506766381766382, + "grad_norm": 0.652214527130127, + "learning_rate": 0.0001379512853653543, + "loss": 0.9711, + "step": 8463 + }, + { + "epoch": 1.5069444444444444, + "grad_norm": 0.5680044889450073, + "learning_rate": 0.00013793833472182176, + "loss": 0.9055, + "step": 8464 + }, + { + "epoch": 1.5071225071225072, + "grad_norm": 
0.7524166703224182, + "learning_rate": 0.0001379253833349664, + "loss": 1.1163, + "step": 8465 + }, + { + "epoch": 1.5073005698005697, + "grad_norm": 0.692936897277832, + "learning_rate": 0.0001379124312050419, + "loss": 0.899, + "step": 8466 + }, + { + "epoch": 1.5074786324786325, + "grad_norm": 0.6871617436408997, + "learning_rate": 0.00013789947833230207, + "loss": 0.9416, + "step": 8467 + }, + { + "epoch": 1.5076566951566952, + "grad_norm": 0.5983462333679199, + "learning_rate": 0.0001378865247170007, + "loss": 0.9776, + "step": 8468 + }, + { + "epoch": 1.507834757834758, + "grad_norm": 0.6486790180206299, + "learning_rate": 0.0001378735703593916, + "loss": 0.9346, + "step": 8469 + }, + { + "epoch": 1.5080128205128205, + "grad_norm": 0.6843809485435486, + "learning_rate": 0.00013786061525972857, + "loss": 1.1276, + "step": 8470 + }, + { + "epoch": 1.5081908831908832, + "grad_norm": 0.5734516382217407, + "learning_rate": 0.00013784765941826538, + "loss": 0.6939, + "step": 8471 + }, + { + "epoch": 1.5083689458689458, + "grad_norm": 0.6126381754875183, + "learning_rate": 0.00013783470283525596, + "loss": 0.8609, + "step": 8472 + }, + { + "epoch": 1.5085470085470085, + "grad_norm": 0.7570928335189819, + "learning_rate": 0.00013782174551095415, + "loss": 0.8809, + "step": 8473 + }, + { + "epoch": 1.5087250712250713, + "grad_norm": 0.6911360025405884, + "learning_rate": 0.00013780878744561377, + "loss": 0.9916, + "step": 8474 + }, + { + "epoch": 1.508903133903134, + "grad_norm": 0.6651954650878906, + "learning_rate": 0.00013779582863948878, + "loss": 1.0012, + "step": 8475 + }, + { + "epoch": 1.5090811965811965, + "grad_norm": 0.845396876335144, + "learning_rate": 0.000137782869092833, + "loss": 0.8455, + "step": 8476 + }, + { + "epoch": 1.5092592592592593, + "grad_norm": 0.6958050727844238, + "learning_rate": 0.00013776990880590042, + "loss": 1.0264, + "step": 8477 + }, + { + "epoch": 1.5094373219373218, + "grad_norm": 0.6950124502182007, + "learning_rate": 
0.00013775694777894493, + "loss": 1.0547, + "step": 8478 + }, + { + "epoch": 1.5096153846153846, + "grad_norm": 0.7243088483810425, + "learning_rate": 0.00013774398601222045, + "loss": 1.0999, + "step": 8479 + }, + { + "epoch": 1.5097934472934473, + "grad_norm": 0.6820448040962219, + "learning_rate": 0.00013773102350598097, + "loss": 0.823, + "step": 8480 + }, + { + "epoch": 1.50997150997151, + "grad_norm": 0.689996063709259, + "learning_rate": 0.0001377180602604805, + "loss": 1.049, + "step": 8481 + }, + { + "epoch": 1.5101495726495726, + "grad_norm": 0.6763314604759216, + "learning_rate": 0.000137705096275973, + "loss": 0.9633, + "step": 8482 + }, + { + "epoch": 1.5103276353276354, + "grad_norm": 0.6760517358779907, + "learning_rate": 0.00013769213155271243, + "loss": 1.0326, + "step": 8483 + }, + { + "epoch": 1.510505698005698, + "grad_norm": 0.7181188464164734, + "learning_rate": 0.00013767916609095285, + "loss": 0.9629, + "step": 8484 + }, + { + "epoch": 1.5106837606837606, + "grad_norm": 0.7102212905883789, + "learning_rate": 0.0001376661998909483, + "loss": 1.2714, + "step": 8485 + }, + { + "epoch": 1.5108618233618234, + "grad_norm": 0.6719805598258972, + "learning_rate": 0.00013765323295295278, + "loss": 0.7848, + "step": 8486 + }, + { + "epoch": 1.5110398860398861, + "grad_norm": 0.6592095494270325, + "learning_rate": 0.0001376402652772204, + "loss": 0.882, + "step": 8487 + }, + { + "epoch": 1.5112179487179487, + "grad_norm": 0.6858693361282349, + "learning_rate": 0.00013762729686400522, + "loss": 0.9418, + "step": 8488 + }, + { + "epoch": 1.5113960113960114, + "grad_norm": 0.7183199524879456, + "learning_rate": 0.0001376143277135613, + "loss": 1.0611, + "step": 8489 + }, + { + "epoch": 1.511574074074074, + "grad_norm": 0.6294263005256653, + "learning_rate": 0.00013760135782614277, + "loss": 0.864, + "step": 8490 + }, + { + "epoch": 1.5117521367521367, + "grad_norm": 0.6762619614601135, + "learning_rate": 0.00013758838720200376, + "loss": 1.0295, + "step": 
8491 + }, + { + "epoch": 1.5119301994301995, + "grad_norm": 0.6919726133346558, + "learning_rate": 0.00013757541584139834, + "loss": 1.0803, + "step": 8492 + }, + { + "epoch": 1.5121082621082622, + "grad_norm": 0.6801241040229797, + "learning_rate": 0.00013756244374458075, + "loss": 1.1394, + "step": 8493 + }, + { + "epoch": 1.5122863247863247, + "grad_norm": 0.6758754253387451, + "learning_rate": 0.0001375494709118051, + "loss": 1.0053, + "step": 8494 + }, + { + "epoch": 1.5124643874643875, + "grad_norm": 0.6727001070976257, + "learning_rate": 0.00013753649734332555, + "loss": 1.1407, + "step": 8495 + }, + { + "epoch": 1.51264245014245, + "grad_norm": 0.693913459777832, + "learning_rate": 0.00013752352303939632, + "loss": 1.1804, + "step": 8496 + }, + { + "epoch": 1.5128205128205128, + "grad_norm": 0.6122510433197021, + "learning_rate": 0.0001375105480002716, + "loss": 0.917, + "step": 8497 + }, + { + "epoch": 1.5129985754985755, + "grad_norm": 0.6305009722709656, + "learning_rate": 0.00013749757222620562, + "loss": 1.1075, + "step": 8498 + }, + { + "epoch": 1.5131766381766383, + "grad_norm": 0.7249642610549927, + "learning_rate": 0.0001374845957174526, + "loss": 0.9107, + "step": 8499 + }, + { + "epoch": 1.5133547008547008, + "grad_norm": 0.6922136545181274, + "learning_rate": 0.0001374716184742668, + "loss": 0.9974, + "step": 8500 + }, + { + "epoch": 1.5135327635327636, + "grad_norm": 0.6989904046058655, + "learning_rate": 0.00013745864049690245, + "loss": 0.9866, + "step": 8501 + }, + { + "epoch": 1.513710826210826, + "grad_norm": 0.6284058094024658, + "learning_rate": 0.0001374456617856139, + "loss": 0.8658, + "step": 8502 + }, + { + "epoch": 1.5138888888888888, + "grad_norm": 0.615388810634613, + "learning_rate": 0.00013743268234065535, + "loss": 0.7876, + "step": 8503 + }, + { + "epoch": 1.5140669515669516, + "grad_norm": 0.6212600469589233, + "learning_rate": 0.0001374197021622812, + "loss": 0.855, + "step": 8504 + }, + { + "epoch": 1.5142450142450143, + 
"grad_norm": 0.6312419772148132, + "learning_rate": 0.00013740672125074567, + "loss": 0.9252, + "step": 8505 + }, + { + "epoch": 1.5144230769230769, + "grad_norm": 0.7094576954841614, + "learning_rate": 0.00013739373960630315, + "loss": 0.7655, + "step": 8506 + }, + { + "epoch": 1.5146011396011396, + "grad_norm": 0.5583470463752747, + "learning_rate": 0.000137380757229208, + "loss": 0.7855, + "step": 8507 + }, + { + "epoch": 1.5147792022792022, + "grad_norm": 0.6798399686813354, + "learning_rate": 0.00013736777411971457, + "loss": 0.9935, + "step": 8508 + }, + { + "epoch": 1.514957264957265, + "grad_norm": 0.7835991978645325, + "learning_rate": 0.00013735479027807723, + "loss": 1.1603, + "step": 8509 + }, + { + "epoch": 1.5151353276353277, + "grad_norm": 0.6230790615081787, + "learning_rate": 0.00013734180570455033, + "loss": 1.1463, + "step": 8510 + }, + { + "epoch": 1.5153133903133904, + "grad_norm": 0.646603524684906, + "learning_rate": 0.00013732882039938835, + "loss": 0.9564, + "step": 8511 + }, + { + "epoch": 1.515491452991453, + "grad_norm": 0.6619647145271301, + "learning_rate": 0.0001373158343628457, + "loss": 0.8492, + "step": 8512 + }, + { + "epoch": 1.5156695156695157, + "grad_norm": 0.6458454132080078, + "learning_rate": 0.00013730284759517675, + "loss": 1.0049, + "step": 8513 + }, + { + "epoch": 1.5158475783475782, + "grad_norm": 0.7415743470191956, + "learning_rate": 0.00013728986009663602, + "loss": 0.872, + "step": 8514 + }, + { + "epoch": 1.516025641025641, + "grad_norm": 0.6198840141296387, + "learning_rate": 0.00013727687186747793, + "loss": 0.8645, + "step": 8515 + }, + { + "epoch": 1.5162037037037037, + "grad_norm": 0.7160853147506714, + "learning_rate": 0.00013726388290795697, + "loss": 1.0144, + "step": 8516 + }, + { + "epoch": 1.5163817663817665, + "grad_norm": 0.6604135632514954, + "learning_rate": 0.00013725089321832765, + "loss": 0.9827, + "step": 8517 + }, + { + "epoch": 1.5165598290598292, + "grad_norm": 0.6480790972709656, + 
"learning_rate": 0.00013723790279884443, + "loss": 1.0357, + "step": 8518 + }, + { + "epoch": 1.5167378917378918, + "grad_norm": 0.6207128167152405, + "learning_rate": 0.00013722491164976187, + "loss": 0.9467, + "step": 8519 + }, + { + "epoch": 1.5169159544159543, + "grad_norm": 0.6024298667907715, + "learning_rate": 0.00013721191977133452, + "loss": 0.8821, + "step": 8520 + }, + { + "epoch": 1.517094017094017, + "grad_norm": 0.684898316860199, + "learning_rate": 0.00013719892716381688, + "loss": 0.9823, + "step": 8521 + }, + { + "epoch": 1.5172720797720798, + "grad_norm": 0.7460635304450989, + "learning_rate": 0.00013718593382746355, + "loss": 1.2573, + "step": 8522 + }, + { + "epoch": 1.5174501424501425, + "grad_norm": 0.7193243503570557, + "learning_rate": 0.00013717293976252907, + "loss": 1.0162, + "step": 8523 + }, + { + "epoch": 1.5176282051282053, + "grad_norm": 0.6328752040863037, + "learning_rate": 0.0001371599449692681, + "loss": 0.8183, + "step": 8524 + }, + { + "epoch": 1.5178062678062678, + "grad_norm": 0.658784806728363, + "learning_rate": 0.00013714694944793517, + "loss": 0.9315, + "step": 8525 + }, + { + "epoch": 1.5179843304843303, + "grad_norm": 0.7875827550888062, + "learning_rate": 0.00013713395319878493, + "loss": 1.0889, + "step": 8526 + }, + { + "epoch": 1.518162393162393, + "grad_norm": 0.6580079793930054, + "learning_rate": 0.00013712095622207203, + "loss": 1.0276, + "step": 8527 + }, + { + "epoch": 1.5183404558404558, + "grad_norm": 0.6214027404785156, + "learning_rate": 0.00013710795851805106, + "loss": 0.9692, + "step": 8528 + }, + { + "epoch": 1.5185185185185186, + "grad_norm": 0.7839403748512268, + "learning_rate": 0.0001370949600869768, + "loss": 0.7378, + "step": 8529 + }, + { + "epoch": 1.5186965811965814, + "grad_norm": 0.6632764339447021, + "learning_rate": 0.0001370819609291038, + "loss": 0.9431, + "step": 8530 + }, + { + "epoch": 1.5188746438746439, + "grad_norm": 0.7071712017059326, + "learning_rate": 0.00013706896104468682, + 
"loss": 0.7684, + "step": 8531 + }, + { + "epoch": 1.5190527065527064, + "grad_norm": 0.7494829297065735, + "learning_rate": 0.00013705596043398058, + "loss": 0.9709, + "step": 8532 + }, + { + "epoch": 1.5192307692307692, + "grad_norm": 0.6408106088638306, + "learning_rate": 0.00013704295909723973, + "loss": 0.8494, + "step": 8533 + }, + { + "epoch": 1.519408831908832, + "grad_norm": 0.6043150424957275, + "learning_rate": 0.0001370299570347191, + "loss": 0.7485, + "step": 8534 + }, + { + "epoch": 1.5195868945868947, + "grad_norm": 0.6944992542266846, + "learning_rate": 0.00013701695424667336, + "loss": 0.8403, + "step": 8535 + }, + { + "epoch": 1.5197649572649574, + "grad_norm": 0.7730217576026917, + "learning_rate": 0.00013700395073335726, + "loss": 0.9122, + "step": 8536 + }, + { + "epoch": 1.51994301994302, + "grad_norm": 0.6300255060195923, + "learning_rate": 0.00013699094649502564, + "loss": 0.9185, + "step": 8537 + }, + { + "epoch": 1.5201210826210825, + "grad_norm": 0.648676335811615, + "learning_rate": 0.00013697794153193327, + "loss": 0.9897, + "step": 8538 + }, + { + "epoch": 1.5202991452991452, + "grad_norm": 0.7365788817405701, + "learning_rate": 0.00013696493584433494, + "loss": 0.958, + "step": 8539 + }, + { + "epoch": 1.520477207977208, + "grad_norm": 0.6634557247161865, + "learning_rate": 0.00013695192943248552, + "loss": 0.9389, + "step": 8540 + }, + { + "epoch": 1.5206552706552707, + "grad_norm": 0.6110827922821045, + "learning_rate": 0.00013693892229663977, + "loss": 0.9341, + "step": 8541 + }, + { + "epoch": 1.5208333333333335, + "grad_norm": 0.7207275032997131, + "learning_rate": 0.00013692591443705256, + "loss": 0.9526, + "step": 8542 + }, + { + "epoch": 1.521011396011396, + "grad_norm": 0.7071022391319275, + "learning_rate": 0.0001369129058539788, + "loss": 0.9572, + "step": 8543 + }, + { + "epoch": 1.5211894586894585, + "grad_norm": 0.5898227691650391, + "learning_rate": 0.0001368998965476733, + "loss": 0.921, + "step": 8544 + }, + { + 
"epoch": 1.5213675213675213, + "grad_norm": 0.7542559504508972, + "learning_rate": 0.000136886886518391, + "loss": 0.7799, + "step": 8545 + }, + { + "epoch": 1.521545584045584, + "grad_norm": 0.6904959678649902, + "learning_rate": 0.00013687387576638674, + "loss": 0.9601, + "step": 8546 + }, + { + "epoch": 1.5217236467236468, + "grad_norm": 0.763414204120636, + "learning_rate": 0.00013686086429191553, + "loss": 1.0046, + "step": 8547 + }, + { + "epoch": 1.5219017094017095, + "grad_norm": 0.6879960298538208, + "learning_rate": 0.00013684785209523224, + "loss": 0.9615, + "step": 8548 + }, + { + "epoch": 1.522079772079772, + "grad_norm": 0.7166057229042053, + "learning_rate": 0.00013683483917659186, + "loss": 0.9481, + "step": 8549 + }, + { + "epoch": 1.5222578347578346, + "grad_norm": 0.6384348273277283, + "learning_rate": 0.0001368218255362493, + "loss": 1.1037, + "step": 8550 + }, + { + "epoch": 1.5224358974358974, + "grad_norm": 0.6564528346061707, + "learning_rate": 0.00013680881117445953, + "loss": 0.951, + "step": 8551 + }, + { + "epoch": 1.52261396011396, + "grad_norm": 0.749301016330719, + "learning_rate": 0.00013679579609147762, + "loss": 0.9324, + "step": 8552 + }, + { + "epoch": 1.5227920227920229, + "grad_norm": 0.8130472898483276, + "learning_rate": 0.00013678278028755848, + "loss": 1.0178, + "step": 8553 + }, + { + "epoch": 1.5229700854700856, + "grad_norm": 0.6763297319412231, + "learning_rate": 0.0001367697637629572, + "loss": 0.9224, + "step": 8554 + }, + { + "epoch": 1.5231481481481481, + "grad_norm": 0.6630885601043701, + "learning_rate": 0.00013675674651792878, + "loss": 1.0254, + "step": 8555 + }, + { + "epoch": 1.5233262108262107, + "grad_norm": 0.7377206087112427, + "learning_rate": 0.00013674372855272825, + "loss": 1.0413, + "step": 8556 + }, + { + "epoch": 1.5235042735042734, + "grad_norm": 0.5270320177078247, + "learning_rate": 0.00013673070986761068, + "loss": 0.7124, + "step": 8557 + }, + { + "epoch": 1.5236823361823362, + "grad_norm": 
0.5941976308822632, + "learning_rate": 0.00013671769046283116, + "loss": 1.0281, + "step": 8558 + }, + { + "epoch": 1.523860398860399, + "grad_norm": 0.6131376028060913, + "learning_rate": 0.0001367046703386448, + "loss": 0.7593, + "step": 8559 + }, + { + "epoch": 1.5240384615384617, + "grad_norm": 0.7381763458251953, + "learning_rate": 0.00013669164949530664, + "loss": 1.148, + "step": 8560 + }, + { + "epoch": 1.5242165242165242, + "grad_norm": 0.683274507522583, + "learning_rate": 0.00013667862793307185, + "loss": 0.8354, + "step": 8561 + }, + { + "epoch": 1.5243945868945867, + "grad_norm": 0.6912649273872375, + "learning_rate": 0.0001366656056521955, + "loss": 0.9043, + "step": 8562 + }, + { + "epoch": 1.5245726495726495, + "grad_norm": 0.5999594330787659, + "learning_rate": 0.0001366525826529328, + "loss": 0.6138, + "step": 8563 + }, + { + "epoch": 1.5247507122507122, + "grad_norm": 0.7185927629470825, + "learning_rate": 0.00013663955893553892, + "loss": 0.895, + "step": 8564 + }, + { + "epoch": 1.524928774928775, + "grad_norm": 0.5967002511024475, + "learning_rate": 0.00013662653450026893, + "loss": 0.9636, + "step": 8565 + }, + { + "epoch": 1.5251068376068377, + "grad_norm": 0.7122953534126282, + "learning_rate": 0.00013661350934737813, + "loss": 0.9465, + "step": 8566 + }, + { + "epoch": 1.5252849002849003, + "grad_norm": 0.705326497554779, + "learning_rate": 0.00013660048347712163, + "loss": 1.121, + "step": 8567 + }, + { + "epoch": 1.5254629629629628, + "grad_norm": 0.6023733019828796, + "learning_rate": 0.0001365874568897547, + "loss": 0.9881, + "step": 8568 + }, + { + "epoch": 1.5256410256410255, + "grad_norm": 0.6883122324943542, + "learning_rate": 0.0001365744295855326, + "loss": 1.2372, + "step": 8569 + }, + { + "epoch": 1.5258190883190883, + "grad_norm": 0.718126654624939, + "learning_rate": 0.0001365614015647105, + "loss": 1.0888, + "step": 8570 + }, + { + "epoch": 1.525997150997151, + "grad_norm": 0.6649243831634521, + "learning_rate": 
0.00013654837282754367, + "loss": 1.0458, + "step": 8571 + }, + { + "epoch": 1.5261752136752138, + "grad_norm": 0.6959797143936157, + "learning_rate": 0.00013653534337428738, + "loss": 0.9282, + "step": 8572 + }, + { + "epoch": 1.5263532763532763, + "grad_norm": 0.6069976687431335, + "learning_rate": 0.00013652231320519697, + "loss": 0.9706, + "step": 8573 + }, + { + "epoch": 1.526531339031339, + "grad_norm": 0.7085374593734741, + "learning_rate": 0.0001365092823205277, + "loss": 1.1241, + "step": 8574 + }, + { + "epoch": 1.5267094017094016, + "grad_norm": 0.575106143951416, + "learning_rate": 0.00013649625072053488, + "loss": 0.9814, + "step": 8575 + }, + { + "epoch": 1.5268874643874644, + "grad_norm": 0.6541273593902588, + "learning_rate": 0.00013648321840547384, + "loss": 1.0155, + "step": 8576 + }, + { + "epoch": 1.5270655270655271, + "grad_norm": 0.6754382848739624, + "learning_rate": 0.0001364701853755999, + "loss": 1.0284, + "step": 8577 + }, + { + "epoch": 1.5272435897435899, + "grad_norm": 0.6219634413719177, + "learning_rate": 0.00013645715163116846, + "loss": 1.1539, + "step": 8578 + }, + { + "epoch": 1.5274216524216524, + "grad_norm": 0.7625157833099365, + "learning_rate": 0.00013644411717243486, + "loss": 1.1157, + "step": 8579 + }, + { + "epoch": 1.5275997150997151, + "grad_norm": 0.6944296956062317, + "learning_rate": 0.0001364310819996545, + "loss": 0.8309, + "step": 8580 + }, + { + "epoch": 1.5277777777777777, + "grad_norm": 0.7198494672775269, + "learning_rate": 0.00013641804611308277, + "loss": 1.0883, + "step": 8581 + }, + { + "epoch": 1.5279558404558404, + "grad_norm": 0.6398822069168091, + "learning_rate": 0.00013640500951297508, + "loss": 1.0173, + "step": 8582 + }, + { + "epoch": 1.5281339031339032, + "grad_norm": 0.7306683659553528, + "learning_rate": 0.00013639197219958682, + "loss": 0.9979, + "step": 8583 + }, + { + "epoch": 1.528311965811966, + "grad_norm": 0.6873512268066406, + "learning_rate": 0.00013637893417317348, + "loss": 0.7883, 
+ "step": 8584 + }, + { + "epoch": 1.5284900284900285, + "grad_norm": 0.6482085585594177, + "learning_rate": 0.00013636589543399052, + "loss": 0.9367, + "step": 8585 + }, + { + "epoch": 1.5286680911680912, + "grad_norm": 0.8161232471466064, + "learning_rate": 0.00013635285598229336, + "loss": 1.0582, + "step": 8586 + }, + { + "epoch": 1.5288461538461537, + "grad_norm": 0.6722155809402466, + "learning_rate": 0.0001363398158183375, + "loss": 0.9805, + "step": 8587 + }, + { + "epoch": 1.5290242165242165, + "grad_norm": 0.7175397872924805, + "learning_rate": 0.00013632677494237845, + "loss": 1.0747, + "step": 8588 + }, + { + "epoch": 1.5292022792022792, + "grad_norm": 0.6665592789649963, + "learning_rate": 0.00013631373335467172, + "loss": 1.006, + "step": 8589 + }, + { + "epoch": 1.529380341880342, + "grad_norm": 0.7002299427986145, + "learning_rate": 0.0001363006910554728, + "loss": 1.0702, + "step": 8590 + }, + { + "epoch": 1.5295584045584045, + "grad_norm": 0.7712168097496033, + "learning_rate": 0.00013628764804503725, + "loss": 1.0628, + "step": 8591 + }, + { + "epoch": 1.5297364672364673, + "grad_norm": 0.6620795130729675, + "learning_rate": 0.0001362746043236206, + "loss": 1.01, + "step": 8592 + }, + { + "epoch": 1.5299145299145298, + "grad_norm": 0.6374393701553345, + "learning_rate": 0.00013626155989147846, + "loss": 0.9106, + "step": 8593 + }, + { + "epoch": 1.5300925925925926, + "grad_norm": 0.6531631946563721, + "learning_rate": 0.00013624851474886636, + "loss": 1.0488, + "step": 8594 + }, + { + "epoch": 1.5302706552706553, + "grad_norm": 0.6843775510787964, + "learning_rate": 0.00013623546889603993, + "loss": 0.8599, + "step": 8595 + }, + { + "epoch": 1.530448717948718, + "grad_norm": 0.7232706546783447, + "learning_rate": 0.00013622242233325476, + "loss": 1.0875, + "step": 8596 + }, + { + "epoch": 1.5306267806267806, + "grad_norm": 0.695691704750061, + "learning_rate": 0.00013620937506076644, + "loss": 0.9835, + "step": 8597 + }, + { + "epoch": 
1.5308048433048433, + "grad_norm": 0.6321248412132263, + "learning_rate": 0.00013619632707883065, + "loss": 0.9778, + "step": 8598 + }, + { + "epoch": 1.5309829059829059, + "grad_norm": 0.6469168663024902, + "learning_rate": 0.00013618327838770303, + "loss": 0.9968, + "step": 8599 + }, + { + "epoch": 1.5311609686609686, + "grad_norm": 0.6798683404922485, + "learning_rate": 0.00013617022898763925, + "loss": 0.78, + "step": 8600 + }, + { + "epoch": 1.5313390313390314, + "grad_norm": 0.6932336091995239, + "learning_rate": 0.00013615717887889496, + "loss": 0.9473, + "step": 8601 + }, + { + "epoch": 1.5315170940170941, + "grad_norm": 0.7304185628890991, + "learning_rate": 0.00013614412806172585, + "loss": 1.0478, + "step": 8602 + }, + { + "epoch": 1.5316951566951567, + "grad_norm": 0.6585272550582886, + "learning_rate": 0.00013613107653638763, + "loss": 0.8563, + "step": 8603 + }, + { + "epoch": 1.5318732193732194, + "grad_norm": 0.6804470419883728, + "learning_rate": 0.00013611802430313604, + "loss": 0.9839, + "step": 8604 + }, + { + "epoch": 1.532051282051282, + "grad_norm": 0.7271378040313721, + "learning_rate": 0.0001361049713622268, + "loss": 1.0906, + "step": 8605 + }, + { + "epoch": 1.5322293447293447, + "grad_norm": 0.7731603980064392, + "learning_rate": 0.00013609191771391562, + "loss": 1.1318, + "step": 8606 + }, + { + "epoch": 1.5324074074074074, + "grad_norm": 0.6143709421157837, + "learning_rate": 0.0001360788633584583, + "loss": 0.8726, + "step": 8607 + }, + { + "epoch": 1.5325854700854702, + "grad_norm": 0.6847203373908997, + "learning_rate": 0.00013606580829611056, + "loss": 0.9963, + "step": 8608 + }, + { + "epoch": 1.5327635327635327, + "grad_norm": 0.7561219334602356, + "learning_rate": 0.0001360527525271283, + "loss": 0.8873, + "step": 8609 + }, + { + "epoch": 1.5329415954415955, + "grad_norm": 0.7997925281524658, + "learning_rate": 0.0001360396960517672, + "loss": 0.7675, + "step": 8610 + }, + { + "epoch": 1.533119658119658, + "grad_norm": 
0.7206357717514038, + "learning_rate": 0.00013602663887028315, + "loss": 1.0084, + "step": 8611 + }, + { + "epoch": 1.5332977207977208, + "grad_norm": 0.6454238891601562, + "learning_rate": 0.00013601358098293194, + "loss": 0.8194, + "step": 8612 + }, + { + "epoch": 1.5334757834757835, + "grad_norm": 0.5531884431838989, + "learning_rate": 0.0001360005223899694, + "loss": 0.8596, + "step": 8613 + }, + { + "epoch": 1.5336538461538463, + "grad_norm": 0.659161388874054, + "learning_rate": 0.00013598746309165144, + "loss": 1.0363, + "step": 8614 + }, + { + "epoch": 1.5338319088319088, + "grad_norm": 0.6958948373794556, + "learning_rate": 0.00013597440308823385, + "loss": 0.9852, + "step": 8615 + }, + { + "epoch": 1.5340099715099715, + "grad_norm": 0.7147171497344971, + "learning_rate": 0.0001359613423799726, + "loss": 1.0506, + "step": 8616 + }, + { + "epoch": 1.534188034188034, + "grad_norm": 0.604450523853302, + "learning_rate": 0.00013594828096712353, + "loss": 0.9344, + "step": 8617 + }, + { + "epoch": 1.5343660968660968, + "grad_norm": 0.714547336101532, + "learning_rate": 0.00013593521884994257, + "loss": 1.1583, + "step": 8618 + }, + { + "epoch": 1.5345441595441596, + "grad_norm": 0.6864442825317383, + "learning_rate": 0.00013592215602868565, + "loss": 0.991, + "step": 8619 + }, + { + "epoch": 1.5347222222222223, + "grad_norm": 0.6384446620941162, + "learning_rate": 0.00013590909250360873, + "loss": 0.8799, + "step": 8620 + }, + { + "epoch": 1.5349002849002849, + "grad_norm": 0.7307949662208557, + "learning_rate": 0.00013589602827496772, + "loss": 1.0276, + "step": 8621 + }, + { + "epoch": 1.5350783475783476, + "grad_norm": 0.6620129942893982, + "learning_rate": 0.00013588296334301862, + "loss": 0.9378, + "step": 8622 + }, + { + "epoch": 1.5352564102564101, + "grad_norm": 0.7216851711273193, + "learning_rate": 0.00013586989770801735, + "loss": 0.8984, + "step": 8623 + }, + { + "epoch": 1.5354344729344729, + "grad_norm": 0.7319885492324829, + "learning_rate": 
0.00013585683137022, + "loss": 1.0357, + "step": 8624 + }, + { + "epoch": 1.5356125356125356, + "grad_norm": 0.7455703616142273, + "learning_rate": 0.00013584376432988247, + "loss": 0.9727, + "step": 8625 + }, + { + "epoch": 1.5357905982905984, + "grad_norm": 0.7285277247428894, + "learning_rate": 0.0001358306965872609, + "loss": 1.1132, + "step": 8626 + }, + { + "epoch": 1.535968660968661, + "grad_norm": 0.6250096559524536, + "learning_rate": 0.00013581762814261124, + "loss": 0.8538, + "step": 8627 + }, + { + "epoch": 1.5361467236467237, + "grad_norm": 0.6252279281616211, + "learning_rate": 0.0001358045589961895, + "loss": 0.822, + "step": 8628 + }, + { + "epoch": 1.5363247863247862, + "grad_norm": 0.7723368406295776, + "learning_rate": 0.0001357914891482519, + "loss": 0.9841, + "step": 8629 + }, + { + "epoch": 1.536502849002849, + "grad_norm": 0.6855236887931824, + "learning_rate": 0.00013577841859905435, + "loss": 0.9512, + "step": 8630 + }, + { + "epoch": 1.5366809116809117, + "grad_norm": 0.8320944309234619, + "learning_rate": 0.00013576534734885303, + "loss": 1.0324, + "step": 8631 + }, + { + "epoch": 1.5368589743589745, + "grad_norm": 0.6970052123069763, + "learning_rate": 0.00013575227539790405, + "loss": 0.9874, + "step": 8632 + }, + { + "epoch": 1.5370370370370372, + "grad_norm": 0.7774853110313416, + "learning_rate": 0.00013573920274646345, + "loss": 0.962, + "step": 8633 + }, + { + "epoch": 1.5372150997150997, + "grad_norm": 0.6479182839393616, + "learning_rate": 0.0001357261293947875, + "loss": 0.9438, + "step": 8634 + }, + { + "epoch": 1.5373931623931623, + "grad_norm": 0.6855679750442505, + "learning_rate": 0.00013571305534313218, + "loss": 1.0898, + "step": 8635 + }, + { + "epoch": 1.537571225071225, + "grad_norm": 0.6527835726737976, + "learning_rate": 0.00013569998059175377, + "loss": 0.954, + "step": 8636 + }, + { + "epoch": 1.5377492877492878, + "grad_norm": 0.6601176857948303, + "learning_rate": 0.00013568690514090837, + "loss": 1.0183, + 
"step": 8637 + }, + { + "epoch": 1.5379273504273505, + "grad_norm": 0.6628120541572571, + "learning_rate": 0.0001356738289908522, + "loss": 1.0651, + "step": 8638 + }, + { + "epoch": 1.5381054131054133, + "grad_norm": 0.7492203712463379, + "learning_rate": 0.00013566075214184147, + "loss": 1.2438, + "step": 8639 + }, + { + "epoch": 1.5382834757834758, + "grad_norm": 0.6781343817710876, + "learning_rate": 0.00013564767459413237, + "loss": 0.9413, + "step": 8640 + }, + { + "epoch": 1.5384615384615383, + "grad_norm": 0.6890891790390015, + "learning_rate": 0.00013563459634798115, + "loss": 0.9912, + "step": 8641 + }, + { + "epoch": 1.538639601139601, + "grad_norm": 0.722820520401001, + "learning_rate": 0.00013562151740364404, + "loss": 1.1799, + "step": 8642 + }, + { + "epoch": 1.5388176638176638, + "grad_norm": 0.738369882106781, + "learning_rate": 0.0001356084377613773, + "loss": 1.1313, + "step": 8643 + }, + { + "epoch": 1.5389957264957266, + "grad_norm": 0.6232718229293823, + "learning_rate": 0.00013559535742143717, + "loss": 0.9035, + "step": 8644 + }, + { + "epoch": 1.5391737891737893, + "grad_norm": 0.7371624708175659, + "learning_rate": 0.00013558227638407996, + "loss": 1.3377, + "step": 8645 + }, + { + "epoch": 1.5393518518518519, + "grad_norm": 0.658353865146637, + "learning_rate": 0.00013556919464956197, + "loss": 0.9591, + "step": 8646 + }, + { + "epoch": 1.5395299145299144, + "grad_norm": 0.6205827593803406, + "learning_rate": 0.0001355561122181395, + "loss": 0.9217, + "step": 8647 + }, + { + "epoch": 1.5397079772079771, + "grad_norm": 0.5892502069473267, + "learning_rate": 0.00013554302909006888, + "loss": 0.8893, + "step": 8648 + }, + { + "epoch": 1.53988603988604, + "grad_norm": 1.224568486213684, + "learning_rate": 0.0001355299452656064, + "loss": 0.8237, + "step": 8649 + }, + { + "epoch": 1.5400641025641026, + "grad_norm": 0.7732635736465454, + "learning_rate": 0.0001355168607450085, + "loss": 1.1043, + "step": 8650 + }, + { + "epoch": 
1.5402421652421654, + "grad_norm": 0.6365402340888977, + "learning_rate": 0.00013550377552853146, + "loss": 1.0345, + "step": 8651 + }, + { + "epoch": 1.540420227920228, + "grad_norm": 0.7046400904655457, + "learning_rate": 0.00013549068961643171, + "loss": 1.0361, + "step": 8652 + }, + { + "epoch": 1.5405982905982905, + "grad_norm": 0.6760256886482239, + "learning_rate": 0.0001354776030089656, + "loss": 0.9437, + "step": 8653 + }, + { + "epoch": 1.5407763532763532, + "grad_norm": 0.6180984973907471, + "learning_rate": 0.00013546451570638958, + "loss": 0.9737, + "step": 8654 + }, + { + "epoch": 1.540954415954416, + "grad_norm": 0.6221960186958313, + "learning_rate": 0.00013545142770896005, + "loss": 0.9313, + "step": 8655 + }, + { + "epoch": 1.5411324786324787, + "grad_norm": 0.6887816786766052, + "learning_rate": 0.0001354383390169334, + "loss": 1.1736, + "step": 8656 + }, + { + "epoch": 1.5413105413105415, + "grad_norm": 0.5840606093406677, + "learning_rate": 0.00013542524963056614, + "loss": 0.9269, + "step": 8657 + }, + { + "epoch": 1.541488603988604, + "grad_norm": 0.7396654486656189, + "learning_rate": 0.00013541215955011472, + "loss": 1.1189, + "step": 8658 + }, + { + "epoch": 1.5416666666666665, + "grad_norm": 0.780616819858551, + "learning_rate": 0.00013539906877583555, + "loss": 1.1251, + "step": 8659 + }, + { + "epoch": 1.5418447293447293, + "grad_norm": 0.6975206732749939, + "learning_rate": 0.0001353859773079852, + "loss": 1.2134, + "step": 8660 + }, + { + "epoch": 1.542022792022792, + "grad_norm": 0.7572869658470154, + "learning_rate": 0.00013537288514682013, + "loss": 0.9396, + "step": 8661 + }, + { + "epoch": 1.5422008547008548, + "grad_norm": 0.6252159476280212, + "learning_rate": 0.00013535979229259686, + "loss": 0.8449, + "step": 8662 + }, + { + "epoch": 1.5423789173789175, + "grad_norm": 0.7321650981903076, + "learning_rate": 0.0001353466987455719, + "loss": 1.3263, + "step": 8663 + }, + { + "epoch": 1.54255698005698, + "grad_norm": 
0.7168700695037842, + "learning_rate": 0.00013533360450600177, + "loss": 0.8923, + "step": 8664 + }, + { + "epoch": 1.5427350427350426, + "grad_norm": 0.5931934714317322, + "learning_rate": 0.00013532050957414313, + "loss": 0.8448, + "step": 8665 + }, + { + "epoch": 1.5429131054131053, + "grad_norm": 0.6621279120445251, + "learning_rate": 0.00013530741395025245, + "loss": 1.1023, + "step": 8666 + }, + { + "epoch": 1.543091168091168, + "grad_norm": 0.7133732438087463, + "learning_rate": 0.00013529431763458633, + "loss": 0.9986, + "step": 8667 + }, + { + "epoch": 1.5432692307692308, + "grad_norm": 0.7589015960693359, + "learning_rate": 0.0001352812206274014, + "loss": 1.0111, + "step": 8668 + }, + { + "epoch": 1.5434472934472936, + "grad_norm": 0.6958192586898804, + "learning_rate": 0.0001352681229289542, + "loss": 0.9466, + "step": 8669 + }, + { + "epoch": 1.5436253561253561, + "grad_norm": 0.7539750337600708, + "learning_rate": 0.0001352550245395014, + "loss": 1.0974, + "step": 8670 + }, + { + "epoch": 1.5438034188034186, + "grad_norm": 0.7003816366195679, + "learning_rate": 0.00013524192545929964, + "loss": 1.0354, + "step": 8671 + }, + { + "epoch": 1.5439814814814814, + "grad_norm": 0.6503025889396667, + "learning_rate": 0.00013522882568860558, + "loss": 1.0476, + "step": 8672 + }, + { + "epoch": 1.5441595441595442, + "grad_norm": 0.6757345199584961, + "learning_rate": 0.00013521572522767584, + "loss": 0.864, + "step": 8673 + }, + { + "epoch": 1.544337606837607, + "grad_norm": 0.6857611536979675, + "learning_rate": 0.0001352026240767671, + "loss": 1.1627, + "step": 8674 + }, + { + "epoch": 1.5445156695156697, + "grad_norm": 0.5775430798530579, + "learning_rate": 0.0001351895222361361, + "loss": 0.7444, + "step": 8675 + }, + { + "epoch": 1.5446937321937322, + "grad_norm": 0.7511499524116516, + "learning_rate": 0.00013517641970603952, + "loss": 1.1547, + "step": 8676 + }, + { + "epoch": 1.5448717948717947, + "grad_norm": 0.6727504730224609, + "learning_rate": 
0.00013516331648673403, + "loss": 1.0829, + "step": 8677 + }, + { + "epoch": 1.5450498575498575, + "grad_norm": 0.6128812432289124, + "learning_rate": 0.00013515021257847642, + "loss": 0.9318, + "step": 8678 + }, + { + "epoch": 1.5452279202279202, + "grad_norm": 0.7309781312942505, + "learning_rate": 0.00013513710798152343, + "loss": 1.0844, + "step": 8679 + }, + { + "epoch": 1.545405982905983, + "grad_norm": 0.695655882358551, + "learning_rate": 0.00013512400269613176, + "loss": 1.113, + "step": 8680 + }, + { + "epoch": 1.5455840455840457, + "grad_norm": 0.696441650390625, + "learning_rate": 0.00013511089672255824, + "loss": 1.0499, + "step": 8681 + }, + { + "epoch": 1.5457621082621082, + "grad_norm": 0.6309961080551147, + "learning_rate": 0.00013509779006105964, + "loss": 0.8759, + "step": 8682 + }, + { + "epoch": 1.5459401709401708, + "grad_norm": 0.6155984401702881, + "learning_rate": 0.00013508468271189277, + "loss": 0.8967, + "step": 8683 + }, + { + "epoch": 1.5461182336182335, + "grad_norm": 0.6786884665489197, + "learning_rate": 0.00013507157467531442, + "loss": 1.0806, + "step": 8684 + }, + { + "epoch": 1.5462962962962963, + "grad_norm": 0.6494075059890747, + "learning_rate": 0.00013505846595158138, + "loss": 1.0196, + "step": 8685 + }, + { + "epoch": 1.546474358974359, + "grad_norm": 0.7599824070930481, + "learning_rate": 0.00013504535654095055, + "loss": 0.8662, + "step": 8686 + }, + { + "epoch": 1.5466524216524218, + "grad_norm": 0.6017210483551025, + "learning_rate": 0.00013503224644367877, + "loss": 0.872, + "step": 8687 + }, + { + "epoch": 1.5468304843304843, + "grad_norm": 0.7972410321235657, + "learning_rate": 0.00013501913566002288, + "loss": 1.0958, + "step": 8688 + }, + { + "epoch": 1.547008547008547, + "grad_norm": 0.7572960257530212, + "learning_rate": 0.00013500602419023978, + "loss": 1.0219, + "step": 8689 + }, + { + "epoch": 1.5471866096866096, + "grad_norm": 0.6329224109649658, + "learning_rate": 0.00013499291203458635, + "loss": 0.8636, + 
"step": 8690 + }, + { + "epoch": 1.5473646723646723, + "grad_norm": 0.6777113080024719, + "learning_rate": 0.0001349797991933195, + "loss": 1.0297, + "step": 8691 + }, + { + "epoch": 1.547542735042735, + "grad_norm": 0.6449527144432068, + "learning_rate": 0.00013496668566669617, + "loss": 1.0296, + "step": 8692 + }, + { + "epoch": 1.5477207977207978, + "grad_norm": 0.8236973881721497, + "learning_rate": 0.00013495357145497326, + "loss": 0.8569, + "step": 8693 + }, + { + "epoch": 1.5478988603988604, + "grad_norm": 0.6753743290901184, + "learning_rate": 0.0001349404565584077, + "loss": 1.0733, + "step": 8694 + }, + { + "epoch": 1.5480769230769231, + "grad_norm": 0.6642967462539673, + "learning_rate": 0.0001349273409772565, + "loss": 0.9437, + "step": 8695 + }, + { + "epoch": 1.5482549857549857, + "grad_norm": 0.6470823884010315, + "learning_rate": 0.00013491422471177661, + "loss": 0.999, + "step": 8696 + }, + { + "epoch": 1.5484330484330484, + "grad_norm": 0.7287036776542664, + "learning_rate": 0.000134901107762225, + "loss": 0.9396, + "step": 8697 + }, + { + "epoch": 1.5486111111111112, + "grad_norm": 0.6258324980735779, + "learning_rate": 0.00013488799012885872, + "loss": 1.045, + "step": 8698 + }, + { + "epoch": 1.548789173789174, + "grad_norm": 0.6540539860725403, + "learning_rate": 0.00013487487181193473, + "loss": 0.9939, + "step": 8699 + }, + { + "epoch": 1.5489672364672364, + "grad_norm": 0.7129563093185425, + "learning_rate": 0.00013486175281171003, + "loss": 1.2079, + "step": 8700 + }, + { + "epoch": 1.5491452991452992, + "grad_norm": 0.6383145451545715, + "learning_rate": 0.00013484863312844173, + "loss": 0.9999, + "step": 8701 + }, + { + "epoch": 1.5493233618233617, + "grad_norm": 0.6310200691223145, + "learning_rate": 0.0001348355127623869, + "loss": 1.1193, + "step": 8702 + }, + { + "epoch": 1.5495014245014245, + "grad_norm": 0.6370054483413696, + "learning_rate": 0.0001348223917138025, + "loss": 1.0213, + "step": 8703 + }, + { + "epoch": 
1.5496794871794872, + "grad_norm": 0.7052688598632812, + "learning_rate": 0.00013480926998294573, + "loss": 0.8773, + "step": 8704 + }, + { + "epoch": 1.54985754985755, + "grad_norm": 0.6369579434394836, + "learning_rate": 0.00013479614757007355, + "loss": 1.0072, + "step": 8705 + }, + { + "epoch": 1.5500356125356125, + "grad_norm": 0.7152075171470642, + "learning_rate": 0.0001347830244754432, + "loss": 1.0409, + "step": 8706 + }, + { + "epoch": 1.5502136752136753, + "grad_norm": 0.654183566570282, + "learning_rate": 0.00013476990069931173, + "loss": 0.9363, + "step": 8707 + }, + { + "epoch": 1.5503917378917378, + "grad_norm": 0.6700537204742432, + "learning_rate": 0.00013475677624193627, + "loss": 0.985, + "step": 8708 + }, + { + "epoch": 1.5505698005698005, + "grad_norm": 0.7195445895195007, + "learning_rate": 0.00013474365110357402, + "loss": 0.988, + "step": 8709 + }, + { + "epoch": 1.5507478632478633, + "grad_norm": 0.6019890904426575, + "learning_rate": 0.00013473052528448201, + "loss": 0.9915, + "step": 8710 + }, + { + "epoch": 1.550925925925926, + "grad_norm": 0.7787565588951111, + "learning_rate": 0.0001347173987849176, + "loss": 0.9676, + "step": 8711 + }, + { + "epoch": 1.5511039886039886, + "grad_norm": 0.6997103691101074, + "learning_rate": 0.00013470427160513782, + "loss": 1.1158, + "step": 8712 + }, + { + "epoch": 1.5512820512820513, + "grad_norm": 0.6259464025497437, + "learning_rate": 0.00013469114374539998, + "loss": 0.8784, + "step": 8713 + }, + { + "epoch": 1.5514601139601139, + "grad_norm": 0.6159056425094604, + "learning_rate": 0.00013467801520596122, + "loss": 0.9184, + "step": 8714 + }, + { + "epoch": 1.5516381766381766, + "grad_norm": 0.6823606491088867, + "learning_rate": 0.00013466488598707876, + "loss": 0.9542, + "step": 8715 + }, + { + "epoch": 1.5518162393162394, + "grad_norm": 0.6781585812568665, + "learning_rate": 0.0001346517560890099, + "loss": 1.1761, + "step": 8716 + }, + { + "epoch": 1.551994301994302, + "grad_norm": 
0.6313831806182861, + "learning_rate": 0.00013463862551201184, + "loss": 0.8935, + "step": 8717 + }, + { + "epoch": 1.5521723646723646, + "grad_norm": 0.7466186881065369, + "learning_rate": 0.0001346254942563419, + "loss": 1.0583, + "step": 8718 + }, + { + "epoch": 1.5523504273504274, + "grad_norm": 0.7073680758476257, + "learning_rate": 0.0001346123623222573, + "loss": 0.9863, + "step": 8719 + }, + { + "epoch": 1.55252849002849, + "grad_norm": 0.6286870241165161, + "learning_rate": 0.00013459922971001536, + "loss": 0.9921, + "step": 8720 + }, + { + "epoch": 1.5527065527065527, + "grad_norm": 0.6047035455703735, + "learning_rate": 0.0001345860964198734, + "loss": 0.9155, + "step": 8721 + }, + { + "epoch": 1.5528846153846154, + "grad_norm": 0.5909964442253113, + "learning_rate": 0.00013457296245208874, + "loss": 0.9593, + "step": 8722 + }, + { + "epoch": 1.5530626780626782, + "grad_norm": 0.7838597893714905, + "learning_rate": 0.00013455982780691869, + "loss": 0.8872, + "step": 8723 + }, + { + "epoch": 1.5532407407407407, + "grad_norm": 0.6914706230163574, + "learning_rate": 0.00013454669248462063, + "loss": 0.9104, + "step": 8724 + }, + { + "epoch": 1.5534188034188035, + "grad_norm": 0.6777952909469604, + "learning_rate": 0.00013453355648545182, + "loss": 0.9839, + "step": 8725 + }, + { + "epoch": 1.553596866096866, + "grad_norm": 0.7482799291610718, + "learning_rate": 0.00013452041980966978, + "loss": 1.1164, + "step": 8726 + }, + { + "epoch": 1.5537749287749287, + "grad_norm": 0.6616327166557312, + "learning_rate": 0.0001345072824575318, + "loss": 0.9574, + "step": 8727 + }, + { + "epoch": 1.5539529914529915, + "grad_norm": 0.7193203568458557, + "learning_rate": 0.00013449414442929532, + "loss": 1.0609, + "step": 8728 + }, + { + "epoch": 1.5541310541310542, + "grad_norm": 0.6599446535110474, + "learning_rate": 0.0001344810057252177, + "loss": 0.9574, + "step": 8729 + }, + { + "epoch": 1.5543091168091168, + "grad_norm": 0.7221707105636597, + "learning_rate": 
0.00013446786634555642, + "loss": 0.9819, + "step": 8730 + }, + { + "epoch": 1.5544871794871795, + "grad_norm": 0.6531312465667725, + "learning_rate": 0.0001344547262905689, + "loss": 0.9986, + "step": 8731 + }, + { + "epoch": 1.554665242165242, + "grad_norm": 0.6879804730415344, + "learning_rate": 0.0001344415855605126, + "loss": 1.1078, + "step": 8732 + }, + { + "epoch": 1.5548433048433048, + "grad_norm": 0.708907425403595, + "learning_rate": 0.00013442844415564498, + "loss": 1.0221, + "step": 8733 + }, + { + "epoch": 1.5550213675213675, + "grad_norm": 0.7957375645637512, + "learning_rate": 0.0001344153020762235, + "loss": 1.3101, + "step": 8734 + }, + { + "epoch": 1.5551994301994303, + "grad_norm": 0.7068197727203369, + "learning_rate": 0.00013440215932250567, + "loss": 0.8995, + "step": 8735 + }, + { + "epoch": 1.5553774928774928, + "grad_norm": 0.6455841064453125, + "learning_rate": 0.00013438901589474898, + "loss": 0.7244, + "step": 8736 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 0.7500516772270203, + "learning_rate": 0.00013437587179321097, + "loss": 1.0161, + "step": 8737 + }, + { + "epoch": 1.555733618233618, + "grad_norm": 0.5983143448829651, + "learning_rate": 0.00013436272701814917, + "loss": 0.9922, + "step": 8738 + }, + { + "epoch": 1.5559116809116809, + "grad_norm": 0.8761729598045349, + "learning_rate": 0.0001343495815698211, + "loss": 1.022, + "step": 8739 + }, + { + "epoch": 1.5560897435897436, + "grad_norm": 0.6901857852935791, + "learning_rate": 0.00013433643544848438, + "loss": 1.0668, + "step": 8740 + }, + { + "epoch": 1.5562678062678064, + "grad_norm": 0.6770836114883423, + "learning_rate": 0.00013432328865439647, + "loss": 0.9516, + "step": 8741 + }, + { + "epoch": 1.556445868945869, + "grad_norm": 0.6138805150985718, + "learning_rate": 0.00013431014118781505, + "loss": 0.8682, + "step": 8742 + }, + { + "epoch": 1.5566239316239316, + "grad_norm": 0.6796693801879883, + "learning_rate": 0.00013429699304899772, + "loss": 1.1132, + 
"step": 8743 + }, + { + "epoch": 1.5568019943019942, + "grad_norm": 0.6626394987106323, + "learning_rate": 0.000134283844238202, + "loss": 0.9273, + "step": 8744 + }, + { + "epoch": 1.556980056980057, + "grad_norm": 0.7088519334793091, + "learning_rate": 0.00013427069475568563, + "loss": 0.8915, + "step": 8745 + }, + { + "epoch": 1.5571581196581197, + "grad_norm": 0.6244857311248779, + "learning_rate": 0.0001342575446017061, + "loss": 0.9466, + "step": 8746 + }, + { + "epoch": 1.5573361823361824, + "grad_norm": 0.6969038248062134, + "learning_rate": 0.00013424439377652123, + "loss": 1.2307, + "step": 8747 + }, + { + "epoch": 1.5575142450142452, + "grad_norm": 0.6636740565299988, + "learning_rate": 0.0001342312422803886, + "loss": 0.9456, + "step": 8748 + }, + { + "epoch": 1.5576923076923077, + "grad_norm": 0.7863389253616333, + "learning_rate": 0.00013421809011356586, + "loss": 1.1888, + "step": 8749 + }, + { + "epoch": 1.5578703703703702, + "grad_norm": 0.7504058480262756, + "learning_rate": 0.00013420493727631073, + "loss": 1.2602, + "step": 8750 + }, + { + "epoch": 1.558048433048433, + "grad_norm": 0.7173139452934265, + "learning_rate": 0.00013419178376888085, + "loss": 1.0726, + "step": 8751 + }, + { + "epoch": 1.5582264957264957, + "grad_norm": 0.6517474055290222, + "learning_rate": 0.00013417862959153406, + "loss": 1.1299, + "step": 8752 + }, + { + "epoch": 1.5584045584045585, + "grad_norm": 0.8911739587783813, + "learning_rate": 0.00013416547474452803, + "loss": 1.105, + "step": 8753 + }, + { + "epoch": 1.5585826210826212, + "grad_norm": 0.7116649150848389, + "learning_rate": 0.00013415231922812049, + "loss": 0.8037, + "step": 8754 + }, + { + "epoch": 1.5587606837606838, + "grad_norm": 0.6935904026031494, + "learning_rate": 0.00013413916304256916, + "loss": 1.2778, + "step": 8755 + }, + { + "epoch": 1.5589387464387463, + "grad_norm": 0.652763843536377, + "learning_rate": 0.00013412600618813186, + "loss": 0.9188, + "step": 8756 + }, + { + "epoch": 
1.559116809116809, + "grad_norm": 0.6545276641845703, + "learning_rate": 0.00013411284866506637, + "loss": 1.0116, + "step": 8757 + }, + { + "epoch": 1.5592948717948718, + "grad_norm": 0.632165253162384, + "learning_rate": 0.0001340996904736305, + "loss": 0.8538, + "step": 8758 + }, + { + "epoch": 1.5594729344729346, + "grad_norm": 0.6719664931297302, + "learning_rate": 0.000134086531614082, + "loss": 1.1877, + "step": 8759 + }, + { + "epoch": 1.5596509971509973, + "grad_norm": 0.6691158413887024, + "learning_rate": 0.00013407337208667873, + "loss": 1.0411, + "step": 8760 + }, + { + "epoch": 1.5598290598290598, + "grad_norm": 0.7711479067802429, + "learning_rate": 0.0001340602118916785, + "loss": 0.9995, + "step": 8761 + }, + { + "epoch": 1.5600071225071224, + "grad_norm": 0.7229881286621094, + "learning_rate": 0.0001340470510293392, + "loss": 1.1751, + "step": 8762 + }, + { + "epoch": 1.5601851851851851, + "grad_norm": 0.7183271646499634, + "learning_rate": 0.00013403388949991864, + "loss": 0.9371, + "step": 8763 + }, + { + "epoch": 1.5603632478632479, + "grad_norm": 0.8142383098602295, + "learning_rate": 0.00013402072730367475, + "loss": 1.0199, + "step": 8764 + }, + { + "epoch": 1.5605413105413106, + "grad_norm": 0.6349362134933472, + "learning_rate": 0.00013400756444086534, + "loss": 0.8453, + "step": 8765 + }, + { + "epoch": 1.5607193732193734, + "grad_norm": 0.651900589466095, + "learning_rate": 0.00013399440091174834, + "loss": 0.8952, + "step": 8766 + }, + { + "epoch": 1.560897435897436, + "grad_norm": 0.6873346567153931, + "learning_rate": 0.00013398123671658172, + "loss": 0.9438, + "step": 8767 + }, + { + "epoch": 1.5610754985754984, + "grad_norm": 0.7404754757881165, + "learning_rate": 0.00013396807185562333, + "loss": 1.123, + "step": 8768 + }, + { + "epoch": 1.5612535612535612, + "grad_norm": 0.7449641227722168, + "learning_rate": 0.00013395490632913111, + "loss": 0.9407, + "step": 8769 + }, + { + "epoch": 1.561431623931624, + "grad_norm": 
0.7393384575843811, + "learning_rate": 0.0001339417401373631, + "loss": 1.0209, + "step": 8770 + }, + { + "epoch": 1.5616096866096867, + "grad_norm": 0.6787426471710205, + "learning_rate": 0.00013392857328057713, + "loss": 0.9768, + "step": 8771 + }, + { + "epoch": 1.5617877492877494, + "grad_norm": 0.6295693516731262, + "learning_rate": 0.00013391540575903127, + "loss": 0.9011, + "step": 8772 + }, + { + "epoch": 1.561965811965812, + "grad_norm": 0.7114503979682922, + "learning_rate": 0.00013390223757298354, + "loss": 1.0696, + "step": 8773 + }, + { + "epoch": 1.5621438746438745, + "grad_norm": 0.7540110349655151, + "learning_rate": 0.00013388906872269184, + "loss": 1.0071, + "step": 8774 + }, + { + "epoch": 1.5623219373219372, + "grad_norm": 0.6472305059432983, + "learning_rate": 0.00013387589920841423, + "loss": 1.105, + "step": 8775 + }, + { + "epoch": 1.5625, + "grad_norm": 0.6936793327331543, + "learning_rate": 0.00013386272903040874, + "loss": 0.885, + "step": 8776 + }, + { + "epoch": 1.5626780626780628, + "grad_norm": 0.7487989068031311, + "learning_rate": 0.00013384955818893343, + "loss": 0.7842, + "step": 8777 + }, + { + "epoch": 1.5628561253561255, + "grad_norm": 0.6109505891799927, + "learning_rate": 0.00013383638668424633, + "loss": 0.9461, + "step": 8778 + }, + { + "epoch": 1.563034188034188, + "grad_norm": 0.6650055646896362, + "learning_rate": 0.00013382321451660558, + "loss": 1.0463, + "step": 8779 + }, + { + "epoch": 1.5632122507122506, + "grad_norm": 0.7147329449653625, + "learning_rate": 0.00013381004168626915, + "loss": 0.946, + "step": 8780 + }, + { + "epoch": 1.5633903133903133, + "grad_norm": 0.6919382810592651, + "learning_rate": 0.00013379686819349522, + "loss": 0.8946, + "step": 8781 + }, + { + "epoch": 1.563568376068376, + "grad_norm": 0.7339401245117188, + "learning_rate": 0.00013378369403854184, + "loss": 0.9625, + "step": 8782 + }, + { + "epoch": 1.5637464387464388, + "grad_norm": 0.6337129473686218, + "learning_rate": 
0.00013377051922166717, + "loss": 1.0854, + "step": 8783 + }, + { + "epoch": 1.5639245014245016, + "grad_norm": 0.7301266193389893, + "learning_rate": 0.0001337573437431293, + "loss": 1.017, + "step": 8784 + }, + { + "epoch": 1.564102564102564, + "grad_norm": 0.689540684223175, + "learning_rate": 0.00013374416760318644, + "loss": 0.8734, + "step": 8785 + }, + { + "epoch": 1.5642806267806266, + "grad_norm": 0.7121307849884033, + "learning_rate": 0.0001337309908020967, + "loss": 1.0827, + "step": 8786 + }, + { + "epoch": 1.5644586894586894, + "grad_norm": 0.6715386509895325, + "learning_rate": 0.00013371781334011826, + "loss": 0.946, + "step": 8787 + }, + { + "epoch": 1.5646367521367521, + "grad_norm": 0.6895501613616943, + "learning_rate": 0.00013370463521750932, + "loss": 1.1113, + "step": 8788 + }, + { + "epoch": 1.5648148148148149, + "grad_norm": 0.6592531204223633, + "learning_rate": 0.00013369145643452805, + "loss": 0.9952, + "step": 8789 + }, + { + "epoch": 1.5649928774928776, + "grad_norm": 0.7495190501213074, + "learning_rate": 0.0001336782769914327, + "loss": 1.0936, + "step": 8790 + }, + { + "epoch": 1.5651709401709402, + "grad_norm": 0.7273977398872375, + "learning_rate": 0.00013366509688848147, + "loss": 1.1749, + "step": 8791 + }, + { + "epoch": 1.5653490028490027, + "grad_norm": 0.6447354555130005, + "learning_rate": 0.0001336519161259326, + "loss": 0.8638, + "step": 8792 + }, + { + "epoch": 1.5655270655270654, + "grad_norm": 0.6572020053863525, + "learning_rate": 0.00013363873470404432, + "loss": 0.8005, + "step": 8793 + }, + { + "epoch": 1.5657051282051282, + "grad_norm": 0.676418662071228, + "learning_rate": 0.00013362555262307491, + "loss": 0.7651, + "step": 8794 + }, + { + "epoch": 1.565883190883191, + "grad_norm": 0.6886745095252991, + "learning_rate": 0.0001336123698832827, + "loss": 1.0765, + "step": 8795 + }, + { + "epoch": 1.5660612535612537, + "grad_norm": 0.8134182095527649, + "learning_rate": 0.00013359918648492584, + "loss": 1.2228, + 
"step": 8796 + }, + { + "epoch": 1.5662393162393162, + "grad_norm": 0.7210384011268616, + "learning_rate": 0.00013358600242826277, + "loss": 0.8247, + "step": 8797 + }, + { + "epoch": 1.5664173789173788, + "grad_norm": 0.7086136341094971, + "learning_rate": 0.00013357281771355175, + "loss": 1.0323, + "step": 8798 + }, + { + "epoch": 1.5665954415954415, + "grad_norm": 0.7419785857200623, + "learning_rate": 0.0001335596323410511, + "loss": 1.213, + "step": 8799 + }, + { + "epoch": 1.5667735042735043, + "grad_norm": 0.6390291452407837, + "learning_rate": 0.0001335464463110192, + "loss": 1.0403, + "step": 8800 + }, + { + "epoch": 1.566951566951567, + "grad_norm": 0.6111941337585449, + "learning_rate": 0.00013353325962371434, + "loss": 0.9747, + "step": 8801 + }, + { + "epoch": 1.5671296296296298, + "grad_norm": 0.6792671084403992, + "learning_rate": 0.00013352007227939488, + "loss": 1.1179, + "step": 8802 + }, + { + "epoch": 1.5673076923076923, + "grad_norm": 0.6656535863876343, + "learning_rate": 0.0001335068842783193, + "loss": 0.9214, + "step": 8803 + }, + { + "epoch": 1.5674857549857548, + "grad_norm": 0.6910907626152039, + "learning_rate": 0.0001334936956207459, + "loss": 1.0609, + "step": 8804 + }, + { + "epoch": 1.5676638176638176, + "grad_norm": 0.65049147605896, + "learning_rate": 0.00013348050630693315, + "loss": 0.7189, + "step": 8805 + }, + { + "epoch": 1.5678418803418803, + "grad_norm": 0.6258065104484558, + "learning_rate": 0.0001334673163371394, + "loss": 1.0683, + "step": 8806 + }, + { + "epoch": 1.568019943019943, + "grad_norm": 0.7518934607505798, + "learning_rate": 0.00013345412571162305, + "loss": 1.2415, + "step": 8807 + }, + { + "epoch": 1.5681980056980058, + "grad_norm": 0.7395275235176086, + "learning_rate": 0.00013344093443064267, + "loss": 0.9153, + "step": 8808 + }, + { + "epoch": 1.5683760683760684, + "grad_norm": 0.6789839267730713, + "learning_rate": 0.00013342774249445663, + "loss": 0.8051, + "step": 8809 + }, + { + "epoch": 
1.568554131054131, + "grad_norm": 0.786247193813324, + "learning_rate": 0.00013341454990332342, + "loss": 1.203, + "step": 8810 + }, + { + "epoch": 1.5687321937321936, + "grad_norm": 0.6858161687850952, + "learning_rate": 0.00013340135665750153, + "loss": 0.9494, + "step": 8811 + }, + { + "epoch": 1.5689102564102564, + "grad_norm": 0.7245797514915466, + "learning_rate": 0.0001333881627572494, + "loss": 1.0544, + "step": 8812 + }, + { + "epoch": 1.5690883190883191, + "grad_norm": 0.6176164150238037, + "learning_rate": 0.00013337496820282563, + "loss": 0.9084, + "step": 8813 + }, + { + "epoch": 1.569266381766382, + "grad_norm": 0.7342953681945801, + "learning_rate": 0.00013336177299448868, + "loss": 1.0006, + "step": 8814 + }, + { + "epoch": 1.5694444444444444, + "grad_norm": 0.5183523297309875, + "learning_rate": 0.00013334857713249708, + "loss": 0.6295, + "step": 8815 + }, + { + "epoch": 1.5696225071225072, + "grad_norm": 0.6664513349533081, + "learning_rate": 0.00013333538061710936, + "loss": 0.7569, + "step": 8816 + }, + { + "epoch": 1.5698005698005697, + "grad_norm": 0.7051160931587219, + "learning_rate": 0.0001333221834485841, + "loss": 0.9917, + "step": 8817 + }, + { + "epoch": 1.5699786324786325, + "grad_norm": 0.7888057231903076, + "learning_rate": 0.0001333089856271799, + "loss": 1.0337, + "step": 8818 + }, + { + "epoch": 1.5701566951566952, + "grad_norm": 0.6796144247055054, + "learning_rate": 0.00013329578715315534, + "loss": 1.0915, + "step": 8819 + }, + { + "epoch": 1.570334757834758, + "grad_norm": 0.7442883849143982, + "learning_rate": 0.000133282588026769, + "loss": 1.1695, + "step": 8820 + }, + { + "epoch": 1.5705128205128205, + "grad_norm": 0.6164735555648804, + "learning_rate": 0.00013326938824827946, + "loss": 1.0143, + "step": 8821 + }, + { + "epoch": 1.5706908831908832, + "grad_norm": 0.6526502966880798, + "learning_rate": 0.00013325618781794539, + "loss": 0.8402, + "step": 8822 + }, + { + "epoch": 1.5708689458689458, + "grad_norm": 
0.6376087069511414, + "learning_rate": 0.00013324298673602535, + "loss": 0.7582, + "step": 8823 + }, + { + "epoch": 1.5710470085470085, + "grad_norm": 0.6888708472251892, + "learning_rate": 0.00013322978500277807, + "loss": 0.997, + "step": 8824 + }, + { + "epoch": 1.5712250712250713, + "grad_norm": 0.553656280040741, + "learning_rate": 0.0001332165826184622, + "loss": 0.6917, + "step": 8825 + }, + { + "epoch": 1.571403133903134, + "grad_norm": 0.643285870552063, + "learning_rate": 0.0001332033795833364, + "loss": 0.8689, + "step": 8826 + }, + { + "epoch": 1.5715811965811965, + "grad_norm": 0.6210280060768127, + "learning_rate": 0.00013319017589765933, + "loss": 0.9047, + "step": 8827 + }, + { + "epoch": 1.5717592592592593, + "grad_norm": 0.7612366676330566, + "learning_rate": 0.0001331769715616897, + "loss": 0.9818, + "step": 8828 + }, + { + "epoch": 1.5719373219373218, + "grad_norm": 0.5970702171325684, + "learning_rate": 0.00013316376657568628, + "loss": 0.82, + "step": 8829 + }, + { + "epoch": 1.5721153846153846, + "grad_norm": 0.7182583808898926, + "learning_rate": 0.0001331505609399077, + "loss": 1.0633, + "step": 8830 + }, + { + "epoch": 1.5722934472934473, + "grad_norm": 0.7230739593505859, + "learning_rate": 0.00013313735465461278, + "loss": 0.977, + "step": 8831 + }, + { + "epoch": 1.57247150997151, + "grad_norm": 0.6752985119819641, + "learning_rate": 0.00013312414772006018, + "loss": 0.9666, + "step": 8832 + }, + { + "epoch": 1.5726495726495726, + "grad_norm": 0.7724275588989258, + "learning_rate": 0.00013311094013650877, + "loss": 1.148, + "step": 8833 + }, + { + "epoch": 1.5728276353276354, + "grad_norm": 0.7216386198997498, + "learning_rate": 0.00013309773190421724, + "loss": 0.9935, + "step": 8834 + }, + { + "epoch": 1.573005698005698, + "grad_norm": 0.6422320008277893, + "learning_rate": 0.0001330845230234444, + "loss": 0.9383, + "step": 8835 + }, + { + "epoch": 1.5731837606837606, + "grad_norm": 0.669538140296936, + "learning_rate": 
0.00013307131349444906, + "loss": 1.0866, + "step": 8836 + }, + { + "epoch": 1.5733618233618234, + "grad_norm": 0.6994584798812866, + "learning_rate": 0.00013305810331749003, + "loss": 0.7882, + "step": 8837 + }, + { + "epoch": 1.5735398860398861, + "grad_norm": 0.8094269633293152, + "learning_rate": 0.00013304489249282617, + "loss": 1.2316, + "step": 8838 + }, + { + "epoch": 1.5737179487179487, + "grad_norm": 0.7180120348930359, + "learning_rate": 0.00013303168102071625, + "loss": 0.9795, + "step": 8839 + }, + { + "epoch": 1.5738960113960114, + "grad_norm": 0.6191438436508179, + "learning_rate": 0.00013301846890141918, + "loss": 0.8957, + "step": 8840 + }, + { + "epoch": 1.574074074074074, + "grad_norm": 0.671094536781311, + "learning_rate": 0.00013300525613519382, + "loss": 1.059, + "step": 8841 + }, + { + "epoch": 1.5742521367521367, + "grad_norm": 0.8062624931335449, + "learning_rate": 0.000132992042722299, + "loss": 0.9782, + "step": 8842 + }, + { + "epoch": 1.5744301994301995, + "grad_norm": 0.6674807667732239, + "learning_rate": 0.00013297882866299362, + "loss": 0.7765, + "step": 8843 + }, + { + "epoch": 1.5746082621082622, + "grad_norm": 0.6369131803512573, + "learning_rate": 0.00013296561395753664, + "loss": 0.97, + "step": 8844 + }, + { + "epoch": 1.5747863247863247, + "grad_norm": 0.7913636565208435, + "learning_rate": 0.00013295239860618691, + "loss": 1.0458, + "step": 8845 + }, + { + "epoch": 1.5749643874643875, + "grad_norm": 0.6722261905670166, + "learning_rate": 0.0001329391826092034, + "loss": 1.1118, + "step": 8846 + }, + { + "epoch": 1.57514245014245, + "grad_norm": 0.6936299800872803, + "learning_rate": 0.00013292596596684502, + "loss": 1.009, + "step": 8847 + }, + { + "epoch": 1.5753205128205128, + "grad_norm": 0.7009961009025574, + "learning_rate": 0.00013291274867937073, + "loss": 0.9904, + "step": 8848 + }, + { + "epoch": 1.5754985754985755, + "grad_norm": 0.6900732517242432, + "learning_rate": 0.0001328995307470395, + "loss": 1.0488, + 
"step": 8849 + }, + { + "epoch": 1.5756766381766383, + "grad_norm": 0.6389018297195435, + "learning_rate": 0.00013288631217011032, + "loss": 0.9444, + "step": 8850 + }, + { + "epoch": 1.5758547008547008, + "grad_norm": 0.6370900869369507, + "learning_rate": 0.00013287309294884216, + "loss": 0.7465, + "step": 8851 + }, + { + "epoch": 1.5760327635327636, + "grad_norm": 0.6463848948478699, + "learning_rate": 0.00013285987308349405, + "loss": 0.896, + "step": 8852 + }, + { + "epoch": 1.576210826210826, + "grad_norm": 0.6022449731826782, + "learning_rate": 0.00013284665257432495, + "loss": 0.8822, + "step": 8853 + }, + { + "epoch": 1.5763888888888888, + "grad_norm": 0.768189013004303, + "learning_rate": 0.00013283343142159396, + "loss": 0.9862, + "step": 8854 + }, + { + "epoch": 1.5765669515669516, + "grad_norm": 0.6642358303070068, + "learning_rate": 0.00013282020962556007, + "loss": 1.0713, + "step": 8855 + }, + { + "epoch": 1.5767450142450143, + "grad_norm": 0.6883034706115723, + "learning_rate": 0.00013280698718648234, + "loss": 1.0351, + "step": 8856 + }, + { + "epoch": 1.5769230769230769, + "grad_norm": 0.602808952331543, + "learning_rate": 0.00013279376410461988, + "loss": 0.7615, + "step": 8857 + }, + { + "epoch": 1.5771011396011396, + "grad_norm": 0.5968614220619202, + "learning_rate": 0.0001327805403802317, + "loss": 0.9443, + "step": 8858 + }, + { + "epoch": 1.5772792022792022, + "grad_norm": 0.7314837574958801, + "learning_rate": 0.00013276731601357696, + "loss": 0.8784, + "step": 8859 + }, + { + "epoch": 1.577457264957265, + "grad_norm": 0.619754433631897, + "learning_rate": 0.0001327540910049147, + "loss": 0.954, + "step": 8860 + }, + { + "epoch": 1.5776353276353277, + "grad_norm": 0.7195139527320862, + "learning_rate": 0.0001327408653545041, + "loss": 1.0227, + "step": 8861 + }, + { + "epoch": 1.5778133903133904, + "grad_norm": 0.6796214580535889, + "learning_rate": 0.0001327276390626042, + "loss": 1.0593, + "step": 8862 + }, + { + "epoch": 
1.577991452991453, + "grad_norm": 0.6576255559921265, + "learning_rate": 0.00013271441212947427, + "loss": 0.7921, + "step": 8863 + }, + { + "epoch": 1.5781695156695157, + "grad_norm": 0.7222092151641846, + "learning_rate": 0.00013270118455537336, + "loss": 1.0545, + "step": 8864 + }, + { + "epoch": 1.5783475783475782, + "grad_norm": 0.7159737348556519, + "learning_rate": 0.00013268795634056066, + "loss": 0.9664, + "step": 8865 + }, + { + "epoch": 1.578525641025641, + "grad_norm": 0.7120481133460999, + "learning_rate": 0.00013267472748529536, + "loss": 1.0148, + "step": 8866 + }, + { + "epoch": 1.5787037037037037, + "grad_norm": 0.7353253364562988, + "learning_rate": 0.00013266149798983666, + "loss": 0.9288, + "step": 8867 + }, + { + "epoch": 1.5788817663817665, + "grad_norm": 0.6652441620826721, + "learning_rate": 0.00013264826785444375, + "loss": 0.8246, + "step": 8868 + }, + { + "epoch": 1.5790598290598292, + "grad_norm": 0.7254189252853394, + "learning_rate": 0.00013263503707937584, + "loss": 0.9892, + "step": 8869 + }, + { + "epoch": 1.5792378917378918, + "grad_norm": 0.6305747032165527, + "learning_rate": 0.00013262180566489223, + "loss": 0.8931, + "step": 8870 + }, + { + "epoch": 1.5794159544159543, + "grad_norm": 0.6560617089271545, + "learning_rate": 0.00013260857361125205, + "loss": 0.9245, + "step": 8871 + }, + { + "epoch": 1.579594017094017, + "grad_norm": 0.7304151654243469, + "learning_rate": 0.00013259534091871462, + "loss": 1.009, + "step": 8872 + }, + { + "epoch": 1.5797720797720798, + "grad_norm": 0.782636821269989, + "learning_rate": 0.00013258210758753918, + "loss": 1.1123, + "step": 8873 + }, + { + "epoch": 1.5799501424501425, + "grad_norm": 0.6992011070251465, + "learning_rate": 0.00013256887361798504, + "loss": 1.099, + "step": 8874 + }, + { + "epoch": 1.5801282051282053, + "grad_norm": 0.7159731984138489, + "learning_rate": 0.00013255563901031148, + "loss": 1.0257, + "step": 8875 + }, + { + "epoch": 1.5803062678062678, + "grad_norm": 
0.6055454611778259, + "learning_rate": 0.0001325424037647778, + "loss": 0.9199, + "step": 8876 + }, + { + "epoch": 1.5804843304843303, + "grad_norm": 0.6838310360908508, + "learning_rate": 0.00013252916788164334, + "loss": 0.8644, + "step": 8877 + }, + { + "epoch": 1.580662393162393, + "grad_norm": 0.7067445516586304, + "learning_rate": 0.00013251593136116738, + "loss": 1.0285, + "step": 8878 + }, + { + "epoch": 1.5808404558404558, + "grad_norm": 0.7021774649620056, + "learning_rate": 0.00013250269420360928, + "loss": 1.1263, + "step": 8879 + }, + { + "epoch": 1.5810185185185186, + "grad_norm": 0.6586757302284241, + "learning_rate": 0.00013248945640922843, + "loss": 0.906, + "step": 8880 + }, + { + "epoch": 1.5811965811965814, + "grad_norm": 0.6673910021781921, + "learning_rate": 0.00013247621797828418, + "loss": 1.0652, + "step": 8881 + }, + { + "epoch": 1.5813746438746439, + "grad_norm": 0.6763964295387268, + "learning_rate": 0.00013246297891103588, + "loss": 1.0227, + "step": 8882 + }, + { + "epoch": 1.5815527065527064, + "grad_norm": 0.6536892056465149, + "learning_rate": 0.00013244973920774298, + "loss": 0.9026, + "step": 8883 + }, + { + "epoch": 1.5817307692307692, + "grad_norm": 0.8010411858558655, + "learning_rate": 0.0001324364988686648, + "loss": 1.1167, + "step": 8884 + }, + { + "epoch": 1.581908831908832, + "grad_norm": 0.8159251809120178, + "learning_rate": 0.00013242325789406082, + "loss": 1.233, + "step": 8885 + }, + { + "epoch": 1.5820868945868947, + "grad_norm": 0.6487745046615601, + "learning_rate": 0.00013241001628419048, + "loss": 0.9888, + "step": 8886 + }, + { + "epoch": 1.5822649572649574, + "grad_norm": 0.6750285029411316, + "learning_rate": 0.00013239677403931318, + "loss": 0.8874, + "step": 8887 + }, + { + "epoch": 1.58244301994302, + "grad_norm": 0.7164602875709534, + "learning_rate": 0.0001323835311596884, + "loss": 1.2029, + "step": 8888 + }, + { + "epoch": 1.5826210826210825, + "grad_norm": 0.6081351041793823, + "learning_rate": 
0.00013237028764557558, + "loss": 0.9593, + "step": 8889 + }, + { + "epoch": 1.5827991452991452, + "grad_norm": 0.7235409021377563, + "learning_rate": 0.00013235704349723424, + "loss": 1.5324, + "step": 8890 + }, + { + "epoch": 1.582977207977208, + "grad_norm": 0.6658480763435364, + "learning_rate": 0.0001323437987149238, + "loss": 0.9756, + "step": 8891 + }, + { + "epoch": 1.5831552706552707, + "grad_norm": 0.7924265265464783, + "learning_rate": 0.00013233055329890387, + "loss": 0.9329, + "step": 8892 + }, + { + "epoch": 1.5833333333333335, + "grad_norm": 0.6262093186378479, + "learning_rate": 0.0001323173072494339, + "loss": 0.8288, + "step": 8893 + }, + { + "epoch": 1.583511396011396, + "grad_norm": 0.6851989030838013, + "learning_rate": 0.0001323040605667734, + "loss": 0.9822, + "step": 8894 + }, + { + "epoch": 1.5836894586894585, + "grad_norm": 0.6963728666305542, + "learning_rate": 0.00013229081325118194, + "loss": 1.0416, + "step": 8895 + }, + { + "epoch": 1.5838675213675213, + "grad_norm": 0.6017457842826843, + "learning_rate": 0.0001322775653029191, + "loss": 0.8123, + "step": 8896 + }, + { + "epoch": 1.584045584045584, + "grad_norm": 0.7396472096443176, + "learning_rate": 0.0001322643167222444, + "loss": 1.0339, + "step": 8897 + }, + { + "epoch": 1.5842236467236468, + "grad_norm": 0.6360299587249756, + "learning_rate": 0.00013225106750941744, + "loss": 0.9463, + "step": 8898 + }, + { + "epoch": 1.5844017094017095, + "grad_norm": 0.6297624111175537, + "learning_rate": 0.00013223781766469783, + "loss": 0.9921, + "step": 8899 + }, + { + "epoch": 1.584579772079772, + "grad_norm": 0.7722037434577942, + "learning_rate": 0.0001322245671883451, + "loss": 0.8394, + "step": 8900 + }, + { + "epoch": 1.5847578347578346, + "grad_norm": 0.677364706993103, + "learning_rate": 0.00013221131608061895, + "loss": 1.0954, + "step": 8901 + }, + { + "epoch": 1.5849358974358974, + "grad_norm": 0.6954908967018127, + "learning_rate": 0.00013219806434177899, + "loss": 1.0637, + 
"step": 8902 + }, + { + "epoch": 1.58511396011396, + "grad_norm": 0.7079192996025085, + "learning_rate": 0.00013218481197208484, + "loss": 1.039, + "step": 8903 + }, + { + "epoch": 1.5852920227920229, + "grad_norm": 0.7070451378822327, + "learning_rate": 0.00013217155897179611, + "loss": 1.0025, + "step": 8904 + }, + { + "epoch": 1.5854700854700856, + "grad_norm": 0.6940776705741882, + "learning_rate": 0.00013215830534117257, + "loss": 0.8039, + "step": 8905 + }, + { + "epoch": 1.5856481481481481, + "grad_norm": 0.6545892953872681, + "learning_rate": 0.00013214505108047382, + "loss": 0.9347, + "step": 8906 + }, + { + "epoch": 1.5858262108262107, + "grad_norm": 0.6769635081291199, + "learning_rate": 0.00013213179618995957, + "loss": 1.0321, + "step": 8907 + }, + { + "epoch": 1.5860042735042734, + "grad_norm": 0.6505448222160339, + "learning_rate": 0.00013211854066988953, + "loss": 1.0558, + "step": 8908 + }, + { + "epoch": 1.5861823361823362, + "grad_norm": 0.6764090061187744, + "learning_rate": 0.00013210528452052336, + "loss": 0.8407, + "step": 8909 + }, + { + "epoch": 1.586360398860399, + "grad_norm": 0.6454851627349854, + "learning_rate": 0.00013209202774212088, + "loss": 0.7439, + "step": 8910 + }, + { + "epoch": 1.5865384615384617, + "grad_norm": 0.6911695599555969, + "learning_rate": 0.00013207877033494177, + "loss": 0.9625, + "step": 8911 + }, + { + "epoch": 1.5867165242165242, + "grad_norm": 0.7405226826667786, + "learning_rate": 0.0001320655122992458, + "loss": 1.054, + "step": 8912 + }, + { + "epoch": 1.5868945868945867, + "grad_norm": 0.7362869381904602, + "learning_rate": 0.00013205225363529274, + "loss": 1.0516, + "step": 8913 + }, + { + "epoch": 1.5870726495726495, + "grad_norm": 0.6923766136169434, + "learning_rate": 0.0001320389943433423, + "loss": 1.2323, + "step": 8914 + }, + { + "epoch": 1.5872507122507122, + "grad_norm": 0.7980395555496216, + "learning_rate": 0.00013202573442365435, + "loss": 1.0229, + "step": 8915 + }, + { + "epoch": 
1.587428774928775, + "grad_norm": 0.7211610078811646, + "learning_rate": 0.00013201247387648868, + "loss": 1.0666, + "step": 8916 + }, + { + "epoch": 1.5876068376068377, + "grad_norm": 0.6728795766830444, + "learning_rate": 0.00013199921270210506, + "loss": 1.0322, + "step": 8917 + }, + { + "epoch": 1.5877849002849003, + "grad_norm": 0.6226436495780945, + "learning_rate": 0.00013198595090076337, + "loss": 1.0517, + "step": 8918 + }, + { + "epoch": 1.5879629629629628, + "grad_norm": 0.6396511197090149, + "learning_rate": 0.0001319726884727234, + "loss": 0.8662, + "step": 8919 + }, + { + "epoch": 1.5881410256410255, + "grad_norm": 0.5664374828338623, + "learning_rate": 0.00013195942541824497, + "loss": 0.6601, + "step": 8920 + }, + { + "epoch": 1.5883190883190883, + "grad_norm": 0.6556946039199829, + "learning_rate": 0.00013194616173758806, + "loss": 0.9662, + "step": 8921 + }, + { + "epoch": 1.588497150997151, + "grad_norm": 0.7332060933113098, + "learning_rate": 0.00013193289743101245, + "loss": 0.7687, + "step": 8922 + }, + { + "epoch": 1.5886752136752138, + "grad_norm": 0.6103306412696838, + "learning_rate": 0.00013191963249877805, + "loss": 0.8329, + "step": 8923 + }, + { + "epoch": 1.5888532763532763, + "grad_norm": 0.63165283203125, + "learning_rate": 0.00013190636694114475, + "loss": 0.8336, + "step": 8924 + }, + { + "epoch": 1.589031339031339, + "grad_norm": 0.6955820322036743, + "learning_rate": 0.00013189310075837246, + "loss": 1.0457, + "step": 8925 + }, + { + "epoch": 1.5892094017094016, + "grad_norm": 0.6911605596542358, + "learning_rate": 0.00013187983395072114, + "loss": 0.9389, + "step": 8926 + }, + { + "epoch": 1.5893874643874644, + "grad_norm": 0.6493414640426636, + "learning_rate": 0.00013186656651845068, + "loss": 0.9821, + "step": 8927 + }, + { + "epoch": 1.5895655270655271, + "grad_norm": 0.6168226599693298, + "learning_rate": 0.00013185329846182107, + "loss": 1.0259, + "step": 8928 + }, + { + "epoch": 1.5897435897435899, + "grad_norm": 
0.6460188627243042, + "learning_rate": 0.0001318400297810922, + "loss": 0.9836, + "step": 8929 + }, + { + "epoch": 1.5899216524216524, + "grad_norm": 0.6630695462226868, + "learning_rate": 0.0001318267604765241, + "loss": 0.8936, + "step": 8930 + }, + { + "epoch": 1.5900997150997151, + "grad_norm": 0.6308651566505432, + "learning_rate": 0.00013181349054837676, + "loss": 0.9583, + "step": 8931 + }, + { + "epoch": 1.5902777777777777, + "grad_norm": 0.6508499979972839, + "learning_rate": 0.00013180021999691018, + "loss": 0.7647, + "step": 8932 + }, + { + "epoch": 1.5904558404558404, + "grad_norm": 0.6625795960426331, + "learning_rate": 0.00013178694882238432, + "loss": 1.0329, + "step": 8933 + }, + { + "epoch": 1.5906339031339032, + "grad_norm": 0.6721987128257751, + "learning_rate": 0.00013177367702505924, + "loss": 0.9377, + "step": 8934 + }, + { + "epoch": 1.590811965811966, + "grad_norm": 0.7295519709587097, + "learning_rate": 0.00013176040460519497, + "loss": 0.9396, + "step": 8935 + }, + { + "epoch": 1.5909900284900285, + "grad_norm": 0.6673944592475891, + "learning_rate": 0.0001317471315630515, + "loss": 1.0284, + "step": 8936 + }, + { + "epoch": 1.5911680911680912, + "grad_norm": 0.6858960390090942, + "learning_rate": 0.00013173385789888898, + "loss": 1.2022, + "step": 8937 + }, + { + "epoch": 1.5913461538461537, + "grad_norm": 0.5836796164512634, + "learning_rate": 0.00013172058361296743, + "loss": 1.0078, + "step": 8938 + }, + { + "epoch": 1.5915242165242165, + "grad_norm": 0.7732513546943665, + "learning_rate": 0.00013170730870554694, + "loss": 1.0912, + "step": 8939 + }, + { + "epoch": 1.5917022792022792, + "grad_norm": 0.7095892429351807, + "learning_rate": 0.0001316940331768876, + "loss": 1.0506, + "step": 8940 + }, + { + "epoch": 1.591880341880342, + "grad_norm": 0.757534384727478, + "learning_rate": 0.00013168075702724952, + "loss": 1.036, + "step": 8941 + }, + { + "epoch": 1.5920584045584045, + "grad_norm": 0.6719361543655396, + "learning_rate": 
0.00013166748025689282, + "loss": 0.9406, + "step": 8942 + }, + { + "epoch": 1.5922364672364673, + "grad_norm": 0.6955735087394714, + "learning_rate": 0.00013165420286607763, + "loss": 0.9325, + "step": 8943 + }, + { + "epoch": 1.5924145299145298, + "grad_norm": 0.6810322999954224, + "learning_rate": 0.00013164092485506407, + "loss": 1.0402, + "step": 8944 + }, + { + "epoch": 1.5925925925925926, + "grad_norm": 0.6346224546432495, + "learning_rate": 0.00013162764622411233, + "loss": 0.9725, + "step": 8945 + }, + { + "epoch": 1.5927706552706553, + "grad_norm": 0.728705883026123, + "learning_rate": 0.00013161436697348258, + "loss": 0.9665, + "step": 8946 + }, + { + "epoch": 1.592948717948718, + "grad_norm": 0.6838595271110535, + "learning_rate": 0.00013160108710343494, + "loss": 0.9771, + "step": 8947 + }, + { + "epoch": 1.5931267806267806, + "grad_norm": 0.7052602767944336, + "learning_rate": 0.00013158780661422966, + "loss": 0.8819, + "step": 8948 + }, + { + "epoch": 1.5933048433048433, + "grad_norm": 0.7237630486488342, + "learning_rate": 0.00013157452550612697, + "loss": 1.0609, + "step": 8949 + }, + { + "epoch": 1.5934829059829059, + "grad_norm": 0.6554936766624451, + "learning_rate": 0.00013156124377938699, + "loss": 0.8592, + "step": 8950 + }, + { + "epoch": 1.5936609686609686, + "grad_norm": 0.6125665307044983, + "learning_rate": 0.00013154796143427, + "loss": 0.8399, + "step": 8951 + }, + { + "epoch": 1.5938390313390314, + "grad_norm": 0.6930897235870361, + "learning_rate": 0.0001315346784710363, + "loss": 0.9965, + "step": 8952 + }, + { + "epoch": 1.5940170940170941, + "grad_norm": 0.7808064818382263, + "learning_rate": 0.00013152139488994605, + "loss": 1.0527, + "step": 8953 + }, + { + "epoch": 1.5941951566951567, + "grad_norm": 0.6125522255897522, + "learning_rate": 0.0001315081106912595, + "loss": 1.1159, + "step": 8954 + }, + { + "epoch": 1.5943732193732194, + "grad_norm": 0.5863428711891174, + "learning_rate": 0.00013149482587523703, + "loss": 0.84, + 
"step": 8955 + }, + { + "epoch": 1.594551282051282, + "grad_norm": 0.7170202732086182, + "learning_rate": 0.00013148154044213882, + "loss": 1.0821, + "step": 8956 + }, + { + "epoch": 1.5947293447293447, + "grad_norm": 0.6409463882446289, + "learning_rate": 0.00013146825439222528, + "loss": 1.0097, + "step": 8957 + }, + { + "epoch": 1.5949074074074074, + "grad_norm": 0.7037690281867981, + "learning_rate": 0.00013145496772575666, + "loss": 1.1511, + "step": 8958 + }, + { + "epoch": 1.5950854700854702, + "grad_norm": 0.6400953531265259, + "learning_rate": 0.00013144168044299326, + "loss": 1.0809, + "step": 8959 + }, + { + "epoch": 1.5952635327635327, + "grad_norm": 0.6129940152168274, + "learning_rate": 0.00013142839254419545, + "loss": 0.8481, + "step": 8960 + }, + { + "epoch": 1.5954415954415955, + "grad_norm": 0.7452271580696106, + "learning_rate": 0.00013141510402962358, + "loss": 1.0649, + "step": 8961 + }, + { + "epoch": 1.595619658119658, + "grad_norm": 0.7407623529434204, + "learning_rate": 0.000131401814899538, + "loss": 0.9084, + "step": 8962 + }, + { + "epoch": 1.5957977207977208, + "grad_norm": 0.7103050947189331, + "learning_rate": 0.0001313885251541991, + "loss": 0.946, + "step": 8963 + }, + { + "epoch": 1.5959757834757835, + "grad_norm": 0.5566636323928833, + "learning_rate": 0.00013137523479386727, + "loss": 0.6781, + "step": 8964 + }, + { + "epoch": 1.5961538461538463, + "grad_norm": 0.8137457966804504, + "learning_rate": 0.00013136194381880288, + "loss": 0.9273, + "step": 8965 + }, + { + "epoch": 1.5963319088319088, + "grad_norm": 0.779330849647522, + "learning_rate": 0.0001313486522292663, + "loss": 1.1105, + "step": 8966 + }, + { + "epoch": 1.5965099715099715, + "grad_norm": 0.6807126998901367, + "learning_rate": 0.00013133536002551808, + "loss": 1.0728, + "step": 8967 + }, + { + "epoch": 1.596688034188034, + "grad_norm": 0.7371507287025452, + "learning_rate": 0.00013132206720781853, + "loss": 0.979, + "step": 8968 + }, + { + "epoch": 
1.5968660968660968, + "grad_norm": 0.6811465620994568, + "learning_rate": 0.00013130877377642814, + "loss": 0.9821, + "step": 8969 + }, + { + "epoch": 1.5970441595441596, + "grad_norm": 0.6732743978500366, + "learning_rate": 0.00013129547973160738, + "loss": 0.8511, + "step": 8970 + }, + { + "epoch": 1.5972222222222223, + "grad_norm": 0.594901978969574, + "learning_rate": 0.0001312821850736167, + "loss": 0.9674, + "step": 8971 + }, + { + "epoch": 1.5974002849002849, + "grad_norm": 0.6743764281272888, + "learning_rate": 0.00013126888980271657, + "loss": 0.9268, + "step": 8972 + }, + { + "epoch": 1.5975783475783476, + "grad_norm": 0.7532161474227905, + "learning_rate": 0.00013125559391916752, + "loss": 1.0474, + "step": 8973 + }, + { + "epoch": 1.5977564102564101, + "grad_norm": 0.6331499814987183, + "learning_rate": 0.00013124229742323, + "loss": 1.05, + "step": 8974 + }, + { + "epoch": 1.5979344729344729, + "grad_norm": 0.7418690323829651, + "learning_rate": 0.0001312290003151646, + "loss": 0.9475, + "step": 8975 + }, + { + "epoch": 1.5981125356125356, + "grad_norm": 0.6511179804801941, + "learning_rate": 0.0001312157025952318, + "loss": 0.9206, + "step": 8976 + }, + { + "epoch": 1.5982905982905984, + "grad_norm": 0.6380775570869446, + "learning_rate": 0.00013120240426369215, + "loss": 0.9953, + "step": 8977 + }, + { + "epoch": 1.598468660968661, + "grad_norm": 0.8483675122261047, + "learning_rate": 0.00013118910532080623, + "loss": 0.9454, + "step": 8978 + }, + { + "epoch": 1.5986467236467237, + "grad_norm": 0.6700518727302551, + "learning_rate": 0.00013117580576683455, + "loss": 1.0413, + "step": 8979 + }, + { + "epoch": 1.5988247863247862, + "grad_norm": 0.7750083208084106, + "learning_rate": 0.00013116250560203774, + "loss": 1.1868, + "step": 8980 + }, + { + "epoch": 1.599002849002849, + "grad_norm": 0.7474972009658813, + "learning_rate": 0.00013114920482667635, + "loss": 1.0876, + "step": 8981 + }, + { + "epoch": 1.5991809116809117, + "grad_norm": 
0.6920070052146912, + "learning_rate": 0.000131135903441011, + "loss": 1.0787, + "step": 8982 + }, + { + "epoch": 1.5993589743589745, + "grad_norm": 0.7572436928749084, + "learning_rate": 0.00013112260144530232, + "loss": 0.9798, + "step": 8983 + }, + { + "epoch": 1.5995370370370372, + "grad_norm": 0.6983019709587097, + "learning_rate": 0.00013110929883981088, + "loss": 1.1115, + "step": 8984 + }, + { + "epoch": 1.5997150997150997, + "grad_norm": 0.6352120041847229, + "learning_rate": 0.0001310959956247974, + "loss": 0.9962, + "step": 8985 + }, + { + "epoch": 1.5998931623931623, + "grad_norm": 0.596858561038971, + "learning_rate": 0.00013108269180052244, + "loss": 0.8686, + "step": 8986 + }, + { + "epoch": 1.600071225071225, + "grad_norm": 0.6237605214118958, + "learning_rate": 0.00013106938736724672, + "loss": 0.9166, + "step": 8987 + }, + { + "epoch": 1.6002492877492878, + "grad_norm": 0.6818585395812988, + "learning_rate": 0.0001310560823252309, + "loss": 0.9993, + "step": 8988 + }, + { + "epoch": 1.6004273504273505, + "grad_norm": 0.6372287273406982, + "learning_rate": 0.00013104277667473564, + "loss": 0.8589, + "step": 8989 + }, + { + "epoch": 1.6006054131054133, + "grad_norm": 0.6057302355766296, + "learning_rate": 0.0001310294704160217, + "loss": 0.9325, + "step": 8990 + }, + { + "epoch": 1.6007834757834758, + "grad_norm": 0.6999384164810181, + "learning_rate": 0.0001310161635493497, + "loss": 0.8691, + "step": 8991 + }, + { + "epoch": 1.6009615384615383, + "grad_norm": 0.6182113289833069, + "learning_rate": 0.00013100285607498045, + "loss": 1.0271, + "step": 8992 + }, + { + "epoch": 1.601139601139601, + "grad_norm": 0.6681149005889893, + "learning_rate": 0.0001309895479931746, + "loss": 0.989, + "step": 8993 + }, + { + "epoch": 1.6013176638176638, + "grad_norm": 0.6187826991081238, + "learning_rate": 0.00013097623930419293, + "loss": 0.8051, + "step": 8994 + }, + { + "epoch": 1.6014957264957266, + "grad_norm": 0.698793888092041, + "learning_rate": 
0.00013096293000829621, + "loss": 1.0762, + "step": 8995 + }, + { + "epoch": 1.6016737891737893, + "grad_norm": 0.693149745464325, + "learning_rate": 0.0001309496201057452, + "loss": 1.0894, + "step": 8996 + }, + { + "epoch": 1.6018518518518519, + "grad_norm": 0.6664052605628967, + "learning_rate": 0.00013093630959680068, + "loss": 0.9835, + "step": 8997 + }, + { + "epoch": 1.6020299145299144, + "grad_norm": 0.6919469833374023, + "learning_rate": 0.0001309229984817234, + "loss": 0.9062, + "step": 8998 + }, + { + "epoch": 1.6022079772079771, + "grad_norm": 0.704781174659729, + "learning_rate": 0.00013090968676077427, + "loss": 0.8582, + "step": 8999 + }, + { + "epoch": 1.60238603988604, + "grad_norm": 0.8055264949798584, + "learning_rate": 0.000130896374434214, + "loss": 0.9813, + "step": 9000 + }, + { + "epoch": 1.6025641025641026, + "grad_norm": 0.6301952004432678, + "learning_rate": 0.00013088306150230348, + "loss": 0.7056, + "step": 9001 + }, + { + "epoch": 1.6027421652421654, + "grad_norm": 0.698544442653656, + "learning_rate": 0.00013086974796530347, + "loss": 0.9806, + "step": 9002 + }, + { + "epoch": 1.602920227920228, + "grad_norm": 0.669548511505127, + "learning_rate": 0.00013085643382347491, + "loss": 1.0317, + "step": 9003 + }, + { + "epoch": 1.6030982905982905, + "grad_norm": 0.6404716372489929, + "learning_rate": 0.00013084311907707864, + "loss": 0.8885, + "step": 9004 + }, + { + "epoch": 1.6032763532763532, + "grad_norm": 0.6968616843223572, + "learning_rate": 0.0001308298037263755, + "loss": 1.0665, + "step": 9005 + }, + { + "epoch": 1.603454415954416, + "grad_norm": 0.849311113357544, + "learning_rate": 0.00013081648777162644, + "loss": 1.1404, + "step": 9006 + }, + { + "epoch": 1.6036324786324787, + "grad_norm": 0.6603094935417175, + "learning_rate": 0.00013080317121309223, + "loss": 0.8341, + "step": 9007 + }, + { + "epoch": 1.6038105413105415, + "grad_norm": 0.6777810454368591, + "learning_rate": 0.00013078985405103394, + "loss": 1.044, + "step": 
9008 + }, + { + "epoch": 1.603988603988604, + "grad_norm": 0.6783546209335327, + "learning_rate": 0.0001307765362857124, + "loss": 1.042, + "step": 9009 + }, + { + "epoch": 1.6041666666666665, + "grad_norm": 0.7251788377761841, + "learning_rate": 0.00013076321791738858, + "loss": 0.9004, + "step": 9010 + }, + { + "epoch": 1.6043447293447293, + "grad_norm": 0.7885342240333557, + "learning_rate": 0.00013074989894632338, + "loss": 1.1966, + "step": 9011 + }, + { + "epoch": 1.604522792022792, + "grad_norm": 0.7171013355255127, + "learning_rate": 0.0001307365793727778, + "loss": 1.2242, + "step": 9012 + }, + { + "epoch": 1.6047008547008548, + "grad_norm": 0.6027249693870544, + "learning_rate": 0.00013072325919701283, + "loss": 0.917, + "step": 9013 + }, + { + "epoch": 1.6048789173789175, + "grad_norm": 0.5957151055335999, + "learning_rate": 0.00013070993841928936, + "loss": 0.9154, + "step": 9014 + }, + { + "epoch": 1.60505698005698, + "grad_norm": 0.6190659403800964, + "learning_rate": 0.00013069661703986847, + "loss": 0.7071, + "step": 9015 + }, + { + "epoch": 1.6052350427350426, + "grad_norm": 0.6454868316650391, + "learning_rate": 0.00013068329505901117, + "loss": 0.8381, + "step": 9016 + }, + { + "epoch": 1.6054131054131053, + "grad_norm": 0.6255491375923157, + "learning_rate": 0.00013066997247697837, + "loss": 0.7515, + "step": 9017 + }, + { + "epoch": 1.605591168091168, + "grad_norm": 0.6214072108268738, + "learning_rate": 0.0001306566492940312, + "loss": 1.0101, + "step": 9018 + }, + { + "epoch": 1.6057692307692308, + "grad_norm": 0.7244150638580322, + "learning_rate": 0.0001306433255104307, + "loss": 1.2558, + "step": 9019 + }, + { + "epoch": 1.6059472934472936, + "grad_norm": 0.6162270903587341, + "learning_rate": 0.00013063000112643785, + "loss": 1.1009, + "step": 9020 + }, + { + "epoch": 1.6061253561253561, + "grad_norm": 0.7309414744377136, + "learning_rate": 0.0001306166761423138, + "loss": 1.1973, + "step": 9021 + }, + { + "epoch": 1.6063034188034186, + 
"grad_norm": 0.7150956392288208, + "learning_rate": 0.00013060335055831957, + "loss": 0.9136, + "step": 9022 + }, + { + "epoch": 1.6064814814814814, + "grad_norm": 0.8187742829322815, + "learning_rate": 0.00013059002437471623, + "loss": 1.0524, + "step": 9023 + }, + { + "epoch": 1.6066595441595442, + "grad_norm": 0.7928692698478699, + "learning_rate": 0.00013057669759176493, + "loss": 1.0249, + "step": 9024 + }, + { + "epoch": 1.606837606837607, + "grad_norm": 0.6929279565811157, + "learning_rate": 0.00013056337020972677, + "loss": 1.1804, + "step": 9025 + }, + { + "epoch": 1.6070156695156697, + "grad_norm": 0.6771654486656189, + "learning_rate": 0.00013055004222886285, + "loss": 1.0284, + "step": 9026 + }, + { + "epoch": 1.6071937321937322, + "grad_norm": 0.6689024567604065, + "learning_rate": 0.0001305367136494343, + "loss": 1.0431, + "step": 9027 + }, + { + "epoch": 1.6073717948717947, + "grad_norm": 0.71135413646698, + "learning_rate": 0.0001305233844717023, + "loss": 0.9692, + "step": 9028 + }, + { + "epoch": 1.6075498575498575, + "grad_norm": 0.5459749698638916, + "learning_rate": 0.00013051005469592796, + "loss": 0.5643, + "step": 9029 + }, + { + "epoch": 1.6077279202279202, + "grad_norm": 0.7225865125656128, + "learning_rate": 0.00013049672432237253, + "loss": 1.0954, + "step": 9030 + }, + { + "epoch": 1.607905982905983, + "grad_norm": 0.6878093481063843, + "learning_rate": 0.0001304833933512971, + "loss": 0.894, + "step": 9031 + }, + { + "epoch": 1.6080840455840457, + "grad_norm": 0.6967248320579529, + "learning_rate": 0.00013047006178296288, + "loss": 1.0356, + "step": 9032 + }, + { + "epoch": 1.6082621082621082, + "grad_norm": 0.6404993534088135, + "learning_rate": 0.00013045672961763114, + "loss": 0.8528, + "step": 9033 + }, + { + "epoch": 1.6084401709401708, + "grad_norm": 0.5919156074523926, + "learning_rate": 0.000130443396855563, + "loss": 0.7196, + "step": 9034 + }, + { + "epoch": 1.6086182336182335, + "grad_norm": 0.6792302131652832, + 
"learning_rate": 0.00013043006349701977, + "loss": 0.9519, + "step": 9035 + }, + { + "epoch": 1.6087962962962963, + "grad_norm": 0.6263542175292969, + "learning_rate": 0.00013041672954226268, + "loss": 1.0483, + "step": 9036 + }, + { + "epoch": 1.608974358974359, + "grad_norm": 0.5865579843521118, + "learning_rate": 0.00013040339499155294, + "loss": 0.8794, + "step": 9037 + }, + { + "epoch": 1.6091524216524218, + "grad_norm": 0.8383142948150635, + "learning_rate": 0.00013039005984515181, + "loss": 0.8929, + "step": 9038 + }, + { + "epoch": 1.6093304843304843, + "grad_norm": 0.6438691020011902, + "learning_rate": 0.00013037672410332063, + "loss": 0.9957, + "step": 9039 + }, + { + "epoch": 1.609508547008547, + "grad_norm": 0.74748694896698, + "learning_rate": 0.0001303633877663206, + "loss": 0.9809, + "step": 9040 + }, + { + "epoch": 1.6096866096866096, + "grad_norm": 0.6697205901145935, + "learning_rate": 0.00013035005083441312, + "loss": 0.9556, + "step": 9041 + }, + { + "epoch": 1.6098646723646723, + "grad_norm": 0.6577828526496887, + "learning_rate": 0.00013033671330785941, + "loss": 0.8956, + "step": 9042 + }, + { + "epoch": 1.610042735042735, + "grad_norm": 0.6423429846763611, + "learning_rate": 0.0001303233751869208, + "loss": 0.8467, + "step": 9043 + }, + { + "epoch": 1.6102207977207978, + "grad_norm": 0.6552175879478455, + "learning_rate": 0.00013031003647185867, + "loss": 0.8656, + "step": 9044 + }, + { + "epoch": 1.6103988603988604, + "grad_norm": 0.6755174398422241, + "learning_rate": 0.00013029669716293433, + "loss": 0.7836, + "step": 9045 + }, + { + "epoch": 1.6105769230769231, + "grad_norm": 0.6832906007766724, + "learning_rate": 0.00013028335726040914, + "loss": 1.1531, + "step": 9046 + }, + { + "epoch": 1.6107549857549857, + "grad_norm": 0.6498637795448303, + "learning_rate": 0.00013027001676454446, + "loss": 0.8637, + "step": 9047 + }, + { + "epoch": 1.6109330484330484, + "grad_norm": 0.6792944073677063, + "learning_rate": 0.0001302566756756017, + 
"loss": 1.0865, + "step": 9048 + }, + { + "epoch": 1.6111111111111112, + "grad_norm": 0.6801337003707886, + "learning_rate": 0.00013024333399384226, + "loss": 1.0738, + "step": 9049 + }, + { + "epoch": 1.611289173789174, + "grad_norm": 0.675216794013977, + "learning_rate": 0.0001302299917195275, + "loss": 1.1074, + "step": 9050 + }, + { + "epoch": 1.6114672364672364, + "grad_norm": 0.6418983340263367, + "learning_rate": 0.00013021664885291885, + "loss": 1.0025, + "step": 9051 + }, + { + "epoch": 1.6116452991452992, + "grad_norm": 0.7778789401054382, + "learning_rate": 0.0001302033053942777, + "loss": 1.0847, + "step": 9052 + }, + { + "epoch": 1.6118233618233617, + "grad_norm": 0.7672827243804932, + "learning_rate": 0.00013018996134386555, + "loss": 1.0565, + "step": 9053 + }, + { + "epoch": 1.6120014245014245, + "grad_norm": 0.6770617961883545, + "learning_rate": 0.00013017661670194382, + "loss": 0.9069, + "step": 9054 + }, + { + "epoch": 1.6121794871794872, + "grad_norm": 0.7161242961883545, + "learning_rate": 0.00013016327146877393, + "loss": 1.1301, + "step": 9055 + }, + { + "epoch": 1.61235754985755, + "grad_norm": 0.6923251152038574, + "learning_rate": 0.00013014992564461746, + "loss": 0.9546, + "step": 9056 + }, + { + "epoch": 1.6125356125356125, + "grad_norm": 0.622953474521637, + "learning_rate": 0.0001301365792297358, + "loss": 0.8152, + "step": 9057 + }, + { + "epoch": 1.6127136752136753, + "grad_norm": 0.7477008104324341, + "learning_rate": 0.00013012323222439046, + "loss": 0.8428, + "step": 9058 + }, + { + "epoch": 1.6128917378917378, + "grad_norm": 0.6612883806228638, + "learning_rate": 0.000130109884628843, + "loss": 1.0678, + "step": 9059 + }, + { + "epoch": 1.6130698005698005, + "grad_norm": 0.6406781077384949, + "learning_rate": 0.00013009653644335486, + "loss": 0.6792, + "step": 9060 + }, + { + "epoch": 1.6132478632478633, + "grad_norm": 0.6279141902923584, + "learning_rate": 0.00013008318766818763, + "loss": 0.9826, + "step": 9061 + }, + { + 
"epoch": 1.613425925925926, + "grad_norm": 0.6616412401199341, + "learning_rate": 0.00013006983830360285, + "loss": 1.0691, + "step": 9062 + }, + { + "epoch": 1.6136039886039886, + "grad_norm": 0.6520406603813171, + "learning_rate": 0.000130056488349862, + "loss": 0.9487, + "step": 9063 + }, + { + "epoch": 1.6137820512820513, + "grad_norm": 0.6378647089004517, + "learning_rate": 0.00013004313780722672, + "loss": 0.8557, + "step": 9064 + }, + { + "epoch": 1.6139601139601139, + "grad_norm": 0.6547569036483765, + "learning_rate": 0.00013002978667595857, + "loss": 0.879, + "step": 9065 + }, + { + "epoch": 1.6141381766381766, + "grad_norm": 0.7347842454910278, + "learning_rate": 0.00013001643495631914, + "loss": 1.0757, + "step": 9066 + }, + { + "epoch": 1.6143162393162394, + "grad_norm": 0.5988406538963318, + "learning_rate": 0.00013000308264857002, + "loss": 0.6754, + "step": 9067 + }, + { + "epoch": 1.614494301994302, + "grad_norm": 0.6949366331100464, + "learning_rate": 0.00012998972975297282, + "loss": 1.1236, + "step": 9068 + }, + { + "epoch": 1.6146723646723646, + "grad_norm": 0.7095484137535095, + "learning_rate": 0.00012997637626978913, + "loss": 1.0124, + "step": 9069 + }, + { + "epoch": 1.6148504273504274, + "grad_norm": 0.6634095311164856, + "learning_rate": 0.00012996302219928064, + "loss": 1.2018, + "step": 9070 + }, + { + "epoch": 1.61502849002849, + "grad_norm": 0.6894524693489075, + "learning_rate": 0.000129949667541709, + "loss": 0.9959, + "step": 9071 + }, + { + "epoch": 1.6152065527065527, + "grad_norm": 0.672334611415863, + "learning_rate": 0.00012993631229733582, + "loss": 1.0369, + "step": 9072 + }, + { + "epoch": 1.6153846153846154, + "grad_norm": 0.725759744644165, + "learning_rate": 0.00012992295646642278, + "loss": 1.0079, + "step": 9073 + }, + { + "epoch": 1.6155626780626782, + "grad_norm": 0.7941585779190063, + "learning_rate": 0.00012990960004923154, + "loss": 0.9468, + "step": 9074 + }, + { + "epoch": 1.6157407407407407, + "grad_norm": 
0.6556950807571411, + "learning_rate": 0.00012989624304602385, + "loss": 0.9915, + "step": 9075 + }, + { + "epoch": 1.6159188034188035, + "grad_norm": 0.7515892386436462, + "learning_rate": 0.0001298828854570614, + "loss": 1.0924, + "step": 9076 + }, + { + "epoch": 1.616096866096866, + "grad_norm": 0.6944101452827454, + "learning_rate": 0.00012986952728260586, + "loss": 0.9632, + "step": 9077 + }, + { + "epoch": 1.6162749287749287, + "grad_norm": 0.6286170482635498, + "learning_rate": 0.000129856168522919, + "loss": 1.0311, + "step": 9078 + }, + { + "epoch": 1.6164529914529915, + "grad_norm": 0.8362757563591003, + "learning_rate": 0.0001298428091782625, + "loss": 1.1232, + "step": 9079 + }, + { + "epoch": 1.6166310541310542, + "grad_norm": 0.6199851632118225, + "learning_rate": 0.0001298294492488982, + "loss": 0.9454, + "step": 9080 + }, + { + "epoch": 1.6168091168091168, + "grad_norm": 0.7541791796684265, + "learning_rate": 0.0001298160887350878, + "loss": 0.9759, + "step": 9081 + }, + { + "epoch": 1.6169871794871795, + "grad_norm": 0.6940878033638, + "learning_rate": 0.00012980272763709304, + "loss": 0.9258, + "step": 9082 + }, + { + "epoch": 1.617165242165242, + "grad_norm": 0.6934045553207397, + "learning_rate": 0.00012978936595517575, + "loss": 1.0142, + "step": 9083 + }, + { + "epoch": 1.6173433048433048, + "grad_norm": 0.8147503733634949, + "learning_rate": 0.00012977600368959774, + "loss": 0.964, + "step": 9084 + }, + { + "epoch": 1.6175213675213675, + "grad_norm": 0.6583107709884644, + "learning_rate": 0.00012976264084062079, + "loss": 1.0315, + "step": 9085 + }, + { + "epoch": 1.6176994301994303, + "grad_norm": 0.7192013263702393, + "learning_rate": 0.0001297492774085067, + "loss": 0.9528, + "step": 9086 + }, + { + "epoch": 1.6178774928774928, + "grad_norm": 0.665888786315918, + "learning_rate": 0.00012973591339351733, + "loss": 1.0188, + "step": 9087 + }, + { + "epoch": 1.6180555555555556, + "grad_norm": 0.7170987725257874, + "learning_rate": 
0.0001297225487959145, + "loss": 0.8969, + "step": 9088 + }, + { + "epoch": 1.618233618233618, + "grad_norm": 0.6768732070922852, + "learning_rate": 0.00012970918361596007, + "loss": 1.1951, + "step": 9089 + }, + { + "epoch": 1.6184116809116809, + "grad_norm": 0.6640290021896362, + "learning_rate": 0.00012969581785391592, + "loss": 0.9649, + "step": 9090 + }, + { + "epoch": 1.6185897435897436, + "grad_norm": 0.6200813055038452, + "learning_rate": 0.00012968245151004392, + "loss": 0.9446, + "step": 9091 + }, + { + "epoch": 1.6187678062678064, + "grad_norm": 0.6815837621688843, + "learning_rate": 0.0001296690845846059, + "loss": 1.0506, + "step": 9092 + }, + { + "epoch": 1.618945868945869, + "grad_norm": 0.7252637147903442, + "learning_rate": 0.0001296557170778638, + "loss": 1.1977, + "step": 9093 + }, + { + "epoch": 1.6191239316239316, + "grad_norm": 0.5609107613563538, + "learning_rate": 0.00012964234899007955, + "loss": 0.8009, + "step": 9094 + }, + { + "epoch": 1.6193019943019942, + "grad_norm": 0.6539437770843506, + "learning_rate": 0.00012962898032151506, + "loss": 0.8482, + "step": 9095 + }, + { + "epoch": 1.619480056980057, + "grad_norm": 0.6993300914764404, + "learning_rate": 0.0001296156110724322, + "loss": 1.0725, + "step": 9096 + }, + { + "epoch": 1.6196581196581197, + "grad_norm": 0.6768273711204529, + "learning_rate": 0.000129602241243093, + "loss": 0.9247, + "step": 9097 + }, + { + "epoch": 1.6198361823361824, + "grad_norm": 0.6896265745162964, + "learning_rate": 0.00012958887083375939, + "loss": 0.9526, + "step": 9098 + }, + { + "epoch": 1.6200142450142452, + "grad_norm": 0.7475146651268005, + "learning_rate": 0.00012957549984469327, + "loss": 0.8302, + "step": 9099 + }, + { + "epoch": 1.6201923076923077, + "grad_norm": 0.6622769236564636, + "learning_rate": 0.00012956212827615674, + "loss": 0.9505, + "step": 9100 + }, + { + "epoch": 1.6203703703703702, + "grad_norm": 0.6938058137893677, + "learning_rate": 0.00012954875612841167, + "loss": 0.9757, + 
"step": 9101 + }, + { + "epoch": 1.620548433048433, + "grad_norm": 0.7453510761260986, + "learning_rate": 0.0001295353834017201, + "loss": 1.0919, + "step": 9102 + }, + { + "epoch": 1.6207264957264957, + "grad_norm": 0.7868932485580444, + "learning_rate": 0.0001295220100963441, + "loss": 0.9265, + "step": 9103 + }, + { + "epoch": 1.6209045584045585, + "grad_norm": 0.6779825091362, + "learning_rate": 0.00012950863621254558, + "loss": 0.98, + "step": 9104 + }, + { + "epoch": 1.6210826210826212, + "grad_norm": 0.6825897097587585, + "learning_rate": 0.00012949526175058662, + "loss": 0.9218, + "step": 9105 + }, + { + "epoch": 1.6212606837606838, + "grad_norm": 0.6686047911643982, + "learning_rate": 0.00012948188671072934, + "loss": 0.9546, + "step": 9106 + }, + { + "epoch": 1.6214387464387463, + "grad_norm": 0.7456090450286865, + "learning_rate": 0.0001294685110932357, + "loss": 1.0819, + "step": 9107 + }, + { + "epoch": 1.621616809116809, + "grad_norm": 0.7111441493034363, + "learning_rate": 0.0001294551348983678, + "loss": 0.9916, + "step": 9108 + }, + { + "epoch": 1.6217948717948718, + "grad_norm": 0.6534699201583862, + "learning_rate": 0.00012944175812638773, + "loss": 1.0374, + "step": 9109 + }, + { + "epoch": 1.6219729344729346, + "grad_norm": 0.6046397089958191, + "learning_rate": 0.00012942838077755758, + "loss": 0.7922, + "step": 9110 + }, + { + "epoch": 1.6221509971509973, + "grad_norm": 0.7736679911613464, + "learning_rate": 0.00012941500285213942, + "loss": 1.0056, + "step": 9111 + }, + { + "epoch": 1.6223290598290598, + "grad_norm": 0.6850929260253906, + "learning_rate": 0.00012940162435039538, + "loss": 0.9538, + "step": 9112 + }, + { + "epoch": 1.6225071225071224, + "grad_norm": 0.6305751800537109, + "learning_rate": 0.00012938824527258756, + "loss": 0.9341, + "step": 9113 + }, + { + "epoch": 1.6226851851851851, + "grad_norm": 0.6740923523902893, + "learning_rate": 0.0001293748656189782, + "loss": 1.0037, + "step": 9114 + }, + { + "epoch": 
1.6228632478632479, + "grad_norm": 0.6579762101173401, + "learning_rate": 0.00012936148538982928, + "loss": 1.0022, + "step": 9115 + }, + { + "epoch": 1.6230413105413106, + "grad_norm": 0.6500434279441833, + "learning_rate": 0.0001293481045854031, + "loss": 0.8589, + "step": 9116 + }, + { + "epoch": 1.6232193732193734, + "grad_norm": 0.7825912237167358, + "learning_rate": 0.00012933472320596177, + "loss": 1.0345, + "step": 9117 + }, + { + "epoch": 1.623397435897436, + "grad_norm": 0.8341414332389832, + "learning_rate": 0.0001293213412517675, + "loss": 1.0314, + "step": 9118 + }, + { + "epoch": 1.6235754985754984, + "grad_norm": 0.63664311170578, + "learning_rate": 0.00012930795872308242, + "loss": 0.819, + "step": 9119 + }, + { + "epoch": 1.6237535612535612, + "grad_norm": 0.6800840497016907, + "learning_rate": 0.00012929457562016878, + "loss": 0.95, + "step": 9120 + }, + { + "epoch": 1.623931623931624, + "grad_norm": 0.754165530204773, + "learning_rate": 0.0001292811919432888, + "loss": 1.1193, + "step": 9121 + }, + { + "epoch": 1.6241096866096867, + "grad_norm": 0.678871750831604, + "learning_rate": 0.00012926780769270465, + "loss": 0.9015, + "step": 9122 + }, + { + "epoch": 1.6242877492877494, + "grad_norm": 0.6642945408821106, + "learning_rate": 0.00012925442286867866, + "loss": 0.9095, + "step": 9123 + }, + { + "epoch": 1.624465811965812, + "grad_norm": 0.6089697480201721, + "learning_rate": 0.000129241037471473, + "loss": 0.8994, + "step": 9124 + }, + { + "epoch": 1.6246438746438745, + "grad_norm": 0.7320881485939026, + "learning_rate": 0.00012922765150134995, + "loss": 1.0518, + "step": 9125 + }, + { + "epoch": 1.6248219373219372, + "grad_norm": 0.7308032512664795, + "learning_rate": 0.0001292142649585718, + "loss": 1.0557, + "step": 9126 + }, + { + "epoch": 1.625, + "grad_norm": 0.6896602511405945, + "learning_rate": 0.0001292008778434008, + "loss": 1.145, + "step": 9127 + }, + { + "epoch": 1.6251780626780628, + "grad_norm": 0.6112532615661621, + 
"learning_rate": 0.00012918749015609926, + "loss": 0.9611, + "step": 9128 + }, + { + "epoch": 1.6253561253561255, + "grad_norm": 0.6856057643890381, + "learning_rate": 0.00012917410189692947, + "loss": 1.0124, + "step": 9129 + }, + { + "epoch": 1.625534188034188, + "grad_norm": 0.699252188205719, + "learning_rate": 0.00012916071306615378, + "loss": 0.8854, + "step": 9130 + }, + { + "epoch": 1.6257122507122506, + "grad_norm": 0.6306683421134949, + "learning_rate": 0.0001291473236640345, + "loss": 1.0722, + "step": 9131 + }, + { + "epoch": 1.6258903133903133, + "grad_norm": 0.6358118653297424, + "learning_rate": 0.00012913393369083393, + "loss": 0.889, + "step": 9132 + }, + { + "epoch": 1.626068376068376, + "grad_norm": 0.6953601837158203, + "learning_rate": 0.00012912054314681445, + "loss": 1.0168, + "step": 9133 + }, + { + "epoch": 1.6262464387464388, + "grad_norm": 0.6742331385612488, + "learning_rate": 0.00012910715203223844, + "loss": 0.8152, + "step": 9134 + }, + { + "epoch": 1.6264245014245016, + "grad_norm": 0.5872861742973328, + "learning_rate": 0.00012909376034736823, + "loss": 0.8702, + "step": 9135 + }, + { + "epoch": 1.626602564102564, + "grad_norm": 0.7580631971359253, + "learning_rate": 0.00012908036809246623, + "loss": 0.994, + "step": 9136 + }, + { + "epoch": 1.6267806267806266, + "grad_norm": 0.7544930577278137, + "learning_rate": 0.00012906697526779488, + "loss": 0.7475, + "step": 9137 + }, + { + "epoch": 1.6269586894586894, + "grad_norm": 0.6850766539573669, + "learning_rate": 0.00012905358187361647, + "loss": 1.0943, + "step": 9138 + }, + { + "epoch": 1.6271367521367521, + "grad_norm": 0.6821565628051758, + "learning_rate": 0.0001290401879101935, + "loss": 1.2928, + "step": 9139 + }, + { + "epoch": 1.6273148148148149, + "grad_norm": 0.6961034536361694, + "learning_rate": 0.00012902679337778835, + "loss": 0.8694, + "step": 9140 + }, + { + "epoch": 1.6274928774928776, + "grad_norm": 0.7159550786018372, + "learning_rate": 0.00012901339827666353, + 
"loss": 0.8827, + "step": 9141 + }, + { + "epoch": 1.6276709401709402, + "grad_norm": 0.7491081953048706, + "learning_rate": 0.0001290000026070814, + "loss": 0.8159, + "step": 9142 + }, + { + "epoch": 1.6278490028490027, + "grad_norm": 0.7107849717140198, + "learning_rate": 0.00012898660636930447, + "loss": 1.0625, + "step": 9143 + }, + { + "epoch": 1.6280270655270654, + "grad_norm": 0.7227210998535156, + "learning_rate": 0.0001289732095635952, + "loss": 0.9744, + "step": 9144 + }, + { + "epoch": 1.6282051282051282, + "grad_norm": 0.7141995429992676, + "learning_rate": 0.00012895981219021607, + "loss": 0.9836, + "step": 9145 + }, + { + "epoch": 1.628383190883191, + "grad_norm": 0.6445552706718445, + "learning_rate": 0.00012894641424942958, + "loss": 1.0183, + "step": 9146 + }, + { + "epoch": 1.6285612535612537, + "grad_norm": 0.698783278465271, + "learning_rate": 0.00012893301574149824, + "loss": 0.8392, + "step": 9147 + }, + { + "epoch": 1.6287393162393162, + "grad_norm": 0.6529116034507751, + "learning_rate": 0.00012891961666668458, + "loss": 0.9317, + "step": 9148 + }, + { + "epoch": 1.6289173789173788, + "grad_norm": 0.7780548930168152, + "learning_rate": 0.0001289062170252511, + "loss": 1.2406, + "step": 9149 + }, + { + "epoch": 1.6290954415954415, + "grad_norm": 0.6500990986824036, + "learning_rate": 0.0001288928168174603, + "loss": 1.0381, + "step": 9150 + }, + { + "epoch": 1.6292735042735043, + "grad_norm": 0.7098208665847778, + "learning_rate": 0.00012887941604357482, + "loss": 1.2126, + "step": 9151 + }, + { + "epoch": 1.629451566951567, + "grad_norm": 0.730648398399353, + "learning_rate": 0.0001288660147038572, + "loss": 0.8351, + "step": 9152 + }, + { + "epoch": 1.6296296296296298, + "grad_norm": 0.5520278215408325, + "learning_rate": 0.0001288526127985699, + "loss": 0.5877, + "step": 9153 + }, + { + "epoch": 1.6298076923076923, + "grad_norm": 0.7611770033836365, + "learning_rate": 0.00012883921032797563, + "loss": 1.2227, + "step": 9154 + }, + { + 
"epoch": 1.6299857549857548, + "grad_norm": 0.636820375919342, + "learning_rate": 0.00012882580729233696, + "loss": 0.8305, + "step": 9155 + }, + { + "epoch": 1.6301638176638176, + "grad_norm": 0.694492518901825, + "learning_rate": 0.00012881240369191644, + "loss": 1.0452, + "step": 9156 + }, + { + "epoch": 1.6303418803418803, + "grad_norm": 0.67826908826828, + "learning_rate": 0.00012879899952697677, + "loss": 0.8345, + "step": 9157 + }, + { + "epoch": 1.630519943019943, + "grad_norm": 0.5891323685646057, + "learning_rate": 0.00012878559479778052, + "loss": 0.8367, + "step": 9158 + }, + { + "epoch": 1.6306980056980058, + "grad_norm": 0.6766192317008972, + "learning_rate": 0.0001287721895045903, + "loss": 0.8319, + "step": 9159 + }, + { + "epoch": 1.6308760683760684, + "grad_norm": 0.5306392908096313, + "learning_rate": 0.0001287587836476688, + "loss": 0.7945, + "step": 9160 + }, + { + "epoch": 1.631054131054131, + "grad_norm": 0.6677970290184021, + "learning_rate": 0.0001287453772272787, + "loss": 1.1228, + "step": 9161 + }, + { + "epoch": 1.6312321937321936, + "grad_norm": 0.810052752494812, + "learning_rate": 0.00012873197024368266, + "loss": 0.8395, + "step": 9162 + }, + { + "epoch": 1.6314102564102564, + "grad_norm": 0.7619220018386841, + "learning_rate": 0.00012871856269714333, + "loss": 1.3713, + "step": 9163 + }, + { + "epoch": 1.6315883190883191, + "grad_norm": 0.6564521193504333, + "learning_rate": 0.00012870515458792342, + "loss": 1.0513, + "step": 9164 + }, + { + "epoch": 1.631766381766382, + "grad_norm": 0.6874445676803589, + "learning_rate": 0.00012869174591628564, + "loss": 1.0255, + "step": 9165 + }, + { + "epoch": 1.6319444444444444, + "grad_norm": 0.6958737373352051, + "learning_rate": 0.0001286783366824927, + "loss": 0.9361, + "step": 9166 + }, + { + "epoch": 1.6321225071225072, + "grad_norm": 0.6909199357032776, + "learning_rate": 0.0001286649268868073, + "loss": 0.9855, + "step": 9167 + }, + { + "epoch": 1.6323005698005697, + "grad_norm": 
0.7671375274658203, + "learning_rate": 0.00012865151652949225, + "loss": 1.084, + "step": 9168 + }, + { + "epoch": 1.6324786324786325, + "grad_norm": 0.750200092792511, + "learning_rate": 0.00012863810561081023, + "loss": 0.9341, + "step": 9169 + }, + { + "epoch": 1.6326566951566952, + "grad_norm": 0.6595860123634338, + "learning_rate": 0.00012862469413102402, + "loss": 0.9386, + "step": 9170 + }, + { + "epoch": 1.632834757834758, + "grad_norm": 0.622373640537262, + "learning_rate": 0.0001286112820903964, + "loss": 0.7697, + "step": 9171 + }, + { + "epoch": 1.6330128205128205, + "grad_norm": 0.9628498554229736, + "learning_rate": 0.00012859786948919014, + "loss": 1.2629, + "step": 9172 + }, + { + "epoch": 1.6331908831908832, + "grad_norm": 0.7610561847686768, + "learning_rate": 0.000128584456327668, + "loss": 0.9748, + "step": 9173 + }, + { + "epoch": 1.6333689458689458, + "grad_norm": 0.6585374474525452, + "learning_rate": 0.00012857104260609285, + "loss": 0.9049, + "step": 9174 + }, + { + "epoch": 1.6335470085470085, + "grad_norm": 0.6996221542358398, + "learning_rate": 0.00012855762832472746, + "loss": 0.8893, + "step": 9175 + }, + { + "epoch": 1.6337250712250713, + "grad_norm": 0.6226270198822021, + "learning_rate": 0.00012854421348383466, + "loss": 0.8913, + "step": 9176 + }, + { + "epoch": 1.633903133903134, + "grad_norm": 0.6570866107940674, + "learning_rate": 0.00012853079808367731, + "loss": 0.8632, + "step": 9177 + }, + { + "epoch": 1.6340811965811965, + "grad_norm": 0.6899664402008057, + "learning_rate": 0.00012851738212451826, + "loss": 0.8177, + "step": 9178 + }, + { + "epoch": 1.6342592592592593, + "grad_norm": 0.75257807970047, + "learning_rate": 0.0001285039656066203, + "loss": 0.9096, + "step": 9179 + }, + { + "epoch": 1.6344373219373218, + "grad_norm": 0.6614963412284851, + "learning_rate": 0.00012849054853024638, + "loss": 0.9255, + "step": 9180 + }, + { + "epoch": 1.6346153846153846, + "grad_norm": 0.7245957851409912, + "learning_rate": 
0.00012847713089565933, + "loss": 1.0122, + "step": 9181 + }, + { + "epoch": 1.6347934472934473, + "grad_norm": 0.7332839369773865, + "learning_rate": 0.00012846371270312204, + "loss": 0.8484, + "step": 9182 + }, + { + "epoch": 1.63497150997151, + "grad_norm": 0.628089189529419, + "learning_rate": 0.00012845029395289748, + "loss": 1.0171, + "step": 9183 + }, + { + "epoch": 1.6351495726495726, + "grad_norm": 0.7493528723716736, + "learning_rate": 0.00012843687464524848, + "loss": 1.1635, + "step": 9184 + }, + { + "epoch": 1.6353276353276354, + "grad_norm": 0.6328163146972656, + "learning_rate": 0.00012842345478043799, + "loss": 1.1254, + "step": 9185 + }, + { + "epoch": 1.635505698005698, + "grad_norm": 0.6720291376113892, + "learning_rate": 0.00012841003435872894, + "loss": 0.9729, + "step": 9186 + }, + { + "epoch": 1.6356837606837606, + "grad_norm": 0.6657332181930542, + "learning_rate": 0.00012839661338038427, + "loss": 1.1047, + "step": 9187 + }, + { + "epoch": 1.6358618233618234, + "grad_norm": 0.7416180968284607, + "learning_rate": 0.000128383191845667, + "loss": 0.9505, + "step": 9188 + }, + { + "epoch": 1.6360398860398861, + "grad_norm": 0.8737816214561462, + "learning_rate": 0.00012836976975484, + "loss": 1.0518, + "step": 9189 + }, + { + "epoch": 1.6362179487179487, + "grad_norm": 0.7351877093315125, + "learning_rate": 0.0001283563471081663, + "loss": 1.1152, + "step": 9190 + }, + { + "epoch": 1.6363960113960114, + "grad_norm": 0.6442788243293762, + "learning_rate": 0.00012834292390590893, + "loss": 0.9432, + "step": 9191 + }, + { + "epoch": 1.636574074074074, + "grad_norm": 0.6848029494285583, + "learning_rate": 0.0001283295001483308, + "loss": 0.8528, + "step": 9192 + }, + { + "epoch": 1.6367521367521367, + "grad_norm": 0.6627060174942017, + "learning_rate": 0.00012831607583569497, + "loss": 1.0222, + "step": 9193 + }, + { + "epoch": 1.6369301994301995, + "grad_norm": 0.7319555878639221, + "learning_rate": 0.00012830265096826446, + "loss": 0.9392, + 
"step": 9194 + }, + { + "epoch": 1.6371082621082622, + "grad_norm": 0.6986424326896667, + "learning_rate": 0.0001282892255463023, + "loss": 1.2095, + "step": 9195 + }, + { + "epoch": 1.6372863247863247, + "grad_norm": 0.6649929881095886, + "learning_rate": 0.0001282757995700715, + "loss": 0.9426, + "step": 9196 + }, + { + "epoch": 1.6374643874643875, + "grad_norm": 0.6789031624794006, + "learning_rate": 0.0001282623730398352, + "loss": 0.9705, + "step": 9197 + }, + { + "epoch": 1.63764245014245, + "grad_norm": 0.6388779878616333, + "learning_rate": 0.00012824894595585637, + "loss": 1.0698, + "step": 9198 + }, + { + "epoch": 1.6378205128205128, + "grad_norm": 0.636832594871521, + "learning_rate": 0.00012823551831839814, + "loss": 0.9445, + "step": 9199 + }, + { + "epoch": 1.6379985754985755, + "grad_norm": 0.670190691947937, + "learning_rate": 0.0001282220901277236, + "loss": 0.9847, + "step": 9200 + }, + { + "epoch": 1.6381766381766383, + "grad_norm": 0.6020209193229675, + "learning_rate": 0.0001282086613840958, + "loss": 1.0047, + "step": 9201 + }, + { + "epoch": 1.6383547008547008, + "grad_norm": 0.6648211479187012, + "learning_rate": 0.0001281952320877779, + "loss": 0.8717, + "step": 9202 + }, + { + "epoch": 1.6385327635327636, + "grad_norm": 0.7207710146903992, + "learning_rate": 0.000128181802239033, + "loss": 1.1232, + "step": 9203 + }, + { + "epoch": 1.638710826210826, + "grad_norm": 0.800992488861084, + "learning_rate": 0.0001281683718381242, + "loss": 1.0688, + "step": 9204 + }, + { + "epoch": 1.6388888888888888, + "grad_norm": 0.789398193359375, + "learning_rate": 0.0001281549408853147, + "loss": 1.1772, + "step": 9205 + }, + { + "epoch": 1.6390669515669516, + "grad_norm": 0.6514480710029602, + "learning_rate": 0.0001281415093808676, + "loss": 1.1685, + "step": 9206 + }, + { + "epoch": 1.6392450142450143, + "grad_norm": 0.6914686560630798, + "learning_rate": 0.00012812807732504608, + "loss": 1.1307, + "step": 9207 + }, + { + "epoch": 1.6394230769230769, + 
"grad_norm": 0.6788144111633301, + "learning_rate": 0.00012811464471811334, + "loss": 1.1735, + "step": 9208 + }, + { + "epoch": 1.6396011396011396, + "grad_norm": 0.7049870491027832, + "learning_rate": 0.00012810121156033252, + "loss": 1.0128, + "step": 9209 + }, + { + "epoch": 1.6397792022792022, + "grad_norm": 0.7156766057014465, + "learning_rate": 0.00012808777785196687, + "loss": 0.9503, + "step": 9210 + }, + { + "epoch": 1.639957264957265, + "grad_norm": 0.651716411113739, + "learning_rate": 0.0001280743435932795, + "loss": 1.1227, + "step": 9211 + }, + { + "epoch": 1.6401353276353277, + "grad_norm": 0.7276262044906616, + "learning_rate": 0.0001280609087845337, + "loss": 1.06, + "step": 9212 + }, + { + "epoch": 1.6403133903133904, + "grad_norm": 0.6591095924377441, + "learning_rate": 0.0001280474734259927, + "loss": 1.0861, + "step": 9213 + }, + { + "epoch": 1.640491452991453, + "grad_norm": 0.6675926446914673, + "learning_rate": 0.00012803403751791975, + "loss": 0.9815, + "step": 9214 + }, + { + "epoch": 1.6406695156695157, + "grad_norm": 0.6391474008560181, + "learning_rate": 0.00012802060106057803, + "loss": 0.8027, + "step": 9215 + }, + { + "epoch": 1.6408475783475782, + "grad_norm": 0.6384556293487549, + "learning_rate": 0.00012800716405423086, + "loss": 0.7877, + "step": 9216 + }, + { + "epoch": 1.641025641025641, + "grad_norm": 0.661191463470459, + "learning_rate": 0.00012799372649914146, + "loss": 0.9725, + "step": 9217 + }, + { + "epoch": 1.6412037037037037, + "grad_norm": 0.7418332695960999, + "learning_rate": 0.0001279802883955732, + "loss": 1.1756, + "step": 9218 + }, + { + "epoch": 1.6413817663817665, + "grad_norm": 0.6588954329490662, + "learning_rate": 0.00012796684974378928, + "loss": 1.0428, + "step": 9219 + }, + { + "epoch": 1.6415598290598292, + "grad_norm": 0.7566093802452087, + "learning_rate": 0.000127953410544053, + "loss": 1.1254, + "step": 9220 + }, + { + "epoch": 1.6417378917378918, + "grad_norm": 0.6801039576530457, + 
"learning_rate": 0.00012793997079662777, + "loss": 1.0854, + "step": 9221 + }, + { + "epoch": 1.6419159544159543, + "grad_norm": 0.7262716889381409, + "learning_rate": 0.0001279265305017768, + "loss": 0.9343, + "step": 9222 + }, + { + "epoch": 1.642094017094017, + "grad_norm": 0.628625750541687, + "learning_rate": 0.0001279130896597635, + "loss": 0.8942, + "step": 9223 + }, + { + "epoch": 1.6422720797720798, + "grad_norm": 0.6183576583862305, + "learning_rate": 0.0001278996482708512, + "loss": 0.9284, + "step": 9224 + }, + { + "epoch": 1.6424501424501425, + "grad_norm": 0.7912000417709351, + "learning_rate": 0.00012788620633530327, + "loss": 1.3043, + "step": 9225 + }, + { + "epoch": 1.6426282051282053, + "grad_norm": 0.6982026100158691, + "learning_rate": 0.00012787276385338298, + "loss": 1.0224, + "step": 9226 + }, + { + "epoch": 1.6428062678062678, + "grad_norm": 0.6734985709190369, + "learning_rate": 0.00012785932082535386, + "loss": 0.8781, + "step": 9227 + }, + { + "epoch": 1.6429843304843303, + "grad_norm": 0.8799532055854797, + "learning_rate": 0.0001278458772514792, + "loss": 1.1482, + "step": 9228 + }, + { + "epoch": 1.643162393162393, + "grad_norm": 0.590295672416687, + "learning_rate": 0.0001278324331320224, + "loss": 0.9502, + "step": 9229 + }, + { + "epoch": 1.6433404558404558, + "grad_norm": 0.6562125086784363, + "learning_rate": 0.0001278189884672469, + "loss": 0.9834, + "step": 9230 + }, + { + "epoch": 1.6435185185185186, + "grad_norm": 0.6848936676979065, + "learning_rate": 0.00012780554325741612, + "loss": 1.0414, + "step": 9231 + }, + { + "epoch": 1.6436965811965814, + "grad_norm": 0.5985032320022583, + "learning_rate": 0.00012779209750279344, + "loss": 0.9469, + "step": 9232 + }, + { + "epoch": 1.6438746438746439, + "grad_norm": 0.7500917911529541, + "learning_rate": 0.00012777865120364238, + "loss": 0.9626, + "step": 9233 + }, + { + "epoch": 1.6440527065527064, + "grad_norm": 0.6565709114074707, + "learning_rate": 0.00012776520436022634, + 
"loss": 1.0594, + "step": 9234 + }, + { + "epoch": 1.6442307692307692, + "grad_norm": 0.8005441427230835, + "learning_rate": 0.00012775175697280882, + "loss": 1.2379, + "step": 9235 + }, + { + "epoch": 1.644408831908832, + "grad_norm": 0.6734150648117065, + "learning_rate": 0.00012773830904165326, + "loss": 0.9171, + "step": 9236 + }, + { + "epoch": 1.6445868945868947, + "grad_norm": 0.6950868368148804, + "learning_rate": 0.00012772486056702314, + "loss": 1.1782, + "step": 9237 + }, + { + "epoch": 1.6447649572649574, + "grad_norm": 0.8009599447250366, + "learning_rate": 0.000127711411549182, + "loss": 1.0288, + "step": 9238 + }, + { + "epoch": 1.64494301994302, + "grad_norm": 0.6227970719337463, + "learning_rate": 0.0001276979619883933, + "loss": 0.9327, + "step": 9239 + }, + { + "epoch": 1.6451210826210825, + "grad_norm": 0.6828190088272095, + "learning_rate": 0.00012768451188492058, + "loss": 0.9816, + "step": 9240 + }, + { + "epoch": 1.6452991452991452, + "grad_norm": 0.9689767360687256, + "learning_rate": 0.00012767106123902738, + "loss": 0.9049, + "step": 9241 + }, + { + "epoch": 1.645477207977208, + "grad_norm": 0.677061140537262, + "learning_rate": 0.00012765761005097717, + "loss": 0.9472, + "step": 9242 + }, + { + "epoch": 1.6456552706552707, + "grad_norm": 0.7227110862731934, + "learning_rate": 0.00012764415832103356, + "loss": 1.0384, + "step": 9243 + }, + { + "epoch": 1.6458333333333335, + "grad_norm": 0.6540094614028931, + "learning_rate": 0.0001276307060494601, + "loss": 0.8166, + "step": 9244 + }, + { + "epoch": 1.646011396011396, + "grad_norm": 0.6921904683113098, + "learning_rate": 0.00012761725323652033, + "loss": 0.9746, + "step": 9245 + }, + { + "epoch": 1.6461894586894585, + "grad_norm": 0.6742660999298096, + "learning_rate": 0.0001276037998824779, + "loss": 0.8441, + "step": 9246 + }, + { + "epoch": 1.6463675213675213, + "grad_norm": 0.6611103415489197, + "learning_rate": 0.0001275903459875963, + "loss": 1.087, + "step": 9247 + }, + { + 
"epoch": 1.646545584045584, + "grad_norm": 0.6805498003959656, + "learning_rate": 0.00012757689155213923, + "loss": 0.923, + "step": 9248 + }, + { + "epoch": 1.6467236467236468, + "grad_norm": 0.6598179340362549, + "learning_rate": 0.00012756343657637024, + "loss": 0.9371, + "step": 9249 + }, + { + "epoch": 1.6469017094017095, + "grad_norm": 0.7147273421287537, + "learning_rate": 0.00012754998106055297, + "loss": 1.053, + "step": 9250 + }, + { + "epoch": 1.647079772079772, + "grad_norm": 0.72414630651474, + "learning_rate": 0.00012753652500495103, + "loss": 1.0547, + "step": 9251 + }, + { + "epoch": 1.6472578347578346, + "grad_norm": 0.7784913182258606, + "learning_rate": 0.00012752306840982811, + "loss": 0.9012, + "step": 9252 + }, + { + "epoch": 1.6474358974358974, + "grad_norm": 0.644026517868042, + "learning_rate": 0.0001275096112754478, + "loss": 1.0911, + "step": 9253 + }, + { + "epoch": 1.64761396011396, + "grad_norm": 0.691124677658081, + "learning_rate": 0.00012749615360207382, + "loss": 0.9918, + "step": 9254 + }, + { + "epoch": 1.6477920227920229, + "grad_norm": 0.6632972359657288, + "learning_rate": 0.00012748269538996986, + "loss": 0.9438, + "step": 9255 + }, + { + "epoch": 1.6479700854700856, + "grad_norm": 0.6548733115196228, + "learning_rate": 0.00012746923663939955, + "loss": 1.1082, + "step": 9256 + }, + { + "epoch": 1.6481481481481481, + "grad_norm": 0.6737542748451233, + "learning_rate": 0.00012745577735062664, + "loss": 0.9255, + "step": 9257 + }, + { + "epoch": 1.6483262108262107, + "grad_norm": 0.686862051486969, + "learning_rate": 0.00012744231752391479, + "loss": 0.9493, + "step": 9258 + }, + { + "epoch": 1.6485042735042734, + "grad_norm": 0.6096474528312683, + "learning_rate": 0.00012742885715952772, + "loss": 0.6849, + "step": 9259 + }, + { + "epoch": 1.6486823361823362, + "grad_norm": 0.702751636505127, + "learning_rate": 0.00012741539625772918, + "loss": 1.0335, + "step": 9260 + }, + { + "epoch": 1.648860398860399, + "grad_norm": 
0.7470958232879639, + "learning_rate": 0.0001274019348187829, + "loss": 1.105, + "step": 9261 + }, + { + "epoch": 1.6490384615384617, + "grad_norm": 0.6642739176750183, + "learning_rate": 0.0001273884728429526, + "loss": 1.01, + "step": 9262 + }, + { + "epoch": 1.6492165242165242, + "grad_norm": 0.6470904350280762, + "learning_rate": 0.00012737501033050213, + "loss": 0.9009, + "step": 9263 + }, + { + "epoch": 1.6493945868945867, + "grad_norm": 0.7487246990203857, + "learning_rate": 0.00012736154728169518, + "loss": 0.9832, + "step": 9264 + }, + { + "epoch": 1.6495726495726495, + "grad_norm": 0.7370779514312744, + "learning_rate": 0.00012734808369679553, + "loss": 1.0464, + "step": 9265 + }, + { + "epoch": 1.6497507122507122, + "grad_norm": 0.7942814826965332, + "learning_rate": 0.00012733461957606702, + "loss": 1.102, + "step": 9266 + }, + { + "epoch": 1.649928774928775, + "grad_norm": 0.6535606980323792, + "learning_rate": 0.00012732115491977336, + "loss": 1.0655, + "step": 9267 + }, + { + "epoch": 1.6501068376068377, + "grad_norm": 0.601716935634613, + "learning_rate": 0.00012730768972817847, + "loss": 0.8236, + "step": 9268 + }, + { + "epoch": 1.6502849002849003, + "grad_norm": 0.7375118732452393, + "learning_rate": 0.00012729422400154614, + "loss": 0.9313, + "step": 9269 + }, + { + "epoch": 1.6504629629629628, + "grad_norm": 0.7360411882400513, + "learning_rate": 0.00012728075774014018, + "loss": 0.9254, + "step": 9270 + }, + { + "epoch": 1.6506410256410255, + "grad_norm": 0.8453929424285889, + "learning_rate": 0.00012726729094422444, + "loss": 1.0975, + "step": 9271 + }, + { + "epoch": 1.6508190883190883, + "grad_norm": 0.5615501999855042, + "learning_rate": 0.00012725382361406274, + "loss": 0.8243, + "step": 9272 + }, + { + "epoch": 1.650997150997151, + "grad_norm": 0.6494898796081543, + "learning_rate": 0.000127240355749919, + "loss": 0.9766, + "step": 9273 + }, + { + "epoch": 1.6511752136752138, + "grad_norm": 0.6544778347015381, + "learning_rate": 
0.0001272268873520571, + "loss": 0.9969, + "step": 9274 + }, + { + "epoch": 1.6513532763532763, + "grad_norm": 0.6937400698661804, + "learning_rate": 0.00012721341842074092, + "loss": 1.0626, + "step": 9275 + }, + { + "epoch": 1.651531339031339, + "grad_norm": 0.7068421244621277, + "learning_rate": 0.0001271999489562343, + "loss": 1.0068, + "step": 9276 + }, + { + "epoch": 1.6517094017094016, + "grad_norm": 0.6425052285194397, + "learning_rate": 0.0001271864789588012, + "loss": 0.8716, + "step": 9277 + }, + { + "epoch": 1.6518874643874644, + "grad_norm": 0.6895090341567993, + "learning_rate": 0.0001271730084287055, + "loss": 1.081, + "step": 9278 + }, + { + "epoch": 1.6520655270655271, + "grad_norm": 0.6773712038993835, + "learning_rate": 0.00012715953736621116, + "loss": 0.7586, + "step": 9279 + }, + { + "epoch": 1.6522435897435899, + "grad_norm": 0.6085716485977173, + "learning_rate": 0.0001271460657715821, + "loss": 0.8627, + "step": 9280 + }, + { + "epoch": 1.6524216524216524, + "grad_norm": 0.6415461897850037, + "learning_rate": 0.00012713259364508227, + "loss": 0.9751, + "step": 9281 + }, + { + "epoch": 1.6525997150997151, + "grad_norm": 0.6460939645767212, + "learning_rate": 0.00012711912098697565, + "loss": 0.9578, + "step": 9282 + }, + { + "epoch": 1.6527777777777777, + "grad_norm": 0.6076797246932983, + "learning_rate": 0.00012710564779752615, + "loss": 0.9627, + "step": 9283 + }, + { + "epoch": 1.6529558404558404, + "grad_norm": 0.710782527923584, + "learning_rate": 0.00012709217407699783, + "loss": 0.8725, + "step": 9284 + }, + { + "epoch": 1.6531339031339032, + "grad_norm": 0.6793623566627502, + "learning_rate": 0.00012707869982565463, + "loss": 0.908, + "step": 9285 + }, + { + "epoch": 1.653311965811966, + "grad_norm": 0.6841681003570557, + "learning_rate": 0.00012706522504376055, + "loss": 0.8546, + "step": 9286 + }, + { + "epoch": 1.6534900284900285, + "grad_norm": 0.7908675670623779, + "learning_rate": 0.0001270517497315796, + "loss": 0.9409, + 
"step": 9287 + }, + { + "epoch": 1.6536680911680912, + "grad_norm": 0.6918683648109436, + "learning_rate": 0.0001270382738893758, + "loss": 1.0493, + "step": 9288 + }, + { + "epoch": 1.6538461538461537, + "grad_norm": 0.6891819834709167, + "learning_rate": 0.00012702479751741322, + "loss": 1.0675, + "step": 9289 + }, + { + "epoch": 1.6540242165242165, + "grad_norm": 0.6965166926383972, + "learning_rate": 0.00012701132061595586, + "loss": 0.8563, + "step": 9290 + }, + { + "epoch": 1.6542022792022792, + "grad_norm": 0.7549001574516296, + "learning_rate": 0.00012699784318526779, + "loss": 1.1572, + "step": 9291 + }, + { + "epoch": 1.654380341880342, + "grad_norm": 0.6100513339042664, + "learning_rate": 0.00012698436522561303, + "loss": 0.897, + "step": 9292 + }, + { + "epoch": 1.6545584045584045, + "grad_norm": 0.6477037668228149, + "learning_rate": 0.00012697088673725574, + "loss": 0.7961, + "step": 9293 + }, + { + "epoch": 1.6547364672364673, + "grad_norm": 0.7402619123458862, + "learning_rate": 0.0001269574077204599, + "loss": 1.2001, + "step": 9294 + }, + { + "epoch": 1.6549145299145298, + "grad_norm": 0.7162346243858337, + "learning_rate": 0.0001269439281754897, + "loss": 0.9963, + "step": 9295 + }, + { + "epoch": 1.6550925925925926, + "grad_norm": 0.6757413744926453, + "learning_rate": 0.0001269304481026092, + "loss": 1.0476, + "step": 9296 + }, + { + "epoch": 1.6552706552706553, + "grad_norm": 0.6455655097961426, + "learning_rate": 0.0001269169675020825, + "loss": 0.9716, + "step": 9297 + }, + { + "epoch": 1.655448717948718, + "grad_norm": 0.7705031037330627, + "learning_rate": 0.0001269034863741737, + "loss": 0.9886, + "step": 9298 + }, + { + "epoch": 1.6556267806267806, + "grad_norm": 0.6084272861480713, + "learning_rate": 0.000126890004719147, + "loss": 0.8231, + "step": 9299 + }, + { + "epoch": 1.6558048433048433, + "grad_norm": 0.7051045298576355, + "learning_rate": 0.00012687652253726652, + "loss": 0.8673, + "step": 9300 + }, + { + "epoch": 
1.6559829059829059, + "grad_norm": 0.731675386428833, + "learning_rate": 0.0001268630398287964, + "loss": 0.8609, + "step": 9301 + }, + { + "epoch": 1.6561609686609686, + "grad_norm": 0.6796799302101135, + "learning_rate": 0.00012684955659400087, + "loss": 1.0157, + "step": 9302 + }, + { + "epoch": 1.6563390313390314, + "grad_norm": 0.6270264983177185, + "learning_rate": 0.000126836072833144, + "loss": 0.8924, + "step": 9303 + }, + { + "epoch": 1.6565170940170941, + "grad_norm": 0.7235464453697205, + "learning_rate": 0.00012682258854649004, + "loss": 0.8904, + "step": 9304 + }, + { + "epoch": 1.6566951566951567, + "grad_norm": 0.7644724249839783, + "learning_rate": 0.00012680910373430318, + "loss": 0.9119, + "step": 9305 + }, + { + "epoch": 1.6568732193732194, + "grad_norm": 0.661411702632904, + "learning_rate": 0.00012679561839684764, + "loss": 1.0066, + "step": 9306 + }, + { + "epoch": 1.657051282051282, + "grad_norm": 0.6981723308563232, + "learning_rate": 0.0001267821325343876, + "loss": 1.2579, + "step": 9307 + }, + { + "epoch": 1.6572293447293447, + "grad_norm": 0.6469807028770447, + "learning_rate": 0.0001267686461471873, + "loss": 0.8678, + "step": 9308 + }, + { + "epoch": 1.6574074074074074, + "grad_norm": 0.8255495429039001, + "learning_rate": 0.000126755159235511, + "loss": 0.9053, + "step": 9309 + }, + { + "epoch": 1.6575854700854702, + "grad_norm": 0.6882261037826538, + "learning_rate": 0.00012674167179962294, + "loss": 0.8364, + "step": 9310 + }, + { + "epoch": 1.6577635327635327, + "grad_norm": 0.6816701889038086, + "learning_rate": 0.00012672818383978733, + "loss": 0.9627, + "step": 9311 + }, + { + "epoch": 1.6579415954415955, + "grad_norm": 0.6993424892425537, + "learning_rate": 0.00012671469535626852, + "loss": 0.8337, + "step": 9312 + }, + { + "epoch": 1.658119658119658, + "grad_norm": 0.6271458864212036, + "learning_rate": 0.00012670120634933075, + "loss": 0.8322, + "step": 9313 + }, + { + "epoch": 1.6582977207977208, + "grad_norm": 
0.7012003660202026, + "learning_rate": 0.00012668771681923827, + "loss": 0.8895, + "step": 9314 + }, + { + "epoch": 1.6584757834757835, + "grad_norm": 0.6704670190811157, + "learning_rate": 0.00012667422676625547, + "loss": 1.0544, + "step": 9315 + }, + { + "epoch": 1.6586538461538463, + "grad_norm": 0.6189491748809814, + "learning_rate": 0.0001266607361906466, + "loss": 0.9623, + "step": 9316 + }, + { + "epoch": 1.6588319088319088, + "grad_norm": 0.7065694332122803, + "learning_rate": 0.000126647245092676, + "loss": 0.8874, + "step": 9317 + }, + { + "epoch": 1.6590099715099715, + "grad_norm": 0.7473452687263489, + "learning_rate": 0.00012663375347260795, + "loss": 1.0576, + "step": 9318 + }, + { + "epoch": 1.659188034188034, + "grad_norm": 0.6839408874511719, + "learning_rate": 0.0001266202613307068, + "loss": 0.9127, + "step": 9319 + }, + { + "epoch": 1.6593660968660968, + "grad_norm": 0.7154020071029663, + "learning_rate": 0.00012660676866723699, + "loss": 1.1174, + "step": 9320 + }, + { + "epoch": 1.6595441595441596, + "grad_norm": 0.7123729586601257, + "learning_rate": 0.0001265932754824628, + "loss": 0.9617, + "step": 9321 + }, + { + "epoch": 1.6597222222222223, + "grad_norm": 0.7537810802459717, + "learning_rate": 0.0001265797817766486, + "loss": 1.0333, + "step": 9322 + }, + { + "epoch": 1.6599002849002849, + "grad_norm": 0.706551730632782, + "learning_rate": 0.00012656628755005884, + "loss": 1.0838, + "step": 9323 + }, + { + "epoch": 1.6600783475783476, + "grad_norm": 0.8104004859924316, + "learning_rate": 0.0001265527928029578, + "loss": 0.9807, + "step": 9324 + }, + { + "epoch": 1.6602564102564101, + "grad_norm": 0.6892881989479065, + "learning_rate": 0.00012653929753560998, + "loss": 0.9941, + "step": 9325 + }, + { + "epoch": 1.6604344729344729, + "grad_norm": 0.5919203758239746, + "learning_rate": 0.00012652580174827974, + "loss": 0.9268, + "step": 9326 + }, + { + "epoch": 1.6606125356125356, + "grad_norm": 0.6715863347053528, + "learning_rate": 
0.00012651230544123154, + "loss": 1.0912, + "step": 9327 + }, + { + "epoch": 1.6607905982905984, + "grad_norm": 0.6765137314796448, + "learning_rate": 0.0001264988086147298, + "loss": 1.1576, + "step": 9328 + }, + { + "epoch": 1.660968660968661, + "grad_norm": 0.6781638860702515, + "learning_rate": 0.00012648531126903888, + "loss": 1.1162, + "step": 9329 + }, + { + "epoch": 1.6611467236467237, + "grad_norm": 0.715871274471283, + "learning_rate": 0.00012647181340442337, + "loss": 0.714, + "step": 9330 + }, + { + "epoch": 1.6613247863247862, + "grad_norm": 0.6237258315086365, + "learning_rate": 0.00012645831502114762, + "loss": 0.8512, + "step": 9331 + }, + { + "epoch": 1.661502849002849, + "grad_norm": 0.6668339967727661, + "learning_rate": 0.0001264448161194762, + "loss": 1.0384, + "step": 9332 + }, + { + "epoch": 1.6616809116809117, + "grad_norm": 0.8316730260848999, + "learning_rate": 0.00012643131669967352, + "loss": 0.8931, + "step": 9333 + }, + { + "epoch": 1.6618589743589745, + "grad_norm": 0.7013183832168579, + "learning_rate": 0.00012641781676200406, + "loss": 1.0548, + "step": 9334 + }, + { + "epoch": 1.6620370370370372, + "grad_norm": 0.6980466842651367, + "learning_rate": 0.00012640431630673243, + "loss": 0.8988, + "step": 9335 + }, + { + "epoch": 1.6622150997150997, + "grad_norm": 0.7045995593070984, + "learning_rate": 0.000126390815334123, + "loss": 1.107, + "step": 9336 + }, + { + "epoch": 1.6623931623931623, + "grad_norm": 0.6699773669242859, + "learning_rate": 0.00012637731384444043, + "loss": 1.1757, + "step": 9337 + }, + { + "epoch": 1.662571225071225, + "grad_norm": 0.6489999294281006, + "learning_rate": 0.00012636381183794916, + "loss": 0.9282, + "step": 9338 + }, + { + "epoch": 1.6627492877492878, + "grad_norm": 0.7085952758789062, + "learning_rate": 0.00012635030931491375, + "loss": 1.0221, + "step": 9339 + }, + { + "epoch": 1.6629273504273505, + "grad_norm": 0.6893135905265808, + "learning_rate": 0.00012633680627559878, + "loss": 1.0517, + 
"step": 9340 + }, + { + "epoch": 1.6631054131054133, + "grad_norm": 0.5659682154655457, + "learning_rate": 0.00012632330272026882, + "loss": 0.6294, + "step": 9341 + }, + { + "epoch": 1.6632834757834758, + "grad_norm": 0.6889018416404724, + "learning_rate": 0.00012630979864918838, + "loss": 1.0735, + "step": 9342 + }, + { + "epoch": 1.6634615384615383, + "grad_norm": 0.7333424687385559, + "learning_rate": 0.00012629629406262212, + "loss": 0.9079, + "step": 9343 + }, + { + "epoch": 1.663639601139601, + "grad_norm": 0.6340580582618713, + "learning_rate": 0.00012628278896083462, + "loss": 0.9738, + "step": 9344 + }, + { + "epoch": 1.6638176638176638, + "grad_norm": 0.7042564749717712, + "learning_rate": 0.00012626928334409044, + "loss": 0.959, + "step": 9345 + }, + { + "epoch": 1.6639957264957266, + "grad_norm": 0.711757242679596, + "learning_rate": 0.00012625577721265424, + "loss": 0.8113, + "step": 9346 + }, + { + "epoch": 1.6641737891737893, + "grad_norm": 0.7723299264907837, + "learning_rate": 0.0001262422705667906, + "loss": 1.1724, + "step": 9347 + }, + { + "epoch": 1.6643518518518519, + "grad_norm": 0.711334228515625, + "learning_rate": 0.00012622876340676422, + "loss": 1.0121, + "step": 9348 + }, + { + "epoch": 1.6645299145299144, + "grad_norm": 0.6954590678215027, + "learning_rate": 0.0001262152557328397, + "loss": 1.2093, + "step": 9349 + }, + { + "epoch": 1.6647079772079771, + "grad_norm": 0.6341620087623596, + "learning_rate": 0.00012620174754528166, + "loss": 1.0535, + "step": 9350 + }, + { + "epoch": 1.66488603988604, + "grad_norm": 0.6434268355369568, + "learning_rate": 0.00012618823884435484, + "loss": 0.8964, + "step": 9351 + }, + { + "epoch": 1.6650641025641026, + "grad_norm": 0.7685084939002991, + "learning_rate": 0.00012617472963032385, + "loss": 1.0639, + "step": 9352 + }, + { + "epoch": 1.6652421652421654, + "grad_norm": 0.6347958445549011, + "learning_rate": 0.00012616121990345345, + "loss": 1.0252, + "step": 9353 + }, + { + "epoch": 
1.665420227920228, + "grad_norm": 0.647722601890564, + "learning_rate": 0.0001261477096640083, + "loss": 0.9527, + "step": 9354 + }, + { + "epoch": 1.6655982905982905, + "grad_norm": 0.5942047834396362, + "learning_rate": 0.000126134198912253, + "loss": 1.0062, + "step": 9355 + }, + { + "epoch": 1.6657763532763532, + "grad_norm": 0.683555006980896, + "learning_rate": 0.00012612068764845247, + "loss": 0.8101, + "step": 9356 + }, + { + "epoch": 1.665954415954416, + "grad_norm": 0.6832289099693298, + "learning_rate": 0.00012610717587287128, + "loss": 1.1436, + "step": 9357 + }, + { + "epoch": 1.6661324786324787, + "grad_norm": 0.7035253047943115, + "learning_rate": 0.00012609366358577422, + "loss": 0.9724, + "step": 9358 + }, + { + "epoch": 1.6663105413105415, + "grad_norm": 0.6471409797668457, + "learning_rate": 0.00012608015078742604, + "loss": 0.776, + "step": 9359 + }, + { + "epoch": 1.666488603988604, + "grad_norm": 0.7069687247276306, + "learning_rate": 0.00012606663747809145, + "loss": 0.9667, + "step": 9360 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.6744135618209839, + "learning_rate": 0.00012605312365803525, + "loss": 1.1152, + "step": 9361 + }, + { + "epoch": 1.6668447293447293, + "grad_norm": 0.7212334275245667, + "learning_rate": 0.00012603960932752227, + "loss": 1.1543, + "step": 9362 + }, + { + "epoch": 1.667022792022792, + "grad_norm": 0.6501669883728027, + "learning_rate": 0.0001260260944868172, + "loss": 0.8595, + "step": 9363 + }, + { + "epoch": 1.6672008547008548, + "grad_norm": 0.6970864534378052, + "learning_rate": 0.00012601257913618486, + "loss": 0.9364, + "step": 9364 + }, + { + "epoch": 1.6673789173789175, + "grad_norm": 0.6802223324775696, + "learning_rate": 0.00012599906327589007, + "loss": 0.8429, + "step": 9365 + }, + { + "epoch": 1.66755698005698, + "grad_norm": 0.6842933893203735, + "learning_rate": 0.00012598554690619764, + "loss": 1.1255, + "step": 9366 + }, + { + "epoch": 1.6677350427350426, + "grad_norm": 
0.6547088623046875, + "learning_rate": 0.0001259720300273724, + "loss": 0.983, + "step": 9367 + }, + { + "epoch": 1.6679131054131053, + "grad_norm": 0.620424211025238, + "learning_rate": 0.0001259585126396792, + "loss": 0.918, + "step": 9368 + }, + { + "epoch": 1.668091168091168, + "grad_norm": 0.5659816861152649, + "learning_rate": 0.00012594499474338287, + "loss": 0.7788, + "step": 9369 + }, + { + "epoch": 1.6682692307692308, + "grad_norm": 0.5904595255851746, + "learning_rate": 0.00012593147633874826, + "loss": 0.801, + "step": 9370 + }, + { + "epoch": 1.6684472934472936, + "grad_norm": 0.6444024443626404, + "learning_rate": 0.0001259179574260402, + "loss": 1.0997, + "step": 9371 + }, + { + "epoch": 1.6686253561253561, + "grad_norm": 0.6408827304840088, + "learning_rate": 0.00012590443800552365, + "loss": 0.9839, + "step": 9372 + }, + { + "epoch": 1.6688034188034186, + "grad_norm": 0.752391517162323, + "learning_rate": 0.00012589091807746345, + "loss": 1.0249, + "step": 9373 + }, + { + "epoch": 1.6689814814814814, + "grad_norm": 0.8256397247314453, + "learning_rate": 0.00012587739764212448, + "loss": 0.9541, + "step": 9374 + }, + { + "epoch": 1.6691595441595442, + "grad_norm": 0.7878768444061279, + "learning_rate": 0.00012586387669977166, + "loss": 1.0071, + "step": 9375 + }, + { + "epoch": 1.669337606837607, + "grad_norm": 0.6179735660552979, + "learning_rate": 0.0001258503552506699, + "loss": 0.8495, + "step": 9376 + }, + { + "epoch": 1.6695156695156697, + "grad_norm": 0.6699580550193787, + "learning_rate": 0.00012583683329508413, + "loss": 0.8999, + "step": 9377 + }, + { + "epoch": 1.6696937321937322, + "grad_norm": 0.6542006731033325, + "learning_rate": 0.00012582331083327929, + "loss": 1.0357, + "step": 9378 + }, + { + "epoch": 1.6698717948717947, + "grad_norm": 0.7275210618972778, + "learning_rate": 0.0001258097878655203, + "loss": 1.0259, + "step": 9379 + }, + { + "epoch": 1.6700498575498575, + "grad_norm": 0.6836326122283936, + "learning_rate": 
0.00012579626439207216, + "loss": 1.0428, + "step": 9380 + }, + { + "epoch": 1.6702279202279202, + "grad_norm": 0.760123610496521, + "learning_rate": 0.00012578274041319978, + "loss": 0.9716, + "step": 9381 + }, + { + "epoch": 1.670405982905983, + "grad_norm": 0.5525194406509399, + "learning_rate": 0.00012576921592916818, + "loss": 0.8253, + "step": 9382 + }, + { + "epoch": 1.6705840455840457, + "grad_norm": 0.6881270408630371, + "learning_rate": 0.00012575569094024232, + "loss": 1.0571, + "step": 9383 + }, + { + "epoch": 1.6707621082621082, + "grad_norm": 0.6776245832443237, + "learning_rate": 0.0001257421654466872, + "loss": 0.9119, + "step": 9384 + }, + { + "epoch": 1.6709401709401708, + "grad_norm": 0.7903014421463013, + "learning_rate": 0.0001257286394487678, + "loss": 1.0626, + "step": 9385 + }, + { + "epoch": 1.6711182336182335, + "grad_norm": 0.61158287525177, + "learning_rate": 0.0001257151129467492, + "loss": 0.9378, + "step": 9386 + }, + { + "epoch": 1.6712962962962963, + "grad_norm": 0.655189573764801, + "learning_rate": 0.00012570158594089637, + "loss": 0.9334, + "step": 9387 + }, + { + "epoch": 1.671474358974359, + "grad_norm": 0.6707320809364319, + "learning_rate": 0.0001256880584314743, + "loss": 1.1802, + "step": 9388 + }, + { + "epoch": 1.6716524216524218, + "grad_norm": 0.847341775894165, + "learning_rate": 0.00012567453041874814, + "loss": 1.1169, + "step": 9389 + }, + { + "epoch": 1.6718304843304843, + "grad_norm": 0.6136410236358643, + "learning_rate": 0.00012566100190298287, + "loss": 0.8959, + "step": 9390 + }, + { + "epoch": 1.672008547008547, + "grad_norm": 0.7203437089920044, + "learning_rate": 0.00012564747288444357, + "loss": 0.9803, + "step": 9391 + }, + { + "epoch": 1.6721866096866096, + "grad_norm": 0.7832576632499695, + "learning_rate": 0.00012563394336339534, + "loss": 0.8696, + "step": 9392 + }, + { + "epoch": 1.6723646723646723, + "grad_norm": 0.6940804719924927, + "learning_rate": 0.00012562041334010323, + "loss": 1.0571, + 
"step": 9393 + }, + { + "epoch": 1.672542735042735, + "grad_norm": 0.6042298674583435, + "learning_rate": 0.00012560688281483234, + "loss": 0.8835, + "step": 9394 + }, + { + "epoch": 1.6727207977207978, + "grad_norm": 0.7870675921440125, + "learning_rate": 0.00012559335178784776, + "loss": 1.1585, + "step": 9395 + }, + { + "epoch": 1.6728988603988604, + "grad_norm": 0.7448568940162659, + "learning_rate": 0.00012557982025941463, + "loss": 0.9699, + "step": 9396 + }, + { + "epoch": 1.6730769230769231, + "grad_norm": 0.7226544618606567, + "learning_rate": 0.00012556628822979807, + "loss": 0.7817, + "step": 9397 + }, + { + "epoch": 1.6732549857549857, + "grad_norm": 0.5652043223381042, + "learning_rate": 0.0001255527556992632, + "loss": 0.8077, + "step": 9398 + }, + { + "epoch": 1.6734330484330484, + "grad_norm": 0.6459930539131165, + "learning_rate": 0.00012553922266807517, + "loss": 1.22, + "step": 9399 + }, + { + "epoch": 1.6736111111111112, + "grad_norm": 0.7568991780281067, + "learning_rate": 0.00012552568913649912, + "loss": 1.1559, + "step": 9400 + }, + { + "epoch": 1.673789173789174, + "grad_norm": 0.7462680339813232, + "learning_rate": 0.0001255121551048002, + "loss": 1.1438, + "step": 9401 + }, + { + "epoch": 1.6739672364672364, + "grad_norm": 0.6653871536254883, + "learning_rate": 0.0001254986205732436, + "loss": 0.9468, + "step": 9402 + }, + { + "epoch": 1.6741452991452992, + "grad_norm": 0.6261825561523438, + "learning_rate": 0.0001254850855420945, + "loss": 0.8558, + "step": 9403 + }, + { + "epoch": 1.6743233618233617, + "grad_norm": 0.6442354321479797, + "learning_rate": 0.0001254715500116181, + "loss": 0.8605, + "step": 9404 + }, + { + "epoch": 1.6745014245014245, + "grad_norm": 0.7483665943145752, + "learning_rate": 0.00012545801398207958, + "loss": 0.9089, + "step": 9405 + }, + { + "epoch": 1.6746794871794872, + "grad_norm": 0.7319819927215576, + "learning_rate": 0.00012544447745374416, + "loss": 0.9937, + "step": 9406 + }, + { + "epoch": 
1.67485754985755, + "grad_norm": 0.703014075756073, + "learning_rate": 0.00012543094042687708, + "loss": 0.9597, + "step": 9407 + }, + { + "epoch": 1.6750356125356125, + "grad_norm": 0.6593887209892273, + "learning_rate": 0.00012541740290174353, + "loss": 0.844, + "step": 9408 + }, + { + "epoch": 1.6752136752136753, + "grad_norm": 0.6567463874816895, + "learning_rate": 0.00012540386487860879, + "loss": 1.0744, + "step": 9409 + }, + { + "epoch": 1.6753917378917378, + "grad_norm": 0.7784611582756042, + "learning_rate": 0.00012539032635773805, + "loss": 0.974, + "step": 9410 + }, + { + "epoch": 1.6755698005698005, + "grad_norm": 0.6760087609291077, + "learning_rate": 0.00012537678733939663, + "loss": 0.8948, + "step": 9411 + }, + { + "epoch": 1.6757478632478633, + "grad_norm": 0.825965940952301, + "learning_rate": 0.0001253632478238498, + "loss": 1.1196, + "step": 9412 + }, + { + "epoch": 1.675925925925926, + "grad_norm": 0.7215564250946045, + "learning_rate": 0.00012534970781136277, + "loss": 1.1774, + "step": 9413 + }, + { + "epoch": 1.6761039886039886, + "grad_norm": 0.6548578143119812, + "learning_rate": 0.00012533616730220094, + "loss": 0.8671, + "step": 9414 + }, + { + "epoch": 1.6762820512820513, + "grad_norm": 0.7257684469223022, + "learning_rate": 0.00012532262629662947, + "loss": 1.105, + "step": 9415 + }, + { + "epoch": 1.6764601139601139, + "grad_norm": 0.6695847511291504, + "learning_rate": 0.00012530908479491378, + "loss": 0.9189, + "step": 9416 + }, + { + "epoch": 1.6766381766381766, + "grad_norm": 0.684695303440094, + "learning_rate": 0.00012529554279731915, + "loss": 1.066, + "step": 9417 + }, + { + "epoch": 1.6768162393162394, + "grad_norm": 0.7107276320457458, + "learning_rate": 0.0001252820003041109, + "loss": 0.9311, + "step": 9418 + }, + { + "epoch": 1.676994301994302, + "grad_norm": 0.6755440831184387, + "learning_rate": 0.0001252684573155544, + "loss": 1.1036, + "step": 9419 + }, + { + "epoch": 1.6771723646723646, + "grad_norm": 
0.7571110725402832, + "learning_rate": 0.00012525491383191491, + "loss": 1.0244, + "step": 9420 + }, + { + "epoch": 1.6773504273504274, + "grad_norm": 0.6960614323616028, + "learning_rate": 0.0001252413698534579, + "loss": 0.9077, + "step": 9421 + }, + { + "epoch": 1.67752849002849, + "grad_norm": 0.6675550937652588, + "learning_rate": 0.00012522782538044867, + "loss": 1.0543, + "step": 9422 + }, + { + "epoch": 1.6777065527065527, + "grad_norm": 0.6637391448020935, + "learning_rate": 0.0001252142804131526, + "loss": 0.9471, + "step": 9423 + }, + { + "epoch": 1.6778846153846154, + "grad_norm": 0.6382880210876465, + "learning_rate": 0.00012520073495183508, + "loss": 0.9729, + "step": 9424 + }, + { + "epoch": 1.6780626780626782, + "grad_norm": 0.731922447681427, + "learning_rate": 0.0001251871889967615, + "loss": 1.0385, + "step": 9425 + }, + { + "epoch": 1.6782407407407407, + "grad_norm": 0.5868890285491943, + "learning_rate": 0.00012517364254819728, + "loss": 0.8466, + "step": 9426 + }, + { + "epoch": 1.6784188034188035, + "grad_norm": 0.8535677790641785, + "learning_rate": 0.00012516009560640786, + "loss": 1.1009, + "step": 9427 + }, + { + "epoch": 1.678596866096866, + "grad_norm": 0.7044199705123901, + "learning_rate": 0.0001251465481716586, + "loss": 1.0862, + "step": 9428 + }, + { + "epoch": 1.6787749287749287, + "grad_norm": 0.7207323312759399, + "learning_rate": 0.00012513300024421498, + "loss": 1.064, + "step": 9429 + }, + { + "epoch": 1.6789529914529915, + "grad_norm": 0.7739703059196472, + "learning_rate": 0.0001251194518243424, + "loss": 1.1738, + "step": 9430 + }, + { + "epoch": 1.6791310541310542, + "grad_norm": 0.6829344630241394, + "learning_rate": 0.00012510590291230637, + "loss": 1.0555, + "step": 9431 + }, + { + "epoch": 1.6793091168091168, + "grad_norm": 0.6760238409042358, + "learning_rate": 0.0001250923535083723, + "loss": 1.2177, + "step": 9432 + }, + { + "epoch": 1.6794871794871795, + "grad_norm": 0.6666911840438843, + "learning_rate": 
0.0001250788036128057, + "loss": 0.8957, + "step": 9433 + }, + { + "epoch": 1.679665242165242, + "grad_norm": 0.747797429561615, + "learning_rate": 0.00012506525322587207, + "loss": 0.9793, + "step": 9434 + }, + { + "epoch": 1.6798433048433048, + "grad_norm": 0.6261107325553894, + "learning_rate": 0.00012505170234783686, + "loss": 0.7781, + "step": 9435 + }, + { + "epoch": 1.6800213675213675, + "grad_norm": 0.7055163979530334, + "learning_rate": 0.00012503815097896555, + "loss": 1.0617, + "step": 9436 + }, + { + "epoch": 1.6801994301994303, + "grad_norm": 0.5567409992218018, + "learning_rate": 0.00012502459911952371, + "loss": 0.7911, + "step": 9437 + }, + { + "epoch": 1.6803774928774928, + "grad_norm": 0.7410423159599304, + "learning_rate": 0.0001250110467697768, + "loss": 1.1041, + "step": 9438 + }, + { + "epoch": 1.6805555555555556, + "grad_norm": 0.6185283064842224, + "learning_rate": 0.00012499749392999045, + "loss": 0.8101, + "step": 9439 + }, + { + "epoch": 1.680733618233618, + "grad_norm": 0.6988311409950256, + "learning_rate": 0.0001249839406004301, + "loss": 0.8579, + "step": 9440 + }, + { + "epoch": 1.6809116809116809, + "grad_norm": 0.5588746070861816, + "learning_rate": 0.00012497038678136132, + "loss": 0.8035, + "step": 9441 + }, + { + "epoch": 1.6810897435897436, + "grad_norm": 0.6568905711174011, + "learning_rate": 0.0001249568324730497, + "loss": 0.7455, + "step": 9442 + }, + { + "epoch": 1.6812678062678064, + "grad_norm": 0.6924821138381958, + "learning_rate": 0.00012494327767576078, + "loss": 1.134, + "step": 9443 + }, + { + "epoch": 1.681445868945869, + "grad_norm": 0.6940170526504517, + "learning_rate": 0.00012492972238976018, + "loss": 0.9719, + "step": 9444 + }, + { + "epoch": 1.6816239316239316, + "grad_norm": 0.667465090751648, + "learning_rate": 0.00012491616661531343, + "loss": 0.953, + "step": 9445 + }, + { + "epoch": 1.6818019943019942, + "grad_norm": 0.7693275809288025, + "learning_rate": 0.00012490261035268612, + "loss": 1.1342, + 
"step": 9446 + }, + { + "epoch": 1.681980056980057, + "grad_norm": 0.7243115305900574, + "learning_rate": 0.00012488905360214393, + "loss": 1.1847, + "step": 9447 + }, + { + "epoch": 1.6821581196581197, + "grad_norm": 0.657357931137085, + "learning_rate": 0.00012487549636395245, + "loss": 0.8747, + "step": 9448 + }, + { + "epoch": 1.6823361823361824, + "grad_norm": 0.7471592426300049, + "learning_rate": 0.00012486193863837727, + "loss": 1.0472, + "step": 9449 + }, + { + "epoch": 1.6825142450142452, + "grad_norm": 0.7476530075073242, + "learning_rate": 0.00012484838042568406, + "loss": 1.0708, + "step": 9450 + }, + { + "epoch": 1.6826923076923077, + "grad_norm": 0.6031121611595154, + "learning_rate": 0.00012483482172613846, + "loss": 0.8243, + "step": 9451 + }, + { + "epoch": 1.6828703703703702, + "grad_norm": 0.6733492016792297, + "learning_rate": 0.00012482126254000607, + "loss": 0.7808, + "step": 9452 + }, + { + "epoch": 1.683048433048433, + "grad_norm": 0.5865318179130554, + "learning_rate": 0.00012480770286755265, + "loss": 0.829, + "step": 9453 + }, + { + "epoch": 1.6832264957264957, + "grad_norm": 0.6805713772773743, + "learning_rate": 0.0001247941427090438, + "loss": 0.7206, + "step": 9454 + }, + { + "epoch": 1.6834045584045585, + "grad_norm": 0.6514836549758911, + "learning_rate": 0.0001247805820647453, + "loss": 0.9499, + "step": 9455 + }, + { + "epoch": 1.6835826210826212, + "grad_norm": 0.7432990074157715, + "learning_rate": 0.0001247670209349227, + "loss": 1.1324, + "step": 9456 + }, + { + "epoch": 1.6837606837606838, + "grad_norm": 0.6348414421081543, + "learning_rate": 0.00012475345931984178, + "loss": 0.8246, + "step": 9457 + }, + { + "epoch": 1.6839387464387463, + "grad_norm": 0.7194374203681946, + "learning_rate": 0.00012473989721976825, + "loss": 0.9634, + "step": 9458 + }, + { + "epoch": 1.684116809116809, + "grad_norm": 0.7869647741317749, + "learning_rate": 0.00012472633463496785, + "loss": 1.2115, + "step": 9459 + }, + { + "epoch": 
1.6842948717948718, + "grad_norm": 0.6672070026397705, + "learning_rate": 0.00012471277156570623, + "loss": 0.9842, + "step": 9460 + }, + { + "epoch": 1.6844729344729346, + "grad_norm": 0.6611466407775879, + "learning_rate": 0.00012469920801224925, + "loss": 0.9343, + "step": 9461 + }, + { + "epoch": 1.6846509971509973, + "grad_norm": 0.6715068221092224, + "learning_rate": 0.0001246856439748626, + "loss": 0.6852, + "step": 9462 + }, + { + "epoch": 1.6848290598290598, + "grad_norm": 0.641942024230957, + "learning_rate": 0.00012467207945381198, + "loss": 0.8863, + "step": 9463 + }, + { + "epoch": 1.6850071225071224, + "grad_norm": 0.8414762616157532, + "learning_rate": 0.00012465851444936325, + "loss": 1.3404, + "step": 9464 + }, + { + "epoch": 1.6851851851851851, + "grad_norm": 0.715752363204956, + "learning_rate": 0.00012464494896178216, + "loss": 1.123, + "step": 9465 + }, + { + "epoch": 1.6853632478632479, + "grad_norm": 0.6913973093032837, + "learning_rate": 0.00012463138299133447, + "loss": 1.0659, + "step": 9466 + }, + { + "epoch": 1.6855413105413106, + "grad_norm": 0.6998484134674072, + "learning_rate": 0.000124617816538286, + "loss": 1.0555, + "step": 9467 + }, + { + "epoch": 1.6857193732193734, + "grad_norm": 0.7313308119773865, + "learning_rate": 0.00012460424960290256, + "loss": 1.0915, + "step": 9468 + }, + { + "epoch": 1.685897435897436, + "grad_norm": 0.6790569424629211, + "learning_rate": 0.00012459068218544995, + "loss": 1.0214, + "step": 9469 + }, + { + "epoch": 1.6860754985754984, + "grad_norm": 0.6494466662406921, + "learning_rate": 0.00012457711428619402, + "loss": 0.9476, + "step": 9470 + }, + { + "epoch": 1.6862535612535612, + "grad_norm": 0.8048526048660278, + "learning_rate": 0.0001245635459054006, + "loss": 1.1852, + "step": 9471 + }, + { + "epoch": 1.686431623931624, + "grad_norm": 0.6237879395484924, + "learning_rate": 0.0001245499770433355, + "loss": 1.0106, + "step": 9472 + }, + { + "epoch": 1.6866096866096867, + "grad_norm": 
0.6282906532287598, + "learning_rate": 0.0001245364077002646, + "loss": 0.9858, + "step": 9473 + }, + { + "epoch": 1.6867877492877494, + "grad_norm": 0.7239370346069336, + "learning_rate": 0.00012452283787645375, + "loss": 0.9586, + "step": 9474 + }, + { + "epoch": 1.686965811965812, + "grad_norm": 0.6438776850700378, + "learning_rate": 0.00012450926757216887, + "loss": 0.9198, + "step": 9475 + }, + { + "epoch": 1.6871438746438745, + "grad_norm": 0.6451360583305359, + "learning_rate": 0.00012449569678767578, + "loss": 1.0183, + "step": 9476 + }, + { + "epoch": 1.6873219373219372, + "grad_norm": 0.6950216293334961, + "learning_rate": 0.0001244821255232404, + "loss": 0.9048, + "step": 9477 + }, + { + "epoch": 1.6875, + "grad_norm": 0.710489809513092, + "learning_rate": 0.00012446855377912865, + "loss": 1.1596, + "step": 9478 + }, + { + "epoch": 1.6876780626780628, + "grad_norm": 0.6819305419921875, + "learning_rate": 0.0001244549815556064, + "loss": 0.8486, + "step": 9479 + }, + { + "epoch": 1.6878561253561255, + "grad_norm": 0.7185879945755005, + "learning_rate": 0.00012444140885293958, + "loss": 0.9539, + "step": 9480 + }, + { + "epoch": 1.688034188034188, + "grad_norm": 0.8181464672088623, + "learning_rate": 0.00012442783567139415, + "loss": 1.0038, + "step": 9481 + }, + { + "epoch": 1.6882122507122506, + "grad_norm": 0.47161349654197693, + "learning_rate": 0.000124414262011236, + "loss": 0.67, + "step": 9482 + }, + { + "epoch": 1.6883903133903133, + "grad_norm": 0.7752482295036316, + "learning_rate": 0.00012440068787273112, + "loss": 0.9944, + "step": 9483 + }, + { + "epoch": 1.688568376068376, + "grad_norm": 0.7119397521018982, + "learning_rate": 0.00012438711325614543, + "loss": 0.9098, + "step": 9484 + }, + { + "epoch": 1.6887464387464388, + "grad_norm": 0.7161153554916382, + "learning_rate": 0.00012437353816174493, + "loss": 1.0003, + "step": 9485 + }, + { + "epoch": 1.6889245014245016, + "grad_norm": 0.5989507436752319, + "learning_rate": 
0.0001243599625897956, + "loss": 1.0301, + "step": 9486 + }, + { + "epoch": 1.689102564102564, + "grad_norm": 0.7906841039657593, + "learning_rate": 0.00012434638654056334, + "loss": 1.0388, + "step": 9487 + }, + { + "epoch": 1.6892806267806266, + "grad_norm": 0.6679551601409912, + "learning_rate": 0.00012433281001431428, + "loss": 0.9505, + "step": 9488 + }, + { + "epoch": 1.6894586894586894, + "grad_norm": 0.7090578675270081, + "learning_rate": 0.0001243192330113143, + "loss": 0.8616, + "step": 9489 + }, + { + "epoch": 1.6896367521367521, + "grad_norm": 0.6401308178901672, + "learning_rate": 0.00012430565553182949, + "loss": 0.9099, + "step": 9490 + }, + { + "epoch": 1.6898148148148149, + "grad_norm": 0.7360149621963501, + "learning_rate": 0.00012429207757612586, + "loss": 1.0233, + "step": 9491 + }, + { + "epoch": 1.6899928774928776, + "grad_norm": 0.6736137270927429, + "learning_rate": 0.00012427849914446946, + "loss": 0.9803, + "step": 9492 + }, + { + "epoch": 1.6901709401709402, + "grad_norm": 0.7728668451309204, + "learning_rate": 0.00012426492023712623, + "loss": 1.2316, + "step": 9493 + }, + { + "epoch": 1.6903490028490027, + "grad_norm": 0.789718508720398, + "learning_rate": 0.00012425134085436234, + "loss": 1.1218, + "step": 9494 + }, + { + "epoch": 1.6905270655270654, + "grad_norm": 0.7314121723175049, + "learning_rate": 0.0001242377609964438, + "loss": 1.1294, + "step": 9495 + }, + { + "epoch": 1.6907051282051282, + "grad_norm": 0.7222046256065369, + "learning_rate": 0.0001242241806636367, + "loss": 1.0288, + "step": 9496 + }, + { + "epoch": 1.690883190883191, + "grad_norm": 0.7546363472938538, + "learning_rate": 0.00012421059985620708, + "loss": 0.8781, + "step": 9497 + }, + { + "epoch": 1.6910612535612537, + "grad_norm": 0.7502550482749939, + "learning_rate": 0.00012419701857442104, + "loss": 0.927, + "step": 9498 + }, + { + "epoch": 1.6912393162393162, + "grad_norm": 0.6244059205055237, + "learning_rate": 0.00012418343681854473, + "loss": 0.9689, + 
"step": 9499 + }, + { + "epoch": 1.6914173789173788, + "grad_norm": 0.7214263677597046, + "learning_rate": 0.00012416985458884417, + "loss": 1.0842, + "step": 9500 + }, + { + "epoch": 1.6915954415954415, + "grad_norm": 0.6960242390632629, + "learning_rate": 0.00012415627188558555, + "loss": 0.9766, + "step": 9501 + }, + { + "epoch": 1.6917735042735043, + "grad_norm": 0.6687830686569214, + "learning_rate": 0.00012414268870903494, + "loss": 1.0222, + "step": 9502 + }, + { + "epoch": 1.691951566951567, + "grad_norm": 0.8611155152320862, + "learning_rate": 0.00012412910505945848, + "loss": 1.1792, + "step": 9503 + }, + { + "epoch": 1.6921296296296298, + "grad_norm": 0.6655587553977966, + "learning_rate": 0.00012411552093712235, + "loss": 0.8763, + "step": 9504 + }, + { + "epoch": 1.6923076923076923, + "grad_norm": 0.7829837799072266, + "learning_rate": 0.00012410193634229268, + "loss": 1.0803, + "step": 9505 + }, + { + "epoch": 1.6924857549857548, + "grad_norm": 0.7951042652130127, + "learning_rate": 0.00012408835127523566, + "loss": 1.0925, + "step": 9506 + }, + { + "epoch": 1.6926638176638176, + "grad_norm": 0.715495228767395, + "learning_rate": 0.0001240747657362174, + "loss": 1.2411, + "step": 9507 + }, + { + "epoch": 1.6928418803418803, + "grad_norm": 0.6779513359069824, + "learning_rate": 0.00012406117972550414, + "loss": 0.8886, + "step": 9508 + }, + { + "epoch": 1.693019943019943, + "grad_norm": 0.647588312625885, + "learning_rate": 0.00012404759324336203, + "loss": 1.107, + "step": 9509 + }, + { + "epoch": 1.6931980056980058, + "grad_norm": 0.7398989796638489, + "learning_rate": 0.00012403400629005726, + "loss": 1.0256, + "step": 9510 + }, + { + "epoch": 1.6933760683760684, + "grad_norm": 0.7572638392448425, + "learning_rate": 0.0001240204188658561, + "loss": 0.9662, + "step": 9511 + }, + { + "epoch": 1.693554131054131, + "grad_norm": 0.7044163346290588, + "learning_rate": 0.00012400683097102473, + "loss": 1.1388, + "step": 9512 + }, + { + "epoch": 
1.6937321937321936, + "grad_norm": 0.7889094948768616, + "learning_rate": 0.00012399324260582936, + "loss": 1.0453, + "step": 9513 + }, + { + "epoch": 1.6939102564102564, + "grad_norm": 0.7977854609489441, + "learning_rate": 0.00012397965377053627, + "loss": 1.015, + "step": 9514 + }, + { + "epoch": 1.6940883190883191, + "grad_norm": 0.6223814487457275, + "learning_rate": 0.00012396606446541165, + "loss": 0.7985, + "step": 9515 + }, + { + "epoch": 1.694266381766382, + "grad_norm": 0.8307462334632874, + "learning_rate": 0.0001239524746907218, + "loss": 0.8899, + "step": 9516 + }, + { + "epoch": 1.6944444444444444, + "grad_norm": 0.7780544757843018, + "learning_rate": 0.00012393888444673295, + "loss": 0.9406, + "step": 9517 + }, + { + "epoch": 1.6946225071225072, + "grad_norm": 0.6894499659538269, + "learning_rate": 0.0001239252937337114, + "loss": 0.9412, + "step": 9518 + }, + { + "epoch": 1.6948005698005697, + "grad_norm": 0.7000680565834045, + "learning_rate": 0.00012391170255192342, + "loss": 1.0314, + "step": 9519 + }, + { + "epoch": 1.6949786324786325, + "grad_norm": 0.6772416830062866, + "learning_rate": 0.0001238981109016353, + "loss": 0.9153, + "step": 9520 + }, + { + "epoch": 1.6951566951566952, + "grad_norm": 0.7069609761238098, + "learning_rate": 0.00012388451878311333, + "loss": 1.1777, + "step": 9521 + }, + { + "epoch": 1.695334757834758, + "grad_norm": 0.6138432621955872, + "learning_rate": 0.00012387092619662386, + "loss": 0.8085, + "step": 9522 + }, + { + "epoch": 1.6955128205128205, + "grad_norm": 0.6122859716415405, + "learning_rate": 0.00012385733314243313, + "loss": 0.8534, + "step": 9523 + }, + { + "epoch": 1.6956908831908832, + "grad_norm": 0.7499903440475464, + "learning_rate": 0.00012384373962080755, + "loss": 0.9329, + "step": 9524 + }, + { + "epoch": 1.6958689458689458, + "grad_norm": 0.6413441896438599, + "learning_rate": 0.00012383014563201343, + "loss": 0.9609, + "step": 9525 + }, + { + "epoch": 1.6960470085470085, + "grad_norm": 
0.7467969059944153, + "learning_rate": 0.0001238165511763171, + "loss": 0.9142, + "step": 9526 + }, + { + "epoch": 1.6962250712250713, + "grad_norm": 0.6540884375572205, + "learning_rate": 0.00012380295625398494, + "loss": 0.9503, + "step": 9527 + }, + { + "epoch": 1.696403133903134, + "grad_norm": 0.6298567652702332, + "learning_rate": 0.00012378936086528326, + "loss": 0.8853, + "step": 9528 + }, + { + "epoch": 1.6965811965811965, + "grad_norm": 0.8003417253494263, + "learning_rate": 0.00012377576501047845, + "loss": 0.969, + "step": 9529 + }, + { + "epoch": 1.6967592592592593, + "grad_norm": 0.8318493962287903, + "learning_rate": 0.00012376216868983697, + "loss": 1.1413, + "step": 9530 + }, + { + "epoch": 1.6969373219373218, + "grad_norm": 0.8294426202774048, + "learning_rate": 0.00012374857190362515, + "loss": 1.1885, + "step": 9531 + }, + { + "epoch": 1.6971153846153846, + "grad_norm": 0.7502955198287964, + "learning_rate": 0.0001237349746521094, + "loss": 1.233, + "step": 9532 + }, + { + "epoch": 1.6972934472934473, + "grad_norm": 0.6306588649749756, + "learning_rate": 0.00012372137693555612, + "loss": 1.2255, + "step": 9533 + }, + { + "epoch": 1.69747150997151, + "grad_norm": 0.7802746891975403, + "learning_rate": 0.0001237077787542317, + "loss": 1.2054, + "step": 9534 + }, + { + "epoch": 1.6976495726495726, + "grad_norm": 0.685114860534668, + "learning_rate": 0.00012369418010840265, + "loss": 0.9865, + "step": 9535 + }, + { + "epoch": 1.6978276353276354, + "grad_norm": 0.6656857132911682, + "learning_rate": 0.00012368058099833536, + "loss": 1.1579, + "step": 9536 + }, + { + "epoch": 1.698005698005698, + "grad_norm": 0.6596674919128418, + "learning_rate": 0.00012366698142429625, + "loss": 0.9104, + "step": 9537 + }, + { + "epoch": 1.6981837606837606, + "grad_norm": 0.6025584936141968, + "learning_rate": 0.00012365338138655183, + "loss": 1.117, + "step": 9538 + }, + { + "epoch": 1.6983618233618234, + "grad_norm": 0.671585202217102, + "learning_rate": 
0.0001236397808853685, + "loss": 1.0271, + "step": 9539 + }, + { + "epoch": 1.6985398860398861, + "grad_norm": 0.7467984557151794, + "learning_rate": 0.0001236261799210128, + "loss": 1.0411, + "step": 9540 + }, + { + "epoch": 1.6987179487179487, + "grad_norm": 0.6251640915870667, + "learning_rate": 0.0001236125784937512, + "loss": 0.7154, + "step": 9541 + }, + { + "epoch": 1.6988960113960114, + "grad_norm": 0.7560956478118896, + "learning_rate": 0.00012359897660385016, + "loss": 1.0048, + "step": 9542 + }, + { + "epoch": 1.699074074074074, + "grad_norm": 0.6144903302192688, + "learning_rate": 0.00012358537425157618, + "loss": 1.1294, + "step": 9543 + }, + { + "epoch": 1.6992521367521367, + "grad_norm": 0.7839425206184387, + "learning_rate": 0.00012357177143719578, + "loss": 1.0725, + "step": 9544 + }, + { + "epoch": 1.6994301994301995, + "grad_norm": 0.6488651037216187, + "learning_rate": 0.00012355816816097553, + "loss": 0.9267, + "step": 9545 + }, + { + "epoch": 1.6996082621082622, + "grad_norm": 0.6848782896995544, + "learning_rate": 0.00012354456442318187, + "loss": 1.0426, + "step": 9546 + }, + { + "epoch": 1.6997863247863247, + "grad_norm": 0.7164611220359802, + "learning_rate": 0.0001235309602240814, + "loss": 0.8208, + "step": 9547 + }, + { + "epoch": 1.6999643874643875, + "grad_norm": 0.6725530624389648, + "learning_rate": 0.0001235173555639406, + "loss": 0.9366, + "step": 9548 + }, + { + "epoch": 1.70014245014245, + "grad_norm": 0.6958004236221313, + "learning_rate": 0.00012350375044302612, + "loss": 1.0185, + "step": 9549 + }, + { + "epoch": 1.7003205128205128, + "grad_norm": 0.8035947680473328, + "learning_rate": 0.00012349014486160445, + "loss": 1.065, + "step": 9550 + }, + { + "epoch": 1.7004985754985755, + "grad_norm": 0.6705633997917175, + "learning_rate": 0.00012347653881994222, + "loss": 0.8381, + "step": 9551 + }, + { + "epoch": 1.7006766381766383, + "grad_norm": 0.6652300357818604, + "learning_rate": 0.00012346293231830596, + "loss": 1.1428, + 
"step": 9552 + }, + { + "epoch": 1.7008547008547008, + "grad_norm": 0.6719335913658142, + "learning_rate": 0.0001234493253569623, + "loss": 1.0138, + "step": 9553 + }, + { + "epoch": 1.7010327635327636, + "grad_norm": 0.746981680393219, + "learning_rate": 0.0001234357179361778, + "loss": 1.1169, + "step": 9554 + }, + { + "epoch": 1.701210826210826, + "grad_norm": 0.6768170595169067, + "learning_rate": 0.0001234221100562191, + "loss": 0.9065, + "step": 9555 + }, + { + "epoch": 1.7013888888888888, + "grad_norm": 0.7127171754837036, + "learning_rate": 0.00012340850171735278, + "loss": 0.9467, + "step": 9556 + }, + { + "epoch": 1.7015669515669516, + "grad_norm": 0.6802694797515869, + "learning_rate": 0.00012339489291984554, + "loss": 0.8938, + "step": 9557 + }, + { + "epoch": 1.7017450142450143, + "grad_norm": 0.7101455926895142, + "learning_rate": 0.00012338128366396394, + "loss": 1.1939, + "step": 9558 + }, + { + "epoch": 1.7019230769230769, + "grad_norm": 0.621223509311676, + "learning_rate": 0.00012336767394997467, + "loss": 0.7583, + "step": 9559 + }, + { + "epoch": 1.7021011396011396, + "grad_norm": 0.7130763530731201, + "learning_rate": 0.00012335406377814439, + "loss": 0.8684, + "step": 9560 + }, + { + "epoch": 1.7022792022792022, + "grad_norm": 0.6761086583137512, + "learning_rate": 0.00012334045314873972, + "loss": 1.0197, + "step": 9561 + }, + { + "epoch": 1.702457264957265, + "grad_norm": 0.7030459642410278, + "learning_rate": 0.00012332684206202736, + "loss": 0.8627, + "step": 9562 + }, + { + "epoch": 1.7026353276353277, + "grad_norm": 0.6278037428855896, + "learning_rate": 0.000123313230518274, + "loss": 0.8953, + "step": 9563 + }, + { + "epoch": 1.7028133903133904, + "grad_norm": 0.6450623869895935, + "learning_rate": 0.00012329961851774627, + "loss": 0.8826, + "step": 9564 + }, + { + "epoch": 1.702991452991453, + "grad_norm": 0.7324244976043701, + "learning_rate": 0.00012328600606071097, + "loss": 1.0133, + "step": 9565 + }, + { + "epoch": 
1.7031695156695157, + "grad_norm": 0.6560033559799194, + "learning_rate": 0.00012327239314743473, + "loss": 0.9601, + "step": 9566 + }, + { + "epoch": 1.7033475783475782, + "grad_norm": 0.6693514585494995, + "learning_rate": 0.0001232587797781843, + "loss": 0.9447, + "step": 9567 + }, + { + "epoch": 1.703525641025641, + "grad_norm": 0.6403199434280396, + "learning_rate": 0.00012324516595322638, + "loss": 0.8554, + "step": 9568 + }, + { + "epoch": 1.7037037037037037, + "grad_norm": 0.8290280103683472, + "learning_rate": 0.00012323155167282774, + "loss": 1.1877, + "step": 9569 + }, + { + "epoch": 1.7038817663817665, + "grad_norm": 0.7207778692245483, + "learning_rate": 0.00012321793693725509, + "loss": 1.0978, + "step": 9570 + }, + { + "epoch": 1.7040598290598292, + "grad_norm": 0.8794265985488892, + "learning_rate": 0.00012320432174677519, + "loss": 0.9387, + "step": 9571 + }, + { + "epoch": 1.7042378917378918, + "grad_norm": 0.6683359146118164, + "learning_rate": 0.00012319070610165484, + "loss": 0.9227, + "step": 9572 + }, + { + "epoch": 1.7044159544159543, + "grad_norm": 0.7342001795768738, + "learning_rate": 0.00012317709000216076, + "loss": 0.9453, + "step": 9573 + }, + { + "epoch": 1.704594017094017, + "grad_norm": 0.6315770149230957, + "learning_rate": 0.00012316347344855973, + "loss": 0.8263, + "step": 9574 + }, + { + "epoch": 1.7047720797720798, + "grad_norm": 0.7697155475616455, + "learning_rate": 0.00012314985644111857, + "loss": 1.0238, + "step": 9575 + }, + { + "epoch": 1.7049501424501425, + "grad_norm": 0.6674068570137024, + "learning_rate": 0.00012313623898010408, + "loss": 1.0823, + "step": 9576 + }, + { + "epoch": 1.7051282051282053, + "grad_norm": 0.6995484232902527, + "learning_rate": 0.00012312262106578304, + "loss": 1.2001, + "step": 9577 + }, + { + "epoch": 1.7053062678062678, + "grad_norm": 0.7639257907867432, + "learning_rate": 0.00012310900269842226, + "loss": 1.3438, + "step": 9578 + }, + { + "epoch": 1.7054843304843303, + "grad_norm": 
0.6486390233039856, + "learning_rate": 0.00012309538387828857, + "loss": 0.9924, + "step": 9579 + }, + { + "epoch": 1.705662393162393, + "grad_norm": 0.6737813949584961, + "learning_rate": 0.00012308176460564885, + "loss": 0.8722, + "step": 9580 + }, + { + "epoch": 1.7058404558404558, + "grad_norm": 0.6462090611457825, + "learning_rate": 0.00012306814488076987, + "loss": 1.1013, + "step": 9581 + }, + { + "epoch": 1.7060185185185186, + "grad_norm": 0.7887832522392273, + "learning_rate": 0.00012305452470391852, + "loss": 0.9998, + "step": 9582 + }, + { + "epoch": 1.7061965811965814, + "grad_norm": 0.6345070004463196, + "learning_rate": 0.00012304090407536165, + "loss": 1.0305, + "step": 9583 + }, + { + "epoch": 1.7063746438746439, + "grad_norm": 0.6398460268974304, + "learning_rate": 0.0001230272829953661, + "loss": 1.2243, + "step": 9584 + }, + { + "epoch": 1.7065527065527064, + "grad_norm": 0.6501944065093994, + "learning_rate": 0.00012301366146419879, + "loss": 0.9425, + "step": 9585 + }, + { + "epoch": 1.7067307692307692, + "grad_norm": 0.6406761407852173, + "learning_rate": 0.00012300003948212661, + "loss": 0.948, + "step": 9586 + }, + { + "epoch": 1.706908831908832, + "grad_norm": 0.7114266157150269, + "learning_rate": 0.00012298641704941644, + "loss": 1.1291, + "step": 9587 + }, + { + "epoch": 1.7070868945868947, + "grad_norm": 0.6653099656105042, + "learning_rate": 0.00012297279416633515, + "loss": 1.0156, + "step": 9588 + }, + { + "epoch": 1.7072649572649574, + "grad_norm": 0.5970917344093323, + "learning_rate": 0.0001229591708331497, + "loss": 0.9424, + "step": 9589 + }, + { + "epoch": 1.70744301994302, + "grad_norm": 0.6861461400985718, + "learning_rate": 0.00012294554705012694, + "loss": 0.7581, + "step": 9590 + }, + { + "epoch": 1.7076210826210825, + "grad_norm": 0.6930568218231201, + "learning_rate": 0.00012293192281753393, + "loss": 1.0544, + "step": 9591 + }, + { + "epoch": 1.7077991452991452, + "grad_norm": 0.7420656085014343, + "learning_rate": 
0.00012291829813563748, + "loss": 0.7092, + "step": 9592 + }, + { + "epoch": 1.707977207977208, + "grad_norm": 0.6607801914215088, + "learning_rate": 0.0001229046730047046, + "loss": 0.5544, + "step": 9593 + }, + { + "epoch": 1.7081552706552707, + "grad_norm": 0.8419139385223389, + "learning_rate": 0.00012289104742500224, + "loss": 1.0443, + "step": 9594 + }, + { + "epoch": 1.7083333333333335, + "grad_norm": 0.6774617433547974, + "learning_rate": 0.00012287742139679734, + "loss": 1.0098, + "step": 9595 + }, + { + "epoch": 1.708511396011396, + "grad_norm": 0.7517698407173157, + "learning_rate": 0.0001228637949203569, + "loss": 1.1145, + "step": 9596 + }, + { + "epoch": 1.7086894586894585, + "grad_norm": 0.6048635840415955, + "learning_rate": 0.00012285016799594791, + "loss": 0.7398, + "step": 9597 + }, + { + "epoch": 1.7088675213675213, + "grad_norm": 0.8054425716400146, + "learning_rate": 0.00012283654062383734, + "loss": 1.0893, + "step": 9598 + }, + { + "epoch": 1.709045584045584, + "grad_norm": 0.8694897294044495, + "learning_rate": 0.0001228229128042922, + "loss": 1.2366, + "step": 9599 + }, + { + "epoch": 1.7092236467236468, + "grad_norm": 0.7460638880729675, + "learning_rate": 0.00012280928453757946, + "loss": 1.1753, + "step": 9600 + }, + { + "epoch": 1.7094017094017095, + "grad_norm": 0.6714958548545837, + "learning_rate": 0.00012279565582396618, + "loss": 1.0473, + "step": 9601 + }, + { + "epoch": 1.709579772079772, + "grad_norm": 0.6893340945243835, + "learning_rate": 0.00012278202666371937, + "loss": 1.2761, + "step": 9602 + }, + { + "epoch": 1.7097578347578346, + "grad_norm": 0.6816153526306152, + "learning_rate": 0.00012276839705710612, + "loss": 0.991, + "step": 9603 + }, + { + "epoch": 1.7099358974358974, + "grad_norm": 0.6961633563041687, + "learning_rate": 0.0001227547670043934, + "loss": 1.0634, + "step": 9604 + }, + { + "epoch": 1.71011396011396, + "grad_norm": 0.643734872341156, + "learning_rate": 0.0001227411365058483, + "loss": 0.8672, + 
"step": 9605 + }, + { + "epoch": 1.7102920227920229, + "grad_norm": 0.7313315272331238, + "learning_rate": 0.00012272750556173784, + "loss": 1.1152, + "step": 9606 + }, + { + "epoch": 1.7104700854700856, + "grad_norm": 0.6464954614639282, + "learning_rate": 0.00012271387417232916, + "loss": 0.8798, + "step": 9607 + }, + { + "epoch": 1.7106481481481481, + "grad_norm": 0.8365204334259033, + "learning_rate": 0.00012270024233788929, + "loss": 1.213, + "step": 9608 + }, + { + "epoch": 1.7108262108262107, + "grad_norm": 0.6460705995559692, + "learning_rate": 0.0001226866100586853, + "loss": 0.9232, + "step": 9609 + }, + { + "epoch": 1.7110042735042734, + "grad_norm": 0.6446022987365723, + "learning_rate": 0.00012267297733498434, + "loss": 0.8295, + "step": 9610 + }, + { + "epoch": 1.7111823361823362, + "grad_norm": 0.7692012190818787, + "learning_rate": 0.00012265934416705345, + "loss": 1.0715, + "step": 9611 + }, + { + "epoch": 1.711360398860399, + "grad_norm": 0.671154260635376, + "learning_rate": 0.0001226457105551598, + "loss": 0.9752, + "step": 9612 + }, + { + "epoch": 1.7115384615384617, + "grad_norm": 0.6525935530662537, + "learning_rate": 0.00012263207649957053, + "loss": 1.09, + "step": 9613 + }, + { + "epoch": 1.7117165242165242, + "grad_norm": 0.6984749436378479, + "learning_rate": 0.0001226184420005527, + "loss": 0.9956, + "step": 9614 + }, + { + "epoch": 1.7118945868945867, + "grad_norm": 0.6769809126853943, + "learning_rate": 0.0001226048070583735, + "loss": 1.0151, + "step": 9615 + }, + { + "epoch": 1.7120726495726495, + "grad_norm": 0.6085978746414185, + "learning_rate": 0.00012259117167330005, + "loss": 0.8706, + "step": 9616 + }, + { + "epoch": 1.7122507122507122, + "grad_norm": 0.7335749268531799, + "learning_rate": 0.00012257753584559952, + "loss": 1.0575, + "step": 9617 + }, + { + "epoch": 1.712428774928775, + "grad_norm": 0.7392038106918335, + "learning_rate": 0.0001225638995755391, + "loss": 0.8763, + "step": 9618 + }, + { + "epoch": 
1.7126068376068377, + "grad_norm": 0.6708608865737915, + "learning_rate": 0.00012255026286338592, + "loss": 1.131, + "step": 9619 + }, + { + "epoch": 1.7127849002849003, + "grad_norm": 0.726657509803772, + "learning_rate": 0.0001225366257094072, + "loss": 1.0569, + "step": 9620 + }, + { + "epoch": 1.7129629629629628, + "grad_norm": 0.749098002910614, + "learning_rate": 0.0001225229881138701, + "loss": 0.9196, + "step": 9621 + }, + { + "epoch": 1.7131410256410255, + "grad_norm": 0.6550580263137817, + "learning_rate": 0.00012250935007704182, + "loss": 1.0244, + "step": 9622 + }, + { + "epoch": 1.7133190883190883, + "grad_norm": 0.7714282274246216, + "learning_rate": 0.00012249571159918962, + "loss": 1.1025, + "step": 9623 + }, + { + "epoch": 1.713497150997151, + "grad_norm": 0.7869850397109985, + "learning_rate": 0.00012248207268058064, + "loss": 0.9238, + "step": 9624 + }, + { + "epoch": 1.7136752136752138, + "grad_norm": 0.7187856435775757, + "learning_rate": 0.00012246843332148216, + "loss": 1.081, + "step": 9625 + }, + { + "epoch": 1.7138532763532763, + "grad_norm": 0.6634210348129272, + "learning_rate": 0.00012245479352216142, + "loss": 1.1944, + "step": 9626 + }, + { + "epoch": 1.714031339031339, + "grad_norm": 0.6609212160110474, + "learning_rate": 0.00012244115328288567, + "loss": 0.9613, + "step": 9627 + }, + { + "epoch": 1.7142094017094016, + "grad_norm": 0.7906867861747742, + "learning_rate": 0.0001224275126039221, + "loss": 1.2692, + "step": 9628 + }, + { + "epoch": 1.7143874643874644, + "grad_norm": 0.8037096858024597, + "learning_rate": 0.000122413871485538, + "loss": 0.9823, + "step": 9629 + }, + { + "epoch": 1.7145655270655271, + "grad_norm": 0.7740145921707153, + "learning_rate": 0.00012240022992800068, + "loss": 1.1937, + "step": 9630 + }, + { + "epoch": 1.7147435897435899, + "grad_norm": 0.595372200012207, + "learning_rate": 0.00012238658793157738, + "loss": 0.9153, + "step": 9631 + }, + { + "epoch": 1.7149216524216524, + "grad_norm": 
0.6671900749206543, + "learning_rate": 0.0001223729454965354, + "loss": 1.0895, + "step": 9632 + }, + { + "epoch": 1.7150997150997151, + "grad_norm": 0.5805774927139282, + "learning_rate": 0.000122359302623142, + "loss": 1.0001, + "step": 9633 + }, + { + "epoch": 1.7152777777777777, + "grad_norm": 0.8851602673530579, + "learning_rate": 0.00012234565931166456, + "loss": 1.2828, + "step": 9634 + }, + { + "epoch": 1.7154558404558404, + "grad_norm": 0.6960011720657349, + "learning_rate": 0.0001223320155623703, + "loss": 1.0622, + "step": 9635 + }, + { + "epoch": 1.7156339031339032, + "grad_norm": 0.5587009191513062, + "learning_rate": 0.0001223183713755266, + "loss": 0.83, + "step": 9636 + }, + { + "epoch": 1.715811965811966, + "grad_norm": 0.6892730593681335, + "learning_rate": 0.00012230472675140076, + "loss": 0.9214, + "step": 9637 + }, + { + "epoch": 1.7159900284900285, + "grad_norm": 0.6545090079307556, + "learning_rate": 0.00012229108169026017, + "loss": 0.829, + "step": 9638 + }, + { + "epoch": 1.7161680911680912, + "grad_norm": 0.6539101600646973, + "learning_rate": 0.00012227743619237213, + "loss": 1.0686, + "step": 9639 + }, + { + "epoch": 1.7163461538461537, + "grad_norm": 0.5887274146080017, + "learning_rate": 0.000122263790258004, + "loss": 0.9285, + "step": 9640 + }, + { + "epoch": 1.7165242165242165, + "grad_norm": 0.6328918933868408, + "learning_rate": 0.00012225014388742313, + "loss": 0.9684, + "step": 9641 + }, + { + "epoch": 1.7167022792022792, + "grad_norm": 0.6377436518669128, + "learning_rate": 0.00012223649708089694, + "loss": 0.9425, + "step": 9642 + }, + { + "epoch": 1.716880341880342, + "grad_norm": 0.6967392563819885, + "learning_rate": 0.00012222284983869275, + "loss": 0.9342, + "step": 9643 + }, + { + "epoch": 1.7170584045584045, + "grad_norm": 0.7051317691802979, + "learning_rate": 0.00012220920216107802, + "loss": 1.1843, + "step": 9644 + }, + { + "epoch": 1.7172364672364673, + "grad_norm": 0.6864503622055054, + "learning_rate": 
0.00012219555404832007, + "loss": 1.0371, + "step": 9645 + }, + { + "epoch": 1.7174145299145298, + "grad_norm": 0.583454430103302, + "learning_rate": 0.00012218190550068638, + "loss": 0.6774, + "step": 9646 + }, + { + "epoch": 1.7175925925925926, + "grad_norm": 0.6755677461624146, + "learning_rate": 0.0001221682565184443, + "loss": 0.9517, + "step": 9647 + }, + { + "epoch": 1.7177706552706553, + "grad_norm": 0.7230031490325928, + "learning_rate": 0.0001221546071018613, + "loss": 1.0385, + "step": 9648 + }, + { + "epoch": 1.717948717948718, + "grad_norm": 0.7381200194358826, + "learning_rate": 0.0001221409572512048, + "loss": 0.9893, + "step": 9649 + }, + { + "epoch": 1.7181267806267806, + "grad_norm": 0.7079094648361206, + "learning_rate": 0.0001221273069667422, + "loss": 0.7793, + "step": 9650 + }, + { + "epoch": 1.7183048433048433, + "grad_norm": 0.6666881442070007, + "learning_rate": 0.00012211365624874106, + "loss": 0.9752, + "step": 9651 + }, + { + "epoch": 1.7184829059829059, + "grad_norm": 0.6196922659873962, + "learning_rate": 0.00012210000509746868, + "loss": 0.922, + "step": 9652 + }, + { + "epoch": 1.7186609686609686, + "grad_norm": 0.657879650592804, + "learning_rate": 0.00012208635351319266, + "loss": 1.2583, + "step": 9653 + }, + { + "epoch": 1.7188390313390314, + "grad_norm": 0.7240459322929382, + "learning_rate": 0.00012207270149618043, + "loss": 0.8479, + "step": 9654 + }, + { + "epoch": 1.7190170940170941, + "grad_norm": 0.8293825387954712, + "learning_rate": 0.00012205904904669945, + "loss": 0.9092, + "step": 9655 + }, + { + "epoch": 1.7191951566951567, + "grad_norm": 0.6907553672790527, + "learning_rate": 0.0001220453961650172, + "loss": 1.0543, + "step": 9656 + }, + { + "epoch": 1.7193732193732194, + "grad_norm": 0.7178300023078918, + "learning_rate": 0.00012203174285140124, + "loss": 0.9147, + "step": 9657 + }, + { + "epoch": 1.719551282051282, + "grad_norm": 0.7037166357040405, + "learning_rate": 0.00012201808910611905, + "loss": 0.8685, + 
"step": 9658 + }, + { + "epoch": 1.7197293447293447, + "grad_norm": 0.5850751996040344, + "learning_rate": 0.00012200443492943813, + "loss": 0.72, + "step": 9659 + }, + { + "epoch": 1.7199074074074074, + "grad_norm": 0.744239330291748, + "learning_rate": 0.00012199078032162603, + "loss": 0.9717, + "step": 9660 + }, + { + "epoch": 1.7200854700854702, + "grad_norm": 0.6509126424789429, + "learning_rate": 0.00012197712528295025, + "loss": 0.9768, + "step": 9661 + }, + { + "epoch": 1.7202635327635327, + "grad_norm": 0.623220682144165, + "learning_rate": 0.00012196346981367837, + "loss": 0.9824, + "step": 9662 + }, + { + "epoch": 1.7204415954415955, + "grad_norm": 0.6376451849937439, + "learning_rate": 0.00012194981391407792, + "loss": 0.8228, + "step": 9663 + }, + { + "epoch": 1.720619658119658, + "grad_norm": 0.794830322265625, + "learning_rate": 0.00012193615758441648, + "loss": 0.9168, + "step": 9664 + }, + { + "epoch": 1.7207977207977208, + "grad_norm": 0.7812975645065308, + "learning_rate": 0.0001219225008249616, + "loss": 0.8625, + "step": 9665 + }, + { + "epoch": 1.7209757834757835, + "grad_norm": 0.6843218207359314, + "learning_rate": 0.0001219088436359808, + "loss": 1.0176, + "step": 9666 + }, + { + "epoch": 1.7211538461538463, + "grad_norm": 0.6924905180931091, + "learning_rate": 0.00012189518601774178, + "loss": 0.855, + "step": 9667 + }, + { + "epoch": 1.7213319088319088, + "grad_norm": 0.6348826289176941, + "learning_rate": 0.00012188152797051202, + "loss": 1.1596, + "step": 9668 + }, + { + "epoch": 1.7215099715099715, + "grad_norm": 0.7170482873916626, + "learning_rate": 0.00012186786949455922, + "loss": 0.9811, + "step": 9669 + }, + { + "epoch": 1.721688034188034, + "grad_norm": 0.7471763491630554, + "learning_rate": 0.00012185421059015094, + "loss": 1.0925, + "step": 9670 + }, + { + "epoch": 1.7218660968660968, + "grad_norm": 0.6771119236946106, + "learning_rate": 0.00012184055125755481, + "loss": 0.9403, + "step": 9671 + }, + { + "epoch": 
1.7220441595441596, + "grad_norm": 0.4335343539714813, + "learning_rate": 0.0001218268914970384, + "loss": 0.4925, + "step": 9672 + }, + { + "epoch": 1.7222222222222223, + "grad_norm": 0.6652585864067078, + "learning_rate": 0.00012181323130886943, + "loss": 0.7684, + "step": 9673 + }, + { + "epoch": 1.7224002849002849, + "grad_norm": 0.6465467810630798, + "learning_rate": 0.00012179957069331548, + "loss": 0.9011, + "step": 9674 + }, + { + "epoch": 1.7225783475783476, + "grad_norm": 0.6725688576698303, + "learning_rate": 0.00012178590965064427, + "loss": 0.9563, + "step": 9675 + }, + { + "epoch": 1.7227564102564101, + "grad_norm": 0.6223418712615967, + "learning_rate": 0.00012177224818112341, + "loss": 0.9099, + "step": 9676 + }, + { + "epoch": 1.7229344729344729, + "grad_norm": 0.79325270652771, + "learning_rate": 0.00012175858628502053, + "loss": 1.0318, + "step": 9677 + }, + { + "epoch": 1.7231125356125356, + "grad_norm": 0.6735602617263794, + "learning_rate": 0.0001217449239626034, + "loss": 1.0797, + "step": 9678 + }, + { + "epoch": 1.7232905982905984, + "grad_norm": 0.7082492113113403, + "learning_rate": 0.00012173126121413962, + "loss": 1.1341, + "step": 9679 + }, + { + "epoch": 1.723468660968661, + "grad_norm": 0.6563859581947327, + "learning_rate": 0.00012171759803989696, + "loss": 0.8778, + "step": 9680 + }, + { + "epoch": 1.7236467236467237, + "grad_norm": 0.6867792010307312, + "learning_rate": 0.00012170393444014306, + "loss": 0.8301, + "step": 9681 + }, + { + "epoch": 1.7238247863247862, + "grad_norm": 0.7870511412620544, + "learning_rate": 0.00012169027041514562, + "loss": 0.9165, + "step": 9682 + }, + { + "epoch": 1.724002849002849, + "grad_norm": 0.8006493449211121, + "learning_rate": 0.00012167660596517241, + "loss": 1.0395, + "step": 9683 + }, + { + "epoch": 1.7241809116809117, + "grad_norm": 0.6936125159263611, + "learning_rate": 0.00012166294109049114, + "loss": 1.1037, + "step": 9684 + }, + { + "epoch": 1.7243589743589745, + "grad_norm": 
0.8176514506340027, + "learning_rate": 0.00012164927579136956, + "loss": 0.8791, + "step": 9685 + }, + { + "epoch": 1.7245370370370372, + "grad_norm": 0.6948300004005432, + "learning_rate": 0.00012163561006807537, + "loss": 0.9292, + "step": 9686 + }, + { + "epoch": 1.7247150997150997, + "grad_norm": 0.6237453818321228, + "learning_rate": 0.00012162194392087634, + "loss": 0.8553, + "step": 9687 + }, + { + "epoch": 1.7248931623931623, + "grad_norm": 0.6198007464408875, + "learning_rate": 0.00012160827735004021, + "loss": 0.9599, + "step": 9688 + }, + { + "epoch": 1.725071225071225, + "grad_norm": 0.639838695526123, + "learning_rate": 0.00012159461035583482, + "loss": 0.9328, + "step": 9689 + }, + { + "epoch": 1.7252492877492878, + "grad_norm": 0.7264436483383179, + "learning_rate": 0.00012158094293852789, + "loss": 1.0247, + "step": 9690 + }, + { + "epoch": 1.7254273504273505, + "grad_norm": 0.6320534348487854, + "learning_rate": 0.00012156727509838721, + "loss": 1.1222, + "step": 9691 + }, + { + "epoch": 1.7256054131054133, + "grad_norm": 0.6204122304916382, + "learning_rate": 0.00012155360683568056, + "loss": 0.9765, + "step": 9692 + }, + { + "epoch": 1.7257834757834758, + "grad_norm": 0.7026457190513611, + "learning_rate": 0.00012153993815067579, + "loss": 1.0178, + "step": 9693 + }, + { + "epoch": 1.7259615384615383, + "grad_norm": 0.6471006870269775, + "learning_rate": 0.00012152626904364067, + "loss": 1.0035, + "step": 9694 + }, + { + "epoch": 1.726139601139601, + "grad_norm": 0.6875706911087036, + "learning_rate": 0.00012151259951484301, + "loss": 0.7921, + "step": 9695 + }, + { + "epoch": 1.7263176638176638, + "grad_norm": 0.6963251233100891, + "learning_rate": 0.00012149892956455067, + "loss": 0.9677, + "step": 9696 + }, + { + "epoch": 1.7264957264957266, + "grad_norm": 0.9077282547950745, + "learning_rate": 0.00012148525919303142, + "loss": 0.9362, + "step": 9697 + }, + { + "epoch": 1.7266737891737893, + "grad_norm": 0.7347434163093567, + "learning_rate": 
0.00012147158840055319, + "loss": 0.8712, + "step": 9698 + }, + { + "epoch": 1.7268518518518519, + "grad_norm": 0.7206630110740662, + "learning_rate": 0.00012145791718738377, + "loss": 1.032, + "step": 9699 + }, + { + "epoch": 1.7270299145299144, + "grad_norm": 0.7174662947654724, + "learning_rate": 0.00012144424555379106, + "loss": 0.954, + "step": 9700 + }, + { + "epoch": 1.7272079772079771, + "grad_norm": 0.7442345023155212, + "learning_rate": 0.0001214305735000429, + "loss": 1.0709, + "step": 9701 + }, + { + "epoch": 1.72738603988604, + "grad_norm": 0.6154376268386841, + "learning_rate": 0.00012141690102640715, + "loss": 0.9365, + "step": 9702 + }, + { + "epoch": 1.7275641025641026, + "grad_norm": 0.6213796734809875, + "learning_rate": 0.00012140322813315172, + "loss": 0.8337, + "step": 9703 + }, + { + "epoch": 1.7277421652421654, + "grad_norm": 0.7682011127471924, + "learning_rate": 0.0001213895548205445, + "loss": 1.1579, + "step": 9704 + }, + { + "epoch": 1.727920227920228, + "grad_norm": 0.6796970963478088, + "learning_rate": 0.0001213758810888534, + "loss": 0.8875, + "step": 9705 + }, + { + "epoch": 1.7280982905982905, + "grad_norm": 0.7203732132911682, + "learning_rate": 0.0001213622069383463, + "loss": 0.7827, + "step": 9706 + }, + { + "epoch": 1.7282763532763532, + "grad_norm": 0.6151877045631409, + "learning_rate": 0.00012134853236929111, + "loss": 1.0282, + "step": 9707 + }, + { + "epoch": 1.728454415954416, + "grad_norm": 0.6665124297142029, + "learning_rate": 0.0001213348573819558, + "loss": 1.0636, + "step": 9708 + }, + { + "epoch": 1.7286324786324787, + "grad_norm": 0.7334614396095276, + "learning_rate": 0.00012132118197660829, + "loss": 1.0889, + "step": 9709 + }, + { + "epoch": 1.7288105413105415, + "grad_norm": 0.7267759442329407, + "learning_rate": 0.00012130750615351649, + "loss": 1.096, + "step": 9710 + }, + { + "epoch": 1.728988603988604, + "grad_norm": 0.6542944312095642, + "learning_rate": 0.00012129382991294837, + "loss": 1.0855, + 
"step": 9711 + }, + { + "epoch": 1.7291666666666665, + "grad_norm": 0.694523274898529, + "learning_rate": 0.00012128015325517193, + "loss": 0.8482, + "step": 9712 + }, + { + "epoch": 1.7293447293447293, + "grad_norm": 0.7879082560539246, + "learning_rate": 0.00012126647618045504, + "loss": 1.2356, + "step": 9713 + }, + { + "epoch": 1.729522792022792, + "grad_norm": 0.7108420729637146, + "learning_rate": 0.00012125279868906574, + "loss": 1.0185, + "step": 9714 + }, + { + "epoch": 1.7297008547008548, + "grad_norm": 0.6928725838661194, + "learning_rate": 0.000121239120781272, + "loss": 1.1507, + "step": 9715 + }, + { + "epoch": 1.7298789173789175, + "grad_norm": 0.6195241212844849, + "learning_rate": 0.00012122544245734182, + "loss": 0.8656, + "step": 9716 + }, + { + "epoch": 1.73005698005698, + "grad_norm": 0.5962017774581909, + "learning_rate": 0.00012121176371754317, + "loss": 0.918, + "step": 9717 + }, + { + "epoch": 1.7302350427350426, + "grad_norm": 0.7409394979476929, + "learning_rate": 0.00012119808456214407, + "loss": 1.0283, + "step": 9718 + }, + { + "epoch": 1.7304131054131053, + "grad_norm": 0.6571973562240601, + "learning_rate": 0.00012118440499141257, + "loss": 1.1015, + "step": 9719 + }, + { + "epoch": 1.730591168091168, + "grad_norm": 0.681394100189209, + "learning_rate": 0.00012117072500561664, + "loss": 0.8247, + "step": 9720 + }, + { + "epoch": 1.7307692307692308, + "grad_norm": 0.7278251647949219, + "learning_rate": 0.00012115704460502432, + "loss": 1.0693, + "step": 9721 + }, + { + "epoch": 1.7309472934472936, + "grad_norm": 0.6569405794143677, + "learning_rate": 0.0001211433637899037, + "loss": 0.8992, + "step": 9722 + }, + { + "epoch": 1.7311253561253561, + "grad_norm": 0.6305136680603027, + "learning_rate": 0.00012112968256052272, + "loss": 0.8543, + "step": 9723 + }, + { + "epoch": 1.7313034188034186, + "grad_norm": 0.6111339330673218, + "learning_rate": 0.00012111600091714956, + "loss": 0.991, + "step": 9724 + }, + { + "epoch": 
1.7314814814814814, + "grad_norm": 0.646973192691803, + "learning_rate": 0.00012110231886005223, + "loss": 0.8855, + "step": 9725 + }, + { + "epoch": 1.7316595441595442, + "grad_norm": 0.7054407000541687, + "learning_rate": 0.00012108863638949879, + "loss": 1.0816, + "step": 9726 + }, + { + "epoch": 1.731837606837607, + "grad_norm": 0.6592162847518921, + "learning_rate": 0.00012107495350575729, + "loss": 1.0961, + "step": 9727 + }, + { + "epoch": 1.7320156695156697, + "grad_norm": 0.6615595817565918, + "learning_rate": 0.00012106127020909587, + "loss": 0.9669, + "step": 9728 + }, + { + "epoch": 1.7321937321937322, + "grad_norm": 0.9030881524085999, + "learning_rate": 0.00012104758649978263, + "loss": 0.9438, + "step": 9729 + }, + { + "epoch": 1.7323717948717947, + "grad_norm": 0.6776516437530518, + "learning_rate": 0.00012103390237808566, + "loss": 0.8967, + "step": 9730 + }, + { + "epoch": 1.7325498575498575, + "grad_norm": 0.6010605096817017, + "learning_rate": 0.00012102021784427306, + "loss": 0.8893, + "step": 9731 + }, + { + "epoch": 1.7327279202279202, + "grad_norm": 0.6540384292602539, + "learning_rate": 0.00012100653289861295, + "loss": 0.9328, + "step": 9732 + }, + { + "epoch": 1.732905982905983, + "grad_norm": 0.6836950182914734, + "learning_rate": 0.00012099284754137345, + "loss": 0.9019, + "step": 9733 + }, + { + "epoch": 1.7330840455840457, + "grad_norm": 0.7597874402999878, + "learning_rate": 0.00012097916177282274, + "loss": 1.0093, + "step": 9734 + }, + { + "epoch": 1.7332621082621082, + "grad_norm": 0.7686513066291809, + "learning_rate": 0.00012096547559322892, + "loss": 0.8685, + "step": 9735 + }, + { + "epoch": 1.7334401709401708, + "grad_norm": 0.613777220249176, + "learning_rate": 0.0001209517890028602, + "loss": 0.8317, + "step": 9736 + }, + { + "epoch": 1.7336182336182335, + "grad_norm": 0.6788455843925476, + "learning_rate": 0.00012093810200198466, + "loss": 0.866, + "step": 9737 + }, + { + "epoch": 1.7337962962962963, + "grad_norm": 
0.616801381111145, + "learning_rate": 0.00012092441459087047, + "loss": 0.8299, + "step": 9738 + }, + { + "epoch": 1.733974358974359, + "grad_norm": 0.731987476348877, + "learning_rate": 0.00012091072676978589, + "loss": 1.089, + "step": 9739 + }, + { + "epoch": 1.7341524216524218, + "grad_norm": 0.7042871117591858, + "learning_rate": 0.00012089703853899905, + "loss": 0.8667, + "step": 9740 + }, + { + "epoch": 1.7343304843304843, + "grad_norm": 0.62722247838974, + "learning_rate": 0.00012088334989877817, + "loss": 0.9185, + "step": 9741 + }, + { + "epoch": 1.734508547008547, + "grad_norm": 0.6354684829711914, + "learning_rate": 0.0001208696608493914, + "loss": 0.9951, + "step": 9742 + }, + { + "epoch": 1.7346866096866096, + "grad_norm": 0.658647894859314, + "learning_rate": 0.00012085597139110698, + "loss": 0.9324, + "step": 9743 + }, + { + "epoch": 1.7348646723646723, + "grad_norm": 0.84359210729599, + "learning_rate": 0.00012084228152419312, + "loss": 1.0861, + "step": 9744 + }, + { + "epoch": 1.735042735042735, + "grad_norm": 0.6293938755989075, + "learning_rate": 0.00012082859124891807, + "loss": 0.9676, + "step": 9745 + }, + { + "epoch": 1.7352207977207978, + "grad_norm": 0.6398760676383972, + "learning_rate": 0.00012081490056555004, + "loss": 0.8502, + "step": 9746 + }, + { + "epoch": 1.7353988603988604, + "grad_norm": 0.6918041706085205, + "learning_rate": 0.00012080120947435726, + "loss": 1.0081, + "step": 9747 + }, + { + "epoch": 1.7355769230769231, + "grad_norm": 0.7374079823493958, + "learning_rate": 0.00012078751797560798, + "loss": 0.9485, + "step": 9748 + }, + { + "epoch": 1.7357549857549857, + "grad_norm": 0.7392128705978394, + "learning_rate": 0.00012077382606957049, + "loss": 0.9283, + "step": 9749 + }, + { + "epoch": 1.7359330484330484, + "grad_norm": 0.701320230960846, + "learning_rate": 0.00012076013375651303, + "loss": 1.0339, + "step": 9750 + }, + { + "epoch": 1.7361111111111112, + "grad_norm": 0.6316696405410767, + "learning_rate": 
0.00012074644103670387, + "loss": 0.9097, + "step": 9751 + }, + { + "epoch": 1.736289173789174, + "grad_norm": 0.6892024278640747, + "learning_rate": 0.00012073274791041132, + "loss": 1.0863, + "step": 9752 + }, + { + "epoch": 1.7364672364672364, + "grad_norm": 0.6032847762107849, + "learning_rate": 0.00012071905437790361, + "loss": 0.9305, + "step": 9753 + }, + { + "epoch": 1.7366452991452992, + "grad_norm": 0.6659184098243713, + "learning_rate": 0.00012070536043944907, + "loss": 0.9793, + "step": 9754 + }, + { + "epoch": 1.7368233618233617, + "grad_norm": 0.7413665056228638, + "learning_rate": 0.00012069166609531602, + "loss": 1.0523, + "step": 9755 + }, + { + "epoch": 1.7370014245014245, + "grad_norm": 0.7814368009567261, + "learning_rate": 0.00012067797134577275, + "loss": 0.9988, + "step": 9756 + }, + { + "epoch": 1.7371794871794872, + "grad_norm": 0.6174948811531067, + "learning_rate": 0.00012066427619108757, + "loss": 0.9002, + "step": 9757 + }, + { + "epoch": 1.73735754985755, + "grad_norm": 0.6521819233894348, + "learning_rate": 0.00012065058063152885, + "loss": 1.1307, + "step": 9758 + }, + { + "epoch": 1.7375356125356125, + "grad_norm": 0.6797493696212769, + "learning_rate": 0.00012063688466736489, + "loss": 0.84, + "step": 9759 + }, + { + "epoch": 1.7377136752136753, + "grad_norm": 0.6496474146842957, + "learning_rate": 0.00012062318829886404, + "loss": 0.86, + "step": 9760 + }, + { + "epoch": 1.7378917378917378, + "grad_norm": 0.6701306104660034, + "learning_rate": 0.00012060949152629467, + "loss": 0.9422, + "step": 9761 + }, + { + "epoch": 1.7380698005698005, + "grad_norm": 0.7331172823905945, + "learning_rate": 0.00012059579434992512, + "loss": 1.1648, + "step": 9762 + }, + { + "epoch": 1.7382478632478633, + "grad_norm": 0.63930743932724, + "learning_rate": 0.00012058209677002375, + "loss": 1.0617, + "step": 9763 + }, + { + "epoch": 1.738425925925926, + "grad_norm": 0.668851912021637, + "learning_rate": 0.00012056839878685895, + "loss": 0.8219, + 
"step": 9764 + }, + { + "epoch": 1.7386039886039886, + "grad_norm": 0.7305747270584106, + "learning_rate": 0.00012055470040069912, + "loss": 1.0416, + "step": 9765 + }, + { + "epoch": 1.7387820512820513, + "grad_norm": 0.6931866407394409, + "learning_rate": 0.00012054100161181264, + "loss": 1.0588, + "step": 9766 + }, + { + "epoch": 1.7389601139601139, + "grad_norm": 0.6565485000610352, + "learning_rate": 0.00012052730242046785, + "loss": 0.7885, + "step": 9767 + }, + { + "epoch": 1.7391381766381766, + "grad_norm": 0.739985466003418, + "learning_rate": 0.00012051360282693327, + "loss": 1.0973, + "step": 9768 + }, + { + "epoch": 1.7393162393162394, + "grad_norm": 0.6477079391479492, + "learning_rate": 0.00012049990283147723, + "loss": 0.9841, + "step": 9769 + }, + { + "epoch": 1.739494301994302, + "grad_norm": 0.7018330097198486, + "learning_rate": 0.00012048620243436819, + "loss": 1.0869, + "step": 9770 + }, + { + "epoch": 1.7396723646723646, + "grad_norm": 0.7087421417236328, + "learning_rate": 0.00012047250163587456, + "loss": 0.916, + "step": 9771 + }, + { + "epoch": 1.7398504273504274, + "grad_norm": 0.8747151494026184, + "learning_rate": 0.00012045880043626481, + "loss": 0.8245, + "step": 9772 + }, + { + "epoch": 1.74002849002849, + "grad_norm": 0.777498722076416, + "learning_rate": 0.00012044509883580735, + "loss": 1.071, + "step": 9773 + }, + { + "epoch": 1.7402065527065527, + "grad_norm": 0.6668971180915833, + "learning_rate": 0.00012043139683477062, + "loss": 1.0447, + "step": 9774 + }, + { + "epoch": 1.7403846153846154, + "grad_norm": 0.6702026724815369, + "learning_rate": 0.00012041769443342317, + "loss": 0.8688, + "step": 9775 + }, + { + "epoch": 1.7405626780626782, + "grad_norm": 0.7866267561912537, + "learning_rate": 0.00012040399163203337, + "loss": 1.0842, + "step": 9776 + }, + { + "epoch": 1.7407407407407407, + "grad_norm": 0.7655110955238342, + "learning_rate": 0.00012039028843086977, + "loss": 1.2417, + "step": 9777 + }, + { + "epoch": 
1.7409188034188035, + "grad_norm": 0.7084119915962219, + "learning_rate": 0.0001203765848302008, + "loss": 0.9844, + "step": 9778 + }, + { + "epoch": 1.741096866096866, + "grad_norm": 0.7135398983955383, + "learning_rate": 0.00012036288083029497, + "loss": 1.1102, + "step": 9779 + }, + { + "epoch": 1.7412749287749287, + "grad_norm": 0.6784615516662598, + "learning_rate": 0.0001203491764314208, + "loss": 1.0349, + "step": 9780 + }, + { + "epoch": 1.7414529914529915, + "grad_norm": 0.7170301079750061, + "learning_rate": 0.00012033547163384682, + "loss": 1.0899, + "step": 9781 + }, + { + "epoch": 1.7416310541310542, + "grad_norm": 0.6692060828208923, + "learning_rate": 0.0001203217664378415, + "loss": 1.0486, + "step": 9782 + }, + { + "epoch": 1.7418091168091168, + "grad_norm": 0.6730037927627563, + "learning_rate": 0.00012030806084367336, + "loss": 0.9684, + "step": 9783 + }, + { + "epoch": 1.7419871794871795, + "grad_norm": 0.5983504056930542, + "learning_rate": 0.00012029435485161096, + "loss": 0.7106, + "step": 9784 + }, + { + "epoch": 1.742165242165242, + "grad_norm": 0.6834231615066528, + "learning_rate": 0.00012028064846192284, + "loss": 0.803, + "step": 9785 + }, + { + "epoch": 1.7423433048433048, + "grad_norm": 0.621046245098114, + "learning_rate": 0.00012026694167487755, + "loss": 0.9129, + "step": 9786 + }, + { + "epoch": 1.7425213675213675, + "grad_norm": 0.6348989605903625, + "learning_rate": 0.00012025323449074361, + "loss": 1.0076, + "step": 9787 + }, + { + "epoch": 1.7426994301994303, + "grad_norm": 0.6139974594116211, + "learning_rate": 0.00012023952690978966, + "loss": 1.0756, + "step": 9788 + }, + { + "epoch": 1.7428774928774928, + "grad_norm": 0.6473259925842285, + "learning_rate": 0.00012022581893228419, + "loss": 1.0568, + "step": 9789 + }, + { + "epoch": 1.7430555555555556, + "grad_norm": 0.6133778095245361, + "learning_rate": 0.00012021211055849581, + "loss": 0.8722, + "step": 9790 + }, + { + "epoch": 1.743233618233618, + "grad_norm": 
0.6934139728546143, + "learning_rate": 0.00012019840178869315, + "loss": 1.0329, + "step": 9791 + }, + { + "epoch": 1.7434116809116809, + "grad_norm": 0.6730150580406189, + "learning_rate": 0.00012018469262314474, + "loss": 0.9326, + "step": 9792 + }, + { + "epoch": 1.7435897435897436, + "grad_norm": 0.6805521249771118, + "learning_rate": 0.0001201709830621192, + "loss": 1.0527, + "step": 9793 + }, + { + "epoch": 1.7437678062678064, + "grad_norm": 0.6972569823265076, + "learning_rate": 0.00012015727310588516, + "loss": 1.0024, + "step": 9794 + }, + { + "epoch": 1.743945868945869, + "grad_norm": 0.7329187989234924, + "learning_rate": 0.00012014356275471122, + "loss": 1.1864, + "step": 9795 + }, + { + "epoch": 1.7441239316239316, + "grad_norm": 0.7220240831375122, + "learning_rate": 0.00012012985200886602, + "loss": 0.8831, + "step": 9796 + }, + { + "epoch": 1.7443019943019942, + "grad_norm": 0.7829749584197998, + "learning_rate": 0.00012011614086861818, + "loss": 1.0365, + "step": 9797 + }, + { + "epoch": 1.744480056980057, + "grad_norm": 0.7148944735527039, + "learning_rate": 0.00012010242933423637, + "loss": 1.0413, + "step": 9798 + }, + { + "epoch": 1.7446581196581197, + "grad_norm": 0.5607262253761292, + "learning_rate": 0.00012008871740598917, + "loss": 0.8154, + "step": 9799 + }, + { + "epoch": 1.7448361823361824, + "grad_norm": 0.754626452922821, + "learning_rate": 0.00012007500508414531, + "loss": 1.0569, + "step": 9800 + }, + { + "epoch": 1.7450142450142452, + "grad_norm": 0.7216293215751648, + "learning_rate": 0.00012006129236897343, + "loss": 1.1641, + "step": 9801 + }, + { + "epoch": 1.7451923076923077, + "grad_norm": 0.6575515270233154, + "learning_rate": 0.0001200475792607422, + "loss": 0.9063, + "step": 9802 + }, + { + "epoch": 1.7453703703703702, + "grad_norm": 0.7411505579948425, + "learning_rate": 0.00012003386575972031, + "loss": 0.9791, + "step": 9803 + }, + { + "epoch": 1.745548433048433, + "grad_norm": 0.6945903301239014, + "learning_rate": 
0.0001200201518661764, + "loss": 0.8111, + "step": 9804 + }, + { + "epoch": 1.7457264957264957, + "grad_norm": 0.5760970115661621, + "learning_rate": 0.00012000643758037924, + "loss": 1.1054, + "step": 9805 + }, + { + "epoch": 1.7459045584045585, + "grad_norm": 0.6732224225997925, + "learning_rate": 0.00011999272290259748, + "loss": 0.8992, + "step": 9806 + }, + { + "epoch": 1.7460826210826212, + "grad_norm": 0.673270046710968, + "learning_rate": 0.00011997900783309983, + "loss": 1.0554, + "step": 9807 + }, + { + "epoch": 1.7462606837606838, + "grad_norm": 0.7233314514160156, + "learning_rate": 0.00011996529237215503, + "loss": 1.066, + "step": 9808 + }, + { + "epoch": 1.7464387464387463, + "grad_norm": 0.7016494274139404, + "learning_rate": 0.00011995157652003183, + "loss": 0.891, + "step": 9809 + }, + { + "epoch": 1.746616809116809, + "grad_norm": 0.9377092719078064, + "learning_rate": 0.00011993786027699889, + "loss": 0.8626, + "step": 9810 + }, + { + "epoch": 1.7467948717948718, + "grad_norm": 0.6825845241546631, + "learning_rate": 0.00011992414364332503, + "loss": 0.8996, + "step": 9811 + }, + { + "epoch": 1.7469729344729346, + "grad_norm": 0.6836053729057312, + "learning_rate": 0.00011991042661927896, + "loss": 0.9338, + "step": 9812 + }, + { + "epoch": 1.7471509971509973, + "grad_norm": 0.6462908387184143, + "learning_rate": 0.00011989670920512943, + "loss": 1.1185, + "step": 9813 + }, + { + "epoch": 1.7473290598290598, + "grad_norm": 0.7191921472549438, + "learning_rate": 0.00011988299140114522, + "loss": 0.9084, + "step": 9814 + }, + { + "epoch": 1.7475071225071224, + "grad_norm": 0.6951598525047302, + "learning_rate": 0.00011986927320759508, + "loss": 1.0653, + "step": 9815 + }, + { + "epoch": 1.7476851851851851, + "grad_norm": 0.7512598037719727, + "learning_rate": 0.00011985555462474784, + "loss": 1.0259, + "step": 9816 + }, + { + "epoch": 1.7478632478632479, + "grad_norm": 0.6885492205619812, + "learning_rate": 0.00011984183565287226, + "loss": 0.7148, 
+ "step": 9817 + }, + { + "epoch": 1.7480413105413106, + "grad_norm": 0.6880139708518982, + "learning_rate": 0.00011982811629223709, + "loss": 1.1567, + "step": 9818 + }, + { + "epoch": 1.7482193732193734, + "grad_norm": 0.7381170392036438, + "learning_rate": 0.0001198143965431112, + "loss": 0.8483, + "step": 9819 + }, + { + "epoch": 1.748397435897436, + "grad_norm": 0.6761063933372498, + "learning_rate": 0.00011980067640576333, + "loss": 0.9498, + "step": 9820 + }, + { + "epoch": 1.7485754985754984, + "grad_norm": 0.6454669237136841, + "learning_rate": 0.00011978695588046238, + "loss": 0.7336, + "step": 9821 + }, + { + "epoch": 1.7487535612535612, + "grad_norm": 0.6026871800422668, + "learning_rate": 0.00011977323496747712, + "loss": 0.8618, + "step": 9822 + }, + { + "epoch": 1.748931623931624, + "grad_norm": 0.6877408027648926, + "learning_rate": 0.0001197595136670764, + "loss": 0.9146, + "step": 9823 + }, + { + "epoch": 1.7491096866096867, + "grad_norm": 0.6874892115592957, + "learning_rate": 0.00011974579197952906, + "loss": 1.1628, + "step": 9824 + }, + { + "epoch": 1.7492877492877494, + "grad_norm": 0.7464384436607361, + "learning_rate": 0.00011973206990510393, + "loss": 1.007, + "step": 9825 + }, + { + "epoch": 1.749465811965812, + "grad_norm": 0.7281473278999329, + "learning_rate": 0.00011971834744406986, + "loss": 1.0776, + "step": 9826 + }, + { + "epoch": 1.7496438746438745, + "grad_norm": 0.6112284660339355, + "learning_rate": 0.00011970462459669575, + "loss": 0.7616, + "step": 9827 + }, + { + "epoch": 1.7498219373219372, + "grad_norm": 0.6498035192489624, + "learning_rate": 0.00011969090136325048, + "loss": 0.884, + "step": 9828 + }, + { + "epoch": 1.7498219373219372, + "eval_loss": 1.1018389463424683, + "eval_runtime": 24.5594, + "eval_samples_per_second": 42.387, + "eval_steps_per_second": 21.214, + "step": 9828 + }, + { + "epoch": 1.75, + "grad_norm": 0.6746426224708557, + "learning_rate": 0.00011967717774400289, + "loss": 0.9023, + "step": 9829 + }, 
+ { + "epoch": 1.7501780626780628, + "grad_norm": 0.6513423323631287, + "learning_rate": 0.00011966345373922188, + "loss": 0.9786, + "step": 9830 + }, + { + "epoch": 1.7503561253561255, + "grad_norm": 0.7053804397583008, + "learning_rate": 0.00011964972934917632, + "loss": 1.0667, + "step": 9831 + }, + { + "epoch": 1.750534188034188, + "grad_norm": 0.6769008040428162, + "learning_rate": 0.00011963600457413513, + "loss": 0.8596, + "step": 9832 + }, + { + "epoch": 1.7507122507122506, + "grad_norm": 0.7162246108055115, + "learning_rate": 0.00011962227941436725, + "loss": 1.0746, + "step": 9833 + }, + { + "epoch": 1.7508903133903133, + "grad_norm": 0.7665811777114868, + "learning_rate": 0.00011960855387014156, + "loss": 1.0056, + "step": 9834 + }, + { + "epoch": 1.751068376068376, + "grad_norm": 0.6186950206756592, + "learning_rate": 0.00011959482794172696, + "loss": 0.9016, + "step": 9835 + }, + { + "epoch": 1.7512464387464388, + "grad_norm": 0.8018904328346252, + "learning_rate": 0.00011958110162939245, + "loss": 0.9534, + "step": 9836 + }, + { + "epoch": 1.7514245014245016, + "grad_norm": 0.8239033818244934, + "learning_rate": 0.0001195673749334069, + "loss": 1.214, + "step": 9837 + }, + { + "epoch": 1.751602564102564, + "grad_norm": 0.7886297106742859, + "learning_rate": 0.00011955364785403931, + "loss": 0.9672, + "step": 9838 + }, + { + "epoch": 1.7517806267806266, + "grad_norm": 0.6463177800178528, + "learning_rate": 0.00011953992039155862, + "loss": 0.9184, + "step": 9839 + }, + { + "epoch": 1.7519586894586894, + "grad_norm": 0.7374706864356995, + "learning_rate": 0.00011952619254623374, + "loss": 0.9988, + "step": 9840 + }, + { + "epoch": 1.7521367521367521, + "grad_norm": 0.7456657886505127, + "learning_rate": 0.00011951246431833369, + "loss": 1.2197, + "step": 9841 + }, + { + "epoch": 1.7523148148148149, + "grad_norm": 0.6644248962402344, + "learning_rate": 0.00011949873570812746, + "loss": 0.9449, + "step": 9842 + }, + { + "epoch": 1.7524928774928776, + 
"grad_norm": 0.707919180393219, + "learning_rate": 0.000119485006715884, + "loss": 0.774, + "step": 9843 + }, + { + "epoch": 1.7526709401709402, + "grad_norm": 0.6273906826972961, + "learning_rate": 0.00011947127734187231, + "loss": 0.8682, + "step": 9844 + }, + { + "epoch": 1.7528490028490027, + "grad_norm": 0.8335350155830383, + "learning_rate": 0.00011945754758636136, + "loss": 1.2282, + "step": 9845 + }, + { + "epoch": 1.7530270655270654, + "grad_norm": 0.6849051117897034, + "learning_rate": 0.00011944381744962022, + "loss": 1.1091, + "step": 9846 + }, + { + "epoch": 1.7532051282051282, + "grad_norm": 0.8571760058403015, + "learning_rate": 0.00011943008693191781, + "loss": 0.9806, + "step": 9847 + }, + { + "epoch": 1.753383190883191, + "grad_norm": 0.7045019268989563, + "learning_rate": 0.00011941635603352328, + "loss": 0.9217, + "step": 9848 + }, + { + "epoch": 1.7535612535612537, + "grad_norm": 0.6820187568664551, + "learning_rate": 0.00011940262475470556, + "loss": 0.9983, + "step": 9849 + }, + { + "epoch": 1.7537393162393162, + "grad_norm": 0.7400697469711304, + "learning_rate": 0.00011938889309573374, + "loss": 0.9521, + "step": 9850 + }, + { + "epoch": 1.7539173789173788, + "grad_norm": 0.7027658820152283, + "learning_rate": 0.00011937516105687678, + "loss": 1.0749, + "step": 9851 + }, + { + "epoch": 1.7540954415954415, + "grad_norm": 0.6778307557106018, + "learning_rate": 0.00011936142863840382, + "loss": 1.0249, + "step": 9852 + }, + { + "epoch": 1.7542735042735043, + "grad_norm": 0.6787961721420288, + "learning_rate": 0.00011934769584058389, + "loss": 1.0014, + "step": 9853 + }, + { + "epoch": 1.754451566951567, + "grad_norm": 0.7515636086463928, + "learning_rate": 0.00011933396266368606, + "loss": 1.0351, + "step": 9854 + }, + { + "epoch": 1.7546296296296298, + "grad_norm": 0.6620134115219116, + "learning_rate": 0.00011932022910797938, + "loss": 1.0294, + "step": 9855 + }, + { + "epoch": 1.7548076923076923, + "grad_norm": 0.8260951638221741, + 
"learning_rate": 0.00011930649517373294, + "loss": 0.9078, + "step": 9856 + }, + { + "epoch": 1.7549857549857548, + "grad_norm": 0.7680675983428955, + "learning_rate": 0.00011929276086121584, + "loss": 0.92, + "step": 9857 + }, + { + "epoch": 1.7551638176638176, + "grad_norm": 0.7104191184043884, + "learning_rate": 0.00011927902617069717, + "loss": 0.9937, + "step": 9858 + }, + { + "epoch": 1.7553418803418803, + "grad_norm": 0.7185840606689453, + "learning_rate": 0.00011926529110244603, + "loss": 0.9775, + "step": 9859 + }, + { + "epoch": 1.755519943019943, + "grad_norm": 0.7114652991294861, + "learning_rate": 0.00011925155565673151, + "loss": 0.883, + "step": 9860 + }, + { + "epoch": 1.7556980056980058, + "grad_norm": 0.6906639337539673, + "learning_rate": 0.00011923781983382276, + "loss": 0.9789, + "step": 9861 + }, + { + "epoch": 1.7558760683760684, + "grad_norm": 0.706908106803894, + "learning_rate": 0.00011922408363398892, + "loss": 1.1186, + "step": 9862 + }, + { + "epoch": 1.756054131054131, + "grad_norm": 0.7532939910888672, + "learning_rate": 0.00011921034705749908, + "loss": 0.977, + "step": 9863 + }, + { + "epoch": 1.7562321937321936, + "grad_norm": 0.7397763729095459, + "learning_rate": 0.0001191966101046224, + "loss": 1.1121, + "step": 9864 + }, + { + "epoch": 1.7564102564102564, + "grad_norm": 0.6955398321151733, + "learning_rate": 0.00011918287277562801, + "loss": 1.0439, + "step": 9865 + }, + { + "epoch": 1.7565883190883191, + "grad_norm": 0.7485929727554321, + "learning_rate": 0.00011916913507078507, + "loss": 1.1644, + "step": 9866 + }, + { + "epoch": 1.756766381766382, + "grad_norm": 0.6337487101554871, + "learning_rate": 0.00011915539699036274, + "loss": 0.8216, + "step": 9867 + }, + { + "epoch": 1.7569444444444444, + "grad_norm": 0.6628872752189636, + "learning_rate": 0.00011914165853463022, + "loss": 0.9584, + "step": 9868 + }, + { + "epoch": 1.7571225071225072, + "grad_norm": 0.6577547788619995, + "learning_rate": 0.00011912791970385666, + 
"loss": 0.9484, + "step": 9869 + }, + { + "epoch": 1.7573005698005697, + "grad_norm": 0.6409304738044739, + "learning_rate": 0.00011911418049831127, + "loss": 1.1256, + "step": 9870 + }, + { + "epoch": 1.7574786324786325, + "grad_norm": 0.7499844431877136, + "learning_rate": 0.00011910044091826319, + "loss": 0.7991, + "step": 9871 + }, + { + "epoch": 1.7576566951566952, + "grad_norm": 0.6786715388298035, + "learning_rate": 0.00011908670096398165, + "loss": 1.0368, + "step": 9872 + }, + { + "epoch": 1.757834757834758, + "grad_norm": 0.6432101130485535, + "learning_rate": 0.00011907296063573585, + "loss": 0.9059, + "step": 9873 + }, + { + "epoch": 1.7580128205128205, + "grad_norm": 0.6542613506317139, + "learning_rate": 0.00011905921993379503, + "loss": 0.9866, + "step": 9874 + }, + { + "epoch": 1.7581908831908832, + "grad_norm": 0.6048218011856079, + "learning_rate": 0.00011904547885842838, + "loss": 0.9488, + "step": 9875 + }, + { + "epoch": 1.7583689458689458, + "grad_norm": 0.7694938778877258, + "learning_rate": 0.00011903173740990512, + "loss": 1.1026, + "step": 9876 + }, + { + "epoch": 1.7585470085470085, + "grad_norm": 0.6621627807617188, + "learning_rate": 0.00011901799558849451, + "loss": 1.135, + "step": 9877 + }, + { + "epoch": 1.7587250712250713, + "grad_norm": 0.6561587452888489, + "learning_rate": 0.0001190042533944658, + "loss": 0.9322, + "step": 9878 + }, + { + "epoch": 1.758903133903134, + "grad_norm": 0.7846759557723999, + "learning_rate": 0.00011899051082808821, + "loss": 0.9324, + "step": 9879 + }, + { + "epoch": 1.7590811965811965, + "grad_norm": 0.6004071831703186, + "learning_rate": 0.00011897676788963101, + "loss": 0.9641, + "step": 9880 + }, + { + "epoch": 1.7592592592592593, + "grad_norm": 0.6731070280075073, + "learning_rate": 0.00011896302457936344, + "loss": 1.1437, + "step": 9881 + }, + { + "epoch": 1.7594373219373218, + "grad_norm": 0.6768675446510315, + "learning_rate": 0.00011894928089755481, + "loss": 1.0707, + "step": 9882 + }, + { 
+ "epoch": 1.7596153846153846, + "grad_norm": 0.8368878960609436, + "learning_rate": 0.0001189355368444744, + "loss": 1.0435, + "step": 9883 + }, + { + "epoch": 1.7597934472934473, + "grad_norm": 0.6132324934005737, + "learning_rate": 0.00011892179242039149, + "loss": 0.8889, + "step": 9884 + }, + { + "epoch": 1.75997150997151, + "grad_norm": 0.7598093152046204, + "learning_rate": 0.00011890804762557535, + "loss": 1.151, + "step": 9885 + }, + { + "epoch": 1.7601495726495726, + "grad_norm": 0.7317715883255005, + "learning_rate": 0.00011889430246029527, + "loss": 0.9992, + "step": 9886 + }, + { + "epoch": 1.7603276353276354, + "grad_norm": 0.7664858102798462, + "learning_rate": 0.00011888055692482059, + "loss": 0.8398, + "step": 9887 + }, + { + "epoch": 1.760505698005698, + "grad_norm": 0.6916853189468384, + "learning_rate": 0.00011886681101942063, + "loss": 0.9507, + "step": 9888 + }, + { + "epoch": 1.7606837606837606, + "grad_norm": 0.7103399634361267, + "learning_rate": 0.0001188530647443647, + "loss": 0.915, + "step": 9889 + }, + { + "epoch": 1.7608618233618234, + "grad_norm": 0.6177804470062256, + "learning_rate": 0.00011883931809992215, + "loss": 0.721, + "step": 9890 + }, + { + "epoch": 1.7610398860398861, + "grad_norm": 0.7523959279060364, + "learning_rate": 0.00011882557108636227, + "loss": 0.99, + "step": 9891 + }, + { + "epoch": 1.7612179487179487, + "grad_norm": 0.6211134791374207, + "learning_rate": 0.00011881182370395442, + "loss": 0.8089, + "step": 9892 + }, + { + "epoch": 1.7613960113960114, + "grad_norm": 0.6660307049751282, + "learning_rate": 0.00011879807595296802, + "loss": 1.1062, + "step": 9893 + }, + { + "epoch": 1.761574074074074, + "grad_norm": 0.7039240598678589, + "learning_rate": 0.00011878432783367232, + "loss": 0.9739, + "step": 9894 + }, + { + "epoch": 1.7617521367521367, + "grad_norm": 0.658064603805542, + "learning_rate": 0.00011877057934633675, + "loss": 0.9438, + "step": 9895 + }, + { + "epoch": 1.7619301994301995, + "grad_norm": 
0.8227152228355408, + "learning_rate": 0.00011875683049123068, + "loss": 0.8385, + "step": 9896 + }, + { + "epoch": 1.7621082621082622, + "grad_norm": 0.6622483730316162, + "learning_rate": 0.00011874308126862346, + "loss": 0.9432, + "step": 9897 + }, + { + "epoch": 1.7622863247863247, + "grad_norm": 0.7211357951164246, + "learning_rate": 0.00011872933167878453, + "loss": 1.2471, + "step": 9898 + }, + { + "epoch": 1.7624643874643875, + "grad_norm": 0.6177424192428589, + "learning_rate": 0.00011871558172198322, + "loss": 0.8892, + "step": 9899 + }, + { + "epoch": 1.76264245014245, + "grad_norm": 0.6924285888671875, + "learning_rate": 0.00011870183139848898, + "loss": 1.021, + "step": 9900 + }, + { + "epoch": 1.7628205128205128, + "grad_norm": 0.6168648600578308, + "learning_rate": 0.0001186880807085712, + "loss": 0.9013, + "step": 9901 + }, + { + "epoch": 1.7629985754985755, + "grad_norm": 0.6410452723503113, + "learning_rate": 0.00011867432965249929, + "loss": 0.6686, + "step": 9902 + }, + { + "epoch": 1.7631766381766383, + "grad_norm": 0.6959559917449951, + "learning_rate": 0.0001186605782305427, + "loss": 0.9814, + "step": 9903 + }, + { + "epoch": 1.7633547008547008, + "grad_norm": 0.7456178069114685, + "learning_rate": 0.00011864682644297085, + "loss": 1.0151, + "step": 9904 + }, + { + "epoch": 1.7635327635327636, + "grad_norm": 0.6499991416931152, + "learning_rate": 0.00011863307429005317, + "loss": 0.83, + "step": 9905 + }, + { + "epoch": 1.763710826210826, + "grad_norm": 0.643344521522522, + "learning_rate": 0.00011861932177205908, + "loss": 0.8853, + "step": 9906 + }, + { + "epoch": 1.7638888888888888, + "grad_norm": 0.6570441722869873, + "learning_rate": 0.00011860556888925804, + "loss": 0.9179, + "step": 9907 + }, + { + "epoch": 1.7640669515669516, + "grad_norm": 0.6892307996749878, + "learning_rate": 0.00011859181564191957, + "loss": 0.9657, + "step": 9908 + }, + { + "epoch": 1.7642450142450143, + "grad_norm": 0.648158073425293, + "learning_rate": 
0.0001185780620303131, + "loss": 0.9179, + "step": 9909 + }, + { + "epoch": 1.7644230769230769, + "grad_norm": 0.5833603143692017, + "learning_rate": 0.00011856430805470808, + "loss": 0.8505, + "step": 9910 + }, + { + "epoch": 1.7646011396011396, + "grad_norm": 0.8302416205406189, + "learning_rate": 0.000118550553715374, + "loss": 0.8948, + "step": 9911 + }, + { + "epoch": 1.7647792022792022, + "grad_norm": 0.7075300216674805, + "learning_rate": 0.00011853679901258035, + "loss": 1.2467, + "step": 9912 + }, + { + "epoch": 1.764957264957265, + "grad_norm": 0.81916344165802, + "learning_rate": 0.00011852304394659666, + "loss": 0.9963, + "step": 9913 + }, + { + "epoch": 1.7651353276353277, + "grad_norm": 0.6492435932159424, + "learning_rate": 0.00011850928851769239, + "loss": 1.0704, + "step": 9914 + }, + { + "epoch": 1.7653133903133904, + "grad_norm": 0.7301090359687805, + "learning_rate": 0.00011849553272613704, + "loss": 1.0477, + "step": 9915 + }, + { + "epoch": 1.765491452991453, + "grad_norm": 0.7280275821685791, + "learning_rate": 0.00011848177657220019, + "loss": 0.9124, + "step": 9916 + }, + { + "epoch": 1.7656695156695157, + "grad_norm": 0.6948845386505127, + "learning_rate": 0.00011846802005615127, + "loss": 1.2275, + "step": 9917 + }, + { + "epoch": 1.7658475783475782, + "grad_norm": 0.6553834676742554, + "learning_rate": 0.0001184542631782599, + "loss": 1.2311, + "step": 9918 + }, + { + "epoch": 1.766025641025641, + "grad_norm": 0.6899739503860474, + "learning_rate": 0.00011844050593879556, + "loss": 0.8936, + "step": 9919 + }, + { + "epoch": 1.7662037037037037, + "grad_norm": 0.6076815128326416, + "learning_rate": 0.00011842674833802782, + "loss": 0.8432, + "step": 9920 + }, + { + "epoch": 1.7663817663817665, + "grad_norm": 0.7650902271270752, + "learning_rate": 0.00011841299037622624, + "loss": 1.0447, + "step": 9921 + }, + { + "epoch": 1.7665598290598292, + "grad_norm": 0.6864938735961914, + "learning_rate": 0.00011839923205366032, + "loss": 0.936, + 
"step": 9922 + }, + { + "epoch": 1.7667378917378918, + "grad_norm": 0.7176852226257324, + "learning_rate": 0.0001183854733705997, + "loss": 0.9764, + "step": 9923 + }, + { + "epoch": 1.7669159544159543, + "grad_norm": 0.6513439416885376, + "learning_rate": 0.00011837171432731393, + "loss": 1.0095, + "step": 9924 + }, + { + "epoch": 1.767094017094017, + "grad_norm": 0.8031024932861328, + "learning_rate": 0.00011835795492407256, + "loss": 1.1348, + "step": 9925 + }, + { + "epoch": 1.7672720797720798, + "grad_norm": 0.7659830451011658, + "learning_rate": 0.00011834419516114518, + "loss": 0.9058, + "step": 9926 + }, + { + "epoch": 1.7674501424501425, + "grad_norm": 0.8864039778709412, + "learning_rate": 0.00011833043503880145, + "loss": 1.0342, + "step": 9927 + }, + { + "epoch": 1.7676282051282053, + "grad_norm": 0.6870512962341309, + "learning_rate": 0.00011831667455731088, + "loss": 0.9361, + "step": 9928 + }, + { + "epoch": 1.7678062678062678, + "grad_norm": 0.6458830833435059, + "learning_rate": 0.00011830291371694315, + "loss": 0.8215, + "step": 9929 + }, + { + "epoch": 1.7679843304843303, + "grad_norm": 0.7456086874008179, + "learning_rate": 0.00011828915251796787, + "loss": 1.1243, + "step": 9930 + }, + { + "epoch": 1.768162393162393, + "grad_norm": 0.6834850311279297, + "learning_rate": 0.00011827539096065459, + "loss": 0.9536, + "step": 9931 + }, + { + "epoch": 1.7683404558404558, + "grad_norm": 0.643864631652832, + "learning_rate": 0.00011826162904527302, + "loss": 1.1707, + "step": 9932 + }, + { + "epoch": 1.7685185185185186, + "grad_norm": 0.6312864422798157, + "learning_rate": 0.00011824786677209275, + "loss": 0.7937, + "step": 9933 + }, + { + "epoch": 1.7686965811965814, + "grad_norm": 0.6092729568481445, + "learning_rate": 0.00011823410414138343, + "loss": 0.8787, + "step": 9934 + }, + { + "epoch": 1.7688746438746439, + "grad_norm": 0.6859988570213318, + "learning_rate": 0.00011822034115341474, + "loss": 0.9691, + "step": 9935 + }, + { + "epoch": 
1.7690527065527064, + "grad_norm": 0.7219935059547424, + "learning_rate": 0.0001182065778084563, + "loss": 1.0606, + "step": 9936 + }, + { + "epoch": 1.7692307692307692, + "grad_norm": 0.6596202850341797, + "learning_rate": 0.00011819281410677778, + "loss": 1.0543, + "step": 9937 + }, + { + "epoch": 1.769408831908832, + "grad_norm": 0.6616338491439819, + "learning_rate": 0.00011817905004864887, + "loss": 0.9757, + "step": 9938 + }, + { + "epoch": 1.7695868945868947, + "grad_norm": 0.6637360453605652, + "learning_rate": 0.00011816528563433924, + "loss": 0.925, + "step": 9939 + }, + { + "epoch": 1.7697649572649574, + "grad_norm": 0.8422333002090454, + "learning_rate": 0.00011815152086411859, + "loss": 1.1343, + "step": 9940 + }, + { + "epoch": 1.76994301994302, + "grad_norm": 0.6638204455375671, + "learning_rate": 0.00011813775573825656, + "loss": 1.2136, + "step": 9941 + }, + { + "epoch": 1.7701210826210825, + "grad_norm": 0.7258831858634949, + "learning_rate": 0.0001181239902570229, + "loss": 0.7308, + "step": 9942 + }, + { + "epoch": 1.7702991452991452, + "grad_norm": 0.730582594871521, + "learning_rate": 0.0001181102244206873, + "loss": 1.1097, + "step": 9943 + }, + { + "epoch": 1.770477207977208, + "grad_norm": 0.7324019074440002, + "learning_rate": 0.00011809645822951946, + "loss": 0.9802, + "step": 9944 + }, + { + "epoch": 1.7706552706552707, + "grad_norm": 0.5565997958183289, + "learning_rate": 0.00011808269168378914, + "loss": 0.7079, + "step": 9945 + }, + { + "epoch": 1.7708333333333335, + "grad_norm": 0.6395503282546997, + "learning_rate": 0.00011806892478376601, + "loss": 1.0048, + "step": 9946 + }, + { + "epoch": 1.771011396011396, + "grad_norm": 0.7670905590057373, + "learning_rate": 0.00011805515752971985, + "loss": 1.2509, + "step": 9947 + }, + { + "epoch": 1.7711894586894585, + "grad_norm": 0.5945813655853271, + "learning_rate": 0.00011804138992192037, + "loss": 0.8856, + "step": 9948 + }, + { + "epoch": 1.7713675213675213, + "grad_norm": 
0.7355493307113647, + "learning_rate": 0.00011802762196063737, + "loss": 0.9629, + "step": 9949 + }, + { + "epoch": 1.771545584045584, + "grad_norm": 0.7024806141853333, + "learning_rate": 0.00011801385364614055, + "loss": 1.1351, + "step": 9950 + }, + { + "epoch": 1.7717236467236468, + "grad_norm": 0.6553003191947937, + "learning_rate": 0.00011800008497869968, + "loss": 0.911, + "step": 9951 + }, + { + "epoch": 1.7719017094017095, + "grad_norm": 0.6883971691131592, + "learning_rate": 0.00011798631595858454, + "loss": 1.0099, + "step": 9952 + }, + { + "epoch": 1.772079772079772, + "grad_norm": 0.7106832265853882, + "learning_rate": 0.00011797254658606489, + "loss": 1.0298, + "step": 9953 + }, + { + "epoch": 1.7722578347578346, + "grad_norm": 0.7902877926826477, + "learning_rate": 0.00011795877686141055, + "loss": 1.0572, + "step": 9954 + }, + { + "epoch": 1.7724358974358974, + "grad_norm": 0.7105007171630859, + "learning_rate": 0.00011794500678489126, + "loss": 1.1725, + "step": 9955 + }, + { + "epoch": 1.77261396011396, + "grad_norm": 0.7314959764480591, + "learning_rate": 0.00011793123635677685, + "loss": 1.1074, + "step": 9956 + }, + { + "epoch": 1.7727920227920229, + "grad_norm": 0.6358618140220642, + "learning_rate": 0.00011791746557733712, + "loss": 0.8786, + "step": 9957 + }, + { + "epoch": 1.7729700854700856, + "grad_norm": 0.6441367864608765, + "learning_rate": 0.00011790369444684187, + "loss": 1.1332, + "step": 9958 + }, + { + "epoch": 1.7731481481481481, + "grad_norm": 0.686787486076355, + "learning_rate": 0.0001178899229655609, + "loss": 0.9566, + "step": 9959 + }, + { + "epoch": 1.7733262108262107, + "grad_norm": 0.653840184211731, + "learning_rate": 0.00011787615113376407, + "loss": 0.8763, + "step": 9960 + }, + { + "epoch": 1.7735042735042734, + "grad_norm": 0.7106643915176392, + "learning_rate": 0.00011786237895172119, + "loss": 0.9929, + "step": 9961 + }, + { + "epoch": 1.7736823361823362, + "grad_norm": 0.6634044051170349, + "learning_rate": 
0.0001178486064197021, + "loss": 0.7467, + "step": 9962 + }, + { + "epoch": 1.773860398860399, + "grad_norm": 0.7087352871894836, + "learning_rate": 0.00011783483353797663, + "loss": 1.0104, + "step": 9963 + }, + { + "epoch": 1.7740384615384617, + "grad_norm": 0.8088061213493347, + "learning_rate": 0.00011782106030681466, + "loss": 1.0376, + "step": 9964 + }, + { + "epoch": 1.7742165242165242, + "grad_norm": 0.7204688787460327, + "learning_rate": 0.00011780728672648604, + "loss": 0.8556, + "step": 9965 + }, + { + "epoch": 1.7743945868945867, + "grad_norm": 0.7893314957618713, + "learning_rate": 0.0001177935127972606, + "loss": 0.9764, + "step": 9966 + }, + { + "epoch": 1.7745726495726495, + "grad_norm": 0.6098896265029907, + "learning_rate": 0.00011777973851940826, + "loss": 0.9407, + "step": 9967 + }, + { + "epoch": 1.7747507122507122, + "grad_norm": 0.6420868039131165, + "learning_rate": 0.0001177659638931989, + "loss": 1.1328, + "step": 9968 + }, + { + "epoch": 1.774928774928775, + "grad_norm": 0.7732378244400024, + "learning_rate": 0.00011775218891890234, + "loss": 1.1236, + "step": 9969 + }, + { + "epoch": 1.7751068376068377, + "grad_norm": 0.6591582894325256, + "learning_rate": 0.00011773841359678855, + "loss": 1.1523, + "step": 9970 + }, + { + "epoch": 1.7752849002849003, + "grad_norm": 0.6337170004844666, + "learning_rate": 0.00011772463792712738, + "loss": 1.1998, + "step": 9971 + }, + { + "epoch": 1.7754629629629628, + "grad_norm": 0.6400532126426697, + "learning_rate": 0.00011771086191018874, + "loss": 0.9543, + "step": 9972 + }, + { + "epoch": 1.7756410256410255, + "grad_norm": 0.6431527733802795, + "learning_rate": 0.00011769708554624257, + "loss": 0.8164, + "step": 9973 + }, + { + "epoch": 1.7758190883190883, + "grad_norm": 0.7303599119186401, + "learning_rate": 0.00011768330883555876, + "loss": 0.9553, + "step": 9974 + }, + { + "epoch": 1.775997150997151, + "grad_norm": 0.7838605642318726, + "learning_rate": 0.00011766953177840725, + "loss": 0.9759, 
+ "step": 9975 + }, + { + "epoch": 1.7761752136752138, + "grad_norm": 0.6505265831947327, + "learning_rate": 0.00011765575437505796, + "loss": 0.8527, + "step": 9976 + }, + { + "epoch": 1.7763532763532763, + "grad_norm": 0.7336180806159973, + "learning_rate": 0.00011764197662578086, + "loss": 1.1098, + "step": 9977 + }, + { + "epoch": 1.776531339031339, + "grad_norm": 0.7040138244628906, + "learning_rate": 0.00011762819853084586, + "loss": 1.1289, + "step": 9978 + }, + { + "epoch": 1.7767094017094016, + "grad_norm": 0.6414867043495178, + "learning_rate": 0.00011761442009052293, + "loss": 1.0826, + "step": 9979 + }, + { + "epoch": 1.7768874643874644, + "grad_norm": 0.6760666370391846, + "learning_rate": 0.00011760064130508204, + "loss": 1.0188, + "step": 9980 + }, + { + "epoch": 1.7770655270655271, + "grad_norm": 0.7864978909492493, + "learning_rate": 0.00011758686217479316, + "loss": 1.1938, + "step": 9981 + }, + { + "epoch": 1.7772435897435899, + "grad_norm": 0.7964870929718018, + "learning_rate": 0.00011757308269992622, + "loss": 0.9876, + "step": 9982 + }, + { + "epoch": 1.7774216524216524, + "grad_norm": 0.5158692002296448, + "learning_rate": 0.00011755930288075123, + "loss": 0.6508, + "step": 9983 + }, + { + "epoch": 1.7775997150997151, + "grad_norm": 0.7208606600761414, + "learning_rate": 0.00011754552271753819, + "loss": 1.0738, + "step": 9984 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 0.6811334490776062, + "learning_rate": 0.00011753174221055705, + "loss": 1.1216, + "step": 9985 + }, + { + "epoch": 1.7779558404558404, + "grad_norm": 0.6389986276626587, + "learning_rate": 0.00011751796136007787, + "loss": 0.9664, + "step": 9986 + }, + { + "epoch": 1.7781339031339032, + "grad_norm": 0.7081875205039978, + "learning_rate": 0.00011750418016637064, + "loss": 0.9365, + "step": 9987 + }, + { + "epoch": 1.778311965811966, + "grad_norm": 0.7291778326034546, + "learning_rate": 0.00011749039862970535, + "loss": 1.3222, + "step": 9988 + }, + { + "epoch": 
1.7784900284900285, + "grad_norm": 0.6790453791618347, + "learning_rate": 0.000117476616750352, + "loss": 0.9537, + "step": 9989 + }, + { + "epoch": 1.7786680911680912, + "grad_norm": 0.6271076202392578, + "learning_rate": 0.00011746283452858069, + "loss": 0.9842, + "step": 9990 + }, + { + "epoch": 1.7788461538461537, + "grad_norm": 0.675628662109375, + "learning_rate": 0.00011744905196466138, + "loss": 0.8675, + "step": 9991 + }, + { + "epoch": 1.7790242165242165, + "grad_norm": 0.7328314185142517, + "learning_rate": 0.00011743526905886417, + "loss": 0.9793, + "step": 9992 + }, + { + "epoch": 1.7792022792022792, + "grad_norm": 0.698764979839325, + "learning_rate": 0.00011742148581145908, + "loss": 0.9527, + "step": 9993 + }, + { + "epoch": 1.779380341880342, + "grad_norm": 0.6911364793777466, + "learning_rate": 0.00011740770222271616, + "loss": 1.1069, + "step": 9994 + }, + { + "epoch": 1.7795584045584045, + "grad_norm": 0.6990836262702942, + "learning_rate": 0.00011739391829290547, + "loss": 0.9132, + "step": 9995 + }, + { + "epoch": 1.7797364672364673, + "grad_norm": 0.7056801319122314, + "learning_rate": 0.0001173801340222971, + "loss": 1.053, + "step": 9996 + }, + { + "epoch": 1.7799145299145298, + "grad_norm": 0.7453791499137878, + "learning_rate": 0.0001173663494111611, + "loss": 0.8806, + "step": 9997 + }, + { + "epoch": 1.7800925925925926, + "grad_norm": 0.7211771011352539, + "learning_rate": 0.00011735256445976757, + "loss": 0.9968, + "step": 9998 + }, + { + "epoch": 1.7802706552706553, + "grad_norm": 0.7259734272956848, + "learning_rate": 0.00011733877916838656, + "loss": 1.167, + "step": 9999 + }, + { + "epoch": 1.780448717948718, + "grad_norm": 0.6931926012039185, + "learning_rate": 0.00011732499353728821, + "loss": 1.0634, + "step": 10000 + }, + { + "epoch": 1.7806267806267806, + "grad_norm": 0.6900074481964111, + "learning_rate": 0.00011731120756674259, + "loss": 0.9718, + "step": 10001 + }, + { + "epoch": 1.7808048433048433, + "grad_norm": 
0.6817582845687866, + "learning_rate": 0.00011729742125701984, + "loss": 1.0896, + "step": 10002 + }, + { + "epoch": 1.7809829059829059, + "grad_norm": 0.6901891231536865, + "learning_rate": 0.00011728363460839003, + "loss": 1.0163, + "step": 10003 + }, + { + "epoch": 1.7811609686609686, + "grad_norm": 0.9138323664665222, + "learning_rate": 0.00011726984762112328, + "loss": 1.1713, + "step": 10004 + }, + { + "epoch": 1.7813390313390314, + "grad_norm": 0.6105810403823853, + "learning_rate": 0.00011725606029548977, + "loss": 0.9331, + "step": 10005 + }, + { + "epoch": 1.7815170940170941, + "grad_norm": 0.5605259537696838, + "learning_rate": 0.0001172422726317596, + "loss": 0.7154, + "step": 10006 + }, + { + "epoch": 1.7816951566951567, + "grad_norm": 0.6950963735580444, + "learning_rate": 0.00011722848463020292, + "loss": 1.0093, + "step": 10007 + }, + { + "epoch": 1.7818732193732194, + "grad_norm": 0.6806309819221497, + "learning_rate": 0.00011721469629108988, + "loss": 0.8662, + "step": 10008 + }, + { + "epoch": 1.782051282051282, + "grad_norm": 0.7528520226478577, + "learning_rate": 0.00011720090761469063, + "loss": 0.8567, + "step": 10009 + }, + { + "epoch": 1.7822293447293447, + "grad_norm": 0.6617229580879211, + "learning_rate": 0.00011718711860127529, + "loss": 1.0378, + "step": 10010 + }, + { + "epoch": 1.7824074074074074, + "grad_norm": 0.6468376517295837, + "learning_rate": 0.00011717332925111411, + "loss": 1.0658, + "step": 10011 + }, + { + "epoch": 1.7825854700854702, + "grad_norm": 0.7141897082328796, + "learning_rate": 0.00011715953956447721, + "loss": 1.023, + "step": 10012 + }, + { + "epoch": 1.7827635327635327, + "grad_norm": 0.5777570605278015, + "learning_rate": 0.00011714574954163475, + "loss": 0.9154, + "step": 10013 + }, + { + "epoch": 1.7829415954415955, + "grad_norm": 0.7536137700080872, + "learning_rate": 0.00011713195918285695, + "loss": 0.9651, + "step": 10014 + }, + { + "epoch": 1.783119658119658, + "grad_norm": 0.6977683305740356, + 
"learning_rate": 0.00011711816848841402, + "loss": 0.7977, + "step": 10015 + }, + { + "epoch": 1.7832977207977208, + "grad_norm": 0.6522472500801086, + "learning_rate": 0.00011710437745857614, + "loss": 0.8834, + "step": 10016 + }, + { + "epoch": 1.7834757834757835, + "grad_norm": 0.6263057589530945, + "learning_rate": 0.0001170905860936135, + "loss": 1.0576, + "step": 10017 + }, + { + "epoch": 1.7836538461538463, + "grad_norm": 0.6470699310302734, + "learning_rate": 0.00011707679439379635, + "loss": 0.9412, + "step": 10018 + }, + { + "epoch": 1.7838319088319088, + "grad_norm": Infinity, + "learning_rate": 0.00011707679439379635, + "loss": 1.1746, + "step": 10019 + }, + { + "epoch": 1.7840099715099715, + "grad_norm": 0.6022017002105713, + "learning_rate": 0.00011706300235939485, + "loss": 0.8945, + "step": 10020 + }, + { + "epoch": 1.784188034188034, + "grad_norm": 0.637208104133606, + "learning_rate": 0.00011704920999067927, + "loss": 1.0215, + "step": 10021 + }, + { + "epoch": 1.7843660968660968, + "grad_norm": 0.7467851042747498, + "learning_rate": 0.00011703541728791987, + "loss": 1.0341, + "step": 10022 + }, + { + "epoch": 1.7845441595441596, + "grad_norm": 0.7562711238861084, + "learning_rate": 0.00011702162425138683, + "loss": 0.9748, + "step": 10023 + }, + { + "epoch": 1.7847222222222223, + "grad_norm": 0.6480089426040649, + "learning_rate": 0.00011700783088135043, + "loss": 1.05, + "step": 10024 + }, + { + "epoch": 1.7849002849002849, + "grad_norm": 0.6293981671333313, + "learning_rate": 0.00011699403717808091, + "loss": 1.0376, + "step": 10025 + }, + { + "epoch": 1.7850783475783476, + "grad_norm": 0.6821253895759583, + "learning_rate": 0.00011698024314184853, + "loss": 1.0542, + "step": 10026 + }, + { + "epoch": 1.7852564102564101, + "grad_norm": 0.6681216359138489, + "learning_rate": 0.00011696644877292356, + "loss": 1.0018, + "step": 10027 + }, + { + "epoch": 1.7854344729344729, + "grad_norm": 0.6788804531097412, + "learning_rate": 
0.00011695265407157628, + "loss": 1.1823, + "step": 10028 + }, + { + "epoch": 1.7856125356125356, + "grad_norm": 0.6147881150245667, + "learning_rate": 0.00011693885903807697, + "loss": 0.9246, + "step": 10029 + }, + { + "epoch": 1.7857905982905984, + "grad_norm": 0.7952296137809753, + "learning_rate": 0.00011692506367269588, + "loss": 1.0528, + "step": 10030 + }, + { + "epoch": 1.785968660968661, + "grad_norm": 0.6985954642295837, + "learning_rate": 0.00011691126797570333, + "loss": 0.9173, + "step": 10031 + }, + { + "epoch": 1.7861467236467237, + "grad_norm": 0.6211223602294922, + "learning_rate": 0.00011689747194736961, + "loss": 0.7527, + "step": 10032 + }, + { + "epoch": 1.7863247863247862, + "grad_norm": 0.7531208992004395, + "learning_rate": 0.00011688367558796507, + "loss": 1.1087, + "step": 10033 + }, + { + "epoch": 1.786502849002849, + "grad_norm": 0.7742924690246582, + "learning_rate": 0.00011686987889775996, + "loss": 1.1512, + "step": 10034 + }, + { + "epoch": 1.7866809116809117, + "grad_norm": 0.7046231627464294, + "learning_rate": 0.00011685608187702459, + "loss": 1.0516, + "step": 10035 + }, + { + "epoch": 1.7868589743589745, + "grad_norm": 0.6264076232910156, + "learning_rate": 0.00011684228452602933, + "loss": 0.8938, + "step": 10036 + }, + { + "epoch": 1.7870370370370372, + "grad_norm": 0.6342145800590515, + "learning_rate": 0.00011682848684504448, + "loss": 0.8177, + "step": 10037 + }, + { + "epoch": 1.7872150997150997, + "grad_norm": 0.6609861254692078, + "learning_rate": 0.00011681468883434041, + "loss": 0.9692, + "step": 10038 + }, + { + "epoch": 1.7873931623931623, + "grad_norm": 0.7918622493743896, + "learning_rate": 0.00011680089049418743, + "loss": 0.8246, + "step": 10039 + }, + { + "epoch": 1.787571225071225, + "grad_norm": 0.697712779045105, + "learning_rate": 0.00011678709182485592, + "loss": 0.8981, + "step": 10040 + }, + { + "epoch": 1.7877492877492878, + "grad_norm": 0.6747658252716064, + "learning_rate": 0.00011677329282661617, + 
"loss": 1.1243, + "step": 10041 + }, + { + "epoch": 1.7879273504273505, + "grad_norm": 0.6525771617889404, + "learning_rate": 0.00011675949349973863, + "loss": 0.852, + "step": 10042 + }, + { + "epoch": 1.7881054131054133, + "grad_norm": 0.7062464952468872, + "learning_rate": 0.00011674569384449363, + "loss": 1.2582, + "step": 10043 + }, + { + "epoch": 1.7882834757834758, + "grad_norm": 0.6453786492347717, + "learning_rate": 0.00011673189386115154, + "loss": 0.868, + "step": 10044 + }, + { + "epoch": 1.7884615384615383, + "grad_norm": 0.7939708232879639, + "learning_rate": 0.00011671809354998273, + "loss": 0.7553, + "step": 10045 + }, + { + "epoch": 1.788639601139601, + "grad_norm": 0.6466066837310791, + "learning_rate": 0.00011670429291125761, + "loss": 0.942, + "step": 10046 + }, + { + "epoch": 1.7888176638176638, + "grad_norm": 0.7380510568618774, + "learning_rate": 0.00011669049194524657, + "loss": 1.044, + "step": 10047 + }, + { + "epoch": 1.7889957264957266, + "grad_norm": 0.6719707250595093, + "learning_rate": 0.00011667669065222002, + "loss": 1.1624, + "step": 10048 + }, + { + "epoch": 1.7891737891737893, + "grad_norm": 0.6996603012084961, + "learning_rate": 0.00011666288903244837, + "loss": 1.001, + "step": 10049 + }, + { + "epoch": 1.7893518518518519, + "grad_norm": 0.696590006351471, + "learning_rate": 0.00011664908708620202, + "loss": 1.17, + "step": 10050 + }, + { + "epoch": 1.7895299145299144, + "grad_norm": 0.7226764559745789, + "learning_rate": 0.00011663528481375137, + "loss": 1.0762, + "step": 10051 + }, + { + "epoch": 1.7897079772079771, + "grad_norm": 0.6117866635322571, + "learning_rate": 0.00011662148221536689, + "loss": 0.9199, + "step": 10052 + }, + { + "epoch": 1.78988603988604, + "grad_norm": 0.6424985527992249, + "learning_rate": 0.000116607679291319, + "loss": 1.1672, + "step": 10053 + }, + { + "epoch": 1.7900641025641026, + "grad_norm": 0.6390290856361389, + "learning_rate": 0.00011659387604187813, + "loss": 1.1895, + "step": 10054 + }, 
+ { + "epoch": 1.7902421652421654, + "grad_norm": 0.6553205251693726, + "learning_rate": 0.00011658007246731473, + "loss": 1.0967, + "step": 10055 + }, + { + "epoch": 1.790420227920228, + "grad_norm": 0.7737570405006409, + "learning_rate": 0.00011656626856789922, + "loss": 0.9637, + "step": 10056 + }, + { + "epoch": 1.7905982905982905, + "grad_norm": 0.644296407699585, + "learning_rate": 0.00011655246434390212, + "loss": 0.9933, + "step": 10057 + }, + { + "epoch": 1.7907763532763532, + "grad_norm": 0.8154410123825073, + "learning_rate": 0.00011653865979559388, + "loss": 0.9623, + "step": 10058 + }, + { + "epoch": 1.790954415954416, + "grad_norm": 0.7181384563446045, + "learning_rate": 0.00011652485492324495, + "loss": 0.9113, + "step": 10059 + }, + { + "epoch": 1.7911324786324787, + "grad_norm": 0.7835097908973694, + "learning_rate": 0.00011651104972712582, + "loss": 1.0804, + "step": 10060 + }, + { + "epoch": 1.7913105413105415, + "grad_norm": 0.6843693852424622, + "learning_rate": 0.00011649724420750691, + "loss": 1.0242, + "step": 10061 + }, + { + "epoch": 1.791488603988604, + "grad_norm": 0.8364703059196472, + "learning_rate": 0.00011648343836465885, + "loss": 0.8445, + "step": 10062 + }, + { + "epoch": 1.7916666666666665, + "grad_norm": 0.7122092843055725, + "learning_rate": 0.00011646963219885201, + "loss": 1.0453, + "step": 10063 + }, + { + "epoch": 1.7918447293447293, + "grad_norm": 0.7018755078315735, + "learning_rate": 0.00011645582571035696, + "loss": 0.9753, + "step": 10064 + }, + { + "epoch": 1.792022792022792, + "grad_norm": 0.6522594094276428, + "learning_rate": 0.00011644201889944419, + "loss": 1.0328, + "step": 10065 + }, + { + "epoch": 1.7922008547008548, + "grad_norm": 0.70301353931427, + "learning_rate": 0.00011642821176638419, + "loss": 0.9143, + "step": 10066 + }, + { + "epoch": 1.7923789173789175, + "grad_norm": 0.6255469918251038, + "learning_rate": 0.0001164144043114475, + "loss": 0.9527, + "step": 10067 + }, + { + "epoch": 
1.79255698005698, + "grad_norm": 0.6780602931976318, + "learning_rate": 0.0001164005965349047, + "loss": 0.9192, + "step": 10068 + }, + { + "epoch": 1.7927350427350426, + "grad_norm": 0.6025984287261963, + "learning_rate": 0.00011638678843702626, + "loss": 0.9055, + "step": 10069 + }, + { + "epoch": 1.7929131054131053, + "grad_norm": 0.6430829763412476, + "learning_rate": 0.00011637298001808275, + "loss": 0.9359, + "step": 10070 + }, + { + "epoch": 1.793091168091168, + "grad_norm": 0.6388106942176819, + "learning_rate": 0.0001163591712783447, + "loss": 0.8847, + "step": 10071 + }, + { + "epoch": 1.7932692307692308, + "grad_norm": 0.706347644329071, + "learning_rate": 0.00011634536221808265, + "loss": 0.9055, + "step": 10072 + }, + { + "epoch": 1.7934472934472936, + "grad_norm": 0.661226749420166, + "learning_rate": 0.00011633155283756721, + "loss": 1.118, + "step": 10073 + }, + { + "epoch": 1.7936253561253561, + "grad_norm": 0.543207049369812, + "learning_rate": 0.00011631774313706891, + "loss": 0.8856, + "step": 10074 + }, + { + "epoch": 1.7938034188034186, + "grad_norm": 0.6514154672622681, + "learning_rate": 0.00011630393311685835, + "loss": 0.8967, + "step": 10075 + }, + { + "epoch": 1.7939814814814814, + "grad_norm": 0.8669198155403137, + "learning_rate": 0.00011629012277720607, + "loss": 1.0362, + "step": 10076 + }, + { + "epoch": 1.7941595441595442, + "grad_norm": 0.7256068587303162, + "learning_rate": 0.00011627631211838266, + "loss": 1.1948, + "step": 10077 + }, + { + "epoch": 1.794337606837607, + "grad_norm": 0.6504935622215271, + "learning_rate": 0.00011626250114065875, + "loss": 0.8309, + "step": 10078 + }, + { + "epoch": 1.7945156695156697, + "grad_norm": 0.6964160799980164, + "learning_rate": 0.0001162486898443049, + "loss": 0.9593, + "step": 10079 + }, + { + "epoch": 1.7946937321937322, + "grad_norm": 0.668727695941925, + "learning_rate": 0.00011623487822959174, + "loss": 0.8897, + "step": 10080 + }, + { + "epoch": 1.7948717948717947, + "grad_norm": 
0.6907223463058472, + "learning_rate": 0.00011622106629678986, + "loss": 0.897, + "step": 10081 + }, + { + "epoch": 1.7950498575498575, + "grad_norm": 0.6652865409851074, + "learning_rate": 0.00011620725404616985, + "loss": 0.9321, + "step": 10082 + }, + { + "epoch": 1.7952279202279202, + "grad_norm": 0.6523811221122742, + "learning_rate": 0.00011619344147800239, + "loss": 0.8991, + "step": 10083 + }, + { + "epoch": 1.795405982905983, + "grad_norm": 0.6162952184677124, + "learning_rate": 0.0001161796285925581, + "loss": 0.8061, + "step": 10084 + }, + { + "epoch": 1.7955840455840457, + "grad_norm": 0.670606791973114, + "learning_rate": 0.0001161658153901076, + "loss": 0.9341, + "step": 10085 + }, + { + "epoch": 1.7957621082621082, + "grad_norm": 0.6372489333152771, + "learning_rate": 0.00011615200187092148, + "loss": 1.1049, + "step": 10086 + }, + { + "epoch": 1.7959401709401708, + "grad_norm": 0.7311037182807922, + "learning_rate": 0.00011613818803527045, + "loss": 1.0881, + "step": 10087 + }, + { + "epoch": 1.7961182336182335, + "grad_norm": 0.7440751194953918, + "learning_rate": 0.00011612437388342518, + "loss": 0.9487, + "step": 10088 + }, + { + "epoch": 1.7962962962962963, + "grad_norm": 0.6605934500694275, + "learning_rate": 0.00011611055941565629, + "loss": 0.8757, + "step": 10089 + }, + { + "epoch": 1.796474358974359, + "grad_norm": 0.7546001076698303, + "learning_rate": 0.00011609674463223446, + "loss": 0.9368, + "step": 10090 + }, + { + "epoch": 1.7966524216524218, + "grad_norm": 0.7001389861106873, + "learning_rate": 0.00011608292953343036, + "loss": 0.9098, + "step": 10091 + }, + { + "epoch": 1.7968304843304843, + "grad_norm": 0.6898102760314941, + "learning_rate": 0.00011606911411951462, + "loss": 0.8821, + "step": 10092 + }, + { + "epoch": 1.797008547008547, + "grad_norm": 0.7020773887634277, + "learning_rate": 0.00011605529839075801, + "loss": 1.2775, + "step": 10093 + }, + { + "epoch": 1.7971866096866096, + "grad_norm": 0.6061446070671082, + 
"learning_rate": 0.0001160414823474312, + "loss": 1.0156, + "step": 10094 + }, + { + "epoch": 1.7973646723646723, + "grad_norm": 0.6746069192886353, + "learning_rate": 0.00011602766598980484, + "loss": 0.8223, + "step": 10095 + }, + { + "epoch": 1.797542735042735, + "grad_norm": 0.655829131603241, + "learning_rate": 0.00011601384931814967, + "loss": 0.9482, + "step": 10096 + }, + { + "epoch": 1.7977207977207978, + "grad_norm": 0.6762703061103821, + "learning_rate": 0.00011600003233273636, + "loss": 1.0191, + "step": 10097 + }, + { + "epoch": 1.7978988603988604, + "grad_norm": 0.7610527276992798, + "learning_rate": 0.00011598621503383566, + "loss": 1.0771, + "step": 10098 + }, + { + "epoch": 1.7980769230769231, + "grad_norm": 0.6857240200042725, + "learning_rate": 0.0001159723974217183, + "loss": 0.8325, + "step": 10099 + }, + { + "epoch": 1.7982549857549857, + "grad_norm": 0.6897954940795898, + "learning_rate": 0.00011595857949665501, + "loss": 1.0064, + "step": 10100 + }, + { + "epoch": 1.7984330484330484, + "grad_norm": 0.7023211717605591, + "learning_rate": 0.00011594476125891649, + "loss": 1.1346, + "step": 10101 + }, + { + "epoch": 1.7986111111111112, + "grad_norm": 0.8131003975868225, + "learning_rate": 0.00011593094270877347, + "loss": 1.0384, + "step": 10102 + }, + { + "epoch": 1.798789173789174, + "grad_norm": 0.6504445672035217, + "learning_rate": 0.00011591712384649676, + "loss": 0.8172, + "step": 10103 + }, + { + "epoch": 1.7989672364672364, + "grad_norm": 0.7379748821258545, + "learning_rate": 0.00011590330467235704, + "loss": 1.0118, + "step": 10104 + }, + { + "epoch": 1.7991452991452992, + "grad_norm": 0.8867329955101013, + "learning_rate": 0.0001158894851866251, + "loss": 1.023, + "step": 10105 + }, + { + "epoch": 1.7993233618233617, + "grad_norm": 0.7057412266731262, + "learning_rate": 0.00011587566538957173, + "loss": 0.8415, + "step": 10106 + }, + { + "epoch": 1.7995014245014245, + "grad_norm": 0.7479654550552368, + "learning_rate": 
0.00011586184528146769, + "loss": 0.9663, + "step": 10107 + }, + { + "epoch": 1.7996794871794872, + "grad_norm": 0.6280845403671265, + "learning_rate": 0.00011584802486258368, + "loss": 0.973, + "step": 10108 + }, + { + "epoch": 1.79985754985755, + "grad_norm": 0.6735749840736389, + "learning_rate": 0.00011583420413319059, + "loss": 0.8631, + "step": 10109 + }, + { + "epoch": 1.8000356125356125, + "grad_norm": 0.5940406918525696, + "learning_rate": 0.00011582038309355918, + "loss": 0.8533, + "step": 10110 + }, + { + "epoch": 1.8002136752136753, + "grad_norm": 0.6923874020576477, + "learning_rate": 0.00011580656174396021, + "loss": 1.1105, + "step": 10111 + }, + { + "epoch": 1.8003917378917378, + "grad_norm": 0.6996715664863586, + "learning_rate": 0.00011579274008466447, + "loss": 0.9952, + "step": 10112 + }, + { + "epoch": 1.8005698005698005, + "grad_norm": 0.656561553478241, + "learning_rate": 0.00011577891811594281, + "loss": 0.9621, + "step": 10113 + }, + { + "epoch": 1.8007478632478633, + "grad_norm": 0.7121242880821228, + "learning_rate": 0.00011576509583806605, + "loss": 0.8658, + "step": 10114 + }, + { + "epoch": 1.800925925925926, + "grad_norm": 0.7864459753036499, + "learning_rate": 0.00011575127325130498, + "loss": 0.9867, + "step": 10115 + }, + { + "epoch": 1.8011039886039886, + "grad_norm": 0.6086452007293701, + "learning_rate": 0.00011573745035593042, + "loss": 0.8625, + "step": 10116 + }, + { + "epoch": 1.8012820512820513, + "grad_norm": 0.6553642749786377, + "learning_rate": 0.00011572362715221321, + "loss": 0.8475, + "step": 10117 + }, + { + "epoch": 1.8014601139601139, + "grad_norm": 0.6677348017692566, + "learning_rate": 0.00011570980364042419, + "loss": 0.9672, + "step": 10118 + }, + { + "epoch": 1.8016381766381766, + "grad_norm": 0.6275015473365784, + "learning_rate": 0.0001156959798208342, + "loss": 0.8663, + "step": 10119 + }, + { + "epoch": 1.8018162393162394, + "grad_norm": 0.787568211555481, + "learning_rate": 0.0001156821556937141, + 
"loss": 1.0188, + "step": 10120 + }, + { + "epoch": 1.801994301994302, + "grad_norm": 0.6983163356781006, + "learning_rate": 0.00011566833125933473, + "loss": 1.0767, + "step": 10121 + }, + { + "epoch": 1.8021723646723646, + "grad_norm": 0.7008936405181885, + "learning_rate": 0.00011565450651796695, + "loss": 1.0116, + "step": 10122 + }, + { + "epoch": 1.8023504273504274, + "grad_norm": 0.7694976925849915, + "learning_rate": 0.00011564068146988163, + "loss": 1.0227, + "step": 10123 + }, + { + "epoch": 1.80252849002849, + "grad_norm": 0.9530014991760254, + "learning_rate": 0.00011562685611534967, + "loss": 0.907, + "step": 10124 + }, + { + "epoch": 1.8027065527065527, + "grad_norm": 0.6714984178543091, + "learning_rate": 0.00011561303045464189, + "loss": 0.9501, + "step": 10125 + }, + { + "epoch": 1.8028846153846154, + "grad_norm": 0.7233797311782837, + "learning_rate": 0.00011559920448802925, + "loss": 1.021, + "step": 10126 + }, + { + "epoch": 1.8030626780626782, + "grad_norm": 0.7600540518760681, + "learning_rate": 0.0001155853782157826, + "loss": 1.1056, + "step": 10127 + }, + { + "epoch": 1.8032407407407407, + "grad_norm": 0.7836297750473022, + "learning_rate": 0.00011557155163817281, + "loss": 0.9906, + "step": 10128 + }, + { + "epoch": 1.8034188034188035, + "grad_norm": 0.7161104083061218, + "learning_rate": 0.00011555772475547084, + "loss": 0.9541, + "step": 10129 + }, + { + "epoch": 1.803596866096866, + "grad_norm": 0.6613732576370239, + "learning_rate": 0.00011554389756794757, + "loss": 0.9188, + "step": 10130 + }, + { + "epoch": 1.8037749287749287, + "grad_norm": 0.6415915489196777, + "learning_rate": 0.00011553007007587391, + "loss": 0.9928, + "step": 10131 + }, + { + "epoch": 1.8039529914529915, + "grad_norm": 0.7730516195297241, + "learning_rate": 0.0001155162422795208, + "loss": 1.0654, + "step": 10132 + }, + { + "epoch": 1.8041310541310542, + "grad_norm": 0.6769654750823975, + "learning_rate": 0.00011550241417915913, + "loss": 1.0678, + "step": 10133 
+ }, + { + "epoch": 1.8043091168091168, + "grad_norm": 0.6542425751686096, + "learning_rate": 0.00011548858577505988, + "loss": 0.9796, + "step": 10134 + }, + { + "epoch": 1.8044871794871795, + "grad_norm": 0.7282404899597168, + "learning_rate": 0.00011547475706749395, + "loss": 1.0314, + "step": 10135 + }, + { + "epoch": 1.804665242165242, + "grad_norm": 0.6450245976448059, + "learning_rate": 0.00011546092805673232, + "loss": 0.9564, + "step": 10136 + }, + { + "epoch": 1.8048433048433048, + "grad_norm": 0.65577632188797, + "learning_rate": 0.0001154470987430459, + "loss": 1.0219, + "step": 10137 + }, + { + "epoch": 1.8050213675213675, + "grad_norm": 0.7151737809181213, + "learning_rate": 0.00011543326912670567, + "loss": 0.9245, + "step": 10138 + }, + { + "epoch": 1.8051994301994303, + "grad_norm": 0.6695905327796936, + "learning_rate": 0.00011541943920798259, + "loss": 0.9535, + "step": 10139 + }, + { + "epoch": 1.8053774928774928, + "grad_norm": 0.7443813681602478, + "learning_rate": 0.00011540560898714767, + "loss": 1.1697, + "step": 10140 + }, + { + "epoch": 1.8055555555555556, + "grad_norm": 0.5701992511749268, + "learning_rate": 0.0001153917784644718, + "loss": 0.7868, + "step": 10141 + }, + { + "epoch": 1.805733618233618, + "grad_norm": 0.6992354989051819, + "learning_rate": 0.00011537794764022605, + "loss": 0.9856, + "step": 10142 + }, + { + "epoch": 1.8059116809116809, + "grad_norm": 0.6354477405548096, + "learning_rate": 0.00011536411651468131, + "loss": 0.8752, + "step": 10143 + }, + { + "epoch": 1.8060897435897436, + "grad_norm": 0.6952932476997375, + "learning_rate": 0.00011535028508810864, + "loss": 0.9446, + "step": 10144 + }, + { + "epoch": 1.8062678062678064, + "grad_norm": 0.5527541637420654, + "learning_rate": 0.00011533645336077901, + "loss": 0.5486, + "step": 10145 + }, + { + "epoch": 1.806445868945869, + "grad_norm": 0.685046374797821, + "learning_rate": 0.00011532262133296345, + "loss": 0.9529, + "step": 10146 + }, + { + "epoch": 
1.8066239316239316, + "grad_norm": 0.6927558779716492, + "learning_rate": 0.00011530878900493296, + "loss": 1.1758, + "step": 10147 + }, + { + "epoch": 1.8068019943019942, + "grad_norm": 0.6758309602737427, + "learning_rate": 0.00011529495637695855, + "loss": 1.0076, + "step": 10148 + }, + { + "epoch": 1.806980056980057, + "grad_norm": 0.6739441156387329, + "learning_rate": 0.00011528112344931121, + "loss": 1.1914, + "step": 10149 + }, + { + "epoch": 1.8071581196581197, + "grad_norm": 0.7031944394111633, + "learning_rate": 0.00011526729022226204, + "loss": 0.783, + "step": 10150 + }, + { + "epoch": 1.8073361823361824, + "grad_norm": 0.6476930975914001, + "learning_rate": 0.00011525345669608202, + "loss": 0.9595, + "step": 10151 + }, + { + "epoch": 1.8075142450142452, + "grad_norm": 0.710498571395874, + "learning_rate": 0.00011523962287104222, + "loss": 0.8821, + "step": 10152 + }, + { + "epoch": 1.8076923076923077, + "grad_norm": 0.6664412617683411, + "learning_rate": 0.00011522578874741365, + "loss": 1.0182, + "step": 10153 + }, + { + "epoch": 1.8078703703703702, + "grad_norm": 0.8374263048171997, + "learning_rate": 0.00011521195432546737, + "loss": 0.9394, + "step": 10154 + }, + { + "epoch": 1.808048433048433, + "grad_norm": 0.6770764589309692, + "learning_rate": 0.00011519811960547447, + "loss": 1.0568, + "step": 10155 + }, + { + "epoch": 1.8082264957264957, + "grad_norm": 0.7014045715332031, + "learning_rate": 0.00011518428458770595, + "loss": 1.1705, + "step": 10156 + }, + { + "epoch": 1.8084045584045585, + "grad_norm": 0.6590061187744141, + "learning_rate": 0.00011517044927243295, + "loss": 1.1233, + "step": 10157 + }, + { + "epoch": 1.8085826210826212, + "grad_norm": 0.6093801856040955, + "learning_rate": 0.00011515661365992647, + "loss": 0.953, + "step": 10158 + }, + { + "epoch": 1.8087606837606838, + "grad_norm": 0.6197089552879333, + "learning_rate": 0.00011514277775045768, + "loss": 0.9414, + "step": 10159 + }, + { + "epoch": 1.8089387464387463, + 
"grad_norm": 0.7530463337898254, + "learning_rate": 0.00011512894154429759, + "loss": 0.9168, + "step": 10160 + }, + { + "epoch": 1.809116809116809, + "grad_norm": 0.6051347851753235, + "learning_rate": 0.00011511510504171735, + "loss": 0.9132, + "step": 10161 + }, + { + "epoch": 1.8092948717948718, + "grad_norm": 0.6388311982154846, + "learning_rate": 0.000115101268242988, + "loss": 0.6551, + "step": 10162 + }, + { + "epoch": 1.8094729344729346, + "grad_norm": 0.7040972709655762, + "learning_rate": 0.00011508743114838063, + "loss": 0.9409, + "step": 10163 + }, + { + "epoch": 1.8096509971509973, + "grad_norm": 0.7669548392295837, + "learning_rate": 0.00011507359375816644, + "loss": 1.0376, + "step": 10164 + }, + { + "epoch": 1.8098290598290598, + "grad_norm": 0.7309662699699402, + "learning_rate": 0.00011505975607261646, + "loss": 0.9071, + "step": 10165 + }, + { + "epoch": 1.8100071225071224, + "grad_norm": 0.6624547839164734, + "learning_rate": 0.00011504591809200187, + "loss": 1.0765, + "step": 10166 + }, + { + "epoch": 1.8101851851851851, + "grad_norm": 0.7719045281410217, + "learning_rate": 0.00011503207981659376, + "loss": 0.9244, + "step": 10167 + }, + { + "epoch": 1.8103632478632479, + "grad_norm": 0.6701484322547913, + "learning_rate": 0.0001150182412466633, + "loss": 0.9475, + "step": 10168 + }, + { + "epoch": 1.8105413105413106, + "grad_norm": 0.5604981184005737, + "learning_rate": 0.00011500440238248154, + "loss": 0.6268, + "step": 10169 + }, + { + "epoch": 1.8107193732193734, + "grad_norm": 0.6736510992050171, + "learning_rate": 0.00011499056322431973, + "loss": 0.9088, + "step": 10170 + }, + { + "epoch": 1.810897435897436, + "grad_norm": 0.7428455948829651, + "learning_rate": 0.00011497672377244897, + "loss": 0.9298, + "step": 10171 + }, + { + "epoch": 1.8110754985754984, + "grad_norm": 0.6543142795562744, + "learning_rate": 0.00011496288402714042, + "loss": 0.8863, + "step": 10172 + }, + { + "epoch": 1.8112535612535612, + "grad_norm": 
0.6809250712394714, + "learning_rate": 0.00011494904398866524, + "loss": 0.977, + "step": 10173 + }, + { + "epoch": 1.811431623931624, + "grad_norm": 0.8105120062828064, + "learning_rate": 0.00011493520365729456, + "loss": 1.2115, + "step": 10174 + }, + { + "epoch": 1.8116096866096867, + "grad_norm": 0.6985095143318176, + "learning_rate": 0.00011492136303329964, + "loss": 0.8233, + "step": 10175 + }, + { + "epoch": 1.8117877492877494, + "grad_norm": 0.7198361754417419, + "learning_rate": 0.00011490752211695158, + "loss": 1.0552, + "step": 10176 + }, + { + "epoch": 1.811965811965812, + "grad_norm": 0.7077036499977112, + "learning_rate": 0.0001148936809085216, + "loss": 0.9171, + "step": 10177 + }, + { + "epoch": 1.8121438746438745, + "grad_norm": 0.9362925887107849, + "learning_rate": 0.00011487983940828089, + "loss": 0.9042, + "step": 10178 + }, + { + "epoch": 1.8123219373219372, + "grad_norm": 0.6732819676399231, + "learning_rate": 0.0001148659976165006, + "loss": 1.1033, + "step": 10179 + }, + { + "epoch": 1.8125, + "grad_norm": 0.747702419757843, + "learning_rate": 0.00011485215553345201, + "loss": 1.0692, + "step": 10180 + }, + { + "epoch": 1.8126780626780628, + "grad_norm": 0.7011259198188782, + "learning_rate": 0.00011483831315940627, + "loss": 0.9278, + "step": 10181 + }, + { + "epoch": 1.8128561253561255, + "grad_norm": 0.8542702198028564, + "learning_rate": 0.00011482447049463462, + "loss": 0.9476, + "step": 10182 + }, + { + "epoch": 1.813034188034188, + "grad_norm": 0.6975166201591492, + "learning_rate": 0.00011481062753940825, + "loss": 0.9486, + "step": 10183 + }, + { + "epoch": 1.8132122507122506, + "grad_norm": 0.8239036798477173, + "learning_rate": 0.0001147967842939984, + "loss": 1.0518, + "step": 10184 + }, + { + "epoch": 1.8133903133903133, + "grad_norm": 0.7559717297554016, + "learning_rate": 0.00011478294075867628, + "loss": 1.1877, + "step": 10185 + }, + { + "epoch": 1.813568376068376, + "grad_norm": 0.6755532026290894, + "learning_rate": 
0.00011476909693371318, + "loss": 0.9287, + "step": 10186 + }, + { + "epoch": 1.8137464387464388, + "grad_norm": 0.6561332941055298, + "learning_rate": 0.0001147552528193803, + "loss": 0.83, + "step": 10187 + }, + { + "epoch": 1.8139245014245016, + "grad_norm": 0.7223508954048157, + "learning_rate": 0.00011474140841594887, + "loss": 1.1259, + "step": 10188 + }, + { + "epoch": 1.814102564102564, + "grad_norm": 0.7920593023300171, + "learning_rate": 0.0001147275637236902, + "loss": 1.0925, + "step": 10189 + }, + { + "epoch": 1.8142806267806266, + "grad_norm": 0.6896616816520691, + "learning_rate": 0.00011471371874287546, + "loss": 1.0204, + "step": 10190 + }, + { + "epoch": 1.8144586894586894, + "grad_norm": 0.6149865388870239, + "learning_rate": 0.00011469987347377602, + "loss": 1.1249, + "step": 10191 + }, + { + "epoch": 1.8146367521367521, + "grad_norm": 0.6650002598762512, + "learning_rate": 0.00011468602791666307, + "loss": 0.9723, + "step": 10192 + }, + { + "epoch": 1.8148148148148149, + "grad_norm": 0.7298738956451416, + "learning_rate": 0.00011467218207180792, + "loss": 1.0225, + "step": 10193 + }, + { + "epoch": 1.8149928774928776, + "grad_norm": 0.8075628876686096, + "learning_rate": 0.00011465833593948183, + "loss": 1.0429, + "step": 10194 + }, + { + "epoch": 1.8151709401709402, + "grad_norm": 0.8196593523025513, + "learning_rate": 0.0001146444895199561, + "loss": 0.9148, + "step": 10195 + }, + { + "epoch": 1.8153490028490027, + "grad_norm": 0.6394698023796082, + "learning_rate": 0.00011463064281350204, + "loss": 0.9781, + "step": 10196 + }, + { + "epoch": 1.8155270655270654, + "grad_norm": 0.7302836775779724, + "learning_rate": 0.00011461679582039091, + "loss": 1.0394, + "step": 10197 + }, + { + "epoch": 1.8157051282051282, + "grad_norm": 0.7066670060157776, + "learning_rate": 0.00011460294854089404, + "loss": 1.1153, + "step": 10198 + }, + { + "epoch": 1.815883190883191, + "grad_norm": 0.6471068263053894, + "learning_rate": 0.0001145891009752827, + 
"loss": 1.1533, + "step": 10199 + }, + { + "epoch": 1.8160612535612537, + "grad_norm": 0.6842355132102966, + "learning_rate": 0.00011457525312382826, + "loss": 0.953, + "step": 10200 + }, + { + "epoch": 1.8162393162393162, + "grad_norm": 0.6720319986343384, + "learning_rate": 0.00011456140498680202, + "loss": 1.003, + "step": 10201 + }, + { + "epoch": 1.8164173789173788, + "grad_norm": 0.632017970085144, + "learning_rate": 0.00011454755656447527, + "loss": 0.8148, + "step": 10202 + }, + { + "epoch": 1.8165954415954415, + "grad_norm": 0.7193828225135803, + "learning_rate": 0.00011453370785711939, + "loss": 1.0098, + "step": 10203 + }, + { + "epoch": 1.8167735042735043, + "grad_norm": 0.7098045349121094, + "learning_rate": 0.00011451985886500566, + "loss": 1.1276, + "step": 10204 + }, + { + "epoch": 1.816951566951567, + "grad_norm": 0.7076733708381653, + "learning_rate": 0.00011450600958840547, + "loss": 1.1216, + "step": 10205 + }, + { + "epoch": 1.8171296296296298, + "grad_norm": 0.6864610314369202, + "learning_rate": 0.00011449216002759018, + "loss": 0.9896, + "step": 10206 + }, + { + "epoch": 1.8173076923076923, + "grad_norm": 0.737727701663971, + "learning_rate": 0.0001144783101828311, + "loss": 0.9447, + "step": 10207 + }, + { + "epoch": 1.8174857549857548, + "grad_norm": 0.6562525033950806, + "learning_rate": 0.00011446446005439964, + "loss": 1.1208, + "step": 10208 + }, + { + "epoch": 1.8176638176638176, + "grad_norm": 0.7203826308250427, + "learning_rate": 0.0001144506096425671, + "loss": 1.1339, + "step": 10209 + }, + { + "epoch": 1.8178418803418803, + "grad_norm": 0.6657233834266663, + "learning_rate": 0.00011443675894760489, + "loss": 0.8307, + "step": 10210 + }, + { + "epoch": 1.818019943019943, + "grad_norm": 0.7032586932182312, + "learning_rate": 0.00011442290796978437, + "loss": 0.8546, + "step": 10211 + }, + { + "epoch": 1.8181980056980058, + "grad_norm": 0.6989460587501526, + "learning_rate": 0.00011440905670937696, + "loss": 1.0749, + "step": 10212 
+ }, + { + "epoch": 1.8183760683760684, + "grad_norm": 0.6461085677146912, + "learning_rate": 0.00011439520516665399, + "loss": 0.984, + "step": 10213 + }, + { + "epoch": 1.818554131054131, + "grad_norm": 0.7077372670173645, + "learning_rate": 0.00011438135334188689, + "loss": 1.0813, + "step": 10214 + }, + { + "epoch": 1.8187321937321936, + "grad_norm": 0.6724075675010681, + "learning_rate": 0.00011436750123534704, + "loss": 0.9975, + "step": 10215 + }, + { + "epoch": 1.8189102564102564, + "grad_norm": 0.6205753684043884, + "learning_rate": 0.00011435364884730583, + "loss": 0.7414, + "step": 10216 + }, + { + "epoch": 1.8190883190883191, + "grad_norm": 0.6416093707084656, + "learning_rate": 0.00011433979617803472, + "loss": 1.0024, + "step": 10217 + }, + { + "epoch": 1.819266381766382, + "grad_norm": 0.7817183136940002, + "learning_rate": 0.00011432594322780508, + "loss": 1.0577, + "step": 10218 + }, + { + "epoch": 1.8194444444444444, + "grad_norm": 0.688220202922821, + "learning_rate": 0.00011431208999688835, + "loss": 1.0301, + "step": 10219 + }, + { + "epoch": 1.8196225071225072, + "grad_norm": 0.6464754343032837, + "learning_rate": 0.0001142982364855559, + "loss": 1.0608, + "step": 10220 + }, + { + "epoch": 1.8198005698005697, + "grad_norm": 0.6607306599617004, + "learning_rate": 0.00011428438269407926, + "loss": 1.1203, + "step": 10221 + }, + { + "epoch": 1.8199786324786325, + "grad_norm": 0.5779942870140076, + "learning_rate": 0.00011427052862272982, + "loss": 0.7895, + "step": 10222 + }, + { + "epoch": 1.8201566951566952, + "grad_norm": 0.7599068880081177, + "learning_rate": 0.000114256674271779, + "loss": 0.883, + "step": 10223 + }, + { + "epoch": 1.820334757834758, + "grad_norm": 0.6578865051269531, + "learning_rate": 0.00011424281964149824, + "loss": 1.101, + "step": 10224 + }, + { + "epoch": 1.8205128205128205, + "grad_norm": 0.7090746760368347, + "learning_rate": 0.00011422896473215905, + "loss": 0.9514, + "step": 10225 + }, + { + "epoch": 
1.8206908831908832, + "grad_norm": 0.7537758946418762, + "learning_rate": 0.00011421510954403281, + "loss": 1.2193, + "step": 10226 + }, + { + "epoch": 1.8208689458689458, + "grad_norm": 0.670183002948761, + "learning_rate": 0.00011420125407739106, + "loss": 1.1408, + "step": 10227 + }, + { + "epoch": 1.8210470085470085, + "grad_norm": 0.742520809173584, + "learning_rate": 0.00011418739833250524, + "loss": 0.8826, + "step": 10228 + }, + { + "epoch": 1.8212250712250713, + "grad_norm": 0.6542800664901733, + "learning_rate": 0.00011417354230964683, + "loss": 1.0039, + "step": 10229 + }, + { + "epoch": 1.821403133903134, + "grad_norm": 0.6713709235191345, + "learning_rate": 0.00011415968600908727, + "loss": 0.9351, + "step": 10230 + }, + { + "epoch": 1.8215811965811965, + "grad_norm": 0.6794951558113098, + "learning_rate": 0.0001141458294310981, + "loss": 0.9491, + "step": 10231 + }, + { + "epoch": 1.8217592592592593, + "grad_norm": 0.6921972632408142, + "learning_rate": 0.00011413197257595079, + "loss": 1.1342, + "step": 10232 + }, + { + "epoch": 1.8219373219373218, + "grad_norm": 0.702586829662323, + "learning_rate": 0.00011411811544391682, + "loss": 0.9992, + "step": 10233 + }, + { + "epoch": 1.8221153846153846, + "grad_norm": 0.8147975206375122, + "learning_rate": 0.00011410425803526772, + "loss": 1.0507, + "step": 10234 + }, + { + "epoch": 1.8222934472934473, + "grad_norm": 0.66419517993927, + "learning_rate": 0.00011409040035027496, + "loss": 1.0426, + "step": 10235 + }, + { + "epoch": 1.82247150997151, + "grad_norm": 0.6132485866546631, + "learning_rate": 0.00011407654238921011, + "loss": 0.9859, + "step": 10236 + }, + { + "epoch": 1.8226495726495726, + "grad_norm": 0.7522366046905518, + "learning_rate": 0.00011406268415234462, + "loss": 0.9379, + "step": 10237 + }, + { + "epoch": 1.8228276353276354, + "grad_norm": 0.6335554122924805, + "learning_rate": 0.00011404882563995007, + "loss": 0.9322, + "step": 10238 + }, + { + "epoch": 1.823005698005698, + 
"grad_norm": 0.7577497363090515, + "learning_rate": 0.00011403496685229797, + "loss": 1.1383, + "step": 10239 + }, + { + "epoch": 1.8231837606837606, + "grad_norm": 0.6796886920928955, + "learning_rate": 0.00011402110778965982, + "loss": 1.0092, + "step": 10240 + }, + { + "epoch": 1.8233618233618234, + "grad_norm": 0.7676617503166199, + "learning_rate": 0.0001140072484523072, + "loss": 1.0137, + "step": 10241 + }, + { + "epoch": 1.8235398860398861, + "grad_norm": 0.7807821035385132, + "learning_rate": 0.00011399338884051165, + "loss": 0.8987, + "step": 10242 + }, + { + "epoch": 1.8237179487179487, + "grad_norm": 0.7169568538665771, + "learning_rate": 0.00011397952895454473, + "loss": 0.8984, + "step": 10243 + }, + { + "epoch": 1.8238960113960114, + "grad_norm": 0.6564654111862183, + "learning_rate": 0.00011396566879467793, + "loss": 1.0255, + "step": 10244 + }, + { + "epoch": 1.824074074074074, + "grad_norm": 0.7290034294128418, + "learning_rate": 0.00011395180836118292, + "loss": 0.9962, + "step": 10245 + }, + { + "epoch": 1.8242521367521367, + "grad_norm": 0.6610758900642395, + "learning_rate": 0.00011393794765433115, + "loss": 1.102, + "step": 10246 + }, + { + "epoch": 1.8244301994301995, + "grad_norm": 0.6875932216644287, + "learning_rate": 0.0001139240866743943, + "loss": 0.9963, + "step": 10247 + }, + { + "epoch": 1.8246082621082622, + "grad_norm": 0.7595645189285278, + "learning_rate": 0.00011391022542164387, + "loss": 1.1285, + "step": 10248 + }, + { + "epoch": 1.8247863247863247, + "grad_norm": 0.6752721667289734, + "learning_rate": 0.0001138963638963515, + "loss": 0.9447, + "step": 10249 + }, + { + "epoch": 1.8249643874643875, + "grad_norm": 0.6697955131530762, + "learning_rate": 0.00011388250209878873, + "loss": 1.0804, + "step": 10250 + }, + { + "epoch": 1.82514245014245, + "grad_norm": 0.6546956896781921, + "learning_rate": 0.00011386864002922713, + "loss": 0.9626, + "step": 10251 + }, + { + "epoch": 1.8253205128205128, + "grad_norm": 
0.8002896904945374, + "learning_rate": 0.00011385477768793838, + "loss": 1.1933, + "step": 10252 + }, + { + "epoch": 1.8254985754985755, + "grad_norm": 0.6566781401634216, + "learning_rate": 0.00011384091507519403, + "loss": 0.9802, + "step": 10253 + }, + { + "epoch": 1.8256766381766383, + "grad_norm": 0.617420494556427, + "learning_rate": 0.00011382705219126572, + "loss": 1.1098, + "step": 10254 + }, + { + "epoch": 1.8258547008547008, + "grad_norm": 0.6558036208152771, + "learning_rate": 0.00011381318903642504, + "loss": 1.0291, + "step": 10255 + }, + { + "epoch": 1.8260327635327636, + "grad_norm": 0.6295637488365173, + "learning_rate": 0.00011379932561094358, + "loss": 1.0792, + "step": 10256 + }, + { + "epoch": 1.826210826210826, + "grad_norm": 0.7475154399871826, + "learning_rate": 0.00011378546191509303, + "loss": 1.1362, + "step": 10257 + }, + { + "epoch": 1.8263888888888888, + "grad_norm": 0.6814939379692078, + "learning_rate": 0.00011377159794914498, + "loss": 0.9131, + "step": 10258 + }, + { + "epoch": 1.8265669515669516, + "grad_norm": 0.6726876497268677, + "learning_rate": 0.00011375773371337111, + "loss": 0.9147, + "step": 10259 + }, + { + "epoch": 1.8267450142450143, + "grad_norm": 0.785943865776062, + "learning_rate": 0.00011374386920804298, + "loss": 1.0137, + "step": 10260 + }, + { + "epoch": 1.8269230769230769, + "grad_norm": 0.7614478468894958, + "learning_rate": 0.0001137300044334323, + "loss": 1.2118, + "step": 10261 + }, + { + "epoch": 1.8271011396011396, + "grad_norm": 0.7317564487457275, + "learning_rate": 0.00011371613938981072, + "loss": 1.0602, + "step": 10262 + }, + { + "epoch": 1.8272792022792022, + "grad_norm": 0.6716432571411133, + "learning_rate": 0.00011370227407744987, + "loss": 0.952, + "step": 10263 + }, + { + "epoch": 1.827457264957265, + "grad_norm": 0.6946425437927246, + "learning_rate": 0.00011368840849662139, + "loss": 1.0554, + "step": 10264 + }, + { + "epoch": 1.8276353276353277, + "grad_norm": 0.6692264080047607, + 
"learning_rate": 0.00011367454264759703, + "loss": 0.8944, + "step": 10265 + }, + { + "epoch": 1.8278133903133904, + "grad_norm": 0.6931505799293518, + "learning_rate": 0.00011366067653064838, + "loss": 0.9045, + "step": 10266 + }, + { + "epoch": 1.827991452991453, + "grad_norm": 0.7233194708824158, + "learning_rate": 0.00011364681014604716, + "loss": 0.9441, + "step": 10267 + }, + { + "epoch": 1.8281695156695157, + "grad_norm": 0.6451242566108704, + "learning_rate": 0.00011363294349406506, + "loss": 0.9948, + "step": 10268 + }, + { + "epoch": 1.8283475783475782, + "grad_norm": 0.6993351578712463, + "learning_rate": 0.00011361907657497375, + "loss": 1.1057, + "step": 10269 + }, + { + "epoch": 1.828525641025641, + "grad_norm": 0.7241137623786926, + "learning_rate": 0.00011360520938904493, + "loss": 0.974, + "step": 10270 + }, + { + "epoch": 1.8287037037037037, + "grad_norm": 0.6349480152130127, + "learning_rate": 0.00011359134193655027, + "loss": 0.9026, + "step": 10271 + }, + { + "epoch": 1.8288817663817665, + "grad_norm": 0.6916826963424683, + "learning_rate": 0.00011357747421776151, + "loss": 0.9153, + "step": 10272 + }, + { + "epoch": 1.8290598290598292, + "grad_norm": 0.879770040512085, + "learning_rate": 0.00011356360623295037, + "loss": 1.0818, + "step": 10273 + }, + { + "epoch": 1.8292378917378918, + "grad_norm": 0.6293807029724121, + "learning_rate": 0.00011354973798238853, + "loss": 1.1164, + "step": 10274 + }, + { + "epoch": 1.8294159544159543, + "grad_norm": 0.7070622444152832, + "learning_rate": 0.0001135358694663477, + "loss": 0.8795, + "step": 10275 + }, + { + "epoch": 1.829594017094017, + "grad_norm": 0.6847673654556274, + "learning_rate": 0.00011352200068509962, + "loss": 0.9173, + "step": 10276 + }, + { + "epoch": 1.8297720797720798, + "grad_norm": 0.6552146077156067, + "learning_rate": 0.00011350813163891605, + "loss": 1.0425, + "step": 10277 + }, + { + "epoch": 1.8299501424501425, + "grad_norm": 0.6432808041572571, + "learning_rate": 
0.0001134942623280687, + "loss": 0.9418, + "step": 10278 + }, + { + "epoch": 1.8301282051282053, + "grad_norm": 0.7412393093109131, + "learning_rate": 0.00011348039275282931, + "loss": 1.1212, + "step": 10279 + }, + { + "epoch": 1.8303062678062678, + "grad_norm": 0.6543423533439636, + "learning_rate": 0.00011346652291346965, + "loss": 1.0553, + "step": 10280 + }, + { + "epoch": 1.8304843304843303, + "grad_norm": 0.7159286141395569, + "learning_rate": 0.00011345265281026138, + "loss": 1.0582, + "step": 10281 + }, + { + "epoch": 1.830662393162393, + "grad_norm": 0.6443323493003845, + "learning_rate": 0.00011343878244347639, + "loss": 0.9462, + "step": 10282 + }, + { + "epoch": 1.8308404558404558, + "grad_norm": 0.7592014074325562, + "learning_rate": 0.00011342491181338634, + "loss": 1.2718, + "step": 10283 + }, + { + "epoch": 1.8310185185185186, + "grad_norm": 0.627109944820404, + "learning_rate": 0.00011341104092026302, + "loss": 1.0177, + "step": 10284 + }, + { + "epoch": 1.8311965811965814, + "grad_norm": 0.8061598539352417, + "learning_rate": 0.00011339716976437827, + "loss": 0.9416, + "step": 10285 + }, + { + "epoch": 1.8313746438746439, + "grad_norm": 0.6584261059761047, + "learning_rate": 0.00011338329834600377, + "loss": 0.8297, + "step": 10286 + }, + { + "epoch": 1.8315527065527064, + "grad_norm": 0.6329470276832581, + "learning_rate": 0.00011336942666541133, + "loss": 0.8386, + "step": 10287 + }, + { + "epoch": 1.8317307692307692, + "grad_norm": 0.6833979487419128, + "learning_rate": 0.00011335555472287275, + "loss": 0.9407, + "step": 10288 + }, + { + "epoch": 1.831908831908832, + "grad_norm": 0.7663840651512146, + "learning_rate": 0.00011334168251865985, + "loss": 1.0018, + "step": 10289 + }, + { + "epoch": 1.8320868945868947, + "grad_norm": 0.7751262784004211, + "learning_rate": 0.00011332781005304436, + "loss": 1.0576, + "step": 10290 + }, + { + "epoch": 1.8322649572649574, + "grad_norm": 0.6857370138168335, + "learning_rate": 0.00011331393732629814, + 
"loss": 0.9888, + "step": 10291 + }, + { + "epoch": 1.83244301994302, + "grad_norm": 0.7534535527229309, + "learning_rate": 0.00011330006433869296, + "loss": 1.0834, + "step": 10292 + }, + { + "epoch": 1.8326210826210825, + "grad_norm": 0.6785250306129456, + "learning_rate": 0.00011328619109050065, + "loss": 1.0471, + "step": 10293 + }, + { + "epoch": 1.8327991452991452, + "grad_norm": 0.7023689150810242, + "learning_rate": 0.00011327231758199303, + "loss": 1.0652, + "step": 10294 + }, + { + "epoch": 1.832977207977208, + "grad_norm": 0.6776610612869263, + "learning_rate": 0.00011325844381344192, + "loss": 0.9504, + "step": 10295 + }, + { + "epoch": 1.8331552706552707, + "grad_norm": 0.7704112529754639, + "learning_rate": 0.00011324456978511917, + "loss": 0.9712, + "step": 10296 + }, + { + "epoch": 1.8333333333333335, + "grad_norm": 0.601502537727356, + "learning_rate": 0.00011323069549729654, + "loss": 1.075, + "step": 10297 + }, + { + "epoch": 1.833511396011396, + "grad_norm": 0.6282439231872559, + "learning_rate": 0.00011321682095024596, + "loss": 0.9238, + "step": 10298 + }, + { + "epoch": 1.8336894586894585, + "grad_norm": 0.6873499155044556, + "learning_rate": 0.00011320294614423921, + "loss": 1.0464, + "step": 10299 + }, + { + "epoch": 1.8338675213675213, + "grad_norm": 0.6063792705535889, + "learning_rate": 0.00011318907107954815, + "loss": 0.9732, + "step": 10300 + }, + { + "epoch": 1.834045584045584, + "grad_norm": 0.5830921530723572, + "learning_rate": 0.00011317519575644464, + "loss": 0.7568, + "step": 10301 + }, + { + "epoch": 1.8342236467236468, + "grad_norm": 0.6394222378730774, + "learning_rate": 0.00011316132017520053, + "loss": 0.9958, + "step": 10302 + }, + { + "epoch": 1.8344017094017095, + "grad_norm": 0.7052412033081055, + "learning_rate": 0.00011314744433608773, + "loss": 0.9129, + "step": 10303 + }, + { + "epoch": 1.834579772079772, + "grad_norm": 0.7287624478340149, + "learning_rate": 0.00011313356823937801, + "loss": 0.8608, + "step": 10304 
+ }, + { + "epoch": 1.8347578347578346, + "grad_norm": 0.702937662601471, + "learning_rate": 0.00011311969188534334, + "loss": 1.3074, + "step": 10305 + }, + { + "epoch": 1.8349358974358974, + "grad_norm": 0.6693850159645081, + "learning_rate": 0.00011310581527425557, + "loss": 0.928, + "step": 10306 + }, + { + "epoch": 1.83511396011396, + "grad_norm": 0.8153932094573975, + "learning_rate": 0.00011309193840638654, + "loss": 1.1771, + "step": 10307 + }, + { + "epoch": 1.8352920227920229, + "grad_norm": 0.6517418622970581, + "learning_rate": 0.00011307806128200821, + "loss": 0.9634, + "step": 10308 + }, + { + "epoch": 1.8354700854700856, + "grad_norm": 0.6626226305961609, + "learning_rate": 0.00011306418390139245, + "loss": 0.9371, + "step": 10309 + }, + { + "epoch": 1.8356481481481481, + "grad_norm": 0.7397477030754089, + "learning_rate": 0.0001130503062648111, + "loss": 0.9398, + "step": 10310 + }, + { + "epoch": 1.8358262108262107, + "grad_norm": 0.6790265440940857, + "learning_rate": 0.00011303642837253614, + "loss": 0.9728, + "step": 10311 + }, + { + "epoch": 1.8360042735042734, + "grad_norm": 0.6266449093818665, + "learning_rate": 0.00011302255022483941, + "loss": 0.847, + "step": 10312 + }, + { + "epoch": 1.8361823361823362, + "grad_norm": 0.791657030582428, + "learning_rate": 0.00011300867182199288, + "loss": 0.8342, + "step": 10313 + }, + { + "epoch": 1.836360398860399, + "grad_norm": 0.7128583788871765, + "learning_rate": 0.00011299479316426846, + "loss": 0.9591, + "step": 10314 + }, + { + "epoch": 1.8365384615384617, + "grad_norm": 0.659928023815155, + "learning_rate": 0.00011298091425193806, + "loss": 1.0282, + "step": 10315 + }, + { + "epoch": 1.8367165242165242, + "grad_norm": 0.6641396284103394, + "learning_rate": 0.00011296703508527363, + "loss": 1.0161, + "step": 10316 + }, + { + "epoch": 1.8368945868945867, + "grad_norm": 0.7921316027641296, + "learning_rate": 0.00011295315566454702, + "loss": 0.8897, + "step": 10317 + }, + { + "epoch": 
1.8370726495726495, + "grad_norm": 0.6900694966316223, + "learning_rate": 0.00011293927599003029, + "loss": 1.0094, + "step": 10318 + }, + { + "epoch": 1.8372507122507122, + "grad_norm": 0.8054366707801819, + "learning_rate": 0.0001129253960619953, + "loss": 0.9489, + "step": 10319 + }, + { + "epoch": 1.837428774928775, + "grad_norm": 0.6623767018318176, + "learning_rate": 0.00011291151588071405, + "loss": 0.92, + "step": 10320 + }, + { + "epoch": 1.8376068376068377, + "grad_norm": 0.6143901348114014, + "learning_rate": 0.00011289763544645846, + "loss": 0.8093, + "step": 10321 + }, + { + "epoch": 1.8377849002849003, + "grad_norm": 0.8207027316093445, + "learning_rate": 0.00011288375475950046, + "loss": 1.2402, + "step": 10322 + }, + { + "epoch": 1.8379629629629628, + "grad_norm": 0.6759985685348511, + "learning_rate": 0.00011286987382011209, + "loss": 0.9179, + "step": 10323 + }, + { + "epoch": 1.8381410256410255, + "grad_norm": 0.745439887046814, + "learning_rate": 0.00011285599262856523, + "loss": 0.8157, + "step": 10324 + }, + { + "epoch": 1.8383190883190883, + "grad_norm": 0.6873317360877991, + "learning_rate": 0.00011284211118513194, + "loss": 0.8681, + "step": 10325 + }, + { + "epoch": 1.838497150997151, + "grad_norm": 0.7060160040855408, + "learning_rate": 0.00011282822949008416, + "loss": 1.0833, + "step": 10326 + }, + { + "epoch": 1.8386752136752138, + "grad_norm": 0.8079642653465271, + "learning_rate": 0.00011281434754369389, + "loss": 0.8639, + "step": 10327 + }, + { + "epoch": 1.8388532763532763, + "grad_norm": 0.6434001922607422, + "learning_rate": 0.00011280046534623303, + "loss": 0.9269, + "step": 10328 + }, + { + "epoch": 1.839031339031339, + "grad_norm": 0.7005292773246765, + "learning_rate": 0.0001127865828979737, + "loss": 1.1475, + "step": 10329 + }, + { + "epoch": 1.8392094017094016, + "grad_norm": 0.7004852890968323, + "learning_rate": 0.00011277270019918784, + "loss": 0.9467, + "step": 10330 + }, + { + "epoch": 1.8393874643874644, + 
"grad_norm": 0.7542549967765808, + "learning_rate": 0.00011275881725014743, + "loss": 1.0371, + "step": 10331 + }, + { + "epoch": 1.8395655270655271, + "grad_norm": 0.674051821231842, + "learning_rate": 0.00011274493405112452, + "loss": 1.1097, + "step": 10332 + }, + { + "epoch": 1.8397435897435899, + "grad_norm": 0.8136405348777771, + "learning_rate": 0.00011273105060239107, + "loss": 0.9718, + "step": 10333 + }, + { + "epoch": 1.8399216524216524, + "grad_norm": 0.6524073481559753, + "learning_rate": 0.00011271716690421916, + "loss": 0.9953, + "step": 10334 + }, + { + "epoch": 1.8400997150997151, + "grad_norm": 0.7436625957489014, + "learning_rate": 0.00011270328295688077, + "loss": 1.0722, + "step": 10335 + }, + { + "epoch": 1.8402777777777777, + "grad_norm": 0.6815723180770874, + "learning_rate": 0.00011268939876064795, + "loss": 1.0924, + "step": 10336 + }, + { + "epoch": 1.8404558404558404, + "grad_norm": 0.6923388242721558, + "learning_rate": 0.0001126755143157927, + "loss": 0.921, + "step": 10337 + }, + { + "epoch": 1.8406339031339032, + "grad_norm": 0.7464849948883057, + "learning_rate": 0.00011266162962258708, + "loss": 1.0549, + "step": 10338 + }, + { + "epoch": 1.840811965811966, + "grad_norm": 0.6621805429458618, + "learning_rate": 0.00011264774468130315, + "loss": 1.0764, + "step": 10339 + }, + { + "epoch": 1.8409900284900285, + "grad_norm": 0.7370132803916931, + "learning_rate": 0.00011263385949221295, + "loss": 0.7818, + "step": 10340 + }, + { + "epoch": 1.8411680911680912, + "grad_norm": 0.673100471496582, + "learning_rate": 0.00011261997405558848, + "loss": 1.04, + "step": 10341 + }, + { + "epoch": 1.8413461538461537, + "grad_norm": 0.5978201031684875, + "learning_rate": 0.00011260608837170183, + "loss": 0.9644, + "step": 10342 + }, + { + "epoch": 1.8415242165242165, + "grad_norm": 0.6868628263473511, + "learning_rate": 0.00011259220244082507, + "loss": 0.9533, + "step": 10343 + }, + { + "epoch": 1.8417022792022792, + "grad_norm": 
0.6580314636230469, + "learning_rate": 0.0001125783162632303, + "loss": 0.9506, + "step": 10344 + }, + { + "epoch": 1.841880341880342, + "grad_norm": 0.7238291501998901, + "learning_rate": 0.00011256442983918951, + "loss": 0.8663, + "step": 10345 + }, + { + "epoch": 1.8420584045584045, + "grad_norm": 0.5838520526885986, + "learning_rate": 0.00011255054316897484, + "loss": 0.9606, + "step": 10346 + }, + { + "epoch": 1.8422364672364673, + "grad_norm": 0.7102842926979065, + "learning_rate": 0.00011253665625285836, + "loss": 0.801, + "step": 10347 + }, + { + "epoch": 1.8424145299145298, + "grad_norm": 0.6449147462844849, + "learning_rate": 0.0001125227690911121, + "loss": 1.0827, + "step": 10348 + }, + { + "epoch": 1.8425925925925926, + "grad_norm": 0.6355304718017578, + "learning_rate": 0.00011250888168400823, + "loss": 1.0369, + "step": 10349 + }, + { + "epoch": 1.8427706552706553, + "grad_norm": 0.678977906703949, + "learning_rate": 0.0001124949940318188, + "loss": 0.9491, + "step": 10350 + }, + { + "epoch": 1.842948717948718, + "grad_norm": 0.6366633772850037, + "learning_rate": 0.00011248110613481592, + "loss": 0.7272, + "step": 10351 + }, + { + "epoch": 1.8431267806267806, + "grad_norm": 0.6639098525047302, + "learning_rate": 0.00011246721799327171, + "loss": 1.0313, + "step": 10352 + }, + { + "epoch": 1.8433048433048433, + "grad_norm": 0.6034720540046692, + "learning_rate": 0.00011245332960745822, + "loss": 0.7141, + "step": 10353 + }, + { + "epoch": 1.8434829059829059, + "grad_norm": 0.8118346333503723, + "learning_rate": 0.00011243944097764763, + "loss": 1.171, + "step": 10354 + }, + { + "epoch": 1.8436609686609686, + "grad_norm": 0.6706618070602417, + "learning_rate": 0.00011242555210411203, + "loss": 0.9578, + "step": 10355 + }, + { + "epoch": 1.8438390313390314, + "grad_norm": 0.619562029838562, + "learning_rate": 0.00011241166298712355, + "loss": 0.9883, + "step": 10356 + }, + { + "epoch": 1.8440170940170941, + "grad_norm": 0.6471936106681824, + 
"learning_rate": 0.00011239777362695434, + "loss": 0.8897, + "step": 10357 + }, + { + "epoch": 1.8441951566951567, + "grad_norm": 0.7179005742073059, + "learning_rate": 0.00011238388402387645, + "loss": 0.9646, + "step": 10358 + }, + { + "epoch": 1.8443732193732194, + "grad_norm": 0.7726966738700867, + "learning_rate": 0.00011236999417816214, + "loss": 0.8855, + "step": 10359 + }, + { + "epoch": 1.844551282051282, + "grad_norm": 0.6733565330505371, + "learning_rate": 0.00011235610409008346, + "loss": 1.0379, + "step": 10360 + }, + { + "epoch": 1.8447293447293447, + "grad_norm": 0.7317814826965332, + "learning_rate": 0.0001123422137599126, + "loss": 0.8528, + "step": 10361 + }, + { + "epoch": 1.8449074074074074, + "grad_norm": 0.6727005839347839, + "learning_rate": 0.0001123283231879217, + "loss": 0.9612, + "step": 10362 + }, + { + "epoch": 1.8450854700854702, + "grad_norm": 0.6350542306900024, + "learning_rate": 0.00011231443237438289, + "loss": 0.9939, + "step": 10363 + }, + { + "epoch": 1.8452635327635327, + "grad_norm": 0.693148672580719, + "learning_rate": 0.00011230054131956836, + "loss": 1.0149, + "step": 10364 + }, + { + "epoch": 1.8454415954415955, + "grad_norm": 0.7263579368591309, + "learning_rate": 0.0001122866500237503, + "loss": 1.1044, + "step": 10365 + }, + { + "epoch": 1.845619658119658, + "grad_norm": 0.7044230699539185, + "learning_rate": 0.00011227275848720085, + "loss": 1.0677, + "step": 10366 + }, + { + "epoch": 1.8457977207977208, + "grad_norm": 0.6895326972007751, + "learning_rate": 0.00011225886671019219, + "loss": 1.1025, + "step": 10367 + }, + { + "epoch": 1.8459757834757835, + "grad_norm": 0.6045145988464355, + "learning_rate": 0.00011224497469299651, + "loss": 0.8079, + "step": 10368 + }, + { + "epoch": 1.8461538461538463, + "grad_norm": 0.6613210439682007, + "learning_rate": 0.00011223108243588599, + "loss": 1.0345, + "step": 10369 + }, + { + "epoch": 1.8463319088319088, + "grad_norm": 0.6288960576057434, + "learning_rate": 
0.0001122171899391328, + "loss": 1.0166, + "step": 10370 + }, + { + "epoch": 1.8465099715099715, + "grad_norm": 0.6158748865127563, + "learning_rate": 0.00011220329720300917, + "loss": 0.895, + "step": 10371 + }, + { + "epoch": 1.846688034188034, + "grad_norm": 0.6583057641983032, + "learning_rate": 0.00011218940422778728, + "loss": 0.8059, + "step": 10372 + }, + { + "epoch": 1.8468660968660968, + "grad_norm": 0.6761550903320312, + "learning_rate": 0.00011217551101373932, + "loss": 0.9253, + "step": 10373 + }, + { + "epoch": 1.8470441595441596, + "grad_norm": 0.5969263315200806, + "learning_rate": 0.0001121616175611375, + "loss": 0.8549, + "step": 10374 + }, + { + "epoch": 1.8472222222222223, + "grad_norm": 0.7994722723960876, + "learning_rate": 0.00011214772387025407, + "loss": 0.9918, + "step": 10375 + }, + { + "epoch": 1.8474002849002849, + "grad_norm": 0.6949167847633362, + "learning_rate": 0.00011213382994136123, + "loss": 1.1853, + "step": 10376 + }, + { + "epoch": 1.8475783475783476, + "grad_norm": 0.7356176376342773, + "learning_rate": 0.00011211993577473121, + "loss": 0.8809, + "step": 10377 + }, + { + "epoch": 1.8477564102564101, + "grad_norm": 0.7110268473625183, + "learning_rate": 0.0001121060413706362, + "loss": 0.9805, + "step": 10378 + }, + { + "epoch": 1.8479344729344729, + "grad_norm": 0.6509962677955627, + "learning_rate": 0.00011209214672934846, + "loss": 0.8899, + "step": 10379 + }, + { + "epoch": 1.8481125356125356, + "grad_norm": 0.6103082299232483, + "learning_rate": 0.00011207825185114025, + "loss": 0.8576, + "step": 10380 + }, + { + "epoch": 1.8482905982905984, + "grad_norm": 0.6261070966720581, + "learning_rate": 0.00011206435673628377, + "loss": 0.8884, + "step": 10381 + }, + { + "epoch": 1.848468660968661, + "grad_norm": 0.7629222273826599, + "learning_rate": 0.00011205046138505126, + "loss": 1.1714, + "step": 10382 + }, + { + "epoch": 1.8486467236467237, + "grad_norm": 0.617957353591919, + "learning_rate": 0.000112036565797715, + 
"loss": 0.9546, + "step": 10383 + }, + { + "epoch": 1.8488247863247862, + "grad_norm": 0.6926987171173096, + "learning_rate": 0.00011202266997454724, + "loss": 0.8842, + "step": 10384 + }, + { + "epoch": 1.849002849002849, + "grad_norm": 0.602758526802063, + "learning_rate": 0.00011200877391582025, + "loss": 0.9782, + "step": 10385 + }, + { + "epoch": 1.8491809116809117, + "grad_norm": 0.706731915473938, + "learning_rate": 0.00011199487762180627, + "loss": 0.8176, + "step": 10386 + }, + { + "epoch": 1.8493589743589745, + "grad_norm": 0.7135118842124939, + "learning_rate": 0.0001119809810927776, + "loss": 0.9277, + "step": 10387 + }, + { + "epoch": 1.8495370370370372, + "grad_norm": 0.7484592199325562, + "learning_rate": 0.00011196708432900647, + "loss": 1.0733, + "step": 10388 + }, + { + "epoch": 1.8497150997150997, + "grad_norm": 0.7087157964706421, + "learning_rate": 0.00011195318733076519, + "loss": 0.9443, + "step": 10389 + }, + { + "epoch": 1.8498931623931623, + "grad_norm": 0.6511468291282654, + "learning_rate": 0.00011193929009832602, + "loss": 0.955, + "step": 10390 + }, + { + "epoch": 1.850071225071225, + "grad_norm": 0.6386628746986389, + "learning_rate": 0.0001119253926319613, + "loss": 1.0357, + "step": 10391 + }, + { + "epoch": 1.8502492877492878, + "grad_norm": 0.6400021314620972, + "learning_rate": 0.00011191149493194327, + "loss": 0.8094, + "step": 10392 + }, + { + "epoch": 1.8504273504273505, + "grad_norm": 0.7942537069320679, + "learning_rate": 0.00011189759699854423, + "loss": 0.9717, + "step": 10393 + }, + { + "epoch": 1.8506054131054133, + "grad_norm": 0.7230474948883057, + "learning_rate": 0.00011188369883203647, + "loss": 0.9043, + "step": 10394 + }, + { + "epoch": 1.8507834757834758, + "grad_norm": 0.8837162852287292, + "learning_rate": 0.00011186980043269235, + "loss": 1.2821, + "step": 10395 + }, + { + "epoch": 1.8509615384615383, + "grad_norm": 0.7260291576385498, + "learning_rate": 0.00011185590180078413, + "loss": 1.1672, + "step": 
10396 + }, + { + "epoch": 1.851139601139601, + "grad_norm": 0.6290066242218018, + "learning_rate": 0.00011184200293658415, + "loss": 0.8942, + "step": 10397 + }, + { + "epoch": 1.8513176638176638, + "grad_norm": 0.6571013331413269, + "learning_rate": 0.00011182810384036475, + "loss": 1.0753, + "step": 10398 + }, + { + "epoch": 1.8514957264957266, + "grad_norm": 0.6494737267494202, + "learning_rate": 0.00011181420451239817, + "loss": 0.8833, + "step": 10399 + }, + { + "epoch": 1.8516737891737893, + "grad_norm": 0.7383694648742676, + "learning_rate": 0.00011180030495295684, + "loss": 1.0094, + "step": 10400 + }, + { + "epoch": 1.8518518518518519, + "grad_norm": 0.6713876724243164, + "learning_rate": 0.00011178640516231302, + "loss": 0.975, + "step": 10401 + }, + { + "epoch": 1.8520299145299144, + "grad_norm": 0.8041042685508728, + "learning_rate": 0.00011177250514073912, + "loss": 1.1419, + "step": 10402 + }, + { + "epoch": 1.8522079772079771, + "grad_norm": 0.7035061120986938, + "learning_rate": 0.00011175860488850738, + "loss": 1.0921, + "step": 10403 + }, + { + "epoch": 1.85238603988604, + "grad_norm": 0.6135673522949219, + "learning_rate": 0.00011174470440589022, + "loss": 0.9611, + "step": 10404 + }, + { + "epoch": 1.8525641025641026, + "grad_norm": 0.7868386507034302, + "learning_rate": 0.00011173080369315999, + "loss": 0.8561, + "step": 10405 + }, + { + "epoch": 1.8527421652421654, + "grad_norm": 0.6575735211372375, + "learning_rate": 0.00011171690275058902, + "loss": 1.0256, + "step": 10406 + }, + { + "epoch": 1.852920227920228, + "grad_norm": 0.7514392137527466, + "learning_rate": 0.00011170300157844969, + "loss": 1.0868, + "step": 10407 + }, + { + "epoch": 1.8530982905982905, + "grad_norm": 0.6915257573127747, + "learning_rate": 0.00011168910017701436, + "loss": 1.1223, + "step": 10408 + }, + { + "epoch": 1.8532763532763532, + "grad_norm": 0.7406772971153259, + "learning_rate": 0.00011167519854655535, + "loss": 1.0922, + "step": 10409 + }, + { + "epoch": 
1.853454415954416, + "grad_norm": 0.6632742881774902, + "learning_rate": 0.0001116612966873451, + "loss": 0.9082, + "step": 10410 + }, + { + "epoch": 1.8536324786324787, + "grad_norm": 0.8154461979866028, + "learning_rate": 0.00011164739459965598, + "loss": 1.1126, + "step": 10411 + }, + { + "epoch": 1.8538105413105415, + "grad_norm": 0.895764172077179, + "learning_rate": 0.00011163349228376037, + "loss": 1.0589, + "step": 10412 + }, + { + "epoch": 1.853988603988604, + "grad_norm": 0.6746504902839661, + "learning_rate": 0.00011161958973993063, + "loss": 1.0184, + "step": 10413 + }, + { + "epoch": 1.8541666666666665, + "grad_norm": 0.7271263003349304, + "learning_rate": 0.00011160568696843916, + "loss": 0.9989, + "step": 10414 + }, + { + "epoch": 1.8543447293447293, + "grad_norm": 0.7503132820129395, + "learning_rate": 0.00011159178396955836, + "loss": 1.0783, + "step": 10415 + }, + { + "epoch": 1.854522792022792, + "grad_norm": 0.6768177151679993, + "learning_rate": 0.00011157788074356066, + "loss": 0.9916, + "step": 10416 + }, + { + "epoch": 1.8547008547008548, + "grad_norm": 0.6804978251457214, + "learning_rate": 0.00011156397729071842, + "loss": 0.9534, + "step": 10417 + }, + { + "epoch": 1.8548789173789175, + "grad_norm": 0.7144617438316345, + "learning_rate": 0.00011155007361130408, + "loss": 0.991, + "step": 10418 + }, + { + "epoch": 1.85505698005698, + "grad_norm": 0.6816750168800354, + "learning_rate": 0.00011153616970559, + "loss": 0.9551, + "step": 10419 + }, + { + "epoch": 1.8552350427350426, + "grad_norm": 0.6620030999183655, + "learning_rate": 0.00011152226557384866, + "loss": 0.8854, + "step": 10420 + }, + { + "epoch": 1.8554131054131053, + "grad_norm": 0.8400058746337891, + "learning_rate": 0.00011150836121635249, + "loss": 1.1593, + "step": 10421 + }, + { + "epoch": 1.855591168091168, + "grad_norm": 0.6666815280914307, + "learning_rate": 0.00011149445663337385, + "loss": 1.2112, + "step": 10422 + }, + { + "epoch": 1.8557692307692308, + "grad_norm": 
0.7298431396484375, + "learning_rate": 0.00011148055182518522, + "loss": 0.9721, + "step": 10423 + }, + { + "epoch": 1.8559472934472936, + "grad_norm": 0.66816645860672, + "learning_rate": 0.00011146664679205903, + "loss": 1.0945, + "step": 10424 + }, + { + "epoch": 1.8561253561253561, + "grad_norm": 0.5979483127593994, + "learning_rate": 0.00011145274153426771, + "loss": 1.0176, + "step": 10425 + }, + { + "epoch": 1.8563034188034186, + "grad_norm": 0.6579445600509644, + "learning_rate": 0.00011143883605208372, + "loss": 0.9143, + "step": 10426 + }, + { + "epoch": 1.8564814814814814, + "grad_norm": 0.6871697902679443, + "learning_rate": 0.0001114249303457795, + "loss": 1.071, + "step": 10427 + }, + { + "epoch": 1.8566595441595442, + "grad_norm": 0.6683333516120911, + "learning_rate": 0.0001114110244156275, + "loss": 0.7809, + "step": 10428 + }, + { + "epoch": 1.856837606837607, + "grad_norm": 0.6122907996177673, + "learning_rate": 0.0001113971182619002, + "loss": 0.8329, + "step": 10429 + }, + { + "epoch": 1.8570156695156697, + "grad_norm": 0.6510575413703918, + "learning_rate": 0.00011138321188487, + "loss": 1.0068, + "step": 10430 + }, + { + "epoch": 1.8571937321937322, + "grad_norm": 0.6417793035507202, + "learning_rate": 0.00011136930528480945, + "loss": 1.0093, + "step": 10431 + }, + { + "epoch": 1.8573717948717947, + "grad_norm": 0.595824658870697, + "learning_rate": 0.00011135539846199096, + "loss": 0.9856, + "step": 10432 + }, + { + "epoch": 1.8575498575498575, + "grad_norm": 0.7594470381736755, + "learning_rate": 0.00011134149141668704, + "loss": 0.8173, + "step": 10433 + }, + { + "epoch": 1.8577279202279202, + "grad_norm": 0.7078324556350708, + "learning_rate": 0.00011132758414917016, + "loss": 1.0236, + "step": 10434 + }, + { + "epoch": 1.857905982905983, + "grad_norm": 0.6830437779426575, + "learning_rate": 0.00011131367665971275, + "loss": 0.8483, + "step": 10435 + }, + { + "epoch": 1.8580840455840457, + "grad_norm": 0.6856399774551392, + 
"learning_rate": 0.0001112997689485874, + "loss": 0.8729, + "step": 10436 + }, + { + "epoch": 1.8582621082621082, + "grad_norm": 0.6530426144599915, + "learning_rate": 0.00011128586101606653, + "loss": 0.8616, + "step": 10437 + }, + { + "epoch": 1.8584401709401708, + "grad_norm": 0.6341808438301086, + "learning_rate": 0.00011127195286242267, + "loss": 0.896, + "step": 10438 + }, + { + "epoch": 1.8586182336182335, + "grad_norm": 0.6278257966041565, + "learning_rate": 0.00011125804448792831, + "loss": 0.8309, + "step": 10439 + }, + { + "epoch": 1.8587962962962963, + "grad_norm": 0.708705723285675, + "learning_rate": 0.00011124413589285594, + "loss": 1.1065, + "step": 10440 + }, + { + "epoch": 1.858974358974359, + "grad_norm": 0.6845232248306274, + "learning_rate": 0.00011123022707747808, + "loss": 0.9292, + "step": 10441 + }, + { + "epoch": 1.8591524216524218, + "grad_norm": 0.749204695224762, + "learning_rate": 0.00011121631804206726, + "loss": 1.0487, + "step": 10442 + }, + { + "epoch": 1.8593304843304843, + "grad_norm": 0.7123128771781921, + "learning_rate": 0.00011120240878689599, + "loss": 0.9138, + "step": 10443 + }, + { + "epoch": 1.859508547008547, + "grad_norm": 0.6862115263938904, + "learning_rate": 0.00011118849931223679, + "loss": 1.0675, + "step": 10444 + }, + { + "epoch": 1.8596866096866096, + "grad_norm": 0.7245760560035706, + "learning_rate": 0.00011117458961836215, + "loss": 0.9643, + "step": 10445 + }, + { + "epoch": 1.8598646723646723, + "grad_norm": 0.701574444770813, + "learning_rate": 0.0001111606797055447, + "loss": 1.0022, + "step": 10446 + }, + { + "epoch": 1.860042735042735, + "grad_norm": 0.7292088270187378, + "learning_rate": 0.0001111467695740569, + "loss": 0.9465, + "step": 10447 + }, + { + "epoch": 1.8602207977207978, + "grad_norm": 0.7045044302940369, + "learning_rate": 0.0001111328592241713, + "loss": 1.0942, + "step": 10448 + }, + { + "epoch": 1.8603988603988604, + "grad_norm": 0.7181426286697388, + "learning_rate": 
0.00011111894865616046, + "loss": 1.2108, + "step": 10449 + }, + { + "epoch": 1.8605769230769231, + "grad_norm": 0.6083306074142456, + "learning_rate": 0.00011110503787029689, + "loss": 0.929, + "step": 10450 + }, + { + "epoch": 1.8607549857549857, + "grad_norm": 0.6847347617149353, + "learning_rate": 0.00011109112686685319, + "loss": 1.0911, + "step": 10451 + }, + { + "epoch": 1.8609330484330484, + "grad_norm": 0.7131744027137756, + "learning_rate": 0.0001110772156461019, + "loss": 0.9649, + "step": 10452 + }, + { + "epoch": 1.8611111111111112, + "grad_norm": 0.7920312881469727, + "learning_rate": 0.00011106330420831559, + "loss": 0.9965, + "step": 10453 + }, + { + "epoch": 1.861289173789174, + "grad_norm": 0.6640987992286682, + "learning_rate": 0.00011104939255376681, + "loss": 1.2346, + "step": 10454 + }, + { + "epoch": 1.8614672364672364, + "grad_norm": 0.5878208875656128, + "learning_rate": 0.00011103548068272811, + "loss": 0.8565, + "step": 10455 + }, + { + "epoch": 1.8616452991452992, + "grad_norm": 0.6636882424354553, + "learning_rate": 0.0001110215685954721, + "loss": 0.8556, + "step": 10456 + }, + { + "epoch": 1.8618233618233617, + "grad_norm": 0.5985570549964905, + "learning_rate": 0.00011100765629227137, + "loss": 1.0291, + "step": 10457 + }, + { + "epoch": 1.8620014245014245, + "grad_norm": 0.7546643614768982, + "learning_rate": 0.00011099374377339846, + "loss": 1.0199, + "step": 10458 + }, + { + "epoch": 1.8621794871794872, + "grad_norm": 0.6529727578163147, + "learning_rate": 0.00011097983103912602, + "loss": 1.0826, + "step": 10459 + }, + { + "epoch": 1.86235754985755, + "grad_norm": 0.6394338607788086, + "learning_rate": 0.00011096591808972654, + "loss": 0.9896, + "step": 10460 + }, + { + "epoch": 1.8625356125356125, + "grad_norm": 0.6508805751800537, + "learning_rate": 0.00011095200492547271, + "loss": 0.9659, + "step": 10461 + }, + { + "epoch": 1.8627136752136753, + "grad_norm": 0.7085812091827393, + "learning_rate": 0.00011093809154663705, + 
"loss": 0.9998, + "step": 10462 + }, + { + "epoch": 1.8628917378917378, + "grad_norm": 0.6488457322120667, + "learning_rate": 0.00011092417795349226, + "loss": 0.9757, + "step": 10463 + }, + { + "epoch": 1.8630698005698005, + "grad_norm": 0.6405763626098633, + "learning_rate": 0.0001109102641463109, + "loss": 0.8188, + "step": 10464 + }, + { + "epoch": 1.8632478632478633, + "grad_norm": 0.713361918926239, + "learning_rate": 0.00011089635012536554, + "loss": 0.886, + "step": 10465 + }, + { + "epoch": 1.863425925925926, + "grad_norm": 0.5752255916595459, + "learning_rate": 0.00011088243589092886, + "loss": 1.0223, + "step": 10466 + }, + { + "epoch": 1.8636039886039886, + "grad_norm": 0.6722734570503235, + "learning_rate": 0.00011086852144327344, + "loss": 0.9499, + "step": 10467 + }, + { + "epoch": 1.8637820512820513, + "grad_norm": 0.5516420006752014, + "learning_rate": 0.00011085460678267194, + "loss": 0.7767, + "step": 10468 + }, + { + "epoch": 1.8639601139601139, + "grad_norm": 0.731257438659668, + "learning_rate": 0.00011084069190939697, + "loss": 1.2299, + "step": 10469 + }, + { + "epoch": 1.8641381766381766, + "grad_norm": 0.7977055907249451, + "learning_rate": 0.00011082677682372114, + "loss": 0.9109, + "step": 10470 + }, + { + "epoch": 1.8643162393162394, + "grad_norm": 0.679900586605072, + "learning_rate": 0.0001108128615259171, + "loss": 0.9319, + "step": 10471 + }, + { + "epoch": 1.864494301994302, + "grad_norm": 0.7428545951843262, + "learning_rate": 0.00011079894601625754, + "loss": 0.8585, + "step": 10472 + }, + { + "epoch": 1.8646723646723646, + "grad_norm": 0.6560967564582825, + "learning_rate": 0.00011078503029501504, + "loss": 1.0069, + "step": 10473 + }, + { + "epoch": 1.8648504273504274, + "grad_norm": 0.636202871799469, + "learning_rate": 0.00011077111436246228, + "loss": 1.0329, + "step": 10474 + }, + { + "epoch": 1.86502849002849, + "grad_norm": 0.6666205525398254, + "learning_rate": 0.00011075719821887191, + "loss": 1.0123, + "step": 10475 + 
}, + { + "epoch": 1.8652065527065527, + "grad_norm": 0.7089471220970154, + "learning_rate": 0.00011074328186451657, + "loss": 0.7851, + "step": 10476 + }, + { + "epoch": 1.8653846153846154, + "grad_norm": 0.6054788827896118, + "learning_rate": 0.00011072936529966895, + "loss": 0.8224, + "step": 10477 + }, + { + "epoch": 1.8655626780626782, + "grad_norm": 0.6009029150009155, + "learning_rate": 0.00011071544852460172, + "loss": 0.865, + "step": 10478 + }, + { + "epoch": 1.8657407407407407, + "grad_norm": 0.6238716244697571, + "learning_rate": 0.00011070153153958753, + "loss": 0.8685, + "step": 10479 + }, + { + "epoch": 1.8659188034188035, + "grad_norm": 0.719985842704773, + "learning_rate": 0.00011068761434489903, + "loss": 1.2204, + "step": 10480 + }, + { + "epoch": 1.866096866096866, + "grad_norm": 0.72972172498703, + "learning_rate": 0.00011067369694080895, + "loss": 1.0454, + "step": 10481 + }, + { + "epoch": 1.8662749287749287, + "grad_norm": 0.6741998791694641, + "learning_rate": 0.00011065977932758995, + "loss": 0.9992, + "step": 10482 + }, + { + "epoch": 1.8664529914529915, + "grad_norm": 0.6150268912315369, + "learning_rate": 0.00011064586150551472, + "loss": 0.8866, + "step": 10483 + }, + { + "epoch": 1.8666310541310542, + "grad_norm": 0.8253782391548157, + "learning_rate": 0.00011063194347485597, + "loss": 1.1173, + "step": 10484 + }, + { + "epoch": 1.8668091168091168, + "grad_norm": 0.7176247835159302, + "learning_rate": 0.00011061802523588636, + "loss": 1.0414, + "step": 10485 + }, + { + "epoch": 1.8669871794871795, + "grad_norm": 0.6372736096382141, + "learning_rate": 0.00011060410678887858, + "loss": 1.0548, + "step": 10486 + }, + { + "epoch": 1.867165242165242, + "grad_norm": 0.7107454538345337, + "learning_rate": 0.00011059018813410538, + "loss": 1.2298, + "step": 10487 + }, + { + "epoch": 1.8673433048433048, + "grad_norm": 0.7113911509513855, + "learning_rate": 0.00011057626927183944, + "loss": 0.9598, + "step": 10488 + }, + { + "epoch": 
1.8675213675213675, + "grad_norm": 0.6734410524368286, + "learning_rate": 0.00011056235020235346, + "loss": 0.9475, + "step": 10489 + }, + { + "epoch": 1.8676994301994303, + "grad_norm": 0.6875202655792236, + "learning_rate": 0.0001105484309259202, + "loss": 1.0735, + "step": 10490 + }, + { + "epoch": 1.8678774928774928, + "grad_norm": 0.6908353567123413, + "learning_rate": 0.0001105345114428123, + "loss": 1.0558, + "step": 10491 + }, + { + "epoch": 1.8680555555555556, + "grad_norm": 0.6283324360847473, + "learning_rate": 0.00011052059175330256, + "loss": 0.8872, + "step": 10492 + }, + { + "epoch": 1.868233618233618, + "grad_norm": 0.6422587633132935, + "learning_rate": 0.00011050667185766368, + "loss": 1.1022, + "step": 10493 + }, + { + "epoch": 1.8684116809116809, + "grad_norm": 0.7075859904289246, + "learning_rate": 0.0001104927517561684, + "loss": 1.1389, + "step": 10494 + }, + { + "epoch": 1.8685897435897436, + "grad_norm": 0.5896905064582825, + "learning_rate": 0.00011047883144908944, + "loss": 0.7732, + "step": 10495 + }, + { + "epoch": 1.8687678062678064, + "grad_norm": 0.7647629976272583, + "learning_rate": 0.00011046491093669953, + "loss": 0.9983, + "step": 10496 + }, + { + "epoch": 1.868945868945869, + "grad_norm": 0.5864735841751099, + "learning_rate": 0.00011045099021927144, + "loss": 0.8427, + "step": 10497 + }, + { + "epoch": 1.8691239316239316, + "grad_norm": 0.6766837239265442, + "learning_rate": 0.00011043706929707791, + "loss": 0.9595, + "step": 10498 + }, + { + "epoch": 1.8693019943019942, + "grad_norm": 0.5480074286460876, + "learning_rate": 0.00011042314817039168, + "loss": 0.691, + "step": 10499 + }, + { + "epoch": 1.869480056980057, + "grad_norm": 0.6259615421295166, + "learning_rate": 0.00011040922683948553, + "loss": 0.9991, + "step": 10500 + }, + { + "epoch": 1.8696581196581197, + "grad_norm": 0.5950598120689392, + "learning_rate": 0.00011039530530463218, + "loss": 0.7413, + "step": 10501 + }, + { + "epoch": 1.8698361823361824, + 
"grad_norm": 0.8099377751350403, + "learning_rate": 0.00011038138356610441, + "loss": 1.1351, + "step": 10502 + }, + { + "epoch": 1.8700142450142452, + "grad_norm": 0.6716185212135315, + "learning_rate": 0.00011036746162417501, + "loss": 1.1057, + "step": 10503 + }, + { + "epoch": 1.8701923076923077, + "grad_norm": 0.7993219494819641, + "learning_rate": 0.00011035353947911675, + "loss": 1.2095, + "step": 10504 + }, + { + "epoch": 1.8703703703703702, + "grad_norm": 0.6381276249885559, + "learning_rate": 0.00011033961713120237, + "loss": 1.0261, + "step": 10505 + }, + { + "epoch": 1.870548433048433, + "grad_norm": 0.6326032280921936, + "learning_rate": 0.00011032569458070469, + "loss": 0.8664, + "step": 10506 + }, + { + "epoch": 1.8707264957264957, + "grad_norm": 0.6864820718765259, + "learning_rate": 0.00011031177182789644, + "loss": 0.9959, + "step": 10507 + }, + { + "epoch": 1.8709045584045585, + "grad_norm": 0.6341838240623474, + "learning_rate": 0.00011029784887305048, + "loss": 0.8029, + "step": 10508 + }, + { + "epoch": 1.8710826210826212, + "grad_norm": 0.6559172868728638, + "learning_rate": 0.00011028392571643957, + "loss": 0.9282, + "step": 10509 + }, + { + "epoch": 1.8712606837606838, + "grad_norm": 0.6976849436759949, + "learning_rate": 0.0001102700023583365, + "loss": 1.0198, + "step": 10510 + }, + { + "epoch": 1.8714387464387463, + "grad_norm": 0.7159395217895508, + "learning_rate": 0.00011025607879901402, + "loss": 1.1585, + "step": 10511 + }, + { + "epoch": 1.871616809116809, + "grad_norm": 0.7168624997138977, + "learning_rate": 0.000110242155038745, + "loss": 1.0558, + "step": 10512 + }, + { + "epoch": 1.8717948717948718, + "grad_norm": 0.5784319043159485, + "learning_rate": 0.00011022823107780224, + "loss": 0.9481, + "step": 10513 + }, + { + "epoch": 1.8719729344729346, + "grad_norm": 0.6602259874343872, + "learning_rate": 0.00011021430691645856, + "loss": 1.0538, + "step": 10514 + }, + { + "epoch": 1.8721509971509973, + "grad_norm": 
0.6874588131904602, + "learning_rate": 0.00011020038255498672, + "loss": 1.1396, + "step": 10515 + }, + { + "epoch": 1.8723290598290598, + "grad_norm": 0.7311663031578064, + "learning_rate": 0.00011018645799365956, + "loss": 1.084, + "step": 10516 + }, + { + "epoch": 1.8725071225071224, + "grad_norm": 0.7097118496894836, + "learning_rate": 0.00011017253323274996, + "loss": 0.9872, + "step": 10517 + }, + { + "epoch": 1.8726851851851851, + "grad_norm": 0.6667875051498413, + "learning_rate": 0.00011015860827253068, + "loss": 1.105, + "step": 10518 + }, + { + "epoch": 1.8728632478632479, + "grad_norm": 0.6807677745819092, + "learning_rate": 0.0001101446831132746, + "loss": 0.9093, + "step": 10519 + }, + { + "epoch": 1.8730413105413106, + "grad_norm": 0.6885797381401062, + "learning_rate": 0.0001101307577552545, + "loss": 0.8479, + "step": 10520 + }, + { + "epoch": 1.8732193732193734, + "grad_norm": 0.6269213557243347, + "learning_rate": 0.00011011683219874323, + "loss": 0.9457, + "step": 10521 + }, + { + "epoch": 1.873397435897436, + "grad_norm": 0.7096766829490662, + "learning_rate": 0.00011010290644401364, + "loss": 1.0971, + "step": 10522 + }, + { + "epoch": 1.8735754985754984, + "grad_norm": 0.6909209489822388, + "learning_rate": 0.00011008898049133863, + "loss": 0.9928, + "step": 10523 + }, + { + "epoch": 1.8737535612535612, + "grad_norm": 0.6586211323738098, + "learning_rate": 0.000110075054340991, + "loss": 0.818, + "step": 10524 + }, + { + "epoch": 1.873931623931624, + "grad_norm": 0.5934817790985107, + "learning_rate": 0.0001100611279932436, + "loss": 0.7698, + "step": 10525 + }, + { + "epoch": 1.8741096866096867, + "grad_norm": 0.6361709237098694, + "learning_rate": 0.00011004720144836931, + "loss": 0.9465, + "step": 10526 + }, + { + "epoch": 1.8742877492877494, + "grad_norm": 0.6742212176322937, + "learning_rate": 0.00011003327470664095, + "loss": 1.0998, + "step": 10527 + }, + { + "epoch": 1.874465811965812, + "grad_norm": 0.6634946465492249, + 
"learning_rate": 0.00011001934776833143, + "loss": 0.8328, + "step": 10528 + }, + { + "epoch": 1.8746438746438745, + "grad_norm": 0.6754063963890076, + "learning_rate": 0.0001100054206337136, + "loss": 1.147, + "step": 10529 + }, + { + "epoch": 1.8748219373219372, + "grad_norm": 0.5951135158538818, + "learning_rate": 0.00010999149330306036, + "loss": 0.8956, + "step": 10530 + }, + { + "epoch": 1.875, + "grad_norm": 0.6140317320823669, + "learning_rate": 0.00010997756577664455, + "loss": 0.9368, + "step": 10531 + }, + { + "epoch": 1.8751780626780628, + "grad_norm": 0.6419258713722229, + "learning_rate": 0.00010996363805473904, + "loss": 0.9817, + "step": 10532 + }, + { + "epoch": 1.8753561253561255, + "grad_norm": 0.7173396348953247, + "learning_rate": 0.00010994971013761677, + "loss": 0.9638, + "step": 10533 + }, + { + "epoch": 1.875534188034188, + "grad_norm": 0.8125925660133362, + "learning_rate": 0.0001099357820255506, + "loss": 1.0996, + "step": 10534 + }, + { + "epoch": 1.8757122507122506, + "grad_norm": 0.6191564798355103, + "learning_rate": 0.00010992185371881341, + "loss": 0.8266, + "step": 10535 + }, + { + "epoch": 1.8758903133903133, + "grad_norm": 0.6632885336875916, + "learning_rate": 0.0001099079252176781, + "loss": 1.1884, + "step": 10536 + }, + { + "epoch": 1.876068376068376, + "grad_norm": 0.7323372960090637, + "learning_rate": 0.00010989399652241759, + "loss": 1.0842, + "step": 10537 + }, + { + "epoch": 1.8762464387464388, + "grad_norm": 0.7553854584693909, + "learning_rate": 0.00010988006763330476, + "loss": 0.9948, + "step": 10538 + }, + { + "epoch": 1.8764245014245016, + "grad_norm": 0.5887658596038818, + "learning_rate": 0.00010986613855061255, + "loss": 0.7653, + "step": 10539 + }, + { + "epoch": 1.876602564102564, + "grad_norm": 0.6849574446678162, + "learning_rate": 0.00010985220927461384, + "loss": 1.152, + "step": 10540 + }, + { + "epoch": 1.8767806267806266, + "grad_norm": 0.6985000371932983, + "learning_rate": 0.00010983827980558155, + 
"loss": 0.9869, + "step": 10541 + }, + { + "epoch": 1.8769586894586894, + "grad_norm": 0.6885373592376709, + "learning_rate": 0.00010982435014378858, + "loss": 1.1803, + "step": 10542 + }, + { + "epoch": 1.8771367521367521, + "grad_norm": 0.7610142827033997, + "learning_rate": 0.00010981042028950788, + "loss": 0.9219, + "step": 10543 + }, + { + "epoch": 1.8773148148148149, + "grad_norm": 0.6545612215995789, + "learning_rate": 0.00010979649024301242, + "loss": 1.0337, + "step": 10544 + }, + { + "epoch": 1.8774928774928776, + "grad_norm": 0.7307698130607605, + "learning_rate": 0.00010978256000457505, + "loss": 0.9726, + "step": 10545 + }, + { + "epoch": 1.8776709401709402, + "grad_norm": 0.68310546875, + "learning_rate": 0.00010976862957446877, + "loss": 1.161, + "step": 10546 + }, + { + "epoch": 1.8778490028490027, + "grad_norm": 0.6114758253097534, + "learning_rate": 0.00010975469895296646, + "loss": 0.8863, + "step": 10547 + }, + { + "epoch": 1.8780270655270654, + "grad_norm": 0.732390820980072, + "learning_rate": 0.00010974076814034106, + "loss": 1.0339, + "step": 10548 + }, + { + "epoch": 1.8782051282051282, + "grad_norm": 0.6741712689399719, + "learning_rate": 0.0001097268371368656, + "loss": 1.0024, + "step": 10549 + }, + { + "epoch": 1.878383190883191, + "grad_norm": 0.6374897360801697, + "learning_rate": 0.00010971290594281294, + "loss": 0.91, + "step": 10550 + }, + { + "epoch": 1.8785612535612537, + "grad_norm": 0.6434261202812195, + "learning_rate": 0.00010969897455845608, + "loss": 1.0048, + "step": 10551 + }, + { + "epoch": 1.8787393162393162, + "grad_norm": 0.6573047041893005, + "learning_rate": 0.00010968504298406794, + "loss": 1.118, + "step": 10552 + }, + { + "epoch": 1.8789173789173788, + "grad_norm": 0.6686552166938782, + "learning_rate": 0.00010967111121992152, + "loss": 1.089, + "step": 10553 + }, + { + "epoch": 1.8790954415954415, + "grad_norm": 0.7899606823921204, + "learning_rate": 0.00010965717926628976, + "loss": 1.059, + "step": 10554 + }, 
+ { + "epoch": 1.8792735042735043, + "grad_norm": 0.5808879733085632, + "learning_rate": 0.00010964324712344564, + "loss": 0.9369, + "step": 10555 + }, + { + "epoch": 1.879451566951567, + "grad_norm": 0.6322834491729736, + "learning_rate": 0.00010962931479166211, + "loss": 0.8783, + "step": 10556 + }, + { + "epoch": 1.8796296296296298, + "grad_norm": 0.647002637386322, + "learning_rate": 0.00010961538227121218, + "loss": 0.9468, + "step": 10557 + }, + { + "epoch": 1.8798076923076923, + "grad_norm": 0.6581854820251465, + "learning_rate": 0.0001096014495623688, + "loss": 1.0077, + "step": 10558 + }, + { + "epoch": 1.8799857549857548, + "grad_norm": 0.6879259943962097, + "learning_rate": 0.00010958751666540496, + "loss": 0.976, + "step": 10559 + }, + { + "epoch": 1.8801638176638176, + "grad_norm": 0.7055090665817261, + "learning_rate": 0.00010957358358059364, + "loss": 0.8903, + "step": 10560 + }, + { + "epoch": 1.8803418803418803, + "grad_norm": 0.6865016222000122, + "learning_rate": 0.00010955965030820782, + "loss": 0.9872, + "step": 10561 + }, + { + "epoch": 1.880519943019943, + "grad_norm": 0.663436770439148, + "learning_rate": 0.00010954571684852055, + "loss": 1.0485, + "step": 10562 + }, + { + "epoch": 1.8806980056980058, + "grad_norm": 0.6861656904220581, + "learning_rate": 0.00010953178320180475, + "loss": 1.0691, + "step": 10563 + }, + { + "epoch": 1.8808760683760684, + "grad_norm": 0.8045449256896973, + "learning_rate": 0.0001095178493683335, + "loss": 1.1534, + "step": 10564 + }, + { + "epoch": 1.881054131054131, + "grad_norm": 0.6493151187896729, + "learning_rate": 0.00010950391534837973, + "loss": 0.8756, + "step": 10565 + }, + { + "epoch": 1.8812321937321936, + "grad_norm": 0.7057121992111206, + "learning_rate": 0.00010948998114221651, + "loss": 1.1709, + "step": 10566 + }, + { + "epoch": 1.8814102564102564, + "grad_norm": 0.7708197236061096, + "learning_rate": 0.0001094760467501168, + "loss": 1.0037, + "step": 10567 + }, + { + "epoch": 
1.8815883190883191, + "grad_norm": 0.7234642505645752, + "learning_rate": 0.00010946211217235364, + "loss": 1.0757, + "step": 10568 + }, + { + "epoch": 1.881766381766382, + "grad_norm": 0.6964395642280579, + "learning_rate": 0.00010944817740920006, + "loss": 1.0769, + "step": 10569 + }, + { + "epoch": 1.8819444444444444, + "grad_norm": 0.7465848922729492, + "learning_rate": 0.00010943424246092906, + "loss": 0.9772, + "step": 10570 + }, + { + "epoch": 1.8821225071225072, + "grad_norm": 0.7145788073539734, + "learning_rate": 0.0001094203073278137, + "loss": 0.9638, + "step": 10571 + }, + { + "epoch": 1.8823005698005697, + "grad_norm": 0.7421764135360718, + "learning_rate": 0.00010940637201012698, + "loss": 1.0324, + "step": 10572 + }, + { + "epoch": 1.8824786324786325, + "grad_norm": 0.7373253107070923, + "learning_rate": 0.0001093924365081419, + "loss": 1.1554, + "step": 10573 + }, + { + "epoch": 1.8826566951566952, + "grad_norm": 0.6861984729766846, + "learning_rate": 0.00010937850082213156, + "loss": 0.9899, + "step": 10574 + }, + { + "epoch": 1.882834757834758, + "grad_norm": 0.6173393130302429, + "learning_rate": 0.000109364564952369, + "loss": 0.8495, + "step": 10575 + }, + { + "epoch": 1.8830128205128205, + "grad_norm": 0.6871610879898071, + "learning_rate": 0.00010935062889912723, + "loss": 1.2164, + "step": 10576 + }, + { + "epoch": 1.8831908831908832, + "grad_norm": 0.7062903642654419, + "learning_rate": 0.00010933669266267931, + "loss": 1.1077, + "step": 10577 + }, + { + "epoch": 1.8833689458689458, + "grad_norm": 0.6574689745903015, + "learning_rate": 0.00010932275624329828, + "loss": 0.9326, + "step": 10578 + }, + { + "epoch": 1.8835470085470085, + "grad_norm": 0.636385440826416, + "learning_rate": 0.00010930881964125723, + "loss": 1.0581, + "step": 10579 + }, + { + "epoch": 1.8837250712250713, + "grad_norm": 0.6178432106971741, + "learning_rate": 0.0001092948828568292, + "loss": 1.1288, + "step": 10580 + }, + { + "epoch": 1.883903133903134, + 
"grad_norm": 0.6509431600570679, + "learning_rate": 0.00010928094589028721, + "loss": 1.0113, + "step": 10581 + }, + { + "epoch": 1.8840811965811965, + "grad_norm": 0.6543706059455872, + "learning_rate": 0.00010926700874190441, + "loss": 1.0041, + "step": 10582 + }, + { + "epoch": 1.8842592592592593, + "grad_norm": 0.6815463304519653, + "learning_rate": 0.0001092530714119538, + "loss": 1.0892, + "step": 10583 + }, + { + "epoch": 1.8844373219373218, + "grad_norm": 0.6787421107292175, + "learning_rate": 0.00010923913390070846, + "loss": 1.2693, + "step": 10584 + }, + { + "epoch": 1.8846153846153846, + "grad_norm": 0.6953850984573364, + "learning_rate": 0.00010922519620844151, + "loss": 0.9848, + "step": 10585 + }, + { + "epoch": 1.8847934472934473, + "grad_norm": 0.7061360478401184, + "learning_rate": 0.000109211258335426, + "loss": 0.949, + "step": 10586 + }, + { + "epoch": 1.88497150997151, + "grad_norm": 0.6845372915267944, + "learning_rate": 0.00010919732028193504, + "loss": 0.9554, + "step": 10587 + }, + { + "epoch": 1.8851495726495726, + "grad_norm": 0.6524720788002014, + "learning_rate": 0.00010918338204824165, + "loss": 1.1037, + "step": 10588 + }, + { + "epoch": 1.8853276353276354, + "grad_norm": 0.6410523653030396, + "learning_rate": 0.00010916944363461899, + "loss": 0.9085, + "step": 10589 + }, + { + "epoch": 1.885505698005698, + "grad_norm": 0.7109059691429138, + "learning_rate": 0.00010915550504134014, + "loss": 1.0526, + "step": 10590 + }, + { + "epoch": 1.8856837606837606, + "grad_norm": 0.7781991362571716, + "learning_rate": 0.00010914156626867818, + "loss": 0.9737, + "step": 10591 + }, + { + "epoch": 1.8858618233618234, + "grad_norm": 0.7173767685890198, + "learning_rate": 0.00010912762731690623, + "loss": 0.8862, + "step": 10592 + }, + { + "epoch": 1.8860398860398861, + "grad_norm": 0.7650504112243652, + "learning_rate": 0.00010911368818629732, + "loss": 1.2175, + "step": 10593 + }, + { + "epoch": 1.8862179487179487, + "grad_norm": 
0.6316116452217102, + "learning_rate": 0.00010909974887712468, + "loss": 0.8332, + "step": 10594 + }, + { + "epoch": 1.8863960113960114, + "grad_norm": 0.6504800319671631, + "learning_rate": 0.00010908580938966138, + "loss": 0.8864, + "step": 10595 + }, + { + "epoch": 1.886574074074074, + "grad_norm": 0.675507128238678, + "learning_rate": 0.00010907186972418049, + "loss": 0.8523, + "step": 10596 + }, + { + "epoch": 1.8867521367521367, + "grad_norm": 0.6535763144493103, + "learning_rate": 0.00010905792988095515, + "loss": 1.0786, + "step": 10597 + }, + { + "epoch": 1.8869301994301995, + "grad_norm": 0.7071853280067444, + "learning_rate": 0.0001090439898602585, + "loss": 0.9319, + "step": 10598 + }, + { + "epoch": 1.8871082621082622, + "grad_norm": 0.699466347694397, + "learning_rate": 0.00010903004966236365, + "loss": 0.9573, + "step": 10599 + }, + { + "epoch": 1.8872863247863247, + "grad_norm": 0.7099201083183289, + "learning_rate": 0.00010901610928754375, + "loss": 0.9447, + "step": 10600 + }, + { + "epoch": 1.8874643874643875, + "grad_norm": 0.6140450835227966, + "learning_rate": 0.00010900216873607189, + "loss": 1.0227, + "step": 10601 + }, + { + "epoch": 1.88764245014245, + "grad_norm": 0.6613629460334778, + "learning_rate": 0.00010898822800822127, + "loss": 1.0152, + "step": 10602 + }, + { + "epoch": 1.8878205128205128, + "grad_norm": 0.7334819436073303, + "learning_rate": 0.00010897428710426498, + "loss": 1.1452, + "step": 10603 + }, + { + "epoch": 1.8879985754985755, + "grad_norm": 0.6819368004798889, + "learning_rate": 0.00010896034602447616, + "loss": 1.0504, + "step": 10604 + }, + { + "epoch": 1.8881766381766383, + "grad_norm": 0.6781361103057861, + "learning_rate": 0.00010894640476912799, + "loss": 0.8719, + "step": 10605 + }, + { + "epoch": 1.8883547008547008, + "grad_norm": 0.621960461139679, + "learning_rate": 0.00010893246333849361, + "loss": 0.9264, + "step": 10606 + }, + { + "epoch": 1.8885327635327636, + "grad_norm": 0.6350592374801636, + 
"learning_rate": 0.00010891852173284615, + "loss": 1.0042, + "step": 10607 + }, + { + "epoch": 1.888710826210826, + "grad_norm": 0.6650694012641907, + "learning_rate": 0.00010890457995245879, + "loss": 1.1387, + "step": 10608 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 0.6515723466873169, + "learning_rate": 0.00010889063799760468, + "loss": 0.9508, + "step": 10609 + }, + { + "epoch": 1.8890669515669516, + "grad_norm": 0.6368890404701233, + "learning_rate": 0.000108876695868557, + "loss": 0.8051, + "step": 10610 + }, + { + "epoch": 1.8892450142450143, + "grad_norm": 0.7971013188362122, + "learning_rate": 0.00010886275356558888, + "loss": 0.8629, + "step": 10611 + }, + { + "epoch": 1.8894230769230769, + "grad_norm": 0.6739095449447632, + "learning_rate": 0.00010884881108897353, + "loss": 0.9606, + "step": 10612 + }, + { + "epoch": 1.8896011396011396, + "grad_norm": 0.7754076719284058, + "learning_rate": 0.00010883486843898412, + "loss": 1.0751, + "step": 10613 + }, + { + "epoch": 1.8897792022792022, + "grad_norm": 0.6538285613059998, + "learning_rate": 0.00010882092561589379, + "loss": 0.9288, + "step": 10614 + }, + { + "epoch": 1.889957264957265, + "grad_norm": 0.7373257875442505, + "learning_rate": 0.00010880698261997577, + "loss": 0.9884, + "step": 10615 + }, + { + "epoch": 1.8901353276353277, + "grad_norm": 0.6575660109519958, + "learning_rate": 0.00010879303945150321, + "loss": 1.0307, + "step": 10616 + }, + { + "epoch": 1.8903133903133904, + "grad_norm": 0.7500179409980774, + "learning_rate": 0.00010877909611074932, + "loss": 1.0812, + "step": 10617 + }, + { + "epoch": 1.890491452991453, + "grad_norm": 0.7607308030128479, + "learning_rate": 0.00010876515259798727, + "loss": 0.9746, + "step": 10618 + }, + { + "epoch": 1.8906695156695157, + "grad_norm": 0.7930253744125366, + "learning_rate": 0.00010875120891349024, + "loss": 0.7911, + "step": 10619 + }, + { + "epoch": 1.8908475783475782, + "grad_norm": 0.635254979133606, + "learning_rate": 
0.00010873726505753148, + "loss": 1.0468, + "step": 10620 + }, + { + "epoch": 1.891025641025641, + "grad_norm": 0.7579759359359741, + "learning_rate": 0.00010872332103038414, + "loss": 0.9558, + "step": 10621 + }, + { + "epoch": 1.8912037037037037, + "grad_norm": 0.5841903686523438, + "learning_rate": 0.00010870937683232146, + "loss": 0.913, + "step": 10622 + }, + { + "epoch": 1.8913817663817665, + "grad_norm": 0.7088860273361206, + "learning_rate": 0.00010869543246361664, + "loss": 1.0814, + "step": 10623 + }, + { + "epoch": 1.8915598290598292, + "grad_norm": 0.6713772416114807, + "learning_rate": 0.00010868148792454285, + "loss": 0.9972, + "step": 10624 + }, + { + "epoch": 1.8917378917378918, + "grad_norm": 0.6733243465423584, + "learning_rate": 0.00010866754321537338, + "loss": 0.9596, + "step": 10625 + }, + { + "epoch": 1.8919159544159543, + "grad_norm": 0.7747747898101807, + "learning_rate": 0.00010865359833638138, + "loss": 1.0871, + "step": 10626 + }, + { + "epoch": 1.892094017094017, + "grad_norm": 0.677175760269165, + "learning_rate": 0.00010863965328784011, + "loss": 0.9939, + "step": 10627 + }, + { + "epoch": 1.8922720797720798, + "grad_norm": 0.7883930206298828, + "learning_rate": 0.00010862570807002279, + "loss": 1.0708, + "step": 10628 + }, + { + "epoch": 1.8924501424501425, + "grad_norm": 0.7003030180931091, + "learning_rate": 0.00010861176268320261, + "loss": 0.9791, + "step": 10629 + }, + { + "epoch": 1.8926282051282053, + "grad_norm": 0.7450358271598816, + "learning_rate": 0.00010859781712765284, + "loss": 0.9672, + "step": 10630 + }, + { + "epoch": 1.8928062678062678, + "grad_norm": 0.7776696085929871, + "learning_rate": 0.00010858387140364672, + "loss": 1.1037, + "step": 10631 + }, + { + "epoch": 1.8929843304843303, + "grad_norm": 0.6896173357963562, + "learning_rate": 0.00010856992551145745, + "loss": 1.0048, + "step": 10632 + }, + { + "epoch": 1.893162393162393, + "grad_norm": 0.5997697710990906, + "learning_rate": 0.00010855597945135834, + 
"loss": 0.8025, + "step": 10633 + }, + { + "epoch": 1.8933404558404558, + "grad_norm": 0.8781484365463257, + "learning_rate": 0.00010854203322362251, + "loss": 1.0014, + "step": 10634 + }, + { + "epoch": 1.8935185185185186, + "grad_norm": 0.6348843574523926, + "learning_rate": 0.00010852808682852334, + "loss": 0.9857, + "step": 10635 + }, + { + "epoch": 1.8936965811965814, + "grad_norm": 0.9704267978668213, + "learning_rate": 0.000108514140266334, + "loss": 1.0522, + "step": 10636 + }, + { + "epoch": 1.8938746438746439, + "grad_norm": 0.70372074842453, + "learning_rate": 0.00010850019353732779, + "loss": 1.1044, + "step": 10637 + }, + { + "epoch": 1.8940527065527064, + "grad_norm": 0.6528043150901794, + "learning_rate": 0.00010848624664177793, + "loss": 0.9328, + "step": 10638 + }, + { + "epoch": 1.8942307692307692, + "grad_norm": 0.6299768090248108, + "learning_rate": 0.00010847229957995768, + "loss": 1.0099, + "step": 10639 + }, + { + "epoch": 1.894408831908832, + "grad_norm": 0.6347038149833679, + "learning_rate": 0.00010845835235214034, + "loss": 1.1354, + "step": 10640 + }, + { + "epoch": 1.8945868945868947, + "grad_norm": 0.7087811827659607, + "learning_rate": 0.00010844440495859913, + "loss": 1.0543, + "step": 10641 + }, + { + "epoch": 1.8947649572649574, + "grad_norm": 0.7386305332183838, + "learning_rate": 0.00010843045739960738, + "loss": 0.9192, + "step": 10642 + }, + { + "epoch": 1.89494301994302, + "grad_norm": 0.6047097444534302, + "learning_rate": 0.00010841650967543833, + "loss": 0.8668, + "step": 10643 + }, + { + "epoch": 1.8951210826210825, + "grad_norm": 0.6779503226280212, + "learning_rate": 0.00010840256178636523, + "loss": 0.9263, + "step": 10644 + }, + { + "epoch": 1.8952991452991452, + "grad_norm": 0.7398194670677185, + "learning_rate": 0.00010838861373266138, + "loss": 0.9534, + "step": 10645 + }, + { + "epoch": 1.895477207977208, + "grad_norm": 0.8138558864593506, + "learning_rate": 0.00010837466551460011, + "loss": 0.9835, + "step": 10646 
+ }, + { + "epoch": 1.8956552706552707, + "grad_norm": 0.8847818374633789, + "learning_rate": 0.00010836071713245466, + "loss": 0.9769, + "step": 10647 + }, + { + "epoch": 1.8958333333333335, + "grad_norm": 0.6824164390563965, + "learning_rate": 0.0001083467685864983, + "loss": 0.9901, + "step": 10648 + }, + { + "epoch": 1.896011396011396, + "grad_norm": 0.6318182945251465, + "learning_rate": 0.00010833281987700436, + "loss": 0.7677, + "step": 10649 + }, + { + "epoch": 1.8961894586894585, + "grad_norm": 0.7372074127197266, + "learning_rate": 0.00010831887100424612, + "loss": 0.9858, + "step": 10650 + }, + { + "epoch": 1.8963675213675213, + "grad_norm": 0.7246516346931458, + "learning_rate": 0.00010830492196849688, + "loss": 0.9644, + "step": 10651 + }, + { + "epoch": 1.896545584045584, + "grad_norm": 0.6517095565795898, + "learning_rate": 0.00010829097277002997, + "loss": 1.1733, + "step": 10652 + }, + { + "epoch": 1.8967236467236468, + "grad_norm": 0.6931695342063904, + "learning_rate": 0.00010827702340911867, + "loss": 0.9923, + "step": 10653 + }, + { + "epoch": 1.8969017094017095, + "grad_norm": 0.6210272312164307, + "learning_rate": 0.00010826307388603628, + "loss": 0.8757, + "step": 10654 + }, + { + "epoch": 1.897079772079772, + "grad_norm": 0.7011165618896484, + "learning_rate": 0.00010824912420105611, + "loss": 1.0011, + "step": 10655 + }, + { + "epoch": 1.8972578347578346, + "grad_norm": 0.7431246638298035, + "learning_rate": 0.0001082351743544515, + "loss": 1.1498, + "step": 10656 + }, + { + "epoch": 1.8974358974358974, + "grad_norm": 0.7099978923797607, + "learning_rate": 0.00010822122434649576, + "loss": 1.0673, + "step": 10657 + }, + { + "epoch": 1.89761396011396, + "grad_norm": 0.7375551462173462, + "learning_rate": 0.00010820727417746219, + "loss": 1.0157, + "step": 10658 + }, + { + "epoch": 1.8977920227920229, + "grad_norm": 0.8155642151832581, + "learning_rate": 0.00010819332384762413, + "loss": 1.229, + "step": 10659 + }, + { + "epoch": 
1.8979700854700856, + "grad_norm": 0.6917914748191833, + "learning_rate": 0.00010817937335725493, + "loss": 0.9701, + "step": 10660 + }, + { + "epoch": 1.8981481481481481, + "grad_norm": 0.8498218059539795, + "learning_rate": 0.00010816542270662786, + "loss": 1.0123, + "step": 10661 + }, + { + "epoch": 1.8983262108262107, + "grad_norm": 0.7234359979629517, + "learning_rate": 0.00010815147189601634, + "loss": 1.0755, + "step": 10662 + }, + { + "epoch": 1.8985042735042734, + "grad_norm": 0.6997553110122681, + "learning_rate": 0.00010813752092569365, + "loss": 1.1594, + "step": 10663 + }, + { + "epoch": 1.8986823361823362, + "grad_norm": 0.6519457101821899, + "learning_rate": 0.00010812356979593314, + "loss": 0.9609, + "step": 10664 + }, + { + "epoch": 1.898860398860399, + "grad_norm": 0.7215374708175659, + "learning_rate": 0.00010810961850700813, + "loss": 1.1392, + "step": 10665 + }, + { + "epoch": 1.8990384615384617, + "grad_norm": 0.7766093611717224, + "learning_rate": 0.00010809566705919202, + "loss": 1.0256, + "step": 10666 + }, + { + "epoch": 1.8992165242165242, + "grad_norm": 0.6520358920097351, + "learning_rate": 0.00010808171545275814, + "loss": 1.0434, + "step": 10667 + }, + { + "epoch": 1.8993945868945867, + "grad_norm": 0.7454953193664551, + "learning_rate": 0.00010806776368797982, + "loss": 1.2323, + "step": 10668 + }, + { + "epoch": 1.8995726495726495, + "grad_norm": 0.6891530752182007, + "learning_rate": 0.00010805381176513043, + "loss": 1.1104, + "step": 10669 + }, + { + "epoch": 1.8997507122507122, + "grad_norm": 0.6609626412391663, + "learning_rate": 0.00010803985968448331, + "loss": 0.8565, + "step": 10670 + }, + { + "epoch": 1.899928774928775, + "grad_norm": 0.6650999188423157, + "learning_rate": 0.00010802590744631187, + "loss": 1.1003, + "step": 10671 + }, + { + "epoch": 1.9001068376068377, + "grad_norm": 0.5794292092323303, + "learning_rate": 0.00010801195505088945, + "loss": 0.528, + "step": 10672 + }, + { + "epoch": 1.9002849002849003, + 
"grad_norm": 1.0802743434906006, + "learning_rate": 0.00010799800249848939, + "loss": 0.8861, + "step": 10673 + }, + { + "epoch": 1.9004629629629628, + "grad_norm": 0.650833249092102, + "learning_rate": 0.00010798404978938513, + "loss": 0.9962, + "step": 10674 + }, + { + "epoch": 1.9006410256410255, + "grad_norm": 0.7290451526641846, + "learning_rate": 0.00010797009692384994, + "loss": 1.0764, + "step": 10675 + }, + { + "epoch": 1.9008190883190883, + "grad_norm": 0.6273928880691528, + "learning_rate": 0.00010795614390215727, + "loss": 0.9478, + "step": 10676 + }, + { + "epoch": 1.900997150997151, + "grad_norm": 0.6939455270767212, + "learning_rate": 0.00010794219072458052, + "loss": 0.8991, + "step": 10677 + }, + { + "epoch": 1.9011752136752138, + "grad_norm": 0.7455828189849854, + "learning_rate": 0.00010792823739139302, + "loss": 0.8902, + "step": 10678 + }, + { + "epoch": 1.9013532763532763, + "grad_norm": 0.6894607543945312, + "learning_rate": 0.00010791428390286817, + "loss": 0.9355, + "step": 10679 + }, + { + "epoch": 1.901531339031339, + "grad_norm": 0.6844658851623535, + "learning_rate": 0.00010790033025927936, + "loss": 0.9835, + "step": 10680 + }, + { + "epoch": 1.9017094017094016, + "grad_norm": 0.6646730899810791, + "learning_rate": 0.00010788637646090001, + "loss": 0.9376, + "step": 10681 + }, + { + "epoch": 1.9018874643874644, + "grad_norm": 0.6494864225387573, + "learning_rate": 0.00010787242250800349, + "loss": 0.8533, + "step": 10682 + }, + { + "epoch": 1.9020655270655271, + "grad_norm": 0.686198353767395, + "learning_rate": 0.0001078584684008632, + "loss": 0.8075, + "step": 10683 + }, + { + "epoch": 1.9022435897435899, + "grad_norm": 0.7014855742454529, + "learning_rate": 0.00010784451413975256, + "loss": 1.0805, + "step": 10684 + }, + { + "epoch": 1.9024216524216524, + "grad_norm": 0.7191864252090454, + "learning_rate": 0.00010783055972494496, + "loss": 0.9375, + "step": 10685 + }, + { + "epoch": 1.9025997150997151, + "grad_norm": 
0.8114212155342102, + "learning_rate": 0.00010781660515671379, + "loss": 0.9716, + "step": 10686 + }, + { + "epoch": 1.9027777777777777, + "grad_norm": 0.7423529028892517, + "learning_rate": 0.0001078026504353325, + "loss": 0.9066, + "step": 10687 + }, + { + "epoch": 1.9029558404558404, + "grad_norm": 0.6517882347106934, + "learning_rate": 0.00010778869556107447, + "loss": 0.9908, + "step": 10688 + }, + { + "epoch": 1.9031339031339032, + "grad_norm": 0.6983367800712585, + "learning_rate": 0.00010777474053421315, + "loss": 1.1048, + "step": 10689 + }, + { + "epoch": 1.903311965811966, + "grad_norm": 0.597766101360321, + "learning_rate": 0.00010776078535502193, + "loss": 0.84, + "step": 10690 + }, + { + "epoch": 1.9034900284900285, + "grad_norm": 0.7335455417633057, + "learning_rate": 0.00010774683002377422, + "loss": 1.0387, + "step": 10691 + }, + { + "epoch": 1.9036680911680912, + "grad_norm": 0.6742176413536072, + "learning_rate": 0.0001077328745407435, + "loss": 0.9743, + "step": 10692 + }, + { + "epoch": 1.9038461538461537, + "grad_norm": 0.7954961657524109, + "learning_rate": 0.00010771891890620316, + "loss": 1.1025, + "step": 10693 + }, + { + "epoch": 1.9040242165242165, + "grad_norm": 0.733351469039917, + "learning_rate": 0.00010770496312042664, + "loss": 1.028, + "step": 10694 + }, + { + "epoch": 1.9042022792022792, + "grad_norm": 0.7059772610664368, + "learning_rate": 0.00010769100718368734, + "loss": 1.0103, + "step": 10695 + }, + { + "epoch": 1.904380341880342, + "grad_norm": 0.6234813332557678, + "learning_rate": 0.00010767705109625877, + "loss": 0.6893, + "step": 10696 + }, + { + "epoch": 1.9045584045584045, + "grad_norm": 0.6670311689376831, + "learning_rate": 0.0001076630948584143, + "loss": 1.1386, + "step": 10697 + }, + { + "epoch": 1.9047364672364673, + "grad_norm": 0.7444894909858704, + "learning_rate": 0.00010764913847042744, + "loss": 0.8524, + "step": 10698 + }, + { + "epoch": 1.9049145299145298, + "grad_norm": 0.6252964735031128, + 
"learning_rate": 0.00010763518193257158, + "loss": 0.9407, + "step": 10699 + }, + { + "epoch": 1.9050925925925926, + "grad_norm": 0.7794382572174072, + "learning_rate": 0.0001076212252451202, + "loss": 1.05, + "step": 10700 + }, + { + "epoch": 1.9052706552706553, + "grad_norm": 0.6313693523406982, + "learning_rate": 0.00010760726840834671, + "loss": 0.8667, + "step": 10701 + }, + { + "epoch": 1.905448717948718, + "grad_norm": 0.6766461730003357, + "learning_rate": 0.00010759331142252462, + "loss": 0.9675, + "step": 10702 + }, + { + "epoch": 1.9056267806267806, + "grad_norm": 0.7457365393638611, + "learning_rate": 0.00010757935428792739, + "loss": 0.9177, + "step": 10703 + }, + { + "epoch": 1.9058048433048433, + "grad_norm": 0.6649872064590454, + "learning_rate": 0.00010756539700482844, + "loss": 0.8703, + "step": 10704 + }, + { + "epoch": 1.9059829059829059, + "grad_norm": 0.8418740034103394, + "learning_rate": 0.00010755143957350127, + "loss": 0.8993, + "step": 10705 + }, + { + "epoch": 1.9061609686609686, + "grad_norm": 0.6767167448997498, + "learning_rate": 0.00010753748199421929, + "loss": 1.0063, + "step": 10706 + }, + { + "epoch": 1.9063390313390314, + "grad_norm": 0.6959242820739746, + "learning_rate": 0.00010752352426725603, + "loss": 1.0516, + "step": 10707 + }, + { + "epoch": 1.9065170940170941, + "grad_norm": 0.7106529474258423, + "learning_rate": 0.00010750956639288493, + "loss": 0.9596, + "step": 10708 + }, + { + "epoch": 1.9066951566951567, + "grad_norm": 0.7611243724822998, + "learning_rate": 0.00010749560837137949, + "loss": 1.0739, + "step": 10709 + }, + { + "epoch": 1.9068732193732194, + "grad_norm": 0.6684338450431824, + "learning_rate": 0.00010748165020301317, + "loss": 1.1437, + "step": 10710 + }, + { + "epoch": 1.907051282051282, + "grad_norm": 0.5957385897636414, + "learning_rate": 0.00010746769188805945, + "loss": 0.8802, + "step": 10711 + }, + { + "epoch": 1.9072293447293447, + "grad_norm": 0.69919353723526, + "learning_rate": 
0.00010745373342679184, + "loss": 1.1891, + "step": 10712 + }, + { + "epoch": 1.9074074074074074, + "grad_norm": 0.7562127709388733, + "learning_rate": 0.0001074397748194838, + "loss": 0.8717, + "step": 10713 + }, + { + "epoch": 1.9075854700854702, + "grad_norm": 0.6420038938522339, + "learning_rate": 0.00010742581606640882, + "loss": 1.1196, + "step": 10714 + }, + { + "epoch": 1.9077635327635327, + "grad_norm": 0.7545611262321472, + "learning_rate": 0.00010741185716784039, + "loss": 1.161, + "step": 10715 + }, + { + "epoch": 1.9079415954415955, + "grad_norm": 0.6467727422714233, + "learning_rate": 0.000107397898124052, + "loss": 0.8029, + "step": 10716 + }, + { + "epoch": 1.908119658119658, + "grad_norm": 0.6129235625267029, + "learning_rate": 0.00010738393893531722, + "loss": 0.8802, + "step": 10717 + }, + { + "epoch": 1.9082977207977208, + "grad_norm": 0.6416113376617432, + "learning_rate": 0.00010736997960190946, + "loss": 0.8465, + "step": 10718 + }, + { + "epoch": 1.9084757834757835, + "grad_norm": 0.6609050631523132, + "learning_rate": 0.00010735602012410229, + "loss": 0.9484, + "step": 10719 + }, + { + "epoch": 1.9086538461538463, + "grad_norm": 0.6302639842033386, + "learning_rate": 0.00010734206050216913, + "loss": 0.898, + "step": 10720 + }, + { + "epoch": 1.9088319088319088, + "grad_norm": 0.7291215658187866, + "learning_rate": 0.00010732810073638358, + "loss": 0.9544, + "step": 10721 + }, + { + "epoch": 1.9090099715099715, + "grad_norm": 0.6436966061592102, + "learning_rate": 0.0001073141408270191, + "loss": 0.956, + "step": 10722 + }, + { + "epoch": 1.909188034188034, + "grad_norm": 0.6247875094413757, + "learning_rate": 0.00010730018077434924, + "loss": 0.8704, + "step": 10723 + }, + { + "epoch": 1.9093660968660968, + "grad_norm": 0.7599029541015625, + "learning_rate": 0.00010728622057864753, + "loss": 1.2024, + "step": 10724 + }, + { + "epoch": 1.9095441595441596, + "grad_norm": 0.6894544959068298, + "learning_rate": 0.00010727226024018744, + 
"loss": 1.1226, + "step": 10725 + }, + { + "epoch": 1.9097222222222223, + "grad_norm": 0.6920733451843262, + "learning_rate": 0.0001072582997592425, + "loss": 0.7682, + "step": 10726 + }, + { + "epoch": 1.9099002849002849, + "grad_norm": 0.6013005375862122, + "learning_rate": 0.00010724433913608627, + "loss": 0.9462, + "step": 10727 + }, + { + "epoch": 1.9100783475783476, + "grad_norm": 0.7466302514076233, + "learning_rate": 0.00010723037837099225, + "loss": 0.9507, + "step": 10728 + }, + { + "epoch": 1.9102564102564101, + "grad_norm": 0.7070091962814331, + "learning_rate": 0.00010721641746423401, + "loss": 1.0704, + "step": 10729 + }, + { + "epoch": 1.9104344729344729, + "grad_norm": 0.6747950315475464, + "learning_rate": 0.00010720245641608506, + "loss": 0.7899, + "step": 10730 + }, + { + "epoch": 1.9106125356125356, + "grad_norm": 0.7338371276855469, + "learning_rate": 0.00010718849522681891, + "loss": 0.9574, + "step": 10731 + }, + { + "epoch": 1.9107905982905984, + "grad_norm": 0.6923216581344604, + "learning_rate": 0.00010717453389670915, + "loss": 1.0725, + "step": 10732 + }, + { + "epoch": 1.910968660968661, + "grad_norm": 0.6050783395767212, + "learning_rate": 0.0001071605724260293, + "loss": 0.9224, + "step": 10733 + }, + { + "epoch": 1.9111467236467237, + "grad_norm": 0.6854597330093384, + "learning_rate": 0.00010714661081505291, + "loss": 0.9749, + "step": 10734 + }, + { + "epoch": 1.9113247863247862, + "grad_norm": 0.7661508321762085, + "learning_rate": 0.00010713264906405351, + "loss": 1.1564, + "step": 10735 + }, + { + "epoch": 1.911502849002849, + "grad_norm": 0.6389622688293457, + "learning_rate": 0.00010711868717330467, + "loss": 0.8148, + "step": 10736 + }, + { + "epoch": 1.9116809116809117, + "grad_norm": 0.6318161487579346, + "learning_rate": 0.00010710472514307996, + "loss": 0.7833, + "step": 10737 + }, + { + "epoch": 1.9118589743589745, + "grad_norm": 0.8646727800369263, + "learning_rate": 0.00010709076297365292, + "loss": 1.2682, + "step": 
10738 + }, + { + "epoch": 1.9120370370370372, + "grad_norm": 0.6085501909255981, + "learning_rate": 0.0001070768006652971, + "loss": 0.8706, + "step": 10739 + }, + { + "epoch": 1.9122150997150997, + "grad_norm": 0.8259731531143188, + "learning_rate": 0.00010706283821828607, + "loss": 0.9014, + "step": 10740 + }, + { + "epoch": 1.9123931623931623, + "grad_norm": 0.6509148478507996, + "learning_rate": 0.0001070488756328934, + "loss": 0.8814, + "step": 10741 + }, + { + "epoch": 1.912571225071225, + "grad_norm": 0.7241966128349304, + "learning_rate": 0.00010703491290939264, + "loss": 0.9925, + "step": 10742 + }, + { + "epoch": 1.9127492877492878, + "grad_norm": 0.7736822366714478, + "learning_rate": 0.00010702095004805738, + "loss": 1.0881, + "step": 10743 + }, + { + "epoch": 1.9129273504273505, + "grad_norm": 0.6912824511528015, + "learning_rate": 0.00010700698704916123, + "loss": 1.2334, + "step": 10744 + }, + { + "epoch": 1.9131054131054133, + "grad_norm": 0.825065553188324, + "learning_rate": 0.0001069930239129777, + "loss": 0.9783, + "step": 10745 + }, + { + "epoch": 1.9132834757834758, + "grad_norm": 0.7650560140609741, + "learning_rate": 0.00010697906063978038, + "loss": 0.9788, + "step": 10746 + }, + { + "epoch": 1.9134615384615383, + "grad_norm": 0.7368232607841492, + "learning_rate": 0.00010696509722984287, + "loss": 0.8704, + "step": 10747 + }, + { + "epoch": 1.913639601139601, + "grad_norm": 0.6630628108978271, + "learning_rate": 0.00010695113368343875, + "loss": 1.1993, + "step": 10748 + }, + { + "epoch": 1.9138176638176638, + "grad_norm": 0.6842190027236938, + "learning_rate": 0.0001069371700008416, + "loss": 0.9128, + "step": 10749 + }, + { + "epoch": 1.9139957264957266, + "grad_norm": 0.591655969619751, + "learning_rate": 0.00010692320618232503, + "loss": 1.0607, + "step": 10750 + }, + { + "epoch": 1.9141737891737893, + "grad_norm": 0.74644535779953, + "learning_rate": 0.0001069092422281626, + "loss": 1.0937, + "step": 10751 + }, + { + "epoch": 
1.9143518518518519, + "grad_norm": 0.7123813629150391, + "learning_rate": 0.00010689527813862792, + "loss": 0.9043, + "step": 10752 + }, + { + "epoch": 1.9145299145299144, + "grad_norm": 0.6850089430809021, + "learning_rate": 0.0001068813139139946, + "loss": 1.0908, + "step": 10753 + }, + { + "epoch": 1.9147079772079771, + "grad_norm": 0.5882078409194946, + "learning_rate": 0.00010686734955453623, + "loss": 0.829, + "step": 10754 + }, + { + "epoch": 1.91488603988604, + "grad_norm": 0.6741717457771301, + "learning_rate": 0.00010685338506052642, + "loss": 0.9197, + "step": 10755 + }, + { + "epoch": 1.9150641025641026, + "grad_norm": 0.6597354412078857, + "learning_rate": 0.00010683942043223876, + "loss": 0.8778, + "step": 10756 + }, + { + "epoch": 1.9152421652421654, + "grad_norm": 0.6682151556015015, + "learning_rate": 0.00010682545566994684, + "loss": 0.9305, + "step": 10757 + }, + { + "epoch": 1.915420227920228, + "grad_norm": 0.8283176422119141, + "learning_rate": 0.00010681149077392431, + "loss": 1.0164, + "step": 10758 + }, + { + "epoch": 1.9155982905982905, + "grad_norm": 0.648845374584198, + "learning_rate": 0.00010679752574444477, + "loss": 1.0114, + "step": 10759 + }, + { + "epoch": 1.9157763532763532, + "grad_norm": 0.755913496017456, + "learning_rate": 0.00010678356058178182, + "loss": 1.1142, + "step": 10760 + }, + { + "epoch": 1.915954415954416, + "grad_norm": 0.7334780097007751, + "learning_rate": 0.00010676959528620911, + "loss": 0.8758, + "step": 10761 + }, + { + "epoch": 1.9161324786324787, + "grad_norm": 0.9132041335105896, + "learning_rate": 0.00010675562985800025, + "loss": 0.995, + "step": 10762 + }, + { + "epoch": 1.9163105413105415, + "grad_norm": 0.7070860266685486, + "learning_rate": 0.00010674166429742882, + "loss": 0.9856, + "step": 10763 + }, + { + "epoch": 1.916488603988604, + "grad_norm": 0.7143638134002686, + "learning_rate": 0.00010672769860476853, + "loss": 1.0612, + "step": 10764 + }, + { + "epoch": 1.9166666666666665, + 
"grad_norm": 0.815717339515686, + "learning_rate": 0.00010671373278029293, + "loss": 1.1539, + "step": 10765 + }, + { + "epoch": 1.9168447293447293, + "grad_norm": 0.6379499435424805, + "learning_rate": 0.0001066997668242757, + "loss": 0.8295, + "step": 10766 + }, + { + "epoch": 1.917022792022792, + "grad_norm": 0.6482511758804321, + "learning_rate": 0.00010668580073699044, + "loss": 1.0079, + "step": 10767 + }, + { + "epoch": 1.9172008547008548, + "grad_norm": 0.7382873296737671, + "learning_rate": 0.00010667183451871082, + "loss": 0.8973, + "step": 10768 + }, + { + "epoch": 1.9173789173789175, + "grad_norm": 0.7818579077720642, + "learning_rate": 0.00010665786816971044, + "loss": 1.2131, + "step": 10769 + }, + { + "epoch": 1.91755698005698, + "grad_norm": 0.6960901021957397, + "learning_rate": 0.000106643901690263, + "loss": 1.1466, + "step": 10770 + }, + { + "epoch": 1.9177350427350426, + "grad_norm": 0.696966826915741, + "learning_rate": 0.00010662993508064208, + "loss": 0.854, + "step": 10771 + }, + { + "epoch": 1.9179131054131053, + "grad_norm": 0.6745442152023315, + "learning_rate": 0.00010661596834112133, + "loss": 0.9559, + "step": 10772 + }, + { + "epoch": 1.918091168091168, + "grad_norm": 0.7436230778694153, + "learning_rate": 0.00010660200147197447, + "loss": 1.1367, + "step": 10773 + }, + { + "epoch": 1.9182692307692308, + "grad_norm": 0.6051676869392395, + "learning_rate": 0.00010658803447347509, + "loss": 1.05, + "step": 10774 + }, + { + "epoch": 1.9184472934472936, + "grad_norm": 0.5662530660629272, + "learning_rate": 0.00010657406734589686, + "loss": 0.8697, + "step": 10775 + }, + { + "epoch": 1.9186253561253561, + "grad_norm": 0.6640757322311401, + "learning_rate": 0.00010656010008951344, + "loss": 1.0636, + "step": 10776 + }, + { + "epoch": 1.9188034188034186, + "grad_norm": 0.6994011998176575, + "learning_rate": 0.00010654613270459848, + "loss": 0.9326, + "step": 10777 + }, + { + "epoch": 1.9189814814814814, + "grad_norm": 0.6827420592308044, + 
"learning_rate": 0.00010653216519142563, + "loss": 0.8667, + "step": 10778 + }, + { + "epoch": 1.9191595441595442, + "grad_norm": 0.6814691424369812, + "learning_rate": 0.00010651819755026862, + "loss": 0.828, + "step": 10779 + }, + { + "epoch": 1.919337606837607, + "grad_norm": 0.7033611536026001, + "learning_rate": 0.00010650422978140103, + "loss": 1.0427, + "step": 10780 + }, + { + "epoch": 1.9195156695156697, + "grad_norm": 0.7098833322525024, + "learning_rate": 0.00010649026188509657, + "loss": 1.1723, + "step": 10781 + }, + { + "epoch": 1.9196937321937322, + "grad_norm": 0.7184767723083496, + "learning_rate": 0.00010647629386162893, + "loss": 0.852, + "step": 10782 + }, + { + "epoch": 1.9198717948717947, + "grad_norm": 0.6682565808296204, + "learning_rate": 0.00010646232571127175, + "loss": 0.8827, + "step": 10783 + }, + { + "epoch": 1.9200498575498575, + "grad_norm": 0.6699280142784119, + "learning_rate": 0.00010644835743429873, + "loss": 0.8346, + "step": 10784 + }, + { + "epoch": 1.9202279202279202, + "grad_norm": 0.8041857481002808, + "learning_rate": 0.00010643438903098355, + "loss": 0.9622, + "step": 10785 + }, + { + "epoch": 1.920405982905983, + "grad_norm": 0.7315110564231873, + "learning_rate": 0.00010642042050159986, + "loss": 1.0443, + "step": 10786 + }, + { + "epoch": 1.9205840455840457, + "grad_norm": 0.5850204229354858, + "learning_rate": 0.0001064064518464214, + "loss": 1.0155, + "step": 10787 + }, + { + "epoch": 1.9207621082621082, + "grad_norm": 0.7320640683174133, + "learning_rate": 0.00010639248306572178, + "loss": 1.1556, + "step": 10788 + }, + { + "epoch": 1.9209401709401708, + "grad_norm": 0.689804196357727, + "learning_rate": 0.00010637851415977478, + "loss": 1.1058, + "step": 10789 + }, + { + "epoch": 1.9211182336182335, + "grad_norm": 0.6433262228965759, + "learning_rate": 0.000106364545128854, + "loss": 1.0916, + "step": 10790 + }, + { + "epoch": 1.9212962962962963, + "grad_norm": 0.6802626252174377, + "learning_rate": 
0.00010635057597323323, + "loss": 1.126, + "step": 10791 + }, + { + "epoch": 1.921474358974359, + "grad_norm": 0.7503384351730347, + "learning_rate": 0.00010633660669318608, + "loss": 0.9354, + "step": 10792 + }, + { + "epoch": 1.9216524216524218, + "grad_norm": 0.6370253562927246, + "learning_rate": 0.00010632263728898629, + "loss": 0.9976, + "step": 10793 + }, + { + "epoch": 1.9218304843304843, + "grad_norm": 0.7566042542457581, + "learning_rate": 0.00010630866776090755, + "loss": 1.0311, + "step": 10794 + }, + { + "epoch": 1.922008547008547, + "grad_norm": 0.7011943459510803, + "learning_rate": 0.0001062946981092236, + "loss": 0.8777, + "step": 10795 + }, + { + "epoch": 1.9221866096866096, + "grad_norm": 0.6621114015579224, + "learning_rate": 0.00010628072833420811, + "loss": 0.9615, + "step": 10796 + }, + { + "epoch": 1.9223646723646723, + "grad_norm": 0.6863150000572205, + "learning_rate": 0.00010626675843613478, + "loss": 1.071, + "step": 10797 + }, + { + "epoch": 1.922542735042735, + "grad_norm": 0.597970724105835, + "learning_rate": 0.00010625278841527733, + "loss": 0.8661, + "step": 10798 + }, + { + "epoch": 1.9227207977207978, + "grad_norm": 0.5958755612373352, + "learning_rate": 0.00010623881827190947, + "loss": 0.9075, + "step": 10799 + }, + { + "epoch": 1.9228988603988604, + "grad_norm": 0.7764523029327393, + "learning_rate": 0.00010622484800630494, + "loss": 1.0576, + "step": 10800 + }, + { + "epoch": 1.9230769230769231, + "grad_norm": 0.774156391620636, + "learning_rate": 0.00010621087761873748, + "loss": 0.9273, + "step": 10801 + }, + { + "epoch": 1.9232549857549857, + "grad_norm": 0.6321687698364258, + "learning_rate": 0.00010619690710948074, + "loss": 0.8805, + "step": 10802 + }, + { + "epoch": 1.9234330484330484, + "grad_norm": 0.659538984298706, + "learning_rate": 0.00010618293647880846, + "loss": 0.9845, + "step": 10803 + }, + { + "epoch": 1.9236111111111112, + "grad_norm": 0.6931299567222595, + "learning_rate": 0.00010616896572699442, + 
"loss": 1.2005, + "step": 10804 + }, + { + "epoch": 1.923789173789174, + "grad_norm": 0.6054762005805969, + "learning_rate": 0.00010615499485431228, + "loss": 0.825, + "step": 10805 + }, + { + "epoch": 1.9239672364672364, + "grad_norm": 0.6631526947021484, + "learning_rate": 0.00010614102386103584, + "loss": 0.9149, + "step": 10806 + }, + { + "epoch": 1.9241452991452992, + "grad_norm": 0.6667893528938293, + "learning_rate": 0.00010612705274743878, + "loss": 1.014, + "step": 10807 + }, + { + "epoch": 1.9243233618233617, + "grad_norm": 0.861302375793457, + "learning_rate": 0.00010611308151379482, + "loss": 1.0809, + "step": 10808 + }, + { + "epoch": 1.9245014245014245, + "grad_norm": 0.6997994780540466, + "learning_rate": 0.00010609911016037777, + "loss": 0.8897, + "step": 10809 + }, + { + "epoch": 1.9246794871794872, + "grad_norm": 0.5689206123352051, + "learning_rate": 0.00010608513868746131, + "loss": 0.7517, + "step": 10810 + }, + { + "epoch": 1.92485754985755, + "grad_norm": 0.5972287654876709, + "learning_rate": 0.00010607116709531918, + "loss": 0.9015, + "step": 10811 + }, + { + "epoch": 1.9250356125356125, + "grad_norm": 0.7115643620491028, + "learning_rate": 0.00010605719538422519, + "loss": 0.6974, + "step": 10812 + }, + { + "epoch": 1.9252136752136753, + "grad_norm": 0.6548098921775818, + "learning_rate": 0.00010604322355445297, + "loss": 0.7075, + "step": 10813 + }, + { + "epoch": 1.9253917378917378, + "grad_norm": 0.6666337847709656, + "learning_rate": 0.00010602925160627639, + "loss": 1.0389, + "step": 10814 + }, + { + "epoch": 1.9255698005698005, + "grad_norm": 0.7754444479942322, + "learning_rate": 0.00010601527953996913, + "loss": 1.0674, + "step": 10815 + }, + { + "epoch": 1.9257478632478633, + "grad_norm": 0.6602712869644165, + "learning_rate": 0.00010600130735580498, + "loss": 1.2622, + "step": 10816 + }, + { + "epoch": 1.925925925925926, + "grad_norm": 0.6974020004272461, + "learning_rate": 0.00010598733505405767, + "loss": 0.9748, + "step": 
10817 + }, + { + "epoch": 1.9261039886039886, + "grad_norm": 0.6236271858215332, + "learning_rate": 0.00010597336263500095, + "loss": 0.9463, + "step": 10818 + }, + { + "epoch": 1.9262820512820513, + "grad_norm": 0.6856079697608948, + "learning_rate": 0.00010595939009890859, + "loss": 0.9484, + "step": 10819 + }, + { + "epoch": 1.9264601139601139, + "grad_norm": 0.7300925850868225, + "learning_rate": 0.00010594541744605437, + "loss": 0.9702, + "step": 10820 + }, + { + "epoch": 1.9266381766381766, + "grad_norm": 0.6546478867530823, + "learning_rate": 0.00010593144467671208, + "loss": 0.8235, + "step": 10821 + }, + { + "epoch": 1.9268162393162394, + "grad_norm": 0.7215169072151184, + "learning_rate": 0.00010591747179115543, + "loss": 0.9986, + "step": 10822 + }, + { + "epoch": 1.926994301994302, + "grad_norm": 0.7304712533950806, + "learning_rate": 0.00010590349878965822, + "loss": 1.099, + "step": 10823 + }, + { + "epoch": 1.9271723646723646, + "grad_norm": 0.5883305668830872, + "learning_rate": 0.0001058895256724942, + "loss": 1.0647, + "step": 10824 + }, + { + "epoch": 1.9273504273504274, + "grad_norm": 0.8067272305488586, + "learning_rate": 0.00010587555243993716, + "loss": 1.0295, + "step": 10825 + }, + { + "epoch": 1.92752849002849, + "grad_norm": 0.6607550978660583, + "learning_rate": 0.00010586157909226089, + "loss": 0.8669, + "step": 10826 + }, + { + "epoch": 1.9277065527065527, + "grad_norm": 0.7256106734275818, + "learning_rate": 0.00010584760562973914, + "loss": 1.1674, + "step": 10827 + }, + { + "epoch": 1.9278846153846154, + "grad_norm": 0.6584621071815491, + "learning_rate": 0.00010583363205264574, + "loss": 0.8901, + "step": 10828 + }, + { + "epoch": 1.9280626780626782, + "grad_norm": 0.7200617790222168, + "learning_rate": 0.00010581965836125439, + "loss": 1.0463, + "step": 10829 + }, + { + "epoch": 1.9282407407407407, + "grad_norm": 0.7244223952293396, + "learning_rate": 0.00010580568455583894, + "loss": 1.0973, + "step": 10830 + }, + { + "epoch": 
1.9284188034188035, + "grad_norm": 0.7678009867668152, + "learning_rate": 0.00010579171063667317, + "loss": 1.1753, + "step": 10831 + }, + { + "epoch": 1.928596866096866, + "grad_norm": 0.6455881595611572, + "learning_rate": 0.00010577773660403085, + "loss": 0.8988, + "step": 10832 + }, + { + "epoch": 1.9287749287749287, + "grad_norm": 0.6804864406585693, + "learning_rate": 0.0001057637624581858, + "loss": 0.8156, + "step": 10833 + }, + { + "epoch": 1.9289529914529915, + "grad_norm": 0.7874828577041626, + "learning_rate": 0.00010574978819941176, + "loss": 1.1876, + "step": 10834 + }, + { + "epoch": 1.9291310541310542, + "grad_norm": 0.7396490573883057, + "learning_rate": 0.00010573581382798261, + "loss": 0.8709, + "step": 10835 + }, + { + "epoch": 1.9293091168091168, + "grad_norm": 0.6800381541252136, + "learning_rate": 0.00010572183934417209, + "loss": 0.9906, + "step": 10836 + }, + { + "epoch": 1.9294871794871795, + "grad_norm": 0.7077754139900208, + "learning_rate": 0.000105707864748254, + "loss": 0.9785, + "step": 10837 + }, + { + "epoch": 1.929665242165242, + "grad_norm": 0.693249523639679, + "learning_rate": 0.00010569389004050216, + "loss": 0.9515, + "step": 10838 + }, + { + "epoch": 1.9298433048433048, + "grad_norm": 0.706924319267273, + "learning_rate": 0.00010567991522119037, + "loss": 1.074, + "step": 10839 + }, + { + "epoch": 1.9300213675213675, + "grad_norm": 0.6504101157188416, + "learning_rate": 0.00010566594029059244, + "loss": 1.0635, + "step": 10840 + }, + { + "epoch": 1.9301994301994303, + "grad_norm": 0.7620238661766052, + "learning_rate": 0.00010565196524898219, + "loss": 0.944, + "step": 10841 + }, + { + "epoch": 1.9303774928774928, + "grad_norm": 0.6713484525680542, + "learning_rate": 0.00010563799009663344, + "loss": 0.749, + "step": 10842 + }, + { + "epoch": 1.9305555555555556, + "grad_norm": 0.9279242157936096, + "learning_rate": 0.00010562401483381997, + "loss": 0.961, + "step": 10843 + }, + { + "epoch": 1.930733618233618, + "grad_norm": 
0.6710723638534546, + "learning_rate": 0.00010561003946081558, + "loss": 1.1288, + "step": 10844 + }, + { + "epoch": 1.9309116809116809, + "grad_norm": 0.7751701474189758, + "learning_rate": 0.00010559606397789416, + "loss": 0.9435, + "step": 10845 + }, + { + "epoch": 1.9310897435897436, + "grad_norm": 0.6741766929626465, + "learning_rate": 0.00010558208838532948, + "loss": 1.0299, + "step": 10846 + }, + { + "epoch": 1.9312678062678064, + "grad_norm": 0.6988041400909424, + "learning_rate": 0.00010556811268339539, + "loss": 1.0236, + "step": 10847 + }, + { + "epoch": 1.931445868945869, + "grad_norm": 0.6353505253791809, + "learning_rate": 0.00010555413687236568, + "loss": 1.0361, + "step": 10848 + }, + { + "epoch": 1.9316239316239316, + "grad_norm": 0.7162703275680542, + "learning_rate": 0.0001055401609525142, + "loss": 1.0931, + "step": 10849 + }, + { + "epoch": 1.9318019943019942, + "grad_norm": 0.61545330286026, + "learning_rate": 0.00010552618492411476, + "loss": 0.8829, + "step": 10850 + }, + { + "epoch": 1.931980056980057, + "grad_norm": 0.6304612159729004, + "learning_rate": 0.00010551220878744124, + "loss": 0.8574, + "step": 10851 + }, + { + "epoch": 1.9321581196581197, + "grad_norm": 0.6372067928314209, + "learning_rate": 0.00010549823254276743, + "loss": 1.0949, + "step": 10852 + }, + { + "epoch": 1.9323361823361824, + "grad_norm": 0.6952856779098511, + "learning_rate": 0.00010548425619036715, + "loss": 0.9232, + "step": 10853 + }, + { + "epoch": 1.9325142450142452, + "grad_norm": 0.6510106325149536, + "learning_rate": 0.00010547027973051427, + "loss": 1.0753, + "step": 10854 + }, + { + "epoch": 1.9326923076923077, + "grad_norm": 0.6377716064453125, + "learning_rate": 0.00010545630316348263, + "loss": 0.8466, + "step": 10855 + }, + { + "epoch": 1.9328703703703702, + "grad_norm": 0.7366968393325806, + "learning_rate": 0.00010544232648954606, + "loss": 0.9351, + "step": 10856 + }, + { + "epoch": 1.933048433048433, + "grad_norm": 0.703652024269104, + 
"learning_rate": 0.00010542834970897843, + "loss": 1.0032, + "step": 10857 + }, + { + "epoch": 1.9332264957264957, + "grad_norm": 0.6685494780540466, + "learning_rate": 0.00010541437282205355, + "loss": 0.8818, + "step": 10858 + }, + { + "epoch": 1.9334045584045585, + "grad_norm": 0.6594362854957581, + "learning_rate": 0.00010540039582904527, + "loss": 0.9535, + "step": 10859 + }, + { + "epoch": 1.9335826210826212, + "grad_norm": 0.8003259301185608, + "learning_rate": 0.00010538641873022744, + "loss": 0.8852, + "step": 10860 + }, + { + "epoch": 1.9337606837606838, + "grad_norm": 0.6567012071609497, + "learning_rate": 0.00010537244152587393, + "loss": 1.0832, + "step": 10861 + }, + { + "epoch": 1.9339387464387463, + "grad_norm": 0.6714941263198853, + "learning_rate": 0.00010535846421625862, + "loss": 1.1047, + "step": 10862 + }, + { + "epoch": 1.934116809116809, + "grad_norm": 0.6998924612998962, + "learning_rate": 0.00010534448680165531, + "loss": 0.8827, + "step": 10863 + }, + { + "epoch": 1.9342948717948718, + "grad_norm": 0.6065765619277954, + "learning_rate": 0.0001053305092823379, + "loss": 0.5773, + "step": 10864 + }, + { + "epoch": 1.9344729344729346, + "grad_norm": 0.7678273916244507, + "learning_rate": 0.0001053165316585802, + "loss": 0.9199, + "step": 10865 + }, + { + "epoch": 1.9346509971509973, + "grad_norm": 0.7071540951728821, + "learning_rate": 0.00010530255393065613, + "loss": 1.0292, + "step": 10866 + }, + { + "epoch": 1.9348290598290598, + "grad_norm": 0.6329835057258606, + "learning_rate": 0.00010528857609883956, + "loss": 0.9915, + "step": 10867 + }, + { + "epoch": 1.9350071225071224, + "grad_norm": 0.6274038553237915, + "learning_rate": 0.00010527459816340427, + "loss": 0.8499, + "step": 10868 + }, + { + "epoch": 1.9351851851851851, + "grad_norm": 0.6564371585845947, + "learning_rate": 0.00010526062012462424, + "loss": 1.1707, + "step": 10869 + }, + { + "epoch": 1.9353632478632479, + "grad_norm": 0.8561269044876099, + "learning_rate": 
0.00010524664198277326, + "loss": 1.148, + "step": 10870 + }, + { + "epoch": 1.9355413105413106, + "grad_norm": 0.6322671175003052, + "learning_rate": 0.00010523266373812521, + "loss": 0.9165, + "step": 10871 + }, + { + "epoch": 1.9357193732193734, + "grad_norm": 0.7602947354316711, + "learning_rate": 0.00010521868539095403, + "loss": 0.9647, + "step": 10872 + }, + { + "epoch": 1.935897435897436, + "grad_norm": 0.5962168574333191, + "learning_rate": 0.00010520470694153353, + "loss": 0.8585, + "step": 10873 + }, + { + "epoch": 1.9360754985754984, + "grad_norm": 0.7498637437820435, + "learning_rate": 0.00010519072839013757, + "loss": 0.9828, + "step": 10874 + }, + { + "epoch": 1.9362535612535612, + "grad_norm": 0.6841256022453308, + "learning_rate": 0.00010517674973704012, + "loss": 0.9991, + "step": 10875 + }, + { + "epoch": 1.936431623931624, + "grad_norm": 0.8281826972961426, + "learning_rate": 0.00010516277098251499, + "loss": 1.028, + "step": 10876 + }, + { + "epoch": 1.9366096866096867, + "grad_norm": 0.6673563718795776, + "learning_rate": 0.0001051487921268361, + "loss": 1.1594, + "step": 10877 + }, + { + "epoch": 1.9367877492877494, + "grad_norm": 0.7833667993545532, + "learning_rate": 0.00010513481317027733, + "loss": 0.7675, + "step": 10878 + }, + { + "epoch": 1.936965811965812, + "grad_norm": 0.6087225675582886, + "learning_rate": 0.00010512083411311253, + "loss": 0.7803, + "step": 10879 + }, + { + "epoch": 1.9371438746438745, + "grad_norm": 0.6758120656013489, + "learning_rate": 0.00010510685495561563, + "loss": 1.0621, + "step": 10880 + }, + { + "epoch": 1.9373219373219372, + "grad_norm": 0.6720096468925476, + "learning_rate": 0.00010509287569806055, + "loss": 0.8502, + "step": 10881 + }, + { + "epoch": 1.9375, + "grad_norm": 0.6233887672424316, + "learning_rate": 0.00010507889634072113, + "loss": 1.0127, + "step": 10882 + }, + { + "epoch": 1.9376780626780628, + "grad_norm": 0.667742908000946, + "learning_rate": 0.00010506491688387127, + "loss": 0.9086, 
+ "step": 10883 + }, + { + "epoch": 1.9378561253561255, + "grad_norm": 0.6533677577972412, + "learning_rate": 0.00010505093732778492, + "loss": 0.9724, + "step": 10884 + }, + { + "epoch": 1.938034188034188, + "grad_norm": 0.7171359062194824, + "learning_rate": 0.00010503695767273591, + "loss": 0.9915, + "step": 10885 + }, + { + "epoch": 1.9382122507122506, + "grad_norm": 0.723655641078949, + "learning_rate": 0.0001050229779189982, + "loss": 0.8981, + "step": 10886 + }, + { + "epoch": 1.9383903133903133, + "grad_norm": 0.6863494515419006, + "learning_rate": 0.00010500899806684568, + "loss": 1.2577, + "step": 10887 + }, + { + "epoch": 1.938568376068376, + "grad_norm": 0.8174706697463989, + "learning_rate": 0.00010499501811655224, + "loss": 0.9848, + "step": 10888 + }, + { + "epoch": 1.9387464387464388, + "grad_norm": 0.6378024220466614, + "learning_rate": 0.00010498103806839179, + "loss": 0.8499, + "step": 10889 + }, + { + "epoch": 1.9389245014245016, + "grad_norm": 0.6734544634819031, + "learning_rate": 0.00010496705792263823, + "loss": 0.8446, + "step": 10890 + }, + { + "epoch": 1.939102564102564, + "grad_norm": 0.6802361607551575, + "learning_rate": 0.00010495307767956551, + "loss": 0.9285, + "step": 10891 + }, + { + "epoch": 1.9392806267806266, + "grad_norm": 0.7821299433708191, + "learning_rate": 0.00010493909733944752, + "loss": 1.08, + "step": 10892 + }, + { + "epoch": 1.9394586894586894, + "grad_norm": 0.6204990148544312, + "learning_rate": 0.00010492511690255818, + "loss": 0.7861, + "step": 10893 + }, + { + "epoch": 1.9396367521367521, + "grad_norm": 0.6386391520500183, + "learning_rate": 0.0001049111363691714, + "loss": 0.9162, + "step": 10894 + }, + { + "epoch": 1.9398148148148149, + "grad_norm": 0.6885092854499817, + "learning_rate": 0.0001048971557395611, + "loss": 1.0026, + "step": 10895 + }, + { + "epoch": 1.9399928774928776, + "grad_norm": 0.6962558627128601, + "learning_rate": 0.00010488317501400122, + "loss": 1.146, + "step": 10896 + }, + { + 
"epoch": 1.9401709401709402, + "grad_norm": 0.6283716559410095, + "learning_rate": 0.00010486919419276566, + "loss": 1.0268, + "step": 10897 + }, + { + "epoch": 1.9403490028490027, + "grad_norm": 0.7183622717857361, + "learning_rate": 0.00010485521327612835, + "loss": 1.0123, + "step": 10898 + }, + { + "epoch": 1.9405270655270654, + "grad_norm": 0.6354197263717651, + "learning_rate": 0.00010484123226436321, + "loss": 0.871, + "step": 10899 + }, + { + "epoch": 1.9407051282051282, + "grad_norm": 0.804358184337616, + "learning_rate": 0.00010482725115774421, + "loss": 1.1001, + "step": 10900 + }, + { + "epoch": 1.940883190883191, + "grad_norm": 0.6896754503250122, + "learning_rate": 0.00010481326995654524, + "loss": 1.0976, + "step": 10901 + }, + { + "epoch": 1.9410612535612537, + "grad_norm": 0.9108015894889832, + "learning_rate": 0.00010479928866104023, + "loss": 0.8785, + "step": 10902 + }, + { + "epoch": 1.9412393162393162, + "grad_norm": 0.6963121294975281, + "learning_rate": 0.00010478530727150316, + "loss": 1.0458, + "step": 10903 + }, + { + "epoch": 1.9414173789173788, + "grad_norm": 0.6657114624977112, + "learning_rate": 0.00010477132578820792, + "loss": 0.8188, + "step": 10904 + }, + { + "epoch": 1.9415954415954415, + "grad_norm": 0.671716034412384, + "learning_rate": 0.00010475734421142847, + "loss": 1.0915, + "step": 10905 + }, + { + "epoch": 1.9417735042735043, + "grad_norm": 0.6790717244148254, + "learning_rate": 0.0001047433625414387, + "loss": 0.9688, + "step": 10906 + }, + { + "epoch": 1.941951566951567, + "grad_norm": 0.6411764621734619, + "learning_rate": 0.00010472938077851264, + "loss": 1.0387, + "step": 10907 + }, + { + "epoch": 1.9421296296296298, + "grad_norm": 0.8579615950584412, + "learning_rate": 0.00010471539892292417, + "loss": 1.1635, + "step": 10908 + }, + { + "epoch": 1.9423076923076923, + "grad_norm": 0.7031029462814331, + "learning_rate": 0.00010470141697494726, + "loss": 0.9813, + "step": 10909 + }, + { + "epoch": 1.9424857549857548, 
+ "grad_norm": 0.6657388806343079, + "learning_rate": 0.00010468743493485584, + "loss": 0.7947, + "step": 10910 + }, + { + "epoch": 1.9426638176638176, + "grad_norm": 0.6364194750785828, + "learning_rate": 0.00010467345280292389, + "loss": 0.8554, + "step": 10911 + }, + { + "epoch": 1.9428418803418803, + "grad_norm": 0.7394127249717712, + "learning_rate": 0.00010465947057942534, + "loss": 0.822, + "step": 10912 + }, + { + "epoch": 1.943019943019943, + "grad_norm": 0.6557473540306091, + "learning_rate": 0.00010464548826463411, + "loss": 1.0025, + "step": 10913 + }, + { + "epoch": 1.9431980056980058, + "grad_norm": 0.6530601382255554, + "learning_rate": 0.00010463150585882422, + "loss": 1.0828, + "step": 10914 + }, + { + "epoch": 1.9433760683760684, + "grad_norm": 0.7376404404640198, + "learning_rate": 0.00010461752336226957, + "loss": 0.9413, + "step": 10915 + }, + { + "epoch": 1.943554131054131, + "grad_norm": 0.7110656499862671, + "learning_rate": 0.00010460354077524417, + "loss": 0.9162, + "step": 10916 + }, + { + "epoch": 1.9437321937321936, + "grad_norm": 0.6515666246414185, + "learning_rate": 0.00010458955809802194, + "loss": 0.9211, + "step": 10917 + }, + { + "epoch": 1.9439102564102564, + "grad_norm": 0.6888720989227295, + "learning_rate": 0.00010457557533087683, + "loss": 1.0632, + "step": 10918 + }, + { + "epoch": 1.9440883190883191, + "grad_norm": 0.7246627807617188, + "learning_rate": 0.00010456159247408286, + "loss": 0.9807, + "step": 10919 + }, + { + "epoch": 1.944266381766382, + "grad_norm": 0.727834165096283, + "learning_rate": 0.00010454760952791394, + "loss": 1.0793, + "step": 10920 + }, + { + "epoch": 1.9444444444444444, + "grad_norm": 0.6365306377410889, + "learning_rate": 0.00010453362649264407, + "loss": 1.0415, + "step": 10921 + }, + { + "epoch": 1.9446225071225072, + "grad_norm": 0.7187839150428772, + "learning_rate": 0.0001045196433685472, + "loss": 1.007, + "step": 10922 + }, + { + "epoch": 1.9448005698005697, + "grad_norm": 
0.5905138254165649, + "learning_rate": 0.00010450566015589732, + "loss": 0.9818, + "step": 10923 + }, + { + "epoch": 1.9449786324786325, + "grad_norm": 0.7008894085884094, + "learning_rate": 0.00010449167685496837, + "loss": 0.8444, + "step": 10924 + }, + { + "epoch": 1.9451566951566952, + "grad_norm": 0.6126312017440796, + "learning_rate": 0.00010447769346603435, + "loss": 0.7207, + "step": 10925 + }, + { + "epoch": 1.945334757834758, + "grad_norm": 0.7513176202774048, + "learning_rate": 0.00010446370998936922, + "loss": 0.8693, + "step": 10926 + }, + { + "epoch": 1.9455128205128205, + "grad_norm": 0.6382531523704529, + "learning_rate": 0.00010444972642524697, + "loss": 0.8379, + "step": 10927 + }, + { + "epoch": 1.9456908831908832, + "grad_norm": 0.7062170505523682, + "learning_rate": 0.0001044357427739416, + "loss": 1.0525, + "step": 10928 + }, + { + "epoch": 1.9458689458689458, + "grad_norm": 0.6954067349433899, + "learning_rate": 0.00010442175903572703, + "loss": 1.0238, + "step": 10929 + }, + { + "epoch": 1.9460470085470085, + "grad_norm": 0.7257117033004761, + "learning_rate": 0.00010440777521087731, + "loss": 1.1413, + "step": 10930 + }, + { + "epoch": 1.9462250712250713, + "grad_norm": 0.6617701053619385, + "learning_rate": 0.00010439379129966635, + "loss": 1.0089, + "step": 10931 + }, + { + "epoch": 1.946403133903134, + "grad_norm": 0.6860800385475159, + "learning_rate": 0.00010437980730236821, + "loss": 1.1778, + "step": 10932 + }, + { + "epoch": 1.9465811965811965, + "grad_norm": 0.846235454082489, + "learning_rate": 0.00010436582321925684, + "loss": 0.9851, + "step": 10933 + }, + { + "epoch": 1.9467592592592593, + "grad_norm": 0.6385617852210999, + "learning_rate": 0.00010435183905060623, + "loss": 0.9542, + "step": 10934 + }, + { + "epoch": 1.9469373219373218, + "grad_norm": 0.7137401700019836, + "learning_rate": 0.00010433785479669038, + "loss": 1.0499, + "step": 10935 + }, + { + "epoch": 1.9471153846153846, + "grad_norm": 0.6269308924674988, + 
"learning_rate": 0.00010432387045778324, + "loss": 0.8929, + "step": 10936 + }, + { + "epoch": 1.9472934472934473, + "grad_norm": 0.7903163433074951, + "learning_rate": 0.00010430988603415888, + "loss": 0.9812, + "step": 10937 + }, + { + "epoch": 1.94747150997151, + "grad_norm": 0.6006736159324646, + "learning_rate": 0.00010429590152609121, + "loss": 0.7959, + "step": 10938 + }, + { + "epoch": 1.9476495726495726, + "grad_norm": 0.6061521768569946, + "learning_rate": 0.00010428191693385431, + "loss": 0.8748, + "step": 10939 + }, + { + "epoch": 1.9478276353276354, + "grad_norm": 0.6637623906135559, + "learning_rate": 0.00010426793225772216, + "loss": 0.7047, + "step": 10940 + }, + { + "epoch": 1.948005698005698, + "grad_norm": 0.7650586366653442, + "learning_rate": 0.00010425394749796874, + "loss": 1.0018, + "step": 10941 + }, + { + "epoch": 1.9481837606837606, + "grad_norm": 0.6575125455856323, + "learning_rate": 0.000104239962654868, + "loss": 0.8915, + "step": 10942 + }, + { + "epoch": 1.9483618233618234, + "grad_norm": 0.6315393447875977, + "learning_rate": 0.00010422597772869404, + "loss": 1.1884, + "step": 10943 + }, + { + "epoch": 1.9485398860398861, + "grad_norm": 0.7607148885726929, + "learning_rate": 0.00010421199271972083, + "loss": 0.9341, + "step": 10944 + }, + { + "epoch": 1.9487179487179487, + "grad_norm": 0.6491827964782715, + "learning_rate": 0.00010419800762822239, + "loss": 0.9991, + "step": 10945 + }, + { + "epoch": 1.9488960113960114, + "grad_norm": 0.6294243335723877, + "learning_rate": 0.00010418402245447265, + "loss": 0.9253, + "step": 10946 + }, + { + "epoch": 1.949074074074074, + "grad_norm": 0.6472215056419373, + "learning_rate": 0.00010417003719874571, + "loss": 1.0402, + "step": 10947 + }, + { + "epoch": 1.9492521367521367, + "grad_norm": 0.7377899885177612, + "learning_rate": 0.00010415605186131559, + "loss": 1.046, + "step": 10948 + }, + { + "epoch": 1.9494301994301995, + "grad_norm": 0.6391907334327698, + "learning_rate": 
0.00010414206644245623, + "loss": 0.8529, + "step": 10949 + }, + { + "epoch": 1.9496082621082622, + "grad_norm": 0.7101355195045471, + "learning_rate": 0.0001041280809424417, + "loss": 0.925, + "step": 10950 + }, + { + "epoch": 1.9497863247863247, + "grad_norm": 0.7891978025436401, + "learning_rate": 0.00010411409536154597, + "loss": 1.0691, + "step": 10951 + }, + { + "epoch": 1.9499643874643875, + "grad_norm": 0.7225242853164673, + "learning_rate": 0.00010410010970004311, + "loss": 1.158, + "step": 10952 + }, + { + "epoch": 1.95014245014245, + "grad_norm": 0.6073256731033325, + "learning_rate": 0.00010408612395820714, + "loss": 0.9977, + "step": 10953 + }, + { + "epoch": 1.9503205128205128, + "grad_norm": 0.6373769044876099, + "learning_rate": 0.00010407213813631203, + "loss": 1.019, + "step": 10954 + }, + { + "epoch": 1.9504985754985755, + "grad_norm": 0.7451884746551514, + "learning_rate": 0.00010405815223463184, + "loss": 0.9497, + "step": 10955 + }, + { + "epoch": 1.9506766381766383, + "grad_norm": 0.7760418057441711, + "learning_rate": 0.00010404416625344058, + "loss": 1.0378, + "step": 10956 + }, + { + "epoch": 1.9508547008547008, + "grad_norm": 0.7057808041572571, + "learning_rate": 0.00010403018019301228, + "loss": 0.8953, + "step": 10957 + }, + { + "epoch": 1.9510327635327636, + "grad_norm": 0.6599584817886353, + "learning_rate": 0.00010401619405362095, + "loss": 0.8859, + "step": 10958 + }, + { + "epoch": 1.951210826210826, + "grad_norm": 0.6977253556251526, + "learning_rate": 0.00010400220783554069, + "loss": 0.9038, + "step": 10959 + }, + { + "epoch": 1.9513888888888888, + "grad_norm": 0.6930267810821533, + "learning_rate": 0.00010398822153904546, + "loss": 1.1547, + "step": 10960 + }, + { + "epoch": 1.9515669515669516, + "grad_norm": 0.6301694512367249, + "learning_rate": 0.00010397423516440931, + "loss": 0.8875, + "step": 10961 + }, + { + "epoch": 1.9517450142450143, + "grad_norm": 0.7447484135627747, + "learning_rate": 0.00010396024871190628, + 
"loss": 1.0454, + "step": 10962 + }, + { + "epoch": 1.9519230769230769, + "grad_norm": 0.8666765093803406, + "learning_rate": 0.00010394626218181041, + "loss": 1.2211, + "step": 10963 + }, + { + "epoch": 1.9521011396011396, + "grad_norm": 0.599354088306427, + "learning_rate": 0.00010393227557439573, + "loss": 1.0419, + "step": 10964 + }, + { + "epoch": 1.9522792022792022, + "grad_norm": 0.6991702914237976, + "learning_rate": 0.00010391828888993627, + "loss": 0.8217, + "step": 10965 + }, + { + "epoch": 1.952457264957265, + "grad_norm": 0.7467028498649597, + "learning_rate": 0.0001039043021287061, + "loss": 0.8708, + "step": 10966 + }, + { + "epoch": 1.9526353276353277, + "grad_norm": 0.6806215047836304, + "learning_rate": 0.0001038903152909792, + "loss": 1.218, + "step": 10967 + }, + { + "epoch": 1.9528133903133904, + "grad_norm": 0.6704212427139282, + "learning_rate": 0.00010387632837702968, + "loss": 0.8428, + "step": 10968 + }, + { + "epoch": 1.952991452991453, + "grad_norm": 0.6843154430389404, + "learning_rate": 0.00010386234138713155, + "loss": 0.9729, + "step": 10969 + }, + { + "epoch": 1.9531695156695157, + "grad_norm": 0.6619821190834045, + "learning_rate": 0.00010384835432155888, + "loss": 1.021, + "step": 10970 + }, + { + "epoch": 1.9533475783475782, + "grad_norm": 0.6249803900718689, + "learning_rate": 0.0001038343671805857, + "loss": 0.9321, + "step": 10971 + }, + { + "epoch": 1.953525641025641, + "grad_norm": 0.7361689805984497, + "learning_rate": 0.00010382037996448604, + "loss": 0.9451, + "step": 10972 + }, + { + "epoch": 1.9537037037037037, + "grad_norm": 0.6464847922325134, + "learning_rate": 0.00010380639267353398, + "loss": 1.0188, + "step": 10973 + }, + { + "epoch": 1.9538817663817665, + "grad_norm": 0.5975635647773743, + "learning_rate": 0.00010379240530800356, + "loss": 0.9025, + "step": 10974 + }, + { + "epoch": 1.9540598290598292, + "grad_norm": 0.6734475493431091, + "learning_rate": 0.00010377841786816884, + "loss": 1.0742, + "step": 10975 
+ }, + { + "epoch": 1.9542378917378918, + "grad_norm": 0.7318592667579651, + "learning_rate": 0.00010376443035430386, + "loss": 1.1082, + "step": 10976 + }, + { + "epoch": 1.9544159544159543, + "grad_norm": 0.7696142792701721, + "learning_rate": 0.00010375044276668271, + "loss": 0.8421, + "step": 10977 + }, + { + "epoch": 1.954594017094017, + "grad_norm": 0.68442302942276, + "learning_rate": 0.00010373645510557939, + "loss": 1.0794, + "step": 10978 + }, + { + "epoch": 1.9547720797720798, + "grad_norm": 0.7582547068595886, + "learning_rate": 0.00010372246737126801, + "loss": 1.0332, + "step": 10979 + }, + { + "epoch": 1.9549501424501425, + "grad_norm": 0.6529998183250427, + "learning_rate": 0.00010370847956402262, + "loss": 1.1833, + "step": 10980 + }, + { + "epoch": 1.9551282051282053, + "grad_norm": 0.7565605044364929, + "learning_rate": 0.00010369449168411729, + "loss": 1.0494, + "step": 10981 + }, + { + "epoch": 1.9553062678062678, + "grad_norm": 0.6346915364265442, + "learning_rate": 0.00010368050373182605, + "loss": 1.0052, + "step": 10982 + }, + { + "epoch": 1.9554843304843303, + "grad_norm": 0.7021830081939697, + "learning_rate": 0.00010366651570742298, + "loss": 0.9716, + "step": 10983 + }, + { + "epoch": 1.955662393162393, + "grad_norm": 0.6464530825614929, + "learning_rate": 0.00010365252761118218, + "loss": 0.9802, + "step": 10984 + }, + { + "epoch": 1.9558404558404558, + "grad_norm": 0.6845090985298157, + "learning_rate": 0.00010363853944337768, + "loss": 0.9529, + "step": 10985 + }, + { + "epoch": 1.9560185185185186, + "grad_norm": 0.7178115248680115, + "learning_rate": 0.00010362455120428356, + "loss": 0.9968, + "step": 10986 + }, + { + "epoch": 1.9561965811965814, + "grad_norm": 0.6131038069725037, + "learning_rate": 0.00010361056289417385, + "loss": 1.0559, + "step": 10987 + }, + { + "epoch": 1.9563746438746439, + "grad_norm": 0.6946909427642822, + "learning_rate": 0.0001035965745133227, + "loss": 1.0457, + "step": 10988 + }, + { + "epoch": 
1.9565527065527064, + "grad_norm": 0.7376706600189209, + "learning_rate": 0.00010358258606200413, + "loss": 0.7775, + "step": 10989 + }, + { + "epoch": 1.9567307692307692, + "grad_norm": 0.6864920854568481, + "learning_rate": 0.00010356859754049225, + "loss": 0.8798, + "step": 10990 + }, + { + "epoch": 1.956908831908832, + "grad_norm": 0.6301153302192688, + "learning_rate": 0.0001035546089490611, + "loss": 0.8757, + "step": 10991 + }, + { + "epoch": 1.9570868945868947, + "grad_norm": 0.7184807062149048, + "learning_rate": 0.00010354062028798474, + "loss": 1.0783, + "step": 10992 + }, + { + "epoch": 1.9572649572649574, + "grad_norm": 0.7138563394546509, + "learning_rate": 0.00010352663155753732, + "loss": 1.0328, + "step": 10993 + }, + { + "epoch": 1.95744301994302, + "grad_norm": 0.6565547585487366, + "learning_rate": 0.00010351264275799286, + "loss": 1.1312, + "step": 10994 + }, + { + "epoch": 1.9576210826210825, + "grad_norm": 0.7055862545967102, + "learning_rate": 0.00010349865388962547, + "loss": 1.0787, + "step": 10995 + }, + { + "epoch": 1.9577991452991452, + "grad_norm": 0.6184022426605225, + "learning_rate": 0.00010348466495270926, + "loss": 0.9635, + "step": 10996 + }, + { + "epoch": 1.957977207977208, + "grad_norm": 0.6563652753829956, + "learning_rate": 0.0001034706759475182, + "loss": 0.772, + "step": 10997 + }, + { + "epoch": 1.9581552706552707, + "grad_norm": 0.6103591322898865, + "learning_rate": 0.00010345668687432651, + "loss": 0.8113, + "step": 10998 + }, + { + "epoch": 1.9583333333333335, + "grad_norm": 0.6715512275695801, + "learning_rate": 0.0001034426977334082, + "loss": 1.1841, + "step": 10999 + }, + { + "epoch": 1.958511396011396, + "grad_norm": 0.680092453956604, + "learning_rate": 0.00010342870852503739, + "loss": 0.9992, + "step": 11000 + }, + { + "epoch": 1.9586894586894585, + "grad_norm": 0.828472375869751, + "learning_rate": 0.00010341471924948816, + "loss": 1.0975, + "step": 11001 + }, + { + "epoch": 1.9588675213675213, + "grad_norm": 
0.758441686630249, + "learning_rate": 0.00010340072990703463, + "loss": 1.0632, + "step": 11002 + }, + { + "epoch": 1.959045584045584, + "grad_norm": 0.6847560405731201, + "learning_rate": 0.00010338674049795079, + "loss": 1.0054, + "step": 11003 + }, + { + "epoch": 1.9592236467236468, + "grad_norm": 0.707626223564148, + "learning_rate": 0.00010337275102251085, + "loss": 0.9427, + "step": 11004 + }, + { + "epoch": 1.9594017094017095, + "grad_norm": 0.769036591053009, + "learning_rate": 0.00010335876148098887, + "loss": 1.0424, + "step": 11005 + }, + { + "epoch": 1.959579772079772, + "grad_norm": 0.822695791721344, + "learning_rate": 0.00010334477187365892, + "loss": 1.1573, + "step": 11006 + }, + { + "epoch": 1.9597578347578346, + "grad_norm": 0.6290286183357239, + "learning_rate": 0.00010333078220079513, + "loss": 0.936, + "step": 11007 + }, + { + "epoch": 1.9599358974358974, + "grad_norm": 0.6802252531051636, + "learning_rate": 0.00010331679246267155, + "loss": 0.8049, + "step": 11008 + }, + { + "epoch": 1.96011396011396, + "grad_norm": 0.6652607321739197, + "learning_rate": 0.00010330280265956232, + "loss": 0.926, + "step": 11009 + }, + { + "epoch": 1.9602920227920229, + "grad_norm": 0.7057216763496399, + "learning_rate": 0.00010328881279174154, + "loss": 0.9464, + "step": 11010 + }, + { + "epoch": 1.9604700854700856, + "grad_norm": 0.6951601505279541, + "learning_rate": 0.00010327482285948331, + "loss": 0.9882, + "step": 11011 + }, + { + "epoch": 1.9606481481481481, + "grad_norm": 0.6537632942199707, + "learning_rate": 0.00010326083286306174, + "loss": 0.8663, + "step": 11012 + }, + { + "epoch": 1.9608262108262107, + "grad_norm": 0.7252047657966614, + "learning_rate": 0.0001032468428027509, + "loss": 1.1377, + "step": 11013 + }, + { + "epoch": 1.9610042735042734, + "grad_norm": 0.6494104266166687, + "learning_rate": 0.00010323285267882492, + "loss": 0.8072, + "step": 11014 + }, + { + "epoch": 1.9611823361823362, + "grad_norm": 0.8463460206985474, + 
"learning_rate": 0.00010321886249155792, + "loss": 1.22, + "step": 11015 + }, + { + "epoch": 1.961360398860399, + "grad_norm": 0.6071396470069885, + "learning_rate": 0.00010320487224122401, + "loss": 0.7975, + "step": 11016 + }, + { + "epoch": 1.9615384615384617, + "grad_norm": 0.6546960473060608, + "learning_rate": 0.00010319088192809725, + "loss": 1.1729, + "step": 11017 + }, + { + "epoch": 1.9617165242165242, + "grad_norm": 0.7399442791938782, + "learning_rate": 0.00010317689155245178, + "loss": 1.092, + "step": 11018 + }, + { + "epoch": 1.9618945868945867, + "grad_norm": 0.7103837728500366, + "learning_rate": 0.00010316290111456175, + "loss": 0.8436, + "step": 11019 + }, + { + "epoch": 1.9620726495726495, + "grad_norm": 0.6990065574645996, + "learning_rate": 0.00010314891061470125, + "loss": 0.9003, + "step": 11020 + }, + { + "epoch": 1.9622507122507122, + "grad_norm": 0.7945666313171387, + "learning_rate": 0.00010313492005314438, + "loss": 0.8812, + "step": 11021 + }, + { + "epoch": 1.962428774928775, + "grad_norm": 0.6177538633346558, + "learning_rate": 0.00010312092943016527, + "loss": 1.0091, + "step": 11022 + }, + { + "epoch": 1.9626068376068377, + "grad_norm": 0.7260771989822388, + "learning_rate": 0.000103106938746038, + "loss": 0.9376, + "step": 11023 + }, + { + "epoch": 1.9627849002849003, + "grad_norm": 0.6726518273353577, + "learning_rate": 0.00010309294800103674, + "loss": 0.8048, + "step": 11024 + }, + { + "epoch": 1.9629629629629628, + "grad_norm": 0.8759992122650146, + "learning_rate": 0.00010307895719543562, + "loss": 1.0248, + "step": 11025 + }, + { + "epoch": 1.9631410256410255, + "grad_norm": 0.683437168598175, + "learning_rate": 0.00010306496632950868, + "loss": 1.0314, + "step": 11026 + }, + { + "epoch": 1.9633190883190883, + "grad_norm": 0.7255756258964539, + "learning_rate": 0.00010305097540353012, + "loss": 0.9828, + "step": 11027 + }, + { + "epoch": 1.963497150997151, + "grad_norm": 0.6904804706573486, + "learning_rate": 
0.000103036984417774, + "loss": 0.9054, + "step": 11028 + }, + { + "epoch": 1.9636752136752138, + "grad_norm": 0.6906846761703491, + "learning_rate": 0.00010302299337251451, + "loss": 1.0287, + "step": 11029 + }, + { + "epoch": 1.9638532763532763, + "grad_norm": 0.6677078008651733, + "learning_rate": 0.00010300900226802575, + "loss": 0.8742, + "step": 11030 + }, + { + "epoch": 1.964031339031339, + "grad_norm": 0.6144888997077942, + "learning_rate": 0.00010299501110458183, + "loss": 0.6942, + "step": 11031 + }, + { + "epoch": 1.9642094017094016, + "grad_norm": 0.753010094165802, + "learning_rate": 0.0001029810198824569, + "loss": 0.9018, + "step": 11032 + }, + { + "epoch": 1.9643874643874644, + "grad_norm": 0.6872276663780212, + "learning_rate": 0.00010296702860192505, + "loss": 1.1647, + "step": 11033 + }, + { + "epoch": 1.9645655270655271, + "grad_norm": 0.709000289440155, + "learning_rate": 0.00010295303726326047, + "loss": 0.9143, + "step": 11034 + }, + { + "epoch": 1.9647435897435899, + "grad_norm": 0.6507021188735962, + "learning_rate": 0.00010293904586673723, + "loss": 1.006, + "step": 11035 + }, + { + "epoch": 1.9649216524216524, + "grad_norm": 0.6789946556091309, + "learning_rate": 0.00010292505441262952, + "loss": 0.9049, + "step": 11036 + }, + { + "epoch": 1.9650997150997151, + "grad_norm": 0.7156081795692444, + "learning_rate": 0.00010291106290121143, + "loss": 0.9195, + "step": 11037 + }, + { + "epoch": 1.9652777777777777, + "grad_norm": 0.6770932078361511, + "learning_rate": 0.0001028970713327571, + "loss": 0.9524, + "step": 11038 + }, + { + "epoch": 1.9654558404558404, + "grad_norm": 0.7304288148880005, + "learning_rate": 0.00010288307970754067, + "loss": 0.9276, + "step": 11039 + }, + { + "epoch": 1.9656339031339032, + "grad_norm": 0.7603645324707031, + "learning_rate": 0.0001028690880258363, + "loss": 1.2157, + "step": 11040 + }, + { + "epoch": 1.965811965811966, + "grad_norm": 0.6875246167182922, + "learning_rate": 0.00010285509628791811, + "loss": 
1.0269, + "step": 11041 + }, + { + "epoch": 1.9659900284900285, + "grad_norm": 0.7234818935394287, + "learning_rate": 0.00010284110449406026, + "loss": 0.9695, + "step": 11042 + }, + { + "epoch": 1.9661680911680912, + "grad_norm": 0.7322804927825928, + "learning_rate": 0.00010282711264453684, + "loss": 0.9752, + "step": 11043 + }, + { + "epoch": 1.9663461538461537, + "grad_norm": 0.7524822950363159, + "learning_rate": 0.00010281312073962202, + "loss": 1.2144, + "step": 11044 + }, + { + "epoch": 1.9665242165242165, + "grad_norm": 0.6623101234436035, + "learning_rate": 0.00010279912877958995, + "loss": 1.1334, + "step": 11045 + }, + { + "epoch": 1.9667022792022792, + "grad_norm": 0.7814893126487732, + "learning_rate": 0.00010278513676471477, + "loss": 1.266, + "step": 11046 + }, + { + "epoch": 1.966880341880342, + "grad_norm": 0.7129884362220764, + "learning_rate": 0.00010277114469527063, + "loss": 1.0918, + "step": 11047 + }, + { + "epoch": 1.9670584045584045, + "grad_norm": 0.6996828317642212, + "learning_rate": 0.00010275715257153164, + "loss": 0.9269, + "step": 11048 + }, + { + "epoch": 1.9672364672364673, + "grad_norm": 0.6439059972763062, + "learning_rate": 0.00010274316039377198, + "loss": 1.1998, + "step": 11049 + }, + { + "epoch": 1.9674145299145298, + "grad_norm": 0.6837672591209412, + "learning_rate": 0.00010272916816226581, + "loss": 0.8899, + "step": 11050 + }, + { + "epoch": 1.9675925925925926, + "grad_norm": 0.702583909034729, + "learning_rate": 0.00010271517587728726, + "loss": 1.1862, + "step": 11051 + }, + { + "epoch": 1.9677706552706553, + "grad_norm": 0.6627798676490784, + "learning_rate": 0.00010270118353911047, + "loss": 0.898, + "step": 11052 + }, + { + "epoch": 1.967948717948718, + "grad_norm": 0.7628579139709473, + "learning_rate": 0.00010268719114800957, + "loss": 1.006, + "step": 11053 + }, + { + "epoch": 1.9681267806267806, + "grad_norm": 0.6425395607948303, + "learning_rate": 0.00010267319870425877, + "loss": 0.962, + "step": 11054 + }, + 
{ + "epoch": 1.9683048433048433, + "grad_norm": 0.7462666630744934, + "learning_rate": 0.00010265920620813219, + "loss": 1.0703, + "step": 11055 + }, + { + "epoch": 1.9684829059829059, + "grad_norm": 0.67641681432724, + "learning_rate": 0.00010264521365990401, + "loss": 1.1077, + "step": 11056 + }, + { + "epoch": 1.9686609686609686, + "grad_norm": 0.6716381311416626, + "learning_rate": 0.0001026312210598483, + "loss": 1.1048, + "step": 11057 + }, + { + "epoch": 1.9688390313390314, + "grad_norm": 0.7207448482513428, + "learning_rate": 0.00010261722840823935, + "loss": 0.9236, + "step": 11058 + }, + { + "epoch": 1.9690170940170941, + "grad_norm": 0.7208544015884399, + "learning_rate": 0.0001026032357053512, + "loss": 1.0814, + "step": 11059 + }, + { + "epoch": 1.9691951566951567, + "grad_norm": 0.6076363325119019, + "learning_rate": 0.00010258924295145807, + "loss": 0.9388, + "step": 11060 + }, + { + "epoch": 1.9693732193732194, + "grad_norm": 0.6460439562797546, + "learning_rate": 0.00010257525014683411, + "loss": 0.9506, + "step": 11061 + }, + { + "epoch": 1.969551282051282, + "grad_norm": 0.7449939250946045, + "learning_rate": 0.00010256125729175348, + "loss": 1.0209, + "step": 11062 + }, + { + "epoch": 1.9697293447293447, + "grad_norm": 0.640885055065155, + "learning_rate": 0.00010254726438649031, + "loss": 1.0235, + "step": 11063 + }, + { + "epoch": 1.9699074074074074, + "grad_norm": 0.6872261166572571, + "learning_rate": 0.00010253327143131879, + "loss": 0.9217, + "step": 11064 + }, + { + "epoch": 1.9700854700854702, + "grad_norm": 0.6213285326957703, + "learning_rate": 0.0001025192784265131, + "loss": 0.8204, + "step": 11065 + }, + { + "epoch": 1.9702635327635327, + "grad_norm": 0.6594449281692505, + "learning_rate": 0.00010250528537234736, + "loss": 0.9789, + "step": 11066 + }, + { + "epoch": 1.9704415954415955, + "grad_norm": 0.7098729610443115, + "learning_rate": 0.00010249129226909577, + "loss": 1.2551, + "step": 11067 + }, + { + "epoch": 
1.970619658119658, + "grad_norm": 0.7455953359603882, + "learning_rate": 0.0001024772991170325, + "loss": 1.0281, + "step": 11068 + }, + { + "epoch": 1.9707977207977208, + "grad_norm": 0.6657416224479675, + "learning_rate": 0.00010246330591643166, + "loss": 0.9421, + "step": 11069 + }, + { + "epoch": 1.9709757834757835, + "grad_norm": 0.6480659246444702, + "learning_rate": 0.00010244931266756748, + "loss": 0.9424, + "step": 11070 + }, + { + "epoch": 1.9711538461538463, + "grad_norm": 0.6440510749816895, + "learning_rate": 0.00010243531937071411, + "loss": 0.9651, + "step": 11071 + }, + { + "epoch": 1.9713319088319088, + "grad_norm": 0.6329794526100159, + "learning_rate": 0.00010242132602614571, + "loss": 0.9233, + "step": 11072 + }, + { + "epoch": 1.9715099715099715, + "grad_norm": 0.6694819927215576, + "learning_rate": 0.00010240733263413646, + "loss": 0.884, + "step": 11073 + }, + { + "epoch": 1.971688034188034, + "grad_norm": 0.7702556848526001, + "learning_rate": 0.0001023933391949605, + "loss": 1.216, + "step": 11074 + }, + { + "epoch": 1.9718660968660968, + "grad_norm": 0.6587536931037903, + "learning_rate": 0.00010237934570889207, + "loss": 0.9324, + "step": 11075 + }, + { + "epoch": 1.9720441595441596, + "grad_norm": 0.7919837832450867, + "learning_rate": 0.00010236535217620529, + "loss": 1.0011, + "step": 11076 + }, + { + "epoch": 1.9722222222222223, + "grad_norm": 0.6604606509208679, + "learning_rate": 0.00010235135859717433, + "loss": 0.929, + "step": 11077 + }, + { + "epoch": 1.9724002849002849, + "grad_norm": 0.7158446907997131, + "learning_rate": 0.0001023373649720734, + "loss": 0.8912, + "step": 11078 + }, + { + "epoch": 1.9725783475783476, + "grad_norm": 0.7450904846191406, + "learning_rate": 0.00010232337130117666, + "loss": 1.0782, + "step": 11079 + }, + { + "epoch": 1.9727564102564101, + "grad_norm": 0.6687077283859253, + "learning_rate": 0.00010230937758475827, + "loss": 1.0662, + "step": 11080 + }, + { + "epoch": 1.9729344729344729, + 
"grad_norm": 0.7188364267349243, + "learning_rate": 0.00010229538382309245, + "loss": 1.024, + "step": 11081 + }, + { + "epoch": 1.9731125356125356, + "grad_norm": 0.6787814497947693, + "learning_rate": 0.00010228139001645334, + "loss": 0.9559, + "step": 11082 + }, + { + "epoch": 1.9732905982905984, + "grad_norm": 0.6834072470664978, + "learning_rate": 0.00010226739616511513, + "loss": 0.8143, + "step": 11083 + }, + { + "epoch": 1.973468660968661, + "grad_norm": 0.6651090979576111, + "learning_rate": 0.00010225340226935201, + "loss": 1.05, + "step": 11084 + }, + { + "epoch": 1.9736467236467237, + "grad_norm": 0.7125018835067749, + "learning_rate": 0.00010223940832943813, + "loss": 1.0275, + "step": 11085 + }, + { + "epoch": 1.9738247863247862, + "grad_norm": 0.6886870861053467, + "learning_rate": 0.00010222541434564772, + "loss": 1.0972, + "step": 11086 + }, + { + "epoch": 1.974002849002849, + "grad_norm": 0.7068913578987122, + "learning_rate": 0.00010221142031825492, + "loss": 0.9248, + "step": 11087 + }, + { + "epoch": 1.9741809116809117, + "grad_norm": 0.7752319574356079, + "learning_rate": 0.00010219742624753397, + "loss": 0.9754, + "step": 11088 + }, + { + "epoch": 1.9743589743589745, + "grad_norm": 0.7915459871292114, + "learning_rate": 0.00010218343213375896, + "loss": 1.2589, + "step": 11089 + }, + { + "epoch": 1.9745370370370372, + "grad_norm": 0.6597068309783936, + "learning_rate": 0.00010216943797720418, + "loss": 1.0004, + "step": 11090 + }, + { + "epoch": 1.9747150997150997, + "grad_norm": 0.7060620188713074, + "learning_rate": 0.00010215544377814375, + "loss": 0.9968, + "step": 11091 + }, + { + "epoch": 1.9748931623931623, + "grad_norm": 0.6815677881240845, + "learning_rate": 0.0001021414495368519, + "loss": 0.8889, + "step": 11092 + }, + { + "epoch": 1.975071225071225, + "grad_norm": 0.6872935891151428, + "learning_rate": 0.00010212745525360277, + "loss": 1.1582, + "step": 11093 + }, + { + "epoch": 1.9752492877492878, + "grad_norm": 
0.6781140565872192, + "learning_rate": 0.00010211346092867056, + "loss": 0.9988, + "step": 11094 + }, + { + "epoch": 1.9754273504273505, + "grad_norm": 0.6959224343299866, + "learning_rate": 0.00010209946656232949, + "loss": 1.1097, + "step": 11095 + }, + { + "epoch": 1.9756054131054133, + "grad_norm": 0.7205058336257935, + "learning_rate": 0.00010208547215485376, + "loss": 0.9951, + "step": 11096 + }, + { + "epoch": 1.9757834757834758, + "grad_norm": 0.6968751549720764, + "learning_rate": 0.00010207147770651748, + "loss": 0.9313, + "step": 11097 + }, + { + "epoch": 1.9759615384615383, + "grad_norm": 0.6688823103904724, + "learning_rate": 0.00010205748321759494, + "loss": 0.9439, + "step": 11098 + }, + { + "epoch": 1.976139601139601, + "grad_norm": 0.6169568300247192, + "learning_rate": 0.00010204348868836028, + "loss": 1.123, + "step": 11099 + }, + { + "epoch": 1.9763176638176638, + "grad_norm": 0.6995537281036377, + "learning_rate": 0.00010202949411908768, + "loss": 1.1928, + "step": 11100 + }, + { + "epoch": 1.9764957264957266, + "grad_norm": 0.7102637887001038, + "learning_rate": 0.00010201549951005138, + "loss": 1.0265, + "step": 11101 + }, + { + "epoch": 1.9766737891737893, + "grad_norm": 0.6820045113563538, + "learning_rate": 0.00010200150486152558, + "loss": 0.9309, + "step": 11102 + }, + { + "epoch": 1.9768518518518519, + "grad_norm": 0.7050938010215759, + "learning_rate": 0.00010198751017378443, + "loss": 1.0047, + "step": 11103 + }, + { + "epoch": 1.9770299145299144, + "grad_norm": 0.6418201923370361, + "learning_rate": 0.00010197351544710214, + "loss": 1.1172, + "step": 11104 + }, + { + "epoch": 1.9772079772079771, + "grad_norm": 0.6681215763092041, + "learning_rate": 0.0001019595206817529, + "loss": 1.0621, + "step": 11105 + }, + { + "epoch": 1.97738603988604, + "grad_norm": 0.7725709676742554, + "learning_rate": 0.00010194552587801094, + "loss": 1.0044, + "step": 11106 + }, + { + "epoch": 1.9775641025641026, + "grad_norm": 0.6870455741882324, + 
"learning_rate": 0.00010193153103615045, + "loss": 1.2652, + "step": 11107 + }, + { + "epoch": 1.9777421652421654, + "grad_norm": 0.6352108120918274, + "learning_rate": 0.00010191753615644561, + "loss": 1.1081, + "step": 11108 + }, + { + "epoch": 1.977920227920228, + "grad_norm": 0.7322626113891602, + "learning_rate": 0.00010190354123917066, + "loss": 1.0003, + "step": 11109 + }, + { + "epoch": 1.9780982905982905, + "grad_norm": 0.6240935921669006, + "learning_rate": 0.00010188954628459972, + "loss": 0.8925, + "step": 11110 + }, + { + "epoch": 1.9782763532763532, + "grad_norm": 0.6648945212364197, + "learning_rate": 0.00010187555129300708, + "loss": 1.0882, + "step": 11111 + }, + { + "epoch": 1.978454415954416, + "grad_norm": 0.6704208850860596, + "learning_rate": 0.00010186155626466692, + "loss": 0.8873, + "step": 11112 + }, + { + "epoch": 1.9786324786324787, + "grad_norm": 0.6716459393501282, + "learning_rate": 0.00010184756119985341, + "loss": 1.0045, + "step": 11113 + }, + { + "epoch": 1.9788105413105415, + "grad_norm": 0.81277996301651, + "learning_rate": 0.0001018335660988408, + "loss": 0.8867, + "step": 11114 + }, + { + "epoch": 1.978988603988604, + "grad_norm": 0.7008311748504639, + "learning_rate": 0.00010181957096190323, + "loss": 0.9391, + "step": 11115 + }, + { + "epoch": 1.9791666666666665, + "grad_norm": 0.727676272392273, + "learning_rate": 0.00010180557578931498, + "loss": 1.0157, + "step": 11116 + }, + { + "epoch": 1.9793447293447293, + "grad_norm": 0.7058015465736389, + "learning_rate": 0.00010179158058135018, + "loss": 1.0, + "step": 11117 + }, + { + "epoch": 1.979522792022792, + "grad_norm": 0.7770412564277649, + "learning_rate": 0.00010177758533828312, + "loss": 1.0428, + "step": 11118 + }, + { + "epoch": 1.9797008547008548, + "grad_norm": 0.6557414531707764, + "learning_rate": 0.00010176359006038798, + "loss": 0.8557, + "step": 11119 + }, + { + "epoch": 1.9798789173789175, + "grad_norm": 0.7681090235710144, + "learning_rate": 
0.00010174959474793894, + "loss": 0.867, + "step": 11120 + }, + { + "epoch": 1.98005698005698, + "grad_norm": 0.7915860414505005, + "learning_rate": 0.0001017355994012102, + "loss": 0.9961, + "step": 11121 + }, + { + "epoch": 1.9802350427350426, + "grad_norm": 0.8039166927337646, + "learning_rate": 0.00010172160402047604, + "loss": 1.1378, + "step": 11122 + }, + { + "epoch": 1.9804131054131053, + "grad_norm": 0.6641189455986023, + "learning_rate": 0.0001017076086060106, + "loss": 0.8914, + "step": 11123 + }, + { + "epoch": 1.980591168091168, + "grad_norm": 0.7673811316490173, + "learning_rate": 0.00010169361315808812, + "loss": 1.018, + "step": 11124 + }, + { + "epoch": 1.9807692307692308, + "grad_norm": 0.7320558428764343, + "learning_rate": 0.00010167961767698279, + "loss": 1.0515, + "step": 11125 + }, + { + "epoch": 1.9809472934472936, + "grad_norm": 0.5717357993125916, + "learning_rate": 0.00010166562216296886, + "loss": 0.7619, + "step": 11126 + }, + { + "epoch": 1.9811253561253561, + "grad_norm": 0.6638465523719788, + "learning_rate": 0.00010165162661632052, + "loss": 1.0161, + "step": 11127 + }, + { + "epoch": 1.9813034188034186, + "grad_norm": 0.7293243408203125, + "learning_rate": 0.00010163763103731201, + "loss": 1.063, + "step": 11128 + }, + { + "epoch": 1.9814814814814814, + "grad_norm": 0.634694516658783, + "learning_rate": 0.00010162363542621752, + "loss": 0.8945, + "step": 11129 + }, + { + "epoch": 1.9816595441595442, + "grad_norm": 0.7086902856826782, + "learning_rate": 0.00010160963978331122, + "loss": 1.0542, + "step": 11130 + }, + { + "epoch": 1.981837606837607, + "grad_norm": 0.5939825773239136, + "learning_rate": 0.00010159564410886742, + "loss": 0.7822, + "step": 11131 + }, + { + "epoch": 1.9820156695156697, + "grad_norm": 0.722183346748352, + "learning_rate": 0.00010158164840316027, + "loss": 1.0252, + "step": 11132 + }, + { + "epoch": 1.9821937321937322, + "grad_norm": 0.7300103306770325, + "learning_rate": 0.000101567652666464, + "loss": 
0.9099, + "step": 11133 + }, + { + "epoch": 1.9823717948717947, + "grad_norm": 0.7148736119270325, + "learning_rate": 0.00010155365689905285, + "loss": 1.0149, + "step": 11134 + }, + { + "epoch": 1.9825498575498575, + "grad_norm": 0.8214462995529175, + "learning_rate": 0.000101539661101201, + "loss": 1.0127, + "step": 11135 + }, + { + "epoch": 1.9827279202279202, + "grad_norm": 0.7111126780509949, + "learning_rate": 0.00010152566527318265, + "loss": 1.045, + "step": 11136 + }, + { + "epoch": 1.982905982905983, + "grad_norm": 0.6640021800994873, + "learning_rate": 0.00010151166941527213, + "loss": 0.9618, + "step": 11137 + }, + { + "epoch": 1.9830840455840457, + "grad_norm": 0.7177722454071045, + "learning_rate": 0.00010149767352774358, + "loss": 1.0373, + "step": 11138 + }, + { + "epoch": 1.9832621082621082, + "grad_norm": 0.6728883981704712, + "learning_rate": 0.00010148367761087121, + "loss": 0.9886, + "step": 11139 + }, + { + "epoch": 1.9834401709401708, + "grad_norm": 0.7060428857803345, + "learning_rate": 0.00010146968166492926, + "loss": 1.042, + "step": 11140 + }, + { + "epoch": 1.9836182336182335, + "grad_norm": 0.706253707408905, + "learning_rate": 0.00010145568569019192, + "loss": 1.2249, + "step": 11141 + }, + { + "epoch": 1.9837962962962963, + "grad_norm": 0.618221640586853, + "learning_rate": 0.00010144168968693348, + "loss": 0.9223, + "step": 11142 + }, + { + "epoch": 1.983974358974359, + "grad_norm": 0.7005748748779297, + "learning_rate": 0.00010142769365542814, + "loss": 1.2735, + "step": 11143 + }, + { + "epoch": 1.9841524216524218, + "grad_norm": 0.6059799194335938, + "learning_rate": 0.0001014136975959501, + "loss": 0.7216, + "step": 11144 + }, + { + "epoch": 1.9843304843304843, + "grad_norm": 0.7169116735458374, + "learning_rate": 0.00010139970150877358, + "loss": 0.9541, + "step": 11145 + }, + { + "epoch": 1.984508547008547, + "grad_norm": 0.7402058839797974, + "learning_rate": 0.00010138570539417281, + "loss": 1.1268, + "step": 11146 + }, + { 
+ "epoch": 1.9846866096866096, + "grad_norm": 0.7204117178916931, + "learning_rate": 0.00010137170925242201, + "loss": 1.1557, + "step": 11147 + }, + { + "epoch": 1.9848646723646723, + "grad_norm": 0.589163064956665, + "learning_rate": 0.00010135771308379545, + "loss": 0.9863, + "step": 11148 + }, + { + "epoch": 1.985042735042735, + "grad_norm": 0.6342785358428955, + "learning_rate": 0.00010134371688856732, + "loss": 0.9294, + "step": 11149 + }, + { + "epoch": 1.9852207977207978, + "grad_norm": 0.7144256234169006, + "learning_rate": 0.00010132972066701183, + "loss": 0.9428, + "step": 11150 + }, + { + "epoch": 1.9853988603988604, + "grad_norm": 0.658032238483429, + "learning_rate": 0.00010131572441940322, + "loss": 0.9749, + "step": 11151 + }, + { + "epoch": 1.9855769230769231, + "grad_norm": 0.7609163522720337, + "learning_rate": 0.00010130172814601576, + "loss": 1.1771, + "step": 11152 + }, + { + "epoch": 1.9857549857549857, + "grad_norm": 0.6531760692596436, + "learning_rate": 0.00010128773184712361, + "loss": 0.8529, + "step": 11153 + }, + { + "epoch": 1.9859330484330484, + "grad_norm": 0.6983599066734314, + "learning_rate": 0.00010127373552300103, + "loss": 1.0307, + "step": 11154 + }, + { + "epoch": 1.9861111111111112, + "grad_norm": 0.7121559381484985, + "learning_rate": 0.00010125973917392224, + "loss": 0.9426, + "step": 11155 + }, + { + "epoch": 1.986289173789174, + "grad_norm": 0.6282170414924622, + "learning_rate": 0.0001012457428001615, + "loss": 0.8983, + "step": 11156 + }, + { + "epoch": 1.9864672364672364, + "grad_norm": 0.6960387825965881, + "learning_rate": 0.000101231746401993, + "loss": 0.9001, + "step": 11157 + }, + { + "epoch": 1.9866452991452992, + "grad_norm": 0.7523152232170105, + "learning_rate": 0.000101217749979691, + "loss": 1.3462, + "step": 11158 + }, + { + "epoch": 1.9868233618233617, + "grad_norm": 0.71713787317276, + "learning_rate": 0.00010120375353352971, + "loss": 1.0147, + "step": 11159 + }, + { + "epoch": 1.9870014245014245, + 
"grad_norm": 0.7304390072822571, + "learning_rate": 0.00010118975706378339, + "loss": 0.8436, + "step": 11160 + }, + { + "epoch": 1.9871794871794872, + "grad_norm": 0.789968729019165, + "learning_rate": 0.00010117576057072622, + "loss": 1.1162, + "step": 11161 + }, + { + "epoch": 1.98735754985755, + "grad_norm": 0.6752170920372009, + "learning_rate": 0.00010116176405463249, + "loss": 1.0619, + "step": 11162 + }, + { + "epoch": 1.9875356125356125, + "grad_norm": 0.681398868560791, + "learning_rate": 0.0001011477675157764, + "loss": 0.8981, + "step": 11163 + }, + { + "epoch": 1.9877136752136753, + "grad_norm": 0.61469566822052, + "learning_rate": 0.0001011337709544322, + "loss": 1.0139, + "step": 11164 + }, + { + "epoch": 1.9878917378917378, + "grad_norm": 0.7524265050888062, + "learning_rate": 0.0001011197743708741, + "loss": 1.1571, + "step": 11165 + }, + { + "epoch": 1.9880698005698005, + "grad_norm": 0.6289594173431396, + "learning_rate": 0.00010110577776537633, + "loss": 0.93, + "step": 11166 + }, + { + "epoch": 1.9882478632478633, + "grad_norm": 0.6991903781890869, + "learning_rate": 0.00010109178113821318, + "loss": 1.1176, + "step": 11167 + }, + { + "epoch": 1.988425925925926, + "grad_norm": 0.7604053020477295, + "learning_rate": 0.00010107778448965883, + "loss": 1.0497, + "step": 11168 + }, + { + "epoch": 1.9886039886039886, + "grad_norm": 0.7166453003883362, + "learning_rate": 0.00010106378781998753, + "loss": 1.1237, + "step": 11169 + }, + { + "epoch": 1.9887820512820513, + "grad_norm": 0.6071686744689941, + "learning_rate": 0.00010104979112947352, + "loss": 0.8934, + "step": 11170 + }, + { + "epoch": 1.9889601139601139, + "grad_norm": 0.6618169546127319, + "learning_rate": 0.00010103579441839101, + "loss": 1.0596, + "step": 11171 + }, + { + "epoch": 1.9891381766381766, + "grad_norm": 0.6838458776473999, + "learning_rate": 0.0001010217976870143, + "loss": 1.0167, + "step": 11172 + }, + { + "epoch": 1.9893162393162394, + "grad_norm": 0.6369979381561279, + 
"learning_rate": 0.00010100780093561757, + "loss": 0.9001, + "step": 11173 + }, + { + "epoch": 1.989494301994302, + "grad_norm": 0.661313533782959, + "learning_rate": 0.00010099380416447508, + "loss": 0.8952, + "step": 11174 + }, + { + "epoch": 1.9896723646723646, + "grad_norm": 0.6991600394248962, + "learning_rate": 0.00010097980737386106, + "loss": 1.0083, + "step": 11175 + }, + { + "epoch": 1.9898504273504274, + "grad_norm": 0.618748664855957, + "learning_rate": 0.00010096581056404972, + "loss": 0.8797, + "step": 11176 + }, + { + "epoch": 1.99002849002849, + "grad_norm": 0.7039223909378052, + "learning_rate": 0.00010095181373531535, + "loss": 1.0385, + "step": 11177 + }, + { + "epoch": 1.9902065527065527, + "grad_norm": 0.7598999738693237, + "learning_rate": 0.00010093781688793216, + "loss": 0.9205, + "step": 11178 + }, + { + "epoch": 1.9903846153846154, + "grad_norm": 0.6355955600738525, + "learning_rate": 0.00010092382002217441, + "loss": 0.8646, + "step": 11179 + }, + { + "epoch": 1.9905626780626782, + "grad_norm": 0.8024569153785706, + "learning_rate": 0.00010090982313831634, + "loss": 1.1678, + "step": 11180 + }, + { + "epoch": 1.9907407407407407, + "grad_norm": 0.5960529446601868, + "learning_rate": 0.00010089582623663216, + "loss": 0.8277, + "step": 11181 + }, + { + "epoch": 1.9909188034188035, + "grad_norm": 0.6323728561401367, + "learning_rate": 0.00010088182931739609, + "loss": 0.948, + "step": 11182 + }, + { + "epoch": 1.991096866096866, + "grad_norm": 0.7532381415367126, + "learning_rate": 0.00010086783238088244, + "loss": 1.2948, + "step": 11183 + }, + { + "epoch": 1.9912749287749287, + "grad_norm": 0.5740166306495667, + "learning_rate": 0.00010085383542736543, + "loss": 0.7019, + "step": 11184 + }, + { + "epoch": 1.9914529914529915, + "grad_norm": 0.616985559463501, + "learning_rate": 0.00010083983845711929, + "loss": 1.0802, + "step": 11185 + }, + { + "epoch": 1.9916310541310542, + "grad_norm": 0.7505929470062256, + "learning_rate": 
0.00010082584147041824, + "loss": 1.0523, + "step": 11186 + }, + { + "epoch": 1.9918091168091168, + "grad_norm": 0.7147656679153442, + "learning_rate": 0.00010081184446753653, + "loss": 1.0019, + "step": 11187 + }, + { + "epoch": 1.9919871794871795, + "grad_norm": 0.7301992774009705, + "learning_rate": 0.00010079784744874845, + "loss": 1.0329, + "step": 11188 + }, + { + "epoch": 1.992165242165242, + "grad_norm": 0.6847206354141235, + "learning_rate": 0.00010078385041432819, + "loss": 1.0367, + "step": 11189 + }, + { + "epoch": 1.9923433048433048, + "grad_norm": 0.7310990691184998, + "learning_rate": 0.00010076985336455, + "loss": 1.1675, + "step": 11190 + }, + { + "epoch": 1.9925213675213675, + "grad_norm": 0.6916858553886414, + "learning_rate": 0.00010075585629968813, + "loss": 0.8615, + "step": 11191 + }, + { + "epoch": 1.9926994301994303, + "grad_norm": 0.6519390344619751, + "learning_rate": 0.00010074185922001685, + "loss": 0.8105, + "step": 11192 + }, + { + "epoch": 1.9928774928774928, + "grad_norm": 0.7437400817871094, + "learning_rate": 0.00010072786212581036, + "loss": 0.9993, + "step": 11193 + }, + { + "epoch": 1.9930555555555556, + "grad_norm": 0.5048928260803223, + "learning_rate": 0.00010071386501734292, + "loss": 0.7912, + "step": 11194 + }, + { + "epoch": 1.993233618233618, + "grad_norm": 0.8042343258857727, + "learning_rate": 0.00010069986789488882, + "loss": 0.9156, + "step": 11195 + }, + { + "epoch": 1.9934116809116809, + "grad_norm": 0.7188669443130493, + "learning_rate": 0.0001006858707587222, + "loss": 1.0474, + "step": 11196 + }, + { + "epoch": 1.9935897435897436, + "grad_norm": 0.7377660870552063, + "learning_rate": 0.00010067187360911738, + "loss": 0.7013, + "step": 11197 + }, + { + "epoch": 1.9937678062678064, + "grad_norm": 0.6684696078300476, + "learning_rate": 0.00010065787644634861, + "loss": 0.9199, + "step": 11198 + }, + { + "epoch": 1.993945868945869, + "grad_norm": 0.7341524958610535, + "learning_rate": 0.00010064387927069012, + 
"loss": 1.0925, + "step": 11199 + }, + { + "epoch": 1.9941239316239316, + "grad_norm": 0.685745120048523, + "learning_rate": 0.00010062988208241614, + "loss": 1.083, + "step": 11200 + }, + { + "epoch": 1.9943019943019942, + "grad_norm": 0.6923556327819824, + "learning_rate": 0.00010061588488180096, + "loss": 1.2728, + "step": 11201 + }, + { + "epoch": 1.994480056980057, + "grad_norm": 0.6663293242454529, + "learning_rate": 0.00010060188766911876, + "loss": 1.0937, + "step": 11202 + }, + { + "epoch": 1.9946581196581197, + "grad_norm": 0.7963639497756958, + "learning_rate": 0.00010058789044464383, + "loss": 1.0592, + "step": 11203 + }, + { + "epoch": 1.9948361823361824, + "grad_norm": 0.6362990140914917, + "learning_rate": 0.00010057389320865042, + "loss": 0.8872, + "step": 11204 + }, + { + "epoch": 1.9950142450142452, + "grad_norm": 0.7752974033355713, + "learning_rate": 0.00010055989596141278, + "loss": 1.043, + "step": 11205 + }, + { + "epoch": 1.9951923076923077, + "grad_norm": 0.7125133275985718, + "learning_rate": 0.00010054589870320512, + "loss": 1.0015, + "step": 11206 + }, + { + "epoch": 1.9953703703703702, + "grad_norm": 0.7102736830711365, + "learning_rate": 0.00010053190143430169, + "loss": 1.0052, + "step": 11207 + }, + { + "epoch": 1.995548433048433, + "grad_norm": 0.8628628849983215, + "learning_rate": 0.00010051790415497677, + "loss": 1.2351, + "step": 11208 + }, + { + "epoch": 1.9957264957264957, + "grad_norm": 0.7233129739761353, + "learning_rate": 0.00010050390686550462, + "loss": 1.0848, + "step": 11209 + }, + { + "epoch": 1.9959045584045585, + "grad_norm": 0.5936228036880493, + "learning_rate": 0.00010048990956615944, + "loss": 0.7998, + "step": 11210 + }, + { + "epoch": 1.9960826210826212, + "grad_norm": 0.7345388531684875, + "learning_rate": 0.0001004759122572155, + "loss": 1.0329, + "step": 11211 + }, + { + "epoch": 1.9962606837606838, + "grad_norm": 0.7344130873680115, + "learning_rate": 0.00010046191493894703, + "loss": 1.1563, + "step": 
11212 + }, + { + "epoch": 1.9964387464387463, + "grad_norm": 0.6979942321777344, + "learning_rate": 0.00010044791761162833, + "loss": 0.9269, + "step": 11213 + }, + { + "epoch": 1.996616809116809, + "grad_norm": 0.67514967918396, + "learning_rate": 0.0001004339202755336, + "loss": 0.9028, + "step": 11214 + }, + { + "epoch": 1.9967948717948718, + "grad_norm": 0.6379111409187317, + "learning_rate": 0.00010041992293093712, + "loss": 0.7816, + "step": 11215 + }, + { + "epoch": 1.9969729344729346, + "grad_norm": 0.693976104259491, + "learning_rate": 0.00010040592557811308, + "loss": 0.8411, + "step": 11216 + }, + { + "epoch": 1.9971509971509973, + "grad_norm": 0.5952646732330322, + "learning_rate": 0.0001003919282173358, + "loss": 0.8681, + "step": 11217 + }, + { + "epoch": 1.9973290598290598, + "grad_norm": 0.7452160716056824, + "learning_rate": 0.00010037793084887948, + "loss": 1.0198, + "step": 11218 + }, + { + "epoch": 1.9975071225071224, + "grad_norm": 0.6683938503265381, + "learning_rate": 0.00010036393347301841, + "loss": 0.8162, + "step": 11219 + }, + { + "epoch": 1.9976851851851851, + "grad_norm": 0.6849120855331421, + "learning_rate": 0.00010034993609002683, + "loss": 1.0668, + "step": 11220 + }, + { + "epoch": 1.9978632478632479, + "grad_norm": 0.8782517910003662, + "learning_rate": 0.00010033593870017897, + "loss": 1.222, + "step": 11221 + }, + { + "epoch": 1.9980413105413106, + "grad_norm": 0.6482772827148438, + "learning_rate": 0.00010032194130374908, + "loss": 0.7722, + "step": 11222 + }, + { + "epoch": 1.9982193732193734, + "grad_norm": 0.8595399260520935, + "learning_rate": 0.00010030794390101142, + "loss": 1.3004, + "step": 11223 + }, + { + "epoch": 1.998397435897436, + "grad_norm": 0.7258931994438171, + "learning_rate": 0.00010029394649224024, + "loss": 0.8825, + "step": 11224 + }, + { + "epoch": 1.9985754985754984, + "grad_norm": 0.6291348934173584, + "learning_rate": 0.00010027994907770981, + "loss": 0.8681, + "step": 11225 + }, + { + "epoch": 
1.9987535612535612, + "grad_norm": 0.7528844475746155, + "learning_rate": 0.00010026595165769434, + "loss": 1.1443, + "step": 11226 + }, + { + "epoch": 1.998931623931624, + "grad_norm": 0.654017984867096, + "learning_rate": 0.0001002519542324681, + "loss": 0.8585, + "step": 11227 + }, + { + "epoch": 1.9991096866096867, + "grad_norm": 0.6812533736228943, + "learning_rate": 0.00010023795680230532, + "loss": 0.8757, + "step": 11228 + }, + { + "epoch": 1.9992877492877494, + "grad_norm": 0.7120179533958435, + "learning_rate": 0.0001002239593674803, + "loss": 1.0159, + "step": 11229 + }, + { + "epoch": 1.999465811965812, + "grad_norm": 0.6943802237510681, + "learning_rate": 0.00010020996192826725, + "loss": 1.0193, + "step": 11230 + }, + { + "epoch": 1.9996438746438745, + "grad_norm": 0.7227906584739685, + "learning_rate": 0.00010019596448494047, + "loss": 1.1536, + "step": 11231 + }, + { + "epoch": 1.9998219373219372, + "grad_norm": 0.6233312487602234, + "learning_rate": 0.00010018196703777411, + "loss": 0.9117, + "step": 11232 + }, + { + "epoch": 1.9998219373219372, + "eval_loss": 1.0963108539581299, + "eval_runtime": 24.4478, + "eval_samples_per_second": 42.58, + "eval_steps_per_second": 21.311, + "step": 11232 + }, + { + "epoch": 2.0, + "grad_norm": 0.67911696434021, + "learning_rate": 0.00010016796958704254, + "loss": 0.9516, + "step": 11233 + }, + { + "epoch": 2.0001780626780628, + "grad_norm": 0.7372198700904846, + "learning_rate": 0.00010015397213301992, + "loss": 1.3066, + "step": 11234 + }, + { + "epoch": 2.0001780626780628, + "grad_norm": 0.7573498487472534, + "learning_rate": 0.00010013997467598055, + "loss": 1.0204, + "step": 11235 + }, + { + "epoch": 2.0003561253561255, + "grad_norm": 0.6862889528274536, + "learning_rate": 0.00010012597721619863, + "loss": 0.9447, + "step": 11236 + }, + { + "epoch": 2.0005341880341883, + "grad_norm": 0.5977628231048584, + "learning_rate": 0.00010011197975394851, + "loss": 0.9849, + "step": 11237 + }, + { + "epoch": 
2.0007122507122506, + "grad_norm": 0.620206892490387, + "learning_rate": 0.00010009798228950431, + "loss": 0.7498, + "step": 11238 + }, + { + "epoch": 2.0008903133903133, + "grad_norm": 0.5694536566734314, + "learning_rate": 0.0001000839848231404, + "loss": 0.7092, + "step": 11239 + }, + { + "epoch": 2.001068376068376, + "grad_norm": 0.5880212783813477, + "learning_rate": 0.00010006998735513098, + "loss": 0.9057, + "step": 11240 + }, + { + "epoch": 2.001246438746439, + "grad_norm": 0.6152323484420776, + "learning_rate": 0.00010005598988575029, + "loss": 0.8356, + "step": 11241 + }, + { + "epoch": 2.0014245014245016, + "grad_norm": 0.6827659010887146, + "learning_rate": 0.00010004199241527261, + "loss": 0.8302, + "step": 11242 + }, + { + "epoch": 2.0016025641025643, + "grad_norm": 0.5883491635322571, + "learning_rate": 0.00010002799494397215, + "loss": 0.8616, + "step": 11243 + }, + { + "epoch": 2.0017806267806266, + "grad_norm": 0.7649462819099426, + "learning_rate": 0.00010001399747212322, + "loss": 1.1643, + "step": 11244 + }, + { + "epoch": 2.0019586894586894, + "grad_norm": 0.6435316205024719, + "learning_rate": 0.0001, + "loss": 0.9239, + "step": 11245 + }, + { + "epoch": 2.002136752136752, + "grad_norm": 0.5662951469421387, + "learning_rate": 9.99860025278768e-05, + "loss": 0.9111, + "step": 11246 + }, + { + "epoch": 2.002314814814815, + "grad_norm": 0.6234064102172852, + "learning_rate": 9.997200505602787e-05, + "loss": 0.4852, + "step": 11247 + }, + { + "epoch": 2.0024928774928776, + "grad_norm": 0.6322146058082581, + "learning_rate": 9.995800758472741e-05, + "loss": 0.8599, + "step": 11248 + }, + { + "epoch": 2.0026709401709404, + "grad_norm": 0.6131469011306763, + "learning_rate": 9.994401011424972e-05, + "loss": 0.8504, + "step": 11249 + }, + { + "epoch": 2.0028490028490027, + "grad_norm": 0.6809168457984924, + "learning_rate": 9.993001264486903e-05, + "loss": 0.761, + "step": 11250 + }, + { + "epoch": 2.0030270655270654, + "grad_norm": 
0.6721677184104919, + "learning_rate": 9.991601517685962e-05, + "loss": 0.9146, + "step": 11251 + }, + { + "epoch": 2.003205128205128, + "grad_norm": 0.6395483016967773, + "learning_rate": 9.990201771049569e-05, + "loss": 0.8583, + "step": 11252 + }, + { + "epoch": 2.003383190883191, + "grad_norm": 0.8524805903434753, + "learning_rate": 9.988802024605153e-05, + "loss": 0.859, + "step": 11253 + }, + { + "epoch": 2.0035612535612537, + "grad_norm": 0.6186681389808655, + "learning_rate": 9.987402278380136e-05, + "loss": 0.6695, + "step": 11254 + }, + { + "epoch": 2.0037393162393164, + "grad_norm": 0.593245267868042, + "learning_rate": 9.98600253240195e-05, + "loss": 0.7104, + "step": 11255 + }, + { + "epoch": 2.0039173789173788, + "grad_norm": 0.6806482672691345, + "learning_rate": 9.98460278669801e-05, + "loss": 0.6208, + "step": 11256 + }, + { + "epoch": 2.0040954415954415, + "grad_norm": 0.7329097390174866, + "learning_rate": 9.983203041295753e-05, + "loss": 0.8264, + "step": 11257 + }, + { + "epoch": 2.0042735042735043, + "grad_norm": 0.7579078078269958, + "learning_rate": 9.981803296222591e-05, + "loss": 0.7572, + "step": 11258 + }, + { + "epoch": 2.004451566951567, + "grad_norm": 0.7945193648338318, + "learning_rate": 9.980403551505958e-05, + "loss": 0.7916, + "step": 11259 + }, + { + "epoch": 2.0046296296296298, + "grad_norm": 0.5528121590614319, + "learning_rate": 9.979003807173276e-05, + "loss": 0.5609, + "step": 11260 + }, + { + "epoch": 2.0048076923076925, + "grad_norm": 0.7321668863296509, + "learning_rate": 9.977604063251973e-05, + "loss": 0.9041, + "step": 11261 + }, + { + "epoch": 2.004985754985755, + "grad_norm": 0.6553691029548645, + "learning_rate": 9.976204319769469e-05, + "loss": 0.853, + "step": 11262 + }, + { + "epoch": 2.0051638176638176, + "grad_norm": 0.789152979850769, + "learning_rate": 9.974804576753194e-05, + "loss": 0.9909, + "step": 11263 + }, + { + "epoch": 2.0053418803418803, + "grad_norm": 0.6342231631278992, + "learning_rate": 
9.973404834230568e-05, + "loss": 0.8841, + "step": 11264 + }, + { + "epoch": 2.005519943019943, + "grad_norm": 0.671882688999176, + "learning_rate": 9.97200509222902e-05, + "loss": 0.8147, + "step": 11265 + }, + { + "epoch": 2.005698005698006, + "grad_norm": 0.8409315347671509, + "learning_rate": 9.970605350775978e-05, + "loss": 1.0466, + "step": 11266 + }, + { + "epoch": 2.0058760683760686, + "grad_norm": 0.6155081987380981, + "learning_rate": 9.969205609898858e-05, + "loss": 0.9461, + "step": 11267 + }, + { + "epoch": 2.006054131054131, + "grad_norm": 0.656370997428894, + "learning_rate": 9.967805869625093e-05, + "loss": 0.6152, + "step": 11268 + }, + { + "epoch": 2.0062321937321936, + "grad_norm": 0.6441524624824524, + "learning_rate": 9.966406129982103e-05, + "loss": 0.9023, + "step": 11269 + }, + { + "epoch": 2.0064102564102564, + "grad_norm": 0.7976031303405762, + "learning_rate": 9.96500639099732e-05, + "loss": 0.8886, + "step": 11270 + }, + { + "epoch": 2.006588319088319, + "grad_norm": 0.6888235807418823, + "learning_rate": 9.963606652698159e-05, + "loss": 0.7216, + "step": 11271 + }, + { + "epoch": 2.006766381766382, + "grad_norm": 0.8439735770225525, + "learning_rate": 9.962206915112054e-05, + "loss": 1.0161, + "step": 11272 + }, + { + "epoch": 2.0069444444444446, + "grad_norm": 0.6425265669822693, + "learning_rate": 9.960807178266423e-05, + "loss": 0.8718, + "step": 11273 + }, + { + "epoch": 2.007122507122507, + "grad_norm": 0.7393937110900879, + "learning_rate": 9.959407442188696e-05, + "loss": 0.7615, + "step": 11274 + }, + { + "epoch": 2.0073005698005697, + "grad_norm": 0.5919229984283447, + "learning_rate": 9.958007706906292e-05, + "loss": 0.6761, + "step": 11275 + }, + { + "epoch": 2.0074786324786325, + "grad_norm": 0.765935480594635, + "learning_rate": 9.956607972446644e-05, + "loss": 0.9057, + "step": 11276 + }, + { + "epoch": 2.007656695156695, + "grad_norm": 0.7091122269630432, + "learning_rate": 9.955208238837169e-05, + "loss": 0.8322, + 
"step": 11277 + }, + { + "epoch": 2.007834757834758, + "grad_norm": 0.850652813911438, + "learning_rate": 9.953808506105299e-05, + "loss": 0.9942, + "step": 11278 + }, + { + "epoch": 2.0080128205128207, + "grad_norm": 0.7341200113296509, + "learning_rate": 9.952408774278452e-05, + "loss": 0.7826, + "step": 11279 + }, + { + "epoch": 2.008190883190883, + "grad_norm": 0.6891999840736389, + "learning_rate": 9.95100904338406e-05, + "loss": 0.8939, + "step": 11280 + }, + { + "epoch": 2.0083689458689458, + "grad_norm": 0.800881028175354, + "learning_rate": 9.94960931344954e-05, + "loss": 0.8036, + "step": 11281 + }, + { + "epoch": 2.0085470085470085, + "grad_norm": 0.7483115792274475, + "learning_rate": 9.948209584502328e-05, + "loss": 0.7203, + "step": 11282 + }, + { + "epoch": 2.0087250712250713, + "grad_norm": 0.7314630150794983, + "learning_rate": 9.946809856569833e-05, + "loss": 0.8907, + "step": 11283 + }, + { + "epoch": 2.008903133903134, + "grad_norm": 0.7317429184913635, + "learning_rate": 9.945410129679493e-05, + "loss": 0.8971, + "step": 11284 + }, + { + "epoch": 2.0090811965811968, + "grad_norm": 0.6968898177146912, + "learning_rate": 9.944010403858726e-05, + "loss": 0.8638, + "step": 11285 + }, + { + "epoch": 2.009259259259259, + "grad_norm": 0.6680058240890503, + "learning_rate": 9.942610679134957e-05, + "loss": 0.7524, + "step": 11286 + }, + { + "epoch": 2.009437321937322, + "grad_norm": 0.6863839030265808, + "learning_rate": 9.941210955535618e-05, + "loss": 0.9647, + "step": 11287 + }, + { + "epoch": 2.0096153846153846, + "grad_norm": 0.7137607336044312, + "learning_rate": 9.939811233088125e-05, + "loss": 0.7089, + "step": 11288 + }, + { + "epoch": 2.0097934472934473, + "grad_norm": 0.8341759443283081, + "learning_rate": 9.938411511819907e-05, + "loss": 0.9461, + "step": 11289 + }, + { + "epoch": 2.00997150997151, + "grad_norm": 0.7326228022575378, + "learning_rate": 9.937011791758384e-05, + "loss": 0.8795, + "step": 11290 + }, + { + "epoch": 
2.010149572649573, + "grad_norm": 0.6795905232429504, + "learning_rate": 9.935612072930989e-05, + "loss": 0.8298, + "step": 11291 + }, + { + "epoch": 2.010327635327635, + "grad_norm": 0.7060360312461853, + "learning_rate": 9.934212355365139e-05, + "loss": 0.8483, + "step": 11292 + }, + { + "epoch": 2.010505698005698, + "grad_norm": 0.7532246112823486, + "learning_rate": 9.932812639088265e-05, + "loss": 0.9061, + "step": 11293 + }, + { + "epoch": 2.0106837606837606, + "grad_norm": 0.6563972234725952, + "learning_rate": 9.931412924127781e-05, + "loss": 0.8511, + "step": 11294 + }, + { + "epoch": 2.0108618233618234, + "grad_norm": 0.6672948002815247, + "learning_rate": 9.930013210511125e-05, + "loss": 0.7875, + "step": 11295 + }, + { + "epoch": 2.011039886039886, + "grad_norm": 0.7173593640327454, + "learning_rate": 9.928613498265709e-05, + "loss": 0.8602, + "step": 11296 + }, + { + "epoch": 2.011217948717949, + "grad_norm": 0.7399459481239319, + "learning_rate": 9.927213787418968e-05, + "loss": 0.8711, + "step": 11297 + }, + { + "epoch": 2.011396011396011, + "grad_norm": 0.7693262696266174, + "learning_rate": 9.925814077998317e-05, + "loss": 0.9927, + "step": 11298 + }, + { + "epoch": 2.011574074074074, + "grad_norm": 0.7998616695404053, + "learning_rate": 9.92441437003119e-05, + "loss": 1.1585, + "step": 11299 + }, + { + "epoch": 2.0117521367521367, + "grad_norm": 0.7239874005317688, + "learning_rate": 9.923014663545002e-05, + "loss": 0.8736, + "step": 11300 + }, + { + "epoch": 2.0119301994301995, + "grad_norm": 0.8565806150436401, + "learning_rate": 9.921614958567186e-05, + "loss": 0.9768, + "step": 11301 + }, + { + "epoch": 2.012108262108262, + "grad_norm": 0.6341429948806763, + "learning_rate": 9.920215255125158e-05, + "loss": 0.6553, + "step": 11302 + }, + { + "epoch": 2.012286324786325, + "grad_norm": 0.824182391166687, + "learning_rate": 9.91881555324635e-05, + "loss": 1.0138, + "step": 11303 + }, + { + "epoch": 2.0124643874643873, + "grad_norm": 
0.6309344172477722, + "learning_rate": 9.917415852958178e-05, + "loss": 0.619, + "step": 11304 + }, + { + "epoch": 2.01264245014245, + "grad_norm": 0.7469239830970764, + "learning_rate": 9.916016154288071e-05, + "loss": 0.8537, + "step": 11305 + }, + { + "epoch": 2.0128205128205128, + "grad_norm": 0.7433663606643677, + "learning_rate": 9.914616457263459e-05, + "loss": 0.8518, + "step": 11306 + }, + { + "epoch": 2.0129985754985755, + "grad_norm": 0.6550318002700806, + "learning_rate": 9.913216761911755e-05, + "loss": 0.8021, + "step": 11307 + }, + { + "epoch": 2.0131766381766383, + "grad_norm": 0.7360837459564209, + "learning_rate": 9.911817068260392e-05, + "loss": 0.7002, + "step": 11308 + }, + { + "epoch": 2.013354700854701, + "grad_norm": 0.7208407521247864, + "learning_rate": 9.910417376336786e-05, + "loss": 0.8633, + "step": 11309 + }, + { + "epoch": 2.0135327635327633, + "grad_norm": 0.7758026719093323, + "learning_rate": 9.909017686168369e-05, + "loss": 0.764, + "step": 11310 + }, + { + "epoch": 2.013710826210826, + "grad_norm": 0.8215547204017639, + "learning_rate": 9.90761799778256e-05, + "loss": 0.7062, + "step": 11311 + }, + { + "epoch": 2.013888888888889, + "grad_norm": 0.6731052994728088, + "learning_rate": 9.906218311206786e-05, + "loss": 0.902, + "step": 11312 + }, + { + "epoch": 2.0140669515669516, + "grad_norm": 0.74113929271698, + "learning_rate": 9.904818626468466e-05, + "loss": 0.7229, + "step": 11313 + }, + { + "epoch": 2.0142450142450143, + "grad_norm": 0.6673575639724731, + "learning_rate": 9.90341894359503e-05, + "loss": 0.7299, + "step": 11314 + }, + { + "epoch": 2.014423076923077, + "grad_norm": 0.7665545344352722, + "learning_rate": 9.902019262613897e-05, + "loss": 0.6993, + "step": 11315 + }, + { + "epoch": 2.0146011396011394, + "grad_norm": 0.6423895359039307, + "learning_rate": 9.900619583552497e-05, + "loss": 0.7344, + "step": 11316 + }, + { + "epoch": 2.014779202279202, + "grad_norm": 0.7071038484573364, + "learning_rate": 
9.899219906438245e-05, + "loss": 0.6951, + "step": 11317 + }, + { + "epoch": 2.014957264957265, + "grad_norm": 0.689984142780304, + "learning_rate": 9.897820231298574e-05, + "loss": 0.8496, + "step": 11318 + }, + { + "epoch": 2.0151353276353277, + "grad_norm": 0.8747256398200989, + "learning_rate": 9.896420558160901e-05, + "loss": 0.9752, + "step": 11319 + }, + { + "epoch": 2.0153133903133904, + "grad_norm": 0.6828433275222778, + "learning_rate": 9.895020887052651e-05, + "loss": 0.8369, + "step": 11320 + }, + { + "epoch": 2.015491452991453, + "grad_norm": 0.7334261536598206, + "learning_rate": 9.89362121800125e-05, + "loss": 0.7744, + "step": 11321 + }, + { + "epoch": 2.0156695156695155, + "grad_norm": 0.7896139621734619, + "learning_rate": 9.892221551034122e-05, + "loss": 0.8353, + "step": 11322 + }, + { + "epoch": 2.015847578347578, + "grad_norm": 0.6673476099967957, + "learning_rate": 9.890821886178684e-05, + "loss": 0.8644, + "step": 11323 + }, + { + "epoch": 2.016025641025641, + "grad_norm": 0.7475691437721252, + "learning_rate": 9.889422223462368e-05, + "loss": 0.8034, + "step": 11324 + }, + { + "epoch": 2.0162037037037037, + "grad_norm": 0.9086315631866455, + "learning_rate": 9.888022562912593e-05, + "loss": 1.1878, + "step": 11325 + }, + { + "epoch": 2.0163817663817665, + "grad_norm": 0.6634678244590759, + "learning_rate": 9.88662290455678e-05, + "loss": 0.9655, + "step": 11326 + }, + { + "epoch": 2.0165598290598292, + "grad_norm": 0.7184932827949524, + "learning_rate": 9.885223248422361e-05, + "loss": 0.5964, + "step": 11327 + }, + { + "epoch": 2.0167378917378915, + "grad_norm": 0.6319148540496826, + "learning_rate": 9.883823594536751e-05, + "loss": 0.5692, + "step": 11328 + }, + { + "epoch": 2.0169159544159543, + "grad_norm": 0.6232550144195557, + "learning_rate": 9.88242394292738e-05, + "loss": 0.6492, + "step": 11329 + }, + { + "epoch": 2.017094017094017, + "grad_norm": 0.7149667143821716, + "learning_rate": 9.881024293621663e-05, + "loss": 0.7023, + 
"step": 11330 + }, + { + "epoch": 2.01727207977208, + "grad_norm": 0.8871679902076721, + "learning_rate": 9.879624646647031e-05, + "loss": 0.954, + "step": 11331 + }, + { + "epoch": 2.0174501424501425, + "grad_norm": 0.6905941367149353, + "learning_rate": 9.878225002030901e-05, + "loss": 0.8534, + "step": 11332 + }, + { + "epoch": 2.0176282051282053, + "grad_norm": 0.8891478776931763, + "learning_rate": 9.876825359800703e-05, + "loss": 0.8324, + "step": 11333 + }, + { + "epoch": 2.0178062678062676, + "grad_norm": 0.8125092387199402, + "learning_rate": 9.875425719983852e-05, + "loss": 0.9604, + "step": 11334 + }, + { + "epoch": 2.0179843304843303, + "grad_norm": 0.7362027764320374, + "learning_rate": 9.874026082607778e-05, + "loss": 0.7879, + "step": 11335 + }, + { + "epoch": 2.018162393162393, + "grad_norm": 0.6763492226600647, + "learning_rate": 9.872626447699899e-05, + "loss": 0.8839, + "step": 11336 + }, + { + "epoch": 2.018340455840456, + "grad_norm": 0.7350467443466187, + "learning_rate": 9.871226815287644e-05, + "loss": 0.834, + "step": 11337 + }, + { + "epoch": 2.0185185185185186, + "grad_norm": 0.7768327593803406, + "learning_rate": 9.869827185398428e-05, + "loss": 1.1123, + "step": 11338 + }, + { + "epoch": 2.0186965811965814, + "grad_norm": 0.9218043088912964, + "learning_rate": 9.868427558059681e-05, + "loss": 0.9439, + "step": 11339 + }, + { + "epoch": 2.0188746438746437, + "grad_norm": 0.6613419651985168, + "learning_rate": 9.867027933298819e-05, + "loss": 0.836, + "step": 11340 + }, + { + "epoch": 2.0190527065527064, + "grad_norm": 0.7251055240631104, + "learning_rate": 9.865628311143273e-05, + "loss": 0.973, + "step": 11341 + }, + { + "epoch": 2.019230769230769, + "grad_norm": 0.6571859121322632, + "learning_rate": 9.864228691620458e-05, + "loss": 0.8811, + "step": 11342 + }, + { + "epoch": 2.019408831908832, + "grad_norm": 0.7552264928817749, + "learning_rate": 9.862829074757802e-05, + "loss": 0.9128, + "step": 11343 + }, + { + "epoch": 
2.0195868945868947, + "grad_norm": 0.6724083423614502, + "learning_rate": 9.861429460582723e-05, + "loss": 0.8894, + "step": 11344 + }, + { + "epoch": 2.0197649572649574, + "grad_norm": 0.8309593200683594, + "learning_rate": 9.860029849122644e-05, + "loss": 0.9374, + "step": 11345 + }, + { + "epoch": 2.0199430199430197, + "grad_norm": 0.7709865570068359, + "learning_rate": 9.858630240404993e-05, + "loss": 0.8195, + "step": 11346 + }, + { + "epoch": 2.0201210826210825, + "grad_norm": 0.8163080811500549, + "learning_rate": 9.857230634457187e-05, + "loss": 0.9329, + "step": 11347 + }, + { + "epoch": 2.0202991452991452, + "grad_norm": 0.8424021005630493, + "learning_rate": 9.855831031306653e-05, + "loss": 0.8732, + "step": 11348 + }, + { + "epoch": 2.020477207977208, + "grad_norm": 0.7816365361213684, + "learning_rate": 9.854431430980808e-05, + "loss": 0.8858, + "step": 11349 + }, + { + "epoch": 2.0206552706552707, + "grad_norm": 0.7559000253677368, + "learning_rate": 9.853031833507075e-05, + "loss": 0.7146, + "step": 11350 + }, + { + "epoch": 2.0208333333333335, + "grad_norm": 0.6723140478134155, + "learning_rate": 9.85163223891288e-05, + "loss": 0.813, + "step": 11351 + }, + { + "epoch": 2.021011396011396, + "grad_norm": 0.757641077041626, + "learning_rate": 9.850232647225646e-05, + "loss": 0.794, + "step": 11352 + }, + { + "epoch": 2.0211894586894585, + "grad_norm": 0.8217115998268127, + "learning_rate": 9.848833058472787e-05, + "loss": 1.0407, + "step": 11353 + }, + { + "epoch": 2.0213675213675213, + "grad_norm": 0.8016467690467834, + "learning_rate": 9.847433472681736e-05, + "loss": 0.8967, + "step": 11354 + }, + { + "epoch": 2.021545584045584, + "grad_norm": 0.7703533172607422, + "learning_rate": 9.846033889879903e-05, + "loss": 0.9669, + "step": 11355 + }, + { + "epoch": 2.021723646723647, + "grad_norm": 0.7372044920921326, + "learning_rate": 9.84463431009472e-05, + "loss": 0.8581, + "step": 11356 + }, + { + "epoch": 2.0219017094017095, + "grad_norm": 
0.7676188945770264, + "learning_rate": 9.8432347333536e-05, + "loss": 0.8498, + "step": 11357 + }, + { + "epoch": 2.0220797720797723, + "grad_norm": 0.7485190629959106, + "learning_rate": 9.841835159683977e-05, + "loss": 0.8492, + "step": 11358 + }, + { + "epoch": 2.0222578347578346, + "grad_norm": 0.7287883758544922, + "learning_rate": 9.840435589113262e-05, + "loss": 0.9072, + "step": 11359 + }, + { + "epoch": 2.0224358974358974, + "grad_norm": 0.7719354033470154, + "learning_rate": 9.83903602166888e-05, + "loss": 0.7657, + "step": 11360 + }, + { + "epoch": 2.02261396011396, + "grad_norm": 0.7679458260536194, + "learning_rate": 9.837636457378251e-05, + "loss": 0.7098, + "step": 11361 + }, + { + "epoch": 2.022792022792023, + "grad_norm": 0.7496665120124817, + "learning_rate": 9.836236896268803e-05, + "loss": 0.8459, + "step": 11362 + }, + { + "epoch": 2.0229700854700856, + "grad_norm": 0.8511863350868225, + "learning_rate": 9.834837338367949e-05, + "loss": 0.9782, + "step": 11363 + }, + { + "epoch": 2.0231481481481484, + "grad_norm": 0.5752342343330383, + "learning_rate": 9.833437783703114e-05, + "loss": 0.4539, + "step": 11364 + }, + { + "epoch": 2.0233262108262107, + "grad_norm": 0.6654593348503113, + "learning_rate": 9.832038232301722e-05, + "loss": 0.8009, + "step": 11365 + }, + { + "epoch": 2.0235042735042734, + "grad_norm": 0.7296777963638306, + "learning_rate": 9.83063868419119e-05, + "loss": 0.7841, + "step": 11366 + }, + { + "epoch": 2.023682336182336, + "grad_norm": 0.8404465913772583, + "learning_rate": 9.829239139398943e-05, + "loss": 0.9152, + "step": 11367 + }, + { + "epoch": 2.023860398860399, + "grad_norm": 0.6407002806663513, + "learning_rate": 9.827839597952397e-05, + "loss": 0.6953, + "step": 11368 + }, + { + "epoch": 2.0240384615384617, + "grad_norm": 0.8107042908668518, + "learning_rate": 9.826440059878982e-05, + "loss": 0.8726, + "step": 11369 + }, + { + "epoch": 2.0242165242165244, + "grad_norm": 0.803804874420166, + "learning_rate": 
9.825040525206108e-05, + "loss": 0.8906, + "step": 11370 + }, + { + "epoch": 2.0243945868945867, + "grad_norm": 0.7625358700752258, + "learning_rate": 9.823640993961205e-05, + "loss": 0.8938, + "step": 11371 + }, + { + "epoch": 2.0245726495726495, + "grad_norm": 0.690793514251709, + "learning_rate": 9.822241466171686e-05, + "loss": 0.7926, + "step": 11372 + }, + { + "epoch": 2.0247507122507122, + "grad_norm": 0.7006554007530212, + "learning_rate": 9.820841941864983e-05, + "loss": 0.793, + "step": 11373 + }, + { + "epoch": 2.024928774928775, + "grad_norm": 0.8029078841209412, + "learning_rate": 9.819442421068504e-05, + "loss": 0.867, + "step": 11374 + }, + { + "epoch": 2.0251068376068377, + "grad_norm": 0.6999112367630005, + "learning_rate": 9.818042903809678e-05, + "loss": 0.688, + "step": 11375 + }, + { + "epoch": 2.0252849002849005, + "grad_norm": 0.6848462224006653, + "learning_rate": 9.816643390115923e-05, + "loss": 0.7337, + "step": 11376 + }, + { + "epoch": 2.025462962962963, + "grad_norm": 0.7698155641555786, + "learning_rate": 9.815243880014663e-05, + "loss": 0.9712, + "step": 11377 + }, + { + "epoch": 2.0256410256410255, + "grad_norm": 0.8449836373329163, + "learning_rate": 9.81384437353331e-05, + "loss": 0.9144, + "step": 11378 + }, + { + "epoch": 2.0258190883190883, + "grad_norm": 0.6340110301971436, + "learning_rate": 9.812444870699296e-05, + "loss": 0.6365, + "step": 11379 + }, + { + "epoch": 2.025997150997151, + "grad_norm": 0.7104073762893677, + "learning_rate": 9.81104537154003e-05, + "loss": 0.7781, + "step": 11380 + }, + { + "epoch": 2.026175213675214, + "grad_norm": 0.7287606000900269, + "learning_rate": 9.809645876082939e-05, + "loss": 0.9351, + "step": 11381 + }, + { + "epoch": 2.0263532763532766, + "grad_norm": 0.9640787243843079, + "learning_rate": 9.80824638435544e-05, + "loss": 0.8745, + "step": 11382 + }, + { + "epoch": 2.026531339031339, + "grad_norm": 0.5718010067939758, + "learning_rate": 9.806846896384959e-05, + "loss": 0.4711, + 
"step": 11383 + }, + { + "epoch": 2.0267094017094016, + "grad_norm": 0.7903527021408081, + "learning_rate": 9.805447412198907e-05, + "loss": 0.8241, + "step": 11384 + }, + { + "epoch": 2.0268874643874644, + "grad_norm": 0.8579357862472534, + "learning_rate": 9.80404793182471e-05, + "loss": 0.8621, + "step": 11385 + }, + { + "epoch": 2.027065527065527, + "grad_norm": 0.8466464877128601, + "learning_rate": 9.802648455289787e-05, + "loss": 0.8772, + "step": 11386 + }, + { + "epoch": 2.02724358974359, + "grad_norm": 0.7888286709785461, + "learning_rate": 9.801248982621557e-05, + "loss": 0.8352, + "step": 11387 + }, + { + "epoch": 2.0274216524216526, + "grad_norm": 0.6967005133628845, + "learning_rate": 9.799849513847444e-05, + "loss": 0.7936, + "step": 11388 + }, + { + "epoch": 2.027599715099715, + "grad_norm": 0.6987027525901794, + "learning_rate": 9.79845004899486e-05, + "loss": 0.7778, + "step": 11389 + }, + { + "epoch": 2.0277777777777777, + "grad_norm": 0.7414312362670898, + "learning_rate": 9.797050588091233e-05, + "loss": 0.9017, + "step": 11390 + }, + { + "epoch": 2.0279558404558404, + "grad_norm": 0.7932028770446777, + "learning_rate": 9.795651131163974e-05, + "loss": 0.8662, + "step": 11391 + }, + { + "epoch": 2.028133903133903, + "grad_norm": 0.8166332244873047, + "learning_rate": 9.79425167824051e-05, + "loss": 1.0489, + "step": 11392 + }, + { + "epoch": 2.028311965811966, + "grad_norm": 0.7265253663063049, + "learning_rate": 9.792852229348251e-05, + "loss": 0.9458, + "step": 11393 + }, + { + "epoch": 2.0284900284900287, + "grad_norm": 0.7374703288078308, + "learning_rate": 9.791452784514629e-05, + "loss": 0.9203, + "step": 11394 + }, + { + "epoch": 2.028668091168091, + "grad_norm": 0.6912441253662109, + "learning_rate": 9.790053343767052e-05, + "loss": 0.8986, + "step": 11395 + }, + { + "epoch": 2.0288461538461537, + "grad_norm": 0.871231734752655, + "learning_rate": 9.788653907132946e-05, + "loss": 0.6811, + "step": 11396 + }, + { + "epoch": 
2.0290242165242165, + "grad_norm": 0.7361812591552734, + "learning_rate": 9.787254474639726e-05, + "loss": 0.7868, + "step": 11397 + }, + { + "epoch": 2.0292022792022792, + "grad_norm": 0.6828895211219788, + "learning_rate": 9.785855046314815e-05, + "loss": 0.7739, + "step": 11398 + }, + { + "epoch": 2.029380341880342, + "grad_norm": 0.7203328609466553, + "learning_rate": 9.784455622185626e-05, + "loss": 0.6474, + "step": 11399 + }, + { + "epoch": 2.0295584045584047, + "grad_norm": 0.774886429309845, + "learning_rate": 9.783056202279587e-05, + "loss": 0.8073, + "step": 11400 + }, + { + "epoch": 2.029736467236467, + "grad_norm": 0.6479005813598633, + "learning_rate": 9.781656786624106e-05, + "loss": 0.7237, + "step": 11401 + }, + { + "epoch": 2.02991452991453, + "grad_norm": 0.7269866466522217, + "learning_rate": 9.78025737524661e-05, + "loss": 0.9089, + "step": 11402 + }, + { + "epoch": 2.0300925925925926, + "grad_norm": 0.7265415191650391, + "learning_rate": 9.778857968174509e-05, + "loss": 0.827, + "step": 11403 + }, + { + "epoch": 2.0302706552706553, + "grad_norm": 0.8174277544021606, + "learning_rate": 9.777458565435227e-05, + "loss": 0.6752, + "step": 11404 + }, + { + "epoch": 2.030448717948718, + "grad_norm": 0.9333333969116211, + "learning_rate": 9.77605916705619e-05, + "loss": 0.9542, + "step": 11405 + }, + { + "epoch": 2.030626780626781, + "grad_norm": 0.6854027509689331, + "learning_rate": 9.774659773064801e-05, + "loss": 0.8526, + "step": 11406 + }, + { + "epoch": 2.030804843304843, + "grad_norm": 0.7711043357849121, + "learning_rate": 9.773260383488489e-05, + "loss": 0.9009, + "step": 11407 + }, + { + "epoch": 2.030982905982906, + "grad_norm": 0.6915287971496582, + "learning_rate": 9.771860998354667e-05, + "loss": 0.9635, + "step": 11408 + }, + { + "epoch": 2.0311609686609686, + "grad_norm": 0.7978841066360474, + "learning_rate": 9.770461617690758e-05, + "loss": 0.7563, + "step": 11409 + }, + { + "epoch": 2.0313390313390314, + "grad_norm": 
0.6686414480209351, + "learning_rate": 9.769062241524172e-05, + "loss": 0.8282, + "step": 11410 + }, + { + "epoch": 2.031517094017094, + "grad_norm": 0.7024029493331909, + "learning_rate": 9.767662869882335e-05, + "loss": 0.9176, + "step": 11411 + }, + { + "epoch": 2.031695156695157, + "grad_norm": 0.6945844292640686, + "learning_rate": 9.766263502792659e-05, + "loss": 0.86, + "step": 11412 + }, + { + "epoch": 2.031873219373219, + "grad_norm": 0.7351676821708679, + "learning_rate": 9.764864140282569e-05, + "loss": 0.865, + "step": 11413 + }, + { + "epoch": 2.032051282051282, + "grad_norm": 0.7663825750350952, + "learning_rate": 9.763464782379472e-05, + "loss": 0.9309, + "step": 11414 + }, + { + "epoch": 2.0322293447293447, + "grad_norm": 0.7552894949913025, + "learning_rate": 9.762065429110798e-05, + "loss": 0.8366, + "step": 11415 + }, + { + "epoch": 2.0324074074074074, + "grad_norm": 0.6852208971977234, + "learning_rate": 9.760666080503951e-05, + "loss": 0.9095, + "step": 11416 + }, + { + "epoch": 2.03258547008547, + "grad_norm": 0.7759820222854614, + "learning_rate": 9.759266736586358e-05, + "loss": 0.7461, + "step": 11417 + }, + { + "epoch": 2.032763532763533, + "grad_norm": 0.6514183878898621, + "learning_rate": 9.757867397385431e-05, + "loss": 0.5479, + "step": 11418 + }, + { + "epoch": 2.0329415954415953, + "grad_norm": 0.7703103423118591, + "learning_rate": 9.756468062928593e-05, + "loss": 0.9588, + "step": 11419 + }, + { + "epoch": 2.033119658119658, + "grad_norm": 0.6937198638916016, + "learning_rate": 9.755068733243255e-05, + "loss": 0.8661, + "step": 11420 + }, + { + "epoch": 2.0332977207977208, + "grad_norm": 0.6675645112991333, + "learning_rate": 9.753669408356835e-05, + "loss": 0.7484, + "step": 11421 + }, + { + "epoch": 2.0334757834757835, + "grad_norm": 0.6653266549110413, + "learning_rate": 9.752270088296753e-05, + "loss": 0.7217, + "step": 11422 + }, + { + "epoch": 2.0336538461538463, + "grad_norm": 0.7893908023834229, + "learning_rate": 
9.750870773090425e-05, + "loss": 0.6346, + "step": 11423 + }, + { + "epoch": 2.033831908831909, + "grad_norm": 0.7442745566368103, + "learning_rate": 9.749471462765265e-05, + "loss": 0.9452, + "step": 11424 + }, + { + "epoch": 2.0340099715099713, + "grad_norm": 0.8270035982131958, + "learning_rate": 9.748072157348691e-05, + "loss": 0.9906, + "step": 11425 + }, + { + "epoch": 2.034188034188034, + "grad_norm": 0.7195143699645996, + "learning_rate": 9.746672856868123e-05, + "loss": 0.9564, + "step": 11426 + }, + { + "epoch": 2.034366096866097, + "grad_norm": 0.75486820936203, + "learning_rate": 9.745273561350971e-05, + "loss": 0.9052, + "step": 11427 + }, + { + "epoch": 2.0345441595441596, + "grad_norm": 0.6710293889045715, + "learning_rate": 9.743874270824655e-05, + "loss": 0.6801, + "step": 11428 + }, + { + "epoch": 2.0347222222222223, + "grad_norm": 0.704175591468811, + "learning_rate": 9.742474985316588e-05, + "loss": 0.8619, + "step": 11429 + }, + { + "epoch": 2.034900284900285, + "grad_norm": 0.7941717505455017, + "learning_rate": 9.741075704854196e-05, + "loss": 0.8318, + "step": 11430 + }, + { + "epoch": 2.0350783475783474, + "grad_norm": 0.8592050671577454, + "learning_rate": 9.739676429464881e-05, + "loss": 0.8203, + "step": 11431 + }, + { + "epoch": 2.03525641025641, + "grad_norm": 0.9149407148361206, + "learning_rate": 9.738277159176068e-05, + "loss": 0.87, + "step": 11432 + }, + { + "epoch": 2.035434472934473, + "grad_norm": 0.780890941619873, + "learning_rate": 9.736877894015169e-05, + "loss": 0.6971, + "step": 11433 + }, + { + "epoch": 2.0356125356125356, + "grad_norm": 0.7540209293365479, + "learning_rate": 9.735478634009605e-05, + "loss": 0.8927, + "step": 11434 + }, + { + "epoch": 2.0357905982905984, + "grad_norm": 0.8556281924247742, + "learning_rate": 9.734079379186782e-05, + "loss": 0.7498, + "step": 11435 + }, + { + "epoch": 2.035968660968661, + "grad_norm": 0.8710931539535522, + "learning_rate": 9.732680129574128e-05, + "loss": 0.6009, + "step": 
11436 + }, + { + "epoch": 2.0361467236467234, + "grad_norm": 0.6873082518577576, + "learning_rate": 9.731280885199045e-05, + "loss": 0.8441, + "step": 11437 + }, + { + "epoch": 2.036324786324786, + "grad_norm": 0.8333037495613098, + "learning_rate": 9.729881646088958e-05, + "loss": 0.888, + "step": 11438 + }, + { + "epoch": 2.036502849002849, + "grad_norm": 0.859365701675415, + "learning_rate": 9.728482412271277e-05, + "loss": 1.0272, + "step": 11439 + }, + { + "epoch": 2.0366809116809117, + "grad_norm": 0.7239334583282471, + "learning_rate": 9.727083183773423e-05, + "loss": 0.9428, + "step": 11440 + }, + { + "epoch": 2.0368589743589745, + "grad_norm": 0.8341524004936218, + "learning_rate": 9.725683960622804e-05, + "loss": 0.9275, + "step": 11441 + }, + { + "epoch": 2.037037037037037, + "grad_norm": 0.6992602348327637, + "learning_rate": 9.724284742846838e-05, + "loss": 0.8492, + "step": 11442 + }, + { + "epoch": 2.0372150997150995, + "grad_norm": 0.7429133057594299, + "learning_rate": 9.72288553047294e-05, + "loss": 0.8246, + "step": 11443 + }, + { + "epoch": 2.0373931623931623, + "grad_norm": 0.7765250205993652, + "learning_rate": 9.721486323528522e-05, + "loss": 0.8624, + "step": 11444 + }, + { + "epoch": 2.037571225071225, + "grad_norm": 0.9104889631271362, + "learning_rate": 9.720087122041007e-05, + "loss": 0.8369, + "step": 11445 + }, + { + "epoch": 2.0377492877492878, + "grad_norm": 0.6483191251754761, + "learning_rate": 9.718687926037798e-05, + "loss": 0.7347, + "step": 11446 + }, + { + "epoch": 2.0379273504273505, + "grad_norm": 0.7816178202629089, + "learning_rate": 9.717288735546317e-05, + "loss": 0.8607, + "step": 11447 + }, + { + "epoch": 2.0381054131054133, + "grad_norm": 0.6909009218215942, + "learning_rate": 9.715889550593975e-05, + "loss": 0.8764, + "step": 11448 + }, + { + "epoch": 2.0382834757834756, + "grad_norm": 0.8101255297660828, + "learning_rate": 9.71449037120819e-05, + "loss": 0.8858, + "step": 11449 + }, + { + "epoch": 
2.0384615384615383, + "grad_norm": 0.7476511001586914, + "learning_rate": 9.71309119741637e-05, + "loss": 0.8765, + "step": 11450 + }, + { + "epoch": 2.038639601139601, + "grad_norm": 0.7514875531196594, + "learning_rate": 9.711692029245934e-05, + "loss": 0.942, + "step": 11451 + }, + { + "epoch": 2.038817663817664, + "grad_norm": 0.7400087118148804, + "learning_rate": 9.710292866724292e-05, + "loss": 0.8327, + "step": 11452 + }, + { + "epoch": 2.0389957264957266, + "grad_norm": 0.832979142665863, + "learning_rate": 9.70889370987886e-05, + "loss": 0.9714, + "step": 11453 + }, + { + "epoch": 2.0391737891737893, + "grad_norm": 0.6918326616287231, + "learning_rate": 9.70749455873705e-05, + "loss": 0.7765, + "step": 11454 + }, + { + "epoch": 2.0393518518518516, + "grad_norm": 0.8286036849021912, + "learning_rate": 9.70609541332628e-05, + "loss": 0.9138, + "step": 11455 + }, + { + "epoch": 2.0395299145299144, + "grad_norm": 0.6436729431152344, + "learning_rate": 9.704696273673955e-05, + "loss": 0.738, + "step": 11456 + }, + { + "epoch": 2.039707977207977, + "grad_norm": 0.7057681679725647, + "learning_rate": 9.703297139807496e-05, + "loss": 0.8107, + "step": 11457 + }, + { + "epoch": 2.03988603988604, + "grad_norm": 0.7444550395011902, + "learning_rate": 9.701898011754313e-05, + "loss": 0.8188, + "step": 11458 + }, + { + "epoch": 2.0400641025641026, + "grad_norm": 0.7622130513191223, + "learning_rate": 9.70049888954182e-05, + "loss": 0.8451, + "step": 11459 + }, + { + "epoch": 2.0402421652421654, + "grad_norm": 0.8166092038154602, + "learning_rate": 9.699099773197426e-05, + "loss": 0.9399, + "step": 11460 + }, + { + "epoch": 2.0404202279202277, + "grad_norm": 0.7235924601554871, + "learning_rate": 9.697700662748552e-05, + "loss": 0.7863, + "step": 11461 + }, + { + "epoch": 2.0405982905982905, + "grad_norm": 0.7150312662124634, + "learning_rate": 9.696301558222601e-05, + "loss": 0.8288, + "step": 11462 + }, + { + "epoch": 2.040776353276353, + "grad_norm": 
0.8007016777992249, + "learning_rate": 9.694902459646993e-05, + "loss": 0.9203, + "step": 11463 + }, + { + "epoch": 2.040954415954416, + "grad_norm": 0.7665491700172424, + "learning_rate": 9.693503367049134e-05, + "loss": 0.7956, + "step": 11464 + }, + { + "epoch": 2.0411324786324787, + "grad_norm": 0.7499460577964783, + "learning_rate": 9.692104280456439e-05, + "loss": 0.9973, + "step": 11465 + }, + { + "epoch": 2.0413105413105415, + "grad_norm": 0.7598159909248352, + "learning_rate": 9.690705199896327e-05, + "loss": 0.95, + "step": 11466 + }, + { + "epoch": 2.041488603988604, + "grad_norm": 0.7699945569038391, + "learning_rate": 9.689306125396201e-05, + "loss": 0.8731, + "step": 11467 + }, + { + "epoch": 2.0416666666666665, + "grad_norm": 0.6724731922149658, + "learning_rate": 9.687907056983476e-05, + "loss": 0.906, + "step": 11468 + }, + { + "epoch": 2.0418447293447293, + "grad_norm": 0.9238275289535522, + "learning_rate": 9.686507994685562e-05, + "loss": 0.8397, + "step": 11469 + }, + { + "epoch": 2.042022792022792, + "grad_norm": 0.744969367980957, + "learning_rate": 9.685108938529876e-05, + "loss": 0.8436, + "step": 11470 + }, + { + "epoch": 2.0422008547008548, + "grad_norm": 0.6983298063278198, + "learning_rate": 9.683709888543824e-05, + "loss": 0.8235, + "step": 11471 + }, + { + "epoch": 2.0423789173789175, + "grad_norm": 0.7098708748817444, + "learning_rate": 9.682310844754824e-05, + "loss": 0.8235, + "step": 11472 + }, + { + "epoch": 2.04255698005698, + "grad_norm": 0.7492793798446655, + "learning_rate": 9.680911807190277e-05, + "loss": 0.7988, + "step": 11473 + }, + { + "epoch": 2.0427350427350426, + "grad_norm": 0.6952250003814697, + "learning_rate": 9.679512775877604e-05, + "loss": 0.7928, + "step": 11474 + }, + { + "epoch": 2.0429131054131053, + "grad_norm": 0.6442983150482178, + "learning_rate": 9.678113750844209e-05, + "loss": 0.8206, + "step": 11475 + }, + { + "epoch": 2.043091168091168, + "grad_norm": 0.7408245205879211, + "learning_rate": 
9.67671473211751e-05, + "loss": 0.6941, + "step": 11476 + }, + { + "epoch": 2.043269230769231, + "grad_norm": 0.8277738094329834, + "learning_rate": 9.675315719724913e-05, + "loss": 1.3153, + "step": 11477 + }, + { + "epoch": 2.0434472934472936, + "grad_norm": 0.7535714507102966, + "learning_rate": 9.67391671369383e-05, + "loss": 0.9238, + "step": 11478 + }, + { + "epoch": 2.0436253561253563, + "grad_norm": 0.8341996073722839, + "learning_rate": 9.67251771405167e-05, + "loss": 0.8149, + "step": 11479 + }, + { + "epoch": 2.0438034188034186, + "grad_norm": 0.7365956902503967, + "learning_rate": 9.671118720825849e-05, + "loss": 0.799, + "step": 11480 + }, + { + "epoch": 2.0439814814814814, + "grad_norm": 0.7630738615989685, + "learning_rate": 9.669719734043769e-05, + "loss": 0.9284, + "step": 11481 + }, + { + "epoch": 2.044159544159544, + "grad_norm": 0.659172773361206, + "learning_rate": 9.668320753732848e-05, + "loss": 0.7594, + "step": 11482 + }, + { + "epoch": 2.044337606837607, + "grad_norm": 0.7724705934524536, + "learning_rate": 9.66692177992049e-05, + "loss": 0.8623, + "step": 11483 + }, + { + "epoch": 2.0445156695156697, + "grad_norm": 0.7140040993690491, + "learning_rate": 9.665522812634108e-05, + "loss": 0.851, + "step": 11484 + }, + { + "epoch": 2.0446937321937324, + "grad_norm": 0.9072890877723694, + "learning_rate": 9.664123851901115e-05, + "loss": 0.9459, + "step": 11485 + }, + { + "epoch": 2.0448717948717947, + "grad_norm": 0.8145443201065063, + "learning_rate": 9.662724897748915e-05, + "loss": 0.9067, + "step": 11486 + }, + { + "epoch": 2.0450498575498575, + "grad_norm": 0.8471246957778931, + "learning_rate": 9.661325950204922e-05, + "loss": 0.7194, + "step": 11487 + }, + { + "epoch": 2.04522792022792, + "grad_norm": 0.8465375304222107, + "learning_rate": 9.659927009296541e-05, + "loss": 0.9495, + "step": 11488 + }, + { + "epoch": 2.045405982905983, + "grad_norm": 0.7597832083702087, + "learning_rate": 9.658528075051185e-05, + "loss": 0.7526, + 
"step": 11489 + }, + { + "epoch": 2.0455840455840457, + "grad_norm": 0.8013564944267273, + "learning_rate": 9.657129147496261e-05, + "loss": 1.0514, + "step": 11490 + }, + { + "epoch": 2.0457621082621085, + "grad_norm": 0.8695764541625977, + "learning_rate": 9.655730226659182e-05, + "loss": 0.9925, + "step": 11491 + }, + { + "epoch": 2.0459401709401708, + "grad_norm": 0.7295607328414917, + "learning_rate": 9.65433131256735e-05, + "loss": 0.8652, + "step": 11492 + }, + { + "epoch": 2.0461182336182335, + "grad_norm": 0.7819971442222595, + "learning_rate": 9.652932405248181e-05, + "loss": 0.8601, + "step": 11493 + }, + { + "epoch": 2.0462962962962963, + "grad_norm": 0.7244205474853516, + "learning_rate": 9.651533504729078e-05, + "loss": 0.752, + "step": 11494 + }, + { + "epoch": 2.046474358974359, + "grad_norm": 0.7774363160133362, + "learning_rate": 9.650134611037456e-05, + "loss": 0.8638, + "step": 11495 + }, + { + "epoch": 2.046652421652422, + "grad_norm": 0.7955372929573059, + "learning_rate": 9.648735724200715e-05, + "loss": 0.9662, + "step": 11496 + }, + { + "epoch": 2.0468304843304845, + "grad_norm": 0.7114127278327942, + "learning_rate": 9.647336844246273e-05, + "loss": 0.9523, + "step": 11497 + }, + { + "epoch": 2.047008547008547, + "grad_norm": 0.7449100017547607, + "learning_rate": 9.645937971201527e-05, + "loss": 0.7898, + "step": 11498 + }, + { + "epoch": 2.0471866096866096, + "grad_norm": 0.7541512846946716, + "learning_rate": 9.644539105093895e-05, + "loss": 0.9286, + "step": 11499 + }, + { + "epoch": 2.0473646723646723, + "grad_norm": 0.6816682815551758, + "learning_rate": 9.643140245950778e-05, + "loss": 0.7757, + "step": 11500 + }, + { + "epoch": 2.047542735042735, + "grad_norm": 0.7222850918769836, + "learning_rate": 9.641741393799591e-05, + "loss": 0.8415, + "step": 11501 + }, + { + "epoch": 2.047720797720798, + "grad_norm": 0.7605552077293396, + "learning_rate": 9.640342548667732e-05, + "loss": 0.8875, + "step": 11502 + }, + { + "epoch": 
2.0478988603988606, + "grad_norm": 0.7442240118980408, + "learning_rate": 9.638943710582615e-05, + "loss": 0.8755, + "step": 11503 + }, + { + "epoch": 2.048076923076923, + "grad_norm": 0.7065736651420593, + "learning_rate": 9.637544879571648e-05, + "loss": 0.6885, + "step": 11504 + }, + { + "epoch": 2.0482549857549857, + "grad_norm": 0.6400303244590759, + "learning_rate": 9.636146055662232e-05, + "loss": 0.5775, + "step": 11505 + }, + { + "epoch": 2.0484330484330484, + "grad_norm": 0.7955389022827148, + "learning_rate": 9.634747238881783e-05, + "loss": 1.0182, + "step": 11506 + }, + { + "epoch": 2.048611111111111, + "grad_norm": 0.8283255696296692, + "learning_rate": 9.6333484292577e-05, + "loss": 0.9247, + "step": 11507 + }, + { + "epoch": 2.048789173789174, + "grad_norm": 0.7619521617889404, + "learning_rate": 9.631949626817399e-05, + "loss": 0.8355, + "step": 11508 + }, + { + "epoch": 2.0489672364672367, + "grad_norm": 0.7204191088676453, + "learning_rate": 9.630550831588273e-05, + "loss": 0.8571, + "step": 11509 + }, + { + "epoch": 2.049145299145299, + "grad_norm": 0.7568399310112, + "learning_rate": 9.629152043597738e-05, + "loss": 0.7349, + "step": 11510 + }, + { + "epoch": 2.0493233618233617, + "grad_norm": 0.8594959378242493, + "learning_rate": 9.627753262873199e-05, + "loss": 0.6918, + "step": 11511 + }, + { + "epoch": 2.0495014245014245, + "grad_norm": 0.6345391869544983, + "learning_rate": 9.626354489442064e-05, + "loss": 0.871, + "step": 11512 + }, + { + "epoch": 2.0496794871794872, + "grad_norm": 0.7671827673912048, + "learning_rate": 9.624955723331732e-05, + "loss": 0.8518, + "step": 11513 + }, + { + "epoch": 2.04985754985755, + "grad_norm": 0.7182049751281738, + "learning_rate": 9.623556964569616e-05, + "loss": 0.889, + "step": 11514 + }, + { + "epoch": 2.0500356125356127, + "grad_norm": 0.8342016339302063, + "learning_rate": 9.622158213183118e-05, + "loss": 0.9712, + "step": 11515 + }, + { + "epoch": 2.050213675213675, + "grad_norm": 
0.7007761001586914, + "learning_rate": 9.620759469199649e-05, + "loss": 0.8113, + "step": 11516 + }, + { + "epoch": 2.050391737891738, + "grad_norm": 0.7129531502723694, + "learning_rate": 9.619360732646605e-05, + "loss": 0.8666, + "step": 11517 + }, + { + "epoch": 2.0505698005698005, + "grad_norm": 0.7505812048912048, + "learning_rate": 9.6179620035514e-05, + "loss": 0.7321, + "step": 11518 + }, + { + "epoch": 2.0507478632478633, + "grad_norm": 0.7407607436180115, + "learning_rate": 9.616563281941433e-05, + "loss": 0.9275, + "step": 11519 + }, + { + "epoch": 2.050925925925926, + "grad_norm": 0.769345223903656, + "learning_rate": 9.615164567844116e-05, + "loss": 0.9731, + "step": 11520 + }, + { + "epoch": 2.051103988603989, + "grad_norm": 0.7782812118530273, + "learning_rate": 9.613765861286846e-05, + "loss": 0.9702, + "step": 11521 + }, + { + "epoch": 2.051282051282051, + "grad_norm": 0.7071413993835449, + "learning_rate": 9.612367162297037e-05, + "loss": 0.8451, + "step": 11522 + }, + { + "epoch": 2.051460113960114, + "grad_norm": 0.7598503232002258, + "learning_rate": 9.610968470902082e-05, + "loss": 0.8641, + "step": 11523 + }, + { + "epoch": 2.0516381766381766, + "grad_norm": 0.7951003313064575, + "learning_rate": 9.609569787129394e-05, + "loss": 0.9131, + "step": 11524 + }, + { + "epoch": 2.0518162393162394, + "grad_norm": 0.8029175996780396, + "learning_rate": 9.608171111006374e-05, + "loss": 0.8618, + "step": 11525 + }, + { + "epoch": 2.051994301994302, + "grad_norm": 0.6993120908737183, + "learning_rate": 9.606772442560428e-05, + "loss": 0.8487, + "step": 11526 + }, + { + "epoch": 2.052172364672365, + "grad_norm": 0.8039231896400452, + "learning_rate": 9.605373781818961e-05, + "loss": 1.0102, + "step": 11527 + }, + { + "epoch": 2.052350427350427, + "grad_norm": 0.714849054813385, + "learning_rate": 9.603975128809373e-05, + "loss": 0.8977, + "step": 11528 + }, + { + "epoch": 2.05252849002849, + "grad_norm": 0.8728037476539612, + "learning_rate": 
9.60257648355907e-05, + "loss": 0.8004, + "step": 11529 + }, + { + "epoch": 2.0527065527065527, + "grad_norm": 0.764776885509491, + "learning_rate": 9.601177846095454e-05, + "loss": 0.9205, + "step": 11530 + }, + { + "epoch": 2.0528846153846154, + "grad_norm": 0.6948725581169128, + "learning_rate": 9.599779216445934e-05, + "loss": 0.7864, + "step": 11531 + }, + { + "epoch": 2.053062678062678, + "grad_norm": 0.7663996815681458, + "learning_rate": 9.598380594637903e-05, + "loss": 0.8877, + "step": 11532 + }, + { + "epoch": 2.053240740740741, + "grad_norm": 0.7584146857261658, + "learning_rate": 9.596981980698776e-05, + "loss": 1.1328, + "step": 11533 + }, + { + "epoch": 2.0534188034188032, + "grad_norm": 0.7701094150543213, + "learning_rate": 9.595583374655945e-05, + "loss": 0.7551, + "step": 11534 + }, + { + "epoch": 2.053596866096866, + "grad_norm": 0.7745714783668518, + "learning_rate": 9.594184776536821e-05, + "loss": 0.8862, + "step": 11535 + }, + { + "epoch": 2.0537749287749287, + "grad_norm": 0.7832430005073547, + "learning_rate": 9.5927861863688e-05, + "loss": 0.8736, + "step": 11536 + }, + { + "epoch": 2.0539529914529915, + "grad_norm": 0.7354840040206909, + "learning_rate": 9.591387604179291e-05, + "loss": 0.8183, + "step": 11537 + }, + { + "epoch": 2.0541310541310542, + "grad_norm": 0.7516480684280396, + "learning_rate": 9.589989029995691e-05, + "loss": 0.924, + "step": 11538 + }, + { + "epoch": 2.054309116809117, + "grad_norm": 0.7942310571670532, + "learning_rate": 9.588590463845405e-05, + "loss": 1.0283, + "step": 11539 + }, + { + "epoch": 2.0544871794871793, + "grad_norm": 0.7716572880744934, + "learning_rate": 9.587191905755832e-05, + "loss": 0.8686, + "step": 11540 + }, + { + "epoch": 2.054665242165242, + "grad_norm": 0.8075140118598938, + "learning_rate": 9.585793355754381e-05, + "loss": 0.8731, + "step": 11541 + }, + { + "epoch": 2.054843304843305, + "grad_norm": 0.8119283318519592, + "learning_rate": 9.584394813868444e-05, + "loss": 0.9543, + 
"step": 11542 + }, + { + "epoch": 2.0550213675213675, + "grad_norm": 0.6476314067840576, + "learning_rate": 9.582996280125427e-05, + "loss": 0.6943, + "step": 11543 + }, + { + "epoch": 2.0551994301994303, + "grad_norm": 0.7617185711860657, + "learning_rate": 9.581597754552737e-05, + "loss": 0.6942, + "step": 11544 + }, + { + "epoch": 2.055377492877493, + "grad_norm": 0.879355788230896, + "learning_rate": 9.580199237177765e-05, + "loss": 0.825, + "step": 11545 + }, + { + "epoch": 2.0555555555555554, + "grad_norm": 0.8229055404663086, + "learning_rate": 9.578800728027919e-05, + "loss": 0.9973, + "step": 11546 + }, + { + "epoch": 2.055733618233618, + "grad_norm": 0.7808930277824402, + "learning_rate": 9.577402227130596e-05, + "loss": 0.9525, + "step": 11547 + }, + { + "epoch": 2.055911680911681, + "grad_norm": 0.870499849319458, + "learning_rate": 9.576003734513201e-05, + "loss": 0.8874, + "step": 11548 + }, + { + "epoch": 2.0560897435897436, + "grad_norm": 0.8254318833351135, + "learning_rate": 9.57460525020313e-05, + "loss": 0.6656, + "step": 11549 + }, + { + "epoch": 2.0562678062678064, + "grad_norm": 0.8358132243156433, + "learning_rate": 9.573206774227786e-05, + "loss": 0.7946, + "step": 11550 + }, + { + "epoch": 2.056445868945869, + "grad_norm": 0.636366605758667, + "learning_rate": 9.571808306614568e-05, + "loss": 0.6757, + "step": 11551 + }, + { + "epoch": 2.0566239316239314, + "grad_norm": 0.8884546160697937, + "learning_rate": 9.57040984739088e-05, + "loss": 0.6775, + "step": 11552 + }, + { + "epoch": 2.056801994301994, + "grad_norm": 0.7240797877311707, + "learning_rate": 9.569011396584115e-05, + "loss": 0.8033, + "step": 11553 + }, + { + "epoch": 2.056980056980057, + "grad_norm": 0.8730767965316772, + "learning_rate": 9.567612954221678e-05, + "loss": 0.9577, + "step": 11554 + }, + { + "epoch": 2.0571581196581197, + "grad_norm": 0.6785064339637756, + "learning_rate": 9.566214520330966e-05, + "loss": 0.6241, + "step": 11555 + }, + { + "epoch": 
2.0573361823361824, + "grad_norm": 0.7757805585861206, + "learning_rate": 9.564816094939382e-05, + "loss": 0.7926, + "step": 11556 + }, + { + "epoch": 2.057514245014245, + "grad_norm": 0.7630164623260498, + "learning_rate": 9.563417678074319e-05, + "loss": 0.8547, + "step": 11557 + }, + { + "epoch": 2.0576923076923075, + "grad_norm": 0.7690725922584534, + "learning_rate": 9.562019269763184e-05, + "loss": 0.9172, + "step": 11558 + }, + { + "epoch": 2.0578703703703702, + "grad_norm": 0.81644207239151, + "learning_rate": 9.560620870033367e-05, + "loss": 0.811, + "step": 11559 + }, + { + "epoch": 2.058048433048433, + "grad_norm": 0.8240723013877869, + "learning_rate": 9.559222478912273e-05, + "loss": 0.9094, + "step": 11560 + }, + { + "epoch": 2.0582264957264957, + "grad_norm": 0.7168204188346863, + "learning_rate": 9.557824096427297e-05, + "loss": 1.0617, + "step": 11561 + }, + { + "epoch": 2.0584045584045585, + "grad_norm": 0.6648391485214233, + "learning_rate": 9.556425722605846e-05, + "loss": 0.6556, + "step": 11562 + }, + { + "epoch": 2.0585826210826212, + "grad_norm": 0.7291145324707031, + "learning_rate": 9.555027357475305e-05, + "loss": 0.784, + "step": 11563 + }, + { + "epoch": 2.0587606837606836, + "grad_norm": 0.6910824775695801, + "learning_rate": 9.553629001063079e-05, + "loss": 0.9332, + "step": 11564 + }, + { + "epoch": 2.0589387464387463, + "grad_norm": 0.757247805595398, + "learning_rate": 9.552230653396566e-05, + "loss": 0.6598, + "step": 11565 + }, + { + "epoch": 2.059116809116809, + "grad_norm": 0.7778435349464417, + "learning_rate": 9.550832314503163e-05, + "loss": 0.8899, + "step": 11566 + }, + { + "epoch": 2.059294871794872, + "grad_norm": 0.7827669978141785, + "learning_rate": 9.54943398441027e-05, + "loss": 0.8036, + "step": 11567 + }, + { + "epoch": 2.0594729344729346, + "grad_norm": 0.7462462186813354, + "learning_rate": 9.54803566314528e-05, + "loss": 0.9306, + "step": 11568 + }, + { + "epoch": 2.0596509971509973, + "grad_norm": 
0.8088639974594116, + "learning_rate": 9.546637350735597e-05, + "loss": 0.8766, + "step": 11569 + }, + { + "epoch": 2.0598290598290596, + "grad_norm": 0.6477743983268738, + "learning_rate": 9.545239047208607e-05, + "loss": 0.7239, + "step": 11570 + }, + { + "epoch": 2.0600071225071224, + "grad_norm": 0.7535004615783691, + "learning_rate": 9.543840752591718e-05, + "loss": 0.8891, + "step": 11571 + }, + { + "epoch": 2.060185185185185, + "grad_norm": 0.7085242867469788, + "learning_rate": 9.542442466912316e-05, + "loss": 0.8105, + "step": 11572 + }, + { + "epoch": 2.060363247863248, + "grad_norm": 0.8129137754440308, + "learning_rate": 9.541044190197811e-05, + "loss": 0.6955, + "step": 11573 + }, + { + "epoch": 2.0605413105413106, + "grad_norm": 0.7160677909851074, + "learning_rate": 9.539645922475586e-05, + "loss": 0.833, + "step": 11574 + }, + { + "epoch": 2.0607193732193734, + "grad_norm": 0.6983035206794739, + "learning_rate": 9.538247663773044e-05, + "loss": 0.6439, + "step": 11575 + }, + { + "epoch": 2.0608974358974357, + "grad_norm": 0.8732622861862183, + "learning_rate": 9.536849414117578e-05, + "loss": 0.7763, + "step": 11576 + }, + { + "epoch": 2.0610754985754984, + "grad_norm": 0.7745480537414551, + "learning_rate": 9.535451173536591e-05, + "loss": 0.8272, + "step": 11577 + }, + { + "epoch": 2.061253561253561, + "grad_norm": 0.8210037350654602, + "learning_rate": 9.53405294205747e-05, + "loss": 0.9539, + "step": 11578 + }, + { + "epoch": 2.061431623931624, + "grad_norm": 0.6742323637008667, + "learning_rate": 9.532654719707617e-05, + "loss": 0.6525, + "step": 11579 + }, + { + "epoch": 2.0616096866096867, + "grad_norm": 0.8312603831291199, + "learning_rate": 9.531256506514418e-05, + "loss": 0.7776, + "step": 11580 + }, + { + "epoch": 2.0617877492877494, + "grad_norm": 0.7817347049713135, + "learning_rate": 9.529858302505278e-05, + "loss": 1.0148, + "step": 11581 + }, + { + "epoch": 2.0619658119658117, + "grad_norm": 0.751153290271759, + "learning_rate": 
9.528460107707584e-05, + "loss": 0.8064, + "step": 11582 + }, + { + "epoch": 2.0621438746438745, + "grad_norm": 0.7483627200126648, + "learning_rate": 9.527061922148737e-05, + "loss": 0.8706, + "step": 11583 + }, + { + "epoch": 2.0623219373219372, + "grad_norm": 0.7044979929924011, + "learning_rate": 9.525663745856132e-05, + "loss": 0.8008, + "step": 11584 + }, + { + "epoch": 2.0625, + "grad_norm": 0.8249054551124573, + "learning_rate": 9.524265578857157e-05, + "loss": 0.9339, + "step": 11585 + }, + { + "epoch": 2.0626780626780628, + "grad_norm": 0.7184668183326721, + "learning_rate": 9.522867421179211e-05, + "loss": 0.9191, + "step": 11586 + }, + { + "epoch": 2.0628561253561255, + "grad_norm": 0.8135001063346863, + "learning_rate": 9.521469272849685e-05, + "loss": 0.966, + "step": 11587 + }, + { + "epoch": 2.0630341880341883, + "grad_norm": 0.8151242733001709, + "learning_rate": 9.520071133895978e-05, + "loss": 0.7947, + "step": 11588 + }, + { + "epoch": 2.0632122507122506, + "grad_norm": 0.8044771552085876, + "learning_rate": 9.518673004345477e-05, + "loss": 0.893, + "step": 11589 + }, + { + "epoch": 2.0633903133903133, + "grad_norm": 0.730505645275116, + "learning_rate": 9.517274884225581e-05, + "loss": 0.8691, + "step": 11590 + }, + { + "epoch": 2.063568376068376, + "grad_norm": 0.7419933676719666, + "learning_rate": 9.515876773563678e-05, + "loss": 0.809, + "step": 11591 + }, + { + "epoch": 2.063746438746439, + "grad_norm": 0.7809683084487915, + "learning_rate": 9.514478672387169e-05, + "loss": 0.6926, + "step": 11592 + }, + { + "epoch": 2.0639245014245016, + "grad_norm": 1.0065315961837769, + "learning_rate": 9.513080580723435e-05, + "loss": 0.7506, + "step": 11593 + }, + { + "epoch": 2.064102564102564, + "grad_norm": 0.7424543499946594, + "learning_rate": 9.511682498599883e-05, + "loss": 0.9158, + "step": 11594 + }, + { + "epoch": 2.0642806267806266, + "grad_norm": 0.6907097697257996, + "learning_rate": 9.510284426043893e-05, + "loss": 0.8795, + "step": 
11595 + }, + { + "epoch": 2.0644586894586894, + "grad_norm": 0.7849169969558716, + "learning_rate": 9.508886363082864e-05, + "loss": 1.0065, + "step": 11596 + }, + { + "epoch": 2.064636752136752, + "grad_norm": 0.7421438694000244, + "learning_rate": 9.507488309744183e-05, + "loss": 0.7574, + "step": 11597 + }, + { + "epoch": 2.064814814814815, + "grad_norm": 0.6636283993721008, + "learning_rate": 9.506090266055252e-05, + "loss": 0.7489, + "step": 11598 + }, + { + "epoch": 2.0649928774928776, + "grad_norm": 0.7133244872093201, + "learning_rate": 9.504692232043452e-05, + "loss": 0.7652, + "step": 11599 + }, + { + "epoch": 2.0651709401709404, + "grad_norm": 0.7891597151756287, + "learning_rate": 9.50329420773618e-05, + "loss": 0.7268, + "step": 11600 + }, + { + "epoch": 2.0653490028490027, + "grad_norm": 0.8578699827194214, + "learning_rate": 9.501896193160822e-05, + "loss": 0.9872, + "step": 11601 + }, + { + "epoch": 2.0655270655270654, + "grad_norm": 0.7071980834007263, + "learning_rate": 9.500498188344777e-05, + "loss": 0.7278, + "step": 11602 + }, + { + "epoch": 2.065705128205128, + "grad_norm": 0.8434318900108337, + "learning_rate": 9.499100193315436e-05, + "loss": 0.8811, + "step": 11603 + }, + { + "epoch": 2.065883190883191, + "grad_norm": 0.7429414391517639, + "learning_rate": 9.49770220810018e-05, + "loss": 0.8256, + "step": 11604 + }, + { + "epoch": 2.0660612535612537, + "grad_norm": 0.7059712409973145, + "learning_rate": 9.496304232726412e-05, + "loss": 0.8012, + "step": 11605 + }, + { + "epoch": 2.0662393162393164, + "grad_norm": 0.7095850706100464, + "learning_rate": 9.49490626722151e-05, + "loss": 0.8767, + "step": 11606 + }, + { + "epoch": 2.0664173789173788, + "grad_norm": 0.8135038018226624, + "learning_rate": 9.493508311612874e-05, + "loss": 1.1402, + "step": 11607 + }, + { + "epoch": 2.0665954415954415, + "grad_norm": 0.9023036360740662, + "learning_rate": 9.492110365927888e-05, + "loss": 0.7307, + "step": 11608 + }, + { + "epoch": 
2.0667735042735043, + "grad_norm": 0.6990833282470703, + "learning_rate": 9.490712430193949e-05, + "loss": 0.7996, + "step": 11609 + }, + { + "epoch": 2.066951566951567, + "grad_norm": 0.7765957713127136, + "learning_rate": 9.489314504438437e-05, + "loss": 0.9721, + "step": 11610 + }, + { + "epoch": 2.0671296296296298, + "grad_norm": 0.883575975894928, + "learning_rate": 9.487916588688749e-05, + "loss": 0.8778, + "step": 11611 + }, + { + "epoch": 2.0673076923076925, + "grad_norm": 0.7226536870002747, + "learning_rate": 9.48651868297227e-05, + "loss": 1.0438, + "step": 11612 + }, + { + "epoch": 2.067485754985755, + "grad_norm": 0.7399018406867981, + "learning_rate": 9.485120787316394e-05, + "loss": 0.8154, + "step": 11613 + }, + { + "epoch": 2.0676638176638176, + "grad_norm": 1.0130186080932617, + "learning_rate": 9.483722901748502e-05, + "loss": 0.9832, + "step": 11614 + }, + { + "epoch": 2.0678418803418803, + "grad_norm": 0.8163331151008606, + "learning_rate": 9.482325026295993e-05, + "loss": 0.8854, + "step": 11615 + }, + { + "epoch": 2.068019943019943, + "grad_norm": 0.7165096998214722, + "learning_rate": 9.480927160986244e-05, + "loss": 0.7636, + "step": 11616 + }, + { + "epoch": 2.068198005698006, + "grad_norm": 0.8579450845718384, + "learning_rate": 9.479529305846652e-05, + "loss": 0.9105, + "step": 11617 + }, + { + "epoch": 2.0683760683760686, + "grad_norm": 0.7062679529190063, + "learning_rate": 9.4781314609046e-05, + "loss": 0.7601, + "step": 11618 + }, + { + "epoch": 2.068554131054131, + "grad_norm": 0.8178739547729492, + "learning_rate": 9.476733626187483e-05, + "loss": 0.8393, + "step": 11619 + }, + { + "epoch": 2.0687321937321936, + "grad_norm": 0.6667241454124451, + "learning_rate": 9.475335801722678e-05, + "loss": 0.8791, + "step": 11620 + }, + { + "epoch": 2.0689102564102564, + "grad_norm": 0.6603145599365234, + "learning_rate": 9.47393798753758e-05, + "loss": 0.7073, + "step": 11621 + }, + { + "epoch": 2.069088319088319, + "grad_norm": 
0.7719821333885193, + "learning_rate": 9.472540183659573e-05, + "loss": 0.9195, + "step": 11622 + }, + { + "epoch": 2.069266381766382, + "grad_norm": 0.8059320449829102, + "learning_rate": 9.471142390116045e-05, + "loss": 0.927, + "step": 11623 + }, + { + "epoch": 2.0694444444444446, + "grad_norm": 0.6513992547988892, + "learning_rate": 9.469744606934388e-05, + "loss": 0.857, + "step": 11624 + }, + { + "epoch": 2.069622507122507, + "grad_norm": 0.6948497295379639, + "learning_rate": 9.468346834141979e-05, + "loss": 0.65, + "step": 11625 + }, + { + "epoch": 2.0698005698005697, + "grad_norm": 0.8086618781089783, + "learning_rate": 9.466949071766213e-05, + "loss": 0.8328, + "step": 11626 + }, + { + "epoch": 2.0699786324786325, + "grad_norm": 0.794731616973877, + "learning_rate": 9.465551319834468e-05, + "loss": 0.8641, + "step": 11627 + }, + { + "epoch": 2.070156695156695, + "grad_norm": 0.7312739491462708, + "learning_rate": 9.46415357837414e-05, + "loss": 0.7878, + "step": 11628 + }, + { + "epoch": 2.070334757834758, + "grad_norm": 0.8025211691856384, + "learning_rate": 9.462755847412606e-05, + "loss": 0.8624, + "step": 11629 + }, + { + "epoch": 2.0705128205128207, + "grad_norm": 0.7296801209449768, + "learning_rate": 9.461358126977259e-05, + "loss": 0.7299, + "step": 11630 + }, + { + "epoch": 2.070690883190883, + "grad_norm": 0.7176340222358704, + "learning_rate": 9.459960417095477e-05, + "loss": 0.7374, + "step": 11631 + }, + { + "epoch": 2.0708689458689458, + "grad_norm": 0.7656565308570862, + "learning_rate": 9.45856271779465e-05, + "loss": 0.7791, + "step": 11632 + }, + { + "epoch": 2.0710470085470085, + "grad_norm": 0.7232711315155029, + "learning_rate": 9.457165029102159e-05, + "loss": 0.85, + "step": 11633 + }, + { + "epoch": 2.0712250712250713, + "grad_norm": 0.7342440485954285, + "learning_rate": 9.455767351045397e-05, + "loss": 0.8423, + "step": 11634 + }, + { + "epoch": 2.071403133903134, + "grad_norm": 0.7844834923744202, + "learning_rate": 
9.45436968365174e-05, + "loss": 0.9179, + "step": 11635 + }, + { + "epoch": 2.0715811965811968, + "grad_norm": 0.8880203366279602, + "learning_rate": 9.452972026948575e-05, + "loss": 0.7139, + "step": 11636 + }, + { + "epoch": 2.071759259259259, + "grad_norm": 0.7611206769943237, + "learning_rate": 9.451574380963286e-05, + "loss": 0.8915, + "step": 11637 + }, + { + "epoch": 2.071937321937322, + "grad_norm": 0.8123503923416138, + "learning_rate": 9.450176745723262e-05, + "loss": 0.7011, + "step": 11638 + }, + { + "epoch": 2.0721153846153846, + "grad_norm": 0.7703253030776978, + "learning_rate": 9.448779121255879e-05, + "loss": 0.8493, + "step": 11639 + }, + { + "epoch": 2.0722934472934473, + "grad_norm": 0.755836009979248, + "learning_rate": 9.447381507588527e-05, + "loss": 0.9145, + "step": 11640 + }, + { + "epoch": 2.07247150997151, + "grad_norm": 0.7879568338394165, + "learning_rate": 9.445983904748583e-05, + "loss": 0.7761, + "step": 11641 + }, + { + "epoch": 2.072649572649573, + "grad_norm": 0.6695574522018433, + "learning_rate": 9.444586312763434e-05, + "loss": 0.8594, + "step": 11642 + }, + { + "epoch": 2.072827635327635, + "grad_norm": 0.6734640002250671, + "learning_rate": 9.443188731660462e-05, + "loss": 0.7324, + "step": 11643 + }, + { + "epoch": 2.073005698005698, + "grad_norm": 0.7823841571807861, + "learning_rate": 9.441791161467051e-05, + "loss": 1.0223, + "step": 11644 + }, + { + "epoch": 2.0731837606837606, + "grad_norm": 0.8152045011520386, + "learning_rate": 9.440393602210585e-05, + "loss": 1.0364, + "step": 11645 + }, + { + "epoch": 2.0733618233618234, + "grad_norm": 0.8664864897727966, + "learning_rate": 9.438996053918441e-05, + "loss": 0.7607, + "step": 11646 + }, + { + "epoch": 2.073539886039886, + "grad_norm": 0.7949544787406921, + "learning_rate": 9.437598516618006e-05, + "loss": 0.7644, + "step": 11647 + }, + { + "epoch": 2.073717948717949, + "grad_norm": 0.767045259475708, + "learning_rate": 9.436200990336657e-05, + "loss": 0.7563, + 
"step": 11648 + }, + { + "epoch": 2.073896011396011, + "grad_norm": 0.669129490852356, + "learning_rate": 9.434803475101782e-05, + "loss": 0.7644, + "step": 11649 + }, + { + "epoch": 2.074074074074074, + "grad_norm": 0.7969587445259094, + "learning_rate": 9.433405970940755e-05, + "loss": 1.0249, + "step": 11650 + }, + { + "epoch": 2.0742521367521367, + "grad_norm": 0.6744855642318726, + "learning_rate": 9.432008477880966e-05, + "loss": 0.7478, + "step": 11651 + }, + { + "epoch": 2.0744301994301995, + "grad_norm": 0.6236920356750488, + "learning_rate": 9.430610995949786e-05, + "loss": 0.7309, + "step": 11652 + }, + { + "epoch": 2.074608262108262, + "grad_norm": 0.7952008843421936, + "learning_rate": 9.429213525174603e-05, + "loss": 0.927, + "step": 11653 + }, + { + "epoch": 2.074786324786325, + "grad_norm": 0.7075965404510498, + "learning_rate": 9.427816065582792e-05, + "loss": 0.8494, + "step": 11654 + }, + { + "epoch": 2.0749643874643873, + "grad_norm": 0.8018102049827576, + "learning_rate": 9.426418617201744e-05, + "loss": 0.9261, + "step": 11655 + }, + { + "epoch": 2.07514245014245, + "grad_norm": 0.7155446410179138, + "learning_rate": 9.425021180058824e-05, + "loss": 0.8296, + "step": 11656 + }, + { + "epoch": 2.0753205128205128, + "grad_norm": 0.6611294150352478, + "learning_rate": 9.423623754181425e-05, + "loss": 0.687, + "step": 11657 + }, + { + "epoch": 2.0754985754985755, + "grad_norm": 0.706280529499054, + "learning_rate": 9.422226339596917e-05, + "loss": 0.8664, + "step": 11658 + }, + { + "epoch": 2.0756766381766383, + "grad_norm": 0.7512072324752808, + "learning_rate": 9.420828936332687e-05, + "loss": 0.9074, + "step": 11659 + }, + { + "epoch": 2.075854700854701, + "grad_norm": 0.8833743333816528, + "learning_rate": 9.419431544416108e-05, + "loss": 1.0541, + "step": 11660 + }, + { + "epoch": 2.0760327635327633, + "grad_norm": 0.6991413235664368, + "learning_rate": 9.418034163874564e-05, + "loss": 0.7543, + "step": 11661 + }, + { + "epoch": 
2.076210826210826, + "grad_norm": 0.784294605255127, + "learning_rate": 9.41663679473543e-05, + "loss": 0.8156, + "step": 11662 + }, + { + "epoch": 2.076388888888889, + "grad_norm": 0.7716241478919983, + "learning_rate": 9.415239437026086e-05, + "loss": 0.9613, + "step": 11663 + }, + { + "epoch": 2.0765669515669516, + "grad_norm": 0.8247698545455933, + "learning_rate": 9.413842090773914e-05, + "loss": 0.8811, + "step": 11664 + }, + { + "epoch": 2.0767450142450143, + "grad_norm": 0.7988204956054688, + "learning_rate": 9.412444756006283e-05, + "loss": 1.017, + "step": 11665 + }, + { + "epoch": 2.076923076923077, + "grad_norm": 0.8069472908973694, + "learning_rate": 9.411047432750583e-05, + "loss": 0.8292, + "step": 11666 + }, + { + "epoch": 2.0771011396011394, + "grad_norm": 0.8177345991134644, + "learning_rate": 9.40965012103418e-05, + "loss": 0.8004, + "step": 11667 + }, + { + "epoch": 2.077279202279202, + "grad_norm": 0.8589172959327698, + "learning_rate": 9.40825282088446e-05, + "loss": 0.8274, + "step": 11668 + }, + { + "epoch": 2.077457264957265, + "grad_norm": 0.677379846572876, + "learning_rate": 9.406855532328792e-05, + "loss": 0.7044, + "step": 11669 + }, + { + "epoch": 2.0776353276353277, + "grad_norm": 0.9417888522148132, + "learning_rate": 9.405458255394564e-05, + "loss": 0.8418, + "step": 11670 + }, + { + "epoch": 2.0778133903133904, + "grad_norm": 0.7226679921150208, + "learning_rate": 9.404060990109141e-05, + "loss": 0.8496, + "step": 11671 + }, + { + "epoch": 2.077991452991453, + "grad_norm": 0.7451614737510681, + "learning_rate": 9.402663736499909e-05, + "loss": 0.7569, + "step": 11672 + }, + { + "epoch": 2.0781695156695155, + "grad_norm": 0.6516944169998169, + "learning_rate": 9.401266494594235e-05, + "loss": 0.5591, + "step": 11673 + }, + { + "epoch": 2.078347578347578, + "grad_norm": 0.7473219633102417, + "learning_rate": 9.399869264419507e-05, + "loss": 1.0098, + "step": 11674 + }, + { + "epoch": 2.078525641025641, + "grad_norm": 
0.8346691131591797, + "learning_rate": 9.398472046003088e-05, + "loss": 1.2654, + "step": 11675 + }, + { + "epoch": 2.0787037037037037, + "grad_norm": 0.8611979484558105, + "learning_rate": 9.397074839372366e-05, + "loss": 0.571, + "step": 11676 + }, + { + "epoch": 2.0788817663817665, + "grad_norm": 0.8093259334564209, + "learning_rate": 9.395677644554705e-05, + "loss": 1.0383, + "step": 11677 + }, + { + "epoch": 2.0790598290598292, + "grad_norm": 0.7954222559928894, + "learning_rate": 9.394280461577488e-05, + "loss": 0.8078, + "step": 11678 + }, + { + "epoch": 2.0792378917378915, + "grad_norm": 0.8380635380744934, + "learning_rate": 9.392883290468083e-05, + "loss": 1.0368, + "step": 11679 + }, + { + "epoch": 2.0794159544159543, + "grad_norm": 0.8427146077156067, + "learning_rate": 9.391486131253874e-05, + "loss": 0.8638, + "step": 11680 + }, + { + "epoch": 2.079594017094017, + "grad_norm": 0.7211564779281616, + "learning_rate": 9.390088983962227e-05, + "loss": 0.8211, + "step": 11681 + }, + { + "epoch": 2.07977207977208, + "grad_norm": 0.7480773329734802, + "learning_rate": 9.388691848620517e-05, + "loss": 0.9313, + "step": 11682 + }, + { + "epoch": 2.0799501424501425, + "grad_norm": 0.8421902060508728, + "learning_rate": 9.387294725256123e-05, + "loss": 0.8808, + "step": 11683 + }, + { + "epoch": 2.0801282051282053, + "grad_norm": 0.7753815650939941, + "learning_rate": 9.385897613896416e-05, + "loss": 0.7738, + "step": 11684 + }, + { + "epoch": 2.0803062678062676, + "grad_norm": 0.8053030967712402, + "learning_rate": 9.384500514568773e-05, + "loss": 0.9194, + "step": 11685 + }, + { + "epoch": 2.0804843304843303, + "grad_norm": 0.7628602981567383, + "learning_rate": 9.383103427300559e-05, + "loss": 0.7866, + "step": 11686 + }, + { + "epoch": 2.080662393162393, + "grad_norm": 0.7087932825088501, + "learning_rate": 9.381706352119156e-05, + "loss": 0.8817, + "step": 11687 + }, + { + "epoch": 2.080840455840456, + "grad_norm": 0.8687152862548828, + "learning_rate": 
9.380309289051929e-05, + "loss": 0.8902, + "step": 11688 + }, + { + "epoch": 2.0810185185185186, + "grad_norm": 0.8181152939796448, + "learning_rate": 9.378912238126256e-05, + "loss": 0.9244, + "step": 11689 + }, + { + "epoch": 2.0811965811965814, + "grad_norm": 0.7961983680725098, + "learning_rate": 9.377515199369506e-05, + "loss": 0.714, + "step": 11690 + }, + { + "epoch": 2.0813746438746437, + "grad_norm": 0.8307793736457825, + "learning_rate": 9.376118172809056e-05, + "loss": 0.9573, + "step": 11691 + }, + { + "epoch": 2.0815527065527064, + "grad_norm": 0.7349256277084351, + "learning_rate": 9.374721158472269e-05, + "loss": 0.7533, + "step": 11692 + }, + { + "epoch": 2.081730769230769, + "grad_norm": 0.7625117897987366, + "learning_rate": 9.373324156386526e-05, + "loss": 0.8387, + "step": 11693 + }, + { + "epoch": 2.081908831908832, + "grad_norm": 0.9537683129310608, + "learning_rate": 9.371927166579191e-05, + "loss": 0.9444, + "step": 11694 + }, + { + "epoch": 2.0820868945868947, + "grad_norm": 0.7170497179031372, + "learning_rate": 9.370530189077644e-05, + "loss": 0.9132, + "step": 11695 + }, + { + "epoch": 2.0822649572649574, + "grad_norm": 0.7750041484832764, + "learning_rate": 9.369133223909246e-05, + "loss": 0.6635, + "step": 11696 + }, + { + "epoch": 2.08244301994302, + "grad_norm": 0.8990386128425598, + "learning_rate": 9.367736271101373e-05, + "loss": 0.8692, + "step": 11697 + }, + { + "epoch": 2.0826210826210825, + "grad_norm": 0.5909343361854553, + "learning_rate": 9.366339330681393e-05, + "loss": 0.6811, + "step": 11698 + }, + { + "epoch": 2.0827991452991452, + "grad_norm": 0.7783302068710327, + "learning_rate": 9.364942402676682e-05, + "loss": 1.1024, + "step": 11699 + }, + { + "epoch": 2.082977207977208, + "grad_norm": 0.8926466703414917, + "learning_rate": 9.3635454871146e-05, + "loss": 0.971, + "step": 11700 + }, + { + "epoch": 2.0831552706552707, + "grad_norm": 0.7374816536903381, + "learning_rate": 9.362148584022527e-05, + "loss": 0.7151, + 
"step": 11701 + }, + { + "epoch": 2.0833333333333335, + "grad_norm": 0.7491161227226257, + "learning_rate": 9.360751693427823e-05, + "loss": 0.9213, + "step": 11702 + }, + { + "epoch": 2.083511396011396, + "grad_norm": 0.726859986782074, + "learning_rate": 9.359354815357862e-05, + "loss": 0.8412, + "step": 11703 + }, + { + "epoch": 2.0836894586894585, + "grad_norm": 0.756703794002533, + "learning_rate": 9.357957949840015e-05, + "loss": 0.8074, + "step": 11704 + }, + { + "epoch": 2.0838675213675213, + "grad_norm": 0.8475984334945679, + "learning_rate": 9.356561096901646e-05, + "loss": 0.8926, + "step": 11705 + }, + { + "epoch": 2.084045584045584, + "grad_norm": 0.9776971936225891, + "learning_rate": 9.355164256570129e-05, + "loss": 0.8543, + "step": 11706 + }, + { + "epoch": 2.084223646723647, + "grad_norm": 0.7185834646224976, + "learning_rate": 9.353767428872826e-05, + "loss": 0.6946, + "step": 11707 + }, + { + "epoch": 2.0844017094017095, + "grad_norm": 0.7075535655021667, + "learning_rate": 9.352370613837109e-05, + "loss": 0.7171, + "step": 11708 + }, + { + "epoch": 2.0845797720797723, + "grad_norm": 0.8549726009368896, + "learning_rate": 9.350973811490343e-05, + "loss": 0.9028, + "step": 11709 + }, + { + "epoch": 2.0847578347578346, + "grad_norm": 0.731235682964325, + "learning_rate": 9.3495770218599e-05, + "loss": 0.7703, + "step": 11710 + }, + { + "epoch": 2.0849358974358974, + "grad_norm": 0.8660612106323242, + "learning_rate": 9.34818024497314e-05, + "loss": 0.8464, + "step": 11711 + }, + { + "epoch": 2.08511396011396, + "grad_norm": 0.7687711715698242, + "learning_rate": 9.346783480857439e-05, + "loss": 0.8199, + "step": 11712 + }, + { + "epoch": 2.085292022792023, + "grad_norm": 0.6802884936332703, + "learning_rate": 9.345386729540155e-05, + "loss": 0.7537, + "step": 11713 + }, + { + "epoch": 2.0854700854700856, + "grad_norm": 0.7688863277435303, + "learning_rate": 9.34398999104866e-05, + "loss": 0.8374, + "step": 11714 + }, + { + "epoch": 
2.0856481481481484, + "grad_norm": 0.7872602939605713, + "learning_rate": 9.342593265410315e-05, + "loss": 0.8786, + "step": 11715 + }, + { + "epoch": 2.0858262108262107, + "grad_norm": 0.9752106666564941, + "learning_rate": 9.341196552652496e-05, + "loss": 0.9572, + "step": 11716 + }, + { + "epoch": 2.0860042735042734, + "grad_norm": 0.7023422718048096, + "learning_rate": 9.339799852802555e-05, + "loss": 0.7613, + "step": 11717 + }, + { + "epoch": 2.086182336182336, + "grad_norm": 0.8366875052452087, + "learning_rate": 9.338403165887868e-05, + "loss": 0.8206, + "step": 11718 + }, + { + "epoch": 2.086360398860399, + "grad_norm": 0.8534985184669495, + "learning_rate": 9.337006491935794e-05, + "loss": 0.8549, + "step": 11719 + }, + { + "epoch": 2.0865384615384617, + "grad_norm": 0.7902935743331909, + "learning_rate": 9.335609830973707e-05, + "loss": 0.8399, + "step": 11720 + }, + { + "epoch": 2.0867165242165244, + "grad_norm": 0.8064647316932678, + "learning_rate": 9.334213183028958e-05, + "loss": 0.7978, + "step": 11721 + }, + { + "epoch": 2.0868945868945867, + "grad_norm": 0.816412627696991, + "learning_rate": 9.332816548128919e-05, + "loss": 0.8814, + "step": 11722 + }, + { + "epoch": 2.0870726495726495, + "grad_norm": 0.7778908610343933, + "learning_rate": 9.33141992630096e-05, + "loss": 0.9916, + "step": 11723 + }, + { + "epoch": 2.0872507122507122, + "grad_norm": 0.7899400591850281, + "learning_rate": 9.330023317572433e-05, + "loss": 0.5682, + "step": 11724 + }, + { + "epoch": 2.087428774928775, + "grad_norm": 0.6770033836364746, + "learning_rate": 9.32862672197071e-05, + "loss": 0.7327, + "step": 11725 + }, + { + "epoch": 2.0876068376068377, + "grad_norm": 0.8385946750640869, + "learning_rate": 9.327230139523148e-05, + "loss": 0.7793, + "step": 11726 + }, + { + "epoch": 2.0877849002849005, + "grad_norm": 0.708091139793396, + "learning_rate": 9.32583357025712e-05, + "loss": 0.6199, + "step": 11727 + }, + { + "epoch": 2.087962962962963, + "grad_norm": 
0.8172122836112976, + "learning_rate": 9.324437014199978e-05, + "loss": 0.749, + "step": 11728 + }, + { + "epoch": 2.0881410256410255, + "grad_norm": 0.818324625492096, + "learning_rate": 9.323040471379091e-05, + "loss": 0.6605, + "step": 11729 + }, + { + "epoch": 2.0883190883190883, + "grad_norm": 0.9074803590774536, + "learning_rate": 9.321643941821819e-05, + "loss": 0.9231, + "step": 11730 + }, + { + "epoch": 2.088497150997151, + "grad_norm": 0.7559560537338257, + "learning_rate": 9.320247425555527e-05, + "loss": 0.7291, + "step": 11731 + }, + { + "epoch": 2.088675213675214, + "grad_norm": 0.8001563549041748, + "learning_rate": 9.318850922607571e-05, + "loss": 0.7781, + "step": 11732 + }, + { + "epoch": 2.0888532763532766, + "grad_norm": 0.7365888953208923, + "learning_rate": 9.31745443300532e-05, + "loss": 0.8243, + "step": 11733 + }, + { + "epoch": 2.089031339031339, + "grad_norm": 0.7861692309379578, + "learning_rate": 9.316057956776126e-05, + "loss": 0.7568, + "step": 11734 + }, + { + "epoch": 2.0892094017094016, + "grad_norm": 0.8399034738540649, + "learning_rate": 9.314661493947363e-05, + "loss": 0.8019, + "step": 11735 + }, + { + "epoch": 2.0893874643874644, + "grad_norm": 0.7718507051467896, + "learning_rate": 9.313265044546378e-05, + "loss": 0.909, + "step": 11736 + }, + { + "epoch": 2.089565527065527, + "grad_norm": 0.8940733671188354, + "learning_rate": 9.311868608600543e-05, + "loss": 0.7154, + "step": 11737 + }, + { + "epoch": 2.08974358974359, + "grad_norm": 0.8506718873977661, + "learning_rate": 9.31047218613721e-05, + "loss": 0.8367, + "step": 11738 + }, + { + "epoch": 2.0899216524216526, + "grad_norm": 0.8431367874145508, + "learning_rate": 9.309075777183743e-05, + "loss": 0.9532, + "step": 11739 + }, + { + "epoch": 2.090099715099715, + "grad_norm": 0.7683414220809937, + "learning_rate": 9.307679381767499e-05, + "loss": 0.9301, + "step": 11740 + }, + { + "epoch": 2.0902777777777777, + "grad_norm": 0.7601380348205566, + "learning_rate": 
9.306282999915839e-05, + "loss": 0.8462, + "step": 11741 + }, + { + "epoch": 2.0904558404558404, + "grad_norm": 0.7531782388687134, + "learning_rate": 9.304886631656127e-05, + "loss": 0.8012, + "step": 11742 + }, + { + "epoch": 2.090633903133903, + "grad_norm": 0.7869617938995361, + "learning_rate": 9.303490277015714e-05, + "loss": 0.6645, + "step": 11743 + }, + { + "epoch": 2.090811965811966, + "grad_norm": 0.8042751550674438, + "learning_rate": 9.302093936021964e-05, + "loss": 1.1078, + "step": 11744 + }, + { + "epoch": 2.0909900284900287, + "grad_norm": 0.750350296497345, + "learning_rate": 9.300697608702231e-05, + "loss": 0.8552, + "step": 11745 + }, + { + "epoch": 2.091168091168091, + "grad_norm": 0.7624406814575195, + "learning_rate": 9.29930129508388e-05, + "loss": 0.861, + "step": 11746 + }, + { + "epoch": 2.0913461538461537, + "grad_norm": 0.7634474635124207, + "learning_rate": 9.29790499519426e-05, + "loss": 0.9483, + "step": 11747 + }, + { + "epoch": 2.0915242165242165, + "grad_norm": 0.7312899231910706, + "learning_rate": 9.296508709060738e-05, + "loss": 0.655, + "step": 11748 + }, + { + "epoch": 2.0917022792022792, + "grad_norm": 0.8181857466697693, + "learning_rate": 9.295112436710662e-05, + "loss": 0.7912, + "step": 11749 + }, + { + "epoch": 2.091880341880342, + "grad_norm": 0.6349542737007141, + "learning_rate": 9.293716178171396e-05, + "loss": 0.6268, + "step": 11750 + }, + { + "epoch": 2.0920584045584047, + "grad_norm": 0.8832548260688782, + "learning_rate": 9.292319933470291e-05, + "loss": 0.7805, + "step": 11751 + }, + { + "epoch": 2.092236467236467, + "grad_norm": 0.7251408100128174, + "learning_rate": 9.290923702634712e-05, + "loss": 0.7553, + "step": 11752 + }, + { + "epoch": 2.09241452991453, + "grad_norm": 0.8794457912445068, + "learning_rate": 9.289527485692006e-05, + "loss": 0.9187, + "step": 11753 + }, + { + "epoch": 2.0925925925925926, + "grad_norm": 0.7768839597702026, + "learning_rate": 9.288131282669534e-05, + "loss": 0.9267, + 
"step": 11754 + }, + { + "epoch": 2.0927706552706553, + "grad_norm": 0.744144856929779, + "learning_rate": 9.28673509359465e-05, + "loss": 0.714, + "step": 11755 + }, + { + "epoch": 2.092948717948718, + "grad_norm": 0.9117433428764343, + "learning_rate": 9.285338918494714e-05, + "loss": 0.9965, + "step": 11756 + }, + { + "epoch": 2.093126780626781, + "grad_norm": 0.8105267286300659, + "learning_rate": 9.283942757397073e-05, + "loss": 0.7517, + "step": 11757 + }, + { + "epoch": 2.093304843304843, + "grad_norm": 0.7348153591156006, + "learning_rate": 9.28254661032909e-05, + "loss": 0.7101, + "step": 11758 + }, + { + "epoch": 2.093482905982906, + "grad_norm": 0.7625702023506165, + "learning_rate": 9.281150477318113e-05, + "loss": 0.6863, + "step": 11759 + }, + { + "epoch": 2.0936609686609686, + "grad_norm": 0.7987569570541382, + "learning_rate": 9.2797543583915e-05, + "loss": 0.8848, + "step": 11760 + }, + { + "epoch": 2.0938390313390314, + "grad_norm": 0.706235408782959, + "learning_rate": 9.278358253576601e-05, + "loss": 0.7375, + "step": 11761 + }, + { + "epoch": 2.094017094017094, + "grad_norm": 0.9716742038726807, + "learning_rate": 9.276962162900774e-05, + "loss": 0.8602, + "step": 11762 + }, + { + "epoch": 2.094195156695157, + "grad_norm": 0.7711777687072754, + "learning_rate": 9.275566086391377e-05, + "loss": 0.8553, + "step": 11763 + }, + { + "epoch": 2.094373219373219, + "grad_norm": 0.8542511463165283, + "learning_rate": 9.274170024075751e-05, + "loss": 0.8412, + "step": 11764 + }, + { + "epoch": 2.094551282051282, + "grad_norm": 0.8255360126495361, + "learning_rate": 9.272773975981259e-05, + "loss": 1.0245, + "step": 11765 + }, + { + "epoch": 2.0947293447293447, + "grad_norm": 0.7416045665740967, + "learning_rate": 9.271377942135248e-05, + "loss": 0.57, + "step": 11766 + }, + { + "epoch": 2.0949074074074074, + "grad_norm": 0.8805620670318604, + "learning_rate": 9.269981922565078e-05, + "loss": 1.0262, + "step": 11767 + }, + { + "epoch": 2.09508547008547, + 
"grad_norm": 0.7293491363525391, + "learning_rate": 9.26858591729809e-05, + "loss": 0.7945, + "step": 11768 + }, + { + "epoch": 2.095263532763533, + "grad_norm": 0.7949206233024597, + "learning_rate": 9.267189926361643e-05, + "loss": 0.7071, + "step": 11769 + }, + { + "epoch": 2.0954415954415953, + "grad_norm": 0.771806538105011, + "learning_rate": 9.265793949783087e-05, + "loss": 0.8125, + "step": 11770 + }, + { + "epoch": 2.095619658119658, + "grad_norm": 0.7256866693496704, + "learning_rate": 9.264397987589776e-05, + "loss": 0.7607, + "step": 11771 + }, + { + "epoch": 2.0957977207977208, + "grad_norm": 0.8175343871116638, + "learning_rate": 9.263002039809055e-05, + "loss": 0.8486, + "step": 11772 + }, + { + "epoch": 2.0959757834757835, + "grad_norm": 0.7618881464004517, + "learning_rate": 9.261606106468282e-05, + "loss": 0.8182, + "step": 11773 + }, + { + "epoch": 2.0961538461538463, + "grad_norm": 0.7574927806854248, + "learning_rate": 9.2602101875948e-05, + "loss": 0.8703, + "step": 11774 + }, + { + "epoch": 2.096331908831909, + "grad_norm": 0.8639108538627625, + "learning_rate": 9.258814283215964e-05, + "loss": 0.9044, + "step": 11775 + }, + { + "epoch": 2.0965099715099713, + "grad_norm": 0.7221997380256653, + "learning_rate": 9.25741839335912e-05, + "loss": 0.7599, + "step": 11776 + }, + { + "epoch": 2.096688034188034, + "grad_norm": 0.9379764795303345, + "learning_rate": 9.256022518051626e-05, + "loss": 1.0002, + "step": 11777 + }, + { + "epoch": 2.096866096866097, + "grad_norm": 0.8430935740470886, + "learning_rate": 9.25462665732082e-05, + "loss": 0.7711, + "step": 11778 + }, + { + "epoch": 2.0970441595441596, + "grad_norm": 0.8371061086654663, + "learning_rate": 9.253230811194057e-05, + "loss": 0.9028, + "step": 11779 + }, + { + "epoch": 2.0972222222222223, + "grad_norm": 0.6960258483886719, + "learning_rate": 9.251834979698684e-05, + "loss": 0.7491, + "step": 11780 + }, + { + "epoch": 2.097400284900285, + "grad_norm": 0.7736398577690125, + 
"learning_rate": 9.25043916286205e-05, + "loss": 0.8985, + "step": 11781 + }, + { + "epoch": 2.0975783475783474, + "grad_norm": 0.6901512145996094, + "learning_rate": 9.249043360711509e-05, + "loss": 0.5881, + "step": 11782 + }, + { + "epoch": 2.09775641025641, + "grad_norm": 0.6741603016853333, + "learning_rate": 9.247647573274397e-05, + "loss": 0.7641, + "step": 11783 + }, + { + "epoch": 2.097934472934473, + "grad_norm": 0.736657440662384, + "learning_rate": 9.246251800578074e-05, + "loss": 0.8286, + "step": 11784 + }, + { + "epoch": 2.0981125356125356, + "grad_norm": 0.8235752582550049, + "learning_rate": 9.244856042649877e-05, + "loss": 0.8835, + "step": 11785 + }, + { + "epoch": 2.0982905982905984, + "grad_norm": 0.8083409667015076, + "learning_rate": 9.243460299517158e-05, + "loss": 0.9032, + "step": 11786 + }, + { + "epoch": 2.098468660968661, + "grad_norm": 0.7650952339172363, + "learning_rate": 9.242064571207262e-05, + "loss": 0.775, + "step": 11787 + }, + { + "epoch": 2.0986467236467234, + "grad_norm": 0.7961280345916748, + "learning_rate": 9.24066885774754e-05, + "loss": 0.6308, + "step": 11788 + }, + { + "epoch": 2.098824786324786, + "grad_norm": 0.8032481670379639, + "learning_rate": 9.23927315916533e-05, + "loss": 0.7544, + "step": 11789 + }, + { + "epoch": 2.099002849002849, + "grad_norm": 0.7452995777130127, + "learning_rate": 9.237877475487984e-05, + "loss": 0.8573, + "step": 11790 + }, + { + "epoch": 2.0991809116809117, + "grad_norm": 0.8141751289367676, + "learning_rate": 9.236481806742844e-05, + "loss": 0.9055, + "step": 11791 + }, + { + "epoch": 2.0993589743589745, + "grad_norm": 0.7862252593040466, + "learning_rate": 9.235086152957261e-05, + "loss": 0.6967, + "step": 11792 + }, + { + "epoch": 2.099537037037037, + "grad_norm": 0.771587073802948, + "learning_rate": 9.233690514158571e-05, + "loss": 0.7544, + "step": 11793 + }, + { + "epoch": 2.0997150997150995, + "grad_norm": 0.851445198059082, + "learning_rate": 9.23229489037413e-05, + "loss": 
0.9249, + "step": 11794 + }, + { + "epoch": 2.0998931623931623, + "grad_norm": 0.7483612895011902, + "learning_rate": 9.23089928163127e-05, + "loss": 0.747, + "step": 11795 + }, + { + "epoch": 2.100071225071225, + "grad_norm": 0.8493219017982483, + "learning_rate": 9.229503687957342e-05, + "loss": 0.8898, + "step": 11796 + }, + { + "epoch": 2.1002492877492878, + "grad_norm": 0.8331718444824219, + "learning_rate": 9.228108109379687e-05, + "loss": 0.8943, + "step": 11797 + }, + { + "epoch": 2.1004273504273505, + "grad_norm": 0.7756054997444153, + "learning_rate": 9.226712545925655e-05, + "loss": 0.8586, + "step": 11798 + }, + { + "epoch": 2.1006054131054133, + "grad_norm": 0.7292607426643372, + "learning_rate": 9.225316997622579e-05, + "loss": 0.7591, + "step": 11799 + }, + { + "epoch": 2.1007834757834756, + "grad_norm": 0.8575723767280579, + "learning_rate": 9.223921464497811e-05, + "loss": 1.0147, + "step": 11800 + }, + { + "epoch": 2.1009615384615383, + "grad_norm": 0.7882707118988037, + "learning_rate": 9.222525946578687e-05, + "loss": 0.8297, + "step": 11801 + }, + { + "epoch": 2.101139601139601, + "grad_norm": 0.7982630729675293, + "learning_rate": 9.221130443892551e-05, + "loss": 0.9308, + "step": 11802 + }, + { + "epoch": 2.101317663817664, + "grad_norm": 0.7577962279319763, + "learning_rate": 9.219734956466752e-05, + "loss": 0.8474, + "step": 11803 + }, + { + "epoch": 2.1014957264957266, + "grad_norm": 0.7103776335716248, + "learning_rate": 9.218339484328621e-05, + "loss": 0.7863, + "step": 11804 + }, + { + "epoch": 2.1016737891737893, + "grad_norm": 0.8307296633720398, + "learning_rate": 9.216944027505505e-05, + "loss": 0.7633, + "step": 11805 + }, + { + "epoch": 2.1018518518518516, + "grad_norm": 0.8197653293609619, + "learning_rate": 9.215548586024743e-05, + "loss": 0.8987, + "step": 11806 + }, + { + "epoch": 2.1020299145299144, + "grad_norm": 0.9192719459533691, + "learning_rate": 9.21415315991368e-05, + "loss": 0.7829, + "step": 11807 + }, + { + 
"epoch": 2.102207977207977, + "grad_norm": 0.7249892354011536, + "learning_rate": 9.21275774919965e-05, + "loss": 0.9143, + "step": 11808 + }, + { + "epoch": 2.10238603988604, + "grad_norm": 0.7942582368850708, + "learning_rate": 9.211362353910002e-05, + "loss": 0.8634, + "step": 11809 + }, + { + "epoch": 2.1025641025641026, + "grad_norm": 0.7773341536521912, + "learning_rate": 9.209966974072065e-05, + "loss": 0.7865, + "step": 11810 + }, + { + "epoch": 2.1027421652421654, + "grad_norm": 0.802175760269165, + "learning_rate": 9.208571609713185e-05, + "loss": 0.7473, + "step": 11811 + }, + { + "epoch": 2.1029202279202277, + "grad_norm": 1.0248547792434692, + "learning_rate": 9.207176260860701e-05, + "loss": 1.0097, + "step": 11812 + }, + { + "epoch": 2.1030982905982905, + "grad_norm": 0.5781275629997253, + "learning_rate": 9.205780927541954e-05, + "loss": 0.5813, + "step": 11813 + }, + { + "epoch": 2.103276353276353, + "grad_norm": 0.7252389192581177, + "learning_rate": 9.204385609784274e-05, + "loss": 0.7978, + "step": 11814 + }, + { + "epoch": 2.103454415954416, + "grad_norm": 0.8497771620750427, + "learning_rate": 9.20299030761501e-05, + "loss": 0.95, + "step": 11815 + }, + { + "epoch": 2.1036324786324787, + "grad_norm": 0.8420650362968445, + "learning_rate": 9.201595021061491e-05, + "loss": 0.9693, + "step": 11816 + }, + { + "epoch": 2.1038105413105415, + "grad_norm": 0.8286302089691162, + "learning_rate": 9.200199750151063e-05, + "loss": 0.9457, + "step": 11817 + }, + { + "epoch": 2.103988603988604, + "grad_norm": 0.877740204334259, + "learning_rate": 9.198804494911057e-05, + "loss": 0.9082, + "step": 11818 + }, + { + "epoch": 2.1041666666666665, + "grad_norm": 0.7579863667488098, + "learning_rate": 9.197409255368817e-05, + "loss": 0.7681, + "step": 11819 + }, + { + "epoch": 2.1043447293447293, + "grad_norm": 0.7141458988189697, + "learning_rate": 9.19601403155167e-05, + "loss": 0.659, + "step": 11820 + }, + { + "epoch": 2.104522792022792, + "grad_norm": 
0.8493850827217102, + "learning_rate": 9.194618823486958e-05, + "loss": 0.8197, + "step": 11821 + }, + { + "epoch": 2.1047008547008548, + "grad_norm": 0.8319337368011475, + "learning_rate": 9.193223631202019e-05, + "loss": 0.8955, + "step": 11822 + }, + { + "epoch": 2.1048789173789175, + "grad_norm": 0.7180153727531433, + "learning_rate": 9.191828454724186e-05, + "loss": 0.8068, + "step": 11823 + }, + { + "epoch": 2.10505698005698, + "grad_norm": 0.6748450398445129, + "learning_rate": 9.190433294080799e-05, + "loss": 0.7469, + "step": 11824 + }, + { + "epoch": 2.1052350427350426, + "grad_norm": 0.7750198841094971, + "learning_rate": 9.189038149299186e-05, + "loss": 0.932, + "step": 11825 + }, + { + "epoch": 2.1054131054131053, + "grad_norm": 0.7763389945030212, + "learning_rate": 9.187643020406688e-05, + "loss": 0.8027, + "step": 11826 + }, + { + "epoch": 2.105591168091168, + "grad_norm": 0.8382455110549927, + "learning_rate": 9.186247907430636e-05, + "loss": 0.8288, + "step": 11827 + }, + { + "epoch": 2.105769230769231, + "grad_norm": 0.6744221448898315, + "learning_rate": 9.184852810398367e-05, + "loss": 0.6807, + "step": 11828 + }, + { + "epoch": 2.1059472934472936, + "grad_norm": 0.7798452377319336, + "learning_rate": 9.183457729337212e-05, + "loss": 0.9853, + "step": 11829 + }, + { + "epoch": 2.1061253561253563, + "grad_norm": 0.7377058863639832, + "learning_rate": 9.182062664274513e-05, + "loss": 0.9043, + "step": 11830 + }, + { + "epoch": 2.1063034188034186, + "grad_norm": 0.8190791010856628, + "learning_rate": 9.180667615237589e-05, + "loss": 0.9786, + "step": 11831 + }, + { + "epoch": 2.1064814814814814, + "grad_norm": 0.7629963755607605, + "learning_rate": 9.179272582253785e-05, + "loss": 0.9168, + "step": 11832 + }, + { + "epoch": 2.106659544159544, + "grad_norm": 0.7753663063049316, + "learning_rate": 9.177877565350426e-05, + "loss": 0.963, + "step": 11833 + }, + { + "epoch": 2.106837606837607, + "grad_norm": 0.7842921614646912, + "learning_rate": 
9.176482564554855e-05, + "loss": 0.8194, + "step": 11834 + }, + { + "epoch": 2.1070156695156697, + "grad_norm": 0.6640288233757019, + "learning_rate": 9.175087579894393e-05, + "loss": 0.6227, + "step": 11835 + }, + { + "epoch": 2.1071937321937324, + "grad_norm": 0.8474540710449219, + "learning_rate": 9.173692611396376e-05, + "loss": 0.7817, + "step": 11836 + }, + { + "epoch": 2.1073717948717947, + "grad_norm": 0.7123007774353027, + "learning_rate": 9.172297659088135e-05, + "loss": 0.9508, + "step": 11837 + }, + { + "epoch": 2.1075498575498575, + "grad_norm": 0.7418060898780823, + "learning_rate": 9.170902722997007e-05, + "loss": 0.8832, + "step": 11838 + }, + { + "epoch": 2.10772792022792, + "grad_norm": 0.7899464964866638, + "learning_rate": 9.169507803150313e-05, + "loss": 0.8474, + "step": 11839 + }, + { + "epoch": 2.107905982905983, + "grad_norm": 0.7543701529502869, + "learning_rate": 9.168112899575388e-05, + "loss": 0.8113, + "step": 11840 + }, + { + "epoch": 2.1080840455840457, + "grad_norm": 0.8057922720909119, + "learning_rate": 9.166718012299565e-05, + "loss": 0.998, + "step": 11841 + }, + { + "epoch": 2.1082621082621085, + "grad_norm": 0.7879176139831543, + "learning_rate": 9.16532314135017e-05, + "loss": 1.0509, + "step": 11842 + }, + { + "epoch": 2.1084401709401708, + "grad_norm": 0.8796642422676086, + "learning_rate": 9.163928286754537e-05, + "loss": 1.0481, + "step": 11843 + }, + { + "epoch": 2.1086182336182335, + "grad_norm": 0.7158889174461365, + "learning_rate": 9.16253344853999e-05, + "loss": 0.796, + "step": 11844 + }, + { + "epoch": 2.1087962962962963, + "grad_norm": 0.8020899295806885, + "learning_rate": 9.161138626733863e-05, + "loss": 0.822, + "step": 11845 + }, + { + "epoch": 2.108974358974359, + "grad_norm": 0.7217469215393066, + "learning_rate": 9.159743821363478e-05, + "loss": 1.0037, + "step": 11846 + }, + { + "epoch": 2.109152421652422, + "grad_norm": 0.762450098991394, + "learning_rate": 9.158349032456171e-05, + "loss": 1.0047, + 
"step": 11847 + }, + { + "epoch": 2.1093304843304845, + "grad_norm": 0.7227019667625427, + "learning_rate": 9.156954260039263e-05, + "loss": 0.8034, + "step": 11848 + }, + { + "epoch": 2.109508547008547, + "grad_norm": 0.7358957529067993, + "learning_rate": 9.155559504140089e-05, + "loss": 0.9483, + "step": 11849 + }, + { + "epoch": 2.1096866096866096, + "grad_norm": 0.7039931416511536, + "learning_rate": 9.154164764785968e-05, + "loss": 0.9255, + "step": 11850 + }, + { + "epoch": 2.1098646723646723, + "grad_norm": 0.8479618430137634, + "learning_rate": 9.152770042004234e-05, + "loss": 0.7379, + "step": 11851 + }, + { + "epoch": 2.110042735042735, + "grad_norm": 0.8320785164833069, + "learning_rate": 9.151375335822208e-05, + "loss": 0.944, + "step": 11852 + }, + { + "epoch": 2.110220797720798, + "grad_norm": 0.8186322450637817, + "learning_rate": 9.149980646267225e-05, + "loss": 0.7757, + "step": 11853 + }, + { + "epoch": 2.1103988603988606, + "grad_norm": 0.7816671133041382, + "learning_rate": 9.148585973366601e-05, + "loss": 0.8592, + "step": 11854 + }, + { + "epoch": 2.110576923076923, + "grad_norm": 0.8747152090072632, + "learning_rate": 9.147191317147671e-05, + "loss": 1.0852, + "step": 11855 + }, + { + "epoch": 2.1107549857549857, + "grad_norm": 0.7762712240219116, + "learning_rate": 9.14579667763775e-05, + "loss": 0.8466, + "step": 11856 + }, + { + "epoch": 2.1109330484330484, + "grad_norm": 0.8426344394683838, + "learning_rate": 9.144402054864171e-05, + "loss": 0.9949, + "step": 11857 + }, + { + "epoch": 2.111111111111111, + "grad_norm": 0.7581121921539307, + "learning_rate": 9.143007448854256e-05, + "loss": 0.748, + "step": 11858 + }, + { + "epoch": 2.111289173789174, + "grad_norm": 0.837939977645874, + "learning_rate": 9.141612859635333e-05, + "loss": 0.9479, + "step": 11859 + }, + { + "epoch": 2.1114672364672367, + "grad_norm": 0.7402070760726929, + "learning_rate": 9.140218287234718e-05, + "loss": 0.7829, + "step": 11860 + }, + { + "epoch": 
2.111645299145299, + "grad_norm": 0.7125605344772339, + "learning_rate": 9.13882373167974e-05, + "loss": 1.0175, + "step": 11861 + }, + { + "epoch": 2.1118233618233617, + "grad_norm": 0.8021374344825745, + "learning_rate": 9.137429192997723e-05, + "loss": 0.9258, + "step": 11862 + }, + { + "epoch": 2.1120014245014245, + "grad_norm": 0.7860891222953796, + "learning_rate": 9.136034671215988e-05, + "loss": 0.7351, + "step": 11863 + }, + { + "epoch": 2.1121794871794872, + "grad_norm": 0.8324207067489624, + "learning_rate": 9.134640166361864e-05, + "loss": 0.8933, + "step": 11864 + }, + { + "epoch": 2.11235754985755, + "grad_norm": 0.8209179639816284, + "learning_rate": 9.133245678462663e-05, + "loss": 0.6983, + "step": 11865 + }, + { + "epoch": 2.1125356125356127, + "grad_norm": 0.7071694731712341, + "learning_rate": 9.131851207545716e-05, + "loss": 0.7796, + "step": 11866 + }, + { + "epoch": 2.112713675213675, + "grad_norm": 0.8126310110092163, + "learning_rate": 9.130456753638339e-05, + "loss": 0.8887, + "step": 11867 + }, + { + "epoch": 2.112891737891738, + "grad_norm": 0.7713829874992371, + "learning_rate": 9.129062316767855e-05, + "loss": 0.8169, + "step": 11868 + }, + { + "epoch": 2.1130698005698005, + "grad_norm": 0.8065944314002991, + "learning_rate": 9.127667896961585e-05, + "loss": 0.9295, + "step": 11869 + }, + { + "epoch": 2.1132478632478633, + "grad_norm": 0.7433435320854187, + "learning_rate": 9.126273494246856e-05, + "loss": 1.089, + "step": 11870 + }, + { + "epoch": 2.113425925925926, + "grad_norm": 0.8168141841888428, + "learning_rate": 9.124879108650978e-05, + "loss": 0.7914, + "step": 11871 + }, + { + "epoch": 2.113603988603989, + "grad_norm": 0.7703335285186768, + "learning_rate": 9.123484740201276e-05, + "loss": 1.0599, + "step": 11872 + }, + { + "epoch": 2.113782051282051, + "grad_norm": 0.810584545135498, + "learning_rate": 9.12209038892507e-05, + "loss": 0.886, + "step": 11873 + }, + { + "epoch": 2.113960113960114, + "grad_norm": 
0.8441819548606873, + "learning_rate": 9.120696054849683e-05, + "loss": 0.9069, + "step": 11874 + }, + { + "epoch": 2.1141381766381766, + "grad_norm": 0.816067636013031, + "learning_rate": 9.119301738002425e-05, + "loss": 0.9084, + "step": 11875 + }, + { + "epoch": 2.1143162393162394, + "grad_norm": 0.8595525622367859, + "learning_rate": 9.117907438410622e-05, + "loss": 0.916, + "step": 11876 + }, + { + "epoch": 2.114494301994302, + "grad_norm": 0.8604792356491089, + "learning_rate": 9.116513156101589e-05, + "loss": 1.1207, + "step": 11877 + }, + { + "epoch": 2.114672364672365, + "grad_norm": 0.673664927482605, + "learning_rate": 9.115118891102649e-05, + "loss": 0.9767, + "step": 11878 + }, + { + "epoch": 2.114850427350427, + "grad_norm": 0.7064382433891296, + "learning_rate": 9.113724643441113e-05, + "loss": 0.91, + "step": 11879 + }, + { + "epoch": 2.11502849002849, + "grad_norm": 0.7256918549537659, + "learning_rate": 9.112330413144301e-05, + "loss": 0.9061, + "step": 11880 + }, + { + "epoch": 2.1152065527065527, + "grad_norm": 0.7914155721664429, + "learning_rate": 9.110936200239534e-05, + "loss": 0.6652, + "step": 11881 + }, + { + "epoch": 2.1153846153846154, + "grad_norm": 0.7484595775604248, + "learning_rate": 9.109542004754122e-05, + "loss": 0.8049, + "step": 11882 + }, + { + "epoch": 2.115562678062678, + "grad_norm": 0.8062677979469299, + "learning_rate": 9.108147826715387e-05, + "loss": 0.8671, + "step": 11883 + }, + { + "epoch": 2.115740740740741, + "grad_norm": 0.9595313668251038, + "learning_rate": 9.10675366615064e-05, + "loss": 0.995, + "step": 11884 + }, + { + "epoch": 2.1159188034188032, + "grad_norm": 0.7263179421424866, + "learning_rate": 9.105359523087203e-05, + "loss": 0.9177, + "step": 11885 + }, + { + "epoch": 2.116096866096866, + "grad_norm": 0.900650680065155, + "learning_rate": 9.103965397552385e-05, + "loss": 0.8599, + "step": 11886 + }, + { + "epoch": 2.1162749287749287, + "grad_norm": 0.7682752013206482, + "learning_rate": 
9.102571289573506e-05, + "loss": 0.8942, + "step": 11887 + }, + { + "epoch": 2.1164529914529915, + "grad_norm": 0.7076446413993835, + "learning_rate": 9.101177199177874e-05, + "loss": 0.7498, + "step": 11888 + }, + { + "epoch": 2.1166310541310542, + "grad_norm": 0.711475133895874, + "learning_rate": 9.099783126392813e-05, + "loss": 0.7035, + "step": 11889 + }, + { + "epoch": 2.116809116809117, + "grad_norm": 0.6720870137214661, + "learning_rate": 9.098389071245627e-05, + "loss": 0.7315, + "step": 11890 + }, + { + "epoch": 2.1169871794871793, + "grad_norm": 0.8207699656486511, + "learning_rate": 9.096995033763639e-05, + "loss": 0.7465, + "step": 11891 + }, + { + "epoch": 2.117165242165242, + "grad_norm": 0.9032317996025085, + "learning_rate": 9.095601013974153e-05, + "loss": 0.9209, + "step": 11892 + }, + { + "epoch": 2.117343304843305, + "grad_norm": 0.886545717716217, + "learning_rate": 9.094207011904489e-05, + "loss": 0.9411, + "step": 11893 + }, + { + "epoch": 2.1175213675213675, + "grad_norm": 0.8235130906105042, + "learning_rate": 9.092813027581953e-05, + "loss": 0.9264, + "step": 11894 + }, + { + "epoch": 2.1176994301994303, + "grad_norm": 0.7530205845832825, + "learning_rate": 9.091419061033867e-05, + "loss": 0.8926, + "step": 11895 + }, + { + "epoch": 2.117877492877493, + "grad_norm": 0.8329548835754395, + "learning_rate": 9.090025112287533e-05, + "loss": 0.9615, + "step": 11896 + }, + { + "epoch": 2.1180555555555554, + "grad_norm": 0.8184738755226135, + "learning_rate": 9.088631181370269e-05, + "loss": 0.9069, + "step": 11897 + }, + { + "epoch": 2.118233618233618, + "grad_norm": 0.8071370720863342, + "learning_rate": 9.087237268309381e-05, + "loss": 0.8721, + "step": 11898 + }, + { + "epoch": 2.118411680911681, + "grad_norm": 0.8995245695114136, + "learning_rate": 9.085843373132187e-05, + "loss": 0.8815, + "step": 11899 + }, + { + "epoch": 2.1185897435897436, + "grad_norm": 0.7601714730262756, + "learning_rate": 9.084449495865989e-05, + "loss": 0.6824, + 
"step": 11900 + }, + { + "epoch": 2.1187678062678064, + "grad_norm": 0.8499618172645569, + "learning_rate": 9.083055636538101e-05, + "loss": 0.9868, + "step": 11901 + }, + { + "epoch": 2.118945868945869, + "grad_norm": 0.8190310001373291, + "learning_rate": 9.081661795175837e-05, + "loss": 0.8156, + "step": 11902 + }, + { + "epoch": 2.1191239316239314, + "grad_norm": 0.8340418934822083, + "learning_rate": 9.080267971806498e-05, + "loss": 1.0153, + "step": 11903 + }, + { + "epoch": 2.119301994301994, + "grad_norm": 0.8460756540298462, + "learning_rate": 9.0788741664574e-05, + "loss": 0.8752, + "step": 11904 + }, + { + "epoch": 2.119480056980057, + "grad_norm": 0.7457373738288879, + "learning_rate": 9.077480379155848e-05, + "loss": 0.9105, + "step": 11905 + }, + { + "epoch": 2.1196581196581197, + "grad_norm": 0.7883822917938232, + "learning_rate": 9.076086609929155e-05, + "loss": 0.8782, + "step": 11906 + }, + { + "epoch": 2.1198361823361824, + "grad_norm": 0.912143886089325, + "learning_rate": 9.074692858804622e-05, + "loss": 0.9898, + "step": 11907 + }, + { + "epoch": 2.120014245014245, + "grad_norm": 0.7801905274391174, + "learning_rate": 9.073299125809562e-05, + "loss": 1.091, + "step": 11908 + }, + { + "epoch": 2.1201923076923075, + "grad_norm": 0.6836256384849548, + "learning_rate": 9.071905410971279e-05, + "loss": 0.7967, + "step": 11909 + }, + { + "epoch": 2.1203703703703702, + "grad_norm": 0.7656795382499695, + "learning_rate": 9.070511714317085e-05, + "loss": 0.9696, + "step": 11910 + }, + { + "epoch": 2.120548433048433, + "grad_norm": 0.7010015249252319, + "learning_rate": 9.06911803587428e-05, + "loss": 0.6501, + "step": 11911 + }, + { + "epoch": 2.1207264957264957, + "grad_norm": 0.6673064827919006, + "learning_rate": 9.067724375670174e-05, + "loss": 0.5663, + "step": 11912 + }, + { + "epoch": 2.1209045584045585, + "grad_norm": 0.8683220148086548, + "learning_rate": 9.06633073373207e-05, + "loss": 0.9722, + "step": 11913 + }, + { + "epoch": 
2.1210826210826212, + "grad_norm": 0.7793976068496704, + "learning_rate": 9.06493711008728e-05, + "loss": 0.7595, + "step": 11914 + }, + { + "epoch": 2.1212606837606836, + "grad_norm": 0.7803528308868408, + "learning_rate": 9.0635435047631e-05, + "loss": 0.9262, + "step": 11915 + }, + { + "epoch": 2.1214387464387463, + "grad_norm": 0.8067244291305542, + "learning_rate": 9.062149917786846e-05, + "loss": 0.9376, + "step": 11916 + }, + { + "epoch": 2.121616809116809, + "grad_norm": 0.7389153838157654, + "learning_rate": 9.060756349185812e-05, + "loss": 0.7414, + "step": 11917 + }, + { + "epoch": 2.121794871794872, + "grad_norm": 0.7717151045799255, + "learning_rate": 9.059362798987308e-05, + "loss": 0.7261, + "step": 11918 + }, + { + "epoch": 2.1219729344729346, + "grad_norm": 0.7668650150299072, + "learning_rate": 9.057969267218632e-05, + "loss": 0.8145, + "step": 11919 + }, + { + "epoch": 2.1221509971509973, + "grad_norm": 1.0015910863876343, + "learning_rate": 9.056575753907093e-05, + "loss": 0.8997, + "step": 11920 + }, + { + "epoch": 2.1223290598290596, + "grad_norm": 0.8731024861335754, + "learning_rate": 9.055182259079997e-05, + "loss": 1.0101, + "step": 11921 + }, + { + "epoch": 2.1225071225071224, + "grad_norm": 0.7662718892097473, + "learning_rate": 9.053788782764637e-05, + "loss": 0.697, + "step": 11922 + }, + { + "epoch": 2.122685185185185, + "grad_norm": 0.7783135771751404, + "learning_rate": 9.05239532498832e-05, + "loss": 0.8506, + "step": 11923 + }, + { + "epoch": 2.122863247863248, + "grad_norm": 0.8667652606964111, + "learning_rate": 9.05100188577835e-05, + "loss": 0.9851, + "step": 11924 + }, + { + "epoch": 2.1230413105413106, + "grad_norm": 0.7785412073135376, + "learning_rate": 9.049608465162028e-05, + "loss": 0.5924, + "step": 11925 + }, + { + "epoch": 2.1232193732193734, + "grad_norm": 0.7968559861183167, + "learning_rate": 9.04821506316665e-05, + "loss": 0.8114, + "step": 11926 + }, + { + "epoch": 2.123397435897436, + "grad_norm": 
0.8065921068191528, + "learning_rate": 9.046821679819527e-05, + "loss": 0.9045, + "step": 11927 + }, + { + "epoch": 2.1235754985754984, + "grad_norm": 0.7509779930114746, + "learning_rate": 9.045428315147948e-05, + "loss": 0.7337, + "step": 11928 + }, + { + "epoch": 2.123753561253561, + "grad_norm": 0.8174976110458374, + "learning_rate": 9.044034969179219e-05, + "loss": 1.0113, + "step": 11929 + }, + { + "epoch": 2.123931623931624, + "grad_norm": 0.8723294734954834, + "learning_rate": 9.042641641940638e-05, + "loss": 0.9657, + "step": 11930 + }, + { + "epoch": 2.1241096866096867, + "grad_norm": 0.7412081360816956, + "learning_rate": 9.041248333459509e-05, + "loss": 0.9311, + "step": 11931 + }, + { + "epoch": 2.1242877492877494, + "grad_norm": 0.7376424670219421, + "learning_rate": 9.039855043763124e-05, + "loss": 0.7039, + "step": 11932 + }, + { + "epoch": 2.1244658119658117, + "grad_norm": 0.8002118468284607, + "learning_rate": 9.038461772878786e-05, + "loss": 0.9555, + "step": 11933 + }, + { + "epoch": 2.1246438746438745, + "grad_norm": 0.7221434712409973, + "learning_rate": 9.03706852083379e-05, + "loss": 0.8462, + "step": 11934 + }, + { + "epoch": 2.1248219373219372, + "grad_norm": 0.8506385684013367, + "learning_rate": 9.035675287655441e-05, + "loss": 0.7977, + "step": 11935 + }, + { + "epoch": 2.125, + "grad_norm": 0.8088411688804626, + "learning_rate": 9.034282073371025e-05, + "loss": 1.0146, + "step": 11936 + }, + { + "epoch": 2.1251780626780628, + "grad_norm": 0.9231638312339783, + "learning_rate": 9.032888878007853e-05, + "loss": 0.7017, + "step": 11937 + }, + { + "epoch": 2.1253561253561255, + "grad_norm": 0.721066951751709, + "learning_rate": 9.03149570159321e-05, + "loss": 0.7662, + "step": 11938 + }, + { + "epoch": 2.1255341880341883, + "grad_norm": 0.7804762721061707, + "learning_rate": 9.030102544154395e-05, + "loss": 0.6835, + "step": 11939 + }, + { + "epoch": 2.1257122507122506, + "grad_norm": 0.9728445410728455, + "learning_rate": 
9.028709405718707e-05, + "loss": 0.9161, + "step": 11940 + }, + { + "epoch": 2.1258903133903133, + "grad_norm": 0.8209855556488037, + "learning_rate": 9.02731628631344e-05, + "loss": 0.7492, + "step": 11941 + }, + { + "epoch": 2.126068376068376, + "grad_norm": 0.7054622769355774, + "learning_rate": 9.025923185965896e-05, + "loss": 0.7908, + "step": 11942 + }, + { + "epoch": 2.126246438746439, + "grad_norm": 0.73018878698349, + "learning_rate": 9.024530104703358e-05, + "loss": 0.7902, + "step": 11943 + }, + { + "epoch": 2.1264245014245016, + "grad_norm": 0.73788982629776, + "learning_rate": 9.023137042553127e-05, + "loss": 0.8473, + "step": 11944 + }, + { + "epoch": 2.126602564102564, + "grad_norm": 0.7733396291732788, + "learning_rate": 9.021743999542495e-05, + "loss": 0.9595, + "step": 11945 + }, + { + "epoch": 2.1267806267806266, + "grad_norm": 0.9066760540008545, + "learning_rate": 9.020350975698761e-05, + "loss": 0.8517, + "step": 11946 + }, + { + "epoch": 2.1269586894586894, + "grad_norm": 0.7552717328071594, + "learning_rate": 9.018957971049211e-05, + "loss": 0.6802, + "step": 11947 + }, + { + "epoch": 2.127136752136752, + "grad_norm": 0.7437541484832764, + "learning_rate": 9.017564985621144e-05, + "loss": 0.9365, + "step": 11948 + }, + { + "epoch": 2.127314814814815, + "grad_norm": 0.8216256499290466, + "learning_rate": 9.016172019441847e-05, + "loss": 0.9019, + "step": 11949 + }, + { + "epoch": 2.1274928774928776, + "grad_norm": 0.752247154712677, + "learning_rate": 9.014779072538621e-05, + "loss": 0.7771, + "step": 11950 + }, + { + "epoch": 2.1276709401709404, + "grad_norm": 0.7714348435401917, + "learning_rate": 9.013386144938748e-05, + "loss": 0.8495, + "step": 11951 + }, + { + "epoch": 2.1278490028490027, + "grad_norm": 0.8347537517547607, + "learning_rate": 9.011993236669529e-05, + "loss": 0.861, + "step": 11952 + }, + { + "epoch": 2.1280270655270654, + "grad_norm": 0.8180193901062012, + "learning_rate": 9.010600347758245e-05, + "loss": 0.9059, + 
"step": 11953 + }, + { + "epoch": 2.128205128205128, + "grad_norm": 0.7328528761863708, + "learning_rate": 9.009207478232193e-05, + "loss": 0.9144, + "step": 11954 + }, + { + "epoch": 2.128383190883191, + "grad_norm": 0.7590839862823486, + "learning_rate": 9.007814628118661e-05, + "loss": 0.8642, + "step": 11955 + }, + { + "epoch": 2.1285612535612537, + "grad_norm": 0.7962782382965088, + "learning_rate": 9.006421797444945e-05, + "loss": 0.8958, + "step": 11956 + }, + { + "epoch": 2.128739316239316, + "grad_norm": 0.7302426695823669, + "learning_rate": 9.005028986238325e-05, + "loss": 0.9419, + "step": 11957 + }, + { + "epoch": 2.1289173789173788, + "grad_norm": 0.9223780632019043, + "learning_rate": 9.003636194526098e-05, + "loss": 0.7631, + "step": 11958 + }, + { + "epoch": 2.1290954415954415, + "grad_norm": 0.728225588798523, + "learning_rate": 9.002243422335547e-05, + "loss": 0.7705, + "step": 11959 + }, + { + "epoch": 2.1292735042735043, + "grad_norm": 0.8519338369369507, + "learning_rate": 9.000850669693964e-05, + "loss": 0.8962, + "step": 11960 + }, + { + "epoch": 2.129451566951567, + "grad_norm": 0.8920532464981079, + "learning_rate": 8.999457936628641e-05, + "loss": 0.618, + "step": 11961 + }, + { + "epoch": 2.1296296296296298, + "grad_norm": 0.9719427824020386, + "learning_rate": 8.998065223166857e-05, + "loss": 0.9142, + "step": 11962 + }, + { + "epoch": 2.1298076923076925, + "grad_norm": 0.8130887150764465, + "learning_rate": 8.996672529335908e-05, + "loss": 1.0246, + "step": 11963 + }, + { + "epoch": 2.129985754985755, + "grad_norm": 0.7682677507400513, + "learning_rate": 8.995279855163073e-05, + "loss": 0.7964, + "step": 11964 + }, + { + "epoch": 2.1301638176638176, + "grad_norm": 0.8507778644561768, + "learning_rate": 8.993887200675641e-05, + "loss": 0.756, + "step": 11965 + }, + { + "epoch": 2.1303418803418803, + "grad_norm": 0.815487802028656, + "learning_rate": 8.992494565900901e-05, + "loss": 0.7596, + "step": 11966 + }, + { + "epoch": 
2.130519943019943, + "grad_norm": 0.8560892939567566, + "learning_rate": 8.991101950866138e-05, + "loss": 0.9939, + "step": 11967 + }, + { + "epoch": 2.130698005698006, + "grad_norm": 0.8737899661064148, + "learning_rate": 8.989709355598635e-05, + "loss": 0.9235, + "step": 11968 + }, + { + "epoch": 2.1308760683760686, + "grad_norm": 0.8434267640113831, + "learning_rate": 8.98831678012568e-05, + "loss": 0.7832, + "step": 11969 + }, + { + "epoch": 2.131054131054131, + "grad_norm": 0.8286582827568054, + "learning_rate": 8.986924224474553e-05, + "loss": 1.0591, + "step": 11970 + }, + { + "epoch": 2.1312321937321936, + "grad_norm": 0.8023663759231567, + "learning_rate": 8.985531688672546e-05, + "loss": 0.935, + "step": 11971 + }, + { + "epoch": 2.1314102564102564, + "grad_norm": 0.6504420042037964, + "learning_rate": 8.984139172746933e-05, + "loss": 0.79, + "step": 11972 + }, + { + "epoch": 2.131588319088319, + "grad_norm": 0.8969349265098572, + "learning_rate": 8.982746676725009e-05, + "loss": 1.0531, + "step": 11973 + }, + { + "epoch": 2.131766381766382, + "grad_norm": 0.802094042301178, + "learning_rate": 8.981354200634046e-05, + "loss": 0.8873, + "step": 11974 + }, + { + "epoch": 2.1319444444444446, + "grad_norm": 0.7630797624588013, + "learning_rate": 8.979961744501332e-05, + "loss": 0.9299, + "step": 11975 + }, + { + "epoch": 2.132122507122507, + "grad_norm": 0.8395546674728394, + "learning_rate": 8.978569308354148e-05, + "loss": 0.922, + "step": 11976 + }, + { + "epoch": 2.1323005698005697, + "grad_norm": 0.9325534701347351, + "learning_rate": 8.97717689221978e-05, + "loss": 0.9156, + "step": 11977 + }, + { + "epoch": 2.1324786324786325, + "grad_norm": 0.8139503002166748, + "learning_rate": 8.975784496125502e-05, + "loss": 0.8882, + "step": 11978 + }, + { + "epoch": 2.132656695156695, + "grad_norm": 1.0311007499694824, + "learning_rate": 8.974392120098599e-05, + "loss": 1.0068, + "step": 11979 + }, + { + "epoch": 2.132834757834758, + "grad_norm": 
0.9328663945198059, + "learning_rate": 8.972999764166354e-05, + "loss": 0.8313, + "step": 11980 + }, + { + "epoch": 2.1330128205128207, + "grad_norm": 0.747276782989502, + "learning_rate": 8.971607428356044e-05, + "loss": 0.9302, + "step": 11981 + }, + { + "epoch": 2.133190883190883, + "grad_norm": 0.7572789788246155, + "learning_rate": 8.970215112694953e-05, + "loss": 0.8016, + "step": 11982 + }, + { + "epoch": 2.1333689458689458, + "grad_norm": 0.8988085389137268, + "learning_rate": 8.968822817210354e-05, + "loss": 0.9307, + "step": 11983 + }, + { + "epoch": 2.1335470085470085, + "grad_norm": 0.7537818551063538, + "learning_rate": 8.967430541929532e-05, + "loss": 0.6423, + "step": 11984 + }, + { + "epoch": 2.1337250712250713, + "grad_norm": 0.7470884323120117, + "learning_rate": 8.966038286879763e-05, + "loss": 0.7753, + "step": 11985 + }, + { + "epoch": 2.133903133903134, + "grad_norm": 0.8670676946640015, + "learning_rate": 8.964646052088328e-05, + "loss": 1.0407, + "step": 11986 + }, + { + "epoch": 2.1340811965811968, + "grad_norm": 0.8322215676307678, + "learning_rate": 8.9632538375825e-05, + "loss": 0.6498, + "step": 11987 + }, + { + "epoch": 2.134259259259259, + "grad_norm": 0.7089048027992249, + "learning_rate": 8.961861643389562e-05, + "loss": 0.8778, + "step": 11988 + }, + { + "epoch": 2.134437321937322, + "grad_norm": 0.7980125546455383, + "learning_rate": 8.960469469536786e-05, + "loss": 0.7797, + "step": 11989 + }, + { + "epoch": 2.1346153846153846, + "grad_norm": 0.9979715943336487, + "learning_rate": 8.959077316051452e-05, + "loss": 0.7388, + "step": 11990 + }, + { + "epoch": 2.1347934472934473, + "grad_norm": 1.0040662288665771, + "learning_rate": 8.957685182960833e-05, + "loss": 0.954, + "step": 11991 + }, + { + "epoch": 2.13497150997151, + "grad_norm": 0.7885099053382874, + "learning_rate": 8.956293070292214e-05, + "loss": 0.9232, + "step": 11992 + }, + { + "epoch": 2.135149572649573, + "grad_norm": 0.7242771983146667, + "learning_rate": 
8.954900978072859e-05, + "loss": 0.8614, + "step": 11993 + }, + { + "epoch": 2.135327635327635, + "grad_norm": 0.7970352172851562, + "learning_rate": 8.95350890633005e-05, + "loss": 0.8959, + "step": 11994 + }, + { + "epoch": 2.135505698005698, + "grad_norm": 0.8587128520011902, + "learning_rate": 8.952116855091059e-05, + "loss": 0.9981, + "step": 11995 + }, + { + "epoch": 2.1356837606837606, + "grad_norm": 0.8206220269203186, + "learning_rate": 8.950724824383164e-05, + "loss": 1.0271, + "step": 11996 + }, + { + "epoch": 2.1358618233618234, + "grad_norm": 0.8085001707077026, + "learning_rate": 8.949332814233635e-05, + "loss": 0.925, + "step": 11997 + }, + { + "epoch": 2.136039886039886, + "grad_norm": 0.8361417651176453, + "learning_rate": 8.947940824669748e-05, + "loss": 0.8744, + "step": 11998 + }, + { + "epoch": 2.136217948717949, + "grad_norm": 0.7548407316207886, + "learning_rate": 8.946548855718773e-05, + "loss": 0.7365, + "step": 11999 + }, + { + "epoch": 2.136396011396011, + "grad_norm": 0.8671223521232605, + "learning_rate": 8.945156907407983e-05, + "loss": 0.8958, + "step": 12000 + }, + { + "epoch": 2.136574074074074, + "grad_norm": 0.8007429838180542, + "learning_rate": 8.943764979764656e-05, + "loss": 0.955, + "step": 12001 + }, + { + "epoch": 2.1367521367521367, + "grad_norm": 0.7834315299987793, + "learning_rate": 8.942373072816057e-05, + "loss": 0.9226, + "step": 12002 + }, + { + "epoch": 2.1369301994301995, + "grad_norm": 0.896920919418335, + "learning_rate": 8.940981186589466e-05, + "loss": 0.8779, + "step": 12003 + }, + { + "epoch": 2.137108262108262, + "grad_norm": 0.7473411560058594, + "learning_rate": 8.939589321112143e-05, + "loss": 0.8993, + "step": 12004 + }, + { + "epoch": 2.137286324786325, + "grad_norm": 0.8071674704551697, + "learning_rate": 8.938197476411367e-05, + "loss": 0.9998, + "step": 12005 + }, + { + "epoch": 2.1374643874643873, + "grad_norm": 0.839290976524353, + "learning_rate": 8.936805652514404e-05, + "loss": 0.8311, + 
"step": 12006 + }, + { + "epoch": 2.13764245014245, + "grad_norm": 0.7217035293579102, + "learning_rate": 8.93541384944853e-05, + "loss": 0.8009, + "step": 12007 + }, + { + "epoch": 2.1378205128205128, + "grad_norm": 0.7392259836196899, + "learning_rate": 8.934022067241004e-05, + "loss": 0.9854, + "step": 12008 + }, + { + "epoch": 2.1379985754985755, + "grad_norm": 0.7470507621765137, + "learning_rate": 8.932630305919107e-05, + "loss": 0.8111, + "step": 12009 + }, + { + "epoch": 2.1381766381766383, + "grad_norm": 0.7988318204879761, + "learning_rate": 8.931238565510098e-05, + "loss": 0.8492, + "step": 12010 + }, + { + "epoch": 2.138354700854701, + "grad_norm": 0.9267526268959045, + "learning_rate": 8.929846846041251e-05, + "loss": 1.2238, + "step": 12011 + }, + { + "epoch": 2.1385327635327633, + "grad_norm": 0.8036465644836426, + "learning_rate": 8.92845514753983e-05, + "loss": 0.8837, + "step": 12012 + }, + { + "epoch": 2.138710826210826, + "grad_norm": 0.809256911277771, + "learning_rate": 8.927063470033109e-05, + "loss": 0.8836, + "step": 12013 + }, + { + "epoch": 2.138888888888889, + "grad_norm": 0.754692792892456, + "learning_rate": 8.925671813548345e-05, + "loss": 0.9469, + "step": 12014 + }, + { + "epoch": 2.1390669515669516, + "grad_norm": 0.9183036088943481, + "learning_rate": 8.924280178112814e-05, + "loss": 0.7654, + "step": 12015 + }, + { + "epoch": 2.1392450142450143, + "grad_norm": 0.82411128282547, + "learning_rate": 8.922888563753775e-05, + "loss": 0.9132, + "step": 12016 + }, + { + "epoch": 2.139423076923077, + "grad_norm": 0.8455918431282043, + "learning_rate": 8.9214969704985e-05, + "loss": 1.0041, + "step": 12017 + }, + { + "epoch": 2.1396011396011394, + "grad_norm": 0.9235896468162537, + "learning_rate": 8.92010539837425e-05, + "loss": 0.7842, + "step": 12018 + }, + { + "epoch": 2.139779202279202, + "grad_norm": 0.8965059518814087, + "learning_rate": 8.918713847408289e-05, + "loss": 1.0665, + "step": 12019 + }, + { + "epoch": 2.139957264957265, 
+ "grad_norm": null, + "learning_rate": 8.918713847408289e-05, + "loss": 1.0773, + "step": 12020 + }, + { + "epoch": 2.1401353276353277, + "grad_norm": 0.8859738707542419, + "learning_rate": 8.917322317627887e-05, + "loss": 0.9175, + "step": 12021 + }, + { + "epoch": 2.1403133903133904, + "grad_norm": 0.7828214764595032, + "learning_rate": 8.915930809060304e-05, + "loss": 0.8433, + "step": 12022 + }, + { + "epoch": 2.140491452991453, + "grad_norm": 0.7705734372138977, + "learning_rate": 8.914539321732808e-05, + "loss": 0.8696, + "step": 12023 + }, + { + "epoch": 2.1406695156695155, + "grad_norm": 0.7999989986419678, + "learning_rate": 8.913147855672655e-05, + "loss": 1.0531, + "step": 12024 + }, + { + "epoch": 2.140847578347578, + "grad_norm": 0.7210655212402344, + "learning_rate": 8.911756410907118e-05, + "loss": 0.6703, + "step": 12025 + }, + { + "epoch": 2.141025641025641, + "grad_norm": 0.7153459191322327, + "learning_rate": 8.910364987463447e-05, + "loss": 0.7166, + "step": 12026 + }, + { + "epoch": 2.1412037037037037, + "grad_norm": 0.771530032157898, + "learning_rate": 8.908973585368913e-05, + "loss": 0.6881, + "step": 12027 + }, + { + "epoch": 2.1413817663817665, + "grad_norm": 0.9988116025924683, + "learning_rate": 8.907582204650774e-05, + "loss": 0.8329, + "step": 12028 + }, + { + "epoch": 2.1415598290598292, + "grad_norm": 0.6992440819740295, + "learning_rate": 8.906190845336296e-05, + "loss": 0.6262, + "step": 12029 + }, + { + "epoch": 2.1417378917378915, + "grad_norm": 0.8061181902885437, + "learning_rate": 8.904799507452731e-05, + "loss": 0.8325, + "step": 12030 + }, + { + "epoch": 2.1419159544159543, + "grad_norm": 0.8372871279716492, + "learning_rate": 8.903408191027349e-05, + "loss": 0.8894, + "step": 12031 + }, + { + "epoch": 2.142094017094017, + "grad_norm": 0.803719162940979, + "learning_rate": 8.902016896087402e-05, + "loss": 0.9031, + "step": 12032 + }, + { + "epoch": 2.14227207977208, + "grad_norm": 0.8168890476226807, + "learning_rate": 
8.900625622660158e-05, + "loss": 0.8174, + "step": 12033 + }, + { + "epoch": 2.1424501424501425, + "grad_norm": 0.8011388182640076, + "learning_rate": 8.899234370772865e-05, + "loss": 0.8267, + "step": 12034 + }, + { + "epoch": 2.1426282051282053, + "grad_norm": 0.8209220767021179, + "learning_rate": 8.897843140452795e-05, + "loss": 0.9303, + "step": 12035 + }, + { + "epoch": 2.142806267806268, + "grad_norm": 0.773525059223175, + "learning_rate": 8.896451931727192e-05, + "loss": 0.7037, + "step": 12036 + }, + { + "epoch": 2.1429843304843303, + "grad_norm": 0.7568892240524292, + "learning_rate": 8.895060744623324e-05, + "loss": 0.8568, + "step": 12037 + }, + { + "epoch": 2.143162393162393, + "grad_norm": 0.713636040687561, + "learning_rate": 8.893669579168444e-05, + "loss": 0.7838, + "step": 12038 + }, + { + "epoch": 2.143340455840456, + "grad_norm": 0.7462167739868164, + "learning_rate": 8.892278435389814e-05, + "loss": 0.6311, + "step": 12039 + }, + { + "epoch": 2.1435185185185186, + "grad_norm": 0.7164530158042908, + "learning_rate": 8.890887313314685e-05, + "loss": 1.0228, + "step": 12040 + }, + { + "epoch": 2.1436965811965814, + "grad_norm": 0.7540927529335022, + "learning_rate": 8.889496212970312e-05, + "loss": 0.8958, + "step": 12041 + }, + { + "epoch": 2.1438746438746437, + "grad_norm": 0.8119065761566162, + "learning_rate": 8.888105134383957e-05, + "loss": 0.8925, + "step": 12042 + }, + { + "epoch": 2.1440527065527064, + "grad_norm": 0.7905679941177368, + "learning_rate": 8.88671407758287e-05, + "loss": 0.7579, + "step": 12043 + }, + { + "epoch": 2.144230769230769, + "grad_norm": 0.8901177048683167, + "learning_rate": 8.885323042594312e-05, + "loss": 0.8849, + "step": 12044 + }, + { + "epoch": 2.144408831908832, + "grad_norm": 0.6958974599838257, + "learning_rate": 8.88393202944553e-05, + "loss": 0.8072, + "step": 12045 + }, + { + "epoch": 2.1445868945868947, + "grad_norm": 0.790036141872406, + "learning_rate": 8.882541038163786e-05, + "loss": 0.796, + 
"step": 12046 + }, + { + "epoch": 2.1447649572649574, + "grad_norm": 0.757655680179596, + "learning_rate": 8.881150068776324e-05, + "loss": 0.8094, + "step": 12047 + }, + { + "epoch": 2.14494301994302, + "grad_norm": 0.7525215148925781, + "learning_rate": 8.879759121310404e-05, + "loss": 0.6746, + "step": 12048 + }, + { + "epoch": 2.1451210826210825, + "grad_norm": 0.740566074848175, + "learning_rate": 8.878368195793276e-05, + "loss": 0.688, + "step": 12049 + }, + { + "epoch": 2.1452991452991452, + "grad_norm": 0.7771985530853271, + "learning_rate": 8.876977292252196e-05, + "loss": 1.0013, + "step": 12050 + }, + { + "epoch": 2.145477207977208, + "grad_norm": 0.8582369685173035, + "learning_rate": 8.875586410714409e-05, + "loss": 0.9185, + "step": 12051 + }, + { + "epoch": 2.1456552706552707, + "grad_norm": 0.7992526292800903, + "learning_rate": 8.874195551207174e-05, + "loss": 0.8388, + "step": 12052 + }, + { + "epoch": 2.1458333333333335, + "grad_norm": 0.795129120349884, + "learning_rate": 8.872804713757735e-05, + "loss": 0.88, + "step": 12053 + }, + { + "epoch": 2.146011396011396, + "grad_norm": 0.7467540502548218, + "learning_rate": 8.871413898393351e-05, + "loss": 0.8092, + "step": 12054 + }, + { + "epoch": 2.1461894586894585, + "grad_norm": 0.9468266367912292, + "learning_rate": 8.870023105141264e-05, + "loss": 0.7759, + "step": 12055 + }, + { + "epoch": 2.1463675213675213, + "grad_norm": 0.7893772721290588, + "learning_rate": 8.868632334028727e-05, + "loss": 0.8508, + "step": 12056 + }, + { + "epoch": 2.146545584045584, + "grad_norm": 0.6931375861167908, + "learning_rate": 8.867241585082988e-05, + "loss": 0.5013, + "step": 12057 + }, + { + "epoch": 2.146723646723647, + "grad_norm": 0.8978447318077087, + "learning_rate": 8.865850858331301e-05, + "loss": 0.9518, + "step": 12058 + }, + { + "epoch": 2.1469017094017095, + "grad_norm": 0.7293453812599182, + "learning_rate": 8.864460153800906e-05, + "loss": 0.731, + "step": 12059 + }, + { + "epoch": 
2.1470797720797723, + "grad_norm": 0.8537824749946594, + "learning_rate": 8.863069471519056e-05, + "loss": 0.7935, + "step": 12060 + }, + { + "epoch": 2.1472578347578346, + "grad_norm": 0.6527614593505859, + "learning_rate": 8.861678811513002e-05, + "loss": 0.6579, + "step": 12061 + }, + { + "epoch": 2.1474358974358974, + "grad_norm": 0.9407904148101807, + "learning_rate": 8.860288173809983e-05, + "loss": 0.9057, + "step": 12062 + }, + { + "epoch": 2.14761396011396, + "grad_norm": 0.9314194321632385, + "learning_rate": 8.858897558437251e-05, + "loss": 1.0826, + "step": 12063 + }, + { + "epoch": 2.147792022792023, + "grad_norm": 0.7872337102890015, + "learning_rate": 8.85750696542205e-05, + "loss": 0.8919, + "step": 12064 + }, + { + "epoch": 2.1479700854700856, + "grad_norm": 0.8379341959953308, + "learning_rate": 8.85611639479163e-05, + "loss": 0.8193, + "step": 12065 + }, + { + "epoch": 2.148148148148148, + "grad_norm": 0.801295280456543, + "learning_rate": 8.85472584657323e-05, + "loss": 0.9305, + "step": 12066 + }, + { + "epoch": 2.1483262108262107, + "grad_norm": 0.7625086903572083, + "learning_rate": 8.853335320794098e-05, + "loss": 0.8379, + "step": 12067 + }, + { + "epoch": 2.1485042735042734, + "grad_norm": 0.8256231546401978, + "learning_rate": 8.851944817481478e-05, + "loss": 0.8901, + "step": 12068 + }, + { + "epoch": 2.148682336182336, + "grad_norm": 0.6940581202507019, + "learning_rate": 8.850554336662618e-05, + "loss": 0.6706, + "step": 12069 + }, + { + "epoch": 2.148860398860399, + "grad_norm": 0.910836398601532, + "learning_rate": 8.849163878364755e-05, + "loss": 0.9326, + "step": 12070 + }, + { + "epoch": 2.1490384615384617, + "grad_norm": 0.8550460934638977, + "learning_rate": 8.847773442615138e-05, + "loss": 0.8474, + "step": 12071 + }, + { + "epoch": 2.1492165242165244, + "grad_norm": 0.8178627490997314, + "learning_rate": 8.846383029441002e-05, + "loss": 0.8331, + "step": 12072 + }, + { + "epoch": 2.1493945868945867, + "grad_norm": 
0.7606281638145447, + "learning_rate": 8.844992638869599e-05, + "loss": 0.6571, + "step": 12073 + }, + { + "epoch": 2.1495726495726495, + "grad_norm": 0.7166888117790222, + "learning_rate": 8.84360227092816e-05, + "loss": 0.8592, + "step": 12074 + }, + { + "epoch": 2.1497507122507122, + "grad_norm": 0.7688186764717102, + "learning_rate": 8.84221192564394e-05, + "loss": 0.691, + "step": 12075 + }, + { + "epoch": 2.149928774928775, + "grad_norm": 0.876740038394928, + "learning_rate": 8.840821603044166e-05, + "loss": 0.9962, + "step": 12076 + }, + { + "epoch": 2.1501068376068377, + "grad_norm": 0.7910363078117371, + "learning_rate": 8.839431303156087e-05, + "loss": 0.8061, + "step": 12077 + }, + { + "epoch": 2.1502849002849005, + "grad_norm": 0.6880493760108948, + "learning_rate": 8.83804102600694e-05, + "loss": 0.8078, + "step": 12078 + }, + { + "epoch": 2.150462962962963, + "grad_norm": 0.7795937061309814, + "learning_rate": 8.836650771623963e-05, + "loss": 0.8504, + "step": 12079 + }, + { + "epoch": 2.1506410256410255, + "grad_norm": 0.7761844992637634, + "learning_rate": 8.835260540034403e-05, + "loss": 0.7253, + "step": 12080 + }, + { + "epoch": 2.1508190883190883, + "grad_norm": 0.7070515751838684, + "learning_rate": 8.83387033126549e-05, + "loss": 0.7156, + "step": 12081 + }, + { + "epoch": 2.150997150997151, + "grad_norm": 0.7666274309158325, + "learning_rate": 8.832480145344467e-05, + "loss": 0.813, + "step": 12082 + }, + { + "epoch": 2.151175213675214, + "grad_norm": 0.9145975708961487, + "learning_rate": 8.831089982298568e-05, + "loss": 0.8889, + "step": 12083 + }, + { + "epoch": 2.1513532763532766, + "grad_norm": 0.7735843062400818, + "learning_rate": 8.829699842155035e-05, + "loss": 0.7152, + "step": 12084 + }, + { + "epoch": 2.151531339031339, + "grad_norm": 0.7625414729118347, + "learning_rate": 8.828309724941099e-05, + "loss": 0.8752, + "step": 12085 + }, + { + "epoch": 2.1517094017094016, + "grad_norm": 0.8874264359474182, + "learning_rate": 
8.826919630684005e-05, + "loss": 0.8175, + "step": 12086 + }, + { + "epoch": 2.1518874643874644, + "grad_norm": 0.7425693273544312, + "learning_rate": 8.82552955941098e-05, + "loss": 0.7505, + "step": 12087 + }, + { + "epoch": 2.152065527065527, + "grad_norm": 0.7098270058631897, + "learning_rate": 8.824139511149265e-05, + "loss": 0.7129, + "step": 12088 + }, + { + "epoch": 2.15224358974359, + "grad_norm": 0.8470510840415955, + "learning_rate": 8.822749485926092e-05, + "loss": 0.9656, + "step": 12089 + }, + { + "epoch": 2.1524216524216526, + "grad_norm": 0.7690402865409851, + "learning_rate": 8.8213594837687e-05, + "loss": 0.9436, + "step": 12090 + }, + { + "epoch": 2.152599715099715, + "grad_norm": 0.77431321144104, + "learning_rate": 8.819969504704318e-05, + "loss": 0.9912, + "step": 12091 + }, + { + "epoch": 2.1527777777777777, + "grad_norm": 0.7590892314910889, + "learning_rate": 8.818579548760184e-05, + "loss": 0.7412, + "step": 12092 + }, + { + "epoch": 2.1529558404558404, + "grad_norm": 0.870966374874115, + "learning_rate": 8.817189615963528e-05, + "loss": 1.0248, + "step": 12093 + }, + { + "epoch": 2.153133903133903, + "grad_norm": 0.7989356517791748, + "learning_rate": 8.815799706341587e-05, + "loss": 0.7104, + "step": 12094 + }, + { + "epoch": 2.153311965811966, + "grad_norm": 0.9615582227706909, + "learning_rate": 8.814409819921589e-05, + "loss": 1.0191, + "step": 12095 + }, + { + "epoch": 2.1534900284900287, + "grad_norm": 0.7063159346580505, + "learning_rate": 8.81301995673077e-05, + "loss": 0.8209, + "step": 12096 + }, + { + "epoch": 2.153668091168091, + "grad_norm": 0.8179874420166016, + "learning_rate": 8.811630116796356e-05, + "loss": 0.9457, + "step": 12097 + }, + { + "epoch": 2.1538461538461537, + "grad_norm": 0.7227353453636169, + "learning_rate": 8.810240300145582e-05, + "loss": 0.8112, + "step": 12098 + }, + { + "epoch": 2.1540242165242165, + "grad_norm": 0.7480359077453613, + "learning_rate": 8.808850506805677e-05, + "loss": 0.6293, + "step": 
12099 + }, + { + "epoch": 2.1542022792022792, + "grad_norm": 0.7610893845558167, + "learning_rate": 8.807460736803871e-05, + "loss": 0.911, + "step": 12100 + }, + { + "epoch": 2.154380341880342, + "grad_norm": 0.774640679359436, + "learning_rate": 8.806070990167399e-05, + "loss": 0.8144, + "step": 12101 + }, + { + "epoch": 2.1545584045584047, + "grad_norm": 0.7785552144050598, + "learning_rate": 8.804681266923482e-05, + "loss": 0.8841, + "step": 12102 + }, + { + "epoch": 2.154736467236467, + "grad_norm": 0.843715488910675, + "learning_rate": 8.803291567099354e-05, + "loss": 0.9056, + "step": 12103 + }, + { + "epoch": 2.15491452991453, + "grad_norm": 0.7996346354484558, + "learning_rate": 8.801901890722241e-05, + "loss": 0.8916, + "step": 12104 + }, + { + "epoch": 2.1550925925925926, + "grad_norm": 0.9159125685691833, + "learning_rate": 8.800512237819376e-05, + "loss": 0.8225, + "step": 12105 + }, + { + "epoch": 2.1552706552706553, + "grad_norm": 0.8341643810272217, + "learning_rate": 8.799122608417976e-05, + "loss": 0.8702, + "step": 12106 + }, + { + "epoch": 2.155448717948718, + "grad_norm": 0.8075932264328003, + "learning_rate": 8.797733002545278e-05, + "loss": 0.9167, + "step": 12107 + }, + { + "epoch": 2.155626780626781, + "grad_norm": 0.8370183706283569, + "learning_rate": 8.7963434202285e-05, + "loss": 0.9213, + "step": 12108 + }, + { + "epoch": 2.155804843304843, + "grad_norm": 0.7500374913215637, + "learning_rate": 8.794953861494877e-05, + "loss": 0.7702, + "step": 12109 + }, + { + "epoch": 2.155982905982906, + "grad_norm": 0.7347766160964966, + "learning_rate": 8.793564326371626e-05, + "loss": 0.7057, + "step": 12110 + }, + { + "epoch": 2.1561609686609686, + "grad_norm": 0.754917562007904, + "learning_rate": 8.79217481488598e-05, + "loss": 0.8725, + "step": 12111 + }, + { + "epoch": 2.1563390313390314, + "grad_norm": 0.6942774057388306, + "learning_rate": 8.790785327065155e-05, + "loss": 0.85, + "step": 12112 + }, + { + "epoch": 2.156517094017094, + 
"grad_norm": 0.8082157969474792, + "learning_rate": 8.789395862936383e-05, + "loss": 1.1462, + "step": 12113 + }, + { + "epoch": 2.156695156695157, + "grad_norm": 0.898435652256012, + "learning_rate": 8.788006422526881e-05, + "loss": 0.8044, + "step": 12114 + }, + { + "epoch": 2.156873219373219, + "grad_norm": 0.9474737048149109, + "learning_rate": 8.786617005863879e-05, + "loss": 0.9089, + "step": 12115 + }, + { + "epoch": 2.157051282051282, + "grad_norm": 0.7898718118667603, + "learning_rate": 8.785227612974594e-05, + "loss": 0.9758, + "step": 12116 + }, + { + "epoch": 2.1572293447293447, + "grad_norm": 0.6734052300453186, + "learning_rate": 8.783838243886253e-05, + "loss": 0.7835, + "step": 12117 + }, + { + "epoch": 2.1574074074074074, + "grad_norm": 0.9381069540977478, + "learning_rate": 8.782448898626072e-05, + "loss": 0.7666, + "step": 12118 + }, + { + "epoch": 2.15758547008547, + "grad_norm": 0.8677506446838379, + "learning_rate": 8.781059577221276e-05, + "loss": 0.7442, + "step": 12119 + }, + { + "epoch": 2.157763532763533, + "grad_norm": 0.8244445323944092, + "learning_rate": 8.779670279699086e-05, + "loss": 0.8104, + "step": 12120 + }, + { + "epoch": 2.1579415954415953, + "grad_norm": 0.7984805703163147, + "learning_rate": 8.77828100608672e-05, + "loss": 1.029, + "step": 12121 + }, + { + "epoch": 2.158119658119658, + "grad_norm": 0.7817366123199463, + "learning_rate": 8.776891756411405e-05, + "loss": 0.6797, + "step": 12122 + }, + { + "epoch": 2.1582977207977208, + "grad_norm": 0.7084082365036011, + "learning_rate": 8.77550253070035e-05, + "loss": 0.688, + "step": 12123 + }, + { + "epoch": 2.1584757834757835, + "grad_norm": 0.7659782767295837, + "learning_rate": 8.774113328980782e-05, + "loss": 0.8691, + "step": 12124 + }, + { + "epoch": 2.1586538461538463, + "grad_norm": 0.7010130286216736, + "learning_rate": 8.772724151279913e-05, + "loss": 0.7587, + "step": 12125 + }, + { + "epoch": 2.158831908831909, + "grad_norm": 0.8183525800704956, + 
"learning_rate": 8.771334997624973e-05, + "loss": 0.8696, + "step": 12126 + }, + { + "epoch": 2.1590099715099713, + "grad_norm": 0.7944908142089844, + "learning_rate": 8.769945868043164e-05, + "loss": 0.8625, + "step": 12127 + }, + { + "epoch": 2.159188034188034, + "grad_norm": 0.7710323333740234, + "learning_rate": 8.768556762561713e-05, + "loss": 0.7765, + "step": 12128 + }, + { + "epoch": 2.159366096866097, + "grad_norm": 0.7416872382164001, + "learning_rate": 8.767167681207833e-05, + "loss": 0.9151, + "step": 12129 + }, + { + "epoch": 2.1595441595441596, + "grad_norm": 0.9230012893676758, + "learning_rate": 8.765778624008744e-05, + "loss": 0.914, + "step": 12130 + }, + { + "epoch": 2.1597222222222223, + "grad_norm": 0.7468557357788086, + "learning_rate": 8.764389590991657e-05, + "loss": 0.8624, + "step": 12131 + }, + { + "epoch": 2.159900284900285, + "grad_norm": 0.7746220827102661, + "learning_rate": 8.763000582183791e-05, + "loss": 0.9683, + "step": 12132 + }, + { + "epoch": 2.1600783475783474, + "grad_norm": 0.8429577350616455, + "learning_rate": 8.761611597612356e-05, + "loss": 0.8808, + "step": 12133 + }, + { + "epoch": 2.16025641025641, + "grad_norm": 0.8117298483848572, + "learning_rate": 8.760222637304572e-05, + "loss": 0.7067, + "step": 12134 + }, + { + "epoch": 2.160434472934473, + "grad_norm": 0.7717329859733582, + "learning_rate": 8.758833701287647e-05, + "loss": 1.0001, + "step": 12135 + }, + { + "epoch": 2.1606125356125356, + "grad_norm": 0.8493856191635132, + "learning_rate": 8.7574447895888e-05, + "loss": 0.9883, + "step": 12136 + }, + { + "epoch": 2.1607905982905984, + "grad_norm": 0.8592587113380432, + "learning_rate": 8.75605590223524e-05, + "loss": 0.7437, + "step": 12137 + }, + { + "epoch": 2.160968660968661, + "grad_norm": 0.6487032771110535, + "learning_rate": 8.75466703925418e-05, + "loss": 0.6841, + "step": 12138 + }, + { + "epoch": 2.1611467236467234, + "grad_norm": 0.8449310660362244, + "learning_rate": 8.753278200672832e-05, + 
"loss": 0.9221, + "step": 12139 + }, + { + "epoch": 2.161324786324786, + "grad_norm": 0.9603136777877808, + "learning_rate": 8.751889386518407e-05, + "loss": 0.8664, + "step": 12140 + }, + { + "epoch": 2.161502849002849, + "grad_norm": 0.7288493514060974, + "learning_rate": 8.750500596818121e-05, + "loss": 0.5745, + "step": 12141 + }, + { + "epoch": 2.1616809116809117, + "grad_norm": 0.8626441955566406, + "learning_rate": 8.749111831599178e-05, + "loss": 0.8346, + "step": 12142 + }, + { + "epoch": 2.1618589743589745, + "grad_norm": 0.7634188532829285, + "learning_rate": 8.74772309088879e-05, + "loss": 0.7503, + "step": 12143 + }, + { + "epoch": 2.162037037037037, + "grad_norm": 0.8641456365585327, + "learning_rate": 8.746334374714167e-05, + "loss": 0.9033, + "step": 12144 + }, + { + "epoch": 2.1622150997150995, + "grad_norm": 0.8103315234184265, + "learning_rate": 8.744945683102517e-05, + "loss": 0.8181, + "step": 12145 + }, + { + "epoch": 2.1623931623931623, + "grad_norm": 1.2493078708648682, + "learning_rate": 8.743557016081047e-05, + "loss": 1.0308, + "step": 12146 + }, + { + "epoch": 2.162571225071225, + "grad_norm": 0.6447771191596985, + "learning_rate": 8.742168373676973e-05, + "loss": 0.6886, + "step": 12147 + }, + { + "epoch": 2.1627492877492878, + "grad_norm": 0.90229332447052, + "learning_rate": 8.740779755917492e-05, + "loss": 1.0361, + "step": 12148 + }, + { + "epoch": 2.1629273504273505, + "grad_norm": 0.7414017915725708, + "learning_rate": 8.739391162829818e-05, + "loss": 0.781, + "step": 12149 + }, + { + "epoch": 2.1631054131054133, + "grad_norm": 0.8897294998168945, + "learning_rate": 8.738002594441154e-05, + "loss": 0.7712, + "step": 12150 + }, + { + "epoch": 2.1632834757834756, + "grad_norm": 0.8515656590461731, + "learning_rate": 8.73661405077871e-05, + "loss": 0.8843, + "step": 12151 + }, + { + "epoch": 2.1634615384615383, + "grad_norm": 0.7901699542999268, + "learning_rate": 8.735225531869686e-05, + "loss": 0.8588, + "step": 12152 + }, + { + 
"epoch": 2.163639601139601, + "grad_norm": 0.7262305021286011, + "learning_rate": 8.733837037741295e-05, + "loss": 0.9257, + "step": 12153 + }, + { + "epoch": 2.163817663817664, + "grad_norm": 1.1076871156692505, + "learning_rate": 8.732448568420732e-05, + "loss": 0.9817, + "step": 12154 + }, + { + "epoch": 2.1639957264957266, + "grad_norm": 0.8384785652160645, + "learning_rate": 8.731060123935209e-05, + "loss": 0.8024, + "step": 12155 + }, + { + "epoch": 2.1641737891737893, + "grad_norm": 0.8376259803771973, + "learning_rate": 8.729671704311924e-05, + "loss": 1.0299, + "step": 12156 + }, + { + "epoch": 2.164351851851852, + "grad_norm": 0.8248558044433594, + "learning_rate": 8.728283309578089e-05, + "loss": 0.9557, + "step": 12157 + }, + { + "epoch": 2.1645299145299144, + "grad_norm": 0.7452875375747681, + "learning_rate": 8.726894939760894e-05, + "loss": 0.8267, + "step": 12158 + }, + { + "epoch": 2.164707977207977, + "grad_norm": 0.8329267501831055, + "learning_rate": 8.72550659488755e-05, + "loss": 0.9288, + "step": 12159 + }, + { + "epoch": 2.16488603988604, + "grad_norm": 0.8748268485069275, + "learning_rate": 8.724118274985259e-05, + "loss": 0.8663, + "step": 12160 + }, + { + "epoch": 2.1650641025641026, + "grad_norm": 0.6839116811752319, + "learning_rate": 8.722729980081217e-05, + "loss": 0.6067, + "step": 12161 + }, + { + "epoch": 2.1652421652421654, + "grad_norm": 0.8343674540519714, + "learning_rate": 8.721341710202632e-05, + "loss": 0.9611, + "step": 12162 + }, + { + "epoch": 2.1654202279202277, + "grad_norm": 0.7783843874931335, + "learning_rate": 8.719953465376695e-05, + "loss": 0.8921, + "step": 12163 + }, + { + "epoch": 2.1655982905982905, + "grad_norm": 0.8357030749320984, + "learning_rate": 8.718565245630615e-05, + "loss": 0.9189, + "step": 12164 + }, + { + "epoch": 2.165776353276353, + "grad_norm": 0.8150131702423096, + "learning_rate": 8.717177050991582e-05, + "loss": 0.7486, + "step": 12165 + }, + { + "epoch": 2.165954415954416, + "grad_norm": 
0.7282506823539734, + "learning_rate": 8.715788881486807e-05, + "loss": 0.8894, + "step": 12166 + }, + { + "epoch": 2.1661324786324787, + "grad_norm": 0.9958226680755615, + "learning_rate": 8.714400737143475e-05, + "loss": 1.0359, + "step": 12167 + }, + { + "epoch": 2.1663105413105415, + "grad_norm": 0.7162553071975708, + "learning_rate": 8.713012617988796e-05, + "loss": 0.7728, + "step": 12168 + }, + { + "epoch": 2.166488603988604, + "grad_norm": 0.6364821791648865, + "learning_rate": 8.711624524049955e-05, + "loss": 0.5881, + "step": 12169 + }, + { + "epoch": 2.1666666666666665, + "grad_norm": 0.9431148767471313, + "learning_rate": 8.710236455354159e-05, + "loss": 0.8804, + "step": 12170 + }, + { + "epoch": 2.1668447293447293, + "grad_norm": 0.7328855395317078, + "learning_rate": 8.708848411928598e-05, + "loss": 0.7762, + "step": 12171 + }, + { + "epoch": 2.167022792022792, + "grad_norm": 0.7855633497238159, + "learning_rate": 8.707460393800472e-05, + "loss": 0.7687, + "step": 12172 + }, + { + "epoch": 2.1672008547008548, + "grad_norm": 0.8694273233413696, + "learning_rate": 8.706072400996973e-05, + "loss": 0.7153, + "step": 12173 + }, + { + "epoch": 2.1673789173789175, + "grad_norm": 0.7371255159378052, + "learning_rate": 8.704684433545299e-05, + "loss": 0.7901, + "step": 12174 + }, + { + "epoch": 2.16755698005698, + "grad_norm": 0.7719849944114685, + "learning_rate": 8.70329649147264e-05, + "loss": 0.7569, + "step": 12175 + }, + { + "epoch": 2.1677350427350426, + "grad_norm": 0.883618175983429, + "learning_rate": 8.701908574806197e-05, + "loss": 0.8941, + "step": 12176 + }, + { + "epoch": 2.1679131054131053, + "grad_norm": 0.9455791711807251, + "learning_rate": 8.700520683573155e-05, + "loss": 0.8596, + "step": 12177 + }, + { + "epoch": 2.168091168091168, + "grad_norm": 0.7487229108810425, + "learning_rate": 8.69913281780071e-05, + "loss": 0.7353, + "step": 12178 + }, + { + "epoch": 2.168269230769231, + "grad_norm": 0.8050364255905151, + "learning_rate": 
8.697744977516062e-05, + "loss": 0.8564, + "step": 12179 + }, + { + "epoch": 2.1684472934472936, + "grad_norm": 0.759355902671814, + "learning_rate": 8.69635716274639e-05, + "loss": 0.7128, + "step": 12180 + }, + { + "epoch": 2.1686253561253563, + "grad_norm": 0.8730760216712952, + "learning_rate": 8.694969373518892e-05, + "loss": 0.9944, + "step": 12181 + }, + { + "epoch": 2.1688034188034186, + "grad_norm": 0.7761130332946777, + "learning_rate": 8.693581609860756e-05, + "loss": 0.6845, + "step": 12182 + }, + { + "epoch": 2.1689814814814814, + "grad_norm": 0.8118788003921509, + "learning_rate": 8.692193871799181e-05, + "loss": 0.798, + "step": 12183 + }, + { + "epoch": 2.169159544159544, + "grad_norm": 0.8340219855308533, + "learning_rate": 8.690806159361344e-05, + "loss": 0.9754, + "step": 12184 + }, + { + "epoch": 2.169337606837607, + "grad_norm": 0.7515831589698792, + "learning_rate": 8.689418472574444e-05, + "loss": 0.998, + "step": 12185 + }, + { + "epoch": 2.1695156695156697, + "grad_norm": 0.7781083583831787, + "learning_rate": 8.688030811465665e-05, + "loss": 1.0152, + "step": 12186 + }, + { + "epoch": 2.169693732193732, + "grad_norm": 0.775097131729126, + "learning_rate": 8.6866431760622e-05, + "loss": 0.808, + "step": 12187 + }, + { + "epoch": 2.1698717948717947, + "grad_norm": 0.8288158178329468, + "learning_rate": 8.68525556639123e-05, + "loss": 0.9172, + "step": 12188 + }, + { + "epoch": 2.1700498575498575, + "grad_norm": 0.7754917740821838, + "learning_rate": 8.68386798247995e-05, + "loss": 0.719, + "step": 12189 + }, + { + "epoch": 2.17022792022792, + "grad_norm": 0.786685585975647, + "learning_rate": 8.682480424355539e-05, + "loss": 0.8883, + "step": 12190 + }, + { + "epoch": 2.170405982905983, + "grad_norm": 0.9970952272415161, + "learning_rate": 8.681092892045189e-05, + "loss": 0.9258, + "step": 12191 + }, + { + "epoch": 2.1705840455840457, + "grad_norm": 0.9732664227485657, + "learning_rate": 8.679705385576082e-05, + "loss": 0.8916, + "step": 
12192 + }, + { + "epoch": 2.1707621082621085, + "grad_norm": 0.8557142615318298, + "learning_rate": 8.67831790497541e-05, + "loss": 0.8908, + "step": 12193 + }, + { + "epoch": 2.1709401709401708, + "grad_norm": 0.8564930558204651, + "learning_rate": 8.676930450270347e-05, + "loss": 1.054, + "step": 12194 + }, + { + "epoch": 2.1711182336182335, + "grad_norm": 0.785732090473175, + "learning_rate": 8.675543021488087e-05, + "loss": 0.7459, + "step": 12195 + }, + { + "epoch": 2.1712962962962963, + "grad_norm": 0.8739910125732422, + "learning_rate": 8.674155618655809e-05, + "loss": 0.8464, + "step": 12196 + }, + { + "epoch": 2.171474358974359, + "grad_norm": 0.8624834418296814, + "learning_rate": 8.672768241800699e-05, + "loss": 0.9405, + "step": 12197 + }, + { + "epoch": 2.171652421652422, + "grad_norm": 0.7948583364486694, + "learning_rate": 8.671380890949936e-05, + "loss": 1.0271, + "step": 12198 + }, + { + "epoch": 2.1718304843304845, + "grad_norm": 0.8078029155731201, + "learning_rate": 8.669993566130704e-05, + "loss": 0.6845, + "step": 12199 + }, + { + "epoch": 2.172008547008547, + "grad_norm": 0.7599586844444275, + "learning_rate": 8.668606267370187e-05, + "loss": 0.8438, + "step": 12200 + }, + { + "epoch": 2.1721866096866096, + "grad_norm": 0.8085161447525024, + "learning_rate": 8.667218994695562e-05, + "loss": 0.8398, + "step": 12201 + }, + { + "epoch": 2.1723646723646723, + "grad_norm": 0.9033090472221375, + "learning_rate": 8.665831748134019e-05, + "loss": 0.8591, + "step": 12202 + }, + { + "epoch": 2.172542735042735, + "grad_norm": 0.8638277649879456, + "learning_rate": 8.664444527712726e-05, + "loss": 0.8276, + "step": 12203 + }, + { + "epoch": 2.172720797720798, + "grad_norm": 0.8230745792388916, + "learning_rate": 8.663057333458871e-05, + "loss": 0.8663, + "step": 12204 + }, + { + "epoch": 2.1728988603988606, + "grad_norm": 0.8588439226150513, + "learning_rate": 8.661670165399626e-05, + "loss": 0.8543, + "step": 12205 + }, + { + "epoch": 2.173076923076923, 
+ "grad_norm": 0.8461976051330566, + "learning_rate": 8.660283023562177e-05, + "loss": 0.8973, + "step": 12206 + }, + { + "epoch": 2.1732549857549857, + "grad_norm": 0.7768828868865967, + "learning_rate": 8.658895907973697e-05, + "loss": 0.7285, + "step": 12207 + }, + { + "epoch": 2.1734330484330484, + "grad_norm": 0.7384130954742432, + "learning_rate": 8.65750881866137e-05, + "loss": 0.7654, + "step": 12208 + }, + { + "epoch": 2.173611111111111, + "grad_norm": 0.8700957298278809, + "learning_rate": 8.656121755652365e-05, + "loss": 1.067, + "step": 12209 + }, + { + "epoch": 2.173789173789174, + "grad_norm": 0.8067826628684998, + "learning_rate": 8.654734718973863e-05, + "loss": 0.9863, + "step": 12210 + }, + { + "epoch": 2.1739672364672367, + "grad_norm": 0.7515989542007446, + "learning_rate": 8.653347708653039e-05, + "loss": 0.8434, + "step": 12211 + }, + { + "epoch": 2.174145299145299, + "grad_norm": 0.8280966877937317, + "learning_rate": 8.651960724717072e-05, + "loss": 1.0065, + "step": 12212 + }, + { + "epoch": 2.1743233618233617, + "grad_norm": 0.7988734841346741, + "learning_rate": 8.650573767193132e-05, + "loss": 0.7892, + "step": 12213 + }, + { + "epoch": 2.1745014245014245, + "grad_norm": 0.785323977470398, + "learning_rate": 8.649186836108399e-05, + "loss": 0.7563, + "step": 12214 + }, + { + "epoch": 2.1746794871794872, + "grad_norm": 0.7884892821311951, + "learning_rate": 8.64779993149004e-05, + "loss": 0.7225, + "step": 12215 + }, + { + "epoch": 2.17485754985755, + "grad_norm": 0.9376154541969299, + "learning_rate": 8.646413053365235e-05, + "loss": 0.8103, + "step": 12216 + }, + { + "epoch": 2.1750356125356127, + "grad_norm": 0.9680297374725342, + "learning_rate": 8.64502620176115e-05, + "loss": 0.8924, + "step": 12217 + }, + { + "epoch": 2.175213675213675, + "grad_norm": 1.016848087310791, + "learning_rate": 8.643639376704964e-05, + "loss": 0.9017, + "step": 12218 + }, + { + "epoch": 2.175391737891738, + "grad_norm": 0.790868878364563, + 
"learning_rate": 8.64225257822385e-05, + "loss": 0.7597, + "step": 12219 + }, + { + "epoch": 2.1755698005698005, + "grad_norm": 0.7539415955543518, + "learning_rate": 8.640865806344974e-05, + "loss": 0.7801, + "step": 12220 + }, + { + "epoch": 2.1757478632478633, + "grad_norm": 0.7288404703140259, + "learning_rate": 8.63947906109551e-05, + "loss": 0.6753, + "step": 12221 + }, + { + "epoch": 2.175925925925926, + "grad_norm": 0.8449869155883789, + "learning_rate": 8.638092342502623e-05, + "loss": 0.7165, + "step": 12222 + }, + { + "epoch": 2.176103988603989, + "grad_norm": 0.8210735321044922, + "learning_rate": 8.636705650593495e-05, + "loss": 0.8677, + "step": 12223 + }, + { + "epoch": 2.176282051282051, + "grad_norm": 0.7431774735450745, + "learning_rate": 8.635318985395284e-05, + "loss": 0.7914, + "step": 12224 + }, + { + "epoch": 2.176460113960114, + "grad_norm": 0.8223997354507446, + "learning_rate": 8.633932346935165e-05, + "loss": 0.7243, + "step": 12225 + }, + { + "epoch": 2.1766381766381766, + "grad_norm": 1.0101778507232666, + "learning_rate": 8.632545735240299e-05, + "loss": 0.8608, + "step": 12226 + }, + { + "epoch": 2.1768162393162394, + "grad_norm": 0.7270255088806152, + "learning_rate": 8.631159150337862e-05, + "loss": 0.8699, + "step": 12227 + }, + { + "epoch": 2.176994301994302, + "grad_norm": 0.8687323331832886, + "learning_rate": 8.629772592255016e-05, + "loss": 0.9168, + "step": 12228 + }, + { + "epoch": 2.177172364672365, + "grad_norm": 0.7623698115348816, + "learning_rate": 8.628386061018934e-05, + "loss": 0.9012, + "step": 12229 + }, + { + "epoch": 2.177350427350427, + "grad_norm": 0.7458708882331848, + "learning_rate": 8.626999556656771e-05, + "loss": 0.9068, + "step": 12230 + }, + { + "epoch": 2.17752849002849, + "grad_norm": 0.8262876868247986, + "learning_rate": 8.625613079195704e-05, + "loss": 0.9425, + "step": 12231 + }, + { + "epoch": 2.1777065527065527, + "grad_norm": 0.8737035989761353, + "learning_rate": 8.624226628662893e-05, + 
"loss": 0.9943, + "step": 12232 + }, + { + "epoch": 2.1778846153846154, + "grad_norm": 0.8250965476036072, + "learning_rate": 8.622840205085505e-05, + "loss": 0.9237, + "step": 12233 + }, + { + "epoch": 2.178062678062678, + "grad_norm": 0.8689019680023193, + "learning_rate": 8.621453808490699e-05, + "loss": 0.8544, + "step": 12234 + }, + { + "epoch": 2.178240740740741, + "grad_norm": 0.8672708868980408, + "learning_rate": 8.620067438905643e-05, + "loss": 0.7623, + "step": 12235 + }, + { + "epoch": 2.1784188034188032, + "grad_norm": 0.7077436447143555, + "learning_rate": 8.6186810963575e-05, + "loss": 0.7468, + "step": 12236 + }, + { + "epoch": 2.178596866096866, + "grad_norm": 0.838474452495575, + "learning_rate": 8.617294780873433e-05, + "loss": 0.9207, + "step": 12237 + }, + { + "epoch": 2.1787749287749287, + "grad_norm": 0.7240039110183716, + "learning_rate": 8.615908492480598e-05, + "loss": 0.8981, + "step": 12238 + }, + { + "epoch": 2.1789529914529915, + "grad_norm": 0.6995998620986938, + "learning_rate": 8.614522231206162e-05, + "loss": 0.7131, + "step": 12239 + }, + { + "epoch": 2.1791310541310542, + "grad_norm": 0.7011054158210754, + "learning_rate": 8.613135997077288e-05, + "loss": 0.8138, + "step": 12240 + }, + { + "epoch": 2.179309116809117, + "grad_norm": 0.9815019369125366, + "learning_rate": 8.611749790121131e-05, + "loss": 0.9637, + "step": 12241 + }, + { + "epoch": 2.1794871794871793, + "grad_norm": 0.7523870468139648, + "learning_rate": 8.610363610364853e-05, + "loss": 0.7555, + "step": 12242 + }, + { + "epoch": 2.179665242165242, + "grad_norm": 0.8286668658256531, + "learning_rate": 8.608977457835612e-05, + "loss": 0.7911, + "step": 12243 + }, + { + "epoch": 2.179843304843305, + "grad_norm": 0.8183441758155823, + "learning_rate": 8.607591332560573e-05, + "loss": 0.793, + "step": 12244 + }, + { + "epoch": 2.1800213675213675, + "grad_norm": 0.7104299068450928, + "learning_rate": 8.606205234566885e-05, + "loss": 0.6856, + "step": 12245 + }, + { + 
"epoch": 2.1801994301994303, + "grad_norm": 0.871588945388794, + "learning_rate": 8.60481916388171e-05, + "loss": 0.8342, + "step": 12246 + }, + { + "epoch": 2.180377492877493, + "grad_norm": 0.8002356290817261, + "learning_rate": 8.603433120532206e-05, + "loss": 0.9451, + "step": 12247 + }, + { + "epoch": 2.1805555555555554, + "grad_norm": 0.8223865032196045, + "learning_rate": 8.602047104545532e-05, + "loss": 0.8446, + "step": 12248 + }, + { + "epoch": 2.180733618233618, + "grad_norm": 0.7381762266159058, + "learning_rate": 8.600661115948836e-05, + "loss": 0.7575, + "step": 12249 + }, + { + "epoch": 2.180911680911681, + "grad_norm": 0.8717563152313232, + "learning_rate": 8.599275154769284e-05, + "loss": 0.9615, + "step": 12250 + }, + { + "epoch": 2.1810897435897436, + "grad_norm": 0.7935179471969604, + "learning_rate": 8.597889221034022e-05, + "loss": 0.8603, + "step": 12251 + }, + { + "epoch": 2.1812678062678064, + "grad_norm": 0.6350329518318176, + "learning_rate": 8.596503314770208e-05, + "loss": 0.4981, + "step": 12252 + }, + { + "epoch": 2.181445868945869, + "grad_norm": 0.8739648461341858, + "learning_rate": 8.595117436004995e-05, + "loss": 0.7872, + "step": 12253 + }, + { + "epoch": 2.1816239316239314, + "grad_norm": 0.8199412822723389, + "learning_rate": 8.593731584765542e-05, + "loss": 0.7905, + "step": 12254 + }, + { + "epoch": 2.181801994301994, + "grad_norm": 0.7289649844169617, + "learning_rate": 8.592345761078993e-05, + "loss": 0.6981, + "step": 12255 + }, + { + "epoch": 2.181980056980057, + "grad_norm": 0.8234626650810242, + "learning_rate": 8.590959964972506e-05, + "loss": 1.0442, + "step": 12256 + }, + { + "epoch": 2.1821581196581197, + "grad_norm": 0.7804498076438904, + "learning_rate": 8.589574196473229e-05, + "loss": 0.9386, + "step": 12257 + }, + { + "epoch": 2.1823361823361824, + "grad_norm": 0.9459218382835388, + "learning_rate": 8.588188455608317e-05, + "loss": 0.8075, + "step": 12258 + }, + { + "epoch": 2.182514245014245, + "grad_norm": 
0.8133191466331482, + "learning_rate": 8.586802742404924e-05, + "loss": 1.0275, + "step": 12259 + }, + { + "epoch": 2.1826923076923075, + "grad_norm": 0.8302663564682007, + "learning_rate": 8.58541705689019e-05, + "loss": 0.9887, + "step": 12260 + }, + { + "epoch": 2.1828703703703702, + "grad_norm": 0.7839202284812927, + "learning_rate": 8.584031399091274e-05, + "loss": 1.0256, + "step": 12261 + }, + { + "epoch": 2.183048433048433, + "grad_norm": 0.8050578236579895, + "learning_rate": 8.582645769035319e-05, + "loss": 0.843, + "step": 12262 + }, + { + "epoch": 2.1832264957264957, + "grad_norm": 0.749110221862793, + "learning_rate": 8.581260166749477e-05, + "loss": 0.7683, + "step": 12263 + }, + { + "epoch": 2.1834045584045585, + "grad_norm": 0.7982701659202576, + "learning_rate": 8.579874592260894e-05, + "loss": 0.942, + "step": 12264 + }, + { + "epoch": 2.1835826210826212, + "grad_norm": 0.7571866512298584, + "learning_rate": 8.57848904559672e-05, + "loss": 0.8828, + "step": 12265 + }, + { + "epoch": 2.183760683760684, + "grad_norm": 0.7445113658905029, + "learning_rate": 8.577103526784098e-05, + "loss": 0.8869, + "step": 12266 + }, + { + "epoch": 2.1839387464387463, + "grad_norm": 0.7999380230903625, + "learning_rate": 8.575718035850177e-05, + "loss": 0.9476, + "step": 12267 + }, + { + "epoch": 2.184116809116809, + "grad_norm": 0.7188777923583984, + "learning_rate": 8.574332572822103e-05, + "loss": 0.7961, + "step": 12268 + }, + { + "epoch": 2.184294871794872, + "grad_norm": 0.9545742869377136, + "learning_rate": 8.572947137727023e-05, + "loss": 0.8629, + "step": 12269 + }, + { + "epoch": 2.1844729344729346, + "grad_norm": 0.8066838979721069, + "learning_rate": 8.571561730592075e-05, + "loss": 0.7728, + "step": 12270 + }, + { + "epoch": 2.1846509971509973, + "grad_norm": 0.7819525003433228, + "learning_rate": 8.57017635144441e-05, + "loss": 0.9897, + "step": 12271 + }, + { + "epoch": 2.1848290598290596, + "grad_norm": 0.9249349236488342, + "learning_rate": 
8.568791000311166e-05, + "loss": 0.8562, + "step": 12272 + }, + { + "epoch": 2.1850071225071224, + "grad_norm": 0.8118993043899536, + "learning_rate": 8.567405677219497e-05, + "loss": 0.819, + "step": 12273 + }, + { + "epoch": 2.185185185185185, + "grad_norm": 0.7858524322509766, + "learning_rate": 8.566020382196532e-05, + "loss": 1.0708, + "step": 12274 + }, + { + "epoch": 2.185363247863248, + "grad_norm": 1.0223300457000732, + "learning_rate": 8.564635115269422e-05, + "loss": 0.9929, + "step": 12275 + }, + { + "epoch": 2.1855413105413106, + "grad_norm": 0.7749526500701904, + "learning_rate": 8.5632498764653e-05, + "loss": 0.7555, + "step": 12276 + }, + { + "epoch": 2.1857193732193734, + "grad_norm": 0.8443665504455566, + "learning_rate": 8.561864665811313e-05, + "loss": 0.8488, + "step": 12277 + }, + { + "epoch": 2.185897435897436, + "grad_norm": 0.7482786178588867, + "learning_rate": 8.560479483334603e-05, + "loss": 0.6535, + "step": 12278 + }, + { + "epoch": 2.1860754985754984, + "grad_norm": 0.7981070876121521, + "learning_rate": 8.559094329062305e-05, + "loss": 0.7698, + "step": 12279 + }, + { + "epoch": 2.186253561253561, + "grad_norm": 0.7612428665161133, + "learning_rate": 8.557709203021564e-05, + "loss": 0.9086, + "step": 12280 + }, + { + "epoch": 2.186431623931624, + "grad_norm": 0.8246445059776306, + "learning_rate": 8.556324105239512e-05, + "loss": 0.9927, + "step": 12281 + }, + { + "epoch": 2.1866096866096867, + "grad_norm": 0.8902820348739624, + "learning_rate": 8.554939035743292e-05, + "loss": 0.8474, + "step": 12282 + }, + { + "epoch": 2.1867877492877494, + "grad_norm": 0.9992623329162598, + "learning_rate": 8.553553994560037e-05, + "loss": 1.0898, + "step": 12283 + }, + { + "epoch": 2.1869658119658117, + "grad_norm": 0.9124125838279724, + "learning_rate": 8.552168981716892e-05, + "loss": 0.9542, + "step": 12284 + }, + { + "epoch": 2.1871438746438745, + "grad_norm": 0.6818730235099792, + "learning_rate": 8.550783997240983e-05, + "loss": 0.678, + 
"step": 12285 + }, + { + "epoch": 2.1873219373219372, + "grad_norm": 0.8302112817764282, + "learning_rate": 8.549399041159455e-05, + "loss": 0.9955, + "step": 12286 + }, + { + "epoch": 2.1875, + "grad_norm": 0.840419352054596, + "learning_rate": 8.548014113499436e-05, + "loss": 0.9299, + "step": 12287 + }, + { + "epoch": 2.1876780626780628, + "grad_norm": 0.8317474722862244, + "learning_rate": 8.546629214288067e-05, + "loss": 0.754, + "step": 12288 + }, + { + "epoch": 2.1878561253561255, + "grad_norm": 0.879148542881012, + "learning_rate": 8.545244343552476e-05, + "loss": 0.9238, + "step": 12289 + }, + { + "epoch": 2.1880341880341883, + "grad_norm": 0.7899607419967651, + "learning_rate": 8.543859501319805e-05, + "loss": 0.8277, + "step": 12290 + }, + { + "epoch": 2.1882122507122506, + "grad_norm": 0.837785542011261, + "learning_rate": 8.542474687617176e-05, + "loss": 0.8801, + "step": 12291 + }, + { + "epoch": 2.1883903133903133, + "grad_norm": 0.856213390827179, + "learning_rate": 8.541089902471733e-05, + "loss": 0.7589, + "step": 12292 + }, + { + "epoch": 2.188568376068376, + "grad_norm": 0.7915818095207214, + "learning_rate": 8.539705145910599e-05, + "loss": 0.8071, + "step": 12293 + }, + { + "epoch": 2.188746438746439, + "grad_norm": 0.8266519904136658, + "learning_rate": 8.538320417960914e-05, + "loss": 0.9482, + "step": 12294 + }, + { + "epoch": 2.1889245014245016, + "grad_norm": 0.8505687713623047, + "learning_rate": 8.536935718649799e-05, + "loss": 0.9272, + "step": 12295 + }, + { + "epoch": 2.189102564102564, + "grad_norm": 0.7530698776245117, + "learning_rate": 8.535551048004394e-05, + "loss": 0.7908, + "step": 12296 + }, + { + "epoch": 2.1892806267806266, + "grad_norm": 0.7904362678527832, + "learning_rate": 8.534166406051818e-05, + "loss": 0.9771, + "step": 12297 + }, + { + "epoch": 2.1894586894586894, + "grad_norm": 0.7860299944877625, + "learning_rate": 8.532781792819209e-05, + "loss": 0.605, + "step": 12298 + }, + { + "epoch": 2.189636752136752, + 
"grad_norm": 0.7718655467033386, + "learning_rate": 8.531397208333695e-05, + "loss": 0.8844, + "step": 12299 + }, + { + "epoch": 2.189814814814815, + "grad_norm": 0.8069637417793274, + "learning_rate": 8.530012652622397e-05, + "loss": 0.8571, + "step": 12300 + }, + { + "epoch": 2.1899928774928776, + "grad_norm": 0.8557140231132507, + "learning_rate": 8.528628125712455e-05, + "loss": 0.7396, + "step": 12301 + }, + { + "epoch": 2.1901709401709404, + "grad_norm": 0.8547600507736206, + "learning_rate": 8.527243627630983e-05, + "loss": 0.8073, + "step": 12302 + }, + { + "epoch": 2.1903490028490027, + "grad_norm": 0.8217329382896423, + "learning_rate": 8.525859158405114e-05, + "loss": 0.9723, + "step": 12303 + }, + { + "epoch": 2.1905270655270654, + "grad_norm": 0.896946132183075, + "learning_rate": 8.524474718061972e-05, + "loss": 0.7896, + "step": 12304 + }, + { + "epoch": 2.190705128205128, + "grad_norm": 1.0846823453903198, + "learning_rate": 8.523090306628685e-05, + "loss": 0.7689, + "step": 12305 + }, + { + "epoch": 2.190883190883191, + "grad_norm": 0.7265166640281677, + "learning_rate": 8.521705924132373e-05, + "loss": 0.8451, + "step": 12306 + }, + { + "epoch": 2.1910612535612537, + "grad_norm": 0.8806917071342468, + "learning_rate": 8.520321570600162e-05, + "loss": 0.8769, + "step": 12307 + }, + { + "epoch": 2.191239316239316, + "grad_norm": 0.7528414130210876, + "learning_rate": 8.518937246059176e-05, + "loss": 0.7137, + "step": 12308 + }, + { + "epoch": 2.1914173789173788, + "grad_norm": 0.9313900470733643, + "learning_rate": 8.517552950536543e-05, + "loss": 0.933, + "step": 12309 + }, + { + "epoch": 2.1915954415954415, + "grad_norm": 0.8363727331161499, + "learning_rate": 8.516168684059375e-05, + "loss": 0.899, + "step": 12310 + }, + { + "epoch": 2.1917735042735043, + "grad_norm": 0.7939122915267944, + "learning_rate": 8.514784446654803e-05, + "loss": 1.0323, + "step": 12311 + }, + { + "epoch": 2.191951566951567, + "grad_norm": 0.8744710087776184, + 
"learning_rate": 8.51340023834994e-05, + "loss": 0.8738, + "step": 12312 + }, + { + "epoch": 2.1921296296296298, + "grad_norm": 0.779353678226471, + "learning_rate": 8.512016059171916e-05, + "loss": 0.7692, + "step": 12313 + }, + { + "epoch": 2.1923076923076925, + "grad_norm": 0.8578362464904785, + "learning_rate": 8.510631909147841e-05, + "loss": 1.0636, + "step": 12314 + }, + { + "epoch": 2.192485754985755, + "grad_norm": 0.7210206985473633, + "learning_rate": 8.509247788304846e-05, + "loss": 0.6342, + "step": 12315 + }, + { + "epoch": 2.1926638176638176, + "grad_norm": 0.7221980690956116, + "learning_rate": 8.50786369667004e-05, + "loss": 0.7022, + "step": 12316 + }, + { + "epoch": 2.1928418803418803, + "grad_norm": 0.7871465086936951, + "learning_rate": 8.506479634270544e-05, + "loss": 0.9349, + "step": 12317 + }, + { + "epoch": 2.193019943019943, + "grad_norm": 0.7396262884140015, + "learning_rate": 8.505095601133479e-05, + "loss": 0.8644, + "step": 12318 + }, + { + "epoch": 2.193198005698006, + "grad_norm": 0.7513349652290344, + "learning_rate": 8.503711597285959e-05, + "loss": 0.7881, + "step": 12319 + }, + { + "epoch": 2.1933760683760686, + "grad_norm": 0.7280148863792419, + "learning_rate": 8.502327622755106e-05, + "loss": 0.7115, + "step": 12320 + }, + { + "epoch": 2.193554131054131, + "grad_norm": 0.792238712310791, + "learning_rate": 8.500943677568028e-05, + "loss": 0.8193, + "step": 12321 + }, + { + "epoch": 2.1937321937321936, + "grad_norm": 0.8709526062011719, + "learning_rate": 8.499559761751847e-05, + "loss": 0.8957, + "step": 12322 + }, + { + "epoch": 2.1939102564102564, + "grad_norm": 0.6865217685699463, + "learning_rate": 8.498175875333674e-05, + "loss": 0.6853, + "step": 12323 + }, + { + "epoch": 2.194088319088319, + "grad_norm": 0.7797526121139526, + "learning_rate": 8.496792018340625e-05, + "loss": 0.8885, + "step": 12324 + }, + { + "epoch": 2.194266381766382, + "grad_norm": 0.8806295394897461, + "learning_rate": 8.495408190799814e-05, + 
"loss": 0.9322, + "step": 12325 + }, + { + "epoch": 2.1944444444444446, + "grad_norm": 0.8566734790802002, + "learning_rate": 8.494024392738355e-05, + "loss": 0.9363, + "step": 12326 + }, + { + "epoch": 2.194622507122507, + "grad_norm": 0.8058465123176575, + "learning_rate": 8.49264062418336e-05, + "loss": 0.9007, + "step": 12327 + }, + { + "epoch": 2.1948005698005697, + "grad_norm": 0.7895804643630981, + "learning_rate": 8.491256885161938e-05, + "loss": 0.8486, + "step": 12328 + }, + { + "epoch": 2.1949786324786325, + "grad_norm": 0.7626506686210632, + "learning_rate": 8.489873175701204e-05, + "loss": 0.8208, + "step": 12329 + }, + { + "epoch": 2.195156695156695, + "grad_norm": 0.8917649388313293, + "learning_rate": 8.488489495828272e-05, + "loss": 1.1036, + "step": 12330 + }, + { + "epoch": 2.195334757834758, + "grad_norm": 0.7614438533782959, + "learning_rate": 8.487105845570242e-05, + "loss": 0.7124, + "step": 12331 + }, + { + "epoch": 2.1955128205128207, + "grad_norm": 0.7697421312332153, + "learning_rate": 8.485722224954237e-05, + "loss": 0.8831, + "step": 12332 + }, + { + "epoch": 2.195690883190883, + "grad_norm": 0.7449761629104614, + "learning_rate": 8.484338634007354e-05, + "loss": 0.9115, + "step": 12333 + }, + { + "epoch": 2.1958689458689458, + "grad_norm": 0.7099741101264954, + "learning_rate": 8.482955072756709e-05, + "loss": 0.6907, + "step": 12334 + }, + { + "epoch": 2.1960470085470085, + "grad_norm": 0.7856435775756836, + "learning_rate": 8.481571541229406e-05, + "loss": 1.0565, + "step": 12335 + }, + { + "epoch": 2.1962250712250713, + "grad_norm": 0.8374622464179993, + "learning_rate": 8.48018803945256e-05, + "loss": 0.9191, + "step": 12336 + }, + { + "epoch": 2.196403133903134, + "grad_norm": 0.7530848383903503, + "learning_rate": 8.478804567453265e-05, + "loss": 0.6576, + "step": 12337 + }, + { + "epoch": 2.1965811965811968, + "grad_norm": 0.774861216545105, + "learning_rate": 8.477421125258637e-05, + "loss": 1.0258, + "step": 12338 + }, + { + 
"epoch": 2.196759259259259, + "grad_norm": 0.9623909592628479, + "learning_rate": 8.47603771289578e-05, + "loss": 1.0192, + "step": 12339 + }, + { + "epoch": 2.196937321937322, + "grad_norm": 0.8253501653671265, + "learning_rate": 8.474654330391797e-05, + "loss": 0.7823, + "step": 12340 + }, + { + "epoch": 2.1971153846153846, + "grad_norm": 0.8683596849441528, + "learning_rate": 8.473270977773797e-05, + "loss": 0.8002, + "step": 12341 + }, + { + "epoch": 2.1972934472934473, + "grad_norm": 0.9093332886695862, + "learning_rate": 8.471887655068877e-05, + "loss": 1.0315, + "step": 12342 + }, + { + "epoch": 2.19747150997151, + "grad_norm": 0.7313206791877747, + "learning_rate": 8.470504362304147e-05, + "loss": 0.8238, + "step": 12343 + }, + { + "epoch": 2.197649572649573, + "grad_norm": 0.8464672565460205, + "learning_rate": 8.469121099506703e-05, + "loss": 0.8104, + "step": 12344 + }, + { + "epoch": 2.197827635327635, + "grad_norm": 0.9213936924934387, + "learning_rate": 8.467737866703657e-05, + "loss": 0.9963, + "step": 12345 + }, + { + "epoch": 2.198005698005698, + "grad_norm": 0.8033352494239807, + "learning_rate": 8.466354663922099e-05, + "loss": 0.9788, + "step": 12346 + }, + { + "epoch": 2.1981837606837606, + "grad_norm": 0.7210986018180847, + "learning_rate": 8.464971491189141e-05, + "loss": 0.7597, + "step": 12347 + }, + { + "epoch": 2.1983618233618234, + "grad_norm": 0.8128374814987183, + "learning_rate": 8.463588348531872e-05, + "loss": 0.9575, + "step": 12348 + }, + { + "epoch": 2.198539886039886, + "grad_norm": 0.7276061773300171, + "learning_rate": 8.4622052359774e-05, + "loss": 0.815, + "step": 12349 + }, + { + "epoch": 2.198717948717949, + "grad_norm": 0.7463665008544922, + "learning_rate": 8.46082215355282e-05, + "loss": 0.9782, + "step": 12350 + }, + { + "epoch": 2.198896011396011, + "grad_norm": 0.8288317918777466, + "learning_rate": 8.459439101285238e-05, + "loss": 0.8206, + "step": 12351 + }, + { + "epoch": 2.199074074074074, + "grad_norm": 
0.8286055326461792, + "learning_rate": 8.458056079201742e-05, + "loss": 0.9819, + "step": 12352 + }, + { + "epoch": 2.1992521367521367, + "grad_norm": 0.8138381242752075, + "learning_rate": 8.456673087329436e-05, + "loss": 0.8565, + "step": 12353 + }, + { + "epoch": 2.1994301994301995, + "grad_norm": 0.9059311747550964, + "learning_rate": 8.455290125695412e-05, + "loss": 0.8727, + "step": 12354 + }, + { + "epoch": 2.199608262108262, + "grad_norm": 0.6138933300971985, + "learning_rate": 8.453907194326773e-05, + "loss": 0.5635, + "step": 12355 + }, + { + "epoch": 2.199786324786325, + "grad_norm": 0.870585560798645, + "learning_rate": 8.452524293250608e-05, + "loss": 0.7401, + "step": 12356 + }, + { + "epoch": 2.1999643874643873, + "grad_norm": 0.8393024802207947, + "learning_rate": 8.451141422494013e-05, + "loss": 1.0083, + "step": 12357 + }, + { + "epoch": 2.20014245014245, + "grad_norm": 0.7667146325111389, + "learning_rate": 8.449758582084091e-05, + "loss": 0.8915, + "step": 12358 + }, + { + "epoch": 2.2003205128205128, + "grad_norm": 1.0229144096374512, + "learning_rate": 8.448375772047923e-05, + "loss": 0.8879, + "step": 12359 + }, + { + "epoch": 2.2004985754985755, + "grad_norm": 0.7670294046401978, + "learning_rate": 8.446992992412611e-05, + "loss": 0.8233, + "step": 12360 + }, + { + "epoch": 2.2006766381766383, + "grad_norm": 0.7110083103179932, + "learning_rate": 8.445610243205244e-05, + "loss": 0.6315, + "step": 12361 + }, + { + "epoch": 2.200854700854701, + "grad_norm": 0.7801400423049927, + "learning_rate": 8.444227524452918e-05, + "loss": 0.7758, + "step": 12362 + }, + { + "epoch": 2.2010327635327633, + "grad_norm": 0.8762022852897644, + "learning_rate": 8.44284483618272e-05, + "loss": 0.9308, + "step": 12363 + }, + { + "epoch": 2.201210826210826, + "grad_norm": 0.811890184879303, + "learning_rate": 8.441462178421742e-05, + "loss": 1.0322, + "step": 12364 + }, + { + "epoch": 2.201388888888889, + "grad_norm": 0.8128690719604492, + "learning_rate": 
8.440079551197076e-05, + "loss": 1.0669, + "step": 12365 + }, + { + "epoch": 2.2015669515669516, + "grad_norm": 0.8925766348838806, + "learning_rate": 8.438696954535812e-05, + "loss": 0.8848, + "step": 12366 + }, + { + "epoch": 2.2017450142450143, + "grad_norm": 0.9104064106941223, + "learning_rate": 8.437314388465036e-05, + "loss": 0.8227, + "step": 12367 + }, + { + "epoch": 2.201923076923077, + "grad_norm": 0.7956777215003967, + "learning_rate": 8.43593185301184e-05, + "loss": 0.7616, + "step": 12368 + }, + { + "epoch": 2.2021011396011394, + "grad_norm": 0.7658423185348511, + "learning_rate": 8.434549348203309e-05, + "loss": 0.9406, + "step": 12369 + }, + { + "epoch": 2.202279202279202, + "grad_norm": 0.7650682926177979, + "learning_rate": 8.433166874066532e-05, + "loss": 0.9031, + "step": 12370 + }, + { + "epoch": 2.202457264957265, + "grad_norm": 0.8613301515579224, + "learning_rate": 8.431784430628594e-05, + "loss": 0.9184, + "step": 12371 + }, + { + "epoch": 2.2026353276353277, + "grad_norm": 0.8446599245071411, + "learning_rate": 8.430402017916586e-05, + "loss": 0.8639, + "step": 12372 + }, + { + "epoch": 2.2028133903133904, + "grad_norm": 0.8082340955734253, + "learning_rate": 8.429019635957585e-05, + "loss": 0.7365, + "step": 12373 + }, + { + "epoch": 2.202991452991453, + "grad_norm": 0.8843092918395996, + "learning_rate": 8.427637284778683e-05, + "loss": 0.8679, + "step": 12374 + }, + { + "epoch": 2.2031695156695155, + "grad_norm": 0.8475705981254578, + "learning_rate": 8.426254964406961e-05, + "loss": 0.6614, + "step": 12375 + }, + { + "epoch": 2.203347578347578, + "grad_norm": 0.9980667233467102, + "learning_rate": 8.424872674869507e-05, + "loss": 0.9103, + "step": 12376 + }, + { + "epoch": 2.203525641025641, + "grad_norm": 0.8033170104026794, + "learning_rate": 8.423490416193398e-05, + "loss": 0.7668, + "step": 12377 + }, + { + "epoch": 2.2037037037037037, + "grad_norm": 0.8275265097618103, + "learning_rate": 8.422108188405718e-05, + "loss": 0.7448, + 
"step": 12378 + }, + { + "epoch": 2.2038817663817665, + "grad_norm": 0.7622979283332825, + "learning_rate": 8.420725991533554e-05, + "loss": 0.8121, + "step": 12379 + }, + { + "epoch": 2.2040598290598292, + "grad_norm": 0.8580977320671082, + "learning_rate": 8.41934382560398e-05, + "loss": 0.8437, + "step": 12380 + }, + { + "epoch": 2.2042378917378915, + "grad_norm": 0.8443751931190491, + "learning_rate": 8.417961690644086e-05, + "loss": 0.971, + "step": 12381 + }, + { + "epoch": 2.2044159544159543, + "grad_norm": 0.782430112361908, + "learning_rate": 8.416579586680939e-05, + "loss": 0.8367, + "step": 12382 + }, + { + "epoch": 2.204594017094017, + "grad_norm": 0.8664544820785522, + "learning_rate": 8.415197513741633e-05, + "loss": 0.8288, + "step": 12383 + }, + { + "epoch": 2.20477207977208, + "grad_norm": 0.7207586169242859, + "learning_rate": 8.413815471853235e-05, + "loss": 0.8038, + "step": 12384 + }, + { + "epoch": 2.2049501424501425, + "grad_norm": 0.743195652961731, + "learning_rate": 8.412433461042828e-05, + "loss": 0.705, + "step": 12385 + }, + { + "epoch": 2.2051282051282053, + "grad_norm": 0.7891412377357483, + "learning_rate": 8.411051481337488e-05, + "loss": 0.9729, + "step": 12386 + }, + { + "epoch": 2.205306267806268, + "grad_norm": 0.838847815990448, + "learning_rate": 8.4096695327643e-05, + "loss": 0.9053, + "step": 12387 + }, + { + "epoch": 2.2054843304843303, + "grad_norm": 0.7717056274414062, + "learning_rate": 8.408287615350328e-05, + "loss": 0.7388, + "step": 12388 + }, + { + "epoch": 2.205662393162393, + "grad_norm": 0.7209389209747314, + "learning_rate": 8.406905729122654e-05, + "loss": 0.6411, + "step": 12389 + }, + { + "epoch": 2.205840455840456, + "grad_norm": 0.822475790977478, + "learning_rate": 8.405523874108354e-05, + "loss": 0.9574, + "step": 12390 + }, + { + "epoch": 2.2060185185185186, + "grad_norm": 0.9401286840438843, + "learning_rate": 8.404142050334504e-05, + "loss": 0.8915, + "step": 12391 + }, + { + "epoch": 
2.2061965811965814, + "grad_norm": 0.8247103691101074, + "learning_rate": 8.40276025782817e-05, + "loss": 0.8369, + "step": 12392 + }, + { + "epoch": 2.2063746438746437, + "grad_norm": 0.8082301020622253, + "learning_rate": 8.401378496616437e-05, + "loss": 0.9321, + "step": 12393 + }, + { + "epoch": 2.2065527065527064, + "grad_norm": 0.8156028389930725, + "learning_rate": 8.399996766726367e-05, + "loss": 0.7599, + "step": 12394 + }, + { + "epoch": 2.206730769230769, + "grad_norm": 0.7941898107528687, + "learning_rate": 8.398615068185038e-05, + "loss": 0.812, + "step": 12395 + }, + { + "epoch": 2.206908831908832, + "grad_norm": 0.7013470530509949, + "learning_rate": 8.397233401019518e-05, + "loss": 0.7914, + "step": 12396 + }, + { + "epoch": 2.2070868945868947, + "grad_norm": 0.6028649210929871, + "learning_rate": 8.395851765256881e-05, + "loss": 0.5787, + "step": 12397 + }, + { + "epoch": 2.2072649572649574, + "grad_norm": 0.9031504392623901, + "learning_rate": 8.3944701609242e-05, + "loss": 0.8677, + "step": 12398 + }, + { + "epoch": 2.20744301994302, + "grad_norm": 0.7370864748954773, + "learning_rate": 8.393088588048536e-05, + "loss": 0.9025, + "step": 12399 + }, + { + "epoch": 2.2076210826210825, + "grad_norm": 0.7764220237731934, + "learning_rate": 8.391707046656968e-05, + "loss": 0.8805, + "step": 12400 + }, + { + "epoch": 2.2077991452991452, + "grad_norm": 0.7456721663475037, + "learning_rate": 8.390325536776553e-05, + "loss": 0.7739, + "step": 12401 + }, + { + "epoch": 2.207977207977208, + "grad_norm": 0.8032360076904297, + "learning_rate": 8.388944058434373e-05, + "loss": 0.9765, + "step": 12402 + }, + { + "epoch": 2.2081552706552707, + "grad_norm": 0.8502830266952515, + "learning_rate": 8.387562611657483e-05, + "loss": 0.9356, + "step": 12403 + }, + { + "epoch": 2.2083333333333335, + "grad_norm": 0.812216579914093, + "learning_rate": 8.386181196472956e-05, + "loss": 0.8846, + "step": 12404 + }, + { + "epoch": 2.208511396011396, + "grad_norm": 
0.6996115446090698, + "learning_rate": 8.384799812907853e-05, + "loss": 0.7035, + "step": 12405 + }, + { + "epoch": 2.2086894586894585, + "grad_norm": 0.7909261584281921, + "learning_rate": 8.383418460989245e-05, + "loss": 0.8025, + "step": 12406 + }, + { + "epoch": 2.2088675213675213, + "grad_norm": 0.8278310894966125, + "learning_rate": 8.382037140744192e-05, + "loss": 0.7982, + "step": 12407 + }, + { + "epoch": 2.209045584045584, + "grad_norm": 0.7558199167251587, + "learning_rate": 8.380655852199763e-05, + "loss": 0.854, + "step": 12408 + }, + { + "epoch": 2.209223646723647, + "grad_norm": 0.8516034483909607, + "learning_rate": 8.379274595383016e-05, + "loss": 0.7497, + "step": 12409 + }, + { + "epoch": 2.2094017094017095, + "grad_norm": 0.777004599571228, + "learning_rate": 8.377893370321018e-05, + "loss": 0.797, + "step": 12410 + }, + { + "epoch": 2.2095797720797723, + "grad_norm": 0.8820251822471619, + "learning_rate": 8.376512177040829e-05, + "loss": 0.9229, + "step": 12411 + }, + { + "epoch": 2.2097578347578346, + "grad_norm": 0.8623200058937073, + "learning_rate": 8.375131015569514e-05, + "loss": 1.011, + "step": 12412 + }, + { + "epoch": 2.2099358974358974, + "grad_norm": 0.9192054271697998, + "learning_rate": 8.373749885934127e-05, + "loss": 0.8711, + "step": 12413 + }, + { + "epoch": 2.21011396011396, + "grad_norm": 0.7627860903739929, + "learning_rate": 8.372368788161736e-05, + "loss": 0.5937, + "step": 12414 + }, + { + "epoch": 2.210292022792023, + "grad_norm": 0.74603670835495, + "learning_rate": 8.370987722279395e-05, + "loss": 0.8238, + "step": 12415 + }, + { + "epoch": 2.2104700854700856, + "grad_norm": 0.884469211101532, + "learning_rate": 8.369606688314165e-05, + "loss": 1.1957, + "step": 12416 + }, + { + "epoch": 2.210648148148148, + "grad_norm": 0.8145224452018738, + "learning_rate": 8.36822568629311e-05, + "loss": 0.8517, + "step": 12417 + }, + { + "epoch": 2.2108262108262107, + "grad_norm": 0.8167604207992554, + "learning_rate": 
8.366844716243279e-05, + "loss": 0.9701, + "step": 12418 + }, + { + "epoch": 2.2110042735042734, + "grad_norm": 0.7668562531471252, + "learning_rate": 8.365463778191736e-05, + "loss": 1.0281, + "step": 12419 + }, + { + "epoch": 2.211182336182336, + "grad_norm": 0.8455148339271545, + "learning_rate": 8.364082872165532e-05, + "loss": 0.7812, + "step": 12420 + }, + { + "epoch": 2.211360398860399, + "grad_norm": 0.8756504654884338, + "learning_rate": 8.362701998191728e-05, + "loss": 0.779, + "step": 12421 + }, + { + "epoch": 2.2115384615384617, + "grad_norm": 0.8239594101905823, + "learning_rate": 8.361321156297374e-05, + "loss": 0.8581, + "step": 12422 + }, + { + "epoch": 2.2117165242165244, + "grad_norm": 0.7719405889511108, + "learning_rate": 8.359940346509533e-05, + "loss": 0.7593, + "step": 12423 + }, + { + "epoch": 2.2118945868945867, + "grad_norm": 0.8607308268547058, + "learning_rate": 8.358559568855249e-05, + "loss": 1.0618, + "step": 12424 + }, + { + "epoch": 2.2120726495726495, + "grad_norm": 0.750431478023529, + "learning_rate": 8.357178823361582e-05, + "loss": 0.7779, + "step": 12425 + }, + { + "epoch": 2.2122507122507122, + "grad_norm": 0.7770674824714661, + "learning_rate": 8.355798110055583e-05, + "loss": 0.6837, + "step": 12426 + }, + { + "epoch": 2.212428774928775, + "grad_norm": 0.7924200296401978, + "learning_rate": 8.354417428964307e-05, + "loss": 0.8092, + "step": 12427 + }, + { + "epoch": 2.2126068376068377, + "grad_norm": 0.7784677743911743, + "learning_rate": 8.3530367801148e-05, + "loss": 0.7168, + "step": 12428 + }, + { + "epoch": 2.2127849002849005, + "grad_norm": 1.0548151731491089, + "learning_rate": 8.351656163534121e-05, + "loss": 0.9286, + "step": 12429 + }, + { + "epoch": 2.212962962962963, + "grad_norm": 0.8983006477355957, + "learning_rate": 8.35027557924931e-05, + "loss": 0.878, + "step": 12430 + }, + { + "epoch": 2.2131410256410255, + "grad_norm": 0.8136780261993408, + "learning_rate": 8.348895027287424e-05, + "loss": 0.7901, + 
"step": 12431 + }, + { + "epoch": 2.2133190883190883, + "grad_norm": 0.8186678290367126, + "learning_rate": 8.347514507675508e-05, + "loss": 0.8994, + "step": 12432 + }, + { + "epoch": 2.213497150997151, + "grad_norm": 0.880790650844574, + "learning_rate": 8.346134020440617e-05, + "loss": 1.0681, + "step": 12433 + }, + { + "epoch": 2.213675213675214, + "grad_norm": 0.8061994910240173, + "learning_rate": 8.344753565609789e-05, + "loss": 0.8466, + "step": 12434 + }, + { + "epoch": 2.2138532763532766, + "grad_norm": 0.8041423559188843, + "learning_rate": 8.34337314321008e-05, + "loss": 0.897, + "step": 12435 + }, + { + "epoch": 2.214031339031339, + "grad_norm": 0.5797891616821289, + "learning_rate": 8.34199275326853e-05, + "loss": 0.4827, + "step": 12436 + }, + { + "epoch": 2.2142094017094016, + "grad_norm": 0.7373392581939697, + "learning_rate": 8.340612395812188e-05, + "loss": 0.779, + "step": 12437 + }, + { + "epoch": 2.2143874643874644, + "grad_norm": 0.7852202653884888, + "learning_rate": 8.339232070868102e-05, + "loss": 0.8001, + "step": 12438 + }, + { + "epoch": 2.214565527065527, + "grad_norm": 0.8209689259529114, + "learning_rate": 8.337851778463311e-05, + "loss": 0.7492, + "step": 12439 + }, + { + "epoch": 2.21474358974359, + "grad_norm": 0.9393492937088013, + "learning_rate": 8.336471518624867e-05, + "loss": 0.884, + "step": 12440 + }, + { + "epoch": 2.2149216524216526, + "grad_norm": 0.6966122984886169, + "learning_rate": 8.3350912913798e-05, + "loss": 0.7364, + "step": 12441 + }, + { + "epoch": 2.215099715099715, + "grad_norm": 0.7379066944122314, + "learning_rate": 8.333711096755165e-05, + "loss": 0.7345, + "step": 12442 + }, + { + "epoch": 2.2152777777777777, + "grad_norm": 0.9011021256446838, + "learning_rate": 8.332330934777999e-05, + "loss": 0.8392, + "step": 12443 + }, + { + "epoch": 2.2154558404558404, + "grad_norm": 0.7718381285667419, + "learning_rate": 8.330950805475346e-05, + "loss": 0.9062, + "step": 12444 + }, + { + "epoch": 
2.215633903133903, + "grad_norm": 0.8584564328193665, + "learning_rate": 8.329570708874241e-05, + "loss": 0.9612, + "step": 12445 + }, + { + "epoch": 2.215811965811966, + "grad_norm": 0.7711616158485413, + "learning_rate": 8.32819064500173e-05, + "loss": 0.731, + "step": 12446 + }, + { + "epoch": 2.2159900284900287, + "grad_norm": 0.8014609217643738, + "learning_rate": 8.326810613884849e-05, + "loss": 1.0128, + "step": 12447 + }, + { + "epoch": 2.216168091168091, + "grad_norm": 0.7837486863136292, + "learning_rate": 8.325430615550642e-05, + "loss": 0.8271, + "step": 12448 + }, + { + "epoch": 2.2163461538461537, + "grad_norm": 0.9399738907814026, + "learning_rate": 8.324050650026139e-05, + "loss": 1.0433, + "step": 12449 + }, + { + "epoch": 2.2165242165242165, + "grad_norm": 0.8302193284034729, + "learning_rate": 8.322670717338385e-05, + "loss": 1.0259, + "step": 12450 + }, + { + "epoch": 2.2167022792022792, + "grad_norm": 0.7707721590995789, + "learning_rate": 8.321290817514411e-05, + "loss": 0.6972, + "step": 12451 + }, + { + "epoch": 2.216880341880342, + "grad_norm": 0.5814536809921265, + "learning_rate": 8.319910950581261e-05, + "loss": 0.5846, + "step": 12452 + }, + { + "epoch": 2.2170584045584047, + "grad_norm": 0.8249124884605408, + "learning_rate": 8.318531116565962e-05, + "loss": 0.7417, + "step": 12453 + }, + { + "epoch": 2.217236467236467, + "grad_norm": 0.7116015553474426, + "learning_rate": 8.317151315495556e-05, + "loss": 0.8698, + "step": 12454 + }, + { + "epoch": 2.21741452991453, + "grad_norm": 0.8025332689285278, + "learning_rate": 8.31577154739707e-05, + "loss": 0.825, + "step": 12455 + }, + { + "epoch": 2.2175925925925926, + "grad_norm": 0.8962773680686951, + "learning_rate": 8.314391812297542e-05, + "loss": 0.9987, + "step": 12456 + }, + { + "epoch": 2.2177706552706553, + "grad_norm": 0.8446899652481079, + "learning_rate": 8.313012110224008e-05, + "loss": 0.8554, + "step": 12457 + }, + { + "epoch": 2.217948717948718, + "grad_norm": 
0.7759326696395874, + "learning_rate": 8.311632441203494e-05, + "loss": 0.8206, + "step": 12458 + }, + { + "epoch": 2.218126780626781, + "grad_norm": 0.9782015085220337, + "learning_rate": 8.31025280526304e-05, + "loss": 0.8183, + "step": 12459 + }, + { + "epoch": 2.218304843304843, + "grad_norm": 0.7445226907730103, + "learning_rate": 8.308873202429666e-05, + "loss": 0.6819, + "step": 12460 + }, + { + "epoch": 2.218482905982906, + "grad_norm": 0.7613980770111084, + "learning_rate": 8.307493632730413e-05, + "loss": 0.6283, + "step": 12461 + }, + { + "epoch": 2.2186609686609686, + "grad_norm": 0.7437549829483032, + "learning_rate": 8.306114096192304e-05, + "loss": 0.7511, + "step": 12462 + }, + { + "epoch": 2.2188390313390314, + "grad_norm": 0.7600140571594238, + "learning_rate": 8.304734592842373e-05, + "loss": 0.8784, + "step": 12463 + }, + { + "epoch": 2.219017094017094, + "grad_norm": 0.9086898565292358, + "learning_rate": 8.303355122707644e-05, + "loss": 1.0818, + "step": 12464 + }, + { + "epoch": 2.219195156695157, + "grad_norm": 0.8674180507659912, + "learning_rate": 8.30197568581515e-05, + "loss": 0.8925, + "step": 12465 + }, + { + "epoch": 2.219373219373219, + "grad_norm": 0.893606960773468, + "learning_rate": 8.300596282191911e-05, + "loss": 0.9382, + "step": 12466 + }, + { + "epoch": 2.219551282051282, + "grad_norm": 0.7664543390274048, + "learning_rate": 8.29921691186496e-05, + "loss": 0.7893, + "step": 12467 + }, + { + "epoch": 2.2197293447293447, + "grad_norm": 0.8730209469795227, + "learning_rate": 8.297837574861318e-05, + "loss": 1.0509, + "step": 12468 + }, + { + "epoch": 2.2199074074074074, + "grad_norm": 0.8138112425804138, + "learning_rate": 8.296458271208018e-05, + "loss": 0.784, + "step": 12469 + }, + { + "epoch": 2.22008547008547, + "grad_norm": 0.8362413644790649, + "learning_rate": 8.295079000932073e-05, + "loss": 1.0236, + "step": 12470 + }, + { + "epoch": 2.220263532763533, + "grad_norm": 0.8422487378120422, + "learning_rate": 
8.293699764060518e-05, + "loss": 0.9677, + "step": 12471 + }, + { + "epoch": 2.2204415954415953, + "grad_norm": 0.7290427088737488, + "learning_rate": 8.292320560620369e-05, + "loss": 0.7514, + "step": 12472 + }, + { + "epoch": 2.220619658119658, + "grad_norm": 0.8083370923995972, + "learning_rate": 8.290941390638653e-05, + "loss": 0.8136, + "step": 12473 + }, + { + "epoch": 2.2207977207977208, + "grad_norm": 0.8045510053634644, + "learning_rate": 8.289562254142389e-05, + "loss": 0.6753, + "step": 12474 + }, + { + "epoch": 2.2209757834757835, + "grad_norm": 0.8019934892654419, + "learning_rate": 8.288183151158602e-05, + "loss": 0.8147, + "step": 12475 + }, + { + "epoch": 2.2211538461538463, + "grad_norm": 0.8129584193229675, + "learning_rate": 8.286804081714306e-05, + "loss": 0.9137, + "step": 12476 + }, + { + "epoch": 2.221331908831909, + "grad_norm": 0.9729450345039368, + "learning_rate": 8.285425045836526e-05, + "loss": 0.7884, + "step": 12477 + }, + { + "epoch": 2.2215099715099713, + "grad_norm": 0.755081295967102, + "learning_rate": 8.284046043552282e-05, + "loss": 0.8496, + "step": 12478 + }, + { + "epoch": 2.221688034188034, + "grad_norm": 0.725267767906189, + "learning_rate": 8.282667074888589e-05, + "loss": 0.7054, + "step": 12479 + }, + { + "epoch": 2.221866096866097, + "grad_norm": 0.832098662853241, + "learning_rate": 8.281288139872472e-05, + "loss": 0.8729, + "step": 12480 + }, + { + "epoch": 2.2220441595441596, + "grad_norm": 0.9908086657524109, + "learning_rate": 8.27990923853094e-05, + "loss": 0.9106, + "step": 12481 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.8001172542572021, + "learning_rate": 8.278530370891013e-05, + "loss": 0.906, + "step": 12482 + }, + { + "epoch": 2.222400284900285, + "grad_norm": 0.7607424259185791, + "learning_rate": 8.277151536979709e-05, + "loss": 0.8125, + "step": 12483 + }, + { + "epoch": 2.2225783475783474, + "grad_norm": 0.7850996255874634, + "learning_rate": 8.275772736824042e-05, + "loss": 0.7017, + 
"step": 12484 + }, + { + "epoch": 2.22275641025641, + "grad_norm": 0.8376613855361938, + "learning_rate": 8.274393970451024e-05, + "loss": 1.0453, + "step": 12485 + }, + { + "epoch": 2.222934472934473, + "grad_norm": 0.7973353266716003, + "learning_rate": 8.273015237887673e-05, + "loss": 0.8337, + "step": 12486 + }, + { + "epoch": 2.2231125356125356, + "grad_norm": 0.7622607350349426, + "learning_rate": 8.271636539161e-05, + "loss": 0.8574, + "step": 12487 + }, + { + "epoch": 2.2232905982905984, + "grad_norm": 0.7839400768280029, + "learning_rate": 8.270257874298022e-05, + "loss": 0.8857, + "step": 12488 + }, + { + "epoch": 2.223468660968661, + "grad_norm": 0.7730473875999451, + "learning_rate": 8.268879243325743e-05, + "loss": 0.9578, + "step": 12489 + }, + { + "epoch": 2.2236467236467234, + "grad_norm": 0.7811899185180664, + "learning_rate": 8.267500646271184e-05, + "loss": 0.9469, + "step": 12490 + }, + { + "epoch": 2.223824786324786, + "grad_norm": 0.8570041060447693, + "learning_rate": 8.266122083161347e-05, + "loss": 0.8853, + "step": 12491 + }, + { + "epoch": 2.224002849002849, + "grad_norm": 0.7989770174026489, + "learning_rate": 8.264743554023248e-05, + "loss": 0.7467, + "step": 12492 + }, + { + "epoch": 2.2241809116809117, + "grad_norm": 0.8287475109100342, + "learning_rate": 8.263365058883891e-05, + "loss": 0.9987, + "step": 12493 + }, + { + "epoch": 2.2243589743589745, + "grad_norm": 0.8879026174545288, + "learning_rate": 8.261986597770295e-05, + "loss": 0.9503, + "step": 12494 + }, + { + "epoch": 2.224537037037037, + "grad_norm": 0.8153596520423889, + "learning_rate": 8.260608170709456e-05, + "loss": 0.9715, + "step": 12495 + }, + { + "epoch": 2.2247150997150995, + "grad_norm": 0.8294584155082703, + "learning_rate": 8.259229777728384e-05, + "loss": 0.958, + "step": 12496 + }, + { + "epoch": 2.2248931623931623, + "grad_norm": 0.76850426197052, + "learning_rate": 8.257851418854093e-05, + "loss": 0.7666, + "step": 12497 + }, + { + "epoch": 
2.225071225071225, + "grad_norm": 0.743966817855835, + "learning_rate": 8.256473094113582e-05, + "loss": 0.8893, + "step": 12498 + }, + { + "epoch": 2.2252492877492878, + "grad_norm": 0.7339308857917786, + "learning_rate": 8.255094803533863e-05, + "loss": 0.7317, + "step": 12499 + }, + { + "epoch": 2.2254273504273505, + "grad_norm": 1.0800104141235352, + "learning_rate": 8.253716547141932e-05, + "loss": 1.0147, + "step": 12500 + }, + { + "epoch": 2.2256054131054133, + "grad_norm": 0.8518815636634827, + "learning_rate": 8.252338324964802e-05, + "loss": 0.9695, + "step": 12501 + }, + { + "epoch": 2.2257834757834756, + "grad_norm": 0.8706745505332947, + "learning_rate": 8.250960137029469e-05, + "loss": 0.7735, + "step": 12502 + }, + { + "epoch": 2.2259615384615383, + "grad_norm": 1.0482546091079712, + "learning_rate": 8.24958198336294e-05, + "loss": 1.0882, + "step": 12503 + }, + { + "epoch": 2.226139601139601, + "grad_norm": 0.8025278449058533, + "learning_rate": 8.248203863992213e-05, + "loss": 0.8573, + "step": 12504 + }, + { + "epoch": 2.226317663817664, + "grad_norm": 0.8267400860786438, + "learning_rate": 8.246825778944297e-05, + "loss": 0.8609, + "step": 12505 + }, + { + "epoch": 2.2264957264957266, + "grad_norm": 0.703681230545044, + "learning_rate": 8.245447728246184e-05, + "loss": 0.6934, + "step": 12506 + }, + { + "epoch": 2.2266737891737893, + "grad_norm": 0.807736873626709, + "learning_rate": 8.24406971192488e-05, + "loss": 0.7258, + "step": 12507 + }, + { + "epoch": 2.226851851851852, + "grad_norm": 0.7663748860359192, + "learning_rate": 8.24269173000738e-05, + "loss": 0.7825, + "step": 12508 + }, + { + "epoch": 2.2270299145299144, + "grad_norm": 0.7799240946769714, + "learning_rate": 8.24131378252069e-05, + "loss": 0.7868, + "step": 12509 + }, + { + "epoch": 2.227207977207977, + "grad_norm": 0.8309668302536011, + "learning_rate": 8.239935869491799e-05, + "loss": 0.7697, + "step": 12510 + }, + { + "epoch": 2.22738603988604, + "grad_norm": 
0.7257094979286194, + "learning_rate": 8.23855799094771e-05, + "loss": 0.8168, + "step": 12511 + }, + { + "epoch": 2.2275641025641026, + "grad_norm": 0.8902100920677185, + "learning_rate": 8.237180146915416e-05, + "loss": 0.8606, + "step": 12512 + }, + { + "epoch": 2.2277421652421654, + "grad_norm": 0.8100315928459167, + "learning_rate": 8.235802337421919e-05, + "loss": 0.9225, + "step": 12513 + }, + { + "epoch": 2.2279202279202277, + "grad_norm": 0.6804848909378052, + "learning_rate": 8.234424562494205e-05, + "loss": 0.7047, + "step": 12514 + }, + { + "epoch": 2.2280982905982905, + "grad_norm": 0.8664964437484741, + "learning_rate": 8.233046822159276e-05, + "loss": 1.0255, + "step": 12515 + }, + { + "epoch": 2.228276353276353, + "grad_norm": 0.836857795715332, + "learning_rate": 8.231669116444128e-05, + "loss": 0.9818, + "step": 12516 + }, + { + "epoch": 2.228454415954416, + "grad_norm": 0.6999024748802185, + "learning_rate": 8.230291445375744e-05, + "loss": 0.7298, + "step": 12517 + }, + { + "epoch": 2.2286324786324787, + "grad_norm": 0.8676811456680298, + "learning_rate": 8.228913808981127e-05, + "loss": 0.9592, + "step": 12518 + }, + { + "epoch": 2.2288105413105415, + "grad_norm": 0.8088808655738831, + "learning_rate": 8.227536207287263e-05, + "loss": 1.0021, + "step": 12519 + }, + { + "epoch": 2.228988603988604, + "grad_norm": 0.7802120447158813, + "learning_rate": 8.226158640321149e-05, + "loss": 0.8519, + "step": 12520 + }, + { + "epoch": 2.2291666666666665, + "grad_norm": 0.7560334801673889, + "learning_rate": 8.224781108109766e-05, + "loss": 0.7676, + "step": 12521 + }, + { + "epoch": 2.2293447293447293, + "grad_norm": 0.7806954383850098, + "learning_rate": 8.223403610680113e-05, + "loss": 0.9151, + "step": 12522 + }, + { + "epoch": 2.229522792022792, + "grad_norm": 0.7972870469093323, + "learning_rate": 8.222026148059173e-05, + "loss": 0.8785, + "step": 12523 + }, + { + "epoch": 2.2297008547008548, + "grad_norm": 0.7868863344192505, + "learning_rate": 
8.220648720273941e-05, + "loss": 0.8981, + "step": 12524 + }, + { + "epoch": 2.2298789173789175, + "grad_norm": 0.7388648390769958, + "learning_rate": 8.219271327351397e-05, + "loss": 0.7361, + "step": 12525 + }, + { + "epoch": 2.23005698005698, + "grad_norm": 0.7367138862609863, + "learning_rate": 8.217893969318538e-05, + "loss": 0.7357, + "step": 12526 + }, + { + "epoch": 2.2302350427350426, + "grad_norm": 0.8345077037811279, + "learning_rate": 8.216516646202339e-05, + "loss": 0.9671, + "step": 12527 + }, + { + "epoch": 2.2304131054131053, + "grad_norm": 0.7875744104385376, + "learning_rate": 8.215139358029793e-05, + "loss": 0.7991, + "step": 12528 + }, + { + "epoch": 2.230591168091168, + "grad_norm": 0.7444638609886169, + "learning_rate": 8.213762104827882e-05, + "loss": 0.6524, + "step": 12529 + }, + { + "epoch": 2.230769230769231, + "grad_norm": 0.6670697927474976, + "learning_rate": 8.212384886623597e-05, + "loss": 0.639, + "step": 12530 + }, + { + "epoch": 2.2309472934472936, + "grad_norm": 0.8348705172538757, + "learning_rate": 8.211007703443913e-05, + "loss": 0.8904, + "step": 12531 + }, + { + "epoch": 2.2311253561253563, + "grad_norm": 0.8458212614059448, + "learning_rate": 8.209630555315817e-05, + "loss": 0.8398, + "step": 12532 + }, + { + "epoch": 2.2313034188034186, + "grad_norm": 0.9043961763381958, + "learning_rate": 8.20825344226629e-05, + "loss": 0.904, + "step": 12533 + }, + { + "epoch": 2.2314814814814814, + "grad_norm": 0.8207734227180481, + "learning_rate": 8.206876364322319e-05, + "loss": 0.853, + "step": 12534 + }, + { + "epoch": 2.231659544159544, + "grad_norm": 0.9311240911483765, + "learning_rate": 8.205499321510876e-05, + "loss": 0.9807, + "step": 12535 + }, + { + "epoch": 2.231837606837607, + "grad_norm": 0.8379791378974915, + "learning_rate": 8.204122313858946e-05, + "loss": 0.8318, + "step": 12536 + }, + { + "epoch": 2.2320156695156697, + "grad_norm": 0.8078454732894897, + "learning_rate": 8.202745341393515e-05, + "loss": 0.8692, + 
"step": 12537 + }, + { + "epoch": 2.232193732193732, + "grad_norm": 0.7555927038192749, + "learning_rate": 8.201368404141547e-05, + "loss": 0.8514, + "step": 12538 + }, + { + "epoch": 2.2323717948717947, + "grad_norm": 0.7724241018295288, + "learning_rate": 8.199991502130035e-05, + "loss": 0.5758, + "step": 12539 + }, + { + "epoch": 2.2325498575498575, + "grad_norm": 0.7388870120048523, + "learning_rate": 8.198614635385946e-05, + "loss": 0.6265, + "step": 12540 + }, + { + "epoch": 2.23272792022792, + "grad_norm": 0.9006723761558533, + "learning_rate": 8.197237803936267e-05, + "loss": 0.8238, + "step": 12541 + }, + { + "epoch": 2.232905982905983, + "grad_norm": 0.917884349822998, + "learning_rate": 8.195861007807962e-05, + "loss": 0.9447, + "step": 12542 + }, + { + "epoch": 2.2330840455840457, + "grad_norm": 0.81849205493927, + "learning_rate": 8.194484247028016e-05, + "loss": 0.9071, + "step": 12543 + }, + { + "epoch": 2.2332621082621085, + "grad_norm": 0.8572089076042175, + "learning_rate": 8.193107521623398e-05, + "loss": 0.9068, + "step": 12544 + }, + { + "epoch": 2.2334401709401708, + "grad_norm": 0.7870976328849792, + "learning_rate": 8.19173083162109e-05, + "loss": 0.7595, + "step": 12545 + }, + { + "epoch": 2.2336182336182335, + "grad_norm": 0.8728759288787842, + "learning_rate": 8.190354177048055e-05, + "loss": 1.0974, + "step": 12546 + }, + { + "epoch": 2.2337962962962963, + "grad_norm": 0.7679606080055237, + "learning_rate": 8.188977557931274e-05, + "loss": 0.7068, + "step": 12547 + }, + { + "epoch": 2.233974358974359, + "grad_norm": 0.7753520011901855, + "learning_rate": 8.187600974297714e-05, + "loss": 0.8008, + "step": 12548 + }, + { + "epoch": 2.234152421652422, + "grad_norm": 0.7785305976867676, + "learning_rate": 8.186224426174348e-05, + "loss": 0.8528, + "step": 12549 + }, + { + "epoch": 2.2343304843304845, + "grad_norm": 0.7762976288795471, + "learning_rate": 8.184847913588145e-05, + "loss": 0.9264, + "step": 12550 + }, + { + "epoch": 
2.234508547008547, + "grad_norm": 1.0543726682662964, + "learning_rate": 8.18347143656608e-05, + "loss": 0.9201, + "step": 12551 + }, + { + "epoch": 2.2346866096866096, + "grad_norm": 0.815389096736908, + "learning_rate": 8.182094995135116e-05, + "loss": 0.8834, + "step": 12552 + }, + { + "epoch": 2.2348646723646723, + "grad_norm": 0.774773895740509, + "learning_rate": 8.180718589322225e-05, + "loss": 0.8864, + "step": 12553 + }, + { + "epoch": 2.235042735042735, + "grad_norm": 0.8139658570289612, + "learning_rate": 8.179342219154372e-05, + "loss": 0.8696, + "step": 12554 + }, + { + "epoch": 2.235220797720798, + "grad_norm": 0.7804924249649048, + "learning_rate": 8.177965884658527e-05, + "loss": 0.8854, + "step": 12555 + }, + { + "epoch": 2.2353988603988606, + "grad_norm": 0.8601226210594177, + "learning_rate": 8.176589585861659e-05, + "loss": 0.9115, + "step": 12556 + }, + { + "epoch": 2.235576923076923, + "grad_norm": 0.7518162727355957, + "learning_rate": 8.175213322790726e-05, + "loss": 0.7871, + "step": 12557 + }, + { + "epoch": 2.2357549857549857, + "grad_norm": 0.7595868110656738, + "learning_rate": 8.1738370954727e-05, + "loss": 0.7597, + "step": 12558 + }, + { + "epoch": 2.2359330484330484, + "grad_norm": 0.8191643357276917, + "learning_rate": 8.17246090393454e-05, + "loss": 0.9443, + "step": 12559 + }, + { + "epoch": 2.236111111111111, + "grad_norm": 0.7854904532432556, + "learning_rate": 8.171084748203217e-05, + "loss": 0.8547, + "step": 12560 + }, + { + "epoch": 2.236289173789174, + "grad_norm": 0.8610023260116577, + "learning_rate": 8.169708628305684e-05, + "loss": 0.7846, + "step": 12561 + }, + { + "epoch": 2.2364672364672367, + "grad_norm": 0.8254715204238892, + "learning_rate": 8.168332544268914e-05, + "loss": 0.7493, + "step": 12562 + }, + { + "epoch": 2.236645299145299, + "grad_norm": 0.8390897512435913, + "learning_rate": 8.166956496119857e-05, + "loss": 0.9867, + "step": 12563 + }, + { + "epoch": 2.2368233618233617, + "grad_norm": 
0.8179677128791809, + "learning_rate": 8.165580483885483e-05, + "loss": 0.8039, + "step": 12564 + }, + { + "epoch": 2.2370014245014245, + "grad_norm": 0.6722155809402466, + "learning_rate": 8.164204507592745e-05, + "loss": 0.695, + "step": 12565 + }, + { + "epoch": 2.2371794871794872, + "grad_norm": 0.8228170871734619, + "learning_rate": 8.162828567268612e-05, + "loss": 1.0414, + "step": 12566 + }, + { + "epoch": 2.23735754985755, + "grad_norm": 0.8676900267601013, + "learning_rate": 8.161452662940032e-05, + "loss": 1.0157, + "step": 12567 + }, + { + "epoch": 2.2375356125356127, + "grad_norm": 0.8174694180488586, + "learning_rate": 8.16007679463397e-05, + "loss": 0.691, + "step": 12568 + }, + { + "epoch": 2.237713675213675, + "grad_norm": 0.8137148022651672, + "learning_rate": 8.158700962377379e-05, + "loss": 1.0022, + "step": 12569 + }, + { + "epoch": 2.237891737891738, + "grad_norm": 0.970250129699707, + "learning_rate": 8.157325166197221e-05, + "loss": 0.7946, + "step": 12570 + }, + { + "epoch": 2.2380698005698005, + "grad_norm": 0.7366915941238403, + "learning_rate": 8.155949406120446e-05, + "loss": 0.9039, + "step": 12571 + }, + { + "epoch": 2.2382478632478633, + "grad_norm": 0.878358781337738, + "learning_rate": 8.154573682174014e-05, + "loss": 0.8172, + "step": 12572 + }, + { + "epoch": 2.238425925925926, + "grad_norm": 0.7552989721298218, + "learning_rate": 8.153197994384875e-05, + "loss": 0.9955, + "step": 12573 + }, + { + "epoch": 2.238603988603989, + "grad_norm": 0.8198257684707642, + "learning_rate": 8.151822342779985e-05, + "loss": 0.8677, + "step": 12574 + }, + { + "epoch": 2.238782051282051, + "grad_norm": 0.9128977656364441, + "learning_rate": 8.150446727386297e-05, + "loss": 0.9531, + "step": 12575 + }, + { + "epoch": 2.238960113960114, + "grad_norm": 0.867671549320221, + "learning_rate": 8.149071148230762e-05, + "loss": 1.0226, + "step": 12576 + }, + { + "epoch": 2.2391381766381766, + "grad_norm": 0.8640758395195007, + "learning_rate": 
8.147695605340337e-05, + "loss": 1.1284, + "step": 12577 + }, + { + "epoch": 2.2393162393162394, + "grad_norm": 0.7453210353851318, + "learning_rate": 8.146320098741964e-05, + "loss": 0.7812, + "step": 12578 + }, + { + "epoch": 2.239494301994302, + "grad_norm": 0.9207521080970764, + "learning_rate": 8.144944628462602e-05, + "loss": 0.9955, + "step": 12579 + }, + { + "epoch": 2.239672364672365, + "grad_norm": 0.751732349395752, + "learning_rate": 8.143569194529193e-05, + "loss": 0.7858, + "step": 12580 + }, + { + "epoch": 2.239850427350427, + "grad_norm": 0.7955539226531982, + "learning_rate": 8.142193796968694e-05, + "loss": 0.8482, + "step": 12581 + }, + { + "epoch": 2.24002849002849, + "grad_norm": 0.8020164370536804, + "learning_rate": 8.140818435808043e-05, + "loss": 0.8069, + "step": 12582 + }, + { + "epoch": 2.2402065527065527, + "grad_norm": 0.7460235357284546, + "learning_rate": 8.139443111074198e-05, + "loss": 0.6478, + "step": 12583 + }, + { + "epoch": 2.2403846153846154, + "grad_norm": 0.7504379153251648, + "learning_rate": 8.138067822794096e-05, + "loss": 0.726, + "step": 12584 + }, + { + "epoch": 2.240562678062678, + "grad_norm": 0.8214267492294312, + "learning_rate": 8.136692570994688e-05, + "loss": 1.0114, + "step": 12585 + }, + { + "epoch": 2.240740740740741, + "grad_norm": 0.9436941742897034, + "learning_rate": 8.135317355702917e-05, + "loss": 0.873, + "step": 12586 + }, + { + "epoch": 2.2409188034188032, + "grad_norm": 0.7541804909706116, + "learning_rate": 8.133942176945733e-05, + "loss": 0.8013, + "step": 12587 + }, + { + "epoch": 2.241096866096866, + "grad_norm": 0.8725557327270508, + "learning_rate": 8.132567034750073e-05, + "loss": 0.8506, + "step": 12588 + }, + { + "epoch": 2.2412749287749287, + "grad_norm": 0.7766169905662537, + "learning_rate": 8.131191929142882e-05, + "loss": 0.9076, + "step": 12589 + }, + { + "epoch": 2.2414529914529915, + "grad_norm": 0.8852736353874207, + "learning_rate": 8.129816860151104e-05, + "loss": 0.9278, + 
"step": 12590 + }, + { + "epoch": 2.2416310541310542, + "grad_norm": 0.6939527988433838, + "learning_rate": 8.128441827801681e-05, + "loss": 0.753, + "step": 12591 + }, + { + "epoch": 2.241809116809117, + "grad_norm": 0.8932832479476929, + "learning_rate": 8.127066832121551e-05, + "loss": 0.8089, + "step": 12592 + }, + { + "epoch": 2.2419871794871793, + "grad_norm": 0.7399743795394897, + "learning_rate": 8.125691873137656e-05, + "loss": 0.6905, + "step": 12593 + }, + { + "epoch": 2.242165242165242, + "grad_norm": 0.7664098143577576, + "learning_rate": 8.124316950876933e-05, + "loss": 0.8698, + "step": 12594 + }, + { + "epoch": 2.242343304843305, + "grad_norm": 0.8222574591636658, + "learning_rate": 8.122942065366323e-05, + "loss": 0.8922, + "step": 12595 + }, + { + "epoch": 2.2425213675213675, + "grad_norm": 0.8072433471679688, + "learning_rate": 8.121567216632771e-05, + "loss": 0.7613, + "step": 12596 + }, + { + "epoch": 2.2426994301994303, + "grad_norm": 0.7647300362586975, + "learning_rate": 8.120192404703199e-05, + "loss": 0.8736, + "step": 12597 + }, + { + "epoch": 2.242877492877493, + "grad_norm": 0.7536396980285645, + "learning_rate": 8.118817629604559e-05, + "loss": 0.7697, + "step": 12598 + }, + { + "epoch": 2.2430555555555554, + "grad_norm": 0.7295291423797607, + "learning_rate": 8.117442891363774e-05, + "loss": 0.8477, + "step": 12599 + }, + { + "epoch": 2.243233618233618, + "grad_norm": 0.7677894830703735, + "learning_rate": 8.116068190007787e-05, + "loss": 0.8113, + "step": 12600 + }, + { + "epoch": 2.243411680911681, + "grad_norm": 0.825614869594574, + "learning_rate": 8.114693525563529e-05, + "loss": 0.915, + "step": 12601 + }, + { + "epoch": 2.2435897435897436, + "grad_norm": 0.7841798663139343, + "learning_rate": 8.113318898057939e-05, + "loss": 0.7028, + "step": 12602 + }, + { + "epoch": 2.2437678062678064, + "grad_norm": 1.085337519645691, + "learning_rate": 8.111944307517942e-05, + "loss": 0.8354, + "step": 12603 + }, + { + "epoch": 
2.243945868945869, + "grad_norm": 0.7831527590751648, + "learning_rate": 8.110569753970475e-05, + "loss": 1.0275, + "step": 12604 + }, + { + "epoch": 2.2441239316239314, + "grad_norm": 0.800504744052887, + "learning_rate": 8.109195237442467e-05, + "loss": 0.7006, + "step": 12605 + }, + { + "epoch": 2.244301994301994, + "grad_norm": 0.8189738392829895, + "learning_rate": 8.107820757960856e-05, + "loss": 0.8036, + "step": 12606 + }, + { + "epoch": 2.244480056980057, + "grad_norm": 0.8892425298690796, + "learning_rate": 8.106446315552562e-05, + "loss": 0.8274, + "step": 12607 + }, + { + "epoch": 2.2446581196581197, + "grad_norm": 0.8144643306732178, + "learning_rate": 8.105071910244521e-05, + "loss": 1.0648, + "step": 12608 + }, + { + "epoch": 2.2448361823361824, + "grad_norm": 0.914513111114502, + "learning_rate": 8.103697542063657e-05, + "loss": 0.8999, + "step": 12609 + }, + { + "epoch": 2.245014245014245, + "grad_norm": 0.8273763656616211, + "learning_rate": 8.102323211036904e-05, + "loss": 0.8554, + "step": 12610 + }, + { + "epoch": 2.2451923076923075, + "grad_norm": 0.9459149837493896, + "learning_rate": 8.100948917191181e-05, + "loss": 1.2345, + "step": 12611 + }, + { + "epoch": 2.2453703703703702, + "grad_norm": 0.8377025723457336, + "learning_rate": 8.099574660553425e-05, + "loss": 0.8096, + "step": 12612 + }, + { + "epoch": 2.245548433048433, + "grad_norm": 0.8639607429504395, + "learning_rate": 8.098200441150551e-05, + "loss": 0.8238, + "step": 12613 + }, + { + "epoch": 2.2457264957264957, + "grad_norm": 1.0107637643814087, + "learning_rate": 8.09682625900949e-05, + "loss": 0.8747, + "step": 12614 + }, + { + "epoch": 2.2459045584045585, + "grad_norm": 0.8153043985366821, + "learning_rate": 8.095452114157164e-05, + "loss": 0.9357, + "step": 12615 + }, + { + "epoch": 2.2460826210826212, + "grad_norm": 0.8948562741279602, + "learning_rate": 8.094078006620497e-05, + "loss": 0.8245, + "step": 12616 + }, + { + "epoch": 2.246260683760684, + "grad_norm": 
0.7983259558677673, + "learning_rate": 8.092703936426416e-05, + "loss": 0.8936, + "step": 12617 + }, + { + "epoch": 2.2464387464387463, + "grad_norm": 0.9016979336738586, + "learning_rate": 8.091329903601835e-05, + "loss": 1.0685, + "step": 12618 + }, + { + "epoch": 2.246616809116809, + "grad_norm": 0.7192493677139282, + "learning_rate": 8.089955908173685e-05, + "loss": 0.8622, + "step": 12619 + }, + { + "epoch": 2.246794871794872, + "grad_norm": 0.78288334608078, + "learning_rate": 8.088581950168877e-05, + "loss": 0.7874, + "step": 12620 + }, + { + "epoch": 2.2469729344729346, + "grad_norm": 0.8438683152198792, + "learning_rate": 8.087208029614336e-05, + "loss": 0.9262, + "step": 12621 + }, + { + "epoch": 2.2471509971509973, + "grad_norm": 0.8384907245635986, + "learning_rate": 8.085834146536978e-05, + "loss": 0.9069, + "step": 12622 + }, + { + "epoch": 2.2473290598290596, + "grad_norm": 0.8209545016288757, + "learning_rate": 8.084460300963729e-05, + "loss": 0.9457, + "step": 12623 + }, + { + "epoch": 2.2475071225071224, + "grad_norm": 0.8220782279968262, + "learning_rate": 8.083086492921496e-05, + "loss": 0.9224, + "step": 12624 + }, + { + "epoch": 2.247685185185185, + "grad_norm": 0.8927256464958191, + "learning_rate": 8.081712722437204e-05, + "loss": 0.7091, + "step": 12625 + }, + { + "epoch": 2.247863247863248, + "grad_norm": 0.8878564238548279, + "learning_rate": 8.080338989537764e-05, + "loss": 0.8879, + "step": 12626 + }, + { + "epoch": 2.2480413105413106, + "grad_norm": 0.8380948305130005, + "learning_rate": 8.078965294250097e-05, + "loss": 0.8504, + "step": 12627 + }, + { + "epoch": 2.2482193732193734, + "grad_norm": 0.8005350828170776, + "learning_rate": 8.07759163660111e-05, + "loss": 1.2119, + "step": 12628 + }, + { + "epoch": 2.248397435897436, + "grad_norm": 0.7990152835845947, + "learning_rate": 8.076218016617726e-05, + "loss": 0.72, + "step": 12629 + }, + { + "epoch": 2.2485754985754984, + "grad_norm": 0.9264963269233704, + "learning_rate": 
8.07484443432685e-05, + "loss": 0.8398, + "step": 12630 + }, + { + "epoch": 2.248753561253561, + "grad_norm": 0.9103235602378845, + "learning_rate": 8.073470889755402e-05, + "loss": 0.9122, + "step": 12631 + }, + { + "epoch": 2.248931623931624, + "grad_norm": 0.8042106032371521, + "learning_rate": 8.072097382930285e-05, + "loss": 0.8065, + "step": 12632 + }, + { + "epoch": 2.2491096866096867, + "grad_norm": 0.8464857935905457, + "learning_rate": 8.070723913878421e-05, + "loss": 0.9117, + "step": 12633 + }, + { + "epoch": 2.2492877492877494, + "grad_norm": 0.7476474642753601, + "learning_rate": 8.06935048262671e-05, + "loss": 1.0252, + "step": 12634 + }, + { + "epoch": 2.2494658119658117, + "grad_norm": 0.8098256587982178, + "learning_rate": 8.067977089202065e-05, + "loss": 0.911, + "step": 12635 + }, + { + "epoch": 2.2496438746438745, + "grad_norm": 0.9311509728431702, + "learning_rate": 8.066603733631398e-05, + "loss": 0.9594, + "step": 12636 + }, + { + "epoch": 2.2496438746438745, + "eval_loss": 1.1335573196411133, + "eval_runtime": 24.2688, + "eval_samples_per_second": 42.895, + "eval_steps_per_second": 21.468, + "step": 12636 + }, + { + "epoch": 2.2498219373219372, + "grad_norm": 0.7744980454444885, + "learning_rate": 8.065230415941612e-05, + "loss": 0.8983, + "step": 12637 + }, + { + "epoch": 2.25, + "grad_norm": 0.9464056491851807, + "learning_rate": 8.06385713615962e-05, + "loss": 0.8646, + "step": 12638 + }, + { + "epoch": 2.2501780626780628, + "grad_norm": 0.8263896107673645, + "learning_rate": 8.062483894312323e-05, + "loss": 0.8557, + "step": 12639 + }, + { + "epoch": 2.2503561253561255, + "grad_norm": 0.8827885389328003, + "learning_rate": 8.06111069042663e-05, + "loss": 0.7632, + "step": 12640 + }, + { + "epoch": 2.2505341880341883, + "grad_norm": 0.8537881374359131, + "learning_rate": 8.059737524529443e-05, + "loss": 0.8004, + "step": 12641 + }, + { + "epoch": 2.2507122507122506, + "grad_norm": 0.8397842049598694, + "learning_rate": 
8.058364396647674e-05, + "loss": 0.9487, + "step": 12642 + }, + { + "epoch": 2.2508903133903133, + "grad_norm": 1.071976661682129, + "learning_rate": 8.056991306808217e-05, + "loss": 1.0699, + "step": 12643 + }, + { + "epoch": 2.251068376068376, + "grad_norm": 0.8712023496627808, + "learning_rate": 8.055618255037983e-05, + "loss": 0.6518, + "step": 12644 + }, + { + "epoch": 2.251246438746439, + "grad_norm": 0.7885438799858093, + "learning_rate": 8.054245241363866e-05, + "loss": 0.8458, + "step": 12645 + }, + { + "epoch": 2.2514245014245016, + "grad_norm": 0.947169840335846, + "learning_rate": 8.052872265812774e-05, + "loss": 0.6631, + "step": 12646 + }, + { + "epoch": 2.251602564102564, + "grad_norm": 0.8554182052612305, + "learning_rate": 8.051499328411603e-05, + "loss": 0.8945, + "step": 12647 + }, + { + "epoch": 2.2517806267806266, + "grad_norm": 0.8081278800964355, + "learning_rate": 8.050126429187259e-05, + "loss": 0.8969, + "step": 12648 + }, + { + "epoch": 2.2519586894586894, + "grad_norm": 0.7826179265975952, + "learning_rate": 8.048753568166633e-05, + "loss": 0.6965, + "step": 12649 + }, + { + "epoch": 2.252136752136752, + "grad_norm": 0.9688517451286316, + "learning_rate": 8.04738074537663e-05, + "loss": 1.0044, + "step": 12650 + }, + { + "epoch": 2.252314814814815, + "grad_norm": 0.7780970931053162, + "learning_rate": 8.04600796084414e-05, + "loss": 0.8712, + "step": 12651 + }, + { + "epoch": 2.2524928774928776, + "grad_norm": 0.8360016942024231, + "learning_rate": 8.044635214596073e-05, + "loss": 0.9522, + "step": 12652 + }, + { + "epoch": 2.2526709401709404, + "grad_norm": 0.8137710094451904, + "learning_rate": 8.043262506659311e-05, + "loss": 0.7953, + "step": 12653 + }, + { + "epoch": 2.2528490028490027, + "grad_norm": 0.8394312858581543, + "learning_rate": 8.041889837060755e-05, + "loss": 0.77, + "step": 12654 + }, + { + "epoch": 2.2530270655270654, + "grad_norm": 0.7245169878005981, + "learning_rate": 8.040517205827307e-05, + "loss": 0.7657, + 
"step": 12655 + }, + { + "epoch": 2.253205128205128, + "grad_norm": 0.8018792271614075, + "learning_rate": 8.039144612985846e-05, + "loss": 0.8974, + "step": 12656 + }, + { + "epoch": 2.253383190883191, + "grad_norm": 0.8204617500305176, + "learning_rate": 8.037772058563278e-05, + "loss": 0.8635, + "step": 12657 + }, + { + "epoch": 2.2535612535612537, + "grad_norm": 0.906288743019104, + "learning_rate": 8.036399542586485e-05, + "loss": 1.0498, + "step": 12658 + }, + { + "epoch": 2.253739316239316, + "grad_norm": 0.8674196600914001, + "learning_rate": 8.035027065082371e-05, + "loss": 0.8621, + "step": 12659 + }, + { + "epoch": 2.2539173789173788, + "grad_norm": 0.8112890124320984, + "learning_rate": 8.033654626077816e-05, + "loss": 0.9937, + "step": 12660 + }, + { + "epoch": 2.2540954415954415, + "grad_norm": 0.8072839975357056, + "learning_rate": 8.032282225599714e-05, + "loss": 0.8555, + "step": 12661 + }, + { + "epoch": 2.2542735042735043, + "grad_norm": 0.7853979468345642, + "learning_rate": 8.030909863674952e-05, + "loss": 0.8698, + "step": 12662 + }, + { + "epoch": 2.254451566951567, + "grad_norm": 0.7456761598587036, + "learning_rate": 8.029537540330426e-05, + "loss": 0.6214, + "step": 12663 + }, + { + "epoch": 2.2546296296296298, + "grad_norm": 0.7207663059234619, + "learning_rate": 8.028165255593015e-05, + "loss": 0.7641, + "step": 12664 + }, + { + "epoch": 2.2548076923076925, + "grad_norm": 0.6541373133659363, + "learning_rate": 8.02679300948961e-05, + "loss": 0.7438, + "step": 12665 + }, + { + "epoch": 2.254985754985755, + "grad_norm": 0.7535310983657837, + "learning_rate": 8.025420802047096e-05, + "loss": 0.9417, + "step": 12666 + }, + { + "epoch": 2.2551638176638176, + "grad_norm": 0.88471919298172, + "learning_rate": 8.024048633292364e-05, + "loss": 0.9122, + "step": 12667 + }, + { + "epoch": 2.2553418803418803, + "grad_norm": 0.8621570467948914, + "learning_rate": 8.02267650325229e-05, + "loss": 0.7026, + "step": 12668 + }, + { + "epoch": 
2.255519943019943, + "grad_norm": 0.8574202060699463, + "learning_rate": 8.021304411953767e-05, + "loss": 0.8997, + "step": 12669 + }, + { + "epoch": 2.255698005698006, + "grad_norm": 0.8038806915283203, + "learning_rate": 8.019932359423667e-05, + "loss": 0.9386, + "step": 12670 + }, + { + "epoch": 2.255876068376068, + "grad_norm": 0.7760711908340454, + "learning_rate": 8.018560345688883e-05, + "loss": 0.7777, + "step": 12671 + }, + { + "epoch": 2.256054131054131, + "grad_norm": 0.7433146834373474, + "learning_rate": 8.017188370776292e-05, + "loss": 0.7245, + "step": 12672 + }, + { + "epoch": 2.2562321937321936, + "grad_norm": 0.8710882067680359, + "learning_rate": 8.01581643471278e-05, + "loss": 0.9478, + "step": 12673 + }, + { + "epoch": 2.2564102564102564, + "grad_norm": 0.7726423740386963, + "learning_rate": 8.014444537525218e-05, + "loss": 0.8388, + "step": 12674 + }, + { + "epoch": 2.256588319088319, + "grad_norm": 0.7967063188552856, + "learning_rate": 8.01307267924049e-05, + "loss": 0.8226, + "step": 12675 + }, + { + "epoch": 2.256766381766382, + "grad_norm": 0.7524598836898804, + "learning_rate": 8.011700859885479e-05, + "loss": 0.7285, + "step": 12676 + }, + { + "epoch": 2.2569444444444446, + "grad_norm": 0.808729887008667, + "learning_rate": 8.010329079487055e-05, + "loss": 0.7498, + "step": 12677 + }, + { + "epoch": 2.257122507122507, + "grad_norm": 0.7842788100242615, + "learning_rate": 8.008957338072106e-05, + "loss": 0.9216, + "step": 12678 + }, + { + "epoch": 2.2573005698005697, + "grad_norm": 0.8905709981918335, + "learning_rate": 8.007585635667497e-05, + "loss": 0.9254, + "step": 12679 + }, + { + "epoch": 2.2574786324786325, + "grad_norm": 0.7495295405387878, + "learning_rate": 8.006213972300112e-05, + "loss": 0.8407, + "step": 12680 + }, + { + "epoch": 2.257656695156695, + "grad_norm": 0.7425774335861206, + "learning_rate": 8.004842347996819e-05, + "loss": 0.7893, + "step": 12681 + }, + { + "epoch": 2.257834757834758, + "grad_norm": 
0.8028583526611328, + "learning_rate": 8.003470762784498e-05, + "loss": 0.8106, + "step": 12682 + }, + { + "epoch": 2.2580128205128207, + "grad_norm": 0.8874917030334473, + "learning_rate": 8.002099216690017e-05, + "loss": 0.97, + "step": 12683 + }, + { + "epoch": 2.258190883190883, + "grad_norm": 0.8830558061599731, + "learning_rate": 8.000727709740257e-05, + "loss": 1.028, + "step": 12684 + }, + { + "epoch": 2.2583689458689458, + "grad_norm": 0.8720272779464722, + "learning_rate": 7.99935624196208e-05, + "loss": 0.9401, + "step": 12685 + }, + { + "epoch": 2.2585470085470085, + "grad_norm": 0.736709713935852, + "learning_rate": 7.997984813382362e-05, + "loss": 0.8479, + "step": 12686 + }, + { + "epoch": 2.2587250712250713, + "grad_norm": 0.8028469085693359, + "learning_rate": 7.996613424027973e-05, + "loss": 0.9291, + "step": 12687 + }, + { + "epoch": 2.258903133903134, + "grad_norm": 0.777618944644928, + "learning_rate": 7.995242073925784e-05, + "loss": 0.7021, + "step": 12688 + }, + { + "epoch": 2.2590811965811968, + "grad_norm": 0.8371155261993408, + "learning_rate": 7.993870763102659e-05, + "loss": 0.8309, + "step": 12689 + }, + { + "epoch": 2.259259259259259, + "grad_norm": 0.8853654861450195, + "learning_rate": 7.992499491585473e-05, + "loss": 0.762, + "step": 12690 + }, + { + "epoch": 2.259437321937322, + "grad_norm": 0.742594301700592, + "learning_rate": 7.991128259401086e-05, + "loss": 0.8025, + "step": 12691 + }, + { + "epoch": 2.2596153846153846, + "grad_norm": 1.0678842067718506, + "learning_rate": 7.989757066576369e-05, + "loss": 0.9127, + "step": 12692 + }, + { + "epoch": 2.2597934472934473, + "grad_norm": 0.7917565703392029, + "learning_rate": 7.988385913138183e-05, + "loss": 0.8078, + "step": 12693 + }, + { + "epoch": 2.25997150997151, + "grad_norm": 0.6907288432121277, + "learning_rate": 7.987014799113397e-05, + "loss": 0.6313, + "step": 12694 + }, + { + "epoch": 2.260149572649573, + "grad_norm": 0.9007455706596375, + "learning_rate": 
7.98564372452888e-05, + "loss": 1.0734, + "step": 12695 + }, + { + "epoch": 2.260327635327635, + "grad_norm": 0.7732774615287781, + "learning_rate": 7.984272689411484e-05, + "loss": 0.9925, + "step": 12696 + }, + { + "epoch": 2.260505698005698, + "grad_norm": 0.7470823526382446, + "learning_rate": 7.982901693788082e-05, + "loss": 0.8518, + "step": 12697 + }, + { + "epoch": 2.2606837606837606, + "grad_norm": 0.8018864989280701, + "learning_rate": 7.981530737685526e-05, + "loss": 0.8668, + "step": 12698 + }, + { + "epoch": 2.2608618233618234, + "grad_norm": 0.8459745049476624, + "learning_rate": 7.980159821130688e-05, + "loss": 0.8972, + "step": 12699 + }, + { + "epoch": 2.261039886039886, + "grad_norm": 0.8255595564842224, + "learning_rate": 7.978788944150419e-05, + "loss": 0.9562, + "step": 12700 + }, + { + "epoch": 2.261217948717949, + "grad_norm": 0.8243128061294556, + "learning_rate": 7.977418106771582e-05, + "loss": 0.6634, + "step": 12701 + }, + { + "epoch": 2.261396011396011, + "grad_norm": 0.802949845790863, + "learning_rate": 7.976047309021034e-05, + "loss": 0.8155, + "step": 12702 + }, + { + "epoch": 2.261574074074074, + "grad_norm": 0.8480857014656067, + "learning_rate": 7.97467655092564e-05, + "loss": 0.8568, + "step": 12703 + }, + { + "epoch": 2.2617521367521367, + "grad_norm": 0.8777545690536499, + "learning_rate": 7.973305832512247e-05, + "loss": 0.8688, + "step": 12704 + }, + { + "epoch": 2.2619301994301995, + "grad_norm": 0.8334060907363892, + "learning_rate": 7.971935153807719e-05, + "loss": 0.932, + "step": 12705 + }, + { + "epoch": 2.262108262108262, + "grad_norm": 0.836976170539856, + "learning_rate": 7.970564514838907e-05, + "loss": 0.8205, + "step": 12706 + }, + { + "epoch": 2.262286324786325, + "grad_norm": 0.782866895198822, + "learning_rate": 7.969193915632667e-05, + "loss": 0.8362, + "step": 12707 + }, + { + "epoch": 2.2624643874643873, + "grad_norm": 0.9018504619598389, + "learning_rate": 7.967823356215854e-05, + "loss": 0.8354, + "step": 
12708 + }, + { + "epoch": 2.26264245014245, + "grad_norm": 0.7974916696548462, + "learning_rate": 7.966452836615324e-05, + "loss": 0.8035, + "step": 12709 + }, + { + "epoch": 2.2628205128205128, + "grad_norm": 0.8745712637901306, + "learning_rate": 7.965082356857922e-05, + "loss": 0.8803, + "step": 12710 + }, + { + "epoch": 2.2629985754985755, + "grad_norm": 0.8667176365852356, + "learning_rate": 7.963711916970505e-05, + "loss": 0.8005, + "step": 12711 + }, + { + "epoch": 2.2631766381766383, + "grad_norm": 0.849998950958252, + "learning_rate": 7.962341516979922e-05, + "loss": 0.8208, + "step": 12712 + }, + { + "epoch": 2.263354700854701, + "grad_norm": 0.803727388381958, + "learning_rate": 7.960971156913028e-05, + "loss": 0.7232, + "step": 12713 + }, + { + "epoch": 2.263532763532764, + "grad_norm": 0.842913031578064, + "learning_rate": 7.959600836796664e-05, + "loss": 0.8182, + "step": 12714 + }, + { + "epoch": 2.263710826210826, + "grad_norm": 0.8191903829574585, + "learning_rate": 7.958230556657684e-05, + "loss": 0.8353, + "step": 12715 + }, + { + "epoch": 2.263888888888889, + "grad_norm": 0.8525017499923706, + "learning_rate": 7.95686031652294e-05, + "loss": 0.9824, + "step": 12716 + }, + { + "epoch": 2.2640669515669516, + "grad_norm": 0.7176641225814819, + "learning_rate": 7.955490116419267e-05, + "loss": 0.7722, + "step": 12717 + }, + { + "epoch": 2.2642450142450143, + "grad_norm": 0.8740555047988892, + "learning_rate": 7.954119956373521e-05, + "loss": 0.8286, + "step": 12718 + }, + { + "epoch": 2.264423076923077, + "grad_norm": 0.7928949594497681, + "learning_rate": 7.952749836412543e-05, + "loss": 0.9183, + "step": 12719 + }, + { + "epoch": 2.2646011396011394, + "grad_norm": 0.787661612033844, + "learning_rate": 7.951379756563185e-05, + "loss": 0.7741, + "step": 12720 + }, + { + "epoch": 2.264779202279202, + "grad_norm": 0.8369856476783752, + "learning_rate": 7.950009716852277e-05, + "loss": 0.911, + "step": 12721 + }, + { + "epoch": 2.264957264957265, + 
"grad_norm": 0.7838568687438965, + "learning_rate": 7.948639717306675e-05, + "loss": 0.8532, + "step": 12722 + }, + { + "epoch": 2.2651353276353277, + "grad_norm": 0.8287179470062256, + "learning_rate": 7.947269757953213e-05, + "loss": 0.893, + "step": 12723 + }, + { + "epoch": 2.2653133903133904, + "grad_norm": 0.7754728198051453, + "learning_rate": 7.945899838818741e-05, + "loss": 0.9516, + "step": 12724 + }, + { + "epoch": 2.265491452991453, + "grad_norm": 0.7088906764984131, + "learning_rate": 7.94452995993009e-05, + "loss": 0.6797, + "step": 12725 + }, + { + "epoch": 2.265669515669516, + "grad_norm": 0.8004380464553833, + "learning_rate": 7.94316012131411e-05, + "loss": 0.8583, + "step": 12726 + }, + { + "epoch": 2.265847578347578, + "grad_norm": 0.8221408128738403, + "learning_rate": 7.941790322997629e-05, + "loss": 0.9575, + "step": 12727 + }, + { + "epoch": 2.266025641025641, + "grad_norm": 0.8640061020851135, + "learning_rate": 7.940420565007492e-05, + "loss": 0.9471, + "step": 12728 + }, + { + "epoch": 2.2662037037037037, + "grad_norm": 0.8151915669441223, + "learning_rate": 7.939050847370536e-05, + "loss": 0.7841, + "step": 12729 + }, + { + "epoch": 2.2663817663817665, + "grad_norm": 0.7910612225532532, + "learning_rate": 7.9376811701136e-05, + "loss": 0.8826, + "step": 12730 + }, + { + "epoch": 2.2665598290598292, + "grad_norm": 0.7158875465393066, + "learning_rate": 7.936311533263514e-05, + "loss": 0.8598, + "step": 12731 + }, + { + "epoch": 2.2667378917378915, + "grad_norm": 0.6968050003051758, + "learning_rate": 7.934941936847119e-05, + "loss": 0.742, + "step": 12732 + }, + { + "epoch": 2.2669159544159543, + "grad_norm": 0.8630516529083252, + "learning_rate": 7.933572380891245e-05, + "loss": 0.789, + "step": 12733 + }, + { + "epoch": 2.267094017094017, + "grad_norm": 0.8060851097106934, + "learning_rate": 7.932202865422726e-05, + "loss": 0.8447, + "step": 12734 + }, + { + "epoch": 2.26727207977208, + "grad_norm": 0.9570813775062561, + 
"learning_rate": 7.930833390468402e-05, + "loss": 0.8948, + "step": 12735 + }, + { + "epoch": 2.2674501424501425, + "grad_norm": 0.7649935483932495, + "learning_rate": 7.929463956055094e-05, + "loss": 0.905, + "step": 12736 + }, + { + "epoch": 2.2676282051282053, + "grad_norm": 0.7498226165771484, + "learning_rate": 7.928094562209641e-05, + "loss": 0.9168, + "step": 12737 + }, + { + "epoch": 2.267806267806268, + "grad_norm": 0.7915979027748108, + "learning_rate": 7.926725208958869e-05, + "loss": 0.8628, + "step": 12738 + }, + { + "epoch": 2.2679843304843303, + "grad_norm": 0.7620252966880798, + "learning_rate": 7.925355896329615e-05, + "loss": 0.8768, + "step": 12739 + }, + { + "epoch": 2.268162393162393, + "grad_norm": 0.9785344004631042, + "learning_rate": 7.923986624348697e-05, + "loss": 0.9579, + "step": 12740 + }, + { + "epoch": 2.268340455840456, + "grad_norm": 0.9146337509155273, + "learning_rate": 7.922617393042954e-05, + "loss": 1.2241, + "step": 12741 + }, + { + "epoch": 2.2685185185185186, + "grad_norm": 0.7815660238265991, + "learning_rate": 7.921248202439203e-05, + "loss": 0.7367, + "step": 12742 + }, + { + "epoch": 2.2686965811965814, + "grad_norm": 0.8466008305549622, + "learning_rate": 7.919879052564276e-05, + "loss": 0.923, + "step": 12743 + }, + { + "epoch": 2.2688746438746437, + "grad_norm": 0.742203950881958, + "learning_rate": 7.918509943444998e-05, + "loss": 0.8282, + "step": 12744 + }, + { + "epoch": 2.2690527065527064, + "grad_norm": 0.785446286201477, + "learning_rate": 7.917140875108196e-05, + "loss": 0.7689, + "step": 12745 + }, + { + "epoch": 2.269230769230769, + "grad_norm": 0.912765383720398, + "learning_rate": 7.915771847580689e-05, + "loss": 0.8259, + "step": 12746 + }, + { + "epoch": 2.269408831908832, + "grad_norm": 0.7319221496582031, + "learning_rate": 7.914402860889305e-05, + "loss": 0.8861, + "step": 12747 + }, + { + "epoch": 2.2695868945868947, + "grad_norm": 1.0215193033218384, + "learning_rate": 7.913033915060861e-05, + 
"loss": 0.8556, + "step": 12748 + }, + { + "epoch": 2.2697649572649574, + "grad_norm": 0.9348630905151367, + "learning_rate": 7.911665010122188e-05, + "loss": 1.0269, + "step": 12749 + }, + { + "epoch": 2.26994301994302, + "grad_norm": 0.7521753311157227, + "learning_rate": 7.910296146100096e-05, + "loss": 0.8492, + "step": 12750 + }, + { + "epoch": 2.2701210826210825, + "grad_norm": 0.7274978756904602, + "learning_rate": 7.908927323021414e-05, + "loss": 0.7029, + "step": 12751 + }, + { + "epoch": 2.2702991452991452, + "grad_norm": 0.8103266954421997, + "learning_rate": 7.907558540912954e-05, + "loss": 0.5268, + "step": 12752 + }, + { + "epoch": 2.270477207977208, + "grad_norm": 0.8645551800727844, + "learning_rate": 7.906189799801538e-05, + "loss": 0.8172, + "step": 12753 + }, + { + "epoch": 2.2706552706552707, + "grad_norm": 0.8652981519699097, + "learning_rate": 7.904821099713984e-05, + "loss": 0.8711, + "step": 12754 + }, + { + "epoch": 2.2708333333333335, + "grad_norm": 0.7020241618156433, + "learning_rate": 7.903452440677106e-05, + "loss": 0.7202, + "step": 12755 + }, + { + "epoch": 2.271011396011396, + "grad_norm": 0.9812583923339844, + "learning_rate": 7.902083822717727e-05, + "loss": 0.8274, + "step": 12756 + }, + { + "epoch": 2.2711894586894585, + "grad_norm": 0.9119269847869873, + "learning_rate": 7.900715245862655e-05, + "loss": 0.8695, + "step": 12757 + }, + { + "epoch": 2.2713675213675213, + "grad_norm": 0.7336047291755676, + "learning_rate": 7.899346710138706e-05, + "loss": 0.8138, + "step": 12758 + }, + { + "epoch": 2.271545584045584, + "grad_norm": 0.900337278842926, + "learning_rate": 7.897978215572695e-05, + "loss": 0.8346, + "step": 12759 + }, + { + "epoch": 2.271723646723647, + "grad_norm": 0.736018717288971, + "learning_rate": 7.896609762191437e-05, + "loss": 0.7045, + "step": 12760 + }, + { + "epoch": 2.2719017094017095, + "grad_norm": 0.8484935760498047, + "learning_rate": 7.895241350021737e-05, + "loss": 1.05, + "step": 12761 + }, + { + 
"epoch": 2.2720797720797723, + "grad_norm": 0.8032337427139282, + "learning_rate": 7.893872979090415e-05, + "loss": 0.8024, + "step": 12762 + }, + { + "epoch": 2.2722578347578346, + "grad_norm": 0.8957629203796387, + "learning_rate": 7.892504649424272e-05, + "loss": 0.9593, + "step": 12763 + }, + { + "epoch": 2.2724358974358974, + "grad_norm": 0.9227191805839539, + "learning_rate": 7.891136361050126e-05, + "loss": 0.9978, + "step": 12764 + }, + { + "epoch": 2.27261396011396, + "grad_norm": 0.8649391531944275, + "learning_rate": 7.88976811399478e-05, + "loss": 0.8525, + "step": 12765 + }, + { + "epoch": 2.272792022792023, + "grad_norm": 0.8762859106063843, + "learning_rate": 7.888399908285046e-05, + "loss": 0.9526, + "step": 12766 + }, + { + "epoch": 2.2729700854700856, + "grad_norm": 0.8566350340843201, + "learning_rate": 7.887031743947729e-05, + "loss": 0.7886, + "step": 12767 + }, + { + "epoch": 2.273148148148148, + "grad_norm": 0.9285386800765991, + "learning_rate": 7.885663621009634e-05, + "loss": 1.013, + "step": 12768 + }, + { + "epoch": 2.2733262108262107, + "grad_norm": 0.9326284527778625, + "learning_rate": 7.884295539497569e-05, + "loss": 0.9908, + "step": 12769 + }, + { + "epoch": 2.2735042735042734, + "grad_norm": 0.8035810589790344, + "learning_rate": 7.882927499438341e-05, + "loss": 0.7452, + "step": 12770 + }, + { + "epoch": 2.273682336182336, + "grad_norm": 0.831741988658905, + "learning_rate": 7.881559500858747e-05, + "loss": 0.8782, + "step": 12771 + }, + { + "epoch": 2.273860398860399, + "grad_norm": 0.7790034413337708, + "learning_rate": 7.880191543785594e-05, + "loss": 0.9494, + "step": 12772 + }, + { + "epoch": 2.2740384615384617, + "grad_norm": 0.7070899605751038, + "learning_rate": 7.878823628245684e-05, + "loss": 0.7007, + "step": 12773 + }, + { + "epoch": 2.2742165242165244, + "grad_norm": 0.739573061466217, + "learning_rate": 7.877455754265818e-05, + "loss": 0.758, + "step": 12774 + }, + { + "epoch": 2.2743945868945867, + "grad_norm": 
1.091391921043396, + "learning_rate": 7.876087921872803e-05, + "loss": 1.1333, + "step": 12775 + }, + { + "epoch": 2.2745726495726495, + "grad_norm": 0.623710036277771, + "learning_rate": 7.874720131093427e-05, + "loss": 0.7068, + "step": 12776 + }, + { + "epoch": 2.2747507122507122, + "grad_norm": 0.7989393472671509, + "learning_rate": 7.8733523819545e-05, + "loss": 0.77, + "step": 12777 + }, + { + "epoch": 2.274928774928775, + "grad_norm": 0.8401352167129517, + "learning_rate": 7.87198467448281e-05, + "loss": 0.8192, + "step": 12778 + }, + { + "epoch": 2.2751068376068377, + "grad_norm": 0.7962843179702759, + "learning_rate": 7.870617008705164e-05, + "loss": 0.8071, + "step": 12779 + }, + { + "epoch": 2.2752849002849, + "grad_norm": 0.9518889784812927, + "learning_rate": 7.869249384648351e-05, + "loss": 0.8956, + "step": 12780 + }, + { + "epoch": 2.275462962962963, + "grad_norm": 0.7469878792762756, + "learning_rate": 7.867881802339175e-05, + "loss": 0.6816, + "step": 12781 + }, + { + "epoch": 2.2756410256410255, + "grad_norm": 0.8888431191444397, + "learning_rate": 7.866514261804421e-05, + "loss": 0.7906, + "step": 12782 + }, + { + "epoch": 2.2758190883190883, + "grad_norm": 0.9856036305427551, + "learning_rate": 7.86514676307089e-05, + "loss": 0.9371, + "step": 12783 + }, + { + "epoch": 2.275997150997151, + "grad_norm": 0.9144912958145142, + "learning_rate": 7.863779306165371e-05, + "loss": 0.9613, + "step": 12784 + }, + { + "epoch": 2.276175213675214, + "grad_norm": 0.7898108959197998, + "learning_rate": 7.862411891114665e-05, + "loss": 0.8631, + "step": 12785 + }, + { + "epoch": 2.2763532763532766, + "grad_norm": 0.8524056077003479, + "learning_rate": 7.861044517945552e-05, + "loss": 0.7011, + "step": 12786 + }, + { + "epoch": 2.276531339031339, + "grad_norm": 0.8811307549476624, + "learning_rate": 7.859677186684831e-05, + "loss": 0.8138, + "step": 12787 + }, + { + "epoch": 2.2767094017094016, + "grad_norm": 0.9247646331787109, + "learning_rate": 
7.858309897359289e-05, + "loss": 0.8971, + "step": 12788 + }, + { + "epoch": 2.2768874643874644, + "grad_norm": 0.8655884861946106, + "learning_rate": 7.856942649995715e-05, + "loss": 0.885, + "step": 12789 + }, + { + "epoch": 2.277065527065527, + "grad_norm": 0.9330910444259644, + "learning_rate": 7.855575444620897e-05, + "loss": 0.8493, + "step": 12790 + }, + { + "epoch": 2.27724358974359, + "grad_norm": 0.746694028377533, + "learning_rate": 7.854208281261626e-05, + "loss": 0.7169, + "step": 12791 + }, + { + "epoch": 2.277421652421652, + "grad_norm": 0.9785143136978149, + "learning_rate": 7.852841159944685e-05, + "loss": 1.023, + "step": 12792 + }, + { + "epoch": 2.277599715099715, + "grad_norm": 0.6107021570205688, + "learning_rate": 7.851474080696859e-05, + "loss": 0.652, + "step": 12793 + }, + { + "epoch": 2.2777777777777777, + "grad_norm": 0.9269224405288696, + "learning_rate": 7.850107043544937e-05, + "loss": 0.8024, + "step": 12794 + }, + { + "epoch": 2.2779558404558404, + "grad_norm": 0.8488328456878662, + "learning_rate": 7.8487400485157e-05, + "loss": 0.8626, + "step": 12795 + }, + { + "epoch": 2.278133903133903, + "grad_norm": 0.7187852263450623, + "learning_rate": 7.847373095635937e-05, + "loss": 0.7416, + "step": 12796 + }, + { + "epoch": 2.278311965811966, + "grad_norm": 1.00519859790802, + "learning_rate": 7.846006184932422e-05, + "loss": 1.0577, + "step": 12797 + }, + { + "epoch": 2.2784900284900287, + "grad_norm": 0.8175216913223267, + "learning_rate": 7.844639316431945e-05, + "loss": 0.9685, + "step": 12798 + }, + { + "epoch": 2.278668091168091, + "grad_norm": 0.8239067792892456, + "learning_rate": 7.843272490161281e-05, + "loss": 0.8714, + "step": 12799 + }, + { + "epoch": 2.2788461538461537, + "grad_norm": 0.8089447617530823, + "learning_rate": 7.841905706147212e-05, + "loss": 0.8397, + "step": 12800 + }, + { + "epoch": 2.2790242165242165, + "grad_norm": 0.8505867719650269, + "learning_rate": 7.840538964416518e-05, + "loss": 0.6872, + "step": 
12801 + }, + { + "epoch": 2.2792022792022792, + "grad_norm": 0.8512473702430725, + "learning_rate": 7.83917226499598e-05, + "loss": 1.0422, + "step": 12802 + }, + { + "epoch": 2.279380341880342, + "grad_norm": 0.8935198187828064, + "learning_rate": 7.837805607912369e-05, + "loss": 0.9874, + "step": 12803 + }, + { + "epoch": 2.2795584045584047, + "grad_norm": 0.6903477907180786, + "learning_rate": 7.836438993192466e-05, + "loss": 0.7301, + "step": 12804 + }, + { + "epoch": 2.279736467236467, + "grad_norm": 0.7140037417411804, + "learning_rate": 7.835072420863046e-05, + "loss": 0.8323, + "step": 12805 + }, + { + "epoch": 2.27991452991453, + "grad_norm": 0.7974498867988586, + "learning_rate": 7.833705890950888e-05, + "loss": 0.7784, + "step": 12806 + }, + { + "epoch": 2.2800925925925926, + "grad_norm": 0.8191199898719788, + "learning_rate": 7.83233940348276e-05, + "loss": 0.8584, + "step": 12807 + }, + { + "epoch": 2.2802706552706553, + "grad_norm": 0.843112587928772, + "learning_rate": 7.83097295848544e-05, + "loss": 1.043, + "step": 12808 + }, + { + "epoch": 2.280448717948718, + "grad_norm": 0.8029288053512573, + "learning_rate": 7.829606555985698e-05, + "loss": 0.6806, + "step": 12809 + }, + { + "epoch": 2.280626780626781, + "grad_norm": 0.712228536605835, + "learning_rate": 7.828240196010311e-05, + "loss": 0.7748, + "step": 12810 + }, + { + "epoch": 2.280804843304843, + "grad_norm": 0.801659882068634, + "learning_rate": 7.82687387858604e-05, + "loss": 0.9374, + "step": 12811 + }, + { + "epoch": 2.280982905982906, + "grad_norm": 0.8457205891609192, + "learning_rate": 7.825507603739666e-05, + "loss": 0.9453, + "step": 12812 + }, + { + "epoch": 2.2811609686609686, + "grad_norm": 0.9129060506820679, + "learning_rate": 7.824141371497948e-05, + "loss": 0.9324, + "step": 12813 + }, + { + "epoch": 2.2813390313390314, + "grad_norm": 0.947914183139801, + "learning_rate": 7.822775181887663e-05, + "loss": 0.8275, + "step": 12814 + }, + { + "epoch": 2.281517094017094, + 
"grad_norm": 0.7204358577728271, + "learning_rate": 7.821409034935576e-05, + "loss": 0.6538, + "step": 12815 + }, + { + "epoch": 2.281695156695157, + "grad_norm": 0.8021003603935242, + "learning_rate": 7.82004293066845e-05, + "loss": 1.0464, + "step": 12816 + }, + { + "epoch": 2.281873219373219, + "grad_norm": 0.9530314803123474, + "learning_rate": 7.818676869113059e-05, + "loss": 0.8854, + "step": 12817 + }, + { + "epoch": 2.282051282051282, + "grad_norm": 0.7932098507881165, + "learning_rate": 7.81731085029616e-05, + "loss": 0.8219, + "step": 12818 + }, + { + "epoch": 2.2822293447293447, + "grad_norm": 0.7955925464630127, + "learning_rate": 7.815944874244523e-05, + "loss": 0.801, + "step": 12819 + }, + { + "epoch": 2.2824074074074074, + "grad_norm": 0.8490158915519714, + "learning_rate": 7.814578940984907e-05, + "loss": 0.8666, + "step": 12820 + }, + { + "epoch": 2.28258547008547, + "grad_norm": 0.7325232028961182, + "learning_rate": 7.813213050544081e-05, + "loss": 0.9579, + "step": 12821 + }, + { + "epoch": 2.282763532763533, + "grad_norm": 0.9203488230705261, + "learning_rate": 7.811847202948798e-05, + "loss": 1.0581, + "step": 12822 + }, + { + "epoch": 2.2829415954415953, + "grad_norm": 0.8207429647445679, + "learning_rate": 7.810481398225827e-05, + "loss": 0.8613, + "step": 12823 + }, + { + "epoch": 2.283119658119658, + "grad_norm": 0.872207522392273, + "learning_rate": 7.809115636401921e-05, + "loss": 0.9155, + "step": 12824 + }, + { + "epoch": 2.2832977207977208, + "grad_norm": 0.8032099604606628, + "learning_rate": 7.807749917503845e-05, + "loss": 0.8294, + "step": 12825 + }, + { + "epoch": 2.2834757834757835, + "grad_norm": 0.8824980854988098, + "learning_rate": 7.806384241558354e-05, + "loss": 0.8618, + "step": 12826 + }, + { + "epoch": 2.2836538461538463, + "grad_norm": 0.9057566523551941, + "learning_rate": 7.805018608592212e-05, + "loss": 0.826, + "step": 12827 + }, + { + "epoch": 2.283831908831909, + "grad_norm": 0.8092000484466553, + 
"learning_rate": 7.803653018632164e-05, + "loss": 0.8091, + "step": 12828 + }, + { + "epoch": 2.2840099715099713, + "grad_norm": 0.8372754454612732, + "learning_rate": 7.802287471704976e-05, + "loss": 1.108, + "step": 12829 + }, + { + "epoch": 2.284188034188034, + "grad_norm": 0.8702181577682495, + "learning_rate": 7.800921967837398e-05, + "loss": 0.9654, + "step": 12830 + }, + { + "epoch": 2.284366096866097, + "grad_norm": 0.9543859958648682, + "learning_rate": 7.79955650705619e-05, + "loss": 0.9268, + "step": 12831 + }, + { + "epoch": 2.2845441595441596, + "grad_norm": 0.7992038726806641, + "learning_rate": 7.798191089388096e-05, + "loss": 0.8299, + "step": 12832 + }, + { + "epoch": 2.2847222222222223, + "grad_norm": 0.8655165433883667, + "learning_rate": 7.796825714859874e-05, + "loss": 0.9656, + "step": 12833 + }, + { + "epoch": 2.284900284900285, + "grad_norm": 0.9013311862945557, + "learning_rate": 7.795460383498281e-05, + "loss": 0.9373, + "step": 12834 + }, + { + "epoch": 2.285078347578348, + "grad_norm": 0.8453806638717651, + "learning_rate": 7.794095095330058e-05, + "loss": 0.7711, + "step": 12835 + }, + { + "epoch": 2.28525641025641, + "grad_norm": 0.8016965985298157, + "learning_rate": 7.792729850381959e-05, + "loss": 0.7492, + "step": 12836 + }, + { + "epoch": 2.285434472934473, + "grad_norm": 0.7191343307495117, + "learning_rate": 7.791364648680734e-05, + "loss": 0.7541, + "step": 12837 + }, + { + "epoch": 2.2856125356125356, + "grad_norm": 0.8210958242416382, + "learning_rate": 7.789999490253133e-05, + "loss": 0.7448, + "step": 12838 + }, + { + "epoch": 2.2857905982905984, + "grad_norm": 0.904022216796875, + "learning_rate": 7.788634375125898e-05, + "loss": 1.0329, + "step": 12839 + }, + { + "epoch": 2.285968660968661, + "grad_norm": 0.8934714794158936, + "learning_rate": 7.787269303325779e-05, + "loss": 0.8982, + "step": 12840 + }, + { + "epoch": 2.2861467236467234, + "grad_norm": 0.9424307942390442, + "learning_rate": 7.785904274879521e-05, + 
"loss": 1.0298, + "step": 12841 + }, + { + "epoch": 2.286324786324786, + "grad_norm": 0.7753969430923462, + "learning_rate": 7.784539289813873e-05, + "loss": 0.7811, + "step": 12842 + }, + { + "epoch": 2.286502849002849, + "grad_norm": 0.7851901054382324, + "learning_rate": 7.78317434815557e-05, + "loss": 0.8395, + "step": 12843 + }, + { + "epoch": 2.2866809116809117, + "grad_norm": 0.7734000086784363, + "learning_rate": 7.781809449931365e-05, + "loss": 0.6572, + "step": 12844 + }, + { + "epoch": 2.2868589743589745, + "grad_norm": 0.8322952389717102, + "learning_rate": 7.780444595167992e-05, + "loss": 0.9707, + "step": 12845 + }, + { + "epoch": 2.287037037037037, + "grad_norm": 0.8243176341056824, + "learning_rate": 7.779079783892203e-05, + "loss": 0.8413, + "step": 12846 + }, + { + "epoch": 2.2872150997151, + "grad_norm": 0.8600375056266785, + "learning_rate": 7.777715016130727e-05, + "loss": 0.8471, + "step": 12847 + }, + { + "epoch": 2.2873931623931623, + "grad_norm": 0.9846388101577759, + "learning_rate": 7.776350291910311e-05, + "loss": 1.0187, + "step": 12848 + }, + { + "epoch": 2.287571225071225, + "grad_norm": 0.8445034623146057, + "learning_rate": 7.774985611257688e-05, + "loss": 0.9113, + "step": 12849 + }, + { + "epoch": 2.2877492877492878, + "grad_norm": 0.804595947265625, + "learning_rate": 7.773620974199604e-05, + "loss": 0.8331, + "step": 12850 + }, + { + "epoch": 2.2879273504273505, + "grad_norm": 0.7600802779197693, + "learning_rate": 7.772256380762789e-05, + "loss": 0.8448, + "step": 12851 + }, + { + "epoch": 2.2881054131054133, + "grad_norm": 0.7406377792358398, + "learning_rate": 7.770891830973984e-05, + "loss": 0.7904, + "step": 12852 + }, + { + "epoch": 2.2882834757834756, + "grad_norm": 0.7294487357139587, + "learning_rate": 7.769527324859924e-05, + "loss": 0.8799, + "step": 12853 + }, + { + "epoch": 2.2884615384615383, + "grad_norm": 0.8864750266075134, + "learning_rate": 7.768162862447342e-05, + "loss": 0.9038, + "step": 12854 + }, + { + 
"epoch": 2.288639601139601, + "grad_norm": 0.8933553099632263, + "learning_rate": 7.766798443762972e-05, + "loss": 0.929, + "step": 12855 + }, + { + "epoch": 2.288817663817664, + "grad_norm": 0.8065192103385925, + "learning_rate": 7.765434068833545e-05, + "loss": 0.9335, + "step": 12856 + }, + { + "epoch": 2.2889957264957266, + "grad_norm": 0.8644578456878662, + "learning_rate": 7.764069737685802e-05, + "loss": 0.7717, + "step": 12857 + }, + { + "epoch": 2.2891737891737893, + "grad_norm": 0.8957899212837219, + "learning_rate": 7.762705450346462e-05, + "loss": 0.8625, + "step": 12858 + }, + { + "epoch": 2.289351851851852, + "grad_norm": 0.7164827585220337, + "learning_rate": 7.761341206842265e-05, + "loss": 0.8018, + "step": 12859 + }, + { + "epoch": 2.2895299145299144, + "grad_norm": 0.8752971291542053, + "learning_rate": 7.759977007199933e-05, + "loss": 0.8517, + "step": 12860 + }, + { + "epoch": 2.289707977207977, + "grad_norm": 0.8448139429092407, + "learning_rate": 7.758612851446201e-05, + "loss": 1.0449, + "step": 12861 + }, + { + "epoch": 2.28988603988604, + "grad_norm": 0.81675785779953, + "learning_rate": 7.75724873960779e-05, + "loss": 1.0952, + "step": 12862 + }, + { + "epoch": 2.2900641025641026, + "grad_norm": 0.8215656876564026, + "learning_rate": 7.755884671711437e-05, + "loss": 0.8419, + "step": 12863 + }, + { + "epoch": 2.2902421652421654, + "grad_norm": 0.8270167708396912, + "learning_rate": 7.754520647783859e-05, + "loss": 0.9065, + "step": 12864 + }, + { + "epoch": 2.2904202279202277, + "grad_norm": 0.8222723603248596, + "learning_rate": 7.753156667851784e-05, + "loss": 0.8536, + "step": 12865 + }, + { + "epoch": 2.2905982905982905, + "grad_norm": 0.8383764028549194, + "learning_rate": 7.751792731941936e-05, + "loss": 0.8829, + "step": 12866 + }, + { + "epoch": 2.290776353276353, + "grad_norm": 0.8115772008895874, + "learning_rate": 7.750428840081043e-05, + "loss": 0.8969, + "step": 12867 + }, + { + "epoch": 2.290954415954416, + "grad_norm": 
0.8721897602081299, + "learning_rate": 7.74906499229582e-05, + "loss": 1.031, + "step": 12868 + }, + { + "epoch": 2.2911324786324787, + "grad_norm": 0.6958467364311218, + "learning_rate": 7.747701188612996e-05, + "loss": 0.7528, + "step": 12869 + }, + { + "epoch": 2.2913105413105415, + "grad_norm": 0.8352338671684265, + "learning_rate": 7.746337429059285e-05, + "loss": 0.9297, + "step": 12870 + }, + { + "epoch": 2.291488603988604, + "grad_norm": 0.8407408595085144, + "learning_rate": 7.744973713661411e-05, + "loss": 0.8209, + "step": 12871 + }, + { + "epoch": 2.2916666666666665, + "grad_norm": 0.9509777426719666, + "learning_rate": 7.743610042446092e-05, + "loss": 0.9408, + "step": 12872 + }, + { + "epoch": 2.2918447293447293, + "grad_norm": 0.7913112640380859, + "learning_rate": 7.742246415440048e-05, + "loss": 0.9063, + "step": 12873 + }, + { + "epoch": 2.292022792022792, + "grad_norm": 0.90866619348526, + "learning_rate": 7.740882832669998e-05, + "loss": 1.0178, + "step": 12874 + }, + { + "epoch": 2.2922008547008548, + "grad_norm": 0.5832980871200562, + "learning_rate": 7.739519294162652e-05, + "loss": 0.4138, + "step": 12875 + }, + { + "epoch": 2.2923789173789175, + "grad_norm": 0.717993974685669, + "learning_rate": 7.738155799944732e-05, + "loss": 0.7303, + "step": 12876 + }, + { + "epoch": 2.29255698005698, + "grad_norm": 0.7821396589279175, + "learning_rate": 7.736792350042948e-05, + "loss": 0.829, + "step": 12877 + }, + { + "epoch": 2.2927350427350426, + "grad_norm": 0.8877809047698975, + "learning_rate": 7.735428944484021e-05, + "loss": 0.8883, + "step": 12878 + }, + { + "epoch": 2.2929131054131053, + "grad_norm": 0.7754776477813721, + "learning_rate": 7.734065583294656e-05, + "loss": 0.807, + "step": 12879 + }, + { + "epoch": 2.293091168091168, + "grad_norm": 0.851157009601593, + "learning_rate": 7.73270226650157e-05, + "loss": 0.8859, + "step": 12880 + }, + { + "epoch": 2.293269230769231, + "grad_norm": 0.7635365128517151, + "learning_rate": 
7.731338994131472e-05, + "loss": 0.9796, + "step": 12881 + }, + { + "epoch": 2.2934472934472936, + "grad_norm": 0.8386050462722778, + "learning_rate": 7.729975766211078e-05, + "loss": 0.788, + "step": 12882 + }, + { + "epoch": 2.2936253561253563, + "grad_norm": 0.7092825174331665, + "learning_rate": 7.728612582767088e-05, + "loss": 0.6855, + "step": 12883 + }, + { + "epoch": 2.2938034188034186, + "grad_norm": 0.8651222586631775, + "learning_rate": 7.72724944382622e-05, + "loss": 0.8875, + "step": 12884 + }, + { + "epoch": 2.2939814814814814, + "grad_norm": 0.89743572473526, + "learning_rate": 7.725886349415175e-05, + "loss": 0.9256, + "step": 12885 + }, + { + "epoch": 2.294159544159544, + "grad_norm": 0.8257600665092468, + "learning_rate": 7.724523299560664e-05, + "loss": 0.6703, + "step": 12886 + }, + { + "epoch": 2.294337606837607, + "grad_norm": 0.8133751153945923, + "learning_rate": 7.72316029428939e-05, + "loss": 0.8991, + "step": 12887 + }, + { + "epoch": 2.2945156695156697, + "grad_norm": 0.7874962687492371, + "learning_rate": 7.721797333628065e-05, + "loss": 0.8679, + "step": 12888 + }, + { + "epoch": 2.294693732193732, + "grad_norm": 0.8284404277801514, + "learning_rate": 7.720434417603384e-05, + "loss": 0.873, + "step": 12889 + }, + { + "epoch": 2.2948717948717947, + "grad_norm": 0.8751698136329651, + "learning_rate": 7.719071546242058e-05, + "loss": 1.0671, + "step": 12890 + }, + { + "epoch": 2.2950498575498575, + "grad_norm": 0.9355120062828064, + "learning_rate": 7.717708719570784e-05, + "loss": 0.93, + "step": 12891 + }, + { + "epoch": 2.29522792022792, + "grad_norm": 0.8643141984939575, + "learning_rate": 7.716345937616267e-05, + "loss": 0.7635, + "step": 12892 + }, + { + "epoch": 2.295405982905983, + "grad_norm": 0.9343852996826172, + "learning_rate": 7.714983200405212e-05, + "loss": 1.0624, + "step": 12893 + }, + { + "epoch": 2.2955840455840457, + "grad_norm": 0.893825352191925, + "learning_rate": 7.71362050796431e-05, + "loss": 1.0843, + "step": 
12894 + }, + { + "epoch": 2.2957621082621085, + "grad_norm": 0.920723021030426, + "learning_rate": 7.712257860320269e-05, + "loss": 0.9681, + "step": 12895 + }, + { + "epoch": 2.2959401709401708, + "grad_norm": 0.9275181293487549, + "learning_rate": 7.710895257499778e-05, + "loss": 0.8904, + "step": 12896 + }, + { + "epoch": 2.2961182336182335, + "grad_norm": 0.9343428611755371, + "learning_rate": 7.709532699529543e-05, + "loss": 0.9338, + "step": 12897 + }, + { + "epoch": 2.2962962962962963, + "grad_norm": 0.7457774877548218, + "learning_rate": 7.708170186436252e-05, + "loss": 0.6521, + "step": 12898 + }, + { + "epoch": 2.296474358974359, + "grad_norm": 0.7977834343910217, + "learning_rate": 7.706807718246611e-05, + "loss": 0.887, + "step": 12899 + }, + { + "epoch": 2.296652421652422, + "grad_norm": 0.774459719657898, + "learning_rate": 7.705445294987304e-05, + "loss": 0.914, + "step": 12900 + }, + { + "epoch": 2.296830484330484, + "grad_norm": 0.8464851379394531, + "learning_rate": 7.704082916685034e-05, + "loss": 1.0116, + "step": 12901 + }, + { + "epoch": 2.297008547008547, + "grad_norm": 0.8497290015220642, + "learning_rate": 7.702720583366486e-05, + "loss": 0.9242, + "step": 12902 + }, + { + "epoch": 2.2971866096866096, + "grad_norm": 0.8673670291900635, + "learning_rate": 7.70135829505836e-05, + "loss": 0.8172, + "step": 12903 + }, + { + "epoch": 2.2973646723646723, + "grad_norm": 0.786389172077179, + "learning_rate": 7.699996051787341e-05, + "loss": 0.6713, + "step": 12904 + }, + { + "epoch": 2.297542735042735, + "grad_norm": 0.8441919088363647, + "learning_rate": 7.698633853580124e-05, + "loss": 0.7835, + "step": 12905 + }, + { + "epoch": 2.297720797720798, + "grad_norm": 0.8806493878364563, + "learning_rate": 7.697271700463392e-05, + "loss": 0.9103, + "step": 12906 + }, + { + "epoch": 2.2978988603988606, + "grad_norm": 0.7418580651283264, + "learning_rate": 7.69590959246384e-05, + "loss": 0.9052, + "step": 12907 + }, + { + "epoch": 2.298076923076923, + 
"grad_norm": 0.7883853316307068, + "learning_rate": 7.694547529608152e-05, + "loss": 0.7689, + "step": 12908 + }, + { + "epoch": 2.2982549857549857, + "grad_norm": 0.7842690944671631, + "learning_rate": 7.693185511923017e-05, + "loss": 0.9587, + "step": 12909 + }, + { + "epoch": 2.2984330484330484, + "grad_norm": 0.884484052658081, + "learning_rate": 7.691823539435119e-05, + "loss": 0.9562, + "step": 12910 + }, + { + "epoch": 2.298611111111111, + "grad_norm": 0.8152852058410645, + "learning_rate": 7.690461612171145e-05, + "loss": 0.9857, + "step": 12911 + }, + { + "epoch": 2.298789173789174, + "grad_norm": 0.8502064943313599, + "learning_rate": 7.689099730157776e-05, + "loss": 0.7806, + "step": 12912 + }, + { + "epoch": 2.298967236467236, + "grad_norm": 0.9655177593231201, + "learning_rate": 7.687737893421697e-05, + "loss": 0.9693, + "step": 12913 + }, + { + "epoch": 2.299145299145299, + "grad_norm": 0.7759003639221191, + "learning_rate": 7.686376101989596e-05, + "loss": 0.9137, + "step": 12914 + }, + { + "epoch": 2.2993233618233617, + "grad_norm": 0.6987054944038391, + "learning_rate": 7.685014355888143e-05, + "loss": 0.9026, + "step": 12915 + }, + { + "epoch": 2.2995014245014245, + "grad_norm": 0.762819230556488, + "learning_rate": 7.683652655144027e-05, + "loss": 0.8358, + "step": 12916 + }, + { + "epoch": 2.2996794871794872, + "grad_norm": 0.8233383893966675, + "learning_rate": 7.682290999783924e-05, + "loss": 0.8468, + "step": 12917 + }, + { + "epoch": 2.29985754985755, + "grad_norm": 0.8558689951896667, + "learning_rate": 7.68092938983452e-05, + "loss": 0.9018, + "step": 12918 + }, + { + "epoch": 2.3000356125356127, + "grad_norm": 0.741760790348053, + "learning_rate": 7.67956782532248e-05, + "loss": 0.7968, + "step": 12919 + }, + { + "epoch": 2.300213675213675, + "grad_norm": 0.9132583737373352, + "learning_rate": 7.678206306274495e-05, + "loss": 0.9952, + "step": 12920 + }, + { + "epoch": 2.300391737891738, + "grad_norm": 0.7656551003456116, + 
"learning_rate": 7.67684483271723e-05, + "loss": 0.8772, + "step": 12921 + }, + { + "epoch": 2.3005698005698005, + "grad_norm": 0.7407111525535583, + "learning_rate": 7.675483404677364e-05, + "loss": 0.8199, + "step": 12922 + }, + { + "epoch": 2.3007478632478633, + "grad_norm": 0.9602083563804626, + "learning_rate": 7.674122022181571e-05, + "loss": 1.0837, + "step": 12923 + }, + { + "epoch": 2.300925925925926, + "grad_norm": 0.7562392354011536, + "learning_rate": 7.672760685256531e-05, + "loss": 0.8148, + "step": 12924 + }, + { + "epoch": 2.301103988603989, + "grad_norm": 0.9260091185569763, + "learning_rate": 7.671399393928906e-05, + "loss": 0.9508, + "step": 12925 + }, + { + "epoch": 2.301282051282051, + "grad_norm": 0.8745924234390259, + "learning_rate": 7.670038148225374e-05, + "loss": 0.8688, + "step": 12926 + }, + { + "epoch": 2.301460113960114, + "grad_norm": 0.7802116274833679, + "learning_rate": 7.668676948172602e-05, + "loss": 0.7698, + "step": 12927 + }, + { + "epoch": 2.3016381766381766, + "grad_norm": 0.7701709866523743, + "learning_rate": 7.667315793797268e-05, + "loss": 0.7633, + "step": 12928 + }, + { + "epoch": 2.3018162393162394, + "grad_norm": 0.8084021806716919, + "learning_rate": 7.66595468512603e-05, + "loss": 0.8502, + "step": 12929 + }, + { + "epoch": 2.301994301994302, + "grad_norm": 1.0485330820083618, + "learning_rate": 7.664593622185568e-05, + "loss": 0.8049, + "step": 12930 + }, + { + "epoch": 2.302172364672365, + "grad_norm": 0.7852743864059448, + "learning_rate": 7.663232605002535e-05, + "loss": 0.882, + "step": 12931 + }, + { + "epoch": 2.302350427350427, + "grad_norm": 0.7795702815055847, + "learning_rate": 7.661871633603607e-05, + "loss": 0.7841, + "step": 12932 + }, + { + "epoch": 2.30252849002849, + "grad_norm": 0.8882975578308105, + "learning_rate": 7.660510708015448e-05, + "loss": 1.117, + "step": 12933 + }, + { + "epoch": 2.3027065527065527, + "grad_norm": 0.7987662553787231, + "learning_rate": 7.65914982826472e-05, + "loss": 
0.8552, + "step": 12934 + }, + { + "epoch": 2.3028846153846154, + "grad_norm": 0.8141679167747498, + "learning_rate": 7.657788994378095e-05, + "loss": 0.8288, + "step": 12935 + }, + { + "epoch": 2.303062678062678, + "grad_norm": 0.8506320118904114, + "learning_rate": 7.656428206382222e-05, + "loss": 0.7521, + "step": 12936 + }, + { + "epoch": 2.303240740740741, + "grad_norm": 0.7666227221488953, + "learning_rate": 7.655067464303773e-05, + "loss": 0.8394, + "step": 12937 + }, + { + "epoch": 2.3034188034188032, + "grad_norm": 0.8018062710762024, + "learning_rate": 7.653706768169405e-05, + "loss": 0.7566, + "step": 12938 + }, + { + "epoch": 2.303596866096866, + "grad_norm": 0.8054059743881226, + "learning_rate": 7.652346118005779e-05, + "loss": 0.8749, + "step": 12939 + }, + { + "epoch": 2.3037749287749287, + "grad_norm": 0.8663263320922852, + "learning_rate": 7.650985513839554e-05, + "loss": 0.7799, + "step": 12940 + }, + { + "epoch": 2.3039529914529915, + "grad_norm": 0.7591161727905273, + "learning_rate": 7.64962495569739e-05, + "loss": 0.7378, + "step": 12941 + }, + { + "epoch": 2.3041310541310542, + "grad_norm": 0.8118969202041626, + "learning_rate": 7.64826444360594e-05, + "loss": 0.7948, + "step": 12942 + }, + { + "epoch": 2.304309116809117, + "grad_norm": 0.6880847811698914, + "learning_rate": 7.646903977591865e-05, + "loss": 0.9164, + "step": 12943 + }, + { + "epoch": 2.3044871794871793, + "grad_norm": 0.814386248588562, + "learning_rate": 7.645543557681816e-05, + "loss": 0.7998, + "step": 12944 + }, + { + "epoch": 2.304665242165242, + "grad_norm": 0.8295530676841736, + "learning_rate": 7.644183183902454e-05, + "loss": 0.812, + "step": 12945 + }, + { + "epoch": 2.304843304843305, + "grad_norm": 0.7872505187988281, + "learning_rate": 7.642822856280424e-05, + "loss": 0.9073, + "step": 12946 + }, + { + "epoch": 2.3050213675213675, + "grad_norm": 0.9217497110366821, + "learning_rate": 7.641462574842387e-05, + "loss": 0.7762, + "step": 12947 + }, + { + "epoch": 
2.3051994301994303, + "grad_norm": 0.7502169609069824, + "learning_rate": 7.640102339614987e-05, + "loss": 0.9374, + "step": 12948 + }, + { + "epoch": 2.305377492877493, + "grad_norm": 0.8262767195701599, + "learning_rate": 7.638742150624886e-05, + "loss": 0.5363, + "step": 12949 + }, + { + "epoch": 2.3055555555555554, + "grad_norm": 0.7571384310722351, + "learning_rate": 7.637382007898722e-05, + "loss": 0.9548, + "step": 12950 + }, + { + "epoch": 2.305733618233618, + "grad_norm": 0.7899317145347595, + "learning_rate": 7.636021911463152e-05, + "loss": 0.7718, + "step": 12951 + }, + { + "epoch": 2.305911680911681, + "grad_norm": 0.7772458791732788, + "learning_rate": 7.634661861344819e-05, + "loss": 0.7158, + "step": 12952 + }, + { + "epoch": 2.3060897435897436, + "grad_norm": 0.8279168009757996, + "learning_rate": 7.633301857570374e-05, + "loss": 0.7835, + "step": 12953 + }, + { + "epoch": 2.3062678062678064, + "grad_norm": 0.751268208026886, + "learning_rate": 7.631941900166468e-05, + "loss": 0.8609, + "step": 12954 + }, + { + "epoch": 2.306445868945869, + "grad_norm": 0.8101294636726379, + "learning_rate": 7.630581989159736e-05, + "loss": 1.0242, + "step": 12955 + }, + { + "epoch": 2.306623931623932, + "grad_norm": 0.7707645297050476, + "learning_rate": 7.629222124576831e-05, + "loss": 0.7969, + "step": 12956 + }, + { + "epoch": 2.306801994301994, + "grad_norm": 0.6519944667816162, + "learning_rate": 7.627862306444391e-05, + "loss": 0.5459, + "step": 12957 + }, + { + "epoch": 2.306980056980057, + "grad_norm": 0.7738897800445557, + "learning_rate": 7.626502534789063e-05, + "loss": 0.957, + "step": 12958 + }, + { + "epoch": 2.3071581196581197, + "grad_norm": 0.7059842944145203, + "learning_rate": 7.625142809637485e-05, + "loss": 0.6316, + "step": 12959 + }, + { + "epoch": 2.3073361823361824, + "grad_norm": 0.8380797505378723, + "learning_rate": 7.623783131016305e-05, + "loss": 0.9685, + "step": 12960 + }, + { + "epoch": 2.307514245014245, + "grad_norm": 
0.8272121548652649, + "learning_rate": 7.622423498952154e-05, + "loss": 0.9425, + "step": 12961 + }, + { + "epoch": 2.3076923076923075, + "grad_norm": 0.763522744178772, + "learning_rate": 7.621063913471678e-05, + "loss": 0.7778, + "step": 12962 + }, + { + "epoch": 2.3078703703703702, + "grad_norm": 0.8345584273338318, + "learning_rate": 7.61970437460151e-05, + "loss": 0.9652, + "step": 12963 + }, + { + "epoch": 2.308048433048433, + "grad_norm": 0.943286657333374, + "learning_rate": 7.618344882368294e-05, + "loss": 0.9088, + "step": 12964 + }, + { + "epoch": 2.3082264957264957, + "grad_norm": 0.8568450212478638, + "learning_rate": 7.616985436798659e-05, + "loss": 0.7535, + "step": 12965 + }, + { + "epoch": 2.3084045584045585, + "grad_norm": 0.8722548484802246, + "learning_rate": 7.615626037919248e-05, + "loss": 0.9802, + "step": 12966 + }, + { + "epoch": 2.3085826210826212, + "grad_norm": 1.0332363843917847, + "learning_rate": 7.614266685756688e-05, + "loss": 0.9105, + "step": 12967 + }, + { + "epoch": 2.308760683760684, + "grad_norm": 0.7503480315208435, + "learning_rate": 7.612907380337619e-05, + "loss": 0.7345, + "step": 12968 + }, + { + "epoch": 2.3089387464387463, + "grad_norm": 0.7406014204025269, + "learning_rate": 7.611548121688668e-05, + "loss": 0.9222, + "step": 12969 + }, + { + "epoch": 2.309116809116809, + "grad_norm": 0.7574487328529358, + "learning_rate": 7.610188909836474e-05, + "loss": 0.7709, + "step": 12970 + }, + { + "epoch": 2.309294871794872, + "grad_norm": 0.8669037818908691, + "learning_rate": 7.608829744807661e-05, + "loss": 0.838, + "step": 12971 + }, + { + "epoch": 2.3094729344729346, + "grad_norm": 0.7544569373130798, + "learning_rate": 7.607470626628861e-05, + "loss": 0.6966, + "step": 12972 + }, + { + "epoch": 2.3096509971509973, + "grad_norm": 0.8201249241828918, + "learning_rate": 7.606111555326706e-05, + "loss": 0.9322, + "step": 12973 + }, + { + "epoch": 2.3098290598290596, + "grad_norm": 0.7935477495193481, + "learning_rate": 
7.60475253092782e-05, + "loss": 0.7981, + "step": 12974 + }, + { + "epoch": 2.3100071225071224, + "grad_norm": 0.8775026798248291, + "learning_rate": 7.603393553458838e-05, + "loss": 0.8352, + "step": 12975 + }, + { + "epoch": 2.310185185185185, + "grad_norm": 0.8422450423240662, + "learning_rate": 7.602034622946374e-05, + "loss": 0.577, + "step": 12976 + }, + { + "epoch": 2.310363247863248, + "grad_norm": 0.8584204316139221, + "learning_rate": 7.600675739417067e-05, + "loss": 0.9767, + "step": 12977 + }, + { + "epoch": 2.3105413105413106, + "grad_norm": 0.7818547487258911, + "learning_rate": 7.599316902897528e-05, + "loss": 0.7675, + "step": 12978 + }, + { + "epoch": 2.3107193732193734, + "grad_norm": 0.93815016746521, + "learning_rate": 7.597958113414391e-05, + "loss": 0.8517, + "step": 12979 + }, + { + "epoch": 2.310897435897436, + "grad_norm": 0.8092408776283264, + "learning_rate": 7.596599370994272e-05, + "loss": 0.7266, + "step": 12980 + }, + { + "epoch": 2.3110754985754984, + "grad_norm": 0.8577243089675903, + "learning_rate": 7.595240675663802e-05, + "loss": 0.9138, + "step": 12981 + }, + { + "epoch": 2.311253561253561, + "grad_norm": 0.8600401878356934, + "learning_rate": 7.59388202744959e-05, + "loss": 0.8348, + "step": 12982 + }, + { + "epoch": 2.311431623931624, + "grad_norm": 0.6399564743041992, + "learning_rate": 7.592523426378264e-05, + "loss": 0.6649, + "step": 12983 + }, + { + "epoch": 2.3116096866096867, + "grad_norm": 0.7916820049285889, + "learning_rate": 7.591164872476438e-05, + "loss": 0.8048, + "step": 12984 + }, + { + "epoch": 2.3117877492877494, + "grad_norm": 0.7748355269432068, + "learning_rate": 7.589806365770736e-05, + "loss": 1.0101, + "step": 12985 + }, + { + "epoch": 2.3119658119658117, + "grad_norm": 0.8463436365127563, + "learning_rate": 7.588447906287767e-05, + "loss": 0.7547, + "step": 12986 + }, + { + "epoch": 2.3121438746438745, + "grad_norm": 0.8257808685302734, + "learning_rate": 7.587089494054155e-05, + "loss": 0.8093, + 
"step": 12987 + }, + { + "epoch": 2.3123219373219372, + "grad_norm": 0.843781054019928, + "learning_rate": 7.58573112909651e-05, + "loss": 0.8379, + "step": 12988 + }, + { + "epoch": 2.3125, + "grad_norm": 0.8782341480255127, + "learning_rate": 7.584372811441452e-05, + "loss": 0.9372, + "step": 12989 + }, + { + "epoch": 2.3126780626780628, + "grad_norm": 0.8465158343315125, + "learning_rate": 7.583014541115585e-05, + "loss": 0.8427, + "step": 12990 + }, + { + "epoch": 2.3128561253561255, + "grad_norm": 0.7140238285064697, + "learning_rate": 7.58165631814553e-05, + "loss": 0.7896, + "step": 12991 + }, + { + "epoch": 2.3130341880341883, + "grad_norm": 0.9414699077606201, + "learning_rate": 7.580298142557898e-05, + "loss": 1.0464, + "step": 12992 + }, + { + "epoch": 2.3132122507122506, + "grad_norm": 0.7970326542854309, + "learning_rate": 7.578940014379293e-05, + "loss": 0.696, + "step": 12993 + }, + { + "epoch": 2.3133903133903133, + "grad_norm": 0.7377375960350037, + "learning_rate": 7.577581933636332e-05, + "loss": 0.7205, + "step": 12994 + }, + { + "epoch": 2.313568376068376, + "grad_norm": 0.7740316987037659, + "learning_rate": 7.576223900355619e-05, + "loss": 0.7448, + "step": 12995 + }, + { + "epoch": 2.313746438746439, + "grad_norm": 0.7820385098457336, + "learning_rate": 7.574865914563767e-05, + "loss": 0.7289, + "step": 12996 + }, + { + "epoch": 2.3139245014245016, + "grad_norm": 0.8180822134017944, + "learning_rate": 7.573507976287376e-05, + "loss": 0.8709, + "step": 12997 + }, + { + "epoch": 2.314102564102564, + "grad_norm": 0.9008440971374512, + "learning_rate": 7.572150085553058e-05, + "loss": 0.7938, + "step": 12998 + }, + { + "epoch": 2.3142806267806266, + "grad_norm": 0.786400318145752, + "learning_rate": 7.570792242387414e-05, + "loss": 0.9866, + "step": 12999 + }, + { + "epoch": 2.3144586894586894, + "grad_norm": 0.872160792350769, + "learning_rate": 7.569434446817052e-05, + "loss": 0.7319, + "step": 13000 + }, + { + "epoch": 2.314636752136752, + 
"grad_norm": 0.7858988642692566, + "learning_rate": 7.56807669886857e-05, + "loss": 0.8786, + "step": 13001 + }, + { + "epoch": 2.314814814814815, + "grad_norm": 0.7090579271316528, + "learning_rate": 7.566718998568579e-05, + "loss": 0.7092, + "step": 13002 + }, + { + "epoch": 2.3149928774928776, + "grad_norm": 0.7881498336791992, + "learning_rate": 7.565361345943668e-05, + "loss": 0.876, + "step": 13003 + }, + { + "epoch": 2.3151709401709404, + "grad_norm": 0.8768819570541382, + "learning_rate": 7.564003741020447e-05, + "loss": 1.0374, + "step": 13004 + }, + { + "epoch": 2.3153490028490027, + "grad_norm": 0.7608295679092407, + "learning_rate": 7.56264618382551e-05, + "loss": 0.7551, + "step": 13005 + }, + { + "epoch": 2.3155270655270654, + "grad_norm": 0.6947942972183228, + "learning_rate": 7.561288674385462e-05, + "loss": 0.8132, + "step": 13006 + }, + { + "epoch": 2.315705128205128, + "grad_norm": 0.8722706437110901, + "learning_rate": 7.559931212726892e-05, + "loss": 0.7486, + "step": 13007 + }, + { + "epoch": 2.315883190883191, + "grad_norm": 0.9804681539535522, + "learning_rate": 7.558573798876404e-05, + "loss": 0.899, + "step": 13008 + }, + { + "epoch": 2.3160612535612537, + "grad_norm": 0.9246195554733276, + "learning_rate": 7.557216432860587e-05, + "loss": 0.7742, + "step": 13009 + }, + { + "epoch": 2.316239316239316, + "grad_norm": 0.8792895078659058, + "learning_rate": 7.555859114706046e-05, + "loss": 0.8299, + "step": 13010 + }, + { + "epoch": 2.3164173789173788, + "grad_norm": 0.8280500769615173, + "learning_rate": 7.554501844439362e-05, + "loss": 0.8708, + "step": 13011 + }, + { + "epoch": 2.3165954415954415, + "grad_norm": 0.8560570478439331, + "learning_rate": 7.553144622087136e-05, + "loss": 0.9571, + "step": 13012 + }, + { + "epoch": 2.3167735042735043, + "grad_norm": 0.8504697680473328, + "learning_rate": 7.551787447675962e-05, + "loss": 0.7609, + "step": 13013 + }, + { + "epoch": 2.316951566951567, + "grad_norm": 0.8199480772018433, + 
"learning_rate": 7.550430321232423e-05, + "loss": 0.7077, + "step": 13014 + }, + { + "epoch": 2.3171296296296298, + "grad_norm": 0.854341447353363, + "learning_rate": 7.549073242783115e-05, + "loss": 0.9602, + "step": 13015 + }, + { + "epoch": 2.3173076923076925, + "grad_norm": 0.7619566917419434, + "learning_rate": 7.547716212354623e-05, + "loss": 0.9967, + "step": 13016 + }, + { + "epoch": 2.317485754985755, + "grad_norm": 0.7371547222137451, + "learning_rate": 7.546359229973543e-05, + "loss": 0.7239, + "step": 13017 + }, + { + "epoch": 2.3176638176638176, + "grad_norm": 0.7797415852546692, + "learning_rate": 7.545002295666453e-05, + "loss": 0.7472, + "step": 13018 + }, + { + "epoch": 2.3178418803418803, + "grad_norm": 0.8549608588218689, + "learning_rate": 7.543645409459943e-05, + "loss": 0.8968, + "step": 13019 + }, + { + "epoch": 2.318019943019943, + "grad_norm": 0.7931239008903503, + "learning_rate": 7.542288571380598e-05, + "loss": 0.9853, + "step": 13020 + }, + { + "epoch": 2.318198005698006, + "grad_norm": 0.797726035118103, + "learning_rate": 7.540931781455008e-05, + "loss": 0.9366, + "step": 13021 + }, + { + "epoch": 2.318376068376068, + "grad_norm": 0.7382092475891113, + "learning_rate": 7.539575039709747e-05, + "loss": 0.6484, + "step": 13022 + }, + { + "epoch": 2.318554131054131, + "grad_norm": 0.83231121301651, + "learning_rate": 7.538218346171403e-05, + "loss": 1.0184, + "step": 13023 + }, + { + "epoch": 2.3187321937321936, + "grad_norm": 0.8613845109939575, + "learning_rate": 7.536861700866554e-05, + "loss": 0.8019, + "step": 13024 + }, + { + "epoch": 2.3189102564102564, + "grad_norm": 0.7736538648605347, + "learning_rate": 7.53550510382179e-05, + "loss": 0.8228, + "step": 13025 + }, + { + "epoch": 2.319088319088319, + "grad_norm": 0.7894346714019775, + "learning_rate": 7.534148555063678e-05, + "loss": 0.8189, + "step": 13026 + }, + { + "epoch": 2.319266381766382, + "grad_norm": 0.7333146333694458, + "learning_rate": 7.532792054618807e-05, + 
"loss": 0.8456, + "step": 13027 + }, + { + "epoch": 2.3194444444444446, + "grad_norm": 1.0321780443191528, + "learning_rate": 7.531435602513745e-05, + "loss": 0.8594, + "step": 13028 + }, + { + "epoch": 2.319622507122507, + "grad_norm": 0.8658601641654968, + "learning_rate": 7.530079198775079e-05, + "loss": 0.9638, + "step": 13029 + }, + { + "epoch": 2.3198005698005697, + "grad_norm": 0.7287920713424683, + "learning_rate": 7.528722843429376e-05, + "loss": 0.7001, + "step": 13030 + }, + { + "epoch": 2.3199786324786325, + "grad_norm": 0.7398431301116943, + "learning_rate": 7.527366536503218e-05, + "loss": 0.8595, + "step": 13031 + }, + { + "epoch": 2.320156695156695, + "grad_norm": 0.8127739429473877, + "learning_rate": 7.526010278023178e-05, + "loss": 0.7641, + "step": 13032 + }, + { + "epoch": 2.320334757834758, + "grad_norm": 0.776497483253479, + "learning_rate": 7.524654068015824e-05, + "loss": 0.8666, + "step": 13033 + }, + { + "epoch": 2.3205128205128207, + "grad_norm": 0.8524185419082642, + "learning_rate": 7.523297906507733e-05, + "loss": 0.9815, + "step": 13034 + }, + { + "epoch": 2.320690883190883, + "grad_norm": 0.7745016813278198, + "learning_rate": 7.521941793525474e-05, + "loss": 0.8527, + "step": 13035 + }, + { + "epoch": 2.3208689458689458, + "grad_norm": 0.8695911169052124, + "learning_rate": 7.52058572909562e-05, + "loss": 0.8881, + "step": 13036 + }, + { + "epoch": 2.3210470085470085, + "grad_norm": 0.788969099521637, + "learning_rate": 7.519229713244736e-05, + "loss": 0.7886, + "step": 13037 + }, + { + "epoch": 2.3212250712250713, + "grad_norm": 0.776520311832428, + "learning_rate": 7.517873745999394e-05, + "loss": 0.5986, + "step": 13038 + }, + { + "epoch": 2.321403133903134, + "grad_norm": 0.8118561506271362, + "learning_rate": 7.516517827386158e-05, + "loss": 0.8805, + "step": 13039 + }, + { + "epoch": 2.3215811965811968, + "grad_norm": 0.8859134912490845, + "learning_rate": 7.515161957431596e-05, + "loss": 0.8861, + "step": 13040 + }, + { + 
"epoch": 2.321759259259259, + "grad_norm": 0.8181297779083252, + "learning_rate": 7.513806136162273e-05, + "loss": 0.9015, + "step": 13041 + }, + { + "epoch": 2.321937321937322, + "grad_norm": 0.8488339185714722, + "learning_rate": 7.512450363604759e-05, + "loss": 1.0423, + "step": 13042 + }, + { + "epoch": 2.3221153846153846, + "grad_norm": 0.7755734920501709, + "learning_rate": 7.511094639785607e-05, + "loss": 0.7595, + "step": 13043 + }, + { + "epoch": 2.3222934472934473, + "grad_norm": 0.8437283635139465, + "learning_rate": 7.509738964731389e-05, + "loss": 0.9011, + "step": 13044 + }, + { + "epoch": 2.32247150997151, + "grad_norm": 0.7508310675621033, + "learning_rate": 7.508383338468659e-05, + "loss": 0.8335, + "step": 13045 + }, + { + "epoch": 2.322649572649573, + "grad_norm": 0.8001464605331421, + "learning_rate": 7.507027761023987e-05, + "loss": 0.9785, + "step": 13046 + }, + { + "epoch": 2.322827635327635, + "grad_norm": 0.8142531514167786, + "learning_rate": 7.505672232423923e-05, + "loss": 0.8577, + "step": 13047 + }, + { + "epoch": 2.323005698005698, + "grad_norm": 0.7852125763893127, + "learning_rate": 7.504316752695035e-05, + "loss": 0.798, + "step": 13048 + }, + { + "epoch": 2.3231837606837606, + "grad_norm": 0.8998631238937378, + "learning_rate": 7.502961321863871e-05, + "loss": 0.9291, + "step": 13049 + }, + { + "epoch": 2.3233618233618234, + "grad_norm": 0.8850175738334656, + "learning_rate": 7.501605939956995e-05, + "loss": 0.9536, + "step": 13050 + }, + { + "epoch": 2.323539886039886, + "grad_norm": 0.8305020928382874, + "learning_rate": 7.500250607000959e-05, + "loss": 0.8695, + "step": 13051 + }, + { + "epoch": 2.323717948717949, + "grad_norm": 0.8073359727859497, + "learning_rate": 7.498895323022317e-05, + "loss": 0.6831, + "step": 13052 + }, + { + "epoch": 2.323896011396011, + "grad_norm": 0.8435724973678589, + "learning_rate": 7.497540088047632e-05, + "loss": 0.9419, + "step": 13053 + }, + { + "epoch": 2.324074074074074, + "grad_norm": 
0.927147388458252, + "learning_rate": 7.496184902103446e-05, + "loss": 0.957, + "step": 13054 + }, + { + "epoch": 2.3242521367521367, + "grad_norm": 0.7923009395599365, + "learning_rate": 7.494829765216319e-05, + "loss": 0.839, + "step": 13055 + }, + { + "epoch": 2.3244301994301995, + "grad_norm": 0.7830277681350708, + "learning_rate": 7.493474677412794e-05, + "loss": 1.0236, + "step": 13056 + }, + { + "epoch": 2.324608262108262, + "grad_norm": 0.8470967411994934, + "learning_rate": 7.492119638719432e-05, + "loss": 0.9144, + "step": 13057 + }, + { + "epoch": 2.324786324786325, + "grad_norm": 0.7469272613525391, + "learning_rate": 7.490764649162771e-05, + "loss": 0.7101, + "step": 13058 + }, + { + "epoch": 2.3249643874643873, + "grad_norm": 0.9236082434654236, + "learning_rate": 7.489409708769366e-05, + "loss": 0.6658, + "step": 13059 + }, + { + "epoch": 2.32514245014245, + "grad_norm": 0.8271692395210266, + "learning_rate": 7.48805481756576e-05, + "loss": 0.775, + "step": 13060 + }, + { + "epoch": 2.3253205128205128, + "grad_norm": 0.9878279566764832, + "learning_rate": 7.486699975578507e-05, + "loss": 0.6881, + "step": 13061 + }, + { + "epoch": 2.3254985754985755, + "grad_norm": 0.7332003712654114, + "learning_rate": 7.485345182834142e-05, + "loss": 0.8384, + "step": 13062 + }, + { + "epoch": 2.3256766381766383, + "grad_norm": 0.9525214433670044, + "learning_rate": 7.483990439359221e-05, + "loss": 0.9892, + "step": 13063 + }, + { + "epoch": 2.325854700854701, + "grad_norm": 0.7413233518600464, + "learning_rate": 7.482635745180273e-05, + "loss": 0.8749, + "step": 13064 + }, + { + "epoch": 2.326032763532764, + "grad_norm": 0.8286891579627991, + "learning_rate": 7.481281100323854e-05, + "loss": 1.0313, + "step": 13065 + }, + { + "epoch": 2.326210826210826, + "grad_norm": 0.868653416633606, + "learning_rate": 7.479926504816495e-05, + "loss": 0.7407, + "step": 13066 + }, + { + "epoch": 2.326388888888889, + "grad_norm": 0.790052056312561, + "learning_rate": 
7.478571958684746e-05, + "loss": 0.7156, + "step": 13067 + }, + { + "epoch": 2.3265669515669516, + "grad_norm": 0.8799049854278564, + "learning_rate": 7.477217461955137e-05, + "loss": 0.7706, + "step": 13068 + }, + { + "epoch": 2.3267450142450143, + "grad_norm": 0.8246361017227173, + "learning_rate": 7.475863014654214e-05, + "loss": 0.9402, + "step": 13069 + }, + { + "epoch": 2.326923076923077, + "grad_norm": 0.870917022228241, + "learning_rate": 7.474508616808508e-05, + "loss": 0.962, + "step": 13070 + }, + { + "epoch": 2.3271011396011394, + "grad_norm": 0.8706079125404358, + "learning_rate": 7.473154268444563e-05, + "loss": 0.9094, + "step": 13071 + }, + { + "epoch": 2.327279202279202, + "grad_norm": 0.9031453728675842, + "learning_rate": 7.471799969588912e-05, + "loss": 0.8447, + "step": 13072 + }, + { + "epoch": 2.327457264957265, + "grad_norm": 0.9153435230255127, + "learning_rate": 7.470445720268086e-05, + "loss": 0.9935, + "step": 13073 + }, + { + "epoch": 2.3276353276353277, + "grad_norm": 0.8236302733421326, + "learning_rate": 7.469091520508624e-05, + "loss": 0.7911, + "step": 13074 + }, + { + "epoch": 2.3278133903133904, + "grad_norm": 0.7344710826873779, + "learning_rate": 7.467737370337054e-05, + "loss": 0.6565, + "step": 13075 + }, + { + "epoch": 2.327991452991453, + "grad_norm": 0.8711966276168823, + "learning_rate": 7.466383269779911e-05, + "loss": 0.8332, + "step": 13076 + }, + { + "epoch": 2.328169515669516, + "grad_norm": 0.836825966835022, + "learning_rate": 7.465029218863723e-05, + "loss": 0.9359, + "step": 13077 + }, + { + "epoch": 2.328347578347578, + "grad_norm": 0.9791260361671448, + "learning_rate": 7.463675217615024e-05, + "loss": 0.8938, + "step": 13078 + }, + { + "epoch": 2.328525641025641, + "grad_norm": 0.7260454893112183, + "learning_rate": 7.46232126606034e-05, + "loss": 0.6871, + "step": 13079 + }, + { + "epoch": 2.3287037037037037, + "grad_norm": 0.7887428998947144, + "learning_rate": 7.460967364226197e-05, + "loss": 0.8098, + 
"step": 13080 + }, + { + "epoch": 2.3288817663817665, + "grad_norm": 0.8303743004798889, + "learning_rate": 7.459613512139124e-05, + "loss": 0.8897, + "step": 13081 + }, + { + "epoch": 2.3290598290598292, + "grad_norm": 0.7933324575424194, + "learning_rate": 7.458259709825652e-05, + "loss": 0.754, + "step": 13082 + }, + { + "epoch": 2.3292378917378915, + "grad_norm": 0.8998779058456421, + "learning_rate": 7.456905957312296e-05, + "loss": 0.866, + "step": 13083 + }, + { + "epoch": 2.3294159544159543, + "grad_norm": 0.8205044269561768, + "learning_rate": 7.455552254625588e-05, + "loss": 0.7455, + "step": 13084 + }, + { + "epoch": 2.329594017094017, + "grad_norm": 0.8731769323348999, + "learning_rate": 7.454198601792046e-05, + "loss": 0.8876, + "step": 13085 + }, + { + "epoch": 2.32977207977208, + "grad_norm": 0.9183599352836609, + "learning_rate": 7.452844998838194e-05, + "loss": 1.0991, + "step": 13086 + }, + { + "epoch": 2.3299501424501425, + "grad_norm": 0.8820931315422058, + "learning_rate": 7.451491445790553e-05, + "loss": 0.7591, + "step": 13087 + }, + { + "epoch": 2.3301282051282053, + "grad_norm": 0.7837240099906921, + "learning_rate": 7.450137942675646e-05, + "loss": 0.792, + "step": 13088 + }, + { + "epoch": 2.330306267806268, + "grad_norm": 0.8960266709327698, + "learning_rate": 7.448784489519984e-05, + "loss": 0.9725, + "step": 13089 + }, + { + "epoch": 2.3304843304843303, + "grad_norm": 0.8010196685791016, + "learning_rate": 7.447431086350092e-05, + "loss": 0.8418, + "step": 13090 + }, + { + "epoch": 2.330662393162393, + "grad_norm": 0.8652680516242981, + "learning_rate": 7.446077733192486e-05, + "loss": 0.93, + "step": 13091 + }, + { + "epoch": 2.330840455840456, + "grad_norm": 0.9385902285575867, + "learning_rate": 7.44472443007368e-05, + "loss": 0.968, + "step": 13092 + }, + { + "epoch": 2.3310185185185186, + "grad_norm": 0.8097951412200928, + "learning_rate": 7.443371177020195e-05, + "loss": 0.8715, + "step": 13093 + }, + { + "epoch": 
2.3311965811965814, + "grad_norm": 0.7931473255157471, + "learning_rate": 7.442017974058537e-05, + "loss": 0.865, + "step": 13094 + }, + { + "epoch": 2.3313746438746437, + "grad_norm": 0.7680486440658569, + "learning_rate": 7.440664821215224e-05, + "loss": 0.9155, + "step": 13095 + }, + { + "epoch": 2.3315527065527064, + "grad_norm": 0.8128345012664795, + "learning_rate": 7.439311718516766e-05, + "loss": 0.8707, + "step": 13096 + }, + { + "epoch": 2.331730769230769, + "grad_norm": 0.9534463286399841, + "learning_rate": 7.43795866598968e-05, + "loss": 1.1102, + "step": 13097 + }, + { + "epoch": 2.331908831908832, + "grad_norm": 0.9140331745147705, + "learning_rate": 7.436605663660468e-05, + "loss": 0.7744, + "step": 13098 + }, + { + "epoch": 2.3320868945868947, + "grad_norm": 0.8316463828086853, + "learning_rate": 7.435252711555645e-05, + "loss": 0.7201, + "step": 13099 + }, + { + "epoch": 2.3322649572649574, + "grad_norm": 0.7714298963546753, + "learning_rate": 7.433899809701714e-05, + "loss": 0.9111, + "step": 13100 + }, + { + "epoch": 2.33244301994302, + "grad_norm": 0.999081552028656, + "learning_rate": 7.432546958125188e-05, + "loss": 0.8839, + "step": 13101 + }, + { + "epoch": 2.3326210826210825, + "grad_norm": 0.8515602350234985, + "learning_rate": 7.43119415685257e-05, + "loss": 0.7245, + "step": 13102 + }, + { + "epoch": 2.3327991452991452, + "grad_norm": 0.9441094398498535, + "learning_rate": 7.42984140591037e-05, + "loss": 0.8678, + "step": 13103 + }, + { + "epoch": 2.332977207977208, + "grad_norm": 0.8220996856689453, + "learning_rate": 7.428488705325084e-05, + "loss": 0.9737, + "step": 13104 + }, + { + "epoch": 2.3331552706552707, + "grad_norm": 0.8164090514183044, + "learning_rate": 7.427136055123222e-05, + "loss": 0.9138, + "step": 13105 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 0.9672707319259644, + "learning_rate": 7.425783455331281e-05, + "loss": 0.7723, + "step": 13106 + }, + { + "epoch": 2.333511396011396, + "grad_norm": 
0.7953858971595764, + "learning_rate": 7.424430905975773e-05, + "loss": 0.8876, + "step": 13107 + }, + { + "epoch": 2.3336894586894585, + "grad_norm": 0.7809541821479797, + "learning_rate": 7.423078407083183e-05, + "loss": 1.0268, + "step": 13108 + }, + { + "epoch": 2.3338675213675213, + "grad_norm": 0.805270254611969, + "learning_rate": 7.421725958680025e-05, + "loss": 0.7515, + "step": 13109 + }, + { + "epoch": 2.334045584045584, + "grad_norm": 0.8066652417182922, + "learning_rate": 7.420373560792788e-05, + "loss": 1.004, + "step": 13110 + }, + { + "epoch": 2.334223646723647, + "grad_norm": 0.9382686018943787, + "learning_rate": 7.41902121344797e-05, + "loss": 0.8769, + "step": 13111 + }, + { + "epoch": 2.3344017094017095, + "grad_norm": 0.7908356785774231, + "learning_rate": 7.417668916672074e-05, + "loss": 0.8491, + "step": 13112 + }, + { + "epoch": 2.3345797720797723, + "grad_norm": 0.7188867330551147, + "learning_rate": 7.416316670491588e-05, + "loss": 0.7138, + "step": 13113 + }, + { + "epoch": 2.3347578347578346, + "grad_norm": 0.8477714657783508, + "learning_rate": 7.414964474933012e-05, + "loss": 0.9715, + "step": 13114 + }, + { + "epoch": 2.3349358974358974, + "grad_norm": 0.8769845366477966, + "learning_rate": 7.413612330022835e-05, + "loss": 0.9029, + "step": 13115 + }, + { + "epoch": 2.33511396011396, + "grad_norm": 0.9011028409004211, + "learning_rate": 7.412260235787554e-05, + "loss": 1.026, + "step": 13116 + }, + { + "epoch": 2.335292022792023, + "grad_norm": 0.7775689363479614, + "learning_rate": 7.410908192253656e-05, + "loss": 0.8492, + "step": 13117 + }, + { + "epoch": 2.3354700854700856, + "grad_norm": 0.9587660431861877, + "learning_rate": 7.409556199447637e-05, + "loss": 0.8731, + "step": 13118 + }, + { + "epoch": 2.335648148148148, + "grad_norm": 0.8117266297340393, + "learning_rate": 7.408204257395979e-05, + "loss": 0.8827, + "step": 13119 + }, + { + "epoch": 2.3358262108262107, + "grad_norm": 0.7382497787475586, + "learning_rate": 
7.40685236612518e-05, + "loss": 0.6617, + "step": 13120 + }, + { + "epoch": 2.3360042735042734, + "grad_norm": 0.8630974888801575, + "learning_rate": 7.405500525661717e-05, + "loss": 0.843, + "step": 13121 + }, + { + "epoch": 2.336182336182336, + "grad_norm": 0.7496539950370789, + "learning_rate": 7.404148736032083e-05, + "loss": 1.0354, + "step": 13122 + }, + { + "epoch": 2.336360398860399, + "grad_norm": 0.8409397602081299, + "learning_rate": 7.402796997262761e-05, + "loss": 0.9848, + "step": 13123 + }, + { + "epoch": 2.3365384615384617, + "grad_norm": 0.8018865585327148, + "learning_rate": 7.40144530938024e-05, + "loss": 0.8929, + "step": 13124 + }, + { + "epoch": 2.3367165242165244, + "grad_norm": 0.7378625869750977, + "learning_rate": 7.400093672410996e-05, + "loss": 0.7749, + "step": 13125 + }, + { + "epoch": 2.3368945868945867, + "grad_norm": 0.8906251192092896, + "learning_rate": 7.398742086381519e-05, + "loss": 1.007, + "step": 13126 + }, + { + "epoch": 2.3370726495726495, + "grad_norm": 0.8324725031852722, + "learning_rate": 7.397390551318283e-05, + "loss": 0.8493, + "step": 13127 + }, + { + "epoch": 2.3372507122507122, + "grad_norm": 0.781080961227417, + "learning_rate": 7.39603906724778e-05, + "loss": 0.696, + "step": 13128 + }, + { + "epoch": 2.337428774928775, + "grad_norm": 0.8068976402282715, + "learning_rate": 7.394687634196476e-05, + "loss": 0.7196, + "step": 13129 + }, + { + "epoch": 2.3376068376068377, + "grad_norm": 0.7588358521461487, + "learning_rate": 7.393336252190854e-05, + "loss": 0.9179, + "step": 13130 + }, + { + "epoch": 2.3377849002849, + "grad_norm": 0.8334088325500488, + "learning_rate": 7.391984921257398e-05, + "loss": 0.94, + "step": 13131 + }, + { + "epoch": 2.337962962962963, + "grad_norm": 0.9485353231430054, + "learning_rate": 7.390633641422578e-05, + "loss": 0.9253, + "step": 13132 + }, + { + "epoch": 2.3381410256410255, + "grad_norm": 0.9447978734970093, + "learning_rate": 7.389282412712874e-05, + "loss": 0.9112, + "step": 
13133 + }, + { + "epoch": 2.3383190883190883, + "grad_norm": 0.7348376512527466, + "learning_rate": 7.387931235154754e-05, + "loss": 0.7817, + "step": 13134 + }, + { + "epoch": 2.338497150997151, + "grad_norm": 0.8610092401504517, + "learning_rate": 7.386580108774699e-05, + "loss": 0.8231, + "step": 13135 + }, + { + "epoch": 2.338675213675214, + "grad_norm": 0.8314286470413208, + "learning_rate": 7.385229033599175e-05, + "loss": 0.7323, + "step": 13136 + }, + { + "epoch": 2.3388532763532766, + "grad_norm": 0.7775855660438538, + "learning_rate": 7.383878009654657e-05, + "loss": 0.9897, + "step": 13137 + }, + { + "epoch": 2.339031339031339, + "grad_norm": 0.8140097260475159, + "learning_rate": 7.382527036967614e-05, + "loss": 0.7815, + "step": 13138 + }, + { + "epoch": 2.3392094017094016, + "grad_norm": 0.8154003620147705, + "learning_rate": 7.38117611556452e-05, + "loss": 0.7646, + "step": 13139 + }, + { + "epoch": 2.3393874643874644, + "grad_norm": 0.7705643177032471, + "learning_rate": 7.379825245471836e-05, + "loss": 0.7633, + "step": 13140 + }, + { + "epoch": 2.339565527065527, + "grad_norm": 0.7856985330581665, + "learning_rate": 7.378474426716035e-05, + "loss": 0.6803, + "step": 13141 + }, + { + "epoch": 2.33974358974359, + "grad_norm": 0.8384547233581543, + "learning_rate": 7.377123659323579e-05, + "loss": 0.9092, + "step": 13142 + }, + { + "epoch": 2.339921652421652, + "grad_norm": 0.7456032633781433, + "learning_rate": 7.375772943320942e-05, + "loss": 0.8393, + "step": 13143 + }, + { + "epoch": 2.340099715099715, + "grad_norm": 0.9527342319488525, + "learning_rate": 7.374422278734579e-05, + "loss": 1.0272, + "step": 13144 + }, + { + "epoch": 2.3402777777777777, + "grad_norm": 0.8976300954818726, + "learning_rate": 7.37307166559096e-05, + "loss": 0.7184, + "step": 13145 + }, + { + "epoch": 2.3404558404558404, + "grad_norm": 0.7698291540145874, + "learning_rate": 7.371721103916542e-05, + "loss": 0.6783, + "step": 13146 + }, + { + "epoch": 2.340633903133903, + 
"grad_norm": 0.8646810054779053, + "learning_rate": 7.37037059373779e-05, + "loss": 0.8559, + "step": 13147 + }, + { + "epoch": 2.340811965811966, + "grad_norm": 0.7534750699996948, + "learning_rate": 7.369020135081161e-05, + "loss": 0.8087, + "step": 13148 + }, + { + "epoch": 2.3409900284900287, + "grad_norm": 0.7408546209335327, + "learning_rate": 7.367669727973123e-05, + "loss": 0.675, + "step": 13149 + }, + { + "epoch": 2.341168091168091, + "grad_norm": 0.8753145933151245, + "learning_rate": 7.366319372440124e-05, + "loss": 0.8163, + "step": 13150 + }, + { + "epoch": 2.3413461538461537, + "grad_norm": 0.7065265774726868, + "learning_rate": 7.364969068508624e-05, + "loss": 0.5786, + "step": 13151 + }, + { + "epoch": 2.3415242165242165, + "grad_norm": 0.7976117730140686, + "learning_rate": 7.363618816205087e-05, + "loss": 0.9053, + "step": 13152 + }, + { + "epoch": 2.3417022792022792, + "grad_norm": 0.7261707782745361, + "learning_rate": 7.362268615555958e-05, + "loss": 0.7677, + "step": 13153 + }, + { + "epoch": 2.341880341880342, + "grad_norm": 0.7868889570236206, + "learning_rate": 7.360918466587701e-05, + "loss": 0.8648, + "step": 13154 + }, + { + "epoch": 2.3420584045584047, + "grad_norm": 0.8473666310310364, + "learning_rate": 7.35956836932676e-05, + "loss": 0.8533, + "step": 13155 + }, + { + "epoch": 2.342236467236467, + "grad_norm": 0.7456569671630859, + "learning_rate": 7.358218323799594e-05, + "loss": 0.7617, + "step": 13156 + }, + { + "epoch": 2.34241452991453, + "grad_norm": 0.8130928874015808, + "learning_rate": 7.356868330032652e-05, + "loss": 0.8667, + "step": 13157 + }, + { + "epoch": 2.3425925925925926, + "grad_norm": 0.8743309378623962, + "learning_rate": 7.355518388052384e-05, + "loss": 0.9196, + "step": 13158 + }, + { + "epoch": 2.3427706552706553, + "grad_norm": 0.8228809237480164, + "learning_rate": 7.354168497885237e-05, + "loss": 0.6509, + "step": 13159 + }, + { + "epoch": 2.342948717948718, + "grad_norm": 0.6998807191848755, + 
"learning_rate": 7.352818659557668e-05, + "loss": 0.5762, + "step": 13160 + }, + { + "epoch": 2.343126780626781, + "grad_norm": 0.8757675290107727, + "learning_rate": 7.351468873096114e-05, + "loss": 0.9094, + "step": 13161 + }, + { + "epoch": 2.343304843304843, + "grad_norm": 0.7495744824409485, + "learning_rate": 7.350119138527026e-05, + "loss": 0.653, + "step": 13162 + }, + { + "epoch": 2.343482905982906, + "grad_norm": 0.8229764103889465, + "learning_rate": 7.348769455876849e-05, + "loss": 0.8146, + "step": 13163 + }, + { + "epoch": 2.3436609686609686, + "grad_norm": 0.8317791819572449, + "learning_rate": 7.347419825172029e-05, + "loss": 0.7754, + "step": 13164 + }, + { + "epoch": 2.3438390313390314, + "grad_norm": 0.8210344910621643, + "learning_rate": 7.346070246439005e-05, + "loss": 0.876, + "step": 13165 + }, + { + "epoch": 2.344017094017094, + "grad_norm": 0.7711526155471802, + "learning_rate": 7.344720719704223e-05, + "loss": 0.7426, + "step": 13166 + }, + { + "epoch": 2.344195156695157, + "grad_norm": 0.8231741189956665, + "learning_rate": 7.343371244994119e-05, + "loss": 0.8992, + "step": 13167 + }, + { + "epoch": 2.344373219373219, + "grad_norm": 0.7145521640777588, + "learning_rate": 7.342021822335143e-05, + "loss": 0.8787, + "step": 13168 + }, + { + "epoch": 2.344551282051282, + "grad_norm": 0.8323171734809875, + "learning_rate": 7.340672451753723e-05, + "loss": 0.7035, + "step": 13169 + }, + { + "epoch": 2.3447293447293447, + "grad_norm": 0.7061881422996521, + "learning_rate": 7.339323133276301e-05, + "loss": 0.8077, + "step": 13170 + }, + { + "epoch": 2.3449074074074074, + "grad_norm": 0.8705938458442688, + "learning_rate": 7.33797386692932e-05, + "loss": 0.8616, + "step": 13171 + }, + { + "epoch": 2.34508547008547, + "grad_norm": 0.8777729868888855, + "learning_rate": 7.336624652739208e-05, + "loss": 0.9524, + "step": 13172 + }, + { + "epoch": 2.345263532763533, + "grad_norm": 0.9099276065826416, + "learning_rate": 7.335275490732406e-05, + "loss": 
0.8248, + "step": 13173 + }, + { + "epoch": 2.3454415954415953, + "grad_norm": 0.7963444590568542, + "learning_rate": 7.333926380935341e-05, + "loss": 0.78, + "step": 13174 + }, + { + "epoch": 2.345619658119658, + "grad_norm": 0.9400636553764343, + "learning_rate": 7.332577323374454e-05, + "loss": 1.0062, + "step": 13175 + }, + { + "epoch": 2.3457977207977208, + "grad_norm": 0.7794054746627808, + "learning_rate": 7.331228318076171e-05, + "loss": 0.8564, + "step": 13176 + }, + { + "epoch": 2.3459757834757835, + "grad_norm": 0.7767263054847717, + "learning_rate": 7.329879365066927e-05, + "loss": 0.8191, + "step": 13177 + }, + { + "epoch": 2.3461538461538463, + "grad_norm": 0.7170942425727844, + "learning_rate": 7.328530464373148e-05, + "loss": 0.7018, + "step": 13178 + }, + { + "epoch": 2.346331908831909, + "grad_norm": 0.8246886134147644, + "learning_rate": 7.327181616021268e-05, + "loss": 0.8498, + "step": 13179 + }, + { + "epoch": 2.3465099715099713, + "grad_norm": 0.9531362652778625, + "learning_rate": 7.325832820037711e-05, + "loss": 0.7031, + "step": 13180 + }, + { + "epoch": 2.346688034188034, + "grad_norm": 0.8561878204345703, + "learning_rate": 7.324484076448905e-05, + "loss": 0.8627, + "step": 13181 + }, + { + "epoch": 2.346866096866097, + "grad_norm": 0.7890949845314026, + "learning_rate": 7.323135385281274e-05, + "loss": 0.7675, + "step": 13182 + }, + { + "epoch": 2.3470441595441596, + "grad_norm": 0.72523033618927, + "learning_rate": 7.321786746561246e-05, + "loss": 0.8847, + "step": 13183 + }, + { + "epoch": 2.3472222222222223, + "grad_norm": 0.7866469025611877, + "learning_rate": 7.32043816031524e-05, + "loss": 0.9657, + "step": 13184 + }, + { + "epoch": 2.347400284900285, + "grad_norm": 0.8669828176498413, + "learning_rate": 7.319089626569687e-05, + "loss": 0.9098, + "step": 13185 + }, + { + "epoch": 2.347578347578348, + "grad_norm": 0.7874458432197571, + "learning_rate": 7.317741145351e-05, + "loss": 0.9545, + "step": 13186 + }, + { + "epoch": 
2.34775641025641, + "grad_norm": 0.7924689054489136, + "learning_rate": 7.316392716685604e-05, + "loss": 0.8577, + "step": 13187 + }, + { + "epoch": 2.347934472934473, + "grad_norm": 0.731119692325592, + "learning_rate": 7.315044340599918e-05, + "loss": 0.9251, + "step": 13188 + }, + { + "epoch": 2.3481125356125356, + "grad_norm": 0.914900004863739, + "learning_rate": 7.313696017120361e-05, + "loss": 0.9224, + "step": 13189 + }, + { + "epoch": 2.3482905982905984, + "grad_norm": 0.7616490125656128, + "learning_rate": 7.312347746273349e-05, + "loss": 0.7263, + "step": 13190 + }, + { + "epoch": 2.348468660968661, + "grad_norm": 0.8357210159301758, + "learning_rate": 7.310999528085301e-05, + "loss": 0.8572, + "step": 13191 + }, + { + "epoch": 2.3486467236467234, + "grad_norm": 0.8404232263565063, + "learning_rate": 7.309651362582633e-05, + "loss": 0.6822, + "step": 13192 + }, + { + "epoch": 2.348824786324786, + "grad_norm": 0.8992070555686951, + "learning_rate": 7.308303249791754e-05, + "loss": 0.91, + "step": 13193 + }, + { + "epoch": 2.349002849002849, + "grad_norm": 0.8150524497032166, + "learning_rate": 7.306955189739084e-05, + "loss": 0.984, + "step": 13194 + }, + { + "epoch": 2.3491809116809117, + "grad_norm": 0.9042861461639404, + "learning_rate": 7.305607182451031e-05, + "loss": 1.0111, + "step": 13195 + }, + { + "epoch": 2.3493589743589745, + "grad_norm": 0.8402968049049377, + "learning_rate": 7.30425922795401e-05, + "loss": 0.801, + "step": 13196 + }, + { + "epoch": 2.349537037037037, + "grad_norm": 0.7742997407913208, + "learning_rate": 7.302911326274428e-05, + "loss": 0.659, + "step": 13197 + }, + { + "epoch": 2.3497150997151, + "grad_norm": 0.8005271553993225, + "learning_rate": 7.301563477438698e-05, + "loss": 0.8549, + "step": 13198 + }, + { + "epoch": 2.3498931623931623, + "grad_norm": 0.8253805637359619, + "learning_rate": 7.300215681473224e-05, + "loss": 0.9049, + "step": 13199 + }, + { + "epoch": 2.350071225071225, + "grad_norm": 0.8539033532142639, 
+ "learning_rate": 7.29886793840442e-05, + "loss": 0.9359, + "step": 13200 + }, + { + "epoch": 2.3502492877492878, + "grad_norm": 0.827608048915863, + "learning_rate": 7.297520248258681e-05, + "loss": 1.0105, + "step": 13201 + }, + { + "epoch": 2.3504273504273505, + "grad_norm": 0.8418487310409546, + "learning_rate": 7.296172611062422e-05, + "loss": 0.8138, + "step": 13202 + }, + { + "epoch": 2.3506054131054133, + "grad_norm": 0.7853255867958069, + "learning_rate": 7.294825026842042e-05, + "loss": 0.9279, + "step": 13203 + }, + { + "epoch": 2.3507834757834756, + "grad_norm": 0.8454880714416504, + "learning_rate": 7.293477495623951e-05, + "loss": 0.7687, + "step": 13204 + }, + { + "epoch": 2.3509615384615383, + "grad_norm": 0.7620453238487244, + "learning_rate": 7.29213001743454e-05, + "loss": 0.7567, + "step": 13205 + }, + { + "epoch": 2.351139601139601, + "grad_norm": 0.8993792533874512, + "learning_rate": 7.290782592300223e-05, + "loss": 0.9716, + "step": 13206 + }, + { + "epoch": 2.351317663817664, + "grad_norm": 1.1063668727874756, + "learning_rate": 7.289435220247387e-05, + "loss": 0.9763, + "step": 13207 + }, + { + "epoch": 2.3514957264957266, + "grad_norm": 0.8205364346504211, + "learning_rate": 7.288087901302439e-05, + "loss": 0.9395, + "step": 13208 + }, + { + "epoch": 2.3516737891737893, + "grad_norm": 0.680487871170044, + "learning_rate": 7.286740635491774e-05, + "loss": 0.6252, + "step": 13209 + }, + { + "epoch": 2.351851851851852, + "grad_norm": 0.8450767397880554, + "learning_rate": 7.285393422841791e-05, + "loss": 0.8707, + "step": 13210 + }, + { + "epoch": 2.3520299145299144, + "grad_norm": 0.6871187686920166, + "learning_rate": 7.284046263378888e-05, + "loss": 0.5695, + "step": 13211 + }, + { + "epoch": 2.352207977207977, + "grad_norm": 0.7968555688858032, + "learning_rate": 7.282699157129451e-05, + "loss": 0.7014, + "step": 13212 + }, + { + "epoch": 2.35238603988604, + "grad_norm": 0.863798201084137, + "learning_rate": 7.281352104119883e-05, + 
"loss": 0.9241, + "step": 13213 + }, + { + "epoch": 2.3525641025641026, + "grad_norm": 0.8848825693130493, + "learning_rate": 7.28000510437657e-05, + "loss": 0.8252, + "step": 13214 + }, + { + "epoch": 2.3527421652421654, + "grad_norm": 0.7528855800628662, + "learning_rate": 7.278658157925912e-05, + "loss": 0.7428, + "step": 13215 + }, + { + "epoch": 2.3529202279202277, + "grad_norm": 0.7636159062385559, + "learning_rate": 7.277311264794288e-05, + "loss": 0.8952, + "step": 13216 + }, + { + "epoch": 2.3530982905982905, + "grad_norm": 1.0585514307022095, + "learning_rate": 7.2759644250081e-05, + "loss": 0.9153, + "step": 13217 + }, + { + "epoch": 2.353276353276353, + "grad_norm": 0.7691277265548706, + "learning_rate": 7.274617638593725e-05, + "loss": 1.057, + "step": 13218 + }, + { + "epoch": 2.353454415954416, + "grad_norm": 0.8324813842773438, + "learning_rate": 7.273270905577561e-05, + "loss": 0.9253, + "step": 13219 + }, + { + "epoch": 2.3536324786324787, + "grad_norm": 0.835491418838501, + "learning_rate": 7.271924225985984e-05, + "loss": 1.0103, + "step": 13220 + }, + { + "epoch": 2.3538105413105415, + "grad_norm": 0.8318347930908203, + "learning_rate": 7.270577599845389e-05, + "loss": 0.8896, + "step": 13221 + }, + { + "epoch": 2.353988603988604, + "grad_norm": 0.7801460027694702, + "learning_rate": 7.269231027182153e-05, + "loss": 0.9274, + "step": 13222 + }, + { + "epoch": 2.3541666666666665, + "grad_norm": 0.8195397257804871, + "learning_rate": 7.267884508022665e-05, + "loss": 0.8126, + "step": 13223 + }, + { + "epoch": 2.3543447293447293, + "grad_norm": 0.7978246212005615, + "learning_rate": 7.2665380423933e-05, + "loss": 0.8426, + "step": 13224 + }, + { + "epoch": 2.354522792022792, + "grad_norm": 0.7614684104919434, + "learning_rate": 7.265191630320452e-05, + "loss": 0.7277, + "step": 13225 + }, + { + "epoch": 2.3547008547008548, + "grad_norm": 0.8684967756271362, + "learning_rate": 7.263845271830485e-05, + "loss": 0.7054, + "step": 13226 + }, + { + 
"epoch": 2.3548789173789175, + "grad_norm": 0.878842830657959, + "learning_rate": 7.262498966949791e-05, + "loss": 1.0478, + "step": 13227 + }, + { + "epoch": 2.35505698005698, + "grad_norm": 0.8321235179901123, + "learning_rate": 7.26115271570474e-05, + "loss": 0.8119, + "step": 13228 + }, + { + "epoch": 2.3552350427350426, + "grad_norm": 0.9144030213356018, + "learning_rate": 7.259806518121713e-05, + "loss": 0.8626, + "step": 13229 + }, + { + "epoch": 2.3554131054131053, + "grad_norm": 0.8437082767486572, + "learning_rate": 7.258460374227085e-05, + "loss": 0.8439, + "step": 13230 + }, + { + "epoch": 2.355591168091168, + "grad_norm": 0.8405697345733643, + "learning_rate": 7.257114284047229e-05, + "loss": 0.863, + "step": 13231 + }, + { + "epoch": 2.355769230769231, + "grad_norm": 0.8674731850624084, + "learning_rate": 7.255768247608525e-05, + "loss": 0.9823, + "step": 13232 + }, + { + "epoch": 2.3559472934472936, + "grad_norm": 0.8844531178474426, + "learning_rate": 7.254422264937337e-05, + "loss": 0.9018, + "step": 13233 + }, + { + "epoch": 2.3561253561253563, + "grad_norm": 0.8394746780395508, + "learning_rate": 7.253076336060045e-05, + "loss": 0.8407, + "step": 13234 + }, + { + "epoch": 2.3563034188034186, + "grad_norm": 0.8759872317314148, + "learning_rate": 7.251730461003012e-05, + "loss": 0.841, + "step": 13235 + }, + { + "epoch": 2.3564814814814814, + "grad_norm": 0.7240089774131775, + "learning_rate": 7.250384639792617e-05, + "loss": 0.7918, + "step": 13236 + }, + { + "epoch": 2.356659544159544, + "grad_norm": 0.8619599342346191, + "learning_rate": 7.24903887245522e-05, + "loss": 0.761, + "step": 13237 + }, + { + "epoch": 2.356837606837607, + "grad_norm": 0.7291443943977356, + "learning_rate": 7.247693159017192e-05, + "loss": 0.8189, + "step": 13238 + }, + { + "epoch": 2.3570156695156697, + "grad_norm": 0.8006066083908081, + "learning_rate": 7.246347499504898e-05, + "loss": 0.8924, + "step": 13239 + }, + { + "epoch": 2.357193732193732, + "grad_norm": 
0.7774627208709717, + "learning_rate": 7.245001893944707e-05, + "loss": 0.946, + "step": 13240 + }, + { + "epoch": 2.3573717948717947, + "grad_norm": 0.7643784284591675, + "learning_rate": 7.243656342362978e-05, + "loss": 0.8717, + "step": 13241 + }, + { + "epoch": 2.3575498575498575, + "grad_norm": 0.7197792530059814, + "learning_rate": 7.242310844786082e-05, + "loss": 0.6792, + "step": 13242 + }, + { + "epoch": 2.35772792022792, + "grad_norm": 0.9124938249588013, + "learning_rate": 7.240965401240371e-05, + "loss": 0.841, + "step": 13243 + }, + { + "epoch": 2.357905982905983, + "grad_norm": 0.7350388765335083, + "learning_rate": 7.239620011752215e-05, + "loss": 0.8294, + "step": 13244 + }, + { + "epoch": 2.3580840455840457, + "grad_norm": 0.8814936280250549, + "learning_rate": 7.238274676347967e-05, + "loss": 0.9732, + "step": 13245 + }, + { + "epoch": 2.3582621082621085, + "grad_norm": 0.8379302024841309, + "learning_rate": 7.236929395053995e-05, + "loss": 0.8896, + "step": 13246 + }, + { + "epoch": 2.3584401709401708, + "grad_norm": 0.8200546503067017, + "learning_rate": 7.235584167896648e-05, + "loss": 0.7991, + "step": 13247 + }, + { + "epoch": 2.3586182336182335, + "grad_norm": 0.7842608690261841, + "learning_rate": 7.234238994902287e-05, + "loss": 0.695, + "step": 13248 + }, + { + "epoch": 2.3587962962962963, + "grad_norm": 0.8872218132019043, + "learning_rate": 7.232893876097266e-05, + "loss": 0.8611, + "step": 13249 + }, + { + "epoch": 2.358974358974359, + "grad_norm": 0.8358500599861145, + "learning_rate": 7.231548811507942e-05, + "loss": 0.7829, + "step": 13250 + }, + { + "epoch": 2.359152421652422, + "grad_norm": 0.8269400000572205, + "learning_rate": 7.23020380116067e-05, + "loss": 0.6904, + "step": 13251 + }, + { + "epoch": 2.359330484330484, + "grad_norm": 0.8693541288375854, + "learning_rate": 7.2288588450818e-05, + "loss": 0.8659, + "step": 13252 + }, + { + "epoch": 2.359508547008547, + "grad_norm": 0.858076810836792, + "learning_rate": 
7.227513943297688e-05, + "loss": 0.8824, + "step": 13253 + }, + { + "epoch": 2.3596866096866096, + "grad_norm": 0.796541154384613, + "learning_rate": 7.226169095834675e-05, + "loss": 0.8999, + "step": 13254 + }, + { + "epoch": 2.3598646723646723, + "grad_norm": 0.7692779898643494, + "learning_rate": 7.22482430271912e-05, + "loss": 0.9492, + "step": 13255 + }, + { + "epoch": 2.360042735042735, + "grad_norm": 0.9259434342384338, + "learning_rate": 7.223479563977364e-05, + "loss": 0.9115, + "step": 13256 + }, + { + "epoch": 2.360220797720798, + "grad_norm": 0.9048989415168762, + "learning_rate": 7.222134879635764e-05, + "loss": 0.8057, + "step": 13257 + }, + { + "epoch": 2.3603988603988606, + "grad_norm": 0.9342616200447083, + "learning_rate": 7.220790249720656e-05, + "loss": 0.8554, + "step": 13258 + }, + { + "epoch": 2.360576923076923, + "grad_norm": 0.7747787237167358, + "learning_rate": 7.219445674258392e-05, + "loss": 0.7555, + "step": 13259 + }, + { + "epoch": 2.3607549857549857, + "grad_norm": 0.805437445640564, + "learning_rate": 7.218101153275311e-05, + "loss": 0.7442, + "step": 13260 + }, + { + "epoch": 2.3609330484330484, + "grad_norm": 0.9797805547714233, + "learning_rate": 7.216756686797764e-05, + "loss": 1.0975, + "step": 13261 + }, + { + "epoch": 2.361111111111111, + "grad_norm": 0.7361458539962769, + "learning_rate": 7.215412274852083e-05, + "loss": 0.6597, + "step": 13262 + }, + { + "epoch": 2.361289173789174, + "grad_norm": 0.8041569590568542, + "learning_rate": 7.21406791746462e-05, + "loss": 0.8343, + "step": 13263 + }, + { + "epoch": 2.361467236467236, + "grad_norm": 0.8364384770393372, + "learning_rate": 7.212723614661703e-05, + "loss": 0.9486, + "step": 13264 + }, + { + "epoch": 2.361645299145299, + "grad_norm": 0.714241623878479, + "learning_rate": 7.21137936646968e-05, + "loss": 0.5978, + "step": 13265 + }, + { + "epoch": 2.3618233618233617, + "grad_norm": 0.8830710053443909, + "learning_rate": 7.210035172914882e-05, + "loss": 0.9584, + 
"step": 13266 + }, + { + "epoch": 2.3620014245014245, + "grad_norm": 0.714112401008606, + "learning_rate": 7.208691034023653e-05, + "loss": 0.8878, + "step": 13267 + }, + { + "epoch": 2.3621794871794872, + "grad_norm": 0.7654083371162415, + "learning_rate": 7.207346949822322e-05, + "loss": 0.822, + "step": 13268 + }, + { + "epoch": 2.36235754985755, + "grad_norm": 0.772693395614624, + "learning_rate": 7.206002920337225e-05, + "loss": 0.7993, + "step": 13269 + }, + { + "epoch": 2.3625356125356127, + "grad_norm": 0.9678596258163452, + "learning_rate": 7.2046589455947e-05, + "loss": 0.948, + "step": 13270 + }, + { + "epoch": 2.362713675213675, + "grad_norm": 0.8254278302192688, + "learning_rate": 7.203315025621073e-05, + "loss": 0.8654, + "step": 13271 + }, + { + "epoch": 2.362891737891738, + "grad_norm": 0.7527315020561218, + "learning_rate": 7.201971160442685e-05, + "loss": 0.6881, + "step": 13272 + }, + { + "epoch": 2.3630698005698005, + "grad_norm": 0.7658267021179199, + "learning_rate": 7.200627350085853e-05, + "loss": 0.7332, + "step": 13273 + }, + { + "epoch": 2.3632478632478633, + "grad_norm": 0.8590806126594543, + "learning_rate": 7.199283594576916e-05, + "loss": 0.879, + "step": 13274 + }, + { + "epoch": 2.363425925925926, + "grad_norm": 0.7533347606658936, + "learning_rate": 7.197939893942197e-05, + "loss": 0.8738, + "step": 13275 + }, + { + "epoch": 2.363603988603989, + "grad_norm": Infinity, + "learning_rate": 7.197939893942197e-05, + "loss": 0.7641, + "step": 13276 + }, + { + "epoch": 2.363782051282051, + "grad_norm": 0.6873685121536255, + "learning_rate": 7.196596248208029e-05, + "loss": 0.4708, + "step": 13277 + }, + { + "epoch": 2.363960113960114, + "grad_norm": 0.7659112215042114, + "learning_rate": 7.195252657400729e-05, + "loss": 0.839, + "step": 13278 + }, + { + "epoch": 2.3641381766381766, + "grad_norm": 0.8355028629302979, + "learning_rate": 7.193909121546631e-05, + "loss": 0.9792, + "step": 13279 + }, + { + "epoch": 2.3643162393162394, + 
"grad_norm": 0.9633997678756714, + "learning_rate": 7.192565640672052e-05, + "loss": 0.9891, + "step": 13280 + }, + { + "epoch": 2.364494301994302, + "grad_norm": 0.7984298467636108, + "learning_rate": 7.191222214803318e-05, + "loss": 0.8343, + "step": 13281 + }, + { + "epoch": 2.364672364672365, + "grad_norm": 0.8239994645118713, + "learning_rate": 7.189878843966749e-05, + "loss": 0.8586, + "step": 13282 + }, + { + "epoch": 2.364850427350427, + "grad_norm": 0.8695420026779175, + "learning_rate": 7.188535528188671e-05, + "loss": 0.9161, + "step": 13283 + }, + { + "epoch": 2.36502849002849, + "grad_norm": 0.8272924423217773, + "learning_rate": 7.187192267495393e-05, + "loss": 0.8158, + "step": 13284 + }, + { + "epoch": 2.3652065527065527, + "grad_norm": 0.8217222690582275, + "learning_rate": 7.185849061913243e-05, + "loss": 0.892, + "step": 13285 + }, + { + "epoch": 2.3653846153846154, + "grad_norm": 0.9041243195533752, + "learning_rate": 7.184505911468532e-05, + "loss": 0.9093, + "step": 13286 + }, + { + "epoch": 2.365562678062678, + "grad_norm": 0.8325521349906921, + "learning_rate": 7.183162816187582e-05, + "loss": 0.7546, + "step": 13287 + }, + { + "epoch": 2.365740740740741, + "grad_norm": 0.9160267114639282, + "learning_rate": 7.181819776096704e-05, + "loss": 0.9662, + "step": 13288 + }, + { + "epoch": 2.3659188034188032, + "grad_norm": 0.8771381974220276, + "learning_rate": 7.180476791222215e-05, + "loss": 1.0083, + "step": 13289 + }, + { + "epoch": 2.366096866096866, + "grad_norm": 0.8251327872276306, + "learning_rate": 7.179133861590421e-05, + "loss": 0.8209, + "step": 13290 + }, + { + "epoch": 2.3662749287749287, + "grad_norm": 0.8760706186294556, + "learning_rate": 7.177790987227641e-05, + "loss": 0.7479, + "step": 13291 + }, + { + "epoch": 2.3664529914529915, + "grad_norm": 0.7857288122177124, + "learning_rate": 7.176448168160187e-05, + "loss": 0.6511, + "step": 13292 + }, + { + "epoch": 2.3666310541310542, + "grad_norm": 0.9548102021217346, + 
"learning_rate": 7.175105404414362e-05, + "loss": 0.731, + "step": 13293 + }, + { + "epoch": 2.366809116809117, + "grad_norm": 0.7604304552078247, + "learning_rate": 7.173762696016484e-05, + "loss": 0.8212, + "step": 13294 + }, + { + "epoch": 2.3669871794871793, + "grad_norm": 0.9121061563491821, + "learning_rate": 7.172420042992849e-05, + "loss": 0.939, + "step": 13295 + }, + { + "epoch": 2.367165242165242, + "grad_norm": 0.8128613233566284, + "learning_rate": 7.171077445369772e-05, + "loss": 0.8908, + "step": 13296 + }, + { + "epoch": 2.367343304843305, + "grad_norm": 0.9184401035308838, + "learning_rate": 7.169734903173555e-05, + "loss": 0.957, + "step": 13297 + }, + { + "epoch": 2.3675213675213675, + "grad_norm": 0.9234427809715271, + "learning_rate": 7.168392416430507e-05, + "loss": 0.8403, + "step": 13298 + }, + { + "epoch": 2.3676994301994303, + "grad_norm": 0.8810806274414062, + "learning_rate": 7.167049985166922e-05, + "loss": 0.9754, + "step": 13299 + }, + { + "epoch": 2.367877492877493, + "grad_norm": 0.8208937048912048, + "learning_rate": 7.165707609409113e-05, + "loss": 0.9418, + "step": 13300 + }, + { + "epoch": 2.3680555555555554, + "grad_norm": 0.8666219711303711, + "learning_rate": 7.164365289183371e-05, + "loss": 0.8936, + "step": 13301 + }, + { + "epoch": 2.368233618233618, + "grad_norm": 0.9385154843330383, + "learning_rate": 7.163023024516002e-05, + "loss": 0.8158, + "step": 13302 + }, + { + "epoch": 2.368411680911681, + "grad_norm": 1.0415911674499512, + "learning_rate": 7.161680815433303e-05, + "loss": 1.0445, + "step": 13303 + }, + { + "epoch": 2.3685897435897436, + "grad_norm": 0.6882192492485046, + "learning_rate": 7.160338661961577e-05, + "loss": 0.4929, + "step": 13304 + }, + { + "epoch": 2.3687678062678064, + "grad_norm": 0.8695144653320312, + "learning_rate": 7.15899656412711e-05, + "loss": 0.8991, + "step": 13305 + }, + { + "epoch": 2.368945868945869, + "grad_norm": 0.8973569273948669, + "learning_rate": 7.157654521956206e-05, + 
"loss": 0.8423, + "step": 13306 + }, + { + "epoch": 2.369123931623932, + "grad_norm": 0.7656881213188171, + "learning_rate": 7.156312535475155e-05, + "loss": 0.7351, + "step": 13307 + }, + { + "epoch": 2.369301994301994, + "grad_norm": 0.8023402690887451, + "learning_rate": 7.154970604710258e-05, + "loss": 0.9943, + "step": 13308 + }, + { + "epoch": 2.369480056980057, + "grad_norm": 0.916946530342102, + "learning_rate": 7.153628729687797e-05, + "loss": 0.8649, + "step": 13309 + }, + { + "epoch": 2.3696581196581197, + "grad_norm": 0.8764750361442566, + "learning_rate": 7.152286910434068e-05, + "loss": 0.9799, + "step": 13310 + }, + { + "epoch": 2.3698361823361824, + "grad_norm": 0.8732671737670898, + "learning_rate": 7.150945146975364e-05, + "loss": 1.0431, + "step": 13311 + }, + { + "epoch": 2.370014245014245, + "grad_norm": 0.8447144031524658, + "learning_rate": 7.149603439337969e-05, + "loss": 0.7805, + "step": 13312 + }, + { + "epoch": 2.3701923076923075, + "grad_norm": 0.9017399549484253, + "learning_rate": 7.148261787548178e-05, + "loss": 0.8102, + "step": 13313 + }, + { + "epoch": 2.3703703703703702, + "grad_norm": 0.7187124490737915, + "learning_rate": 7.14692019163227e-05, + "loss": 0.7327, + "step": 13314 + }, + { + "epoch": 2.370548433048433, + "grad_norm": 0.8579949736595154, + "learning_rate": 7.145578651616536e-05, + "loss": 0.8685, + "step": 13315 + }, + { + "epoch": 2.3707264957264957, + "grad_norm": 0.6088887453079224, + "learning_rate": 7.144237167527256e-05, + "loss": 0.7004, + "step": 13316 + }, + { + "epoch": 2.3709045584045585, + "grad_norm": 0.6400231719017029, + "learning_rate": 7.142895739390718e-05, + "loss": 0.7273, + "step": 13317 + }, + { + "epoch": 2.3710826210826212, + "grad_norm": 0.8680049180984497, + "learning_rate": 7.141554367233201e-05, + "loss": 0.7886, + "step": 13318 + }, + { + "epoch": 2.371260683760684, + "grad_norm": 0.8894832134246826, + "learning_rate": 7.140213051080991e-05, + "loss": 1.0597, + "step": 13319 + }, + { + 
"epoch": 2.3714387464387463, + "grad_norm": 0.7371698021888733, + "learning_rate": 7.138871790960365e-05, + "loss": 0.8344, + "step": 13320 + }, + { + "epoch": 2.371616809116809, + "grad_norm": 0.7396906018257141, + "learning_rate": 7.137530586897601e-05, + "loss": 0.7185, + "step": 13321 + }, + { + "epoch": 2.371794871794872, + "grad_norm": 0.7884365320205688, + "learning_rate": 7.136189438918978e-05, + "loss": 0.8311, + "step": 13322 + }, + { + "epoch": 2.3719729344729346, + "grad_norm": 0.8064826130867004, + "learning_rate": 7.13484834705078e-05, + "loss": 0.6933, + "step": 13323 + }, + { + "epoch": 2.3721509971509973, + "grad_norm": 0.8865584135055542, + "learning_rate": 7.13350731131927e-05, + "loss": 0.979, + "step": 13324 + }, + { + "epoch": 2.3723290598290596, + "grad_norm": 0.7782325148582458, + "learning_rate": 7.132166331750736e-05, + "loss": 0.8147, + "step": 13325 + }, + { + "epoch": 2.3725071225071224, + "grad_norm": 0.8515480160713196, + "learning_rate": 7.13082540837144e-05, + "loss": 0.7571, + "step": 13326 + }, + { + "epoch": 2.372685185185185, + "grad_norm": 0.8665108680725098, + "learning_rate": 7.129484541207662e-05, + "loss": 0.8171, + "step": 13327 + }, + { + "epoch": 2.372863247863248, + "grad_norm": 0.7640653252601624, + "learning_rate": 7.128143730285668e-05, + "loss": 0.7118, + "step": 13328 + }, + { + "epoch": 2.3730413105413106, + "grad_norm": 0.844083309173584, + "learning_rate": 7.126802975631735e-05, + "loss": 0.8394, + "step": 13329 + }, + { + "epoch": 2.3732193732193734, + "grad_norm": 0.8718371391296387, + "learning_rate": 7.12546227727213e-05, + "loss": 0.8729, + "step": 13330 + }, + { + "epoch": 2.373397435897436, + "grad_norm": 0.7254782319068909, + "learning_rate": 7.124121635233118e-05, + "loss": 0.8178, + "step": 13331 + }, + { + "epoch": 2.3735754985754984, + "grad_norm": 0.7211804389953613, + "learning_rate": 7.12278104954097e-05, + "loss": 0.9415, + "step": 13332 + }, + { + "epoch": 2.373753561253561, + "grad_norm": 
0.8538317680358887, + "learning_rate": 7.121440520221949e-05, + "loss": 0.8614, + "step": 13333 + }, + { + "epoch": 2.373931623931624, + "grad_norm": 0.8942680358886719, + "learning_rate": 7.120100047302324e-05, + "loss": 0.985, + "step": 13334 + }, + { + "epoch": 2.3741096866096867, + "grad_norm": 0.8282434344291687, + "learning_rate": 7.118759630808354e-05, + "loss": 0.94, + "step": 13335 + }, + { + "epoch": 2.3742877492877494, + "grad_norm": 0.8036409616470337, + "learning_rate": 7.117419270766308e-05, + "loss": 0.7145, + "step": 13336 + }, + { + "epoch": 2.3744658119658117, + "grad_norm": 0.9169675707817078, + "learning_rate": 7.116078967202437e-05, + "loss": 1.1078, + "step": 13337 + }, + { + "epoch": 2.3746438746438745, + "grad_norm": 0.7805418372154236, + "learning_rate": 7.114738720143011e-05, + "loss": 0.8216, + "step": 13338 + }, + { + "epoch": 2.3748219373219372, + "grad_norm": 1.0444506406784058, + "learning_rate": 7.113398529614285e-05, + "loss": 0.8153, + "step": 13339 + }, + { + "epoch": 2.375, + "grad_norm": 0.8254665732383728, + "learning_rate": 7.112058395642522e-05, + "loss": 0.8127, + "step": 13340 + }, + { + "epoch": 2.3751780626780628, + "grad_norm": 0.8327687382698059, + "learning_rate": 7.11071831825397e-05, + "loss": 0.7014, + "step": 13341 + }, + { + "epoch": 2.3753561253561255, + "grad_norm": 0.7473437786102295, + "learning_rate": 7.109378297474894e-05, + "loss": 0.7621, + "step": 13342 + }, + { + "epoch": 2.3755341880341883, + "grad_norm": 0.8537931442260742, + "learning_rate": 7.108038333331544e-05, + "loss": 0.9302, + "step": 13343 + }, + { + "epoch": 2.3757122507122506, + "grad_norm": 0.81959468126297, + "learning_rate": 7.106698425850178e-05, + "loss": 0.9157, + "step": 13344 + }, + { + "epoch": 2.3758903133903133, + "grad_norm": 0.769257128238678, + "learning_rate": 7.105358575057043e-05, + "loss": 0.8739, + "step": 13345 + }, + { + "epoch": 2.376068376068376, + "grad_norm": 0.7428072690963745, + "learning_rate": 
7.104018780978394e-05, + "loss": 0.7001, + "step": 13346 + }, + { + "epoch": 2.376246438746439, + "grad_norm": 0.8152543306350708, + "learning_rate": 7.102679043640481e-05, + "loss": 0.9866, + "step": 13347 + }, + { + "epoch": 2.3764245014245016, + "grad_norm": 0.8732424974441528, + "learning_rate": 7.101339363069556e-05, + "loss": 1.0207, + "step": 13348 + }, + { + "epoch": 2.376602564102564, + "grad_norm": 0.759279191493988, + "learning_rate": 7.099999739291862e-05, + "loss": 0.8703, + "step": 13349 + }, + { + "epoch": 2.3767806267806266, + "grad_norm": 0.8751664161682129, + "learning_rate": 7.098660172333648e-05, + "loss": 0.9805, + "step": 13350 + }, + { + "epoch": 2.3769586894586894, + "grad_norm": 0.9646390080451965, + "learning_rate": 7.097320662221168e-05, + "loss": 0.8623, + "step": 13351 + }, + { + "epoch": 2.377136752136752, + "grad_norm": 0.8626869320869446, + "learning_rate": 7.095981208980652e-05, + "loss": 0.7175, + "step": 13352 + }, + { + "epoch": 2.377314814814815, + "grad_norm": 0.8075738549232483, + "learning_rate": 7.094641812638354e-05, + "loss": 0.7741, + "step": 13353 + }, + { + "epoch": 2.3774928774928776, + "grad_norm": 0.7733559608459473, + "learning_rate": 7.093302473220513e-05, + "loss": 0.8553, + "step": 13354 + }, + { + "epoch": 2.3776709401709404, + "grad_norm": 0.7372797727584839, + "learning_rate": 7.091963190753376e-05, + "loss": 0.8554, + "step": 13355 + }, + { + "epoch": 2.3778490028490027, + "grad_norm": 0.804649293422699, + "learning_rate": 7.090623965263177e-05, + "loss": 0.8704, + "step": 13356 + }, + { + "epoch": 2.3780270655270654, + "grad_norm": 0.8370727300643921, + "learning_rate": 7.089284796776157e-05, + "loss": 0.9786, + "step": 13357 + }, + { + "epoch": 2.378205128205128, + "grad_norm": 0.7565299272537231, + "learning_rate": 7.087945685318554e-05, + "loss": 0.8096, + "step": 13358 + }, + { + "epoch": 2.378383190883191, + "grad_norm": 0.9046086072921753, + "learning_rate": 7.086606630916611e-05, + "loss": 0.8108, + 
"step": 13359 + }, + { + "epoch": 2.3785612535612537, + "grad_norm": 0.8453067541122437, + "learning_rate": 7.085267633596552e-05, + "loss": 0.8226, + "step": 13360 + }, + { + "epoch": 2.378739316239316, + "grad_norm": 0.8499273061752319, + "learning_rate": 7.083928693384628e-05, + "loss": 1.001, + "step": 13361 + }, + { + "epoch": 2.3789173789173788, + "grad_norm": 0.8358726501464844, + "learning_rate": 7.082589810307055e-05, + "loss": 0.7891, + "step": 13362 + }, + { + "epoch": 2.3790954415954415, + "grad_norm": 0.9156573414802551, + "learning_rate": 7.081250984390078e-05, + "loss": 0.9381, + "step": 13363 + }, + { + "epoch": 2.3792735042735043, + "grad_norm": 0.8704338669776917, + "learning_rate": 7.079912215659923e-05, + "loss": 0.9004, + "step": 13364 + }, + { + "epoch": 2.379451566951567, + "grad_norm": 0.8201949000358582, + "learning_rate": 7.078573504142824e-05, + "loss": 0.7501, + "step": 13365 + }, + { + "epoch": 2.3796296296296298, + "grad_norm": 0.9453420639038086, + "learning_rate": 7.077234849865008e-05, + "loss": 0.9658, + "step": 13366 + }, + { + "epoch": 2.3798076923076925, + "grad_norm": 0.8556796908378601, + "learning_rate": 7.075896252852703e-05, + "loss": 0.8054, + "step": 13367 + }, + { + "epoch": 2.379985754985755, + "grad_norm": 0.7961027026176453, + "learning_rate": 7.074557713132136e-05, + "loss": 0.8065, + "step": 13368 + }, + { + "epoch": 2.3801638176638176, + "grad_norm": 0.8777903318405151, + "learning_rate": 7.073219230729533e-05, + "loss": 0.9399, + "step": 13369 + }, + { + "epoch": 2.3803418803418803, + "grad_norm": 0.8569813370704651, + "learning_rate": 7.071880805671123e-05, + "loss": 0.9424, + "step": 13370 + }, + { + "epoch": 2.380519943019943, + "grad_norm": 0.8810455203056335, + "learning_rate": 7.070542437983123e-05, + "loss": 1.1313, + "step": 13371 + }, + { + "epoch": 2.380698005698006, + "grad_norm": 0.8691363334655762, + "learning_rate": 7.069204127691761e-05, + "loss": 0.9114, + "step": 13372 + }, + { + "epoch": 
2.380876068376068, + "grad_norm": 0.7922945618629456, + "learning_rate": 7.067865874823253e-05, + "loss": 0.9158, + "step": 13373 + }, + { + "epoch": 2.381054131054131, + "grad_norm": 0.7465389370918274, + "learning_rate": 7.066527679403825e-05, + "loss": 0.6597, + "step": 13374 + }, + { + "epoch": 2.3812321937321936, + "grad_norm": 0.8386009931564331, + "learning_rate": 7.065189541459689e-05, + "loss": 0.7194, + "step": 13375 + }, + { + "epoch": 2.3814102564102564, + "grad_norm": 0.8633689880371094, + "learning_rate": 7.063851461017073e-05, + "loss": 0.8877, + "step": 13376 + }, + { + "epoch": 2.381588319088319, + "grad_norm": 0.8689528107643127, + "learning_rate": 7.062513438102184e-05, + "loss": 0.8384, + "step": 13377 + }, + { + "epoch": 2.381766381766382, + "grad_norm": 0.7648544311523438, + "learning_rate": 7.061175472741243e-05, + "loss": 0.7669, + "step": 13378 + }, + { + "epoch": 2.3819444444444446, + "grad_norm": 0.8502510786056519, + "learning_rate": 7.059837564960465e-05, + "loss": 0.9379, + "step": 13379 + }, + { + "epoch": 2.382122507122507, + "grad_norm": 0.8277843594551086, + "learning_rate": 7.058499714786063e-05, + "loss": 0.7372, + "step": 13380 + }, + { + "epoch": 2.3823005698005697, + "grad_norm": 0.7394976615905762, + "learning_rate": 7.057161922244246e-05, + "loss": 0.7628, + "step": 13381 + }, + { + "epoch": 2.3824786324786325, + "grad_norm": 0.7906123399734497, + "learning_rate": 7.05582418736123e-05, + "loss": 0.7645, + "step": 13382 + }, + { + "epoch": 2.382656695156695, + "grad_norm": 0.7889885902404785, + "learning_rate": 7.054486510163221e-05, + "loss": 0.8316, + "step": 13383 + }, + { + "epoch": 2.382834757834758, + "grad_norm": 0.7983359098434448, + "learning_rate": 7.053148890676434e-05, + "loss": 0.7925, + "step": 13384 + }, + { + "epoch": 2.3830128205128207, + "grad_norm": 0.9067932963371277, + "learning_rate": 7.051811328927067e-05, + "loss": 0.9385, + "step": 13385 + }, + { + "epoch": 2.383190883190883, + "grad_norm": 
0.7210679650306702, + "learning_rate": 7.05047382494134e-05, + "loss": 0.591, + "step": 13386 + }, + { + "epoch": 2.3833689458689458, + "grad_norm": 0.9977821707725525, + "learning_rate": 7.049136378745445e-05, + "loss": 0.8362, + "step": 13387 + }, + { + "epoch": 2.3835470085470085, + "grad_norm": 0.9260198473930359, + "learning_rate": 7.047798990365595e-05, + "loss": 1.0051, + "step": 13388 + }, + { + "epoch": 2.3837250712250713, + "grad_norm": 0.8903454542160034, + "learning_rate": 7.04646165982799e-05, + "loss": 0.7055, + "step": 13389 + }, + { + "epoch": 2.383903133903134, + "grad_norm": 0.9634504914283752, + "learning_rate": 7.045124387158832e-05, + "loss": 0.7681, + "step": 13390 + }, + { + "epoch": 2.3840811965811968, + "grad_norm": 0.8645864129066467, + "learning_rate": 7.043787172384329e-05, + "loss": 0.9271, + "step": 13391 + }, + { + "epoch": 2.384259259259259, + "grad_norm": 0.8738446235656738, + "learning_rate": 7.04245001553067e-05, + "loss": 0.9, + "step": 13392 + }, + { + "epoch": 2.384437321937322, + "grad_norm": 0.7869822382926941, + "learning_rate": 7.041112916624062e-05, + "loss": 0.8639, + "step": 13393 + }, + { + "epoch": 2.3846153846153846, + "grad_norm": 0.8728111386299133, + "learning_rate": 7.039775875690698e-05, + "loss": 1.0367, + "step": 13394 + }, + { + "epoch": 2.3847934472934473, + "grad_norm": 0.7883852124214172, + "learning_rate": 7.03843889275678e-05, + "loss": 0.8338, + "step": 13395 + }, + { + "epoch": 2.38497150997151, + "grad_norm": 0.9267113208770752, + "learning_rate": 7.037101967848496e-05, + "loss": 0.8931, + "step": 13396 + }, + { + "epoch": 2.385149572649573, + "grad_norm": 0.8940320611000061, + "learning_rate": 7.035765100992048e-05, + "loss": 0.8071, + "step": 13397 + }, + { + "epoch": 2.385327635327635, + "grad_norm": 0.8109263777732849, + "learning_rate": 7.03442829221362e-05, + "loss": 0.8083, + "step": 13398 + }, + { + "epoch": 2.385505698005698, + "grad_norm": 0.8223438262939453, + "learning_rate": 
7.033091541539413e-05, + "loss": 0.9296, + "step": 13399 + }, + { + "epoch": 2.3856837606837606, + "grad_norm": 0.817894697189331, + "learning_rate": 7.031754848995612e-05, + "loss": 0.9168, + "step": 13400 + }, + { + "epoch": 2.3858618233618234, + "grad_norm": 0.831462562084198, + "learning_rate": 7.030418214608411e-05, + "loss": 0.8613, + "step": 13401 + }, + { + "epoch": 2.386039886039886, + "grad_norm": 0.8388770818710327, + "learning_rate": 7.029081638403994e-05, + "loss": 0.7477, + "step": 13402 + }, + { + "epoch": 2.386217948717949, + "grad_norm": 0.9557843804359436, + "learning_rate": 7.02774512040855e-05, + "loss": 0.8932, + "step": 13403 + }, + { + "epoch": 2.386396011396011, + "grad_norm": 0.8249707221984863, + "learning_rate": 7.026408660648268e-05, + "loss": 1.0301, + "step": 13404 + }, + { + "epoch": 2.386574074074074, + "grad_norm": 0.8355069160461426, + "learning_rate": 7.025072259149333e-05, + "loss": 0.8081, + "step": 13405 + }, + { + "epoch": 2.3867521367521367, + "grad_norm": 0.8373300433158875, + "learning_rate": 7.023735915937924e-05, + "loss": 0.9911, + "step": 13406 + }, + { + "epoch": 2.3869301994301995, + "grad_norm": 0.7177539467811584, + "learning_rate": 7.022399631040228e-05, + "loss": 0.6397, + "step": 13407 + }, + { + "epoch": 2.387108262108262, + "grad_norm": 0.7371904253959656, + "learning_rate": 7.021063404482426e-05, + "loss": 0.8634, + "step": 13408 + }, + { + "epoch": 2.387286324786325, + "grad_norm": 0.8919385671615601, + "learning_rate": 7.019727236290696e-05, + "loss": 0.9514, + "step": 13409 + }, + { + "epoch": 2.3874643874643873, + "grad_norm": 0.7673050761222839, + "learning_rate": 7.018391126491225e-05, + "loss": 0.8957, + "step": 13410 + }, + { + "epoch": 2.38764245014245, + "grad_norm": 0.8401889801025391, + "learning_rate": 7.01705507511018e-05, + "loss": 0.908, + "step": 13411 + }, + { + "epoch": 2.3878205128205128, + "grad_norm": 0.822903037071228, + "learning_rate": 7.01571908217375e-05, + "loss": 0.911, + "step": 
13412 + }, + { + "epoch": 2.3879985754985755, + "grad_norm": 0.9824740290641785, + "learning_rate": 7.014383147708102e-05, + "loss": 0.8314, + "step": 13413 + }, + { + "epoch": 2.3881766381766383, + "grad_norm": 0.9485064148902893, + "learning_rate": 7.013047271739414e-05, + "loss": 0.9819, + "step": 13414 + }, + { + "epoch": 2.388354700854701, + "grad_norm": 0.7565387487411499, + "learning_rate": 7.01171145429386e-05, + "loss": 0.9702, + "step": 13415 + }, + { + "epoch": 2.388532763532764, + "grad_norm": 0.8159620761871338, + "learning_rate": 7.010375695397615e-05, + "loss": 0.7302, + "step": 13416 + }, + { + "epoch": 2.388710826210826, + "grad_norm": 0.7818536162376404, + "learning_rate": 7.009039995076844e-05, + "loss": 0.6821, + "step": 13417 + }, + { + "epoch": 2.388888888888889, + "grad_norm": 0.7958348989486694, + "learning_rate": 7.007704353357724e-05, + "loss": 0.7996, + "step": 13418 + }, + { + "epoch": 2.3890669515669516, + "grad_norm": 0.8097305297851562, + "learning_rate": 7.006368770266421e-05, + "loss": 0.8177, + "step": 13419 + }, + { + "epoch": 2.3892450142450143, + "grad_norm": 0.9326507449150085, + "learning_rate": 7.005033245829105e-05, + "loss": 0.9307, + "step": 13420 + }, + { + "epoch": 2.389423076923077, + "grad_norm": 0.8954049944877625, + "learning_rate": 7.003697780071936e-05, + "loss": 0.8527, + "step": 13421 + }, + { + "epoch": 2.3896011396011394, + "grad_norm": 0.890548586845398, + "learning_rate": 7.00236237302109e-05, + "loss": 0.8203, + "step": 13422 + }, + { + "epoch": 2.389779202279202, + "grad_norm": 0.7508596181869507, + "learning_rate": 7.001027024702722e-05, + "loss": 0.7056, + "step": 13423 + }, + { + "epoch": 2.389957264957265, + "grad_norm": 0.9403550624847412, + "learning_rate": 6.999691735143002e-05, + "loss": 0.7336, + "step": 13424 + }, + { + "epoch": 2.3901353276353277, + "grad_norm": 0.8187662959098816, + "learning_rate": 6.998356504368087e-05, + "loss": 0.6897, + "step": 13425 + }, + { + "epoch": 2.3903133903133904, 
+ "grad_norm": 0.8584417104721069, + "learning_rate": 6.997021332404145e-05, + "loss": 0.9143, + "step": 13426 + }, + { + "epoch": 2.390491452991453, + "grad_norm": 0.8739892840385437, + "learning_rate": 6.995686219277329e-05, + "loss": 0.8028, + "step": 13427 + }, + { + "epoch": 2.390669515669516, + "grad_norm": 0.9291013479232788, + "learning_rate": 6.994351165013799e-05, + "loss": 1.0305, + "step": 13428 + }, + { + "epoch": 2.390847578347578, + "grad_norm": 0.7937391400337219, + "learning_rate": 6.993016169639719e-05, + "loss": 0.8326, + "step": 13429 + }, + { + "epoch": 2.391025641025641, + "grad_norm": 0.655261754989624, + "learning_rate": 6.991681233181236e-05, + "loss": 0.7939, + "step": 13430 + }, + { + "epoch": 2.3912037037037037, + "grad_norm": 0.9606142640113831, + "learning_rate": 6.990346355664515e-05, + "loss": 1.1344, + "step": 13431 + }, + { + "epoch": 2.3913817663817665, + "grad_norm": 0.8111617565155029, + "learning_rate": 6.9890115371157e-05, + "loss": 0.8398, + "step": 13432 + }, + { + "epoch": 2.3915598290598292, + "grad_norm": 0.8111898899078369, + "learning_rate": 6.987676777560955e-05, + "loss": 0.9189, + "step": 13433 + }, + { + "epoch": 2.3917378917378915, + "grad_norm": 0.7850473523139954, + "learning_rate": 6.98634207702642e-05, + "loss": 0.9563, + "step": 13434 + }, + { + "epoch": 2.3919159544159543, + "grad_norm": 0.7740257978439331, + "learning_rate": 6.985007435538256e-05, + "loss": 0.7446, + "step": 13435 + }, + { + "epoch": 2.392094017094017, + "grad_norm": 0.9354606866836548, + "learning_rate": 6.983672853122604e-05, + "loss": 0.879, + "step": 13436 + }, + { + "epoch": 2.39227207977208, + "grad_norm": 0.8909385800361633, + "learning_rate": 6.982338329805622e-05, + "loss": 0.9381, + "step": 13437 + }, + { + "epoch": 2.3924501424501425, + "grad_norm": 0.7748416066169739, + "learning_rate": 6.981003865613448e-05, + "loss": 0.7169, + "step": 13438 + }, + { + "epoch": 2.3926282051282053, + "grad_norm": 0.7357833981513977, + 
"learning_rate": 6.979669460572234e-05, + "loss": 0.669, + "step": 13439 + }, + { + "epoch": 2.392806267806268, + "grad_norm": 0.8370460271835327, + "learning_rate": 6.978335114708119e-05, + "loss": 0.6215, + "step": 13440 + }, + { + "epoch": 2.3929843304843303, + "grad_norm": 0.7578476071357727, + "learning_rate": 6.977000828047256e-05, + "loss": 0.871, + "step": 13441 + }, + { + "epoch": 2.393162393162393, + "grad_norm": 0.8111903071403503, + "learning_rate": 6.975666600615776e-05, + "loss": 0.7888, + "step": 13442 + }, + { + "epoch": 2.393340455840456, + "grad_norm": 0.9584433436393738, + "learning_rate": 6.974332432439831e-05, + "loss": 1.0011, + "step": 13443 + }, + { + "epoch": 2.3935185185185186, + "grad_norm": 0.9105294942855835, + "learning_rate": 6.972998323545555e-05, + "loss": 1.0832, + "step": 13444 + }, + { + "epoch": 2.3936965811965814, + "grad_norm": 0.7990328669548035, + "learning_rate": 6.971664273959089e-05, + "loss": 0.9561, + "step": 13445 + }, + { + "epoch": 2.3938746438746437, + "grad_norm": 0.8575631976127625, + "learning_rate": 6.970330283706569e-05, + "loss": 0.7965, + "step": 13446 + }, + { + "epoch": 2.3940527065527064, + "grad_norm": 0.8147784471511841, + "learning_rate": 6.968996352814139e-05, + "loss": 0.806, + "step": 13447 + }, + { + "epoch": 2.394230769230769, + "grad_norm": 0.8284323215484619, + "learning_rate": 6.967662481307923e-05, + "loss": 0.942, + "step": 13448 + }, + { + "epoch": 2.394408831908832, + "grad_norm": 0.8238104581832886, + "learning_rate": 6.966328669214062e-05, + "loss": 0.9163, + "step": 13449 + }, + { + "epoch": 2.3945868945868947, + "grad_norm": 0.8855763673782349, + "learning_rate": 6.964994916558692e-05, + "loss": 0.7683, + "step": 13450 + }, + { + "epoch": 2.3947649572649574, + "grad_norm": 1.02780020236969, + "learning_rate": 6.963661223367937e-05, + "loss": 0.904, + "step": 13451 + }, + { + "epoch": 2.39494301994302, + "grad_norm": 0.8001773953437805, + "learning_rate": 6.96232758966794e-05, + "loss": 
0.8459, + "step": 13452 + }, + { + "epoch": 2.3951210826210825, + "grad_norm": 0.755388617515564, + "learning_rate": 6.960994015484818e-05, + "loss": 0.7759, + "step": 13453 + }, + { + "epoch": 2.3952991452991452, + "grad_norm": 0.7774340510368347, + "learning_rate": 6.959660500844708e-05, + "loss": 0.7353, + "step": 13454 + }, + { + "epoch": 2.395477207977208, + "grad_norm": 0.8696026802062988, + "learning_rate": 6.958327045773733e-05, + "loss": 0.635, + "step": 13455 + }, + { + "epoch": 2.3956552706552707, + "grad_norm": 0.8419780731201172, + "learning_rate": 6.956993650298025e-05, + "loss": 0.8515, + "step": 13456 + }, + { + "epoch": 2.3958333333333335, + "grad_norm": 0.9125590324401855, + "learning_rate": 6.955660314443699e-05, + "loss": 0.9099, + "step": 13457 + }, + { + "epoch": 2.396011396011396, + "grad_norm": 0.6847489476203918, + "learning_rate": 6.954327038236891e-05, + "loss": 0.6652, + "step": 13458 + }, + { + "epoch": 2.3961894586894585, + "grad_norm": 0.8674905896186829, + "learning_rate": 6.952993821703713e-05, + "loss": 0.7049, + "step": 13459 + }, + { + "epoch": 2.3963675213675213, + "grad_norm": 0.7777035236358643, + "learning_rate": 6.951660664870296e-05, + "loss": 0.818, + "step": 13460 + }, + { + "epoch": 2.396545584045584, + "grad_norm": 0.8349783420562744, + "learning_rate": 6.950327567762751e-05, + "loss": 0.9203, + "step": 13461 + }, + { + "epoch": 2.396723646723647, + "grad_norm": 0.7589834332466125, + "learning_rate": 6.948994530407206e-05, + "loss": 1.015, + "step": 13462 + }, + { + "epoch": 2.3969017094017095, + "grad_norm": 0.9340610504150391, + "learning_rate": 6.947661552829773e-05, + "loss": 1.0575, + "step": 13463 + }, + { + "epoch": 2.3970797720797723, + "grad_norm": 0.9100959300994873, + "learning_rate": 6.946328635056573e-05, + "loss": 0.8824, + "step": 13464 + }, + { + "epoch": 2.3972578347578346, + "grad_norm": 0.8255945444107056, + "learning_rate": 6.944995777113717e-05, + "loss": 0.7701, + "step": 13465 + }, + { + "epoch": 
2.3974358974358974, + "grad_norm": 0.8572675585746765, + "learning_rate": 6.943662979027328e-05, + "loss": 0.9425, + "step": 13466 + }, + { + "epoch": 2.39761396011396, + "grad_norm": 0.8219536542892456, + "learning_rate": 6.94233024082351e-05, + "loss": 0.8184, + "step": 13467 + }, + { + "epoch": 2.397792022792023, + "grad_norm": 0.8260995149612427, + "learning_rate": 6.940997562528377e-05, + "loss": 0.8324, + "step": 13468 + }, + { + "epoch": 2.3979700854700856, + "grad_norm": 0.9707075357437134, + "learning_rate": 6.939664944168047e-05, + "loss": 0.9865, + "step": 13469 + }, + { + "epoch": 2.398148148148148, + "grad_norm": 0.9030438661575317, + "learning_rate": 6.938332385768622e-05, + "loss": 1.0244, + "step": 13470 + }, + { + "epoch": 2.3983262108262107, + "grad_norm": 0.8425108194351196, + "learning_rate": 6.936999887356214e-05, + "loss": 0.7053, + "step": 13471 + }, + { + "epoch": 2.3985042735042734, + "grad_norm": 1.0073270797729492, + "learning_rate": 6.93566744895693e-05, + "loss": 0.9324, + "step": 13472 + }, + { + "epoch": 2.398682336182336, + "grad_norm": 0.7647563219070435, + "learning_rate": 6.93433507059688e-05, + "loss": 0.7233, + "step": 13473 + }, + { + "epoch": 2.398860398860399, + "grad_norm": 0.7632454633712769, + "learning_rate": 6.933002752302162e-05, + "loss": 0.8678, + "step": 13474 + }, + { + "epoch": 2.3990384615384617, + "grad_norm": 0.7943702936172485, + "learning_rate": 6.931670494098887e-05, + "loss": 0.8805, + "step": 13475 + }, + { + "epoch": 2.3992165242165244, + "grad_norm": 0.9440419673919678, + "learning_rate": 6.930338296013153e-05, + "loss": 1.0103, + "step": 13476 + }, + { + "epoch": 2.3993945868945867, + "grad_norm": 0.9119253754615784, + "learning_rate": 6.929006158071065e-05, + "loss": 1.0235, + "step": 13477 + }, + { + "epoch": 2.3995726495726495, + "grad_norm": 0.7750248908996582, + "learning_rate": 6.927674080298721e-05, + "loss": 0.957, + "step": 13478 + }, + { + "epoch": 2.3997507122507122, + "grad_norm": 
0.8847192525863647, + "learning_rate": 6.926342062722223e-05, + "loss": 0.9066, + "step": 13479 + }, + { + "epoch": 2.399928774928775, + "grad_norm": 0.814396321773529, + "learning_rate": 6.925010105367665e-05, + "loss": 1.0001, + "step": 13480 + }, + { + "epoch": 2.4001068376068377, + "grad_norm": 0.8323664665222168, + "learning_rate": 6.923678208261147e-05, + "loss": 1.0027, + "step": 13481 + }, + { + "epoch": 2.4002849002849, + "grad_norm": 0.8351104259490967, + "learning_rate": 6.92234637142876e-05, + "loss": 0.8629, + "step": 13482 + }, + { + "epoch": 2.400462962962963, + "grad_norm": 0.9298360347747803, + "learning_rate": 6.92101459489661e-05, + "loss": 0.9161, + "step": 13483 + }, + { + "epoch": 2.4006410256410255, + "grad_norm": 0.9423344135284424, + "learning_rate": 6.919682878690777e-05, + "loss": 1.4416, + "step": 13484 + }, + { + "epoch": 2.4008190883190883, + "grad_norm": 0.8340599536895752, + "learning_rate": 6.918351222837363e-05, + "loss": 0.8696, + "step": 13485 + }, + { + "epoch": 2.400997150997151, + "grad_norm": 0.8533751368522644, + "learning_rate": 6.917019627362451e-05, + "loss": 1.1383, + "step": 13486 + }, + { + "epoch": 2.401175213675214, + "grad_norm": 0.8060563206672668, + "learning_rate": 6.91568809229214e-05, + "loss": 0.8544, + "step": 13487 + }, + { + "epoch": 2.4013532763532766, + "grad_norm": 0.865485668182373, + "learning_rate": 6.914356617652511e-05, + "loss": 0.9286, + "step": 13488 + }, + { + "epoch": 2.401531339031339, + "grad_norm": 0.8785045742988586, + "learning_rate": 6.913025203469652e-05, + "loss": 0.7339, + "step": 13489 + }, + { + "epoch": 2.4017094017094016, + "grad_norm": 0.7718466520309448, + "learning_rate": 6.911693849769654e-05, + "loss": 0.8821, + "step": 13490 + }, + { + "epoch": 2.4018874643874644, + "grad_norm": 0.7274343371391296, + "learning_rate": 6.910362556578599e-05, + "loss": 0.6179, + "step": 13491 + }, + { + "epoch": 2.402065527065527, + "grad_norm": 0.8848530054092407, + "learning_rate": 
6.909031323922574e-05, + "loss": 0.7848, + "step": 13492 + }, + { + "epoch": 2.40224358974359, + "grad_norm": 0.7384527325630188, + "learning_rate": 6.907700151827657e-05, + "loss": 0.5, + "step": 13493 + }, + { + "epoch": 2.402421652421652, + "grad_norm": 0.865505576133728, + "learning_rate": 6.906369040319936e-05, + "loss": 0.7127, + "step": 13494 + }, + { + "epoch": 2.402599715099715, + "grad_norm": 0.8588849902153015, + "learning_rate": 6.90503798942548e-05, + "loss": 0.8833, + "step": 13495 + }, + { + "epoch": 2.4027777777777777, + "grad_norm": 0.8570847511291504, + "learning_rate": 6.903706999170381e-05, + "loss": 0.9765, + "step": 13496 + }, + { + "epoch": 2.4029558404558404, + "grad_norm": 0.9193849563598633, + "learning_rate": 6.902376069580706e-05, + "loss": 0.8654, + "step": 13497 + }, + { + "epoch": 2.403133903133903, + "grad_norm": 0.8181582093238831, + "learning_rate": 6.901045200682545e-05, + "loss": 0.8815, + "step": 13498 + }, + { + "epoch": 2.403311965811966, + "grad_norm": 0.783163845539093, + "learning_rate": 6.89971439250196e-05, + "loss": 0.8383, + "step": 13499 + }, + { + "epoch": 2.4034900284900287, + "grad_norm": 1.0679216384887695, + "learning_rate": 6.898383645065032e-05, + "loss": 1.0525, + "step": 13500 + }, + { + "epoch": 2.403668091168091, + "grad_norm": 0.7945899367332458, + "learning_rate": 6.897052958397831e-05, + "loss": 1.0091, + "step": 13501 + }, + { + "epoch": 2.4038461538461537, + "grad_norm": 0.8310369253158569, + "learning_rate": 6.895722332526438e-05, + "loss": 0.8909, + "step": 13502 + }, + { + "epoch": 2.4040242165242165, + "grad_norm": 0.8811371922492981, + "learning_rate": 6.894391767476911e-05, + "loss": 0.8354, + "step": 13503 + }, + { + "epoch": 2.4042022792022792, + "grad_norm": 1.011495590209961, + "learning_rate": 6.893061263275332e-05, + "loss": 0.8846, + "step": 13504 + }, + { + "epoch": 2.404380341880342, + "grad_norm": 0.7587227821350098, + "learning_rate": 6.891730819947758e-05, + "loss": 0.8886, + "step": 
13505 + }, + { + "epoch": 2.4045584045584047, + "grad_norm": 0.8367353677749634, + "learning_rate": 6.890400437520265e-05, + "loss": 1.008, + "step": 13506 + }, + { + "epoch": 2.404736467236467, + "grad_norm": 0.7200010418891907, + "learning_rate": 6.889070116018911e-05, + "loss": 0.8405, + "step": 13507 + }, + { + "epoch": 2.40491452991453, + "grad_norm": 0.9391907453536987, + "learning_rate": 6.887739855469769e-05, + "loss": 0.8904, + "step": 13508 + }, + { + "epoch": 2.4050925925925926, + "grad_norm": 0.8687568306922913, + "learning_rate": 6.886409655898902e-05, + "loss": 0.7145, + "step": 13509 + }, + { + "epoch": 2.4052706552706553, + "grad_norm": 0.7382767796516418, + "learning_rate": 6.885079517332366e-05, + "loss": 0.7639, + "step": 13510 + }, + { + "epoch": 2.405448717948718, + "grad_norm": 0.8322962522506714, + "learning_rate": 6.883749439796227e-05, + "loss": 1.0002, + "step": 13511 + }, + { + "epoch": 2.405626780626781, + "grad_norm": 0.815183162689209, + "learning_rate": 6.882419423316544e-05, + "loss": 0.8628, + "step": 13512 + }, + { + "epoch": 2.405804843304843, + "grad_norm": 0.9304860234260559, + "learning_rate": 6.881089467919381e-05, + "loss": 0.9489, + "step": 13513 + }, + { + "epoch": 2.405982905982906, + "grad_norm": 0.9071274995803833, + "learning_rate": 6.879759573630784e-05, + "loss": 0.8117, + "step": 13514 + }, + { + "epoch": 2.4061609686609686, + "grad_norm": 0.9378795027732849, + "learning_rate": 6.878429740476822e-05, + "loss": 1.22, + "step": 13515 + }, + { + "epoch": 2.4063390313390314, + "grad_norm": 0.7354511618614197, + "learning_rate": 6.877099968483541e-05, + "loss": 0.6696, + "step": 13516 + }, + { + "epoch": 2.406517094017094, + "grad_norm": 0.8701893091201782, + "learning_rate": 6.875770257677002e-05, + "loss": 0.8691, + "step": 13517 + }, + { + "epoch": 2.406695156695157, + "grad_norm": 0.8819001913070679, + "learning_rate": 6.87444060808325e-05, + "loss": 0.7428, + "step": 13518 + }, + { + "epoch": 2.406873219373219, + 
"grad_norm": 0.7339609265327454, + "learning_rate": 6.873111019728347e-05, + "loss": 0.7959, + "step": 13519 + }, + { + "epoch": 2.407051282051282, + "grad_norm": 0.8365123867988586, + "learning_rate": 6.871781492638335e-05, + "loss": 0.8199, + "step": 13520 + }, + { + "epoch": 2.4072293447293447, + "grad_norm": 0.9667043685913086, + "learning_rate": 6.870452026839266e-05, + "loss": 0.8261, + "step": 13521 + }, + { + "epoch": 2.4074074074074074, + "grad_norm": 0.6979679465293884, + "learning_rate": 6.869122622357187e-05, + "loss": 0.5909, + "step": 13522 + }, + { + "epoch": 2.40758547008547, + "grad_norm": 0.7326778769493103, + "learning_rate": 6.867793279218152e-05, + "loss": 0.9297, + "step": 13523 + }, + { + "epoch": 2.407763532763533, + "grad_norm": 0.8808563351631165, + "learning_rate": 6.866463997448196e-05, + "loss": 0.7481, + "step": 13524 + }, + { + "epoch": 2.4079415954415953, + "grad_norm": 0.7830268740653992, + "learning_rate": 6.86513477707337e-05, + "loss": 0.7902, + "step": 13525 + }, + { + "epoch": 2.408119658119658, + "grad_norm": 0.9482602477073669, + "learning_rate": 6.863805618119713e-05, + "loss": 1.1541, + "step": 13526 + }, + { + "epoch": 2.4082977207977208, + "grad_norm": 0.8369114995002747, + "learning_rate": 6.862476520613276e-05, + "loss": 0.874, + "step": 13527 + }, + { + "epoch": 2.4084757834757835, + "grad_norm": 0.9107078909873962, + "learning_rate": 6.86114748458009e-05, + "loss": 0.9412, + "step": 13528 + }, + { + "epoch": 2.4086538461538463, + "grad_norm": 0.8086137771606445, + "learning_rate": 6.859818510046199e-05, + "loss": 0.8495, + "step": 13529 + }, + { + "epoch": 2.408831908831909, + "grad_norm": 0.8824704885482788, + "learning_rate": 6.858489597037646e-05, + "loss": 0.8967, + "step": 13530 + }, + { + "epoch": 2.4090099715099713, + "grad_norm": 0.8514662384986877, + "learning_rate": 6.857160745580455e-05, + "loss": 0.9171, + "step": 13531 + }, + { + "epoch": 2.409188034188034, + "grad_norm": 0.7788167595863342, + 
"learning_rate": 6.855831955700675e-05, + "loss": 0.904, + "step": 13532 + }, + { + "epoch": 2.409366096866097, + "grad_norm": 0.913113534450531, + "learning_rate": 6.854503227424337e-05, + "loss": 0.8696, + "step": 13533 + }, + { + "epoch": 2.4095441595441596, + "grad_norm": 0.8424487113952637, + "learning_rate": 6.853174560777475e-05, + "loss": 0.8388, + "step": 13534 + }, + { + "epoch": 2.4097222222222223, + "grad_norm": 0.8609711527824402, + "learning_rate": 6.851845955786116e-05, + "loss": 0.7142, + "step": 13535 + }, + { + "epoch": 2.409900284900285, + "grad_norm": 0.8141375184059143, + "learning_rate": 6.850517412476301e-05, + "loss": 0.7198, + "step": 13536 + }, + { + "epoch": 2.410078347578348, + "grad_norm": 0.8615440130233765, + "learning_rate": 6.84918893087405e-05, + "loss": 0.958, + "step": 13537 + }, + { + "epoch": 2.41025641025641, + "grad_norm": 0.7733060717582703, + "learning_rate": 6.847860511005401e-05, + "loss": 0.7639, + "step": 13538 + }, + { + "epoch": 2.410434472934473, + "grad_norm": 0.9519185423851013, + "learning_rate": 6.846532152896375e-05, + "loss": 0.8239, + "step": 13539 + }, + { + "epoch": 2.4106125356125356, + "grad_norm": 0.774053692817688, + "learning_rate": 6.845203856573002e-05, + "loss": 0.891, + "step": 13540 + }, + { + "epoch": 2.4107905982905984, + "grad_norm": 0.8791571259498596, + "learning_rate": 6.843875622061304e-05, + "loss": 1.0107, + "step": 13541 + }, + { + "epoch": 2.410968660968661, + "grad_norm": 0.9431949853897095, + "learning_rate": 6.842547449387309e-05, + "loss": 0.8575, + "step": 13542 + }, + { + "epoch": 2.4111467236467234, + "grad_norm": 1.0521612167358398, + "learning_rate": 6.841219338577034e-05, + "loss": 0.9446, + "step": 13543 + }, + { + "epoch": 2.411324786324786, + "grad_norm": 0.7592857480049133, + "learning_rate": 6.83989128965651e-05, + "loss": 1.0595, + "step": 13544 + }, + { + "epoch": 2.411502849002849, + "grad_norm": 0.9002043604850769, + "learning_rate": 6.838563302651747e-05, + "loss": 
0.9067, + "step": 13545 + }, + { + "epoch": 2.4116809116809117, + "grad_norm": 0.7144047021865845, + "learning_rate": 6.83723537758877e-05, + "loss": 0.6699, + "step": 13546 + }, + { + "epoch": 2.4118589743589745, + "grad_norm": 0.8226693868637085, + "learning_rate": 6.835907514493594e-05, + "loss": 0.987, + "step": 13547 + }, + { + "epoch": 2.412037037037037, + "grad_norm": 0.8507830500602722, + "learning_rate": 6.834579713392237e-05, + "loss": 0.7803, + "step": 13548 + }, + { + "epoch": 2.4122150997151, + "grad_norm": 0.727870762348175, + "learning_rate": 6.83325197431072e-05, + "loss": 0.8071, + "step": 13549 + }, + { + "epoch": 2.4123931623931623, + "grad_norm": 0.7601624727249146, + "learning_rate": 6.831924297275049e-05, + "loss": 0.7627, + "step": 13550 + }, + { + "epoch": 2.412571225071225, + "grad_norm": 0.8519877791404724, + "learning_rate": 6.830596682311243e-05, + "loss": 0.9271, + "step": 13551 + }, + { + "epoch": 2.4127492877492878, + "grad_norm": 1.0122307538986206, + "learning_rate": 6.829269129445307e-05, + "loss": 0.6424, + "step": 13552 + }, + { + "epoch": 2.4129273504273505, + "grad_norm": 0.8992687463760376, + "learning_rate": 6.827941638703258e-05, + "loss": 0.8034, + "step": 13553 + }, + { + "epoch": 2.4131054131054133, + "grad_norm": 0.7740746140480042, + "learning_rate": 6.826614210111102e-05, + "loss": 0.955, + "step": 13554 + }, + { + "epoch": 2.4132834757834756, + "grad_norm": 0.8176493048667908, + "learning_rate": 6.825286843694852e-05, + "loss": 0.7844, + "step": 13555 + }, + { + "epoch": 2.4134615384615383, + "grad_norm": 0.8112488985061646, + "learning_rate": 6.823959539480507e-05, + "loss": 0.8495, + "step": 13556 + }, + { + "epoch": 2.413639601139601, + "grad_norm": 0.8186960220336914, + "learning_rate": 6.822632297494078e-05, + "loss": 0.8922, + "step": 13557 + }, + { + "epoch": 2.413817663817664, + "grad_norm": 0.9498438835144043, + "learning_rate": 6.821305117761569e-05, + "loss": 0.8862, + "step": 13558 + }, + { + "epoch": 
2.4139957264957266, + "grad_norm": 0.8591099381446838, + "learning_rate": 6.819978000308987e-05, + "loss": 0.837, + "step": 13559 + }, + { + "epoch": 2.4141737891737893, + "grad_norm": 0.8130860328674316, + "learning_rate": 6.818650945162324e-05, + "loss": 1.0723, + "step": 13560 + }, + { + "epoch": 2.414351851851852, + "grad_norm": 0.8800109624862671, + "learning_rate": 6.81732395234759e-05, + "loss": 0.8067, + "step": 13561 + }, + { + "epoch": 2.4145299145299144, + "grad_norm": 0.7786064147949219, + "learning_rate": 6.81599702189078e-05, + "loss": 0.9, + "step": 13562 + }, + { + "epoch": 2.414707977207977, + "grad_norm": 0.8343027234077454, + "learning_rate": 6.814670153817898e-05, + "loss": 0.7487, + "step": 13563 + }, + { + "epoch": 2.41488603988604, + "grad_norm": 0.7904187440872192, + "learning_rate": 6.813343348154934e-05, + "loss": 0.7904, + "step": 13564 + }, + { + "epoch": 2.4150641025641026, + "grad_norm": 0.7609010934829712, + "learning_rate": 6.81201660492789e-05, + "loss": 0.6734, + "step": 13565 + }, + { + "epoch": 2.4152421652421654, + "grad_norm": 0.8402243256568909, + "learning_rate": 6.810689924162756e-05, + "loss": 0.9581, + "step": 13566 + }, + { + "epoch": 2.4154202279202277, + "grad_norm": 0.8557454943656921, + "learning_rate": 6.809363305885527e-05, + "loss": 0.8387, + "step": 13567 + }, + { + "epoch": 2.4155982905982905, + "grad_norm": 0.8983132243156433, + "learning_rate": 6.808036750122197e-05, + "loss": 0.832, + "step": 13568 + }, + { + "epoch": 2.415776353276353, + "grad_norm": 0.8552190065383911, + "learning_rate": 6.806710256898755e-05, + "loss": 0.9257, + "step": 13569 + }, + { + "epoch": 2.415954415954416, + "grad_norm": 1.0639078617095947, + "learning_rate": 6.805383826241197e-05, + "loss": 0.9743, + "step": 13570 + }, + { + "epoch": 2.4161324786324787, + "grad_norm": 0.7951667904853821, + "learning_rate": 6.804057458175501e-05, + "loss": 0.9326, + "step": 13571 + }, + { + "epoch": 2.4163105413105415, + "grad_norm": 
0.8652639985084534, + "learning_rate": 6.802731152727664e-05, + "loss": 1.0114, + "step": 13572 + }, + { + "epoch": 2.416488603988604, + "grad_norm": 0.8777487874031067, + "learning_rate": 6.801404909923664e-05, + "loss": 0.759, + "step": 13573 + }, + { + "epoch": 2.4166666666666665, + "grad_norm": 0.7922869920730591, + "learning_rate": 6.800078729789497e-05, + "loss": 0.8392, + "step": 13574 + }, + { + "epoch": 2.4168447293447293, + "grad_norm": 0.9189477562904358, + "learning_rate": 6.798752612351133e-05, + "loss": 0.9485, + "step": 13575 + }, + { + "epoch": 2.417022792022792, + "grad_norm": 0.8752175569534302, + "learning_rate": 6.797426557634567e-05, + "loss": 0.725, + "step": 13576 + }, + { + "epoch": 2.4172008547008548, + "grad_norm": 0.8646897077560425, + "learning_rate": 6.79610056566577e-05, + "loss": 0.9791, + "step": 13577 + }, + { + "epoch": 2.4173789173789175, + "grad_norm": 0.8749415278434753, + "learning_rate": 6.794774636470731e-05, + "loss": 0.9059, + "step": 13578 + }, + { + "epoch": 2.41755698005698, + "grad_norm": 0.9642252922058105, + "learning_rate": 6.793448770075422e-05, + "loss": 0.9972, + "step": 13579 + }, + { + "epoch": 2.4177350427350426, + "grad_norm": 0.8430541157722473, + "learning_rate": 6.792122966505827e-05, + "loss": 0.7126, + "step": 13580 + }, + { + "epoch": 2.4179131054131053, + "grad_norm": 0.8478374481201172, + "learning_rate": 6.790797225787913e-05, + "loss": 0.8995, + "step": 13581 + }, + { + "epoch": 2.418091168091168, + "grad_norm": 0.7008727788925171, + "learning_rate": 6.789471547947665e-05, + "loss": 0.6694, + "step": 13582 + }, + { + "epoch": 2.418269230769231, + "grad_norm": 0.873543918132782, + "learning_rate": 6.78814593301105e-05, + "loss": 1.0418, + "step": 13583 + }, + { + "epoch": 2.4184472934472936, + "grad_norm": 0.7208766341209412, + "learning_rate": 6.786820381004047e-05, + "loss": 0.8095, + "step": 13584 + }, + { + "epoch": 2.4186253561253563, + "grad_norm": 0.7272628545761108, + "learning_rate": 
6.78549489195262e-05, + "loss": 0.5801, + "step": 13585 + }, + { + "epoch": 2.4188034188034186, + "grad_norm": 0.7155343294143677, + "learning_rate": 6.784169465882747e-05, + "loss": 0.748, + "step": 13586 + }, + { + "epoch": 2.4189814814814814, + "grad_norm": 0.928404688835144, + "learning_rate": 6.78284410282039e-05, + "loss": 0.9175, + "step": 13587 + }, + { + "epoch": 2.419159544159544, + "grad_norm": 0.7239044308662415, + "learning_rate": 6.781518802791519e-05, + "loss": 0.5855, + "step": 13588 + }, + { + "epoch": 2.419337606837607, + "grad_norm": 0.8126311302185059, + "learning_rate": 6.780193565822104e-05, + "loss": 0.832, + "step": 13589 + }, + { + "epoch": 2.4195156695156697, + "grad_norm": 0.7470774054527283, + "learning_rate": 6.778868391938103e-05, + "loss": 0.6202, + "step": 13590 + }, + { + "epoch": 2.419693732193732, + "grad_norm": 0.9161462187767029, + "learning_rate": 6.77754328116549e-05, + "loss": 0.8674, + "step": 13591 + }, + { + "epoch": 2.4198717948717947, + "grad_norm": 0.7225745320320129, + "learning_rate": 6.77621823353022e-05, + "loss": 0.8745, + "step": 13592 + }, + { + "epoch": 2.4200498575498575, + "grad_norm": 0.8380082845687866, + "learning_rate": 6.774893249058257e-05, + "loss": 0.5501, + "step": 13593 + }, + { + "epoch": 2.42022792022792, + "grad_norm": 0.8031942844390869, + "learning_rate": 6.77356832777556e-05, + "loss": 0.9925, + "step": 13594 + }, + { + "epoch": 2.420405982905983, + "grad_norm": 0.8278502821922302, + "learning_rate": 6.772243469708093e-05, + "loss": 0.6411, + "step": 13595 + }, + { + "epoch": 2.4205840455840457, + "grad_norm": 0.7655481100082397, + "learning_rate": 6.770918674881805e-05, + "loss": 0.7896, + "step": 13596 + }, + { + "epoch": 2.4207621082621085, + "grad_norm": 0.8260186314582825, + "learning_rate": 6.769593943322661e-05, + "loss": 0.8531, + "step": 13597 + }, + { + "epoch": 2.4209401709401708, + "grad_norm": 0.8293251395225525, + "learning_rate": 6.76826927505661e-05, + "loss": 0.8193, + "step": 
13598 + }, + { + "epoch": 2.4211182336182335, + "grad_norm": 0.8868293762207031, + "learning_rate": 6.766944670109616e-05, + "loss": 0.8453, + "step": 13599 + }, + { + "epoch": 2.4212962962962963, + "grad_norm": 0.769124448299408, + "learning_rate": 6.765620128507619e-05, + "loss": 0.7412, + "step": 13600 + }, + { + "epoch": 2.421474358974359, + "grad_norm": 0.7727167010307312, + "learning_rate": 6.764295650276581e-05, + "loss": 0.8721, + "step": 13601 + }, + { + "epoch": 2.421652421652422, + "grad_norm": 0.9975818395614624, + "learning_rate": 6.762971235442444e-05, + "loss": 0.8128, + "step": 13602 + }, + { + "epoch": 2.421830484330484, + "grad_norm": 0.8000788688659668, + "learning_rate": 6.761646884031164e-05, + "loss": 0.8328, + "step": 13603 + }, + { + "epoch": 2.422008547008547, + "grad_norm": 0.7196731567382812, + "learning_rate": 6.760322596068684e-05, + "loss": 0.8912, + "step": 13604 + }, + { + "epoch": 2.4221866096866096, + "grad_norm": 0.8092321753501892, + "learning_rate": 6.758998371580955e-05, + "loss": 0.9066, + "step": 13605 + }, + { + "epoch": 2.4223646723646723, + "grad_norm": 0.7664031982421875, + "learning_rate": 6.757674210593918e-05, + "loss": 0.9854, + "step": 13606 + }, + { + "epoch": 2.422542735042735, + "grad_norm": 0.794507622718811, + "learning_rate": 6.75635011313352e-05, + "loss": 0.9296, + "step": 13607 + }, + { + "epoch": 2.422720797720798, + "grad_norm": 0.9127107858657837, + "learning_rate": 6.755026079225705e-05, + "loss": 0.9516, + "step": 13608 + }, + { + "epoch": 2.4228988603988606, + "grad_norm": 0.8025720715522766, + "learning_rate": 6.753702108896411e-05, + "loss": 1.0664, + "step": 13609 + }, + { + "epoch": 2.423076923076923, + "grad_norm": 0.7304871678352356, + "learning_rate": 6.752378202171585e-05, + "loss": 0.9071, + "step": 13610 + }, + { + "epoch": 2.4232549857549857, + "grad_norm": 0.9048241972923279, + "learning_rate": 6.751054359077157e-05, + "loss": 0.8649, + "step": 13611 + }, + { + "epoch": 2.4234330484330484, 
+ "grad_norm": 0.8589995503425598, + "learning_rate": 6.749730579639074e-05, + "loss": 0.8895, + "step": 13612 + }, + { + "epoch": 2.423611111111111, + "grad_norm": 0.8098960518836975, + "learning_rate": 6.748406863883265e-05, + "loss": 0.6651, + "step": 13613 + }, + { + "epoch": 2.423789173789174, + "grad_norm": 0.9875120520591736, + "learning_rate": 6.74708321183567e-05, + "loss": 0.9605, + "step": 13614 + }, + { + "epoch": 2.423967236467236, + "grad_norm": 0.8211431503295898, + "learning_rate": 6.74575962352222e-05, + "loss": 0.8721, + "step": 13615 + }, + { + "epoch": 2.424145299145299, + "grad_norm": 0.9732884168624878, + "learning_rate": 6.744436098968855e-05, + "loss": 0.7501, + "step": 13616 + }, + { + "epoch": 2.4243233618233617, + "grad_norm": 0.9813733696937561, + "learning_rate": 6.743112638201496e-05, + "loss": 0.9823, + "step": 13617 + }, + { + "epoch": 2.4245014245014245, + "grad_norm": 0.8075012564659119, + "learning_rate": 6.741789241246083e-05, + "loss": 0.7018, + "step": 13618 + }, + { + "epoch": 2.4246794871794872, + "grad_norm": 0.7845864295959473, + "learning_rate": 6.740465908128539e-05, + "loss": 0.7423, + "step": 13619 + }, + { + "epoch": 2.42485754985755, + "grad_norm": 0.7754862308502197, + "learning_rate": 6.739142638874799e-05, + "loss": 0.8735, + "step": 13620 + }, + { + "epoch": 2.4250356125356127, + "grad_norm": 0.7971537709236145, + "learning_rate": 6.737819433510781e-05, + "loss": 0.7663, + "step": 13621 + }, + { + "epoch": 2.425213675213675, + "grad_norm": 0.8043563365936279, + "learning_rate": 6.736496292062416e-05, + "loss": 0.9311, + "step": 13622 + }, + { + "epoch": 2.425391737891738, + "grad_norm": 0.8150136470794678, + "learning_rate": 6.735173214555628e-05, + "loss": 0.9164, + "step": 13623 + }, + { + "epoch": 2.4255698005698005, + "grad_norm": 0.9853758811950684, + "learning_rate": 6.733850201016338e-05, + "loss": 0.6253, + "step": 13624 + }, + { + "epoch": 2.4257478632478633, + "grad_norm": 1.2138506174087524, + 
"learning_rate": 6.732527251470465e-05, + "loss": 1.0536, + "step": 13625 + }, + { + "epoch": 2.425925925925926, + "grad_norm": 0.9306546449661255, + "learning_rate": 6.73120436594394e-05, + "loss": 0.8607, + "step": 13626 + }, + { + "epoch": 2.426103988603989, + "grad_norm": 0.8536837697029114, + "learning_rate": 6.729881544462668e-05, + "loss": 0.9418, + "step": 13627 + }, + { + "epoch": 2.426282051282051, + "grad_norm": 0.8561417460441589, + "learning_rate": 6.728558787052574e-05, + "loss": 0.9556, + "step": 13628 + }, + { + "epoch": 2.426460113960114, + "grad_norm": 0.7499847412109375, + "learning_rate": 6.727236093739579e-05, + "loss": 0.7795, + "step": 13629 + }, + { + "epoch": 2.4266381766381766, + "grad_norm": 0.8541018962860107, + "learning_rate": 6.725913464549591e-05, + "loss": 1.0322, + "step": 13630 + }, + { + "epoch": 2.4268162393162394, + "grad_norm": 0.9659489989280701, + "learning_rate": 6.724590899508532e-05, + "loss": 1.1907, + "step": 13631 + }, + { + "epoch": 2.426994301994302, + "grad_norm": 0.9548102617263794, + "learning_rate": 6.723268398642307e-05, + "loss": 1.0545, + "step": 13632 + }, + { + "epoch": 2.427172364672365, + "grad_norm": 0.8543868660926819, + "learning_rate": 6.72194596197683e-05, + "loss": 0.8041, + "step": 13633 + }, + { + "epoch": 2.427350427350427, + "grad_norm": 0.838178277015686, + "learning_rate": 6.720623589538013e-05, + "loss": 0.7772, + "step": 13634 + }, + { + "epoch": 2.42752849002849, + "grad_norm": 0.8207933306694031, + "learning_rate": 6.719301281351768e-05, + "loss": 0.9222, + "step": 13635 + }, + { + "epoch": 2.4277065527065527, + "grad_norm": 0.7705093026161194, + "learning_rate": 6.717979037443996e-05, + "loss": 0.8989, + "step": 13636 + }, + { + "epoch": 2.4278846153846154, + "grad_norm": 0.8627061247825623, + "learning_rate": 6.716656857840609e-05, + "loss": 0.8834, + "step": 13637 + }, + { + "epoch": 2.428062678062678, + "grad_norm": 0.8404269218444824, + "learning_rate": 6.715334742567507e-05, + "loss": 
0.8087, + "step": 13638 + }, + { + "epoch": 2.428240740740741, + "grad_norm": 0.9098958373069763, + "learning_rate": 6.7140126916506e-05, + "loss": 0.803, + "step": 13639 + }, + { + "epoch": 2.4284188034188032, + "grad_norm": 0.7482922673225403, + "learning_rate": 6.712690705115785e-05, + "loss": 0.8254, + "step": 13640 + }, + { + "epoch": 2.428596866096866, + "grad_norm": 0.8636375665664673, + "learning_rate": 6.711368782988972e-05, + "loss": 0.8788, + "step": 13641 + }, + { + "epoch": 2.4287749287749287, + "grad_norm": 0.8261808753013611, + "learning_rate": 6.710046925296052e-05, + "loss": 0.9135, + "step": 13642 + }, + { + "epoch": 2.4289529914529915, + "grad_norm": 1.060263752937317, + "learning_rate": 6.70872513206293e-05, + "loss": 0.9589, + "step": 13643 + }, + { + "epoch": 2.4291310541310542, + "grad_norm": 0.8128657341003418, + "learning_rate": 6.7074034033155e-05, + "loss": 0.7735, + "step": 13644 + }, + { + "epoch": 2.429309116809117, + "grad_norm": 0.9948938488960266, + "learning_rate": 6.706081739079663e-05, + "loss": 0.7242, + "step": 13645 + }, + { + "epoch": 2.4294871794871793, + "grad_norm": 0.8850025534629822, + "learning_rate": 6.704760139381311e-05, + "loss": 0.9393, + "step": 13646 + }, + { + "epoch": 2.429665242165242, + "grad_norm": 0.833534300327301, + "learning_rate": 6.703438604246337e-05, + "loss": 0.7824, + "step": 13647 + }, + { + "epoch": 2.429843304843305, + "grad_norm": 0.7362738251686096, + "learning_rate": 6.70211713370064e-05, + "loss": 0.7594, + "step": 13648 + }, + { + "epoch": 2.4300213675213675, + "grad_norm": 0.97635817527771, + "learning_rate": 6.700795727770101e-05, + "loss": 1.0097, + "step": 13649 + }, + { + "epoch": 2.4301994301994303, + "grad_norm": 0.8434939980506897, + "learning_rate": 6.699474386480622e-05, + "loss": 0.6639, + "step": 13650 + }, + { + "epoch": 2.430377492877493, + "grad_norm": 0.7960709929466248, + "learning_rate": 6.69815310985808e-05, + "loss": 0.9557, + "step": 13651 + }, + { + "epoch": 
2.4305555555555554, + "grad_norm": 0.8336359262466431, + "learning_rate": 6.696831897928376e-05, + "loss": 1.0112, + "step": 13652 + }, + { + "epoch": 2.430733618233618, + "grad_norm": 0.8353996872901917, + "learning_rate": 6.695510750717384e-05, + "loss": 0.8578, + "step": 13653 + }, + { + "epoch": 2.430911680911681, + "grad_norm": 0.8968163132667542, + "learning_rate": 6.694189668250996e-05, + "loss": 1.0412, + "step": 13654 + }, + { + "epoch": 2.4310897435897436, + "grad_norm": 0.8091850876808167, + "learning_rate": 6.692868650555093e-05, + "loss": 0.761, + "step": 13655 + }, + { + "epoch": 2.4312678062678064, + "grad_norm": 0.7735705375671387, + "learning_rate": 6.691547697655563e-05, + "loss": 1.0507, + "step": 13656 + }, + { + "epoch": 2.431445868945869, + "grad_norm": 0.7707101702690125, + "learning_rate": 6.690226809578279e-05, + "loss": 0.8883, + "step": 13657 + }, + { + "epoch": 2.431623931623932, + "grad_norm": 0.8384747505187988, + "learning_rate": 6.688905986349127e-05, + "loss": 0.7772, + "step": 13658 + }, + { + "epoch": 2.431801994301994, + "grad_norm": 0.7676185369491577, + "learning_rate": 6.687585227993985e-05, + "loss": 0.844, + "step": 13659 + }, + { + "epoch": 2.431980056980057, + "grad_norm": 0.8745819926261902, + "learning_rate": 6.686264534538726e-05, + "loss": 0.9996, + "step": 13660 + }, + { + "epoch": 2.4321581196581197, + "grad_norm": 0.7455142736434937, + "learning_rate": 6.684943906009232e-05, + "loss": 0.7133, + "step": 13661 + }, + { + "epoch": 2.4323361823361824, + "grad_norm": 0.8742238879203796, + "learning_rate": 6.683623342431378e-05, + "loss": 0.8155, + "step": 13662 + }, + { + "epoch": 2.432514245014245, + "grad_norm": 0.7863791584968567, + "learning_rate": 6.68230284383103e-05, + "loss": 1.0017, + "step": 13663 + }, + { + "epoch": 2.4326923076923075, + "grad_norm": 0.9469232559204102, + "learning_rate": 6.68098241023407e-05, + "loss": 0.8172, + "step": 13664 + }, + { + "epoch": 2.4328703703703702, + "grad_norm": 
0.808024525642395, + "learning_rate": 6.679662041666362e-05, + "loss": 0.8813, + "step": 13665 + }, + { + "epoch": 2.433048433048433, + "grad_norm": 0.834863543510437, + "learning_rate": 6.67834173815378e-05, + "loss": 0.9765, + "step": 13666 + }, + { + "epoch": 2.4332264957264957, + "grad_norm": 0.8903583288192749, + "learning_rate": 6.677021499722193e-05, + "loss": 0.9393, + "step": 13667 + }, + { + "epoch": 2.4334045584045585, + "grad_norm": 0.8341929912567139, + "learning_rate": 6.675701326397466e-05, + "loss": 0.8681, + "step": 13668 + }, + { + "epoch": 2.4335826210826212, + "grad_norm": 0.9348049163818359, + "learning_rate": 6.674381218205465e-05, + "loss": 0.7396, + "step": 13669 + }, + { + "epoch": 2.433760683760684, + "grad_norm": 0.8898159861564636, + "learning_rate": 6.673061175172055e-05, + "loss": 0.8638, + "step": 13670 + }, + { + "epoch": 2.4339387464387463, + "grad_norm": 0.8101391792297363, + "learning_rate": 6.671741197323105e-05, + "loss": 0.8064, + "step": 13671 + }, + { + "epoch": 2.434116809116809, + "grad_norm": 0.8756688237190247, + "learning_rate": 6.670421284684467e-05, + "loss": 0.7718, + "step": 13672 + }, + { + "epoch": 2.434294871794872, + "grad_norm": 0.8060923218727112, + "learning_rate": 6.669101437282012e-05, + "loss": 0.8137, + "step": 13673 + }, + { + "epoch": 2.4344729344729346, + "grad_norm": 0.792891800403595, + "learning_rate": 6.667781655141589e-05, + "loss": 0.9104, + "step": 13674 + }, + { + "epoch": 2.4346509971509973, + "grad_norm": 0.8590527772903442, + "learning_rate": 6.666461938289068e-05, + "loss": 0.9578, + "step": 13675 + }, + { + "epoch": 2.4348290598290596, + "grad_norm": 0.8593253493309021, + "learning_rate": 6.665142286750297e-05, + "loss": 0.8083, + "step": 13676 + }, + { + "epoch": 2.4350071225071224, + "grad_norm": 0.8237900733947754, + "learning_rate": 6.663822700551137e-05, + "loss": 0.8096, + "step": 13677 + }, + { + "epoch": 2.435185185185185, + "grad_norm": 0.9017227292060852, + "learning_rate": 
6.66250317971744e-05, + "loss": 0.9599, + "step": 13678 + }, + { + "epoch": 2.435363247863248, + "grad_norm": 0.7811765670776367, + "learning_rate": 6.661183724275061e-05, + "loss": 0.8392, + "step": 13679 + }, + { + "epoch": 2.4355413105413106, + "grad_norm": 0.8113176822662354, + "learning_rate": 6.659864334249848e-05, + "loss": 0.6788, + "step": 13680 + }, + { + "epoch": 2.4357193732193734, + "grad_norm": 0.795261561870575, + "learning_rate": 6.65854500966766e-05, + "loss": 0.783, + "step": 13681 + }, + { + "epoch": 2.435897435897436, + "grad_norm": 0.7738518714904785, + "learning_rate": 6.657225750554338e-05, + "loss": 0.676, + "step": 13682 + }, + { + "epoch": 2.4360754985754984, + "grad_norm": 0.8513518571853638, + "learning_rate": 6.655906556935737e-05, + "loss": 0.8743, + "step": 13683 + }, + { + "epoch": 2.436253561253561, + "grad_norm": 0.9595896005630493, + "learning_rate": 6.654587428837696e-05, + "loss": 0.9333, + "step": 13684 + }, + { + "epoch": 2.436431623931624, + "grad_norm": 0.7720373272895813, + "learning_rate": 6.653268366286066e-05, + "loss": 0.77, + "step": 13685 + }, + { + "epoch": 2.4366096866096867, + "grad_norm": 0.9022032022476196, + "learning_rate": 6.651949369306689e-05, + "loss": 0.8786, + "step": 13686 + }, + { + "epoch": 2.4367877492877494, + "grad_norm": 0.795092761516571, + "learning_rate": 6.650630437925409e-05, + "loss": 0.8557, + "step": 13687 + }, + { + "epoch": 2.4369658119658117, + "grad_norm": 0.8517789244651794, + "learning_rate": 6.649311572168072e-05, + "loss": 0.9781, + "step": 13688 + }, + { + "epoch": 2.4371438746438745, + "grad_norm": 1.0523463487625122, + "learning_rate": 6.64799277206051e-05, + "loss": 0.9046, + "step": 13689 + }, + { + "epoch": 2.4373219373219372, + "grad_norm": 0.9401832818984985, + "learning_rate": 6.646674037628568e-05, + "loss": 1.0966, + "step": 13690 + }, + { + "epoch": 2.4375, + "grad_norm": 0.9895738959312439, + "learning_rate": 6.645355368898082e-05, + "loss": 0.714, + "step": 13691 + }, 
+ { + "epoch": 2.4376780626780628, + "grad_norm": 0.8458610773086548, + "learning_rate": 6.644036765894892e-05, + "loss": 0.6265, + "step": 13692 + }, + { + "epoch": 2.4378561253561255, + "grad_norm": 0.7908345460891724, + "learning_rate": 6.642718228644826e-05, + "loss": 0.7943, + "step": 13693 + }, + { + "epoch": 2.4380341880341883, + "grad_norm": 0.8119938373565674, + "learning_rate": 6.641399757173725e-05, + "loss": 0.5948, + "step": 13694 + }, + { + "epoch": 2.4382122507122506, + "grad_norm": 0.8175633549690247, + "learning_rate": 6.640081351507417e-05, + "loss": 0.9098, + "step": 13695 + }, + { + "epoch": 2.4383903133903133, + "grad_norm": 0.8546686768531799, + "learning_rate": 6.638763011671736e-05, + "loss": 1.0347, + "step": 13696 + }, + { + "epoch": 2.438568376068376, + "grad_norm": 0.812406599521637, + "learning_rate": 6.637444737692508e-05, + "loss": 0.8469, + "step": 13697 + }, + { + "epoch": 2.438746438746439, + "grad_norm": 0.7802549004554749, + "learning_rate": 6.636126529595572e-05, + "loss": 0.7024, + "step": 13698 + }, + { + "epoch": 2.4389245014245016, + "grad_norm": 0.8046648502349854, + "learning_rate": 6.634808387406744e-05, + "loss": 0.8292, + "step": 13699 + }, + { + "epoch": 2.439102564102564, + "grad_norm": 0.8544600009918213, + "learning_rate": 6.633490311151857e-05, + "loss": 0.8033, + "step": 13700 + }, + { + "epoch": 2.4392806267806266, + "grad_norm": 0.8327271938323975, + "learning_rate": 6.632172300856731e-05, + "loss": 0.8641, + "step": 13701 + }, + { + "epoch": 2.4394586894586894, + "grad_norm": 0.9563352465629578, + "learning_rate": 6.630854356547199e-05, + "loss": 0.8144, + "step": 13702 + }, + { + "epoch": 2.439636752136752, + "grad_norm": 0.8993256092071533, + "learning_rate": 6.629536478249071e-05, + "loss": 0.8688, + "step": 13703 + }, + { + "epoch": 2.439814814814815, + "grad_norm": 0.8741861581802368, + "learning_rate": 6.628218665988178e-05, + "loss": 0.9868, + "step": 13704 + }, + { + "epoch": 2.4399928774928776, + 
"grad_norm": 0.7898648381233215, + "learning_rate": 6.626900919790332e-05, + "loss": 0.7802, + "step": 13705 + }, + { + "epoch": 2.4401709401709404, + "grad_norm": 0.7651925086975098, + "learning_rate": 6.625583239681357e-05, + "loss": 0.8131, + "step": 13706 + }, + { + "epoch": 2.4403490028490027, + "grad_norm": 0.7917741537094116, + "learning_rate": 6.624265625687071e-05, + "loss": 0.8581, + "step": 13707 + }, + { + "epoch": 2.4405270655270654, + "grad_norm": 0.7631075978279114, + "learning_rate": 6.622948077833284e-05, + "loss": 0.6069, + "step": 13708 + }, + { + "epoch": 2.440705128205128, + "grad_norm": 0.920765221118927, + "learning_rate": 6.621630596145819e-05, + "loss": 0.6846, + "step": 13709 + }, + { + "epoch": 2.440883190883191, + "grad_norm": 0.822335422039032, + "learning_rate": 6.62031318065048e-05, + "loss": 1.0309, + "step": 13710 + }, + { + "epoch": 2.4410612535612537, + "grad_norm": 0.7978029251098633, + "learning_rate": 6.618995831373086e-05, + "loss": 0.9593, + "step": 13711 + }, + { + "epoch": 2.441239316239316, + "grad_norm": 0.8908950686454773, + "learning_rate": 6.617678548339443e-05, + "loss": 0.7147, + "step": 13712 + }, + { + "epoch": 2.4414173789173788, + "grad_norm": 0.7772884368896484, + "learning_rate": 6.616361331575368e-05, + "loss": 0.8839, + "step": 13713 + }, + { + "epoch": 2.4415954415954415, + "grad_norm": 0.8437771797180176, + "learning_rate": 6.615044181106658e-05, + "loss": 0.8388, + "step": 13714 + }, + { + "epoch": 2.4417735042735043, + "grad_norm": 0.8549850583076477, + "learning_rate": 6.613727096959128e-05, + "loss": 0.9251, + "step": 13715 + }, + { + "epoch": 2.441951566951567, + "grad_norm": 0.8402581810951233, + "learning_rate": 6.612410079158579e-05, + "loss": 0.8607, + "step": 13716 + }, + { + "epoch": 2.4421296296296298, + "grad_norm": 0.8181160688400269, + "learning_rate": 6.611093127730821e-05, + "loss": 0.6082, + "step": 13717 + }, + { + "epoch": 2.4423076923076925, + "grad_norm": 0.9006236791610718, + 
"learning_rate": 6.609776242701651e-05, + "loss": 0.9091, + "step": 13718 + }, + { + "epoch": 2.442485754985755, + "grad_norm": 0.7759920358657837, + "learning_rate": 6.608459424096876e-05, + "loss": 0.842, + "step": 13719 + }, + { + "epoch": 2.4426638176638176, + "grad_norm": 0.825701117515564, + "learning_rate": 6.60714267194229e-05, + "loss": 0.9325, + "step": 13720 + }, + { + "epoch": 2.4428418803418803, + "grad_norm": 0.7646961212158203, + "learning_rate": 6.605825986263697e-05, + "loss": 0.8124, + "step": 13721 + }, + { + "epoch": 2.443019943019943, + "grad_norm": 0.896112322807312, + "learning_rate": 6.604509367086888e-05, + "loss": 1.0962, + "step": 13722 + }, + { + "epoch": 2.443198005698006, + "grad_norm": 0.8079821467399597, + "learning_rate": 6.603192814437672e-05, + "loss": 0.8195, + "step": 13723 + }, + { + "epoch": 2.443376068376068, + "grad_norm": 0.8901529908180237, + "learning_rate": 6.601876328341831e-05, + "loss": 0.7886, + "step": 13724 + }, + { + "epoch": 2.443554131054131, + "grad_norm": 1.0454550981521606, + "learning_rate": 6.600559908825168e-05, + "loss": 0.9642, + "step": 13725 + }, + { + "epoch": 2.4437321937321936, + "grad_norm": 0.7995026707649231, + "learning_rate": 6.599243555913469e-05, + "loss": 0.6927, + "step": 13726 + }, + { + "epoch": 2.4439102564102564, + "grad_norm": 0.9235756397247314, + "learning_rate": 6.597927269632526e-05, + "loss": 0.8986, + "step": 13727 + }, + { + "epoch": 2.444088319088319, + "grad_norm": 0.7869365215301514, + "learning_rate": 6.596611050008137e-05, + "loss": 0.7592, + "step": 13728 + }, + { + "epoch": 2.444266381766382, + "grad_norm": 0.8172873258590698, + "learning_rate": 6.595294897066081e-05, + "loss": 0.8048, + "step": 13729 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 0.8021790981292725, + "learning_rate": 6.593978810832152e-05, + "loss": 0.767, + "step": 13730 + }, + { + "epoch": 2.444622507122507, + "grad_norm": 0.7781784534454346, + "learning_rate": 6.592662791332129e-05, + "loss": 
0.8364, + "step": 13731 + }, + { + "epoch": 2.4448005698005697, + "grad_norm": 0.8227871656417847, + "learning_rate": 6.591346838591803e-05, + "loss": 0.8967, + "step": 13732 + }, + { + "epoch": 2.4449786324786325, + "grad_norm": 0.8349295854568481, + "learning_rate": 6.590030952636952e-05, + "loss": 0.8593, + "step": 13733 + }, + { + "epoch": 2.445156695156695, + "grad_norm": 1.0261762142181396, + "learning_rate": 6.588715133493365e-05, + "loss": 0.701, + "step": 13734 + }, + { + "epoch": 2.445334757834758, + "grad_norm": 0.8612635731697083, + "learning_rate": 6.587399381186814e-05, + "loss": 0.9803, + "step": 13735 + }, + { + "epoch": 2.4455128205128207, + "grad_norm": 0.7890039682388306, + "learning_rate": 6.586083695743086e-05, + "loss": 0.8001, + "step": 13736 + }, + { + "epoch": 2.445690883190883, + "grad_norm": 0.9934018850326538, + "learning_rate": 6.584768077187955e-05, + "loss": 1.0089, + "step": 13737 + }, + { + "epoch": 2.4458689458689458, + "grad_norm": 0.8232909440994263, + "learning_rate": 6.583452525547202e-05, + "loss": 0.8088, + "step": 13738 + }, + { + "epoch": 2.4460470085470085, + "grad_norm": 0.8635872006416321, + "learning_rate": 6.582137040846595e-05, + "loss": 1.0007, + "step": 13739 + }, + { + "epoch": 2.4462250712250713, + "grad_norm": 0.905575692653656, + "learning_rate": 6.580821623111914e-05, + "loss": 0.7577, + "step": 13740 + }, + { + "epoch": 2.446403133903134, + "grad_norm": 0.8264324069023132, + "learning_rate": 6.579506272368931e-05, + "loss": 0.9337, + "step": 13741 + }, + { + "epoch": 2.4465811965811968, + "grad_norm": 0.9100900292396545, + "learning_rate": 6.57819098864342e-05, + "loss": 0.8115, + "step": 13742 + }, + { + "epoch": 2.446759259259259, + "grad_norm": 0.8536351919174194, + "learning_rate": 6.576875771961145e-05, + "loss": 0.8612, + "step": 13743 + }, + { + "epoch": 2.446937321937322, + "grad_norm": 0.8968019485473633, + "learning_rate": 6.57556062234788e-05, + "loss": 0.8798, + "step": 13744 + }, + { + "epoch": 
2.4471153846153846, + "grad_norm": 0.8745046854019165, + "learning_rate": 6.574245539829389e-05, + "loss": 0.8992, + "step": 13745 + }, + { + "epoch": 2.4472934472934473, + "grad_norm": 0.8336703777313232, + "learning_rate": 6.57293052443144e-05, + "loss": 0.7947, + "step": 13746 + }, + { + "epoch": 2.44747150997151, + "grad_norm": 0.8544902801513672, + "learning_rate": 6.571615576179801e-05, + "loss": 1.0315, + "step": 13747 + }, + { + "epoch": 2.447649572649573, + "grad_norm": 0.848242461681366, + "learning_rate": 6.570300695100229e-05, + "loss": 0.853, + "step": 13748 + }, + { + "epoch": 2.447827635327635, + "grad_norm": 0.7753778100013733, + "learning_rate": 6.568985881218496e-05, + "loss": 0.934, + "step": 13749 + }, + { + "epoch": 2.448005698005698, + "grad_norm": 0.8294853568077087, + "learning_rate": 6.567671134560351e-05, + "loss": 0.9328, + "step": 13750 + }, + { + "epoch": 2.4481837606837606, + "grad_norm": 0.8720992803573608, + "learning_rate": 6.566356455151565e-05, + "loss": 0.875, + "step": 13751 + }, + { + "epoch": 2.4483618233618234, + "grad_norm": 0.8204464316368103, + "learning_rate": 6.565041843017888e-05, + "loss": 0.923, + "step": 13752 + }, + { + "epoch": 2.448539886039886, + "grad_norm": 0.940037190914154, + "learning_rate": 6.563727298185085e-05, + "loss": 1.1596, + "step": 13753 + }, + { + "epoch": 2.448717948717949, + "grad_norm": 0.8390263915061951, + "learning_rate": 6.562412820678902e-05, + "loss": 0.8256, + "step": 13754 + }, + { + "epoch": 2.448896011396011, + "grad_norm": 0.8572748303413391, + "learning_rate": 6.561098410525106e-05, + "loss": 0.7833, + "step": 13755 + }, + { + "epoch": 2.449074074074074, + "grad_norm": 0.7981020212173462, + "learning_rate": 6.559784067749436e-05, + "loss": 0.7609, + "step": 13756 + }, + { + "epoch": 2.4492521367521367, + "grad_norm": 0.8356930613517761, + "learning_rate": 6.558469792377653e-05, + "loss": 0.7542, + "step": 13757 + }, + { + "epoch": 2.4494301994301995, + "grad_norm": 
0.9340906739234924, + "learning_rate": 6.557155584435504e-05, + "loss": 0.9898, + "step": 13758 + }, + { + "epoch": 2.449608262108262, + "grad_norm": 1.0551100969314575, + "learning_rate": 6.555841443948743e-05, + "loss": 0.7189, + "step": 13759 + }, + { + "epoch": 2.449786324786325, + "grad_norm": 0.9572125673294067, + "learning_rate": 6.554527370943111e-05, + "loss": 0.8878, + "step": 13760 + }, + { + "epoch": 2.4499643874643873, + "grad_norm": 0.8760324716567993, + "learning_rate": 6.55321336544436e-05, + "loss": 0.75, + "step": 13761 + }, + { + "epoch": 2.45014245014245, + "grad_norm": 0.7599226236343384, + "learning_rate": 6.55189942747823e-05, + "loss": 0.9222, + "step": 13762 + }, + { + "epoch": 2.4503205128205128, + "grad_norm": 0.7307319045066833, + "learning_rate": 6.550585557070473e-05, + "loss": 0.833, + "step": 13763 + }, + { + "epoch": 2.4504985754985755, + "grad_norm": 0.8022613525390625, + "learning_rate": 6.549271754246822e-05, + "loss": 0.9439, + "step": 13764 + }, + { + "epoch": 2.4506766381766383, + "grad_norm": 0.7447740435600281, + "learning_rate": 6.547958019033024e-05, + "loss": 0.7803, + "step": 13765 + }, + { + "epoch": 2.450854700854701, + "grad_norm": 0.9021183252334595, + "learning_rate": 6.546644351454818e-05, + "loss": 0.8373, + "step": 13766 + }, + { + "epoch": 2.451032763532764, + "grad_norm": 0.8230152726173401, + "learning_rate": 6.545330751537941e-05, + "loss": 0.7023, + "step": 13767 + }, + { + "epoch": 2.451210826210826, + "grad_norm": 0.9581316113471985, + "learning_rate": 6.544017219308132e-05, + "loss": 1.0024, + "step": 13768 + }, + { + "epoch": 2.451388888888889, + "grad_norm": 0.7969945073127747, + "learning_rate": 6.542703754791127e-05, + "loss": 0.8996, + "step": 13769 + }, + { + "epoch": 2.4515669515669516, + "grad_norm": 0.864604115486145, + "learning_rate": 6.54139035801266e-05, + "loss": 0.9325, + "step": 13770 + }, + { + "epoch": 2.4517450142450143, + "grad_norm": 0.8156671524047852, + "learning_rate": 
6.540077028998463e-05, + "loss": 0.7926, + "step": 13771 + }, + { + "epoch": 2.451923076923077, + "grad_norm": 0.8704202175140381, + "learning_rate": 6.538763767774272e-05, + "loss": 0.8855, + "step": 13772 + }, + { + "epoch": 2.4521011396011394, + "grad_norm": 0.7533015012741089, + "learning_rate": 6.537450574365811e-05, + "loss": 0.9322, + "step": 13773 + }, + { + "epoch": 2.452279202279202, + "grad_norm": 0.8272553086280823, + "learning_rate": 6.536137448798819e-05, + "loss": 0.7474, + "step": 13774 + }, + { + "epoch": 2.452457264957265, + "grad_norm": 0.7788257598876953, + "learning_rate": 6.534824391099013e-05, + "loss": 0.7163, + "step": 13775 + }, + { + "epoch": 2.4526353276353277, + "grad_norm": 0.8309275507926941, + "learning_rate": 6.533511401292125e-05, + "loss": 1.1595, + "step": 13776 + }, + { + "epoch": 2.4528133903133904, + "grad_norm": 0.8369085788726807, + "learning_rate": 6.53219847940388e-05, + "loss": 0.7211, + "step": 13777 + }, + { + "epoch": 2.452991452991453, + "grad_norm": 0.8571248054504395, + "learning_rate": 6.530885625460007e-05, + "loss": 0.729, + "step": 13778 + }, + { + "epoch": 2.453169515669516, + "grad_norm": 0.7579928040504456, + "learning_rate": 6.529572839486217e-05, + "loss": 0.8799, + "step": 13779 + }, + { + "epoch": 2.453347578347578, + "grad_norm": 0.822463870048523, + "learning_rate": 6.528260121508245e-05, + "loss": 0.7948, + "step": 13780 + }, + { + "epoch": 2.453525641025641, + "grad_norm": 0.7910317778587341, + "learning_rate": 6.526947471551798e-05, + "loss": 0.7727, + "step": 13781 + }, + { + "epoch": 2.4537037037037037, + "grad_norm": 0.9321692585945129, + "learning_rate": 6.525634889642605e-05, + "loss": 0.8754, + "step": 13782 + }, + { + "epoch": 2.4538817663817665, + "grad_norm": 1.0130813121795654, + "learning_rate": 6.524322375806374e-05, + "loss": 0.7845, + "step": 13783 + }, + { + "epoch": 2.4540598290598292, + "grad_norm": 0.7254214882850647, + "learning_rate": 6.52300993006883e-05, + "loss": 0.4665, + 
"step": 13784 + }, + { + "epoch": 2.4542378917378915, + "grad_norm": 0.7874964475631714, + "learning_rate": 6.521697552455683e-05, + "loss": 0.8535, + "step": 13785 + }, + { + "epoch": 2.4544159544159543, + "grad_norm": 0.8275010585784912, + "learning_rate": 6.520385242992644e-05, + "loss": 0.7744, + "step": 13786 + }, + { + "epoch": 2.454594017094017, + "grad_norm": 0.7972453236579895, + "learning_rate": 6.519073001705431e-05, + "loss": 0.9494, + "step": 13787 + }, + { + "epoch": 2.45477207977208, + "grad_norm": 0.8763988018035889, + "learning_rate": 6.517760828619748e-05, + "loss": 0.8043, + "step": 13788 + }, + { + "epoch": 2.4549501424501425, + "grad_norm": 0.7948910593986511, + "learning_rate": 6.516448723761315e-05, + "loss": 0.7218, + "step": 13789 + }, + { + "epoch": 2.4551282051282053, + "grad_norm": 0.9416671395301819, + "learning_rate": 6.515136687155825e-05, + "loss": 0.7866, + "step": 13790 + }, + { + "epoch": 2.455306267806268, + "grad_norm": 0.8702704906463623, + "learning_rate": 6.513824718828999e-05, + "loss": 1.1579, + "step": 13791 + }, + { + "epoch": 2.4554843304843303, + "grad_norm": 0.8148752450942993, + "learning_rate": 6.51251281880653e-05, + "loss": 0.897, + "step": 13792 + }, + { + "epoch": 2.455662393162393, + "grad_norm": 0.8088299036026001, + "learning_rate": 6.511200987114132e-05, + "loss": 0.939, + "step": 13793 + }, + { + "epoch": 2.455840455840456, + "grad_norm": 0.9836809635162354, + "learning_rate": 6.509889223777499e-05, + "loss": 0.8841, + "step": 13794 + }, + { + "epoch": 2.4560185185185186, + "grad_norm": 0.7677251696586609, + "learning_rate": 6.508577528822342e-05, + "loss": 0.7816, + "step": 13795 + }, + { + "epoch": 2.4561965811965814, + "grad_norm": 0.835421085357666, + "learning_rate": 6.507265902274351e-05, + "loss": 0.9302, + "step": 13796 + }, + { + "epoch": 2.4563746438746437, + "grad_norm": 0.8892473578453064, + "learning_rate": 6.50595434415923e-05, + "loss": 0.8281, + "step": 13797 + }, + { + "epoch": 
2.4565527065527064, + "grad_norm": 0.810459315776825, + "learning_rate": 6.504642854502676e-05, + "loss": 0.81, + "step": 13798 + }, + { + "epoch": 2.456730769230769, + "grad_norm": 0.9277065992355347, + "learning_rate": 6.503331433330386e-05, + "loss": 0.7956, + "step": 13799 + }, + { + "epoch": 2.456908831908832, + "grad_norm": 0.861725389957428, + "learning_rate": 6.502020080668051e-05, + "loss": 0.7788, + "step": 13800 + }, + { + "epoch": 2.4570868945868947, + "grad_norm": 1.0000818967819214, + "learning_rate": 6.500708796541366e-05, + "loss": 0.9197, + "step": 13801 + }, + { + "epoch": 2.4572649572649574, + "grad_norm": 0.920998215675354, + "learning_rate": 6.499397580976024e-05, + "loss": 0.7816, + "step": 13802 + }, + { + "epoch": 2.45744301994302, + "grad_norm": 0.7574821710586548, + "learning_rate": 6.498086433997715e-05, + "loss": 0.8982, + "step": 13803 + }, + { + "epoch": 2.4576210826210825, + "grad_norm": 1.026700496673584, + "learning_rate": 6.496775355632125e-05, + "loss": 1.131, + "step": 13804 + }, + { + "epoch": 2.4577991452991452, + "grad_norm": 0.7532633543014526, + "learning_rate": 6.495464345904945e-05, + "loss": 0.7998, + "step": 13805 + }, + { + "epoch": 2.457977207977208, + "grad_norm": 0.7380105257034302, + "learning_rate": 6.494153404841865e-05, + "loss": 0.7656, + "step": 13806 + }, + { + "epoch": 2.4581552706552707, + "grad_norm": 0.7933080792427063, + "learning_rate": 6.492842532468561e-05, + "loss": 0.7419, + "step": 13807 + }, + { + "epoch": 2.4583333333333335, + "grad_norm": 0.7731907963752747, + "learning_rate": 6.491531728810724e-05, + "loss": 0.8334, + "step": 13808 + }, + { + "epoch": 2.458511396011396, + "grad_norm": 0.7368177771568298, + "learning_rate": 6.490220993894035e-05, + "loss": 0.6184, + "step": 13809 + }, + { + "epoch": 2.4586894586894585, + "grad_norm": 0.8381120562553406, + "learning_rate": 6.488910327744178e-05, + "loss": 0.7875, + "step": 13810 + }, + { + "epoch": 2.4588675213675213, + "grad_norm": 
0.910142183303833, + "learning_rate": 6.487599730386824e-05, + "loss": 0.8216, + "step": 13811 + }, + { + "epoch": 2.459045584045584, + "grad_norm": 0.9005017876625061, + "learning_rate": 6.48628920184766e-05, + "loss": 0.8928, + "step": 13812 + }, + { + "epoch": 2.459223646723647, + "grad_norm": 0.8437321782112122, + "learning_rate": 6.484978742152358e-05, + "loss": 0.9243, + "step": 13813 + }, + { + "epoch": 2.4594017094017095, + "grad_norm": 0.9145610928535461, + "learning_rate": 6.483668351326599e-05, + "loss": 0.9759, + "step": 13814 + }, + { + "epoch": 2.4595797720797723, + "grad_norm": 0.8391930460929871, + "learning_rate": 6.48235802939605e-05, + "loss": 0.8021, + "step": 13815 + }, + { + "epoch": 2.4597578347578346, + "grad_norm": 0.8035653233528137, + "learning_rate": 6.481047776386394e-05, + "loss": 0.8622, + "step": 13816 + }, + { + "epoch": 2.4599358974358974, + "grad_norm": 0.7238573431968689, + "learning_rate": 6.479737592323291e-05, + "loss": 0.6864, + "step": 13817 + }, + { + "epoch": 2.46011396011396, + "grad_norm": 0.8629193902015686, + "learning_rate": 6.47842747723242e-05, + "loss": 0.6107, + "step": 13818 + }, + { + "epoch": 2.460292022792023, + "grad_norm": 0.9015333652496338, + "learning_rate": 6.477117431139444e-05, + "loss": 0.845, + "step": 13819 + }, + { + "epoch": 2.4604700854700856, + "grad_norm": 0.8544989824295044, + "learning_rate": 6.47580745407004e-05, + "loss": 0.7103, + "step": 13820 + }, + { + "epoch": 2.460648148148148, + "grad_norm": 0.7665401101112366, + "learning_rate": 6.474497546049862e-05, + "loss": 0.6864, + "step": 13821 + }, + { + "epoch": 2.4608262108262107, + "grad_norm": 0.8640002012252808, + "learning_rate": 6.473187707104584e-05, + "loss": 0.953, + "step": 13822 + }, + { + "epoch": 2.4610042735042734, + "grad_norm": 0.8563477396965027, + "learning_rate": 6.471877937259864e-05, + "loss": 0.7776, + "step": 13823 + }, + { + "epoch": 2.461182336182336, + "grad_norm": 0.8089157938957214, + "learning_rate": 
6.470568236541371e-05, + "loss": 0.8273, + "step": 13824 + }, + { + "epoch": 2.461360398860399, + "grad_norm": 0.8710005283355713, + "learning_rate": 6.469258604974757e-05, + "loss": 0.8827, + "step": 13825 + }, + { + "epoch": 2.4615384615384617, + "grad_norm": 0.7780489325523376, + "learning_rate": 6.467949042585688e-05, + "loss": 0.8035, + "step": 13826 + }, + { + "epoch": 2.4617165242165244, + "grad_norm": 1.010976791381836, + "learning_rate": 6.466639549399822e-05, + "loss": 0.7442, + "step": 13827 + }, + { + "epoch": 2.4618945868945867, + "grad_norm": 0.714077889919281, + "learning_rate": 6.465330125442812e-05, + "loss": 0.694, + "step": 13828 + }, + { + "epoch": 2.4620726495726495, + "grad_norm": 0.7408512234687805, + "learning_rate": 6.464020770740316e-05, + "loss": 0.6709, + "step": 13829 + }, + { + "epoch": 2.4622507122507122, + "grad_norm": 0.8433945178985596, + "learning_rate": 6.462711485317987e-05, + "loss": 0.9127, + "step": 13830 + }, + { + "epoch": 2.462428774928775, + "grad_norm": 1.031745195388794, + "learning_rate": 6.461402269201481e-05, + "loss": 0.9105, + "step": 13831 + }, + { + "epoch": 2.4626068376068377, + "grad_norm": 0.8884360790252686, + "learning_rate": 6.460093122416444e-05, + "loss": 0.9354, + "step": 13832 + }, + { + "epoch": 2.4627849002849, + "grad_norm": 0.8466372489929199, + "learning_rate": 6.45878404498853e-05, + "loss": 0.9554, + "step": 13833 + }, + { + "epoch": 2.462962962962963, + "grad_norm": 0.9026118516921997, + "learning_rate": 6.457475036943386e-05, + "loss": 0.928, + "step": 13834 + }, + { + "epoch": 2.4631410256410255, + "grad_norm": 0.9034590721130371, + "learning_rate": 6.456166098306661e-05, + "loss": 0.7694, + "step": 13835 + }, + { + "epoch": 2.4633190883190883, + "grad_norm": 0.8369483947753906, + "learning_rate": 6.454857229103998e-05, + "loss": 0.928, + "step": 13836 + }, + { + "epoch": 2.463497150997151, + "grad_norm": 0.8670645356178284, + "learning_rate": 6.453548429361045e-05, + "loss": 0.971, + "step": 
13837 + }, + { + "epoch": 2.463675213675214, + "grad_norm": 0.8415539860725403, + "learning_rate": 6.452239699103442e-05, + "loss": 0.8461, + "step": 13838 + }, + { + "epoch": 2.4638532763532766, + "grad_norm": 0.7434490323066711, + "learning_rate": 6.450931038356834e-05, + "loss": 0.6677, + "step": 13839 + }, + { + "epoch": 2.464031339031339, + "grad_norm": 0.8113850355148315, + "learning_rate": 6.449622447146855e-05, + "loss": 0.8644, + "step": 13840 + }, + { + "epoch": 2.4642094017094016, + "grad_norm": 0.7424083352088928, + "learning_rate": 6.448313925499154e-05, + "loss": 0.7469, + "step": 13841 + }, + { + "epoch": 2.4643874643874644, + "grad_norm": 1.006949782371521, + "learning_rate": 6.44700547343936e-05, + "loss": 0.8426, + "step": 13842 + }, + { + "epoch": 2.464565527065527, + "grad_norm": 1.0643857717514038, + "learning_rate": 6.445697090993117e-05, + "loss": 0.7793, + "step": 13843 + }, + { + "epoch": 2.46474358974359, + "grad_norm": 0.8716835975646973, + "learning_rate": 6.444388778186051e-05, + "loss": 0.9684, + "step": 13844 + }, + { + "epoch": 2.464921652421652, + "grad_norm": 0.8677120804786682, + "learning_rate": 6.443080535043802e-05, + "loss": 0.9772, + "step": 13845 + }, + { + "epoch": 2.465099715099715, + "grad_norm": 0.8955141305923462, + "learning_rate": 6.441772361592005e-05, + "loss": 0.881, + "step": 13846 + }, + { + "epoch": 2.4652777777777777, + "grad_norm": 0.806794285774231, + "learning_rate": 6.440464257856283e-05, + "loss": 0.6393, + "step": 13847 + }, + { + "epoch": 2.4654558404558404, + "grad_norm": 0.8438352346420288, + "learning_rate": 6.439156223862272e-05, + "loss": 0.8305, + "step": 13848 + }, + { + "epoch": 2.465633903133903, + "grad_norm": 0.828960120677948, + "learning_rate": 6.437848259635594e-05, + "loss": 0.8564, + "step": 13849 + }, + { + "epoch": 2.465811965811966, + "grad_norm": 0.790199339389801, + "learning_rate": 6.436540365201886e-05, + "loss": 0.6573, + "step": 13850 + }, + { + "epoch": 2.4659900284900287, + 
"grad_norm": 0.8476296663284302, + "learning_rate": 6.435232540586763e-05, + "loss": 0.6979, + "step": 13851 + }, + { + "epoch": 2.466168091168091, + "grad_norm": 0.6880464553833008, + "learning_rate": 6.433924785815857e-05, + "loss": 0.6925, + "step": 13852 + }, + { + "epoch": 2.4663461538461537, + "grad_norm": 0.9369434118270874, + "learning_rate": 6.432617100914782e-05, + "loss": 0.6891, + "step": 13853 + }, + { + "epoch": 2.4665242165242165, + "grad_norm": 0.8522159457206726, + "learning_rate": 6.431309485909166e-05, + "loss": 1.0786, + "step": 13854 + }, + { + "epoch": 2.4667022792022792, + "grad_norm": 0.8479002714157104, + "learning_rate": 6.430001940824625e-05, + "loss": 0.9293, + "step": 13855 + }, + { + "epoch": 2.466880341880342, + "grad_norm": 0.8382098078727722, + "learning_rate": 6.428694465686787e-05, + "loss": 1.1279, + "step": 13856 + }, + { + "epoch": 2.4670584045584047, + "grad_norm": 0.9641128182411194, + "learning_rate": 6.427387060521255e-05, + "loss": 0.7938, + "step": 13857 + }, + { + "epoch": 2.467236467236467, + "grad_norm": 0.8458924293518066, + "learning_rate": 6.426079725353656e-05, + "loss": 0.7804, + "step": 13858 + }, + { + "epoch": 2.46741452991453, + "grad_norm": 0.8023849725723267, + "learning_rate": 6.424772460209597e-05, + "loss": 0.718, + "step": 13859 + }, + { + "epoch": 2.4675925925925926, + "grad_norm": 0.800864040851593, + "learning_rate": 6.423465265114699e-05, + "loss": 0.7627, + "step": 13860 + }, + { + "epoch": 2.4677706552706553, + "grad_norm": 0.9159586429595947, + "learning_rate": 6.422158140094566e-05, + "loss": 0.8348, + "step": 13861 + }, + { + "epoch": 2.467948717948718, + "grad_norm": 0.7982872724533081, + "learning_rate": 6.420851085174817e-05, + "loss": 0.8092, + "step": 13862 + }, + { + "epoch": 2.468126780626781, + "grad_norm": 0.8847397565841675, + "learning_rate": 6.41954410038105e-05, + "loss": 0.9165, + "step": 13863 + }, + { + "epoch": 2.468304843304843, + "grad_norm": 0.7885190844535828, + 
"learning_rate": 6.418237185738882e-05, + "loss": 0.733, + "step": 13864 + }, + { + "epoch": 2.468482905982906, + "grad_norm": 0.902428150177002, + "learning_rate": 6.416930341273914e-05, + "loss": 0.8345, + "step": 13865 + }, + { + "epoch": 2.4686609686609686, + "grad_norm": 0.9344130158424377, + "learning_rate": 6.415623567011751e-05, + "loss": 0.8651, + "step": 13866 + }, + { + "epoch": 2.4688390313390314, + "grad_norm": 0.840679407119751, + "learning_rate": 6.414316862978003e-05, + "loss": 0.7534, + "step": 13867 + }, + { + "epoch": 2.469017094017094, + "grad_norm": 0.8799613118171692, + "learning_rate": 6.413010229198263e-05, + "loss": 0.9335, + "step": 13868 + }, + { + "epoch": 2.469195156695157, + "grad_norm": 1.1401816606521606, + "learning_rate": 6.411703665698142e-05, + "loss": 0.8902, + "step": 13869 + }, + { + "epoch": 2.469373219373219, + "grad_norm": 0.9733933210372925, + "learning_rate": 6.410397172503227e-05, + "loss": 0.9288, + "step": 13870 + }, + { + "epoch": 2.469551282051282, + "grad_norm": 0.9260223507881165, + "learning_rate": 6.409090749639128e-05, + "loss": 0.9664, + "step": 13871 + }, + { + "epoch": 2.4697293447293447, + "grad_norm": 1.0200423002243042, + "learning_rate": 6.407784397131433e-05, + "loss": 0.9095, + "step": 13872 + }, + { + "epoch": 2.4699074074074074, + "grad_norm": 0.8015561103820801, + "learning_rate": 6.406478115005743e-05, + "loss": 0.8015, + "step": 13873 + }, + { + "epoch": 2.47008547008547, + "grad_norm": 0.8035915493965149, + "learning_rate": 6.40517190328765e-05, + "loss": 0.9275, + "step": 13874 + }, + { + "epoch": 2.470263532763533, + "grad_norm": 0.8248090147972107, + "learning_rate": 6.403865762002743e-05, + "loss": 0.8736, + "step": 13875 + }, + { + "epoch": 2.4704415954415953, + "grad_norm": 0.9310920834541321, + "learning_rate": 6.402559691176616e-05, + "loss": 0.794, + "step": 13876 + }, + { + "epoch": 2.470619658119658, + "grad_norm": 0.7796428799629211, + "learning_rate": 6.401253690834863e-05, + "loss": 
0.832, + "step": 13877 + }, + { + "epoch": 2.4707977207977208, + "grad_norm": 0.9046199321746826, + "learning_rate": 6.399947761003063e-05, + "loss": 0.9988, + "step": 13878 + }, + { + "epoch": 2.4709757834757835, + "grad_norm": 0.8970019221305847, + "learning_rate": 6.398641901706812e-05, + "loss": 0.6805, + "step": 13879 + }, + { + "epoch": 2.4711538461538463, + "grad_norm": 0.935786247253418, + "learning_rate": 6.397336112971688e-05, + "loss": 0.9826, + "step": 13880 + }, + { + "epoch": 2.471331908831909, + "grad_norm": 0.8497617244720459, + "learning_rate": 6.396030394823285e-05, + "loss": 0.8842, + "step": 13881 + }, + { + "epoch": 2.4715099715099713, + "grad_norm": 0.9159898161888123, + "learning_rate": 6.394724747287173e-05, + "loss": 0.8592, + "step": 13882 + }, + { + "epoch": 2.471688034188034, + "grad_norm": 0.891951858997345, + "learning_rate": 6.393419170388943e-05, + "loss": 0.8197, + "step": 13883 + }, + { + "epoch": 2.471866096866097, + "grad_norm": 0.9038097858428955, + "learning_rate": 6.392113664154172e-05, + "loss": 0.7535, + "step": 13884 + }, + { + "epoch": 2.4720441595441596, + "grad_norm": 0.8502489924430847, + "learning_rate": 6.390808228608438e-05, + "loss": 0.9183, + "step": 13885 + }, + { + "epoch": 2.4722222222222223, + "grad_norm": 0.7442654371261597, + "learning_rate": 6.389502863777323e-05, + "loss": 0.7741, + "step": 13886 + }, + { + "epoch": 2.472400284900285, + "grad_norm": 0.8811324238777161, + "learning_rate": 6.388197569686395e-05, + "loss": 0.7261, + "step": 13887 + }, + { + "epoch": 2.472578347578348, + "grad_norm": 0.9487552046775818, + "learning_rate": 6.386892346361239e-05, + "loss": 0.779, + "step": 13888 + }, + { + "epoch": 2.47275641025641, + "grad_norm": 0.8138917088508606, + "learning_rate": 6.385587193827416e-05, + "loss": 0.915, + "step": 13889 + }, + { + "epoch": 2.472934472934473, + "grad_norm": 0.7842695713043213, + "learning_rate": 6.384282112110506e-05, + "loss": 0.9458, + "step": 13890 + }, + { + "epoch": 
2.4731125356125356, + "grad_norm": 0.7852116823196411, + "learning_rate": 6.382977101236074e-05, + "loss": 0.9515, + "step": 13891 + }, + { + "epoch": 2.4732905982905984, + "grad_norm": 0.8429296016693115, + "learning_rate": 6.381672161229698e-05, + "loss": 0.9466, + "step": 13892 + }, + { + "epoch": 2.473468660968661, + "grad_norm": 0.8713327050209045, + "learning_rate": 6.380367292116933e-05, + "loss": 0.7552, + "step": 13893 + }, + { + "epoch": 2.4736467236467234, + "grad_norm": 0.8153441548347473, + "learning_rate": 6.379062493923355e-05, + "loss": 0.7833, + "step": 13894 + }, + { + "epoch": 2.473824786324786, + "grad_norm": 0.8283601999282837, + "learning_rate": 6.377757766674526e-05, + "loss": 0.8986, + "step": 13895 + }, + { + "epoch": 2.474002849002849, + "grad_norm": 0.8116408586502075, + "learning_rate": 6.37645311039601e-05, + "loss": 0.8549, + "step": 13896 + }, + { + "epoch": 2.4741809116809117, + "grad_norm": 0.7999116778373718, + "learning_rate": 6.375148525113365e-05, + "loss": 0.8082, + "step": 13897 + }, + { + "epoch": 2.4743589743589745, + "grad_norm": 0.7307565808296204, + "learning_rate": 6.373844010852159e-05, + "loss": 0.7946, + "step": 13898 + }, + { + "epoch": 2.474537037037037, + "grad_norm": 0.7519806027412415, + "learning_rate": 6.372539567637941e-05, + "loss": 0.6511, + "step": 13899 + }, + { + "epoch": 2.4747150997151, + "grad_norm": 0.8571820259094238, + "learning_rate": 6.371235195496279e-05, + "loss": 0.8266, + "step": 13900 + }, + { + "epoch": 2.4748931623931623, + "grad_norm": 0.8118062019348145, + "learning_rate": 6.369930894452723e-05, + "loss": 0.8573, + "step": 13901 + }, + { + "epoch": 2.475071225071225, + "grad_norm": 0.8729892373085022, + "learning_rate": 6.368626664532833e-05, + "loss": 0.812, + "step": 13902 + }, + { + "epoch": 2.4752492877492878, + "grad_norm": 0.7663209438323975, + "learning_rate": 6.367322505762157e-05, + "loss": 0.6648, + "step": 13903 + }, + { + "epoch": 2.4754273504273505, + "grad_norm": 
0.7913058996200562, + "learning_rate": 6.366018418166251e-05, + "loss": 0.7486, + "step": 13904 + }, + { + "epoch": 2.4756054131054133, + "grad_norm": 0.7714928984642029, + "learning_rate": 6.364714401770666e-05, + "loss": 0.9134, + "step": 13905 + }, + { + "epoch": 2.4757834757834756, + "grad_norm": 0.8226378560066223, + "learning_rate": 6.363410456600949e-05, + "loss": 0.903, + "step": 13906 + }, + { + "epoch": 2.4759615384615383, + "grad_norm": 0.8643919825553894, + "learning_rate": 6.362106582682653e-05, + "loss": 0.9068, + "step": 13907 + }, + { + "epoch": 2.476139601139601, + "grad_norm": 0.8390868306159973, + "learning_rate": 6.360802780041317e-05, + "loss": 0.8938, + "step": 13908 + }, + { + "epoch": 2.476317663817664, + "grad_norm": 0.9549261927604675, + "learning_rate": 6.359499048702495e-05, + "loss": 0.8431, + "step": 13909 + }, + { + "epoch": 2.4764957264957266, + "grad_norm": 0.7509152889251709, + "learning_rate": 6.358195388691726e-05, + "loss": 0.8661, + "step": 13910 + }, + { + "epoch": 2.4766737891737893, + "grad_norm": 0.9447416067123413, + "learning_rate": 6.356891800034552e-05, + "loss": 0.8766, + "step": 13911 + }, + { + "epoch": 2.476851851851852, + "grad_norm": 0.8972395062446594, + "learning_rate": 6.355588282756515e-05, + "loss": 0.8517, + "step": 13912 + }, + { + "epoch": 2.4770299145299144, + "grad_norm": 0.8645047545433044, + "learning_rate": 6.354284836883156e-05, + "loss": 0.8188, + "step": 13913 + }, + { + "epoch": 2.477207977207977, + "grad_norm": 0.7939230799674988, + "learning_rate": 6.35298146244001e-05, + "loss": 0.778, + "step": 13914 + }, + { + "epoch": 2.47738603988604, + "grad_norm": 0.7714613676071167, + "learning_rate": 6.351678159452618e-05, + "loss": 0.7964, + "step": 13915 + }, + { + "epoch": 2.4775641025641026, + "grad_norm": 0.9293754696846008, + "learning_rate": 6.350374927946512e-05, + "loss": 0.7965, + "step": 13916 + }, + { + "epoch": 2.4777421652421654, + "grad_norm": 0.8025050759315491, + "learning_rate": 
6.349071767947233e-05, + "loss": 0.8222, + "step": 13917 + }, + { + "epoch": 2.4779202279202277, + "grad_norm": 0.7790399193763733, + "learning_rate": 6.347768679480304e-05, + "loss": 1.0294, + "step": 13918 + }, + { + "epoch": 2.4780982905982905, + "grad_norm": 1.0063512325286865, + "learning_rate": 6.346465662571261e-05, + "loss": 0.953, + "step": 13919 + }, + { + "epoch": 2.478276353276353, + "grad_norm": 0.8742708563804626, + "learning_rate": 6.345162717245634e-05, + "loss": 0.7119, + "step": 13920 + }, + { + "epoch": 2.478454415954416, + "grad_norm": 0.760497510433197, + "learning_rate": 6.343859843528955e-05, + "loss": 0.7446, + "step": 13921 + }, + { + "epoch": 2.4786324786324787, + "grad_norm": 0.7635362148284912, + "learning_rate": 6.342557041446743e-05, + "loss": 0.8286, + "step": 13922 + }, + { + "epoch": 2.4788105413105415, + "grad_norm": 0.7876720428466797, + "learning_rate": 6.341254311024532e-05, + "loss": 0.8007, + "step": 13923 + }, + { + "epoch": 2.478988603988604, + "grad_norm": 0.8144401907920837, + "learning_rate": 6.339951652287839e-05, + "loss": 0.9177, + "step": 13924 + }, + { + "epoch": 2.4791666666666665, + "grad_norm": 0.7400189638137817, + "learning_rate": 6.338649065262189e-05, + "loss": 0.8935, + "step": 13925 + }, + { + "epoch": 2.4793447293447293, + "grad_norm": 0.8412175178527832, + "learning_rate": 6.337346549973106e-05, + "loss": 0.6929, + "step": 13926 + }, + { + "epoch": 2.479522792022792, + "grad_norm": 1.0156967639923096, + "learning_rate": 6.336044106446108e-05, + "loss": 0.9843, + "step": 13927 + }, + { + "epoch": 2.4797008547008548, + "grad_norm": 0.8556809425354004, + "learning_rate": 6.33474173470672e-05, + "loss": 0.81, + "step": 13928 + }, + { + "epoch": 2.4798789173789175, + "grad_norm": 0.8103616237640381, + "learning_rate": 6.333439434780448e-05, + "loss": 0.925, + "step": 13929 + }, + { + "epoch": 2.48005698005698, + "grad_norm": 0.9460168480873108, + "learning_rate": 6.332137206692817e-05, + "loss": 0.7059, + 
"step": 13930 + }, + { + "epoch": 2.4802350427350426, + "grad_norm": 0.847226619720459, + "learning_rate": 6.330835050469334e-05, + "loss": 1.0139, + "step": 13931 + }, + { + "epoch": 2.4804131054131053, + "grad_norm": 0.7639240622520447, + "learning_rate": 6.329532966135523e-05, + "loss": 0.8141, + "step": 13932 + }, + { + "epoch": 2.480591168091168, + "grad_norm": 0.9273494482040405, + "learning_rate": 6.328230953716883e-05, + "loss": 0.9864, + "step": 13933 + }, + { + "epoch": 2.480769230769231, + "grad_norm": 0.7709840536117554, + "learning_rate": 6.326929013238934e-05, + "loss": 0.7275, + "step": 13934 + }, + { + "epoch": 2.4809472934472936, + "grad_norm": 0.755933940410614, + "learning_rate": 6.325627144727177e-05, + "loss": 0.7785, + "step": 13935 + }, + { + "epoch": 2.4811253561253563, + "grad_norm": 0.9058536291122437, + "learning_rate": 6.324325348207125e-05, + "loss": 0.9694, + "step": 13936 + }, + { + "epoch": 2.4813034188034186, + "grad_norm": 0.8490056395530701, + "learning_rate": 6.323023623704282e-05, + "loss": 0.8882, + "step": 13937 + }, + { + "epoch": 2.4814814814814814, + "grad_norm": 0.9559429883956909, + "learning_rate": 6.321721971244155e-05, + "loss": 0.9243, + "step": 13938 + }, + { + "epoch": 2.481659544159544, + "grad_norm": 0.8607096076011658, + "learning_rate": 6.320420390852242e-05, + "loss": 0.9281, + "step": 13939 + }, + { + "epoch": 2.481837606837607, + "grad_norm": 1.1263439655303955, + "learning_rate": 6.319118882554049e-05, + "loss": 0.8772, + "step": 13940 + }, + { + "epoch": 2.4820156695156697, + "grad_norm": 0.9691354632377625, + "learning_rate": 6.317817446375074e-05, + "loss": 0.9349, + "step": 13941 + }, + { + "epoch": 2.482193732193732, + "grad_norm": 0.8636828064918518, + "learning_rate": 6.31651608234082e-05, + "loss": 0.9021, + "step": 13942 + }, + { + "epoch": 2.4823717948717947, + "grad_norm": 0.8405864238739014, + "learning_rate": 6.315214790476777e-05, + "loss": 0.8917, + "step": 13943 + }, + { + "epoch": 
2.4825498575498575, + "grad_norm": 0.8082821369171143, + "learning_rate": 6.313913570808448e-05, + "loss": 0.914, + "step": 13944 + }, + { + "epoch": 2.48272792022792, + "grad_norm": 0.8734335899353027, + "learning_rate": 6.312612423361328e-05, + "loss": 0.6885, + "step": 13945 + }, + { + "epoch": 2.482905982905983, + "grad_norm": 0.943190336227417, + "learning_rate": 6.311311348160904e-05, + "loss": 0.894, + "step": 13946 + }, + { + "epoch": 2.4830840455840457, + "grad_norm": 0.9425446391105652, + "learning_rate": 6.310010345232673e-05, + "loss": 0.7893, + "step": 13947 + }, + { + "epoch": 2.4832621082621085, + "grad_norm": 0.9668664932250977, + "learning_rate": 6.308709414602123e-05, + "loss": 0.9715, + "step": 13948 + }, + { + "epoch": 2.4834401709401708, + "grad_norm": 0.7771579027175903, + "learning_rate": 6.307408556294747e-05, + "loss": 0.872, + "step": 13949 + }, + { + "epoch": 2.4836182336182335, + "grad_norm": 0.8183084726333618, + "learning_rate": 6.306107770336025e-05, + "loss": 1.0008, + "step": 13950 + }, + { + "epoch": 2.4837962962962963, + "grad_norm": 0.8284399509429932, + "learning_rate": 6.304807056751452e-05, + "loss": 0.687, + "step": 13951 + }, + { + "epoch": 2.483974358974359, + "grad_norm": 0.8663082718849182, + "learning_rate": 6.303506415566504e-05, + "loss": 0.8792, + "step": 13952 + }, + { + "epoch": 2.484152421652422, + "grad_norm": 0.8272001147270203, + "learning_rate": 6.302205846806675e-05, + "loss": 0.9255, + "step": 13953 + }, + { + "epoch": 2.484330484330484, + "grad_norm": 0.9398671388626099, + "learning_rate": 6.300905350497437e-05, + "loss": 0.9605, + "step": 13954 + }, + { + "epoch": 2.484508547008547, + "grad_norm": 0.7368931770324707, + "learning_rate": 6.299604926664276e-05, + "loss": 0.6734, + "step": 13955 + }, + { + "epoch": 2.4846866096866096, + "grad_norm": 0.8315541744232178, + "learning_rate": 6.298304575332668e-05, + "loss": 0.9578, + "step": 13956 + }, + { + "epoch": 2.4848646723646723, + "grad_norm": 
0.8656954169273376, + "learning_rate": 6.297004296528095e-05, + "loss": 0.8897, + "step": 13957 + }, + { + "epoch": 2.485042735042735, + "grad_norm": 0.9047118425369263, + "learning_rate": 6.295704090276026e-05, + "loss": 1.0308, + "step": 13958 + }, + { + "epoch": 2.485220797720798, + "grad_norm": 0.8771422505378723, + "learning_rate": 6.294403956601946e-05, + "loss": 0.9505, + "step": 13959 + }, + { + "epoch": 2.4853988603988606, + "grad_norm": 0.8935427069664001, + "learning_rate": 6.293103895531319e-05, + "loss": 1.0485, + "step": 13960 + }, + { + "epoch": 2.485576923076923, + "grad_norm": 0.7345624566078186, + "learning_rate": 6.291803907089621e-05, + "loss": 0.7485, + "step": 13961 + }, + { + "epoch": 2.4857549857549857, + "grad_norm": 0.6415224075317383, + "learning_rate": 6.290503991302324e-05, + "loss": 0.6237, + "step": 13962 + }, + { + "epoch": 2.4859330484330484, + "grad_norm": 0.8547754883766174, + "learning_rate": 6.289204148194896e-05, + "loss": 0.9152, + "step": 13963 + }, + { + "epoch": 2.486111111111111, + "grad_norm": 0.7434722185134888, + "learning_rate": 6.2879043777928e-05, + "loss": 0.8499, + "step": 13964 + }, + { + "epoch": 2.486289173789174, + "grad_norm": 0.8609980940818787, + "learning_rate": 6.286604680121509e-05, + "loss": 0.8101, + "step": 13965 + }, + { + "epoch": 2.486467236467236, + "grad_norm": 0.8709290027618408, + "learning_rate": 6.285305055206486e-05, + "loss": 0.8335, + "step": 13966 + }, + { + "epoch": 2.486645299145299, + "grad_norm": 0.7758293747901917, + "learning_rate": 6.284005503073191e-05, + "loss": 0.8408, + "step": 13967 + }, + { + "epoch": 2.4868233618233617, + "grad_norm": 0.9778353571891785, + "learning_rate": 6.282706023747094e-05, + "loss": 0.8595, + "step": 13968 + }, + { + "epoch": 2.4870014245014245, + "grad_norm": 0.8438369631767273, + "learning_rate": 6.281406617253646e-05, + "loss": 0.8843, + "step": 13969 + }, + { + "epoch": 2.4871794871794872, + "grad_norm": 0.9704681634902954, + "learning_rate": 
6.280107283618315e-05, + "loss": 0.62, + "step": 13970 + }, + { + "epoch": 2.48735754985755, + "grad_norm": 0.7774441242218018, + "learning_rate": 6.278808022866549e-05, + "loss": 0.9087, + "step": 13971 + }, + { + "epoch": 2.4875356125356127, + "grad_norm": 0.8387142419815063, + "learning_rate": 6.277508835023813e-05, + "loss": 0.854, + "step": 13972 + }, + { + "epoch": 2.487713675213675, + "grad_norm": 0.8483029007911682, + "learning_rate": 6.276209720115556e-05, + "loss": 0.8665, + "step": 13973 + }, + { + "epoch": 2.487891737891738, + "grad_norm": 0.8251432180404663, + "learning_rate": 6.274910678167239e-05, + "loss": 0.7816, + "step": 13974 + }, + { + "epoch": 2.4880698005698005, + "grad_norm": 0.8503836989402771, + "learning_rate": 6.273611709204304e-05, + "loss": 0.939, + "step": 13975 + }, + { + "epoch": 2.4882478632478633, + "grad_norm": 0.6545158624649048, + "learning_rate": 6.27231281325221e-05, + "loss": 0.564, + "step": 13976 + }, + { + "epoch": 2.488425925925926, + "grad_norm": 0.7353499531745911, + "learning_rate": 6.2710139903364e-05, + "loss": 0.7103, + "step": 13977 + }, + { + "epoch": 2.488603988603989, + "grad_norm": 0.9032405614852905, + "learning_rate": 6.269715240482327e-05, + "loss": 0.8802, + "step": 13978 + }, + { + "epoch": 2.488782051282051, + "grad_norm": 0.8141019940376282, + "learning_rate": 6.268416563715434e-05, + "loss": 0.9836, + "step": 13979 + }, + { + "epoch": 2.488960113960114, + "grad_norm": 0.9087637066841125, + "learning_rate": 6.267117960061167e-05, + "loss": 0.8488, + "step": 13980 + }, + { + "epoch": 2.4891381766381766, + "grad_norm": 0.6649556756019592, + "learning_rate": 6.265819429544969e-05, + "loss": 0.604, + "step": 13981 + }, + { + "epoch": 2.4893162393162394, + "grad_norm": 0.7872918248176575, + "learning_rate": 6.264520972192283e-05, + "loss": 0.8537, + "step": 13982 + }, + { + "epoch": 2.489494301994302, + "grad_norm": 0.8821072578430176, + "learning_rate": 6.263222588028546e-05, + "loss": 0.7485, + "step": 
13983 + }, + { + "epoch": 2.489672364672365, + "grad_norm": 0.8077933192253113, + "learning_rate": 6.2619242770792e-05, + "loss": 0.9174, + "step": 13984 + }, + { + "epoch": 2.489850427350427, + "grad_norm": 0.8087183237075806, + "learning_rate": 6.260626039369686e-05, + "loss": 0.9184, + "step": 13985 + }, + { + "epoch": 2.49002849002849, + "grad_norm": 0.8849205374717712, + "learning_rate": 6.259327874925434e-05, + "loss": 0.9095, + "step": 13986 + }, + { + "epoch": 2.4902065527065527, + "grad_norm": 0.7899976968765259, + "learning_rate": 6.258029783771884e-05, + "loss": 0.7286, + "step": 13987 + }, + { + "epoch": 2.4903846153846154, + "grad_norm": 0.7907543182373047, + "learning_rate": 6.256731765934464e-05, + "loss": 0.8433, + "step": 13988 + }, + { + "epoch": 2.490562678062678, + "grad_norm": 0.7794694304466248, + "learning_rate": 6.255433821438614e-05, + "loss": 0.7197, + "step": 13989 + }, + { + "epoch": 2.490740740740741, + "grad_norm": 0.8443161249160767, + "learning_rate": 6.254135950309753e-05, + "loss": 0.7982, + "step": 13990 + }, + { + "epoch": 2.4909188034188032, + "grad_norm": 0.972024142742157, + "learning_rate": 6.252838152573323e-05, + "loss": 0.9212, + "step": 13991 + }, + { + "epoch": 2.491096866096866, + "grad_norm": 0.8017764687538147, + "learning_rate": 6.25154042825474e-05, + "loss": 0.9544, + "step": 13992 + }, + { + "epoch": 2.4912749287749287, + "grad_norm": 0.8622884154319763, + "learning_rate": 6.250242777379442e-05, + "loss": 0.8411, + "step": 13993 + }, + { + "epoch": 2.4914529914529915, + "grad_norm": 0.7384446263313293, + "learning_rate": 6.248945199972842e-05, + "loss": 0.8357, + "step": 13994 + }, + { + "epoch": 2.4916310541310542, + "grad_norm": 0.7748960256576538, + "learning_rate": 6.247647696060372e-05, + "loss": 0.7739, + "step": 13995 + }, + { + "epoch": 2.491809116809117, + "grad_norm": 0.8295742273330688, + "learning_rate": 6.246350265667448e-05, + "loss": 0.8032, + "step": 13996 + }, + { + "epoch": 2.4919871794871793, + 
"grad_norm": 0.8604934811592102, + "learning_rate": 6.245052908819494e-05, + "loss": 0.8738, + "step": 13997 + }, + { + "epoch": 2.492165242165242, + "grad_norm": 0.8381406664848328, + "learning_rate": 6.243755625541926e-05, + "loss": 0.8351, + "step": 13998 + }, + { + "epoch": 2.492343304843305, + "grad_norm": 0.9238134026527405, + "learning_rate": 6.242458415860168e-05, + "loss": 0.9529, + "step": 13999 + }, + { + "epoch": 2.4925213675213675, + "grad_norm": 0.9234444499015808, + "learning_rate": 6.241161279799628e-05, + "loss": 1.1086, + "step": 14000 + }, + { + "epoch": 2.4926994301994303, + "grad_norm": 0.8056737780570984, + "learning_rate": 6.239864217385727e-05, + "loss": 0.7957, + "step": 14001 + }, + { + "epoch": 2.492877492877493, + "grad_norm": 0.7877696752548218, + "learning_rate": 6.238567228643872e-05, + "loss": 0.9577, + "step": 14002 + }, + { + "epoch": 2.4930555555555554, + "grad_norm": 0.7437340021133423, + "learning_rate": 6.237270313599479e-05, + "loss": 0.6171, + "step": 14003 + }, + { + "epoch": 2.493233618233618, + "grad_norm": 0.8503403067588806, + "learning_rate": 6.235973472277962e-05, + "loss": 0.7608, + "step": 14004 + }, + { + "epoch": 2.493411680911681, + "grad_norm": 0.8557562232017517, + "learning_rate": 6.234676704704722e-05, + "loss": 0.8414, + "step": 14005 + }, + { + "epoch": 2.4935897435897436, + "grad_norm": 0.9188289046287537, + "learning_rate": 6.233380010905174e-05, + "loss": 0.89, + "step": 14006 + }, + { + "epoch": 2.4937678062678064, + "grad_norm": 0.9433556199073792, + "learning_rate": 6.232083390904716e-05, + "loss": 0.9316, + "step": 14007 + }, + { + "epoch": 2.493945868945869, + "grad_norm": 0.9278882145881653, + "learning_rate": 6.230786844728759e-05, + "loss": 0.9211, + "step": 14008 + }, + { + "epoch": 2.494123931623932, + "grad_norm": 0.8365640640258789, + "learning_rate": 6.229490372402702e-05, + "loss": 0.9578, + "step": 14009 + }, + { + "epoch": 2.494301994301994, + "grad_norm": 0.7987647652626038, + 
"learning_rate": 6.228193973951953e-05, + "loss": 0.9279, + "step": 14010 + }, + { + "epoch": 2.494480056980057, + "grad_norm": 0.7707502841949463, + "learning_rate": 6.226897649401902e-05, + "loss": 0.8879, + "step": 14011 + }, + { + "epoch": 2.4946581196581197, + "grad_norm": 0.8623191118240356, + "learning_rate": 6.225601398777957e-05, + "loss": 0.7427, + "step": 14012 + }, + { + "epoch": 2.4948361823361824, + "grad_norm": 0.8470782041549683, + "learning_rate": 6.22430522210551e-05, + "loss": 0.9425, + "step": 14013 + }, + { + "epoch": 2.495014245014245, + "grad_norm": 0.9169524908065796, + "learning_rate": 6.223009119409963e-05, + "loss": 0.9595, + "step": 14014 + }, + { + "epoch": 2.4951923076923075, + "grad_norm": 0.8541738986968994, + "learning_rate": 6.221713090716701e-05, + "loss": 1.0726, + "step": 14015 + }, + { + "epoch": 2.4953703703703702, + "grad_norm": 0.8801444172859192, + "learning_rate": 6.220417136051126e-05, + "loss": 0.8323, + "step": 14016 + }, + { + "epoch": 2.495548433048433, + "grad_norm": 0.845448911190033, + "learning_rate": 6.219121255438624e-05, + "loss": 0.835, + "step": 14017 + }, + { + "epoch": 2.4957264957264957, + "grad_norm": 0.7653858661651611, + "learning_rate": 6.217825448904588e-05, + "loss": 0.7027, + "step": 14018 + }, + { + "epoch": 2.4959045584045585, + "grad_norm": 0.7779282927513123, + "learning_rate": 6.216529716474404e-05, + "loss": 0.7881, + "step": 14019 + }, + { + "epoch": 2.4960826210826212, + "grad_norm": 0.8739959597587585, + "learning_rate": 6.215234058173465e-05, + "loss": 0.9738, + "step": 14020 + }, + { + "epoch": 2.496260683760684, + "grad_norm": 0.8388087749481201, + "learning_rate": 6.213938474027148e-05, + "loss": 1.0128, + "step": 14021 + }, + { + "epoch": 2.4964387464387463, + "grad_norm": 0.8963341116905212, + "learning_rate": 6.212642964060843e-05, + "loss": 0.9669, + "step": 14022 + }, + { + "epoch": 2.496616809116809, + "grad_norm": 0.8959031701087952, + "learning_rate": 6.211347528299928e-05, + 
"loss": 0.9558, + "step": 14023 + }, + { + "epoch": 2.496794871794872, + "grad_norm": 0.8463472127914429, + "learning_rate": 6.210052166769791e-05, + "loss": 0.9835, + "step": 14024 + }, + { + "epoch": 2.4969729344729346, + "grad_norm": 0.7827564477920532, + "learning_rate": 6.208756879495812e-05, + "loss": 0.8411, + "step": 14025 + }, + { + "epoch": 2.4971509971509973, + "grad_norm": 0.8851028084754944, + "learning_rate": 6.207461666503363e-05, + "loss": 1.0409, + "step": 14026 + }, + { + "epoch": 2.4973290598290596, + "grad_norm": 1.0151652097702026, + "learning_rate": 6.206166527817825e-05, + "loss": 0.6671, + "step": 14027 + }, + { + "epoch": 2.4975071225071224, + "grad_norm": 0.7924346923828125, + "learning_rate": 6.204871463464572e-05, + "loss": 0.9971, + "step": 14028 + }, + { + "epoch": 2.497685185185185, + "grad_norm": 0.8524144887924194, + "learning_rate": 6.203576473468981e-05, + "loss": 0.9228, + "step": 14029 + }, + { + "epoch": 2.497863247863248, + "grad_norm": 0.7936401963233948, + "learning_rate": 6.20228155785642e-05, + "loss": 0.8128, + "step": 14030 + }, + { + "epoch": 2.4980413105413106, + "grad_norm": 1.0074050426483154, + "learning_rate": 6.200986716652267e-05, + "loss": 0.7846, + "step": 14031 + }, + { + "epoch": 2.4982193732193734, + "grad_norm": 0.7972239851951599, + "learning_rate": 6.199691949881882e-05, + "loss": 0.8689, + "step": 14032 + }, + { + "epoch": 2.498397435897436, + "grad_norm": 0.8810364007949829, + "learning_rate": 6.198397257570643e-05, + "loss": 0.7775, + "step": 14033 + }, + { + "epoch": 2.4985754985754984, + "grad_norm": 0.8819566965103149, + "learning_rate": 6.19710263974391e-05, + "loss": 0.8852, + "step": 14034 + }, + { + "epoch": 2.498753561253561, + "grad_norm": 0.8020595908164978, + "learning_rate": 6.195808096427054e-05, + "loss": 0.9691, + "step": 14035 + }, + { + "epoch": 2.498931623931624, + "grad_norm": 0.83958899974823, + "learning_rate": 6.194513627645433e-05, + "loss": 0.8072, + "step": 14036 + }, + { + 
"epoch": 2.4991096866096867, + "grad_norm": 0.7525333166122437, + "learning_rate": 6.193219233424414e-05, + "loss": 0.7839, + "step": 14037 + }, + { + "epoch": 2.4992877492877494, + "grad_norm": 0.8687964677810669, + "learning_rate": 6.191924913789353e-05, + "loss": 0.9512, + "step": 14038 + }, + { + "epoch": 2.4994658119658117, + "grad_norm": 0.9080697298049927, + "learning_rate": 6.190630668765617e-05, + "loss": 0.8635, + "step": 14039 + }, + { + "epoch": 2.4996438746438745, + "grad_norm": 0.8174137473106384, + "learning_rate": 6.189336498378557e-05, + "loss": 0.9034, + "step": 14040 + }, + { + "epoch": 2.4996438746438745, + "eval_loss": 1.1338438987731934, + "eval_runtime": 24.4013, + "eval_samples_per_second": 42.662, + "eval_steps_per_second": 21.351, + "step": 14040 + }, + { + "epoch": 2.4998219373219372, + "grad_norm": 0.9711320996284485, + "learning_rate": 6.188042402653536e-05, + "loss": 0.9892, + "step": 14041 + }, + { + "epoch": 2.5, + "grad_norm": 0.8726856112480164, + "learning_rate": 6.1867483816159e-05, + "loss": 0.7482, + "step": 14042 + }, + { + "epoch": 2.5001780626780628, + "grad_norm": 0.875801682472229, + "learning_rate": 6.18545443529101e-05, + "loss": 0.7599, + "step": 14043 + }, + { + "epoch": 2.5003561253561255, + "grad_norm": 0.8867987990379333, + "learning_rate": 6.184160563704218e-05, + "loss": 0.805, + "step": 14044 + }, + { + "epoch": 2.5005341880341883, + "grad_norm": 0.8766322135925293, + "learning_rate": 6.18286676688087e-05, + "loss": 0.7343, + "step": 14045 + }, + { + "epoch": 2.5007122507122506, + "grad_norm": 0.8096646070480347, + "learning_rate": 6.181573044846323e-05, + "loss": 0.7957, + "step": 14046 + }, + { + "epoch": 2.5008903133903133, + "grad_norm": 1.0121821165084839, + "learning_rate": 6.180279397625917e-05, + "loss": 0.8775, + "step": 14047 + }, + { + "epoch": 2.501068376068376, + "grad_norm": 0.79291170835495, + "learning_rate": 6.178985825245003e-05, + "loss": 0.91, + "step": 14048 + }, + { + "epoch": 
2.501246438746439, + "grad_norm": 0.83204185962677, + "learning_rate": 6.177692327728922e-05, + "loss": 0.799, + "step": 14049 + }, + { + "epoch": 2.5014245014245016, + "grad_norm": 0.8746328949928284, + "learning_rate": 6.176398905103023e-05, + "loss": 0.9595, + "step": 14050 + }, + { + "epoch": 2.501602564102564, + "grad_norm": 0.7665601968765259, + "learning_rate": 6.17510555739264e-05, + "loss": 0.7935, + "step": 14051 + }, + { + "epoch": 2.5017806267806266, + "grad_norm": 0.8761195540428162, + "learning_rate": 6.173812284623122e-05, + "loss": 1.017, + "step": 14052 + }, + { + "epoch": 2.5019586894586894, + "grad_norm": 0.8847656846046448, + "learning_rate": 6.172519086819802e-05, + "loss": 0.7684, + "step": 14053 + }, + { + "epoch": 2.502136752136752, + "grad_norm": 0.8320107460021973, + "learning_rate": 6.171225964008021e-05, + "loss": 0.8828, + "step": 14054 + }, + { + "epoch": 2.502314814814815, + "grad_norm": 1.0184354782104492, + "learning_rate": 6.169932916213111e-05, + "loss": 0.887, + "step": 14055 + }, + { + "epoch": 2.5024928774928776, + "grad_norm": 0.7870062589645386, + "learning_rate": 6.168639943460415e-05, + "loss": 0.8595, + "step": 14056 + }, + { + "epoch": 2.5026709401709404, + "grad_norm": 0.8314430117607117, + "learning_rate": 6.167347045775254e-05, + "loss": 0.7822, + "step": 14057 + }, + { + "epoch": 2.5028490028490027, + "grad_norm": 0.7521854043006897, + "learning_rate": 6.166054223182968e-05, + "loss": 0.916, + "step": 14058 + }, + { + "epoch": 2.5030270655270654, + "grad_norm": 0.8240202069282532, + "learning_rate": 6.164761475708885e-05, + "loss": 0.9404, + "step": 14059 + }, + { + "epoch": 2.503205128205128, + "grad_norm": 0.8467113375663757, + "learning_rate": 6.163468803378338e-05, + "loss": 0.7554, + "step": 14060 + }, + { + "epoch": 2.503383190883191, + "grad_norm": 0.7639012336730957, + "learning_rate": 6.162176206216645e-05, + "loss": 0.8141, + "step": 14061 + }, + { + "epoch": 2.5035612535612537, + "grad_norm": 
0.8578195571899414, + "learning_rate": 6.160883684249138e-05, + "loss": 0.821, + "step": 14062 + }, + { + "epoch": 2.503739316239316, + "grad_norm": 0.7478210926055908, + "learning_rate": 6.159591237501139e-05, + "loss": 0.5878, + "step": 14063 + }, + { + "epoch": 2.5039173789173788, + "grad_norm": 0.7936450242996216, + "learning_rate": 6.158298865997972e-05, + "loss": 0.9616, + "step": 14064 + }, + { + "epoch": 2.5040954415954415, + "grad_norm": 0.9196288585662842, + "learning_rate": 6.157006569764963e-05, + "loss": 0.8147, + "step": 14065 + }, + { + "epoch": 2.5042735042735043, + "grad_norm": 1.0488382577896118, + "learning_rate": 6.155714348827422e-05, + "loss": 0.7941, + "step": 14066 + }, + { + "epoch": 2.504451566951567, + "grad_norm": 0.9195658564567566, + "learning_rate": 6.154422203210676e-05, + "loss": 1.0186, + "step": 14067 + }, + { + "epoch": 2.5046296296296298, + "grad_norm": 0.9088640213012695, + "learning_rate": 6.153130132940037e-05, + "loss": 0.7611, + "step": 14068 + }, + { + "epoch": 2.5048076923076925, + "grad_norm": 0.8168773651123047, + "learning_rate": 6.151838138040821e-05, + "loss": 0.8466, + "step": 14069 + }, + { + "epoch": 2.504985754985755, + "grad_norm": 0.9976982474327087, + "learning_rate": 6.150546218538342e-05, + "loss": 0.9438, + "step": 14070 + }, + { + "epoch": 2.5051638176638176, + "grad_norm": 0.9469537138938904, + "learning_rate": 6.149254374457917e-05, + "loss": 0.9485, + "step": 14071 + }, + { + "epoch": 2.5053418803418803, + "grad_norm": 0.8861194849014282, + "learning_rate": 6.147962605824851e-05, + "loss": 0.9676, + "step": 14072 + }, + { + "epoch": 2.505519943019943, + "grad_norm": 0.9008424878120422, + "learning_rate": 6.146670912664457e-05, + "loss": 0.7343, + "step": 14073 + }, + { + "epoch": 2.505698005698006, + "grad_norm": 0.8957796096801758, + "learning_rate": 6.145379295002038e-05, + "loss": 0.8538, + "step": 14074 + }, + { + "epoch": 2.505876068376068, + "grad_norm": 0.8739160895347595, + "learning_rate": 
6.14408775286291e-05, + "loss": 0.9029, + "step": 14075 + }, + { + "epoch": 2.506054131054131, + "grad_norm": 0.7713274955749512, + "learning_rate": 6.142796286272368e-05, + "loss": 0.6962, + "step": 14076 + }, + { + "epoch": 2.5062321937321936, + "grad_norm": 0.8545170426368713, + "learning_rate": 6.141504895255725e-05, + "loss": 0.9208, + "step": 14077 + }, + { + "epoch": 2.5064102564102564, + "grad_norm": 0.8102772235870361, + "learning_rate": 6.140213579838274e-05, + "loss": 0.7785, + "step": 14078 + }, + { + "epoch": 2.506588319088319, + "grad_norm": 0.8055099844932556, + "learning_rate": 6.138922340045321e-05, + "loss": 0.8502, + "step": 14079 + }, + { + "epoch": 2.506766381766382, + "grad_norm": 0.8132893443107605, + "learning_rate": 6.137631175902164e-05, + "loss": 0.7559, + "step": 14080 + }, + { + "epoch": 2.5069444444444446, + "grad_norm": 0.8608863949775696, + "learning_rate": 6.136340087434102e-05, + "loss": 0.838, + "step": 14081 + }, + { + "epoch": 2.5071225071225074, + "grad_norm": 0.8480643630027771, + "learning_rate": 6.135049074666428e-05, + "loss": 0.9062, + "step": 14082 + }, + { + "epoch": 2.5073005698005697, + "grad_norm": 0.7107672691345215, + "learning_rate": 6.133758137624437e-05, + "loss": 0.7494, + "step": 14083 + }, + { + "epoch": 2.5074786324786325, + "grad_norm": 0.812416672706604, + "learning_rate": 6.132467276333427e-05, + "loss": 0.6428, + "step": 14084 + }, + { + "epoch": 2.507656695156695, + "grad_norm": 0.8304431438446045, + "learning_rate": 6.131176490818684e-05, + "loss": 0.9931, + "step": 14085 + }, + { + "epoch": 2.507834757834758, + "grad_norm": 0.8344886302947998, + "learning_rate": 6.129885781105507e-05, + "loss": 0.921, + "step": 14086 + }, + { + "epoch": 2.5080128205128203, + "grad_norm": 0.8137457966804504, + "learning_rate": 6.128595147219172e-05, + "loss": 0.9113, + "step": 14087 + }, + { + "epoch": 2.508190883190883, + "grad_norm": 0.7404686212539673, + "learning_rate": 6.127304589184976e-05, + "loss": 0.7625, + 
"step": 14088 + }, + { + "epoch": 2.5083689458689458, + "grad_norm": 0.8179733157157898, + "learning_rate": 6.126014107028202e-05, + "loss": 0.9049, + "step": 14089 + }, + { + "epoch": 2.5085470085470085, + "grad_norm": 0.7788520455360413, + "learning_rate": 6.124723700774133e-05, + "loss": 0.7391, + "step": 14090 + }, + { + "epoch": 2.5087250712250713, + "grad_norm": 0.8127198219299316, + "learning_rate": 6.123433370448052e-05, + "loss": 0.8551, + "step": 14091 + }, + { + "epoch": 2.508903133903134, + "grad_norm": 0.8134245276451111, + "learning_rate": 6.122143116075245e-05, + "loss": 0.7422, + "step": 14092 + }, + { + "epoch": 2.5090811965811968, + "grad_norm": 0.9117823243141174, + "learning_rate": 6.120852937680983e-05, + "loss": 0.8649, + "step": 14093 + }, + { + "epoch": 2.5092592592592595, + "grad_norm": 0.8417702913284302, + "learning_rate": 6.119562835290553e-05, + "loss": 0.6902, + "step": 14094 + }, + { + "epoch": 2.509437321937322, + "grad_norm": 0.8655431866645813, + "learning_rate": 6.118272808929225e-05, + "loss": 1.0778, + "step": 14095 + }, + { + "epoch": 2.5096153846153846, + "grad_norm": 0.9228867888450623, + "learning_rate": 6.116982858622282e-05, + "loss": 0.9991, + "step": 14096 + }, + { + "epoch": 2.5097934472934473, + "grad_norm": 0.819505512714386, + "learning_rate": 6.115692984394992e-05, + "loss": 1.0351, + "step": 14097 + }, + { + "epoch": 2.50997150997151, + "grad_norm": 0.8419737219810486, + "learning_rate": 6.114403186272628e-05, + "loss": 0.92, + "step": 14098 + }, + { + "epoch": 2.5101495726495724, + "grad_norm": 0.8294256329536438, + "learning_rate": 6.11311346428046e-05, + "loss": 0.792, + "step": 14099 + }, + { + "epoch": 2.510327635327635, + "grad_norm": 0.8549113273620605, + "learning_rate": 6.111823818443765e-05, + "loss": 0.9453, + "step": 14100 + }, + { + "epoch": 2.510505698005698, + "grad_norm": 1.0194092988967896, + "learning_rate": 6.1105342487878e-05, + "loss": 0.9747, + "step": 14101 + }, + { + "epoch": 
2.5106837606837606, + "grad_norm": 0.7633654475212097, + "learning_rate": 6.109244755337842e-05, + "loss": 0.8227, + "step": 14102 + }, + { + "epoch": 2.5108618233618234, + "grad_norm": 0.9679104685783386, + "learning_rate": 6.107955338119147e-05, + "loss": 1.0407, + "step": 14103 + }, + { + "epoch": 2.511039886039886, + "grad_norm": 0.8342793583869934, + "learning_rate": 6.10666599715698e-05, + "loss": 0.7868, + "step": 14104 + }, + { + "epoch": 2.511217948717949, + "grad_norm": 0.9264410734176636, + "learning_rate": 6.105376732476609e-05, + "loss": 0.802, + "step": 14105 + }, + { + "epoch": 2.5113960113960117, + "grad_norm": 0.7511885762214661, + "learning_rate": 6.104087544103287e-05, + "loss": 0.7561, + "step": 14106 + }, + { + "epoch": 2.511574074074074, + "grad_norm": 0.8330591320991516, + "learning_rate": 6.102798432062282e-05, + "loss": 0.8511, + "step": 14107 + }, + { + "epoch": 2.5117521367521367, + "grad_norm": 0.8971241116523743, + "learning_rate": 6.1015093963788415e-05, + "loss": 0.8854, + "step": 14108 + }, + { + "epoch": 2.5119301994301995, + "grad_norm": 0.7926762700080872, + "learning_rate": 6.100220437078228e-05, + "loss": 0.6803, + "step": 14109 + }, + { + "epoch": 2.512108262108262, + "grad_norm": 0.7384431958198547, + "learning_rate": 6.098931554185692e-05, + "loss": 0.6162, + "step": 14110 + }, + { + "epoch": 2.5122863247863245, + "grad_norm": 0.9201281070709229, + "learning_rate": 6.097642747726491e-05, + "loss": 1.1095, + "step": 14111 + }, + { + "epoch": 2.5124643874643873, + "grad_norm": 0.8822020888328552, + "learning_rate": 6.0963540177258716e-05, + "loss": 0.87, + "step": 14112 + }, + { + "epoch": 2.51264245014245, + "grad_norm": 0.8243268728256226, + "learning_rate": 6.09506536420909e-05, + "loss": 1.0899, + "step": 14113 + }, + { + "epoch": 2.5128205128205128, + "grad_norm": 0.8657538294792175, + "learning_rate": 6.093776787201386e-05, + "loss": 0.8218, + "step": 14114 + }, + { + "epoch": 2.5129985754985755, + "grad_norm": 
0.8651030659675598, + "learning_rate": 6.092488286728013e-05, + "loss": 0.7903, + "step": 14115 + }, + { + "epoch": 2.5131766381766383, + "grad_norm": 0.8341799378395081, + "learning_rate": 6.091199862814214e-05, + "loss": 0.8612, + "step": 14116 + }, + { + "epoch": 2.513354700854701, + "grad_norm": 0.7693229913711548, + "learning_rate": 6.0899115154852384e-05, + "loss": 0.819, + "step": 14117 + }, + { + "epoch": 2.513532763532764, + "grad_norm": 0.8883055448532104, + "learning_rate": 6.088623244766318e-05, + "loss": 0.9026, + "step": 14118 + }, + { + "epoch": 2.513710826210826, + "grad_norm": 0.7761621475219727, + "learning_rate": 6.087335050682703e-05, + "loss": 0.7505, + "step": 14119 + }, + { + "epoch": 2.513888888888889, + "grad_norm": 0.8152571320533752, + "learning_rate": 6.086046933259628e-05, + "loss": 0.7637, + "step": 14120 + }, + { + "epoch": 2.5140669515669516, + "grad_norm": 0.7990148663520813, + "learning_rate": 6.0847588925223376e-05, + "loss": 0.8615, + "step": 14121 + }, + { + "epoch": 2.5142450142450143, + "grad_norm": 0.844756007194519, + "learning_rate": 6.083470928496058e-05, + "loss": 0.8696, + "step": 14122 + }, + { + "epoch": 2.5144230769230766, + "grad_norm": 0.7533631324768066, + "learning_rate": 6.082183041206031e-05, + "loss": 0.7674, + "step": 14123 + }, + { + "epoch": 2.5146011396011394, + "grad_norm": 0.7914009690284729, + "learning_rate": 6.0808952306774905e-05, + "loss": 0.7577, + "step": 14124 + }, + { + "epoch": 2.514779202279202, + "grad_norm": 0.8341572284698486, + "learning_rate": 6.079607496935666e-05, + "loss": 0.8899, + "step": 14125 + }, + { + "epoch": 2.514957264957265, + "grad_norm": 0.9185548424720764, + "learning_rate": 6.078319840005788e-05, + "loss": 0.9486, + "step": 14126 + }, + { + "epoch": 2.5151353276353277, + "grad_norm": 0.8611742854118347, + "learning_rate": 6.0770322599130856e-05, + "loss": 0.8267, + "step": 14127 + }, + { + "epoch": 2.5153133903133904, + "grad_norm": 0.899135410785675, + "learning_rate": 
6.0757447566827906e-05, + "loss": 1.0829, + "step": 14128 + }, + { + "epoch": 2.515491452991453, + "grad_norm": 0.8016429543495178, + "learning_rate": 6.074457330340122e-05, + "loss": 0.8582, + "step": 14129 + }, + { + "epoch": 2.515669515669516, + "grad_norm": 0.7781331539154053, + "learning_rate": 6.073169980910307e-05, + "loss": 0.8435, + "step": 14130 + }, + { + "epoch": 2.515847578347578, + "grad_norm": 0.7605105042457581, + "learning_rate": 6.071882708418568e-05, + "loss": 0.6961, + "step": 14131 + }, + { + "epoch": 2.516025641025641, + "grad_norm": 0.9337655901908875, + "learning_rate": 6.0705955128901326e-05, + "loss": 0.8673, + "step": 14132 + }, + { + "epoch": 2.5162037037037037, + "grad_norm": 0.7868272662162781, + "learning_rate": 6.06930839435021e-05, + "loss": 0.7526, + "step": 14133 + }, + { + "epoch": 2.5163817663817665, + "grad_norm": 0.8722387552261353, + "learning_rate": 6.068021352824027e-05, + "loss": 0.9541, + "step": 14134 + }, + { + "epoch": 2.5165598290598292, + "grad_norm": 0.7682648301124573, + "learning_rate": 6.066734388336794e-05, + "loss": 0.7191, + "step": 14135 + }, + { + "epoch": 2.5167378917378915, + "grad_norm": 0.9540650844573975, + "learning_rate": 6.065447500913737e-05, + "loss": 1.0638, + "step": 14136 + }, + { + "epoch": 2.5169159544159543, + "grad_norm": 0.8276218175888062, + "learning_rate": 6.064160690580056e-05, + "loss": 0.7967, + "step": 14137 + }, + { + "epoch": 2.517094017094017, + "grad_norm": 0.7966098785400391, + "learning_rate": 6.062873957360976e-05, + "loss": 0.8913, + "step": 14138 + }, + { + "epoch": 2.51727207977208, + "grad_norm": 0.9670028686523438, + "learning_rate": 6.0615873012816974e-05, + "loss": 0.8846, + "step": 14139 + }, + { + "epoch": 2.5174501424501425, + "grad_norm": 0.819952666759491, + "learning_rate": 6.0603007223674366e-05, + "loss": 0.8409, + "step": 14140 + }, + { + "epoch": 2.5176282051282053, + "grad_norm": 0.7746681571006775, + "learning_rate": 6.0590142206433973e-05, + "loss": 0.7382, 
+ "step": 14141 + }, + { + "epoch": 2.517806267806268, + "grad_norm": 0.9452744722366333, + "learning_rate": 6.057727796134787e-05, + "loss": 0.8878, + "step": 14142 + }, + { + "epoch": 2.5179843304843303, + "grad_norm": 0.7940170168876648, + "learning_rate": 6.0564414488668165e-05, + "loss": 0.8289, + "step": 14143 + }, + { + "epoch": 2.518162393162393, + "grad_norm": 0.9046176671981812, + "learning_rate": 6.0551551788646774e-05, + "loss": 0.8596, + "step": 14144 + }, + { + "epoch": 2.518340455840456, + "grad_norm": 0.8460658192634583, + "learning_rate": 6.053868986153581e-05, + "loss": 0.7678, + "step": 14145 + }, + { + "epoch": 2.5185185185185186, + "grad_norm": 0.9131760597229004, + "learning_rate": 6.052582870758723e-05, + "loss": 0.8845, + "step": 14146 + }, + { + "epoch": 2.5186965811965814, + "grad_norm": 0.8375167846679688, + "learning_rate": 6.0512968327053076e-05, + "loss": 1.0082, + "step": 14147 + }, + { + "epoch": 2.5188746438746437, + "grad_norm": 0.8587140440940857, + "learning_rate": 6.050010872018523e-05, + "loss": 0.8745, + "step": 14148 + }, + { + "epoch": 2.5190527065527064, + "grad_norm": 0.8347265124320984, + "learning_rate": 6.048724988723575e-05, + "loss": 0.8345, + "step": 14149 + }, + { + "epoch": 2.519230769230769, + "grad_norm": 1.0271183252334595, + "learning_rate": 6.047439182845649e-05, + "loss": 0.9863, + "step": 14150 + }, + { + "epoch": 2.519408831908832, + "grad_norm": 0.6951111555099487, + "learning_rate": 6.046153454409943e-05, + "loss": 0.6162, + "step": 14151 + }, + { + "epoch": 2.5195868945868947, + "grad_norm": 0.7702959179878235, + "learning_rate": 6.044867803441645e-05, + "loss": 0.8127, + "step": 14152 + }, + { + "epoch": 2.5197649572649574, + "grad_norm": 0.7997276186943054, + "learning_rate": 6.0435822299659496e-05, + "loss": 0.6777, + "step": 14153 + }, + { + "epoch": 2.51994301994302, + "grad_norm": 0.8006166815757751, + "learning_rate": 6.0422967340080385e-05, + "loss": 0.9122, + "step": 14154 + }, + { + "epoch": 
2.5201210826210825, + "grad_norm": 0.888225793838501, + "learning_rate": 6.041011315593102e-05, + "loss": 0.7621, + "step": 14155 + }, + { + "epoch": 2.5202991452991452, + "grad_norm": 0.928814172744751, + "learning_rate": 6.039725974746324e-05, + "loss": 1.0245, + "step": 14156 + }, + { + "epoch": 2.520477207977208, + "grad_norm": 0.7914403676986694, + "learning_rate": 6.038440711492892e-05, + "loss": 0.6585, + "step": 14157 + }, + { + "epoch": 2.5206552706552707, + "grad_norm": 0.82389897108078, + "learning_rate": 6.0371555258579826e-05, + "loss": 0.7862, + "step": 14158 + }, + { + "epoch": 2.5208333333333335, + "grad_norm": 0.952135443687439, + "learning_rate": 6.035870417866778e-05, + "loss": 0.8952, + "step": 14159 + }, + { + "epoch": 2.521011396011396, + "grad_norm": 0.8626661896705627, + "learning_rate": 6.034585387544458e-05, + "loss": 0.9166, + "step": 14160 + }, + { + "epoch": 2.5211894586894585, + "grad_norm": 0.9641584157943726, + "learning_rate": 6.033300434916203e-05, + "loss": 0.8481, + "step": 14161 + }, + { + "epoch": 2.5213675213675213, + "grad_norm": 0.949110209941864, + "learning_rate": 6.0320155600071814e-05, + "loss": 0.9628, + "step": 14162 + }, + { + "epoch": 2.521545584045584, + "grad_norm": 0.8198522329330444, + "learning_rate": 6.030730762842573e-05, + "loss": 0.817, + "step": 14163 + }, + { + "epoch": 2.521723646723647, + "grad_norm": 0.9209866523742676, + "learning_rate": 6.029446043447553e-05, + "loss": 0.925, + "step": 14164 + }, + { + "epoch": 2.5219017094017095, + "grad_norm": 0.8604369163513184, + "learning_rate": 6.0281614018472854e-05, + "loss": 0.7846, + "step": 14165 + }, + { + "epoch": 2.5220797720797723, + "grad_norm": 0.882255494594574, + "learning_rate": 6.026876838066948e-05, + "loss": 0.8715, + "step": 14166 + }, + { + "epoch": 2.5222578347578346, + "grad_norm": 0.8609021306037903, + "learning_rate": 6.0255923521317015e-05, + "loss": 0.8627, + "step": 14167 + }, + { + "epoch": 2.5224358974358974, + "grad_norm": 
0.9782202243804932, + "learning_rate": 6.0243079440667226e-05, + "loss": 0.8499, + "step": 14168 + }, + { + "epoch": 2.52261396011396, + "grad_norm": 0.7932701706886292, + "learning_rate": 6.023023613897165e-05, + "loss": 0.9174, + "step": 14169 + }, + { + "epoch": 2.522792022792023, + "grad_norm": 0.8827422261238098, + "learning_rate": 6.021739361648202e-05, + "loss": 0.9384, + "step": 14170 + }, + { + "epoch": 2.5229700854700856, + "grad_norm": 0.9764171838760376, + "learning_rate": 6.020455187344989e-05, + "loss": 0.8806, + "step": 14171 + }, + { + "epoch": 2.523148148148148, + "grad_norm": 0.7635362148284912, + "learning_rate": 6.019171091012694e-05, + "loss": 0.7519, + "step": 14172 + }, + { + "epoch": 2.5233262108262107, + "grad_norm": 0.9925556182861328, + "learning_rate": 6.017887072676468e-05, + "loss": 0.8467, + "step": 14173 + }, + { + "epoch": 2.5235042735042734, + "grad_norm": 0.9624950289726257, + "learning_rate": 6.016603132361477e-05, + "loss": 0.9492, + "step": 14174 + }, + { + "epoch": 2.523682336182336, + "grad_norm": 0.7960891127586365, + "learning_rate": 6.0153192700928685e-05, + "loss": 1.0111, + "step": 14175 + }, + { + "epoch": 2.523860398860399, + "grad_norm": 0.8387307524681091, + "learning_rate": 6.014035485895804e-05, + "loss": 0.8013, + "step": 14176 + }, + { + "epoch": 2.5240384615384617, + "grad_norm": 0.8488287925720215, + "learning_rate": 6.0127517797954316e-05, + "loss": 0.8508, + "step": 14177 + }, + { + "epoch": 2.5242165242165244, + "grad_norm": 0.7339358329772949, + "learning_rate": 6.011468151816908e-05, + "loss": 0.7225, + "step": 14178 + }, + { + "epoch": 2.5243945868945867, + "grad_norm": 0.9265308976173401, + "learning_rate": 6.010184601985378e-05, + "loss": 0.7993, + "step": 14179 + }, + { + "epoch": 2.5245726495726495, + "grad_norm": 0.7752045392990112, + "learning_rate": 6.0089011303259944e-05, + "loss": 0.8315, + "step": 14180 + }, + { + "epoch": 2.5247507122507122, + "grad_norm": 0.7794929146766663, + "learning_rate": 
6.007617736863901e-05, + "loss": 0.9174, + "step": 14181 + }, + { + "epoch": 2.524928774928775, + "grad_norm": 0.9099361896514893, + "learning_rate": 6.0063344216242434e-05, + "loss": 0.8948, + "step": 14182 + }, + { + "epoch": 2.5251068376068377, + "grad_norm": 0.8161521553993225, + "learning_rate": 6.005051184632171e-05, + "loss": 0.8018, + "step": 14183 + }, + { + "epoch": 2.5252849002849, + "grad_norm": 0.9279208183288574, + "learning_rate": 6.003768025912819e-05, + "loss": 0.9032, + "step": 14184 + }, + { + "epoch": 2.525462962962963, + "grad_norm": 0.9689664840698242, + "learning_rate": 6.002484945491333e-05, + "loss": 0.9463, + "step": 14185 + }, + { + "epoch": 2.5256410256410255, + "grad_norm": 0.8367486596107483, + "learning_rate": 6.001201943392848e-05, + "loss": 0.7866, + "step": 14186 + }, + { + "epoch": 2.5258190883190883, + "grad_norm": 0.8383589386940002, + "learning_rate": 5.9999190196425056e-05, + "loss": 0.7642, + "step": 14187 + }, + { + "epoch": 2.525997150997151, + "grad_norm": 0.9113569855690002, + "learning_rate": 5.99863617426544e-05, + "loss": 1.0451, + "step": 14188 + }, + { + "epoch": 2.526175213675214, + "grad_norm": 0.896575391292572, + "learning_rate": 5.997353407286788e-05, + "loss": 0.8559, + "step": 14189 + }, + { + "epoch": 2.5263532763532766, + "grad_norm": 0.899214506149292, + "learning_rate": 5.996070718731679e-05, + "loss": 1.0006, + "step": 14190 + }, + { + "epoch": 2.5265313390313393, + "grad_norm": 0.9739418625831604, + "learning_rate": 5.994788108625247e-05, + "loss": 0.9412, + "step": 14191 + }, + { + "epoch": 2.5267094017094016, + "grad_norm": 0.7940781712532043, + "learning_rate": 5.9935055769926215e-05, + "loss": 0.8802, + "step": 14192 + }, + { + "epoch": 2.5268874643874644, + "grad_norm": 0.802066445350647, + "learning_rate": 5.9922231238589346e-05, + "loss": 0.8459, + "step": 14193 + }, + { + "epoch": 2.527065527065527, + "grad_norm": 0.827560544013977, + "learning_rate": 5.990940749249306e-05, + "loss": 0.8732, + 
"step": 14194 + }, + { + "epoch": 2.52724358974359, + "grad_norm": 0.7782348394393921, + "learning_rate": 5.989658453188869e-05, + "loss": 0.6742, + "step": 14195 + }, + { + "epoch": 2.527421652421652, + "grad_norm": 0.8418310284614563, + "learning_rate": 5.9883762357027416e-05, + "loss": 0.8699, + "step": 14196 + }, + { + "epoch": 2.527599715099715, + "grad_norm": 0.7925812602043152, + "learning_rate": 5.987094096816051e-05, + "loss": 0.6807, + "step": 14197 + }, + { + "epoch": 2.5277777777777777, + "grad_norm": 0.8200794458389282, + "learning_rate": 5.9858120365539105e-05, + "loss": 0.8249, + "step": 14198 + }, + { + "epoch": 2.5279558404558404, + "grad_norm": 0.7137587070465088, + "learning_rate": 5.9845300549414505e-05, + "loss": 0.7882, + "step": 14199 + }, + { + "epoch": 2.528133903133903, + "grad_norm": 0.8084787726402283, + "learning_rate": 5.983248152003778e-05, + "loss": 1.0161, + "step": 14200 + }, + { + "epoch": 2.528311965811966, + "grad_norm": 0.7717064023017883, + "learning_rate": 5.9819663277660156e-05, + "loss": 0.798, + "step": 14201 + }, + { + "epoch": 2.5284900284900287, + "grad_norm": 0.7722328305244446, + "learning_rate": 5.980684582253275e-05, + "loss": 0.8324, + "step": 14202 + }, + { + "epoch": 2.5286680911680914, + "grad_norm": 0.8357635140419006, + "learning_rate": 5.9794029154906696e-05, + "loss": 0.9224, + "step": 14203 + }, + { + "epoch": 2.5288461538461537, + "grad_norm": 0.8159863352775574, + "learning_rate": 5.978121327503317e-05, + "loss": 0.7529, + "step": 14204 + }, + { + "epoch": 2.5290242165242165, + "grad_norm": 0.8255389332771301, + "learning_rate": 5.976839818316317e-05, + "loss": 0.9674, + "step": 14205 + }, + { + "epoch": 2.5292022792022792, + "grad_norm": 0.8204228281974792, + "learning_rate": 5.975558387954787e-05, + "loss": 0.9138, + "step": 14206 + }, + { + "epoch": 2.529380341880342, + "grad_norm": 0.8232463598251343, + "learning_rate": 5.9742770364438275e-05, + "loss": 0.7949, + "step": 14207 + }, + { + "epoch": 
2.5295584045584043, + "grad_norm": 0.8164107203483582, + "learning_rate": 5.972995763808551e-05, + "loss": 0.7087, + "step": 14208 + }, + { + "epoch": 2.529736467236467, + "grad_norm": 0.8100822567939758, + "learning_rate": 5.971714570074052e-05, + "loss": 0.9187, + "step": 14209 + }, + { + "epoch": 2.52991452991453, + "grad_norm": 0.7401103377342224, + "learning_rate": 5.970433455265443e-05, + "loss": 0.798, + "step": 14210 + }, + { + "epoch": 2.5300925925925926, + "grad_norm": 0.798327624797821, + "learning_rate": 5.9691524194078154e-05, + "loss": 0.7312, + "step": 14211 + }, + { + "epoch": 2.5302706552706553, + "grad_norm": 0.8566045165061951, + "learning_rate": 5.9678714625262754e-05, + "loss": 0.8555, + "step": 14212 + }, + { + "epoch": 2.530448717948718, + "grad_norm": 0.8005902767181396, + "learning_rate": 5.9665905846459155e-05, + "loss": 0.7979, + "step": 14213 + }, + { + "epoch": 2.530626780626781, + "grad_norm": 0.815990686416626, + "learning_rate": 5.9653097857918396e-05, + "loss": 0.8739, + "step": 14214 + }, + { + "epoch": 2.5308048433048436, + "grad_norm": 0.7694230079650879, + "learning_rate": 5.9640290659891316e-05, + "loss": 0.7249, + "step": 14215 + }, + { + "epoch": 2.530982905982906, + "grad_norm": 0.8469253182411194, + "learning_rate": 5.962748425262892e-05, + "loss": 0.8505, + "step": 14216 + }, + { + "epoch": 2.5311609686609686, + "grad_norm": 0.8061797022819519, + "learning_rate": 5.961467863638209e-05, + "loss": 0.8979, + "step": 14217 + }, + { + "epoch": 2.5313390313390314, + "grad_norm": 1.0380569696426392, + "learning_rate": 5.960187381140179e-05, + "loss": 0.8664, + "step": 14218 + }, + { + "epoch": 2.531517094017094, + "grad_norm": 0.9435166716575623, + "learning_rate": 5.9589069777938786e-05, + "loss": 0.7566, + "step": 14219 + }, + { + "epoch": 2.5316951566951564, + "grad_norm": 0.8882613182067871, + "learning_rate": 5.957626653624407e-05, + "loss": 0.6999, + "step": 14220 + }, + { + "epoch": 2.531873219373219, + "grad_norm": 
0.8544003963470459, + "learning_rate": 5.95634640865684e-05, + "loss": 0.8028, + "step": 14221 + }, + { + "epoch": 2.532051282051282, + "grad_norm": 0.8407679796218872, + "learning_rate": 5.9550662429162655e-05, + "loss": 0.6868, + "step": 14222 + }, + { + "epoch": 2.5322293447293447, + "grad_norm": 0.9049725532531738, + "learning_rate": 5.9537861564277654e-05, + "loss": 0.8177, + "step": 14223 + }, + { + "epoch": 2.5324074074074074, + "grad_norm": 0.938050389289856, + "learning_rate": 5.952506149216419e-05, + "loss": 0.9187, + "step": 14224 + }, + { + "epoch": 2.53258547008547, + "grad_norm": 0.9515482783317566, + "learning_rate": 5.951226221307312e-05, + "loss": 0.6735, + "step": 14225 + }, + { + "epoch": 2.532763532763533, + "grad_norm": 0.8545815348625183, + "learning_rate": 5.949946372725512e-05, + "loss": 0.7643, + "step": 14226 + }, + { + "epoch": 2.5329415954415957, + "grad_norm": 0.8388620615005493, + "learning_rate": 5.9486666034961e-05, + "loss": 0.9437, + "step": 14227 + }, + { + "epoch": 2.533119658119658, + "grad_norm": 0.7202512621879578, + "learning_rate": 5.9473869136441506e-05, + "loss": 0.718, + "step": 14228 + }, + { + "epoch": 2.5332977207977208, + "grad_norm": 0.8375558853149414, + "learning_rate": 5.946107303194739e-05, + "loss": 0.852, + "step": 14229 + }, + { + "epoch": 2.5334757834757835, + "grad_norm": 0.8980572819709778, + "learning_rate": 5.94482777217293e-05, + "loss": 0.9684, + "step": 14230 + }, + { + "epoch": 2.5336538461538463, + "grad_norm": 0.7374732494354248, + "learning_rate": 5.9435483206037977e-05, + "loss": 0.6498, + "step": 14231 + }, + { + "epoch": 2.5338319088319086, + "grad_norm": 1.073758602142334, + "learning_rate": 5.942268948512409e-05, + "loss": 1.0315, + "step": 14232 + }, + { + "epoch": 2.5340099715099713, + "grad_norm": 0.9503611326217651, + "learning_rate": 5.940989655923832e-05, + "loss": 1.0644, + "step": 14233 + }, + { + "epoch": 2.534188034188034, + "grad_norm": 0.870490550994873, + "learning_rate": 
5.939710442863129e-05, + "loss": 1.1, + "step": 14234 + }, + { + "epoch": 2.534366096866097, + "grad_norm": 0.8019965887069702, + "learning_rate": 5.93843130935537e-05, + "loss": 0.9169, + "step": 14235 + }, + { + "epoch": 2.5345441595441596, + "grad_norm": 0.8333065509796143, + "learning_rate": 5.9371522554256076e-05, + "loss": 0.7848, + "step": 14236 + }, + { + "epoch": 2.5347222222222223, + "grad_norm": 0.8606435656547546, + "learning_rate": 5.935873281098909e-05, + "loss": 0.9941, + "step": 14237 + }, + { + "epoch": 2.534900284900285, + "grad_norm": 0.7711295485496521, + "learning_rate": 5.934594386400328e-05, + "loss": 0.8495, + "step": 14238 + }, + { + "epoch": 2.535078347578348, + "grad_norm": 0.871533215045929, + "learning_rate": 5.93331557135493e-05, + "loss": 0.9071, + "step": 14239 + }, + { + "epoch": 2.53525641025641, + "grad_norm": 0.9828163981437683, + "learning_rate": 5.932036835987762e-05, + "loss": 0.9561, + "step": 14240 + }, + { + "epoch": 2.535434472934473, + "grad_norm": 0.8485092520713806, + "learning_rate": 5.930758180323881e-05, + "loss": 0.7278, + "step": 14241 + }, + { + "epoch": 2.5356125356125356, + "grad_norm": 0.7608986496925354, + "learning_rate": 5.929479604388342e-05, + "loss": 1.0449, + "step": 14242 + }, + { + "epoch": 2.5357905982905984, + "grad_norm": 0.7852896451950073, + "learning_rate": 5.928201108206193e-05, + "loss": 0.8844, + "step": 14243 + }, + { + "epoch": 2.5359686609686607, + "grad_norm": 0.7636764645576477, + "learning_rate": 5.9269226918024875e-05, + "loss": 0.8259, + "step": 14244 + }, + { + "epoch": 2.5361467236467234, + "grad_norm": 0.9067455530166626, + "learning_rate": 5.925644355202269e-05, + "loss": 0.8742, + "step": 14245 + }, + { + "epoch": 2.536324786324786, + "grad_norm": 0.7911350727081299, + "learning_rate": 5.924366098430588e-05, + "loss": 0.8586, + "step": 14246 + }, + { + "epoch": 2.536502849002849, + "grad_norm": 0.8010593056678772, + "learning_rate": 5.923087921512483e-05, + "loss": 0.8524, + 
"step": 14247 + }, + { + "epoch": 2.5366809116809117, + "grad_norm": 1.232219934463501, + "learning_rate": 5.9218098244730034e-05, + "loss": 0.8302, + "step": 14248 + }, + { + "epoch": 2.5368589743589745, + "grad_norm": 0.8717244267463684, + "learning_rate": 5.9205318073371874e-05, + "loss": 0.8692, + "step": 14249 + }, + { + "epoch": 2.537037037037037, + "grad_norm": 0.9757453799247742, + "learning_rate": 5.919253870130079e-05, + "loss": 0.7986, + "step": 14250 + }, + { + "epoch": 2.5372150997151, + "grad_norm": 0.8183274865150452, + "learning_rate": 5.917976012876712e-05, + "loss": 0.8277, + "step": 14251 + }, + { + "epoch": 2.5373931623931623, + "grad_norm": 0.823930025100708, + "learning_rate": 5.916698235602125e-05, + "loss": 0.7972, + "step": 14252 + }, + { + "epoch": 2.537571225071225, + "grad_norm": 0.8480231761932373, + "learning_rate": 5.915420538331353e-05, + "loss": 0.8234, + "step": 14253 + }, + { + "epoch": 2.5377492877492878, + "grad_norm": 0.6718716621398926, + "learning_rate": 5.914142921089434e-05, + "loss": 0.5984, + "step": 14254 + }, + { + "epoch": 2.5379273504273505, + "grad_norm": 0.8506333827972412, + "learning_rate": 5.912865383901394e-05, + "loss": 0.8004, + "step": 14255 + }, + { + "epoch": 2.5381054131054133, + "grad_norm": 0.755740225315094, + "learning_rate": 5.911587926792269e-05, + "loss": 0.7465, + "step": 14256 + }, + { + "epoch": 2.5382834757834756, + "grad_norm": 0.8908467888832092, + "learning_rate": 5.9103105497870815e-05, + "loss": 0.7822, + "step": 14257 + }, + { + "epoch": 2.5384615384615383, + "grad_norm": 0.9572851061820984, + "learning_rate": 5.909033252910867e-05, + "loss": 0.7891, + "step": 14258 + }, + { + "epoch": 2.538639601139601, + "grad_norm": 0.8606489896774292, + "learning_rate": 5.907756036188644e-05, + "loss": 0.9071, + "step": 14259 + }, + { + "epoch": 2.538817663817664, + "grad_norm": 0.837658166885376, + "learning_rate": 5.906478899645444e-05, + "loss": 0.829, + "step": 14260 + }, + { + "epoch": 
2.5389957264957266, + "grad_norm": 0.8964337706565857, + "learning_rate": 5.905201843306285e-05, + "loss": 1.0385, + "step": 14261 + }, + { + "epoch": 2.5391737891737893, + "grad_norm": 0.7854750156402588, + "learning_rate": 5.903924867196189e-05, + "loss": 0.8905, + "step": 14262 + }, + { + "epoch": 2.539351851851852, + "grad_norm": 0.8828065991401672, + "learning_rate": 5.902647971340176e-05, + "loss": 0.8541, + "step": 14263 + }, + { + "epoch": 2.5395299145299144, + "grad_norm": 0.804121196269989, + "learning_rate": 5.9013711557632645e-05, + "loss": 0.8333, + "step": 14264 + }, + { + "epoch": 2.539707977207977, + "grad_norm": 0.8868918418884277, + "learning_rate": 5.900094420490475e-05, + "loss": 0.7959, + "step": 14265 + }, + { + "epoch": 2.53988603988604, + "grad_norm": 0.9231327176094055, + "learning_rate": 5.8988177655468134e-05, + "loss": 0.8007, + "step": 14266 + }, + { + "epoch": 2.5400641025641026, + "grad_norm": 0.9918177127838135, + "learning_rate": 5.897541190957301e-05, + "loss": 0.7766, + "step": 14267 + }, + { + "epoch": 2.5402421652421654, + "grad_norm": 0.8468625545501709, + "learning_rate": 5.896264696746947e-05, + "loss": 0.9209, + "step": 14268 + }, + { + "epoch": 2.5404202279202277, + "grad_norm": 0.9160833358764648, + "learning_rate": 5.894988282940761e-05, + "loss": 0.8994, + "step": 14269 + }, + { + "epoch": 2.5405982905982905, + "grad_norm": 0.8029152154922485, + "learning_rate": 5.8937119495637515e-05, + "loss": 0.7936, + "step": 14270 + }, + { + "epoch": 2.540776353276353, + "grad_norm": 0.8829928636550903, + "learning_rate": 5.8924356966409286e-05, + "loss": 0.7368, + "step": 14271 + }, + { + "epoch": 2.540954415954416, + "grad_norm": 0.9698056578636169, + "learning_rate": 5.8911595241972925e-05, + "loss": 0.789, + "step": 14272 + }, + { + "epoch": 2.5411324786324787, + "grad_norm": 0.7949244379997253, + "learning_rate": 5.8898834322578524e-05, + "loss": 0.8885, + "step": 14273 + }, + { + "epoch": 2.5413105413105415, + "grad_norm": 
1.2430917024612427, + "learning_rate": 5.888607420847605e-05, + "loss": 0.861, + "step": 14274 + }, + { + "epoch": 2.541488603988604, + "grad_norm": 0.7476705312728882, + "learning_rate": 5.887331489991559e-05, + "loss": 0.7942, + "step": 14275 + }, + { + "epoch": 2.5416666666666665, + "grad_norm": 0.9204338192939758, + "learning_rate": 5.886055639714706e-05, + "loss": 0.8633, + "step": 14276 + }, + { + "epoch": 2.5418447293447293, + "grad_norm": 0.8812162280082703, + "learning_rate": 5.884779870042047e-05, + "loss": 0.7162, + "step": 14277 + }, + { + "epoch": 2.542022792022792, + "grad_norm": 0.7859770655632019, + "learning_rate": 5.883504180998578e-05, + "loss": 0.7965, + "step": 14278 + }, + { + "epoch": 2.5422008547008548, + "grad_norm": 0.7732986211776733, + "learning_rate": 5.882228572609296e-05, + "loss": 0.9671, + "step": 14279 + }, + { + "epoch": 2.5423789173789175, + "grad_norm": 0.8555598855018616, + "learning_rate": 5.880953044899189e-05, + "loss": 0.8993, + "step": 14280 + }, + { + "epoch": 2.54255698005698, + "grad_norm": 0.7980908155441284, + "learning_rate": 5.879677597893248e-05, + "loss": 0.873, + "step": 14281 + }, + { + "epoch": 2.5427350427350426, + "grad_norm": 0.9244991540908813, + "learning_rate": 5.878402231616471e-05, + "loss": 0.934, + "step": 14282 + }, + { + "epoch": 2.5429131054131053, + "grad_norm": 1.0128331184387207, + "learning_rate": 5.877126946093835e-05, + "loss": 0.9607, + "step": 14283 + }, + { + "epoch": 2.543091168091168, + "grad_norm": 0.7916569709777832, + "learning_rate": 5.875851741350334e-05, + "loss": 0.831, + "step": 14284 + }, + { + "epoch": 2.543269230769231, + "grad_norm": 0.705007791519165, + "learning_rate": 5.8745766174109495e-05, + "loss": 0.6399, + "step": 14285 + }, + { + "epoch": 2.5434472934472936, + "grad_norm": 0.8785403966903687, + "learning_rate": 5.873301574300671e-05, + "loss": 0.9336, + "step": 14286 + }, + { + "epoch": 2.5436253561253563, + "grad_norm": 0.8225776553153992, + "learning_rate": 
5.872026612044471e-05, + "loss": 0.8252, + "step": 14287 + }, + { + "epoch": 2.5438034188034186, + "grad_norm": 0.9629518985748291, + "learning_rate": 5.870751730667337e-05, + "loss": 1.0213, + "step": 14288 + }, + { + "epoch": 2.5439814814814814, + "grad_norm": 0.8242672681808472, + "learning_rate": 5.869476930194242e-05, + "loss": 0.9642, + "step": 14289 + }, + { + "epoch": 2.544159544159544, + "grad_norm": 0.5798216462135315, + "learning_rate": 5.868202210650171e-05, + "loss": 0.4366, + "step": 14290 + }, + { + "epoch": 2.544337606837607, + "grad_norm": 0.7945725917816162, + "learning_rate": 5.86692757206009e-05, + "loss": 0.9252, + "step": 14291 + }, + { + "epoch": 2.5445156695156697, + "grad_norm": 0.9078665375709534, + "learning_rate": 5.865653014448982e-05, + "loss": 1.0551, + "step": 14292 + }, + { + "epoch": 2.544693732193732, + "grad_norm": 0.8044732809066772, + "learning_rate": 5.86437853784181e-05, + "loss": 0.7778, + "step": 14293 + }, + { + "epoch": 2.5448717948717947, + "grad_norm": 0.8317133784294128, + "learning_rate": 5.863104142263553e-05, + "loss": 1.0047, + "step": 14294 + }, + { + "epoch": 2.5450498575498575, + "grad_norm": 0.8330327272415161, + "learning_rate": 5.861829827739174e-05, + "loss": 0.8074, + "step": 14295 + }, + { + "epoch": 2.54522792022792, + "grad_norm": 0.8731801509857178, + "learning_rate": 5.8605555942936474e-05, + "loss": 0.9311, + "step": 14296 + }, + { + "epoch": 2.545405982905983, + "grad_norm": 0.8906812071800232, + "learning_rate": 5.85928144195193e-05, + "loss": 0.9084, + "step": 14297 + }, + { + "epoch": 2.5455840455840457, + "grad_norm": 0.948535144329071, + "learning_rate": 5.8580073707389935e-05, + "loss": 0.923, + "step": 14298 + }, + { + "epoch": 2.5457621082621085, + "grad_norm": 1.0418797731399536, + "learning_rate": 5.8567333806797975e-05, + "loss": 0.9786, + "step": 14299 + }, + { + "epoch": 2.5459401709401708, + "grad_norm": 0.8591430187225342, + "learning_rate": 5.8554594717993075e-05, + "loss": 0.8706, + 
"step": 14300 + }, + { + "epoch": 2.5461182336182335, + "grad_norm": 1.1056550741195679, + "learning_rate": 5.854185644122475e-05, + "loss": 0.891, + "step": 14301 + }, + { + "epoch": 2.5462962962962963, + "grad_norm": 0.8945133090019226, + "learning_rate": 5.8529118976742624e-05, + "loss": 0.9584, + "step": 14302 + }, + { + "epoch": 2.546474358974359, + "grad_norm": 0.8568279147148132, + "learning_rate": 5.851638232479629e-05, + "loss": 0.7462, + "step": 14303 + }, + { + "epoch": 2.546652421652422, + "grad_norm": 0.934648871421814, + "learning_rate": 5.850364648563527e-05, + "loss": 0.8977, + "step": 14304 + }, + { + "epoch": 2.546830484330484, + "grad_norm": 0.8074216842651367, + "learning_rate": 5.849091145950909e-05, + "loss": 0.8779, + "step": 14305 + }, + { + "epoch": 2.547008547008547, + "grad_norm": 0.8781399726867676, + "learning_rate": 5.8478177246667266e-05, + "loss": 0.8715, + "step": 14306 + }, + { + "epoch": 2.5471866096866096, + "grad_norm": 0.8237441182136536, + "learning_rate": 5.846544384735933e-05, + "loss": 0.8806, + "step": 14307 + }, + { + "epoch": 2.5473646723646723, + "grad_norm": 0.933709442615509, + "learning_rate": 5.8452711261834717e-05, + "loss": 0.8555, + "step": 14308 + }, + { + "epoch": 2.547542735042735, + "grad_norm": 0.9045436978340149, + "learning_rate": 5.843997949034292e-05, + "loss": 1.0105, + "step": 14309 + }, + { + "epoch": 2.547720797720798, + "grad_norm": 0.8088112473487854, + "learning_rate": 5.842724853313337e-05, + "loss": 0.8667, + "step": 14310 + }, + { + "epoch": 2.5478988603988606, + "grad_norm": 1.1022162437438965, + "learning_rate": 5.841451839045559e-05, + "loss": 0.9818, + "step": 14311 + }, + { + "epoch": 2.5480769230769234, + "grad_norm": 0.8974189162254333, + "learning_rate": 5.8401789062558876e-05, + "loss": 0.843, + "step": 14312 + }, + { + "epoch": 2.5482549857549857, + "grad_norm": 0.7816309928894043, + "learning_rate": 5.838906054969272e-05, + "loss": 0.8665, + "step": 14313 + }, + { + "epoch": 
2.5484330484330484, + "grad_norm": 0.8243623971939087, + "learning_rate": 5.8376332852106485e-05, + "loss": 0.7291, + "step": 14314 + }, + { + "epoch": 2.548611111111111, + "grad_norm": 0.8475931286811829, + "learning_rate": 5.8363605970049526e-05, + "loss": 0.7551, + "step": 14315 + }, + { + "epoch": 2.548789173789174, + "grad_norm": 0.8949251770973206, + "learning_rate": 5.835087990377124e-05, + "loss": 0.9138, + "step": 14316 + }, + { + "epoch": 2.548967236467236, + "grad_norm": 1.0444703102111816, + "learning_rate": 5.833815465352093e-05, + "loss": 0.8663, + "step": 14317 + }, + { + "epoch": 2.549145299145299, + "grad_norm": 0.8611619472503662, + "learning_rate": 5.8325430219547895e-05, + "loss": 0.795, + "step": 14318 + }, + { + "epoch": 2.5493233618233617, + "grad_norm": 0.7808047533035278, + "learning_rate": 5.8312706602101564e-05, + "loss": 0.905, + "step": 14319 + }, + { + "epoch": 2.5495014245014245, + "grad_norm": 0.9137473106384277, + "learning_rate": 5.8299983801431066e-05, + "loss": 0.8763, + "step": 14320 + }, + { + "epoch": 2.5496794871794872, + "grad_norm": 0.9511715769767761, + "learning_rate": 5.828726181778581e-05, + "loss": 0.8385, + "step": 14321 + }, + { + "epoch": 2.54985754985755, + "grad_norm": 0.9250940084457397, + "learning_rate": 5.8274540651415e-05, + "loss": 0.7026, + "step": 14322 + }, + { + "epoch": 2.5500356125356127, + "grad_norm": 1.001017689704895, + "learning_rate": 5.826182030256786e-05, + "loss": 0.7952, + "step": 14323 + }, + { + "epoch": 2.5502136752136755, + "grad_norm": 0.7638011574745178, + "learning_rate": 5.824910077149371e-05, + "loss": 0.7178, + "step": 14324 + }, + { + "epoch": 2.550391737891738, + "grad_norm": 0.9289371967315674, + "learning_rate": 5.823638205844164e-05, + "loss": 0.8492, + "step": 14325 + }, + { + "epoch": 2.5505698005698005, + "grad_norm": 0.8494341969490051, + "learning_rate": 5.822366416366093e-05, + "loss": 0.9095, + "step": 14326 + }, + { + "epoch": 2.5507478632478633, + "grad_norm": 
0.8686699867248535, + "learning_rate": 5.8210947087400746e-05, + "loss": 0.9548, + "step": 14327 + }, + { + "epoch": 2.550925925925926, + "grad_norm": 1.1318142414093018, + "learning_rate": 5.819823082991025e-05, + "loss": 1.0554, + "step": 14328 + }, + { + "epoch": 2.5511039886039883, + "grad_norm": 0.8405448198318481, + "learning_rate": 5.818551539143857e-05, + "loss": 0.704, + "step": 14329 + }, + { + "epoch": 2.551282051282051, + "grad_norm": 0.9133256673812866, + "learning_rate": 5.8172800772234856e-05, + "loss": 0.922, + "step": 14330 + }, + { + "epoch": 2.551460113960114, + "grad_norm": 0.8526531457901001, + "learning_rate": 5.816008697254824e-05, + "loss": 0.6452, + "step": 14331 + }, + { + "epoch": 2.5516381766381766, + "grad_norm": 0.7987905144691467, + "learning_rate": 5.81473739926278e-05, + "loss": 0.8159, + "step": 14332 + }, + { + "epoch": 2.5518162393162394, + "grad_norm": 0.8217538595199585, + "learning_rate": 5.813466183272257e-05, + "loss": 0.6703, + "step": 14333 + }, + { + "epoch": 2.551994301994302, + "grad_norm": 0.7654905915260315, + "learning_rate": 5.8121950493081765e-05, + "loss": 0.8711, + "step": 14334 + }, + { + "epoch": 2.552172364672365, + "grad_norm": 0.872327983379364, + "learning_rate": 5.8109239973954264e-05, + "loss": 1.0079, + "step": 14335 + }, + { + "epoch": 2.5523504273504276, + "grad_norm": 0.7675468325614929, + "learning_rate": 5.809653027558922e-05, + "loss": 0.7541, + "step": 14336 + }, + { + "epoch": 2.55252849002849, + "grad_norm": 0.8367551565170288, + "learning_rate": 5.808382139823563e-05, + "loss": 0.9325, + "step": 14337 + }, + { + "epoch": 2.5527065527065527, + "grad_norm": 0.7946585416793823, + "learning_rate": 5.807111334214248e-05, + "loss": 0.6703, + "step": 14338 + }, + { + "epoch": 2.5528846153846154, + "grad_norm": 0.8752394318580627, + "learning_rate": 5.805840610755876e-05, + "loss": 0.82, + "step": 14339 + }, + { + "epoch": 2.553062678062678, + "grad_norm": 0.9394813776016235, + "learning_rate": 
5.804569969473341e-05, + "loss": 1.0094, + "step": 14340 + }, + { + "epoch": 2.5532407407407405, + "grad_norm": 0.7763680219650269, + "learning_rate": 5.803299410391551e-05, + "loss": 0.8424, + "step": 14341 + }, + { + "epoch": 2.5534188034188032, + "grad_norm": 0.9148688912391663, + "learning_rate": 5.8020289335353816e-05, + "loss": 0.9344, + "step": 14342 + }, + { + "epoch": 2.553596866096866, + "grad_norm": 0.8666844367980957, + "learning_rate": 5.80075853892974e-05, + "loss": 0.8651, + "step": 14343 + }, + { + "epoch": 2.5537749287749287, + "grad_norm": 0.7240473628044128, + "learning_rate": 5.799488226599511e-05, + "loss": 0.6913, + "step": 14344 + }, + { + "epoch": 2.5539529914529915, + "grad_norm": 0.8949013352394104, + "learning_rate": 5.798217996569585e-05, + "loss": 0.7419, + "step": 14345 + }, + { + "epoch": 2.5541310541310542, + "grad_norm": 0.7760846614837646, + "learning_rate": 5.796947848864849e-05, + "loss": 0.8292, + "step": 14346 + }, + { + "epoch": 2.554309116809117, + "grad_norm": 0.8448507785797119, + "learning_rate": 5.795677783510187e-05, + "loss": 0.9953, + "step": 14347 + }, + { + "epoch": 2.5544871794871797, + "grad_norm": 0.834007203578949, + "learning_rate": 5.794407800530484e-05, + "loss": 0.8135, + "step": 14348 + }, + { + "epoch": 2.554665242165242, + "grad_norm": 0.8247915506362915, + "learning_rate": 5.793137899950629e-05, + "loss": 0.8607, + "step": 14349 + }, + { + "epoch": 2.554843304843305, + "grad_norm": 0.8796288967132568, + "learning_rate": 5.7918680817954906e-05, + "loss": 1.0479, + "step": 14350 + }, + { + "epoch": 2.5550213675213675, + "grad_norm": 0.8384763598442078, + "learning_rate": 5.790598346089964e-05, + "loss": 0.98, + "step": 14351 + }, + { + "epoch": 2.5551994301994303, + "grad_norm": 0.9394076466560364, + "learning_rate": 5.7893286928589107e-05, + "loss": 0.922, + "step": 14352 + }, + { + "epoch": 2.5553774928774926, + "grad_norm": 0.9548128843307495, + "learning_rate": 5.7880591221272184e-05, + "loss": 0.9877, 
+ "step": 14353 + }, + { + "epoch": 2.5555555555555554, + "grad_norm": 0.7609717845916748, + "learning_rate": 5.786789633919758e-05, + "loss": 0.8115, + "step": 14354 + }, + { + "epoch": 2.555733618233618, + "grad_norm": 0.7415568232536316, + "learning_rate": 5.785520228261403e-05, + "loss": 0.6336, + "step": 14355 + }, + { + "epoch": 2.555911680911681, + "grad_norm": 0.8595952391624451, + "learning_rate": 5.7842509051770246e-05, + "loss": 0.8065, + "step": 14356 + }, + { + "epoch": 2.5560897435897436, + "grad_norm": 1.0075218677520752, + "learning_rate": 5.782981664691491e-05, + "loss": 0.9967, + "step": 14357 + }, + { + "epoch": 2.5562678062678064, + "grad_norm": 0.8405288457870483, + "learning_rate": 5.781712506829669e-05, + "loss": 0.7953, + "step": 14358 + }, + { + "epoch": 2.556445868945869, + "grad_norm": 0.8259321451187134, + "learning_rate": 5.780443431616435e-05, + "loss": 0.925, + "step": 14359 + }, + { + "epoch": 2.556623931623932, + "grad_norm": 0.8155162334442139, + "learning_rate": 5.7791744390766376e-05, + "loss": 0.9658, + "step": 14360 + }, + { + "epoch": 2.556801994301994, + "grad_norm": 0.8670404553413391, + "learning_rate": 5.7779055292351545e-05, + "loss": 1.0029, + "step": 14361 + }, + { + "epoch": 2.556980056980057, + "grad_norm": 0.8574714660644531, + "learning_rate": 5.7766367021168423e-05, + "loss": 0.9208, + "step": 14362 + }, + { + "epoch": 2.5571581196581197, + "grad_norm": 1.0231248140335083, + "learning_rate": 5.775367957746556e-05, + "loss": 1.0422, + "step": 14363 + }, + { + "epoch": 2.5573361823361824, + "grad_norm": 0.8403676152229309, + "learning_rate": 5.7740992961491655e-05, + "loss": 0.8068, + "step": 14364 + }, + { + "epoch": 2.557514245014245, + "grad_norm": 0.8792767524719238, + "learning_rate": 5.7728307173495136e-05, + "loss": 1.0405, + "step": 14365 + }, + { + "epoch": 2.5576923076923075, + "grad_norm": 0.8546510934829712, + "learning_rate": 5.771562221372471e-05, + "loss": 0.8246, + "step": 14366 + }, + { + "epoch": 
2.5578703703703702, + "grad_norm": 0.7620588541030884, + "learning_rate": 5.770293808242875e-05, + "loss": 0.7588, + "step": 14367 + }, + { + "epoch": 2.558048433048433, + "grad_norm": 0.8154500722885132, + "learning_rate": 5.769025477985588e-05, + "loss": 0.8217, + "step": 14368 + }, + { + "epoch": 2.5582264957264957, + "grad_norm": 0.8630158305168152, + "learning_rate": 5.767757230625459e-05, + "loss": 0.8486, + "step": 14369 + }, + { + "epoch": 2.5584045584045585, + "grad_norm": 0.8991047143936157, + "learning_rate": 5.766489066187335e-05, + "loss": 0.9012, + "step": 14370 + }, + { + "epoch": 2.5585826210826212, + "grad_norm": 1.056725263595581, + "learning_rate": 5.7652209846960626e-05, + "loss": 0.8764, + "step": 14371 + }, + { + "epoch": 2.558760683760684, + "grad_norm": 0.7467330694198608, + "learning_rate": 5.7639529861764885e-05, + "loss": 0.6614, + "step": 14372 + }, + { + "epoch": 2.5589387464387463, + "grad_norm": 0.7930710315704346, + "learning_rate": 5.762685070653453e-05, + "loss": 0.6866, + "step": 14373 + }, + { + "epoch": 2.559116809116809, + "grad_norm": 0.9234277606010437, + "learning_rate": 5.7614172381518085e-05, + "loss": 0.9158, + "step": 14374 + }, + { + "epoch": 2.559294871794872, + "grad_norm": 1.0100786685943604, + "learning_rate": 5.7601494886963806e-05, + "loss": 0.9061, + "step": 14375 + }, + { + "epoch": 2.5594729344729346, + "grad_norm": 0.9864867925643921, + "learning_rate": 5.758881822312023e-05, + "loss": 0.9955, + "step": 14376 + }, + { + "epoch": 2.5596509971509973, + "grad_norm": 0.7328418493270874, + "learning_rate": 5.757614239023559e-05, + "loss": 0.874, + "step": 14377 + }, + { + "epoch": 2.5598290598290596, + "grad_norm": 0.8538700938224792, + "learning_rate": 5.7563467388558355e-05, + "loss": 0.8251, + "step": 14378 + }, + { + "epoch": 2.5600071225071224, + "grad_norm": 0.7603667378425598, + "learning_rate": 5.755079321833681e-05, + "loss": 0.8466, + "step": 14379 + }, + { + "epoch": 2.560185185185185, + "grad_norm": 
0.8983954787254333, + "learning_rate": 5.753811987981925e-05, + "loss": 0.754, + "step": 14380 + }, + { + "epoch": 2.560363247863248, + "grad_norm": 0.8304823040962219, + "learning_rate": 5.752544737325411e-05, + "loss": 0.7057, + "step": 14381 + }, + { + "epoch": 2.5605413105413106, + "grad_norm": 0.8694877028465271, + "learning_rate": 5.751277569888952e-05, + "loss": 0.843, + "step": 14382 + }, + { + "epoch": 2.5607193732193734, + "grad_norm": 0.7965344786643982, + "learning_rate": 5.750010485697387e-05, + "loss": 0.7679, + "step": 14383 + }, + { + "epoch": 2.560897435897436, + "grad_norm": 0.8181809782981873, + "learning_rate": 5.7487434847755386e-05, + "loss": 0.8408, + "step": 14384 + }, + { + "epoch": 2.5610754985754984, + "grad_norm": 0.8492250442504883, + "learning_rate": 5.747476567148229e-05, + "loss": 0.6768, + "step": 14385 + }, + { + "epoch": 2.561253561253561, + "grad_norm": 0.9129379987716675, + "learning_rate": 5.746209732840282e-05, + "loss": 0.8804, + "step": 14386 + }, + { + "epoch": 2.561431623931624, + "grad_norm": 0.8701111078262329, + "learning_rate": 5.74494298187652e-05, + "loss": 1.0908, + "step": 14387 + }, + { + "epoch": 2.5616096866096867, + "grad_norm": 0.9152243733406067, + "learning_rate": 5.7436763142817606e-05, + "loss": 0.8197, + "step": 14388 + }, + { + "epoch": 2.5617877492877494, + "grad_norm": 0.8663429617881775, + "learning_rate": 5.742409730080822e-05, + "loss": 0.8065, + "step": 14389 + }, + { + "epoch": 2.5619658119658117, + "grad_norm": 0.8722090721130371, + "learning_rate": 5.741143229298516e-05, + "loss": 0.8634, + "step": 14390 + }, + { + "epoch": 2.5621438746438745, + "grad_norm": 0.8126732707023621, + "learning_rate": 5.7398768119596704e-05, + "loss": 0.831, + "step": 14391 + }, + { + "epoch": 2.5623219373219372, + "grad_norm": 0.9060684442520142, + "learning_rate": 5.7386104780890794e-05, + "loss": 0.8757, + "step": 14392 + }, + { + "epoch": 2.5625, + "grad_norm": 0.947692334651947, + "learning_rate": 
5.7373442277115696e-05, + "loss": 0.8606, + "step": 14393 + }, + { + "epoch": 2.5626780626780628, + "grad_norm": 0.8826618790626526, + "learning_rate": 5.736078060851944e-05, + "loss": 0.9942, + "step": 14394 + }, + { + "epoch": 2.5628561253561255, + "grad_norm": 0.915372908115387, + "learning_rate": 5.734811977535011e-05, + "loss": 0.7871, + "step": 14395 + }, + { + "epoch": 2.5630341880341883, + "grad_norm": 0.8202184438705444, + "learning_rate": 5.733545977785577e-05, + "loss": 0.8889, + "step": 14396 + }, + { + "epoch": 2.5632122507122506, + "grad_norm": 1.0160186290740967, + "learning_rate": 5.7322800616284475e-05, + "loss": 0.8973, + "step": 14397 + }, + { + "epoch": 2.5633903133903133, + "grad_norm": 0.848753809928894, + "learning_rate": 5.7310142290884206e-05, + "loss": 0.9517, + "step": 14398 + }, + { + "epoch": 2.563568376068376, + "grad_norm": 0.7473777532577515, + "learning_rate": 5.72974848019031e-05, + "loss": 0.6549, + "step": 14399 + }, + { + "epoch": 2.563746438746439, + "grad_norm": 0.7396529316902161, + "learning_rate": 5.728482814958899e-05, + "loss": 0.7814, + "step": 14400 + }, + { + "epoch": 2.5639245014245016, + "grad_norm": 0.8617672920227051, + "learning_rate": 5.727217233418998e-05, + "loss": 0.885, + "step": 14401 + }, + { + "epoch": 2.564102564102564, + "grad_norm": 0.920477569103241, + "learning_rate": 5.7259517355953984e-05, + "loss": 1.0269, + "step": 14402 + }, + { + "epoch": 2.5642806267806266, + "grad_norm": 0.8460386991500854, + "learning_rate": 5.7246863215128975e-05, + "loss": 0.7418, + "step": 14403 + }, + { + "epoch": 2.5644586894586894, + "grad_norm": 0.8857694268226624, + "learning_rate": 5.723420991196287e-05, + "loss": 0.7771, + "step": 14404 + }, + { + "epoch": 2.564636752136752, + "grad_norm": 0.9912863969802856, + "learning_rate": 5.722155744670352e-05, + "loss": 1.101, + "step": 14405 + }, + { + "epoch": 2.564814814814815, + "grad_norm": 0.8062789440155029, + "learning_rate": 5.720890581959899e-05, + "loss": 0.8602, + 
"step": 14406 + }, + { + "epoch": 2.5649928774928776, + "grad_norm": 0.8620314598083496, + "learning_rate": 5.719625503089698e-05, + "loss": 0.9433, + "step": 14407 + }, + { + "epoch": 2.5651709401709404, + "grad_norm": 0.8119623064994812, + "learning_rate": 5.718360508084546e-05, + "loss": 0.884, + "step": 14408 + }, + { + "epoch": 2.5653490028490027, + "grad_norm": 0.7872169613838196, + "learning_rate": 5.7170955969692265e-05, + "loss": 0.8247, + "step": 14409 + }, + { + "epoch": 2.5655270655270654, + "grad_norm": 0.8314040303230286, + "learning_rate": 5.715830769768522e-05, + "loss": 0.8643, + "step": 14410 + }, + { + "epoch": 2.565705128205128, + "grad_norm": 0.9003102779388428, + "learning_rate": 5.7145660265072145e-05, + "loss": 0.9426, + "step": 14411 + }, + { + "epoch": 2.565883190883191, + "grad_norm": 0.6572127938270569, + "learning_rate": 5.713301367210082e-05, + "loss": 0.4852, + "step": 14412 + }, + { + "epoch": 2.5660612535612537, + "grad_norm": 0.9557960629463196, + "learning_rate": 5.7120367919019044e-05, + "loss": 0.9281, + "step": 14413 + }, + { + "epoch": 2.566239316239316, + "grad_norm": 0.9009736180305481, + "learning_rate": 5.71077230060746e-05, + "loss": 1.0154, + "step": 14414 + }, + { + "epoch": 2.5664173789173788, + "grad_norm": 0.8672121167182922, + "learning_rate": 5.7095078933515175e-05, + "loss": 0.805, + "step": 14415 + }, + { + "epoch": 2.5665954415954415, + "grad_norm": 0.9077832698822021, + "learning_rate": 5.708243570158862e-05, + "loss": 0.7446, + "step": 14416 + }, + { + "epoch": 2.5667735042735043, + "grad_norm": 0.850246787071228, + "learning_rate": 5.706979331054252e-05, + "loss": 0.7773, + "step": 14417 + }, + { + "epoch": 2.566951566951567, + "grad_norm": 0.803983211517334, + "learning_rate": 5.705715176062467e-05, + "loss": 0.9361, + "step": 14418 + }, + { + "epoch": 2.5671296296296298, + "grad_norm": 0.8956922888755798, + "learning_rate": 5.704451105208273e-05, + "loss": 0.8962, + "step": 14419 + }, + { + "epoch": 
2.5673076923076925, + "grad_norm": 0.8994067907333374, + "learning_rate": 5.703187118516433e-05, + "loss": 0.9902, + "step": 14420 + }, + { + "epoch": 2.567485754985755, + "grad_norm": 0.7383418679237366, + "learning_rate": 5.701923216011722e-05, + "loss": 0.8188, + "step": 14421 + }, + { + "epoch": 2.5676638176638176, + "grad_norm": 0.8397318720817566, + "learning_rate": 5.70065939771889e-05, + "loss": 0.8557, + "step": 14422 + }, + { + "epoch": 2.5678418803418803, + "grad_norm": 0.8804301023483276, + "learning_rate": 5.699395663662714e-05, + "loss": 0.7248, + "step": 14423 + }, + { + "epoch": 2.568019943019943, + "grad_norm": 0.8391412496566772, + "learning_rate": 5.698132013867938e-05, + "loss": 0.6986, + "step": 14424 + }, + { + "epoch": 2.568198005698006, + "grad_norm": 0.7337331771850586, + "learning_rate": 5.6968684483593334e-05, + "loss": 0.7911, + "step": 14425 + }, + { + "epoch": 2.568376068376068, + "grad_norm": 1.006412386894226, + "learning_rate": 5.695604967161652e-05, + "loss": 1.0131, + "step": 14426 + }, + { + "epoch": 2.568554131054131, + "grad_norm": 0.7777771353721619, + "learning_rate": 5.6943415702996494e-05, + "loss": 0.67, + "step": 14427 + }, + { + "epoch": 2.5687321937321936, + "grad_norm": 0.8864775896072388, + "learning_rate": 5.6930782577980803e-05, + "loss": 0.9513, + "step": 14428 + }, + { + "epoch": 2.5689102564102564, + "grad_norm": 0.8505052328109741, + "learning_rate": 5.691815029681695e-05, + "loss": 0.7213, + "step": 14429 + }, + { + "epoch": 2.569088319088319, + "grad_norm": 0.705781877040863, + "learning_rate": 5.6905518859752416e-05, + "loss": 0.8273, + "step": 14430 + }, + { + "epoch": 2.569266381766382, + "grad_norm": 0.7157384753227234, + "learning_rate": 5.689288826703479e-05, + "loss": 0.6854, + "step": 14431 + }, + { + "epoch": 2.5694444444444446, + "grad_norm": 0.871244490146637, + "learning_rate": 5.68802585189114e-05, + "loss": 0.8786, + "step": 14432 + }, + { + "epoch": 2.5696225071225074, + "grad_norm": 
0.8742622137069702, + "learning_rate": 5.686762961562981e-05, + "loss": 0.7253, + "step": 14433 + }, + { + "epoch": 2.5698005698005697, + "grad_norm": 0.8194206357002258, + "learning_rate": 5.685500155743742e-05, + "loss": 0.9028, + "step": 14434 + }, + { + "epoch": 2.5699786324786325, + "grad_norm": 0.7505850195884705, + "learning_rate": 5.684237434458164e-05, + "loss": 0.7711, + "step": 14435 + }, + { + "epoch": 2.570156695156695, + "grad_norm": 0.9128859639167786, + "learning_rate": 5.6829747977309885e-05, + "loss": 0.8495, + "step": 14436 + }, + { + "epoch": 2.570334757834758, + "grad_norm": 0.6996384263038635, + "learning_rate": 5.681712245586954e-05, + "loss": 0.6938, + "step": 14437 + }, + { + "epoch": 2.5705128205128203, + "grad_norm": 0.8720461130142212, + "learning_rate": 5.680449778050798e-05, + "loss": 1.0547, + "step": 14438 + }, + { + "epoch": 2.570690883190883, + "grad_norm": 0.7767693996429443, + "learning_rate": 5.6791873951472544e-05, + "loss": 0.8718, + "step": 14439 + }, + { + "epoch": 2.5708689458689458, + "grad_norm": 0.8596739768981934, + "learning_rate": 5.6779250969010554e-05, + "loss": 0.792, + "step": 14440 + }, + { + "epoch": 2.5710470085470085, + "grad_norm": 1.0065197944641113, + "learning_rate": 5.676662883336939e-05, + "loss": 0.9199, + "step": 14441 + }, + { + "epoch": 2.5712250712250713, + "grad_norm": 0.8707680702209473, + "learning_rate": 5.6754007544796316e-05, + "loss": 0.9696, + "step": 14442 + }, + { + "epoch": 2.571403133903134, + "grad_norm": 0.8060235977172852, + "learning_rate": 5.674138710353865e-05, + "loss": 0.6626, + "step": 14443 + }, + { + "epoch": 2.5715811965811968, + "grad_norm": 0.7640239596366882, + "learning_rate": 5.6728767509843627e-05, + "loss": 0.8177, + "step": 14444 + }, + { + "epoch": 2.5717592592592595, + "grad_norm": 1.0821335315704346, + "learning_rate": 5.671614876395848e-05, + "loss": 1.0084, + "step": 14445 + }, + { + "epoch": 2.571937321937322, + "grad_norm": 0.874721884727478, + "learning_rate": 
5.670353086613056e-05, + "loss": 0.9508, + "step": 14446 + }, + { + "epoch": 2.5721153846153846, + "grad_norm": 0.7837753891944885, + "learning_rate": 5.669091381660694e-05, + "loss": 0.6546, + "step": 14447 + }, + { + "epoch": 2.5722934472934473, + "grad_norm": 0.832924485206604, + "learning_rate": 5.6678297615634965e-05, + "loss": 0.9055, + "step": 14448 + }, + { + "epoch": 2.57247150997151, + "grad_norm": 0.8463562726974487, + "learning_rate": 5.6665682263461696e-05, + "loss": 0.8234, + "step": 14449 + }, + { + "epoch": 2.5726495726495724, + "grad_norm": 0.8785214424133301, + "learning_rate": 5.6653067760334386e-05, + "loss": 0.8478, + "step": 14450 + }, + { + "epoch": 2.572827635327635, + "grad_norm": 0.7375151515007019, + "learning_rate": 5.664045410650017e-05, + "loss": 0.8629, + "step": 14451 + }, + { + "epoch": 2.573005698005698, + "grad_norm": 0.7428547143936157, + "learning_rate": 5.6627841302206196e-05, + "loss": 0.9198, + "step": 14452 + }, + { + "epoch": 2.5731837606837606, + "grad_norm": 0.7373468279838562, + "learning_rate": 5.661522934769956e-05, + "loss": 0.6931, + "step": 14453 + }, + { + "epoch": 2.5733618233618234, + "grad_norm": 0.9162034392356873, + "learning_rate": 5.660261824322739e-05, + "loss": 0.9971, + "step": 14454 + }, + { + "epoch": 2.573539886039886, + "grad_norm": 0.7816632390022278, + "learning_rate": 5.659000798903672e-05, + "loss": 1.0481, + "step": 14455 + }, + { + "epoch": 2.573717948717949, + "grad_norm": 0.8594158291816711, + "learning_rate": 5.657739858537474e-05, + "loss": 1.1846, + "step": 14456 + }, + { + "epoch": 2.5738960113960117, + "grad_norm": 0.8171747922897339, + "learning_rate": 5.656479003248836e-05, + "loss": 0.8435, + "step": 14457 + }, + { + "epoch": 2.574074074074074, + "grad_norm": 0.8568267822265625, + "learning_rate": 5.6552182330624784e-05, + "loss": 1.031, + "step": 14458 + }, + { + "epoch": 2.5742521367521367, + "grad_norm": 0.8238523602485657, + "learning_rate": 5.653957548003084e-05, + "loss": 0.8917, 
+ "step": 14459 + }, + { + "epoch": 2.5744301994301995, + "grad_norm": 0.7226746082305908, + "learning_rate": 5.652696948095369e-05, + "loss": 0.832, + "step": 14460 + }, + { + "epoch": 2.574608262108262, + "grad_norm": 0.9448554515838623, + "learning_rate": 5.651436433364024e-05, + "loss": 0.9696, + "step": 14461 + }, + { + "epoch": 2.5747863247863245, + "grad_norm": 0.9404924511909485, + "learning_rate": 5.650176003833747e-05, + "loss": 0.9813, + "step": 14462 + }, + { + "epoch": 2.5749643874643873, + "grad_norm": 0.9445366859436035, + "learning_rate": 5.648915659529241e-05, + "loss": 0.7205, + "step": 14463 + }, + { + "epoch": 2.57514245014245, + "grad_norm": 0.9205772876739502, + "learning_rate": 5.647655400475189e-05, + "loss": 0.958, + "step": 14464 + }, + { + "epoch": 2.5753205128205128, + "grad_norm": 0.9025790691375732, + "learning_rate": 5.646395226696291e-05, + "loss": 0.9107, + "step": 14465 + }, + { + "epoch": 2.5754985754985755, + "grad_norm": 0.9562451839447021, + "learning_rate": 5.645135138217235e-05, + "loss": 0.7618, + "step": 14466 + }, + { + "epoch": 2.5756766381766383, + "grad_norm": 0.8896244764328003, + "learning_rate": 5.6438751350627085e-05, + "loss": 0.9696, + "step": 14467 + }, + { + "epoch": 2.575854700854701, + "grad_norm": 0.9051744937896729, + "learning_rate": 5.6426152172574e-05, + "loss": 0.9537, + "step": 14468 + }, + { + "epoch": 2.576032763532764, + "grad_norm": 0.844556450843811, + "learning_rate": 5.641355384825995e-05, + "loss": 0.8686, + "step": 14469 + }, + { + "epoch": 2.576210826210826, + "grad_norm": 0.7751742601394653, + "learning_rate": 5.6400956377931726e-05, + "loss": 0.8373, + "step": 14470 + }, + { + "epoch": 2.576388888888889, + "grad_norm": 0.8988052010536194, + "learning_rate": 5.638835976183627e-05, + "loss": 0.8661, + "step": 14471 + }, + { + "epoch": 2.5765669515669516, + "grad_norm": 0.9114456176757812, + "learning_rate": 5.637576400022023e-05, + "loss": 1.0583, + "step": 14472 + }, + { + "epoch": 
2.5767450142450143, + "grad_norm": 0.8742861151695251, + "learning_rate": 5.636316909333056e-05, + "loss": 0.8392, + "step": 14473 + }, + { + "epoch": 2.5769230769230766, + "grad_norm": 0.8418447375297546, + "learning_rate": 5.6350575041413854e-05, + "loss": 0.7494, + "step": 14474 + }, + { + "epoch": 2.5771011396011394, + "grad_norm": 0.9942673444747925, + "learning_rate": 5.633798184471701e-05, + "loss": 1.0183, + "step": 14475 + }, + { + "epoch": 2.577279202279202, + "grad_norm": 0.7663289308547974, + "learning_rate": 5.63253895034867e-05, + "loss": 0.7551, + "step": 14476 + }, + { + "epoch": 2.577457264957265, + "grad_norm": 0.8866778016090393, + "learning_rate": 5.631279801796966e-05, + "loss": 0.8623, + "step": 14477 + }, + { + "epoch": 2.5776353276353277, + "grad_norm": 0.9198449850082397, + "learning_rate": 5.6300207388412595e-05, + "loss": 1.0388, + "step": 14478 + }, + { + "epoch": 2.5778133903133904, + "grad_norm": 0.8202611804008484, + "learning_rate": 5.628761761506214e-05, + "loss": 0.7556, + "step": 14479 + }, + { + "epoch": 2.577991452991453, + "grad_norm": 0.751899003982544, + "learning_rate": 5.627502869816505e-05, + "loss": 0.8231, + "step": 14480 + }, + { + "epoch": 2.578169515669516, + "grad_norm": 1.0094623565673828, + "learning_rate": 5.626244063796795e-05, + "loss": 0.9778, + "step": 14481 + }, + { + "epoch": 2.578347578347578, + "grad_norm": 0.8163259625434875, + "learning_rate": 5.624985343471747e-05, + "loss": 0.8355, + "step": 14482 + }, + { + "epoch": 2.578525641025641, + "grad_norm": 0.8190516829490662, + "learning_rate": 5.623726708866023e-05, + "loss": 0.7736, + "step": 14483 + }, + { + "epoch": 2.5787037037037037, + "grad_norm": 0.884303629398346, + "learning_rate": 5.622468160004283e-05, + "loss": 0.8618, + "step": 14484 + }, + { + "epoch": 2.5788817663817665, + "grad_norm": 0.8564121723175049, + "learning_rate": 5.621209696911185e-05, + "loss": 0.9691, + "step": 14485 + }, + { + "epoch": 2.5790598290598292, + "grad_norm": 
0.8122418522834778, + "learning_rate": 5.619951319611388e-05, + "loss": 0.7539, + "step": 14486 + }, + { + "epoch": 2.5792378917378915, + "grad_norm": 0.764470636844635, + "learning_rate": 5.6186930281295425e-05, + "loss": 0.7065, + "step": 14487 + }, + { + "epoch": 2.5794159544159543, + "grad_norm": 0.7477477192878723, + "learning_rate": 5.617434822490313e-05, + "loss": 0.7456, + "step": 14488 + }, + { + "epoch": 2.579594017094017, + "grad_norm": 0.9460917711257935, + "learning_rate": 5.616176702718335e-05, + "loss": 0.8427, + "step": 14489 + }, + { + "epoch": 2.57977207977208, + "grad_norm": 0.858561098575592, + "learning_rate": 5.614918668838274e-05, + "loss": 0.8913, + "step": 14490 + }, + { + "epoch": 2.5799501424501425, + "grad_norm": 0.8664894104003906, + "learning_rate": 5.613660720874772e-05, + "loss": 0.9211, + "step": 14491 + }, + { + "epoch": 2.5801282051282053, + "grad_norm": 0.8475569486618042, + "learning_rate": 5.612402858852475e-05, + "loss": 0.8149, + "step": 14492 + }, + { + "epoch": 2.580306267806268, + "grad_norm": 0.9543033838272095, + "learning_rate": 5.6111450827960296e-05, + "loss": 0.679, + "step": 14493 + }, + { + "epoch": 2.5804843304843303, + "grad_norm": 0.8219496011734009, + "learning_rate": 5.60988739273008e-05, + "loss": 0.9586, + "step": 14494 + }, + { + "epoch": 2.580662393162393, + "grad_norm": 0.8432445526123047, + "learning_rate": 5.6086297886792684e-05, + "loss": 0.8939, + "step": 14495 + }, + { + "epoch": 2.580840455840456, + "grad_norm": 0.9026654362678528, + "learning_rate": 5.607372270668232e-05, + "loss": 0.7422, + "step": 14496 + }, + { + "epoch": 2.5810185185185186, + "grad_norm": 1.0681802034378052, + "learning_rate": 5.606114838721608e-05, + "loss": 0.8208, + "step": 14497 + }, + { + "epoch": 2.5811965811965814, + "grad_norm": 0.8807427883148193, + "learning_rate": 5.604857492864044e-05, + "loss": 0.8463, + "step": 14498 + }, + { + "epoch": 2.5813746438746437, + "grad_norm": 0.7520862221717834, + "learning_rate": 
5.603600233120159e-05, + "loss": 0.6691, + "step": 14499 + }, + { + "epoch": 2.5815527065527064, + "grad_norm": 0.8214079737663269, + "learning_rate": 5.602343059514599e-05, + "loss": 0.8416, + "step": 14500 + }, + { + "epoch": 2.581730769230769, + "grad_norm": 0.9263389110565186, + "learning_rate": 5.601085972071991e-05, + "loss": 1.1466, + "step": 14501 + }, + { + "epoch": 2.581908831908832, + "grad_norm": 0.8501101136207581, + "learning_rate": 5.5998289708169626e-05, + "loss": 0.855, + "step": 14502 + }, + { + "epoch": 2.5820868945868947, + "grad_norm": 0.8312939405441284, + "learning_rate": 5.598572055774152e-05, + "loss": 0.9843, + "step": 14503 + }, + { + "epoch": 2.5822649572649574, + "grad_norm": 0.7309035658836365, + "learning_rate": 5.5973152269681714e-05, + "loss": 0.813, + "step": 14504 + }, + { + "epoch": 2.58244301994302, + "grad_norm": 0.8962578177452087, + "learning_rate": 5.596058484423656e-05, + "loss": 0.7619, + "step": 14505 + }, + { + "epoch": 2.5826210826210825, + "grad_norm": 0.7805112600326538, + "learning_rate": 5.594801828165228e-05, + "loss": 1.1011, + "step": 14506 + }, + { + "epoch": 2.5827991452991452, + "grad_norm": 1.224509358406067, + "learning_rate": 5.593545258217505e-05, + "loss": 0.9764, + "step": 14507 + }, + { + "epoch": 2.582977207977208, + "grad_norm": 0.8085877895355225, + "learning_rate": 5.59228877460511e-05, + "loss": 0.9324, + "step": 14508 + }, + { + "epoch": 2.5831552706552707, + "grad_norm": 0.7962629199028015, + "learning_rate": 5.591032377352661e-05, + "loss": 0.6294, + "step": 14509 + }, + { + "epoch": 2.5833333333333335, + "grad_norm": 0.8638611435890198, + "learning_rate": 5.589776066484773e-05, + "loss": 0.7355, + "step": 14510 + }, + { + "epoch": 2.583511396011396, + "grad_norm": 0.8975821733474731, + "learning_rate": 5.588519842026061e-05, + "loss": 1.0264, + "step": 14511 + }, + { + "epoch": 2.5836894586894585, + "grad_norm": 0.8327218890190125, + "learning_rate": 5.5872637040011355e-05, + "loss": 0.8864, + 
"step": 14512 + }, + { + "epoch": 2.5838675213675213, + "grad_norm": 0.8141334652900696, + "learning_rate": 5.5860076524346197e-05, + "loss": 1.0277, + "step": 14513 + }, + { + "epoch": 2.584045584045584, + "grad_norm": 0.8557519316673279, + "learning_rate": 5.584751687351105e-05, + "loss": 0.9215, + "step": 14514 + }, + { + "epoch": 2.584223646723647, + "grad_norm": 0.902601957321167, + "learning_rate": 5.583495808775214e-05, + "loss": 0.8527, + "step": 14515 + }, + { + "epoch": 2.5844017094017095, + "grad_norm": 0.826359212398529, + "learning_rate": 5.582240016731548e-05, + "loss": 0.8524, + "step": 14516 + }, + { + "epoch": 2.5845797720797723, + "grad_norm": 0.7099179029464722, + "learning_rate": 5.580984311244713e-05, + "loss": 0.6923, + "step": 14517 + }, + { + "epoch": 2.5847578347578346, + "grad_norm": 0.829795777797699, + "learning_rate": 5.5797286923393086e-05, + "loss": 0.7211, + "step": 14518 + }, + { + "epoch": 2.5849358974358974, + "grad_norm": 0.8006768226623535, + "learning_rate": 5.5784731600399355e-05, + "loss": 0.7237, + "step": 14519 + }, + { + "epoch": 2.58511396011396, + "grad_norm": 0.7596119046211243, + "learning_rate": 5.577217714371203e-05, + "loss": 0.7651, + "step": 14520 + }, + { + "epoch": 2.585292022792023, + "grad_norm": 0.7901585102081299, + "learning_rate": 5.575962355357694e-05, + "loss": 0.7672, + "step": 14521 + }, + { + "epoch": 2.5854700854700856, + "grad_norm": 0.8586403131484985, + "learning_rate": 5.574707083024018e-05, + "loss": 1.084, + "step": 14522 + }, + { + "epoch": 2.585648148148148, + "grad_norm": 0.7670607566833496, + "learning_rate": 5.5734518973947616e-05, + "loss": 0.7929, + "step": 14523 + }, + { + "epoch": 2.5858262108262107, + "grad_norm": 0.8114384412765503, + "learning_rate": 5.572196798494522e-05, + "loss": 0.8154, + "step": 14524 + }, + { + "epoch": 2.5860042735042734, + "grad_norm": 0.8050188422203064, + "learning_rate": 5.570941786347888e-05, + "loss": 0.7969, + "step": 14525 + }, + { + "epoch": 
2.586182336182336, + "grad_norm": 0.8641461133956909, + "learning_rate": 5.569686860979447e-05, + "loss": 0.8469, + "step": 14526 + }, + { + "epoch": 2.586360398860399, + "grad_norm": 0.7644940614700317, + "learning_rate": 5.568432022413787e-05, + "loss": 0.563, + "step": 14527 + }, + { + "epoch": 2.5865384615384617, + "grad_norm": 0.7620565891265869, + "learning_rate": 5.567177270675503e-05, + "loss": 0.657, + "step": 14528 + }, + { + "epoch": 2.5867165242165244, + "grad_norm": 0.8371306657791138, + "learning_rate": 5.5659226057891634e-05, + "loss": 0.8862, + "step": 14529 + }, + { + "epoch": 2.5868945868945867, + "grad_norm": 0.8996389508247375, + "learning_rate": 5.564668027779367e-05, + "loss": 0.6031, + "step": 14530 + }, + { + "epoch": 2.5870726495726495, + "grad_norm": 0.8691734671592712, + "learning_rate": 5.5634135366706806e-05, + "loss": 0.9198, + "step": 14531 + }, + { + "epoch": 2.5872507122507122, + "grad_norm": 0.8926620483398438, + "learning_rate": 5.562159132487693e-05, + "loss": 0.8691, + "step": 14532 + }, + { + "epoch": 2.587428774928775, + "grad_norm": 1.0852068662643433, + "learning_rate": 5.5609048152549794e-05, + "loss": 1.2338, + "step": 14533 + }, + { + "epoch": 2.5876068376068377, + "grad_norm": 0.7894790172576904, + "learning_rate": 5.5596505849971124e-05, + "loss": 0.907, + "step": 14534 + }, + { + "epoch": 2.5877849002849, + "grad_norm": 0.8084964156150818, + "learning_rate": 5.558396441738669e-05, + "loss": 0.9082, + "step": 14535 + }, + { + "epoch": 2.587962962962963, + "grad_norm": 1.0563920736312866, + "learning_rate": 5.557142385504222e-05, + "loss": 1.0364, + "step": 14536 + }, + { + "epoch": 2.5881410256410255, + "grad_norm": 0.7996996641159058, + "learning_rate": 5.5558884163183354e-05, + "loss": 0.925, + "step": 14537 + }, + { + "epoch": 2.5883190883190883, + "grad_norm": 0.7493244409561157, + "learning_rate": 5.5546345342055916e-05, + "loss": 0.9516, + "step": 14538 + }, + { + "epoch": 2.588497150997151, + "grad_norm": 
0.8916776776313782, + "learning_rate": 5.553380739190541e-05, + "loss": 0.8164, + "step": 14539 + }, + { + "epoch": 2.588675213675214, + "grad_norm": 0.8178156614303589, + "learning_rate": 5.552127031297762e-05, + "loss": 0.905, + "step": 14540 + }, + { + "epoch": 2.5888532763532766, + "grad_norm": 0.8305806517601013, + "learning_rate": 5.550873410551816e-05, + "loss": 0.789, + "step": 14541 + }, + { + "epoch": 2.5890313390313393, + "grad_norm": 0.9307064414024353, + "learning_rate": 5.549619876977258e-05, + "loss": 0.8529, + "step": 14542 + }, + { + "epoch": 2.5892094017094016, + "grad_norm": 0.8526419401168823, + "learning_rate": 5.5483664305986614e-05, + "loss": 0.8314, + "step": 14543 + }, + { + "epoch": 2.5893874643874644, + "grad_norm": 0.884918212890625, + "learning_rate": 5.547113071440568e-05, + "loss": 0.7957, + "step": 14544 + }, + { + "epoch": 2.589565527065527, + "grad_norm": 0.7517948746681213, + "learning_rate": 5.5458597995275554e-05, + "loss": 0.7012, + "step": 14545 + }, + { + "epoch": 2.58974358974359, + "grad_norm": 0.8321232199668884, + "learning_rate": 5.5446066148841556e-05, + "loss": 1.0017, + "step": 14546 + }, + { + "epoch": 2.589921652421652, + "grad_norm": 0.8279885053634644, + "learning_rate": 5.543353517534939e-05, + "loss": 0.987, + "step": 14547 + }, + { + "epoch": 2.590099715099715, + "grad_norm": 0.8651175498962402, + "learning_rate": 5.542100507504454e-05, + "loss": 0.8929, + "step": 14548 + }, + { + "epoch": 2.5902777777777777, + "grad_norm": 0.9273492097854614, + "learning_rate": 5.540847584817248e-05, + "loss": 0.9503, + "step": 14549 + }, + { + "epoch": 2.5904558404558404, + "grad_norm": 0.8779071569442749, + "learning_rate": 5.5395947494978696e-05, + "loss": 0.9099, + "step": 14550 + }, + { + "epoch": 2.590633903133903, + "grad_norm": 0.8860164880752563, + "learning_rate": 5.538342001570868e-05, + "loss": 0.9559, + "step": 14551 + }, + { + "epoch": 2.590811965811966, + "grad_norm": 0.9232339859008789, + "learning_rate": 
5.5370893410607816e-05, + "loss": 0.9495, + "step": 14552 + }, + { + "epoch": 2.5909900284900287, + "grad_norm": 0.8176831007003784, + "learning_rate": 5.5358367679921666e-05, + "loss": 0.8897, + "step": 14553 + }, + { + "epoch": 2.5911680911680914, + "grad_norm": 0.7926605939865112, + "learning_rate": 5.5345842823895486e-05, + "loss": 0.7609, + "step": 14554 + }, + { + "epoch": 2.5913461538461537, + "grad_norm": 0.9837173819541931, + "learning_rate": 5.533331884277484e-05, + "loss": 0.7842, + "step": 14555 + }, + { + "epoch": 2.5915242165242165, + "grad_norm": 0.7303726673126221, + "learning_rate": 5.5320795736804945e-05, + "loss": 0.824, + "step": 14556 + }, + { + "epoch": 2.5917022792022792, + "grad_norm": 0.8379296660423279, + "learning_rate": 5.530827350623128e-05, + "loss": 0.8005, + "step": 14557 + }, + { + "epoch": 2.591880341880342, + "grad_norm": 0.8562047481536865, + "learning_rate": 5.529575215129916e-05, + "loss": 1.0048, + "step": 14558 + }, + { + "epoch": 2.5920584045584043, + "grad_norm": 0.7543022632598877, + "learning_rate": 5.528323167225386e-05, + "loss": 0.7543, + "step": 14559 + }, + { + "epoch": 2.592236467236467, + "grad_norm": 0.8205977082252502, + "learning_rate": 5.5270712069340847e-05, + "loss": 0.997, + "step": 14560 + }, + { + "epoch": 2.59241452991453, + "grad_norm": 0.8566918969154358, + "learning_rate": 5.525819334280522e-05, + "loss": 0.9222, + "step": 14561 + }, + { + "epoch": 2.5925925925925926, + "grad_norm": 0.8513971567153931, + "learning_rate": 5.524567549289239e-05, + "loss": 0.7007, + "step": 14562 + }, + { + "epoch": 2.5927706552706553, + "grad_norm": 0.8939194679260254, + "learning_rate": 5.523315851984758e-05, + "loss": 0.8597, + "step": 14563 + }, + { + "epoch": 2.592948717948718, + "grad_norm": 0.7597625851631165, + "learning_rate": 5.5220642423916035e-05, + "loss": 0.7122, + "step": 14564 + }, + { + "epoch": 2.593126780626781, + "grad_norm": 0.9511955976486206, + "learning_rate": 5.5208127205342983e-05, + "loss": 
1.0905, + "step": 14565 + }, + { + "epoch": 2.5933048433048436, + "grad_norm": 0.8359304070472717, + "learning_rate": 5.5195612864373626e-05, + "loss": 0.7132, + "step": 14566 + }, + { + "epoch": 2.593482905982906, + "grad_norm": 0.8302733302116394, + "learning_rate": 5.518309940125317e-05, + "loss": 0.9123, + "step": 14567 + }, + { + "epoch": 2.5936609686609686, + "grad_norm": 0.7923629283905029, + "learning_rate": 5.517058681622678e-05, + "loss": 0.8384, + "step": 14568 + }, + { + "epoch": 2.5938390313390314, + "grad_norm": 1.0625137090682983, + "learning_rate": 5.515807510953956e-05, + "loss": 1.0262, + "step": 14569 + }, + { + "epoch": 2.594017094017094, + "grad_norm": 1.0595879554748535, + "learning_rate": 5.5145564281436804e-05, + "loss": 0.9112, + "step": 14570 + }, + { + "epoch": 2.5941951566951564, + "grad_norm": 0.7307499647140503, + "learning_rate": 5.513305433216346e-05, + "loss": 0.8273, + "step": 14571 + }, + { + "epoch": 2.594373219373219, + "grad_norm": 0.9221912026405334, + "learning_rate": 5.512054526196475e-05, + "loss": 1.0679, + "step": 14572 + }, + { + "epoch": 2.594551282051282, + "grad_norm": 0.8098722100257874, + "learning_rate": 5.5108037071085725e-05, + "loss": 0.922, + "step": 14573 + }, + { + "epoch": 2.5947293447293447, + "grad_norm": 0.984785258769989, + "learning_rate": 5.509552975977146e-05, + "loss": 0.7525, + "step": 14574 + }, + { + "epoch": 2.5949074074074074, + "grad_norm": 0.8076850771903992, + "learning_rate": 5.5083023328267006e-05, + "loss": 1.008, + "step": 14575 + }, + { + "epoch": 2.59508547008547, + "grad_norm": 0.8375436067581177, + "learning_rate": 5.507051777681741e-05, + "loss": 0.8822, + "step": 14576 + }, + { + "epoch": 2.595263532763533, + "grad_norm": 0.779228687286377, + "learning_rate": 5.505801310566764e-05, + "loss": 0.8072, + "step": 14577 + }, + { + "epoch": 2.5954415954415957, + "grad_norm": 0.7347875833511353, + "learning_rate": 5.504550931506278e-05, + "loss": 0.7796, + "step": 14578 + }, + { + "epoch": 
2.595619658119658, + "grad_norm": 0.8229580521583557, + "learning_rate": 5.503300640524779e-05, + "loss": 0.9337, + "step": 14579 + }, + { + "epoch": 2.5957977207977208, + "grad_norm": 0.8643096089363098, + "learning_rate": 5.502050437646762e-05, + "loss": 0.9101, + "step": 14580 + }, + { + "epoch": 2.5959757834757835, + "grad_norm": 0.769158661365509, + "learning_rate": 5.500800322896723e-05, + "loss": 0.8417, + "step": 14581 + }, + { + "epoch": 2.5961538461538463, + "grad_norm": 0.7792086005210876, + "learning_rate": 5.4995502962991566e-05, + "loss": 0.6965, + "step": 14582 + }, + { + "epoch": 2.5963319088319086, + "grad_norm": 0.7833219170570374, + "learning_rate": 5.498300357878552e-05, + "loss": 0.641, + "step": 14583 + }, + { + "epoch": 2.5965099715099713, + "grad_norm": 0.9491978287696838, + "learning_rate": 5.4970505076593956e-05, + "loss": 0.9229, + "step": 14584 + }, + { + "epoch": 2.596688034188034, + "grad_norm": 0.9128090739250183, + "learning_rate": 5.495800745666191e-05, + "loss": 0.8047, + "step": 14585 + }, + { + "epoch": 2.596866096866097, + "grad_norm": 0.9235281944274902, + "learning_rate": 5.494551071923404e-05, + "loss": 0.961, + "step": 14586 + }, + { + "epoch": 2.5970441595441596, + "grad_norm": 0.8582631349563599, + "learning_rate": 5.493301486455536e-05, + "loss": 0.7203, + "step": 14587 + }, + { + "epoch": 2.5972222222222223, + "grad_norm": 0.9605505466461182, + "learning_rate": 5.4920519892870605e-05, + "loss": 0.8315, + "step": 14588 + }, + { + "epoch": 2.597400284900285, + "grad_norm": 0.9344304203987122, + "learning_rate": 5.490802580442462e-05, + "loss": 0.9031, + "step": 14589 + }, + { + "epoch": 2.597578347578348, + "grad_norm": 1.0027791261672974, + "learning_rate": 5.4895532599462216e-05, + "loss": 1.0361, + "step": 14590 + }, + { + "epoch": 2.59775641025641, + "grad_norm": 0.8774647116661072, + "learning_rate": 5.488304027822815e-05, + "loss": 0.9533, + "step": 14591 + }, + { + "epoch": 2.597934472934473, + "grad_norm": 
0.886246919631958, + "learning_rate": 5.487054884096718e-05, + "loss": 0.8588, + "step": 14592 + }, + { + "epoch": 2.5981125356125356, + "grad_norm": 0.8963425755500793, + "learning_rate": 5.485805828792408e-05, + "loss": 0.8685, + "step": 14593 + }, + { + "epoch": 2.5982905982905984, + "grad_norm": 0.7650768756866455, + "learning_rate": 5.484556861934349e-05, + "loss": 0.7441, + "step": 14594 + }, + { + "epoch": 2.5984686609686607, + "grad_norm": 0.8266916871070862, + "learning_rate": 5.483307983547026e-05, + "loss": 0.9625, + "step": 14595 + }, + { + "epoch": 2.5986467236467234, + "grad_norm": 0.8243923783302307, + "learning_rate": 5.482059193654894e-05, + "loss": 0.8553, + "step": 14596 + }, + { + "epoch": 2.598824786324786, + "grad_norm": 0.8200470209121704, + "learning_rate": 5.48081049228243e-05, + "loss": 0.6682, + "step": 14597 + }, + { + "epoch": 2.599002849002849, + "grad_norm": 0.8360442519187927, + "learning_rate": 5.479561879454097e-05, + "loss": 0.8996, + "step": 14598 + }, + { + "epoch": 2.5991809116809117, + "grad_norm": 0.8326625227928162, + "learning_rate": 5.4783133551943546e-05, + "loss": 0.6532, + "step": 14599 + }, + { + "epoch": 2.5993589743589745, + "grad_norm": 0.8162251114845276, + "learning_rate": 5.4770649195276766e-05, + "loss": 1.0514, + "step": 14600 + }, + { + "epoch": 2.599537037037037, + "grad_norm": 1.0407251119613647, + "learning_rate": 5.4758165724785084e-05, + "loss": 0.7991, + "step": 14601 + }, + { + "epoch": 2.5997150997151, + "grad_norm": 0.9161550998687744, + "learning_rate": 5.474568314071323e-05, + "loss": 0.8623, + "step": 14602 + }, + { + "epoch": 2.5998931623931623, + "grad_norm": 0.8405734896659851, + "learning_rate": 5.4733201443305646e-05, + "loss": 0.8406, + "step": 14603 + }, + { + "epoch": 2.600071225071225, + "grad_norm": 0.937198281288147, + "learning_rate": 5.472072063280698e-05, + "loss": 1.0887, + "step": 14604 + }, + { + "epoch": 2.6002492877492878, + "grad_norm": 0.8800520896911621, + "learning_rate": 
5.470824070946172e-05, + "loss": 0.8738, + "step": 14605 + }, + { + "epoch": 2.6004273504273505, + "grad_norm": 0.9473027586936951, + "learning_rate": 5.4695761673514425e-05, + "loss": 0.8188, + "step": 14606 + }, + { + "epoch": 2.6006054131054133, + "grad_norm": 0.8547683954238892, + "learning_rate": 5.468328352520955e-05, + "loss": 0.7619, + "step": 14607 + }, + { + "epoch": 2.6007834757834756, + "grad_norm": 1.0138040781021118, + "learning_rate": 5.4670806264791595e-05, + "loss": 1.0805, + "step": 14608 + }, + { + "epoch": 2.6009615384615383, + "grad_norm": 0.8458215594291687, + "learning_rate": 5.465832989250499e-05, + "loss": 0.8386, + "step": 14609 + }, + { + "epoch": 2.601139601139601, + "grad_norm": 0.811152458190918, + "learning_rate": 5.464585440859431e-05, + "loss": 0.8158, + "step": 14610 + }, + { + "epoch": 2.601317663817664, + "grad_norm": 0.9584031701087952, + "learning_rate": 5.463337981330381e-05, + "loss": 0.8537, + "step": 14611 + }, + { + "epoch": 2.6014957264957266, + "grad_norm": 0.8734773397445679, + "learning_rate": 5.462090610687802e-05, + "loss": 1.0246, + "step": 14612 + }, + { + "epoch": 2.6016737891737893, + "grad_norm": 0.8463562726974487, + "learning_rate": 5.460843328956133e-05, + "loss": 0.8763, + "step": 14613 + }, + { + "epoch": 2.601851851851852, + "grad_norm": 0.8010903000831604, + "learning_rate": 5.459596136159808e-05, + "loss": 0.8438, + "step": 14614 + }, + { + "epoch": 2.6020299145299144, + "grad_norm": 0.7927500009536743, + "learning_rate": 5.458349032323267e-05, + "loss": 0.7388, + "step": 14615 + }, + { + "epoch": 2.602207977207977, + "grad_norm": 0.784017026424408, + "learning_rate": 5.4571020174709407e-05, + "loss": 0.6981, + "step": 14616 + }, + { + "epoch": 2.60238603988604, + "grad_norm": 0.8732004761695862, + "learning_rate": 5.455855091627263e-05, + "loss": 1.043, + "step": 14617 + }, + { + "epoch": 2.6025641025641026, + "grad_norm": 0.7947654128074646, + "learning_rate": 5.454608254816662e-05, + "loss": 0.9487, + 
"step": 14618 + }, + { + "epoch": 2.6027421652421654, + "grad_norm": 0.8809077739715576, + "learning_rate": 5.4533615070635734e-05, + "loss": 0.8499, + "step": 14619 + }, + { + "epoch": 2.6029202279202277, + "grad_norm": 0.9094803333282471, + "learning_rate": 5.452114848392422e-05, + "loss": 0.9522, + "step": 14620 + }, + { + "epoch": 2.6030982905982905, + "grad_norm": 0.8943446278572083, + "learning_rate": 5.4508682788276324e-05, + "loss": 0.7328, + "step": 14621 + }, + { + "epoch": 2.603276353276353, + "grad_norm": 0.856849730014801, + "learning_rate": 5.449621798393628e-05, + "loss": 0.7536, + "step": 14622 + }, + { + "epoch": 2.603454415954416, + "grad_norm": 0.8199608325958252, + "learning_rate": 5.448375407114833e-05, + "loss": 0.6377, + "step": 14623 + }, + { + "epoch": 2.6036324786324787, + "grad_norm": 0.8981915712356567, + "learning_rate": 5.4471291050156626e-05, + "loss": 1.0372, + "step": 14624 + }, + { + "epoch": 2.6038105413105415, + "grad_norm": 0.8449446558952332, + "learning_rate": 5.4458828921205465e-05, + "loss": 0.9948, + "step": 14625 + }, + { + "epoch": 2.603988603988604, + "grad_norm": 0.8807474970817566, + "learning_rate": 5.444636768453888e-05, + "loss": 0.9752, + "step": 14626 + }, + { + "epoch": 2.6041666666666665, + "grad_norm": 0.8212316036224365, + "learning_rate": 5.443390734040117e-05, + "loss": 0.9221, + "step": 14627 + }, + { + "epoch": 2.6043447293447293, + "grad_norm": 0.8049453496932983, + "learning_rate": 5.4421447889036304e-05, + "loss": 0.7726, + "step": 14628 + }, + { + "epoch": 2.604522792022792, + "grad_norm": 0.8091840744018555, + "learning_rate": 5.440898933068853e-05, + "loss": 0.9152, + "step": 14629 + }, + { + "epoch": 2.6047008547008548, + "grad_norm": 0.8409022688865662, + "learning_rate": 5.43965316656019e-05, + "loss": 0.8672, + "step": 14630 + }, + { + "epoch": 2.6048789173789175, + "grad_norm": 0.7622308731079102, + "learning_rate": 5.4384074894020496e-05, + "loss": 0.9021, + "step": 14631 + }, + { + "epoch": 
2.60505698005698, + "grad_norm": 0.8272425532341003, + "learning_rate": 5.437161901618839e-05, + "loss": 0.7729, + "step": 14632 + }, + { + "epoch": 2.6052350427350426, + "grad_norm": 0.8699020743370056, + "learning_rate": 5.435916403234963e-05, + "loss": 0.8211, + "step": 14633 + }, + { + "epoch": 2.6054131054131053, + "grad_norm": 0.8145751357078552, + "learning_rate": 5.4346709942748196e-05, + "loss": 0.8996, + "step": 14634 + }, + { + "epoch": 2.605591168091168, + "grad_norm": 0.9398832321166992, + "learning_rate": 5.433425674762822e-05, + "loss": 0.8116, + "step": 14635 + }, + { + "epoch": 2.605769230769231, + "grad_norm": 0.9191767573356628, + "learning_rate": 5.4321804447233535e-05, + "loss": 0.8933, + "step": 14636 + }, + { + "epoch": 2.6059472934472936, + "grad_norm": 0.7511529326438904, + "learning_rate": 5.430935304180831e-05, + "loss": 0.7595, + "step": 14637 + }, + { + "epoch": 2.6061253561253563, + "grad_norm": 0.9087170362472534, + "learning_rate": 5.4296902531596296e-05, + "loss": 0.9781, + "step": 14638 + }, + { + "epoch": 2.6063034188034186, + "grad_norm": 0.8496448397636414, + "learning_rate": 5.4284452916841575e-05, + "loss": 0.9852, + "step": 14639 + }, + { + "epoch": 2.6064814814814814, + "grad_norm": 0.868609607219696, + "learning_rate": 5.427200419778804e-05, + "loss": 0.781, + "step": 14640 + }, + { + "epoch": 2.606659544159544, + "grad_norm": 0.7752132415771484, + "learning_rate": 5.4259556374679553e-05, + "loss": 0.7319, + "step": 14641 + }, + { + "epoch": 2.606837606837607, + "grad_norm": 0.8950543999671936, + "learning_rate": 5.4247109447760124e-05, + "loss": 0.7637, + "step": 14642 + }, + { + "epoch": 2.6070156695156697, + "grad_norm": 0.892699658870697, + "learning_rate": 5.423466341727346e-05, + "loss": 0.8274, + "step": 14643 + }, + { + "epoch": 2.607193732193732, + "grad_norm": 0.9283786416053772, + "learning_rate": 5.422221828346352e-05, + "loss": 1.1009, + "step": 14644 + }, + { + "epoch": 2.6073717948717947, + "grad_norm": 
0.7551446557044983, + "learning_rate": 5.420977404657413e-05, + "loss": 0.8105, + "step": 14645 + }, + { + "epoch": 2.6075498575498575, + "grad_norm": 0.8014101386070251, + "learning_rate": 5.41973307068491e-05, + "loss": 0.838, + "step": 14646 + }, + { + "epoch": 2.60772792022792, + "grad_norm": 0.8941731452941895, + "learning_rate": 5.418488826453223e-05, + "loss": 0.9557, + "step": 14647 + }, + { + "epoch": 2.607905982905983, + "grad_norm": 0.7990903258323669, + "learning_rate": 5.41724467198673e-05, + "loss": 0.9634, + "step": 14648 + }, + { + "epoch": 2.6080840455840457, + "grad_norm": 1.0688040256500244, + "learning_rate": 5.4160006073098035e-05, + "loss": 0.8976, + "step": 14649 + }, + { + "epoch": 2.6082621082621085, + "grad_norm": 0.8451266884803772, + "learning_rate": 5.4147566324468313e-05, + "loss": 0.8703, + "step": 14650 + }, + { + "epoch": 2.6084401709401708, + "grad_norm": 0.8196333050727844, + "learning_rate": 5.413512747422169e-05, + "loss": 0.9423, + "step": 14651 + }, + { + "epoch": 2.6086182336182335, + "grad_norm": 0.7639298439025879, + "learning_rate": 5.412268952260204e-05, + "loss": 0.9092, + "step": 14652 + }, + { + "epoch": 2.6087962962962963, + "grad_norm": 0.88963782787323, + "learning_rate": 5.411025246985293e-05, + "loss": 1.2503, + "step": 14653 + }, + { + "epoch": 2.608974358974359, + "grad_norm": 0.831516683101654, + "learning_rate": 5.409781631621812e-05, + "loss": 0.8643, + "step": 14654 + }, + { + "epoch": 2.609152421652422, + "grad_norm": 0.7729721069335938, + "learning_rate": 5.408538106194125e-05, + "loss": 0.8289, + "step": 14655 + }, + { + "epoch": 2.609330484330484, + "grad_norm": 0.8360101580619812, + "learning_rate": 5.407294670726596e-05, + "loss": 0.8619, + "step": 14656 + }, + { + "epoch": 2.609508547008547, + "grad_norm": 0.7525733709335327, + "learning_rate": 5.406051325243586e-05, + "loss": 0.8353, + "step": 14657 + }, + { + "epoch": 2.6096866096866096, + "grad_norm": 0.8943357467651367, + "learning_rate": 
5.404808069769456e-05, + "loss": 0.9291, + "step": 14658 + }, + { + "epoch": 2.6098646723646723, + "grad_norm": 1.024953007698059, + "learning_rate": 5.403564904328568e-05, + "loss": 1.0414, + "step": 14659 + }, + { + "epoch": 2.610042735042735, + "grad_norm": 0.8671780228614807, + "learning_rate": 5.402321828945278e-05, + "loss": 0.9309, + "step": 14660 + }, + { + "epoch": 2.610220797720798, + "grad_norm": 0.9765334725379944, + "learning_rate": 5.4010788436439406e-05, + "loss": 0.9399, + "step": 14661 + }, + { + "epoch": 2.6103988603988606, + "grad_norm": 0.8996732234954834, + "learning_rate": 5.3998359484489106e-05, + "loss": 0.9868, + "step": 14662 + }, + { + "epoch": 2.6105769230769234, + "grad_norm": 0.8597404956817627, + "learning_rate": 5.398593143384538e-05, + "loss": 1.0328, + "step": 14663 + }, + { + "epoch": 2.6107549857549857, + "grad_norm": 0.8909318447113037, + "learning_rate": 5.397350428475176e-05, + "loss": 0.9362, + "step": 14664 + }, + { + "epoch": 2.6109330484330484, + "grad_norm": 0.8874006867408752, + "learning_rate": 5.39610780374517e-05, + "loss": 0.9254, + "step": 14665 + }, + { + "epoch": 2.611111111111111, + "grad_norm": 0.8325822949409485, + "learning_rate": 5.3948652692188626e-05, + "loss": 0.8495, + "step": 14666 + }, + { + "epoch": 2.611289173789174, + "grad_norm": 0.847998857498169, + "learning_rate": 5.393622824920614e-05, + "loss": 0.8372, + "step": 14667 + }, + { + "epoch": 2.611467236467236, + "grad_norm": 0.8439756631851196, + "learning_rate": 5.392380470874749e-05, + "loss": 0.8934, + "step": 14668 + }, + { + "epoch": 2.611645299145299, + "grad_norm": 0.9563834071159363, + "learning_rate": 5.39113820710562e-05, + "loss": 1.1213, + "step": 14669 + }, + { + "epoch": 2.6118233618233617, + "grad_norm": 0.7761119604110718, + "learning_rate": 5.3898960336375646e-05, + "loss": 0.6104, + "step": 14670 + }, + { + "epoch": 2.6120014245014245, + "grad_norm": 0.8661524653434753, + "learning_rate": 5.38865395049492e-05, + "loss": 0.7562, + 
"step": 14671 + }, + { + "epoch": 2.6121794871794872, + "grad_norm": 0.854347825050354, + "learning_rate": 5.387411957702021e-05, + "loss": 0.8613, + "step": 14672 + }, + { + "epoch": 2.61235754985755, + "grad_norm": 0.7728402614593506, + "learning_rate": 5.386170055283204e-05, + "loss": 0.7879, + "step": 14673 + }, + { + "epoch": 2.6125356125356127, + "grad_norm": 0.8647109270095825, + "learning_rate": 5.384928243262799e-05, + "loss": 0.835, + "step": 14674 + }, + { + "epoch": 2.6127136752136755, + "grad_norm": 0.6764749884605408, + "learning_rate": 5.383686521665139e-05, + "loss": 0.7233, + "step": 14675 + }, + { + "epoch": 2.612891737891738, + "grad_norm": 0.8431640863418579, + "learning_rate": 5.382444890514548e-05, + "loss": 1.1699, + "step": 14676 + }, + { + "epoch": 2.6130698005698005, + "grad_norm": 0.9196193814277649, + "learning_rate": 5.381203349835364e-05, + "loss": 0.8668, + "step": 14677 + }, + { + "epoch": 2.6132478632478633, + "grad_norm": 0.9449048638343811, + "learning_rate": 5.3799618996519e-05, + "loss": 0.8353, + "step": 14678 + }, + { + "epoch": 2.613425925925926, + "grad_norm": 0.9835928678512573, + "learning_rate": 5.378720539988488e-05, + "loss": 1.1129, + "step": 14679 + }, + { + "epoch": 2.6136039886039883, + "grad_norm": 0.763592004776001, + "learning_rate": 5.377479270869448e-05, + "loss": 0.7929, + "step": 14680 + }, + { + "epoch": 2.613782051282051, + "grad_norm": 0.8119748830795288, + "learning_rate": 5.376238092319094e-05, + "loss": 1.0257, + "step": 14681 + }, + { + "epoch": 2.613960113960114, + "grad_norm": 0.7605236172676086, + "learning_rate": 5.374997004361757e-05, + "loss": 0.7005, + "step": 14682 + }, + { + "epoch": 2.6141381766381766, + "grad_norm": 0.9077369570732117, + "learning_rate": 5.3737560070217394e-05, + "loss": 0.9208, + "step": 14683 + }, + { + "epoch": 2.6143162393162394, + "grad_norm": 0.9089310765266418, + "learning_rate": 5.3725151003233665e-05, + "loss": 0.6855, + "step": 14684 + }, + { + "epoch": 
2.614494301994302, + "grad_norm": 0.8387685418128967, + "learning_rate": 5.371274284290947e-05, + "loss": 0.8682, + "step": 14685 + }, + { + "epoch": 2.614672364672365, + "grad_norm": 0.7626301050186157, + "learning_rate": 5.3700335589487925e-05, + "loss": 0.6928, + "step": 14686 + }, + { + "epoch": 2.6148504273504276, + "grad_norm": 1.2667319774627686, + "learning_rate": 5.368792924321213e-05, + "loss": 0.9288, + "step": 14687 + }, + { + "epoch": 2.61502849002849, + "grad_norm": 0.8570333123207092, + "learning_rate": 5.3675523804325154e-05, + "loss": 0.9916, + "step": 14688 + }, + { + "epoch": 2.6152065527065527, + "grad_norm": 0.9050240516662598, + "learning_rate": 5.366311927307006e-05, + "loss": 0.7734, + "step": 14689 + }, + { + "epoch": 2.6153846153846154, + "grad_norm": 1.000036358833313, + "learning_rate": 5.365071564968989e-05, + "loss": 0.7932, + "step": 14690 + }, + { + "epoch": 2.615562678062678, + "grad_norm": 0.8147441744804382, + "learning_rate": 5.363831293442763e-05, + "loss": 0.8867, + "step": 14691 + }, + { + "epoch": 2.6157407407407405, + "grad_norm": 0.8662015795707703, + "learning_rate": 5.3625911127526375e-05, + "loss": 0.6742, + "step": 14692 + }, + { + "epoch": 2.6159188034188032, + "grad_norm": 0.8576271533966064, + "learning_rate": 5.3613510229229e-05, + "loss": 0.8161, + "step": 14693 + }, + { + "epoch": 2.616096866096866, + "grad_norm": 0.8862481713294983, + "learning_rate": 5.360111023977856e-05, + "loss": 0.8774, + "step": 14694 + }, + { + "epoch": 2.6162749287749287, + "grad_norm": 0.8384450674057007, + "learning_rate": 5.358871115941799e-05, + "loss": 0.9149, + "step": 14695 + }, + { + "epoch": 2.6164529914529915, + "grad_norm": 0.9055412411689758, + "learning_rate": 5.357631298839021e-05, + "loss": 0.8197, + "step": 14696 + }, + { + "epoch": 2.6166310541310542, + "grad_norm": 0.937764585018158, + "learning_rate": 5.356391572693813e-05, + "loss": 1.0392, + "step": 14697 + }, + { + "epoch": 2.616809116809117, + "grad_norm": 
0.8917306661605835, + "learning_rate": 5.355151937530463e-05, + "loss": 0.868, + "step": 14698 + }, + { + "epoch": 2.6169871794871797, + "grad_norm": 0.7353024482727051, + "learning_rate": 5.3539123933732705e-05, + "loss": 0.7788, + "step": 14699 + }, + { + "epoch": 2.617165242165242, + "grad_norm": 0.8607454299926758, + "learning_rate": 5.352672940246504e-05, + "loss": 0.7746, + "step": 14700 + }, + { + "epoch": 2.617343304843305, + "grad_norm": 0.9775658249855042, + "learning_rate": 5.3514335781744616e-05, + "loss": 0.9438, + "step": 14701 + }, + { + "epoch": 2.6175213675213675, + "grad_norm": 0.9416237473487854, + "learning_rate": 5.350194307181422e-05, + "loss": 0.9581, + "step": 14702 + }, + { + "epoch": 2.6176994301994303, + "grad_norm": 0.8378105163574219, + "learning_rate": 5.348955127291666e-05, + "loss": 1.0038, + "step": 14703 + }, + { + "epoch": 2.6178774928774926, + "grad_norm": 0.8199161887168884, + "learning_rate": 5.347716038529471e-05, + "loss": 0.9492, + "step": 14704 + }, + { + "epoch": 2.6180555555555554, + "grad_norm": 0.9511042833328247, + "learning_rate": 5.3464770409191176e-05, + "loss": 1.2101, + "step": 14705 + }, + { + "epoch": 2.618233618233618, + "grad_norm": 0.8017105460166931, + "learning_rate": 5.3452381344848754e-05, + "loss": 0.9524, + "step": 14706 + }, + { + "epoch": 2.618411680911681, + "grad_norm": 0.8174898624420166, + "learning_rate": 5.34399931925103e-05, + "loss": 0.911, + "step": 14707 + }, + { + "epoch": 2.6185897435897436, + "grad_norm": 0.8134239315986633, + "learning_rate": 5.342760595241838e-05, + "loss": 0.8971, + "step": 14708 + }, + { + "epoch": 2.6187678062678064, + "grad_norm": 0.817252516746521, + "learning_rate": 5.341521962481586e-05, + "loss": 0.8472, + "step": 14709 + }, + { + "epoch": 2.618945868945869, + "grad_norm": 0.8675270080566406, + "learning_rate": 5.3402834209945264e-05, + "loss": 0.9607, + "step": 14710 + }, + { + "epoch": 2.619123931623932, + "grad_norm": 1.0281410217285156, + "learning_rate": 
5.339044970804936e-05, + "loss": 1.0487, + "step": 14711 + }, + { + "epoch": 2.619301994301994, + "grad_norm": 0.9276307225227356, + "learning_rate": 5.33780661193708e-05, + "loss": 0.8915, + "step": 14712 + }, + { + "epoch": 2.619480056980057, + "grad_norm": 0.8479217290878296, + "learning_rate": 5.336568344415216e-05, + "loss": 0.929, + "step": 14713 + }, + { + "epoch": 2.6196581196581197, + "grad_norm": 0.8695724010467529, + "learning_rate": 5.335330168263608e-05, + "loss": 0.8651, + "step": 14714 + }, + { + "epoch": 2.6198361823361824, + "grad_norm": 0.7740936875343323, + "learning_rate": 5.3340920835065155e-05, + "loss": 0.8572, + "step": 14715 + }, + { + "epoch": 2.620014245014245, + "grad_norm": 0.8619815111160278, + "learning_rate": 5.332854090168192e-05, + "loss": 0.6934, + "step": 14716 + }, + { + "epoch": 2.6201923076923075, + "grad_norm": 0.8866271376609802, + "learning_rate": 5.331616188272902e-05, + "loss": 1.038, + "step": 14717 + }, + { + "epoch": 2.6203703703703702, + "grad_norm": 0.7526047825813293, + "learning_rate": 5.330378377844896e-05, + "loss": 0.8534, + "step": 14718 + }, + { + "epoch": 2.620548433048433, + "grad_norm": 0.6914070248603821, + "learning_rate": 5.329140658908423e-05, + "loss": 0.5355, + "step": 14719 + }, + { + "epoch": 2.6207264957264957, + "grad_norm": 0.886074423789978, + "learning_rate": 5.3279030314877374e-05, + "loss": 0.8277, + "step": 14720 + }, + { + "epoch": 2.6209045584045585, + "grad_norm": 0.9101460576057434, + "learning_rate": 5.326665495607082e-05, + "loss": 0.8711, + "step": 14721 + }, + { + "epoch": 2.6210826210826212, + "grad_norm": 0.9744461178779602, + "learning_rate": 5.3254280512907175e-05, + "loss": 1.2376, + "step": 14722 + }, + { + "epoch": 2.621260683760684, + "grad_norm": 1.013480544090271, + "learning_rate": 5.32419069856287e-05, + "loss": 0.8946, + "step": 14723 + }, + { + "epoch": 2.6214387464387463, + "grad_norm": 0.82442706823349, + "learning_rate": 5.3229534374478005e-05, + "loss": 0.732, + 
"step": 14724 + }, + { + "epoch": 2.621616809116809, + "grad_norm": 0.7960239052772522, + "learning_rate": 5.3217162679697366e-05, + "loss": 0.7633, + "step": 14725 + }, + { + "epoch": 2.621794871794872, + "grad_norm": 0.819844126701355, + "learning_rate": 5.320479190152926e-05, + "loss": 0.974, + "step": 14726 + }, + { + "epoch": 2.6219729344729346, + "grad_norm": 0.8245221376419067, + "learning_rate": 5.319242204021606e-05, + "loss": 0.9122, + "step": 14727 + }, + { + "epoch": 2.6221509971509973, + "grad_norm": 0.7574561834335327, + "learning_rate": 5.318005309600011e-05, + "loss": 0.8427, + "step": 14728 + }, + { + "epoch": 2.6223290598290596, + "grad_norm": 1.0385704040527344, + "learning_rate": 5.316768506912377e-05, + "loss": 0.8214, + "step": 14729 + }, + { + "epoch": 2.6225071225071224, + "grad_norm": 0.8616722822189331, + "learning_rate": 5.3155317959829346e-05, + "loss": 0.8469, + "step": 14730 + }, + { + "epoch": 2.622685185185185, + "grad_norm": 0.909667432308197, + "learning_rate": 5.314295176835912e-05, + "loss": 0.9156, + "step": 14731 + }, + { + "epoch": 2.622863247863248, + "grad_norm": 0.9016293883323669, + "learning_rate": 5.3130586494955494e-05, + "loss": 0.9183, + "step": 14732 + }, + { + "epoch": 2.6230413105413106, + "grad_norm": 0.8828284740447998, + "learning_rate": 5.311822213986057e-05, + "loss": 0.8338, + "step": 14733 + }, + { + "epoch": 2.6232193732193734, + "grad_norm": 0.8159047365188599, + "learning_rate": 5.3105858703316794e-05, + "loss": 0.7055, + "step": 14734 + }, + { + "epoch": 2.623397435897436, + "grad_norm": 0.9240905046463013, + "learning_rate": 5.309349618556623e-05, + "loss": 0.9078, + "step": 14735 + }, + { + "epoch": 2.6235754985754984, + "grad_norm": 0.8881595134735107, + "learning_rate": 5.308113458685118e-05, + "loss": 0.9946, + "step": 14736 + }, + { + "epoch": 2.623753561253561, + "grad_norm": 0.8781841397285461, + "learning_rate": 5.306877390741385e-05, + "loss": 0.8252, + "step": 14737 + }, + { + "epoch": 
2.623931623931624, + "grad_norm": 0.8348106741905212, + "learning_rate": 5.3056414147496355e-05, + "loss": 0.8653, + "step": 14738 + }, + { + "epoch": 2.6241096866096867, + "grad_norm": 0.9692304134368896, + "learning_rate": 5.3044055307341e-05, + "loss": 0.7814, + "step": 14739 + }, + { + "epoch": 2.6242877492877494, + "grad_norm": 0.866179347038269, + "learning_rate": 5.303169738718976e-05, + "loss": 0.9255, + "step": 14740 + }, + { + "epoch": 2.6244658119658117, + "grad_norm": 0.9306690692901611, + "learning_rate": 5.301934038728487e-05, + "loss": 0.9123, + "step": 14741 + }, + { + "epoch": 2.6246438746438745, + "grad_norm": 0.949357271194458, + "learning_rate": 5.3006984307868415e-05, + "loss": 0.8452, + "step": 14742 + }, + { + "epoch": 2.6248219373219372, + "grad_norm": 0.8638128042221069, + "learning_rate": 5.299462914918249e-05, + "loss": 0.8026, + "step": 14743 + }, + { + "epoch": 2.625, + "grad_norm": 0.9075117707252502, + "learning_rate": 5.2982274911469154e-05, + "loss": 1.0644, + "step": 14744 + }, + { + "epoch": 2.6251780626780628, + "grad_norm": 0.8146225810050964, + "learning_rate": 5.296992159497047e-05, + "loss": 0.8494, + "step": 14745 + }, + { + "epoch": 2.6253561253561255, + "grad_norm": 0.8887025713920593, + "learning_rate": 5.295756919992847e-05, + "loss": 0.8143, + "step": 14746 + }, + { + "epoch": 2.6255341880341883, + "grad_norm": 0.8262654542922974, + "learning_rate": 5.29452177265852e-05, + "loss": 0.7559, + "step": 14747 + }, + { + "epoch": 2.6257122507122506, + "grad_norm": 0.8126912117004395, + "learning_rate": 5.2932867175182574e-05, + "loss": 0.8528, + "step": 14748 + }, + { + "epoch": 2.6258903133903133, + "grad_norm": 0.8970595598220825, + "learning_rate": 5.2920517545962746e-05, + "loss": 0.8584, + "step": 14749 + }, + { + "epoch": 2.626068376068376, + "grad_norm": 0.8678651452064514, + "learning_rate": 5.290816883916748e-05, + "loss": 0.8686, + "step": 14750 + }, + { + "epoch": 2.626246438746439, + "grad_norm": 
0.8069576621055603, + "learning_rate": 5.289582105503887e-05, + "loss": 0.868, + "step": 14751 + }, + { + "epoch": 2.6264245014245016, + "grad_norm": 1.0322144031524658, + "learning_rate": 5.28834741938188e-05, + "loss": 1.1537, + "step": 14752 + }, + { + "epoch": 2.626602564102564, + "grad_norm": 0.8274349570274353, + "learning_rate": 5.287112825574917e-05, + "loss": 1.0126, + "step": 14753 + }, + { + "epoch": 2.6267806267806266, + "grad_norm": 0.8820709586143494, + "learning_rate": 5.2858783241071875e-05, + "loss": 0.893, + "step": 14754 + }, + { + "epoch": 2.6269586894586894, + "grad_norm": 1.0102146863937378, + "learning_rate": 5.28464391500288e-05, + "loss": 0.8524, + "step": 14755 + }, + { + "epoch": 2.627136752136752, + "grad_norm": 0.875468373298645, + "learning_rate": 5.2834095982861764e-05, + "loss": 1.0991, + "step": 14756 + }, + { + "epoch": 2.627314814814815, + "grad_norm": 0.8155242800712585, + "learning_rate": 5.282175373981267e-05, + "loss": 0.666, + "step": 14757 + }, + { + "epoch": 2.6274928774928776, + "grad_norm": 0.8777057528495789, + "learning_rate": 5.280941242112332e-05, + "loss": 0.892, + "step": 14758 + }, + { + "epoch": 2.6276709401709404, + "grad_norm": 0.8357667922973633, + "learning_rate": 5.279707202703549e-05, + "loss": 0.8118, + "step": 14759 + }, + { + "epoch": 2.6278490028490027, + "grad_norm": 0.7862337827682495, + "learning_rate": 5.278473255779097e-05, + "loss": 0.7287, + "step": 14760 + }, + { + "epoch": 2.6280270655270654, + "grad_norm": 0.8340336084365845, + "learning_rate": 5.277239401363155e-05, + "loss": 0.7697, + "step": 14761 + }, + { + "epoch": 2.628205128205128, + "grad_norm": 0.7986457943916321, + "learning_rate": 5.276005639479896e-05, + "loss": 0.9358, + "step": 14762 + }, + { + "epoch": 2.628383190883191, + "grad_norm": 0.7377769947052002, + "learning_rate": 5.2747719701534895e-05, + "loss": 0.8091, + "step": 14763 + }, + { + "epoch": 2.6285612535612537, + "grad_norm": 0.9749723672866821, + "learning_rate": 
5.273538393408117e-05, + "loss": 0.8163, + "step": 14764 + }, + { + "epoch": 2.628739316239316, + "grad_norm": 0.8718321323394775, + "learning_rate": 5.2723049092679354e-05, + "loss": 1.1587, + "step": 14765 + }, + { + "epoch": 2.6289173789173788, + "grad_norm": 0.9394767880439758, + "learning_rate": 5.27107151775712e-05, + "loss": 0.9409, + "step": 14766 + }, + { + "epoch": 2.6290954415954415, + "grad_norm": 0.9763813614845276, + "learning_rate": 5.269838218899836e-05, + "loss": 1.0171, + "step": 14767 + }, + { + "epoch": 2.6292735042735043, + "grad_norm": 0.878968358039856, + "learning_rate": 5.268605012720247e-05, + "loss": 0.9117, + "step": 14768 + }, + { + "epoch": 2.629451566951567, + "grad_norm": 0.8240547776222229, + "learning_rate": 5.267371899242512e-05, + "loss": 0.9351, + "step": 14769 + }, + { + "epoch": 2.6296296296296298, + "grad_norm": 0.8048275709152222, + "learning_rate": 5.266138878490795e-05, + "loss": 0.9331, + "step": 14770 + }, + { + "epoch": 2.6298076923076925, + "grad_norm": 0.7176041007041931, + "learning_rate": 5.264905950489252e-05, + "loss": 0.6424, + "step": 14771 + }, + { + "epoch": 2.629985754985755, + "grad_norm": 0.973258912563324, + "learning_rate": 5.263673115262041e-05, + "loss": 0.9295, + "step": 14772 + }, + { + "epoch": 2.6301638176638176, + "grad_norm": 0.8955824971199036, + "learning_rate": 5.262440372833313e-05, + "loss": 0.9306, + "step": 14773 + }, + { + "epoch": 2.6303418803418803, + "grad_norm": 0.8430632948875427, + "learning_rate": 5.2612077232272305e-05, + "loss": 0.9343, + "step": 14774 + }, + { + "epoch": 2.630519943019943, + "grad_norm": 1.0231794118881226, + "learning_rate": 5.2599751664679334e-05, + "loss": 0.941, + "step": 14775 + }, + { + "epoch": 2.630698005698006, + "grad_norm": 0.9726024866104126, + "learning_rate": 5.258742702579579e-05, + "loss": 1.1726, + "step": 14776 + }, + { + "epoch": 2.630876068376068, + "grad_norm": 0.8575723171234131, + "learning_rate": 5.257510331586312e-05, + "loss": 0.5644, + 
"step": 14777 + }, + { + "epoch": 2.631054131054131, + "grad_norm": 0.853165864944458, + "learning_rate": 5.2562780535122744e-05, + "loss": 0.8555, + "step": 14778 + }, + { + "epoch": 2.6312321937321936, + "grad_norm": 0.861574649810791, + "learning_rate": 5.255045868381623e-05, + "loss": 0.8298, + "step": 14779 + }, + { + "epoch": 2.6314102564102564, + "grad_norm": 0.8744526505470276, + "learning_rate": 5.2538137762184816e-05, + "loss": 0.9889, + "step": 14780 + }, + { + "epoch": 2.631588319088319, + "grad_norm": 0.7891412973403931, + "learning_rate": 5.2525817770470084e-05, + "loss": 0.9765, + "step": 14781 + }, + { + "epoch": 2.631766381766382, + "grad_norm": 0.9155156016349792, + "learning_rate": 5.251349870891327e-05, + "loss": 0.8927, + "step": 14782 + }, + { + "epoch": 2.6319444444444446, + "grad_norm": 0.8547508120536804, + "learning_rate": 5.250118057775582e-05, + "loss": 0.8479, + "step": 14783 + }, + { + "epoch": 2.6321225071225074, + "grad_norm": 0.7606263756752014, + "learning_rate": 5.248886337723908e-05, + "loss": 0.7557, + "step": 14784 + }, + { + "epoch": 2.6323005698005697, + "grad_norm": 0.855315625667572, + "learning_rate": 5.247654710760437e-05, + "loss": 0.8527, + "step": 14785 + }, + { + "epoch": 2.6324786324786325, + "grad_norm": 0.7656288743019104, + "learning_rate": 5.246423176909298e-05, + "loss": 0.8881, + "step": 14786 + }, + { + "epoch": 2.632656695156695, + "grad_norm": 0.817034125328064, + "learning_rate": 5.2451917361946236e-05, + "loss": 1.042, + "step": 14787 + }, + { + "epoch": 2.632834757834758, + "grad_norm": 0.8473303318023682, + "learning_rate": 5.2439603886405356e-05, + "loss": 0.8804, + "step": 14788 + }, + { + "epoch": 2.6330128205128203, + "grad_norm": 0.9563126564025879, + "learning_rate": 5.242729134271171e-05, + "loss": 0.8463, + "step": 14789 + }, + { + "epoch": 2.633190883190883, + "grad_norm": 0.8297066688537598, + "learning_rate": 5.241497973110641e-05, + "loss": 0.7776, + "step": 14790 + }, + { + "epoch": 
2.6333689458689458, + "grad_norm": 0.8433563709259033, + "learning_rate": 5.240266905183075e-05, + "loss": 0.8712, + "step": 14791 + }, + { + "epoch": 2.6335470085470085, + "grad_norm": 0.814725935459137, + "learning_rate": 5.239035930512593e-05, + "loss": 0.9819, + "step": 14792 + }, + { + "epoch": 2.6337250712250713, + "grad_norm": 0.844292163848877, + "learning_rate": 5.23780504912331e-05, + "loss": 0.8693, + "step": 14793 + }, + { + "epoch": 2.633903133903134, + "grad_norm": 0.8194862008094788, + "learning_rate": 5.2365742610393464e-05, + "loss": 0.7878, + "step": 14794 + }, + { + "epoch": 2.6340811965811968, + "grad_norm": 0.8570502400398254, + "learning_rate": 5.2353435662848135e-05, + "loss": 0.815, + "step": 14795 + }, + { + "epoch": 2.6342592592592595, + "grad_norm": 0.9301772713661194, + "learning_rate": 5.2341129648838275e-05, + "loss": 0.9092, + "step": 14796 + }, + { + "epoch": 2.634437321937322, + "grad_norm": 0.7605858445167542, + "learning_rate": 5.232882456860493e-05, + "loss": 0.8753, + "step": 14797 + }, + { + "epoch": 2.6346153846153846, + "grad_norm": 0.8265452980995178, + "learning_rate": 5.231652042238927e-05, + "loss": 0.9134, + "step": 14798 + }, + { + "epoch": 2.6347934472934473, + "grad_norm": 0.7440468072891235, + "learning_rate": 5.230421721043235e-05, + "loss": 0.7471, + "step": 14799 + }, + { + "epoch": 2.63497150997151, + "grad_norm": 0.9172230958938599, + "learning_rate": 5.2291914932975205e-05, + "loss": 1.0155, + "step": 14800 + }, + { + "epoch": 2.6351495726495724, + "grad_norm": 0.8364499807357788, + "learning_rate": 5.227961359025888e-05, + "loss": 0.9561, + "step": 14801 + }, + { + "epoch": 2.635327635327635, + "grad_norm": 0.7756382822990417, + "learning_rate": 5.22673131825244e-05, + "loss": 0.6893, + "step": 14802 + }, + { + "epoch": 2.635505698005698, + "grad_norm": 0.9042136669158936, + "learning_rate": 5.225501371001273e-05, + "loss": 0.7613, + "step": 14803 + }, + { + "epoch": 2.6356837606837606, + "grad_norm": 
0.8989379405975342, + "learning_rate": 5.224271517296495e-05, + "loss": 0.8092, + "step": 14804 + }, + { + "epoch": 2.6358618233618234, + "grad_norm": 0.7999827265739441, + "learning_rate": 5.2230417571621906e-05, + "loss": 0.8115, + "step": 14805 + }, + { + "epoch": 2.636039886039886, + "grad_norm": 0.9071131348609924, + "learning_rate": 5.221812090622464e-05, + "loss": 0.9072, + "step": 14806 + }, + { + "epoch": 2.636217948717949, + "grad_norm": 0.7227704524993896, + "learning_rate": 5.220582517701398e-05, + "loss": 0.7598, + "step": 14807 + }, + { + "epoch": 2.6363960113960117, + "grad_norm": 0.8520537614822388, + "learning_rate": 5.219353038423094e-05, + "loss": 1.1072, + "step": 14808 + }, + { + "epoch": 2.636574074074074, + "grad_norm": 0.8690574765205383, + "learning_rate": 5.218123652811634e-05, + "loss": 0.773, + "step": 14809 + }, + { + "epoch": 2.6367521367521367, + "grad_norm": 0.7897602319717407, + "learning_rate": 5.216894360891109e-05, + "loss": 0.792, + "step": 14810 + }, + { + "epoch": 2.6369301994301995, + "grad_norm": 0.8746532201766968, + "learning_rate": 5.215665162685601e-05, + "loss": 0.8853, + "step": 14811 + }, + { + "epoch": 2.637108262108262, + "grad_norm": 0.8525128364562988, + "learning_rate": 5.214436058219199e-05, + "loss": 0.7293, + "step": 14812 + }, + { + "epoch": 2.6372863247863245, + "grad_norm": 0.979969322681427, + "learning_rate": 5.213207047515975e-05, + "loss": 0.8485, + "step": 14813 + }, + { + "epoch": 2.6374643874643873, + "grad_norm": 0.8439529538154602, + "learning_rate": 5.211978130600024e-05, + "loss": 0.7492, + "step": 14814 + }, + { + "epoch": 2.63764245014245, + "grad_norm": 0.8356610536575317, + "learning_rate": 5.2107493074954064e-05, + "loss": 0.8255, + "step": 14815 + }, + { + "epoch": 2.6378205128205128, + "grad_norm": 0.7857736349105835, + "learning_rate": 5.2095205782262116e-05, + "loss": 0.766, + "step": 14816 + }, + { + "epoch": 2.6379985754985755, + "grad_norm": 0.919058084487915, + "learning_rate": 
5.20829194281651e-05, + "loss": 1.0661, + "step": 14817 + }, + { + "epoch": 2.6381766381766383, + "grad_norm": 0.8793047070503235, + "learning_rate": 5.207063401290373e-05, + "loss": 0.8297, + "step": 14818 + }, + { + "epoch": 2.638354700854701, + "grad_norm": 0.7848390340805054, + "learning_rate": 5.205834953671873e-05, + "loss": 0.8051, + "step": 14819 + }, + { + "epoch": 2.638532763532764, + "grad_norm": 0.8391907215118408, + "learning_rate": 5.2046065999850736e-05, + "loss": 0.8444, + "step": 14820 + }, + { + "epoch": 2.638710826210826, + "grad_norm": 0.8137226700782776, + "learning_rate": 5.2033783402540546e-05, + "loss": 0.7908, + "step": 14821 + }, + { + "epoch": 2.638888888888889, + "grad_norm": 0.8440108299255371, + "learning_rate": 5.2021501745028645e-05, + "loss": 0.7985, + "step": 14822 + }, + { + "epoch": 2.6390669515669516, + "grad_norm": 0.7432600855827332, + "learning_rate": 5.200922102755581e-05, + "loss": 0.7816, + "step": 14823 + }, + { + "epoch": 2.6392450142450143, + "grad_norm": 0.9003379344940186, + "learning_rate": 5.199694125036257e-05, + "loss": 0.9171, + "step": 14824 + }, + { + "epoch": 2.6394230769230766, + "grad_norm": 0.8994988203048706, + "learning_rate": 5.198466241368957e-05, + "loss": 0.8333, + "step": 14825 + }, + { + "epoch": 2.6396011396011394, + "grad_norm": 0.9042859077453613, + "learning_rate": 5.197238451777735e-05, + "loss": 0.7491, + "step": 14826 + }, + { + "epoch": 2.639779202279202, + "grad_norm": 0.8024145364761353, + "learning_rate": 5.196010756286649e-05, + "loss": 0.882, + "step": 14827 + }, + { + "epoch": 2.639957264957265, + "grad_norm": 0.73011714220047, + "learning_rate": 5.1947831549197504e-05, + "loss": 0.865, + "step": 14828 + }, + { + "epoch": 2.6401353276353277, + "grad_norm": 0.845160186290741, + "learning_rate": 5.1935556477011006e-05, + "loss": 0.8912, + "step": 14829 + }, + { + "epoch": 2.6403133903133904, + "grad_norm": 0.8264908194541931, + "learning_rate": 5.192328234654735e-05, + "loss": 0.8597, + 
"step": 14830 + }, + { + "epoch": 2.640491452991453, + "grad_norm": 0.8400609493255615, + "learning_rate": 5.191100915804718e-05, + "loss": 0.5906, + "step": 14831 + }, + { + "epoch": 2.640669515669516, + "grad_norm": 0.8633815050125122, + "learning_rate": 5.189873691175082e-05, + "loss": 1.031, + "step": 14832 + }, + { + "epoch": 2.640847578347578, + "grad_norm": 0.9047896862030029, + "learning_rate": 5.188646560789884e-05, + "loss": 0.6929, + "step": 14833 + }, + { + "epoch": 2.641025641025641, + "grad_norm": 0.8293144106864929, + "learning_rate": 5.18741952467316e-05, + "loss": 0.9608, + "step": 14834 + }, + { + "epoch": 2.6412037037037037, + "grad_norm": 0.7980968356132507, + "learning_rate": 5.186192582848955e-05, + "loss": 0.6021, + "step": 14835 + }, + { + "epoch": 2.6413817663817665, + "grad_norm": 0.7945372462272644, + "learning_rate": 5.184965735341305e-05, + "loss": 0.7069, + "step": 14836 + }, + { + "epoch": 2.6415598290598292, + "grad_norm": 0.8388827443122864, + "learning_rate": 5.183738982174246e-05, + "loss": 1.2404, + "step": 14837 + }, + { + "epoch": 2.6417378917378915, + "grad_norm": 0.8332177400588989, + "learning_rate": 5.18251232337182e-05, + "loss": 0.9353, + "step": 14838 + }, + { + "epoch": 2.6419159544159543, + "grad_norm": 0.9658130407333374, + "learning_rate": 5.1812857589580565e-05, + "loss": 0.8, + "step": 14839 + }, + { + "epoch": 2.642094017094017, + "grad_norm": 0.9074252247810364, + "learning_rate": 5.180059288956991e-05, + "loss": 0.7567, + "step": 14840 + }, + { + "epoch": 2.64227207977208, + "grad_norm": 0.8543582558631897, + "learning_rate": 5.178832913392649e-05, + "loss": 0.9754, + "step": 14841 + }, + { + "epoch": 2.6424501424501425, + "grad_norm": 0.8235877156257629, + "learning_rate": 5.177606632289063e-05, + "loss": 0.825, + "step": 14842 + }, + { + "epoch": 2.6426282051282053, + "grad_norm": 0.8550012111663818, + "learning_rate": 5.1763804456702545e-05, + "loss": 1.0286, + "step": 14843 + }, + { + "epoch": 
2.642806267806268, + "grad_norm": 0.8879600763320923, + "learning_rate": 5.175154353560254e-05, + "loss": 0.8935, + "step": 14844 + }, + { + "epoch": 2.6429843304843303, + "grad_norm": 0.8822683095932007, + "learning_rate": 5.1739283559830754e-05, + "loss": 0.8659, + "step": 14845 + }, + { + "epoch": 2.643162393162393, + "grad_norm": 1.0260087251663208, + "learning_rate": 5.1727024529627544e-05, + "loss": 0.8952, + "step": 14846 + }, + { + "epoch": 2.643340455840456, + "grad_norm": 0.8105470538139343, + "learning_rate": 5.171476644523292e-05, + "loss": 0.7987, + "step": 14847 + }, + { + "epoch": 2.6435185185185186, + "grad_norm": 0.8861166834831238, + "learning_rate": 5.170250930688719e-05, + "loss": 0.8476, + "step": 14848 + }, + { + "epoch": 2.6436965811965814, + "grad_norm": 0.8035899996757507, + "learning_rate": 5.169025311483047e-05, + "loss": 0.7366, + "step": 14849 + }, + { + "epoch": 2.6438746438746437, + "grad_norm": 0.8359752297401428, + "learning_rate": 5.1677997869302874e-05, + "loss": 0.8931, + "step": 14850 + }, + { + "epoch": 2.6440527065527064, + "grad_norm": 0.8483668565750122, + "learning_rate": 5.166574357054452e-05, + "loss": 0.7662, + "step": 14851 + }, + { + "epoch": 2.644230769230769, + "grad_norm": 0.9865937829017639, + "learning_rate": 5.165349021879553e-05, + "loss": 1.101, + "step": 14852 + }, + { + "epoch": 2.644408831908832, + "grad_norm": 0.8491073250770569, + "learning_rate": 5.164123781429596e-05, + "loss": 0.9576, + "step": 14853 + }, + { + "epoch": 2.6445868945868947, + "grad_norm": 0.8185597062110901, + "learning_rate": 5.162898635728588e-05, + "loss": 0.6353, + "step": 14854 + }, + { + "epoch": 2.6447649572649574, + "grad_norm": 0.8583887815475464, + "learning_rate": 5.1616735848005306e-05, + "loss": 0.8715, + "step": 14855 + }, + { + "epoch": 2.64494301994302, + "grad_norm": 0.8107531666755676, + "learning_rate": 5.16044862866943e-05, + "loss": 0.7111, + "step": 14856 + }, + { + "epoch": 2.6451210826210825, + "grad_norm": 
0.7675925493240356, + "learning_rate": 5.1592237673592867e-05, + "loss": 0.8145, + "step": 14857 + }, + { + "epoch": 2.6452991452991452, + "grad_norm": 0.9418326020240784, + "learning_rate": 5.157999000894098e-05, + "loss": 0.8454, + "step": 14858 + }, + { + "epoch": 2.645477207977208, + "grad_norm": 0.8420053720474243, + "learning_rate": 5.15677432929786e-05, + "loss": 0.7343, + "step": 14859 + }, + { + "epoch": 2.6456552706552707, + "grad_norm": 0.9815202951431274, + "learning_rate": 5.155549752594564e-05, + "loss": 0.9252, + "step": 14860 + }, + { + "epoch": 2.6458333333333335, + "grad_norm": 0.8282185792922974, + "learning_rate": 5.1543252708082146e-05, + "loss": 0.9935, + "step": 14861 + }, + { + "epoch": 2.646011396011396, + "grad_norm": 0.7398781180381775, + "learning_rate": 5.153100883962788e-05, + "loss": 0.5024, + "step": 14862 + }, + { + "epoch": 2.6461894586894585, + "grad_norm": 1.0273998975753784, + "learning_rate": 5.1518765920822856e-05, + "loss": 0.9023, + "step": 14863 + }, + { + "epoch": 2.6463675213675213, + "grad_norm": 0.8017948269844055, + "learning_rate": 5.150652395190689e-05, + "loss": 0.6755, + "step": 14864 + }, + { + "epoch": 2.646545584045584, + "grad_norm": 0.7470258474349976, + "learning_rate": 5.1494282933119864e-05, + "loss": 0.5408, + "step": 14865 + }, + { + "epoch": 2.646723646723647, + "grad_norm": 0.8118627071380615, + "learning_rate": 5.1482042864701595e-05, + "loss": 0.8032, + "step": 14866 + }, + { + "epoch": 2.6469017094017095, + "grad_norm": 0.8302956223487854, + "learning_rate": 5.146980374689192e-05, + "loss": 0.7428, + "step": 14867 + }, + { + "epoch": 2.6470797720797723, + "grad_norm": 0.8660209774971008, + "learning_rate": 5.145756557993061e-05, + "loss": 0.8284, + "step": 14868 + }, + { + "epoch": 2.6472578347578346, + "grad_norm": 1.0153858661651611, + "learning_rate": 5.1445328364057475e-05, + "loss": 0.9766, + "step": 14869 + }, + { + "epoch": 2.6474358974358974, + "grad_norm": 0.9047706127166748, + 
"learning_rate": 5.143309209951223e-05, + "loss": 1.0099, + "step": 14870 + }, + { + "epoch": 2.64761396011396, + "grad_norm": 0.7924295663833618, + "learning_rate": 5.1420856786534724e-05, + "loss": 0.8385, + "step": 14871 + }, + { + "epoch": 2.647792022792023, + "grad_norm": 0.8885742425918579, + "learning_rate": 5.140862242536455e-05, + "loss": 1.0259, + "step": 14872 + }, + { + "epoch": 2.6479700854700856, + "grad_norm": 0.8826889991760254, + "learning_rate": 5.139638901624151e-05, + "loss": 1.0755, + "step": 14873 + }, + { + "epoch": 2.648148148148148, + "grad_norm": 0.7793754935264587, + "learning_rate": 5.138415655940526e-05, + "loss": 0.8289, + "step": 14874 + }, + { + "epoch": 2.6483262108262107, + "grad_norm": 0.8587870597839355, + "learning_rate": 5.137192505509547e-05, + "loss": 0.9185, + "step": 14875 + }, + { + "epoch": 2.6485042735042734, + "grad_norm": 0.8799259066581726, + "learning_rate": 5.13596945035518e-05, + "loss": 0.9493, + "step": 14876 + }, + { + "epoch": 2.648682336182336, + "grad_norm": 0.8108882308006287, + "learning_rate": 5.1347464905013834e-05, + "loss": 0.8892, + "step": 14877 + }, + { + "epoch": 2.648860398860399, + "grad_norm": 0.8387644290924072, + "learning_rate": 5.1335236259721296e-05, + "loss": 0.8723, + "step": 14878 + }, + { + "epoch": 2.6490384615384617, + "grad_norm": 0.8750926852226257, + "learning_rate": 5.1323008567913655e-05, + "loss": 0.6978, + "step": 14879 + }, + { + "epoch": 2.6492165242165244, + "grad_norm": 0.7837518453598022, + "learning_rate": 5.131078182983055e-05, + "loss": 0.8747, + "step": 14880 + }, + { + "epoch": 2.6493945868945867, + "grad_norm": 0.8998439311981201, + "learning_rate": 5.1298556045711566e-05, + "loss": 0.9903, + "step": 14881 + }, + { + "epoch": 2.6495726495726495, + "grad_norm": 0.8012915253639221, + "learning_rate": 5.128633121579619e-05, + "loss": 0.9767, + "step": 14882 + }, + { + "epoch": 2.6497507122507122, + "grad_norm": 0.9051218032836914, + "learning_rate": 
5.1274107340323964e-05, + "loss": 0.7454, + "step": 14883 + }, + { + "epoch": 2.649928774928775, + "grad_norm": 0.8373401761054993, + "learning_rate": 5.1261884419534376e-05, + "loss": 0.821, + "step": 14884 + }, + { + "epoch": 2.6501068376068377, + "grad_norm": 0.7482876181602478, + "learning_rate": 5.124966245366689e-05, + "loss": 0.7051, + "step": 14885 + }, + { + "epoch": 2.6502849002849, + "grad_norm": 0.8445764183998108, + "learning_rate": 5.1237441442961074e-05, + "loss": 0.7416, + "step": 14886 + }, + { + "epoch": 2.650462962962963, + "grad_norm": 0.887598991394043, + "learning_rate": 5.122522138765622e-05, + "loss": 0.9027, + "step": 14887 + }, + { + "epoch": 2.6506410256410255, + "grad_norm": 0.8089238405227661, + "learning_rate": 5.1213002287991905e-05, + "loss": 0.9294, + "step": 14888 + }, + { + "epoch": 2.6508190883190883, + "grad_norm": 0.8614209890365601, + "learning_rate": 5.120078414420739e-05, + "loss": 0.7716, + "step": 14889 + }, + { + "epoch": 2.650997150997151, + "grad_norm": 0.6805269718170166, + "learning_rate": 5.118856695654217e-05, + "loss": 0.6183, + "step": 14890 + }, + { + "epoch": 2.651175213675214, + "grad_norm": 0.9024596214294434, + "learning_rate": 5.117635072523559e-05, + "loss": 0.9516, + "step": 14891 + }, + { + "epoch": 2.6513532763532766, + "grad_norm": 0.906373143196106, + "learning_rate": 5.116413545052701e-05, + "loss": 0.7522, + "step": 14892 + }, + { + "epoch": 2.6515313390313393, + "grad_norm": 0.827235996723175, + "learning_rate": 5.1151921132655725e-05, + "loss": 0.6776, + "step": 14893 + }, + { + "epoch": 2.6517094017094016, + "grad_norm": 0.7769291996955872, + "learning_rate": 5.113970777186108e-05, + "loss": 0.6682, + "step": 14894 + }, + { + "epoch": 2.6518874643874644, + "grad_norm": 0.8420324921607971, + "learning_rate": 5.112749536838233e-05, + "loss": 0.8303, + "step": 14895 + }, + { + "epoch": 2.652065527065527, + "grad_norm": 0.789368748664856, + "learning_rate": 5.1115283922458814e-05, + "loss": 0.773, + 
"step": 14896 + }, + { + "epoch": 2.65224358974359, + "grad_norm": 0.9156190752983093, + "learning_rate": 5.1103073434329766e-05, + "loss": 1.0318, + "step": 14897 + }, + { + "epoch": 2.652421652421652, + "grad_norm": 1.0411027669906616, + "learning_rate": 5.109086390423441e-05, + "loss": 0.81, + "step": 14898 + }, + { + "epoch": 2.652599715099715, + "grad_norm": 0.9908538460731506, + "learning_rate": 5.107865533241198e-05, + "loss": 0.9386, + "step": 14899 + }, + { + "epoch": 2.6527777777777777, + "grad_norm": 0.7364035844802856, + "learning_rate": 5.106644771910165e-05, + "loss": 0.675, + "step": 14900 + }, + { + "epoch": 2.6529558404558404, + "grad_norm": 0.8409245014190674, + "learning_rate": 5.1054241064542686e-05, + "loss": 0.9446, + "step": 14901 + }, + { + "epoch": 2.653133903133903, + "grad_norm": 0.7731066942214966, + "learning_rate": 5.104203536897412e-05, + "loss": 0.4684, + "step": 14902 + }, + { + "epoch": 2.653311965811966, + "grad_norm": 0.9114529490470886, + "learning_rate": 5.102983063263525e-05, + "loss": 0.9551, + "step": 14903 + }, + { + "epoch": 2.6534900284900287, + "grad_norm": 0.7949321269989014, + "learning_rate": 5.101762685576503e-05, + "loss": 0.989, + "step": 14904 + }, + { + "epoch": 2.6536680911680914, + "grad_norm": 0.940191924571991, + "learning_rate": 5.1005424038602724e-05, + "loss": 1.0377, + "step": 14905 + }, + { + "epoch": 2.6538461538461537, + "grad_norm": 0.7629654407501221, + "learning_rate": 5.0993222181387334e-05, + "loss": 0.7908, + "step": 14906 + }, + { + "epoch": 2.6540242165242165, + "grad_norm": 0.9712302684783936, + "learning_rate": 5.098102128435797e-05, + "loss": 1.1486, + "step": 14907 + }, + { + "epoch": 2.6542022792022792, + "grad_norm": 0.9054526686668396, + "learning_rate": 5.096882134775365e-05, + "loss": 0.8078, + "step": 14908 + }, + { + "epoch": 2.654380341880342, + "grad_norm": 0.824647068977356, + "learning_rate": 5.095662237181343e-05, + "loss": 0.9095, + "step": 14909 + }, + { + "epoch": 
2.6545584045584043, + "grad_norm": 0.8760488033294678, + "learning_rate": 5.0944424356776287e-05, + "loss": 0.8538, + "step": 14910 + }, + { + "epoch": 2.654736467236467, + "grad_norm": 0.8012890219688416, + "learning_rate": 5.093222730288131e-05, + "loss": 0.7972, + "step": 14911 + }, + { + "epoch": 2.65491452991453, + "grad_norm": 0.9025147557258606, + "learning_rate": 5.0920031210367326e-05, + "loss": 0.8485, + "step": 14912 + }, + { + "epoch": 2.6550925925925926, + "grad_norm": 0.8621100783348083, + "learning_rate": 5.090783607947347e-05, + "loss": 1.1856, + "step": 14913 + }, + { + "epoch": 2.6552706552706553, + "grad_norm": 0.7914317846298218, + "learning_rate": 5.08956419104385e-05, + "loss": 0.78, + "step": 14914 + }, + { + "epoch": 2.655448717948718, + "grad_norm": 0.8691070675849915, + "learning_rate": 5.088344870350146e-05, + "loss": 0.8406, + "step": 14915 + }, + { + "epoch": 2.655626780626781, + "grad_norm": 0.8521141409873962, + "learning_rate": 5.087125645890121e-05, + "loss": 1.0077, + "step": 14916 + }, + { + "epoch": 2.6558048433048436, + "grad_norm": 0.7918437123298645, + "learning_rate": 5.08590651768766e-05, + "loss": 0.8367, + "step": 14917 + }, + { + "epoch": 2.655982905982906, + "grad_norm": 0.8580697178840637, + "learning_rate": 5.084687485766659e-05, + "loss": 0.921, + "step": 14918 + }, + { + "epoch": 2.6561609686609686, + "grad_norm": 0.7943900227546692, + "learning_rate": 5.0834685501509894e-05, + "loss": 0.7934, + "step": 14919 + }, + { + "epoch": 2.6563390313390314, + "grad_norm": 0.7467655539512634, + "learning_rate": 5.082249710864544e-05, + "loss": 0.8625, + "step": 14920 + }, + { + "epoch": 2.656517094017094, + "grad_norm": 0.7654036283493042, + "learning_rate": 5.0810309679311996e-05, + "loss": 0.888, + "step": 14921 + }, + { + "epoch": 2.6566951566951564, + "grad_norm": 0.8428319692611694, + "learning_rate": 5.079812321374836e-05, + "loss": 0.858, + "step": 14922 + }, + { + "epoch": 2.656873219373219, + "grad_norm": 
0.8273693323135376, + "learning_rate": 5.078593771219329e-05, + "loss": 0.8982, + "step": 14923 + }, + { + "epoch": 2.657051282051282, + "grad_norm": 0.9037185311317444, + "learning_rate": 5.077375317488553e-05, + "loss": 0.7022, + "step": 14924 + }, + { + "epoch": 2.6572293447293447, + "grad_norm": 0.916585385799408, + "learning_rate": 5.0761569602063816e-05, + "loss": 0.8058, + "step": 14925 + }, + { + "epoch": 2.6574074074074074, + "grad_norm": 0.8697561621665955, + "learning_rate": 5.074938699396687e-05, + "loss": 0.8142, + "step": 14926 + }, + { + "epoch": 2.65758547008547, + "grad_norm": 1.024512529373169, + "learning_rate": 5.073720535083334e-05, + "loss": 0.7462, + "step": 14927 + }, + { + "epoch": 2.657763532763533, + "grad_norm": 0.8258776664733887, + "learning_rate": 5.072502467290201e-05, + "loss": 0.7467, + "step": 14928 + }, + { + "epoch": 2.6579415954415957, + "grad_norm": 0.8279047012329102, + "learning_rate": 5.071284496041138e-05, + "loss": 0.9148, + "step": 14929 + }, + { + "epoch": 2.658119658119658, + "grad_norm": 0.8176717758178711, + "learning_rate": 5.070066621360021e-05, + "loss": 1.0971, + "step": 14930 + }, + { + "epoch": 2.6582977207977208, + "grad_norm": 0.7482925057411194, + "learning_rate": 5.0688488432707074e-05, + "loss": 0.8666, + "step": 14931 + }, + { + "epoch": 2.6584757834757835, + "grad_norm": 0.9302734136581421, + "learning_rate": 5.067631161797057e-05, + "loss": 0.9994, + "step": 14932 + }, + { + "epoch": 2.6586538461538463, + "grad_norm": 0.7811494469642639, + "learning_rate": 5.066413576962927e-05, + "loss": 0.5959, + "step": 14933 + }, + { + "epoch": 2.6588319088319086, + "grad_norm": 0.8109773993492126, + "learning_rate": 5.065196088792177e-05, + "loss": 0.7342, + "step": 14934 + }, + { + "epoch": 2.6590099715099713, + "grad_norm": 0.8351961374282837, + "learning_rate": 5.0639786973086525e-05, + "loss": 0.775, + "step": 14935 + }, + { + "epoch": 2.659188034188034, + "grad_norm": 0.8558792471885681, + "learning_rate": 
5.062761402536216e-05, + "loss": 0.8819, + "step": 14936 + }, + { + "epoch": 2.659366096866097, + "grad_norm": 0.7928652167320251, + "learning_rate": 5.061544204498714e-05, + "loss": 0.8313, + "step": 14937 + }, + { + "epoch": 2.6595441595441596, + "grad_norm": 0.8388734459877014, + "learning_rate": 5.060327103219993e-05, + "loss": 0.7208, + "step": 14938 + }, + { + "epoch": 2.6597222222222223, + "grad_norm": 0.8921391367912292, + "learning_rate": 5.059110098723903e-05, + "loss": 0.8974, + "step": 14939 + }, + { + "epoch": 2.659900284900285, + "grad_norm": 0.8111342787742615, + "learning_rate": 5.057893191034286e-05, + "loss": 0.6879, + "step": 14940 + }, + { + "epoch": 2.660078347578348, + "grad_norm": 0.8677322864532471, + "learning_rate": 5.056676380174985e-05, + "loss": 0.8643, + "step": 14941 + }, + { + "epoch": 2.66025641025641, + "grad_norm": 0.7969355583190918, + "learning_rate": 5.055459666169839e-05, + "loss": 0.8462, + "step": 14942 + }, + { + "epoch": 2.660434472934473, + "grad_norm": 0.9927026629447937, + "learning_rate": 5.0542430490426975e-05, + "loss": 0.7954, + "step": 14943 + }, + { + "epoch": 2.6606125356125356, + "grad_norm": 1.0181084871292114, + "learning_rate": 5.053026528817379e-05, + "loss": 0.9597, + "step": 14944 + }, + { + "epoch": 2.6607905982905984, + "grad_norm": 1.0274122953414917, + "learning_rate": 5.0518101055177355e-05, + "loss": 0.7321, + "step": 14945 + }, + { + "epoch": 2.6609686609686607, + "grad_norm": 1.056132197380066, + "learning_rate": 5.050593779167594e-05, + "loss": 0.8405, + "step": 14946 + }, + { + "epoch": 2.6611467236467234, + "grad_norm": 0.8586339950561523, + "learning_rate": 5.0493775497907846e-05, + "loss": 1.0238, + "step": 14947 + }, + { + "epoch": 2.661324786324786, + "grad_norm": 0.8103144764900208, + "learning_rate": 5.048161417411139e-05, + "loss": 0.5885, + "step": 14948 + }, + { + "epoch": 2.661502849002849, + "grad_norm": 0.7321345210075378, + "learning_rate": 5.0469453820524834e-05, + "loss": 0.7987, 
+ "step": 14949 + }, + { + "epoch": 2.6616809116809117, + "grad_norm": 0.8244233727455139, + "learning_rate": 5.045729443738645e-05, + "loss": 0.8855, + "step": 14950 + }, + { + "epoch": 2.6618589743589745, + "grad_norm": 0.7888374924659729, + "learning_rate": 5.0445136024934456e-05, + "loss": 0.9192, + "step": 14951 + }, + { + "epoch": 2.662037037037037, + "grad_norm": 0.8414669036865234, + "learning_rate": 5.0432978583407044e-05, + "loss": 0.8152, + "step": 14952 + }, + { + "epoch": 2.6622150997151, + "grad_norm": 0.9176363348960876, + "learning_rate": 5.042082211304252e-05, + "loss": 0.8836, + "step": 14953 + }, + { + "epoch": 2.6623931623931623, + "grad_norm": 0.9827163219451904, + "learning_rate": 5.040866661407893e-05, + "loss": 0.963, + "step": 14954 + }, + { + "epoch": 2.662571225071225, + "grad_norm": 0.8765084743499756, + "learning_rate": 5.0396512086754535e-05, + "loss": 1.022, + "step": 14955 + }, + { + "epoch": 2.6627492877492878, + "grad_norm": 0.9236209392547607, + "learning_rate": 5.038435853130743e-05, + "loss": 0.9152, + "step": 14956 + }, + { + "epoch": 2.6629273504273505, + "grad_norm": 0.8300418853759766, + "learning_rate": 5.037220594797574e-05, + "loss": 0.8063, + "step": 14957 + }, + { + "epoch": 2.6631054131054133, + "grad_norm": 0.9248050451278687, + "learning_rate": 5.036005433699764e-05, + "loss": 0.8799, + "step": 14958 + }, + { + "epoch": 2.6632834757834756, + "grad_norm": 0.9670597910881042, + "learning_rate": 5.0347903698611085e-05, + "loss": 0.9068, + "step": 14959 + }, + { + "epoch": 2.6634615384615383, + "grad_norm": 0.851403534412384, + "learning_rate": 5.033575403305428e-05, + "loss": 0.8058, + "step": 14960 + }, + { + "epoch": 2.663639601139601, + "grad_norm": 0.9643952250480652, + "learning_rate": 5.032360534056515e-05, + "loss": 1.076, + "step": 14961 + }, + { + "epoch": 2.663817663817664, + "grad_norm": 0.8473731279373169, + "learning_rate": 5.031145762138181e-05, + "loss": 0.9585, + "step": 14962 + }, + { + "epoch": 
2.6639957264957266, + "grad_norm": 0.8265015482902527, + "learning_rate": 5.029931087574222e-05, + "loss": 0.8602, + "step": 14963 + }, + { + "epoch": 2.6641737891737893, + "grad_norm": 0.8004183173179626, + "learning_rate": 5.0287165103884416e-05, + "loss": 0.7293, + "step": 14964 + }, + { + "epoch": 2.664351851851852, + "grad_norm": 0.8410465121269226, + "learning_rate": 5.027502030604633e-05, + "loss": 0.9479, + "step": 14965 + }, + { + "epoch": 2.6645299145299144, + "grad_norm": 0.8365132808685303, + "learning_rate": 5.0262876482465925e-05, + "loss": 0.7373, + "step": 14966 + }, + { + "epoch": 2.664707977207977, + "grad_norm": 0.9017055630683899, + "learning_rate": 5.025073363338111e-05, + "loss": 0.9463, + "step": 14967 + }, + { + "epoch": 2.66488603988604, + "grad_norm": 0.7985300421714783, + "learning_rate": 5.023859175902988e-05, + "loss": 0.7074, + "step": 14968 + }, + { + "epoch": 2.6650641025641026, + "grad_norm": 0.8032601475715637, + "learning_rate": 5.022645085965001e-05, + "loss": 0.6796, + "step": 14969 + }, + { + "epoch": 2.6652421652421654, + "grad_norm": 0.7785899639129639, + "learning_rate": 5.021431093547948e-05, + "loss": 0.7256, + "step": 14970 + }, + { + "epoch": 2.6654202279202277, + "grad_norm": 0.8083044290542603, + "learning_rate": 5.02021719867561e-05, + "loss": 0.9254, + "step": 14971 + }, + { + "epoch": 2.6655982905982905, + "grad_norm": 0.8896783590316772, + "learning_rate": 5.019003401371771e-05, + "loss": 0.9231, + "step": 14972 + }, + { + "epoch": 2.665776353276353, + "grad_norm": 0.9304720163345337, + "learning_rate": 5.017789701660215e-05, + "loss": 0.8915, + "step": 14973 + }, + { + "epoch": 2.665954415954416, + "grad_norm": 0.8683121204376221, + "learning_rate": 5.016576099564718e-05, + "loss": 0.8654, + "step": 14974 + }, + { + "epoch": 2.6661324786324787, + "grad_norm": 1.1082890033721924, + "learning_rate": 5.015362595109062e-05, + "loss": 1.0669, + "step": 14975 + }, + { + "epoch": 2.6663105413105415, + "grad_norm": 
1.1696041822433472, + "learning_rate": 5.014149188317017e-05, + "loss": 0.9273, + "step": 14976 + }, + { + "epoch": 2.666488603988604, + "grad_norm": 0.8726202845573425, + "learning_rate": 5.0129358792123637e-05, + "loss": 0.6615, + "step": 14977 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.8246448636054993, + "learning_rate": 5.011722667818875e-05, + "loss": 0.8263, + "step": 14978 + }, + { + "epoch": 2.6668447293447293, + "grad_norm": 0.7201130390167236, + "learning_rate": 5.010509554160316e-05, + "loss": 0.7122, + "step": 14979 + }, + { + "epoch": 2.667022792022792, + "grad_norm": 0.8296586275100708, + "learning_rate": 5.009296538260457e-05, + "loss": 0.8816, + "step": 14980 + }, + { + "epoch": 2.6672008547008548, + "grad_norm": 0.8647085428237915, + "learning_rate": 5.008083620143067e-05, + "loss": 1.0, + "step": 14981 + }, + { + "epoch": 2.6673789173789175, + "grad_norm": 0.8175796270370483, + "learning_rate": 5.0068707998319045e-05, + "loss": 0.6727, + "step": 14982 + }, + { + "epoch": 2.66755698005698, + "grad_norm": 0.8537090420722961, + "learning_rate": 5.0056580773507434e-05, + "loss": 0.8034, + "step": 14983 + }, + { + "epoch": 2.6677350427350426, + "grad_norm": 0.7980232238769531, + "learning_rate": 5.00444545272333e-05, + "loss": 0.83, + "step": 14984 + }, + { + "epoch": 2.6679131054131053, + "grad_norm": 0.8231784701347351, + "learning_rate": 5.003232925973438e-05, + "loss": 0.6292, + "step": 14985 + }, + { + "epoch": 2.668091168091168, + "grad_norm": 0.9140519499778748, + "learning_rate": 5.0020204971248096e-05, + "loss": 0.893, + "step": 14986 + }, + { + "epoch": 2.668269230769231, + "grad_norm": 0.7462875247001648, + "learning_rate": 5.000808166201212e-05, + "loss": 0.7335, + "step": 14987 + }, + { + "epoch": 2.6684472934472936, + "grad_norm": 0.8201214671134949, + "learning_rate": 4.999595933226392e-05, + "loss": 0.7888, + "step": 14988 + }, + { + "epoch": 2.6686253561253563, + "grad_norm": 0.9165699481964111, + "learning_rate": 
4.9983837982241024e-05, + "loss": 0.8808, + "step": 14989 + }, + { + "epoch": 2.6688034188034186, + "grad_norm": 0.9286229610443115, + "learning_rate": 4.997171761218092e-05, + "loss": 0.969, + "step": 14990 + }, + { + "epoch": 2.6689814814814814, + "grad_norm": 0.6710283160209656, + "learning_rate": 4.995959822232109e-05, + "loss": 0.6046, + "step": 14991 + }, + { + "epoch": 2.669159544159544, + "grad_norm": 0.9091618061065674, + "learning_rate": 4.994747981289895e-05, + "loss": 0.9747, + "step": 14992 + }, + { + "epoch": 2.669337606837607, + "grad_norm": 0.7992748618125916, + "learning_rate": 4.993536238415204e-05, + "loss": 0.8441, + "step": 14993 + }, + { + "epoch": 2.6695156695156697, + "grad_norm": 0.926811695098877, + "learning_rate": 4.992324593631762e-05, + "loss": 1.0308, + "step": 14994 + }, + { + "epoch": 2.669693732193732, + "grad_norm": 0.8966291546821594, + "learning_rate": 4.9911130469633216e-05, + "loss": 1.0689, + "step": 14995 + }, + { + "epoch": 2.6698717948717947, + "grad_norm": 0.8300046324729919, + "learning_rate": 4.989901598433616e-05, + "loss": 0.8539, + "step": 14996 + }, + { + "epoch": 2.6700498575498575, + "grad_norm": 0.9567606449127197, + "learning_rate": 4.988690248066381e-05, + "loss": 0.7707, + "step": 14997 + }, + { + "epoch": 2.67022792022792, + "grad_norm": 0.7993598580360413, + "learning_rate": 4.987478995885351e-05, + "loss": 0.9241, + "step": 14998 + }, + { + "epoch": 2.670405982905983, + "grad_norm": 0.9573900103569031, + "learning_rate": 4.986267841914253e-05, + "loss": 0.8051, + "step": 14999 + }, + { + "epoch": 2.6705840455840457, + "grad_norm": 0.8562188148498535, + "learning_rate": 4.985056786176828e-05, + "loss": 0.8818, + "step": 15000 + }, + { + "epoch": 2.6707621082621085, + "grad_norm": 0.7997880578041077, + "learning_rate": 4.983845828696792e-05, + "loss": 0.8749, + "step": 15001 + }, + { + "epoch": 2.6709401709401708, + "grad_norm": 0.8442137837409973, + "learning_rate": 4.982634969497879e-05, + "loss": 1.0239, + 
"step": 15002 + }, + { + "epoch": 2.6711182336182335, + "grad_norm": 0.775762140750885, + "learning_rate": 4.981424208603812e-05, + "loss": 0.728, + "step": 15003 + }, + { + "epoch": 2.6712962962962963, + "grad_norm": 0.7570006251335144, + "learning_rate": 4.9802135460383126e-05, + "loss": 0.6964, + "step": 15004 + }, + { + "epoch": 2.671474358974359, + "grad_norm": 0.8406931161880493, + "learning_rate": 4.979002981825101e-05, + "loss": 0.783, + "step": 15005 + }, + { + "epoch": 2.671652421652422, + "grad_norm": 0.827357828617096, + "learning_rate": 4.977792515987896e-05, + "loss": 0.9294, + "step": 15006 + }, + { + "epoch": 2.671830484330484, + "grad_norm": 0.9244057536125183, + "learning_rate": 4.9765821485504094e-05, + "loss": 0.8993, + "step": 15007 + }, + { + "epoch": 2.672008547008547, + "grad_norm": 0.7569696307182312, + "learning_rate": 4.975371879536368e-05, + "loss": 0.8305, + "step": 15008 + }, + { + "epoch": 2.6721866096866096, + "grad_norm": 0.8337959051132202, + "learning_rate": 4.9741617089694695e-05, + "loss": 0.8793, + "step": 15009 + }, + { + "epoch": 2.6723646723646723, + "grad_norm": 0.7254770994186401, + "learning_rate": 4.97295163687344e-05, + "loss": 0.9325, + "step": 15010 + }, + { + "epoch": 2.672542735042735, + "grad_norm": 0.7988013029098511, + "learning_rate": 4.971741663271972e-05, + "loss": 0.9787, + "step": 15011 + }, + { + "epoch": 2.672720797720798, + "grad_norm": 0.8326970338821411, + "learning_rate": 4.9705317881887845e-05, + "loss": 0.9164, + "step": 15012 + }, + { + "epoch": 2.6728988603988606, + "grad_norm": 0.7416687607765198, + "learning_rate": 4.96932201164758e-05, + "loss": 0.9041, + "step": 15013 + }, + { + "epoch": 2.6730769230769234, + "grad_norm": 0.868765652179718, + "learning_rate": 4.968112333672059e-05, + "loss": 0.646, + "step": 15014 + }, + { + "epoch": 2.6732549857549857, + "grad_norm": 0.7440044283866882, + "learning_rate": 4.966902754285925e-05, + "loss": 0.9147, + "step": 15015 + }, + { + "epoch": 
2.6734330484330484, + "grad_norm": 0.8410077691078186, + "learning_rate": 4.9656932735128724e-05, + "loss": 0.8044, + "step": 15016 + }, + { + "epoch": 2.673611111111111, + "grad_norm": 0.8185286521911621, + "learning_rate": 4.964483891376606e-05, + "loss": 0.8057, + "step": 15017 + }, + { + "epoch": 2.673789173789174, + "grad_norm": 0.8550063967704773, + "learning_rate": 4.9632746079008166e-05, + "loss": 0.7841, + "step": 15018 + }, + { + "epoch": 2.673967236467236, + "grad_norm": 0.9171682000160217, + "learning_rate": 4.962065423109199e-05, + "loss": 0.8731, + "step": 15019 + }, + { + "epoch": 2.674145299145299, + "grad_norm": 0.8567686676979065, + "learning_rate": 4.9608563370254436e-05, + "loss": 0.9284, + "step": 15020 + }, + { + "epoch": 2.6743233618233617, + "grad_norm": 0.8641629219055176, + "learning_rate": 4.959647349673241e-05, + "loss": 1.0165, + "step": 15021 + }, + { + "epoch": 2.6745014245014245, + "grad_norm": 0.8058172464370728, + "learning_rate": 4.958438461076277e-05, + "loss": 0.9737, + "step": 15022 + }, + { + "epoch": 2.6746794871794872, + "grad_norm": 0.8329246640205383, + "learning_rate": 4.95722967125824e-05, + "loss": 0.7943, + "step": 15023 + }, + { + "epoch": 2.67485754985755, + "grad_norm": 0.9603211879730225, + "learning_rate": 4.956020980242807e-05, + "loss": 0.9453, + "step": 15024 + }, + { + "epoch": 2.6750356125356127, + "grad_norm": 0.8635705709457397, + "learning_rate": 4.9548123880536736e-05, + "loss": 0.9028, + "step": 15025 + }, + { + "epoch": 2.6752136752136755, + "grad_norm": 0.8909839987754822, + "learning_rate": 4.9536038947145024e-05, + "loss": 1.0376, + "step": 15026 + }, + { + "epoch": 2.675391737891738, + "grad_norm": 0.7507481575012207, + "learning_rate": 4.952395500248984e-05, + "loss": 0.7151, + "step": 15027 + }, + { + "epoch": 2.6755698005698005, + "grad_norm": 0.9425675272941589, + "learning_rate": 4.951187204680791e-05, + "loss": 0.839, + "step": 15028 + }, + { + "epoch": 2.6757478632478633, + "grad_norm": 
0.8826829195022583, + "learning_rate": 4.949979008033596e-05, + "loss": 1.0107, + "step": 15029 + }, + { + "epoch": 2.675925925925926, + "grad_norm": 0.9209766387939453, + "learning_rate": 4.948770910331072e-05, + "loss": 0.8685, + "step": 15030 + }, + { + "epoch": 2.6761039886039883, + "grad_norm": 0.8018497824668884, + "learning_rate": 4.947562911596889e-05, + "loss": 0.7417, + "step": 15031 + }, + { + "epoch": 2.676282051282051, + "grad_norm": 0.7865417003631592, + "learning_rate": 4.9463550118547155e-05, + "loss": 0.9332, + "step": 15032 + }, + { + "epoch": 2.676460113960114, + "grad_norm": 0.8146806955337524, + "learning_rate": 4.945147211128216e-05, + "loss": 0.8658, + "step": 15033 + }, + { + "epoch": 2.6766381766381766, + "grad_norm": 0.8176286816596985, + "learning_rate": 4.943939509441054e-05, + "loss": 1.0603, + "step": 15034 + }, + { + "epoch": 2.6768162393162394, + "grad_norm": 0.8441028594970703, + "learning_rate": 4.942731906816897e-05, + "loss": 0.8699, + "step": 15035 + }, + { + "epoch": 2.676994301994302, + "grad_norm": 1.0035977363586426, + "learning_rate": 4.941524403279405e-05, + "loss": 0.8149, + "step": 15036 + }, + { + "epoch": 2.677172364672365, + "grad_norm": 0.8316586017608643, + "learning_rate": 4.9403169988522324e-05, + "loss": 0.9674, + "step": 15037 + }, + { + "epoch": 2.6773504273504276, + "grad_norm": 0.7379693388938904, + "learning_rate": 4.9391096935590375e-05, + "loss": 0.7097, + "step": 15038 + }, + { + "epoch": 2.67752849002849, + "grad_norm": 0.8861358165740967, + "learning_rate": 4.937902487423473e-05, + "loss": 0.9145, + "step": 15039 + }, + { + "epoch": 2.6777065527065527, + "grad_norm": 0.8769996166229248, + "learning_rate": 4.9366953804691994e-05, + "loss": 0.92, + "step": 15040 + }, + { + "epoch": 2.6778846153846154, + "grad_norm": 0.891703724861145, + "learning_rate": 4.9354883727198545e-05, + "loss": 0.8898, + "step": 15041 + }, + { + "epoch": 2.678062678062678, + "grad_norm": 0.8371208310127258, + "learning_rate": 
4.934281464199099e-05, + "loss": 0.8868, + "step": 15042 + }, + { + "epoch": 2.6782407407407405, + "grad_norm": 0.8618297576904297, + "learning_rate": 4.933074654930574e-05, + "loss": 0.8577, + "step": 15043 + }, + { + "epoch": 2.6784188034188032, + "grad_norm": 0.7748361229896545, + "learning_rate": 4.931867944937926e-05, + "loss": 0.7273, + "step": 15044 + }, + { + "epoch": 2.678596866096866, + "grad_norm": 0.8320143222808838, + "learning_rate": 4.930661334244797e-05, + "loss": 0.8654, + "step": 15045 + }, + { + "epoch": 2.6787749287749287, + "grad_norm": 0.8370615243911743, + "learning_rate": 4.929454822874829e-05, + "loss": 0.751, + "step": 15046 + }, + { + "epoch": 2.6789529914529915, + "grad_norm": 0.9115342497825623, + "learning_rate": 4.9282484108516614e-05, + "loss": 0.823, + "step": 15047 + }, + { + "epoch": 2.6791310541310542, + "grad_norm": 0.9542914032936096, + "learning_rate": 4.9270420981989294e-05, + "loss": 0.9271, + "step": 15048 + }, + { + "epoch": 2.679309116809117, + "grad_norm": 0.765336275100708, + "learning_rate": 4.9258358849402655e-05, + "loss": 0.5523, + "step": 15049 + }, + { + "epoch": 2.6794871794871797, + "grad_norm": 0.8169335722923279, + "learning_rate": 4.924629771099315e-05, + "loss": 0.7437, + "step": 15050 + }, + { + "epoch": 2.679665242165242, + "grad_norm": 0.8192304968833923, + "learning_rate": 4.9234237566996935e-05, + "loss": 0.8888, + "step": 15051 + }, + { + "epoch": 2.679843304843305, + "grad_norm": 0.8657594919204712, + "learning_rate": 4.922217841765041e-05, + "loss": 0.9858, + "step": 15052 + }, + { + "epoch": 2.6800213675213675, + "grad_norm": 0.9291723370552063, + "learning_rate": 4.921012026318982e-05, + "loss": 0.9731, + "step": 15053 + }, + { + "epoch": 2.6801994301994303, + "grad_norm": 0.7988953590393066, + "learning_rate": 4.919806310385138e-05, + "loss": 0.8467, + "step": 15054 + }, + { + "epoch": 2.6803774928774926, + "grad_norm": 0.8022913336753845, + "learning_rate": 4.9186006939871434e-05, + "loss": 
0.9009, + "step": 15055 + }, + { + "epoch": 2.6805555555555554, + "grad_norm": 0.8444825410842896, + "learning_rate": 4.917395177148605e-05, + "loss": 0.8851, + "step": 15056 + }, + { + "epoch": 2.680733618233618, + "grad_norm": 0.8054760694503784, + "learning_rate": 4.9161897598931575e-05, + "loss": 0.8679, + "step": 15057 + }, + { + "epoch": 2.680911680911681, + "grad_norm": 0.8291507959365845, + "learning_rate": 4.9149844422444023e-05, + "loss": 0.7229, + "step": 15058 + }, + { + "epoch": 2.6810897435897436, + "grad_norm": 0.9225491285324097, + "learning_rate": 4.91377922422597e-05, + "loss": 0.7584, + "step": 15059 + }, + { + "epoch": 2.6812678062678064, + "grad_norm": 0.9598490595817566, + "learning_rate": 4.912574105861466e-05, + "loss": 1.0548, + "step": 15060 + }, + { + "epoch": 2.681445868945869, + "grad_norm": 0.7480899691581726, + "learning_rate": 4.911369087174504e-05, + "loss": 0.8389, + "step": 15061 + }, + { + "epoch": 2.681623931623932, + "grad_norm": 1.0396811962127686, + "learning_rate": 4.910164168188696e-05, + "loss": 0.8776, + "step": 15062 + }, + { + "epoch": 2.681801994301994, + "grad_norm": 0.8191503882408142, + "learning_rate": 4.9089593489276465e-05, + "loss": 0.7601, + "step": 15063 + }, + { + "epoch": 2.681980056980057, + "grad_norm": 0.8405289053916931, + "learning_rate": 4.907754629414959e-05, + "loss": 1.0859, + "step": 15064 + }, + { + "epoch": 2.6821581196581197, + "grad_norm": 0.8369600176811218, + "learning_rate": 4.90655000967425e-05, + "loss": 0.9159, + "step": 15065 + }, + { + "epoch": 2.6823361823361824, + "grad_norm": 0.8304924368858337, + "learning_rate": 4.905345489729104e-05, + "loss": 0.743, + "step": 15066 + }, + { + "epoch": 2.682514245014245, + "grad_norm": 0.7378702163696289, + "learning_rate": 4.904141069603139e-05, + "loss": 0.9386, + "step": 15067 + }, + { + "epoch": 2.6826923076923075, + "grad_norm": 0.9135075807571411, + "learning_rate": 4.902936749319935e-05, + "loss": 0.7341, + "step": 15068 + }, + { + "epoch": 
2.6828703703703702, + "grad_norm": 0.77586430311203, + "learning_rate": 4.901732528903101e-05, + "loss": 0.5586, + "step": 15069 + }, + { + "epoch": 2.683048433048433, + "grad_norm": 0.8733307719230652, + "learning_rate": 4.900528408376228e-05, + "loss": 0.8173, + "step": 15070 + }, + { + "epoch": 2.6832264957264957, + "grad_norm": 0.7499578595161438, + "learning_rate": 4.8993243877629066e-05, + "loss": 0.7355, + "step": 15071 + }, + { + "epoch": 2.6834045584045585, + "grad_norm": 0.8372282385826111, + "learning_rate": 4.8981204670867295e-05, + "loss": 0.8169, + "step": 15072 + }, + { + "epoch": 2.6835826210826212, + "grad_norm": 0.7705212235450745, + "learning_rate": 4.8969166463712834e-05, + "loss": 0.7382, + "step": 15073 + }, + { + "epoch": 2.683760683760684, + "grad_norm": 0.8367058038711548, + "learning_rate": 4.89571292564015e-05, + "loss": 0.7268, + "step": 15074 + }, + { + "epoch": 2.6839387464387463, + "grad_norm": 0.8421934843063354, + "learning_rate": 4.8945093049169233e-05, + "loss": 0.8319, + "step": 15075 + }, + { + "epoch": 2.684116809116809, + "grad_norm": 0.8927276730537415, + "learning_rate": 4.893305784225181e-05, + "loss": 0.8669, + "step": 15076 + }, + { + "epoch": 2.684294871794872, + "grad_norm": 0.8147335052490234, + "learning_rate": 4.892102363588503e-05, + "loss": 0.7722, + "step": 15077 + }, + { + "epoch": 2.6844729344729346, + "grad_norm": 0.9491320848464966, + "learning_rate": 4.890899043030469e-05, + "loss": 1.0213, + "step": 15078 + }, + { + "epoch": 2.6846509971509973, + "grad_norm": 0.8635398745536804, + "learning_rate": 4.889695822574651e-05, + "loss": 0.797, + "step": 15079 + }, + { + "epoch": 2.6848290598290596, + "grad_norm": 0.7290985584259033, + "learning_rate": 4.888492702244636e-05, + "loss": 1.0142, + "step": 15080 + }, + { + "epoch": 2.6850071225071224, + "grad_norm": 0.7667058110237122, + "learning_rate": 4.8872896820639794e-05, + "loss": 0.7547, + "step": 15081 + }, + { + "epoch": 2.685185185185185, + "grad_norm": 
0.9096128344535828, + "learning_rate": 4.886086762056269e-05, + "loss": 0.7972, + "step": 15082 + }, + { + "epoch": 2.685363247863248, + "grad_norm": 0.7461803555488586, + "learning_rate": 4.884883942245057e-05, + "loss": 0.8994, + "step": 15083 + }, + { + "epoch": 2.6855413105413106, + "grad_norm": 0.7640016674995422, + "learning_rate": 4.883681222653923e-05, + "loss": 0.7607, + "step": 15084 + }, + { + "epoch": 2.6857193732193734, + "grad_norm": 0.7481253743171692, + "learning_rate": 4.882478603306427e-05, + "loss": 0.7089, + "step": 15085 + }, + { + "epoch": 2.685897435897436, + "grad_norm": 0.825998842716217, + "learning_rate": 4.881276084226132e-05, + "loss": 0.6617, + "step": 15086 + }, + { + "epoch": 2.6860754985754984, + "grad_norm": 0.9775291085243225, + "learning_rate": 4.8800736654365986e-05, + "loss": 0.9345, + "step": 15087 + }, + { + "epoch": 2.686253561253561, + "grad_norm": 0.8158339262008667, + "learning_rate": 4.878871346961387e-05, + "loss": 0.8198, + "step": 15088 + }, + { + "epoch": 2.686431623931624, + "grad_norm": 0.8778133988380432, + "learning_rate": 4.8776691288240486e-05, + "loss": 0.8323, + "step": 15089 + }, + { + "epoch": 2.6866096866096867, + "grad_norm": 0.9657309055328369, + "learning_rate": 4.8764670110481505e-05, + "loss": 0.907, + "step": 15090 + }, + { + "epoch": 2.6867877492877494, + "grad_norm": 1.0467438697814941, + "learning_rate": 4.8752649936572304e-05, + "loss": 1.0128, + "step": 15091 + }, + { + "epoch": 2.6869658119658117, + "grad_norm": 0.7682142853736877, + "learning_rate": 4.874063076674854e-05, + "loss": 1.0164, + "step": 15092 + }, + { + "epoch": 2.6871438746438745, + "grad_norm": 0.8184331059455872, + "learning_rate": 4.8728612601245574e-05, + "loss": 0.6614, + "step": 15093 + }, + { + "epoch": 2.6873219373219372, + "grad_norm": 0.8372936248779297, + "learning_rate": 4.871659544029896e-05, + "loss": 0.9011, + "step": 15094 + }, + { + "epoch": 2.6875, + "grad_norm": 0.7872710824012756, + "learning_rate": 
4.870457928414414e-05, + "loss": 0.6986, + "step": 15095 + }, + { + "epoch": 2.6876780626780628, + "grad_norm": 0.7297250628471375, + "learning_rate": 4.8692564133016485e-05, + "loss": 0.5399, + "step": 15096 + }, + { + "epoch": 2.6878561253561255, + "grad_norm": 0.8855645060539246, + "learning_rate": 4.868054998715153e-05, + "loss": 0.8992, + "step": 15097 + }, + { + "epoch": 2.6880341880341883, + "grad_norm": 0.9055765271186829, + "learning_rate": 4.866853684678452e-05, + "loss": 0.888, + "step": 15098 + }, + { + "epoch": 2.6882122507122506, + "grad_norm": 1.0414996147155762, + "learning_rate": 4.865652471215093e-05, + "loss": 1.0375, + "step": 15099 + }, + { + "epoch": 2.6883903133903133, + "grad_norm": 0.8606446385383606, + "learning_rate": 4.8644513583486086e-05, + "loss": 0.8906, + "step": 15100 + }, + { + "epoch": 2.688568376068376, + "grad_norm": 0.9065528512001038, + "learning_rate": 4.8632503461025316e-05, + "loss": 0.848, + "step": 15101 + }, + { + "epoch": 2.688746438746439, + "grad_norm": 0.7832834720611572, + "learning_rate": 4.862049434500393e-05, + "loss": 0.7028, + "step": 15102 + }, + { + "epoch": 2.6889245014245016, + "grad_norm": 0.7107385396957397, + "learning_rate": 4.860848623565723e-05, + "loss": 0.7249, + "step": 15103 + }, + { + "epoch": 2.689102564102564, + "grad_norm": 0.8936449289321899, + "learning_rate": 4.8596479133220485e-05, + "loss": 0.9651, + "step": 15104 + }, + { + "epoch": 2.6892806267806266, + "grad_norm": 0.9019163846969604, + "learning_rate": 4.8584473037928944e-05, + "loss": 0.7165, + "step": 15105 + }, + { + "epoch": 2.6894586894586894, + "grad_norm": 0.8838223218917847, + "learning_rate": 4.857246795001782e-05, + "loss": 0.8148, + "step": 15106 + }, + { + "epoch": 2.689636752136752, + "grad_norm": 0.8004612922668457, + "learning_rate": 4.856046386972243e-05, + "loss": 0.9109, + "step": 15107 + }, + { + "epoch": 2.689814814814815, + "grad_norm": 0.9337486028671265, + "learning_rate": 4.854846079727781e-05, + "loss": 
1.0952, + "step": 15108 + }, + { + "epoch": 2.6899928774928776, + "grad_norm": 0.6513102650642395, + "learning_rate": 4.853645873291926e-05, + "loss": 0.5435, + "step": 15109 + }, + { + "epoch": 2.6901709401709404, + "grad_norm": 0.8750485181808472, + "learning_rate": 4.85244576768819e-05, + "loss": 0.8783, + "step": 15110 + }, + { + "epoch": 2.6903490028490027, + "grad_norm": 0.9513342976570129, + "learning_rate": 4.851245762940085e-05, + "loss": 0.8822, + "step": 15111 + }, + { + "epoch": 2.6905270655270654, + "grad_norm": 0.8832191824913025, + "learning_rate": 4.850045859071125e-05, + "loss": 0.9216, + "step": 15112 + }, + { + "epoch": 2.690705128205128, + "grad_norm": 0.875396728515625, + "learning_rate": 4.8488460561048175e-05, + "loss": 0.998, + "step": 15113 + }, + { + "epoch": 2.690883190883191, + "grad_norm": 0.8847890496253967, + "learning_rate": 4.847646354064668e-05, + "loss": 1.0916, + "step": 15114 + }, + { + "epoch": 2.6910612535612537, + "grad_norm": 0.8235226273536682, + "learning_rate": 4.846446752974187e-05, + "loss": 0.8154, + "step": 15115 + }, + { + "epoch": 2.691239316239316, + "grad_norm": 0.8099366426467896, + "learning_rate": 4.845247252856878e-05, + "loss": 0.9392, + "step": 15116 + }, + { + "epoch": 2.6914173789173788, + "grad_norm": 0.8525599837303162, + "learning_rate": 4.84404785373624e-05, + "loss": 0.6619, + "step": 15117 + }, + { + "epoch": 2.6915954415954415, + "grad_norm": 1.0223274230957031, + "learning_rate": 4.842848555635775e-05, + "loss": 0.9479, + "step": 15118 + }, + { + "epoch": 2.6917735042735043, + "grad_norm": 0.7834655046463013, + "learning_rate": 4.841649358578978e-05, + "loss": 0.6962, + "step": 15119 + }, + { + "epoch": 2.691951566951567, + "grad_norm": 0.787391185760498, + "learning_rate": 4.8404502625893474e-05, + "loss": 0.8598, + "step": 15120 + }, + { + "epoch": 2.6921296296296298, + "grad_norm": 0.907228410243988, + "learning_rate": 4.839251267690371e-05, + "loss": 0.9913, + "step": 15121 + }, + { + "epoch": 
2.6923076923076925, + "grad_norm": 0.8313533663749695, + "learning_rate": 4.838052373905554e-05, + "loss": 0.9542, + "step": 15122 + }, + { + "epoch": 2.692485754985755, + "grad_norm": 0.8444675207138062, + "learning_rate": 4.83685358125837e-05, + "loss": 0.7437, + "step": 15123 + }, + { + "epoch": 2.6926638176638176, + "grad_norm": 0.8656189441680908, + "learning_rate": 4.835654889772319e-05, + "loss": 1.104, + "step": 15124 + }, + { + "epoch": 2.6928418803418803, + "grad_norm": 0.9181584715843201, + "learning_rate": 4.8344562994708805e-05, + "loss": 0.8533, + "step": 15125 + }, + { + "epoch": 2.693019943019943, + "grad_norm": 0.5977702140808105, + "learning_rate": 4.833257810377542e-05, + "loss": 0.495, + "step": 15126 + }, + { + "epoch": 2.693198005698006, + "grad_norm": 0.8839932084083557, + "learning_rate": 4.8320594225157834e-05, + "loss": 0.8026, + "step": 15127 + }, + { + "epoch": 2.693376068376068, + "grad_norm": 0.876559853553772, + "learning_rate": 4.8308611359090846e-05, + "loss": 0.893, + "step": 15128 + }, + { + "epoch": 2.693554131054131, + "grad_norm": 0.7847880721092224, + "learning_rate": 4.829662950580924e-05, + "loss": 0.7794, + "step": 15129 + }, + { + "epoch": 2.6937321937321936, + "grad_norm": 0.8713442087173462, + "learning_rate": 4.828464866554778e-05, + "loss": 1.0394, + "step": 15130 + }, + { + "epoch": 2.6939102564102564, + "grad_norm": 0.9720988869667053, + "learning_rate": 4.827266883854116e-05, + "loss": 0.7844, + "step": 15131 + }, + { + "epoch": 2.694088319088319, + "grad_norm": 0.8163195252418518, + "learning_rate": 4.82606900250242e-05, + "loss": 0.711, + "step": 15132 + }, + { + "epoch": 2.694266381766382, + "grad_norm": 0.7119855880737305, + "learning_rate": 4.8248712225231486e-05, + "loss": 0.6224, + "step": 15133 + }, + { + "epoch": 2.6944444444444446, + "grad_norm": 0.8176950812339783, + "learning_rate": 4.823673543939777e-05, + "loss": 0.8695, + "step": 15134 + }, + { + "epoch": 2.6946225071225074, + "grad_norm": 
0.8138632774353027, + "learning_rate": 4.822475966775771e-05, + "loss": 0.7331, + "step": 15135 + }, + { + "epoch": 2.6948005698005697, + "grad_norm": 0.9323116540908813, + "learning_rate": 4.821278491054589e-05, + "loss": 0.8275, + "step": 15136 + }, + { + "epoch": 2.6949786324786325, + "grad_norm": 0.7593950033187866, + "learning_rate": 4.820081116799704e-05, + "loss": 0.9571, + "step": 15137 + }, + { + "epoch": 2.695156695156695, + "grad_norm": 0.9058876037597656, + "learning_rate": 4.818883844034563e-05, + "loss": 0.7676, + "step": 15138 + }, + { + "epoch": 2.695334757834758, + "grad_norm": 1.0943962335586548, + "learning_rate": 4.8176866727826365e-05, + "loss": 0.7542, + "step": 15139 + }, + { + "epoch": 2.6955128205128203, + "grad_norm": 0.9133912324905396, + "learning_rate": 4.8164896030673664e-05, + "loss": 0.8419, + "step": 15140 + }, + { + "epoch": 2.695690883190883, + "grad_norm": 0.8556821942329407, + "learning_rate": 4.8152926349122195e-05, + "loss": 0.8234, + "step": 15141 + }, + { + "epoch": 2.6958689458689458, + "grad_norm": 1.0329471826553345, + "learning_rate": 4.814095768340643e-05, + "loss": 0.8181, + "step": 15142 + }, + { + "epoch": 2.6960470085470085, + "grad_norm": 0.89934903383255, + "learning_rate": 4.812899003376087e-05, + "loss": 0.8392, + "step": 15143 + }, + { + "epoch": 2.6962250712250713, + "grad_norm": 0.7836576104164124, + "learning_rate": 4.811702340042e-05, + "loss": 0.9491, + "step": 15144 + }, + { + "epoch": 2.696403133903134, + "grad_norm": 0.9841184020042419, + "learning_rate": 4.810505778361828e-05, + "loss": 1.0763, + "step": 15145 + }, + { + "epoch": 2.6965811965811968, + "grad_norm": 1.0479893684387207, + "learning_rate": 4.80930931835901e-05, + "loss": 1.054, + "step": 15146 + }, + { + "epoch": 2.6967592592592595, + "grad_norm": 0.895803689956665, + "learning_rate": 4.808112960057002e-05, + "loss": 0.8769, + "step": 15147 + }, + { + "epoch": 2.696937321937322, + "grad_norm": 0.8467312455177307, + "learning_rate": 
4.806916703479227e-05, + "loss": 0.8036, + "step": 15148 + }, + { + "epoch": 2.6971153846153846, + "grad_norm": 0.7371073365211487, + "learning_rate": 4.8057205486491366e-05, + "loss": 0.72, + "step": 15149 + }, + { + "epoch": 2.6972934472934473, + "grad_norm": 0.9631866812705994, + "learning_rate": 4.80452449559016e-05, + "loss": 0.8661, + "step": 15150 + }, + { + "epoch": 2.69747150997151, + "grad_norm": 0.8467531204223633, + "learning_rate": 4.803328544325735e-05, + "loss": 0.9359, + "step": 15151 + }, + { + "epoch": 2.6976495726495724, + "grad_norm": 0.8170605897903442, + "learning_rate": 4.802132694879291e-05, + "loss": 0.9086, + "step": 15152 + }, + { + "epoch": 2.697827635327635, + "grad_norm": 0.8378857970237732, + "learning_rate": 4.800936947274255e-05, + "loss": 0.6255, + "step": 15153 + }, + { + "epoch": 2.698005698005698, + "grad_norm": 0.8074176907539368, + "learning_rate": 4.799741301534067e-05, + "loss": 0.9129, + "step": 15154 + }, + { + "epoch": 2.6981837606837606, + "grad_norm": 0.862147331237793, + "learning_rate": 4.798545757682139e-05, + "loss": 0.8298, + "step": 15155 + }, + { + "epoch": 2.6983618233618234, + "grad_norm": 0.8020915985107422, + "learning_rate": 4.797350315741905e-05, + "loss": 0.8364, + "step": 15156 + }, + { + "epoch": 2.698539886039886, + "grad_norm": 0.7929054498672485, + "learning_rate": 4.7961549757367854e-05, + "loss": 1.0302, + "step": 15157 + }, + { + "epoch": 2.698717948717949, + "grad_norm": 0.8528931140899658, + "learning_rate": 4.7949597376901964e-05, + "loss": 0.7891, + "step": 15158 + }, + { + "epoch": 2.6988960113960117, + "grad_norm": 0.8090588450431824, + "learning_rate": 4.793764601625561e-05, + "loss": 0.7905, + "step": 15159 + }, + { + "epoch": 2.699074074074074, + "grad_norm": 0.8221202492713928, + "learning_rate": 4.7925695675662916e-05, + "loss": 0.8156, + "step": 15160 + }, + { + "epoch": 2.6992521367521367, + "grad_norm": 0.8121498823165894, + "learning_rate": 4.791374635535802e-05, + "loss": 0.865, + 
"step": 15161 + }, + { + "epoch": 2.6994301994301995, + "grad_norm": 0.7626228928565979, + "learning_rate": 4.790179805557513e-05, + "loss": 0.8033, + "step": 15162 + }, + { + "epoch": 2.699608262108262, + "grad_norm": 0.8483169078826904, + "learning_rate": 4.7889850776548205e-05, + "loss": 0.9239, + "step": 15163 + }, + { + "epoch": 2.6997863247863245, + "grad_norm": 0.8302589058876038, + "learning_rate": 4.7877904518511485e-05, + "loss": 0.8445, + "step": 15164 + }, + { + "epoch": 2.6999643874643873, + "grad_norm": 0.9140453338623047, + "learning_rate": 4.786595928169887e-05, + "loss": 1.0492, + "step": 15165 + }, + { + "epoch": 2.70014245014245, + "grad_norm": 0.8046873807907104, + "learning_rate": 4.785401506634453e-05, + "loss": 1.0009, + "step": 15166 + }, + { + "epoch": 2.7003205128205128, + "grad_norm": 0.8879752159118652, + "learning_rate": 4.7842071872682434e-05, + "loss": 0.7788, + "step": 15167 + }, + { + "epoch": 2.7004985754985755, + "grad_norm": 0.8190163969993591, + "learning_rate": 4.783012970094659e-05, + "loss": 0.9063, + "step": 15168 + }, + { + "epoch": 2.7006766381766383, + "grad_norm": 0.9363130331039429, + "learning_rate": 4.781818855137099e-05, + "loss": 0.9723, + "step": 15169 + }, + { + "epoch": 2.700854700854701, + "grad_norm": 0.8428171873092651, + "learning_rate": 4.780624842418958e-05, + "loss": 0.9173, + "step": 15170 + }, + { + "epoch": 2.701032763532764, + "grad_norm": 0.8089821934700012, + "learning_rate": 4.779430931963627e-05, + "loss": 0.8996, + "step": 15171 + }, + { + "epoch": 2.701210826210826, + "grad_norm": 0.8893290758132935, + "learning_rate": 4.77823712379451e-05, + "loss": 0.9483, + "step": 15172 + }, + { + "epoch": 2.701388888888889, + "grad_norm": 0.8589824438095093, + "learning_rate": 4.777043417934981e-05, + "loss": 0.8765, + "step": 15173 + }, + { + "epoch": 2.7015669515669516, + "grad_norm": 0.8665438294410706, + "learning_rate": 4.7758498144084405e-05, + "loss": 0.8546, + "step": 15174 + }, + { + "epoch": 
2.7017450142450143, + "grad_norm": 0.743841826915741, + "learning_rate": 4.774656313238272e-05, + "loss": 0.6866, + "step": 15175 + }, + { + "epoch": 2.7019230769230766, + "grad_norm": 0.9317346811294556, + "learning_rate": 4.7734629144478574e-05, + "loss": 0.8004, + "step": 15176 + }, + { + "epoch": 2.7021011396011394, + "grad_norm": 0.8244655132293701, + "learning_rate": 4.77226961806058e-05, + "loss": 0.9302, + "step": 15177 + }, + { + "epoch": 2.702279202279202, + "grad_norm": 1.0759600400924683, + "learning_rate": 4.771076424099815e-05, + "loss": 0.9073, + "step": 15178 + }, + { + "epoch": 2.702457264957265, + "grad_norm": 0.8852303624153137, + "learning_rate": 4.769883332588954e-05, + "loss": 0.8084, + "step": 15179 + }, + { + "epoch": 2.7026353276353277, + "grad_norm": 0.8642051815986633, + "learning_rate": 4.7686903435513564e-05, + "loss": 1.0018, + "step": 15180 + }, + { + "epoch": 2.7028133903133904, + "grad_norm": 0.9442928433418274, + "learning_rate": 4.767497457010408e-05, + "loss": 0.8099, + "step": 15181 + }, + { + "epoch": 2.702991452991453, + "grad_norm": 0.8357751965522766, + "learning_rate": 4.7663046729894776e-05, + "loss": 0.8594, + "step": 15182 + }, + { + "epoch": 2.703169515669516, + "grad_norm": 1.0791765451431274, + "learning_rate": 4.765111991511936e-05, + "loss": 1.1203, + "step": 15183 + }, + { + "epoch": 2.703347578347578, + "grad_norm": 0.7855654954910278, + "learning_rate": 4.7639194126011485e-05, + "loss": 0.7218, + "step": 15184 + }, + { + "epoch": 2.703525641025641, + "grad_norm": 0.8058420419692993, + "learning_rate": 4.762726936280485e-05, + "loss": 0.7885, + "step": 15185 + }, + { + "epoch": 2.7037037037037037, + "grad_norm": 0.7701787352561951, + "learning_rate": 4.761534562573302e-05, + "loss": 0.6378, + "step": 15186 + }, + { + "epoch": 2.7038817663817665, + "grad_norm": 0.9011744856834412, + "learning_rate": 4.760342291502976e-05, + "loss": 0.9106, + "step": 15187 + }, + { + "epoch": 2.7040598290598292, + "grad_norm": 
0.7268012762069702, + "learning_rate": 4.759150123092851e-05, + "loss": 0.6303, + "step": 15188 + }, + { + "epoch": 2.7042378917378915, + "grad_norm": 0.8369283676147461, + "learning_rate": 4.7579580573663e-05, + "loss": 0.8013, + "step": 15189 + }, + { + "epoch": 2.7044159544159543, + "grad_norm": 0.9511098861694336, + "learning_rate": 4.756766094346663e-05, + "loss": 1.0211, + "step": 15190 + }, + { + "epoch": 2.704594017094017, + "grad_norm": 0.8408896327018738, + "learning_rate": 4.7555742340573074e-05, + "loss": 1.1018, + "step": 15191 + }, + { + "epoch": 2.70477207977208, + "grad_norm": 0.9166504740715027, + "learning_rate": 4.7543824765215795e-05, + "loss": 0.9222, + "step": 15192 + }, + { + "epoch": 2.7049501424501425, + "grad_norm": 0.8373738527297974, + "learning_rate": 4.753190821762826e-05, + "loss": 0.9735, + "step": 15193 + }, + { + "epoch": 2.7051282051282053, + "grad_norm": 0.8610605597496033, + "learning_rate": 4.751999269804408e-05, + "loss": 0.7942, + "step": 15194 + }, + { + "epoch": 2.705306267806268, + "grad_norm": 0.8778019547462463, + "learning_rate": 4.750807820669654e-05, + "loss": 0.8055, + "step": 15195 + }, + { + "epoch": 2.7054843304843303, + "grad_norm": 0.9997664093971252, + "learning_rate": 4.749616474381921e-05, + "loss": 0.8461, + "step": 15196 + }, + { + "epoch": 2.705662393162393, + "grad_norm": 0.8362101316452026, + "learning_rate": 4.748425230964545e-05, + "loss": 1.0008, + "step": 15197 + }, + { + "epoch": 2.705840455840456, + "grad_norm": 0.870482861995697, + "learning_rate": 4.747234090440869e-05, + "loss": 0.9547, + "step": 15198 + }, + { + "epoch": 2.7060185185185186, + "grad_norm": 0.867431104183197, + "learning_rate": 4.746043052834228e-05, + "loss": 0.8533, + "step": 15199 + }, + { + "epoch": 2.7061965811965814, + "grad_norm": 0.842071533203125, + "learning_rate": 4.7448521181679604e-05, + "loss": 0.8919, + "step": 15200 + }, + { + "epoch": 2.7063746438746437, + "grad_norm": 0.9487791657447815, + "learning_rate": 
4.743661286465398e-05, + "loss": 0.8072, + "step": 15201 + }, + { + "epoch": 2.7065527065527064, + "grad_norm": 0.8469042181968689, + "learning_rate": 4.742470557749874e-05, + "loss": 0.8792, + "step": 15202 + }, + { + "epoch": 2.706730769230769, + "grad_norm": 0.86415696144104, + "learning_rate": 4.7412799320447145e-05, + "loss": 0.9725, + "step": 15203 + }, + { + "epoch": 2.706908831908832, + "grad_norm": 0.9035004377365112, + "learning_rate": 4.740089409373257e-05, + "loss": 0.9915, + "step": 15204 + }, + { + "epoch": 2.7070868945868947, + "grad_norm": 0.8122807741165161, + "learning_rate": 4.7388989897588156e-05, + "loss": 0.946, + "step": 15205 + }, + { + "epoch": 2.7072649572649574, + "grad_norm": 0.9801422357559204, + "learning_rate": 4.737708673224721e-05, + "loss": 0.9357, + "step": 15206 + }, + { + "epoch": 2.70744301994302, + "grad_norm": 1.0265265703201294, + "learning_rate": 4.736518459794295e-05, + "loss": 0.7982, + "step": 15207 + }, + { + "epoch": 2.7076210826210825, + "grad_norm": 0.828814685344696, + "learning_rate": 4.735328349490855e-05, + "loss": 0.6864, + "step": 15208 + }, + { + "epoch": 2.7077991452991452, + "grad_norm": 0.7948212623596191, + "learning_rate": 4.7341383423377195e-05, + "loss": 0.8661, + "step": 15209 + }, + { + "epoch": 2.707977207977208, + "grad_norm": 0.8372616767883301, + "learning_rate": 4.7329484383582046e-05, + "loss": 0.8818, + "step": 15210 + }, + { + "epoch": 2.7081552706552707, + "grad_norm": 0.8000285029411316, + "learning_rate": 4.731758637575624e-05, + "loss": 0.8006, + "step": 15211 + }, + { + "epoch": 2.7083333333333335, + "grad_norm": 0.7860875725746155, + "learning_rate": 4.730568940013289e-05, + "loss": 0.926, + "step": 15212 + }, + { + "epoch": 2.708511396011396, + "grad_norm": 0.9157412052154541, + "learning_rate": 4.7293793456945054e-05, + "loss": 0.7042, + "step": 15213 + }, + { + "epoch": 2.7086894586894585, + "grad_norm": 0.8802906274795532, + "learning_rate": 4.728189854642589e-05, + "loss": 0.8639, + 
"step": 15214 + }, + { + "epoch": 2.7088675213675213, + "grad_norm": 0.8047248721122742, + "learning_rate": 4.7270004668808397e-05, + "loss": 0.7603, + "step": 15215 + }, + { + "epoch": 2.709045584045584, + "grad_norm": 0.9848080277442932, + "learning_rate": 4.725811182432564e-05, + "loss": 0.8213, + "step": 15216 + }, + { + "epoch": 2.709223646723647, + "grad_norm": 0.8568090200424194, + "learning_rate": 4.724622001321062e-05, + "loss": 0.7663, + "step": 15217 + }, + { + "epoch": 2.7094017094017095, + "grad_norm": 0.7926214337348938, + "learning_rate": 4.7234329235696284e-05, + "loss": 0.874, + "step": 15218 + }, + { + "epoch": 2.7095797720797723, + "grad_norm": 0.8389978408813477, + "learning_rate": 4.7222439492015734e-05, + "loss": 0.623, + "step": 15219 + }, + { + "epoch": 2.7097578347578346, + "grad_norm": 0.8635174036026001, + "learning_rate": 4.7210550782401773e-05, + "loss": 0.822, + "step": 15220 + }, + { + "epoch": 2.7099358974358974, + "grad_norm": 0.8381666541099548, + "learning_rate": 4.7198663107087446e-05, + "loss": 1.0864, + "step": 15221 + }, + { + "epoch": 2.71011396011396, + "grad_norm": 1.0722376108169556, + "learning_rate": 4.718677646630564e-05, + "loss": 0.8527, + "step": 15222 + }, + { + "epoch": 2.710292022792023, + "grad_norm": 0.9505516290664673, + "learning_rate": 4.7174890860289224e-05, + "loss": 1.0645, + "step": 15223 + }, + { + "epoch": 2.7104700854700856, + "grad_norm": 0.7757406234741211, + "learning_rate": 4.7163006289271095e-05, + "loss": 0.5924, + "step": 15224 + }, + { + "epoch": 2.710648148148148, + "grad_norm": 0.816387414932251, + "learning_rate": 4.71511227534841e-05, + "loss": 0.8337, + "step": 15225 + }, + { + "epoch": 2.7108262108262107, + "grad_norm": 0.7817156910896301, + "learning_rate": 4.7139240253161065e-05, + "loss": 0.8315, + "step": 15226 + }, + { + "epoch": 2.7110042735042734, + "grad_norm": 0.9753041863441467, + "learning_rate": 4.7127358788534816e-05, + "loss": 0.851, + "step": 15227 + }, + { + "epoch": 
2.711182336182336, + "grad_norm": 0.7564638257026672, + "learning_rate": 4.7115478359838095e-05, + "loss": 0.8132, + "step": 15228 + }, + { + "epoch": 2.711360398860399, + "grad_norm": 0.8709259629249573, + "learning_rate": 4.710359896730379e-05, + "loss": 1.0277, + "step": 15229 + }, + { + "epoch": 2.7115384615384617, + "grad_norm": 0.9849836230278015, + "learning_rate": 4.7091720611164504e-05, + "loss": 0.9778, + "step": 15230 + }, + { + "epoch": 2.7117165242165244, + "grad_norm": 0.8330100178718567, + "learning_rate": 4.707984329165309e-05, + "loss": 0.7138, + "step": 15231 + }, + { + "epoch": 2.7118945868945867, + "grad_norm": 1.005644679069519, + "learning_rate": 4.706796700900221e-05, + "loss": 1.0089, + "step": 15232 + }, + { + "epoch": 2.7120726495726495, + "grad_norm": 0.8292263746261597, + "learning_rate": 4.705609176344452e-05, + "loss": 0.8323, + "step": 15233 + }, + { + "epoch": 2.7122507122507122, + "grad_norm": 0.860713005065918, + "learning_rate": 4.704421755521281e-05, + "loss": 0.821, + "step": 15234 + }, + { + "epoch": 2.712428774928775, + "grad_norm": 0.8316803574562073, + "learning_rate": 4.703234438453958e-05, + "loss": 0.9181, + "step": 15235 + }, + { + "epoch": 2.7126068376068377, + "grad_norm": 0.7368014454841614, + "learning_rate": 4.70204722516576e-05, + "loss": 0.8206, + "step": 15236 + }, + { + "epoch": 2.7127849002849, + "grad_norm": 1.0202926397323608, + "learning_rate": 4.7008601156799336e-05, + "loss": 0.8101, + "step": 15237 + }, + { + "epoch": 2.712962962962963, + "grad_norm": 0.8069320917129517, + "learning_rate": 4.69967311001975e-05, + "loss": 0.9042, + "step": 15238 + }, + { + "epoch": 2.7131410256410255, + "grad_norm": 0.8426684737205505, + "learning_rate": 4.69848620820846e-05, + "loss": 0.7318, + "step": 15239 + }, + { + "epoch": 2.7133190883190883, + "grad_norm": 0.8863842487335205, + "learning_rate": 4.69729941026932e-05, + "loss": 1.0172, + "step": 15240 + }, + { + "epoch": 2.713497150997151, + "grad_norm": 
0.7984182834625244, + "learning_rate": 4.696112716225582e-05, + "loss": 0.8298, + "step": 15241 + }, + { + "epoch": 2.713675213675214, + "grad_norm": 0.8328375220298767, + "learning_rate": 4.6949261261005e-05, + "loss": 0.7663, + "step": 15242 + }, + { + "epoch": 2.7138532763532766, + "grad_norm": 0.9197641015052795, + "learning_rate": 4.693739639917314e-05, + "loss": 0.8951, + "step": 15243 + }, + { + "epoch": 2.7140313390313393, + "grad_norm": 0.7421545386314392, + "learning_rate": 4.692553257699286e-05, + "loss": 0.7235, + "step": 15244 + }, + { + "epoch": 2.7142094017094016, + "grad_norm": 0.8033188581466675, + "learning_rate": 4.691366979469642e-05, + "loss": 0.9693, + "step": 15245 + }, + { + "epoch": 2.7143874643874644, + "grad_norm": 0.8765473365783691, + "learning_rate": 4.6901808052516436e-05, + "loss": 0.8851, + "step": 15246 + }, + { + "epoch": 2.714565527065527, + "grad_norm": 0.8351873755455017, + "learning_rate": 4.688994735068515e-05, + "loss": 1.0156, + "step": 15247 + }, + { + "epoch": 2.71474358974359, + "grad_norm": 0.8569470643997192, + "learning_rate": 4.6878087689435046e-05, + "loss": 0.7149, + "step": 15248 + }, + { + "epoch": 2.714921652421652, + "grad_norm": 0.8334367871284485, + "learning_rate": 4.686622906899847e-05, + "loss": 0.9218, + "step": 15249 + }, + { + "epoch": 2.715099715099715, + "grad_norm": 0.8889651298522949, + "learning_rate": 4.685437148960775e-05, + "loss": 0.8987, + "step": 15250 + }, + { + "epoch": 2.7152777777777777, + "grad_norm": 0.9381657838821411, + "learning_rate": 4.684251495149522e-05, + "loss": 0.7798, + "step": 15251 + }, + { + "epoch": 2.7154558404558404, + "grad_norm": 0.7698730826377869, + "learning_rate": 4.68306594548932e-05, + "loss": 0.8248, + "step": 15252 + }, + { + "epoch": 2.715633903133903, + "grad_norm": 0.8980026245117188, + "learning_rate": 4.681880500003391e-05, + "loss": 1.0156, + "step": 15253 + }, + { + "epoch": 2.715811965811966, + "grad_norm": 0.7872338891029358, + "learning_rate": 
4.6806951587149694e-05, + "loss": 0.6389, + "step": 15254 + }, + { + "epoch": 2.7159900284900287, + "grad_norm": 0.8155974745750427, + "learning_rate": 4.6795099216472774e-05, + "loss": 0.9081, + "step": 15255 + }, + { + "epoch": 2.7161680911680914, + "grad_norm": 0.7678217887878418, + "learning_rate": 4.678324788823535e-05, + "loss": 0.6193, + "step": 15256 + }, + { + "epoch": 2.7163461538461537, + "grad_norm": 0.75429767370224, + "learning_rate": 4.6771397602669643e-05, + "loss": 0.9384, + "step": 15257 + }, + { + "epoch": 2.7165242165242165, + "grad_norm": 0.8755250573158264, + "learning_rate": 4.675954836000779e-05, + "loss": 0.8563, + "step": 15258 + }, + { + "epoch": 2.7167022792022792, + "grad_norm": 0.8393009305000305, + "learning_rate": 4.6747700160482053e-05, + "loss": 0.9407, + "step": 15259 + }, + { + "epoch": 2.716880341880342, + "grad_norm": 0.8478221297264099, + "learning_rate": 4.673585300432445e-05, + "loss": 0.7562, + "step": 15260 + }, + { + "epoch": 2.7170584045584043, + "grad_norm": 0.7497259974479675, + "learning_rate": 4.672400689176722e-05, + "loss": 0.8406, + "step": 15261 + }, + { + "epoch": 2.717236467236467, + "grad_norm": 0.9695250391960144, + "learning_rate": 4.671216182304234e-05, + "loss": 0.9505, + "step": 15262 + }, + { + "epoch": 2.71741452991453, + "grad_norm": 0.9375512599945068, + "learning_rate": 4.6700317798382e-05, + "loss": 0.9024, + "step": 15263 + }, + { + "epoch": 2.7175925925925926, + "grad_norm": 0.7930737137794495, + "learning_rate": 4.6688474818018194e-05, + "loss": 0.8416, + "step": 15264 + }, + { + "epoch": 2.7177706552706553, + "grad_norm": 0.9707022309303284, + "learning_rate": 4.667663288218298e-05, + "loss": 1.1172, + "step": 15265 + }, + { + "epoch": 2.717948717948718, + "grad_norm": 0.7616816759109497, + "learning_rate": 4.666479199110838e-05, + "loss": 0.8557, + "step": 15266 + }, + { + "epoch": 2.718126780626781, + "grad_norm": 0.7836055159568787, + "learning_rate": 4.66529521450264e-05, + "loss": 0.7299, + 
"step": 15267 + }, + { + "epoch": 2.7183048433048436, + "grad_norm": 0.8313519954681396, + "learning_rate": 4.664111334416894e-05, + "loss": 0.8268, + "step": 15268 + }, + { + "epoch": 2.718482905982906, + "grad_norm": 0.9130576252937317, + "learning_rate": 4.662927558876812e-05, + "loss": 0.8913, + "step": 15269 + }, + { + "epoch": 2.7186609686609686, + "grad_norm": 0.8552213907241821, + "learning_rate": 4.661743887905569e-05, + "loss": 0.9396, + "step": 15270 + }, + { + "epoch": 2.7188390313390314, + "grad_norm": 0.7953839898109436, + "learning_rate": 4.660560321526373e-05, + "loss": 0.74, + "step": 15271 + }, + { + "epoch": 2.719017094017094, + "grad_norm": 0.9148657321929932, + "learning_rate": 4.6593768597623974e-05, + "loss": 0.7821, + "step": 15272 + }, + { + "epoch": 2.7191951566951564, + "grad_norm": 0.8587655425071716, + "learning_rate": 4.658193502636843e-05, + "loss": 0.9495, + "step": 15273 + }, + { + "epoch": 2.719373219373219, + "grad_norm": 0.8915669322013855, + "learning_rate": 4.6570102501728896e-05, + "loss": 0.8612, + "step": 15274 + }, + { + "epoch": 2.719551282051282, + "grad_norm": 0.957039475440979, + "learning_rate": 4.655827102393717e-05, + "loss": 0.8506, + "step": 15275 + }, + { + "epoch": 2.7197293447293447, + "grad_norm": 0.7784267067909241, + "learning_rate": 4.654644059322519e-05, + "loss": 0.6864, + "step": 15276 + }, + { + "epoch": 2.7199074074074074, + "grad_norm": 0.9508241415023804, + "learning_rate": 4.65346112098246e-05, + "loss": 1.0097, + "step": 15277 + }, + { + "epoch": 2.72008547008547, + "grad_norm": 0.8316742777824402, + "learning_rate": 4.6522782873967265e-05, + "loss": 0.7444, + "step": 15278 + }, + { + "epoch": 2.720263532763533, + "grad_norm": 0.8781944513320923, + "learning_rate": 4.651095558588491e-05, + "loss": 0.8725, + "step": 15279 + }, + { + "epoch": 2.7204415954415957, + "grad_norm": 0.9407825469970703, + "learning_rate": 4.649912934580927e-05, + "loss": 0.9788, + "step": 15280 + }, + { + "epoch": 
2.720619658119658, + "grad_norm": 0.9863289594650269, + "learning_rate": 4.6487304153972045e-05, + "loss": 0.7777, + "step": 15281 + }, + { + "epoch": 2.7207977207977208, + "grad_norm": 0.7580869793891907, + "learning_rate": 4.6475480010604945e-05, + "loss": 0.5835, + "step": 15282 + }, + { + "epoch": 2.7209757834757835, + "grad_norm": 0.7973836660385132, + "learning_rate": 4.646365691593961e-05, + "loss": 0.7633, + "step": 15283 + }, + { + "epoch": 2.7211538461538463, + "grad_norm": 0.8107978701591492, + "learning_rate": 4.645183487020772e-05, + "loss": 0.7149, + "step": 15284 + }, + { + "epoch": 2.7213319088319086, + "grad_norm": 0.8944578170776367, + "learning_rate": 4.644001387364084e-05, + "loss": 0.9227, + "step": 15285 + }, + { + "epoch": 2.7215099715099713, + "grad_norm": 0.7592978477478027, + "learning_rate": 4.642819392647071e-05, + "loss": 0.464, + "step": 15286 + }, + { + "epoch": 2.721688034188034, + "grad_norm": 0.8484344482421875, + "learning_rate": 4.641637502892876e-05, + "loss": 1.0439, + "step": 15287 + }, + { + "epoch": 2.721866096866097, + "grad_norm": 0.8766823410987854, + "learning_rate": 4.640455718124667e-05, + "loss": 0.7561, + "step": 15288 + }, + { + "epoch": 2.7220441595441596, + "grad_norm": 0.8039024472236633, + "learning_rate": 4.639274038365594e-05, + "loss": 0.6774, + "step": 15289 + }, + { + "epoch": 2.7222222222222223, + "grad_norm": 0.8199611902236938, + "learning_rate": 4.63809246363881e-05, + "loss": 0.7721, + "step": 15290 + }, + { + "epoch": 2.722400284900285, + "grad_norm": 0.8209745287895203, + "learning_rate": 4.636910993967467e-05, + "loss": 0.7017, + "step": 15291 + }, + { + "epoch": 2.722578347578348, + "grad_norm": 0.8822476267814636, + "learning_rate": 4.6357296293747075e-05, + "loss": 0.8742, + "step": 15292 + }, + { + "epoch": 2.72275641025641, + "grad_norm": 0.8172603249549866, + "learning_rate": 4.634548369883687e-05, + "loss": 0.8165, + "step": 15293 + }, + { + "epoch": 2.722934472934473, + "grad_norm": 
0.8601866960525513, + "learning_rate": 4.633367215517546e-05, + "loss": 0.7961, + "step": 15294 + }, + { + "epoch": 2.7231125356125356, + "grad_norm": 0.9346174001693726, + "learning_rate": 4.632186166299425e-05, + "loss": 0.9229, + "step": 15295 + }, + { + "epoch": 2.7232905982905984, + "grad_norm": 0.8956635594367981, + "learning_rate": 4.631005222252465e-05, + "loss": 0.7886, + "step": 15296 + }, + { + "epoch": 2.7234686609686607, + "grad_norm": 0.8453384637832642, + "learning_rate": 4.629824383399805e-05, + "loss": 0.8513, + "step": 15297 + }, + { + "epoch": 2.7236467236467234, + "grad_norm": 0.8931429982185364, + "learning_rate": 4.628643649764581e-05, + "loss": 1.0195, + "step": 15298 + }, + { + "epoch": 2.723824786324786, + "grad_norm": 0.7326723337173462, + "learning_rate": 4.6274630213699265e-05, + "loss": 0.7616, + "step": 15299 + }, + { + "epoch": 2.724002849002849, + "grad_norm": 0.8572023510932922, + "learning_rate": 4.6262824982389706e-05, + "loss": 0.8266, + "step": 15300 + }, + { + "epoch": 2.7241809116809117, + "grad_norm": 0.7753783464431763, + "learning_rate": 4.625102080394853e-05, + "loss": 0.6907, + "step": 15301 + }, + { + "epoch": 2.7243589743589745, + "grad_norm": 0.8758052587509155, + "learning_rate": 4.623921767860687e-05, + "loss": 0.6369, + "step": 15302 + }, + { + "epoch": 2.724537037037037, + "grad_norm": 0.8508220314979553, + "learning_rate": 4.6227415606596104e-05, + "loss": 0.933, + "step": 15303 + }, + { + "epoch": 2.7247150997151, + "grad_norm": 0.7440072298049927, + "learning_rate": 4.621561458814743e-05, + "loss": 0.7172, + "step": 15304 + }, + { + "epoch": 2.7248931623931623, + "grad_norm": 0.9081870317459106, + "learning_rate": 4.6203814623492046e-05, + "loss": 0.8964, + "step": 15305 + }, + { + "epoch": 2.725071225071225, + "grad_norm": 0.9127907156944275, + "learning_rate": 4.619201571286117e-05, + "loss": 1.0081, + "step": 15306 + }, + { + "epoch": 2.7252492877492878, + "grad_norm": 0.9508554935455322, + "learning_rate": 
4.618021785648597e-05, + "loss": 0.94, + "step": 15307 + }, + { + "epoch": 2.7254273504273505, + "grad_norm": 0.8726735711097717, + "learning_rate": 4.616842105459761e-05, + "loss": 0.9284, + "step": 15308 + }, + { + "epoch": 2.7256054131054133, + "grad_norm": 0.9266753792762756, + "learning_rate": 4.6156625307427206e-05, + "loss": 0.9069, + "step": 15309 + }, + { + "epoch": 2.7257834757834756, + "grad_norm": 0.952553391456604, + "learning_rate": 4.614483061520584e-05, + "loss": 0.8604, + "step": 15310 + }, + { + "epoch": 2.7259615384615383, + "grad_norm": 0.7702621817588806, + "learning_rate": 4.613303697816471e-05, + "loss": 0.808, + "step": 15311 + }, + { + "epoch": 2.726139601139601, + "grad_norm": 0.8052653670310974, + "learning_rate": 4.612124439653477e-05, + "loss": 0.8696, + "step": 15312 + }, + { + "epoch": 2.726317663817664, + "grad_norm": 0.8808547854423523, + "learning_rate": 4.610945287054714e-05, + "loss": 0.9595, + "step": 15313 + }, + { + "epoch": 2.7264957264957266, + "grad_norm": 1.0233266353607178, + "learning_rate": 4.609766240043284e-05, + "loss": 1.0691, + "step": 15314 + }, + { + "epoch": 2.7266737891737893, + "grad_norm": 0.8129898309707642, + "learning_rate": 4.6085872986422826e-05, + "loss": 0.9269, + "step": 15315 + }, + { + "epoch": 2.726851851851852, + "grad_norm": 1.2745141983032227, + "learning_rate": 4.607408462874823e-05, + "loss": 0.937, + "step": 15316 + }, + { + "epoch": 2.7270299145299144, + "grad_norm": 0.808274507522583, + "learning_rate": 4.606229732763984e-05, + "loss": 0.789, + "step": 15317 + }, + { + "epoch": 2.727207977207977, + "grad_norm": 0.8849375247955322, + "learning_rate": 4.605051108332875e-05, + "loss": 0.8993, + "step": 15318 + }, + { + "epoch": 2.72738603988604, + "grad_norm": 0.8251593112945557, + "learning_rate": 4.603872589604576e-05, + "loss": 0.9057, + "step": 15319 + }, + { + "epoch": 2.7275641025641026, + "grad_norm": 0.8271582126617432, + "learning_rate": 4.602694176602188e-05, + "loss": 0.8378, + 
"step": 15320 + }, + { + "epoch": 2.7277421652421654, + "grad_norm": 0.8139070868492126, + "learning_rate": 4.6015158693487956e-05, + "loss": 0.8014, + "step": 15321 + }, + { + "epoch": 2.7279202279202277, + "grad_norm": 0.8873880505561829, + "learning_rate": 4.600337667867486e-05, + "loss": 0.8707, + "step": 15322 + }, + { + "epoch": 2.7280982905982905, + "grad_norm": 0.8616414666175842, + "learning_rate": 4.599159572181342e-05, + "loss": 0.8538, + "step": 15323 + }, + { + "epoch": 2.728276353276353, + "grad_norm": 0.8280995488166809, + "learning_rate": 4.5979815823134466e-05, + "loss": 0.8444, + "step": 15324 + }, + { + "epoch": 2.728454415954416, + "grad_norm": 0.8684375882148743, + "learning_rate": 4.596803698286878e-05, + "loss": 0.7562, + "step": 15325 + }, + { + "epoch": 2.7286324786324787, + "grad_norm": 0.8113002181053162, + "learning_rate": 4.595625920124723e-05, + "loss": 0.8331, + "step": 15326 + }, + { + "epoch": 2.7288105413105415, + "grad_norm": 0.8675588965415955, + "learning_rate": 4.5944482478500436e-05, + "loss": 1.1016, + "step": 15327 + }, + { + "epoch": 2.728988603988604, + "grad_norm": 0.9015034437179565, + "learning_rate": 4.593270681485927e-05, + "loss": 0.9002, + "step": 15328 + }, + { + "epoch": 2.7291666666666665, + "grad_norm": 0.9215324521064758, + "learning_rate": 4.592093221055439e-05, + "loss": 0.9491, + "step": 15329 + }, + { + "epoch": 2.7293447293447293, + "grad_norm": 0.8969921469688416, + "learning_rate": 4.590915866581651e-05, + "loss": 0.8791, + "step": 15330 + }, + { + "epoch": 2.729522792022792, + "grad_norm": 0.9012344479560852, + "learning_rate": 4.5897386180876304e-05, + "loss": 0.9114, + "step": 15331 + }, + { + "epoch": 2.7297008547008548, + "grad_norm": 1.0024429559707642, + "learning_rate": 4.588561475596438e-05, + "loss": 0.8782, + "step": 15332 + }, + { + "epoch": 2.7298789173789175, + "grad_norm": 0.9079484343528748, + "learning_rate": 4.5873844391311496e-05, + "loss": 1.0012, + "step": 15333 + }, + { + "epoch": 
2.73005698005698, + "grad_norm": 0.709800660610199, + "learning_rate": 4.5862075087148124e-05, + "loss": 0.7473, + "step": 15334 + }, + { + "epoch": 2.7302350427350426, + "grad_norm": 0.9776272773742676, + "learning_rate": 4.585030684370497e-05, + "loss": 1.1927, + "step": 15335 + }, + { + "epoch": 2.7304131054131053, + "grad_norm": 0.8624512553215027, + "learning_rate": 4.5838539661212565e-05, + "loss": 0.6661, + "step": 15336 + }, + { + "epoch": 2.730591168091168, + "grad_norm": 0.7901379466056824, + "learning_rate": 4.5826773539901456e-05, + "loss": 0.8244, + "step": 15337 + }, + { + "epoch": 2.730769230769231, + "grad_norm": 0.8546316027641296, + "learning_rate": 4.58150084800022e-05, + "loss": 0.8232, + "step": 15338 + }, + { + "epoch": 2.7309472934472936, + "grad_norm": 1.0038648843765259, + "learning_rate": 4.5803244481745275e-05, + "loss": 0.8363, + "step": 15339 + }, + { + "epoch": 2.7311253561253563, + "grad_norm": 0.7757763266563416, + "learning_rate": 4.579148154536117e-05, + "loss": 0.6935, + "step": 15340 + }, + { + "epoch": 2.7313034188034186, + "grad_norm": 0.8671833276748657, + "learning_rate": 4.5779719671080436e-05, + "loss": 0.8453, + "step": 15341 + }, + { + "epoch": 2.7314814814814814, + "grad_norm": 0.8507152795791626, + "learning_rate": 4.57679588591334e-05, + "loss": 0.8142, + "step": 15342 + }, + { + "epoch": 2.731659544159544, + "grad_norm": 0.8205499053001404, + "learning_rate": 4.575619910975062e-05, + "loss": 0.8442, + "step": 15343 + }, + { + "epoch": 2.731837606837607, + "grad_norm": 0.8809645771980286, + "learning_rate": 4.574444042316236e-05, + "loss": 1.024, + "step": 15344 + }, + { + "epoch": 2.7320156695156697, + "grad_norm": 0.825038492679596, + "learning_rate": 4.573268279959912e-05, + "loss": 0.9089, + "step": 15345 + }, + { + "epoch": 2.732193732193732, + "grad_norm": 0.7646815776824951, + "learning_rate": 4.572092623929124e-05, + "loss": 0.9239, + "step": 15346 + }, + { + "epoch": 2.7323717948717947, + "grad_norm": 
0.8372252583503723, + "learning_rate": 4.570917074246905e-05, + "loss": 0.8687, + "step": 15347 + }, + { + "epoch": 2.7325498575498575, + "grad_norm": 0.9108861088752747, + "learning_rate": 4.5697416309362885e-05, + "loss": 0.9451, + "step": 15348 + }, + { + "epoch": 2.73272792022792, + "grad_norm": 0.8537670969963074, + "learning_rate": 4.568566294020303e-05, + "loss": 0.7437, + "step": 15349 + }, + { + "epoch": 2.732905982905983, + "grad_norm": 0.8660921454429626, + "learning_rate": 4.5673910635219766e-05, + "loss": 1.0617, + "step": 15350 + }, + { + "epoch": 2.7330840455840457, + "grad_norm": 0.8499423265457153, + "learning_rate": 4.5662159394643424e-05, + "loss": 0.9355, + "step": 15351 + }, + { + "epoch": 2.7332621082621085, + "grad_norm": 0.7776598930358887, + "learning_rate": 4.565040921870413e-05, + "loss": 0.8132, + "step": 15352 + }, + { + "epoch": 2.7334401709401708, + "grad_norm": 0.9197307229042053, + "learning_rate": 4.563866010763219e-05, + "loss": 0.8648, + "step": 15353 + }, + { + "epoch": 2.7336182336182335, + "grad_norm": 0.8543015122413635, + "learning_rate": 4.5626912061657786e-05, + "loss": 0.8453, + "step": 15354 + }, + { + "epoch": 2.7337962962962963, + "grad_norm": 0.9448479413986206, + "learning_rate": 4.561516508101109e-05, + "loss": 0.9515, + "step": 15355 + }, + { + "epoch": 2.733974358974359, + "grad_norm": 0.7448729276657104, + "learning_rate": 4.5603419165922265e-05, + "loss": 0.64, + "step": 15356 + }, + { + "epoch": 2.734152421652422, + "grad_norm": 0.8229237198829651, + "learning_rate": 4.5591674316621405e-05, + "loss": 0.7936, + "step": 15357 + }, + { + "epoch": 2.734330484330484, + "grad_norm": 0.8518769145011902, + "learning_rate": 4.557993053333873e-05, + "loss": 1.1976, + "step": 15358 + }, + { + "epoch": 2.734508547008547, + "grad_norm": 0.8680224418640137, + "learning_rate": 4.55681878163042e-05, + "loss": 0.8223, + "step": 15359 + }, + { + "epoch": 2.7346866096866096, + "grad_norm": 0.8199124336242676, + "learning_rate": 
4.555644616574799e-05, + "loss": 0.634, + "step": 15360 + }, + { + "epoch": 2.7348646723646723, + "grad_norm": 0.8262977004051208, + "learning_rate": 4.554470558190013e-05, + "loss": 0.6373, + "step": 15361 + }, + { + "epoch": 2.735042735042735, + "grad_norm": 0.8114070296287537, + "learning_rate": 4.553296606499062e-05, + "loss": 0.6624, + "step": 15362 + }, + { + "epoch": 2.735220797720798, + "grad_norm": 0.9944671392440796, + "learning_rate": 4.552122761524952e-05, + "loss": 0.8246, + "step": 15363 + }, + { + "epoch": 2.7353988603988606, + "grad_norm": 0.8174465298652649, + "learning_rate": 4.550949023290678e-05, + "loss": 0.8431, + "step": 15364 + }, + { + "epoch": 2.7355769230769234, + "grad_norm": 0.8303970694541931, + "learning_rate": 4.5497753918192356e-05, + "loss": 0.417, + "step": 15365 + }, + { + "epoch": 2.7357549857549857, + "grad_norm": 0.8428391218185425, + "learning_rate": 4.548601867133629e-05, + "loss": 0.751, + "step": 15366 + }, + { + "epoch": 2.7359330484330484, + "grad_norm": 0.8769099116325378, + "learning_rate": 4.5474284492568384e-05, + "loss": 0.8984, + "step": 15367 + }, + { + "epoch": 2.736111111111111, + "grad_norm": 0.8389245867729187, + "learning_rate": 4.546255138211867e-05, + "loss": 0.8503, + "step": 15368 + }, + { + "epoch": 2.736289173789174, + "grad_norm": 0.8404824137687683, + "learning_rate": 4.5450819340216896e-05, + "loss": 0.836, + "step": 15369 + }, + { + "epoch": 2.736467236467236, + "grad_norm": 1.0007327795028687, + "learning_rate": 4.543908836709304e-05, + "loss": 0.9746, + "step": 15370 + }, + { + "epoch": 2.736645299145299, + "grad_norm": 0.7373863458633423, + "learning_rate": 4.542735846297691e-05, + "loss": 0.6752, + "step": 15371 + }, + { + "epoch": 2.7368233618233617, + "grad_norm": 0.8973239660263062, + "learning_rate": 4.541562962809829e-05, + "loss": 0.8988, + "step": 15372 + }, + { + "epoch": 2.7370014245014245, + "grad_norm": 0.8576705455780029, + "learning_rate": 4.5403901862687095e-05, + "loss": 0.9279, + 
"step": 15373 + }, + { + "epoch": 2.7371794871794872, + "grad_norm": 0.7462539076805115, + "learning_rate": 4.539217516697295e-05, + "loss": 0.7228, + "step": 15374 + }, + { + "epoch": 2.73735754985755, + "grad_norm": 0.8082219362258911, + "learning_rate": 4.538044954118573e-05, + "loss": 0.9562, + "step": 15375 + }, + { + "epoch": 2.7375356125356127, + "grad_norm": 0.7067760825157166, + "learning_rate": 4.5368724985555134e-05, + "loss": 0.5306, + "step": 15376 + }, + { + "epoch": 2.7377136752136755, + "grad_norm": 0.9178285598754883, + "learning_rate": 4.535700150031089e-05, + "loss": 0.8087, + "step": 15377 + }, + { + "epoch": 2.737891737891738, + "grad_norm": 0.803240180015564, + "learning_rate": 4.53452790856827e-05, + "loss": 0.7315, + "step": 15378 + }, + { + "epoch": 2.7380698005698005, + "grad_norm": 0.7200242877006531, + "learning_rate": 4.5333557741900226e-05, + "loss": 0.6329, + "step": 15379 + }, + { + "epoch": 2.7382478632478633, + "grad_norm": 0.8744874596595764, + "learning_rate": 4.5321837469193117e-05, + "loss": 0.8279, + "step": 15380 + }, + { + "epoch": 2.738425925925926, + "grad_norm": 0.7736984491348267, + "learning_rate": 4.531011826779103e-05, + "loss": 0.7267, + "step": 15381 + }, + { + "epoch": 2.7386039886039883, + "grad_norm": 0.893189549446106, + "learning_rate": 4.5298400137923527e-05, + "loss": 0.754, + "step": 15382 + }, + { + "epoch": 2.738782051282051, + "grad_norm": 0.7637171149253845, + "learning_rate": 4.5286683079820314e-05, + "loss": 0.6201, + "step": 15383 + }, + { + "epoch": 2.738960113960114, + "grad_norm": 0.8324360847473145, + "learning_rate": 4.527496709371082e-05, + "loss": 0.7617, + "step": 15384 + }, + { + "epoch": 2.7391381766381766, + "grad_norm": 0.7570679783821106, + "learning_rate": 4.52632521798247e-05, + "loss": 0.7275, + "step": 15385 + }, + { + "epoch": 2.7393162393162394, + "grad_norm": 0.7802938222885132, + "learning_rate": 4.525153833839144e-05, + "loss": 0.7423, + "step": 15386 + }, + { + "epoch": 
2.739494301994302, + "grad_norm": 0.8645743727684021, + "learning_rate": 4.523982556964056e-05, + "loss": 0.8066, + "step": 15387 + }, + { + "epoch": 2.739672364672365, + "grad_norm": 0.8080064654350281, + "learning_rate": 4.522811387380155e-05, + "loss": 0.7418, + "step": 15388 + }, + { + "epoch": 2.7398504273504276, + "grad_norm": 0.925401508808136, + "learning_rate": 4.521640325110387e-05, + "loss": 0.9622, + "step": 15389 + }, + { + "epoch": 2.74002849002849, + "grad_norm": 0.8898165822029114, + "learning_rate": 4.520469370177696e-05, + "loss": 0.8543, + "step": 15390 + }, + { + "epoch": 2.7402065527065527, + "grad_norm": 0.8610122799873352, + "learning_rate": 4.519298522605021e-05, + "loss": 0.9502, + "step": 15391 + }, + { + "epoch": 2.7403846153846154, + "grad_norm": 0.8111294507980347, + "learning_rate": 4.51812778241531e-05, + "loss": 0.7526, + "step": 15392 + }, + { + "epoch": 2.740562678062678, + "grad_norm": 0.8465895056724548, + "learning_rate": 4.516957149631498e-05, + "loss": 0.9076, + "step": 15393 + }, + { + "epoch": 2.7407407407407405, + "grad_norm": 0.8541668057441711, + "learning_rate": 4.51578662427652e-05, + "loss": 0.8996, + "step": 15394 + }, + { + "epoch": 2.7409188034188032, + "grad_norm": 0.9113210439682007, + "learning_rate": 4.514616206373311e-05, + "loss": 0.9129, + "step": 15395 + }, + { + "epoch": 2.741096866096866, + "grad_norm": 0.7553523182868958, + "learning_rate": 4.513445895944802e-05, + "loss": 0.7265, + "step": 15396 + }, + { + "epoch": 2.7412749287749287, + "grad_norm": 0.8949921131134033, + "learning_rate": 4.5122756930139206e-05, + "loss": 0.9176, + "step": 15397 + }, + { + "epoch": 2.7414529914529915, + "grad_norm": 0.7957020401954651, + "learning_rate": 4.5111055976036044e-05, + "loss": 0.9384, + "step": 15398 + }, + { + "epoch": 2.7416310541310542, + "grad_norm": 0.759608805179596, + "learning_rate": 4.509935609736764e-05, + "loss": 0.7791, + "step": 15399 + }, + { + "epoch": 2.741809116809117, + "grad_norm": 
0.927768886089325, + "learning_rate": 4.508765729436335e-05, + "loss": 0.9113, + "step": 15400 + }, + { + "epoch": 2.7419871794871797, + "grad_norm": 0.910513162612915, + "learning_rate": 4.5075959567252335e-05, + "loss": 0.9334, + "step": 15401 + }, + { + "epoch": 2.742165242165242, + "grad_norm": 0.9029644727706909, + "learning_rate": 4.5064262916263814e-05, + "loss": 0.9487, + "step": 15402 + }, + { + "epoch": 2.742343304843305, + "grad_norm": 0.8001708984375, + "learning_rate": 4.505256734162693e-05, + "loss": 0.8447, + "step": 15403 + }, + { + "epoch": 2.7425213675213675, + "grad_norm": 0.8000209927558899, + "learning_rate": 4.504087284357085e-05, + "loss": 0.6764, + "step": 15404 + }, + { + "epoch": 2.7426994301994303, + "grad_norm": 0.7368536591529846, + "learning_rate": 4.5029179422324686e-05, + "loss": 0.563, + "step": 15405 + }, + { + "epoch": 2.7428774928774926, + "grad_norm": 0.9035481214523315, + "learning_rate": 4.501748707811757e-05, + "loss": 0.8165, + "step": 15406 + }, + { + "epoch": 2.7430555555555554, + "grad_norm": 0.7985709309577942, + "learning_rate": 4.500579581117854e-05, + "loss": 1.0773, + "step": 15407 + }, + { + "epoch": 2.743233618233618, + "grad_norm": 0.7867546677589417, + "learning_rate": 4.499410562173678e-05, + "loss": 0.7416, + "step": 15408 + }, + { + "epoch": 2.743411680911681, + "grad_norm": 0.8863609433174133, + "learning_rate": 4.498241651002117e-05, + "loss": 0.8609, + "step": 15409 + }, + { + "epoch": 2.7435897435897436, + "grad_norm": 0.8197270631790161, + "learning_rate": 4.497072847626087e-05, + "loss": 1.0664, + "step": 15410 + }, + { + "epoch": 2.7437678062678064, + "grad_norm": 0.843718409538269, + "learning_rate": 4.495904152068483e-05, + "loss": 0.9831, + "step": 15411 + }, + { + "epoch": 2.743945868945869, + "grad_norm": 0.8311102986335754, + "learning_rate": 4.4947355643521985e-05, + "loss": 0.8035, + "step": 15412 + }, + { + "epoch": 2.744123931623932, + "grad_norm": 0.8396357297897339, + "learning_rate": 
4.493567084500143e-05, + "loss": 1.0015, + "step": 15413 + }, + { + "epoch": 2.744301994301994, + "grad_norm": 0.7959007620811462, + "learning_rate": 4.492398712535194e-05, + "loss": 0.8414, + "step": 15414 + }, + { + "epoch": 2.744480056980057, + "grad_norm": 0.7720336318016052, + "learning_rate": 4.491230448480258e-05, + "loss": 0.8185, + "step": 15415 + }, + { + "epoch": 2.7446581196581197, + "grad_norm": 0.7999769449234009, + "learning_rate": 4.4900622923582115e-05, + "loss": 0.6807, + "step": 15416 + }, + { + "epoch": 2.7448361823361824, + "grad_norm": 0.9882165789604187, + "learning_rate": 4.488894244191951e-05, + "loss": 0.975, + "step": 15417 + }, + { + "epoch": 2.745014245014245, + "grad_norm": 0.8275474309921265, + "learning_rate": 4.48772630400436e-05, + "loss": 0.8459, + "step": 15418 + }, + { + "epoch": 2.7451923076923075, + "grad_norm": 0.8468943238258362, + "learning_rate": 4.486558471818322e-05, + "loss": 0.8217, + "step": 15419 + }, + { + "epoch": 2.7453703703703702, + "grad_norm": 0.8845008015632629, + "learning_rate": 4.485390747656717e-05, + "loss": 0.9811, + "step": 15420 + }, + { + "epoch": 2.745548433048433, + "grad_norm": 1.0010331869125366, + "learning_rate": 4.4842231315424255e-05, + "loss": 0.9437, + "step": 15421 + }, + { + "epoch": 2.7457264957264957, + "grad_norm": 0.7468565106391907, + "learning_rate": 4.483055623498319e-05, + "loss": 0.7205, + "step": 15422 + }, + { + "epoch": 2.7459045584045585, + "grad_norm": 0.9002050757408142, + "learning_rate": 4.4818882235472845e-05, + "loss": 0.8812, + "step": 15423 + }, + { + "epoch": 2.7460826210826212, + "grad_norm": 0.8684462904930115, + "learning_rate": 4.48072093171218e-05, + "loss": 0.7929, + "step": 15424 + }, + { + "epoch": 2.746260683760684, + "grad_norm": 0.8685877323150635, + "learning_rate": 4.479553748015891e-05, + "loss": 0.805, + "step": 15425 + }, + { + "epoch": 2.7464387464387463, + "grad_norm": 0.8292124271392822, + "learning_rate": 4.478386672481272e-05, + "loss": 0.9622, + 
"step": 15426 + }, + { + "epoch": 2.746616809116809, + "grad_norm": 0.8269517421722412, + "learning_rate": 4.477219705131199e-05, + "loss": 0.8011, + "step": 15427 + }, + { + "epoch": 2.746794871794872, + "grad_norm": 0.8913753628730774, + "learning_rate": 4.4760528459885334e-05, + "loss": 0.9794, + "step": 15428 + }, + { + "epoch": 2.7469729344729346, + "grad_norm": 0.8017858266830444, + "learning_rate": 4.474886095076137e-05, + "loss": 0.8593, + "step": 15429 + }, + { + "epoch": 2.7471509971509973, + "grad_norm": 1.0657325983047485, + "learning_rate": 4.47371945241687e-05, + "loss": 0.8043, + "step": 15430 + }, + { + "epoch": 2.7473290598290596, + "grad_norm": 0.8358477354049683, + "learning_rate": 4.472552918033588e-05, + "loss": 0.818, + "step": 15431 + }, + { + "epoch": 2.7475071225071224, + "grad_norm": 1.0436886548995972, + "learning_rate": 4.4713864919491514e-05, + "loss": 0.9246, + "step": 15432 + }, + { + "epoch": 2.747685185185185, + "grad_norm": 0.9838647246360779, + "learning_rate": 4.470220174186413e-05, + "loss": 1.0345, + "step": 15433 + }, + { + "epoch": 2.747863247863248, + "grad_norm": 0.8583347201347351, + "learning_rate": 4.469053964768222e-05, + "loss": 0.6242, + "step": 15434 + }, + { + "epoch": 2.7480413105413106, + "grad_norm": 0.832467794418335, + "learning_rate": 4.4678878637174304e-05, + "loss": 0.7988, + "step": 15435 + }, + { + "epoch": 2.7482193732193734, + "grad_norm": 0.7854242324829102, + "learning_rate": 4.4667218710568825e-05, + "loss": 0.9386, + "step": 15436 + }, + { + "epoch": 2.748397435897436, + "grad_norm": 0.7748091816902161, + "learning_rate": 4.465555986809423e-05, + "loss": 0.7322, + "step": 15437 + }, + { + "epoch": 2.7485754985754984, + "grad_norm": 0.818305492401123, + "learning_rate": 4.464390210997904e-05, + "loss": 0.822, + "step": 15438 + }, + { + "epoch": 2.748753561253561, + "grad_norm": 0.8253993391990662, + "learning_rate": 4.463224543645151e-05, + "loss": 0.7631, + "step": 15439 + }, + { + "epoch": 
2.748931623931624, + "grad_norm": 0.804768443107605, + "learning_rate": 4.46205898477402e-05, + "loss": 0.8783, + "step": 15440 + }, + { + "epoch": 2.7491096866096867, + "grad_norm": 0.8612813949584961, + "learning_rate": 4.460893534407332e-05, + "loss": 0.944, + "step": 15441 + }, + { + "epoch": 2.7492877492877494, + "grad_norm": 0.8149600625038147, + "learning_rate": 4.459728192567932e-05, + "loss": 0.8592, + "step": 15442 + }, + { + "epoch": 2.7494658119658117, + "grad_norm": 0.996081531047821, + "learning_rate": 4.4585629592786496e-05, + "loss": 0.8648, + "step": 15443 + }, + { + "epoch": 2.7496438746438745, + "grad_norm": 0.8563137650489807, + "learning_rate": 4.457397834562314e-05, + "loss": 0.6645, + "step": 15444 + }, + { + "epoch": 2.7496438746438745, + "eval_loss": 1.1326396465301514, + "eval_runtime": 24.3757, + "eval_samples_per_second": 42.707, + "eval_steps_per_second": 21.374, + "step": 15444 + }, + { + "epoch": 2.7498219373219372, + "grad_norm": 0.7415599226951599, + "learning_rate": 4.4562328184417547e-05, + "loss": 0.7481, + "step": 15445 + }, + { + "epoch": 2.75, + "grad_norm": 0.8192741274833679, + "learning_rate": 4.455067910939796e-05, + "loss": 0.8367, + "step": 15446 + }, + { + "epoch": 2.7501780626780628, + "grad_norm": 0.8514624834060669, + "learning_rate": 4.4539031120792604e-05, + "loss": 0.7107, + "step": 15447 + }, + { + "epoch": 2.7503561253561255, + "grad_norm": 0.8594211339950562, + "learning_rate": 4.4527384218829796e-05, + "loss": 0.9332, + "step": 15448 + }, + { + "epoch": 2.7505341880341883, + "grad_norm": 0.7828420996665955, + "learning_rate": 4.4515738403737585e-05, + "loss": 0.719, + "step": 15449 + }, + { + "epoch": 2.7507122507122506, + "grad_norm": 0.9195737242698669, + "learning_rate": 4.4504093675744285e-05, + "loss": 0.8581, + "step": 15450 + }, + { + "epoch": 2.7508903133903133, + "grad_norm": 0.8090249300003052, + "learning_rate": 4.449245003507793e-05, + "loss": 0.8012, + "step": 15451 + }, + { + "epoch": 
2.751068376068376, + "grad_norm": 0.9179023504257202, + "learning_rate": 4.4480807481966736e-05, + "loss": 0.8968, + "step": 15452 + }, + { + "epoch": 2.751246438746439, + "grad_norm": 0.8140867352485657, + "learning_rate": 4.446916601663879e-05, + "loss": 0.791, + "step": 15453 + }, + { + "epoch": 2.7514245014245016, + "grad_norm": 0.8313645720481873, + "learning_rate": 4.445752563932214e-05, + "loss": 0.8658, + "step": 15454 + }, + { + "epoch": 2.751602564102564, + "grad_norm": 1.0235247611999512, + "learning_rate": 4.444588635024497e-05, + "loss": 0.9383, + "step": 15455 + }, + { + "epoch": 2.7517806267806266, + "grad_norm": 0.8270257711410522, + "learning_rate": 4.443424814963518e-05, + "loss": 0.7666, + "step": 15456 + }, + { + "epoch": 2.7519586894586894, + "grad_norm": 0.8711290955543518, + "learning_rate": 4.442261103772092e-05, + "loss": 1.0819, + "step": 15457 + }, + { + "epoch": 2.752136752136752, + "grad_norm": 0.7851848602294922, + "learning_rate": 4.441097501473013e-05, + "loss": 0.8788, + "step": 15458 + }, + { + "epoch": 2.752314814814815, + "grad_norm": 0.9703593850135803, + "learning_rate": 4.4399340080890816e-05, + "loss": 0.8661, + "step": 15459 + }, + { + "epoch": 2.7524928774928776, + "grad_norm": 0.8575040102005005, + "learning_rate": 4.438770623643093e-05, + "loss": 0.8318, + "step": 15460 + }, + { + "epoch": 2.7526709401709404, + "grad_norm": 0.9393935799598694, + "learning_rate": 4.43760734815784e-05, + "loss": 0.8899, + "step": 15461 + }, + { + "epoch": 2.7528490028490027, + "grad_norm": 0.9310712814331055, + "learning_rate": 4.4364441816561185e-05, + "loss": 0.9519, + "step": 15462 + }, + { + "epoch": 2.7530270655270654, + "grad_norm": 0.8066901564598083, + "learning_rate": 4.435281124160715e-05, + "loss": 0.8102, + "step": 15463 + }, + { + "epoch": 2.753205128205128, + "grad_norm": 0.8681934475898743, + "learning_rate": 4.434118175694415e-05, + "loss": 0.7745, + "step": 15464 + }, + { + "epoch": 2.753383190883191, + "grad_norm": 
0.7921330332756042, + "learning_rate": 4.432955336280014e-05, + "loss": 0.8396, + "step": 15465 + }, + { + "epoch": 2.7535612535612537, + "grad_norm": 0.8818981051445007, + "learning_rate": 4.4317926059402816e-05, + "loss": 0.9268, + "step": 15466 + }, + { + "epoch": 2.753739316239316, + "grad_norm": 0.8018338084220886, + "learning_rate": 4.4306299846980096e-05, + "loss": 0.9018, + "step": 15467 + }, + { + "epoch": 2.7539173789173788, + "grad_norm": 0.8704143762588501, + "learning_rate": 4.4294674725759734e-05, + "loss": 0.9517, + "step": 15468 + }, + { + "epoch": 2.7540954415954415, + "grad_norm": 1.1460380554199219, + "learning_rate": 4.4283050695969506e-05, + "loss": 0.7642, + "step": 15469 + }, + { + "epoch": 2.7542735042735043, + "grad_norm": 0.8134510517120361, + "learning_rate": 4.427142775783716e-05, + "loss": 1.0405, + "step": 15470 + }, + { + "epoch": 2.754451566951567, + "grad_norm": 0.8054876327514648, + "learning_rate": 4.425980591159038e-05, + "loss": 0.8929, + "step": 15471 + }, + { + "epoch": 2.7546296296296298, + "grad_norm": 0.8607433438301086, + "learning_rate": 4.4248185157456953e-05, + "loss": 0.9187, + "step": 15472 + }, + { + "epoch": 2.7548076923076925, + "grad_norm": 0.7448357939720154, + "learning_rate": 4.423656549566453e-05, + "loss": 0.7298, + "step": 15473 + }, + { + "epoch": 2.754985754985755, + "grad_norm": 0.9228075742721558, + "learning_rate": 4.422494692644076e-05, + "loss": 0.8704, + "step": 15474 + }, + { + "epoch": 2.7551638176638176, + "grad_norm": 0.8781694173812866, + "learning_rate": 4.421332945001329e-05, + "loss": 0.9555, + "step": 15475 + }, + { + "epoch": 2.7553418803418803, + "grad_norm": 0.8632338643074036, + "learning_rate": 4.420171306660975e-05, + "loss": 0.7322, + "step": 15476 + }, + { + "epoch": 2.755519943019943, + "grad_norm": 0.9808199405670166, + "learning_rate": 4.4190097776457716e-05, + "loss": 0.619, + "step": 15477 + }, + { + "epoch": 2.755698005698006, + "grad_norm": 1.025109052658081, + 
"learning_rate": 4.41784835797848e-05, + "loss": 1.0617, + "step": 15478 + }, + { + "epoch": 2.755876068376068, + "grad_norm": 0.8132767677307129, + "learning_rate": 4.416687047681849e-05, + "loss": 1.0045, + "step": 15479 + }, + { + "epoch": 2.756054131054131, + "grad_norm": 0.9630453586578369, + "learning_rate": 4.415525846778645e-05, + "loss": 0.9576, + "step": 15480 + }, + { + "epoch": 2.7562321937321936, + "grad_norm": 0.9891922473907471, + "learning_rate": 4.4143647552916034e-05, + "loss": 0.9333, + "step": 15481 + }, + { + "epoch": 2.7564102564102564, + "grad_norm": 0.7974509000778198, + "learning_rate": 4.413203773243486e-05, + "loss": 0.7809, + "step": 15482 + }, + { + "epoch": 2.756588319088319, + "grad_norm": 0.8173473477363586, + "learning_rate": 4.412042900657034e-05, + "loss": 1.1023, + "step": 15483 + }, + { + "epoch": 2.756766381766382, + "grad_norm": 0.8502877950668335, + "learning_rate": 4.410882137554994e-05, + "loss": 0.8705, + "step": 15484 + }, + { + "epoch": 2.7569444444444446, + "grad_norm": 0.8519158959388733, + "learning_rate": 4.4097214839601074e-05, + "loss": 0.8901, + "step": 15485 + }, + { + "epoch": 2.7571225071225074, + "grad_norm": 0.7851125001907349, + "learning_rate": 4.4085609398951164e-05, + "loss": 1.0612, + "step": 15486 + }, + { + "epoch": 2.7573005698005697, + "grad_norm": 0.9585029482841492, + "learning_rate": 4.407400505382758e-05, + "loss": 0.9229, + "step": 15487 + }, + { + "epoch": 2.7574786324786325, + "grad_norm": 0.775071918964386, + "learning_rate": 4.4062401804457686e-05, + "loss": 0.8246, + "step": 15488 + }, + { + "epoch": 2.757656695156695, + "grad_norm": 0.8049488067626953, + "learning_rate": 4.405079965106881e-05, + "loss": 0.9681, + "step": 15489 + }, + { + "epoch": 2.757834757834758, + "grad_norm": 0.9452522993087769, + "learning_rate": 4.4039198593888306e-05, + "loss": 0.7288, + "step": 15490 + }, + { + "epoch": 2.7580128205128203, + "grad_norm": 0.8296085596084595, + "learning_rate": 4.402759863314346e-05, 
+ "loss": 0.8053, + "step": 15491 + }, + { + "epoch": 2.758190883190883, + "grad_norm": 0.8086248636245728, + "learning_rate": 4.4015999769061556e-05, + "loss": 0.8692, + "step": 15492 + }, + { + "epoch": 2.7583689458689458, + "grad_norm": 0.8784860372543335, + "learning_rate": 4.4004402001869836e-05, + "loss": 1.0503, + "step": 15493 + }, + { + "epoch": 2.7585470085470085, + "grad_norm": 0.82901930809021, + "learning_rate": 4.399280533179551e-05, + "loss": 0.8479, + "step": 15494 + }, + { + "epoch": 2.7587250712250713, + "grad_norm": 0.7654509544372559, + "learning_rate": 4.3981209759065875e-05, + "loss": 0.743, + "step": 15495 + }, + { + "epoch": 2.758903133903134, + "grad_norm": 0.8240879774093628, + "learning_rate": 4.3969615283908e-05, + "loss": 0.8303, + "step": 15496 + }, + { + "epoch": 2.7590811965811968, + "grad_norm": 0.9411282539367676, + "learning_rate": 4.3958021906549195e-05, + "loss": 0.8217, + "step": 15497 + }, + { + "epoch": 2.7592592592592595, + "grad_norm": 0.8222329616546631, + "learning_rate": 4.394642962721647e-05, + "loss": 0.9596, + "step": 15498 + }, + { + "epoch": 2.759437321937322, + "grad_norm": 0.8462044596672058, + "learning_rate": 4.393483844613704e-05, + "loss": 0.8029, + "step": 15499 + }, + { + "epoch": 2.7596153846153846, + "grad_norm": 1.0385619401931763, + "learning_rate": 4.392324836353798e-05, + "loss": 1.0352, + "step": 15500 + }, + { + "epoch": 2.7597934472934473, + "grad_norm": 0.9049911499023438, + "learning_rate": 4.3911659379646384e-05, + "loss": 1.0761, + "step": 15501 + }, + { + "epoch": 2.75997150997151, + "grad_norm": 0.8253830671310425, + "learning_rate": 4.390007149468932e-05, + "loss": 0.7693, + "step": 15502 + }, + { + "epoch": 2.7601495726495724, + "grad_norm": 0.7939008474349976, + "learning_rate": 4.388848470889381e-05, + "loss": 0.8847, + "step": 15503 + }, + { + "epoch": 2.760327635327635, + "grad_norm": 1.048941969871521, + "learning_rate": 4.387689902248684e-05, + "loss": 0.7012, + "step": 15504 + }, + { 
+ "epoch": 2.760505698005698, + "grad_norm": 0.8834842443466187, + "learning_rate": 4.386531443569553e-05, + "loss": 0.9561, + "step": 15505 + }, + { + "epoch": 2.7606837606837606, + "grad_norm": 0.9147583842277527, + "learning_rate": 4.385373094874669e-05, + "loss": 0.9736, + "step": 15506 + }, + { + "epoch": 2.7608618233618234, + "grad_norm": 0.6820386648178101, + "learning_rate": 4.38421485618674e-05, + "loss": 0.4449, + "step": 15507 + }, + { + "epoch": 2.761039886039886, + "grad_norm": 0.9519942402839661, + "learning_rate": 4.383056727528455e-05, + "loss": 1.0385, + "step": 15508 + }, + { + "epoch": 2.761217948717949, + "grad_norm": 0.8701474070549011, + "learning_rate": 4.381898708922505e-05, + "loss": 0.9896, + "step": 15509 + }, + { + "epoch": 2.7613960113960117, + "grad_norm": 0.8756018877029419, + "learning_rate": 4.38074080039158e-05, + "loss": 0.9378, + "step": 15510 + }, + { + "epoch": 2.761574074074074, + "grad_norm": 0.8670514225959778, + "learning_rate": 4.379583001958362e-05, + "loss": 0.9175, + "step": 15511 + }, + { + "epoch": 2.7617521367521367, + "grad_norm": 0.8227131366729736, + "learning_rate": 4.378425313645547e-05, + "loss": 0.7864, + "step": 15512 + }, + { + "epoch": 2.7619301994301995, + "grad_norm": 0.9121497273445129, + "learning_rate": 4.377267735475802e-05, + "loss": 0.877, + "step": 15513 + }, + { + "epoch": 2.762108262108262, + "grad_norm": 0.8347102999687195, + "learning_rate": 4.3761102674718205e-05, + "loss": 0.8223, + "step": 15514 + }, + { + "epoch": 2.7622863247863245, + "grad_norm": 0.8657951951026917, + "learning_rate": 4.374952909656275e-05, + "loss": 0.7117, + "step": 15515 + }, + { + "epoch": 2.7624643874643873, + "grad_norm": 0.8934728503227234, + "learning_rate": 4.3737956620518414e-05, + "loss": 0.84, + "step": 15516 + }, + { + "epoch": 2.76264245014245, + "grad_norm": 0.7592045068740845, + "learning_rate": 4.3726385246811964e-05, + "loss": 0.7856, + "step": 15517 + }, + { + "epoch": 2.7628205128205128, + "grad_norm": 
0.8480674028396606, + "learning_rate": 4.371481497567008e-05, + "loss": 0.7805, + "step": 15518 + }, + { + "epoch": 2.7629985754985755, + "grad_norm": 1.0231767892837524, + "learning_rate": 4.3703245807319437e-05, + "loss": 1.1517, + "step": 15519 + }, + { + "epoch": 2.7631766381766383, + "grad_norm": 0.9852092862129211, + "learning_rate": 4.369167774198684e-05, + "loss": 0.8735, + "step": 15520 + }, + { + "epoch": 2.763354700854701, + "grad_norm": 0.8751610517501831, + "learning_rate": 4.368011077989875e-05, + "loss": 0.9975, + "step": 15521 + }, + { + "epoch": 2.763532763532764, + "grad_norm": 0.8397828340530396, + "learning_rate": 4.3668544921281976e-05, + "loss": 1.0675, + "step": 15522 + }, + { + "epoch": 2.763710826210826, + "grad_norm": 0.7040372490882874, + "learning_rate": 4.3656980166362974e-05, + "loss": 0.7123, + "step": 15523 + }, + { + "epoch": 2.763888888888889, + "grad_norm": 1.0610599517822266, + "learning_rate": 4.364541651536844e-05, + "loss": 1.0854, + "step": 15524 + }, + { + "epoch": 2.7640669515669516, + "grad_norm": 0.78865647315979, + "learning_rate": 4.363385396852491e-05, + "loss": 0.924, + "step": 15525 + }, + { + "epoch": 2.7642450142450143, + "grad_norm": 0.87164705991745, + "learning_rate": 4.362229252605891e-05, + "loss": 0.9739, + "step": 15526 + }, + { + "epoch": 2.7644230769230766, + "grad_norm": 0.9362281560897827, + "learning_rate": 4.361073218819698e-05, + "loss": 0.7751, + "step": 15527 + }, + { + "epoch": 2.7646011396011394, + "grad_norm": 0.7944566011428833, + "learning_rate": 4.3599172955165605e-05, + "loss": 0.8913, + "step": 15528 + }, + { + "epoch": 2.764779202279202, + "grad_norm": 0.9346068501472473, + "learning_rate": 4.358761482719125e-05, + "loss": 0.8286, + "step": 15529 + }, + { + "epoch": 2.764957264957265, + "grad_norm": 0.8570913076400757, + "learning_rate": 4.3576057804500414e-05, + "loss": 1.0334, + "step": 15530 + }, + { + "epoch": 2.7651353276353277, + "grad_norm": 0.801908552646637, + "learning_rate": 
4.356450188731953e-05, + "loss": 0.9021, + "step": 15531 + }, + { + "epoch": 2.7653133903133904, + "grad_norm": 0.848849892616272, + "learning_rate": 4.355294707587499e-05, + "loss": 0.9132, + "step": 15532 + }, + { + "epoch": 2.765491452991453, + "grad_norm": 0.7961751818656921, + "learning_rate": 4.35413933703932e-05, + "loss": 0.841, + "step": 15533 + }, + { + "epoch": 2.765669515669516, + "grad_norm": 0.8609708547592163, + "learning_rate": 4.352984077110052e-05, + "loss": 0.8176, + "step": 15534 + }, + { + "epoch": 2.765847578347578, + "grad_norm": 0.8779369592666626, + "learning_rate": 4.35182892782233e-05, + "loss": 0.8252, + "step": 15535 + }, + { + "epoch": 2.766025641025641, + "grad_norm": 0.7878577709197998, + "learning_rate": 4.3506738891987844e-05, + "loss": 0.7498, + "step": 15536 + }, + { + "epoch": 2.7662037037037037, + "grad_norm": 0.9531580805778503, + "learning_rate": 4.3495189612620557e-05, + "loss": 0.8438, + "step": 15537 + }, + { + "epoch": 2.7663817663817665, + "grad_norm": 0.7791294455528259, + "learning_rate": 4.3483641440347564e-05, + "loss": 0.9188, + "step": 15538 + }, + { + "epoch": 2.7665598290598292, + "grad_norm": 0.8683488965034485, + "learning_rate": 4.347209437539527e-05, + "loss": 1.0422, + "step": 15539 + }, + { + "epoch": 2.7667378917378915, + "grad_norm": 0.8904309272766113, + "learning_rate": 4.346054841798984e-05, + "loss": 0.8072, + "step": 15540 + }, + { + "epoch": 2.7669159544159543, + "grad_norm": 0.7409844398498535, + "learning_rate": 4.344900356835753e-05, + "loss": 0.7179, + "step": 15541 + }, + { + "epoch": 2.767094017094017, + "grad_norm": 0.9663724899291992, + "learning_rate": 4.343745982672451e-05, + "loss": 1.0568, + "step": 15542 + }, + { + "epoch": 2.76727207977208, + "grad_norm": 0.8481591939926147, + "learning_rate": 4.342591719331698e-05, + "loss": 0.8678, + "step": 15543 + }, + { + "epoch": 2.7674501424501425, + "grad_norm": 0.7301938533782959, + "learning_rate": 4.341437566836103e-05, + "loss": 0.6357, + 
"step": 15544 + }, + { + "epoch": 2.7676282051282053, + "grad_norm": 0.8628479242324829, + "learning_rate": 4.340283525208292e-05, + "loss": 0.9622, + "step": 15545 + }, + { + "epoch": 2.767806267806268, + "grad_norm": 0.953744113445282, + "learning_rate": 4.339129594470861e-05, + "loss": 0.683, + "step": 15546 + }, + { + "epoch": 2.7679843304843303, + "grad_norm": 0.7589353322982788, + "learning_rate": 4.3379757746464336e-05, + "loss": 0.8343, + "step": 15547 + }, + { + "epoch": 2.768162393162393, + "grad_norm": 0.8304651379585266, + "learning_rate": 4.336822065757601e-05, + "loss": 0.9084, + "step": 15548 + }, + { + "epoch": 2.768340455840456, + "grad_norm": 0.8092817068099976, + "learning_rate": 4.33566846782698e-05, + "loss": 0.8371, + "step": 15549 + }, + { + "epoch": 2.7685185185185186, + "grad_norm": 0.8983978033065796, + "learning_rate": 4.334514980877169e-05, + "loss": 0.9631, + "step": 15550 + }, + { + "epoch": 2.7686965811965814, + "grad_norm": 0.766621470451355, + "learning_rate": 4.3333616049307636e-05, + "loss": 0.768, + "step": 15551 + }, + { + "epoch": 2.7688746438746437, + "grad_norm": 0.8753345012664795, + "learning_rate": 4.332208340010374e-05, + "loss": 0.8854, + "step": 15552 + }, + { + "epoch": 2.7690527065527064, + "grad_norm": 0.8396589756011963, + "learning_rate": 4.331055186138581e-05, + "loss": 0.8322, + "step": 15553 + }, + { + "epoch": 2.769230769230769, + "grad_norm": 0.8134872317314148, + "learning_rate": 4.3299021433379885e-05, + "loss": 0.945, + "step": 15554 + }, + { + "epoch": 2.769408831908832, + "grad_norm": 0.8712667226791382, + "learning_rate": 4.3287492116311854e-05, + "loss": 0.8487, + "step": 15555 + }, + { + "epoch": 2.7695868945868947, + "grad_norm": 0.8938018083572388, + "learning_rate": 4.32759639104076e-05, + "loss": 0.854, + "step": 15556 + }, + { + "epoch": 2.7697649572649574, + "grad_norm": 0.8213987946510315, + "learning_rate": 4.3264436815893005e-05, + "loss": 0.9055, + "step": 15557 + }, + { + "epoch": 
2.76994301994302, + "grad_norm": 0.9587214589118958, + "learning_rate": 4.3252910832993906e-05, + "loss": 0.9664, + "step": 15558 + }, + { + "epoch": 2.7701210826210825, + "grad_norm": 0.8746159076690674, + "learning_rate": 4.3241385961936146e-05, + "loss": 0.8394, + "step": 15559 + }, + { + "epoch": 2.7702991452991452, + "grad_norm": 0.8395819067955017, + "learning_rate": 4.3229862202945517e-05, + "loss": 0.7991, + "step": 15560 + }, + { + "epoch": 2.770477207977208, + "grad_norm": 0.856765627861023, + "learning_rate": 4.321833955624777e-05, + "loss": 0.8106, + "step": 15561 + }, + { + "epoch": 2.7706552706552707, + "grad_norm": 0.8630124926567078, + "learning_rate": 4.3206818022068776e-05, + "loss": 0.7293, + "step": 15562 + }, + { + "epoch": 2.7708333333333335, + "grad_norm": 0.8196776509284973, + "learning_rate": 4.319529760063414e-05, + "loss": 0.8831, + "step": 15563 + }, + { + "epoch": 2.771011396011396, + "grad_norm": 0.8283860683441162, + "learning_rate": 4.3183778292169674e-05, + "loss": 0.9249, + "step": 15564 + }, + { + "epoch": 2.7711894586894585, + "grad_norm": 0.8983619809150696, + "learning_rate": 4.3172260096901054e-05, + "loss": 1.2334, + "step": 15565 + }, + { + "epoch": 2.7713675213675213, + "grad_norm": 0.8437079191207886, + "learning_rate": 4.316074301505395e-05, + "loss": 0.771, + "step": 15566 + }, + { + "epoch": 2.771545584045584, + "grad_norm": 0.9565808773040771, + "learning_rate": 4.314922704685401e-05, + "loss": 0.8927, + "step": 15567 + }, + { + "epoch": 2.771723646723647, + "grad_norm": 0.7943497896194458, + "learning_rate": 4.313771219252687e-05, + "loss": 0.656, + "step": 15568 + }, + { + "epoch": 2.7719017094017095, + "grad_norm": 0.862404465675354, + "learning_rate": 4.3126198452298126e-05, + "loss": 0.8783, + "step": 15569 + }, + { + "epoch": 2.7720797720797723, + "grad_norm": 0.7928122878074646, + "learning_rate": 4.3114685826393365e-05, + "loss": 0.9799, + "step": 15570 + }, + { + "epoch": 2.7722578347578346, + "grad_norm": 
0.8270733952522278, + "learning_rate": 4.3103174315038184e-05, + "loss": 0.7878, + "step": 15571 + }, + { + "epoch": 2.7724358974358974, + "grad_norm": 0.8223987817764282, + "learning_rate": 4.309166391845811e-05, + "loss": 0.7222, + "step": 15572 + }, + { + "epoch": 2.77261396011396, + "grad_norm": 0.8159852027893066, + "learning_rate": 4.3080154636878675e-05, + "loss": 0.8082, + "step": 15573 + }, + { + "epoch": 2.772792022792023, + "grad_norm": 0.882792055606842, + "learning_rate": 4.306864647052537e-05, + "loss": 1.0659, + "step": 15574 + }, + { + "epoch": 2.7729700854700856, + "grad_norm": 0.8734562993049622, + "learning_rate": 4.305713941962366e-05, + "loss": 0.9301, + "step": 15575 + }, + { + "epoch": 2.773148148148148, + "grad_norm": 0.8544983267784119, + "learning_rate": 4.304563348439898e-05, + "loss": 0.7442, + "step": 15576 + }, + { + "epoch": 2.7733262108262107, + "grad_norm": 0.9045799374580383, + "learning_rate": 4.303412866507689e-05, + "loss": 1.1023, + "step": 15577 + }, + { + "epoch": 2.7735042735042734, + "grad_norm": 0.8132993578910828, + "learning_rate": 4.3022624961882615e-05, + "loss": 0.7032, + "step": 15578 + }, + { + "epoch": 2.773682336182336, + "grad_norm": 0.7072446942329407, + "learning_rate": 4.30111223750417e-05, + "loss": 0.7176, + "step": 15579 + }, + { + "epoch": 2.773860398860399, + "grad_norm": 0.8212466239929199, + "learning_rate": 4.299962090477945e-05, + "loss": 0.7664, + "step": 15580 + }, + { + "epoch": 2.7740384615384617, + "grad_norm": 0.7781338095664978, + "learning_rate": 4.298812055132122e-05, + "loss": 0.7439, + "step": 15581 + }, + { + "epoch": 2.7742165242165244, + "grad_norm": 0.9289973378181458, + "learning_rate": 4.297662131489234e-05, + "loss": 0.9504, + "step": 15582 + }, + { + "epoch": 2.7743945868945867, + "grad_norm": 0.8571373224258423, + "learning_rate": 4.2965123195718105e-05, + "loss": 0.9959, + "step": 15583 + }, + { + "epoch": 2.7745726495726495, + "grad_norm": 0.8670883774757385, + "learning_rate": 
4.29536261940238e-05, + "loss": 0.8207, + "step": 15584 + }, + { + "epoch": 2.7747507122507122, + "grad_norm": 0.8684807419776917, + "learning_rate": 4.294213031003469e-05, + "loss": 0.7508, + "step": 15585 + }, + { + "epoch": 2.774928774928775, + "grad_norm": 0.7746252417564392, + "learning_rate": 4.293063554397597e-05, + "loss": 0.909, + "step": 15586 + }, + { + "epoch": 2.7751068376068377, + "grad_norm": 0.8363521099090576, + "learning_rate": 4.291914189607297e-05, + "loss": 0.8564, + "step": 15587 + }, + { + "epoch": 2.7752849002849, + "grad_norm": 0.8843217492103577, + "learning_rate": 4.2907649366550726e-05, + "loss": 0.8187, + "step": 15588 + }, + { + "epoch": 2.775462962962963, + "grad_norm": 0.9330897331237793, + "learning_rate": 4.2896157955634545e-05, + "loss": 0.8179, + "step": 15589 + }, + { + "epoch": 2.7756410256410255, + "grad_norm": 0.7950356602668762, + "learning_rate": 4.288466766354953e-05, + "loss": 0.7091, + "step": 15590 + }, + { + "epoch": 2.7758190883190883, + "grad_norm": 0.9085933566093445, + "learning_rate": 4.287317849052075e-05, + "loss": 0.8015, + "step": 15591 + }, + { + "epoch": 2.775997150997151, + "grad_norm": 0.9285191893577576, + "learning_rate": 4.286169043677345e-05, + "loss": 0.9967, + "step": 15592 + }, + { + "epoch": 2.776175213675214, + "grad_norm": 0.8202041387557983, + "learning_rate": 4.285020350253256e-05, + "loss": 0.9286, + "step": 15593 + }, + { + "epoch": 2.7763532763532766, + "grad_norm": 1.0619434118270874, + "learning_rate": 4.283871768802328e-05, + "loss": 0.7863, + "step": 15594 + }, + { + "epoch": 2.7765313390313393, + "grad_norm": 0.8250051140785217, + "learning_rate": 4.282723299347052e-05, + "loss": 0.8531, + "step": 15595 + }, + { + "epoch": 2.7767094017094016, + "grad_norm": 0.8794218897819519, + "learning_rate": 4.281574941909939e-05, + "loss": 0.906, + "step": 15596 + }, + { + "epoch": 2.7768874643874644, + "grad_norm": 0.7725922465324402, + "learning_rate": 4.2804266965134866e-05, + "loss": 0.9084, + 
"step": 15597 + }, + { + "epoch": 2.777065527065527, + "grad_norm": 0.7845144867897034, + "learning_rate": 4.279278563180192e-05, + "loss": 0.9768, + "step": 15598 + }, + { + "epoch": 2.77724358974359, + "grad_norm": 0.9395498633384705, + "learning_rate": 4.27813054193255e-05, + "loss": 0.9055, + "step": 15599 + }, + { + "epoch": 2.777421652421652, + "grad_norm": 0.8043427467346191, + "learning_rate": 4.276982632793054e-05, + "loss": 0.8244, + "step": 15600 + }, + { + "epoch": 2.777599715099715, + "grad_norm": 0.7874096632003784, + "learning_rate": 4.27583483578419e-05, + "loss": 0.8861, + "step": 15601 + }, + { + "epoch": 2.7777777777777777, + "grad_norm": 0.8874611258506775, + "learning_rate": 4.27468715092846e-05, + "loss": 1.0457, + "step": 15602 + }, + { + "epoch": 2.7779558404558404, + "grad_norm": 1.0025757551193237, + "learning_rate": 4.273539578248334e-05, + "loss": 1.1114, + "step": 15603 + }, + { + "epoch": 2.778133903133903, + "grad_norm": 0.9982876777648926, + "learning_rate": 4.272392117766313e-05, + "loss": 0.9142, + "step": 15604 + }, + { + "epoch": 2.778311965811966, + "grad_norm": 0.8762221932411194, + "learning_rate": 4.2712447695048616e-05, + "loss": 1.0114, + "step": 15605 + }, + { + "epoch": 2.7784900284900287, + "grad_norm": 0.9136927723884583, + "learning_rate": 4.2700975334864726e-05, + "loss": 0.8224, + "step": 15606 + }, + { + "epoch": 2.7786680911680914, + "grad_norm": 0.8845604062080383, + "learning_rate": 4.2689504097336184e-05, + "loss": 0.8135, + "step": 15607 + }, + { + "epoch": 2.7788461538461537, + "grad_norm": 0.8584510087966919, + "learning_rate": 4.267803398268777e-05, + "loss": 0.8928, + "step": 15608 + }, + { + "epoch": 2.7790242165242165, + "grad_norm": 0.711402177810669, + "learning_rate": 4.266656499114421e-05, + "loss": 0.792, + "step": 15609 + }, + { + "epoch": 2.7792022792022792, + "grad_norm": 0.9480760097503662, + "learning_rate": 4.2655097122930165e-05, + "loss": 0.8707, + "step": 15610 + }, + { + "epoch": 
2.779380341880342, + "grad_norm": 0.863855242729187, + "learning_rate": 4.264363037827041e-05, + "loss": 1.0114, + "step": 15611 + }, + { + "epoch": 2.7795584045584043, + "grad_norm": 0.8010865449905396, + "learning_rate": 4.2632164757389556e-05, + "loss": 0.9221, + "step": 15612 + }, + { + "epoch": 2.779736467236467, + "grad_norm": 0.7950930595397949, + "learning_rate": 4.262070026051227e-05, + "loss": 0.6951, + "step": 15613 + }, + { + "epoch": 2.77991452991453, + "grad_norm": 0.8252870440483093, + "learning_rate": 4.260923688786317e-05, + "loss": 0.6976, + "step": 15614 + }, + { + "epoch": 2.7800925925925926, + "grad_norm": 0.7855920791625977, + "learning_rate": 4.259777463966686e-05, + "loss": 0.8277, + "step": 15615 + }, + { + "epoch": 2.7802706552706553, + "grad_norm": 0.8783130645751953, + "learning_rate": 4.258631351614786e-05, + "loss": 0.6995, + "step": 15616 + }, + { + "epoch": 2.780448717948718, + "grad_norm": 0.8904485106468201, + "learning_rate": 4.257485351753085e-05, + "loss": 0.8226, + "step": 15617 + }, + { + "epoch": 2.780626780626781, + "grad_norm": 0.8761011958122253, + "learning_rate": 4.2563394644040244e-05, + "loss": 0.9187, + "step": 15618 + }, + { + "epoch": 2.7808048433048436, + "grad_norm": 0.897404670715332, + "learning_rate": 4.255193689590067e-05, + "loss": 1.0234, + "step": 15619 + }, + { + "epoch": 2.780982905982906, + "grad_norm": 0.8966960906982422, + "learning_rate": 4.254048027333648e-05, + "loss": 0.9, + "step": 15620 + }, + { + "epoch": 2.7811609686609686, + "grad_norm": 0.7506237030029297, + "learning_rate": 4.2529024776572245e-05, + "loss": 0.8939, + "step": 15621 + }, + { + "epoch": 2.7813390313390314, + "grad_norm": 0.8073886036872864, + "learning_rate": 4.2517570405832396e-05, + "loss": 0.7779, + "step": 15622 + }, + { + "epoch": 2.781517094017094, + "grad_norm": 0.7928911447525024, + "learning_rate": 4.250611716134134e-05, + "loss": 0.8278, + "step": 15623 + }, + { + "epoch": 2.7816951566951564, + "grad_norm": 
0.7301982045173645, + "learning_rate": 4.249466504332349e-05, + "loss": 0.7515, + "step": 15624 + }, + { + "epoch": 2.781873219373219, + "grad_norm": 0.8215289115905762, + "learning_rate": 4.248321405200322e-05, + "loss": 0.9752, + "step": 15625 + }, + { + "epoch": 2.782051282051282, + "grad_norm": 0.8281431198120117, + "learning_rate": 4.247176418760486e-05, + "loss": 0.9625, + "step": 15626 + }, + { + "epoch": 2.7822293447293447, + "grad_norm": 0.9202759265899658, + "learning_rate": 4.246031545035283e-05, + "loss": 0.8757, + "step": 15627 + }, + { + "epoch": 2.7824074074074074, + "grad_norm": 0.8628471493721008, + "learning_rate": 4.244886784047133e-05, + "loss": 0.7626, + "step": 15628 + }, + { + "epoch": 2.78258547008547, + "grad_norm": 0.9345491528511047, + "learning_rate": 4.2437421358184747e-05, + "loss": 0.8714, + "step": 15629 + }, + { + "epoch": 2.782763532763533, + "grad_norm": 0.893713116645813, + "learning_rate": 4.2425976003717314e-05, + "loss": 0.9953, + "step": 15630 + }, + { + "epoch": 2.7829415954415957, + "grad_norm": 0.8794371485710144, + "learning_rate": 4.2414531777293286e-05, + "loss": 0.7899, + "step": 15631 + }, + { + "epoch": 2.783119658119658, + "grad_norm": 0.9003345370292664, + "learning_rate": 4.240308867913688e-05, + "loss": 0.9607, + "step": 15632 + }, + { + "epoch": 2.7832977207977208, + "grad_norm": 0.8352270126342773, + "learning_rate": 4.239164670947228e-05, + "loss": 1.0315, + "step": 15633 + }, + { + "epoch": 2.7834757834757835, + "grad_norm": 0.825252890586853, + "learning_rate": 4.238020586852375e-05, + "loss": 1.0493, + "step": 15634 + }, + { + "epoch": 2.7836538461538463, + "grad_norm": 1.1185758113861084, + "learning_rate": 4.2368766156515324e-05, + "loss": 0.8962, + "step": 15635 + }, + { + "epoch": 2.7838319088319086, + "grad_norm": 0.840336799621582, + "learning_rate": 4.235732757367125e-05, + "loss": 0.8289, + "step": 15636 + }, + { + "epoch": 2.7840099715099713, + "grad_norm": 0.9251887202262878, + "learning_rate": 
4.2345890120215595e-05, + "loss": 0.9306, + "step": 15637 + }, + { + "epoch": 2.784188034188034, + "grad_norm": 0.9645969867706299, + "learning_rate": 4.233445379637244e-05, + "loss": 0.8453, + "step": 15638 + }, + { + "epoch": 2.784366096866097, + "grad_norm": 0.9010009765625, + "learning_rate": 4.232301860236589e-05, + "loss": 0.9796, + "step": 15639 + }, + { + "epoch": 2.7845441595441596, + "grad_norm": 0.920427143573761, + "learning_rate": 4.231158453841998e-05, + "loss": 0.7905, + "step": 15640 + }, + { + "epoch": 2.7847222222222223, + "grad_norm": 0.8292316794395447, + "learning_rate": 4.2300151604758734e-05, + "loss": 0.938, + "step": 15641 + }, + { + "epoch": 2.784900284900285, + "grad_norm": 0.8550885319709778, + "learning_rate": 4.228871980160615e-05, + "loss": 0.7728, + "step": 15642 + }, + { + "epoch": 2.785078347578348, + "grad_norm": 0.8785567283630371, + "learning_rate": 4.227728912918617e-05, + "loss": 1.0367, + "step": 15643 + }, + { + "epoch": 2.78525641025641, + "grad_norm": 0.8732814788818359, + "learning_rate": 4.226585958772289e-05, + "loss": 0.9914, + "step": 15644 + }, + { + "epoch": 2.785434472934473, + "grad_norm": 0.9473167061805725, + "learning_rate": 4.225443117744008e-05, + "loss": 1.1311, + "step": 15645 + }, + { + "epoch": 2.7856125356125356, + "grad_norm": 0.8819913864135742, + "learning_rate": 4.224300389856177e-05, + "loss": 0.8798, + "step": 15646 + }, + { + "epoch": 2.7857905982905984, + "grad_norm": 0.835367739200592, + "learning_rate": 4.223157775131182e-05, + "loss": 0.6977, + "step": 15647 + }, + { + "epoch": 2.7859686609686607, + "grad_norm": 0.8122659921646118, + "learning_rate": 4.222015273591411e-05, + "loss": 0.9656, + "step": 15648 + }, + { + "epoch": 2.7861467236467234, + "grad_norm": 0.8085313439369202, + "learning_rate": 4.220872885259247e-05, + "loss": 0.7456, + "step": 15649 + }, + { + "epoch": 2.786324786324786, + "grad_norm": 0.681515097618103, + "learning_rate": 4.21973061015707e-05, + "loss": 0.5008, + "step": 
15650 + }, + { + "epoch": 2.786502849002849, + "grad_norm": 0.8021831512451172, + "learning_rate": 4.2185884483072676e-05, + "loss": 0.8954, + "step": 15651 + }, + { + "epoch": 2.7866809116809117, + "grad_norm": 0.9254723787307739, + "learning_rate": 4.217446399732216e-05, + "loss": 0.7855, + "step": 15652 + }, + { + "epoch": 2.7868589743589745, + "grad_norm": 0.8415037989616394, + "learning_rate": 4.2163044644542894e-05, + "loss": 0.8835, + "step": 15653 + }, + { + "epoch": 2.787037037037037, + "grad_norm": 0.9031959772109985, + "learning_rate": 4.2151626424958614e-05, + "loss": 1.0048, + "step": 15654 + }, + { + "epoch": 2.7872150997151, + "grad_norm": null, + "learning_rate": 4.2151626424958614e-05, + "loss": 0.9344, + "step": 15655 + }, + { + "epoch": 2.7873931623931623, + "grad_norm": 0.815680742263794, + "learning_rate": 4.214020933879306e-05, + "loss": 0.6897, + "step": 15656 + }, + { + "epoch": 2.787571225071225, + "grad_norm": 0.9080044627189636, + "learning_rate": 4.212879338626989e-05, + "loss": 1.0366, + "step": 15657 + }, + { + "epoch": 2.7877492877492878, + "grad_norm": 0.8387414813041687, + "learning_rate": 4.211737856761281e-05, + "loss": 0.9255, + "step": 15658 + }, + { + "epoch": 2.7879273504273505, + "grad_norm": 0.9269571304321289, + "learning_rate": 4.210596488304542e-05, + "loss": 0.8971, + "step": 15659 + }, + { + "epoch": 2.7881054131054133, + "grad_norm": 0.7987017035484314, + "learning_rate": 4.2094552332791456e-05, + "loss": 0.8293, + "step": 15660 + }, + { + "epoch": 2.7882834757834756, + "grad_norm": 0.8481683731079102, + "learning_rate": 4.208314091707437e-05, + "loss": 0.9159, + "step": 15661 + }, + { + "epoch": 2.7884615384615383, + "grad_norm": 0.944736897945404, + "learning_rate": 4.207173063611788e-05, + "loss": 0.9398, + "step": 15662 + }, + { + "epoch": 2.788639601139601, + "grad_norm": 0.8471882343292236, + "learning_rate": 4.206032149014547e-05, + "loss": 0.7534, + "step": 15663 + }, + { + "epoch": 2.788817663817664, + 
"grad_norm": 0.8500807881355286, + "learning_rate": 4.2048913479380714e-05, + "loss": 0.8874, + "step": 15664 + }, + { + "epoch": 2.7889957264957266, + "grad_norm": 0.7949451804161072, + "learning_rate": 4.2037506604047115e-05, + "loss": 0.8691, + "step": 15665 + }, + { + "epoch": 2.7891737891737893, + "grad_norm": 0.8587945103645325, + "learning_rate": 4.202610086436817e-05, + "loss": 0.8288, + "step": 15666 + }, + { + "epoch": 2.789351851851852, + "grad_norm": 0.9155020117759705, + "learning_rate": 4.201469626056734e-05, + "loss": 0.8384, + "step": 15667 + }, + { + "epoch": 2.7895299145299144, + "grad_norm": 0.9402222037315369, + "learning_rate": 4.200329279286809e-05, + "loss": 0.8239, + "step": 15668 + }, + { + "epoch": 2.789707977207977, + "grad_norm": 0.9111437201499939, + "learning_rate": 4.19918904614938e-05, + "loss": 0.9251, + "step": 15669 + }, + { + "epoch": 2.78988603988604, + "grad_norm": 0.9434856176376343, + "learning_rate": 4.198048926666795e-05, + "loss": 1.0517, + "step": 15670 + }, + { + "epoch": 2.7900641025641026, + "grad_norm": 0.9518313407897949, + "learning_rate": 4.1969089208613896e-05, + "loss": 0.8893, + "step": 15671 + }, + { + "epoch": 2.7902421652421654, + "grad_norm": 0.8107752799987793, + "learning_rate": 4.1957690287554986e-05, + "loss": 0.6548, + "step": 15672 + }, + { + "epoch": 2.7904202279202277, + "grad_norm": 0.8361678719520569, + "learning_rate": 4.1946292503714556e-05, + "loss": 0.9224, + "step": 15673 + }, + { + "epoch": 2.7905982905982905, + "grad_norm": 0.7812657952308655, + "learning_rate": 4.1934895857315904e-05, + "loss": 0.7126, + "step": 15674 + }, + { + "epoch": 2.790776353276353, + "grad_norm": 0.9054265022277832, + "learning_rate": 4.192350034858241e-05, + "loss": 0.6891, + "step": 15675 + }, + { + "epoch": 2.790954415954416, + "grad_norm": 0.9675585627555847, + "learning_rate": 4.1912105977737214e-05, + "loss": 0.8429, + "step": 15676 + }, + { + "epoch": 2.7911324786324787, + "grad_norm": 0.9077114462852478, + 
"learning_rate": 4.19007127450037e-05, + "loss": 0.8864, + "step": 15677 + }, + { + "epoch": 2.7913105413105415, + "grad_norm": 0.9230541586875916, + "learning_rate": 4.188932065060497e-05, + "loss": 1.0065, + "step": 15678 + }, + { + "epoch": 2.791488603988604, + "grad_norm": 0.8667981028556824, + "learning_rate": 4.1877929694764315e-05, + "loss": 0.7584, + "step": 15679 + }, + { + "epoch": 2.7916666666666665, + "grad_norm": 0.8986212015151978, + "learning_rate": 4.1866539877704894e-05, + "loss": 0.9205, + "step": 15680 + }, + { + "epoch": 2.7918447293447293, + "grad_norm": 0.8524685502052307, + "learning_rate": 4.185515119964986e-05, + "loss": 0.8516, + "step": 15681 + }, + { + "epoch": 2.792022792022792, + "grad_norm": 0.8247089385986328, + "learning_rate": 4.184376366082234e-05, + "loss": 0.8733, + "step": 15682 + }, + { + "epoch": 2.7922008547008548, + "grad_norm": 0.8236528635025024, + "learning_rate": 4.183237726144549e-05, + "loss": 0.8715, + "step": 15683 + }, + { + "epoch": 2.7923789173789175, + "grad_norm": 0.8853272199630737, + "learning_rate": 4.182099200174232e-05, + "loss": 0.7741, + "step": 15684 + }, + { + "epoch": 2.79255698005698, + "grad_norm": 0.8243789672851562, + "learning_rate": 4.180960788193603e-05, + "loss": 0.9196, + "step": 15685 + }, + { + "epoch": 2.7927350427350426, + "grad_norm": 0.9670386910438538, + "learning_rate": 4.1798224902249515e-05, + "loss": 0.828, + "step": 15686 + }, + { + "epoch": 2.7929131054131053, + "grad_norm": 0.7831283211708069, + "learning_rate": 4.178684306290592e-05, + "loss": 0.8389, + "step": 15687 + }, + { + "epoch": 2.793091168091168, + "grad_norm": 0.9372588396072388, + "learning_rate": 4.177546236412822e-05, + "loss": 1.2267, + "step": 15688 + }, + { + "epoch": 2.793269230769231, + "grad_norm": 0.9065600633621216, + "learning_rate": 4.176408280613937e-05, + "loss": 0.9674, + "step": 15689 + }, + { + "epoch": 2.7934472934472936, + "grad_norm": 0.8220530152320862, + "learning_rate": 4.1752704389162344e-05, 
+ "loss": 0.8717, + "step": 15690 + }, + { + "epoch": 2.7936253561253563, + "grad_norm": 0.8952174782752991, + "learning_rate": 4.174132711342005e-05, + "loss": 0.8904, + "step": 15691 + }, + { + "epoch": 2.7938034188034186, + "grad_norm": 0.8454076647758484, + "learning_rate": 4.172995097913549e-05, + "loss": 0.9784, + "step": 15692 + }, + { + "epoch": 2.7939814814814814, + "grad_norm": 0.8697866797447205, + "learning_rate": 4.171857598653143e-05, + "loss": 1.0042, + "step": 15693 + }, + { + "epoch": 2.794159544159544, + "grad_norm": 0.8736211657524109, + "learning_rate": 4.170720213583084e-05, + "loss": 0.8787, + "step": 15694 + }, + { + "epoch": 2.794337606837607, + "grad_norm": 1.0082578659057617, + "learning_rate": 4.1695829427256525e-05, + "loss": 1.2508, + "step": 15695 + }, + { + "epoch": 2.7945156695156697, + "grad_norm": 0.8092042207717896, + "learning_rate": 4.1684457861031325e-05, + "loss": 0.8968, + "step": 15696 + }, + { + "epoch": 2.794693732193732, + "grad_norm": 0.847034752368927, + "learning_rate": 4.167308743737802e-05, + "loss": 0.8019, + "step": 15697 + }, + { + "epoch": 2.7948717948717947, + "grad_norm": 0.9059078097343445, + "learning_rate": 4.1661718156519414e-05, + "loss": 1.0393, + "step": 15698 + }, + { + "epoch": 2.7950498575498575, + "grad_norm": 0.8907228112220764, + "learning_rate": 4.165035001867822e-05, + "loss": 0.7388, + "step": 15699 + }, + { + "epoch": 2.79522792022792, + "grad_norm": 0.8089052438735962, + "learning_rate": 4.163898302407727e-05, + "loss": 0.8052, + "step": 15700 + }, + { + "epoch": 2.795405982905983, + "grad_norm": 1.1158883571624756, + "learning_rate": 4.162761717293915e-05, + "loss": 0.6923, + "step": 15701 + }, + { + "epoch": 2.7955840455840457, + "grad_norm": 0.8157755732536316, + "learning_rate": 4.1616252465486684e-05, + "loss": 0.606, + "step": 15702 + }, + { + "epoch": 2.7957621082621085, + "grad_norm": 0.8905386328697205, + "learning_rate": 4.1604888901942386e-05, + "loss": 0.7798, + "step": 15703 + }, 
+ { + "epoch": 2.7959401709401708, + "grad_norm": 0.7655990719795227, + "learning_rate": 4.1593526482529034e-05, + "loss": 0.7435, + "step": 15704 + }, + { + "epoch": 2.7961182336182335, + "grad_norm": 0.900643527507782, + "learning_rate": 4.1582165207469195e-05, + "loss": 0.8314, + "step": 15705 + }, + { + "epoch": 2.7962962962962963, + "grad_norm": 0.7737550735473633, + "learning_rate": 4.1570805076985475e-05, + "loss": 0.8281, + "step": 15706 + }, + { + "epoch": 2.796474358974359, + "grad_norm": 0.8385021090507507, + "learning_rate": 4.1559446091300455e-05, + "loss": 0.896, + "step": 15707 + }, + { + "epoch": 2.796652421652422, + "grad_norm": 0.8830214142799377, + "learning_rate": 4.1548088250636687e-05, + "loss": 0.8856, + "step": 15708 + }, + { + "epoch": 2.796830484330484, + "grad_norm": 0.9748533368110657, + "learning_rate": 4.1536731555216676e-05, + "loss": 0.8768, + "step": 15709 + }, + { + "epoch": 2.797008547008547, + "grad_norm": 0.8918380737304688, + "learning_rate": 4.1525376005263e-05, + "loss": 0.8332, + "step": 15710 + }, + { + "epoch": 2.7971866096866096, + "grad_norm": 0.9205654263496399, + "learning_rate": 4.15140216009981e-05, + "loss": 0.7698, + "step": 15711 + }, + { + "epoch": 2.7973646723646723, + "grad_norm": 0.9631472229957581, + "learning_rate": 4.1502668342644455e-05, + "loss": 0.9604, + "step": 15712 + }, + { + "epoch": 2.797542735042735, + "grad_norm": 0.8770546913146973, + "learning_rate": 4.1491316230424516e-05, + "loss": 0.7661, + "step": 15713 + }, + { + "epoch": 2.797720797720798, + "grad_norm": 0.8872628808021545, + "learning_rate": 4.147996526456069e-05, + "loss": 1.0847, + "step": 15714 + }, + { + "epoch": 2.7978988603988606, + "grad_norm": 0.8924010396003723, + "learning_rate": 4.146861544527538e-05, + "loss": 0.8159, + "step": 15715 + }, + { + "epoch": 2.7980769230769234, + "grad_norm": 0.9251703023910522, + "learning_rate": 4.1457266772790923e-05, + "loss": 0.7204, + "step": 15716 + }, + { + "epoch": 2.7982549857549857, + 
"grad_norm": 0.8891414403915405, + "learning_rate": 4.144591924732979e-05, + "loss": 0.8576, + "step": 15717 + }, + { + "epoch": 2.7984330484330484, + "grad_norm": 0.9676079154014587, + "learning_rate": 4.143457286911415e-05, + "loss": 1.1912, + "step": 15718 + }, + { + "epoch": 2.798611111111111, + "grad_norm": 0.8125061392784119, + "learning_rate": 4.142322763836645e-05, + "loss": 0.8111, + "step": 15719 + }, + { + "epoch": 2.798789173789174, + "grad_norm": 0.8612900972366333, + "learning_rate": 4.141188355530891e-05, + "loss": 0.7537, + "step": 15720 + }, + { + "epoch": 2.798967236467236, + "grad_norm": 1.0774086713790894, + "learning_rate": 4.14005406201638e-05, + "loss": 0.9364, + "step": 15721 + }, + { + "epoch": 2.799145299145299, + "grad_norm": 0.8296873569488525, + "learning_rate": 4.138919883315338e-05, + "loss": 0.9329, + "step": 15722 + }, + { + "epoch": 2.7993233618233617, + "grad_norm": 0.757978618144989, + "learning_rate": 4.137785819449984e-05, + "loss": 1.0353, + "step": 15723 + }, + { + "epoch": 2.7995014245014245, + "grad_norm": 0.8584328293800354, + "learning_rate": 4.136651870442536e-05, + "loss": 1.0577, + "step": 15724 + }, + { + "epoch": 2.7996794871794872, + "grad_norm": 0.7919153571128845, + "learning_rate": 4.135518036315222e-05, + "loss": 0.8013, + "step": 15725 + }, + { + "epoch": 2.79985754985755, + "grad_norm": 0.968519926071167, + "learning_rate": 4.134384317090243e-05, + "loss": 0.7684, + "step": 15726 + }, + { + "epoch": 2.8000356125356127, + "grad_norm": 0.8565614819526672, + "learning_rate": 4.133250712789826e-05, + "loss": 0.9177, + "step": 15727 + }, + { + "epoch": 2.8002136752136755, + "grad_norm": 0.8614934086799622, + "learning_rate": 4.1321172234361647e-05, + "loss": 0.9613, + "step": 15728 + }, + { + "epoch": 2.800391737891738, + "grad_norm": 0.8621053099632263, + "learning_rate": 4.130983849051483e-05, + "loss": 0.8254, + "step": 15729 + }, + { + "epoch": 2.8005698005698005, + "grad_norm": 0.8108318448066711, + 
"learning_rate": 4.12985058965798e-05, + "loss": 0.7577, + "step": 15730 + }, + { + "epoch": 2.8007478632478633, + "grad_norm": 0.9211961627006531, + "learning_rate": 4.1287174452778564e-05, + "loss": 0.8204, + "step": 15731 + }, + { + "epoch": 2.800925925925926, + "grad_norm": 0.8582359552383423, + "learning_rate": 4.127584415933326e-05, + "loss": 0.8915, + "step": 15732 + }, + { + "epoch": 2.8011039886039883, + "grad_norm": 0.9122742414474487, + "learning_rate": 4.126451501646573e-05, + "loss": 1.0896, + "step": 15733 + }, + { + "epoch": 2.801282051282051, + "grad_norm": 0.7392016649246216, + "learning_rate": 4.125318702439804e-05, + "loss": 0.6354, + "step": 15734 + }, + { + "epoch": 2.801460113960114, + "grad_norm": 0.8227471113204956, + "learning_rate": 4.124186018335213e-05, + "loss": 0.7903, + "step": 15735 + }, + { + "epoch": 2.8016381766381766, + "grad_norm": 0.8843638300895691, + "learning_rate": 4.12305344935499e-05, + "loss": 0.7545, + "step": 15736 + }, + { + "epoch": 2.8018162393162394, + "grad_norm": 0.9978471994400024, + "learning_rate": 4.121920995521327e-05, + "loss": 1.0127, + "step": 15737 + }, + { + "epoch": 2.801994301994302, + "grad_norm": 0.9969626069068909, + "learning_rate": 4.12078865685641e-05, + "loss": 0.8137, + "step": 15738 + }, + { + "epoch": 2.802172364672365, + "grad_norm": 1.0768957138061523, + "learning_rate": 4.119656433382428e-05, + "loss": 0.6866, + "step": 15739 + }, + { + "epoch": 2.8023504273504276, + "grad_norm": 0.7401831746101379, + "learning_rate": 4.1185243251215624e-05, + "loss": 0.9103, + "step": 15740 + }, + { + "epoch": 2.80252849002849, + "grad_norm": 0.9753470420837402, + "learning_rate": 4.1173923320959905e-05, + "loss": 0.9499, + "step": 15741 + }, + { + "epoch": 2.8027065527065527, + "grad_norm": 0.9174960851669312, + "learning_rate": 4.116260454327904e-05, + "loss": 1.0355, + "step": 15742 + }, + { + "epoch": 2.8028846153846154, + "grad_norm": 0.8292258381843567, + "learning_rate": 4.115128691839464e-05, + 
"loss": 0.8806, + "step": 15743 + }, + { + "epoch": 2.803062678062678, + "grad_norm": 0.9542452096939087, + "learning_rate": 4.1139970446528564e-05, + "loss": 0.8378, + "step": 15744 + }, + { + "epoch": 2.8032407407407405, + "grad_norm": 0.848686933517456, + "learning_rate": 4.1128655127902485e-05, + "loss": 0.7939, + "step": 15745 + }, + { + "epoch": 2.8034188034188032, + "grad_norm": 0.8277645111083984, + "learning_rate": 4.1117340962738125e-05, + "loss": 0.9277, + "step": 15746 + }, + { + "epoch": 2.803596866096866, + "grad_norm": 0.8613318204879761, + "learning_rate": 4.110602795125714e-05, + "loss": 0.7622, + "step": 15747 + }, + { + "epoch": 2.8037749287749287, + "grad_norm": 0.7106199860572815, + "learning_rate": 4.109471609368121e-05, + "loss": 0.787, + "step": 15748 + }, + { + "epoch": 2.8039529914529915, + "grad_norm": 0.8933543562889099, + "learning_rate": 4.108340539023194e-05, + "loss": 0.857, + "step": 15749 + }, + { + "epoch": 2.8041310541310542, + "grad_norm": 0.8682022094726562, + "learning_rate": 4.107209584113092e-05, + "loss": 0.7931, + "step": 15750 + }, + { + "epoch": 2.804309116809117, + "grad_norm": 0.828279435634613, + "learning_rate": 4.106078744659981e-05, + "loss": 0.8404, + "step": 15751 + }, + { + "epoch": 2.8044871794871797, + "grad_norm": 0.9503956437110901, + "learning_rate": 4.1049480206860136e-05, + "loss": 0.8588, + "step": 15752 + }, + { + "epoch": 2.804665242165242, + "grad_norm": 0.810714602470398, + "learning_rate": 4.1038174122133435e-05, + "loss": 0.8592, + "step": 15753 + }, + { + "epoch": 2.804843304843305, + "grad_norm": 0.9458156824111938, + "learning_rate": 4.1026869192641225e-05, + "loss": 0.7441, + "step": 15754 + }, + { + "epoch": 2.8050213675213675, + "grad_norm": 0.8267046809196472, + "learning_rate": 4.1015565418605016e-05, + "loss": 0.8574, + "step": 15755 + }, + { + "epoch": 2.8051994301994303, + "grad_norm": 0.8413352370262146, + "learning_rate": 4.100426280024623e-05, + "loss": 0.7598, + "step": 15756 + }, + 
{ + "epoch": 2.8053774928774926, + "grad_norm": 0.9205049872398376, + "learning_rate": 4.099296133778644e-05, + "loss": 0.8346, + "step": 15757 + }, + { + "epoch": 2.8055555555555554, + "grad_norm": 0.7986966967582703, + "learning_rate": 4.098166103144692e-05, + "loss": 0.7607, + "step": 15758 + }, + { + "epoch": 2.805733618233618, + "grad_norm": 0.9400181174278259, + "learning_rate": 4.097036188144918e-05, + "loss": 0.8947, + "step": 15759 + }, + { + "epoch": 2.805911680911681, + "grad_norm": 0.9014864563941956, + "learning_rate": 4.0959063888014594e-05, + "loss": 0.7781, + "step": 15760 + }, + { + "epoch": 2.8060897435897436, + "grad_norm": 0.8163666725158691, + "learning_rate": 4.094776705136448e-05, + "loss": 0.9042, + "step": 15761 + }, + { + "epoch": 2.8062678062678064, + "grad_norm": 0.8435617685317993, + "learning_rate": 4.0936471371720217e-05, + "loss": 0.9229, + "step": 15762 + }, + { + "epoch": 2.806445868945869, + "grad_norm": 0.7995414137840271, + "learning_rate": 4.0925176849303104e-05, + "loss": 0.7294, + "step": 15763 + }, + { + "epoch": 2.806623931623932, + "grad_norm": 0.9147883057594299, + "learning_rate": 4.091388348433442e-05, + "loss": 0.955, + "step": 15764 + }, + { + "epoch": 2.806801994301994, + "grad_norm": 0.7326688170433044, + "learning_rate": 4.0902591277035474e-05, + "loss": 0.5885, + "step": 15765 + }, + { + "epoch": 2.806980056980057, + "grad_norm": 0.8756957054138184, + "learning_rate": 4.0891300227627425e-05, + "loss": 0.9404, + "step": 15766 + }, + { + "epoch": 2.8071581196581197, + "grad_norm": 0.8897661566734314, + "learning_rate": 4.088001033633165e-05, + "loss": 1.0214, + "step": 15767 + }, + { + "epoch": 2.8073361823361824, + "grad_norm": 0.8007500171661377, + "learning_rate": 4.086872160336919e-05, + "loss": 0.6553, + "step": 15768 + }, + { + "epoch": 2.807514245014245, + "grad_norm": 0.8175814747810364, + "learning_rate": 4.0857434028961316e-05, + "loss": 0.8572, + "step": 15769 + }, + { + "epoch": 2.8076923076923075, + 
"grad_norm": 0.8290224671363831, + "learning_rate": 4.084614761332917e-05, + "loss": 0.9196, + "step": 15770 + }, + { + "epoch": 2.8078703703703702, + "grad_norm": 0.9355306625366211, + "learning_rate": 4.083486235669385e-05, + "loss": 0.8198, + "step": 15771 + }, + { + "epoch": 2.808048433048433, + "grad_norm": 0.942641019821167, + "learning_rate": 4.082357825927656e-05, + "loss": 0.7817, + "step": 15772 + }, + { + "epoch": 2.8082264957264957, + "grad_norm": 0.9115625023841858, + "learning_rate": 4.081229532129827e-05, + "loss": 1.046, + "step": 15773 + }, + { + "epoch": 2.8084045584045585, + "grad_norm": 0.8559226393699646, + "learning_rate": 4.080101354298016e-05, + "loss": 0.7085, + "step": 15774 + }, + { + "epoch": 2.8085826210826212, + "grad_norm": 0.8223599195480347, + "learning_rate": 4.0789732924543144e-05, + "loss": 0.9019, + "step": 15775 + }, + { + "epoch": 2.808760683760684, + "grad_norm": 0.8511637449264526, + "learning_rate": 4.0778453466208344e-05, + "loss": 0.6845, + "step": 15776 + }, + { + "epoch": 2.8089387464387463, + "grad_norm": 0.9633409976959229, + "learning_rate": 4.076717516819674e-05, + "loss": 1.0478, + "step": 15777 + }, + { + "epoch": 2.809116809116809, + "grad_norm": 0.8657141327857971, + "learning_rate": 4.075589803072928e-05, + "loss": 0.8694, + "step": 15778 + }, + { + "epoch": 2.809294871794872, + "grad_norm": 0.8126863241195679, + "learning_rate": 4.0744622054026936e-05, + "loss": 0.8529, + "step": 15779 + }, + { + "epoch": 2.8094729344729346, + "grad_norm": 0.8671838045120239, + "learning_rate": 4.0733347238310626e-05, + "loss": 0.81, + "step": 15780 + }, + { + "epoch": 2.8096509971509973, + "grad_norm": 0.8336054086685181, + "learning_rate": 4.0722073583801223e-05, + "loss": 0.7103, + "step": 15781 + }, + { + "epoch": 2.8098290598290596, + "grad_norm": 0.7833893299102783, + "learning_rate": 4.071080109071973e-05, + "loss": 0.875, + "step": 15782 + }, + { + "epoch": 2.8100071225071224, + "grad_norm": 0.9142106771469116, + 
"learning_rate": 4.0699529759286844e-05, + "loss": 0.9063, + "step": 15783 + }, + { + "epoch": 2.810185185185185, + "grad_norm": 0.7767373919487, + "learning_rate": 4.0688259589723565e-05, + "loss": 0.841, + "step": 15784 + }, + { + "epoch": 2.810363247863248, + "grad_norm": 0.8679327368736267, + "learning_rate": 4.067699058225056e-05, + "loss": 0.8581, + "step": 15785 + }, + { + "epoch": 2.8105413105413106, + "grad_norm": 0.9434911608695984, + "learning_rate": 4.066572273708873e-05, + "loss": 1.0166, + "step": 15786 + }, + { + "epoch": 2.8107193732193734, + "grad_norm": 0.8053399920463562, + "learning_rate": 4.06544560544588e-05, + "loss": 0.874, + "step": 15787 + }, + { + "epoch": 2.810897435897436, + "grad_norm": 1.0207599401474, + "learning_rate": 4.0643190534581524e-05, + "loss": 0.8296, + "step": 15788 + }, + { + "epoch": 2.8110754985754984, + "grad_norm": 0.8879590034484863, + "learning_rate": 4.0631926177677626e-05, + "loss": 0.8787, + "step": 15789 + }, + { + "epoch": 2.811253561253561, + "grad_norm": 1.0816758871078491, + "learning_rate": 4.062066298396778e-05, + "loss": 0.8129, + "step": 15790 + }, + { + "epoch": 2.811431623931624, + "grad_norm": 0.8332961797714233, + "learning_rate": 4.060940095367272e-05, + "loss": 0.8855, + "step": 15791 + }, + { + "epoch": 2.8116096866096867, + "grad_norm": 0.98028963804245, + "learning_rate": 4.059814008701308e-05, + "loss": 1.0065, + "step": 15792 + }, + { + "epoch": 2.8117877492877494, + "grad_norm": 0.8983020782470703, + "learning_rate": 4.058688038420949e-05, + "loss": 0.8259, + "step": 15793 + }, + { + "epoch": 2.8119658119658117, + "grad_norm": 0.8029065728187561, + "learning_rate": 4.057562184548255e-05, + "loss": 0.7639, + "step": 15794 + }, + { + "epoch": 2.8121438746438745, + "grad_norm": 0.8188722729682922, + "learning_rate": 4.056436447105286e-05, + "loss": 0.7179, + "step": 15795 + }, + { + "epoch": 2.8123219373219372, + "grad_norm": 0.8493495583534241, + "learning_rate": 4.055310826114095e-05, + 
"loss": 0.9479, + "step": 15796 + }, + { + "epoch": 2.8125, + "grad_norm": 0.7770833373069763, + "learning_rate": 4.0541853215967466e-05, + "loss": 0.6915, + "step": 15797 + }, + { + "epoch": 2.8126780626780628, + "grad_norm": 0.8238499760627747, + "learning_rate": 4.053059933575277e-05, + "loss": 1.0257, + "step": 15798 + }, + { + "epoch": 2.8128561253561255, + "grad_norm": 0.8537020683288574, + "learning_rate": 4.051934662071754e-05, + "loss": 0.7037, + "step": 15799 + }, + { + "epoch": 2.8130341880341883, + "grad_norm": 0.8120609521865845, + "learning_rate": 4.0508095071082055e-05, + "loss": 0.9533, + "step": 15800 + }, + { + "epoch": 2.8132122507122506, + "grad_norm": 0.8701691627502441, + "learning_rate": 4.0496844687066915e-05, + "loss": 0.7129, + "step": 15801 + }, + { + "epoch": 2.8133903133903133, + "grad_norm": 0.9007154107093811, + "learning_rate": 4.048559546889249e-05, + "loss": 0.8879, + "step": 15802 + }, + { + "epoch": 2.813568376068376, + "grad_norm": 0.8858364820480347, + "learning_rate": 4.047434741677919e-05, + "loss": 0.9391, + "step": 15803 + }, + { + "epoch": 2.813746438746439, + "grad_norm": 0.8597678542137146, + "learning_rate": 4.046310053094739e-05, + "loss": 0.7897, + "step": 15804 + }, + { + "epoch": 2.8139245014245016, + "grad_norm": 0.8493016362190247, + "learning_rate": 4.0451854811617475e-05, + "loss": 0.914, + "step": 15805 + }, + { + "epoch": 2.814102564102564, + "grad_norm": 0.8347373008728027, + "learning_rate": 4.044061025900973e-05, + "loss": 0.671, + "step": 15806 + }, + { + "epoch": 2.8142806267806266, + "grad_norm": 0.7406242489814758, + "learning_rate": 4.042936687334455e-05, + "loss": 0.6592, + "step": 15807 + }, + { + "epoch": 2.8144586894586894, + "grad_norm": 0.93736732006073, + "learning_rate": 4.041812465484214e-05, + "loss": 0.8301, + "step": 15808 + }, + { + "epoch": 2.814636752136752, + "grad_norm": 0.8744384050369263, + "learning_rate": 4.040688360372282e-05, + "loss": 0.9104, + "step": 15809 + }, + { + "epoch": 
2.814814814814815, + "grad_norm": 0.7417266964912415, + "learning_rate": 4.0395643720206834e-05, + "loss": 0.7698, + "step": 15810 + }, + { + "epoch": 2.8149928774928776, + "grad_norm": 0.8601716160774231, + "learning_rate": 4.038440500451438e-05, + "loss": 0.8459, + "step": 15811 + }, + { + "epoch": 2.8151709401709404, + "grad_norm": 0.9801309108734131, + "learning_rate": 4.037316745686568e-05, + "loss": 0.9555, + "step": 15812 + }, + { + "epoch": 2.8153490028490027, + "grad_norm": 0.8559770584106445, + "learning_rate": 4.036193107748085e-05, + "loss": 0.9912, + "step": 15813 + }, + { + "epoch": 2.8155270655270654, + "grad_norm": 0.8155802488327026, + "learning_rate": 4.035069586658018e-05, + "loss": 0.8471, + "step": 15814 + }, + { + "epoch": 2.815705128205128, + "grad_norm": 0.96893310546875, + "learning_rate": 4.033946182438364e-05, + "loss": 0.9329, + "step": 15815 + }, + { + "epoch": 2.815883190883191, + "grad_norm": 0.9981120228767395, + "learning_rate": 4.032822895111144e-05, + "loss": 1.0333, + "step": 15816 + }, + { + "epoch": 2.8160612535612537, + "grad_norm": 0.9491816163063049, + "learning_rate": 4.031699724698363e-05, + "loss": 0.8838, + "step": 15817 + }, + { + "epoch": 2.816239316239316, + "grad_norm": 0.7149206399917603, + "learning_rate": 4.030576671222029e-05, + "loss": 0.8248, + "step": 15818 + }, + { + "epoch": 2.8164173789173788, + "grad_norm": 0.8366861343383789, + "learning_rate": 4.029453734704145e-05, + "loss": 0.8041, + "step": 15819 + }, + { + "epoch": 2.8165954415954415, + "grad_norm": 0.8147984147071838, + "learning_rate": 4.0283309151667116e-05, + "loss": 0.8702, + "step": 15820 + }, + { + "epoch": 2.8167735042735043, + "grad_norm": 0.8462722897529602, + "learning_rate": 4.027208212631729e-05, + "loss": 0.9899, + "step": 15821 + }, + { + "epoch": 2.816951566951567, + "grad_norm": 0.9278651475906372, + "learning_rate": 4.0260856271211946e-05, + "loss": 0.8684, + "step": 15822 + }, + { + "epoch": 2.8171296296296298, + "grad_norm": 
0.8845569491386414, + "learning_rate": 4.0249631586570993e-05, + "loss": 0.6956, + "step": 15823 + }, + { + "epoch": 2.8173076923076925, + "grad_norm": 0.7803958654403687, + "learning_rate": 4.0238408072614453e-05, + "loss": 0.758, + "step": 15824 + }, + { + "epoch": 2.817485754985755, + "grad_norm": 0.8920331597328186, + "learning_rate": 4.022718572956209e-05, + "loss": 0.9131, + "step": 15825 + }, + { + "epoch": 2.8176638176638176, + "grad_norm": 0.9914098978042603, + "learning_rate": 4.021596455763389e-05, + "loss": 0.973, + "step": 15826 + }, + { + "epoch": 2.8178418803418803, + "grad_norm": 0.8329834938049316, + "learning_rate": 4.020474455704968e-05, + "loss": 0.8746, + "step": 15827 + }, + { + "epoch": 2.818019943019943, + "grad_norm": 0.7789189219474792, + "learning_rate": 4.019352572802928e-05, + "loss": 0.7359, + "step": 15828 + }, + { + "epoch": 2.818198005698006, + "grad_norm": 0.8405792713165283, + "learning_rate": 4.0182308070792505e-05, + "loss": 0.7897, + "step": 15829 + }, + { + "epoch": 2.818376068376068, + "grad_norm": 0.9215472936630249, + "learning_rate": 4.0171091585559116e-05, + "loss": 0.9017, + "step": 15830 + }, + { + "epoch": 2.818554131054131, + "grad_norm": 0.8310655355453491, + "learning_rate": 4.0159876272548933e-05, + "loss": 0.7499, + "step": 15831 + }, + { + "epoch": 2.8187321937321936, + "grad_norm": 0.8645792603492737, + "learning_rate": 4.014866213198167e-05, + "loss": 0.9009, + "step": 15832 + }, + { + "epoch": 2.8189102564102564, + "grad_norm": 0.8059788942337036, + "learning_rate": 4.013744916407703e-05, + "loss": 0.7367, + "step": 15833 + }, + { + "epoch": 2.819088319088319, + "grad_norm": 0.7990314960479736, + "learning_rate": 4.0126237369054745e-05, + "loss": 1.0172, + "step": 15834 + }, + { + "epoch": 2.819266381766382, + "grad_norm": 0.9321257472038269, + "learning_rate": 4.0115026747134446e-05, + "loss": 1.1224, + "step": 15835 + }, + { + "epoch": 2.8194444444444446, + "grad_norm": 0.9674378037452698, + "learning_rate": 
4.0103817298535794e-05, + "loss": 0.9881, + "step": 15836 + }, + { + "epoch": 2.8196225071225074, + "grad_norm": 0.8573030829429626, + "learning_rate": 4.009260902347842e-05, + "loss": 0.7515, + "step": 15837 + }, + { + "epoch": 2.8198005698005697, + "grad_norm": 0.9248984456062317, + "learning_rate": 4.00814019221819e-05, + "loss": 0.9265, + "step": 15838 + }, + { + "epoch": 2.8199786324786325, + "grad_norm": 0.8749010562896729, + "learning_rate": 4.00701959948659e-05, + "loss": 0.8659, + "step": 15839 + }, + { + "epoch": 2.820156695156695, + "grad_norm": 0.8070803880691528, + "learning_rate": 4.005899124174986e-05, + "loss": 0.8813, + "step": 15840 + }, + { + "epoch": 2.820334757834758, + "grad_norm": 1.0711981058120728, + "learning_rate": 4.004778766305339e-05, + "loss": 1.019, + "step": 15841 + }, + { + "epoch": 2.8205128205128203, + "grad_norm": 0.7646795511245728, + "learning_rate": 4.0036585258995985e-05, + "loss": 0.8326, + "step": 15842 + }, + { + "epoch": 2.820690883190883, + "grad_norm": 0.6720184683799744, + "learning_rate": 4.002538402979713e-05, + "loss": 0.6642, + "step": 15843 + }, + { + "epoch": 2.8208689458689458, + "grad_norm": 0.8062998056411743, + "learning_rate": 4.001418397567629e-05, + "loss": 0.9585, + "step": 15844 + }, + { + "epoch": 2.8210470085470085, + "grad_norm": 0.835515558719635, + "learning_rate": 4.0002985096852893e-05, + "loss": 0.7992, + "step": 15845 + }, + { + "epoch": 2.8212250712250713, + "grad_norm": 0.8308731913566589, + "learning_rate": 3.9991787393546385e-05, + "loss": 0.9982, + "step": 15846 + }, + { + "epoch": 2.821403133903134, + "grad_norm": 0.9056837558746338, + "learning_rate": 3.998059086597614e-05, + "loss": 0.8283, + "step": 15847 + }, + { + "epoch": 2.8215811965811968, + "grad_norm": 0.853070080280304, + "learning_rate": 3.9969395514361506e-05, + "loss": 1.033, + "step": 15848 + }, + { + "epoch": 2.8217592592592595, + "grad_norm": 0.9179061055183411, + "learning_rate": 3.99582013389219e-05, + "loss": 0.929, + 
"step": 15849 + }, + { + "epoch": 2.821937321937322, + "grad_norm": 0.8702627420425415, + "learning_rate": 3.9947008339876616e-05, + "loss": 0.9994, + "step": 15850 + }, + { + "epoch": 2.8221153846153846, + "grad_norm": 0.9594024419784546, + "learning_rate": 3.9935816517444935e-05, + "loss": 0.88, + "step": 15851 + }, + { + "epoch": 2.8222934472934473, + "grad_norm": 0.8474575877189636, + "learning_rate": 3.992462587184618e-05, + "loss": 0.7817, + "step": 15852 + }, + { + "epoch": 2.82247150997151, + "grad_norm": 0.7588878870010376, + "learning_rate": 3.9913436403299533e-05, + "loss": 0.7911, + "step": 15853 + }, + { + "epoch": 2.8226495726495724, + "grad_norm": 0.8467457890510559, + "learning_rate": 3.9902248112024366e-05, + "loss": 0.7433, + "step": 15854 + }, + { + "epoch": 2.822827635327635, + "grad_norm": 0.839137077331543, + "learning_rate": 3.989106099823972e-05, + "loss": 0.7303, + "step": 15855 + }, + { + "epoch": 2.823005698005698, + "grad_norm": 0.8945586085319519, + "learning_rate": 3.987987506216495e-05, + "loss": 0.6697, + "step": 15856 + }, + { + "epoch": 2.8231837606837606, + "grad_norm": 0.749971330165863, + "learning_rate": 3.9868690304019064e-05, + "loss": 0.65, + "step": 15857 + }, + { + "epoch": 2.8233618233618234, + "grad_norm": 0.9841105341911316, + "learning_rate": 3.985750672402131e-05, + "loss": 0.8382, + "step": 15858 + }, + { + "epoch": 2.823539886039886, + "grad_norm": 0.8696077466011047, + "learning_rate": 3.984632432239078e-05, + "loss": 0.7895, + "step": 15859 + }, + { + "epoch": 2.823717948717949, + "grad_norm": 0.7845979928970337, + "learning_rate": 3.9835143099346575e-05, + "loss": 0.8673, + "step": 15860 + }, + { + "epoch": 2.8238960113960117, + "grad_norm": 0.8145211338996887, + "learning_rate": 3.982396305510775e-05, + "loss": 0.905, + "step": 15861 + }, + { + "epoch": 2.824074074074074, + "grad_norm": 0.8533337116241455, + "learning_rate": 3.981278418989336e-05, + "loss": 0.9597, + "step": 15862 + }, + { + "epoch": 
2.8242521367521367, + "grad_norm": 0.9430350065231323, + "learning_rate": 3.980160650392241e-05, + "loss": 0.9043, + "step": 15863 + }, + { + "epoch": 2.8244301994301995, + "grad_norm": 0.7469115257263184, + "learning_rate": 3.9790429997414e-05, + "loss": 0.9074, + "step": 15864 + }, + { + "epoch": 2.824608262108262, + "grad_norm": 0.8558746576309204, + "learning_rate": 3.977925467058696e-05, + "loss": 0.8958, + "step": 15865 + }, + { + "epoch": 2.8247863247863245, + "grad_norm": 1.1557669639587402, + "learning_rate": 3.976808052366037e-05, + "loss": 0.7884, + "step": 15866 + }, + { + "epoch": 2.8249643874643873, + "grad_norm": 0.8448477983474731, + "learning_rate": 3.975690755685312e-05, + "loss": 0.9542, + "step": 15867 + }, + { + "epoch": 2.82514245014245, + "grad_norm": 0.833256721496582, + "learning_rate": 3.9745735770384086e-05, + "loss": 0.9196, + "step": 15868 + }, + { + "epoch": 2.8253205128205128, + "grad_norm": 0.9318852424621582, + "learning_rate": 3.973456516447226e-05, + "loss": 0.7792, + "step": 15869 + }, + { + "epoch": 2.8254985754985755, + "grad_norm": 0.832655131816864, + "learning_rate": 3.972339573933638e-05, + "loss": 0.9526, + "step": 15870 + }, + { + "epoch": 2.8256766381766383, + "grad_norm": 0.7546842694282532, + "learning_rate": 3.9712227495195406e-05, + "loss": 0.5969, + "step": 15871 + }, + { + "epoch": 2.825854700854701, + "grad_norm": 0.8538267016410828, + "learning_rate": 3.970106043226802e-05, + "loss": 0.8369, + "step": 15872 + }, + { + "epoch": 2.826032763532764, + "grad_norm": 0.8023465871810913, + "learning_rate": 3.968989455077314e-05, + "loss": 0.7257, + "step": 15873 + }, + { + "epoch": 2.826210826210826, + "grad_norm": 0.7905409932136536, + "learning_rate": 3.96787298509295e-05, + "loss": 0.7965, + "step": 15874 + }, + { + "epoch": 2.826388888888889, + "grad_norm": 0.8316642642021179, + "learning_rate": 3.966756633295583e-05, + "loss": 0.808, + "step": 15875 + }, + { + "epoch": 2.8265669515669516, + "grad_norm": 
0.9130233526229858, + "learning_rate": 3.965640399707088e-05, + "loss": 0.882, + "step": 15876 + }, + { + "epoch": 2.8267450142450143, + "grad_norm": 0.873147189617157, + "learning_rate": 3.9645242843493325e-05, + "loss": 0.8347, + "step": 15877 + }, + { + "epoch": 2.8269230769230766, + "grad_norm": 0.8806825280189514, + "learning_rate": 3.963408287244183e-05, + "loss": 1.0496, + "step": 15878 + }, + { + "epoch": 2.8271011396011394, + "grad_norm": 0.8899962306022644, + "learning_rate": 3.962292408413516e-05, + "loss": 1.0014, + "step": 15879 + }, + { + "epoch": 2.827279202279202, + "grad_norm": 0.900303065776825, + "learning_rate": 3.961176647879179e-05, + "loss": 0.7345, + "step": 15880 + }, + { + "epoch": 2.827457264957265, + "grad_norm": 1.1055848598480225, + "learning_rate": 3.960061005663049e-05, + "loss": 0.9697, + "step": 15881 + }, + { + "epoch": 2.8276353276353277, + "grad_norm": 0.891404390335083, + "learning_rate": 3.958945481786969e-05, + "loss": 0.8878, + "step": 15882 + }, + { + "epoch": 2.8278133903133904, + "grad_norm": 0.9403249025344849, + "learning_rate": 3.957830076272807e-05, + "loss": 0.9536, + "step": 15883 + }, + { + "epoch": 2.827991452991453, + "grad_norm": 0.8735896944999695, + "learning_rate": 3.9567147891424126e-05, + "loss": 0.9113, + "step": 15884 + }, + { + "epoch": 2.828169515669516, + "grad_norm": 0.7758442759513855, + "learning_rate": 3.9555996204176385e-05, + "loss": 0.7336, + "step": 15885 + }, + { + "epoch": 2.828347578347578, + "grad_norm": 0.8632972836494446, + "learning_rate": 3.9544845701203335e-05, + "loss": 0.6883, + "step": 15886 + }, + { + "epoch": 2.828525641025641, + "grad_norm": 0.8639035224914551, + "learning_rate": 3.953369638272345e-05, + "loss": 0.8836, + "step": 15887 + }, + { + "epoch": 2.8287037037037037, + "grad_norm": 0.8131114840507507, + "learning_rate": 3.952254824895514e-05, + "loss": 0.8175, + "step": 15888 + }, + { + "epoch": 2.8288817663817665, + "grad_norm": 0.7421914935112, + "learning_rate": 
3.9511401300116904e-05, + "loss": 0.7721, + "step": 15889 + }, + { + "epoch": 2.8290598290598292, + "grad_norm": 0.8358926177024841, + "learning_rate": 3.95002555364271e-05, + "loss": 0.7327, + "step": 15890 + }, + { + "epoch": 2.8292378917378915, + "grad_norm": 0.8913134932518005, + "learning_rate": 3.9489110958104115e-05, + "loss": 1.0415, + "step": 15891 + }, + { + "epoch": 2.8294159544159543, + "grad_norm": 0.9675887227058411, + "learning_rate": 3.94779675653663e-05, + "loss": 0.8295, + "step": 15892 + }, + { + "epoch": 2.829594017094017, + "grad_norm": 0.8618438839912415, + "learning_rate": 3.946682535843199e-05, + "loss": 1.0754, + "step": 15893 + }, + { + "epoch": 2.82977207977208, + "grad_norm": 0.820209801197052, + "learning_rate": 3.945568433751948e-05, + "loss": 0.9484, + "step": 15894 + }, + { + "epoch": 2.8299501424501425, + "grad_norm": 0.8641984462738037, + "learning_rate": 3.944454450284705e-05, + "loss": 0.964, + "step": 15895 + }, + { + "epoch": 2.8301282051282053, + "grad_norm": 0.8229194283485413, + "learning_rate": 3.943340585463303e-05, + "loss": 0.7069, + "step": 15896 + }, + { + "epoch": 2.830306267806268, + "grad_norm": 0.7874621748924255, + "learning_rate": 3.942226839309554e-05, + "loss": 0.7815, + "step": 15897 + }, + { + "epoch": 2.8304843304843303, + "grad_norm": 0.8581945896148682, + "learning_rate": 3.9411132118452896e-05, + "loss": 0.9119, + "step": 15898 + }, + { + "epoch": 2.830662393162393, + "grad_norm": 0.9327018857002258, + "learning_rate": 3.939999703092326e-05, + "loss": 1.1002, + "step": 15899 + }, + { + "epoch": 2.830840455840456, + "grad_norm": 0.7793048024177551, + "learning_rate": 3.9388863130724794e-05, + "loss": 0.6529, + "step": 15900 + }, + { + "epoch": 2.8310185185185186, + "grad_norm": 0.9133790135383606, + "learning_rate": 3.9377730418075645e-05, + "loss": 0.7354, + "step": 15901 + }, + { + "epoch": 2.8311965811965814, + "grad_norm": 0.7800240516662598, + "learning_rate": 3.936659889319394e-05, + "loss": 0.8541, 
+ "step": 15902 + }, + { + "epoch": 2.8313746438746437, + "grad_norm": 0.782433271408081, + "learning_rate": 3.9355468556297737e-05, + "loss": 0.9084, + "step": 15903 + }, + { + "epoch": 2.8315527065527064, + "grad_norm": 0.8926814198493958, + "learning_rate": 3.9344339407605226e-05, + "loss": 1.0252, + "step": 15904 + }, + { + "epoch": 2.831730769230769, + "grad_norm": 0.92144376039505, + "learning_rate": 3.93332114473343e-05, + "loss": 0.7859, + "step": 15905 + }, + { + "epoch": 2.831908831908832, + "grad_norm": 0.7403308749198914, + "learning_rate": 3.932208467570315e-05, + "loss": 0.796, + "step": 15906 + }, + { + "epoch": 2.8320868945868947, + "grad_norm": 0.939708411693573, + "learning_rate": 3.9310959092929636e-05, + "loss": 0.9622, + "step": 15907 + }, + { + "epoch": 2.8322649572649574, + "grad_norm": 0.7546647787094116, + "learning_rate": 3.929983469923184e-05, + "loss": 0.7551, + "step": 15908 + }, + { + "epoch": 2.83244301994302, + "grad_norm": 0.8301447033882141, + "learning_rate": 3.928871149482768e-05, + "loss": 0.9377, + "step": 15909 + }, + { + "epoch": 2.8326210826210825, + "grad_norm": 0.8961447477340698, + "learning_rate": 3.927758947993508e-05, + "loss": 0.8245, + "step": 15910 + }, + { + "epoch": 2.8327991452991452, + "grad_norm": 0.7845988869667053, + "learning_rate": 3.926646865477204e-05, + "loss": 0.8662, + "step": 15911 + }, + { + "epoch": 2.832977207977208, + "grad_norm": 0.9339789152145386, + "learning_rate": 3.925534901955631e-05, + "loss": 0.8533, + "step": 15912 + }, + { + "epoch": 2.8331552706552707, + "grad_norm": 0.936855137348175, + "learning_rate": 3.924423057450587e-05, + "loss": 0.8809, + "step": 15913 + }, + { + "epoch": 2.8333333333333335, + "grad_norm": 0.972535252571106, + "learning_rate": 3.923311331983852e-05, + "loss": 1.0103, + "step": 15914 + }, + { + "epoch": 2.833511396011396, + "grad_norm": 0.9115430116653442, + "learning_rate": 3.922199725577208e-05, + "loss": 1.0041, + "step": 15915 + }, + { + "epoch": 
2.8336894586894585, + "grad_norm": 0.8378027081489563, + "learning_rate": 3.921088238252435e-05, + "loss": 0.8475, + "step": 15916 + }, + { + "epoch": 2.8338675213675213, + "grad_norm": 0.9383054375648499, + "learning_rate": 3.91997687003131e-05, + "loss": 0.7305, + "step": 15917 + }, + { + "epoch": 2.834045584045584, + "grad_norm": 0.7996332049369812, + "learning_rate": 3.918865620935609e-05, + "loss": 0.8749, + "step": 15918 + }, + { + "epoch": 2.834223646723647, + "grad_norm": 0.8876177668571472, + "learning_rate": 3.917754490987103e-05, + "loss": 0.8836, + "step": 15919 + }, + { + "epoch": 2.8344017094017095, + "grad_norm": 0.8004130125045776, + "learning_rate": 3.9166434802075594e-05, + "loss": 1.0383, + "step": 15920 + }, + { + "epoch": 2.8345797720797723, + "grad_norm": 0.76146000623703, + "learning_rate": 3.915532588618756e-05, + "loss": 0.9616, + "step": 15921 + }, + { + "epoch": 2.8347578347578346, + "grad_norm": 0.9333193898200989, + "learning_rate": 3.914421816242446e-05, + "loss": 0.7386, + "step": 15922 + }, + { + "epoch": 2.8349358974358974, + "grad_norm": 0.9340601563453674, + "learning_rate": 3.913311163100403e-05, + "loss": 0.8894, + "step": 15923 + }, + { + "epoch": 2.83511396011396, + "grad_norm": 0.8401036858558655, + "learning_rate": 3.912200629214383e-05, + "loss": 0.8366, + "step": 15924 + }, + { + "epoch": 2.835292022792023, + "grad_norm": 0.9298731088638306, + "learning_rate": 3.911090214606146e-05, + "loss": 0.899, + "step": 15925 + }, + { + "epoch": 2.8354700854700856, + "grad_norm": 0.8085874915122986, + "learning_rate": 3.909979919297446e-05, + "loss": 0.7597, + "step": 15926 + }, + { + "epoch": 2.835648148148148, + "grad_norm": 0.7841027975082397, + "learning_rate": 3.9088697433100396e-05, + "loss": 0.897, + "step": 15927 + }, + { + "epoch": 2.8358262108262107, + "grad_norm": 1.0678621530532837, + "learning_rate": 3.907759686665677e-05, + "loss": 0.976, + "step": 15928 + }, + { + "epoch": 2.8360042735042734, + "grad_norm": 
0.7748154997825623, + "learning_rate": 3.906649749386106e-05, + "loss": 0.6834, + "step": 15929 + }, + { + "epoch": 2.836182336182336, + "grad_norm": 0.8119567632675171, + "learning_rate": 3.905539931493076e-05, + "loss": 0.8076, + "step": 15930 + }, + { + "epoch": 2.836360398860399, + "grad_norm": 0.8723282814025879, + "learning_rate": 3.9044302330083326e-05, + "loss": 0.9057, + "step": 15931 + }, + { + "epoch": 2.8365384615384617, + "grad_norm": 0.7785065174102783, + "learning_rate": 3.903320653953616e-05, + "loss": 0.8899, + "step": 15932 + }, + { + "epoch": 2.8367165242165244, + "grad_norm": 0.9053105115890503, + "learning_rate": 3.902211194350667e-05, + "loss": 0.9038, + "step": 15933 + }, + { + "epoch": 2.8368945868945867, + "grad_norm": 0.8937689065933228, + "learning_rate": 3.9011018542212216e-05, + "loss": 0.736, + "step": 15934 + }, + { + "epoch": 2.8370726495726495, + "grad_norm": 0.7169269323348999, + "learning_rate": 3.899992633587014e-05, + "loss": 0.6632, + "step": 15935 + }, + { + "epoch": 2.8372507122507122, + "grad_norm": 0.8168412446975708, + "learning_rate": 3.898883532469785e-05, + "loss": 0.8482, + "step": 15936 + }, + { + "epoch": 2.837428774928775, + "grad_norm": 0.7374065518379211, + "learning_rate": 3.897774550891252e-05, + "loss": 0.8141, + "step": 15937 + }, + { + "epoch": 2.8376068376068377, + "grad_norm": 0.8844853043556213, + "learning_rate": 3.8966656888731546e-05, + "loss": 0.8204, + "step": 15938 + }, + { + "epoch": 2.8377849002849, + "grad_norm": 0.9031739234924316, + "learning_rate": 3.895556946437213e-05, + "loss": 0.862, + "step": 15939 + }, + { + "epoch": 2.837962962962963, + "grad_norm": 0.8141549229621887, + "learning_rate": 3.894448323605154e-05, + "loss": 0.6687, + "step": 15940 + }, + { + "epoch": 2.8381410256410255, + "grad_norm": 0.796144962310791, + "learning_rate": 3.893339820398696e-05, + "loss": 0.9021, + "step": 15941 + }, + { + "epoch": 2.8383190883190883, + "grad_norm": 0.8840420246124268, + "learning_rate": 
3.8922314368395584e-05, + "loss": 0.9608, + "step": 15942 + }, + { + "epoch": 2.838497150997151, + "grad_norm": 0.8297450542449951, + "learning_rate": 3.891123172949459e-05, + "loss": 0.9442, + "step": 15943 + }, + { + "epoch": 2.838675213675214, + "grad_norm": 0.7875503301620483, + "learning_rate": 3.89001502875011e-05, + "loss": 0.9269, + "step": 15944 + }, + { + "epoch": 2.8388532763532766, + "grad_norm": 0.9460122585296631, + "learning_rate": 3.8889070042632217e-05, + "loss": 0.9459, + "step": 15945 + }, + { + "epoch": 2.8390313390313393, + "grad_norm": 0.8144980669021606, + "learning_rate": 3.887799099510512e-05, + "loss": 0.8409, + "step": 15946 + }, + { + "epoch": 2.8392094017094016, + "grad_norm": 0.8182117342948914, + "learning_rate": 3.886691314513675e-05, + "loss": 0.8093, + "step": 15947 + }, + { + "epoch": 2.8393874643874644, + "grad_norm": 0.8287648558616638, + "learning_rate": 3.885583649294426e-05, + "loss": 0.7792, + "step": 15948 + }, + { + "epoch": 2.839565527065527, + "grad_norm": 0.9165690541267395, + "learning_rate": 3.884476103874464e-05, + "loss": 0.9865, + "step": 15949 + }, + { + "epoch": 2.83974358974359, + "grad_norm": 0.7819885015487671, + "learning_rate": 3.883368678275485e-05, + "loss": 0.7245, + "step": 15950 + }, + { + "epoch": 2.839921652421652, + "grad_norm": 0.8354606628417969, + "learning_rate": 3.882261372519198e-05, + "loss": 1.0513, + "step": 15951 + }, + { + "epoch": 2.840099715099715, + "grad_norm": 0.7606815099716187, + "learning_rate": 3.881154186627284e-05, + "loss": 0.9357, + "step": 15952 + }, + { + "epoch": 2.8402777777777777, + "grad_norm": 0.9649691581726074, + "learning_rate": 3.88004712062145e-05, + "loss": 0.8756, + "step": 15953 + }, + { + "epoch": 2.8404558404558404, + "grad_norm": 0.8770344853401184, + "learning_rate": 3.878940174523371e-05, + "loss": 0.868, + "step": 15954 + }, + { + "epoch": 2.840633903133903, + "grad_norm": 0.898287832736969, + "learning_rate": 3.877833348354749e-05, + "loss": 0.8734, + 
"step": 15955 + }, + { + "epoch": 2.840811965811966, + "grad_norm": 0.84062260389328, + "learning_rate": 3.876726642137264e-05, + "loss": 0.9362, + "step": 15956 + }, + { + "epoch": 2.8409900284900287, + "grad_norm": 0.7898240685462952, + "learning_rate": 3.8756200558926013e-05, + "loss": 0.7788, + "step": 15957 + }, + { + "epoch": 2.8411680911680914, + "grad_norm": 0.7237298488616943, + "learning_rate": 3.874513589642441e-05, + "loss": 0.8426, + "step": 15958 + }, + { + "epoch": 2.8413461538461537, + "grad_norm": 0.9025090932846069, + "learning_rate": 3.873407243408462e-05, + "loss": 0.7135, + "step": 15959 + }, + { + "epoch": 2.8415242165242165, + "grad_norm": 0.807295560836792, + "learning_rate": 3.872301017212337e-05, + "loss": 0.6889, + "step": 15960 + }, + { + "epoch": 2.8417022792022792, + "grad_norm": 0.8537244200706482, + "learning_rate": 3.8711949110757525e-05, + "loss": 0.798, + "step": 15961 + }, + { + "epoch": 2.841880341880342, + "grad_norm": 0.8148910999298096, + "learning_rate": 3.870088925020366e-05, + "loss": 0.8783, + "step": 15962 + }, + { + "epoch": 2.8420584045584043, + "grad_norm": 0.8254446983337402, + "learning_rate": 3.868983059067859e-05, + "loss": 0.7043, + "step": 15963 + }, + { + "epoch": 2.842236467236467, + "grad_norm": 0.8392706513404846, + "learning_rate": 3.867877313239886e-05, + "loss": 0.9942, + "step": 15964 + }, + { + "epoch": 2.84241452991453, + "grad_norm": 0.8974948525428772, + "learning_rate": 3.8667716875581217e-05, + "loss": 0.7646, + "step": 15965 + }, + { + "epoch": 2.8425925925925926, + "grad_norm": 0.9764110445976257, + "learning_rate": 3.8656661820442264e-05, + "loss": 0.8803, + "step": 15966 + }, + { + "epoch": 2.8427706552706553, + "grad_norm": 0.9663669466972351, + "learning_rate": 3.864560796719855e-05, + "loss": 0.8764, + "step": 15967 + }, + { + "epoch": 2.842948717948718, + "grad_norm": 0.837733268737793, + "learning_rate": 3.863455531606677e-05, + "loss": 0.8992, + "step": 15968 + }, + { + "epoch": 
2.843126780626781, + "grad_norm": 0.8458481431007385, + "learning_rate": 3.8623503867263335e-05, + "loss": 0.9025, + "step": 15969 + }, + { + "epoch": 2.8433048433048436, + "grad_norm": 0.901089072227478, + "learning_rate": 3.861245362100488e-05, + "loss": 0.8025, + "step": 15970 + }, + { + "epoch": 2.843482905982906, + "grad_norm": 0.9032089114189148, + "learning_rate": 3.860140457750786e-05, + "loss": 0.7217, + "step": 15971 + }, + { + "epoch": 2.8436609686609686, + "grad_norm": 0.7998839616775513, + "learning_rate": 3.859035673698879e-05, + "loss": 0.9127, + "step": 15972 + }, + { + "epoch": 2.8438390313390314, + "grad_norm": 0.8568583726882935, + "learning_rate": 3.85793100996641e-05, + "loss": 0.8847, + "step": 15973 + }, + { + "epoch": 2.844017094017094, + "grad_norm": 0.8720089793205261, + "learning_rate": 3.856826466575024e-05, + "loss": 0.7822, + "step": 15974 + }, + { + "epoch": 2.8441951566951564, + "grad_norm": 0.8872382640838623, + "learning_rate": 3.8557220435463594e-05, + "loss": 0.9601, + "step": 15975 + }, + { + "epoch": 2.844373219373219, + "grad_norm": 1.1950596570968628, + "learning_rate": 3.8546177409020634e-05, + "loss": 1.075, + "step": 15976 + }, + { + "epoch": 2.844551282051282, + "grad_norm": 0.9111549854278564, + "learning_rate": 3.85351355866376e-05, + "loss": 1.0103, + "step": 15977 + }, + { + "epoch": 2.8447293447293447, + "grad_norm": 0.9310214519500732, + "learning_rate": 3.852409496853099e-05, + "loss": 1.0163, + "step": 15978 + }, + { + "epoch": 2.8449074074074074, + "grad_norm": 0.8177474737167358, + "learning_rate": 3.851305555491695e-05, + "loss": 0.7488, + "step": 15979 + }, + { + "epoch": 2.84508547008547, + "grad_norm": 0.9321249127388, + "learning_rate": 3.85020173460119e-05, + "loss": 0.9914, + "step": 15980 + }, + { + "epoch": 2.845263532763533, + "grad_norm": 0.7649266719818115, + "learning_rate": 3.849098034203206e-05, + "loss": 0.692, + "step": 15981 + }, + { + "epoch": 2.8454415954415957, + "grad_norm": 
0.7714266777038574, + "learning_rate": 3.847994454319369e-05, + "loss": 0.8859, + "step": 15982 + }, + { + "epoch": 2.845619658119658, + "grad_norm": 0.9535303711891174, + "learning_rate": 3.846890994971302e-05, + "loss": 0.8992, + "step": 15983 + }, + { + "epoch": 2.8457977207977208, + "grad_norm": 0.8171879649162292, + "learning_rate": 3.845787656180623e-05, + "loss": 0.7125, + "step": 15984 + }, + { + "epoch": 2.8459757834757835, + "grad_norm": 0.8546884655952454, + "learning_rate": 3.8446844379689464e-05, + "loss": 0.9895, + "step": 15985 + }, + { + "epoch": 2.8461538461538463, + "grad_norm": 0.9110364317893982, + "learning_rate": 3.843581340357899e-05, + "loss": 1.0702, + "step": 15986 + }, + { + "epoch": 2.8463319088319086, + "grad_norm": 0.8862065672874451, + "learning_rate": 3.84247836336908e-05, + "loss": 0.9138, + "step": 15987 + }, + { + "epoch": 2.8465099715099713, + "grad_norm": 0.8485249876976013, + "learning_rate": 3.84137550702411e-05, + "loss": 1.2831, + "step": 15988 + }, + { + "epoch": 2.846688034188034, + "grad_norm": 0.8271495699882507, + "learning_rate": 3.840272771344593e-05, + "loss": 0.9497, + "step": 15989 + }, + { + "epoch": 2.846866096866097, + "grad_norm": 0.7829293608665466, + "learning_rate": 3.839170156352135e-05, + "loss": 0.6503, + "step": 15990 + }, + { + "epoch": 2.8470441595441596, + "grad_norm": 0.9366582036018372, + "learning_rate": 3.838067662068341e-05, + "loss": 0.7805, + "step": 15991 + }, + { + "epoch": 2.8472222222222223, + "grad_norm": 0.8666117787361145, + "learning_rate": 3.836965288514807e-05, + "loss": 0.7721, + "step": 15992 + }, + { + "epoch": 2.847400284900285, + "grad_norm": 0.7855546474456787, + "learning_rate": 3.835863035713142e-05, + "loss": 0.7457, + "step": 15993 + }, + { + "epoch": 2.847578347578348, + "grad_norm": 0.8234511017799377, + "learning_rate": 3.8347609036849284e-05, + "loss": 0.8937, + "step": 15994 + }, + { + "epoch": 2.84775641025641, + "grad_norm": 0.8896345496177673, + "learning_rate": 
3.833658892451773e-05, + "loss": 0.9146, + "step": 15995 + }, + { + "epoch": 2.847934472934473, + "grad_norm": 0.8099349737167358, + "learning_rate": 3.83255700203526e-05, + "loss": 0.8353, + "step": 15996 + }, + { + "epoch": 2.8481125356125356, + "grad_norm": 0.874100387096405, + "learning_rate": 3.831455232456982e-05, + "loss": 0.6829, + "step": 15997 + }, + { + "epoch": 2.8482905982905984, + "grad_norm": 0.9338345527648926, + "learning_rate": 3.830353583738524e-05, + "loss": 1.0345, + "step": 15998 + }, + { + "epoch": 2.8484686609686607, + "grad_norm": 0.7876978516578674, + "learning_rate": 3.829252055901472e-05, + "loss": 0.6703, + "step": 15999 + }, + { + "epoch": 2.8486467236467234, + "grad_norm": 0.8565872311592102, + "learning_rate": 3.828150648967408e-05, + "loss": 0.8227, + "step": 16000 + }, + { + "epoch": 2.848824786324786, + "grad_norm": 1.0180596113204956, + "learning_rate": 3.82704936295791e-05, + "loss": 0.8933, + "step": 16001 + }, + { + "epoch": 2.849002849002849, + "grad_norm": 0.8450096845626831, + "learning_rate": 3.825948197894553e-05, + "loss": 0.851, + "step": 16002 + }, + { + "epoch": 2.8491809116809117, + "grad_norm": 0.7936033010482788, + "learning_rate": 3.824847153798923e-05, + "loss": 0.8573, + "step": 16003 + }, + { + "epoch": 2.8493589743589745, + "grad_norm": 0.9499372839927673, + "learning_rate": 3.8237462306925774e-05, + "loss": 0.8269, + "step": 16004 + }, + { + "epoch": 2.849537037037037, + "grad_norm": 0.874855101108551, + "learning_rate": 3.822645428597099e-05, + "loss": 0.9657, + "step": 16005 + }, + { + "epoch": 2.8497150997151, + "grad_norm": 0.8966119885444641, + "learning_rate": 3.8215447475340506e-05, + "loss": 0.9239, + "step": 16006 + }, + { + "epoch": 2.8498931623931623, + "grad_norm": 0.8341490030288696, + "learning_rate": 3.820444187524994e-05, + "loss": 1.0051, + "step": 16007 + }, + { + "epoch": 2.850071225071225, + "grad_norm": 0.7965613007545471, + "learning_rate": 3.8193437485915054e-05, + "loss": 0.8591, + 
"step": 16008 + }, + { + "epoch": 2.8502492877492878, + "grad_norm": 0.7846593856811523, + "learning_rate": 3.818243430755128e-05, + "loss": 0.8095, + "step": 16009 + }, + { + "epoch": 2.8504273504273505, + "grad_norm": 0.9422695636749268, + "learning_rate": 3.8171432340374334e-05, + "loss": 0.8902, + "step": 16010 + }, + { + "epoch": 2.8506054131054133, + "grad_norm": 0.8810960650444031, + "learning_rate": 3.8160431584599744e-05, + "loss": 0.8483, + "step": 16011 + }, + { + "epoch": 2.8507834757834756, + "grad_norm": 0.8513348698616028, + "learning_rate": 3.814943204044302e-05, + "loss": 0.832, + "step": 16012 + }, + { + "epoch": 2.8509615384615383, + "grad_norm": 0.8906846046447754, + "learning_rate": 3.8138433708119704e-05, + "loss": 0.9702, + "step": 16013 + }, + { + "epoch": 2.851139601139601, + "grad_norm": 0.9517511129379272, + "learning_rate": 3.812743658784526e-05, + "loss": 0.8138, + "step": 16014 + }, + { + "epoch": 2.851317663817664, + "grad_norm": 0.7989702820777893, + "learning_rate": 3.811644067983517e-05, + "loss": 0.8653, + "step": 16015 + }, + { + "epoch": 2.8514957264957266, + "grad_norm": 0.8255589008331299, + "learning_rate": 3.8105445984304874e-05, + "loss": 0.9456, + "step": 16016 + }, + { + "epoch": 2.8516737891737893, + "grad_norm": 0.7919938564300537, + "learning_rate": 3.809445250146977e-05, + "loss": 0.6261, + "step": 16017 + }, + { + "epoch": 2.851851851851852, + "grad_norm": 0.866316020488739, + "learning_rate": 3.808346023154532e-05, + "loss": 0.8171, + "step": 16018 + }, + { + "epoch": 2.8520299145299144, + "grad_norm": 1.0050057172775269, + "learning_rate": 3.8072469174746794e-05, + "loss": 0.9094, + "step": 16019 + }, + { + "epoch": 2.852207977207977, + "grad_norm": 0.8405657410621643, + "learning_rate": 3.806147933128962e-05, + "loss": 0.7737, + "step": 16020 + }, + { + "epoch": 2.85238603988604, + "grad_norm": 0.8127378821372986, + "learning_rate": 3.8050490701389085e-05, + "loss": 0.9102, + "step": 16021 + }, + { + "epoch": 
2.8525641025641026, + "grad_norm": 0.8622255921363831, + "learning_rate": 3.8039503285260506e-05, + "loss": 0.8815, + "step": 16022 + }, + { + "epoch": 2.8527421652421654, + "grad_norm": 0.8802367448806763, + "learning_rate": 3.802851708311913e-05, + "loss": 1.0123, + "step": 16023 + }, + { + "epoch": 2.8529202279202277, + "grad_norm": 0.908149778842926, + "learning_rate": 3.801753209518024e-05, + "loss": 0.6808, + "step": 16024 + }, + { + "epoch": 2.8530982905982905, + "grad_norm": 0.9346339702606201, + "learning_rate": 3.8006548321659055e-05, + "loss": 1.1107, + "step": 16025 + }, + { + "epoch": 2.853276353276353, + "grad_norm": 0.945125937461853, + "learning_rate": 3.799556576277077e-05, + "loss": 0.6578, + "step": 16026 + }, + { + "epoch": 2.853454415954416, + "grad_norm": 0.8294890522956848, + "learning_rate": 3.798458441873054e-05, + "loss": 0.869, + "step": 16027 + }, + { + "epoch": 2.8536324786324787, + "grad_norm": 0.7922961115837097, + "learning_rate": 3.797360428975358e-05, + "loss": 0.718, + "step": 16028 + }, + { + "epoch": 2.8538105413105415, + "grad_norm": 1.0540844202041626, + "learning_rate": 3.7962625376055005e-05, + "loss": 0.8287, + "step": 16029 + }, + { + "epoch": 2.853988603988604, + "grad_norm": 0.9409742951393127, + "learning_rate": 3.795164767784991e-05, + "loss": 1.0484, + "step": 16030 + }, + { + "epoch": 2.8541666666666665, + "grad_norm": 0.7328341603279114, + "learning_rate": 3.7940671195353385e-05, + "loss": 0.7603, + "step": 16031 + }, + { + "epoch": 2.8543447293447293, + "grad_norm": 0.9151208996772766, + "learning_rate": 3.792969592878045e-05, + "loss": 0.7523, + "step": 16032 + }, + { + "epoch": 2.854522792022792, + "grad_norm": 0.7935783267021179, + "learning_rate": 3.791872187834626e-05, + "loss": 0.7559, + "step": 16033 + }, + { + "epoch": 2.8547008547008548, + "grad_norm": 0.8030906915664673, + "learning_rate": 3.790774904426568e-05, + "loss": 0.7905, + "step": 16034 + }, + { + "epoch": 2.8548789173789175, + "grad_norm": 
0.8756175637245178, + "learning_rate": 3.789677742675384e-05, + "loss": 0.9067, + "step": 16035 + }, + { + "epoch": 2.85505698005698, + "grad_norm": 0.7602807283401489, + "learning_rate": 3.788580702602558e-05, + "loss": 0.9192, + "step": 16036 + }, + { + "epoch": 2.8552350427350426, + "grad_norm": 0.9411010146141052, + "learning_rate": 3.787483784229592e-05, + "loss": 0.8192, + "step": 16037 + }, + { + "epoch": 2.8554131054131053, + "grad_norm": 0.9473391175270081, + "learning_rate": 3.786386987577976e-05, + "loss": 0.7845, + "step": 16038 + }, + { + "epoch": 2.855591168091168, + "grad_norm": 0.9226218461990356, + "learning_rate": 3.7852903126692e-05, + "loss": 0.8712, + "step": 16039 + }, + { + "epoch": 2.855769230769231, + "grad_norm": 0.9519350528717041, + "learning_rate": 3.78419375952475e-05, + "loss": 0.9584, + "step": 16040 + }, + { + "epoch": 2.8559472934472936, + "grad_norm": 0.825547456741333, + "learning_rate": 3.783097328166111e-05, + "loss": 0.9279, + "step": 16041 + }, + { + "epoch": 2.8561253561253563, + "grad_norm": 0.8645279407501221, + "learning_rate": 3.782001018614763e-05, + "loss": 0.83, + "step": 16042 + }, + { + "epoch": 2.8563034188034186, + "grad_norm": 0.827126145362854, + "learning_rate": 3.7809048308921936e-05, + "loss": 0.7661, + "step": 16043 + }, + { + "epoch": 2.8564814814814814, + "grad_norm": 0.9441137909889221, + "learning_rate": 3.779808765019869e-05, + "loss": 0.8745, + "step": 16044 + }, + { + "epoch": 2.856659544159544, + "grad_norm": 0.8505343794822693, + "learning_rate": 3.7787128210192736e-05, + "loss": 0.8176, + "step": 16045 + }, + { + "epoch": 2.856837606837607, + "grad_norm": 0.8797150254249573, + "learning_rate": 3.777616998911876e-05, + "loss": 0.7018, + "step": 16046 + }, + { + "epoch": 2.8570156695156697, + "grad_norm": 0.8386834263801575, + "learning_rate": 3.776521298719144e-05, + "loss": 0.9805, + "step": 16047 + }, + { + "epoch": 2.857193732193732, + "grad_norm": 0.818373441696167, + "learning_rate": 
3.775425720462558e-05, + "loss": 0.7782, + "step": 16048 + }, + { + "epoch": 2.8573717948717947, + "grad_norm": 0.760405957698822, + "learning_rate": 3.774330264163566e-05, + "loss": 0.7283, + "step": 16049 + }, + { + "epoch": 2.8575498575498575, + "grad_norm": 0.9082552194595337, + "learning_rate": 3.7732349298436465e-05, + "loss": 0.7594, + "step": 16050 + }, + { + "epoch": 2.85772792022792, + "grad_norm": 0.859868586063385, + "learning_rate": 3.7721397175242477e-05, + "loss": 0.7841, + "step": 16051 + }, + { + "epoch": 2.857905982905983, + "grad_norm": 1.031545877456665, + "learning_rate": 3.771044627226836e-05, + "loss": 1.2157, + "step": 16052 + }, + { + "epoch": 2.8580840455840457, + "grad_norm": 0.859491765499115, + "learning_rate": 3.769949658972867e-05, + "loss": 0.7942, + "step": 16053 + }, + { + "epoch": 2.8582621082621085, + "grad_norm": 0.775382936000824, + "learning_rate": 3.768854812783791e-05, + "loss": 0.7321, + "step": 16054 + }, + { + "epoch": 2.8584401709401708, + "grad_norm": 0.9268868565559387, + "learning_rate": 3.767760088681062e-05, + "loss": 1.0104, + "step": 16055 + }, + { + "epoch": 2.8586182336182335, + "grad_norm": 0.8408828973770142, + "learning_rate": 3.7666654866861274e-05, + "loss": 0.5298, + "step": 16056 + }, + { + "epoch": 2.8587962962962963, + "grad_norm": 0.8417157530784607, + "learning_rate": 3.76557100682043e-05, + "loss": 0.9123, + "step": 16057 + }, + { + "epoch": 2.858974358974359, + "grad_norm": 0.8079593777656555, + "learning_rate": 3.764476649105425e-05, + "loss": 0.794, + "step": 16058 + }, + { + "epoch": 2.859152421652422, + "grad_norm": 0.9449031352996826, + "learning_rate": 3.763382413562541e-05, + "loss": 1.1281, + "step": 16059 + }, + { + "epoch": 2.859330484330484, + "grad_norm": 0.8985004425048828, + "learning_rate": 3.762288300213228e-05, + "loss": 0.929, + "step": 16060 + }, + { + "epoch": 2.859508547008547, + "grad_norm": 0.9850391149520874, + "learning_rate": 3.761194309078913e-05, + "loss": 0.9796, + 
"step": 16061 + }, + { + "epoch": 2.8596866096866096, + "grad_norm": 0.9231089949607849, + "learning_rate": 3.760100440181038e-05, + "loss": 0.6981, + "step": 16062 + }, + { + "epoch": 2.8598646723646723, + "grad_norm": 0.8458681702613831, + "learning_rate": 3.759006693541033e-05, + "loss": 0.9502, + "step": 16063 + }, + { + "epoch": 2.860042735042735, + "grad_norm": 0.8494541645050049, + "learning_rate": 3.7579130691803266e-05, + "loss": 0.7477, + "step": 16064 + }, + { + "epoch": 2.860220797720798, + "grad_norm": 0.879878580570221, + "learning_rate": 3.756819567120348e-05, + "loss": 0.8426, + "step": 16065 + }, + { + "epoch": 2.8603988603988606, + "grad_norm": 0.8161541223526001, + "learning_rate": 3.7557261873825155e-05, + "loss": 0.7411, + "step": 16066 + }, + { + "epoch": 2.8605769230769234, + "grad_norm": 0.9438506364822388, + "learning_rate": 3.754632929988262e-05, + "loss": 0.8494, + "step": 16067 + }, + { + "epoch": 2.8607549857549857, + "grad_norm": 0.8552418351173401, + "learning_rate": 3.753539794959002e-05, + "loss": 0.766, + "step": 16068 + }, + { + "epoch": 2.8609330484330484, + "grad_norm": 0.8670600056648254, + "learning_rate": 3.7524467823161546e-05, + "loss": 0.7462, + "step": 16069 + }, + { + "epoch": 2.861111111111111, + "grad_norm": 0.7906678318977356, + "learning_rate": 3.751353892081134e-05, + "loss": 0.791, + "step": 16070 + }, + { + "epoch": 2.861289173789174, + "grad_norm": 0.8461915254592896, + "learning_rate": 3.7502611242753536e-05, + "loss": 0.6979, + "step": 16071 + }, + { + "epoch": 2.861467236467236, + "grad_norm": 0.8197309970855713, + "learning_rate": 3.749168478920223e-05, + "loss": 0.8095, + "step": 16072 + }, + { + "epoch": 2.861645299145299, + "grad_norm": 0.9489047527313232, + "learning_rate": 3.7480759560371516e-05, + "loss": 0.9462, + "step": 16073 + }, + { + "epoch": 2.8618233618233617, + "grad_norm": 0.8539329767227173, + "learning_rate": 3.7469835556475405e-05, + "loss": 0.853, + "step": 16074 + }, + { + "epoch": 
2.8620014245014245, + "grad_norm": 0.9104743003845215, + "learning_rate": 3.745891277772805e-05, + "loss": 0.757, + "step": 16075 + }, + { + "epoch": 2.8621794871794872, + "grad_norm": 0.8277523517608643, + "learning_rate": 3.744799122434332e-05, + "loss": 0.8073, + "step": 16076 + }, + { + "epoch": 2.86235754985755, + "grad_norm": 0.866422176361084, + "learning_rate": 3.743707089653527e-05, + "loss": 0.7201, + "step": 16077 + }, + { + "epoch": 2.8625356125356127, + "grad_norm": 0.8722748160362244, + "learning_rate": 3.742615179451787e-05, + "loss": 0.7901, + "step": 16078 + }, + { + "epoch": 2.8627136752136755, + "grad_norm": 0.74676513671875, + "learning_rate": 3.741523391850504e-05, + "loss": 0.7558, + "step": 16079 + }, + { + "epoch": 2.862891737891738, + "grad_norm": 0.7674166560173035, + "learning_rate": 3.740431726871069e-05, + "loss": 0.8699, + "step": 16080 + }, + { + "epoch": 2.8630698005698005, + "grad_norm": 0.9028998613357544, + "learning_rate": 3.739340184534871e-05, + "loss": 1.136, + "step": 16081 + }, + { + "epoch": 2.8632478632478633, + "grad_norm": 0.8240773677825928, + "learning_rate": 3.7382487648632936e-05, + "loss": 0.9357, + "step": 16082 + }, + { + "epoch": 2.863425925925926, + "grad_norm": 0.8877659440040588, + "learning_rate": 3.737157467877731e-05, + "loss": 0.9167, + "step": 16083 + }, + { + "epoch": 2.8636039886039883, + "grad_norm": 0.9677366614341736, + "learning_rate": 3.73606629359955e-05, + "loss": 0.8342, + "step": 16084 + }, + { + "epoch": 2.863782051282051, + "grad_norm": 0.8721164464950562, + "learning_rate": 3.734975242050146e-05, + "loss": 0.9195, + "step": 16085 + }, + { + "epoch": 2.863960113960114, + "grad_norm": 0.9791151881217957, + "learning_rate": 3.733884313250879e-05, + "loss": 1.0011, + "step": 16086 + }, + { + "epoch": 2.8641381766381766, + "grad_norm": 0.7869369983673096, + "learning_rate": 3.7327935072231366e-05, + "loss": 0.7998, + "step": 16087 + }, + { + "epoch": 2.8643162393162394, + "grad_norm": 
0.891656756401062, + "learning_rate": 3.731702823988287e-05, + "loss": 0.7287, + "step": 16088 + }, + { + "epoch": 2.864494301994302, + "grad_norm": 0.8720460534095764, + "learning_rate": 3.7306122635676955e-05, + "loss": 0.9492, + "step": 16089 + }, + { + "epoch": 2.864672364672365, + "grad_norm": 0.7878959774971008, + "learning_rate": 3.72952182598274e-05, + "loss": 0.7652, + "step": 16090 + }, + { + "epoch": 2.8648504273504276, + "grad_norm": 0.9350453019142151, + "learning_rate": 3.728431511254772e-05, + "loss": 0.8661, + "step": 16091 + }, + { + "epoch": 2.86502849002849, + "grad_norm": 0.7575289011001587, + "learning_rate": 3.727341319405163e-05, + "loss": 0.7943, + "step": 16092 + }, + { + "epoch": 2.8652065527065527, + "grad_norm": 0.8256776928901672, + "learning_rate": 3.7262512504552716e-05, + "loss": 1.0025, + "step": 16093 + }, + { + "epoch": 2.8653846153846154, + "grad_norm": 0.7274962067604065, + "learning_rate": 3.7251613044264536e-05, + "loss": 0.7628, + "step": 16094 + }, + { + "epoch": 2.865562678062678, + "grad_norm": 0.867734968662262, + "learning_rate": 3.7240714813400646e-05, + "loss": 0.8931, + "step": 16095 + }, + { + "epoch": 2.8657407407407405, + "grad_norm": 0.8225845098495483, + "learning_rate": 3.722981781217458e-05, + "loss": 0.7934, + "step": 16096 + }, + { + "epoch": 2.8659188034188032, + "grad_norm": 0.8180573582649231, + "learning_rate": 3.721892204079985e-05, + "loss": 0.8119, + "step": 16097 + }, + { + "epoch": 2.866096866096866, + "grad_norm": 1.0235565900802612, + "learning_rate": 3.720802749948993e-05, + "loss": 0.8281, + "step": 16098 + }, + { + "epoch": 2.8662749287749287, + "grad_norm": 0.7290656566619873, + "learning_rate": 3.719713418845823e-05, + "loss": 0.7099, + "step": 16099 + }, + { + "epoch": 2.8664529914529915, + "grad_norm": 0.8408772349357605, + "learning_rate": 3.718624210791828e-05, + "loss": 0.8631, + "step": 16100 + }, + { + "epoch": 2.8666310541310542, + "grad_norm": 0.8182529807090759, + "learning_rate": 
3.717535125808338e-05, + "loss": 0.7584, + "step": 16101 + }, + { + "epoch": 2.866809116809117, + "grad_norm": 0.8381599187850952, + "learning_rate": 3.716446163916699e-05, + "loss": 0.8735, + "step": 16102 + }, + { + "epoch": 2.8669871794871797, + "grad_norm": 0.8975555300712585, + "learning_rate": 3.715357325138245e-05, + "loss": 0.7564, + "step": 16103 + }, + { + "epoch": 2.867165242165242, + "grad_norm": 0.9531118869781494, + "learning_rate": 3.714268609494309e-05, + "loss": 0.7627, + "step": 16104 + }, + { + "epoch": 2.867343304843305, + "grad_norm": 0.853065550327301, + "learning_rate": 3.7131800170062216e-05, + "loss": 0.8001, + "step": 16105 + }, + { + "epoch": 2.8675213675213675, + "grad_norm": 0.788351833820343, + "learning_rate": 3.7120915476953085e-05, + "loss": 0.7935, + "step": 16106 + }, + { + "epoch": 2.8676994301994303, + "grad_norm": 0.9121149778366089, + "learning_rate": 3.711003201582908e-05, + "loss": 0.9212, + "step": 16107 + }, + { + "epoch": 2.8678774928774926, + "grad_norm": 0.8156226277351379, + "learning_rate": 3.7099149786903263e-05, + "loss": 0.86, + "step": 16108 + }, + { + "epoch": 2.8680555555555554, + "grad_norm": 0.8555662035942078, + "learning_rate": 3.708826879038899e-05, + "loss": 0.8872, + "step": 16109 + }, + { + "epoch": 2.868233618233618, + "grad_norm": 1.0395163297653198, + "learning_rate": 3.70773890264994e-05, + "loss": 0.9794, + "step": 16110 + }, + { + "epoch": 2.868411680911681, + "grad_norm": 0.7535551190376282, + "learning_rate": 3.706651049544766e-05, + "loss": 0.91, + "step": 16111 + }, + { + "epoch": 2.8685897435897436, + "grad_norm": 1.0145034790039062, + "learning_rate": 3.705563319744691e-05, + "loss": 0.9386, + "step": 16112 + }, + { + "epoch": 2.8687678062678064, + "grad_norm": 0.8577025532722473, + "learning_rate": 3.704475713271029e-05, + "loss": 1.0491, + "step": 16113 + }, + { + "epoch": 2.868945868945869, + "grad_norm": 0.8291150331497192, + "learning_rate": 3.7033882301450815e-05, + "loss": 0.8418, + 
"step": 16114 + }, + { + "epoch": 2.869123931623932, + "grad_norm": 0.7628613710403442, + "learning_rate": 3.70230087038817e-05, + "loss": 0.681, + "step": 16115 + }, + { + "epoch": 2.869301994301994, + "grad_norm": 0.8664639592170715, + "learning_rate": 3.701213634021583e-05, + "loss": 1.0247, + "step": 16116 + }, + { + "epoch": 2.869480056980057, + "grad_norm": 0.9613258838653564, + "learning_rate": 3.700126521066635e-05, + "loss": 1.1058, + "step": 16117 + }, + { + "epoch": 2.8696581196581197, + "grad_norm": 0.8279051780700684, + "learning_rate": 3.699039531544619e-05, + "loss": 0.859, + "step": 16118 + }, + { + "epoch": 2.8698361823361824, + "grad_norm": 0.8285593390464783, + "learning_rate": 3.697952665476836e-05, + "loss": 0.9008, + "step": 16119 + }, + { + "epoch": 2.870014245014245, + "grad_norm": 0.9056670069694519, + "learning_rate": 3.696865922884578e-05, + "loss": 0.965, + "step": 16120 + }, + { + "epoch": 2.8701923076923075, + "grad_norm": 0.7092664837837219, + "learning_rate": 3.69577930378914e-05, + "loss": 0.7417, + "step": 16121 + }, + { + "epoch": 2.8703703703703702, + "grad_norm": 0.8726393580436707, + "learning_rate": 3.6946928082118096e-05, + "loss": 0.8582, + "step": 16122 + }, + { + "epoch": 2.870548433048433, + "grad_norm": 1.0046098232269287, + "learning_rate": 3.693606436173875e-05, + "loss": 0.9616, + "step": 16123 + }, + { + "epoch": 2.8707264957264957, + "grad_norm": 0.7739760875701904, + "learning_rate": 3.69252018769662e-05, + "loss": 0.8775, + "step": 16124 + }, + { + "epoch": 2.8709045584045585, + "grad_norm": 0.9054580926895142, + "learning_rate": 3.6914340628013344e-05, + "loss": 0.9205, + "step": 16125 + }, + { + "epoch": 2.8710826210826212, + "grad_norm": 0.8324142694473267, + "learning_rate": 3.690348061509288e-05, + "loss": 0.8712, + "step": 16126 + }, + { + "epoch": 2.871260683760684, + "grad_norm": 0.9162326455116272, + "learning_rate": 3.6892621838417664e-05, + "loss": 0.9102, + "step": 16127 + }, + { + "epoch": 
2.8714387464387463, + "grad_norm": 0.8579963445663452, + "learning_rate": 3.688176429820044e-05, + "loss": 0.8337, + "step": 16128 + }, + { + "epoch": 2.871616809116809, + "grad_norm": 0.7649274468421936, + "learning_rate": 3.687090799465388e-05, + "loss": 0.6982, + "step": 16129 + }, + { + "epoch": 2.871794871794872, + "grad_norm": 0.9612696766853333, + "learning_rate": 3.6860052927990816e-05, + "loss": 0.8779, + "step": 16130 + }, + { + "epoch": 2.8719729344729346, + "grad_norm": 0.982455313205719, + "learning_rate": 3.6849199098423795e-05, + "loss": 0.8145, + "step": 16131 + }, + { + "epoch": 2.8721509971509973, + "grad_norm": 0.7292434573173523, + "learning_rate": 3.6838346506165587e-05, + "loss": 0.6711, + "step": 16132 + }, + { + "epoch": 2.8723290598290596, + "grad_norm": 0.877310574054718, + "learning_rate": 3.68274951514287e-05, + "loss": 0.7682, + "step": 16133 + }, + { + "epoch": 2.8725071225071224, + "grad_norm": 0.9633384943008423, + "learning_rate": 3.681664503442586e-05, + "loss": 1.0046, + "step": 16134 + }, + { + "epoch": 2.872685185185185, + "grad_norm": 0.940661609172821, + "learning_rate": 3.680579615536961e-05, + "loss": 0.897, + "step": 16135 + }, + { + "epoch": 2.872863247863248, + "grad_norm": 1.0335214138031006, + "learning_rate": 3.6794948514472505e-05, + "loss": 0.8836, + "step": 16136 + }, + { + "epoch": 2.8730413105413106, + "grad_norm": 0.8682044148445129, + "learning_rate": 3.6784102111947084e-05, + "loss": 0.7733, + "step": 16137 + }, + { + "epoch": 2.8732193732193734, + "grad_norm": 0.8767847418785095, + "learning_rate": 3.677325694800586e-05, + "loss": 0.8828, + "step": 16138 + }, + { + "epoch": 2.873397435897436, + "grad_norm": 0.954585075378418, + "learning_rate": 3.6762413022861305e-05, + "loss": 0.9294, + "step": 16139 + }, + { + "epoch": 2.8735754985754984, + "grad_norm": 0.8497310876846313, + "learning_rate": 3.675157033672596e-05, + "loss": 0.9396, + "step": 16140 + }, + { + "epoch": 2.873753561253561, + "grad_norm": 
0.7619023323059082, + "learning_rate": 3.674072888981214e-05, + "loss": 0.7467, + "step": 16141 + }, + { + "epoch": 2.873931623931624, + "grad_norm": 0.9939205646514893, + "learning_rate": 3.6729888682332394e-05, + "loss": 0.9122, + "step": 16142 + }, + { + "epoch": 2.8741096866096867, + "grad_norm": 0.943192183971405, + "learning_rate": 3.671904971449899e-05, + "loss": 0.8878, + "step": 16143 + }, + { + "epoch": 2.8742877492877494, + "grad_norm": 0.8002169728279114, + "learning_rate": 3.6708211986524365e-05, + "loss": 0.8337, + "step": 16144 + }, + { + "epoch": 2.8744658119658117, + "grad_norm": 0.7558008432388306, + "learning_rate": 3.669737549862087e-05, + "loss": 0.6592, + "step": 16145 + }, + { + "epoch": 2.8746438746438745, + "grad_norm": 0.8732983469963074, + "learning_rate": 3.6686540251000756e-05, + "loss": 0.9166, + "step": 16146 + }, + { + "epoch": 2.8748219373219372, + "grad_norm": 0.8272808194160461, + "learning_rate": 3.667570624387643e-05, + "loss": 0.6342, + "step": 16147 + }, + { + "epoch": 2.875, + "grad_norm": 0.8012139797210693, + "learning_rate": 3.666487347746004e-05, + "loss": 0.8436, + "step": 16148 + }, + { + "epoch": 2.8751780626780628, + "grad_norm": 0.8791360259056091, + "learning_rate": 3.66540419519639e-05, + "loss": 0.8634, + "step": 16149 + }, + { + "epoch": 2.8753561253561255, + "grad_norm": 0.8274601101875305, + "learning_rate": 3.6643211667600244e-05, + "loss": 0.7488, + "step": 16150 + }, + { + "epoch": 2.8755341880341883, + "grad_norm": 0.8390287756919861, + "learning_rate": 3.663238262458122e-05, + "loss": 0.7084, + "step": 16151 + }, + { + "epoch": 2.8757122507122506, + "grad_norm": 0.921089768409729, + "learning_rate": 3.662155482311903e-05, + "loss": 0.8909, + "step": 16152 + }, + { + "epoch": 2.8758903133903133, + "grad_norm": 0.8790102601051331, + "learning_rate": 3.661072826342583e-05, + "loss": 0.8235, + "step": 16153 + }, + { + "epoch": 2.876068376068376, + "grad_norm": 0.8030495643615723, + "learning_rate": 
3.659990294571368e-05, + "loss": 0.874, + "step": 16154 + }, + { + "epoch": 2.876246438746439, + "grad_norm": 0.9690510034561157, + "learning_rate": 3.6589078870194804e-05, + "loss": 1.0926, + "step": 16155 + }, + { + "epoch": 2.8764245014245016, + "grad_norm": 0.8150941133499146, + "learning_rate": 3.657825603708114e-05, + "loss": 0.861, + "step": 16156 + }, + { + "epoch": 2.876602564102564, + "grad_norm": 0.8865286111831665, + "learning_rate": 3.656743444658486e-05, + "loss": 0.8219, + "step": 16157 + }, + { + "epoch": 2.8767806267806266, + "grad_norm": 0.8591124415397644, + "learning_rate": 3.655661409891786e-05, + "loss": 0.889, + "step": 16158 + }, + { + "epoch": 2.8769586894586894, + "grad_norm": 0.8625560402870178, + "learning_rate": 3.6545794994292256e-05, + "loss": 0.9581, + "step": 16159 + }, + { + "epoch": 2.877136752136752, + "grad_norm": 0.8699239492416382, + "learning_rate": 3.653497713291999e-05, + "loss": 0.7275, + "step": 16160 + }, + { + "epoch": 2.877314814814815, + "grad_norm": 0.9005762338638306, + "learning_rate": 3.652416051501301e-05, + "loss": 0.8894, + "step": 16161 + }, + { + "epoch": 2.8774928774928776, + "grad_norm": 0.7541293501853943, + "learning_rate": 3.651334514078323e-05, + "loss": 0.7265, + "step": 16162 + }, + { + "epoch": 2.8776709401709404, + "grad_norm": 0.9799004793167114, + "learning_rate": 3.650253101044258e-05, + "loss": 0.8817, + "step": 16163 + }, + { + "epoch": 2.8778490028490027, + "grad_norm": 0.7796139717102051, + "learning_rate": 3.64917181242029e-05, + "loss": 0.7105, + "step": 16164 + }, + { + "epoch": 2.8780270655270654, + "grad_norm": 0.8818691968917847, + "learning_rate": 3.648090648227613e-05, + "loss": 0.8868, + "step": 16165 + }, + { + "epoch": 2.878205128205128, + "grad_norm": 0.7982428073883057, + "learning_rate": 3.647009608487399e-05, + "loss": 0.9269, + "step": 16166 + }, + { + "epoch": 2.878383190883191, + "grad_norm": 0.9602195620536804, + "learning_rate": 3.645928693220838e-05, + "loss": 0.8264, + 
"step": 16167 + }, + { + "epoch": 2.8785612535612537, + "grad_norm": 0.8941618800163269, + "learning_rate": 3.6448479024491054e-05, + "loss": 0.8009, + "step": 16168 + }, + { + "epoch": 2.878739316239316, + "grad_norm": 0.7777221202850342, + "learning_rate": 3.643767236193375e-05, + "loss": 0.7734, + "step": 16169 + }, + { + "epoch": 2.8789173789173788, + "grad_norm": 0.8050239086151123, + "learning_rate": 3.642686694474823e-05, + "loss": 0.9732, + "step": 16170 + }, + { + "epoch": 2.8790954415954415, + "grad_norm": 0.8437817096710205, + "learning_rate": 3.6416062773146156e-05, + "loss": 0.8161, + "step": 16171 + }, + { + "epoch": 2.8792735042735043, + "grad_norm": 0.7887414693832397, + "learning_rate": 3.64052598473393e-05, + "loss": 0.9165, + "step": 16172 + }, + { + "epoch": 2.879451566951567, + "grad_norm": 0.8671287894248962, + "learning_rate": 3.639445816753921e-05, + "loss": 0.9519, + "step": 16173 + }, + { + "epoch": 2.8796296296296298, + "grad_norm": 0.8444932103157043, + "learning_rate": 3.638365773395763e-05, + "loss": 0.784, + "step": 16174 + }, + { + "epoch": 2.8798076923076925, + "grad_norm": 0.8580447435379028, + "learning_rate": 3.637285854680612e-05, + "loss": 0.797, + "step": 16175 + }, + { + "epoch": 2.879985754985755, + "grad_norm": 0.7977848052978516, + "learning_rate": 3.636206060629627e-05, + "loss": 0.8552, + "step": 16176 + }, + { + "epoch": 2.8801638176638176, + "grad_norm": 0.8350155353546143, + "learning_rate": 3.6351263912639644e-05, + "loss": 0.9481, + "step": 16177 + }, + { + "epoch": 2.8803418803418803, + "grad_norm": 0.8759897351264954, + "learning_rate": 3.634046846604778e-05, + "loss": 0.9972, + "step": 16178 + }, + { + "epoch": 2.880519943019943, + "grad_norm": 0.8259425163269043, + "learning_rate": 3.6329674266732194e-05, + "loss": 0.8947, + "step": 16179 + }, + { + "epoch": 2.880698005698006, + "grad_norm": 0.7992371320724487, + "learning_rate": 3.631888131490438e-05, + "loss": 0.8346, + "step": 16180 + }, + { + "epoch": 
2.880876068376068, + "grad_norm": 0.9702637195587158, + "learning_rate": 3.6308089610775775e-05, + "loss": 0.9543, + "step": 16181 + }, + { + "epoch": 2.881054131054131, + "grad_norm": 0.8494347929954529, + "learning_rate": 3.6297299154557905e-05, + "loss": 0.8672, + "step": 16182 + }, + { + "epoch": 2.8812321937321936, + "grad_norm": 0.8098355531692505, + "learning_rate": 3.628650994646207e-05, + "loss": 0.8356, + "step": 16183 + }, + { + "epoch": 2.8814102564102564, + "grad_norm": 0.7736916542053223, + "learning_rate": 3.627572198669974e-05, + "loss": 0.849, + "step": 16184 + }, + { + "epoch": 2.881588319088319, + "grad_norm": 0.806710422039032, + "learning_rate": 3.626493527548226e-05, + "loss": 0.9479, + "step": 16185 + }, + { + "epoch": 2.881766381766382, + "grad_norm": 1.0013649463653564, + "learning_rate": 3.625414981302095e-05, + "loss": 0.967, + "step": 16186 + }, + { + "epoch": 2.8819444444444446, + "grad_norm": 0.8116905093193054, + "learning_rate": 3.624336559952723e-05, + "loss": 0.8608, + "step": 16187 + }, + { + "epoch": 2.8821225071225074, + "grad_norm": 0.7848439812660217, + "learning_rate": 3.6232582635212233e-05, + "loss": 0.9224, + "step": 16188 + }, + { + "epoch": 2.8823005698005697, + "grad_norm": 0.9881102442741394, + "learning_rate": 3.622180092028735e-05, + "loss": 0.8089, + "step": 16189 + }, + { + "epoch": 2.8824786324786325, + "grad_norm": 0.790452778339386, + "learning_rate": 3.6211020454963796e-05, + "loss": 0.8312, + "step": 16190 + }, + { + "epoch": 2.882656695156695, + "grad_norm": 0.8228929042816162, + "learning_rate": 3.6200241239452784e-05, + "loss": 0.8277, + "step": 16191 + }, + { + "epoch": 2.882834757834758, + "grad_norm": 0.871133029460907, + "learning_rate": 3.618946327396552e-05, + "loss": 0.8861, + "step": 16192 + }, + { + "epoch": 2.8830128205128203, + "grad_norm": 0.8964230418205261, + "learning_rate": 3.617868655871316e-05, + "loss": 0.9987, + "step": 16193 + }, + { + "epoch": 2.883190883190883, + "grad_norm": 
0.8292636275291443, + "learning_rate": 3.6167911093906856e-05, + "loss": 0.9823, + "step": 16194 + }, + { + "epoch": 2.8833689458689458, + "grad_norm": 0.9594070315361023, + "learning_rate": 3.615713687975774e-05, + "loss": 0.7963, + "step": 16195 + }, + { + "epoch": 2.8835470085470085, + "grad_norm": 1.014891266822815, + "learning_rate": 3.6146363916476864e-05, + "loss": 0.9203, + "step": 16196 + }, + { + "epoch": 2.8837250712250713, + "grad_norm": 0.8205485343933105, + "learning_rate": 3.6135592204275424e-05, + "loss": 0.894, + "step": 16197 + }, + { + "epoch": 2.883903133903134, + "grad_norm": 0.9057072997093201, + "learning_rate": 3.6124821743364315e-05, + "loss": 0.9131, + "step": 16198 + }, + { + "epoch": 2.8840811965811968, + "grad_norm": 0.8080529570579529, + "learning_rate": 3.6114052533954665e-05, + "loss": 0.9995, + "step": 16199 + }, + { + "epoch": 2.8842592592592595, + "grad_norm": 0.762264609336853, + "learning_rate": 3.6103284576257446e-05, + "loss": 0.915, + "step": 16200 + }, + { + "epoch": 2.884437321937322, + "grad_norm": 0.8386275172233582, + "learning_rate": 3.609251787048363e-05, + "loss": 0.9016, + "step": 16201 + }, + { + "epoch": 2.8846153846153846, + "grad_norm": 0.9676657319068909, + "learning_rate": 3.608175241684417e-05, + "loss": 1.1004, + "step": 16202 + }, + { + "epoch": 2.8847934472934473, + "grad_norm": 0.8155630826950073, + "learning_rate": 3.607098821554999e-05, + "loss": 0.8579, + "step": 16203 + }, + { + "epoch": 2.88497150997151, + "grad_norm": 0.8426685333251953, + "learning_rate": 3.606022526681201e-05, + "loss": 0.8722, + "step": 16204 + }, + { + "epoch": 2.8851495726495724, + "grad_norm": 0.7646408081054688, + "learning_rate": 3.604946357084105e-05, + "loss": 0.7806, + "step": 16205 + }, + { + "epoch": 2.885327635327635, + "grad_norm": 0.808560848236084, + "learning_rate": 3.603870312784803e-05, + "loss": 0.8419, + "step": 16206 + }, + { + "epoch": 2.885505698005698, + "grad_norm": 0.7197920083999634, + "learning_rate": 
3.602794393804376e-05, + "loss": 0.7356, + "step": 16207 + }, + { + "epoch": 2.8856837606837606, + "grad_norm": 0.9405228495597839, + "learning_rate": 3.6017186001639036e-05, + "loss": 0.7279, + "step": 16208 + }, + { + "epoch": 2.8858618233618234, + "grad_norm": 0.8910958170890808, + "learning_rate": 3.600642931884465e-05, + "loss": 0.7653, + "step": 16209 + }, + { + "epoch": 2.886039886039886, + "grad_norm": 0.7354677319526672, + "learning_rate": 3.599567388987134e-05, + "loss": 0.6884, + "step": 16210 + }, + { + "epoch": 2.886217948717949, + "grad_norm": 0.75583815574646, + "learning_rate": 3.598491971492981e-05, + "loss": 0.7555, + "step": 16211 + }, + { + "epoch": 2.8863960113960117, + "grad_norm": 0.839042603969574, + "learning_rate": 3.597416679423086e-05, + "loss": 1.0522, + "step": 16212 + }, + { + "epoch": 2.886574074074074, + "grad_norm": 0.7965270280838013, + "learning_rate": 3.596341512798505e-05, + "loss": 0.9101, + "step": 16213 + }, + { + "epoch": 2.8867521367521367, + "grad_norm": 0.9170811772346497, + "learning_rate": 3.5952664716403154e-05, + "loss": 0.9078, + "step": 16214 + }, + { + "epoch": 2.8869301994301995, + "grad_norm": 0.8612267374992371, + "learning_rate": 3.5941915559695685e-05, + "loss": 0.8615, + "step": 16215 + }, + { + "epoch": 2.887108262108262, + "grad_norm": 0.9182412028312683, + "learning_rate": 3.593116765807335e-05, + "loss": 0.8918, + "step": 16216 + }, + { + "epoch": 2.8872863247863245, + "grad_norm": 0.9452505707740784, + "learning_rate": 3.592042101174669e-05, + "loss": 1.0386, + "step": 16217 + }, + { + "epoch": 2.8874643874643873, + "grad_norm": 0.8544769287109375, + "learning_rate": 3.5909675620926255e-05, + "loss": 1.0305, + "step": 16218 + }, + { + "epoch": 2.88764245014245, + "grad_norm": 0.8184592127799988, + "learning_rate": 3.5898931485822605e-05, + "loss": 0.6815, + "step": 16219 + }, + { + "epoch": 2.8878205128205128, + "grad_norm": 0.8263654708862305, + "learning_rate": 3.5888188606646224e-05, + "loss": 
1.0097, + "step": 16220 + }, + { + "epoch": 2.8879985754985755, + "grad_norm": 0.8290582299232483, + "learning_rate": 3.587744698360758e-05, + "loss": 0.6539, + "step": 16221 + }, + { + "epoch": 2.8881766381766383, + "grad_norm": 0.7936849594116211, + "learning_rate": 3.5866706616917226e-05, + "loss": 0.9524, + "step": 16222 + }, + { + "epoch": 2.888354700854701, + "grad_norm": 0.9449033737182617, + "learning_rate": 3.585596750678546e-05, + "loss": 0.9243, + "step": 16223 + }, + { + "epoch": 2.888532763532764, + "grad_norm": 0.7599559426307678, + "learning_rate": 3.58452296534228e-05, + "loss": 0.8597, + "step": 16224 + }, + { + "epoch": 2.888710826210826, + "grad_norm": 0.8485760688781738, + "learning_rate": 3.583449305703959e-05, + "loss": 0.7236, + "step": 16225 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.8510624170303345, + "learning_rate": 3.582375771784616e-05, + "loss": 0.9081, + "step": 16226 + }, + { + "epoch": 2.8890669515669516, + "grad_norm": 0.815827488899231, + "learning_rate": 3.581302363605296e-05, + "loss": 0.8259, + "step": 16227 + }, + { + "epoch": 2.8892450142450143, + "grad_norm": 0.7588803768157959, + "learning_rate": 3.580229081187016e-05, + "loss": 0.8722, + "step": 16228 + }, + { + "epoch": 2.8894230769230766, + "grad_norm": 1.0699365139007568, + "learning_rate": 3.579155924550817e-05, + "loss": 1.0261, + "step": 16229 + }, + { + "epoch": 2.8896011396011394, + "grad_norm": 0.9127700924873352, + "learning_rate": 3.5780828937177126e-05, + "loss": 0.8993, + "step": 16230 + }, + { + "epoch": 2.889779202279202, + "grad_norm": 0.8101344108581543, + "learning_rate": 3.577009988708737e-05, + "loss": 1.0002, + "step": 16231 + }, + { + "epoch": 2.889957264957265, + "grad_norm": 0.8259516358375549, + "learning_rate": 3.5759372095449085e-05, + "loss": 0.9223, + "step": 16232 + }, + { + "epoch": 2.8901353276353277, + "grad_norm": 0.8366032242774963, + "learning_rate": 3.574864556247246e-05, + "loss": 0.9635, + "step": 16233 + }, + { + 
"epoch": 2.8903133903133904, + "grad_norm": 0.8667864203453064, + "learning_rate": 3.573792028836764e-05, + "loss": 0.9923, + "step": 16234 + }, + { + "epoch": 2.890491452991453, + "grad_norm": 0.8982881903648376, + "learning_rate": 3.5727196273344784e-05, + "loss": 0.8735, + "step": 16235 + }, + { + "epoch": 2.890669515669516, + "grad_norm": 0.8208550214767456, + "learning_rate": 3.571647351761398e-05, + "loss": 0.809, + "step": 16236 + }, + { + "epoch": 2.890847578347578, + "grad_norm": 1.0483866930007935, + "learning_rate": 3.5705752021385395e-05, + "loss": 0.9106, + "step": 16237 + }, + { + "epoch": 2.891025641025641, + "grad_norm": 0.8783283829689026, + "learning_rate": 3.5695031784868984e-05, + "loss": 0.8061, + "step": 16238 + }, + { + "epoch": 2.8912037037037037, + "grad_norm": 0.7530997395515442, + "learning_rate": 3.56843128082749e-05, + "loss": 0.58, + "step": 16239 + }, + { + "epoch": 2.8913817663817665, + "grad_norm": 1.0263420343399048, + "learning_rate": 3.567359509181304e-05, + "loss": 0.9946, + "step": 16240 + }, + { + "epoch": 2.8915598290598292, + "grad_norm": 0.7237655520439148, + "learning_rate": 3.5662878635693484e-05, + "loss": 0.6941, + "step": 16241 + }, + { + "epoch": 2.8917378917378915, + "grad_norm": 0.8415992856025696, + "learning_rate": 3.565216344012618e-05, + "loss": 0.7908, + "step": 16242 + }, + { + "epoch": 2.8919159544159543, + "grad_norm": 0.8285344243049622, + "learning_rate": 3.564144950532107e-05, + "loss": 0.8044, + "step": 16243 + }, + { + "epoch": 2.892094017094017, + "grad_norm": 0.9715448617935181, + "learning_rate": 3.5630736831488046e-05, + "loss": 0.8236, + "step": 16244 + }, + { + "epoch": 2.89227207977208, + "grad_norm": 0.8461307287216187, + "learning_rate": 3.5620025418836985e-05, + "loss": 0.9222, + "step": 16245 + }, + { + "epoch": 2.8924501424501425, + "grad_norm": 0.7191013097763062, + "learning_rate": 3.5609315267577836e-05, + "loss": 0.6089, + "step": 16246 + }, + { + "epoch": 2.8926282051282053, + 
"grad_norm": 0.8537501692771912, + "learning_rate": 3.559860637792038e-05, + "loss": 0.8771, + "step": 16247 + }, + { + "epoch": 2.892806267806268, + "grad_norm": 0.8684942126274109, + "learning_rate": 3.558789875007447e-05, + "loss": 0.9009, + "step": 16248 + }, + { + "epoch": 2.8929843304843303, + "grad_norm": 1.0619043111801147, + "learning_rate": 3.5577192384249856e-05, + "loss": 0.7283, + "step": 16249 + }, + { + "epoch": 2.893162393162393, + "grad_norm": 0.8889201879501343, + "learning_rate": 3.556648728065635e-05, + "loss": 0.7538, + "step": 16250 + }, + { + "epoch": 2.893340455840456, + "grad_norm": 0.8162542581558228, + "learning_rate": 3.555578343950367e-05, + "loss": 0.8338, + "step": 16251 + }, + { + "epoch": 2.8935185185185186, + "grad_norm": 0.8750036954879761, + "learning_rate": 3.5545080861001535e-05, + "loss": 0.6956, + "step": 16252 + }, + { + "epoch": 2.8936965811965814, + "grad_norm": 0.911232054233551, + "learning_rate": 3.553437954535962e-05, + "loss": 0.7559, + "step": 16253 + }, + { + "epoch": 2.8938746438746437, + "grad_norm": 0.889566957950592, + "learning_rate": 3.5523679492787685e-05, + "loss": 0.969, + "step": 16254 + }, + { + "epoch": 2.8940527065527064, + "grad_norm": 0.933595597743988, + "learning_rate": 3.551298070349525e-05, + "loss": 0.9767, + "step": 16255 + }, + { + "epoch": 2.894230769230769, + "grad_norm": 0.8633596897125244, + "learning_rate": 3.550228317769203e-05, + "loss": 0.7823, + "step": 16256 + }, + { + "epoch": 2.894408831908832, + "grad_norm": 0.8595561981201172, + "learning_rate": 3.5491586915587585e-05, + "loss": 0.9583, + "step": 16257 + }, + { + "epoch": 2.8945868945868947, + "grad_norm": 0.7595796585083008, + "learning_rate": 3.548089191739149e-05, + "loss": 0.7167, + "step": 16258 + }, + { + "epoch": 2.8947649572649574, + "grad_norm": 0.8662711381912231, + "learning_rate": 3.54701981833133e-05, + "loss": 0.9439, + "step": 16259 + }, + { + "epoch": 2.89494301994302, + "grad_norm": 0.8028330206871033, + 
"learning_rate": 3.5459505713562525e-05, + "loss": 0.7442, + "step": 16260 + }, + { + "epoch": 2.8951210826210825, + "grad_norm": 0.9413794279098511, + "learning_rate": 3.5448814508348616e-05, + "loss": 0.97, + "step": 16261 + }, + { + "epoch": 2.8952991452991452, + "grad_norm": 0.9300761222839355, + "learning_rate": 3.543812456788118e-05, + "loss": 0.9294, + "step": 16262 + }, + { + "epoch": 2.895477207977208, + "grad_norm": 0.7364256381988525, + "learning_rate": 3.54274358923695e-05, + "loss": 0.5352, + "step": 16263 + }, + { + "epoch": 2.8956552706552707, + "grad_norm": 0.8452964425086975, + "learning_rate": 3.541674848202314e-05, + "loss": 0.7362, + "step": 16264 + }, + { + "epoch": 2.8958333333333335, + "grad_norm": 0.8745927214622498, + "learning_rate": 3.540606233705137e-05, + "loss": 1.0222, + "step": 16265 + }, + { + "epoch": 2.896011396011396, + "grad_norm": 0.9202282428741455, + "learning_rate": 3.539537745766367e-05, + "loss": 1.0853, + "step": 16266 + }, + { + "epoch": 2.8961894586894585, + "grad_norm": 0.8450053930282593, + "learning_rate": 3.538469384406933e-05, + "loss": 0.9295, + "step": 16267 + }, + { + "epoch": 2.8963675213675213, + "grad_norm": 0.8761671781539917, + "learning_rate": 3.5374011496477656e-05, + "loss": 0.9273, + "step": 16268 + }, + { + "epoch": 2.896545584045584, + "grad_norm": 0.8562198281288147, + "learning_rate": 3.536333041509805e-05, + "loss": 0.7654, + "step": 16269 + }, + { + "epoch": 2.896723646723647, + "grad_norm": 0.9013510942459106, + "learning_rate": 3.535265060013965e-05, + "loss": 0.7432, + "step": 16270 + }, + { + "epoch": 2.8969017094017095, + "grad_norm": 1.1163274049758911, + "learning_rate": 3.534197205181179e-05, + "loss": 1.1223, + "step": 16271 + }, + { + "epoch": 2.8970797720797723, + "grad_norm": 0.869686484336853, + "learning_rate": 3.5331294770323674e-05, + "loss": 0.9255, + "step": 16272 + }, + { + "epoch": 2.8972578347578346, + "grad_norm": 0.8521125316619873, + "learning_rate": 3.53206187558845e-05, + 
"loss": 1.0221, + "step": 16273 + }, + { + "epoch": 2.8974358974358974, + "grad_norm": 0.9807026982307434, + "learning_rate": 3.530994400870345e-05, + "loss": 1.0133, + "step": 16274 + }, + { + "epoch": 2.89761396011396, + "grad_norm": 0.9236428141593933, + "learning_rate": 3.529927052898967e-05, + "loss": 0.8711, + "step": 16275 + }, + { + "epoch": 2.897792022792023, + "grad_norm": 0.8108885884284973, + "learning_rate": 3.528859831695227e-05, + "loss": 0.917, + "step": 16276 + }, + { + "epoch": 2.8979700854700856, + "grad_norm": 0.7522720098495483, + "learning_rate": 3.527792737280036e-05, + "loss": 0.6776, + "step": 16277 + }, + { + "epoch": 2.898148148148148, + "grad_norm": 0.9261712431907654, + "learning_rate": 3.526725769674297e-05, + "loss": 1.0728, + "step": 16278 + }, + { + "epoch": 2.8983262108262107, + "grad_norm": 0.849559485912323, + "learning_rate": 3.5256589288989285e-05, + "loss": 0.9886, + "step": 16279 + }, + { + "epoch": 2.8985042735042734, + "grad_norm": 0.9024273157119751, + "learning_rate": 3.5245922149748155e-05, + "loss": 0.9296, + "step": 16280 + }, + { + "epoch": 2.898682336182336, + "grad_norm": 0.8285173773765564, + "learning_rate": 3.52352562792287e-05, + "loss": 0.8776, + "step": 16281 + }, + { + "epoch": 2.898860398860399, + "grad_norm": 0.895517885684967, + "learning_rate": 3.522459167763987e-05, + "loss": 0.8975, + "step": 16282 + }, + { + "epoch": 2.8990384615384617, + "grad_norm": 0.8449265956878662, + "learning_rate": 3.521392834519061e-05, + "loss": 0.6736, + "step": 16283 + }, + { + "epoch": 2.8992165242165244, + "grad_norm": 0.8652997612953186, + "learning_rate": 3.520326628208983e-05, + "loss": 0.7834, + "step": 16284 + }, + { + "epoch": 2.8993945868945867, + "grad_norm": 0.9012393951416016, + "learning_rate": 3.519260548854642e-05, + "loss": 0.8082, + "step": 16285 + }, + { + "epoch": 2.8995726495726495, + "grad_norm": 0.9048463106155396, + "learning_rate": 3.5181945964769333e-05, + "loss": 0.8078, + "step": 16286 + }, + { + 
"epoch": 2.8997507122507122, + "grad_norm": 0.8788473010063171, + "learning_rate": 3.5171287710967314e-05, + "loss": 0.8022, + "step": 16287 + }, + { + "epoch": 2.899928774928775, + "grad_norm": 0.8322813510894775, + "learning_rate": 3.516063072734928e-05, + "loss": 0.8855, + "step": 16288 + }, + { + "epoch": 2.9001068376068377, + "grad_norm": 0.8762373328208923, + "learning_rate": 3.514997501412398e-05, + "loss": 0.7858, + "step": 16289 + }, + { + "epoch": 2.9002849002849, + "grad_norm": 0.7718746066093445, + "learning_rate": 3.513932057150021e-05, + "loss": 0.6881, + "step": 16290 + }, + { + "epoch": 2.900462962962963, + "grad_norm": 1.0138204097747803, + "learning_rate": 3.5128667399686724e-05, + "loss": 0.9378, + "step": 16291 + }, + { + "epoch": 2.9006410256410255, + "grad_norm": 0.6968120336532593, + "learning_rate": 3.5118015498892234e-05, + "loss": 0.7135, + "step": 16292 + }, + { + "epoch": 2.9008190883190883, + "grad_norm": 0.7925532460212708, + "learning_rate": 3.510736486932542e-05, + "loss": 0.7414, + "step": 16293 + }, + { + "epoch": 2.900997150997151, + "grad_norm": 1.0432425737380981, + "learning_rate": 3.5096715511195056e-05, + "loss": 0.8957, + "step": 16294 + }, + { + "epoch": 2.901175213675214, + "grad_norm": 0.8664390444755554, + "learning_rate": 3.508606742470966e-05, + "loss": 0.7832, + "step": 16295 + }, + { + "epoch": 2.9013532763532766, + "grad_norm": 0.8470353484153748, + "learning_rate": 3.507542061007795e-05, + "loss": 0.8133, + "step": 16296 + }, + { + "epoch": 2.9015313390313393, + "grad_norm": 0.8339848518371582, + "learning_rate": 3.5064775067508514e-05, + "loss": 0.9275, + "step": 16297 + }, + { + "epoch": 2.9017094017094016, + "grad_norm": 0.8686776757240295, + "learning_rate": 3.5054130797209916e-05, + "loss": 0.9238, + "step": 16298 + }, + { + "epoch": 2.9018874643874644, + "grad_norm": 0.8178901076316833, + "learning_rate": 3.504348779939071e-05, + "loss": 1.0776, + "step": 16299 + }, + { + "epoch": 2.902065527065527, + 
"grad_norm": 0.8446379899978638, + "learning_rate": 3.5032846074259426e-05, + "loss": 0.715, + "step": 16300 + }, + { + "epoch": 2.90224358974359, + "grad_norm": 0.8320762515068054, + "learning_rate": 3.502220562202457e-05, + "loss": 0.8543, + "step": 16301 + }, + { + "epoch": 2.902421652421652, + "grad_norm": 0.9112939238548279, + "learning_rate": 3.501156644289462e-05, + "loss": 0.8239, + "step": 16302 + }, + { + "epoch": 2.902599715099715, + "grad_norm": 0.8749213218688965, + "learning_rate": 3.500092853707797e-05, + "loss": 0.8057, + "step": 16303 + }, + { + "epoch": 2.9027777777777777, + "grad_norm": 0.8245106339454651, + "learning_rate": 3.4990291904783143e-05, + "loss": 0.8955, + "step": 16304 + }, + { + "epoch": 2.9029558404558404, + "grad_norm": 0.7982145547866821, + "learning_rate": 3.4979656546218506e-05, + "loss": 0.6641, + "step": 16305 + }, + { + "epoch": 2.903133903133903, + "grad_norm": 0.8777986168861389, + "learning_rate": 3.496902246159244e-05, + "loss": 0.7365, + "step": 16306 + }, + { + "epoch": 2.903311965811966, + "grad_norm": 0.8463431596755981, + "learning_rate": 3.4958389651113275e-05, + "loss": 1.0434, + "step": 16307 + }, + { + "epoch": 2.9034900284900287, + "grad_norm": 0.849039614200592, + "learning_rate": 3.494775811498931e-05, + "loss": 0.8813, + "step": 16308 + }, + { + "epoch": 2.9036680911680914, + "grad_norm": 0.7352656126022339, + "learning_rate": 3.4937127853428976e-05, + "loss": 0.7936, + "step": 16309 + }, + { + "epoch": 2.9038461538461537, + "grad_norm": 0.764543354511261, + "learning_rate": 3.49264988666404e-05, + "loss": 0.7253, + "step": 16310 + }, + { + "epoch": 2.9040242165242165, + "grad_norm": 0.7932603359222412, + "learning_rate": 3.491587115483196e-05, + "loss": 0.9217, + "step": 16311 + }, + { + "epoch": 2.9042022792022792, + "grad_norm": 0.9001819491386414, + "learning_rate": 3.490524471821175e-05, + "loss": 0.8628, + "step": 16312 + }, + { + "epoch": 2.904380341880342, + "grad_norm": 0.7983253002166748, + 
"learning_rate": 3.4894619556988085e-05, + "loss": 0.9858, + "step": 16313 + }, + { + "epoch": 2.9045584045584043, + "grad_norm": 0.747611403465271, + "learning_rate": 3.488399567136911e-05, + "loss": 0.8656, + "step": 16314 + }, + { + "epoch": 2.904736467236467, + "grad_norm": 0.8770463466644287, + "learning_rate": 3.487337306156296e-05, + "loss": 1.0154, + "step": 16315 + }, + { + "epoch": 2.90491452991453, + "grad_norm": 0.8757193088531494, + "learning_rate": 3.4862751727777797e-05, + "loss": 0.7931, + "step": 16316 + }, + { + "epoch": 2.9050925925925926, + "grad_norm": 0.8232926726341248, + "learning_rate": 3.485213167022169e-05, + "loss": 0.8477, + "step": 16317 + }, + { + "epoch": 2.9052706552706553, + "grad_norm": 0.8445250391960144, + "learning_rate": 3.48415128891027e-05, + "loss": 1.0618, + "step": 16318 + }, + { + "epoch": 2.905448717948718, + "grad_norm": 0.8172810673713684, + "learning_rate": 3.483089538462897e-05, + "loss": 0.7882, + "step": 16319 + }, + { + "epoch": 2.905626780626781, + "grad_norm": 0.910757303237915, + "learning_rate": 3.4820279157008404e-05, + "loss": 0.9383, + "step": 16320 + }, + { + "epoch": 2.9058048433048436, + "grad_norm": 0.8837474584579468, + "learning_rate": 3.480966420644911e-05, + "loss": 1.1006, + "step": 16321 + }, + { + "epoch": 2.905982905982906, + "grad_norm": 0.7739782333374023, + "learning_rate": 3.4799050533159014e-05, + "loss": 0.6885, + "step": 16322 + }, + { + "epoch": 2.9061609686609686, + "grad_norm": 0.911738932132721, + "learning_rate": 3.478843813734609e-05, + "loss": 0.993, + "step": 16323 + }, + { + "epoch": 2.9063390313390314, + "grad_norm": 0.8834345936775208, + "learning_rate": 3.477782701921825e-05, + "loss": 0.8724, + "step": 16324 + }, + { + "epoch": 2.906517094017094, + "grad_norm": 0.8103434443473816, + "learning_rate": 3.476721717898337e-05, + "loss": 1.0517, + "step": 16325 + }, + { + "epoch": 2.9066951566951564, + "grad_norm": 0.8648924827575684, + "learning_rate": 3.475660861684943e-05, + 
"loss": 0.7223, + "step": 16326 + }, + { + "epoch": 2.906873219373219, + "grad_norm": 0.8762979507446289, + "learning_rate": 3.4746001333024134e-05, + "loss": 0.8421, + "step": 16327 + }, + { + "epoch": 2.907051282051282, + "grad_norm": 0.9596083164215088, + "learning_rate": 3.4735395327715434e-05, + "loss": 1.0206, + "step": 16328 + }, + { + "epoch": 2.9072293447293447, + "grad_norm": 0.8210311532020569, + "learning_rate": 3.472479060113107e-05, + "loss": 0.836, + "step": 16329 + }, + { + "epoch": 2.9074074074074074, + "grad_norm": 0.8436611890792847, + "learning_rate": 3.471418715347886e-05, + "loss": 0.8459, + "step": 16330 + }, + { + "epoch": 2.90758547008547, + "grad_norm": 0.9176212549209595, + "learning_rate": 3.470358498496652e-05, + "loss": 0.8227, + "step": 16331 + }, + { + "epoch": 2.907763532763533, + "grad_norm": 0.7709631323814392, + "learning_rate": 3.4692984095801796e-05, + "loss": 0.8096, + "step": 16332 + }, + { + "epoch": 2.9079415954415957, + "grad_norm": 0.7727495431900024, + "learning_rate": 3.4682384486192346e-05, + "loss": 0.6843, + "step": 16333 + }, + { + "epoch": 2.908119658119658, + "grad_norm": 0.9743461608886719, + "learning_rate": 3.4671786156345955e-05, + "loss": 0.7542, + "step": 16334 + }, + { + "epoch": 2.9082977207977208, + "grad_norm": 0.9035171270370483, + "learning_rate": 3.466118910647014e-05, + "loss": 0.8827, + "step": 16335 + }, + { + "epoch": 2.9084757834757835, + "grad_norm": 1.1434134244918823, + "learning_rate": 3.465059333677266e-05, + "loss": 0.8026, + "step": 16336 + }, + { + "epoch": 2.9086538461538463, + "grad_norm": 0.8229905962944031, + "learning_rate": 3.4639998847461e-05, + "loss": 0.7415, + "step": 16337 + }, + { + "epoch": 2.9088319088319086, + "grad_norm": 0.9193732738494873, + "learning_rate": 3.462940563874281e-05, + "loss": 0.9773, + "step": 16338 + }, + { + "epoch": 2.9090099715099713, + "grad_norm": 0.8461189270019531, + "learning_rate": 3.4618813710825614e-05, + "loss": 0.9169, + "step": 16339 + }, + 
{ + "epoch": 2.909188034188034, + "grad_norm": 0.9471017718315125, + "learning_rate": 3.460822306391696e-05, + "loss": 0.9941, + "step": 16340 + }, + { + "epoch": 2.909366096866097, + "grad_norm": 0.8515542149543762, + "learning_rate": 3.459763369822432e-05, + "loss": 0.8209, + "step": 16341 + }, + { + "epoch": 2.9095441595441596, + "grad_norm": 0.8520704507827759, + "learning_rate": 3.458704561395519e-05, + "loss": 0.8443, + "step": 16342 + }, + { + "epoch": 2.9097222222222223, + "grad_norm": 0.9236885905265808, + "learning_rate": 3.457645881131699e-05, + "loss": 0.8407, + "step": 16343 + }, + { + "epoch": 2.909900284900285, + "grad_norm": 0.9255889654159546, + "learning_rate": 3.4565873290517203e-05, + "loss": 0.9754, + "step": 16344 + }, + { + "epoch": 2.910078347578348, + "grad_norm": 0.7904002666473389, + "learning_rate": 3.455528905176321e-05, + "loss": 0.7123, + "step": 16345 + }, + { + "epoch": 2.91025641025641, + "grad_norm": 0.821877658367157, + "learning_rate": 3.454470609526237e-05, + "loss": 0.7024, + "step": 16346 + }, + { + "epoch": 2.910434472934473, + "grad_norm": 0.84690260887146, + "learning_rate": 3.453412442122205e-05, + "loss": 0.7551, + "step": 16347 + }, + { + "epoch": 2.9106125356125356, + "grad_norm": 1.0308923721313477, + "learning_rate": 3.452354402984955e-05, + "loss": 1.0316, + "step": 16348 + }, + { + "epoch": 2.9107905982905984, + "grad_norm": 0.7016192674636841, + "learning_rate": 3.451296492135221e-05, + "loss": 0.6816, + "step": 16349 + }, + { + "epoch": 2.9109686609686607, + "grad_norm": 0.942915141582489, + "learning_rate": 3.4502387095937237e-05, + "loss": 0.9029, + "step": 16350 + }, + { + "epoch": 2.9111467236467234, + "grad_norm": 0.7511810660362244, + "learning_rate": 3.449181055381201e-05, + "loss": 0.641, + "step": 16351 + }, + { + "epoch": 2.911324786324786, + "grad_norm": 0.8904419541358948, + "learning_rate": 3.44812352951836e-05, + "loss": 0.8176, + "step": 16352 + }, + { + "epoch": 2.911502849002849, + "grad_norm": 
0.7585623264312744, + "learning_rate": 3.447066132025931e-05, + "loss": 0.7815, + "step": 16353 + }, + { + "epoch": 2.9116809116809117, + "grad_norm": 0.8587543964385986, + "learning_rate": 3.446008862924629e-05, + "loss": 0.951, + "step": 16354 + }, + { + "epoch": 2.9118589743589745, + "grad_norm": 0.7638232707977295, + "learning_rate": 3.444951722235169e-05, + "loss": 0.6784, + "step": 16355 + }, + { + "epoch": 2.912037037037037, + "grad_norm": 0.7942266464233398, + "learning_rate": 3.4438947099782624e-05, + "loss": 0.7211, + "step": 16356 + }, + { + "epoch": 2.9122150997151, + "grad_norm": 0.7207983732223511, + "learning_rate": 3.4428378261746195e-05, + "loss": 0.7515, + "step": 16357 + }, + { + "epoch": 2.9123931623931623, + "grad_norm": 0.8683337569236755, + "learning_rate": 3.4417810708449495e-05, + "loss": 0.8724, + "step": 16358 + }, + { + "epoch": 2.912571225071225, + "grad_norm": 0.8052859902381897, + "learning_rate": 3.440724444009955e-05, + "loss": 0.8331, + "step": 16359 + }, + { + "epoch": 2.9127492877492878, + "grad_norm": 0.7425459027290344, + "learning_rate": 3.439667945690336e-05, + "loss": 0.5711, + "step": 16360 + }, + { + "epoch": 2.9129273504273505, + "grad_norm": 0.8537404537200928, + "learning_rate": 3.438611575906803e-05, + "loss": 0.8283, + "step": 16361 + }, + { + "epoch": 2.9131054131054133, + "grad_norm": 0.794684648513794, + "learning_rate": 3.437555334680038e-05, + "loss": 0.679, + "step": 16362 + }, + { + "epoch": 2.9132834757834756, + "grad_norm": 0.8330501914024353, + "learning_rate": 3.436499222030748e-05, + "loss": 0.6892, + "step": 16363 + }, + { + "epoch": 2.9134615384615383, + "grad_norm": 0.9036495089530945, + "learning_rate": 3.435443237979621e-05, + "loss": 0.8349, + "step": 16364 + }, + { + "epoch": 2.913639601139601, + "grad_norm": 0.776745080947876, + "learning_rate": 3.434387382547344e-05, + "loss": 0.6691, + "step": 16365 + }, + { + "epoch": 2.913817663817664, + "grad_norm": 0.8921083807945251, + "learning_rate": 
3.4333316557546145e-05, + "loss": 0.9497, + "step": 16366 + }, + { + "epoch": 2.9139957264957266, + "grad_norm": 0.8435728549957275, + "learning_rate": 3.4322760576221023e-05, + "loss": 0.787, + "step": 16367 + }, + { + "epoch": 2.9141737891737893, + "grad_norm": 0.7281554341316223, + "learning_rate": 3.4312205881705015e-05, + "loss": 0.7604, + "step": 16368 + }, + { + "epoch": 2.914351851851852, + "grad_norm": 0.8474677801132202, + "learning_rate": 3.430165247420488e-05, + "loss": 0.9052, + "step": 16369 + }, + { + "epoch": 2.9145299145299144, + "grad_norm": 0.9927265048027039, + "learning_rate": 3.42911003539274e-05, + "loss": 0.7486, + "step": 16370 + }, + { + "epoch": 2.914707977207977, + "grad_norm": 0.7754966020584106, + "learning_rate": 3.4280549521079286e-05, + "loss": 0.8368, + "step": 16371 + }, + { + "epoch": 2.91488603988604, + "grad_norm": 0.7774887084960938, + "learning_rate": 3.4269999975867295e-05, + "loss": 0.7014, + "step": 16372 + }, + { + "epoch": 2.9150641025641026, + "grad_norm": 0.9286267161369324, + "learning_rate": 3.4259451718498115e-05, + "loss": 1.0414, + "step": 16373 + }, + { + "epoch": 2.9152421652421654, + "grad_norm": 0.8269815444946289, + "learning_rate": 3.424890474917841e-05, + "loss": 0.9731, + "step": 16374 + }, + { + "epoch": 2.9154202279202277, + "grad_norm": 0.8319926857948303, + "learning_rate": 3.42383590681148e-05, + "loss": 0.9047, + "step": 16375 + }, + { + "epoch": 2.9155982905982905, + "grad_norm": 0.8976882696151733, + "learning_rate": 3.4227814675514e-05, + "loss": 0.8608, + "step": 16376 + }, + { + "epoch": 2.915776353276353, + "grad_norm": 0.80831378698349, + "learning_rate": 3.421727157158248e-05, + "loss": 0.9213, + "step": 16377 + }, + { + "epoch": 2.915954415954416, + "grad_norm": 0.881201982498169, + "learning_rate": 3.42067297565269e-05, + "loss": 0.9015, + "step": 16378 + }, + { + "epoch": 2.9161324786324787, + "grad_norm": 0.7797132134437561, + "learning_rate": 3.4196189230553775e-05, + "loss": 0.9002, + 
"step": 16379 + }, + { + "epoch": 2.9163105413105415, + "grad_norm": 1.0163915157318115, + "learning_rate": 3.4185649993869626e-05, + "loss": 1.0185, + "step": 16380 + }, + { + "epoch": 2.916488603988604, + "grad_norm": 0.8868339657783508, + "learning_rate": 3.417511204668096e-05, + "loss": 0.7316, + "step": 16381 + }, + { + "epoch": 2.9166666666666665, + "grad_norm": 0.8215135931968689, + "learning_rate": 3.416457538919422e-05, + "loss": 0.8693, + "step": 16382 + }, + { + "epoch": 2.9168447293447293, + "grad_norm": 0.8070623278617859, + "learning_rate": 3.4154040021615876e-05, + "loss": 0.8532, + "step": 16383 + }, + { + "epoch": 2.917022792022792, + "grad_norm": 0.8857349157333374, + "learning_rate": 3.4143505944152284e-05, + "loss": 0.961, + "step": 16384 + }, + { + "epoch": 2.9172008547008548, + "grad_norm": 0.8146688342094421, + "learning_rate": 3.4132973157009936e-05, + "loss": 0.8597, + "step": 16385 + }, + { + "epoch": 2.9173789173789175, + "grad_norm": 0.8757439851760864, + "learning_rate": 3.4122441660395156e-05, + "loss": 0.7683, + "step": 16386 + }, + { + "epoch": 2.91755698005698, + "grad_norm": 0.8319665789604187, + "learning_rate": 3.4111911454514266e-05, + "loss": 0.8034, + "step": 16387 + }, + { + "epoch": 2.9177350427350426, + "grad_norm": 0.8103782534599304, + "learning_rate": 3.410138253957361e-05, + "loss": 0.8469, + "step": 16388 + }, + { + "epoch": 2.9179131054131053, + "grad_norm": 0.8783053755760193, + "learning_rate": 3.4090854915779466e-05, + "loss": 0.7997, + "step": 16389 + }, + { + "epoch": 2.918091168091168, + "grad_norm": 0.9793184995651245, + "learning_rate": 3.408032858333808e-05, + "loss": 0.8934, + "step": 16390 + }, + { + "epoch": 2.918269230769231, + "grad_norm": 0.8603301048278809, + "learning_rate": 3.406980354245578e-05, + "loss": 0.847, + "step": 16391 + }, + { + "epoch": 2.9184472934472936, + "grad_norm": 0.9763472080230713, + "learning_rate": 3.405927979333866e-05, + "loss": 0.9491, + "step": 16392 + }, + { + "epoch": 
2.9186253561253563, + "grad_norm": 0.9340085983276367, + "learning_rate": 3.4048757336193036e-05, + "loss": 0.8913, + "step": 16393 + }, + { + "epoch": 2.9188034188034186, + "grad_norm": 1.051066517829895, + "learning_rate": 3.4038236171224946e-05, + "loss": 0.9903, + "step": 16394 + }, + { + "epoch": 2.9189814814814814, + "grad_norm": 0.8462432026863098, + "learning_rate": 3.402771629864062e-05, + "loss": 0.868, + "step": 16395 + }, + { + "epoch": 2.919159544159544, + "grad_norm": 0.9149147868156433, + "learning_rate": 3.401719771864615e-05, + "loss": 0.9215, + "step": 16396 + }, + { + "epoch": 2.919337606837607, + "grad_norm": 0.7974846959114075, + "learning_rate": 3.4006680431447636e-05, + "loss": 0.9577, + "step": 16397 + }, + { + "epoch": 2.9195156695156697, + "grad_norm": 0.9298149943351746, + "learning_rate": 3.399616443725112e-05, + "loss": 0.8977, + "step": 16398 + }, + { + "epoch": 2.919693732193732, + "grad_norm": 0.8799013495445251, + "learning_rate": 3.398564973626265e-05, + "loss": 1.0161, + "step": 16399 + }, + { + "epoch": 2.9198717948717947, + "grad_norm": 0.8730150461196899, + "learning_rate": 3.3975136328688206e-05, + "loss": 0.7953, + "step": 16400 + }, + { + "epoch": 2.9200498575498575, + "grad_norm": 1.0036572217941284, + "learning_rate": 3.396462421473387e-05, + "loss": 0.5941, + "step": 16401 + }, + { + "epoch": 2.92022792022792, + "grad_norm": 0.8903291821479797, + "learning_rate": 3.39541133946055e-05, + "loss": 0.7715, + "step": 16402 + }, + { + "epoch": 2.920405982905983, + "grad_norm": 0.8254680633544922, + "learning_rate": 3.39436038685091e-05, + "loss": 0.7268, + "step": 16403 + }, + { + "epoch": 2.9205840455840457, + "grad_norm": 0.8490110039710999, + "learning_rate": 3.393309563665057e-05, + "loss": 0.8684, + "step": 16404 + }, + { + "epoch": 2.9207621082621085, + "grad_norm": 0.8137654066085815, + "learning_rate": 3.392258869923575e-05, + "loss": 0.7068, + "step": 16405 + }, + { + "epoch": 2.9209401709401708, + "grad_norm": 
0.8084680438041687, + "learning_rate": 3.39120830564706e-05, + "loss": 0.8214, + "step": 16406 + }, + { + "epoch": 2.9211182336182335, + "grad_norm": 0.9359310269355774, + "learning_rate": 3.3901578708560835e-05, + "loss": 1.0247, + "step": 16407 + }, + { + "epoch": 2.9212962962962963, + "grad_norm": 0.826842725276947, + "learning_rate": 3.38910756557124e-05, + "loss": 0.8018, + "step": 16408 + }, + { + "epoch": 2.921474358974359, + "grad_norm": 0.9049538373947144, + "learning_rate": 3.388057389813093e-05, + "loss": 0.8825, + "step": 16409 + }, + { + "epoch": 2.921652421652422, + "grad_norm": 0.7966105937957764, + "learning_rate": 3.387007343602231e-05, + "loss": 0.8716, + "step": 16410 + }, + { + "epoch": 2.921830484330484, + "grad_norm": 0.8647517561912537, + "learning_rate": 3.385957426959222e-05, + "loss": 0.9947, + "step": 16411 + }, + { + "epoch": 2.922008547008547, + "grad_norm": 0.8983362317085266, + "learning_rate": 3.384907639904638e-05, + "loss": 0.8701, + "step": 16412 + }, + { + "epoch": 2.9221866096866096, + "grad_norm": 0.7970262765884399, + "learning_rate": 3.3838579824590465e-05, + "loss": 0.8921, + "step": 16413 + }, + { + "epoch": 2.9223646723646723, + "grad_norm": 0.8115224242210388, + "learning_rate": 3.382808454643015e-05, + "loss": 0.5948, + "step": 16414 + }, + { + "epoch": 2.922542735042735, + "grad_norm": 0.832318902015686, + "learning_rate": 3.381759056477102e-05, + "loss": 0.8399, + "step": 16415 + }, + { + "epoch": 2.922720797720798, + "grad_norm": 0.9396497011184692, + "learning_rate": 3.380709787981878e-05, + "loss": 0.7322, + "step": 16416 + }, + { + "epoch": 2.9228988603988606, + "grad_norm": 0.7956545352935791, + "learning_rate": 3.3796606491778904e-05, + "loss": 0.8566, + "step": 16417 + }, + { + "epoch": 2.9230769230769234, + "grad_norm": 0.8257092833518982, + "learning_rate": 3.378611640085705e-05, + "loss": 0.8682, + "step": 16418 + }, + { + "epoch": 2.9232549857549857, + "grad_norm": 0.7565430402755737, + "learning_rate": 
3.377562760725863e-05, + "loss": 0.8513, + "step": 16419 + }, + { + "epoch": 2.9234330484330484, + "grad_norm": 0.769944965839386, + "learning_rate": 3.3765140111189265e-05, + "loss": 0.8869, + "step": 16420 + }, + { + "epoch": 2.923611111111111, + "grad_norm": 0.8117483854293823, + "learning_rate": 3.375465391285438e-05, + "loss": 0.8718, + "step": 16421 + }, + { + "epoch": 2.923789173789174, + "grad_norm": 0.8011773824691772, + "learning_rate": 3.374416901245944e-05, + "loss": 0.9326, + "step": 16422 + }, + { + "epoch": 2.923967236467236, + "grad_norm": 0.8096779584884644, + "learning_rate": 3.373368541020987e-05, + "loss": 0.9317, + "step": 16423 + }, + { + "epoch": 2.924145299145299, + "grad_norm": 0.8383152484893799, + "learning_rate": 3.3723203106311055e-05, + "loss": 0.9399, + "step": 16424 + }, + { + "epoch": 2.9243233618233617, + "grad_norm": 0.9268670678138733, + "learning_rate": 3.371272210096842e-05, + "loss": 0.6867, + "step": 16425 + }, + { + "epoch": 2.9245014245014245, + "grad_norm": 0.941338837146759, + "learning_rate": 3.3702242394387294e-05, + "loss": 0.9164, + "step": 16426 + }, + { + "epoch": 2.9246794871794872, + "grad_norm": 0.7108882665634155, + "learning_rate": 3.3691763986773014e-05, + "loss": 0.6956, + "step": 16427 + }, + { + "epoch": 2.92485754985755, + "grad_norm": 0.9144331812858582, + "learning_rate": 3.3681286878330876e-05, + "loss": 0.8148, + "step": 16428 + }, + { + "epoch": 2.9250356125356127, + "grad_norm": 0.882851243019104, + "learning_rate": 3.367081106926615e-05, + "loss": 0.846, + "step": 16429 + }, + { + "epoch": 2.9252136752136755, + "grad_norm": 0.9508523941040039, + "learning_rate": 3.366033655978409e-05, + "loss": 0.9637, + "step": 16430 + }, + { + "epoch": 2.925391737891738, + "grad_norm": 0.7770804166793823, + "learning_rate": 3.3649863350089935e-05, + "loss": 0.6673, + "step": 16431 + }, + { + "epoch": 2.9255698005698005, + "grad_norm": 0.8342770934104919, + "learning_rate": 3.3639391440388845e-05, + "loss": 1.0366, 
+ "step": 16432 + }, + { + "epoch": 2.9257478632478633, + "grad_norm": 0.8854461908340454, + "learning_rate": 3.362892083088609e-05, + "loss": 0.7885, + "step": 16433 + }, + { + "epoch": 2.925925925925926, + "grad_norm": 0.8054807186126709, + "learning_rate": 3.36184515217867e-05, + "loss": 0.7705, + "step": 16434 + }, + { + "epoch": 2.9261039886039883, + "grad_norm": 0.9037294983863831, + "learning_rate": 3.360798351329587e-05, + "loss": 1.077, + "step": 16435 + }, + { + "epoch": 2.926282051282051, + "grad_norm": 0.8743478059768677, + "learning_rate": 3.35975168056187e-05, + "loss": 0.8633, + "step": 16436 + }, + { + "epoch": 2.926460113960114, + "grad_norm": 0.8879369497299194, + "learning_rate": 3.3587051398960245e-05, + "loss": 0.8059, + "step": 16437 + }, + { + "epoch": 2.9266381766381766, + "grad_norm": 0.9445768594741821, + "learning_rate": 3.3576587293525564e-05, + "loss": 0.861, + "step": 16438 + }, + { + "epoch": 2.9268162393162394, + "grad_norm": 0.8131009340286255, + "learning_rate": 3.356612448951967e-05, + "loss": 0.8299, + "step": 16439 + }, + { + "epoch": 2.926994301994302, + "grad_norm": 0.8781847953796387, + "learning_rate": 3.3555662987147515e-05, + "loss": 0.8447, + "step": 16440 + }, + { + "epoch": 2.927172364672365, + "grad_norm": 0.9173591732978821, + "learning_rate": 3.3545202786614206e-05, + "loss": 0.9297, + "step": 16441 + }, + { + "epoch": 2.9273504273504276, + "grad_norm": 0.9596586227416992, + "learning_rate": 3.353474388812452e-05, + "loss": 0.9109, + "step": 16442 + }, + { + "epoch": 2.92752849002849, + "grad_norm": 0.8325486183166504, + "learning_rate": 3.352428629188349e-05, + "loss": 0.6822, + "step": 16443 + }, + { + "epoch": 2.9277065527065527, + "grad_norm": 0.8758354187011719, + "learning_rate": 3.351382999809599e-05, + "loss": 0.9522, + "step": 16444 + }, + { + "epoch": 2.9278846153846154, + "grad_norm": 0.909718930721283, + "learning_rate": 3.3503375006966866e-05, + "loss": 0.8226, + "step": 16445 + }, + { + "epoch": 
2.928062678062678, + "grad_norm": 0.9259094595909119, + "learning_rate": 3.3492921318700974e-05, + "loss": 0.9895, + "step": 16446 + }, + { + "epoch": 2.9282407407407405, + "grad_norm": 0.7815300226211548, + "learning_rate": 3.348246893350311e-05, + "loss": 0.8854, + "step": 16447 + }, + { + "epoch": 2.9284188034188032, + "grad_norm": 0.8576910495758057, + "learning_rate": 3.3472017851578154e-05, + "loss": 0.9379, + "step": 16448 + }, + { + "epoch": 2.928596866096866, + "grad_norm": 0.9139176607131958, + "learning_rate": 3.3461568073130735e-05, + "loss": 0.9372, + "step": 16449 + }, + { + "epoch": 2.9287749287749287, + "grad_norm": 0.8304639458656311, + "learning_rate": 3.34511195983657e-05, + "loss": 0.7105, + "step": 16450 + }, + { + "epoch": 2.9289529914529915, + "grad_norm": 0.8689056634902954, + "learning_rate": 3.344067242748774e-05, + "loss": 0.9518, + "step": 16451 + }, + { + "epoch": 2.9291310541310542, + "grad_norm": 0.9180546998977661, + "learning_rate": 3.343022656070154e-05, + "loss": 0.982, + "step": 16452 + }, + { + "epoch": 2.929309116809117, + "grad_norm": 0.9291700124740601, + "learning_rate": 3.341978199821175e-05, + "loss": 1.0003, + "step": 16453 + }, + { + "epoch": 2.9294871794871797, + "grad_norm": 0.970675528049469, + "learning_rate": 3.340933874022304e-05, + "loss": 0.8941, + "step": 16454 + }, + { + "epoch": 2.929665242165242, + "grad_norm": 0.8425672650337219, + "learning_rate": 3.339889678693999e-05, + "loss": 0.7173, + "step": 16455 + }, + { + "epoch": 2.929843304843305, + "grad_norm": 0.9666314721107483, + "learning_rate": 3.3388456138567225e-05, + "loss": 1.0403, + "step": 16456 + }, + { + "epoch": 2.9300213675213675, + "grad_norm": 0.9586226344108582, + "learning_rate": 3.337801679530924e-05, + "loss": 0.7691, + "step": 16457 + }, + { + "epoch": 2.9301994301994303, + "grad_norm": 0.888327419757843, + "learning_rate": 3.33675787573707e-05, + "loss": 0.7019, + "step": 16458 + }, + { + "epoch": 2.9303774928774926, + "grad_norm": 
0.7410684823989868, + "learning_rate": 3.335714202495596e-05, + "loss": 0.9935, + "step": 16459 + }, + { + "epoch": 2.9305555555555554, + "grad_norm": 1.0408822298049927, + "learning_rate": 3.3346706598269617e-05, + "loss": 0.9454, + "step": 16460 + }, + { + "epoch": 2.930733618233618, + "grad_norm": 0.9065089821815491, + "learning_rate": 3.333627247751611e-05, + "loss": 0.9665, + "step": 16461 + }, + { + "epoch": 2.930911680911681, + "grad_norm": 0.8904961943626404, + "learning_rate": 3.332583966289985e-05, + "loss": 0.9402, + "step": 16462 + }, + { + "epoch": 2.9310897435897436, + "grad_norm": 0.920364260673523, + "learning_rate": 3.331540815462526e-05, + "loss": 0.8524, + "step": 16463 + }, + { + "epoch": 2.9312678062678064, + "grad_norm": 0.9185073375701904, + "learning_rate": 3.330497795289669e-05, + "loss": 0.8982, + "step": 16464 + }, + { + "epoch": 2.931445868945869, + "grad_norm": 0.9365581274032593, + "learning_rate": 3.32945490579186e-05, + "loss": 0.9272, + "step": 16465 + }, + { + "epoch": 2.931623931623932, + "grad_norm": 0.9139016270637512, + "learning_rate": 3.328412146989518e-05, + "loss": 1.0019, + "step": 16466 + }, + { + "epoch": 2.931801994301994, + "grad_norm": 0.9021140336990356, + "learning_rate": 3.327369518903085e-05, + "loss": 0.866, + "step": 16467 + }, + { + "epoch": 2.931980056980057, + "grad_norm": 0.8053449988365173, + "learning_rate": 3.326327021552984e-05, + "loss": 0.8915, + "step": 16468 + }, + { + "epoch": 2.9321581196581197, + "grad_norm": 1.0013985633850098, + "learning_rate": 3.325284654959643e-05, + "loss": 0.9063, + "step": 16469 + }, + { + "epoch": 2.9323361823361824, + "grad_norm": 0.8236168622970581, + "learning_rate": 3.324242419143483e-05, + "loss": 0.7113, + "step": 16470 + }, + { + "epoch": 2.932514245014245, + "grad_norm": 0.7256088852882385, + "learning_rate": 3.323200314124925e-05, + "loss": 0.6009, + "step": 16471 + }, + { + "epoch": 2.9326923076923075, + "grad_norm": 0.9991055727005005, + "learning_rate": 
3.322158339924384e-05, + "loss": 1.039, + "step": 16472 + }, + { + "epoch": 2.9328703703703702, + "grad_norm": 0.8769686818122864, + "learning_rate": 3.3211164965622845e-05, + "loss": 0.989, + "step": 16473 + }, + { + "epoch": 2.933048433048433, + "grad_norm": 0.8226488828659058, + "learning_rate": 3.320074784059026e-05, + "loss": 0.9269, + "step": 16474 + }, + { + "epoch": 2.9332264957264957, + "grad_norm": 0.7763178944587708, + "learning_rate": 3.31903320243503e-05, + "loss": 0.9245, + "step": 16475 + }, + { + "epoch": 2.9334045584045585, + "grad_norm": 0.9204390645027161, + "learning_rate": 3.3179917517107e-05, + "loss": 1.2374, + "step": 16476 + }, + { + "epoch": 2.9335826210826212, + "grad_norm": 0.9970325827598572, + "learning_rate": 3.31695043190644e-05, + "loss": 0.7862, + "step": 16477 + }, + { + "epoch": 2.933760683760684, + "grad_norm": 0.8539462685585022, + "learning_rate": 3.315909243042654e-05, + "loss": 0.8015, + "step": 16478 + }, + { + "epoch": 2.9339387464387463, + "grad_norm": 0.8208832144737244, + "learning_rate": 3.314868185139742e-05, + "loss": 0.8571, + "step": 16479 + }, + { + "epoch": 2.934116809116809, + "grad_norm": 0.8628700375556946, + "learning_rate": 3.313827258218101e-05, + "loss": 0.8274, + "step": 16480 + }, + { + "epoch": 2.934294871794872, + "grad_norm": 1.0472661256790161, + "learning_rate": 3.312786462298124e-05, + "loss": 0.9252, + "step": 16481 + }, + { + "epoch": 2.9344729344729346, + "grad_norm": 0.8292158842086792, + "learning_rate": 3.311745797400202e-05, + "loss": 0.8548, + "step": 16482 + }, + { + "epoch": 2.9346509971509973, + "grad_norm": 0.85657799243927, + "learning_rate": 3.310705263544731e-05, + "loss": 0.7657, + "step": 16483 + }, + { + "epoch": 2.9348290598290596, + "grad_norm": 0.7560283541679382, + "learning_rate": 3.309664860752095e-05, + "loss": 0.8063, + "step": 16484 + }, + { + "epoch": 2.9350071225071224, + "grad_norm": 0.7933059334754944, + "learning_rate": 3.308624589042677e-05, + "loss": 0.7102, + 
"step": 16485 + }, + { + "epoch": 2.935185185185185, + "grad_norm": 0.8430653810501099, + "learning_rate": 3.3075844484368615e-05, + "loss": 0.8815, + "step": 16486 + }, + { + "epoch": 2.935363247863248, + "grad_norm": 0.8113032579421997, + "learning_rate": 3.306544438955021e-05, + "loss": 0.8144, + "step": 16487 + }, + { + "epoch": 2.9355413105413106, + "grad_norm": 0.8344797492027283, + "learning_rate": 3.3055045606175474e-05, + "loss": 0.7164, + "step": 16488 + }, + { + "epoch": 2.9357193732193734, + "grad_norm": 1.0109050273895264, + "learning_rate": 3.3044648134447964e-05, + "loss": 0.6619, + "step": 16489 + }, + { + "epoch": 2.935897435897436, + "grad_norm": 0.8746094703674316, + "learning_rate": 3.303425197457156e-05, + "loss": 0.73, + "step": 16490 + }, + { + "epoch": 2.9360754985754984, + "grad_norm": 0.919924795627594, + "learning_rate": 3.302385712674981e-05, + "loss": 1.1008, + "step": 16491 + }, + { + "epoch": 2.936253561253561, + "grad_norm": 0.8407595753669739, + "learning_rate": 3.301346359118648e-05, + "loss": 1.0963, + "step": 16492 + }, + { + "epoch": 2.936431623931624, + "grad_norm": 0.8145756721496582, + "learning_rate": 3.3003071368085184e-05, + "loss": 0.805, + "step": 16493 + }, + { + "epoch": 2.9366096866096867, + "grad_norm": 0.8298826813697815, + "learning_rate": 3.299268045764953e-05, + "loss": 0.7075, + "step": 16494 + }, + { + "epoch": 2.9367877492877494, + "grad_norm": 1.1027754545211792, + "learning_rate": 3.2982290860083106e-05, + "loss": 1.0381, + "step": 16495 + }, + { + "epoch": 2.9369658119658117, + "grad_norm": 0.8431075215339661, + "learning_rate": 3.2971902575589476e-05, + "loss": 0.9013, + "step": 16496 + }, + { + "epoch": 2.9371438746438745, + "grad_norm": 1.0045086145401, + "learning_rate": 3.296151560437214e-05, + "loss": 0.9054, + "step": 16497 + }, + { + "epoch": 2.9373219373219372, + "grad_norm": 0.8290889263153076, + "learning_rate": 3.295112994663471e-05, + "loss": 0.8054, + "step": 16498 + }, + { + "epoch": 2.9375, 
+ "grad_norm": 0.7854097485542297, + "learning_rate": 3.2940745602580544e-05, + "loss": 0.7585, + "step": 16499 + }, + { + "epoch": 2.9376780626780628, + "grad_norm": 0.7470735907554626, + "learning_rate": 3.29303625724132e-05, + "loss": 0.7122, + "step": 16500 + }, + { + "epoch": 2.9378561253561255, + "grad_norm": 0.847463071346283, + "learning_rate": 3.2919980856336075e-05, + "loss": 0.7949, + "step": 16501 + }, + { + "epoch": 2.9380341880341883, + "grad_norm": 0.9595068097114563, + "learning_rate": 3.290960045455257e-05, + "loss": 0.8335, + "step": 16502 + }, + { + "epoch": 2.9382122507122506, + "grad_norm": 0.9636897444725037, + "learning_rate": 3.289922136726609e-05, + "loss": 0.7454, + "step": 16503 + }, + { + "epoch": 2.9383903133903133, + "grad_norm": 0.9030486941337585, + "learning_rate": 3.288884359467993e-05, + "loss": 0.8346, + "step": 16504 + }, + { + "epoch": 2.938568376068376, + "grad_norm": 0.8722931146621704, + "learning_rate": 3.287846713699755e-05, + "loss": 1.0182, + "step": 16505 + }, + { + "epoch": 2.938746438746439, + "grad_norm": 0.7612178325653076, + "learning_rate": 3.286809199442209e-05, + "loss": 0.8759, + "step": 16506 + }, + { + "epoch": 2.9389245014245016, + "grad_norm": 0.951334536075592, + "learning_rate": 3.2857718167156956e-05, + "loss": 0.8035, + "step": 16507 + }, + { + "epoch": 2.939102564102564, + "grad_norm": 0.8043029308319092, + "learning_rate": 3.284734565540536e-05, + "loss": 0.7652, + "step": 16508 + }, + { + "epoch": 2.9392806267806266, + "grad_norm": 0.8762648701667786, + "learning_rate": 3.283697445937053e-05, + "loss": 0.9362, + "step": 16509 + }, + { + "epoch": 2.9394586894586894, + "grad_norm": 0.8046880960464478, + "learning_rate": 3.282660457925566e-05, + "loss": 0.7757, + "step": 16510 + }, + { + "epoch": 2.939636752136752, + "grad_norm": 0.7703250050544739, + "learning_rate": 3.281623601526394e-05, + "loss": 0.6699, + "step": 16511 + }, + { + "epoch": 2.939814814814815, + "grad_norm": 0.9165888428688049, + 
"learning_rate": 3.280586876759847e-05, + "loss": 0.8321, + "step": 16512 + }, + { + "epoch": 2.9399928774928776, + "grad_norm": 0.7389699816703796, + "learning_rate": 3.279550283646249e-05, + "loss": 0.7621, + "step": 16513 + }, + { + "epoch": 2.9401709401709404, + "grad_norm": 0.9256302714347839, + "learning_rate": 3.278513822205897e-05, + "loss": 0.8634, + "step": 16514 + }, + { + "epoch": 2.9403490028490027, + "grad_norm": 0.8384902477264404, + "learning_rate": 3.277477492459109e-05, + "loss": 0.7645, + "step": 16515 + }, + { + "epoch": 2.9405270655270654, + "grad_norm": 0.9446337819099426, + "learning_rate": 3.276441294426178e-05, + "loss": 0.9729, + "step": 16516 + }, + { + "epoch": 2.940705128205128, + "grad_norm": 0.920237123966217, + "learning_rate": 3.275405228127417e-05, + "loss": 0.7834, + "step": 16517 + }, + { + "epoch": 2.940883190883191, + "grad_norm": 0.8432943224906921, + "learning_rate": 3.274369293583121e-05, + "loss": 0.8246, + "step": 16518 + }, + { + "epoch": 2.9410612535612537, + "grad_norm": 0.8046762943267822, + "learning_rate": 3.2733334908135885e-05, + "loss": 0.9363, + "step": 16519 + }, + { + "epoch": 2.941239316239316, + "grad_norm": 0.9555963277816772, + "learning_rate": 3.2722978198391106e-05, + "loss": 0.8699, + "step": 16520 + }, + { + "epoch": 2.9414173789173788, + "grad_norm": 0.8619177937507629, + "learning_rate": 3.2712622806799834e-05, + "loss": 0.8482, + "step": 16521 + }, + { + "epoch": 2.9415954415954415, + "grad_norm": 0.8801655769348145, + "learning_rate": 3.27022687335649e-05, + "loss": 0.9601, + "step": 16522 + }, + { + "epoch": 2.9417735042735043, + "grad_norm": 0.9054547548294067, + "learning_rate": 3.2691915978889244e-05, + "loss": 0.8752, + "step": 16523 + }, + { + "epoch": 2.941951566951567, + "grad_norm": 0.9078481197357178, + "learning_rate": 3.2681564542975675e-05, + "loss": 0.8225, + "step": 16524 + }, + { + "epoch": 2.9421296296296298, + "grad_norm": 0.9574032425880432, + "learning_rate": 
3.267121442602701e-05, + "loss": 0.9133, + "step": 16525 + }, + { + "epoch": 2.9423076923076925, + "grad_norm": 0.8808075189590454, + "learning_rate": 3.2660865628246026e-05, + "loss": 0.8544, + "step": 16526 + }, + { + "epoch": 2.942485754985755, + "grad_norm": 0.8007816076278687, + "learning_rate": 3.26505181498355e-05, + "loss": 0.7846, + "step": 16527 + }, + { + "epoch": 2.9426638176638176, + "grad_norm": 0.8623418211936951, + "learning_rate": 3.264017199099816e-05, + "loss": 0.9056, + "step": 16528 + }, + { + "epoch": 2.9428418803418803, + "grad_norm": 0.870961606502533, + "learning_rate": 3.2629827151936695e-05, + "loss": 0.9883, + "step": 16529 + }, + { + "epoch": 2.943019943019943, + "grad_norm": 0.9122142791748047, + "learning_rate": 3.2619483632853885e-05, + "loss": 0.8012, + "step": 16530 + }, + { + "epoch": 2.943198005698006, + "grad_norm": 0.9072979688644409, + "learning_rate": 3.260914143395225e-05, + "loss": 0.9999, + "step": 16531 + }, + { + "epoch": 2.943376068376068, + "grad_norm": 0.8746095299720764, + "learning_rate": 3.259880055543454e-05, + "loss": 0.8022, + "step": 16532 + }, + { + "epoch": 2.943554131054131, + "grad_norm": 1.0012668371200562, + "learning_rate": 3.2588460997503314e-05, + "loss": 0.8909, + "step": 16533 + }, + { + "epoch": 2.9437321937321936, + "grad_norm": 0.913070023059845, + "learning_rate": 3.2578122760361154e-05, + "loss": 0.7729, + "step": 16534 + }, + { + "epoch": 2.9439102564102564, + "grad_norm": 0.791753888130188, + "learning_rate": 3.2567785844210616e-05, + "loss": 0.9731, + "step": 16535 + }, + { + "epoch": 2.944088319088319, + "grad_norm": 0.9673776030540466, + "learning_rate": 3.255745024925425e-05, + "loss": 0.8008, + "step": 16536 + }, + { + "epoch": 2.944266381766382, + "grad_norm": 0.9382752180099487, + "learning_rate": 3.254711597569454e-05, + "loss": 0.9611, + "step": 16537 + }, + { + "epoch": 2.9444444444444446, + "grad_norm": 0.8816630840301514, + "learning_rate": 3.2536783023733975e-05, + "loss": 0.7565, 
+ "step": 16538 + }, + { + "epoch": 2.9446225071225074, + "grad_norm": 0.8474457859992981, + "learning_rate": 3.2526451393574964e-05, + "loss": 0.7766, + "step": 16539 + }, + { + "epoch": 2.9448005698005697, + "grad_norm": 0.9510074257850647, + "learning_rate": 3.251612108542005e-05, + "loss": 0.9899, + "step": 16540 + }, + { + "epoch": 2.9449786324786325, + "grad_norm": 0.776924192905426, + "learning_rate": 3.250579209947149e-05, + "loss": 0.7845, + "step": 16541 + }, + { + "epoch": 2.945156695156695, + "grad_norm": 0.9543585181236267, + "learning_rate": 3.2495464435931756e-05, + "loss": 1.1313, + "step": 16542 + }, + { + "epoch": 2.945334757834758, + "grad_norm": 0.9087918400764465, + "learning_rate": 3.2485138095003164e-05, + "loss": 0.774, + "step": 16543 + }, + { + "epoch": 2.9455128205128203, + "grad_norm": 0.7719675302505493, + "learning_rate": 3.247481307688801e-05, + "loss": 0.7755, + "step": 16544 + }, + { + "epoch": 2.945690883190883, + "grad_norm": 0.8550716638565063, + "learning_rate": 3.246448938178869e-05, + "loss": 0.8455, + "step": 16545 + }, + { + "epoch": 2.9458689458689458, + "grad_norm": 0.8585572838783264, + "learning_rate": 3.2454167009907346e-05, + "loss": 0.7048, + "step": 16546 + }, + { + "epoch": 2.9460470085470085, + "grad_norm": 0.819939136505127, + "learning_rate": 3.2443845961446315e-05, + "loss": 0.7671, + "step": 16547 + }, + { + "epoch": 2.9462250712250713, + "grad_norm": 0.8811594247817993, + "learning_rate": 3.243352623660778e-05, + "loss": 0.9244, + "step": 16548 + }, + { + "epoch": 2.946403133903134, + "grad_norm": 0.8128607869148254, + "learning_rate": 3.242320783559395e-05, + "loss": 0.8103, + "step": 16549 + }, + { + "epoch": 2.9465811965811968, + "grad_norm": 0.778759241104126, + "learning_rate": 3.2412890758606985e-05, + "loss": 0.6261, + "step": 16550 + }, + { + "epoch": 2.9467592592592595, + "grad_norm": 1.0277715921401978, + "learning_rate": 3.240257500584901e-05, + "loss": 0.8728, + "step": 16551 + }, + { + "epoch": 
2.946937321937322, + "grad_norm": 0.7647507786750793, + "learning_rate": 3.239226057752217e-05, + "loss": 0.7142, + "step": 16552 + }, + { + "epoch": 2.9471153846153846, + "grad_norm": 0.8381546139717102, + "learning_rate": 3.238194747382855e-05, + "loss": 0.737, + "step": 16553 + }, + { + "epoch": 2.9472934472934473, + "grad_norm": 0.7928317189216614, + "learning_rate": 3.237163569497016e-05, + "loss": 0.8926, + "step": 16554 + }, + { + "epoch": 2.94747150997151, + "grad_norm": 0.7086058259010315, + "learning_rate": 3.236132524114914e-05, + "loss": 0.6845, + "step": 16555 + }, + { + "epoch": 2.9476495726495724, + "grad_norm": 0.83002769947052, + "learning_rate": 3.235101611256739e-05, + "loss": 0.7365, + "step": 16556 + }, + { + "epoch": 2.947827635327635, + "grad_norm": 0.9012778401374817, + "learning_rate": 3.234070830942698e-05, + "loss": 0.9999, + "step": 16557 + }, + { + "epoch": 2.948005698005698, + "grad_norm": 0.8554810881614685, + "learning_rate": 3.233040183192985e-05, + "loss": 0.6699, + "step": 16558 + }, + { + "epoch": 2.9481837606837606, + "grad_norm": 0.9322055578231812, + "learning_rate": 3.2320096680277915e-05, + "loss": 0.8969, + "step": 16559 + }, + { + "epoch": 2.9483618233618234, + "grad_norm": 0.8755966424942017, + "learning_rate": 3.2309792854673095e-05, + "loss": 0.8398, + "step": 16560 + }, + { + "epoch": 2.948539886039886, + "grad_norm": 0.8203766345977783, + "learning_rate": 3.229949035531726e-05, + "loss": 0.8354, + "step": 16561 + }, + { + "epoch": 2.948717948717949, + "grad_norm": 0.8970799446105957, + "learning_rate": 3.228918918241229e-05, + "loss": 0.9074, + "step": 16562 + }, + { + "epoch": 2.9488960113960117, + "grad_norm": 0.8263736963272095, + "learning_rate": 3.227888933615997e-05, + "loss": 0.8896, + "step": 16563 + }, + { + "epoch": 2.949074074074074, + "grad_norm": 1.0277043581008911, + "learning_rate": 3.2268590816762155e-05, + "loss": 0.8706, + "step": 16564 + }, + { + "epoch": 2.9492521367521367, + "grad_norm": 
0.8965407013893127, + "learning_rate": 3.225829362442061e-05, + "loss": 0.6796, + "step": 16565 + }, + { + "epoch": 2.9494301994301995, + "grad_norm": 0.8175839185714722, + "learning_rate": 3.224799775933708e-05, + "loss": 0.9243, + "step": 16566 + }, + { + "epoch": 2.949608262108262, + "grad_norm": 0.7979576587677002, + "learning_rate": 3.2237703221713286e-05, + "loss": 0.8008, + "step": 16567 + }, + { + "epoch": 2.9497863247863245, + "grad_norm": 1.054843783378601, + "learning_rate": 3.2227410011750945e-05, + "loss": 1.0279, + "step": 16568 + }, + { + "epoch": 2.9499643874643873, + "grad_norm": 0.7947831749916077, + "learning_rate": 3.221711812965168e-05, + "loss": 0.6767, + "step": 16569 + }, + { + "epoch": 2.95014245014245, + "grad_norm": 0.8702623844146729, + "learning_rate": 3.220682757561725e-05, + "loss": 0.9844, + "step": 16570 + }, + { + "epoch": 2.9503205128205128, + "grad_norm": 0.8653056621551514, + "learning_rate": 3.2196538349849123e-05, + "loss": 1.0747, + "step": 16571 + }, + { + "epoch": 2.9504985754985755, + "grad_norm": 0.9718163013458252, + "learning_rate": 3.2186250452549026e-05, + "loss": 0.7793, + "step": 16572 + }, + { + "epoch": 2.9506766381766383, + "grad_norm": 0.8626788854598999, + "learning_rate": 3.217596388391848e-05, + "loss": 0.9145, + "step": 16573 + }, + { + "epoch": 2.950854700854701, + "grad_norm": 0.9753466844558716, + "learning_rate": 3.2165678644159025e-05, + "loss": 0.9784, + "step": 16574 + }, + { + "epoch": 2.951032763532764, + "grad_norm": 0.795011043548584, + "learning_rate": 3.2155394733472186e-05, + "loss": 0.7457, + "step": 16575 + }, + { + "epoch": 2.951210826210826, + "grad_norm": 0.842991828918457, + "learning_rate": 3.2145112152059454e-05, + "loss": 0.9126, + "step": 16576 + }, + { + "epoch": 2.951388888888889, + "grad_norm": 0.8642476797103882, + "learning_rate": 3.213483090012228e-05, + "loss": 0.7213, + "step": 16577 + }, + { + "epoch": 2.9515669515669516, + "grad_norm": 0.9114034175872803, + "learning_rate": 
3.212455097786214e-05, + "loss": 0.651, + "step": 16578 + }, + { + "epoch": 2.9517450142450143, + "grad_norm": 1.0575958490371704, + "learning_rate": 3.211427238548037e-05, + "loss": 0.9727, + "step": 16579 + }, + { + "epoch": 2.9519230769230766, + "grad_norm": 0.8618924617767334, + "learning_rate": 3.210399512317849e-05, + "loss": 0.8593, + "step": 16580 + }, + { + "epoch": 2.9521011396011394, + "grad_norm": 0.768313467502594, + "learning_rate": 3.209371919115771e-05, + "loss": 0.8305, + "step": 16581 + }, + { + "epoch": 2.952279202279202, + "grad_norm": 0.8160355091094971, + "learning_rate": 3.208344458961947e-05, + "loss": 0.7266, + "step": 16582 + }, + { + "epoch": 2.952457264957265, + "grad_norm": 0.908545196056366, + "learning_rate": 3.207317131876506e-05, + "loss": 0.8022, + "step": 16583 + }, + { + "epoch": 2.9526353276353277, + "grad_norm": 1.2593516111373901, + "learning_rate": 3.206289937879571e-05, + "loss": 1.0274, + "step": 16584 + }, + { + "epoch": 2.9528133903133904, + "grad_norm": 0.7789214849472046, + "learning_rate": 3.2052628769912795e-05, + "loss": 0.8993, + "step": 16585 + }, + { + "epoch": 2.952991452991453, + "grad_norm": 0.8475270867347717, + "learning_rate": 3.20423594923174e-05, + "loss": 1.0145, + "step": 16586 + }, + { + "epoch": 2.953169515669516, + "grad_norm": 1.0083874464035034, + "learning_rate": 3.203209154621086e-05, + "loss": 1.02, + "step": 16587 + }, + { + "epoch": 2.953347578347578, + "grad_norm": 0.7013131380081177, + "learning_rate": 3.2021824931794245e-05, + "loss": 0.5449, + "step": 16588 + }, + { + "epoch": 2.953525641025641, + "grad_norm": 0.8298764824867249, + "learning_rate": 3.201155964926878e-05, + "loss": 0.9914, + "step": 16589 + }, + { + "epoch": 2.9537037037037037, + "grad_norm": 0.9371963143348694, + "learning_rate": 3.200129569883556e-05, + "loss": 0.7349, + "step": 16590 + }, + { + "epoch": 2.9538817663817665, + "grad_norm": 0.7932438850402832, + "learning_rate": 3.199103308069571e-05, + "loss": 0.7911, + 
"step": 16591 + }, + { + "epoch": 2.9540598290598292, + "grad_norm": 0.8415567278862, + "learning_rate": 3.198077179505029e-05, + "loss": 0.766, + "step": 16592 + }, + { + "epoch": 2.9542378917378915, + "grad_norm": 0.9155336618423462, + "learning_rate": 3.197051184210035e-05, + "loss": 0.9622, + "step": 16593 + }, + { + "epoch": 2.9544159544159543, + "grad_norm": 0.8972458839416504, + "learning_rate": 3.196025322204688e-05, + "loss": 1.0202, + "step": 16594 + }, + { + "epoch": 2.954594017094017, + "grad_norm": 0.9620199799537659, + "learning_rate": 3.194999593509096e-05, + "loss": 0.8114, + "step": 16595 + }, + { + "epoch": 2.95477207977208, + "grad_norm": 0.819244921207428, + "learning_rate": 3.1939739981433456e-05, + "loss": 0.7858, + "step": 16596 + }, + { + "epoch": 2.9549501424501425, + "grad_norm": 0.8560270667076111, + "learning_rate": 3.192948536127542e-05, + "loss": 0.9163, + "step": 16597 + }, + { + "epoch": 2.9551282051282053, + "grad_norm": 0.9105932116508484, + "learning_rate": 3.191923207481765e-05, + "loss": 0.7372, + "step": 16598 + }, + { + "epoch": 2.955306267806268, + "grad_norm": 0.888846218585968, + "learning_rate": 3.190898012226114e-05, + "loss": 0.8624, + "step": 16599 + }, + { + "epoch": 2.9554843304843303, + "grad_norm": 0.8116851449012756, + "learning_rate": 3.1898729503806726e-05, + "loss": 0.8549, + "step": 16600 + }, + { + "epoch": 2.955662393162393, + "grad_norm": 0.8970577120780945, + "learning_rate": 3.188848021965522e-05, + "loss": 0.8228, + "step": 16601 + }, + { + "epoch": 2.955840455840456, + "grad_norm": 0.8820711374282837, + "learning_rate": 3.187823227000747e-05, + "loss": 0.7976, + "step": 16602 + }, + { + "epoch": 2.9560185185185186, + "grad_norm": 0.9349139928817749, + "learning_rate": 3.1867985655064205e-05, + "loss": 0.8965, + "step": 16603 + }, + { + "epoch": 2.9561965811965814, + "grad_norm": 0.939132571220398, + "learning_rate": 3.185774037502627e-05, + "loss": 0.9152, + "step": 16604 + }, + { + "epoch": 
2.9563746438746437, + "grad_norm": 1.029159665107727, + "learning_rate": 3.184749643009435e-05, + "loss": 1.0904, + "step": 16605 + }, + { + "epoch": 2.9565527065527064, + "grad_norm": 0.966548502445221, + "learning_rate": 3.183725382046917e-05, + "loss": 0.9508, + "step": 16606 + }, + { + "epoch": 2.956730769230769, + "grad_norm": 0.8782771229743958, + "learning_rate": 3.18270125463514e-05, + "loss": 0.9929, + "step": 16607 + }, + { + "epoch": 2.956908831908832, + "grad_norm": 0.9193231463432312, + "learning_rate": 3.18167726079417e-05, + "loss": 0.9942, + "step": 16608 + }, + { + "epoch": 2.9570868945868947, + "grad_norm": 0.8940062522888184, + "learning_rate": 3.180653400544071e-05, + "loss": 1.0594, + "step": 16609 + }, + { + "epoch": 2.9572649572649574, + "grad_norm": 0.8237268328666687, + "learning_rate": 3.179629673904903e-05, + "loss": 0.8211, + "step": 16610 + }, + { + "epoch": 2.95744301994302, + "grad_norm": 0.92745041847229, + "learning_rate": 3.17860608089672e-05, + "loss": 0.8287, + "step": 16611 + }, + { + "epoch": 2.9576210826210825, + "grad_norm": 0.8203856348991394, + "learning_rate": 3.177582621539586e-05, + "loss": 0.9457, + "step": 16612 + }, + { + "epoch": 2.9577991452991452, + "grad_norm": 0.8719314932823181, + "learning_rate": 3.176559295853543e-05, + "loss": 0.8022, + "step": 16613 + }, + { + "epoch": 2.957977207977208, + "grad_norm": 0.8742199540138245, + "learning_rate": 3.175536103858648e-05, + "loss": 0.976, + "step": 16614 + }, + { + "epoch": 2.9581552706552707, + "grad_norm": 0.8677577972412109, + "learning_rate": 3.174513045574947e-05, + "loss": 0.9513, + "step": 16615 + }, + { + "epoch": 2.9583333333333335, + "grad_norm": 0.809138298034668, + "learning_rate": 3.173490121022485e-05, + "loss": 0.7001, + "step": 16616 + }, + { + "epoch": 2.958511396011396, + "grad_norm": 0.9672527313232422, + "learning_rate": 3.1724673302213025e-05, + "loss": 0.9932, + "step": 16617 + }, + { + "epoch": 2.9586894586894585, + "grad_norm": 
0.8101853728294373, + "learning_rate": 3.17144467319144e-05, + "loss": 0.8236, + "step": 16618 + }, + { + "epoch": 2.9588675213675213, + "grad_norm": 0.8965981006622314, + "learning_rate": 3.170422149952931e-05, + "loss": 0.9353, + "step": 16619 + }, + { + "epoch": 2.959045584045584, + "grad_norm": 0.8267533183097839, + "learning_rate": 3.1693997605258184e-05, + "loss": 0.6445, + "step": 16620 + }, + { + "epoch": 2.959223646723647, + "grad_norm": 0.9084979891777039, + "learning_rate": 3.168377504930122e-05, + "loss": 0.9311, + "step": 16621 + }, + { + "epoch": 2.9594017094017095, + "grad_norm": 0.8414687514305115, + "learning_rate": 3.1673553831858805e-05, + "loss": 0.7663, + "step": 16622 + }, + { + "epoch": 2.9595797720797723, + "grad_norm": 0.8619266748428345, + "learning_rate": 3.166333395313116e-05, + "loss": 0.8524, + "step": 16623 + }, + { + "epoch": 2.9597578347578346, + "grad_norm": 0.9963071346282959, + "learning_rate": 3.1653115413318534e-05, + "loss": 0.813, + "step": 16624 + }, + { + "epoch": 2.9599358974358974, + "grad_norm": 0.8264978528022766, + "learning_rate": 3.164289821262113e-05, + "loss": 0.9716, + "step": 16625 + }, + { + "epoch": 2.96011396011396, + "grad_norm": 0.8166584372520447, + "learning_rate": 3.163268235123911e-05, + "loss": 0.7492, + "step": 16626 + }, + { + "epoch": 2.960292022792023, + "grad_norm": 0.809241533279419, + "learning_rate": 3.1622467829372724e-05, + "loss": 0.9033, + "step": 16627 + }, + { + "epoch": 2.9604700854700856, + "grad_norm": 1.0441617965698242, + "learning_rate": 3.161225464722197e-05, + "loss": 0.7305, + "step": 16628 + }, + { + "epoch": 2.960648148148148, + "grad_norm": 0.9226490259170532, + "learning_rate": 3.160204280498705e-05, + "loss": 0.6896, + "step": 16629 + }, + { + "epoch": 2.9608262108262107, + "grad_norm": 0.7890266180038452, + "learning_rate": 3.159183230286803e-05, + "loss": 0.873, + "step": 16630 + }, + { + "epoch": 2.9610042735042734, + "grad_norm": 0.8491777181625366, + "learning_rate": 
3.1581623141064934e-05, + "loss": 0.8828, + "step": 16631 + }, + { + "epoch": 2.961182336182336, + "grad_norm": 0.7522078156471252, + "learning_rate": 3.157141531977782e-05, + "loss": 0.7717, + "step": 16632 + }, + { + "epoch": 2.961360398860399, + "grad_norm": 0.9260183572769165, + "learning_rate": 3.156120883920667e-05, + "loss": 0.9715, + "step": 16633 + }, + { + "epoch": 2.9615384615384617, + "grad_norm": 0.6620128154754639, + "learning_rate": 3.1551003699551465e-05, + "loss": 0.5624, + "step": 16634 + }, + { + "epoch": 2.9617165242165244, + "grad_norm": 0.7571594715118408, + "learning_rate": 3.154079990101214e-05, + "loss": 0.6911, + "step": 16635 + }, + { + "epoch": 2.9618945868945867, + "grad_norm": 0.8571279644966125, + "learning_rate": 3.153059744378861e-05, + "loss": 0.9057, + "step": 16636 + }, + { + "epoch": 2.9620726495726495, + "grad_norm": 0.8895478248596191, + "learning_rate": 3.152039632808085e-05, + "loss": 0.7097, + "step": 16637 + }, + { + "epoch": 2.9622507122507122, + "grad_norm": 0.8340024352073669, + "learning_rate": 3.15101965540886e-05, + "loss": 0.7719, + "step": 16638 + }, + { + "epoch": 2.962428774928775, + "grad_norm": 0.8263829946517944, + "learning_rate": 3.149999812201182e-05, + "loss": 0.8561, + "step": 16639 + }, + { + "epoch": 2.9626068376068377, + "grad_norm": 0.9083819389343262, + "learning_rate": 3.148980103205027e-05, + "loss": 0.9319, + "step": 16640 + }, + { + "epoch": 2.9627849002849, + "grad_norm": 0.9346078038215637, + "learning_rate": 3.147960528440372e-05, + "loss": 0.8036, + "step": 16641 + }, + { + "epoch": 2.962962962962963, + "grad_norm": 0.907319188117981, + "learning_rate": 3.146941087927203e-05, + "loss": 0.9228, + "step": 16642 + }, + { + "epoch": 2.9631410256410255, + "grad_norm": 0.7912126183509827, + "learning_rate": 3.1459217816854815e-05, + "loss": 0.7996, + "step": 16643 + }, + { + "epoch": 2.9633190883190883, + "grad_norm": 0.8844919800758362, + "learning_rate": 3.1449026097351896e-05, + "loss": 1.0235, 
+ "step": 16644 + }, + { + "epoch": 2.963497150997151, + "grad_norm": 0.7468230128288269, + "learning_rate": 3.143883572096286e-05, + "loss": 0.7292, + "step": 16645 + }, + { + "epoch": 2.963675213675214, + "grad_norm": 0.8521941900253296, + "learning_rate": 3.142864668788744e-05, + "loss": 0.9012, + "step": 16646 + }, + { + "epoch": 2.9638532763532766, + "grad_norm": 0.9340695738792419, + "learning_rate": 3.141845899832524e-05, + "loss": 0.8736, + "step": 16647 + }, + { + "epoch": 2.9640313390313393, + "grad_norm": 0.859395444393158, + "learning_rate": 3.140827265247588e-05, + "loss": 0.796, + "step": 16648 + }, + { + "epoch": 2.9642094017094016, + "grad_norm": 0.8320850729942322, + "learning_rate": 3.139808765053892e-05, + "loss": 0.86, + "step": 16649 + }, + { + "epoch": 2.9643874643874644, + "grad_norm": 0.8911257386207581, + "learning_rate": 3.138790399271393e-05, + "loss": 0.966, + "step": 16650 + }, + { + "epoch": 2.964565527065527, + "grad_norm": 0.8817025423049927, + "learning_rate": 3.13777216792004e-05, + "loss": 0.983, + "step": 16651 + }, + { + "epoch": 2.96474358974359, + "grad_norm": 0.7765538692474365, + "learning_rate": 3.136754071019793e-05, + "loss": 0.643, + "step": 16652 + }, + { + "epoch": 2.964921652421652, + "grad_norm": 0.7961843609809875, + "learning_rate": 3.135736108590586e-05, + "loss": 0.733, + "step": 16653 + }, + { + "epoch": 2.965099715099715, + "grad_norm": 0.7910877466201782, + "learning_rate": 3.134718280652373e-05, + "loss": 0.7291, + "step": 16654 + }, + { + "epoch": 2.9652777777777777, + "grad_norm": 0.9326547384262085, + "learning_rate": 3.1337005872250956e-05, + "loss": 0.8834, + "step": 16655 + }, + { + "epoch": 2.9654558404558404, + "grad_norm": 0.9362995624542236, + "learning_rate": 3.132683028328691e-05, + "loss": 0.9693, + "step": 16656 + }, + { + "epoch": 2.965633903133903, + "grad_norm": 0.8322434425354004, + "learning_rate": 3.131665603983096e-05, + "loss": 0.9782, + "step": 16657 + }, + { + "epoch": 
2.965811965811966, + "grad_norm": 0.8336097598075867, + "learning_rate": 3.130648314208247e-05, + "loss": 0.9995, + "step": 16658 + }, + { + "epoch": 2.9659900284900287, + "grad_norm": 0.8637044429779053, + "learning_rate": 3.129631159024074e-05, + "loss": 0.8291, + "step": 16659 + }, + { + "epoch": 2.9661680911680914, + "grad_norm": 0.7853943109512329, + "learning_rate": 3.128614138450506e-05, + "loss": 0.9016, + "step": 16660 + }, + { + "epoch": 2.9663461538461537, + "grad_norm": 0.9506492614746094, + "learning_rate": 3.1275972525074674e-05, + "loss": 0.9762, + "step": 16661 + }, + { + "epoch": 2.9665242165242165, + "grad_norm": 0.9375113248825073, + "learning_rate": 3.126580501214887e-05, + "loss": 0.8981, + "step": 16662 + }, + { + "epoch": 2.9667022792022792, + "grad_norm": 0.9404717683792114, + "learning_rate": 3.125563884592684e-05, + "loss": 0.7506, + "step": 16663 + }, + { + "epoch": 2.966880341880342, + "grad_norm": 0.9678782820701599, + "learning_rate": 3.124547402660776e-05, + "loss": 0.896, + "step": 16664 + }, + { + "epoch": 2.9670584045584043, + "grad_norm": 0.8530639410018921, + "learning_rate": 3.12353105543908e-05, + "loss": 0.8792, + "step": 16665 + }, + { + "epoch": 2.967236467236467, + "grad_norm": 0.8015365600585938, + "learning_rate": 3.122514842947504e-05, + "loss": 0.7052, + "step": 16666 + }, + { + "epoch": 2.96741452991453, + "grad_norm": 0.8753054141998291, + "learning_rate": 3.121498765205969e-05, + "loss": 0.9032, + "step": 16667 + }, + { + "epoch": 2.9675925925925926, + "grad_norm": 1.0053389072418213, + "learning_rate": 3.1204828222343716e-05, + "loss": 1.0236, + "step": 16668 + }, + { + "epoch": 2.9677706552706553, + "grad_norm": 0.8506449460983276, + "learning_rate": 3.119467014052628e-05, + "loss": 0.6762, + "step": 16669 + }, + { + "epoch": 2.967948717948718, + "grad_norm": 0.9376404881477356, + "learning_rate": 3.118451340680629e-05, + "loss": 0.9391, + "step": 16670 + }, + { + "epoch": 2.968126780626781, + "grad_norm": 
0.8381097912788391, + "learning_rate": 3.117435802138284e-05, + "loss": 0.7343, + "step": 16671 + }, + { + "epoch": 2.9683048433048436, + "grad_norm": 0.8525682091712952, + "learning_rate": 3.116420398445488e-05, + "loss": 0.7928, + "step": 16672 + }, + { + "epoch": 2.968482905982906, + "grad_norm": 0.9399489760398865, + "learning_rate": 3.115405129622133e-05, + "loss": 0.975, + "step": 16673 + }, + { + "epoch": 2.9686609686609686, + "grad_norm": 0.8394346237182617, + "learning_rate": 3.114389995688114e-05, + "loss": 0.7591, + "step": 16674 + }, + { + "epoch": 2.9688390313390314, + "grad_norm": 0.8935068845748901, + "learning_rate": 3.11337499666332e-05, + "loss": 0.8899, + "step": 16675 + }, + { + "epoch": 2.969017094017094, + "grad_norm": 0.8111040592193604, + "learning_rate": 3.112360132567633e-05, + "loss": 0.795, + "step": 16676 + }, + { + "epoch": 2.9691951566951564, + "grad_norm": 0.8854177594184875, + "learning_rate": 3.1113454034209486e-05, + "loss": 1.0677, + "step": 16677 + }, + { + "epoch": 2.969373219373219, + "grad_norm": 0.9821479916572571, + "learning_rate": 3.110330809243134e-05, + "loss": 0.7859, + "step": 16678 + }, + { + "epoch": 2.969551282051282, + "grad_norm": 0.9066275954246521, + "learning_rate": 3.109316350054079e-05, + "loss": 1.1727, + "step": 16679 + }, + { + "epoch": 2.9697293447293447, + "grad_norm": 0.981238603591919, + "learning_rate": 3.108302025873656e-05, + "loss": 1.0036, + "step": 16680 + }, + { + "epoch": 2.9699074074074074, + "grad_norm": 0.8290690779685974, + "learning_rate": 3.107287836721737e-05, + "loss": 0.911, + "step": 16681 + }, + { + "epoch": 2.97008547008547, + "grad_norm": 0.8419190049171448, + "learning_rate": 3.106273782618196e-05, + "loss": 0.688, + "step": 16682 + }, + { + "epoch": 2.970263532763533, + "grad_norm": 0.9250679612159729, + "learning_rate": 3.1052598635828964e-05, + "loss": 0.8506, + "step": 16683 + }, + { + "epoch": 2.9704415954415957, + "grad_norm": 0.9414278864860535, + "learning_rate": 
3.104246079635713e-05, + "loss": 0.8501, + "step": 16684 + }, + { + "epoch": 2.970619658119658, + "grad_norm": 0.9107208847999573, + "learning_rate": 3.1032324307964974e-05, + "loss": 0.8234, + "step": 16685 + }, + { + "epoch": 2.9707977207977208, + "grad_norm": 0.8691245317459106, + "learning_rate": 3.102218917085119e-05, + "loss": 0.9341, + "step": 16686 + }, + { + "epoch": 2.9709757834757835, + "grad_norm": 0.8816282153129578, + "learning_rate": 3.101205538521431e-05, + "loss": 0.9412, + "step": 16687 + }, + { + "epoch": 2.9711538461538463, + "grad_norm": 0.8969736695289612, + "learning_rate": 3.100192295125289e-05, + "loss": 0.9468, + "step": 16688 + }, + { + "epoch": 2.9713319088319086, + "grad_norm": 1.0070735216140747, + "learning_rate": 3.099179186916548e-05, + "loss": 0.8587, + "step": 16689 + }, + { + "epoch": 2.9715099715099713, + "grad_norm": 0.9801154136657715, + "learning_rate": 3.0981662139150537e-05, + "loss": 0.8999, + "step": 16690 + }, + { + "epoch": 2.971688034188034, + "grad_norm": 0.9949473142623901, + "learning_rate": 3.0971533761406526e-05, + "loss": 1.0552, + "step": 16691 + }, + { + "epoch": 2.971866096866097, + "grad_norm": 0.9654440879821777, + "learning_rate": 3.096140673613198e-05, + "loss": 0.8618, + "step": 16692 + }, + { + "epoch": 2.9720441595441596, + "grad_norm": 0.9247317910194397, + "learning_rate": 3.0951281063525185e-05, + "loss": 0.8915, + "step": 16693 + }, + { + "epoch": 2.9722222222222223, + "grad_norm": 1.0254271030426025, + "learning_rate": 3.094115674378467e-05, + "loss": 0.9098, + "step": 16694 + }, + { + "epoch": 2.972400284900285, + "grad_norm": 0.7822396159172058, + "learning_rate": 3.0931033777108666e-05, + "loss": 0.8497, + "step": 16695 + }, + { + "epoch": 2.972578347578348, + "grad_norm": 0.9368909001350403, + "learning_rate": 3.092091216369561e-05, + "loss": 0.8978, + "step": 16696 + }, + { + "epoch": 2.97275641025641, + "grad_norm": 1.0181694030761719, + "learning_rate": 3.0910791903743786e-05, + "loss": 
1.0605, + "step": 16697 + }, + { + "epoch": 2.972934472934473, + "grad_norm": 0.8718426823616028, + "learning_rate": 3.090067299745146e-05, + "loss": 0.9844, + "step": 16698 + }, + { + "epoch": 2.9731125356125356, + "grad_norm": 0.8327153325080872, + "learning_rate": 3.089055544501692e-05, + "loss": 0.8795, + "step": 16699 + }, + { + "epoch": 2.9732905982905984, + "grad_norm": 0.7970293164253235, + "learning_rate": 3.088043924663838e-05, + "loss": 0.794, + "step": 16700 + }, + { + "epoch": 2.9734686609686607, + "grad_norm": 0.8885688185691833, + "learning_rate": 3.0870324402514014e-05, + "loss": 0.984, + "step": 16701 + }, + { + "epoch": 2.9736467236467234, + "grad_norm": 0.9126071333885193, + "learning_rate": 3.086021091284207e-05, + "loss": 0.8962, + "step": 16702 + }, + { + "epoch": 2.973824786324786, + "grad_norm": 0.9220738410949707, + "learning_rate": 3.085009877782067e-05, + "loss": 0.9129, + "step": 16703 + }, + { + "epoch": 2.974002849002849, + "grad_norm": 0.8738197684288025, + "learning_rate": 3.0839987997647935e-05, + "loss": 0.871, + "step": 16704 + }, + { + "epoch": 2.9741809116809117, + "grad_norm": 0.7505079507827759, + "learning_rate": 3.0829878572521964e-05, + "loss": 0.7934, + "step": 16705 + }, + { + "epoch": 2.9743589743589745, + "grad_norm": 0.8328925371170044, + "learning_rate": 3.081977050264084e-05, + "loss": 0.8431, + "step": 16706 + }, + { + "epoch": 2.974537037037037, + "grad_norm": 0.8248029947280884, + "learning_rate": 3.08096637882026e-05, + "loss": 0.7877, + "step": 16707 + }, + { + "epoch": 2.9747150997151, + "grad_norm": 1.1086161136627197, + "learning_rate": 3.079955842940524e-05, + "loss": 1.1196, + "step": 16708 + }, + { + "epoch": 2.9748931623931623, + "grad_norm": 0.8271856307983398, + "learning_rate": 3.078945442644684e-05, + "loss": 0.946, + "step": 16709 + }, + { + "epoch": 2.975071225071225, + "grad_norm": 0.9062432050704956, + "learning_rate": 3.0779351779525246e-05, + "loss": 0.9369, + "step": 16710 + }, + { + "epoch": 
2.9752492877492878, + "grad_norm": 0.9849454760551453, + "learning_rate": 3.0769250488838496e-05, + "loss": 0.7748, + "step": 16711 + }, + { + "epoch": 2.9754273504273505, + "grad_norm": 0.8420920372009277, + "learning_rate": 3.075915055458447e-05, + "loss": 0.9954, + "step": 16712 + }, + { + "epoch": 2.9756054131054133, + "grad_norm": 1.027130126953125, + "learning_rate": 3.074905197696104e-05, + "loss": 1.1053, + "step": 16713 + }, + { + "epoch": 2.9757834757834756, + "grad_norm": 0.8968556523323059, + "learning_rate": 3.073895475616609e-05, + "loss": 0.7487, + "step": 16714 + }, + { + "epoch": 2.9759615384615383, + "grad_norm": 0.9237984418869019, + "learning_rate": 3.072885889239745e-05, + "loss": 1.2853, + "step": 16715 + }, + { + "epoch": 2.976139601139601, + "grad_norm": 0.7972691059112549, + "learning_rate": 3.071876438585292e-05, + "loss": 0.6724, + "step": 16716 + }, + { + "epoch": 2.976317663817664, + "grad_norm": 0.8158389925956726, + "learning_rate": 3.0708671236730266e-05, + "loss": 0.8302, + "step": 16717 + }, + { + "epoch": 2.9764957264957266, + "grad_norm": 0.7947682738304138, + "learning_rate": 3.0698579445227236e-05, + "loss": 0.7817, + "step": 16718 + }, + { + "epoch": 2.9766737891737893, + "grad_norm": 0.847747802734375, + "learning_rate": 3.068848901154166e-05, + "loss": 0.8391, + "step": 16719 + }, + { + "epoch": 2.976851851851852, + "grad_norm": 0.7906867265701294, + "learning_rate": 3.067839993587107e-05, + "loss": 0.8935, + "step": 16720 + }, + { + "epoch": 2.9770299145299144, + "grad_norm": 0.9028356075286865, + "learning_rate": 3.066831221841328e-05, + "loss": 0.821, + "step": 16721 + }, + { + "epoch": 2.977207977207977, + "grad_norm": 0.8177092671394348, + "learning_rate": 3.065822585936589e-05, + "loss": 0.7956, + "step": 16722 + }, + { + "epoch": 2.97738603988604, + "grad_norm": 0.771271824836731, + "learning_rate": 3.064814085892647e-05, + "loss": 0.8675, + "step": 16723 + }, + { + "epoch": 2.9775641025641026, + "grad_norm": 
0.8575150966644287, + "learning_rate": 3.063805721729274e-05, + "loss": 0.6999, + "step": 16724 + }, + { + "epoch": 2.9777421652421654, + "grad_norm": 0.8909936547279358, + "learning_rate": 3.062797493466212e-05, + "loss": 0.9325, + "step": 16725 + }, + { + "epoch": 2.9779202279202277, + "grad_norm": 0.836875319480896, + "learning_rate": 3.061789401123226e-05, + "loss": 0.9276, + "step": 16726 + }, + { + "epoch": 2.9780982905982905, + "grad_norm": 0.8965426683425903, + "learning_rate": 3.060781444720065e-05, + "loss": 0.7728, + "step": 16727 + }, + { + "epoch": 2.978276353276353, + "grad_norm": 0.8298100829124451, + "learning_rate": 3.059773624276475e-05, + "loss": 0.926, + "step": 16728 + }, + { + "epoch": 2.978454415954416, + "grad_norm": 0.9369875192642212, + "learning_rate": 3.058765939812204e-05, + "loss": 0.8598, + "step": 16729 + }, + { + "epoch": 2.9786324786324787, + "grad_norm": 1.019013524055481, + "learning_rate": 3.057758391346995e-05, + "loss": 0.9075, + "step": 16730 + }, + { + "epoch": 2.9788105413105415, + "grad_norm": 0.8597549200057983, + "learning_rate": 3.05675097890059e-05, + "loss": 0.8279, + "step": 16731 + }, + { + "epoch": 2.978988603988604, + "grad_norm": 0.8331323862075806, + "learning_rate": 3.055743702492726e-05, + "loss": 0.8366, + "step": 16732 + }, + { + "epoch": 2.9791666666666665, + "grad_norm": 0.7987640500068665, + "learning_rate": 3.054736562143135e-05, + "loss": 0.7618, + "step": 16733 + }, + { + "epoch": 2.9793447293447293, + "grad_norm": 0.84430330991745, + "learning_rate": 3.0537295578715606e-05, + "loss": 0.9791, + "step": 16734 + }, + { + "epoch": 2.979522792022792, + "grad_norm": 0.7811259627342224, + "learning_rate": 3.052722689697719e-05, + "loss": 0.7342, + "step": 16735 + }, + { + "epoch": 2.9797008547008548, + "grad_norm": 0.8311488032341003, + "learning_rate": 3.0517159576413477e-05, + "loss": 0.64, + "step": 16736 + }, + { + "epoch": 2.9798789173789175, + "grad_norm": 0.8865106105804443, + "learning_rate": 
3.0507093617221683e-05, + "loss": 0.9196, + "step": 16737 + }, + { + "epoch": 2.98005698005698, + "grad_norm": 0.7360345125198364, + "learning_rate": 3.0497029019599033e-05, + "loss": 0.8709, + "step": 16738 + }, + { + "epoch": 2.9802350427350426, + "grad_norm": 0.8705546855926514, + "learning_rate": 3.0486965783742726e-05, + "loss": 0.7567, + "step": 16739 + }, + { + "epoch": 2.9804131054131053, + "grad_norm": 0.8622585535049438, + "learning_rate": 3.0476903909849908e-05, + "loss": 0.7909, + "step": 16740 + }, + { + "epoch": 2.980591168091168, + "grad_norm": 0.7877208590507507, + "learning_rate": 3.046684339811775e-05, + "loss": 0.7046, + "step": 16741 + }, + { + "epoch": 2.980769230769231, + "grad_norm": 0.915133535861969, + "learning_rate": 3.045678424874332e-05, + "loss": 0.9142, + "step": 16742 + }, + { + "epoch": 2.9809472934472936, + "grad_norm": 0.8539748191833496, + "learning_rate": 3.044672646192377e-05, + "loss": 0.9333, + "step": 16743 + }, + { + "epoch": 2.9811253561253563, + "grad_norm": 1.1402504444122314, + "learning_rate": 3.043667003785612e-05, + "loss": 0.8041, + "step": 16744 + }, + { + "epoch": 2.9813034188034186, + "grad_norm": 0.7355749607086182, + "learning_rate": 3.042661497673742e-05, + "loss": 0.8057, + "step": 16745 + }, + { + "epoch": 2.9814814814814814, + "grad_norm": 0.8524863719940186, + "learning_rate": 3.041656127876469e-05, + "loss": 0.7118, + "step": 16746 + }, + { + "epoch": 2.981659544159544, + "grad_norm": 0.9434519410133362, + "learning_rate": 3.040650894413487e-05, + "loss": 0.8104, + "step": 16747 + }, + { + "epoch": 2.981837606837607, + "grad_norm": 0.8716524839401245, + "learning_rate": 3.0396457973044923e-05, + "loss": 0.7502, + "step": 16748 + }, + { + "epoch": 2.9820156695156697, + "grad_norm": 0.942481279373169, + "learning_rate": 3.0386408365691855e-05, + "loss": 0.8506, + "step": 16749 + }, + { + "epoch": 2.982193732193732, + "grad_norm": 0.7921879291534424, + "learning_rate": 3.037636012227244e-05, + "loss": 
0.7005, + "step": 16750 + }, + { + "epoch": 2.9823717948717947, + "grad_norm": 0.8415821194648743, + "learning_rate": 3.0366313242983645e-05, + "loss": 0.987, + "step": 16751 + }, + { + "epoch": 2.9825498575498575, + "grad_norm": 0.9556678533554077, + "learning_rate": 3.0356267728022293e-05, + "loss": 0.9198, + "step": 16752 + }, + { + "epoch": 2.98272792022792, + "grad_norm": 0.856810450553894, + "learning_rate": 3.0346223577585208e-05, + "loss": 0.9634, + "step": 16753 + }, + { + "epoch": 2.982905982905983, + "grad_norm": 0.8425500392913818, + "learning_rate": 3.0336180791869172e-05, + "loss": 0.8725, + "step": 16754 + }, + { + "epoch": 2.9830840455840457, + "grad_norm": 0.773210883140564, + "learning_rate": 3.032613937107096e-05, + "loss": 0.8363, + "step": 16755 + }, + { + "epoch": 2.9832621082621085, + "grad_norm": 0.7771822214126587, + "learning_rate": 3.0316099315387324e-05, + "loss": 0.8586, + "step": 16756 + }, + { + "epoch": 2.9834401709401708, + "grad_norm": 0.8261004686355591, + "learning_rate": 3.0306060625014954e-05, + "loss": 0.8185, + "step": 16757 + }, + { + "epoch": 2.9836182336182335, + "grad_norm": 1.074456810951233, + "learning_rate": 3.029602330015052e-05, + "loss": 0.9008, + "step": 16758 + }, + { + "epoch": 2.9837962962962963, + "grad_norm": 0.8160609006881714, + "learning_rate": 3.0285987340990774e-05, + "loss": 0.8494, + "step": 16759 + }, + { + "epoch": 2.983974358974359, + "grad_norm": 0.8296660780906677, + "learning_rate": 3.0275952747732227e-05, + "loss": 0.9089, + "step": 16760 + }, + { + "epoch": 2.984152421652422, + "grad_norm": 0.815726637840271, + "learning_rate": 3.0265919520571572e-05, + "loss": 0.8605, + "step": 16761 + }, + { + "epoch": 2.984330484330484, + "grad_norm": 0.7600420117378235, + "learning_rate": 3.0255887659705374e-05, + "loss": 0.6833, + "step": 16762 + }, + { + "epoch": 2.984508547008547, + "grad_norm": 0.8039379119873047, + "learning_rate": 3.024585716533014e-05, + "loss": 0.8156, + "step": 16763 + }, + { + 
"epoch": 2.9846866096866096, + "grad_norm": 0.8797104358673096, + "learning_rate": 3.02358280376425e-05, + "loss": 0.9965, + "step": 16764 + }, + { + "epoch": 2.9848646723646723, + "grad_norm": 0.8681818246841431, + "learning_rate": 3.0225800276838822e-05, + "loss": 0.8967, + "step": 16765 + }, + { + "epoch": 2.985042735042735, + "grad_norm": 0.804710328578949, + "learning_rate": 3.0215773883115706e-05, + "loss": 0.7424, + "step": 16766 + }, + { + "epoch": 2.985220797720798, + "grad_norm": 0.7346279621124268, + "learning_rate": 3.0205748856669467e-05, + "loss": 0.6986, + "step": 16767 + }, + { + "epoch": 2.9853988603988606, + "grad_norm": 0.8896076083183289, + "learning_rate": 3.0195725197696634e-05, + "loss": 0.8472, + "step": 16768 + }, + { + "epoch": 2.9855769230769234, + "grad_norm": 0.8281870484352112, + "learning_rate": 3.0185702906393555e-05, + "loss": 0.9873, + "step": 16769 + }, + { + "epoch": 2.9857549857549857, + "grad_norm": 0.7376837134361267, + "learning_rate": 3.0175681982956607e-05, + "loss": 1.0171, + "step": 16770 + }, + { + "epoch": 2.9859330484330484, + "grad_norm": 0.9346923828125, + "learning_rate": 3.016566242758212e-05, + "loss": 0.8742, + "step": 16771 + }, + { + "epoch": 2.986111111111111, + "grad_norm": 0.918557345867157, + "learning_rate": 3.0155644240466406e-05, + "loss": 1.0504, + "step": 16772 + }, + { + "epoch": 2.986289173789174, + "grad_norm": 0.8885583281517029, + "learning_rate": 3.0145627421805733e-05, + "loss": 0.7462, + "step": 16773 + }, + { + "epoch": 2.986467236467236, + "grad_norm": 0.8475548028945923, + "learning_rate": 3.013561197179644e-05, + "loss": 0.7353, + "step": 16774 + }, + { + "epoch": 2.986645299145299, + "grad_norm": 0.8925435543060303, + "learning_rate": 3.0125597890634626e-05, + "loss": 0.8462, + "step": 16775 + }, + { + "epoch": 2.9868233618233617, + "grad_norm": 0.923281729221344, + "learning_rate": 3.0115585178516648e-05, + "loss": 0.9725, + "step": 16776 + }, + { + "epoch": 2.9870014245014245, + 
"grad_norm": 0.9134986996650696, + "learning_rate": 3.0105573835638533e-05, + "loss": 0.9428, + "step": 16777 + }, + { + "epoch": 2.9871794871794872, + "grad_norm": 0.8284963369369507, + "learning_rate": 3.009556386219654e-05, + "loss": 0.9134, + "step": 16778 + }, + { + "epoch": 2.98735754985755, + "grad_norm": 0.9571327567100525, + "learning_rate": 3.0085555258386756e-05, + "loss": 0.8407, + "step": 16779 + }, + { + "epoch": 2.9875356125356127, + "grad_norm": 0.8699741363525391, + "learning_rate": 3.0075548024405254e-05, + "loss": 0.862, + "step": 16780 + }, + { + "epoch": 2.9877136752136755, + "grad_norm": 0.9462196826934814, + "learning_rate": 3.006554216044819e-05, + "loss": 0.8936, + "step": 16781 + }, + { + "epoch": 2.987891737891738, + "grad_norm": 0.9415904879570007, + "learning_rate": 3.0055537666711496e-05, + "loss": 1.0089, + "step": 16782 + }, + { + "epoch": 2.9880698005698005, + "grad_norm": 0.8529258370399475, + "learning_rate": 3.0045534543391275e-05, + "loss": 0.7896, + "step": 16783 + }, + { + "epoch": 2.9882478632478633, + "grad_norm": 0.7967036962509155, + "learning_rate": 3.0035532790683486e-05, + "loss": 0.7661, + "step": 16784 + }, + { + "epoch": 2.988425925925926, + "grad_norm": 1.087546944618225, + "learning_rate": 3.0025532408784097e-05, + "loss": 0.7232, + "step": 16785 + }, + { + "epoch": 2.9886039886039883, + "grad_norm": 1.0032312870025635, + "learning_rate": 3.001553339788903e-05, + "loss": 0.8462, + "step": 16786 + }, + { + "epoch": 2.988782051282051, + "grad_norm": 0.7726916074752808, + "learning_rate": 3.0005535758194216e-05, + "loss": 0.8318, + "step": 16787 + }, + { + "epoch": 2.988960113960114, + "grad_norm": 0.8814677000045776, + "learning_rate": 2.999553948989552e-05, + "loss": 0.8058, + "step": 16788 + }, + { + "epoch": 2.9891381766381766, + "grad_norm": 0.8866276741027832, + "learning_rate": 2.9985544593188818e-05, + "loss": 0.8576, + "step": 16789 + }, + { + "epoch": 2.9893162393162394, + "grad_norm": 0.9877329468727112, + 
"learning_rate": 2.997555106826988e-05, + "loss": 0.8675, + "step": 16790 + }, + { + "epoch": 2.989494301994302, + "grad_norm": 0.842682421207428, + "learning_rate": 2.9965558915334636e-05, + "loss": 1.094, + "step": 16791 + }, + { + "epoch": 2.989672364672365, + "grad_norm": 0.8740942478179932, + "learning_rate": 2.9955568134578703e-05, + "loss": 0.904, + "step": 16792 + }, + { + "epoch": 2.9898504273504276, + "grad_norm": 0.9166956543922424, + "learning_rate": 2.9945578726197944e-05, + "loss": 1.0695, + "step": 16793 + }, + { + "epoch": 2.99002849002849, + "grad_norm": 0.7124375104904175, + "learning_rate": 2.993559069038805e-05, + "loss": 0.6606, + "step": 16794 + }, + { + "epoch": 2.9902065527065527, + "grad_norm": 0.8711931109428406, + "learning_rate": 2.99256040273447e-05, + "loss": 0.9049, + "step": 16795 + }, + { + "epoch": 2.9903846153846154, + "grad_norm": 0.8475961089134216, + "learning_rate": 2.9915618737263584e-05, + "loss": 0.8487, + "step": 16796 + }, + { + "epoch": 2.990562678062678, + "grad_norm": 0.8962427377700806, + "learning_rate": 2.9905634820340324e-05, + "loss": 0.9318, + "step": 16797 + }, + { + "epoch": 2.9907407407407405, + "grad_norm": 0.9120275378227234, + "learning_rate": 2.9895652276770514e-05, + "loss": 0.8787, + "step": 16798 + }, + { + "epoch": 2.9909188034188032, + "grad_norm": 0.8273578882217407, + "learning_rate": 2.9885671106749822e-05, + "loss": 0.8566, + "step": 16799 + }, + { + "epoch": 2.991096866096866, + "grad_norm": 0.8050937056541443, + "learning_rate": 2.9875691310473697e-05, + "loss": 1.1276, + "step": 16800 + }, + { + "epoch": 2.9912749287749287, + "grad_norm": 0.8434747457504272, + "learning_rate": 2.9865712888137766e-05, + "loss": 0.6286, + "step": 16801 + }, + { + "epoch": 2.9914529914529915, + "grad_norm": 0.7851191759109497, + "learning_rate": 2.9855735839937493e-05, + "loss": 0.8116, + "step": 16802 + }, + { + "epoch": 2.9916310541310542, + "grad_norm": 0.8858240842819214, + "learning_rate": 
2.984576016606837e-05, + "loss": 0.8663, + "step": 16803 + }, + { + "epoch": 2.991809116809117, + "grad_norm": 0.8688439726829529, + "learning_rate": 2.9835785866725842e-05, + "loss": 0.9229, + "step": 16804 + }, + { + "epoch": 2.9919871794871797, + "grad_norm": 0.7873746752738953, + "learning_rate": 2.9825812942105302e-05, + "loss": 0.779, + "step": 16805 + }, + { + "epoch": 2.992165242165242, + "grad_norm": 0.9577028751373291, + "learning_rate": 2.9815841392402255e-05, + "loss": 0.7068, + "step": 16806 + }, + { + "epoch": 2.992343304843305, + "grad_norm": 0.8219005465507507, + "learning_rate": 2.980587121781193e-05, + "loss": 0.7712, + "step": 16807 + }, + { + "epoch": 2.9925213675213675, + "grad_norm": 0.9321123957633972, + "learning_rate": 2.9795902418529776e-05, + "loss": 0.9309, + "step": 16808 + }, + { + "epoch": 2.9926994301994303, + "grad_norm": 0.8010317087173462, + "learning_rate": 2.9785934994751074e-05, + "loss": 0.8973, + "step": 16809 + }, + { + "epoch": 2.9928774928774926, + "grad_norm": 0.8819838762283325, + "learning_rate": 2.9775968946671117e-05, + "loss": 0.6899, + "step": 16810 + }, + { + "epoch": 2.9930555555555554, + "grad_norm": 0.8337511420249939, + "learning_rate": 2.976600427448518e-05, + "loss": 0.8005, + "step": 16811 + }, + { + "epoch": 2.993233618233618, + "grad_norm": 1.0560593605041504, + "learning_rate": 2.975604097838849e-05, + "loss": 0.8804, + "step": 16812 + }, + { + "epoch": 2.993411680911681, + "grad_norm": 0.8482444882392883, + "learning_rate": 2.9746079058576247e-05, + "loss": 0.8606, + "step": 16813 + }, + { + "epoch": 2.9935897435897436, + "grad_norm": 0.8289902210235596, + "learning_rate": 2.973611851524366e-05, + "loss": 0.8163, + "step": 16814 + }, + { + "epoch": 2.9937678062678064, + "grad_norm": 0.780939519405365, + "learning_rate": 2.9726159348585826e-05, + "loss": 0.7041, + "step": 16815 + }, + { + "epoch": 2.993945868945869, + "grad_norm": 0.8907291889190674, + "learning_rate": 2.9716201558797984e-05, + "loss": 
0.6484, + "step": 16816 + }, + { + "epoch": 2.994123931623932, + "grad_norm": 0.9351929426193237, + "learning_rate": 2.9706245146075118e-05, + "loss": 0.7232, + "step": 16817 + }, + { + "epoch": 2.994301994301994, + "grad_norm": 0.934117317199707, + "learning_rate": 2.969629011061238e-05, + "loss": 0.9189, + "step": 16818 + }, + { + "epoch": 2.994480056980057, + "grad_norm": 0.8751780390739441, + "learning_rate": 2.968633645260479e-05, + "loss": 0.9695, + "step": 16819 + }, + { + "epoch": 2.9946581196581197, + "grad_norm": 0.921024739742279, + "learning_rate": 2.967638417224734e-05, + "loss": 0.9011, + "step": 16820 + }, + { + "epoch": 2.9948361823361824, + "grad_norm": 0.7881745100021362, + "learning_rate": 2.9666433269735126e-05, + "loss": 0.8001, + "step": 16821 + }, + { + "epoch": 2.995014245014245, + "grad_norm": 0.8107823729515076, + "learning_rate": 2.9656483745262985e-05, + "loss": 0.9239, + "step": 16822 + }, + { + "epoch": 2.9951923076923075, + "grad_norm": 0.8491915464401245, + "learning_rate": 2.964653559902595e-05, + "loss": 0.8193, + "step": 16823 + }, + { + "epoch": 2.9953703703703702, + "grad_norm": 0.9521864056587219, + "learning_rate": 2.9636588831218904e-05, + "loss": 1.0745, + "step": 16824 + }, + { + "epoch": 2.995548433048433, + "grad_norm": 0.9784126281738281, + "learning_rate": 2.9626643442036727e-05, + "loss": 0.9899, + "step": 16825 + }, + { + "epoch": 2.9957264957264957, + "grad_norm": 0.8035039305686951, + "learning_rate": 2.961669943167429e-05, + "loss": 0.9172, + "step": 16826 + }, + { + "epoch": 2.9959045584045585, + "grad_norm": 0.9292920827865601, + "learning_rate": 2.9606756800326408e-05, + "loss": 0.9228, + "step": 16827 + }, + { + "epoch": 2.9960826210826212, + "grad_norm": 0.9243139028549194, + "learning_rate": 2.9596815548187908e-05, + "loss": 0.8698, + "step": 16828 + }, + { + "epoch": 2.996260683760684, + "grad_norm": 0.9847014546394348, + "learning_rate": 2.958687567545355e-05, + "loss": 0.8885, + "step": 16829 + }, + { + 
"epoch": 2.9964387464387463, + "grad_norm": 0.9667131900787354, + "learning_rate": 2.9576937182318078e-05, + "loss": 0.7951, + "step": 16830 + }, + { + "epoch": 2.996616809116809, + "grad_norm": 0.8431822061538696, + "learning_rate": 2.956700006897628e-05, + "loss": 1.0645, + "step": 16831 + }, + { + "epoch": 2.996794871794872, + "grad_norm": 0.7381225228309631, + "learning_rate": 2.955706433562274e-05, + "loss": 0.5631, + "step": 16832 + }, + { + "epoch": 2.9969729344729346, + "grad_norm": 0.7975440621376038, + "learning_rate": 2.9547129982452228e-05, + "loss": 0.8324, + "step": 16833 + }, + { + "epoch": 2.9971509971509973, + "grad_norm": 0.9004024863243103, + "learning_rate": 2.9537197009659344e-05, + "loss": 1.045, + "step": 16834 + }, + { + "epoch": 2.9973290598290596, + "grad_norm": 0.786651074886322, + "learning_rate": 2.952726541743871e-05, + "loss": 0.7569, + "step": 16835 + }, + { + "epoch": 2.9975071225071224, + "grad_norm": 0.8053869605064392, + "learning_rate": 2.951733520598492e-05, + "loss": 0.8327, + "step": 16836 + }, + { + "epoch": 2.997685185185185, + "grad_norm": 0.8725607991218567, + "learning_rate": 2.9507406375492543e-05, + "loss": 0.7702, + "step": 16837 + }, + { + "epoch": 2.997863247863248, + "grad_norm": 0.9187145233154297, + "learning_rate": 2.9497478926156087e-05, + "loss": 0.8946, + "step": 16838 + }, + { + "epoch": 2.9980413105413106, + "grad_norm": 0.9324785470962524, + "learning_rate": 2.9487552858170076e-05, + "loss": 0.8669, + "step": 16839 + }, + { + "epoch": 2.9982193732193734, + "grad_norm": 0.8439409136772156, + "learning_rate": 2.9477628171728964e-05, + "loss": 0.7187, + "step": 16840 + }, + { + "epoch": 2.998397435897436, + "grad_norm": 0.7890669703483582, + "learning_rate": 2.9467704867027258e-05, + "loss": 0.7856, + "step": 16841 + }, + { + "epoch": 2.9985754985754984, + "grad_norm": 0.7931693196296692, + "learning_rate": 2.9457782944259362e-05, + "loss": 0.7212, + "step": 16842 + }, + { + "epoch": 2.998753561253561, + 
"grad_norm": 0.8256354331970215, + "learning_rate": 2.9447862403619665e-05, + "loss": 0.8255, + "step": 16843 + }, + { + "epoch": 2.998931623931624, + "grad_norm": 0.7902492880821228, + "learning_rate": 2.9437943245302547e-05, + "loss": 0.8298, + "step": 16844 + }, + { + "epoch": 2.9991096866096867, + "grad_norm": 1.0743845701217651, + "learning_rate": 2.9428025469502318e-05, + "loss": 0.92, + "step": 16845 + }, + { + "epoch": 2.9992877492877494, + "grad_norm": 0.9262487292289734, + "learning_rate": 2.9418109076413404e-05, + "loss": 0.9108, + "step": 16846 + }, + { + "epoch": 2.9994658119658117, + "grad_norm": 0.855722963809967, + "learning_rate": 2.9408194066229944e-05, + "loss": 0.8596, + "step": 16847 + }, + { + "epoch": 2.9996438746438745, + "grad_norm": 0.8279969692230225, + "learning_rate": 2.939828043914634e-05, + "loss": 0.8913, + "step": 16848 + }, + { + "epoch": 2.9996438746438745, + "eval_loss": 1.1308976411819458, + "eval_runtime": 24.7916, + "eval_samples_per_second": 41.99, + "eval_steps_per_second": 21.015, + "step": 16848 + }, + { + "epoch": 2.9998219373219372, + "grad_norm": 0.8992627263069153, + "learning_rate": 2.9388368195356718e-05, + "loss": 1.0347, + "step": 16849 + }, + { + "epoch": 3.0, + "grad_norm": 0.8295336961746216, + "learning_rate": 2.9378457335055364e-05, + "loss": 0.797, + "step": 16850 + }, + { + "epoch": 3.0001780626780628, + "grad_norm": 0.9085283875465393, + "learning_rate": 2.9368547858436434e-05, + "loss": 0.7888, + "step": 16851 + }, + { + "epoch": 3.0001780626780628, + "grad_norm": 0.7804644107818604, + "learning_rate": 2.9358639765694085e-05, + "loss": 0.6619, + "step": 16852 + }, + { + "epoch": 3.0003561253561255, + "grad_norm": 0.7937109470367432, + "learning_rate": 2.9348733057022447e-05, + "loss": 0.6769, + "step": 16853 + }, + { + "epoch": 3.0005341880341883, + "grad_norm": 0.8836156725883484, + "learning_rate": 2.9338827732615627e-05, + "loss": 0.8822, + "step": 16854 + }, + { + "epoch": 3.0007122507122506, + 
"grad_norm": 0.7436475157737732, + "learning_rate": 2.932892379266765e-05, + "loss": 0.8601, + "step": 16855 + }, + { + "epoch": 3.0008903133903133, + "grad_norm": 0.825633704662323, + "learning_rate": 2.9319021237372668e-05, + "loss": 0.8905, + "step": 16856 + }, + { + "epoch": 3.001068376068376, + "grad_norm": 0.7219718098640442, + "learning_rate": 2.930912006692458e-05, + "loss": 0.8757, + "step": 16857 + }, + { + "epoch": 3.001246438746439, + "grad_norm": 0.8130925893783569, + "learning_rate": 2.929922028151746e-05, + "loss": 0.4952, + "step": 16858 + }, + { + "epoch": 3.0014245014245016, + "grad_norm": 0.6775509119033813, + "learning_rate": 2.9289321881345254e-05, + "loss": 0.5514, + "step": 16859 + }, + { + "epoch": 3.0016025641025643, + "grad_norm": 0.7436477541923523, + "learning_rate": 2.92794248666019e-05, + "loss": 0.9079, + "step": 16860 + }, + { + "epoch": 3.0017806267806266, + "grad_norm": 0.7549178600311279, + "learning_rate": 2.9269529237481307e-05, + "loss": 0.7723, + "step": 16861 + }, + { + "epoch": 3.0019586894586894, + "grad_norm": 0.8569880127906799, + "learning_rate": 2.9259634994177322e-05, + "loss": 0.8593, + "step": 16862 + }, + { + "epoch": 3.002136752136752, + "grad_norm": 0.7354894280433655, + "learning_rate": 2.9249742136883906e-05, + "loss": 0.7426, + "step": 16863 + }, + { + "epoch": 3.002314814814815, + "grad_norm": 0.8491479158401489, + "learning_rate": 2.9239850665794755e-05, + "loss": 0.8501, + "step": 16864 + }, + { + "epoch": 3.0024928774928776, + "grad_norm": 0.7588241696357727, + "learning_rate": 2.9229960581103776e-05, + "loss": 0.8064, + "step": 16865 + }, + { + "epoch": 3.0026709401709404, + "grad_norm": 0.7409141659736633, + "learning_rate": 2.9220071883004708e-05, + "loss": 0.7756, + "step": 16866 + }, + { + "epoch": 3.0028490028490027, + "grad_norm": 0.8201326131820679, + "learning_rate": 2.921018457169129e-05, + "loss": 0.9796, + "step": 16867 + }, + { + "epoch": 3.0030270655270654, + "grad_norm": 0.7816381454467773, + 
"learning_rate": 2.920029864735726e-05, + "loss": 0.8362, + "step": 16868 + }, + { + "epoch": 3.003205128205128, + "grad_norm": 0.9872122406959534, + "learning_rate": 2.9190414110196295e-05, + "loss": 0.9468, + "step": 16869 + }, + { + "epoch": 3.003383190883191, + "grad_norm": 0.9007967114448547, + "learning_rate": 2.918053096040205e-05, + "loss": 0.7311, + "step": 16870 + }, + { + "epoch": 3.0035612535612537, + "grad_norm": 0.8848899006843567, + "learning_rate": 2.9170649198168243e-05, + "loss": 0.6342, + "step": 16871 + }, + { + "epoch": 3.0037393162393164, + "grad_norm": 0.6995126605033875, + "learning_rate": 2.9160768823688377e-05, + "loss": 0.6895, + "step": 16872 + }, + { + "epoch": 3.0039173789173788, + "grad_norm": 0.7542599439620972, + "learning_rate": 2.9150889837156135e-05, + "loss": 0.7411, + "step": 16873 + }, + { + "epoch": 3.0040954415954415, + "grad_norm": 0.8360390663146973, + "learning_rate": 2.9141012238764985e-05, + "loss": 0.7273, + "step": 16874 + }, + { + "epoch": 3.0042735042735043, + "grad_norm": 0.8042497038841248, + "learning_rate": 2.9131136028708526e-05, + "loss": 0.6749, + "step": 16875 + }, + { + "epoch": 3.004451566951567, + "grad_norm": 0.7112155556678772, + "learning_rate": 2.912126120718024e-05, + "loss": 0.7799, + "step": 16876 + }, + { + "epoch": 3.0046296296296298, + "grad_norm": 0.7784724831581116, + "learning_rate": 2.9111387774373612e-05, + "loss": 0.7181, + "step": 16877 + }, + { + "epoch": 3.0048076923076925, + "grad_norm": 0.8795644044876099, + "learning_rate": 2.9101515730482064e-05, + "loss": 0.8351, + "step": 16878 + }, + { + "epoch": 3.004985754985755, + "grad_norm": 0.7584426999092102, + "learning_rate": 2.9091645075699014e-05, + "loss": 0.7506, + "step": 16879 + }, + { + "epoch": 3.0051638176638176, + "grad_norm": 0.881862461566925, + "learning_rate": 2.9081775810217915e-05, + "loss": 0.7594, + "step": 16880 + }, + { + "epoch": 3.0053418803418803, + "grad_norm": 0.8785306215286255, + "learning_rate": 
2.90719079342321e-05, + "loss": 0.701, + "step": 16881 + }, + { + "epoch": 3.005519943019943, + "grad_norm": 0.749655544757843, + "learning_rate": 2.9062041447934908e-05, + "loss": 0.7694, + "step": 16882 + }, + { + "epoch": 3.005698005698006, + "grad_norm": 0.8743281364440918, + "learning_rate": 2.9052176351519646e-05, + "loss": 0.9534, + "step": 16883 + }, + { + "epoch": 3.0058760683760686, + "grad_norm": 0.7728061079978943, + "learning_rate": 2.9042312645179613e-05, + "loss": 0.6966, + "step": 16884 + }, + { + "epoch": 3.006054131054131, + "grad_norm": 0.8588292002677917, + "learning_rate": 2.9032450329108052e-05, + "loss": 0.6174, + "step": 16885 + }, + { + "epoch": 3.0062321937321936, + "grad_norm": 0.8733890056610107, + "learning_rate": 2.9022589403498214e-05, + "loss": 0.5899, + "step": 16886 + }, + { + "epoch": 3.0064102564102564, + "grad_norm": 0.8324486613273621, + "learning_rate": 2.9012729868543264e-05, + "loss": 0.7532, + "step": 16887 + }, + { + "epoch": 3.006588319088319, + "grad_norm": 0.7193008661270142, + "learning_rate": 2.9002871724436465e-05, + "loss": 0.6136, + "step": 16888 + }, + { + "epoch": 3.006766381766382, + "grad_norm": 0.7795392870903015, + "learning_rate": 2.899301497137086e-05, + "loss": 0.8326, + "step": 16889 + }, + { + "epoch": 3.0069444444444446, + "grad_norm": 0.7805325984954834, + "learning_rate": 2.8983159609539635e-05, + "loss": 0.6532, + "step": 16890 + }, + { + "epoch": 3.007122507122507, + "grad_norm": 0.78617262840271, + "learning_rate": 2.8973305639135883e-05, + "loss": 0.5964, + "step": 16891 + }, + { + "epoch": 3.0073005698005697, + "grad_norm": 0.8331096172332764, + "learning_rate": 2.8963453060352662e-05, + "loss": 0.7982, + "step": 16892 + }, + { + "epoch": 3.0074786324786325, + "grad_norm": 0.8765615820884705, + "learning_rate": 2.8953601873383017e-05, + "loss": 0.8202, + "step": 16893 + }, + { + "epoch": 3.007656695156695, + "grad_norm": 0.8578206896781921, + "learning_rate": 2.894375207841995e-05, + "loss": 
0.5168, + "step": 16894 + }, + { + "epoch": 3.007834757834758, + "grad_norm": 0.8777512311935425, + "learning_rate": 2.893390367565646e-05, + "loss": 0.7154, + "step": 16895 + }, + { + "epoch": 3.0080128205128207, + "grad_norm": 0.8647478222846985, + "learning_rate": 2.8924056665285494e-05, + "loss": 0.6488, + "step": 16896 + }, + { + "epoch": 3.008190883190883, + "grad_norm": 0.7841095924377441, + "learning_rate": 2.8914211047499963e-05, + "loss": 0.8219, + "step": 16897 + }, + { + "epoch": 3.0083689458689458, + "grad_norm": 0.9699451923370361, + "learning_rate": 2.8904366822492855e-05, + "loss": 0.6815, + "step": 16898 + }, + { + "epoch": 3.0085470085470085, + "grad_norm": 0.8264166116714478, + "learning_rate": 2.8894523990456946e-05, + "loss": 0.8515, + "step": 16899 + }, + { + "epoch": 3.0087250712250713, + "grad_norm": 0.8300934433937073, + "learning_rate": 2.888468255158514e-05, + "loss": 0.6219, + "step": 16900 + }, + { + "epoch": 3.008903133903134, + "grad_norm": 0.8727999925613403, + "learning_rate": 2.8874842506070265e-05, + "loss": 0.8451, + "step": 16901 + }, + { + "epoch": 3.0090811965811968, + "grad_norm": 0.8788085579872131, + "learning_rate": 2.8865003854105066e-05, + "loss": 0.8576, + "step": 16902 + }, + { + "epoch": 3.009259259259259, + "grad_norm": 0.8885243535041809, + "learning_rate": 2.885516659588241e-05, + "loss": 0.7011, + "step": 16903 + }, + { + "epoch": 3.009437321937322, + "grad_norm": 0.8000853061676025, + "learning_rate": 2.8845330731594898e-05, + "loss": 0.7582, + "step": 16904 + }, + { + "epoch": 3.0096153846153846, + "grad_norm": 0.9916318655014038, + "learning_rate": 2.883549626143537e-05, + "loss": 0.8114, + "step": 16905 + }, + { + "epoch": 3.0097934472934473, + "grad_norm": 0.9104628562927246, + "learning_rate": 2.882566318559645e-05, + "loss": 1.059, + "step": 16906 + }, + { + "epoch": 3.00997150997151, + "grad_norm": 0.8790180683135986, + "learning_rate": 2.8815831504270807e-05, + "loss": 0.8905, + "step": 16907 + }, + { + 
"epoch": 3.010149572649573, + "grad_norm": 0.954405665397644, + "learning_rate": 2.880600121765108e-05, + "loss": 0.9691, + "step": 16908 + }, + { + "epoch": 3.010327635327635, + "grad_norm": 0.8337442874908447, + "learning_rate": 2.879617232592986e-05, + "loss": 0.6808, + "step": 16909 + }, + { + "epoch": 3.010505698005698, + "grad_norm": 0.9617137908935547, + "learning_rate": 2.878634482929974e-05, + "loss": 0.7545, + "step": 16910 + }, + { + "epoch": 3.0106837606837606, + "grad_norm": 0.8164490461349487, + "learning_rate": 2.877651872795325e-05, + "loss": 0.7219, + "step": 16911 + }, + { + "epoch": 3.0108618233618234, + "grad_norm": 0.7971426248550415, + "learning_rate": 2.8766694022082895e-05, + "loss": 0.7519, + "step": 16912 + }, + { + "epoch": 3.011039886039886, + "grad_norm": 0.8035764694213867, + "learning_rate": 2.8756870711881255e-05, + "loss": 0.5414, + "step": 16913 + }, + { + "epoch": 3.011217948717949, + "grad_norm": 0.851365864276886, + "learning_rate": 2.8747048797540676e-05, + "loss": 0.8258, + "step": 16914 + }, + { + "epoch": 3.011396011396011, + "grad_norm": 0.9337644577026367, + "learning_rate": 2.87372282792537e-05, + "loss": 0.7766, + "step": 16915 + }, + { + "epoch": 3.011574074074074, + "grad_norm": 0.805377721786499, + "learning_rate": 2.8727409157212693e-05, + "loss": 0.5635, + "step": 16916 + }, + { + "epoch": 3.0117521367521367, + "grad_norm": 0.834541916847229, + "learning_rate": 2.8717591431610046e-05, + "loss": 0.6602, + "step": 16917 + }, + { + "epoch": 3.0119301994301995, + "grad_norm": 0.7747098207473755, + "learning_rate": 2.8707775102638124e-05, + "loss": 0.6742, + "step": 16918 + }, + { + "epoch": 3.012108262108262, + "grad_norm": 0.8491533398628235, + "learning_rate": 2.8697960170489226e-05, + "loss": 0.7951, + "step": 16919 + }, + { + "epoch": 3.012286324786325, + "grad_norm": 0.8623409867286682, + "learning_rate": 2.8688146635355727e-05, + "loss": 0.7455, + "step": 16920 + }, + { + "epoch": 3.0124643874643873, + 
"grad_norm": 0.8398233652114868, + "learning_rate": 2.8678334497429804e-05, + "loss": 0.8699, + "step": 16921 + }, + { + "epoch": 3.01264245014245, + "grad_norm": 0.8916721940040588, + "learning_rate": 2.8668523756903798e-05, + "loss": 0.7295, + "step": 16922 + }, + { + "epoch": 3.0128205128205128, + "grad_norm": 0.8889368772506714, + "learning_rate": 2.865871441396988e-05, + "loss": 0.7543, + "step": 16923 + }, + { + "epoch": 3.0129985754985755, + "grad_norm": 0.8547541499137878, + "learning_rate": 2.8648906468820257e-05, + "loss": 0.7837, + "step": 16924 + }, + { + "epoch": 3.0131766381766383, + "grad_norm": 0.7525152564048767, + "learning_rate": 2.8639099921647096e-05, + "loss": 0.8279, + "step": 16925 + }, + { + "epoch": 3.013354700854701, + "grad_norm": 0.8209140300750732, + "learning_rate": 2.8629294772642523e-05, + "loss": 0.8206, + "step": 16926 + }, + { + "epoch": 3.0135327635327633, + "grad_norm": 0.8411595821380615, + "learning_rate": 2.8619491021998633e-05, + "loss": 0.7606, + "step": 16927 + }, + { + "epoch": 3.013710826210826, + "grad_norm": 0.8703480362892151, + "learning_rate": 2.860968866990761e-05, + "loss": 0.8538, + "step": 16928 + }, + { + "epoch": 3.013888888888889, + "grad_norm": 0.9207778573036194, + "learning_rate": 2.8599887716561357e-05, + "loss": 0.7485, + "step": 16929 + }, + { + "epoch": 3.0140669515669516, + "grad_norm": 0.8924398422241211, + "learning_rate": 2.8590088162152016e-05, + "loss": 0.9015, + "step": 16930 + }, + { + "epoch": 3.0142450142450143, + "grad_norm": 0.8561375141143799, + "learning_rate": 2.858029000687157e-05, + "loss": 0.7663, + "step": 16931 + }, + { + "epoch": 3.014423076923077, + "grad_norm": 0.9084187746047974, + "learning_rate": 2.8570493250911967e-05, + "loss": 0.8277, + "step": 16932 + }, + { + "epoch": 3.0146011396011394, + "grad_norm": 0.9076225757598877, + "learning_rate": 2.856069789446517e-05, + "loss": 0.7972, + "step": 16933 + }, + { + "epoch": 3.014779202279202, + "grad_norm": 0.9396339058876038, + 
"learning_rate": 2.8550903937723104e-05, + "loss": 0.6968, + "step": 16934 + }, + { + "epoch": 3.014957264957265, + "grad_norm": 0.8107509613037109, + "learning_rate": 2.854111138087765e-05, + "loss": 0.6931, + "step": 16935 + }, + { + "epoch": 3.0151353276353277, + "grad_norm": 0.908308207988739, + "learning_rate": 2.8531320224120663e-05, + "loss": 0.8363, + "step": 16936 + }, + { + "epoch": 3.0153133903133904, + "grad_norm": 0.8976947665214539, + "learning_rate": 2.8521530467643976e-05, + "loss": 0.7907, + "step": 16937 + }, + { + "epoch": 3.015491452991453, + "grad_norm": 0.8451889753341675, + "learning_rate": 2.8511742111639473e-05, + "loss": 0.8333, + "step": 16938 + }, + { + "epoch": 3.0156695156695155, + "grad_norm": 0.8719599843025208, + "learning_rate": 2.8501955156298808e-05, + "loss": 0.8047, + "step": 16939 + }, + { + "epoch": 3.015847578347578, + "grad_norm": 0.7666802406311035, + "learning_rate": 2.8492169601813856e-05, + "loss": 0.6434, + "step": 16940 + }, + { + "epoch": 3.016025641025641, + "grad_norm": 0.9386308789253235, + "learning_rate": 2.848238544837628e-05, + "loss": 0.7351, + "step": 16941 + }, + { + "epoch": 3.0162037037037037, + "grad_norm": 0.9501772522926331, + "learning_rate": 2.8472602696177773e-05, + "loss": 0.7057, + "step": 16942 + }, + { + "epoch": 3.0163817663817665, + "grad_norm": 1.0069975852966309, + "learning_rate": 2.8462821345410097e-05, + "loss": 0.8611, + "step": 16943 + }, + { + "epoch": 3.0165598290598292, + "grad_norm": 0.9124653935432434, + "learning_rate": 2.8453041396264756e-05, + "loss": 0.6097, + "step": 16944 + }, + { + "epoch": 3.0167378917378915, + "grad_norm": 0.8535616993904114, + "learning_rate": 2.8443262848933515e-05, + "loss": 0.7971, + "step": 16945 + }, + { + "epoch": 3.0169159544159543, + "grad_norm": 0.9309000968933105, + "learning_rate": 2.843348570360783e-05, + "loss": 0.8864, + "step": 16946 + }, + { + "epoch": 3.017094017094017, + "grad_norm": 0.8818594813346863, + "learning_rate": 
2.842370996047935e-05, + "loss": 0.7748, + "step": 16947 + }, + { + "epoch": 3.01727207977208, + "grad_norm": 1.034568428993225, + "learning_rate": 2.8413935619739596e-05, + "loss": 0.6979, + "step": 16948 + }, + { + "epoch": 3.0174501424501425, + "grad_norm": 0.9504423141479492, + "learning_rate": 2.840416268158006e-05, + "loss": 0.7447, + "step": 16949 + }, + { + "epoch": 3.0176282051282053, + "grad_norm": 0.9099027514457703, + "learning_rate": 2.8394391146192234e-05, + "loss": 0.8136, + "step": 16950 + }, + { + "epoch": 3.0178062678062676, + "grad_norm": 0.8165521025657654, + "learning_rate": 2.8384621013767565e-05, + "loss": 0.7306, + "step": 16951 + }, + { + "epoch": 3.0179843304843303, + "grad_norm": 0.832309901714325, + "learning_rate": 2.8374852284497446e-05, + "loss": 0.6333, + "step": 16952 + }, + { + "epoch": 3.018162393162393, + "grad_norm": 0.8041273951530457, + "learning_rate": 2.8365084958573363e-05, + "loss": 0.6899, + "step": 16953 + }, + { + "epoch": 3.018340455840456, + "grad_norm": 0.9186970591545105, + "learning_rate": 2.8355319036186577e-05, + "loss": 0.7873, + "step": 16954 + }, + { + "epoch": 3.0185185185185186, + "grad_norm": 0.8950297832489014, + "learning_rate": 2.834555451752855e-05, + "loss": 0.945, + "step": 16955 + }, + { + "epoch": 3.0186965811965814, + "grad_norm": 1.0703802108764648, + "learning_rate": 2.8335791402790468e-05, + "loss": 0.8377, + "step": 16956 + }, + { + "epoch": 3.0188746438746437, + "grad_norm": 0.9604542851448059, + "learning_rate": 2.8326029692163712e-05, + "loss": 0.8197, + "step": 16957 + }, + { + "epoch": 3.0190527065527064, + "grad_norm": 0.9822934865951538, + "learning_rate": 2.8316269385839524e-05, + "loss": 0.839, + "step": 16958 + }, + { + "epoch": 3.019230769230769, + "grad_norm": 0.945202648639679, + "learning_rate": 2.8306510484009085e-05, + "loss": 0.888, + "step": 16959 + }, + { + "epoch": 3.019408831908832, + "grad_norm": 0.9367056488990784, + "learning_rate": 2.8296752986863706e-05, + "loss": 
0.8231, + "step": 16960 + }, + { + "epoch": 3.0195868945868947, + "grad_norm": 0.8301896452903748, + "learning_rate": 2.828699689459444e-05, + "loss": 0.7208, + "step": 16961 + }, + { + "epoch": 3.0197649572649574, + "grad_norm": 1.0018302202224731, + "learning_rate": 2.827724220739254e-05, + "loss": 0.9264, + "step": 16962 + }, + { + "epoch": 3.0199430199430197, + "grad_norm": 0.8947141170501709, + "learning_rate": 2.8267488925449083e-05, + "loss": 0.7169, + "step": 16963 + }, + { + "epoch": 3.0201210826210825, + "grad_norm": 0.914278507232666, + "learning_rate": 2.8257737048955167e-05, + "loss": 0.7964, + "step": 16964 + }, + { + "epoch": 3.0202991452991452, + "grad_norm": 0.8575536012649536, + "learning_rate": 2.824798657810186e-05, + "loss": 0.7144, + "step": 16965 + }, + { + "epoch": 3.020477207977208, + "grad_norm": 0.8627474904060364, + "learning_rate": 2.82382375130802e-05, + "loss": 0.6284, + "step": 16966 + }, + { + "epoch": 3.0206552706552707, + "grad_norm": 0.9590699076652527, + "learning_rate": 2.822848985408122e-05, + "loss": 0.6919, + "step": 16967 + }, + { + "epoch": 3.0208333333333335, + "grad_norm": 0.8014170527458191, + "learning_rate": 2.821874360129587e-05, + "loss": 0.7017, + "step": 16968 + }, + { + "epoch": 3.021011396011396, + "grad_norm": 0.9732417464256287, + "learning_rate": 2.8208998754915107e-05, + "loss": 0.9154, + "step": 16969 + }, + { + "epoch": 3.0211894586894585, + "grad_norm": 1.035178780555725, + "learning_rate": 2.8199255315129934e-05, + "loss": 0.7987, + "step": 16970 + }, + { + "epoch": 3.0213675213675213, + "grad_norm": 0.9089648723602295, + "learning_rate": 2.818951328213115e-05, + "loss": 0.7842, + "step": 16971 + }, + { + "epoch": 3.021545584045584, + "grad_norm": 0.8718148469924927, + "learning_rate": 2.8179772656109702e-05, + "loss": 0.5428, + "step": 16972 + }, + { + "epoch": 3.021723646723647, + "grad_norm": 0.9230215549468994, + "learning_rate": 2.8170033437256415e-05, + "loss": 0.8986, + "step": 16973 + }, + { + 
"epoch": 3.0219017094017095, + "grad_norm": 0.7892549633979797, + "learning_rate": 2.8160295625762112e-05, + "loss": 0.7338, + "step": 16974 + }, + { + "epoch": 3.0220797720797723, + "grad_norm": 0.8036878108978271, + "learning_rate": 2.8150559221817586e-05, + "loss": 0.4678, + "step": 16975 + }, + { + "epoch": 3.0222578347578346, + "grad_norm": 0.8545025587081909, + "learning_rate": 2.8140824225613594e-05, + "loss": 0.7263, + "step": 16976 + }, + { + "epoch": 3.0224358974358974, + "grad_norm": 0.8124253749847412, + "learning_rate": 2.813109063734084e-05, + "loss": 0.5429, + "step": 16977 + }, + { + "epoch": 3.02261396011396, + "grad_norm": 0.9141904711723328, + "learning_rate": 2.8121358457190116e-05, + "loss": 0.7035, + "step": 16978 + }, + { + "epoch": 3.022792022792023, + "grad_norm": 0.9336223602294922, + "learning_rate": 2.8111627685352048e-05, + "loss": 0.8263, + "step": 16979 + }, + { + "epoch": 3.0229700854700856, + "grad_norm": 0.972653329372406, + "learning_rate": 2.8101898322017295e-05, + "loss": 0.6823, + "step": 16980 + }, + { + "epoch": 3.0231481481481484, + "grad_norm": 0.9060801267623901, + "learning_rate": 2.8092170367376502e-05, + "loss": 0.7235, + "step": 16981 + }, + { + "epoch": 3.0233262108262107, + "grad_norm": 0.832062304019928, + "learning_rate": 2.8082443821620243e-05, + "loss": 0.8151, + "step": 16982 + }, + { + "epoch": 3.0235042735042734, + "grad_norm": 0.7333422303199768, + "learning_rate": 2.8072718684939104e-05, + "loss": 0.6096, + "step": 16983 + }, + { + "epoch": 3.023682336182336, + "grad_norm": 0.8611993193626404, + "learning_rate": 2.8062994957523603e-05, + "loss": 0.8708, + "step": 16984 + }, + { + "epoch": 3.023860398860399, + "grad_norm": 0.9310294389724731, + "learning_rate": 2.805327263956433e-05, + "loss": 0.8801, + "step": 16985 + }, + { + "epoch": 3.0240384615384617, + "grad_norm": 0.8927240967750549, + "learning_rate": 2.804355173125167e-05, + "loss": 0.8838, + "step": 16986 + }, + { + "epoch": 3.0242165242165244, + 
"grad_norm": 0.8257143497467041, + "learning_rate": 2.803383223277617e-05, + "loss": 0.7531, + "step": 16987 + }, + { + "epoch": 3.0243945868945867, + "grad_norm": 1.0260870456695557, + "learning_rate": 2.8024114144328227e-05, + "loss": 0.9371, + "step": 16988 + }, + { + "epoch": 3.0245726495726495, + "grad_norm": 0.8376401662826538, + "learning_rate": 2.8014397466098252e-05, + "loss": 0.6104, + "step": 16989 + }, + { + "epoch": 3.0247507122507122, + "grad_norm": 0.9509117603302002, + "learning_rate": 2.800468219827662e-05, + "loss": 0.6892, + "step": 16990 + }, + { + "epoch": 3.024928774928775, + "grad_norm": 0.8816271424293518, + "learning_rate": 2.7994968341053684e-05, + "loss": 0.8559, + "step": 16991 + }, + { + "epoch": 3.0251068376068377, + "grad_norm": 0.9000606536865234, + "learning_rate": 2.7985255894619754e-05, + "loss": 0.6829, + "step": 16992 + }, + { + "epoch": 3.0252849002849005, + "grad_norm": 0.8596423268318176, + "learning_rate": 2.797554485916515e-05, + "loss": 0.762, + "step": 16993 + }, + { + "epoch": 3.025462962962963, + "grad_norm": 0.92213374376297, + "learning_rate": 2.79658352348801e-05, + "loss": 0.8572, + "step": 16994 + }, + { + "epoch": 3.0256410256410255, + "grad_norm": 0.917836606502533, + "learning_rate": 2.7956127021954927e-05, + "loss": 0.7979, + "step": 16995 + }, + { + "epoch": 3.0258190883190883, + "grad_norm": 0.798367440700531, + "learning_rate": 2.7946420220579718e-05, + "loss": 0.6453, + "step": 16996 + }, + { + "epoch": 3.025997150997151, + "grad_norm": 1.0838942527770996, + "learning_rate": 2.7936714830944767e-05, + "loss": 0.7837, + "step": 16997 + }, + { + "epoch": 3.026175213675214, + "grad_norm": 0.9623873829841614, + "learning_rate": 2.7927010853240187e-05, + "loss": 0.9064, + "step": 16998 + }, + { + "epoch": 3.0263532763532766, + "grad_norm": 0.8624036908149719, + "learning_rate": 2.7917308287656075e-05, + "loss": 0.6475, + "step": 16999 + }, + { + "epoch": 3.026531339031339, + "grad_norm": 0.9793442487716675, + 
"learning_rate": 2.790760713438264e-05, + "loss": 0.7084, + "step": 17000 + }, + { + "epoch": 3.0267094017094016, + "grad_norm": 0.9052400588989258, + "learning_rate": 2.7897907393609812e-05, + "loss": 0.7263, + "step": 17001 + }, + { + "epoch": 3.0268874643874644, + "grad_norm": 0.7912825345993042, + "learning_rate": 2.788820906552775e-05, + "loss": 0.5578, + "step": 17002 + }, + { + "epoch": 3.027065527065527, + "grad_norm": 0.8624247312545776, + "learning_rate": 2.787851215032642e-05, + "loss": 0.6267, + "step": 17003 + }, + { + "epoch": 3.02724358974359, + "grad_norm": 0.9548758268356323, + "learning_rate": 2.786881664819584e-05, + "loss": 0.8152, + "step": 17004 + }, + { + "epoch": 3.0274216524216526, + "grad_norm": 0.7717092633247375, + "learning_rate": 2.7859122559325944e-05, + "loss": 0.6738, + "step": 17005 + }, + { + "epoch": 3.027599715099715, + "grad_norm": 0.9009813070297241, + "learning_rate": 2.7849429883906684e-05, + "loss": 0.5467, + "step": 17006 + }, + { + "epoch": 3.0277777777777777, + "grad_norm": 0.8310969471931458, + "learning_rate": 2.7839738622127974e-05, + "loss": 0.5986, + "step": 17007 + }, + { + "epoch": 3.0279558404558404, + "grad_norm": 0.9535056948661804, + "learning_rate": 2.7830048774179685e-05, + "loss": 0.7904, + "step": 17008 + }, + { + "epoch": 3.028133903133903, + "grad_norm": 0.9698938727378845, + "learning_rate": 2.782036034025164e-05, + "loss": 0.7514, + "step": 17009 + }, + { + "epoch": 3.028311965811966, + "grad_norm": 0.9064823389053345, + "learning_rate": 2.781067332053374e-05, + "loss": 0.7532, + "step": 17010 + }, + { + "epoch": 3.0284900284900287, + "grad_norm": 0.8301812410354614, + "learning_rate": 2.7800987715215686e-05, + "loss": 0.7068, + "step": 17011 + }, + { + "epoch": 3.028668091168091, + "grad_norm": 0.8986283540725708, + "learning_rate": 2.7791303524487332e-05, + "loss": 0.9139, + "step": 17012 + }, + { + "epoch": 3.0288461538461537, + "grad_norm": 0.9557301998138428, + "learning_rate": 
2.7781620748538384e-05, + "loss": 0.7074, + "step": 17013 + }, + { + "epoch": 3.0290242165242165, + "grad_norm": 0.9753836989402771, + "learning_rate": 2.7771939387558554e-05, + "loss": 0.6967, + "step": 17014 + }, + { + "epoch": 3.0292022792022792, + "grad_norm": 0.8353948593139648, + "learning_rate": 2.7762259441737526e-05, + "loss": 0.7621, + "step": 17015 + }, + { + "epoch": 3.029380341880342, + "grad_norm": 0.9676481485366821, + "learning_rate": 2.7752580911264968e-05, + "loss": 0.8218, + "step": 17016 + }, + { + "epoch": 3.0295584045584047, + "grad_norm": 0.7660765051841736, + "learning_rate": 2.774290379633051e-05, + "loss": 0.8033, + "step": 17017 + }, + { + "epoch": 3.029736467236467, + "grad_norm": 0.916705310344696, + "learning_rate": 2.773322809712371e-05, + "loss": 0.849, + "step": 17018 + }, + { + "epoch": 3.02991452991453, + "grad_norm": 0.9600055813789368, + "learning_rate": 2.7723553813834224e-05, + "loss": 0.6848, + "step": 17019 + }, + { + "epoch": 3.0300925925925926, + "grad_norm": 1.0234928131103516, + "learning_rate": 2.771388094665156e-05, + "loss": 0.8953, + "step": 17020 + }, + { + "epoch": 3.0302706552706553, + "grad_norm": 0.8239841461181641, + "learning_rate": 2.7704209495765232e-05, + "loss": 0.8361, + "step": 17021 + }, + { + "epoch": 3.030448717948718, + "grad_norm": 1.0438402891159058, + "learning_rate": 2.7694539461364742e-05, + "loss": 0.6722, + "step": 17022 + }, + { + "epoch": 3.030626780626781, + "grad_norm": 0.7628459930419922, + "learning_rate": 2.7684870843639545e-05, + "loss": 0.6289, + "step": 17023 + }, + { + "epoch": 3.030804843304843, + "grad_norm": 0.8909597396850586, + "learning_rate": 2.7675203642779057e-05, + "loss": 0.7166, + "step": 17024 + }, + { + "epoch": 3.030982905982906, + "grad_norm": 1.116269826889038, + "learning_rate": 2.7665537858972767e-05, + "loss": 0.8198, + "step": 17025 + }, + { + "epoch": 3.0311609686609686, + "grad_norm": 0.8789699077606201, + "learning_rate": 2.7655873492409946e-05, + "loss": 
0.5923, + "step": 17026 + }, + { + "epoch": 3.0313390313390314, + "grad_norm": 0.8829667568206787, + "learning_rate": 2.7646210543280048e-05, + "loss": 0.6907, + "step": 17027 + }, + { + "epoch": 3.031517094017094, + "grad_norm": 0.9860467314720154, + "learning_rate": 2.7636549011772307e-05, + "loss": 0.7255, + "step": 17028 + }, + { + "epoch": 3.031695156695157, + "grad_norm": 0.8999147415161133, + "learning_rate": 2.7626888898076096e-05, + "loss": 0.6997, + "step": 17029 + }, + { + "epoch": 3.031873219373219, + "grad_norm": 0.8238608241081238, + "learning_rate": 2.761723020238065e-05, + "loss": 0.6271, + "step": 17030 + }, + { + "epoch": 3.032051282051282, + "grad_norm": 0.8773884177207947, + "learning_rate": 2.7607572924875214e-05, + "loss": 0.8365, + "step": 17031 + }, + { + "epoch": 3.0322293447293447, + "grad_norm": 0.9619107246398926, + "learning_rate": 2.7597917065749013e-05, + "loss": 0.746, + "step": 17032 + }, + { + "epoch": 3.0324074074074074, + "grad_norm": 0.8207967877388, + "learning_rate": 2.7588262625191218e-05, + "loss": 0.7497, + "step": 17033 + }, + { + "epoch": 3.03258547008547, + "grad_norm": 0.9363070726394653, + "learning_rate": 2.757860960339097e-05, + "loss": 0.749, + "step": 17034 + }, + { + "epoch": 3.032763532763533, + "grad_norm": 0.7946116328239441, + "learning_rate": 2.756895800053748e-05, + "loss": 0.7944, + "step": 17035 + }, + { + "epoch": 3.0329415954415953, + "grad_norm": 0.8666053414344788, + "learning_rate": 2.7559307816819734e-05, + "loss": 0.7648, + "step": 17036 + }, + { + "epoch": 3.033119658119658, + "grad_norm": 0.9086824655532837, + "learning_rate": 2.75496590524269e-05, + "loss": 0.6917, + "step": 17037 + }, + { + "epoch": 3.0332977207977208, + "grad_norm": 0.8419581055641174, + "learning_rate": 2.7540011707547997e-05, + "loss": 0.6373, + "step": 17038 + }, + { + "epoch": 3.0334757834757835, + "grad_norm": 0.9141862392425537, + "learning_rate": 2.7530365782372035e-05, + "loss": 0.8568, + "step": 17039 + }, + { + 
"epoch": 3.0336538461538463, + "grad_norm": 0.9606926441192627, + "learning_rate": 2.7520721277088024e-05, + "loss": 0.7957, + "step": 17040 + }, + { + "epoch": 3.033831908831909, + "grad_norm": 0.9588294625282288, + "learning_rate": 2.7511078191884877e-05, + "loss": 0.8614, + "step": 17041 + }, + { + "epoch": 3.0340099715099713, + "grad_norm": 0.9736807942390442, + "learning_rate": 2.750143652695163e-05, + "loss": 0.9155, + "step": 17042 + }, + { + "epoch": 3.034188034188034, + "grad_norm": 0.886987030506134, + "learning_rate": 2.7491796282477078e-05, + "loss": 0.8215, + "step": 17043 + }, + { + "epoch": 3.034366096866097, + "grad_norm": 0.8999798893928528, + "learning_rate": 2.7482157458650182e-05, + "loss": 0.8449, + "step": 17044 + }, + { + "epoch": 3.0345441595441596, + "grad_norm": 0.8654738068580627, + "learning_rate": 2.7472520055659768e-05, + "loss": 0.6808, + "step": 17045 + }, + { + "epoch": 3.0347222222222223, + "grad_norm": 0.9802120923995972, + "learning_rate": 2.7462884073694662e-05, + "loss": 0.6272, + "step": 17046 + }, + { + "epoch": 3.034900284900285, + "grad_norm": 0.9409570693969727, + "learning_rate": 2.7453249512943658e-05, + "loss": 0.7017, + "step": 17047 + }, + { + "epoch": 3.0350783475783474, + "grad_norm": 0.9058420062065125, + "learning_rate": 2.7443616373595526e-05, + "loss": 0.668, + "step": 17048 + }, + { + "epoch": 3.03525641025641, + "grad_norm": 0.6895080208778381, + "learning_rate": 2.7433984655838985e-05, + "loss": 0.4501, + "step": 17049 + }, + { + "epoch": 3.035434472934473, + "grad_norm": 0.8462308049201965, + "learning_rate": 2.7424354359862824e-05, + "loss": 0.7687, + "step": 17050 + }, + { + "epoch": 3.0356125356125356, + "grad_norm": 1.0689095258712769, + "learning_rate": 2.741472548585562e-05, + "loss": 0.8657, + "step": 17051 + }, + { + "epoch": 3.0357905982905984, + "grad_norm": 0.8982266783714294, + "learning_rate": 2.7405098034006148e-05, + "loss": 0.6265, + "step": 17052 + }, + { + "epoch": 3.035968660968661, + 
"grad_norm": 0.9081108570098877, + "learning_rate": 2.739547200450292e-05, + "loss": 0.7115, + "step": 17053 + }, + { + "epoch": 3.0361467236467234, + "grad_norm": 0.9865546226501465, + "learning_rate": 2.7385847397534615e-05, + "loss": 0.8161, + "step": 17054 + }, + { + "epoch": 3.036324786324786, + "grad_norm": 0.9714661836624146, + "learning_rate": 2.737622421328979e-05, + "loss": 0.7357, + "step": 17055 + }, + { + "epoch": 3.036502849002849, + "grad_norm": 0.8435664772987366, + "learning_rate": 2.7366602451957003e-05, + "loss": 0.5715, + "step": 17056 + }, + { + "epoch": 3.0366809116809117, + "grad_norm": 0.9254729747772217, + "learning_rate": 2.7356982113724737e-05, + "loss": 0.8485, + "step": 17057 + }, + { + "epoch": 3.0368589743589745, + "grad_norm": 0.9366668462753296, + "learning_rate": 2.7347363198781496e-05, + "loss": 0.909, + "step": 17058 + }, + { + "epoch": 3.037037037037037, + "grad_norm": 0.8752120137214661, + "learning_rate": 2.7337745707315764e-05, + "loss": 0.6937, + "step": 17059 + }, + { + "epoch": 3.0372150997150995, + "grad_norm": 0.9134506583213806, + "learning_rate": 2.7328129639515963e-05, + "loss": 0.719, + "step": 17060 + }, + { + "epoch": 3.0373931623931623, + "grad_norm": 0.8471070528030396, + "learning_rate": 2.731851499557051e-05, + "loss": 0.6216, + "step": 17061 + }, + { + "epoch": 3.037571225071225, + "grad_norm": 0.8586950302124023, + "learning_rate": 2.730890177566776e-05, + "loss": 0.6494, + "step": 17062 + }, + { + "epoch": 3.0377492877492878, + "grad_norm": 0.915189802646637, + "learning_rate": 2.7299289979996078e-05, + "loss": 0.7831, + "step": 17063 + }, + { + "epoch": 3.0379273504273505, + "grad_norm": 0.9249003529548645, + "learning_rate": 2.7289679608743802e-05, + "loss": 0.6142, + "step": 17064 + }, + { + "epoch": 3.0381054131054133, + "grad_norm": 1.1311204433441162, + "learning_rate": 2.7280070662099198e-05, + "loss": 0.6026, + "step": 17065 + }, + { + "epoch": 3.0382834757834756, + "grad_norm": 0.8478325605392456, + 
"learning_rate": 2.7270463140250514e-05, + "loss": 0.6896, + "step": 17066 + }, + { + "epoch": 3.0384615384615383, + "grad_norm": 0.9298551678657532, + "learning_rate": 2.726085704338609e-05, + "loss": 0.7415, + "step": 17067 + }, + { + "epoch": 3.038639601139601, + "grad_norm": 0.8141112327575684, + "learning_rate": 2.7251252371694e-05, + "loss": 0.6328, + "step": 17068 + }, + { + "epoch": 3.038817663817664, + "grad_norm": 0.905672550201416, + "learning_rate": 2.7241649125362544e-05, + "loss": 0.8341, + "step": 17069 + }, + { + "epoch": 3.0389957264957266, + "grad_norm": 0.789050281047821, + "learning_rate": 2.7232047304579822e-05, + "loss": 0.7296, + "step": 17070 + }, + { + "epoch": 3.0391737891737893, + "grad_norm": 0.985736608505249, + "learning_rate": 2.722244690953397e-05, + "loss": 0.7118, + "step": 17071 + }, + { + "epoch": 3.0393518518518516, + "grad_norm": 0.8022065758705139, + "learning_rate": 2.7212847940413088e-05, + "loss": 0.5858, + "step": 17072 + }, + { + "epoch": 3.0395299145299144, + "grad_norm": 0.7794371843338013, + "learning_rate": 2.7203250397405244e-05, + "loss": 0.7273, + "step": 17073 + }, + { + "epoch": 3.039707977207977, + "grad_norm": 0.8665072321891785, + "learning_rate": 2.7193654280698466e-05, + "loss": 0.6504, + "step": 17074 + }, + { + "epoch": 3.03988603988604, + "grad_norm": 0.9348331689834595, + "learning_rate": 2.7184059590480848e-05, + "loss": 0.7944, + "step": 17075 + }, + { + "epoch": 3.0400641025641026, + "grad_norm": 1.0338337421417236, + "learning_rate": 2.717446632694025e-05, + "loss": 0.8487, + "step": 17076 + }, + { + "epoch": 3.0402421652421654, + "grad_norm": 0.9517617225646973, + "learning_rate": 2.7164874490264767e-05, + "loss": 0.6806, + "step": 17077 + }, + { + "epoch": 3.0404202279202277, + "grad_norm": 0.9159125685691833, + "learning_rate": 2.715528408064222e-05, + "loss": 0.6962, + "step": 17078 + }, + { + "epoch": 3.0405982905982905, + "grad_norm": 0.9007083177566528, + "learning_rate": 
2.714569509826057e-05, + "loss": 0.5848, + "step": 17079 + }, + { + "epoch": 3.040776353276353, + "grad_norm": 0.8828113079071045, + "learning_rate": 2.71361075433077e-05, + "loss": 0.7774, + "step": 17080 + }, + { + "epoch": 3.040954415954416, + "grad_norm": 0.8390265703201294, + "learning_rate": 2.7126521415971405e-05, + "loss": 0.6373, + "step": 17081 + }, + { + "epoch": 3.0411324786324787, + "grad_norm": 0.8794222474098206, + "learning_rate": 2.71169367164396e-05, + "loss": 0.6907, + "step": 17082 + }, + { + "epoch": 3.0413105413105415, + "grad_norm": 0.8306505680084229, + "learning_rate": 2.710735344489995e-05, + "loss": 0.6983, + "step": 17083 + }, + { + "epoch": 3.041488603988604, + "grad_norm": 0.9314624071121216, + "learning_rate": 2.7097771601540333e-05, + "loss": 0.7328, + "step": 17084 + }, + { + "epoch": 3.0416666666666665, + "grad_norm": 0.9759663939476013, + "learning_rate": 2.7088191186548427e-05, + "loss": 0.7334, + "step": 17085 + }, + { + "epoch": 3.0418447293447293, + "grad_norm": 0.9378162622451782, + "learning_rate": 2.7078612200111962e-05, + "loss": 0.8155, + "step": 17086 + }, + { + "epoch": 3.042022792022792, + "grad_norm": 0.8654826283454895, + "learning_rate": 2.706903464241861e-05, + "loss": 0.7509, + "step": 17087 + }, + { + "epoch": 3.0422008547008548, + "grad_norm": 0.9504775404930115, + "learning_rate": 2.705945851365602e-05, + "loss": 0.5744, + "step": 17088 + }, + { + "epoch": 3.0423789173789175, + "grad_norm": 0.8799991607666016, + "learning_rate": 2.7049883814011822e-05, + "loss": 0.8687, + "step": 17089 + }, + { + "epoch": 3.04255698005698, + "grad_norm": 0.9499333500862122, + "learning_rate": 2.704031054367361e-05, + "loss": 0.7577, + "step": 17090 + }, + { + "epoch": 3.0427350427350426, + "grad_norm": 0.8906816840171814, + "learning_rate": 2.7030738702828918e-05, + "loss": 0.7075, + "step": 17091 + }, + { + "epoch": 3.0429131054131053, + "grad_norm": 0.8520913124084473, + "learning_rate": 2.7021168291665388e-05, + "loss": 
0.7406, + "step": 17092 + }, + { + "epoch": 3.043091168091168, + "grad_norm": 0.9780383706092834, + "learning_rate": 2.7011599310370393e-05, + "loss": 0.7419, + "step": 17093 + }, + { + "epoch": 3.043269230769231, + "grad_norm": 0.9649802446365356, + "learning_rate": 2.700203175913153e-05, + "loss": 0.9557, + "step": 17094 + }, + { + "epoch": 3.0434472934472936, + "grad_norm": 0.8526589274406433, + "learning_rate": 2.6992465638136212e-05, + "loss": 0.7484, + "step": 17095 + }, + { + "epoch": 3.0436253561253563, + "grad_norm": 0.9232372641563416, + "learning_rate": 2.6982900947571864e-05, + "loss": 0.8444, + "step": 17096 + }, + { + "epoch": 3.0438034188034186, + "grad_norm": 0.8113499283790588, + "learning_rate": 2.69733376876259e-05, + "loss": 0.6301, + "step": 17097 + }, + { + "epoch": 3.0439814814814814, + "grad_norm": 0.8863048553466797, + "learning_rate": 2.6963775858485652e-05, + "loss": 0.7519, + "step": 17098 + }, + { + "epoch": 3.044159544159544, + "grad_norm": 0.8750777244567871, + "learning_rate": 2.6954215460338562e-05, + "loss": 0.7545, + "step": 17099 + }, + { + "epoch": 3.044337606837607, + "grad_norm": 0.937821090221405, + "learning_rate": 2.6944656493371812e-05, + "loss": 0.773, + "step": 17100 + }, + { + "epoch": 3.0445156695156697, + "grad_norm": 0.8423610925674438, + "learning_rate": 2.6935098957772785e-05, + "loss": 0.6537, + "step": 17101 + }, + { + "epoch": 3.0446937321937324, + "grad_norm": 0.7845390439033508, + "learning_rate": 2.6925542853728726e-05, + "loss": 0.6826, + "step": 17102 + }, + { + "epoch": 3.0448717948717947, + "grad_norm": 0.9942399263381958, + "learning_rate": 2.6915988181426842e-05, + "loss": 0.8664, + "step": 17103 + }, + { + "epoch": 3.0450498575498575, + "grad_norm": 0.8977256417274475, + "learning_rate": 2.690643494105437e-05, + "loss": 0.6945, + "step": 17104 + }, + { + "epoch": 3.04522792022792, + "grad_norm": 1.0192148685455322, + "learning_rate": 2.6896883132798456e-05, + "loss": 0.7012, + "step": 17105 + }, + { + 
"epoch": 3.045405982905983, + "grad_norm": 0.9779385924339294, + "learning_rate": 2.688733275684623e-05, + "loss": 0.7788, + "step": 17106 + }, + { + "epoch": 3.0455840455840457, + "grad_norm": 0.9183738231658936, + "learning_rate": 2.6877783813384894e-05, + "loss": 0.6973, + "step": 17107 + }, + { + "epoch": 3.0457621082621085, + "grad_norm": 1.0147916078567505, + "learning_rate": 2.6868236302601422e-05, + "loss": 0.7316, + "step": 17108 + }, + { + "epoch": 3.0459401709401708, + "grad_norm": 0.8801769018173218, + "learning_rate": 2.685869022468299e-05, + "loss": 0.6735, + "step": 17109 + }, + { + "epoch": 3.0461182336182335, + "grad_norm": 0.8848022222518921, + "learning_rate": 2.684914557981657e-05, + "loss": 0.7432, + "step": 17110 + }, + { + "epoch": 3.0462962962962963, + "grad_norm": 0.9158660173416138, + "learning_rate": 2.6839602368189188e-05, + "loss": 0.7133, + "step": 17111 + }, + { + "epoch": 3.046474358974359, + "grad_norm": 0.8280047178268433, + "learning_rate": 2.6830060589987826e-05, + "loss": 0.7432, + "step": 17112 + }, + { + "epoch": 3.046652421652422, + "grad_norm": 0.8806058764457703, + "learning_rate": 2.6820520245399427e-05, + "loss": 0.5107, + "step": 17113 + }, + { + "epoch": 3.0468304843304845, + "grad_norm": 0.8557198643684387, + "learning_rate": 2.681098133461091e-05, + "loss": 0.7598, + "step": 17114 + }, + { + "epoch": 3.047008547008547, + "grad_norm": 0.9113288521766663, + "learning_rate": 2.6801443857809183e-05, + "loss": 0.799, + "step": 17115 + }, + { + "epoch": 3.0471866096866096, + "grad_norm": 0.8104450702667236, + "learning_rate": 2.6791907815181072e-05, + "loss": 0.6265, + "step": 17116 + }, + { + "epoch": 3.0473646723646723, + "grad_norm": 0.7648457288742065, + "learning_rate": 2.6782373206913492e-05, + "loss": 0.6227, + "step": 17117 + }, + { + "epoch": 3.047542735042735, + "grad_norm": 0.8509871363639832, + "learning_rate": 2.6772840033193204e-05, + "loss": 0.7835, + "step": 17118 + }, + { + "epoch": 3.047720797720798, + 
"grad_norm": 0.9865061044692993, + "learning_rate": 2.6763308294207e-05, + "loss": 0.6175, + "step": 17119 + }, + { + "epoch": 3.0478988603988606, + "grad_norm": 1.0529922246932983, + "learning_rate": 2.6753777990141625e-05, + "loss": 0.7668, + "step": 17120 + }, + { + "epoch": 3.048076923076923, + "grad_norm": 0.9097694158554077, + "learning_rate": 2.6744249121183795e-05, + "loss": 0.7583, + "step": 17121 + }, + { + "epoch": 3.0482549857549857, + "grad_norm": 1.0285091400146484, + "learning_rate": 2.6734721687520293e-05, + "loss": 0.748, + "step": 17122 + }, + { + "epoch": 3.0484330484330484, + "grad_norm": 0.9668599963188171, + "learning_rate": 2.6725195689337658e-05, + "loss": 0.7636, + "step": 17123 + }, + { + "epoch": 3.048611111111111, + "grad_norm": 0.8746935725212097, + "learning_rate": 2.671567112682265e-05, + "loss": 0.6558, + "step": 17124 + }, + { + "epoch": 3.048789173789174, + "grad_norm": 0.9465086460113525, + "learning_rate": 2.670614800016178e-05, + "loss": 0.706, + "step": 17125 + }, + { + "epoch": 3.0489672364672367, + "grad_norm": 1.0083024501800537, + "learning_rate": 2.6696626309541718e-05, + "loss": 0.8062, + "step": 17126 + }, + { + "epoch": 3.049145299145299, + "grad_norm": 0.9895032048225403, + "learning_rate": 2.668710605514898e-05, + "loss": 0.8804, + "step": 17127 + }, + { + "epoch": 3.0493233618233617, + "grad_norm": 0.903660237789154, + "learning_rate": 2.667758723717011e-05, + "loss": 0.7487, + "step": 17128 + }, + { + "epoch": 3.0495014245014245, + "grad_norm": 0.8843152523040771, + "learning_rate": 2.6668069855791598e-05, + "loss": 0.5683, + "step": 17129 + }, + { + "epoch": 3.0496794871794872, + "grad_norm": 0.9241248369216919, + "learning_rate": 2.6658553911199936e-05, + "loss": 0.7855, + "step": 17130 + }, + { + "epoch": 3.04985754985755, + "grad_norm": 1.0450769662857056, + "learning_rate": 2.664903940358152e-05, + "loss": 0.7898, + "step": 17131 + }, + { + "epoch": 3.0500356125356127, + "grad_norm": 0.993699312210083, + 
"learning_rate": 2.6639526333122855e-05, + "loss": 0.7119, + "step": 17132 + }, + { + "epoch": 3.050213675213675, + "grad_norm": 0.8813846707344055, + "learning_rate": 2.663001470001023e-05, + "loss": 0.7032, + "step": 17133 + }, + { + "epoch": 3.050391737891738, + "grad_norm": 0.9121679067611694, + "learning_rate": 2.662050450443011e-05, + "loss": 0.8321, + "step": 17134 + }, + { + "epoch": 3.0505698005698005, + "grad_norm": 0.8883702158927917, + "learning_rate": 2.6610995746568713e-05, + "loss": 0.7917, + "step": 17135 + }, + { + "epoch": 3.0507478632478633, + "grad_norm": 0.8827738761901855, + "learning_rate": 2.660148842661243e-05, + "loss": 0.7541, + "step": 17136 + }, + { + "epoch": 3.050925925925926, + "grad_norm": 0.8486347794532776, + "learning_rate": 2.6591982544747508e-05, + "loss": 0.6555, + "step": 17137 + }, + { + "epoch": 3.051103988603989, + "grad_norm": 0.7430234551429749, + "learning_rate": 2.6582478101160167e-05, + "loss": 0.5562, + "step": 17138 + }, + { + "epoch": 3.051282051282051, + "grad_norm": 0.8816348314285278, + "learning_rate": 2.657297509603671e-05, + "loss": 0.8661, + "step": 17139 + }, + { + "epoch": 3.051460113960114, + "grad_norm": 0.9184949994087219, + "learning_rate": 2.656347352956322e-05, + "loss": 0.7907, + "step": 17140 + }, + { + "epoch": 3.0516381766381766, + "grad_norm": 0.7924085855484009, + "learning_rate": 2.6553973401925946e-05, + "loss": 0.6181, + "step": 17141 + }, + { + "epoch": 3.0518162393162394, + "grad_norm": 0.9879988431930542, + "learning_rate": 2.6544474713310997e-05, + "loss": 0.8455, + "step": 17142 + }, + { + "epoch": 3.051994301994302, + "grad_norm": 0.8584102988243103, + "learning_rate": 2.6534977463904475e-05, + "loss": 0.89, + "step": 17143 + }, + { + "epoch": 3.052172364672365, + "grad_norm": 0.9548700451850891, + "learning_rate": 2.6525481653892447e-05, + "loss": 0.7599, + "step": 17144 + }, + { + "epoch": 3.052350427350427, + "grad_norm": 0.9319002628326416, + "learning_rate": 
2.6515987283460985e-05, + "loss": 0.934, + "step": 17145 + }, + { + "epoch": 3.05252849002849, + "grad_norm": 1.006009817123413, + "learning_rate": 2.650649435279611e-05, + "loss": 0.7246, + "step": 17146 + }, + { + "epoch": 3.0527065527065527, + "grad_norm": 0.8670983910560608, + "learning_rate": 2.6497002862083797e-05, + "loss": 0.6835, + "step": 17147 + }, + { + "epoch": 3.0528846153846154, + "grad_norm": 0.79991614818573, + "learning_rate": 2.6487512811509994e-05, + "loss": 0.5659, + "step": 17148 + }, + { + "epoch": 3.053062678062678, + "grad_norm": 0.8038514852523804, + "learning_rate": 2.6478024201260732e-05, + "loss": 0.7436, + "step": 17149 + }, + { + "epoch": 3.053240740740741, + "grad_norm": 0.9287776350975037, + "learning_rate": 2.646853703152179e-05, + "loss": 0.8187, + "step": 17150 + }, + { + "epoch": 3.0534188034188032, + "grad_norm": 0.952595591545105, + "learning_rate": 2.6459051302479153e-05, + "loss": 0.8553, + "step": 17151 + }, + { + "epoch": 3.053596866096866, + "grad_norm": 0.8326176404953003, + "learning_rate": 2.6449567014318643e-05, + "loss": 0.7641, + "step": 17152 + }, + { + "epoch": 3.0537749287749287, + "grad_norm": 0.8747450113296509, + "learning_rate": 2.6440084167226063e-05, + "loss": 0.7237, + "step": 17153 + }, + { + "epoch": 3.0539529914529915, + "grad_norm": 0.9608044624328613, + "learning_rate": 2.6430602761387247e-05, + "loss": 0.8389, + "step": 17154 + }, + { + "epoch": 3.0541310541310542, + "grad_norm": 1.0133368968963623, + "learning_rate": 2.642112279698793e-05, + "loss": 0.7081, + "step": 17155 + }, + { + "epoch": 3.054309116809117, + "grad_norm": 0.8604594469070435, + "learning_rate": 2.6411644274213832e-05, + "loss": 0.8706, + "step": 17156 + }, + { + "epoch": 3.0544871794871793, + "grad_norm": 0.8340579867362976, + "learning_rate": 2.640216719325074e-05, + "loss": 0.6569, + "step": 17157 + }, + { + "epoch": 3.054665242165242, + "grad_norm": 0.894295871257782, + "learning_rate": 2.6392691554284287e-05, + "loss": 
0.8071, + "step": 17158 + }, + { + "epoch": 3.054843304843305, + "grad_norm": 0.8919312953948975, + "learning_rate": 2.6383217357500133e-05, + "loss": 0.5967, + "step": 17159 + }, + { + "epoch": 3.0550213675213675, + "grad_norm": 0.9570869207382202, + "learning_rate": 2.6373744603083916e-05, + "loss": 0.8526, + "step": 17160 + }, + { + "epoch": 3.0551994301994303, + "grad_norm": 0.9222795367240906, + "learning_rate": 2.636427329122123e-05, + "loss": 0.7142, + "step": 17161 + }, + { + "epoch": 3.055377492877493, + "grad_norm": 0.8729871511459351, + "learning_rate": 2.635480342209764e-05, + "loss": 0.7344, + "step": 17162 + }, + { + "epoch": 3.0555555555555554, + "grad_norm": 0.9293301701545715, + "learning_rate": 2.6345334995898662e-05, + "loss": 0.7019, + "step": 17163 + }, + { + "epoch": 3.055733618233618, + "grad_norm": 0.9955607652664185, + "learning_rate": 2.6335868012809905e-05, + "loss": 0.8012, + "step": 17164 + }, + { + "epoch": 3.055911680911681, + "grad_norm": 0.8420068621635437, + "learning_rate": 2.632640247301673e-05, + "loss": 0.7686, + "step": 17165 + }, + { + "epoch": 3.0560897435897436, + "grad_norm": 0.9361758232116699, + "learning_rate": 2.6316938376704682e-05, + "loss": 0.728, + "step": 17166 + }, + { + "epoch": 3.0562678062678064, + "grad_norm": 0.9174705147743225, + "learning_rate": 2.630747572405916e-05, + "loss": 0.77, + "step": 17167 + }, + { + "epoch": 3.056445868945869, + "grad_norm": 1.0511798858642578, + "learning_rate": 2.6298014515265578e-05, + "loss": 0.7319, + "step": 17168 + }, + { + "epoch": 3.0566239316239314, + "grad_norm": 0.9990606307983398, + "learning_rate": 2.6288554750509286e-05, + "loss": 0.7417, + "step": 17169 + }, + { + "epoch": 3.056801994301994, + "grad_norm": 0.8228463530540466, + "learning_rate": 2.6279096429975648e-05, + "loss": 0.8001, + "step": 17170 + }, + { + "epoch": 3.056980056980057, + "grad_norm": 0.9636453986167908, + "learning_rate": 2.626963955384998e-05, + "loss": 0.7647, + "step": 17171 + }, + { + 
"epoch": 3.0571581196581197, + "grad_norm": 1.096347451210022, + "learning_rate": 2.6260184122317554e-05, + "loss": 0.816, + "step": 17172 + }, + { + "epoch": 3.0573361823361824, + "grad_norm": 0.8099706768989563, + "learning_rate": 2.6250730135563606e-05, + "loss": 0.6105, + "step": 17173 + }, + { + "epoch": 3.057514245014245, + "grad_norm": 0.8991966247558594, + "learning_rate": 2.6241277593773473e-05, + "loss": 0.6853, + "step": 17174 + }, + { + "epoch": 3.0576923076923075, + "grad_norm": 0.8311452865600586, + "learning_rate": 2.623182649713222e-05, + "loss": 0.7684, + "step": 17175 + }, + { + "epoch": 3.0578703703703702, + "grad_norm": 1.097398281097412, + "learning_rate": 2.6222376845825115e-05, + "loss": 1.0463, + "step": 17176 + }, + { + "epoch": 3.058048433048433, + "grad_norm": 0.9334904551506042, + "learning_rate": 2.621292864003727e-05, + "loss": 0.8499, + "step": 17177 + }, + { + "epoch": 3.0582264957264957, + "grad_norm": 0.912241518497467, + "learning_rate": 2.6203481879953783e-05, + "loss": 0.8186, + "step": 17178 + }, + { + "epoch": 3.0584045584045585, + "grad_norm": 0.8956477046012878, + "learning_rate": 2.619403656575984e-05, + "loss": 0.6561, + "step": 17179 + }, + { + "epoch": 3.0585826210826212, + "grad_norm": 0.997071385383606, + "learning_rate": 2.6184592697640352e-05, + "loss": 0.8774, + "step": 17180 + }, + { + "epoch": 3.0587606837606836, + "grad_norm": 0.8529218435287476, + "learning_rate": 2.617515027578048e-05, + "loss": 0.7653, + "step": 17181 + }, + { + "epoch": 3.0589387464387463, + "grad_norm": 0.8638035655021667, + "learning_rate": 2.616570930036517e-05, + "loss": 0.6703, + "step": 17182 + }, + { + "epoch": 3.059116809116809, + "grad_norm": 0.9718494415283203, + "learning_rate": 2.6156269771579412e-05, + "loss": 0.9117, + "step": 17183 + }, + { + "epoch": 3.059294871794872, + "grad_norm": 0.8079463243484497, + "learning_rate": 2.6146831689608154e-05, + "loss": 0.716, + "step": 17184 + }, + { + "epoch": 3.0594729344729346, + 
"grad_norm": 0.8149730563163757, + "learning_rate": 2.6137395054636303e-05, + "loss": 0.5388, + "step": 17185 + }, + { + "epoch": 3.0596509971509973, + "grad_norm": 0.8048275113105774, + "learning_rate": 2.6127959866848774e-05, + "loss": 0.6043, + "step": 17186 + }, + { + "epoch": 3.0598290598290596, + "grad_norm": 0.850706934928894, + "learning_rate": 2.611852612643041e-05, + "loss": 0.6969, + "step": 17187 + }, + { + "epoch": 3.0600071225071224, + "grad_norm": 0.8014549016952515, + "learning_rate": 2.6109093833566012e-05, + "loss": 0.6422, + "step": 17188 + }, + { + "epoch": 3.060185185185185, + "grad_norm": 0.9597519636154175, + "learning_rate": 2.60996629884405e-05, + "loss": 0.7547, + "step": 17189 + }, + { + "epoch": 3.060363247863248, + "grad_norm": 0.8303548693656921, + "learning_rate": 2.60902335912385e-05, + "loss": 0.6349, + "step": 17190 + }, + { + "epoch": 3.0605413105413106, + "grad_norm": 0.8855451345443726, + "learning_rate": 2.608080564214488e-05, + "loss": 0.8361, + "step": 17191 + }, + { + "epoch": 3.0607193732193734, + "grad_norm": 0.8511565327644348, + "learning_rate": 2.6071379141344322e-05, + "loss": 0.7693, + "step": 17192 + }, + { + "epoch": 3.0608974358974357, + "grad_norm": 1.0611032247543335, + "learning_rate": 2.6061954089021512e-05, + "loss": 0.821, + "step": 17193 + }, + { + "epoch": 3.0610754985754984, + "grad_norm": 0.8549980521202087, + "learning_rate": 2.6052530485361114e-05, + "loss": 0.7522, + "step": 17194 + }, + { + "epoch": 3.061253561253561, + "grad_norm": 1.0446410179138184, + "learning_rate": 2.604310833054777e-05, + "loss": 0.9727, + "step": 17195 + }, + { + "epoch": 3.061431623931624, + "grad_norm": 0.949283242225647, + "learning_rate": 2.603368762476609e-05, + "loss": 0.9997, + "step": 17196 + }, + { + "epoch": 3.0616096866096867, + "grad_norm": 0.8465662598609924, + "learning_rate": 2.602426836820062e-05, + "loss": 0.6856, + "step": 17197 + }, + { + "epoch": 3.0617877492877494, + "grad_norm": 0.8954669237136841, + 
"learning_rate": 2.6014850561035963e-05, + "loss": 0.8639, + "step": 17198 + }, + { + "epoch": 3.0619658119658117, + "grad_norm": 0.9493638277053833, + "learning_rate": 2.6005434203456626e-05, + "loss": 0.9289, + "step": 17199 + }, + { + "epoch": 3.0621438746438745, + "grad_norm": 0.9050379395484924, + "learning_rate": 2.599601929564709e-05, + "loss": 0.6737, + "step": 17200 + }, + { + "epoch": 3.0623219373219372, + "grad_norm": 1.0764466524124146, + "learning_rate": 2.5986605837791835e-05, + "loss": 0.9317, + "step": 17201 + }, + { + "epoch": 3.0625, + "grad_norm": 0.9864383339881897, + "learning_rate": 2.5977193830075287e-05, + "loss": 0.7932, + "step": 17202 + }, + { + "epoch": 3.0626780626780628, + "grad_norm": 0.9895427227020264, + "learning_rate": 2.5967783272681823e-05, + "loss": 0.8253, + "step": 17203 + }, + { + "epoch": 3.0628561253561255, + "grad_norm": 0.9177289009094238, + "learning_rate": 2.5958374165795906e-05, + "loss": 0.7333, + "step": 17204 + }, + { + "epoch": 3.0630341880341883, + "grad_norm": 1.088364601135254, + "learning_rate": 2.594896650960179e-05, + "loss": 0.9155, + "step": 17205 + }, + { + "epoch": 3.0632122507122506, + "grad_norm": 0.894473135471344, + "learning_rate": 2.5939560304283917e-05, + "loss": 0.8166, + "step": 17206 + }, + { + "epoch": 3.0633903133903133, + "grad_norm": 0.8343120813369751, + "learning_rate": 2.5930155550026436e-05, + "loss": 0.7034, + "step": 17207 + }, + { + "epoch": 3.063568376068376, + "grad_norm": 0.8967345356941223, + "learning_rate": 2.5920752247013737e-05, + "loss": 0.6673, + "step": 17208 + }, + { + "epoch": 3.063746438746439, + "grad_norm": 0.9521552324295044, + "learning_rate": 2.591135039543e-05, + "loss": 0.7148, + "step": 17209 + }, + { + "epoch": 3.0639245014245016, + "grad_norm": 0.7986984252929688, + "learning_rate": 2.590194999545944e-05, + "loss": 0.7299, + "step": 17210 + }, + { + "epoch": 3.064102564102564, + "grad_norm": 0.9818909168243408, + "learning_rate": 2.589255104728626e-05, + 
"loss": 0.7047, + "step": 17211 + }, + { + "epoch": 3.0642806267806266, + "grad_norm": 0.9633432030677795, + "learning_rate": 2.5883153551094585e-05, + "loss": 0.6901, + "step": 17212 + }, + { + "epoch": 3.0644586894586894, + "grad_norm": 0.9325671792030334, + "learning_rate": 2.587375750706853e-05, + "loss": 0.8643, + "step": 17213 + }, + { + "epoch": 3.064636752136752, + "grad_norm": 0.9931867122650146, + "learning_rate": 2.5864362915392272e-05, + "loss": 0.7626, + "step": 17214 + }, + { + "epoch": 3.064814814814815, + "grad_norm": 0.8451269268989563, + "learning_rate": 2.585496977624975e-05, + "loss": 0.8091, + "step": 17215 + }, + { + "epoch": 3.0649928774928776, + "grad_norm": 0.8387511372566223, + "learning_rate": 2.5845578089825105e-05, + "loss": 0.6729, + "step": 17216 + }, + { + "epoch": 3.0651709401709404, + "grad_norm": 0.9810523986816406, + "learning_rate": 2.583618785630233e-05, + "loss": 0.8085, + "step": 17217 + }, + { + "epoch": 3.0653490028490027, + "grad_norm": 0.834884524345398, + "learning_rate": 2.5826799075865372e-05, + "loss": 0.6366, + "step": 17218 + }, + { + "epoch": 3.0655270655270654, + "grad_norm": 1.0484609603881836, + "learning_rate": 2.5817411748698217e-05, + "loss": 0.7411, + "step": 17219 + }, + { + "epoch": 3.065705128205128, + "grad_norm": 0.8887215852737427, + "learning_rate": 2.5808025874984742e-05, + "loss": 0.7981, + "step": 17220 + }, + { + "epoch": 3.065883190883191, + "grad_norm": 0.8052788972854614, + "learning_rate": 2.5798641454908944e-05, + "loss": 0.5341, + "step": 17221 + }, + { + "epoch": 3.0660612535612537, + "grad_norm": 1.0397300720214844, + "learning_rate": 2.5789258488654567e-05, + "loss": 0.8031, + "step": 17222 + }, + { + "epoch": 3.0662393162393164, + "grad_norm": 0.9165951609611511, + "learning_rate": 2.5779876976405535e-05, + "loss": 0.8193, + "step": 17223 + }, + { + "epoch": 3.0664173789173788, + "grad_norm": 0.8900893330574036, + "learning_rate": 2.5770496918345633e-05, + "loss": 0.7372, + "step": 17224 
+ }, + { + "epoch": 3.0665954415954415, + "grad_norm": 0.8510205149650574, + "learning_rate": 2.576111831465865e-05, + "loss": 0.8246, + "step": 17225 + }, + { + "epoch": 3.0667735042735043, + "grad_norm": 0.837579071521759, + "learning_rate": 2.5751741165528342e-05, + "loss": 0.9221, + "step": 17226 + }, + { + "epoch": 3.066951566951567, + "grad_norm": 0.911714494228363, + "learning_rate": 2.5742365471138418e-05, + "loss": 0.878, + "step": 17227 + }, + { + "epoch": 3.0671296296296298, + "grad_norm": 0.8773578405380249, + "learning_rate": 2.5732991231672565e-05, + "loss": 0.75, + "step": 17228 + }, + { + "epoch": 3.0673076923076925, + "grad_norm": 0.7727368474006653, + "learning_rate": 2.5723618447314523e-05, + "loss": 0.6015, + "step": 17229 + }, + { + "epoch": 3.067485754985755, + "grad_norm": 0.7762733101844788, + "learning_rate": 2.5714247118247826e-05, + "loss": 0.502, + "step": 17230 + }, + { + "epoch": 3.0676638176638176, + "grad_norm": 1.0052740573883057, + "learning_rate": 2.57048772446562e-05, + "loss": 0.8727, + "step": 17231 + }, + { + "epoch": 3.0678418803418803, + "grad_norm": 0.8907061219215393, + "learning_rate": 2.5695508826723113e-05, + "loss": 0.7661, + "step": 17232 + }, + { + "epoch": 3.068019943019943, + "grad_norm": 1.0816287994384766, + "learning_rate": 2.5686141864632208e-05, + "loss": 0.7263, + "step": 17233 + }, + { + "epoch": 3.068198005698006, + "grad_norm": 0.7547267079353333, + "learning_rate": 2.567677635856698e-05, + "loss": 0.6671, + "step": 17234 + }, + { + "epoch": 3.0683760683760686, + "grad_norm": 0.9319064021110535, + "learning_rate": 2.5667412308710916e-05, + "loss": 0.816, + "step": 17235 + }, + { + "epoch": 3.068554131054131, + "grad_norm": 0.8986789584159851, + "learning_rate": 2.56580497152475e-05, + "loss": 0.657, + "step": 17236 + }, + { + "epoch": 3.0687321937321936, + "grad_norm": 0.9154072999954224, + "learning_rate": 2.5648688578360135e-05, + "loss": 0.8183, + "step": 17237 + }, + { + "epoch": 3.0689102564102564, + 
"grad_norm": 0.8452461361885071, + "learning_rate": 2.5639328898232308e-05, + "loss": 0.8065, + "step": 17238 + }, + { + "epoch": 3.069088319088319, + "grad_norm": 0.9826174974441528, + "learning_rate": 2.5629970675047354e-05, + "loss": 0.9159, + "step": 17239 + }, + { + "epoch": 3.069266381766382, + "grad_norm": 0.7786675691604614, + "learning_rate": 2.5620613908988632e-05, + "loss": 0.5292, + "step": 17240 + }, + { + "epoch": 3.0694444444444446, + "grad_norm": 0.9880074858665466, + "learning_rate": 2.5611258600239464e-05, + "loss": 0.6227, + "step": 17241 + }, + { + "epoch": 3.069622507122507, + "grad_norm": 0.8934128880500793, + "learning_rate": 2.560190474898316e-05, + "loss": 0.7228, + "step": 17242 + }, + { + "epoch": 3.0698005698005697, + "grad_norm": 0.9464970231056213, + "learning_rate": 2.5592552355402988e-05, + "loss": 0.8885, + "step": 17243 + }, + { + "epoch": 3.0699786324786325, + "grad_norm": 1.071493148803711, + "learning_rate": 2.558320141968219e-05, + "loss": 0.9354, + "step": 17244 + }, + { + "epoch": 3.070156695156695, + "grad_norm": 0.9068328142166138, + "learning_rate": 2.5573851942003933e-05, + "loss": 0.7176, + "step": 17245 + }, + { + "epoch": 3.070334757834758, + "grad_norm": 0.7818723320960999, + "learning_rate": 2.55645039225515e-05, + "loss": 0.6526, + "step": 17246 + }, + { + "epoch": 3.0705128205128207, + "grad_norm": 1.052809476852417, + "learning_rate": 2.555515736150793e-05, + "loss": 0.7552, + "step": 17247 + }, + { + "epoch": 3.070690883190883, + "grad_norm": 0.8437279462814331, + "learning_rate": 2.5545812259056433e-05, + "loss": 0.7114, + "step": 17248 + }, + { + "epoch": 3.0708689458689458, + "grad_norm": 1.0028997659683228, + "learning_rate": 2.5536468615380083e-05, + "loss": 0.8291, + "step": 17249 + }, + { + "epoch": 3.0710470085470085, + "grad_norm": 0.9631550908088684, + "learning_rate": 2.552712643066194e-05, + "loss": 1.0165, + "step": 17250 + }, + { + "epoch": 3.0712250712250713, + "grad_norm": 0.8871008157730103, + 
"learning_rate": 2.5517785705085052e-05, + "loss": 0.6304, + "step": 17251 + }, + { + "epoch": 3.071403133903134, + "grad_norm": 0.8630111813545227, + "learning_rate": 2.5508446438832432e-05, + "loss": 0.7626, + "step": 17252 + }, + { + "epoch": 3.0715811965811968, + "grad_norm": 0.9306061267852783, + "learning_rate": 2.549910863208703e-05, + "loss": 0.8901, + "step": 17253 + }, + { + "epoch": 3.071759259259259, + "grad_norm": 0.9966697692871094, + "learning_rate": 2.5489772285031897e-05, + "loss": 0.7751, + "step": 17254 + }, + { + "epoch": 3.071937321937322, + "grad_norm": 0.9372161030769348, + "learning_rate": 2.5480437397849832e-05, + "loss": 0.866, + "step": 17255 + }, + { + "epoch": 3.0721153846153846, + "grad_norm": 0.9376949667930603, + "learning_rate": 2.547110397072382e-05, + "loss": 0.6556, + "step": 17256 + }, + { + "epoch": 3.0722934472934473, + "grad_norm": 1.0755635499954224, + "learning_rate": 2.5461772003836714e-05, + "loss": 0.8842, + "step": 17257 + }, + { + "epoch": 3.07247150997151, + "grad_norm": 0.8353457450866699, + "learning_rate": 2.545244149737134e-05, + "loss": 0.7091, + "step": 17258 + }, + { + "epoch": 3.072649572649573, + "grad_norm": 0.9944921135902405, + "learning_rate": 2.544311245151051e-05, + "loss": 0.6709, + "step": 17259 + }, + { + "epoch": 3.072827635327635, + "grad_norm": 0.8432626128196716, + "learning_rate": 2.5433784866436995e-05, + "loss": 0.7281, + "step": 17260 + }, + { + "epoch": 3.073005698005698, + "grad_norm": 0.8600237965583801, + "learning_rate": 2.5424458742333622e-05, + "loss": 0.8318, + "step": 17261 + }, + { + "epoch": 3.0731837606837606, + "grad_norm": 1.0386722087860107, + "learning_rate": 2.5415134079383006e-05, + "loss": 0.784, + "step": 17262 + }, + { + "epoch": 3.0733618233618234, + "grad_norm": 0.9741103649139404, + "learning_rate": 2.540581087776792e-05, + "loss": 0.824, + "step": 17263 + }, + { + "epoch": 3.073539886039886, + "grad_norm": 0.85637366771698, + "learning_rate": 2.5396489137671033e-05, + 
"loss": 0.6022, + "step": 17264 + }, + { + "epoch": 3.073717948717949, + "grad_norm": 0.9319208264350891, + "learning_rate": 2.5387168859274947e-05, + "loss": 0.7912, + "step": 17265 + }, + { + "epoch": 3.073896011396011, + "grad_norm": 0.9643087387084961, + "learning_rate": 2.5377850042762295e-05, + "loss": 0.8833, + "step": 17266 + }, + { + "epoch": 3.074074074074074, + "grad_norm": 0.860395610332489, + "learning_rate": 2.536853268831567e-05, + "loss": 0.6679, + "step": 17267 + }, + { + "epoch": 3.0742521367521367, + "grad_norm": 0.8932734727859497, + "learning_rate": 2.5359216796117603e-05, + "loss": 0.6577, + "step": 17268 + }, + { + "epoch": 3.0744301994301995, + "grad_norm": 0.9940623641014099, + "learning_rate": 2.534990236635063e-05, + "loss": 0.7469, + "step": 17269 + }, + { + "epoch": 3.074608262108262, + "grad_norm": 0.7407364845275879, + "learning_rate": 2.5340589399197222e-05, + "loss": 0.6288, + "step": 17270 + }, + { + "epoch": 3.074786324786325, + "grad_norm": 0.9189527034759521, + "learning_rate": 2.5331277894839934e-05, + "loss": 0.5919, + "step": 17271 + }, + { + "epoch": 3.0749643874643873, + "grad_norm": 0.8940016031265259, + "learning_rate": 2.5321967853461093e-05, + "loss": 0.8846, + "step": 17272 + }, + { + "epoch": 3.07514245014245, + "grad_norm": 0.9263308048248291, + "learning_rate": 2.5312659275243177e-05, + "loss": 0.7543, + "step": 17273 + }, + { + "epoch": 3.0753205128205128, + "grad_norm": 0.9368407726287842, + "learning_rate": 2.5303352160368556e-05, + "loss": 0.7127, + "step": 17274 + }, + { + "epoch": 3.0754985754985755, + "grad_norm": 0.8408545255661011, + "learning_rate": 2.5294046509019586e-05, + "loss": 0.8141, + "step": 17275 + }, + { + "epoch": 3.0756766381766383, + "grad_norm": 1.1111087799072266, + "learning_rate": 2.5284742321378585e-05, + "loss": 0.6377, + "step": 17276 + }, + { + "epoch": 3.075854700854701, + "grad_norm": 0.879222571849823, + "learning_rate": 2.5275439597627815e-05, + "loss": 0.6398, + "step": 17277 + 
}, + { + "epoch": 3.0760327635327633, + "grad_norm": 0.8783233165740967, + "learning_rate": 2.526613833794964e-05, + "loss": 0.7461, + "step": 17278 + }, + { + "epoch": 3.076210826210826, + "grad_norm": 0.9898136258125305, + "learning_rate": 2.5256838542526184e-05, + "loss": 0.724, + "step": 17279 + }, + { + "epoch": 3.076388888888889, + "grad_norm": 1.0655652284622192, + "learning_rate": 2.5247540211539744e-05, + "loss": 0.9039, + "step": 17280 + }, + { + "epoch": 3.0765669515669516, + "grad_norm": 0.952908992767334, + "learning_rate": 2.5238243345172464e-05, + "loss": 0.7654, + "step": 17281 + }, + { + "epoch": 3.0767450142450143, + "grad_norm": 1.056127667427063, + "learning_rate": 2.5228947943606494e-05, + "loss": 0.7084, + "step": 17282 + }, + { + "epoch": 3.076923076923077, + "grad_norm": 1.0711488723754883, + "learning_rate": 2.5219654007023973e-05, + "loss": 0.8788, + "step": 17283 + }, + { + "epoch": 3.0771011396011394, + "grad_norm": 0.9237586259841919, + "learning_rate": 2.5210361535606985e-05, + "loss": 0.7423, + "step": 17284 + }, + { + "epoch": 3.077279202279202, + "grad_norm": 0.8712356686592102, + "learning_rate": 2.5201070529537585e-05, + "loss": 0.7332, + "step": 17285 + }, + { + "epoch": 3.077457264957265, + "grad_norm": 0.9970635175704956, + "learning_rate": 2.5191780988997872e-05, + "loss": 0.8417, + "step": 17286 + }, + { + "epoch": 3.0776353276353277, + "grad_norm": 0.7353211045265198, + "learning_rate": 2.5182492914169754e-05, + "loss": 0.5351, + "step": 17287 + }, + { + "epoch": 3.0778133903133904, + "grad_norm": 0.8101412653923035, + "learning_rate": 2.5173206305235297e-05, + "loss": 0.674, + "step": 17288 + }, + { + "epoch": 3.077991452991453, + "grad_norm": 1.0728546380996704, + "learning_rate": 2.5163921162376425e-05, + "loss": 0.7514, + "step": 17289 + }, + { + "epoch": 3.0781695156695155, + "grad_norm": 1.05673086643219, + "learning_rate": 2.5154637485775058e-05, + "loss": 0.8012, + "step": 17290 + }, + { + "epoch": 3.078347578347578, 
+ "grad_norm": 0.8431952595710754, + "learning_rate": 2.514535527561309e-05, + "loss": 0.6435, + "step": 17291 + }, + { + "epoch": 3.078525641025641, + "grad_norm": 0.9286113977432251, + "learning_rate": 2.5136074532072386e-05, + "loss": 0.8412, + "step": 17292 + }, + { + "epoch": 3.0787037037037037, + "grad_norm": 0.9323510527610779, + "learning_rate": 2.512679525533479e-05, + "loss": 0.7716, + "step": 17293 + }, + { + "epoch": 3.0788817663817665, + "grad_norm": 0.9141219854354858, + "learning_rate": 2.5117517445582107e-05, + "loss": 0.7361, + "step": 17294 + }, + { + "epoch": 3.0790598290598292, + "grad_norm": 0.8749071359634399, + "learning_rate": 2.5108241102996076e-05, + "loss": 0.6472, + "step": 17295 + }, + { + "epoch": 3.0792378917378915, + "grad_norm": 0.8831200003623962, + "learning_rate": 2.5098966227758525e-05, + "loss": 0.6902, + "step": 17296 + }, + { + "epoch": 3.0794159544159543, + "grad_norm": 0.8700500130653381, + "learning_rate": 2.5089692820051136e-05, + "loss": 0.6671, + "step": 17297 + }, + { + "epoch": 3.079594017094017, + "grad_norm": 0.865032970905304, + "learning_rate": 2.50804208800556e-05, + "loss": 0.5883, + "step": 17298 + }, + { + "epoch": 3.07977207977208, + "grad_norm": 1.0697903633117676, + "learning_rate": 2.5071150407953593e-05, + "loss": 0.6554, + "step": 17299 + }, + { + "epoch": 3.0799501424501425, + "grad_norm": 1.1248756647109985, + "learning_rate": 2.50618814039267e-05, + "loss": 0.7568, + "step": 17300 + }, + { + "epoch": 3.0801282051282053, + "grad_norm": 0.8852017521858215, + "learning_rate": 2.5052613868156638e-05, + "loss": 0.7013, + "step": 17301 + }, + { + "epoch": 3.0803062678062676, + "grad_norm": 0.8573511838912964, + "learning_rate": 2.5043347800824857e-05, + "loss": 0.7584, + "step": 17302 + }, + { + "epoch": 3.0804843304843303, + "grad_norm": 0.8905999660491943, + "learning_rate": 2.5034083202113034e-05, + "loss": 0.6698, + "step": 17303 + }, + { + "epoch": 3.080662393162393, + "grad_norm": 0.8821704387664795, 
+ "learning_rate": 2.5024820072202548e-05, + "loss": 0.6776, + "step": 17304 + }, + { + "epoch": 3.080840455840456, + "grad_norm": 0.8579546809196472, + "learning_rate": 2.5015558411275008e-05, + "loss": 0.7955, + "step": 17305 + }, + { + "epoch": 3.0810185185185186, + "grad_norm": 0.8955085277557373, + "learning_rate": 2.5006298219511827e-05, + "loss": 0.6944, + "step": 17306 + }, + { + "epoch": 3.0811965811965814, + "grad_norm": 1.0479052066802979, + "learning_rate": 2.4997039497094453e-05, + "loss": 0.8585, + "step": 17307 + }, + { + "epoch": 3.0813746438746437, + "grad_norm": 0.8598275780677795, + "learning_rate": 2.4987782244204273e-05, + "loss": 0.6137, + "step": 17308 + }, + { + "epoch": 3.0815527065527064, + "grad_norm": 0.8625657558441162, + "learning_rate": 2.497852646102269e-05, + "loss": 0.7195, + "step": 17309 + }, + { + "epoch": 3.081730769230769, + "grad_norm": 0.945574939250946, + "learning_rate": 2.4969272147730994e-05, + "loss": 0.821, + "step": 17310 + }, + { + "epoch": 3.081908831908832, + "grad_norm": 0.7945711612701416, + "learning_rate": 2.4960019304510618e-05, + "loss": 0.5694, + "step": 17311 + }, + { + "epoch": 3.0820868945868947, + "grad_norm": 0.9468835592269897, + "learning_rate": 2.4950767931542717e-05, + "loss": 0.8095, + "step": 17312 + }, + { + "epoch": 3.0822649572649574, + "grad_norm": 0.9362178444862366, + "learning_rate": 2.494151802900867e-05, + "loss": 0.6649, + "step": 17313 + }, + { + "epoch": 3.08244301994302, + "grad_norm": 0.9418696761131287, + "learning_rate": 2.493226959708961e-05, + "loss": 0.9316, + "step": 17314 + }, + { + "epoch": 3.0826210826210825, + "grad_norm": 0.9696083664894104, + "learning_rate": 2.492302263596681e-05, + "loss": 0.6348, + "step": 17315 + }, + { + "epoch": 3.0827991452991452, + "grad_norm": 0.9243941903114319, + "learning_rate": 2.4913777145821427e-05, + "loss": 0.8841, + "step": 17316 + }, + { + "epoch": 3.082977207977208, + "grad_norm": 0.89748615026474, + "learning_rate": 
2.4904533126834573e-05, + "loss": 0.6133, + "step": 17317 + }, + { + "epoch": 3.0831552706552707, + "grad_norm": 0.9694730043411255, + "learning_rate": 2.4895290579187446e-05, + "loss": 0.7748, + "step": 17318 + }, + { + "epoch": 3.0833333333333335, + "grad_norm": 0.8850114345550537, + "learning_rate": 2.4886049503061025e-05, + "loss": 0.6369, + "step": 17319 + }, + { + "epoch": 3.083511396011396, + "grad_norm": 0.7270629405975342, + "learning_rate": 2.4876809898636464e-05, + "loss": 0.5965, + "step": 17320 + }, + { + "epoch": 3.0836894586894585, + "grad_norm": 0.8992782831192017, + "learning_rate": 2.4867571766094764e-05, + "loss": 0.768, + "step": 17321 + }, + { + "epoch": 3.0838675213675213, + "grad_norm": 0.8755559921264648, + "learning_rate": 2.4858335105616916e-05, + "loss": 0.7047, + "step": 17322 + }, + { + "epoch": 3.084045584045584, + "grad_norm": 1.1143306493759155, + "learning_rate": 2.4849099917383888e-05, + "loss": 0.7245, + "step": 17323 + }, + { + "epoch": 3.084223646723647, + "grad_norm": 0.9730618000030518, + "learning_rate": 2.4839866201576646e-05, + "loss": 0.8453, + "step": 17324 + }, + { + "epoch": 3.0844017094017095, + "grad_norm": 0.8983954191207886, + "learning_rate": 2.483063395837606e-05, + "loss": 0.6329, + "step": 17325 + }, + { + "epoch": 3.0845797720797723, + "grad_norm": 0.9837406277656555, + "learning_rate": 2.482140318796311e-05, + "loss": 0.6922, + "step": 17326 + }, + { + "epoch": 3.0847578347578346, + "grad_norm": 0.9490154385566711, + "learning_rate": 2.4812173890518544e-05, + "loss": 0.9131, + "step": 17327 + }, + { + "epoch": 3.0849358974358974, + "grad_norm": 0.7831540107727051, + "learning_rate": 2.4802946066223287e-05, + "loss": 0.6089, + "step": 17328 + }, + { + "epoch": 3.08511396011396, + "grad_norm": 1.0531748533248901, + "learning_rate": 2.4793719715258044e-05, + "loss": 0.9125, + "step": 17329 + }, + { + "epoch": 3.085292022792023, + "grad_norm": 0.9238786697387695, + "learning_rate": 2.4784494837803663e-05, + 
"loss": 0.6231, + "step": 17330 + }, + { + "epoch": 3.0854700854700856, + "grad_norm": 0.826562762260437, + "learning_rate": 2.477527143404086e-05, + "loss": 0.8999, + "step": 17331 + }, + { + "epoch": 3.0856481481481484, + "grad_norm": 1.0574134588241577, + "learning_rate": 2.4766049504150335e-05, + "loss": 0.6379, + "step": 17332 + }, + { + "epoch": 3.0858262108262107, + "grad_norm": 0.8981440663337708, + "learning_rate": 2.47568290483128e-05, + "loss": 0.8848, + "step": 17333 + }, + { + "epoch": 3.0860042735042734, + "grad_norm": 0.8472706079483032, + "learning_rate": 2.4747610066708894e-05, + "loss": 0.886, + "step": 17334 + }, + { + "epoch": 3.086182336182336, + "grad_norm": 0.7722052335739136, + "learning_rate": 2.473839255951921e-05, + "loss": 0.5804, + "step": 17335 + }, + { + "epoch": 3.086360398860399, + "grad_norm": 1.0919874906539917, + "learning_rate": 2.4729176526924412e-05, + "loss": 0.8474, + "step": 17336 + }, + { + "epoch": 3.0865384615384617, + "grad_norm": 0.8331817388534546, + "learning_rate": 2.4719961969105042e-05, + "loss": 0.7727, + "step": 17337 + }, + { + "epoch": 3.0867165242165244, + "grad_norm": 0.7660060524940491, + "learning_rate": 2.471074888624163e-05, + "loss": 0.6989, + "step": 17338 + }, + { + "epoch": 3.0868945868945867, + "grad_norm": 1.0130395889282227, + "learning_rate": 2.4701537278514708e-05, + "loss": 0.6055, + "step": 17339 + }, + { + "epoch": 3.0870726495726495, + "grad_norm": 0.9906135201454163, + "learning_rate": 2.469232714610473e-05, + "loss": 0.7473, + "step": 17340 + }, + { + "epoch": 3.0872507122507122, + "grad_norm": 0.8437367677688599, + "learning_rate": 2.468311848919217e-05, + "loss": 0.5901, + "step": 17341 + }, + { + "epoch": 3.087428774928775, + "grad_norm": 0.9725748300552368, + "learning_rate": 2.467391130795741e-05, + "loss": 0.9002, + "step": 17342 + }, + { + "epoch": 3.0876068376068377, + "grad_norm": 0.8603026270866394, + "learning_rate": 2.466470560258094e-05, + "loss": 0.7682, + "step": 17343 + }, 
+ { + "epoch": 3.0877849002849005, + "grad_norm": 0.9285373687744141, + "learning_rate": 2.4655501373243016e-05, + "loss": 0.8618, + "step": 17344 + }, + { + "epoch": 3.087962962962963, + "grad_norm": 1.0237677097320557, + "learning_rate": 2.4646298620124065e-05, + "loss": 0.9456, + "step": 17345 + }, + { + "epoch": 3.0881410256410255, + "grad_norm": 0.9248231649398804, + "learning_rate": 2.463709734340435e-05, + "loss": 0.5566, + "step": 17346 + }, + { + "epoch": 3.0883190883190883, + "grad_norm": 0.9666411280632019, + "learning_rate": 2.4627897543264154e-05, + "loss": 0.7949, + "step": 17347 + }, + { + "epoch": 3.088497150997151, + "grad_norm": 0.9193452000617981, + "learning_rate": 2.4618699219883735e-05, + "loss": 0.7006, + "step": 17348 + }, + { + "epoch": 3.088675213675214, + "grad_norm": 1.0161619186401367, + "learning_rate": 2.4609502373443316e-05, + "loss": 0.7966, + "step": 17349 + }, + { + "epoch": 3.0888532763532766, + "grad_norm": 0.9324184656143188, + "learning_rate": 2.460030700412309e-05, + "loss": 0.78, + "step": 17350 + }, + { + "epoch": 3.089031339031339, + "grad_norm": 0.9298717379570007, + "learning_rate": 2.4591113112103216e-05, + "loss": 0.8061, + "step": 17351 + }, + { + "epoch": 3.0892094017094016, + "grad_norm": 0.9238819479942322, + "learning_rate": 2.458192069756381e-05, + "loss": 0.8955, + "step": 17352 + }, + { + "epoch": 3.0893874643874644, + "grad_norm": 0.8168941736221313, + "learning_rate": 2.4572729760685052e-05, + "loss": 0.8462, + "step": 17353 + }, + { + "epoch": 3.089565527065527, + "grad_norm": 1.0559333562850952, + "learning_rate": 2.4563540301646914e-05, + "loss": 0.8891, + "step": 17354 + }, + { + "epoch": 3.08974358974359, + "grad_norm": 0.9265807271003723, + "learning_rate": 2.4554352320629525e-05, + "loss": 0.6554, + "step": 17355 + }, + { + "epoch": 3.0899216524216526, + "grad_norm": 0.9348501563072205, + "learning_rate": 2.4545165817812876e-05, + "loss": 0.6963, + "step": 17356 + }, + { + "epoch": 3.090099715099715, + 
"grad_norm": 0.942007839679718, + "learning_rate": 2.4535980793376922e-05, + "loss": 0.8307, + "step": 17357 + }, + { + "epoch": 3.0902777777777777, + "grad_norm": 1.119027018547058, + "learning_rate": 2.4526797247501732e-05, + "loss": 0.8015, + "step": 17358 + }, + { + "epoch": 3.0904558404558404, + "grad_norm": 0.9317795634269714, + "learning_rate": 2.4517615180367103e-05, + "loss": 0.6364, + "step": 17359 + }, + { + "epoch": 3.090633903133903, + "grad_norm": 1.03551185131073, + "learning_rate": 2.450843459215304e-05, + "loss": 0.8805, + "step": 17360 + }, + { + "epoch": 3.090811965811966, + "grad_norm": 0.8145185708999634, + "learning_rate": 2.4499255483039374e-05, + "loss": 0.5424, + "step": 17361 + }, + { + "epoch": 3.0909900284900287, + "grad_norm": 0.8325085639953613, + "learning_rate": 2.4490077853205962e-05, + "loss": 0.8728, + "step": 17362 + }, + { + "epoch": 3.091168091168091, + "grad_norm": 0.93165522813797, + "learning_rate": 2.4480901702832616e-05, + "loss": 0.7658, + "step": 17363 + }, + { + "epoch": 3.0913461538461537, + "grad_norm": 1.077810287475586, + "learning_rate": 2.4471727032099122e-05, + "loss": 0.909, + "step": 17364 + }, + { + "epoch": 3.0915242165242165, + "grad_norm": 0.9249957203865051, + "learning_rate": 2.4462553841185242e-05, + "loss": 0.683, + "step": 17365 + }, + { + "epoch": 3.0917022792022792, + "grad_norm": 0.9917559623718262, + "learning_rate": 2.4453382130270695e-05, + "loss": 0.7331, + "step": 17366 + }, + { + "epoch": 3.091880341880342, + "grad_norm": 0.8398951292037964, + "learning_rate": 2.4444211899535175e-05, + "loss": 0.8472, + "step": 17367 + }, + { + "epoch": 3.0920584045584047, + "grad_norm": 0.9382245540618896, + "learning_rate": 2.443504314915842e-05, + "loss": 0.9592, + "step": 17368 + }, + { + "epoch": 3.092236467236467, + "grad_norm": 0.9549033641815186, + "learning_rate": 2.442587587931997e-05, + "loss": 0.8117, + "step": 17369 + }, + { + "epoch": 3.09241452991453, + "grad_norm": 0.8004591464996338, + 
"learning_rate": 2.4416710090199512e-05, + "loss": 0.6495, + "step": 17370 + }, + { + "epoch": 3.0925925925925926, + "grad_norm": 0.8601853251457214, + "learning_rate": 2.4407545781976615e-05, + "loss": 0.6949, + "step": 17371 + }, + { + "epoch": 3.0927706552706553, + "grad_norm": 0.9627183079719543, + "learning_rate": 2.4398382954830823e-05, + "loss": 0.8233, + "step": 17372 + }, + { + "epoch": 3.092948717948718, + "grad_norm": 0.9439026117324829, + "learning_rate": 2.438922160894167e-05, + "loss": 0.7068, + "step": 17373 + }, + { + "epoch": 3.093126780626781, + "grad_norm": 1.1336002349853516, + "learning_rate": 2.438006174448866e-05, + "loss": 0.6664, + "step": 17374 + }, + { + "epoch": 3.093304843304843, + "grad_norm": 0.8688283562660217, + "learning_rate": 2.4370903361651243e-05, + "loss": 0.8014, + "step": 17375 + }, + { + "epoch": 3.093482905982906, + "grad_norm": 0.8023533821105957, + "learning_rate": 2.4361746460608836e-05, + "loss": 0.648, + "step": 17376 + }, + { + "epoch": 3.0936609686609686, + "grad_norm": 0.9410482048988342, + "learning_rate": 2.4352591041540918e-05, + "loss": 0.637, + "step": 17377 + }, + { + "epoch": 3.0938390313390314, + "grad_norm": 0.9243476390838623, + "learning_rate": 2.4343437104626844e-05, + "loss": 0.6791, + "step": 17378 + }, + { + "epoch": 3.094017094017094, + "grad_norm": 0.9308971762657166, + "learning_rate": 2.4334284650045947e-05, + "loss": 0.8286, + "step": 17379 + }, + { + "epoch": 3.094195156695157, + "grad_norm": 0.923256516456604, + "learning_rate": 2.4325133677977564e-05, + "loss": 0.6421, + "step": 17380 + }, + { + "epoch": 3.094373219373219, + "grad_norm": 0.9886062741279602, + "learning_rate": 2.431598418860098e-05, + "loss": 0.6816, + "step": 17381 + }, + { + "epoch": 3.094551282051282, + "grad_norm": 0.9735310673713684, + "learning_rate": 2.430683618209544e-05, + "loss": 0.7308, + "step": 17382 + }, + { + "epoch": 3.0947293447293447, + "grad_norm": 0.794840931892395, + "learning_rate": 
2.4297689658640266e-05, + "loss": 0.7037, + "step": 17383 + }, + { + "epoch": 3.0949074074074074, + "grad_norm": 0.9691025018692017, + "learning_rate": 2.4288544618414543e-05, + "loss": 0.7427, + "step": 17384 + }, + { + "epoch": 3.09508547008547, + "grad_norm": 0.8925696611404419, + "learning_rate": 2.4279401061597574e-05, + "loss": 0.6131, + "step": 17385 + }, + { + "epoch": 3.095263532763533, + "grad_norm": 0.9027791619300842, + "learning_rate": 2.4270258988368376e-05, + "loss": 0.8347, + "step": 17386 + }, + { + "epoch": 3.0954415954415953, + "grad_norm": 0.935856282711029, + "learning_rate": 2.426111839890618e-05, + "loss": 0.7177, + "step": 17387 + }, + { + "epoch": 3.095619658119658, + "grad_norm": 1.0325665473937988, + "learning_rate": 2.4251979293390024e-05, + "loss": 0.7809, + "step": 17388 + }, + { + "epoch": 3.0957977207977208, + "grad_norm": 0.9081370234489441, + "learning_rate": 2.4242841671998996e-05, + "loss": 0.7376, + "step": 17389 + }, + { + "epoch": 3.0959757834757835, + "grad_norm": 0.9406909942626953, + "learning_rate": 2.42337055349121e-05, + "loss": 0.8681, + "step": 17390 + }, + { + "epoch": 3.0961538461538463, + "grad_norm": 0.9385591149330139, + "learning_rate": 2.4224570882308363e-05, + "loss": 0.721, + "step": 17391 + }, + { + "epoch": 3.096331908831909, + "grad_norm": 0.9210137724876404, + "learning_rate": 2.4215437714366717e-05, + "loss": 0.787, + "step": 17392 + }, + { + "epoch": 3.0965099715099713, + "grad_norm": 1.039331316947937, + "learning_rate": 2.420630603126619e-05, + "loss": 0.8778, + "step": 17393 + }, + { + "epoch": 3.096688034188034, + "grad_norm": 0.9159904718399048, + "learning_rate": 2.4197175833185603e-05, + "loss": 0.7282, + "step": 17394 + }, + { + "epoch": 3.096866096866097, + "grad_norm": 0.9137975573539734, + "learning_rate": 2.4188047120303915e-05, + "loss": 0.8241, + "step": 17395 + }, + { + "epoch": 3.0970441595441596, + "grad_norm": 0.9383588433265686, + "learning_rate": 2.4178919892799955e-05, + "loss": 
0.7463, + "step": 17396 + }, + { + "epoch": 3.0972222222222223, + "grad_norm": 0.8108921051025391, + "learning_rate": 2.4169794150852553e-05, + "loss": 0.6284, + "step": 17397 + }, + { + "epoch": 3.097400284900285, + "grad_norm": 0.929779589176178, + "learning_rate": 2.416066989464052e-05, + "loss": 0.7413, + "step": 17398 + }, + { + "epoch": 3.0975783475783474, + "grad_norm": 0.8805593848228455, + "learning_rate": 2.4151547124342576e-05, + "loss": 0.6056, + "step": 17399 + }, + { + "epoch": 3.09775641025641, + "grad_norm": 0.8895778059959412, + "learning_rate": 2.4142425840137573e-05, + "loss": 0.7429, + "step": 17400 + }, + { + "epoch": 3.097934472934473, + "grad_norm": 0.8784146308898926, + "learning_rate": 2.4133306042204095e-05, + "loss": 0.6362, + "step": 17401 + }, + { + "epoch": 3.0981125356125356, + "grad_norm": 0.9039027690887451, + "learning_rate": 2.4124187730720917e-05, + "loss": 0.7523, + "step": 17402 + }, + { + "epoch": 3.0982905982905984, + "grad_norm": 1.021745204925537, + "learning_rate": 2.4115070905866653e-05, + "loss": 0.6444, + "step": 17403 + }, + { + "epoch": 3.098468660968661, + "grad_norm": 0.9708871245384216, + "learning_rate": 2.4105955567819937e-05, + "loss": 0.7391, + "step": 17404 + }, + { + "epoch": 3.0986467236467234, + "grad_norm": 0.8263231515884399, + "learning_rate": 2.4096841716759376e-05, + "loss": 0.7933, + "step": 17405 + }, + { + "epoch": 3.098824786324786, + "grad_norm": 0.9058948755264282, + "learning_rate": 2.408772935286352e-05, + "loss": 0.8139, + "step": 17406 + }, + { + "epoch": 3.099002849002849, + "grad_norm": 0.9710341691970825, + "learning_rate": 2.4078618476310888e-05, + "loss": 0.8429, + "step": 17407 + }, + { + "epoch": 3.0991809116809117, + "grad_norm": 0.8417289853096008, + "learning_rate": 2.4069509087280072e-05, + "loss": 0.6132, + "step": 17408 + }, + { + "epoch": 3.0993589743589745, + "grad_norm": 0.853482723236084, + "learning_rate": 2.406040118594943e-05, + "loss": 0.8565, + "step": 17409 + }, + { + 
"epoch": 3.099537037037037, + "grad_norm": 1.0863711833953857, + "learning_rate": 2.4051294772497536e-05, + "loss": 0.6898, + "step": 17410 + }, + { + "epoch": 3.0997150997150995, + "grad_norm": 0.9241563677787781, + "learning_rate": 2.4042189847102694e-05, + "loss": 0.8047, + "step": 17411 + }, + { + "epoch": 3.0998931623931623, + "grad_norm": 0.9109302759170532, + "learning_rate": 2.4033086409943384e-05, + "loss": 0.597, + "step": 17412 + }, + { + "epoch": 3.100071225071225, + "grad_norm": 0.8546639084815979, + "learning_rate": 2.4023984461197936e-05, + "loss": 0.6349, + "step": 17413 + }, + { + "epoch": 3.1002492877492878, + "grad_norm": 0.9716140627861023, + "learning_rate": 2.4014884001044692e-05, + "loss": 0.61, + "step": 17414 + }, + { + "epoch": 3.1004273504273505, + "grad_norm": 0.8704510927200317, + "learning_rate": 2.4005785029661944e-05, + "loss": 0.6373, + "step": 17415 + }, + { + "epoch": 3.1006054131054133, + "grad_norm": 0.933198869228363, + "learning_rate": 2.3996687547227948e-05, + "loss": 0.8694, + "step": 17416 + }, + { + "epoch": 3.1007834757834756, + "grad_norm": 0.9596254825592041, + "learning_rate": 2.3987591553920997e-05, + "loss": 0.9679, + "step": 17417 + }, + { + "epoch": 3.1009615384615383, + "grad_norm": 0.8634598255157471, + "learning_rate": 2.397849704991929e-05, + "loss": 0.7072, + "step": 17418 + }, + { + "epoch": 3.101139601139601, + "grad_norm": 0.9921422004699707, + "learning_rate": 2.3969404035401023e-05, + "loss": 0.8952, + "step": 17419 + }, + { + "epoch": 3.101317663817664, + "grad_norm": 0.9247313141822815, + "learning_rate": 2.3960312510544336e-05, + "loss": 0.856, + "step": 17420 + }, + { + "epoch": 3.1014957264957266, + "grad_norm": 0.8624486327171326, + "learning_rate": 2.395122247552737e-05, + "loss": 0.6163, + "step": 17421 + }, + { + "epoch": 3.1016737891737893, + "grad_norm": 0.9683988690376282, + "learning_rate": 2.394213393052822e-05, + "loss": 0.7084, + "step": 17422 + }, + { + "epoch": 3.1018518518518516, + 
"grad_norm": 0.8543943166732788, + "learning_rate": 2.3933046875724964e-05, + "loss": 0.6083, + "step": 17423 + }, + { + "epoch": 3.1020299145299144, + "grad_norm": 0.9302839040756226, + "learning_rate": 2.3923961311295596e-05, + "loss": 0.6842, + "step": 17424 + }, + { + "epoch": 3.102207977207977, + "grad_norm": 0.9772568345069885, + "learning_rate": 2.3914877237418244e-05, + "loss": 0.7136, + "step": 17425 + }, + { + "epoch": 3.10238603988604, + "grad_norm": 0.8349552750587463, + "learning_rate": 2.3905794654270753e-05, + "loss": 0.6465, + "step": 17426 + }, + { + "epoch": 3.1025641025641026, + "grad_norm": 0.9706963300704956, + "learning_rate": 2.3896713562031182e-05, + "loss": 0.8294, + "step": 17427 + }, + { + "epoch": 3.1027421652421654, + "grad_norm": 0.9962044954299927, + "learning_rate": 2.388763396087741e-05, + "loss": 0.7717, + "step": 17428 + }, + { + "epoch": 3.1029202279202277, + "grad_norm": 1.0264992713928223, + "learning_rate": 2.387855585098734e-05, + "loss": 0.7093, + "step": 17429 + }, + { + "epoch": 3.1030982905982905, + "grad_norm": 0.8996137976646423, + "learning_rate": 2.386947923253884e-05, + "loss": 0.7551, + "step": 17430 + }, + { + "epoch": 3.103276353276353, + "grad_norm": 0.8956265449523926, + "learning_rate": 2.3860404105709756e-05, + "loss": 0.7528, + "step": 17431 + }, + { + "epoch": 3.103454415954416, + "grad_norm": 0.8972591757774353, + "learning_rate": 2.3851330470677847e-05, + "loss": 0.9049, + "step": 17432 + }, + { + "epoch": 3.1036324786324787, + "grad_norm": 0.9835371375083923, + "learning_rate": 2.3842258327620993e-05, + "loss": 0.679, + "step": 17433 + }, + { + "epoch": 3.1038105413105415, + "grad_norm": 0.9487658739089966, + "learning_rate": 2.383318767671683e-05, + "loss": 0.626, + "step": 17434 + }, + { + "epoch": 3.103988603988604, + "grad_norm": 0.9638952016830444, + "learning_rate": 2.3824118518143156e-05, + "loss": 0.8995, + "step": 17435 + }, + { + "epoch": 3.1041666666666665, + "grad_norm": 1.085768699645996, + 
"learning_rate": 2.3815050852077637e-05, + "loss": 0.5734, + "step": 17436 + }, + { + "epoch": 3.1043447293447293, + "grad_norm": 0.9774207472801208, + "learning_rate": 2.3805984678697934e-05, + "loss": 0.9972, + "step": 17437 + }, + { + "epoch": 3.104522792022792, + "grad_norm": 0.8640549182891846, + "learning_rate": 2.379691999818169e-05, + "loss": 0.6142, + "step": 17438 + }, + { + "epoch": 3.1047008547008548, + "grad_norm": 0.8625344634056091, + "learning_rate": 2.3787856810706465e-05, + "loss": 0.8086, + "step": 17439 + }, + { + "epoch": 3.1048789173789175, + "grad_norm": 1.0197455883026123, + "learning_rate": 2.3778795116449937e-05, + "loss": 0.7878, + "step": 17440 + }, + { + "epoch": 3.10505698005698, + "grad_norm": 0.9929977059364319, + "learning_rate": 2.3769734915589514e-05, + "loss": 0.834, + "step": 17441 + }, + { + "epoch": 3.1052350427350426, + "grad_norm": 0.9360977411270142, + "learning_rate": 2.3760676208302812e-05, + "loss": 0.6217, + "step": 17442 + }, + { + "epoch": 3.1054131054131053, + "grad_norm": 0.9955264925956726, + "learning_rate": 2.3751618994767288e-05, + "loss": 0.8329, + "step": 17443 + }, + { + "epoch": 3.105591168091168, + "grad_norm": 0.7187397480010986, + "learning_rate": 2.3742563275160402e-05, + "loss": 0.5394, + "step": 17444 + }, + { + "epoch": 3.105769230769231, + "grad_norm": 0.8444333076477051, + "learning_rate": 2.3733509049659563e-05, + "loss": 0.7639, + "step": 17445 + }, + { + "epoch": 3.1059472934472936, + "grad_norm": 0.7043599486351013, + "learning_rate": 2.3724456318442202e-05, + "loss": 0.4893, + "step": 17446 + }, + { + "epoch": 3.1061253561253563, + "grad_norm": 1.1488189697265625, + "learning_rate": 2.371540508168566e-05, + "loss": 0.6633, + "step": 17447 + }, + { + "epoch": 3.1063034188034186, + "grad_norm": 0.9220834374427795, + "learning_rate": 2.3706355339567286e-05, + "loss": 0.7883, + "step": 17448 + }, + { + "epoch": 3.1064814814814814, + "grad_norm": 0.8987318873405457, + "learning_rate": 
2.3697307092264375e-05, + "loss": 0.6756, + "step": 17449 + }, + { + "epoch": 3.106659544159544, + "grad_norm": 1.0003935098648071, + "learning_rate": 2.368826033995427e-05, + "loss": 0.8456, + "step": 17450 + }, + { + "epoch": 3.106837606837607, + "grad_norm": 0.9491976499557495, + "learning_rate": 2.3679215082814133e-05, + "loss": 0.8256, + "step": 17451 + }, + { + "epoch": 3.1070156695156697, + "grad_norm": 0.9657387137413025, + "learning_rate": 2.3670171321021262e-05, + "loss": 0.8526, + "step": 17452 + }, + { + "epoch": 3.1071937321937324, + "grad_norm": 0.863231897354126, + "learning_rate": 2.366112905475282e-05, + "loss": 0.7528, + "step": 17453 + }, + { + "epoch": 3.1073717948717947, + "grad_norm": 0.9410921335220337, + "learning_rate": 2.3652088284185934e-05, + "loss": 0.8809, + "step": 17454 + }, + { + "epoch": 3.1075498575498575, + "grad_norm": 0.940887987613678, + "learning_rate": 2.3643049009497853e-05, + "loss": 0.6853, + "step": 17455 + }, + { + "epoch": 3.10772792022792, + "grad_norm": 0.9488041400909424, + "learning_rate": 2.363401123086555e-05, + "loss": 0.8417, + "step": 17456 + }, + { + "epoch": 3.107905982905983, + "grad_norm": 0.8554413914680481, + "learning_rate": 2.3624974948466204e-05, + "loss": 0.75, + "step": 17457 + }, + { + "epoch": 3.1080840455840457, + "grad_norm": 0.8631907105445862, + "learning_rate": 2.361594016247678e-05, + "loss": 0.6059, + "step": 17458 + }, + { + "epoch": 3.1082621082621085, + "grad_norm": 0.9793382883071899, + "learning_rate": 2.3606906873074354e-05, + "loss": 0.7293, + "step": 17459 + }, + { + "epoch": 3.1084401709401708, + "grad_norm": 0.8650250434875488, + "learning_rate": 2.3597875080435883e-05, + "loss": 0.6946, + "step": 17460 + }, + { + "epoch": 3.1086182336182335, + "grad_norm": 0.954089343547821, + "learning_rate": 2.358884478473835e-05, + "loss": 0.9179, + "step": 17461 + }, + { + "epoch": 3.1087962962962963, + "grad_norm": 0.96575528383255, + "learning_rate": 2.357981598615867e-05, + "loss": 0.8668, 
+ "step": 17462 + }, + { + "epoch": 3.108974358974359, + "grad_norm": 0.9860404133796692, + "learning_rate": 2.3570788684873757e-05, + "loss": 0.8166, + "step": 17463 + }, + { + "epoch": 3.109152421652422, + "grad_norm": 1.058204174041748, + "learning_rate": 2.3561762881060434e-05, + "loss": 0.8283, + "step": 17464 + }, + { + "epoch": 3.1093304843304845, + "grad_norm": 0.9833809733390808, + "learning_rate": 2.355273857489564e-05, + "loss": 0.8027, + "step": 17465 + }, + { + "epoch": 3.109508547008547, + "grad_norm": 0.9593403339385986, + "learning_rate": 2.3543715766556073e-05, + "loss": 0.713, + "step": 17466 + }, + { + "epoch": 3.1096866096866096, + "grad_norm": 1.0403105020523071, + "learning_rate": 2.3534694456218587e-05, + "loss": 0.8695, + "step": 17467 + }, + { + "epoch": 3.1098646723646723, + "grad_norm": 0.9821664094924927, + "learning_rate": 2.3525674644059936e-05, + "loss": 0.8268, + "step": 17468 + }, + { + "epoch": 3.110042735042735, + "grad_norm": 1.0293829441070557, + "learning_rate": 2.3516656330256826e-05, + "loss": 0.7902, + "step": 17469 + }, + { + "epoch": 3.110220797720798, + "grad_norm": 1.0120795965194702, + "learning_rate": 2.3507639514985947e-05, + "loss": 0.8086, + "step": 17470 + }, + { + "epoch": 3.1103988603988606, + "grad_norm": 1.0089393854141235, + "learning_rate": 2.349862419842398e-05, + "loss": 0.8249, + "step": 17471 + }, + { + "epoch": 3.110576923076923, + "grad_norm": 0.908614456653595, + "learning_rate": 2.348961038074755e-05, + "loss": 0.7212, + "step": 17472 + }, + { + "epoch": 3.1107549857549857, + "grad_norm": 0.9755021333694458, + "learning_rate": 2.3480598062133263e-05, + "loss": 0.7566, + "step": 17473 + }, + { + "epoch": 3.1109330484330484, + "grad_norm": 0.7833260893821716, + "learning_rate": 2.3471587242757686e-05, + "loss": 0.5381, + "step": 17474 + }, + { + "epoch": 3.111111111111111, + "grad_norm": 1.0489907264709473, + "learning_rate": 2.3462577922797403e-05, + "loss": 0.9524, + "step": 17475 + }, + { + "epoch": 
3.111289173789174, + "grad_norm": 0.9572435021400452, + "learning_rate": 2.3453570102428912e-05, + "loss": 0.7576, + "step": 17476 + }, + { + "epoch": 3.1114672364672367, + "grad_norm": 1.0597792863845825, + "learning_rate": 2.344456378182871e-05, + "loss": 0.6565, + "step": 17477 + }, + { + "epoch": 3.111645299145299, + "grad_norm": 0.9518697261810303, + "learning_rate": 2.3435558961173244e-05, + "loss": 0.8274, + "step": 17478 + }, + { + "epoch": 3.1118233618233617, + "grad_norm": 0.8185853958129883, + "learning_rate": 2.3426555640638927e-05, + "loss": 0.7058, + "step": 17479 + }, + { + "epoch": 3.1120014245014245, + "grad_norm": 0.9529431462287903, + "learning_rate": 2.3417553820402237e-05, + "loss": 0.8802, + "step": 17480 + }, + { + "epoch": 3.1121794871794872, + "grad_norm": 1.0432971715927124, + "learning_rate": 2.340855350063944e-05, + "loss": 0.853, + "step": 17481 + }, + { + "epoch": 3.11235754985755, + "grad_norm": 0.8705558180809021, + "learning_rate": 2.339955468152699e-05, + "loss": 0.6116, + "step": 17482 + }, + { + "epoch": 3.1125356125356127, + "grad_norm": 0.8967922329902649, + "learning_rate": 2.3390557363241082e-05, + "loss": 0.7099, + "step": 17483 + }, + { + "epoch": 3.112713675213675, + "grad_norm": 1.0413905382156372, + "learning_rate": 2.3381561545958098e-05, + "loss": 0.8544, + "step": 17484 + }, + { + "epoch": 3.112891737891738, + "grad_norm": 0.831136167049408, + "learning_rate": 2.337256722985425e-05, + "loss": 0.7654, + "step": 17485 + }, + { + "epoch": 3.1130698005698005, + "grad_norm": 1.002241611480713, + "learning_rate": 2.336357441510576e-05, + "loss": 0.831, + "step": 17486 + }, + { + "epoch": 3.1132478632478633, + "grad_norm": 0.9891117215156555, + "learning_rate": 2.335458310188884e-05, + "loss": 0.7456, + "step": 17487 + }, + { + "epoch": 3.113425925925926, + "grad_norm": 0.8876535296440125, + "learning_rate": 2.334559329037964e-05, + "loss": 0.7195, + "step": 17488 + }, + { + "epoch": 3.113603988603989, + "grad_norm": 
1.015412449836731, + "learning_rate": 2.3336604980754283e-05, + "loss": 0.8297, + "step": 17489 + }, + { + "epoch": 3.113782051282051, + "grad_norm": 0.8936545252799988, + "learning_rate": 2.3327618173188947e-05, + "loss": 0.6681, + "step": 17490 + }, + { + "epoch": 3.113960113960114, + "grad_norm": 1.0073288679122925, + "learning_rate": 2.3318632867859602e-05, + "loss": 0.6266, + "step": 17491 + }, + { + "epoch": 3.1141381766381766, + "grad_norm": 0.7852414846420288, + "learning_rate": 2.3309649064942418e-05, + "loss": 0.4913, + "step": 17492 + }, + { + "epoch": 3.1143162393162394, + "grad_norm": 0.9297264218330383, + "learning_rate": 2.330066676461329e-05, + "loss": 0.6447, + "step": 17493 + }, + { + "epoch": 3.114494301994302, + "grad_norm": 0.9050962924957275, + "learning_rate": 2.3291685967048295e-05, + "loss": 0.7686, + "step": 17494 + }, + { + "epoch": 3.114672364672365, + "grad_norm": 0.977013885974884, + "learning_rate": 2.328270667242336e-05, + "loss": 0.8068, + "step": 17495 + }, + { + "epoch": 3.114850427350427, + "grad_norm": 1.0976142883300781, + "learning_rate": 2.32737288809144e-05, + "loss": 0.8486, + "step": 17496 + }, + { + "epoch": 3.11502849002849, + "grad_norm": 1.098183274269104, + "learning_rate": 2.3264752592697393e-05, + "loss": 1.0246, + "step": 17497 + }, + { + "epoch": 3.1152065527065527, + "grad_norm": 0.9509225487709045, + "learning_rate": 2.3255777807948098e-05, + "loss": 0.8665, + "step": 17498 + }, + { + "epoch": 3.1153846153846154, + "grad_norm": 0.8583039045333862, + "learning_rate": 2.324680452684245e-05, + "loss": 0.7259, + "step": 17499 + }, + { + "epoch": 3.115562678062678, + "grad_norm": 0.8832426071166992, + "learning_rate": 2.323783274955622e-05, + "loss": 0.7073, + "step": 17500 + }, + { + "epoch": 3.115740740740741, + "grad_norm": 0.910290539264679, + "learning_rate": 2.32288624762652e-05, + "loss": 0.7346, + "step": 17501 + }, + { + "epoch": 3.1159188034188032, + "grad_norm": 0.9926977753639221, + "learning_rate": 
2.3219893707145136e-05, + "loss": 0.7527, + "step": 17502 + }, + { + "epoch": 3.116096866096866, + "grad_norm": 0.9508365392684937, + "learning_rate": 2.3210926442371762e-05, + "loss": 0.6538, + "step": 17503 + }, + { + "epoch": 3.1162749287749287, + "grad_norm": 0.8371008634567261, + "learning_rate": 2.3201960682120738e-05, + "loss": 0.7429, + "step": 17504 + }, + { + "epoch": 3.1164529914529915, + "grad_norm": 0.9090998768806458, + "learning_rate": 2.3192996426567827e-05, + "loss": 0.8299, + "step": 17505 + }, + { + "epoch": 3.1166310541310542, + "grad_norm": 1.0013576745986938, + "learning_rate": 2.3184033675888528e-05, + "loss": 0.7982, + "step": 17506 + }, + { + "epoch": 3.116809116809117, + "grad_norm": 1.021876335144043, + "learning_rate": 2.3175072430258583e-05, + "loss": 0.817, + "step": 17507 + }, + { + "epoch": 3.1169871794871793, + "grad_norm": 1.0198882818222046, + "learning_rate": 2.316611268985345e-05, + "loss": 0.7713, + "step": 17508 + }, + { + "epoch": 3.117165242165242, + "grad_norm": 0.8792022466659546, + "learning_rate": 2.315715445484875e-05, + "loss": 0.6666, + "step": 17509 + }, + { + "epoch": 3.117343304843305, + "grad_norm": 0.9849581718444824, + "learning_rate": 2.3148197725419983e-05, + "loss": 0.9193, + "step": 17510 + }, + { + "epoch": 3.1175213675213675, + "grad_norm": 1.1473517417907715, + "learning_rate": 2.313924250174263e-05, + "loss": 0.7998, + "step": 17511 + }, + { + "epoch": 3.1176994301994303, + "grad_norm": 0.9856370091438293, + "learning_rate": 2.3130288783992173e-05, + "loss": 0.8371, + "step": 17512 + }, + { + "epoch": 3.117877492877493, + "grad_norm": 0.9975123405456543, + "learning_rate": 2.312133657234401e-05, + "loss": 0.9139, + "step": 17513 + }, + { + "epoch": 3.1180555555555554, + "grad_norm": 0.8673362731933594, + "learning_rate": 2.311238586697353e-05, + "loss": 0.7295, + "step": 17514 + }, + { + "epoch": 3.118233618233618, + "grad_norm": 0.9963995218276978, + "learning_rate": 2.3103436668056154e-05, + "loss": 
0.7931, + "step": 17515 + }, + { + "epoch": 3.118411680911681, + "grad_norm": 0.9084660410881042, + "learning_rate": 2.3094488975767204e-05, + "loss": 0.7494, + "step": 17516 + }, + { + "epoch": 3.1185897435897436, + "grad_norm": 0.9985509514808655, + "learning_rate": 2.308554279028199e-05, + "loss": 0.7185, + "step": 17517 + }, + { + "epoch": 3.1187678062678064, + "grad_norm": 0.929409384727478, + "learning_rate": 2.3076598111775772e-05, + "loss": 0.7564, + "step": 17518 + }, + { + "epoch": 3.118945868945869, + "grad_norm": 0.8911257386207581, + "learning_rate": 2.3067654940423832e-05, + "loss": 0.8528, + "step": 17519 + }, + { + "epoch": 3.1191239316239314, + "grad_norm": 0.9538902640342712, + "learning_rate": 2.3058713276401378e-05, + "loss": 0.767, + "step": 17520 + }, + { + "epoch": 3.119301994301994, + "grad_norm": 0.9277068376541138, + "learning_rate": 2.304977311988358e-05, + "loss": 0.8786, + "step": 17521 + }, + { + "epoch": 3.119480056980057, + "grad_norm": 0.9150798916816711, + "learning_rate": 2.304083447104568e-05, + "loss": 0.7237, + "step": 17522 + }, + { + "epoch": 3.1196581196581197, + "grad_norm": 0.9311725497245789, + "learning_rate": 2.3031897330062702e-05, + "loss": 0.8545, + "step": 17523 + }, + { + "epoch": 3.1198361823361824, + "grad_norm": 1.0069408416748047, + "learning_rate": 2.3022961697109847e-05, + "loss": 0.6613, + "step": 17524 + }, + { + "epoch": 3.120014245014245, + "grad_norm": 0.8878635168075562, + "learning_rate": 2.301402757236214e-05, + "loss": 0.8657, + "step": 17525 + }, + { + "epoch": 3.1201923076923075, + "grad_norm": 0.9363601803779602, + "learning_rate": 2.3005094955994643e-05, + "loss": 0.7536, + "step": 17526 + }, + { + "epoch": 3.1203703703703702, + "grad_norm": 0.9082810282707214, + "learning_rate": 2.299616384818236e-05, + "loss": 0.7054, + "step": 17527 + }, + { + "epoch": 3.120548433048433, + "grad_norm": 0.9305253624916077, + "learning_rate": 2.2987234249100296e-05, + "loss": 0.7509, + "step": 17528 + }, + { + 
"epoch": 3.1207264957264957, + "grad_norm": 0.9521283507347107, + "learning_rate": 2.297830615892339e-05, + "loss": 0.7455, + "step": 17529 + }, + { + "epoch": 3.1209045584045585, + "grad_norm": 1.0800763368606567, + "learning_rate": 2.2969379577826578e-05, + "loss": 0.9612, + "step": 17530 + }, + { + "epoch": 3.1210826210826212, + "grad_norm": 0.8735892176628113, + "learning_rate": 2.296045450598473e-05, + "loss": 0.7061, + "step": 17531 + }, + { + "epoch": 3.1212606837606836, + "grad_norm": 0.9851289391517639, + "learning_rate": 2.2951530943572796e-05, + "loss": 0.5958, + "step": 17532 + }, + { + "epoch": 3.1214387464387463, + "grad_norm": 1.1228593587875366, + "learning_rate": 2.2942608890765494e-05, + "loss": 0.7061, + "step": 17533 + }, + { + "epoch": 3.121616809116809, + "grad_norm": 0.8875393867492676, + "learning_rate": 2.2933688347737736e-05, + "loss": 0.7959, + "step": 17534 + }, + { + "epoch": 3.121794871794872, + "grad_norm": 0.8963145613670349, + "learning_rate": 2.2924769314664262e-05, + "loss": 0.6042, + "step": 17535 + }, + { + "epoch": 3.1219729344729346, + "grad_norm": 0.8165764808654785, + "learning_rate": 2.291585179171979e-05, + "loss": 0.6797, + "step": 17536 + }, + { + "epoch": 3.1221509971509973, + "grad_norm": 0.9140230417251587, + "learning_rate": 2.290693577907913e-05, + "loss": 0.8226, + "step": 17537 + }, + { + "epoch": 3.1223290598290596, + "grad_norm": 0.9492660760879517, + "learning_rate": 2.289802127691687e-05, + "loss": 0.909, + "step": 17538 + }, + { + "epoch": 3.1225071225071224, + "grad_norm": 0.9075984358787537, + "learning_rate": 2.2889108285407734e-05, + "loss": 0.8317, + "step": 17539 + }, + { + "epoch": 3.122685185185185, + "grad_norm": 1.0724934339523315, + "learning_rate": 2.288019680472635e-05, + "loss": 0.9833, + "step": 17540 + }, + { + "epoch": 3.122863247863248, + "grad_norm": 0.9564105272293091, + "learning_rate": 2.287128683504729e-05, + "loss": 0.9504, + "step": 17541 + }, + { + "epoch": 3.1230413105413106, + 
"grad_norm": 0.8370453119277954, + "learning_rate": 2.2862378376545156e-05, + "loss": 0.6957, + "step": 17542 + }, + { + "epoch": 3.1232193732193734, + "grad_norm": 0.926626980304718, + "learning_rate": 2.285347142939448e-05, + "loss": 0.7962, + "step": 17543 + }, + { + "epoch": 3.123397435897436, + "grad_norm": 0.8979136347770691, + "learning_rate": 2.2844565993769763e-05, + "loss": 0.7981, + "step": 17544 + }, + { + "epoch": 3.1235754985754984, + "grad_norm": 0.8645342588424683, + "learning_rate": 2.283566206984551e-05, + "loss": 0.7094, + "step": 17545 + }, + { + "epoch": 3.123753561253561, + "grad_norm": 0.8856910467147827, + "learning_rate": 2.2826759657796125e-05, + "loss": 0.7623, + "step": 17546 + }, + { + "epoch": 3.123931623931624, + "grad_norm": 0.9834395051002502, + "learning_rate": 2.2817858757796128e-05, + "loss": 0.6267, + "step": 17547 + }, + { + "epoch": 3.1241096866096867, + "grad_norm": 0.8917219042778015, + "learning_rate": 2.280895937001981e-05, + "loss": 0.636, + "step": 17548 + }, + { + "epoch": 3.1242877492877494, + "grad_norm": 1.0435439348220825, + "learning_rate": 2.28000614946416e-05, + "loss": 0.6955, + "step": 17549 + }, + { + "epoch": 3.1244658119658117, + "grad_norm": 0.8975854516029358, + "learning_rate": 2.2791165131835824e-05, + "loss": 0.616, + "step": 17550 + }, + { + "epoch": 3.1246438746438745, + "grad_norm": 0.9830999374389648, + "learning_rate": 2.2782270281776774e-05, + "loss": 0.7004, + "step": 17551 + }, + { + "epoch": 3.1248219373219372, + "grad_norm": 0.9302573204040527, + "learning_rate": 2.2773376944638735e-05, + "loss": 0.8539, + "step": 17552 + }, + { + "epoch": 3.125, + "grad_norm": 0.9269157648086548, + "learning_rate": 2.2764485120595924e-05, + "loss": 0.6907, + "step": 17553 + }, + { + "epoch": 3.1251780626780628, + "grad_norm": 0.9939448833465576, + "learning_rate": 2.2755594809822644e-05, + "loss": 0.7269, + "step": 17554 + }, + { + "epoch": 3.1253561253561255, + "grad_norm": 0.9136678576469421, + 
"learning_rate": 2.2746706012492957e-05, + "loss": 0.9148, + "step": 17555 + }, + { + "epoch": 3.1255341880341883, + "grad_norm": 0.927659273147583, + "learning_rate": 2.2737818728781124e-05, + "loss": 0.7202, + "step": 17556 + }, + { + "epoch": 3.1257122507122506, + "grad_norm": 0.8123641014099121, + "learning_rate": 2.2728932958861228e-05, + "loss": 0.6804, + "step": 17557 + }, + { + "epoch": 3.1258903133903133, + "grad_norm": 0.8486707806587219, + "learning_rate": 2.272004870290737e-05, + "loss": 0.5989, + "step": 17558 + }, + { + "epoch": 3.126068376068376, + "grad_norm": 1.0355206727981567, + "learning_rate": 2.271116596109363e-05, + "loss": 0.8661, + "step": 17559 + }, + { + "epoch": 3.126246438746439, + "grad_norm": 0.947636604309082, + "learning_rate": 2.2702284733594037e-05, + "loss": 0.742, + "step": 17560 + }, + { + "epoch": 3.1264245014245016, + "grad_norm": 0.9906873106956482, + "learning_rate": 2.269340502058257e-05, + "loss": 0.8523, + "step": 17561 + }, + { + "epoch": 3.126602564102564, + "grad_norm": 0.8627552390098572, + "learning_rate": 2.2684526822233297e-05, + "loss": 0.8379, + "step": 17562 + }, + { + "epoch": 3.1267806267806266, + "grad_norm": 0.9640594720840454, + "learning_rate": 2.267565013872006e-05, + "loss": 0.7408, + "step": 17563 + }, + { + "epoch": 3.1269586894586894, + "grad_norm": 0.9448180794715881, + "learning_rate": 2.2666774970216888e-05, + "loss": 0.8076, + "step": 17564 + }, + { + "epoch": 3.127136752136752, + "grad_norm": 1.1045911312103271, + "learning_rate": 2.2657901316897555e-05, + "loss": 0.8835, + "step": 17565 + }, + { + "epoch": 3.127314814814815, + "grad_norm": 0.9843702912330627, + "learning_rate": 2.264902917893602e-05, + "loss": 0.8285, + "step": 17566 + }, + { + "epoch": 3.1274928774928776, + "grad_norm": 0.913769543170929, + "learning_rate": 2.2640158556506063e-05, + "loss": 0.8045, + "step": 17567 + }, + { + "epoch": 3.1276709401709404, + "grad_norm": 0.9181899428367615, + "learning_rate": 
2.26312894497815e-05, + "loss": 0.7648, + "step": 17568 + }, + { + "epoch": 3.1278490028490027, + "grad_norm": 0.8163581490516663, + "learning_rate": 2.2622421858936104e-05, + "loss": 0.6892, + "step": 17569 + }, + { + "epoch": 3.1280270655270654, + "grad_norm": 0.8715813755989075, + "learning_rate": 2.2613555784143614e-05, + "loss": 0.7745, + "step": 17570 + }, + { + "epoch": 3.128205128205128, + "grad_norm": 1.0851328372955322, + "learning_rate": 2.260469122557771e-05, + "loss": 0.8486, + "step": 17571 + }, + { + "epoch": 3.128383190883191, + "grad_norm": 1.0961211919784546, + "learning_rate": 2.2595828183412172e-05, + "loss": 0.6314, + "step": 17572 + }, + { + "epoch": 3.1285612535612537, + "grad_norm": 1.1079659461975098, + "learning_rate": 2.2586966657820518e-05, + "loss": 0.8251, + "step": 17573 + }, + { + "epoch": 3.128739316239316, + "grad_norm": 0.8718203902244568, + "learning_rate": 2.257810664897648e-05, + "loss": 0.6977, + "step": 17574 + }, + { + "epoch": 3.1289173789173788, + "grad_norm": 0.7854416370391846, + "learning_rate": 2.2569248157053602e-05, + "loss": 0.584, + "step": 17575 + }, + { + "epoch": 3.1290954415954415, + "grad_norm": 1.1214252710342407, + "learning_rate": 2.2560391182225437e-05, + "loss": 0.8795, + "step": 17576 + }, + { + "epoch": 3.1292735042735043, + "grad_norm": 1.018040657043457, + "learning_rate": 2.2551535724665596e-05, + "loss": 0.7597, + "step": 17577 + }, + { + "epoch": 3.129451566951567, + "grad_norm": 0.9986835718154907, + "learning_rate": 2.254268178454747e-05, + "loss": 0.7418, + "step": 17578 + }, + { + "epoch": 3.1296296296296298, + "grad_norm": 0.9148978590965271, + "learning_rate": 2.2533829362044643e-05, + "loss": 0.7763, + "step": 17579 + }, + { + "epoch": 3.1298076923076925, + "grad_norm": 1.0832401514053345, + "learning_rate": 2.2524978457330458e-05, + "loss": 0.8144, + "step": 17580 + }, + { + "epoch": 3.129985754985755, + "grad_norm": 0.9290724396705627, + "learning_rate": 2.2516129070578406e-05, + "loss": 
0.8129, + "step": 17581 + }, + { + "epoch": 3.1301638176638176, + "grad_norm": 0.8968502283096313, + "learning_rate": 2.250728120196184e-05, + "loss": 0.8565, + "step": 17582 + }, + { + "epoch": 3.1303418803418803, + "grad_norm": 0.8710499405860901, + "learning_rate": 2.2498434851654126e-05, + "loss": 0.7707, + "step": 17583 + }, + { + "epoch": 3.130519943019943, + "grad_norm": 0.9112555384635925, + "learning_rate": 2.248959001982859e-05, + "loss": 0.6694, + "step": 17584 + }, + { + "epoch": 3.130698005698006, + "grad_norm": 0.9539666771888733, + "learning_rate": 2.2480746706658516e-05, + "loss": 0.79, + "step": 17585 + }, + { + "epoch": 3.1308760683760686, + "grad_norm": 0.8673475384712219, + "learning_rate": 2.2471904912317153e-05, + "loss": 0.7533, + "step": 17586 + }, + { + "epoch": 3.131054131054131, + "grad_norm": 1.044456958770752, + "learning_rate": 2.2463064636977826e-05, + "loss": 0.851, + "step": 17587 + }, + { + "epoch": 3.1312321937321936, + "grad_norm": 1.0221272706985474, + "learning_rate": 2.2454225880813618e-05, + "loss": 0.6562, + "step": 17588 + }, + { + "epoch": 3.1314102564102564, + "grad_norm": 0.9121100902557373, + "learning_rate": 2.2445388643997812e-05, + "loss": 0.7856, + "step": 17589 + }, + { + "epoch": 3.131588319088319, + "grad_norm": 1.0054078102111816, + "learning_rate": 2.243655292670347e-05, + "loss": 0.9622, + "step": 17590 + }, + { + "epoch": 3.131766381766382, + "grad_norm": 0.8391761779785156, + "learning_rate": 2.2427718729103763e-05, + "loss": 0.7045, + "step": 17591 + }, + { + "epoch": 3.1319444444444446, + "grad_norm": 0.9723725914955139, + "learning_rate": 2.2418886051371767e-05, + "loss": 0.8161, + "step": 17592 + }, + { + "epoch": 3.132122507122507, + "grad_norm": 0.8473033905029297, + "learning_rate": 2.241005489368052e-05, + "loss": 0.7708, + "step": 17593 + }, + { + "epoch": 3.1323005698005697, + "grad_norm": 0.9271494150161743, + "learning_rate": 2.240122525620312e-05, + "loss": 0.7198, + "step": 17594 + }, + { + 
"epoch": 3.1324786324786325, + "grad_norm": 1.0367611646652222, + "learning_rate": 2.2392397139112454e-05, + "loss": 1.0495, + "step": 17595 + }, + { + "epoch": 3.132656695156695, + "grad_norm": 0.8822857737541199, + "learning_rate": 2.238357054258158e-05, + "loss": 0.5491, + "step": 17596 + }, + { + "epoch": 3.132834757834758, + "grad_norm": 0.9773828387260437, + "learning_rate": 2.237474546678341e-05, + "loss": 0.7685, + "step": 17597 + }, + { + "epoch": 3.1330128205128207, + "grad_norm": 1.003929615020752, + "learning_rate": 2.2365921911890842e-05, + "loss": 0.7251, + "step": 17598 + }, + { + "epoch": 3.133190883190883, + "grad_norm": 0.9347188472747803, + "learning_rate": 2.235709987807677e-05, + "loss": 0.9213, + "step": 17599 + }, + { + "epoch": 3.1333689458689458, + "grad_norm": 0.9359163045883179, + "learning_rate": 2.2348279365514036e-05, + "loss": 0.8882, + "step": 17600 + }, + { + "epoch": 3.1335470085470085, + "grad_norm": 1.0264430046081543, + "learning_rate": 2.2339460374375463e-05, + "loss": 0.567, + "step": 17601 + }, + { + "epoch": 3.1337250712250713, + "grad_norm": 1.0392861366271973, + "learning_rate": 2.2330642904833833e-05, + "loss": 0.725, + "step": 17602 + }, + { + "epoch": 3.133903133903134, + "grad_norm": 0.8876413702964783, + "learning_rate": 2.2321826957061888e-05, + "loss": 0.8109, + "step": 17603 + }, + { + "epoch": 3.1340811965811968, + "grad_norm": 1.0765225887298584, + "learning_rate": 2.2313012531232435e-05, + "loss": 0.8024, + "step": 17604 + }, + { + "epoch": 3.134259259259259, + "grad_norm": 0.8807610273361206, + "learning_rate": 2.2304199627518063e-05, + "loss": 0.6966, + "step": 17605 + }, + { + "epoch": 3.134437321937322, + "grad_norm": 0.8906777501106262, + "learning_rate": 2.2295388246091532e-05, + "loss": 0.7797, + "step": 17606 + }, + { + "epoch": 3.1346153846153846, + "grad_norm": 1.0764859914779663, + "learning_rate": 2.2286578387125445e-05, + "loss": 0.8388, + "step": 17607 + }, + { + "epoch": 3.1347934472934473, + 
"grad_norm": 0.9732229709625244, + "learning_rate": 2.2277770050792422e-05, + "loss": 0.846, + "step": 17608 + }, + { + "epoch": 3.13497150997151, + "grad_norm": 0.8559836149215698, + "learning_rate": 2.226896323726504e-05, + "loss": 0.7839, + "step": 17609 + }, + { + "epoch": 3.135149572649573, + "grad_norm": 0.957591712474823, + "learning_rate": 2.2260157946715853e-05, + "loss": 0.7902, + "step": 17610 + }, + { + "epoch": 3.135327635327635, + "grad_norm": 0.8924140930175781, + "learning_rate": 2.2251354179317352e-05, + "loss": 0.5787, + "step": 17611 + }, + { + "epoch": 3.135505698005698, + "grad_norm": 1.0200998783111572, + "learning_rate": 2.2242551935242117e-05, + "loss": 0.9569, + "step": 17612 + }, + { + "epoch": 3.1356837606837606, + "grad_norm": 0.9117900729179382, + "learning_rate": 2.223375121466249e-05, + "loss": 0.7962, + "step": 17613 + }, + { + "epoch": 3.1358618233618234, + "grad_norm": 0.9254946112632751, + "learning_rate": 2.222495201775099e-05, + "loss": 0.6598, + "step": 17614 + }, + { + "epoch": 3.136039886039886, + "grad_norm": 0.8572172522544861, + "learning_rate": 2.2216154344679983e-05, + "loss": 0.7288, + "step": 17615 + }, + { + "epoch": 3.136217948717949, + "grad_norm": 0.8563082814216614, + "learning_rate": 2.2207358195621865e-05, + "loss": 0.6116, + "step": 17616 + }, + { + "epoch": 3.136396011396011, + "grad_norm": 0.8751232624053955, + "learning_rate": 2.2198563570748953e-05, + "loss": 0.8396, + "step": 17617 + }, + { + "epoch": 3.136574074074074, + "grad_norm": 0.8472304344177246, + "learning_rate": 2.218977047023355e-05, + "loss": 0.8801, + "step": 17618 + }, + { + "epoch": 3.1367521367521367, + "grad_norm": 0.8158729076385498, + "learning_rate": 2.2180978894248007e-05, + "loss": 0.6356, + "step": 17619 + }, + { + "epoch": 3.1369301994301995, + "grad_norm": 0.9257596731185913, + "learning_rate": 2.217218884296447e-05, + "loss": 0.6578, + "step": 17620 + }, + { + "epoch": 3.137108262108262, + "grad_norm": 0.887220025062561, + 
"learning_rate": 2.216340031655525e-05, + "loss": 0.7647, + "step": 17621 + }, + { + "epoch": 3.137286324786325, + "grad_norm": 0.8736263513565063, + "learning_rate": 2.2154613315192508e-05, + "loss": 0.7776, + "step": 17622 + }, + { + "epoch": 3.1374643874643873, + "grad_norm": 0.8078323006629944, + "learning_rate": 2.2145827839048416e-05, + "loss": 0.7054, + "step": 17623 + }, + { + "epoch": 3.13764245014245, + "grad_norm": 0.8907020092010498, + "learning_rate": 2.2137043888295096e-05, + "loss": 0.7799, + "step": 17624 + }, + { + "epoch": 3.1378205128205128, + "grad_norm": 0.895183801651001, + "learning_rate": 2.212826146310465e-05, + "loss": 0.758, + "step": 17625 + }, + { + "epoch": 3.1379985754985755, + "grad_norm": 0.9827015995979309, + "learning_rate": 2.2119480563649153e-05, + "loss": 0.593, + "step": 17626 + }, + { + "epoch": 3.1381766381766383, + "grad_norm": 0.927594780921936, + "learning_rate": 2.211070119010066e-05, + "loss": 0.6629, + "step": 17627 + }, + { + "epoch": 3.138354700854701, + "grad_norm": 1.0362217426300049, + "learning_rate": 2.2101923342631148e-05, + "loss": 0.781, + "step": 17628 + }, + { + "epoch": 3.1385327635327633, + "grad_norm": 0.9003650546073914, + "learning_rate": 2.2093147021412673e-05, + "loss": 0.6849, + "step": 17629 + }, + { + "epoch": 3.138710826210826, + "grad_norm": 0.8728612065315247, + "learning_rate": 2.20843722266171e-05, + "loss": 0.8316, + "step": 17630 + }, + { + "epoch": 3.138888888888889, + "grad_norm": 0.8697235584259033, + "learning_rate": 2.207559895841642e-05, + "loss": 0.7177, + "step": 17631 + }, + { + "epoch": 3.1390669515669516, + "grad_norm": 1.0648221969604492, + "learning_rate": 2.206682721698251e-05, + "loss": 0.794, + "step": 17632 + }, + { + "epoch": 3.1392450142450143, + "grad_norm": 0.8567960858345032, + "learning_rate": 2.2058057002487198e-05, + "loss": 0.6276, + "step": 17633 + }, + { + "epoch": 3.139423076923077, + "grad_norm": 1.0193754434585571, + "learning_rate": 2.2049288315102412e-05, + 
"loss": 0.8894, + "step": 17634 + }, + { + "epoch": 3.1396011396011394, + "grad_norm": 0.9031814932823181, + "learning_rate": 2.2040521154999827e-05, + "loss": 0.697, + "step": 17635 + }, + { + "epoch": 3.139779202279202, + "grad_norm": 0.8300186395645142, + "learning_rate": 2.203175552235134e-05, + "loss": 0.5605, + "step": 17636 + }, + { + "epoch": 3.139957264957265, + "grad_norm": 0.931399405002594, + "learning_rate": 2.2022991417328587e-05, + "loss": 0.7807, + "step": 17637 + }, + { + "epoch": 3.1401353276353277, + "grad_norm": 0.9556393623352051, + "learning_rate": 2.2014228840103357e-05, + "loss": 0.867, + "step": 17638 + }, + { + "epoch": 3.1403133903133904, + "grad_norm": 0.8357135653495789, + "learning_rate": 2.200546779084731e-05, + "loss": 0.6754, + "step": 17639 + }, + { + "epoch": 3.140491452991453, + "grad_norm": 0.9572781324386597, + "learning_rate": 2.1996708269732114e-05, + "loss": 0.7253, + "step": 17640 + }, + { + "epoch": 3.1406695156695155, + "grad_norm": 0.9977589845657349, + "learning_rate": 2.198795027692937e-05, + "loss": 0.6852, + "step": 17641 + }, + { + "epoch": 3.140847578347578, + "grad_norm": 0.9238983988761902, + "learning_rate": 2.197919381261069e-05, + "loss": 0.6661, + "step": 17642 + }, + { + "epoch": 3.141025641025641, + "grad_norm": 0.934651792049408, + "learning_rate": 2.1970438876947596e-05, + "loss": 0.7537, + "step": 17643 + }, + { + "epoch": 3.1412037037037037, + "grad_norm": 1.1089069843292236, + "learning_rate": 2.1961685470111725e-05, + "loss": 0.8351, + "step": 17644 + }, + { + "epoch": 3.1413817663817665, + "grad_norm": 0.890332818031311, + "learning_rate": 2.195293359227446e-05, + "loss": 0.7571, + "step": 17645 + }, + { + "epoch": 3.1415598290598292, + "grad_norm": 0.8519227504730225, + "learning_rate": 2.1944183243607363e-05, + "loss": 0.9673, + "step": 17646 + }, + { + "epoch": 3.1417378917378915, + "grad_norm": 0.9076274633407593, + "learning_rate": 2.1935434424281842e-05, + "loss": 0.618, + "step": 17647 + }, + 
{ + "epoch": 3.1419159544159543, + "grad_norm": 0.9259662628173828, + "learning_rate": 2.1926687134469315e-05, + "loss": 0.6855, + "step": 17648 + }, + { + "epoch": 3.142094017094017, + "grad_norm": 0.839491069316864, + "learning_rate": 2.1917941374341177e-05, + "loss": 0.5683, + "step": 17649 + }, + { + "epoch": 3.14227207977208, + "grad_norm": 0.8948171734809875, + "learning_rate": 2.1909197144068783e-05, + "loss": 0.8399, + "step": 17650 + }, + { + "epoch": 3.1424501424501425, + "grad_norm": 1.0100452899932861, + "learning_rate": 2.1900454443823437e-05, + "loss": 0.8663, + "step": 17651 + }, + { + "epoch": 3.1426282051282053, + "grad_norm": 0.8691473603248596, + "learning_rate": 2.189171327377646e-05, + "loss": 0.7202, + "step": 17652 + }, + { + "epoch": 3.142806267806268, + "grad_norm": 0.979176938533783, + "learning_rate": 2.188297363409907e-05, + "loss": 0.9264, + "step": 17653 + }, + { + "epoch": 3.1429843304843303, + "grad_norm": 0.9306555390357971, + "learning_rate": 2.187423552496257e-05, + "loss": 0.8755, + "step": 17654 + }, + { + "epoch": 3.143162393162393, + "grad_norm": 0.9641979932785034, + "learning_rate": 2.1865498946538133e-05, + "loss": 0.6323, + "step": 17655 + }, + { + "epoch": 3.143340455840456, + "grad_norm": 1.126671314239502, + "learning_rate": 2.1856763898996924e-05, + "loss": 0.954, + "step": 17656 + }, + { + "epoch": 3.1435185185185186, + "grad_norm": 1.0349640846252441, + "learning_rate": 2.1848030382510108e-05, + "loss": 0.8558, + "step": 17657 + }, + { + "epoch": 3.1436965811965814, + "grad_norm": 0.9324985146522522, + "learning_rate": 2.1839298397248763e-05, + "loss": 0.8061, + "step": 17658 + }, + { + "epoch": 3.1438746438746437, + "grad_norm": 0.8919853568077087, + "learning_rate": 2.183056794338405e-05, + "loss": 0.759, + "step": 17659 + }, + { + "epoch": 3.1440527065527064, + "grad_norm": 0.9174198508262634, + "learning_rate": 2.182183902108692e-05, + "loss": 0.8332, + "step": 17660 + }, + { + "epoch": 3.144230769230769, + 
"grad_norm": 0.8662750124931335, + "learning_rate": 2.1813111630528517e-05, + "loss": 0.6642, + "step": 17661 + }, + { + "epoch": 3.144408831908832, + "grad_norm": 0.9606072902679443, + "learning_rate": 2.1804385771879708e-05, + "loss": 0.8076, + "step": 17662 + }, + { + "epoch": 3.1445868945868947, + "grad_norm": 0.904283344745636, + "learning_rate": 2.1795661445311555e-05, + "loss": 0.9605, + "step": 17663 + }, + { + "epoch": 3.1447649572649574, + "grad_norm": 1.0539193153381348, + "learning_rate": 2.1786938650994958e-05, + "loss": 0.9011, + "step": 17664 + }, + { + "epoch": 3.14494301994302, + "grad_norm": 0.8112294673919678, + "learning_rate": 2.177821738910083e-05, + "loss": 0.4874, + "step": 17665 + }, + { + "epoch": 3.1451210826210825, + "grad_norm": 0.8961923718452454, + "learning_rate": 2.1769497659800042e-05, + "loss": 0.8034, + "step": 17666 + }, + { + "epoch": 3.1452991452991452, + "grad_norm": 0.9964944124221802, + "learning_rate": 2.1760779463263426e-05, + "loss": 0.7663, + "step": 17667 + }, + { + "epoch": 3.145477207977208, + "grad_norm": 0.9126096367835999, + "learning_rate": 2.1752062799661787e-05, + "loss": 1.0941, + "step": 17668 + }, + { + "epoch": 3.1456552706552707, + "grad_norm": 0.9588297009468079, + "learning_rate": 2.1743347669165994e-05, + "loss": 0.7618, + "step": 17669 + }, + { + "epoch": 3.1458333333333335, + "grad_norm": 0.9706279039382935, + "learning_rate": 2.173463407194667e-05, + "loss": 0.8278, + "step": 17670 + }, + { + "epoch": 3.146011396011396, + "grad_norm": 1.005780816078186, + "learning_rate": 2.172592200817467e-05, + "loss": 0.743, + "step": 17671 + }, + { + "epoch": 3.1461894586894585, + "grad_norm": 1.0032274723052979, + "learning_rate": 2.1717211478020583e-05, + "loss": 0.7665, + "step": 17672 + }, + { + "epoch": 3.1463675213675213, + "grad_norm": 0.9049981236457825, + "learning_rate": 2.1708502481655134e-05, + "loss": 0.7667, + "step": 17673 + }, + { + "epoch": 3.146545584045584, + "grad_norm": 0.8788641095161438, + 
"learning_rate": 2.1699795019248947e-05, + "loss": 0.6075, + "step": 17674 + }, + { + "epoch": 3.146723646723647, + "grad_norm": 0.9241887331008911, + "learning_rate": 2.1691089090972595e-05, + "loss": 0.9202, + "step": 17675 + }, + { + "epoch": 3.1469017094017095, + "grad_norm": 0.8133395314216614, + "learning_rate": 2.1682384696996728e-05, + "loss": 0.6936, + "step": 17676 + }, + { + "epoch": 3.1470797720797723, + "grad_norm": 0.9261813163757324, + "learning_rate": 2.167368183749179e-05, + "loss": 0.8659, + "step": 17677 + }, + { + "epoch": 3.1472578347578346, + "grad_norm": 0.8164187073707581, + "learning_rate": 2.1664980512628374e-05, + "loss": 0.6205, + "step": 17678 + }, + { + "epoch": 3.1474358974358974, + "grad_norm": 0.8534388542175293, + "learning_rate": 2.165628072257695e-05, + "loss": 0.7437, + "step": 17679 + }, + { + "epoch": 3.14761396011396, + "grad_norm": 0.9858724474906921, + "learning_rate": 2.1647582467507943e-05, + "loss": 0.6912, + "step": 17680 + }, + { + "epoch": 3.147792022792023, + "grad_norm": 0.8704313039779663, + "learning_rate": 2.1638885747591807e-05, + "loss": 0.6267, + "step": 17681 + }, + { + "epoch": 3.1479700854700856, + "grad_norm": 0.9345740675926208, + "learning_rate": 2.163019056299892e-05, + "loss": 0.7671, + "step": 17682 + }, + { + "epoch": 3.148148148148148, + "grad_norm": 0.8783660531044006, + "learning_rate": 2.162149691389962e-05, + "loss": 0.6935, + "step": 17683 + }, + { + "epoch": 3.1483262108262107, + "grad_norm": 0.9478751420974731, + "learning_rate": 2.1612804800464324e-05, + "loss": 0.7343, + "step": 17684 + }, + { + "epoch": 3.1485042735042734, + "grad_norm": 1.0307378768920898, + "learning_rate": 2.1604114222863236e-05, + "loss": 0.6656, + "step": 17685 + }, + { + "epoch": 3.148682336182336, + "grad_norm": 0.8509631156921387, + "learning_rate": 2.1595425181266726e-05, + "loss": 0.7223, + "step": 17686 + }, + { + "epoch": 3.148860398860399, + "grad_norm": 0.9910686612129211, + "learning_rate": 
2.1586737675844938e-05, + "loss": 0.6478, + "step": 17687 + }, + { + "epoch": 3.1490384615384617, + "grad_norm": 1.0413484573364258, + "learning_rate": 2.1578051706768156e-05, + "loss": 0.9215, + "step": 17688 + }, + { + "epoch": 3.1492165242165244, + "grad_norm": 0.9027596116065979, + "learning_rate": 2.156936727420654e-05, + "loss": 0.9464, + "step": 17689 + }, + { + "epoch": 3.1493945868945867, + "grad_norm": 1.0705668926239014, + "learning_rate": 2.1560684378330252e-05, + "loss": 0.9369, + "step": 17690 + }, + { + "epoch": 3.1495726495726495, + "grad_norm": 0.7609538435935974, + "learning_rate": 2.1552003019309418e-05, + "loss": 0.4558, + "step": 17691 + }, + { + "epoch": 3.1497507122507122, + "grad_norm": 0.9189962148666382, + "learning_rate": 2.1543323197314093e-05, + "loss": 0.6571, + "step": 17692 + }, + { + "epoch": 3.149928774928775, + "grad_norm": 1.3622703552246094, + "learning_rate": 2.153464491251439e-05, + "loss": 0.8121, + "step": 17693 + }, + { + "epoch": 3.1501068376068377, + "grad_norm": 0.9149178266525269, + "learning_rate": 2.1525968165080324e-05, + "loss": 0.7429, + "step": 17694 + }, + { + "epoch": 3.1502849002849005, + "grad_norm": 0.8335025310516357, + "learning_rate": 2.15172929551819e-05, + "loss": 0.7269, + "step": 17695 + }, + { + "epoch": 3.150462962962963, + "grad_norm": 1.145371913909912, + "learning_rate": 2.1508619282989084e-05, + "loss": 0.9163, + "step": 17696 + }, + { + "epoch": 3.1506410256410255, + "grad_norm": 0.7720812559127808, + "learning_rate": 2.149994714867182e-05, + "loss": 0.6217, + "step": 17697 + }, + { + "epoch": 3.1508190883190883, + "grad_norm": 0.9079264402389526, + "learning_rate": 2.149127655240003e-05, + "loss": 0.8527, + "step": 17698 + }, + { + "epoch": 3.150997150997151, + "grad_norm": 0.9881488680839539, + "learning_rate": 2.1482607494343575e-05, + "loss": 0.9116, + "step": 17699 + }, + { + "epoch": 3.151175213675214, + "grad_norm": 0.9430930614471436, + "learning_rate": 2.147393997467231e-05, + "loss": 
0.6562, + "step": 17700 + }, + { + "epoch": 3.1513532763532766, + "grad_norm": 0.9559832215309143, + "learning_rate": 2.14652739935561e-05, + "loss": 0.7929, + "step": 17701 + }, + { + "epoch": 3.151531339031339, + "grad_norm": 1.0164002180099487, + "learning_rate": 2.1456609551164662e-05, + "loss": 0.9208, + "step": 17702 + }, + { + "epoch": 3.1517094017094016, + "grad_norm": 1.0220781564712524, + "learning_rate": 2.1447946647667837e-05, + "loss": 0.8878, + "step": 17703 + }, + { + "epoch": 3.1518874643874644, + "grad_norm": 1.060396671295166, + "learning_rate": 2.143928528323531e-05, + "loss": 0.6821, + "step": 17704 + }, + { + "epoch": 3.152065527065527, + "grad_norm": 0.8120558857917786, + "learning_rate": 2.143062545803679e-05, + "loss": 0.569, + "step": 17705 + }, + { + "epoch": 3.15224358974359, + "grad_norm": 0.8634228706359863, + "learning_rate": 2.1421967172241953e-05, + "loss": 0.7732, + "step": 17706 + }, + { + "epoch": 3.1524216524216526, + "grad_norm": 0.9585935473442078, + "learning_rate": 2.141331042602044e-05, + "loss": 0.7733, + "step": 17707 + }, + { + "epoch": 3.152599715099715, + "grad_norm": 0.8006394505500793, + "learning_rate": 2.140465521954186e-05, + "loss": 0.6772, + "step": 17708 + }, + { + "epoch": 3.1527777777777777, + "grad_norm": 0.948411226272583, + "learning_rate": 2.1396001552975786e-05, + "loss": 0.7058, + "step": 17709 + }, + { + "epoch": 3.1529558404558404, + "grad_norm": 0.9930174946784973, + "learning_rate": 2.1387349426491753e-05, + "loss": 0.8284, + "step": 17710 + }, + { + "epoch": 3.153133903133903, + "grad_norm": 0.8288687467575073, + "learning_rate": 2.137869884025936e-05, + "loss": 0.7214, + "step": 17711 + }, + { + "epoch": 3.153311965811966, + "grad_norm": 0.9206148386001587, + "learning_rate": 2.1370049794447977e-05, + "loss": 0.8247, + "step": 17712 + }, + { + "epoch": 3.1534900284900287, + "grad_norm": 1.0066028833389282, + "learning_rate": 2.1361402289227173e-05, + "loss": 0.8256, + "step": 17713 + }, + { + 
"epoch": 3.153668091168091, + "grad_norm": 1.0277632474899292, + "learning_rate": 2.135275632476632e-05, + "loss": 0.7422, + "step": 17714 + }, + { + "epoch": 3.1538461538461537, + "grad_norm": 0.8791954517364502, + "learning_rate": 2.134411190123481e-05, + "loss": 0.6614, + "step": 17715 + }, + { + "epoch": 3.1540242165242165, + "grad_norm": 0.8627964854240417, + "learning_rate": 2.133546901880209e-05, + "loss": 0.6784, + "step": 17716 + }, + { + "epoch": 3.1542022792022792, + "grad_norm": 1.106667160987854, + "learning_rate": 2.1326827677637384e-05, + "loss": 0.6475, + "step": 17717 + }, + { + "epoch": 3.154380341880342, + "grad_norm": 0.7291306257247925, + "learning_rate": 2.131818787791009e-05, + "loss": 0.6944, + "step": 17718 + }, + { + "epoch": 3.1545584045584047, + "grad_norm": 0.9892288446426392, + "learning_rate": 2.1309549619789458e-05, + "loss": 0.7805, + "step": 17719 + }, + { + "epoch": 3.154736467236467, + "grad_norm": 0.9121565818786621, + "learning_rate": 2.1300912903444736e-05, + "loss": 1.0122, + "step": 17720 + }, + { + "epoch": 3.15491452991453, + "grad_norm": 0.8239485025405884, + "learning_rate": 2.129227772904514e-05, + "loss": 0.6418, + "step": 17721 + }, + { + "epoch": 3.1550925925925926, + "grad_norm": 0.8559371829032898, + "learning_rate": 2.1283644096759868e-05, + "loss": 0.7703, + "step": 17722 + }, + { + "epoch": 3.1552706552706553, + "grad_norm": 0.9148365259170532, + "learning_rate": 2.1275012006758068e-05, + "loss": 0.884, + "step": 17723 + }, + { + "epoch": 3.155448717948718, + "grad_norm": 0.9810414910316467, + "learning_rate": 2.126638145920886e-05, + "loss": 0.7261, + "step": 17724 + }, + { + "epoch": 3.155626780626781, + "grad_norm": 0.960682213306427, + "learning_rate": 2.125775245428133e-05, + "loss": 0.766, + "step": 17725 + }, + { + "epoch": 3.155804843304843, + "grad_norm": 1.0861321687698364, + "learning_rate": 2.124912499214463e-05, + "loss": 0.568, + "step": 17726 + }, + { + "epoch": 3.155982905982906, + "grad_norm": 
0.9647825956344604, + "learning_rate": 2.124049907296768e-05, + "loss": 0.8122, + "step": 17727 + }, + { + "epoch": 3.1561609686609686, + "grad_norm": 0.9854398369789124, + "learning_rate": 2.1231874696919564e-05, + "loss": 0.6373, + "step": 17728 + }, + { + "epoch": 3.1563390313390314, + "grad_norm": 0.8971094489097595, + "learning_rate": 2.1223251864169237e-05, + "loss": 0.6713, + "step": 17729 + }, + { + "epoch": 3.156517094017094, + "grad_norm": 0.9614965319633484, + "learning_rate": 2.1214630574885643e-05, + "loss": 0.9324, + "step": 17730 + }, + { + "epoch": 3.156695156695157, + "grad_norm": 0.9546411633491516, + "learning_rate": 2.120601082923771e-05, + "loss": 0.7762, + "step": 17731 + }, + { + "epoch": 3.156873219373219, + "grad_norm": 0.860262393951416, + "learning_rate": 2.1197392627394275e-05, + "loss": 0.7277, + "step": 17732 + }, + { + "epoch": 3.157051282051282, + "grad_norm": 0.8918822407722473, + "learning_rate": 2.11887759695243e-05, + "loss": 0.8899, + "step": 17733 + }, + { + "epoch": 3.1572293447293447, + "grad_norm": 0.9153090119361877, + "learning_rate": 2.118016085579647e-05, + "loss": 1.0711, + "step": 17734 + }, + { + "epoch": 3.1574074074074074, + "grad_norm": 1.1114693880081177, + "learning_rate": 2.11715472863797e-05, + "loss": 0.9146, + "step": 17735 + }, + { + "epoch": 3.15758547008547, + "grad_norm": 0.9536030888557434, + "learning_rate": 2.1162935261442705e-05, + "loss": 0.8239, + "step": 17736 + }, + { + "epoch": 3.157763532763533, + "grad_norm": 0.9061172604560852, + "learning_rate": 2.1154324781154222e-05, + "loss": 0.7297, + "step": 17737 + }, + { + "epoch": 3.1579415954415953, + "grad_norm": 0.9903003573417664, + "learning_rate": 2.1145715845682957e-05, + "loss": 0.8063, + "step": 17738 + }, + { + "epoch": 3.158119658119658, + "grad_norm": 0.9316477179527283, + "learning_rate": 2.1137108455197586e-05, + "loss": 0.7544, + "step": 17739 + }, + { + "epoch": 3.1582977207977208, + "grad_norm": 0.9925363063812256, + "learning_rate": 
2.112850260986672e-05, + "loss": 0.8106, + "step": 17740 + }, + { + "epoch": 3.1584757834757835, + "grad_norm": 0.7903617024421692, + "learning_rate": 2.1119898309859064e-05, + "loss": 0.6821, + "step": 17741 + }, + { + "epoch": 3.1586538461538463, + "grad_norm": 1.0054620504379272, + "learning_rate": 2.111129555534308e-05, + "loss": 0.8715, + "step": 17742 + }, + { + "epoch": 3.158831908831909, + "grad_norm": 1.098737359046936, + "learning_rate": 2.110269434648744e-05, + "loss": 0.6393, + "step": 17743 + }, + { + "epoch": 3.1590099715099713, + "grad_norm": 0.7690902352333069, + "learning_rate": 2.1094094683460553e-05, + "loss": 0.6347, + "step": 17744 + }, + { + "epoch": 3.159188034188034, + "grad_norm": 0.9430564641952515, + "learning_rate": 2.1085496566431006e-05, + "loss": 0.68, + "step": 17745 + }, + { + "epoch": 3.159366096866097, + "grad_norm": 0.9977595806121826, + "learning_rate": 2.107689999556721e-05, + "loss": 0.8586, + "step": 17746 + }, + { + "epoch": 3.1595441595441596, + "grad_norm": 0.8231605887413025, + "learning_rate": 2.1068304971037612e-05, + "loss": 0.6109, + "step": 17747 + }, + { + "epoch": 3.1597222222222223, + "grad_norm": 1.1063002347946167, + "learning_rate": 2.1059711493010615e-05, + "loss": 0.6969, + "step": 17748 + }, + { + "epoch": 3.159900284900285, + "grad_norm": 0.8677477836608887, + "learning_rate": 2.1051119561654585e-05, + "loss": 0.6571, + "step": 17749 + }, + { + "epoch": 3.1600783475783474, + "grad_norm": 0.9700071811676025, + "learning_rate": 2.104252917713785e-05, + "loss": 0.665, + "step": 17750 + }, + { + "epoch": 3.16025641025641, + "grad_norm": 1.015114426612854, + "learning_rate": 2.1033940339628778e-05, + "loss": 0.7666, + "step": 17751 + }, + { + "epoch": 3.160434472934473, + "grad_norm": 0.7403838038444519, + "learning_rate": 2.1025353049295547e-05, + "loss": 0.4922, + "step": 17752 + }, + { + "epoch": 3.1606125356125356, + "grad_norm": 0.8768717646598816, + "learning_rate": 2.1016767306306506e-05, + "loss": 
0.6746, + "step": 17753 + }, + { + "epoch": 3.1607905982905984, + "grad_norm": 0.9147419929504395, + "learning_rate": 2.1008183110829826e-05, + "loss": 0.7614, + "step": 17754 + }, + { + "epoch": 3.160968660968661, + "grad_norm": 0.949970543384552, + "learning_rate": 2.0999600463033696e-05, + "loss": 0.7278, + "step": 17755 + }, + { + "epoch": 3.1611467236467234, + "grad_norm": 0.9200690984725952, + "learning_rate": 2.099101936308633e-05, + "loss": 0.8333, + "step": 17756 + }, + { + "epoch": 3.161324786324786, + "grad_norm": 0.9482975006103516, + "learning_rate": 2.0982439811155753e-05, + "loss": 0.8056, + "step": 17757 + }, + { + "epoch": 3.161502849002849, + "grad_norm": 0.9989914298057556, + "learning_rate": 2.097386180741019e-05, + "loss": 0.9437, + "step": 17758 + }, + { + "epoch": 3.1616809116809117, + "grad_norm": 0.8893665075302124, + "learning_rate": 2.0965285352017572e-05, + "loss": 0.7423, + "step": 17759 + }, + { + "epoch": 3.1618589743589745, + "grad_norm": 0.8213073015213013, + "learning_rate": 2.0956710445146046e-05, + "loss": 0.7089, + "step": 17760 + }, + { + "epoch": 3.162037037037037, + "grad_norm": 1.022196650505066, + "learning_rate": 2.0948137086963583e-05, + "loss": 0.6745, + "step": 17761 + }, + { + "epoch": 3.1622150997150995, + "grad_norm": 0.9003040790557861, + "learning_rate": 2.093956527763815e-05, + "loss": 0.8741, + "step": 17762 + }, + { + "epoch": 3.1623931623931623, + "grad_norm": 0.9286761283874512, + "learning_rate": 2.0930995017337707e-05, + "loss": 0.8794, + "step": 17763 + }, + { + "epoch": 3.162571225071225, + "grad_norm": 1.0088931322097778, + "learning_rate": 2.092242630623016e-05, + "loss": 0.8463, + "step": 17764 + }, + { + "epoch": 3.1627492877492878, + "grad_norm": 0.9631438851356506, + "learning_rate": 2.091385914448337e-05, + "loss": 0.9442, + "step": 17765 + }, + { + "epoch": 3.1629273504273505, + "grad_norm": 0.9380853772163391, + "learning_rate": 2.090529353226528e-05, + "loss": 0.7912, + "step": 17766 + }, + { + 
"epoch": 3.1631054131054133, + "grad_norm": 1.0451303720474243, + "learning_rate": 2.0896729469743614e-05, + "loss": 0.7256, + "step": 17767 + }, + { + "epoch": 3.1632834757834756, + "grad_norm": 0.9581690430641174, + "learning_rate": 2.0888166957086252e-05, + "loss": 0.7764, + "step": 17768 + }, + { + "epoch": 3.1634615384615383, + "grad_norm": 1.0340020656585693, + "learning_rate": 2.087960599446087e-05, + "loss": 0.9355, + "step": 17769 + }, + { + "epoch": 3.163639601139601, + "grad_norm": 0.9682812690734863, + "learning_rate": 2.087104658203528e-05, + "loss": 0.7723, + "step": 17770 + }, + { + "epoch": 3.163817663817664, + "grad_norm": 1.0201079845428467, + "learning_rate": 2.0862488719977147e-05, + "loss": 0.8538, + "step": 17771 + }, + { + "epoch": 3.1639957264957266, + "grad_norm": 1.042189598083496, + "learning_rate": 2.0853932408454135e-05, + "loss": 0.8455, + "step": 17772 + }, + { + "epoch": 3.1641737891737893, + "grad_norm": 0.9279144406318665, + "learning_rate": 2.0845377647633957e-05, + "loss": 0.7129, + "step": 17773 + }, + { + "epoch": 3.164351851851852, + "grad_norm": 1.022481083869934, + "learning_rate": 2.0836824437684122e-05, + "loss": 0.6415, + "step": 17774 + }, + { + "epoch": 3.1645299145299144, + "grad_norm": 0.7837867736816406, + "learning_rate": 2.0828272778772285e-05, + "loss": 0.521, + "step": 17775 + }, + { + "epoch": 3.164707977207977, + "grad_norm": 0.9831959009170532, + "learning_rate": 2.081972267106599e-05, + "loss": 0.6261, + "step": 17776 + }, + { + "epoch": 3.16488603988604, + "grad_norm": 0.8895300626754761, + "learning_rate": 2.0811174114732734e-05, + "loss": 0.6477, + "step": 17777 + }, + { + "epoch": 3.1650641025641026, + "grad_norm": 0.8388190865516663, + "learning_rate": 2.0802627109940032e-05, + "loss": 0.6722, + "step": 17778 + }, + { + "epoch": 3.1652421652421654, + "grad_norm": 0.9670847654342651, + "learning_rate": 2.0794081656855324e-05, + "loss": 0.7609, + "step": 17779 + }, + { + "epoch": 3.1654202279202277, + 
"grad_norm": 0.805338978767395, + "learning_rate": 2.0785537755646055e-05, + "loss": 0.7714, + "step": 17780 + }, + { + "epoch": 3.1655982905982905, + "grad_norm": 0.9516651630401611, + "learning_rate": 2.0776995406479616e-05, + "loss": 0.6149, + "step": 17781 + }, + { + "epoch": 3.165776353276353, + "grad_norm": 0.8569229245185852, + "learning_rate": 2.0768454609523357e-05, + "loss": 0.6444, + "step": 17782 + }, + { + "epoch": 3.165954415954416, + "grad_norm": 0.9347574710845947, + "learning_rate": 2.0759915364944693e-05, + "loss": 0.6812, + "step": 17783 + }, + { + "epoch": 3.1661324786324787, + "grad_norm": 1.0011918544769287, + "learning_rate": 2.0751377672910834e-05, + "loss": 0.9513, + "step": 17784 + }, + { + "epoch": 3.1663105413105415, + "grad_norm": 0.794636607170105, + "learning_rate": 2.074284153358912e-05, + "loss": 0.5172, + "step": 17785 + }, + { + "epoch": 3.166488603988604, + "grad_norm": 1.019755482673645, + "learning_rate": 2.0734306947146785e-05, + "loss": 0.8476, + "step": 17786 + }, + { + "epoch": 3.1666666666666665, + "grad_norm": 1.0172061920166016, + "learning_rate": 2.0725773913751056e-05, + "loss": 0.7237, + "step": 17787 + }, + { + "epoch": 3.1668447293447293, + "grad_norm": 0.9893356561660767, + "learning_rate": 2.0717242433569095e-05, + "loss": 0.9456, + "step": 17788 + }, + { + "epoch": 3.167022792022792, + "grad_norm": 1.0239630937576294, + "learning_rate": 2.070871250676808e-05, + "loss": 0.7997, + "step": 17789 + }, + { + "epoch": 3.1672008547008548, + "grad_norm": 0.7666916847229004, + "learning_rate": 2.0700184133515098e-05, + "loss": 0.6818, + "step": 17790 + }, + { + "epoch": 3.1673789173789175, + "grad_norm": 1.1362230777740479, + "learning_rate": 2.069165731397732e-05, + "loss": 0.964, + "step": 17791 + }, + { + "epoch": 3.16755698005698, + "grad_norm": 1.055202841758728, + "learning_rate": 2.0683132048321752e-05, + "loss": 0.7176, + "step": 17792 + }, + { + "epoch": 3.1677350427350426, + "grad_norm": 1.0161933898925781, + 
"learning_rate": 2.067460833671545e-05, + "loss": 0.9733, + "step": 17793 + }, + { + "epoch": 3.1679131054131053, + "grad_norm": 0.922094464302063, + "learning_rate": 2.0666086179325417e-05, + "loss": 0.6734, + "step": 17794 + }, + { + "epoch": 3.168091168091168, + "grad_norm": 1.061632752418518, + "learning_rate": 2.0657565576318616e-05, + "loss": 0.7061, + "step": 17795 + }, + { + "epoch": 3.168269230769231, + "grad_norm": 0.9560376405715942, + "learning_rate": 2.0649046527862002e-05, + "loss": 0.7497, + "step": 17796 + }, + { + "epoch": 3.1684472934472936, + "grad_norm": 0.8954378366470337, + "learning_rate": 2.0640529034122457e-05, + "loss": 0.6278, + "step": 17797 + }, + { + "epoch": 3.1686253561253563, + "grad_norm": 0.9111878871917725, + "learning_rate": 2.0632013095266945e-05, + "loss": 0.858, + "step": 17798 + }, + { + "epoch": 3.1688034188034186, + "grad_norm": 1.0656862258911133, + "learning_rate": 2.0623498711462208e-05, + "loss": 0.7066, + "step": 17799 + }, + { + "epoch": 3.1689814814814814, + "grad_norm": 0.940841555595398, + "learning_rate": 2.0614985882875147e-05, + "loss": 0.8436, + "step": 17800 + }, + { + "epoch": 3.169159544159544, + "grad_norm": 0.929924726486206, + "learning_rate": 2.0606474609672532e-05, + "loss": 0.7138, + "step": 17801 + }, + { + "epoch": 3.169337606837607, + "grad_norm": 0.9759158492088318, + "learning_rate": 2.0597964892021127e-05, + "loss": 1.1146, + "step": 17802 + }, + { + "epoch": 3.1695156695156697, + "grad_norm": 0.8888540267944336, + "learning_rate": 2.058945673008765e-05, + "loss": 0.7375, + "step": 17803 + }, + { + "epoch": 3.169693732193732, + "grad_norm": 0.9520168900489807, + "learning_rate": 2.0580950124038812e-05, + "loss": 0.7527, + "step": 17804 + }, + { + "epoch": 3.1698717948717947, + "grad_norm": 0.9040802121162415, + "learning_rate": 2.057244507404127e-05, + "loss": 0.5453, + "step": 17805 + }, + { + "epoch": 3.1700498575498575, + "grad_norm": 0.9299812912940979, + "learning_rate": 
2.056394158026168e-05, + "loss": 0.8139, + "step": 17806 + }, + { + "epoch": 3.17022792022792, + "grad_norm": 0.9180371165275574, + "learning_rate": 2.0555439642866602e-05, + "loss": 0.7142, + "step": 17807 + }, + { + "epoch": 3.170405982905983, + "grad_norm": 1.016885757446289, + "learning_rate": 2.0546939262022725e-05, + "loss": 0.7723, + "step": 17808 + }, + { + "epoch": 3.1705840455840457, + "grad_norm": 0.9292659759521484, + "learning_rate": 2.0538440437896456e-05, + "loss": 0.853, + "step": 17809 + }, + { + "epoch": 3.1707621082621085, + "grad_norm": 1.0694403648376465, + "learning_rate": 2.052994317065441e-05, + "loss": 0.7869, + "step": 17810 + }, + { + "epoch": 3.1709401709401708, + "grad_norm": 0.8516891598701477, + "learning_rate": 2.052144746046304e-05, + "loss": 0.77, + "step": 17811 + }, + { + "epoch": 3.1711182336182335, + "grad_norm": 0.8941650986671448, + "learning_rate": 2.051295330748878e-05, + "loss": 0.568, + "step": 17812 + }, + { + "epoch": 3.1712962962962963, + "grad_norm": 0.8694541454315186, + "learning_rate": 2.050446071189813e-05, + "loss": 0.621, + "step": 17813 + }, + { + "epoch": 3.171474358974359, + "grad_norm": 0.9683319926261902, + "learning_rate": 2.0495969673857383e-05, + "loss": 0.8374, + "step": 17814 + }, + { + "epoch": 3.171652421652422, + "grad_norm": 0.8314747214317322, + "learning_rate": 2.0487480193533004e-05, + "loss": 0.6982, + "step": 17815 + }, + { + "epoch": 3.1718304843304845, + "grad_norm": 0.8739849328994751, + "learning_rate": 2.0478992271091234e-05, + "loss": 0.8121, + "step": 17816 + }, + { + "epoch": 3.172008547008547, + "grad_norm": 1.002616047859192, + "learning_rate": 2.0470505906698446e-05, + "loss": 0.8264, + "step": 17817 + }, + { + "epoch": 3.1721866096866096, + "grad_norm": 0.9982789754867554, + "learning_rate": 2.046202110052088e-05, + "loss": 0.6077, + "step": 17818 + }, + { + "epoch": 3.1723646723646723, + "grad_norm": 0.85958331823349, + "learning_rate": 2.045353785272479e-05, + "loss": 0.6998, + 
"step": 17819 + }, + { + "epoch": 3.172542735042735, + "grad_norm": 1.0867046117782593, + "learning_rate": 2.0445056163476374e-05, + "loss": 0.6445, + "step": 17820 + }, + { + "epoch": 3.172720797720798, + "grad_norm": 1.0782829523086548, + "learning_rate": 2.0436576032941834e-05, + "loss": 0.7187, + "step": 17821 + }, + { + "epoch": 3.1728988603988606, + "grad_norm": 1.0062856674194336, + "learning_rate": 2.0428097461287265e-05, + "loss": 0.8538, + "step": 17822 + }, + { + "epoch": 3.173076923076923, + "grad_norm": 1.019987940788269, + "learning_rate": 2.04196204486789e-05, + "loss": 0.6901, + "step": 17823 + }, + { + "epoch": 3.1732549857549857, + "grad_norm": 0.920170247554779, + "learning_rate": 2.0411144995282696e-05, + "loss": 0.6752, + "step": 17824 + }, + { + "epoch": 3.1734330484330484, + "grad_norm": 0.8132520318031311, + "learning_rate": 2.0402671101264802e-05, + "loss": 0.6694, + "step": 17825 + }, + { + "epoch": 3.173611111111111, + "grad_norm": 0.978809654712677, + "learning_rate": 2.0394198766791215e-05, + "loss": 0.6091, + "step": 17826 + }, + { + "epoch": 3.173789173789174, + "grad_norm": 0.7911710739135742, + "learning_rate": 2.0385727992027946e-05, + "loss": 0.7579, + "step": 17827 + }, + { + "epoch": 3.1739672364672367, + "grad_norm": 0.8443024754524231, + "learning_rate": 2.037725877714095e-05, + "loss": 0.6157, + "step": 17828 + }, + { + "epoch": 3.174145299145299, + "grad_norm": 0.9306837320327759, + "learning_rate": 2.036879112229616e-05, + "loss": 0.8692, + "step": 17829 + }, + { + "epoch": 3.1743233618233617, + "grad_norm": 0.8994047045707703, + "learning_rate": 2.036032502765949e-05, + "loss": 0.8047, + "step": 17830 + }, + { + "epoch": 3.1745014245014245, + "grad_norm": 0.9705090522766113, + "learning_rate": 2.035186049339679e-05, + "loss": 0.7923, + "step": 17831 + }, + { + "epoch": 3.1746794871794872, + "grad_norm": 0.9646127223968506, + "learning_rate": 2.0343397519673955e-05, + "loss": 0.7605, + "step": 17832 + }, + { + "epoch": 
3.17485754985755, + "grad_norm": 0.9448650479316711, + "learning_rate": 2.0334936106656775e-05, + "loss": 0.6693, + "step": 17833 + }, + { + "epoch": 3.1750356125356127, + "grad_norm": 0.9898852109909058, + "learning_rate": 2.032647625451103e-05, + "loss": 0.8309, + "step": 17834 + }, + { + "epoch": 3.175213675213675, + "grad_norm": 1.194535732269287, + "learning_rate": 2.031801796340248e-05, + "loss": 0.8283, + "step": 17835 + }, + { + "epoch": 3.175391737891738, + "grad_norm": 0.8253726959228516, + "learning_rate": 2.0309561233496832e-05, + "loss": 0.7098, + "step": 17836 + }, + { + "epoch": 3.1755698005698005, + "grad_norm": 0.8785462379455566, + "learning_rate": 2.0301106064959774e-05, + "loss": 0.8157, + "step": 17837 + }, + { + "epoch": 3.1757478632478633, + "grad_norm": 1.0798345804214478, + "learning_rate": 2.0292652457957028e-05, + "loss": 0.6791, + "step": 17838 + }, + { + "epoch": 3.175925925925926, + "grad_norm": 0.7758960127830505, + "learning_rate": 2.028420041265413e-05, + "loss": 0.5568, + "step": 17839 + }, + { + "epoch": 3.176103988603989, + "grad_norm": 0.833829402923584, + "learning_rate": 2.027574992921678e-05, + "loss": 0.6835, + "step": 17840 + }, + { + "epoch": 3.176282051282051, + "grad_norm": 0.8681000471115112, + "learning_rate": 2.0267301007810448e-05, + "loss": 0.7613, + "step": 17841 + }, + { + "epoch": 3.176460113960114, + "grad_norm": 0.9271478056907654, + "learning_rate": 2.025885364860075e-05, + "loss": 0.8355, + "step": 17842 + }, + { + "epoch": 3.1766381766381766, + "grad_norm": 0.9865009188652039, + "learning_rate": 2.025040785175316e-05, + "loss": 0.9064, + "step": 17843 + }, + { + "epoch": 3.1768162393162394, + "grad_norm": 1.0745911598205566, + "learning_rate": 2.024196361743317e-05, + "loss": 0.8944, + "step": 17844 + }, + { + "epoch": 3.176994301994302, + "grad_norm": 0.9590545296669006, + "learning_rate": 2.0233520945806218e-05, + "loss": 0.9389, + "step": 17845 + }, + { + "epoch": 3.177172364672365, + "grad_norm": 
0.9916484951972961, + "learning_rate": 2.0225079837037718e-05, + "loss": 0.6494, + "step": 17846 + }, + { + "epoch": 3.177350427350427, + "grad_norm": 0.8701820969581604, + "learning_rate": 2.0216640291293043e-05, + "loss": 0.8206, + "step": 17847 + }, + { + "epoch": 3.17752849002849, + "grad_norm": 0.882976233959198, + "learning_rate": 2.020820230873761e-05, + "loss": 0.6473, + "step": 17848 + }, + { + "epoch": 3.1777065527065527, + "grad_norm": 0.9599972367286682, + "learning_rate": 2.0199765889536647e-05, + "loss": 0.8647, + "step": 17849 + }, + { + "epoch": 3.1778846153846154, + "grad_norm": 0.864414632320404, + "learning_rate": 2.0191331033855564e-05, + "loss": 0.7128, + "step": 17850 + }, + { + "epoch": 3.178062678062678, + "grad_norm": 1.1498304605484009, + "learning_rate": 2.0182897741859497e-05, + "loss": 0.8783, + "step": 17851 + }, + { + "epoch": 3.178240740740741, + "grad_norm": 0.8664734363555908, + "learning_rate": 2.0174466013713777e-05, + "loss": 0.7432, + "step": 17852 + }, + { + "epoch": 3.1784188034188032, + "grad_norm": 0.8619598150253296, + "learning_rate": 2.0166035849583566e-05, + "loss": 0.7127, + "step": 17853 + }, + { + "epoch": 3.178596866096866, + "grad_norm": 0.8788446187973022, + "learning_rate": 2.0157607249634024e-05, + "loss": 0.7772, + "step": 17854 + }, + { + "epoch": 3.1787749287749287, + "grad_norm": 0.9376852512359619, + "learning_rate": 2.014918021403035e-05, + "loss": 0.7643, + "step": 17855 + }, + { + "epoch": 3.1789529914529915, + "grad_norm": 1.3296799659729004, + "learning_rate": 2.0140754742937575e-05, + "loss": 0.7753, + "step": 17856 + }, + { + "epoch": 3.1791310541310542, + "grad_norm": 0.932537853717804, + "learning_rate": 2.013233083652084e-05, + "loss": 0.7868, + "step": 17857 + }, + { + "epoch": 3.179309116809117, + "grad_norm": 0.8196058869361877, + "learning_rate": 2.0123908494945164e-05, + "loss": 0.6838, + "step": 17858 + }, + { + "epoch": 3.1794871794871793, + "grad_norm": 0.8767826557159424, + 
"learning_rate": 2.011548771837558e-05, + "loss": 0.5893, + "step": 17859 + }, + { + "epoch": 3.179665242165242, + "grad_norm": 0.9560354948043823, + "learning_rate": 2.0107068506977068e-05, + "loss": 0.6576, + "step": 17860 + }, + { + "epoch": 3.179843304843305, + "grad_norm": 1.0848370790481567, + "learning_rate": 2.0098650860914582e-05, + "loss": 1.0265, + "step": 17861 + }, + { + "epoch": 3.1800213675213675, + "grad_norm": 0.8908656239509583, + "learning_rate": 2.0090234780353022e-05, + "loss": 0.6375, + "step": 17862 + }, + { + "epoch": 3.1801994301994303, + "grad_norm": 1.0982664823532104, + "learning_rate": 2.0081820265457374e-05, + "loss": 0.7475, + "step": 17863 + }, + { + "epoch": 3.180377492877493, + "grad_norm": 1.0687603950500488, + "learning_rate": 2.007340731639239e-05, + "loss": 0.9097, + "step": 17864 + }, + { + "epoch": 3.1805555555555554, + "grad_norm": 1.0065104961395264, + "learning_rate": 2.0064995933323004e-05, + "loss": 0.7049, + "step": 17865 + }, + { + "epoch": 3.180733618233618, + "grad_norm": 0.9913015961647034, + "learning_rate": 2.0056586116413923e-05, + "loss": 0.7879, + "step": 17866 + }, + { + "epoch": 3.180911680911681, + "grad_norm": 0.8901994824409485, + "learning_rate": 2.0048177865829987e-05, + "loss": 0.7791, + "step": 17867 + }, + { + "epoch": 3.1810897435897436, + "grad_norm": 0.8973708748817444, + "learning_rate": 2.0039771181735935e-05, + "loss": 0.7945, + "step": 17868 + }, + { + "epoch": 3.1812678062678064, + "grad_norm": 0.9505265951156616, + "learning_rate": 2.003136606429645e-05, + "loss": 0.7165, + "step": 17869 + }, + { + "epoch": 3.181445868945869, + "grad_norm": 1.0017229318618774, + "learning_rate": 2.0022962513676237e-05, + "loss": 0.9075, + "step": 17870 + }, + { + "epoch": 3.1816239316239314, + "grad_norm": 0.876743495464325, + "learning_rate": 2.0014560530039893e-05, + "loss": 0.574, + "step": 17871 + }, + { + "epoch": 3.181801994301994, + "grad_norm": 0.9605728983879089, + "learning_rate": 
2.0006160113552118e-05, + "loss": 0.7468, + "step": 17872 + }, + { + "epoch": 3.181980056980057, + "grad_norm": 1.0196484327316284, + "learning_rate": 1.999776126437747e-05, + "loss": 0.7313, + "step": 17873 + }, + { + "epoch": 3.1821581196581197, + "grad_norm": 0.842857301235199, + "learning_rate": 1.9989363982680487e-05, + "loss": 0.5271, + "step": 17874 + }, + { + "epoch": 3.1823361823361824, + "grad_norm": 0.9760614633560181, + "learning_rate": 1.998096826862571e-05, + "loss": 0.8287, + "step": 17875 + }, + { + "epoch": 3.182514245014245, + "grad_norm": 0.9114407896995544, + "learning_rate": 1.997257412237764e-05, + "loss": 0.7172, + "step": 17876 + }, + { + "epoch": 3.1826923076923075, + "grad_norm": 0.9301532506942749, + "learning_rate": 1.9964181544100747e-05, + "loss": 0.6226, + "step": 17877 + }, + { + "epoch": 3.1828703703703702, + "grad_norm": 0.8122416138648987, + "learning_rate": 1.995579053395944e-05, + "loss": 0.6236, + "step": 17878 + }, + { + "epoch": 3.183048433048433, + "grad_norm": 0.9543060660362244, + "learning_rate": 1.9947401092118124e-05, + "loss": 0.7392, + "step": 17879 + }, + { + "epoch": 3.1832264957264957, + "grad_norm": 1.0016096830368042, + "learning_rate": 1.9939013218741233e-05, + "loss": 0.7618, + "step": 17880 + }, + { + "epoch": 3.1834045584045585, + "grad_norm": 0.8758784532546997, + "learning_rate": 1.9930626913993012e-05, + "loss": 0.7702, + "step": 17881 + }, + { + "epoch": 3.1835826210826212, + "grad_norm": 0.9122433662414551, + "learning_rate": 1.9922242178037864e-05, + "loss": 0.7759, + "step": 17882 + }, + { + "epoch": 3.183760683760684, + "grad_norm": 0.8537415266036987, + "learning_rate": 1.9913859011040025e-05, + "loss": 0.5062, + "step": 17883 + }, + { + "epoch": 3.1839387464387463, + "grad_norm": 0.9870179295539856, + "learning_rate": 1.9905477413163754e-05, + "loss": 0.6001, + "step": 17884 + }, + { + "epoch": 3.184116809116809, + "grad_norm": 0.9088004231452942, + "learning_rate": 1.989709738457328e-05, + "loss": 
0.6941, + "step": 17885 + }, + { + "epoch": 3.184294871794872, + "grad_norm": 0.9028190970420837, + "learning_rate": 1.988871892543277e-05, + "loss": 0.9215, + "step": 17886 + }, + { + "epoch": 3.1844729344729346, + "grad_norm": 0.9333863258361816, + "learning_rate": 1.9880342035906406e-05, + "loss": 0.8036, + "step": 17887 + }, + { + "epoch": 3.1846509971509973, + "grad_norm": 1.0232222080230713, + "learning_rate": 1.987196671615831e-05, + "loss": 0.8573, + "step": 17888 + }, + { + "epoch": 3.1848290598290596, + "grad_norm": Infinity, + "learning_rate": 1.987196671615831e-05, + "loss": 0.7025, + "step": 17889 + }, + { + "epoch": 3.1850071225071224, + "grad_norm": 0.9075542092323303, + "learning_rate": 1.986359296635254e-05, + "loss": 0.7867, + "step": 17890 + }, + { + "epoch": 3.185185185185185, + "grad_norm": 1.0360100269317627, + "learning_rate": 1.9855220786653217e-05, + "loss": 0.7186, + "step": 17891 + }, + { + "epoch": 3.185363247863248, + "grad_norm": 0.8505464792251587, + "learning_rate": 1.9846850177224363e-05, + "loss": 0.7619, + "step": 17892 + }, + { + "epoch": 3.1855413105413106, + "grad_norm": 0.8706018328666687, + "learning_rate": 1.983848113822998e-05, + "loss": 0.8204, + "step": 17893 + }, + { + "epoch": 3.1857193732193734, + "grad_norm": 1.1926771402359009, + "learning_rate": 1.9830113669834038e-05, + "loss": 0.7418, + "step": 17894 + }, + { + "epoch": 3.185897435897436, + "grad_norm": 0.8450924754142761, + "learning_rate": 1.9821747772200448e-05, + "loss": 0.6028, + "step": 17895 + }, + { + "epoch": 3.1860754985754984, + "grad_norm": 0.888190746307373, + "learning_rate": 1.9813383445493207e-05, + "loss": 0.7498, + "step": 17896 + }, + { + "epoch": 3.186253561253561, + "grad_norm": 1.065687894821167, + "learning_rate": 1.9805020689876096e-05, + "loss": 0.9076, + "step": 17897 + }, + { + "epoch": 3.186431623931624, + "grad_norm": 0.9114495515823364, + "learning_rate": 1.9796659505513038e-05, + "loss": 0.828, + "step": 17898 + }, + { + "epoch": 
3.1866096866096867, + "grad_norm": 0.9044122695922852, + "learning_rate": 1.978829989256783e-05, + "loss": 0.7941, + "step": 17899 + }, + { + "epoch": 3.1867877492877494, + "grad_norm": 0.8886293768882751, + "learning_rate": 1.9779941851204265e-05, + "loss": 0.8974, + "step": 17900 + }, + { + "epoch": 3.1869658119658117, + "grad_norm": 0.9635893106460571, + "learning_rate": 1.9771585381586087e-05, + "loss": 0.7498, + "step": 17901 + }, + { + "epoch": 3.1871438746438745, + "grad_norm": 0.9525110721588135, + "learning_rate": 1.9763230483877037e-05, + "loss": 0.6705, + "step": 17902 + }, + { + "epoch": 3.1873219373219372, + "grad_norm": 1.0253328084945679, + "learning_rate": 1.9754877158240805e-05, + "loss": 0.7001, + "step": 17903 + }, + { + "epoch": 3.1875, + "grad_norm": 1.0464524030685425, + "learning_rate": 1.9746525404841064e-05, + "loss": 0.7115, + "step": 17904 + }, + { + "epoch": 3.1876780626780628, + "grad_norm": 0.9318157434463501, + "learning_rate": 1.9738175223841428e-05, + "loss": 0.7629, + "step": 17905 + }, + { + "epoch": 3.1878561253561255, + "grad_norm": 0.9584185481071472, + "learning_rate": 1.9729826615405557e-05, + "loss": 0.6182, + "step": 17906 + }, + { + "epoch": 3.1880341880341883, + "grad_norm": 0.8253262639045715, + "learning_rate": 1.9721479579696945e-05, + "loss": 0.5826, + "step": 17907 + }, + { + "epoch": 3.1882122507122506, + "grad_norm": 1.0227715969085693, + "learning_rate": 1.9713134116879195e-05, + "loss": 0.7201, + "step": 17908 + }, + { + "epoch": 3.1883903133903133, + "grad_norm": 0.8838222026824951, + "learning_rate": 1.97047902271158e-05, + "loss": 0.6851, + "step": 17909 + }, + { + "epoch": 3.188568376068376, + "grad_norm": 0.8144074082374573, + "learning_rate": 1.969644791057025e-05, + "loss": 0.7016, + "step": 17910 + }, + { + "epoch": 3.188746438746439, + "grad_norm": 1.1255220174789429, + "learning_rate": 1.9688107167405977e-05, + "loss": 1.0059, + "step": 17911 + }, + { + "epoch": 3.1889245014245016, + "grad_norm": 
0.9504788517951965, + "learning_rate": 1.967976799778639e-05, + "loss": 0.8138, + "step": 17912 + }, + { + "epoch": 3.189102564102564, + "grad_norm": 1.0853493213653564, + "learning_rate": 1.9671430401874946e-05, + "loss": 0.9565, + "step": 17913 + }, + { + "epoch": 3.1892806267806266, + "grad_norm": 0.9323904514312744, + "learning_rate": 1.9663094379834912e-05, + "loss": 0.7243, + "step": 17914 + }, + { + "epoch": 3.1894586894586894, + "grad_norm": 0.9153193831443787, + "learning_rate": 1.965475993182968e-05, + "loss": 0.7891, + "step": 17915 + }, + { + "epoch": 3.189636752136752, + "grad_norm": 0.9310745596885681, + "learning_rate": 1.9646427058022522e-05, + "loss": 0.7926, + "step": 17916 + }, + { + "epoch": 3.189814814814815, + "grad_norm": 0.7946666479110718, + "learning_rate": 1.96380957585767e-05, + "loss": 0.5099, + "step": 17917 + }, + { + "epoch": 3.1899928774928776, + "grad_norm": 0.9780072569847107, + "learning_rate": 1.9629766033655472e-05, + "loss": 0.8119, + "step": 17918 + }, + { + "epoch": 3.1901709401709404, + "grad_norm": 0.886121928691864, + "learning_rate": 1.962143788342201e-05, + "loss": 0.8426, + "step": 17919 + }, + { + "epoch": 3.1903490028490027, + "grad_norm": 0.9156065583229065, + "learning_rate": 1.961311130803948e-05, + "loss": 0.7428, + "step": 17920 + }, + { + "epoch": 3.1905270655270654, + "grad_norm": 0.9247871041297913, + "learning_rate": 1.9604786307671085e-05, + "loss": 0.7394, + "step": 17921 + }, + { + "epoch": 3.190705128205128, + "grad_norm": 0.8814037442207336, + "learning_rate": 1.959646288247986e-05, + "loss": 0.6733, + "step": 17922 + }, + { + "epoch": 3.190883190883191, + "grad_norm": 0.915059506893158, + "learning_rate": 1.9588141032628958e-05, + "loss": 1.0306, + "step": 17923 + }, + { + "epoch": 3.1910612535612537, + "grad_norm": 1.0811846256256104, + "learning_rate": 1.9579820758281353e-05, + "loss": 1.0782, + "step": 17924 + }, + { + "epoch": 3.191239316239316, + "grad_norm": 0.8332127332687378, + "learning_rate": 
1.957150205960012e-05, + "loss": 0.8365, + "step": 17925 + }, + { + "epoch": 3.1914173789173788, + "grad_norm": 0.9311001896858215, + "learning_rate": 1.9563184936748236e-05, + "loss": 0.7995, + "step": 17926 + }, + { + "epoch": 3.1915954415954415, + "grad_norm": 0.844411313533783, + "learning_rate": 1.9554869389888642e-05, + "loss": 0.7573, + "step": 17927 + }, + { + "epoch": 3.1917735042735043, + "grad_norm": 1.0677951574325562, + "learning_rate": 1.9546555419184277e-05, + "loss": 0.8501, + "step": 17928 + }, + { + "epoch": 3.191951566951567, + "grad_norm": 1.0033360719680786, + "learning_rate": 1.953824302479803e-05, + "loss": 0.8433, + "step": 17929 + }, + { + "epoch": 3.1921296296296298, + "grad_norm": 0.9265921711921692, + "learning_rate": 1.9529932206892732e-05, + "loss": 0.901, + "step": 17930 + }, + { + "epoch": 3.1923076923076925, + "grad_norm": 0.9688268303871155, + "learning_rate": 1.952162296563128e-05, + "loss": 0.743, + "step": 17931 + }, + { + "epoch": 3.192485754985755, + "grad_norm": 0.7872363328933716, + "learning_rate": 1.9513315301176462e-05, + "loss": 0.608, + "step": 17932 + }, + { + "epoch": 3.1926638176638176, + "grad_norm": 0.9641526937484741, + "learning_rate": 1.9505009213691015e-05, + "loss": 0.9266, + "step": 17933 + }, + { + "epoch": 3.1928418803418803, + "grad_norm": 0.9330029487609863, + "learning_rate": 1.9496704703337708e-05, + "loss": 0.8647, + "step": 17934 + }, + { + "epoch": 3.193019943019943, + "grad_norm": 1.0177229642868042, + "learning_rate": 1.9488401770279207e-05, + "loss": 0.7655, + "step": 17935 + }, + { + "epoch": 3.193198005698006, + "grad_norm": 1.0131559371948242, + "learning_rate": 1.948010041467828e-05, + "loss": 0.8529, + "step": 17936 + }, + { + "epoch": 3.1933760683760686, + "grad_norm": 0.8644171357154846, + "learning_rate": 1.947180063669748e-05, + "loss": 0.6328, + "step": 17937 + }, + { + "epoch": 3.193554131054131, + "grad_norm": 0.8475095629692078, + "learning_rate": 1.9463502436499503e-05, + "loss": 
0.5732, + "step": 17938 + }, + { + "epoch": 3.1937321937321936, + "grad_norm": 0.8605408668518066, + "learning_rate": 1.9455205814246846e-05, + "loss": 0.6729, + "step": 17939 + }, + { + "epoch": 3.1939102564102564, + "grad_norm": 0.8682053685188293, + "learning_rate": 1.9446910770102145e-05, + "loss": 0.7593, + "step": 17940 + }, + { + "epoch": 3.194088319088319, + "grad_norm": 0.9077824950218201, + "learning_rate": 1.943861730422788e-05, + "loss": 0.7351, + "step": 17941 + }, + { + "epoch": 3.194266381766382, + "grad_norm": 0.9960265159606934, + "learning_rate": 1.9430325416786564e-05, + "loss": 0.7917, + "step": 17942 + }, + { + "epoch": 3.1944444444444446, + "grad_norm": 0.9843694567680359, + "learning_rate": 1.942203510794066e-05, + "loss": 0.812, + "step": 17943 + }, + { + "epoch": 3.194622507122507, + "grad_norm": 1.003712773323059, + "learning_rate": 1.941374637785258e-05, + "loss": 0.6471, + "step": 17944 + }, + { + "epoch": 3.1948005698005697, + "grad_norm": 0.926785409450531, + "learning_rate": 1.940545922668472e-05, + "loss": 0.8304, + "step": 17945 + }, + { + "epoch": 3.1949786324786325, + "grad_norm": 0.927204966545105, + "learning_rate": 1.939717365459952e-05, + "loss": 0.7698, + "step": 17946 + }, + { + "epoch": 3.195156695156695, + "grad_norm": 0.8717413544654846, + "learning_rate": 1.93888896617592e-05, + "loss": 0.7019, + "step": 17947 + }, + { + "epoch": 3.195334757834758, + "grad_norm": 0.8853579163551331, + "learning_rate": 1.938060724832619e-05, + "loss": 0.688, + "step": 17948 + }, + { + "epoch": 3.1955128205128207, + "grad_norm": 0.8655480742454529, + "learning_rate": 1.937232641446266e-05, + "loss": 0.5438, + "step": 17949 + }, + { + "epoch": 3.195690883190883, + "grad_norm": 1.0422619581222534, + "learning_rate": 1.936404716033092e-05, + "loss": 0.7796, + "step": 17950 + }, + { + "epoch": 3.1958689458689458, + "grad_norm": 0.9323866963386536, + "learning_rate": 1.9355769486093178e-05, + "loss": 0.8504, + "step": 17951 + }, + { + "epoch": 
3.1960470085470085, + "grad_norm": 0.8164253830909729, + "learning_rate": 1.9347493391911585e-05, + "loss": 0.65, + "step": 17952 + }, + { + "epoch": 3.1962250712250713, + "grad_norm": 1.019322156906128, + "learning_rate": 1.9339218877948375e-05, + "loss": 0.7625, + "step": 17953 + }, + { + "epoch": 3.196403133903134, + "grad_norm": 1.0221740007400513, + "learning_rate": 1.9330945944365542e-05, + "loss": 0.766, + "step": 17954 + }, + { + "epoch": 3.1965811965811968, + "grad_norm": 1.0685542821884155, + "learning_rate": 1.9322674591325284e-05, + "loss": 0.6684, + "step": 17955 + }, + { + "epoch": 3.196759259259259, + "grad_norm": 0.9320390820503235, + "learning_rate": 1.9314404818989628e-05, + "loss": 0.8024, + "step": 17956 + }, + { + "epoch": 3.196937321937322, + "grad_norm": 0.9577845335006714, + "learning_rate": 1.9306136627520598e-05, + "loss": 0.8412, + "step": 17957 + }, + { + "epoch": 3.1971153846153846, + "grad_norm": 0.8444850444793701, + "learning_rate": 1.9297870017080187e-05, + "loss": 0.6378, + "step": 17958 + }, + { + "epoch": 3.1972934472934473, + "grad_norm": 0.905194103717804, + "learning_rate": 1.928960498783038e-05, + "loss": 0.7239, + "step": 17959 + }, + { + "epoch": 3.19747150997151, + "grad_norm": 0.9918950200080872, + "learning_rate": 1.9281341539933095e-05, + "loss": 0.8202, + "step": 17960 + }, + { + "epoch": 3.197649572649573, + "grad_norm": 0.9659561514854431, + "learning_rate": 1.927307967355024e-05, + "loss": 0.9186, + "step": 17961 + }, + { + "epoch": 3.197827635327635, + "grad_norm": 1.061000108718872, + "learning_rate": 1.926481938884368e-05, + "loss": 0.8304, + "step": 17962 + }, + { + "epoch": 3.198005698005698, + "grad_norm": 1.0199217796325684, + "learning_rate": 1.9256560685975313e-05, + "loss": 0.7649, + "step": 17963 + }, + { + "epoch": 3.1981837606837606, + "grad_norm": 0.9994050860404968, + "learning_rate": 1.9248303565106862e-05, + "loss": 0.7811, + "step": 17964 + }, + { + "epoch": 3.1983618233618234, + "grad_norm": 
1.0911402702331543, + "learning_rate": 1.924004802640018e-05, + "loss": 0.7703, + "step": 17965 + }, + { + "epoch": 3.198539886039886, + "grad_norm": 0.878711998462677, + "learning_rate": 1.9231794070016995e-05, + "loss": 0.8968, + "step": 17966 + }, + { + "epoch": 3.198717948717949, + "grad_norm": 0.965321958065033, + "learning_rate": 1.9223541696119017e-05, + "loss": 0.646, + "step": 17967 + }, + { + "epoch": 3.198896011396011, + "grad_norm": 1.1991591453552246, + "learning_rate": 1.9215290904867945e-05, + "loss": 0.8747, + "step": 17968 + }, + { + "epoch": 3.199074074074074, + "grad_norm": 0.9755321145057678, + "learning_rate": 1.920704169642543e-05, + "loss": 0.7916, + "step": 17969 + }, + { + "epoch": 3.1992521367521367, + "grad_norm": 0.8560118079185486, + "learning_rate": 1.9198794070953074e-05, + "loss": 0.8143, + "step": 17970 + }, + { + "epoch": 3.1994301994301995, + "grad_norm": 1.079470157623291, + "learning_rate": 1.9190548028612532e-05, + "loss": 0.7934, + "step": 17971 + }, + { + "epoch": 3.199608262108262, + "grad_norm": 0.8946192264556885, + "learning_rate": 1.918230356956533e-05, + "loss": 0.8815, + "step": 17972 + }, + { + "epoch": 3.199786324786325, + "grad_norm": 0.8701974749565125, + "learning_rate": 1.9174060693973007e-05, + "loss": 0.7742, + "step": 17973 + }, + { + "epoch": 3.1999643874643873, + "grad_norm": 1.059370756149292, + "learning_rate": 1.9165819401997053e-05, + "loss": 0.7672, + "step": 17974 + }, + { + "epoch": 3.20014245014245, + "grad_norm": 1.151624083518982, + "learning_rate": 1.9157579693798955e-05, + "loss": 0.737, + "step": 17975 + }, + { + "epoch": 3.2003205128205128, + "grad_norm": 1.0536963939666748, + "learning_rate": 1.9149341569540158e-05, + "loss": 0.9501, + "step": 17976 + }, + { + "epoch": 3.2004985754985755, + "grad_norm": 0.8573943376541138, + "learning_rate": 1.9141105029382024e-05, + "loss": 0.7525, + "step": 17977 + }, + { + "epoch": 3.2006766381766383, + "grad_norm": 0.9258541464805603, + "learning_rate": 
1.913287007348602e-05, + "loss": 0.6852, + "step": 17978 + }, + { + "epoch": 3.200854700854701, + "grad_norm": 0.9687190055847168, + "learning_rate": 1.9124636702013387e-05, + "loss": 0.7443, + "step": 17979 + }, + { + "epoch": 3.2010327635327633, + "grad_norm": 1.2201204299926758, + "learning_rate": 1.911640491512552e-05, + "loss": 0.7101, + "step": 17980 + }, + { + "epoch": 3.201210826210826, + "grad_norm": 1.032789707183838, + "learning_rate": 1.9108174712983675e-05, + "loss": 0.8236, + "step": 17981 + }, + { + "epoch": 3.201388888888889, + "grad_norm": 0.9653711318969727, + "learning_rate": 1.909994609574911e-05, + "loss": 0.582, + "step": 17982 + }, + { + "epoch": 3.2015669515669516, + "grad_norm": 0.9730488657951355, + "learning_rate": 1.9091719063583058e-05, + "loss": 0.8163, + "step": 17983 + }, + { + "epoch": 3.2017450142450143, + "grad_norm": 0.8995687365531921, + "learning_rate": 1.9083493616646686e-05, + "loss": 0.8088, + "step": 17984 + }, + { + "epoch": 3.201923076923077, + "grad_norm": 0.9465620517730713, + "learning_rate": 1.9075269755101167e-05, + "loss": 0.855, + "step": 17985 + }, + { + "epoch": 3.2021011396011394, + "grad_norm": 0.954304575920105, + "learning_rate": 1.9067047479107636e-05, + "loss": 0.7283, + "step": 17986 + }, + { + "epoch": 3.202279202279202, + "grad_norm": 0.9493895769119263, + "learning_rate": 1.9058826788827156e-05, + "loss": 0.7459, + "step": 17987 + }, + { + "epoch": 3.202457264957265, + "grad_norm": 0.9088132977485657, + "learning_rate": 1.9050607684420873e-05, + "loss": 0.8661, + "step": 17988 + }, + { + "epoch": 3.2026353276353277, + "grad_norm": 0.8793889880180359, + "learning_rate": 1.904239016604973e-05, + "loss": 0.6837, + "step": 17989 + }, + { + "epoch": 3.2028133903133904, + "grad_norm": 0.9571384191513062, + "learning_rate": 1.9034174233874803e-05, + "loss": 0.8113, + "step": 17990 + }, + { + "epoch": 3.202991452991453, + "grad_norm": 1.0003505945205688, + "learning_rate": 1.902595988805703e-05, + "loss": 
0.7563, + "step": 17991 + }, + { + "epoch": 3.2031695156695155, + "grad_norm": 1.017555832862854, + "learning_rate": 1.9017747128757347e-05, + "loss": 0.8769, + "step": 17992 + }, + { + "epoch": 3.203347578347578, + "grad_norm": 0.8811057209968567, + "learning_rate": 1.9009535956136738e-05, + "loss": 0.7809, + "step": 17993 + }, + { + "epoch": 3.203525641025641, + "grad_norm": 0.9344915747642517, + "learning_rate": 1.9001326370355966e-05, + "loss": 0.7215, + "step": 17994 + }, + { + "epoch": 3.2037037037037037, + "grad_norm": 0.851786196231842, + "learning_rate": 1.8993118371576002e-05, + "loss": 0.6358, + "step": 17995 + }, + { + "epoch": 3.2038817663817665, + "grad_norm": 1.012872576713562, + "learning_rate": 1.8984911959957552e-05, + "loss": 0.6153, + "step": 17996 + }, + { + "epoch": 3.2040598290598292, + "grad_norm": 0.8867791891098022, + "learning_rate": 1.8976707135661488e-05, + "loss": 0.8027, + "step": 17997 + }, + { + "epoch": 3.2042378917378915, + "grad_norm": 0.954986572265625, + "learning_rate": 1.8968503898848532e-05, + "loss": 0.7257, + "step": 17998 + }, + { + "epoch": 3.2044159544159543, + "grad_norm": 1.006103277206421, + "learning_rate": 1.8960302249679418e-05, + "loss": 0.7259, + "step": 17999 + }, + { + "epoch": 3.204594017094017, + "grad_norm": 0.9252307415008545, + "learning_rate": 1.8952102188314835e-05, + "loss": 0.8521, + "step": 18000 + }, + { + "epoch": 3.20477207977208, + "grad_norm": 0.8885179758071899, + "learning_rate": 1.8943903714915447e-05, + "loss": 0.7776, + "step": 18001 + }, + { + "epoch": 3.2049501424501425, + "grad_norm": 0.9005871415138245, + "learning_rate": 1.8935706829641865e-05, + "loss": 0.8127, + "step": 18002 + }, + { + "epoch": 3.2051282051282053, + "grad_norm": 0.9123699069023132, + "learning_rate": 1.8927511532654753e-05, + "loss": 0.8827, + "step": 18003 + }, + { + "epoch": 3.205306267806268, + "grad_norm": 0.8967127203941345, + "learning_rate": 1.891931782411459e-05, + "loss": 0.586, + "step": 18004 + }, + { + 
"epoch": 3.2054843304843303, + "grad_norm": 0.8950326442718506, + "learning_rate": 1.8911125704181987e-05, + "loss": 0.8074, + "step": 18005 + }, + { + "epoch": 3.205662393162393, + "grad_norm": 0.8951900005340576, + "learning_rate": 1.8902935173017434e-05, + "loss": 0.8439, + "step": 18006 + }, + { + "epoch": 3.205840455840456, + "grad_norm": 0.9931556582450867, + "learning_rate": 1.88947462307814e-05, + "loss": 0.7222, + "step": 18007 + }, + { + "epoch": 3.2060185185185186, + "grad_norm": 0.8889574408531189, + "learning_rate": 1.888655887763433e-05, + "loss": 0.7148, + "step": 18008 + }, + { + "epoch": 3.2061965811965814, + "grad_norm": 0.9963932037353516, + "learning_rate": 1.8878373113736648e-05, + "loss": 0.7062, + "step": 18009 + }, + { + "epoch": 3.2063746438746437, + "grad_norm": 0.9292500019073486, + "learning_rate": 1.8870188939248724e-05, + "loss": 0.4827, + "step": 18010 + }, + { + "epoch": 3.2065527065527064, + "grad_norm": 1.1208369731903076, + "learning_rate": 1.8862006354330887e-05, + "loss": 0.9324, + "step": 18011 + }, + { + "epoch": 3.206730769230769, + "grad_norm": 0.9836950302124023, + "learning_rate": 1.885382535914352e-05, + "loss": 0.8058, + "step": 18012 + }, + { + "epoch": 3.206908831908832, + "grad_norm": 0.8917511701583862, + "learning_rate": 1.884564595384687e-05, + "loss": 0.8087, + "step": 18013 + }, + { + "epoch": 3.2070868945868947, + "grad_norm": 0.9193904399871826, + "learning_rate": 1.88374681386012e-05, + "loss": 0.9069, + "step": 18014 + }, + { + "epoch": 3.2072649572649574, + "grad_norm": 0.9080356359481812, + "learning_rate": 1.882929191356675e-05, + "loss": 0.6272, + "step": 18015 + }, + { + "epoch": 3.20744301994302, + "grad_norm": 0.955551028251648, + "learning_rate": 1.88211172789037e-05, + "loss": 0.8133, + "step": 18016 + }, + { + "epoch": 3.2076210826210825, + "grad_norm": 1.1204555034637451, + "learning_rate": 1.88129442347722e-05, + "loss": 0.7411, + "step": 18017 + }, + { + "epoch": 3.2077991452991452, + 
"grad_norm": 0.8058112859725952, + "learning_rate": 1.880477278133247e-05, + "loss": 0.6178, + "step": 18018 + }, + { + "epoch": 3.207977207977208, + "grad_norm": 0.9366341233253479, + "learning_rate": 1.8796602918744475e-05, + "loss": 0.7446, + "step": 18019 + }, + { + "epoch": 3.2081552706552707, + "grad_norm": 0.9692829251289368, + "learning_rate": 1.878843464716843e-05, + "loss": 0.8648, + "step": 18020 + }, + { + "epoch": 3.2083333333333335, + "grad_norm": 0.831867516040802, + "learning_rate": 1.8780267966764253e-05, + "loss": 0.6409, + "step": 18021 + }, + { + "epoch": 3.208511396011396, + "grad_norm": 1.0365211963653564, + "learning_rate": 1.877210287769202e-05, + "loss": 0.725, + "step": 18022 + }, + { + "epoch": 3.2086894586894585, + "grad_norm": 0.8367540836334229, + "learning_rate": 1.8763939380111705e-05, + "loss": 0.6112, + "step": 18023 + }, + { + "epoch": 3.2088675213675213, + "grad_norm": 0.9757236838340759, + "learning_rate": 1.8755777474183235e-05, + "loss": 0.8073, + "step": 18024 + }, + { + "epoch": 3.209045584045584, + "grad_norm": 0.930181622505188, + "learning_rate": 1.8747617160066543e-05, + "loss": 0.833, + "step": 18025 + }, + { + "epoch": 3.209223646723647, + "grad_norm": 0.8707014918327332, + "learning_rate": 1.873945843792151e-05, + "loss": 0.7944, + "step": 18026 + }, + { + "epoch": 3.2094017094017095, + "grad_norm": 0.8476516008377075, + "learning_rate": 1.8731301307907946e-05, + "loss": 0.8548, + "step": 18027 + }, + { + "epoch": 3.2095797720797723, + "grad_norm": 1.193192481994629, + "learning_rate": 1.8723145770185768e-05, + "loss": 0.9114, + "step": 18028 + }, + { + "epoch": 3.2097578347578346, + "grad_norm": 0.8446243405342102, + "learning_rate": 1.8714991824914662e-05, + "loss": 0.6701, + "step": 18029 + }, + { + "epoch": 3.2099358974358974, + "grad_norm": 0.9350818991661072, + "learning_rate": 1.8706839472254457e-05, + "loss": 0.7339, + "step": 18030 + }, + { + "epoch": 3.21011396011396, + "grad_norm": 1.001449465751648, + 
"learning_rate": 1.8698688712364864e-05, + "loss": 0.5902, + "step": 18031 + }, + { + "epoch": 3.210292022792023, + "grad_norm": 0.9731200337409973, + "learning_rate": 1.8690539545405582e-05, + "loss": 1.0019, + "step": 18032 + }, + { + "epoch": 3.2104700854700856, + "grad_norm": 1.0137463808059692, + "learning_rate": 1.868239197153626e-05, + "loss": 0.7242, + "step": 18033 + }, + { + "epoch": 3.210648148148148, + "grad_norm": 1.0123406648635864, + "learning_rate": 1.8674245990916528e-05, + "loss": 0.7168, + "step": 18034 + }, + { + "epoch": 3.2108262108262107, + "grad_norm": 0.9725518822669983, + "learning_rate": 1.8666101603706054e-05, + "loss": 0.8522, + "step": 18035 + }, + { + "epoch": 3.2110042735042734, + "grad_norm": 0.9783751964569092, + "learning_rate": 1.8657958810064315e-05, + "loss": 0.8206, + "step": 18036 + }, + { + "epoch": 3.211182336182336, + "grad_norm": 0.8869835734367371, + "learning_rate": 1.8649817610150923e-05, + "loss": 0.7986, + "step": 18037 + }, + { + "epoch": 3.211360398860399, + "grad_norm": 0.8768937587738037, + "learning_rate": 1.8641678004125363e-05, + "loss": 0.8307, + "step": 18038 + }, + { + "epoch": 3.2115384615384617, + "grad_norm": 0.8711394667625427, + "learning_rate": 1.863353999214712e-05, + "loss": 0.6374, + "step": 18039 + }, + { + "epoch": 3.2117165242165244, + "grad_norm": 1.0543094873428345, + "learning_rate": 1.862540357437563e-05, + "loss": 0.7071, + "step": 18040 + }, + { + "epoch": 3.2118945868945867, + "grad_norm": 0.8850560784339905, + "learning_rate": 1.861726875097032e-05, + "loss": 0.6987, + "step": 18041 + }, + { + "epoch": 3.2120726495726495, + "grad_norm": 0.8987910151481628, + "learning_rate": 1.860913552209055e-05, + "loss": 0.6632, + "step": 18042 + }, + { + "epoch": 3.2122507122507122, + "grad_norm": 0.9093405604362488, + "learning_rate": 1.860100388789574e-05, + "loss": 0.7153, + "step": 18043 + }, + { + "epoch": 3.212428774928775, + "grad_norm": 0.9864318370819092, + "learning_rate": 
1.859287384854511e-05, + "loss": 0.8924, + "step": 18044 + }, + { + "epoch": 3.2126068376068377, + "grad_norm": 1.0086519718170166, + "learning_rate": 1.8584745404198066e-05, + "loss": 0.7406, + "step": 18045 + }, + { + "epoch": 3.2127849002849005, + "grad_norm": 0.8840755224227905, + "learning_rate": 1.8576618555013758e-05, + "loss": 0.756, + "step": 18046 + }, + { + "epoch": 3.212962962962963, + "grad_norm": 0.9135591983795166, + "learning_rate": 1.856849330115149e-05, + "loss": 0.565, + "step": 18047 + }, + { + "epoch": 3.2131410256410255, + "grad_norm": 0.9007353186607361, + "learning_rate": 1.856036964277045e-05, + "loss": 0.71, + "step": 18048 + }, + { + "epoch": 3.2133190883190883, + "grad_norm": 0.8159099817276001, + "learning_rate": 1.855224758002978e-05, + "loss": 0.5132, + "step": 18049 + }, + { + "epoch": 3.213497150997151, + "grad_norm": 0.9244372844696045, + "learning_rate": 1.8544127113088617e-05, + "loss": 0.813, + "step": 18050 + }, + { + "epoch": 3.213675213675214, + "grad_norm": 0.7983192205429077, + "learning_rate": 1.853600824210606e-05, + "loss": 0.6238, + "step": 18051 + }, + { + "epoch": 3.2138532763532766, + "grad_norm": 0.8808383941650391, + "learning_rate": 1.8527890967241212e-05, + "loss": 0.785, + "step": 18052 + }, + { + "epoch": 3.214031339031339, + "grad_norm": 0.8894946575164795, + "learning_rate": 1.8519775288653108e-05, + "loss": 0.8035, + "step": 18053 + }, + { + "epoch": 3.2142094017094016, + "grad_norm": 1.2804183959960938, + "learning_rate": 1.851166120650074e-05, + "loss": 0.7088, + "step": 18054 + }, + { + "epoch": 3.2143874643874644, + "grad_norm": 0.9724962711334229, + "learning_rate": 1.85035487209431e-05, + "loss": 0.7504, + "step": 18055 + }, + { + "epoch": 3.214565527065527, + "grad_norm": 1.0876386165618896, + "learning_rate": 1.8495437832139117e-05, + "loss": 0.8069, + "step": 18056 + }, + { + "epoch": 3.21474358974359, + "grad_norm": 0.9543545842170715, + "learning_rate": 1.8487328540247715e-05, + "loss": 0.8907, + 
"step": 18057 + }, + { + "epoch": 3.2149216524216526, + "grad_norm": 0.9640414118766785, + "learning_rate": 1.8479220845427802e-05, + "loss": 0.759, + "step": 18058 + }, + { + "epoch": 3.215099715099715, + "grad_norm": 0.8389877676963806, + "learning_rate": 1.847111474783817e-05, + "loss": 0.7234, + "step": 18059 + }, + { + "epoch": 3.2152777777777777, + "grad_norm": 0.9118861556053162, + "learning_rate": 1.8463010247637746e-05, + "loss": 0.852, + "step": 18060 + }, + { + "epoch": 3.2154558404558404, + "grad_norm": 0.9392750859260559, + "learning_rate": 1.8454907344985194e-05, + "loss": 0.8615, + "step": 18061 + }, + { + "epoch": 3.215633903133903, + "grad_norm": 0.9700032472610474, + "learning_rate": 1.8446806040039367e-05, + "loss": 0.7131, + "step": 18062 + }, + { + "epoch": 3.215811965811966, + "grad_norm": 0.894807755947113, + "learning_rate": 1.8438706332958965e-05, + "loss": 0.7727, + "step": 18063 + }, + { + "epoch": 3.2159900284900287, + "grad_norm": 0.85206538438797, + "learning_rate": 1.843060822390269e-05, + "loss": 0.7985, + "step": 18064 + }, + { + "epoch": 3.216168091168091, + "grad_norm": 1.073621392250061, + "learning_rate": 1.842251171302919e-05, + "loss": 0.7787, + "step": 18065 + }, + { + "epoch": 3.2163461538461537, + "grad_norm": 0.8918595314025879, + "learning_rate": 1.8414416800497125e-05, + "loss": 0.7749, + "step": 18066 + }, + { + "epoch": 3.2165242165242165, + "grad_norm": 0.8499172329902649, + "learning_rate": 1.8406323486465072e-05, + "loss": 0.3994, + "step": 18067 + }, + { + "epoch": 3.2167022792022792, + "grad_norm": 0.8019580245018005, + "learning_rate": 1.8398231771091613e-05, + "loss": 0.6772, + "step": 18068 + }, + { + "epoch": 3.216880341880342, + "grad_norm": 0.9750977754592896, + "learning_rate": 1.8390141654535265e-05, + "loss": 0.7399, + "step": 18069 + }, + { + "epoch": 3.2170584045584047, + "grad_norm": 1.0433316230773926, + "learning_rate": 1.8382053136954592e-05, + "loss": 0.767, + "step": 18070 + }, + { + "epoch": 
3.217236467236467, + "grad_norm": 0.8822398781776428, + "learning_rate": 1.8373966218508043e-05, + "loss": 0.7635, + "step": 18071 + }, + { + "epoch": 3.21741452991453, + "grad_norm": 0.9446876049041748, + "learning_rate": 1.836588089935405e-05, + "loss": 0.6262, + "step": 18072 + }, + { + "epoch": 3.2175925925925926, + "grad_norm": 0.8917283415794373, + "learning_rate": 1.8357797179651047e-05, + "loss": 0.6993, + "step": 18073 + }, + { + "epoch": 3.2177706552706553, + "grad_norm": 0.8711534738540649, + "learning_rate": 1.8349715059557393e-05, + "loss": 0.7275, + "step": 18074 + }, + { + "epoch": 3.217948717948718, + "grad_norm": 1.0784484148025513, + "learning_rate": 1.83416345392315e-05, + "loss": 0.8006, + "step": 18075 + }, + { + "epoch": 3.218126780626781, + "grad_norm": 0.8539456725120544, + "learning_rate": 1.83335556188316e-05, + "loss": 0.68, + "step": 18076 + }, + { + "epoch": 3.218304843304843, + "grad_norm": 0.9416542053222656, + "learning_rate": 1.8325478298516063e-05, + "loss": 0.7378, + "step": 18077 + }, + { + "epoch": 3.218482905982906, + "grad_norm": 0.9718979001045227, + "learning_rate": 1.831740257844311e-05, + "loss": 0.7148, + "step": 18078 + }, + { + "epoch": 3.2186609686609686, + "grad_norm": 0.8982722163200378, + "learning_rate": 1.830932845877097e-05, + "loss": 0.8073, + "step": 18079 + }, + { + "epoch": 3.2188390313390314, + "grad_norm": 0.9609165191650391, + "learning_rate": 1.8301255939657834e-05, + "loss": 0.7771, + "step": 18080 + }, + { + "epoch": 3.219017094017094, + "grad_norm": 0.9416554570198059, + "learning_rate": 1.8293185021261884e-05, + "loss": 0.822, + "step": 18081 + }, + { + "epoch": 3.219195156695157, + "grad_norm": 0.8695271611213684, + "learning_rate": 1.828511570374124e-05, + "loss": 0.6192, + "step": 18082 + }, + { + "epoch": 3.219373219373219, + "grad_norm": 0.8800650238990784, + "learning_rate": 1.8277047987254003e-05, + "loss": 0.6805, + "step": 18083 + }, + { + "epoch": 3.219551282051282, + "grad_norm": 
0.9437993764877319, + "learning_rate": 1.8268981871958225e-05, + "loss": 0.7948, + "step": 18084 + }, + { + "epoch": 3.2197293447293447, + "grad_norm": 1.0480988025665283, + "learning_rate": 1.826091735801202e-05, + "loss": 0.8144, + "step": 18085 + }, + { + "epoch": 3.2199074074074074, + "grad_norm": 0.9806156158447266, + "learning_rate": 1.8252854445573276e-05, + "loss": 0.5886, + "step": 18086 + }, + { + "epoch": 3.22008547008547, + "grad_norm": 1.0387870073318481, + "learning_rate": 1.8244793134800075e-05, + "loss": 0.7026, + "step": 18087 + }, + { + "epoch": 3.220263532763533, + "grad_norm": 1.1478195190429688, + "learning_rate": 1.8236733425850305e-05, + "loss": 0.7987, + "step": 18088 + }, + { + "epoch": 3.2204415954415953, + "grad_norm": 0.9605230093002319, + "learning_rate": 1.8228675318881906e-05, + "loss": 0.6876, + "step": 18089 + }, + { + "epoch": 3.220619658119658, + "grad_norm": 0.955807089805603, + "learning_rate": 1.822061881405275e-05, + "loss": 0.6625, + "step": 18090 + }, + { + "epoch": 3.2207977207977208, + "grad_norm": 0.8398988842964172, + "learning_rate": 1.8212563911520664e-05, + "loss": 0.6911, + "step": 18091 + }, + { + "epoch": 3.2209757834757835, + "grad_norm": 0.863762378692627, + "learning_rate": 1.8204510611443526e-05, + "loss": 0.7064, + "step": 18092 + }, + { + "epoch": 3.2211538461538463, + "grad_norm": 0.908447802066803, + "learning_rate": 1.8196458913979053e-05, + "loss": 0.7601, + "step": 18093 + }, + { + "epoch": 3.221331908831909, + "grad_norm": 0.9673567414283752, + "learning_rate": 1.818840881928505e-05, + "loss": 0.9187, + "step": 18094 + }, + { + "epoch": 3.2215099715099713, + "grad_norm": 0.9876523613929749, + "learning_rate": 1.818036032751923e-05, + "loss": 1.1142, + "step": 18095 + }, + { + "epoch": 3.221688034188034, + "grad_norm": 0.9074375033378601, + "learning_rate": 1.8172313438839284e-05, + "loss": 0.7491, + "step": 18096 + }, + { + "epoch": 3.221866096866097, + "grad_norm": 0.9420785903930664, + 
"learning_rate": 1.8164268153402875e-05, + "loss": 0.7464, + "step": 18097 + }, + { + "epoch": 3.2220441595441596, + "grad_norm": 0.9435096979141235, + "learning_rate": 1.815622447136762e-05, + "loss": 0.7585, + "step": 18098 + }, + { + "epoch": 3.2222222222222223, + "grad_norm": 0.8565973043441772, + "learning_rate": 1.814818239289112e-05, + "loss": 0.6909, + "step": 18099 + }, + { + "epoch": 3.222400284900285, + "grad_norm": 0.9183077812194824, + "learning_rate": 1.8140141918131004e-05, + "loss": 0.6136, + "step": 18100 + }, + { + "epoch": 3.2225783475783474, + "grad_norm": 0.8680392503738403, + "learning_rate": 1.813210304724471e-05, + "loss": 0.6443, + "step": 18101 + }, + { + "epoch": 3.22275641025641, + "grad_norm": 0.9555699229240417, + "learning_rate": 1.812406578038983e-05, + "loss": 0.8166, + "step": 18102 + }, + { + "epoch": 3.222934472934473, + "grad_norm": 1.0517683029174805, + "learning_rate": 1.811603011772375e-05, + "loss": 0.7405, + "step": 18103 + }, + { + "epoch": 3.2231125356125356, + "grad_norm": 0.8452093601226807, + "learning_rate": 1.8107996059403985e-05, + "loss": 0.8957, + "step": 18104 + }, + { + "epoch": 3.2232905982905984, + "grad_norm": 1.0134345293045044, + "learning_rate": 1.8099963605587932e-05, + "loss": 0.9799, + "step": 18105 + }, + { + "epoch": 3.223468660968661, + "grad_norm": 0.9043877720832825, + "learning_rate": 1.809193275643295e-05, + "loss": 0.6479, + "step": 18106 + }, + { + "epoch": 3.2236467236467234, + "grad_norm": 1.0354737043380737, + "learning_rate": 1.80839035120964e-05, + "loss": 0.8212, + "step": 18107 + }, + { + "epoch": 3.223824786324786, + "grad_norm": 1.0781883001327515, + "learning_rate": 1.8075875872735602e-05, + "loss": 0.8305, + "step": 18108 + }, + { + "epoch": 3.224002849002849, + "grad_norm": 0.9643750786781311, + "learning_rate": 1.80678498385078e-05, + "loss": 0.5684, + "step": 18109 + }, + { + "epoch": 3.2241809116809117, + "grad_norm": 0.9557623267173767, + "learning_rate": 1.8059825409570308e-05, 
+ "loss": 0.7995, + "step": 18110 + }, + { + "epoch": 3.2243589743589745, + "grad_norm": 0.9005846381187439, + "learning_rate": 1.8051802586080312e-05, + "loss": 0.655, + "step": 18111 + }, + { + "epoch": 3.224537037037037, + "grad_norm": 0.985146701335907, + "learning_rate": 1.8043781368195024e-05, + "loss": 0.9267, + "step": 18112 + }, + { + "epoch": 3.2247150997150995, + "grad_norm": 0.9472562670707703, + "learning_rate": 1.803576175607159e-05, + "loss": 0.7262, + "step": 18113 + }, + { + "epoch": 3.2248931623931623, + "grad_norm": 1.2001210451126099, + "learning_rate": 1.802774374986711e-05, + "loss": 0.7161, + "step": 18114 + }, + { + "epoch": 3.225071225071225, + "grad_norm": 0.8562248945236206, + "learning_rate": 1.801972734973876e-05, + "loss": 0.7848, + "step": 18115 + }, + { + "epoch": 3.2252492877492878, + "grad_norm": 0.942823052406311, + "learning_rate": 1.8011712555843496e-05, + "loss": 0.8428, + "step": 18116 + }, + { + "epoch": 3.2254273504273505, + "grad_norm": 0.9686674475669861, + "learning_rate": 1.8003699368338466e-05, + "loss": 0.6471, + "step": 18117 + }, + { + "epoch": 3.2256054131054133, + "grad_norm": 1.0388344526290894, + "learning_rate": 1.7995687787380566e-05, + "loss": 0.7895, + "step": 18118 + }, + { + "epoch": 3.2257834757834756, + "grad_norm": 0.9582616686820984, + "learning_rate": 1.7987677813126836e-05, + "loss": 0.7369, + "step": 18119 + }, + { + "epoch": 3.2259615384615383, + "grad_norm": 0.9572505354881287, + "learning_rate": 1.7979669445734194e-05, + "loss": 0.7163, + "step": 18120 + }, + { + "epoch": 3.226139601139601, + "grad_norm": 0.9522523880004883, + "learning_rate": 1.7971662685359547e-05, + "loss": 0.8887, + "step": 18121 + }, + { + "epoch": 3.226317663817664, + "grad_norm": 1.0779093503952026, + "learning_rate": 1.7963657532159773e-05, + "loss": 0.8593, + "step": 18122 + }, + { + "epoch": 3.2264957264957266, + "grad_norm": 0.926485538482666, + "learning_rate": 1.795565398629171e-05, + "loss": 0.7664, + "step": 18123 + 
}, + { + "epoch": 3.2266737891737893, + "grad_norm": 1.0132343769073486, + "learning_rate": 1.7947652047912144e-05, + "loss": 0.7165, + "step": 18124 + }, + { + "epoch": 3.226851851851852, + "grad_norm": 0.8460900783538818, + "learning_rate": 1.793965171717795e-05, + "loss": 0.5995, + "step": 18125 + }, + { + "epoch": 3.2270299145299144, + "grad_norm": 0.8448793292045593, + "learning_rate": 1.793165299424576e-05, + "loss": 0.6078, + "step": 18126 + }, + { + "epoch": 3.227207977207977, + "grad_norm": 1.0588792562484741, + "learning_rate": 1.7923655879272393e-05, + "loss": 0.7189, + "step": 18127 + }, + { + "epoch": 3.22738603988604, + "grad_norm": 0.9055169820785522, + "learning_rate": 1.7915660372414443e-05, + "loss": 0.7475, + "step": 18128 + }, + { + "epoch": 3.2275641025641026, + "grad_norm": 0.9485293030738831, + "learning_rate": 1.7907666473828643e-05, + "loss": 0.7638, + "step": 18129 + }, + { + "epoch": 3.2277421652421654, + "grad_norm": 0.8959475755691528, + "learning_rate": 1.7899674183671576e-05, + "loss": 0.7253, + "step": 18130 + }, + { + "epoch": 3.2279202279202277, + "grad_norm": 0.9282844066619873, + "learning_rate": 1.789168350209983e-05, + "loss": 0.7165, + "step": 18131 + }, + { + "epoch": 3.2280982905982905, + "grad_norm": 0.9341546297073364, + "learning_rate": 1.7883694429270026e-05, + "loss": 0.7162, + "step": 18132 + }, + { + "epoch": 3.228276353276353, + "grad_norm": 0.9306660890579224, + "learning_rate": 1.787570696533859e-05, + "loss": 0.7153, + "step": 18133 + }, + { + "epoch": 3.228454415954416, + "grad_norm": 1.024999976158142, + "learning_rate": 1.786772111046212e-05, + "loss": 0.819, + "step": 18134 + }, + { + "epoch": 3.2286324786324787, + "grad_norm": 0.9674655795097351, + "learning_rate": 1.7859736864797027e-05, + "loss": 0.9286, + "step": 18135 + }, + { + "epoch": 3.2288105413105415, + "grad_norm": 0.9502766728401184, + "learning_rate": 1.7851754228499763e-05, + "loss": 0.6461, + "step": 18136 + }, + { + "epoch": 3.228988603988604, 
+ "grad_norm": 1.0583183765411377, + "learning_rate": 1.784377320172672e-05, + "loss": 0.812, + "step": 18137 + }, + { + "epoch": 3.2291666666666665, + "grad_norm": 0.9369902014732361, + "learning_rate": 1.783579378463428e-05, + "loss": 0.7371, + "step": 18138 + }, + { + "epoch": 3.2293447293447293, + "grad_norm": 0.9534862637519836, + "learning_rate": 1.7827815977378782e-05, + "loss": 0.8534, + "step": 18139 + }, + { + "epoch": 3.229522792022792, + "grad_norm": 0.9020227789878845, + "learning_rate": 1.7819839780116533e-05, + "loss": 0.6451, + "step": 18140 + }, + { + "epoch": 3.2297008547008548, + "grad_norm": 1.0265909433364868, + "learning_rate": 1.7811865193003784e-05, + "loss": 0.9164, + "step": 18141 + }, + { + "epoch": 3.2298789173789175, + "grad_norm": 0.8694081902503967, + "learning_rate": 1.7803892216196848e-05, + "loss": 0.6521, + "step": 18142 + }, + { + "epoch": 3.23005698005698, + "grad_norm": 0.9590677618980408, + "learning_rate": 1.7795920849851854e-05, + "loss": 0.787, + "step": 18143 + }, + { + "epoch": 3.2302350427350426, + "grad_norm": 0.9131260514259338, + "learning_rate": 1.7787951094125055e-05, + "loss": 0.7033, + "step": 18144 + }, + { + "epoch": 3.2304131054131053, + "grad_norm": 0.9520593881607056, + "learning_rate": 1.7779982949172568e-05, + "loss": 0.7207, + "step": 18145 + }, + { + "epoch": 3.230591168091168, + "grad_norm": 0.8993405103683472, + "learning_rate": 1.7772016415150518e-05, + "loss": 0.6616, + "step": 18146 + }, + { + "epoch": 3.230769230769231, + "grad_norm": 0.9910643100738525, + "learning_rate": 1.7764051492214994e-05, + "loss": 0.7724, + "step": 18147 + }, + { + "epoch": 3.2309472934472936, + "grad_norm": 0.9680421948432922, + "learning_rate": 1.7756088180522045e-05, + "loss": 0.7549, + "step": 18148 + }, + { + "epoch": 3.2311253561253563, + "grad_norm": 0.8450651168823242, + "learning_rate": 1.774812648022768e-05, + "loss": 0.6817, + "step": 18149 + }, + { + "epoch": 3.2313034188034186, + "grad_norm": 1.028572678565979, 
+ "learning_rate": 1.7740166391487946e-05, + "loss": 0.7072, + "step": 18150 + }, + { + "epoch": 3.2314814814814814, + "grad_norm": 0.9712592959403992, + "learning_rate": 1.7732207914458754e-05, + "loss": 0.6691, + "step": 18151 + }, + { + "epoch": 3.231659544159544, + "grad_norm": 1.0358368158340454, + "learning_rate": 1.772425104929607e-05, + "loss": 0.6961, + "step": 18152 + }, + { + "epoch": 3.231837606837607, + "grad_norm": 1.058782935142517, + "learning_rate": 1.771629579615576e-05, + "loss": 0.7609, + "step": 18153 + }, + { + "epoch": 3.2320156695156697, + "grad_norm": 0.9332877993583679, + "learning_rate": 1.7708342155193715e-05, + "loss": 0.7787, + "step": 18154 + }, + { + "epoch": 3.232193732193732, + "grad_norm": 0.9538434147834778, + "learning_rate": 1.7700390126565747e-05, + "loss": 0.8176, + "step": 18155 + }, + { + "epoch": 3.2323717948717947, + "grad_norm": 0.9596571922302246, + "learning_rate": 1.7692439710427655e-05, + "loss": 0.8309, + "step": 18156 + }, + { + "epoch": 3.2325498575498575, + "grad_norm": 1.0157568454742432, + "learning_rate": 1.768449090693528e-05, + "loss": 1.1549, + "step": 18157 + }, + { + "epoch": 3.23272792022792, + "grad_norm": 1.0346252918243408, + "learning_rate": 1.7676543716244254e-05, + "loss": 0.892, + "step": 18158 + }, + { + "epoch": 3.232905982905983, + "grad_norm": 0.8448618650436401, + "learning_rate": 1.766859813851037e-05, + "loss": 0.6618, + "step": 18159 + }, + { + "epoch": 3.2330840455840457, + "grad_norm": 0.9361056685447693, + "learning_rate": 1.766065417388928e-05, + "loss": 0.6775, + "step": 18160 + }, + { + "epoch": 3.2332621082621085, + "grad_norm": 1.0730383396148682, + "learning_rate": 1.7652711822536617e-05, + "loss": 0.7644, + "step": 18161 + }, + { + "epoch": 3.2334401709401708, + "grad_norm": 0.8408196568489075, + "learning_rate": 1.7644771084608015e-05, + "loss": 0.742, + "step": 18162 + }, + { + "epoch": 3.2336182336182335, + "grad_norm": 1.196700096130371, + "learning_rate": 
1.7636831960259038e-05, + "loss": 0.7993, + "step": 18163 + }, + { + "epoch": 3.2337962962962963, + "grad_norm": 0.781534731388092, + "learning_rate": 1.7628894449645238e-05, + "loss": 0.5319, + "step": 18164 + }, + { + "epoch": 3.233974358974359, + "grad_norm": 0.9107591509819031, + "learning_rate": 1.762095855292215e-05, + "loss": 0.6581, + "step": 18165 + }, + { + "epoch": 3.234152421652422, + "grad_norm": 1.1640515327453613, + "learning_rate": 1.7613024270245226e-05, + "loss": 1.0668, + "step": 18166 + }, + { + "epoch": 3.2343304843304845, + "grad_norm": 0.9013523459434509, + "learning_rate": 1.7605091601769998e-05, + "loss": 0.5969, + "step": 18167 + }, + { + "epoch": 3.234508547008547, + "grad_norm": 0.8890687227249146, + "learning_rate": 1.759716054765178e-05, + "loss": 0.5108, + "step": 18168 + }, + { + "epoch": 3.2346866096866096, + "grad_norm": 0.995286226272583, + "learning_rate": 1.7589231108046068e-05, + "loss": 0.8642, + "step": 18169 + }, + { + "epoch": 3.2348646723646723, + "grad_norm": 0.8938494920730591, + "learning_rate": 1.7581303283108164e-05, + "loss": 0.7484, + "step": 18170 + }, + { + "epoch": 3.235042735042735, + "grad_norm": 0.9409791827201843, + "learning_rate": 1.757337707299339e-05, + "loss": 0.7209, + "step": 18171 + }, + { + "epoch": 3.235220797720798, + "grad_norm": 0.8960477709770203, + "learning_rate": 1.7565452477857115e-05, + "loss": 0.7344, + "step": 18172 + }, + { + "epoch": 3.2353988603988606, + "grad_norm": 1.0025544166564941, + "learning_rate": 1.755752949785451e-05, + "loss": 0.8146, + "step": 18173 + }, + { + "epoch": 3.235576923076923, + "grad_norm": 1.1479045152664185, + "learning_rate": 1.7549608133140895e-05, + "loss": 0.9372, + "step": 18174 + }, + { + "epoch": 3.2357549857549857, + "grad_norm": 0.8442793488502502, + "learning_rate": 1.754168838387139e-05, + "loss": 0.6714, + "step": 18175 + }, + { + "epoch": 3.2359330484330484, + "grad_norm": 0.985490083694458, + "learning_rate": 1.753377025020123e-05, + "loss": 
0.9846, + "step": 18176 + }, + { + "epoch": 3.236111111111111, + "grad_norm": 1.0424950122833252, + "learning_rate": 1.7525853732285545e-05, + "loss": 0.8757, + "step": 18177 + }, + { + "epoch": 3.236289173789174, + "grad_norm": 0.8219842314720154, + "learning_rate": 1.7517938830279423e-05, + "loss": 0.5383, + "step": 18178 + }, + { + "epoch": 3.2364672364672367, + "grad_norm": 0.9854603409767151, + "learning_rate": 1.751002554433795e-05, + "loss": 0.8043, + "step": 18179 + }, + { + "epoch": 3.236645299145299, + "grad_norm": 0.9642623662948608, + "learning_rate": 1.7502113874616167e-05, + "loss": 0.6575, + "step": 18180 + }, + { + "epoch": 3.2368233618233617, + "grad_norm": 0.9247049689292908, + "learning_rate": 1.7494203821269062e-05, + "loss": 0.6501, + "step": 18181 + }, + { + "epoch": 3.2370014245014245, + "grad_norm": 0.8187673091888428, + "learning_rate": 1.748629538445169e-05, + "loss": 0.5873, + "step": 18182 + }, + { + "epoch": 3.2371794871794872, + "grad_norm": 0.957827627658844, + "learning_rate": 1.7478388564318905e-05, + "loss": 0.8297, + "step": 18183 + }, + { + "epoch": 3.23735754985755, + "grad_norm": 1.021424651145935, + "learning_rate": 1.747048336102569e-05, + "loss": 0.6894, + "step": 18184 + }, + { + "epoch": 3.2375356125356127, + "grad_norm": 0.9959956407546997, + "learning_rate": 1.746257977472693e-05, + "loss": 0.7137, + "step": 18185 + }, + { + "epoch": 3.237713675213675, + "grad_norm": 0.9278098940849304, + "learning_rate": 1.7454677805577445e-05, + "loss": 0.8262, + "step": 18186 + }, + { + "epoch": 3.237891737891738, + "grad_norm": 0.9757589101791382, + "learning_rate": 1.7446777453732088e-05, + "loss": 0.7696, + "step": 18187 + }, + { + "epoch": 3.2380698005698005, + "grad_norm": 0.9537236094474792, + "learning_rate": 1.743887871934563e-05, + "loss": 0.8835, + "step": 18188 + }, + { + "epoch": 3.2382478632478633, + "grad_norm": 1.00649893283844, + "learning_rate": 1.743098160257284e-05, + "loss": 0.7316, + "step": 18189 + }, + { + 
"epoch": 3.238425925925926, + "grad_norm": 0.8231672644615173, + "learning_rate": 1.742308610356842e-05, + "loss": 0.7836, + "step": 18190 + }, + { + "epoch": 3.238603988603989, + "grad_norm": 1.019817590713501, + "learning_rate": 1.741519222248711e-05, + "loss": 0.8218, + "step": 18191 + }, + { + "epoch": 3.238782051282051, + "grad_norm": 1.0095518827438354, + "learning_rate": 1.740729995948356e-05, + "loss": 0.7905, + "step": 18192 + }, + { + "epoch": 3.238960113960114, + "grad_norm": 1.031113862991333, + "learning_rate": 1.739940931471239e-05, + "loss": 0.7208, + "step": 18193 + }, + { + "epoch": 3.2391381766381766, + "grad_norm": 0.7880609035491943, + "learning_rate": 1.7391520288328212e-05, + "loss": 0.598, + "step": 18194 + }, + { + "epoch": 3.2393162393162394, + "grad_norm": 0.9234829545021057, + "learning_rate": 1.738363288048558e-05, + "loss": 0.7713, + "step": 18195 + }, + { + "epoch": 3.239494301994302, + "grad_norm": 0.8925933241844177, + "learning_rate": 1.7375747091339024e-05, + "loss": 0.6134, + "step": 18196 + }, + { + "epoch": 3.239672364672365, + "grad_norm": 0.9539459943771362, + "learning_rate": 1.736786292104312e-05, + "loss": 0.794, + "step": 18197 + }, + { + "epoch": 3.239850427350427, + "grad_norm": 0.9326531291007996, + "learning_rate": 1.7359980369752228e-05, + "loss": 0.6804, + "step": 18198 + }, + { + "epoch": 3.24002849002849, + "grad_norm": 0.9107605218887329, + "learning_rate": 1.7352099437620907e-05, + "loss": 0.6677, + "step": 18199 + }, + { + "epoch": 3.2402065527065527, + "grad_norm": 1.0316441059112549, + "learning_rate": 1.7344220124803468e-05, + "loss": 0.7564, + "step": 18200 + }, + { + "epoch": 3.2403846153846154, + "grad_norm": 1.1710155010223389, + "learning_rate": 1.7336342431454345e-05, + "loss": 0.8665, + "step": 18201 + }, + { + "epoch": 3.240562678062678, + "grad_norm": 1.002057671546936, + "learning_rate": 1.732846635772788e-05, + "loss": 0.6974, + "step": 18202 + }, + { + "epoch": 3.240740740740741, + "grad_norm": 
0.8545549511909485, + "learning_rate": 1.732059190377838e-05, + "loss": 0.6291, + "step": 18203 + }, + { + "epoch": 3.2409188034188032, + "grad_norm": 0.9257392287254333, + "learning_rate": 1.731271906976013e-05, + "loss": 0.8188, + "step": 18204 + }, + { + "epoch": 3.241096866096866, + "grad_norm": 0.8662429451942444, + "learning_rate": 1.7304847855827388e-05, + "loss": 0.7252, + "step": 18205 + }, + { + "epoch": 3.2412749287749287, + "grad_norm": 0.8656207919120789, + "learning_rate": 1.7296978262134333e-05, + "loss": 0.6465, + "step": 18206 + }, + { + "epoch": 3.2414529914529915, + "grad_norm": 0.8379673361778259, + "learning_rate": 1.728911028883524e-05, + "loss": 0.7603, + "step": 18207 + }, + { + "epoch": 3.2416310541310542, + "grad_norm": 0.8996121287345886, + "learning_rate": 1.7281243936084168e-05, + "loss": 0.7718, + "step": 18208 + }, + { + "epoch": 3.241809116809117, + "grad_norm": 0.9700300097465515, + "learning_rate": 1.727337920403531e-05, + "loss": 0.9161, + "step": 18209 + }, + { + "epoch": 3.2419871794871793, + "grad_norm": 0.9493532776832581, + "learning_rate": 1.726551609284275e-05, + "loss": 0.8266, + "step": 18210 + }, + { + "epoch": 3.242165242165242, + "grad_norm": 0.9822573661804199, + "learning_rate": 1.7257654602660523e-05, + "loss": 0.743, + "step": 18211 + }, + { + "epoch": 3.242343304843305, + "grad_norm": 0.9318371415138245, + "learning_rate": 1.7249794733642665e-05, + "loss": 0.6753, + "step": 18212 + }, + { + "epoch": 3.2425213675213675, + "grad_norm": 0.9471957683563232, + "learning_rate": 1.7241936485943168e-05, + "loss": 0.8213, + "step": 18213 + }, + { + "epoch": 3.2426994301994303, + "grad_norm": 1.0344058275222778, + "learning_rate": 1.7234079859716057e-05, + "loss": 0.897, + "step": 18214 + }, + { + "epoch": 3.242877492877493, + "grad_norm": 1.0287681818008423, + "learning_rate": 1.722622485511517e-05, + "loss": 0.8328, + "step": 18215 + }, + { + "epoch": 3.2430555555555554, + "grad_norm": 1.0139912366867065, + 
"learning_rate": 1.721837147229448e-05, + "loss": 0.7253, + "step": 18216 + }, + { + "epoch": 3.243233618233618, + "grad_norm": 0.9414315819740295, + "learning_rate": 1.7210519711407847e-05, + "loss": 0.6769, + "step": 18217 + }, + { + "epoch": 3.243411680911681, + "grad_norm": 0.9098498225212097, + "learning_rate": 1.7202669572609088e-05, + "loss": 0.8154, + "step": 18218 + }, + { + "epoch": 3.2435897435897436, + "grad_norm": 0.8019130825996399, + "learning_rate": 1.7194821056052033e-05, + "loss": 0.7373, + "step": 18219 + }, + { + "epoch": 3.2437678062678064, + "grad_norm": 0.8887826800346375, + "learning_rate": 1.7186974161890435e-05, + "loss": 0.6664, + "step": 18220 + }, + { + "epoch": 3.243945868945869, + "grad_norm": 0.9187814593315125, + "learning_rate": 1.7179128890278028e-05, + "loss": 0.8802, + "step": 18221 + }, + { + "epoch": 3.2441239316239314, + "grad_norm": 1.022515892982483, + "learning_rate": 1.7171285241368606e-05, + "loss": 0.8225, + "step": 18222 + }, + { + "epoch": 3.244301994301994, + "grad_norm": 0.9992332458496094, + "learning_rate": 1.716344321531572e-05, + "loss": 0.747, + "step": 18223 + }, + { + "epoch": 3.244480056980057, + "grad_norm": 0.9298896193504333, + "learning_rate": 1.7155602812273152e-05, + "loss": 0.6622, + "step": 18224 + }, + { + "epoch": 3.2446581196581197, + "grad_norm": 0.9849324822425842, + "learning_rate": 1.71477640323944e-05, + "loss": 0.7203, + "step": 18225 + }, + { + "epoch": 3.2448361823361824, + "grad_norm": 0.9874289035797119, + "learning_rate": 1.7139926875833124e-05, + "loss": 0.7284, + "step": 18226 + }, + { + "epoch": 3.245014245014245, + "grad_norm": 0.9077398777008057, + "learning_rate": 1.713209134274285e-05, + "loss": 0.6127, + "step": 18227 + }, + { + "epoch": 3.2451923076923075, + "grad_norm": 0.9049743413925171, + "learning_rate": 1.7124257433277102e-05, + "loss": 0.8492, + "step": 18228 + }, + { + "epoch": 3.2453703703703702, + "grad_norm": 0.7180083394050598, + "learning_rate": 
1.7116425147589378e-05, + "loss": 0.4446, + "step": 18229 + }, + { + "epoch": 3.245548433048433, + "grad_norm": 1.1515785455703735, + "learning_rate": 1.7108594485833095e-05, + "loss": 0.8079, + "step": 18230 + }, + { + "epoch": 3.2457264957264957, + "grad_norm": 1.0877320766448975, + "learning_rate": 1.710076544816174e-05, + "loss": 0.7127, + "step": 18231 + }, + { + "epoch": 3.2459045584045585, + "grad_norm": 0.8861889839172363, + "learning_rate": 1.7092938034728677e-05, + "loss": 0.6444, + "step": 18232 + }, + { + "epoch": 3.2460826210826212, + "grad_norm": 1.0296967029571533, + "learning_rate": 1.7085112245687263e-05, + "loss": 0.7912, + "step": 18233 + }, + { + "epoch": 3.246260683760684, + "grad_norm": 0.9024568796157837, + "learning_rate": 1.7077288081190846e-05, + "loss": 0.5452, + "step": 18234 + }, + { + "epoch": 3.2464387464387463, + "grad_norm": 0.9017095565795898, + "learning_rate": 1.7069465541392703e-05, + "loss": 0.7819, + "step": 18235 + }, + { + "epoch": 3.246616809116809, + "grad_norm": 0.9934529662132263, + "learning_rate": 1.7061644626446116e-05, + "loss": 0.8196, + "step": 18236 + }, + { + "epoch": 3.246794871794872, + "grad_norm": 1.0247230529785156, + "learning_rate": 1.7053825336504304e-05, + "loss": 0.9648, + "step": 18237 + }, + { + "epoch": 3.2469729344729346, + "grad_norm": 0.9507484436035156, + "learning_rate": 1.7046007671720453e-05, + "loss": 0.8102, + "step": 18238 + }, + { + "epoch": 3.2471509971509973, + "grad_norm": 1.002600908279419, + "learning_rate": 1.7038191632247826e-05, + "loss": 0.7973, + "step": 18239 + }, + { + "epoch": 3.2473290598290596, + "grad_norm": 0.963873028755188, + "learning_rate": 1.7030377218239434e-05, + "loss": 0.9603, + "step": 18240 + }, + { + "epoch": 3.2475071225071224, + "grad_norm": 0.9259035587310791, + "learning_rate": 1.7022564429848465e-05, + "loss": 0.7526, + "step": 18241 + }, + { + "epoch": 3.247685185185185, + "grad_norm": 1.0171494483947754, + "learning_rate": 1.701475326722799e-05, + 
"loss": 0.8504, + "step": 18242 + }, + { + "epoch": 3.247863247863248, + "grad_norm": 0.8850985765457153, + "learning_rate": 1.700694373053102e-05, + "loss": 0.7934, + "step": 18243 + }, + { + "epoch": 3.2480413105413106, + "grad_norm": 0.8500893712043762, + "learning_rate": 1.6999135819910607e-05, + "loss": 0.7683, + "step": 18244 + }, + { + "epoch": 3.2482193732193734, + "grad_norm": 0.9575412273406982, + "learning_rate": 1.6991329535519685e-05, + "loss": 0.7072, + "step": 18245 + }, + { + "epoch": 3.248397435897436, + "grad_norm": 1.0330621004104614, + "learning_rate": 1.6983524877511236e-05, + "loss": 0.7366, + "step": 18246 + }, + { + "epoch": 3.2485754985754984, + "grad_norm": 0.9334800839424133, + "learning_rate": 1.6975721846038173e-05, + "loss": 0.768, + "step": 18247 + }, + { + "epoch": 3.248753561253561, + "grad_norm": 1.092702031135559, + "learning_rate": 1.696792044125334e-05, + "loss": 0.5735, + "step": 18248 + }, + { + "epoch": 3.248931623931624, + "grad_norm": 0.8769099116325378, + "learning_rate": 1.6960120663309643e-05, + "loss": 0.7181, + "step": 18249 + }, + { + "epoch": 3.2491096866096867, + "grad_norm": 0.8985859751701355, + "learning_rate": 1.6952322512359886e-05, + "loss": 0.6442, + "step": 18250 + }, + { + "epoch": 3.2492877492877494, + "grad_norm": 0.9657853841781616, + "learning_rate": 1.6944525988556847e-05, + "loss": 0.8127, + "step": 18251 + }, + { + "epoch": 3.2494658119658117, + "grad_norm": 0.9426729679107666, + "learning_rate": 1.6936731092053292e-05, + "loss": 0.9476, + "step": 18252 + }, + { + "epoch": 3.2494658119658117, + "eval_loss": 1.1752357482910156, + "eval_runtime": 35.242, + "eval_samples_per_second": 29.539, + "eval_steps_per_second": 14.784, + "step": 18252 + }, + { + "epoch": 3.2496438746438745, + "grad_norm": 0.9611350893974304, + "learning_rate": 1.6928937823001922e-05, + "loss": 0.7863, + "step": 18253 + }, + { + "epoch": 3.2498219373219372, + "grad_norm": 0.9414992928504944, + "learning_rate": 
1.692114618155549e-05, + "loss": 0.8036, + "step": 18254 + }, + { + "epoch": 3.25, + "grad_norm": 0.9665080904960632, + "learning_rate": 1.6913356167866578e-05, + "loss": 0.8325, + "step": 18255 + }, + { + "epoch": 3.2501780626780628, + "grad_norm": 1.0661953687667847, + "learning_rate": 1.690556778208787e-05, + "loss": 0.9415, + "step": 18256 + }, + { + "epoch": 3.2503561253561255, + "grad_norm": 0.9129988551139832, + "learning_rate": 1.689778102437196e-05, + "loss": 0.9128, + "step": 18257 + }, + { + "epoch": 3.2505341880341883, + "grad_norm": 0.8086062073707581, + "learning_rate": 1.6889995894871392e-05, + "loss": 0.6195, + "step": 18258 + }, + { + "epoch": 3.2507122507122506, + "grad_norm": 0.9159565567970276, + "learning_rate": 1.6882212393738707e-05, + "loss": 0.6897, + "step": 18259 + }, + { + "epoch": 3.2508903133903133, + "grad_norm": 0.9264681339263916, + "learning_rate": 1.6874430521126417e-05, + "loss": 0.8267, + "step": 18260 + }, + { + "epoch": 3.251068376068376, + "grad_norm": 1.0706672668457031, + "learning_rate": 1.6866650277186967e-05, + "loss": 0.8832, + "step": 18261 + }, + { + "epoch": 3.251246438746439, + "grad_norm": 0.8599040508270264, + "learning_rate": 1.6858871662072827e-05, + "loss": 0.6803, + "step": 18262 + }, + { + "epoch": 3.2514245014245016, + "grad_norm": 0.9120079278945923, + "learning_rate": 1.6851094675936353e-05, + "loss": 0.6025, + "step": 18263 + }, + { + "epoch": 3.251602564102564, + "grad_norm": 0.7756575345993042, + "learning_rate": 1.6843319318929996e-05, + "loss": 0.5763, + "step": 18264 + }, + { + "epoch": 3.2517806267806266, + "grad_norm": 0.8564868569374084, + "learning_rate": 1.6835545591206014e-05, + "loss": 0.7513, + "step": 18265 + }, + { + "epoch": 3.2519586894586894, + "grad_norm": 0.9959290623664856, + "learning_rate": 1.6827773492916778e-05, + "loss": 0.7217, + "step": 18266 + }, + { + "epoch": 3.252136752136752, + "grad_norm": 0.8105573058128357, + "learning_rate": 1.6820003024214547e-05, + "loss": 0.6058, + 
"step": 18267 + }, + { + "epoch": 3.252314814814815, + "grad_norm": 0.8831157088279724, + "learning_rate": 1.681223418525154e-05, + "loss": 0.7648, + "step": 18268 + }, + { + "epoch": 3.2524928774928776, + "grad_norm": 0.932885468006134, + "learning_rate": 1.6804466976180046e-05, + "loss": 0.8404, + "step": 18269 + }, + { + "epoch": 3.2526709401709404, + "grad_norm": 0.9935491681098938, + "learning_rate": 1.6796701397152147e-05, + "loss": 0.7497, + "step": 18270 + }, + { + "epoch": 3.2528490028490027, + "grad_norm": 0.9661284685134888, + "learning_rate": 1.6788937448320084e-05, + "loss": 0.9749, + "step": 18271 + }, + { + "epoch": 3.2530270655270654, + "grad_norm": 0.8758362531661987, + "learning_rate": 1.6781175129835903e-05, + "loss": 0.84, + "step": 18272 + }, + { + "epoch": 3.253205128205128, + "grad_norm": 0.9549978375434875, + "learning_rate": 1.677341444185174e-05, + "loss": 0.9391, + "step": 18273 + }, + { + "epoch": 3.253383190883191, + "grad_norm": 0.9131140112876892, + "learning_rate": 1.676565538451963e-05, + "loss": 0.6248, + "step": 18274 + }, + { + "epoch": 3.2535612535612537, + "grad_norm": 0.9425809979438782, + "learning_rate": 1.675789795799161e-05, + "loss": 0.7493, + "step": 18275 + }, + { + "epoch": 3.253739316239316, + "grad_norm": 0.8880078792572021, + "learning_rate": 1.6750142162419647e-05, + "loss": 0.6994, + "step": 18276 + }, + { + "epoch": 3.2539173789173788, + "grad_norm": 0.9569603800773621, + "learning_rate": 1.674238799795572e-05, + "loss": 0.7584, + "step": 18277 + }, + { + "epoch": 3.2540954415954415, + "grad_norm": 0.9752117395401001, + "learning_rate": 1.6734635464751714e-05, + "loss": 0.9893, + "step": 18278 + }, + { + "epoch": 3.2542735042735043, + "grad_norm": 0.9450941681861877, + "learning_rate": 1.672688456295961e-05, + "loss": 0.709, + "step": 18279 + }, + { + "epoch": 3.254451566951567, + "grad_norm": 0.9441366791725159, + "learning_rate": 1.671913529273117e-05, + "loss": 0.7121, + "step": 18280 + }, + { + "epoch": 
3.2546296296296298, + "grad_norm": 1.0751672983169556, + "learning_rate": 1.6711387654218323e-05, + "loss": 0.7037, + "step": 18281 + }, + { + "epoch": 3.2548076923076925, + "grad_norm": 0.8935137987136841, + "learning_rate": 1.6703641647572764e-05, + "loss": 0.6031, + "step": 18282 + }, + { + "epoch": 3.254985754985755, + "grad_norm": 0.8644391298294067, + "learning_rate": 1.6695897272946348e-05, + "loss": 0.7036, + "step": 18283 + }, + { + "epoch": 3.2551638176638176, + "grad_norm": 0.8370543718338013, + "learning_rate": 1.6688154530490773e-05, + "loss": 0.782, + "step": 18284 + }, + { + "epoch": 3.2553418803418803, + "grad_norm": 1.1523544788360596, + "learning_rate": 1.668041342035773e-05, + "loss": 1.0715, + "step": 18285 + }, + { + "epoch": 3.255519943019943, + "grad_norm": 1.0818554162979126, + "learning_rate": 1.6672673942698925e-05, + "loss": 0.8182, + "step": 18286 + }, + { + "epoch": 3.255698005698006, + "grad_norm": 0.9586027264595032, + "learning_rate": 1.666493609766596e-05, + "loss": 0.802, + "step": 18287 + }, + { + "epoch": 3.255876068376068, + "grad_norm": 0.9727693796157837, + "learning_rate": 1.6657199885410446e-05, + "loss": 0.7155, + "step": 18288 + }, + { + "epoch": 3.256054131054131, + "grad_norm": 1.0775129795074463, + "learning_rate": 1.664946530608399e-05, + "loss": 0.8433, + "step": 18289 + }, + { + "epoch": 3.2562321937321936, + "grad_norm": 0.9694477915763855, + "learning_rate": 1.6641732359838113e-05, + "loss": 0.791, + "step": 18290 + }, + { + "epoch": 3.2564102564102564, + "grad_norm": 0.9691938161849976, + "learning_rate": 1.6634001046824333e-05, + "loss": 0.8073, + "step": 18291 + }, + { + "epoch": 3.256588319088319, + "grad_norm": 1.036760926246643, + "learning_rate": 1.6626271367194123e-05, + "loss": 0.6048, + "step": 18292 + }, + { + "epoch": 3.256766381766382, + "grad_norm": 0.8390796184539795, + "learning_rate": 1.6618543321098912e-05, + "loss": 0.9326, + "step": 18293 + }, + { + "epoch": 3.2569444444444446, + "grad_norm": 
0.927253007888794, + "learning_rate": 1.6610816908690184e-05, + "loss": 0.7526, + "step": 18294 + }, + { + "epoch": 3.257122507122507, + "grad_norm": 0.9034532308578491, + "learning_rate": 1.6603092130119214e-05, + "loss": 0.8068, + "step": 18295 + }, + { + "epoch": 3.2573005698005697, + "grad_norm": 0.9301304817199707, + "learning_rate": 1.6595368985537464e-05, + "loss": 0.7274, + "step": 18296 + }, + { + "epoch": 3.2574786324786325, + "grad_norm": 0.9493504166603088, + "learning_rate": 1.6587647475096157e-05, + "loss": 0.7109, + "step": 18297 + }, + { + "epoch": 3.257656695156695, + "grad_norm": 0.9562366008758545, + "learning_rate": 1.6579927598946644e-05, + "loss": 0.8517, + "step": 18298 + }, + { + "epoch": 3.257834757834758, + "grad_norm": 1.0997562408447266, + "learning_rate": 1.6572209357240155e-05, + "loss": 1.1135, + "step": 18299 + }, + { + "epoch": 3.2580128205128207, + "grad_norm": 1.040228009223938, + "learning_rate": 1.6564492750127925e-05, + "loss": 0.83, + "step": 18300 + }, + { + "epoch": 3.258190883190883, + "grad_norm": 0.9394234418869019, + "learning_rate": 1.6556777777761133e-05, + "loss": 0.6691, + "step": 18301 + }, + { + "epoch": 3.2583689458689458, + "grad_norm": 0.806209146976471, + "learning_rate": 1.654906444029095e-05, + "loss": 0.6905, + "step": 18302 + }, + { + "epoch": 3.2585470085470085, + "grad_norm": 1.2015374898910522, + "learning_rate": 1.6541352737868465e-05, + "loss": 0.7325, + "step": 18303 + }, + { + "epoch": 3.2587250712250713, + "grad_norm": 0.787306547164917, + "learning_rate": 1.6533642670644843e-05, + "loss": 0.6128, + "step": 18304 + }, + { + "epoch": 3.258903133903134, + "grad_norm": 1.0318105220794678, + "learning_rate": 1.652593423877107e-05, + "loss": 0.8617, + "step": 18305 + }, + { + "epoch": 3.2590811965811968, + "grad_norm": 1.0233045816421509, + "learning_rate": 1.651822744239826e-05, + "loss": 1.0166, + "step": 18306 + }, + { + "epoch": 3.259259259259259, + "grad_norm": 0.9209055304527283, + "learning_rate": 
1.651052228167731e-05, + "loss": 0.8128, + "step": 18307 + }, + { + "epoch": 3.259437321937322, + "grad_norm": 1.0221130847930908, + "learning_rate": 1.6502818756759276e-05, + "loss": 0.8015, + "step": 18308 + }, + { + "epoch": 3.2596153846153846, + "grad_norm": 0.998290479183197, + "learning_rate": 1.6495116867795047e-05, + "loss": 0.752, + "step": 18309 + }, + { + "epoch": 3.2597934472934473, + "grad_norm": 0.9447212815284729, + "learning_rate": 1.6487416614935513e-05, + "loss": 0.6362, + "step": 18310 + }, + { + "epoch": 3.25997150997151, + "grad_norm": 1.086247205734253, + "learning_rate": 1.6479717998331623e-05, + "loss": 0.7776, + "step": 18311 + }, + { + "epoch": 3.260149572649573, + "grad_norm": 0.7751702070236206, + "learning_rate": 1.647202101813411e-05, + "loss": 0.4099, + "step": 18312 + }, + { + "epoch": 3.260327635327635, + "grad_norm": 0.8890853524208069, + "learning_rate": 1.6464325674493853e-05, + "loss": 0.668, + "step": 18313 + }, + { + "epoch": 3.260505698005698, + "grad_norm": 0.9836373925209045, + "learning_rate": 1.6456631967561608e-05, + "loss": 0.7582, + "step": 18314 + }, + { + "epoch": 3.2606837606837606, + "grad_norm": 0.9572944641113281, + "learning_rate": 1.6448939897488112e-05, + "loss": 0.7865, + "step": 18315 + }, + { + "epoch": 3.2608618233618234, + "grad_norm": 1.0071512460708618, + "learning_rate": 1.644124946442407e-05, + "loss": 0.7007, + "step": 18316 + }, + { + "epoch": 3.261039886039886, + "grad_norm": 0.9263545870780945, + "learning_rate": 1.6433560668520176e-05, + "loss": 0.8008, + "step": 18317 + }, + { + "epoch": 3.261217948717949, + "grad_norm": 0.9851424694061279, + "learning_rate": 1.642587350992706e-05, + "loss": 0.8027, + "step": 18318 + }, + { + "epoch": 3.261396011396011, + "grad_norm": 0.8615550398826599, + "learning_rate": 1.6418187988795342e-05, + "loss": 0.7405, + "step": 18319 + }, + { + "epoch": 3.261574074074074, + "grad_norm": 0.9881899356842041, + "learning_rate": 1.6410504105275593e-05, + "loss": 1.0046, 
+ "step": 18320 + }, + { + "epoch": 3.2617521367521367, + "grad_norm": 0.9273366332054138, + "learning_rate": 1.64028218595184e-05, + "loss": 0.718, + "step": 18321 + }, + { + "epoch": 3.2619301994301995, + "grad_norm": 0.9626027941703796, + "learning_rate": 1.6395141251674228e-05, + "loss": 0.8123, + "step": 18322 + }, + { + "epoch": 3.262108262108262, + "grad_norm": 1.0157630443572998, + "learning_rate": 1.6387462281893596e-05, + "loss": 0.8498, + "step": 18323 + }, + { + "epoch": 3.262286324786325, + "grad_norm": 0.8313695192337036, + "learning_rate": 1.637978495032696e-05, + "loss": 0.6479, + "step": 18324 + }, + { + "epoch": 3.2624643874643873, + "grad_norm": 0.8598573207855225, + "learning_rate": 1.6372109257124735e-05, + "loss": 0.5883, + "step": 18325 + }, + { + "epoch": 3.26264245014245, + "grad_norm": 1.0443403720855713, + "learning_rate": 1.63644352024373e-05, + "loss": 0.6393, + "step": 18326 + }, + { + "epoch": 3.2628205128205128, + "grad_norm": 1.0185366868972778, + "learning_rate": 1.6356762786415026e-05, + "loss": 0.8473, + "step": 18327 + }, + { + "epoch": 3.2629985754985755, + "grad_norm": 1.0581954717636108, + "learning_rate": 1.63490920092082e-05, + "loss": 0.6592, + "step": 18328 + }, + { + "epoch": 3.2631766381766383, + "grad_norm": 0.8719905614852905, + "learning_rate": 1.634142287096717e-05, + "loss": 0.6829, + "step": 18329 + }, + { + "epoch": 3.263354700854701, + "grad_norm": 1.0849109888076782, + "learning_rate": 1.6333755371842175e-05, + "loss": 0.5595, + "step": 18330 + }, + { + "epoch": 3.263532763532764, + "grad_norm": 0.9903097748756409, + "learning_rate": 1.6326089511983443e-05, + "loss": 0.8984, + "step": 18331 + }, + { + "epoch": 3.263710826210826, + "grad_norm": 0.8619652390480042, + "learning_rate": 1.6318425291541173e-05, + "loss": 0.5457, + "step": 18332 + }, + { + "epoch": 3.263888888888889, + "grad_norm": 0.8803672194480896, + "learning_rate": 1.6310762710665528e-05, + "loss": 0.7953, + "step": 18333 + }, + { + "epoch": 
3.2640669515669516, + "grad_norm": 1.0389653444290161, + "learning_rate": 1.6303101769506633e-05, + "loss": 0.8362, + "step": 18334 + }, + { + "epoch": 3.2642450142450143, + "grad_norm": 0.9297505021095276, + "learning_rate": 1.629544246821456e-05, + "loss": 0.6762, + "step": 18335 + }, + { + "epoch": 3.264423076923077, + "grad_norm": 1.0184338092803955, + "learning_rate": 1.6287784806939476e-05, + "loss": 0.7657, + "step": 18336 + }, + { + "epoch": 3.2646011396011394, + "grad_norm": 1.0914297103881836, + "learning_rate": 1.6280128785831293e-05, + "loss": 0.8824, + "step": 18337 + }, + { + "epoch": 3.264779202279202, + "grad_norm": 0.9033359885215759, + "learning_rate": 1.6272474405040106e-05, + "loss": 0.4726, + "step": 18338 + }, + { + "epoch": 3.264957264957265, + "grad_norm": 0.8340174555778503, + "learning_rate": 1.6264821664715846e-05, + "loss": 0.5618, + "step": 18339 + }, + { + "epoch": 3.2651353276353277, + "grad_norm": 0.7262910604476929, + "learning_rate": 1.625717056500846e-05, + "loss": 0.5631, + "step": 18340 + }, + { + "epoch": 3.2653133903133904, + "grad_norm": 0.8162511587142944, + "learning_rate": 1.6249521106067866e-05, + "loss": 0.7289, + "step": 18341 + }, + { + "epoch": 3.265491452991453, + "grad_norm": 0.9709984064102173, + "learning_rate": 1.624187328804392e-05, + "loss": 0.8121, + "step": 18342 + }, + { + "epoch": 3.265669515669516, + "grad_norm": 0.97455233335495, + "learning_rate": 1.6234227111086475e-05, + "loss": 0.692, + "step": 18343 + }, + { + "epoch": 3.265847578347578, + "grad_norm": 0.9413778781890869, + "learning_rate": 1.622658257534535e-05, + "loss": 0.5593, + "step": 18344 + }, + { + "epoch": 3.266025641025641, + "grad_norm": 0.8591381907463074, + "learning_rate": 1.621893968097028e-05, + "loss": 0.7983, + "step": 18345 + }, + { + "epoch": 3.2662037037037037, + "grad_norm": 1.017133116722107, + "learning_rate": 1.62112984281111e-05, + "loss": 0.8291, + "step": 18346 + }, + { + "epoch": 3.2663817663817665, + "grad_norm": 
0.8174110651016235, + "learning_rate": 1.6203658816917423e-05, + "loss": 0.6206, + "step": 18347 + }, + { + "epoch": 3.2665598290598292, + "grad_norm": 0.9929126501083374, + "learning_rate": 1.6196020847539006e-05, + "loss": 0.7346, + "step": 18348 + }, + { + "epoch": 3.2667378917378915, + "grad_norm": 0.9130271077156067, + "learning_rate": 1.6188384520125476e-05, + "loss": 0.9829, + "step": 18349 + }, + { + "epoch": 3.2669159544159543, + "grad_norm": 0.9933291673660278, + "learning_rate": 1.6180749834826413e-05, + "loss": 0.7314, + "step": 18350 + }, + { + "epoch": 3.267094017094017, + "grad_norm": 0.9695504307746887, + "learning_rate": 1.6173116791791498e-05, + "loss": 0.9824, + "step": 18351 + }, + { + "epoch": 3.26727207977208, + "grad_norm": 0.9526118636131287, + "learning_rate": 1.6165485391170164e-05, + "loss": 1.0207, + "step": 18352 + }, + { + "epoch": 3.2674501424501425, + "grad_norm": 0.8514195084571838, + "learning_rate": 1.6157855633112053e-05, + "loss": 0.5906, + "step": 18353 + }, + { + "epoch": 3.2676282051282053, + "grad_norm": 0.9104008674621582, + "learning_rate": 1.6150227517766535e-05, + "loss": 0.7426, + "step": 18354 + }, + { + "epoch": 3.267806267806268, + "grad_norm": 1.019457221031189, + "learning_rate": 1.6142601045283157e-05, + "loss": 0.6441, + "step": 18355 + }, + { + "epoch": 3.2679843304843303, + "grad_norm": 0.8585529327392578, + "learning_rate": 1.6134976215811305e-05, + "loss": 0.5605, + "step": 18356 + }, + { + "epoch": 3.268162393162393, + "grad_norm": 0.8874415755271912, + "learning_rate": 1.612735302950038e-05, + "loss": 0.6461, + "step": 18357 + }, + { + "epoch": 3.268340455840456, + "grad_norm": 0.9070242047309875, + "learning_rate": 1.6119731486499755e-05, + "loss": 0.7123, + "step": 18358 + }, + { + "epoch": 3.2685185185185186, + "grad_norm": 0.8923984169960022, + "learning_rate": 1.6112111586958733e-05, + "loss": 0.6977, + "step": 18359 + }, + { + "epoch": 3.2686965811965814, + "grad_norm": 1.0075733661651611, + 
"learning_rate": 1.6104493331026604e-05, + "loss": 0.9057, + "step": 18360 + }, + { + "epoch": 3.2688746438746437, + "grad_norm": 0.7989169359207153, + "learning_rate": 1.6096876718852694e-05, + "loss": 0.6616, + "step": 18361 + }, + { + "epoch": 3.2690527065527064, + "grad_norm": 1.0517085790634155, + "learning_rate": 1.608926175058615e-05, + "loss": 0.5575, + "step": 18362 + }, + { + "epoch": 3.269230769230769, + "grad_norm": 0.9944202303886414, + "learning_rate": 1.608164842637623e-05, + "loss": 0.7801, + "step": 18363 + }, + { + "epoch": 3.269408831908832, + "grad_norm": 1.3099443912506104, + "learning_rate": 1.607403674637209e-05, + "loss": 0.8176, + "step": 18364 + }, + { + "epoch": 3.2695868945868947, + "grad_norm": 0.918102502822876, + "learning_rate": 1.606642671072285e-05, + "loss": 0.8969, + "step": 18365 + }, + { + "epoch": 3.2697649572649574, + "grad_norm": 1.058408260345459, + "learning_rate": 1.605881831957763e-05, + "loss": 0.7686, + "step": 18366 + }, + { + "epoch": 3.26994301994302, + "grad_norm": 0.956381618976593, + "learning_rate": 1.6051211573085467e-05, + "loss": 0.8574, + "step": 18367 + }, + { + "epoch": 3.2701210826210825, + "grad_norm": 1.0281140804290771, + "learning_rate": 1.6043606471395468e-05, + "loss": 0.9321, + "step": 18368 + }, + { + "epoch": 3.2702991452991452, + "grad_norm": 0.9082704782485962, + "learning_rate": 1.6036003014656552e-05, + "loss": 0.83, + "step": 18369 + }, + { + "epoch": 3.270477207977208, + "grad_norm": 0.9393236637115479, + "learning_rate": 1.602840120301776e-05, + "loss": 0.6193, + "step": 18370 + }, + { + "epoch": 3.2706552706552707, + "grad_norm": 0.8955071568489075, + "learning_rate": 1.602080103662801e-05, + "loss": 0.708, + "step": 18371 + }, + { + "epoch": 3.2708333333333335, + "grad_norm": 1.0704554319381714, + "learning_rate": 1.6013202515636206e-05, + "loss": 0.8384, + "step": 18372 + }, + { + "epoch": 3.271011396011396, + "grad_norm": 0.8835996389389038, + "learning_rate": 1.6005605640191247e-05, + 
"loss": 0.7344, + "step": 18373 + }, + { + "epoch": 3.2711894586894585, + "grad_norm": 1.0095866918563843, + "learning_rate": 1.5998010410441943e-05, + "loss": 0.9588, + "step": 18374 + }, + { + "epoch": 3.2713675213675213, + "grad_norm": 0.9835611581802368, + "learning_rate": 1.5990416826537114e-05, + "loss": 0.8655, + "step": 18375 + }, + { + "epoch": 3.271545584045584, + "grad_norm": 1.0003584623336792, + "learning_rate": 1.5982824888625603e-05, + "loss": 0.7716, + "step": 18376 + }, + { + "epoch": 3.271723646723647, + "grad_norm": 0.9901667833328247, + "learning_rate": 1.597523459685605e-05, + "loss": 0.9206, + "step": 18377 + }, + { + "epoch": 3.2719017094017095, + "grad_norm": 0.8539385199546814, + "learning_rate": 1.5967645951377285e-05, + "loss": 0.8601, + "step": 18378 + }, + { + "epoch": 3.2720797720797723, + "grad_norm": 1.017080307006836, + "learning_rate": 1.5960058952337887e-05, + "loss": 0.8568, + "step": 18379 + }, + { + "epoch": 3.2722578347578346, + "grad_norm": 0.9467936158180237, + "learning_rate": 1.5952473599886575e-05, + "loss": 0.8216, + "step": 18380 + }, + { + "epoch": 3.2724358974358974, + "grad_norm": 0.8549704551696777, + "learning_rate": 1.5944889894171944e-05, + "loss": 0.5828, + "step": 18381 + }, + { + "epoch": 3.27261396011396, + "grad_norm": 0.9717326760292053, + "learning_rate": 1.5937307835342595e-05, + "loss": 0.814, + "step": 18382 + }, + { + "epoch": 3.272792022792023, + "grad_norm": 1.0630271434783936, + "learning_rate": 1.592972742354707e-05, + "loss": 0.8204, + "step": 18383 + }, + { + "epoch": 3.2729700854700856, + "grad_norm": 0.8099556565284729, + "learning_rate": 1.59221486589339e-05, + "loss": 0.6395, + "step": 18384 + }, + { + "epoch": 3.273148148148148, + "grad_norm": 0.8640239834785461, + "learning_rate": 1.5914571541651545e-05, + "loss": 0.6824, + "step": 18385 + }, + { + "epoch": 3.2733262108262107, + "grad_norm": 0.9581767916679382, + "learning_rate": 1.5906996071848534e-05, + "loss": 0.7363, + "step": 18386 + 
}, + { + "epoch": 3.2735042735042734, + "grad_norm": 0.9858554601669312, + "learning_rate": 1.5899422249673203e-05, + "loss": 0.7251, + "step": 18387 + }, + { + "epoch": 3.273682336182336, + "grad_norm": 1.040452003479004, + "learning_rate": 1.5891850075274005e-05, + "loss": 0.9428, + "step": 18388 + }, + { + "epoch": 3.273860398860399, + "grad_norm": 1.145668387413025, + "learning_rate": 1.5884279548799296e-05, + "loss": 0.8521, + "step": 18389 + }, + { + "epoch": 3.2740384615384617, + "grad_norm": 0.9758059978485107, + "learning_rate": 1.5876710670397386e-05, + "loss": 0.8099, + "step": 18390 + }, + { + "epoch": 3.2742165242165244, + "grad_norm": 0.9736207723617554, + "learning_rate": 1.5869143440216583e-05, + "loss": 0.8343, + "step": 18391 + }, + { + "epoch": 3.2743945868945867, + "grad_norm": 1.058436393737793, + "learning_rate": 1.5861577858405118e-05, + "loss": 0.8069, + "step": 18392 + }, + { + "epoch": 3.2745726495726495, + "grad_norm": 0.9382075071334839, + "learning_rate": 1.585401392511131e-05, + "loss": 0.893, + "step": 18393 + }, + { + "epoch": 3.2747507122507122, + "grad_norm": 0.9606096148490906, + "learning_rate": 1.5846451640483252e-05, + "loss": 0.609, + "step": 18394 + }, + { + "epoch": 3.274928774928775, + "grad_norm": 0.8548282384872437, + "learning_rate": 1.5838891004669175e-05, + "loss": 0.8531, + "step": 18395 + }, + { + "epoch": 3.2751068376068377, + "grad_norm": 0.8669559955596924, + "learning_rate": 1.5831332017817202e-05, + "loss": 0.5267, + "step": 18396 + }, + { + "epoch": 3.2752849002849, + "grad_norm": 1.0153757333755493, + "learning_rate": 1.5823774680075443e-05, + "loss": 0.9531, + "step": 18397 + }, + { + "epoch": 3.275462962962963, + "grad_norm": 0.9505778551101685, + "learning_rate": 1.5816218991591946e-05, + "loss": 0.6559, + "step": 18398 + }, + { + "epoch": 3.2756410256410255, + "grad_norm": 0.785952091217041, + "learning_rate": 1.580866495251476e-05, + "loss": 0.5311, + "step": 18399 + }, + { + "epoch": 3.2758190883190883, 
+ "grad_norm": 0.9959350824356079, + "learning_rate": 1.5801112562991883e-05, + "loss": 0.9929, + "step": 18400 + }, + { + "epoch": 3.275997150997151, + "grad_norm": 0.9083783626556396, + "learning_rate": 1.5793561823171332e-05, + "loss": 0.6089, + "step": 18401 + }, + { + "epoch": 3.276175213675214, + "grad_norm": 0.94951331615448, + "learning_rate": 1.5786012733200973e-05, + "loss": 0.728, + "step": 18402 + }, + { + "epoch": 3.2763532763532766, + "grad_norm": 0.8742989897727966, + "learning_rate": 1.5778465293228795e-05, + "loss": 0.8664, + "step": 18403 + }, + { + "epoch": 3.276531339031339, + "grad_norm": 0.9766057133674622, + "learning_rate": 1.5770919503402594e-05, + "loss": 0.6791, + "step": 18404 + }, + { + "epoch": 3.2767094017094016, + "grad_norm": 0.9813364744186401, + "learning_rate": 1.576337536387027e-05, + "loss": 0.8344, + "step": 18405 + }, + { + "epoch": 3.2768874643874644, + "grad_norm": 0.8530852794647217, + "learning_rate": 1.575583287477963e-05, + "loss": 0.9362, + "step": 18406 + }, + { + "epoch": 3.277065527065527, + "grad_norm": 1.0678436756134033, + "learning_rate": 1.5748292036278412e-05, + "loss": 0.655, + "step": 18407 + }, + { + "epoch": 3.27724358974359, + "grad_norm": 0.9426410794258118, + "learning_rate": 1.574075284851444e-05, + "loss": 0.7442, + "step": 18408 + }, + { + "epoch": 3.277421652421652, + "grad_norm": 1.096710205078125, + "learning_rate": 1.5733215311635342e-05, + "loss": 0.8746, + "step": 18409 + }, + { + "epoch": 3.277599715099715, + "grad_norm": 1.0035667419433594, + "learning_rate": 1.5725679425788853e-05, + "loss": 0.8182, + "step": 18410 + }, + { + "epoch": 3.2777777777777777, + "grad_norm": 0.6038252115249634, + "learning_rate": 1.5718145191122625e-05, + "loss": 0.3724, + "step": 18411 + }, + { + "epoch": 3.2779558404558404, + "grad_norm": 1.0738803148269653, + "learning_rate": 1.5710612607784247e-05, + "loss": 0.7623, + "step": 18412 + }, + { + "epoch": 3.278133903133903, + "grad_norm": 0.9418566226959229, + 
"learning_rate": 1.5703081675921328e-05, + "loss": 0.6925, + "step": 18413 + }, + { + "epoch": 3.278311965811966, + "grad_norm": 0.935024619102478, + "learning_rate": 1.5695552395681414e-05, + "loss": 0.6902, + "step": 18414 + }, + { + "epoch": 3.2784900284900287, + "grad_norm": 0.9279661774635315, + "learning_rate": 1.5688024767212017e-05, + "loss": 0.6978, + "step": 18415 + }, + { + "epoch": 3.278668091168091, + "grad_norm": 1.1668767929077148, + "learning_rate": 1.5680498790660636e-05, + "loss": 0.8869, + "step": 18416 + }, + { + "epoch": 3.2788461538461537, + "grad_norm": 0.9925910830497742, + "learning_rate": 1.5672974466174695e-05, + "loss": 0.9774, + "step": 18417 + }, + { + "epoch": 3.2790242165242165, + "grad_norm": 1.0041983127593994, + "learning_rate": 1.5665451793901698e-05, + "loss": 0.8268, + "step": 18418 + }, + { + "epoch": 3.2792022792022792, + "grad_norm": 0.7737284302711487, + "learning_rate": 1.565793077398894e-05, + "loss": 0.6039, + "step": 18419 + }, + { + "epoch": 3.279380341880342, + "grad_norm": 0.9920578598976135, + "learning_rate": 1.5650411406583833e-05, + "loss": 0.7539, + "step": 18420 + }, + { + "epoch": 3.2795584045584047, + "grad_norm": 1.04171621799469, + "learning_rate": 1.5642893691833706e-05, + "loss": 0.6719, + "step": 18421 + }, + { + "epoch": 3.279736467236467, + "grad_norm": 0.7911034226417542, + "learning_rate": 1.563537762988583e-05, + "loss": 0.5976, + "step": 18422 + }, + { + "epoch": 3.27991452991453, + "grad_norm": 1.0236256122589111, + "learning_rate": 1.5627863220887485e-05, + "loss": 0.8767, + "step": 18423 + }, + { + "epoch": 3.2800925925925926, + "grad_norm": 1.0210331678390503, + "learning_rate": 1.562035046498589e-05, + "loss": 0.6193, + "step": 18424 + }, + { + "epoch": 3.2802706552706553, + "grad_norm": 1.0835071802139282, + "learning_rate": 1.5612839362328234e-05, + "loss": 0.9845, + "step": 18425 + }, + { + "epoch": 3.280448717948718, + "grad_norm": 0.9284858703613281, + "learning_rate": 
1.5605329913061707e-05, + "loss": 0.7991, + "step": 18426 + }, + { + "epoch": 3.280626780626781, + "grad_norm": 1.1475468873977661, + "learning_rate": 1.5597822117333394e-05, + "loss": 0.695, + "step": 18427 + }, + { + "epoch": 3.280804843304843, + "grad_norm": 0.9122242331504822, + "learning_rate": 1.559031597529045e-05, + "loss": 0.7703, + "step": 18428 + }, + { + "epoch": 3.280982905982906, + "grad_norm": 0.9514288902282715, + "learning_rate": 1.558281148707993e-05, + "loss": 0.6174, + "step": 18429 + }, + { + "epoch": 3.2811609686609686, + "grad_norm": 1.0076818466186523, + "learning_rate": 1.557530865284884e-05, + "loss": 0.8294, + "step": 18430 + }, + { + "epoch": 3.2813390313390314, + "grad_norm": 0.8275960087776184, + "learning_rate": 1.5567807472744212e-05, + "loss": 0.672, + "step": 18431 + }, + { + "epoch": 3.281517094017094, + "grad_norm": 0.9140580296516418, + "learning_rate": 1.556030794691298e-05, + "loss": 0.7372, + "step": 18432 + }, + { + "epoch": 3.281695156695157, + "grad_norm": 1.1289745569229126, + "learning_rate": 1.5552810075502144e-05, + "loss": 0.8771, + "step": 18433 + }, + { + "epoch": 3.281873219373219, + "grad_norm": 1.1868027448654175, + "learning_rate": 1.554531385865853e-05, + "loss": 0.9765, + "step": 18434 + }, + { + "epoch": 3.282051282051282, + "grad_norm": 0.8656640648841858, + "learning_rate": 1.5537819296529065e-05, + "loss": 0.6889, + "step": 18435 + }, + { + "epoch": 3.2822293447293447, + "grad_norm": 0.9567883014678955, + "learning_rate": 1.5530326389260585e-05, + "loss": 0.5665, + "step": 18436 + }, + { + "epoch": 3.2824074074074074, + "grad_norm": 1.0118212699890137, + "learning_rate": 1.552283513699989e-05, + "loss": 0.6376, + "step": 18437 + }, + { + "epoch": 3.28258547008547, + "grad_norm": 0.9064817428588867, + "learning_rate": 1.5515345539893756e-05, + "loss": 0.7624, + "step": 18438 + }, + { + "epoch": 3.282763532763533, + "grad_norm": 0.9137055277824402, + "learning_rate": 1.5507857598088914e-05, + "loss": 0.6229, 
+ "step": 18439 + }, + { + "epoch": 3.2829415954415953, + "grad_norm": 0.9002276062965393, + "learning_rate": 1.5500371311732098e-05, + "loss": 0.7126, + "step": 18440 + }, + { + "epoch": 3.283119658119658, + "grad_norm": 1.0946577787399292, + "learning_rate": 1.5492886680969963e-05, + "loss": 0.7333, + "step": 18441 + }, + { + "epoch": 3.2832977207977208, + "grad_norm": 1.0645185708999634, + "learning_rate": 1.548540370594914e-05, + "loss": 0.7965, + "step": 18442 + }, + { + "epoch": 3.2834757834757835, + "grad_norm": 0.9328752160072327, + "learning_rate": 1.5477922386816325e-05, + "loss": 0.9929, + "step": 18443 + }, + { + "epoch": 3.2836538461538463, + "grad_norm": 0.8564518094062805, + "learning_rate": 1.5470442723717993e-05, + "loss": 0.5875, + "step": 18444 + }, + { + "epoch": 3.283831908831909, + "grad_norm": 0.92264324426651, + "learning_rate": 1.5462964716800753e-05, + "loss": 0.8774, + "step": 18445 + }, + { + "epoch": 3.2840099715099713, + "grad_norm": 0.8862282037734985, + "learning_rate": 1.5455488366211126e-05, + "loss": 0.6523, + "step": 18446 + }, + { + "epoch": 3.284188034188034, + "grad_norm": 1.1124815940856934, + "learning_rate": 1.5448013672095542e-05, + "loss": 0.8572, + "step": 18447 + }, + { + "epoch": 3.284366096866097, + "grad_norm": 0.96551513671875, + "learning_rate": 1.5440540634600542e-05, + "loss": 0.8255, + "step": 18448 + }, + { + "epoch": 3.2845441595441596, + "grad_norm": 1.0143595933914185, + "learning_rate": 1.5433069253872435e-05, + "loss": 0.5876, + "step": 18449 + }, + { + "epoch": 3.2847222222222223, + "grad_norm": 0.9641425013542175, + "learning_rate": 1.5425599530057722e-05, + "loss": 0.7874, + "step": 18450 + }, + { + "epoch": 3.284900284900285, + "grad_norm": 0.982153058052063, + "learning_rate": 1.5418131463302643e-05, + "loss": 0.6765, + "step": 18451 + }, + { + "epoch": 3.285078347578348, + "grad_norm": 0.9491843581199646, + "learning_rate": 1.5410665053753604e-05, + "loss": 0.8588, + "step": 18452 + }, + { + "epoch": 
3.28525641025641, + "grad_norm": 0.9593594074249268, + "learning_rate": 1.5403200301556863e-05, + "loss": 0.8069, + "step": 18453 + }, + { + "epoch": 3.285434472934473, + "grad_norm": 1.0291471481323242, + "learning_rate": 1.5395737206858673e-05, + "loss": 0.8385, + "step": 18454 + }, + { + "epoch": 3.2856125356125356, + "grad_norm": 0.8283519148826599, + "learning_rate": 1.5388275769805272e-05, + "loss": 0.6765, + "step": 18455 + }, + { + "epoch": 3.2857905982905984, + "grad_norm": 1.0748952627182007, + "learning_rate": 1.538081599054283e-05, + "loss": 0.8704, + "step": 18456 + }, + { + "epoch": 3.285968660968661, + "grad_norm": 0.8589741587638855, + "learning_rate": 1.53733578692175e-05, + "loss": 0.6258, + "step": 18457 + }, + { + "epoch": 3.2861467236467234, + "grad_norm": 1.0470752716064453, + "learning_rate": 1.536590140597547e-05, + "loss": 0.7413, + "step": 18458 + }, + { + "epoch": 3.286324786324786, + "grad_norm": 0.9156502485275269, + "learning_rate": 1.535844660096274e-05, + "loss": 0.713, + "step": 18459 + }, + { + "epoch": 3.286502849002849, + "grad_norm": 0.8092247843742371, + "learning_rate": 1.535099345432548e-05, + "loss": 0.5603, + "step": 18460 + }, + { + "epoch": 3.2866809116809117, + "grad_norm": 0.9388442039489746, + "learning_rate": 1.5343541966209607e-05, + "loss": 0.745, + "step": 18461 + }, + { + "epoch": 3.2868589743589745, + "grad_norm": 0.9431697726249695, + "learning_rate": 1.5336092136761193e-05, + "loss": 0.7621, + "step": 18462 + }, + { + "epoch": 3.287037037037037, + "grad_norm": 0.8809608221054077, + "learning_rate": 1.5328643966126178e-05, + "loss": 0.6982, + "step": 18463 + }, + { + "epoch": 3.2872150997151, + "grad_norm": 1.0238356590270996, + "learning_rate": 1.5321197454450496e-05, + "loss": 0.7742, + "step": 18464 + }, + { + "epoch": 3.2873931623931623, + "grad_norm": 1.099500298500061, + "learning_rate": 1.5313752601880038e-05, + "loss": 0.871, + "step": 18465 + }, + { + "epoch": 3.287571225071225, + "grad_norm": 
0.8493121862411499, + "learning_rate": 1.530630940856067e-05, + "loss": 0.6869, + "step": 18466 + }, + { + "epoch": 3.2877492877492878, + "grad_norm": 0.9258021116256714, + "learning_rate": 1.5298867874638246e-05, + "loss": 0.7975, + "step": 18467 + }, + { + "epoch": 3.2879273504273505, + "grad_norm": 1.0687447786331177, + "learning_rate": 1.5291428000258556e-05, + "loss": 0.797, + "step": 18468 + }, + { + "epoch": 3.2881054131054133, + "grad_norm": 1.0039074420928955, + "learning_rate": 1.5283989785567366e-05, + "loss": 0.6995, + "step": 18469 + }, + { + "epoch": 3.2882834757834756, + "grad_norm": 0.9193965196609497, + "learning_rate": 1.5276553230710423e-05, + "loss": 0.6836, + "step": 18470 + }, + { + "epoch": 3.2884615384615383, + "grad_norm": 1.0001294612884521, + "learning_rate": 1.526911833583341e-05, + "loss": 0.7799, + "step": 18471 + }, + { + "epoch": 3.288639601139601, + "grad_norm": 0.9037470817565918, + "learning_rate": 1.526168510108199e-05, + "loss": 0.836, + "step": 18472 + }, + { + "epoch": 3.288817663817664, + "grad_norm": 0.8541157245635986, + "learning_rate": 1.525425352660187e-05, + "loss": 0.7288, + "step": 18473 + }, + { + "epoch": 3.2889957264957266, + "grad_norm": 1.0287656784057617, + "learning_rate": 1.5246823612538563e-05, + "loss": 0.8542, + "step": 18474 + }, + { + "epoch": 3.2891737891737893, + "grad_norm": 1.1029131412506104, + "learning_rate": 1.5239395359037734e-05, + "loss": 0.7211, + "step": 18475 + }, + { + "epoch": 3.289351851851852, + "grad_norm": 0.8748056292533875, + "learning_rate": 1.523196876624483e-05, + "loss": 0.7278, + "step": 18476 + }, + { + "epoch": 3.2895299145299144, + "grad_norm": 0.9509679675102234, + "learning_rate": 1.5224543834305427e-05, + "loss": 0.649, + "step": 18477 + }, + { + "epoch": 3.289707977207977, + "grad_norm": 0.9093441367149353, + "learning_rate": 1.5217120563364985e-05, + "loss": 0.822, + "step": 18478 + }, + { + "epoch": 3.28988603988604, + "grad_norm": 0.9471918940544128, + "learning_rate": 
1.5209698953568952e-05, + "loss": 0.8744, + "step": 18479 + }, + { + "epoch": 3.2900641025641026, + "grad_norm": 0.8799687623977661, + "learning_rate": 1.5202279005062725e-05, + "loss": 0.651, + "step": 18480 + }, + { + "epoch": 3.2902421652421654, + "grad_norm": 0.9291742444038391, + "learning_rate": 1.5194860717991687e-05, + "loss": 0.6697, + "step": 18481 + }, + { + "epoch": 3.2904202279202277, + "grad_norm": 0.9564397931098938, + "learning_rate": 1.5187444092501158e-05, + "loss": 0.8578, + "step": 18482 + }, + { + "epoch": 3.2905982905982905, + "grad_norm": 0.9487985968589783, + "learning_rate": 1.518002912873654e-05, + "loss": 0.727, + "step": 18483 + }, + { + "epoch": 3.290776353276353, + "grad_norm": 1.316943645477295, + "learning_rate": 1.5172615826842996e-05, + "loss": 0.7059, + "step": 18484 + }, + { + "epoch": 3.290954415954416, + "grad_norm": 0.9622043371200562, + "learning_rate": 1.5165204186965875e-05, + "loss": 0.5788, + "step": 18485 + }, + { + "epoch": 3.2911324786324787, + "grad_norm": 0.7761589288711548, + "learning_rate": 1.5157794209250309e-05, + "loss": 0.4303, + "step": 18486 + }, + { + "epoch": 3.2913105413105415, + "grad_norm": 1.0018688440322876, + "learning_rate": 1.5150385893841545e-05, + "loss": 0.5912, + "step": 18487 + }, + { + "epoch": 3.291488603988604, + "grad_norm": 0.9771264791488647, + "learning_rate": 1.5142979240884713e-05, + "loss": 0.805, + "step": 18488 + }, + { + "epoch": 3.2916666666666665, + "grad_norm": 0.9868426322937012, + "learning_rate": 1.5135574250524897e-05, + "loss": 0.7762, + "step": 18489 + }, + { + "epoch": 3.2918447293447293, + "grad_norm": 1.0445677042007446, + "learning_rate": 1.5128170922907258e-05, + "loss": 0.8809, + "step": 18490 + }, + { + "epoch": 3.292022792022792, + "grad_norm": 1.082547664642334, + "learning_rate": 1.5120769258176758e-05, + "loss": 0.9871, + "step": 18491 + }, + { + "epoch": 3.2922008547008548, + "grad_norm": 0.9163486361503601, + "learning_rate": 1.5113369256478493e-05, + "loss": 
0.6729, + "step": 18492 + }, + { + "epoch": 3.2923789173789175, + "grad_norm": 0.9486492872238159, + "learning_rate": 1.5105970917957413e-05, + "loss": 0.6898, + "step": 18493 + }, + { + "epoch": 3.29255698005698, + "grad_norm": 0.9150809645652771, + "learning_rate": 1.509857424275849e-05, + "loss": 0.8039, + "step": 18494 + }, + { + "epoch": 3.2927350427350426, + "grad_norm": 0.9275760650634766, + "learning_rate": 1.509117923102663e-05, + "loss": 0.8944, + "step": 18495 + }, + { + "epoch": 3.2929131054131053, + "grad_norm": 1.0610506534576416, + "learning_rate": 1.508378588290673e-05, + "loss": 0.6999, + "step": 18496 + }, + { + "epoch": 3.293091168091168, + "grad_norm": 0.9436303377151489, + "learning_rate": 1.5076394198543653e-05, + "loss": 0.8837, + "step": 18497 + }, + { + "epoch": 3.293269230769231, + "grad_norm": 0.9994230270385742, + "learning_rate": 1.5069004178082213e-05, + "loss": 0.7606, + "step": 18498 + }, + { + "epoch": 3.2934472934472936, + "grad_norm": 0.7854368090629578, + "learning_rate": 1.5061615821667185e-05, + "loss": 0.4879, + "step": 18499 + }, + { + "epoch": 3.2936253561253563, + "grad_norm": 1.0826393365859985, + "learning_rate": 1.5054229129443397e-05, + "loss": 0.7172, + "step": 18500 + }, + { + "epoch": 3.2938034188034186, + "grad_norm": 0.9371114373207092, + "learning_rate": 1.5046844101555491e-05, + "loss": 0.8157, + "step": 18501 + }, + { + "epoch": 3.2939814814814814, + "grad_norm": 1.0282293558120728, + "learning_rate": 1.5039460738148215e-05, + "loss": 0.7367, + "step": 18502 + }, + { + "epoch": 3.294159544159544, + "grad_norm": 1.0519812107086182, + "learning_rate": 1.5032079039366209e-05, + "loss": 0.8756, + "step": 18503 + }, + { + "epoch": 3.294337606837607, + "grad_norm": 0.9510549306869507, + "learning_rate": 1.5024699005354127e-05, + "loss": 0.7443, + "step": 18504 + }, + { + "epoch": 3.2945156695156697, + "grad_norm": 0.871846616268158, + "learning_rate": 1.5017320636256527e-05, + "loss": 0.8086, + "step": 18505 + }, + { 
+ "epoch": 3.294693732193732, + "grad_norm": 0.9595451951026917, + "learning_rate": 1.5009943932217985e-05, + "loss": 0.7864, + "step": 18506 + }, + { + "epoch": 3.2948717948717947, + "grad_norm": 1.0278631448745728, + "learning_rate": 1.5002568893383051e-05, + "loss": 0.8224, + "step": 18507 + }, + { + "epoch": 3.2950498575498575, + "grad_norm": 0.9716551899909973, + "learning_rate": 1.499519551989622e-05, + "loss": 0.7774, + "step": 18508 + }, + { + "epoch": 3.29522792022792, + "grad_norm": 0.9664580821990967, + "learning_rate": 1.4987823811901958e-05, + "loss": 0.8641, + "step": 18509 + }, + { + "epoch": 3.295405982905983, + "grad_norm": 0.9358782768249512, + "learning_rate": 1.4980453769544677e-05, + "loss": 0.8348, + "step": 18510 + }, + { + "epoch": 3.2955840455840457, + "grad_norm": 0.8839470744132996, + "learning_rate": 1.49730853929688e-05, + "loss": 0.9168, + "step": 18511 + }, + { + "epoch": 3.2957621082621085, + "grad_norm": 1.004765272140503, + "learning_rate": 1.4965718682318685e-05, + "loss": 0.8787, + "step": 18512 + }, + { + "epoch": 3.2959401709401708, + "grad_norm": 0.8481960296630859, + "learning_rate": 1.4958353637738665e-05, + "loss": 0.5791, + "step": 18513 + }, + { + "epoch": 3.2961182336182335, + "grad_norm": 0.884299635887146, + "learning_rate": 1.4950990259373032e-05, + "loss": 0.8386, + "step": 18514 + }, + { + "epoch": 3.2962962962962963, + "grad_norm": 1.1240220069885254, + "learning_rate": 1.4943628547366106e-05, + "loss": 0.8071, + "step": 18515 + }, + { + "epoch": 3.296474358974359, + "grad_norm": 1.0169334411621094, + "learning_rate": 1.493626850186205e-05, + "loss": 0.8099, + "step": 18516 + }, + { + "epoch": 3.296652421652422, + "grad_norm": 0.8471205234527588, + "learning_rate": 1.4928910123005135e-05, + "loss": 0.7887, + "step": 18517 + }, + { + "epoch": 3.296830484330484, + "grad_norm": 0.9037573933601379, + "learning_rate": 1.4921553410939492e-05, + "loss": 0.7674, + "step": 18518 + }, + { + "epoch": 3.297008547008547, + 
"grad_norm": 1.0591832399368286, + "learning_rate": 1.4914198365809296e-05, + "loss": 0.7349, + "step": 18519 + }, + { + "epoch": 3.2971866096866096, + "grad_norm": 0.7707160115242004, + "learning_rate": 1.4906844987758616e-05, + "loss": 0.5358, + "step": 18520 + }, + { + "epoch": 3.2973646723646723, + "grad_norm": 0.899319589138031, + "learning_rate": 1.4899493276931541e-05, + "loss": 0.7471, + "step": 18521 + }, + { + "epoch": 3.297542735042735, + "grad_norm": 0.9070066809654236, + "learning_rate": 1.4892143233472121e-05, + "loss": 0.7375, + "step": 18522 + }, + { + "epoch": 3.297720797720798, + "grad_norm": 0.8646981716156006, + "learning_rate": 1.4884794857524364e-05, + "loss": 0.7047, + "step": 18523 + }, + { + "epoch": 3.2978988603988606, + "grad_norm": 0.9446913003921509, + "learning_rate": 1.487744814923221e-05, + "loss": 0.6036, + "step": 18524 + }, + { + "epoch": 3.298076923076923, + "grad_norm": 1.2150342464447021, + "learning_rate": 1.4870103108739674e-05, + "loss": 0.8835, + "step": 18525 + }, + { + "epoch": 3.2982549857549857, + "grad_norm": 0.8841198682785034, + "learning_rate": 1.486275973619059e-05, + "loss": 0.7056, + "step": 18526 + }, + { + "epoch": 3.2984330484330484, + "grad_norm": 0.8644464612007141, + "learning_rate": 1.4855418031728885e-05, + "loss": 0.6371, + "step": 18527 + }, + { + "epoch": 3.298611111111111, + "grad_norm": 0.8655923008918762, + "learning_rate": 1.48480779954984e-05, + "loss": 0.6938, + "step": 18528 + }, + { + "epoch": 3.298789173789174, + "grad_norm": 0.9875824451446533, + "learning_rate": 1.4840739627642908e-05, + "loss": 0.7635, + "step": 18529 + }, + { + "epoch": 3.298967236467236, + "grad_norm": 0.8669924139976501, + "learning_rate": 1.4833402928306273e-05, + "loss": 0.5327, + "step": 18530 + }, + { + "epoch": 3.299145299145299, + "grad_norm": 0.9445573687553406, + "learning_rate": 1.4826067897632146e-05, + "loss": 0.6969, + "step": 18531 + }, + { + "epoch": 3.2993233618233617, + "grad_norm": 0.8305925726890564, + 
"learning_rate": 1.4818734535764323e-05, + "loss": 0.7279, + "step": 18532 + }, + { + "epoch": 3.2995014245014245, + "grad_norm": 0.9907183051109314, + "learning_rate": 1.48114028428464e-05, + "loss": 0.8681, + "step": 18533 + }, + { + "epoch": 3.2996794871794872, + "grad_norm": 1.065702199935913, + "learning_rate": 1.4804072819022108e-05, + "loss": 0.8344, + "step": 18534 + }, + { + "epoch": 3.29985754985755, + "grad_norm": 1.0929903984069824, + "learning_rate": 1.479674446443502e-05, + "loss": 0.8327, + "step": 18535 + }, + { + "epoch": 3.3000356125356127, + "grad_norm": 1.0050452947616577, + "learning_rate": 1.4789417779228732e-05, + "loss": 0.6864, + "step": 18536 + }, + { + "epoch": 3.300213675213675, + "grad_norm": 0.9825767278671265, + "learning_rate": 1.4782092763546795e-05, + "loss": 0.7337, + "step": 18537 + }, + { + "epoch": 3.300391737891738, + "grad_norm": 0.867043137550354, + "learning_rate": 1.477476941753273e-05, + "loss": 0.8247, + "step": 18538 + }, + { + "epoch": 3.3005698005698005, + "grad_norm": 0.9017619490623474, + "learning_rate": 1.4767447741329987e-05, + "loss": 0.7135, + "step": 18539 + }, + { + "epoch": 3.3007478632478633, + "grad_norm": 0.8865274786949158, + "learning_rate": 1.4760127735082097e-05, + "loss": 0.7764, + "step": 18540 + }, + { + "epoch": 3.300925925925926, + "grad_norm": 1.0825269222259521, + "learning_rate": 1.4752809398932388e-05, + "loss": 0.7734, + "step": 18541 + }, + { + "epoch": 3.301103988603989, + "grad_norm": 0.934187650680542, + "learning_rate": 1.4745492733024325e-05, + "loss": 0.7147, + "step": 18542 + }, + { + "epoch": 3.301282051282051, + "grad_norm": 0.9431992769241333, + "learning_rate": 1.4738177737501214e-05, + "loss": 0.7086, + "step": 18543 + }, + { + "epoch": 3.301460113960114, + "grad_norm": 0.9284996390342712, + "learning_rate": 1.4730864412506407e-05, + "loss": 0.8364, + "step": 18544 + }, + { + "epoch": 3.3016381766381766, + "grad_norm": 0.7849892377853394, + "learning_rate": 
1.4723552758183178e-05, + "loss": 0.7114, + "step": 18545 + }, + { + "epoch": 3.3018162393162394, + "grad_norm": 1.096384882926941, + "learning_rate": 1.471624277467476e-05, + "loss": 0.8764, + "step": 18546 + }, + { + "epoch": 3.301994301994302, + "grad_norm": 0.9354200959205627, + "learning_rate": 1.4708934462124458e-05, + "loss": 0.8211, + "step": 18547 + }, + { + "epoch": 3.302172364672365, + "grad_norm": 0.8292770981788635, + "learning_rate": 1.4701627820675356e-05, + "loss": 0.8402, + "step": 18548 + }, + { + "epoch": 3.302350427350427, + "grad_norm": 0.8030630946159363, + "learning_rate": 1.4694322850470687e-05, + "loss": 0.4122, + "step": 18549 + }, + { + "epoch": 3.30252849002849, + "grad_norm": 0.8566523790359497, + "learning_rate": 1.4687019551653558e-05, + "loss": 0.6334, + "step": 18550 + }, + { + "epoch": 3.3027065527065527, + "grad_norm": 0.869368851184845, + "learning_rate": 1.4679717924367053e-05, + "loss": 0.8575, + "step": 18551 + }, + { + "epoch": 3.3028846153846154, + "grad_norm": 0.9408078193664551, + "learning_rate": 1.4672417968754237e-05, + "loss": 0.6937, + "step": 18552 + }, + { + "epoch": 3.303062678062678, + "grad_norm": 0.9742521643638611, + "learning_rate": 1.466511968495814e-05, + "loss": 0.7323, + "step": 18553 + }, + { + "epoch": 3.303240740740741, + "grad_norm": 1.073304533958435, + "learning_rate": 1.4657823073121735e-05, + "loss": 0.6501, + "step": 18554 + }, + { + "epoch": 3.3034188034188032, + "grad_norm": 1.0220160484313965, + "learning_rate": 1.4650528133388053e-05, + "loss": 0.7497, + "step": 18555 + }, + { + "epoch": 3.303596866096866, + "grad_norm": 0.9558535218238831, + "learning_rate": 1.4643234865899924e-05, + "loss": 0.796, + "step": 18556 + }, + { + "epoch": 3.3037749287749287, + "grad_norm": 0.9674620628356934, + "learning_rate": 1.4635943270800334e-05, + "loss": 0.806, + "step": 18557 + }, + { + "epoch": 3.3039529914529915, + "grad_norm": 0.934508204460144, + "learning_rate": 1.462865334823207e-05, + "loss": 
0.6858, + "step": 18558 + }, + { + "epoch": 3.3041310541310542, + "grad_norm": 0.9529309868812561, + "learning_rate": 1.4621365098338024e-05, + "loss": 0.719, + "step": 18559 + }, + { + "epoch": 3.304309116809117, + "grad_norm": 0.9932489395141602, + "learning_rate": 1.4614078521260965e-05, + "loss": 0.9503, + "step": 18560 + }, + { + "epoch": 3.3044871794871793, + "grad_norm": 0.9791563749313354, + "learning_rate": 1.4606793617143667e-05, + "loss": 0.8629, + "step": 18561 + }, + { + "epoch": 3.304665242165242, + "grad_norm": 1.036198616027832, + "learning_rate": 1.4599510386128857e-05, + "loss": 0.82, + "step": 18562 + }, + { + "epoch": 3.304843304843305, + "grad_norm": 0.9353040456771851, + "learning_rate": 1.4592228828359234e-05, + "loss": 0.7878, + "step": 18563 + }, + { + "epoch": 3.3050213675213675, + "grad_norm": 1.0084593296051025, + "learning_rate": 1.458494894397745e-05, + "loss": 0.7765, + "step": 18564 + }, + { + "epoch": 3.3051994301994303, + "grad_norm": 1.0099483728408813, + "learning_rate": 1.4577670733126203e-05, + "loss": 0.9263, + "step": 18565 + }, + { + "epoch": 3.305377492877493, + "grad_norm": 0.8631238341331482, + "learning_rate": 1.4570394195948e-05, + "loss": 0.739, + "step": 18566 + }, + { + "epoch": 3.3055555555555554, + "grad_norm": 0.8711506724357605, + "learning_rate": 1.4563119332585484e-05, + "loss": 0.7182, + "step": 18567 + }, + { + "epoch": 3.305733618233618, + "grad_norm": 0.9329670667648315, + "learning_rate": 1.4555846143181162e-05, + "loss": 0.6453, + "step": 18568 + }, + { + "epoch": 3.305911680911681, + "grad_norm": 0.9587839245796204, + "learning_rate": 1.4548574627877532e-05, + "loss": 0.7599, + "step": 18569 + }, + { + "epoch": 3.3060897435897436, + "grad_norm": 0.9862858653068542, + "learning_rate": 1.4541304786817089e-05, + "loss": 0.8038, + "step": 18570 + }, + { + "epoch": 3.3062678062678064, + "grad_norm": 0.852069616317749, + "learning_rate": 1.4534036620142221e-05, + "loss": 0.8945, + "step": 18571 + }, + { + 
"epoch": 3.306445868945869, + "grad_norm": 0.9384768009185791, + "learning_rate": 1.4526770127995415e-05, + "loss": 0.7363, + "step": 18572 + }, + { + "epoch": 3.306623931623932, + "grad_norm": 0.8548694252967834, + "learning_rate": 1.4519505310518944e-05, + "loss": 0.7693, + "step": 18573 + }, + { + "epoch": 3.306801994301994, + "grad_norm": 1.0954573154449463, + "learning_rate": 1.4512242167855216e-05, + "loss": 0.8015, + "step": 18574 + }, + { + "epoch": 3.306980056980057, + "grad_norm": 0.8069053292274475, + "learning_rate": 1.450498070014652e-05, + "loss": 0.6506, + "step": 18575 + }, + { + "epoch": 3.3071581196581197, + "grad_norm": 0.8466541171073914, + "learning_rate": 1.4497720907535128e-05, + "loss": 0.6072, + "step": 18576 + }, + { + "epoch": 3.3073361823361824, + "grad_norm": 0.7500994205474854, + "learning_rate": 1.4490462790163285e-05, + "loss": 0.7196, + "step": 18577 + }, + { + "epoch": 3.307514245014245, + "grad_norm": 0.9503157734870911, + "learning_rate": 1.4483206348173185e-05, + "loss": 0.7822, + "step": 18578 + }, + { + "epoch": 3.3076923076923075, + "grad_norm": 0.9990713596343994, + "learning_rate": 1.447595158170698e-05, + "loss": 0.8523, + "step": 18579 + }, + { + "epoch": 3.3078703703703702, + "grad_norm": 1.0937577486038208, + "learning_rate": 1.4468698490906907e-05, + "loss": 0.6461, + "step": 18580 + }, + { + "epoch": 3.308048433048433, + "grad_norm": 0.8959493637084961, + "learning_rate": 1.4461447075914957e-05, + "loss": 0.6803, + "step": 18581 + }, + { + "epoch": 3.3082264957264957, + "grad_norm": 0.9823698997497559, + "learning_rate": 1.4454197336873299e-05, + "loss": 0.8456, + "step": 18582 + }, + { + "epoch": 3.3084045584045585, + "grad_norm": 0.9720016121864319, + "learning_rate": 1.4446949273923904e-05, + "loss": 0.7386, + "step": 18583 + }, + { + "epoch": 3.3085826210826212, + "grad_norm": 0.9927224516868591, + "learning_rate": 1.4439702887208839e-05, + "loss": 0.706, + "step": 18584 + }, + { + "epoch": 3.308760683760684, + 
"grad_norm": 0.8993792533874512, + "learning_rate": 1.4432458176870056e-05, + "loss": 0.7265, + "step": 18585 + }, + { + "epoch": 3.3089387464387463, + "grad_norm": 0.7669193148612976, + "learning_rate": 1.4425215143049475e-05, + "loss": 0.5659, + "step": 18586 + }, + { + "epoch": 3.309116809116809, + "grad_norm": 0.9683290123939514, + "learning_rate": 1.4417973785889094e-05, + "loss": 0.967, + "step": 18587 + }, + { + "epoch": 3.309294871794872, + "grad_norm": 0.8810568451881409, + "learning_rate": 1.4410734105530688e-05, + "loss": 0.7119, + "step": 18588 + }, + { + "epoch": 3.3094729344729346, + "grad_norm": 0.9147163033485413, + "learning_rate": 1.4403496102116165e-05, + "loss": 0.7214, + "step": 18589 + }, + { + "epoch": 3.3096509971509973, + "grad_norm": 1.13361394405365, + "learning_rate": 1.4396259775787335e-05, + "loss": 0.8442, + "step": 18590 + }, + { + "epoch": 3.3098290598290596, + "grad_norm": 0.8798992037773132, + "learning_rate": 1.4389025126685962e-05, + "loss": 0.5915, + "step": 18591 + }, + { + "epoch": 3.3100071225071224, + "grad_norm": 0.9003865718841553, + "learning_rate": 1.43817921549538e-05, + "loss": 0.6996, + "step": 18592 + }, + { + "epoch": 3.310185185185185, + "grad_norm": 0.8331068754196167, + "learning_rate": 1.4374560860732567e-05, + "loss": 0.4284, + "step": 18593 + }, + { + "epoch": 3.310363247863248, + "grad_norm": 0.8950058817863464, + "learning_rate": 1.4367331244163939e-05, + "loss": 0.6156, + "step": 18594 + }, + { + "epoch": 3.3105413105413106, + "grad_norm": 0.8993448615074158, + "learning_rate": 1.4360103305389572e-05, + "loss": 0.8449, + "step": 18595 + }, + { + "epoch": 3.3107193732193734, + "grad_norm": 0.9031822085380554, + "learning_rate": 1.435287704455105e-05, + "loss": 0.6608, + "step": 18596 + }, + { + "epoch": 3.310897435897436, + "grad_norm": 0.8823438882827759, + "learning_rate": 1.4345652461790038e-05, + "loss": 0.6606, + "step": 18597 + }, + { + "epoch": 3.3110754985754984, + "grad_norm": 1.0061595439910889, + 
"learning_rate": 1.433842955724799e-05, + "loss": 0.7287, + "step": 18598 + }, + { + "epoch": 3.311253561253561, + "grad_norm": 1.0008385181427002, + "learning_rate": 1.4331208331066493e-05, + "loss": 0.8076, + "step": 18599 + }, + { + "epoch": 3.311431623931624, + "grad_norm": 0.7386804819107056, + "learning_rate": 1.4323988783387e-05, + "loss": 0.4206, + "step": 18600 + }, + { + "epoch": 3.3116096866096867, + "grad_norm": 0.9149610996246338, + "learning_rate": 1.4316770914350975e-05, + "loss": 0.6752, + "step": 18601 + }, + { + "epoch": 3.3117877492877494, + "grad_norm": 0.9260136485099792, + "learning_rate": 1.4309554724099827e-05, + "loss": 0.6503, + "step": 18602 + }, + { + "epoch": 3.3119658119658117, + "grad_norm": 0.9129661321640015, + "learning_rate": 1.4302340212774956e-05, + "loss": 0.7452, + "step": 18603 + }, + { + "epoch": 3.3121438746438745, + "grad_norm": 0.9771081805229187, + "learning_rate": 1.4295127380517704e-05, + "loss": 0.8258, + "step": 18604 + }, + { + "epoch": 3.3123219373219372, + "grad_norm": 0.8027868866920471, + "learning_rate": 1.4287916227469366e-05, + "loss": 0.617, + "step": 18605 + }, + { + "epoch": 3.3125, + "grad_norm": 0.8909293413162231, + "learning_rate": 1.428070675377129e-05, + "loss": 0.6898, + "step": 18606 + }, + { + "epoch": 3.3126780626780628, + "grad_norm": 0.8994025588035583, + "learning_rate": 1.427349895956469e-05, + "loss": 0.8001, + "step": 18607 + }, + { + "epoch": 3.3128561253561255, + "grad_norm": 1.062908411026001, + "learning_rate": 1.4266292844990802e-05, + "loss": 0.9458, + "step": 18608 + }, + { + "epoch": 3.3130341880341883, + "grad_norm": 0.9574441909790039, + "learning_rate": 1.4259088410190813e-05, + "loss": 0.7763, + "step": 18609 + }, + { + "epoch": 3.3132122507122506, + "grad_norm": 0.9709355235099792, + "learning_rate": 1.4251885655305863e-05, + "loss": 0.6442, + "step": 18610 + }, + { + "epoch": 3.3133903133903133, + "grad_norm": 0.964805006980896, + "learning_rate": 1.4244684580477075e-05, + 
"loss": 0.8413, + "step": 18611 + }, + { + "epoch": 3.313568376068376, + "grad_norm": 0.806939959526062, + "learning_rate": 1.423748518584559e-05, + "loss": 0.5702, + "step": 18612 + }, + { + "epoch": 3.313746438746439, + "grad_norm": 0.9942196607589722, + "learning_rate": 1.4230287471552394e-05, + "loss": 0.8193, + "step": 18613 + }, + { + "epoch": 3.3139245014245016, + "grad_norm": 1.024062991142273, + "learning_rate": 1.4223091437738556e-05, + "loss": 0.6324, + "step": 18614 + }, + { + "epoch": 3.314102564102564, + "grad_norm": 0.8655102252960205, + "learning_rate": 1.421589708454505e-05, + "loss": 0.7141, + "step": 18615 + }, + { + "epoch": 3.3142806267806266, + "grad_norm": 0.8736686706542969, + "learning_rate": 1.420870441211285e-05, + "loss": 0.7649, + "step": 18616 + }, + { + "epoch": 3.3144586894586894, + "grad_norm": 1.0445935726165771, + "learning_rate": 1.4201513420582868e-05, + "loss": 0.943, + "step": 18617 + }, + { + "epoch": 3.314636752136752, + "grad_norm": 1.0183906555175781, + "learning_rate": 1.4194324110096002e-05, + "loss": 0.7352, + "step": 18618 + }, + { + "epoch": 3.314814814814815, + "grad_norm": 0.8862751126289368, + "learning_rate": 1.4187136480793106e-05, + "loss": 0.6363, + "step": 18619 + }, + { + "epoch": 3.3149928774928776, + "grad_norm": 0.9212391972541809, + "learning_rate": 1.4179950532815011e-05, + "loss": 0.7494, + "step": 18620 + }, + { + "epoch": 3.3151709401709404, + "grad_norm": 0.8244057297706604, + "learning_rate": 1.4172766266302484e-05, + "loss": 0.5704, + "step": 18621 + }, + { + "epoch": 3.3153490028490027, + "grad_norm": 1.016066551208496, + "learning_rate": 1.4165583681396355e-05, + "loss": 0.8147, + "step": 18622 + }, + { + "epoch": 3.3155270655270654, + "grad_norm": 0.938336968421936, + "learning_rate": 1.4158402778237267e-05, + "loss": 0.7383, + "step": 18623 + }, + { + "epoch": 3.315705128205128, + "grad_norm": 1.1921639442443848, + "learning_rate": 1.4151223556965976e-05, + "loss": 0.8468, + "step": 18624 + }, 
+ { + "epoch": 3.315883190883191, + "grad_norm": 1.0110679864883423, + "learning_rate": 1.4144046017723134e-05, + "loss": 0.8632, + "step": 18625 + }, + { + "epoch": 3.3160612535612537, + "grad_norm": 1.1017330884933472, + "learning_rate": 1.4136870160649329e-05, + "loss": 0.8882, + "step": 18626 + }, + { + "epoch": 3.316239316239316, + "grad_norm": 0.8759216070175171, + "learning_rate": 1.4129695985885228e-05, + "loss": 0.8481, + "step": 18627 + }, + { + "epoch": 3.3164173789173788, + "grad_norm": 0.9331730008125305, + "learning_rate": 1.4122523493571316e-05, + "loss": 0.7723, + "step": 18628 + }, + { + "epoch": 3.3165954415954415, + "grad_norm": 0.8843693137168884, + "learning_rate": 1.4115352683848204e-05, + "loss": 0.7457, + "step": 18629 + }, + { + "epoch": 3.3167735042735043, + "grad_norm": 1.042717456817627, + "learning_rate": 1.4108183556856302e-05, + "loss": 0.7947, + "step": 18630 + }, + { + "epoch": 3.316951566951567, + "grad_norm": 0.912771999835968, + "learning_rate": 1.4101016112736143e-05, + "loss": 0.7209, + "step": 18631 + }, + { + "epoch": 3.3171296296296298, + "grad_norm": 0.8966865539550781, + "learning_rate": 1.4093850351628136e-05, + "loss": 0.8184, + "step": 18632 + }, + { + "epoch": 3.3173076923076925, + "grad_norm": 0.9174174666404724, + "learning_rate": 1.4086686273672668e-05, + "loss": 0.7335, + "step": 18633 + }, + { + "epoch": 3.317485754985755, + "grad_norm": 0.9604222178459167, + "learning_rate": 1.4079523879010114e-05, + "loss": 0.7813, + "step": 18634 + }, + { + "epoch": 3.3176638176638176, + "grad_norm": 0.9172660708427429, + "learning_rate": 1.4072363167780811e-05, + "loss": 0.7329, + "step": 18635 + }, + { + "epoch": 3.3178418803418803, + "grad_norm": 0.8856810927391052, + "learning_rate": 1.4065204140125033e-05, + "loss": 0.7138, + "step": 18636 + }, + { + "epoch": 3.318019943019943, + "grad_norm": 0.9621987342834473, + "learning_rate": 1.40580467961831e-05, + "loss": 0.777, + "step": 18637 + }, + { + "epoch": 3.318198005698006, 
+ "grad_norm": 0.9171984791755676, + "learning_rate": 1.4050891136095179e-05, + "loss": 0.9051, + "step": 18638 + }, + { + "epoch": 3.318376068376068, + "grad_norm": 0.963080108165741, + "learning_rate": 1.4043737160001536e-05, + "loss": 0.7974, + "step": 18639 + }, + { + "epoch": 3.318554131054131, + "grad_norm": 0.8158769607543945, + "learning_rate": 1.4036584868042268e-05, + "loss": 0.6412, + "step": 18640 + }, + { + "epoch": 3.3187321937321936, + "grad_norm": 1.070873737335205, + "learning_rate": 1.4029434260357565e-05, + "loss": 0.8986, + "step": 18641 + }, + { + "epoch": 3.3189102564102564, + "grad_norm": 1.0787153244018555, + "learning_rate": 1.4022285337087504e-05, + "loss": 0.7754, + "step": 18642 + }, + { + "epoch": 3.319088319088319, + "grad_norm": 0.8161159157752991, + "learning_rate": 1.401513809837216e-05, + "loss": 0.6833, + "step": 18643 + }, + { + "epoch": 3.319266381766382, + "grad_norm": 0.9341142177581787, + "learning_rate": 1.4007992544351578e-05, + "loss": 0.6834, + "step": 18644 + }, + { + "epoch": 3.3194444444444446, + "grad_norm": 0.9400647878646851, + "learning_rate": 1.4000848675165711e-05, + "loss": 0.8276, + "step": 18645 + }, + { + "epoch": 3.319622507122507, + "grad_norm": 0.8795194029808044, + "learning_rate": 1.3993706490954595e-05, + "loss": 0.7008, + "step": 18646 + }, + { + "epoch": 3.3198005698005697, + "grad_norm": 1.02724289894104, + "learning_rate": 1.3986565991858148e-05, + "loss": 0.743, + "step": 18647 + }, + { + "epoch": 3.3199786324786325, + "grad_norm": 1.0184358358383179, + "learning_rate": 1.3979427178016247e-05, + "loss": 0.9255, + "step": 18648 + }, + { + "epoch": 3.320156695156695, + "grad_norm": 0.9417447447776794, + "learning_rate": 1.3972290049568793e-05, + "loss": 0.7776, + "step": 18649 + }, + { + "epoch": 3.320334757834758, + "grad_norm": 1.0770184993743896, + "learning_rate": 1.3965154606655595e-05, + "loss": 0.9044, + "step": 18650 + }, + { + "epoch": 3.3205128205128207, + "grad_norm": 1.0059106349945068, + 
"learning_rate": 1.3958020849416453e-05, + "loss": 0.8848, + "step": 18651 + }, + { + "epoch": 3.320690883190883, + "grad_norm": 0.8800293207168579, + "learning_rate": 1.39508887779912e-05, + "loss": 0.627, + "step": 18652 + }, + { + "epoch": 3.3208689458689458, + "grad_norm": 1.1125677824020386, + "learning_rate": 1.394375839251949e-05, + "loss": 1.0158, + "step": 18653 + }, + { + "epoch": 3.3210470085470085, + "grad_norm": 0.9559670686721802, + "learning_rate": 1.3936629693141112e-05, + "loss": 0.8771, + "step": 18654 + }, + { + "epoch": 3.3212250712250713, + "grad_norm": 0.8112167119979858, + "learning_rate": 1.3929502679995643e-05, + "loss": 0.5987, + "step": 18655 + }, + { + "epoch": 3.321403133903134, + "grad_norm": 1.1909687519073486, + "learning_rate": 1.3922377353222805e-05, + "loss": 0.8939, + "step": 18656 + }, + { + "epoch": 3.3215811965811968, + "grad_norm": 0.9428912997245789, + "learning_rate": 1.3915253712962162e-05, + "loss": 0.8213, + "step": 18657 + }, + { + "epoch": 3.321759259259259, + "grad_norm": 0.9573705792427063, + "learning_rate": 1.3908131759353304e-05, + "loss": 0.8073, + "step": 18658 + }, + { + "epoch": 3.321937321937322, + "grad_norm": 0.8574663400650024, + "learning_rate": 1.3901011492535754e-05, + "loss": 0.5301, + "step": 18659 + }, + { + "epoch": 3.3221153846153846, + "grad_norm": 1.016772985458374, + "learning_rate": 1.3893892912649043e-05, + "loss": 0.5597, + "step": 18660 + }, + { + "epoch": 3.3222934472934473, + "grad_norm": 0.8159156441688538, + "learning_rate": 1.3886776019832592e-05, + "loss": 0.5178, + "step": 18661 + }, + { + "epoch": 3.32247150997151, + "grad_norm": 1.0510454177856445, + "learning_rate": 1.3879660814225937e-05, + "loss": 0.7447, + "step": 18662 + }, + { + "epoch": 3.322649572649573, + "grad_norm": 0.9608046412467957, + "learning_rate": 1.3872547295968386e-05, + "loss": 0.567, + "step": 18663 + }, + { + "epoch": 3.322827635327635, + "grad_norm": 0.9422797560691833, + "learning_rate": 
1.3865435465199394e-05, + "loss": 0.7763, + "step": 18664 + }, + { + "epoch": 3.323005698005698, + "grad_norm": 0.8879508376121521, + "learning_rate": 1.385832532205822e-05, + "loss": 0.6057, + "step": 18665 + }, + { + "epoch": 3.3231837606837606, + "grad_norm": 0.8140594959259033, + "learning_rate": 1.3851216866684236e-05, + "loss": 0.6785, + "step": 18666 + }, + { + "epoch": 3.3233618233618234, + "grad_norm": 0.8440611362457275, + "learning_rate": 1.3844110099216712e-05, + "loss": 0.8621, + "step": 18667 + }, + { + "epoch": 3.323539886039886, + "grad_norm": 1.1034574508666992, + "learning_rate": 1.3837005019794847e-05, + "loss": 0.8346, + "step": 18668 + }, + { + "epoch": 3.323717948717949, + "grad_norm": 0.9221482276916504, + "learning_rate": 1.3829901628557928e-05, + "loss": 0.848, + "step": 18669 + }, + { + "epoch": 3.323896011396011, + "grad_norm": 0.9995195865631104, + "learning_rate": 1.3822799925645036e-05, + "loss": 0.7762, + "step": 18670 + }, + { + "epoch": 3.324074074074074, + "grad_norm": 0.8469681143760681, + "learning_rate": 1.3815699911195379e-05, + "loss": 0.7104, + "step": 18671 + }, + { + "epoch": 3.3242521367521367, + "grad_norm": 0.9675276875495911, + "learning_rate": 1.3808601585348057e-05, + "loss": 0.8602, + "step": 18672 + }, + { + "epoch": 3.3244301994301995, + "grad_norm": 0.9134352803230286, + "learning_rate": 1.3801504948242138e-05, + "loss": 0.7042, + "step": 18673 + }, + { + "epoch": 3.324608262108262, + "grad_norm": 0.9910576939582825, + "learning_rate": 1.3794410000016667e-05, + "loss": 0.714, + "step": 18674 + }, + { + "epoch": 3.324786324786325, + "grad_norm": 1.1341938972473145, + "learning_rate": 1.3787316740810663e-05, + "loss": 0.838, + "step": 18675 + }, + { + "epoch": 3.3249643874643873, + "grad_norm": 0.8614459037780762, + "learning_rate": 1.3780225170763083e-05, + "loss": 0.699, + "step": 18676 + }, + { + "epoch": 3.32514245014245, + "grad_norm": 0.906356155872345, + "learning_rate": 1.3773135290012895e-05, + "loss": 
0.7302, + "step": 18677 + }, + { + "epoch": 3.3253205128205128, + "grad_norm": 1.0148662328720093, + "learning_rate": 1.3766047098698975e-05, + "loss": 0.6931, + "step": 18678 + }, + { + "epoch": 3.3254985754985755, + "grad_norm": 0.9177684187889099, + "learning_rate": 1.3758960596960268e-05, + "loss": 0.8363, + "step": 18679 + }, + { + "epoch": 3.3256766381766383, + "grad_norm": 1.0270618200302124, + "learning_rate": 1.3751875784935542e-05, + "loss": 0.823, + "step": 18680 + }, + { + "epoch": 3.325854700854701, + "grad_norm": 0.9151124954223633, + "learning_rate": 1.3744792662763661e-05, + "loss": 0.7939, + "step": 18681 + }, + { + "epoch": 3.326032763532764, + "grad_norm": 0.9115209579467773, + "learning_rate": 1.3737711230583384e-05, + "loss": 0.7914, + "step": 18682 + }, + { + "epoch": 3.326210826210826, + "grad_norm": 1.0149354934692383, + "learning_rate": 1.3730631488533462e-05, + "loss": 0.8579, + "step": 18683 + }, + { + "epoch": 3.326388888888889, + "grad_norm": 0.9036294221878052, + "learning_rate": 1.3723553436752612e-05, + "loss": 0.5274, + "step": 18684 + }, + { + "epoch": 3.3265669515669516, + "grad_norm": 1.0346934795379639, + "learning_rate": 1.3716477075379485e-05, + "loss": 0.8489, + "step": 18685 + }, + { + "epoch": 3.3267450142450143, + "grad_norm": 0.8915591835975647, + "learning_rate": 1.3709402404552773e-05, + "loss": 0.7278, + "step": 18686 + }, + { + "epoch": 3.326923076923077, + "grad_norm": 0.9130534529685974, + "learning_rate": 1.3702329424411076e-05, + "loss": 0.7576, + "step": 18687 + }, + { + "epoch": 3.3271011396011394, + "grad_norm": 0.9920860528945923, + "learning_rate": 1.3695258135092959e-05, + "loss": 0.7424, + "step": 18688 + }, + { + "epoch": 3.327279202279202, + "grad_norm": 0.9840420484542847, + "learning_rate": 1.3688188536736968e-05, + "loss": 0.6867, + "step": 18689 + }, + { + "epoch": 3.327457264957265, + "grad_norm": 0.8766686320304871, + "learning_rate": 1.3681120629481637e-05, + "loss": 0.6816, + "step": 18690 + }, + 
{ + "epoch": 3.3276353276353277, + "grad_norm": 1.0345094203948975, + "learning_rate": 1.367405441346543e-05, + "loss": 0.9062, + "step": 18691 + }, + { + "epoch": 3.3278133903133904, + "grad_norm": 0.9291123151779175, + "learning_rate": 1.3666989888826798e-05, + "loss": 0.9039, + "step": 18692 + }, + { + "epoch": 3.327991452991453, + "grad_norm": 0.8739197254180908, + "learning_rate": 1.3659927055704136e-05, + "loss": 0.6158, + "step": 18693 + }, + { + "epoch": 3.328169515669516, + "grad_norm": 1.0071395635604858, + "learning_rate": 1.3652865914235901e-05, + "loss": 0.7055, + "step": 18694 + }, + { + "epoch": 3.328347578347578, + "grad_norm": 0.9326104521751404, + "learning_rate": 1.364580646456033e-05, + "loss": 0.795, + "step": 18695 + }, + { + "epoch": 3.328525641025641, + "grad_norm": 0.9884015321731567, + "learning_rate": 1.363874870681583e-05, + "loss": 0.8504, + "step": 18696 + }, + { + "epoch": 3.3287037037037037, + "grad_norm": 0.9422696828842163, + "learning_rate": 1.3631692641140647e-05, + "loss": 0.6087, + "step": 18697 + }, + { + "epoch": 3.3288817663817665, + "grad_norm": 0.9124358296394348, + "learning_rate": 1.3624638267673029e-05, + "loss": 0.8993, + "step": 18698 + }, + { + "epoch": 3.3290598290598292, + "grad_norm": 0.9755852818489075, + "learning_rate": 1.3617585586551196e-05, + "loss": 0.8998, + "step": 18699 + }, + { + "epoch": 3.3292378917378915, + "grad_norm": 0.9847198128700256, + "learning_rate": 1.3610534597913326e-05, + "loss": 0.7212, + "step": 18700 + }, + { + "epoch": 3.3294159544159543, + "grad_norm": 0.9432051777839661, + "learning_rate": 1.3603485301897579e-05, + "loss": 0.6306, + "step": 18701 + }, + { + "epoch": 3.329594017094017, + "grad_norm": 0.973268985748291, + "learning_rate": 1.359643769864205e-05, + "loss": 0.8055, + "step": 18702 + }, + { + "epoch": 3.32977207977208, + "grad_norm": 1.0206477642059326, + "learning_rate": 1.3589391788284834e-05, + "loss": 0.8731, + "step": 18703 + }, + { + "epoch": 3.3299501424501425, + 
"grad_norm": 1.0303237438201904, + "learning_rate": 1.3582347570963994e-05, + "loss": 0.7416, + "step": 18704 + }, + { + "epoch": 3.3301282051282053, + "grad_norm": 0.9327782392501831, + "learning_rate": 1.3575305046817533e-05, + "loss": 1.1033, + "step": 18705 + }, + { + "epoch": 3.330306267806268, + "grad_norm": 1.0996826887130737, + "learning_rate": 1.3568264215983439e-05, + "loss": 0.6945, + "step": 18706 + }, + { + "epoch": 3.3304843304843303, + "grad_norm": 0.9221864342689514, + "learning_rate": 1.356122507859967e-05, + "loss": 0.64, + "step": 18707 + }, + { + "epoch": 3.330662393162393, + "grad_norm": 0.8814072608947754, + "learning_rate": 1.3554187634804105e-05, + "loss": 0.787, + "step": 18708 + }, + { + "epoch": 3.330840455840456, + "grad_norm": 0.9193364977836609, + "learning_rate": 1.354715188473471e-05, + "loss": 0.7423, + "step": 18709 + }, + { + "epoch": 3.3310185185185186, + "grad_norm": 0.9405232667922974, + "learning_rate": 1.354011782852923e-05, + "loss": 0.8607, + "step": 18710 + }, + { + "epoch": 3.3311965811965814, + "grad_norm": 1.0526103973388672, + "learning_rate": 1.353308546632559e-05, + "loss": 0.8387, + "step": 18711 + }, + { + "epoch": 3.3313746438746437, + "grad_norm": 1.0627460479736328, + "learning_rate": 1.3526054798261468e-05, + "loss": 0.7533, + "step": 18712 + }, + { + "epoch": 3.3315527065527064, + "grad_norm": 0.9542255997657776, + "learning_rate": 1.3519025824474695e-05, + "loss": 0.8249, + "step": 18713 + }, + { + "epoch": 3.331730769230769, + "grad_norm": 0.8286230564117432, + "learning_rate": 1.3511998545102956e-05, + "loss": 0.6924, + "step": 18714 + }, + { + "epoch": 3.331908831908832, + "grad_norm": 1.0210462808609009, + "learning_rate": 1.3504972960283946e-05, + "loss": 0.7209, + "step": 18715 + }, + { + "epoch": 3.3320868945868947, + "grad_norm": 1.0881197452545166, + "learning_rate": 1.3497949070155313e-05, + "loss": 0.8403, + "step": 18716 + }, + { + "epoch": 3.3322649572649574, + "grad_norm": 0.8475449085235596, + 
"learning_rate": 1.349092687485468e-05, + "loss": 0.701, + "step": 18717 + }, + { + "epoch": 3.33244301994302, + "grad_norm": 0.814493715763092, + "learning_rate": 1.3483906374519607e-05, + "loss": 0.7209, + "step": 18718 + }, + { + "epoch": 3.3326210826210825, + "grad_norm": 0.9363925457000732, + "learning_rate": 1.3476887569287699e-05, + "loss": 0.7384, + "step": 18719 + }, + { + "epoch": 3.3327991452991452, + "grad_norm": 0.8990989327430725, + "learning_rate": 1.3469870459296408e-05, + "loss": 0.7414, + "step": 18720 + }, + { + "epoch": 3.332977207977208, + "grad_norm": 0.814422607421875, + "learning_rate": 1.3462855044683276e-05, + "loss": 0.7229, + "step": 18721 + }, + { + "epoch": 3.3331552706552707, + "grad_norm": 0.8876845240592957, + "learning_rate": 1.3455841325585727e-05, + "loss": 0.9162, + "step": 18722 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.9989238381385803, + "learning_rate": 1.3448829302141198e-05, + "loss": 0.74, + "step": 18723 + }, + { + "epoch": 3.333511396011396, + "grad_norm": 1.0466902256011963, + "learning_rate": 1.3441818974487064e-05, + "loss": 0.7875, + "step": 18724 + }, + { + "epoch": 3.3336894586894585, + "grad_norm": 0.9067308306694031, + "learning_rate": 1.343481034276065e-05, + "loss": 0.7372, + "step": 18725 + }, + { + "epoch": 3.3338675213675213, + "grad_norm": 0.9707076549530029, + "learning_rate": 1.3427803407099349e-05, + "loss": 0.7035, + "step": 18726 + }, + { + "epoch": 3.334045584045584, + "grad_norm": 1.0841397047042847, + "learning_rate": 1.3420798167640358e-05, + "loss": 0.6551, + "step": 18727 + }, + { + "epoch": 3.334223646723647, + "grad_norm": 0.9127151370048523, + "learning_rate": 1.3413794624520992e-05, + "loss": 0.8078, + "step": 18728 + }, + { + "epoch": 3.3344017094017095, + "grad_norm": 0.9369844198226929, + "learning_rate": 1.3406792777878463e-05, + "loss": 0.6668, + "step": 18729 + }, + { + "epoch": 3.3345797720797723, + "grad_norm": 0.9303719401359558, + "learning_rate": 
1.3399792627849938e-05, + "loss": 0.7203, + "step": 18730 + }, + { + "epoch": 3.3347578347578346, + "grad_norm": 0.9815326929092407, + "learning_rate": 1.3392794174572588e-05, + "loss": 0.7766, + "step": 18731 + }, + { + "epoch": 3.3349358974358974, + "grad_norm": 0.8707610964775085, + "learning_rate": 1.338579741818352e-05, + "loss": 0.6492, + "step": 18732 + }, + { + "epoch": 3.33511396011396, + "grad_norm": 0.8969348669052124, + "learning_rate": 1.337880235881981e-05, + "loss": 0.6799, + "step": 18733 + }, + { + "epoch": 3.335292022792023, + "grad_norm": 1.079229474067688, + "learning_rate": 1.3371808996618562e-05, + "loss": 0.683, + "step": 18734 + }, + { + "epoch": 3.3354700854700856, + "grad_norm": 0.9601012468338013, + "learning_rate": 1.3364817331716728e-05, + "loss": 0.8354, + "step": 18735 + }, + { + "epoch": 3.335648148148148, + "grad_norm": 0.7610724568367004, + "learning_rate": 1.3357827364251364e-05, + "loss": 0.6091, + "step": 18736 + }, + { + "epoch": 3.3358262108262107, + "grad_norm": 1.0253962278366089, + "learning_rate": 1.3350839094359346e-05, + "loss": 0.7979, + "step": 18737 + }, + { + "epoch": 3.3360042735042734, + "grad_norm": 1.020058512687683, + "learning_rate": 1.3343852522177668e-05, + "loss": 0.8259, + "step": 18738 + }, + { + "epoch": 3.336182336182336, + "grad_norm": 0.7768040895462036, + "learning_rate": 1.3336867647843188e-05, + "loss": 0.4803, + "step": 18739 + }, + { + "epoch": 3.336360398860399, + "grad_norm": 0.8641179800033569, + "learning_rate": 1.3329884471492749e-05, + "loss": 0.6559, + "step": 18740 + }, + { + "epoch": 3.3365384615384617, + "grad_norm": 1.1274569034576416, + "learning_rate": 1.3322902993263197e-05, + "loss": 0.8297, + "step": 18741 + }, + { + "epoch": 3.3367165242165244, + "grad_norm": 0.86558598279953, + "learning_rate": 1.3315923213291292e-05, + "loss": 0.7778, + "step": 18742 + }, + { + "epoch": 3.3368945868945867, + "grad_norm": 1.0409115552902222, + "learning_rate": 1.330894513171378e-05, + "loss": 
0.7114, + "step": 18743 + }, + { + "epoch": 3.3370726495726495, + "grad_norm": 0.9985263347625732, + "learning_rate": 1.3301968748667426e-05, + "loss": 0.6776, + "step": 18744 + }, + { + "epoch": 3.3372507122507122, + "grad_norm": 0.9003916382789612, + "learning_rate": 1.3294994064288902e-05, + "loss": 0.7261, + "step": 18745 + }, + { + "epoch": 3.337428774928775, + "grad_norm": 1.0112050771713257, + "learning_rate": 1.3288021078714851e-05, + "loss": 0.7276, + "step": 18746 + }, + { + "epoch": 3.3376068376068377, + "grad_norm": 0.9479689002037048, + "learning_rate": 1.3281049792081901e-05, + "loss": 0.8231, + "step": 18747 + }, + { + "epoch": 3.3377849002849, + "grad_norm": 0.9517938494682312, + "learning_rate": 1.327408020452663e-05, + "loss": 0.8923, + "step": 18748 + }, + { + "epoch": 3.337962962962963, + "grad_norm": 0.9328367710113525, + "learning_rate": 1.3267112316185603e-05, + "loss": 0.8031, + "step": 18749 + }, + { + "epoch": 3.3381410256410255, + "grad_norm": 1.142439842224121, + "learning_rate": 1.3260146127195317e-05, + "loss": 0.8177, + "step": 18750 + }, + { + "epoch": 3.3383190883190883, + "grad_norm": 0.9793458580970764, + "learning_rate": 1.3253181637692324e-05, + "loss": 0.6184, + "step": 18751 + }, + { + "epoch": 3.338497150997151, + "grad_norm": 1.0512577295303345, + "learning_rate": 1.3246218847812996e-05, + "loss": 0.7105, + "step": 18752 + }, + { + "epoch": 3.338675213675214, + "grad_norm": 0.9189818501472473, + "learning_rate": 1.3239257757693812e-05, + "loss": 0.8745, + "step": 18753 + }, + { + "epoch": 3.3388532763532766, + "grad_norm": 1.0795321464538574, + "learning_rate": 1.3232298367471141e-05, + "loss": 1.0447, + "step": 18754 + }, + { + "epoch": 3.339031339031339, + "grad_norm": 0.9658595323562622, + "learning_rate": 1.322534067728135e-05, + "loss": 0.9254, + "step": 18755 + }, + { + "epoch": 3.3392094017094016, + "grad_norm": 0.9447624087333679, + "learning_rate": 1.3218384687260743e-05, + "loss": 0.7847, + "step": 18756 + }, + { + 
"epoch": 3.3393874643874644, + "grad_norm": 1.0141079425811768, + "learning_rate": 1.321143039754561e-05, + "loss": 0.6128, + "step": 18757 + }, + { + "epoch": 3.339565527065527, + "grad_norm": 0.9425485730171204, + "learning_rate": 1.3204477808272187e-05, + "loss": 0.629, + "step": 18758 + }, + { + "epoch": 3.33974358974359, + "grad_norm": 1.0052008628845215, + "learning_rate": 1.3197526919576775e-05, + "loss": 0.8226, + "step": 18759 + }, + { + "epoch": 3.339921652421652, + "grad_norm": 0.8588734865188599, + "learning_rate": 1.3190577731595454e-05, + "loss": 0.6467, + "step": 18760 + }, + { + "epoch": 3.340099715099715, + "grad_norm": 0.8867504596710205, + "learning_rate": 1.3183630244464473e-05, + "loss": 0.8033, + "step": 18761 + }, + { + "epoch": 3.3402777777777777, + "grad_norm": 0.9576687812805176, + "learning_rate": 1.3176684458319877e-05, + "loss": 0.7776, + "step": 18762 + }, + { + "epoch": 3.3404558404558404, + "grad_norm": 0.9716742038726807, + "learning_rate": 1.3169740373297813e-05, + "loss": 0.9164, + "step": 18763 + }, + { + "epoch": 3.340633903133903, + "grad_norm": 0.9201170206069946, + "learning_rate": 1.3162797989534303e-05, + "loss": 0.7159, + "step": 18764 + }, + { + "epoch": 3.340811965811966, + "grad_norm": 0.8414926528930664, + "learning_rate": 1.3155857307165354e-05, + "loss": 0.6775, + "step": 18765 + }, + { + "epoch": 3.3409900284900287, + "grad_norm": 1.06947922706604, + "learning_rate": 1.314891832632703e-05, + "loss": 0.7879, + "step": 18766 + }, + { + "epoch": 3.341168091168091, + "grad_norm": 1.0419055223464966, + "learning_rate": 1.3141981047155183e-05, + "loss": 0.849, + "step": 18767 + }, + { + "epoch": 3.3413461538461537, + "grad_norm": 0.9134105443954468, + "learning_rate": 1.3135045469785811e-05, + "loss": 0.711, + "step": 18768 + }, + { + "epoch": 3.3415242165242165, + "grad_norm": 0.9776987433433533, + "learning_rate": 1.3128111594354775e-05, + "loss": 0.8041, + "step": 18769 + }, + { + "epoch": 3.3417022792022792, + 
"grad_norm": 0.9296642541885376, + "learning_rate": 1.3121179420997941e-05, + "loss": 0.7348, + "step": 18770 + }, + { + "epoch": 3.341880341880342, + "grad_norm": 0.9305540323257446, + "learning_rate": 1.3114248949851116e-05, + "loss": 0.6617, + "step": 18771 + }, + { + "epoch": 3.3420584045584047, + "grad_norm": 1.1082522869110107, + "learning_rate": 1.3107320181050086e-05, + "loss": 0.6465, + "step": 18772 + }, + { + "epoch": 3.342236467236467, + "grad_norm": 1.0075236558914185, + "learning_rate": 1.3100393114730614e-05, + "loss": 0.8232, + "step": 18773 + }, + { + "epoch": 3.34241452991453, + "grad_norm": 0.9493429064750671, + "learning_rate": 1.3093467751028433e-05, + "loss": 0.6698, + "step": 18774 + }, + { + "epoch": 3.3425925925925926, + "grad_norm": 0.8520618677139282, + "learning_rate": 1.3086544090079179e-05, + "loss": 0.7738, + "step": 18775 + }, + { + "epoch": 3.3427706552706553, + "grad_norm": 1.036331295967102, + "learning_rate": 1.3079622132018599e-05, + "loss": 0.6765, + "step": 18776 + }, + { + "epoch": 3.342948717948718, + "grad_norm": 1.008399486541748, + "learning_rate": 1.3072701876982218e-05, + "loss": 0.7976, + "step": 18777 + }, + { + "epoch": 3.343126780626781, + "grad_norm": 1.049607515335083, + "learning_rate": 1.3065783325105695e-05, + "loss": 0.8794, + "step": 18778 + }, + { + "epoch": 3.343304843304843, + "grad_norm": 0.954903244972229, + "learning_rate": 1.3058866476524556e-05, + "loss": 0.8595, + "step": 18779 + }, + { + "epoch": 3.343482905982906, + "grad_norm": 0.9020154476165771, + "learning_rate": 1.3051951331374323e-05, + "loss": 0.7465, + "step": 18780 + }, + { + "epoch": 3.3436609686609686, + "grad_norm": 1.0125290155410767, + "learning_rate": 1.3045037889790478e-05, + "loss": 1.0211, + "step": 18781 + }, + { + "epoch": 3.3438390313390314, + "grad_norm": 1.1179999113082886, + "learning_rate": 1.3038126151908492e-05, + "loss": 0.7702, + "step": 18782 + }, + { + "epoch": 3.344017094017094, + "grad_norm": 0.8777053952217102, + 
"learning_rate": 1.3031216117863764e-05, + "loss": 0.6477, + "step": 18783 + }, + { + "epoch": 3.344195156695157, + "grad_norm": 1.083327054977417, + "learning_rate": 1.3024307787791679e-05, + "loss": 0.9073, + "step": 18784 + }, + { + "epoch": 3.344373219373219, + "grad_norm": 1.0102063417434692, + "learning_rate": 1.3017401161827636e-05, + "loss": 0.7956, + "step": 18785 + }, + { + "epoch": 3.344551282051282, + "grad_norm": 0.8271292448043823, + "learning_rate": 1.3010496240106918e-05, + "loss": 0.7027, + "step": 18786 + }, + { + "epoch": 3.3447293447293447, + "grad_norm": 1.0592424869537354, + "learning_rate": 1.3003593022764826e-05, + "loss": 0.8079, + "step": 18787 + }, + { + "epoch": 3.3449074074074074, + "grad_norm": 1.0229346752166748, + "learning_rate": 1.2996691509936598e-05, + "loss": 0.8416, + "step": 18788 + }, + { + "epoch": 3.34508547008547, + "grad_norm": 1.0877922773361206, + "learning_rate": 1.298979170175748e-05, + "loss": 0.793, + "step": 18789 + }, + { + "epoch": 3.345263532763533, + "grad_norm": 0.9380156397819519, + "learning_rate": 1.298289359836261e-05, + "loss": 0.6552, + "step": 18790 + }, + { + "epoch": 3.3454415954415953, + "grad_norm": 0.9297496676445007, + "learning_rate": 1.2975997199887235e-05, + "loss": 0.8337, + "step": 18791 + }, + { + "epoch": 3.345619658119658, + "grad_norm": 0.9621930718421936, + "learning_rate": 1.296910250646637e-05, + "loss": 0.7204, + "step": 18792 + }, + { + "epoch": 3.3457977207977208, + "grad_norm": 0.847952663898468, + "learning_rate": 1.296220951823517e-05, + "loss": 0.6411, + "step": 18793 + }, + { + "epoch": 3.3459757834757835, + "grad_norm": 1.0718971490859985, + "learning_rate": 1.2955318235328672e-05, + "loss": 0.7651, + "step": 18794 + }, + { + "epoch": 3.3461538461538463, + "grad_norm": 0.9610202312469482, + "learning_rate": 1.2948428657881884e-05, + "loss": 0.884, + "step": 18795 + }, + { + "epoch": 3.346331908831909, + "grad_norm": 0.9173485040664673, + "learning_rate": 
1.2941540786029815e-05, + "loss": 0.736, + "step": 18796 + }, + { + "epoch": 3.3465099715099713, + "grad_norm": 1.0938438177108765, + "learning_rate": 1.2934654619907404e-05, + "loss": 0.8351, + "step": 18797 + }, + { + "epoch": 3.346688034188034, + "grad_norm": 1.040371060371399, + "learning_rate": 1.2927770159649566e-05, + "loss": 0.7698, + "step": 18798 + }, + { + "epoch": 3.346866096866097, + "grad_norm": 1.0388879776000977, + "learning_rate": 1.2920887405391202e-05, + "loss": 0.7521, + "step": 18799 + }, + { + "epoch": 3.3470441595441596, + "grad_norm": 1.1408355236053467, + "learning_rate": 1.2914006357267128e-05, + "loss": 0.6752, + "step": 18800 + }, + { + "epoch": 3.3472222222222223, + "grad_norm": 0.9846867322921753, + "learning_rate": 1.2907127015412247e-05, + "loss": 0.7124, + "step": 18801 + }, + { + "epoch": 3.347400284900285, + "grad_norm": 0.9047761559486389, + "learning_rate": 1.2900249379961238e-05, + "loss": 0.9022, + "step": 18802 + }, + { + "epoch": 3.347578347578348, + "grad_norm": 1.1381040811538696, + "learning_rate": 1.289337345104894e-05, + "loss": 0.8264, + "step": 18803 + }, + { + "epoch": 3.34775641025641, + "grad_norm": 0.8845169544219971, + "learning_rate": 1.2886499228810045e-05, + "loss": 0.7129, + "step": 18804 + }, + { + "epoch": 3.347934472934473, + "grad_norm": 0.8322994709014893, + "learning_rate": 1.28796267133792e-05, + "loss": 0.8143, + "step": 18805 + }, + { + "epoch": 3.3481125356125356, + "grad_norm": 0.9626625776290894, + "learning_rate": 1.2872755904891142e-05, + "loss": 0.747, + "step": 18806 + }, + { + "epoch": 3.3482905982905984, + "grad_norm": 0.8659831285476685, + "learning_rate": 1.2865886803480399e-05, + "loss": 0.698, + "step": 18807 + }, + { + "epoch": 3.348468660968661, + "grad_norm": 1.006449580192566, + "learning_rate": 1.2859019409281636e-05, + "loss": 0.9115, + "step": 18808 + }, + { + "epoch": 3.3486467236467234, + "grad_norm": 0.9104591012001038, + "learning_rate": 1.2852153722429327e-05, + "loss": 
0.5964, + "step": 18809 + }, + { + "epoch": 3.348824786324786, + "grad_norm": 0.9254124760627747, + "learning_rate": 1.2845289743058064e-05, + "loss": 0.784, + "step": 18810 + }, + { + "epoch": 3.349002849002849, + "grad_norm": 0.9335882663726807, + "learning_rate": 1.2838427471302284e-05, + "loss": 0.9993, + "step": 18811 + }, + { + "epoch": 3.3491809116809117, + "grad_norm": 1.064700961112976, + "learning_rate": 1.2831566907296466e-05, + "loss": 0.7263, + "step": 18812 + }, + { + "epoch": 3.3493589743589745, + "grad_norm": 1.071642279624939, + "learning_rate": 1.2824708051175016e-05, + "loss": 0.8929, + "step": 18813 + }, + { + "epoch": 3.349537037037037, + "grad_norm": 0.8615704774856567, + "learning_rate": 1.2817850903072315e-05, + "loss": 0.8469, + "step": 18814 + }, + { + "epoch": 3.3497150997151, + "grad_norm": 0.9117037057876587, + "learning_rate": 1.281099546312271e-05, + "loss": 0.8622, + "step": 18815 + }, + { + "epoch": 3.3498931623931623, + "grad_norm": 1.1236242055892944, + "learning_rate": 1.2804141731460562e-05, + "loss": 0.7232, + "step": 18816 + }, + { + "epoch": 3.350071225071225, + "grad_norm": 0.7709314823150635, + "learning_rate": 1.2797289708220084e-05, + "loss": 0.4792, + "step": 18817 + }, + { + "epoch": 3.3502492877492878, + "grad_norm": 0.9297556281089783, + "learning_rate": 1.2790439393535614e-05, + "loss": 0.6442, + "step": 18818 + }, + { + "epoch": 3.3504273504273505, + "grad_norm": 0.9198339581489563, + "learning_rate": 1.2783590787541266e-05, + "loss": 0.7885, + "step": 18819 + }, + { + "epoch": 3.3506054131054133, + "grad_norm": 0.9080191850662231, + "learning_rate": 1.2776743890371312e-05, + "loss": 0.6749, + "step": 18820 + }, + { + "epoch": 3.3507834757834756, + "grad_norm": 0.969227135181427, + "learning_rate": 1.2769898702159867e-05, + "loss": 0.6469, + "step": 18821 + }, + { + "epoch": 3.3509615384615383, + "grad_norm": 0.9139270186424255, + "learning_rate": 1.2763055223041055e-05, + "loss": 0.7155, + "step": 18822 + }, + { + 
"epoch": 3.351139601139601, + "grad_norm": 0.7823638916015625, + "learning_rate": 1.2756213453148958e-05, + "loss": 0.5529, + "step": 18823 + }, + { + "epoch": 3.351317663817664, + "grad_norm": 0.9755151867866516, + "learning_rate": 1.2749373392617603e-05, + "loss": 0.7031, + "step": 18824 + }, + { + "epoch": 3.3514957264957266, + "grad_norm": 0.9338952302932739, + "learning_rate": 1.274253504158105e-05, + "loss": 0.771, + "step": 18825 + }, + { + "epoch": 3.3516737891737893, + "grad_norm": 0.9785382747650146, + "learning_rate": 1.2735698400173257e-05, + "loss": 0.8773, + "step": 18826 + }, + { + "epoch": 3.351851851851852, + "grad_norm": 1.0508654117584229, + "learning_rate": 1.2728863468528174e-05, + "loss": 0.9981, + "step": 18827 + }, + { + "epoch": 3.3520299145299144, + "grad_norm": 1.0220104455947876, + "learning_rate": 1.2722030246779737e-05, + "loss": 0.8708, + "step": 18828 + }, + { + "epoch": 3.352207977207977, + "grad_norm": 0.8605777025222778, + "learning_rate": 1.2715198735061794e-05, + "loss": 0.703, + "step": 18829 + }, + { + "epoch": 3.35238603988604, + "grad_norm": 1.0802946090698242, + "learning_rate": 1.2708368933508207e-05, + "loss": 0.758, + "step": 18830 + }, + { + "epoch": 3.3525641025641026, + "grad_norm": 0.9684296250343323, + "learning_rate": 1.2701540842252835e-05, + "loss": 0.6264, + "step": 18831 + }, + { + "epoch": 3.3527421652421654, + "grad_norm": 0.8688775897026062, + "learning_rate": 1.269471446142938e-05, + "loss": 0.7257, + "step": 18832 + }, + { + "epoch": 3.3529202279202277, + "grad_norm": 0.9325960278511047, + "learning_rate": 1.2687889791171681e-05, + "loss": 0.6823, + "step": 18833 + }, + { + "epoch": 3.3530982905982905, + "grad_norm": 0.9034467935562134, + "learning_rate": 1.2681066831613365e-05, + "loss": 0.7568, + "step": 18834 + }, + { + "epoch": 3.353276353276353, + "grad_norm": 0.8753020763397217, + "learning_rate": 1.2674245582888167e-05, + "loss": 0.6769, + "step": 18835 + }, + { + "epoch": 3.353454415954416, + 
"grad_norm": 1.071382761001587, + "learning_rate": 1.2667426045129727e-05, + "loss": 0.6557, + "step": 18836 + }, + { + "epoch": 3.3536324786324787, + "grad_norm": 0.9410926699638367, + "learning_rate": 1.266060821847166e-05, + "loss": 0.7636, + "step": 18837 + }, + { + "epoch": 3.3538105413105415, + "grad_norm": 0.9422865509986877, + "learning_rate": 1.2653792103047535e-05, + "loss": 0.7199, + "step": 18838 + }, + { + "epoch": 3.353988603988604, + "grad_norm": 0.9728864431381226, + "learning_rate": 1.2646977698990914e-05, + "loss": 0.7725, + "step": 18839 + }, + { + "epoch": 3.3541666666666665, + "grad_norm": 0.8972764611244202, + "learning_rate": 1.264016500643529e-05, + "loss": 0.8581, + "step": 18840 + }, + { + "epoch": 3.3543447293447293, + "grad_norm": 0.9951597452163696, + "learning_rate": 1.2633354025514188e-05, + "loss": 0.7828, + "step": 18841 + }, + { + "epoch": 3.354522792022792, + "grad_norm": 0.8521020412445068, + "learning_rate": 1.2626544756360991e-05, + "loss": 0.7818, + "step": 18842 + }, + { + "epoch": 3.3547008547008548, + "grad_norm": 0.9301114082336426, + "learning_rate": 1.261973719910916e-05, + "loss": 0.6921, + "step": 18843 + }, + { + "epoch": 3.3548789173789175, + "grad_norm": 1.0106427669525146, + "learning_rate": 1.2612931353892077e-05, + "loss": 0.6784, + "step": 18844 + }, + { + "epoch": 3.35505698005698, + "grad_norm": 0.9824902415275574, + "learning_rate": 1.2606127220843057e-05, + "loss": 0.8435, + "step": 18845 + }, + { + "epoch": 3.3552350427350426, + "grad_norm": 1.5545586347579956, + "learning_rate": 1.2599324800095435e-05, + "loss": 0.9226, + "step": 18846 + }, + { + "epoch": 3.3554131054131053, + "grad_norm": 1.0093287229537964, + "learning_rate": 1.2592524091782465e-05, + "loss": 0.7677, + "step": 18847 + }, + { + "epoch": 3.355591168091168, + "grad_norm": 0.8863753080368042, + "learning_rate": 1.2585725096037448e-05, + "loss": 0.7108, + "step": 18848 + }, + { + "epoch": 3.355769230769231, + "grad_norm": 0.880733072757721, + 
"learning_rate": 1.2578927812993524e-05, + "loss": 0.6485, + "step": 18849 + }, + { + "epoch": 3.3559472934472936, + "grad_norm": 0.8626177310943604, + "learning_rate": 1.2572132242783929e-05, + "loss": 0.6838, + "step": 18850 + }, + { + "epoch": 3.3561253561253563, + "grad_norm": 0.9541394114494324, + "learning_rate": 1.2565338385541792e-05, + "loss": 0.6914, + "step": 18851 + }, + { + "epoch": 3.3563034188034186, + "grad_norm": 0.933340311050415, + "learning_rate": 1.2558546241400215e-05, + "loss": 0.719, + "step": 18852 + }, + { + "epoch": 3.3564814814814814, + "grad_norm": 0.8834456205368042, + "learning_rate": 1.2551755810492272e-05, + "loss": 0.7702, + "step": 18853 + }, + { + "epoch": 3.356659544159544, + "grad_norm": 0.845119297504425, + "learning_rate": 1.2544967092951031e-05, + "loss": 0.6369, + "step": 18854 + }, + { + "epoch": 3.356837606837607, + "grad_norm": 0.9516612887382507, + "learning_rate": 1.2538180088909479e-05, + "loss": 0.6735, + "step": 18855 + }, + { + "epoch": 3.3570156695156697, + "grad_norm": 0.8660346865653992, + "learning_rate": 1.2531394798500606e-05, + "loss": 0.6033, + "step": 18856 + }, + { + "epoch": 3.357193732193732, + "grad_norm": 1.4815114736557007, + "learning_rate": 1.2524611221857318e-05, + "loss": 1.1411, + "step": 18857 + }, + { + "epoch": 3.3573717948717947, + "grad_norm": 0.9433855414390564, + "learning_rate": 1.251782935911262e-05, + "loss": 0.7562, + "step": 18858 + }, + { + "epoch": 3.3575498575498575, + "grad_norm": 0.8575382828712463, + "learning_rate": 1.2511049210399272e-05, + "loss": 0.7071, + "step": 18859 + }, + { + "epoch": 3.35772792022792, + "grad_norm": 1.0027903318405151, + "learning_rate": 1.250427077585019e-05, + "loss": 0.6572, + "step": 18860 + }, + { + "epoch": 3.357905982905983, + "grad_norm": 1.028286099433899, + "learning_rate": 1.2497494055598181e-05, + "loss": 0.7687, + "step": 18861 + }, + { + "epoch": 3.3580840455840457, + "grad_norm": 0.9715962409973145, + "learning_rate": 
1.249071904977599e-05, + "loss": 0.6205, + "step": 18862 + }, + { + "epoch": 3.3582621082621085, + "grad_norm": 1.0001580715179443, + "learning_rate": 1.248394575851638e-05, + "loss": 0.6317, + "step": 18863 + }, + { + "epoch": 3.3584401709401708, + "grad_norm": 0.8842644691467285, + "learning_rate": 1.2477174181952034e-05, + "loss": 0.6289, + "step": 18864 + }, + { + "epoch": 3.3586182336182335, + "grad_norm": 0.9407495260238647, + "learning_rate": 1.2470404320215667e-05, + "loss": 0.6886, + "step": 18865 + }, + { + "epoch": 3.3587962962962963, + "grad_norm": 0.9713712930679321, + "learning_rate": 1.2463636173439908e-05, + "loss": 0.6181, + "step": 18866 + }, + { + "epoch": 3.358974358974359, + "grad_norm": 1.070308804512024, + "learning_rate": 1.245686974175735e-05, + "loss": 0.9199, + "step": 18867 + }, + { + "epoch": 3.359152421652422, + "grad_norm": 0.9229024648666382, + "learning_rate": 1.2450105025300574e-05, + "loss": 0.8533, + "step": 18868 + }, + { + "epoch": 3.359330484330484, + "grad_norm": 0.8956949710845947, + "learning_rate": 1.244334202420212e-05, + "loss": 0.731, + "step": 18869 + }, + { + "epoch": 3.359508547008547, + "grad_norm": 0.8161178231239319, + "learning_rate": 1.2436580738594494e-05, + "loss": 0.592, + "step": 18870 + }, + { + "epoch": 3.3596866096866096, + "grad_norm": 0.8732914328575134, + "learning_rate": 1.2429821168610179e-05, + "loss": 0.7275, + "step": 18871 + }, + { + "epoch": 3.3598646723646723, + "grad_norm": 1.0021698474884033, + "learning_rate": 1.2423063314381578e-05, + "loss": 0.8745, + "step": 18872 + }, + { + "epoch": 3.360042735042735, + "grad_norm": 0.9891536235809326, + "learning_rate": 1.2416307176041176e-05, + "loss": 0.7716, + "step": 18873 + }, + { + "epoch": 3.360220797720798, + "grad_norm": 1.0069750547409058, + "learning_rate": 1.2409552753721254e-05, + "loss": 0.91, + "step": 18874 + }, + { + "epoch": 3.3603988603988606, + "grad_norm": 1.0723732709884644, + "learning_rate": 1.2402800047554208e-05, + "loss": 
0.9909, + "step": 18875 + }, + { + "epoch": 3.360576923076923, + "grad_norm": 0.9411680698394775, + "learning_rate": 1.239604905767232e-05, + "loss": 0.7607, + "step": 18876 + }, + { + "epoch": 3.3607549857549857, + "grad_norm": 1.0347760915756226, + "learning_rate": 1.2389299784207886e-05, + "loss": 0.8984, + "step": 18877 + }, + { + "epoch": 3.3609330484330484, + "grad_norm": 1.0429126024246216, + "learning_rate": 1.238255222729311e-05, + "loss": 0.8522, + "step": 18878 + }, + { + "epoch": 3.361111111111111, + "grad_norm": 0.938989520072937, + "learning_rate": 1.2375806387060229e-05, + "loss": 0.7692, + "step": 18879 + }, + { + "epoch": 3.361289173789174, + "grad_norm": 0.8867523074150085, + "learning_rate": 1.2369062263641384e-05, + "loss": 0.6354, + "step": 18880 + }, + { + "epoch": 3.361467236467236, + "grad_norm": 0.9158368110656738, + "learning_rate": 1.2362319857168735e-05, + "loss": 0.7264, + "step": 18881 + }, + { + "epoch": 3.361645299145299, + "grad_norm": 0.9424712061882019, + "learning_rate": 1.2355579167774355e-05, + "loss": 0.7719, + "step": 18882 + }, + { + "epoch": 3.3618233618233617, + "grad_norm": 1.001279592514038, + "learning_rate": 1.234884019559036e-05, + "loss": 0.8973, + "step": 18883 + }, + { + "epoch": 3.3620014245014245, + "grad_norm": 0.9505113959312439, + "learning_rate": 1.2342102940748756e-05, + "loss": 0.6743, + "step": 18884 + }, + { + "epoch": 3.3621794871794872, + "grad_norm": 1.0694880485534668, + "learning_rate": 1.2335367403381559e-05, + "loss": 0.7723, + "step": 18885 + }, + { + "epoch": 3.36235754985755, + "grad_norm": 0.8937603235244751, + "learning_rate": 1.232863358362072e-05, + "loss": 0.7262, + "step": 18886 + }, + { + "epoch": 3.3625356125356127, + "grad_norm": 0.9782434105873108, + "learning_rate": 1.2321901481598174e-05, + "loss": 0.8082, + "step": 18887 + }, + { + "epoch": 3.362713675213675, + "grad_norm": 1.0660442113876343, + "learning_rate": 1.2315171097445866e-05, + "loss": 0.8608, + "step": 18888 + }, + { + 
"epoch": 3.362891737891738, + "grad_norm": 0.9985655546188354, + "learning_rate": 1.2308442431295598e-05, + "loss": 0.8218, + "step": 18889 + }, + { + "epoch": 3.3630698005698005, + "grad_norm": 0.9028006196022034, + "learning_rate": 1.2301715483279275e-05, + "loss": 0.5749, + "step": 18890 + }, + { + "epoch": 3.3632478632478633, + "grad_norm": 0.9652066826820374, + "learning_rate": 1.2294990253528616e-05, + "loss": 0.7315, + "step": 18891 + }, + { + "epoch": 3.363425925925926, + "grad_norm": 0.8819321990013123, + "learning_rate": 1.2288266742175446e-05, + "loss": 0.6078, + "step": 18892 + }, + { + "epoch": 3.363603988603989, + "grad_norm": 0.8945509791374207, + "learning_rate": 1.2281544949351498e-05, + "loss": 0.756, + "step": 18893 + }, + { + "epoch": 3.363782051282051, + "grad_norm": 0.9183369278907776, + "learning_rate": 1.2274824875188452e-05, + "loss": 0.7717, + "step": 18894 + }, + { + "epoch": 3.363960113960114, + "grad_norm": 1.0190612077713013, + "learning_rate": 1.2268106519817969e-05, + "loss": 0.7269, + "step": 18895 + }, + { + "epoch": 3.3641381766381766, + "grad_norm": 0.8803049325942993, + "learning_rate": 1.2261389883371698e-05, + "loss": 0.6672, + "step": 18896 + }, + { + "epoch": 3.3643162393162394, + "grad_norm": 0.8707033395767212, + "learning_rate": 1.2254674965981217e-05, + "loss": 0.7234, + "step": 18897 + }, + { + "epoch": 3.364494301994302, + "grad_norm": 0.9297593235969543, + "learning_rate": 1.2247961767778138e-05, + "loss": 0.8101, + "step": 18898 + }, + { + "epoch": 3.364672364672365, + "grad_norm": 0.8042552471160889, + "learning_rate": 1.2241250288893925e-05, + "loss": 0.5784, + "step": 18899 + }, + { + "epoch": 3.364850427350427, + "grad_norm": 1.1243376731872559, + "learning_rate": 1.2234540529460126e-05, + "loss": 0.9558, + "step": 18900 + }, + { + "epoch": 3.36502849002849, + "grad_norm": 1.0456633567810059, + "learning_rate": 1.2227832489608194e-05, + "loss": 0.7718, + "step": 18901 + }, + { + "epoch": 3.3652065527065527, + 
"grad_norm": 0.9715479016304016, + "learning_rate": 1.2221126169469543e-05, + "loss": 0.8759, + "step": 18902 + }, + { + "epoch": 3.3653846153846154, + "grad_norm": 1.194477915763855, + "learning_rate": 1.2214421569175583e-05, + "loss": 0.8068, + "step": 18903 + }, + { + "epoch": 3.365562678062678, + "grad_norm": 0.8747355341911316, + "learning_rate": 1.2207718688857662e-05, + "loss": 0.5554, + "step": 18904 + }, + { + "epoch": 3.365740740740741, + "grad_norm": 0.9207034111022949, + "learning_rate": 1.2201017528647152e-05, + "loss": 1.0054, + "step": 18905 + }, + { + "epoch": 3.3659188034188032, + "grad_norm": 0.8625784516334534, + "learning_rate": 1.2194318088675283e-05, + "loss": 0.5582, + "step": 18906 + }, + { + "epoch": 3.366096866096866, + "grad_norm": 0.9648886322975159, + "learning_rate": 1.2187620369073361e-05, + "loss": 0.7067, + "step": 18907 + }, + { + "epoch": 3.3662749287749287, + "grad_norm": 0.8335103988647461, + "learning_rate": 1.2180924369972614e-05, + "loss": 0.7857, + "step": 18908 + }, + { + "epoch": 3.3664529914529915, + "grad_norm": 1.0891221761703491, + "learning_rate": 1.2174230091504214e-05, + "loss": 0.8173, + "step": 18909 + }, + { + "epoch": 3.3666310541310542, + "grad_norm": 0.9425504207611084, + "learning_rate": 1.2167537533799345e-05, + "loss": 0.6784, + "step": 18910 + }, + { + "epoch": 3.366809116809117, + "grad_norm": 0.8152710199356079, + "learning_rate": 1.2160846696989114e-05, + "loss": 0.8011, + "step": 18911 + }, + { + "epoch": 3.3669871794871793, + "grad_norm": 1.0064387321472168, + "learning_rate": 1.2154157581204607e-05, + "loss": 0.7862, + "step": 18912 + }, + { + "epoch": 3.367165242165242, + "grad_norm": 0.9653602838516235, + "learning_rate": 1.2147470186576936e-05, + "loss": 0.7156, + "step": 18913 + }, + { + "epoch": 3.367343304843305, + "grad_norm": 1.0241694450378418, + "learning_rate": 1.2140784513237046e-05, + "loss": 0.6863, + "step": 18914 + }, + { + "epoch": 3.3675213675213675, + "grad_norm": 
0.8816869258880615, + "learning_rate": 1.213410056131602e-05, + "loss": 0.6623, + "step": 18915 + }, + { + "epoch": 3.3676994301994303, + "grad_norm": 0.9364701509475708, + "learning_rate": 1.2127418330944718e-05, + "loss": 0.6271, + "step": 18916 + }, + { + "epoch": 3.367877492877493, + "grad_norm": 1.0039292573928833, + "learning_rate": 1.2120737822254147e-05, + "loss": 0.7901, + "step": 18917 + }, + { + "epoch": 3.3680555555555554, + "grad_norm": 1.2018239498138428, + "learning_rate": 1.2114059035375158e-05, + "loss": 0.8665, + "step": 18918 + }, + { + "epoch": 3.368233618233618, + "grad_norm": 0.9742374420166016, + "learning_rate": 1.2107381970438614e-05, + "loss": 0.7912, + "step": 18919 + }, + { + "epoch": 3.368411680911681, + "grad_norm": 0.8076589703559875, + "learning_rate": 1.210070662757533e-05, + "loss": 0.5589, + "step": 18920 + }, + { + "epoch": 3.3685897435897436, + "grad_norm": 0.8905361890792847, + "learning_rate": 1.2094033006916117e-05, + "loss": 0.6812, + "step": 18921 + }, + { + "epoch": 3.3687678062678064, + "grad_norm": 0.7845730781555176, + "learning_rate": 1.20873611085917e-05, + "loss": 0.4487, + "step": 18922 + }, + { + "epoch": 3.368945868945869, + "grad_norm": 0.9479206204414368, + "learning_rate": 1.2080690932732829e-05, + "loss": 0.7815, + "step": 18923 + }, + { + "epoch": 3.369123931623932, + "grad_norm": 1.0130283832550049, + "learning_rate": 1.2074022479470193e-05, + "loss": 0.644, + "step": 18924 + }, + { + "epoch": 3.369301994301994, + "grad_norm": 0.9899544715881348, + "learning_rate": 1.2067355748934429e-05, + "loss": 0.8751, + "step": 18925 + }, + { + "epoch": 3.369480056980057, + "grad_norm": 1.0933879613876343, + "learning_rate": 1.2060690741256165e-05, + "loss": 0.7778, + "step": 18926 + }, + { + "epoch": 3.3696581196581197, + "grad_norm": 0.9882935881614685, + "learning_rate": 1.2054027456565997e-05, + "loss": 0.6674, + "step": 18927 + }, + { + "epoch": 3.3698361823361824, + "grad_norm": 0.9455338716506958, + 
"learning_rate": 1.2047365894994455e-05, + "loss": 0.6675, + "step": 18928 + }, + { + "epoch": 3.370014245014245, + "grad_norm": 0.9364031553268433, + "learning_rate": 1.2040706056672069e-05, + "loss": 0.9199, + "step": 18929 + }, + { + "epoch": 3.3701923076923075, + "grad_norm": 1.004025936126709, + "learning_rate": 1.2034047941729354e-05, + "loss": 0.8102, + "step": 18930 + }, + { + "epoch": 3.3703703703703702, + "grad_norm": 0.9347146153450012, + "learning_rate": 1.2027391550296707e-05, + "loss": 0.7639, + "step": 18931 + }, + { + "epoch": 3.370548433048433, + "grad_norm": 0.9768404960632324, + "learning_rate": 1.202073688250458e-05, + "loss": 0.7367, + "step": 18932 + }, + { + "epoch": 3.3707264957264957, + "grad_norm": 1.0199530124664307, + "learning_rate": 1.2014083938483367e-05, + "loss": 0.7956, + "step": 18933 + }, + { + "epoch": 3.3709045584045585, + "grad_norm": 0.8989109396934509, + "learning_rate": 1.2007432718363398e-05, + "loss": 0.6288, + "step": 18934 + }, + { + "epoch": 3.3710826210826212, + "grad_norm": 1.0402858257293701, + "learning_rate": 1.2000783222275003e-05, + "loss": 0.8463, + "step": 18935 + }, + { + "epoch": 3.371260683760684, + "grad_norm": 0.8559962511062622, + "learning_rate": 1.1994135450348453e-05, + "loss": 0.9195, + "step": 18936 + }, + { + "epoch": 3.3714387464387463, + "grad_norm": 0.8805050849914551, + "learning_rate": 1.1987489402713981e-05, + "loss": 0.7653, + "step": 18937 + }, + { + "epoch": 3.371616809116809, + "grad_norm": 0.8654996752738953, + "learning_rate": 1.1980845079501867e-05, + "loss": 0.7037, + "step": 18938 + }, + { + "epoch": 3.371794871794872, + "grad_norm": 0.8487333059310913, + "learning_rate": 1.19742024808422e-05, + "loss": 0.823, + "step": 18939 + }, + { + "epoch": 3.3719729344729346, + "grad_norm": 0.9701769948005676, + "learning_rate": 1.196756160686523e-05, + "loss": 0.64, + "step": 18940 + }, + { + "epoch": 3.3721509971509973, + "grad_norm": 1.0142244100570679, + "learning_rate": 
1.1960922457700975e-05, + "loss": 1.0056, + "step": 18941 + }, + { + "epoch": 3.3723290598290596, + "grad_norm": 0.9956945776939392, + "learning_rate": 1.1954285033479573e-05, + "loss": 0.6369, + "step": 18942 + }, + { + "epoch": 3.3725071225071224, + "grad_norm": 0.9598103165626526, + "learning_rate": 1.1947649334331068e-05, + "loss": 0.7008, + "step": 18943 + }, + { + "epoch": 3.372685185185185, + "grad_norm": 0.9213023781776428, + "learning_rate": 1.1941015360385432e-05, + "loss": 0.5899, + "step": 18944 + }, + { + "epoch": 3.372863247863248, + "grad_norm": 0.9229640960693359, + "learning_rate": 1.1934383111772718e-05, + "loss": 0.7408, + "step": 18945 + }, + { + "epoch": 3.3730413105413106, + "grad_norm": 1.0240286588668823, + "learning_rate": 1.1927752588622787e-05, + "loss": 0.6822, + "step": 18946 + }, + { + "epoch": 3.3732193732193734, + "grad_norm": 0.9154404401779175, + "learning_rate": 1.1921123791065602e-05, + "loss": 0.7995, + "step": 18947 + }, + { + "epoch": 3.373397435897436, + "grad_norm": 1.0823782682418823, + "learning_rate": 1.1914496719231039e-05, + "loss": 0.8212, + "step": 18948 + }, + { + "epoch": 3.3735754985754984, + "grad_norm": 1.005828857421875, + "learning_rate": 1.1907871373248925e-05, + "loss": 0.7916, + "step": 18949 + }, + { + "epoch": 3.373753561253561, + "grad_norm": 0.9120662212371826, + "learning_rate": 1.1901247753249079e-05, + "loss": 0.635, + "step": 18950 + }, + { + "epoch": 3.373931623931624, + "grad_norm": 1.006744146347046, + "learning_rate": 1.1894625859361274e-05, + "loss": 0.824, + "step": 18951 + }, + { + "epoch": 3.3741096866096867, + "grad_norm": 0.9716039299964905, + "learning_rate": 1.1888005691715254e-05, + "loss": 0.7036, + "step": 18952 + }, + { + "epoch": 3.3742877492877494, + "grad_norm": 1.0244479179382324, + "learning_rate": 1.1881387250440723e-05, + "loss": 0.7777, + "step": 18953 + }, + { + "epoch": 3.3744658119658117, + "grad_norm": 0.9912460446357727, + "learning_rate": 1.1874770535667334e-05, + 
"loss": 0.7136, + "step": 18954 + }, + { + "epoch": 3.3746438746438745, + "grad_norm": 1.1135791540145874, + "learning_rate": 1.1868155547524795e-05, + "loss": 0.846, + "step": 18955 + }, + { + "epoch": 3.3748219373219372, + "grad_norm": 0.8404874801635742, + "learning_rate": 1.1861542286142635e-05, + "loss": 0.6614, + "step": 18956 + }, + { + "epoch": 3.375, + "grad_norm": 0.8777139186859131, + "learning_rate": 1.1854930751650472e-05, + "loss": 0.614, + "step": 18957 + }, + { + "epoch": 3.3751780626780628, + "grad_norm": 1.0814663171768188, + "learning_rate": 1.1848320944177837e-05, + "loss": 0.8726, + "step": 18958 + }, + { + "epoch": 3.3753561253561255, + "grad_norm": 1.140494704246521, + "learning_rate": 1.1841712863854249e-05, + "loss": 0.7207, + "step": 18959 + }, + { + "epoch": 3.3755341880341883, + "grad_norm": 1.1019811630249023, + "learning_rate": 1.1835106510809146e-05, + "loss": 0.7376, + "step": 18960 + }, + { + "epoch": 3.3757122507122506, + "grad_norm": 0.8897548913955688, + "learning_rate": 1.1828501885171995e-05, + "loss": 0.6464, + "step": 18961 + }, + { + "epoch": 3.3758903133903133, + "grad_norm": 0.9695185422897339, + "learning_rate": 1.1821898987072188e-05, + "loss": 0.7566, + "step": 18962 + }, + { + "epoch": 3.376068376068376, + "grad_norm": 1.022469162940979, + "learning_rate": 1.1815297816639071e-05, + "loss": 0.8839, + "step": 18963 + }, + { + "epoch": 3.376246438746439, + "grad_norm": 0.9937511086463928, + "learning_rate": 1.1808698374002025e-05, + "loss": 0.8049, + "step": 18964 + }, + { + "epoch": 3.3764245014245016, + "grad_norm": 1.1339633464813232, + "learning_rate": 1.1802100659290337e-05, + "loss": 0.8131, + "step": 18965 + }, + { + "epoch": 3.376602564102564, + "grad_norm": 0.9775537848472595, + "learning_rate": 1.1795504672633273e-05, + "loss": 0.7819, + "step": 18966 + }, + { + "epoch": 3.3767806267806266, + "grad_norm": 1.0051268339157104, + "learning_rate": 1.1788910414160059e-05, + "loss": 0.658, + "step": 18967 + }, + { + 
"epoch": 3.3769586894586894, + "grad_norm": 0.9491516947746277, + "learning_rate": 1.1782317883999915e-05, + "loss": 0.9192, + "step": 18968 + }, + { + "epoch": 3.377136752136752, + "grad_norm": 0.9227840900421143, + "learning_rate": 1.1775727082281962e-05, + "loss": 0.7612, + "step": 18969 + }, + { + "epoch": 3.377314814814815, + "grad_norm": 1.0352122783660889, + "learning_rate": 1.1769138009135416e-05, + "loss": 0.8781, + "step": 18970 + }, + { + "epoch": 3.3774928774928776, + "grad_norm": 1.0505750179290771, + "learning_rate": 1.1762550664689286e-05, + "loss": 0.8723, + "step": 18971 + }, + { + "epoch": 3.3776709401709404, + "grad_norm": 0.9471299648284912, + "learning_rate": 1.1755965049072693e-05, + "loss": 0.8168, + "step": 18972 + }, + { + "epoch": 3.3778490028490027, + "grad_norm": 1.0091118812561035, + "learning_rate": 1.1749381162414652e-05, + "loss": 0.6761, + "step": 18973 + }, + { + "epoch": 3.3780270655270654, + "grad_norm": 1.0159013271331787, + "learning_rate": 1.1742799004844174e-05, + "loss": 0.6337, + "step": 18974 + }, + { + "epoch": 3.378205128205128, + "grad_norm": 1.0220392942428589, + "learning_rate": 1.17362185764902e-05, + "loss": 0.9644, + "step": 18975 + }, + { + "epoch": 3.378383190883191, + "grad_norm": 0.9097932577133179, + "learning_rate": 1.172963987748168e-05, + "loss": 0.815, + "step": 18976 + }, + { + "epoch": 3.3785612535612537, + "grad_norm": 0.9885165691375732, + "learning_rate": 1.172306290794749e-05, + "loss": 0.6194, + "step": 18977 + }, + { + "epoch": 3.378739316239316, + "grad_norm": 1.1714304685592651, + "learning_rate": 1.1716487668016506e-05, + "loss": 0.863, + "step": 18978 + }, + { + "epoch": 3.3789173789173788, + "grad_norm": 0.9001526832580566, + "learning_rate": 1.1709914157817537e-05, + "loss": 0.6687, + "step": 18979 + }, + { + "epoch": 3.3790954415954415, + "grad_norm": 0.8851606249809265, + "learning_rate": 1.1703342377479432e-05, + "loss": 0.6135, + "step": 18980 + }, + { + "epoch": 3.3792735042735043, + 
"grad_norm": 1.1299785375595093, + "learning_rate": 1.1696772327130878e-05, + "loss": 0.7939, + "step": 18981 + }, + { + "epoch": 3.379451566951567, + "grad_norm": 0.8739166855812073, + "learning_rate": 1.1690204006900652e-05, + "loss": 0.6802, + "step": 18982 + }, + { + "epoch": 3.3796296296296298, + "grad_norm": 1.2778574228286743, + "learning_rate": 1.1683637416917425e-05, + "loss": 0.7556, + "step": 18983 + }, + { + "epoch": 3.3798076923076925, + "grad_norm": 1.0917866230010986, + "learning_rate": 1.1677072557309843e-05, + "loss": 0.8309, + "step": 18984 + }, + { + "epoch": 3.379985754985755, + "grad_norm": 0.9844012260437012, + "learning_rate": 1.16705094282066e-05, + "loss": 0.8137, + "step": 18985 + }, + { + "epoch": 3.3801638176638176, + "grad_norm": 0.8838617205619812, + "learning_rate": 1.1663948029736182e-05, + "loss": 0.7654, + "step": 18986 + }, + { + "epoch": 3.3803418803418803, + "grad_norm": 0.9886083602905273, + "learning_rate": 1.1657388362027255e-05, + "loss": 0.8853, + "step": 18987 + }, + { + "epoch": 3.380519943019943, + "grad_norm": 0.9307940602302551, + "learning_rate": 1.1650830425208236e-05, + "loss": 0.8353, + "step": 18988 + }, + { + "epoch": 3.380698005698006, + "grad_norm": 0.9275619387626648, + "learning_rate": 1.1644274219407692e-05, + "loss": 0.5412, + "step": 18989 + }, + { + "epoch": 3.380876068376068, + "grad_norm": 0.8723110556602478, + "learning_rate": 1.1637719744754038e-05, + "loss": 0.7945, + "step": 18990 + }, + { + "epoch": 3.381054131054131, + "grad_norm": 0.9911721348762512, + "learning_rate": 1.163116700137572e-05, + "loss": 0.9201, + "step": 18991 + }, + { + "epoch": 3.3812321937321936, + "grad_norm": 0.9290093183517456, + "learning_rate": 1.1624615989401112e-05, + "loss": 0.8472, + "step": 18992 + }, + { + "epoch": 3.3814102564102564, + "grad_norm": 1.068137526512146, + "learning_rate": 1.1618066708958574e-05, + "loss": 0.7443, + "step": 18993 + }, + { + "epoch": 3.381588319088319, + "grad_norm": 1.0363795757293701, + 
"learning_rate": 1.1611519160176398e-05, + "loss": 0.8599, + "step": 18994 + }, + { + "epoch": 3.381766381766382, + "grad_norm": 0.8963867425918579, + "learning_rate": 1.1604973343182935e-05, + "loss": 0.8149, + "step": 18995 + }, + { + "epoch": 3.3819444444444446, + "grad_norm": 0.9797514081001282, + "learning_rate": 1.1598429258106347e-05, + "loss": 0.9245, + "step": 18996 + }, + { + "epoch": 3.382122507122507, + "grad_norm": 0.9539228677749634, + "learning_rate": 1.1591886905074945e-05, + "loss": 0.7468, + "step": 18997 + }, + { + "epoch": 3.3823005698005697, + "grad_norm": 0.9980157613754272, + "learning_rate": 1.1585346284216825e-05, + "loss": 0.9399, + "step": 18998 + }, + { + "epoch": 3.3824786324786325, + "grad_norm": 1.0480377674102783, + "learning_rate": 1.1578807395660207e-05, + "loss": 0.9205, + "step": 18999 + }, + { + "epoch": 3.382656695156695, + "grad_norm": 1.0390716791152954, + "learning_rate": 1.1572270239533168e-05, + "loss": 0.8119, + "step": 19000 + }, + { + "epoch": 3.382834757834758, + "grad_norm": 0.9198310971260071, + "learning_rate": 1.1565734815963814e-05, + "loss": 0.6367, + "step": 19001 + }, + { + "epoch": 3.3830128205128207, + "grad_norm": 0.8358418941497803, + "learning_rate": 1.1559201125080167e-05, + "loss": 0.6825, + "step": 19002 + }, + { + "epoch": 3.383190883190883, + "grad_norm": 0.9615653157234192, + "learning_rate": 1.1552669167010244e-05, + "loss": 0.7571, + "step": 19003 + }, + { + "epoch": 3.3833689458689458, + "grad_norm": 1.0061991214752197, + "learning_rate": 1.1546138941882045e-05, + "loss": 0.8323, + "step": 19004 + }, + { + "epoch": 3.3835470085470085, + "grad_norm": 0.9179396629333496, + "learning_rate": 1.1539610449823512e-05, + "loss": 0.6407, + "step": 19005 + }, + { + "epoch": 3.3837250712250713, + "grad_norm": 1.0422332286834717, + "learning_rate": 1.1533083690962554e-05, + "loss": 0.9024, + "step": 19006 + }, + { + "epoch": 3.383903133903134, + "grad_norm": 0.9112447500228882, + "learning_rate": 
1.1526558665427045e-05, + "loss": 0.7383, + "step": 19007 + }, + { + "epoch": 3.3840811965811968, + "grad_norm": 0.9500386714935303, + "learning_rate": 1.1520035373344829e-05, + "loss": 0.6305, + "step": 19008 + }, + { + "epoch": 3.384259259259259, + "grad_norm": 1.0099005699157715, + "learning_rate": 1.1513513814843701e-05, + "loss": 0.8355, + "step": 19009 + }, + { + "epoch": 3.384437321937322, + "grad_norm": 1.0299774408340454, + "learning_rate": 1.1506993990051495e-05, + "loss": 0.747, + "step": 19010 + }, + { + "epoch": 3.3846153846153846, + "grad_norm": 0.9437386393547058, + "learning_rate": 1.1500475899095864e-05, + "loss": 0.4786, + "step": 19011 + }, + { + "epoch": 3.3847934472934473, + "grad_norm": 0.8705305457115173, + "learning_rate": 1.1493959542104615e-05, + "loss": 0.8052, + "step": 19012 + }, + { + "epoch": 3.38497150997151, + "grad_norm": 1.0724691152572632, + "learning_rate": 1.1487444919205336e-05, + "loss": 0.8645, + "step": 19013 + }, + { + "epoch": 3.385149572649573, + "grad_norm": 0.8759792447090149, + "learning_rate": 1.1480932030525725e-05, + "loss": 0.5631, + "step": 19014 + }, + { + "epoch": 3.385327635327635, + "grad_norm": 0.8842191100120544, + "learning_rate": 1.1474420876193359e-05, + "loss": 0.528, + "step": 19015 + }, + { + "epoch": 3.385505698005698, + "grad_norm": 0.8085874319076538, + "learning_rate": 1.1467911456335833e-05, + "loss": 0.6945, + "step": 19016 + }, + { + "epoch": 3.3856837606837606, + "grad_norm": 0.9087811708450317, + "learning_rate": 1.1461403771080658e-05, + "loss": 0.7253, + "step": 19017 + }, + { + "epoch": 3.3858618233618234, + "grad_norm": 0.9133577346801758, + "learning_rate": 1.1454897820555366e-05, + "loss": 0.7641, + "step": 19018 + }, + { + "epoch": 3.386039886039886, + "grad_norm": 1.1190037727355957, + "learning_rate": 1.1448393604887386e-05, + "loss": 1.0035, + "step": 19019 + }, + { + "epoch": 3.386217948717949, + "grad_norm": 0.9388802647590637, + "learning_rate": 1.1441891124204229e-05, + "loss": 
0.7469, + "step": 19020 + }, + { + "epoch": 3.386396011396011, + "grad_norm": 1.0015909671783447, + "learning_rate": 1.1435390378633216e-05, + "loss": 0.6523, + "step": 19021 + }, + { + "epoch": 3.386574074074074, + "grad_norm": 0.9067725539207458, + "learning_rate": 1.1428891368301764e-05, + "loss": 0.6453, + "step": 19022 + }, + { + "epoch": 3.3867521367521367, + "grad_norm": 1.1131070852279663, + "learning_rate": 1.1422394093337196e-05, + "loss": 0.9134, + "step": 19023 + }, + { + "epoch": 3.3869301994301995, + "grad_norm": 0.922660231590271, + "learning_rate": 1.141589855386681e-05, + "loss": 0.8114, + "step": 19024 + }, + { + "epoch": 3.387108262108262, + "grad_norm": 0.9011611342430115, + "learning_rate": 1.1409404750017872e-05, + "loss": 0.6133, + "step": 19025 + }, + { + "epoch": 3.387286324786325, + "grad_norm": 1.0388410091400146, + "learning_rate": 1.140291268191761e-05, + "loss": 0.8391, + "step": 19026 + }, + { + "epoch": 3.3874643874643873, + "grad_norm": 0.971215009689331, + "learning_rate": 1.139642234969326e-05, + "loss": 0.8261, + "step": 19027 + }, + { + "epoch": 3.38764245014245, + "grad_norm": 0.8922646641731262, + "learning_rate": 1.1389933753471915e-05, + "loss": 0.6263, + "step": 19028 + }, + { + "epoch": 3.3878205128205128, + "grad_norm": 0.8670883178710938, + "learning_rate": 1.1383446893380767e-05, + "loss": 0.8308, + "step": 19029 + }, + { + "epoch": 3.3879985754985755, + "grad_norm": 0.8676888942718506, + "learning_rate": 1.137696176954689e-05, + "loss": 0.6909, + "step": 19030 + }, + { + "epoch": 3.3881766381766383, + "grad_norm": 1.0416065454483032, + "learning_rate": 1.1370478382097361e-05, + "loss": 0.7028, + "step": 19031 + }, + { + "epoch": 3.388354700854701, + "grad_norm": 0.8439168930053711, + "learning_rate": 1.1363996731159188e-05, + "loss": 0.6838, + "step": 19032 + }, + { + "epoch": 3.388532763532764, + "grad_norm": 1.0615967512130737, + "learning_rate": 1.135751681685937e-05, + "loss": 0.8698, + "step": 19033 + }, + { + 
"epoch": 3.388710826210826, + "grad_norm": 1.0846972465515137, + "learning_rate": 1.1351038639324874e-05, + "loss": 0.8899, + "step": 19034 + }, + { + "epoch": 3.388888888888889, + "grad_norm": 0.8791643381118774, + "learning_rate": 1.1344562198682617e-05, + "loss": 0.7181, + "step": 19035 + }, + { + "epoch": 3.3890669515669516, + "grad_norm": 0.904293954372406, + "learning_rate": 1.133808749505949e-05, + "loss": 0.6749, + "step": 19036 + }, + { + "epoch": 3.3892450142450143, + "grad_norm": 0.877621591091156, + "learning_rate": 1.1331614528582391e-05, + "loss": 0.771, + "step": 19037 + }, + { + "epoch": 3.389423076923077, + "grad_norm": 0.8001459240913391, + "learning_rate": 1.1325143299378083e-05, + "loss": 0.5418, + "step": 19038 + }, + { + "epoch": 3.3896011396011394, + "grad_norm": 0.8537688851356506, + "learning_rate": 1.1318673807573399e-05, + "loss": 0.72, + "step": 19039 + }, + { + "epoch": 3.389779202279202, + "grad_norm": 1.04570734500885, + "learning_rate": 1.1312206053295082e-05, + "loss": 0.7882, + "step": 19040 + }, + { + "epoch": 3.389957264957265, + "grad_norm": 0.8674250245094299, + "learning_rate": 1.1305740036669832e-05, + "loss": 0.7391, + "step": 19041 + }, + { + "epoch": 3.3901353276353277, + "grad_norm": 0.9948473572731018, + "learning_rate": 1.1299275757824402e-05, + "loss": 0.6436, + "step": 19042 + }, + { + "epoch": 3.3903133903133904, + "grad_norm": 0.8702938556671143, + "learning_rate": 1.1292813216885366e-05, + "loss": 0.7127, + "step": 19043 + }, + { + "epoch": 3.390491452991453, + "grad_norm": 0.9511193037033081, + "learning_rate": 1.1286352413979396e-05, + "loss": 0.5849, + "step": 19044 + }, + { + "epoch": 3.390669515669516, + "grad_norm": 1.037414312362671, + "learning_rate": 1.1279893349233073e-05, + "loss": 0.6974, + "step": 19045 + }, + { + "epoch": 3.390847578347578, + "grad_norm": 0.9115296602249146, + "learning_rate": 1.1273436022772931e-05, + "loss": 0.8002, + "step": 19046 + }, + { + "epoch": 3.391025641025641, + 
"grad_norm": 0.9375928044319153, + "learning_rate": 1.1266980434725506e-05, + "loss": 0.6869, + "step": 19047 + }, + { + "epoch": 3.3912037037037037, + "grad_norm": 0.9377243518829346, + "learning_rate": 1.1260526585217257e-05, + "loss": 0.6117, + "step": 19048 + }, + { + "epoch": 3.3913817663817665, + "grad_norm": 0.9592469334602356, + "learning_rate": 1.1254074474374665e-05, + "loss": 0.7742, + "step": 19049 + }, + { + "epoch": 3.3915598290598292, + "grad_norm": 1.103419542312622, + "learning_rate": 1.1247624102324127e-05, + "loss": 0.8691, + "step": 19050 + }, + { + "epoch": 3.3917378917378915, + "grad_norm": 0.8406701683998108, + "learning_rate": 1.1241175469192e-05, + "loss": 0.6666, + "step": 19051 + }, + { + "epoch": 3.3919159544159543, + "grad_norm": 0.9886174201965332, + "learning_rate": 1.1234728575104703e-05, + "loss": 0.6932, + "step": 19052 + }, + { + "epoch": 3.392094017094017, + "grad_norm": 0.9601133465766907, + "learning_rate": 1.1228283420188468e-05, + "loss": 0.696, + "step": 19053 + }, + { + "epoch": 3.39227207977208, + "grad_norm": 1.1034084558486938, + "learning_rate": 1.1221840004569629e-05, + "loss": 0.7785, + "step": 19054 + }, + { + "epoch": 3.3924501424501425, + "grad_norm": 0.930463969707489, + "learning_rate": 1.1215398328374415e-05, + "loss": 0.7812, + "step": 19055 + }, + { + "epoch": 3.3926282051282053, + "grad_norm": 1.054152488708496, + "learning_rate": 1.120895839172904e-05, + "loss": 0.7522, + "step": 19056 + }, + { + "epoch": 3.392806267806268, + "grad_norm": 1.0049049854278564, + "learning_rate": 1.1202520194759681e-05, + "loss": 0.6845, + "step": 19057 + }, + { + "epoch": 3.3929843304843303, + "grad_norm": 0.9241994619369507, + "learning_rate": 1.1196083737592467e-05, + "loss": 0.9938, + "step": 19058 + }, + { + "epoch": 3.393162393162393, + "grad_norm": 0.988713800907135, + "learning_rate": 1.118964902035352e-05, + "loss": 0.7819, + "step": 19059 + }, + { + "epoch": 3.393340455840456, + "grad_norm": 0.9430522918701172, + 
"learning_rate": 1.1183216043168921e-05, + "loss": 0.6091, + "step": 19060 + }, + { + "epoch": 3.3935185185185186, + "grad_norm": 1.0958164930343628, + "learning_rate": 1.1176784806164676e-05, + "loss": 0.6572, + "step": 19061 + }, + { + "epoch": 3.3936965811965814, + "grad_norm": 0.9768433570861816, + "learning_rate": 1.117035530946684e-05, + "loss": 0.9099, + "step": 19062 + }, + { + "epoch": 3.3938746438746437, + "grad_norm": 0.9464746117591858, + "learning_rate": 1.116392755320137e-05, + "loss": 0.7463, + "step": 19063 + }, + { + "epoch": 3.3940527065527064, + "grad_norm": 1.061977505683899, + "learning_rate": 1.1157501537494197e-05, + "loss": 0.9118, + "step": 19064 + }, + { + "epoch": 3.394230769230769, + "grad_norm": 0.9586532711982727, + "learning_rate": 1.1151077262471222e-05, + "loss": 0.8105, + "step": 19065 + }, + { + "epoch": 3.394408831908832, + "grad_norm": 0.9120199680328369, + "learning_rate": 1.1144654728258297e-05, + "loss": 0.5844, + "step": 19066 + }, + { + "epoch": 3.3945868945868947, + "grad_norm": 1.0607168674468994, + "learning_rate": 1.1138233934981323e-05, + "loss": 0.7454, + "step": 19067 + }, + { + "epoch": 3.3947649572649574, + "grad_norm": 1.0327951908111572, + "learning_rate": 1.1131814882766034e-05, + "loss": 0.7933, + "step": 19068 + }, + { + "epoch": 3.39494301994302, + "grad_norm": 0.9982885122299194, + "learning_rate": 1.112539757173825e-05, + "loss": 0.7294, + "step": 19069 + }, + { + "epoch": 3.3951210826210825, + "grad_norm": 0.8472577929496765, + "learning_rate": 1.1118982002023649e-05, + "loss": 0.6405, + "step": 19070 + }, + { + "epoch": 3.3952991452991452, + "grad_norm": 0.8824619650840759, + "learning_rate": 1.1112568173747972e-05, + "loss": 0.8163, + "step": 19071 + }, + { + "epoch": 3.395477207977208, + "grad_norm": 1.1966749429702759, + "learning_rate": 1.1106156087036879e-05, + "loss": 1.0977, + "step": 19072 + }, + { + "epoch": 3.3956552706552707, + "grad_norm": 0.9600608944892883, + "learning_rate": 
1.1099745742016e-05, + "loss": 0.7247, + "step": 19073 + }, + { + "epoch": 3.3958333333333335, + "grad_norm": 0.9293968677520752, + "learning_rate": 1.1093337138810921e-05, + "loss": 0.6889, + "step": 19074 + }, + { + "epoch": 3.396011396011396, + "grad_norm": 1.020552635192871, + "learning_rate": 1.1086930277547226e-05, + "loss": 0.7282, + "step": 19075 + }, + { + "epoch": 3.3961894586894585, + "grad_norm": 0.8889341950416565, + "learning_rate": 1.1080525158350397e-05, + "loss": 0.6518, + "step": 19076 + }, + { + "epoch": 3.3963675213675213, + "grad_norm": 0.9952123761177063, + "learning_rate": 1.1074121781346014e-05, + "loss": 0.8552, + "step": 19077 + }, + { + "epoch": 3.396545584045584, + "grad_norm": 0.8794593811035156, + "learning_rate": 1.1067720146659432e-05, + "loss": 0.8402, + "step": 19078 + }, + { + "epoch": 3.396723646723647, + "grad_norm": 1.0979092121124268, + "learning_rate": 1.1061320254416175e-05, + "loss": 0.8358, + "step": 19079 + }, + { + "epoch": 3.3969017094017095, + "grad_norm": 0.9097040295600891, + "learning_rate": 1.1054922104741572e-05, + "loss": 0.7482, + "step": 19080 + }, + { + "epoch": 3.3970797720797723, + "grad_norm": 0.9845696687698364, + "learning_rate": 1.1048525697760993e-05, + "loss": 0.8369, + "step": 19081 + }, + { + "epoch": 3.3972578347578346, + "grad_norm": 0.7637016773223877, + "learning_rate": 1.1042131033599812e-05, + "loss": 0.541, + "step": 19082 + }, + { + "epoch": 3.3974358974358974, + "grad_norm": 0.9949057698249817, + "learning_rate": 1.1035738112383243e-05, + "loss": 0.8823, + "step": 19083 + }, + { + "epoch": 3.39761396011396, + "grad_norm": 0.9214608669281006, + "learning_rate": 1.1029346934236618e-05, + "loss": 0.7869, + "step": 19084 + }, + { + "epoch": 3.397792022792023, + "grad_norm": 1.1592236757278442, + "learning_rate": 1.102295749928507e-05, + "loss": 0.883, + "step": 19085 + }, + { + "epoch": 3.3979700854700856, + "grad_norm": 1.0953752994537354, + "learning_rate": 1.1016569807653865e-05, + "loss": 
0.8813, + "step": 19086 + }, + { + "epoch": 3.398148148148148, + "grad_norm": 0.9052318334579468, + "learning_rate": 1.1010183859468126e-05, + "loss": 0.6289, + "step": 19087 + }, + { + "epoch": 3.3983262108262107, + "grad_norm": 0.9558716416358948, + "learning_rate": 1.1003799654852976e-05, + "loss": 0.87, + "step": 19088 + }, + { + "epoch": 3.3985042735042734, + "grad_norm": 0.8846414089202881, + "learning_rate": 1.099741719393349e-05, + "loss": 0.7292, + "step": 19089 + }, + { + "epoch": 3.398682336182336, + "grad_norm": 1.1074748039245605, + "learning_rate": 1.0991036476834737e-05, + "loss": 0.8082, + "step": 19090 + }, + { + "epoch": 3.398860398860399, + "grad_norm": 0.924491286277771, + "learning_rate": 1.0984657503681695e-05, + "loss": 0.7887, + "step": 19091 + }, + { + "epoch": 3.3990384615384617, + "grad_norm": 0.9129840135574341, + "learning_rate": 1.0978280274599418e-05, + "loss": 0.7463, + "step": 19092 + }, + { + "epoch": 3.3992165242165244, + "grad_norm": 1.0049107074737549, + "learning_rate": 1.0971904789712773e-05, + "loss": 0.7386, + "step": 19093 + }, + { + "epoch": 3.3993945868945867, + "grad_norm": 0.9263444542884827, + "learning_rate": 1.096553104914676e-05, + "loss": 0.8414, + "step": 19094 + }, + { + "epoch": 3.3995726495726495, + "grad_norm": 1.01347017288208, + "learning_rate": 1.095915905302617e-05, + "loss": 0.6919, + "step": 19095 + }, + { + "epoch": 3.3997507122507122, + "grad_norm": 0.9228973388671875, + "learning_rate": 1.095278880147591e-05, + "loss": 0.6941, + "step": 19096 + }, + { + "epoch": 3.399928774928775, + "grad_norm": 0.9571877717971802, + "learning_rate": 1.0946420294620774e-05, + "loss": 0.6876, + "step": 19097 + }, + { + "epoch": 3.4001068376068377, + "grad_norm": 1.3483099937438965, + "learning_rate": 1.0940053532585537e-05, + "loss": 0.9632, + "step": 19098 + }, + { + "epoch": 3.4002849002849, + "grad_norm": 0.920470654964447, + "learning_rate": 1.0933688515494945e-05, + "loss": 0.8084, + "step": 19099 + }, + { + 
"epoch": 3.400462962962963, + "grad_norm": 1.0069130659103394, + "learning_rate": 1.0927325243473718e-05, + "loss": 0.6561, + "step": 19100 + }, + { + "epoch": 3.4006410256410255, + "grad_norm": 0.9278994202613831, + "learning_rate": 1.0920963716646492e-05, + "loss": 0.6935, + "step": 19101 + }, + { + "epoch": 3.4008190883190883, + "grad_norm": 0.9868398308753967, + "learning_rate": 1.0914603935137957e-05, + "loss": 0.7568, + "step": 19102 + }, + { + "epoch": 3.400997150997151, + "grad_norm": 0.9027713537216187, + "learning_rate": 1.09082458990727e-05, + "loss": 0.7063, + "step": 19103 + }, + { + "epoch": 3.401175213675214, + "grad_norm": 0.8853753209114075, + "learning_rate": 1.0901889608575288e-05, + "loss": 0.7147, + "step": 19104 + }, + { + "epoch": 3.4013532763532766, + "grad_norm": 0.9795311093330383, + "learning_rate": 1.0895535063770268e-05, + "loss": 0.741, + "step": 19105 + }, + { + "epoch": 3.401531339031339, + "grad_norm": 0.8565807938575745, + "learning_rate": 1.0889182264782138e-05, + "loss": 0.6649, + "step": 19106 + }, + { + "epoch": 3.4017094017094016, + "grad_norm": 0.8639920353889465, + "learning_rate": 1.0882831211735367e-05, + "loss": 0.6754, + "step": 19107 + }, + { + "epoch": 3.4018874643874644, + "grad_norm": 0.8701103329658508, + "learning_rate": 1.0876481904754376e-05, + "loss": 0.7004, + "step": 19108 + }, + { + "epoch": 3.402065527065527, + "grad_norm": 1.0115605592727661, + "learning_rate": 1.0870134343963633e-05, + "loss": 0.8188, + "step": 19109 + }, + { + "epoch": 3.40224358974359, + "grad_norm": 1.002318263053894, + "learning_rate": 1.0863788529487407e-05, + "loss": 0.7851, + "step": 19110 + }, + { + "epoch": 3.402421652421652, + "grad_norm": 1.0113846063613892, + "learning_rate": 1.0857444461450106e-05, + "loss": 0.911, + "step": 19111 + }, + { + "epoch": 3.402599715099715, + "grad_norm": 1.1443030834197998, + "learning_rate": 1.0851102139976e-05, + "loss": 0.8623, + "step": 19112 + }, + { + "epoch": 3.4027777777777777, + 
"grad_norm": 0.9804279208183289, + "learning_rate": 1.0844761565189354e-05, + "loss": 0.6802, + "step": 19113 + }, + { + "epoch": 3.4029558404558404, + "grad_norm": 0.9586710333824158, + "learning_rate": 1.0838422737214404e-05, + "loss": 0.8277, + "step": 19114 + }, + { + "epoch": 3.403133903133903, + "grad_norm": 0.8377839922904968, + "learning_rate": 1.083208565617534e-05, + "loss": 0.68, + "step": 19115 + }, + { + "epoch": 3.403311965811966, + "grad_norm": 0.901368260383606, + "learning_rate": 1.0825750322196304e-05, + "loss": 0.7212, + "step": 19116 + }, + { + "epoch": 3.4034900284900287, + "grad_norm": 0.9065432548522949, + "learning_rate": 1.08194167354015e-05, + "loss": 0.7003, + "step": 19117 + }, + { + "epoch": 3.403668091168091, + "grad_norm": 0.9668724536895752, + "learning_rate": 1.0813084895914915e-05, + "loss": 0.7776, + "step": 19118 + }, + { + "epoch": 3.4038461538461537, + "grad_norm": 0.9226725101470947, + "learning_rate": 1.0806754803860708e-05, + "loss": 0.8726, + "step": 19119 + }, + { + "epoch": 3.4040242165242165, + "grad_norm": 0.952060341835022, + "learning_rate": 1.0800426459362822e-05, + "loss": 0.6952, + "step": 19120 + }, + { + "epoch": 3.4042022792022792, + "grad_norm": 0.8968328833580017, + "learning_rate": 1.0794099862545293e-05, + "loss": 0.68, + "step": 19121 + }, + { + "epoch": 3.404380341880342, + "grad_norm": 0.8708463907241821, + "learning_rate": 1.0787775013532075e-05, + "loss": 0.6991, + "step": 19122 + }, + { + "epoch": 3.4045584045584047, + "grad_norm": 0.909925639629364, + "learning_rate": 1.078145191244706e-05, + "loss": 0.6232, + "step": 19123 + }, + { + "epoch": 3.404736467236467, + "grad_norm": 1.08229398727417, + "learning_rate": 1.0775130559414204e-05, + "loss": 0.786, + "step": 19124 + }, + { + "epoch": 3.40491452991453, + "grad_norm": 0.9113919734954834, + "learning_rate": 1.0768810954557273e-05, + "loss": 0.9116, + "step": 19125 + }, + { + "epoch": 3.4050925925925926, + "grad_norm": 1.0875805616378784, + 
"learning_rate": 1.0762493098000158e-05, + "loss": 0.922, + "step": 19126 + }, + { + "epoch": 3.4052706552706553, + "grad_norm": 1.0122466087341309, + "learning_rate": 1.0756176989866607e-05, + "loss": 0.6557, + "step": 19127 + }, + { + "epoch": 3.405448717948718, + "grad_norm": 0.9092339277267456, + "learning_rate": 1.0749862630280384e-05, + "loss": 0.5974, + "step": 19128 + }, + { + "epoch": 3.405626780626781, + "grad_norm": 0.8997379541397095, + "learning_rate": 1.0743550019365189e-05, + "loss": 0.9398, + "step": 19129 + }, + { + "epoch": 3.405804843304843, + "grad_norm": 0.9965963959693909, + "learning_rate": 1.0737239157244727e-05, + "loss": 0.7401, + "step": 19130 + }, + { + "epoch": 3.405982905982906, + "grad_norm": 0.9418243765830994, + "learning_rate": 1.0730930044042642e-05, + "loss": 0.8724, + "step": 19131 + }, + { + "epoch": 3.4061609686609686, + "grad_norm": 0.8757455348968506, + "learning_rate": 1.0724622679882534e-05, + "loss": 0.6671, + "step": 19132 + }, + { + "epoch": 3.4063390313390314, + "grad_norm": 0.8993387818336487, + "learning_rate": 1.0718317064887972e-05, + "loss": 0.6749, + "step": 19133 + }, + { + "epoch": 3.406517094017094, + "grad_norm": 0.9798662662506104, + "learning_rate": 1.071201319918257e-05, + "loss": 0.926, + "step": 19134 + }, + { + "epoch": 3.406695156695157, + "grad_norm": 0.8816490173339844, + "learning_rate": 1.0705711082889736e-05, + "loss": 0.789, + "step": 19135 + }, + { + "epoch": 3.406873219373219, + "grad_norm": 1.0084799528121948, + "learning_rate": 1.069941071613303e-05, + "loss": 0.8257, + "step": 19136 + }, + { + "epoch": 3.407051282051282, + "grad_norm": 0.9338811635971069, + "learning_rate": 1.0693112099035862e-05, + "loss": 0.723, + "step": 19137 + }, + { + "epoch": 3.4072293447293447, + "grad_norm": 0.9649238586425781, + "learning_rate": 1.0686815231721636e-05, + "loss": 0.9459, + "step": 19138 + }, + { + "epoch": 3.4074074074074074, + "grad_norm": 0.9531043767929077, + "learning_rate": 
1.068052011431374e-05, + "loss": 0.6914, + "step": 19139 + }, + { + "epoch": 3.40758547008547, + "grad_norm": 1.1801680326461792, + "learning_rate": 1.0674226746935511e-05, + "loss": 0.9974, + "step": 19140 + }, + { + "epoch": 3.407763532763533, + "grad_norm": 0.9781167507171631, + "learning_rate": 1.0667935129710249e-05, + "loss": 0.6573, + "step": 19141 + }, + { + "epoch": 3.4079415954415953, + "grad_norm": 1.018373966217041, + "learning_rate": 1.06616452627612e-05, + "loss": 0.8378, + "step": 19142 + }, + { + "epoch": 3.408119658119658, + "grad_norm": 0.9358958005905151, + "learning_rate": 1.0655357146211653e-05, + "loss": 0.7662, + "step": 19143 + }, + { + "epoch": 3.4082977207977208, + "grad_norm": 0.9799219369888306, + "learning_rate": 1.064907078018479e-05, + "loss": 0.8614, + "step": 19144 + }, + { + "epoch": 3.4084757834757835, + "grad_norm": 1.0031346082687378, + "learning_rate": 1.0642786164803764e-05, + "loss": 0.6955, + "step": 19145 + }, + { + "epoch": 3.4086538461538463, + "grad_norm": 0.8543678522109985, + "learning_rate": 1.0636503300191736e-05, + "loss": 0.7027, + "step": 19146 + }, + { + "epoch": 3.408831908831909, + "grad_norm": 0.9312917590141296, + "learning_rate": 1.0630222186471773e-05, + "loss": 0.7433, + "step": 19147 + }, + { + "epoch": 3.4090099715099713, + "grad_norm": 1.0434274673461914, + "learning_rate": 1.0623942823766941e-05, + "loss": 0.8913, + "step": 19148 + }, + { + "epoch": 3.409188034188034, + "grad_norm": 0.9719704985618591, + "learning_rate": 1.0617665212200335e-05, + "loss": 0.7474, + "step": 19149 + }, + { + "epoch": 3.409366096866097, + "grad_norm": 0.9384031295776367, + "learning_rate": 1.0611389351894852e-05, + "loss": 0.738, + "step": 19150 + }, + { + "epoch": 3.4095441595441596, + "grad_norm": 0.9763504862785339, + "learning_rate": 1.060511524297353e-05, + "loss": 0.9144, + "step": 19151 + }, + { + "epoch": 3.4097222222222223, + "grad_norm": 1.044063925743103, + "learning_rate": 1.0598842885559268e-05, + "loss": 
0.8187, + "step": 19152 + }, + { + "epoch": 3.409900284900285, + "grad_norm": 0.9677876234054565, + "learning_rate": 1.059257227977497e-05, + "loss": 0.6989, + "step": 19153 + }, + { + "epoch": 3.410078347578348, + "grad_norm": 0.8552587032318115, + "learning_rate": 1.0586303425743493e-05, + "loss": 0.8524, + "step": 19154 + }, + { + "epoch": 3.41025641025641, + "grad_norm": 1.0604221820831299, + "learning_rate": 1.058003632358766e-05, + "loss": 0.7479, + "step": 19155 + }, + { + "epoch": 3.410434472934473, + "grad_norm": 0.7916514277458191, + "learning_rate": 1.0573770973430253e-05, + "loss": 0.5946, + "step": 19156 + }, + { + "epoch": 3.4106125356125356, + "grad_norm": 0.7886891961097717, + "learning_rate": 1.0567507375394048e-05, + "loss": 0.6991, + "step": 19157 + }, + { + "epoch": 3.4107905982905984, + "grad_norm": 0.8678019642829895, + "learning_rate": 1.0561245529601727e-05, + "loss": 0.6513, + "step": 19158 + }, + { + "epoch": 3.410968660968661, + "grad_norm": 1.034912347793579, + "learning_rate": 1.0554985436176045e-05, + "loss": 0.8279, + "step": 19159 + }, + { + "epoch": 3.4111467236467234, + "grad_norm": 0.9476612210273743, + "learning_rate": 1.0548727095239585e-05, + "loss": 0.7955, + "step": 19160 + }, + { + "epoch": 3.411324786324786, + "grad_norm": 0.8874943256378174, + "learning_rate": 1.0542470506915015e-05, + "loss": 0.6118, + "step": 19161 + }, + { + "epoch": 3.411502849002849, + "grad_norm": 0.9441639184951782, + "learning_rate": 1.0536215671324912e-05, + "loss": 0.7955, + "step": 19162 + }, + { + "epoch": 3.4116809116809117, + "grad_norm": 0.8857356309890747, + "learning_rate": 1.052996258859179e-05, + "loss": 0.7249, + "step": 19163 + }, + { + "epoch": 3.4118589743589745, + "grad_norm": 0.9193042516708374, + "learning_rate": 1.0523711258838231e-05, + "loss": 0.7856, + "step": 19164 + }, + { + "epoch": 3.412037037037037, + "grad_norm": 0.962785542011261, + "learning_rate": 1.0517461682186646e-05, + "loss": 0.9475, + "step": 19165 + }, + { + 
"epoch": 3.4122150997151, + "grad_norm": 1.0464279651641846, + "learning_rate": 1.051121385875955e-05, + "loss": 0.8153, + "step": 19166 + }, + { + "epoch": 3.4123931623931623, + "grad_norm": 0.9516451954841614, + "learning_rate": 1.0504967788679277e-05, + "loss": 0.683, + "step": 19167 + }, + { + "epoch": 3.412571225071225, + "grad_norm": 0.9690127968788147, + "learning_rate": 1.0498723472068283e-05, + "loss": 0.6226, + "step": 19168 + }, + { + "epoch": 3.4127492877492878, + "grad_norm": 0.9517914652824402, + "learning_rate": 1.0492480909048875e-05, + "loss": 0.626, + "step": 19169 + }, + { + "epoch": 3.4129273504273505, + "grad_norm": 0.9089692234992981, + "learning_rate": 1.0486240099743362e-05, + "loss": 0.6041, + "step": 19170 + }, + { + "epoch": 3.4131054131054133, + "grad_norm": 0.9216201305389404, + "learning_rate": 1.0480001044274023e-05, + "loss": 0.6596, + "step": 19171 + }, + { + "epoch": 3.4132834757834756, + "grad_norm": 1.018068552017212, + "learning_rate": 1.047376374276311e-05, + "loss": 0.8819, + "step": 19172 + }, + { + "epoch": 3.4134615384615383, + "grad_norm": 0.9685564041137695, + "learning_rate": 1.046752819533281e-05, + "loss": 0.6659, + "step": 19173 + }, + { + "epoch": 3.413639601139601, + "grad_norm": 0.9151574373245239, + "learning_rate": 1.0461294402105327e-05, + "loss": 0.6773, + "step": 19174 + }, + { + "epoch": 3.413817663817664, + "grad_norm": 0.8832176923751831, + "learning_rate": 1.045506236320275e-05, + "loss": 0.658, + "step": 19175 + }, + { + "epoch": 3.4139957264957266, + "grad_norm": 0.8827099204063416, + "learning_rate": 1.0448832078747262e-05, + "loss": 0.7711, + "step": 19176 + }, + { + "epoch": 3.4141737891737893, + "grad_norm": 0.8752033114433289, + "learning_rate": 1.044260354886083e-05, + "loss": 0.8464, + "step": 19177 + }, + { + "epoch": 3.414351851851852, + "grad_norm": 1.0381900072097778, + "learning_rate": 1.0436376773665569e-05, + "loss": 0.9648, + "step": 19178 + }, + { + "epoch": 3.4145299145299144, + 
"grad_norm": 1.02308988571167, + "learning_rate": 1.0430151753283445e-05, + "loss": 0.9882, + "step": 19179 + }, + { + "epoch": 3.414707977207977, + "grad_norm": 1.0302221775054932, + "learning_rate": 1.042392848783642e-05, + "loss": 0.8057, + "step": 19180 + }, + { + "epoch": 3.41488603988604, + "grad_norm": 1.1306030750274658, + "learning_rate": 1.0417706977446473e-05, + "loss": 0.7419, + "step": 19181 + }, + { + "epoch": 3.4150641025641026, + "grad_norm": 0.9197164177894592, + "learning_rate": 1.0411487222235428e-05, + "loss": 0.7504, + "step": 19182 + }, + { + "epoch": 3.4152421652421654, + "grad_norm": 0.8756219744682312, + "learning_rate": 1.0405269222325209e-05, + "loss": 0.7493, + "step": 19183 + }, + { + "epoch": 3.4154202279202277, + "grad_norm": 1.0161107778549194, + "learning_rate": 1.039905297783762e-05, + "loss": 0.6777, + "step": 19184 + }, + { + "epoch": 3.4155982905982905, + "grad_norm": 0.8732602596282959, + "learning_rate": 1.0392838488894463e-05, + "loss": 0.7629, + "step": 19185 + }, + { + "epoch": 3.415776353276353, + "grad_norm": 1.0775624513626099, + "learning_rate": 1.0386625755617485e-05, + "loss": 0.7746, + "step": 19186 + }, + { + "epoch": 3.415954415954416, + "grad_norm": 0.8961769342422485, + "learning_rate": 1.0380414778128423e-05, + "loss": 0.5364, + "step": 19187 + }, + { + "epoch": 3.4161324786324787, + "grad_norm": 0.9528487324714661, + "learning_rate": 1.0374205556548944e-05, + "loss": 0.7021, + "step": 19188 + }, + { + "epoch": 3.4163105413105415, + "grad_norm": 1.0150240659713745, + "learning_rate": 1.0367998091000764e-05, + "loss": 0.7728, + "step": 19189 + }, + { + "epoch": 3.416488603988604, + "grad_norm": 0.9373225569725037, + "learning_rate": 1.036179238160544e-05, + "loss": 0.8242, + "step": 19190 + }, + { + "epoch": 3.4166666666666665, + "grad_norm": 1.0559889078140259, + "learning_rate": 1.0355588428484608e-05, + "loss": 0.7779, + "step": 19191 + }, + { + "epoch": 3.4168447293447293, + "grad_norm": 0.8923398852348328, + 
"learning_rate": 1.0349386231759773e-05, + "loss": 0.621, + "step": 19192 + }, + { + "epoch": 3.417022792022792, + "grad_norm": 0.8571991324424744, + "learning_rate": 1.0343185791552501e-05, + "loss": 0.6731, + "step": 19193 + }, + { + "epoch": 3.4172008547008548, + "grad_norm": 1.01732337474823, + "learning_rate": 1.0336987107984253e-05, + "loss": 0.8578, + "step": 19194 + }, + { + "epoch": 3.4173789173789175, + "grad_norm": 0.9371682405471802, + "learning_rate": 1.0330790181176487e-05, + "loss": 0.8188, + "step": 19195 + }, + { + "epoch": 3.41755698005698, + "grad_norm": 0.8861309885978699, + "learning_rate": 1.0324595011250616e-05, + "loss": 0.7745, + "step": 19196 + }, + { + "epoch": 3.4177350427350426, + "grad_norm": 0.8951418995857239, + "learning_rate": 1.031840159832802e-05, + "loss": 0.6104, + "step": 19197 + }, + { + "epoch": 3.4179131054131053, + "grad_norm": 0.9123335480690002, + "learning_rate": 1.0312209942530027e-05, + "loss": 0.7475, + "step": 19198 + }, + { + "epoch": 3.418091168091168, + "grad_norm": 0.9403263926506042, + "learning_rate": 1.0306020043978004e-05, + "loss": 0.7961, + "step": 19199 + }, + { + "epoch": 3.418269230769231, + "grad_norm": 0.9268109202384949, + "learning_rate": 1.0299831902793168e-05, + "loss": 0.6927, + "step": 19200 + }, + { + "epoch": 3.4184472934472936, + "grad_norm": 0.9422159194946289, + "learning_rate": 1.0293645519096807e-05, + "loss": 0.8041, + "step": 19201 + }, + { + "epoch": 3.4186253561253563, + "grad_norm": 0.9820014834403992, + "learning_rate": 1.0287460893010104e-05, + "loss": 0.6622, + "step": 19202 + }, + { + "epoch": 3.4188034188034186, + "grad_norm": 0.9447580575942993, + "learning_rate": 1.028127802465425e-05, + "loss": 0.8675, + "step": 19203 + }, + { + "epoch": 3.4189814814814814, + "grad_norm": 0.8962849378585815, + "learning_rate": 1.0275096914150373e-05, + "loss": 0.5652, + "step": 19204 + }, + { + "epoch": 3.419159544159544, + "grad_norm": 1.0891627073287964, + "learning_rate": 
1.0268917561619574e-05, + "loss": 0.7977, + "step": 19205 + }, + { + "epoch": 3.419337606837607, + "grad_norm": 0.9739325642585754, + "learning_rate": 1.0262739967182966e-05, + "loss": 0.7116, + "step": 19206 + }, + { + "epoch": 3.4195156695156697, + "grad_norm": 0.9855953454971313, + "learning_rate": 1.0256564130961522e-05, + "loss": 0.6721, + "step": 19207 + }, + { + "epoch": 3.419693732193732, + "grad_norm": 1.1992031335830688, + "learning_rate": 1.0250390053076298e-05, + "loss": 0.892, + "step": 19208 + }, + { + "epoch": 3.4198717948717947, + "grad_norm": 0.9223141074180603, + "learning_rate": 1.0244217733648242e-05, + "loss": 0.5814, + "step": 19209 + }, + { + "epoch": 3.4200498575498575, + "grad_norm": 1.0374293327331543, + "learning_rate": 1.0238047172798282e-05, + "loss": 0.6035, + "step": 19210 + }, + { + "epoch": 3.42022792022792, + "grad_norm": 0.8509711027145386, + "learning_rate": 1.023187837064733e-05, + "loss": 0.6018, + "step": 19211 + }, + { + "epoch": 3.420405982905983, + "grad_norm": 1.0256701707839966, + "learning_rate": 1.0225711327316234e-05, + "loss": 0.6962, + "step": 19212 + }, + { + "epoch": 3.4205840455840457, + "grad_norm": 0.9454541206359863, + "learning_rate": 1.0219546042925843e-05, + "loss": 0.8685, + "step": 19213 + }, + { + "epoch": 3.4207621082621085, + "grad_norm": 0.9761176705360413, + "learning_rate": 1.0213382517596948e-05, + "loss": 0.729, + "step": 19214 + }, + { + "epoch": 3.4209401709401708, + "grad_norm": 0.9365816712379456, + "learning_rate": 1.0207220751450286e-05, + "loss": 0.7878, + "step": 19215 + }, + { + "epoch": 3.4211182336182335, + "grad_norm": 0.884979784488678, + "learning_rate": 1.0201060744606639e-05, + "loss": 0.6086, + "step": 19216 + }, + { + "epoch": 3.4212962962962963, + "grad_norm": 0.9652853608131409, + "learning_rate": 1.019490249718663e-05, + "loss": 0.9374, + "step": 19217 + }, + { + "epoch": 3.421474358974359, + "grad_norm": 0.9737793803215027, + "learning_rate": 1.0188746009310968e-05, + "loss": 
0.8034, + "step": 19218 + }, + { + "epoch": 3.421652421652422, + "grad_norm": 1.0123218297958374, + "learning_rate": 1.0182591281100262e-05, + "loss": 0.7928, + "step": 19219 + }, + { + "epoch": 3.421830484330484, + "grad_norm": 0.9481899738311768, + "learning_rate": 1.0176438312675086e-05, + "loss": 0.9576, + "step": 19220 + }, + { + "epoch": 3.422008547008547, + "grad_norm": 0.977864146232605, + "learning_rate": 1.017028710415604e-05, + "loss": 0.8792, + "step": 19221 + }, + { + "epoch": 3.4221866096866096, + "grad_norm": 1.0100616216659546, + "learning_rate": 1.0164137655663586e-05, + "loss": 0.8322, + "step": 19222 + }, + { + "epoch": 3.4223646723646723, + "grad_norm": 0.8587307929992676, + "learning_rate": 1.015798996731825e-05, + "loss": 0.68, + "step": 19223 + }, + { + "epoch": 3.422542735042735, + "grad_norm": 0.979102373123169, + "learning_rate": 1.0151844039240465e-05, + "loss": 0.9162, + "step": 19224 + }, + { + "epoch": 3.422720797720798, + "grad_norm": 0.95932936668396, + "learning_rate": 1.0145699871550652e-05, + "loss": 0.865, + "step": 19225 + }, + { + "epoch": 3.4228988603988606, + "grad_norm": 1.080337643623352, + "learning_rate": 1.0139557464369199e-05, + "loss": 0.8694, + "step": 19226 + }, + { + "epoch": 3.423076923076923, + "grad_norm": 0.9872581362724304, + "learning_rate": 1.0133416817816454e-05, + "loss": 0.7365, + "step": 19227 + }, + { + "epoch": 3.4232549857549857, + "grad_norm": 0.9537657499313354, + "learning_rate": 1.0127277932012713e-05, + "loss": 0.9292, + "step": 19228 + }, + { + "epoch": 3.4234330484330484, + "grad_norm": 1.0455421209335327, + "learning_rate": 1.0121140807078278e-05, + "loss": 0.6459, + "step": 19229 + }, + { + "epoch": 3.423611111111111, + "grad_norm": 0.9274110198020935, + "learning_rate": 1.0115005443133351e-05, + "loss": 0.6901, + "step": 19230 + }, + { + "epoch": 3.423789173789174, + "grad_norm": 0.9798219799995422, + "learning_rate": 1.0108871840298217e-05, + "loss": 0.7448, + "step": 19231 + }, + { + 
"epoch": 3.423967236467236, + "grad_norm": 0.9771175980567932, + "learning_rate": 1.0102739998692977e-05, + "loss": 0.9554, + "step": 19232 + }, + { + "epoch": 3.424145299145299, + "grad_norm": 1.0822041034698486, + "learning_rate": 1.0096609918437805e-05, + "loss": 0.6761, + "step": 19233 + }, + { + "epoch": 3.4243233618233617, + "grad_norm": 1.0950684547424316, + "learning_rate": 1.0090481599652812e-05, + "loss": 0.6483, + "step": 19234 + }, + { + "epoch": 3.4245014245014245, + "grad_norm": 1.1559733152389526, + "learning_rate": 1.008435504245806e-05, + "loss": 0.6516, + "step": 19235 + }, + { + "epoch": 3.4246794871794872, + "grad_norm": 0.8701435327529907, + "learning_rate": 1.0078230246973586e-05, + "loss": 0.6525, + "step": 19236 + }, + { + "epoch": 3.42485754985755, + "grad_norm": 1.002463698387146, + "learning_rate": 1.0072107213319394e-05, + "loss": 0.7652, + "step": 19237 + }, + { + "epoch": 3.4250356125356127, + "grad_norm": 1.0020043849945068, + "learning_rate": 1.0065985941615452e-05, + "loss": 0.6226, + "step": 19238 + }, + { + "epoch": 3.425213675213675, + "grad_norm": 1.0987087488174438, + "learning_rate": 1.0059866431981701e-05, + "loss": 0.8834, + "step": 19239 + }, + { + "epoch": 3.425391737891738, + "grad_norm": 1.0064318180084229, + "learning_rate": 1.0053748684538e-05, + "loss": 0.7865, + "step": 19240 + }, + { + "epoch": 3.4255698005698005, + "grad_norm": 0.9753440618515015, + "learning_rate": 1.0047632699404274e-05, + "loss": 0.8149, + "step": 19241 + }, + { + "epoch": 3.4257478632478633, + "grad_norm": 0.9898001551628113, + "learning_rate": 1.0041518476700329e-05, + "loss": 0.7078, + "step": 19242 + }, + { + "epoch": 3.425925925925926, + "grad_norm": 0.9829652309417725, + "learning_rate": 1.0035406016545955e-05, + "loss": 0.7877, + "step": 19243 + }, + { + "epoch": 3.426103988603989, + "grad_norm": 0.8720074892044067, + "learning_rate": 1.0029295319060905e-05, + "loss": 0.7099, + "step": 19244 + }, + { + "epoch": 3.426282051282051, + 
"grad_norm": 0.8775247931480408, + "learning_rate": 1.0023186384364902e-05, + "loss": 0.748, + "step": 19245 + }, + { + "epoch": 3.426460113960114, + "grad_norm": 1.011750340461731, + "learning_rate": 1.0017079212577696e-05, + "loss": 0.8075, + "step": 19246 + }, + { + "epoch": 3.4266381766381766, + "grad_norm": 0.9512860178947449, + "learning_rate": 1.0010973803818857e-05, + "loss": 0.8108, + "step": 19247 + }, + { + "epoch": 3.4268162393162394, + "grad_norm": 0.9716009497642517, + "learning_rate": 1.000487015820809e-05, + "loss": 0.8015, + "step": 19248 + }, + { + "epoch": 3.426994301994302, + "grad_norm": 1.016042709350586, + "learning_rate": 9.998768275864901e-06, + "loss": 0.6669, + "step": 19249 + }, + { + "epoch": 3.427172364672365, + "grad_norm": 0.9144439697265625, + "learning_rate": 9.992668156908902e-06, + "loss": 0.6384, + "step": 19250 + }, + { + "epoch": 3.427350427350427, + "grad_norm": 0.9310395121574402, + "learning_rate": 9.9865698014596e-06, + "loss": 0.7346, + "step": 19251 + }, + { + "epoch": 3.42752849002849, + "grad_norm": 1.012934684753418, + "learning_rate": 9.980473209636477e-06, + "loss": 0.7994, + "step": 19252 + }, + { + "epoch": 3.4277065527065527, + "grad_norm": 1.0739672183990479, + "learning_rate": 9.97437838155899e-06, + "loss": 0.9918, + "step": 19253 + }, + { + "epoch": 3.4278846153846154, + "grad_norm": 1.0439870357513428, + "learning_rate": 9.968285317346538e-06, + "loss": 0.8058, + "step": 19254 + }, + { + "epoch": 3.428062678062678, + "grad_norm": 0.8677945733070374, + "learning_rate": 9.962194017118486e-06, + "loss": 0.7721, + "step": 19255 + }, + { + "epoch": 3.428240740740741, + "grad_norm": 0.9556106328964233, + "learning_rate": 9.956104480994254e-06, + "loss": 0.7655, + "step": 19256 + }, + { + "epoch": 3.4284188034188032, + "grad_norm": 0.9485167264938354, + "learning_rate": 9.950016709093068e-06, + "loss": 0.7862, + "step": 19257 + }, + { + "epoch": 3.428596866096866, + "grad_norm": 0.9476799368858337, + 
"learning_rate": 9.943930701534254e-06, + "loss": 0.7286, + "step": 19258 + }, + { + "epoch": 3.4287749287749287, + "grad_norm": 0.8918333649635315, + "learning_rate": 9.937846458437039e-06, + "loss": 0.5837, + "step": 19259 + }, + { + "epoch": 3.4289529914529915, + "grad_norm": 0.9961351156234741, + "learning_rate": 9.931763979920627e-06, + "loss": 0.719, + "step": 19260 + }, + { + "epoch": 3.4291310541310542, + "grad_norm": 0.832885205745697, + "learning_rate": 9.925683266104224e-06, + "loss": 0.7854, + "step": 19261 + }, + { + "epoch": 3.429309116809117, + "grad_norm": 0.9508270621299744, + "learning_rate": 9.919604317106913e-06, + "loss": 0.7528, + "step": 19262 + }, + { + "epoch": 3.4294871794871793, + "grad_norm": 0.9197067022323608, + "learning_rate": 9.913527133047873e-06, + "loss": 0.8416, + "step": 19263 + }, + { + "epoch": 3.429665242165242, + "grad_norm": 0.8550262451171875, + "learning_rate": 9.90745171404609e-06, + "loss": 0.8283, + "step": 19264 + }, + { + "epoch": 3.429843304843305, + "grad_norm": 0.9762335419654846, + "learning_rate": 9.901378060220646e-06, + "loss": 0.8186, + "step": 19265 + }, + { + "epoch": 3.4300213675213675, + "grad_norm": 0.8388769030570984, + "learning_rate": 9.895306171690554e-06, + "loss": 0.5665, + "step": 19266 + }, + { + "epoch": 3.4301994301994303, + "grad_norm": 0.7391115427017212, + "learning_rate": 9.889236048574746e-06, + "loss": 0.5464, + "step": 19267 + }, + { + "epoch": 3.430377492877493, + "grad_norm": 0.7876406311988831, + "learning_rate": 9.883167690992179e-06, + "loss": 0.3883, + "step": 19268 + }, + { + "epoch": 3.4305555555555554, + "grad_norm": 1.0693145990371704, + "learning_rate": 9.877101099061737e-06, + "loss": 0.7961, + "step": 19269 + }, + { + "epoch": 3.430733618233618, + "grad_norm": 1.2267308235168457, + "learning_rate": 9.871036272902256e-06, + "loss": 0.9196, + "step": 19270 + }, + { + "epoch": 3.430911680911681, + "grad_norm": 1.0068786144256592, + "learning_rate": 9.864973212632645e-06, + 
"loss": 0.8718, + "step": 19271 + }, + { + "epoch": 3.4310897435897436, + "grad_norm": 0.8946726322174072, + "learning_rate": 9.858911918371605e-06, + "loss": 0.7608, + "step": 19272 + }, + { + "epoch": 3.4312678062678064, + "grad_norm": 0.9454519748687744, + "learning_rate": 9.852852390237966e-06, + "loss": 0.7771, + "step": 19273 + }, + { + "epoch": 3.431445868945869, + "grad_norm": 0.9317994713783264, + "learning_rate": 9.846794628350376e-06, + "loss": 0.8406, + "step": 19274 + }, + { + "epoch": 3.431623931623932, + "grad_norm": 0.9324338436126709, + "learning_rate": 9.840738632827594e-06, + "loss": 0.7005, + "step": 19275 + }, + { + "epoch": 3.431801994301994, + "grad_norm": 0.960617184638977, + "learning_rate": 9.834684403788252e-06, + "loss": 0.8823, + "step": 19276 + }, + { + "epoch": 3.431980056980057, + "grad_norm": 0.89632248878479, + "learning_rate": 9.828631941350963e-06, + "loss": 0.6818, + "step": 19277 + }, + { + "epoch": 3.4321581196581197, + "grad_norm": 1.0242297649383545, + "learning_rate": 9.822581245634321e-06, + "loss": 0.8945, + "step": 19278 + }, + { + "epoch": 3.4323361823361824, + "grad_norm": 1.0989742279052734, + "learning_rate": 9.816532316756855e-06, + "loss": 0.7583, + "step": 19279 + }, + { + "epoch": 3.432514245014245, + "grad_norm": 0.9161012172698975, + "learning_rate": 9.810485154837112e-06, + "loss": 0.695, + "step": 19280 + }, + { + "epoch": 3.4326923076923075, + "grad_norm": 0.8679816126823425, + "learning_rate": 9.804439759993555e-06, + "loss": 0.5813, + "step": 19281 + }, + { + "epoch": 3.4328703703703702, + "grad_norm": 0.8272216320037842, + "learning_rate": 9.798396132344644e-06, + "loss": 0.6652, + "step": 19282 + }, + { + "epoch": 3.433048433048433, + "grad_norm": 0.9027792811393738, + "learning_rate": 9.792354272008775e-06, + "loss": 0.7356, + "step": 19283 + }, + { + "epoch": 3.4332264957264957, + "grad_norm": 0.9031938910484314, + "learning_rate": 9.786314179104339e-06, + "loss": 0.8202, + "step": 19284 + }, + { + 
"epoch": 3.4334045584045585, + "grad_norm": 1.02913236618042, + "learning_rate": 9.780275853749676e-06, + "loss": 0.6623, + "step": 19285 + }, + { + "epoch": 3.4335826210826212, + "grad_norm": 1.0442672967910767, + "learning_rate": 9.774239296063093e-06, + "loss": 0.8638, + "step": 19286 + }, + { + "epoch": 3.433760683760684, + "grad_norm": 0.9184064865112305, + "learning_rate": 9.768204506162837e-06, + "loss": 0.7879, + "step": 19287 + }, + { + "epoch": 3.4339387464387463, + "grad_norm": 1.2148641347885132, + "learning_rate": 9.762171484167216e-06, + "loss": 0.7737, + "step": 19288 + }, + { + "epoch": 3.434116809116809, + "grad_norm": 0.9303769469261169, + "learning_rate": 9.756140230194355e-06, + "loss": 0.7484, + "step": 19289 + }, + { + "epoch": 3.434294871794872, + "grad_norm": 0.9049912691116333, + "learning_rate": 9.750110744362484e-06, + "loss": 0.6674, + "step": 19290 + }, + { + "epoch": 3.4344729344729346, + "grad_norm": 0.7755575776100159, + "learning_rate": 9.744083026789708e-06, + "loss": 0.6105, + "step": 19291 + }, + { + "epoch": 3.4346509971509973, + "grad_norm": 1.0556714534759521, + "learning_rate": 9.738057077594132e-06, + "loss": 0.8547, + "step": 19292 + }, + { + "epoch": 3.4348290598290596, + "grad_norm": 0.8901651501655579, + "learning_rate": 9.732032896893838e-06, + "loss": 0.6511, + "step": 19293 + }, + { + "epoch": 3.4350071225071224, + "grad_norm": 1.0504934787750244, + "learning_rate": 9.726010484806836e-06, + "loss": 0.8204, + "step": 19294 + }, + { + "epoch": 3.435185185185185, + "grad_norm": 0.9080862402915955, + "learning_rate": 9.719989841451105e-06, + "loss": 0.7081, + "step": 19295 + }, + { + "epoch": 3.435363247863248, + "grad_norm": 0.8364829421043396, + "learning_rate": 9.713970966944674e-06, + "loss": 0.5457, + "step": 19296 + }, + { + "epoch": 3.4355413105413106, + "grad_norm": 1.1203440427780151, + "learning_rate": 9.707953861405394e-06, + "loss": 0.6262, + "step": 19297 + }, + { + "epoch": 3.4357193732193734, + "grad_norm": 
0.9322909116744995, + "learning_rate": 9.701938524951237e-06, + "loss": 0.7216, + "step": 19298 + }, + { + "epoch": 3.435897435897436, + "grad_norm": 1.0503484010696411, + "learning_rate": 9.695924957699964e-06, + "loss": 0.6878, + "step": 19299 + }, + { + "epoch": 3.4360754985754984, + "grad_norm": 1.0737676620483398, + "learning_rate": 9.689913159769481e-06, + "loss": 0.9491, + "step": 19300 + }, + { + "epoch": 3.436253561253561, + "grad_norm": 1.0106950998306274, + "learning_rate": 9.683903131277539e-06, + "loss": 0.7918, + "step": 19301 + }, + { + "epoch": 3.436431623931624, + "grad_norm": 0.890433669090271, + "learning_rate": 9.677894872341886e-06, + "loss": 0.8192, + "step": 19302 + }, + { + "epoch": 3.4366096866096867, + "grad_norm": 0.9569767117500305, + "learning_rate": 9.671888383080297e-06, + "loss": 0.6926, + "step": 19303 + }, + { + "epoch": 3.4367877492877494, + "grad_norm": 1.1156257390975952, + "learning_rate": 9.665883663610365e-06, + "loss": 0.9691, + "step": 19304 + }, + { + "epoch": 3.4369658119658117, + "grad_norm": 1.0798922777175903, + "learning_rate": 9.659880714049819e-06, + "loss": 0.8248, + "step": 19305 + }, + { + "epoch": 3.4371438746438745, + "grad_norm": 1.134392261505127, + "learning_rate": 9.653879534516242e-06, + "loss": 0.7283, + "step": 19306 + }, + { + "epoch": 3.4373219373219372, + "grad_norm": 0.9919711947441101, + "learning_rate": 9.647880125127217e-06, + "loss": 0.7475, + "step": 19307 + }, + { + "epoch": 3.4375, + "grad_norm": 0.9417802095413208, + "learning_rate": 9.641882486000287e-06, + "loss": 0.7534, + "step": 19308 + }, + { + "epoch": 3.4376780626780628, + "grad_norm": 1.0912595987319946, + "learning_rate": 9.635886617252975e-06, + "loss": 0.6257, + "step": 19309 + }, + { + "epoch": 3.4378561253561255, + "grad_norm": 0.8998816013336182, + "learning_rate": 9.629892519002747e-06, + "loss": 0.7533, + "step": 19310 + }, + { + "epoch": 3.4380341880341883, + "grad_norm": 1.1993279457092285, + "learning_rate": 
9.623900191367041e-06, + "loss": 0.7533, + "step": 19311 + }, + { + "epoch": 3.4382122507122506, + "grad_norm": 0.9719237089157104, + "learning_rate": 9.617909634463263e-06, + "loss": 0.7894, + "step": 19312 + }, + { + "epoch": 3.4383903133903133, + "grad_norm": 0.9022089838981628, + "learning_rate": 9.611920848408817e-06, + "loss": 0.7212, + "step": 19313 + }, + { + "epoch": 3.438568376068376, + "grad_norm": 0.9632169008255005, + "learning_rate": 9.60593383332099e-06, + "loss": 0.8404, + "step": 19314 + }, + { + "epoch": 3.438746438746439, + "grad_norm": 0.9799385070800781, + "learning_rate": 9.599948589317131e-06, + "loss": 0.7254, + "step": 19315 + }, + { + "epoch": 3.4389245014245016, + "grad_norm": 0.9019197225570679, + "learning_rate": 9.59396511651448e-06, + "loss": 0.6593, + "step": 19316 + }, + { + "epoch": 3.439102564102564, + "grad_norm": 1.1352930068969727, + "learning_rate": 9.587983415030288e-06, + "loss": 0.9686, + "step": 19317 + }, + { + "epoch": 3.4392806267806266, + "grad_norm": 0.8560138940811157, + "learning_rate": 9.58200348498175e-06, + "loss": 0.7901, + "step": 19318 + }, + { + "epoch": 3.4394586894586894, + "grad_norm": 0.8698241710662842, + "learning_rate": 9.576025326485993e-06, + "loss": 0.6078, + "step": 19319 + }, + { + "epoch": 3.439636752136752, + "grad_norm": 0.9737006425857544, + "learning_rate": 9.570048939660236e-06, + "loss": 0.9071, + "step": 19320 + }, + { + "epoch": 3.439814814814815, + "grad_norm": 0.8899868726730347, + "learning_rate": 9.564074324621475e-06, + "loss": 0.6993, + "step": 19321 + }, + { + "epoch": 3.4399928774928776, + "grad_norm": 0.8926196694374084, + "learning_rate": 9.558101481486826e-06, + "loss": 0.533, + "step": 19322 + }, + { + "epoch": 3.4401709401709404, + "grad_norm": 0.8816601634025574, + "learning_rate": 9.552130410373306e-06, + "loss": 0.8344, + "step": 19323 + }, + { + "epoch": 3.4403490028490027, + "grad_norm": 1.0575761795043945, + "learning_rate": 9.5461611113979e-06, + "loss": 0.9992, + 
"step": 19324 + }, + { + "epoch": 3.4405270655270654, + "grad_norm": 0.9737523198127747, + "learning_rate": 9.54019358467756e-06, + "loss": 0.8928, + "step": 19325 + }, + { + "epoch": 3.440705128205128, + "grad_norm": 1.1094090938568115, + "learning_rate": 9.534227830329224e-06, + "loss": 0.846, + "step": 19326 + }, + { + "epoch": 3.440883190883191, + "grad_norm": 0.8708354830741882, + "learning_rate": 9.528263848469742e-06, + "loss": 0.7715, + "step": 19327 + }, + { + "epoch": 3.4410612535612537, + "grad_norm": 0.916589081287384, + "learning_rate": 9.522301639216024e-06, + "loss": 0.6541, + "step": 19328 + }, + { + "epoch": 3.441239316239316, + "grad_norm": 0.8238539695739746, + "learning_rate": 9.51634120268482e-06, + "loss": 0.7523, + "step": 19329 + }, + { + "epoch": 3.4414173789173788, + "grad_norm": 0.9754942059516907, + "learning_rate": 9.510382538992969e-06, + "loss": 0.7002, + "step": 19330 + }, + { + "epoch": 3.4415954415954415, + "grad_norm": 0.898399293422699, + "learning_rate": 9.504425648257198e-06, + "loss": 0.7644, + "step": 19331 + }, + { + "epoch": 3.4417735042735043, + "grad_norm": 0.8976600766181946, + "learning_rate": 9.498470530594217e-06, + "loss": 0.7345, + "step": 19332 + }, + { + "epoch": 3.441951566951567, + "grad_norm": 1.0468547344207764, + "learning_rate": 9.492517186120697e-06, + "loss": 0.7121, + "step": 19333 + }, + { + "epoch": 3.4421296296296298, + "grad_norm": 1.185500144958496, + "learning_rate": 9.486565614953302e-06, + "loss": 0.8665, + "step": 19334 + }, + { + "epoch": 3.4423076923076925, + "grad_norm": 1.0200905799865723, + "learning_rate": 9.480615817208615e-06, + "loss": 0.7206, + "step": 19335 + }, + { + "epoch": 3.442485754985755, + "grad_norm": 0.9022289514541626, + "learning_rate": 9.474667793003234e-06, + "loss": 0.6764, + "step": 19336 + }, + { + "epoch": 3.4426638176638176, + "grad_norm": 0.9930318593978882, + "learning_rate": 9.468721542453662e-06, + "loss": 0.7878, + "step": 19337 + }, + { + "epoch": 
3.4428418803418803, + "grad_norm": 0.9643216729164124, + "learning_rate": 9.462777065676476e-06, + "loss": 0.8636, + "step": 19338 + }, + { + "epoch": 3.443019943019943, + "grad_norm": 1.0845335721969604, + "learning_rate": 9.456834362788059e-06, + "loss": 0.6301, + "step": 19339 + }, + { + "epoch": 3.443198005698006, + "grad_norm": 0.8617258071899414, + "learning_rate": 9.450893433904894e-06, + "loss": 0.6117, + "step": 19340 + }, + { + "epoch": 3.443376068376068, + "grad_norm": 1.0213714838027954, + "learning_rate": 9.444954279143382e-06, + "loss": 0.7732, + "step": 19341 + }, + { + "epoch": 3.443554131054131, + "grad_norm": 1.0030566453933716, + "learning_rate": 9.439016898619857e-06, + "loss": 0.6772, + "step": 19342 + }, + { + "epoch": 3.4437321937321936, + "grad_norm": 0.8375689387321472, + "learning_rate": 9.433081292450708e-06, + "loss": 0.6771, + "step": 19343 + }, + { + "epoch": 3.4439102564102564, + "grad_norm": 0.8299211859703064, + "learning_rate": 9.42714746075216e-06, + "loss": 0.649, + "step": 19344 + }, + { + "epoch": 3.444088319088319, + "grad_norm": 0.9578790068626404, + "learning_rate": 9.421215403640549e-06, + "loss": 0.5323, + "step": 19345 + }, + { + "epoch": 3.444266381766382, + "grad_norm": 1.0069869756698608, + "learning_rate": 9.415285121232021e-06, + "loss": 0.694, + "step": 19346 + }, + { + "epoch": 3.4444444444444446, + "grad_norm": 1.0765708684921265, + "learning_rate": 9.409356613642817e-06, + "loss": 0.7485, + "step": 19347 + }, + { + "epoch": 3.444622507122507, + "grad_norm": 1.0542240142822266, + "learning_rate": 9.40342988098909e-06, + "loss": 0.9368, + "step": 19348 + }, + { + "epoch": 3.4448005698005697, + "grad_norm": 0.8405135869979858, + "learning_rate": 9.397504923386957e-06, + "loss": 0.5911, + "step": 19349 + }, + { + "epoch": 3.4449786324786325, + "grad_norm": 0.8794723153114319, + "learning_rate": 9.391581740952516e-06, + "loss": 0.8313, + "step": 19350 + }, + { + "epoch": 3.445156695156695, + "grad_norm": 
1.132588505744934, + "learning_rate": 9.385660333801793e-06, + "loss": 0.7254, + "step": 19351 + }, + { + "epoch": 3.445334757834758, + "grad_norm": 0.958625078201294, + "learning_rate": 9.379740702050809e-06, + "loss": 0.7199, + "step": 19352 + }, + { + "epoch": 3.4455128205128207, + "grad_norm": 1.0391316413879395, + "learning_rate": 9.373822845815593e-06, + "loss": 0.8571, + "step": 19353 + }, + { + "epoch": 3.445690883190883, + "grad_norm": 1.0981941223144531, + "learning_rate": 9.36790676521202e-06, + "loss": 0.8233, + "step": 19354 + }, + { + "epoch": 3.4458689458689458, + "grad_norm": 0.8751938939094543, + "learning_rate": 9.361992460356084e-06, + "loss": 0.708, + "step": 19355 + }, + { + "epoch": 3.4460470085470085, + "grad_norm": 1.240193486213684, + "learning_rate": 9.356079931363582e-06, + "loss": 0.9598, + "step": 19356 + }, + { + "epoch": 3.4462250712250713, + "grad_norm": 0.985924243927002, + "learning_rate": 9.350169178350421e-06, + "loss": 0.7641, + "step": 19357 + }, + { + "epoch": 3.446403133903134, + "grad_norm": 1.125505805015564, + "learning_rate": 9.344260201432375e-06, + "loss": 0.9037, + "step": 19358 + }, + { + "epoch": 3.4465811965811968, + "grad_norm": 1.1013590097427368, + "learning_rate": 9.33835300072522e-06, + "loss": 0.8685, + "step": 19359 + }, + { + "epoch": 3.446759259259259, + "grad_norm": 0.815940797328949, + "learning_rate": 9.332447576344739e-06, + "loss": 0.6551, + "step": 19360 + }, + { + "epoch": 3.446937321937322, + "grad_norm": 1.1269092559814453, + "learning_rate": 9.326543928406573e-06, + "loss": 0.9763, + "step": 19361 + }, + { + "epoch": 3.4471153846153846, + "grad_norm": 0.9986848831176758, + "learning_rate": 9.320642057026429e-06, + "loss": 0.8283, + "step": 19362 + }, + { + "epoch": 3.4472934472934473, + "grad_norm": 0.9624196290969849, + "learning_rate": 9.31474196231994e-06, + "loss": 0.8278, + "step": 19363 + }, + { + "epoch": 3.44747150997151, + "grad_norm": 0.914781391620636, + "learning_rate": 
9.308843644402687e-06, + "loss": 0.7268, + "step": 19364 + }, + { + "epoch": 3.447649572649573, + "grad_norm": 0.9857885241508484, + "learning_rate": 9.302947103390258e-06, + "loss": 0.5724, + "step": 19365 + }, + { + "epoch": 3.447827635327635, + "grad_norm": 1.0420119762420654, + "learning_rate": 9.297052339398182e-06, + "loss": 0.8149, + "step": 19366 + }, + { + "epoch": 3.448005698005698, + "grad_norm": 0.8537631630897522, + "learning_rate": 9.291159352541912e-06, + "loss": 0.5231, + "step": 19367 + }, + { + "epoch": 3.4481837606837606, + "grad_norm": 1.0447754859924316, + "learning_rate": 9.285268142936988e-06, + "loss": 0.7794, + "step": 19368 + }, + { + "epoch": 3.4483618233618234, + "grad_norm": 0.9043522477149963, + "learning_rate": 9.27937871069876e-06, + "loss": 0.7547, + "step": 19369 + }, + { + "epoch": 3.448539886039886, + "grad_norm": 0.9774514436721802, + "learning_rate": 9.273491055942673e-06, + "loss": 0.6994, + "step": 19370 + }, + { + "epoch": 3.448717948717949, + "grad_norm": 1.0255252122879028, + "learning_rate": 9.267605178784033e-06, + "loss": 1.0779, + "step": 19371 + }, + { + "epoch": 3.448896011396011, + "grad_norm": 0.9238240122795105, + "learning_rate": 9.261721079338214e-06, + "loss": 0.7204, + "step": 19372 + }, + { + "epoch": 3.449074074074074, + "grad_norm": 0.9913932085037231, + "learning_rate": 9.25583875772047e-06, + "loss": 0.7491, + "step": 19373 + }, + { + "epoch": 3.4492521367521367, + "grad_norm": 0.9521636366844177, + "learning_rate": 9.249958214046062e-06, + "loss": 0.7428, + "step": 19374 + }, + { + "epoch": 3.4494301994301995, + "grad_norm": 0.8698529601097107, + "learning_rate": 9.244079448430199e-06, + "loss": 0.567, + "step": 19375 + }, + { + "epoch": 3.449608262108262, + "grad_norm": 1.0148518085479736, + "learning_rate": 9.238202460988077e-06, + "loss": 0.6805, + "step": 19376 + }, + { + "epoch": 3.449786324786325, + "grad_norm": 1.0295552015304565, + "learning_rate": 9.232327251834827e-06, + "loss": 0.7211, + 
"step": 19377 + }, + { + "epoch": 3.4499643874643873, + "grad_norm": 0.88274747133255, + "learning_rate": 9.22645382108559e-06, + "loss": 0.7126, + "step": 19378 + }, + { + "epoch": 3.45014245014245, + "grad_norm": 1.062113881111145, + "learning_rate": 9.220582168855397e-06, + "loss": 0.9362, + "step": 19379 + }, + { + "epoch": 3.4503205128205128, + "grad_norm": 0.811355471611023, + "learning_rate": 9.214712295259342e-06, + "loss": 0.6212, + "step": 19380 + }, + { + "epoch": 3.4504985754985755, + "grad_norm": 1.0234562158584595, + "learning_rate": 9.208844200412403e-06, + "loss": 0.8328, + "step": 19381 + }, + { + "epoch": 3.4506766381766383, + "grad_norm": 0.9234959483146667, + "learning_rate": 9.202977884429554e-06, + "loss": 0.7139, + "step": 19382 + }, + { + "epoch": 3.450854700854701, + "grad_norm": 0.8315097093582153, + "learning_rate": 9.197113347425745e-06, + "loss": 0.6782, + "step": 19383 + }, + { + "epoch": 3.451032763532764, + "grad_norm": 1.0691921710968018, + "learning_rate": 9.191250589515866e-06, + "loss": 0.8085, + "step": 19384 + }, + { + "epoch": 3.451210826210826, + "grad_norm": 0.9873775839805603, + "learning_rate": 9.18538961081481e-06, + "loss": 0.9339, + "step": 19385 + }, + { + "epoch": 3.451388888888889, + "grad_norm": 1.058555006980896, + "learning_rate": 9.179530411437365e-06, + "loss": 0.847, + "step": 19386 + }, + { + "epoch": 3.4515669515669516, + "grad_norm": 0.8825174570083618, + "learning_rate": 9.173672991498384e-06, + "loss": 0.7246, + "step": 19387 + }, + { + "epoch": 3.4517450142450143, + "grad_norm": 0.8538999557495117, + "learning_rate": 9.167817351112596e-06, + "loss": 0.7268, + "step": 19388 + }, + { + "epoch": 3.451923076923077, + "grad_norm": 0.9254639744758606, + "learning_rate": 9.161963490394743e-06, + "loss": 0.6828, + "step": 19389 + }, + { + "epoch": 3.4521011396011394, + "grad_norm": 1.062555193901062, + "learning_rate": 9.156111409459512e-06, + "loss": 0.8251, + "step": 19390 + }, + { + "epoch": 3.452279202279202, 
+ "grad_norm": 0.8910902142524719, + "learning_rate": 9.150261108421575e-06, + "loss": 0.8508, + "step": 19391 + }, + { + "epoch": 3.452457264957265, + "grad_norm": 0.9884508848190308, + "learning_rate": 9.144412587395534e-06, + "loss": 0.8475, + "step": 19392 + }, + { + "epoch": 3.4526353276353277, + "grad_norm": 1.0337510108947754, + "learning_rate": 9.138565846496005e-06, + "loss": 0.6761, + "step": 19393 + }, + { + "epoch": 3.4528133903133904, + "grad_norm": 0.9345211982727051, + "learning_rate": 9.13272088583751e-06, + "loss": 0.7458, + "step": 19394 + }, + { + "epoch": 3.452991452991453, + "grad_norm": 0.9515882134437561, + "learning_rate": 9.12687770553462e-06, + "loss": 0.815, + "step": 19395 + }, + { + "epoch": 3.453169515669516, + "grad_norm": 0.9667288661003113, + "learning_rate": 9.121036305701746e-06, + "loss": 0.6266, + "step": 19396 + }, + { + "epoch": 3.453347578347578, + "grad_norm": 0.940667450428009, + "learning_rate": 9.11519668645341e-06, + "loss": 0.5882, + "step": 19397 + }, + { + "epoch": 3.453525641025641, + "grad_norm": 0.9668877720832825, + "learning_rate": 9.109358847904003e-06, + "loss": 0.8559, + "step": 19398 + }, + { + "epoch": 3.4537037037037037, + "grad_norm": 0.904509961605072, + "learning_rate": 9.103522790167874e-06, + "loss": 0.7116, + "step": 19399 + }, + { + "epoch": 3.4538817663817665, + "grad_norm": 0.9343373775482178, + "learning_rate": 9.097688513359425e-06, + "loss": 0.5469, + "step": 19400 + }, + { + "epoch": 3.4540598290598292, + "grad_norm": 1.025084137916565, + "learning_rate": 9.091856017592915e-06, + "loss": 0.749, + "step": 19401 + }, + { + "epoch": 3.4542378917378915, + "grad_norm": 1.0344691276550293, + "learning_rate": 9.086025302982648e-06, + "loss": 0.8593, + "step": 19402 + }, + { + "epoch": 3.4544159544159543, + "grad_norm": 0.9288195371627808, + "learning_rate": 9.080196369642858e-06, + "loss": 0.6745, + "step": 19403 + }, + { + "epoch": 3.454594017094017, + "grad_norm": 0.8177899122238159, + 
"learning_rate": 9.07436921768775e-06, + "loss": 0.5978, + "step": 19404 + }, + { + "epoch": 3.45477207977208, + "grad_norm": 0.8998305201530457, + "learning_rate": 9.068543847231503e-06, + "loss": 0.779, + "step": 19405 + }, + { + "epoch": 3.4549501424501425, + "grad_norm": 0.9722973108291626, + "learning_rate": 9.062720258388246e-06, + "loss": 0.7639, + "step": 19406 + }, + { + "epoch": 3.4551282051282053, + "grad_norm": 1.0662795305252075, + "learning_rate": 9.056898451272077e-06, + "loss": 0.6982, + "step": 19407 + }, + { + "epoch": 3.455306267806268, + "grad_norm": 0.9679057002067566, + "learning_rate": 9.051078425997062e-06, + "loss": 0.7254, + "step": 19408 + }, + { + "epoch": 3.4554843304843303, + "grad_norm": 1.0791397094726562, + "learning_rate": 9.045260182677217e-06, + "loss": 0.8541, + "step": 19409 + }, + { + "epoch": 3.455662393162393, + "grad_norm": 0.9533210396766663, + "learning_rate": 9.039443721426589e-06, + "loss": 0.8441, + "step": 19410 + }, + { + "epoch": 3.455840455840456, + "grad_norm": 0.9457281827926636, + "learning_rate": 9.033629042359081e-06, + "loss": 0.6461, + "step": 19411 + }, + { + "epoch": 3.4560185185185186, + "grad_norm": 0.9577490091323853, + "learning_rate": 9.027816145588664e-06, + "loss": 0.7646, + "step": 19412 + }, + { + "epoch": 3.4561965811965814, + "grad_norm": 0.8724551796913147, + "learning_rate": 9.022005031229196e-06, + "loss": 0.6517, + "step": 19413 + }, + { + "epoch": 3.4563746438746437, + "grad_norm": 0.9715121984481812, + "learning_rate": 9.016195699394559e-06, + "loss": 0.592, + "step": 19414 + }, + { + "epoch": 3.4565527065527064, + "grad_norm": 0.9042280912399292, + "learning_rate": 9.010388150198567e-06, + "loss": 0.8143, + "step": 19415 + }, + { + "epoch": 3.456730769230769, + "grad_norm": 0.9272587299346924, + "learning_rate": 9.004582383755e-06, + "loss": 0.9227, + "step": 19416 + }, + { + "epoch": 3.456908831908832, + "grad_norm": 1.0067778825759888, + "learning_rate": 8.998778400177622e-06, + "loss": 
0.7434, + "step": 19417 + }, + { + "epoch": 3.4570868945868947, + "grad_norm": 0.8889135122299194, + "learning_rate": 8.992976199580117e-06, + "loss": 0.7774, + "step": 19418 + }, + { + "epoch": 3.4572649572649574, + "grad_norm": 0.9987656474113464, + "learning_rate": 8.987175782076217e-06, + "loss": 0.7912, + "step": 19419 + }, + { + "epoch": 3.45744301994302, + "grad_norm": 1.0001306533813477, + "learning_rate": 8.981377147779535e-06, + "loss": 0.8925, + "step": 19420 + }, + { + "epoch": 3.4576210826210825, + "grad_norm": 1.069911003112793, + "learning_rate": 8.975580296803709e-06, + "loss": 0.8013, + "step": 19421 + }, + { + "epoch": 3.4577991452991452, + "grad_norm": 0.9291098713874817, + "learning_rate": 8.969785229262284e-06, + "loss": 0.9167, + "step": 19422 + }, + { + "epoch": 3.457977207977208, + "grad_norm": 1.0406343936920166, + "learning_rate": 8.963991945268825e-06, + "loss": 0.7271, + "step": 19423 + }, + { + "epoch": 3.4581552706552707, + "grad_norm": 1.0367125272750854, + "learning_rate": 8.958200444936815e-06, + "loss": 0.6179, + "step": 19424 + }, + { + "epoch": 3.4583333333333335, + "grad_norm": 1.0270168781280518, + "learning_rate": 8.952410728379779e-06, + "loss": 0.555, + "step": 19425 + }, + { + "epoch": 3.458511396011396, + "grad_norm": 0.9439095854759216, + "learning_rate": 8.94662279571109e-06, + "loss": 0.7084, + "step": 19426 + }, + { + "epoch": 3.4586894586894585, + "grad_norm": 0.934548556804657, + "learning_rate": 8.940836647044216e-06, + "loss": 0.6366, + "step": 19427 + }, + { + "epoch": 3.4588675213675213, + "grad_norm": 1.0294036865234375, + "learning_rate": 8.93505228249244e-06, + "loss": 0.6797, + "step": 19428 + }, + { + "epoch": 3.459045584045584, + "grad_norm": 0.7903768420219421, + "learning_rate": 8.929269702169174e-06, + "loss": 0.7274, + "step": 19429 + }, + { + "epoch": 3.459223646723647, + "grad_norm": 0.9741223454475403, + "learning_rate": 8.923488906187683e-06, + "loss": 0.7743, + "step": 19430 + }, + { + "epoch": 
3.4594017094017095, + "grad_norm": 0.9716152548789978, + "learning_rate": 8.917709894661231e-06, + "loss": 0.5779, + "step": 19431 + }, + { + "epoch": 3.4595797720797723, + "grad_norm": 0.9969064593315125, + "learning_rate": 8.911932667703038e-06, + "loss": 0.7671, + "step": 19432 + }, + { + "epoch": 3.4597578347578346, + "grad_norm": 0.7803093194961548, + "learning_rate": 8.906157225426315e-06, + "loss": 0.4898, + "step": 19433 + }, + { + "epoch": 3.4599358974358974, + "grad_norm": 0.9883405566215515, + "learning_rate": 8.900383567944192e-06, + "loss": 0.7084, + "step": 19434 + }, + { + "epoch": 3.46011396011396, + "grad_norm": 0.8429387211799622, + "learning_rate": 8.894611695369836e-06, + "loss": 0.7378, + "step": 19435 + }, + { + "epoch": 3.460292022792023, + "grad_norm": 1.0472184419631958, + "learning_rate": 8.888841607816278e-06, + "loss": 0.9414, + "step": 19436 + }, + { + "epoch": 3.4604700854700856, + "grad_norm": 0.8618428111076355, + "learning_rate": 8.883073305396627e-06, + "loss": 0.6176, + "step": 19437 + }, + { + "epoch": 3.460648148148148, + "grad_norm": 1.0710008144378662, + "learning_rate": 8.877306788223872e-06, + "loss": 0.9157, + "step": 19438 + }, + { + "epoch": 3.4608262108262107, + "grad_norm": 0.9094607830047607, + "learning_rate": 8.871542056410976e-06, + "loss": 0.7429, + "step": 19439 + }, + { + "epoch": 3.4610042735042734, + "grad_norm": 0.8668854832649231, + "learning_rate": 8.865779110070949e-06, + "loss": 0.6732, + "step": 19440 + }, + { + "epoch": 3.461182336182336, + "grad_norm": 0.9430466890335083, + "learning_rate": 8.860017949316634e-06, + "loss": 0.6454, + "step": 19441 + }, + { + "epoch": 3.461360398860399, + "grad_norm": 0.9155656695365906, + "learning_rate": 8.854258574260977e-06, + "loss": 0.6573, + "step": 19442 + }, + { + "epoch": 3.4615384615384617, + "grad_norm": 1.0212770700454712, + "learning_rate": 8.848500985016739e-06, + "loss": 0.8552, + "step": 19443 + }, + { + "epoch": 3.4617165242165244, + "grad_norm": 
0.9474434852600098, + "learning_rate": 8.8427451816968e-06, + "loss": 0.7882, + "step": 19444 + }, + { + "epoch": 3.4618945868945867, + "grad_norm": 0.9351694583892822, + "learning_rate": 8.836991164413898e-06, + "loss": 0.6336, + "step": 19445 + }, + { + "epoch": 3.4620726495726495, + "grad_norm": 1.0357493162155151, + "learning_rate": 8.831238933280795e-06, + "loss": 0.8264, + "step": 19446 + }, + { + "epoch": 3.4622507122507122, + "grad_norm": 1.0070866346359253, + "learning_rate": 8.82548848841016e-06, + "loss": 0.8509, + "step": 19447 + }, + { + "epoch": 3.462428774928775, + "grad_norm": 0.9792636036872864, + "learning_rate": 8.819739829914685e-06, + "loss": 0.7739, + "step": 19448 + }, + { + "epoch": 3.4626068376068377, + "grad_norm": 1.0221115350723267, + "learning_rate": 8.81399295790699e-06, + "loss": 0.7329, + "step": 19449 + }, + { + "epoch": 3.4627849002849, + "grad_norm": 1.0703456401824951, + "learning_rate": 8.80824787249971e-06, + "loss": 0.8073, + "step": 19450 + }, + { + "epoch": 3.462962962962963, + "grad_norm": 0.9054359793663025, + "learning_rate": 8.802504573805336e-06, + "loss": 0.8686, + "step": 19451 + }, + { + "epoch": 3.4631410256410255, + "grad_norm": 1.019992470741272, + "learning_rate": 8.796763061936486e-06, + "loss": 1.0346, + "step": 19452 + }, + { + "epoch": 3.4633190883190883, + "grad_norm": 0.9106883406639099, + "learning_rate": 8.791023337005555e-06, + "loss": 0.6639, + "step": 19453 + }, + { + "epoch": 3.463497150997151, + "grad_norm": 0.9866195321083069, + "learning_rate": 8.785285399125077e-06, + "loss": 0.8251, + "step": 19454 + }, + { + "epoch": 3.463675213675214, + "grad_norm": 1.026564359664917, + "learning_rate": 8.779549248407449e-06, + "loss": 0.804, + "step": 19455 + }, + { + "epoch": 3.4638532763532766, + "grad_norm": 1.0608559846878052, + "learning_rate": 8.773814884965058e-06, + "loss": 0.8622, + "step": 19456 + }, + { + "epoch": 3.464031339031339, + "grad_norm": 0.8756099939346313, + "learning_rate": 
8.768082308910264e-06, + "loss": 0.7148, + "step": 19457 + }, + { + "epoch": 3.4642094017094016, + "grad_norm": 0.9199277758598328, + "learning_rate": 8.76235152035536e-06, + "loss": 0.8529, + "step": 19458 + }, + { + "epoch": 3.4643874643874644, + "grad_norm": 1.0407079458236694, + "learning_rate": 8.75662251941266e-06, + "loss": 0.7854, + "step": 19459 + }, + { + "epoch": 3.464565527065527, + "grad_norm": 0.8924119472503662, + "learning_rate": 8.750895306194407e-06, + "loss": 0.8325, + "step": 19460 + }, + { + "epoch": 3.46474358974359, + "grad_norm": 0.9075109362602234, + "learning_rate": 8.745169880812808e-06, + "loss": 0.716, + "step": 19461 + }, + { + "epoch": 3.464921652421652, + "grad_norm": 1.1008402109146118, + "learning_rate": 8.739446243380034e-06, + "loss": 0.83, + "step": 19462 + }, + { + "epoch": 3.465099715099715, + "grad_norm": 0.9462727308273315, + "learning_rate": 8.733724394008236e-06, + "loss": 0.8705, + "step": 19463 + }, + { + "epoch": 3.4652777777777777, + "grad_norm": 0.9452762007713318, + "learning_rate": 8.728004332809514e-06, + "loss": 0.6937, + "step": 19464 + }, + { + "epoch": 3.4654558404558404, + "grad_norm": 0.899398922920227, + "learning_rate": 8.722286059895957e-06, + "loss": 0.7192, + "step": 19465 + }, + { + "epoch": 3.465633903133903, + "grad_norm": 0.8906122446060181, + "learning_rate": 8.716569575379563e-06, + "loss": 0.8128, + "step": 19466 + }, + { + "epoch": 3.465811965811966, + "grad_norm": 0.8970988988876343, + "learning_rate": 8.710854879372398e-06, + "loss": 0.7183, + "step": 19467 + }, + { + "epoch": 3.4659900284900287, + "grad_norm": 0.8346308469772339, + "learning_rate": 8.705141971986363e-06, + "loss": 0.8172, + "step": 19468 + }, + { + "epoch": 3.466168091168091, + "grad_norm": 1.137384057044983, + "learning_rate": 8.699430853333446e-06, + "loss": 0.8213, + "step": 19469 + }, + { + "epoch": 3.4663461538461537, + "grad_norm": 1.241615653038025, + "learning_rate": 8.693721523525522e-06, + "loss": 0.9548, + "step": 
19470 + }, + { + "epoch": 3.4665242165242165, + "grad_norm": 0.9010623097419739, + "learning_rate": 8.688013982674436e-06, + "loss": 0.9466, + "step": 19471 + }, + { + "epoch": 3.4667022792022792, + "grad_norm": 1.0348331928253174, + "learning_rate": 8.682308230892044e-06, + "loss": 0.632, + "step": 19472 + }, + { + "epoch": 3.466880341880342, + "grad_norm": 0.9175500273704529, + "learning_rate": 8.676604268290134e-06, + "loss": 0.7857, + "step": 19473 + }, + { + "epoch": 3.4670584045584047, + "grad_norm": 0.9245565533638, + "learning_rate": 8.670902094980426e-06, + "loss": 0.6556, + "step": 19474 + }, + { + "epoch": 3.467236467236467, + "grad_norm": 0.9198375940322876, + "learning_rate": 8.66520171107471e-06, + "loss": 0.9424, + "step": 19475 + }, + { + "epoch": 3.46741452991453, + "grad_norm": 1.0737100839614868, + "learning_rate": 8.659503116684598e-06, + "loss": 0.7627, + "step": 19476 + }, + { + "epoch": 3.4675925925925926, + "grad_norm": 0.9110404253005981, + "learning_rate": 8.653806311921809e-06, + "loss": 0.6945, + "step": 19477 + }, + { + "epoch": 3.4677706552706553, + "grad_norm": 1.0172821283340454, + "learning_rate": 8.648111296897909e-06, + "loss": 0.9387, + "step": 19478 + }, + { + "epoch": 3.467948717948718, + "grad_norm": 0.9516095519065857, + "learning_rate": 8.642418071724512e-06, + "loss": 0.7702, + "step": 19479 + }, + { + "epoch": 3.468126780626781, + "grad_norm": 0.950921893119812, + "learning_rate": 8.636726636513148e-06, + "loss": 0.9081, + "step": 19480 + }, + { + "epoch": 3.468304843304843, + "grad_norm": 0.871261715888977, + "learning_rate": 8.631036991375318e-06, + "loss": 0.7908, + "step": 19481 + }, + { + "epoch": 3.468482905982906, + "grad_norm": 0.9214411377906799, + "learning_rate": 8.625349136422557e-06, + "loss": 0.7855, + "step": 19482 + }, + { + "epoch": 3.4686609686609686, + "grad_norm": 1.0029644966125488, + "learning_rate": 8.619663071766227e-06, + "loss": 0.7906, + "step": 19483 + }, + { + "epoch": 3.4688390313390314, + 
"grad_norm": 1.0286318063735962, + "learning_rate": 8.613978797517797e-06, + "loss": 0.8748, + "step": 19484 + }, + { + "epoch": 3.469017094017094, + "grad_norm": 0.9796161651611328, + "learning_rate": 8.6082963137886e-06, + "loss": 0.7109, + "step": 19485 + }, + { + "epoch": 3.469195156695157, + "grad_norm": 0.9568044543266296, + "learning_rate": 8.602615620690001e-06, + "loss": 0.8721, + "step": 19486 + }, + { + "epoch": 3.469373219373219, + "grad_norm": 1.0369888544082642, + "learning_rate": 8.596936718333281e-06, + "loss": 0.7648, + "step": 19487 + }, + { + "epoch": 3.469551282051282, + "grad_norm": 0.9487068057060242, + "learning_rate": 8.591259606829716e-06, + "loss": 0.677, + "step": 19488 + }, + { + "epoch": 3.4697293447293447, + "grad_norm": 1.0003830194473267, + "learning_rate": 8.585584286290527e-06, + "loss": 0.7131, + "step": 19489 + }, + { + "epoch": 3.4699074074074074, + "grad_norm": 0.9841815829277039, + "learning_rate": 8.579910756826925e-06, + "loss": 0.9088, + "step": 19490 + }, + { + "epoch": 3.47008547008547, + "grad_norm": 0.7646492123603821, + "learning_rate": 8.574239018550035e-06, + "loss": 0.4016, + "step": 19491 + }, + { + "epoch": 3.470263532763533, + "grad_norm": 0.9418407678604126, + "learning_rate": 8.568569071571053e-06, + "loss": 0.6413, + "step": 19492 + }, + { + "epoch": 3.4704415954415953, + "grad_norm": 1.078834891319275, + "learning_rate": 8.562900916000993e-06, + "loss": 0.8014, + "step": 19493 + }, + { + "epoch": 3.470619658119658, + "grad_norm": 0.9450939297676086, + "learning_rate": 8.557234551950966e-06, + "loss": 0.735, + "step": 19494 + }, + { + "epoch": 3.4707977207977208, + "grad_norm": 0.9362136125564575, + "learning_rate": 8.55156997953197e-06, + "loss": 0.8151, + "step": 19495 + }, + { + "epoch": 3.4709757834757835, + "grad_norm": 0.8481518626213074, + "learning_rate": 8.545907198854986e-06, + "loss": 0.5628, + "step": 19496 + }, + { + "epoch": 3.4711538461538463, + "grad_norm": 0.968862771987915, + "learning_rate": 
8.540246210030978e-06, + "loss": 0.8156, + "step": 19497 + }, + { + "epoch": 3.471331908831909, + "grad_norm": 0.8841079473495483, + "learning_rate": 8.534587013170836e-06, + "loss": 0.8872, + "step": 19498 + }, + { + "epoch": 3.4715099715099713, + "grad_norm": 0.8855252265930176, + "learning_rate": 8.528929608385494e-06, + "loss": 0.8796, + "step": 19499 + }, + { + "epoch": 3.471688034188034, + "grad_norm": 0.925837516784668, + "learning_rate": 8.523273995785719e-06, + "loss": 0.8763, + "step": 19500 + }, + { + "epoch": 3.471866096866097, + "grad_norm": 0.8930026292800903, + "learning_rate": 8.517620175482388e-06, + "loss": 0.74, + "step": 19501 + }, + { + "epoch": 3.4720441595441596, + "grad_norm": 0.9781539440155029, + "learning_rate": 8.511968147586246e-06, + "loss": 0.5703, + "step": 19502 + }, + { + "epoch": 3.4722222222222223, + "grad_norm": 0.901134192943573, + "learning_rate": 8.50631791220805e-06, + "loss": 0.658, + "step": 19503 + }, + { + "epoch": 3.472400284900285, + "grad_norm": 1.052512526512146, + "learning_rate": 8.500669469458478e-06, + "loss": 0.6627, + "step": 19504 + }, + { + "epoch": 3.472578347578348, + "grad_norm": 1.0030267238616943, + "learning_rate": 8.495022819448228e-06, + "loss": 0.7527, + "step": 19505 + }, + { + "epoch": 3.47275641025641, + "grad_norm": 0.936089277267456, + "learning_rate": 8.489377962287893e-06, + "loss": 0.787, + "step": 19506 + }, + { + "epoch": 3.472934472934473, + "grad_norm": 1.0984883308410645, + "learning_rate": 8.483734898088135e-06, + "loss": 0.8435, + "step": 19507 + }, + { + "epoch": 3.4731125356125356, + "grad_norm": 0.9354465007781982, + "learning_rate": 8.47809362695946e-06, + "loss": 0.7568, + "step": 19508 + }, + { + "epoch": 3.4732905982905984, + "grad_norm": 0.9807959198951721, + "learning_rate": 8.472454149012431e-06, + "loss": 0.8719, + "step": 19509 + }, + { + "epoch": 3.473468660968661, + "grad_norm": 0.955855131149292, + "learning_rate": 8.46681646435753e-06, + "loss": 0.6109, + "step": 19510 
+ }, + { + "epoch": 3.4736467236467234, + "grad_norm": 0.9258124828338623, + "learning_rate": 8.46118057310521e-06, + "loss": 0.7669, + "step": 19511 + }, + { + "epoch": 3.473824786324786, + "grad_norm": 1.161522388458252, + "learning_rate": 8.455546475365905e-06, + "loss": 0.9021, + "step": 19512 + }, + { + "epoch": 3.474002849002849, + "grad_norm": 0.9282271862030029, + "learning_rate": 8.449914171250006e-06, + "loss": 0.8032, + "step": 19513 + }, + { + "epoch": 3.4741809116809117, + "grad_norm": 0.9432225823402405, + "learning_rate": 8.444283660867858e-06, + "loss": 0.7169, + "step": 19514 + }, + { + "epoch": 3.4743589743589745, + "grad_norm": 0.9784168601036072, + "learning_rate": 8.438654944329782e-06, + "loss": 0.6596, + "step": 19515 + }, + { + "epoch": 3.474537037037037, + "grad_norm": 1.008577823638916, + "learning_rate": 8.433028021746036e-06, + "loss": 0.84, + "step": 19516 + }, + { + "epoch": 3.4747150997151, + "grad_norm": 0.8572508096694946, + "learning_rate": 8.42740289322691e-06, + "loss": 0.846, + "step": 19517 + }, + { + "epoch": 3.4748931623931623, + "grad_norm": 1.112871766090393, + "learning_rate": 8.421779558882603e-06, + "loss": 0.7197, + "step": 19518 + }, + { + "epoch": 3.475071225071225, + "grad_norm": 0.8831349015235901, + "learning_rate": 8.416158018823294e-06, + "loss": 0.7389, + "step": 19519 + }, + { + "epoch": 3.4752492877492878, + "grad_norm": 0.899161696434021, + "learning_rate": 8.410538273159107e-06, + "loss": 0.8457, + "step": 19520 + }, + { + "epoch": 3.4754273504273505, + "grad_norm": 0.8747708797454834, + "learning_rate": 8.404920322000154e-06, + "loss": 0.7574, + "step": 19521 + }, + { + "epoch": 3.4756054131054133, + "grad_norm": 0.892696738243103, + "learning_rate": 8.399304165456545e-06, + "loss": 0.8276, + "step": 19522 + }, + { + "epoch": 3.4757834757834756, + "grad_norm": 0.9339589476585388, + "learning_rate": 8.393689803638249e-06, + "loss": 0.7088, + "step": 19523 + }, + { + "epoch": 3.4759615384615383, + 
"grad_norm": 0.8795723915100098, + "learning_rate": 8.388077236655356e-06, + "loss": 0.6545, + "step": 19524 + }, + { + "epoch": 3.476139601139601, + "grad_norm": 0.9463884830474854, + "learning_rate": 8.382466464617733e-06, + "loss": 0.6912, + "step": 19525 + }, + { + "epoch": 3.476317663817664, + "grad_norm": 1.0345131158828735, + "learning_rate": 8.37685748763538e-06, + "loss": 0.8502, + "step": 19526 + }, + { + "epoch": 3.4764957264957266, + "grad_norm": 0.9798212647438049, + "learning_rate": 8.37125030581818e-06, + "loss": 0.6323, + "step": 19527 + }, + { + "epoch": 3.4766737891737893, + "grad_norm": 0.9508503079414368, + "learning_rate": 8.365644919275983e-06, + "loss": 0.732, + "step": 19528 + }, + { + "epoch": 3.476851851851852, + "grad_norm": 0.8767679333686829, + "learning_rate": 8.360041328118617e-06, + "loss": 0.454, + "step": 19529 + }, + { + "epoch": 3.4770299145299144, + "grad_norm": 1.0020666122436523, + "learning_rate": 8.354439532455882e-06, + "loss": 0.7342, + "step": 19530 + }, + { + "epoch": 3.477207977207977, + "grad_norm": 0.9661487340927124, + "learning_rate": 8.348839532397501e-06, + "loss": 0.9732, + "step": 19531 + }, + { + "epoch": 3.47738603988604, + "grad_norm": 1.0699942111968994, + "learning_rate": 8.343241328053264e-06, + "loss": 0.9216, + "step": 19532 + }, + { + "epoch": 3.4775641025641026, + "grad_norm": 0.8531695008277893, + "learning_rate": 8.337644919532772e-06, + "loss": 0.6829, + "step": 19533 + }, + { + "epoch": 3.4777421652421654, + "grad_norm": 1.080919861793518, + "learning_rate": 8.33205030694576e-06, + "loss": 0.9975, + "step": 19534 + }, + { + "epoch": 3.4779202279202277, + "grad_norm": 0.8957521915435791, + "learning_rate": 8.32645749040175e-06, + "loss": 0.5677, + "step": 19535 + }, + { + "epoch": 3.4780982905982905, + "grad_norm": 0.9968222975730896, + "learning_rate": 8.320866470010402e-06, + "loss": 0.7552, + "step": 19536 + }, + { + "epoch": 3.478276353276353, + "grad_norm": 0.8942126631736755, + 
"learning_rate": 8.315277245881215e-06, + "loss": 0.7861, + "step": 19537 + }, + { + "epoch": 3.478454415954416, + "grad_norm": 0.930486261844635, + "learning_rate": 8.3096898181237e-06, + "loss": 0.7911, + "step": 19538 + }, + { + "epoch": 3.4786324786324787, + "grad_norm": 1.013940691947937, + "learning_rate": 8.304104186847384e-06, + "loss": 0.7615, + "step": 19539 + }, + { + "epoch": 3.4788105413105415, + "grad_norm": 0.8439841866493225, + "learning_rate": 8.29852035216162e-06, + "loss": 0.7309, + "step": 19540 + }, + { + "epoch": 3.478988603988604, + "grad_norm": 0.9972648620605469, + "learning_rate": 8.29293831417588e-06, + "loss": 0.8021, + "step": 19541 + }, + { + "epoch": 3.4791666666666665, + "grad_norm": 0.9759960174560547, + "learning_rate": 8.287358072999507e-06, + "loss": 0.6253, + "step": 19542 + }, + { + "epoch": 3.4793447293447293, + "grad_norm": 1.2256296873092651, + "learning_rate": 8.281779628741837e-06, + "loss": 0.8939, + "step": 19543 + }, + { + "epoch": 3.479522792022792, + "grad_norm": 0.9147930145263672, + "learning_rate": 8.276202981512171e-06, + "loss": 0.7656, + "step": 19544 + }, + { + "epoch": 3.4797008547008548, + "grad_norm": 0.7807040214538574, + "learning_rate": 8.270628131419767e-06, + "loss": 0.5756, + "step": 19545 + }, + { + "epoch": 3.4798789173789175, + "grad_norm": 0.8961909413337708, + "learning_rate": 8.265055078573824e-06, + "loss": 0.7962, + "step": 19546 + }, + { + "epoch": 3.48005698005698, + "grad_norm": 1.0497139692306519, + "learning_rate": 8.259483823083614e-06, + "loss": 0.8766, + "step": 19547 + }, + { + "epoch": 3.4802350427350426, + "grad_norm": 0.9595639109611511, + "learning_rate": 8.253914365058202e-06, + "loss": 1.0157, + "step": 19548 + }, + { + "epoch": 3.4804131054131053, + "grad_norm": 0.8763233423233032, + "learning_rate": 8.248346704606779e-06, + "loss": 0.6731, + "step": 19549 + }, + { + "epoch": 3.480591168091168, + "grad_norm": 0.9633421897888184, + "learning_rate": 8.242780841838383e-06, + 
"loss": 0.9382, + "step": 19550 + }, + { + "epoch": 3.480769230769231, + "grad_norm": 0.982096791267395, + "learning_rate": 8.2372167768621e-06, + "loss": 0.7388, + "step": 19551 + }, + { + "epoch": 3.4809472934472936, + "grad_norm": 1.014060139656067, + "learning_rate": 8.231654509786935e-06, + "loss": 0.6534, + "step": 19552 + }, + { + "epoch": 3.4811253561253563, + "grad_norm": 0.8884333968162537, + "learning_rate": 8.226094040721865e-06, + "loss": 0.6659, + "step": 19553 + }, + { + "epoch": 3.4813034188034186, + "grad_norm": 0.9313388466835022, + "learning_rate": 8.22053536977584e-06, + "loss": 0.8194, + "step": 19554 + }, + { + "epoch": 3.4814814814814814, + "grad_norm": 0.9285356998443604, + "learning_rate": 8.214978497057768e-06, + "loss": 0.7316, + "step": 19555 + }, + { + "epoch": 3.481659544159544, + "grad_norm": 0.9004967212677002, + "learning_rate": 8.20942342267651e-06, + "loss": 0.7533, + "step": 19556 + }, + { + "epoch": 3.481837606837607, + "grad_norm": 0.8874818086624146, + "learning_rate": 8.203870146740932e-06, + "loss": 0.7632, + "step": 19557 + }, + { + "epoch": 3.4820156695156697, + "grad_norm": 0.9493485689163208, + "learning_rate": 8.19831866935984e-06, + "loss": 0.6767, + "step": 19558 + }, + { + "epoch": 3.482193732193732, + "grad_norm": 0.9512984156608582, + "learning_rate": 8.192768990641986e-06, + "loss": 0.8117, + "step": 19559 + }, + { + "epoch": 3.4823717948717947, + "grad_norm": 0.9378128051757812, + "learning_rate": 8.187221110696108e-06, + "loss": 0.7132, + "step": 19560 + }, + { + "epoch": 3.4825498575498575, + "grad_norm": 1.0644614696502686, + "learning_rate": 8.18167502963092e-06, + "loss": 0.7283, + "step": 19561 + }, + { + "epoch": 3.48272792022792, + "grad_norm": 0.9525963664054871, + "learning_rate": 8.176130747555055e-06, + "loss": 0.762, + "step": 19562 + }, + { + "epoch": 3.482905982905983, + "grad_norm": 0.8559122085571289, + "learning_rate": 8.170588264577161e-06, + "loss": 0.5776, + "step": 19563 + }, + { + "epoch": 
3.4830840455840457, + "grad_norm": 0.870557427406311, + "learning_rate": 8.165047580805851e-06, + "loss": 0.7302, + "step": 19564 + }, + { + "epoch": 3.4832621082621085, + "grad_norm": 0.9522432684898376, + "learning_rate": 8.159508696349639e-06, + "loss": 0.6782, + "step": 19565 + }, + { + "epoch": 3.4834401709401708, + "grad_norm": 1.1580432653427124, + "learning_rate": 8.153971611317079e-06, + "loss": 0.7527, + "step": 19566 + }, + { + "epoch": 3.4836182336182335, + "grad_norm": 0.9478859901428223, + "learning_rate": 8.148436325816666e-06, + "loss": 0.7282, + "step": 19567 + }, + { + "epoch": 3.4837962962962963, + "grad_norm": 0.9263066649436951, + "learning_rate": 8.142902839956822e-06, + "loss": 0.808, + "step": 19568 + }, + { + "epoch": 3.483974358974359, + "grad_norm": 0.939940869808197, + "learning_rate": 8.137371153845996e-06, + "loss": 0.67, + "step": 19569 + }, + { + "epoch": 3.484152421652422, + "grad_norm": 1.0895870923995972, + "learning_rate": 8.131841267592544e-06, + "loss": 0.8526, + "step": 19570 + }, + { + "epoch": 3.484330484330484, + "grad_norm": 0.9202786087989807, + "learning_rate": 8.126313181304823e-06, + "loss": 0.668, + "step": 19571 + }, + { + "epoch": 3.484508547008547, + "grad_norm": 1.1485145092010498, + "learning_rate": 8.120786895091147e-06, + "loss": 0.8667, + "step": 19572 + }, + { + "epoch": 3.4846866096866096, + "grad_norm": 1.0418143272399902, + "learning_rate": 8.115262409059775e-06, + "loss": 0.7774, + "step": 19573 + }, + { + "epoch": 3.4848646723646723, + "grad_norm": 1.0974538326263428, + "learning_rate": 8.109739723318987e-06, + "loss": 0.9011, + "step": 19574 + }, + { + "epoch": 3.485042735042735, + "grad_norm": 0.9369264841079712, + "learning_rate": 8.10421883797694e-06, + "loss": 1.0659, + "step": 19575 + }, + { + "epoch": 3.485220797720798, + "grad_norm": 1.0749176740646362, + "learning_rate": 8.098699753141837e-06, + "loss": 0.8179, + "step": 19576 + }, + { + "epoch": 3.4853988603988606, + "grad_norm": 
0.9837545156478882, + "learning_rate": 8.093182468921812e-06, + "loss": 0.8466, + "step": 19577 + }, + { + "epoch": 3.485576923076923, + "grad_norm": 0.9381014108657837, + "learning_rate": 8.087666985424935e-06, + "loss": 0.8468, + "step": 19578 + }, + { + "epoch": 3.4857549857549857, + "grad_norm": 0.8582633137702942, + "learning_rate": 8.082153302759322e-06, + "loss": 0.7085, + "step": 19579 + }, + { + "epoch": 3.4859330484330484, + "grad_norm": 0.9065280556678772, + "learning_rate": 8.07664142103295e-06, + "loss": 0.7177, + "step": 19580 + }, + { + "epoch": 3.486111111111111, + "grad_norm": 0.9608262181282043, + "learning_rate": 8.071131340353833e-06, + "loss": 0.6808, + "step": 19581 + }, + { + "epoch": 3.486289173789174, + "grad_norm": 0.8834822773933411, + "learning_rate": 8.065623060829951e-06, + "loss": 0.6307, + "step": 19582 + }, + { + "epoch": 3.486467236467236, + "grad_norm": 0.9237925410270691, + "learning_rate": 8.0601165825692e-06, + "loss": 1.0483, + "step": 19583 + }, + { + "epoch": 3.486645299145299, + "grad_norm": 0.9548817873001099, + "learning_rate": 8.054611905679477e-06, + "loss": 0.8959, + "step": 19584 + }, + { + "epoch": 3.4868233618233617, + "grad_norm": 1.0435808897018433, + "learning_rate": 8.049109030268631e-06, + "loss": 0.6882, + "step": 19585 + }, + { + "epoch": 3.4870014245014245, + "grad_norm": 0.9147984981536865, + "learning_rate": 8.043607956444477e-06, + "loss": 0.6671, + "step": 19586 + }, + { + "epoch": 3.4871794871794872, + "grad_norm": 0.9345784783363342, + "learning_rate": 8.038108684314815e-06, + "loss": 0.7734, + "step": 19587 + }, + { + "epoch": 3.48735754985755, + "grad_norm": 1.0092869997024536, + "learning_rate": 8.032611213987351e-06, + "loss": 0.8451, + "step": 19588 + }, + { + "epoch": 3.4875356125356127, + "grad_norm": 0.889281153678894, + "learning_rate": 8.027115545569863e-06, + "loss": 0.5752, + "step": 19589 + }, + { + "epoch": 3.487713675213675, + "grad_norm": 0.9964619874954224, + "learning_rate": 
8.021621679169955e-06, + "loss": 0.7128, + "step": 19590 + }, + { + "epoch": 3.487891737891738, + "grad_norm": 0.9543761610984802, + "learning_rate": 8.016129614895329e-06, + "loss": 0.7243, + "step": 19591 + }, + { + "epoch": 3.4880698005698005, + "grad_norm": 0.9491791725158691, + "learning_rate": 8.010639352853544e-06, + "loss": 0.8746, + "step": 19592 + }, + { + "epoch": 3.4882478632478633, + "grad_norm": 0.9773432016372681, + "learning_rate": 8.005150893152203e-06, + "loss": 0.7438, + "step": 19593 + }, + { + "epoch": 3.488425925925926, + "grad_norm": 1.0735520124435425, + "learning_rate": 7.999664235898819e-06, + "loss": 1.0609, + "step": 19594 + }, + { + "epoch": 3.488603988603989, + "grad_norm": 0.981893002986908, + "learning_rate": 7.994179381200906e-06, + "loss": 0.9202, + "step": 19595 + }, + { + "epoch": 3.488782051282051, + "grad_norm": 1.0082266330718994, + "learning_rate": 7.988696329165924e-06, + "loss": 0.7032, + "step": 19596 + }, + { + "epoch": 3.488960113960114, + "grad_norm": 1.0541062355041504, + "learning_rate": 7.983215079901285e-06, + "loss": 0.7497, + "step": 19597 + }, + { + "epoch": 3.4891381766381766, + "grad_norm": 0.943924605846405, + "learning_rate": 7.977735633514405e-06, + "loss": 0.6938, + "step": 19598 + }, + { + "epoch": 3.4893162393162394, + "grad_norm": 0.8759856820106506, + "learning_rate": 7.972257990112642e-06, + "loss": 0.7018, + "step": 19599 + }, + { + "epoch": 3.489494301994302, + "grad_norm": 1.0315039157867432, + "learning_rate": 7.966782149803308e-06, + "loss": 0.8277, + "step": 19600 + }, + { + "epoch": 3.489672364672365, + "grad_norm": 1.0187058448791504, + "learning_rate": 7.9613081126937e-06, + "loss": 0.7854, + "step": 19601 + }, + { + "epoch": 3.489850427350427, + "grad_norm": 0.8936801552772522, + "learning_rate": 7.955835878891071e-06, + "loss": 0.7572, + "step": 19602 + }, + { + "epoch": 3.49002849002849, + "grad_norm": 1.004878044128418, + "learning_rate": 7.950365448502606e-06, + "loss": 0.8591, + "step": 
19603 + }, + { + "epoch": 3.4902065527065527, + "grad_norm": 1.0317310094833374, + "learning_rate": 7.94489682163555e-06, + "loss": 0.7698, + "step": 19604 + }, + { + "epoch": 3.4903846153846154, + "grad_norm": 0.9507501721382141, + "learning_rate": 7.939429998396986e-06, + "loss": 0.7512, + "step": 19605 + }, + { + "epoch": 3.490562678062678, + "grad_norm": 0.9824426770210266, + "learning_rate": 7.933964978894082e-06, + "loss": 0.7041, + "step": 19606 + }, + { + "epoch": 3.490740740740741, + "grad_norm": 0.9749001264572144, + "learning_rate": 7.928501763233841e-06, + "loss": 0.7729, + "step": 19607 + }, + { + "epoch": 3.4909188034188032, + "grad_norm": 0.9753629565238953, + "learning_rate": 7.92304035152337e-06, + "loss": 0.7479, + "step": 19608 + }, + { + "epoch": 3.491096866096866, + "grad_norm": 0.9824631214141846, + "learning_rate": 7.917580743869646e-06, + "loss": 0.8447, + "step": 19609 + }, + { + "epoch": 3.4912749287749287, + "grad_norm": 1.1632194519042969, + "learning_rate": 7.912122940379651e-06, + "loss": 0.8926, + "step": 19610 + }, + { + "epoch": 3.4914529914529915, + "grad_norm": 0.8203635811805725, + "learning_rate": 7.9066669411603e-06, + "loss": 0.5599, + "step": 19611 + }, + { + "epoch": 3.4916310541310542, + "grad_norm": 0.922036349773407, + "learning_rate": 7.90121274631852e-06, + "loss": 0.7774, + "step": 19612 + }, + { + "epoch": 3.491809116809117, + "grad_norm": 1.112792730331421, + "learning_rate": 7.895760355961124e-06, + "loss": 0.9519, + "step": 19613 + }, + { + "epoch": 3.4919871794871793, + "grad_norm": 0.9443914890289307, + "learning_rate": 7.890309770195015e-06, + "loss": 0.8376, + "step": 19614 + }, + { + "epoch": 3.492165242165242, + "grad_norm": 0.9320971965789795, + "learning_rate": 7.884860989126907e-06, + "loss": 0.8073, + "step": 19615 + }, + { + "epoch": 3.492343304843305, + "grad_norm": 0.9588237404823303, + "learning_rate": 7.879414012863618e-06, + "loss": 0.7115, + "step": 19616 + }, + { + "epoch": 3.4925213675213675, + 
"grad_norm": 0.9260135293006897, + "learning_rate": 7.873968841511848e-06, + "loss": 0.6881, + "step": 19617 + }, + { + "epoch": 3.4926994301994303, + "grad_norm": 0.9520851969718933, + "learning_rate": 7.868525475178256e-06, + "loss": 0.8112, + "step": 19618 + }, + { + "epoch": 3.492877492877493, + "grad_norm": 1.027476191520691, + "learning_rate": 7.86308391396956e-06, + "loss": 0.8152, + "step": 19619 + }, + { + "epoch": 3.4930555555555554, + "grad_norm": 0.879357099533081, + "learning_rate": 7.857644157992305e-06, + "loss": 0.5809, + "step": 19620 + }, + { + "epoch": 3.493233618233618, + "grad_norm": 0.9070191979408264, + "learning_rate": 7.852206207353141e-06, + "loss": 0.6424, + "step": 19621 + }, + { + "epoch": 3.493411680911681, + "grad_norm": 1.0702928304672241, + "learning_rate": 7.846770062158537e-06, + "loss": 0.8733, + "step": 19622 + }, + { + "epoch": 3.4935897435897436, + "grad_norm": 0.8494130373001099, + "learning_rate": 7.841335722515053e-06, + "loss": 0.53, + "step": 19623 + }, + { + "epoch": 3.4937678062678064, + "grad_norm": 0.9504514336585999, + "learning_rate": 7.835903188529158e-06, + "loss": 0.7833, + "step": 19624 + }, + { + "epoch": 3.493945868945869, + "grad_norm": 0.9808318614959717, + "learning_rate": 7.830472460307293e-06, + "loss": 0.8398, + "step": 19625 + }, + { + "epoch": 3.494123931623932, + "grad_norm": 0.8339491486549377, + "learning_rate": 7.825043537955846e-06, + "loss": 0.7661, + "step": 19626 + }, + { + "epoch": 3.494301994301994, + "grad_norm": 1.1706161499023438, + "learning_rate": 7.819616421581199e-06, + "loss": 0.9843, + "step": 19627 + }, + { + "epoch": 3.494480056980057, + "grad_norm": 1.0100945234298706, + "learning_rate": 7.814191111289659e-06, + "loss": 0.8097, + "step": 19628 + }, + { + "epoch": 3.4946581196581197, + "grad_norm": 0.9706094861030579, + "learning_rate": 7.808767607187584e-06, + "loss": 0.9364, + "step": 19629 + }, + { + "epoch": 3.4948361823361824, + "grad_norm": 0.928600013256073, + 
"learning_rate": 7.803345909381154e-06, + "loss": 0.7667, + "step": 19630 + }, + { + "epoch": 3.495014245014245, + "grad_norm": 0.9604787826538086, + "learning_rate": 7.797926017976675e-06, + "loss": 0.8774, + "step": 19631 + }, + { + "epoch": 3.4951923076923075, + "grad_norm": 1.0755119323730469, + "learning_rate": 7.792507933080273e-06, + "loss": 0.9109, + "step": 19632 + }, + { + "epoch": 3.4953703703703702, + "grad_norm": 0.9468057155609131, + "learning_rate": 7.787091654798151e-06, + "loss": 0.812, + "step": 19633 + }, + { + "epoch": 3.495548433048433, + "grad_norm": 1.0440374612808228, + "learning_rate": 7.781677183236414e-06, + "loss": 0.6985, + "step": 19634 + }, + { + "epoch": 3.4957264957264957, + "grad_norm": 0.8866205215454102, + "learning_rate": 7.77626451850113e-06, + "loss": 0.523, + "step": 19635 + }, + { + "epoch": 3.4959045584045585, + "grad_norm": 0.8973997235298157, + "learning_rate": 7.770853660698384e-06, + "loss": 0.8511, + "step": 19636 + }, + { + "epoch": 3.4960826210826212, + "grad_norm": 0.9881580471992493, + "learning_rate": 7.765444609934147e-06, + "loss": 1.0181, + "step": 19637 + }, + { + "epoch": 3.496260683760684, + "grad_norm": 1.003432273864746, + "learning_rate": 7.760037366314433e-06, + "loss": 0.649, + "step": 19638 + }, + { + "epoch": 3.4964387464387463, + "grad_norm": 0.8911902904510498, + "learning_rate": 7.75463192994519e-06, + "loss": 0.925, + "step": 19639 + }, + { + "epoch": 3.496616809116809, + "grad_norm": 1.1478197574615479, + "learning_rate": 7.749228300932299e-06, + "loss": 0.8193, + "step": 19640 + }, + { + "epoch": 3.496794871794872, + "grad_norm": 0.9285799264907837, + "learning_rate": 7.743826479381644e-06, + "loss": 0.7878, + "step": 19641 + }, + { + "epoch": 3.4969729344729346, + "grad_norm": 1.0374022722244263, + "learning_rate": 7.738426465399063e-06, + "loss": 0.6693, + "step": 19642 + }, + { + "epoch": 3.4971509971509973, + "grad_norm": 0.9434183835983276, + "learning_rate": 7.733028259090369e-06, + 
"loss": 0.7659, + "step": 19643 + }, + { + "epoch": 3.4973290598290596, + "grad_norm": 1.0029457807540894, + "learning_rate": 7.727631860561314e-06, + "loss": 0.6427, + "step": 19644 + }, + { + "epoch": 3.4975071225071224, + "grad_norm": 0.883100688457489, + "learning_rate": 7.72223726991761e-06, + "loss": 0.8817, + "step": 19645 + }, + { + "epoch": 3.497685185185185, + "grad_norm": 0.9539889693260193, + "learning_rate": 7.716844487265018e-06, + "loss": 0.8675, + "step": 19646 + }, + { + "epoch": 3.497863247863248, + "grad_norm": 0.8217553496360779, + "learning_rate": 7.711453512709121e-06, + "loss": 0.5515, + "step": 19647 + }, + { + "epoch": 3.4980413105413106, + "grad_norm": 0.8870468735694885, + "learning_rate": 7.706064346355591e-06, + "loss": 0.6369, + "step": 19648 + }, + { + "epoch": 3.4982193732193734, + "grad_norm": 1.00053870677948, + "learning_rate": 7.700676988310008e-06, + "loss": 0.7248, + "step": 19649 + }, + { + "epoch": 3.498397435897436, + "grad_norm": 0.9883842468261719, + "learning_rate": 7.695291438677932e-06, + "loss": 0.7862, + "step": 19650 + }, + { + "epoch": 3.4985754985754984, + "grad_norm": 0.933318018913269, + "learning_rate": 7.68990769756487e-06, + "loss": 0.4922, + "step": 19651 + }, + { + "epoch": 3.498753561253561, + "grad_norm": 0.9415388107299805, + "learning_rate": 7.6845257650763e-06, + "loss": 0.82, + "step": 19652 + }, + { + "epoch": 3.498931623931624, + "grad_norm": 0.8702071309089661, + "learning_rate": 7.679145641317676e-06, + "loss": 0.4565, + "step": 19653 + }, + { + "epoch": 3.4991096866096867, + "grad_norm": 0.9140437245368958, + "learning_rate": 7.673767326394431e-06, + "loss": 0.6803, + "step": 19654 + }, + { + "epoch": 3.4992877492877494, + "grad_norm": 0.9249038696289062, + "learning_rate": 7.668390820411908e-06, + "loss": 0.7911, + "step": 19655 + }, + { + "epoch": 3.4994658119658117, + "grad_norm": 0.9517161846160889, + "learning_rate": 7.663016123475464e-06, + "loss": 0.9015, + "step": 19656 + }, + { + "epoch": 
3.4994658119658117, + "eval_loss": 1.1762211322784424, + "eval_runtime": 24.2999, + "eval_samples_per_second": 42.84, + "eval_steps_per_second": 21.44, + "step": 19656 + }, + { + "epoch": 3.4996438746438745, + "grad_norm": 0.8536194562911987, + "learning_rate": 7.657643235690414e-06, + "loss": 0.7846, + "step": 19657 + }, + { + "epoch": 3.4998219373219372, + "grad_norm": 0.9985692501068115, + "learning_rate": 7.652272157162021e-06, + "loss": 0.7595, + "step": 19658 + }, + { + "epoch": 3.5, + "grad_norm": 1.068899154663086, + "learning_rate": 7.646902887995522e-06, + "loss": 0.6989, + "step": 19659 + }, + { + "epoch": 3.5001780626780628, + "grad_norm": 1.0672051906585693, + "learning_rate": 7.641535428296098e-06, + "loss": 0.774, + "step": 19660 + }, + { + "epoch": 3.5003561253561255, + "grad_norm": 0.978769063949585, + "learning_rate": 7.636169778168955e-06, + "loss": 0.8278, + "step": 19661 + }, + { + "epoch": 3.5005341880341883, + "grad_norm": 0.9284443855285645, + "learning_rate": 7.630805937719166e-06, + "loss": 0.7665, + "step": 19662 + }, + { + "epoch": 3.5007122507122506, + "grad_norm": 1.063376784324646, + "learning_rate": 7.6254439070518765e-06, + "loss": 0.6805, + "step": 19663 + }, + { + "epoch": 3.5008903133903133, + "grad_norm": 0.893804669380188, + "learning_rate": 7.620083686272117e-06, + "loss": 0.6082, + "step": 19664 + }, + { + "epoch": 3.501068376068376, + "grad_norm": 0.9822537899017334, + "learning_rate": 7.614725275484913e-06, + "loss": 0.9002, + "step": 19665 + }, + { + "epoch": 3.501246438746439, + "grad_norm": 0.9976572394371033, + "learning_rate": 7.609368674795259e-06, + "loss": 0.9215, + "step": 19666 + }, + { + "epoch": 3.5014245014245016, + "grad_norm": 0.925886332988739, + "learning_rate": 7.6040138843080925e-06, + "loss": 0.5707, + "step": 19667 + }, + { + "epoch": 3.501602564102564, + "grad_norm": 1.1334714889526367, + "learning_rate": 7.598660904128341e-06, + "loss": 0.7859, + "step": 19668 + }, + { + "epoch": 3.5017806267806266, + 
"grad_norm": 0.8253616690635681, + "learning_rate": 7.5933097343608874e-06, + "loss": 0.7421, + "step": 19669 + }, + { + "epoch": 3.5019586894586894, + "grad_norm": 0.8803296089172363, + "learning_rate": 7.587960375110548e-06, + "loss": 0.6529, + "step": 19670 + }, + { + "epoch": 3.502136752136752, + "grad_norm": 0.9909247756004333, + "learning_rate": 7.582612826482194e-06, + "loss": 0.8395, + "step": 19671 + }, + { + "epoch": 3.502314814814815, + "grad_norm": 0.895326554775238, + "learning_rate": 7.5772670885805194e-06, + "loss": 0.8597, + "step": 19672 + }, + { + "epoch": 3.5024928774928776, + "grad_norm": 0.9548590779304504, + "learning_rate": 7.5719231615103305e-06, + "loss": 0.6942, + "step": 19673 + }, + { + "epoch": 3.5026709401709404, + "grad_norm": 0.9338228106498718, + "learning_rate": 7.566581045376297e-06, + "loss": 0.7618, + "step": 19674 + }, + { + "epoch": 3.5028490028490027, + "grad_norm": 0.9816923141479492, + "learning_rate": 7.561240740283104e-06, + "loss": 0.7811, + "step": 19675 + }, + { + "epoch": 3.5030270655270654, + "grad_norm": 1.141014814376831, + "learning_rate": 7.5559022463353664e-06, + "loss": 0.823, + "step": 19676 + }, + { + "epoch": 3.503205128205128, + "grad_norm": 1.0397098064422607, + "learning_rate": 7.550565563637679e-06, + "loss": 0.73, + "step": 19677 + }, + { + "epoch": 3.503383190883191, + "grad_norm": 0.9919942021369934, + "learning_rate": 7.5452306922946355e-06, + "loss": 1.1807, + "step": 19678 + }, + { + "epoch": 3.5035612535612537, + "grad_norm": 1.051186442375183, + "learning_rate": 7.539897632410709e-06, + "loss": 0.7733, + "step": 19679 + }, + { + "epoch": 3.503739316239316, + "grad_norm": 1.067172646522522, + "learning_rate": 7.534566384090436e-06, + "loss": 0.9685, + "step": 19680 + }, + { + "epoch": 3.5039173789173788, + "grad_norm": 1.0977402925491333, + "learning_rate": 7.529236947438256e-06, + "loss": 0.8009, + "step": 19681 + }, + { + "epoch": 3.5040954415954415, + "grad_norm": 0.9920042753219604, + 
"learning_rate": 7.523909322558587e-06, + "loss": 0.6871, + "step": 19682 + }, + { + "epoch": 3.5042735042735043, + "grad_norm": 0.921493411064148, + "learning_rate": 7.51858350955581e-06, + "loss": 0.606, + "step": 19683 + }, + { + "epoch": 3.504451566951567, + "grad_norm": 1.0852283239364624, + "learning_rate": 7.513259508534276e-06, + "loss": 0.5686, + "step": 19684 + }, + { + "epoch": 3.5046296296296298, + "grad_norm": 0.8194310069084167, + "learning_rate": 7.507937319598291e-06, + "loss": 0.6166, + "step": 19685 + }, + { + "epoch": 3.5048076923076925, + "grad_norm": 0.921829104423523, + "learning_rate": 7.502616942852159e-06, + "loss": 0.6076, + "step": 19686 + }, + { + "epoch": 3.504985754985755, + "grad_norm": 0.9465250968933105, + "learning_rate": 7.497298378400075e-06, + "loss": 0.6255, + "step": 19687 + }, + { + "epoch": 3.5051638176638176, + "grad_norm": 0.9845356345176697, + "learning_rate": 7.4919816263462786e-06, + "loss": 0.7305, + "step": 19688 + }, + { + "epoch": 3.5053418803418803, + "grad_norm": 1.0652577877044678, + "learning_rate": 7.486666686794941e-06, + "loss": 0.7852, + "step": 19689 + }, + { + "epoch": 3.505519943019943, + "grad_norm": 0.9593751430511475, + "learning_rate": 7.4813535598501905e-06, + "loss": 0.8299, + "step": 19690 + }, + { + "epoch": 3.505698005698006, + "grad_norm": 0.8477317690849304, + "learning_rate": 7.4760422456161215e-06, + "loss": 0.5603, + "step": 19691 + }, + { + "epoch": 3.505876068376068, + "grad_norm": 0.9810600280761719, + "learning_rate": 7.470732744196807e-06, + "loss": 0.6908, + "step": 19692 + }, + { + "epoch": 3.506054131054131, + "grad_norm": 0.9363740682601929, + "learning_rate": 7.4654250556962734e-06, + "loss": 0.6607, + "step": 19693 + }, + { + "epoch": 3.5062321937321936, + "grad_norm": 0.96392422914505, + "learning_rate": 7.460119180218505e-06, + "loss": 0.726, + "step": 19694 + }, + { + "epoch": 3.5064102564102564, + "grad_norm": 0.9342692494392395, + "learning_rate": 7.454815117867453e-06, + 
"loss": 0.6143, + "step": 19695 + }, + { + "epoch": 3.506588319088319, + "grad_norm": 0.9736638069152832, + "learning_rate": 7.449512868747066e-06, + "loss": 0.7332, + "step": 19696 + }, + { + "epoch": 3.506766381766382, + "grad_norm": 0.9093930125236511, + "learning_rate": 7.444212432961228e-06, + "loss": 0.7471, + "step": 19697 + }, + { + "epoch": 3.5069444444444446, + "grad_norm": 0.9061489701271057, + "learning_rate": 7.438913810613768e-06, + "loss": 0.7445, + "step": 19698 + }, + { + "epoch": 3.5071225071225074, + "grad_norm": 0.9087863564491272, + "learning_rate": 7.433617001808513e-06, + "loss": 0.7966, + "step": 19699 + }, + { + "epoch": 3.5073005698005697, + "grad_norm": 0.9920783042907715, + "learning_rate": 7.428322006649236e-06, + "loss": 0.9201, + "step": 19700 + }, + { + "epoch": 3.5074786324786325, + "grad_norm": 1.0740957260131836, + "learning_rate": 7.423028825239719e-06, + "loss": 0.7541, + "step": 19701 + }, + { + "epoch": 3.507656695156695, + "grad_norm": 0.9236546754837036, + "learning_rate": 7.417737457683594e-06, + "loss": 0.8124, + "step": 19702 + }, + { + "epoch": 3.507834757834758, + "grad_norm": 1.0994749069213867, + "learning_rate": 7.412447904084629e-06, + "loss": 0.7571, + "step": 19703 + }, + { + "epoch": 3.5080128205128203, + "grad_norm": 0.8928847908973694, + "learning_rate": 7.4071601645463785e-06, + "loss": 0.7111, + "step": 19704 + }, + { + "epoch": 3.508190883190883, + "grad_norm": 0.9118340611457825, + "learning_rate": 7.40187423917249e-06, + "loss": 0.7538, + "step": 19705 + }, + { + "epoch": 3.5083689458689458, + "grad_norm": 1.0264140367507935, + "learning_rate": 7.396590128066516e-06, + "loss": 0.7927, + "step": 19706 + }, + { + "epoch": 3.5085470085470085, + "grad_norm": 0.8983800411224365, + "learning_rate": 7.391307831332006e-06, + "loss": 0.6858, + "step": 19707 + }, + { + "epoch": 3.5087250712250713, + "grad_norm": 1.0997859239578247, + "learning_rate": 7.386027349072433e-06, + "loss": 0.8014, + "step": 19708 + }, + { 
+ "epoch": 3.508903133903134, + "grad_norm": 1.0893813371658325, + "learning_rate": 7.380748681391258e-06, + "loss": 0.8363, + "step": 19709 + }, + { + "epoch": 3.5090811965811968, + "grad_norm": 0.9206538796424866, + "learning_rate": 7.375471828391911e-06, + "loss": 0.6414, + "step": 19710 + }, + { + "epoch": 3.5092592592592595, + "grad_norm": 0.9687008857727051, + "learning_rate": 7.370196790177808e-06, + "loss": 0.8868, + "step": 19711 + }, + { + "epoch": 3.509437321937322, + "grad_norm": 0.8396771550178528, + "learning_rate": 7.3649235668522445e-06, + "loss": 0.5922, + "step": 19712 + }, + { + "epoch": 3.5096153846153846, + "grad_norm": 1.01363205909729, + "learning_rate": 7.359652158518604e-06, + "loss": 0.7419, + "step": 19713 + }, + { + "epoch": 3.5097934472934473, + "grad_norm": 1.086411952972412, + "learning_rate": 7.354382565280094e-06, + "loss": 0.8505, + "step": 19714 + }, + { + "epoch": 3.50997150997151, + "grad_norm": 1.1874315738677979, + "learning_rate": 7.34911478724003e-06, + "loss": 0.8852, + "step": 19715 + }, + { + "epoch": 3.5101495726495724, + "grad_norm": 1.0598959922790527, + "learning_rate": 7.343848824501598e-06, + "loss": 0.7538, + "step": 19716 + }, + { + "epoch": 3.510327635327635, + "grad_norm": 0.7151375412940979, + "learning_rate": 7.338584677167948e-06, + "loss": 0.4218, + "step": 19717 + }, + { + "epoch": 3.510505698005698, + "grad_norm": 0.9595068097114563, + "learning_rate": 7.333322345342286e-06, + "loss": 0.8518, + "step": 19718 + }, + { + "epoch": 3.5106837606837606, + "grad_norm": 0.8868532776832581, + "learning_rate": 7.328061829127631e-06, + "loss": 0.9117, + "step": 19719 + }, + { + "epoch": 3.5108618233618234, + "grad_norm": 1.0120736360549927, + "learning_rate": 7.32280312862712e-06, + "loss": 0.8192, + "step": 19720 + }, + { + "epoch": 3.511039886039886, + "grad_norm": 1.0815523862838745, + "learning_rate": 7.317546243943751e-06, + "loss": 0.6933, + "step": 19721 + }, + { + "epoch": 3.511217948717949, + "grad_norm": 
1.0424879789352417, + "learning_rate": 7.312291175180541e-06, + "loss": 0.7333, + "step": 19722 + }, + { + "epoch": 3.5113960113960117, + "grad_norm": 0.9301778078079224, + "learning_rate": 7.307037922440441e-06, + "loss": 0.7172, + "step": 19723 + }, + { + "epoch": 3.511574074074074, + "grad_norm": 1.0618454217910767, + "learning_rate": 7.301786485826389e-06, + "loss": 0.6895, + "step": 19724 + }, + { + "epoch": 3.5117521367521367, + "grad_norm": 1.0267716646194458, + "learning_rate": 7.2965368654412395e-06, + "loss": 0.9075, + "step": 19725 + }, + { + "epoch": 3.5119301994301995, + "grad_norm": 1.1378053426742554, + "learning_rate": 7.291289061387907e-06, + "loss": 0.9324, + "step": 19726 + }, + { + "epoch": 3.512108262108262, + "grad_norm": 0.9694442749023438, + "learning_rate": 7.286043073769155e-06, + "loss": 0.7921, + "step": 19727 + }, + { + "epoch": 3.5122863247863245, + "grad_norm": 0.8989338278770447, + "learning_rate": 7.280798902687813e-06, + "loss": 0.7454, + "step": 19728 + }, + { + "epoch": 3.5124643874643873, + "grad_norm": 0.8680744171142578, + "learning_rate": 7.275556548246587e-06, + "loss": 0.7519, + "step": 19729 + }, + { + "epoch": 3.51264245014245, + "grad_norm": 0.9522790312767029, + "learning_rate": 7.2703160105482285e-06, + "loss": 0.7243, + "step": 19730 + }, + { + "epoch": 3.5128205128205128, + "grad_norm": 0.9496331214904785, + "learning_rate": 7.265077289695399e-06, + "loss": 0.7005, + "step": 19731 + }, + { + "epoch": 3.5129985754985755, + "grad_norm": 0.8820855021476746, + "learning_rate": 7.259840385790728e-06, + "loss": 0.7302, + "step": 19732 + }, + { + "epoch": 3.5131766381766383, + "grad_norm": 0.9601408839225769, + "learning_rate": 7.254605298936845e-06, + "loss": 0.7137, + "step": 19733 + }, + { + "epoch": 3.513354700854701, + "grad_norm": 0.9732968211174011, + "learning_rate": 7.249372029236312e-06, + "loss": 0.7444, + "step": 19734 + }, + { + "epoch": 3.513532763532764, + "grad_norm": 0.9062048196792603, + "learning_rate": 
7.244140576791636e-06, + "loss": 0.8984, + "step": 19735 + }, + { + "epoch": 3.513710826210826, + "grad_norm": 0.9823424220085144, + "learning_rate": 7.2389109417053566e-06, + "loss": 0.8515, + "step": 19736 + }, + { + "epoch": 3.513888888888889, + "grad_norm": 0.8319346904754639, + "learning_rate": 7.233683124079937e-06, + "loss": 0.7185, + "step": 19737 + }, + { + "epoch": 3.5140669515669516, + "grad_norm": 0.9495996236801147, + "learning_rate": 7.2284571240177735e-06, + "loss": 0.7912, + "step": 19738 + }, + { + "epoch": 3.5142450142450143, + "grad_norm": 0.8981806039810181, + "learning_rate": 7.223232941621294e-06, + "loss": 0.5754, + "step": 19739 + }, + { + "epoch": 3.5144230769230766, + "grad_norm": 1.0965253114700317, + "learning_rate": 7.218010576992829e-06, + "loss": 0.894, + "step": 19740 + }, + { + "epoch": 3.5146011396011394, + "grad_norm": 0.9036837816238403, + "learning_rate": 7.212790030234706e-06, + "loss": 0.8213, + "step": 19741 + }, + { + "epoch": 3.514779202279202, + "grad_norm": 0.8697100281715393, + "learning_rate": 7.2075713014492004e-06, + "loss": 0.7141, + "step": 19742 + }, + { + "epoch": 3.514957264957265, + "grad_norm": 0.9425762295722961, + "learning_rate": 7.202354390738608e-06, + "loss": 0.6477, + "step": 19743 + }, + { + "epoch": 3.5151353276353277, + "grad_norm": 0.9594342708587646, + "learning_rate": 7.19713929820508e-06, + "loss": 0.8327, + "step": 19744 + }, + { + "epoch": 3.5153133903133904, + "grad_norm": 1.132218837738037, + "learning_rate": 7.191926023950835e-06, + "loss": 0.8389, + "step": 19745 + }, + { + "epoch": 3.515491452991453, + "grad_norm": 0.923564612865448, + "learning_rate": 7.186714568078012e-06, + "loss": 0.7108, + "step": 19746 + }, + { + "epoch": 3.515669515669516, + "grad_norm": 0.9644235968589783, + "learning_rate": 7.1815049306887204e-06, + "loss": 0.7992, + "step": 19747 + }, + { + "epoch": 3.515847578347578, + "grad_norm": 0.9429721832275391, + "learning_rate": 7.176297111885022e-06, + "loss": 0.7287, + 
"step": 19748 + }, + { + "epoch": 3.516025641025641, + "grad_norm": 1.015499472618103, + "learning_rate": 7.171091111768957e-06, + "loss": 0.8746, + "step": 19749 + }, + { + "epoch": 3.5162037037037037, + "grad_norm": 1.0005598068237305, + "learning_rate": 7.165886930442522e-06, + "loss": 0.8414, + "step": 19750 + }, + { + "epoch": 3.5163817663817665, + "grad_norm": 1.1245676279067993, + "learning_rate": 7.160684568007692e-06, + "loss": 1.0094, + "step": 19751 + }, + { + "epoch": 3.5165598290598292, + "grad_norm": 0.9880303144454956, + "learning_rate": 7.155484024566372e-06, + "loss": 0.7842, + "step": 19752 + }, + { + "epoch": 3.5167378917378915, + "grad_norm": 0.9797958135604858, + "learning_rate": 7.150285300220505e-06, + "loss": 0.928, + "step": 19753 + }, + { + "epoch": 3.5169159544159543, + "grad_norm": 0.9521427154541016, + "learning_rate": 7.145088395071886e-06, + "loss": 0.7293, + "step": 19754 + }, + { + "epoch": 3.517094017094017, + "grad_norm": 0.9081432223320007, + "learning_rate": 7.1398933092223896e-06, + "loss": 0.8062, + "step": 19755 + }, + { + "epoch": 3.51727207977208, + "grad_norm": 1.08573317527771, + "learning_rate": 7.13470004277379e-06, + "loss": 0.7256, + "step": 19756 + }, + { + "epoch": 3.5174501424501425, + "grad_norm": 1.0338917970657349, + "learning_rate": 7.129508595827805e-06, + "loss": 0.9324, + "step": 19757 + }, + { + "epoch": 3.5176282051282053, + "grad_norm": 0.9286987781524658, + "learning_rate": 7.124318968486221e-06, + "loss": 0.7037, + "step": 19758 + }, + { + "epoch": 3.517806267806268, + "grad_norm": 0.9461976885795593, + "learning_rate": 7.119131160850634e-06, + "loss": 0.7509, + "step": 19759 + }, + { + "epoch": 3.5179843304843303, + "grad_norm": 1.0241987705230713, + "learning_rate": 7.11394517302274e-06, + "loss": 0.8071, + "step": 19760 + }, + { + "epoch": 3.518162393162393, + "grad_norm": 0.8855302333831787, + "learning_rate": 7.108761005104147e-06, + "loss": 0.7045, + "step": 19761 + }, + { + "epoch": 
3.518340455840456, + "grad_norm": 0.9970550537109375, + "learning_rate": 7.103578657196419e-06, + "loss": 0.7092, + "step": 19762 + }, + { + "epoch": 3.5185185185185186, + "grad_norm": 0.9413601756095886, + "learning_rate": 7.0983981294010845e-06, + "loss": 0.6876, + "step": 19763 + }, + { + "epoch": 3.5186965811965814, + "grad_norm": 0.9557897448539734, + "learning_rate": 7.093219421819653e-06, + "loss": 0.7068, + "step": 19764 + }, + { + "epoch": 3.5188746438746437, + "grad_norm": 0.9844231605529785, + "learning_rate": 7.088042534553585e-06, + "loss": 0.7696, + "step": 19765 + }, + { + "epoch": 3.5190527065527064, + "grad_norm": 0.8869762420654297, + "learning_rate": 7.082867467704324e-06, + "loss": 0.855, + "step": 19766 + }, + { + "epoch": 3.519230769230769, + "grad_norm": 1.0310113430023193, + "learning_rate": 7.0776942213732325e-06, + "loss": 0.7864, + "step": 19767 + }, + { + "epoch": 3.519408831908832, + "grad_norm": 0.9870700836181641, + "learning_rate": 7.072522795661729e-06, + "loss": 0.8516, + "step": 19768 + }, + { + "epoch": 3.5195868945868947, + "grad_norm": 0.9155564308166504, + "learning_rate": 7.067353190671078e-06, + "loss": 0.6531, + "step": 19769 + }, + { + "epoch": 3.5197649572649574, + "grad_norm": 0.8860092759132385, + "learning_rate": 7.062185406502597e-06, + "loss": 0.7624, + "step": 19770 + }, + { + "epoch": 3.51994301994302, + "grad_norm": 0.9172900915145874, + "learning_rate": 7.057019443257528e-06, + "loss": 0.7268, + "step": 19771 + }, + { + "epoch": 3.5201210826210825, + "grad_norm": 0.932388424873352, + "learning_rate": 7.051855301037102e-06, + "loss": 0.5527, + "step": 19772 + }, + { + "epoch": 3.5202991452991452, + "grad_norm": 0.9461301565170288, + "learning_rate": 7.0466929799424816e-06, + "loss": 0.8035, + "step": 19773 + }, + { + "epoch": 3.520477207977208, + "grad_norm": 0.925786554813385, + "learning_rate": 7.04153248007482e-06, + "loss": 0.7452, + "step": 19774 + }, + { + "epoch": 3.5206552706552707, + "grad_norm": 
1.4296557903289795, + "learning_rate": 7.036373801535223e-06, + "loss": 0.8918, + "step": 19775 + }, + { + "epoch": 3.5208333333333335, + "grad_norm": 0.9940581321716309, + "learning_rate": 7.031216944424746e-06, + "loss": 0.8082, + "step": 19776 + }, + { + "epoch": 3.521011396011396, + "grad_norm": 0.8591741919517517, + "learning_rate": 7.026061908844472e-06, + "loss": 0.7566, + "step": 19777 + }, + { + "epoch": 3.5211894586894585, + "grad_norm": 0.8477209210395813, + "learning_rate": 7.020908694895368e-06, + "loss": 0.6008, + "step": 19778 + }, + { + "epoch": 3.5213675213675213, + "grad_norm": 1.1067235469818115, + "learning_rate": 7.015757302678416e-06, + "loss": 1.0526, + "step": 19779 + }, + { + "epoch": 3.521545584045584, + "grad_norm": 0.9242870807647705, + "learning_rate": 7.01060773229455e-06, + "loss": 0.827, + "step": 19780 + }, + { + "epoch": 3.521723646723647, + "grad_norm": 0.9401116967201233, + "learning_rate": 7.005459983844642e-06, + "loss": 0.8458, + "step": 19781 + }, + { + "epoch": 3.5219017094017095, + "grad_norm": 0.8446176648139954, + "learning_rate": 7.000314057429558e-06, + "loss": 0.5964, + "step": 19782 + }, + { + "epoch": 3.5220797720797723, + "grad_norm": 0.931850254535675, + "learning_rate": 6.9951699531501714e-06, + "loss": 0.6574, + "step": 19783 + }, + { + "epoch": 3.5222578347578346, + "grad_norm": 0.9261093735694885, + "learning_rate": 6.99002767110718e-06, + "loss": 0.8009, + "step": 19784 + }, + { + "epoch": 3.5224358974358974, + "grad_norm": 1.123136281967163, + "learning_rate": 6.984887211401425e-06, + "loss": 0.7744, + "step": 19785 + }, + { + "epoch": 3.52261396011396, + "grad_norm": 0.9234578609466553, + "learning_rate": 6.979748574133549e-06, + "loss": 0.8332, + "step": 19786 + }, + { + "epoch": 3.522792022792023, + "grad_norm": 1.0575841665267944, + "learning_rate": 6.974611759404281e-06, + "loss": 0.8427, + "step": 19787 + }, + { + "epoch": 3.5229700854700856, + "grad_norm": 0.9604585766792297, + "learning_rate": 
6.9694767673142645e-06, + "loss": 0.8508, + "step": 19788 + }, + { + "epoch": 3.523148148148148, + "grad_norm": 0.9769417643547058, + "learning_rate": 6.9643435979640845e-06, + "loss": 0.6211, + "step": 19789 + }, + { + "epoch": 3.5233262108262107, + "grad_norm": 0.9469193816184998, + "learning_rate": 6.959212251454328e-06, + "loss": 0.8078, + "step": 19790 + }, + { + "epoch": 3.5235042735042734, + "grad_norm": 0.9950293302536011, + "learning_rate": 6.9540827278855354e-06, + "loss": 0.6546, + "step": 19791 + }, + { + "epoch": 3.523682336182336, + "grad_norm": 0.9339304566383362, + "learning_rate": 6.9489550273581834e-06, + "loss": 0.7838, + "step": 19792 + }, + { + "epoch": 3.523860398860399, + "grad_norm": 1.0213496685028076, + "learning_rate": 6.943829149972802e-06, + "loss": 0.8105, + "step": 19793 + }, + { + "epoch": 3.5240384615384617, + "grad_norm": 0.8725095391273499, + "learning_rate": 6.938705095829734e-06, + "loss": 0.6082, + "step": 19794 + }, + { + "epoch": 3.5242165242165244, + "grad_norm": 0.8054739236831665, + "learning_rate": 6.933582865029453e-06, + "loss": 0.6168, + "step": 19795 + }, + { + "epoch": 3.5243945868945867, + "grad_norm": 0.9824689626693726, + "learning_rate": 6.92846245767228e-06, + "loss": 0.8601, + "step": 19796 + }, + { + "epoch": 3.5245726495726495, + "grad_norm": 1.024682641029358, + "learning_rate": 6.9233438738585345e-06, + "loss": 0.7557, + "step": 19797 + }, + { + "epoch": 3.5247507122507122, + "grad_norm": 0.8784366250038147, + "learning_rate": 6.918227113688547e-06, + "loss": 0.7456, + "step": 19798 + }, + { + "epoch": 3.524928774928775, + "grad_norm": 0.8838477730751038, + "learning_rate": 6.913112177262493e-06, + "loss": 0.6641, + "step": 19799 + }, + { + "epoch": 3.5251068376068377, + "grad_norm": 0.8852226138114929, + "learning_rate": 6.907999064680681e-06, + "loss": 0.7466, + "step": 19800 + }, + { + "epoch": 3.5252849002849, + "grad_norm": 1.0038496255874634, + "learning_rate": 6.90288777604321e-06, + "loss": 0.7156, 
+ "step": 19801 + }, + { + "epoch": 3.525462962962963, + "grad_norm": 1.1669005155563354, + "learning_rate": 6.897778311450276e-06, + "loss": 0.8363, + "step": 19802 + }, + { + "epoch": 3.5256410256410255, + "grad_norm": 0.9355636835098267, + "learning_rate": 6.892670671001977e-06, + "loss": 0.8016, + "step": 19803 + }, + { + "epoch": 3.5258190883190883, + "grad_norm": 0.9169090986251831, + "learning_rate": 6.887564854798378e-06, + "loss": 0.712, + "step": 19804 + }, + { + "epoch": 3.525997150997151, + "grad_norm": 0.9154117703437805, + "learning_rate": 6.882460862939522e-06, + "loss": 0.6729, + "step": 19805 + }, + { + "epoch": 3.526175213675214, + "grad_norm": 0.9298785328865051, + "learning_rate": 6.877358695525416e-06, + "loss": 0.6924, + "step": 19806 + }, + { + "epoch": 3.5263532763532766, + "grad_norm": 0.9413958191871643, + "learning_rate": 6.872258352655991e-06, + "loss": 0.8641, + "step": 19807 + }, + { + "epoch": 3.5265313390313393, + "grad_norm": 0.9077426791191101, + "learning_rate": 6.867159834431247e-06, + "loss": 0.725, + "step": 19808 + }, + { + "epoch": 3.5267094017094016, + "grad_norm": 0.9770194292068481, + "learning_rate": 6.8620631409510135e-06, + "loss": 0.852, + "step": 19809 + }, + { + "epoch": 3.5268874643874644, + "grad_norm": 1.092766523361206, + "learning_rate": 6.8569682723152e-06, + "loss": 0.6776, + "step": 19810 + }, + { + "epoch": 3.527065527065527, + "grad_norm": 0.8729142546653748, + "learning_rate": 6.851875228623572e-06, + "loss": 0.7475, + "step": 19811 + }, + { + "epoch": 3.52724358974359, + "grad_norm": 1.0176565647125244, + "learning_rate": 6.846784009975971e-06, + "loss": 0.9025, + "step": 19812 + }, + { + "epoch": 3.527421652421652, + "grad_norm": 1.1086188554763794, + "learning_rate": 6.841694616472128e-06, + "loss": 0.9416, + "step": 19813 + }, + { + "epoch": 3.527599715099715, + "grad_norm": 0.9331051707267761, + "learning_rate": 6.8366070482117625e-06, + "loss": 0.5952, + "step": 19814 + }, + { + "epoch": 
3.5277777777777777, + "grad_norm": 0.941539466381073, + "learning_rate": 6.8315213052945526e-06, + "loss": 0.6675, + "step": 19815 + }, + { + "epoch": 3.5279558404558404, + "grad_norm": 0.9927091598510742, + "learning_rate": 6.826437387820117e-06, + "loss": 0.6435, + "step": 19816 + }, + { + "epoch": 3.528133903133903, + "grad_norm": 0.9932602047920227, + "learning_rate": 6.8213552958881096e-06, + "loss": 0.9288, + "step": 19817 + }, + { + "epoch": 3.528311965811966, + "grad_norm": 1.017561912536621, + "learning_rate": 6.816275029598085e-06, + "loss": 0.8233, + "step": 19818 + }, + { + "epoch": 3.5284900284900287, + "grad_norm": 0.972872257232666, + "learning_rate": 6.811196589049573e-06, + "loss": 0.6626, + "step": 19819 + }, + { + "epoch": 3.5286680911680914, + "grad_norm": 1.009381651878357, + "learning_rate": 6.806119974342084e-06, + "loss": 0.718, + "step": 19820 + }, + { + "epoch": 3.5288461538461537, + "grad_norm": 1.0876743793487549, + "learning_rate": 6.801045185575083e-06, + "loss": 0.7202, + "step": 19821 + }, + { + "epoch": 3.5290242165242165, + "grad_norm": 0.8937970399856567, + "learning_rate": 6.795972222848002e-06, + "loss": 0.5068, + "step": 19822 + }, + { + "epoch": 3.5292022792022792, + "grad_norm": 1.0466728210449219, + "learning_rate": 6.790901086260226e-06, + "loss": 0.6791, + "step": 19823 + }, + { + "epoch": 3.529380341880342, + "grad_norm": 0.9000089168548584, + "learning_rate": 6.7858317759111e-06, + "loss": 0.7009, + "step": 19824 + }, + { + "epoch": 3.5295584045584043, + "grad_norm": 1.0114625692367554, + "learning_rate": 6.780764291899988e-06, + "loss": 0.6994, + "step": 19825 + }, + { + "epoch": 3.529736467236467, + "grad_norm": 0.9358991980552673, + "learning_rate": 6.775698634326133e-06, + "loss": 0.6651, + "step": 19826 + }, + { + "epoch": 3.52991452991453, + "grad_norm": 0.8242545127868652, + "learning_rate": 6.770634803288822e-06, + "loss": 0.6033, + "step": 19827 + }, + { + "epoch": 3.5300925925925926, + "grad_norm": 
0.8415285348892212, + "learning_rate": 6.765572798887254e-06, + "loss": 0.5524, + "step": 19828 + }, + { + "epoch": 3.5302706552706553, + "grad_norm": 0.9627799987792969, + "learning_rate": 6.760512621220616e-06, + "loss": 0.8388, + "step": 19829 + }, + { + "epoch": 3.530448717948718, + "grad_norm": 1.0505856275558472, + "learning_rate": 6.755454270388029e-06, + "loss": 0.7476, + "step": 19830 + }, + { + "epoch": 3.530626780626781, + "grad_norm": 0.9269016981124878, + "learning_rate": 6.7503977464886235e-06, + "loss": 0.6445, + "step": 19831 + }, + { + "epoch": 3.5308048433048436, + "grad_norm": 0.8944364786148071, + "learning_rate": 6.745343049621456e-06, + "loss": 0.7826, + "step": 19832 + }, + { + "epoch": 3.530982905982906, + "grad_norm": 0.9199281334877014, + "learning_rate": 6.7402901798856e-06, + "loss": 0.9266, + "step": 19833 + }, + { + "epoch": 3.5311609686609686, + "grad_norm": 0.8758364915847778, + "learning_rate": 6.7352391373799895e-06, + "loss": 0.8124, + "step": 19834 + }, + { + "epoch": 3.5313390313390314, + "grad_norm": 1.1305196285247803, + "learning_rate": 6.7301899222036555e-06, + "loss": 1.0041, + "step": 19835 + }, + { + "epoch": 3.531517094017094, + "grad_norm": 0.9125435948371887, + "learning_rate": 6.725142534455487e-06, + "loss": 0.7151, + "step": 19836 + }, + { + "epoch": 3.5316951566951564, + "grad_norm": 0.9822704195976257, + "learning_rate": 6.7200969742343915e-06, + "loss": 0.7827, + "step": 19837 + }, + { + "epoch": 3.531873219373219, + "grad_norm": 0.9346258044242859, + "learning_rate": 6.715053241639224e-06, + "loss": 0.7513, + "step": 19838 + }, + { + "epoch": 3.532051282051282, + "grad_norm": 0.8414269685745239, + "learning_rate": 6.710011336768796e-06, + "loss": 0.6312, + "step": 19839 + }, + { + "epoch": 3.5322293447293447, + "grad_norm": 0.9639045000076294, + "learning_rate": 6.704971259721926e-06, + "loss": 0.719, + "step": 19840 + }, + { + "epoch": 3.5324074074074074, + "grad_norm": 0.9534916877746582, + "learning_rate": 
6.699933010597314e-06, + "loss": 0.7557, + "step": 19841 + }, + { + "epoch": 3.53258547008547, + "grad_norm": 0.8696106672286987, + "learning_rate": 6.6948965894937155e-06, + "loss": 0.5598, + "step": 19842 + }, + { + "epoch": 3.532763532763533, + "grad_norm": 0.9199994206428528, + "learning_rate": 6.689861996509794e-06, + "loss": 0.7441, + "step": 19843 + }, + { + "epoch": 3.5329415954415957, + "grad_norm": 0.9052377343177795, + "learning_rate": 6.684829231744183e-06, + "loss": 0.7531, + "step": 19844 + }, + { + "epoch": 3.533119658119658, + "grad_norm": 0.9178473949432373, + "learning_rate": 6.679798295295514e-06, + "loss": 0.7278, + "step": 19845 + }, + { + "epoch": 3.5332977207977208, + "grad_norm": 0.8894423842430115, + "learning_rate": 6.6747691872623305e-06, + "loss": 0.7972, + "step": 19846 + }, + { + "epoch": 3.5334757834757835, + "grad_norm": 1.0122573375701904, + "learning_rate": 6.669741907743177e-06, + "loss": 0.9849, + "step": 19847 + }, + { + "epoch": 3.5336538461538463, + "grad_norm": 0.9068432450294495, + "learning_rate": 6.664716456836561e-06, + "loss": 0.9914, + "step": 19848 + }, + { + "epoch": 3.5338319088319086, + "grad_norm": 0.840515673160553, + "learning_rate": 6.6596928346409185e-06, + "loss": 0.7176, + "step": 19849 + }, + { + "epoch": 3.5340099715099713, + "grad_norm": 0.9179866313934326, + "learning_rate": 6.6546710412547344e-06, + "loss": 0.6759, + "step": 19850 + }, + { + "epoch": 3.534188034188034, + "grad_norm": 0.7912464737892151, + "learning_rate": 6.64965107677632e-06, + "loss": 0.6583, + "step": 19851 + }, + { + "epoch": 3.534366096866097, + "grad_norm": 1.0819852352142334, + "learning_rate": 6.6446329413040965e-06, + "loss": 0.9199, + "step": 19852 + }, + { + "epoch": 3.5345441595441596, + "grad_norm": 1.072750449180603, + "learning_rate": 6.6396166349363635e-06, + "loss": 0.7999, + "step": 19853 + }, + { + "epoch": 3.5347222222222223, + "grad_norm": 1.0215152502059937, + "learning_rate": 6.634602157771385e-06, + "loss": 
0.8409, + "step": 19854 + }, + { + "epoch": 3.534900284900285, + "grad_norm": 0.9943056106567383, + "learning_rate": 6.629589509907464e-06, + "loss": 0.8393, + "step": 19855 + }, + { + "epoch": 3.535078347578348, + "grad_norm": 1.0051432847976685, + "learning_rate": 6.62457869144274e-06, + "loss": 0.821, + "step": 19856 + }, + { + "epoch": 3.53525641025641, + "grad_norm": 0.9929559230804443, + "learning_rate": 6.619569702475459e-06, + "loss": 0.8753, + "step": 19857 + }, + { + "epoch": 3.535434472934473, + "grad_norm": 1.0302678346633911, + "learning_rate": 6.6145625431036975e-06, + "loss": 0.9094, + "step": 19858 + }, + { + "epoch": 3.5356125356125356, + "grad_norm": 0.9745633602142334, + "learning_rate": 6.609557213425599e-06, + "loss": 0.8616, + "step": 19859 + }, + { + "epoch": 3.5357905982905984, + "grad_norm": 0.9746029376983643, + "learning_rate": 6.60455371353923e-06, + "loss": 0.9074, + "step": 19860 + }, + { + "epoch": 3.5359686609686607, + "grad_norm": 0.9690983295440674, + "learning_rate": 6.599552043542623e-06, + "loss": 0.788, + "step": 19861 + }, + { + "epoch": 3.5361467236467234, + "grad_norm": 1.0887675285339355, + "learning_rate": 6.594552203533766e-06, + "loss": 0.809, + "step": 19862 + }, + { + "epoch": 3.536324786324786, + "grad_norm": 0.943412721157074, + "learning_rate": 6.589554193610637e-06, + "loss": 0.6296, + "step": 19863 + }, + { + "epoch": 3.536502849002849, + "grad_norm": 0.9202658534049988, + "learning_rate": 6.584558013871123e-06, + "loss": 0.7163, + "step": 19864 + }, + { + "epoch": 3.5366809116809117, + "grad_norm": 0.8274959325790405, + "learning_rate": 6.57956366441318e-06, + "loss": 0.6825, + "step": 19865 + }, + { + "epoch": 3.5368589743589745, + "grad_norm": 0.9158870577812195, + "learning_rate": 6.574571145334585e-06, + "loss": 0.6954, + "step": 19866 + }, + { + "epoch": 3.537037037037037, + "grad_norm": 0.8894117474555969, + "learning_rate": 6.5695804567332044e-06, + "loss": 0.8603, + "step": 19867 + }, + { + "epoch": 
3.5372150997151, + "grad_norm": 1.219153642654419, + "learning_rate": 6.564591598706815e-06, + "loss": 0.8096, + "step": 19868 + }, + { + "epoch": 3.5373931623931623, + "grad_norm": 0.8852741122245789, + "learning_rate": 6.5596045713531615e-06, + "loss": 0.6964, + "step": 19869 + }, + { + "epoch": 3.537571225071225, + "grad_norm": 0.9260098338127136, + "learning_rate": 6.554619374769955e-06, + "loss": 0.7107, + "step": 19870 + }, + { + "epoch": 3.5377492877492878, + "grad_norm": 0.960800051689148, + "learning_rate": 6.54963600905486e-06, + "loss": 0.5889, + "step": 19871 + }, + { + "epoch": 3.5379273504273505, + "grad_norm": 0.957720935344696, + "learning_rate": 6.544654474305523e-06, + "loss": 0.8725, + "step": 19872 + }, + { + "epoch": 3.5381054131054133, + "grad_norm": 0.8383321762084961, + "learning_rate": 6.539674770619542e-06, + "loss": 0.7994, + "step": 19873 + }, + { + "epoch": 3.5382834757834756, + "grad_norm": 0.9970546960830688, + "learning_rate": 6.534696898094472e-06, + "loss": 0.6841, + "step": 19874 + }, + { + "epoch": 3.5384615384615383, + "grad_norm": 0.987698495388031, + "learning_rate": 6.52972085682787e-06, + "loss": 0.8124, + "step": 19875 + }, + { + "epoch": 3.538639601139601, + "grad_norm": 0.9530283212661743, + "learning_rate": 6.524746646917224e-06, + "loss": 0.8111, + "step": 19876 + }, + { + "epoch": 3.538817663817664, + "grad_norm": 0.9642528295516968, + "learning_rate": 6.519774268459988e-06, + "loss": 0.6965, + "step": 19877 + }, + { + "epoch": 3.5389957264957266, + "grad_norm": 0.8841597437858582, + "learning_rate": 6.514803721553597e-06, + "loss": 0.7679, + "step": 19878 + }, + { + "epoch": 3.5391737891737893, + "grad_norm": 0.9791342616081238, + "learning_rate": 6.509835006295395e-06, + "loss": 0.72, + "step": 19879 + }, + { + "epoch": 3.539351851851852, + "grad_norm": 0.9851047396659851, + "learning_rate": 6.504868122782815e-06, + "loss": 0.7728, + "step": 19880 + }, + { + "epoch": 3.5395299145299144, + "grad_norm": 
0.9055594801902771, + "learning_rate": 6.499903071113089e-06, + "loss": 0.4758, + "step": 19881 + }, + { + "epoch": 3.539707977207977, + "grad_norm": 1.0211628675460815, + "learning_rate": 6.494939851383553e-06, + "loss": 0.6912, + "step": 19882 + }, + { + "epoch": 3.53988603988604, + "grad_norm": 1.055820345878601, + "learning_rate": 6.489978463691415e-06, + "loss": 0.6928, + "step": 19883 + }, + { + "epoch": 3.5400641025641026, + "grad_norm": 0.8368833065032959, + "learning_rate": 6.485018908133911e-06, + "loss": 0.6069, + "step": 19884 + }, + { + "epoch": 3.5402421652421654, + "grad_norm": 0.8132014274597168, + "learning_rate": 6.480061184808195e-06, + "loss": 0.7176, + "step": 19885 + }, + { + "epoch": 3.5404202279202277, + "grad_norm": 0.8118695020675659, + "learning_rate": 6.475105293811412e-06, + "loss": 0.6785, + "step": 19886 + }, + { + "epoch": 3.5405982905982905, + "grad_norm": 0.9680670499801636, + "learning_rate": 6.470151235240651e-06, + "loss": 0.8413, + "step": 19887 + }, + { + "epoch": 3.540776353276353, + "grad_norm": 0.8878784775733948, + "learning_rate": 6.465199009193001e-06, + "loss": 0.7328, + "step": 19888 + }, + { + "epoch": 3.540954415954416, + "grad_norm": 0.8278130888938904, + "learning_rate": 6.460248615765452e-06, + "loss": 0.7212, + "step": 19889 + }, + { + "epoch": 3.5411324786324787, + "grad_norm": 1.0419317483901978, + "learning_rate": 6.455300055055047e-06, + "loss": 0.7404, + "step": 19890 + }, + { + "epoch": 3.5413105413105415, + "grad_norm": 0.9516051411628723, + "learning_rate": 6.450353327158687e-06, + "loss": 0.6078, + "step": 19891 + }, + { + "epoch": 3.541488603988604, + "grad_norm": 0.9726034998893738, + "learning_rate": 6.44540843217335e-06, + "loss": 0.8113, + "step": 19892 + }, + { + "epoch": 3.5416666666666665, + "grad_norm": 0.9923378229141235, + "learning_rate": 6.4404653701958695e-06, + "loss": 0.8888, + "step": 19893 + }, + { + "epoch": 3.5418447293447293, + "grad_norm": 1.088355302810669, + "learning_rate": 
6.4355241413231234e-06, + "loss": 0.8549, + "step": 19894 + }, + { + "epoch": 3.542022792022792, + "grad_norm": 0.9767059087753296, + "learning_rate": 6.430584745651924e-06, + "loss": 0.8886, + "step": 19895 + }, + { + "epoch": 3.5422008547008548, + "grad_norm": 0.9931213855743408, + "learning_rate": 6.425647183279016e-06, + "loss": 0.7972, + "step": 19896 + }, + { + "epoch": 3.5423789173789175, + "grad_norm": 0.939339816570282, + "learning_rate": 6.42071145430121e-06, + "loss": 0.8591, + "step": 19897 + }, + { + "epoch": 3.54255698005698, + "grad_norm": 0.9261937141418457, + "learning_rate": 6.415777558815139e-06, + "loss": 0.7403, + "step": 19898 + }, + { + "epoch": 3.5427350427350426, + "grad_norm": 0.9660967588424683, + "learning_rate": 6.410845496917506e-06, + "loss": 0.7583, + "step": 19899 + }, + { + "epoch": 3.5429131054131053, + "grad_norm": 0.9808663725852966, + "learning_rate": 6.405915268704954e-06, + "loss": 0.5649, + "step": 19900 + }, + { + "epoch": 3.543091168091168, + "grad_norm": 0.9739185571670532, + "learning_rate": 6.4009868742740616e-06, + "loss": 0.7576, + "step": 19901 + }, + { + "epoch": 3.543269230769231, + "grad_norm": 0.9260064363479614, + "learning_rate": 6.396060313721397e-06, + "loss": 0.793, + "step": 19902 + }, + { + "epoch": 3.5434472934472936, + "grad_norm": 0.8840816617012024, + "learning_rate": 6.391135587143482e-06, + "loss": 0.7113, + "step": 19903 + }, + { + "epoch": 3.5436253561253563, + "grad_norm": 1.163144826889038, + "learning_rate": 6.386212694636795e-06, + "loss": 0.8255, + "step": 19904 + }, + { + "epoch": 3.5438034188034186, + "grad_norm": 0.9658638834953308, + "learning_rate": 6.381291636297837e-06, + "loss": 0.7442, + "step": 19905 + }, + { + "epoch": 3.5439814814814814, + "grad_norm": 1.0708949565887451, + "learning_rate": 6.376372412222964e-06, + "loss": 0.8555, + "step": 19906 + }, + { + "epoch": 3.544159544159544, + "grad_norm": 1.014028787612915, + "learning_rate": 6.371455022508621e-06, + "loss": 0.6674, + 
"step": 19907 + }, + { + "epoch": 3.544337606837607, + "grad_norm": 1.0435295104980469, + "learning_rate": 6.366539467251087e-06, + "loss": 1.0046, + "step": 19908 + }, + { + "epoch": 3.5445156695156697, + "grad_norm": 0.8781193494796753, + "learning_rate": 6.361625746546718e-06, + "loss": 0.6087, + "step": 19909 + }, + { + "epoch": 3.544693732193732, + "grad_norm": 1.0531107187271118, + "learning_rate": 6.356713860491781e-06, + "loss": 0.7646, + "step": 19910 + }, + { + "epoch": 3.5448717948717947, + "grad_norm": 0.9368358850479126, + "learning_rate": 6.3518038091825e-06, + "loss": 0.7514, + "step": 19911 + }, + { + "epoch": 3.5450498575498575, + "grad_norm": 0.9585633277893066, + "learning_rate": 6.346895592715096e-06, + "loss": 0.6294, + "step": 19912 + }, + { + "epoch": 3.54522792022792, + "grad_norm": 0.9690647721290588, + "learning_rate": 6.341989211185717e-06, + "loss": 0.7861, + "step": 19913 + }, + { + "epoch": 3.545405982905983, + "grad_norm": 0.9715308547019958, + "learning_rate": 6.3370846646904844e-06, + "loss": 1.0209, + "step": 19914 + }, + { + "epoch": 3.5455840455840457, + "grad_norm": 1.067807674407959, + "learning_rate": 6.332181953325534e-06, + "loss": 0.7392, + "step": 19915 + }, + { + "epoch": 3.5457621082621085, + "grad_norm": 0.7337046265602112, + "learning_rate": 6.327281077186897e-06, + "loss": 0.5182, + "step": 19916 + }, + { + "epoch": 3.5459401709401708, + "grad_norm": 0.9948704838752747, + "learning_rate": 6.3223820363706e-06, + "loss": 0.697, + "step": 19917 + }, + { + "epoch": 3.5461182336182335, + "grad_norm": 0.9387503862380981, + "learning_rate": 6.3174848309726305e-06, + "loss": 0.7389, + "step": 19918 + }, + { + "epoch": 3.5462962962962963, + "grad_norm": 1.026684284210205, + "learning_rate": 6.312589461088936e-06, + "loss": 0.9897, + "step": 19919 + }, + { + "epoch": 3.546474358974359, + "grad_norm": 0.8634870648384094, + "learning_rate": 6.307695926815426e-06, + "loss": 0.6171, + "step": 19920 + }, + { + "epoch": 
3.546652421652422, + "grad_norm": 0.8596929311752319, + "learning_rate": 6.302804228247972e-06, + "loss": 0.7231, + "step": 19921 + }, + { + "epoch": 3.546830484330484, + "grad_norm": 0.8627797961235046, + "learning_rate": 6.2979143654824714e-06, + "loss": 0.6921, + "step": 19922 + }, + { + "epoch": 3.547008547008547, + "grad_norm": 0.9567093849182129, + "learning_rate": 6.29302633861465e-06, + "loss": 0.7992, + "step": 19923 + }, + { + "epoch": 3.5471866096866096, + "grad_norm": 0.9002968668937683, + "learning_rate": 6.288140147740329e-06, + "loss": 0.7066, + "step": 19924 + }, + { + "epoch": 3.5473646723646723, + "grad_norm": 1.0079301595687866, + "learning_rate": 6.283255792955245e-06, + "loss": 0.938, + "step": 19925 + }, + { + "epoch": 3.547542735042735, + "grad_norm": 1.0074832439422607, + "learning_rate": 6.278373274355076e-06, + "loss": 0.7939, + "step": 19926 + }, + { + "epoch": 3.547720797720798, + "grad_norm": 0.8892631530761719, + "learning_rate": 6.27349259203549e-06, + "loss": 0.8861, + "step": 19927 + }, + { + "epoch": 3.5478988603988606, + "grad_norm": 1.0393931865692139, + "learning_rate": 6.2686137460921226e-06, + "loss": 0.8119, + "step": 19928 + }, + { + "epoch": 3.5480769230769234, + "grad_norm": 1.089049220085144, + "learning_rate": 6.263736736620551e-06, + "loss": 0.8574, + "step": 19929 + }, + { + "epoch": 3.5482549857549857, + "grad_norm": 0.887886106967926, + "learning_rate": 6.258861563716345e-06, + "loss": 0.8319, + "step": 19930 + }, + { + "epoch": 3.5484330484330484, + "grad_norm": 1.0811010599136353, + "learning_rate": 6.253988227475005e-06, + "loss": 0.8569, + "step": 19931 + }, + { + "epoch": 3.548611111111111, + "grad_norm": 0.9962393045425415, + "learning_rate": 6.2491167279920435e-06, + "loss": 0.9301, + "step": 19932 + }, + { + "epoch": 3.548789173789174, + "grad_norm": 0.9498433470726013, + "learning_rate": 6.244247065362862e-06, + "loss": 0.6579, + "step": 19933 + }, + { + "epoch": 3.548967236467236, + "grad_norm": 
0.958933413028717, + "learning_rate": 6.239379239682918e-06, + "loss": 0.7045, + "step": 19934 + }, + { + "epoch": 3.549145299145299, + "grad_norm": 0.9499155282974243, + "learning_rate": 6.234513251047569e-06, + "loss": 0.6971, + "step": 19935 + }, + { + "epoch": 3.5493233618233617, + "grad_norm": 0.9261385798454285, + "learning_rate": 6.2296490995521375e-06, + "loss": 0.7128, + "step": 19936 + }, + { + "epoch": 3.5495014245014245, + "grad_norm": 0.9812938570976257, + "learning_rate": 6.22478678529197e-06, + "loss": 0.7182, + "step": 19937 + }, + { + "epoch": 3.5496794871794872, + "grad_norm": 0.8759663105010986, + "learning_rate": 6.219926308362267e-06, + "loss": 0.7853, + "step": 19938 + }, + { + "epoch": 3.54985754985755, + "grad_norm": 1.1171659231185913, + "learning_rate": 6.2150676688583095e-06, + "loss": 0.8763, + "step": 19939 + }, + { + "epoch": 3.5500356125356127, + "grad_norm": 0.8224866986274719, + "learning_rate": 6.210210866875288e-06, + "loss": 0.6219, + "step": 19940 + }, + { + "epoch": 3.5502136752136755, + "grad_norm": 1.0393109321594238, + "learning_rate": 6.205355902508359e-06, + "loss": 0.7143, + "step": 19941 + }, + { + "epoch": 3.550391737891738, + "grad_norm": 0.9704436659812927, + "learning_rate": 6.200502775852623e-06, + "loss": 0.7377, + "step": 19942 + }, + { + "epoch": 3.5505698005698005, + "grad_norm": 1.0549170970916748, + "learning_rate": 6.1956514870031956e-06, + "loss": 0.676, + "step": 19943 + }, + { + "epoch": 3.5507478632478633, + "grad_norm": 0.944458544254303, + "learning_rate": 6.190802036055121e-06, + "loss": 0.678, + "step": 19944 + }, + { + "epoch": 3.550925925925926, + "grad_norm": 0.9729311466217041, + "learning_rate": 6.185954423103391e-06, + "loss": 0.814, + "step": 19945 + }, + { + "epoch": 3.5511039886039883, + "grad_norm": 0.6908536553382874, + "learning_rate": 6.181108648243006e-06, + "loss": 0.4705, + "step": 19946 + }, + { + "epoch": 3.551282051282051, + "grad_norm": 0.9184253215789795, + "learning_rate": 
6.176264711568924e-06, + "loss": 0.7976, + "step": 19947 + }, + { + "epoch": 3.551460113960114, + "grad_norm": 1.1098743677139282, + "learning_rate": 6.171422613176003e-06, + "loss": 0.7885, + "step": 19948 + }, + { + "epoch": 3.5516381766381766, + "grad_norm": 0.9816890358924866, + "learning_rate": 6.166582353159156e-06, + "loss": 0.8283, + "step": 19949 + }, + { + "epoch": 3.5518162393162394, + "grad_norm": 1.26338529586792, + "learning_rate": 6.161743931613217e-06, + "loss": 0.7938, + "step": 19950 + }, + { + "epoch": 3.551994301994302, + "grad_norm": 0.8663501143455505, + "learning_rate": 6.156907348632968e-06, + "loss": 0.8333, + "step": 19951 + }, + { + "epoch": 3.552172364672365, + "grad_norm": 0.9667189717292786, + "learning_rate": 6.152072604313175e-06, + "loss": 0.7434, + "step": 19952 + }, + { + "epoch": 3.5523504273504276, + "grad_norm": 1.1370351314544678, + "learning_rate": 6.147239698748553e-06, + "loss": 0.6546, + "step": 19953 + }, + { + "epoch": 3.55252849002849, + "grad_norm": 0.8451266884803772, + "learning_rate": 6.142408632033836e-06, + "loss": 0.6856, + "step": 19954 + }, + { + "epoch": 3.5527065527065527, + "grad_norm": 0.7988196611404419, + "learning_rate": 6.137579404263627e-06, + "loss": 0.6, + "step": 19955 + }, + { + "epoch": 3.5528846153846154, + "grad_norm": 0.9842251539230347, + "learning_rate": 6.132752015532572e-06, + "loss": 0.9265, + "step": 19956 + }, + { + "epoch": 3.553062678062678, + "grad_norm": 1.0470566749572754, + "learning_rate": 6.127926465935252e-06, + "loss": 0.7726, + "step": 19957 + }, + { + "epoch": 3.5532407407407405, + "grad_norm": 0.9638791084289551, + "learning_rate": 6.1231027555662125e-06, + "loss": 0.8386, + "step": 19958 + }, + { + "epoch": 3.5534188034188032, + "grad_norm": 0.9695351123809814, + "learning_rate": 6.1182808845199665e-06, + "loss": 0.8146, + "step": 19959 + }, + { + "epoch": 3.553596866096866, + "grad_norm": 1.1904405355453491, + "learning_rate": 6.1134608528909735e-06, + "loss": 0.8364, + 
"step": 19960 + }, + { + "epoch": 3.5537749287749287, + "grad_norm": 0.9649089574813843, + "learning_rate": 6.108642660773678e-06, + "loss": 0.5882, + "step": 19961 + }, + { + "epoch": 3.5539529914529915, + "grad_norm": 1.0587506294250488, + "learning_rate": 6.103826308262517e-06, + "loss": 0.6633, + "step": 19962 + }, + { + "epoch": 3.5541310541310542, + "grad_norm": 0.9397716522216797, + "learning_rate": 6.099011795451792e-06, + "loss": 0.7343, + "step": 19963 + }, + { + "epoch": 3.554309116809117, + "grad_norm": 0.8498682975769043, + "learning_rate": 6.094199122435895e-06, + "loss": 0.6133, + "step": 19964 + }, + { + "epoch": 3.5544871794871797, + "grad_norm": 1.1127020120620728, + "learning_rate": 6.089388289309073e-06, + "loss": 0.9287, + "step": 19965 + }, + { + "epoch": 3.554665242165242, + "grad_norm": 0.9622563719749451, + "learning_rate": 6.084579296165605e-06, + "loss": 0.6882, + "step": 19966 + }, + { + "epoch": 3.554843304843305, + "grad_norm": 1.1055150032043457, + "learning_rate": 6.079772143099726e-06, + "loss": 0.7718, + "step": 19967 + }, + { + "epoch": 3.5550213675213675, + "grad_norm": 0.9930889010429382, + "learning_rate": 6.074966830205608e-06, + "loss": 0.6884, + "step": 19968 + }, + { + "epoch": 3.5551994301994303, + "grad_norm": 0.8389014601707458, + "learning_rate": 6.070163357577408e-06, + "loss": 0.6328, + "step": 19969 + }, + { + "epoch": 3.5553774928774926, + "grad_norm": 0.8436742424964905, + "learning_rate": 6.065361725309227e-06, + "loss": 0.7243, + "step": 19970 + }, + { + "epoch": 3.5555555555555554, + "grad_norm": 0.8762266635894775, + "learning_rate": 6.060561933495135e-06, + "loss": 0.8242, + "step": 19971 + }, + { + "epoch": 3.555733618233618, + "grad_norm": 1.049564003944397, + "learning_rate": 6.055763982229224e-06, + "loss": 0.7959, + "step": 19972 + }, + { + "epoch": 3.555911680911681, + "grad_norm": 0.8714830279350281, + "learning_rate": 6.050967871605429e-06, + "loss": 0.7953, + "step": 19973 + }, + { + "epoch": 
3.5560897435897436, + "grad_norm": 1.0046193599700928, + "learning_rate": 6.046173601717775e-06, + "loss": 1.0607, + "step": 19974 + }, + { + "epoch": 3.5562678062678064, + "grad_norm": 0.8975396156311035, + "learning_rate": 6.041381172660177e-06, + "loss": 0.66, + "step": 19975 + }, + { + "epoch": 3.556445868945869, + "grad_norm": 1.0665318965911865, + "learning_rate": 6.036590584526525e-06, + "loss": 0.8612, + "step": 19976 + }, + { + "epoch": 3.556623931623932, + "grad_norm": 0.908662736415863, + "learning_rate": 6.031801837410711e-06, + "loss": 0.6435, + "step": 19977 + }, + { + "epoch": 3.556801994301994, + "grad_norm": 0.9590704441070557, + "learning_rate": 6.027014931406505e-06, + "loss": 0.6757, + "step": 19978 + }, + { + "epoch": 3.556980056980057, + "grad_norm": 1.0787307024002075, + "learning_rate": 6.022229866607765e-06, + "loss": 0.8774, + "step": 19979 + }, + { + "epoch": 3.5571581196581197, + "grad_norm": 0.9405365586280823, + "learning_rate": 6.017446643108171e-06, + "loss": 0.7256, + "step": 19980 + }, + { + "epoch": 3.5573361823361824, + "grad_norm": 1.0895098447799683, + "learning_rate": 6.0126652610015045e-06, + "loss": 0.8127, + "step": 19981 + }, + { + "epoch": 3.557514245014245, + "grad_norm": 0.8931481838226318, + "learning_rate": 6.007885720381412e-06, + "loss": 0.6243, + "step": 19982 + }, + { + "epoch": 3.5576923076923075, + "grad_norm": 0.9177520275115967, + "learning_rate": 6.003108021341553e-06, + "loss": 0.7058, + "step": 19983 + }, + { + "epoch": 3.5578703703703702, + "grad_norm": 1.112932801246643, + "learning_rate": 5.998332163975528e-06, + "loss": 0.8612, + "step": 19984 + }, + { + "epoch": 3.558048433048433, + "grad_norm": 1.0220192670822144, + "learning_rate": 5.99355814837691e-06, + "loss": 0.7389, + "step": 19985 + }, + { + "epoch": 3.5582264957264957, + "grad_norm": 0.9819709062576294, + "learning_rate": 5.988785974639232e-06, + "loss": 0.7656, + "step": 19986 + }, + { + "epoch": 3.5584045584045585, + "grad_norm": 
1.1695373058319092, + "learning_rate": 5.984015642856022e-06, + "loss": 0.959, + "step": 19987 + }, + { + "epoch": 3.5585826210826212, + "grad_norm": 0.9889600276947021, + "learning_rate": 5.979247153120693e-06, + "loss": 0.8078, + "step": 19988 + }, + { + "epoch": 3.558760683760684, + "grad_norm": 1.030166506767273, + "learning_rate": 5.974480505526747e-06, + "loss": 0.8582, + "step": 19989 + }, + { + "epoch": 3.5589387464387463, + "grad_norm": 0.9108613729476929, + "learning_rate": 5.9697157001674885e-06, + "loss": 0.8143, + "step": 19990 + }, + { + "epoch": 3.559116809116809, + "grad_norm": 0.9014456868171692, + "learning_rate": 5.964952737136353e-06, + "loss": 0.7242, + "step": 19991 + }, + { + "epoch": 3.559294871794872, + "grad_norm": 0.9596982598304749, + "learning_rate": 5.960191616526612e-06, + "loss": 0.7093, + "step": 19992 + }, + { + "epoch": 3.5594729344729346, + "grad_norm": 0.9499574899673462, + "learning_rate": 5.955432338431555e-06, + "loss": 0.6325, + "step": 19993 + }, + { + "epoch": 3.5596509971509973, + "grad_norm": 0.9167241454124451, + "learning_rate": 5.950674902944475e-06, + "loss": 0.7084, + "step": 19994 + }, + { + "epoch": 3.5598290598290596, + "grad_norm": 0.8762951493263245, + "learning_rate": 5.945919310158521e-06, + "loss": 0.6922, + "step": 19995 + }, + { + "epoch": 3.5600071225071224, + "grad_norm": 1.0726141929626465, + "learning_rate": 5.941165560166906e-06, + "loss": 0.7219, + "step": 19996 + }, + { + "epoch": 3.560185185185185, + "grad_norm": 1.2705079317092896, + "learning_rate": 5.936413653062767e-06, + "loss": 0.9296, + "step": 19997 + }, + { + "epoch": 3.560363247863248, + "grad_norm": 0.9176344275474548, + "learning_rate": 5.931663588939196e-06, + "loss": 0.8195, + "step": 19998 + }, + { + "epoch": 3.5605413105413106, + "grad_norm": 1.0315814018249512, + "learning_rate": 5.926915367889274e-06, + "loss": 0.8144, + "step": 19999 + }, + { + "epoch": 3.5607193732193734, + "grad_norm": 0.9186922907829285, + "learning_rate": 
5.922168990006016e-06, + "loss": 0.8895, + "step": 20000 + }, + { + "epoch": 3.560897435897436, + "grad_norm": 1.0576261281967163, + "learning_rate": 5.917424455382437e-06, + "loss": 0.9221, + "step": 20001 + }, + { + "epoch": 3.5610754985754984, + "grad_norm": 0.8663935661315918, + "learning_rate": 5.912681764111483e-06, + "loss": 0.6516, + "step": 20002 + }, + { + "epoch": 3.561253561253561, + "grad_norm": 0.9327130913734436, + "learning_rate": 5.907940916286059e-06, + "loss": 0.7252, + "step": 20003 + }, + { + "epoch": 3.561431623931624, + "grad_norm": 0.9112497568130493, + "learning_rate": 5.903201911999112e-06, + "loss": 0.6984, + "step": 20004 + }, + { + "epoch": 3.5616096866096867, + "grad_norm": 0.8391700983047485, + "learning_rate": 5.8984647513434245e-06, + "loss": 0.7863, + "step": 20005 + }, + { + "epoch": 3.5617877492877494, + "grad_norm": 0.9279758334159851, + "learning_rate": 5.893729434411854e-06, + "loss": 0.7333, + "step": 20006 + }, + { + "epoch": 3.5619658119658117, + "grad_norm": 0.9002404808998108, + "learning_rate": 5.888995961297161e-06, + "loss": 0.557, + "step": 20007 + }, + { + "epoch": 3.5621438746438745, + "grad_norm": 0.9967086315155029, + "learning_rate": 5.884264332092104e-06, + "loss": 0.6941, + "step": 20008 + }, + { + "epoch": 3.5623219373219372, + "grad_norm": 1.030300498008728, + "learning_rate": 5.879534546889365e-06, + "loss": 0.7942, + "step": 20009 + }, + { + "epoch": 3.5625, + "grad_norm": 0.9536016583442688, + "learning_rate": 5.8748066057816354e-06, + "loss": 0.7312, + "step": 20010 + }, + { + "epoch": 3.5626780626780628, + "grad_norm": 1.1330475807189941, + "learning_rate": 5.870080508861531e-06, + "loss": 0.7598, + "step": 20011 + }, + { + "epoch": 3.5628561253561255, + "grad_norm": 0.9957557916641235, + "learning_rate": 5.865356256221688e-06, + "loss": 0.846, + "step": 20012 + }, + { + "epoch": 3.5630341880341883, + "grad_norm": 1.0001511573791504, + "learning_rate": 5.860633847954611e-06, + "loss": 0.7996, + "step": 
20013 + }, + { + "epoch": 3.5632122507122506, + "grad_norm": 1.0668739080429077, + "learning_rate": 5.8559132841528696e-06, + "loss": 0.7621, + "step": 20014 + }, + { + "epoch": 3.5633903133903133, + "grad_norm": 1.0167616605758667, + "learning_rate": 5.851194564908946e-06, + "loss": 0.8459, + "step": 20015 + }, + { + "epoch": 3.563568376068376, + "grad_norm": 1.017533302307129, + "learning_rate": 5.8464776903152775e-06, + "loss": 0.4954, + "step": 20016 + }, + { + "epoch": 3.563746438746439, + "grad_norm": 1.1081538200378418, + "learning_rate": 5.8417626604643005e-06, + "loss": 0.8442, + "step": 20017 + }, + { + "epoch": 3.5639245014245016, + "grad_norm": 0.9077942967414856, + "learning_rate": 5.837049475448375e-06, + "loss": 0.8122, + "step": 20018 + }, + { + "epoch": 3.564102564102564, + "grad_norm": 0.8072832822799683, + "learning_rate": 5.832338135359883e-06, + "loss": 0.6718, + "step": 20019 + }, + { + "epoch": 3.5642806267806266, + "grad_norm": 0.928887665271759, + "learning_rate": 5.8276286402910715e-06, + "loss": 0.7962, + "step": 20020 + }, + { + "epoch": 3.5644586894586894, + "grad_norm": 0.9171196818351746, + "learning_rate": 5.82292099033428e-06, + "loss": 0.7408, + "step": 20021 + }, + { + "epoch": 3.564636752136752, + "grad_norm": 0.900301992893219, + "learning_rate": 5.818215185581699e-06, + "loss": 0.7775, + "step": 20022 + }, + { + "epoch": 3.564814814814815, + "grad_norm": 1.0731996297836304, + "learning_rate": 5.813511226125557e-06, + "loss": 0.8732, + "step": 20023 + }, + { + "epoch": 3.5649928774928776, + "grad_norm": 1.1355241537094116, + "learning_rate": 5.808809112058011e-06, + "loss": 0.7019, + "step": 20024 + }, + { + "epoch": 3.5651709401709404, + "grad_norm": 0.8540470600128174, + "learning_rate": 5.804108843471179e-06, + "loss": 0.6102, + "step": 20025 + }, + { + "epoch": 3.5653490028490027, + "grad_norm": 0.9603937268257141, + "learning_rate": 5.799410420457163e-06, + "loss": 0.7485, + "step": 20026 + }, + { + "epoch": 
3.5655270655270654, + "grad_norm": 0.9940163493156433, + "learning_rate": 5.794713843108013e-06, + "loss": 0.7051, + "step": 20027 + }, + { + "epoch": 3.565705128205128, + "grad_norm": 0.9198596477508545, + "learning_rate": 5.790019111515732e-06, + "loss": 0.7189, + "step": 20028 + }, + { + "epoch": 3.565883190883191, + "grad_norm": 0.8233235478401184, + "learning_rate": 5.7853262257723476e-06, + "loss": 0.5998, + "step": 20029 + }, + { + "epoch": 3.5660612535612537, + "grad_norm": 1.0005290508270264, + "learning_rate": 5.780635185969762e-06, + "loss": 0.7284, + "step": 20030 + }, + { + "epoch": 3.566239316239316, + "grad_norm": 0.8798156380653381, + "learning_rate": 5.775945992199916e-06, + "loss": 0.6115, + "step": 20031 + }, + { + "epoch": 3.5664173789173788, + "grad_norm": 0.9834653735160828, + "learning_rate": 5.771258644554678e-06, + "loss": 0.6293, + "step": 20032 + }, + { + "epoch": 3.5665954415954415, + "grad_norm": 0.929243266582489, + "learning_rate": 5.7665731431258755e-06, + "loss": 0.7863, + "step": 20033 + }, + { + "epoch": 3.5667735042735043, + "grad_norm": 1.131831169128418, + "learning_rate": 5.7618894880053344e-06, + "loss": 0.8469, + "step": 20034 + }, + { + "epoch": 3.566951566951567, + "grad_norm": 0.9003351330757141, + "learning_rate": 5.757207679284782e-06, + "loss": 0.6531, + "step": 20035 + }, + { + "epoch": 3.5671296296296298, + "grad_norm": 0.9676564931869507, + "learning_rate": 5.752527717056011e-06, + "loss": 0.7791, + "step": 20036 + }, + { + "epoch": 3.5673076923076925, + "grad_norm": 0.9569422602653503, + "learning_rate": 5.7478496014106374e-06, + "loss": 0.8359, + "step": 20037 + }, + { + "epoch": 3.567485754985755, + "grad_norm": 1.0955125093460083, + "learning_rate": 5.743173332440377e-06, + "loss": 0.6683, + "step": 20038 + }, + { + "epoch": 3.5676638176638176, + "grad_norm": 1.0164920091629028, + "learning_rate": 5.738498910236834e-06, + "loss": 0.7597, + "step": 20039 + }, + { + "epoch": 3.5678418803418803, + "grad_norm": 
0.977257490158081, + "learning_rate": 5.73382633489159e-06, + "loss": 0.8192, + "step": 20040 + }, + { + "epoch": 3.568019943019943, + "grad_norm": 0.9285514950752258, + "learning_rate": 5.729155606496195e-06, + "loss": 0.8168, + "step": 20041 + }, + { + "epoch": 3.568198005698006, + "grad_norm": 1.0951110124588013, + "learning_rate": 5.724486725142175e-06, + "loss": 0.8083, + "step": 20042 + }, + { + "epoch": 3.568376068376068, + "grad_norm": 0.9273848533630371, + "learning_rate": 5.71981969092098e-06, + "loss": 0.6098, + "step": 20043 + }, + { + "epoch": 3.568554131054131, + "grad_norm": 1.0327738523483276, + "learning_rate": 5.7151545039240915e-06, + "loss": 0.828, + "step": 20044 + }, + { + "epoch": 3.5687321937321936, + "grad_norm": 1.0657696723937988, + "learning_rate": 5.71049116424287e-06, + "loss": 0.787, + "step": 20045 + }, + { + "epoch": 3.5689102564102564, + "grad_norm": 0.9822142720222473, + "learning_rate": 5.7058296719687095e-06, + "loss": 0.8614, + "step": 20046 + }, + { + "epoch": 3.569088319088319, + "grad_norm": 0.8666147589683533, + "learning_rate": 5.701170027192948e-06, + "loss": 0.6601, + "step": 20047 + }, + { + "epoch": 3.569266381766382, + "grad_norm": 0.9923439621925354, + "learning_rate": 5.696512230006867e-06, + "loss": 0.7775, + "step": 20048 + }, + { + "epoch": 3.5694444444444446, + "grad_norm": 0.9324108958244324, + "learning_rate": 5.691856280501728e-06, + "loss": 0.7463, + "step": 20049 + }, + { + "epoch": 3.5696225071225074, + "grad_norm": 0.9872146844863892, + "learning_rate": 5.687202178768758e-06, + "loss": 0.6716, + "step": 20050 + }, + { + "epoch": 3.5698005698005697, + "grad_norm": 1.0207147598266602, + "learning_rate": 5.682549924899139e-06, + "loss": 0.7594, + "step": 20051 + }, + { + "epoch": 3.5699786324786325, + "grad_norm": 1.0404084920883179, + "learning_rate": 5.677899518984031e-06, + "loss": 0.7924, + "step": 20052 + }, + { + "epoch": 3.570156695156695, + "grad_norm": 0.996023416519165, + "learning_rate": 
5.673250961114529e-06, + "loss": 0.7175, + "step": 20053 + }, + { + "epoch": 3.570334757834758, + "grad_norm": 0.8972271084785461, + "learning_rate": 5.668604251381748e-06, + "loss": 0.7454, + "step": 20054 + }, + { + "epoch": 3.5705128205128203, + "grad_norm": 0.9665964841842651, + "learning_rate": 5.663959389876705e-06, + "loss": 0.8127, + "step": 20055 + }, + { + "epoch": 3.570690883190883, + "grad_norm": 0.9740979075431824, + "learning_rate": 5.659316376690416e-06, + "loss": 0.9236, + "step": 20056 + }, + { + "epoch": 3.5708689458689458, + "grad_norm": 0.972089946269989, + "learning_rate": 5.65467521191384e-06, + "loss": 0.8145, + "step": 20057 + }, + { + "epoch": 3.5710470085470085, + "grad_norm": 0.9361711740493774, + "learning_rate": 5.650035895637906e-06, + "loss": 0.665, + "step": 20058 + }, + { + "epoch": 3.5712250712250713, + "grad_norm": 0.9758422374725342, + "learning_rate": 5.645398427953552e-06, + "loss": 0.7318, + "step": 20059 + }, + { + "epoch": 3.571403133903134, + "grad_norm": 0.8265459537506104, + "learning_rate": 5.640762808951583e-06, + "loss": 0.6444, + "step": 20060 + }, + { + "epoch": 3.5715811965811968, + "grad_norm": 1.0095800161361694, + "learning_rate": 5.6361290387228814e-06, + "loss": 0.8519, + "step": 20061 + }, + { + "epoch": 3.5717592592592595, + "grad_norm": 0.9909504055976868, + "learning_rate": 5.631497117358175e-06, + "loss": 0.8529, + "step": 20062 + }, + { + "epoch": 3.571937321937322, + "grad_norm": 0.7235755920410156, + "learning_rate": 5.62686704494827e-06, + "loss": 0.5298, + "step": 20063 + }, + { + "epoch": 3.5721153846153846, + "grad_norm": 0.991928219795227, + "learning_rate": 5.622238821583858e-06, + "loss": 0.6757, + "step": 20064 + }, + { + "epoch": 3.5722934472934473, + "grad_norm": 0.9465965628623962, + "learning_rate": 5.617612447355624e-06, + "loss": 0.6856, + "step": 20065 + }, + { + "epoch": 3.57247150997151, + "grad_norm": 0.9528710246086121, + "learning_rate": 5.612987922354207e-06, + "loss": 0.758, + 
"step": 20066 + }, + { + "epoch": 3.5726495726495724, + "grad_norm": 1.0337190628051758, + "learning_rate": 5.608365246670233e-06, + "loss": 0.742, + "step": 20067 + }, + { + "epoch": 3.572827635327635, + "grad_norm": 0.9663426876068115, + "learning_rate": 5.60374442039423e-06, + "loss": 0.7674, + "step": 20068 + }, + { + "epoch": 3.573005698005698, + "grad_norm": 0.7379756569862366, + "learning_rate": 5.599125443616804e-06, + "loss": 0.516, + "step": 20069 + }, + { + "epoch": 3.5731837606837606, + "grad_norm": 0.932037889957428, + "learning_rate": 5.5945083164283815e-06, + "loss": 0.7166, + "step": 20070 + }, + { + "epoch": 3.5733618233618234, + "grad_norm": 0.9008628726005554, + "learning_rate": 5.5898930389194915e-06, + "loss": 0.7829, + "step": 20071 + }, + { + "epoch": 3.573539886039886, + "grad_norm": 0.8821158409118652, + "learning_rate": 5.585279611180494e-06, + "loss": 0.8116, + "step": 20072 + }, + { + "epoch": 3.573717948717949, + "grad_norm": 0.9247183203697205, + "learning_rate": 5.580668033301817e-06, + "loss": 0.8132, + "step": 20073 + }, + { + "epoch": 3.5738960113960117, + "grad_norm": 0.983182430267334, + "learning_rate": 5.576058305373822e-06, + "loss": 0.6751, + "step": 20074 + }, + { + "epoch": 3.574074074074074, + "grad_norm": 0.9759270548820496, + "learning_rate": 5.5714504274868015e-06, + "loss": 0.7332, + "step": 20075 + }, + { + "epoch": 3.5742521367521367, + "grad_norm": 1.0891354084014893, + "learning_rate": 5.566844399731086e-06, + "loss": 0.7377, + "step": 20076 + }, + { + "epoch": 3.5744301994301995, + "grad_norm": 1.0494391918182373, + "learning_rate": 5.5622402221968465e-06, + "loss": 0.8022, + "step": 20077 + }, + { + "epoch": 3.574608262108262, + "grad_norm": 0.9215951561927795, + "learning_rate": 5.557637894974354e-06, + "loss": 0.695, + "step": 20078 + }, + { + "epoch": 3.5747863247863245, + "grad_norm": 1.1212730407714844, + "learning_rate": 5.55303741815375e-06, + "loss": 0.8315, + "step": 20079 + }, + { + "epoch": 
3.5749643874643873, + "grad_norm": 0.816863477230072, + "learning_rate": 5.548438791825195e-06, + "loss": 0.9294, + "step": 20080 + }, + { + "epoch": 3.57514245014245, + "grad_norm": 0.9771558046340942, + "learning_rate": 5.543842016078771e-06, + "loss": 0.783, + "step": 20081 + }, + { + "epoch": 3.5753205128205128, + "grad_norm": 0.8157296776771545, + "learning_rate": 5.53924709100454e-06, + "loss": 0.572, + "step": 20082 + }, + { + "epoch": 3.5754985754985755, + "grad_norm": 0.8209017515182495, + "learning_rate": 5.5346540166925305e-06, + "loss": 0.5762, + "step": 20083 + }, + { + "epoch": 3.5756766381766383, + "grad_norm": 0.9781062006950378, + "learning_rate": 5.530062793232771e-06, + "loss": 0.8179, + "step": 20084 + }, + { + "epoch": 3.575854700854701, + "grad_norm": 0.874980092048645, + "learning_rate": 5.525473420715155e-06, + "loss": 0.6966, + "step": 20085 + }, + { + "epoch": 3.576032763532764, + "grad_norm": 0.9744297862052917, + "learning_rate": 5.520885899229655e-06, + "loss": 0.9015, + "step": 20086 + }, + { + "epoch": 3.576210826210826, + "grad_norm": 0.8454684615135193, + "learning_rate": 5.516300228866111e-06, + "loss": 0.5204, + "step": 20087 + }, + { + "epoch": 3.576388888888889, + "grad_norm": 0.8786317110061646, + "learning_rate": 5.511716409714407e-06, + "loss": 0.7868, + "step": 20088 + }, + { + "epoch": 3.5765669515669516, + "grad_norm": 0.8432426452636719, + "learning_rate": 5.507134441864337e-06, + "loss": 0.6851, + "step": 20089 + }, + { + "epoch": 3.5767450142450143, + "grad_norm": 1.0218333005905151, + "learning_rate": 5.502554325405674e-06, + "loss": 0.836, + "step": 20090 + }, + { + "epoch": 3.5769230769230766, + "grad_norm": 0.8945069909095764, + "learning_rate": 5.497976060428156e-06, + "loss": 0.593, + "step": 20091 + }, + { + "epoch": 3.5771011396011394, + "grad_norm": 0.9170123934745789, + "learning_rate": 5.493399647021469e-06, + "loss": 0.8404, + "step": 20092 + }, + { + "epoch": 3.577279202279202, + "grad_norm": 
0.8489331603050232, + "learning_rate": 5.488825085275317e-06, + "loss": 0.7002, + "step": 20093 + }, + { + "epoch": 3.577457264957265, + "grad_norm": 0.9071032404899597, + "learning_rate": 5.484252375279297e-06, + "loss": 0.79, + "step": 20094 + }, + { + "epoch": 3.5776353276353277, + "grad_norm": 1.0012317895889282, + "learning_rate": 5.479681517123026e-06, + "loss": 0.6822, + "step": 20095 + }, + { + "epoch": 3.5778133903133904, + "grad_norm": 0.9741858839988708, + "learning_rate": 5.475112510896041e-06, + "loss": 0.7569, + "step": 20096 + }, + { + "epoch": 3.577991452991453, + "grad_norm": 0.8876017928123474, + "learning_rate": 5.470545356687862e-06, + "loss": 0.8028, + "step": 20097 + }, + { + "epoch": 3.578169515669516, + "grad_norm": 0.8617599606513977, + "learning_rate": 5.465980054587983e-06, + "loss": 0.7123, + "step": 20098 + }, + { + "epoch": 3.578347578347578, + "grad_norm": 0.844270646572113, + "learning_rate": 5.461416604685854e-06, + "loss": 0.6884, + "step": 20099 + }, + { + "epoch": 3.578525641025641, + "grad_norm": 0.9308794140815735, + "learning_rate": 5.45685500707086e-06, + "loss": 0.6827, + "step": 20100 + }, + { + "epoch": 3.5787037037037037, + "grad_norm": 0.8432201743125916, + "learning_rate": 5.452295261832418e-06, + "loss": 0.7057, + "step": 20101 + }, + { + "epoch": 3.5788817663817665, + "grad_norm": 1.0532596111297607, + "learning_rate": 5.447737369059824e-06, + "loss": 0.7688, + "step": 20102 + }, + { + "epoch": 3.5790598290598292, + "grad_norm": 0.8805003762245178, + "learning_rate": 5.443181328842417e-06, + "loss": 0.6596, + "step": 20103 + }, + { + "epoch": 3.5792378917378915, + "grad_norm": 0.8240517973899841, + "learning_rate": 5.438627141269437e-06, + "loss": 0.5545, + "step": 20104 + }, + { + "epoch": 3.5794159544159543, + "grad_norm": 0.9336821436882019, + "learning_rate": 5.434074806430134e-06, + "loss": 0.7931, + "step": 20105 + }, + { + "epoch": 3.579594017094017, + "grad_norm": 0.978220522403717, + "learning_rate": 
5.429524324413693e-06, + "loss": 0.9601, + "step": 20106 + }, + { + "epoch": 3.57977207977208, + "grad_norm": 0.9296578764915466, + "learning_rate": 5.424975695309265e-06, + "loss": 0.8462, + "step": 20107 + }, + { + "epoch": 3.5799501424501425, + "grad_norm": 0.9612907767295837, + "learning_rate": 5.4204289192059664e-06, + "loss": 0.6046, + "step": 20108 + }, + { + "epoch": 3.5801282051282053, + "grad_norm": 0.9216350317001343, + "learning_rate": 5.415883996192905e-06, + "loss": 0.7423, + "step": 20109 + }, + { + "epoch": 3.580306267806268, + "grad_norm": 0.9715888500213623, + "learning_rate": 5.4113409263590985e-06, + "loss": 0.6903, + "step": 20110 + }, + { + "epoch": 3.5804843304843303, + "grad_norm": 0.9958703517913818, + "learning_rate": 5.406799709793597e-06, + "loss": 0.7843, + "step": 20111 + }, + { + "epoch": 3.580662393162393, + "grad_norm": 0.9611614942550659, + "learning_rate": 5.40226034658533e-06, + "loss": 0.7979, + "step": 20112 + }, + { + "epoch": 3.580840455840456, + "grad_norm": 0.9550825953483582, + "learning_rate": 5.397722836823282e-06, + "loss": 0.7704, + "step": 20113 + }, + { + "epoch": 3.5810185185185186, + "grad_norm": 0.8597258925437927, + "learning_rate": 5.393187180596326e-06, + "loss": 1.0874, + "step": 20114 + }, + { + "epoch": 3.5811965811965814, + "grad_norm": 1.0289723873138428, + "learning_rate": 5.388653377993324e-06, + "loss": 0.7821, + "step": 20115 + }, + { + "epoch": 3.5813746438746437, + "grad_norm": 1.0703620910644531, + "learning_rate": 5.384121429103161e-06, + "loss": 0.7892, + "step": 20116 + }, + { + "epoch": 3.5815527065527064, + "grad_norm": 1.0069445371627808, + "learning_rate": 5.379591334014556e-06, + "loss": 0.6241, + "step": 20117 + }, + { + "epoch": 3.581730769230769, + "grad_norm": 1.050201654434204, + "learning_rate": 5.375063092816313e-06, + "loss": 0.5873, + "step": 20118 + }, + { + "epoch": 3.581908831908832, + "grad_norm": 0.9159594774246216, + "learning_rate": 5.370536705597151e-06, + "loss": 0.7451, + 
"step": 20119 + }, + { + "epoch": 3.5820868945868947, + "grad_norm": 0.9135583639144897, + "learning_rate": 5.366012172445734e-06, + "loss": 0.6258, + "step": 20120 + }, + { + "epoch": 3.5822649572649574, + "grad_norm": 0.9547392725944519, + "learning_rate": 5.3614894934507335e-06, + "loss": 0.8267, + "step": 20121 + }, + { + "epoch": 3.58244301994302, + "grad_norm": 0.9301168918609619, + "learning_rate": 5.356968668700746e-06, + "loss": 0.6885, + "step": 20122 + }, + { + "epoch": 3.5826210826210825, + "grad_norm": 0.9445184469223022, + "learning_rate": 5.352449698284356e-06, + "loss": 0.6854, + "step": 20123 + }, + { + "epoch": 3.5827991452991452, + "grad_norm": 0.9877057075500488, + "learning_rate": 5.347932582290105e-06, + "loss": 0.9167, + "step": 20124 + }, + { + "epoch": 3.582977207977208, + "grad_norm": 0.8706746697425842, + "learning_rate": 5.343417320806477e-06, + "loss": 0.8327, + "step": 20125 + }, + { + "epoch": 3.5831552706552707, + "grad_norm": 0.9573389887809753, + "learning_rate": 5.3389039139219775e-06, + "loss": 0.7544, + "step": 20126 + }, + { + "epoch": 3.5833333333333335, + "grad_norm": 1.0821263790130615, + "learning_rate": 5.334392361724994e-06, + "loss": 0.6362, + "step": 20127 + }, + { + "epoch": 3.583511396011396, + "grad_norm": 1.0245816707611084, + "learning_rate": 5.329882664303943e-06, + "loss": 0.7596, + "step": 20128 + }, + { + "epoch": 3.5836894586894585, + "grad_norm": 0.9184259176254272, + "learning_rate": 5.325374821747197e-06, + "loss": 0.8477, + "step": 20129 + }, + { + "epoch": 3.5838675213675213, + "grad_norm": 0.983049213886261, + "learning_rate": 5.320868834143056e-06, + "loss": 0.7234, + "step": 20130 + }, + { + "epoch": 3.584045584045584, + "grad_norm": 0.9151159524917603, + "learning_rate": 5.316364701579801e-06, + "loss": 0.6523, + "step": 20131 + }, + { + "epoch": 3.584223646723647, + "grad_norm": 0.9754053950309753, + "learning_rate": 5.311862424145686e-06, + "loss": 0.7408, + "step": 20132 + }, + { + "epoch": 
3.5844017094017095, + "grad_norm": 0.8654192090034485, + "learning_rate": 5.307362001928961e-06, + "loss": 0.6214, + "step": 20133 + }, + { + "epoch": 3.5845797720797723, + "grad_norm": 0.9617530703544617, + "learning_rate": 5.302863435017735e-06, + "loss": 0.7405, + "step": 20134 + }, + { + "epoch": 3.5847578347578346, + "grad_norm": Infinity, + "learning_rate": 5.302863435017735e-06, + "loss": 0.6821, + "step": 20135 + }, + { + "epoch": 3.5849358974358974, + "grad_norm": 0.9110551476478577, + "learning_rate": 5.298366723500203e-06, + "loss": 0.6208, + "step": 20136 + }, + { + "epoch": 3.58511396011396, + "grad_norm": 1.0188283920288086, + "learning_rate": 5.29387186746445e-06, + "loss": 0.809, + "step": 20137 + }, + { + "epoch": 3.585292022792023, + "grad_norm": 1.1383823156356812, + "learning_rate": 5.28937886699854e-06, + "loss": 0.7143, + "step": 20138 + }, + { + "epoch": 3.5854700854700856, + "grad_norm": 1.1039072275161743, + "learning_rate": 5.284887722190501e-06, + "loss": 0.9603, + "step": 20139 + }, + { + "epoch": 3.585648148148148, + "grad_norm": 0.8831756711006165, + "learning_rate": 5.280398433128342e-06, + "loss": 0.6444, + "step": 20140 + }, + { + "epoch": 3.5858262108262107, + "grad_norm": 0.8430986404418945, + "learning_rate": 5.2759109999000025e-06, + "loss": 0.6404, + "step": 20141 + }, + { + "epoch": 3.5860042735042734, + "grad_norm": 0.9114181399345398, + "learning_rate": 5.271425422593435e-06, + "loss": 0.7964, + "step": 20142 + }, + { + "epoch": 3.586182336182336, + "grad_norm": 1.0493398904800415, + "learning_rate": 5.266941701296479e-06, + "loss": 0.8568, + "step": 20143 + }, + { + "epoch": 3.586360398860399, + "grad_norm": 0.8649964332580566, + "learning_rate": 5.262459836097033e-06, + "loss": 0.6505, + "step": 20144 + }, + { + "epoch": 3.5865384615384617, + "grad_norm": 0.8710708618164062, + "learning_rate": 5.257979827082859e-06, + "loss": 0.7117, + "step": 20145 + }, + { + "epoch": 3.5867165242165244, + "grad_norm": 1.1388016939163208, 
+ "learning_rate": 5.253501674341777e-06, + "loss": 0.9579, + "step": 20146 + }, + { + "epoch": 3.5868945868945867, + "grad_norm": 1.029822587966919, + "learning_rate": 5.249025377961514e-06, + "loss": 0.7996, + "step": 20147 + }, + { + "epoch": 3.5870726495726495, + "grad_norm": 0.9227847456932068, + "learning_rate": 5.244550938029768e-06, + "loss": 0.902, + "step": 20148 + }, + { + "epoch": 3.5872507122507122, + "grad_norm": 0.9261413216590881, + "learning_rate": 5.240078354634214e-06, + "loss": 0.7765, + "step": 20149 + }, + { + "epoch": 3.587428774928775, + "grad_norm": 0.9532697200775146, + "learning_rate": 5.235607627862471e-06, + "loss": 0.8003, + "step": 20150 + }, + { + "epoch": 3.5876068376068377, + "grad_norm": 0.9802052974700928, + "learning_rate": 5.231138757802134e-06, + "loss": 0.7542, + "step": 20151 + }, + { + "epoch": 3.5877849002849, + "grad_norm": 1.019360065460205, + "learning_rate": 5.22667174454079e-06, + "loss": 0.6551, + "step": 20152 + }, + { + "epoch": 3.587962962962963, + "grad_norm": 1.1361932754516602, + "learning_rate": 5.222206588165912e-06, + "loss": 0.6014, + "step": 20153 + }, + { + "epoch": 3.5881410256410255, + "grad_norm": 1.0577775239944458, + "learning_rate": 5.21774328876502e-06, + "loss": 0.7311, + "step": 20154 + }, + { + "epoch": 3.5883190883190883, + "grad_norm": 0.9792344570159912, + "learning_rate": 5.2132818464255664e-06, + "loss": 0.8125, + "step": 20155 + }, + { + "epoch": 3.588497150997151, + "grad_norm": 1.0200873613357544, + "learning_rate": 5.208822261234925e-06, + "loss": 0.8364, + "step": 20156 + }, + { + "epoch": 3.588675213675214, + "grad_norm": 0.8380966782569885, + "learning_rate": 5.204364533280537e-06, + "loss": 0.6492, + "step": 20157 + }, + { + "epoch": 3.5888532763532766, + "grad_norm": 1.0056066513061523, + "learning_rate": 5.199908662649677e-06, + "loss": 0.6339, + "step": 20158 + }, + { + "epoch": 3.5890313390313393, + "grad_norm": 1.0888465642929077, + "learning_rate": 5.19545464942971e-06, + 
"loss": 0.7766, + "step": 20159 + }, + { + "epoch": 3.5892094017094016, + "grad_norm": 0.9547351598739624, + "learning_rate": 5.19100249370783e-06, + "loss": 0.8142, + "step": 20160 + }, + { + "epoch": 3.5893874643874644, + "grad_norm": 1.0138623714447021, + "learning_rate": 5.186552195571326e-06, + "loss": 0.9173, + "step": 20161 + }, + { + "epoch": 3.589565527065527, + "grad_norm": 1.025498628616333, + "learning_rate": 5.18210375510737e-06, + "loss": 0.5887, + "step": 20162 + }, + { + "epoch": 3.58974358974359, + "grad_norm": 0.8190540075302124, + "learning_rate": 5.177657172403139e-06, + "loss": 0.6902, + "step": 20163 + }, + { + "epoch": 3.589921652421652, + "grad_norm": 1.0583090782165527, + "learning_rate": 5.173212447545728e-06, + "loss": 0.7482, + "step": 20164 + }, + { + "epoch": 3.590099715099715, + "grad_norm": 0.9414075613021851, + "learning_rate": 5.168769580622234e-06, + "loss": 0.7471, + "step": 20165 + }, + { + "epoch": 3.5902777777777777, + "grad_norm": 0.9538351893424988, + "learning_rate": 5.16432857171969e-06, + "loss": 0.6802, + "step": 20166 + }, + { + "epoch": 3.5904558404558404, + "grad_norm": 0.9829911589622498, + "learning_rate": 5.159889420925146e-06, + "loss": 0.7222, + "step": 20167 + }, + { + "epoch": 3.590633903133903, + "grad_norm": 1.1673390865325928, + "learning_rate": 5.155452128325544e-06, + "loss": 0.7527, + "step": 20168 + }, + { + "epoch": 3.590811965811966, + "grad_norm": 1.0132838487625122, + "learning_rate": 5.151016694007849e-06, + "loss": 0.7148, + "step": 20169 + }, + { + "epoch": 3.5909900284900287, + "grad_norm": 0.9084600210189819, + "learning_rate": 5.146583118058923e-06, + "loss": 0.6949, + "step": 20170 + }, + { + "epoch": 3.5911680911680914, + "grad_norm": 0.9238780736923218, + "learning_rate": 5.142151400565687e-06, + "loss": 0.7453, + "step": 20171 + }, + { + "epoch": 3.5913461538461537, + "grad_norm": 0.9433969259262085, + "learning_rate": 5.137721541614926e-06, + "loss": 0.7395, + "step": 20172 + }, + { + 
"epoch": 3.5915242165242165, + "grad_norm": 0.8848153948783875, + "learning_rate": 5.133293541293449e-06, + "loss": 0.7717, + "step": 20173 + }, + { + "epoch": 3.5917022792022792, + "grad_norm": 0.9557967782020569, + "learning_rate": 5.1288673996880425e-06, + "loss": 0.8706, + "step": 20174 + }, + { + "epoch": 3.591880341880342, + "grad_norm": 0.9979602694511414, + "learning_rate": 5.12444311688538e-06, + "loss": 0.6658, + "step": 20175 + }, + { + "epoch": 3.5920584045584043, + "grad_norm": 0.9105966687202454, + "learning_rate": 5.120020692972172e-06, + "loss": 0.6382, + "step": 20176 + }, + { + "epoch": 3.592236467236467, + "grad_norm": 0.8652068376541138, + "learning_rate": 5.11560012803507e-06, + "loss": 0.6003, + "step": 20177 + }, + { + "epoch": 3.59241452991453, + "grad_norm": 0.9879941940307617, + "learning_rate": 5.111181422160671e-06, + "loss": 0.7795, + "step": 20178 + }, + { + "epoch": 3.5925925925925926, + "grad_norm": 1.225643515586853, + "learning_rate": 5.106764575435563e-06, + "loss": 0.8514, + "step": 20179 + }, + { + "epoch": 3.5927706552706553, + "grad_norm": 0.9002955555915833, + "learning_rate": 5.102349587946275e-06, + "loss": 1.0394, + "step": 20180 + }, + { + "epoch": 3.592948717948718, + "grad_norm": 0.8426976203918457, + "learning_rate": 5.097936459779318e-06, + "loss": 0.7507, + "step": 20181 + }, + { + "epoch": 3.593126780626781, + "grad_norm": 0.9469396471977234, + "learning_rate": 5.093525191021154e-06, + "loss": 0.7158, + "step": 20182 + }, + { + "epoch": 3.5933048433048436, + "grad_norm": 0.8771377801895142, + "learning_rate": 5.0891157817581916e-06, + "loss": 0.6222, + "step": 20183 + }, + { + "epoch": 3.593482905982906, + "grad_norm": 1.0022752285003662, + "learning_rate": 5.084708232076873e-06, + "loss": 0.7088, + "step": 20184 + }, + { + "epoch": 3.5936609686609686, + "grad_norm": 0.9102463722229004, + "learning_rate": 5.080302542063508e-06, + "loss": 0.7738, + "step": 20185 + }, + { + "epoch": 3.5938390313390314, + "grad_norm": 
1.1719863414764404, + "learning_rate": 5.07589871180445e-06, + "loss": 0.8472, + "step": 20186 + }, + { + "epoch": 3.594017094017094, + "grad_norm": 0.9453331828117371, + "learning_rate": 5.071496741385961e-06, + "loss": 0.8113, + "step": 20187 + }, + { + "epoch": 3.5941951566951564, + "grad_norm": 0.9310576915740967, + "learning_rate": 5.0670966308943076e-06, + "loss": 0.7145, + "step": 20188 + }, + { + "epoch": 3.594373219373219, + "grad_norm": 0.933391809463501, + "learning_rate": 5.062698380415687e-06, + "loss": 0.8318, + "step": 20189 + }, + { + "epoch": 3.594551282051282, + "grad_norm": 1.0447312593460083, + "learning_rate": 5.0583019900362625e-06, + "loss": 0.7597, + "step": 20190 + }, + { + "epoch": 3.5947293447293447, + "grad_norm": 0.8405809998512268, + "learning_rate": 5.053907459842189e-06, + "loss": 0.5576, + "step": 20191 + }, + { + "epoch": 3.5949074074074074, + "grad_norm": 0.8650723099708557, + "learning_rate": 5.0495147899195736e-06, + "loss": 0.589, + "step": 20192 + }, + { + "epoch": 3.59508547008547, + "grad_norm": 0.8867887854576111, + "learning_rate": 5.045123980354472e-06, + "loss": 0.7514, + "step": 20193 + }, + { + "epoch": 3.595263532763533, + "grad_norm": 0.9023543000221252, + "learning_rate": 5.0407350312329125e-06, + "loss": 0.6218, + "step": 20194 + }, + { + "epoch": 3.5954415954415957, + "grad_norm": 0.9571815729141235, + "learning_rate": 5.036347942640896e-06, + "loss": 0.7849, + "step": 20195 + }, + { + "epoch": 3.595619658119658, + "grad_norm": 0.9155046343803406, + "learning_rate": 5.031962714664362e-06, + "loss": 0.6944, + "step": 20196 + }, + { + "epoch": 3.5957977207977208, + "grad_norm": 0.9272841215133667, + "learning_rate": 5.027579347389244e-06, + "loss": 0.7431, + "step": 20197 + }, + { + "epoch": 3.5959757834757835, + "grad_norm": 1.0354583263397217, + "learning_rate": 5.023197840901417e-06, + "loss": 0.9811, + "step": 20198 + }, + { + "epoch": 3.5961538461538463, + "grad_norm": 1.0272680521011353, + "learning_rate": 
5.018818195286745e-06, + "loss": 0.7009, + "step": 20199 + }, + { + "epoch": 3.5963319088319086, + "grad_norm": 0.9123852252960205, + "learning_rate": 5.014440410631005e-06, + "loss": 0.851, + "step": 20200 + }, + { + "epoch": 3.5965099715099713, + "grad_norm": 1.0875011682510376, + "learning_rate": 5.010064487019994e-06, + "loss": 0.587, + "step": 20201 + }, + { + "epoch": 3.596688034188034, + "grad_norm": 0.9781659841537476, + "learning_rate": 5.005690424539455e-06, + "loss": 0.7736, + "step": 20202 + }, + { + "epoch": 3.596866096866097, + "grad_norm": 0.9043481945991516, + "learning_rate": 5.001318223275075e-06, + "loss": 0.7778, + "step": 20203 + }, + { + "epoch": 3.5970441595441596, + "grad_norm": 0.9980285167694092, + "learning_rate": 4.996947883312519e-06, + "loss": 0.8251, + "step": 20204 + }, + { + "epoch": 3.5972222222222223, + "grad_norm": 1.058321475982666, + "learning_rate": 4.992579404737419e-06, + "loss": 0.7323, + "step": 20205 + }, + { + "epoch": 3.597400284900285, + "grad_norm": 1.0859766006469727, + "learning_rate": 4.98821278763536e-06, + "loss": 0.9372, + "step": 20206 + }, + { + "epoch": 3.597578347578348, + "grad_norm": 0.9477382302284241, + "learning_rate": 4.983848032091909e-06, + "loss": 0.7893, + "step": 20207 + }, + { + "epoch": 3.59775641025641, + "grad_norm": 0.9508783221244812, + "learning_rate": 4.979485138192563e-06, + "loss": 0.7689, + "step": 20208 + }, + { + "epoch": 3.597934472934473, + "grad_norm": 1.0118316411972046, + "learning_rate": 4.975124106022844e-06, + "loss": 0.8543, + "step": 20209 + }, + { + "epoch": 3.5981125356125356, + "grad_norm": 0.8835347890853882, + "learning_rate": 4.970764935668137e-06, + "loss": 0.595, + "step": 20210 + }, + { + "epoch": 3.5982905982905984, + "grad_norm": 0.857542872428894, + "learning_rate": 4.966407627213909e-06, + "loss": 0.6382, + "step": 20211 + }, + { + "epoch": 3.5984686609686607, + "grad_norm": 1.1189042329788208, + "learning_rate": 4.9620521807455026e-06, + "loss": 0.7847, + 
"step": 20212 + }, + { + "epoch": 3.5986467236467234, + "grad_norm": 0.9164559841156006, + "learning_rate": 4.957698596348248e-06, + "loss": 0.784, + "step": 20213 + }, + { + "epoch": 3.598824786324786, + "grad_norm": 1.0209029912948608, + "learning_rate": 4.953346874107478e-06, + "loss": 0.9315, + "step": 20214 + }, + { + "epoch": 3.599002849002849, + "grad_norm": 0.8468409776687622, + "learning_rate": 4.948997014108414e-06, + "loss": 0.7392, + "step": 20215 + }, + { + "epoch": 3.5991809116809117, + "grad_norm": 0.8785325884819031, + "learning_rate": 4.944649016436331e-06, + "loss": 0.7619, + "step": 20216 + }, + { + "epoch": 3.5993589743589745, + "grad_norm": 0.9535296559333801, + "learning_rate": 4.940302881176351e-06, + "loss": 0.7385, + "step": 20217 + }, + { + "epoch": 3.599537037037037, + "grad_norm": 1.067630410194397, + "learning_rate": 4.935958608413693e-06, + "loss": 0.7811, + "step": 20218 + }, + { + "epoch": 3.5997150997151, + "grad_norm": 0.9168570041656494, + "learning_rate": 4.9316161982334355e-06, + "loss": 0.7221, + "step": 20219 + }, + { + "epoch": 3.5998931623931623, + "grad_norm": 0.9218630790710449, + "learning_rate": 4.927275650720686e-06, + "loss": 0.7633, + "step": 20220 + }, + { + "epoch": 3.600071225071225, + "grad_norm": 0.9325320720672607, + "learning_rate": 4.922936965960457e-06, + "loss": 0.9047, + "step": 20221 + }, + { + "epoch": 3.6002492877492878, + "grad_norm": 1.031030535697937, + "learning_rate": 4.918600144037788e-06, + "loss": 0.6517, + "step": 20222 + }, + { + "epoch": 3.6004273504273505, + "grad_norm": 0.9421409368515015, + "learning_rate": 4.914265185037614e-06, + "loss": 0.565, + "step": 20223 + }, + { + "epoch": 3.6006054131054133, + "grad_norm": 0.9039212465286255, + "learning_rate": 4.9099320890449106e-06, + "loss": 0.8056, + "step": 20224 + }, + { + "epoch": 3.6007834757834756, + "grad_norm": 1.0419108867645264, + "learning_rate": 4.9056008561445324e-06, + "loss": 0.8205, + "step": 20225 + }, + { + "epoch": 
3.6009615384615383, + "grad_norm": 0.9815367460250854, + "learning_rate": 4.901271486421388e-06, + "loss": 0.6823, + "step": 20226 + }, + { + "epoch": 3.601139601139601, + "grad_norm": 0.951187014579773, + "learning_rate": 4.896943979960267e-06, + "loss": 0.7703, + "step": 20227 + }, + { + "epoch": 3.601317663817664, + "grad_norm": 0.9774547219276428, + "learning_rate": 4.8926183368459775e-06, + "loss": 0.572, + "step": 20228 + }, + { + "epoch": 3.6014957264957266, + "grad_norm": 0.9099259376525879, + "learning_rate": 4.888294557163254e-06, + "loss": 0.7786, + "step": 20229 + }, + { + "epoch": 3.6016737891737893, + "grad_norm": 0.9812109470367432, + "learning_rate": 4.883972640996826e-06, + "loss": 0.7925, + "step": 20230 + }, + { + "epoch": 3.601851851851852, + "grad_norm": 0.8465882539749146, + "learning_rate": 4.8796525884313604e-06, + "loss": 0.6353, + "step": 20231 + }, + { + "epoch": 3.6020299145299144, + "grad_norm": 1.130205512046814, + "learning_rate": 4.875334399551501e-06, + "loss": 0.7415, + "step": 20232 + }, + { + "epoch": 3.602207977207977, + "grad_norm": 0.8592739105224609, + "learning_rate": 4.87101807444188e-06, + "loss": 0.5869, + "step": 20233 + }, + { + "epoch": 3.60238603988604, + "grad_norm": 0.7758101224899292, + "learning_rate": 4.86670361318704e-06, + "loss": 0.6109, + "step": 20234 + }, + { + "epoch": 3.6025641025641026, + "grad_norm": 1.0534182786941528, + "learning_rate": 4.862391015871515e-06, + "loss": 0.5944, + "step": 20235 + }, + { + "epoch": 3.6027421652421654, + "grad_norm": 0.9213957786560059, + "learning_rate": 4.858080282579813e-06, + "loss": 0.6962, + "step": 20236 + }, + { + "epoch": 3.6029202279202277, + "grad_norm": 1.1342556476593018, + "learning_rate": 4.85377141339638e-06, + "loss": 0.7288, + "step": 20237 + }, + { + "epoch": 3.6030982905982905, + "grad_norm": 0.8825157284736633, + "learning_rate": 4.849464408405646e-06, + "loss": 0.6383, + "step": 20238 + }, + { + "epoch": 3.603276353276353, + "grad_norm": 
1.0171782970428467, + "learning_rate": 4.845159267692012e-06, + "loss": 0.7471, + "step": 20239 + }, + { + "epoch": 3.603454415954416, + "grad_norm": 1.1089091300964355, + "learning_rate": 4.840855991339799e-06, + "loss": 0.9887, + "step": 20240 + }, + { + "epoch": 3.6036324786324787, + "grad_norm": 1.0146455764770508, + "learning_rate": 4.836554579433361e-06, + "loss": 0.7713, + "step": 20241 + }, + { + "epoch": 3.6038105413105415, + "grad_norm": 0.895252525806427, + "learning_rate": 4.8322550320569204e-06, + "loss": 0.5434, + "step": 20242 + }, + { + "epoch": 3.603988603988604, + "grad_norm": 0.9839693903923035, + "learning_rate": 4.827957349294765e-06, + "loss": 0.7696, + "step": 20243 + }, + { + "epoch": 3.6041666666666665, + "grad_norm": 0.9720914363861084, + "learning_rate": 4.823661531231083e-06, + "loss": 0.7884, + "step": 20244 + }, + { + "epoch": 3.6043447293447293, + "grad_norm": 0.9014876484870911, + "learning_rate": 4.81936757795004e-06, + "loss": 0.8938, + "step": 20245 + }, + { + "epoch": 3.604522792022792, + "grad_norm": 0.9114326238632202, + "learning_rate": 4.81507548953577e-06, + "loss": 0.8245, + "step": 20246 + }, + { + "epoch": 3.6047008547008548, + "grad_norm": 1.0024261474609375, + "learning_rate": 4.81078526607236e-06, + "loss": 0.7461, + "step": 20247 + }, + { + "epoch": 3.6048789173789175, + "grad_norm": 0.9544615149497986, + "learning_rate": 4.806496907643876e-06, + "loss": 0.7481, + "step": 20248 + }, + { + "epoch": 3.60505698005698, + "grad_norm": 0.8342941999435425, + "learning_rate": 4.802210414334351e-06, + "loss": 0.5793, + "step": 20249 + }, + { + "epoch": 3.6052350427350426, + "grad_norm": 0.9724514484405518, + "learning_rate": 4.79792578622773e-06, + "loss": 0.8259, + "step": 20250 + }, + { + "epoch": 3.6054131054131053, + "grad_norm": 1.0200597047805786, + "learning_rate": 4.7936430234080234e-06, + "loss": 0.8159, + "step": 20251 + }, + { + "epoch": 3.605591168091168, + "grad_norm": 0.982551097869873, + "learning_rate": 
4.789362125959073e-06, + "loss": 0.8049, + "step": 20252 + }, + { + "epoch": 3.605769230769231, + "grad_norm": 0.9515790343284607, + "learning_rate": 4.785083093964804e-06, + "loss": 0.721, + "step": 20253 + }, + { + "epoch": 3.6059472934472936, + "grad_norm": 0.9414631724357605, + "learning_rate": 4.7808059275090465e-06, + "loss": 0.8164, + "step": 20254 + }, + { + "epoch": 3.6061253561253563, + "grad_norm": 0.7917568683624268, + "learning_rate": 4.77653062667558e-06, + "loss": 0.6002, + "step": 20255 + }, + { + "epoch": 3.6063034188034186, + "grad_norm": 0.9543023705482483, + "learning_rate": 4.772257191548202e-06, + "loss": 0.7494, + "step": 20256 + }, + { + "epoch": 3.6064814814814814, + "grad_norm": 0.908807635307312, + "learning_rate": 4.7679856222106135e-06, + "loss": 0.7106, + "step": 20257 + }, + { + "epoch": 3.606659544159544, + "grad_norm": 1.0815712213516235, + "learning_rate": 4.763715918746525e-06, + "loss": 0.7924, + "step": 20258 + }, + { + "epoch": 3.606837606837607, + "grad_norm": 0.8257647156715393, + "learning_rate": 4.7594480812395925e-06, + "loss": 0.7398, + "step": 20259 + }, + { + "epoch": 3.6070156695156697, + "grad_norm": 0.8509652614593506, + "learning_rate": 4.755182109773426e-06, + "loss": 0.8162, + "step": 20260 + }, + { + "epoch": 3.607193732193732, + "grad_norm": 0.9860865473747253, + "learning_rate": 4.750918004431604e-06, + "loss": 0.974, + "step": 20261 + }, + { + "epoch": 3.6073717948717947, + "grad_norm": 0.9228841066360474, + "learning_rate": 4.746655765297692e-06, + "loss": 0.8106, + "step": 20262 + }, + { + "epoch": 3.6075498575498575, + "grad_norm": 1.1330608129501343, + "learning_rate": 4.7423953924551675e-06, + "loss": 0.6528, + "step": 20263 + }, + { + "epoch": 3.60772792022792, + "grad_norm": 1.0182117223739624, + "learning_rate": 4.738136885987565e-06, + "loss": 0.7582, + "step": 20264 + }, + { + "epoch": 3.607905982905983, + "grad_norm": 1.006861925125122, + "learning_rate": 4.733880245978239e-06, + "loss": 0.6213, + 
"step": 20265 + }, + { + "epoch": 3.6080840455840457, + "grad_norm": 1.037815809249878, + "learning_rate": 4.729625472510668e-06, + "loss": 0.9137, + "step": 20266 + }, + { + "epoch": 3.6082621082621085, + "grad_norm": 1.1505076885223389, + "learning_rate": 4.7253725656681515e-06, + "loss": 0.8987, + "step": 20267 + }, + { + "epoch": 3.6084401709401708, + "grad_norm": 0.9972392320632935, + "learning_rate": 4.721121525534045e-06, + "loss": 0.6091, + "step": 20268 + }, + { + "epoch": 3.6086182336182335, + "grad_norm": 0.8960257768630981, + "learning_rate": 4.716872352191648e-06, + "loss": 0.6685, + "step": 20269 + }, + { + "epoch": 3.6087962962962963, + "grad_norm": 0.9827064275741577, + "learning_rate": 4.712625045724206e-06, + "loss": 0.8017, + "step": 20270 + }, + { + "epoch": 3.608974358974359, + "grad_norm": 0.9110612273216248, + "learning_rate": 4.70837960621493e-06, + "loss": 0.6586, + "step": 20271 + }, + { + "epoch": 3.609152421652422, + "grad_norm": 0.9796404838562012, + "learning_rate": 4.704136033746987e-06, + "loss": 0.7285, + "step": 20272 + }, + { + "epoch": 3.609330484330484, + "grad_norm": 1.166001796722412, + "learning_rate": 4.699894328403554e-06, + "loss": 0.8324, + "step": 20273 + }, + { + "epoch": 3.609508547008547, + "grad_norm": 0.8968726992607117, + "learning_rate": 4.6956544902677315e-06, + "loss": 0.6518, + "step": 20274 + }, + { + "epoch": 3.6096866096866096, + "grad_norm": 0.9205286502838135, + "learning_rate": 4.691416519422575e-06, + "loss": 0.8745, + "step": 20275 + }, + { + "epoch": 3.6098646723646723, + "grad_norm": 0.9699488282203674, + "learning_rate": 4.68718041595112e-06, + "loss": 0.7312, + "step": 20276 + }, + { + "epoch": 3.610042735042735, + "grad_norm": 1.0142552852630615, + "learning_rate": 4.682946179936376e-06, + "loss": 0.6849, + "step": 20277 + }, + { + "epoch": 3.610220797720798, + "grad_norm": 1.0610849857330322, + "learning_rate": 4.678713811461299e-06, + "loss": 0.6934, + "step": 20278 + }, + { + "epoch": 
3.6103988603988606, + "grad_norm": 0.9037832617759705, + "learning_rate": 4.674483310608801e-06, + "loss": 0.6833, + "step": 20279 + }, + { + "epoch": 3.6105769230769234, + "grad_norm": 0.998599112033844, + "learning_rate": 4.670254677461783e-06, + "loss": 0.643, + "step": 20280 + }, + { + "epoch": 3.6107549857549857, + "grad_norm": 0.8829896450042725, + "learning_rate": 4.6660279121031105e-06, + "loss": 0.6309, + "step": 20281 + }, + { + "epoch": 3.6109330484330484, + "grad_norm": 0.9085970520973206, + "learning_rate": 4.661803014615562e-06, + "loss": 0.6999, + "step": 20282 + }, + { + "epoch": 3.611111111111111, + "grad_norm": 1.0698771476745605, + "learning_rate": 4.657579985081939e-06, + "loss": 0.6356, + "step": 20283 + }, + { + "epoch": 3.611289173789174, + "grad_norm": 1.119136095046997, + "learning_rate": 4.653358823584986e-06, + "loss": 0.7683, + "step": 20284 + }, + { + "epoch": 3.611467236467236, + "grad_norm": 0.9300078749656677, + "learning_rate": 4.649139530207403e-06, + "loss": 0.7022, + "step": 20285 + }, + { + "epoch": 3.611645299145299, + "grad_norm": 0.9168888926506042, + "learning_rate": 4.644922105031857e-06, + "loss": 0.8505, + "step": 20286 + }, + { + "epoch": 3.6118233618233617, + "grad_norm": 0.9180569648742676, + "learning_rate": 4.640706548140972e-06, + "loss": 0.783, + "step": 20287 + }, + { + "epoch": 3.6120014245014245, + "grad_norm": 0.944037914276123, + "learning_rate": 4.636492859617358e-06, + "loss": 0.6348, + "step": 20288 + }, + { + "epoch": 3.6121794871794872, + "grad_norm": 1.140091061592102, + "learning_rate": 4.632281039543562e-06, + "loss": 0.7691, + "step": 20289 + }, + { + "epoch": 3.61235754985755, + "grad_norm": 0.9035637974739075, + "learning_rate": 4.6280710880021065e-06, + "loss": 0.4777, + "step": 20290 + }, + { + "epoch": 3.6125356125356127, + "grad_norm": 0.9371036887168884, + "learning_rate": 4.62386300507549e-06, + "loss": 0.6795, + "step": 20291 + }, + { + "epoch": 3.6127136752136755, + "grad_norm": 
0.8894741535186768, + "learning_rate": 4.619656790846139e-06, + "loss": 0.6351, + "step": 20292 + }, + { + "epoch": 3.612891737891738, + "grad_norm": 1.035195231437683, + "learning_rate": 4.6154524453964846e-06, + "loss": 0.8925, + "step": 20293 + }, + { + "epoch": 3.6130698005698005, + "grad_norm": 0.7878008484840393, + "learning_rate": 4.611249968808895e-06, + "loss": 0.4768, + "step": 20294 + }, + { + "epoch": 3.6132478632478633, + "grad_norm": 0.9613704085350037, + "learning_rate": 4.607049361165694e-06, + "loss": 0.6607, + "step": 20295 + }, + { + "epoch": 3.613425925925926, + "grad_norm": 0.9866206645965576, + "learning_rate": 4.602850622549227e-06, + "loss": 0.8279, + "step": 20296 + }, + { + "epoch": 3.6136039886039883, + "grad_norm": 1.0364552736282349, + "learning_rate": 4.598653753041704e-06, + "loss": 0.7251, + "step": 20297 + }, + { + "epoch": 3.613782051282051, + "grad_norm": 0.7488967776298523, + "learning_rate": 4.594458752725395e-06, + "loss": 0.675, + "step": 20298 + }, + { + "epoch": 3.613960113960114, + "grad_norm": 1.0590063333511353, + "learning_rate": 4.590265621682488e-06, + "loss": 0.8801, + "step": 20299 + }, + { + "epoch": 3.6141381766381766, + "grad_norm": 1.012354850769043, + "learning_rate": 4.586074359995119e-06, + "loss": 0.8227, + "step": 20300 + }, + { + "epoch": 3.6143162393162394, + "grad_norm": 0.8928263187408447, + "learning_rate": 4.58188496774542e-06, + "loss": 0.5889, + "step": 20301 + }, + { + "epoch": 3.614494301994302, + "grad_norm": 0.9703002572059631, + "learning_rate": 4.577697445015472e-06, + "loss": 0.6322, + "step": 20302 + }, + { + "epoch": 3.614672364672365, + "grad_norm": 0.8588637709617615, + "learning_rate": 4.573511791887319e-06, + "loss": 0.6494, + "step": 20303 + }, + { + "epoch": 3.6148504273504276, + "grad_norm": 0.9801368117332458, + "learning_rate": 4.569328008442975e-06, + "loss": 0.8244, + "step": 20304 + }, + { + "epoch": 3.61502849002849, + "grad_norm": 1.0798895359039307, + "learning_rate": 
4.565146094764394e-06, + "loss": 0.6987, + "step": 20305 + }, + { + "epoch": 3.6152065527065527, + "grad_norm": 0.9438313841819763, + "learning_rate": 4.560966050933546e-06, + "loss": 0.507, + "step": 20306 + }, + { + "epoch": 3.6153846153846154, + "grad_norm": 0.9473024606704712, + "learning_rate": 4.556787877032287e-06, + "loss": 0.7889, + "step": 20307 + }, + { + "epoch": 3.615562678062678, + "grad_norm": 0.9516319632530212, + "learning_rate": 4.552611573142518e-06, + "loss": 0.6727, + "step": 20308 + }, + { + "epoch": 3.6157407407407405, + "grad_norm": 0.9949623942375183, + "learning_rate": 4.548437139346052e-06, + "loss": 0.9616, + "step": 20309 + }, + { + "epoch": 3.6159188034188032, + "grad_norm": 0.9285261631011963, + "learning_rate": 4.544264575724677e-06, + "loss": 0.819, + "step": 20310 + }, + { + "epoch": 3.616096866096866, + "grad_norm": 0.9633162021636963, + "learning_rate": 4.540093882360152e-06, + "loss": 0.7936, + "step": 20311 + }, + { + "epoch": 3.6162749287749287, + "grad_norm": 0.9205031991004944, + "learning_rate": 4.535925059334168e-06, + "loss": 0.6606, + "step": 20312 + }, + { + "epoch": 3.6164529914529915, + "grad_norm": 0.9210516810417175, + "learning_rate": 4.531758106728446e-06, + "loss": 0.8, + "step": 20313 + }, + { + "epoch": 3.6166310541310542, + "grad_norm": 1.0242942571640015, + "learning_rate": 4.5275930246245764e-06, + "loss": 0.6824, + "step": 20314 + }, + { + "epoch": 3.616809116809117, + "grad_norm": 1.2146587371826172, + "learning_rate": 4.523429813104218e-06, + "loss": 0.7973, + "step": 20315 + }, + { + "epoch": 3.6169871794871797, + "grad_norm": 0.9127520322799683, + "learning_rate": 4.519268472248906e-06, + "loss": 0.7579, + "step": 20316 + }, + { + "epoch": 3.617165242165242, + "grad_norm": 0.9486130475997925, + "learning_rate": 4.5151090021401946e-06, + "loss": 0.6724, + "step": 20317 + }, + { + "epoch": 3.617343304843305, + "grad_norm": 1.0272002220153809, + "learning_rate": 4.510951402859564e-06, + "loss": 0.7484, + 
"step": 20318 + }, + { + "epoch": 3.6175213675213675, + "grad_norm": 1.0201951265335083, + "learning_rate": 4.506795674488484e-06, + "loss": 0.7737, + "step": 20319 + }, + { + "epoch": 3.6176994301994303, + "grad_norm": 1.0206116437911987, + "learning_rate": 4.502641817108355e-06, + "loss": 0.8504, + "step": 20320 + }, + { + "epoch": 3.6178774928774926, + "grad_norm": 0.9464601278305054, + "learning_rate": 4.498489830800601e-06, + "loss": 0.791, + "step": 20321 + }, + { + "epoch": 3.6180555555555554, + "grad_norm": 0.8579327464103699, + "learning_rate": 4.494339715646534e-06, + "loss": 0.5689, + "step": 20322 + }, + { + "epoch": 3.618233618233618, + "grad_norm": 0.9792792797088623, + "learning_rate": 4.490191471727511e-06, + "loss": 0.8564, + "step": 20323 + }, + { + "epoch": 3.618411680911681, + "grad_norm": 1.008038878440857, + "learning_rate": 4.486045099124758e-06, + "loss": 0.8837, + "step": 20324 + }, + { + "epoch": 3.6185897435897436, + "grad_norm": 0.8817142248153687, + "learning_rate": 4.481900597919541e-06, + "loss": 0.6801, + "step": 20325 + }, + { + "epoch": 3.6187678062678064, + "grad_norm": 1.1172369718551636, + "learning_rate": 4.477757968193075e-06, + "loss": 0.8062, + "step": 20326 + }, + { + "epoch": 3.618945868945869, + "grad_norm": 0.879314124584198, + "learning_rate": 4.473617210026504e-06, + "loss": 0.769, + "step": 20327 + }, + { + "epoch": 3.619123931623932, + "grad_norm": 1.1788002252578735, + "learning_rate": 4.469478323500953e-06, + "loss": 0.7673, + "step": 20328 + }, + { + "epoch": 3.619301994301994, + "grad_norm": 0.9070376753807068, + "learning_rate": 4.465341308697546e-06, + "loss": 0.8223, + "step": 20329 + }, + { + "epoch": 3.619480056980057, + "grad_norm": 1.0292595624923706, + "learning_rate": 4.461206165697296e-06, + "loss": 0.806, + "step": 20330 + }, + { + "epoch": 3.6196581196581197, + "grad_norm": 0.9035056233406067, + "learning_rate": 4.4570728945812605e-06, + "loss": 0.7187, + "step": 20331 + }, + { + "epoch": 
3.6198361823361824, + "grad_norm": 1.1375666856765747, + "learning_rate": 4.4529414954304075e-06, + "loss": 0.723, + "step": 20332 + }, + { + "epoch": 3.620014245014245, + "grad_norm": 0.9073819518089294, + "learning_rate": 4.448811968325683e-06, + "loss": 0.7414, + "step": 20333 + }, + { + "epoch": 3.6201923076923075, + "grad_norm": 0.8783019185066223, + "learning_rate": 4.444684313348002e-06, + "loss": 0.7602, + "step": 20334 + }, + { + "epoch": 3.6203703703703702, + "grad_norm": 0.9157004356384277, + "learning_rate": 4.440558530578221e-06, + "loss": 0.771, + "step": 20335 + }, + { + "epoch": 3.620548433048433, + "grad_norm": 0.945231020450592, + "learning_rate": 4.4364346200972184e-06, + "loss": 0.9442, + "step": 20336 + }, + { + "epoch": 3.6207264957264957, + "grad_norm": 0.9638859629631042, + "learning_rate": 4.432312581985732e-06, + "loss": 0.73, + "step": 20337 + }, + { + "epoch": 3.6209045584045585, + "grad_norm": 1.0869982242584229, + "learning_rate": 4.428192416324573e-06, + "loss": 0.906, + "step": 20338 + }, + { + "epoch": 3.6210826210826212, + "grad_norm": 0.9156002998352051, + "learning_rate": 4.424074123194433e-06, + "loss": 0.7178, + "step": 20339 + }, + { + "epoch": 3.621260683760684, + "grad_norm": 1.0202544927597046, + "learning_rate": 4.419957702676037e-06, + "loss": 0.6788, + "step": 20340 + }, + { + "epoch": 3.6214387464387463, + "grad_norm": 1.0651593208312988, + "learning_rate": 4.4158431548500075e-06, + "loss": 0.624, + "step": 20341 + }, + { + "epoch": 3.621616809116809, + "grad_norm": 0.9189126491546631, + "learning_rate": 4.411730479796982e-06, + "loss": 0.69, + "step": 20342 + }, + { + "epoch": 3.621794871794872, + "grad_norm": 1.0193413496017456, + "learning_rate": 4.40761967759753e-06, + "loss": 0.8095, + "step": 20343 + }, + { + "epoch": 3.6219729344729346, + "grad_norm": 1.0479416847229004, + "learning_rate": 4.403510748332185e-06, + "loss": 0.7535, + "step": 20344 + }, + { + "epoch": 3.6221509971509973, + "grad_norm": 
0.9200383424758911, + "learning_rate": 4.399403692081461e-06, + "loss": 0.7786, + "step": 20345 + }, + { + "epoch": 3.6223290598290596, + "grad_norm": 1.0074836015701294, + "learning_rate": 4.3952985089258495e-06, + "loss": 0.6789, + "step": 20346 + }, + { + "epoch": 3.6225071225071224, + "grad_norm": 0.9245620965957642, + "learning_rate": 4.39119519894573e-06, + "loss": 0.8554, + "step": 20347 + }, + { + "epoch": 3.622685185185185, + "grad_norm": 0.8818466663360596, + "learning_rate": 4.387093762221562e-06, + "loss": 0.838, + "step": 20348 + }, + { + "epoch": 3.622863247863248, + "grad_norm": 0.9582803845405579, + "learning_rate": 4.382994198833645e-06, + "loss": 0.8354, + "step": 20349 + }, + { + "epoch": 3.6230413105413106, + "grad_norm": 0.8472702503204346, + "learning_rate": 4.378896508862351e-06, + "loss": 0.6862, + "step": 20350 + }, + { + "epoch": 3.6232193732193734, + "grad_norm": 0.8588680028915405, + "learning_rate": 4.374800692387937e-06, + "loss": 0.5167, + "step": 20351 + }, + { + "epoch": 3.623397435897436, + "grad_norm": 0.9006664752960205, + "learning_rate": 4.37070674949065e-06, + "loss": 0.5662, + "step": 20352 + }, + { + "epoch": 3.6235754985754984, + "grad_norm": 1.099118709564209, + "learning_rate": 4.366614680250736e-06, + "loss": 0.7506, + "step": 20353 + }, + { + "epoch": 3.623753561253561, + "grad_norm": 0.8333063721656799, + "learning_rate": 4.362524484748309e-06, + "loss": 0.7858, + "step": 20354 + }, + { + "epoch": 3.623931623931624, + "grad_norm": 1.0544650554656982, + "learning_rate": 4.358436163063573e-06, + "loss": 1.0375, + "step": 20355 + }, + { + "epoch": 3.6241096866096867, + "grad_norm": 1.008267879486084, + "learning_rate": 4.354349715276595e-06, + "loss": 0.659, + "step": 20356 + }, + { + "epoch": 3.6242877492877494, + "grad_norm": 0.8872396349906921, + "learning_rate": 4.350265141467458e-06, + "loss": 0.8278, + "step": 20357 + }, + { + "epoch": 3.6244658119658117, + "grad_norm": 0.9096971750259399, + "learning_rate": 
4.346182441716162e-06, + "loss": 0.7571, + "step": 20358 + }, + { + "epoch": 3.6246438746438745, + "grad_norm": 0.9562456011772156, + "learning_rate": 4.342101616102734e-06, + "loss": 0.7768, + "step": 20359 + }, + { + "epoch": 3.6248219373219372, + "grad_norm": 0.8731200695037842, + "learning_rate": 4.338022664707109e-06, + "loss": 0.5639, + "step": 20360 + }, + { + "epoch": 3.625, + "grad_norm": 0.9827439785003662, + "learning_rate": 4.333945587609201e-06, + "loss": 0.8766, + "step": 20361 + }, + { + "epoch": 3.6251780626780628, + "grad_norm": 0.7939251661300659, + "learning_rate": 4.329870384888901e-06, + "loss": 0.4391, + "step": 20362 + }, + { + "epoch": 3.6253561253561255, + "grad_norm": 0.9462933540344238, + "learning_rate": 4.325797056626069e-06, + "loss": 0.5996, + "step": 20363 + }, + { + "epoch": 3.6255341880341883, + "grad_norm": 1.0042550563812256, + "learning_rate": 4.321725602900473e-06, + "loss": 0.7451, + "step": 20364 + }, + { + "epoch": 3.6257122507122506, + "grad_norm": 0.914974570274353, + "learning_rate": 4.317656023791927e-06, + "loss": 0.7685, + "step": 20365 + }, + { + "epoch": 3.6258903133903133, + "grad_norm": 0.85157310962677, + "learning_rate": 4.313588319380146e-06, + "loss": 0.6611, + "step": 20366 + }, + { + "epoch": 3.626068376068376, + "grad_norm": 1.184873104095459, + "learning_rate": 4.309522489744822e-06, + "loss": 0.8994, + "step": 20367 + }, + { + "epoch": 3.626246438746439, + "grad_norm": 1.1252131462097168, + "learning_rate": 4.305458534965634e-06, + "loss": 0.9815, + "step": 20368 + }, + { + "epoch": 3.6264245014245016, + "grad_norm": 0.9970200657844543, + "learning_rate": 4.301396455122198e-06, + "loss": 0.6608, + "step": 20369 + }, + { + "epoch": 3.626602564102564, + "grad_norm": 1.041490912437439, + "learning_rate": 4.297336250294093e-06, + "loss": 0.8741, + "step": 20370 + }, + { + "epoch": 3.6267806267806266, + "grad_norm": 1.0515310764312744, + "learning_rate": 4.2932779205608785e-06, + "loss": 0.8035, + "step": 20371 
+ }, + { + "epoch": 3.6269586894586894, + "grad_norm": 1.1585599184036255, + "learning_rate": 4.28922146600208e-06, + "loss": 0.9048, + "step": 20372 + }, + { + "epoch": 3.627136752136752, + "grad_norm": 0.9767042398452759, + "learning_rate": 4.285166886697167e-06, + "loss": 0.7881, + "step": 20373 + }, + { + "epoch": 3.627314814814815, + "grad_norm": 0.9792457818984985, + "learning_rate": 4.2811141827255764e-06, + "loss": 0.7102, + "step": 20374 + }, + { + "epoch": 3.6274928774928776, + "grad_norm": 0.9069845676422119, + "learning_rate": 4.27706335416671e-06, + "loss": 0.7514, + "step": 20375 + }, + { + "epoch": 3.6276709401709404, + "grad_norm": 0.8710619807243347, + "learning_rate": 4.273014401099939e-06, + "loss": 0.767, + "step": 20376 + }, + { + "epoch": 3.6278490028490027, + "grad_norm": 1.1712818145751953, + "learning_rate": 4.268967323604589e-06, + "loss": 0.5082, + "step": 20377 + }, + { + "epoch": 3.6280270655270654, + "grad_norm": 1.053260087966919, + "learning_rate": 4.264922121759985e-06, + "loss": 0.7219, + "step": 20378 + }, + { + "epoch": 3.628205128205128, + "grad_norm": 0.9651164412498474, + "learning_rate": 4.2608787956453286e-06, + "loss": 0.6483, + "step": 20379 + }, + { + "epoch": 3.628383190883191, + "grad_norm": 1.110133171081543, + "learning_rate": 4.256837345339892e-06, + "loss": 0.902, + "step": 20380 + }, + { + "epoch": 3.6285612535612537, + "grad_norm": 1.1139036417007446, + "learning_rate": 4.252797770922834e-06, + "loss": 0.7422, + "step": 20381 + }, + { + "epoch": 3.628739316239316, + "grad_norm": 1.043532371520996, + "learning_rate": 4.2487600724733016e-06, + "loss": 0.9144, + "step": 20382 + }, + { + "epoch": 3.6289173789173788, + "grad_norm": 0.9324162006378174, + "learning_rate": 4.24472425007042e-06, + "loss": 0.75, + "step": 20383 + }, + { + "epoch": 3.6290954415954415, + "grad_norm": 1.013924479484558, + "learning_rate": 4.240690303793238e-06, + "loss": 0.7576, + "step": 20384 + }, + { + "epoch": 3.6292735042735043, + 
"grad_norm": 1.1732478141784668, + "learning_rate": 4.236658233720814e-06, + "loss": 0.7254, + "step": 20385 + }, + { + "epoch": 3.629451566951567, + "grad_norm": 0.9141867756843567, + "learning_rate": 4.232628039932152e-06, + "loss": 0.7594, + "step": 20386 + }, + { + "epoch": 3.6296296296296298, + "grad_norm": 0.8569268584251404, + "learning_rate": 4.228599722506177e-06, + "loss": 0.5785, + "step": 20387 + }, + { + "epoch": 3.6298076923076925, + "grad_norm": 0.9507982730865479, + "learning_rate": 4.224573281521871e-06, + "loss": 0.799, + "step": 20388 + }, + { + "epoch": 3.629985754985755, + "grad_norm": 0.9895933270454407, + "learning_rate": 4.22054871705807e-06, + "loss": 0.8099, + "step": 20389 + }, + { + "epoch": 3.6301638176638176, + "grad_norm": 0.9135871529579163, + "learning_rate": 4.216526029193668e-06, + "loss": 0.9647, + "step": 20390 + }, + { + "epoch": 3.6303418803418803, + "grad_norm": 0.9216475486755371, + "learning_rate": 4.2125052180074674e-06, + "loss": 0.9267, + "step": 20391 + }, + { + "epoch": 3.630519943019943, + "grad_norm": 0.8940479755401611, + "learning_rate": 4.208486283578228e-06, + "loss": 0.6267, + "step": 20392 + }, + { + "epoch": 3.630698005698006, + "grad_norm": 0.8756904006004333, + "learning_rate": 4.204469225984731e-06, + "loss": 0.7487, + "step": 20393 + }, + { + "epoch": 3.630876068376068, + "grad_norm": 0.9734384417533875, + "learning_rate": 4.200454045305646e-06, + "loss": 0.3735, + "step": 20394 + }, + { + "epoch": 3.631054131054131, + "grad_norm": 0.9914159178733826, + "learning_rate": 4.196440741619678e-06, + "loss": 0.85, + "step": 20395 + }, + { + "epoch": 3.6312321937321936, + "grad_norm": 0.9292317032814026, + "learning_rate": 4.19242931500542e-06, + "loss": 0.9003, + "step": 20396 + }, + { + "epoch": 3.6314102564102564, + "grad_norm": 1.0771266222000122, + "learning_rate": 4.1884197655414955e-06, + "loss": 0.8604, + "step": 20397 + }, + { + "epoch": 3.631588319088319, + "grad_norm": 0.9089133143424988, + 
"learning_rate": 4.1844120933064445e-06, + "loss": 0.7653, + "step": 20398 + }, + { + "epoch": 3.631766381766382, + "grad_norm": 1.0063055753707886, + "learning_rate": 4.180406298378814e-06, + "loss": 0.7252, + "step": 20399 + }, + { + "epoch": 3.6319444444444446, + "grad_norm": 0.9310106039047241, + "learning_rate": 4.1764023808370635e-06, + "loss": 0.8539, + "step": 20400 + }, + { + "epoch": 3.6321225071225074, + "grad_norm": 0.9510707855224609, + "learning_rate": 4.172400340759652e-06, + "loss": 0.6733, + "step": 20401 + }, + { + "epoch": 3.6323005698005697, + "grad_norm": 0.9140132069587708, + "learning_rate": 4.168400178224985e-06, + "loss": 0.9194, + "step": 20402 + }, + { + "epoch": 3.6324786324786325, + "grad_norm": 0.8771872520446777, + "learning_rate": 4.1644018933114645e-06, + "loss": 0.7011, + "step": 20403 + }, + { + "epoch": 3.632656695156695, + "grad_norm": 0.9165019989013672, + "learning_rate": 4.160405486097385e-06, + "loss": 0.6365, + "step": 20404 + }, + { + "epoch": 3.632834757834758, + "grad_norm": 1.0388853549957275, + "learning_rate": 4.156410956661083e-06, + "loss": 0.8182, + "step": 20405 + }, + { + "epoch": 3.6330128205128203, + "grad_norm": 1.0131229162216187, + "learning_rate": 4.152418305080819e-06, + "loss": 0.8188, + "step": 20406 + }, + { + "epoch": 3.633190883190883, + "grad_norm": 0.919827401638031, + "learning_rate": 4.1484275314347955e-06, + "loss": 0.786, + "step": 20407 + }, + { + "epoch": 3.6333689458689458, + "grad_norm": 0.9317741990089417, + "learning_rate": 4.144438635801229e-06, + "loss": 0.7544, + "step": 20408 + }, + { + "epoch": 3.6335470085470085, + "grad_norm": 0.9361299872398376, + "learning_rate": 4.140451618258267e-06, + "loss": 0.7855, + "step": 20409 + }, + { + "epoch": 3.6337250712250713, + "grad_norm": 0.9616186022758484, + "learning_rate": 4.136466478884016e-06, + "loss": 0.7551, + "step": 20410 + }, + { + "epoch": 3.633903133903134, + "grad_norm": 0.8722997307777405, + "learning_rate": 4.132483217756567e-06, 
+ "loss": 0.7949, + "step": 20411 + }, + { + "epoch": 3.6340811965811968, + "grad_norm": 1.0613240003585815, + "learning_rate": 4.128501834953957e-06, + "loss": 0.8799, + "step": 20412 + }, + { + "epoch": 3.6342592592592595, + "grad_norm": 0.8569284677505493, + "learning_rate": 4.124522330554215e-06, + "loss": 0.6788, + "step": 20413 + }, + { + "epoch": 3.634437321937322, + "grad_norm": 0.9483793377876282, + "learning_rate": 4.1205447046352766e-06, + "loss": 0.7351, + "step": 20414 + }, + { + "epoch": 3.6346153846153846, + "grad_norm": 1.0549039840698242, + "learning_rate": 4.116568957275102e-06, + "loss": 0.8051, + "step": 20415 + }, + { + "epoch": 3.6347934472934473, + "grad_norm": 0.9289839267730713, + "learning_rate": 4.112595088551574e-06, + "loss": 0.7473, + "step": 20416 + }, + { + "epoch": 3.63497150997151, + "grad_norm": 0.893508791923523, + "learning_rate": 4.108623098542552e-06, + "loss": 0.8555, + "step": 20417 + }, + { + "epoch": 3.6351495726495724, + "grad_norm": 1.0892490148544312, + "learning_rate": 4.1046529873258854e-06, + "loss": 0.9039, + "step": 20418 + }, + { + "epoch": 3.635327635327635, + "grad_norm": 1.0458922386169434, + "learning_rate": 4.1006847549793115e-06, + "loss": 0.7172, + "step": 20419 + }, + { + "epoch": 3.635505698005698, + "grad_norm": 0.9979022145271301, + "learning_rate": 4.0967184015806235e-06, + "loss": 0.7928, + "step": 20420 + }, + { + "epoch": 3.6356837606837606, + "grad_norm": 0.9028869867324829, + "learning_rate": 4.092753927207505e-06, + "loss": 0.8833, + "step": 20421 + }, + { + "epoch": 3.6358618233618234, + "grad_norm": 0.9818532466888428, + "learning_rate": 4.0887913319376466e-06, + "loss": 0.8457, + "step": 20422 + }, + { + "epoch": 3.636039886039886, + "grad_norm": 1.0005574226379395, + "learning_rate": 4.084830615848689e-06, + "loss": 0.9425, + "step": 20423 + }, + { + "epoch": 3.636217948717949, + "grad_norm": 0.7937536835670471, + "learning_rate": 4.080871779018225e-06, + "loss": 0.5992, + "step": 20424 + }, 
+ { + "epoch": 3.6363960113960117, + "grad_norm": 0.8660478591918945, + "learning_rate": 4.076914821523825e-06, + "loss": 0.5975, + "step": 20425 + }, + { + "epoch": 3.636574074074074, + "grad_norm": 0.9407948851585388, + "learning_rate": 4.072959743443017e-06, + "loss": 0.9115, + "step": 20426 + }, + { + "epoch": 3.6367521367521367, + "grad_norm": 0.9861929416656494, + "learning_rate": 4.069006544853271e-06, + "loss": 0.7914, + "step": 20427 + }, + { + "epoch": 3.6369301994301995, + "grad_norm": 1.0055484771728516, + "learning_rate": 4.065055225832104e-06, + "loss": 0.885, + "step": 20428 + }, + { + "epoch": 3.637108262108262, + "grad_norm": 0.8550659418106079, + "learning_rate": 4.0611057864568536e-06, + "loss": 0.61, + "step": 20429 + }, + { + "epoch": 3.6372863247863245, + "grad_norm": 0.8787573575973511, + "learning_rate": 4.057158226804958e-06, + "loss": 0.8518, + "step": 20430 + }, + { + "epoch": 3.6374643874643873, + "grad_norm": 0.9358032941818237, + "learning_rate": 4.053212546953744e-06, + "loss": 0.5766, + "step": 20431 + }, + { + "epoch": 3.63764245014245, + "grad_norm": 1.0040407180786133, + "learning_rate": 4.049268746980517e-06, + "loss": 0.7181, + "step": 20432 + }, + { + "epoch": 3.6378205128205128, + "grad_norm": 1.0668649673461914, + "learning_rate": 4.045326826962548e-06, + "loss": 0.8602, + "step": 20433 + }, + { + "epoch": 3.6379985754985755, + "grad_norm": 1.019614338874817, + "learning_rate": 4.041386786977053e-06, + "loss": 0.9243, + "step": 20434 + }, + { + "epoch": 3.6381766381766383, + "grad_norm": 0.9878685474395752, + "learning_rate": 4.037448627101281e-06, + "loss": 0.8588, + "step": 20435 + }, + { + "epoch": 3.638354700854701, + "grad_norm": 0.9777923822402954, + "learning_rate": 4.033512347412327e-06, + "loss": 0.7589, + "step": 20436 + }, + { + "epoch": 3.638532763532764, + "grad_norm": 0.9318233132362366, + "learning_rate": 4.029577947987362e-06, + "loss": 0.7857, + "step": 20437 + }, + { + "epoch": 3.638710826210826, + 
"grad_norm": 0.9956740140914917, + "learning_rate": 4.025645428903446e-06, + "loss": 0.7942, + "step": 20438 + }, + { + "epoch": 3.638888888888889, + "grad_norm": 0.9890974760055542, + "learning_rate": 4.021714790237652e-06, + "loss": 0.8209, + "step": 20439 + }, + { + "epoch": 3.6390669515669516, + "grad_norm": 0.9391531348228455, + "learning_rate": 4.017786032066972e-06, + "loss": 0.8631, + "step": 20440 + }, + { + "epoch": 3.6392450142450143, + "grad_norm": 0.8784220814704895, + "learning_rate": 4.013859154468391e-06, + "loss": 0.6432, + "step": 20441 + }, + { + "epoch": 3.6394230769230766, + "grad_norm": 0.7741743922233582, + "learning_rate": 4.009934157518835e-06, + "loss": 0.6588, + "step": 20442 + }, + { + "epoch": 3.6396011396011394, + "grad_norm": 0.9611365795135498, + "learning_rate": 4.0060110412952325e-06, + "loss": 0.7351, + "step": 20443 + }, + { + "epoch": 3.639779202279202, + "grad_norm": 0.9556607007980347, + "learning_rate": 4.0020898058744204e-06, + "loss": 0.7602, + "step": 20444 + }, + { + "epoch": 3.639957264957265, + "grad_norm": 0.8218309879302979, + "learning_rate": 3.998170451333261e-06, + "loss": 0.6772, + "step": 20445 + }, + { + "epoch": 3.6401353276353277, + "grad_norm": 1.0754879713058472, + "learning_rate": 3.994252977748503e-06, + "loss": 0.847, + "step": 20446 + }, + { + "epoch": 3.6403133903133904, + "grad_norm": 1.021073341369629, + "learning_rate": 3.990337385196929e-06, + "loss": 0.6531, + "step": 20447 + }, + { + "epoch": 3.640491452991453, + "grad_norm": 0.983539879322052, + "learning_rate": 3.986423673755257e-06, + "loss": 0.6254, + "step": 20448 + }, + { + "epoch": 3.640669515669516, + "grad_norm": 0.8971178531646729, + "learning_rate": 3.982511843500159e-06, + "loss": 0.7201, + "step": 20449 + }, + { + "epoch": 3.640847578347578, + "grad_norm": 0.9377745985984802, + "learning_rate": 3.978601894508282e-06, + "loss": 0.7173, + "step": 20450 + }, + { + "epoch": 3.641025641025641, + "grad_norm": 0.9671639204025269, + 
"learning_rate": 3.974693826856224e-06, + "loss": 0.81, + "step": 20451 + }, + { + "epoch": 3.6412037037037037, + "grad_norm": 1.0108625888824463, + "learning_rate": 3.970787640620577e-06, + "loss": 0.5756, + "step": 20452 + }, + { + "epoch": 3.6413817663817665, + "grad_norm": 0.9857928156852722, + "learning_rate": 3.966883335877858e-06, + "loss": 0.6986, + "step": 20453 + }, + { + "epoch": 3.6415598290598292, + "grad_norm": 1.0502939224243164, + "learning_rate": 3.962980912704572e-06, + "loss": 0.6687, + "step": 20454 + }, + { + "epoch": 3.6417378917378915, + "grad_norm": 0.8568015694618225, + "learning_rate": 3.9590803711771705e-06, + "loss": 0.6249, + "step": 20455 + }, + { + "epoch": 3.6419159544159543, + "grad_norm": 0.888664722442627, + "learning_rate": 3.9551817113720915e-06, + "loss": 0.6097, + "step": 20456 + }, + { + "epoch": 3.642094017094017, + "grad_norm": 1.024553894996643, + "learning_rate": 3.9512849333657065e-06, + "loss": 0.891, + "step": 20457 + }, + { + "epoch": 3.64227207977208, + "grad_norm": 0.9419230222702026, + "learning_rate": 3.947390037234366e-06, + "loss": 0.8291, + "step": 20458 + }, + { + "epoch": 3.6424501424501425, + "grad_norm": 0.9657182097434998, + "learning_rate": 3.9434970230543765e-06, + "loss": 0.5703, + "step": 20459 + }, + { + "epoch": 3.6426282051282053, + "grad_norm": 1.196069598197937, + "learning_rate": 3.939605890902054e-06, + "loss": 0.8661, + "step": 20460 + }, + { + "epoch": 3.642806267806268, + "grad_norm": 1.0185527801513672, + "learning_rate": 3.935716640853571e-06, + "loss": 0.6936, + "step": 20461 + }, + { + "epoch": 3.6429843304843303, + "grad_norm": 1.0709651708602905, + "learning_rate": 3.931829272985177e-06, + "loss": 0.6632, + "step": 20462 + }, + { + "epoch": 3.643162393162393, + "grad_norm": 0.8733354210853577, + "learning_rate": 3.927943787373034e-06, + "loss": 0.8073, + "step": 20463 + }, + { + "epoch": 3.643340455840456, + "grad_norm": 1.1199495792388916, + "learning_rate": 3.924060184093248e-06, + 
"loss": 0.7714, + "step": 20464 + }, + { + "epoch": 3.6435185185185186, + "grad_norm": 0.9008165597915649, + "learning_rate": 3.920178463221924e-06, + "loss": 0.7476, + "step": 20465 + }, + { + "epoch": 3.6436965811965814, + "grad_norm": 0.9498278498649597, + "learning_rate": 3.916298624835124e-06, + "loss": 0.829, + "step": 20466 + }, + { + "epoch": 3.6438746438746437, + "grad_norm": 1.033387541770935, + "learning_rate": 3.9124206690088425e-06, + "loss": 0.5773, + "step": 20467 + }, + { + "epoch": 3.6440527065527064, + "grad_norm": 0.8976402878761292, + "learning_rate": 3.908544595819086e-06, + "loss": 0.6666, + "step": 20468 + }, + { + "epoch": 3.644230769230769, + "grad_norm": 1.055590033531189, + "learning_rate": 3.90467040534177e-06, + "loss": 0.8151, + "step": 20469 + }, + { + "epoch": 3.644408831908832, + "grad_norm": 0.9862080216407776, + "learning_rate": 3.9007980976528246e-06, + "loss": 0.6903, + "step": 20470 + }, + { + "epoch": 3.6445868945868947, + "grad_norm": 0.9331284761428833, + "learning_rate": 3.89692767282811e-06, + "loss": 0.801, + "step": 20471 + }, + { + "epoch": 3.6447649572649574, + "grad_norm": 0.9865574240684509, + "learning_rate": 3.893059130943466e-06, + "loss": 0.6761, + "step": 20472 + }, + { + "epoch": 3.64494301994302, + "grad_norm": 0.9861798286437988, + "learning_rate": 3.889192472074677e-06, + "loss": 0.803, + "step": 20473 + }, + { + "epoch": 3.6451210826210825, + "grad_norm": 0.9552395343780518, + "learning_rate": 3.885327696297503e-06, + "loss": 0.8508, + "step": 20474 + }, + { + "epoch": 3.6452991452991452, + "grad_norm": 1.0771273374557495, + "learning_rate": 3.881464803687695e-06, + "loss": 0.7232, + "step": 20475 + }, + { + "epoch": 3.645477207977208, + "grad_norm": 0.8707461953163147, + "learning_rate": 3.877603794320894e-06, + "loss": 0.7563, + "step": 20476 + }, + { + "epoch": 3.6456552706552707, + "grad_norm": 1.192508339881897, + "learning_rate": 3.873744668272772e-06, + "loss": 0.834, + "step": 20477 + }, + { + 
"epoch": 3.6458333333333335, + "grad_norm": 1.0404844284057617, + "learning_rate": 3.869887425618945e-06, + "loss": 0.8408, + "step": 20478 + }, + { + "epoch": 3.646011396011396, + "grad_norm": 0.8634822368621826, + "learning_rate": 3.866032066434988e-06, + "loss": 0.687, + "step": 20479 + }, + { + "epoch": 3.6461894586894585, + "grad_norm": 0.9610817432403564, + "learning_rate": 3.862178590796417e-06, + "loss": 0.7168, + "step": 20480 + }, + { + "epoch": 3.6463675213675213, + "grad_norm": 0.9478577971458435, + "learning_rate": 3.858326998778761e-06, + "loss": 0.7505, + "step": 20481 + }, + { + "epoch": 3.646545584045584, + "grad_norm": 1.1278072595596313, + "learning_rate": 3.8544772904574585e-06, + "loss": 0.8973, + "step": 20482 + }, + { + "epoch": 3.646723646723647, + "grad_norm": 0.83364337682724, + "learning_rate": 3.850629465907951e-06, + "loss": 0.6548, + "step": 20483 + }, + { + "epoch": 3.6469017094017095, + "grad_norm": 0.965010404586792, + "learning_rate": 3.846783525205622e-06, + "loss": 0.7875, + "step": 20484 + }, + { + "epoch": 3.6470797720797723, + "grad_norm": 1.0525926351547241, + "learning_rate": 3.842939468425844e-06, + "loss": 0.9994, + "step": 20485 + }, + { + "epoch": 3.6472578347578346, + "grad_norm": 0.9967437386512756, + "learning_rate": 3.839097295643901e-06, + "loss": 0.7788, + "step": 20486 + }, + { + "epoch": 3.6474358974358974, + "grad_norm": 1.0144619941711426, + "learning_rate": 3.8352570069351e-06, + "loss": 0.9022, + "step": 20487 + }, + { + "epoch": 3.64761396011396, + "grad_norm": 1.0700451135635376, + "learning_rate": 3.83141860237467e-06, + "loss": 0.7186, + "step": 20488 + }, + { + "epoch": 3.647792022792023, + "grad_norm": 0.9724757671356201, + "learning_rate": 3.827582082037817e-06, + "loss": 0.7279, + "step": 20489 + }, + { + "epoch": 3.6479700854700856, + "grad_norm": 0.9472264647483826, + "learning_rate": 3.823747445999714e-06, + "loss": 0.5964, + "step": 20490 + }, + { + "epoch": 3.648148148148148, + "grad_norm": 
0.8835110068321228, + "learning_rate": 3.81991469433548e-06, + "loss": 0.7234, + "step": 20491 + }, + { + "epoch": 3.6483262108262107, + "grad_norm": 0.9631081819534302, + "learning_rate": 3.816083827120254e-06, + "loss": 0.7303, + "step": 20492 + }, + { + "epoch": 3.6485042735042734, + "grad_norm": 1.0134692192077637, + "learning_rate": 3.8122548444290307e-06, + "loss": 0.8875, + "step": 20493 + }, + { + "epoch": 3.648682336182336, + "grad_norm": 0.8940348625183105, + "learning_rate": 3.8084277463368623e-06, + "loss": 0.8223, + "step": 20494 + }, + { + "epoch": 3.648860398860399, + "grad_norm": 0.9974220991134644, + "learning_rate": 3.804602532918744e-06, + "loss": 0.7668, + "step": 20495 + }, + { + "epoch": 3.6490384615384617, + "grad_norm": 0.9602738618850708, + "learning_rate": 3.800779204249605e-06, + "loss": 0.6415, + "step": 20496 + }, + { + "epoch": 3.6492165242165244, + "grad_norm": 0.9182425737380981, + "learning_rate": 3.796957760404363e-06, + "loss": 0.8986, + "step": 20497 + }, + { + "epoch": 3.6493945868945867, + "grad_norm": 0.9976184964179993, + "learning_rate": 3.793138201457891e-06, + "loss": 0.7179, + "step": 20498 + }, + { + "epoch": 3.6495726495726495, + "grad_norm": 0.9328181147575378, + "learning_rate": 3.7893205274850076e-06, + "loss": 0.7968, + "step": 20499 + }, + { + "epoch": 3.6497507122507122, + "grad_norm": 0.9778452515602112, + "learning_rate": 3.7855047385605525e-06, + "loss": 0.6892, + "step": 20500 + }, + { + "epoch": 3.649928774928775, + "grad_norm": 1.0850595235824585, + "learning_rate": 3.781690834759244e-06, + "loss": 0.9374, + "step": 20501 + }, + { + "epoch": 3.6501068376068377, + "grad_norm": 0.977279007434845, + "learning_rate": 3.7778788161558444e-06, + "loss": 0.8858, + "step": 20502 + }, + { + "epoch": 3.6502849002849, + "grad_norm": 0.8755964040756226, + "learning_rate": 3.774068682825005e-06, + "loss": 0.8847, + "step": 20503 + }, + { + "epoch": 3.650462962962963, + "grad_norm": 0.9674113988876343, + "learning_rate": 
3.7702604348414107e-06, + "loss": 0.7908, + "step": 20504 + }, + { + "epoch": 3.6506410256410255, + "grad_norm": 0.9421332478523254, + "learning_rate": 3.766454072279657e-06, + "loss": 0.8609, + "step": 20505 + }, + { + "epoch": 3.6508190883190883, + "grad_norm": 1.0132116079330444, + "learning_rate": 3.76264959521434e-06, + "loss": 0.9126, + "step": 20506 + }, + { + "epoch": 3.650997150997151, + "grad_norm": 0.9830502867698669, + "learning_rate": 3.7588470037199787e-06, + "loss": 0.9262, + "step": 20507 + }, + { + "epoch": 3.651175213675214, + "grad_norm": 0.9435737133026123, + "learning_rate": 3.7550462978710897e-06, + "loss": 0.7092, + "step": 20508 + }, + { + "epoch": 3.6513532763532766, + "grad_norm": 0.9076669216156006, + "learning_rate": 3.7512474777421257e-06, + "loss": 0.8453, + "step": 20509 + }, + { + "epoch": 3.6515313390313393, + "grad_norm": 0.6464830636978149, + "learning_rate": 3.747450543407538e-06, + "loss": 0.3109, + "step": 20510 + }, + { + "epoch": 3.6517094017094016, + "grad_norm": 0.9773824214935303, + "learning_rate": 3.7436554949417113e-06, + "loss": 0.7024, + "step": 20511 + }, + { + "epoch": 3.6518874643874644, + "grad_norm": 0.7433509826660156, + "learning_rate": 3.7398623324189973e-06, + "loss": 0.4606, + "step": 20512 + }, + { + "epoch": 3.652065527065527, + "grad_norm": 0.8884417414665222, + "learning_rate": 3.7360710559137146e-06, + "loss": 0.5894, + "step": 20513 + }, + { + "epoch": 3.65224358974359, + "grad_norm": 1.0048164129257202, + "learning_rate": 3.732281665500148e-06, + "loss": 0.834, + "step": 20514 + }, + { + "epoch": 3.652421652421652, + "grad_norm": 0.9860281944274902, + "learning_rate": 3.7284941612525605e-06, + "loss": 0.7918, + "step": 20515 + }, + { + "epoch": 3.652599715099715, + "grad_norm": 1.150870680809021, + "learning_rate": 3.7247085432451147e-06, + "loss": 0.9382, + "step": 20516 + }, + { + "epoch": 3.6527777777777777, + "grad_norm": 0.9690985083580017, + "learning_rate": 3.72092481155204e-06, + "loss": 
0.6593, + "step": 20517 + }, + { + "epoch": 3.6529558404558404, + "grad_norm": 0.7312018871307373, + "learning_rate": 3.7171429662474223e-06, + "loss": 0.5452, + "step": 20518 + }, + { + "epoch": 3.653133903133903, + "grad_norm": 0.9348322153091431, + "learning_rate": 3.7133630074053794e-06, + "loss": 0.8746, + "step": 20519 + }, + { + "epoch": 3.653311965811966, + "grad_norm": 0.85910564661026, + "learning_rate": 3.7095849350999746e-06, + "loss": 0.6533, + "step": 20520 + }, + { + "epoch": 3.6534900284900287, + "grad_norm": 0.9279438853263855, + "learning_rate": 3.705808749405226e-06, + "loss": 0.6357, + "step": 20521 + }, + { + "epoch": 3.6536680911680914, + "grad_norm": 1.0286200046539307, + "learning_rate": 3.70203445039512e-06, + "loss": 0.8155, + "step": 20522 + }, + { + "epoch": 3.6538461538461537, + "grad_norm": 0.9838545322418213, + "learning_rate": 3.698262038143618e-06, + "loss": 0.8736, + "step": 20523 + }, + { + "epoch": 3.6540242165242165, + "grad_norm": 0.9309592843055725, + "learning_rate": 3.694491512724596e-06, + "loss": 0.8286, + "step": 20524 + }, + { + "epoch": 3.6542022792022792, + "grad_norm": 0.9647026062011719, + "learning_rate": 3.6907228742119825e-06, + "loss": 0.7917, + "step": 20525 + }, + { + "epoch": 3.654380341880342, + "grad_norm": 1.0884795188903809, + "learning_rate": 3.6869561226795744e-06, + "loss": 0.9536, + "step": 20526 + }, + { + "epoch": 3.6545584045584043, + "grad_norm": 0.8624057173728943, + "learning_rate": 3.6831912582012017e-06, + "loss": 0.6808, + "step": 20527 + }, + { + "epoch": 3.654736467236467, + "grad_norm": 0.9675288796424866, + "learning_rate": 3.6794282808505943e-06, + "loss": 0.7873, + "step": 20528 + }, + { + "epoch": 3.65491452991453, + "grad_norm": 1.0651471614837646, + "learning_rate": 3.675667190701515e-06, + "loss": 0.8461, + "step": 20529 + }, + { + "epoch": 3.6550925925925926, + "grad_norm": 1.0895347595214844, + "learning_rate": 3.6719079878276387e-06, + "loss": 1.2929, + "step": 20530 + }, + { + 
"epoch": 3.6552706552706553, + "grad_norm": 0.887728214263916, + "learning_rate": 3.668150672302606e-06, + "loss": 0.7159, + "step": 20531 + }, + { + "epoch": 3.655448717948718, + "grad_norm": 0.9625465869903564, + "learning_rate": 3.6643952442000807e-06, + "loss": 0.8361, + "step": 20532 + }, + { + "epoch": 3.655626780626781, + "grad_norm": 0.8512797951698303, + "learning_rate": 3.6606417035935816e-06, + "loss": 0.7388, + "step": 20533 + }, + { + "epoch": 3.6558048433048436, + "grad_norm": 1.0100477933883667, + "learning_rate": 3.656890050556694e-06, + "loss": 0.6821, + "step": 20534 + }, + { + "epoch": 3.655982905982906, + "grad_norm": 0.9464243054389954, + "learning_rate": 3.6531402851629036e-06, + "loss": 0.7139, + "step": 20535 + }, + { + "epoch": 3.6561609686609686, + "grad_norm": 0.8691574931144714, + "learning_rate": 3.6493924074856966e-06, + "loss": 0.6875, + "step": 20536 + }, + { + "epoch": 3.6563390313390314, + "grad_norm": 0.8101440072059631, + "learning_rate": 3.645646417598492e-06, + "loss": 0.5582, + "step": 20537 + }, + { + "epoch": 3.656517094017094, + "grad_norm": 0.8926877379417419, + "learning_rate": 3.6419023155746854e-06, + "loss": 0.6672, + "step": 20538 + }, + { + "epoch": 3.6566951566951564, + "grad_norm": 0.9474127888679504, + "learning_rate": 3.638160101487631e-06, + "loss": 0.7278, + "step": 20539 + }, + { + "epoch": 3.656873219373219, + "grad_norm": 0.9432591199874878, + "learning_rate": 3.634419775410658e-06, + "loss": 0.7238, + "step": 20540 + }, + { + "epoch": 3.657051282051282, + "grad_norm": 0.9014365673065186, + "learning_rate": 3.630681337417041e-06, + "loss": 0.6098, + "step": 20541 + }, + { + "epoch": 3.6572293447293447, + "grad_norm": 0.9230480790138245, + "learning_rate": 3.6269447875800557e-06, + "loss": 0.6788, + "step": 20542 + }, + { + "epoch": 3.6574074074074074, + "grad_norm": 0.9330241680145264, + "learning_rate": 3.6232101259728644e-06, + "loss": 0.8389, + "step": 20543 + }, + { + "epoch": 3.65758547008547, + 
"grad_norm": 0.9799591898918152, + "learning_rate": 3.6194773526686764e-06, + "loss": 0.8186, + "step": 20544 + }, + { + "epoch": 3.657763532763533, + "grad_norm": 1.0543632507324219, + "learning_rate": 3.6157464677406105e-06, + "loss": 0.7756, + "step": 20545 + }, + { + "epoch": 3.6579415954415957, + "grad_norm": 0.8438118696212769, + "learning_rate": 3.612017471261775e-06, + "loss": 0.6754, + "step": 20546 + }, + { + "epoch": 3.658119658119658, + "grad_norm": 0.8986937403678894, + "learning_rate": 3.6082903633052335e-06, + "loss": 0.656, + "step": 20547 + }, + { + "epoch": 3.6582977207977208, + "grad_norm": 1.0008556842803955, + "learning_rate": 3.604565143944005e-06, + "loss": 0.7939, + "step": 20548 + }, + { + "epoch": 3.6584757834757835, + "grad_norm": 1.050921082496643, + "learning_rate": 3.600841813251066e-06, + "loss": 0.6991, + "step": 20549 + }, + { + "epoch": 3.6586538461538463, + "grad_norm": 1.1899060010910034, + "learning_rate": 3.5971203712993894e-06, + "loss": 0.8935, + "step": 20550 + }, + { + "epoch": 3.6588319088319086, + "grad_norm": 0.9493093490600586, + "learning_rate": 3.593400818161885e-06, + "loss": 0.8281, + "step": 20551 + }, + { + "epoch": 3.6590099715099713, + "grad_norm": 0.9087548851966858, + "learning_rate": 3.5896831539114162e-06, + "loss": 0.5721, + "step": 20552 + }, + { + "epoch": 3.659188034188034, + "grad_norm": 0.934526801109314, + "learning_rate": 3.585967378620836e-06, + "loss": 0.7392, + "step": 20553 + }, + { + "epoch": 3.659366096866097, + "grad_norm": 0.8018043041229248, + "learning_rate": 3.582253492362941e-06, + "loss": 0.6153, + "step": 20554 + }, + { + "epoch": 3.6595441595441596, + "grad_norm": 1.037794828414917, + "learning_rate": 3.5785414952104966e-06, + "loss": 0.7521, + "step": 20555 + }, + { + "epoch": 3.6597222222222223, + "grad_norm": 0.9660981297492981, + "learning_rate": 3.5748313872362215e-06, + "loss": 0.7198, + "step": 20556 + }, + { + "epoch": 3.659900284900285, + "grad_norm": 0.9670109152793884, + 
"learning_rate": 3.5711231685128464e-06, + "loss": 0.6839, + "step": 20557 + }, + { + "epoch": 3.660078347578348, + "grad_norm": 1.0562913417816162, + "learning_rate": 3.5674168391129693e-06, + "loss": 1.0162, + "step": 20558 + }, + { + "epoch": 3.66025641025641, + "grad_norm": 0.9485027194023132, + "learning_rate": 3.5637123991092538e-06, + "loss": 0.8613, + "step": 20559 + }, + { + "epoch": 3.660434472934473, + "grad_norm": 1.0248631238937378, + "learning_rate": 3.5600098485742637e-06, + "loss": 0.816, + "step": 20560 + }, + { + "epoch": 3.6606125356125356, + "grad_norm": 0.9192467331886292, + "learning_rate": 3.556309187580553e-06, + "loss": 0.7076, + "step": 20561 + }, + { + "epoch": 3.6607905982905984, + "grad_norm": 0.982490599155426, + "learning_rate": 3.552610416200608e-06, + "loss": 0.6947, + "step": 20562 + }, + { + "epoch": 3.6609686609686607, + "grad_norm": 0.8929651975631714, + "learning_rate": 3.5489135345069147e-06, + "loss": 0.7547, + "step": 20563 + }, + { + "epoch": 3.6611467236467234, + "grad_norm": 1.0499329566955566, + "learning_rate": 3.545218542571893e-06, + "loss": 0.6648, + "step": 20564 + }, + { + "epoch": 3.661324786324786, + "grad_norm": 1.022430181503296, + "learning_rate": 3.541525440467952e-06, + "loss": 0.7897, + "step": 20565 + }, + { + "epoch": 3.661502849002849, + "grad_norm": 0.9741869568824768, + "learning_rate": 3.5378342282674336e-06, + "loss": 0.7267, + "step": 20566 + }, + { + "epoch": 3.6616809116809117, + "grad_norm": 0.9906203150749207, + "learning_rate": 3.534144906042702e-06, + "loss": 0.5623, + "step": 20567 + }, + { + "epoch": 3.6618589743589745, + "grad_norm": 0.8446068167686462, + "learning_rate": 3.530457473865978e-06, + "loss": 0.79, + "step": 20568 + }, + { + "epoch": 3.662037037037037, + "grad_norm": 0.9871222376823425, + "learning_rate": 3.5267719318095583e-06, + "loss": 0.8294, + "step": 20569 + }, + { + "epoch": 3.6622150997151, + "grad_norm": 1.349190592765808, + "learning_rate": 3.52308827994563e-06, + 
"loss": 0.9821, + "step": 20570 + }, + { + "epoch": 3.6623931623931623, + "grad_norm": 0.9572153687477112, + "learning_rate": 3.5194065183463686e-06, + "loss": 0.7993, + "step": 20571 + }, + { + "epoch": 3.662571225071225, + "grad_norm": 0.7855825424194336, + "learning_rate": 3.5157266470839277e-06, + "loss": 0.6361, + "step": 20572 + }, + { + "epoch": 3.6627492877492878, + "grad_norm": 0.958167314529419, + "learning_rate": 3.512048666230383e-06, + "loss": 0.7887, + "step": 20573 + }, + { + "epoch": 3.6629273504273505, + "grad_norm": 1.0252838134765625, + "learning_rate": 3.5083725758578325e-06, + "loss": 0.6965, + "step": 20574 + }, + { + "epoch": 3.6631054131054133, + "grad_norm": 0.8903803825378418, + "learning_rate": 3.5046983760382403e-06, + "loss": 0.7763, + "step": 20575 + }, + { + "epoch": 3.6632834757834756, + "grad_norm": 1.00473153591156, + "learning_rate": 3.50102606684366e-06, + "loss": 0.6745, + "step": 20576 + }, + { + "epoch": 3.6634615384615383, + "grad_norm": 0.9572219848632812, + "learning_rate": 3.4973556483460013e-06, + "loss": 0.6424, + "step": 20577 + }, + { + "epoch": 3.663639601139601, + "grad_norm": 0.8433740139007568, + "learning_rate": 3.493687120617206e-06, + "loss": 0.6839, + "step": 20578 + }, + { + "epoch": 3.663817663817664, + "grad_norm": 0.9884551763534546, + "learning_rate": 3.4900204837291284e-06, + "loss": 0.7555, + "step": 20579 + }, + { + "epoch": 3.6639957264957266, + "grad_norm": 1.024325966835022, + "learning_rate": 3.4863557377536103e-06, + "loss": 0.7884, + "step": 20580 + }, + { + "epoch": 3.6641737891737893, + "grad_norm": 0.9565973281860352, + "learning_rate": 3.4826928827624617e-06, + "loss": 0.7441, + "step": 20581 + }, + { + "epoch": 3.664351851851852, + "grad_norm": 1.0180308818817139, + "learning_rate": 3.479031918827469e-06, + "loss": 0.6455, + "step": 20582 + }, + { + "epoch": 3.6645299145299144, + "grad_norm": 0.9916195869445801, + "learning_rate": 3.4753728460203082e-06, + "loss": 0.9352, + "step": 20583 + }, 
+ { + "epoch": 3.664707977207977, + "grad_norm": 0.912993848323822, + "learning_rate": 3.4717156644127335e-06, + "loss": 0.766, + "step": 20584 + }, + { + "epoch": 3.66488603988604, + "grad_norm": 0.9358601570129395, + "learning_rate": 3.468060374076354e-06, + "loss": 0.7468, + "step": 20585 + }, + { + "epoch": 3.6650641025641026, + "grad_norm": 0.9579420685768127, + "learning_rate": 3.464406975082812e-06, + "loss": 0.7707, + "step": 20586 + }, + { + "epoch": 3.6652421652421654, + "grad_norm": 0.912672221660614, + "learning_rate": 3.4607554675036736e-06, + "loss": 0.7838, + "step": 20587 + }, + { + "epoch": 3.6654202279202277, + "grad_norm": 0.9116925597190857, + "learning_rate": 3.4571058514105027e-06, + "loss": 0.6586, + "step": 20588 + }, + { + "epoch": 3.6655982905982905, + "grad_norm": 1.2207914590835571, + "learning_rate": 3.453458126874776e-06, + "loss": 0.9087, + "step": 20589 + }, + { + "epoch": 3.665776353276353, + "grad_norm": 0.8218117356300354, + "learning_rate": 3.44981229396798e-06, + "loss": 0.4345, + "step": 20590 + }, + { + "epoch": 3.665954415954416, + "grad_norm": 0.9615563154220581, + "learning_rate": 3.4461683527615475e-06, + "loss": 0.7845, + "step": 20591 + }, + { + "epoch": 3.6661324786324787, + "grad_norm": 0.9879812598228455, + "learning_rate": 3.4425263033268762e-06, + "loss": 0.6126, + "step": 20592 + }, + { + "epoch": 3.6663105413105415, + "grad_norm": 0.8715957403182983, + "learning_rate": 3.4388861457353316e-06, + "loss": 0.7591, + "step": 20593 + }, + { + "epoch": 3.666488603988604, + "grad_norm": 0.9076233506202698, + "learning_rate": 3.435247880058212e-06, + "loss": 0.9255, + "step": 20594 + }, + { + "epoch": 3.6666666666666665, + "grad_norm": 0.8798804879188538, + "learning_rate": 3.4316115063668164e-06, + "loss": 0.7389, + "step": 20595 + }, + { + "epoch": 3.6668447293447293, + "grad_norm": 0.9797578454017639, + "learning_rate": 3.427977024732376e-06, + "loss": 0.7214, + "step": 20596 + }, + { + "epoch": 3.667022792022792, + 
"grad_norm": 0.9402558207511902, + "learning_rate": 3.4243444352261454e-06, + "loss": 0.7828, + "step": 20597 + }, + { + "epoch": 3.6672008547008548, + "grad_norm": 0.8900625705718994, + "learning_rate": 3.4207137379192454e-06, + "loss": 0.7614, + "step": 20598 + }, + { + "epoch": 3.6673789173789175, + "grad_norm": 0.9660239219665527, + "learning_rate": 3.4170849328828525e-06, + "loss": 0.657, + "step": 20599 + }, + { + "epoch": 3.66755698005698, + "grad_norm": 0.9239768981933594, + "learning_rate": 3.4134580201880208e-06, + "loss": 0.716, + "step": 20600 + }, + { + "epoch": 3.6677350427350426, + "grad_norm": 0.8773369193077087, + "learning_rate": 3.4098329999058487e-06, + "loss": 0.7245, + "step": 20601 + }, + { + "epoch": 3.6679131054131053, + "grad_norm": 0.938544750213623, + "learning_rate": 3.4062098721073575e-06, + "loss": 0.7817, + "step": 20602 + }, + { + "epoch": 3.668091168091168, + "grad_norm": 1.0228548049926758, + "learning_rate": 3.402588636863524e-06, + "loss": 0.947, + "step": 20603 + }, + { + "epoch": 3.668269230769231, + "grad_norm": 0.921551525592804, + "learning_rate": 3.398969294245302e-06, + "loss": 0.6954, + "step": 20604 + }, + { + "epoch": 3.6684472934472936, + "grad_norm": 0.9160789251327515, + "learning_rate": 3.395351844323602e-06, + "loss": 0.7823, + "step": 20605 + }, + { + "epoch": 3.6686253561253563, + "grad_norm": 0.9858134984970093, + "learning_rate": 3.3917362871693004e-06, + "loss": 0.7063, + "step": 20606 + }, + { + "epoch": 3.6688034188034186, + "grad_norm": 0.868196964263916, + "learning_rate": 3.3881226228532513e-06, + "loss": 0.6792, + "step": 20607 + }, + { + "epoch": 3.6689814814814814, + "grad_norm": 0.8798336386680603, + "learning_rate": 3.3845108514462322e-06, + "loss": 0.7249, + "step": 20608 + }, + { + "epoch": 3.669159544159544, + "grad_norm": 1.0030553340911865, + "learning_rate": 3.38090097301903e-06, + "loss": 0.7388, + "step": 20609 + }, + { + "epoch": 3.669337606837607, + "grad_norm": 0.8349957466125488, + 
"learning_rate": 3.3772929876423664e-06, + "loss": 0.6141, + "step": 20610 + }, + { + "epoch": 3.6695156695156697, + "grad_norm": 0.937864363193512, + "learning_rate": 3.3736868953869293e-06, + "loss": 0.7051, + "step": 20611 + }, + { + "epoch": 3.669693732193732, + "grad_norm": 0.9294646382331848, + "learning_rate": 3.3700826963233735e-06, + "loss": 0.7734, + "step": 20612 + }, + { + "epoch": 3.6698717948717947, + "grad_norm": 1.09479820728302, + "learning_rate": 3.3664803905223086e-06, + "loss": 0.8817, + "step": 20613 + }, + { + "epoch": 3.6700498575498575, + "grad_norm": 0.9988318085670471, + "learning_rate": 3.3628799780543342e-06, + "loss": 0.7704, + "step": 20614 + }, + { + "epoch": 3.67022792022792, + "grad_norm": 0.9090431928634644, + "learning_rate": 3.359281458989971e-06, + "loss": 0.6913, + "step": 20615 + }, + { + "epoch": 3.670405982905983, + "grad_norm": 1.36585533618927, + "learning_rate": 3.3556848333997304e-06, + "loss": 0.8042, + "step": 20616 + }, + { + "epoch": 3.6705840455840457, + "grad_norm": 1.0545517206192017, + "learning_rate": 3.3520901013540996e-06, + "loss": 0.7149, + "step": 20617 + }, + { + "epoch": 3.6707621082621085, + "grad_norm": 1.0674611330032349, + "learning_rate": 3.348497262923489e-06, + "loss": 0.9619, + "step": 20618 + }, + { + "epoch": 3.6709401709401708, + "grad_norm": 0.9115710258483887, + "learning_rate": 3.344906318178287e-06, + "loss": 0.823, + "step": 20619 + }, + { + "epoch": 3.6711182336182335, + "grad_norm": 0.8687363862991333, + "learning_rate": 3.3413172671888813e-06, + "loss": 0.6472, + "step": 20620 + }, + { + "epoch": 3.6712962962962963, + "grad_norm": 0.9118328094482422, + "learning_rate": 3.33773011002555e-06, + "loss": 0.6483, + "step": 20621 + }, + { + "epoch": 3.671474358974359, + "grad_norm": 1.0184921026229858, + "learning_rate": 3.3341448467586245e-06, + "loss": 0.6792, + "step": 20622 + }, + { + "epoch": 3.671652421652422, + "grad_norm": 0.8835800290107727, + "learning_rate": 3.330561477458294e-06, 
+ "loss": 0.7004, + "step": 20623 + }, + { + "epoch": 3.671830484330484, + "grad_norm": 1.0011813640594482, + "learning_rate": 3.326980002194835e-06, + "loss": 0.8112, + "step": 20624 + }, + { + "epoch": 3.672008547008547, + "grad_norm": 0.8921766877174377, + "learning_rate": 3.3234004210383473e-06, + "loss": 0.5758, + "step": 20625 + }, + { + "epoch": 3.6721866096866096, + "grad_norm": 0.94114750623703, + "learning_rate": 3.319822734059019e-06, + "loss": 0.7338, + "step": 20626 + }, + { + "epoch": 3.6723646723646723, + "grad_norm": 1.024093747138977, + "learning_rate": 3.316246941326917e-06, + "loss": 0.7397, + "step": 20627 + }, + { + "epoch": 3.672542735042735, + "grad_norm": 0.9239045977592468, + "learning_rate": 3.312673042912129e-06, + "loss": 0.7344, + "step": 20628 + }, + { + "epoch": 3.672720797720798, + "grad_norm": 1.241755485534668, + "learning_rate": 3.3091010388846543e-06, + "loss": 0.7574, + "step": 20629 + }, + { + "epoch": 3.6728988603988606, + "grad_norm": 0.9279388189315796, + "learning_rate": 3.305530929314471e-06, + "loss": 0.6978, + "step": 20630 + }, + { + "epoch": 3.6730769230769234, + "grad_norm": 1.0421857833862305, + "learning_rate": 3.301962714271567e-06, + "loss": 0.7893, + "step": 20631 + }, + { + "epoch": 3.6732549857549857, + "grad_norm": 0.944916307926178, + "learning_rate": 3.29839639382582e-06, + "loss": 0.5987, + "step": 20632 + }, + { + "epoch": 3.6734330484330484, + "grad_norm": 0.8627259135246277, + "learning_rate": 3.2948319680471184e-06, + "loss": 0.6824, + "step": 20633 + }, + { + "epoch": 3.673611111111111, + "grad_norm": 1.1818138360977173, + "learning_rate": 3.2912694370052954e-06, + "loss": 0.9816, + "step": 20634 + }, + { + "epoch": 3.673789173789174, + "grad_norm": 1.1198930740356445, + "learning_rate": 3.2877088007701618e-06, + "loss": 0.8528, + "step": 20635 + }, + { + "epoch": 3.673967236467236, + "grad_norm": 0.9954573512077332, + "learning_rate": 3.2841500594114615e-06, + "loss": 0.9535, + "step": 20636 + }, + { 
+ "epoch": 3.674145299145299, + "grad_norm": 0.9435851573944092, + "learning_rate": 3.2805932129989393e-06, + "loss": 0.7501, + "step": 20637 + }, + { + "epoch": 3.6743233618233617, + "grad_norm": 1.2884607315063477, + "learning_rate": 3.277038261602261e-06, + "loss": 0.815, + "step": 20638 + }, + { + "epoch": 3.6745014245014245, + "grad_norm": 0.9409137964248657, + "learning_rate": 3.273485205291116e-06, + "loss": 1.0288, + "step": 20639 + }, + { + "epoch": 3.6746794871794872, + "grad_norm": 0.8598392009735107, + "learning_rate": 3.26993404413507e-06, + "loss": 0.5697, + "step": 20640 + }, + { + "epoch": 3.67485754985755, + "grad_norm": 0.9609706401824951, + "learning_rate": 3.266384778203735e-06, + "loss": 0.7962, + "step": 20641 + }, + { + "epoch": 3.6750356125356127, + "grad_norm": 1.1136882305145264, + "learning_rate": 3.2628374075666546e-06, + "loss": 0.8393, + "step": 20642 + }, + { + "epoch": 3.6752136752136755, + "grad_norm": 0.9873359203338623, + "learning_rate": 3.259291932293318e-06, + "loss": 0.749, + "step": 20643 + }, + { + "epoch": 3.675391737891738, + "grad_norm": 0.8118230700492859, + "learning_rate": 3.255748352453192e-06, + "loss": 0.6138, + "step": 20644 + }, + { + "epoch": 3.6755698005698005, + "grad_norm": 0.9738235473632812, + "learning_rate": 3.2522066681157094e-06, + "loss": 0.7854, + "step": 20645 + }, + { + "epoch": 3.6757478632478633, + "grad_norm": 0.965652346611023, + "learning_rate": 3.2486668793502593e-06, + "loss": 0.8144, + "step": 20646 + }, + { + "epoch": 3.675925925925926, + "grad_norm": 0.8933370113372803, + "learning_rate": 3.245128986226198e-06, + "loss": 0.9418, + "step": 20647 + }, + { + "epoch": 3.6761039886039883, + "grad_norm": 1.0203384160995483, + "learning_rate": 3.241592988812836e-06, + "loss": 1.0724, + "step": 20648 + }, + { + "epoch": 3.676282051282051, + "grad_norm": 0.8109650611877441, + "learning_rate": 3.2380588871794736e-06, + "loss": 0.6265, + "step": 20649 + }, + { + "epoch": 3.676460113960114, + 
"grad_norm": 1.0682241916656494, + "learning_rate": 3.2345266813953334e-06, + "loss": 0.797, + "step": 20650 + }, + { + "epoch": 3.6766381766381766, + "grad_norm": 0.9123262166976929, + "learning_rate": 3.2309963715296376e-06, + "loss": 0.6912, + "step": 20651 + }, + { + "epoch": 3.6768162393162394, + "grad_norm": 0.8609589338302612, + "learning_rate": 3.227467957651553e-06, + "loss": 0.8129, + "step": 20652 + }, + { + "epoch": 3.676994301994302, + "grad_norm": 0.8229508399963379, + "learning_rate": 3.22394143983018e-06, + "loss": 0.678, + "step": 20653 + }, + { + "epoch": 3.677172364672365, + "grad_norm": 0.9235960245132446, + "learning_rate": 3.220416818134675e-06, + "loss": 0.6403, + "step": 20654 + }, + { + "epoch": 3.6773504273504276, + "grad_norm": 0.9439797401428223, + "learning_rate": 3.2168940926340264e-06, + "loss": 0.723, + "step": 20655 + }, + { + "epoch": 3.67752849002849, + "grad_norm": 1.1216174364089966, + "learning_rate": 3.2133732633973124e-06, + "loss": 0.5696, + "step": 20656 + }, + { + "epoch": 3.6777065527065527, + "grad_norm": 0.8374674916267395, + "learning_rate": 3.209854330493478e-06, + "loss": 0.6244, + "step": 20657 + }, + { + "epoch": 3.6778846153846154, + "grad_norm": 1.0153228044509888, + "learning_rate": 3.2063372939915014e-06, + "loss": 0.8841, + "step": 20658 + }, + { + "epoch": 3.678062678062678, + "grad_norm": 0.9651415944099426, + "learning_rate": 3.2028221539602608e-06, + "loss": 0.7879, + "step": 20659 + }, + { + "epoch": 3.6782407407407405, + "grad_norm": 0.9977912902832031, + "learning_rate": 3.199308910468646e-06, + "loss": 0.7334, + "step": 20660 + }, + { + "epoch": 3.6784188034188032, + "grad_norm": 0.9887592196464539, + "learning_rate": 3.1957975635854786e-06, + "loss": 0.7634, + "step": 20661 + }, + { + "epoch": 3.678596866096866, + "grad_norm": 1.00025475025177, + "learning_rate": 3.1922881133795825e-06, + "loss": 0.8121, + "step": 20662 + }, + { + "epoch": 3.6787749287749287, + "grad_norm": 0.8790071606636047, + 
"learning_rate": 3.1887805599196683e-06, + "loss": 0.6167, + "step": 20663 + }, + { + "epoch": 3.6789529914529915, + "grad_norm": 1.0021897554397583, + "learning_rate": 3.1852749032745267e-06, + "loss": 0.6549, + "step": 20664 + }, + { + "epoch": 3.6791310541310542, + "grad_norm": 1.0172662734985352, + "learning_rate": 3.1817711435127906e-06, + "loss": 0.7399, + "step": 20665 + }, + { + "epoch": 3.679309116809117, + "grad_norm": 0.9718804955482483, + "learning_rate": 3.1782692807031276e-06, + "loss": 0.6546, + "step": 20666 + }, + { + "epoch": 3.6794871794871797, + "grad_norm": 1.0091427564620972, + "learning_rate": 3.17476931491415e-06, + "loss": 0.7262, + "step": 20667 + }, + { + "epoch": 3.679665242165242, + "grad_norm": 0.9035787582397461, + "learning_rate": 3.1712712462144134e-06, + "loss": 0.9596, + "step": 20668 + }, + { + "epoch": 3.679843304843305, + "grad_norm": 1.2351285219192505, + "learning_rate": 3.1677750746725077e-06, + "loss": 0.7528, + "step": 20669 + }, + { + "epoch": 3.6800213675213675, + "grad_norm": 1.065185308456421, + "learning_rate": 3.164280800356867e-06, + "loss": 0.8681, + "step": 20670 + }, + { + "epoch": 3.6801994301994303, + "grad_norm": 0.8398473858833313, + "learning_rate": 3.160788423336014e-06, + "loss": 0.5556, + "step": 20671 + }, + { + "epoch": 3.6803774928774926, + "grad_norm": 0.9858914613723755, + "learning_rate": 3.1572979436783168e-06, + "loss": 0.9169, + "step": 20672 + }, + { + "epoch": 3.6805555555555554, + "grad_norm": 0.968228280544281, + "learning_rate": 3.153809361452209e-06, + "loss": 0.7122, + "step": 20673 + }, + { + "epoch": 3.680733618233618, + "grad_norm": 0.9299584031105042, + "learning_rate": 3.1503226767260252e-06, + "loss": 0.7147, + "step": 20674 + }, + { + "epoch": 3.680911680911681, + "grad_norm": 0.9766196608543396, + "learning_rate": 3.1468378895680773e-06, + "loss": 0.8661, + "step": 20675 + }, + { + "epoch": 3.6810897435897436, + "grad_norm": 0.9744870662689209, + "learning_rate": 
3.143355000046655e-06, + "loss": 0.7798, + "step": 20676 + }, + { + "epoch": 3.6812678062678064, + "grad_norm": 0.9700605273246765, + "learning_rate": 3.1398740082299817e-06, + "loss": 0.9003, + "step": 20677 + }, + { + "epoch": 3.681445868945869, + "grad_norm": 0.9444677829742432, + "learning_rate": 3.136394914186258e-06, + "loss": 0.8441, + "step": 20678 + }, + { + "epoch": 3.681623931623932, + "grad_norm": 1.0803836584091187, + "learning_rate": 3.1329177179836745e-06, + "loss": 0.9904, + "step": 20679 + }, + { + "epoch": 3.681801994301994, + "grad_norm": 0.8475477695465088, + "learning_rate": 3.1294424196903317e-06, + "loss": 0.8504, + "step": 20680 + }, + { + "epoch": 3.681980056980057, + "grad_norm": 0.882752001285553, + "learning_rate": 3.1259690193743527e-06, + "loss": 0.7105, + "step": 20681 + }, + { + "epoch": 3.6821581196581197, + "grad_norm": 0.9671033620834351, + "learning_rate": 3.122497517103751e-06, + "loss": 0.7545, + "step": 20682 + }, + { + "epoch": 3.6823361823361824, + "grad_norm": 1.0363636016845703, + "learning_rate": 3.1190279129465705e-06, + "loss": 0.7792, + "step": 20683 + }, + { + "epoch": 3.682514245014245, + "grad_norm": 0.9925054907798767, + "learning_rate": 3.1155602069707914e-06, + "loss": 0.7609, + "step": 20684 + }, + { + "epoch": 3.6826923076923075, + "grad_norm": 1.023887038230896, + "learning_rate": 3.1120943992443473e-06, + "loss": 0.8605, + "step": 20685 + }, + { + "epoch": 3.6828703703703702, + "grad_norm": 0.9789915680885315, + "learning_rate": 3.1086304898351513e-06, + "loss": 0.7812, + "step": 20686 + }, + { + "epoch": 3.683048433048433, + "grad_norm": 0.8969167470932007, + "learning_rate": 3.10516847881106e-06, + "loss": 0.608, + "step": 20687 + }, + { + "epoch": 3.6832264957264957, + "grad_norm": 0.90472811460495, + "learning_rate": 3.101708366239908e-06, + "loss": 0.7239, + "step": 20688 + }, + { + "epoch": 3.6834045584045585, + "grad_norm": 1.0310150384902954, + "learning_rate": 3.098250152189497e-06, + "loss": 0.8472, 
+ "step": 20689 + }, + { + "epoch": 3.6835826210826212, + "grad_norm": 1.0674567222595215, + "learning_rate": 3.0947938367275717e-06, + "loss": 0.8983, + "step": 20690 + }, + { + "epoch": 3.683760683760684, + "grad_norm": 0.9554956555366516, + "learning_rate": 3.091339419921868e-06, + "loss": 0.7877, + "step": 20691 + }, + { + "epoch": 3.6839387464387463, + "grad_norm": 0.8440012335777283, + "learning_rate": 3.0878869018400537e-06, + "loss": 0.5558, + "step": 20692 + }, + { + "epoch": 3.684116809116809, + "grad_norm": 0.9536983370780945, + "learning_rate": 3.084436282549774e-06, + "loss": 0.7293, + "step": 20693 + }, + { + "epoch": 3.684294871794872, + "grad_norm": 0.9408102631568909, + "learning_rate": 3.0809875621186534e-06, + "loss": 0.6177, + "step": 20694 + }, + { + "epoch": 3.6844729344729346, + "grad_norm": 0.9360013008117676, + "learning_rate": 3.0775407406142375e-06, + "loss": 0.7644, + "step": 20695 + }, + { + "epoch": 3.6846509971509973, + "grad_norm": 0.9667526483535767, + "learning_rate": 3.0740958181040836e-06, + "loss": 0.6533, + "step": 20696 + }, + { + "epoch": 3.6848290598290596, + "grad_norm": 1.0383641719818115, + "learning_rate": 3.0706527946556596e-06, + "loss": 0.6392, + "step": 20697 + }, + { + "epoch": 3.6850071225071224, + "grad_norm": 0.9858824014663696, + "learning_rate": 3.067211670336445e-06, + "loss": 0.7166, + "step": 20698 + }, + { + "epoch": 3.685185185185185, + "grad_norm": 0.8552738428115845, + "learning_rate": 3.063772445213864e-06, + "loss": 0.8228, + "step": 20699 + }, + { + "epoch": 3.685363247863248, + "grad_norm": 0.8895391821861267, + "learning_rate": 3.0603351193552954e-06, + "loss": 0.7124, + "step": 20700 + }, + { + "epoch": 3.6855413105413106, + "grad_norm": 1.1577317714691162, + "learning_rate": 3.0568996928280857e-06, + "loss": 0.6814, + "step": 20701 + }, + { + "epoch": 3.6857193732193734, + "grad_norm": 0.8977794647216797, + "learning_rate": 3.0534661656995366e-06, + "loss": 0.7366, + "step": 20702 + }, + { + 
"epoch": 3.685897435897436, + "grad_norm": 0.9750751852989197, + "learning_rate": 3.050034538036928e-06, + "loss": 0.924, + "step": 20703 + }, + { + "epoch": 3.6860754985754984, + "grad_norm": 0.9595991969108582, + "learning_rate": 3.0466048099075163e-06, + "loss": 0.8004, + "step": 20704 + }, + { + "epoch": 3.686253561253561, + "grad_norm": 0.9346702098846436, + "learning_rate": 3.0431769813784596e-06, + "loss": 0.928, + "step": 20705 + }, + { + "epoch": 3.686431623931624, + "grad_norm": 0.9379191994667053, + "learning_rate": 3.039751052516948e-06, + "loss": 0.7355, + "step": 20706 + }, + { + "epoch": 3.6866096866096867, + "grad_norm": 0.9948887825012207, + "learning_rate": 3.0363270233900844e-06, + "loss": 0.7923, + "step": 20707 + }, + { + "epoch": 3.6867877492877494, + "grad_norm": 0.9426677227020264, + "learning_rate": 3.0329048940649805e-06, + "loss": 0.844, + "step": 20708 + }, + { + "epoch": 3.6869658119658117, + "grad_norm": 0.957449197769165, + "learning_rate": 3.0294846646086726e-06, + "loss": 0.6105, + "step": 20709 + }, + { + "epoch": 3.6871438746438745, + "grad_norm": 1.0119540691375732, + "learning_rate": 3.0260663350881622e-06, + "loss": 0.8269, + "step": 20710 + }, + { + "epoch": 3.6873219373219372, + "grad_norm": 0.9362712502479553, + "learning_rate": 3.022649905570463e-06, + "loss": 0.7466, + "step": 20711 + }, + { + "epoch": 3.6875, + "grad_norm": 1.0067883729934692, + "learning_rate": 3.0192353761224647e-06, + "loss": 0.6891, + "step": 20712 + }, + { + "epoch": 3.6876780626780628, + "grad_norm": 0.9323202967643738, + "learning_rate": 3.015822746811092e-06, + "loss": 0.8198, + "step": 20713 + }, + { + "epoch": 3.6878561253561255, + "grad_norm": 1.1497761011123657, + "learning_rate": 3.0124120177032034e-06, + "loss": 0.632, + "step": 20714 + }, + { + "epoch": 3.6880341880341883, + "grad_norm": 0.9109611511230469, + "learning_rate": 3.0090031888656334e-06, + "loss": 0.8147, + "step": 20715 + }, + { + "epoch": 3.6882122507122506, + "grad_norm": 
0.9743979573249817, + "learning_rate": 3.0055962603651733e-06, + "loss": 0.7787, + "step": 20716 + }, + { + "epoch": 3.6883903133903133, + "grad_norm": 0.8306061625480652, + "learning_rate": 3.0021912322685473e-06, + "loss": 0.6223, + "step": 20717 + }, + { + "epoch": 3.688568376068376, + "grad_norm": 1.0465481281280518, + "learning_rate": 2.9987881046425025e-06, + "loss": 0.7197, + "step": 20718 + }, + { + "epoch": 3.688746438746439, + "grad_norm": 1.0467299222946167, + "learning_rate": 2.9953868775536964e-06, + "loss": 0.7249, + "step": 20719 + }, + { + "epoch": 3.6889245014245016, + "grad_norm": 1.1018860340118408, + "learning_rate": 2.9919875510687644e-06, + "loss": 0.7112, + "step": 20720 + }, + { + "epoch": 3.689102564102564, + "grad_norm": 0.9819473624229431, + "learning_rate": 2.9885901252543425e-06, + "loss": 0.6801, + "step": 20721 + }, + { + "epoch": 3.6892806267806266, + "grad_norm": 1.0312379598617554, + "learning_rate": 2.985194600176955e-06, + "loss": 0.7441, + "step": 20722 + }, + { + "epoch": 3.6894586894586894, + "grad_norm": 0.9706970453262329, + "learning_rate": 2.9818009759031483e-06, + "loss": 0.7264, + "step": 20723 + }, + { + "epoch": 3.689636752136752, + "grad_norm": 0.8469666242599487, + "learning_rate": 2.9784092524994257e-06, + "loss": 0.6271, + "step": 20724 + }, + { + "epoch": 3.689814814814815, + "grad_norm": 0.8831173777580261, + "learning_rate": 2.9750194300322223e-06, + "loss": 0.5946, + "step": 20725 + }, + { + "epoch": 3.6899928774928776, + "grad_norm": 1.000719428062439, + "learning_rate": 2.9716315085679623e-06, + "loss": 0.8439, + "step": 20726 + }, + { + "epoch": 3.6901709401709404, + "grad_norm": 0.8967503309249878, + "learning_rate": 2.968245488173027e-06, + "loss": 0.6256, + "step": 20727 + }, + { + "epoch": 3.6903490028490027, + "grad_norm": 1.0560952425003052, + "learning_rate": 2.9648613689137404e-06, + "loss": 0.9563, + "step": 20728 + }, + { + "epoch": 3.6905270655270654, + "grad_norm": 0.9296253323554993, + 
"learning_rate": 2.9614791508564277e-06, + "loss": 0.8398, + "step": 20729 + }, + { + "epoch": 3.690705128205128, + "grad_norm": 0.8290141820907593, + "learning_rate": 2.9580988340673577e-06, + "loss": 0.5122, + "step": 20730 + }, + { + "epoch": 3.690883190883191, + "grad_norm": 0.9456691741943359, + "learning_rate": 2.954720418612755e-06, + "loss": 0.6461, + "step": 20731 + }, + { + "epoch": 3.6910612535612537, + "grad_norm": 0.9314257502555847, + "learning_rate": 2.9513439045588122e-06, + "loss": 0.755, + "step": 20732 + }, + { + "epoch": 3.691239316239316, + "grad_norm": 1.037325143814087, + "learning_rate": 2.947969291971686e-06, + "loss": 0.6729, + "step": 20733 + }, + { + "epoch": 3.6914173789173788, + "grad_norm": 0.9310669302940369, + "learning_rate": 2.9445965809174913e-06, + "loss": 0.7465, + "step": 20734 + }, + { + "epoch": 3.6915954415954415, + "grad_norm": 1.0100094079971313, + "learning_rate": 2.941225771462297e-06, + "loss": 0.8561, + "step": 20735 + }, + { + "epoch": 3.6917735042735043, + "grad_norm": 1.0392314195632935, + "learning_rate": 2.9378568636721835e-06, + "loss": 0.9093, + "step": 20736 + }, + { + "epoch": 3.691951566951567, + "grad_norm": 1.1194417476654053, + "learning_rate": 2.934489857613121e-06, + "loss": 0.8177, + "step": 20737 + }, + { + "epoch": 3.6921296296296298, + "grad_norm": 0.9694803357124329, + "learning_rate": 2.9311247533511e-06, + "loss": 0.7652, + "step": 20738 + }, + { + "epoch": 3.6923076923076925, + "grad_norm": 0.9330110549926758, + "learning_rate": 2.927761550952046e-06, + "loss": 0.8571, + "step": 20739 + }, + { + "epoch": 3.692485754985755, + "grad_norm": 0.9791732430458069, + "learning_rate": 2.9244002504818514e-06, + "loss": 0.9147, + "step": 20740 + }, + { + "epoch": 3.6926638176638176, + "grad_norm": 0.9518002271652222, + "learning_rate": 2.921040852006385e-06, + "loss": 0.7033, + "step": 20741 + }, + { + "epoch": 3.6928418803418803, + "grad_norm": 0.9248731732368469, + "learning_rate": 
2.9176833555914495e-06, + "loss": 0.7099, + "step": 20742 + }, + { + "epoch": 3.693019943019943, + "grad_norm": 1.0054923295974731, + "learning_rate": 2.9143277613028486e-06, + "loss": 0.7161, + "step": 20743 + }, + { + "epoch": 3.693198005698006, + "grad_norm": 1.0332942008972168, + "learning_rate": 2.910974069206307e-06, + "loss": 0.7649, + "step": 20744 + }, + { + "epoch": 3.693376068376068, + "grad_norm": 0.9899537563323975, + "learning_rate": 2.907622279367539e-06, + "loss": 0.7775, + "step": 20745 + }, + { + "epoch": 3.693554131054131, + "grad_norm": 1.0299023389816284, + "learning_rate": 2.9042723918522365e-06, + "loss": 0.7575, + "step": 20746 + }, + { + "epoch": 3.6937321937321936, + "grad_norm": 0.9683845043182373, + "learning_rate": 2.9009244067260024e-06, + "loss": 0.8563, + "step": 20747 + }, + { + "epoch": 3.6939102564102564, + "grad_norm": 0.9879896640777588, + "learning_rate": 2.897578324054451e-06, + "loss": 0.9879, + "step": 20748 + }, + { + "epoch": 3.694088319088319, + "grad_norm": 0.9869822859764099, + "learning_rate": 2.8942341439031405e-06, + "loss": 0.804, + "step": 20749 + }, + { + "epoch": 3.694266381766382, + "grad_norm": 0.9771765470504761, + "learning_rate": 2.8908918663375863e-06, + "loss": 0.9977, + "step": 20750 + }, + { + "epoch": 3.6944444444444446, + "grad_norm": 0.8605872988700867, + "learning_rate": 2.8875514914233013e-06, + "loss": 0.8474, + "step": 20751 + }, + { + "epoch": 3.6946225071225074, + "grad_norm": 0.9646205902099609, + "learning_rate": 2.88421301922569e-06, + "loss": 0.6685, + "step": 20752 + }, + { + "epoch": 3.6948005698005697, + "grad_norm": 0.9683958888053894, + "learning_rate": 2.880876449810199e-06, + "loss": 0.965, + "step": 20753 + }, + { + "epoch": 3.6949786324786325, + "grad_norm": 0.8820703029632568, + "learning_rate": 2.877541783242177e-06, + "loss": 0.5889, + "step": 20754 + }, + { + "epoch": 3.695156695156695, + "grad_norm": 0.9471021890640259, + "learning_rate": 2.8742090195869707e-06, + "loss": 
0.6262, + "step": 20755 + }, + { + "epoch": 3.695334757834758, + "grad_norm": 0.9770998358726501, + "learning_rate": 2.870878158909884e-06, + "loss": 0.8444, + "step": 20756 + }, + { + "epoch": 3.6955128205128203, + "grad_norm": 0.8741800785064697, + "learning_rate": 2.867549201276165e-06, + "loss": 0.6484, + "step": 20757 + }, + { + "epoch": 3.695690883190883, + "grad_norm": 0.9649479985237122, + "learning_rate": 2.864222146751039e-06, + "loss": 0.6613, + "step": 20758 + }, + { + "epoch": 3.6958689458689458, + "grad_norm": 0.9703496694564819, + "learning_rate": 2.8608969953997087e-06, + "loss": 0.6868, + "step": 20759 + }, + { + "epoch": 3.6960470085470085, + "grad_norm": 0.9378167986869812, + "learning_rate": 2.857573747287301e-06, + "loss": 0.6265, + "step": 20760 + }, + { + "epoch": 3.6962250712250713, + "grad_norm": 0.9149529337882996, + "learning_rate": 2.8542524024789517e-06, + "loss": 0.5854, + "step": 20761 + }, + { + "epoch": 3.696403133903134, + "grad_norm": 0.9306116700172424, + "learning_rate": 2.8509329610397095e-06, + "loss": 0.795, + "step": 20762 + }, + { + "epoch": 3.6965811965811968, + "grad_norm": 0.9093084335327148, + "learning_rate": 2.8476154230346443e-06, + "loss": 0.7913, + "step": 20763 + }, + { + "epoch": 3.6967592592592595, + "grad_norm": 1.0166517496109009, + "learning_rate": 2.844299788528726e-06, + "loss": 0.739, + "step": 20764 + }, + { + "epoch": 3.696937321937322, + "grad_norm": 0.8517898917198181, + "learning_rate": 2.8409860575869364e-06, + "loss": 0.7489, + "step": 20765 + }, + { + "epoch": 3.6971153846153846, + "grad_norm": 1.0916438102722168, + "learning_rate": 2.83767423027419e-06, + "loss": 0.8086, + "step": 20766 + }, + { + "epoch": 3.6972934472934473, + "grad_norm": 1.061730980873108, + "learning_rate": 2.8343643066553793e-06, + "loss": 0.5334, + "step": 20767 + }, + { + "epoch": 3.69747150997151, + "grad_norm": 1.100048542022705, + "learning_rate": 2.831056286795364e-06, + "loss": 0.8805, + "step": 20768 + }, + { + 
"epoch": 3.6976495726495724, + "grad_norm": 0.6898918747901917, + "learning_rate": 2.827750170758936e-06, + "loss": 0.4459, + "step": 20769 + }, + { + "epoch": 3.697827635327635, + "grad_norm": 0.8306655883789062, + "learning_rate": 2.8244459586108997e-06, + "loss": 0.7836, + "step": 20770 + }, + { + "epoch": 3.698005698005698, + "grad_norm": 1.0033217668533325, + "learning_rate": 2.8211436504159695e-06, + "loss": 0.8711, + "step": 20771 + }, + { + "epoch": 3.6981837606837606, + "grad_norm": 1.0434397459030151, + "learning_rate": 2.8178432462388602e-06, + "loss": 0.7871, + "step": 20772 + }, + { + "epoch": 3.6983618233618234, + "grad_norm": 0.9520493149757385, + "learning_rate": 2.814544746144243e-06, + "loss": 0.9114, + "step": 20773 + }, + { + "epoch": 3.698539886039886, + "grad_norm": 0.9986703991889954, + "learning_rate": 2.8112481501967326e-06, + "loss": 0.8862, + "step": 20774 + }, + { + "epoch": 3.698717948717949, + "grad_norm": 0.9036804437637329, + "learning_rate": 2.80795345846091e-06, + "loss": 0.7594, + "step": 20775 + }, + { + "epoch": 3.6988960113960117, + "grad_norm": 0.9680522084236145, + "learning_rate": 2.804660671001358e-06, + "loss": 0.8734, + "step": 20776 + }, + { + "epoch": 3.699074074074074, + "grad_norm": 0.9929130673408508, + "learning_rate": 2.8013697878825573e-06, + "loss": 0.8834, + "step": 20777 + }, + { + "epoch": 3.6992521367521367, + "grad_norm": 0.9345636367797852, + "learning_rate": 2.7980808091690234e-06, + "loss": 0.5392, + "step": 20778 + }, + { + "epoch": 3.6994301994301995, + "grad_norm": 0.9011038541793823, + "learning_rate": 2.7947937349251495e-06, + "loss": 0.6209, + "step": 20779 + }, + { + "epoch": 3.699608262108262, + "grad_norm": 1.0361058712005615, + "learning_rate": 2.791508565215384e-06, + "loss": 0.8411, + "step": 20780 + }, + { + "epoch": 3.6997863247863245, + "grad_norm": 0.8469780087471008, + "learning_rate": 2.7882253001040746e-06, + "loss": 0.5842, + "step": 20781 + }, + { + "epoch": 3.6999643874643873, + 
"grad_norm": 0.9913833141326904, + "learning_rate": 2.784943939655549e-06, + "loss": 0.7048, + "step": 20782 + }, + { + "epoch": 3.70014245014245, + "grad_norm": 0.6060404777526855, + "learning_rate": 2.781664483934099e-06, + "loss": 0.3919, + "step": 20783 + }, + { + "epoch": 3.7003205128205128, + "grad_norm": 0.9720197319984436, + "learning_rate": 2.7783869330039847e-06, + "loss": 0.6759, + "step": 20784 + }, + { + "epoch": 3.7004985754985755, + "grad_norm": 0.9591602087020874, + "learning_rate": 2.7751112869293993e-06, + "loss": 0.9101, + "step": 20785 + }, + { + "epoch": 3.7006766381766383, + "grad_norm": 1.0980218648910522, + "learning_rate": 2.7718375457745693e-06, + "loss": 0.7299, + "step": 20786 + }, + { + "epoch": 3.700854700854701, + "grad_norm": 0.9733334183692932, + "learning_rate": 2.7685657096035877e-06, + "loss": 0.9226, + "step": 20787 + }, + { + "epoch": 3.701032763532764, + "grad_norm": 1.0194134712219238, + "learning_rate": 2.7652957784805923e-06, + "loss": 0.7872, + "step": 20788 + }, + { + "epoch": 3.701210826210826, + "grad_norm": 0.893657922744751, + "learning_rate": 2.7620277524696313e-06, + "loss": 0.7858, + "step": 20789 + }, + { + "epoch": 3.701388888888889, + "grad_norm": 1.019237995147705, + "learning_rate": 2.758761631634743e-06, + "loss": 0.668, + "step": 20790 + }, + { + "epoch": 3.7015669515669516, + "grad_norm": 0.9971436858177185, + "learning_rate": 2.7554974160399203e-06, + "loss": 0.7905, + "step": 20791 + }, + { + "epoch": 3.7017450142450143, + "grad_norm": 1.0228685140609741, + "learning_rate": 2.7522351057491124e-06, + "loss": 0.8796, + "step": 20792 + }, + { + "epoch": 3.7019230769230766, + "grad_norm": 0.8594250082969666, + "learning_rate": 2.748974700826257e-06, + "loss": 0.7012, + "step": 20793 + }, + { + "epoch": 3.7021011396011394, + "grad_norm": 1.0440367460250854, + "learning_rate": 2.7457162013352023e-06, + "loss": 0.8736, + "step": 20794 + }, + { + "epoch": 3.702279202279202, + "grad_norm": 0.9900957942008972, + 
"learning_rate": 2.74245960733982e-06, + "loss": 0.7699, + "step": 20795 + }, + { + "epoch": 3.702457264957265, + "grad_norm": 0.8492827415466309, + "learning_rate": 2.739204918903915e-06, + "loss": 0.6039, + "step": 20796 + }, + { + "epoch": 3.7026353276353277, + "grad_norm": 0.9645397663116455, + "learning_rate": 2.7359521360912354e-06, + "loss": 0.7195, + "step": 20797 + }, + { + "epoch": 3.7028133903133904, + "grad_norm": 0.9234998226165771, + "learning_rate": 2.732701258965531e-06, + "loss": 0.6246, + "step": 20798 + }, + { + "epoch": 3.702991452991453, + "grad_norm": 0.9107007384300232, + "learning_rate": 2.7294522875904947e-06, + "loss": 0.6909, + "step": 20799 + }, + { + "epoch": 3.703169515669516, + "grad_norm": 1.1360125541687012, + "learning_rate": 2.726205222029754e-06, + "loss": 0.6849, + "step": 20800 + }, + { + "epoch": 3.703347578347578, + "grad_norm": 0.8517076969146729, + "learning_rate": 2.722960062346991e-06, + "loss": 0.6815, + "step": 20801 + }, + { + "epoch": 3.703525641025641, + "grad_norm": 1.07355535030365, + "learning_rate": 2.7197168086057213e-06, + "loss": 0.6606, + "step": 20802 + }, + { + "epoch": 3.7037037037037037, + "grad_norm": 0.8663332462310791, + "learning_rate": 2.7164754608695387e-06, + "loss": 0.6732, + "step": 20803 + }, + { + "epoch": 3.7038817663817665, + "grad_norm": 0.8558155298233032, + "learning_rate": 2.7132360192019035e-06, + "loss": 0.6664, + "step": 20804 + }, + { + "epoch": 3.7040598290598292, + "grad_norm": 1.0739531517028809, + "learning_rate": 2.7099984836663316e-06, + "loss": 0.8237, + "step": 20805 + }, + { + "epoch": 3.7042378917378915, + "grad_norm": 0.9637136459350586, + "learning_rate": 2.706762854326239e-06, + "loss": 0.9264, + "step": 20806 + }, + { + "epoch": 3.7044159544159543, + "grad_norm": 0.8434587121009827, + "learning_rate": 2.7035291312450084e-06, + "loss": 0.6871, + "step": 20807 + }, + { + "epoch": 3.704594017094017, + "grad_norm": 0.9097843170166016, + "learning_rate": 
2.700297314486022e-06, + "loss": 0.7741, + "step": 20808 + }, + { + "epoch": 3.70477207977208, + "grad_norm": 0.9461157321929932, + "learning_rate": 2.6970674041125744e-06, + "loss": 0.7223, + "step": 20809 + }, + { + "epoch": 3.7049501424501425, + "grad_norm": 0.799718976020813, + "learning_rate": 2.6938394001879695e-06, + "loss": 0.6793, + "step": 20810 + }, + { + "epoch": 3.7051282051282053, + "grad_norm": 0.9917070865631104, + "learning_rate": 2.6906133027754353e-06, + "loss": 0.8659, + "step": 20811 + }, + { + "epoch": 3.705306267806268, + "grad_norm": 0.8843945264816284, + "learning_rate": 2.687389111938199e-06, + "loss": 0.761, + "step": 20812 + }, + { + "epoch": 3.7054843304843303, + "grad_norm": 1.0072822570800781, + "learning_rate": 2.6841668277394315e-06, + "loss": 0.665, + "step": 20813 + }, + { + "epoch": 3.705662393162393, + "grad_norm": 1.0159565210342407, + "learning_rate": 2.68094645024225e-06, + "loss": 0.8305, + "step": 20814 + }, + { + "epoch": 3.705840455840456, + "grad_norm": 1.0532197952270508, + "learning_rate": 2.6777279795097586e-06, + "loss": 0.7869, + "step": 20815 + }, + { + "epoch": 3.7060185185185186, + "grad_norm": 1.0152393579483032, + "learning_rate": 2.6745114156050187e-06, + "loss": 0.6795, + "step": 20816 + }, + { + "epoch": 3.7061965811965814, + "grad_norm": 0.8705698847770691, + "learning_rate": 2.6712967585910465e-06, + "loss": 0.5973, + "step": 20817 + }, + { + "epoch": 3.7063746438746437, + "grad_norm": 0.9337679147720337, + "learning_rate": 2.668084008530847e-06, + "loss": 0.745, + "step": 20818 + }, + { + "epoch": 3.7065527065527064, + "grad_norm": 1.0119414329528809, + "learning_rate": 2.664873165487336e-06, + "loss": 0.8467, + "step": 20819 + }, + { + "epoch": 3.706730769230769, + "grad_norm": 1.0212607383728027, + "learning_rate": 2.661664229523442e-06, + "loss": 0.6991, + "step": 20820 + }, + { + "epoch": 3.706908831908832, + "grad_norm": 0.6226426362991333, + "learning_rate": 2.658457200702047e-06, + "loss": 0.2538, 
+ "step": 20821 + }, + { + "epoch": 3.7070868945868947, + "grad_norm": 1.035329818725586, + "learning_rate": 2.655252079085957e-06, + "loss": 0.801, + "step": 20822 + }, + { + "epoch": 3.7072649572649574, + "grad_norm": 0.9035121202468872, + "learning_rate": 2.6520488647379995e-06, + "loss": 0.7865, + "step": 20823 + }, + { + "epoch": 3.70744301994302, + "grad_norm": 0.8977579474449158, + "learning_rate": 2.6488475577209127e-06, + "loss": 0.8943, + "step": 20824 + }, + { + "epoch": 3.7076210826210825, + "grad_norm": 1.0442665815353394, + "learning_rate": 2.6456481580974356e-06, + "loss": 0.8296, + "step": 20825 + }, + { + "epoch": 3.7077991452991452, + "grad_norm": 0.9925961494445801, + "learning_rate": 2.6424506659302406e-06, + "loss": 0.8141, + "step": 20826 + }, + { + "epoch": 3.707977207977208, + "grad_norm": 0.8873761892318726, + "learning_rate": 2.639255081281977e-06, + "loss": 0.7006, + "step": 20827 + }, + { + "epoch": 3.7081552706552707, + "grad_norm": 0.9560503363609314, + "learning_rate": 2.6360614042152732e-06, + "loss": 0.8785, + "step": 20828 + }, + { + "epoch": 3.7083333333333335, + "grad_norm": 0.7737786173820496, + "learning_rate": 2.6328696347926784e-06, + "loss": 0.5948, + "step": 20829 + }, + { + "epoch": 3.708511396011396, + "grad_norm": 1.1564871072769165, + "learning_rate": 2.629679773076754e-06, + "loss": 0.7169, + "step": 20830 + }, + { + "epoch": 3.7086894586894585, + "grad_norm": 0.8807209730148315, + "learning_rate": 2.6264918191299724e-06, + "loss": 0.5954, + "step": 20831 + }, + { + "epoch": 3.7088675213675213, + "grad_norm": 0.9065495133399963, + "learning_rate": 2.6233057730148057e-06, + "loss": 0.7257, + "step": 20832 + }, + { + "epoch": 3.709045584045584, + "grad_norm": 0.9244785308837891, + "learning_rate": 2.6201216347937042e-06, + "loss": 0.7843, + "step": 20833 + }, + { + "epoch": 3.709223646723647, + "grad_norm": 1.0104621648788452, + "learning_rate": 2.6169394045290065e-06, + "loss": 0.7134, + "step": 20834 + }, + { + 
"epoch": 3.7094017094017095, + "grad_norm": 0.9149969816207886, + "learning_rate": 2.613759082283085e-06, + "loss": 0.6389, + "step": 20835 + }, + { + "epoch": 3.7095797720797723, + "grad_norm": 1.1129883527755737, + "learning_rate": 2.6105806681182676e-06, + "loss": 0.7805, + "step": 20836 + }, + { + "epoch": 3.7097578347578346, + "grad_norm": 0.9657980799674988, + "learning_rate": 2.6074041620968047e-06, + "loss": 0.8758, + "step": 20837 + }, + { + "epoch": 3.7099358974358974, + "grad_norm": 1.2004457712173462, + "learning_rate": 2.6042295642809355e-06, + "loss": 0.8944, + "step": 20838 + }, + { + "epoch": 3.71011396011396, + "grad_norm": 1.0154633522033691, + "learning_rate": 2.6010568747328766e-06, + "loss": 1.0198, + "step": 20839 + }, + { + "epoch": 3.710292022792023, + "grad_norm": 0.908183753490448, + "learning_rate": 2.5978860935147788e-06, + "loss": 0.7449, + "step": 20840 + }, + { + "epoch": 3.7104700854700856, + "grad_norm": 1.012807846069336, + "learning_rate": 2.594717220688758e-06, + "loss": 0.793, + "step": 20841 + }, + { + "epoch": 3.710648148148148, + "grad_norm": 1.0019315481185913, + "learning_rate": 2.5915502563169105e-06, + "loss": 0.7707, + "step": 20842 + }, + { + "epoch": 3.7108262108262107, + "grad_norm": 1.0038162469863892, + "learning_rate": 2.5883852004613074e-06, + "loss": 0.7959, + "step": 20843 + }, + { + "epoch": 3.7110042735042734, + "grad_norm": 1.0344631671905518, + "learning_rate": 2.5852220531839113e-06, + "loss": 0.7611, + "step": 20844 + }, + { + "epoch": 3.711182336182336, + "grad_norm": 1.0860040187835693, + "learning_rate": 2.5820608145467494e-06, + "loss": 0.6978, + "step": 20845 + }, + { + "epoch": 3.711360398860399, + "grad_norm": 0.8946214318275452, + "learning_rate": 2.5789014846117288e-06, + "loss": 0.834, + "step": 20846 + }, + { + "epoch": 3.7115384615384617, + "grad_norm": 0.8077123761177063, + "learning_rate": 2.575744063440755e-06, + "loss": 0.6498, + "step": 20847 + }, + { + "epoch": 3.7117165242165244, + 
"grad_norm": 0.9708347916603088, + "learning_rate": 2.5725885510957116e-06, + "loss": 0.7118, + "step": 20848 + }, + { + "epoch": 3.7118945868945867, + "grad_norm": 0.8636693358421326, + "learning_rate": 2.5694349476383826e-06, + "loss": 0.6884, + "step": 20849 + }, + { + "epoch": 3.7120726495726495, + "grad_norm": 1.106953740119934, + "learning_rate": 2.566283253130608e-06, + "loss": 0.6778, + "step": 20850 + }, + { + "epoch": 3.7122507122507122, + "grad_norm": 1.0069822072982788, + "learning_rate": 2.5631334676340936e-06, + "loss": 0.7222, + "step": 20851 + }, + { + "epoch": 3.712428774928775, + "grad_norm": 0.8577214479446411, + "learning_rate": 2.5599855912105787e-06, + "loss": 0.7543, + "step": 20852 + }, + { + "epoch": 3.7126068376068377, + "grad_norm": 0.9592446684837341, + "learning_rate": 2.5568396239217363e-06, + "loss": 0.7331, + "step": 20853 + }, + { + "epoch": 3.7127849002849, + "grad_norm": 0.9552914500236511, + "learning_rate": 2.553695565829195e-06, + "loss": 0.8148, + "step": 20854 + }, + { + "epoch": 3.712962962962963, + "grad_norm": 1.1876866817474365, + "learning_rate": 2.550553416994561e-06, + "loss": 0.9698, + "step": 20855 + }, + { + "epoch": 3.7131410256410255, + "grad_norm": 0.985346257686615, + "learning_rate": 2.5474131774794073e-06, + "loss": 0.8515, + "step": 20856 + }, + { + "epoch": 3.7133190883190883, + "grad_norm": 0.8775020837783813, + "learning_rate": 2.5442748473452293e-06, + "loss": 0.6048, + "step": 20857 + }, + { + "epoch": 3.713497150997151, + "grad_norm": 1.0920336246490479, + "learning_rate": 2.5411384266535663e-06, + "loss": 0.8886, + "step": 20858 + }, + { + "epoch": 3.713675213675214, + "grad_norm": 0.960750937461853, + "learning_rate": 2.5380039154658253e-06, + "loss": 0.5685, + "step": 20859 + }, + { + "epoch": 3.7138532763532766, + "grad_norm": 0.9069384336471558, + "learning_rate": 2.5348713138434564e-06, + "loss": 0.7219, + "step": 20860 + }, + { + "epoch": 3.7140313390313393, + "grad_norm": 0.8939675688743591, + 
"learning_rate": 2.5317406218478e-06, + "loss": 0.6803, + "step": 20861 + }, + { + "epoch": 3.7142094017094016, + "grad_norm": 0.9675079584121704, + "learning_rate": 2.5286118395402182e-06, + "loss": 0.8229, + "step": 20862 + }, + { + "epoch": 3.7143874643874644, + "grad_norm": 0.8619394898414612, + "learning_rate": 2.5254849669820056e-06, + "loss": 0.771, + "step": 20863 + }, + { + "epoch": 3.714565527065527, + "grad_norm": 1.0089298486709595, + "learning_rate": 2.5223600042344362e-06, + "loss": 0.798, + "step": 20864 + }, + { + "epoch": 3.71474358974359, + "grad_norm": 1.3646090030670166, + "learning_rate": 2.5192369513587276e-06, + "loss": 0.7596, + "step": 20865 + }, + { + "epoch": 3.714921652421652, + "grad_norm": 0.950450599193573, + "learning_rate": 2.5161158084160754e-06, + "loss": 0.6728, + "step": 20866 + }, + { + "epoch": 3.715099715099715, + "grad_norm": 1.071611762046814, + "learning_rate": 2.512996575467608e-06, + "loss": 0.6569, + "step": 20867 + }, + { + "epoch": 3.7152777777777777, + "grad_norm": 0.8743181228637695, + "learning_rate": 2.5098792525744874e-06, + "loss": 0.7567, + "step": 20868 + }, + { + "epoch": 3.7154558404558404, + "grad_norm": 0.8863722085952759, + "learning_rate": 2.5067638397977544e-06, + "loss": 0.6423, + "step": 20869 + }, + { + "epoch": 3.715633903133903, + "grad_norm": 0.7184516191482544, + "learning_rate": 2.50365033719846e-06, + "loss": 0.347, + "step": 20870 + }, + { + "epoch": 3.715811965811966, + "grad_norm": 0.9171306490898132, + "learning_rate": 2.5005387448375995e-06, + "loss": 0.7904, + "step": 20871 + }, + { + "epoch": 3.7159900284900287, + "grad_norm": 1.0704946517944336, + "learning_rate": 2.4974290627761466e-06, + "loss": 0.9393, + "step": 20872 + }, + { + "epoch": 3.7161680911680914, + "grad_norm": 0.9949630498886108, + "learning_rate": 2.4943212910750414e-06, + "loss": 0.646, + "step": 20873 + }, + { + "epoch": 3.7163461538461537, + "grad_norm": 0.8926770687103271, + "learning_rate": 2.491215429795146e-06, + 
"loss": 0.8417, + "step": 20874 + }, + { + "epoch": 3.7165242165242165, + "grad_norm": 1.0397453308105469, + "learning_rate": 2.4881114789973346e-06, + "loss": 0.7087, + "step": 20875 + }, + { + "epoch": 3.7167022792022792, + "grad_norm": 0.8475403189659119, + "learning_rate": 2.485009438742414e-06, + "loss": 0.6031, + "step": 20876 + }, + { + "epoch": 3.716880341880342, + "grad_norm": 0.8874244093894958, + "learning_rate": 2.481909309091157e-06, + "loss": 0.8112, + "step": 20877 + }, + { + "epoch": 3.7170584045584043, + "grad_norm": 0.9252071380615234, + "learning_rate": 2.478811090104316e-06, + "loss": 0.6202, + "step": 20878 + }, + { + "epoch": 3.717236467236467, + "grad_norm": 0.8514638543128967, + "learning_rate": 2.4757147818425865e-06, + "loss": 0.656, + "step": 20879 + }, + { + "epoch": 3.71741452991453, + "grad_norm": 0.942911684513092, + "learning_rate": 2.4726203843666417e-06, + "loss": 0.7602, + "step": 20880 + }, + { + "epoch": 3.7175925925925926, + "grad_norm": 0.8529558777809143, + "learning_rate": 2.4695278977371005e-06, + "loss": 0.8576, + "step": 20881 + }, + { + "epoch": 3.7177706552706553, + "grad_norm": 0.8798624873161316, + "learning_rate": 2.4664373220145587e-06, + "loss": 0.6588, + "step": 20882 + }, + { + "epoch": 3.717948717948718, + "grad_norm": 0.9501121640205383, + "learning_rate": 2.4633486572595787e-06, + "loss": 0.7127, + "step": 20883 + }, + { + "epoch": 3.718126780626781, + "grad_norm": 0.9304238557815552, + "learning_rate": 2.4602619035326456e-06, + "loss": 0.7881, + "step": 20884 + }, + { + "epoch": 3.7183048433048436, + "grad_norm": 1.0175336599349976, + "learning_rate": 2.457177060894289e-06, + "loss": 0.7735, + "step": 20885 + }, + { + "epoch": 3.718482905982906, + "grad_norm": 0.9424969553947449, + "learning_rate": 2.4540941294048937e-06, + "loss": 0.6261, + "step": 20886 + }, + { + "epoch": 3.7186609686609686, + "grad_norm": 0.9643171429634094, + "learning_rate": 2.4510131091249e-06, + "loss": 0.7912, + "step": 20887 + }, + 
{ + "epoch": 3.7188390313390314, + "grad_norm": 0.9775816798210144, + "learning_rate": 2.44793400011466e-06, + "loss": 0.754, + "step": 20888 + }, + { + "epoch": 3.719017094017094, + "grad_norm": 0.8465856909751892, + "learning_rate": 2.4448568024345032e-06, + "loss": 0.6311, + "step": 20889 + }, + { + "epoch": 3.7191951566951564, + "grad_norm": 0.9722816348075867, + "learning_rate": 2.4417815161447367e-06, + "loss": 0.741, + "step": 20890 + }, + { + "epoch": 3.719373219373219, + "grad_norm": 0.9861366748809814, + "learning_rate": 2.4387081413055903e-06, + "loss": 0.889, + "step": 20891 + }, + { + "epoch": 3.719551282051282, + "grad_norm": 0.9273673892021179, + "learning_rate": 2.4356366779773045e-06, + "loss": 0.7996, + "step": 20892 + }, + { + "epoch": 3.7197293447293447, + "grad_norm": 0.9955036640167236, + "learning_rate": 2.432567126220031e-06, + "loss": 0.9323, + "step": 20893 + }, + { + "epoch": 3.7199074074074074, + "grad_norm": 1.1071380376815796, + "learning_rate": 2.4294994860939337e-06, + "loss": 0.7601, + "step": 20894 + }, + { + "epoch": 3.72008547008547, + "grad_norm": 0.9730632901191711, + "learning_rate": 2.426433757659108e-06, + "loss": 0.7674, + "step": 20895 + }, + { + "epoch": 3.720263532763533, + "grad_norm": 0.8447731733322144, + "learning_rate": 2.4233699409756284e-06, + "loss": 0.5593, + "step": 20896 + }, + { + "epoch": 3.7204415954415957, + "grad_norm": 1.0535780191421509, + "learning_rate": 2.4203080361035136e-06, + "loss": 0.7876, + "step": 20897 + }, + { + "epoch": 3.720619658119658, + "grad_norm": 1.0676836967468262, + "learning_rate": 2.4172480431027487e-06, + "loss": 0.7371, + "step": 20898 + }, + { + "epoch": 3.7207977207977208, + "grad_norm": 0.920718789100647, + "learning_rate": 2.4141899620332976e-06, + "loss": 0.6973, + "step": 20899 + }, + { + "epoch": 3.7209757834757835, + "grad_norm": 0.9319782257080078, + "learning_rate": 2.4111337929551002e-06, + "loss": 0.7951, + "step": 20900 + }, + { + "epoch": 3.7211538461538463, + 
"grad_norm": 0.9919620752334595, + "learning_rate": 2.408079535927987e-06, + "loss": 0.7875, + "step": 20901 + }, + { + "epoch": 3.7213319088319086, + "grad_norm": 0.9960176348686218, + "learning_rate": 2.4050271910118325e-06, + "loss": 0.8002, + "step": 20902 + }, + { + "epoch": 3.7215099715099713, + "grad_norm": 0.9272335171699524, + "learning_rate": 2.401976758266433e-06, + "loss": 0.6174, + "step": 20903 + }, + { + "epoch": 3.721688034188034, + "grad_norm": 0.9436232447624207, + "learning_rate": 2.3989282377515633e-06, + "loss": 0.5779, + "step": 20904 + }, + { + "epoch": 3.721866096866097, + "grad_norm": 0.8671663999557495, + "learning_rate": 2.3958816295269417e-06, + "loss": 0.6888, + "step": 20905 + }, + { + "epoch": 3.7220441595441596, + "grad_norm": 1.175491213798523, + "learning_rate": 2.3928369336522647e-06, + "loss": 0.7466, + "step": 20906 + }, + { + "epoch": 3.7222222222222223, + "grad_norm": 0.9122247695922852, + "learning_rate": 2.3897941501871855e-06, + "loss": 0.8985, + "step": 20907 + }, + { + "epoch": 3.722400284900285, + "grad_norm": 1.0453401803970337, + "learning_rate": 2.386753279191323e-06, + "loss": 0.7194, + "step": 20908 + }, + { + "epoch": 3.722578347578348, + "grad_norm": 0.938706636428833, + "learning_rate": 2.383714320724262e-06, + "loss": 0.6207, + "step": 20909 + }, + { + "epoch": 3.72275641025641, + "grad_norm": 0.9452419281005859, + "learning_rate": 2.380677274845533e-06, + "loss": 0.8168, + "step": 20910 + }, + { + "epoch": 3.722934472934473, + "grad_norm": 1.0109187364578247, + "learning_rate": 2.3776421416146556e-06, + "loss": 0.6572, + "step": 20911 + }, + { + "epoch": 3.7231125356125356, + "grad_norm": 0.9023509621620178, + "learning_rate": 2.3746089210910816e-06, + "loss": 0.7597, + "step": 20912 + }, + { + "epoch": 3.7232905982905984, + "grad_norm": 1.0462290048599243, + "learning_rate": 2.3715776133342416e-06, + "loss": 0.9093, + "step": 20913 + }, + { + "epoch": 3.7234686609686607, + "grad_norm": 0.9403861165046692, + 
"learning_rate": 2.368548218403521e-06, + "loss": 0.7762, + "step": 20914 + }, + { + "epoch": 3.7236467236467234, + "grad_norm": 0.8098660111427307, + "learning_rate": 2.3655207363583177e-06, + "loss": 0.515, + "step": 20915 + }, + { + "epoch": 3.723824786324786, + "grad_norm": 1.1157571077346802, + "learning_rate": 2.362495167257894e-06, + "loss": 0.7646, + "step": 20916 + }, + { + "epoch": 3.724002849002849, + "grad_norm": 0.8967264890670776, + "learning_rate": 2.3594715111615594e-06, + "loss": 0.6829, + "step": 20917 + }, + { + "epoch": 3.7241809116809117, + "grad_norm": 1.0004037618637085, + "learning_rate": 2.3564497681285547e-06, + "loss": 0.9033, + "step": 20918 + }, + { + "epoch": 3.7243589743589745, + "grad_norm": 0.9404234290122986, + "learning_rate": 2.353429938218077e-06, + "loss": 0.7891, + "step": 20919 + }, + { + "epoch": 3.724537037037037, + "grad_norm": 0.9244145750999451, + "learning_rate": 2.35041202148929e-06, + "loss": 0.8618, + "step": 20920 + }, + { + "epoch": 3.7247150997151, + "grad_norm": 0.9113515615463257, + "learning_rate": 2.3473960180013353e-06, + "loss": 0.6614, + "step": 20921 + }, + { + "epoch": 3.7248931623931623, + "grad_norm": 1.0383996963500977, + "learning_rate": 2.3443819278132996e-06, + "loss": 0.8602, + "step": 20922 + }, + { + "epoch": 3.725071225071225, + "grad_norm": 0.9273353815078735, + "learning_rate": 2.341369750984246e-06, + "loss": 0.734, + "step": 20923 + }, + { + "epoch": 3.7252492877492878, + "grad_norm": 0.8500500321388245, + "learning_rate": 2.3383594875731605e-06, + "loss": 0.7722, + "step": 20924 + }, + { + "epoch": 3.7254273504273505, + "grad_norm": 0.9240385890007019, + "learning_rate": 2.3353511376390747e-06, + "loss": 0.6948, + "step": 20925 + }, + { + "epoch": 3.7256054131054133, + "grad_norm": 1.025537371635437, + "learning_rate": 2.332344701240885e-06, + "loss": 0.8939, + "step": 20926 + }, + { + "epoch": 3.7257834757834756, + "grad_norm": 0.9603112936019897, + "learning_rate": 2.3293401784375223e-06, 
+ "loss": 0.9551, + "step": 20927 + }, + { + "epoch": 3.7259615384615383, + "grad_norm": 0.9035696983337402, + "learning_rate": 2.32633756928784e-06, + "loss": 0.7112, + "step": 20928 + }, + { + "epoch": 3.726139601139601, + "grad_norm": 0.9449596405029297, + "learning_rate": 2.3233368738506677e-06, + "loss": 0.8199, + "step": 20929 + }, + { + "epoch": 3.726317663817664, + "grad_norm": 1.058143138885498, + "learning_rate": 2.320338092184826e-06, + "loss": 0.8258, + "step": 20930 + }, + { + "epoch": 3.7264957264957266, + "grad_norm": 1.0171890258789062, + "learning_rate": 2.317341224349023e-06, + "loss": 0.991, + "step": 20931 + }, + { + "epoch": 3.7266737891737893, + "grad_norm": 1.008941888809204, + "learning_rate": 2.3143462704020124e-06, + "loss": 0.8857, + "step": 20932 + }, + { + "epoch": 3.726851851851852, + "grad_norm": 0.9405086636543274, + "learning_rate": 2.3113532304024575e-06, + "loss": 0.7347, + "step": 20933 + }, + { + "epoch": 3.7270299145299144, + "grad_norm": 1.136816143989563, + "learning_rate": 2.308362104409012e-06, + "loss": 0.8393, + "step": 20934 + }, + { + "epoch": 3.727207977207977, + "grad_norm": 1.0990225076675415, + "learning_rate": 2.305372892480273e-06, + "loss": 0.84, + "step": 20935 + }, + { + "epoch": 3.72738603988604, + "grad_norm": 1.0101460218429565, + "learning_rate": 2.302385594674805e-06, + "loss": 0.8234, + "step": 20936 + }, + { + "epoch": 3.7275641025641026, + "grad_norm": 1.0452793836593628, + "learning_rate": 2.299400211051139e-06, + "loss": 0.7695, + "step": 20937 + }, + { + "epoch": 3.7277421652421654, + "grad_norm": 0.9525721669197083, + "learning_rate": 2.296416741667784e-06, + "loss": 0.8109, + "step": 20938 + }, + { + "epoch": 3.7279202279202277, + "grad_norm": 0.910220205783844, + "learning_rate": 2.293435186583159e-06, + "loss": 0.6896, + "step": 20939 + }, + { + "epoch": 3.7280982905982905, + "grad_norm": 0.9199397563934326, + "learning_rate": 2.2904555458557298e-06, + "loss": 0.795, + "step": 20940 + }, + { + 
"epoch": 3.728276353276353, + "grad_norm": 1.0512983798980713, + "learning_rate": 2.2874778195438263e-06, + "loss": 0.853, + "step": 20941 + }, + { + "epoch": 3.728454415954416, + "grad_norm": 0.9414512515068054, + "learning_rate": 2.2845020077058356e-06, + "loss": 0.651, + "step": 20942 + }, + { + "epoch": 3.7286324786324787, + "grad_norm": 0.8715613484382629, + "learning_rate": 2.281528110400033e-06, + "loss": 0.7046, + "step": 20943 + }, + { + "epoch": 3.7288105413105415, + "grad_norm": 0.9587253928184509, + "learning_rate": 2.2785561276846947e-06, + "loss": 0.6152, + "step": 20944 + }, + { + "epoch": 3.728988603988604, + "grad_norm": 0.9781585335731506, + "learning_rate": 2.2755860596180514e-06, + "loss": 0.8213, + "step": 20945 + }, + { + "epoch": 3.7291666666666665, + "grad_norm": 0.917754054069519, + "learning_rate": 2.2726179062582786e-06, + "loss": 0.724, + "step": 20946 + }, + { + "epoch": 3.7293447293447293, + "grad_norm": 1.1454041004180908, + "learning_rate": 2.2696516676635747e-06, + "loss": 0.8493, + "step": 20947 + }, + { + "epoch": 3.729522792022792, + "grad_norm": 1.0171089172363281, + "learning_rate": 2.2666873438920042e-06, + "loss": 0.6073, + "step": 20948 + }, + { + "epoch": 3.7297008547008548, + "grad_norm": 0.914182722568512, + "learning_rate": 2.2637249350016877e-06, + "loss": 0.7566, + "step": 20949 + }, + { + "epoch": 3.7298789173789175, + "grad_norm": 0.9400814771652222, + "learning_rate": 2.2607644410506446e-06, + "loss": 0.7434, + "step": 20950 + }, + { + "epoch": 3.73005698005698, + "grad_norm": 0.9671114087104797, + "learning_rate": 2.2578058620968846e-06, + "loss": 0.8685, + "step": 20951 + }, + { + "epoch": 3.7302350427350426, + "grad_norm": 0.9570022821426392, + "learning_rate": 2.254849198198383e-06, + "loss": 0.8194, + "step": 20952 + }, + { + "epoch": 3.7304131054131053, + "grad_norm": 1.0614973306655884, + "learning_rate": 2.251894449413061e-06, + "loss": 0.8729, + "step": 20953 + }, + { + "epoch": 3.730591168091168, + 
"grad_norm": 0.9944061040878296, + "learning_rate": 2.2489416157988165e-06, + "loss": 0.848, + "step": 20954 + }, + { + "epoch": 3.730769230769231, + "grad_norm": 0.9106524586677551, + "learning_rate": 2.2459906974135026e-06, + "loss": 0.7561, + "step": 20955 + }, + { + "epoch": 3.7309472934472936, + "grad_norm": 1.0078433752059937, + "learning_rate": 2.2430416943149177e-06, + "loss": 0.8776, + "step": 20956 + }, + { + "epoch": 3.7311253561253563, + "grad_norm": 0.9963732361793518, + "learning_rate": 2.240094606560883e-06, + "loss": 0.7677, + "step": 20957 + }, + { + "epoch": 3.7313034188034186, + "grad_norm": 1.1062570810317993, + "learning_rate": 2.237149434209096e-06, + "loss": 0.9031, + "step": 20958 + }, + { + "epoch": 3.7314814814814814, + "grad_norm": 1.0618034601211548, + "learning_rate": 2.2342061773172884e-06, + "loss": 0.6396, + "step": 20959 + }, + { + "epoch": 3.731659544159544, + "grad_norm": 1.0357275009155273, + "learning_rate": 2.2312648359431256e-06, + "loss": 0.8437, + "step": 20960 + }, + { + "epoch": 3.731837606837607, + "grad_norm": 0.8858973979949951, + "learning_rate": 2.2283254101442277e-06, + "loss": 0.6799, + "step": 20961 + }, + { + "epoch": 3.7320156695156697, + "grad_norm": 0.8738244771957397, + "learning_rate": 2.2253878999781818e-06, + "loss": 0.6288, + "step": 20962 + }, + { + "epoch": 3.732193732193732, + "grad_norm": 1.0547064542770386, + "learning_rate": 2.222452305502565e-06, + "loss": 0.9165, + "step": 20963 + }, + { + "epoch": 3.7323717948717947, + "grad_norm": 0.9456502199172974, + "learning_rate": 2.2195186267748637e-06, + "loss": 0.6642, + "step": 20964 + }, + { + "epoch": 3.7325498575498575, + "grad_norm": 0.8678750991821289, + "learning_rate": 2.2165868638525878e-06, + "loss": 0.7079, + "step": 20965 + }, + { + "epoch": 3.73272792022792, + "grad_norm": 1.0330867767333984, + "learning_rate": 2.2136570167931583e-06, + "loss": 0.836, + "step": 20966 + }, + { + "epoch": 3.732905982905983, + "grad_norm": 0.9410081505775452, + 
"learning_rate": 2.210729085653984e-06, + "loss": 0.6817, + "step": 20967 + }, + { + "epoch": 3.7330840455840457, + "grad_norm": 0.9865879416465759, + "learning_rate": 2.2078030704924423e-06, + "loss": 0.874, + "step": 20968 + }, + { + "epoch": 3.7332621082621085, + "grad_norm": 0.9446350336074829, + "learning_rate": 2.2048789713658423e-06, + "loss": 0.7081, + "step": 20969 + }, + { + "epoch": 3.7334401709401708, + "grad_norm": 0.8920941948890686, + "learning_rate": 2.201956788331494e-06, + "loss": 0.7024, + "step": 20970 + }, + { + "epoch": 3.7336182336182335, + "grad_norm": 0.9637290239334106, + "learning_rate": 2.1990365214466403e-06, + "loss": 0.7942, + "step": 20971 + }, + { + "epoch": 3.7337962962962963, + "grad_norm": 1.140133261680603, + "learning_rate": 2.196118170768513e-06, + "loss": 0.7663, + "step": 20972 + }, + { + "epoch": 3.733974358974359, + "grad_norm": 0.9300453662872314, + "learning_rate": 2.193201736354267e-06, + "loss": 0.7228, + "step": 20973 + }, + { + "epoch": 3.734152421652422, + "grad_norm": 0.9268558621406555, + "learning_rate": 2.1902872182610666e-06, + "loss": 0.7171, + "step": 20974 + }, + { + "epoch": 3.734330484330484, + "grad_norm": 0.9928264021873474, + "learning_rate": 2.1873746165460007e-06, + "loss": 0.7955, + "step": 20975 + }, + { + "epoch": 3.734508547008547, + "grad_norm": 0.9498245716094971, + "learning_rate": 2.184463931266145e-06, + "loss": 0.9005, + "step": 20976 + }, + { + "epoch": 3.7346866096866096, + "grad_norm": 0.8877419829368591, + "learning_rate": 2.1815551624785213e-06, + "loss": 0.6512, + "step": 20977 + }, + { + "epoch": 3.7348646723646723, + "grad_norm": 0.974147617816925, + "learning_rate": 2.178648310240128e-06, + "loss": 0.8416, + "step": 20978 + }, + { + "epoch": 3.735042735042735, + "grad_norm": 0.8951764702796936, + "learning_rate": 2.175743374607897e-06, + "loss": 0.5664, + "step": 20979 + }, + { + "epoch": 3.735220797720798, + "grad_norm": 1.0801883935928345, + "learning_rate": 
2.1728403556387833e-06, + "loss": 0.9177, + "step": 20980 + }, + { + "epoch": 3.7353988603988606, + "grad_norm": 0.9792835712432861, + "learning_rate": 2.1699392533896302e-06, + "loss": 0.7572, + "step": 20981 + }, + { + "epoch": 3.7355769230769234, + "grad_norm": 0.8745859265327454, + "learning_rate": 2.167040067917314e-06, + "loss": 0.7706, + "step": 20982 + }, + { + "epoch": 3.7357549857549857, + "grad_norm": 0.9623910784721375, + "learning_rate": 2.164142799278601e-06, + "loss": 0.6661, + "step": 20983 + }, + { + "epoch": 3.7359330484330484, + "grad_norm": 0.8940355181694031, + "learning_rate": 2.161247447530268e-06, + "loss": 0.6218, + "step": 20984 + }, + { + "epoch": 3.736111111111111, + "grad_norm": 0.9015322327613831, + "learning_rate": 2.158354012729069e-06, + "loss": 0.7744, + "step": 20985 + }, + { + "epoch": 3.736289173789174, + "grad_norm": 0.9271003007888794, + "learning_rate": 2.155462494931648e-06, + "loss": 0.7519, + "step": 20986 + }, + { + "epoch": 3.736467236467236, + "grad_norm": 0.8898332715034485, + "learning_rate": 2.1525728941947156e-06, + "loss": 0.8417, + "step": 20987 + }, + { + "epoch": 3.736645299145299, + "grad_norm": 0.9652456641197205, + "learning_rate": 2.1496852105748256e-06, + "loss": 0.9963, + "step": 20988 + }, + { + "epoch": 3.7368233618233617, + "grad_norm": 1.1075891256332397, + "learning_rate": 2.146799444128611e-06, + "loss": 0.8754, + "step": 20989 + }, + { + "epoch": 3.7370014245014245, + "grad_norm": 0.8933864235877991, + "learning_rate": 2.1439155949125822e-06, + "loss": 0.6959, + "step": 20990 + }, + { + "epoch": 3.7371794871794872, + "grad_norm": 0.8123324513435364, + "learning_rate": 2.1410336629832497e-06, + "loss": 0.5327, + "step": 20991 + }, + { + "epoch": 3.73735754985755, + "grad_norm": 1.122158169746399, + "learning_rate": 2.1381536483970676e-06, + "loss": 0.5568, + "step": 20992 + }, + { + "epoch": 3.7375356125356127, + "grad_norm": 0.9673072099685669, + "learning_rate": 2.1352755512104916e-06, + "loss": 
0.7785, + "step": 20993 + }, + { + "epoch": 3.7377136752136755, + "grad_norm": 0.9374717473983765, + "learning_rate": 2.1323993714798874e-06, + "loss": 0.8412, + "step": 20994 + }, + { + "epoch": 3.737891737891738, + "grad_norm": 0.9312807321548462, + "learning_rate": 2.1295251092616095e-06, + "loss": 0.8846, + "step": 20995 + }, + { + "epoch": 3.7380698005698005, + "grad_norm": 1.020756483078003, + "learning_rate": 2.1266527646119805e-06, + "loss": 0.5703, + "step": 20996 + }, + { + "epoch": 3.7382478632478633, + "grad_norm": 0.9203028082847595, + "learning_rate": 2.123782337587288e-06, + "loss": 0.7471, + "step": 20997 + }, + { + "epoch": 3.738425925925926, + "grad_norm": 0.9450611472129822, + "learning_rate": 2.1209138282437423e-06, + "loss": 0.6721, + "step": 20998 + }, + { + "epoch": 3.7386039886039883, + "grad_norm": 0.8743969798088074, + "learning_rate": 2.118047236637577e-06, + "loss": 0.5839, + "step": 20999 + }, + { + "epoch": 3.738782051282051, + "grad_norm": 0.9334909915924072, + "learning_rate": 2.1151825628249357e-06, + "loss": 0.7405, + "step": 21000 + }, + { + "epoch": 3.738960113960114, + "grad_norm": 0.918246865272522, + "learning_rate": 2.112319806861962e-06, + "loss": 0.8169, + "step": 21001 + }, + { + "epoch": 3.7391381766381766, + "grad_norm": 1.0408916473388672, + "learning_rate": 2.109458968804734e-06, + "loss": 0.6997, + "step": 21002 + }, + { + "epoch": 3.7393162393162394, + "grad_norm": 0.905485212802887, + "learning_rate": 2.1066000487093063e-06, + "loss": 0.78, + "step": 21003 + }, + { + "epoch": 3.739494301994302, + "grad_norm": 0.8806813359260559, + "learning_rate": 2.103743046631701e-06, + "loss": 0.5886, + "step": 21004 + }, + { + "epoch": 3.739672364672365, + "grad_norm": 0.9529492259025574, + "learning_rate": 2.100887962627873e-06, + "loss": 0.7533, + "step": 21005 + }, + { + "epoch": 3.7398504273504276, + "grad_norm": 1.0024051666259766, + "learning_rate": 2.0980347967537895e-06, + "loss": 0.8242, + "step": 21006 + }, + { + 
"epoch": 3.74002849002849, + "grad_norm": 0.9132628440856934, + "learning_rate": 2.0951835490653382e-06, + "loss": 0.9098, + "step": 21007 + }, + { + "epoch": 3.7402065527065527, + "grad_norm": 0.8995814919471741, + "learning_rate": 2.092334219618397e-06, + "loss": 0.7624, + "step": 21008 + }, + { + "epoch": 3.7403846153846154, + "grad_norm": 0.9356616139411926, + "learning_rate": 2.089486808468766e-06, + "loss": 0.8032, + "step": 21009 + }, + { + "epoch": 3.740562678062678, + "grad_norm": 0.9492998719215393, + "learning_rate": 2.0866413156722554e-06, + "loss": 0.7695, + "step": 21010 + }, + { + "epoch": 3.7407407407407405, + "grad_norm": 0.9392259120941162, + "learning_rate": 2.083797741284599e-06, + "loss": 0.6681, + "step": 21011 + }, + { + "epoch": 3.7409188034188032, + "grad_norm": 0.9827434420585632, + "learning_rate": 2.0809560853615517e-06, + "loss": 0.7144, + "step": 21012 + }, + { + "epoch": 3.741096866096866, + "grad_norm": 1.0385278463363647, + "learning_rate": 2.078116347958725e-06, + "loss": 0.6568, + "step": 21013 + }, + { + "epoch": 3.7412749287749287, + "grad_norm": 0.931098461151123, + "learning_rate": 2.075278529131819e-06, + "loss": 0.7402, + "step": 21014 + }, + { + "epoch": 3.7414529914529915, + "grad_norm": 0.8532630801200867, + "learning_rate": 2.0724426289363995e-06, + "loss": 0.6599, + "step": 21015 + }, + { + "epoch": 3.7416310541310542, + "grad_norm": 1.0030345916748047, + "learning_rate": 2.0696086474280453e-06, + "loss": 0.6981, + "step": 21016 + }, + { + "epoch": 3.741809116809117, + "grad_norm": 0.8422593474388123, + "learning_rate": 2.0667765846622667e-06, + "loss": 0.6115, + "step": 21017 + }, + { + "epoch": 3.7419871794871797, + "grad_norm": 1.0260411500930786, + "learning_rate": 2.0639464406945752e-06, + "loss": 0.7547, + "step": 21018 + }, + { + "epoch": 3.742165242165242, + "grad_norm": 0.953831136226654, + "learning_rate": 2.0611182155804045e-06, + "loss": 0.6901, + "step": 21019 + }, + { + "epoch": 3.742343304843305, + 
"grad_norm": 0.984882116317749, + "learning_rate": 2.0582919093751653e-06, + "loss": 0.9004, + "step": 21020 + }, + { + "epoch": 3.7425213675213675, + "grad_norm": 0.8946552276611328, + "learning_rate": 2.055467522134236e-06, + "loss": 0.6706, + "step": 21021 + }, + { + "epoch": 3.7426994301994303, + "grad_norm": 0.8353488445281982, + "learning_rate": 2.052645053912983e-06, + "loss": 0.5818, + "step": 21022 + }, + { + "epoch": 3.7428774928774926, + "grad_norm": 0.9077821373939514, + "learning_rate": 2.049824504766651e-06, + "loss": 0.7353, + "step": 21023 + }, + { + "epoch": 3.7430555555555554, + "grad_norm": 1.0203959941864014, + "learning_rate": 2.0470058747505516e-06, + "loss": 0.8087, + "step": 21024 + }, + { + "epoch": 3.743233618233618, + "grad_norm": 1.1064496040344238, + "learning_rate": 2.0441891639198964e-06, + "loss": 0.6609, + "step": 21025 + }, + { + "epoch": 3.743411680911681, + "grad_norm": 0.9128747582435608, + "learning_rate": 2.041374372329852e-06, + "loss": 0.7507, + "step": 21026 + }, + { + "epoch": 3.7435897435897436, + "grad_norm": 0.8393090963363647, + "learning_rate": 2.0385615000356075e-06, + "loss": 0.5821, + "step": 21027 + }, + { + "epoch": 3.7437678062678064, + "grad_norm": 0.9668859243392944, + "learning_rate": 2.03575054709223e-06, + "loss": 0.863, + "step": 21028 + }, + { + "epoch": 3.743945868945869, + "grad_norm": 0.7194132208824158, + "learning_rate": 2.0329415135548313e-06, + "loss": 0.4231, + "step": 21029 + }, + { + "epoch": 3.744123931623932, + "grad_norm": 1.122671365737915, + "learning_rate": 2.030134399478434e-06, + "loss": 0.7939, + "step": 21030 + }, + { + "epoch": 3.744301994301994, + "grad_norm": 0.9349450469017029, + "learning_rate": 2.027329204918027e-06, + "loss": 0.8354, + "step": 21031 + }, + { + "epoch": 3.744480056980057, + "grad_norm": 0.9546355605125427, + "learning_rate": 2.0245259299286003e-06, + "loss": 0.6134, + "step": 21032 + }, + { + "epoch": 3.7446581196581197, + "grad_norm": 0.8751131296157837, + 
"learning_rate": 2.0217245745650536e-06, + "loss": 0.6028, + "step": 21033 + }, + { + "epoch": 3.7448361823361824, + "grad_norm": 1.0459450483322144, + "learning_rate": 2.0189251388822773e-06, + "loss": 0.6969, + "step": 21034 + }, + { + "epoch": 3.745014245014245, + "grad_norm": 1.080256700515747, + "learning_rate": 2.0161276229351376e-06, + "loss": 0.7175, + "step": 21035 + }, + { + "epoch": 3.7451923076923075, + "grad_norm": 0.9239566326141357, + "learning_rate": 2.0133320267784137e-06, + "loss": 0.7177, + "step": 21036 + }, + { + "epoch": 3.7453703703703702, + "grad_norm": 1.0287296772003174, + "learning_rate": 2.0105383504669172e-06, + "loss": 0.6814, + "step": 21037 + }, + { + "epoch": 3.745548433048433, + "grad_norm": 0.8914669156074524, + "learning_rate": 2.0077465940553596e-06, + "loss": 0.7653, + "step": 21038 + }, + { + "epoch": 3.7457264957264957, + "grad_norm": 0.9368272423744202, + "learning_rate": 2.004956757598453e-06, + "loss": 0.7811, + "step": 21039 + }, + { + "epoch": 3.7459045584045585, + "grad_norm": 0.9822359085083008, + "learning_rate": 2.002168841150842e-06, + "loss": 0.7224, + "step": 21040 + }, + { + "epoch": 3.7460826210826212, + "grad_norm": 1.0479099750518799, + "learning_rate": 1.9993828447671614e-06, + "loss": 0.7241, + "step": 21041 + }, + { + "epoch": 3.746260683760684, + "grad_norm": 0.8748711347579956, + "learning_rate": 1.9965987685020003e-06, + "loss": 0.7637, + "step": 21042 + }, + { + "epoch": 3.7464387464387463, + "grad_norm": 1.1019326448440552, + "learning_rate": 1.993816612409893e-06, + "loss": 0.88, + "step": 21043 + }, + { + "epoch": 3.746616809116809, + "grad_norm": 0.8620260953903198, + "learning_rate": 1.9910363765453633e-06, + "loss": 0.657, + "step": 21044 + }, + { + "epoch": 3.746794871794872, + "grad_norm": 1.2118844985961914, + "learning_rate": 1.9882580609628774e-06, + "loss": 0.8122, + "step": 21045 + }, + { + "epoch": 3.7469729344729346, + "grad_norm": 0.9898678660392761, + "learning_rate": 
1.985481665716882e-06, + "loss": 0.9632, + "step": 21046 + }, + { + "epoch": 3.7471509971509973, + "grad_norm": 1.0883867740631104, + "learning_rate": 1.9827071908617545e-06, + "loss": 0.8129, + "step": 21047 + }, + { + "epoch": 3.7473290598290596, + "grad_norm": 0.9610900282859802, + "learning_rate": 1.9799346364518745e-06, + "loss": 0.8895, + "step": 21048 + }, + { + "epoch": 3.7475071225071224, + "grad_norm": 1.0719484090805054, + "learning_rate": 1.977164002541554e-06, + "loss": 0.7515, + "step": 21049 + }, + { + "epoch": 3.747685185185185, + "grad_norm": 0.915931761264801, + "learning_rate": 1.9743952891850825e-06, + "loss": 0.8945, + "step": 21050 + }, + { + "epoch": 3.747863247863248, + "grad_norm": 0.8382566571235657, + "learning_rate": 1.971628496436695e-06, + "loss": 0.5806, + "step": 21051 + }, + { + "epoch": 3.7480413105413106, + "grad_norm": 0.9513857960700989, + "learning_rate": 1.968863624350625e-06, + "loss": 0.6873, + "step": 21052 + }, + { + "epoch": 3.7482193732193734, + "grad_norm": 0.8445575833320618, + "learning_rate": 1.9661006729810196e-06, + "loss": 0.7492, + "step": 21053 + }, + { + "epoch": 3.748397435897436, + "grad_norm": 1.137221097946167, + "learning_rate": 1.963339642382034e-06, + "loss": 0.796, + "step": 21054 + }, + { + "epoch": 3.7485754985754984, + "grad_norm": 0.9879746437072754, + "learning_rate": 1.9605805326077364e-06, + "loss": 0.6656, + "step": 21055 + }, + { + "epoch": 3.748753561253561, + "grad_norm": 0.8657693266868591, + "learning_rate": 1.9578233437122173e-06, + "loss": 0.7989, + "step": 21056 + }, + { + "epoch": 3.748931623931624, + "grad_norm": 0.9027921557426453, + "learning_rate": 1.9550680757494887e-06, + "loss": 0.7416, + "step": 21057 + }, + { + "epoch": 3.7491096866096867, + "grad_norm": 0.9542195200920105, + "learning_rate": 1.952314728773519e-06, + "loss": 0.7625, + "step": 21058 + }, + { + "epoch": 3.7492877492877494, + "grad_norm": 0.9206588864326477, + "learning_rate": 1.9495633028382755e-06, + "loss": 
0.5892, + "step": 21059 + }, + { + "epoch": 3.7494658119658117, + "grad_norm": 0.8397642374038696, + "learning_rate": 1.9468137979976597e-06, + "loss": 0.6284, + "step": 21060 + }, + { + "epoch": 3.7494658119658117, + "eval_loss": 1.1768102645874023, + "eval_runtime": 25.9484, + "eval_samples_per_second": 40.118, + "eval_steps_per_second": 20.078, + "step": 21060 + }, + { + "epoch": 3.7496438746438745, + "grad_norm": 0.996372640132904, + "learning_rate": 1.944066214305518e-06, + "loss": 0.6501, + "step": 21061 + }, + { + "epoch": 3.7498219373219372, + "grad_norm": 1.2021937370300293, + "learning_rate": 1.9413205518157284e-06, + "loss": 0.7819, + "step": 21062 + }, + { + "epoch": 3.75, + "grad_norm": 0.9370668530464172, + "learning_rate": 1.938576810582049e-06, + "loss": 0.5687, + "step": 21063 + }, + { + "epoch": 3.7501780626780628, + "grad_norm": 0.8952577114105225, + "learning_rate": 1.9358349906582694e-06, + "loss": 0.6616, + "step": 21064 + }, + { + "epoch": 3.7503561253561255, + "grad_norm": 0.9440550804138184, + "learning_rate": 1.9330950920980696e-06, + "loss": 0.8607, + "step": 21065 + }, + { + "epoch": 3.7505341880341883, + "grad_norm": 0.897367000579834, + "learning_rate": 1.9303571149551726e-06, + "loss": 0.7928, + "step": 21066 + }, + { + "epoch": 3.7507122507122506, + "grad_norm": 0.932023823261261, + "learning_rate": 1.9276210592832023e-06, + "loss": 0.7398, + "step": 21067 + }, + { + "epoch": 3.7508903133903133, + "grad_norm": 0.9534773230552673, + "learning_rate": 1.92488692513576e-06, + "loss": 0.7433, + "step": 21068 + }, + { + "epoch": 3.751068376068376, + "grad_norm": 0.8925126791000366, + "learning_rate": 1.9221547125664372e-06, + "loss": 0.8369, + "step": 21069 + }, + { + "epoch": 3.751246438746439, + "grad_norm": 1.084887146949768, + "learning_rate": 1.919424421628746e-06, + "loss": 0.8246, + "step": 21070 + }, + { + "epoch": 3.7514245014245016, + "grad_norm": 0.9539969563484192, + "learning_rate": 1.916696052376188e-06, + "loss": 0.726, + 
"step": 21071 + }, + { + "epoch": 3.751602564102564, + "grad_norm": 0.8407643437385559, + "learning_rate": 1.9139696048622313e-06, + "loss": 0.6814, + "step": 21072 + }, + { + "epoch": 3.7517806267806266, + "grad_norm": 1.1756813526153564, + "learning_rate": 1.911245079140278e-06, + "loss": 0.9399, + "step": 21073 + }, + { + "epoch": 3.7519586894586894, + "grad_norm": 1.0977120399475098, + "learning_rate": 1.9085224752637075e-06, + "loss": 0.9266, + "step": 21074 + }, + { + "epoch": 3.752136752136752, + "grad_norm": 0.9862158298492432, + "learning_rate": 1.9058017932858773e-06, + "loss": 0.7854, + "step": 21075 + }, + { + "epoch": 3.752314814814815, + "grad_norm": 1.0166863203048706, + "learning_rate": 1.9030830332601001e-06, + "loss": 0.6384, + "step": 21076 + }, + { + "epoch": 3.7524928774928776, + "grad_norm": 0.900991678237915, + "learning_rate": 1.9003661952396224e-06, + "loss": 0.7819, + "step": 21077 + }, + { + "epoch": 3.7526709401709404, + "grad_norm": 0.8836488127708435, + "learning_rate": 1.897651279277668e-06, + "loss": 0.5465, + "step": 21078 + }, + { + "epoch": 3.7528490028490027, + "grad_norm": 0.9742031693458557, + "learning_rate": 1.8949382854274722e-06, + "loss": 0.7809, + "step": 21079 + }, + { + "epoch": 3.7530270655270654, + "grad_norm": 0.9790308475494385, + "learning_rate": 1.8922272137421482e-06, + "loss": 0.6517, + "step": 21080 + }, + { + "epoch": 3.753205128205128, + "grad_norm": 1.0090285539627075, + "learning_rate": 1.8895180642748422e-06, + "loss": 0.7865, + "step": 21081 + }, + { + "epoch": 3.753383190883191, + "grad_norm": 0.9838883876800537, + "learning_rate": 1.886810837078623e-06, + "loss": 0.6011, + "step": 21082 + }, + { + "epoch": 3.7535612535612537, + "grad_norm": 0.9217146039009094, + "learning_rate": 1.884105532206526e-06, + "loss": 0.7815, + "step": 21083 + }, + { + "epoch": 3.753739316239316, + "grad_norm": 1.0938522815704346, + "learning_rate": 1.8814021497115642e-06, + "loss": 0.9792, + "step": 21084 + }, + { + "epoch": 
3.7539173789173788, + "grad_norm": 1.0393195152282715, + "learning_rate": 1.8787006896466952e-06, + "loss": 0.7068, + "step": 21085 + }, + { + "epoch": 3.7540954415954415, + "grad_norm": 0.9190340042114258, + "learning_rate": 1.8760011520648656e-06, + "loss": 0.7742, + "step": 21086 + }, + { + "epoch": 3.7542735042735043, + "grad_norm": 0.9961007833480835, + "learning_rate": 1.8733035370189667e-06, + "loss": 0.884, + "step": 21087 + }, + { + "epoch": 3.754451566951567, + "grad_norm": 0.9565320014953613, + "learning_rate": 1.8706078445618336e-06, + "loss": 0.6365, + "step": 21088 + }, + { + "epoch": 3.7546296296296298, + "grad_norm": 0.9802839756011963, + "learning_rate": 1.867914074746302e-06, + "loss": 0.64, + "step": 21089 + }, + { + "epoch": 3.7548076923076925, + "grad_norm": 0.9352937340736389, + "learning_rate": 1.8652222276251408e-06, + "loss": 0.6464, + "step": 21090 + }, + { + "epoch": 3.754985754985755, + "grad_norm": 0.960495114326477, + "learning_rate": 1.8625323032510855e-06, + "loss": 0.7469, + "step": 21091 + }, + { + "epoch": 3.7551638176638176, + "grad_norm": 1.088507056236267, + "learning_rate": 1.8598443016768497e-06, + "loss": 0.9544, + "step": 21092 + }, + { + "epoch": 3.7553418803418803, + "grad_norm": 0.871731162071228, + "learning_rate": 1.8571582229550911e-06, + "loss": 0.7206, + "step": 21093 + }, + { + "epoch": 3.755519943019943, + "grad_norm": 0.9332573413848877, + "learning_rate": 1.8544740671384565e-06, + "loss": 0.8691, + "step": 21094 + }, + { + "epoch": 3.755698005698006, + "grad_norm": 0.9459754228591919, + "learning_rate": 1.8517918342795148e-06, + "loss": 0.7293, + "step": 21095 + }, + { + "epoch": 3.755876068376068, + "grad_norm": 1.2970104217529297, + "learning_rate": 1.8491115244308243e-06, + "loss": 0.8888, + "step": 21096 + }, + { + "epoch": 3.756054131054131, + "grad_norm": 0.8978582620620728, + "learning_rate": 1.8464331376449095e-06, + "loss": 0.6223, + "step": 21097 + }, + { + "epoch": 3.7562321937321936, + "grad_norm": 
0.8985556960105896, + "learning_rate": 1.8437566739742396e-06, + "loss": 0.6488, + "step": 21098 + }, + { + "epoch": 3.7564102564102564, + "grad_norm": 1.0541517734527588, + "learning_rate": 1.8410821334712503e-06, + "loss": 0.7203, + "step": 21099 + }, + { + "epoch": 3.756588319088319, + "grad_norm": 0.96063631772995, + "learning_rate": 1.8384095161883552e-06, + "loss": 0.7773, + "step": 21100 + }, + { + "epoch": 3.756766381766382, + "grad_norm": 1.0475854873657227, + "learning_rate": 1.8357388221779126e-06, + "loss": 0.8795, + "step": 21101 + }, + { + "epoch": 3.7569444444444446, + "grad_norm": 0.9499286413192749, + "learning_rate": 1.8330700514922472e-06, + "loss": 0.7407, + "step": 21102 + }, + { + "epoch": 3.7571225071225074, + "grad_norm": 0.94266676902771, + "learning_rate": 1.8304032041836393e-06, + "loss": 0.7239, + "step": 21103 + }, + { + "epoch": 3.7573005698005697, + "grad_norm": 0.9316100478172302, + "learning_rate": 1.8277382803043585e-06, + "loss": 0.7977, + "step": 21104 + }, + { + "epoch": 3.7574786324786325, + "grad_norm": 0.9400086998939514, + "learning_rate": 1.8250752799066184e-06, + "loss": 0.6354, + "step": 21105 + }, + { + "epoch": 3.757656695156695, + "grad_norm": 0.9189943671226501, + "learning_rate": 1.8224142030425772e-06, + "loss": 0.7169, + "step": 21106 + }, + { + "epoch": 3.757834757834758, + "grad_norm": 1.0137799978256226, + "learning_rate": 1.8197550497643935e-06, + "loss": 0.5435, + "step": 21107 + }, + { + "epoch": 3.7580128205128203, + "grad_norm": 0.9665256142616272, + "learning_rate": 1.8170978201241474e-06, + "loss": 0.7213, + "step": 21108 + }, + { + "epoch": 3.758190883190883, + "grad_norm": 0.9698337316513062, + "learning_rate": 1.8144425141739196e-06, + "loss": 0.7327, + "step": 21109 + }, + { + "epoch": 3.7583689458689458, + "grad_norm": 1.0307881832122803, + "learning_rate": 1.8117891319657243e-06, + "loss": 0.7908, + "step": 21110 + }, + { + "epoch": 3.7585470085470085, + "grad_norm": 1.087488055229187, + 
"learning_rate": 1.809137673551564e-06, + "loss": 0.662, + "step": 21111 + }, + { + "epoch": 3.7587250712250713, + "grad_norm": 0.8748802542686462, + "learning_rate": 1.8064881389833533e-06, + "loss": 0.625, + "step": 21112 + }, + { + "epoch": 3.758903133903134, + "grad_norm": 0.9132877588272095, + "learning_rate": 1.8038405283130499e-06, + "loss": 0.9392, + "step": 21113 + }, + { + "epoch": 3.7590811965811968, + "grad_norm": 1.169862985610962, + "learning_rate": 1.8011948415925017e-06, + "loss": 0.7501, + "step": 21114 + }, + { + "epoch": 3.7592592592592595, + "grad_norm": 1.095362663269043, + "learning_rate": 1.7985510788735449e-06, + "loss": 0.7822, + "step": 21115 + }, + { + "epoch": 3.759437321937322, + "grad_norm": 1.0184403657913208, + "learning_rate": 1.7959092402079825e-06, + "loss": 0.8347, + "step": 21116 + }, + { + "epoch": 3.7596153846153846, + "grad_norm": 1.0453873872756958, + "learning_rate": 1.793269325647584e-06, + "loss": 0.6534, + "step": 21117 + }, + { + "epoch": 3.7597934472934473, + "grad_norm": 0.8609569668769836, + "learning_rate": 1.790631335244053e-06, + "loss": 0.7615, + "step": 21118 + }, + { + "epoch": 3.75997150997151, + "grad_norm": 0.960797131061554, + "learning_rate": 1.7879952690491141e-06, + "loss": 0.6932, + "step": 21119 + }, + { + "epoch": 3.7601495726495724, + "grad_norm": 0.9469041228294373, + "learning_rate": 1.78536112711436e-06, + "loss": 0.7238, + "step": 21120 + }, + { + "epoch": 3.760327635327635, + "grad_norm": 0.9235484600067139, + "learning_rate": 1.782728909491449e-06, + "loss": 0.7654, + "step": 21121 + }, + { + "epoch": 3.760505698005698, + "grad_norm": 1.1060317754745483, + "learning_rate": 1.78009861623194e-06, + "loss": 0.8391, + "step": 21122 + }, + { + "epoch": 3.7606837606837606, + "grad_norm": 0.9043984413146973, + "learning_rate": 1.7774702473873584e-06, + "loss": 0.8132, + "step": 21123 + }, + { + "epoch": 3.7608618233618234, + "grad_norm": 0.9583942890167236, + "learning_rate": 1.7748438030092074e-06, + 
"loss": 0.695, + "step": 21124 + }, + { + "epoch": 3.761039886039886, + "grad_norm": 0.9370834231376648, + "learning_rate": 1.7722192831489348e-06, + "loss": 0.829, + "step": 21125 + }, + { + "epoch": 3.761217948717949, + "grad_norm": 0.9919298887252808, + "learning_rate": 1.7695966878579994e-06, + "loss": 0.9974, + "step": 21126 + }, + { + "epoch": 3.7613960113960117, + "grad_norm": 0.9389301538467407, + "learning_rate": 1.766976017187738e-06, + "loss": 0.8368, + "step": 21127 + }, + { + "epoch": 3.761574074074074, + "grad_norm": 0.8722683787345886, + "learning_rate": 1.7643572711895318e-06, + "loss": 0.689, + "step": 21128 + }, + { + "epoch": 3.7617521367521367, + "grad_norm": 1.0586130619049072, + "learning_rate": 1.7617404499146839e-06, + "loss": 0.8207, + "step": 21129 + }, + { + "epoch": 3.7619301994301995, + "grad_norm": 1.040549397468567, + "learning_rate": 1.7591255534144535e-06, + "loss": 0.878, + "step": 21130 + }, + { + "epoch": 3.762108262108262, + "grad_norm": 1.0269027948379517, + "learning_rate": 1.7565125817400773e-06, + "loss": 0.867, + "step": 21131 + }, + { + "epoch": 3.7622863247863245, + "grad_norm": 0.923396110534668, + "learning_rate": 1.753901534942759e-06, + "loss": 0.7129, + "step": 21132 + }, + { + "epoch": 3.7624643874643873, + "grad_norm": 1.132996916770935, + "learning_rate": 1.7512924130736353e-06, + "loss": 0.7754, + "step": 21133 + }, + { + "epoch": 3.76264245014245, + "grad_norm": 0.9143159985542297, + "learning_rate": 1.7486852161838653e-06, + "loss": 0.7042, + "step": 21134 + }, + { + "epoch": 3.7628205128205128, + "grad_norm": 0.9809998869895935, + "learning_rate": 1.7460799443244968e-06, + "loss": 0.8026, + "step": 21135 + }, + { + "epoch": 3.7629985754985755, + "grad_norm": 0.9136005640029907, + "learning_rate": 1.7434765975466006e-06, + "loss": 0.8317, + "step": 21136 + }, + { + "epoch": 3.7631766381766383, + "grad_norm": 0.8698214292526245, + "learning_rate": 1.7408751759011465e-06, + "loss": 0.8658, + "step": 21137 + }, + 
{ + "epoch": 3.763354700854701, + "grad_norm": 1.0867608785629272, + "learning_rate": 1.7382756794391386e-06, + "loss": 0.9433, + "step": 21138 + }, + { + "epoch": 3.763532763532764, + "grad_norm": 0.8608876466751099, + "learning_rate": 1.7356781082115026e-06, + "loss": 0.6308, + "step": 21139 + }, + { + "epoch": 3.763710826210826, + "grad_norm": 0.8908528685569763, + "learning_rate": 1.733082462269131e-06, + "loss": 0.6537, + "step": 21140 + }, + { + "epoch": 3.763888888888889, + "grad_norm": 0.9231675863265991, + "learning_rate": 1.7304887416628724e-06, + "loss": 0.8053, + "step": 21141 + }, + { + "epoch": 3.7640669515669516, + "grad_norm": 0.9419436454772949, + "learning_rate": 1.7278969464435413e-06, + "loss": 0.8501, + "step": 21142 + }, + { + "epoch": 3.7642450142450143, + "grad_norm": 1.0135849714279175, + "learning_rate": 1.7253070766619305e-06, + "loss": 0.8137, + "step": 21143 + }, + { + "epoch": 3.7644230769230766, + "grad_norm": 0.9566580057144165, + "learning_rate": 1.7227191323687774e-06, + "loss": 0.73, + "step": 21144 + }, + { + "epoch": 3.7646011396011394, + "grad_norm": 1.033128261566162, + "learning_rate": 1.7201331136147968e-06, + "loss": 0.6961, + "step": 21145 + }, + { + "epoch": 3.764779202279202, + "grad_norm": 0.9674811959266663, + "learning_rate": 1.7175490204506484e-06, + "loss": 0.8184, + "step": 21146 + }, + { + "epoch": 3.764957264957265, + "grad_norm": 0.9115480184555054, + "learning_rate": 1.714966852926958e-06, + "loss": 0.9166, + "step": 21147 + }, + { + "epoch": 3.7651353276353277, + "grad_norm": 0.9724944233894348, + "learning_rate": 1.7123866110943298e-06, + "loss": 0.7997, + "step": 21148 + }, + { + "epoch": 3.7653133903133904, + "grad_norm": 0.9939026832580566, + "learning_rate": 1.7098082950033124e-06, + "loss": 0.7107, + "step": 21149 + }, + { + "epoch": 3.765491452991453, + "grad_norm": 0.9845420718193054, + "learning_rate": 1.7072319047044094e-06, + "loss": 0.7363, + "step": 21150 + }, + { + "epoch": 3.765669515669516, + 
"grad_norm": 1.0131663084030151, + "learning_rate": 1.7046574402481253e-06, + "loss": 0.5866, + "step": 21151 + }, + { + "epoch": 3.765847578347578, + "grad_norm": 1.0212078094482422, + "learning_rate": 1.702084901684875e-06, + "loss": 0.8843, + "step": 21152 + }, + { + "epoch": 3.766025641025641, + "grad_norm": 0.9821169972419739, + "learning_rate": 1.699514289065074e-06, + "loss": 0.8066, + "step": 21153 + }, + { + "epoch": 3.7662037037037037, + "grad_norm": 0.8868746757507324, + "learning_rate": 1.696945602439104e-06, + "loss": 0.789, + "step": 21154 + }, + { + "epoch": 3.7663817663817665, + "grad_norm": 0.9874410033226013, + "learning_rate": 1.6943788418572692e-06, + "loss": 0.7745, + "step": 21155 + }, + { + "epoch": 3.7665598290598292, + "grad_norm": 0.8668256402015686, + "learning_rate": 1.691814007369863e-06, + "loss": 0.7893, + "step": 21156 + }, + { + "epoch": 3.7667378917378915, + "grad_norm": 1.0595078468322754, + "learning_rate": 1.689251099027156e-06, + "loss": 0.7377, + "step": 21157 + }, + { + "epoch": 3.7669159544159543, + "grad_norm": 1.0612707138061523, + "learning_rate": 1.6866901168793413e-06, + "loss": 0.7955, + "step": 21158 + }, + { + "epoch": 3.767094017094017, + "grad_norm": 0.8881635665893555, + "learning_rate": 1.6841310609766126e-06, + "loss": 0.8739, + "step": 21159 + }, + { + "epoch": 3.76727207977208, + "grad_norm": 0.9735580086708069, + "learning_rate": 1.681573931369096e-06, + "loss": 0.7704, + "step": 21160 + }, + { + "epoch": 3.7674501424501425, + "grad_norm": 0.9362749457359314, + "learning_rate": 1.6790187281069069e-06, + "loss": 0.7712, + "step": 21161 + }, + { + "epoch": 3.7676282051282053, + "grad_norm": 0.9322159290313721, + "learning_rate": 1.6764654512400946e-06, + "loss": 0.741, + "step": 21162 + }, + { + "epoch": 3.767806267806268, + "grad_norm": 0.9788293242454529, + "learning_rate": 1.6739141008186854e-06, + "loss": 0.7966, + "step": 21163 + }, + { + "epoch": 3.7679843304843303, + "grad_norm": 0.9341052174568176, + 
"learning_rate": 1.671364676892695e-06, + "loss": 0.5836, + "step": 21164 + }, + { + "epoch": 3.768162393162393, + "grad_norm": 1.0333666801452637, + "learning_rate": 1.6688171795120278e-06, + "loss": 0.7703, + "step": 21165 + }, + { + "epoch": 3.768340455840456, + "grad_norm": 1.1365190744400024, + "learning_rate": 1.666271608726644e-06, + "loss": 0.8859, + "step": 21166 + }, + { + "epoch": 3.7685185185185186, + "grad_norm": 0.9362785220146179, + "learning_rate": 1.6637279645863923e-06, + "loss": 0.6945, + "step": 21167 + }, + { + "epoch": 3.7686965811965814, + "grad_norm": 0.9373227953910828, + "learning_rate": 1.661186247141111e-06, + "loss": 0.833, + "step": 21168 + }, + { + "epoch": 3.7688746438746437, + "grad_norm": 1.067119836807251, + "learning_rate": 1.6586464564406046e-06, + "loss": 0.78, + "step": 21169 + }, + { + "epoch": 3.7690527065527064, + "grad_norm": 1.0004785060882568, + "learning_rate": 1.6561085925346332e-06, + "loss": 0.7274, + "step": 21170 + }, + { + "epoch": 3.769230769230769, + "grad_norm": 0.803828239440918, + "learning_rate": 1.6535726554729347e-06, + "loss": 0.5589, + "step": 21171 + }, + { + "epoch": 3.769408831908832, + "grad_norm": 0.9255486726760864, + "learning_rate": 1.6510386453051695e-06, + "loss": 0.8917, + "step": 21172 + }, + { + "epoch": 3.7695868945868947, + "grad_norm": 0.9268200397491455, + "learning_rate": 1.6485065620810092e-06, + "loss": 0.7948, + "step": 21173 + }, + { + "epoch": 3.7697649572649574, + "grad_norm": 1.0468385219573975, + "learning_rate": 1.6459764058500472e-06, + "loss": 0.6552, + "step": 21174 + }, + { + "epoch": 3.76994301994302, + "grad_norm": 0.859835147857666, + "learning_rate": 1.643448176661866e-06, + "loss": 0.7813, + "step": 21175 + }, + { + "epoch": 3.7701210826210825, + "grad_norm": 1.0402703285217285, + "learning_rate": 1.6409218745660037e-06, + "loss": 0.8416, + "step": 21176 + }, + { + "epoch": 3.7702991452991452, + "grad_norm": 0.8524719476699829, + "learning_rate": 
1.6383974996119434e-06, + "loss": 0.5359, + "step": 21177 + }, + { + "epoch": 3.770477207977208, + "grad_norm": 0.9544599056243896, + "learning_rate": 1.6358750518491672e-06, + "loss": 0.7721, + "step": 21178 + }, + { + "epoch": 3.7706552706552707, + "grad_norm": 1.1628994941711426, + "learning_rate": 1.6333545313270803e-06, + "loss": 0.7837, + "step": 21179 + }, + { + "epoch": 3.7708333333333335, + "grad_norm": 0.9267588257789612, + "learning_rate": 1.6308359380950765e-06, + "loss": 0.6741, + "step": 21180 + }, + { + "epoch": 3.771011396011396, + "grad_norm": 0.9370381236076355, + "learning_rate": 1.6283192722024942e-06, + "loss": 0.8062, + "step": 21181 + }, + { + "epoch": 3.7711894586894585, + "grad_norm": 0.8971649408340454, + "learning_rate": 1.6258045336986493e-06, + "loss": 0.7223, + "step": 21182 + }, + { + "epoch": 3.7713675213675213, + "grad_norm": 0.8068461418151855, + "learning_rate": 1.6232917226328137e-06, + "loss": 0.8003, + "step": 21183 + }, + { + "epoch": 3.771545584045584, + "grad_norm": 0.830176830291748, + "learning_rate": 1.6207808390542034e-06, + "loss": 0.5199, + "step": 21184 + }, + { + "epoch": 3.771723646723647, + "grad_norm": 1.0348658561706543, + "learning_rate": 1.6182718830120346e-06, + "loss": 0.9757, + "step": 21185 + }, + { + "epoch": 3.7719017094017095, + "grad_norm": 0.9341155290603638, + "learning_rate": 1.6157648545554571e-06, + "loss": 0.7659, + "step": 21186 + }, + { + "epoch": 3.7720797720797723, + "grad_norm": 0.9962141513824463, + "learning_rate": 1.613259753733598e-06, + "loss": 0.8433, + "step": 21187 + }, + { + "epoch": 3.7722578347578346, + "grad_norm": 0.994513988494873, + "learning_rate": 1.6107565805955293e-06, + "loss": 0.9445, + "step": 21188 + }, + { + "epoch": 3.7724358974358974, + "grad_norm": 0.9562044143676758, + "learning_rate": 1.6082553351903008e-06, + "loss": 0.7425, + "step": 21189 + }, + { + "epoch": 3.77261396011396, + "grad_norm": 0.9779346585273743, + "learning_rate": 1.6057560175669062e-06, + 
"loss": 0.6848, + "step": 21190 + }, + { + "epoch": 3.772792022792023, + "grad_norm": 1.0121656656265259, + "learning_rate": 1.603258627774351e-06, + "loss": 0.8805, + "step": 21191 + }, + { + "epoch": 3.7729700854700856, + "grad_norm": 0.8703751564025879, + "learning_rate": 1.6007631658615186e-06, + "loss": 0.763, + "step": 21192 + }, + { + "epoch": 3.773148148148148, + "grad_norm": 0.9765893220901489, + "learning_rate": 1.598269631877336e-06, + "loss": 0.7559, + "step": 21193 + }, + { + "epoch": 3.7733262108262107, + "grad_norm": 1.0756961107254028, + "learning_rate": 1.5957780258706423e-06, + "loss": 0.6645, + "step": 21194 + }, + { + "epoch": 3.7735042735042734, + "grad_norm": 0.9320286512374878, + "learning_rate": 1.593288347890265e-06, + "loss": 0.6629, + "step": 21195 + }, + { + "epoch": 3.773682336182336, + "grad_norm": 0.930663526058197, + "learning_rate": 1.590800597984976e-06, + "loss": 0.715, + "step": 21196 + }, + { + "epoch": 3.773860398860399, + "grad_norm": 0.9290189743041992, + "learning_rate": 1.5883147762035366e-06, + "loss": 0.7135, + "step": 21197 + }, + { + "epoch": 3.7740384615384617, + "grad_norm": 0.9259148240089417, + "learning_rate": 1.585830882594619e-06, + "loss": 0.789, + "step": 21198 + }, + { + "epoch": 3.7742165242165244, + "grad_norm": 1.0864899158477783, + "learning_rate": 1.583348917206917e-06, + "loss": 0.6689, + "step": 21199 + }, + { + "epoch": 3.7743945868945867, + "grad_norm": 0.9391498565673828, + "learning_rate": 1.580868880089037e-06, + "loss": 0.5958, + "step": 21200 + }, + { + "epoch": 3.7745726495726495, + "grad_norm": 0.9732069373130798, + "learning_rate": 1.5783907712896062e-06, + "loss": 0.8109, + "step": 21201 + }, + { + "epoch": 3.7747507122507122, + "grad_norm": 0.9975631237030029, + "learning_rate": 1.5759145908571304e-06, + "loss": 0.6909, + "step": 21202 + }, + { + "epoch": 3.774928774928775, + "grad_norm": 0.9436360001564026, + "learning_rate": 1.5734403388401597e-06, + "loss": 0.7629, + "step": 21203 + }, + 
{ + "epoch": 3.7751068376068377, + "grad_norm": 0.9783167839050293, + "learning_rate": 1.5709680152871664e-06, + "loss": 0.7348, + "step": 21204 + }, + { + "epoch": 3.7752849002849, + "grad_norm": 0.9854038953781128, + "learning_rate": 1.5684976202465784e-06, + "loss": 0.687, + "step": 21205 + }, + { + "epoch": 3.775462962962963, + "grad_norm": 0.8817933797836304, + "learning_rate": 1.5660291537668237e-06, + "loss": 0.7331, + "step": 21206 + }, + { + "epoch": 3.7756410256410255, + "grad_norm": 0.8301796317100525, + "learning_rate": 1.5635626158962303e-06, + "loss": 0.8223, + "step": 21207 + }, + { + "epoch": 3.7758190883190883, + "grad_norm": 1.1981323957443237, + "learning_rate": 1.5610980066831593e-06, + "loss": 0.7869, + "step": 21208 + }, + { + "epoch": 3.775997150997151, + "grad_norm": 1.0148868560791016, + "learning_rate": 1.5586353261758612e-06, + "loss": 0.7864, + "step": 21209 + }, + { + "epoch": 3.776175213675214, + "grad_norm": 0.9011337161064148, + "learning_rate": 1.5561745744226198e-06, + "loss": 0.6761, + "step": 21210 + }, + { + "epoch": 3.7763532763532766, + "grad_norm": 0.9810119271278381, + "learning_rate": 1.5537157514716516e-06, + "loss": 0.6671, + "step": 21211 + }, + { + "epoch": 3.7765313390313393, + "grad_norm": 0.9342007040977478, + "learning_rate": 1.5512588573711074e-06, + "loss": 0.696, + "step": 21212 + }, + { + "epoch": 3.7767094017094016, + "grad_norm": 1.0802044868469238, + "learning_rate": 1.5488038921691373e-06, + "loss": 0.8527, + "step": 21213 + }, + { + "epoch": 3.7768874643874644, + "grad_norm": 0.8123725652694702, + "learning_rate": 1.5463508559138362e-06, + "loss": 0.676, + "step": 21214 + }, + { + "epoch": 3.777065527065527, + "grad_norm": 0.9726629853248596, + "learning_rate": 1.5438997486532658e-06, + "loss": 0.6526, + "step": 21215 + }, + { + "epoch": 3.77724358974359, + "grad_norm": 0.8095312714576721, + "learning_rate": 1.5414505704354764e-06, + "loss": 0.4255, + "step": 21216 + }, + { + "epoch": 3.777421652421652, + 
"grad_norm": 0.8774579763412476, + "learning_rate": 1.5390033213084075e-06, + "loss": 0.6785, + "step": 21217 + }, + { + "epoch": 3.777599715099715, + "grad_norm": 1.214205265045166, + "learning_rate": 1.5365580013200542e-06, + "loss": 0.7598, + "step": 21218 + }, + { + "epoch": 3.7777777777777777, + "grad_norm": 0.9117157459259033, + "learning_rate": 1.5341146105183002e-06, + "loss": 0.7835, + "step": 21219 + }, + { + "epoch": 3.7779558404558404, + "grad_norm": 0.9653142690658569, + "learning_rate": 1.5316731489510184e-06, + "loss": 0.7136, + "step": 21220 + }, + { + "epoch": 3.778133903133903, + "grad_norm": 0.9996618628501892, + "learning_rate": 1.5292336166660591e-06, + "loss": 0.9551, + "step": 21221 + }, + { + "epoch": 3.778311965811966, + "grad_norm": 0.880121111869812, + "learning_rate": 1.526796013711207e-06, + "loss": 0.6935, + "step": 21222 + }, + { + "epoch": 3.7784900284900287, + "grad_norm": 1.0504447221755981, + "learning_rate": 1.5243603401342343e-06, + "loss": 0.7668, + "step": 21223 + }, + { + "epoch": 3.7786680911680914, + "grad_norm": 0.9271687269210815, + "learning_rate": 1.5219265959828367e-06, + "loss": 0.7224, + "step": 21224 + }, + { + "epoch": 3.7788461538461537, + "grad_norm": 0.9729786515235901, + "learning_rate": 1.5194947813047312e-06, + "loss": 0.7342, + "step": 21225 + }, + { + "epoch": 3.7790242165242165, + "grad_norm": 1.0969620943069458, + "learning_rate": 1.5170648961475576e-06, + "loss": 0.8477, + "step": 21226 + }, + { + "epoch": 3.7792022792022792, + "grad_norm": 0.8730276226997375, + "learning_rate": 1.5146369405589001e-06, + "loss": 0.8116, + "step": 21227 + }, + { + "epoch": 3.779380341880342, + "grad_norm": 1.0915426015853882, + "learning_rate": 1.512210914586365e-06, + "loss": 0.848, + "step": 21228 + }, + { + "epoch": 3.7795584045584043, + "grad_norm": 0.9392247796058655, + "learning_rate": 1.5097868182774477e-06, + "loss": 0.8485, + "step": 21229 + }, + { + "epoch": 3.779736467236467, + "grad_norm": 0.9891657829284668, 
+ "learning_rate": 1.5073646516796658e-06, + "loss": 0.7992, + "step": 21230 + }, + { + "epoch": 3.77991452991453, + "grad_norm": 0.8666152954101562, + "learning_rate": 1.5049444148404813e-06, + "loss": 0.7858, + "step": 21231 + }, + { + "epoch": 3.7800925925925926, + "grad_norm": 1.147723913192749, + "learning_rate": 1.5025261078073005e-06, + "loss": 0.7486, + "step": 21232 + }, + { + "epoch": 3.7802706552706553, + "grad_norm": 0.996700644493103, + "learning_rate": 1.500109730627519e-06, + "loss": 0.8679, + "step": 21233 + }, + { + "epoch": 3.780448717948718, + "grad_norm": 1.0153658390045166, + "learning_rate": 1.4976952833484548e-06, + "loss": 0.6359, + "step": 21234 + }, + { + "epoch": 3.780626780626781, + "grad_norm": 0.9375406503677368, + "learning_rate": 1.4952827660174362e-06, + "loss": 0.8339, + "step": 21235 + }, + { + "epoch": 3.7808048433048436, + "grad_norm": 0.8846505284309387, + "learning_rate": 1.4928721786817256e-06, + "loss": 0.7411, + "step": 21236 + }, + { + "epoch": 3.780982905982906, + "grad_norm": 0.9112651348114014, + "learning_rate": 1.4904635213885631e-06, + "loss": 0.8434, + "step": 21237 + }, + { + "epoch": 3.7811609686609686, + "grad_norm": 1.0657570362091064, + "learning_rate": 1.488056794185122e-06, + "loss": 0.871, + "step": 21238 + }, + { + "epoch": 3.7813390313390314, + "grad_norm": 0.966810941696167, + "learning_rate": 1.4856519971185756e-06, + "loss": 0.8442, + "step": 21239 + }, + { + "epoch": 3.781517094017094, + "grad_norm": 0.9643548727035522, + "learning_rate": 1.4832491302360196e-06, + "loss": 0.7939, + "step": 21240 + }, + { + "epoch": 3.7816951566951564, + "grad_norm": 0.9209652543067932, + "learning_rate": 1.4808481935845608e-06, + "loss": 0.8603, + "step": 21241 + }, + { + "epoch": 3.781873219373219, + "grad_norm": 0.9699517488479614, + "learning_rate": 1.478449187211206e-06, + "loss": 0.7318, + "step": 21242 + }, + { + "epoch": 3.782051282051282, + "grad_norm": 0.9576825499534607, + "learning_rate": 
1.4760521111629954e-06, + "loss": 0.8847, + "step": 21243 + }, + { + "epoch": 3.7822293447293447, + "grad_norm": 1.0206605195999146, + "learning_rate": 1.4736569654868803e-06, + "loss": 0.7726, + "step": 21244 + }, + { + "epoch": 3.7824074074074074, + "grad_norm": 1.0139795541763306, + "learning_rate": 1.4712637502297789e-06, + "loss": 0.7702, + "step": 21245 + }, + { + "epoch": 3.78258547008547, + "grad_norm": 0.9946370124816895, + "learning_rate": 1.4688724654385867e-06, + "loss": 0.853, + "step": 21246 + }, + { + "epoch": 3.782763532763533, + "grad_norm": 1.069388508796692, + "learning_rate": 1.4664831111601552e-06, + "loss": 0.8966, + "step": 21247 + }, + { + "epoch": 3.7829415954415957, + "grad_norm": 0.8335608243942261, + "learning_rate": 1.4640956874413137e-06, + "loss": 0.6154, + "step": 21248 + }, + { + "epoch": 3.783119658119658, + "grad_norm": 1.035109043121338, + "learning_rate": 1.4617101943288136e-06, + "loss": 0.7114, + "step": 21249 + }, + { + "epoch": 3.7832977207977208, + "grad_norm": 0.8770612478256226, + "learning_rate": 1.4593266318694176e-06, + "loss": 0.7752, + "step": 21250 + }, + { + "epoch": 3.7834757834757835, + "grad_norm": 0.9383000731468201, + "learning_rate": 1.4569450001098106e-06, + "loss": 0.8335, + "step": 21251 + }, + { + "epoch": 3.7836538461538463, + "grad_norm": 0.9217562675476074, + "learning_rate": 1.4545652990966662e-06, + "loss": 0.6357, + "step": 21252 + }, + { + "epoch": 3.7838319088319086, + "grad_norm": 0.9792481064796448, + "learning_rate": 1.4521875288765918e-06, + "loss": 0.855, + "step": 21253 + }, + { + "epoch": 3.7840099715099713, + "grad_norm": 0.9805694222450256, + "learning_rate": 1.4498116894962055e-06, + "loss": 0.6227, + "step": 21254 + }, + { + "epoch": 3.784188034188034, + "grad_norm": 0.8412477374076843, + "learning_rate": 1.4474377810020257e-06, + "loss": 0.7493, + "step": 21255 + }, + { + "epoch": 3.784366096866097, + "grad_norm": 1.0370614528656006, + "learning_rate": 1.4450658034405817e-06, + "loss": 
0.7173, + "step": 21256 + }, + { + "epoch": 3.7845441595441596, + "grad_norm": 0.8881233930587769, + "learning_rate": 1.4426957568583366e-06, + "loss": 0.6639, + "step": 21257 + }, + { + "epoch": 3.7847222222222223, + "grad_norm": 1.362695574760437, + "learning_rate": 1.440327641301742e-06, + "loss": 0.67, + "step": 21258 + }, + { + "epoch": 3.784900284900285, + "grad_norm": 0.9417729377746582, + "learning_rate": 1.4379614568171718e-06, + "loss": 0.841, + "step": 21259 + }, + { + "epoch": 3.785078347578348, + "grad_norm": 0.8955726027488708, + "learning_rate": 1.4355972034510224e-06, + "loss": 0.6153, + "step": 21260 + }, + { + "epoch": 3.78525641025641, + "grad_norm": 0.9870222210884094, + "learning_rate": 1.43323488124959e-06, + "loss": 0.9167, + "step": 21261 + }, + { + "epoch": 3.785434472934473, + "grad_norm": 0.9800770878791809, + "learning_rate": 1.4308744902591598e-06, + "loss": 0.7841, + "step": 21262 + }, + { + "epoch": 3.7856125356125356, + "grad_norm": 0.8268224000930786, + "learning_rate": 1.4285160305259836e-06, + "loss": 0.6672, + "step": 21263 + }, + { + "epoch": 3.7857905982905984, + "grad_norm": 0.9754951000213623, + "learning_rate": 1.4261595020962692e-06, + "loss": 0.8167, + "step": 21264 + }, + { + "epoch": 3.7859686609686607, + "grad_norm": 0.9257098436355591, + "learning_rate": 1.4238049050162018e-06, + "loss": 0.5471, + "step": 21265 + }, + { + "epoch": 3.7861467236467234, + "grad_norm": 0.8276677131652832, + "learning_rate": 1.4214522393318886e-06, + "loss": 0.6344, + "step": 21266 + }, + { + "epoch": 3.786324786324786, + "grad_norm": 0.8694467544555664, + "learning_rate": 1.4191015050894485e-06, + "loss": 0.7575, + "step": 21267 + }, + { + "epoch": 3.786502849002849, + "grad_norm": 0.917188823223114, + "learning_rate": 1.4167527023349336e-06, + "loss": 0.8464, + "step": 21268 + }, + { + "epoch": 3.7866809116809117, + "grad_norm": 1.0555243492126465, + "learning_rate": 1.4144058311143626e-06, + "loss": 0.876, + "step": 21269 + }, + { + 
"epoch": 3.7868589743589745, + "grad_norm": 0.8941957354545593, + "learning_rate": 1.4120608914737099e-06, + "loss": 0.7512, + "step": 21270 + }, + { + "epoch": 3.787037037037037, + "grad_norm": 1.2200781106948853, + "learning_rate": 1.4097178834589165e-06, + "loss": 0.8148, + "step": 21271 + }, + { + "epoch": 3.7872150997151, + "grad_norm": 1.1504127979278564, + "learning_rate": 1.4073768071159011e-06, + "loss": 0.7324, + "step": 21272 + }, + { + "epoch": 3.7873931623931623, + "grad_norm": 0.9139384031295776, + "learning_rate": 1.4050376624905493e-06, + "loss": 0.7327, + "step": 21273 + }, + { + "epoch": 3.787571225071225, + "grad_norm": 0.9477390050888062, + "learning_rate": 1.4027004496286466e-06, + "loss": 0.6668, + "step": 21274 + }, + { + "epoch": 3.7877492877492878, + "grad_norm": 1.0441744327545166, + "learning_rate": 1.400365168576012e-06, + "loss": 0.5605, + "step": 21275 + }, + { + "epoch": 3.7879273504273505, + "grad_norm": 0.983304500579834, + "learning_rate": 1.3980318193784091e-06, + "loss": 0.7046, + "step": 21276 + }, + { + "epoch": 3.7881054131054133, + "grad_norm": 0.8792953491210938, + "learning_rate": 1.3957004020815456e-06, + "loss": 0.6617, + "step": 21277 + }, + { + "epoch": 3.7882834757834756, + "grad_norm": 1.0233715772628784, + "learning_rate": 1.393370916731096e-06, + "loss": 0.7563, + "step": 21278 + }, + { + "epoch": 3.7884615384615383, + "grad_norm": 0.9095731377601624, + "learning_rate": 1.3910433633727127e-06, + "loss": 0.7283, + "step": 21279 + }, + { + "epoch": 3.788639601139601, + "grad_norm": 1.061542272567749, + "learning_rate": 1.3887177420519815e-06, + "loss": 0.7871, + "step": 21280 + }, + { + "epoch": 3.788817663817664, + "grad_norm": 0.9254636764526367, + "learning_rate": 1.386394052814488e-06, + "loss": 0.713, + "step": 21281 + }, + { + "epoch": 3.7889957264957266, + "grad_norm": 0.9656744003295898, + "learning_rate": 1.3840722957057406e-06, + "loss": 0.639, + "step": 21282 + }, + { + "epoch": 3.7891737891737893, + 
"grad_norm": 0.8546678423881531, + "learning_rate": 1.381752470771247e-06, + "loss": 0.5376, + "step": 21283 + }, + { + "epoch": 3.789351851851852, + "grad_norm": 0.923988401889801, + "learning_rate": 1.3794345780564488e-06, + "loss": 0.832, + "step": 21284 + }, + { + "epoch": 3.7895299145299144, + "grad_norm": 0.9867568612098694, + "learning_rate": 1.3771186176067653e-06, + "loss": 0.6947, + "step": 21285 + }, + { + "epoch": 3.789707977207977, + "grad_norm": 0.832467794418335, + "learning_rate": 1.374804589467571e-06, + "loss": 0.7129, + "step": 21286 + }, + { + "epoch": 3.78988603988604, + "grad_norm": 1.0230439901351929, + "learning_rate": 1.3724924936841965e-06, + "loss": 0.6923, + "step": 21287 + }, + { + "epoch": 3.7900641025641026, + "grad_norm": 0.8498647809028625, + "learning_rate": 1.3701823303019722e-06, + "loss": 0.667, + "step": 21288 + }, + { + "epoch": 3.7902421652421654, + "grad_norm": 0.9631175994873047, + "learning_rate": 1.3678740993661177e-06, + "loss": 0.6332, + "step": 21289 + }, + { + "epoch": 3.7904202279202277, + "grad_norm": 0.8047894835472107, + "learning_rate": 1.3655678009218965e-06, + "loss": 0.6037, + "step": 21290 + }, + { + "epoch": 3.7905982905982905, + "grad_norm": 1.03085196018219, + "learning_rate": 1.3632634350144614e-06, + "loss": 0.7858, + "step": 21291 + }, + { + "epoch": 3.790776353276353, + "grad_norm": 0.9661818146705627, + "learning_rate": 1.3609610016889873e-06, + "loss": 0.887, + "step": 21292 + }, + { + "epoch": 3.790954415954416, + "grad_norm": 0.8671273589134216, + "learning_rate": 1.3586605009905829e-06, + "loss": 0.8124, + "step": 21293 + }, + { + "epoch": 3.7911324786324787, + "grad_norm": 0.8627972602844238, + "learning_rate": 1.3563619329643119e-06, + "loss": 0.6146, + "step": 21294 + }, + { + "epoch": 3.7913105413105415, + "grad_norm": 1.0023887157440186, + "learning_rate": 1.3540652976552159e-06, + "loss": 0.7717, + "step": 21295 + }, + { + "epoch": 3.791488603988604, + "grad_norm": 0.8316352963447571, + 
"learning_rate": 1.3517705951082926e-06, + "loss": 0.6447, + "step": 21296 + }, + { + "epoch": 3.7916666666666665, + "grad_norm": 0.9875814914703369, + "learning_rate": 1.3494778253684836e-06, + "loss": 0.7261, + "step": 21297 + }, + { + "epoch": 3.7918447293447293, + "grad_norm": 0.8049336075782776, + "learning_rate": 1.347186988480753e-06, + "loss": 0.4757, + "step": 21298 + }, + { + "epoch": 3.792022792022792, + "grad_norm": 1.1316081285476685, + "learning_rate": 1.3448980844899428e-06, + "loss": 0.8487, + "step": 21299 + }, + { + "epoch": 3.7922008547008548, + "grad_norm": 1.0074068307876587, + "learning_rate": 1.3426111134409281e-06, + "loss": 0.773, + "step": 21300 + }, + { + "epoch": 3.7923789173789175, + "grad_norm": 1.0519719123840332, + "learning_rate": 1.3403260753784951e-06, + "loss": 0.7062, + "step": 21301 + }, + { + "epoch": 3.79255698005698, + "grad_norm": 0.8730934858322144, + "learning_rate": 1.3380429703474306e-06, + "loss": 0.6868, + "step": 21302 + }, + { + "epoch": 3.7927350427350426, + "grad_norm": 0.992592453956604, + "learning_rate": 1.3357617983924652e-06, + "loss": 0.7468, + "step": 21303 + }, + { + "epoch": 3.7929131054131053, + "grad_norm": 1.0449074506759644, + "learning_rate": 1.3334825595582745e-06, + "loss": 0.7136, + "step": 21304 + }, + { + "epoch": 3.793091168091168, + "grad_norm": 0.8978158831596375, + "learning_rate": 1.3312052538895559e-06, + "loss": 0.6355, + "step": 21305 + }, + { + "epoch": 3.793269230769231, + "grad_norm": 1.0906991958618164, + "learning_rate": 1.3289298814308848e-06, + "loss": 0.7947, + "step": 21306 + }, + { + "epoch": 3.7934472934472936, + "grad_norm": 1.0528931617736816, + "learning_rate": 1.32665644222687e-06, + "loss": 0.6711, + "step": 21307 + }, + { + "epoch": 3.7936253561253563, + "grad_norm": 1.055999517440796, + "learning_rate": 1.324384936322043e-06, + "loss": 0.7513, + "step": 21308 + }, + { + "epoch": 3.7938034188034186, + "grad_norm": 1.0277466773986816, + "learning_rate": 
1.3221153637609119e-06, + "loss": 0.7602, + "step": 21309 + }, + { + "epoch": 3.7939814814814814, + "grad_norm": 0.9280304908752441, + "learning_rate": 1.3198477245879414e-06, + "loss": 0.7614, + "step": 21310 + }, + { + "epoch": 3.794159544159544, + "grad_norm": 1.0405571460723877, + "learning_rate": 1.317582018847574e-06, + "loss": 0.8271, + "step": 21311 + }, + { + "epoch": 3.794337606837607, + "grad_norm": 0.9499875903129578, + "learning_rate": 1.315318246584174e-06, + "loss": 0.804, + "step": 21312 + }, + { + "epoch": 3.7945156695156697, + "grad_norm": 0.9112995266914368, + "learning_rate": 1.3130564078421391e-06, + "loss": 0.6156, + "step": 21313 + }, + { + "epoch": 3.794693732193732, + "grad_norm": 0.9878358244895935, + "learning_rate": 1.3107965026657343e-06, + "loss": 0.7968, + "step": 21314 + }, + { + "epoch": 3.7948717948717947, + "grad_norm": 0.9207808971405029, + "learning_rate": 1.3085385310992904e-06, + "loss": 0.7832, + "step": 21315 + }, + { + "epoch": 3.7950498575498575, + "grad_norm": 0.8998355865478516, + "learning_rate": 1.3062824931869944e-06, + "loss": 0.6685, + "step": 21316 + }, + { + "epoch": 3.79522792022792, + "grad_norm": 0.935667872428894, + "learning_rate": 1.3040283889730886e-06, + "loss": 0.5034, + "step": 21317 + }, + { + "epoch": 3.795405982905983, + "grad_norm": 0.9035351872444153, + "learning_rate": 1.3017762185017157e-06, + "loss": 0.7745, + "step": 21318 + }, + { + "epoch": 3.7955840455840457, + "grad_norm": 1.0024009943008423, + "learning_rate": 1.299525981817018e-06, + "loss": 0.7638, + "step": 21319 + }, + { + "epoch": 3.7957621082621085, + "grad_norm": 0.9762584567070007, + "learning_rate": 1.2972776789630714e-06, + "loss": 0.9055, + "step": 21320 + }, + { + "epoch": 3.7959401709401708, + "grad_norm": 0.9953422546386719, + "learning_rate": 1.2950313099839296e-06, + "loss": 0.8011, + "step": 21321 + }, + { + "epoch": 3.7961182336182335, + "grad_norm": 0.8137484788894653, + "learning_rate": 1.2927868749236017e-06, + "loss": 
0.5586, + "step": 21322 + }, + { + "epoch": 3.7962962962962963, + "grad_norm": 0.8958911299705505, + "learning_rate": 1.2905443738260747e-06, + "loss": 0.6886, + "step": 21323 + }, + { + "epoch": 3.796474358974359, + "grad_norm": 1.1359935998916626, + "learning_rate": 1.2883038067352916e-06, + "loss": 0.8033, + "step": 21324 + }, + { + "epoch": 3.796652421652422, + "grad_norm": 0.8494634628295898, + "learning_rate": 1.286065173695128e-06, + "loss": 0.71, + "step": 21325 + }, + { + "epoch": 3.796830484330484, + "grad_norm": 0.9622138738632202, + "learning_rate": 1.28382847474946e-06, + "loss": 0.6438, + "step": 21326 + }, + { + "epoch": 3.797008547008547, + "grad_norm": 0.8593050241470337, + "learning_rate": 1.2815937099421083e-06, + "loss": 0.7284, + "step": 21327 + }, + { + "epoch": 3.7971866096866096, + "grad_norm": 0.9425849318504333, + "learning_rate": 1.2793608793168488e-06, + "loss": 0.645, + "step": 21328 + }, + { + "epoch": 3.7973646723646723, + "grad_norm": 1.0602357387542725, + "learning_rate": 1.2771299829174465e-06, + "loss": 0.6953, + "step": 21329 + }, + { + "epoch": 3.797542735042735, + "grad_norm": 1.0182085037231445, + "learning_rate": 1.274901020787611e-06, + "loss": 0.851, + "step": 21330 + }, + { + "epoch": 3.797720797720798, + "grad_norm": 0.9231748580932617, + "learning_rate": 1.2726739929709852e-06, + "loss": 0.7818, + "step": 21331 + }, + { + "epoch": 3.7978988603988606, + "grad_norm": 0.8449130058288574, + "learning_rate": 1.2704488995112451e-06, + "loss": 0.6666, + "step": 21332 + }, + { + "epoch": 3.7980769230769234, + "grad_norm": 0.9367035627365112, + "learning_rate": 1.268225740451956e-06, + "loss": 0.7047, + "step": 21333 + }, + { + "epoch": 3.7982549857549857, + "grad_norm": 1.005717158317566, + "learning_rate": 1.2660045158366828e-06, + "loss": 0.7707, + "step": 21334 + }, + { + "epoch": 3.7984330484330484, + "grad_norm": 0.9248343706130981, + "learning_rate": 1.2637852257089578e-06, + "loss": 0.6067, + "step": 21335 + }, + { + 
"epoch": 3.798611111111111, + "grad_norm": 0.9488750696182251, + "learning_rate": 1.261567870112257e-06, + "loss": 0.7695, + "step": 21336 + }, + { + "epoch": 3.798789173789174, + "grad_norm": 0.9528114795684814, + "learning_rate": 1.259352449090001e-06, + "loss": 0.6908, + "step": 21337 + }, + { + "epoch": 3.798967236467236, + "grad_norm": 0.886128306388855, + "learning_rate": 1.2571389626856445e-06, + "loss": 0.5891, + "step": 21338 + }, + { + "epoch": 3.799145299145299, + "grad_norm": 0.9369097948074341, + "learning_rate": 1.254927410942508e-06, + "loss": 0.9413, + "step": 21339 + }, + { + "epoch": 3.7993233618233617, + "grad_norm": 0.9909436106681824, + "learning_rate": 1.2527177939039681e-06, + "loss": 0.8069, + "step": 21340 + }, + { + "epoch": 3.7995014245014245, + "grad_norm": 0.9349713921546936, + "learning_rate": 1.2505101116132678e-06, + "loss": 0.7218, + "step": 21341 + }, + { + "epoch": 3.7996794871794872, + "grad_norm": 0.912471354007721, + "learning_rate": 1.248304364113706e-06, + "loss": 0.6264, + "step": 21342 + }, + { + "epoch": 3.79985754985755, + "grad_norm": 0.9769580960273743, + "learning_rate": 1.2461005514484703e-06, + "loss": 0.6653, + "step": 21343 + }, + { + "epoch": 3.8000356125356127, + "grad_norm": 0.9078145027160645, + "learning_rate": 1.2438986736607484e-06, + "loss": 0.8129, + "step": 21344 + }, + { + "epoch": 3.8002136752136755, + "grad_norm": 0.8971534967422485, + "learning_rate": 1.2416987307937056e-06, + "loss": 0.8267, + "step": 21345 + }, + { + "epoch": 3.800391737891738, + "grad_norm": 1.091088891029358, + "learning_rate": 1.2395007228903966e-06, + "loss": 0.74, + "step": 21346 + }, + { + "epoch": 3.8005698005698005, + "grad_norm": 0.8109734058380127, + "learning_rate": 1.2373046499939311e-06, + "loss": 0.576, + "step": 21347 + }, + { + "epoch": 3.8007478632478633, + "grad_norm": 0.9560970067977905, + "learning_rate": 1.2351105121473084e-06, + "loss": 0.7023, + "step": 21348 + }, + { + "epoch": 3.800925925925926, + 
"grad_norm": 0.8324378132820129, + "learning_rate": 1.2329183093935382e-06, + "loss": 0.4152, + "step": 21349 + }, + { + "epoch": 3.8011039886039883, + "grad_norm": 0.8839790225028992, + "learning_rate": 1.230728041775553e-06, + "loss": 0.7523, + "step": 21350 + }, + { + "epoch": 3.801282051282051, + "grad_norm": 0.9106653928756714, + "learning_rate": 1.228539709336285e-06, + "loss": 0.8959, + "step": 21351 + }, + { + "epoch": 3.801460113960114, + "grad_norm": 1.0674225091934204, + "learning_rate": 1.2263533121186e-06, + "loss": 0.9069, + "step": 21352 + }, + { + "epoch": 3.8016381766381766, + "grad_norm": 0.904495358467102, + "learning_rate": 1.2241688501653414e-06, + "loss": 0.7724, + "step": 21353 + }, + { + "epoch": 3.8018162393162394, + "grad_norm": 0.8402371406555176, + "learning_rate": 1.2219863235192864e-06, + "loss": 0.5943, + "step": 21354 + }, + { + "epoch": 3.801994301994302, + "grad_norm": 1.0665391683578491, + "learning_rate": 1.2198057322232448e-06, + "loss": 0.9477, + "step": 21355 + }, + { + "epoch": 3.802172364672365, + "grad_norm": 0.8307034969329834, + "learning_rate": 1.2176270763198828e-06, + "loss": 0.559, + "step": 21356 + }, + { + "epoch": 3.8023504273504276, + "grad_norm": 1.033492922782898, + "learning_rate": 1.2154503558519325e-06, + "loss": 0.9119, + "step": 21357 + }, + { + "epoch": 3.80252849002849, + "grad_norm": 0.9070342779159546, + "learning_rate": 1.213275570862027e-06, + "loss": 0.7226, + "step": 21358 + }, + { + "epoch": 3.8027065527065527, + "grad_norm": 0.9109193682670593, + "learning_rate": 1.2111027213927651e-06, + "loss": 0.8976, + "step": 21359 + }, + { + "epoch": 3.8028846153846154, + "grad_norm": 0.9902597665786743, + "learning_rate": 1.208931807486735e-06, + "loss": 0.6105, + "step": 21360 + }, + { + "epoch": 3.803062678062678, + "grad_norm": 0.8371327519416809, + "learning_rate": 1.2067628291864696e-06, + "loss": 0.7772, + "step": 21361 + }, + { + "epoch": 3.8032407407407405, + "grad_norm": 0.8901026248931885, + 
"learning_rate": 1.204595786534457e-06, + "loss": 0.6908, + "step": 21362 + }, + { + "epoch": 3.8034188034188032, + "grad_norm": 0.9610844850540161, + "learning_rate": 1.2024306795731522e-06, + "loss": 0.7674, + "step": 21363 + }, + { + "epoch": 3.803596866096866, + "grad_norm": 1.0326725244522095, + "learning_rate": 1.2002675083449877e-06, + "loss": 0.8886, + "step": 21364 + }, + { + "epoch": 3.8037749287749287, + "grad_norm": 0.9106666445732117, + "learning_rate": 1.198106272892352e-06, + "loss": 0.7843, + "step": 21365 + }, + { + "epoch": 3.8039529914529915, + "grad_norm": 1.061942458152771, + "learning_rate": 1.195946973257578e-06, + "loss": 0.6653, + "step": 21366 + }, + { + "epoch": 3.8041310541310542, + "grad_norm": 0.9274312257766724, + "learning_rate": 1.1937896094829759e-06, + "loss": 0.7761, + "step": 21367 + }, + { + "epoch": 3.804309116809117, + "grad_norm": 0.9896464943885803, + "learning_rate": 1.1916341816108124e-06, + "loss": 0.8284, + "step": 21368 + }, + { + "epoch": 3.8044871794871797, + "grad_norm": 1.078537106513977, + "learning_rate": 1.1894806896833088e-06, + "loss": 0.851, + "step": 21369 + }, + { + "epoch": 3.804665242165242, + "grad_norm": 0.9805821180343628, + "learning_rate": 1.1873291337426873e-06, + "loss": 0.8921, + "step": 21370 + }, + { + "epoch": 3.804843304843305, + "grad_norm": 0.9240527153015137, + "learning_rate": 1.1851795138310696e-06, + "loss": 0.695, + "step": 21371 + }, + { + "epoch": 3.8050213675213675, + "grad_norm": 1.124892234802246, + "learning_rate": 1.1830318299905996e-06, + "loss": 0.7725, + "step": 21372 + }, + { + "epoch": 3.8051994301994303, + "grad_norm": 1.07109534740448, + "learning_rate": 1.180886082263355e-06, + "loss": 0.7807, + "step": 21373 + }, + { + "epoch": 3.8053774928774926, + "grad_norm": 0.8756905198097229, + "learning_rate": 1.1787422706913577e-06, + "loss": 0.686, + "step": 21374 + }, + { + "epoch": 3.8055555555555554, + "grad_norm": 0.8990702629089355, + "learning_rate": 
1.1766003953166183e-06, + "loss": 0.7694, + "step": 21375 + }, + { + "epoch": 3.805733618233618, + "grad_norm": 1.0288631916046143, + "learning_rate": 1.1744604561811146e-06, + "loss": 0.789, + "step": 21376 + }, + { + "epoch": 3.805911680911681, + "grad_norm": 1.0674315690994263, + "learning_rate": 1.1723224533267686e-06, + "loss": 0.662, + "step": 21377 + }, + { + "epoch": 3.8060897435897436, + "grad_norm": 0.9958518743515015, + "learning_rate": 1.1701863867954577e-06, + "loss": 0.749, + "step": 21378 + }, + { + "epoch": 3.8062678062678064, + "grad_norm": 1.0079996585845947, + "learning_rate": 1.1680522566290375e-06, + "loss": 0.77, + "step": 21379 + }, + { + "epoch": 3.806445868945869, + "grad_norm": 0.9467402100563049, + "learning_rate": 1.1659200628693523e-06, + "loss": 0.6779, + "step": 21380 + }, + { + "epoch": 3.806623931623932, + "grad_norm": 0.8622475862503052, + "learning_rate": 1.1637898055581354e-06, + "loss": 0.7096, + "step": 21381 + }, + { + "epoch": 3.806801994301994, + "grad_norm": 0.9366708397865295, + "learning_rate": 1.1616614847371421e-06, + "loss": 0.7709, + "step": 21382 + }, + { + "epoch": 3.806980056980057, + "grad_norm": 1.184937834739685, + "learning_rate": 1.1595351004480837e-06, + "loss": 0.5492, + "step": 21383 + }, + { + "epoch": 3.8071581196581197, + "grad_norm": 1.089446783065796, + "learning_rate": 1.1574106527325933e-06, + "loss": 0.742, + "step": 21384 + }, + { + "epoch": 3.8073361823361824, + "grad_norm": 1.050498366355896, + "learning_rate": 1.155288141632338e-06, + "loss": 0.9068, + "step": 21385 + }, + { + "epoch": 3.807514245014245, + "grad_norm": 0.9292911887168884, + "learning_rate": 1.1531675671888619e-06, + "loss": 0.7896, + "step": 21386 + }, + { + "epoch": 3.8076923076923075, + "grad_norm": 1.1359772682189941, + "learning_rate": 1.1510489294437432e-06, + "loss": 0.8933, + "step": 21387 + }, + { + "epoch": 3.8078703703703702, + "grad_norm": 0.8609947562217712, + "learning_rate": 1.1489322284384596e-06, + "loss": 
0.5453, + "step": 21388 + }, + { + "epoch": 3.808048433048433, + "grad_norm": 0.9479160308837891, + "learning_rate": 1.1468174642145223e-06, + "loss": 0.8166, + "step": 21389 + }, + { + "epoch": 3.8082264957264957, + "grad_norm": 0.9272875189781189, + "learning_rate": 1.144704636813343e-06, + "loss": 0.7493, + "step": 21390 + }, + { + "epoch": 3.8084045584045585, + "grad_norm": 1.0059126615524292, + "learning_rate": 1.1425937462763215e-06, + "loss": 0.8498, + "step": 21391 + }, + { + "epoch": 3.8085826210826212, + "grad_norm": 0.9803183674812317, + "learning_rate": 1.1404847926448136e-06, + "loss": 0.7432, + "step": 21392 + }, + { + "epoch": 3.808760683760684, + "grad_norm": 0.8550325632095337, + "learning_rate": 1.1383777759601533e-06, + "loss": 0.7143, + "step": 21393 + }, + { + "epoch": 3.8089387464387463, + "grad_norm": 0.9402148127555847, + "learning_rate": 1.1362726962635961e-06, + "loss": 0.6031, + "step": 21394 + }, + { + "epoch": 3.809116809116809, + "grad_norm": 1.1336814165115356, + "learning_rate": 1.1341695535964203e-06, + "loss": 0.8163, + "step": 21395 + }, + { + "epoch": 3.809294871794872, + "grad_norm": 0.8783149719238281, + "learning_rate": 1.132068347999804e-06, + "loss": 0.764, + "step": 21396 + }, + { + "epoch": 3.8094729344729346, + "grad_norm": 0.9727262854576111, + "learning_rate": 1.1299690795149365e-06, + "loss": 0.7521, + "step": 21397 + }, + { + "epoch": 3.8096509971509973, + "grad_norm": 0.9679130911827087, + "learning_rate": 1.1278717481829403e-06, + "loss": 0.6411, + "step": 21398 + }, + { + "epoch": 3.8098290598290596, + "grad_norm": 1.0554420948028564, + "learning_rate": 1.125776354044905e-06, + "loss": 0.738, + "step": 21399 + }, + { + "epoch": 3.8100071225071224, + "grad_norm": 1.0295968055725098, + "learning_rate": 1.1236828971418867e-06, + "loss": 0.6789, + "step": 21400 + }, + { + "epoch": 3.810185185185185, + "grad_norm": 1.2897826433181763, + "learning_rate": 1.1215913775149079e-06, + "loss": 0.8072, + "step": 21401 + }, + { 
+ "epoch": 3.810363247863248, + "grad_norm": 0.8948628902435303, + "learning_rate": 1.119501795204947e-06, + "loss": 0.5636, + "step": 21402 + }, + { + "epoch": 3.8105413105413106, + "grad_norm": 1.040513277053833, + "learning_rate": 1.1174141502529268e-06, + "loss": 0.9099, + "step": 21403 + }, + { + "epoch": 3.8107193732193734, + "grad_norm": 1.0494636297225952, + "learning_rate": 1.1153284426997811e-06, + "loss": 0.9003, + "step": 21404 + }, + { + "epoch": 3.810897435897436, + "grad_norm": 0.9162052273750305, + "learning_rate": 1.1132446725863554e-06, + "loss": 0.7323, + "step": 21405 + }, + { + "epoch": 3.8110754985754984, + "grad_norm": 1.1157633066177368, + "learning_rate": 1.1111628399534724e-06, + "loss": 0.731, + "step": 21406 + }, + { + "epoch": 3.811253561253561, + "grad_norm": 0.9189321994781494, + "learning_rate": 1.1090829448419438e-06, + "loss": 0.8703, + "step": 21407 + }, + { + "epoch": 3.811431623931624, + "grad_norm": 1.1154193878173828, + "learning_rate": 1.1070049872925037e-06, + "loss": 0.8683, + "step": 21408 + }, + { + "epoch": 3.8116096866096867, + "grad_norm": 0.8818658590316772, + "learning_rate": 1.104928967345853e-06, + "loss": 0.6899, + "step": 21409 + }, + { + "epoch": 3.8117877492877494, + "grad_norm": 1.0003615617752075, + "learning_rate": 1.1028548850427034e-06, + "loss": 0.763, + "step": 21410 + }, + { + "epoch": 3.8119658119658117, + "grad_norm": 0.9913126230239868, + "learning_rate": 1.100782740423656e-06, + "loss": 1.0265, + "step": 21411 + }, + { + "epoch": 3.8121438746438745, + "grad_norm": 0.982927143573761, + "learning_rate": 1.0987125335293448e-06, + "loss": 0.7146, + "step": 21412 + }, + { + "epoch": 3.8123219373219372, + "grad_norm": 1.0777631998062134, + "learning_rate": 1.0966442644002928e-06, + "loss": 0.8946, + "step": 21413 + }, + { + "epoch": 3.8125, + "grad_norm": 0.885434627532959, + "learning_rate": 1.0945779330770457e-06, + "loss": 0.5987, + "step": 21414 + }, + { + "epoch": 3.8126780626780628, + "grad_norm": 
1.2291704416275024, + "learning_rate": 1.092513539600093e-06, + "loss": 0.7442, + "step": 21415 + }, + { + "epoch": 3.8128561253561255, + "grad_norm": 0.976157009601593, + "learning_rate": 1.0904510840098692e-06, + "loss": 0.9357, + "step": 21416 + }, + { + "epoch": 3.8130341880341883, + "grad_norm": 1.0253404378890991, + "learning_rate": 1.0883905663467974e-06, + "loss": 0.8082, + "step": 21417 + }, + { + "epoch": 3.8132122507122506, + "grad_norm": 0.9216427206993103, + "learning_rate": 1.0863319866512346e-06, + "loss": 0.7686, + "step": 21418 + }, + { + "epoch": 3.8133903133903133, + "grad_norm": 0.903035581111908, + "learning_rate": 1.0842753449635147e-06, + "loss": 0.8007, + "step": 21419 + }, + { + "epoch": 3.813568376068376, + "grad_norm": 0.8471698760986328, + "learning_rate": 1.0822206413239499e-06, + "loss": 0.5844, + "step": 21420 + }, + { + "epoch": 3.813746438746439, + "grad_norm": 0.9668221473693848, + "learning_rate": 1.0801678757727862e-06, + "loss": 0.8883, + "step": 21421 + }, + { + "epoch": 3.8139245014245016, + "grad_norm": 1.026482343673706, + "learning_rate": 1.0781170483502355e-06, + "loss": 0.7599, + "step": 21422 + }, + { + "epoch": 3.814102564102564, + "grad_norm": 0.9170508980751038, + "learning_rate": 1.0760681590964995e-06, + "loss": 0.6353, + "step": 21423 + }, + { + "epoch": 3.8142806267806266, + "grad_norm": 0.9856457710266113, + "learning_rate": 1.074021208051712e-06, + "loss": 0.7551, + "step": 21424 + }, + { + "epoch": 3.8144586894586894, + "grad_norm": 0.8754544854164124, + "learning_rate": 1.0719761952559748e-06, + "loss": 0.577, + "step": 21425 + }, + { + "epoch": 3.814636752136752, + "grad_norm": 0.867774248123169, + "learning_rate": 1.0699331207493556e-06, + "loss": 0.6398, + "step": 21426 + }, + { + "epoch": 3.814814814814815, + "grad_norm": 1.0027731657028198, + "learning_rate": 1.0678919845718892e-06, + "loss": 0.8892, + "step": 21427 + }, + { + "epoch": 3.8149928774928776, + "grad_norm": 0.8579356670379639, + 
"learning_rate": 1.065852786763566e-06, + "loss": 0.8224, + "step": 21428 + }, + { + "epoch": 3.8151709401709404, + "grad_norm": 1.1910187005996704, + "learning_rate": 1.0638155273643425e-06, + "loss": 0.9737, + "step": 21429 + }, + { + "epoch": 3.8153490028490027, + "grad_norm": 0.9423369765281677, + "learning_rate": 1.0617802064141313e-06, + "loss": 0.7792, + "step": 21430 + }, + { + "epoch": 3.8155270655270654, + "grad_norm": 1.0974822044372559, + "learning_rate": 1.0597468239528118e-06, + "loss": 0.6306, + "step": 21431 + }, + { + "epoch": 3.815705128205128, + "grad_norm": 0.8709555268287659, + "learning_rate": 1.0577153800202188e-06, + "loss": 0.6959, + "step": 21432 + }, + { + "epoch": 3.815883190883191, + "grad_norm": 0.9723543524742126, + "learning_rate": 1.0556858746561538e-06, + "loss": 0.7802, + "step": 21433 + }, + { + "epoch": 3.8160612535612537, + "grad_norm": 0.959344208240509, + "learning_rate": 1.0536583079003958e-06, + "loss": 0.924, + "step": 21434 + }, + { + "epoch": 3.816239316239316, + "grad_norm": 1.0029487609863281, + "learning_rate": 1.051632679792658e-06, + "loss": 0.9172, + "step": 21435 + }, + { + "epoch": 3.8164173789173788, + "grad_norm": 0.9423520565032959, + "learning_rate": 1.0496089903726192e-06, + "loss": 0.8802, + "step": 21436 + }, + { + "epoch": 3.8165954415954415, + "grad_norm": 0.9231133460998535, + "learning_rate": 1.047587239679959e-06, + "loss": 0.842, + "step": 21437 + }, + { + "epoch": 3.8167735042735043, + "grad_norm": 0.9937446713447571, + "learning_rate": 1.0455674277542459e-06, + "loss": 0.5957, + "step": 21438 + }, + { + "epoch": 3.816951566951567, + "grad_norm": 1.1649292707443237, + "learning_rate": 1.0435495546351036e-06, + "loss": 0.8486, + "step": 21439 + }, + { + "epoch": 3.8171296296296298, + "grad_norm": 0.8023452758789062, + "learning_rate": 1.041533620362034e-06, + "loss": 0.7446, + "step": 21440 + }, + { + "epoch": 3.8173076923076925, + "grad_norm": 0.7962468266487122, + "learning_rate": 
1.0395196249745387e-06, + "loss": 0.6481, + "step": 21441 + }, + { + "epoch": 3.817485754985755, + "grad_norm": 0.9893062114715576, + "learning_rate": 1.0375075685120972e-06, + "loss": 0.7427, + "step": 21442 + }, + { + "epoch": 3.8176638176638176, + "grad_norm": 0.9541147947311401, + "learning_rate": 1.0354974510141002e-06, + "loss": 0.7193, + "step": 21443 + }, + { + "epoch": 3.8178418803418803, + "grad_norm": 1.0776668787002563, + "learning_rate": 1.0334892725199607e-06, + "loss": 0.9832, + "step": 21444 + }, + { + "epoch": 3.818019943019943, + "grad_norm": 1.0417940616607666, + "learning_rate": 1.0314830330690139e-06, + "loss": 0.6436, + "step": 21445 + }, + { + "epoch": 3.818198005698006, + "grad_norm": 0.8967922925949097, + "learning_rate": 1.0294787327005729e-06, + "loss": 0.7823, + "step": 21446 + }, + { + "epoch": 3.818376068376068, + "grad_norm": 0.9486142992973328, + "learning_rate": 1.027476371453895e-06, + "loss": 0.8485, + "step": 21447 + }, + { + "epoch": 3.818554131054131, + "grad_norm": 1.0096664428710938, + "learning_rate": 1.0254759493682152e-06, + "loss": 0.6872, + "step": 21448 + }, + { + "epoch": 3.8187321937321936, + "grad_norm": 1.0282636880874634, + "learning_rate": 1.0234774664827474e-06, + "loss": 1.0026, + "step": 21449 + }, + { + "epoch": 3.8189102564102564, + "grad_norm": 0.975347638130188, + "learning_rate": 1.0214809228366262e-06, + "loss": 0.8115, + "step": 21450 + }, + { + "epoch": 3.819088319088319, + "grad_norm": 1.000891089439392, + "learning_rate": 1.0194863184689652e-06, + "loss": 0.9413, + "step": 21451 + }, + { + "epoch": 3.819266381766382, + "grad_norm": 0.9597340226173401, + "learning_rate": 1.0174936534188662e-06, + "loss": 0.6959, + "step": 21452 + }, + { + "epoch": 3.8194444444444446, + "grad_norm": 1.018418788909912, + "learning_rate": 1.0155029277253537e-06, + "loss": 0.7004, + "step": 21453 + }, + { + "epoch": 3.8196225071225074, + "grad_norm": 1.005155086517334, + "learning_rate": 1.0135141414274519e-06, + "loss": 
0.9321, + "step": 21454 + }, + { + "epoch": 3.8198005698005697, + "grad_norm": 0.8870786428451538, + "learning_rate": 1.0115272945641075e-06, + "loss": 0.9596, + "step": 21455 + }, + { + "epoch": 3.8199786324786325, + "grad_norm": 0.8494182229042053, + "learning_rate": 1.009542387174256e-06, + "loss": 0.7022, + "step": 21456 + }, + { + "epoch": 3.820156695156695, + "grad_norm": 1.0547996759414673, + "learning_rate": 1.0075594192967774e-06, + "loss": 0.7463, + "step": 21457 + }, + { + "epoch": 3.820334757834758, + "grad_norm": 0.8766087889671326, + "learning_rate": 1.0055783909705406e-06, + "loss": 0.5984, + "step": 21458 + }, + { + "epoch": 3.8205128205128203, + "grad_norm": 0.9966375827789307, + "learning_rate": 1.0035993022343592e-06, + "loss": 0.7422, + "step": 21459 + }, + { + "epoch": 3.820690883190883, + "grad_norm": 0.8210493922233582, + "learning_rate": 1.0016221531269909e-06, + "loss": 0.78, + "step": 21460 + }, + { + "epoch": 3.8208689458689458, + "grad_norm": 0.9703954458236694, + "learning_rate": 9.996469436871824e-07, + "loss": 0.756, + "step": 21461 + }, + { + "epoch": 3.8210470085470085, + "grad_norm": 0.9013912677764893, + "learning_rate": 9.976736739536474e-07, + "loss": 0.6361, + "step": 21462 + }, + { + "epoch": 3.8212250712250713, + "grad_norm": 0.8628489971160889, + "learning_rate": 9.957023439650327e-07, + "loss": 0.5448, + "step": 21463 + }, + { + "epoch": 3.821403133903134, + "grad_norm": 1.0915628671646118, + "learning_rate": 9.937329537599627e-07, + "loss": 0.8572, + "step": 21464 + }, + { + "epoch": 3.8215811965811968, + "grad_norm": 0.9227813482284546, + "learning_rate": 9.9176550337704e-07, + "loss": 0.669, + "step": 21465 + }, + { + "epoch": 3.8217592592592595, + "grad_norm": 0.9919458031654358, + "learning_rate": 9.897999928547784e-07, + "loss": 0.7664, + "step": 21466 + }, + { + "epoch": 3.821937321937322, + "grad_norm": 0.9025664925575256, + "learning_rate": 9.87836422231736e-07, + "loss": 0.6215, + "step": 21467 + }, + { + "epoch": 
3.8221153846153846, + "grad_norm": 1.154800295829773, + "learning_rate": 9.858747915463374e-07, + "loss": 0.9609, + "step": 21468 + }, + { + "epoch": 3.8222934472934473, + "grad_norm": 1.0049433708190918, + "learning_rate": 9.839151008370629e-07, + "loss": 0.7946, + "step": 21469 + }, + { + "epoch": 3.82247150997151, + "grad_norm": 0.9646359086036682, + "learning_rate": 9.819573501422596e-07, + "loss": 0.793, + "step": 21470 + }, + { + "epoch": 3.8226495726495724, + "grad_norm": 0.8603723049163818, + "learning_rate": 9.800015395003192e-07, + "loss": 0.5834, + "step": 21471 + }, + { + "epoch": 3.822827635327635, + "grad_norm": 0.9426991939544678, + "learning_rate": 9.780476689495554e-07, + "loss": 0.7321, + "step": 21472 + }, + { + "epoch": 3.823005698005698, + "grad_norm": 0.8795328140258789, + "learning_rate": 9.760957385282488e-07, + "loss": 0.7916, + "step": 21473 + }, + { + "epoch": 3.8231837606837606, + "grad_norm": 1.0602772235870361, + "learning_rate": 9.741457482746352e-07, + "loss": 0.7095, + "step": 21474 + }, + { + "epoch": 3.8233618233618234, + "grad_norm": 1.0050617456436157, + "learning_rate": 9.721976982269287e-07, + "loss": 0.8111, + "step": 21475 + }, + { + "epoch": 3.823539886039886, + "grad_norm": 1.0178896188735962, + "learning_rate": 9.702515884232878e-07, + "loss": 0.6689, + "step": 21476 + }, + { + "epoch": 3.823717948717949, + "grad_norm": 1.0118085145950317, + "learning_rate": 9.683074189018704e-07, + "loss": 0.8217, + "step": 21477 + }, + { + "epoch": 3.8238960113960117, + "grad_norm": 1.161960482597351, + "learning_rate": 9.663651897007353e-07, + "loss": 0.8104, + "step": 21478 + }, + { + "epoch": 3.824074074074074, + "grad_norm": 1.0038970708847046, + "learning_rate": 9.64424900857952e-07, + "loss": 0.9059, + "step": 21479 + }, + { + "epoch": 3.8242521367521367, + "grad_norm": 0.9953527450561523, + "learning_rate": 9.624865524115346e-07, + "loss": 0.8837, + "step": 21480 + }, + { + "epoch": 3.8244301994301995, + "grad_norm": 
0.9455059766769409, + "learning_rate": 9.605501443994524e-07, + "loss": 0.5752, + "step": 21481 + }, + { + "epoch": 3.824608262108262, + "grad_norm": 0.9261574149131775, + "learning_rate": 9.586156768596645e-07, + "loss": 0.8545, + "step": 21482 + }, + { + "epoch": 3.8247863247863245, + "grad_norm": 1.0939536094665527, + "learning_rate": 9.56683149830062e-07, + "loss": 0.7738, + "step": 21483 + }, + { + "epoch": 3.8249643874643873, + "grad_norm": 0.9014570116996765, + "learning_rate": 9.547525633485044e-07, + "loss": 0.7568, + "step": 21484 + }, + { + "epoch": 3.82514245014245, + "grad_norm": 0.9881065487861633, + "learning_rate": 9.528239174528165e-07, + "loss": 0.7546, + "step": 21485 + }, + { + "epoch": 3.8253205128205128, + "grad_norm": 0.94111168384552, + "learning_rate": 9.508972121808013e-07, + "loss": 0.7925, + "step": 21486 + }, + { + "epoch": 3.8254985754985755, + "grad_norm": 0.8885481953620911, + "learning_rate": 9.489724475701955e-07, + "loss": 0.6069, + "step": 21487 + }, + { + "epoch": 3.8256766381766383, + "grad_norm": 1.0248520374298096, + "learning_rate": 9.470496236587134e-07, + "loss": 0.9262, + "step": 21488 + }, + { + "epoch": 3.825854700854701, + "grad_norm": 0.9528713226318359, + "learning_rate": 9.451287404840247e-07, + "loss": 0.7226, + "step": 21489 + }, + { + "epoch": 3.826032763532764, + "grad_norm": 0.983078122138977, + "learning_rate": 9.432097980837773e-07, + "loss": 0.651, + "step": 21490 + }, + { + "epoch": 3.826210826210826, + "grad_norm": 1.0022083520889282, + "learning_rate": 9.41292796495552e-07, + "loss": 0.7214, + "step": 21491 + }, + { + "epoch": 3.826388888888889, + "grad_norm": 0.838897168636322, + "learning_rate": 9.3937773575693e-07, + "loss": 0.5388, + "step": 21492 + }, + { + "epoch": 3.8265669515669516, + "grad_norm": 0.9708632230758667, + "learning_rate": 9.374646159054146e-07, + "loss": 0.7681, + "step": 21493 + }, + { + "epoch": 3.8267450142450143, + "grad_norm": 0.9692725539207458, + "learning_rate": 
9.355534369784979e-07, + "loss": 0.8608, + "step": 21494 + }, + { + "epoch": 3.8269230769230766, + "grad_norm": 1.0063785314559937, + "learning_rate": 9.336441990136169e-07, + "loss": 0.8862, + "step": 21495 + }, + { + "epoch": 3.8271011396011394, + "grad_norm": 0.9906958937644958, + "learning_rate": 9.317369020481858e-07, + "loss": 0.7091, + "step": 21496 + }, + { + "epoch": 3.827279202279202, + "grad_norm": 0.9517685770988464, + "learning_rate": 9.298315461195751e-07, + "loss": 0.8612, + "step": 21497 + }, + { + "epoch": 3.827457264957265, + "grad_norm": 1.061146855354309, + "learning_rate": 9.279281312651211e-07, + "loss": 0.8362, + "step": 21498 + }, + { + "epoch": 3.8276353276353277, + "grad_norm": 1.1083885431289673, + "learning_rate": 9.260266575221055e-07, + "loss": 0.7663, + "step": 21499 + }, + { + "epoch": 3.8278133903133904, + "grad_norm": 0.9375349879264832, + "learning_rate": 9.241271249277872e-07, + "loss": 0.682, + "step": 21500 + }, + { + "epoch": 3.827991452991453, + "grad_norm": 1.0170559883117676, + "learning_rate": 9.222295335193809e-07, + "loss": 0.6219, + "step": 21501 + }, + { + "epoch": 3.828169515669516, + "grad_norm": 0.9686765670776367, + "learning_rate": 9.203338833340791e-07, + "loss": 0.6775, + "step": 21502 + }, + { + "epoch": 3.828347578347578, + "grad_norm": 0.8623543977737427, + "learning_rate": 9.184401744090188e-07, + "loss": 0.7056, + "step": 21503 + }, + { + "epoch": 3.828525641025641, + "grad_norm": 0.8922286629676819, + "learning_rate": 9.165484067812924e-07, + "loss": 0.8155, + "step": 21504 + }, + { + "epoch": 3.8287037037037037, + "grad_norm": 1.0283470153808594, + "learning_rate": 9.146585804879704e-07, + "loss": 0.8419, + "step": 21505 + }, + { + "epoch": 3.8288817663817665, + "grad_norm": 0.9561583399772644, + "learning_rate": 9.127706955660898e-07, + "loss": 0.8576, + "step": 21506 + }, + { + "epoch": 3.8290598290598292, + "grad_norm": 1.0040736198425293, + "learning_rate": 9.10884752052621e-07, + "loss": 0.6805, + 
"step": 21507 + }, + { + "epoch": 3.8292378917378915, + "grad_norm": 0.9792984127998352, + "learning_rate": 9.090007499845232e-07, + "loss": 1.0002, + "step": 21508 + }, + { + "epoch": 3.8294159544159543, + "grad_norm": 0.9882341623306274, + "learning_rate": 9.071186893987338e-07, + "loss": 0.8826, + "step": 21509 + }, + { + "epoch": 3.829594017094017, + "grad_norm": 0.9094791412353516, + "learning_rate": 9.052385703320787e-07, + "loss": 0.7015, + "step": 21510 + }, + { + "epoch": 3.82977207977208, + "grad_norm": 0.9043287038803101, + "learning_rate": 9.033603928214395e-07, + "loss": 0.6868, + "step": 21511 + }, + { + "epoch": 3.8299501424501425, + "grad_norm": 1.0372240543365479, + "learning_rate": 9.014841569035981e-07, + "loss": 0.7217, + "step": 21512 + }, + { + "epoch": 3.8301282051282053, + "grad_norm": 0.8339055776596069, + "learning_rate": 8.996098626153138e-07, + "loss": 0.7506, + "step": 21513 + }, + { + "epoch": 3.830306267806268, + "grad_norm": 0.9318352341651917, + "learning_rate": 8.977375099933017e-07, + "loss": 0.7919, + "step": 21514 + }, + { + "epoch": 3.8304843304843303, + "grad_norm": 0.9269062876701355, + "learning_rate": 8.958670990742657e-07, + "loss": 0.782, + "step": 21515 + }, + { + "epoch": 3.830662393162393, + "grad_norm": 1.2074931859970093, + "learning_rate": 8.939986298948322e-07, + "loss": 0.7641, + "step": 21516 + }, + { + "epoch": 3.830840455840456, + "grad_norm": 0.8946232199668884, + "learning_rate": 8.921321024916384e-07, + "loss": 0.7086, + "step": 21517 + }, + { + "epoch": 3.8310185185185186, + "grad_norm": 1.0376194715499878, + "learning_rate": 8.90267516901222e-07, + "loss": 0.7484, + "step": 21518 + }, + { + "epoch": 3.8311965811965814, + "grad_norm": 0.9210571050643921, + "learning_rate": 8.884048731601424e-07, + "loss": 0.7251, + "step": 21519 + }, + { + "epoch": 3.8313746438746437, + "grad_norm": 0.8585400581359863, + "learning_rate": 8.865441713048706e-07, + "loss": 0.9469, + "step": 21520 + }, + { + "epoch": 
3.8315527065527064, + "grad_norm": 0.9701051712036133, + "learning_rate": 8.846854113718772e-07, + "loss": 0.6994, + "step": 21521 + }, + { + "epoch": 3.831730769230769, + "grad_norm": 1.0702025890350342, + "learning_rate": 8.828285933975888e-07, + "loss": 0.918, + "step": 21522 + }, + { + "epoch": 3.831908831908832, + "grad_norm": 0.887536883354187, + "learning_rate": 8.809737174183652e-07, + "loss": 0.966, + "step": 21523 + }, + { + "epoch": 3.8320868945868947, + "grad_norm": 0.9875296354293823, + "learning_rate": 8.791207834705662e-07, + "loss": 0.6294, + "step": 21524 + }, + { + "epoch": 3.8322649572649574, + "grad_norm": 0.734708845615387, + "learning_rate": 8.772697915904848e-07, + "loss": 0.573, + "step": 21525 + }, + { + "epoch": 3.83244301994302, + "grad_norm": 1.1104393005371094, + "learning_rate": 8.754207418143923e-07, + "loss": 0.7678, + "step": 21526 + }, + { + "epoch": 3.8326210826210825, + "grad_norm": 1.0131367444992065, + "learning_rate": 8.735736341785261e-07, + "loss": 0.7966, + "step": 21527 + }, + { + "epoch": 3.8327991452991452, + "grad_norm": 0.8396084904670715, + "learning_rate": 8.717284687190575e-07, + "loss": 0.5308, + "step": 21528 + }, + { + "epoch": 3.832977207977208, + "grad_norm": 0.9788475036621094, + "learning_rate": 8.698852454721573e-07, + "loss": 0.5804, + "step": 21529 + }, + { + "epoch": 3.8331552706552707, + "grad_norm": 0.97635418176651, + "learning_rate": 8.680439644739191e-07, + "loss": 0.7716, + "step": 21530 + }, + { + "epoch": 3.8333333333333335, + "grad_norm": 0.9419910311698914, + "learning_rate": 8.662046257604472e-07, + "loss": 0.7311, + "step": 21531 + }, + { + "epoch": 3.833511396011396, + "grad_norm": 0.8974188566207886, + "learning_rate": 8.643672293677463e-07, + "loss": 0.7538, + "step": 21532 + }, + { + "epoch": 3.8336894586894585, + "grad_norm": 0.9138547778129578, + "learning_rate": 8.625317753318318e-07, + "loss": 0.718, + "step": 21533 + }, + { + "epoch": 3.8338675213675213, + "grad_norm": 
0.9484068751335144, + "learning_rate": 8.606982636886862e-07, + "loss": 0.766, + "step": 21534 + }, + { + "epoch": 3.834045584045584, + "grad_norm": 1.105020523071289, + "learning_rate": 8.588666944741918e-07, + "loss": 0.7913, + "step": 21535 + }, + { + "epoch": 3.834223646723647, + "grad_norm": 1.0500797033309937, + "learning_rate": 8.570370677242756e-07, + "loss": 0.8379, + "step": 21536 + }, + { + "epoch": 3.8344017094017095, + "grad_norm": 0.9115577340126038, + "learning_rate": 8.552093834747532e-07, + "loss": 0.7894, + "step": 21537 + }, + { + "epoch": 3.8345797720797723, + "grad_norm": 0.9908710718154907, + "learning_rate": 8.533836417614516e-07, + "loss": 0.7978, + "step": 21538 + }, + { + "epoch": 3.8347578347578346, + "grad_norm": 0.9764583706855774, + "learning_rate": 8.515598426201421e-07, + "loss": 0.6787, + "step": 21539 + }, + { + "epoch": 3.8349358974358974, + "grad_norm": 0.9413735270500183, + "learning_rate": 8.497379860865518e-07, + "loss": 0.7627, + "step": 21540 + }, + { + "epoch": 3.83511396011396, + "grad_norm": 1.0700278282165527, + "learning_rate": 8.479180721963853e-07, + "loss": 0.8845, + "step": 21541 + }, + { + "epoch": 3.835292022792023, + "grad_norm": 1.0641980171203613, + "learning_rate": 8.461001009852809e-07, + "loss": 0.7407, + "step": 21542 + }, + { + "epoch": 3.8354700854700856, + "grad_norm": 1.0877798795700073, + "learning_rate": 8.442840724888768e-07, + "loss": 0.8741, + "step": 21543 + }, + { + "epoch": 3.835648148148148, + "grad_norm": 1.1364705562591553, + "learning_rate": 8.424699867427444e-07, + "loss": 0.973, + "step": 21544 + }, + { + "epoch": 3.8358262108262107, + "grad_norm": 0.9369027614593506, + "learning_rate": 8.40657843782433e-07, + "loss": 0.8727, + "step": 21545 + }, + { + "epoch": 3.8360042735042734, + "grad_norm": 0.9268773794174194, + "learning_rate": 8.388476436434478e-07, + "loss": 0.7735, + "step": 21546 + }, + { + "epoch": 3.836182336182336, + "grad_norm": 1.2019761800765991, + "learning_rate": 
8.370393863612602e-07, + "loss": 0.8365, + "step": 21547 + }, + { + "epoch": 3.836360398860399, + "grad_norm": 0.8963661789894104, + "learning_rate": 8.352330719712753e-07, + "loss": 0.5905, + "step": 21548 + }, + { + "epoch": 3.8365384615384617, + "grad_norm": 1.0327519178390503, + "learning_rate": 8.334287005089203e-07, + "loss": 0.6485, + "step": 21549 + }, + { + "epoch": 3.8367165242165244, + "grad_norm": 0.9324591755867004, + "learning_rate": 8.316262720095114e-07, + "loss": 0.7034, + "step": 21550 + }, + { + "epoch": 3.8368945868945867, + "grad_norm": 0.9687849879264832, + "learning_rate": 8.29825786508387e-07, + "loss": 0.9147, + "step": 21551 + }, + { + "epoch": 3.8370726495726495, + "grad_norm": 0.9329792261123657, + "learning_rate": 8.280272440408298e-07, + "loss": 0.6578, + "step": 21552 + }, + { + "epoch": 3.8372507122507122, + "grad_norm": 0.9121544361114502, + "learning_rate": 8.262306446420565e-07, + "loss": 0.7456, + "step": 21553 + }, + { + "epoch": 3.837428774928775, + "grad_norm": 0.9514213800430298, + "learning_rate": 8.244359883472719e-07, + "loss": 0.8341, + "step": 21554 + }, + { + "epoch": 3.8376068376068377, + "grad_norm": 1.020097017288208, + "learning_rate": 8.22643275191659e-07, + "loss": 0.9089, + "step": 21555 + }, + { + "epoch": 3.8377849002849, + "grad_norm": 0.9623993635177612, + "learning_rate": 8.208525052103233e-07, + "loss": 0.8057, + "step": 21556 + }, + { + "epoch": 3.837962962962963, + "grad_norm": 0.9708266854286194, + "learning_rate": 8.190636784383477e-07, + "loss": 0.7576, + "step": 21557 + }, + { + "epoch": 3.8381410256410255, + "grad_norm": 0.9058563709259033, + "learning_rate": 8.172767949107929e-07, + "loss": 0.7129, + "step": 21558 + }, + { + "epoch": 3.8383190883190883, + "grad_norm": 0.9107457399368286, + "learning_rate": 8.154918546626755e-07, + "loss": 0.6841, + "step": 21559 + }, + { + "epoch": 3.838497150997151, + "grad_norm": 1.144352674484253, + "learning_rate": 8.137088577289453e-07, + "loss": 0.905, + 
"step": 21560 + }, + { + "epoch": 3.838675213675214, + "grad_norm": 1.0036160945892334, + "learning_rate": 8.119278041445522e-07, + "loss": 0.8943, + "step": 21561 + }, + { + "epoch": 3.8388532763532766, + "grad_norm": 0.8285079002380371, + "learning_rate": 8.101486939443903e-07, + "loss": 0.5747, + "step": 21562 + }, + { + "epoch": 3.8390313390313393, + "grad_norm": 0.8895573616027832, + "learning_rate": 8.083715271633097e-07, + "loss": 0.7271, + "step": 21563 + }, + { + "epoch": 3.8392094017094016, + "grad_norm": 1.020896553993225, + "learning_rate": 8.06596303836138e-07, + "loss": 0.6835, + "step": 21564 + }, + { + "epoch": 3.8393874643874644, + "grad_norm": 0.9939431548118591, + "learning_rate": 8.048230239976584e-07, + "loss": 0.9226, + "step": 21565 + }, + { + "epoch": 3.839565527065527, + "grad_norm": 1.0216248035430908, + "learning_rate": 8.030516876826099e-07, + "loss": 0.5721, + "step": 21566 + }, + { + "epoch": 3.83974358974359, + "grad_norm": 0.9228139519691467, + "learning_rate": 8.012822949256982e-07, + "loss": 0.8811, + "step": 21567 + }, + { + "epoch": 3.839921652421652, + "grad_norm": 1.1747238636016846, + "learning_rate": 7.995148457615953e-07, + "loss": 0.8204, + "step": 21568 + }, + { + "epoch": 3.840099715099715, + "grad_norm": 0.9457100033760071, + "learning_rate": 7.977493402249292e-07, + "loss": 0.8442, + "step": 21569 + }, + { + "epoch": 3.8402777777777777, + "grad_norm": 0.865328311920166, + "learning_rate": 7.959857783502833e-07, + "loss": 0.9258, + "step": 21570 + }, + { + "epoch": 3.8404558404558404, + "grad_norm": 1.0624295473098755, + "learning_rate": 7.9422416017223e-07, + "loss": 0.7901, + "step": 21571 + }, + { + "epoch": 3.840633903133903, + "grad_norm": 0.9670442938804626, + "learning_rate": 7.924644857252639e-07, + "loss": 0.7427, + "step": 21572 + }, + { + "epoch": 3.840811965811966, + "grad_norm": 0.992448091506958, + "learning_rate": 7.907067550438685e-07, + "loss": 0.7893, + "step": 21573 + }, + { + "epoch": 
3.8409900284900287, + "grad_norm": 0.8243263363838196, + "learning_rate": 7.889509681624941e-07, + "loss": 0.5991, + "step": 21574 + }, + { + "epoch": 3.8411680911680914, + "grad_norm": 0.839352011680603, + "learning_rate": 7.871971251155131e-07, + "loss": 0.7611, + "step": 21575 + }, + { + "epoch": 3.8413461538461537, + "grad_norm": 0.9567636251449585, + "learning_rate": 7.854452259373313e-07, + "loss": 0.657, + "step": 21576 + }, + { + "epoch": 3.8415242165242165, + "grad_norm": 0.9481256008148193, + "learning_rate": 7.836952706622325e-07, + "loss": 0.8136, + "step": 21577 + }, + { + "epoch": 3.8417022792022792, + "grad_norm": 0.8747114539146423, + "learning_rate": 7.819472593245225e-07, + "loss": 0.6968, + "step": 21578 + }, + { + "epoch": 3.841880341880342, + "grad_norm": 0.9657062888145447, + "learning_rate": 7.802011919584518e-07, + "loss": 0.9184, + "step": 21579 + }, + { + "epoch": 3.8420584045584043, + "grad_norm": 1.0970104932785034, + "learning_rate": 7.784570685982262e-07, + "loss": 0.7596, + "step": 21580 + }, + { + "epoch": 3.842236467236467, + "grad_norm": 0.9810351729393005, + "learning_rate": 7.767148892780296e-07, + "loss": 0.9235, + "step": 21581 + }, + { + "epoch": 3.84241452991453, + "grad_norm": 1.017484426498413, + "learning_rate": 7.74974654031968e-07, + "loss": 0.9001, + "step": 21582 + }, + { + "epoch": 3.8425925925925926, + "grad_norm": 0.9285971522331238, + "learning_rate": 7.732363628941696e-07, + "loss": 0.693, + "step": 21583 + }, + { + "epoch": 3.8427706552706553, + "grad_norm": 0.8508382439613342, + "learning_rate": 7.71500015898674e-07, + "loss": 0.7199, + "step": 21584 + }, + { + "epoch": 3.842948717948718, + "grad_norm": 1.021244764328003, + "learning_rate": 7.697656130795094e-07, + "loss": 0.7583, + "step": 21585 + }, + { + "epoch": 3.843126780626781, + "grad_norm": 1.202195167541504, + "learning_rate": 7.680331544706598e-07, + "loss": 0.9845, + "step": 21586 + }, + { + "epoch": 3.8433048433048436, + "grad_norm": 
0.9475895762443542, + "learning_rate": 7.663026401060535e-07, + "loss": 0.7508, + "step": 21587 + }, + { + "epoch": 3.843482905982906, + "grad_norm": 0.9706496000289917, + "learning_rate": 7.64574070019608e-07, + "loss": 0.7399, + "step": 21588 + }, + { + "epoch": 3.8436609686609686, + "grad_norm": 0.9435704946517944, + "learning_rate": 7.628474442451961e-07, + "loss": 0.7967, + "step": 21589 + }, + { + "epoch": 3.8438390313390314, + "grad_norm": 0.9195558428764343, + "learning_rate": 7.61122762816635e-07, + "loss": 0.8477, + "step": 21590 + }, + { + "epoch": 3.844017094017094, + "grad_norm": 0.8691080212593079, + "learning_rate": 7.594000257677314e-07, + "loss": 0.7367, + "step": 21591 + }, + { + "epoch": 3.8441951566951564, + "grad_norm": 0.9354162812232971, + "learning_rate": 7.576792331322136e-07, + "loss": 0.8365, + "step": 21592 + }, + { + "epoch": 3.844373219373219, + "grad_norm": 1.0419033765792847, + "learning_rate": 7.559603849438213e-07, + "loss": 0.6427, + "step": 21593 + }, + { + "epoch": 3.844551282051282, + "grad_norm": 0.9753270149230957, + "learning_rate": 7.542434812362275e-07, + "loss": 0.6996, + "step": 21594 + }, + { + "epoch": 3.8447293447293447, + "grad_norm": 0.917040228843689, + "learning_rate": 7.525285220430723e-07, + "loss": 0.6474, + "step": 21595 + }, + { + "epoch": 3.8449074074074074, + "grad_norm": 0.8489373922348022, + "learning_rate": 7.508155073979395e-07, + "loss": 0.688, + "step": 21596 + }, + { + "epoch": 3.84508547008547, + "grad_norm": 0.8564489483833313, + "learning_rate": 7.491044373344136e-07, + "loss": 0.6195, + "step": 21597 + }, + { + "epoch": 3.845263532763533, + "grad_norm": 1.1631865501403809, + "learning_rate": 7.473953118860011e-07, + "loss": 0.9136, + "step": 21598 + }, + { + "epoch": 3.8454415954415957, + "grad_norm": 1.008110523223877, + "learning_rate": 7.456881310862085e-07, + "loss": 0.6872, + "step": 21599 + }, + { + "epoch": 3.845619658119658, + "grad_norm": 0.9467812180519104, + "learning_rate": 
7.439828949684646e-07, + "loss": 0.7575, + "step": 21600 + }, + { + "epoch": 3.8457977207977208, + "grad_norm": 1.0625617504119873, + "learning_rate": 7.422796035661983e-07, + "loss": 0.7556, + "step": 21601 + }, + { + "epoch": 3.8459757834757835, + "grad_norm": 0.940880537033081, + "learning_rate": 7.405782569127606e-07, + "loss": 0.9087, + "step": 21602 + }, + { + "epoch": 3.8461538461538463, + "grad_norm": 0.9074656963348389, + "learning_rate": 7.388788550415138e-07, + "loss": 1.0545, + "step": 21603 + }, + { + "epoch": 3.8463319088319086, + "grad_norm": 0.913948118686676, + "learning_rate": 7.371813979857312e-07, + "loss": 0.7383, + "step": 21604 + }, + { + "epoch": 3.8465099715099713, + "grad_norm": 0.983601987361908, + "learning_rate": 7.354858857786751e-07, + "loss": 0.859, + "step": 21605 + }, + { + "epoch": 3.846688034188034, + "grad_norm": 0.9661694169044495, + "learning_rate": 7.337923184535744e-07, + "loss": 1.0301, + "step": 21606 + }, + { + "epoch": 3.846866096866097, + "grad_norm": 0.9005492329597473, + "learning_rate": 7.321006960436027e-07, + "loss": 0.8122, + "step": 21607 + }, + { + "epoch": 3.8470441595441596, + "grad_norm": 0.8839511871337891, + "learning_rate": 7.304110185819002e-07, + "loss": 0.6888, + "step": 21608 + }, + { + "epoch": 3.8472222222222223, + "grad_norm": 1.069863200187683, + "learning_rate": 7.287232861015958e-07, + "loss": 0.7776, + "step": 21609 + }, + { + "epoch": 3.847400284900285, + "grad_norm": 1.0439397096633911, + "learning_rate": 7.270374986357297e-07, + "loss": 0.871, + "step": 21610 + }, + { + "epoch": 3.847578347578348, + "grad_norm": 1.0932252407073975, + "learning_rate": 7.253536562173424e-07, + "loss": 0.8679, + "step": 21611 + }, + { + "epoch": 3.84775641025641, + "grad_norm": 0.9416168928146362, + "learning_rate": 7.236717588794295e-07, + "loss": 0.9173, + "step": 21612 + }, + { + "epoch": 3.847934472934473, + "grad_norm": 1.028075933456421, + "learning_rate": 7.219918066549313e-07, + "loss": 0.8136, + "step": 
21613 + }, + { + "epoch": 3.8481125356125356, + "grad_norm": 0.8770352005958557, + "learning_rate": 7.203137995767772e-07, + "loss": 0.9196, + "step": 21614 + }, + { + "epoch": 3.8482905982905984, + "grad_norm": 0.9017715454101562, + "learning_rate": 7.186377376778297e-07, + "loss": 0.8794, + "step": 21615 + }, + { + "epoch": 3.8484686609686607, + "grad_norm": 0.9379174113273621, + "learning_rate": 7.169636209909514e-07, + "loss": 0.6274, + "step": 21616 + }, + { + "epoch": 3.8486467236467234, + "grad_norm": 0.9132708311080933, + "learning_rate": 7.152914495489161e-07, + "loss": 0.8766, + "step": 21617 + }, + { + "epoch": 3.848824786324786, + "grad_norm": 0.9938055276870728, + "learning_rate": 7.136212233845085e-07, + "loss": 0.702, + "step": 21618 + }, + { + "epoch": 3.849002849002849, + "grad_norm": 0.8011921048164368, + "learning_rate": 7.119529425304361e-07, + "loss": 0.6543, + "step": 21619 + }, + { + "epoch": 3.8491809116809117, + "grad_norm": 1.0564098358154297, + "learning_rate": 7.102866070193947e-07, + "loss": 0.8832, + "step": 21620 + }, + { + "epoch": 3.8493589743589745, + "grad_norm": 0.8896064162254333, + "learning_rate": 7.086222168840362e-07, + "loss": 0.5891, + "step": 21621 + }, + { + "epoch": 3.849537037037037, + "grad_norm": 1.0690109729766846, + "learning_rate": 7.069597721569565e-07, + "loss": 0.9243, + "step": 21622 + }, + { + "epoch": 3.8497150997151, + "grad_norm": 0.9678674936294556, + "learning_rate": 7.052992728707408e-07, + "loss": 0.855, + "step": 21623 + }, + { + "epoch": 3.8498931623931623, + "grad_norm": 0.8748130798339844, + "learning_rate": 7.036407190579187e-07, + "loss": 0.797, + "step": 21624 + }, + { + "epoch": 3.850071225071225, + "grad_norm": 0.9823090434074402, + "learning_rate": 7.019841107509862e-07, + "loss": 0.8641, + "step": 21625 + }, + { + "epoch": 3.8502492877492878, + "grad_norm": 1.0925766229629517, + "learning_rate": 7.003294479824063e-07, + "loss": 0.888, + "step": 21626 + }, + { + "epoch": 3.8504273504273505, + 
"grad_norm": 1.028260588645935, + "learning_rate": 6.986767307845866e-07, + "loss": 0.8097, + "step": 21627 + }, + { + "epoch": 3.8506054131054133, + "grad_norm": 0.8923847675323486, + "learning_rate": 6.970259591899231e-07, + "loss": 0.7852, + "step": 21628 + }, + { + "epoch": 3.8507834757834756, + "grad_norm": 0.9850567579269409, + "learning_rate": 6.953771332307458e-07, + "loss": 0.768, + "step": 21629 + }, + { + "epoch": 3.8509615384615383, + "grad_norm": 1.0248289108276367, + "learning_rate": 6.937302529393619e-07, + "loss": 0.7098, + "step": 21630 + }, + { + "epoch": 3.851139601139601, + "grad_norm": 0.8439950942993164, + "learning_rate": 6.920853183480569e-07, + "loss": 0.7061, + "step": 21631 + }, + { + "epoch": 3.851317663817664, + "grad_norm": 0.9808463454246521, + "learning_rate": 6.904423294890272e-07, + "loss": 0.8573, + "step": 21632 + }, + { + "epoch": 3.8514957264957266, + "grad_norm": 1.0319223403930664, + "learning_rate": 6.888012863944915e-07, + "loss": 0.8063, + "step": 21633 + }, + { + "epoch": 3.8516737891737893, + "grad_norm": 0.9523464441299438, + "learning_rate": 6.871621890966018e-07, + "loss": 0.7309, + "step": 21634 + }, + { + "epoch": 3.851851851851852, + "grad_norm": 1.1411263942718506, + "learning_rate": 6.855250376274547e-07, + "loss": 0.9086, + "step": 21635 + }, + { + "epoch": 3.8520299145299144, + "grad_norm": 0.990542471408844, + "learning_rate": 6.838898320191356e-07, + "loss": 0.8246, + "step": 21636 + }, + { + "epoch": 3.852207977207977, + "grad_norm": 1.066152572631836, + "learning_rate": 6.822565723036856e-07, + "loss": 0.8174, + "step": 21637 + }, + { + "epoch": 3.85238603988604, + "grad_norm": 1.0600324869155884, + "learning_rate": 6.806252585131124e-07, + "loss": 0.7849, + "step": 21638 + }, + { + "epoch": 3.8525641025641026, + "grad_norm": 0.9351125955581665, + "learning_rate": 6.78995890679357e-07, + "loss": 0.8347, + "step": 21639 + }, + { + "epoch": 3.8527421652421654, + "grad_norm": 0.9323590397834778, + 
"learning_rate": 6.773684688343606e-07, + "loss": 0.8798, + "step": 21640 + }, + { + "epoch": 3.8529202279202277, + "grad_norm": 1.0057048797607422, + "learning_rate": 6.757429930099979e-07, + "loss": 0.811, + "step": 21641 + }, + { + "epoch": 3.8530982905982905, + "grad_norm": 0.9667607545852661, + "learning_rate": 6.74119463238132e-07, + "loss": 0.8218, + "step": 21642 + }, + { + "epoch": 3.853276353276353, + "grad_norm": 0.9609261751174927, + "learning_rate": 6.724978795505598e-07, + "loss": 0.7635, + "step": 21643 + }, + { + "epoch": 3.853454415954416, + "grad_norm": 0.8612968921661377, + "learning_rate": 6.70878241979056e-07, + "loss": 0.6867, + "step": 21644 + }, + { + "epoch": 3.8536324786324787, + "grad_norm": 0.95118248462677, + "learning_rate": 6.692605505553506e-07, + "loss": 0.7477, + "step": 21645 + }, + { + "epoch": 3.8538105413105415, + "grad_norm": 0.9131327271461487, + "learning_rate": 6.676448053111628e-07, + "loss": 0.7183, + "step": 21646 + }, + { + "epoch": 3.853988603988604, + "grad_norm": 0.8879417181015015, + "learning_rate": 6.660310062781116e-07, + "loss": 0.683, + "step": 21647 + }, + { + "epoch": 3.8541666666666665, + "grad_norm": 0.9245330095291138, + "learning_rate": 6.644191534878385e-07, + "loss": 0.6442, + "step": 21648 + }, + { + "epoch": 3.8543447293447293, + "grad_norm": 1.046465277671814, + "learning_rate": 6.628092469719182e-07, + "loss": 0.7835, + "step": 21649 + }, + { + "epoch": 3.854522792022792, + "grad_norm": 0.865329384803772, + "learning_rate": 6.612012867618922e-07, + "loss": 0.7673, + "step": 21650 + }, + { + "epoch": 3.8547008547008548, + "grad_norm": 0.9498918056488037, + "learning_rate": 6.595952728892796e-07, + "loss": 0.7124, + "step": 21651 + }, + { + "epoch": 3.8548789173789175, + "grad_norm": 0.9849167466163635, + "learning_rate": 6.57991205385533e-07, + "loss": 0.6921, + "step": 21652 + }, + { + "epoch": 3.85505698005698, + "grad_norm": 0.9602441191673279, + "learning_rate": 6.563890842820719e-07, + "loss": 
0.812, + "step": 21653 + }, + { + "epoch": 3.8552350427350426, + "grad_norm": 1.0433622598648071, + "learning_rate": 6.547889096103155e-07, + "loss": 0.9083, + "step": 21654 + }, + { + "epoch": 3.8554131054131053, + "grad_norm": 0.9389081001281738, + "learning_rate": 6.531906814015831e-07, + "loss": 0.7912, + "step": 21655 + }, + { + "epoch": 3.855591168091168, + "grad_norm": 0.991176187992096, + "learning_rate": 6.515943996872165e-07, + "loss": 0.7138, + "step": 21656 + }, + { + "epoch": 3.855769230769231, + "grad_norm": 1.1098177433013916, + "learning_rate": 6.500000644984682e-07, + "loss": 0.7762, + "step": 21657 + }, + { + "epoch": 3.8559472934472936, + "grad_norm": 0.8345046639442444, + "learning_rate": 6.484076758665914e-07, + "loss": 0.7031, + "step": 21658 + }, + { + "epoch": 3.8561253561253563, + "grad_norm": 0.9750561118125916, + "learning_rate": 6.46817233822783e-07, + "loss": 0.6956, + "step": 21659 + }, + { + "epoch": 3.8563034188034186, + "grad_norm": 1.0653631687164307, + "learning_rate": 6.452287383981958e-07, + "loss": 0.7884, + "step": 21660 + }, + { + "epoch": 3.8564814814814814, + "grad_norm": 1.1299207210540771, + "learning_rate": 6.436421896239719e-07, + "loss": 1.169, + "step": 21661 + }, + { + "epoch": 3.856659544159544, + "grad_norm": 0.8966163396835327, + "learning_rate": 6.420575875311752e-07, + "loss": 0.7762, + "step": 21662 + }, + { + "epoch": 3.856837606837607, + "grad_norm": 1.015863299369812, + "learning_rate": 6.404749321508697e-07, + "loss": 0.866, + "step": 21663 + }, + { + "epoch": 3.8570156695156697, + "grad_norm": 0.9820823669433594, + "learning_rate": 6.388942235140527e-07, + "loss": 0.8393, + "step": 21664 + }, + { + "epoch": 3.857193732193732, + "grad_norm": 0.8580895066261292, + "learning_rate": 6.373154616516885e-07, + "loss": 0.6423, + "step": 21665 + }, + { + "epoch": 3.8573717948717947, + "grad_norm": 0.9994450211524963, + "learning_rate": 6.357386465947301e-07, + "loss": 0.7924, + "step": 21666 + }, + { + "epoch": 
3.8575498575498575, + "grad_norm": 0.9615799784660339, + "learning_rate": 6.341637783740639e-07, + "loss": 0.5859, + "step": 21667 + }, + { + "epoch": 3.85772792022792, + "grad_norm": 0.8876405954360962, + "learning_rate": 6.325908570205429e-07, + "loss": 0.7389, + "step": 21668 + }, + { + "epoch": 3.857905982905983, + "grad_norm": 0.9425645470619202, + "learning_rate": 6.31019882564976e-07, + "loss": 0.8526, + "step": 21669 + }, + { + "epoch": 3.8580840455840457, + "grad_norm": 0.9900808334350586, + "learning_rate": 6.294508550381606e-07, + "loss": 0.79, + "step": 21670 + }, + { + "epoch": 3.8582621082621085, + "grad_norm": 0.924473226070404, + "learning_rate": 6.278837744708388e-07, + "loss": 0.681, + "step": 21671 + }, + { + "epoch": 3.8584401709401708, + "grad_norm": 1.0653358697891235, + "learning_rate": 6.263186408936972e-07, + "loss": 0.8024, + "step": 21672 + }, + { + "epoch": 3.8586182336182335, + "grad_norm": 0.8799527883529663, + "learning_rate": 6.247554543374113e-07, + "loss": 0.7357, + "step": 21673 + }, + { + "epoch": 3.8587962962962963, + "grad_norm": 0.9546528458595276, + "learning_rate": 6.23194214832612e-07, + "loss": 0.7254, + "step": 21674 + }, + { + "epoch": 3.858974358974359, + "grad_norm": 0.8290728330612183, + "learning_rate": 6.21634922409875e-07, + "loss": 0.5482, + "step": 21675 + }, + { + "epoch": 3.859152421652422, + "grad_norm": 0.8252167701721191, + "learning_rate": 6.200775770997758e-07, + "loss": 0.5761, + "step": 21676 + }, + { + "epoch": 3.859330484330484, + "grad_norm": 0.9244386553764343, + "learning_rate": 6.18522178932801e-07, + "loss": 0.8321, + "step": 21677 + }, + { + "epoch": 3.859508547008547, + "grad_norm": 0.9701852202415466, + "learning_rate": 6.169687279394376e-07, + "loss": 0.8276, + "step": 21678 + }, + { + "epoch": 3.8596866096866096, + "grad_norm": 0.885154128074646, + "learning_rate": 6.154172241501277e-07, + "loss": 0.832, + "step": 21679 + }, + { + "epoch": 3.8598646723646723, + "grad_norm": 
0.9132698774337769, + "learning_rate": 6.138676675952581e-07, + "loss": 0.721, + "step": 21680 + }, + { + "epoch": 3.860042735042735, + "grad_norm": 0.86441570520401, + "learning_rate": 6.123200583051936e-07, + "loss": 0.6057, + "step": 21681 + }, + { + "epoch": 3.860220797720798, + "grad_norm": 0.9521238803863525, + "learning_rate": 6.107743963102652e-07, + "loss": 0.6874, + "step": 21682 + }, + { + "epoch": 3.8603988603988606, + "grad_norm": 0.9968441724777222, + "learning_rate": 6.092306816407489e-07, + "loss": 0.8812, + "step": 21683 + }, + { + "epoch": 3.8605769230769234, + "grad_norm": 0.7897293567657471, + "learning_rate": 6.076889143268871e-07, + "loss": 0.5925, + "step": 21684 + }, + { + "epoch": 3.8607549857549857, + "grad_norm": 0.9483756422996521, + "learning_rate": 6.06149094398889e-07, + "loss": 0.7941, + "step": 21685 + }, + { + "epoch": 3.8609330484330484, + "grad_norm": 1.136690378189087, + "learning_rate": 6.046112218869305e-07, + "loss": 0.8158, + "step": 21686 + }, + { + "epoch": 3.861111111111111, + "grad_norm": 1.1348493099212646, + "learning_rate": 6.030752968211317e-07, + "loss": 0.8461, + "step": 21687 + }, + { + "epoch": 3.861289173789174, + "grad_norm": 0.9544394612312317, + "learning_rate": 6.015413192316133e-07, + "loss": 0.7925, + "step": 21688 + }, + { + "epoch": 3.861467236467236, + "grad_norm": 1.0044975280761719, + "learning_rate": 6.000092891483844e-07, + "loss": 0.6849, + "step": 21689 + }, + { + "epoch": 3.861645299145299, + "grad_norm": 0.9967083930969238, + "learning_rate": 5.984792066014988e-07, + "loss": 0.9211, + "step": 21690 + }, + { + "epoch": 3.8618233618233617, + "grad_norm": 1.1682944297790527, + "learning_rate": 5.969510716209326e-07, + "loss": 0.9764, + "step": 21691 + }, + { + "epoch": 3.8620014245014245, + "grad_norm": 0.8601269721984863, + "learning_rate": 5.954248842366062e-07, + "loss": 0.6743, + "step": 21692 + }, + { + "epoch": 3.8621794871794872, + "grad_norm": 0.9522037506103516, + "learning_rate": 
5.9390064447844e-07, + "loss": 0.6689, + "step": 21693 + }, + { + "epoch": 3.86235754985755, + "grad_norm": 0.9057328104972839, + "learning_rate": 5.923783523762993e-07, + "loss": 0.9379, + "step": 21694 + }, + { + "epoch": 3.8625356125356127, + "grad_norm": 0.9345870614051819, + "learning_rate": 5.908580079599934e-07, + "loss": 0.7687, + "step": 21695 + }, + { + "epoch": 3.8627136752136755, + "grad_norm": 1.0109366178512573, + "learning_rate": 5.893396112593208e-07, + "loss": 0.8095, + "step": 21696 + }, + { + "epoch": 3.862891737891738, + "grad_norm": 0.9415621161460876, + "learning_rate": 5.878231623040242e-07, + "loss": 0.679, + "step": 21697 + }, + { + "epoch": 3.8630698005698005, + "grad_norm": 0.9021549224853516, + "learning_rate": 5.863086611238356e-07, + "loss": 0.8654, + "step": 21698 + }, + { + "epoch": 3.8632478632478633, + "grad_norm": 1.1498757600784302, + "learning_rate": 5.847961077484087e-07, + "loss": 0.7595, + "step": 21699 + }, + { + "epoch": 3.863425925925926, + "grad_norm": 0.9590532183647156, + "learning_rate": 5.832855022073868e-07, + "loss": 0.7458, + "step": 21700 + }, + { + "epoch": 3.8636039886039883, + "grad_norm": 0.9005709290504456, + "learning_rate": 5.817768445303684e-07, + "loss": 0.7446, + "step": 21701 + }, + { + "epoch": 3.863782051282051, + "grad_norm": 0.9828490018844604, + "learning_rate": 5.802701347468965e-07, + "loss": 0.7723, + "step": 21702 + }, + { + "epoch": 3.863960113960114, + "grad_norm": 0.9257676601409912, + "learning_rate": 5.787653728865139e-07, + "loss": 1.0391, + "step": 21703 + }, + { + "epoch": 3.8641381766381766, + "grad_norm": 0.9859639406204224, + "learning_rate": 5.772625589786973e-07, + "loss": 0.7769, + "step": 21704 + }, + { + "epoch": 3.8643162393162394, + "grad_norm": 0.9691302180290222, + "learning_rate": 5.757616930528786e-07, + "loss": 0.6341, + "step": 21705 + }, + { + "epoch": 3.864494301994302, + "grad_norm": 0.9758834838867188, + "learning_rate": 5.742627751384788e-07, + "loss": 0.7149, + 
"step": 21706 + }, + { + "epoch": 3.864672364672365, + "grad_norm": 1.028484582901001, + "learning_rate": 5.727658052648633e-07, + "loss": 0.8684, + "step": 21707 + }, + { + "epoch": 3.8648504273504276, + "grad_norm": 0.92888343334198, + "learning_rate": 5.712707834613528e-07, + "loss": 0.6475, + "step": 21708 + }, + { + "epoch": 3.86502849002849, + "grad_norm": 0.9963685274124146, + "learning_rate": 5.697777097572577e-07, + "loss": 0.6252, + "step": 21709 + }, + { + "epoch": 3.8652065527065527, + "grad_norm": 0.8675285577774048, + "learning_rate": 5.682865841818097e-07, + "loss": 0.8577, + "step": 21710 + }, + { + "epoch": 3.8653846153846154, + "grad_norm": 1.0220355987548828, + "learning_rate": 5.667974067642412e-07, + "loss": 0.8689, + "step": 21711 + }, + { + "epoch": 3.865562678062678, + "grad_norm": 0.9978788495063782, + "learning_rate": 5.653101775337067e-07, + "loss": 0.6846, + "step": 21712 + }, + { + "epoch": 3.8657407407407405, + "grad_norm": 1.0353487730026245, + "learning_rate": 5.638248965193826e-07, + "loss": 0.9282, + "step": 21713 + }, + { + "epoch": 3.8659188034188032, + "grad_norm": 0.8543832898139954, + "learning_rate": 5.623415637503348e-07, + "loss": 0.6257, + "step": 21714 + }, + { + "epoch": 3.866096866096866, + "grad_norm": 1.0639516115188599, + "learning_rate": 5.608601792556511e-07, + "loss": 0.6694, + "step": 21715 + }, + { + "epoch": 3.8662749287749287, + "grad_norm": 0.8737302422523499, + "learning_rate": 5.593807430643416e-07, + "loss": 0.7041, + "step": 21716 + }, + { + "epoch": 3.8664529914529915, + "grad_norm": 1.1135413646697998, + "learning_rate": 5.579032552053942e-07, + "loss": 0.762, + "step": 21717 + }, + { + "epoch": 3.8666310541310542, + "grad_norm": 0.9849023222923279, + "learning_rate": 5.564277157077524e-07, + "loss": 0.7913, + "step": 21718 + }, + { + "epoch": 3.866809116809117, + "grad_norm": 1.013596534729004, + "learning_rate": 5.549541246003376e-07, + "loss": 0.7516, + "step": 21719 + }, + { + "epoch": 
3.8669871794871797, + "grad_norm": 0.8574326038360596, + "learning_rate": 5.534824819120266e-07, + "loss": 0.6867, + "step": 21720 + }, + { + "epoch": 3.867165242165242, + "grad_norm": 0.9084491729736328, + "learning_rate": 5.520127876716408e-07, + "loss": 0.7386, + "step": 21721 + }, + { + "epoch": 3.867343304843305, + "grad_norm": 0.9534892439842224, + "learning_rate": 5.505450419079794e-07, + "loss": 0.7824, + "step": 21722 + }, + { + "epoch": 3.8675213675213675, + "grad_norm": 1.0126458406448364, + "learning_rate": 5.490792446497973e-07, + "loss": 0.6519, + "step": 21723 + }, + { + "epoch": 3.8676994301994303, + "grad_norm": 0.9215713739395142, + "learning_rate": 5.476153959258157e-07, + "loss": 0.7617, + "step": 21724 + }, + { + "epoch": 3.8678774928774926, + "grad_norm": 0.7952741384506226, + "learning_rate": 5.461534957647119e-07, + "loss": 0.6392, + "step": 21725 + }, + { + "epoch": 3.8680555555555554, + "grad_norm": 1.035820722579956, + "learning_rate": 5.446935441951406e-07, + "loss": 1.011, + "step": 21726 + }, + { + "epoch": 3.868233618233618, + "grad_norm": 0.9325335025787354, + "learning_rate": 5.432355412456902e-07, + "loss": 0.6771, + "step": 21727 + }, + { + "epoch": 3.868411680911681, + "grad_norm": 1.0619163513183594, + "learning_rate": 5.417794869449377e-07, + "loss": 0.7757, + "step": 21728 + }, + { + "epoch": 3.8685897435897436, + "grad_norm": 1.1454271078109741, + "learning_rate": 5.40325381321416e-07, + "loss": 0.9636, + "step": 21729 + }, + { + "epoch": 3.8687678062678064, + "grad_norm": 1.032842993736267, + "learning_rate": 5.388732244036021e-07, + "loss": 0.6985, + "step": 21730 + }, + { + "epoch": 3.868945868945869, + "grad_norm": 1.2101705074310303, + "learning_rate": 5.374230162199623e-07, + "loss": 0.7568, + "step": 21731 + }, + { + "epoch": 3.869123931623932, + "grad_norm": 1.015390396118164, + "learning_rate": 5.35974756798896e-07, + "loss": 0.6603, + "step": 21732 + }, + { + "epoch": 3.869301994301994, + "grad_norm": 
0.837568998336792, + "learning_rate": 5.345284461687916e-07, + "loss": 0.7916, + "step": 21733 + }, + { + "epoch": 3.869480056980057, + "grad_norm": 0.9118668437004089, + "learning_rate": 5.330840843579709e-07, + "loss": 0.7041, + "step": 21734 + }, + { + "epoch": 3.8696581196581197, + "grad_norm": 1.0995440483093262, + "learning_rate": 5.316416713947559e-07, + "loss": 0.8364, + "step": 21735 + }, + { + "epoch": 3.8698361823361824, + "grad_norm": 0.906592845916748, + "learning_rate": 5.302012073073903e-07, + "loss": 0.7558, + "step": 21736 + }, + { + "epoch": 3.870014245014245, + "grad_norm": 0.8995518684387207, + "learning_rate": 5.287626921240963e-07, + "loss": 0.7778, + "step": 21737 + }, + { + "epoch": 3.8701923076923075, + "grad_norm": 0.838267982006073, + "learning_rate": 5.273261258730733e-07, + "loss": 0.6692, + "step": 21738 + }, + { + "epoch": 3.8703703703703702, + "grad_norm": 0.9452478885650635, + "learning_rate": 5.258915085824434e-07, + "loss": 0.7617, + "step": 21739 + }, + { + "epoch": 3.870548433048433, + "grad_norm": 1.012858510017395, + "learning_rate": 5.244588402803396e-07, + "loss": 0.8608, + "step": 21740 + }, + { + "epoch": 3.8707264957264957, + "grad_norm": 1.0003842115402222, + "learning_rate": 5.230281209948174e-07, + "loss": 0.7907, + "step": 21741 + }, + { + "epoch": 3.8709045584045585, + "grad_norm": 1.0578328371047974, + "learning_rate": 5.215993507539207e-07, + "loss": 0.8449, + "step": 21742 + }, + { + "epoch": 3.8710826210826212, + "grad_norm": 0.9512600302696228, + "learning_rate": 5.201725295856386e-07, + "loss": 0.8504, + "step": 21743 + }, + { + "epoch": 3.871260683760684, + "grad_norm": 0.952800989151001, + "learning_rate": 5.187476575179151e-07, + "loss": 0.832, + "step": 21744 + }, + { + "epoch": 3.8714387464387463, + "grad_norm": 0.9954485893249512, + "learning_rate": 5.173247345786835e-07, + "loss": 0.8799, + "step": 21745 + }, + { + "epoch": 3.871616809116809, + "grad_norm": 0.9191272854804993, + "learning_rate": 
5.159037607958106e-07, + "loss": 0.8221, + "step": 21746 + }, + { + "epoch": 3.871794871794872, + "grad_norm": 0.9987154006958008, + "learning_rate": 5.144847361971406e-07, + "loss": 0.7792, + "step": 21747 + }, + { + "epoch": 3.8719729344729346, + "grad_norm": 1.0156099796295166, + "learning_rate": 5.130676608104845e-07, + "loss": 0.9094, + "step": 21748 + }, + { + "epoch": 3.8721509971509973, + "grad_norm": 1.0321117639541626, + "learning_rate": 5.116525346635981e-07, + "loss": 0.8166, + "step": 21749 + }, + { + "epoch": 3.8723290598290596, + "grad_norm": 0.9277944564819336, + "learning_rate": 5.102393577842146e-07, + "loss": 0.9311, + "step": 21750 + }, + { + "epoch": 3.8725071225071224, + "grad_norm": 1.0144904851913452, + "learning_rate": 5.088281302000231e-07, + "loss": 0.6603, + "step": 21751 + }, + { + "epoch": 3.872685185185185, + "grad_norm": 1.0451470613479614, + "learning_rate": 5.074188519386569e-07, + "loss": 0.7662, + "step": 21752 + }, + { + "epoch": 3.872863247863248, + "grad_norm": 1.0115594863891602, + "learning_rate": 5.060115230277606e-07, + "loss": 0.6707, + "step": 21753 + }, + { + "epoch": 3.8730413105413106, + "grad_norm": 1.1018215417861938, + "learning_rate": 5.046061434948679e-07, + "loss": 0.8675, + "step": 21754 + }, + { + "epoch": 3.8732193732193734, + "grad_norm": 0.9691307544708252, + "learning_rate": 5.032027133675454e-07, + "loss": 0.8355, + "step": 21755 + }, + { + "epoch": 3.873397435897436, + "grad_norm": 0.9273481369018555, + "learning_rate": 5.018012326732713e-07, + "loss": 0.8075, + "step": 21756 + }, + { + "epoch": 3.8735754985754984, + "grad_norm": 0.9765976071357727, + "learning_rate": 5.004017014395124e-07, + "loss": 0.7604, + "step": 21757 + }, + { + "epoch": 3.873753561253561, + "grad_norm": 1.0961931943893433, + "learning_rate": 4.990041196937023e-07, + "loss": 0.8029, + "step": 21758 + }, + { + "epoch": 3.873931623931624, + "grad_norm": 0.9625610113143921, + "learning_rate": 4.97608487463197e-07, + "loss": 0.8036, + 
"step": 21759 + }, + { + "epoch": 3.8741096866096867, + "grad_norm": 0.9747796058654785, + "learning_rate": 4.962148047753634e-07, + "loss": 0.7748, + "step": 21760 + }, + { + "epoch": 3.8742877492877494, + "grad_norm": 0.8600348234176636, + "learning_rate": 4.948230716574909e-07, + "loss": 0.6761, + "step": 21761 + }, + { + "epoch": 3.8744658119658117, + "grad_norm": 0.8769771456718445, + "learning_rate": 4.934332881368575e-07, + "loss": 0.6293, + "step": 21762 + }, + { + "epoch": 3.8746438746438745, + "grad_norm": 0.9831801652908325, + "learning_rate": 4.920454542406972e-07, + "loss": 0.9356, + "step": 21763 + }, + { + "epoch": 3.8748219373219372, + "grad_norm": 0.9225176572799683, + "learning_rate": 4.906595699961992e-07, + "loss": 0.8194, + "step": 21764 + }, + { + "epoch": 3.875, + "grad_norm": 0.8482645153999329, + "learning_rate": 4.892756354305084e-07, + "loss": 0.6919, + "step": 21765 + }, + { + "epoch": 3.8751780626780628, + "grad_norm": 1.2182742357254028, + "learning_rate": 4.878936505707477e-07, + "loss": 0.7674, + "step": 21766 + }, + { + "epoch": 3.8753561253561255, + "grad_norm": 0.9315845370292664, + "learning_rate": 4.865136154439954e-07, + "loss": 0.7609, + "step": 21767 + }, + { + "epoch": 3.8755341880341883, + "grad_norm": 0.9341706037521362, + "learning_rate": 4.851355300772852e-07, + "loss": 0.5422, + "step": 21768 + }, + { + "epoch": 3.8757122507122506, + "grad_norm": 0.9102457165718079, + "learning_rate": 4.837593944976182e-07, + "loss": 0.8436, + "step": 21769 + }, + { + "epoch": 3.8758903133903133, + "grad_norm": 1.2303838729858398, + "learning_rate": 4.823852087319614e-07, + "loss": 0.8097, + "step": 21770 + }, + { + "epoch": 3.876068376068376, + "grad_norm": 0.8186293244361877, + "learning_rate": 4.810129728072266e-07, + "loss": 0.791, + "step": 21771 + }, + { + "epoch": 3.876246438746439, + "grad_norm": 1.1224546432495117, + "learning_rate": 4.796426867503257e-07, + "loss": 0.9078, + "step": 21772 + }, + { + "epoch": 
3.8764245014245016, + "grad_norm": 0.8153115510940552, + "learning_rate": 4.782743505880816e-07, + "loss": 0.568, + "step": 21773 + }, + { + "epoch": 3.876602564102564, + "grad_norm": 1.0225319862365723, + "learning_rate": 4.769079643473173e-07, + "loss": 0.8927, + "step": 21774 + }, + { + "epoch": 3.8767806267806266, + "grad_norm": 0.8474786877632141, + "learning_rate": 4.755435280547893e-07, + "loss": 0.6705, + "step": 21775 + }, + { + "epoch": 3.8769586894586894, + "grad_norm": 0.9945504665374756, + "learning_rate": 4.7418104173725387e-07, + "loss": 0.7112, + "step": 21776 + }, + { + "epoch": 3.877136752136752, + "grad_norm": 0.9561564922332764, + "learning_rate": 4.728205054213897e-07, + "loss": 0.9308, + "step": 21777 + }, + { + "epoch": 3.877314814814815, + "grad_norm": 0.9943435788154602, + "learning_rate": 4.714619191338643e-07, + "loss": 0.7911, + "step": 21778 + }, + { + "epoch": 3.8774928774928776, + "grad_norm": 0.8424780964851379, + "learning_rate": 4.7010528290127863e-07, + "loss": 0.616, + "step": 21779 + }, + { + "epoch": 3.8776709401709404, + "grad_norm": 0.9379110336303711, + "learning_rate": 4.687505967502226e-07, + "loss": 0.7668, + "step": 21780 + }, + { + "epoch": 3.8778490028490027, + "grad_norm": 1.0603646039962769, + "learning_rate": 4.6739786070725264e-07, + "loss": 0.8882, + "step": 21781 + }, + { + "epoch": 3.8780270655270654, + "grad_norm": 0.8258822560310364, + "learning_rate": 4.660470747988588e-07, + "loss": 0.646, + "step": 21782 + }, + { + "epoch": 3.878205128205128, + "grad_norm": 0.9126657843589783, + "learning_rate": 4.646982390514976e-07, + "loss": 0.64, + "step": 21783 + }, + { + "epoch": 3.878383190883191, + "grad_norm": 1.0353071689605713, + "learning_rate": 4.633513534916145e-07, + "loss": 0.7882, + "step": 21784 + }, + { + "epoch": 3.8785612535612537, + "grad_norm": 0.9383770227432251, + "learning_rate": 4.6200641814559964e-07, + "loss": 0.7981, + "step": 21785 + }, + { + "epoch": 3.878739316239316, + "grad_norm": 
0.9015040397644043, + "learning_rate": 4.606634330397874e-07, + "loss": 0.8177, + "step": 21786 + }, + { + "epoch": 3.8789173789173788, + "grad_norm": 0.9347863793373108, + "learning_rate": 4.5932239820050125e-07, + "loss": 0.8157, + "step": 21787 + }, + { + "epoch": 3.8790954415954415, + "grad_norm": 1.0109801292419434, + "learning_rate": 4.5798331365402016e-07, + "loss": 0.7419, + "step": 21788 + }, + { + "epoch": 3.8792735042735043, + "grad_norm": 0.9674223065376282, + "learning_rate": 4.566461794265675e-07, + "loss": 0.7438, + "step": 21789 + }, + { + "epoch": 3.879451566951567, + "grad_norm": 0.8686508536338806, + "learning_rate": 4.5531099554435576e-07, + "loss": 0.6051, + "step": 21790 + }, + { + "epoch": 3.8796296296296298, + "grad_norm": 0.7613703608512878, + "learning_rate": 4.539777620335417e-07, + "loss": 0.5552, + "step": 21791 + }, + { + "epoch": 3.8798076923076925, + "grad_norm": 1.0486373901367188, + "learning_rate": 4.526464789202378e-07, + "loss": 0.9524, + "step": 21792 + }, + { + "epoch": 3.879985754985755, + "grad_norm": 0.8991581797599792, + "learning_rate": 4.5131714623053433e-07, + "loss": 0.8313, + "step": 21793 + }, + { + "epoch": 3.8801638176638176, + "grad_norm": 0.9314252138137817, + "learning_rate": 4.499897639904771e-07, + "loss": 0.6796, + "step": 21794 + }, + { + "epoch": 3.8803418803418803, + "grad_norm": 1.0096250772476196, + "learning_rate": 4.4866433222607864e-07, + "loss": 0.8151, + "step": 21795 + }, + { + "epoch": 3.880519943019943, + "grad_norm": 0.9510086178779602, + "learning_rate": 4.4734085096329594e-07, + "loss": 0.7451, + "step": 21796 + }, + { + "epoch": 3.880698005698006, + "grad_norm": 0.9142504334449768, + "learning_rate": 4.460193202280638e-07, + "loss": 0.8498, + "step": 21797 + }, + { + "epoch": 3.880876068376068, + "grad_norm": 0.9794802665710449, + "learning_rate": 4.446997400462838e-07, + "loss": 0.7224, + "step": 21798 + }, + { + "epoch": 3.881054131054131, + "grad_norm": 1.0646485090255737, + 
"learning_rate": 4.433821104438018e-07, + "loss": 0.633, + "step": 21799 + }, + { + "epoch": 3.8812321937321936, + "grad_norm": 0.9331605434417725, + "learning_rate": 4.420664314464418e-07, + "loss": 0.7344, + "step": 21800 + }, + { + "epoch": 3.8814102564102564, + "grad_norm": 0.9503687024116516, + "learning_rate": 4.4075270307997186e-07, + "loss": 0.611, + "step": 21801 + }, + { + "epoch": 3.881588319088319, + "grad_norm": 0.8861737847328186, + "learning_rate": 4.3944092537013814e-07, + "loss": 0.5885, + "step": 21802 + }, + { + "epoch": 3.881766381766382, + "grad_norm": 0.910184919834137, + "learning_rate": 4.3813109834264233e-07, + "loss": 0.5804, + "step": 21803 + }, + { + "epoch": 3.8819444444444446, + "grad_norm": 1.016818881034851, + "learning_rate": 4.3682322202314163e-07, + "loss": 0.8663, + "step": 21804 + }, + { + "epoch": 3.8821225071225074, + "grad_norm": 0.7735291123390198, + "learning_rate": 4.3551729643727113e-07, + "loss": 0.4588, + "step": 21805 + }, + { + "epoch": 3.8823005698005697, + "grad_norm": 0.9913780689239502, + "learning_rate": 4.3421332161059926e-07, + "loss": 0.799, + "step": 21806 + }, + { + "epoch": 3.8824786324786325, + "grad_norm": 1.0465813875198364, + "learning_rate": 4.329112975686944e-07, + "loss": 0.8979, + "step": 21807 + }, + { + "epoch": 3.882656695156695, + "grad_norm": 0.9852820038795471, + "learning_rate": 4.316112243370696e-07, + "loss": 0.6351, + "step": 21808 + }, + { + "epoch": 3.882834757834758, + "grad_norm": 0.9551993608474731, + "learning_rate": 4.3031310194117104e-07, + "loss": 0.7989, + "step": 21809 + }, + { + "epoch": 3.8830128205128203, + "grad_norm": 1.037743330001831, + "learning_rate": 4.290169304064673e-07, + "loss": 0.8558, + "step": 21810 + }, + { + "epoch": 3.883190883190883, + "grad_norm": 0.967463493347168, + "learning_rate": 4.2772270975831583e-07, + "loss": 0.7983, + "step": 21811 + }, + { + "epoch": 3.8833689458689458, + "grad_norm": 1.2085150480270386, + "learning_rate": 4.264304400221075e-07, 
+ "loss": 0.5849, + "step": 21812 + }, + { + "epoch": 3.8835470085470085, + "grad_norm": 0.9609394669532776, + "learning_rate": 4.251401212231443e-07, + "loss": 0.8195, + "step": 21813 + }, + { + "epoch": 3.8837250712250713, + "grad_norm": 0.9492299556732178, + "learning_rate": 4.23851753386717e-07, + "loss": 1.0462, + "step": 21814 + }, + { + "epoch": 3.883903133903134, + "grad_norm": 1.1199448108673096, + "learning_rate": 4.2256533653804997e-07, + "loss": 0.6554, + "step": 21815 + }, + { + "epoch": 3.8840811965811968, + "grad_norm": 1.1129202842712402, + "learning_rate": 4.212808707023785e-07, + "loss": 0.7439, + "step": 21816 + }, + { + "epoch": 3.8842592592592595, + "grad_norm": 1.0576272010803223, + "learning_rate": 4.1999835590483815e-07, + "loss": 0.693, + "step": 21817 + }, + { + "epoch": 3.884437321937322, + "grad_norm": 0.9170486927032471, + "learning_rate": 4.187177921705754e-07, + "loss": 0.7298, + "step": 21818 + }, + { + "epoch": 3.8846153846153846, + "grad_norm": 0.8820095062255859, + "learning_rate": 4.1743917952467015e-07, + "loss": 0.6307, + "step": 21819 + }, + { + "epoch": 3.8847934472934473, + "grad_norm": 1.213741660118103, + "learning_rate": 4.1616251799219133e-07, + "loss": 1.0166, + "step": 21820 + }, + { + "epoch": 3.88497150997151, + "grad_norm": 1.1306160688400269, + "learning_rate": 4.1488780759812995e-07, + "loss": 0.7279, + "step": 21821 + }, + { + "epoch": 3.8851495726495724, + "grad_norm": 0.9849101901054382, + "learning_rate": 4.136150483674772e-07, + "loss": 0.8513, + "step": 21822 + }, + { + "epoch": 3.885327635327635, + "grad_norm": 0.9408984780311584, + "learning_rate": 4.123442403251576e-07, + "loss": 0.9024, + "step": 21823 + }, + { + "epoch": 3.885505698005698, + "grad_norm": 0.9596730470657349, + "learning_rate": 4.110753834960845e-07, + "loss": 0.7024, + "step": 21824 + }, + { + "epoch": 3.8856837606837606, + "grad_norm": 1.0172388553619385, + "learning_rate": 4.098084779051048e-07, + "loss": 0.9005, + "step": 21825 + }, + 
{ + "epoch": 3.8858618233618234, + "grad_norm": 0.7936252355575562, + "learning_rate": 4.0854352357705406e-07, + "loss": 0.5977, + "step": 21826 + }, + { + "epoch": 3.886039886039886, + "grad_norm": 0.9177237153053284, + "learning_rate": 4.072805205367125e-07, + "loss": 0.5956, + "step": 21827 + }, + { + "epoch": 3.886217948717949, + "grad_norm": 1.0219414234161377, + "learning_rate": 4.060194688088048e-07, + "loss": 0.8677, + "step": 21828 + }, + { + "epoch": 3.8863960113960117, + "grad_norm": 0.8081260919570923, + "learning_rate": 4.047603684180778e-07, + "loss": 0.5772, + "step": 21829 + }, + { + "epoch": 3.886574074074074, + "grad_norm": 1.1134142875671387, + "learning_rate": 4.0350321938916745e-07, + "loss": 0.8819, + "step": 21830 + }, + { + "epoch": 3.8867521367521367, + "grad_norm": 1.1589722633361816, + "learning_rate": 4.022480217467206e-07, + "loss": 0.724, + "step": 21831 + }, + { + "epoch": 3.8869301994301995, + "grad_norm": 1.0400350093841553, + "learning_rate": 4.009947755153398e-07, + "loss": 0.6483, + "step": 21832 + }, + { + "epoch": 3.887108262108262, + "grad_norm": 1.082972526550293, + "learning_rate": 3.997434807195499e-07, + "loss": 0.7819, + "step": 21833 + }, + { + "epoch": 3.8872863247863245, + "grad_norm": 0.9113301038742065, + "learning_rate": 3.9849413738388686e-07, + "loss": 0.7444, + "step": 21834 + }, + { + "epoch": 3.8874643874643873, + "grad_norm": 0.9959047436714172, + "learning_rate": 3.9724674553284215e-07, + "loss": 1.0285, + "step": 21835 + }, + { + "epoch": 3.88764245014245, + "grad_norm": 0.8362236022949219, + "learning_rate": 3.9600130519082956e-07, + "loss": 0.5392, + "step": 21836 + }, + { + "epoch": 3.8878205128205128, + "grad_norm": 1.2744941711425781, + "learning_rate": 3.9475781638226294e-07, + "loss": 0.9262, + "step": 21837 + }, + { + "epoch": 3.8879985754985755, + "grad_norm": 0.9878840446472168, + "learning_rate": 3.935162791315006e-07, + "loss": 0.8396, + "step": 21838 + }, + { + "epoch": 3.8881766381766383, + 
"grad_norm": 1.108947515487671, + "learning_rate": 3.9227669346286744e-07, + "loss": 0.6708, + "step": 21839 + }, + { + "epoch": 3.888354700854701, + "grad_norm": 1.107846736907959, + "learning_rate": 3.910390594006774e-07, + "loss": 0.8392, + "step": 21840 + }, + { + "epoch": 3.888532763532764, + "grad_norm": 1.0903797149658203, + "learning_rate": 3.898033769691334e-07, + "loss": 0.8854, + "step": 21841 + }, + { + "epoch": 3.888710826210826, + "grad_norm": 0.9175450801849365, + "learning_rate": 3.885696461924937e-07, + "loss": 0.6413, + "step": 21842 + }, + { + "epoch": 3.888888888888889, + "grad_norm": 0.9295111298561096, + "learning_rate": 3.8733786709488354e-07, + "loss": 0.7888, + "step": 21843 + }, + { + "epoch": 3.8890669515669516, + "grad_norm": 0.9339991807937622, + "learning_rate": 3.8610803970047236e-07, + "loss": 0.6805, + "step": 21844 + }, + { + "epoch": 3.8892450142450143, + "grad_norm": 1.1252503395080566, + "learning_rate": 3.84880164033341e-07, + "loss": 0.796, + "step": 21845 + }, + { + "epoch": 3.8894230769230766, + "grad_norm": 0.9593950510025024, + "learning_rate": 3.836542401175591e-07, + "loss": 0.7758, + "step": 21846 + }, + { + "epoch": 3.8896011396011394, + "grad_norm": 0.9289141297340393, + "learning_rate": 3.8243026797712967e-07, + "loss": 0.9374, + "step": 21847 + }, + { + "epoch": 3.889779202279202, + "grad_norm": 1.0639877319335938, + "learning_rate": 3.8120824763604456e-07, + "loss": 0.9137, + "step": 21848 + }, + { + "epoch": 3.889957264957265, + "grad_norm": 0.9373790621757507, + "learning_rate": 3.7998817911824026e-07, + "loss": 0.7312, + "step": 21849 + }, + { + "epoch": 3.8901353276353277, + "grad_norm": 0.9877315759658813, + "learning_rate": 3.787700624476198e-07, + "loss": 0.7707, + "step": 21850 + }, + { + "epoch": 3.8903133903133904, + "grad_norm": 0.9654448628425598, + "learning_rate": 3.7755389764806416e-07, + "loss": 0.7137, + "step": 21851 + }, + { + "epoch": 3.890491452991453, + "grad_norm": 0.930350661277771, + 
"learning_rate": 3.763396847433875e-07, + "loss": 0.7167, + "step": 21852 + }, + { + "epoch": 3.890669515669516, + "grad_norm": 0.8244062662124634, + "learning_rate": 3.7512742375739316e-07, + "loss": 0.7171, + "step": 21853 + }, + { + "epoch": 3.890847578347578, + "grad_norm": 0.9072375297546387, + "learning_rate": 3.739171147138176e-07, + "loss": 0.8647, + "step": 21854 + }, + { + "epoch": 3.891025641025641, + "grad_norm": 0.9860767722129822, + "learning_rate": 3.7270875763637527e-07, + "loss": 0.7323, + "step": 21855 + }, + { + "epoch": 3.8912037037037037, + "grad_norm": 0.9968366622924805, + "learning_rate": 3.715023525487582e-07, + "loss": 0.8133, + "step": 21856 + }, + { + "epoch": 3.8913817663817665, + "grad_norm": 1.3136839866638184, + "learning_rate": 3.7029789947458094e-07, + "loss": 0.9965, + "step": 21857 + }, + { + "epoch": 3.8915598290598292, + "grad_norm": 1.0396318435668945, + "learning_rate": 3.6909539843745787e-07, + "loss": 0.7561, + "step": 21858 + }, + { + "epoch": 3.8917378917378915, + "grad_norm": 0.9965425133705139, + "learning_rate": 3.67894849460948e-07, + "loss": 0.8734, + "step": 21859 + }, + { + "epoch": 3.8919159544159543, + "grad_norm": 1.020198106765747, + "learning_rate": 3.6669625256856576e-07, + "loss": 0.6508, + "step": 21860 + }, + { + "epoch": 3.892094017094017, + "grad_norm": 0.9760180711746216, + "learning_rate": 3.6549960778380357e-07, + "loss": 0.9447, + "step": 21861 + }, + { + "epoch": 3.89227207977208, + "grad_norm": 1.0385621786117554, + "learning_rate": 3.643049151301092e-07, + "loss": 0.7007, + "step": 21862 + }, + { + "epoch": 3.8924501424501425, + "grad_norm": 0.8879075050354004, + "learning_rate": 3.631121746308752e-07, + "loss": 0.6414, + "step": 21863 + }, + { + "epoch": 3.8926282051282053, + "grad_norm": 0.865680992603302, + "learning_rate": 3.619213863094828e-07, + "loss": 0.6515, + "step": 21864 + }, + { + "epoch": 3.892806267806268, + "grad_norm": 1.055982232093811, + "learning_rate": 3.607325501892689e-07, + 
"loss": 0.7958, + "step": 21865 + }, + { + "epoch": 3.8929843304843303, + "grad_norm": 1.0314075946807861, + "learning_rate": 3.595456662935037e-07, + "loss": 0.8503, + "step": 21866 + }, + { + "epoch": 3.893162393162393, + "grad_norm": 0.9081546664237976, + "learning_rate": 3.583607346454687e-07, + "loss": 0.8502, + "step": 21867 + }, + { + "epoch": 3.893340455840456, + "grad_norm": 0.989351749420166, + "learning_rate": 3.571777552683564e-07, + "loss": 0.7857, + "step": 21868 + }, + { + "epoch": 3.8935185185185186, + "grad_norm": 1.021647572517395, + "learning_rate": 3.5599672818537046e-07, + "loss": 0.7159, + "step": 21869 + }, + { + "epoch": 3.8936965811965814, + "grad_norm": 0.9180936813354492, + "learning_rate": 3.548176534196257e-07, + "loss": 0.7763, + "step": 21870 + }, + { + "epoch": 3.8938746438746437, + "grad_norm": 0.9282814860343933, + "learning_rate": 3.53640530994237e-07, + "loss": 0.7063, + "step": 21871 + }, + { + "epoch": 3.8940527065527064, + "grad_norm": 1.0444775819778442, + "learning_rate": 3.5246536093226366e-07, + "loss": 0.8247, + "step": 21872 + }, + { + "epoch": 3.894230769230769, + "grad_norm": 0.6987374424934387, + "learning_rate": 3.512921432567318e-07, + "loss": 0.4846, + "step": 21873 + }, + { + "epoch": 3.894408831908832, + "grad_norm": 0.9549040794372559, + "learning_rate": 3.501208779906229e-07, + "loss": 0.7085, + "step": 21874 + }, + { + "epoch": 3.8945868945868947, + "grad_norm": 0.9242094159126282, + "learning_rate": 3.4895156515690753e-07, + "loss": 0.7811, + "step": 21875 + }, + { + "epoch": 3.8947649572649574, + "grad_norm": 0.9738181829452515, + "learning_rate": 3.477842047784563e-07, + "loss": 0.8004, + "step": 21876 + }, + { + "epoch": 3.89494301994302, + "grad_norm": 0.9645888805389404, + "learning_rate": 3.466187968781842e-07, + "loss": 0.656, + "step": 21877 + }, + { + "epoch": 3.8951210826210825, + "grad_norm": 0.9509434103965759, + "learning_rate": 3.4545534147889523e-07, + "loss": 0.7381, + "step": 21878 + }, + { + 
"epoch": 3.8952991452991452, + "grad_norm": 1.1472703218460083, + "learning_rate": 3.4429383860339336e-07, + "loss": 0.8948, + "step": 21879 + }, + { + "epoch": 3.895477207977208, + "grad_norm": 0.9420302510261536, + "learning_rate": 3.4313428827443817e-07, + "loss": 0.8261, + "step": 21880 + }, + { + "epoch": 3.8956552706552707, + "grad_norm": 0.8902772665023804, + "learning_rate": 3.419766905147448e-07, + "loss": 0.9552, + "step": 21881 + }, + { + "epoch": 3.8958333333333335, + "grad_norm": 0.8738743662834167, + "learning_rate": 3.408210453470062e-07, + "loss": 0.6172, + "step": 21882 + }, + { + "epoch": 3.896011396011396, + "grad_norm": 0.8608858585357666, + "learning_rate": 3.3966735279384875e-07, + "loss": 0.7194, + "step": 21883 + }, + { + "epoch": 3.8961894586894585, + "grad_norm": 0.9503448009490967, + "learning_rate": 3.385156128778766e-07, + "loss": 0.7791, + "step": 21884 + }, + { + "epoch": 3.8963675213675213, + "grad_norm": 0.9241414070129395, + "learning_rate": 3.3736582562167163e-07, + "loss": 0.634, + "step": 21885 + }, + { + "epoch": 3.896545584045584, + "grad_norm": 0.9042157530784607, + "learning_rate": 3.362179910477492e-07, + "loss": 0.8056, + "step": 21886 + }, + { + "epoch": 3.896723646723647, + "grad_norm": 0.9168319702148438, + "learning_rate": 3.350721091786024e-07, + "loss": 0.507, + "step": 21887 + }, + { + "epoch": 3.8969017094017095, + "grad_norm": 0.7683261632919312, + "learning_rate": 3.3392818003668e-07, + "loss": 0.6233, + "step": 21888 + }, + { + "epoch": 3.8970797720797723, + "grad_norm": 0.8454415798187256, + "learning_rate": 3.3278620364440847e-07, + "loss": 0.6458, + "step": 21889 + }, + { + "epoch": 3.8972578347578346, + "grad_norm": 0.878555178642273, + "learning_rate": 3.316461800241366e-07, + "loss": 0.6788, + "step": 21890 + }, + { + "epoch": 3.8974358974358974, + "grad_norm": 0.9094281792640686, + "learning_rate": 3.3050810919821316e-07, + "loss": 0.708, + "step": 21891 + }, + { + "epoch": 3.89761396011396, + 
"grad_norm": 0.9100331664085388, + "learning_rate": 3.2937199118894257e-07, + "loss": 0.6912, + "step": 21892 + }, + { + "epoch": 3.897792022792023, + "grad_norm": 0.9003938436508179, + "learning_rate": 3.282378260185848e-07, + "loss": 0.8759, + "step": 21893 + }, + { + "epoch": 3.8979700854700856, + "grad_norm": 0.9322202205657959, + "learning_rate": 3.2710561370934424e-07, + "loss": 0.7589, + "step": 21894 + }, + { + "epoch": 3.898148148148148, + "grad_norm": 0.9860547184944153, + "learning_rate": 3.259753542834254e-07, + "loss": 0.742, + "step": 21895 + }, + { + "epoch": 3.8983262108262107, + "grad_norm": 0.9263895153999329, + "learning_rate": 3.2484704776296613e-07, + "loss": 0.8624, + "step": 21896 + }, + { + "epoch": 3.8985042735042734, + "grad_norm": 1.0885869264602661, + "learning_rate": 3.23720694170071e-07, + "loss": 0.8088, + "step": 21897 + }, + { + "epoch": 3.898682336182336, + "grad_norm": 1.0078108310699463, + "learning_rate": 3.225962935268112e-07, + "loss": 0.8197, + "step": 21898 + }, + { + "epoch": 3.898860398860399, + "grad_norm": 1.0098800659179688, + "learning_rate": 3.2147384585521354e-07, + "loss": 0.748, + "step": 21899 + }, + { + "epoch": 3.8990384615384617, + "grad_norm": 0.9592456817626953, + "learning_rate": 3.203533511772605e-07, + "loss": 0.7737, + "step": 21900 + }, + { + "epoch": 3.8992165242165244, + "grad_norm": 0.9254228472709656, + "learning_rate": 3.1923480951493447e-07, + "loss": 0.8406, + "step": 21901 + }, + { + "epoch": 3.8993945868945867, + "grad_norm": 0.9102961421012878, + "learning_rate": 3.181182208901179e-07, + "loss": 0.9384, + "step": 21902 + }, + { + "epoch": 3.8995726495726495, + "grad_norm": 0.9881536364555359, + "learning_rate": 3.170035853247155e-07, + "loss": 0.7607, + "step": 21903 + }, + { + "epoch": 3.8997507122507122, + "grad_norm": 0.9650844931602478, + "learning_rate": 3.1589090284055436e-07, + "loss": 0.7619, + "step": 21904 + }, + { + "epoch": 3.899928774928775, + "grad_norm": 0.7324507832527161, + 
"learning_rate": 3.1478017345942803e-07, + "loss": 0.5913, + "step": 21905 + }, + { + "epoch": 3.9001068376068377, + "grad_norm": 0.9290045499801636, + "learning_rate": 3.1367139720310803e-07, + "loss": 0.719, + "step": 21906 + }, + { + "epoch": 3.9002849002849, + "grad_norm": 1.1251944303512573, + "learning_rate": 3.125645740933214e-07, + "loss": 0.7008, + "step": 21907 + }, + { + "epoch": 3.900462962962963, + "grad_norm": 1.1408454179763794, + "learning_rate": 3.1145970415173975e-07, + "loss": 0.7176, + "step": 21908 + }, + { + "epoch": 3.9006410256410255, + "grad_norm": 0.9022751450538635, + "learning_rate": 3.103567874000235e-07, + "loss": 0.7632, + "step": 21909 + }, + { + "epoch": 3.9008190883190883, + "grad_norm": 0.8978546857833862, + "learning_rate": 3.092558238597887e-07, + "loss": 0.7441, + "step": 21910 + }, + { + "epoch": 3.900997150997151, + "grad_norm": 0.9025968909263611, + "learning_rate": 3.0815681355258477e-07, + "loss": 0.6771, + "step": 21911 + }, + { + "epoch": 3.901175213675214, + "grad_norm": 0.9397212862968445, + "learning_rate": 3.070597564999611e-07, + "loss": 0.7661, + "step": 21912 + }, + { + "epoch": 3.9013532763532766, + "grad_norm": 0.9135825037956238, + "learning_rate": 3.0596465272340056e-07, + "loss": 0.7339, + "step": 21913 + }, + { + "epoch": 3.9015313390313393, + "grad_norm": 1.1414575576782227, + "learning_rate": 3.048715022443749e-07, + "loss": 0.8099, + "step": 21914 + }, + { + "epoch": 3.9017094017094016, + "grad_norm": 0.9361329078674316, + "learning_rate": 3.0378030508428913e-07, + "loss": 0.813, + "step": 21915 + }, + { + "epoch": 3.9018874643874644, + "grad_norm": 0.9217289090156555, + "learning_rate": 3.0269106126452617e-07, + "loss": 0.5702, + "step": 21916 + }, + { + "epoch": 3.902065527065527, + "grad_norm": 1.1916371583938599, + "learning_rate": 3.0160377080643563e-07, + "loss": 0.8607, + "step": 21917 + }, + { + "epoch": 3.90224358974359, + "grad_norm": 0.9516651630401611, + "learning_rate": 3.005184337313116e-07, 
+ "loss": 0.6183, + "step": 21918 + }, + { + "epoch": 3.902421652421652, + "grad_norm": 0.9865080118179321, + "learning_rate": 2.994350500604148e-07, + "loss": 0.7953, + "step": 21919 + }, + { + "epoch": 3.902599715099715, + "grad_norm": 0.9364351034164429, + "learning_rate": 2.983536198149839e-07, + "loss": 0.6312, + "step": 21920 + }, + { + "epoch": 3.9027777777777777, + "grad_norm": 1.0097970962524414, + "learning_rate": 2.9727414301620184e-07, + "loss": 0.8344, + "step": 21921 + }, + { + "epoch": 3.9029558404558404, + "grad_norm": 1.025741696357727, + "learning_rate": 2.961966196852184e-07, + "loss": 0.8122, + "step": 21922 + }, + { + "epoch": 3.903133903133903, + "grad_norm": 0.9230335354804993, + "learning_rate": 2.9512104984313893e-07, + "loss": 0.6786, + "step": 21923 + }, + { + "epoch": 3.903311965811966, + "grad_norm": 1.030914068222046, + "learning_rate": 2.9404743351105767e-07, + "loss": 0.7809, + "step": 21924 + }, + { + "epoch": 3.9034900284900287, + "grad_norm": 0.8610934019088745, + "learning_rate": 2.929757707099801e-07, + "loss": 0.6601, + "step": 21925 + }, + { + "epoch": 3.9036680911680914, + "grad_norm": 1.0533103942871094, + "learning_rate": 2.919060614609226e-07, + "loss": 0.8471, + "step": 21926 + }, + { + "epoch": 3.9038461538461537, + "grad_norm": 1.0236098766326904, + "learning_rate": 2.908383057848463e-07, + "loss": 0.8785, + "step": 21927 + }, + { + "epoch": 3.9040242165242165, + "grad_norm": 0.9521497488021851, + "learning_rate": 2.897725037026566e-07, + "loss": 0.8093, + "step": 21928 + }, + { + "epoch": 3.9042022792022792, + "grad_norm": 1.0812114477157593, + "learning_rate": 2.8870865523525915e-07, + "loss": 0.8727, + "step": 21929 + }, + { + "epoch": 3.904380341880342, + "grad_norm": 1.0053845643997192, + "learning_rate": 2.876467604034705e-07, + "loss": 0.7937, + "step": 21930 + }, + { + "epoch": 3.9045584045584043, + "grad_norm": 0.722806990146637, + "learning_rate": 2.865868192281074e-07, + "loss": 0.5283, + "step": 21931 + }, + 
{ + "epoch": 3.904736467236467, + "grad_norm": 0.8876988291740417, + "learning_rate": 2.855288317299531e-07, + "loss": 0.6507, + "step": 21932 + }, + { + "epoch": 3.90491452991453, + "grad_norm": 0.9309136271476746, + "learning_rate": 2.8447279792971346e-07, + "loss": 0.6041, + "step": 21933 + }, + { + "epoch": 3.9050925925925926, + "grad_norm": 1.25245201587677, + "learning_rate": 2.8341871784808293e-07, + "loss": 0.7069, + "step": 21934 + }, + { + "epoch": 3.9052706552706553, + "grad_norm": 0.8108494281768799, + "learning_rate": 2.8236659150572274e-07, + "loss": 0.4754, + "step": 21935 + }, + { + "epoch": 3.905448717948718, + "grad_norm": 0.97728031873703, + "learning_rate": 2.813164189232498e-07, + "loss": 0.8704, + "step": 21936 + }, + { + "epoch": 3.905626780626781, + "grad_norm": 1.0768465995788574, + "learning_rate": 2.8026820012123645e-07, + "loss": 0.7118, + "step": 21937 + }, + { + "epoch": 3.9058048433048436, + "grad_norm": 0.9180233478546143, + "learning_rate": 2.7922193512019965e-07, + "loss": 0.9206, + "step": 21938 + }, + { + "epoch": 3.905982905982906, + "grad_norm": 1.0183178186416626, + "learning_rate": 2.781776239406786e-07, + "loss": 0.8764, + "step": 21939 + }, + { + "epoch": 3.9061609686609686, + "grad_norm": 1.081843614578247, + "learning_rate": 2.771352666031013e-07, + "loss": 0.7171, + "step": 21940 + }, + { + "epoch": 3.9063390313390314, + "grad_norm": 1.008541464805603, + "learning_rate": 2.76094863127907e-07, + "loss": 0.6932, + "step": 21941 + }, + { + "epoch": 3.906517094017094, + "grad_norm": 0.9277056455612183, + "learning_rate": 2.750564135354683e-07, + "loss": 0.787, + "step": 21942 + }, + { + "epoch": 3.9066951566951564, + "grad_norm": 1.0183097124099731, + "learning_rate": 2.7401991784614666e-07, + "loss": 0.966, + "step": 21943 + }, + { + "epoch": 3.906873219373219, + "grad_norm": 0.943051815032959, + "learning_rate": 2.72985376080237e-07, + "loss": 0.6795, + "step": 21944 + }, + { + "epoch": 3.907051282051282, + "grad_norm": 
0.9863778352737427, + "learning_rate": 2.7195278825801195e-07, + "loss": 0.6972, + "step": 21945 + }, + { + "epoch": 3.9072293447293447, + "grad_norm": 1.0453509092330933, + "learning_rate": 2.709221543997109e-07, + "loss": 0.8906, + "step": 21946 + }, + { + "epoch": 3.9074074074074074, + "grad_norm": 0.9586603045463562, + "learning_rate": 2.698934745255177e-07, + "loss": 0.8719, + "step": 21947 + }, + { + "epoch": 3.90758547008547, + "grad_norm": 0.9506910443305969, + "learning_rate": 2.6886674865559403e-07, + "loss": 0.6272, + "step": 21948 + }, + { + "epoch": 3.907763532763533, + "grad_norm": 0.9721106886863708, + "learning_rate": 2.6784197681004595e-07, + "loss": 0.8897, + "step": 21949 + }, + { + "epoch": 3.9079415954415957, + "grad_norm": 0.917022168636322, + "learning_rate": 2.6681915900896857e-07, + "loss": 0.8084, + "step": 21950 + }, + { + "epoch": 3.908119658119658, + "grad_norm": 1.0294543504714966, + "learning_rate": 2.657982952723792e-07, + "loss": 0.8383, + "step": 21951 + }, + { + "epoch": 3.9082977207977208, + "grad_norm": 1.017067313194275, + "learning_rate": 2.647793856203062e-07, + "loss": 0.9104, + "step": 21952 + }, + { + "epoch": 3.9084757834757835, + "grad_norm": 0.9635358452796936, + "learning_rate": 2.637624300726893e-07, + "loss": 0.8508, + "step": 21953 + }, + { + "epoch": 3.9086538461538463, + "grad_norm": 1.0203074216842651, + "learning_rate": 2.62747428649468e-07, + "loss": 0.8975, + "step": 21954 + }, + { + "epoch": 3.9088319088319086, + "grad_norm": 0.8658331036567688, + "learning_rate": 2.617343813705264e-07, + "loss": 0.6994, + "step": 21955 + }, + { + "epoch": 3.9090099715099713, + "grad_norm": 1.0209558010101318, + "learning_rate": 2.6072328825570425e-07, + "loss": 0.8763, + "step": 21956 + }, + { + "epoch": 3.909188034188034, + "grad_norm": 1.0200470685958862, + "learning_rate": 2.59714149324819e-07, + "loss": 0.7055, + "step": 21957 + }, + { + "epoch": 3.909366096866097, + "grad_norm": 1.178375244140625, + "learning_rate": 
2.5870696459764365e-07, + "loss": 0.7509, + "step": 21958 + }, + { + "epoch": 3.9095441595441596, + "grad_norm": 0.8727983832359314, + "learning_rate": 2.577017340939181e-07, + "loss": 0.7704, + "step": 21959 + }, + { + "epoch": 3.9097222222222223, + "grad_norm": 0.9848031401634216, + "learning_rate": 2.5669845783332645e-07, + "loss": 0.8052, + "step": 21960 + }, + { + "epoch": 3.909900284900285, + "grad_norm": 0.877185583114624, + "learning_rate": 2.556971358355309e-07, + "loss": 0.8031, + "step": 21961 + }, + { + "epoch": 3.910078347578348, + "grad_norm": 0.9560007452964783, + "learning_rate": 2.54697768120149e-07, + "loss": 0.9216, + "step": 21962 + }, + { + "epoch": 3.91025641025641, + "grad_norm": 0.8177520632743835, + "learning_rate": 2.5370035470675404e-07, + "loss": 0.7067, + "step": 21963 + }, + { + "epoch": 3.910434472934473, + "grad_norm": 1.050285816192627, + "learning_rate": 2.5270489561490807e-07, + "loss": 0.7827, + "step": 21964 + }, + { + "epoch": 3.9106125356125356, + "grad_norm": 0.9988555908203125, + "learning_rate": 2.5171139086408444e-07, + "loss": 0.7919, + "step": 21965 + }, + { + "epoch": 3.9107905982905984, + "grad_norm": 1.05731999874115, + "learning_rate": 2.5071984047377873e-07, + "loss": 0.9199, + "step": 21966 + }, + { + "epoch": 3.9109686609686607, + "grad_norm": 0.9310580492019653, + "learning_rate": 2.4973024446340864e-07, + "loss": 0.5524, + "step": 21967 + }, + { + "epoch": 3.9111467236467234, + "grad_norm": 0.8600029349327087, + "learning_rate": 2.487426028523587e-07, + "loss": 0.6444, + "step": 21968 + }, + { + "epoch": 3.911324786324786, + "grad_norm": 0.8115721344947815, + "learning_rate": 2.4775691565998014e-07, + "loss": 0.6268, + "step": 21969 + }, + { + "epoch": 3.911502849002849, + "grad_norm": 1.136781930923462, + "learning_rate": 2.467731829055908e-07, + "loss": 0.8586, + "step": 21970 + }, + { + "epoch": 3.9116809116809117, + "grad_norm": 0.9071054458618164, + "learning_rate": 2.4579140460846415e-07, + "loss": 0.7965, 
+ "step": 21971 + }, + { + "epoch": 3.9118589743589745, + "grad_norm": 1.0506306886672974, + "learning_rate": 2.448115807878293e-07, + "loss": 0.752, + "step": 21972 + }, + { + "epoch": 3.912037037037037, + "grad_norm": 1.117159366607666, + "learning_rate": 2.438337114628819e-07, + "loss": 0.7914, + "step": 21973 + }, + { + "epoch": 3.9122150997151, + "grad_norm": 0.892483115196228, + "learning_rate": 2.4285779665280675e-07, + "loss": 0.8148, + "step": 21974 + }, + { + "epoch": 3.9123931623931623, + "grad_norm": 0.9833968281745911, + "learning_rate": 2.4188383637668845e-07, + "loss": 0.7573, + "step": 21975 + }, + { + "epoch": 3.912571225071225, + "grad_norm": 0.9161805510520935, + "learning_rate": 2.409118306536229e-07, + "loss": 0.7162, + "step": 21976 + }, + { + "epoch": 3.9127492877492878, + "grad_norm": 0.9161592125892639, + "learning_rate": 2.399417795026726e-07, + "loss": 0.7717, + "step": 21977 + }, + { + "epoch": 3.9129273504273505, + "grad_norm": 0.8876699805259705, + "learning_rate": 2.389736829428224e-07, + "loss": 0.5481, + "step": 21978 + }, + { + "epoch": 3.9131054131054133, + "grad_norm": 0.9949450492858887, + "learning_rate": 2.3800754099304602e-07, + "loss": 0.6204, + "step": 21979 + }, + { + "epoch": 3.9132834757834756, + "grad_norm": 0.9041038751602173, + "learning_rate": 2.370433536722838e-07, + "loss": 0.7954, + "step": 21980 + }, + { + "epoch": 3.9134615384615383, + "grad_norm": 0.9964247941970825, + "learning_rate": 2.360811209994096e-07, + "loss": 0.8088, + "step": 21981 + }, + { + "epoch": 3.913639601139601, + "grad_norm": 0.9314337372779846, + "learning_rate": 2.3512084299328606e-07, + "loss": 0.6992, + "step": 21982 + }, + { + "epoch": 3.913817663817664, + "grad_norm": 0.997922956943512, + "learning_rate": 2.341625196727204e-07, + "loss": 0.7194, + "step": 21983 + }, + { + "epoch": 3.9139957264957266, + "grad_norm": 0.9609016180038452, + "learning_rate": 2.3320615105649757e-07, + "loss": 0.7588, + "step": 21984 + }, + { + "epoch": 
3.9141737891737893, + "grad_norm": 1.006381630897522, + "learning_rate": 2.3225173716335812e-07, + "loss": 0.8141, + "step": 21985 + }, + { + "epoch": 3.914351851851852, + "grad_norm": 0.9407157897949219, + "learning_rate": 2.3129927801198715e-07, + "loss": 0.7846, + "step": 21986 + }, + { + "epoch": 3.9145299145299144, + "grad_norm": 0.8722037672996521, + "learning_rate": 2.3034877362106967e-07, + "loss": 0.6581, + "step": 21987 + }, + { + "epoch": 3.914707977207977, + "grad_norm": 0.9266372323036194, + "learning_rate": 2.2940022400920192e-07, + "loss": 0.8043, + "step": 21988 + }, + { + "epoch": 3.91488603988604, + "grad_norm": 1.1715607643127441, + "learning_rate": 2.2845362919498015e-07, + "loss": 0.8472, + "step": 21989 + }, + { + "epoch": 3.9150641025641026, + "grad_norm": 0.9698849320411682, + "learning_rate": 2.2750898919695617e-07, + "loss": 0.6895, + "step": 21990 + }, + { + "epoch": 3.9152421652421654, + "grad_norm": 1.0203914642333984, + "learning_rate": 2.2656630403363743e-07, + "loss": 0.7114, + "step": 21991 + }, + { + "epoch": 3.9154202279202277, + "grad_norm": 0.9018568992614746, + "learning_rate": 2.2562557372348692e-07, + "loss": 0.6274, + "step": 21992 + }, + { + "epoch": 3.9155982905982905, + "grad_norm": 0.8307010531425476, + "learning_rate": 2.2468679828494544e-07, + "loss": 0.5552, + "step": 21993 + }, + { + "epoch": 3.915776353276353, + "grad_norm": 1.0528616905212402, + "learning_rate": 2.2374997773639827e-07, + "loss": 1.0276, + "step": 21994 + }, + { + "epoch": 3.915954415954416, + "grad_norm": 0.879680335521698, + "learning_rate": 2.2281511209619744e-07, + "loss": 0.8538, + "step": 21995 + }, + { + "epoch": 3.9161324786324787, + "grad_norm": 0.9187359809875488, + "learning_rate": 2.218822013826727e-07, + "loss": 0.7884, + "step": 21996 + }, + { + "epoch": 3.9163105413105415, + "grad_norm": 0.9949030876159668, + "learning_rate": 2.209512456140872e-07, + "loss": 0.663, + "step": 21997 + }, + { + "epoch": 3.916488603988604, + "grad_norm": 
0.8498545289039612, + "learning_rate": 2.2002224480869305e-07, + "loss": 0.7736, + "step": 21998 + }, + { + "epoch": 3.9166666666666665, + "grad_norm": 1.050980806350708, + "learning_rate": 2.1909519898468678e-07, + "loss": 0.8053, + "step": 21999 + }, + { + "epoch": 3.9168447293447293, + "grad_norm": 0.9050692319869995, + "learning_rate": 2.1817010816024275e-07, + "loss": 0.766, + "step": 22000 + }, + { + "epoch": 3.917022792022792, + "grad_norm": 0.9989611506462097, + "learning_rate": 2.172469723534687e-07, + "loss": 0.8758, + "step": 22001 + }, + { + "epoch": 3.9172008547008548, + "grad_norm": 0.8801424503326416, + "learning_rate": 2.1632579158246124e-07, + "loss": 0.7643, + "step": 22002 + }, + { + "epoch": 3.9173789173789175, + "grad_norm": 0.8321191668510437, + "learning_rate": 2.1540656586526152e-07, + "loss": 0.7413, + "step": 22003 + }, + { + "epoch": 3.91755698005698, + "grad_norm": 0.9452522993087769, + "learning_rate": 2.1448929521988848e-07, + "loss": 0.7105, + "step": 22004 + }, + { + "epoch": 3.9177350427350426, + "grad_norm": 1.0694053173065186, + "learning_rate": 2.135739796643166e-07, + "loss": 0.6839, + "step": 22005 + }, + { + "epoch": 3.9179131054131053, + "grad_norm": 0.8962434530258179, + "learning_rate": 2.1266061921646485e-07, + "loss": 0.7743, + "step": 22006 + }, + { + "epoch": 3.918091168091168, + "grad_norm": 0.9079770445823669, + "learning_rate": 2.117492138942412e-07, + "loss": 0.7312, + "step": 22007 + }, + { + "epoch": 3.918269230769231, + "grad_norm": 0.9559836387634277, + "learning_rate": 2.1083976371550907e-07, + "loss": 0.7365, + "step": 22008 + }, + { + "epoch": 3.9184472934472936, + "grad_norm": 1.2130017280578613, + "learning_rate": 2.0993226869806536e-07, + "loss": 1.0424, + "step": 22009 + }, + { + "epoch": 3.9186253561253563, + "grad_norm": 0.9800707101821899, + "learning_rate": 2.0902672885970697e-07, + "loss": 0.8919, + "step": 22010 + }, + { + "epoch": 3.9188034188034186, + "grad_norm": 0.9185284376144409, + 
"learning_rate": 2.0812314421817524e-07, + "loss": 0.5581, + "step": 22011 + }, + { + "epoch": 3.9189814814814814, + "grad_norm": 1.0603922605514526, + "learning_rate": 2.0722151479116714e-07, + "loss": 0.7397, + "step": 22012 + }, + { + "epoch": 3.919159544159544, + "grad_norm": 0.9114738702774048, + "learning_rate": 2.063218405963574e-07, + "loss": 0.7737, + "step": 22013 + }, + { + "epoch": 3.919337606837607, + "grad_norm": 0.8841904401779175, + "learning_rate": 2.0542412165136526e-07, + "loss": 0.7496, + "step": 22014 + }, + { + "epoch": 3.9195156695156697, + "grad_norm": 0.9699673652648926, + "learning_rate": 2.0452835797377667e-07, + "loss": 0.4983, + "step": 22015 + }, + { + "epoch": 3.919693732193732, + "grad_norm": 0.87566739320755, + "learning_rate": 2.0363454958115536e-07, + "loss": 0.7714, + "step": 22016 + }, + { + "epoch": 3.9198717948717947, + "grad_norm": 1.0994035005569458, + "learning_rate": 2.027426964909984e-07, + "loss": 0.8937, + "step": 22017 + }, + { + "epoch": 3.9200498575498575, + "grad_norm": 0.9632015228271484, + "learning_rate": 2.0185279872079188e-07, + "loss": 0.6813, + "step": 22018 + }, + { + "epoch": 3.92022792022792, + "grad_norm": 1.015339970588684, + "learning_rate": 2.0096485628796625e-07, + "loss": 0.8103, + "step": 22019 + }, + { + "epoch": 3.920405982905983, + "grad_norm": 1.0228614807128906, + "learning_rate": 2.000788692099187e-07, + "loss": 0.8716, + "step": 22020 + }, + { + "epoch": 3.9205840455840457, + "grad_norm": 0.8183853626251221, + "learning_rate": 1.9919483750401313e-07, + "loss": 0.6534, + "step": 22021 + }, + { + "epoch": 3.9207621082621085, + "grad_norm": 1.1893196105957031, + "learning_rate": 1.9831276118756903e-07, + "loss": 0.8373, + "step": 22022 + }, + { + "epoch": 3.9209401709401708, + "grad_norm": 1.0825175046920776, + "learning_rate": 1.9743264027786147e-07, + "loss": 0.7538, + "step": 22023 + }, + { + "epoch": 3.9211182336182335, + "grad_norm": 1.1777693033218384, + "learning_rate": 
1.965544747921322e-07, + "loss": 0.6503, + "step": 22024 + }, + { + "epoch": 3.9212962962962963, + "grad_norm": 0.8321196436882019, + "learning_rate": 1.9567826474760075e-07, + "loss": 0.6849, + "step": 22025 + }, + { + "epoch": 3.921474358974359, + "grad_norm": 0.930051863193512, + "learning_rate": 1.9480401016143123e-07, + "loss": 0.5483, + "step": 22026 + }, + { + "epoch": 3.921652421652422, + "grad_norm": 0.8987363576889038, + "learning_rate": 1.9393171105075435e-07, + "loss": 0.7334, + "step": 22027 + }, + { + "epoch": 3.921830484330484, + "grad_norm": 0.956389307975769, + "learning_rate": 1.9306136743264536e-07, + "loss": 0.7411, + "step": 22028 + }, + { + "epoch": 3.922008547008547, + "grad_norm": 1.0126129388809204, + "learning_rate": 1.9219297932416836e-07, + "loss": 0.8383, + "step": 22029 + }, + { + "epoch": 3.9221866096866096, + "grad_norm": 0.9052560925483704, + "learning_rate": 1.9132654674234306e-07, + "loss": 0.7371, + "step": 22030 + }, + { + "epoch": 3.9223646723646723, + "grad_norm": 0.9211751818656921, + "learning_rate": 1.9046206970414483e-07, + "loss": 0.677, + "step": 22031 + }, + { + "epoch": 3.922542735042735, + "grad_norm": 0.9746459722518921, + "learning_rate": 1.8959954822649339e-07, + "loss": 0.9056, + "step": 22032 + }, + { + "epoch": 3.922720797720798, + "grad_norm": 0.9646094441413879, + "learning_rate": 1.887389823263086e-07, + "loss": 0.8162, + "step": 22033 + }, + { + "epoch": 3.9228988603988606, + "grad_norm": 0.9268381595611572, + "learning_rate": 1.8788037202044363e-07, + "loss": 0.977, + "step": 22034 + }, + { + "epoch": 3.9230769230769234, + "grad_norm": 1.0845803022384644, + "learning_rate": 1.8702371732571834e-07, + "loss": 0.803, + "step": 22035 + }, + { + "epoch": 3.9232549857549857, + "grad_norm": 0.8709145188331604, + "learning_rate": 1.861690182589193e-07, + "loss": 0.8107, + "step": 22036 + }, + { + "epoch": 3.9234330484330484, + "grad_norm": 0.9537857174873352, + "learning_rate": 1.853162748367887e-07, + "loss": 
0.7444, + "step": 22037 + }, + { + "epoch": 3.923611111111111, + "grad_norm": 0.8645097017288208, + "learning_rate": 1.8446548707604648e-07, + "loss": 0.7448, + "step": 22038 + }, + { + "epoch": 3.923789173789174, + "grad_norm": 0.921103835105896, + "learning_rate": 1.8361665499334602e-07, + "loss": 0.768, + "step": 22039 + }, + { + "epoch": 3.923967236467236, + "grad_norm": 0.9298867583274841, + "learning_rate": 1.827697786053295e-07, + "loss": 0.6328, + "step": 22040 + }, + { + "epoch": 3.924145299145299, + "grad_norm": 1.068195104598999, + "learning_rate": 1.8192485792859483e-07, + "loss": 0.764, + "step": 22041 + }, + { + "epoch": 3.9243233618233617, + "grad_norm": 1.0562421083450317, + "learning_rate": 1.8108189297968425e-07, + "loss": 0.9191, + "step": 22042 + }, + { + "epoch": 3.9245014245014245, + "grad_norm": 0.984227180480957, + "learning_rate": 1.8024088377511795e-07, + "loss": 0.7772, + "step": 22043 + }, + { + "epoch": 3.9246794871794872, + "grad_norm": 1.0047521591186523, + "learning_rate": 1.794018303313716e-07, + "loss": 0.5225, + "step": 22044 + }, + { + "epoch": 3.92485754985755, + "grad_norm": 0.8145391941070557, + "learning_rate": 1.785647326648876e-07, + "loss": 0.5202, + "step": 22045 + }, + { + "epoch": 3.9250356125356127, + "grad_norm": 0.91175377368927, + "learning_rate": 1.7772959079206396e-07, + "loss": 0.6479, + "step": 22046 + }, + { + "epoch": 3.9252136752136755, + "grad_norm": 0.9365923404693604, + "learning_rate": 1.7689640472926538e-07, + "loss": 0.8265, + "step": 22047 + }, + { + "epoch": 3.925391737891738, + "grad_norm": 0.8597645163536072, + "learning_rate": 1.7606517449282323e-07, + "loss": 0.5941, + "step": 22048 + }, + { + "epoch": 3.9255698005698005, + "grad_norm": 0.821519672870636, + "learning_rate": 1.7523590009902445e-07, + "loss": 0.779, + "step": 22049 + }, + { + "epoch": 3.9257478632478633, + "grad_norm": 0.8071169257164001, + "learning_rate": 1.7440858156410057e-07, + "loss": 0.6711, + "step": 22050 + }, + { + 
"epoch": 3.925925925925926, + "grad_norm": 0.9287723302841187, + "learning_rate": 1.7358321890427188e-07, + "loss": 0.8539, + "step": 22051 + }, + { + "epoch": 3.9261039886039883, + "grad_norm": 0.9395582675933838, + "learning_rate": 1.7275981213570325e-07, + "loss": 0.7498, + "step": 22052 + }, + { + "epoch": 3.926282051282051, + "grad_norm": 0.9510295987129211, + "learning_rate": 1.7193836127453733e-07, + "loss": 0.6974, + "step": 22053 + }, + { + "epoch": 3.926460113960114, + "grad_norm": 0.9212534427642822, + "learning_rate": 1.7111886633687236e-07, + "loss": 0.8491, + "step": 22054 + }, + { + "epoch": 3.9266381766381766, + "grad_norm": 1.1693989038467407, + "learning_rate": 1.7030132733873994e-07, + "loss": 0.9457, + "step": 22055 + }, + { + "epoch": 3.9268162393162394, + "grad_norm": 0.9136499762535095, + "learning_rate": 1.6948574429618282e-07, + "loss": 0.7862, + "step": 22056 + }, + { + "epoch": 3.926994301994302, + "grad_norm": 0.9177721738815308, + "learning_rate": 1.6867211722517706e-07, + "loss": 0.7247, + "step": 22057 + }, + { + "epoch": 3.927172364672365, + "grad_norm": 1.0243544578552246, + "learning_rate": 1.6786044614165443e-07, + "loss": 0.8475, + "step": 22058 + }, + { + "epoch": 3.9273504273504276, + "grad_norm": 0.8945780396461487, + "learning_rate": 1.6705073106153547e-07, + "loss": 0.6658, + "step": 22059 + }, + { + "epoch": 3.92752849002849, + "grad_norm": 1.1358131170272827, + "learning_rate": 1.6624297200065197e-07, + "loss": 0.8744, + "step": 22060 + }, + { + "epoch": 3.9277065527065527, + "grad_norm": 0.9518811106681824, + "learning_rate": 1.6543716897486904e-07, + "loss": 0.9271, + "step": 22061 + }, + { + "epoch": 3.9278846153846154, + "grad_norm": 0.9149428606033325, + "learning_rate": 1.6463332199994075e-07, + "loss": 0.8896, + "step": 22062 + }, + { + "epoch": 3.928062678062678, + "grad_norm": 0.9397650361061096, + "learning_rate": 1.6383143109164333e-07, + "loss": 0.8038, + "step": 22063 + }, + { + "epoch": 3.9282407407407405, + 
"grad_norm": 0.9320110082626343, + "learning_rate": 1.6303149626567538e-07, + "loss": 0.6095, + "step": 22064 + }, + { + "epoch": 3.9284188034188032, + "grad_norm": 1.0589196681976318, + "learning_rate": 1.622335175377132e-07, + "loss": 0.8121, + "step": 22065 + }, + { + "epoch": 3.928596866096866, + "grad_norm": 0.821600079536438, + "learning_rate": 1.6143749492338877e-07, + "loss": 0.6471, + "step": 22066 + }, + { + "epoch": 3.9287749287749287, + "grad_norm": 0.9287008047103882, + "learning_rate": 1.606434284383007e-07, + "loss": 0.9064, + "step": 22067 + }, + { + "epoch": 3.9289529914529915, + "grad_norm": 1.094293236732483, + "learning_rate": 1.5985131809800326e-07, + "loss": 0.8028, + "step": 22068 + }, + { + "epoch": 3.9291310541310542, + "grad_norm": 0.9387109875679016, + "learning_rate": 1.5906116391801728e-07, + "loss": 0.7429, + "step": 22069 + }, + { + "epoch": 3.929309116809117, + "grad_norm": 0.994990885257721, + "learning_rate": 1.5827296591383045e-07, + "loss": 0.9219, + "step": 22070 + }, + { + "epoch": 3.9294871794871797, + "grad_norm": 0.995576024055481, + "learning_rate": 1.5748672410088593e-07, + "loss": 0.7344, + "step": 22071 + }, + { + "epoch": 3.929665242165242, + "grad_norm": 0.9641157388687134, + "learning_rate": 1.5670243849457144e-07, + "loss": 0.6646, + "step": 22072 + }, + { + "epoch": 3.929843304843305, + "grad_norm": 0.9594936370849609, + "learning_rate": 1.5592010911027467e-07, + "loss": 0.685, + "step": 22073 + }, + { + "epoch": 3.9300213675213675, + "grad_norm": 0.868777334690094, + "learning_rate": 1.5513973596331666e-07, + "loss": 0.6779, + "step": 22074 + }, + { + "epoch": 3.9301994301994303, + "grad_norm": 1.064650535583496, + "learning_rate": 1.543613190689852e-07, + "loss": 0.7801, + "step": 22075 + }, + { + "epoch": 3.9303774928774926, + "grad_norm": 1.0150713920593262, + "learning_rate": 1.535848584425348e-07, + "loss": 0.8817, + "step": 22076 + }, + { + "epoch": 3.9305555555555554, + "grad_norm": 0.932295024394989, + 
"learning_rate": 1.5281035409916433e-07, + "loss": 0.8153, + "step": 22077 + }, + { + "epoch": 3.930733618233618, + "grad_norm": 1.0463019609451294, + "learning_rate": 1.520378060540728e-07, + "loss": 0.7176, + "step": 22078 + }, + { + "epoch": 3.930911680911681, + "grad_norm": 0.8910303115844727, + "learning_rate": 1.5126721432238144e-07, + "loss": 0.6787, + "step": 22079 + }, + { + "epoch": 3.9310897435897436, + "grad_norm": 0.919048547744751, + "learning_rate": 1.5049857891918928e-07, + "loss": 0.7969, + "step": 22080 + }, + { + "epoch": 3.9312678062678064, + "grad_norm": 0.9526001811027527, + "learning_rate": 1.4973189985955094e-07, + "loss": 0.7759, + "step": 22081 + }, + { + "epoch": 3.931445868945869, + "grad_norm": 0.8939839601516724, + "learning_rate": 1.4896717715850995e-07, + "loss": 0.6735, + "step": 22082 + }, + { + "epoch": 3.931623931623932, + "grad_norm": 1.0000548362731934, + "learning_rate": 1.4820441083102098e-07, + "loss": 0.7515, + "step": 22083 + }, + { + "epoch": 3.931801994301994, + "grad_norm": 0.9883599281311035, + "learning_rate": 1.474436008920499e-07, + "loss": 0.6543, + "step": 22084 + }, + { + "epoch": 3.931980056980057, + "grad_norm": 0.8567184805870056, + "learning_rate": 1.4668474735649584e-07, + "loss": 0.7501, + "step": 22085 + }, + { + "epoch": 3.9321581196581197, + "grad_norm": 0.9684944748878479, + "learning_rate": 1.4592785023922473e-07, + "loss": 0.6182, + "step": 22086 + }, + { + "epoch": 3.9323361823361824, + "grad_norm": 0.97890704870224, + "learning_rate": 1.4517290955506912e-07, + "loss": 0.7792, + "step": 22087 + }, + { + "epoch": 3.932514245014245, + "grad_norm": 1.0463794469833374, + "learning_rate": 1.444199253188172e-07, + "loss": 0.812, + "step": 22088 + }, + { + "epoch": 3.9326923076923075, + "grad_norm": 0.8969535231590271, + "learning_rate": 1.4366889754523493e-07, + "loss": 0.7324, + "step": 22089 + }, + { + "epoch": 3.9328703703703702, + "grad_norm": 0.9530897736549377, + "learning_rate": 
1.4291982624901058e-07, + "loss": 0.7213, + "step": 22090 + }, + { + "epoch": 3.933048433048433, + "grad_norm": 0.8949909210205078, + "learning_rate": 1.4217271144485455e-07, + "loss": 0.8138, + "step": 22091 + }, + { + "epoch": 3.9332264957264957, + "grad_norm": 0.974327027797699, + "learning_rate": 1.4142755314737744e-07, + "loss": 0.7491, + "step": 22092 + }, + { + "epoch": 3.9334045584045585, + "grad_norm": 1.043455958366394, + "learning_rate": 1.4068435137118974e-07, + "loss": 0.8267, + "step": 22093 + }, + { + "epoch": 3.9335826210826212, + "grad_norm": 0.8914118409156799, + "learning_rate": 1.399431061308576e-07, + "loss": 0.6382, + "step": 22094 + }, + { + "epoch": 3.933760683760684, + "grad_norm": 0.9680683016777039, + "learning_rate": 1.392038174408916e-07, + "loss": 0.7857, + "step": 22095 + }, + { + "epoch": 3.9339387464387463, + "grad_norm": 1.0403897762298584, + "learning_rate": 1.384664853157802e-07, + "loss": 0.784, + "step": 22096 + }, + { + "epoch": 3.934116809116809, + "grad_norm": 0.862411618232727, + "learning_rate": 1.3773110976998959e-07, + "loss": 0.7273, + "step": 22097 + }, + { + "epoch": 3.934294871794872, + "grad_norm": 0.943002462387085, + "learning_rate": 1.3699769081789714e-07, + "loss": 0.6417, + "step": 22098 + }, + { + "epoch": 3.9344729344729346, + "grad_norm": 0.9510722756385803, + "learning_rate": 1.3626622847390246e-07, + "loss": 0.7305, + "step": 22099 + }, + { + "epoch": 3.9346509971509973, + "grad_norm": 0.9381536245346069, + "learning_rate": 1.3553672275230523e-07, + "loss": 0.6609, + "step": 22100 + }, + { + "epoch": 3.9348290598290596, + "grad_norm": 0.9791659116744995, + "learning_rate": 1.3480917366742728e-07, + "loss": 0.7784, + "step": 22101 + }, + { + "epoch": 3.9350071225071224, + "grad_norm": 0.716427743434906, + "learning_rate": 1.3408358123350174e-07, + "loss": 0.42, + "step": 22102 + }, + { + "epoch": 3.935185185185185, + "grad_norm": 0.932883083820343, + "learning_rate": 1.333599454647727e-07, + "loss": 0.7054, 
+ "step": 22103 + }, + { + "epoch": 3.935363247863248, + "grad_norm": 0.8895878195762634, + "learning_rate": 1.3263826637538445e-07, + "loss": 0.566, + "step": 22104 + }, + { + "epoch": 3.9355413105413106, + "grad_norm": 0.9334657192230225, + "learning_rate": 1.3191854397949232e-07, + "loss": 0.5716, + "step": 22105 + }, + { + "epoch": 3.9357193732193734, + "grad_norm": 0.8970123529434204, + "learning_rate": 1.3120077829120725e-07, + "loss": 0.7458, + "step": 22106 + }, + { + "epoch": 3.935897435897436, + "grad_norm": 0.9838302135467529, + "learning_rate": 1.3048496932457354e-07, + "loss": 0.866, + "step": 22107 + }, + { + "epoch": 3.9360754985754984, + "grad_norm": 0.9944798350334167, + "learning_rate": 1.2977111709363555e-07, + "loss": 0.9725, + "step": 22108 + }, + { + "epoch": 3.936253561253561, + "grad_norm": 0.9793491959571838, + "learning_rate": 1.2905922161237094e-07, + "loss": 0.6966, + "step": 22109 + }, + { + "epoch": 3.936431623931624, + "grad_norm": 0.9808713793754578, + "learning_rate": 1.2834928289472416e-07, + "loss": 0.6675, + "step": 22110 + }, + { + "epoch": 3.9366096866096867, + "grad_norm": 0.9978953003883362, + "learning_rate": 1.2764130095460624e-07, + "loss": 0.8886, + "step": 22111 + }, + { + "epoch": 3.9367877492877494, + "grad_norm": 0.9405983090400696, + "learning_rate": 1.2693527580588394e-07, + "loss": 0.9117, + "step": 22112 + }, + { + "epoch": 3.9369658119658117, + "grad_norm": 0.8421781659126282, + "learning_rate": 1.262312074624017e-07, + "loss": 0.6117, + "step": 22113 + }, + { + "epoch": 3.9371438746438745, + "grad_norm": 1.0687942504882812, + "learning_rate": 1.2552909593794847e-07, + "loss": 0.7961, + "step": 22114 + }, + { + "epoch": 3.9373219373219372, + "grad_norm": 0.9537040591239929, + "learning_rate": 1.2482894124629107e-07, + "loss": 0.7356, + "step": 22115 + }, + { + "epoch": 3.9375, + "grad_norm": 0.9954497814178467, + "learning_rate": 1.2413074340112961e-07, + "loss": 0.802, + "step": 22116 + }, + { + "epoch": 
3.9376780626780628, + "grad_norm": 1.0326809883117676, + "learning_rate": 1.2343450241615318e-07, + "loss": 0.8831, + "step": 22117 + }, + { + "epoch": 3.9378561253561255, + "grad_norm": 0.9369449615478516, + "learning_rate": 1.2274021830499528e-07, + "loss": 0.7104, + "step": 22118 + }, + { + "epoch": 3.9380341880341883, + "grad_norm": 1.008576512336731, + "learning_rate": 1.2204789108127835e-07, + "loss": 0.7815, + "step": 22119 + }, + { + "epoch": 3.9382122507122506, + "grad_norm": 1.1734954118728638, + "learning_rate": 1.2135752075854712e-07, + "loss": 0.9971, + "step": 22120 + }, + { + "epoch": 3.9383903133903133, + "grad_norm": 1.0208659172058105, + "learning_rate": 1.206691073503352e-07, + "loss": 1.0925, + "step": 22121 + }, + { + "epoch": 3.938568376068376, + "grad_norm": 0.8521791696548462, + "learning_rate": 1.1998265087013182e-07, + "loss": 0.5814, + "step": 22122 + }, + { + "epoch": 3.938746438746439, + "grad_norm": 0.8949893116950989, + "learning_rate": 1.1929815133138177e-07, + "loss": 0.7053, + "step": 22123 + }, + { + "epoch": 3.9389245014245016, + "grad_norm": 0.9781447052955627, + "learning_rate": 1.1861560874750765e-07, + "loss": 0.6777, + "step": 22124 + }, + { + "epoch": 3.939102564102564, + "grad_norm": 0.9684866666793823, + "learning_rate": 1.1793502313186544e-07, + "loss": 0.8261, + "step": 22125 + }, + { + "epoch": 3.9392806267806266, + "grad_norm": 0.9222269058227539, + "learning_rate": 1.1725639449781111e-07, + "loss": 0.6531, + "step": 22126 + }, + { + "epoch": 3.9394586894586894, + "grad_norm": 1.074013113975525, + "learning_rate": 1.1657972285862295e-07, + "loss": 0.881, + "step": 22127 + }, + { + "epoch": 3.939636752136752, + "grad_norm": 0.9172451496124268, + "learning_rate": 1.1590500822756811e-07, + "loss": 0.8913, + "step": 22128 + }, + { + "epoch": 3.939814814814815, + "grad_norm": 0.8592706322669983, + "learning_rate": 1.1523225061785825e-07, + "loss": 0.6367, + "step": 22129 + }, + { + "epoch": 3.9399928774928776, + 
"grad_norm": 1.0467790365219116, + "learning_rate": 1.1456145004268282e-07, + "loss": 0.8392, + "step": 22130 + }, + { + "epoch": 3.9401709401709404, + "grad_norm": 0.9529557228088379, + "learning_rate": 1.1389260651518685e-07, + "loss": 0.7073, + "step": 22131 + }, + { + "epoch": 3.9403490028490027, + "grad_norm": 0.8891605138778687, + "learning_rate": 1.1322572004845988e-07, + "loss": 0.7151, + "step": 22132 + }, + { + "epoch": 3.9405270655270654, + "grad_norm": 0.9059684872627258, + "learning_rate": 1.1256079065558034e-07, + "loss": 0.5416, + "step": 22133 + }, + { + "epoch": 3.940705128205128, + "grad_norm": 0.9860732555389404, + "learning_rate": 1.1189781834958224e-07, + "loss": 0.7043, + "step": 22134 + }, + { + "epoch": 3.940883190883191, + "grad_norm": 0.956551194190979, + "learning_rate": 1.112368031434441e-07, + "loss": 0.826, + "step": 22135 + }, + { + "epoch": 3.9410612535612537, + "grad_norm": 0.9802271723747253, + "learning_rate": 1.1057774505011109e-07, + "loss": 0.8505, + "step": 22136 + }, + { + "epoch": 3.941239316239316, + "grad_norm": 0.9822153449058533, + "learning_rate": 1.0992064408251734e-07, + "loss": 0.6106, + "step": 22137 + }, + { + "epoch": 3.9414173789173788, + "grad_norm": 1.1056941747665405, + "learning_rate": 1.0926550025351923e-07, + "loss": 0.8706, + "step": 22138 + }, + { + "epoch": 3.9415954415954415, + "grad_norm": 0.9877166748046875, + "learning_rate": 1.0861231357595092e-07, + "loss": 0.8755, + "step": 22139 + }, + { + "epoch": 3.9417735042735043, + "grad_norm": 0.9555666446685791, + "learning_rate": 1.079610840626244e-07, + "loss": 0.8356, + "step": 22140 + }, + { + "epoch": 3.941951566951567, + "grad_norm": 0.9267847537994385, + "learning_rate": 1.0731181172629612e-07, + "loss": 0.8328, + "step": 22141 + }, + { + "epoch": 3.9421296296296298, + "grad_norm": 1.053410291671753, + "learning_rate": 1.0666449657967814e-07, + "loss": 0.8671, + "step": 22142 + }, + { + "epoch": 3.9423076923076925, + "grad_norm": 0.8405706286430359, 
+ "learning_rate": 1.0601913863546032e-07, + "loss": 0.5916, + "step": 22143 + }, + { + "epoch": 3.942485754985755, + "grad_norm": 1.0977611541748047, + "learning_rate": 1.0537573790628808e-07, + "loss": 0.9064, + "step": 22144 + }, + { + "epoch": 3.9426638176638176, + "grad_norm": 0.9488881826400757, + "learning_rate": 1.0473429440476246e-07, + "loss": 0.7252, + "step": 22145 + }, + { + "epoch": 3.9428418803418803, + "grad_norm": 0.8930643200874329, + "learning_rate": 1.0409480814346229e-07, + "loss": 0.7395, + "step": 22146 + }, + { + "epoch": 3.943019943019943, + "grad_norm": 0.8349822759628296, + "learning_rate": 1.0345727913489978e-07, + "loss": 0.6557, + "step": 22147 + }, + { + "epoch": 3.943198005698006, + "grad_norm": 0.9073463082313538, + "learning_rate": 1.0282170739157604e-07, + "loss": 0.6097, + "step": 22148 + }, + { + "epoch": 3.943376068376068, + "grad_norm": 0.9441525340080261, + "learning_rate": 1.0218809292594778e-07, + "loss": 0.6664, + "step": 22149 + }, + { + "epoch": 3.943554131054131, + "grad_norm": 1.1835004091262817, + "learning_rate": 1.0155643575041618e-07, + "loss": 0.9609, + "step": 22150 + }, + { + "epoch": 3.9437321937321936, + "grad_norm": 0.8908481001853943, + "learning_rate": 1.0092673587737133e-07, + "loss": 0.591, + "step": 22151 + }, + { + "epoch": 3.9439102564102564, + "grad_norm": 1.1800652742385864, + "learning_rate": 1.0029899331913673e-07, + "loss": 0.9494, + "step": 22152 + }, + { + "epoch": 3.944088319088319, + "grad_norm": 0.9957692623138428, + "learning_rate": 9.967320808802472e-08, + "loss": 0.8205, + "step": 22153 + }, + { + "epoch": 3.944266381766382, + "grad_norm": 1.0245128870010376, + "learning_rate": 9.904938019629217e-08, + "loss": 0.7056, + "step": 22154 + }, + { + "epoch": 3.9444444444444446, + "grad_norm": 1.030775785446167, + "learning_rate": 9.842750965616265e-08, + "loss": 0.9636, + "step": 22155 + }, + { + "epoch": 3.9446225071225074, + "grad_norm": 0.9319864511489868, + "learning_rate": 
9.78075964798153e-08, + "loss": 0.8961, + "step": 22156 + }, + { + "epoch": 3.9448005698005697, + "grad_norm": 0.9753043055534363, + "learning_rate": 9.718964067939596e-08, + "loss": 0.7721, + "step": 22157 + }, + { + "epoch": 3.9449786324786325, + "grad_norm": 0.8872262835502625, + "learning_rate": 9.657364226702825e-08, + "loss": 0.6485, + "step": 22158 + }, + { + "epoch": 3.945156695156695, + "grad_norm": 0.924390435218811, + "learning_rate": 9.595960125475812e-08, + "loss": 0.6058, + "step": 22159 + }, + { + "epoch": 3.945334757834758, + "grad_norm": 0.9848852753639221, + "learning_rate": 9.534751765462036e-08, + "loss": 0.7306, + "step": 22160 + }, + { + "epoch": 3.9455128205128203, + "grad_norm": 0.9805606007575989, + "learning_rate": 9.473739147862759e-08, + "loss": 0.8002, + "step": 22161 + }, + { + "epoch": 3.945690883190883, + "grad_norm": 1.0841553211212158, + "learning_rate": 9.412922273871471e-08, + "loss": 0.8198, + "step": 22162 + }, + { + "epoch": 3.9458689458689458, + "grad_norm": 1.023932695388794, + "learning_rate": 9.352301144680554e-08, + "loss": 0.8471, + "step": 22163 + }, + { + "epoch": 3.9460470085470085, + "grad_norm": 0.9031496644020081, + "learning_rate": 9.291875761476831e-08, + "loss": 0.7516, + "step": 22164 + }, + { + "epoch": 3.9462250712250713, + "grad_norm": 1.0119421482086182, + "learning_rate": 9.231646125446025e-08, + "loss": 0.8783, + "step": 22165 + }, + { + "epoch": 3.946403133903134, + "grad_norm": 0.9956295490264893, + "learning_rate": 9.17161223776608e-08, + "loss": 0.7641, + "step": 22166 + }, + { + "epoch": 3.9465811965811968, + "grad_norm": 0.9614377617835999, + "learning_rate": 9.111774099614945e-08, + "loss": 0.8233, + "step": 22167 + }, + { + "epoch": 3.9467592592592595, + "grad_norm": 1.1682920455932617, + "learning_rate": 9.052131712163903e-08, + "loss": 0.8376, + "step": 22168 + }, + { + "epoch": 3.946937321937322, + "grad_norm": 0.8535422682762146, + "learning_rate": 8.992685076582019e-08, + "loss": 0.5982, + 
"step": 22169 + }, + { + "epoch": 3.9471153846153846, + "grad_norm": 0.9734843969345093, + "learning_rate": 8.933434194033919e-08, + "loss": 0.6912, + "step": 22170 + }, + { + "epoch": 3.9472934472934473, + "grad_norm": 1.0210565328598022, + "learning_rate": 8.874379065680893e-08, + "loss": 0.8513, + "step": 22171 + }, + { + "epoch": 3.94747150997151, + "grad_norm": 0.9472604393959045, + "learning_rate": 8.815519692678686e-08, + "loss": 0.7962, + "step": 22172 + }, + { + "epoch": 3.9476495726495724, + "grad_norm": 0.9001045227050781, + "learning_rate": 8.756856076183039e-08, + "loss": 0.8089, + "step": 22173 + }, + { + "epoch": 3.947827635327635, + "grad_norm": 0.9715784192085266, + "learning_rate": 8.698388217340813e-08, + "loss": 0.6414, + "step": 22174 + }, + { + "epoch": 3.948005698005698, + "grad_norm": 0.8441361784934998, + "learning_rate": 8.640116117298869e-08, + "loss": 0.8897, + "step": 22175 + }, + { + "epoch": 3.9481837606837606, + "grad_norm": 1.0581719875335693, + "learning_rate": 8.582039777197404e-08, + "loss": 0.7248, + "step": 22176 + }, + { + "epoch": 3.9483618233618234, + "grad_norm": 1.1761430501937866, + "learning_rate": 8.524159198176618e-08, + "loss": 0.9026, + "step": 22177 + }, + { + "epoch": 3.948539886039886, + "grad_norm": 0.9691876769065857, + "learning_rate": 8.466474381370049e-08, + "loss": 0.7835, + "step": 22178 + }, + { + "epoch": 3.948717948717949, + "grad_norm": 0.9321456551551819, + "learning_rate": 8.408985327905683e-08, + "loss": 0.6644, + "step": 22179 + }, + { + "epoch": 3.9488960113960117, + "grad_norm": 0.8379319906234741, + "learning_rate": 8.351692038912618e-08, + "loss": 0.4799, + "step": 22180 + }, + { + "epoch": 3.949074074074074, + "grad_norm": 1.0753391981124878, + "learning_rate": 8.294594515512177e-08, + "loss": 0.8837, + "step": 22181 + }, + { + "epoch": 3.9492521367521367, + "grad_norm": 0.8955191373825073, + "learning_rate": 8.237692758823468e-08, + "loss": 0.6253, + "step": 22182 + }, + { + "epoch": 
3.9494301994301995, + "grad_norm": 1.1359809637069702, + "learning_rate": 8.180986769960041e-08, + "loss": 0.7331, + "step": 22183 + }, + { + "epoch": 3.949608262108262, + "grad_norm": 0.8849379420280457, + "learning_rate": 8.124476550034344e-08, + "loss": 0.9057, + "step": 22184 + }, + { + "epoch": 3.9497863247863245, + "grad_norm": 1.1383916139602661, + "learning_rate": 8.068162100154375e-08, + "loss": 0.7332, + "step": 22185 + }, + { + "epoch": 3.9499643874643873, + "grad_norm": 0.9934820532798767, + "learning_rate": 8.012043421421478e-08, + "loss": 0.8742, + "step": 22186 + }, + { + "epoch": 3.95014245014245, + "grad_norm": 1.0497565269470215, + "learning_rate": 7.956120514935882e-08, + "loss": 0.8896, + "step": 22187 + }, + { + "epoch": 3.9503205128205128, + "grad_norm": 0.916329026222229, + "learning_rate": 7.900393381793381e-08, + "loss": 0.82, + "step": 22188 + }, + { + "epoch": 3.9504985754985755, + "grad_norm": 0.9675973653793335, + "learning_rate": 7.844862023085319e-08, + "loss": 0.8465, + "step": 22189 + }, + { + "epoch": 3.9506766381766383, + "grad_norm": 1.0843769311904907, + "learning_rate": 7.78952643990194e-08, + "loss": 0.7282, + "step": 22190 + }, + { + "epoch": 3.950854700854701, + "grad_norm": 0.8486320376396179, + "learning_rate": 7.734386633324597e-08, + "loss": 0.662, + "step": 22191 + }, + { + "epoch": 3.951032763532764, + "grad_norm": 0.8972040414810181, + "learning_rate": 7.67944260443465e-08, + "loss": 0.8706, + "step": 22192 + }, + { + "epoch": 3.951210826210826, + "grad_norm": 0.9764129519462585, + "learning_rate": 7.624694354309014e-08, + "loss": 0.6795, + "step": 22193 + }, + { + "epoch": 3.951388888888889, + "grad_norm": 1.0516738891601562, + "learning_rate": 7.570141884020166e-08, + "loss": 0.8508, + "step": 22194 + }, + { + "epoch": 3.9515669515669516, + "grad_norm": 1.031455636024475, + "learning_rate": 7.515785194637249e-08, + "loss": 0.8451, + "step": 22195 + }, + { + "epoch": 3.9517450142450143, + "grad_norm": 
1.0027506351470947, + "learning_rate": 7.461624287224966e-08, + "loss": 0.6502, + "step": 22196 + }, + { + "epoch": 3.9519230769230766, + "grad_norm": 0.933249294757843, + "learning_rate": 7.407659162843583e-08, + "loss": 0.9345, + "step": 22197 + }, + { + "epoch": 3.9521011396011394, + "grad_norm": 0.7095343470573425, + "learning_rate": 7.353889822552252e-08, + "loss": 0.4258, + "step": 22198 + }, + { + "epoch": 3.952279202279202, + "grad_norm": 0.9729008078575134, + "learning_rate": 7.300316267403462e-08, + "loss": 0.7523, + "step": 22199 + }, + { + "epoch": 3.952457264957265, + "grad_norm": 0.9167136549949646, + "learning_rate": 7.246938498446377e-08, + "loss": 0.6221, + "step": 22200 + }, + { + "epoch": 3.9526353276353277, + "grad_norm": 0.9073047041893005, + "learning_rate": 7.193756516727935e-08, + "loss": 0.6346, + "step": 22201 + }, + { + "epoch": 3.9528133903133904, + "grad_norm": 1.0831269025802612, + "learning_rate": 7.140770323289525e-08, + "loss": 0.7293, + "step": 22202 + }, + { + "epoch": 3.952991452991453, + "grad_norm": 0.8954500555992126, + "learning_rate": 7.087979919169207e-08, + "loss": 0.6734, + "step": 22203 + }, + { + "epoch": 3.953169515669516, + "grad_norm": 1.0015569925308228, + "learning_rate": 7.035385305400599e-08, + "loss": 0.8681, + "step": 22204 + }, + { + "epoch": 3.953347578347578, + "grad_norm": 0.9596917033195496, + "learning_rate": 6.982986483016207e-08, + "loss": 0.8364, + "step": 22205 + }, + { + "epoch": 3.953525641025641, + "grad_norm": 1.0264099836349487, + "learning_rate": 6.930783453040767e-08, + "loss": 0.7442, + "step": 22206 + }, + { + "epoch": 3.9537037037037037, + "grad_norm": 0.8573072552680969, + "learning_rate": 6.878776216499016e-08, + "loss": 0.6549, + "step": 22207 + }, + { + "epoch": 3.9538817663817665, + "grad_norm": 0.9661825895309448, + "learning_rate": 6.826964774407918e-08, + "loss": 0.7778, + "step": 22208 + }, + { + "epoch": 3.9540598290598292, + "grad_norm": 0.8301908373832703, + "learning_rate": 
6.775349127783326e-08, + "loss": 0.6448, + "step": 22209 + }, + { + "epoch": 3.9542378917378915, + "grad_norm": 0.8918058276176453, + "learning_rate": 6.723929277636653e-08, + "loss": 0.6934, + "step": 22210 + }, + { + "epoch": 3.9544159544159543, + "grad_norm": 1.1086747646331787, + "learning_rate": 6.672705224974874e-08, + "loss": 0.9753, + "step": 22211 + }, + { + "epoch": 3.954594017094017, + "grad_norm": 1.0281782150268555, + "learning_rate": 6.621676970802738e-08, + "loss": 0.6285, + "step": 22212 + }, + { + "epoch": 3.95477207977208, + "grad_norm": 0.9020591378211975, + "learning_rate": 6.570844516119445e-08, + "loss": 0.5609, + "step": 22213 + }, + { + "epoch": 3.9549501424501425, + "grad_norm": 0.9565960764884949, + "learning_rate": 6.520207861920869e-08, + "loss": 0.7437, + "step": 22214 + }, + { + "epoch": 3.9551282051282053, + "grad_norm": 1.183899998664856, + "learning_rate": 6.469767009198436e-08, + "loss": 0.798, + "step": 22215 + }, + { + "epoch": 3.955306267806268, + "grad_norm": 1.0373740196228027, + "learning_rate": 6.419521958942465e-08, + "loss": 0.9208, + "step": 22216 + }, + { + "epoch": 3.9554843304843303, + "grad_norm": 0.891069769859314, + "learning_rate": 6.369472712135505e-08, + "loss": 0.5366, + "step": 22217 + }, + { + "epoch": 3.955662393162393, + "grad_norm": 0.8725003004074097, + "learning_rate": 6.319619269757881e-08, + "loss": 0.6358, + "step": 22218 + }, + { + "epoch": 3.955840455840456, + "grad_norm": 0.9663777947425842, + "learning_rate": 6.269961632788812e-08, + "loss": 0.8058, + "step": 22219 + }, + { + "epoch": 3.9560185185185186, + "grad_norm": 1.0360321998596191, + "learning_rate": 6.220499802198631e-08, + "loss": 0.7722, + "step": 22220 + }, + { + "epoch": 3.9561965811965814, + "grad_norm": 1.0019513368606567, + "learning_rate": 6.171233778957675e-08, + "loss": 0.6321, + "step": 22221 + }, + { + "epoch": 3.9563746438746437, + "grad_norm": 0.9397497177124023, + "learning_rate": 6.122163564030725e-08, + "loss": 0.865, + 
"step": 22222 + }, + { + "epoch": 3.9565527065527064, + "grad_norm": 0.9361780285835266, + "learning_rate": 6.073289158380346e-08, + "loss": 0.707, + "step": 22223 + }, + { + "epoch": 3.956730769230769, + "grad_norm": 1.0211809873580933, + "learning_rate": 6.024610562962441e-08, + "loss": 0.8148, + "step": 22224 + }, + { + "epoch": 3.956908831908832, + "grad_norm": 0.970639705657959, + "learning_rate": 5.9761277787318e-08, + "loss": 0.7457, + "step": 22225 + }, + { + "epoch": 3.9570868945868947, + "grad_norm": 0.9334878325462341, + "learning_rate": 5.927840806638774e-08, + "loss": 0.6136, + "step": 22226 + }, + { + "epoch": 3.9572649572649574, + "grad_norm": 1.0210156440734863, + "learning_rate": 5.879749647628163e-08, + "loss": 0.6896, + "step": 22227 + }, + { + "epoch": 3.95744301994302, + "grad_norm": 0.9676822423934937, + "learning_rate": 5.8318543026425473e-08, + "loss": 0.6779, + "step": 22228 + }, + { + "epoch": 3.9576210826210825, + "grad_norm": 0.8727642297744751, + "learning_rate": 5.784154772621175e-08, + "loss": 0.6737, + "step": 22229 + }, + { + "epoch": 3.9577991452991452, + "grad_norm": 0.9030994772911072, + "learning_rate": 5.7366510584988544e-08, + "loss": 0.7467, + "step": 22230 + }, + { + "epoch": 3.957977207977208, + "grad_norm": 0.9664105176925659, + "learning_rate": 5.689343161204841e-08, + "loss": 0.8499, + "step": 22231 + }, + { + "epoch": 3.9581552706552707, + "grad_norm": 1.0056463479995728, + "learning_rate": 5.6422310816661714e-08, + "loss": 0.8082, + "step": 22232 + }, + { + "epoch": 3.9583333333333335, + "grad_norm": 0.9833633899688721, + "learning_rate": 5.595314820807662e-08, + "loss": 0.7669, + "step": 22233 + }, + { + "epoch": 3.958511396011396, + "grad_norm": 0.9480645060539246, + "learning_rate": 5.5485943795463566e-08, + "loss": 0.8063, + "step": 22234 + }, + { + "epoch": 3.9586894586894585, + "grad_norm": 0.9250748753547668, + "learning_rate": 5.5020697587993e-08, + "loss": 0.7095, + "step": 22235 + }, + { + "epoch": 
3.9588675213675213, + "grad_norm": 0.9149683117866516, + "learning_rate": 5.455740959476874e-08, + "loss": 0.6697, + "step": 22236 + }, + { + "epoch": 3.959045584045584, + "grad_norm": 0.7881235480308533, + "learning_rate": 5.4096079824872414e-08, + "loss": 0.5722, + "step": 22237 + }, + { + "epoch": 3.959223646723647, + "grad_norm": 0.8992252945899963, + "learning_rate": 5.3636708287352346e-08, + "loss": 0.649, + "step": 22238 + }, + { + "epoch": 3.9594017094017095, + "grad_norm": 0.8622689247131348, + "learning_rate": 5.317929499119023e-08, + "loss": 0.6962, + "step": 22239 + }, + { + "epoch": 3.9595797720797723, + "grad_norm": 0.9292128086090088, + "learning_rate": 5.272383994536778e-08, + "loss": 0.8515, + "step": 22240 + }, + { + "epoch": 3.9597578347578346, + "grad_norm": 1.0485222339630127, + "learning_rate": 5.227034315880008e-08, + "loss": 0.7598, + "step": 22241 + }, + { + "epoch": 3.9599358974358974, + "grad_norm": 0.9419103860855103, + "learning_rate": 5.1818804640368925e-08, + "loss": 0.9492, + "step": 22242 + }, + { + "epoch": 3.96011396011396, + "grad_norm": 0.9244880676269531, + "learning_rate": 5.1369224398911675e-08, + "loss": 0.6919, + "step": 22243 + }, + { + "epoch": 3.960292022792023, + "grad_norm": 0.9493283033370972, + "learning_rate": 5.092160244326571e-08, + "loss": 0.8251, + "step": 22244 + }, + { + "epoch": 3.9604700854700856, + "grad_norm": 0.8471837043762207, + "learning_rate": 5.04759387821796e-08, + "loss": 0.5627, + "step": 22245 + }, + { + "epoch": 3.960648148148148, + "grad_norm": 1.041853904724121, + "learning_rate": 5.003223342439078e-08, + "loss": 0.8932, + "step": 22246 + }, + { + "epoch": 3.9608262108262107, + "grad_norm": 0.8454943895339966, + "learning_rate": 4.959048637859231e-08, + "loss": 0.6321, + "step": 22247 + }, + { + "epoch": 3.9610042735042734, + "grad_norm": 0.9939231872558594, + "learning_rate": 4.9150697653432834e-08, + "loss": 0.9207, + "step": 22248 + }, + { + "epoch": 3.961182336182336, + "grad_norm": 
0.8977824449539185, + "learning_rate": 4.871286725753876e-08, + "loss": 0.6579, + "step": 22249 + }, + { + "epoch": 3.961360398860399, + "grad_norm": 0.9290462732315063, + "learning_rate": 4.827699519949214e-08, + "loss": 0.7353, + "step": 22250 + }, + { + "epoch": 3.9615384615384617, + "grad_norm": 0.9763110280036926, + "learning_rate": 4.7843081487819466e-08, + "loss": 0.742, + "step": 22251 + }, + { + "epoch": 3.9617165242165244, + "grad_norm": 0.9784603714942932, + "learning_rate": 4.741112613102505e-08, + "loss": 0.7809, + "step": 22252 + }, + { + "epoch": 3.9618945868945867, + "grad_norm": 0.8668375611305237, + "learning_rate": 4.69811291375799e-08, + "loss": 0.7355, + "step": 22253 + }, + { + "epoch": 3.9620726495726495, + "grad_norm": 0.9601633548736572, + "learning_rate": 4.655309051591061e-08, + "loss": 0.7645, + "step": 22254 + }, + { + "epoch": 3.9622507122507122, + "grad_norm": 0.9448567628860474, + "learning_rate": 4.6127010274399364e-08, + "loss": 0.9177, + "step": 22255 + }, + { + "epoch": 3.962428774928775, + "grad_norm": 0.9351767301559448, + "learning_rate": 4.570288842138393e-08, + "loss": 0.8333, + "step": 22256 + }, + { + "epoch": 3.9626068376068377, + "grad_norm": 0.9702423810958862, + "learning_rate": 4.528072496519098e-08, + "loss": 0.6999, + "step": 22257 + }, + { + "epoch": 3.9627849002849, + "grad_norm": 1.0096203088760376, + "learning_rate": 4.486051991408058e-08, + "loss": 0.8767, + "step": 22258 + }, + { + "epoch": 3.962962962962963, + "grad_norm": 0.9545206427574158, + "learning_rate": 4.444227327629058e-08, + "loss": 0.867, + "step": 22259 + }, + { + "epoch": 3.9631410256410255, + "grad_norm": 1.0196224451065063, + "learning_rate": 4.402598506001443e-08, + "loss": 0.9208, + "step": 22260 + }, + { + "epoch": 3.9633190883190883, + "grad_norm": 0.9041821360588074, + "learning_rate": 4.3611655273401165e-08, + "loss": 0.7637, + "step": 22261 + }, + { + "epoch": 3.963497150997151, + "grad_norm": 1.0449297428131104, + "learning_rate": 
4.3199283924588716e-08, + "loss": 0.7901, + "step": 22262 + }, + { + "epoch": 3.963675213675214, + "grad_norm": 1.034912109375, + "learning_rate": 4.278887102163731e-08, + "loss": 0.8515, + "step": 22263 + }, + { + "epoch": 3.9638532763532766, + "grad_norm": 0.9550645351409912, + "learning_rate": 4.238041657259606e-08, + "loss": 0.6318, + "step": 22264 + }, + { + "epoch": 3.9640313390313393, + "grad_norm": 1.076509952545166, + "learning_rate": 4.197392058545857e-08, + "loss": 0.6801, + "step": 22265 + }, + { + "epoch": 3.9642094017094016, + "grad_norm": 1.0079200267791748, + "learning_rate": 4.156938306820735e-08, + "loss": 0.6512, + "step": 22266 + }, + { + "epoch": 3.9643874643874644, + "grad_norm": 0.8909665942192078, + "learning_rate": 4.1166804028758275e-08, + "loss": 0.7712, + "step": 22267 + }, + { + "epoch": 3.964565527065527, + "grad_norm": 0.9579968452453613, + "learning_rate": 4.0766183474993946e-08, + "loss": 0.6829, + "step": 22268 + }, + { + "epoch": 3.96474358974359, + "grad_norm": 0.9043282866477966, + "learning_rate": 4.0367521414774734e-08, + "loss": 0.7346, + "step": 22269 + }, + { + "epoch": 3.964921652421652, + "grad_norm": 1.1984134912490845, + "learning_rate": 3.9970817855905504e-08, + "loss": 0.6943, + "step": 22270 + }, + { + "epoch": 3.965099715099715, + "grad_norm": 0.8458036780357361, + "learning_rate": 3.9576072806146726e-08, + "loss": 0.6403, + "step": 22271 + }, + { + "epoch": 3.9652777777777777, + "grad_norm": 0.9842307567596436, + "learning_rate": 3.9183286273258843e-08, + "loss": 0.7469, + "step": 22272 + }, + { + "epoch": 3.9654558404558404, + "grad_norm": 1.2443668842315674, + "learning_rate": 3.879245826492462e-08, + "loss": 0.8978, + "step": 22273 + }, + { + "epoch": 3.965633903133903, + "grad_norm": 0.9370720982551575, + "learning_rate": 3.840358878879347e-08, + "loss": 0.6538, + "step": 22274 + }, + { + "epoch": 3.965811965811966, + "grad_norm": 0.9503483176231384, + "learning_rate": 3.801667785249263e-08, + "loss": 0.7674, + 
"step": 22275 + }, + { + "epoch": 3.9659900284900287, + "grad_norm": 0.8143479824066162, + "learning_rate": 3.763172546360494e-08, + "loss": 0.6565, + "step": 22276 + }, + { + "epoch": 3.9661680911680914, + "grad_norm": 0.8467413783073425, + "learning_rate": 3.72487316296688e-08, + "loss": 0.6532, + "step": 22277 + }, + { + "epoch": 3.9663461538461537, + "grad_norm": 0.9195687770843506, + "learning_rate": 3.686769635818932e-08, + "loss": 0.8072, + "step": 22278 + }, + { + "epoch": 3.9665242165242165, + "grad_norm": 0.9875113368034363, + "learning_rate": 3.64886196566272e-08, + "loss": 0.7161, + "step": 22279 + }, + { + "epoch": 3.9667022792022792, + "grad_norm": 1.1833739280700684, + "learning_rate": 3.611150153242093e-08, + "loss": 0.885, + "step": 22280 + }, + { + "epoch": 3.966880341880342, + "grad_norm": 1.0315101146697998, + "learning_rate": 3.5736341992953506e-08, + "loss": 0.7071, + "step": 22281 + }, + { + "epoch": 3.9670584045584043, + "grad_norm": 0.8316394090652466, + "learning_rate": 3.536314104556348e-08, + "loss": 0.5731, + "step": 22282 + }, + { + "epoch": 3.967236467236467, + "grad_norm": 0.9123276472091675, + "learning_rate": 3.4991898697589456e-08, + "loss": 0.6967, + "step": 22283 + }, + { + "epoch": 3.96741452991453, + "grad_norm": 1.0520005226135254, + "learning_rate": 3.462261495628116e-08, + "loss": 0.8838, + "step": 22284 + }, + { + "epoch": 3.9675925925925926, + "grad_norm": 0.9927443265914917, + "learning_rate": 3.4255289828877267e-08, + "loss": 0.7186, + "step": 22285 + }, + { + "epoch": 3.9677706552706553, + "grad_norm": 0.9712579250335693, + "learning_rate": 3.388992332259422e-08, + "loss": 0.8944, + "step": 22286 + }, + { + "epoch": 3.967948717948718, + "grad_norm": 0.897274374961853, + "learning_rate": 3.352651544457075e-08, + "loss": 0.6558, + "step": 22287 + }, + { + "epoch": 3.968126780626781, + "grad_norm": 0.9379400610923767, + "learning_rate": 3.316506620192339e-08, + "loss": 0.9769, + "step": 22288 + }, + { + "epoch": 
3.9683048433048436, + "grad_norm": 0.8776909708976746, + "learning_rate": 3.2805575601757567e-08, + "loss": 0.6874, + "step": 22289 + }, + { + "epoch": 3.968482905982906, + "grad_norm": 0.9665780663490295, + "learning_rate": 3.2448043651089885e-08, + "loss": 0.7002, + "step": 22290 + }, + { + "epoch": 3.9686609686609686, + "grad_norm": 0.8973356485366821, + "learning_rate": 3.2092470356948066e-08, + "loss": 0.6631, + "step": 22291 + }, + { + "epoch": 3.9688390313390314, + "grad_norm": 0.9495517611503601, + "learning_rate": 3.17388557262821e-08, + "loss": 0.7328, + "step": 22292 + }, + { + "epoch": 3.969017094017094, + "grad_norm": 0.9756171703338623, + "learning_rate": 3.1387199766030884e-08, + "loss": 0.7874, + "step": 22293 + }, + { + "epoch": 3.9691951566951564, + "grad_norm": 0.9444931745529175, + "learning_rate": 3.10375024830889e-08, + "loss": 0.6176, + "step": 22294 + }, + { + "epoch": 3.969373219373219, + "grad_norm": 0.9644432663917542, + "learning_rate": 3.068976388428402e-08, + "loss": 0.8901, + "step": 22295 + }, + { + "epoch": 3.969551282051282, + "grad_norm": 0.9551946520805359, + "learning_rate": 3.0343983976455216e-08, + "loss": 0.6592, + "step": 22296 + }, + { + "epoch": 3.9697293447293447, + "grad_norm": 0.9582430124282837, + "learning_rate": 3.000016276636375e-08, + "loss": 0.8671, + "step": 22297 + }, + { + "epoch": 3.9699074074074074, + "grad_norm": 0.972446084022522, + "learning_rate": 2.9658300260748673e-08, + "loss": 0.8244, + "step": 22298 + }, + { + "epoch": 3.97008547008547, + "grad_norm": 1.0726343393325806, + "learning_rate": 2.9318396466304633e-08, + "loss": 0.7543, + "step": 22299 + }, + { + "epoch": 3.970263532763533, + "grad_norm": 0.9716382622718811, + "learning_rate": 2.8980451389704067e-08, + "loss": 0.7246, + "step": 22300 + }, + { + "epoch": 3.9704415954415957, + "grad_norm": 0.9425173401832581, + "learning_rate": 2.8644465037552803e-08, + "loss": 0.7026, + "step": 22301 + }, + { + "epoch": 3.970619658119658, + "grad_norm": 
1.0111706256866455, + "learning_rate": 2.831043741644557e-08, + "loss": 0.6782, + "step": 22302 + }, + { + "epoch": 3.9707977207977208, + "grad_norm": 0.9812050461769104, + "learning_rate": 2.7978368532921574e-08, + "loss": 0.7114, + "step": 22303 + }, + { + "epoch": 3.9709757834757835, + "grad_norm": 1.0532653331756592, + "learning_rate": 2.7648258393486727e-08, + "loss": 0.8216, + "step": 22304 + }, + { + "epoch": 3.9711538461538463, + "grad_norm": 1.0057753324508667, + "learning_rate": 2.7320107004613626e-08, + "loss": 0.6944, + "step": 22305 + }, + { + "epoch": 3.9713319088319086, + "grad_norm": 1.0849117040634155, + "learning_rate": 2.6993914372719364e-08, + "loss": 0.7063, + "step": 22306 + }, + { + "epoch": 3.9715099715099713, + "grad_norm": 1.084551215171814, + "learning_rate": 2.6669680504209925e-08, + "loss": 0.7596, + "step": 22307 + }, + { + "epoch": 3.971688034188034, + "grad_norm": 1.20219886302948, + "learning_rate": 2.634740540543579e-08, + "loss": 0.8109, + "step": 22308 + }, + { + "epoch": 3.971866096866097, + "grad_norm": 0.9706676006317139, + "learning_rate": 2.6027089082691913e-08, + "loss": 0.6832, + "step": 22309 + }, + { + "epoch": 3.9720441595441596, + "grad_norm": 0.899502158164978, + "learning_rate": 2.570873154228437e-08, + "loss": 0.7876, + "step": 22310 + }, + { + "epoch": 3.9722222222222223, + "grad_norm": 0.9696589112281799, + "learning_rate": 2.5392332790430407e-08, + "loss": 0.8242, + "step": 22311 + }, + { + "epoch": 3.972400284900285, + "grad_norm": 0.98177170753479, + "learning_rate": 2.507789283332507e-08, + "loss": 0.7454, + "step": 22312 + }, + { + "epoch": 3.972578347578348, + "grad_norm": 0.8853342533111572, + "learning_rate": 2.4765411677152296e-08, + "loss": 0.8819, + "step": 22313 + }, + { + "epoch": 3.97275641025641, + "grad_norm": 1.0389357805252075, + "learning_rate": 2.4454889328018317e-08, + "loss": 0.7136, + "step": 22314 + }, + { + "epoch": 3.972934472934473, + "grad_norm": 0.9477161765098572, + "learning_rate": 
2.414632579200715e-08, + "loss": 0.7858, + "step": 22315 + }, + { + "epoch": 3.9731125356125356, + "grad_norm": 1.163924217224121, + "learning_rate": 2.3839721075158415e-08, + "loss": 0.8402, + "step": 22316 + }, + { + "epoch": 3.9732905982905984, + "grad_norm": 0.8560773730278015, + "learning_rate": 2.353507518350062e-08, + "loss": 0.6802, + "step": 22317 + }, + { + "epoch": 3.9734686609686607, + "grad_norm": 0.8994771242141724, + "learning_rate": 2.3232388122984562e-08, + "loss": 0.7222, + "step": 22318 + }, + { + "epoch": 3.9736467236467234, + "grad_norm": 0.9608843326568604, + "learning_rate": 2.293165989954993e-08, + "loss": 0.6882, + "step": 22319 + }, + { + "epoch": 3.973824786324786, + "grad_norm": 0.8943131566047668, + "learning_rate": 2.2632890519080907e-08, + "loss": 0.6746, + "step": 22320 + }, + { + "epoch": 3.974002849002849, + "grad_norm": 0.875074565410614, + "learning_rate": 2.233607998742837e-08, + "loss": 0.5955, + "step": 22321 + }, + { + "epoch": 3.9741809116809117, + "grad_norm": 0.922785758972168, + "learning_rate": 2.204122831042099e-08, + "loss": 0.6879, + "step": 22322 + }, + { + "epoch": 3.9743589743589745, + "grad_norm": 1.0186944007873535, + "learning_rate": 2.174833549383193e-08, + "loss": 0.7392, + "step": 22323 + }, + { + "epoch": 3.974537037037037, + "grad_norm": 0.9766637682914734, + "learning_rate": 2.1457401543401034e-08, + "loss": 0.838, + "step": 22324 + }, + { + "epoch": 3.9747150997151, + "grad_norm": 1.1547154188156128, + "learning_rate": 2.1168426464823754e-08, + "loss": 1.0063, + "step": 22325 + }, + { + "epoch": 3.9748931623931623, + "grad_norm": 0.9768098592758179, + "learning_rate": 2.0881410263751123e-08, + "loss": 0.8346, + "step": 22326 + }, + { + "epoch": 3.975071225071225, + "grad_norm": 1.011702299118042, + "learning_rate": 2.0596352945834173e-08, + "loss": 0.7964, + "step": 22327 + }, + { + "epoch": 3.9752492877492878, + "grad_norm": 1.0046590566635132, + "learning_rate": 2.0313254516635126e-08, + "loss": 1.094, 
+ "step": 22328 + }, + { + "epoch": 3.9754273504273505, + "grad_norm": 1.0097856521606445, + "learning_rate": 2.0032114981705098e-08, + "loss": 0.6802, + "step": 22329 + }, + { + "epoch": 3.9756054131054133, + "grad_norm": 0.908306896686554, + "learning_rate": 1.975293434656189e-08, + "loss": 0.7077, + "step": 22330 + }, + { + "epoch": 3.9757834757834756, + "grad_norm": 0.918569028377533, + "learning_rate": 1.9475712616667808e-08, + "loss": 0.6482, + "step": 22331 + }, + { + "epoch": 3.9759615384615383, + "grad_norm": 0.978238046169281, + "learning_rate": 1.9200449797451835e-08, + "loss": 0.8541, + "step": 22332 + }, + { + "epoch": 3.976139601139601, + "grad_norm": 1.020745038986206, + "learning_rate": 1.8927145894309662e-08, + "loss": 0.7957, + "step": 22333 + }, + { + "epoch": 3.976317663817664, + "grad_norm": 0.9351560473442078, + "learning_rate": 1.865580091260366e-08, + "loss": 0.7603, + "step": 22334 + }, + { + "epoch": 3.9764957264957266, + "grad_norm": 1.0288119316101074, + "learning_rate": 1.8386414857640698e-08, + "loss": 0.6967, + "step": 22335 + }, + { + "epoch": 3.9766737891737893, + "grad_norm": 0.9172123074531555, + "learning_rate": 1.8118987734694337e-08, + "loss": 0.6197, + "step": 22336 + }, + { + "epoch": 3.976851851851852, + "grad_norm": 0.9761427044868469, + "learning_rate": 1.785351954902703e-08, + "loss": 0.9306, + "step": 22337 + }, + { + "epoch": 3.9770299145299144, + "grad_norm": 1.0278958082199097, + "learning_rate": 1.7590010305812422e-08, + "loss": 0.7256, + "step": 22338 + }, + { + "epoch": 3.977207977207977, + "grad_norm": 0.8604381680488586, + "learning_rate": 1.7328460010235247e-08, + "loss": 0.696, + "step": 22339 + }, + { + "epoch": 3.97738603988604, + "grad_norm": 0.9066053628921509, + "learning_rate": 1.7068868667402538e-08, + "loss": 0.7578, + "step": 22340 + }, + { + "epoch": 3.9775641025641026, + "grad_norm": 1.0091204643249512, + "learning_rate": 1.6811236282421315e-08, + "loss": 0.7656, + "step": 22341 + }, + { + "epoch": 
3.9777421652421654, + "grad_norm": 0.9811164140701294, + "learning_rate": 1.6555562860320895e-08, + "loss": 0.8741, + "step": 22342 + }, + { + "epoch": 3.9779202279202277, + "grad_norm": 1.1655586957931519, + "learning_rate": 1.630184840610838e-08, + "loss": 0.8747, + "step": 22343 + }, + { + "epoch": 3.9780982905982905, + "grad_norm": 0.9563546180725098, + "learning_rate": 1.6050092924768666e-08, + "loss": 0.7394, + "step": 22344 + }, + { + "epoch": 3.978276353276353, + "grad_norm": 0.9154346585273743, + "learning_rate": 1.5800296421231153e-08, + "loss": 0.6454, + "step": 22345 + }, + { + "epoch": 3.978454415954416, + "grad_norm": 0.9271607398986816, + "learning_rate": 1.555245890038082e-08, + "loss": 0.8219, + "step": 22346 + }, + { + "epoch": 3.9786324786324787, + "grad_norm": 0.8896580934524536, + "learning_rate": 1.5306580367091537e-08, + "loss": 0.5634, + "step": 22347 + }, + { + "epoch": 3.9788105413105415, + "grad_norm": 0.8249130249023438, + "learning_rate": 1.506266082615948e-08, + "loss": 0.6459, + "step": 22348 + }, + { + "epoch": 3.978988603988604, + "grad_norm": 0.9651779532432556, + "learning_rate": 1.4820700282380806e-08, + "loss": 0.9841, + "step": 22349 + }, + { + "epoch": 3.9791666666666665, + "grad_norm": 1.0503604412078857, + "learning_rate": 1.4580698740485066e-08, + "loss": 0.8412, + "step": 22350 + }, + { + "epoch": 3.9793447293447293, + "grad_norm": 0.9394381046295166, + "learning_rate": 1.4342656205179606e-08, + "loss": 0.6783, + "step": 22351 + }, + { + "epoch": 3.979522792022792, + "grad_norm": 1.0114195346832275, + "learning_rate": 1.4106572681127361e-08, + "loss": 0.8506, + "step": 22352 + }, + { + "epoch": 3.9797008547008548, + "grad_norm": 0.908707320690155, + "learning_rate": 1.3872448172957964e-08, + "loss": 0.6999, + "step": 22353 + }, + { + "epoch": 3.9798789173789175, + "grad_norm": 1.007361888885498, + "learning_rate": 1.3640282685256633e-08, + "loss": 0.7783, + "step": 22354 + }, + { + "epoch": 3.98005698005698, + "grad_norm": 
1.1280618906021118, + "learning_rate": 1.3410076222564184e-08, + "loss": 0.8988, + "step": 22355 + }, + { + "epoch": 3.9802350427350426, + "grad_norm": 0.9843003153800964, + "learning_rate": 1.3181828789388118e-08, + "loss": 0.8007, + "step": 22356 + }, + { + "epoch": 3.9804131054131053, + "grad_norm": 1.0101714134216309, + "learning_rate": 1.295554039022484e-08, + "loss": 0.7549, + "step": 22357 + }, + { + "epoch": 3.980591168091168, + "grad_norm": 0.9103354215621948, + "learning_rate": 1.273121102949304e-08, + "loss": 0.7031, + "step": 22358 + }, + { + "epoch": 3.980769230769231, + "grad_norm": 0.8771821856498718, + "learning_rate": 1.2508840711578095e-08, + "loss": 0.7576, + "step": 22359 + }, + { + "epoch": 3.9809472934472936, + "grad_norm": 0.9804820418357849, + "learning_rate": 1.2288429440854288e-08, + "loss": 0.7814, + "step": 22360 + }, + { + "epoch": 3.9811253561253563, + "grad_norm": 0.8193320035934448, + "learning_rate": 1.2069977221618178e-08, + "loss": 0.6688, + "step": 22361 + }, + { + "epoch": 3.9813034188034186, + "grad_norm": 0.9540324211120605, + "learning_rate": 1.1853484058177433e-08, + "loss": 0.9382, + "step": 22362 + }, + { + "epoch": 3.9814814814814814, + "grad_norm": 0.850046694278717, + "learning_rate": 1.1638949954762002e-08, + "loss": 0.6782, + "step": 22363 + }, + { + "epoch": 3.981659544159544, + "grad_norm": 0.8397730588912964, + "learning_rate": 1.1426374915568528e-08, + "loss": 0.689, + "step": 22364 + }, + { + "epoch": 3.981837606837607, + "grad_norm": 0.9481822848320007, + "learning_rate": 1.1215758944760346e-08, + "loss": 0.8216, + "step": 22365 + }, + { + "epoch": 3.9820156695156697, + "grad_norm": 0.882338285446167, + "learning_rate": 1.100710204647859e-08, + "loss": 0.7705, + "step": 22366 + }, + { + "epoch": 3.982193732193732, + "grad_norm": 0.9093656539916992, + "learning_rate": 1.0800404224797778e-08, + "loss": 0.5796, + "step": 22367 + }, + { + "epoch": 3.9823717948717947, + "grad_norm": 1.0681262016296387, + 
"learning_rate": 1.0595665483781324e-08, + "loss": 0.7811, + "step": 22368 + }, + { + "epoch": 3.9825498575498575, + "grad_norm": 1.0023845434188843, + "learning_rate": 1.0392885827426036e-08, + "loss": 0.565, + "step": 22369 + }, + { + "epoch": 3.98272792022792, + "grad_norm": 0.9780094027519226, + "learning_rate": 1.019206525971761e-08, + "loss": 0.8498, + "step": 22370 + }, + { + "epoch": 3.982905982905983, + "grad_norm": 1.0917373895645142, + "learning_rate": 9.993203784586236e-09, + "loss": 0.6962, + "step": 22371 + }, + { + "epoch": 3.9830840455840457, + "grad_norm": 1.0024151802062988, + "learning_rate": 9.796301405917695e-09, + "loss": 0.7992, + "step": 22372 + }, + { + "epoch": 3.9832621082621085, + "grad_norm": 1.0047333240509033, + "learning_rate": 9.601358127586668e-09, + "loss": 0.8248, + "step": 22373 + }, + { + "epoch": 3.9834401709401708, + "grad_norm": 0.8688977956771851, + "learning_rate": 9.408373953401218e-09, + "loss": 0.6448, + "step": 22374 + }, + { + "epoch": 3.9836182336182335, + "grad_norm": 1.1232283115386963, + "learning_rate": 9.217348887147204e-09, + "loss": 0.7539, + "step": 22375 + }, + { + "epoch": 3.9837962962962963, + "grad_norm": 0.8725656270980835, + "learning_rate": 9.028282932566078e-09, + "loss": 0.7398, + "step": 22376 + }, + { + "epoch": 3.983974358974359, + "grad_norm": 0.9702167510986328, + "learning_rate": 8.841176093365988e-09, + "loss": 0.7629, + "step": 22377 + }, + { + "epoch": 3.984152421652422, + "grad_norm": 0.9710015058517456, + "learning_rate": 8.656028373210668e-09, + "loss": 0.9984, + "step": 22378 + }, + { + "epoch": 3.984330484330484, + "grad_norm": 1.1007386445999146, + "learning_rate": 8.472839775719443e-09, + "loss": 0.8551, + "step": 22379 + }, + { + "epoch": 3.984508547008547, + "grad_norm": 0.9243883490562439, + "learning_rate": 8.291610304489439e-09, + "loss": 0.7902, + "step": 22380 + }, + { + "epoch": 3.9846866096866096, + "grad_norm": 1.0163289308547974, + "learning_rate": 8.112339963073368e-09, + 
"loss": 0.755, + "step": 22381 + }, + { + "epoch": 3.9848646723646723, + "grad_norm": 0.9340982437133789, + "learning_rate": 7.935028754979534e-09, + "loss": 0.8079, + "step": 22382 + }, + { + "epoch": 3.985042735042735, + "grad_norm": 0.9406797289848328, + "learning_rate": 7.759676683682937e-09, + "loss": 0.6836, + "step": 22383 + }, + { + "epoch": 3.985220797720798, + "grad_norm": 0.9695764183998108, + "learning_rate": 7.586283752614165e-09, + "loss": 0.6901, + "step": 22384 + }, + { + "epoch": 3.9853988603988606, + "grad_norm": 1.0253673791885376, + "learning_rate": 7.4148499651927054e-09, + "loss": 0.6025, + "step": 22385 + }, + { + "epoch": 3.9855769230769234, + "grad_norm": 0.9360101819038391, + "learning_rate": 7.245375324749226e-09, + "loss": 0.7924, + "step": 22386 + }, + { + "epoch": 3.9857549857549857, + "grad_norm": 1.0131568908691406, + "learning_rate": 7.077859834614397e-09, + "loss": 0.7307, + "step": 22387 + }, + { + "epoch": 3.9859330484330484, + "grad_norm": 1.0636998414993286, + "learning_rate": 6.912303498074479e-09, + "loss": 0.8381, + "step": 22388 + }, + { + "epoch": 3.986111111111111, + "grad_norm": 1.13767671585083, + "learning_rate": 6.748706318371323e-09, + "loss": 0.9167, + "step": 22389 + }, + { + "epoch": 3.986289173789174, + "grad_norm": 0.9454557299613953, + "learning_rate": 6.58706829870237e-09, + "loss": 0.7844, + "step": 22390 + }, + { + "epoch": 3.986467236467236, + "grad_norm": 0.9187841415405273, + "learning_rate": 6.427389442242859e-09, + "loss": 0.6001, + "step": 22391 + }, + { + "epoch": 3.986645299145299, + "grad_norm": 1.2162847518920898, + "learning_rate": 6.269669752123619e-09, + "loss": 0.7899, + "step": 22392 + }, + { + "epoch": 3.9868233618233617, + "grad_norm": 0.8310156464576721, + "learning_rate": 6.11390923143107e-09, + "loss": 0.5653, + "step": 22393 + }, + { + "epoch": 3.9870014245014245, + "grad_norm": 1.0062121152877808, + "learning_rate": 5.960107883218324e-09, + "loss": 0.7532, + "step": 22394 + }, + { + 
"epoch": 3.9871794871794872, + "grad_norm": 0.9466153383255005, + "learning_rate": 5.808265710494087e-09, + "loss": 0.8552, + "step": 22395 + }, + { + "epoch": 3.98735754985755, + "grad_norm": 0.9300433397293091, + "learning_rate": 5.658382716244859e-09, + "loss": 0.6727, + "step": 22396 + }, + { + "epoch": 3.9875356125356127, + "grad_norm": 1.028559684753418, + "learning_rate": 5.510458903390525e-09, + "loss": 0.7283, + "step": 22397 + }, + { + "epoch": 3.9877136752136755, + "grad_norm": 1.0546510219573975, + "learning_rate": 5.364494274839871e-09, + "loss": 0.5589, + "step": 22398 + }, + { + "epoch": 3.987891737891738, + "grad_norm": 0.8639279007911682, + "learning_rate": 5.220488833457271e-09, + "loss": 0.7989, + "step": 22399 + }, + { + "epoch": 3.9880698005698005, + "grad_norm": 0.9834537506103516, + "learning_rate": 5.07844258205159e-09, + "loss": 0.694, + "step": 22400 + }, + { + "epoch": 3.9882478632478633, + "grad_norm": 0.9686489701271057, + "learning_rate": 4.93835552342059e-09, + "loss": 0.8029, + "step": 22401 + }, + { + "epoch": 3.988425925925926, + "grad_norm": 0.9237646460533142, + "learning_rate": 4.80022766029542e-09, + "loss": 0.7817, + "step": 22402 + }, + { + "epoch": 3.9886039886039883, + "grad_norm": 1.0533924102783203, + "learning_rate": 4.664058995385023e-09, + "loss": 0.5459, + "step": 22403 + }, + { + "epoch": 3.988782051282051, + "grad_norm": 1.1050822734832764, + "learning_rate": 4.529849531353936e-09, + "loss": 0.7622, + "step": 22404 + }, + { + "epoch": 3.988960113960114, + "grad_norm": 0.8912914395332336, + "learning_rate": 4.39759927085559e-09, + "loss": 0.7879, + "step": 22405 + }, + { + "epoch": 3.9891381766381766, + "grad_norm": 0.802912175655365, + "learning_rate": 4.2673082164434995e-09, + "loss": 0.559, + "step": 22406 + }, + { + "epoch": 3.9893162393162394, + "grad_norm": 1.0098425149917603, + "learning_rate": 4.1389763707044835e-09, + "loss": 0.809, + "step": 22407 + }, + { + "epoch": 3.989494301994302, + "grad_norm": 
0.8691425323486328, + "learning_rate": 4.012603736136544e-09, + "loss": 0.6097, + "step": 22408 + }, + { + "epoch": 3.989672364672365, + "grad_norm": 0.8382730484008789, + "learning_rate": 3.888190315215479e-09, + "loss": 0.6292, + "step": 22409 + }, + { + "epoch": 3.9898504273504276, + "grad_norm": 1.00441312789917, + "learning_rate": 3.7657361103837776e-09, + "loss": 0.8296, + "step": 22410 + }, + { + "epoch": 3.99002849002849, + "grad_norm": 0.9702491164207458, + "learning_rate": 3.645241124039522e-09, + "loss": 0.8129, + "step": 22411 + }, + { + "epoch": 3.9902065527065527, + "grad_norm": 1.0183627605438232, + "learning_rate": 3.5267053585363863e-09, + "loss": 0.9345, + "step": 22412 + }, + { + "epoch": 3.9903846153846154, + "grad_norm": 1.120223045349121, + "learning_rate": 3.4101288162058377e-09, + "loss": 0.9309, + "step": 22413 + }, + { + "epoch": 3.990562678062678, + "grad_norm": 1.0526723861694336, + "learning_rate": 3.2955114993238336e-09, + "loss": 0.7433, + "step": 22414 + }, + { + "epoch": 3.9907407407407405, + "grad_norm": 0.9110050201416016, + "learning_rate": 3.182853410155229e-09, + "loss": 0.6849, + "step": 22415 + }, + { + "epoch": 3.9909188034188032, + "grad_norm": 0.9287052154541016, + "learning_rate": 3.0721545508760606e-09, + "loss": 0.7313, + "step": 22416 + }, + { + "epoch": 3.991096866096866, + "grad_norm": 0.9603120684623718, + "learning_rate": 2.9634149236845708e-09, + "loss": 0.7157, + "step": 22417 + }, + { + "epoch": 3.9912749287749287, + "grad_norm": 0.8682617545127869, + "learning_rate": 2.856634530690183e-09, + "loss": 0.7321, + "step": 22418 + }, + { + "epoch": 3.9914529914529915, + "grad_norm": 0.894698441028595, + "learning_rate": 2.751813374002321e-09, + "loss": 0.7382, + "step": 22419 + }, + { + "epoch": 3.9916310541310542, + "grad_norm": 0.868032693862915, + "learning_rate": 2.648951455663795e-09, + "loss": 0.7712, + "step": 22420 + }, + { + "epoch": 3.991809116809117, + "grad_norm": 0.9529984593391418, + "learning_rate": 
2.548048777695211e-09, + "loss": 0.917, + "step": 22421 + }, + { + "epoch": 3.9919871794871797, + "grad_norm": 0.9214971661567688, + "learning_rate": 2.4491053420616637e-09, + "loss": 0.9054, + "step": 22422 + }, + { + "epoch": 3.992165242165242, + "grad_norm": 1.053322434425354, + "learning_rate": 2.352121150717146e-09, + "loss": 0.9304, + "step": 22423 + }, + { + "epoch": 3.992343304843305, + "grad_norm": 0.9296656250953674, + "learning_rate": 2.2570962055601385e-09, + "loss": 0.9732, + "step": 22424 + }, + { + "epoch": 3.9925213675213675, + "grad_norm": 1.047304630279541, + "learning_rate": 2.1640305084447144e-09, + "loss": 0.6625, + "step": 22425 + }, + { + "epoch": 3.9926994301994303, + "grad_norm": 0.9102067947387695, + "learning_rate": 2.072924061191639e-09, + "loss": 0.6529, + "step": 22426 + }, + { + "epoch": 3.9928774928774926, + "grad_norm": 1.0134507417678833, + "learning_rate": 1.983776865599474e-09, + "loss": 0.8424, + "step": 22427 + }, + { + "epoch": 3.9930555555555554, + "grad_norm": 0.8566016554832458, + "learning_rate": 1.8965889234001666e-09, + "loss": 0.5787, + "step": 22428 + }, + { + "epoch": 3.993233618233618, + "grad_norm": 1.1094683408737183, + "learning_rate": 1.8113602363145632e-09, + "loss": 0.8465, + "step": 22429 + }, + { + "epoch": 3.993411680911681, + "grad_norm": 0.9093437790870667, + "learning_rate": 1.728090806007998e-09, + "loss": 0.6979, + "step": 22430 + }, + { + "epoch": 3.9935897435897436, + "grad_norm": 0.9416308999061584, + "learning_rate": 1.6467806341124991e-09, + "loss": 0.8162, + "step": 22431 + }, + { + "epoch": 3.9937678062678064, + "grad_norm": 0.9535760283470154, + "learning_rate": 1.5674297222156852e-09, + "loss": 0.7527, + "step": 22432 + }, + { + "epoch": 3.993945868945869, + "grad_norm": 1.0274043083190918, + "learning_rate": 1.4900380718718688e-09, + "loss": 0.8621, + "step": 22433 + }, + { + "epoch": 3.994123931623932, + "grad_norm": 0.9442184567451477, + "learning_rate": 1.414605684602055e-09, + "loss": 
0.4077, + "step": 22434 + }, + { + "epoch": 3.994301994301994, + "grad_norm": 0.9062432050704956, + "learning_rate": 1.341132561893943e-09, + "loss": 0.8457, + "step": 22435 + }, + { + "epoch": 3.994480056980057, + "grad_norm": 1.0084505081176758, + "learning_rate": 1.2696187051686182e-09, + "loss": 0.8652, + "step": 22436 + }, + { + "epoch": 3.9946581196581197, + "grad_norm": 0.8726778030395508, + "learning_rate": 1.200064115847166e-09, + "loss": 0.7724, + "step": 22437 + }, + { + "epoch": 3.9948361823361824, + "grad_norm": 0.927804708480835, + "learning_rate": 1.1324687952729562e-09, + "loss": 0.9643, + "step": 22438 + }, + { + "epoch": 3.995014245014245, + "grad_norm": 0.9145159721374512, + "learning_rate": 1.0668327447782567e-09, + "loss": 0.7232, + "step": 22439 + }, + { + "epoch": 3.9951923076923075, + "grad_norm": 0.8730134963989258, + "learning_rate": 1.0031559656398238e-09, + "loss": 0.6696, + "step": 22440 + }, + { + "epoch": 3.9953703703703702, + "grad_norm": 0.884189248085022, + "learning_rate": 9.414384591233116e-10, + "loss": 0.7042, + "step": 22441 + }, + { + "epoch": 3.995548433048433, + "grad_norm": 1.069779396057129, + "learning_rate": 8.816802264388635e-10, + "loss": 0.7686, + "step": 22442 + }, + { + "epoch": 3.9957264957264957, + "grad_norm": 0.9624055624008179, + "learning_rate": 8.238812687300091e-10, + "loss": 0.7969, + "step": 22443 + }, + { + "epoch": 3.9959045584045585, + "grad_norm": 0.9532380700111389, + "learning_rate": 7.680415871624825e-10, + "loss": 0.7705, + "step": 22444 + }, + { + "epoch": 3.9960826210826212, + "grad_norm": 0.8498616218566895, + "learning_rate": 7.141611828020977e-10, + "loss": 0.6918, + "step": 22445 + }, + { + "epoch": 3.996260683760684, + "grad_norm": 0.897534966468811, + "learning_rate": 6.622400567257714e-10, + "loss": 0.6194, + "step": 22446 + }, + { + "epoch": 3.9964387464387463, + "grad_norm": 0.9699941873550415, + "learning_rate": 6.122782099438063e-10, + "loss": 0.7863, + "step": 22447 + }, + { + 
"epoch": 3.996616809116809, + "grad_norm": 1.0888839960098267, + "learning_rate": 5.642756434220964e-10, + "loss": 0.8153, + "step": 22448 + }, + { + "epoch": 3.996794871794872, + "grad_norm": 0.9677609801292419, + "learning_rate": 5.182323581265358e-10, + "loss": 0.933, + "step": 22449 + }, + { + "epoch": 3.9969729344729346, + "grad_norm": 1.1706438064575195, + "learning_rate": 4.741483549342008e-10, + "loss": 0.877, + "step": 22450 + }, + { + "epoch": 3.9971509971509973, + "grad_norm": 0.9511678814888, + "learning_rate": 4.3202363472216733e-10, + "loss": 0.6967, + "step": 22451 + }, + { + "epoch": 3.9973290598290596, + "grad_norm": 1.0145454406738281, + "learning_rate": 3.9185819831200067e-10, + "loss": 0.8106, + "step": 22452 + }, + { + "epoch": 3.9975071225071224, + "grad_norm": 0.9555829763412476, + "learning_rate": 3.536520465030613e-10, + "loss": 0.6381, + "step": 22453 + }, + { + "epoch": 3.997685185185185, + "grad_norm": 0.8437405228614807, + "learning_rate": 3.174051800280964e-10, + "loss": 0.5492, + "step": 22454 + }, + { + "epoch": 3.997863247863248, + "grad_norm": 0.965201735496521, + "learning_rate": 2.831175995976487e-10, + "loss": 0.7768, + "step": 22455 + }, + { + "epoch": 3.9980413105413106, + "grad_norm": 1.0718035697937012, + "learning_rate": 2.5078930588895433e-10, + "loss": 0.7831, + "step": 22456 + }, + { + "epoch": 3.9982193732193734, + "grad_norm": 0.937907874584198, + "learning_rate": 2.2042029953484033e-10, + "loss": 0.6929, + "step": 22457 + }, + { + "epoch": 3.998397435897436, + "grad_norm": 1.2005555629730225, + "learning_rate": 1.9201058113482717e-10, + "loss": 0.8616, + "step": 22458 + }, + { + "epoch": 3.9985754985754984, + "grad_norm": 1.012808918952942, + "learning_rate": 1.6556015123292412e-10, + "loss": 0.6396, + "step": 22459 + }, + { + "epoch": 3.998753561253561, + "grad_norm": 0.8904356360435486, + "learning_rate": 1.4106901036203823e-10, + "loss": 0.6905, + "step": 22460 + }, + { + "epoch": 3.998931623931624, + "grad_norm": 
0.8998084664344788, + "learning_rate": 1.185371589884632e-10, + "loss": 0.6481, + "step": 22461 + }, + { + "epoch": 3.9991096866096867, + "grad_norm": 1.0224751234054565, + "learning_rate": 9.79645975673904e-11, + "loss": 0.9935, + "step": 22462 + }, + { + "epoch": 3.9992877492877494, + "grad_norm": Infinity, + "learning_rate": 9.79645975673904e-11, + "loss": 0.7249, + "step": 22463 + }, + { + "epoch": 3.9994658119658117, + "grad_norm": 0.9459518790245056, + "learning_rate": 7.935132648739796e-11, + "loss": 0.7522, + "step": 22464 + }, + { + "epoch": 3.9994658119658117, + "eval_loss": 1.177049160003662, + "eval_runtime": 25.8941, + "eval_samples_per_second": 40.202, + "eval_steps_per_second": 20.12, + "step": 22464 + } + ], + "logging_steps": 1, + "max_steps": 22464, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 5616, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 8.674333832353382e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-22464/training_args.bin b/checkpoint-22464/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..1245f6a2afbe9a6eefbb6d141231d555e0b0bf84 --- /dev/null +++ b/checkpoint-22464/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86de370014ed2be86ea27c820b434ceec5e097da2b5f9b08d0eac9aa564d8961 +size 6200 diff --git a/checkpoint-5616/README.md b/checkpoint-5616/README.md new file mode 100644 index 0000000000000000000000000000000000000000..719b4726992f7d0707a4253e9123dec35e4de390 --- /dev/null +++ b/checkpoint-5616/README.md @@ -0,0 +1,202 @@ +--- +base_model: openlm-research/open_llama_3b_v2 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + 
+ +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. 
(2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/checkpoint-5616/adapter_config.json b/checkpoint-5616/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6b6f20a570fc808390da3f2e001093ac1e56c1da --- /dev/null +++ b/checkpoint-5616/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "openlm-research/open_llama_3b_v2", + "bias": "none", + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "down_proj", + "o_proj", + "q_proj", + "up_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff 
--git a/checkpoint-5616/adapter_model.safetensors b/checkpoint-5616/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..dd5e7b14c97cfc3ef09445a2e53a4b1a3983820d --- /dev/null +++ b/checkpoint-5616/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efd66c77b4c02ea9aa38b1686fca713fdffa16a78794d032ec9c8257cf23a895 +size 50899792 diff --git a/checkpoint-5616/optimizer.pt b/checkpoint-5616/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..b7bc80f428fba1b747af09ee0b8111c8a636bf32 --- /dev/null +++ b/checkpoint-5616/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f8c777c25aeaeaaaa9324690795239160c715d50d22636f64a3a82bb3f06d9b +size 26231684 diff --git a/checkpoint-5616/rng_state.pth b/checkpoint-5616/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..41dfa7d7903dea42d227bad638c2c750928d590c --- /dev/null +++ b/checkpoint-5616/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c062f7f375beded48b5337f5a3f3a5cb38807fa3e85dbf3e294c0ab6b627bfc2 +size 14244 diff --git a/checkpoint-5616/scheduler.pt b/checkpoint-5616/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..90dbb49a7ac95509a5c54bf578be193ee7e1ee9e --- /dev/null +++ b/checkpoint-5616/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4164a586e6ce7816c769ce21463fb39cb8c13a77d5e42f55c5f48dcac354572b +size 1064 diff --git a/checkpoint-5616/special_tokens_map.json b/checkpoint-5616/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..72ecfeeb7e14d244c936169d2ed139eeae235ef1 --- /dev/null +++ b/checkpoint-5616/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + 
"content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-5616/tokenizer.model b/checkpoint-5616/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..98866ff8ae3631f331c57923c921a0c9ad22b97d --- /dev/null +++ b/checkpoint-5616/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91b289e85fa20fd375d8b33dc12f77616f18abc6359804471d1fafcb425fecb8 +size 511574 diff --git a/checkpoint-5616/tokenizer_config.json b/checkpoint-5616/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c218d1b7228e3ad6055bdcf0ec15c4f188dc7d79 --- /dev/null +++ b/checkpoint-5616/tokenizer_config.json @@ -0,0 +1,43 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": true, + "model_max_length": 2048, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false, + "use_fast": true +} diff --git a/checkpoint-5616/trainer_state.json b/checkpoint-5616/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c7e18c77dc0fb255b3d719d75c3bab953eebb8d4 --- /dev/null +++ 
b/checkpoint-5616/trainer_state.json @@ -0,0 +1,39385 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 1404, + "global_step": 5616, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00017806267806267807, + "grad_norm": 0.2854898273944855, + "learning_rate": 1e-05, + "loss": 1.1997, + "step": 1 + }, + { + "epoch": 0.00017806267806267807, + "eval_loss": 1.3698358535766602, + "eval_runtime": 24.1591, + "eval_samples_per_second": 43.089, + "eval_steps_per_second": 21.565, + "step": 1 + }, + { + "epoch": 0.00035612535612535614, + "grad_norm": 0.3508087396621704, + "learning_rate": 2e-05, + "loss": 1.4134, + "step": 2 + }, + { + "epoch": 0.0005341880341880342, + "grad_norm": 0.27050870656967163, + "learning_rate": 3e-05, + "loss": 1.3447, + "step": 3 + }, + { + "epoch": 0.0007122507122507123, + "grad_norm": 0.27706292271614075, + "learning_rate": 4e-05, + "loss": 1.0354, + "step": 4 + }, + { + "epoch": 0.0008903133903133903, + "grad_norm": 0.30398961901664734, + "learning_rate": 5e-05, + "loss": 1.1441, + "step": 5 + }, + { + "epoch": 0.0010683760683760685, + "grad_norm": 0.3103881776332855, + "learning_rate": 6e-05, + "loss": 1.341, + "step": 6 + }, + { + "epoch": 0.0012464387464387464, + "grad_norm": 0.5191189646720886, + "learning_rate": 7e-05, + "loss": 1.3457, + "step": 7 + }, + { + "epoch": 0.0014245014245014246, + "grad_norm": 0.4449467360973358, + "learning_rate": 8e-05, + "loss": 1.5051, + "step": 8 + }, + { + "epoch": 0.0016025641025641025, + "grad_norm": 0.3914581537246704, + "learning_rate": 9e-05, + "loss": 1.5525, + "step": 9 + }, + { + "epoch": 0.0017806267806267807, + "grad_norm": 0.37746086716651917, + "learning_rate": 0.0001, + "loss": 1.3266, + "step": 10 + }, + { + "epoch": 0.001958689458689459, + "grad_norm": 0.35226109623908997, + "learning_rate": 0.00011000000000000002, + "loss": 1.5416, + "step": 11 + }, + { + "epoch": 
0.002136752136752137, + "grad_norm": 0.3343672454357147, + "learning_rate": 0.00012, + "loss": 1.3221, + "step": 12 + }, + { + "epoch": 0.0023148148148148147, + "grad_norm": 0.47298333048820496, + "learning_rate": 0.00013000000000000002, + "loss": 1.2999, + "step": 13 + }, + { + "epoch": 0.002492877492877493, + "grad_norm": 0.377814918756485, + "learning_rate": 0.00014, + "loss": 1.1688, + "step": 14 + }, + { + "epoch": 0.002670940170940171, + "grad_norm": 0.46344801783561707, + "learning_rate": 0.00015000000000000001, + "loss": 1.3565, + "step": 15 + }, + { + "epoch": 0.002849002849002849, + "grad_norm": 0.49615249037742615, + "learning_rate": 0.00016, + "loss": 1.5692, + "step": 16 + }, + { + "epoch": 0.003027065527065527, + "grad_norm": 0.5109946131706238, + "learning_rate": 0.00017, + "loss": 1.2991, + "step": 17 + }, + { + "epoch": 0.003205128205128205, + "grad_norm": 0.5125070214271545, + "learning_rate": 0.00018, + "loss": 1.3309, + "step": 18 + }, + { + "epoch": 0.003383190883190883, + "grad_norm": 0.4517767131328583, + "learning_rate": 0.00019, + "loss": 1.357, + "step": 19 + }, + { + "epoch": 0.0035612535612535613, + "grad_norm": 0.47267794609069824, + "learning_rate": 0.0002, + "loss": 1.1301, + "step": 20 + }, + { + "epoch": 0.0037393162393162395, + "grad_norm": 0.46823424100875854, + "learning_rate": 0.00019999999902035388, + "loss": 1.1195, + "step": 21 + }, + { + "epoch": 0.003917378917378918, + "grad_norm": 0.440036803483963, + "learning_rate": 0.00019999999608141548, + "loss": 1.2822, + "step": 22 + }, + { + "epoch": 0.004095441595441595, + "grad_norm": 0.371101975440979, + "learning_rate": 0.00019999999118318492, + "loss": 1.132, + "step": 23 + }, + { + "epoch": 0.004273504273504274, + "grad_norm": 0.44691094756126404, + "learning_rate": 0.00019999998432566226, + "loss": 1.2968, + "step": 24 + }, + { + "epoch": 0.004451566951566952, + "grad_norm": 0.5462725162506104, + "learning_rate": 0.0001999999755088476, + "loss": 1.1714, + "step": 25 + }, + { 
+ "epoch": 0.004629629629629629, + "grad_norm": 0.39860013127326965, + "learning_rate": 0.0001999999647327412, + "loss": 1.0407, + "step": 26 + }, + { + "epoch": 0.004807692307692308, + "grad_norm": 0.5031934380531311, + "learning_rate": 0.0001999999519973432, + "loss": 1.2773, + "step": 27 + }, + { + "epoch": 0.004985754985754986, + "grad_norm": 0.42162764072418213, + "learning_rate": 0.0001999999373026539, + "loss": 1.2824, + "step": 28 + }, + { + "epoch": 0.005163817663817663, + "grad_norm": 0.40964868664741516, + "learning_rate": 0.00019999992064867353, + "loss": 1.226, + "step": 29 + }, + { + "epoch": 0.005341880341880342, + "grad_norm": 0.41650915145874023, + "learning_rate": 0.00019999990203540245, + "loss": 1.2677, + "step": 30 + }, + { + "epoch": 0.00551994301994302, + "grad_norm": 0.40052226185798645, + "learning_rate": 0.00019999988146284103, + "loss": 0.9443, + "step": 31 + }, + { + "epoch": 0.005698005698005698, + "grad_norm": 0.5198387503623962, + "learning_rate": 0.00019999985893098964, + "loss": 1.3043, + "step": 32 + }, + { + "epoch": 0.005876068376068376, + "grad_norm": 0.50941002368927, + "learning_rate": 0.00019999983443984878, + "loss": 1.2002, + "step": 33 + }, + { + "epoch": 0.006054131054131054, + "grad_norm": 0.30082932114601135, + "learning_rate": 0.00019999980798941888, + "loss": 0.9904, + "step": 34 + }, + { + "epoch": 0.006232193732193732, + "grad_norm": 0.4228935241699219, + "learning_rate": 0.00019999977957970048, + "loss": 1.1137, + "step": 35 + }, + { + "epoch": 0.00641025641025641, + "grad_norm": 0.41294750571250916, + "learning_rate": 0.0001999997492106941, + "loss": 1.3385, + "step": 36 + }, + { + "epoch": 0.006588319088319089, + "grad_norm": 0.4415493905544281, + "learning_rate": 0.00019999971688240041, + "loss": 1.1695, + "step": 37 + }, + { + "epoch": 0.006766381766381766, + "grad_norm": 0.3726460933685303, + "learning_rate": 0.00019999968259482, + "loss": 1.1734, + "step": 38 + }, + { + "epoch": 0.006944444444444444, + 
"grad_norm": 0.3969627320766449, + "learning_rate": 0.0001999996463479535, + "loss": 1.1209, + "step": 39 + }, + { + "epoch": 0.007122507122507123, + "grad_norm": 0.3779667913913727, + "learning_rate": 0.0001999996081418017, + "loss": 1.1635, + "step": 40 + }, + { + "epoch": 0.0073005698005698, + "grad_norm": 0.3933636546134949, + "learning_rate": 0.0001999995679763653, + "loss": 1.1514, + "step": 41 + }, + { + "epoch": 0.007478632478632479, + "grad_norm": 0.3567957282066345, + "learning_rate": 0.00019999952585164507, + "loss": 1.2488, + "step": 42 + }, + { + "epoch": 0.007656695156695157, + "grad_norm": 0.32506081461906433, + "learning_rate": 0.00019999948176764186, + "loss": 1.149, + "step": 43 + }, + { + "epoch": 0.007834757834757835, + "grad_norm": 0.46588361263275146, + "learning_rate": 0.0001999994357243566, + "loss": 1.4263, + "step": 44 + }, + { + "epoch": 0.008012820512820512, + "grad_norm": 0.5070307850837708, + "learning_rate": 0.00019999938772179005, + "loss": 1.0698, + "step": 45 + }, + { + "epoch": 0.00819088319088319, + "grad_norm": 0.38199326395988464, + "learning_rate": 0.00019999933775994327, + "loss": 0.9907, + "step": 46 + }, + { + "epoch": 0.00836894586894587, + "grad_norm": 0.43684661388397217, + "learning_rate": 0.0001999992858388172, + "loss": 1.2905, + "step": 47 + }, + { + "epoch": 0.008547008547008548, + "grad_norm": 0.44482162594795227, + "learning_rate": 0.00019999923195841284, + "loss": 1.2153, + "step": 48 + }, + { + "epoch": 0.008725071225071225, + "grad_norm": 0.4259667694568634, + "learning_rate": 0.0001999991761187313, + "loss": 1.1582, + "step": 49 + }, + { + "epoch": 0.008903133903133903, + "grad_norm": 0.41649091243743896, + "learning_rate": 0.00019999911831977357, + "loss": 1.0185, + "step": 50 + }, + { + "epoch": 0.009081196581196582, + "grad_norm": 0.4179716110229492, + "learning_rate": 0.0001999990585615409, + "loss": 1.3579, + "step": 51 + }, + { + "epoch": 0.009259259259259259, + "grad_norm": 0.3372558355331421, + 
"learning_rate": 0.00019999899684403438, + "loss": 1.0638, + "step": 52 + }, + { + "epoch": 0.009437321937321937, + "grad_norm": 0.41294020414352417, + "learning_rate": 0.00019999893316725525, + "loss": 1.1932, + "step": 53 + }, + { + "epoch": 0.009615384615384616, + "grad_norm": 0.4407919645309448, + "learning_rate": 0.00019999886753120473, + "loss": 1.4129, + "step": 54 + }, + { + "epoch": 0.009793447293447293, + "grad_norm": 0.47948843240737915, + "learning_rate": 0.00019999879993588414, + "loss": 1.2424, + "step": 55 + }, + { + "epoch": 0.009971509971509971, + "grad_norm": 0.3535355031490326, + "learning_rate": 0.00019999873038129484, + "loss": 1.0145, + "step": 56 + }, + { + "epoch": 0.01014957264957265, + "grad_norm": 0.5067078471183777, + "learning_rate": 0.00019999865886743813, + "loss": 1.4708, + "step": 57 + }, + { + "epoch": 0.010327635327635327, + "grad_norm": 0.42862898111343384, + "learning_rate": 0.0001999985853943154, + "loss": 1.0399, + "step": 58 + }, + { + "epoch": 0.010505698005698005, + "grad_norm": 0.4769059419631958, + "learning_rate": 0.00019999850996192816, + "loss": 1.1258, + "step": 59 + }, + { + "epoch": 0.010683760683760684, + "grad_norm": 0.4065442383289337, + "learning_rate": 0.0001999984325702778, + "loss": 1.2077, + "step": 60 + }, + { + "epoch": 0.010861823361823363, + "grad_norm": 0.5318329930305481, + "learning_rate": 0.0001999983532193659, + "loss": 1.2298, + "step": 61 + }, + { + "epoch": 0.01103988603988604, + "grad_norm": 0.4777173101902008, + "learning_rate": 0.000199998271909194, + "loss": 1.3195, + "step": 62 + }, + { + "epoch": 0.011217948717948718, + "grad_norm": 0.37553808093070984, + "learning_rate": 0.0001999981886397637, + "loss": 1.1188, + "step": 63 + }, + { + "epoch": 0.011396011396011397, + "grad_norm": 0.3920556902885437, + "learning_rate": 0.0001999981034110766, + "loss": 1.1448, + "step": 64 + }, + { + "epoch": 0.011574074074074073, + "grad_norm": 0.454272598028183, + "learning_rate": 0.0001999980162231344, + 
"loss": 1.0812, + "step": 65 + }, + { + "epoch": 0.011752136752136752, + "grad_norm": 0.4354456663131714, + "learning_rate": 0.00019999792707593882, + "loss": 1.1174, + "step": 66 + }, + { + "epoch": 0.01193019943019943, + "grad_norm": 0.5030252933502197, + "learning_rate": 0.00019999783596949156, + "loss": 1.2925, + "step": 67 + }, + { + "epoch": 0.012108262108262107, + "grad_norm": 0.5141571164131165, + "learning_rate": 0.00019999774290379446, + "loss": 1.6193, + "step": 68 + }, + { + "epoch": 0.012286324786324786, + "grad_norm": 0.417298287153244, + "learning_rate": 0.0001999976478788493, + "loss": 1.1875, + "step": 69 + }, + { + "epoch": 0.012464387464387465, + "grad_norm": 0.4642415940761566, + "learning_rate": 0.00019999755089465795, + "loss": 1.4138, + "step": 70 + }, + { + "epoch": 0.012642450142450143, + "grad_norm": 0.43184754252433777, + "learning_rate": 0.0001999974519512223, + "loss": 1.0697, + "step": 71 + }, + { + "epoch": 0.01282051282051282, + "grad_norm": 0.46698349714279175, + "learning_rate": 0.00019999735104854436, + "loss": 0.709, + "step": 72 + }, + { + "epoch": 0.012998575498575499, + "grad_norm": 0.37253814935684204, + "learning_rate": 0.000199997248186626, + "loss": 1.2084, + "step": 73 + }, + { + "epoch": 0.013176638176638177, + "grad_norm": 0.3851388692855835, + "learning_rate": 0.0001999971433654693, + "loss": 1.0548, + "step": 74 + }, + { + "epoch": 0.013354700854700854, + "grad_norm": 0.4434688985347748, + "learning_rate": 0.00019999703658507635, + "loss": 1.4084, + "step": 75 + }, + { + "epoch": 0.013532763532763533, + "grad_norm": 0.43164482712745667, + "learning_rate": 0.00019999692784544913, + "loss": 1.4872, + "step": 76 + }, + { + "epoch": 0.013710826210826211, + "grad_norm": 0.4224303364753723, + "learning_rate": 0.00019999681714658984, + "loss": 1.2221, + "step": 77 + }, + { + "epoch": 0.013888888888888888, + "grad_norm": 0.35588955879211426, + "learning_rate": 0.00019999670448850069, + "loss": 0.84, + "step": 78 + }, + { + 
"epoch": 0.014066951566951567, + "grad_norm": 0.3970590829849243, + "learning_rate": 0.0001999965898711838, + "loss": 1.1886, + "step": 79 + }, + { + "epoch": 0.014245014245014245, + "grad_norm": 0.4331924319267273, + "learning_rate": 0.00019999647329464146, + "loss": 1.179, + "step": 80 + }, + { + "epoch": 0.014423076923076924, + "grad_norm": 0.4226946234703064, + "learning_rate": 0.00019999635475887598, + "loss": 1.1496, + "step": 81 + }, + { + "epoch": 0.0146011396011396, + "grad_norm": 0.381592720746994, + "learning_rate": 0.00019999623426388962, + "loss": 1.1774, + "step": 82 + }, + { + "epoch": 0.01477920227920228, + "grad_norm": 0.4190855622291565, + "learning_rate": 0.00019999611180968478, + "loss": 1.1491, + "step": 83 + }, + { + "epoch": 0.014957264957264958, + "grad_norm": 0.3904292583465576, + "learning_rate": 0.00019999598739626389, + "loss": 1.1275, + "step": 84 + }, + { + "epoch": 0.015135327635327635, + "grad_norm": 0.4515478014945984, + "learning_rate": 0.0001999958610236293, + "loss": 1.2404, + "step": 85 + }, + { + "epoch": 0.015313390313390313, + "grad_norm": 0.48341724276542664, + "learning_rate": 0.00019999573269178359, + "loss": 1.3572, + "step": 86 + }, + { + "epoch": 0.015491452991452992, + "grad_norm": 0.42150333523750305, + "learning_rate": 0.00019999560240072914, + "loss": 1.0203, + "step": 87 + }, + { + "epoch": 0.01566951566951567, + "grad_norm": 0.45445525646209717, + "learning_rate": 0.00019999547015046867, + "loss": 1.0677, + "step": 88 + }, + { + "epoch": 0.01584757834757835, + "grad_norm": 0.3581015467643738, + "learning_rate": 0.00019999533594100463, + "loss": 1.0693, + "step": 89 + }, + { + "epoch": 0.016025641025641024, + "grad_norm": 0.4430878758430481, + "learning_rate": 0.00019999519977233971, + "loss": 1.1591, + "step": 90 + }, + { + "epoch": 0.016203703703703703, + "grad_norm": 0.3940352201461792, + "learning_rate": 0.0001999950616444766, + "loss": 1.1325, + "step": 91 + }, + { + "epoch": 0.01638176638176638, + 
"grad_norm": 0.4521673321723938, + "learning_rate": 0.00019999492155741794, + "loss": 1.3288, + "step": 92 + }, + { + "epoch": 0.01655982905982906, + "grad_norm": 0.3988296687602997, + "learning_rate": 0.00019999477951116658, + "loss": 1.0023, + "step": 93 + }, + { + "epoch": 0.01673789173789174, + "grad_norm": 0.38709723949432373, + "learning_rate": 0.00019999463550572516, + "loss": 1.2623, + "step": 94 + }, + { + "epoch": 0.016915954415954417, + "grad_norm": 0.35376182198524475, + "learning_rate": 0.00019999448954109662, + "loss": 1.0643, + "step": 95 + }, + { + "epoch": 0.017094017094017096, + "grad_norm": 0.49547120928764343, + "learning_rate": 0.00019999434161728377, + "loss": 1.2121, + "step": 96 + }, + { + "epoch": 0.01727207977207977, + "grad_norm": 0.49593672156333923, + "learning_rate": 0.00019999419173428952, + "loss": 1.1635, + "step": 97 + }, + { + "epoch": 0.01745014245014245, + "grad_norm": 0.4146541953086853, + "learning_rate": 0.0001999940398921168, + "loss": 1.1452, + "step": 98 + }, + { + "epoch": 0.017628205128205128, + "grad_norm": 0.5177254676818848, + "learning_rate": 0.00019999388609076858, + "loss": 1.2178, + "step": 99 + }, + { + "epoch": 0.017806267806267807, + "grad_norm": 0.4012768864631653, + "learning_rate": 0.0001999937303302479, + "loss": 0.9222, + "step": 100 + }, + { + "epoch": 0.017984330484330485, + "grad_norm": 0.4597131907939911, + "learning_rate": 0.00019999357261055777, + "loss": 0.979, + "step": 101 + }, + { + "epoch": 0.018162393162393164, + "grad_norm": 0.6190966963768005, + "learning_rate": 0.00019999341293170132, + "loss": 1.3909, + "step": 102 + }, + { + "epoch": 0.01834045584045584, + "grad_norm": 0.4576462209224701, + "learning_rate": 0.00019999325129368164, + "loss": 1.073, + "step": 103 + }, + { + "epoch": 0.018518518518518517, + "grad_norm": 0.4036749005317688, + "learning_rate": 0.00019999308769650192, + "loss": 1.1354, + "step": 104 + }, + { + "epoch": 0.018696581196581196, + "grad_norm": 0.4722452759742737, + 
"learning_rate": 0.00019999292214016538, + "loss": 1.2039, + "step": 105 + }, + { + "epoch": 0.018874643874643875, + "grad_norm": 0.5338274240493774, + "learning_rate": 0.00019999275462467527, + "loss": 1.225, + "step": 106 + }, + { + "epoch": 0.019052706552706553, + "grad_norm": 0.4301491677761078, + "learning_rate": 0.00019999258515003484, + "loss": 1.0601, + "step": 107 + }, + { + "epoch": 0.019230769230769232, + "grad_norm": 0.33271175622940063, + "learning_rate": 0.0001999924137162474, + "loss": 0.8441, + "step": 108 + }, + { + "epoch": 0.01940883190883191, + "grad_norm": 0.4648784399032593, + "learning_rate": 0.0001999922403233163, + "loss": 1.2038, + "step": 109 + }, + { + "epoch": 0.019586894586894586, + "grad_norm": 0.37915176153182983, + "learning_rate": 0.00019999206497124504, + "loss": 1.0923, + "step": 110 + }, + { + "epoch": 0.019764957264957264, + "grad_norm": 0.3865506052970886, + "learning_rate": 0.00019999188766003695, + "loss": 0.9535, + "step": 111 + }, + { + "epoch": 0.019943019943019943, + "grad_norm": 0.35739636421203613, + "learning_rate": 0.0001999917083896955, + "loss": 1.2688, + "step": 112 + }, + { + "epoch": 0.02012108262108262, + "grad_norm": 0.3943796157836914, + "learning_rate": 0.0001999915271602243, + "loss": 1.1097, + "step": 113 + }, + { + "epoch": 0.0202991452991453, + "grad_norm": 0.44758161902427673, + "learning_rate": 0.0001999913439716268, + "loss": 1.2698, + "step": 114 + }, + { + "epoch": 0.02047720797720798, + "grad_norm": 0.3749747574329376, + "learning_rate": 0.00019999115882390664, + "loss": 1.1091, + "step": 115 + }, + { + "epoch": 0.020655270655270654, + "grad_norm": 0.3479487895965576, + "learning_rate": 0.00019999097171706745, + "loss": 1.0049, + "step": 116 + }, + { + "epoch": 0.020833333333333332, + "grad_norm": 0.4491243064403534, + "learning_rate": 0.00019999078265111285, + "loss": 1.1857, + "step": 117 + }, + { + "epoch": 0.02101139601139601, + "grad_norm": 0.345289021730423, + "learning_rate": 
0.00019999059162604662, + "loss": 1.1397, + "step": 118 + }, + { + "epoch": 0.02118945868945869, + "grad_norm": 0.5467649698257446, + "learning_rate": 0.00019999039864187243, + "loss": 1.2196, + "step": 119 + }, + { + "epoch": 0.021367521367521368, + "grad_norm": 0.36446481943130493, + "learning_rate": 0.00019999020369859409, + "loss": 0.796, + "step": 120 + }, + { + "epoch": 0.021545584045584047, + "grad_norm": 0.4225841760635376, + "learning_rate": 0.00019999000679621543, + "loss": 0.9684, + "step": 121 + }, + { + "epoch": 0.021723646723646725, + "grad_norm": 0.4205594062805176, + "learning_rate": 0.0001999898079347403, + "loss": 1.2762, + "step": 122 + }, + { + "epoch": 0.0219017094017094, + "grad_norm": 0.43773892521858215, + "learning_rate": 0.00019998960711417257, + "loss": 1.117, + "step": 123 + }, + { + "epoch": 0.02207977207977208, + "grad_norm": 0.41279685497283936, + "learning_rate": 0.00019998940433451623, + "loss": 1.1502, + "step": 124 + }, + { + "epoch": 0.022257834757834757, + "grad_norm": 0.4090803563594818, + "learning_rate": 0.0001999891995957752, + "loss": 1.2591, + "step": 125 + }, + { + "epoch": 0.022435897435897436, + "grad_norm": 0.6000410914421082, + "learning_rate": 0.0001999889928979535, + "loss": 1.4321, + "step": 126 + }, + { + "epoch": 0.022613960113960115, + "grad_norm": 0.524264395236969, + "learning_rate": 0.00019998878424105524, + "loss": 1.1849, + "step": 127 + }, + { + "epoch": 0.022792022792022793, + "grad_norm": 0.4581047296524048, + "learning_rate": 0.00019998857362508443, + "loss": 1.0598, + "step": 128 + }, + { + "epoch": 0.022970085470085472, + "grad_norm": 0.42663446068763733, + "learning_rate": 0.00019998836105004526, + "loss": 1.1909, + "step": 129 + }, + { + "epoch": 0.023148148148148147, + "grad_norm": 0.45709118247032166, + "learning_rate": 0.00019998814651594183, + "loss": 1.2104, + "step": 130 + }, + { + "epoch": 0.023326210826210825, + "grad_norm": 0.39528369903564453, + "learning_rate": 0.0001999879300227784, + 
"loss": 1.3073, + "step": 131 + }, + { + "epoch": 0.023504273504273504, + "grad_norm": 0.46896448731422424, + "learning_rate": 0.00019998771157055914, + "loss": 1.3202, + "step": 132 + }, + { + "epoch": 0.023682336182336183, + "grad_norm": 0.4386129677295685, + "learning_rate": 0.00019998749115928842, + "loss": 1.2196, + "step": 133 + }, + { + "epoch": 0.02386039886039886, + "grad_norm": 0.45920488238334656, + "learning_rate": 0.00019998726878897051, + "loss": 1.3668, + "step": 134 + }, + { + "epoch": 0.02403846153846154, + "grad_norm": 0.4115797281265259, + "learning_rate": 0.0001999870444596098, + "loss": 1.1052, + "step": 135 + }, + { + "epoch": 0.024216524216524215, + "grad_norm": 0.3860839903354645, + "learning_rate": 0.0001999868181712106, + "loss": 1.0344, + "step": 136 + }, + { + "epoch": 0.024394586894586893, + "grad_norm": 0.42514732480049133, + "learning_rate": 0.00019998658992377742, + "loss": 1.1979, + "step": 137 + }, + { + "epoch": 0.024572649572649572, + "grad_norm": 0.36001840233802795, + "learning_rate": 0.00019998635971731475, + "loss": 1.4536, + "step": 138 + }, + { + "epoch": 0.02475071225071225, + "grad_norm": 0.3739112317562103, + "learning_rate": 0.00019998612755182707, + "loss": 1.0097, + "step": 139 + }, + { + "epoch": 0.02492877492877493, + "grad_norm": 0.37545472383499146, + "learning_rate": 0.00019998589342731888, + "loss": 0.829, + "step": 140 + }, + { + "epoch": 0.025106837606837608, + "grad_norm": 0.38660728931427, + "learning_rate": 0.0001999856573437948, + "loss": 1.1324, + "step": 141 + }, + { + "epoch": 0.025284900284900286, + "grad_norm": 0.3741356432437897, + "learning_rate": 0.00019998541930125953, + "loss": 1.0934, + "step": 142 + }, + { + "epoch": 0.02546296296296296, + "grad_norm": 0.41900336742401123, + "learning_rate": 0.00019998517929971764, + "loss": 1.0336, + "step": 143 + }, + { + "epoch": 0.02564102564102564, + "grad_norm": 0.4167572259902954, + "learning_rate": 0.00019998493733917384, + "loss": 1.2571, + "step": 144 
+ }, + { + "epoch": 0.02581908831908832, + "grad_norm": 0.39437636733055115, + "learning_rate": 0.0001999846934196329, + "loss": 1.2283, + "step": 145 + }, + { + "epoch": 0.025997150997150997, + "grad_norm": 0.39129480719566345, + "learning_rate": 0.00019998444754109964, + "loss": 0.9893, + "step": 146 + }, + { + "epoch": 0.026175213675213676, + "grad_norm": 0.45533549785614014, + "learning_rate": 0.0001999841997035788, + "loss": 1.0793, + "step": 147 + }, + { + "epoch": 0.026353276353276354, + "grad_norm": 0.3741768002510071, + "learning_rate": 0.00019998394990707524, + "loss": 1.2179, + "step": 148 + }, + { + "epoch": 0.026531339031339033, + "grad_norm": 0.4066533148288727, + "learning_rate": 0.0001999836981515939, + "loss": 1.1443, + "step": 149 + }, + { + "epoch": 0.026709401709401708, + "grad_norm": 0.4851688742637634, + "learning_rate": 0.0001999834444371397, + "loss": 1.1668, + "step": 150 + }, + { + "epoch": 0.026887464387464387, + "grad_norm": 0.428091436624527, + "learning_rate": 0.0001999831887637176, + "loss": 1.2676, + "step": 151 + }, + { + "epoch": 0.027065527065527065, + "grad_norm": 0.4024655222892761, + "learning_rate": 0.0001999829311313326, + "loss": 1.3115, + "step": 152 + }, + { + "epoch": 0.027243589743589744, + "grad_norm": 0.43983033299446106, + "learning_rate": 0.00019998267153998976, + "loss": 1.1019, + "step": 153 + }, + { + "epoch": 0.027421652421652423, + "grad_norm": 0.4317505359649658, + "learning_rate": 0.0001999824099896942, + "loss": 1.3129, + "step": 154 + }, + { + "epoch": 0.0275997150997151, + "grad_norm": 0.43107882142066956, + "learning_rate": 0.000199982146480451, + "loss": 1.2134, + "step": 155 + }, + { + "epoch": 0.027777777777777776, + "grad_norm": 0.3939448297023773, + "learning_rate": 0.00019998188101226532, + "loss": 1.0321, + "step": 156 + }, + { + "epoch": 0.027955840455840455, + "grad_norm": 0.4641847610473633, + "learning_rate": 0.00019998161358514237, + "loss": 1.2369, + "step": 157 + }, + { + "epoch": 
0.028133903133903133, + "grad_norm": 0.3538529872894287, + "learning_rate": 0.0001999813441990874, + "loss": 1.2061, + "step": 158 + }, + { + "epoch": 0.028311965811965812, + "grad_norm": 0.3277950584888458, + "learning_rate": 0.0001999810728541057, + "loss": 0.9419, + "step": 159 + }, + { + "epoch": 0.02849002849002849, + "grad_norm": 0.424710750579834, + "learning_rate": 0.00019998079955020254, + "loss": 1.3302, + "step": 160 + }, + { + "epoch": 0.02866809116809117, + "grad_norm": 0.4120834469795227, + "learning_rate": 0.00019998052428738333, + "loss": 1.079, + "step": 161 + }, + { + "epoch": 0.028846153846153848, + "grad_norm": 0.45811930298805237, + "learning_rate": 0.00019998024706565346, + "loss": 1.1259, + "step": 162 + }, + { + "epoch": 0.029024216524216523, + "grad_norm": 0.3873266875743866, + "learning_rate": 0.0001999799678850183, + "loss": 1.2124, + "step": 163 + }, + { + "epoch": 0.0292022792022792, + "grad_norm": 0.5806412696838379, + "learning_rate": 0.00019997968674548337, + "loss": 1.3467, + "step": 164 + }, + { + "epoch": 0.02938034188034188, + "grad_norm": 0.3906802833080292, + "learning_rate": 0.00019997940364705418, + "loss": 1.1438, + "step": 165 + }, + { + "epoch": 0.02955840455840456, + "grad_norm": 0.45201995968818665, + "learning_rate": 0.00019997911858973626, + "loss": 1.1469, + "step": 166 + }, + { + "epoch": 0.029736467236467237, + "grad_norm": 0.4965892732143402, + "learning_rate": 0.0001999788315735352, + "loss": 1.0829, + "step": 167 + }, + { + "epoch": 0.029914529914529916, + "grad_norm": 0.32578057050704956, + "learning_rate": 0.0001999785425984566, + "loss": 1.0432, + "step": 168 + }, + { + "epoch": 0.03009259259259259, + "grad_norm": 0.4146028161048889, + "learning_rate": 0.00019997825166450617, + "loss": 1.1657, + "step": 169 + }, + { + "epoch": 0.03027065527065527, + "grad_norm": 0.4342964291572571, + "learning_rate": 0.0001999779587716896, + "loss": 1.2038, + "step": 170 + }, + { + "epoch": 0.030448717948717948, + "grad_norm": 
0.40128546953201294, + "learning_rate": 0.00019997766392001258, + "loss": 1.3044, + "step": 171 + }, + { + "epoch": 0.030626780626780627, + "grad_norm": 0.4357539117336273, + "learning_rate": 0.00019997736710948094, + "loss": 1.2143, + "step": 172 + }, + { + "epoch": 0.030804843304843305, + "grad_norm": 0.4821035861968994, + "learning_rate": 0.00019997706834010045, + "loss": 1.0469, + "step": 173 + }, + { + "epoch": 0.030982905982905984, + "grad_norm": 0.3966675102710724, + "learning_rate": 0.000199976767611877, + "loss": 1.2122, + "step": 174 + }, + { + "epoch": 0.031160968660968662, + "grad_norm": 0.4265064299106598, + "learning_rate": 0.00019997646492481648, + "loss": 1.0871, + "step": 175 + }, + { + "epoch": 0.03133903133903134, + "grad_norm": 0.3445652723312378, + "learning_rate": 0.00019997616027892485, + "loss": 1.0412, + "step": 176 + }, + { + "epoch": 0.031517094017094016, + "grad_norm": 0.47187718749046326, + "learning_rate": 0.000199975853674208, + "loss": 1.0822, + "step": 177 + }, + { + "epoch": 0.0316951566951567, + "grad_norm": 0.37751707434654236, + "learning_rate": 0.000199975545110672, + "loss": 1.1439, + "step": 178 + }, + { + "epoch": 0.03187321937321937, + "grad_norm": 0.38792455196380615, + "learning_rate": 0.00019997523458832286, + "loss": 0.8604, + "step": 179 + }, + { + "epoch": 0.03205128205128205, + "grad_norm": 0.35199594497680664, + "learning_rate": 0.00019997492210716667, + "loss": 1.0819, + "step": 180 + }, + { + "epoch": 0.03222934472934473, + "grad_norm": 0.4828922748565674, + "learning_rate": 0.00019997460766720958, + "loss": 1.1879, + "step": 181 + }, + { + "epoch": 0.032407407407407406, + "grad_norm": 0.46153363585472107, + "learning_rate": 0.00019997429126845774, + "loss": 1.1592, + "step": 182 + }, + { + "epoch": 0.03258547008547009, + "grad_norm": 0.4844890832901001, + "learning_rate": 0.0001999739729109173, + "loss": 1.1334, + "step": 183 + }, + { + "epoch": 0.03276353276353276, + "grad_norm": 0.414617121219635, + 
"learning_rate": 0.00019997365259459457, + "loss": 1.0547, + "step": 184 + }, + { + "epoch": 0.032941595441595445, + "grad_norm": 0.46544626355171204, + "learning_rate": 0.00019997333031949581, + "loss": 1.4067, + "step": 185 + }, + { + "epoch": 0.03311965811965812, + "grad_norm": 0.48489415645599365, + "learning_rate": 0.0001999730060856273, + "loss": 1.4027, + "step": 186 + }, + { + "epoch": 0.033297720797720795, + "grad_norm": 0.3963346481323242, + "learning_rate": 0.0001999726798929954, + "loss": 1.1327, + "step": 187 + }, + { + "epoch": 0.03347578347578348, + "grad_norm": 0.3809385895729065, + "learning_rate": 0.00019997235174160652, + "loss": 1.3475, + "step": 188 + }, + { + "epoch": 0.03365384615384615, + "grad_norm": 0.3866960406303406, + "learning_rate": 0.0001999720216314671, + "loss": 1.1576, + "step": 189 + }, + { + "epoch": 0.033831908831908834, + "grad_norm": 0.34976935386657715, + "learning_rate": 0.00019997168956258356, + "loss": 0.9361, + "step": 190 + }, + { + "epoch": 0.03400997150997151, + "grad_norm": 0.38681939244270325, + "learning_rate": 0.00019997135553496243, + "loss": 1.1796, + "step": 191 + }, + { + "epoch": 0.03418803418803419, + "grad_norm": 0.41905197501182556, + "learning_rate": 0.0001999710195486103, + "loss": 1.1714, + "step": 192 + }, + { + "epoch": 0.03436609686609687, + "grad_norm": 0.42356589436531067, + "learning_rate": 0.0001999706816035337, + "loss": 1.0022, + "step": 193 + }, + { + "epoch": 0.03454415954415954, + "grad_norm": 0.3929740786552429, + "learning_rate": 0.00019997034169973925, + "loss": 1.3769, + "step": 194 + }, + { + "epoch": 0.034722222222222224, + "grad_norm": 0.4325186312198639, + "learning_rate": 0.00019996999983723366, + "loss": 1.3057, + "step": 195 + }, + { + "epoch": 0.0349002849002849, + "grad_norm": 0.3954029381275177, + "learning_rate": 0.00019996965601602355, + "loss": 1.1958, + "step": 196 + }, + { + "epoch": 0.03507834757834758, + "grad_norm": 0.34454262256622314, + "learning_rate": 
0.00019996931023611572, + "loss": 1.0972, + "step": 197 + }, + { + "epoch": 0.035256410256410256, + "grad_norm": 0.48900291323661804, + "learning_rate": 0.0001999689624975169, + "loss": 1.213, + "step": 198 + }, + { + "epoch": 0.03543447293447293, + "grad_norm": 0.35214388370513916, + "learning_rate": 0.00019996861280023397, + "loss": 1.0285, + "step": 199 + }, + { + "epoch": 0.03561253561253561, + "grad_norm": 0.49393126368522644, + "learning_rate": 0.00019996826114427373, + "loss": 1.2313, + "step": 200 + }, + { + "epoch": 0.03579059829059829, + "grad_norm": 0.3994458019733429, + "learning_rate": 0.00019996790752964305, + "loss": 1.0474, + "step": 201 + }, + { + "epoch": 0.03596866096866097, + "grad_norm": 0.5387318730354309, + "learning_rate": 0.0001999675519563489, + "loss": 1.3067, + "step": 202 + }, + { + "epoch": 0.036146723646723646, + "grad_norm": 0.4976751208305359, + "learning_rate": 0.00019996719442439824, + "loss": 1.2593, + "step": 203 + }, + { + "epoch": 0.03632478632478633, + "grad_norm": 0.47052907943725586, + "learning_rate": 0.0001999668349337981, + "loss": 1.1036, + "step": 204 + }, + { + "epoch": 0.036502849002849, + "grad_norm": 0.39616644382476807, + "learning_rate": 0.00019996647348455543, + "loss": 1.0481, + "step": 205 + }, + { + "epoch": 0.03668091168091168, + "grad_norm": 0.42987677454948425, + "learning_rate": 0.00019996611007667742, + "loss": 1.0923, + "step": 206 + }, + { + "epoch": 0.03685897435897436, + "grad_norm": 0.47065848112106323, + "learning_rate": 0.00019996574471017113, + "loss": 1.1403, + "step": 207 + }, + { + "epoch": 0.037037037037037035, + "grad_norm": 0.4363015592098236, + "learning_rate": 0.00019996537738504373, + "loss": 1.253, + "step": 208 + }, + { + "epoch": 0.03721509971509972, + "grad_norm": 0.4038296937942505, + "learning_rate": 0.00019996500810130243, + "loss": 1.1679, + "step": 209 + }, + { + "epoch": 0.03739316239316239, + "grad_norm": 0.5038532018661499, + "learning_rate": 0.00019996463685895445, + "loss": 
1.1182, + "step": 210 + }, + { + "epoch": 0.037571225071225074, + "grad_norm": 0.37740692496299744, + "learning_rate": 0.00019996426365800706, + "loss": 1.0465, + "step": 211 + }, + { + "epoch": 0.03774928774928775, + "grad_norm": 0.47794604301452637, + "learning_rate": 0.00019996388849846759, + "loss": 1.2836, + "step": 212 + }, + { + "epoch": 0.037927350427350424, + "grad_norm": 0.38460609316825867, + "learning_rate": 0.0001999635113803434, + "loss": 1.2099, + "step": 213 + }, + { + "epoch": 0.038105413105413107, + "grad_norm": 0.42016157507896423, + "learning_rate": 0.0001999631323036418, + "loss": 1.152, + "step": 214 + }, + { + "epoch": 0.03828347578347578, + "grad_norm": 0.4024946391582489, + "learning_rate": 0.00019996275126837033, + "loss": 1.1534, + "step": 215 + }, + { + "epoch": 0.038461538461538464, + "grad_norm": 0.4573793411254883, + "learning_rate": 0.00019996236827453642, + "loss": 1.2019, + "step": 216 + }, + { + "epoch": 0.03863960113960114, + "grad_norm": 0.3642503321170807, + "learning_rate": 0.0001999619833221475, + "loss": 1.0541, + "step": 217 + }, + { + "epoch": 0.03881766381766382, + "grad_norm": 0.38492897152900696, + "learning_rate": 0.0001999615964112112, + "loss": 1.1269, + "step": 218 + }, + { + "epoch": 0.038995726495726496, + "grad_norm": 0.427219420671463, + "learning_rate": 0.0001999612075417351, + "loss": 1.1126, + "step": 219 + }, + { + "epoch": 0.03917378917378917, + "grad_norm": 0.40781742334365845, + "learning_rate": 0.00019996081671372676, + "loss": 1.2207, + "step": 220 + }, + { + "epoch": 0.03935185185185185, + "grad_norm": 0.39229512214660645, + "learning_rate": 0.00019996042392719386, + "loss": 1.0403, + "step": 221 + }, + { + "epoch": 0.03952991452991453, + "grad_norm": 0.42038577795028687, + "learning_rate": 0.0001999600291821441, + "loss": 1.2157, + "step": 222 + }, + { + "epoch": 0.03970797720797721, + "grad_norm": 0.3963491916656494, + "learning_rate": 0.00019995963247858525, + "loss": 1.0532, + "step": 223 + }, + { 
+ "epoch": 0.039886039886039885, + "grad_norm": 0.4389874041080475, + "learning_rate": 0.00019995923381652502, + "loss": 1.4279, + "step": 224 + }, + { + "epoch": 0.04006410256410257, + "grad_norm": 0.357312947511673, + "learning_rate": 0.00019995883319597123, + "loss": 0.9871, + "step": 225 + }, + { + "epoch": 0.04024216524216524, + "grad_norm": 0.3644427955150604, + "learning_rate": 0.00019995843061693181, + "loss": 1.0879, + "step": 226 + }, + { + "epoch": 0.04042022792022792, + "grad_norm": 0.4074651002883911, + "learning_rate": 0.00019995802607941453, + "loss": 1.2138, + "step": 227 + }, + { + "epoch": 0.0405982905982906, + "grad_norm": 0.40709465742111206, + "learning_rate": 0.0001999576195834274, + "loss": 1.1905, + "step": 228 + }, + { + "epoch": 0.040776353276353275, + "grad_norm": 0.4280182719230652, + "learning_rate": 0.00019995721112897838, + "loss": 1.2331, + "step": 229 + }, + { + "epoch": 0.04095441595441596, + "grad_norm": 0.37846076488494873, + "learning_rate": 0.00019995680071607544, + "loss": 1.078, + "step": 230 + }, + { + "epoch": 0.04113247863247863, + "grad_norm": 0.3877260088920593, + "learning_rate": 0.0001999563883447266, + "loss": 1.0309, + "step": 231 + }, + { + "epoch": 0.04131054131054131, + "grad_norm": 0.42886826395988464, + "learning_rate": 0.00019995597401494, + "loss": 1.0403, + "step": 232 + }, + { + "epoch": 0.04148860398860399, + "grad_norm": 0.4316534101963043, + "learning_rate": 0.00019995555772672372, + "loss": 1.2418, + "step": 233 + }, + { + "epoch": 0.041666666666666664, + "grad_norm": 0.45768865942955017, + "learning_rate": 0.00019995513948008593, + "loss": 1.233, + "step": 234 + }, + { + "epoch": 0.041844729344729346, + "grad_norm": 0.5647913813591003, + "learning_rate": 0.00019995471927503481, + "loss": 1.1346, + "step": 235 + }, + { + "epoch": 0.04202279202279202, + "grad_norm": 0.3797492980957031, + "learning_rate": 0.00019995429711157863, + "loss": 1.1574, + "step": 236 + }, + { + "epoch": 0.042200854700854704, + 
"grad_norm": 0.4392767548561096, + "learning_rate": 0.00019995387298972562, + "loss": 0.8988, + "step": 237 + }, + { + "epoch": 0.04237891737891738, + "grad_norm": 0.37331557273864746, + "learning_rate": 0.0001999534469094841, + "loss": 1.0439, + "step": 238 + }, + { + "epoch": 0.042556980056980054, + "grad_norm": 0.3785935938358307, + "learning_rate": 0.00019995301887086245, + "loss": 0.9839, + "step": 239 + }, + { + "epoch": 0.042735042735042736, + "grad_norm": 0.4351862668991089, + "learning_rate": 0.00019995258887386898, + "loss": 1.2653, + "step": 240 + }, + { + "epoch": 0.04291310541310541, + "grad_norm": 0.399475634098053, + "learning_rate": 0.0001999521569185122, + "loss": 0.9877, + "step": 241 + }, + { + "epoch": 0.04309116809116809, + "grad_norm": 0.42332810163497925, + "learning_rate": 0.00019995172300480053, + "loss": 1.2403, + "step": 242 + }, + { + "epoch": 0.04326923076923077, + "grad_norm": 0.4397708475589752, + "learning_rate": 0.00019995128713274247, + "loss": 0.9316, + "step": 243 + }, + { + "epoch": 0.04344729344729345, + "grad_norm": 0.3614110052585602, + "learning_rate": 0.00019995084930234658, + "loss": 1.1088, + "step": 244 + }, + { + "epoch": 0.043625356125356125, + "grad_norm": 0.39433717727661133, + "learning_rate": 0.0001999504095136214, + "loss": 1.2002, + "step": 245 + }, + { + "epoch": 0.0438034188034188, + "grad_norm": 0.33088216185569763, + "learning_rate": 0.0001999499677665756, + "loss": 0.8796, + "step": 246 + }, + { + "epoch": 0.04398148148148148, + "grad_norm": 0.5239143967628479, + "learning_rate": 0.00019994952406121784, + "loss": 1.2808, + "step": 247 + }, + { + "epoch": 0.04415954415954416, + "grad_norm": 0.42156723141670227, + "learning_rate": 0.00019994907839755675, + "loss": 1.1775, + "step": 248 + }, + { + "epoch": 0.04433760683760684, + "grad_norm": 0.42569902539253235, + "learning_rate": 0.0001999486307756011, + "loss": 1.001, + "step": 249 + }, + { + "epoch": 0.044515669515669515, + "grad_norm": 0.38241544365882874, 
+ "learning_rate": 0.00019994818119535964, + "loss": 1.1064, + "step": 250 + }, + { + "epoch": 0.0446937321937322, + "grad_norm": 0.4185071885585785, + "learning_rate": 0.0001999477296568412, + "loss": 1.2109, + "step": 251 + }, + { + "epoch": 0.04487179487179487, + "grad_norm": 0.4189644157886505, + "learning_rate": 0.00019994727616005464, + "loss": 1.2902, + "step": 252 + }, + { + "epoch": 0.04504985754985755, + "grad_norm": 0.34671884775161743, + "learning_rate": 0.0001999468207050088, + "loss": 0.9429, + "step": 253 + }, + { + "epoch": 0.04522792022792023, + "grad_norm": 0.42391687631607056, + "learning_rate": 0.00019994636329171266, + "loss": 0.7179, + "step": 254 + }, + { + "epoch": 0.045405982905982904, + "grad_norm": 0.3803195655345917, + "learning_rate": 0.00019994590392017513, + "loss": 1.0318, + "step": 255 + }, + { + "epoch": 0.045584045584045586, + "grad_norm": 0.3389956057071686, + "learning_rate": 0.00019994544259040525, + "loss": 1.0485, + "step": 256 + }, + { + "epoch": 0.04576210826210826, + "grad_norm": 0.4927038550376892, + "learning_rate": 0.000199944979302412, + "loss": 1.3426, + "step": 257 + }, + { + "epoch": 0.045940170940170943, + "grad_norm": 0.33200421929359436, + "learning_rate": 0.00019994451405620453, + "loss": 1.0071, + "step": 258 + }, + { + "epoch": 0.04611823361823362, + "grad_norm": 0.38028615713119507, + "learning_rate": 0.00019994404685179195, + "loss": 1.0985, + "step": 259 + }, + { + "epoch": 0.046296296296296294, + "grad_norm": 0.3752151429653168, + "learning_rate": 0.00019994357768918333, + "loss": 0.9209, + "step": 260 + }, + { + "epoch": 0.046474358974358976, + "grad_norm": 0.43030866980552673, + "learning_rate": 0.00019994310656838796, + "loss": 0.9921, + "step": 261 + }, + { + "epoch": 0.04665242165242165, + "grad_norm": 0.4402460753917694, + "learning_rate": 0.00019994263348941502, + "loss": 1.1051, + "step": 262 + }, + { + "epoch": 0.04683048433048433, + "grad_norm": 0.43012720346450806, + "learning_rate": 
0.0001999421584522738, + "loss": 1.1839, + "step": 263 + }, + { + "epoch": 0.04700854700854701, + "grad_norm": 0.4195305407047272, + "learning_rate": 0.0001999416814569736, + "loss": 1.1749, + "step": 264 + }, + { + "epoch": 0.04718660968660968, + "grad_norm": 0.45623287558555603, + "learning_rate": 0.00019994120250352372, + "loss": 1.2433, + "step": 265 + }, + { + "epoch": 0.047364672364672365, + "grad_norm": 0.4736156761646271, + "learning_rate": 0.00019994072159193363, + "loss": 1.2882, + "step": 266 + }, + { + "epoch": 0.04754273504273504, + "grad_norm": 0.36698561906814575, + "learning_rate": 0.0001999402387222127, + "loss": 1.1486, + "step": 267 + }, + { + "epoch": 0.04772079772079772, + "grad_norm": 0.3854144215583801, + "learning_rate": 0.00019993975389437038, + "loss": 0.8115, + "step": 268 + }, + { + "epoch": 0.0478988603988604, + "grad_norm": 0.41512808203697205, + "learning_rate": 0.0001999392671084162, + "loss": 1.0959, + "step": 269 + }, + { + "epoch": 0.04807692307692308, + "grad_norm": 0.3869563341140747, + "learning_rate": 0.0001999387783643597, + "loss": 1.087, + "step": 270 + }, + { + "epoch": 0.048254985754985755, + "grad_norm": 0.4649744927883148, + "learning_rate": 0.00019993828766221044, + "loss": 1.0011, + "step": 271 + }, + { + "epoch": 0.04843304843304843, + "grad_norm": 0.40331923961639404, + "learning_rate": 0.00019993779500197803, + "loss": 1.1463, + "step": 272 + }, + { + "epoch": 0.04861111111111111, + "grad_norm": 0.3826279938220978, + "learning_rate": 0.0001999373003836721, + "loss": 1.1491, + "step": 273 + }, + { + "epoch": 0.04878917378917379, + "grad_norm": 0.3967166543006897, + "learning_rate": 0.00019993680380730243, + "loss": 1.1462, + "step": 274 + }, + { + "epoch": 0.04896723646723647, + "grad_norm": 0.4298507869243622, + "learning_rate": 0.00019993630527287865, + "loss": 1.2471, + "step": 275 + }, + { + "epoch": 0.049145299145299144, + "grad_norm": 0.41486215591430664, + "learning_rate": 0.0001999358047804106, + "loss": 
1.287, + "step": 276 + }, + { + "epoch": 0.049323361823361826, + "grad_norm": 0.3914124369621277, + "learning_rate": 0.00019993530232990803, + "loss": 1.0935, + "step": 277 + }, + { + "epoch": 0.0495014245014245, + "grad_norm": 0.39888378977775574, + "learning_rate": 0.00019993479792138082, + "loss": 1.2347, + "step": 278 + }, + { + "epoch": 0.049679487179487176, + "grad_norm": 0.3911665678024292, + "learning_rate": 0.00019993429155483884, + "loss": 1.0917, + "step": 279 + }, + { + "epoch": 0.04985754985754986, + "grad_norm": 0.42871445417404175, + "learning_rate": 0.00019993378323029197, + "loss": 1.0277, + "step": 280 + }, + { + "epoch": 0.050035612535612534, + "grad_norm": 0.35397860407829285, + "learning_rate": 0.00019993327294775027, + "loss": 0.9549, + "step": 281 + }, + { + "epoch": 0.050213675213675216, + "grad_norm": 0.4528059959411621, + "learning_rate": 0.00019993276070722364, + "loss": 1.2338, + "step": 282 + }, + { + "epoch": 0.05039173789173789, + "grad_norm": 0.354735791683197, + "learning_rate": 0.00019993224650872218, + "loss": 1.1892, + "step": 283 + }, + { + "epoch": 0.05056980056980057, + "grad_norm": 0.44407567381858826, + "learning_rate": 0.00019993173035225592, + "loss": 1.1621, + "step": 284 + }, + { + "epoch": 0.05074786324786325, + "grad_norm": 0.4177244305610657, + "learning_rate": 0.000199931212237835, + "loss": 1.1184, + "step": 285 + }, + { + "epoch": 0.05092592592592592, + "grad_norm": 0.5627759695053101, + "learning_rate": 0.0001999306921654696, + "loss": 1.0755, + "step": 286 + }, + { + "epoch": 0.051103988603988605, + "grad_norm": 0.46767523884773254, + "learning_rate": 0.00019993017013516986, + "loss": 1.2654, + "step": 287 + }, + { + "epoch": 0.05128205128205128, + "grad_norm": 0.4163128733634949, + "learning_rate": 0.000199929646146946, + "loss": 1.1307, + "step": 288 + }, + { + "epoch": 0.05146011396011396, + "grad_norm": 0.36954161524772644, + "learning_rate": 0.00019992912020080832, + "loss": 0.8274, + "step": 289 + }, + { + 
"epoch": 0.05163817663817664, + "grad_norm": 0.4770594835281372, + "learning_rate": 0.00019992859229676712, + "loss": 1.2235, + "step": 290 + }, + { + "epoch": 0.05181623931623932, + "grad_norm": 0.4174608290195465, + "learning_rate": 0.00019992806243483274, + "loss": 1.2893, + "step": 291 + }, + { + "epoch": 0.051994301994301995, + "grad_norm": 0.3794898986816406, + "learning_rate": 0.00019992753061501555, + "loss": 1.104, + "step": 292 + }, + { + "epoch": 0.05217236467236467, + "grad_norm": 0.3912592828273773, + "learning_rate": 0.000199926996837326, + "loss": 1.0043, + "step": 293 + }, + { + "epoch": 0.05235042735042735, + "grad_norm": 0.39641159772872925, + "learning_rate": 0.00019992646110177448, + "loss": 1.083, + "step": 294 + }, + { + "epoch": 0.05252849002849003, + "grad_norm": 0.3518857955932617, + "learning_rate": 0.00019992592340837157, + "loss": 0.9275, + "step": 295 + }, + { + "epoch": 0.05270655270655271, + "grad_norm": 0.3955721855163574, + "learning_rate": 0.00019992538375712777, + "loss": 1.0153, + "step": 296 + }, + { + "epoch": 0.052884615384615384, + "grad_norm": 0.3837333023548126, + "learning_rate": 0.00019992484214805364, + "loss": 1.1664, + "step": 297 + }, + { + "epoch": 0.053062678062678066, + "grad_norm": 0.39400920271873474, + "learning_rate": 0.0001999242985811598, + "loss": 1.0532, + "step": 298 + }, + { + "epoch": 0.05324074074074074, + "grad_norm": 0.39258649945259094, + "learning_rate": 0.00019992375305645692, + "loss": 1.0081, + "step": 299 + }, + { + "epoch": 0.053418803418803416, + "grad_norm": 0.49768248200416565, + "learning_rate": 0.00019992320557395566, + "loss": 1.2553, + "step": 300 + }, + { + "epoch": 0.0535968660968661, + "grad_norm": 0.364776074886322, + "learning_rate": 0.00019992265613366677, + "loss": 1.0582, + "step": 301 + }, + { + "epoch": 0.053774928774928774, + "grad_norm": 0.47317907214164734, + "learning_rate": 0.00019992210473560097, + "loss": 1.3114, + "step": 302 + }, + { + "epoch": 0.053952991452991456, + 
"grad_norm": 0.3706119656562805, + "learning_rate": 0.00019992155137976917, + "loss": 0.9554, + "step": 303 + }, + { + "epoch": 0.05413105413105413, + "grad_norm": 0.42809563875198364, + "learning_rate": 0.0001999209960661821, + "loss": 1.306, + "step": 304 + }, + { + "epoch": 0.054309116809116806, + "grad_norm": 0.4514487385749817, + "learning_rate": 0.00019992043879485066, + "loss": 1.0147, + "step": 305 + }, + { + "epoch": 0.05448717948717949, + "grad_norm": 0.36672836542129517, + "learning_rate": 0.0001999198795657858, + "loss": 1.1392, + "step": 306 + }, + { + "epoch": 0.05466524216524216, + "grad_norm": 0.4206554889678955, + "learning_rate": 0.00019991931837899847, + "loss": 1.2405, + "step": 307 + }, + { + "epoch": 0.054843304843304845, + "grad_norm": 0.46168261766433716, + "learning_rate": 0.00019991875523449966, + "loss": 1.2707, + "step": 308 + }, + { + "epoch": 0.05502136752136752, + "grad_norm": 0.39503365755081177, + "learning_rate": 0.00019991819013230039, + "loss": 1.0776, + "step": 309 + }, + { + "epoch": 0.0551994301994302, + "grad_norm": 0.35244834423065186, + "learning_rate": 0.00019991762307241178, + "loss": 1.0864, + "step": 310 + }, + { + "epoch": 0.05537749287749288, + "grad_norm": 0.3865319490432739, + "learning_rate": 0.0001999170540548449, + "loss": 1.3659, + "step": 311 + }, + { + "epoch": 0.05555555555555555, + "grad_norm": 0.3666876554489136, + "learning_rate": 0.0001999164830796109, + "loss": 0.9884, + "step": 312 + }, + { + "epoch": 0.055733618233618235, + "grad_norm": 0.4278281629085541, + "learning_rate": 0.00019991591014672096, + "loss": 1.1522, + "step": 313 + }, + { + "epoch": 0.05591168091168091, + "grad_norm": 0.4172627031803131, + "learning_rate": 0.0001999153352561863, + "loss": 1.2527, + "step": 314 + }, + { + "epoch": 0.05608974358974359, + "grad_norm": 0.38872212171554565, + "learning_rate": 0.00019991475840801823, + "loss": 1.2985, + "step": 315 + }, + { + "epoch": 0.05626780626780627, + "grad_norm": 0.4160458445549011, + 
"learning_rate": 0.00019991417960222804, + "loss": 1.1347, + "step": 316 + }, + { + "epoch": 0.05644586894586895, + "grad_norm": 0.5169723033905029, + "learning_rate": 0.00019991359883882705, + "loss": 1.0819, + "step": 317 + }, + { + "epoch": 0.056623931623931624, + "grad_norm": 0.42306259274482727, + "learning_rate": 0.0001999130161178266, + "loss": 1.3139, + "step": 318 + }, + { + "epoch": 0.0568019943019943, + "grad_norm": 0.41975873708724976, + "learning_rate": 0.00019991243143923816, + "loss": 1.2277, + "step": 319 + }, + { + "epoch": 0.05698005698005698, + "grad_norm": 0.3873472511768341, + "learning_rate": 0.00019991184480307324, + "loss": 1.156, + "step": 320 + }, + { + "epoch": 0.057158119658119656, + "grad_norm": 0.43656104803085327, + "learning_rate": 0.0001999112562093432, + "loss": 1.2344, + "step": 321 + }, + { + "epoch": 0.05733618233618234, + "grad_norm": 0.3738791048526764, + "learning_rate": 0.00019991066565805968, + "loss": 0.9573, + "step": 322 + }, + { + "epoch": 0.05751424501424501, + "grad_norm": 0.3838156461715698, + "learning_rate": 0.00019991007314923418, + "loss": 0.9274, + "step": 323 + }, + { + "epoch": 0.057692307692307696, + "grad_norm": 0.4564770758152008, + "learning_rate": 0.00019990947868287837, + "loss": 1.0756, + "step": 324 + }, + { + "epoch": 0.05787037037037037, + "grad_norm": 0.4560079872608185, + "learning_rate": 0.00019990888225900386, + "loss": 1.1508, + "step": 325 + }, + { + "epoch": 0.058048433048433046, + "grad_norm": 0.44356057047843933, + "learning_rate": 0.00019990828387762236, + "loss": 1.2323, + "step": 326 + }, + { + "epoch": 0.05822649572649573, + "grad_norm": 0.46390119194984436, + "learning_rate": 0.00019990768353874553, + "loss": 1.0031, + "step": 327 + }, + { + "epoch": 0.0584045584045584, + "grad_norm": 0.4502357244491577, + "learning_rate": 0.00019990708124238525, + "loss": 1.3454, + "step": 328 + }, + { + "epoch": 0.058582621082621085, + "grad_norm": 0.3979945182800293, + "learning_rate": 
0.0001999064769885532, + "loss": 1.2833, + "step": 329 + }, + { + "epoch": 0.05876068376068376, + "grad_norm": 0.3899286687374115, + "learning_rate": 0.00019990587077726128, + "loss": 1.0175, + "step": 330 + }, + { + "epoch": 0.05893874643874644, + "grad_norm": 0.41422948241233826, + "learning_rate": 0.00019990526260852139, + "loss": 1.1151, + "step": 331 + }, + { + "epoch": 0.05911680911680912, + "grad_norm": 0.4266608953475952, + "learning_rate": 0.0001999046524823454, + "loss": 1.1119, + "step": 332 + }, + { + "epoch": 0.05929487179487179, + "grad_norm": 0.46563324332237244, + "learning_rate": 0.00019990404039874524, + "loss": 1.2358, + "step": 333 + }, + { + "epoch": 0.059472934472934474, + "grad_norm": 0.4404347240924835, + "learning_rate": 0.00019990342635773297, + "loss": 1.1748, + "step": 334 + }, + { + "epoch": 0.05965099715099715, + "grad_norm": 0.5133237838745117, + "learning_rate": 0.00019990281035932062, + "loss": 1.1649, + "step": 335 + }, + { + "epoch": 0.05982905982905983, + "grad_norm": 0.3593895435333252, + "learning_rate": 0.00019990219240352018, + "loss": 1.0318, + "step": 336 + }, + { + "epoch": 0.06000712250712251, + "grad_norm": 0.40554583072662354, + "learning_rate": 0.00019990157249034384, + "loss": 1.1202, + "step": 337 + }, + { + "epoch": 0.06018518518518518, + "grad_norm": 0.3770706057548523, + "learning_rate": 0.00019990095061980372, + "loss": 0.9908, + "step": 338 + }, + { + "epoch": 0.060363247863247864, + "grad_norm": 0.39676955342292786, + "learning_rate": 0.000199900326791912, + "loss": 0.8176, + "step": 339 + }, + { + "epoch": 0.06054131054131054, + "grad_norm": 0.41448578238487244, + "learning_rate": 0.00019989970100668086, + "loss": 1.2877, + "step": 340 + }, + { + "epoch": 0.06071937321937322, + "grad_norm": 0.4200015068054199, + "learning_rate": 0.00019989907326412265, + "loss": 1.2293, + "step": 341 + }, + { + "epoch": 0.060897435897435896, + "grad_norm": 0.47350621223449707, + "learning_rate": 0.0001998984435642496, + 
"loss": 1.2331, + "step": 342 + }, + { + "epoch": 0.06107549857549858, + "grad_norm": 0.47050634026527405, + "learning_rate": 0.00019989781190707406, + "loss": 0.8888, + "step": 343 + }, + { + "epoch": 0.06125356125356125, + "grad_norm": 0.4994896948337555, + "learning_rate": 0.00019989717829260842, + "loss": 1.0921, + "step": 344 + }, + { + "epoch": 0.06143162393162393, + "grad_norm": 0.36340200901031494, + "learning_rate": 0.0001998965427208651, + "loss": 0.9777, + "step": 345 + }, + { + "epoch": 0.06160968660968661, + "grad_norm": 0.3538152873516083, + "learning_rate": 0.00019989590519185654, + "loss": 1.0055, + "step": 346 + }, + { + "epoch": 0.061787749287749286, + "grad_norm": 0.5388944149017334, + "learning_rate": 0.00019989526570559526, + "loss": 1.1001, + "step": 347 + }, + { + "epoch": 0.06196581196581197, + "grad_norm": 0.4411574602127075, + "learning_rate": 0.00019989462426209373, + "loss": 1.0038, + "step": 348 + }, + { + "epoch": 0.06214387464387464, + "grad_norm": 0.3930876851081848, + "learning_rate": 0.00019989398086136455, + "loss": 1.1534, + "step": 349 + }, + { + "epoch": 0.062321937321937325, + "grad_norm": 0.47357070446014404, + "learning_rate": 0.00019989333550342033, + "loss": 1.2687, + "step": 350 + }, + { + "epoch": 0.0625, + "grad_norm": 0.40302303433418274, + "learning_rate": 0.00019989268818827372, + "loss": 1.1894, + "step": 351 + }, + { + "epoch": 0.06267806267806268, + "grad_norm": 0.4470510184764862, + "learning_rate": 0.00019989203891593738, + "loss": 1.2207, + "step": 352 + }, + { + "epoch": 0.06285612535612535, + "grad_norm": 0.42235100269317627, + "learning_rate": 0.00019989138768642406, + "loss": 1.2086, + "step": 353 + }, + { + "epoch": 0.06303418803418803, + "grad_norm": 0.38305309414863586, + "learning_rate": 0.0001998907344997465, + "loss": 1.0473, + "step": 354 + }, + { + "epoch": 0.06321225071225071, + "grad_norm": 0.3893027901649475, + "learning_rate": 0.0001998900793559175, + "loss": 1.1746, + "step": 355 + }, + { + 
"epoch": 0.0633903133903134, + "grad_norm": 0.41206735372543335, + "learning_rate": 0.0001998894222549499, + "loss": 1.188, + "step": 356 + }, + { + "epoch": 0.06356837606837606, + "grad_norm": 0.3700513243675232, + "learning_rate": 0.00019988876319685658, + "loss": 0.9862, + "step": 357 + }, + { + "epoch": 0.06374643874643875, + "grad_norm": 0.3708794116973877, + "learning_rate": 0.0001998881021816504, + "loss": 1.2003, + "step": 358 + }, + { + "epoch": 0.06392450142450143, + "grad_norm": 0.4058014154434204, + "learning_rate": 0.00019988743920934442, + "loss": 1.2311, + "step": 359 + }, + { + "epoch": 0.0641025641025641, + "grad_norm": 0.39134132862091064, + "learning_rate": 0.00019988677427995155, + "loss": 1.001, + "step": 360 + }, + { + "epoch": 0.06428062678062678, + "grad_norm": 0.3853437602519989, + "learning_rate": 0.00019988610739348484, + "loss": 1.0725, + "step": 361 + }, + { + "epoch": 0.06445868945868946, + "grad_norm": 0.47114330530166626, + "learning_rate": 0.00019988543854995735, + "loss": 1.2196, + "step": 362 + }, + { + "epoch": 0.06463675213675214, + "grad_norm": 0.40465688705444336, + "learning_rate": 0.00019988476774938216, + "loss": 1.1869, + "step": 363 + }, + { + "epoch": 0.06481481481481481, + "grad_norm": 0.40301886200904846, + "learning_rate": 0.00019988409499177245, + "loss": 1.1765, + "step": 364 + }, + { + "epoch": 0.0649928774928775, + "grad_norm": 0.43443185091018677, + "learning_rate": 0.0001998834202771414, + "loss": 1.2022, + "step": 365 + }, + { + "epoch": 0.06517094017094018, + "grad_norm": 0.4712986350059509, + "learning_rate": 0.00019988274360550217, + "loss": 1.156, + "step": 366 + }, + { + "epoch": 0.06534900284900284, + "grad_norm": 0.4524450898170471, + "learning_rate": 0.00019988206497686815, + "loss": 1.2917, + "step": 367 + }, + { + "epoch": 0.06552706552706553, + "grad_norm": 0.40302205085754395, + "learning_rate": 0.0001998813843912525, + "loss": 0.9993, + "step": 368 + }, + { + "epoch": 0.06570512820512821, + 
"grad_norm": 0.39435216784477234, + "learning_rate": 0.00019988070184866864, + "loss": 1.0914, + "step": 369 + }, + { + "epoch": 0.06588319088319089, + "grad_norm": 0.39267390966415405, + "learning_rate": 0.00019988001734912988, + "loss": 1.3138, + "step": 370 + }, + { + "epoch": 0.06606125356125356, + "grad_norm": 0.38351675868034363, + "learning_rate": 0.00019987933089264968, + "loss": 1.0997, + "step": 371 + }, + { + "epoch": 0.06623931623931624, + "grad_norm": 0.3294839859008789, + "learning_rate": 0.00019987864247924145, + "loss": 0.9656, + "step": 372 + }, + { + "epoch": 0.06641737891737892, + "grad_norm": 0.45333364605903625, + "learning_rate": 0.00019987795210891872, + "loss": 1.095, + "step": 373 + }, + { + "epoch": 0.06659544159544159, + "grad_norm": 0.4362282454967499, + "learning_rate": 0.00019987725978169501, + "loss": 1.2103, + "step": 374 + }, + { + "epoch": 0.06677350427350427, + "grad_norm": 0.41314780712127686, + "learning_rate": 0.00019987656549758385, + "loss": 1.2115, + "step": 375 + }, + { + "epoch": 0.06695156695156695, + "grad_norm": 0.4230864644050598, + "learning_rate": 0.00019987586925659888, + "loss": 1.17, + "step": 376 + }, + { + "epoch": 0.06712962962962964, + "grad_norm": 0.4703855812549591, + "learning_rate": 0.00019987517105875372, + "loss": 1.367, + "step": 377 + }, + { + "epoch": 0.0673076923076923, + "grad_norm": 0.4671297073364258, + "learning_rate": 0.00019987447090406206, + "loss": 1.2543, + "step": 378 + }, + { + "epoch": 0.06748575498575499, + "grad_norm": 0.43746981024742126, + "learning_rate": 0.0001998737687925376, + "loss": 1.214, + "step": 379 + }, + { + "epoch": 0.06766381766381767, + "grad_norm": 0.40889596939086914, + "learning_rate": 0.00019987306472419412, + "loss": 1.0496, + "step": 380 + }, + { + "epoch": 0.06784188034188034, + "grad_norm": 0.3677358627319336, + "learning_rate": 0.0001998723586990454, + "loss": 1.1242, + "step": 381 + }, + { + "epoch": 0.06801994301994302, + "grad_norm": 0.3892628848552704, + 
"learning_rate": 0.00019987165071710527, + "loss": 1.0246, + "step": 382 + }, + { + "epoch": 0.0681980056980057, + "grad_norm": 0.4281293749809265, + "learning_rate": 0.00019987094077838764, + "loss": 1.2817, + "step": 383 + }, + { + "epoch": 0.06837606837606838, + "grad_norm": 0.45030340552330017, + "learning_rate": 0.00019987022888290636, + "loss": 1.159, + "step": 384 + }, + { + "epoch": 0.06855413105413105, + "grad_norm": 0.6327905058860779, + "learning_rate": 0.00019986951503067545, + "loss": 0.9577, + "step": 385 + }, + { + "epoch": 0.06873219373219373, + "grad_norm": 0.40339627861976624, + "learning_rate": 0.0001998687992217088, + "loss": 1.138, + "step": 386 + }, + { + "epoch": 0.06891025641025642, + "grad_norm": 0.4018291234970093, + "learning_rate": 0.00019986808145602052, + "loss": 0.9109, + "step": 387 + }, + { + "epoch": 0.06908831908831908, + "grad_norm": 0.41566264629364014, + "learning_rate": 0.00019986736173362464, + "loss": 1.1516, + "step": 388 + }, + { + "epoch": 0.06926638176638177, + "grad_norm": 0.3569067418575287, + "learning_rate": 0.00019986664005453527, + "loss": 1.2329, + "step": 389 + }, + { + "epoch": 0.06944444444444445, + "grad_norm": 0.3959648907184601, + "learning_rate": 0.0001998659164187665, + "loss": 1.1041, + "step": 390 + }, + { + "epoch": 0.06962250712250712, + "grad_norm": 0.42853206396102905, + "learning_rate": 0.00019986519082633257, + "loss": 1.0859, + "step": 391 + }, + { + "epoch": 0.0698005698005698, + "grad_norm": 0.42005518078804016, + "learning_rate": 0.0001998644632772477, + "loss": 1.2017, + "step": 392 + }, + { + "epoch": 0.06997863247863248, + "grad_norm": 0.4296947419643402, + "learning_rate": 0.00019986373377152612, + "loss": 1.1464, + "step": 393 + }, + { + "epoch": 0.07015669515669516, + "grad_norm": 0.394747793674469, + "learning_rate": 0.0001998630023091821, + "loss": 1.0316, + "step": 394 + }, + { + "epoch": 0.07033475783475783, + "grad_norm": 0.3779357969760895, + "learning_rate": 0.00019986226889023002, 
+ "loss": 1.1081, + "step": 395 + }, + { + "epoch": 0.07051282051282051, + "grad_norm": 0.4271804690361023, + "learning_rate": 0.00019986153351468424, + "loss": 0.985, + "step": 396 + }, + { + "epoch": 0.0706908831908832, + "grad_norm": 0.49412235617637634, + "learning_rate": 0.00019986079618255912, + "loss": 1.2606, + "step": 397 + }, + { + "epoch": 0.07086894586894586, + "grad_norm": 0.43657439947128296, + "learning_rate": 0.00019986005689386915, + "loss": 1.2266, + "step": 398 + }, + { + "epoch": 0.07104700854700854, + "grad_norm": 0.4060729444026947, + "learning_rate": 0.0001998593156486288, + "loss": 1.1787, + "step": 399 + }, + { + "epoch": 0.07122507122507123, + "grad_norm": 0.387046217918396, + "learning_rate": 0.00019985857244685264, + "loss": 0.9411, + "step": 400 + }, + { + "epoch": 0.07140313390313391, + "grad_norm": 0.4243999123573303, + "learning_rate": 0.00019985782728855516, + "loss": 1.2024, + "step": 401 + }, + { + "epoch": 0.07158119658119658, + "grad_norm": 0.43113812804222107, + "learning_rate": 0.000199857080173751, + "loss": 1.1246, + "step": 402 + }, + { + "epoch": 0.07175925925925926, + "grad_norm": 0.4653271436691284, + "learning_rate": 0.0001998563311024548, + "loss": 1.2343, + "step": 403 + }, + { + "epoch": 0.07193732193732194, + "grad_norm": 0.43260812759399414, + "learning_rate": 0.0001998555800746812, + "loss": 0.9543, + "step": 404 + }, + { + "epoch": 0.07211538461538461, + "grad_norm": 0.4635484516620636, + "learning_rate": 0.00019985482709044495, + "loss": 1.1091, + "step": 405 + }, + { + "epoch": 0.07229344729344729, + "grad_norm": 0.38362643122673035, + "learning_rate": 0.00019985407214976076, + "loss": 1.2584, + "step": 406 + }, + { + "epoch": 0.07247150997150997, + "grad_norm": 0.4068310558795929, + "learning_rate": 0.00019985331525264351, + "loss": 1.1944, + "step": 407 + }, + { + "epoch": 0.07264957264957266, + "grad_norm": 0.43909943103790283, + "learning_rate": 0.00019985255639910795, + "loss": 1.3748, + "step": 408 + }, + 
{ + "epoch": 0.07282763532763532, + "grad_norm": 0.48674601316452026, + "learning_rate": 0.000199851795589169, + "loss": 1.2684, + "step": 409 + }, + { + "epoch": 0.073005698005698, + "grad_norm": 0.4218580722808838, + "learning_rate": 0.0001998510328228415, + "loss": 1.168, + "step": 410 + }, + { + "epoch": 0.07318376068376069, + "grad_norm": 0.4688236117362976, + "learning_rate": 0.00019985026810014046, + "loss": 1.3088, + "step": 411 + }, + { + "epoch": 0.07336182336182336, + "grad_norm": 0.3863612711429596, + "learning_rate": 0.00019984950142108083, + "loss": 1.0261, + "step": 412 + }, + { + "epoch": 0.07353988603988604, + "grad_norm": 0.4177640378475189, + "learning_rate": 0.00019984873278567765, + "loss": 1.1985, + "step": 413 + }, + { + "epoch": 0.07371794871794872, + "grad_norm": 0.4645586311817169, + "learning_rate": 0.00019984796219394592, + "loss": 1.2463, + "step": 414 + }, + { + "epoch": 0.0738960113960114, + "grad_norm": 0.5051766633987427, + "learning_rate": 0.00019984718964590083, + "loss": 1.3031, + "step": 415 + }, + { + "epoch": 0.07407407407407407, + "grad_norm": 0.4200040400028229, + "learning_rate": 0.0001998464151415575, + "loss": 1.0842, + "step": 416 + }, + { + "epoch": 0.07425213675213675, + "grad_norm": 0.34211036562919617, + "learning_rate": 0.000199845638680931, + "loss": 0.9659, + "step": 417 + }, + { + "epoch": 0.07443019943019943, + "grad_norm": 0.3553323447704315, + "learning_rate": 0.00019984486026403668, + "loss": 1.0102, + "step": 418 + }, + { + "epoch": 0.0746082621082621, + "grad_norm": 0.4967300295829773, + "learning_rate": 0.00019984407989088974, + "loss": 1.3125, + "step": 419 + }, + { + "epoch": 0.07478632478632478, + "grad_norm": 0.41649797558784485, + "learning_rate": 0.00019984329756150544, + "loss": 1.3092, + "step": 420 + }, + { + "epoch": 0.07496438746438747, + "grad_norm": 0.43825802206993103, + "learning_rate": 0.00019984251327589912, + "loss": 1.3678, + "step": 421 + }, + { + "epoch": 0.07514245014245015, + 
"grad_norm": 0.363394170999527, + "learning_rate": 0.00019984172703408617, + "loss": 1.305, + "step": 422 + }, + { + "epoch": 0.07532051282051282, + "grad_norm": 0.411563903093338, + "learning_rate": 0.000199840938836082, + "loss": 1.4248, + "step": 423 + }, + { + "epoch": 0.0754985754985755, + "grad_norm": 0.40548190474510193, + "learning_rate": 0.000199840148681902, + "loss": 1.1081, + "step": 424 + }, + { + "epoch": 0.07567663817663818, + "grad_norm": 0.3781099021434784, + "learning_rate": 0.00019983935657156171, + "loss": 1.185, + "step": 425 + }, + { + "epoch": 0.07585470085470085, + "grad_norm": 0.46597573161125183, + "learning_rate": 0.00019983856250507662, + "loss": 1.119, + "step": 426 + }, + { + "epoch": 0.07603276353276353, + "grad_norm": 0.3988197147846222, + "learning_rate": 0.00019983776648246232, + "loss": 1.206, + "step": 427 + }, + { + "epoch": 0.07621082621082621, + "grad_norm": 0.41210901737213135, + "learning_rate": 0.00019983696850373433, + "loss": 1.1843, + "step": 428 + }, + { + "epoch": 0.0763888888888889, + "grad_norm": 0.41870948672294617, + "learning_rate": 0.00019983616856890837, + "loss": 1.2248, + "step": 429 + }, + { + "epoch": 0.07656695156695156, + "grad_norm": 0.4320056140422821, + "learning_rate": 0.00019983536667800007, + "loss": 0.9743, + "step": 430 + }, + { + "epoch": 0.07674501424501425, + "grad_norm": 0.48455503582954407, + "learning_rate": 0.00019983456283102517, + "loss": 1.0438, + "step": 431 + }, + { + "epoch": 0.07692307692307693, + "grad_norm": 0.38712427020072937, + "learning_rate": 0.00019983375702799935, + "loss": 1.2041, + "step": 432 + }, + { + "epoch": 0.0771011396011396, + "grad_norm": 0.3578857481479645, + "learning_rate": 0.0001998329492689385, + "loss": 1.1623, + "step": 433 + }, + { + "epoch": 0.07727920227920228, + "grad_norm": 0.43065932393074036, + "learning_rate": 0.00019983213955385834, + "loss": 1.3033, + "step": 434 + }, + { + "epoch": 0.07745726495726496, + "grad_norm": 0.4882095754146576, + 
"learning_rate": 0.00019983132788277484, + "loss": 1.1635, + "step": 435 + }, + { + "epoch": 0.07763532763532764, + "grad_norm": 0.3429015874862671, + "learning_rate": 0.00019983051425570382, + "loss": 0.7289, + "step": 436 + }, + { + "epoch": 0.07781339031339031, + "grad_norm": 0.4320310056209564, + "learning_rate": 0.00019982969867266128, + "loss": 1.3685, + "step": 437 + }, + { + "epoch": 0.07799145299145299, + "grad_norm": 0.39891982078552246, + "learning_rate": 0.00019982888113366314, + "loss": 1.0444, + "step": 438 + }, + { + "epoch": 0.07816951566951567, + "grad_norm": 0.3675695061683655, + "learning_rate": 0.00019982806163872547, + "loss": 1.0527, + "step": 439 + }, + { + "epoch": 0.07834757834757834, + "grad_norm": 0.42824694514274597, + "learning_rate": 0.0001998272401878643, + "loss": 1.166, + "step": 440 + }, + { + "epoch": 0.07852564102564102, + "grad_norm": 0.3721694350242615, + "learning_rate": 0.00019982641678109575, + "loss": 1.1328, + "step": 441 + }, + { + "epoch": 0.0787037037037037, + "grad_norm": 0.33899208903312683, + "learning_rate": 0.00019982559141843592, + "loss": 1.016, + "step": 442 + }, + { + "epoch": 0.07888176638176639, + "grad_norm": 0.4029340147972107, + "learning_rate": 0.000199824764099901, + "loss": 1.0076, + "step": 443 + }, + { + "epoch": 0.07905982905982906, + "grad_norm": 0.4169132113456726, + "learning_rate": 0.0001998239348255072, + "loss": 1.208, + "step": 444 + }, + { + "epoch": 0.07923789173789174, + "grad_norm": 0.3865824043750763, + "learning_rate": 0.00019982310359527075, + "loss": 1.067, + "step": 445 + }, + { + "epoch": 0.07941595441595442, + "grad_norm": 0.4218919277191162, + "learning_rate": 0.00019982227040920796, + "loss": 1.195, + "step": 446 + }, + { + "epoch": 0.07959401709401709, + "grad_norm": 0.40504586696624756, + "learning_rate": 0.00019982143526733512, + "loss": 1.0188, + "step": 447 + }, + { + "epoch": 0.07977207977207977, + "grad_norm": 0.38330578804016113, + "learning_rate": 0.00019982059816966863, 
+ "loss": 1.0484, + "step": 448 + }, + { + "epoch": 0.07995014245014245, + "grad_norm": 0.43731689453125, + "learning_rate": 0.00019981975911622488, + "loss": 1.074, + "step": 449 + }, + { + "epoch": 0.08012820512820513, + "grad_norm": 0.40858447551727295, + "learning_rate": 0.00019981891810702033, + "loss": 1.0008, + "step": 450 + }, + { + "epoch": 0.0803062678062678, + "grad_norm": 0.4031754732131958, + "learning_rate": 0.00019981807514207143, + "loss": 1.2179, + "step": 451 + }, + { + "epoch": 0.08048433048433049, + "grad_norm": 0.41920867562294006, + "learning_rate": 0.00019981723022139466, + "loss": 1.1406, + "step": 452 + }, + { + "epoch": 0.08066239316239317, + "grad_norm": 0.40305474400520325, + "learning_rate": 0.00019981638334500668, + "loss": 1.098, + "step": 453 + }, + { + "epoch": 0.08084045584045584, + "grad_norm": 0.4564182460308075, + "learning_rate": 0.00019981553451292396, + "loss": 1.419, + "step": 454 + }, + { + "epoch": 0.08101851851851852, + "grad_norm": 0.3832945227622986, + "learning_rate": 0.00019981468372516322, + "loss": 1.0919, + "step": 455 + }, + { + "epoch": 0.0811965811965812, + "grad_norm": 0.43062624335289, + "learning_rate": 0.0001998138309817411, + "loss": 1.0458, + "step": 456 + }, + { + "epoch": 0.08137464387464387, + "grad_norm": 0.3871173560619354, + "learning_rate": 0.0001998129762826743, + "loss": 1.1391, + "step": 457 + }, + { + "epoch": 0.08155270655270655, + "grad_norm": 0.43423157930374146, + "learning_rate": 0.0001998121196279796, + "loss": 1.1132, + "step": 458 + }, + { + "epoch": 0.08173076923076923, + "grad_norm": 0.4341012239456177, + "learning_rate": 0.00019981126101767372, + "loss": 1.113, + "step": 459 + }, + { + "epoch": 0.08190883190883191, + "grad_norm": 0.36748576164245605, + "learning_rate": 0.00019981040045177352, + "loss": 0.8108, + "step": 460 + }, + { + "epoch": 0.08208689458689458, + "grad_norm": 0.43133220076560974, + "learning_rate": 0.00019980953793029586, + "loss": 1.1861, + "step": 461 + }, + { + 
"epoch": 0.08226495726495726, + "grad_norm": 0.37204909324645996, + "learning_rate": 0.00019980867345325767, + "loss": 0.9222, + "step": 462 + }, + { + "epoch": 0.08244301994301995, + "grad_norm": 0.43370047211647034, + "learning_rate": 0.00019980780702067582, + "loss": 1.2984, + "step": 463 + }, + { + "epoch": 0.08262108262108261, + "grad_norm": 0.4991510808467865, + "learning_rate": 0.00019980693863256736, + "loss": 1.2222, + "step": 464 + }, + { + "epoch": 0.0827991452991453, + "grad_norm": 0.44318175315856934, + "learning_rate": 0.00019980606828894927, + "loss": 1.2262, + "step": 465 + }, + { + "epoch": 0.08297720797720798, + "grad_norm": 0.380231648683548, + "learning_rate": 0.0001998051959898386, + "loss": 1.0274, + "step": 466 + }, + { + "epoch": 0.08315527065527066, + "grad_norm": 0.39519667625427246, + "learning_rate": 0.0001998043217352524, + "loss": 1.2499, + "step": 467 + }, + { + "epoch": 0.08333333333333333, + "grad_norm": 0.457499235868454, + "learning_rate": 0.0001998034455252079, + "loss": 1.0751, + "step": 468 + }, + { + "epoch": 0.08351139601139601, + "grad_norm": 0.368522584438324, + "learning_rate": 0.00019980256735972215, + "loss": 1.0776, + "step": 469 + }, + { + "epoch": 0.08368945868945869, + "grad_norm": 0.3768427073955536, + "learning_rate": 0.00019980168723881243, + "loss": 1.2198, + "step": 470 + }, + { + "epoch": 0.08386752136752136, + "grad_norm": 0.37045565247535706, + "learning_rate": 0.000199800805162496, + "loss": 1.1816, + "step": 471 + }, + { + "epoch": 0.08404558404558404, + "grad_norm": 0.4219281077384949, + "learning_rate": 0.0001997999211307901, + "loss": 1.0515, + "step": 472 + }, + { + "epoch": 0.08422364672364673, + "grad_norm": 0.3815271258354187, + "learning_rate": 0.00019979903514371207, + "loss": 1.1709, + "step": 473 + }, + { + "epoch": 0.08440170940170941, + "grad_norm": 0.4566493630409241, + "learning_rate": 0.00019979814720127924, + "loss": 1.3063, + "step": 474 + }, + { + "epoch": 0.08457977207977208, + 
"grad_norm": 0.4043879806995392, + "learning_rate": 0.000199797257303509, + "loss": 1.0549, + "step": 475 + }, + { + "epoch": 0.08475783475783476, + "grad_norm": 0.3897830545902252, + "learning_rate": 0.00019979636545041886, + "loss": 1.1483, + "step": 476 + }, + { + "epoch": 0.08493589743589744, + "grad_norm": 0.36097025871276855, + "learning_rate": 0.00019979547164202622, + "loss": 1.1196, + "step": 477 + }, + { + "epoch": 0.08511396011396011, + "grad_norm": 0.3766986131668091, + "learning_rate": 0.00019979457587834863, + "loss": 1.0131, + "step": 478 + }, + { + "epoch": 0.08529202279202279, + "grad_norm": 0.39460286498069763, + "learning_rate": 0.00019979367815940364, + "loss": 1.1729, + "step": 479 + }, + { + "epoch": 0.08547008547008547, + "grad_norm": 0.4137469232082367, + "learning_rate": 0.00019979277848520885, + "loss": 1.2569, + "step": 480 + }, + { + "epoch": 0.08564814814814815, + "grad_norm": 0.464688777923584, + "learning_rate": 0.00019979187685578183, + "loss": 1.2064, + "step": 481 + }, + { + "epoch": 0.08582621082621082, + "grad_norm": 0.4245518147945404, + "learning_rate": 0.0001997909732711403, + "loss": 0.9812, + "step": 482 + }, + { + "epoch": 0.0860042735042735, + "grad_norm": 0.43368837237358093, + "learning_rate": 0.00019979006773130197, + "loss": 1.2822, + "step": 483 + }, + { + "epoch": 0.08618233618233619, + "grad_norm": 0.4232824444770813, + "learning_rate": 0.00019978916023628452, + "loss": 1.1446, + "step": 484 + }, + { + "epoch": 0.08636039886039885, + "grad_norm": 0.4183506369590759, + "learning_rate": 0.00019978825078610578, + "loss": 1.2605, + "step": 485 + }, + { + "epoch": 0.08653846153846154, + "grad_norm": 0.4391268491744995, + "learning_rate": 0.00019978733938078356, + "loss": 1.2165, + "step": 486 + }, + { + "epoch": 0.08671652421652422, + "grad_norm": 0.4139612317085266, + "learning_rate": 0.0001997864260203357, + "loss": 0.9389, + "step": 487 + }, + { + "epoch": 0.0868945868945869, + "grad_norm": 0.4058656096458435, + 
"learning_rate": 0.00019978551070478013, + "loss": 1.0652, + "step": 488 + }, + { + "epoch": 0.08707264957264957, + "grad_norm": 0.42333099246025085, + "learning_rate": 0.00019978459343413473, + "loss": 1.119, + "step": 489 + }, + { + "epoch": 0.08725071225071225, + "grad_norm": 0.4573031961917877, + "learning_rate": 0.00019978367420841754, + "loss": 1.1546, + "step": 490 + }, + { + "epoch": 0.08742877492877493, + "grad_norm": 0.4161617159843445, + "learning_rate": 0.00019978275302764655, + "loss": 1.0836, + "step": 491 + }, + { + "epoch": 0.0876068376068376, + "grad_norm": 0.422145277261734, + "learning_rate": 0.00019978182989183977, + "loss": 1.1908, + "step": 492 + }, + { + "epoch": 0.08778490028490028, + "grad_norm": 0.4588126838207245, + "learning_rate": 0.00019978090480101532, + "loss": 1.1758, + "step": 493 + }, + { + "epoch": 0.08796296296296297, + "grad_norm": 0.4425722062587738, + "learning_rate": 0.00019977997775519132, + "loss": 1.088, + "step": 494 + }, + { + "epoch": 0.08814102564102565, + "grad_norm": 0.37860307097435, + "learning_rate": 0.00019977904875438594, + "loss": 1.1532, + "step": 495 + }, + { + "epoch": 0.08831908831908832, + "grad_norm": 0.40435823798179626, + "learning_rate": 0.00019977811779861733, + "loss": 1.1271, + "step": 496 + }, + { + "epoch": 0.088497150997151, + "grad_norm": 0.42578884959220886, + "learning_rate": 0.0001997771848879038, + "loss": 0.9889, + "step": 497 + }, + { + "epoch": 0.08867521367521368, + "grad_norm": 0.3439478874206543, + "learning_rate": 0.00019977625002226361, + "loss": 1.1273, + "step": 498 + }, + { + "epoch": 0.08885327635327635, + "grad_norm": 0.362341970205307, + "learning_rate": 0.00019977531320171504, + "loss": 1.0214, + "step": 499 + }, + { + "epoch": 0.08903133903133903, + "grad_norm": 0.4305768609046936, + "learning_rate": 0.0001997743744262765, + "loss": 1.2648, + "step": 500 + }, + { + "epoch": 0.08920940170940171, + "grad_norm": 0.35900023579597473, + "learning_rate": 0.00019977343369596636, + 
"loss": 1.0274, + "step": 501 + }, + { + "epoch": 0.0893874643874644, + "grad_norm": 0.4950818717479706, + "learning_rate": 0.00019977249101080306, + "loss": 1.1483, + "step": 502 + }, + { + "epoch": 0.08956552706552706, + "grad_norm": 0.3800346553325653, + "learning_rate": 0.00019977154637080503, + "loss": 1.0636, + "step": 503 + }, + { + "epoch": 0.08974358974358974, + "grad_norm": 0.46202352643013, + "learning_rate": 0.0001997705997759908, + "loss": 1.1544, + "step": 504 + }, + { + "epoch": 0.08992165242165243, + "grad_norm": 0.36818403005599976, + "learning_rate": 0.00019976965122637895, + "loss": 0.9824, + "step": 505 + }, + { + "epoch": 0.0900997150997151, + "grad_norm": 0.40248095989227295, + "learning_rate": 0.00019976870072198805, + "loss": 1.1002, + "step": 506 + }, + { + "epoch": 0.09027777777777778, + "grad_norm": 0.3841850459575653, + "learning_rate": 0.00019976774826283667, + "loss": 1.2433, + "step": 507 + }, + { + "epoch": 0.09045584045584046, + "grad_norm": 0.46892330050468445, + "learning_rate": 0.0001997667938489435, + "loss": 1.3194, + "step": 508 + }, + { + "epoch": 0.09063390313390314, + "grad_norm": 0.39059561491012573, + "learning_rate": 0.0001997658374803273, + "loss": 1.1778, + "step": 509 + }, + { + "epoch": 0.09081196581196581, + "grad_norm": 0.3793235421180725, + "learning_rate": 0.00019976487915700672, + "loss": 1.0659, + "step": 510 + }, + { + "epoch": 0.09099002849002849, + "grad_norm": 0.39067742228507996, + "learning_rate": 0.00019976391887900058, + "loss": 1.107, + "step": 511 + }, + { + "epoch": 0.09116809116809117, + "grad_norm": 0.40121713280677795, + "learning_rate": 0.00019976295664632772, + "loss": 1.102, + "step": 512 + }, + { + "epoch": 0.09134615384615384, + "grad_norm": 0.49830010533332825, + "learning_rate": 0.00019976199245900697, + "loss": 1.1701, + "step": 513 + }, + { + "epoch": 0.09152421652421652, + "grad_norm": 0.4536968171596527, + "learning_rate": 0.0001997610263170572, + "loss": 1.1067, + "step": 514 + }, + { 
+ "epoch": 0.0917022792022792, + "grad_norm": 0.3832971453666687, + "learning_rate": 0.00019976005822049735, + "loss": 1.0991, + "step": 515 + }, + { + "epoch": 0.09188034188034189, + "grad_norm": 0.4093509614467621, + "learning_rate": 0.0001997590881693464, + "loss": 1.0565, + "step": 516 + }, + { + "epoch": 0.09205840455840456, + "grad_norm": 0.46073687076568604, + "learning_rate": 0.0001997581161636233, + "loss": 1.0057, + "step": 517 + }, + { + "epoch": 0.09223646723646724, + "grad_norm": 0.5001922845840454, + "learning_rate": 0.0001997571422033472, + "loss": 1.2639, + "step": 518 + }, + { + "epoch": 0.09241452991452992, + "grad_norm": 0.4620618224143982, + "learning_rate": 0.00019975616628853713, + "loss": 1.0966, + "step": 519 + }, + { + "epoch": 0.09259259259259259, + "grad_norm": 0.3788183927536011, + "learning_rate": 0.0001997551884192122, + "loss": 0.9783, + "step": 520 + }, + { + "epoch": 0.09277065527065527, + "grad_norm": 0.45589539408683777, + "learning_rate": 0.00019975420859539154, + "loss": 1.2194, + "step": 521 + }, + { + "epoch": 0.09294871794871795, + "grad_norm": 0.40747523307800293, + "learning_rate": 0.00019975322681709443, + "loss": 1.0349, + "step": 522 + }, + { + "epoch": 0.09312678062678063, + "grad_norm": 0.5045142769813538, + "learning_rate": 0.00019975224308434002, + "loss": 1.1373, + "step": 523 + }, + { + "epoch": 0.0933048433048433, + "grad_norm": 0.40352702140808105, + "learning_rate": 0.00019975125739714767, + "loss": 1.1236, + "step": 524 + }, + { + "epoch": 0.09348290598290598, + "grad_norm": 0.4301735758781433, + "learning_rate": 0.0001997502697555366, + "loss": 1.2932, + "step": 525 + }, + { + "epoch": 0.09366096866096867, + "grad_norm": 0.36800238490104675, + "learning_rate": 0.00019974928015952624, + "loss": 1.0734, + "step": 526 + }, + { + "epoch": 0.09383903133903133, + "grad_norm": 0.4027230143547058, + "learning_rate": 0.00019974828860913594, + "loss": 1.2776, + "step": 527 + }, + { + "epoch": 0.09401709401709402, + 
"grad_norm": 0.42497140169143677, + "learning_rate": 0.0001997472951043851, + "loss": 1.248, + "step": 528 + }, + { + "epoch": 0.0941951566951567, + "grad_norm": 0.3888593018054962, + "learning_rate": 0.00019974629964529325, + "loss": 1.0231, + "step": 529 + }, + { + "epoch": 0.09437321937321937, + "grad_norm": 0.3761361241340637, + "learning_rate": 0.00019974530223187986, + "loss": 1.0216, + "step": 530 + }, + { + "epoch": 0.09455128205128205, + "grad_norm": 0.42192980647087097, + "learning_rate": 0.00019974430286416448, + "loss": 1.0731, + "step": 531 + }, + { + "epoch": 0.09472934472934473, + "grad_norm": 0.44244512915611267, + "learning_rate": 0.00019974330154216667, + "loss": 1.2793, + "step": 532 + }, + { + "epoch": 0.09490740740740741, + "grad_norm": 0.378252774477005, + "learning_rate": 0.0001997422982659061, + "loss": 1.0462, + "step": 533 + }, + { + "epoch": 0.09508547008547008, + "grad_norm": 0.45589110255241394, + "learning_rate": 0.00019974129303540236, + "loss": 1.1884, + "step": 534 + }, + { + "epoch": 0.09526353276353276, + "grad_norm": 0.33930808305740356, + "learning_rate": 0.0001997402858506752, + "loss": 0.8381, + "step": 535 + }, + { + "epoch": 0.09544159544159544, + "grad_norm": 0.45408427715301514, + "learning_rate": 0.0001997392767117443, + "loss": 1.2379, + "step": 536 + }, + { + "epoch": 0.09561965811965811, + "grad_norm": 0.44125741720199585, + "learning_rate": 0.0001997382656186295, + "loss": 1.1941, + "step": 537 + }, + { + "epoch": 0.0957977207977208, + "grad_norm": 0.4075697660446167, + "learning_rate": 0.00019973725257135054, + "loss": 1.0142, + "step": 538 + }, + { + "epoch": 0.09597578347578348, + "grad_norm": 0.4258415102958679, + "learning_rate": 0.00019973623756992733, + "loss": 1.0447, + "step": 539 + }, + { + "epoch": 0.09615384615384616, + "grad_norm": 0.2738485038280487, + "learning_rate": 0.0001997352206143797, + "loss": 0.5521, + "step": 540 + }, + { + "epoch": 0.09633190883190883, + "grad_norm": 0.38815587759017944, + 
"learning_rate": 0.00019973420170472762, + "loss": 1.1052, + "step": 541 + }, + { + "epoch": 0.09650997150997151, + "grad_norm": 0.3909834027290344, + "learning_rate": 0.00019973318084099106, + "loss": 1.0494, + "step": 542 + }, + { + "epoch": 0.09668803418803419, + "grad_norm": 0.4517597258090973, + "learning_rate": 0.00019973215802318996, + "loss": 1.0611, + "step": 543 + }, + { + "epoch": 0.09686609686609686, + "grad_norm": 0.48659002780914307, + "learning_rate": 0.00019973113325134442, + "loss": 0.9967, + "step": 544 + }, + { + "epoch": 0.09704415954415954, + "grad_norm": 0.4039791524410248, + "learning_rate": 0.0001997301065254745, + "loss": 1.251, + "step": 545 + }, + { + "epoch": 0.09722222222222222, + "grad_norm": 0.3985383212566376, + "learning_rate": 0.0001997290778456003, + "loss": 1.2263, + "step": 546 + }, + { + "epoch": 0.0974002849002849, + "grad_norm": 0.4540637731552124, + "learning_rate": 0.00019972804721174199, + "loss": 1.2084, + "step": 547 + }, + { + "epoch": 0.09757834757834757, + "grad_norm": 0.36867982149124146, + "learning_rate": 0.00019972701462391977, + "loss": 0.9704, + "step": 548 + }, + { + "epoch": 0.09775641025641026, + "grad_norm": 0.40199780464172363, + "learning_rate": 0.00019972598008215385, + "loss": 1.1121, + "step": 549 + }, + { + "epoch": 0.09793447293447294, + "grad_norm": 0.42728984355926514, + "learning_rate": 0.00019972494358646455, + "loss": 1.1606, + "step": 550 + }, + { + "epoch": 0.0981125356125356, + "grad_norm": 0.4212374687194824, + "learning_rate": 0.0001997239051368721, + "loss": 1.3093, + "step": 551 + }, + { + "epoch": 0.09829059829059829, + "grad_norm": 0.3972226083278656, + "learning_rate": 0.0001997228647333969, + "loss": 1.1218, + "step": 552 + }, + { + "epoch": 0.09846866096866097, + "grad_norm": 0.43649932742118835, + "learning_rate": 0.00019972182237605935, + "loss": 1.2532, + "step": 553 + }, + { + "epoch": 0.09864672364672365, + "grad_norm": 0.3812280595302582, + "learning_rate": 
0.0001997207780648798, + "loss": 1.0409, + "step": 554 + }, + { + "epoch": 0.09882478632478632, + "grad_norm": 0.41684821248054504, + "learning_rate": 0.00019971973179987878, + "loss": 0.9569, + "step": 555 + }, + { + "epoch": 0.099002849002849, + "grad_norm": 0.38081470131874084, + "learning_rate": 0.00019971868358107674, + "loss": 1.1615, + "step": 556 + }, + { + "epoch": 0.09918091168091168, + "grad_norm": 0.3702073097229004, + "learning_rate": 0.0001997176334084943, + "loss": 1.3907, + "step": 557 + }, + { + "epoch": 0.09935897435897435, + "grad_norm": 0.3625728189945221, + "learning_rate": 0.00019971658128215193, + "loss": 1.1897, + "step": 558 + }, + { + "epoch": 0.09953703703703703, + "grad_norm": 0.3815405070781708, + "learning_rate": 0.0001997155272020703, + "loss": 1.1473, + "step": 559 + }, + { + "epoch": 0.09971509971509972, + "grad_norm": 0.48664286732673645, + "learning_rate": 0.00019971447116827004, + "loss": 1.2462, + "step": 560 + }, + { + "epoch": 0.0998931623931624, + "grad_norm": 0.3708696663379669, + "learning_rate": 0.0001997134131807719, + "loss": 1.0979, + "step": 561 + }, + { + "epoch": 0.10007122507122507, + "grad_norm": 0.44511324167251587, + "learning_rate": 0.00019971235323959654, + "loss": 1.2313, + "step": 562 + }, + { + "epoch": 0.10024928774928775, + "grad_norm": 0.3687448799610138, + "learning_rate": 0.00019971129134476473, + "loss": 1.1526, + "step": 563 + }, + { + "epoch": 0.10042735042735043, + "grad_norm": 0.4506866931915283, + "learning_rate": 0.00019971022749629735, + "loss": 1.0003, + "step": 564 + }, + { + "epoch": 0.1006054131054131, + "grad_norm": 0.41910406947135925, + "learning_rate": 0.00019970916169421515, + "loss": 1.013, + "step": 565 + }, + { + "epoch": 0.10078347578347578, + "grad_norm": 0.39728936553001404, + "learning_rate": 0.0001997080939385391, + "loss": 1.0501, + "step": 566 + }, + { + "epoch": 0.10096153846153846, + "grad_norm": 0.41415902972221375, + "learning_rate": 0.00019970702422929005, + "loss": 
1.0791, + "step": 567 + }, + { + "epoch": 0.10113960113960115, + "grad_norm": 0.45630788803100586, + "learning_rate": 0.00019970595256648896, + "loss": 1.2884, + "step": 568 + }, + { + "epoch": 0.10131766381766381, + "grad_norm": 0.4371698796749115, + "learning_rate": 0.00019970487895015686, + "loss": 1.0684, + "step": 569 + }, + { + "epoch": 0.1014957264957265, + "grad_norm": 0.4350591003894806, + "learning_rate": 0.00019970380338031477, + "loss": 1.2415, + "step": 570 + }, + { + "epoch": 0.10167378917378918, + "grad_norm": 0.4232708215713501, + "learning_rate": 0.00019970272585698382, + "loss": 1.2656, + "step": 571 + }, + { + "epoch": 0.10185185185185185, + "grad_norm": 0.3917689919471741, + "learning_rate": 0.00019970164638018502, + "loss": 1.0178, + "step": 572 + }, + { + "epoch": 0.10202991452991453, + "grad_norm": 0.4262804388999939, + "learning_rate": 0.0001997005649499396, + "loss": 1.1805, + "step": 573 + }, + { + "epoch": 0.10220797720797721, + "grad_norm": 0.5217884182929993, + "learning_rate": 0.0001996994815662687, + "loss": 1.2392, + "step": 574 + }, + { + "epoch": 0.10238603988603989, + "grad_norm": 0.4273875057697296, + "learning_rate": 0.00019969839622919358, + "loss": 1.0844, + "step": 575 + }, + { + "epoch": 0.10256410256410256, + "grad_norm": 0.41588085889816284, + "learning_rate": 0.00019969730893873547, + "loss": 1.2437, + "step": 576 + }, + { + "epoch": 0.10274216524216524, + "grad_norm": 0.41617709398269653, + "learning_rate": 0.0001996962196949157, + "loss": 0.9519, + "step": 577 + }, + { + "epoch": 0.10292022792022792, + "grad_norm": 0.4832979142665863, + "learning_rate": 0.00019969512849775565, + "loss": 1.1889, + "step": 578 + }, + { + "epoch": 0.10309829059829059, + "grad_norm": 0.3936060965061188, + "learning_rate": 0.0001996940353472766, + "loss": 0.9888, + "step": 579 + }, + { + "epoch": 0.10327635327635327, + "grad_norm": 0.4147680997848511, + "learning_rate": 0.00019969294024350004, + "loss": 1.0733, + "step": 580 + }, + { + 
"epoch": 0.10345441595441596, + "grad_norm": 0.37791356444358826, + "learning_rate": 0.00019969184318644742, + "loss": 1.212, + "step": 581 + }, + { + "epoch": 0.10363247863247864, + "grad_norm": 0.44297221302986145, + "learning_rate": 0.00019969074417614023, + "loss": 1.0535, + "step": 582 + }, + { + "epoch": 0.10381054131054131, + "grad_norm": 0.4032835066318512, + "learning_rate": 0.0001996896432126, + "loss": 1.1869, + "step": 583 + }, + { + "epoch": 0.10398860398860399, + "grad_norm": 0.49271953105926514, + "learning_rate": 0.00019968854029584827, + "loss": 1.1661, + "step": 584 + }, + { + "epoch": 0.10416666666666667, + "grad_norm": 0.362699031829834, + "learning_rate": 0.0001996874354259067, + "loss": 0.868, + "step": 585 + }, + { + "epoch": 0.10434472934472934, + "grad_norm": 0.401795357465744, + "learning_rate": 0.0001996863286027969, + "loss": 1.1045, + "step": 586 + }, + { + "epoch": 0.10452279202279202, + "grad_norm": 0.45380479097366333, + "learning_rate": 0.00019968521982654058, + "loss": 0.8503, + "step": 587 + }, + { + "epoch": 0.1047008547008547, + "grad_norm": 0.49759066104888916, + "learning_rate": 0.00019968410909715947, + "loss": 1.4073, + "step": 588 + }, + { + "epoch": 0.10487891737891739, + "grad_norm": 0.4421198070049286, + "learning_rate": 0.0001996829964146753, + "loss": 1.1512, + "step": 589 + }, + { + "epoch": 0.10505698005698005, + "grad_norm": 0.46675658226013184, + "learning_rate": 0.00019968188177910988, + "loss": 1.0132, + "step": 590 + }, + { + "epoch": 0.10523504273504274, + "grad_norm": 0.5710657238960266, + "learning_rate": 0.00019968076519048507, + "loss": 1.267, + "step": 591 + }, + { + "epoch": 0.10541310541310542, + "grad_norm": 0.4655563235282898, + "learning_rate": 0.00019967964664882276, + "loss": 1.1204, + "step": 592 + }, + { + "epoch": 0.10559116809116809, + "grad_norm": 0.3895256519317627, + "learning_rate": 0.00019967852615414478, + "loss": 1.0814, + "step": 593 + }, + { + "epoch": 0.10576923076923077, + 
"grad_norm": 0.424216091632843, + "learning_rate": 0.00019967740370647322, + "loss": 1.1663, + "step": 594 + }, + { + "epoch": 0.10594729344729345, + "grad_norm": 0.3978985846042633, + "learning_rate": 0.00019967627930582996, + "loss": 0.909, + "step": 595 + }, + { + "epoch": 0.10612535612535613, + "grad_norm": 0.47064995765686035, + "learning_rate": 0.00019967515295223705, + "loss": 1.2351, + "step": 596 + }, + { + "epoch": 0.1063034188034188, + "grad_norm": 0.42449644207954407, + "learning_rate": 0.0001996740246457166, + "loss": 0.9739, + "step": 597 + }, + { + "epoch": 0.10648148148148148, + "grad_norm": 0.39033401012420654, + "learning_rate": 0.00019967289438629066, + "loss": 1.0933, + "step": 598 + }, + { + "epoch": 0.10665954415954416, + "grad_norm": 0.4398612678050995, + "learning_rate": 0.00019967176217398143, + "loss": 1.2479, + "step": 599 + }, + { + "epoch": 0.10683760683760683, + "grad_norm": 0.3946632742881775, + "learning_rate": 0.00019967062800881107, + "loss": 1.0417, + "step": 600 + }, + { + "epoch": 0.10701566951566951, + "grad_norm": 0.5083445906639099, + "learning_rate": 0.0001996694918908018, + "loss": 1.1109, + "step": 601 + }, + { + "epoch": 0.1071937321937322, + "grad_norm": 0.477724552154541, + "learning_rate": 0.00019966835381997585, + "loss": 1.2891, + "step": 602 + }, + { + "epoch": 0.10737179487179487, + "grad_norm": 0.4110167920589447, + "learning_rate": 0.0001996672137963556, + "loss": 1.0555, + "step": 603 + }, + { + "epoch": 0.10754985754985755, + "grad_norm": 0.44078320264816284, + "learning_rate": 0.00019966607181996334, + "loss": 0.9188, + "step": 604 + }, + { + "epoch": 0.10772792022792023, + "grad_norm": 0.41251105070114136, + "learning_rate": 0.00019966492789082142, + "loss": 1.2592, + "step": 605 + }, + { + "epoch": 0.10790598290598291, + "grad_norm": 0.37701505422592163, + "learning_rate": 0.00019966378200895227, + "loss": 1.0233, + "step": 606 + }, + { + "epoch": 0.10808404558404558, + "grad_norm": 0.44624966382980347, + 
"learning_rate": 0.00019966263417437835, + "loss": 1.2273, + "step": 607 + }, + { + "epoch": 0.10826210826210826, + "grad_norm": 0.3618549108505249, + "learning_rate": 0.00019966148438712214, + "loss": 0.9101, + "step": 608 + }, + { + "epoch": 0.10844017094017094, + "grad_norm": 0.384574294090271, + "learning_rate": 0.00019966033264720616, + "loss": 1.1769, + "step": 609 + }, + { + "epoch": 0.10861823361823361, + "grad_norm": 0.50872403383255, + "learning_rate": 0.000199659178954653, + "loss": 1.1213, + "step": 610 + }, + { + "epoch": 0.1087962962962963, + "grad_norm": 0.39736685156822205, + "learning_rate": 0.00019965802330948527, + "loss": 1.275, + "step": 611 + }, + { + "epoch": 0.10897435897435898, + "grad_norm": 0.484660267829895, + "learning_rate": 0.00019965686571172557, + "loss": 1.1671, + "step": 612 + }, + { + "epoch": 0.10915242165242166, + "grad_norm": 0.41420218348503113, + "learning_rate": 0.0001996557061613966, + "loss": 0.9541, + "step": 613 + }, + { + "epoch": 0.10933048433048433, + "grad_norm": 0.4057196080684662, + "learning_rate": 0.00019965454465852112, + "loss": 1.0145, + "step": 614 + }, + { + "epoch": 0.10950854700854701, + "grad_norm": 0.4559510052204132, + "learning_rate": 0.00019965338120312182, + "loss": 1.0889, + "step": 615 + }, + { + "epoch": 0.10968660968660969, + "grad_norm": 0.40960055589675903, + "learning_rate": 0.00019965221579522154, + "loss": 1.1447, + "step": 616 + }, + { + "epoch": 0.10986467236467236, + "grad_norm": 0.4701732099056244, + "learning_rate": 0.0001996510484348431, + "loss": 1.2871, + "step": 617 + }, + { + "epoch": 0.11004273504273504, + "grad_norm": 0.38420796394348145, + "learning_rate": 0.0001996498791220094, + "loss": 1.058, + "step": 618 + }, + { + "epoch": 0.11022079772079772, + "grad_norm": 0.4014730453491211, + "learning_rate": 0.00019964870785674327, + "loss": 1.023, + "step": 619 + }, + { + "epoch": 0.1103988603988604, + "grad_norm": 0.38846179842948914, + "learning_rate": 0.00019964753463906773, + 
"loss": 0.9834, + "step": 620 + }, + { + "epoch": 0.11057692307692307, + "grad_norm": 0.5120236277580261, + "learning_rate": 0.00019964635946900577, + "loss": 1.2347, + "step": 621 + }, + { + "epoch": 0.11075498575498575, + "grad_norm": 0.40483301877975464, + "learning_rate": 0.00019964518234658038, + "loss": 1.131, + "step": 622 + }, + { + "epoch": 0.11093304843304844, + "grad_norm": 0.445782870054245, + "learning_rate": 0.00019964400327181464, + "loss": 0.9349, + "step": 623 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 0.490460604429245, + "learning_rate": 0.00019964282224473165, + "loss": 1.0257, + "step": 624 + }, + { + "epoch": 0.11128917378917379, + "grad_norm": 0.37585243582725525, + "learning_rate": 0.00019964163926535454, + "loss": 0.9724, + "step": 625 + }, + { + "epoch": 0.11146723646723647, + "grad_norm": 0.4160473346710205, + "learning_rate": 0.00019964045433370651, + "loss": 0.874, + "step": 626 + }, + { + "epoch": 0.11164529914529915, + "grad_norm": 0.442425012588501, + "learning_rate": 0.00019963926744981074, + "loss": 1.064, + "step": 627 + }, + { + "epoch": 0.11182336182336182, + "grad_norm": 0.4451471269130707, + "learning_rate": 0.00019963807861369054, + "loss": 1.2343, + "step": 628 + }, + { + "epoch": 0.1120014245014245, + "grad_norm": 0.5018183588981628, + "learning_rate": 0.00019963688782536913, + "loss": 1.1226, + "step": 629 + }, + { + "epoch": 0.11217948717948718, + "grad_norm": 0.43723925948143005, + "learning_rate": 0.0001996356950848699, + "loss": 1.0178, + "step": 630 + }, + { + "epoch": 0.11235754985754985, + "grad_norm": 0.4794611930847168, + "learning_rate": 0.0001996345003922162, + "loss": 0.9695, + "step": 631 + }, + { + "epoch": 0.11253561253561253, + "grad_norm": 0.5021790266036987, + "learning_rate": 0.00019963330374743143, + "loss": 1.1748, + "step": 632 + }, + { + "epoch": 0.11271367521367522, + "grad_norm": 0.47228625416755676, + "learning_rate": 0.00019963210515053906, + "loss": 1.2138, + "step": 633 + }, + { + 
"epoch": 0.1128917378917379, + "grad_norm": 0.4261155128479004, + "learning_rate": 0.00019963090460156256, + "loss": 0.9428, + "step": 634 + }, + { + "epoch": 0.11306980056980057, + "grad_norm": 0.3279525339603424, + "learning_rate": 0.00019962970210052542, + "loss": 0.7803, + "step": 635 + }, + { + "epoch": 0.11324786324786325, + "grad_norm": 0.5106086730957031, + "learning_rate": 0.00019962849764745125, + "loss": 1.113, + "step": 636 + }, + { + "epoch": 0.11342592592592593, + "grad_norm": 0.38272222876548767, + "learning_rate": 0.00019962729124236363, + "loss": 0.896, + "step": 637 + }, + { + "epoch": 0.1136039886039886, + "grad_norm": 0.39532098174095154, + "learning_rate": 0.0001996260828852862, + "loss": 0.9308, + "step": 638 + }, + { + "epoch": 0.11378205128205128, + "grad_norm": 0.44947221875190735, + "learning_rate": 0.00019962487257624262, + "loss": 1.207, + "step": 639 + }, + { + "epoch": 0.11396011396011396, + "grad_norm": 0.40684598684310913, + "learning_rate": 0.00019962366031525664, + "loss": 1.11, + "step": 640 + }, + { + "epoch": 0.11413817663817664, + "grad_norm": 0.4296625852584839, + "learning_rate": 0.00019962244610235194, + "loss": 1.2784, + "step": 641 + }, + { + "epoch": 0.11431623931623931, + "grad_norm": 0.4560794532299042, + "learning_rate": 0.0001996212299375524, + "loss": 1.1191, + "step": 642 + }, + { + "epoch": 0.114494301994302, + "grad_norm": 0.40246087312698364, + "learning_rate": 0.00019962001182088177, + "loss": 1.1401, + "step": 643 + }, + { + "epoch": 0.11467236467236468, + "grad_norm": 0.3938910663127899, + "learning_rate": 0.000199618791752364, + "loss": 1.0959, + "step": 644 + }, + { + "epoch": 0.11485042735042734, + "grad_norm": 0.4123380184173584, + "learning_rate": 0.00019961756973202287, + "loss": 1.2824, + "step": 645 + }, + { + "epoch": 0.11502849002849003, + "grad_norm": 0.41085442900657654, + "learning_rate": 0.00019961634575988243, + "loss": 1.1137, + "step": 646 + }, + { + "epoch": 0.11520655270655271, + 
"grad_norm": 0.38276201486587524, + "learning_rate": 0.0001996151198359667, + "loss": 1.0747, + "step": 647 + }, + { + "epoch": 0.11538461538461539, + "grad_norm": 0.49269407987594604, + "learning_rate": 0.00019961389196029953, + "loss": 1.1731, + "step": 648 + }, + { + "epoch": 0.11556267806267806, + "grad_norm": 0.5152469277381897, + "learning_rate": 0.00019961266213290512, + "loss": 1.3574, + "step": 649 + }, + { + "epoch": 0.11574074074074074, + "grad_norm": 0.4835714101791382, + "learning_rate": 0.0001996114303538075, + "loss": 1.2859, + "step": 650 + }, + { + "epoch": 0.11591880341880342, + "grad_norm": 0.4284524917602539, + "learning_rate": 0.00019961019662303087, + "loss": 1.1103, + "step": 651 + }, + { + "epoch": 0.11609686609686609, + "grad_norm": 0.3933276832103729, + "learning_rate": 0.00019960896094059933, + "loss": 1.2647, + "step": 652 + }, + { + "epoch": 0.11627492877492877, + "grad_norm": 0.33749741315841675, + "learning_rate": 0.00019960772330653712, + "loss": 0.819, + "step": 653 + }, + { + "epoch": 0.11645299145299146, + "grad_norm": 0.48122069239616394, + "learning_rate": 0.00019960648372086852, + "loss": 1.2781, + "step": 654 + }, + { + "epoch": 0.11663105413105414, + "grad_norm": 0.4681607186794281, + "learning_rate": 0.00019960524218361775, + "loss": 0.9723, + "step": 655 + }, + { + "epoch": 0.1168091168091168, + "grad_norm": 0.3974960148334503, + "learning_rate": 0.0001996039986948092, + "loss": 1.0302, + "step": 656 + }, + { + "epoch": 0.11698717948717949, + "grad_norm": 0.43180662393569946, + "learning_rate": 0.0001996027532544672, + "loss": 1.3265, + "step": 657 + }, + { + "epoch": 0.11716524216524217, + "grad_norm": 0.4481917917728424, + "learning_rate": 0.00019960150586261613, + "loss": 1.136, + "step": 658 + }, + { + "epoch": 0.11734330484330484, + "grad_norm": 0.43428945541381836, + "learning_rate": 0.00019960025651928045, + "loss": 1.2412, + "step": 659 + }, + { + "epoch": 0.11752136752136752, + "grad_norm": 0.36211395263671875, + 
"learning_rate": 0.00019959900522448467, + "loss": 0.9563, + "step": 660 + }, + { + "epoch": 0.1176994301994302, + "grad_norm": 0.43585848808288574, + "learning_rate": 0.0001995977519782533, + "loss": 1.1677, + "step": 661 + }, + { + "epoch": 0.11787749287749288, + "grad_norm": 0.4232597351074219, + "learning_rate": 0.00019959649678061086, + "loss": 1.1187, + "step": 662 + }, + { + "epoch": 0.11805555555555555, + "grad_norm": 0.3304753303527832, + "learning_rate": 0.00019959523963158194, + "loss": 0.8473, + "step": 663 + }, + { + "epoch": 0.11823361823361823, + "grad_norm": 0.37600061297416687, + "learning_rate": 0.0001995939805311912, + "loss": 1.1227, + "step": 664 + }, + { + "epoch": 0.11841168091168092, + "grad_norm": 0.33417847752571106, + "learning_rate": 0.0001995927194794633, + "loss": 1.0315, + "step": 665 + }, + { + "epoch": 0.11858974358974358, + "grad_norm": 0.46799129247665405, + "learning_rate": 0.00019959145647642298, + "loss": 1.135, + "step": 666 + }, + { + "epoch": 0.11876780626780627, + "grad_norm": 0.4141576886177063, + "learning_rate": 0.0001995901915220949, + "loss": 1.0956, + "step": 667 + }, + { + "epoch": 0.11894586894586895, + "grad_norm": 0.3824596405029297, + "learning_rate": 0.0001995889246165039, + "loss": 1.1782, + "step": 668 + }, + { + "epoch": 0.11912393162393162, + "grad_norm": 0.4087786376476288, + "learning_rate": 0.00019958765575967484, + "loss": 0.9704, + "step": 669 + }, + { + "epoch": 0.1193019943019943, + "grad_norm": 0.5161317586898804, + "learning_rate": 0.00019958638495163252, + "loss": 1.2207, + "step": 670 + }, + { + "epoch": 0.11948005698005698, + "grad_norm": 0.4782274067401886, + "learning_rate": 0.0001995851121924019, + "loss": 1.1257, + "step": 671 + }, + { + "epoch": 0.11965811965811966, + "grad_norm": 0.40617331862449646, + "learning_rate": 0.00019958383748200782, + "loss": 1.1153, + "step": 672 + }, + { + "epoch": 0.11983618233618233, + "grad_norm": 0.40149980783462524, + "learning_rate": 
0.00019958256082047533, + "loss": 0.9785, + "step": 673 + }, + { + "epoch": 0.12001424501424501, + "grad_norm": 0.4378886818885803, + "learning_rate": 0.00019958128220782942, + "loss": 1.1355, + "step": 674 + }, + { + "epoch": 0.1201923076923077, + "grad_norm": 0.4449596703052521, + "learning_rate": 0.0001995800016440952, + "loss": 1.0325, + "step": 675 + }, + { + "epoch": 0.12037037037037036, + "grad_norm": 0.4268079698085785, + "learning_rate": 0.00019957871912929765, + "loss": 1.1901, + "step": 676 + }, + { + "epoch": 0.12054843304843305, + "grad_norm": 0.4250091016292572, + "learning_rate": 0.00019957743466346198, + "loss": 1.0084, + "step": 677 + }, + { + "epoch": 0.12072649572649573, + "grad_norm": 0.40724286437034607, + "learning_rate": 0.0001995761482466133, + "loss": 1.0866, + "step": 678 + }, + { + "epoch": 0.12090455840455841, + "grad_norm": 0.42478349804878235, + "learning_rate": 0.00019957485987877688, + "loss": 1.1909, + "step": 679 + }, + { + "epoch": 0.12108262108262108, + "grad_norm": 0.371362566947937, + "learning_rate": 0.0001995735695599779, + "loss": 1.083, + "step": 680 + }, + { + "epoch": 0.12126068376068376, + "grad_norm": 0.4715283513069153, + "learning_rate": 0.0001995722772902417, + "loss": 1.2942, + "step": 681 + }, + { + "epoch": 0.12143874643874644, + "grad_norm": 0.3611983060836792, + "learning_rate": 0.00019957098306959355, + "loss": 0.9878, + "step": 682 + }, + { + "epoch": 0.12161680911680911, + "grad_norm": 0.4764883816242218, + "learning_rate": 0.00019956968689805883, + "loss": 1.0082, + "step": 683 + }, + { + "epoch": 0.12179487179487179, + "grad_norm": 0.33170604705810547, + "learning_rate": 0.00019956838877566293, + "loss": 0.8529, + "step": 684 + }, + { + "epoch": 0.12197293447293447, + "grad_norm": 0.46896886825561523, + "learning_rate": 0.00019956708870243133, + "loss": 1.0745, + "step": 685 + }, + { + "epoch": 0.12215099715099716, + "grad_norm": 0.4120674431324005, + "learning_rate": 0.00019956578667838941, + "loss": 
1.1828, + "step": 686 + }, + { + "epoch": 0.12232905982905982, + "grad_norm": 0.45671191811561584, + "learning_rate": 0.00019956448270356275, + "loss": 1.3484, + "step": 687 + }, + { + "epoch": 0.1225071225071225, + "grad_norm": 0.4023838937282562, + "learning_rate": 0.00019956317677797687, + "loss": 0.9623, + "step": 688 + }, + { + "epoch": 0.12268518518518519, + "grad_norm": 0.5205856561660767, + "learning_rate": 0.00019956186890165737, + "loss": 1.2221, + "step": 689 + }, + { + "epoch": 0.12286324786324786, + "grad_norm": 0.43956050276756287, + "learning_rate": 0.00019956055907462987, + "loss": 1.1051, + "step": 690 + }, + { + "epoch": 0.12304131054131054, + "grad_norm": 0.4341758191585541, + "learning_rate": 0.00019955924729692003, + "loss": 0.8972, + "step": 691 + }, + { + "epoch": 0.12321937321937322, + "grad_norm": 0.42025020718574524, + "learning_rate": 0.00019955793356855357, + "loss": 1.1137, + "step": 692 + }, + { + "epoch": 0.1233974358974359, + "grad_norm": 0.44375079870224, + "learning_rate": 0.0001995566178895562, + "loss": 1.2783, + "step": 693 + }, + { + "epoch": 0.12357549857549857, + "grad_norm": 0.4703320264816284, + "learning_rate": 0.00019955530025995372, + "loss": 1.1991, + "step": 694 + }, + { + "epoch": 0.12375356125356125, + "grad_norm": 0.43781620264053345, + "learning_rate": 0.00019955398067977195, + "loss": 1.2316, + "step": 695 + }, + { + "epoch": 0.12393162393162394, + "grad_norm": 0.4362877607345581, + "learning_rate": 0.0001995526591490367, + "loss": 1.1374, + "step": 696 + }, + { + "epoch": 0.1241096866096866, + "grad_norm": 0.4434499442577362, + "learning_rate": 0.00019955133566777392, + "loss": 1.1034, + "step": 697 + }, + { + "epoch": 0.12428774928774929, + "grad_norm": 0.46613508462905884, + "learning_rate": 0.00019955001023600955, + "loss": 1.2252, + "step": 698 + }, + { + "epoch": 0.12446581196581197, + "grad_norm": 0.46226736903190613, + "learning_rate": 0.00019954868285376945, + "loss": 1.0296, + "step": 699 + }, + { + 
"epoch": 0.12464387464387465, + "grad_norm": 0.4460904002189636, + "learning_rate": 0.00019954735352107977, + "loss": 1.0553, + "step": 700 + }, + { + "epoch": 0.12482193732193732, + "grad_norm": 0.36708924174308777, + "learning_rate": 0.00019954602223796648, + "loss": 0.9384, + "step": 701 + }, + { + "epoch": 0.125, + "grad_norm": 0.3780093491077423, + "learning_rate": 0.00019954468900445566, + "loss": 0.9062, + "step": 702 + }, + { + "epoch": 0.12517806267806267, + "grad_norm": 0.41797417402267456, + "learning_rate": 0.00019954335382057345, + "loss": 1.0344, + "step": 703 + }, + { + "epoch": 0.12535612535612536, + "grad_norm": 0.43710798025131226, + "learning_rate": 0.00019954201668634597, + "loss": 1.1324, + "step": 704 + }, + { + "epoch": 0.12553418803418803, + "grad_norm": 0.4732789695262909, + "learning_rate": 0.00019954067760179952, + "loss": 1.1419, + "step": 705 + }, + { + "epoch": 0.1257122507122507, + "grad_norm": 0.43248575925827026, + "learning_rate": 0.00019953933656696022, + "loss": 1.5112, + "step": 706 + }, + { + "epoch": 0.1258903133903134, + "grad_norm": 0.4074753522872925, + "learning_rate": 0.00019953799358185442, + "loss": 0.9751, + "step": 707 + }, + { + "epoch": 0.12606837606837606, + "grad_norm": 0.4586823880672455, + "learning_rate": 0.0001995366486465084, + "loss": 1.267, + "step": 708 + }, + { + "epoch": 0.12624643874643873, + "grad_norm": 0.4716857075691223, + "learning_rate": 0.0001995353017609485, + "loss": 1.1636, + "step": 709 + }, + { + "epoch": 0.12642450142450143, + "grad_norm": 0.5214398503303528, + "learning_rate": 0.00019953395292520115, + "loss": 1.2317, + "step": 710 + }, + { + "epoch": 0.1266025641025641, + "grad_norm": 0.42961129546165466, + "learning_rate": 0.00019953260213929276, + "loss": 1.0271, + "step": 711 + }, + { + "epoch": 0.1267806267806268, + "grad_norm": 0.4764653444290161, + "learning_rate": 0.00019953124940324979, + "loss": 1.1747, + "step": 712 + }, + { + "epoch": 0.12695868945868946, + "grad_norm": 
0.4420304000377655, + "learning_rate": 0.00019952989471709874, + "loss": 0.9783, + "step": 713 + }, + { + "epoch": 0.12713675213675213, + "grad_norm": 0.44114625453948975, + "learning_rate": 0.00019952853808086616, + "loss": 1.1953, + "step": 714 + }, + { + "epoch": 0.12731481481481483, + "grad_norm": 0.501923143863678, + "learning_rate": 0.0001995271794945786, + "loss": 0.9886, + "step": 715 + }, + { + "epoch": 0.1274928774928775, + "grad_norm": 0.42266538739204407, + "learning_rate": 0.00019952581895826276, + "loss": 1.2033, + "step": 716 + }, + { + "epoch": 0.12767094017094016, + "grad_norm": 0.37770554423332214, + "learning_rate": 0.00019952445647194523, + "loss": 1.0164, + "step": 717 + }, + { + "epoch": 0.12784900284900286, + "grad_norm": 0.369266152381897, + "learning_rate": 0.00019952309203565268, + "loss": 0.9186, + "step": 718 + }, + { + "epoch": 0.12802706552706553, + "grad_norm": 0.40446221828460693, + "learning_rate": 0.00019952172564941193, + "loss": 1.1576, + "step": 719 + }, + { + "epoch": 0.1282051282051282, + "grad_norm": 0.504172146320343, + "learning_rate": 0.00019952035731324967, + "loss": 1.2695, + "step": 720 + }, + { + "epoch": 0.1283831908831909, + "grad_norm": 0.37284108996391296, + "learning_rate": 0.0001995189870271928, + "loss": 1.0288, + "step": 721 + }, + { + "epoch": 0.12856125356125356, + "grad_norm": 0.41811618208885193, + "learning_rate": 0.00019951761479126805, + "loss": 1.2241, + "step": 722 + }, + { + "epoch": 0.12873931623931623, + "grad_norm": 0.44706249237060547, + "learning_rate": 0.0001995162406055024, + "loss": 1.0831, + "step": 723 + }, + { + "epoch": 0.12891737891737892, + "grad_norm": 0.426572322845459, + "learning_rate": 0.00019951486446992273, + "loss": 1.0047, + "step": 724 + }, + { + "epoch": 0.1290954415954416, + "grad_norm": 0.4446277618408203, + "learning_rate": 0.00019951348638455602, + "loss": 1.0827, + "step": 725 + }, + { + "epoch": 0.12927350427350429, + "grad_norm": 0.3934919834136963, + "learning_rate": 
0.00019951210634942926, + "loss": 0.9808, + "step": 726 + }, + { + "epoch": 0.12945156695156695, + "grad_norm": 0.4316558241844177, + "learning_rate": 0.0001995107243645695, + "loss": 1.3341, + "step": 727 + }, + { + "epoch": 0.12962962962962962, + "grad_norm": 0.43074217438697815, + "learning_rate": 0.00019950934043000382, + "loss": 1.007, + "step": 728 + }, + { + "epoch": 0.12980769230769232, + "grad_norm": 0.5212171673774719, + "learning_rate": 0.0001995079545457593, + "loss": 1.1822, + "step": 729 + }, + { + "epoch": 0.129985754985755, + "grad_norm": 0.3749600946903229, + "learning_rate": 0.00019950656671186313, + "loss": 0.9657, + "step": 730 + }, + { + "epoch": 0.13016381766381765, + "grad_norm": 0.36626043915748596, + "learning_rate": 0.00019950517692834252, + "loss": 1.1274, + "step": 731 + }, + { + "epoch": 0.13034188034188035, + "grad_norm": 0.4635467529296875, + "learning_rate": 0.00019950378519522467, + "loss": 1.2305, + "step": 732 + }, + { + "epoch": 0.13051994301994302, + "grad_norm": 0.4077455699443817, + "learning_rate": 0.00019950239151253683, + "loss": 0.9485, + "step": 733 + }, + { + "epoch": 0.1306980056980057, + "grad_norm": 0.4222758114337921, + "learning_rate": 0.0001995009958803063, + "loss": 1.0376, + "step": 734 + }, + { + "epoch": 0.13087606837606838, + "grad_norm": 0.4330402612686157, + "learning_rate": 0.0001994995982985605, + "loss": 1.1774, + "step": 735 + }, + { + "epoch": 0.13105413105413105, + "grad_norm": 0.42275673151016235, + "learning_rate": 0.00019949819876732673, + "loss": 1.1238, + "step": 736 + }, + { + "epoch": 0.13123219373219372, + "grad_norm": 0.45576968789100647, + "learning_rate": 0.00019949679728663246, + "loss": 1.0428, + "step": 737 + }, + { + "epoch": 0.13141025641025642, + "grad_norm": 0.5508752465248108, + "learning_rate": 0.00019949539385650514, + "loss": 1.3221, + "step": 738 + }, + { + "epoch": 0.13158831908831908, + "grad_norm": 0.4115872383117676, + "learning_rate": 0.00019949398847697225, + "loss": 
1.0301, + "step": 739 + }, + { + "epoch": 0.13176638176638178, + "grad_norm": 0.4662442207336426, + "learning_rate": 0.00019949258114806132, + "loss": 1.3263, + "step": 740 + }, + { + "epoch": 0.13194444444444445, + "grad_norm": 0.6077266931533813, + "learning_rate": 0.00019949117186979999, + "loss": 1.0269, + "step": 741 + }, + { + "epoch": 0.13212250712250712, + "grad_norm": 0.47039318084716797, + "learning_rate": 0.00019948976064221579, + "loss": 1.3782, + "step": 742 + }, + { + "epoch": 0.1323005698005698, + "grad_norm": 0.4773450493812561, + "learning_rate": 0.0001994883474653364, + "loss": 1.289, + "step": 743 + }, + { + "epoch": 0.13247863247863248, + "grad_norm": 0.40180155634880066, + "learning_rate": 0.00019948693233918952, + "loss": 0.8691, + "step": 744 + }, + { + "epoch": 0.13265669515669515, + "grad_norm": 0.45216289162635803, + "learning_rate": 0.00019948551526380288, + "loss": 1.071, + "step": 745 + }, + { + "epoch": 0.13283475783475784, + "grad_norm": 0.4289272427558899, + "learning_rate": 0.0001994840962392042, + "loss": 1.0422, + "step": 746 + }, + { + "epoch": 0.1330128205128205, + "grad_norm": 0.4617730379104614, + "learning_rate": 0.00019948267526542134, + "loss": 1.0835, + "step": 747 + }, + { + "epoch": 0.13319088319088318, + "grad_norm": 0.42710617184638977, + "learning_rate": 0.00019948125234248208, + "loss": 1.0535, + "step": 748 + }, + { + "epoch": 0.13336894586894588, + "grad_norm": 0.43433234095573425, + "learning_rate": 0.0001994798274704144, + "loss": 0.9313, + "step": 749 + }, + { + "epoch": 0.13354700854700854, + "grad_norm": 0.46270284056663513, + "learning_rate": 0.0001994784006492461, + "loss": 1.0903, + "step": 750 + }, + { + "epoch": 0.1337250712250712, + "grad_norm": 0.5319814682006836, + "learning_rate": 0.00019947697187900517, + "loss": 1.2329, + "step": 751 + }, + { + "epoch": 0.1339031339031339, + "grad_norm": 0.3511372208595276, + "learning_rate": 0.00019947554115971967, + "loss": 0.7116, + "step": 752 + }, + { + 
"epoch": 0.13408119658119658, + "grad_norm": 0.4103890359401703, + "learning_rate": 0.00019947410849141756, + "loss": 1.1527, + "step": 753 + }, + { + "epoch": 0.13425925925925927, + "grad_norm": 0.5390757322311401, + "learning_rate": 0.00019947267387412695, + "loss": 1.1682, + "step": 754 + }, + { + "epoch": 0.13443732193732194, + "grad_norm": 0.29939723014831543, + "learning_rate": 0.0001994712373078759, + "loss": 0.5848, + "step": 755 + }, + { + "epoch": 0.1346153846153846, + "grad_norm": 0.4605920612812042, + "learning_rate": 0.0001994697987926926, + "loss": 0.9448, + "step": 756 + }, + { + "epoch": 0.1347934472934473, + "grad_norm": 0.426213800907135, + "learning_rate": 0.00019946835832860527, + "loss": 1.0487, + "step": 757 + }, + { + "epoch": 0.13497150997150997, + "grad_norm": 0.4209515154361725, + "learning_rate": 0.00019946691591564203, + "loss": 1.0951, + "step": 758 + }, + { + "epoch": 0.13514957264957264, + "grad_norm": 0.39555591344833374, + "learning_rate": 0.0001994654715538312, + "loss": 0.8754, + "step": 759 + }, + { + "epoch": 0.13532763532763534, + "grad_norm": 0.4065483510494232, + "learning_rate": 0.0001994640252432011, + "loss": 0.9451, + "step": 760 + }, + { + "epoch": 0.135505698005698, + "grad_norm": 0.4489104151725769, + "learning_rate": 0.00019946257698378003, + "loss": 1.2031, + "step": 761 + }, + { + "epoch": 0.13568376068376067, + "grad_norm": 0.39928409457206726, + "learning_rate": 0.0001994611267755964, + "loss": 1.1124, + "step": 762 + }, + { + "epoch": 0.13586182336182337, + "grad_norm": 0.4145409166812897, + "learning_rate": 0.00019945967461867858, + "loss": 1.083, + "step": 763 + }, + { + "epoch": 0.13603988603988604, + "grad_norm": 0.43508613109588623, + "learning_rate": 0.00019945822051305507, + "loss": 1.1119, + "step": 764 + }, + { + "epoch": 0.1362179487179487, + "grad_norm": 0.5186598300933838, + "learning_rate": 0.0001994567644587543, + "loss": 1.3256, + "step": 765 + }, + { + "epoch": 0.1363960113960114, + "grad_norm": 
0.4615778625011444, + "learning_rate": 0.00019945530645580487, + "loss": 1.3906, + "step": 766 + }, + { + "epoch": 0.13657407407407407, + "grad_norm": 0.4838152527809143, + "learning_rate": 0.00019945384650423532, + "loss": 0.8169, + "step": 767 + }, + { + "epoch": 0.13675213675213677, + "grad_norm": 0.49253368377685547, + "learning_rate": 0.0001994523846040742, + "loss": 1.1613, + "step": 768 + }, + { + "epoch": 0.13693019943019943, + "grad_norm": 0.4697009325027466, + "learning_rate": 0.00019945092075535024, + "loss": 1.1722, + "step": 769 + }, + { + "epoch": 0.1371082621082621, + "grad_norm": 0.47162383794784546, + "learning_rate": 0.00019944945495809204, + "loss": 1.054, + "step": 770 + }, + { + "epoch": 0.1372863247863248, + "grad_norm": 0.4653547704219818, + "learning_rate": 0.00019944798721232835, + "loss": 1.1791, + "step": 771 + }, + { + "epoch": 0.13746438746438747, + "grad_norm": 0.4244011640548706, + "learning_rate": 0.000199446517518088, + "loss": 1.1557, + "step": 772 + }, + { + "epoch": 0.13764245014245013, + "grad_norm": 0.43812859058380127, + "learning_rate": 0.00019944504587539967, + "loss": 1.1567, + "step": 773 + }, + { + "epoch": 0.13782051282051283, + "grad_norm": 0.3984275162220001, + "learning_rate": 0.00019944357228429227, + "loss": 1.0715, + "step": 774 + }, + { + "epoch": 0.1379985754985755, + "grad_norm": 0.3794248104095459, + "learning_rate": 0.0001994420967447946, + "loss": 0.9377, + "step": 775 + }, + { + "epoch": 0.13817663817663817, + "grad_norm": 0.4214578866958618, + "learning_rate": 0.00019944061925693566, + "loss": 1.0112, + "step": 776 + }, + { + "epoch": 0.13835470085470086, + "grad_norm": 0.4738999605178833, + "learning_rate": 0.00019943913982074435, + "loss": 0.8718, + "step": 777 + }, + { + "epoch": 0.13853276353276353, + "grad_norm": 0.43455326557159424, + "learning_rate": 0.00019943765843624965, + "loss": 1.1343, + "step": 778 + }, + { + "epoch": 0.1387108262108262, + "grad_norm": 0.44973456859588623, + "learning_rate": 
0.00019943617510348062, + "loss": 1.0487, + "step": 779 + }, + { + "epoch": 0.1388888888888889, + "grad_norm": 0.4216597080230713, + "learning_rate": 0.00019943468982246628, + "loss": 1.0765, + "step": 780 + }, + { + "epoch": 0.13906695156695156, + "grad_norm": 0.5089883208274841, + "learning_rate": 0.00019943320259323578, + "loss": 1.3137, + "step": 781 + }, + { + "epoch": 0.13924501424501423, + "grad_norm": 0.4358222782611847, + "learning_rate": 0.00019943171341581822, + "loss": 1.1891, + "step": 782 + }, + { + "epoch": 0.13942307692307693, + "grad_norm": 0.40918609499931335, + "learning_rate": 0.00019943022229024275, + "loss": 1.279, + "step": 783 + }, + { + "epoch": 0.1396011396011396, + "grad_norm": 0.4614863395690918, + "learning_rate": 0.00019942872921653866, + "loss": 1.2477, + "step": 784 + }, + { + "epoch": 0.1397792022792023, + "grad_norm": 0.4141528904438019, + "learning_rate": 0.00019942723419473515, + "loss": 0.9622, + "step": 785 + }, + { + "epoch": 0.13995726495726496, + "grad_norm": 0.536139726638794, + "learning_rate": 0.00019942573722486154, + "loss": 1.2127, + "step": 786 + }, + { + "epoch": 0.14013532763532763, + "grad_norm": 0.4968845546245575, + "learning_rate": 0.0001994242383069471, + "loss": 1.2965, + "step": 787 + }, + { + "epoch": 0.14031339031339032, + "grad_norm": 0.3897174894809723, + "learning_rate": 0.00019942273744102132, + "loss": 0.9907, + "step": 788 + }, + { + "epoch": 0.140491452991453, + "grad_norm": 0.466307669878006, + "learning_rate": 0.0001994212346271135, + "loss": 1.2021, + "step": 789 + }, + { + "epoch": 0.14066951566951566, + "grad_norm": 0.49283576011657715, + "learning_rate": 0.0001994197298652531, + "loss": 1.0969, + "step": 790 + }, + { + "epoch": 0.14084757834757836, + "grad_norm": 0.4686102271080017, + "learning_rate": 0.00019941822315546964, + "loss": 1.0125, + "step": 791 + }, + { + "epoch": 0.14102564102564102, + "grad_norm": 0.4389997124671936, + "learning_rate": 0.0001994167144977926, + "loss": 1.1294, + 
"step": 792 + }, + { + "epoch": 0.1412037037037037, + "grad_norm": 0.38539355993270874, + "learning_rate": 0.00019941520389225162, + "loss": 1.1231, + "step": 793 + }, + { + "epoch": 0.1413817663817664, + "grad_norm": 0.4860847592353821, + "learning_rate": 0.00019941369133887618, + "loss": 1.2268, + "step": 794 + }, + { + "epoch": 0.14155982905982906, + "grad_norm": 0.4567467272281647, + "learning_rate": 0.00019941217683769598, + "loss": 1.1482, + "step": 795 + }, + { + "epoch": 0.14173789173789172, + "grad_norm": 0.5549420714378357, + "learning_rate": 0.00019941066038874067, + "loss": 1.1899, + "step": 796 + }, + { + "epoch": 0.14191595441595442, + "grad_norm": 0.3950003385543823, + "learning_rate": 0.00019940914199204, + "loss": 0.96, + "step": 797 + }, + { + "epoch": 0.1420940170940171, + "grad_norm": 0.43845999240875244, + "learning_rate": 0.00019940762164762373, + "loss": 1.0338, + "step": 798 + }, + { + "epoch": 0.14227207977207978, + "grad_norm": 0.468537300825119, + "learning_rate": 0.00019940609935552157, + "loss": 1.2416, + "step": 799 + }, + { + "epoch": 0.14245014245014245, + "grad_norm": 0.4292038679122925, + "learning_rate": 0.0001994045751157634, + "loss": 1.1397, + "step": 800 + }, + { + "epoch": 0.14262820512820512, + "grad_norm": 0.3800995647907257, + "learning_rate": 0.00019940304892837908, + "loss": 0.939, + "step": 801 + }, + { + "epoch": 0.14280626780626782, + "grad_norm": 0.38004353642463684, + "learning_rate": 0.00019940152079339852, + "loss": 1.0485, + "step": 802 + }, + { + "epoch": 0.14298433048433049, + "grad_norm": 0.4658142924308777, + "learning_rate": 0.00019939999071085163, + "loss": 1.1561, + "step": 803 + }, + { + "epoch": 0.14316239316239315, + "grad_norm": 0.4235048294067383, + "learning_rate": 0.0001993984586807684, + "loss": 1.0516, + "step": 804 + }, + { + "epoch": 0.14334045584045585, + "grad_norm": 0.42925819754600525, + "learning_rate": 0.00019939692470317887, + "loss": 1.2238, + "step": 805 + }, + { + "epoch": 
0.14351851851851852, + "grad_norm": 0.43701639771461487, + "learning_rate": 0.00019939538877811308, + "loss": 1.0129, + "step": 806 + }, + { + "epoch": 0.14369658119658119, + "grad_norm": 0.42786353826522827, + "learning_rate": 0.00019939385090560113, + "loss": 1.1355, + "step": 807 + }, + { + "epoch": 0.14387464387464388, + "grad_norm": 0.371218740940094, + "learning_rate": 0.00019939231108567312, + "loss": 0.9712, + "step": 808 + }, + { + "epoch": 0.14405270655270655, + "grad_norm": 0.4834294617176056, + "learning_rate": 0.00019939076931835926, + "loss": 1.1375, + "step": 809 + }, + { + "epoch": 0.14423076923076922, + "grad_norm": 0.4700150191783905, + "learning_rate": 0.00019938922560368974, + "loss": 1.1943, + "step": 810 + }, + { + "epoch": 0.14440883190883191, + "grad_norm": 0.4430996775627136, + "learning_rate": 0.0001993876799416948, + "loss": 1.1976, + "step": 811 + }, + { + "epoch": 0.14458689458689458, + "grad_norm": 0.4161672592163086, + "learning_rate": 0.00019938613233240476, + "loss": 1.0291, + "step": 812 + }, + { + "epoch": 0.14476495726495728, + "grad_norm": 0.39838850498199463, + "learning_rate": 0.0001993845827758499, + "loss": 1.2103, + "step": 813 + }, + { + "epoch": 0.14494301994301995, + "grad_norm": 0.429198294878006, + "learning_rate": 0.00019938303127206057, + "loss": 0.9971, + "step": 814 + }, + { + "epoch": 0.14512108262108261, + "grad_norm": 0.4589254856109619, + "learning_rate": 0.00019938147782106719, + "loss": 1.2392, + "step": 815 + }, + { + "epoch": 0.1452991452991453, + "grad_norm": 0.42506635189056396, + "learning_rate": 0.00019937992242290023, + "loss": 1.0827, + "step": 816 + }, + { + "epoch": 0.14547720797720798, + "grad_norm": 0.3778113126754761, + "learning_rate": 0.00019937836507759012, + "loss": 1.021, + "step": 817 + }, + { + "epoch": 0.14565527065527065, + "grad_norm": 0.43071216344833374, + "learning_rate": 0.0001993768057851674, + "loss": 1.273, + "step": 818 + }, + { + "epoch": 0.14583333333333334, + "grad_norm": 
0.4944681227207184, + "learning_rate": 0.00019937524454566262, + "loss": 1.3037, + "step": 819 + }, + { + "epoch": 0.146011396011396, + "grad_norm": 0.4438824951648712, + "learning_rate": 0.00019937368135910632, + "loss": 1.1383, + "step": 820 + }, + { + "epoch": 0.14618945868945868, + "grad_norm": 0.400215744972229, + "learning_rate": 0.0001993721162255292, + "loss": 1.0669, + "step": 821 + }, + { + "epoch": 0.14636752136752137, + "grad_norm": 0.4341452121734619, + "learning_rate": 0.00019937054914496185, + "loss": 1.1431, + "step": 822 + }, + { + "epoch": 0.14654558404558404, + "grad_norm": 0.3941744267940521, + "learning_rate": 0.00019936898011743503, + "loss": 1.1593, + "step": 823 + }, + { + "epoch": 0.1467236467236467, + "grad_norm": 0.4318541884422302, + "learning_rate": 0.00019936740914297947, + "loss": 1.2814, + "step": 824 + }, + { + "epoch": 0.1469017094017094, + "grad_norm": 0.44488632678985596, + "learning_rate": 0.00019936583622162595, + "loss": 1.1054, + "step": 825 + }, + { + "epoch": 0.14707977207977208, + "grad_norm": 0.38701096177101135, + "learning_rate": 0.00019936426135340528, + "loss": 1.1086, + "step": 826 + }, + { + "epoch": 0.14725783475783477, + "grad_norm": 0.45794424414634705, + "learning_rate": 0.0001993626845383483, + "loss": 1.2395, + "step": 827 + }, + { + "epoch": 0.14743589743589744, + "grad_norm": 0.49237680435180664, + "learning_rate": 0.00019936110577648596, + "loss": 1.3483, + "step": 828 + }, + { + "epoch": 0.1476139601139601, + "grad_norm": 0.481666624546051, + "learning_rate": 0.00019935952506784914, + "loss": 1.1848, + "step": 829 + }, + { + "epoch": 0.1477920227920228, + "grad_norm": 0.4015209376811981, + "learning_rate": 0.00019935794241246883, + "loss": 1.0624, + "step": 830 + }, + { + "epoch": 0.14797008547008547, + "grad_norm": 0.47975999116897583, + "learning_rate": 0.00019935635781037606, + "loss": 1.1595, + "step": 831 + }, + { + "epoch": 0.14814814814814814, + "grad_norm": 0.4440356492996216, + "learning_rate": 
0.00019935477126160181, + "loss": 1.1325, + "step": 832 + }, + { + "epoch": 0.14832621082621084, + "grad_norm": 0.4167410731315613, + "learning_rate": 0.00019935318276617723, + "loss": 1.0662, + "step": 833 + }, + { + "epoch": 0.1485042735042735, + "grad_norm": 0.4107447862625122, + "learning_rate": 0.0001993515923241334, + "loss": 0.8816, + "step": 834 + }, + { + "epoch": 0.14868233618233617, + "grad_norm": 0.4020158648490906, + "learning_rate": 0.00019934999993550154, + "loss": 0.9797, + "step": 835 + }, + { + "epoch": 0.14886039886039887, + "grad_norm": 0.4186473786830902, + "learning_rate": 0.0001993484056003128, + "loss": 1.1243, + "step": 836 + }, + { + "epoch": 0.14903846153846154, + "grad_norm": 0.5534794926643372, + "learning_rate": 0.00019934680931859842, + "loss": 1.1189, + "step": 837 + }, + { + "epoch": 0.1492165242165242, + "grad_norm": 0.37901270389556885, + "learning_rate": 0.0001993452110903897, + "loss": 0.9241, + "step": 838 + }, + { + "epoch": 0.1493945868945869, + "grad_norm": 0.41773587465286255, + "learning_rate": 0.00019934361091571793, + "loss": 0.9467, + "step": 839 + }, + { + "epoch": 0.14957264957264957, + "grad_norm": 0.4962073564529419, + "learning_rate": 0.00019934200879461448, + "loss": 1.2423, + "step": 840 + }, + { + "epoch": 0.14975071225071226, + "grad_norm": 0.38565897941589355, + "learning_rate": 0.00019934040472711074, + "loss": 1.1545, + "step": 841 + }, + { + "epoch": 0.14992877492877493, + "grad_norm": 0.4295346736907959, + "learning_rate": 0.0001993387987132381, + "loss": 1.2482, + "step": 842 + }, + { + "epoch": 0.1501068376068376, + "grad_norm": 0.4279189705848694, + "learning_rate": 0.0001993371907530281, + "loss": 1.1135, + "step": 843 + }, + { + "epoch": 0.1502849002849003, + "grad_norm": 0.44649168848991394, + "learning_rate": 0.0001993355808465122, + "loss": 1.0734, + "step": 844 + }, + { + "epoch": 0.15046296296296297, + "grad_norm": 0.453707218170166, + "learning_rate": 0.0001993339689937219, + "loss": 1.0992, + 
"step": 845 + }, + { + "epoch": 0.15064102564102563, + "grad_norm": 0.5113263726234436, + "learning_rate": 0.00019933235519468886, + "loss": 1.1792, + "step": 846 + }, + { + "epoch": 0.15081908831908833, + "grad_norm": 0.5822970271110535, + "learning_rate": 0.00019933073944944466, + "loss": 1.367, + "step": 847 + }, + { + "epoch": 0.150997150997151, + "grad_norm": 0.3946528732776642, + "learning_rate": 0.00019932912175802097, + "loss": 0.9781, + "step": 848 + }, + { + "epoch": 0.15117521367521367, + "grad_norm": 0.5429860949516296, + "learning_rate": 0.00019932750212044945, + "loss": 0.9783, + "step": 849 + }, + { + "epoch": 0.15135327635327636, + "grad_norm": 0.45847952365875244, + "learning_rate": 0.0001993258805367619, + "loss": 1.1352, + "step": 850 + }, + { + "epoch": 0.15153133903133903, + "grad_norm": 0.42770692706108093, + "learning_rate": 0.00019932425700699004, + "loss": 1.2365, + "step": 851 + }, + { + "epoch": 0.1517094017094017, + "grad_norm": 0.41845405101776123, + "learning_rate": 0.00019932263153116565, + "loss": 1.2642, + "step": 852 + }, + { + "epoch": 0.1518874643874644, + "grad_norm": 0.4641731083393097, + "learning_rate": 0.00019932100410932066, + "loss": 1.2009, + "step": 853 + }, + { + "epoch": 0.15206552706552706, + "grad_norm": 0.4128672778606415, + "learning_rate": 0.00019931937474148689, + "loss": 1.1981, + "step": 854 + }, + { + "epoch": 0.15224358974358973, + "grad_norm": 0.4730764925479889, + "learning_rate": 0.00019931774342769632, + "loss": 1.2145, + "step": 855 + }, + { + "epoch": 0.15242165242165243, + "grad_norm": 0.36611825227737427, + "learning_rate": 0.00019931611016798089, + "loss": 0.8504, + "step": 856 + }, + { + "epoch": 0.1525997150997151, + "grad_norm": 0.40944692492485046, + "learning_rate": 0.00019931447496237254, + "loss": 1.2853, + "step": 857 + }, + { + "epoch": 0.1527777777777778, + "grad_norm": 0.4521993398666382, + "learning_rate": 0.0001993128378109034, + "loss": 1.0198, + "step": 858 + }, + { + "epoch": 
0.15295584045584046, + "grad_norm": 0.42113015055656433, + "learning_rate": 0.0001993111987136055, + "loss": 1.1284, + "step": 859 + }, + { + "epoch": 0.15313390313390313, + "grad_norm": 0.4117624759674072, + "learning_rate": 0.00019930955767051098, + "loss": 1.0445, + "step": 860 + }, + { + "epoch": 0.15331196581196582, + "grad_norm": 0.4807964265346527, + "learning_rate": 0.00019930791468165197, + "loss": 1.1378, + "step": 861 + }, + { + "epoch": 0.1534900284900285, + "grad_norm": 0.4186483323574066, + "learning_rate": 0.00019930626974706063, + "loss": 1.1636, + "step": 862 + }, + { + "epoch": 0.15366809116809116, + "grad_norm": 0.3764737844467163, + "learning_rate": 0.00019930462286676926, + "loss": 0.9523, + "step": 863 + }, + { + "epoch": 0.15384615384615385, + "grad_norm": 0.4283556044101715, + "learning_rate": 0.00019930297404081008, + "loss": 1.1008, + "step": 864 + }, + { + "epoch": 0.15402421652421652, + "grad_norm": 0.4485796093940735, + "learning_rate": 0.00019930132326921541, + "loss": 1.0834, + "step": 865 + }, + { + "epoch": 0.1542022792022792, + "grad_norm": 0.3882720172405243, + "learning_rate": 0.0001992996705520176, + "loss": 1.1086, + "step": 866 + }, + { + "epoch": 0.1543803418803419, + "grad_norm": 0.44698455929756165, + "learning_rate": 0.00019929801588924902, + "loss": 1.1437, + "step": 867 + }, + { + "epoch": 0.15455840455840456, + "grad_norm": 0.46978411078453064, + "learning_rate": 0.00019929635928094208, + "loss": 1.091, + "step": 868 + }, + { + "epoch": 0.15473646723646722, + "grad_norm": 0.4717854857444763, + "learning_rate": 0.00019929470072712927, + "loss": 1.1959, + "step": 869 + }, + { + "epoch": 0.15491452991452992, + "grad_norm": 0.4324854016304016, + "learning_rate": 0.00019929304022784305, + "loss": 1.2062, + "step": 870 + }, + { + "epoch": 0.1550925925925926, + "grad_norm": 0.3948180675506592, + "learning_rate": 0.00019929137778311597, + "loss": 1.1101, + "step": 871 + }, + { + "epoch": 0.15527065527065528, + "grad_norm": 
0.40345287322998047, + "learning_rate": 0.0001992897133929806, + "loss": 0.8894, + "step": 872 + }, + { + "epoch": 0.15544871794871795, + "grad_norm": 0.44931963086128235, + "learning_rate": 0.00019928804705746957, + "loss": 0.9389, + "step": 873 + }, + { + "epoch": 0.15562678062678062, + "grad_norm": 0.529196560382843, + "learning_rate": 0.0001992863787766155, + "loss": 1.3362, + "step": 874 + }, + { + "epoch": 0.15580484330484332, + "grad_norm": 0.41218671202659607, + "learning_rate": 0.0001992847085504511, + "loss": 1.0727, + "step": 875 + }, + { + "epoch": 0.15598290598290598, + "grad_norm": 0.44074541330337524, + "learning_rate": 0.00019928303637900907, + "loss": 1.1091, + "step": 876 + }, + { + "epoch": 0.15616096866096865, + "grad_norm": 0.5264310240745544, + "learning_rate": 0.00019928136226232218, + "loss": 1.201, + "step": 877 + }, + { + "epoch": 0.15633903133903135, + "grad_norm": 0.4255099594593048, + "learning_rate": 0.00019927968620042324, + "loss": 1.2514, + "step": 878 + }, + { + "epoch": 0.15651709401709402, + "grad_norm": 0.4030280113220215, + "learning_rate": 0.0001992780081933451, + "loss": 1.0422, + "step": 879 + }, + { + "epoch": 0.15669515669515668, + "grad_norm": 0.5270203948020935, + "learning_rate": 0.00019927632824112058, + "loss": 1.2476, + "step": 880 + }, + { + "epoch": 0.15687321937321938, + "grad_norm": 0.37767237424850464, + "learning_rate": 0.00019927464634378268, + "loss": 1.0768, + "step": 881 + }, + { + "epoch": 0.15705128205128205, + "grad_norm": 0.4535936415195465, + "learning_rate": 0.0001992729625013643, + "loss": 1.2097, + "step": 882 + }, + { + "epoch": 0.15722934472934472, + "grad_norm": 0.4282119870185852, + "learning_rate": 0.00019927127671389843, + "loss": 1.0904, + "step": 883 + }, + { + "epoch": 0.1574074074074074, + "grad_norm": 0.3924157917499542, + "learning_rate": 0.0001992695889814181, + "loss": 0.9692, + "step": 884 + }, + { + "epoch": 0.15758547008547008, + "grad_norm": 0.525075376033783, + "learning_rate": 
0.0001992678993039564, + "loss": 1.0292, + "step": 885 + }, + { + "epoch": 0.15776353276353278, + "grad_norm": 0.4388505518436432, + "learning_rate": 0.00019926620768154644, + "loss": 1.1944, + "step": 886 + }, + { + "epoch": 0.15794159544159544, + "grad_norm": 0.4362235963344574, + "learning_rate": 0.00019926451411422132, + "loss": 0.97, + "step": 887 + }, + { + "epoch": 0.1581196581196581, + "grad_norm": 0.4265296459197998, + "learning_rate": 0.0001992628186020143, + "loss": 0.9196, + "step": 888 + }, + { + "epoch": 0.1582977207977208, + "grad_norm": 0.4019876718521118, + "learning_rate": 0.0001992611211449585, + "loss": 1.1368, + "step": 889 + }, + { + "epoch": 0.15847578347578348, + "grad_norm": 0.5003397464752197, + "learning_rate": 0.00019925942174308726, + "loss": 1.2582, + "step": 890 + }, + { + "epoch": 0.15865384615384615, + "grad_norm": 0.4774404466152191, + "learning_rate": 0.00019925772039643382, + "loss": 1.2277, + "step": 891 + }, + { + "epoch": 0.15883190883190884, + "grad_norm": 0.4590449333190918, + "learning_rate": 0.00019925601710503153, + "loss": 1.1679, + "step": 892 + }, + { + "epoch": 0.1590099715099715, + "grad_norm": 0.4221442639827728, + "learning_rate": 0.0001992543118689138, + "loss": 1.1626, + "step": 893 + }, + { + "epoch": 0.15918803418803418, + "grad_norm": 0.47613003849983215, + "learning_rate": 0.00019925260468811403, + "loss": 1.1509, + "step": 894 + }, + { + "epoch": 0.15936609686609687, + "grad_norm": 0.41706812381744385, + "learning_rate": 0.0001992508955626656, + "loss": 1.0366, + "step": 895 + }, + { + "epoch": 0.15954415954415954, + "grad_norm": 0.5064654350280762, + "learning_rate": 0.00019924918449260205, + "loss": 1.0729, + "step": 896 + }, + { + "epoch": 0.1597222222222222, + "grad_norm": 0.5019610524177551, + "learning_rate": 0.00019924747147795696, + "loss": 1.0642, + "step": 897 + }, + { + "epoch": 0.1599002849002849, + "grad_norm": 0.4345671534538269, + "learning_rate": 0.00019924575651876378, + "loss": 1.1747, + 
"step": 898 + }, + { + "epoch": 0.16007834757834757, + "grad_norm": 0.4397568702697754, + "learning_rate": 0.0001992440396150562, + "loss": 1.282, + "step": 899 + }, + { + "epoch": 0.16025641025641027, + "grad_norm": 0.520187497138977, + "learning_rate": 0.0001992423207668678, + "loss": 0.976, + "step": 900 + }, + { + "epoch": 0.16043447293447294, + "grad_norm": 0.39329993724823, + "learning_rate": 0.0001992405999742323, + "loss": 0.9829, + "step": 901 + }, + { + "epoch": 0.1606125356125356, + "grad_norm": 0.42361345887184143, + "learning_rate": 0.00019923887723718339, + "loss": 1.139, + "step": 902 + }, + { + "epoch": 0.1607905982905983, + "grad_norm": 0.3846314251422882, + "learning_rate": 0.00019923715255575482, + "loss": 0.8262, + "step": 903 + }, + { + "epoch": 0.16096866096866097, + "grad_norm": 0.39258381724357605, + "learning_rate": 0.0001992354259299804, + "loss": 0.9638, + "step": 904 + }, + { + "epoch": 0.16114672364672364, + "grad_norm": 0.4000850319862366, + "learning_rate": 0.00019923369735989397, + "loss": 0.91, + "step": 905 + }, + { + "epoch": 0.16132478632478633, + "grad_norm": 0.46303513646125793, + "learning_rate": 0.00019923196684552936, + "loss": 1.1447, + "step": 906 + }, + { + "epoch": 0.161502849002849, + "grad_norm": 0.38437438011169434, + "learning_rate": 0.0001992302343869205, + "loss": 1.0212, + "step": 907 + }, + { + "epoch": 0.16168091168091167, + "grad_norm": 0.44585472345352173, + "learning_rate": 0.00019922849998410135, + "loss": 1.1964, + "step": 908 + }, + { + "epoch": 0.16185897435897437, + "grad_norm": 0.41959813237190247, + "learning_rate": 0.00019922676363710583, + "loss": 0.9925, + "step": 909 + }, + { + "epoch": 0.16203703703703703, + "grad_norm": 0.47442761063575745, + "learning_rate": 0.00019922502534596803, + "loss": 0.9237, + "step": 910 + }, + { + "epoch": 0.1622150997150997, + "grad_norm": 0.5065128207206726, + "learning_rate": 0.00019922328511072198, + "loss": 1.2573, + "step": 911 + }, + { + "epoch": 
0.1623931623931624, + "grad_norm": 0.4739879369735718, + "learning_rate": 0.0001992215429314018, + "loss": 1.4416, + "step": 912 + }, + { + "epoch": 0.16257122507122507, + "grad_norm": 0.48763832449913025, + "learning_rate": 0.00019921979880804157, + "loss": 1.0408, + "step": 913 + }, + { + "epoch": 0.16274928774928774, + "grad_norm": 0.4841614067554474, + "learning_rate": 0.0001992180527406755, + "loss": 1.1826, + "step": 914 + }, + { + "epoch": 0.16292735042735043, + "grad_norm": 0.49433308839797974, + "learning_rate": 0.0001992163047293378, + "loss": 1.3552, + "step": 915 + }, + { + "epoch": 0.1631054131054131, + "grad_norm": 0.4985002875328064, + "learning_rate": 0.0001992145547740627, + "loss": 1.2639, + "step": 916 + }, + { + "epoch": 0.1632834757834758, + "grad_norm": 0.40348032116889954, + "learning_rate": 0.00019921280287488448, + "loss": 1.1731, + "step": 917 + }, + { + "epoch": 0.16346153846153846, + "grad_norm": 0.5166002511978149, + "learning_rate": 0.0001992110490318375, + "loss": 1.0692, + "step": 918 + }, + { + "epoch": 0.16363960113960113, + "grad_norm": 0.44233468174934387, + "learning_rate": 0.00019920929324495615, + "loss": 1.0488, + "step": 919 + }, + { + "epoch": 0.16381766381766383, + "grad_norm": 0.43709903955459595, + "learning_rate": 0.00019920753551427476, + "loss": 0.8884, + "step": 920 + }, + { + "epoch": 0.1639957264957265, + "grad_norm": 0.4054167568683624, + "learning_rate": 0.00019920577583982778, + "loss": 0.9872, + "step": 921 + }, + { + "epoch": 0.16417378917378916, + "grad_norm": 0.4657362997531891, + "learning_rate": 0.0001992040142216497, + "loss": 1.4402, + "step": 922 + }, + { + "epoch": 0.16435185185185186, + "grad_norm": 0.42550426721572876, + "learning_rate": 0.0001992022506597751, + "loss": 1.0456, + "step": 923 + }, + { + "epoch": 0.16452991452991453, + "grad_norm": 0.49346762895584106, + "learning_rate": 0.00019920048515423842, + "loss": 1.527, + "step": 924 + }, + { + "epoch": 0.1647079772079772, + "grad_norm": 
0.3970337510108948, + "learning_rate": 0.0001991987177050743, + "loss": 1.0363, + "step": 925 + }, + { + "epoch": 0.1648860398860399, + "grad_norm": 0.4027378559112549, + "learning_rate": 0.0001991969483123174, + "loss": 0.8416, + "step": 926 + }, + { + "epoch": 0.16506410256410256, + "grad_norm": 0.4181644916534424, + "learning_rate": 0.00019919517697600237, + "loss": 1.2253, + "step": 927 + }, + { + "epoch": 0.16524216524216523, + "grad_norm": 0.43686383962631226, + "learning_rate": 0.0001991934036961639, + "loss": 1.0808, + "step": 928 + }, + { + "epoch": 0.16542022792022792, + "grad_norm": 0.4242876172065735, + "learning_rate": 0.0001991916284728367, + "loss": 0.9483, + "step": 929 + }, + { + "epoch": 0.1655982905982906, + "grad_norm": 0.3690609037876129, + "learning_rate": 0.00019918985130605563, + "loss": 0.9495, + "step": 930 + }, + { + "epoch": 0.1657763532763533, + "grad_norm": 0.42184555530548096, + "learning_rate": 0.00019918807219585546, + "loss": 1.0966, + "step": 931 + }, + { + "epoch": 0.16595441595441596, + "grad_norm": 0.4342746138572693, + "learning_rate": 0.00019918629114227106, + "loss": 1.0875, + "step": 932 + }, + { + "epoch": 0.16613247863247863, + "grad_norm": 0.4191494286060333, + "learning_rate": 0.00019918450814533737, + "loss": 1.0777, + "step": 933 + }, + { + "epoch": 0.16631054131054132, + "grad_norm": 0.37124550342559814, + "learning_rate": 0.00019918272320508922, + "loss": 1.0131, + "step": 934 + }, + { + "epoch": 0.166488603988604, + "grad_norm": 0.4475722014904022, + "learning_rate": 0.00019918093632156168, + "loss": 1.1185, + "step": 935 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 0.4629058241844177, + "learning_rate": 0.0001991791474947897, + "loss": 1.0353, + "step": 936 + }, + { + "epoch": 0.16684472934472935, + "grad_norm": 0.48192909359931946, + "learning_rate": 0.00019917735672480834, + "loss": 1.1628, + "step": 937 + }, + { + "epoch": 0.16702279202279202, + "grad_norm": 0.5542252063751221, + "learning_rate": 
0.00019917556401165273, + "loss": 1.3133, + "step": 938 + }, + { + "epoch": 0.1672008547008547, + "grad_norm": 0.4172651171684265, + "learning_rate": 0.00019917376935535796, + "loss": 1.1733, + "step": 939 + }, + { + "epoch": 0.16737891737891739, + "grad_norm": 0.4424920380115509, + "learning_rate": 0.0001991719727559592, + "loss": 1.0262, + "step": 940 + }, + { + "epoch": 0.16755698005698005, + "grad_norm": 0.4551742970943451, + "learning_rate": 0.00019917017421349162, + "loss": 1.0883, + "step": 941 + }, + { + "epoch": 0.16773504273504272, + "grad_norm": 0.45929640531539917, + "learning_rate": 0.00019916837372799048, + "loss": 1.1836, + "step": 942 + }, + { + "epoch": 0.16791310541310542, + "grad_norm": 0.4609353542327881, + "learning_rate": 0.0001991665712994911, + "loss": 1.0682, + "step": 943 + }, + { + "epoch": 0.16809116809116809, + "grad_norm": 0.42617303133010864, + "learning_rate": 0.00019916476692802873, + "loss": 1.074, + "step": 944 + }, + { + "epoch": 0.16826923076923078, + "grad_norm": 0.41919493675231934, + "learning_rate": 0.00019916296061363875, + "loss": 1.0969, + "step": 945 + }, + { + "epoch": 0.16844729344729345, + "grad_norm": 0.450979083776474, + "learning_rate": 0.00019916115235635656, + "loss": 1.1686, + "step": 946 + }, + { + "epoch": 0.16862535612535612, + "grad_norm": 0.42166751623153687, + "learning_rate": 0.00019915934215621758, + "loss": 0.9273, + "step": 947 + }, + { + "epoch": 0.16880341880341881, + "grad_norm": 0.4404160976409912, + "learning_rate": 0.00019915753001325729, + "loss": 1.1663, + "step": 948 + }, + { + "epoch": 0.16898148148148148, + "grad_norm": 0.42025226354599, + "learning_rate": 0.0001991557159275111, + "loss": 0.9433, + "step": 949 + }, + { + "epoch": 0.16915954415954415, + "grad_norm": 0.4277796745300293, + "learning_rate": 0.00019915389989901474, + "loss": 0.8475, + "step": 950 + }, + { + "epoch": 0.16933760683760685, + "grad_norm": 0.5162755250930786, + "learning_rate": 0.00019915208192780365, + "loss": 
1.1155, + "step": 951 + }, + { + "epoch": 0.16951566951566951, + "grad_norm": 0.4214856028556824, + "learning_rate": 0.00019915026201391346, + "loss": 1.173, + "step": 952 + }, + { + "epoch": 0.16969373219373218, + "grad_norm": 0.4713292419910431, + "learning_rate": 0.00019914844015737985, + "loss": 1.1615, + "step": 953 + }, + { + "epoch": 0.16987179487179488, + "grad_norm": 0.461179256439209, + "learning_rate": 0.00019914661635823854, + "loss": 1.1169, + "step": 954 + }, + { + "epoch": 0.17004985754985755, + "grad_norm": 0.46200552582740784, + "learning_rate": 0.00019914479061652527, + "loss": 1.0274, + "step": 955 + }, + { + "epoch": 0.17022792022792022, + "grad_norm": 0.40968334674835205, + "learning_rate": 0.00019914296293227572, + "loss": 1.066, + "step": 956 + }, + { + "epoch": 0.1704059829059829, + "grad_norm": 0.40877434611320496, + "learning_rate": 0.0001991411333055258, + "loss": 1.1595, + "step": 957 + }, + { + "epoch": 0.17058404558404558, + "grad_norm": 0.42940187454223633, + "learning_rate": 0.00019913930173631132, + "loss": 1.0364, + "step": 958 + }, + { + "epoch": 0.17076210826210828, + "grad_norm": 0.49648910760879517, + "learning_rate": 0.00019913746822466819, + "loss": 1.0763, + "step": 959 + }, + { + "epoch": 0.17094017094017094, + "grad_norm": 0.4353426396846771, + "learning_rate": 0.00019913563277063228, + "loss": 0.9698, + "step": 960 + }, + { + "epoch": 0.1711182336182336, + "grad_norm": 0.45079681277275085, + "learning_rate": 0.00019913379537423958, + "loss": 1.2244, + "step": 961 + }, + { + "epoch": 0.1712962962962963, + "grad_norm": 0.4276828467845917, + "learning_rate": 0.00019913195603552607, + "loss": 0.9976, + "step": 962 + }, + { + "epoch": 0.17147435897435898, + "grad_norm": 0.41122403740882874, + "learning_rate": 0.00019913011475452785, + "loss": 1.0077, + "step": 963 + }, + { + "epoch": 0.17165242165242164, + "grad_norm": 0.43170276284217834, + "learning_rate": 0.00019912827153128096, + "loss": 1.1402, + "step": 964 + }, + { + 
"epoch": 0.17183048433048434, + "grad_norm": 0.37950268387794495, + "learning_rate": 0.0001991264263658215, + "loss": 0.9818, + "step": 965 + }, + { + "epoch": 0.172008547008547, + "grad_norm": 0.477333128452301, + "learning_rate": 0.00019912457925818562, + "loss": 1.1756, + "step": 966 + }, + { + "epoch": 0.17218660968660968, + "grad_norm": 0.4326401352882385, + "learning_rate": 0.00019912273020840954, + "loss": 1.3718, + "step": 967 + }, + { + "epoch": 0.17236467236467237, + "grad_norm": 0.37711042165756226, + "learning_rate": 0.00019912087921652945, + "loss": 0.9011, + "step": 968 + }, + { + "epoch": 0.17254273504273504, + "grad_norm": 0.50013667345047, + "learning_rate": 0.00019911902628258162, + "loss": 1.1163, + "step": 969 + }, + { + "epoch": 0.1727207977207977, + "grad_norm": 0.41913339495658875, + "learning_rate": 0.0001991171714066024, + "loss": 1.2614, + "step": 970 + }, + { + "epoch": 0.1728988603988604, + "grad_norm": 0.4075855612754822, + "learning_rate": 0.00019911531458862813, + "loss": 0.8984, + "step": 971 + }, + { + "epoch": 0.17307692307692307, + "grad_norm": 0.40277954936027527, + "learning_rate": 0.00019911345582869513, + "loss": 1.0851, + "step": 972 + }, + { + "epoch": 0.17325498575498577, + "grad_norm": 0.4312847852706909, + "learning_rate": 0.00019911159512683987, + "loss": 1.1273, + "step": 973 + }, + { + "epoch": 0.17343304843304844, + "grad_norm": 0.40303611755371094, + "learning_rate": 0.0001991097324830988, + "loss": 0.9645, + "step": 974 + }, + { + "epoch": 0.1736111111111111, + "grad_norm": 0.45560577511787415, + "learning_rate": 0.00019910786789750838, + "loss": 1.0864, + "step": 975 + }, + { + "epoch": 0.1737891737891738, + "grad_norm": 0.43775680661201477, + "learning_rate": 0.00019910600137010517, + "loss": 1.028, + "step": 976 + }, + { + "epoch": 0.17396723646723647, + "grad_norm": 0.3917224407196045, + "learning_rate": 0.00019910413290092572, + "loss": 1.0491, + "step": 977 + }, + { + "epoch": 0.17414529914529914, + 
"grad_norm": 0.4068751037120819, + "learning_rate": 0.0001991022624900067, + "loss": 1.0476, + "step": 978 + }, + { + "epoch": 0.17432336182336183, + "grad_norm": 0.4463370144367218, + "learning_rate": 0.0001991003901373847, + "loss": 1.0612, + "step": 979 + }, + { + "epoch": 0.1745014245014245, + "grad_norm": 0.46949052810668945, + "learning_rate": 0.0001990985158430964, + "loss": 1.3099, + "step": 980 + }, + { + "epoch": 0.17467948717948717, + "grad_norm": 0.4250012934207916, + "learning_rate": 0.00019909663960717856, + "loss": 0.9903, + "step": 981 + }, + { + "epoch": 0.17485754985754987, + "grad_norm": 0.5293903946876526, + "learning_rate": 0.0001990947614296679, + "loss": 0.9908, + "step": 982 + }, + { + "epoch": 0.17503561253561253, + "grad_norm": 0.3838284909725189, + "learning_rate": 0.0001990928813106013, + "loss": 0.716, + "step": 983 + }, + { + "epoch": 0.1752136752136752, + "grad_norm": 0.4597751200199127, + "learning_rate": 0.0001990909992500155, + "loss": 1.0126, + "step": 984 + }, + { + "epoch": 0.1753917378917379, + "grad_norm": 0.4844081699848175, + "learning_rate": 0.0001990891152479474, + "loss": 1.1043, + "step": 985 + }, + { + "epoch": 0.17556980056980057, + "grad_norm": 0.4763399660587311, + "learning_rate": 0.00019908722930443392, + "loss": 1.019, + "step": 986 + }, + { + "epoch": 0.17574786324786323, + "grad_norm": 0.4670077860355377, + "learning_rate": 0.00019908534141951204, + "loss": 1.1382, + "step": 987 + }, + { + "epoch": 0.17592592592592593, + "grad_norm": 0.39372730255126953, + "learning_rate": 0.00019908345159321873, + "loss": 1.1219, + "step": 988 + }, + { + "epoch": 0.1761039886039886, + "grad_norm": 0.41869843006134033, + "learning_rate": 0.00019908155982559098, + "loss": 0.9461, + "step": 989 + }, + { + "epoch": 0.1762820512820513, + "grad_norm": 0.4398406147956848, + "learning_rate": 0.00019907966611666593, + "loss": 1.1328, + "step": 990 + }, + { + "epoch": 0.17646011396011396, + "grad_norm": 0.4315733015537262, + 
"learning_rate": 0.0001990777704664806, + "loss": 1.0974, + "step": 991 + }, + { + "epoch": 0.17663817663817663, + "grad_norm": 0.42859575152397156, + "learning_rate": 0.00019907587287507222, + "loss": 1.2637, + "step": 992 + }, + { + "epoch": 0.17681623931623933, + "grad_norm": 0.47928622364997864, + "learning_rate": 0.0001990739733424779, + "loss": 1.0699, + "step": 993 + }, + { + "epoch": 0.176994301994302, + "grad_norm": 0.4443826973438263, + "learning_rate": 0.00019907207186873488, + "loss": 1.0547, + "step": 994 + }, + { + "epoch": 0.17717236467236466, + "grad_norm": 0.4108099937438965, + "learning_rate": 0.00019907016845388043, + "loss": 1.1401, + "step": 995 + }, + { + "epoch": 0.17735042735042736, + "grad_norm": 0.4474675953388214, + "learning_rate": 0.00019906826309795182, + "loss": 1.0712, + "step": 996 + }, + { + "epoch": 0.17752849002849003, + "grad_norm": 0.4149756133556366, + "learning_rate": 0.00019906635580098638, + "loss": 0.9585, + "step": 997 + }, + { + "epoch": 0.1777065527065527, + "grad_norm": 0.4875968098640442, + "learning_rate": 0.00019906444656302152, + "loss": 1.0659, + "step": 998 + }, + { + "epoch": 0.1778846153846154, + "grad_norm": 0.5494784116744995, + "learning_rate": 0.0001990625353840946, + "loss": 1.2858, + "step": 999 + }, + { + "epoch": 0.17806267806267806, + "grad_norm": 0.425062358379364, + "learning_rate": 0.0001990606222642431, + "loss": 1.1826, + "step": 1000 + }, + { + "epoch": 0.17824074074074073, + "grad_norm": 0.3890725374221802, + "learning_rate": 0.00019905870720350445, + "loss": 0.9568, + "step": 1001 + }, + { + "epoch": 0.17841880341880342, + "grad_norm": 0.3884070813655853, + "learning_rate": 0.00019905679020191624, + "loss": 0.9674, + "step": 1002 + }, + { + "epoch": 0.1785968660968661, + "grad_norm": 0.49496129155158997, + "learning_rate": 0.00019905487125951597, + "loss": 0.9143, + "step": 1003 + }, + { + "epoch": 0.1787749287749288, + "grad_norm": 0.43448135256767273, + "learning_rate": 
0.00019905295037634128, + "loss": 1.2677, + "step": 1004 + }, + { + "epoch": 0.17895299145299146, + "grad_norm": 0.47327905893325806, + "learning_rate": 0.00019905102755242982, + "loss": 0.9089, + "step": 1005 + }, + { + "epoch": 0.17913105413105412, + "grad_norm": 0.4962378442287445, + "learning_rate": 0.00019904910278781922, + "loss": 1.1748, + "step": 1006 + }, + { + "epoch": 0.17930911680911682, + "grad_norm": 0.4343934655189514, + "learning_rate": 0.0001990471760825472, + "loss": 1.2176, + "step": 1007 + }, + { + "epoch": 0.1794871794871795, + "grad_norm": 0.4695793092250824, + "learning_rate": 0.0001990452474366515, + "loss": 1.1822, + "step": 1008 + }, + { + "epoch": 0.17966524216524216, + "grad_norm": 0.4156060516834259, + "learning_rate": 0.00019904331685016995, + "loss": 0.8231, + "step": 1009 + }, + { + "epoch": 0.17984330484330485, + "grad_norm": 0.5068191885948181, + "learning_rate": 0.00019904138432314035, + "loss": 1.1363, + "step": 1010 + }, + { + "epoch": 0.18002136752136752, + "grad_norm": 0.5189786553382874, + "learning_rate": 0.00019903944985560058, + "loss": 1.3131, + "step": 1011 + }, + { + "epoch": 0.1801994301994302, + "grad_norm": 0.5126828551292419, + "learning_rate": 0.00019903751344758848, + "loss": 1.0305, + "step": 1012 + }, + { + "epoch": 0.18037749287749288, + "grad_norm": 0.41045933961868286, + "learning_rate": 0.00019903557509914205, + "loss": 1.2726, + "step": 1013 + }, + { + "epoch": 0.18055555555555555, + "grad_norm": 0.4141713082790375, + "learning_rate": 0.0001990336348102993, + "loss": 0.9606, + "step": 1014 + }, + { + "epoch": 0.18073361823361822, + "grad_norm": 0.42652079463005066, + "learning_rate": 0.00019903169258109812, + "loss": 1.0235, + "step": 1015 + }, + { + "epoch": 0.18091168091168092, + "grad_norm": 0.42098379135131836, + "learning_rate": 0.0001990297484115767, + "loss": 1.0602, + "step": 1016 + }, + { + "epoch": 0.18108974358974358, + "grad_norm": 0.49920013546943665, + "learning_rate": 0.0001990278023017731, + 
"loss": 1.3322, + "step": 1017 + }, + { + "epoch": 0.18126780626780628, + "grad_norm": 0.412304550409317, + "learning_rate": 0.00019902585425172537, + "loss": 1.1011, + "step": 1018 + }, + { + "epoch": 0.18144586894586895, + "grad_norm": 0.44226935505867004, + "learning_rate": 0.00019902390426147177, + "loss": 0.9777, + "step": 1019 + }, + { + "epoch": 0.18162393162393162, + "grad_norm": 0.4685269594192505, + "learning_rate": 0.00019902195233105046, + "loss": 1.3587, + "step": 1020 + }, + { + "epoch": 0.1818019943019943, + "grad_norm": 0.4500584304332733, + "learning_rate": 0.00019901999846049968, + "loss": 0.9888, + "step": 1021 + }, + { + "epoch": 0.18198005698005698, + "grad_norm": 0.48566994071006775, + "learning_rate": 0.00019901804264985774, + "loss": 1.2364, + "step": 1022 + }, + { + "epoch": 0.18215811965811965, + "grad_norm": 0.4063156247138977, + "learning_rate": 0.00019901608489916294, + "loss": 1.2224, + "step": 1023 + }, + { + "epoch": 0.18233618233618235, + "grad_norm": 0.471276193857193, + "learning_rate": 0.00019901412520845367, + "loss": 0.9926, + "step": 1024 + }, + { + "epoch": 0.182514245014245, + "grad_norm": 0.5165421366691589, + "learning_rate": 0.00019901216357776829, + "loss": 0.9595, + "step": 1025 + }, + { + "epoch": 0.18269230769230768, + "grad_norm": 0.4746754467487335, + "learning_rate": 0.0001990102000071452, + "loss": 1.2057, + "step": 1026 + }, + { + "epoch": 0.18287037037037038, + "grad_norm": 0.44803035259246826, + "learning_rate": 0.00019900823449662297, + "loss": 1.2114, + "step": 1027 + }, + { + "epoch": 0.18304843304843305, + "grad_norm": 0.47256240248680115, + "learning_rate": 0.00019900626704624005, + "loss": 1.112, + "step": 1028 + }, + { + "epoch": 0.18322649572649571, + "grad_norm": 0.4253387153148651, + "learning_rate": 0.000199004297656035, + "loss": 0.9899, + "step": 1029 + }, + { + "epoch": 0.1834045584045584, + "grad_norm": 0.44958099722862244, + "learning_rate": 0.00019900232632604636, + "loss": 1.1445, + "step": 
1030 + }, + { + "epoch": 0.18358262108262108, + "grad_norm": 0.5296537280082703, + "learning_rate": 0.00019900035305631285, + "loss": 1.2502, + "step": 1031 + }, + { + "epoch": 0.18376068376068377, + "grad_norm": 0.5057148933410645, + "learning_rate": 0.00019899837784687302, + "loss": 1.1426, + "step": 1032 + }, + { + "epoch": 0.18393874643874644, + "grad_norm": 0.41463762521743774, + "learning_rate": 0.00019899640069776566, + "loss": 1.1854, + "step": 1033 + }, + { + "epoch": 0.1841168091168091, + "grad_norm": 0.45800045132637024, + "learning_rate": 0.00019899442160902945, + "loss": 1.2438, + "step": 1034 + }, + { + "epoch": 0.1842948717948718, + "grad_norm": 0.43450453877449036, + "learning_rate": 0.00019899244058070324, + "loss": 1.0598, + "step": 1035 + }, + { + "epoch": 0.18447293447293447, + "grad_norm": 0.4141148626804352, + "learning_rate": 0.00019899045761282577, + "loss": 1.0465, + "step": 1036 + }, + { + "epoch": 0.18465099715099714, + "grad_norm": 0.3938458263874054, + "learning_rate": 0.0001989884727054359, + "loss": 1.0142, + "step": 1037 + }, + { + "epoch": 0.18482905982905984, + "grad_norm": 0.43898263573646545, + "learning_rate": 0.00019898648585857257, + "loss": 0.9212, + "step": 1038 + }, + { + "epoch": 0.1850071225071225, + "grad_norm": 0.4425487816333771, + "learning_rate": 0.00019898449707227465, + "loss": 1.2987, + "step": 1039 + }, + { + "epoch": 0.18518518518518517, + "grad_norm": 0.4537975490093231, + "learning_rate": 0.00019898250634658115, + "loss": 1.2023, + "step": 1040 + }, + { + "epoch": 0.18536324786324787, + "grad_norm": 0.4107198119163513, + "learning_rate": 0.00019898051368153104, + "loss": 0.8443, + "step": 1041 + }, + { + "epoch": 0.18554131054131054, + "grad_norm": 0.4389404058456421, + "learning_rate": 0.0001989785190771634, + "loss": 1.0502, + "step": 1042 + }, + { + "epoch": 0.1857193732193732, + "grad_norm": 0.4288824796676636, + "learning_rate": 0.00019897652253351726, + "loss": 1.01, + "step": 1043 + }, + { + "epoch": 
0.1858974358974359, + "grad_norm": 0.50815349817276, + "learning_rate": 0.00019897452405063178, + "loss": 1.0308, + "step": 1044 + }, + { + "epoch": 0.18607549857549857, + "grad_norm": 0.45252710580825806, + "learning_rate": 0.0001989725236285461, + "loss": 1.0967, + "step": 1045 + }, + { + "epoch": 0.18625356125356127, + "grad_norm": 0.45049402117729187, + "learning_rate": 0.00019897052126729943, + "loss": 1.0141, + "step": 1046 + }, + { + "epoch": 0.18643162393162394, + "grad_norm": 0.49637508392333984, + "learning_rate": 0.00019896851696693098, + "loss": 1.0997, + "step": 1047 + }, + { + "epoch": 0.1866096866096866, + "grad_norm": 0.4465886056423187, + "learning_rate": 0.00019896651072748005, + "loss": 1.1415, + "step": 1048 + }, + { + "epoch": 0.1867877492877493, + "grad_norm": 0.5309500694274902, + "learning_rate": 0.00019896450254898592, + "loss": 1.1028, + "step": 1049 + }, + { + "epoch": 0.18696581196581197, + "grad_norm": 0.3516653776168823, + "learning_rate": 0.00019896249243148793, + "loss": 0.9841, + "step": 1050 + }, + { + "epoch": 0.18714387464387464, + "grad_norm": 0.4529176950454712, + "learning_rate": 0.0001989604803750255, + "loss": 1.1335, + "step": 1051 + }, + { + "epoch": 0.18732193732193733, + "grad_norm": 0.47694942355155945, + "learning_rate": 0.000198958466379638, + "loss": 1.2383, + "step": 1052 + }, + { + "epoch": 0.1875, + "grad_norm": 0.5524206757545471, + "learning_rate": 0.0001989564504453649, + "loss": 1.3668, + "step": 1053 + }, + { + "epoch": 0.18767806267806267, + "grad_norm": 0.39203691482543945, + "learning_rate": 0.00019895443257224576, + "loss": 1.2203, + "step": 1054 + }, + { + "epoch": 0.18785612535612536, + "grad_norm": 0.4164120852947235, + "learning_rate": 0.00019895241276032005, + "loss": 0.8954, + "step": 1055 + }, + { + "epoch": 0.18803418803418803, + "grad_norm": 0.41217970848083496, + "learning_rate": 0.0001989503910096274, + "loss": 1.0238, + "step": 1056 + }, + { + "epoch": 0.1882122507122507, + "grad_norm": 
0.44038307666778564, + "learning_rate": 0.00019894836732020735, + "loss": 0.8159, + "step": 1057 + }, + { + "epoch": 0.1883903133903134, + "grad_norm": 0.45780670642852783, + "learning_rate": 0.0001989463416920996, + "loss": 1.2864, + "step": 1058 + }, + { + "epoch": 0.18856837606837606, + "grad_norm": 0.5197559595108032, + "learning_rate": 0.00019894431412534384, + "loss": 1.0756, + "step": 1059 + }, + { + "epoch": 0.18874643874643873, + "grad_norm": 0.43283385038375854, + "learning_rate": 0.00019894228461997979, + "loss": 1.0642, + "step": 1060 + }, + { + "epoch": 0.18892450142450143, + "grad_norm": 0.4657376706600189, + "learning_rate": 0.00019894025317604717, + "loss": 1.1159, + "step": 1061 + }, + { + "epoch": 0.1891025641025641, + "grad_norm": 0.4474908113479614, + "learning_rate": 0.00019893821979358588, + "loss": 1.2006, + "step": 1062 + }, + { + "epoch": 0.1892806267806268, + "grad_norm": 0.43878164887428284, + "learning_rate": 0.00019893618447263566, + "loss": 1.1599, + "step": 1063 + }, + { + "epoch": 0.18945868945868946, + "grad_norm": 0.4598735272884369, + "learning_rate": 0.00019893414721323645, + "loss": 1.3346, + "step": 1064 + }, + { + "epoch": 0.18963675213675213, + "grad_norm": 0.3947420120239258, + "learning_rate": 0.00019893210801542812, + "loss": 1.1201, + "step": 1065 + }, + { + "epoch": 0.18981481481481483, + "grad_norm": 0.3401558995246887, + "learning_rate": 0.00019893006687925064, + "loss": 0.7568, + "step": 1066 + }, + { + "epoch": 0.1899928774928775, + "grad_norm": 0.4400341808795929, + "learning_rate": 0.00019892802380474405, + "loss": 1.1706, + "step": 1067 + }, + { + "epoch": 0.19017094017094016, + "grad_norm": 0.42394164204597473, + "learning_rate": 0.00019892597879194829, + "loss": 1.0163, + "step": 1068 + }, + { + "epoch": 0.19034900284900286, + "grad_norm": 0.42904096841812134, + "learning_rate": 0.00019892393184090353, + "loss": 0.9193, + "step": 1069 + }, + { + "epoch": 0.19052706552706553, + "grad_norm": 0.497601181268692, + 
"learning_rate": 0.00019892188295164977, + "loss": 1.0377, + "step": 1070 + }, + { + "epoch": 0.1907051282051282, + "grad_norm": 0.4536020755767822, + "learning_rate": 0.00019891983212422723, + "loss": 1.0946, + "step": 1071 + }, + { + "epoch": 0.1908831908831909, + "grad_norm": 0.44916942715644836, + "learning_rate": 0.00019891777935867607, + "loss": 1.0563, + "step": 1072 + }, + { + "epoch": 0.19106125356125356, + "grad_norm": 0.4256889820098877, + "learning_rate": 0.0001989157246550365, + "loss": 1.0988, + "step": 1073 + }, + { + "epoch": 0.19123931623931623, + "grad_norm": 0.5559163689613342, + "learning_rate": 0.0001989136680133488, + "loss": 0.9155, + "step": 1074 + }, + { + "epoch": 0.19141737891737892, + "grad_norm": 0.391804963350296, + "learning_rate": 0.00019891160943365322, + "loss": 0.9314, + "step": 1075 + }, + { + "epoch": 0.1915954415954416, + "grad_norm": 0.4535716474056244, + "learning_rate": 0.00019890954891599015, + "loss": 1.0768, + "step": 1076 + }, + { + "epoch": 0.19177350427350429, + "grad_norm": 0.46770521998405457, + "learning_rate": 0.00019890748646039991, + "loss": 0.8406, + "step": 1077 + }, + { + "epoch": 0.19195156695156695, + "grad_norm": 0.4875394403934479, + "learning_rate": 0.00019890542206692295, + "loss": 1.1055, + "step": 1078 + }, + { + "epoch": 0.19212962962962962, + "grad_norm": 0.5072727203369141, + "learning_rate": 0.0001989033557355997, + "loss": 1.3093, + "step": 1079 + }, + { + "epoch": 0.19230769230769232, + "grad_norm": 0.4419287443161011, + "learning_rate": 0.00019890128746647068, + "loss": 1.1916, + "step": 1080 + }, + { + "epoch": 0.192485754985755, + "grad_norm": 0.45803651213645935, + "learning_rate": 0.00019889921725957637, + "loss": 1.2579, + "step": 1081 + }, + { + "epoch": 0.19266381766381765, + "grad_norm": 0.4832262098789215, + "learning_rate": 0.0001988971451149573, + "loss": 1.3217, + "step": 1082 + }, + { + "epoch": 0.19284188034188035, + "grad_norm": 0.4819786250591278, + "learning_rate": 
0.00019889507103265416, + "loss": 1.0979, + "step": 1083 + }, + { + "epoch": 0.19301994301994302, + "grad_norm": 0.49360713362693787, + "learning_rate": 0.0001988929950127075, + "loss": 1.0987, + "step": 1084 + }, + { + "epoch": 0.1931980056980057, + "grad_norm": 0.44209200143814087, + "learning_rate": 0.00019889091705515806, + "loss": 1.2616, + "step": 1085 + }, + { + "epoch": 0.19337606837606838, + "grad_norm": 0.41626206040382385, + "learning_rate": 0.00019888883716004654, + "loss": 1.0922, + "step": 1086 + }, + { + "epoch": 0.19355413105413105, + "grad_norm": 0.4916635751724243, + "learning_rate": 0.00019888675532741366, + "loss": 0.9331, + "step": 1087 + }, + { + "epoch": 0.19373219373219372, + "grad_norm": 0.4493125379085541, + "learning_rate": 0.00019888467155730025, + "loss": 1.1261, + "step": 1088 + }, + { + "epoch": 0.19391025641025642, + "grad_norm": 0.3755671977996826, + "learning_rate": 0.00019888258584974708, + "loss": 0.9821, + "step": 1089 + }, + { + "epoch": 0.19408831908831908, + "grad_norm": 0.41917556524276733, + "learning_rate": 0.00019888049820479507, + "loss": 1.251, + "step": 1090 + }, + { + "epoch": 0.19426638176638178, + "grad_norm": 0.46184420585632324, + "learning_rate": 0.0001988784086224851, + "loss": 1.1731, + "step": 1091 + }, + { + "epoch": 0.19444444444444445, + "grad_norm": 0.4783691465854645, + "learning_rate": 0.00019887631710285812, + "loss": 1.1635, + "step": 1092 + }, + { + "epoch": 0.19462250712250712, + "grad_norm": 0.4710482060909271, + "learning_rate": 0.00019887422364595512, + "loss": 1.0229, + "step": 1093 + }, + { + "epoch": 0.1948005698005698, + "grad_norm": 0.4738706648349762, + "learning_rate": 0.00019887212825181707, + "loss": 1.128, + "step": 1094 + }, + { + "epoch": 0.19497863247863248, + "grad_norm": 0.45665010809898376, + "learning_rate": 0.00019887003092048508, + "loss": 1.0425, + "step": 1095 + }, + { + "epoch": 0.19515669515669515, + "grad_norm": 0.42740485072135925, + "learning_rate": 0.0001988679316520002, 
+ "loss": 1.0738, + "step": 1096 + }, + { + "epoch": 0.19533475783475784, + "grad_norm": 0.5977092385292053, + "learning_rate": 0.0001988658304464036, + "loss": 1.2687, + "step": 1097 + }, + { + "epoch": 0.1955128205128205, + "grad_norm": 0.4411074221134186, + "learning_rate": 0.0001988637273037364, + "loss": 1.287, + "step": 1098 + }, + { + "epoch": 0.19569088319088318, + "grad_norm": 0.4409518539905548, + "learning_rate": 0.00019886162222403986, + "loss": 1.0515, + "step": 1099 + }, + { + "epoch": 0.19586894586894588, + "grad_norm": 0.4926736652851105, + "learning_rate": 0.0001988595152073552, + "loss": 1.1388, + "step": 1100 + }, + { + "epoch": 0.19604700854700854, + "grad_norm": 0.4607115387916565, + "learning_rate": 0.00019885740625372368, + "loss": 0.9803, + "step": 1101 + }, + { + "epoch": 0.1962250712250712, + "grad_norm": 0.4725342094898224, + "learning_rate": 0.0001988552953631867, + "loss": 1.199, + "step": 1102 + }, + { + "epoch": 0.1964031339031339, + "grad_norm": 0.48014503717422485, + "learning_rate": 0.00019885318253578548, + "loss": 1.1868, + "step": 1103 + }, + { + "epoch": 0.19658119658119658, + "grad_norm": 0.3872644603252411, + "learning_rate": 0.00019885106777156155, + "loss": 0.9182, + "step": 1104 + }, + { + "epoch": 0.19675925925925927, + "grad_norm": 0.4737720787525177, + "learning_rate": 0.00019884895107055627, + "loss": 1.1513, + "step": 1105 + }, + { + "epoch": 0.19693732193732194, + "grad_norm": 0.4144562780857086, + "learning_rate": 0.00019884683243281116, + "loss": 1.1711, + "step": 1106 + }, + { + "epoch": 0.1971153846153846, + "grad_norm": 0.4672079384326935, + "learning_rate": 0.00019884471185836769, + "loss": 1.0386, + "step": 1107 + }, + { + "epoch": 0.1972934472934473, + "grad_norm": 0.4558824598789215, + "learning_rate": 0.0001988425893472674, + "loss": 1.0535, + "step": 1108 + }, + { + "epoch": 0.19747150997150997, + "grad_norm": 0.5149834752082825, + "learning_rate": 0.00019884046489955192, + "loss": 1.0296, + "step": 1109 + 
}, + { + "epoch": 0.19764957264957264, + "grad_norm": 0.43444496393203735, + "learning_rate": 0.00019883833851526287, + "loss": 1.1475, + "step": 1110 + }, + { + "epoch": 0.19782763532763534, + "grad_norm": 0.46062374114990234, + "learning_rate": 0.00019883621019444188, + "loss": 1.183, + "step": 1111 + }, + { + "epoch": 0.198005698005698, + "grad_norm": 0.4893282949924469, + "learning_rate": 0.00019883407993713065, + "loss": 1.3733, + "step": 1112 + }, + { + "epoch": 0.19818376068376067, + "grad_norm": 0.5434843897819519, + "learning_rate": 0.00019883194774337096, + "loss": 1.2505, + "step": 1113 + }, + { + "epoch": 0.19836182336182337, + "grad_norm": 0.4698035418987274, + "learning_rate": 0.00019882981361320456, + "loss": 1.0152, + "step": 1114 + }, + { + "epoch": 0.19853988603988604, + "grad_norm": 0.4582163989543915, + "learning_rate": 0.00019882767754667325, + "loss": 1.1718, + "step": 1115 + }, + { + "epoch": 0.1987179487179487, + "grad_norm": 0.48744696378707886, + "learning_rate": 0.0001988255395438189, + "loss": 1.2923, + "step": 1116 + }, + { + "epoch": 0.1988960113960114, + "grad_norm": 0.4172030985355377, + "learning_rate": 0.0001988233996046834, + "loss": 0.8098, + "step": 1117 + }, + { + "epoch": 0.19907407407407407, + "grad_norm": 0.4556557834148407, + "learning_rate": 0.00019882125772930867, + "loss": 0.9654, + "step": 1118 + }, + { + "epoch": 0.19925213675213677, + "grad_norm": 0.4363219141960144, + "learning_rate": 0.00019881911391773666, + "loss": 1.0333, + "step": 1119 + }, + { + "epoch": 0.19943019943019943, + "grad_norm": 0.4336536228656769, + "learning_rate": 0.0001988169681700094, + "loss": 1.091, + "step": 1120 + }, + { + "epoch": 0.1996082621082621, + "grad_norm": 0.42073166370391846, + "learning_rate": 0.00019881482048616893, + "loss": 0.9687, + "step": 1121 + }, + { + "epoch": 0.1997863247863248, + "grad_norm": 0.4330587685108185, + "learning_rate": 0.00019881267086625733, + "loss": 1.0512, + "step": 1122 + }, + { + "epoch": 
0.19996438746438747, + "grad_norm": 0.4602276682853699, + "learning_rate": 0.0001988105193103167, + "loss": 1.1806, + "step": 1123 + }, + { + "epoch": 0.20014245014245013, + "grad_norm": 0.4271257817745209, + "learning_rate": 0.0001988083658183892, + "loss": 1.1079, + "step": 1124 + }, + { + "epoch": 0.20032051282051283, + "grad_norm": 0.35446426272392273, + "learning_rate": 0.00019880621039051707, + "loss": 0.6769, + "step": 1125 + }, + { + "epoch": 0.2004985754985755, + "grad_norm": 0.413753479719162, + "learning_rate": 0.00019880405302674244, + "loss": 1.1088, + "step": 1126 + }, + { + "epoch": 0.20067663817663817, + "grad_norm": 0.4423675835132599, + "learning_rate": 0.00019880189372710767, + "loss": 1.1371, + "step": 1127 + }, + { + "epoch": 0.20085470085470086, + "grad_norm": 0.41865605115890503, + "learning_rate": 0.00019879973249165502, + "loss": 1.0027, + "step": 1128 + }, + { + "epoch": 0.20103276353276353, + "grad_norm": 0.4109594225883484, + "learning_rate": 0.00019879756932042686, + "loss": 0.8734, + "step": 1129 + }, + { + "epoch": 0.2012108262108262, + "grad_norm": 0.42326363921165466, + "learning_rate": 0.00019879540421346555, + "loss": 0.9722, + "step": 1130 + }, + { + "epoch": 0.2013888888888889, + "grad_norm": 0.4601542055606842, + "learning_rate": 0.00019879323717081354, + "loss": 1.1251, + "step": 1131 + }, + { + "epoch": 0.20156695156695156, + "grad_norm": 0.4704367518424988, + "learning_rate": 0.00019879106819251327, + "loss": 0.9457, + "step": 1132 + }, + { + "epoch": 0.20174501424501423, + "grad_norm": 0.465023934841156, + "learning_rate": 0.00019878889727860724, + "loss": 0.9633, + "step": 1133 + }, + { + "epoch": 0.20192307692307693, + "grad_norm": 0.4572450518608093, + "learning_rate": 0.00019878672442913796, + "loss": 1.1965, + "step": 1134 + }, + { + "epoch": 0.2021011396011396, + "grad_norm": 0.4323410391807556, + "learning_rate": 0.00019878454964414807, + "loss": 1.1296, + "step": 1135 + }, + { + "epoch": 0.2022792022792023, + 
"grad_norm": 0.4513751268386841, + "learning_rate": 0.00019878237292368013, + "loss": 1.0571, + "step": 1136 + }, + { + "epoch": 0.20245726495726496, + "grad_norm": 0.45504096150398254, + "learning_rate": 0.00019878019426777677, + "loss": 1.0316, + "step": 1137 + }, + { + "epoch": 0.20263532763532763, + "grad_norm": 0.45715275406837463, + "learning_rate": 0.0001987780136764807, + "loss": 1.0528, + "step": 1138 + }, + { + "epoch": 0.20281339031339032, + "grad_norm": 0.4934465289115906, + "learning_rate": 0.00019877583114983466, + "loss": 1.3238, + "step": 1139 + }, + { + "epoch": 0.202991452991453, + "grad_norm": 0.4304082989692688, + "learning_rate": 0.0001987736466878814, + "loss": 1.1774, + "step": 1140 + }, + { + "epoch": 0.20316951566951566, + "grad_norm": 0.49721968173980713, + "learning_rate": 0.00019877146029066372, + "loss": 1.1767, + "step": 1141 + }, + { + "epoch": 0.20334757834757836, + "grad_norm": 0.3629468083381653, + "learning_rate": 0.00019876927195822445, + "loss": 0.8588, + "step": 1142 + }, + { + "epoch": 0.20352564102564102, + "grad_norm": 0.49310383200645447, + "learning_rate": 0.00019876708169060648, + "loss": 1.0588, + "step": 1143 + }, + { + "epoch": 0.2037037037037037, + "grad_norm": 0.4270328879356384, + "learning_rate": 0.00019876488948785271, + "loss": 1.1523, + "step": 1144 + }, + { + "epoch": 0.2038817663817664, + "grad_norm": 0.4559730887413025, + "learning_rate": 0.0001987626953500061, + "loss": 1.1736, + "step": 1145 + }, + { + "epoch": 0.20405982905982906, + "grad_norm": 0.5335259437561035, + "learning_rate": 0.00019876049927710962, + "loss": 0.991, + "step": 1146 + }, + { + "epoch": 0.20423789173789172, + "grad_norm": 0.43500083684921265, + "learning_rate": 0.0001987583012692063, + "loss": 1.0631, + "step": 1147 + }, + { + "epoch": 0.20441595441595442, + "grad_norm": 0.4135417938232422, + "learning_rate": 0.00019875610132633927, + "loss": 1.0896, + "step": 1148 + }, + { + "epoch": 0.2045940170940171, + "grad_norm": 
0.4078896641731262, + "learning_rate": 0.00019875389944855153, + "loss": 1.0395, + "step": 1149 + }, + { + "epoch": 0.20477207977207978, + "grad_norm": 0.46612194180488586, + "learning_rate": 0.00019875169563588632, + "loss": 1.0541, + "step": 1150 + }, + { + "epoch": 0.20495014245014245, + "grad_norm": 0.5093224048614502, + "learning_rate": 0.00019874948988838674, + "loss": 1.1486, + "step": 1151 + }, + { + "epoch": 0.20512820512820512, + "grad_norm": 0.5079755187034607, + "learning_rate": 0.00019874728220609607, + "loss": 1.2614, + "step": 1152 + }, + { + "epoch": 0.20530626780626782, + "grad_norm": 0.43663498759269714, + "learning_rate": 0.0001987450725890575, + "loss": 1.0683, + "step": 1153 + }, + { + "epoch": 0.20548433048433049, + "grad_norm": 0.5029327273368835, + "learning_rate": 0.00019874286103731435, + "loss": 1.1934, + "step": 1154 + }, + { + "epoch": 0.20566239316239315, + "grad_norm": 0.48770397901535034, + "learning_rate": 0.00019874064755090999, + "loss": 1.1634, + "step": 1155 + }, + { + "epoch": 0.20584045584045585, + "grad_norm": 0.46826690435409546, + "learning_rate": 0.00019873843212988776, + "loss": 1.0621, + "step": 1156 + }, + { + "epoch": 0.20601851851851852, + "grad_norm": 0.4810047149658203, + "learning_rate": 0.00019873621477429105, + "loss": 1.0879, + "step": 1157 + }, + { + "epoch": 0.20619658119658119, + "grad_norm": 0.4769522249698639, + "learning_rate": 0.00019873399548416335, + "loss": 1.1365, + "step": 1158 + }, + { + "epoch": 0.20637464387464388, + "grad_norm": 0.4221782982349396, + "learning_rate": 0.00019873177425954806, + "loss": 1.1168, + "step": 1159 + }, + { + "epoch": 0.20655270655270655, + "grad_norm": 0.4084923565387726, + "learning_rate": 0.00019872955110048876, + "loss": 1.2364, + "step": 1160 + }, + { + "epoch": 0.20673076923076922, + "grad_norm": 0.4781704545021057, + "learning_rate": 0.00019872732600702904, + "loss": 1.19, + "step": 1161 + }, + { + "epoch": 0.20690883190883191, + "grad_norm": 0.3984242081642151, + 
"learning_rate": 0.0001987250989792124, + "loss": 1.0568, + "step": 1162 + }, + { + "epoch": 0.20708689458689458, + "grad_norm": 0.4601972997188568, + "learning_rate": 0.00019872287001708257, + "loss": 1.1625, + "step": 1163 + }, + { + "epoch": 0.20726495726495728, + "grad_norm": 0.4853581190109253, + "learning_rate": 0.00019872063912068316, + "loss": 1.2304, + "step": 1164 + }, + { + "epoch": 0.20744301994301995, + "grad_norm": 0.41779839992523193, + "learning_rate": 0.0001987184062900579, + "loss": 0.9807, + "step": 1165 + }, + { + "epoch": 0.20762108262108261, + "grad_norm": 0.4945356249809265, + "learning_rate": 0.00019871617152525056, + "loss": 1.1861, + "step": 1166 + }, + { + "epoch": 0.2077991452991453, + "grad_norm": 0.47432294487953186, + "learning_rate": 0.00019871393482630487, + "loss": 1.1448, + "step": 1167 + }, + { + "epoch": 0.20797720797720798, + "grad_norm": 0.44647398591041565, + "learning_rate": 0.00019871169619326473, + "loss": 1.096, + "step": 1168 + }, + { + "epoch": 0.20815527065527065, + "grad_norm": 0.4643072783946991, + "learning_rate": 0.00019870945562617393, + "loss": 1.1561, + "step": 1169 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.4544340968132019, + "learning_rate": 0.0001987072131250764, + "loss": 1.0764, + "step": 1170 + }, + { + "epoch": 0.208511396011396, + "grad_norm": 0.6036561727523804, + "learning_rate": 0.00019870496869001607, + "loss": 1.3961, + "step": 1171 + }, + { + "epoch": 0.20868945868945868, + "grad_norm": 0.41348758339881897, + "learning_rate": 0.00019870272232103695, + "loss": 1.2219, + "step": 1172 + }, + { + "epoch": 0.20886752136752137, + "grad_norm": 0.4184056222438812, + "learning_rate": 0.000198700474018183, + "loss": 1.1115, + "step": 1173 + }, + { + "epoch": 0.20904558404558404, + "grad_norm": 0.41920599341392517, + "learning_rate": 0.0001986982237814983, + "loss": 0.9207, + "step": 1174 + }, + { + "epoch": 0.2092236467236467, + "grad_norm": 0.4710249602794647, + "learning_rate": 
0.00019869597161102694, + "loss": 1.1342, + "step": 1175 + }, + { + "epoch": 0.2094017094017094, + "grad_norm": 0.46897777915000916, + "learning_rate": 0.000198693717506813, + "loss": 0.983, + "step": 1176 + }, + { + "epoch": 0.20957977207977208, + "grad_norm": 0.4817039370536804, + "learning_rate": 0.00019869146146890074, + "loss": 1.0923, + "step": 1177 + }, + { + "epoch": 0.20975783475783477, + "grad_norm": 0.4806751012802124, + "learning_rate": 0.00019868920349733427, + "loss": 1.2296, + "step": 1178 + }, + { + "epoch": 0.20993589743589744, + "grad_norm": 0.44182994961738586, + "learning_rate": 0.0001986869435921579, + "loss": 1.1856, + "step": 1179 + }, + { + "epoch": 0.2101139601139601, + "grad_norm": 0.4282805621623993, + "learning_rate": 0.00019868468175341584, + "loss": 1.0046, + "step": 1180 + }, + { + "epoch": 0.2102920227920228, + "grad_norm": 0.5011838674545288, + "learning_rate": 0.00019868241798115242, + "loss": 1.2401, + "step": 1181 + }, + { + "epoch": 0.21047008547008547, + "grad_norm": 0.4282447397708893, + "learning_rate": 0.00019868015227541208, + "loss": 0.9338, + "step": 1182 + }, + { + "epoch": 0.21064814814814814, + "grad_norm": 0.4348810911178589, + "learning_rate": 0.00019867788463623912, + "loss": 0.926, + "step": 1183 + }, + { + "epoch": 0.21082621082621084, + "grad_norm": 0.41518425941467285, + "learning_rate": 0.00019867561506367799, + "loss": 1.2723, + "step": 1184 + }, + { + "epoch": 0.2110042735042735, + "grad_norm": 0.47346001863479614, + "learning_rate": 0.00019867334355777315, + "loss": 1.1931, + "step": 1185 + }, + { + "epoch": 0.21118233618233617, + "grad_norm": 0.4071715474128723, + "learning_rate": 0.00019867107011856914, + "loss": 0.9619, + "step": 1186 + }, + { + "epoch": 0.21136039886039887, + "grad_norm": 0.4803447425365448, + "learning_rate": 0.00019866879474611046, + "loss": 1.2, + "step": 1187 + }, + { + "epoch": 0.21153846153846154, + "grad_norm": 0.4827699661254883, + "learning_rate": 0.00019866651744044172, + 
"loss": 1.0938, + "step": 1188 + }, + { + "epoch": 0.2117165242165242, + "grad_norm": 0.4528424143791199, + "learning_rate": 0.00019866423820160756, + "loss": 0.9721, + "step": 1189 + }, + { + "epoch": 0.2118945868945869, + "grad_norm": 0.43566834926605225, + "learning_rate": 0.0001986619570296526, + "loss": 1.0352, + "step": 1190 + }, + { + "epoch": 0.21207264957264957, + "grad_norm": 0.4516540467739105, + "learning_rate": 0.0001986596739246215, + "loss": 1.1333, + "step": 1191 + }, + { + "epoch": 0.21225071225071226, + "grad_norm": 0.4456641376018524, + "learning_rate": 0.00019865738888655908, + "loss": 1.2813, + "step": 1192 + }, + { + "epoch": 0.21242877492877493, + "grad_norm": 0.47048309445381165, + "learning_rate": 0.00019865510191551008, + "loss": 1.1067, + "step": 1193 + }, + { + "epoch": 0.2126068376068376, + "grad_norm": 0.4604061543941498, + "learning_rate": 0.00019865281301151928, + "loss": 0.925, + "step": 1194 + }, + { + "epoch": 0.2127849002849003, + "grad_norm": 0.49341437220573425, + "learning_rate": 0.00019865052217463153, + "loss": 1.2319, + "step": 1195 + }, + { + "epoch": 0.21296296296296297, + "grad_norm": 0.5099014639854431, + "learning_rate": 0.00019864822940489173, + "loss": 1.139, + "step": 1196 + }, + { + "epoch": 0.21314102564102563, + "grad_norm": 0.41396936774253845, + "learning_rate": 0.0001986459347023448, + "loss": 1.0594, + "step": 1197 + }, + { + "epoch": 0.21331908831908833, + "grad_norm": 0.46071869134902954, + "learning_rate": 0.0001986436380670357, + "loss": 1.0815, + "step": 1198 + }, + { + "epoch": 0.213497150997151, + "grad_norm": 0.507882297039032, + "learning_rate": 0.00019864133949900942, + "loss": 1.3841, + "step": 1199 + }, + { + "epoch": 0.21367521367521367, + "grad_norm": 0.45680439472198486, + "learning_rate": 0.00019863903899831103, + "loss": 1.0945, + "step": 1200 + }, + { + "epoch": 0.21385327635327636, + "grad_norm": 0.44277429580688477, + "learning_rate": 0.00019863673656498555, + "loss": 1.1655, + "step": 
1201 + }, + { + "epoch": 0.21403133903133903, + "grad_norm": 0.43890756368637085, + "learning_rate": 0.00019863443219907812, + "loss": 1.1186, + "step": 1202 + }, + { + "epoch": 0.2142094017094017, + "grad_norm": 0.3910178542137146, + "learning_rate": 0.0001986321259006339, + "loss": 1.0817, + "step": 1203 + }, + { + "epoch": 0.2143874643874644, + "grad_norm": 0.3803878128528595, + "learning_rate": 0.00019862981766969803, + "loss": 0.8022, + "step": 1204 + }, + { + "epoch": 0.21456552706552706, + "grad_norm": 0.4495108425617218, + "learning_rate": 0.0001986275075063158, + "loss": 1.2212, + "step": 1205 + }, + { + "epoch": 0.21474358974358973, + "grad_norm": 0.5211976766586304, + "learning_rate": 0.00019862519541053244, + "loss": 1.2771, + "step": 1206 + }, + { + "epoch": 0.21492165242165243, + "grad_norm": 0.4313061535358429, + "learning_rate": 0.00019862288138239325, + "loss": 1.1205, + "step": 1207 + }, + { + "epoch": 0.2150997150997151, + "grad_norm": 0.47110888361930847, + "learning_rate": 0.00019862056542194355, + "loss": 1.1835, + "step": 1208 + }, + { + "epoch": 0.2152777777777778, + "grad_norm": 0.5129403471946716, + "learning_rate": 0.00019861824752922876, + "loss": 1.1655, + "step": 1209 + }, + { + "epoch": 0.21545584045584046, + "grad_norm": 0.4353938102722168, + "learning_rate": 0.00019861592770429427, + "loss": 1.2794, + "step": 1210 + }, + { + "epoch": 0.21563390313390313, + "grad_norm": 0.48590636253356934, + "learning_rate": 0.0001986136059471855, + "loss": 1.2003, + "step": 1211 + }, + { + "epoch": 0.21581196581196582, + "grad_norm": 0.4738406836986542, + "learning_rate": 0.00019861128225794804, + "loss": 1.2271, + "step": 1212 + }, + { + "epoch": 0.2159900284900285, + "grad_norm": 0.45983126759529114, + "learning_rate": 0.0001986089566366273, + "loss": 1.1896, + "step": 1213 + }, + { + "epoch": 0.21616809116809116, + "grad_norm": 0.37296006083488464, + "learning_rate": 0.00019860662908326892, + "loss": 1.079, + "step": 1214 + }, + { + "epoch": 
0.21634615384615385, + "grad_norm": 0.4442676305770874, + "learning_rate": 0.00019860429959791845, + "loss": 1.1754, + "step": 1215 + }, + { + "epoch": 0.21652421652421652, + "grad_norm": 0.4950128495693207, + "learning_rate": 0.0001986019681806216, + "loss": 1.1571, + "step": 1216 + }, + { + "epoch": 0.2167022792022792, + "grad_norm": 0.4374556541442871, + "learning_rate": 0.000198599634831424, + "loss": 1.1003, + "step": 1217 + }, + { + "epoch": 0.2168803418803419, + "grad_norm": 0.47301414608955383, + "learning_rate": 0.00019859729955037136, + "loss": 1.1426, + "step": 1218 + }, + { + "epoch": 0.21705840455840456, + "grad_norm": 0.41213178634643555, + "learning_rate": 0.00019859496233750947, + "loss": 1.0659, + "step": 1219 + }, + { + "epoch": 0.21723646723646722, + "grad_norm": 0.41601964831352234, + "learning_rate": 0.0001985926231928841, + "loss": 1.0248, + "step": 1220 + }, + { + "epoch": 0.21741452991452992, + "grad_norm": 0.46328839659690857, + "learning_rate": 0.0001985902821165411, + "loss": 1.0405, + "step": 1221 + }, + { + "epoch": 0.2175925925925926, + "grad_norm": 0.43287959694862366, + "learning_rate": 0.0001985879391085263, + "loss": 0.9202, + "step": 1222 + }, + { + "epoch": 0.21777065527065528, + "grad_norm": 0.4770444631576538, + "learning_rate": 0.00019858559416888568, + "loss": 1.0911, + "step": 1223 + }, + { + "epoch": 0.21794871794871795, + "grad_norm": 0.4756585955619812, + "learning_rate": 0.00019858324729766507, + "loss": 1.1566, + "step": 1224 + }, + { + "epoch": 0.21812678062678062, + "grad_norm": 0.4337233006954193, + "learning_rate": 0.00019858089849491054, + "loss": 0.9084, + "step": 1225 + }, + { + "epoch": 0.21830484330484332, + "grad_norm": 0.5165579319000244, + "learning_rate": 0.00019857854776066813, + "loss": 1.4154, + "step": 1226 + }, + { + "epoch": 0.21848290598290598, + "grad_norm": 0.4280378520488739, + "learning_rate": 0.00019857619509498382, + "loss": 1.1291, + "step": 1227 + }, + { + "epoch": 0.21866096866096865, + 
"grad_norm": 0.5375089049339294, + "learning_rate": 0.00019857384049790376, + "loss": 1.2985, + "step": 1228 + }, + { + "epoch": 0.21883903133903135, + "grad_norm": 0.4708811640739441, + "learning_rate": 0.00019857148396947401, + "loss": 1.0589, + "step": 1229 + }, + { + "epoch": 0.21901709401709402, + "grad_norm": 0.4744570255279541, + "learning_rate": 0.00019856912550974084, + "loss": 1.1269, + "step": 1230 + }, + { + "epoch": 0.21919515669515668, + "grad_norm": 0.5355265736579895, + "learning_rate": 0.00019856676511875043, + "loss": 1.1441, + "step": 1231 + }, + { + "epoch": 0.21937321937321938, + "grad_norm": 0.42718183994293213, + "learning_rate": 0.00019856440279654897, + "loss": 1.0244, + "step": 1232 + }, + { + "epoch": 0.21955128205128205, + "grad_norm": 0.5162127614021301, + "learning_rate": 0.00019856203854318283, + "loss": 1.2674, + "step": 1233 + }, + { + "epoch": 0.21972934472934472, + "grad_norm": 0.5180695652961731, + "learning_rate": 0.00019855967235869827, + "loss": 1.2472, + "step": 1234 + }, + { + "epoch": 0.2199074074074074, + "grad_norm": 0.4290023744106293, + "learning_rate": 0.00019855730424314167, + "loss": 1.0502, + "step": 1235 + }, + { + "epoch": 0.22008547008547008, + "grad_norm": 0.4418254792690277, + "learning_rate": 0.00019855493419655945, + "loss": 1.0589, + "step": 1236 + }, + { + "epoch": 0.22026353276353278, + "grad_norm": 0.4074663817882538, + "learning_rate": 0.000198552562218998, + "loss": 0.9197, + "step": 1237 + }, + { + "epoch": 0.22044159544159544, + "grad_norm": 0.4526660740375519, + "learning_rate": 0.00019855018831050383, + "loss": 1.2578, + "step": 1238 + }, + { + "epoch": 0.2206196581196581, + "grad_norm": 0.4747827649116516, + "learning_rate": 0.00019854781247112343, + "loss": 1.0841, + "step": 1239 + }, + { + "epoch": 0.2207977207977208, + "grad_norm": 0.41567128896713257, + "learning_rate": 0.00019854543470090334, + "loss": 1.0737, + "step": 1240 + }, + { + "epoch": 0.22097578347578348, + "grad_norm": 
0.4793100953102112, + "learning_rate": 0.00019854305499989022, + "loss": 1.1972, + "step": 1241 + }, + { + "epoch": 0.22115384615384615, + "grad_norm": 0.41755473613739014, + "learning_rate": 0.00019854067336813058, + "loss": 1.2529, + "step": 1242 + }, + { + "epoch": 0.22133190883190884, + "grad_norm": 0.40421152114868164, + "learning_rate": 0.0001985382898056712, + "loss": 1.0549, + "step": 1243 + }, + { + "epoch": 0.2215099715099715, + "grad_norm": 0.45779645442962646, + "learning_rate": 0.0001985359043125587, + "loss": 1.1586, + "step": 1244 + }, + { + "epoch": 0.22168803418803418, + "grad_norm": 0.4380546808242798, + "learning_rate": 0.00019853351688883987, + "loss": 1.1024, + "step": 1245 + }, + { + "epoch": 0.22186609686609687, + "grad_norm": 0.39917269349098206, + "learning_rate": 0.00019853112753456142, + "loss": 0.9823, + "step": 1246 + }, + { + "epoch": 0.22204415954415954, + "grad_norm": 0.4228038489818573, + "learning_rate": 0.00019852873624977022, + "loss": 1.1684, + "step": 1247 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 0.4462146759033203, + "learning_rate": 0.00019852634303451315, + "loss": 0.9027, + "step": 1248 + }, + { + "epoch": 0.2224002849002849, + "grad_norm": 0.5682163834571838, + "learning_rate": 0.000198523947888837, + "loss": 1.141, + "step": 1249 + }, + { + "epoch": 0.22257834757834757, + "grad_norm": 0.44866830110549927, + "learning_rate": 0.0001985215508127888, + "loss": 1.0759, + "step": 1250 + }, + { + "epoch": 0.22275641025641027, + "grad_norm": 0.4034106135368347, + "learning_rate": 0.00019851915180641548, + "loss": 1.0675, + "step": 1251 + }, + { + "epoch": 0.22293447293447294, + "grad_norm": 0.4780726432800293, + "learning_rate": 0.00019851675086976397, + "loss": 1.0283, + "step": 1252 + }, + { + "epoch": 0.2231125356125356, + "grad_norm": 0.48892372846603394, + "learning_rate": 0.00019851434800288145, + "loss": 1.1159, + "step": 1253 + }, + { + "epoch": 0.2232905982905983, + "grad_norm": 0.42629215121269226, + 
"learning_rate": 0.0001985119432058149, + "loss": 1.0292, + "step": 1254 + }, + { + "epoch": 0.22346866096866097, + "grad_norm": 0.4496444761753082, + "learning_rate": 0.00019850953647861146, + "loss": 1.0252, + "step": 1255 + }, + { + "epoch": 0.22364672364672364, + "grad_norm": 0.4371408224105835, + "learning_rate": 0.00019850712782131828, + "loss": 1.1104, + "step": 1256 + }, + { + "epoch": 0.22382478632478633, + "grad_norm": 0.4910794496536255, + "learning_rate": 0.00019850471723398258, + "loss": 1.1928, + "step": 1257 + }, + { + "epoch": 0.224002849002849, + "grad_norm": 0.41235068440437317, + "learning_rate": 0.00019850230471665157, + "loss": 1.1261, + "step": 1258 + }, + { + "epoch": 0.22418091168091167, + "grad_norm": 0.4507700502872467, + "learning_rate": 0.0001984998902693725, + "loss": 1.0602, + "step": 1259 + }, + { + "epoch": 0.22435897435897437, + "grad_norm": 0.4654198884963989, + "learning_rate": 0.00019849747389219272, + "loss": 1.1258, + "step": 1260 + }, + { + "epoch": 0.22453703703703703, + "grad_norm": 0.439807653427124, + "learning_rate": 0.00019849505558515952, + "loss": 1.2312, + "step": 1261 + }, + { + "epoch": 0.2247150997150997, + "grad_norm": 0.4309258759021759, + "learning_rate": 0.00019849263534832035, + "loss": 1.0083, + "step": 1262 + }, + { + "epoch": 0.2248931623931624, + "grad_norm": 0.4920141100883484, + "learning_rate": 0.00019849021318172255, + "loss": 1.0254, + "step": 1263 + }, + { + "epoch": 0.22507122507122507, + "grad_norm": 0.5333457589149475, + "learning_rate": 0.00019848778908541367, + "loss": 1.3017, + "step": 1264 + }, + { + "epoch": 0.22524928774928774, + "grad_norm": 0.4096757769584656, + "learning_rate": 0.0001984853630594411, + "loss": 0.9531, + "step": 1265 + }, + { + "epoch": 0.22542735042735043, + "grad_norm": 0.5744075775146484, + "learning_rate": 0.00019848293510385244, + "loss": 1.1414, + "step": 1266 + }, + { + "epoch": 0.2256054131054131, + "grad_norm": 0.44707193970680237, + "learning_rate": 
0.00019848050521869529, + "loss": 1.1926, + "step": 1267 + }, + { + "epoch": 0.2257834757834758, + "grad_norm": 0.4162999391555786, + "learning_rate": 0.00019847807340401716, + "loss": 1.1354, + "step": 1268 + }, + { + "epoch": 0.22596153846153846, + "grad_norm": 0.4273204207420349, + "learning_rate": 0.0001984756396598658, + "loss": 0.9956, + "step": 1269 + }, + { + "epoch": 0.22613960113960113, + "grad_norm": 0.5670466423034668, + "learning_rate": 0.00019847320398628878, + "loss": 1.2384, + "step": 1270 + }, + { + "epoch": 0.22631766381766383, + "grad_norm": 0.424544095993042, + "learning_rate": 0.00019847076638333395, + "loss": 0.9963, + "step": 1271 + }, + { + "epoch": 0.2264957264957265, + "grad_norm": 0.3716120719909668, + "learning_rate": 0.000198468326851049, + "loss": 0.865, + "step": 1272 + }, + { + "epoch": 0.22667378917378916, + "grad_norm": 0.4472847282886505, + "learning_rate": 0.00019846588538948172, + "loss": 1.174, + "step": 1273 + }, + { + "epoch": 0.22685185185185186, + "grad_norm": 0.4599195718765259, + "learning_rate": 0.00019846344199867994, + "loss": 1.289, + "step": 1274 + }, + { + "epoch": 0.22702991452991453, + "grad_norm": 0.4303213357925415, + "learning_rate": 0.0001984609966786916, + "loss": 1.1606, + "step": 1275 + }, + { + "epoch": 0.2272079772079772, + "grad_norm": 0.44893527030944824, + "learning_rate": 0.00019845854942956455, + "loss": 1.1043, + "step": 1276 + }, + { + "epoch": 0.2273860398860399, + "grad_norm": 0.40033379197120667, + "learning_rate": 0.00019845610025134676, + "loss": 1.1434, + "step": 1277 + }, + { + "epoch": 0.22756410256410256, + "grad_norm": 0.4385402202606201, + "learning_rate": 0.00019845364914408616, + "loss": 0.9943, + "step": 1278 + }, + { + "epoch": 0.22774216524216523, + "grad_norm": 0.42123618721961975, + "learning_rate": 0.0001984511961078309, + "loss": 1.0911, + "step": 1279 + }, + { + "epoch": 0.22792022792022792, + "grad_norm": 0.5558577179908752, + "learning_rate": 0.00019844874114262893, + "loss": 
1.3893, + "step": 1280 + }, + { + "epoch": 0.2280982905982906, + "grad_norm": 0.3996453583240509, + "learning_rate": 0.00019844628424852835, + "loss": 0.8951, + "step": 1281 + }, + { + "epoch": 0.2282763532763533, + "grad_norm": 0.3943425714969635, + "learning_rate": 0.0001984438254255774, + "loss": 1.0595, + "step": 1282 + }, + { + "epoch": 0.22845441595441596, + "grad_norm": 0.4429021179676056, + "learning_rate": 0.00019844136467382414, + "loss": 1.0853, + "step": 1283 + }, + { + "epoch": 0.22863247863247863, + "grad_norm": 0.4515686631202698, + "learning_rate": 0.00019843890199331687, + "loss": 1.0829, + "step": 1284 + }, + { + "epoch": 0.22881054131054132, + "grad_norm": 0.5157768726348877, + "learning_rate": 0.00019843643738410378, + "loss": 1.334, + "step": 1285 + }, + { + "epoch": 0.228988603988604, + "grad_norm": 0.45833173394203186, + "learning_rate": 0.0001984339708462332, + "loss": 1.1353, + "step": 1286 + }, + { + "epoch": 0.22916666666666666, + "grad_norm": 0.46610337495803833, + "learning_rate": 0.00019843150237975344, + "loss": 1.1338, + "step": 1287 + }, + { + "epoch": 0.22934472934472935, + "grad_norm": 0.5076978802680969, + "learning_rate": 0.00019842903198471286, + "loss": 1.1811, + "step": 1288 + }, + { + "epoch": 0.22952279202279202, + "grad_norm": 0.4297824800014496, + "learning_rate": 0.00019842655966115986, + "loss": 1.1799, + "step": 1289 + }, + { + "epoch": 0.2297008547008547, + "grad_norm": 0.5304586291313171, + "learning_rate": 0.0001984240854091429, + "loss": 1.1315, + "step": 1290 + }, + { + "epoch": 0.22987891737891739, + "grad_norm": 0.45359212160110474, + "learning_rate": 0.00019842160922871042, + "loss": 1.1037, + "step": 1291 + }, + { + "epoch": 0.23005698005698005, + "grad_norm": 0.4416881203651428, + "learning_rate": 0.00019841913111991096, + "loss": 1.122, + "step": 1292 + }, + { + "epoch": 0.23023504273504272, + "grad_norm": 0.46682995557785034, + "learning_rate": 0.0001984166510827931, + "loss": 0.9808, + "step": 1293 + }, + 
{ + "epoch": 0.23041310541310542, + "grad_norm": 0.44172337651252747, + "learning_rate": 0.00019841416911740538, + "loss": 0.9167, + "step": 1294 + }, + { + "epoch": 0.23059116809116809, + "grad_norm": 0.40562742948532104, + "learning_rate": 0.0001984116852237965, + "loss": 0.9547, + "step": 1295 + }, + { + "epoch": 0.23076923076923078, + "grad_norm": 0.4040384888648987, + "learning_rate": 0.00019840919940201503, + "loss": 1.1039, + "step": 1296 + }, + { + "epoch": 0.23094729344729345, + "grad_norm": 0.5094077587127686, + "learning_rate": 0.00019840671165210973, + "loss": 1.2283, + "step": 1297 + }, + { + "epoch": 0.23112535612535612, + "grad_norm": 0.48553213477134705, + "learning_rate": 0.00019840422197412938, + "loss": 1.0927, + "step": 1298 + }, + { + "epoch": 0.23130341880341881, + "grad_norm": 0.5197509527206421, + "learning_rate": 0.00019840173036812266, + "loss": 1.2154, + "step": 1299 + }, + { + "epoch": 0.23148148148148148, + "grad_norm": 0.42069005966186523, + "learning_rate": 0.0001983992368341385, + "loss": 1.0076, + "step": 1300 + }, + { + "epoch": 0.23165954415954415, + "grad_norm": 0.475204735994339, + "learning_rate": 0.00019839674137222567, + "loss": 1.1682, + "step": 1301 + }, + { + "epoch": 0.23183760683760685, + "grad_norm": 0.55730140209198, + "learning_rate": 0.0001983942439824331, + "loss": 1.2948, + "step": 1302 + }, + { + "epoch": 0.23201566951566951, + "grad_norm": 0.4533313512802124, + "learning_rate": 0.00019839174466480973, + "loss": 1.2691, + "step": 1303 + }, + { + "epoch": 0.23219373219373218, + "grad_norm": 0.4733520746231079, + "learning_rate": 0.0001983892434194045, + "loss": 1.2232, + "step": 1304 + }, + { + "epoch": 0.23237179487179488, + "grad_norm": 0.5085756182670593, + "learning_rate": 0.00019838674024626643, + "loss": 1.1347, + "step": 1305 + }, + { + "epoch": 0.23254985754985755, + "grad_norm": 0.4679976999759674, + "learning_rate": 0.00019838423514544456, + "loss": 1.0018, + "step": 1306 + }, + { + "epoch": 
0.23272792022792022, + "grad_norm": 0.4234481751918793, + "learning_rate": 0.00019838172811698795, + "loss": 1.0472, + "step": 1307 + }, + { + "epoch": 0.2329059829059829, + "grad_norm": 0.5749204158782959, + "learning_rate": 0.00019837921916094579, + "loss": 1.2239, + "step": 1308 + }, + { + "epoch": 0.23308404558404558, + "grad_norm": 0.46715882420539856, + "learning_rate": 0.0001983767082773672, + "loss": 1.1924, + "step": 1309 + }, + { + "epoch": 0.23326210826210828, + "grad_norm": 0.5079745054244995, + "learning_rate": 0.00019837419546630137, + "loss": 1.1086, + "step": 1310 + }, + { + "epoch": 0.23344017094017094, + "grad_norm": 0.4419243037700653, + "learning_rate": 0.0001983716807277975, + "loss": 1.1911, + "step": 1311 + }, + { + "epoch": 0.2336182336182336, + "grad_norm": 0.5107570290565491, + "learning_rate": 0.00019836916406190493, + "loss": 1.1071, + "step": 1312 + }, + { + "epoch": 0.2337962962962963, + "grad_norm": 0.5295659303665161, + "learning_rate": 0.00019836664546867293, + "loss": 1.2905, + "step": 1313 + }, + { + "epoch": 0.23397435897435898, + "grad_norm": 0.4844837784767151, + "learning_rate": 0.00019836412494815084, + "loss": 1.3507, + "step": 1314 + }, + { + "epoch": 0.23415242165242164, + "grad_norm": 0.6166049242019653, + "learning_rate": 0.00019836160250038808, + "loss": 1.2822, + "step": 1315 + }, + { + "epoch": 0.23433048433048434, + "grad_norm": 0.3229198753833771, + "learning_rate": 0.00019835907812543402, + "loss": 0.4959, + "step": 1316 + }, + { + "epoch": 0.234508547008547, + "grad_norm": 0.5788772702217102, + "learning_rate": 0.00019835655182333815, + "loss": 1.0832, + "step": 1317 + }, + { + "epoch": 0.23468660968660968, + "grad_norm": 0.525705099105835, + "learning_rate": 0.00019835402359414997, + "loss": 1.0968, + "step": 1318 + }, + { + "epoch": 0.23486467236467237, + "grad_norm": 0.5007779002189636, + "learning_rate": 0.000198351493437919, + "loss": 1.2788, + "step": 1319 + }, + { + "epoch": 0.23504273504273504, + 
"grad_norm": 0.4276871383190155, + "learning_rate": 0.00019834896135469484, + "loss": 1.0419, + "step": 1320 + }, + { + "epoch": 0.2352207977207977, + "grad_norm": 0.5359070301055908, + "learning_rate": 0.00019834642734452708, + "loss": 1.1308, + "step": 1321 + }, + { + "epoch": 0.2353988603988604, + "grad_norm": 0.4854908883571625, + "learning_rate": 0.0001983438914074654, + "loss": 1.1211, + "step": 1322 + }, + { + "epoch": 0.23557692307692307, + "grad_norm": 0.4913707375526428, + "learning_rate": 0.0001983413535435594, + "loss": 1.2392, + "step": 1323 + }, + { + "epoch": 0.23575498575498577, + "grad_norm": 0.46755748987197876, + "learning_rate": 0.0001983388137528589, + "loss": 0.9348, + "step": 1324 + }, + { + "epoch": 0.23593304843304844, + "grad_norm": 0.4592570960521698, + "learning_rate": 0.0001983362720354136, + "loss": 1.1339, + "step": 1325 + }, + { + "epoch": 0.2361111111111111, + "grad_norm": 0.5121711492538452, + "learning_rate": 0.00019833372839127335, + "loss": 1.2973, + "step": 1326 + }, + { + "epoch": 0.2362891737891738, + "grad_norm": 0.4809017479419708, + "learning_rate": 0.000198331182820488, + "loss": 0.9849, + "step": 1327 + }, + { + "epoch": 0.23646723646723647, + "grad_norm": 0.42340895533561707, + "learning_rate": 0.00019832863532310733, + "loss": 1.0731, + "step": 1328 + }, + { + "epoch": 0.23664529914529914, + "grad_norm": 0.5388045310974121, + "learning_rate": 0.00019832608589918135, + "loss": 1.0729, + "step": 1329 + }, + { + "epoch": 0.23682336182336183, + "grad_norm": 0.43075770139694214, + "learning_rate": 0.00019832353454875992, + "loss": 1.1684, + "step": 1330 + }, + { + "epoch": 0.2370014245014245, + "grad_norm": 0.554927408695221, + "learning_rate": 0.00019832098127189313, + "loss": 1.0842, + "step": 1331 + }, + { + "epoch": 0.23717948717948717, + "grad_norm": 0.5359260439872742, + "learning_rate": 0.0001983184260686309, + "loss": 1.2399, + "step": 1332 + }, + { + "epoch": 0.23735754985754987, + "grad_norm": 0.5141251087188721, 
+ "learning_rate": 0.0001983158689390234, + "loss": 1.3752, + "step": 1333 + }, + { + "epoch": 0.23753561253561253, + "grad_norm": 0.4578750431537628, + "learning_rate": 0.00019831330988312067, + "loss": 1.0965, + "step": 1334 + }, + { + "epoch": 0.2377136752136752, + "grad_norm": 0.47974497079849243, + "learning_rate": 0.00019831074890097286, + "loss": 1.3379, + "step": 1335 + }, + { + "epoch": 0.2378917378917379, + "grad_norm": 0.4618176817893982, + "learning_rate": 0.00019830818599263014, + "loss": 1.274, + "step": 1336 + }, + { + "epoch": 0.23806980056980057, + "grad_norm": 0.4279816448688507, + "learning_rate": 0.00019830562115814276, + "loss": 0.996, + "step": 1337 + }, + { + "epoch": 0.23824786324786323, + "grad_norm": 0.4255026876926422, + "learning_rate": 0.0001983030543975609, + "loss": 0.969, + "step": 1338 + }, + { + "epoch": 0.23842592592592593, + "grad_norm": 0.4551412761211395, + "learning_rate": 0.00019830048571093493, + "loss": 1.0204, + "step": 1339 + }, + { + "epoch": 0.2386039886039886, + "grad_norm": 0.4747903048992157, + "learning_rate": 0.00019829791509831513, + "loss": 1.1816, + "step": 1340 + }, + { + "epoch": 0.2387820512820513, + "grad_norm": 0.47187140583992004, + "learning_rate": 0.00019829534255975188, + "loss": 1.1205, + "step": 1341 + }, + { + "epoch": 0.23896011396011396, + "grad_norm": 0.49332180619239807, + "learning_rate": 0.0001982927680952956, + "loss": 1.2657, + "step": 1342 + }, + { + "epoch": 0.23913817663817663, + "grad_norm": 0.5162837505340576, + "learning_rate": 0.0001982901917049967, + "loss": 1.2247, + "step": 1343 + }, + { + "epoch": 0.23931623931623933, + "grad_norm": 0.43407055735588074, + "learning_rate": 0.0001982876133889057, + "loss": 1.0038, + "step": 1344 + }, + { + "epoch": 0.239494301994302, + "grad_norm": 0.5132251977920532, + "learning_rate": 0.00019828503314707306, + "loss": 1.0678, + "step": 1345 + }, + { + "epoch": 0.23967236467236466, + "grad_norm": 0.46295464038848877, + "learning_rate": 
0.00019828245097954937, + "loss": 1.1802, + "step": 1346 + }, + { + "epoch": 0.23985042735042736, + "grad_norm": 0.4682658314704895, + "learning_rate": 0.00019827986688638523, + "loss": 1.0249, + "step": 1347 + }, + { + "epoch": 0.24002849002849003, + "grad_norm": 0.49990561604499817, + "learning_rate": 0.00019827728086763125, + "loss": 1.0691, + "step": 1348 + }, + { + "epoch": 0.2402065527065527, + "grad_norm": 0.39090847969055176, + "learning_rate": 0.00019827469292333806, + "loss": 0.8367, + "step": 1349 + }, + { + "epoch": 0.2403846153846154, + "grad_norm": 0.5023905634880066, + "learning_rate": 0.00019827210305355645, + "loss": 1.0675, + "step": 1350 + }, + { + "epoch": 0.24056267806267806, + "grad_norm": 0.4744076430797577, + "learning_rate": 0.00019826951125833715, + "loss": 1.3166, + "step": 1351 + }, + { + "epoch": 0.24074074074074073, + "grad_norm": 0.44914689660072327, + "learning_rate": 0.00019826691753773088, + "loss": 0.9818, + "step": 1352 + }, + { + "epoch": 0.24091880341880342, + "grad_norm": 0.44391971826553345, + "learning_rate": 0.00019826432189178853, + "loss": 1.0448, + "step": 1353 + }, + { + "epoch": 0.2410968660968661, + "grad_norm": 0.46102839708328247, + "learning_rate": 0.00019826172432056086, + "loss": 0.9952, + "step": 1354 + }, + { + "epoch": 0.2412749287749288, + "grad_norm": 0.4796878695487976, + "learning_rate": 0.00019825912482409884, + "loss": 1.0977, + "step": 1355 + }, + { + "epoch": 0.24145299145299146, + "grad_norm": 0.5003768801689148, + "learning_rate": 0.0001982565234024534, + "loss": 1.3149, + "step": 1356 + }, + { + "epoch": 0.24163105413105412, + "grad_norm": 0.43475663661956787, + "learning_rate": 0.00019825392005567551, + "loss": 1.0527, + "step": 1357 + }, + { + "epoch": 0.24180911680911682, + "grad_norm": 0.46120527386665344, + "learning_rate": 0.00019825131478381613, + "loss": 1.2333, + "step": 1358 + }, + { + "epoch": 0.2419871794871795, + "grad_norm": 0.43748101592063904, + "learning_rate": 
0.00019824870758692638, + "loss": 0.9788, + "step": 1359 + }, + { + "epoch": 0.24216524216524216, + "grad_norm": 0.5275192856788635, + "learning_rate": 0.00019824609846505727, + "loss": 1.1473, + "step": 1360 + }, + { + "epoch": 0.24234330484330485, + "grad_norm": 0.346463143825531, + "learning_rate": 0.00019824348741825993, + "loss": 0.6824, + "step": 1361 + }, + { + "epoch": 0.24252136752136752, + "grad_norm": 0.5004115700721741, + "learning_rate": 0.00019824087444658556, + "loss": 1.1853, + "step": 1362 + }, + { + "epoch": 0.2426994301994302, + "grad_norm": 0.42746666073799133, + "learning_rate": 0.00019823825955008533, + "loss": 0.9355, + "step": 1363 + }, + { + "epoch": 0.24287749287749288, + "grad_norm": 0.4099743068218231, + "learning_rate": 0.00019823564272881047, + "loss": 1.0753, + "step": 1364 + }, + { + "epoch": 0.24305555555555555, + "grad_norm": 0.5262967944145203, + "learning_rate": 0.00019823302398281226, + "loss": 1.2324, + "step": 1365 + }, + { + "epoch": 0.24323361823361822, + "grad_norm": 0.436069518327713, + "learning_rate": 0.000198230403312142, + "loss": 1.1887, + "step": 1366 + }, + { + "epoch": 0.24341168091168092, + "grad_norm": 0.38252368569374084, + "learning_rate": 0.00019822778071685107, + "loss": 1.0211, + "step": 1367 + }, + { + "epoch": 0.24358974358974358, + "grad_norm": 0.48024141788482666, + "learning_rate": 0.00019822515619699081, + "loss": 1.065, + "step": 1368 + }, + { + "epoch": 0.24376780626780628, + "grad_norm": 0.47421589493751526, + "learning_rate": 0.00019822252975261267, + "loss": 1.0433, + "step": 1369 + }, + { + "epoch": 0.24394586894586895, + "grad_norm": 0.46094807982444763, + "learning_rate": 0.00019821990138376808, + "loss": 1.1427, + "step": 1370 + }, + { + "epoch": 0.24412393162393162, + "grad_norm": 0.5093680620193481, + "learning_rate": 0.00019821727109050856, + "loss": 1.1086, + "step": 1371 + }, + { + "epoch": 0.2443019943019943, + "grad_norm": 0.41084879636764526, + "learning_rate": 0.00019821463887288566, 
+ "loss": 1.0068, + "step": 1372 + }, + { + "epoch": 0.24448005698005698, + "grad_norm": 0.4991084635257721, + "learning_rate": 0.0001982120047309509, + "loss": 1.1884, + "step": 1373 + }, + { + "epoch": 0.24465811965811965, + "grad_norm": 0.39198383688926697, + "learning_rate": 0.00019820936866475595, + "loss": 0.9776, + "step": 1374 + }, + { + "epoch": 0.24483618233618235, + "grad_norm": 0.4517424702644348, + "learning_rate": 0.00019820673067435244, + "loss": 1.1491, + "step": 1375 + }, + { + "epoch": 0.245014245014245, + "grad_norm": 0.45881983637809753, + "learning_rate": 0.00019820409075979202, + "loss": 1.1198, + "step": 1376 + }, + { + "epoch": 0.24519230769230768, + "grad_norm": 0.4498792290687561, + "learning_rate": 0.00019820144892112646, + "loss": 1.0897, + "step": 1377 + }, + { + "epoch": 0.24537037037037038, + "grad_norm": 0.4128037393093109, + "learning_rate": 0.00019819880515840752, + "loss": 0.9415, + "step": 1378 + }, + { + "epoch": 0.24554843304843305, + "grad_norm": 0.4340885281562805, + "learning_rate": 0.00019819615947168698, + "loss": 1.201, + "step": 1379 + }, + { + "epoch": 0.24572649572649571, + "grad_norm": 0.43814027309417725, + "learning_rate": 0.00019819351186101667, + "loss": 1.1039, + "step": 1380 + }, + { + "epoch": 0.2459045584045584, + "grad_norm": 0.40115082263946533, + "learning_rate": 0.00019819086232644845, + "loss": 1.2599, + "step": 1381 + }, + { + "epoch": 0.24608262108262108, + "grad_norm": 0.4947351813316345, + "learning_rate": 0.00019818821086803426, + "loss": 1.252, + "step": 1382 + }, + { + "epoch": 0.24626068376068377, + "grad_norm": 0.45179441571235657, + "learning_rate": 0.0001981855574858261, + "loss": 1.1323, + "step": 1383 + }, + { + "epoch": 0.24643874643874644, + "grad_norm": 0.47159844636917114, + "learning_rate": 0.00019818290217987587, + "loss": 1.2053, + "step": 1384 + }, + { + "epoch": 0.2466168091168091, + "grad_norm": 0.4358448386192322, + "learning_rate": 0.0001981802449502356, + "loss": 1.1174, + 
"step": 1385 + }, + { + "epoch": 0.2467948717948718, + "grad_norm": 0.4588233530521393, + "learning_rate": 0.00019817758579695745, + "loss": 1.1098, + "step": 1386 + }, + { + "epoch": 0.24697293447293447, + "grad_norm": 0.4955112636089325, + "learning_rate": 0.00019817492472009338, + "loss": 1.258, + "step": 1387 + }, + { + "epoch": 0.24715099715099714, + "grad_norm": 0.4226941764354706, + "learning_rate": 0.00019817226171969565, + "loss": 1.0976, + "step": 1388 + }, + { + "epoch": 0.24732905982905984, + "grad_norm": 0.4076840579509735, + "learning_rate": 0.00019816959679581637, + "loss": 1.0121, + "step": 1389 + }, + { + "epoch": 0.2475071225071225, + "grad_norm": 0.4395063519477844, + "learning_rate": 0.0001981669299485078, + "loss": 1.3153, + "step": 1390 + }, + { + "epoch": 0.24768518518518517, + "grad_norm": 0.41010400652885437, + "learning_rate": 0.0001981642611778221, + "loss": 1.0717, + "step": 1391 + }, + { + "epoch": 0.24786324786324787, + "grad_norm": 0.43459352850914, + "learning_rate": 0.00019816159048381167, + "loss": 1.1077, + "step": 1392 + }, + { + "epoch": 0.24804131054131054, + "grad_norm": 0.46291449666023254, + "learning_rate": 0.00019815891786652875, + "loss": 1.0257, + "step": 1393 + }, + { + "epoch": 0.2482193732193732, + "grad_norm": 0.46408146619796753, + "learning_rate": 0.00019815624332602578, + "loss": 0.7899, + "step": 1394 + }, + { + "epoch": 0.2483974358974359, + "grad_norm": 0.4763357937335968, + "learning_rate": 0.00019815356686235508, + "loss": 0.9857, + "step": 1395 + }, + { + "epoch": 0.24857549857549857, + "grad_norm": 0.4766457676887512, + "learning_rate": 0.00019815088847556918, + "loss": 1.0589, + "step": 1396 + }, + { + "epoch": 0.24875356125356127, + "grad_norm": 0.4486583173274994, + "learning_rate": 0.0001981482081657205, + "loss": 1.2572, + "step": 1397 + }, + { + "epoch": 0.24893162393162394, + "grad_norm": 0.468878835439682, + "learning_rate": 0.00019814552593286155, + "loss": 1.101, + "step": 1398 + }, + { + "epoch": 
0.2491096866096866, + "grad_norm": 0.4230278730392456, + "learning_rate": 0.0001981428417770449, + "loss": 0.9457, + "step": 1399 + }, + { + "epoch": 0.2492877492877493, + "grad_norm": 0.45630761981010437, + "learning_rate": 0.00019814015569832315, + "loss": 1.0665, + "step": 1400 + }, + { + "epoch": 0.24946581196581197, + "grad_norm": 0.5780113935470581, + "learning_rate": 0.00019813746769674893, + "loss": 1.1064, + "step": 1401 + }, + { + "epoch": 0.24964387464387464, + "grad_norm": 0.4343436658382416, + "learning_rate": 0.0001981347777723749, + "loss": 1.1132, + "step": 1402 + }, + { + "epoch": 0.24982193732193733, + "grad_norm": 0.4879056513309479, + "learning_rate": 0.0001981320859252537, + "loss": 1.1301, + "step": 1403 + }, + { + "epoch": 0.25, + "grad_norm": 0.5248328447341919, + "learning_rate": 0.00019812939215543818, + "loss": 1.1468, + "step": 1404 + }, + { + "epoch": 0.25, + "eval_loss": 1.115895390510559, + "eval_runtime": 25.0474, + "eval_samples_per_second": 41.561, + "eval_steps_per_second": 20.801, + "step": 1404 + }, + { + "epoch": 0.2501780626780627, + "grad_norm": 0.5076769590377808, + "learning_rate": 0.00019812669646298106, + "loss": 1.1428, + "step": 1405 + }, + { + "epoch": 0.25035612535612534, + "grad_norm": 0.5510252714157104, + "learning_rate": 0.00019812399884793514, + "loss": 1.3383, + "step": 1406 + }, + { + "epoch": 0.25053418803418803, + "grad_norm": 0.48918986320495605, + "learning_rate": 0.0001981212993103533, + "loss": 1.1507, + "step": 1407 + }, + { + "epoch": 0.25071225071225073, + "grad_norm": 0.4678935110569, + "learning_rate": 0.00019811859785028846, + "loss": 1.13, + "step": 1408 + }, + { + "epoch": 0.25089031339031337, + "grad_norm": 0.5155254602432251, + "learning_rate": 0.0001981158944677935, + "loss": 1.1194, + "step": 1409 + }, + { + "epoch": 0.25106837606837606, + "grad_norm": 0.4533839523792267, + "learning_rate": 0.00019811318916292142, + "loss": 0.9464, + "step": 1410 + }, + { + "epoch": 0.25124643874643876, + 
"grad_norm": 0.5142433047294617, + "learning_rate": 0.00019811048193572517, + "loss": 1.0837, + "step": 1411 + }, + { + "epoch": 0.2514245014245014, + "grad_norm": 0.4330446124076843, + "learning_rate": 0.00019810777278625788, + "loss": 0.9117, + "step": 1412 + }, + { + "epoch": 0.2516025641025641, + "grad_norm": 0.44806256890296936, + "learning_rate": 0.00019810506171457254, + "loss": 1.1643, + "step": 1413 + }, + { + "epoch": 0.2517806267806268, + "grad_norm": 0.43526285886764526, + "learning_rate": 0.00019810234872072235, + "loss": 0.9776, + "step": 1414 + }, + { + "epoch": 0.25195868945868943, + "grad_norm": 0.47394511103630066, + "learning_rate": 0.00019809963380476039, + "loss": 1.0935, + "step": 1415 + }, + { + "epoch": 0.25213675213675213, + "grad_norm": 0.48961278796195984, + "learning_rate": 0.00019809691696673993, + "loss": 1.179, + "step": 1416 + }, + { + "epoch": 0.2523148148148148, + "grad_norm": 0.43153589963912964, + "learning_rate": 0.00019809419820671412, + "loss": 0.906, + "step": 1417 + }, + { + "epoch": 0.25249287749287747, + "grad_norm": 0.41187527775764465, + "learning_rate": 0.00019809147752473632, + "loss": 0.899, + "step": 1418 + }, + { + "epoch": 0.25267094017094016, + "grad_norm": 0.5003183484077454, + "learning_rate": 0.00019808875492085973, + "loss": 1.0606, + "step": 1419 + }, + { + "epoch": 0.25284900284900286, + "grad_norm": 0.4430316984653473, + "learning_rate": 0.00019808603039513778, + "loss": 0.9167, + "step": 1420 + }, + { + "epoch": 0.25302706552706555, + "grad_norm": 0.4577699601650238, + "learning_rate": 0.00019808330394762382, + "loss": 1.1184, + "step": 1421 + }, + { + "epoch": 0.2532051282051282, + "grad_norm": 0.42656826972961426, + "learning_rate": 0.0001980805755783713, + "loss": 0.9335, + "step": 1422 + }, + { + "epoch": 0.2533831908831909, + "grad_norm": 0.40980881452560425, + "learning_rate": 0.0001980778452874336, + "loss": 0.9756, + "step": 1423 + }, + { + "epoch": 0.2535612535612536, + "grad_norm": 
0.5752090811729431, + "learning_rate": 0.00019807511307486423, + "loss": 1.1694, + "step": 1424 + }, + { + "epoch": 0.2537393162393162, + "grad_norm": 0.5000349283218384, + "learning_rate": 0.00019807237894071681, + "loss": 0.9515, + "step": 1425 + }, + { + "epoch": 0.2539173789173789, + "grad_norm": 0.5159069299697876, + "learning_rate": 0.00019806964288504483, + "loss": 1.4014, + "step": 1426 + }, + { + "epoch": 0.2540954415954416, + "grad_norm": 0.5377941131591797, + "learning_rate": 0.00019806690490790194, + "loss": 1.2832, + "step": 1427 + }, + { + "epoch": 0.25427350427350426, + "grad_norm": 0.4565938711166382, + "learning_rate": 0.00019806416500934174, + "loss": 1.0629, + "step": 1428 + }, + { + "epoch": 0.25445156695156695, + "grad_norm": 0.49867144227027893, + "learning_rate": 0.00019806142318941797, + "loss": 1.2011, + "step": 1429 + }, + { + "epoch": 0.25462962962962965, + "grad_norm": 0.5111994743347168, + "learning_rate": 0.00019805867944818427, + "loss": 0.8925, + "step": 1430 + }, + { + "epoch": 0.2548076923076923, + "grad_norm": 0.5204268097877502, + "learning_rate": 0.00019805593378569448, + "loss": 1.2956, + "step": 1431 + }, + { + "epoch": 0.254985754985755, + "grad_norm": 0.3889026939868927, + "learning_rate": 0.00019805318620200234, + "loss": 1.0355, + "step": 1432 + }, + { + "epoch": 0.2551638176638177, + "grad_norm": 0.46825656294822693, + "learning_rate": 0.00019805043669716174, + "loss": 1.0444, + "step": 1433 + }, + { + "epoch": 0.2553418803418803, + "grad_norm": 0.4509420394897461, + "learning_rate": 0.00019804768527122648, + "loss": 1.0423, + "step": 1434 + }, + { + "epoch": 0.255519943019943, + "grad_norm": 0.4514774978160858, + "learning_rate": 0.0001980449319242505, + "loss": 1.1588, + "step": 1435 + }, + { + "epoch": 0.2556980056980057, + "grad_norm": 0.43019044399261475, + "learning_rate": 0.0001980421766562878, + "loss": 0.9939, + "step": 1436 + }, + { + "epoch": 0.25587606837606836, + "grad_norm": 0.5056091547012329, + 
"learning_rate": 0.00019803941946739228, + "loss": 1.1238, + "step": 1437 + }, + { + "epoch": 0.25605413105413105, + "grad_norm": 0.48664605617523193, + "learning_rate": 0.000198036660357618, + "loss": 1.0702, + "step": 1438 + }, + { + "epoch": 0.25623219373219375, + "grad_norm": 0.4500972032546997, + "learning_rate": 0.000198033899327019, + "loss": 0.9365, + "step": 1439 + }, + { + "epoch": 0.2564102564102564, + "grad_norm": 0.4800589382648468, + "learning_rate": 0.0001980311363756494, + "loss": 1.1159, + "step": 1440 + }, + { + "epoch": 0.2565883190883191, + "grad_norm": 0.3486495316028595, + "learning_rate": 0.0001980283715035633, + "loss": 0.6029, + "step": 1441 + }, + { + "epoch": 0.2567663817663818, + "grad_norm": 0.46258702874183655, + "learning_rate": 0.00019802560471081493, + "loss": 1.025, + "step": 1442 + }, + { + "epoch": 0.2569444444444444, + "grad_norm": 0.4846673607826233, + "learning_rate": 0.00019802283599745844, + "loss": 1.1105, + "step": 1443 + }, + { + "epoch": 0.2571225071225071, + "grad_norm": 0.4586990475654602, + "learning_rate": 0.00019802006536354813, + "loss": 0.9897, + "step": 1444 + }, + { + "epoch": 0.2573005698005698, + "grad_norm": 0.5177786350250244, + "learning_rate": 0.00019801729280913825, + "loss": 1.2558, + "step": 1445 + }, + { + "epoch": 0.25747863247863245, + "grad_norm": 0.43213751912117004, + "learning_rate": 0.00019801451833428312, + "loss": 1.0961, + "step": 1446 + }, + { + "epoch": 0.25765669515669515, + "grad_norm": 0.42974478006362915, + "learning_rate": 0.00019801174193903714, + "loss": 1.0659, + "step": 1447 + }, + { + "epoch": 0.25783475783475784, + "grad_norm": 0.4424504339694977, + "learning_rate": 0.00019800896362345464, + "loss": 0.9805, + "step": 1448 + }, + { + "epoch": 0.25801282051282054, + "grad_norm": 0.4734833836555481, + "learning_rate": 0.0001980061833875901, + "loss": 1.255, + "step": 1449 + }, + { + "epoch": 0.2581908831908832, + "grad_norm": 0.41024845838546753, + "learning_rate": 
0.000198003401231498, + "loss": 1.0908, + "step": 1450 + }, + { + "epoch": 0.2583689458689459, + "grad_norm": 0.43603816628456116, + "learning_rate": 0.00019800061715523283, + "loss": 1.0611, + "step": 1451 + }, + { + "epoch": 0.25854700854700857, + "grad_norm": 0.4871339499950409, + "learning_rate": 0.00019799783115884915, + "loss": 1.1851, + "step": 1452 + }, + { + "epoch": 0.2587250712250712, + "grad_norm": 0.49758270382881165, + "learning_rate": 0.00019799504324240157, + "loss": 1.1936, + "step": 1453 + }, + { + "epoch": 0.2589031339031339, + "grad_norm": 0.4201010763645172, + "learning_rate": 0.00019799225340594466, + "loss": 1.1567, + "step": 1454 + }, + { + "epoch": 0.2590811965811966, + "grad_norm": 0.4200313091278076, + "learning_rate": 0.00019798946164953309, + "loss": 0.9666, + "step": 1455 + }, + { + "epoch": 0.25925925925925924, + "grad_norm": 0.43001702427864075, + "learning_rate": 0.0001979866679732216, + "loss": 1.0104, + "step": 1456 + }, + { + "epoch": 0.25943732193732194, + "grad_norm": 0.46733465790748596, + "learning_rate": 0.0001979838723770649, + "loss": 1.0927, + "step": 1457 + }, + { + "epoch": 0.25961538461538464, + "grad_norm": 0.4513280391693115, + "learning_rate": 0.00019798107486111773, + "loss": 1.0282, + "step": 1458 + }, + { + "epoch": 0.2597934472934473, + "grad_norm": 0.40411749482154846, + "learning_rate": 0.00019797827542543495, + "loss": 1.0789, + "step": 1459 + }, + { + "epoch": 0.25997150997151, + "grad_norm": 0.4359099268913269, + "learning_rate": 0.0001979754740700714, + "loss": 1.0616, + "step": 1460 + }, + { + "epoch": 0.26014957264957267, + "grad_norm": 0.4979047477245331, + "learning_rate": 0.00019797267079508198, + "loss": 1.2948, + "step": 1461 + }, + { + "epoch": 0.2603276353276353, + "grad_norm": 0.44698619842529297, + "learning_rate": 0.0001979698656005216, + "loss": 0.9198, + "step": 1462 + }, + { + "epoch": 0.260505698005698, + "grad_norm": 0.48437631130218506, + "learning_rate": 0.00019796705848644516, + "loss": 
1.3207, + "step": 1463 + }, + { + "epoch": 0.2606837606837607, + "grad_norm": 0.4382587671279907, + "learning_rate": 0.00019796424945290778, + "loss": 1.1315, + "step": 1464 + }, + { + "epoch": 0.26086182336182334, + "grad_norm": 0.4565944969654083, + "learning_rate": 0.0001979614384999644, + "loss": 1.1893, + "step": 1465 + }, + { + "epoch": 0.26103988603988604, + "grad_norm": 0.4705163836479187, + "learning_rate": 0.00019795862562767017, + "loss": 1.1132, + "step": 1466 + }, + { + "epoch": 0.26121794871794873, + "grad_norm": 0.525184690952301, + "learning_rate": 0.00019795581083608012, + "loss": 1.2111, + "step": 1467 + }, + { + "epoch": 0.2613960113960114, + "grad_norm": 0.45215457677841187, + "learning_rate": 0.00019795299412524945, + "loss": 1.1851, + "step": 1468 + }, + { + "epoch": 0.26157407407407407, + "grad_norm": 0.4336663484573364, + "learning_rate": 0.00019795017549523335, + "loss": 1.0147, + "step": 1469 + }, + { + "epoch": 0.26175213675213677, + "grad_norm": 0.5327649712562561, + "learning_rate": 0.00019794735494608703, + "loss": 1.1743, + "step": 1470 + }, + { + "epoch": 0.2619301994301994, + "grad_norm": 0.49972307682037354, + "learning_rate": 0.00019794453247786578, + "loss": 1.1624, + "step": 1471 + }, + { + "epoch": 0.2621082621082621, + "grad_norm": 0.43475785851478577, + "learning_rate": 0.00019794170809062485, + "loss": 0.9888, + "step": 1472 + }, + { + "epoch": 0.2622863247863248, + "grad_norm": 0.428838849067688, + "learning_rate": 0.0001979388817844196, + "loss": 0.9154, + "step": 1473 + }, + { + "epoch": 0.26246438746438744, + "grad_norm": 0.508568286895752, + "learning_rate": 0.00019793605355930544, + "loss": 1.1679, + "step": 1474 + }, + { + "epoch": 0.26264245014245013, + "grad_norm": 0.47791770100593567, + "learning_rate": 0.00019793322341533776, + "loss": 1.1375, + "step": 1475 + }, + { + "epoch": 0.26282051282051283, + "grad_norm": 0.41909220814704895, + "learning_rate": 0.00019793039135257196, + "loss": 1.0235, + "step": 1476 + }, 
+ { + "epoch": 0.26299857549857547, + "grad_norm": 0.5564408302307129, + "learning_rate": 0.00019792755737106361, + "loss": 1.0756, + "step": 1477 + }, + { + "epoch": 0.26317663817663817, + "grad_norm": 0.42813625931739807, + "learning_rate": 0.0001979247214708682, + "loss": 0.8213, + "step": 1478 + }, + { + "epoch": 0.26335470085470086, + "grad_norm": 0.44495970010757446, + "learning_rate": 0.00019792188365204126, + "loss": 0.9654, + "step": 1479 + }, + { + "epoch": 0.26353276353276356, + "grad_norm": 0.47473424673080444, + "learning_rate": 0.00019791904391463846, + "loss": 1.1643, + "step": 1480 + }, + { + "epoch": 0.2637108262108262, + "grad_norm": 0.40189051628112793, + "learning_rate": 0.0001979162022587154, + "loss": 0.8687, + "step": 1481 + }, + { + "epoch": 0.2638888888888889, + "grad_norm": 0.44629937410354614, + "learning_rate": 0.00019791335868432776, + "loss": 1.0284, + "step": 1482 + }, + { + "epoch": 0.2640669515669516, + "grad_norm": 0.511275053024292, + "learning_rate": 0.00019791051319153124, + "loss": 1.2217, + "step": 1483 + }, + { + "epoch": 0.26424501424501423, + "grad_norm": 0.5136445164680481, + "learning_rate": 0.00019790766578038163, + "loss": 1.1129, + "step": 1484 + }, + { + "epoch": 0.2644230769230769, + "grad_norm": 0.4450451135635376, + "learning_rate": 0.00019790481645093469, + "loss": 0.9912, + "step": 1485 + }, + { + "epoch": 0.2646011396011396, + "grad_norm": 0.39455199241638184, + "learning_rate": 0.00019790196520324621, + "loss": 1.0887, + "step": 1486 + }, + { + "epoch": 0.26477920227920226, + "grad_norm": 0.4444045126438141, + "learning_rate": 0.00019789911203737216, + "loss": 1.1559, + "step": 1487 + }, + { + "epoch": 0.26495726495726496, + "grad_norm": 0.4769677221775055, + "learning_rate": 0.0001978962569533683, + "loss": 1.147, + "step": 1488 + }, + { + "epoch": 0.26513532763532766, + "grad_norm": 0.40226617455482483, + "learning_rate": 0.0001978933999512907, + "loss": 1.0966, + "step": 1489 + }, + { + "epoch": 
0.2653133903133903, + "grad_norm": 0.4640974700450897, + "learning_rate": 0.00019789054103119526, + "loss": 1.1002, + "step": 1490 + }, + { + "epoch": 0.265491452991453, + "grad_norm": 0.48251107335090637, + "learning_rate": 0.00019788768019313806, + "loss": 1.07, + "step": 1491 + }, + { + "epoch": 0.2656695156695157, + "grad_norm": 0.4836949408054352, + "learning_rate": 0.00019788481743717506, + "loss": 1.2992, + "step": 1492 + }, + { + "epoch": 0.26584757834757833, + "grad_norm": 0.4253857135772705, + "learning_rate": 0.00019788195276336244, + "loss": 1.1326, + "step": 1493 + }, + { + "epoch": 0.266025641025641, + "grad_norm": 0.5161862373352051, + "learning_rate": 0.0001978790861717563, + "loss": 1.2131, + "step": 1494 + }, + { + "epoch": 0.2662037037037037, + "grad_norm": 0.5223346948623657, + "learning_rate": 0.00019787621766241274, + "loss": 1.0933, + "step": 1495 + }, + { + "epoch": 0.26638176638176636, + "grad_norm": 0.37622541189193726, + "learning_rate": 0.000197873347235388, + "loss": 0.8919, + "step": 1496 + }, + { + "epoch": 0.26655982905982906, + "grad_norm": 0.4425419569015503, + "learning_rate": 0.0001978704748907384, + "loss": 1.0411, + "step": 1497 + }, + { + "epoch": 0.26673789173789175, + "grad_norm": 0.4536985456943512, + "learning_rate": 0.00019786760062852015, + "loss": 1.2747, + "step": 1498 + }, + { + "epoch": 0.2669159544159544, + "grad_norm": 0.4998049736022949, + "learning_rate": 0.00019786472444878955, + "loss": 1.3214, + "step": 1499 + }, + { + "epoch": 0.2670940170940171, + "grad_norm": 0.42104312777519226, + "learning_rate": 0.00019786184635160295, + "loss": 0.7878, + "step": 1500 + }, + { + "epoch": 0.2672720797720798, + "grad_norm": 0.5354288220405579, + "learning_rate": 0.00019785896633701678, + "loss": 1.0642, + "step": 1501 + }, + { + "epoch": 0.2674501424501424, + "grad_norm": 0.4681485891342163, + "learning_rate": 0.00019785608440508744, + "loss": 1.1737, + "step": 1502 + }, + { + "epoch": 0.2676282051282051, + "grad_norm": 
0.49107062816619873, + "learning_rate": 0.0001978532005558714, + "loss": 1.1507, + "step": 1503 + }, + { + "epoch": 0.2678062678062678, + "grad_norm": 0.4173283576965332, + "learning_rate": 0.0001978503147894252, + "loss": 1.0538, + "step": 1504 + }, + { + "epoch": 0.26798433048433046, + "grad_norm": 0.49354055523872375, + "learning_rate": 0.0001978474271058053, + "loss": 1.1043, + "step": 1505 + }, + { + "epoch": 0.26816239316239315, + "grad_norm": 0.5787215232849121, + "learning_rate": 0.00019784453750506834, + "loss": 0.9245, + "step": 1506 + }, + { + "epoch": 0.26834045584045585, + "grad_norm": 0.48982590436935425, + "learning_rate": 0.00019784164598727095, + "loss": 1.2007, + "step": 1507 + }, + { + "epoch": 0.26851851851851855, + "grad_norm": 0.4971007704734802, + "learning_rate": 0.00019783875255246973, + "loss": 1.1174, + "step": 1508 + }, + { + "epoch": 0.2686965811965812, + "grad_norm": 0.5200340151786804, + "learning_rate": 0.00019783585720072142, + "loss": 1.1967, + "step": 1509 + }, + { + "epoch": 0.2688746438746439, + "grad_norm": 0.47911885380744934, + "learning_rate": 0.00019783295993208271, + "loss": 1.162, + "step": 1510 + }, + { + "epoch": 0.2690527065527066, + "grad_norm": 0.4764275848865509, + "learning_rate": 0.00019783006074661037, + "loss": 1.1358, + "step": 1511 + }, + { + "epoch": 0.2692307692307692, + "grad_norm": 0.478545606136322, + "learning_rate": 0.00019782715964436124, + "loss": 1.0096, + "step": 1512 + }, + { + "epoch": 0.2694088319088319, + "grad_norm": 0.5512787699699402, + "learning_rate": 0.00019782425662539212, + "loss": 1.1799, + "step": 1513 + }, + { + "epoch": 0.2695868945868946, + "grad_norm": 0.5495108962059021, + "learning_rate": 0.00019782135168975988, + "loss": 1.0959, + "step": 1514 + }, + { + "epoch": 0.26976495726495725, + "grad_norm": 0.42052868008613586, + "learning_rate": 0.0001978184448375215, + "loss": 1.1872, + "step": 1515 + }, + { + "epoch": 0.26994301994301995, + "grad_norm": 0.4994426965713501, + 
"learning_rate": 0.0001978155360687339, + "loss": 1.0568, + "step": 1516 + }, + { + "epoch": 0.27012108262108264, + "grad_norm": 0.459577351808548, + "learning_rate": 0.00019781262538345402, + "loss": 1.0315, + "step": 1517 + }, + { + "epoch": 0.2702991452991453, + "grad_norm": 0.4792841374874115, + "learning_rate": 0.00019780971278173895, + "loss": 1.2055, + "step": 1518 + }, + { + "epoch": 0.270477207977208, + "grad_norm": 0.5017708539962769, + "learning_rate": 0.00019780679826364575, + "loss": 1.157, + "step": 1519 + }, + { + "epoch": 0.2706552706552707, + "grad_norm": 0.5197349786758423, + "learning_rate": 0.00019780388182923152, + "loss": 0.9101, + "step": 1520 + }, + { + "epoch": 0.2708333333333333, + "grad_norm": 0.4226742684841156, + "learning_rate": 0.00019780096347855338, + "loss": 1.0525, + "step": 1521 + }, + { + "epoch": 0.271011396011396, + "grad_norm": 0.5058164596557617, + "learning_rate": 0.00019779804321166852, + "loss": 0.931, + "step": 1522 + }, + { + "epoch": 0.2711894586894587, + "grad_norm": 0.44492244720458984, + "learning_rate": 0.00019779512102863418, + "loss": 1.0641, + "step": 1523 + }, + { + "epoch": 0.27136752136752135, + "grad_norm": 0.5348989963531494, + "learning_rate": 0.00019779219692950758, + "loss": 1.1692, + "step": 1524 + }, + { + "epoch": 0.27154558404558404, + "grad_norm": 0.4631774425506592, + "learning_rate": 0.00019778927091434602, + "loss": 1.0876, + "step": 1525 + }, + { + "epoch": 0.27172364672364674, + "grad_norm": 0.45957499742507935, + "learning_rate": 0.00019778634298320684, + "loss": 0.9527, + "step": 1526 + }, + { + "epoch": 0.2719017094017094, + "grad_norm": 0.4506755769252777, + "learning_rate": 0.00019778341313614743, + "loss": 1.086, + "step": 1527 + }, + { + "epoch": 0.2720797720797721, + "grad_norm": 0.4900587797164917, + "learning_rate": 0.00019778048137322513, + "loss": 0.9911, + "step": 1528 + }, + { + "epoch": 0.27225783475783477, + "grad_norm": 0.478127658367157, + "learning_rate": 
0.00019777754769449745, + "loss": 1.2083, + "step": 1529 + }, + { + "epoch": 0.2724358974358974, + "grad_norm": 0.47220897674560547, + "learning_rate": 0.00019777461210002183, + "loss": 1.0313, + "step": 1530 + }, + { + "epoch": 0.2726139601139601, + "grad_norm": 0.4526277184486389, + "learning_rate": 0.0001977716745898558, + "loss": 1.2648, + "step": 1531 + }, + { + "epoch": 0.2727920227920228, + "grad_norm": 0.42907601594924927, + "learning_rate": 0.00019776873516405688, + "loss": 0.8645, + "step": 1532 + }, + { + "epoch": 0.27297008547008544, + "grad_norm": 0.43440163135528564, + "learning_rate": 0.00019776579382268272, + "loss": 0.9702, + "step": 1533 + }, + { + "epoch": 0.27314814814814814, + "grad_norm": 0.48213550448417664, + "learning_rate": 0.0001977628505657909, + "loss": 0.998, + "step": 1534 + }, + { + "epoch": 0.27332621082621084, + "grad_norm": 0.43385565280914307, + "learning_rate": 0.00019775990539343914, + "loss": 1.0575, + "step": 1535 + }, + { + "epoch": 0.27350427350427353, + "grad_norm": 0.45706847310066223, + "learning_rate": 0.00019775695830568507, + "loss": 1.3024, + "step": 1536 + }, + { + "epoch": 0.27368233618233617, + "grad_norm": 0.45769137144088745, + "learning_rate": 0.00019775400930258652, + "loss": 1.0987, + "step": 1537 + }, + { + "epoch": 0.27386039886039887, + "grad_norm": 0.44682395458221436, + "learning_rate": 0.00019775105838420117, + "loss": 1.1327, + "step": 1538 + }, + { + "epoch": 0.27403846153846156, + "grad_norm": 0.5923072099685669, + "learning_rate": 0.00019774810555058694, + "loss": 1.4766, + "step": 1539 + }, + { + "epoch": 0.2742165242165242, + "grad_norm": 0.4327206015586853, + "learning_rate": 0.0001977451508018016, + "loss": 1.1175, + "step": 1540 + }, + { + "epoch": 0.2743945868945869, + "grad_norm": 0.48036691546440125, + "learning_rate": 0.00019774219413790315, + "loss": 1.1189, + "step": 1541 + }, + { + "epoch": 0.2745726495726496, + "grad_norm": 0.41371914744377136, + "learning_rate": 0.00019773923555894935, 
+ "loss": 1.1366, + "step": 1542 + }, + { + "epoch": 0.27475071225071224, + "grad_norm": 0.4452378749847412, + "learning_rate": 0.00019773627506499832, + "loss": 0.9517, + "step": 1543 + }, + { + "epoch": 0.27492877492877493, + "grad_norm": 0.469098299741745, + "learning_rate": 0.00019773331265610802, + "loss": 1.0848, + "step": 1544 + }, + { + "epoch": 0.27510683760683763, + "grad_norm": 0.5390294790267944, + "learning_rate": 0.00019773034833233646, + "loss": 0.8589, + "step": 1545 + }, + { + "epoch": 0.27528490028490027, + "grad_norm": 0.5368238091468811, + "learning_rate": 0.00019772738209374174, + "loss": 1.2954, + "step": 1546 + }, + { + "epoch": 0.27546296296296297, + "grad_norm": 0.4705318510532379, + "learning_rate": 0.00019772441394038198, + "loss": 1.2252, + "step": 1547 + }, + { + "epoch": 0.27564102564102566, + "grad_norm": 0.4682813286781311, + "learning_rate": 0.00019772144387231533, + "loss": 1.0855, + "step": 1548 + }, + { + "epoch": 0.2758190883190883, + "grad_norm": 0.46876460313796997, + "learning_rate": 0.0001977184718896, + "loss": 1.1959, + "step": 1549 + }, + { + "epoch": 0.275997150997151, + "grad_norm": 0.4172806441783905, + "learning_rate": 0.00019771549799229416, + "loss": 1.2166, + "step": 1550 + }, + { + "epoch": 0.2761752136752137, + "grad_norm": 0.5088075399398804, + "learning_rate": 0.0001977125221804562, + "loss": 1.1285, + "step": 1551 + }, + { + "epoch": 0.27635327635327633, + "grad_norm": 0.4728628396987915, + "learning_rate": 0.0001977095444541443, + "loss": 1.2985, + "step": 1552 + }, + { + "epoch": 0.27653133903133903, + "grad_norm": 0.4431236684322357, + "learning_rate": 0.00019770656481341684, + "loss": 1.1298, + "step": 1553 + }, + { + "epoch": 0.2767094017094017, + "grad_norm": 0.474065363407135, + "learning_rate": 0.00019770358325833223, + "loss": 1.1915, + "step": 1554 + }, + { + "epoch": 0.27688746438746437, + "grad_norm": 0.45718875527381897, + "learning_rate": 0.00019770059978894885, + "loss": 1.0626, + "step": 1555 + 
}, + { + "epoch": 0.27706552706552706, + "grad_norm": 0.49300211668014526, + "learning_rate": 0.00019769761440532522, + "loss": 1.0134, + "step": 1556 + }, + { + "epoch": 0.27724358974358976, + "grad_norm": 0.4389498829841614, + "learning_rate": 0.00019769462710751974, + "loss": 1.0292, + "step": 1557 + }, + { + "epoch": 0.2774216524216524, + "grad_norm": 0.47330448031425476, + "learning_rate": 0.000197691637895591, + "loss": 1.1273, + "step": 1558 + }, + { + "epoch": 0.2775997150997151, + "grad_norm": 0.5322058200836182, + "learning_rate": 0.00019768864676959755, + "loss": 1.059, + "step": 1559 + }, + { + "epoch": 0.2777777777777778, + "grad_norm": 0.4714536964893341, + "learning_rate": 0.000197685653729598, + "loss": 1.1987, + "step": 1560 + }, + { + "epoch": 0.27795584045584043, + "grad_norm": 0.48687809705734253, + "learning_rate": 0.00019768265877565097, + "loss": 1.3206, + "step": 1561 + }, + { + "epoch": 0.2781339031339031, + "grad_norm": 0.46066713333129883, + "learning_rate": 0.00019767966190781518, + "loss": 1.0845, + "step": 1562 + }, + { + "epoch": 0.2783119658119658, + "grad_norm": 0.44372090697288513, + "learning_rate": 0.00019767666312614935, + "loss": 1.0942, + "step": 1563 + }, + { + "epoch": 0.27849002849002846, + "grad_norm": 0.4615907073020935, + "learning_rate": 0.00019767366243071216, + "loss": 1.071, + "step": 1564 + }, + { + "epoch": 0.27866809116809116, + "grad_norm": 0.502097487449646, + "learning_rate": 0.0001976706598215625, + "loss": 1.1164, + "step": 1565 + }, + { + "epoch": 0.27884615384615385, + "grad_norm": 0.4371815621852875, + "learning_rate": 0.00019766765529875913, + "loss": 1.0252, + "step": 1566 + }, + { + "epoch": 0.27902421652421655, + "grad_norm": 0.43035808205604553, + "learning_rate": 0.00019766464886236093, + "loss": 1.073, + "step": 1567 + }, + { + "epoch": 0.2792022792022792, + "grad_norm": 0.49721601605415344, + "learning_rate": 0.00019766164051242683, + "loss": 1.0316, + "step": 1568 + }, + { + "epoch": 
0.2793803418803419, + "grad_norm": 0.44866231083869934, + "learning_rate": 0.00019765863024901576, + "loss": 1.0951, + "step": 1569 + }, + { + "epoch": 0.2795584045584046, + "grad_norm": 0.46318337321281433, + "learning_rate": 0.0001976556180721867, + "loss": 0.9836, + "step": 1570 + }, + { + "epoch": 0.2797364672364672, + "grad_norm": 0.4227696657180786, + "learning_rate": 0.00019765260398199868, + "loss": 1.0414, + "step": 1571 + }, + { + "epoch": 0.2799145299145299, + "grad_norm": 0.6062980890274048, + "learning_rate": 0.00019764958797851073, + "loss": 1.137, + "step": 1572 + }, + { + "epoch": 0.2800925925925926, + "grad_norm": 0.4856833219528198, + "learning_rate": 0.00019764657006178196, + "loss": 1.1361, + "step": 1573 + }, + { + "epoch": 0.28027065527065526, + "grad_norm": 0.45612895488739014, + "learning_rate": 0.00019764355023187146, + "loss": 1.0005, + "step": 1574 + }, + { + "epoch": 0.28044871794871795, + "grad_norm": 0.4143696129322052, + "learning_rate": 0.00019764052848883845, + "loss": 1.051, + "step": 1575 + }, + { + "epoch": 0.28062678062678065, + "grad_norm": 0.4532071352005005, + "learning_rate": 0.00019763750483274212, + "loss": 1.0595, + "step": 1576 + }, + { + "epoch": 0.2808048433048433, + "grad_norm": 0.4940357208251953, + "learning_rate": 0.0001976344792636417, + "loss": 1.0983, + "step": 1577 + }, + { + "epoch": 0.280982905982906, + "grad_norm": 0.44405099749565125, + "learning_rate": 0.0001976314517815965, + "loss": 1.0846, + "step": 1578 + }, + { + "epoch": 0.2811609686609687, + "grad_norm": 0.5508625507354736, + "learning_rate": 0.00019762842238666578, + "loss": 1.1722, + "step": 1579 + }, + { + "epoch": 0.2813390313390313, + "grad_norm": 0.5241084694862366, + "learning_rate": 0.00019762539107890894, + "loss": 1.351, + "step": 1580 + }, + { + "epoch": 0.281517094017094, + "grad_norm": 0.5307353734970093, + "learning_rate": 0.00019762235785838537, + "loss": 1.1868, + "step": 1581 + }, + { + "epoch": 0.2816951566951567, + "grad_norm": 
0.45697924494743347, + "learning_rate": 0.00019761932272515447, + "loss": 1.1982, + "step": 1582 + }, + { + "epoch": 0.28187321937321935, + "grad_norm": 0.412483811378479, + "learning_rate": 0.00019761628567927574, + "loss": 1.0433, + "step": 1583 + }, + { + "epoch": 0.28205128205128205, + "grad_norm": 0.4614165425300598, + "learning_rate": 0.00019761324672080868, + "loss": 1.104, + "step": 1584 + }, + { + "epoch": 0.28222934472934474, + "grad_norm": 0.47644901275634766, + "learning_rate": 0.00019761020584981284, + "loss": 1.1037, + "step": 1585 + }, + { + "epoch": 0.2824074074074074, + "grad_norm": 0.4985184669494629, + "learning_rate": 0.00019760716306634773, + "loss": 1.2213, + "step": 1586 + }, + { + "epoch": 0.2825854700854701, + "grad_norm": 0.508301317691803, + "learning_rate": 0.00019760411837047305, + "loss": 1.1315, + "step": 1587 + }, + { + "epoch": 0.2827635327635328, + "grad_norm": 0.5346587300300598, + "learning_rate": 0.00019760107176224845, + "loss": 1.2281, + "step": 1588 + }, + { + "epoch": 0.2829415954415954, + "grad_norm": 0.5106825232505798, + "learning_rate": 0.00019759802324173357, + "loss": 1.2904, + "step": 1589 + }, + { + "epoch": 0.2831196581196581, + "grad_norm": 0.46458688378334045, + "learning_rate": 0.00019759497280898817, + "loss": 1.0861, + "step": 1590 + }, + { + "epoch": 0.2832977207977208, + "grad_norm": 0.49115365743637085, + "learning_rate": 0.00019759192046407201, + "loss": 1.0529, + "step": 1591 + }, + { + "epoch": 0.28347578347578345, + "grad_norm": 0.5114167332649231, + "learning_rate": 0.0001975888662070449, + "loss": 1.2555, + "step": 1592 + }, + { + "epoch": 0.28365384615384615, + "grad_norm": 0.45844775438308716, + "learning_rate": 0.0001975858100379667, + "loss": 1.0662, + "step": 1593 + }, + { + "epoch": 0.28383190883190884, + "grad_norm": 0.4684161841869354, + "learning_rate": 0.00019758275195689727, + "loss": 1.0537, + "step": 1594 + }, + { + "epoch": 0.28400997150997154, + "grad_norm": 0.4816220998764038, + 
"learning_rate": 0.0001975796919638965, + "loss": 1.126, + "step": 1595 + }, + { + "epoch": 0.2841880341880342, + "grad_norm": 0.46578118205070496, + "learning_rate": 0.0001975766300590244, + "loss": 0.9651, + "step": 1596 + }, + { + "epoch": 0.2843660968660969, + "grad_norm": 0.4181675612926483, + "learning_rate": 0.0001975735662423409, + "loss": 1.0888, + "step": 1597 + }, + { + "epoch": 0.28454415954415957, + "grad_norm": 0.49417954683303833, + "learning_rate": 0.00019757050051390609, + "loss": 1.1878, + "step": 1598 + }, + { + "epoch": 0.2847222222222222, + "grad_norm": 0.47264960408210754, + "learning_rate": 0.00019756743287377998, + "loss": 1.027, + "step": 1599 + }, + { + "epoch": 0.2849002849002849, + "grad_norm": 0.47686338424682617, + "learning_rate": 0.0001975643633220227, + "loss": 1.1307, + "step": 1600 + }, + { + "epoch": 0.2850783475783476, + "grad_norm": 0.5571266412734985, + "learning_rate": 0.00019756129185869443, + "loss": 0.984, + "step": 1601 + }, + { + "epoch": 0.28525641025641024, + "grad_norm": 0.46942809224128723, + "learning_rate": 0.00019755821848385527, + "loss": 1.0397, + "step": 1602 + }, + { + "epoch": 0.28543447293447294, + "grad_norm": 0.6325890421867371, + "learning_rate": 0.00019755514319756551, + "loss": 1.0918, + "step": 1603 + }, + { + "epoch": 0.28561253561253563, + "grad_norm": 0.5297608375549316, + "learning_rate": 0.00019755206599988533, + "loss": 0.9911, + "step": 1604 + }, + { + "epoch": 0.2857905982905983, + "grad_norm": 0.4736945331096649, + "learning_rate": 0.00019754898689087512, + "loss": 1.0786, + "step": 1605 + }, + { + "epoch": 0.28596866096866097, + "grad_norm": 0.5048685669898987, + "learning_rate": 0.00019754590587059512, + "loss": 0.9834, + "step": 1606 + }, + { + "epoch": 0.28614672364672367, + "grad_norm": 0.3823149502277374, + "learning_rate": 0.00019754282293910574, + "loss": 0.8341, + "step": 1607 + }, + { + "epoch": 0.2863247863247863, + "grad_norm": 0.44071945548057556, + "learning_rate": 
0.00019753973809646738, + "loss": 1.131, + "step": 1608 + }, + { + "epoch": 0.286502849002849, + "grad_norm": 0.44182759523391724, + "learning_rate": 0.00019753665134274043, + "loss": 1.0321, + "step": 1609 + }, + { + "epoch": 0.2866809116809117, + "grad_norm": 0.4486250877380371, + "learning_rate": 0.00019753356267798546, + "loss": 0.9941, + "step": 1610 + }, + { + "epoch": 0.28685897435897434, + "grad_norm": 0.42796584963798523, + "learning_rate": 0.00019753047210226292, + "loss": 1.0235, + "step": 1611 + }, + { + "epoch": 0.28703703703703703, + "grad_norm": 0.47294023633003235, + "learning_rate": 0.00019752737961563336, + "loss": 1.11, + "step": 1612 + }, + { + "epoch": 0.28721509971509973, + "grad_norm": 0.44550734758377075, + "learning_rate": 0.00019752428521815742, + "loss": 1.0849, + "step": 1613 + }, + { + "epoch": 0.28739316239316237, + "grad_norm": 0.44189929962158203, + "learning_rate": 0.0001975211889098957, + "loss": 0.8904, + "step": 1614 + }, + { + "epoch": 0.28757122507122507, + "grad_norm": 0.5302733182907104, + "learning_rate": 0.00019751809069090885, + "loss": 1.2348, + "step": 1615 + }, + { + "epoch": 0.28774928774928776, + "grad_norm": 0.5951390862464905, + "learning_rate": 0.00019751499056125762, + "loss": 1.3035, + "step": 1616 + }, + { + "epoch": 0.2879273504273504, + "grad_norm": 0.5431534647941589, + "learning_rate": 0.0001975118885210027, + "loss": 1.0016, + "step": 1617 + }, + { + "epoch": 0.2881054131054131, + "grad_norm": 0.47301986813545227, + "learning_rate": 0.00019750878457020489, + "loss": 1.2245, + "step": 1618 + }, + { + "epoch": 0.2882834757834758, + "grad_norm": 0.44785359501838684, + "learning_rate": 0.00019750567870892497, + "loss": 1.122, + "step": 1619 + }, + { + "epoch": 0.28846153846153844, + "grad_norm": 0.49494361877441406, + "learning_rate": 0.00019750257093722383, + "loss": 0.9421, + "step": 1620 + }, + { + "epoch": 0.28863960113960113, + "grad_norm": 0.4484521150588989, + "learning_rate": 0.00019749946125516242, + 
"loss": 1.2146, + "step": 1621 + }, + { + "epoch": 0.28881766381766383, + "grad_norm": 0.4635269343852997, + "learning_rate": 0.00019749634966280156, + "loss": 0.976, + "step": 1622 + }, + { + "epoch": 0.28899572649572647, + "grad_norm": 0.5532249808311462, + "learning_rate": 0.00019749323616020226, + "loss": 1.1818, + "step": 1623 + }, + { + "epoch": 0.28917378917378916, + "grad_norm": 0.4730629622936249, + "learning_rate": 0.00019749012074742552, + "loss": 1.0321, + "step": 1624 + }, + { + "epoch": 0.28935185185185186, + "grad_norm": 0.47437289357185364, + "learning_rate": 0.0001974870034245324, + "loss": 1.1572, + "step": 1625 + }, + { + "epoch": 0.28952991452991456, + "grad_norm": 0.4796304404735565, + "learning_rate": 0.00019748388419158394, + "loss": 1.1667, + "step": 1626 + }, + { + "epoch": 0.2897079772079772, + "grad_norm": 0.42686304450035095, + "learning_rate": 0.0001974807630486413, + "loss": 0.9824, + "step": 1627 + }, + { + "epoch": 0.2898860398860399, + "grad_norm": 0.4444865584373474, + "learning_rate": 0.00019747763999576558, + "loss": 1.2789, + "step": 1628 + }, + { + "epoch": 0.2900641025641026, + "grad_norm": 0.5039985179901123, + "learning_rate": 0.000197474515033018, + "loss": 1.1488, + "step": 1629 + }, + { + "epoch": 0.29024216524216523, + "grad_norm": 0.581479549407959, + "learning_rate": 0.00019747138816045978, + "loss": 1.1232, + "step": 1630 + }, + { + "epoch": 0.2904202279202279, + "grad_norm": 0.5415821075439453, + "learning_rate": 0.00019746825937815222, + "loss": 1.2326, + "step": 1631 + }, + { + "epoch": 0.2905982905982906, + "grad_norm": 0.45528364181518555, + "learning_rate": 0.00019746512868615656, + "loss": 1.0246, + "step": 1632 + }, + { + "epoch": 0.29077635327635326, + "grad_norm": 0.5255574584007263, + "learning_rate": 0.00019746199608453418, + "loss": 1.0592, + "step": 1633 + }, + { + "epoch": 0.29095441595441596, + "grad_norm": 0.5064096450805664, + "learning_rate": 0.00019745886157334646, + "loss": 1.3439, + "step": 1634 
+ }, + { + "epoch": 0.29113247863247865, + "grad_norm": 0.500848650932312, + "learning_rate": 0.00019745572515265475, + "loss": 1.1212, + "step": 1635 + }, + { + "epoch": 0.2913105413105413, + "grad_norm": 0.5229088068008423, + "learning_rate": 0.00019745258682252062, + "loss": 1.1019, + "step": 1636 + }, + { + "epoch": 0.291488603988604, + "grad_norm": 0.4494398832321167, + "learning_rate": 0.00019744944658300545, + "loss": 1.1298, + "step": 1637 + }, + { + "epoch": 0.2916666666666667, + "grad_norm": 0.48383277654647827, + "learning_rate": 0.00019744630443417082, + "loss": 1.206, + "step": 1638 + }, + { + "epoch": 0.2918447293447293, + "grad_norm": 0.4870131313800812, + "learning_rate": 0.00019744316037607828, + "loss": 1.2096, + "step": 1639 + }, + { + "epoch": 0.292022792022792, + "grad_norm": 0.4153090715408325, + "learning_rate": 0.00019744001440878944, + "loss": 1.0478, + "step": 1640 + }, + { + "epoch": 0.2922008547008547, + "grad_norm": 0.4262249171733856, + "learning_rate": 0.0001974368665323659, + "loss": 1.0393, + "step": 1641 + }, + { + "epoch": 0.29237891737891736, + "grad_norm": 0.46131134033203125, + "learning_rate": 0.00019743371674686938, + "loss": 1.0908, + "step": 1642 + }, + { + "epoch": 0.29255698005698005, + "grad_norm": 0.44877463579177856, + "learning_rate": 0.0001974305650523616, + "loss": 1.1906, + "step": 1643 + }, + { + "epoch": 0.29273504273504275, + "grad_norm": 0.5199326276779175, + "learning_rate": 0.00019742741144890432, + "loss": 1.1147, + "step": 1644 + }, + { + "epoch": 0.2929131054131054, + "grad_norm": 0.48142504692077637, + "learning_rate": 0.00019742425593655924, + "loss": 1.1951, + "step": 1645 + }, + { + "epoch": 0.2930911680911681, + "grad_norm": 0.5672988891601562, + "learning_rate": 0.0001974210985153883, + "loss": 1.1817, + "step": 1646 + }, + { + "epoch": 0.2932692307692308, + "grad_norm": 0.38135233521461487, + "learning_rate": 0.00019741793918545326, + "loss": 0.8567, + "step": 1647 + }, + { + "epoch": 
0.2934472934472934, + "grad_norm": 0.6153588891029358, + "learning_rate": 0.0001974147779468161, + "loss": 1.0593, + "step": 1648 + }, + { + "epoch": 0.2936253561253561, + "grad_norm": 0.38935527205467224, + "learning_rate": 0.0001974116147995387, + "loss": 0.9907, + "step": 1649 + }, + { + "epoch": 0.2938034188034188, + "grad_norm": 0.467351496219635, + "learning_rate": 0.0001974084497436831, + "loss": 1.091, + "step": 1650 + }, + { + "epoch": 0.29398148148148145, + "grad_norm": 0.45613420009613037, + "learning_rate": 0.00019740528277931128, + "loss": 0.6789, + "step": 1651 + }, + { + "epoch": 0.29415954415954415, + "grad_norm": 0.4045158326625824, + "learning_rate": 0.00019740211390648524, + "loss": 1.0727, + "step": 1652 + }, + { + "epoch": 0.29433760683760685, + "grad_norm": 0.5122803449630737, + "learning_rate": 0.00019739894312526714, + "loss": 1.2297, + "step": 1653 + }, + { + "epoch": 0.29451566951566954, + "grad_norm": 0.44304123520851135, + "learning_rate": 0.00019739577043571908, + "loss": 0.9562, + "step": 1654 + }, + { + "epoch": 0.2946937321937322, + "grad_norm": 0.6070618629455566, + "learning_rate": 0.00019739259583790322, + "loss": 1.2745, + "step": 1655 + }, + { + "epoch": 0.2948717948717949, + "grad_norm": 0.48815637826919556, + "learning_rate": 0.00019738941933188176, + "loss": 1.0574, + "step": 1656 + }, + { + "epoch": 0.2950498575498576, + "grad_norm": 0.5067802667617798, + "learning_rate": 0.00019738624091771693, + "loss": 1.1874, + "step": 1657 + }, + { + "epoch": 0.2952279202279202, + "grad_norm": 0.4956928491592407, + "learning_rate": 0.000197383060595471, + "loss": 1.1085, + "step": 1658 + }, + { + "epoch": 0.2954059829059829, + "grad_norm": 0.46313008666038513, + "learning_rate": 0.00019737987836520633, + "loss": 1.0548, + "step": 1659 + }, + { + "epoch": 0.2955840455840456, + "grad_norm": 0.49944064021110535, + "learning_rate": 0.0001973766942269852, + "loss": 1.1485, + "step": 1660 + }, + { + "epoch": 0.29576210826210825, + 
"grad_norm": 0.4743517339229584, + "learning_rate": 0.00019737350818087003, + "loss": 0.9279, + "step": 1661 + }, + { + "epoch": 0.29594017094017094, + "grad_norm": 0.45935431122779846, + "learning_rate": 0.00019737032022692326, + "loss": 0.9574, + "step": 1662 + }, + { + "epoch": 0.29611823361823364, + "grad_norm": 0.4550873637199402, + "learning_rate": 0.00019736713036520734, + "loss": 1.1642, + "step": 1663 + }, + { + "epoch": 0.2962962962962963, + "grad_norm": 0.45252951979637146, + "learning_rate": 0.00019736393859578474, + "loss": 1.0113, + "step": 1664 + }, + { + "epoch": 0.296474358974359, + "grad_norm": 0.5147238969802856, + "learning_rate": 0.00019736074491871804, + "loss": 1.1604, + "step": 1665 + }, + { + "epoch": 0.29665242165242167, + "grad_norm": 0.5122934579849243, + "learning_rate": 0.00019735754933406977, + "loss": 0.9525, + "step": 1666 + }, + { + "epoch": 0.2968304843304843, + "grad_norm": 0.438620001077652, + "learning_rate": 0.00019735435184190257, + "loss": 1.0728, + "step": 1667 + }, + { + "epoch": 0.297008547008547, + "grad_norm": 0.41970670223236084, + "learning_rate": 0.00019735115244227908, + "loss": 0.9782, + "step": 1668 + }, + { + "epoch": 0.2971866096866097, + "grad_norm": 0.5447152256965637, + "learning_rate": 0.000197347951135262, + "loss": 1.0633, + "step": 1669 + }, + { + "epoch": 0.29736467236467234, + "grad_norm": 0.4846996068954468, + "learning_rate": 0.00019734474792091407, + "loss": 0.9019, + "step": 1670 + }, + { + "epoch": 0.29754273504273504, + "grad_norm": 0.4721437990665436, + "learning_rate": 0.00019734154279929796, + "loss": 1.1793, + "step": 1671 + }, + { + "epoch": 0.29772079772079774, + "grad_norm": 0.4659852385520935, + "learning_rate": 0.00019733833577047655, + "loss": 1.1503, + "step": 1672 + }, + { + "epoch": 0.2978988603988604, + "grad_norm": 0.3733183443546295, + "learning_rate": 0.00019733512683451268, + "loss": 0.7763, + "step": 1673 + }, + { + "epoch": 0.2980769230769231, + "grad_norm": 0.4898292124271393, 
+ "learning_rate": 0.0001973319159914692, + "loss": 1.3146, + "step": 1674 + }, + { + "epoch": 0.29825498575498577, + "grad_norm": 0.41774725914001465, + "learning_rate": 0.00019732870324140899, + "loss": 1.2069, + "step": 1675 + }, + { + "epoch": 0.2984330484330484, + "grad_norm": 0.4607912003993988, + "learning_rate": 0.000197325488584395, + "loss": 1.2255, + "step": 1676 + }, + { + "epoch": 0.2986111111111111, + "grad_norm": 0.4692424237728119, + "learning_rate": 0.00019732227202049025, + "loss": 1.0793, + "step": 1677 + }, + { + "epoch": 0.2987891737891738, + "grad_norm": 0.5925022959709167, + "learning_rate": 0.00019731905354975778, + "loss": 1.0297, + "step": 1678 + }, + { + "epoch": 0.29896723646723644, + "grad_norm": 0.44047990441322327, + "learning_rate": 0.00019731583317226056, + "loss": 1.0982, + "step": 1679 + }, + { + "epoch": 0.29914529914529914, + "grad_norm": 0.5863066911697388, + "learning_rate": 0.0001973126108880618, + "loss": 1.0284, + "step": 1680 + }, + { + "epoch": 0.29932336182336183, + "grad_norm": 0.48962152004241943, + "learning_rate": 0.00019730938669722457, + "loss": 1.1861, + "step": 1681 + }, + { + "epoch": 0.29950142450142453, + "grad_norm": 0.5445577502250671, + "learning_rate": 0.00019730616059981205, + "loss": 1.2574, + "step": 1682 + }, + { + "epoch": 0.29967948717948717, + "grad_norm": 0.49327564239501953, + "learning_rate": 0.00019730293259588743, + "loss": 0.9578, + "step": 1683 + }, + { + "epoch": 0.29985754985754987, + "grad_norm": 0.4252840578556061, + "learning_rate": 0.00019729970268551398, + "loss": 1.0083, + "step": 1684 + }, + { + "epoch": 0.30003561253561256, + "grad_norm": 0.5140926241874695, + "learning_rate": 0.000197296470868755, + "loss": 1.3263, + "step": 1685 + }, + { + "epoch": 0.3002136752136752, + "grad_norm": 0.5143948197364807, + "learning_rate": 0.00019729323714567375, + "loss": 1.0424, + "step": 1686 + }, + { + "epoch": 0.3003917378917379, + "grad_norm": 0.3811354339122772, + "learning_rate": 
0.00019729000151633367, + "loss": 0.6319, + "step": 1687 + }, + { + "epoch": 0.3005698005698006, + "grad_norm": 0.5249716639518738, + "learning_rate": 0.0001972867639807981, + "loss": 1.0173, + "step": 1688 + }, + { + "epoch": 0.30074786324786323, + "grad_norm": 0.41832098364830017, + "learning_rate": 0.00019728352453913048, + "loss": 1.0503, + "step": 1689 + }, + { + "epoch": 0.30092592592592593, + "grad_norm": 0.5961149334907532, + "learning_rate": 0.00019728028319139428, + "loss": 1.1843, + "step": 1690 + }, + { + "epoch": 0.3011039886039886, + "grad_norm": 0.44083690643310547, + "learning_rate": 0.00019727703993765303, + "loss": 1.1311, + "step": 1691 + }, + { + "epoch": 0.30128205128205127, + "grad_norm": 0.4368111491203308, + "learning_rate": 0.00019727379477797022, + "loss": 0.9463, + "step": 1692 + }, + { + "epoch": 0.30146011396011396, + "grad_norm": 0.5289376974105835, + "learning_rate": 0.00019727054771240954, + "loss": 0.9836, + "step": 1693 + }, + { + "epoch": 0.30163817663817666, + "grad_norm": 0.4132843613624573, + "learning_rate": 0.00019726729874103448, + "loss": 1.1052, + "step": 1694 + }, + { + "epoch": 0.3018162393162393, + "grad_norm": 0.4919086992740631, + "learning_rate": 0.00019726404786390877, + "loss": 1.2219, + "step": 1695 + }, + { + "epoch": 0.301994301994302, + "grad_norm": 0.42561691999435425, + "learning_rate": 0.0001972607950810961, + "loss": 1.0756, + "step": 1696 + }, + { + "epoch": 0.3021723646723647, + "grad_norm": 0.5030396580696106, + "learning_rate": 0.0001972575403926602, + "loss": 1.2207, + "step": 1697 + }, + { + "epoch": 0.30235042735042733, + "grad_norm": 0.4779801666736603, + "learning_rate": 0.0001972542837986648, + "loss": 1.194, + "step": 1698 + }, + { + "epoch": 0.30252849002849, + "grad_norm": 0.45395568013191223, + "learning_rate": 0.00019725102529917377, + "loss": 1.0775, + "step": 1699 + }, + { + "epoch": 0.3027065527065527, + "grad_norm": 0.6540699005126953, + "learning_rate": 0.0001972477648942509, + "loss": 
1.181, + "step": 1700 + }, + { + "epoch": 0.30288461538461536, + "grad_norm": 0.46281275153160095, + "learning_rate": 0.00019724450258396008, + "loss": 0.629, + "step": 1701 + }, + { + "epoch": 0.30306267806267806, + "grad_norm": 0.3452845811843872, + "learning_rate": 0.00019724123836836527, + "loss": 0.51, + "step": 1702 + }, + { + "epoch": 0.30324074074074076, + "grad_norm": 0.4507991671562195, + "learning_rate": 0.00019723797224753038, + "loss": 1.0258, + "step": 1703 + }, + { + "epoch": 0.3034188034188034, + "grad_norm": 0.5385412573814392, + "learning_rate": 0.0001972347042215194, + "loss": 1.0232, + "step": 1704 + }, + { + "epoch": 0.3035968660968661, + "grad_norm": 0.4460466504096985, + "learning_rate": 0.00019723143429039642, + "loss": 1.1307, + "step": 1705 + }, + { + "epoch": 0.3037749287749288, + "grad_norm": 0.5229718685150146, + "learning_rate": 0.00019722816245422545, + "loss": 1.0964, + "step": 1706 + }, + { + "epoch": 0.30395299145299143, + "grad_norm": 0.4776979088783264, + "learning_rate": 0.00019722488871307058, + "loss": 1.2678, + "step": 1707 + }, + { + "epoch": 0.3041310541310541, + "grad_norm": 0.5371831655502319, + "learning_rate": 0.00019722161306699601, + "loss": 1.2808, + "step": 1708 + }, + { + "epoch": 0.3043091168091168, + "grad_norm": 0.45322108268737793, + "learning_rate": 0.0001972183355160659, + "loss": 1.0775, + "step": 1709 + }, + { + "epoch": 0.30448717948717946, + "grad_norm": 0.5036569833755493, + "learning_rate": 0.00019721505606034448, + "loss": 1.1859, + "step": 1710 + }, + { + "epoch": 0.30466524216524216, + "grad_norm": 0.5425969958305359, + "learning_rate": 0.00019721177469989593, + "loss": 1.0173, + "step": 1711 + }, + { + "epoch": 0.30484330484330485, + "grad_norm": 0.5638980269432068, + "learning_rate": 0.00019720849143478462, + "loss": 1.182, + "step": 1712 + }, + { + "epoch": 0.30502136752136755, + "grad_norm": 0.5160546898841858, + "learning_rate": 0.00019720520626507486, + "loss": 0.9853, + "step": 1713 + }, + { + 
"epoch": 0.3051994301994302, + "grad_norm": 0.5079004168510437, + "learning_rate": 0.000197201919190831, + "loss": 1.3154, + "step": 1714 + }, + { + "epoch": 0.3053774928774929, + "grad_norm": 0.4590355455875397, + "learning_rate": 0.00019719863021211745, + "loss": 1.007, + "step": 1715 + }, + { + "epoch": 0.3055555555555556, + "grad_norm": 0.49656423926353455, + "learning_rate": 0.00019719533932899865, + "loss": 1.2187, + "step": 1716 + }, + { + "epoch": 0.3057336182336182, + "grad_norm": 0.46426209807395935, + "learning_rate": 0.0001971920465415391, + "loss": 1.3007, + "step": 1717 + }, + { + "epoch": 0.3059116809116809, + "grad_norm": 0.5211917757987976, + "learning_rate": 0.00019718875184980328, + "loss": 1.2256, + "step": 1718 + }, + { + "epoch": 0.3060897435897436, + "grad_norm": 0.42953309416770935, + "learning_rate": 0.00019718545525385578, + "loss": 1.2838, + "step": 1719 + }, + { + "epoch": 0.30626780626780625, + "grad_norm": 0.4893105924129486, + "learning_rate": 0.00019718215675376116, + "loss": 1.052, + "step": 1720 + }, + { + "epoch": 0.30644586894586895, + "grad_norm": 0.4833602011203766, + "learning_rate": 0.00019717885634958405, + "loss": 1.069, + "step": 1721 + }, + { + "epoch": 0.30662393162393164, + "grad_norm": 0.502176821231842, + "learning_rate": 0.0001971755540413891, + "loss": 1.1659, + "step": 1722 + }, + { + "epoch": 0.3068019943019943, + "grad_norm": 0.4648856818675995, + "learning_rate": 0.00019717224982924108, + "loss": 1.1873, + "step": 1723 + }, + { + "epoch": 0.306980056980057, + "grad_norm": 0.405429869890213, + "learning_rate": 0.00019716894371320465, + "loss": 0.99, + "step": 1724 + }, + { + "epoch": 0.3071581196581197, + "grad_norm": 0.4306945204734802, + "learning_rate": 0.00019716563569334463, + "loss": 0.8751, + "step": 1725 + }, + { + "epoch": 0.3073361823361823, + "grad_norm": 0.49424824118614197, + "learning_rate": 0.00019716232576972583, + "loss": 0.9205, + "step": 1726 + }, + { + "epoch": 0.307514245014245, + 
"grad_norm": 0.5044034123420715, + "learning_rate": 0.00019715901394241306, + "loss": 1.2042, + "step": 1727 + }, + { + "epoch": 0.3076923076923077, + "grad_norm": 0.512180507183075, + "learning_rate": 0.00019715570021147126, + "loss": 1.1644, + "step": 1728 + }, + { + "epoch": 0.30787037037037035, + "grad_norm": 0.4377981126308441, + "learning_rate": 0.00019715238457696538, + "loss": 1.1625, + "step": 1729 + }, + { + "epoch": 0.30804843304843305, + "grad_norm": 0.49107855558395386, + "learning_rate": 0.00019714906703896027, + "loss": 1.1037, + "step": 1730 + }, + { + "epoch": 0.30822649572649574, + "grad_norm": 0.47342559695243835, + "learning_rate": 0.00019714574759752105, + "loss": 1.3186, + "step": 1731 + }, + { + "epoch": 0.3084045584045584, + "grad_norm": 0.487177312374115, + "learning_rate": 0.0001971424262527127, + "loss": 1.1196, + "step": 1732 + }, + { + "epoch": 0.3085826210826211, + "grad_norm": 0.5290025472640991, + "learning_rate": 0.0001971391030046003, + "loss": 1.2103, + "step": 1733 + }, + { + "epoch": 0.3087606837606838, + "grad_norm": 0.4587760269641876, + "learning_rate": 0.00019713577785324896, + "loss": 1.1017, + "step": 1734 + }, + { + "epoch": 0.3089387464387464, + "grad_norm": 0.45323294401168823, + "learning_rate": 0.00019713245079872388, + "loss": 1.0, + "step": 1735 + }, + { + "epoch": 0.3091168091168091, + "grad_norm": 0.43414804339408875, + "learning_rate": 0.00019712912184109013, + "loss": 1.0341, + "step": 1736 + }, + { + "epoch": 0.3092948717948718, + "grad_norm": 0.49604663252830505, + "learning_rate": 0.00019712579098041304, + "loss": 0.9437, + "step": 1737 + }, + { + "epoch": 0.30947293447293445, + "grad_norm": 0.48580703139305115, + "learning_rate": 0.00019712245821675785, + "loss": 1.2622, + "step": 1738 + }, + { + "epoch": 0.30965099715099714, + "grad_norm": 0.45333603024482727, + "learning_rate": 0.00019711912355018982, + "loss": 1.2063, + "step": 1739 + }, + { + "epoch": 0.30982905982905984, + "grad_norm": 
0.5990764498710632, + "learning_rate": 0.00019711578698077432, + "loss": 1.5097, + "step": 1740 + }, + { + "epoch": 0.31000712250712253, + "grad_norm": 0.4386102259159088, + "learning_rate": 0.0001971124485085767, + "loss": 1.1283, + "step": 1741 + }, + { + "epoch": 0.3101851851851852, + "grad_norm": 0.4476035237312317, + "learning_rate": 0.00019710910813366242, + "loss": 0.8922, + "step": 1742 + }, + { + "epoch": 0.31036324786324787, + "grad_norm": 0.5276228785514832, + "learning_rate": 0.00019710576585609685, + "loss": 1.2373, + "step": 1743 + }, + { + "epoch": 0.31054131054131057, + "grad_norm": 0.4885637164115906, + "learning_rate": 0.00019710242167594557, + "loss": 1.0881, + "step": 1744 + }, + { + "epoch": 0.3107193732193732, + "grad_norm": 0.421132355928421, + "learning_rate": 0.000197099075593274, + "loss": 1.0544, + "step": 1745 + }, + { + "epoch": 0.3108974358974359, + "grad_norm": 0.5257927179336548, + "learning_rate": 0.00019709572760814777, + "loss": 1.265, + "step": 1746 + }, + { + "epoch": 0.3110754985754986, + "grad_norm": 0.5164850950241089, + "learning_rate": 0.00019709237772063247, + "loss": 0.9593, + "step": 1747 + }, + { + "epoch": 0.31125356125356124, + "grad_norm": 0.5176383256912231, + "learning_rate": 0.00019708902593079374, + "loss": 1.0194, + "step": 1748 + }, + { + "epoch": 0.31143162393162394, + "grad_norm": 0.4620790481567383, + "learning_rate": 0.00019708567223869716, + "loss": 0.9241, + "step": 1749 + }, + { + "epoch": 0.31160968660968663, + "grad_norm": 0.48307979106903076, + "learning_rate": 0.00019708231664440854, + "loss": 1.2314, + "step": 1750 + }, + { + "epoch": 0.31178774928774927, + "grad_norm": 0.4931468069553375, + "learning_rate": 0.00019707895914799364, + "loss": 1.2065, + "step": 1751 + }, + { + "epoch": 0.31196581196581197, + "grad_norm": 0.5035979747772217, + "learning_rate": 0.00019707559974951818, + "loss": 1.1867, + "step": 1752 + }, + { + "epoch": 0.31214387464387466, + "grad_norm": 0.47543632984161377, + 
"learning_rate": 0.00019707223844904795, + "loss": 1.0603, + "step": 1753 + }, + { + "epoch": 0.3123219373219373, + "grad_norm": 0.49929797649383545, + "learning_rate": 0.00019706887524664892, + "loss": 1.0597, + "step": 1754 + }, + { + "epoch": 0.3125, + "grad_norm": 0.5075222253799438, + "learning_rate": 0.00019706551014238687, + "loss": 1.1398, + "step": 1755 + }, + { + "epoch": 0.3126780626780627, + "grad_norm": 0.5096884369850159, + "learning_rate": 0.00019706214313632784, + "loss": 1.1382, + "step": 1756 + }, + { + "epoch": 0.31285612535612534, + "grad_norm": 0.4629988372325897, + "learning_rate": 0.0001970587742285377, + "loss": 1.0009, + "step": 1757 + }, + { + "epoch": 0.31303418803418803, + "grad_norm": 0.5244084596633911, + "learning_rate": 0.00019705540341908253, + "loss": 1.047, + "step": 1758 + }, + { + "epoch": 0.31321225071225073, + "grad_norm": 0.5136716961860657, + "learning_rate": 0.00019705203070802832, + "loss": 1.29, + "step": 1759 + }, + { + "epoch": 0.31339031339031337, + "grad_norm": 0.43991541862487793, + "learning_rate": 0.0001970486560954412, + "loss": 0.9605, + "step": 1760 + }, + { + "epoch": 0.31356837606837606, + "grad_norm": 0.4633477032184601, + "learning_rate": 0.00019704527958138725, + "loss": 1.1507, + "step": 1761 + }, + { + "epoch": 0.31374643874643876, + "grad_norm": 0.4419999420642853, + "learning_rate": 0.00019704190116593266, + "loss": 0.9262, + "step": 1762 + }, + { + "epoch": 0.3139245014245014, + "grad_norm": 0.49359434843063354, + "learning_rate": 0.00019703852084914357, + "loss": 0.9348, + "step": 1763 + }, + { + "epoch": 0.3141025641025641, + "grad_norm": 0.5072139501571655, + "learning_rate": 0.00019703513863108627, + "loss": 1.1592, + "step": 1764 + }, + { + "epoch": 0.3142806267806268, + "grad_norm": 0.45969831943511963, + "learning_rate": 0.00019703175451182698, + "loss": 1.1519, + "step": 1765 + }, + { + "epoch": 0.31445868945868943, + "grad_norm": 0.5148758292198181, + "learning_rate": 0.00019702836849143208, + 
"loss": 1.1673, + "step": 1766 + }, + { + "epoch": 0.31463675213675213, + "grad_norm": 0.43033209443092346, + "learning_rate": 0.0001970249805699678, + "loss": 0.9256, + "step": 1767 + }, + { + "epoch": 0.3148148148148148, + "grad_norm": 0.48143425583839417, + "learning_rate": 0.00019702159074750058, + "loss": 1.08, + "step": 1768 + }, + { + "epoch": 0.31499287749287747, + "grad_norm": 0.4780619740486145, + "learning_rate": 0.00019701819902409685, + "loss": 1.1198, + "step": 1769 + }, + { + "epoch": 0.31517094017094016, + "grad_norm": 0.4662075936794281, + "learning_rate": 0.00019701480539982305, + "loss": 0.8424, + "step": 1770 + }, + { + "epoch": 0.31534900284900286, + "grad_norm": 0.503901481628418, + "learning_rate": 0.00019701140987474566, + "loss": 1.1026, + "step": 1771 + }, + { + "epoch": 0.31552706552706555, + "grad_norm": 0.5197132229804993, + "learning_rate": 0.00019700801244893124, + "loss": 1.2148, + "step": 1772 + }, + { + "epoch": 0.3157051282051282, + "grad_norm": 0.4746309220790863, + "learning_rate": 0.00019700461312244634, + "loss": 1.0906, + "step": 1773 + }, + { + "epoch": 0.3158831908831909, + "grad_norm": 0.5277339816093445, + "learning_rate": 0.00019700121189535752, + "loss": 1.0588, + "step": 1774 + }, + { + "epoch": 0.3160612535612536, + "grad_norm": 0.436002254486084, + "learning_rate": 0.00019699780876773147, + "loss": 1.0341, + "step": 1775 + }, + { + "epoch": 0.3162393162393162, + "grad_norm": 0.5171145796775818, + "learning_rate": 0.00019699440373963486, + "loss": 1.282, + "step": 1776 + }, + { + "epoch": 0.3164173789173789, + "grad_norm": 0.38382846117019653, + "learning_rate": 0.00019699099681113436, + "loss": 0.8908, + "step": 1777 + }, + { + "epoch": 0.3165954415954416, + "grad_norm": 0.4621630609035492, + "learning_rate": 0.0001969875879822968, + "loss": 1.1074, + "step": 1778 + }, + { + "epoch": 0.31677350427350426, + "grad_norm": 0.5543130040168762, + "learning_rate": 0.00019698417725318892, + "loss": 0.9682, + "step": 1779 + 
}, + { + "epoch": 0.31695156695156695, + "grad_norm": 0.49534836411476135, + "learning_rate": 0.00019698076462387753, + "loss": 1.107, + "step": 1780 + }, + { + "epoch": 0.31712962962962965, + "grad_norm": 0.48844948410987854, + "learning_rate": 0.00019697735009442956, + "loss": 1.1295, + "step": 1781 + }, + { + "epoch": 0.3173076923076923, + "grad_norm": 0.5070686936378479, + "learning_rate": 0.00019697393366491185, + "loss": 1.083, + "step": 1782 + }, + { + "epoch": 0.317485754985755, + "grad_norm": 0.47817620635032654, + "learning_rate": 0.00019697051533539134, + "loss": 1.3014, + "step": 1783 + }, + { + "epoch": 0.3176638176638177, + "grad_norm": 0.538488507270813, + "learning_rate": 0.00019696709510593502, + "loss": 1.0354, + "step": 1784 + }, + { + "epoch": 0.3178418803418803, + "grad_norm": 0.5141439437866211, + "learning_rate": 0.0001969636729766099, + "loss": 1.2912, + "step": 1785 + }, + { + "epoch": 0.318019943019943, + "grad_norm": 0.5009665489196777, + "learning_rate": 0.00019696024894748306, + "loss": 0.9014, + "step": 1786 + }, + { + "epoch": 0.3181980056980057, + "grad_norm": 0.46199744939804077, + "learning_rate": 0.00019695682301862155, + "loss": 1.0532, + "step": 1787 + }, + { + "epoch": 0.31837606837606836, + "grad_norm": 0.4649423062801361, + "learning_rate": 0.0001969533951900925, + "loss": 0.8608, + "step": 1788 + }, + { + "epoch": 0.31855413105413105, + "grad_norm": 0.516909658908844, + "learning_rate": 0.0001969499654619631, + "loss": 1.1385, + "step": 1789 + }, + { + "epoch": 0.31873219373219375, + "grad_norm": 0.46016669273376465, + "learning_rate": 0.00019694653383430048, + "loss": 0.9168, + "step": 1790 + }, + { + "epoch": 0.3189102564102564, + "grad_norm": 0.4794938564300537, + "learning_rate": 0.00019694310030717193, + "loss": 1.0244, + "step": 1791 + }, + { + "epoch": 0.3190883190883191, + "grad_norm": 0.46577662229537964, + "learning_rate": 0.00019693966488064471, + "loss": 1.0954, + "step": 1792 + }, + { + "epoch": 
0.3192663817663818, + "grad_norm": 0.4866746962070465, + "learning_rate": 0.00019693622755478614, + "loss": 1.2925, + "step": 1793 + }, + { + "epoch": 0.3194444444444444, + "grad_norm": 0.4841702878475189, + "learning_rate": 0.00019693278832966357, + "loss": 1.119, + "step": 1794 + }, + { + "epoch": 0.3196225071225071, + "grad_norm": 0.4835243821144104, + "learning_rate": 0.00019692934720534435, + "loss": 1.1702, + "step": 1795 + }, + { + "epoch": 0.3198005698005698, + "grad_norm": 0.5200608968734741, + "learning_rate": 0.00019692590418189594, + "loss": 1.1989, + "step": 1796 + }, + { + "epoch": 0.31997863247863245, + "grad_norm": 0.5147821307182312, + "learning_rate": 0.00019692245925938577, + "loss": 1.1417, + "step": 1797 + }, + { + "epoch": 0.32015669515669515, + "grad_norm": 0.5145614743232727, + "learning_rate": 0.00019691901243788136, + "loss": 1.0571, + "step": 1798 + }, + { + "epoch": 0.32033475783475784, + "grad_norm": 0.5416026711463928, + "learning_rate": 0.00019691556371745022, + "loss": 1.188, + "step": 1799 + }, + { + "epoch": 0.32051282051282054, + "grad_norm": 0.5140644311904907, + "learning_rate": 0.00019691211309815995, + "loss": 1.1795, + "step": 1800 + }, + { + "epoch": 0.3206908831908832, + "grad_norm": 0.44219106435775757, + "learning_rate": 0.00019690866058007817, + "loss": 0.9215, + "step": 1801 + }, + { + "epoch": 0.3208689458689459, + "grad_norm": 0.49523603916168213, + "learning_rate": 0.00019690520616327245, + "loss": 1.1117, + "step": 1802 + }, + { + "epoch": 0.32104700854700857, + "grad_norm": 0.5818293690681458, + "learning_rate": 0.0001969017498478105, + "loss": 1.16, + "step": 1803 + }, + { + "epoch": 0.3212250712250712, + "grad_norm": 0.5175749659538269, + "learning_rate": 0.0001968982916337601, + "loss": 1.1999, + "step": 1804 + }, + { + "epoch": 0.3214031339031339, + "grad_norm": 0.49916017055511475, + "learning_rate": 0.00019689483152118898, + "loss": 0.9505, + "step": 1805 + }, + { + "epoch": 0.3215811965811966, + "grad_norm": 
0.46849536895751953, + "learning_rate": 0.00019689136951016488, + "loss": 0.9627, + "step": 1806 + }, + { + "epoch": 0.32175925925925924, + "grad_norm": 0.4226818382740021, + "learning_rate": 0.00019688790560075568, + "loss": 1.037, + "step": 1807 + }, + { + "epoch": 0.32193732193732194, + "grad_norm": 0.4697103798389435, + "learning_rate": 0.00019688443979302923, + "loss": 1.1431, + "step": 1808 + }, + { + "epoch": 0.32211538461538464, + "grad_norm": 0.4999365508556366, + "learning_rate": 0.00019688097208705343, + "loss": 1.171, + "step": 1809 + }, + { + "epoch": 0.3222934472934473, + "grad_norm": 0.5229731798171997, + "learning_rate": 0.00019687750248289625, + "loss": 1.3395, + "step": 1810 + }, + { + "epoch": 0.32247150997151, + "grad_norm": 0.512525737285614, + "learning_rate": 0.00019687403098062566, + "loss": 1.1438, + "step": 1811 + }, + { + "epoch": 0.32264957264957267, + "grad_norm": 0.4558548927307129, + "learning_rate": 0.00019687055758030967, + "loss": 1.0012, + "step": 1812 + }, + { + "epoch": 0.3228276353276353, + "grad_norm": 0.45195743441581726, + "learning_rate": 0.00019686708228201636, + "loss": 1.0222, + "step": 1813 + }, + { + "epoch": 0.323005698005698, + "grad_norm": 0.5023126602172852, + "learning_rate": 0.00019686360508581373, + "loss": 1.2128, + "step": 1814 + }, + { + "epoch": 0.3231837606837607, + "grad_norm": 0.46516045928001404, + "learning_rate": 0.00019686012599177003, + "loss": 0.989, + "step": 1815 + }, + { + "epoch": 0.32336182336182334, + "grad_norm": 0.4142672121524811, + "learning_rate": 0.00019685664499995338, + "loss": 1.0144, + "step": 1816 + }, + { + "epoch": 0.32353988603988604, + "grad_norm": 0.4511009752750397, + "learning_rate": 0.0001968531621104319, + "loss": 0.885, + "step": 1817 + }, + { + "epoch": 0.32371794871794873, + "grad_norm": 0.49583545327186584, + "learning_rate": 0.00019684967732327396, + "loss": 1.0986, + "step": 1818 + }, + { + "epoch": 0.3238960113960114, + "grad_norm": 0.5872161388397217, + 
"learning_rate": 0.0001968461906385478, + "loss": 1.1482, + "step": 1819 + }, + { + "epoch": 0.32407407407407407, + "grad_norm": 0.4509563148021698, + "learning_rate": 0.00019684270205632168, + "loss": 1.0578, + "step": 1820 + }, + { + "epoch": 0.32425213675213677, + "grad_norm": 0.501345157623291, + "learning_rate": 0.00019683921157666402, + "loss": 1.1792, + "step": 1821 + }, + { + "epoch": 0.3244301994301994, + "grad_norm": 0.48257577419281006, + "learning_rate": 0.00019683571919964314, + "loss": 1.0448, + "step": 1822 + }, + { + "epoch": 0.3246082621082621, + "grad_norm": 0.5399422645568848, + "learning_rate": 0.00019683222492532752, + "loss": 1.0579, + "step": 1823 + }, + { + "epoch": 0.3247863247863248, + "grad_norm": 0.4382506012916565, + "learning_rate": 0.0001968287287537856, + "loss": 1.0246, + "step": 1824 + }, + { + "epoch": 0.32496438746438744, + "grad_norm": 0.49247491359710693, + "learning_rate": 0.00019682523068508586, + "loss": 1.318, + "step": 1825 + }, + { + "epoch": 0.32514245014245013, + "grad_norm": 0.49067625403404236, + "learning_rate": 0.0001968217307192969, + "loss": 1.1028, + "step": 1826 + }, + { + "epoch": 0.32532051282051283, + "grad_norm": 0.4832286238670349, + "learning_rate": 0.00019681822885648723, + "loss": 1.0996, + "step": 1827 + }, + { + "epoch": 0.32549857549857547, + "grad_norm": 0.47144386172294617, + "learning_rate": 0.0001968147250967255, + "loss": 1.0707, + "step": 1828 + }, + { + "epoch": 0.32567663817663817, + "grad_norm": 0.46299225091934204, + "learning_rate": 0.0001968112194400803, + "loss": 1.0461, + "step": 1829 + }, + { + "epoch": 0.32585470085470086, + "grad_norm": 0.4880816340446472, + "learning_rate": 0.00019680771188662044, + "loss": 1.1198, + "step": 1830 + }, + { + "epoch": 0.32603276353276356, + "grad_norm": 0.43837276101112366, + "learning_rate": 0.00019680420243641452, + "loss": 1.0599, + "step": 1831 + }, + { + "epoch": 0.3262108262108262, + "grad_norm": 0.453168660402298, + "learning_rate": 
0.0001968006910895314, + "loss": 1.0327, + "step": 1832 + }, + { + "epoch": 0.3263888888888889, + "grad_norm": 0.45183828473091125, + "learning_rate": 0.00019679717784603975, + "loss": 1.1381, + "step": 1833 + }, + { + "epoch": 0.3265669515669516, + "grad_norm": 0.5326765775680542, + "learning_rate": 0.00019679366270600852, + "loss": 1.3169, + "step": 1834 + }, + { + "epoch": 0.32674501424501423, + "grad_norm": 0.47468429803848267, + "learning_rate": 0.00019679014566950653, + "loss": 1.1816, + "step": 1835 + }, + { + "epoch": 0.3269230769230769, + "grad_norm": 0.5096879005432129, + "learning_rate": 0.0001967866267366027, + "loss": 1.1162, + "step": 1836 + }, + { + "epoch": 0.3271011396011396, + "grad_norm": 0.491514652967453, + "learning_rate": 0.00019678310590736598, + "loss": 1.2793, + "step": 1837 + }, + { + "epoch": 0.32727920227920226, + "grad_norm": 0.601439356803894, + "learning_rate": 0.00019677958318186533, + "loss": 0.9851, + "step": 1838 + }, + { + "epoch": 0.32745726495726496, + "grad_norm": 0.45270970463752747, + "learning_rate": 0.0001967760585601698, + "loss": 1.0042, + "step": 1839 + }, + { + "epoch": 0.32763532763532766, + "grad_norm": 0.48864325881004333, + "learning_rate": 0.00019677253204234847, + "loss": 1.0835, + "step": 1840 + }, + { + "epoch": 0.3278133903133903, + "grad_norm": 0.5855685472488403, + "learning_rate": 0.00019676900362847037, + "loss": 1.193, + "step": 1841 + }, + { + "epoch": 0.327991452991453, + "grad_norm": 0.7181013822555542, + "learning_rate": 0.00019676547331860466, + "loss": 1.2028, + "step": 1842 + }, + { + "epoch": 0.3281695156695157, + "grad_norm": 0.4517378807067871, + "learning_rate": 0.00019676194111282054, + "loss": 1.013, + "step": 1843 + }, + { + "epoch": 0.32834757834757833, + "grad_norm": 0.5477756857872009, + "learning_rate": 0.00019675840701118718, + "loss": 1.2311, + "step": 1844 + }, + { + "epoch": 0.328525641025641, + "grad_norm": 0.5194997191429138, + "learning_rate": 0.00019675487101377382, + "loss": 
1.0953, + "step": 1845 + }, + { + "epoch": 0.3287037037037037, + "grad_norm": 0.44454067945480347, + "learning_rate": 0.00019675133312064977, + "loss": 0.8505, + "step": 1846 + }, + { + "epoch": 0.32888176638176636, + "grad_norm": 0.3938713073730469, + "learning_rate": 0.00019674779333188428, + "loss": 0.8525, + "step": 1847 + }, + { + "epoch": 0.32905982905982906, + "grad_norm": 0.4927884340286255, + "learning_rate": 0.00019674425164754682, + "loss": 1.2477, + "step": 1848 + }, + { + "epoch": 0.32923789173789175, + "grad_norm": 0.4516635239124298, + "learning_rate": 0.0001967407080677067, + "loss": 0.8333, + "step": 1849 + }, + { + "epoch": 0.3294159544159544, + "grad_norm": 0.47105780243873596, + "learning_rate": 0.00019673716259243336, + "loss": 1.0989, + "step": 1850 + }, + { + "epoch": 0.3295940170940171, + "grad_norm": 0.5192127823829651, + "learning_rate": 0.00019673361522179627, + "loss": 1.1164, + "step": 1851 + }, + { + "epoch": 0.3297720797720798, + "grad_norm": 0.5222696661949158, + "learning_rate": 0.00019673006595586495, + "loss": 1.3191, + "step": 1852 + }, + { + "epoch": 0.3299501424501424, + "grad_norm": 0.6046679019927979, + "learning_rate": 0.0001967265147947089, + "loss": 0.9782, + "step": 1853 + }, + { + "epoch": 0.3301282051282051, + "grad_norm": 0.47928622364997864, + "learning_rate": 0.00019672296173839775, + "loss": 1.2247, + "step": 1854 + }, + { + "epoch": 0.3303062678062678, + "grad_norm": 0.5435982346534729, + "learning_rate": 0.00019671940678700107, + "loss": 1.1647, + "step": 1855 + }, + { + "epoch": 0.33048433048433046, + "grad_norm": 0.46878984570503235, + "learning_rate": 0.00019671584994058856, + "loss": 1.132, + "step": 1856 + }, + { + "epoch": 0.33066239316239315, + "grad_norm": 0.5336877107620239, + "learning_rate": 0.00019671229119922986, + "loss": 1.0583, + "step": 1857 + }, + { + "epoch": 0.33084045584045585, + "grad_norm": 0.4811093807220459, + "learning_rate": 0.0001967087305629947, + "loss": 1.0089, + "step": 1858 + }, + 
{ + "epoch": 0.33101851851851855, + "grad_norm": 0.5140184760093689, + "learning_rate": 0.0001967051680319529, + "loss": 1.2335, + "step": 1859 + }, + { + "epoch": 0.3311965811965812, + "grad_norm": 0.5855883955955505, + "learning_rate": 0.00019670160360617418, + "loss": 1.1107, + "step": 1860 + }, + { + "epoch": 0.3313746438746439, + "grad_norm": 0.5081531405448914, + "learning_rate": 0.00019669803728572844, + "loss": 1.0669, + "step": 1861 + }, + { + "epoch": 0.3315527065527066, + "grad_norm": 0.48749417066574097, + "learning_rate": 0.0001966944690706855, + "loss": 1.1465, + "step": 1862 + }, + { + "epoch": 0.3317307692307692, + "grad_norm": 0.5175687670707703, + "learning_rate": 0.00019669089896111536, + "loss": 1.254, + "step": 1863 + }, + { + "epoch": 0.3319088319088319, + "grad_norm": 0.4198860824108124, + "learning_rate": 0.0001966873269570879, + "loss": 0.9811, + "step": 1864 + }, + { + "epoch": 0.3320868945868946, + "grad_norm": 0.5220273733139038, + "learning_rate": 0.0001966837530586731, + "loss": 1.277, + "step": 1865 + }, + { + "epoch": 0.33226495726495725, + "grad_norm": 0.551954448223114, + "learning_rate": 0.00019668017726594101, + "loss": 1.0627, + "step": 1866 + }, + { + "epoch": 0.33244301994301995, + "grad_norm": 0.5289301872253418, + "learning_rate": 0.00019667659957896166, + "loss": 1.4525, + "step": 1867 + }, + { + "epoch": 0.33262108262108264, + "grad_norm": 0.5190161466598511, + "learning_rate": 0.00019667301999780522, + "loss": 1.1064, + "step": 1868 + }, + { + "epoch": 0.3327991452991453, + "grad_norm": 0.437637060880661, + "learning_rate": 0.00019666943852254172, + "loss": 1.1304, + "step": 1869 + }, + { + "epoch": 0.332977207977208, + "grad_norm": 0.4801286458969116, + "learning_rate": 0.00019666585515324138, + "loss": 1.032, + "step": 1870 + }, + { + "epoch": 0.3331552706552707, + "grad_norm": 0.5041908621788025, + "learning_rate": 0.00019666226988997445, + "loss": 1.2611, + "step": 1871 + }, + { + "epoch": 0.3333333333333333, + 
"grad_norm": 0.4529375731945038, + "learning_rate": 0.00019665868273281115, + "loss": 1.1346, + "step": 1872 + }, + { + "epoch": 0.333511396011396, + "grad_norm": 0.4797019064426422, + "learning_rate": 0.00019665509368182172, + "loss": 1.1716, + "step": 1873 + }, + { + "epoch": 0.3336894586894587, + "grad_norm": 0.5505055785179138, + "learning_rate": 0.00019665150273707652, + "loss": 0.9729, + "step": 1874 + }, + { + "epoch": 0.33386752136752135, + "grad_norm": 0.4228051006793976, + "learning_rate": 0.00019664790989864592, + "loss": 0.9023, + "step": 1875 + }, + { + "epoch": 0.33404558404558404, + "grad_norm": 0.4926959276199341, + "learning_rate": 0.00019664431516660028, + "loss": 1.0999, + "step": 1876 + }, + { + "epoch": 0.33422364672364674, + "grad_norm": 0.4273219704627991, + "learning_rate": 0.00019664071854101005, + "loss": 1.1039, + "step": 1877 + }, + { + "epoch": 0.3344017094017094, + "grad_norm": 0.48438936471939087, + "learning_rate": 0.00019663712002194566, + "loss": 1.1308, + "step": 1878 + }, + { + "epoch": 0.3345797720797721, + "grad_norm": 0.5102053284645081, + "learning_rate": 0.0001966335196094777, + "loss": 1.0618, + "step": 1879 + }, + { + "epoch": 0.33475783475783477, + "grad_norm": 0.4357300400733948, + "learning_rate": 0.00019662991730367663, + "loss": 1.0521, + "step": 1880 + }, + { + "epoch": 0.3349358974358974, + "grad_norm": 0.5052695870399475, + "learning_rate": 0.00019662631310461308, + "loss": 0.9579, + "step": 1881 + }, + { + "epoch": 0.3351139601139601, + "grad_norm": 0.4889117181301117, + "learning_rate": 0.00019662270701235762, + "loss": 1.0304, + "step": 1882 + }, + { + "epoch": 0.3352920227920228, + "grad_norm": 0.4671195149421692, + "learning_rate": 0.000196619099026981, + "loss": 1.2228, + "step": 1883 + }, + { + "epoch": 0.33547008547008544, + "grad_norm": 0.4700174331665039, + "learning_rate": 0.0001966154891485538, + "loss": 0.9634, + "step": 1884 + }, + { + "epoch": 0.33564814814814814, + "grad_norm": 0.488817423582077, + 
"learning_rate": 0.00019661187737714676, + "loss": 1.2499, + "step": 1885 + }, + { + "epoch": 0.33582621082621084, + "grad_norm": 0.5336169600486755, + "learning_rate": 0.00019660826371283073, + "loss": 1.251, + "step": 1886 + }, + { + "epoch": 0.33600427350427353, + "grad_norm": 0.5054540038108826, + "learning_rate": 0.00019660464815567642, + "loss": 1.221, + "step": 1887 + }, + { + "epoch": 0.33618233618233617, + "grad_norm": 0.5078747868537903, + "learning_rate": 0.00019660103070575472, + "loss": 0.9792, + "step": 1888 + }, + { + "epoch": 0.33636039886039887, + "grad_norm": 0.498571515083313, + "learning_rate": 0.0001965974113631365, + "loss": 1.1682, + "step": 1889 + }, + { + "epoch": 0.33653846153846156, + "grad_norm": 0.49969518184661865, + "learning_rate": 0.00019659379012789264, + "loss": 1.0012, + "step": 1890 + }, + { + "epoch": 0.3367165242165242, + "grad_norm": 0.4238094687461853, + "learning_rate": 0.00019659016700009416, + "loss": 1.0455, + "step": 1891 + }, + { + "epoch": 0.3368945868945869, + "grad_norm": 0.5139104723930359, + "learning_rate": 0.000196586541979812, + "loss": 0.9979, + "step": 1892 + }, + { + "epoch": 0.3370726495726496, + "grad_norm": 0.5446547269821167, + "learning_rate": 0.00019658291506711715, + "loss": 0.9271, + "step": 1893 + }, + { + "epoch": 0.33725071225071224, + "grad_norm": 0.5284572839736938, + "learning_rate": 0.00019657928626208077, + "loss": 1.0356, + "step": 1894 + }, + { + "epoch": 0.33742877492877493, + "grad_norm": 0.49936217069625854, + "learning_rate": 0.00019657565556477387, + "loss": 0.9785, + "step": 1895 + }, + { + "epoch": 0.33760683760683763, + "grad_norm": 0.4678729772567749, + "learning_rate": 0.00019657202297526763, + "loss": 1.2135, + "step": 1896 + }, + { + "epoch": 0.33778490028490027, + "grad_norm": 0.46844249963760376, + "learning_rate": 0.0001965683884936332, + "loss": 0.9369, + "step": 1897 + }, + { + "epoch": 0.33796296296296297, + "grad_norm": 0.4307389557361603, + "learning_rate": 
0.0001965647521199418, + "loss": 0.9301, + "step": 1898 + }, + { + "epoch": 0.33814102564102566, + "grad_norm": 0.48227834701538086, + "learning_rate": 0.00019656111385426468, + "loss": 1.3169, + "step": 1899 + }, + { + "epoch": 0.3383190883190883, + "grad_norm": 0.45860713720321655, + "learning_rate": 0.00019655747369667315, + "loss": 0.9835, + "step": 1900 + }, + { + "epoch": 0.338497150997151, + "grad_norm": 0.5522414445877075, + "learning_rate": 0.00019655383164723846, + "loss": 1.363, + "step": 1901 + }, + { + "epoch": 0.3386752136752137, + "grad_norm": 0.5283710360527039, + "learning_rate": 0.000196550187706032, + "loss": 1.1499, + "step": 1902 + }, + { + "epoch": 0.33885327635327633, + "grad_norm": 0.4419134259223938, + "learning_rate": 0.00019654654187312525, + "loss": 1.2039, + "step": 1903 + }, + { + "epoch": 0.33903133903133903, + "grad_norm": 0.49066096544265747, + "learning_rate": 0.00019654289414858952, + "loss": 0.9707, + "step": 1904 + }, + { + "epoch": 0.3392094017094017, + "grad_norm": 0.4619338810443878, + "learning_rate": 0.00019653924453249633, + "loss": 1.0849, + "step": 1905 + }, + { + "epoch": 0.33938746438746437, + "grad_norm": 0.5191119313240051, + "learning_rate": 0.0001965355930249172, + "loss": 1.1387, + "step": 1906 + }, + { + "epoch": 0.33956552706552706, + "grad_norm": 0.5245711207389832, + "learning_rate": 0.00019653193962592368, + "loss": 1.3435, + "step": 1907 + }, + { + "epoch": 0.33974358974358976, + "grad_norm": 0.49562904238700867, + "learning_rate": 0.0001965282843355873, + "loss": 1.2781, + "step": 1908 + }, + { + "epoch": 0.3399216524216524, + "grad_norm": 0.4661353826522827, + "learning_rate": 0.0001965246271539797, + "loss": 0.9317, + "step": 1909 + }, + { + "epoch": 0.3400997150997151, + "grad_norm": 0.4723222851753235, + "learning_rate": 0.00019652096808117254, + "loss": 1.0733, + "step": 1910 + }, + { + "epoch": 0.3402777777777778, + "grad_norm": 0.4358505308628082, + "learning_rate": 0.00019651730711723754, + "loss": 
1.1461, + "step": 1911 + }, + { + "epoch": 0.34045584045584043, + "grad_norm": 0.462422251701355, + "learning_rate": 0.00019651364426224638, + "loss": 1.0914, + "step": 1912 + }, + { + "epoch": 0.3406339031339031, + "grad_norm": 0.47952914237976074, + "learning_rate": 0.0001965099795162709, + "loss": 1.0392, + "step": 1913 + }, + { + "epoch": 0.3408119658119658, + "grad_norm": 0.5036373734474182, + "learning_rate": 0.00019650631287938282, + "loss": 1.4002, + "step": 1914 + }, + { + "epoch": 0.34099002849002846, + "grad_norm": 0.5130090713500977, + "learning_rate": 0.000196502644351654, + "loss": 1.3499, + "step": 1915 + }, + { + "epoch": 0.34116809116809116, + "grad_norm": 0.4426332414150238, + "learning_rate": 0.00019649897393315635, + "loss": 1.0726, + "step": 1916 + }, + { + "epoch": 0.34134615384615385, + "grad_norm": 0.5580727458000183, + "learning_rate": 0.00019649530162396176, + "loss": 1.1164, + "step": 1917 + }, + { + "epoch": 0.34152421652421655, + "grad_norm": 0.545001745223999, + "learning_rate": 0.00019649162742414218, + "loss": 0.962, + "step": 1918 + }, + { + "epoch": 0.3417022792022792, + "grad_norm": 0.5225808024406433, + "learning_rate": 0.00019648795133376962, + "loss": 1.1415, + "step": 1919 + }, + { + "epoch": 0.3418803418803419, + "grad_norm": 0.48210129141807556, + "learning_rate": 0.0001964842733529161, + "loss": 1.1188, + "step": 1920 + }, + { + "epoch": 0.3420584045584046, + "grad_norm": 0.4515395164489746, + "learning_rate": 0.00019648059348165365, + "loss": 1.0828, + "step": 1921 + }, + { + "epoch": 0.3422364672364672, + "grad_norm": 0.5802633166313171, + "learning_rate": 0.0001964769117200544, + "loss": 1.3137, + "step": 1922 + }, + { + "epoch": 0.3424145299145299, + "grad_norm": 0.4432032108306885, + "learning_rate": 0.00019647322806819046, + "loss": 1.0523, + "step": 1923 + }, + { + "epoch": 0.3425925925925926, + "grad_norm": 0.4697614908218384, + "learning_rate": 0.00019646954252613402, + "loss": 0.8426, + "step": 1924 + }, + { + 
"epoch": 0.34277065527065526, + "grad_norm": 0.4610968232154846, + "learning_rate": 0.0001964658550939573, + "loss": 0.9826, + "step": 1925 + }, + { + "epoch": 0.34294871794871795, + "grad_norm": 0.5278257727622986, + "learning_rate": 0.00019646216577173258, + "loss": 1.1064, + "step": 1926 + }, + { + "epoch": 0.34312678062678065, + "grad_norm": 0.5686144232749939, + "learning_rate": 0.00019645847455953205, + "loss": 0.9138, + "step": 1927 + }, + { + "epoch": 0.3433048433048433, + "grad_norm": 0.42894792556762695, + "learning_rate": 0.0001964547814574281, + "loss": 1.0461, + "step": 1928 + }, + { + "epoch": 0.343482905982906, + "grad_norm": 0.5567317605018616, + "learning_rate": 0.0001964510864654931, + "loss": 0.8787, + "step": 1929 + }, + { + "epoch": 0.3436609686609687, + "grad_norm": 0.5015586614608765, + "learning_rate": 0.0001964473895837994, + "loss": 1.1406, + "step": 1930 + }, + { + "epoch": 0.3438390313390313, + "grad_norm": 0.47391530871391296, + "learning_rate": 0.00019644369081241948, + "loss": 1.0685, + "step": 1931 + }, + { + "epoch": 0.344017094017094, + "grad_norm": 0.546037495136261, + "learning_rate": 0.00019643999015142574, + "loss": 1.2349, + "step": 1932 + }, + { + "epoch": 0.3441951566951567, + "grad_norm": 0.4724953770637512, + "learning_rate": 0.00019643628760089078, + "loss": 1.0621, + "step": 1933 + }, + { + "epoch": 0.34437321937321935, + "grad_norm": 0.5644593834877014, + "learning_rate": 0.00019643258316088703, + "loss": 1.2559, + "step": 1934 + }, + { + "epoch": 0.34455128205128205, + "grad_norm": 0.500815749168396, + "learning_rate": 0.00019642887683148718, + "loss": 1.0439, + "step": 1935 + }, + { + "epoch": 0.34472934472934474, + "grad_norm": 0.4932316541671753, + "learning_rate": 0.0001964251686127638, + "loss": 1.0404, + "step": 1936 + }, + { + "epoch": 0.3449074074074074, + "grad_norm": 0.48494651913642883, + "learning_rate": 0.00019642145850478954, + "loss": 0.9951, + "step": 1937 + }, + { + "epoch": 0.3450854700854701, + 
"grad_norm": 0.5191963315010071, + "learning_rate": 0.00019641774650763706, + "loss": 1.1258, + "step": 1938 + }, + { + "epoch": 0.3452635327635328, + "grad_norm": 0.4439312815666199, + "learning_rate": 0.00019641403262137918, + "loss": 1.1158, + "step": 1939 + }, + { + "epoch": 0.3454415954415954, + "grad_norm": 0.4829137921333313, + "learning_rate": 0.0001964103168460886, + "loss": 1.0531, + "step": 1940 + }, + { + "epoch": 0.3456196581196581, + "grad_norm": 0.49433329701423645, + "learning_rate": 0.00019640659918183811, + "loss": 1.1295, + "step": 1941 + }, + { + "epoch": 0.3457977207977208, + "grad_norm": 0.5351347923278809, + "learning_rate": 0.00019640287962870062, + "loss": 1.2379, + "step": 1942 + }, + { + "epoch": 0.34597578347578345, + "grad_norm": 0.4845680892467499, + "learning_rate": 0.00019639915818674895, + "loss": 1.0197, + "step": 1943 + }, + { + "epoch": 0.34615384615384615, + "grad_norm": 0.5312514901161194, + "learning_rate": 0.00019639543485605604, + "loss": 0.9734, + "step": 1944 + }, + { + "epoch": 0.34633190883190884, + "grad_norm": 0.4571874737739563, + "learning_rate": 0.00019639170963669478, + "loss": 1.1012, + "step": 1945 + }, + { + "epoch": 0.34650997150997154, + "grad_norm": 0.4449031949043274, + "learning_rate": 0.00019638798252873824, + "loss": 1.1393, + "step": 1946 + }, + { + "epoch": 0.3466880341880342, + "grad_norm": 0.47470834851264954, + "learning_rate": 0.0001963842535322594, + "loss": 0.981, + "step": 1947 + }, + { + "epoch": 0.3468660968660969, + "grad_norm": 0.5386981964111328, + "learning_rate": 0.00019638052264733132, + "loss": 1.1247, + "step": 1948 + }, + { + "epoch": 0.34704415954415957, + "grad_norm": 0.535589873790741, + "learning_rate": 0.00019637678987402714, + "loss": 1.3157, + "step": 1949 + }, + { + "epoch": 0.3472222222222222, + "grad_norm": 0.49338245391845703, + "learning_rate": 0.00019637305521242, + "loss": 1.1066, + "step": 1950 + }, + { + "epoch": 0.3474002849002849, + "grad_norm": 0.4247688353061676, + 
"learning_rate": 0.00019636931866258298, + "loss": 1.0039, + "step": 1951 + }, + { + "epoch": 0.3475783475783476, + "grad_norm": 0.5351517200469971, + "learning_rate": 0.00019636558022458934, + "loss": 1.0344, + "step": 1952 + }, + { + "epoch": 0.34775641025641024, + "grad_norm": 0.4633362889289856, + "learning_rate": 0.00019636183989851238, + "loss": 1.1383, + "step": 1953 + }, + { + "epoch": 0.34793447293447294, + "grad_norm": 0.553709089756012, + "learning_rate": 0.00019635809768442535, + "loss": 1.0389, + "step": 1954 + }, + { + "epoch": 0.34811253561253563, + "grad_norm": 0.479374498128891, + "learning_rate": 0.00019635435358240154, + "loss": 1.1774, + "step": 1955 + }, + { + "epoch": 0.3482905982905983, + "grad_norm": 0.5274081230163574, + "learning_rate": 0.0001963506075925143, + "loss": 1.1809, + "step": 1956 + }, + { + "epoch": 0.34846866096866097, + "grad_norm": 0.45398542284965515, + "learning_rate": 0.0001963468597148371, + "loss": 1.0502, + "step": 1957 + }, + { + "epoch": 0.34864672364672367, + "grad_norm": 0.48201611638069153, + "learning_rate": 0.00019634310994944332, + "loss": 1.0557, + "step": 1958 + }, + { + "epoch": 0.3488247863247863, + "grad_norm": 0.6407544016838074, + "learning_rate": 0.00019633935829640642, + "loss": 1.2138, + "step": 1959 + }, + { + "epoch": 0.349002849002849, + "grad_norm": 0.5385687351226807, + "learning_rate": 0.00019633560475579995, + "loss": 1.3496, + "step": 1960 + }, + { + "epoch": 0.3491809116809117, + "grad_norm": 0.5260964035987854, + "learning_rate": 0.0001963318493276974, + "loss": 1.0253, + "step": 1961 + }, + { + "epoch": 0.34935897435897434, + "grad_norm": 0.48478585481643677, + "learning_rate": 0.00019632809201217238, + "loss": 1.137, + "step": 1962 + }, + { + "epoch": 0.34953703703703703, + "grad_norm": 0.620033860206604, + "learning_rate": 0.0001963243328092985, + "loss": 1.3445, + "step": 1963 + }, + { + "epoch": 0.34971509971509973, + "grad_norm": 0.5149700045585632, + "learning_rate": 
0.00019632057171914942, + "loss": 1.1042, + "step": 1964 + }, + { + "epoch": 0.34989316239316237, + "grad_norm": 0.42695048451423645, + "learning_rate": 0.0001963168087417988, + "loss": 0.8789, + "step": 1965 + }, + { + "epoch": 0.35007122507122507, + "grad_norm": 0.5281283855438232, + "learning_rate": 0.00019631304387732044, + "loss": 1.1155, + "step": 1966 + }, + { + "epoch": 0.35024928774928776, + "grad_norm": 0.4994089901447296, + "learning_rate": 0.00019630927712578804, + "loss": 1.1226, + "step": 1967 + }, + { + "epoch": 0.3504273504273504, + "grad_norm": 0.4433288276195526, + "learning_rate": 0.0001963055084872754, + "loss": 1.0262, + "step": 1968 + }, + { + "epoch": 0.3506054131054131, + "grad_norm": 0.46541857719421387, + "learning_rate": 0.0001963017379618564, + "loss": 1.1438, + "step": 1969 + }, + { + "epoch": 0.3507834757834758, + "grad_norm": 0.5097604393959045, + "learning_rate": 0.00019629796554960488, + "loss": 0.9641, + "step": 1970 + }, + { + "epoch": 0.35096153846153844, + "grad_norm": 0.49461981654167175, + "learning_rate": 0.00019629419125059478, + "loss": 1.1765, + "step": 1971 + }, + { + "epoch": 0.35113960113960113, + "grad_norm": 0.4763339161872864, + "learning_rate": 0.00019629041506490005, + "loss": 1.0527, + "step": 1972 + }, + { + "epoch": 0.35131766381766383, + "grad_norm": 0.4528443217277527, + "learning_rate": 0.00019628663699259463, + "loss": 1.1409, + "step": 1973 + }, + { + "epoch": 0.35149572649572647, + "grad_norm": 0.4436309039592743, + "learning_rate": 0.00019628285703375258, + "loss": 1.0459, + "step": 1974 + }, + { + "epoch": 0.35167378917378916, + "grad_norm": 0.5146129727363586, + "learning_rate": 0.00019627907518844797, + "loss": 1.2527, + "step": 1975 + }, + { + "epoch": 0.35185185185185186, + "grad_norm": 0.5202171802520752, + "learning_rate": 0.0001962752914567549, + "loss": 1.226, + "step": 1976 + }, + { + "epoch": 0.35202991452991456, + "grad_norm": 0.5267411470413208, + "learning_rate": 0.00019627150583874747, + 
"loss": 1.0898, + "step": 1977 + }, + { + "epoch": 0.3522079772079772, + "grad_norm": 0.546840250492096, + "learning_rate": 0.00019626771833449987, + "loss": 1.1716, + "step": 1978 + }, + { + "epoch": 0.3523860398860399, + "grad_norm": 0.5525290966033936, + "learning_rate": 0.0001962639289440863, + "loss": 1.1762, + "step": 1979 + }, + { + "epoch": 0.3525641025641026, + "grad_norm": 0.48967215418815613, + "learning_rate": 0.000196260137667581, + "loss": 1.1884, + "step": 1980 + }, + { + "epoch": 0.35274216524216523, + "grad_norm": 0.5908235907554626, + "learning_rate": 0.0001962563445050583, + "loss": 1.1887, + "step": 1981 + }, + { + "epoch": 0.3529202279202279, + "grad_norm": 0.46708086133003235, + "learning_rate": 0.00019625254945659245, + "loss": 0.8842, + "step": 1982 + }, + { + "epoch": 0.3530982905982906, + "grad_norm": 0.41652458906173706, + "learning_rate": 0.00019624875252225788, + "loss": 1.0268, + "step": 1983 + }, + { + "epoch": 0.35327635327635326, + "grad_norm": 0.5084529519081116, + "learning_rate": 0.00019624495370212892, + "loss": 1.0547, + "step": 1984 + }, + { + "epoch": 0.35345441595441596, + "grad_norm": 0.5667507648468018, + "learning_rate": 0.00019624115299628003, + "loss": 1.0656, + "step": 1985 + }, + { + "epoch": 0.35363247863247865, + "grad_norm": 0.5022873282432556, + "learning_rate": 0.00019623735040478568, + "loss": 1.0627, + "step": 1986 + }, + { + "epoch": 0.3538105413105413, + "grad_norm": 0.48342058062553406, + "learning_rate": 0.00019623354592772035, + "loss": 1.0976, + "step": 1987 + }, + { + "epoch": 0.353988603988604, + "grad_norm": 0.48117366433143616, + "learning_rate": 0.0001962297395651586, + "loss": 1.0515, + "step": 1988 + }, + { + "epoch": 0.3541666666666667, + "grad_norm": 0.492564857006073, + "learning_rate": 0.000196225931317175, + "loss": 1.1957, + "step": 1989 + }, + { + "epoch": 0.3543447293447293, + "grad_norm": 0.4756208658218384, + "learning_rate": 0.00019622212118384417, + "loss": 1.007, + "step": 1990 + }, + 
{ + "epoch": 0.354522792022792, + "grad_norm": 0.581930935382843, + "learning_rate": 0.00019621830916524076, + "loss": 1.232, + "step": 1991 + }, + { + "epoch": 0.3547008547008547, + "grad_norm": 0.480064332485199, + "learning_rate": 0.00019621449526143947, + "loss": 1.2693, + "step": 1992 + }, + { + "epoch": 0.35487891737891736, + "grad_norm": 0.5679123401641846, + "learning_rate": 0.000196210679472515, + "loss": 1.2985, + "step": 1993 + }, + { + "epoch": 0.35505698005698005, + "grad_norm": 0.43757280707359314, + "learning_rate": 0.00019620686179854213, + "loss": 1.1387, + "step": 1994 + }, + { + "epoch": 0.35523504273504275, + "grad_norm": 0.4950634837150574, + "learning_rate": 0.00019620304223959566, + "loss": 1.1809, + "step": 1995 + }, + { + "epoch": 0.3554131054131054, + "grad_norm": 0.5574113726615906, + "learning_rate": 0.00019619922079575043, + "loss": 1.2434, + "step": 1996 + }, + { + "epoch": 0.3555911680911681, + "grad_norm": 0.5154930949211121, + "learning_rate": 0.00019619539746708128, + "loss": 1.1747, + "step": 1997 + }, + { + "epoch": 0.3557692307692308, + "grad_norm": 0.4377825856208801, + "learning_rate": 0.00019619157225366315, + "loss": 0.9547, + "step": 1998 + }, + { + "epoch": 0.3559472934472934, + "grad_norm": 0.530714213848114, + "learning_rate": 0.00019618774515557097, + "loss": 1.2057, + "step": 1999 + }, + { + "epoch": 0.3561253561253561, + "grad_norm": 0.5703464150428772, + "learning_rate": 0.00019618391617287978, + "loss": 1.3068, + "step": 2000 + }, + { + "epoch": 0.3563034188034188, + "grad_norm": 0.4862228333950043, + "learning_rate": 0.0001961800853056645, + "loss": 1.0077, + "step": 2001 + }, + { + "epoch": 0.35648148148148145, + "grad_norm": 0.5575395822525024, + "learning_rate": 0.00019617625255400028, + "loss": 1.03, + "step": 2002 + }, + { + "epoch": 0.35665954415954415, + "grad_norm": 0.4826279580593109, + "learning_rate": 0.0001961724179179622, + "loss": 1.268, + "step": 2003 + }, + { + "epoch": 0.35683760683760685, + 
"grad_norm": 0.49423274397850037, + "learning_rate": 0.00019616858139762534, + "loss": 1.1305, + "step": 2004 + }, + { + "epoch": 0.35701566951566954, + "grad_norm": 0.5208541750907898, + "learning_rate": 0.00019616474299306491, + "loss": 1.1651, + "step": 2005 + }, + { + "epoch": 0.3571937321937322, + "grad_norm": 0.5324164032936096, + "learning_rate": 0.0001961609027043561, + "loss": 1.1406, + "step": 2006 + }, + { + "epoch": 0.3573717948717949, + "grad_norm": 0.45385462045669556, + "learning_rate": 0.00019615706053157416, + "loss": 1.0716, + "step": 2007 + }, + { + "epoch": 0.3575498575498576, + "grad_norm": 0.5016173720359802, + "learning_rate": 0.00019615321647479438, + "loss": 1.0878, + "step": 2008 + }, + { + "epoch": 0.3577279202279202, + "grad_norm": 0.5073097348213196, + "learning_rate": 0.00019614937053409205, + "loss": 1.237, + "step": 2009 + }, + { + "epoch": 0.3579059829059829, + "grad_norm": 0.48880141973495483, + "learning_rate": 0.00019614552270954256, + "loss": 0.8794, + "step": 2010 + }, + { + "epoch": 0.3580840455840456, + "grad_norm": 0.43902209401130676, + "learning_rate": 0.00019614167300122126, + "loss": 0.912, + "step": 2011 + }, + { + "epoch": 0.35826210826210825, + "grad_norm": 0.42809322476387024, + "learning_rate": 0.0001961378214092036, + "loss": 0.7804, + "step": 2012 + }, + { + "epoch": 0.35844017094017094, + "grad_norm": 0.4464281499385834, + "learning_rate": 0.00019613396793356503, + "loss": 1.0004, + "step": 2013 + }, + { + "epoch": 0.35861823361823364, + "grad_norm": 0.49085676670074463, + "learning_rate": 0.00019613011257438109, + "loss": 1.1087, + "step": 2014 + }, + { + "epoch": 0.3587962962962963, + "grad_norm": 0.4997732937335968, + "learning_rate": 0.00019612625533172725, + "loss": 0.9591, + "step": 2015 + }, + { + "epoch": 0.358974358974359, + "grad_norm": 0.48442545533180237, + "learning_rate": 0.00019612239620567912, + "loss": 0.9744, + "step": 2016 + }, + { + "epoch": 0.35915242165242167, + "grad_norm": 
0.4989205002784729, + "learning_rate": 0.00019611853519631233, + "loss": 0.9844, + "step": 2017 + }, + { + "epoch": 0.3593304843304843, + "grad_norm": 0.6107521653175354, + "learning_rate": 0.00019611467230370248, + "loss": 1.147, + "step": 2018 + }, + { + "epoch": 0.359508547008547, + "grad_norm": 0.5594844818115234, + "learning_rate": 0.00019611080752792535, + "loss": 1.3195, + "step": 2019 + }, + { + "epoch": 0.3596866096866097, + "grad_norm": 0.4786946475505829, + "learning_rate": 0.00019610694086905656, + "loss": 1.2108, + "step": 2020 + }, + { + "epoch": 0.35986467236467234, + "grad_norm": 0.5186030268669128, + "learning_rate": 0.0001961030723271719, + "loss": 1.0008, + "step": 2021 + }, + { + "epoch": 0.36004273504273504, + "grad_norm": 0.4520573318004608, + "learning_rate": 0.0001960992019023472, + "loss": 1.1307, + "step": 2022 + }, + { + "epoch": 0.36022079772079774, + "grad_norm": 0.4983210563659668, + "learning_rate": 0.00019609532959465823, + "loss": 1.1486, + "step": 2023 + }, + { + "epoch": 0.3603988603988604, + "grad_norm": 0.6209200024604797, + "learning_rate": 0.00019609145540418094, + "loss": 1.2566, + "step": 2024 + }, + { + "epoch": 0.3605769230769231, + "grad_norm": 0.47047603130340576, + "learning_rate": 0.00019608757933099117, + "loss": 1.1588, + "step": 2025 + }, + { + "epoch": 0.36075498575498577, + "grad_norm": 0.5147389769554138, + "learning_rate": 0.0001960837013751649, + "loss": 1.2113, + "step": 2026 + }, + { + "epoch": 0.3609330484330484, + "grad_norm": 0.45826098322868347, + "learning_rate": 0.00019607982153677808, + "loss": 1.13, + "step": 2027 + }, + { + "epoch": 0.3611111111111111, + "grad_norm": 0.5699561834335327, + "learning_rate": 0.00019607593981590675, + "loss": 1.2476, + "step": 2028 + }, + { + "epoch": 0.3612891737891738, + "grad_norm": 0.5349239110946655, + "learning_rate": 0.000196072056212627, + "loss": 1.2295, + "step": 2029 + }, + { + "epoch": 0.36146723646723644, + "grad_norm": 0.6212165355682373, + "learning_rate": 
0.00019606817072701484, + "loss": 1.1965, + "step": 2030 + }, + { + "epoch": 0.36164529914529914, + "grad_norm": 0.4870990216732025, + "learning_rate": 0.00019606428335914645, + "loss": 1.4464, + "step": 2031 + }, + { + "epoch": 0.36182336182336183, + "grad_norm": 0.42427828907966614, + "learning_rate": 0.00019606039410909797, + "loss": 1.1546, + "step": 2032 + }, + { + "epoch": 0.36200142450142453, + "grad_norm": 0.5081788301467896, + "learning_rate": 0.0001960565029769456, + "loss": 1.1867, + "step": 2033 + }, + { + "epoch": 0.36217948717948717, + "grad_norm": 0.4813104271888733, + "learning_rate": 0.00019605260996276565, + "loss": 1.3726, + "step": 2034 + }, + { + "epoch": 0.36235754985754987, + "grad_norm": 0.4648851156234741, + "learning_rate": 0.0001960487150666343, + "loss": 1.2434, + "step": 2035 + }, + { + "epoch": 0.36253561253561256, + "grad_norm": 0.484161913394928, + "learning_rate": 0.00019604481828862792, + "loss": 1.1309, + "step": 2036 + }, + { + "epoch": 0.3627136752136752, + "grad_norm": 0.4929439127445221, + "learning_rate": 0.00019604091962882283, + "loss": 1.1007, + "step": 2037 + }, + { + "epoch": 0.3628917378917379, + "grad_norm": 0.45599642395973206, + "learning_rate": 0.00019603701908729544, + "loss": 1.2628, + "step": 2038 + }, + { + "epoch": 0.3630698005698006, + "grad_norm": 0.45295149087905884, + "learning_rate": 0.00019603311666412213, + "loss": 0.9808, + "step": 2039 + }, + { + "epoch": 0.36324786324786323, + "grad_norm": 0.48681163787841797, + "learning_rate": 0.00019602921235937942, + "loss": 1.0574, + "step": 2040 + }, + { + "epoch": 0.36342592592592593, + "grad_norm": 0.41232365369796753, + "learning_rate": 0.00019602530617314378, + "loss": 1.0454, + "step": 2041 + }, + { + "epoch": 0.3636039886039886, + "grad_norm": 0.46214723587036133, + "learning_rate": 0.00019602139810549174, + "loss": 0.9985, + "step": 2042 + }, + { + "epoch": 0.36378205128205127, + "grad_norm": 0.44307878613471985, + "learning_rate": 0.00019601748815649989, 
+ "loss": 0.9683, + "step": 2043 + }, + { + "epoch": 0.36396011396011396, + "grad_norm": 0.4809451401233673, + "learning_rate": 0.00019601357632624477, + "loss": 1.028, + "step": 2044 + }, + { + "epoch": 0.36413817663817666, + "grad_norm": 0.4638497531414032, + "learning_rate": 0.0001960096626148031, + "loss": 0.9851, + "step": 2045 + }, + { + "epoch": 0.3643162393162393, + "grad_norm": 0.5942164063453674, + "learning_rate": 0.00019600574702225153, + "loss": 1.1606, + "step": 2046 + }, + { + "epoch": 0.364494301994302, + "grad_norm": 0.5171293616294861, + "learning_rate": 0.00019600182954866675, + "loss": 1.2335, + "step": 2047 + }, + { + "epoch": 0.3646723646723647, + "grad_norm": 0.5294404625892639, + "learning_rate": 0.00019599791019412558, + "loss": 1.0966, + "step": 2048 + }, + { + "epoch": 0.36485042735042733, + "grad_norm": 0.46117448806762695, + "learning_rate": 0.00019599398895870477, + "loss": 1.0565, + "step": 2049 + }, + { + "epoch": 0.36502849002849, + "grad_norm": 0.5385118126869202, + "learning_rate": 0.00019599006584248118, + "loss": 1.0076, + "step": 2050 + }, + { + "epoch": 0.3652065527065527, + "grad_norm": 0.4915166199207306, + "learning_rate": 0.00019598614084553165, + "loss": 0.9686, + "step": 2051 + }, + { + "epoch": 0.36538461538461536, + "grad_norm": 0.46769094467163086, + "learning_rate": 0.00019598221396793303, + "loss": 1.1217, + "step": 2052 + }, + { + "epoch": 0.36556267806267806, + "grad_norm": 0.5440493822097778, + "learning_rate": 0.00019597828520976236, + "loss": 1.2344, + "step": 2053 + }, + { + "epoch": 0.36574074074074076, + "grad_norm": 0.616727352142334, + "learning_rate": 0.00019597435457109657, + "loss": 1.2953, + "step": 2054 + }, + { + "epoch": 0.3659188034188034, + "grad_norm": 0.4859183430671692, + "learning_rate": 0.00019597042205201265, + "loss": 1.16, + "step": 2055 + }, + { + "epoch": 0.3660968660968661, + "grad_norm": 0.47056329250335693, + "learning_rate": 0.0001959664876525877, + "loss": 0.9982, + "step": 2056 + 
}, + { + "epoch": 0.3662749287749288, + "grad_norm": 0.48347967863082886, + "learning_rate": 0.00019596255137289875, + "loss": 1.0966, + "step": 2057 + }, + { + "epoch": 0.36645299145299143, + "grad_norm": 0.5068454742431641, + "learning_rate": 0.00019595861321302296, + "loss": 1.2891, + "step": 2058 + }, + { + "epoch": 0.3666310541310541, + "grad_norm": 0.5702359080314636, + "learning_rate": 0.00019595467317303747, + "loss": 1.1394, + "step": 2059 + }, + { + "epoch": 0.3668091168091168, + "grad_norm": 0.5028812885284424, + "learning_rate": 0.0001959507312530195, + "loss": 1.2324, + "step": 2060 + }, + { + "epoch": 0.36698717948717946, + "grad_norm": 0.4672880172729492, + "learning_rate": 0.00019594678745304628, + "loss": 1.0581, + "step": 2061 + }, + { + "epoch": 0.36716524216524216, + "grad_norm": 0.5233900547027588, + "learning_rate": 0.00019594284177319504, + "loss": 1.138, + "step": 2062 + }, + { + "epoch": 0.36734330484330485, + "grad_norm": 0.46871712803840637, + "learning_rate": 0.00019593889421354316, + "loss": 1.2159, + "step": 2063 + }, + { + "epoch": 0.36752136752136755, + "grad_norm": 0.5180533528327942, + "learning_rate": 0.00019593494477416793, + "loss": 1.1116, + "step": 2064 + }, + { + "epoch": 0.3676994301994302, + "grad_norm": 0.5398494005203247, + "learning_rate": 0.0001959309934551467, + "loss": 1.2038, + "step": 2065 + }, + { + "epoch": 0.3678774928774929, + "grad_norm": 0.4850373864173889, + "learning_rate": 0.000195927040256557, + "loss": 1.4315, + "step": 2066 + }, + { + "epoch": 0.3680555555555556, + "grad_norm": 0.49190905690193176, + "learning_rate": 0.0001959230851784762, + "loss": 0.9993, + "step": 2067 + }, + { + "epoch": 0.3682336182336182, + "grad_norm": 0.4546903073787689, + "learning_rate": 0.00019591912822098178, + "loss": 1.0979, + "step": 2068 + }, + { + "epoch": 0.3684116809116809, + "grad_norm": 0.4726468622684479, + "learning_rate": 0.00019591516938415133, + "loss": 1.1629, + "step": 2069 + }, + { + "epoch": 
0.3685897435897436, + "grad_norm": 0.47856009006500244, + "learning_rate": 0.00019591120866806235, + "loss": 1.2048, + "step": 2070 + }, + { + "epoch": 0.36876780626780625, + "grad_norm": 0.46847718954086304, + "learning_rate": 0.0001959072460727925, + "loss": 1.0958, + "step": 2071 + }, + { + "epoch": 0.36894586894586895, + "grad_norm": 0.47164350748062134, + "learning_rate": 0.0001959032815984194, + "loss": 1.1912, + "step": 2072 + }, + { + "epoch": 0.36912393162393164, + "grad_norm": 0.4838213324546814, + "learning_rate": 0.0001958993152450207, + "loss": 1.1466, + "step": 2073 + }, + { + "epoch": 0.3693019943019943, + "grad_norm": 0.47234636545181274, + "learning_rate": 0.00019589534701267412, + "loss": 0.9475, + "step": 2074 + }, + { + "epoch": 0.369480056980057, + "grad_norm": 0.4913126826286316, + "learning_rate": 0.00019589137690145746, + "loss": 1.1571, + "step": 2075 + }, + { + "epoch": 0.3696581196581197, + "grad_norm": 0.4696233570575714, + "learning_rate": 0.00019588740491144842, + "loss": 0.9797, + "step": 2076 + }, + { + "epoch": 0.3698361823361823, + "grad_norm": 0.46146106719970703, + "learning_rate": 0.00019588343104272492, + "loss": 1.027, + "step": 2077 + }, + { + "epoch": 0.370014245014245, + "grad_norm": 0.4920627176761627, + "learning_rate": 0.00019587945529536474, + "loss": 1.1008, + "step": 2078 + }, + { + "epoch": 0.3701923076923077, + "grad_norm": 0.4854249954223633, + "learning_rate": 0.0001958754776694458, + "loss": 1.0759, + "step": 2079 + }, + { + "epoch": 0.37037037037037035, + "grad_norm": 0.4884897768497467, + "learning_rate": 0.00019587149816504608, + "loss": 1.1403, + "step": 2080 + }, + { + "epoch": 0.37054843304843305, + "grad_norm": 0.5062584280967712, + "learning_rate": 0.00019586751678224345, + "loss": 1.0185, + "step": 2081 + }, + { + "epoch": 0.37072649572649574, + "grad_norm": 0.44697675108909607, + "learning_rate": 0.000195863533521116, + "loss": 1.0462, + "step": 2082 + }, + { + "epoch": 0.3709045584045584, + 
"grad_norm": 0.5122885704040527, + "learning_rate": 0.00019585954838174176, + "loss": 1.108, + "step": 2083 + }, + { + "epoch": 0.3710826210826211, + "grad_norm": 0.486650288105011, + "learning_rate": 0.0001958555613641988, + "loss": 1.126, + "step": 2084 + }, + { + "epoch": 0.3712606837606838, + "grad_norm": 0.5296297669410706, + "learning_rate": 0.00019585157246856523, + "loss": 1.1757, + "step": 2085 + }, + { + "epoch": 0.3714387464387464, + "grad_norm": 0.4935721457004547, + "learning_rate": 0.0001958475816949192, + "loss": 1.1654, + "step": 2086 + }, + { + "epoch": 0.3716168091168091, + "grad_norm": 0.6226509213447571, + "learning_rate": 0.00019584358904333891, + "loss": 1.1981, + "step": 2087 + }, + { + "epoch": 0.3717948717948718, + "grad_norm": 0.44094228744506836, + "learning_rate": 0.0001958395945139026, + "loss": 0.8468, + "step": 2088 + }, + { + "epoch": 0.37197293447293445, + "grad_norm": 0.5335884690284729, + "learning_rate": 0.00019583559810668858, + "loss": 1.1597, + "step": 2089 + }, + { + "epoch": 0.37215099715099714, + "grad_norm": 0.4585414528846741, + "learning_rate": 0.000195831599821775, + "loss": 0.9343, + "step": 2090 + }, + { + "epoch": 0.37232905982905984, + "grad_norm": 0.533087432384491, + "learning_rate": 0.00019582759965924035, + "loss": 1.1209, + "step": 2091 + }, + { + "epoch": 0.37250712250712253, + "grad_norm": 0.5302683711051941, + "learning_rate": 0.00019582359761916295, + "loss": 1.236, + "step": 2092 + }, + { + "epoch": 0.3726851851851852, + "grad_norm": 0.4522508382797241, + "learning_rate": 0.00019581959370162122, + "loss": 1.0196, + "step": 2093 + }, + { + "epoch": 0.37286324786324787, + "grad_norm": 0.52391517162323, + "learning_rate": 0.00019581558790669358, + "loss": 1.0077, + "step": 2094 + }, + { + "epoch": 0.37304131054131057, + "grad_norm": 0.47144797444343567, + "learning_rate": 0.00019581158023445854, + "loss": 1.0956, + "step": 2095 + }, + { + "epoch": 0.3732193732193732, + "grad_norm": 0.4486723244190216, + 
"learning_rate": 0.00019580757068499459, + "loss": 0.8697, + "step": 2096 + }, + { + "epoch": 0.3733974358974359, + "grad_norm": 0.4626580476760864, + "learning_rate": 0.00019580355925838034, + "loss": 0.8489, + "step": 2097 + }, + { + "epoch": 0.3735754985754986, + "grad_norm": 0.5647920370101929, + "learning_rate": 0.00019579954595469438, + "loss": 1.1458, + "step": 2098 + }, + { + "epoch": 0.37375356125356124, + "grad_norm": 0.4734349846839905, + "learning_rate": 0.00019579553077401528, + "loss": 1.1036, + "step": 2099 + }, + { + "epoch": 0.37393162393162394, + "grad_norm": 0.5624295473098755, + "learning_rate": 0.00019579151371642176, + "loss": 0.9793, + "step": 2100 + }, + { + "epoch": 0.37410968660968663, + "grad_norm": 0.47507283091545105, + "learning_rate": 0.00019578749478199256, + "loss": 1.0371, + "step": 2101 + }, + { + "epoch": 0.37428774928774927, + "grad_norm": 0.550865113735199, + "learning_rate": 0.00019578347397080633, + "loss": 1.046, + "step": 2102 + }, + { + "epoch": 0.37446581196581197, + "grad_norm": 0.5249403715133667, + "learning_rate": 0.00019577945128294193, + "loss": 1.3185, + "step": 2103 + }, + { + "epoch": 0.37464387464387466, + "grad_norm": 0.4921024739742279, + "learning_rate": 0.00019577542671847815, + "loss": 1.0758, + "step": 2104 + }, + { + "epoch": 0.3748219373219373, + "grad_norm": 0.5351784825325012, + "learning_rate": 0.00019577140027749384, + "loss": 1.067, + "step": 2105 + }, + { + "epoch": 0.375, + "grad_norm": 0.44420507550239563, + "learning_rate": 0.00019576737196006787, + "loss": 1.1065, + "step": 2106 + }, + { + "epoch": 0.3751780626780627, + "grad_norm": 0.531384289264679, + "learning_rate": 0.0001957633417662792, + "loss": 1.1634, + "step": 2107 + }, + { + "epoch": 0.37535612535612534, + "grad_norm": 0.5167618989944458, + "learning_rate": 0.00019575930969620677, + "loss": 1.1646, + "step": 2108 + }, + { + "epoch": 0.37553418803418803, + "grad_norm": 0.41487228870391846, + "learning_rate": 0.0001957552757499296, + 
"loss": 0.793, + "step": 2109 + }, + { + "epoch": 0.37571225071225073, + "grad_norm": 0.5110787153244019, + "learning_rate": 0.00019575123992752672, + "loss": 1.1752, + "step": 2110 + }, + { + "epoch": 0.37589031339031337, + "grad_norm": 0.4422051012516022, + "learning_rate": 0.00019574720222907717, + "loss": 1.0102, + "step": 2111 + }, + { + "epoch": 0.37606837606837606, + "grad_norm": 0.4757538139820099, + "learning_rate": 0.0001957431626546601, + "loss": 1.0467, + "step": 2112 + }, + { + "epoch": 0.37624643874643876, + "grad_norm": 0.4736764430999756, + "learning_rate": 0.00019573912120435466, + "loss": 1.3048, + "step": 2113 + }, + { + "epoch": 0.3764245014245014, + "grad_norm": 0.49894335865974426, + "learning_rate": 0.00019573507787824004, + "loss": 1.0502, + "step": 2114 + }, + { + "epoch": 0.3766025641025641, + "grad_norm": 0.48120981454849243, + "learning_rate": 0.00019573103267639543, + "loss": 1.2405, + "step": 2115 + }, + { + "epoch": 0.3767806267806268, + "grad_norm": 0.4826737642288208, + "learning_rate": 0.0001957269855989001, + "loss": 1.1189, + "step": 2116 + }, + { + "epoch": 0.37695868945868943, + "grad_norm": 0.4736921489238739, + "learning_rate": 0.0001957229366458333, + "loss": 1.2862, + "step": 2117 + }, + { + "epoch": 0.37713675213675213, + "grad_norm": 0.3895208537578583, + "learning_rate": 0.00019571888581727446, + "loss": 1.0573, + "step": 2118 + }, + { + "epoch": 0.3773148148148148, + "grad_norm": 0.5107510089874268, + "learning_rate": 0.00019571483311330284, + "loss": 1.2913, + "step": 2119 + }, + { + "epoch": 0.37749287749287747, + "grad_norm": 0.4543241262435913, + "learning_rate": 0.00019571077853399794, + "loss": 0.949, + "step": 2120 + }, + { + "epoch": 0.37767094017094016, + "grad_norm": 0.46897491812705994, + "learning_rate": 0.00019570672207943913, + "loss": 1.2235, + "step": 2121 + }, + { + "epoch": 0.37784900284900286, + "grad_norm": 0.4812130630016327, + "learning_rate": 0.0001957026637497059, + "loss": 0.8857, + "step": 2122 
+ }, + { + "epoch": 0.37802706552706555, + "grad_norm": 0.47452476620674133, + "learning_rate": 0.00019569860354487782, + "loss": 1.0549, + "step": 2123 + }, + { + "epoch": 0.3782051282051282, + "grad_norm": 0.49879950284957886, + "learning_rate": 0.00019569454146503438, + "loss": 1.0475, + "step": 2124 + }, + { + "epoch": 0.3783831908831909, + "grad_norm": 0.4246445894241333, + "learning_rate": 0.00019569047751025518, + "loss": 0.8788, + "step": 2125 + }, + { + "epoch": 0.3785612535612536, + "grad_norm": 0.4868565499782562, + "learning_rate": 0.00019568641168061986, + "loss": 1.1801, + "step": 2126 + }, + { + "epoch": 0.3787393162393162, + "grad_norm": 0.46723654866218567, + "learning_rate": 0.0001956823439762081, + "loss": 1.1661, + "step": 2127 + }, + { + "epoch": 0.3789173789173789, + "grad_norm": 0.4989059269428253, + "learning_rate": 0.00019567827439709954, + "loss": 1.3037, + "step": 2128 + }, + { + "epoch": 0.3790954415954416, + "grad_norm": 0.441307932138443, + "learning_rate": 0.00019567420294337395, + "loss": 1.0197, + "step": 2129 + }, + { + "epoch": 0.37927350427350426, + "grad_norm": 0.5200160145759583, + "learning_rate": 0.0001956701296151111, + "loss": 1.3366, + "step": 2130 + }, + { + "epoch": 0.37945156695156695, + "grad_norm": 0.43610256910324097, + "learning_rate": 0.00019566605441239082, + "loss": 1.0148, + "step": 2131 + }, + { + "epoch": 0.37962962962962965, + "grad_norm": 0.4160982370376587, + "learning_rate": 0.00019566197733529293, + "loss": 1.0758, + "step": 2132 + }, + { + "epoch": 0.3798076923076923, + "grad_norm": 0.5007950663566589, + "learning_rate": 0.00019565789838389726, + "loss": 1.1937, + "step": 2133 + }, + { + "epoch": 0.379985754985755, + "grad_norm": 0.4991525113582611, + "learning_rate": 0.00019565381755828385, + "loss": 1.1788, + "step": 2134 + }, + { + "epoch": 0.3801638176638177, + "grad_norm": 0.6313113570213318, + "learning_rate": 0.00019564973485853258, + "loss": 1.1241, + "step": 2135 + }, + { + "epoch": 
0.3803418803418803, + "grad_norm": 0.49736538529396057, + "learning_rate": 0.0001956456502847234, + "loss": 1.0299, + "step": 2136 + }, + { + "epoch": 0.380519943019943, + "grad_norm": 0.4384380578994751, + "learning_rate": 0.00019564156383693643, + "loss": 1.132, + "step": 2137 + }, + { + "epoch": 0.3806980056980057, + "grad_norm": 0.4696183502674103, + "learning_rate": 0.00019563747551525168, + "loss": 1.1145, + "step": 2138 + }, + { + "epoch": 0.38087606837606836, + "grad_norm": 0.42039749026298523, + "learning_rate": 0.0001956333853197493, + "loss": 0.9549, + "step": 2139 + }, + { + "epoch": 0.38105413105413105, + "grad_norm": 0.5547221899032593, + "learning_rate": 0.00019562929325050936, + "loss": 1.0476, + "step": 2140 + }, + { + "epoch": 0.38123219373219375, + "grad_norm": 0.4803301692008972, + "learning_rate": 0.0001956251993076121, + "loss": 1.1285, + "step": 2141 + }, + { + "epoch": 0.3814102564102564, + "grad_norm": 0.609501838684082, + "learning_rate": 0.00019562110349113766, + "loss": 1.2375, + "step": 2142 + }, + { + "epoch": 0.3815883190883191, + "grad_norm": 0.5134759545326233, + "learning_rate": 0.00019561700580116639, + "loss": 1.0895, + "step": 2143 + }, + { + "epoch": 0.3817663817663818, + "grad_norm": 0.5086711049079895, + "learning_rate": 0.00019561290623777846, + "loss": 1.1139, + "step": 2144 + }, + { + "epoch": 0.3819444444444444, + "grad_norm": 0.5371596813201904, + "learning_rate": 0.00019560880480105428, + "loss": 0.9302, + "step": 2145 + }, + { + "epoch": 0.3821225071225071, + "grad_norm": 0.4966319799423218, + "learning_rate": 0.00019560470149107418, + "loss": 1.2485, + "step": 2146 + }, + { + "epoch": 0.3823005698005698, + "grad_norm": 0.5296950340270996, + "learning_rate": 0.00019560059630791855, + "loss": 1.4449, + "step": 2147 + }, + { + "epoch": 0.38247863247863245, + "grad_norm": 0.5564194321632385, + "learning_rate": 0.00019559648925166783, + "loss": 1.0817, + "step": 2148 + }, + { + "epoch": 0.38265669515669515, + "grad_norm": 
0.5763841867446899, + "learning_rate": 0.0001955923803224025, + "loss": 1.1915, + "step": 2149 + }, + { + "epoch": 0.38283475783475784, + "grad_norm": 0.4782295823097229, + "learning_rate": 0.00019558826952020304, + "loss": 1.1317, + "step": 2150 + }, + { + "epoch": 0.38301282051282054, + "grad_norm": 0.4876856207847595, + "learning_rate": 0.00019558415684515002, + "loss": 1.2113, + "step": 2151 + }, + { + "epoch": 0.3831908831908832, + "grad_norm": 0.4894421398639679, + "learning_rate": 0.00019558004229732398, + "loss": 1.0761, + "step": 2152 + }, + { + "epoch": 0.3833689458689459, + "grad_norm": 0.47914227843284607, + "learning_rate": 0.0001955759258768056, + "loss": 1.0869, + "step": 2153 + }, + { + "epoch": 0.38354700854700857, + "grad_norm": 0.43933629989624023, + "learning_rate": 0.00019557180758367543, + "loss": 1.0581, + "step": 2154 + }, + { + "epoch": 0.3837250712250712, + "grad_norm": 0.4078103005886078, + "learning_rate": 0.00019556768741801428, + "loss": 1.065, + "step": 2155 + }, + { + "epoch": 0.3839031339031339, + "grad_norm": 0.5112793445587158, + "learning_rate": 0.00019556356537990278, + "loss": 1.2023, + "step": 2156 + }, + { + "epoch": 0.3840811965811966, + "grad_norm": 0.4699678122997284, + "learning_rate": 0.00019555944146942177, + "loss": 1.2459, + "step": 2157 + }, + { + "epoch": 0.38425925925925924, + "grad_norm": 0.4723528027534485, + "learning_rate": 0.00019555531568665198, + "loss": 1.2204, + "step": 2158 + }, + { + "epoch": 0.38443732193732194, + "grad_norm": 0.4648225009441376, + "learning_rate": 0.00019555118803167432, + "loss": 1.1355, + "step": 2159 + }, + { + "epoch": 0.38461538461538464, + "grad_norm": 0.49861815571784973, + "learning_rate": 0.00019554705850456961, + "loss": 1.1301, + "step": 2160 + }, + { + "epoch": 0.3847934472934473, + "grad_norm": 0.4076344966888428, + "learning_rate": 0.00019554292710541874, + "loss": 0.8997, + "step": 2161 + }, + { + "epoch": 0.38497150997151, + "grad_norm": 0.5510796308517456, + 
"learning_rate": 0.00019553879383430272, + "loss": 1.0594, + "step": 2162 + }, + { + "epoch": 0.38514957264957267, + "grad_norm": 0.55793696641922, + "learning_rate": 0.00019553465869130249, + "loss": 1.1284, + "step": 2163 + }, + { + "epoch": 0.3853276353276353, + "grad_norm": 0.5096491575241089, + "learning_rate": 0.00019553052167649906, + "loss": 1.0419, + "step": 2164 + }, + { + "epoch": 0.385505698005698, + "grad_norm": 0.49077361822128296, + "learning_rate": 0.0001955263827899735, + "loss": 1.1632, + "step": 2165 + }, + { + "epoch": 0.3856837606837607, + "grad_norm": 0.5546894073486328, + "learning_rate": 0.00019552224203180693, + "loss": 1.1487, + "step": 2166 + }, + { + "epoch": 0.38586182336182334, + "grad_norm": 0.4930037260055542, + "learning_rate": 0.00019551809940208047, + "loss": 1.2668, + "step": 2167 + }, + { + "epoch": 0.38603988603988604, + "grad_norm": 0.5600671172142029, + "learning_rate": 0.00019551395490087525, + "loss": 1.3988, + "step": 2168 + }, + { + "epoch": 0.38621794871794873, + "grad_norm": 0.45897629857063293, + "learning_rate": 0.0001955098085282725, + "loss": 0.7792, + "step": 2169 + }, + { + "epoch": 0.3863960113960114, + "grad_norm": 0.46138936281204224, + "learning_rate": 0.00019550566028435346, + "loss": 1.1749, + "step": 2170 + }, + { + "epoch": 0.38657407407407407, + "grad_norm": 0.5136167407035828, + "learning_rate": 0.0001955015101691994, + "loss": 1.0153, + "step": 2171 + }, + { + "epoch": 0.38675213675213677, + "grad_norm": 0.4886440336704254, + "learning_rate": 0.00019549735818289165, + "loss": 1.0006, + "step": 2172 + }, + { + "epoch": 0.3869301994301994, + "grad_norm": 0.4339776635169983, + "learning_rate": 0.00019549320432551154, + "loss": 1.0109, + "step": 2173 + }, + { + "epoch": 0.3871082621082621, + "grad_norm": 0.48729443550109863, + "learning_rate": 0.00019548904859714044, + "loss": 1.2016, + "step": 2174 + }, + { + "epoch": 0.3872863247863248, + "grad_norm": 0.5128757357597351, + "learning_rate": 
0.0001954848909978598, + "loss": 1.085, + "step": 2175 + }, + { + "epoch": 0.38746438746438744, + "grad_norm": 0.49636292457580566, + "learning_rate": 0.0001954807315277511, + "loss": 1.0671, + "step": 2176 + }, + { + "epoch": 0.38764245014245013, + "grad_norm": 0.4946988821029663, + "learning_rate": 0.00019547657018689578, + "loss": 1.2091, + "step": 2177 + }, + { + "epoch": 0.38782051282051283, + "grad_norm": 0.49004554748535156, + "learning_rate": 0.00019547240697537544, + "loss": 1.0241, + "step": 2178 + }, + { + "epoch": 0.38799857549857547, + "grad_norm": 0.48750075697898865, + "learning_rate": 0.00019546824189327157, + "loss": 1.1082, + "step": 2179 + }, + { + "epoch": 0.38817663817663817, + "grad_norm": 0.47726166248321533, + "learning_rate": 0.00019546407494066585, + "loss": 1.1275, + "step": 2180 + }, + { + "epoch": 0.38835470085470086, + "grad_norm": 0.5253444910049438, + "learning_rate": 0.00019545990611763986, + "loss": 1.0164, + "step": 2181 + }, + { + "epoch": 0.38853276353276356, + "grad_norm": 0.4470371603965759, + "learning_rate": 0.00019545573542427533, + "loss": 1.0138, + "step": 2182 + }, + { + "epoch": 0.3887108262108262, + "grad_norm": 0.6645087599754333, + "learning_rate": 0.00019545156286065397, + "loss": 1.0884, + "step": 2183 + }, + { + "epoch": 0.3888888888888889, + "grad_norm": 0.498775839805603, + "learning_rate": 0.0001954473884268575, + "loss": 1.1035, + "step": 2184 + }, + { + "epoch": 0.3890669515669516, + "grad_norm": 0.5830566883087158, + "learning_rate": 0.00019544321212296772, + "loss": 1.1665, + "step": 2185 + }, + { + "epoch": 0.38924501424501423, + "grad_norm": 0.48162809014320374, + "learning_rate": 0.00019543903394906646, + "loss": 1.1035, + "step": 2186 + }, + { + "epoch": 0.3894230769230769, + "grad_norm": 0.46334075927734375, + "learning_rate": 0.0001954348539052356, + "loss": 0.9764, + "step": 2187 + }, + { + "epoch": 0.3896011396011396, + "grad_norm": 0.6343515515327454, + "learning_rate": 0.00019543067199155704, + 
"loss": 0.9474, + "step": 2188 + }, + { + "epoch": 0.38977920227920226, + "grad_norm": 0.4867806136608124, + "learning_rate": 0.0001954264882081127, + "loss": 1.1161, + "step": 2189 + }, + { + "epoch": 0.38995726495726496, + "grad_norm": 0.49305734038352966, + "learning_rate": 0.00019542230255498454, + "loss": 1.1825, + "step": 2190 + }, + { + "epoch": 0.39013532763532766, + "grad_norm": 0.518465518951416, + "learning_rate": 0.00019541811503225457, + "loss": 1.0695, + "step": 2191 + }, + { + "epoch": 0.3903133903133903, + "grad_norm": 0.4892457127571106, + "learning_rate": 0.00019541392564000488, + "loss": 1.3113, + "step": 2192 + }, + { + "epoch": 0.390491452991453, + "grad_norm": 0.5150920152664185, + "learning_rate": 0.00019540973437831753, + "loss": 1.0735, + "step": 2193 + }, + { + "epoch": 0.3906695156695157, + "grad_norm": 0.5414708256721497, + "learning_rate": 0.00019540554124727462, + "loss": 1.0773, + "step": 2194 + }, + { + "epoch": 0.39084757834757833, + "grad_norm": 0.49826398491859436, + "learning_rate": 0.0001954013462469583, + "loss": 1.0542, + "step": 2195 + }, + { + "epoch": 0.391025641025641, + "grad_norm": 0.5203596949577332, + "learning_rate": 0.0001953971493774508, + "loss": 1.178, + "step": 2196 + }, + { + "epoch": 0.3912037037037037, + "grad_norm": 0.45095738768577576, + "learning_rate": 0.00019539295063883432, + "loss": 1.1254, + "step": 2197 + }, + { + "epoch": 0.39138176638176636, + "grad_norm": 0.4938857853412628, + "learning_rate": 0.00019538875003119113, + "loss": 1.1061, + "step": 2198 + }, + { + "epoch": 0.39155982905982906, + "grad_norm": 0.5260919332504272, + "learning_rate": 0.00019538454755460354, + "loss": 1.3292, + "step": 2199 + }, + { + "epoch": 0.39173789173789175, + "grad_norm": 0.46527108550071716, + "learning_rate": 0.00019538034320915388, + "loss": 1.2074, + "step": 2200 + }, + { + "epoch": 0.3919159544159544, + "grad_norm": 0.5608304738998413, + "learning_rate": 0.00019537613699492453, + "loss": 1.0385, + "step": 2201 + 
}, + { + "epoch": 0.3920940170940171, + "grad_norm": 0.5056684613227844, + "learning_rate": 0.00019537192891199792, + "loss": 1.1513, + "step": 2202 + }, + { + "epoch": 0.3922720797720798, + "grad_norm": 0.3764426112174988, + "learning_rate": 0.00019536771896045644, + "loss": 0.8966, + "step": 2203 + }, + { + "epoch": 0.3924501424501424, + "grad_norm": 0.4983638823032379, + "learning_rate": 0.0001953635071403827, + "loss": 1.097, + "step": 2204 + }, + { + "epoch": 0.3926282051282051, + "grad_norm": 0.5733919739723206, + "learning_rate": 0.00019535929345185904, + "loss": 1.4992, + "step": 2205 + }, + { + "epoch": 0.3928062678062678, + "grad_norm": 0.632064163684845, + "learning_rate": 0.00019535507789496817, + "loss": 1.0611, + "step": 2206 + }, + { + "epoch": 0.39298433048433046, + "grad_norm": 0.409978449344635, + "learning_rate": 0.00019535086046979262, + "loss": 0.7172, + "step": 2207 + }, + { + "epoch": 0.39316239316239315, + "grad_norm": 0.40910813212394714, + "learning_rate": 0.00019534664117641502, + "loss": 0.8803, + "step": 2208 + }, + { + "epoch": 0.39334045584045585, + "grad_norm": 0.4696179926395416, + "learning_rate": 0.00019534242001491807, + "loss": 1.1551, + "step": 2209 + }, + { + "epoch": 0.39351851851851855, + "grad_norm": 0.538425862789154, + "learning_rate": 0.00019533819698538444, + "loss": 1.1296, + "step": 2210 + }, + { + "epoch": 0.3936965811965812, + "grad_norm": 0.5913630723953247, + "learning_rate": 0.00019533397208789692, + "loss": 0.9757, + "step": 2211 + }, + { + "epoch": 0.3938746438746439, + "grad_norm": 0.5649870038032532, + "learning_rate": 0.00019532974532253822, + "loss": 0.9976, + "step": 2212 + }, + { + "epoch": 0.3940527065527066, + "grad_norm": 0.5012063980102539, + "learning_rate": 0.00019532551668939121, + "loss": 0.9969, + "step": 2213 + }, + { + "epoch": 0.3942307692307692, + "grad_norm": 0.5098594427108765, + "learning_rate": 0.00019532128618853872, + "loss": 1.1229, + "step": 2214 + }, + { + "epoch": 
0.3944088319088319, + "grad_norm": 0.4753342568874359, + "learning_rate": 0.0001953170538200636, + "loss": 1.0808, + "step": 2215 + }, + { + "epoch": 0.3945868945868946, + "grad_norm": 0.4770098626613617, + "learning_rate": 0.00019531281958404888, + "loss": 1.0656, + "step": 2216 + }, + { + "epoch": 0.39476495726495725, + "grad_norm": 0.6007979512214661, + "learning_rate": 0.00019530858348057746, + "loss": 1.0093, + "step": 2217 + }, + { + "epoch": 0.39494301994301995, + "grad_norm": 0.4501650929450989, + "learning_rate": 0.00019530434550973227, + "loss": 0.8557, + "step": 2218 + }, + { + "epoch": 0.39512108262108264, + "grad_norm": 0.5123980641365051, + "learning_rate": 0.00019530010567159645, + "loss": 0.9833, + "step": 2219 + }, + { + "epoch": 0.3952991452991453, + "grad_norm": 0.4623969495296478, + "learning_rate": 0.000195295863966253, + "loss": 0.913, + "step": 2220 + }, + { + "epoch": 0.395477207977208, + "grad_norm": 0.4341880679130554, + "learning_rate": 0.0001952916203937851, + "loss": 1.0234, + "step": 2221 + }, + { + "epoch": 0.3956552706552707, + "grad_norm": 0.5935006141662598, + "learning_rate": 0.00019528737495427581, + "loss": 1.061, + "step": 2222 + }, + { + "epoch": 0.3958333333333333, + "grad_norm": 0.44835174083709717, + "learning_rate": 0.00019528312764780837, + "loss": 1.1567, + "step": 2223 + }, + { + "epoch": 0.396011396011396, + "grad_norm": 0.5476976633071899, + "learning_rate": 0.00019527887847446595, + "loss": 1.2304, + "step": 2224 + }, + { + "epoch": 0.3961894586894587, + "grad_norm": 0.4487939774990082, + "learning_rate": 0.00019527462743433187, + "loss": 1.1813, + "step": 2225 + }, + { + "epoch": 0.39636752136752135, + "grad_norm": 0.4053241014480591, + "learning_rate": 0.00019527037452748936, + "loss": 0.7899, + "step": 2226 + }, + { + "epoch": 0.39654558404558404, + "grad_norm": 0.534570574760437, + "learning_rate": 0.00019526611975402176, + "loss": 1.0681, + "step": 2227 + }, + { + "epoch": 0.39672364672364674, + "grad_norm": 
0.46096158027648926, + "learning_rate": 0.00019526186311401246, + "loss": 0.9234, + "step": 2228 + }, + { + "epoch": 0.3969017094017094, + "grad_norm": 0.47363516688346863, + "learning_rate": 0.00019525760460754483, + "loss": 1.0197, + "step": 2229 + }, + { + "epoch": 0.3970797720797721, + "grad_norm": 0.46317258477211, + "learning_rate": 0.00019525334423470234, + "loss": 1.2103, + "step": 2230 + }, + { + "epoch": 0.39725783475783477, + "grad_norm": 0.4924237132072449, + "learning_rate": 0.0001952490819955684, + "loss": 1.3299, + "step": 2231 + }, + { + "epoch": 0.3974358974358974, + "grad_norm": 0.5419978499412537, + "learning_rate": 0.0001952448178902266, + "loss": 1.2526, + "step": 2232 + }, + { + "epoch": 0.3976139601139601, + "grad_norm": 0.5003267526626587, + "learning_rate": 0.00019524055191876043, + "loss": 1.1073, + "step": 2233 + }, + { + "epoch": 0.3977920227920228, + "grad_norm": 0.621789276599884, + "learning_rate": 0.00019523628408125347, + "loss": 1.3409, + "step": 2234 + }, + { + "epoch": 0.39797008547008544, + "grad_norm": 0.44235602021217346, + "learning_rate": 0.0001952320143777894, + "loss": 0.9799, + "step": 2235 + }, + { + "epoch": 0.39814814814814814, + "grad_norm": 0.49954718351364136, + "learning_rate": 0.0001952277428084518, + "loss": 1.2227, + "step": 2236 + }, + { + "epoch": 0.39832621082621084, + "grad_norm": 0.5113739967346191, + "learning_rate": 0.00019522346937332443, + "loss": 1.1644, + "step": 2237 + }, + { + "epoch": 0.39850427350427353, + "grad_norm": 0.5026139616966248, + "learning_rate": 0.00019521919407249096, + "loss": 1.0823, + "step": 2238 + }, + { + "epoch": 0.39868233618233617, + "grad_norm": 0.4943205714225769, + "learning_rate": 0.0001952149169060352, + "loss": 1.0961, + "step": 2239 + }, + { + "epoch": 0.39886039886039887, + "grad_norm": 0.4680631458759308, + "learning_rate": 0.00019521063787404094, + "loss": 0.9787, + "step": 2240 + }, + { + "epoch": 0.39903846153846156, + "grad_norm": 0.5511566400527954, + 
"learning_rate": 0.00019520635697659202, + "loss": 1.2543, + "step": 2241 + }, + { + "epoch": 0.3992165242165242, + "grad_norm": 0.5494263172149658, + "learning_rate": 0.00019520207421377229, + "loss": 1.1978, + "step": 2242 + }, + { + "epoch": 0.3993945868945869, + "grad_norm": 0.4850340485572815, + "learning_rate": 0.00019519778958566568, + "loss": 0.8531, + "step": 2243 + }, + { + "epoch": 0.3995726495726496, + "grad_norm": 0.47168150544166565, + "learning_rate": 0.00019519350309235613, + "loss": 1.0746, + "step": 2244 + }, + { + "epoch": 0.39975071225071224, + "grad_norm": 0.571133553981781, + "learning_rate": 0.00019518921473392765, + "loss": 1.2984, + "step": 2245 + }, + { + "epoch": 0.39992877492877493, + "grad_norm": 0.4636089503765106, + "learning_rate": 0.00019518492451046427, + "loss": 1.019, + "step": 2246 + }, + { + "epoch": 0.40010683760683763, + "grad_norm": 0.4573518931865692, + "learning_rate": 0.00019518063242205, + "loss": 1.1042, + "step": 2247 + }, + { + "epoch": 0.40028490028490027, + "grad_norm": 0.49098989367485046, + "learning_rate": 0.00019517633846876894, + "loss": 1.1224, + "step": 2248 + }, + { + "epoch": 0.40046296296296297, + "grad_norm": 0.5475491881370544, + "learning_rate": 0.00019517204265070523, + "loss": 1.0984, + "step": 2249 + }, + { + "epoch": 0.40064102564102566, + "grad_norm": 0.45498281717300415, + "learning_rate": 0.00019516774496794307, + "loss": 0.8883, + "step": 2250 + }, + { + "epoch": 0.4008190883190883, + "grad_norm": 0.4908423125743866, + "learning_rate": 0.00019516344542056666, + "loss": 1.328, + "step": 2251 + }, + { + "epoch": 0.400997150997151, + "grad_norm": 0.5474920272827148, + "learning_rate": 0.0001951591440086602, + "loss": 1.3825, + "step": 2252 + }, + { + "epoch": 0.4011752136752137, + "grad_norm": 0.5165615081787109, + "learning_rate": 0.000195154840732308, + "loss": 1.33, + "step": 2253 + }, + { + "epoch": 0.40135327635327633, + "grad_norm": 0.5185585021972656, + "learning_rate": 
0.00019515053559159435, + "loss": 1.1689, + "step": 2254 + }, + { + "epoch": 0.40153133903133903, + "grad_norm": 0.5468854904174805, + "learning_rate": 0.00019514622858660363, + "loss": 1.2708, + "step": 2255 + }, + { + "epoch": 0.4017094017094017, + "grad_norm": 0.47556906938552856, + "learning_rate": 0.0001951419197174202, + "loss": 1.0488, + "step": 2256 + }, + { + "epoch": 0.40188746438746437, + "grad_norm": 0.5521323084831238, + "learning_rate": 0.0001951376089841285, + "loss": 1.0868, + "step": 2257 + }, + { + "epoch": 0.40206552706552706, + "grad_norm": 0.6029638051986694, + "learning_rate": 0.00019513329638681296, + "loss": 1.1735, + "step": 2258 + }, + { + "epoch": 0.40224358974358976, + "grad_norm": 0.4897766411304474, + "learning_rate": 0.00019512898192555812, + "loss": 1.1687, + "step": 2259 + }, + { + "epoch": 0.4024216524216524, + "grad_norm": 0.45527184009552, + "learning_rate": 0.00019512466560044848, + "loss": 1.0352, + "step": 2260 + }, + { + "epoch": 0.4025997150997151, + "grad_norm": 0.5025625824928284, + "learning_rate": 0.00019512034741156863, + "loss": 1.2503, + "step": 2261 + }, + { + "epoch": 0.4027777777777778, + "grad_norm": 0.46415451169013977, + "learning_rate": 0.00019511602735900317, + "loss": 1.032, + "step": 2262 + }, + { + "epoch": 0.40295584045584043, + "grad_norm": 0.4812934398651123, + "learning_rate": 0.00019511170544283678, + "loss": 1.0523, + "step": 2263 + }, + { + "epoch": 0.4031339031339031, + "grad_norm": 0.49937039613723755, + "learning_rate": 0.00019510738166315404, + "loss": 1.2238, + "step": 2264 + }, + { + "epoch": 0.4033119658119658, + "grad_norm": 0.5428698062896729, + "learning_rate": 0.00019510305602003975, + "loss": 1.0361, + "step": 2265 + }, + { + "epoch": 0.40349002849002846, + "grad_norm": 0.44836854934692383, + "learning_rate": 0.0001950987285135786, + "loss": 1.169, + "step": 2266 + }, + { + "epoch": 0.40366809116809116, + "grad_norm": 0.5071489214897156, + "learning_rate": 0.00019509439914385549, + 
"loss": 1.1567, + "step": 2267 + }, + { + "epoch": 0.40384615384615385, + "grad_norm": 0.5204613208770752, + "learning_rate": 0.00019509006791095513, + "loss": 0.9949, + "step": 2268 + }, + { + "epoch": 0.40402421652421655, + "grad_norm": 0.4583234488964081, + "learning_rate": 0.00019508573481496238, + "loss": 0.9051, + "step": 2269 + }, + { + "epoch": 0.4042022792022792, + "grad_norm": 0.5436791181564331, + "learning_rate": 0.00019508139985596222, + "loss": 1.3239, + "step": 2270 + }, + { + "epoch": 0.4043803418803419, + "grad_norm": 0.48774269223213196, + "learning_rate": 0.00019507706303403954, + "loss": 1.2102, + "step": 2271 + }, + { + "epoch": 0.4045584045584046, + "grad_norm": 0.4742540717124939, + "learning_rate": 0.00019507272434927933, + "loss": 1.1137, + "step": 2272 + }, + { + "epoch": 0.4047364672364672, + "grad_norm": 0.531148374080658, + "learning_rate": 0.00019506838380176658, + "loss": 1.3162, + "step": 2273 + }, + { + "epoch": 0.4049145299145299, + "grad_norm": 0.5002314448356628, + "learning_rate": 0.0001950640413915863, + "loss": 1.0743, + "step": 2274 + }, + { + "epoch": 0.4050925925925926, + "grad_norm": 0.39826446771621704, + "learning_rate": 0.00019505969711882366, + "loss": 0.7698, + "step": 2275 + }, + { + "epoch": 0.40527065527065526, + "grad_norm": 0.5177471041679382, + "learning_rate": 0.00019505535098356371, + "loss": 1.1821, + "step": 2276 + }, + { + "epoch": 0.40544871794871795, + "grad_norm": 0.467241108417511, + "learning_rate": 0.00019505100298589158, + "loss": 0.8036, + "step": 2277 + }, + { + "epoch": 0.40562678062678065, + "grad_norm": 0.43711844086647034, + "learning_rate": 0.00019504665312589255, + "loss": 0.8667, + "step": 2278 + }, + { + "epoch": 0.4058048433048433, + "grad_norm": 0.4929116368293762, + "learning_rate": 0.00019504230140365177, + "loss": 1.1279, + "step": 2279 + }, + { + "epoch": 0.405982905982906, + "grad_norm": 0.5279183983802795, + "learning_rate": 0.00019503794781925452, + "loss": 1.1318, + "step": 2280 + 
}, + { + "epoch": 0.4061609686609687, + "grad_norm": 0.549217939376831, + "learning_rate": 0.00019503359237278608, + "loss": 1.2007, + "step": 2281 + }, + { + "epoch": 0.4063390313390313, + "grad_norm": 0.5485880374908447, + "learning_rate": 0.00019502923506433187, + "loss": 1.1079, + "step": 2282 + }, + { + "epoch": 0.406517094017094, + "grad_norm": 0.48379644751548767, + "learning_rate": 0.0001950248758939772, + "loss": 0.9978, + "step": 2283 + }, + { + "epoch": 0.4066951566951567, + "grad_norm": 0.5943657755851746, + "learning_rate": 0.00019502051486180744, + "loss": 1.0466, + "step": 2284 + }, + { + "epoch": 0.40687321937321935, + "grad_norm": 0.5721273422241211, + "learning_rate": 0.00019501615196790812, + "loss": 1.2674, + "step": 2285 + }, + { + "epoch": 0.40705128205128205, + "grad_norm": 0.47624221444129944, + "learning_rate": 0.00019501178721236464, + "loss": 1.089, + "step": 2286 + }, + { + "epoch": 0.40722934472934474, + "grad_norm": 0.5091297030448914, + "learning_rate": 0.0001950074205952626, + "loss": 1.2035, + "step": 2287 + }, + { + "epoch": 0.4074074074074074, + "grad_norm": 0.45206236839294434, + "learning_rate": 0.0001950030521166875, + "loss": 0.9188, + "step": 2288 + }, + { + "epoch": 0.4075854700854701, + "grad_norm": 0.5563844442367554, + "learning_rate": 0.00019499868177672497, + "loss": 1.3444, + "step": 2289 + }, + { + "epoch": 0.4077635327635328, + "grad_norm": 0.4971138536930084, + "learning_rate": 0.00019499430957546055, + "loss": 1.1615, + "step": 2290 + }, + { + "epoch": 0.4079415954415954, + "grad_norm": 0.49355944991111755, + "learning_rate": 0.00019498993551298, + "loss": 1.1528, + "step": 2291 + }, + { + "epoch": 0.4081196581196581, + "grad_norm": 0.534705638885498, + "learning_rate": 0.000194985559589369, + "loss": 1.197, + "step": 2292 + }, + { + "epoch": 0.4082977207977208, + "grad_norm": 0.5113020539283752, + "learning_rate": 0.0001949811818047133, + "loss": 1.109, + "step": 2293 + }, + { + "epoch": 0.40847578347578345, + 
"grad_norm": 0.4823366701602936, + "learning_rate": 0.00019497680215909858, + "loss": 1.168, + "step": 2294 + }, + { + "epoch": 0.40865384615384615, + "grad_norm": 0.500792920589447, + "learning_rate": 0.00019497242065261077, + "loss": 1.1567, + "step": 2295 + }, + { + "epoch": 0.40883190883190884, + "grad_norm": 0.5047918558120728, + "learning_rate": 0.00019496803728533566, + "loss": 1.0515, + "step": 2296 + }, + { + "epoch": 0.40900997150997154, + "grad_norm": 0.474624365568161, + "learning_rate": 0.00019496365205735913, + "loss": 1.1747, + "step": 2297 + }, + { + "epoch": 0.4091880341880342, + "grad_norm": 0.5522183179855347, + "learning_rate": 0.0001949592649687671, + "loss": 1.1506, + "step": 2298 + }, + { + "epoch": 0.4093660968660969, + "grad_norm": 0.4526083767414093, + "learning_rate": 0.00019495487601964553, + "loss": 0.9968, + "step": 2299 + }, + { + "epoch": 0.40954415954415957, + "grad_norm": 0.545845091342926, + "learning_rate": 0.00019495048521008044, + "loss": 1.146, + "step": 2300 + }, + { + "epoch": 0.4097222222222222, + "grad_norm": 0.5475544333457947, + "learning_rate": 0.00019494609254015784, + "loss": 1.0101, + "step": 2301 + }, + { + "epoch": 0.4099002849002849, + "grad_norm": 0.43419042229652405, + "learning_rate": 0.00019494169800996373, + "loss": 1.065, + "step": 2302 + }, + { + "epoch": 0.4100783475783476, + "grad_norm": 0.44998374581336975, + "learning_rate": 0.00019493730161958435, + "loss": 0.9948, + "step": 2303 + }, + { + "epoch": 0.41025641025641024, + "grad_norm": 0.5401661992073059, + "learning_rate": 0.0001949329033691057, + "loss": 1.0473, + "step": 2304 + }, + { + "epoch": 0.41043447293447294, + "grad_norm": 0.48064103722572327, + "learning_rate": 0.00019492850325861404, + "loss": 1.0486, + "step": 2305 + }, + { + "epoch": 0.41061253561253563, + "grad_norm": 0.5398300290107727, + "learning_rate": 0.00019492410128819557, + "loss": 1.0314, + "step": 2306 + }, + { + "epoch": 0.4107905982905983, + "grad_norm": 0.4771125912666321, + 
"learning_rate": 0.0001949196974579365, + "loss": 0.9855, + "step": 2307 + }, + { + "epoch": 0.41096866096866097, + "grad_norm": 0.5375809669494629, + "learning_rate": 0.00019491529176792315, + "loss": 1.0777, + "step": 2308 + }, + { + "epoch": 0.41114672364672367, + "grad_norm": 0.48424094915390015, + "learning_rate": 0.00019491088421824183, + "loss": 1.0751, + "step": 2309 + }, + { + "epoch": 0.4113247863247863, + "grad_norm": 0.5054880380630493, + "learning_rate": 0.00019490647480897887, + "loss": 1.2457, + "step": 2310 + }, + { + "epoch": 0.411502849002849, + "grad_norm": 0.47118356823921204, + "learning_rate": 0.0001949020635402207, + "loss": 1.0445, + "step": 2311 + }, + { + "epoch": 0.4116809116809117, + "grad_norm": 0.47171851992607117, + "learning_rate": 0.00019489765041205375, + "loss": 1.0062, + "step": 2312 + }, + { + "epoch": 0.41185897435897434, + "grad_norm": 0.5703238844871521, + "learning_rate": 0.00019489323542456447, + "loss": 1.5639, + "step": 2313 + }, + { + "epoch": 0.41203703703703703, + "grad_norm": 0.5045075416564941, + "learning_rate": 0.00019488881857783935, + "loss": 1.1665, + "step": 2314 + }, + { + "epoch": 0.41221509971509973, + "grad_norm": 0.46835362911224365, + "learning_rate": 0.00019488439987196495, + "loss": 1.2078, + "step": 2315 + }, + { + "epoch": 0.41239316239316237, + "grad_norm": 0.5187196731567383, + "learning_rate": 0.00019487997930702785, + "loss": 1.1049, + "step": 2316 + }, + { + "epoch": 0.41257122507122507, + "grad_norm": 0.5190554857254028, + "learning_rate": 0.00019487555688311463, + "loss": 1.331, + "step": 2317 + }, + { + "epoch": 0.41274928774928776, + "grad_norm": 0.7394969463348389, + "learning_rate": 0.00019487113260031197, + "loss": 0.9646, + "step": 2318 + }, + { + "epoch": 0.4129273504273504, + "grad_norm": 0.532982349395752, + "learning_rate": 0.00019486670645870656, + "loss": 1.166, + "step": 2319 + }, + { + "epoch": 0.4131054131054131, + "grad_norm": 0.48659515380859375, + "learning_rate": 
0.00019486227845838509, + "loss": 1.0016, + "step": 2320 + }, + { + "epoch": 0.4132834757834758, + "grad_norm": 0.5364453196525574, + "learning_rate": 0.00019485784859943434, + "loss": 1.3877, + "step": 2321 + }, + { + "epoch": 0.41346153846153844, + "grad_norm": 0.49788740277290344, + "learning_rate": 0.0001948534168819411, + "loss": 1.2949, + "step": 2322 + }, + { + "epoch": 0.41363960113960113, + "grad_norm": 0.5125377774238586, + "learning_rate": 0.00019484898330599217, + "loss": 0.9769, + "step": 2323 + }, + { + "epoch": 0.41381766381766383, + "grad_norm": 0.5434861779212952, + "learning_rate": 0.00019484454787167447, + "loss": 1.254, + "step": 2324 + }, + { + "epoch": 0.41399572649572647, + "grad_norm": 0.5324583053588867, + "learning_rate": 0.00019484011057907487, + "loss": 0.9788, + "step": 2325 + }, + { + "epoch": 0.41417378917378916, + "grad_norm": 0.4806961715221405, + "learning_rate": 0.00019483567142828033, + "loss": 1.0089, + "step": 2326 + }, + { + "epoch": 0.41435185185185186, + "grad_norm": 0.5152947306632996, + "learning_rate": 0.0001948312304193778, + "loss": 1.15, + "step": 2327 + }, + { + "epoch": 0.41452991452991456, + "grad_norm": 0.6030138731002808, + "learning_rate": 0.0001948267875524543, + "loss": 1.196, + "step": 2328 + }, + { + "epoch": 0.4147079772079772, + "grad_norm": 0.4504946768283844, + "learning_rate": 0.0001948223428275969, + "loss": 0.8742, + "step": 2329 + }, + { + "epoch": 0.4148860398860399, + "grad_norm": 0.5195745825767517, + "learning_rate": 0.00019481789624489263, + "loss": 1.0104, + "step": 2330 + }, + { + "epoch": 0.4150641025641026, + "grad_norm": 0.5269250869750977, + "learning_rate": 0.0001948134478044287, + "loss": 1.2284, + "step": 2331 + }, + { + "epoch": 0.41524216524216523, + "grad_norm": 0.5302315354347229, + "learning_rate": 0.00019480899750629218, + "loss": 1.1374, + "step": 2332 + }, + { + "epoch": 0.4154202279202279, + "grad_norm": 0.5501471161842346, + "learning_rate": 0.0001948045453505703, + "loss": 
1.214, + "step": 2333 + }, + { + "epoch": 0.4155982905982906, + "grad_norm": 0.4674588739871979, + "learning_rate": 0.0001948000913373503, + "loss": 1.0568, + "step": 2334 + }, + { + "epoch": 0.41577635327635326, + "grad_norm": 0.5262266993522644, + "learning_rate": 0.0001947956354667195, + "loss": 1.111, + "step": 2335 + }, + { + "epoch": 0.41595441595441596, + "grad_norm": 0.4549071788787842, + "learning_rate": 0.00019479117773876507, + "loss": 1.2655, + "step": 2336 + }, + { + "epoch": 0.41613247863247865, + "grad_norm": 0.48897311091423035, + "learning_rate": 0.00019478671815357447, + "loss": 1.0543, + "step": 2337 + }, + { + "epoch": 0.4163105413105413, + "grad_norm": 0.5544867515563965, + "learning_rate": 0.000194782256711235, + "loss": 1.2276, + "step": 2338 + }, + { + "epoch": 0.416488603988604, + "grad_norm": 0.5050773024559021, + "learning_rate": 0.0001947777934118341, + "loss": 0.9781, + "step": 2339 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 0.4831899106502533, + "learning_rate": 0.00019477332825545925, + "loss": 1.0213, + "step": 2340 + }, + { + "epoch": 0.4168447293447293, + "grad_norm": 0.5392552614212036, + "learning_rate": 0.0001947688612421979, + "loss": 1.3251, + "step": 2341 + }, + { + "epoch": 0.417022792022792, + "grad_norm": 0.5003608465194702, + "learning_rate": 0.00019476439237213754, + "loss": 1.0714, + "step": 2342 + }, + { + "epoch": 0.4172008547008547, + "grad_norm": 0.5016986727714539, + "learning_rate": 0.00019475992164536582, + "loss": 1.0656, + "step": 2343 + }, + { + "epoch": 0.41737891737891736, + "grad_norm": 0.5139234066009521, + "learning_rate": 0.00019475544906197024, + "loss": 1.1317, + "step": 2344 + }, + { + "epoch": 0.41755698005698005, + "grad_norm": 0.582478940486908, + "learning_rate": 0.00019475097462203847, + "loss": 1.4209, + "step": 2345 + }, + { + "epoch": 0.41773504273504275, + "grad_norm": 0.5248767137527466, + "learning_rate": 0.00019474649832565823, + "loss": 1.2965, + "step": 2346 + }, + { + 
"epoch": 0.4179131054131054, + "grad_norm": 0.4977390170097351, + "learning_rate": 0.00019474202017291713, + "loss": 1.3319, + "step": 2347 + }, + { + "epoch": 0.4180911680911681, + "grad_norm": 0.4868984818458557, + "learning_rate": 0.00019473754016390298, + "loss": 1.0595, + "step": 2348 + }, + { + "epoch": 0.4182692307692308, + "grad_norm": 0.5965346693992615, + "learning_rate": 0.00019473305829870353, + "loss": 1.2289, + "step": 2349 + }, + { + "epoch": 0.4184472934472934, + "grad_norm": 0.46590209007263184, + "learning_rate": 0.0001947285745774066, + "loss": 1.0468, + "step": 2350 + }, + { + "epoch": 0.4186253561253561, + "grad_norm": 0.497811883687973, + "learning_rate": 0.0001947240890001, + "loss": 1.1247, + "step": 2351 + }, + { + "epoch": 0.4188034188034188, + "grad_norm": 0.5348289012908936, + "learning_rate": 0.0001947196015668717, + "loss": 0.9496, + "step": 2352 + }, + { + "epoch": 0.41898148148148145, + "grad_norm": 0.5086174607276917, + "learning_rate": 0.0001947151122778095, + "loss": 0.8869, + "step": 2353 + }, + { + "epoch": 0.41915954415954415, + "grad_norm": 0.4844677150249481, + "learning_rate": 0.00019471062113300146, + "loss": 0.847, + "step": 2354 + }, + { + "epoch": 0.41933760683760685, + "grad_norm": 0.5395866632461548, + "learning_rate": 0.00019470612813253556, + "loss": 0.9684, + "step": 2355 + }, + { + "epoch": 0.41951566951566954, + "grad_norm": 0.479403018951416, + "learning_rate": 0.0001947016332764998, + "loss": 1.0532, + "step": 2356 + }, + { + "epoch": 0.4196937321937322, + "grad_norm": 0.5499961376190186, + "learning_rate": 0.00019469713656498227, + "loss": 1.2565, + "step": 2357 + }, + { + "epoch": 0.4198717948717949, + "grad_norm": 0.5865352153778076, + "learning_rate": 0.00019469263799807104, + "loss": 1.1349, + "step": 2358 + }, + { + "epoch": 0.4200498575498576, + "grad_norm": 0.4454309046268463, + "learning_rate": 0.00019468813757585432, + "loss": 0.9631, + "step": 2359 + }, + { + "epoch": 0.4202279202279202, + 
"grad_norm": 0.48426875472068787, + "learning_rate": 0.00019468363529842023, + "loss": 0.9795, + "step": 2360 + }, + { + "epoch": 0.4204059829059829, + "grad_norm": 0.47428226470947266, + "learning_rate": 0.00019467913116585697, + "loss": 0.9316, + "step": 2361 + }, + { + "epoch": 0.4205840455840456, + "grad_norm": 0.5193758010864258, + "learning_rate": 0.00019467462517825282, + "loss": 1.235, + "step": 2362 + }, + { + "epoch": 0.42076210826210825, + "grad_norm": 0.49845513701438904, + "learning_rate": 0.00019467011733569607, + "loss": 1.2413, + "step": 2363 + }, + { + "epoch": 0.42094017094017094, + "grad_norm": 0.45483845472335815, + "learning_rate": 0.00019466560763827502, + "loss": 1.2817, + "step": 2364 + }, + { + "epoch": 0.42111823361823364, + "grad_norm": 0.43345287442207336, + "learning_rate": 0.00019466109608607806, + "loss": 0.8568, + "step": 2365 + }, + { + "epoch": 0.4212962962962963, + "grad_norm": 0.4467088282108307, + "learning_rate": 0.00019465658267919352, + "loss": 1.1408, + "step": 2366 + }, + { + "epoch": 0.421474358974359, + "grad_norm": 0.6705610156059265, + "learning_rate": 0.00019465206741770992, + "loss": 1.445, + "step": 2367 + }, + { + "epoch": 0.42165242165242167, + "grad_norm": 0.5037859678268433, + "learning_rate": 0.00019464755030171565, + "loss": 0.8682, + "step": 2368 + }, + { + "epoch": 0.4218304843304843, + "grad_norm": 0.49576324224472046, + "learning_rate": 0.00019464303133129928, + "loss": 0.8387, + "step": 2369 + }, + { + "epoch": 0.422008547008547, + "grad_norm": 0.5222806334495544, + "learning_rate": 0.00019463851050654927, + "loss": 1.1443, + "step": 2370 + }, + { + "epoch": 0.4221866096866097, + "grad_norm": 0.4966863989830017, + "learning_rate": 0.00019463398782755426, + "loss": 1.1555, + "step": 2371 + }, + { + "epoch": 0.42236467236467234, + "grad_norm": 0.6140168309211731, + "learning_rate": 0.00019462946329440285, + "loss": 1.2264, + "step": 2372 + }, + { + "epoch": 0.42254273504273504, + "grad_norm": 
0.4906651973724365, + "learning_rate": 0.0001946249369071837, + "loss": 1.2459, + "step": 2373 + }, + { + "epoch": 0.42272079772079774, + "grad_norm": 0.5956700444221497, + "learning_rate": 0.00019462040866598544, + "loss": 1.1521, + "step": 2374 + }, + { + "epoch": 0.4228988603988604, + "grad_norm": 0.46044886112213135, + "learning_rate": 0.00019461587857089687, + "loss": 1.2084, + "step": 2375 + }, + { + "epoch": 0.4230769230769231, + "grad_norm": 0.5109430551528931, + "learning_rate": 0.00019461134662200668, + "loss": 1.2684, + "step": 2376 + }, + { + "epoch": 0.42325498575498577, + "grad_norm": 0.4373733103275299, + "learning_rate": 0.0001946068128194037, + "loss": 1.0451, + "step": 2377 + }, + { + "epoch": 0.4234330484330484, + "grad_norm": 0.553817868232727, + "learning_rate": 0.00019460227716317673, + "loss": 1.1052, + "step": 2378 + }, + { + "epoch": 0.4236111111111111, + "grad_norm": 0.5742647647857666, + "learning_rate": 0.00019459773965341468, + "loss": 1.1647, + "step": 2379 + }, + { + "epoch": 0.4237891737891738, + "grad_norm": 0.5461940169334412, + "learning_rate": 0.00019459320029020642, + "loss": 1.0953, + "step": 2380 + }, + { + "epoch": 0.42396723646723644, + "grad_norm": 0.5837802290916443, + "learning_rate": 0.0001945886590736409, + "loss": 1.1303, + "step": 2381 + }, + { + "epoch": 0.42414529914529914, + "grad_norm": 0.5316985249519348, + "learning_rate": 0.0001945841160038071, + "loss": 1.1204, + "step": 2382 + }, + { + "epoch": 0.42432336182336183, + "grad_norm": 0.5846191048622131, + "learning_rate": 0.00019457957108079404, + "loss": 1.2622, + "step": 2383 + }, + { + "epoch": 0.42450142450142453, + "grad_norm": 0.43266957998275757, + "learning_rate": 0.00019457502430469075, + "loss": 0.9834, + "step": 2384 + }, + { + "epoch": 0.42467948717948717, + "grad_norm": 0.514081597328186, + "learning_rate": 0.00019457047567558632, + "loss": 0.8413, + "step": 2385 + }, + { + "epoch": 0.42485754985754987, + "grad_norm": 0.4831700325012207, + 
"learning_rate": 0.00019456592519356987, + "loss": 0.9244, + "step": 2386 + }, + { + "epoch": 0.42503561253561256, + "grad_norm": 0.5612850785255432, + "learning_rate": 0.00019456137285873057, + "loss": 0.9438, + "step": 2387 + }, + { + "epoch": 0.4252136752136752, + "grad_norm": 0.5197352766990662, + "learning_rate": 0.00019455681867115758, + "loss": 1.1095, + "step": 2388 + }, + { + "epoch": 0.4253917378917379, + "grad_norm": 0.5045261979103088, + "learning_rate": 0.00019455226263094018, + "loss": 1.0007, + "step": 2389 + }, + { + "epoch": 0.4255698005698006, + "grad_norm": 0.5167570114135742, + "learning_rate": 0.00019454770473816758, + "loss": 1.1335, + "step": 2390 + }, + { + "epoch": 0.42574786324786323, + "grad_norm": 0.49262070655822754, + "learning_rate": 0.00019454314499292913, + "loss": 1.0436, + "step": 2391 + }, + { + "epoch": 0.42592592592592593, + "grad_norm": 0.4489207863807678, + "learning_rate": 0.00019453858339531417, + "loss": 1.0138, + "step": 2392 + }, + { + "epoch": 0.4261039886039886, + "grad_norm": 0.6024920344352722, + "learning_rate": 0.00019453401994541203, + "loss": 1.1921, + "step": 2393 + }, + { + "epoch": 0.42628205128205127, + "grad_norm": 0.46807861328125, + "learning_rate": 0.00019452945464331215, + "loss": 1.0947, + "step": 2394 + }, + { + "epoch": 0.42646011396011396, + "grad_norm": 0.48776543140411377, + "learning_rate": 0.00019452488748910397, + "loss": 1.0029, + "step": 2395 + }, + { + "epoch": 0.42663817663817666, + "grad_norm": 0.4798663556575775, + "learning_rate": 0.000194520318482877, + "loss": 0.7863, + "step": 2396 + }, + { + "epoch": 0.4268162393162393, + "grad_norm": 0.5067816972732544, + "learning_rate": 0.0001945157476247207, + "loss": 1.0049, + "step": 2397 + }, + { + "epoch": 0.426994301994302, + "grad_norm": 0.5179638266563416, + "learning_rate": 0.00019451117491472468, + "loss": 1.1851, + "step": 2398 + }, + { + "epoch": 0.4271723646723647, + "grad_norm": 0.4782430827617645, + "learning_rate": 
0.00019450660035297854, + "loss": 1.125, + "step": 2399 + }, + { + "epoch": 0.42735042735042733, + "grad_norm": 0.560077965259552, + "learning_rate": 0.00019450202393957186, + "loss": 1.1843, + "step": 2400 + }, + { + "epoch": 0.42752849002849, + "grad_norm": 0.5247970223426819, + "learning_rate": 0.00019449744567459436, + "loss": 1.1576, + "step": 2401 + }, + { + "epoch": 0.4277065527065527, + "grad_norm": 0.6414062976837158, + "learning_rate": 0.00019449286555813568, + "loss": 1.1833, + "step": 2402 + }, + { + "epoch": 0.42788461538461536, + "grad_norm": 0.5006586909294128, + "learning_rate": 0.00019448828359028563, + "loss": 1.1778, + "step": 2403 + }, + { + "epoch": 0.42806267806267806, + "grad_norm": 0.4946450889110565, + "learning_rate": 0.0001944836997711339, + "loss": 1.1611, + "step": 2404 + }, + { + "epoch": 0.42824074074074076, + "grad_norm": 0.4601200222969055, + "learning_rate": 0.00019447911410077037, + "loss": 1.2456, + "step": 2405 + }, + { + "epoch": 0.4284188034188034, + "grad_norm": 0.4653947651386261, + "learning_rate": 0.00019447452657928485, + "loss": 1.0941, + "step": 2406 + }, + { + "epoch": 0.4285968660968661, + "grad_norm": 0.5015713572502136, + "learning_rate": 0.00019446993720676726, + "loss": 1.3113, + "step": 2407 + }, + { + "epoch": 0.4287749287749288, + "grad_norm": 0.5803143978118896, + "learning_rate": 0.0001944653459833075, + "loss": 1.0568, + "step": 2408 + }, + { + "epoch": 0.42895299145299143, + "grad_norm": 0.5259647965431213, + "learning_rate": 0.0001944607529089955, + "loss": 1.1243, + "step": 2409 + }, + { + "epoch": 0.4291310541310541, + "grad_norm": 0.5150414109230042, + "learning_rate": 0.00019445615798392124, + "loss": 1.0676, + "step": 2410 + }, + { + "epoch": 0.4293091168091168, + "grad_norm": 0.5848649740219116, + "learning_rate": 0.0001944515612081748, + "loss": 1.0671, + "step": 2411 + }, + { + "epoch": 0.42948717948717946, + "grad_norm": 0.5696990489959717, + "learning_rate": 0.00019444696258184626, + "loss": 
1.3323, + "step": 2412 + }, + { + "epoch": 0.42966524216524216, + "grad_norm": 0.49822330474853516, + "learning_rate": 0.00019444236210502567, + "loss": 1.1004, + "step": 2413 + }, + { + "epoch": 0.42984330484330485, + "grad_norm": 0.4683490991592407, + "learning_rate": 0.00019443775977780317, + "loss": 0.9768, + "step": 2414 + }, + { + "epoch": 0.43002136752136755, + "grad_norm": 0.5703811049461365, + "learning_rate": 0.00019443315560026893, + "loss": 1.154, + "step": 2415 + }, + { + "epoch": 0.4301994301994302, + "grad_norm": 0.5121861100196838, + "learning_rate": 0.0001944285495725132, + "loss": 1.1388, + "step": 2416 + }, + { + "epoch": 0.4303774928774929, + "grad_norm": 0.4864094853401184, + "learning_rate": 0.00019442394169462619, + "loss": 0.9214, + "step": 2417 + }, + { + "epoch": 0.4305555555555556, + "grad_norm": 0.5234864354133606, + "learning_rate": 0.0001944193319666982, + "loss": 1.2787, + "step": 2418 + }, + { + "epoch": 0.4307336182336182, + "grad_norm": 0.5137650370597839, + "learning_rate": 0.00019441472038881955, + "loss": 1.1406, + "step": 2419 + }, + { + "epoch": 0.4309116809116809, + "grad_norm": 0.49687784910202026, + "learning_rate": 0.00019441010696108054, + "loss": 0.93, + "step": 2420 + }, + { + "epoch": 0.4310897435897436, + "grad_norm": 0.5078722834587097, + "learning_rate": 0.00019440549168357163, + "loss": 1.1417, + "step": 2421 + }, + { + "epoch": 0.43126780626780625, + "grad_norm": 0.4483391046524048, + "learning_rate": 0.00019440087455638324, + "loss": 0.9016, + "step": 2422 + }, + { + "epoch": 0.43144586894586895, + "grad_norm": 0.5963045954704285, + "learning_rate": 0.00019439625557960576, + "loss": 1.1567, + "step": 2423 + }, + { + "epoch": 0.43162393162393164, + "grad_norm": 0.5534471273422241, + "learning_rate": 0.0001943916347533298, + "loss": 1.1409, + "step": 2424 + }, + { + "epoch": 0.4318019943019943, + "grad_norm": 0.6400241851806641, + "learning_rate": 0.0001943870120776458, + "loss": 1.2041, + "step": 2425 + }, + { + 
"epoch": 0.431980056980057, + "grad_norm": 0.4599420726299286, + "learning_rate": 0.0001943823875526444, + "loss": 1.023, + "step": 2426 + }, + { + "epoch": 0.4321581196581197, + "grad_norm": 0.4799708425998688, + "learning_rate": 0.00019437776117841614, + "loss": 1.0872, + "step": 2427 + }, + { + "epoch": 0.4323361823361823, + "grad_norm": 0.5138532519340515, + "learning_rate": 0.00019437313295505172, + "loss": 1.1175, + "step": 2428 + }, + { + "epoch": 0.432514245014245, + "grad_norm": 0.538223147392273, + "learning_rate": 0.00019436850288264183, + "loss": 1.1203, + "step": 2429 + }, + { + "epoch": 0.4326923076923077, + "grad_norm": 0.458044171333313, + "learning_rate": 0.00019436387096127713, + "loss": 1.0383, + "step": 2430 + }, + { + "epoch": 0.43287037037037035, + "grad_norm": 0.5928303599357605, + "learning_rate": 0.00019435923719104842, + "loss": 1.1191, + "step": 2431 + }, + { + "epoch": 0.43304843304843305, + "grad_norm": 0.5818437933921814, + "learning_rate": 0.00019435460157204645, + "loss": 1.0352, + "step": 2432 + }, + { + "epoch": 0.43322649572649574, + "grad_norm": 0.487341046333313, + "learning_rate": 0.0001943499641043621, + "loss": 1.2608, + "step": 2433 + }, + { + "epoch": 0.4334045584045584, + "grad_norm": 0.4737292230129242, + "learning_rate": 0.0001943453247880862, + "loss": 1.0084, + "step": 2434 + }, + { + "epoch": 0.4335826210826211, + "grad_norm": 0.4251207709312439, + "learning_rate": 0.0001943406836233096, + "loss": 0.9163, + "step": 2435 + }, + { + "epoch": 0.4337606837606838, + "grad_norm": 0.49468478560447693, + "learning_rate": 0.00019433604061012331, + "loss": 1.0293, + "step": 2436 + }, + { + "epoch": 0.4339387464387464, + "grad_norm": 0.47120022773742676, + "learning_rate": 0.00019433139574861826, + "loss": 1.0097, + "step": 2437 + }, + { + "epoch": 0.4341168091168091, + "grad_norm": 0.5060358047485352, + "learning_rate": 0.00019432674903888548, + "loss": 1.0683, + "step": 2438 + }, + { + "epoch": 0.4342948717948718, + 
"grad_norm": 0.5455917119979858, + "learning_rate": 0.00019432210048101598, + "loss": 0.8886, + "step": 2439 + }, + { + "epoch": 0.43447293447293445, + "grad_norm": 0.7960546612739563, + "learning_rate": 0.00019431745007510086, + "loss": 0.8648, + "step": 2440 + }, + { + "epoch": 0.43465099715099714, + "grad_norm": 0.5069689154624939, + "learning_rate": 0.00019431279782123126, + "loss": 1.1315, + "step": 2441 + }, + { + "epoch": 0.43482905982905984, + "grad_norm": 0.5597776174545288, + "learning_rate": 0.0001943081437194983, + "loss": 1.2281, + "step": 2442 + }, + { + "epoch": 0.43500712250712253, + "grad_norm": 0.4527420997619629, + "learning_rate": 0.00019430348776999315, + "loss": 0.7576, + "step": 2443 + }, + { + "epoch": 0.4351851851851852, + "grad_norm": 0.5625936388969421, + "learning_rate": 0.00019429882997280706, + "loss": 1.0302, + "step": 2444 + }, + { + "epoch": 0.43536324786324787, + "grad_norm": 0.5173513293266296, + "learning_rate": 0.0001942941703280313, + "loss": 1.2255, + "step": 2445 + }, + { + "epoch": 0.43554131054131057, + "grad_norm": 0.45889151096343994, + "learning_rate": 0.00019428950883575714, + "loss": 0.9322, + "step": 2446 + }, + { + "epoch": 0.4357193732193732, + "grad_norm": 0.5288477540016174, + "learning_rate": 0.00019428484549607593, + "loss": 1.0572, + "step": 2447 + }, + { + "epoch": 0.4358974358974359, + "grad_norm": 0.48328033089637756, + "learning_rate": 0.00019428018030907902, + "loss": 1.1213, + "step": 2448 + }, + { + "epoch": 0.4360754985754986, + "grad_norm": 0.5146737098693848, + "learning_rate": 0.00019427551327485786, + "loss": 0.9633, + "step": 2449 + }, + { + "epoch": 0.43625356125356124, + "grad_norm": 0.5138360261917114, + "learning_rate": 0.00019427084439350382, + "loss": 1.0561, + "step": 2450 + }, + { + "epoch": 0.43643162393162394, + "grad_norm": 0.5192533135414124, + "learning_rate": 0.00019426617366510843, + "loss": 1.1704, + "step": 2451 + }, + { + "epoch": 0.43660968660968663, + "grad_norm": 
0.4819495379924774, + "learning_rate": 0.00019426150108976318, + "loss": 1.0958, + "step": 2452 + }, + { + "epoch": 0.43678774928774927, + "grad_norm": 0.4626680910587311, + "learning_rate": 0.00019425682666755965, + "loss": 1.1872, + "step": 2453 + }, + { + "epoch": 0.43696581196581197, + "grad_norm": 0.5773931741714478, + "learning_rate": 0.00019425215039858937, + "loss": 1.0722, + "step": 2454 + }, + { + "epoch": 0.43714387464387466, + "grad_norm": 0.5003872513771057, + "learning_rate": 0.00019424747228294402, + "loss": 1.0561, + "step": 2455 + }, + { + "epoch": 0.4373219373219373, + "grad_norm": 0.47370314598083496, + "learning_rate": 0.0001942427923207152, + "loss": 1.1619, + "step": 2456 + }, + { + "epoch": 0.4375, + "grad_norm": 0.466421514749527, + "learning_rate": 0.00019423811051199466, + "loss": 1.1311, + "step": 2457 + }, + { + "epoch": 0.4376780626780627, + "grad_norm": 0.44564682245254517, + "learning_rate": 0.00019423342685687413, + "loss": 1.1889, + "step": 2458 + }, + { + "epoch": 0.43785612535612534, + "grad_norm": 0.40986698865890503, + "learning_rate": 0.00019422874135544533, + "loss": 0.7312, + "step": 2459 + }, + { + "epoch": 0.43803418803418803, + "grad_norm": 0.4714358448982239, + "learning_rate": 0.0001942240540078001, + "loss": 0.9273, + "step": 2460 + }, + { + "epoch": 0.43821225071225073, + "grad_norm": 0.5298398733139038, + "learning_rate": 0.00019421936481403025, + "loss": 1.3377, + "step": 2461 + }, + { + "epoch": 0.43839031339031337, + "grad_norm": 0.6326695680618286, + "learning_rate": 0.0001942146737742277, + "loss": 1.0258, + "step": 2462 + }, + { + "epoch": 0.43856837606837606, + "grad_norm": 0.5087653994560242, + "learning_rate": 0.00019420998088848427, + "loss": 1.0007, + "step": 2463 + }, + { + "epoch": 0.43874643874643876, + "grad_norm": 0.4895429313182831, + "learning_rate": 0.00019420528615689202, + "loss": 1.0032, + "step": 2464 + }, + { + "epoch": 0.4389245014245014, + "grad_norm": 0.5029937028884888, + "learning_rate": 
0.00019420058957954285, + "loss": 1.2877, + "step": 2465 + }, + { + "epoch": 0.4391025641025641, + "grad_norm": 0.4953192174434662, + "learning_rate": 0.00019419589115652884, + "loss": 1.0759, + "step": 2466 + }, + { + "epoch": 0.4392806267806268, + "grad_norm": 0.5081778168678284, + "learning_rate": 0.000194191190887942, + "loss": 0.8816, + "step": 2467 + }, + { + "epoch": 0.43945868945868943, + "grad_norm": 0.5065913200378418, + "learning_rate": 0.00019418648877387446, + "loss": 1.0362, + "step": 2468 + }, + { + "epoch": 0.43963675213675213, + "grad_norm": 0.540600061416626, + "learning_rate": 0.00019418178481441832, + "loss": 1.0911, + "step": 2469 + }, + { + "epoch": 0.4398148148148148, + "grad_norm": 0.5122954845428467, + "learning_rate": 0.00019417707900966572, + "loss": 0.9866, + "step": 2470 + }, + { + "epoch": 0.43999287749287747, + "grad_norm": 0.5380190014839172, + "learning_rate": 0.00019417237135970893, + "loss": 1.2775, + "step": 2471 + }, + { + "epoch": 0.44017094017094016, + "grad_norm": 1.2977570295333862, + "learning_rate": 0.00019416766186464016, + "loss": 1.3993, + "step": 2472 + }, + { + "epoch": 0.44034900284900286, + "grad_norm": 0.48105308413505554, + "learning_rate": 0.00019416295052455165, + "loss": 0.9369, + "step": 2473 + }, + { + "epoch": 0.44052706552706555, + "grad_norm": 0.4742157459259033, + "learning_rate": 0.00019415823733953574, + "loss": 1.101, + "step": 2474 + }, + { + "epoch": 0.4407051282051282, + "grad_norm": 0.4958631694316864, + "learning_rate": 0.00019415352230968473, + "loss": 0.9906, + "step": 2475 + }, + { + "epoch": 0.4408831908831909, + "grad_norm": 0.5808146595954895, + "learning_rate": 0.00019414880543509107, + "loss": 1.2315, + "step": 2476 + }, + { + "epoch": 0.4410612535612536, + "grad_norm": 0.4294755160808563, + "learning_rate": 0.00019414408671584714, + "loss": 0.8275, + "step": 2477 + }, + { + "epoch": 0.4412393162393162, + "grad_norm": 0.5346055626869202, + "learning_rate": 0.0001941393661520454, + "loss": 
1.2432, + "step": 2478 + }, + { + "epoch": 0.4414173789173789, + "grad_norm": 0.5827590227127075, + "learning_rate": 0.00019413464374377833, + "loss": 1.3204, + "step": 2479 + }, + { + "epoch": 0.4415954415954416, + "grad_norm": 0.45688143372535706, + "learning_rate": 0.00019412991949113847, + "loss": 0.9307, + "step": 2480 + }, + { + "epoch": 0.44177350427350426, + "grad_norm": 0.512999415397644, + "learning_rate": 0.0001941251933942184, + "loss": 1.2808, + "step": 2481 + }, + { + "epoch": 0.44195156695156695, + "grad_norm": 0.4546334445476532, + "learning_rate": 0.00019412046545311064, + "loss": 1.0156, + "step": 2482 + }, + { + "epoch": 0.44212962962962965, + "grad_norm": 0.48552581667900085, + "learning_rate": 0.00019411573566790793, + "loss": 1.3798, + "step": 2483 + }, + { + "epoch": 0.4423076923076923, + "grad_norm": 0.511970579624176, + "learning_rate": 0.00019411100403870287, + "loss": 1.065, + "step": 2484 + }, + { + "epoch": 0.442485754985755, + "grad_norm": 0.6367824077606201, + "learning_rate": 0.00019410627056558815, + "loss": 1.3242, + "step": 2485 + }, + { + "epoch": 0.4426638176638177, + "grad_norm": 0.48913368582725525, + "learning_rate": 0.00019410153524865659, + "loss": 0.9761, + "step": 2486 + }, + { + "epoch": 0.4428418803418803, + "grad_norm": 0.5077710151672363, + "learning_rate": 0.0001940967980880009, + "loss": 1.1023, + "step": 2487 + }, + { + "epoch": 0.443019943019943, + "grad_norm": 0.4956335723400116, + "learning_rate": 0.00019409205908371395, + "loss": 1.1788, + "step": 2488 + }, + { + "epoch": 0.4431980056980057, + "grad_norm": 0.4726616442203522, + "learning_rate": 0.00019408731823588853, + "loss": 1.1445, + "step": 2489 + }, + { + "epoch": 0.44337606837606836, + "grad_norm": 0.5676438212394714, + "learning_rate": 0.00019408257554461757, + "loss": 1.0344, + "step": 2490 + }, + { + "epoch": 0.44355413105413105, + "grad_norm": 0.537656843662262, + "learning_rate": 0.000194077831009994, + "loss": 0.9876, + "step": 2491 + }, + { + 
"epoch": 0.44373219373219375, + "grad_norm": 0.517905592918396, + "learning_rate": 0.00019407308463211074, + "loss": 1.1389, + "step": 2492 + }, + { + "epoch": 0.4439102564102564, + "grad_norm": 0.49227026104927063, + "learning_rate": 0.0001940683364110608, + "loss": 1.0351, + "step": 2493 + }, + { + "epoch": 0.4440883190883191, + "grad_norm": 0.5131173729896545, + "learning_rate": 0.00019406358634693725, + "loss": 1.0351, + "step": 2494 + }, + { + "epoch": 0.4442663817663818, + "grad_norm": 0.5064495205879211, + "learning_rate": 0.0001940588344398331, + "loss": 1.0248, + "step": 2495 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.44107526540756226, + "learning_rate": 0.00019405408068984148, + "loss": 0.8068, + "step": 2496 + }, + { + "epoch": 0.4446225071225071, + "grad_norm": 0.6711848378181458, + "learning_rate": 0.00019404932509705554, + "loss": 1.059, + "step": 2497 + }, + { + "epoch": 0.4448005698005698, + "grad_norm": 0.5862596035003662, + "learning_rate": 0.00019404456766156845, + "loss": 1.2012, + "step": 2498 + }, + { + "epoch": 0.44497863247863245, + "grad_norm": 0.5528512001037598, + "learning_rate": 0.0001940398083834734, + "loss": 1.1121, + "step": 2499 + }, + { + "epoch": 0.44515669515669515, + "grad_norm": 0.5326655507087708, + "learning_rate": 0.0001940350472628637, + "loss": 1.166, + "step": 2500 + }, + { + "epoch": 0.44533475783475784, + "grad_norm": 0.5384873747825623, + "learning_rate": 0.00019403028429983252, + "loss": 1.4111, + "step": 2501 + }, + { + "epoch": 0.44551282051282054, + "grad_norm": 0.5142310857772827, + "learning_rate": 0.0001940255194944733, + "loss": 1.3353, + "step": 2502 + }, + { + "epoch": 0.4456908831908832, + "grad_norm": 0.49124231934547424, + "learning_rate": 0.0001940207528468793, + "loss": 1.1443, + "step": 2503 + }, + { + "epoch": 0.4458689458689459, + "grad_norm": 0.509713888168335, + "learning_rate": 0.000194015984357144, + "loss": 1.1857, + "step": 2504 + }, + { + "epoch": 0.44604700854700857, + 
"grad_norm": 0.5211275219917297, + "learning_rate": 0.00019401121402536078, + "loss": 0.9911, + "step": 2505 + }, + { + "epoch": 0.4462250712250712, + "grad_norm": 0.480340838432312, + "learning_rate": 0.00019400644185162312, + "loss": 1.1018, + "step": 2506 + }, + { + "epoch": 0.4464031339031339, + "grad_norm": 0.4212559163570404, + "learning_rate": 0.00019400166783602448, + "loss": 0.7501, + "step": 2507 + }, + { + "epoch": 0.4465811965811966, + "grad_norm": 0.5110511183738708, + "learning_rate": 0.00019399689197865846, + "loss": 1.1244, + "step": 2508 + }, + { + "epoch": 0.44675925925925924, + "grad_norm": 0.5604230165481567, + "learning_rate": 0.0001939921142796186, + "loss": 1.1066, + "step": 2509 + }, + { + "epoch": 0.44693732193732194, + "grad_norm": 0.5578675270080566, + "learning_rate": 0.0001939873347389985, + "loss": 1.0514, + "step": 2510 + }, + { + "epoch": 0.44711538461538464, + "grad_norm": 0.520908772945404, + "learning_rate": 0.00019398255335689184, + "loss": 1.1217, + "step": 2511 + }, + { + "epoch": 0.4472934472934473, + "grad_norm": 0.4405131936073303, + "learning_rate": 0.00019397777013339224, + "loss": 1.043, + "step": 2512 + }, + { + "epoch": 0.44747150997151, + "grad_norm": 0.5217751860618591, + "learning_rate": 0.0001939729850685935, + "loss": 1.1301, + "step": 2513 + }, + { + "epoch": 0.44764957264957267, + "grad_norm": 0.6151493191719055, + "learning_rate": 0.00019396819816258932, + "loss": 1.3498, + "step": 2514 + }, + { + "epoch": 0.4478276353276353, + "grad_norm": 0.5622836947441101, + "learning_rate": 0.0001939634094154735, + "loss": 1.146, + "step": 2515 + }, + { + "epoch": 0.448005698005698, + "grad_norm": 0.4671688973903656, + "learning_rate": 0.00019395861882733984, + "loss": 0.9456, + "step": 2516 + }, + { + "epoch": 0.4481837606837607, + "grad_norm": 0.453951358795166, + "learning_rate": 0.00019395382639828223, + "loss": 1.0042, + "step": 2517 + }, + { + "epoch": 0.44836182336182334, + "grad_norm": 0.5150699615478516, + 
"learning_rate": 0.0001939490321283946, + "loss": 1.166, + "step": 2518 + }, + { + "epoch": 0.44853988603988604, + "grad_norm": 0.5718298554420471, + "learning_rate": 0.0001939442360177708, + "loss": 1.2033, + "step": 2519 + }, + { + "epoch": 0.44871794871794873, + "grad_norm": 0.5306782126426697, + "learning_rate": 0.00019393943806650488, + "loss": 1.0765, + "step": 2520 + }, + { + "epoch": 0.4488960113960114, + "grad_norm": 0.47633033990859985, + "learning_rate": 0.0001939346382746908, + "loss": 0.9957, + "step": 2521 + }, + { + "epoch": 0.44907407407407407, + "grad_norm": 0.496441513299942, + "learning_rate": 0.00019392983664242262, + "loss": 1.2016, + "step": 2522 + }, + { + "epoch": 0.44925213675213677, + "grad_norm": 0.45956477522850037, + "learning_rate": 0.00019392503316979442, + "loss": 1.026, + "step": 2523 + }, + { + "epoch": 0.4494301994301994, + "grad_norm": 0.5400575995445251, + "learning_rate": 0.0001939202278569003, + "loss": 1.0785, + "step": 2524 + }, + { + "epoch": 0.4496082621082621, + "grad_norm": 0.4847868084907532, + "learning_rate": 0.00019391542070383442, + "loss": 1.013, + "step": 2525 + }, + { + "epoch": 0.4497863247863248, + "grad_norm": 0.4694063663482666, + "learning_rate": 0.00019391061171069094, + "loss": 0.8793, + "step": 2526 + }, + { + "epoch": 0.44996438746438744, + "grad_norm": 0.5158169269561768, + "learning_rate": 0.00019390580087756413, + "loss": 0.9602, + "step": 2527 + }, + { + "epoch": 0.45014245014245013, + "grad_norm": 0.5404585003852844, + "learning_rate": 0.00019390098820454822, + "loss": 1.2247, + "step": 2528 + }, + { + "epoch": 0.45032051282051283, + "grad_norm": 0.5302738547325134, + "learning_rate": 0.00019389617369173752, + "loss": 0.918, + "step": 2529 + }, + { + "epoch": 0.45049857549857547, + "grad_norm": 0.5065485835075378, + "learning_rate": 0.00019389135733922634, + "loss": 1.0934, + "step": 2530 + }, + { + "epoch": 0.45067663817663817, + "grad_norm": 0.5491471886634827, + "learning_rate": 
0.00019388653914710903, + "loss": 1.0736, + "step": 2531 + }, + { + "epoch": 0.45085470085470086, + "grad_norm": 0.4850206971168518, + "learning_rate": 0.00019388171911548005, + "loss": 1.2401, + "step": 2532 + }, + { + "epoch": 0.45103276353276356, + "grad_norm": 0.5419789552688599, + "learning_rate": 0.0001938768972444338, + "loss": 1.269, + "step": 2533 + }, + { + "epoch": 0.4512108262108262, + "grad_norm": 0.4209023714065552, + "learning_rate": 0.00019387207353406476, + "loss": 1.0544, + "step": 2534 + }, + { + "epoch": 0.4513888888888889, + "grad_norm": 0.578588604927063, + "learning_rate": 0.00019386724798446743, + "loss": 1.0564, + "step": 2535 + }, + { + "epoch": 0.4515669515669516, + "grad_norm": 0.5277524590492249, + "learning_rate": 0.00019386242059573638, + "loss": 1.1497, + "step": 2536 + }, + { + "epoch": 0.45174501424501423, + "grad_norm": 0.5536073446273804, + "learning_rate": 0.0001938575913679662, + "loss": 1.2213, + "step": 2537 + }, + { + "epoch": 0.4519230769230769, + "grad_norm": 0.5572254657745361, + "learning_rate": 0.00019385276030125143, + "loss": 1.0231, + "step": 2538 + }, + { + "epoch": 0.4521011396011396, + "grad_norm": 0.493847131729126, + "learning_rate": 0.00019384792739568686, + "loss": 0.9385, + "step": 2539 + }, + { + "epoch": 0.45227920227920226, + "grad_norm": 0.4641396403312683, + "learning_rate": 0.00019384309265136707, + "loss": 0.9332, + "step": 2540 + }, + { + "epoch": 0.45245726495726496, + "grad_norm": 0.5439442992210388, + "learning_rate": 0.00019383825606838681, + "loss": 1.317, + "step": 2541 + }, + { + "epoch": 0.45263532763532766, + "grad_norm": 0.7050970792770386, + "learning_rate": 0.00019383341764684086, + "loss": 0.9508, + "step": 2542 + }, + { + "epoch": 0.4528133903133903, + "grad_norm": 0.5013265013694763, + "learning_rate": 0.000193828577386824, + "loss": 1.2704, + "step": 2543 + }, + { + "epoch": 0.452991452991453, + "grad_norm": 0.47641924023628235, + "learning_rate": 0.0001938237352884311, + "loss": 
1.0101, + "step": 2544 + }, + { + "epoch": 0.4531695156695157, + "grad_norm": 0.5223637819290161, + "learning_rate": 0.000193818891351757, + "loss": 1.0548, + "step": 2545 + }, + { + "epoch": 0.45334757834757833, + "grad_norm": 0.49065667390823364, + "learning_rate": 0.0001938140455768966, + "loss": 1.0927, + "step": 2546 + }, + { + "epoch": 0.453525641025641, + "grad_norm": 0.4808312654495239, + "learning_rate": 0.0001938091979639449, + "loss": 1.0599, + "step": 2547 + }, + { + "epoch": 0.4537037037037037, + "grad_norm": 0.5157489776611328, + "learning_rate": 0.0001938043485129968, + "loss": 1.2596, + "step": 2548 + }, + { + "epoch": 0.45388176638176636, + "grad_norm": 0.5983387231826782, + "learning_rate": 0.0001937994972241474, + "loss": 1.2276, + "step": 2549 + }, + { + "epoch": 0.45405982905982906, + "grad_norm": 0.49776506423950195, + "learning_rate": 0.00019379464409749163, + "loss": 1.3666, + "step": 2550 + }, + { + "epoch": 0.45423789173789175, + "grad_norm": 0.4693490266799927, + "learning_rate": 0.00019378978913312471, + "loss": 1.087, + "step": 2551 + }, + { + "epoch": 0.4544159544159544, + "grad_norm": 0.4754335880279541, + "learning_rate": 0.00019378493233114167, + "loss": 1.1282, + "step": 2552 + }, + { + "epoch": 0.4545940170940171, + "grad_norm": 0.5852862000465393, + "learning_rate": 0.00019378007369163776, + "loss": 1.1113, + "step": 2553 + }, + { + "epoch": 0.4547720797720798, + "grad_norm": 0.47442635893821716, + "learning_rate": 0.00019377521321470805, + "loss": 0.983, + "step": 2554 + }, + { + "epoch": 0.4549501424501424, + "grad_norm": 0.47432273626327515, + "learning_rate": 0.00019377035090044787, + "loss": 1.0169, + "step": 2555 + }, + { + "epoch": 0.4551282051282051, + "grad_norm": 0.4929196834564209, + "learning_rate": 0.00019376548674895246, + "loss": 1.0182, + "step": 2556 + }, + { + "epoch": 0.4553062678062678, + "grad_norm": 0.5433184504508972, + "learning_rate": 0.00019376062076031708, + "loss": 1.1339, + "step": 2557 + }, + { + 
"epoch": 0.45548433048433046, + "grad_norm": 0.47430408000946045, + "learning_rate": 0.00019375575293463715, + "loss": 1.1589, + "step": 2558 + }, + { + "epoch": 0.45566239316239315, + "grad_norm": 0.46641045808792114, + "learning_rate": 0.000193750883272008, + "loss": 1.029, + "step": 2559 + }, + { + "epoch": 0.45584045584045585, + "grad_norm": 0.44476228952407837, + "learning_rate": 0.00019374601177252502, + "loss": 0.8494, + "step": 2560 + }, + { + "epoch": 0.45601851851851855, + "grad_norm": 0.4886183440685272, + "learning_rate": 0.00019374113843628366, + "loss": 1.1374, + "step": 2561 + }, + { + "epoch": 0.4561965811965812, + "grad_norm": 0.4786703288555145, + "learning_rate": 0.00019373626326337946, + "loss": 1.2861, + "step": 2562 + }, + { + "epoch": 0.4563746438746439, + "grad_norm": 0.5752716660499573, + "learning_rate": 0.0001937313862539079, + "loss": 1.2365, + "step": 2563 + }, + { + "epoch": 0.4565527065527066, + "grad_norm": 0.519176185131073, + "learning_rate": 0.00019372650740796452, + "loss": 1.2264, + "step": 2564 + }, + { + "epoch": 0.4567307692307692, + "grad_norm": 0.5927292704582214, + "learning_rate": 0.00019372162672564493, + "loss": 0.8979, + "step": 2565 + }, + { + "epoch": 0.4569088319088319, + "grad_norm": 0.5467435121536255, + "learning_rate": 0.00019371674420704478, + "loss": 1.1016, + "step": 2566 + }, + { + "epoch": 0.4570868945868946, + "grad_norm": 0.49593284726142883, + "learning_rate": 0.00019371185985225968, + "loss": 0.982, + "step": 2567 + }, + { + "epoch": 0.45726495726495725, + "grad_norm": 0.5696587562561035, + "learning_rate": 0.00019370697366138538, + "loss": 0.979, + "step": 2568 + }, + { + "epoch": 0.45744301994301995, + "grad_norm": 0.4455752968788147, + "learning_rate": 0.00019370208563451757, + "loss": 0.8832, + "step": 2569 + }, + { + "epoch": 0.45762108262108264, + "grad_norm": 0.5072923302650452, + "learning_rate": 0.00019369719577175203, + "loss": 1.1046, + "step": 2570 + }, + { + "epoch": 0.4577991452991453, + 
"grad_norm": 0.45119982957839966, + "learning_rate": 0.0001936923040731846, + "loss": 1.0083, + "step": 2571 + }, + { + "epoch": 0.457977207977208, + "grad_norm": 0.5062251091003418, + "learning_rate": 0.00019368741053891108, + "loss": 1.2771, + "step": 2572 + }, + { + "epoch": 0.4581552706552707, + "grad_norm": 0.5511104464530945, + "learning_rate": 0.0001936825151690274, + "loss": 1.0039, + "step": 2573 + }, + { + "epoch": 0.4583333333333333, + "grad_norm": 0.4721006453037262, + "learning_rate": 0.0001936776179636294, + "loss": 1.3246, + "step": 2574 + }, + { + "epoch": 0.458511396011396, + "grad_norm": 0.5021488666534424, + "learning_rate": 0.0001936727189228131, + "loss": 1.1733, + "step": 2575 + }, + { + "epoch": 0.4586894586894587, + "grad_norm": 0.5755292177200317, + "learning_rate": 0.0001936678180466745, + "loss": 1.2241, + "step": 2576 + }, + { + "epoch": 0.45886752136752135, + "grad_norm": 0.4501610994338989, + "learning_rate": 0.00019366291533530952, + "loss": 1.0503, + "step": 2577 + }, + { + "epoch": 0.45904558404558404, + "grad_norm": 0.4067458212375641, + "learning_rate": 0.00019365801078881432, + "loss": 0.8259, + "step": 2578 + }, + { + "epoch": 0.45922364672364674, + "grad_norm": 0.539730429649353, + "learning_rate": 0.0001936531044072849, + "loss": 1.1964, + "step": 2579 + }, + { + "epoch": 0.4594017094017094, + "grad_norm": 0.5624797344207764, + "learning_rate": 0.0001936481961908175, + "loss": 1.2059, + "step": 2580 + }, + { + "epoch": 0.4595797720797721, + "grad_norm": 0.43679240345954895, + "learning_rate": 0.00019364328613950824, + "loss": 1.1371, + "step": 2581 + }, + { + "epoch": 0.45975783475783477, + "grad_norm": 0.5214769244194031, + "learning_rate": 0.00019363837425345328, + "loss": 1.109, + "step": 2582 + }, + { + "epoch": 0.4599358974358974, + "grad_norm": 0.4522894024848938, + "learning_rate": 0.00019363346053274892, + "loss": 1.0532, + "step": 2583 + }, + { + "epoch": 0.4601139601139601, + "grad_norm": 0.44980281591415405, + 
"learning_rate": 0.0001936285449774914, + "loss": 0.9352, + "step": 2584 + }, + { + "epoch": 0.4602920227920228, + "grad_norm": 0.5697414875030518, + "learning_rate": 0.00019362362758777705, + "loss": 1.2171, + "step": 2585 + }, + { + "epoch": 0.46047008547008544, + "grad_norm": 0.4636315107345581, + "learning_rate": 0.00019361870836370217, + "loss": 1.0662, + "step": 2586 + }, + { + "epoch": 0.46064814814814814, + "grad_norm": 0.5144017338752747, + "learning_rate": 0.00019361378730536321, + "loss": 1.0681, + "step": 2587 + }, + { + "epoch": 0.46082621082621084, + "grad_norm": 0.5007636547088623, + "learning_rate": 0.00019360886441285654, + "loss": 1.2058, + "step": 2588 + }, + { + "epoch": 0.46100427350427353, + "grad_norm": 0.5024117231369019, + "learning_rate": 0.00019360393968627864, + "loss": 1.065, + "step": 2589 + }, + { + "epoch": 0.46118233618233617, + "grad_norm": 0.48105588555336, + "learning_rate": 0.00019359901312572596, + "loss": 1.0887, + "step": 2590 + }, + { + "epoch": 0.46136039886039887, + "grad_norm": 0.5381982326507568, + "learning_rate": 0.00019359408473129506, + "loss": 1.2754, + "step": 2591 + }, + { + "epoch": 0.46153846153846156, + "grad_norm": 0.5051333904266357, + "learning_rate": 0.0001935891545030825, + "loss": 0.9334, + "step": 2592 + }, + { + "epoch": 0.4617165242165242, + "grad_norm": 0.43818601965904236, + "learning_rate": 0.0001935842224411849, + "loss": 1.0967, + "step": 2593 + }, + { + "epoch": 0.4618945868945869, + "grad_norm": 0.4727257490158081, + "learning_rate": 0.0001935792885456988, + "loss": 0.8136, + "step": 2594 + }, + { + "epoch": 0.4620726495726496, + "grad_norm": 0.5505291223526001, + "learning_rate": 0.00019357435281672098, + "loss": 1.3113, + "step": 2595 + }, + { + "epoch": 0.46225071225071224, + "grad_norm": 0.4705682396888733, + "learning_rate": 0.0001935694152543481, + "loss": 0.9863, + "step": 2596 + }, + { + "epoch": 0.46242877492877493, + "grad_norm": 0.49653419852256775, + "learning_rate": 
0.0001935644758586769, + "loss": 1.035, + "step": 2597 + }, + { + "epoch": 0.46260683760683763, + "grad_norm": 0.4788367748260498, + "learning_rate": 0.00019355953462980415, + "loss": 1.1253, + "step": 2598 + }, + { + "epoch": 0.46278490028490027, + "grad_norm": 0.5295125842094421, + "learning_rate": 0.00019355459156782668, + "loss": 1.0853, + "step": 2599 + }, + { + "epoch": 0.46296296296296297, + "grad_norm": 0.4878056049346924, + "learning_rate": 0.00019354964667284133, + "loss": 1.1381, + "step": 2600 + }, + { + "epoch": 0.46314102564102566, + "grad_norm": 0.5442031025886536, + "learning_rate": 0.00019354469994494497, + "loss": 1.1349, + "step": 2601 + }, + { + "epoch": 0.4633190883190883, + "grad_norm": 0.4845225214958191, + "learning_rate": 0.00019353975138423457, + "loss": 1.0538, + "step": 2602 + }, + { + "epoch": 0.463497150997151, + "grad_norm": 0.4957871437072754, + "learning_rate": 0.00019353480099080703, + "loss": 1.2765, + "step": 2603 + }, + { + "epoch": 0.4636752136752137, + "grad_norm": 0.5414339303970337, + "learning_rate": 0.00019352984876475936, + "loss": 1.1015, + "step": 2604 + }, + { + "epoch": 0.46385327635327633, + "grad_norm": 0.5171043872833252, + "learning_rate": 0.0001935248947061886, + "loss": 0.9995, + "step": 2605 + }, + { + "epoch": 0.46403133903133903, + "grad_norm": 0.46040529012680054, + "learning_rate": 0.0001935199388151918, + "loss": 1.1126, + "step": 2606 + }, + { + "epoch": 0.4642094017094017, + "grad_norm": 0.5327033400535583, + "learning_rate": 0.00019351498109186613, + "loss": 1.1983, + "step": 2607 + }, + { + "epoch": 0.46438746438746437, + "grad_norm": 0.4451361298561096, + "learning_rate": 0.0001935100215363086, + "loss": 0.9689, + "step": 2608 + }, + { + "epoch": 0.46456552706552706, + "grad_norm": 0.5462809801101685, + "learning_rate": 0.00019350506014861646, + "loss": 1.036, + "step": 2609 + }, + { + "epoch": 0.46474358974358976, + "grad_norm": 0.4907000958919525, + "learning_rate": 0.00019350009692888694, + "loss": 
1.0724, + "step": 2610 + }, + { + "epoch": 0.4649216524216524, + "grad_norm": 0.47523510456085205, + "learning_rate": 0.00019349513187721723, + "loss": 0.9214, + "step": 2611 + }, + { + "epoch": 0.4650997150997151, + "grad_norm": 0.539732813835144, + "learning_rate": 0.0001934901649937046, + "loss": 1.1166, + "step": 2612 + }, + { + "epoch": 0.4652777777777778, + "grad_norm": 0.4827860891819, + "learning_rate": 0.00019348519627844643, + "loss": 1.1613, + "step": 2613 + }, + { + "epoch": 0.46545584045584043, + "grad_norm": 0.5385223031044006, + "learning_rate": 0.00019348022573154, + "loss": 1.0105, + "step": 2614 + }, + { + "epoch": 0.4656339031339031, + "grad_norm": 0.4629383087158203, + "learning_rate": 0.0001934752533530828, + "loss": 1.0298, + "step": 2615 + }, + { + "epoch": 0.4658119658119658, + "grad_norm": 0.599371075630188, + "learning_rate": 0.00019347027914317212, + "loss": 1.3158, + "step": 2616 + }, + { + "epoch": 0.46599002849002846, + "grad_norm": 0.5954698324203491, + "learning_rate": 0.00019346530310190553, + "loss": 1.1882, + "step": 2617 + }, + { + "epoch": 0.46616809116809116, + "grad_norm": 0.49185171723365784, + "learning_rate": 0.00019346032522938046, + "loss": 1.0977, + "step": 2618 + }, + { + "epoch": 0.46634615384615385, + "grad_norm": 0.5145422220230103, + "learning_rate": 0.0001934553455256945, + "loss": 0.9948, + "step": 2619 + }, + { + "epoch": 0.46652421652421655, + "grad_norm": 0.6809412837028503, + "learning_rate": 0.00019345036399094517, + "loss": 1.5798, + "step": 2620 + }, + { + "epoch": 0.4667022792022792, + "grad_norm": 0.4606841206550598, + "learning_rate": 0.00019344538062523005, + "loss": 0.7357, + "step": 2621 + }, + { + "epoch": 0.4668803418803419, + "grad_norm": 0.49036628007888794, + "learning_rate": 0.00019344039542864685, + "loss": 1.1518, + "step": 2622 + }, + { + "epoch": 0.4670584045584046, + "grad_norm": 0.47904539108276367, + "learning_rate": 0.0001934354084012932, + "loss": 0.9929, + "step": 2623 + }, + { + 
"epoch": 0.4672364672364672, + "grad_norm": 0.5224666595458984, + "learning_rate": 0.0001934304195432668, + "loss": 1.2544, + "step": 2624 + }, + { + "epoch": 0.4674145299145299, + "grad_norm": 0.4902483820915222, + "learning_rate": 0.00019342542885466543, + "loss": 1.0301, + "step": 2625 + }, + { + "epoch": 0.4675925925925926, + "grad_norm": 0.46824702620506287, + "learning_rate": 0.00019342043633558683, + "loss": 0.9364, + "step": 2626 + }, + { + "epoch": 0.46777065527065526, + "grad_norm": 0.46272051334381104, + "learning_rate": 0.00019341544198612888, + "loss": 1.056, + "step": 2627 + }, + { + "epoch": 0.46794871794871795, + "grad_norm": 0.6216606497764587, + "learning_rate": 0.0001934104458063894, + "loss": 1.0825, + "step": 2628 + }, + { + "epoch": 0.46812678062678065, + "grad_norm": 0.5024014115333557, + "learning_rate": 0.00019340544779646623, + "loss": 1.1832, + "step": 2629 + }, + { + "epoch": 0.4683048433048433, + "grad_norm": 0.5547130107879639, + "learning_rate": 0.00019340044795645737, + "loss": 1.1335, + "step": 2630 + }, + { + "epoch": 0.468482905982906, + "grad_norm": 0.5439161658287048, + "learning_rate": 0.0001933954462864608, + "loss": 1.0229, + "step": 2631 + }, + { + "epoch": 0.4686609686609687, + "grad_norm": 0.4782990515232086, + "learning_rate": 0.0001933904427865744, + "loss": 1.2318, + "step": 2632 + }, + { + "epoch": 0.4688390313390313, + "grad_norm": 0.5872140526771545, + "learning_rate": 0.00019338543745689633, + "loss": 1.0132, + "step": 2633 + }, + { + "epoch": 0.469017094017094, + "grad_norm": 0.44163307547569275, + "learning_rate": 0.00019338043029752458, + "loss": 1.0091, + "step": 2634 + }, + { + "epoch": 0.4691951566951567, + "grad_norm": 0.541081428527832, + "learning_rate": 0.0001933754213085573, + "loss": 1.2155, + "step": 2635 + }, + { + "epoch": 0.46937321937321935, + "grad_norm": 0.4761527478694916, + "learning_rate": 0.00019337041049009255, + "loss": 1.1138, + "step": 2636 + }, + { + "epoch": 0.46955128205128205, + 
"grad_norm": 0.46414369344711304, + "learning_rate": 0.0001933653978422286, + "loss": 0.9903, + "step": 2637 + }, + { + "epoch": 0.46972934472934474, + "grad_norm": 0.5337086915969849, + "learning_rate": 0.00019336038336506363, + "loss": 1.2873, + "step": 2638 + }, + { + "epoch": 0.4699074074074074, + "grad_norm": 0.5065379738807678, + "learning_rate": 0.00019335536705869592, + "loss": 1.1436, + "step": 2639 + }, + { + "epoch": 0.4700854700854701, + "grad_norm": 0.5539217591285706, + "learning_rate": 0.0001933503489232237, + "loss": 1.2881, + "step": 2640 + }, + { + "epoch": 0.4702635327635328, + "grad_norm": 0.48303213715553284, + "learning_rate": 0.0001933453289587453, + "loss": 1.0209, + "step": 2641 + }, + { + "epoch": 0.4704415954415954, + "grad_norm": 0.6986871957778931, + "learning_rate": 0.00019334030716535908, + "loss": 1.1979, + "step": 2642 + }, + { + "epoch": 0.4706196581196581, + "grad_norm": 0.46137234568595886, + "learning_rate": 0.00019333528354316347, + "loss": 1.0682, + "step": 2643 + }, + { + "epoch": 0.4707977207977208, + "grad_norm": 0.4726654291152954, + "learning_rate": 0.00019333025809225684, + "loss": 1.1712, + "step": 2644 + }, + { + "epoch": 0.47097578347578345, + "grad_norm": 0.46188637614250183, + "learning_rate": 0.0001933252308127377, + "loss": 1.0183, + "step": 2645 + }, + { + "epoch": 0.47115384615384615, + "grad_norm": 0.5323259830474854, + "learning_rate": 0.0001933202017047045, + "loss": 0.935, + "step": 2646 + }, + { + "epoch": 0.47133190883190884, + "grad_norm": 0.5004189014434814, + "learning_rate": 0.00019331517076825582, + "loss": 1.1331, + "step": 2647 + }, + { + "epoch": 0.47150997150997154, + "grad_norm": 0.5443634986877441, + "learning_rate": 0.0001933101380034902, + "loss": 1.0514, + "step": 2648 + }, + { + "epoch": 0.4716880341880342, + "grad_norm": 0.504180371761322, + "learning_rate": 0.0001933051034105063, + "loss": 1.3099, + "step": 2649 + }, + { + "epoch": 0.4718660968660969, + "grad_norm": 0.5092344284057617, + 
"learning_rate": 0.0001933000669894027, + "loss": 1.0716, + "step": 2650 + }, + { + "epoch": 0.47204415954415957, + "grad_norm": 0.5236422419548035, + "learning_rate": 0.0001932950287402781, + "loss": 1.0981, + "step": 2651 + }, + { + "epoch": 0.4722222222222222, + "grad_norm": 0.6228063702583313, + "learning_rate": 0.0001932899886632312, + "loss": 1.3398, + "step": 2652 + }, + { + "epoch": 0.4724002849002849, + "grad_norm": 0.5112748146057129, + "learning_rate": 0.00019328494675836078, + "loss": 1.0151, + "step": 2653 + }, + { + "epoch": 0.4725783475783476, + "grad_norm": 0.5554201602935791, + "learning_rate": 0.00019327990302576563, + "loss": 1.404, + "step": 2654 + }, + { + "epoch": 0.47275641025641024, + "grad_norm": 0.5050725340843201, + "learning_rate": 0.0001932748574655445, + "loss": 0.951, + "step": 2655 + }, + { + "epoch": 0.47293447293447294, + "grad_norm": 0.5161749720573425, + "learning_rate": 0.00019326981007779636, + "loss": 1.2425, + "step": 2656 + }, + { + "epoch": 0.47311253561253563, + "grad_norm": 0.4865442216396332, + "learning_rate": 0.00019326476086262002, + "loss": 1.1175, + "step": 2657 + }, + { + "epoch": 0.4732905982905983, + "grad_norm": 0.5276186466217041, + "learning_rate": 0.0001932597098201144, + "loss": 1.3687, + "step": 2658 + }, + { + "epoch": 0.47346866096866097, + "grad_norm": 0.509139358997345, + "learning_rate": 0.00019325465695037855, + "loss": 1.0546, + "step": 2659 + }, + { + "epoch": 0.47364672364672367, + "grad_norm": 0.49815434217453003, + "learning_rate": 0.00019324960225351138, + "loss": 1.0807, + "step": 2660 + }, + { + "epoch": 0.4738247863247863, + "grad_norm": 0.5059618353843689, + "learning_rate": 0.00019324454572961197, + "loss": 1.0827, + "step": 2661 + }, + { + "epoch": 0.474002849002849, + "grad_norm": 0.5698565244674683, + "learning_rate": 0.00019323948737877942, + "loss": 1.2019, + "step": 2662 + }, + { + "epoch": 0.4741809116809117, + "grad_norm": 0.49661511182785034, + "learning_rate": 
0.00019323442720111276, + "loss": 1.1447, + "step": 2663 + }, + { + "epoch": 0.47435897435897434, + "grad_norm": 0.46442747116088867, + "learning_rate": 0.0001932293651967112, + "loss": 0.8796, + "step": 2664 + }, + { + "epoch": 0.47453703703703703, + "grad_norm": 0.48306044936180115, + "learning_rate": 0.00019322430136567388, + "loss": 1.1358, + "step": 2665 + }, + { + "epoch": 0.47471509971509973, + "grad_norm": 0.5677350759506226, + "learning_rate": 0.00019321923570810005, + "loss": 1.1026, + "step": 2666 + }, + { + "epoch": 0.47489316239316237, + "grad_norm": 0.3700144588947296, + "learning_rate": 0.0001932141682240889, + "loss": 0.7514, + "step": 2667 + }, + { + "epoch": 0.47507122507122507, + "grad_norm": 0.6003054976463318, + "learning_rate": 0.0001932090989137398, + "loss": 1.1591, + "step": 2668 + }, + { + "epoch": 0.47524928774928776, + "grad_norm": 0.520298421382904, + "learning_rate": 0.00019320402777715204, + "loss": 1.339, + "step": 2669 + }, + { + "epoch": 0.4754273504273504, + "grad_norm": 0.46453598141670227, + "learning_rate": 0.00019319895481442493, + "loss": 0.9879, + "step": 2670 + }, + { + "epoch": 0.4756054131054131, + "grad_norm": 0.5247363448143005, + "learning_rate": 0.00019319388002565793, + "loss": 0.9862, + "step": 2671 + }, + { + "epoch": 0.4757834757834758, + "grad_norm": 0.5498613715171814, + "learning_rate": 0.00019318880341095046, + "loss": 1.2224, + "step": 2672 + }, + { + "epoch": 0.47596153846153844, + "grad_norm": 0.565838098526001, + "learning_rate": 0.00019318372497040192, + "loss": 1.0712, + "step": 2673 + }, + { + "epoch": 0.47613960113960113, + "grad_norm": 0.5797489881515503, + "learning_rate": 0.00019317864470411191, + "loss": 1.0176, + "step": 2674 + }, + { + "epoch": 0.47631766381766383, + "grad_norm": 0.5114326477050781, + "learning_rate": 0.0001931735626121799, + "loss": 1.1027, + "step": 2675 + }, + { + "epoch": 0.47649572649572647, + "grad_norm": 0.5396515727043152, + "learning_rate": 0.00019316847869470547, + 
"loss": 1.1782, + "step": 2676 + }, + { + "epoch": 0.47667378917378916, + "grad_norm": 0.4812076985836029, + "learning_rate": 0.00019316339295178824, + "loss": 1.1196, + "step": 2677 + }, + { + "epoch": 0.47685185185185186, + "grad_norm": 0.4875647723674774, + "learning_rate": 0.00019315830538352787, + "loss": 1.1407, + "step": 2678 + }, + { + "epoch": 0.47702991452991456, + "grad_norm": 0.5036377906799316, + "learning_rate": 0.00019315321599002404, + "loss": 0.9842, + "step": 2679 + }, + { + "epoch": 0.4772079772079772, + "grad_norm": 0.5054177641868591, + "learning_rate": 0.00019314812477137645, + "loss": 0.8196, + "step": 2680 + }, + { + "epoch": 0.4773860398860399, + "grad_norm": 0.5050665736198425, + "learning_rate": 0.00019314303172768483, + "loss": 0.8463, + "step": 2681 + }, + { + "epoch": 0.4775641025641026, + "grad_norm": 0.5179004669189453, + "learning_rate": 0.000193137936859049, + "loss": 1.2485, + "step": 2682 + }, + { + "epoch": 0.47774216524216523, + "grad_norm": 0.44986143708229065, + "learning_rate": 0.00019313284016556876, + "loss": 0.9855, + "step": 2683 + }, + { + "epoch": 0.4779202279202279, + "grad_norm": 0.5594347715377808, + "learning_rate": 0.00019312774164734398, + "loss": 1.0987, + "step": 2684 + }, + { + "epoch": 0.4780982905982906, + "grad_norm": 0.4837244749069214, + "learning_rate": 0.0001931226413044746, + "loss": 1.1119, + "step": 2685 + }, + { + "epoch": 0.47827635327635326, + "grad_norm": 0.489145427942276, + "learning_rate": 0.0001931175391370605, + "loss": 1.1962, + "step": 2686 + }, + { + "epoch": 0.47845441595441596, + "grad_norm": 0.503568708896637, + "learning_rate": 0.00019311243514520164, + "loss": 0.9668, + "step": 2687 + }, + { + "epoch": 0.47863247863247865, + "grad_norm": 0.5401005744934082, + "learning_rate": 0.00019310732932899805, + "loss": 1.3072, + "step": 2688 + }, + { + "epoch": 0.4788105413105413, + "grad_norm": 0.526523768901825, + "learning_rate": 0.00019310222168854971, + "loss": 1.1387, + "step": 2689 + }, 
+ { + "epoch": 0.478988603988604, + "grad_norm": 0.5223183631896973, + "learning_rate": 0.00019309711222395678, + "loss": 1.1391, + "step": 2690 + }, + { + "epoch": 0.4791666666666667, + "grad_norm": 0.5840879082679749, + "learning_rate": 0.00019309200093531933, + "loss": 1.1543, + "step": 2691 + }, + { + "epoch": 0.4793447293447293, + "grad_norm": 0.5173699259757996, + "learning_rate": 0.00019308688782273753, + "loss": 1.1889, + "step": 2692 + }, + { + "epoch": 0.479522792022792, + "grad_norm": 0.5417894124984741, + "learning_rate": 0.00019308177288631146, + "loss": 1.299, + "step": 2693 + }, + { + "epoch": 0.4797008547008547, + "grad_norm": 0.4890797734260559, + "learning_rate": 0.0001930766561261415, + "loss": 1.1516, + "step": 2694 + }, + { + "epoch": 0.47987891737891736, + "grad_norm": 0.5422119498252869, + "learning_rate": 0.00019307153754232772, + "loss": 1.0301, + "step": 2695 + }, + { + "epoch": 0.48005698005698005, + "grad_norm": 0.5838702917098999, + "learning_rate": 0.00019306641713497057, + "loss": 1.265, + "step": 2696 + }, + { + "epoch": 0.48023504273504275, + "grad_norm": 0.5020943284034729, + "learning_rate": 0.00019306129490417027, + "loss": 1.1119, + "step": 2697 + }, + { + "epoch": 0.4804131054131054, + "grad_norm": 0.412993460893631, + "learning_rate": 0.00019305617085002723, + "loss": 0.8083, + "step": 2698 + }, + { + "epoch": 0.4805911680911681, + "grad_norm": 0.6270101070404053, + "learning_rate": 0.00019305104497264184, + "loss": 1.3355, + "step": 2699 + }, + { + "epoch": 0.4807692307692308, + "grad_norm": 0.45256730914115906, + "learning_rate": 0.0001930459172721145, + "loss": 1.0368, + "step": 2700 + }, + { + "epoch": 0.4809472934472934, + "grad_norm": 0.5351749658584595, + "learning_rate": 0.0001930407877485457, + "loss": 1.135, + "step": 2701 + }, + { + "epoch": 0.4811253561253561, + "grad_norm": 0.49324163794517517, + "learning_rate": 0.00019303565640203593, + "loss": 0.9383, + "step": 2702 + }, + { + "epoch": 0.4813034188034188, + 
"grad_norm": 0.5434361100196838, + "learning_rate": 0.00019303052323268576, + "loss": 1.2605, + "step": 2703 + }, + { + "epoch": 0.48148148148148145, + "grad_norm": 0.5858064889907837, + "learning_rate": 0.00019302538824059572, + "loss": 1.0846, + "step": 2704 + }, + { + "epoch": 0.48165954415954415, + "grad_norm": 0.5753700733184814, + "learning_rate": 0.00019302025142586647, + "loss": 1.0371, + "step": 2705 + }, + { + "epoch": 0.48183760683760685, + "grad_norm": 0.43102699518203735, + "learning_rate": 0.00019301511278859858, + "loss": 0.9189, + "step": 2706 + }, + { + "epoch": 0.48201566951566954, + "grad_norm": 0.4731025993824005, + "learning_rate": 0.0001930099723288928, + "loss": 1.1291, + "step": 2707 + }, + { + "epoch": 0.4821937321937322, + "grad_norm": 0.5685615539550781, + "learning_rate": 0.00019300483004684987, + "loss": 1.1006, + "step": 2708 + }, + { + "epoch": 0.4823717948717949, + "grad_norm": 0.4368155896663666, + "learning_rate": 0.00019299968594257044, + "loss": 0.9959, + "step": 2709 + }, + { + "epoch": 0.4825498575498576, + "grad_norm": 0.5594738125801086, + "learning_rate": 0.00019299454001615537, + "loss": 1.0826, + "step": 2710 + }, + { + "epoch": 0.4827279202279202, + "grad_norm": 0.48876598477363586, + "learning_rate": 0.00019298939226770548, + "loss": 1.1556, + "step": 2711 + }, + { + "epoch": 0.4829059829059829, + "grad_norm": 0.548039436340332, + "learning_rate": 0.00019298424269732157, + "loss": 1.158, + "step": 2712 + }, + { + "epoch": 0.4830840455840456, + "grad_norm": 0.4957645535469055, + "learning_rate": 0.00019297909130510464, + "loss": 0.9824, + "step": 2713 + }, + { + "epoch": 0.48326210826210825, + "grad_norm": 0.5197011232376099, + "learning_rate": 0.00019297393809115555, + "loss": 1.1074, + "step": 2714 + }, + { + "epoch": 0.48344017094017094, + "grad_norm": 0.5742064118385315, + "learning_rate": 0.00019296878305557526, + "loss": 1.0431, + "step": 2715 + }, + { + "epoch": 0.48361823361823364, + "grad_norm": 
0.5698413252830505, + "learning_rate": 0.0001929636261984648, + "loss": 1.0713, + "step": 2716 + }, + { + "epoch": 0.4837962962962963, + "grad_norm": 0.48126333951950073, + "learning_rate": 0.0001929584675199252, + "loss": 0.9274, + "step": 2717 + }, + { + "epoch": 0.483974358974359, + "grad_norm": 0.49299830198287964, + "learning_rate": 0.00019295330702005754, + "loss": 0.9392, + "step": 2718 + }, + { + "epoch": 0.48415242165242167, + "grad_norm": 0.4780774414539337, + "learning_rate": 0.0001929481446989629, + "loss": 1.1459, + "step": 2719 + }, + { + "epoch": 0.4843304843304843, + "grad_norm": 0.5462654829025269, + "learning_rate": 0.00019294298055674248, + "loss": 1.0635, + "step": 2720 + }, + { + "epoch": 0.484508547008547, + "grad_norm": 0.5371061563491821, + "learning_rate": 0.00019293781459349743, + "loss": 1.3578, + "step": 2721 + }, + { + "epoch": 0.4846866096866097, + "grad_norm": 0.46308520436286926, + "learning_rate": 0.00019293264680932893, + "loss": 0.9001, + "step": 2722 + }, + { + "epoch": 0.48486467236467234, + "grad_norm": 0.5149807929992676, + "learning_rate": 0.0001929274772043383, + "loss": 0.6908, + "step": 2723 + }, + { + "epoch": 0.48504273504273504, + "grad_norm": 0.5435031056404114, + "learning_rate": 0.00019292230577862678, + "loss": 1.2143, + "step": 2724 + }, + { + "epoch": 0.48522079772079774, + "grad_norm": 0.44217726588249207, + "learning_rate": 0.00019291713253229568, + "loss": 0.9303, + "step": 2725 + }, + { + "epoch": 0.4853988603988604, + "grad_norm": 0.6120226383209229, + "learning_rate": 0.00019291195746544643, + "loss": 1.3801, + "step": 2726 + }, + { + "epoch": 0.4855769230769231, + "grad_norm": 0.5014316439628601, + "learning_rate": 0.00019290678057818037, + "loss": 1.0631, + "step": 2727 + }, + { + "epoch": 0.48575498575498577, + "grad_norm": 0.5667829513549805, + "learning_rate": 0.00019290160187059895, + "loss": 1.3166, + "step": 2728 + }, + { + "epoch": 0.4859330484330484, + "grad_norm": 0.5011509656906128, + 
"learning_rate": 0.0001928964213428036, + "loss": 1.1887, + "step": 2729 + }, + { + "epoch": 0.4861111111111111, + "grad_norm": 0.48317405581474304, + "learning_rate": 0.00019289123899489586, + "loss": 1.1125, + "step": 2730 + }, + { + "epoch": 0.4862891737891738, + "grad_norm": 0.4669005870819092, + "learning_rate": 0.00019288605482697726, + "loss": 1.0091, + "step": 2731 + }, + { + "epoch": 0.48646723646723644, + "grad_norm": 0.4330739974975586, + "learning_rate": 0.00019288086883914937, + "loss": 0.9789, + "step": 2732 + }, + { + "epoch": 0.48664529914529914, + "grad_norm": 0.48482781648635864, + "learning_rate": 0.0001928756810315138, + "loss": 1.1922, + "step": 2733 + }, + { + "epoch": 0.48682336182336183, + "grad_norm": 0.5781838297843933, + "learning_rate": 0.0001928704914041722, + "loss": 1.1793, + "step": 2734 + }, + { + "epoch": 0.48700142450142453, + "grad_norm": 0.5955413579940796, + "learning_rate": 0.00019286529995722623, + "loss": 1.1001, + "step": 2735 + }, + { + "epoch": 0.48717948717948717, + "grad_norm": 0.49204322695732117, + "learning_rate": 0.00019286010669077763, + "loss": 0.9219, + "step": 2736 + }, + { + "epoch": 0.48735754985754987, + "grad_norm": 0.5853500962257385, + "learning_rate": 0.00019285491160492813, + "loss": 1.1133, + "step": 2737 + }, + { + "epoch": 0.48753561253561256, + "grad_norm": 0.5555846095085144, + "learning_rate": 0.0001928497146997795, + "loss": 1.0915, + "step": 2738 + }, + { + "epoch": 0.4877136752136752, + "grad_norm": 0.5166759490966797, + "learning_rate": 0.00019284451597543364, + "loss": 0.9349, + "step": 2739 + }, + { + "epoch": 0.4878917378917379, + "grad_norm": 0.47816506028175354, + "learning_rate": 0.00019283931543199234, + "loss": 0.8978, + "step": 2740 + }, + { + "epoch": 0.4880698005698006, + "grad_norm": 0.5632442831993103, + "learning_rate": 0.0001928341130695575, + "loss": 1.0491, + "step": 2741 + }, + { + "epoch": 0.48824786324786323, + "grad_norm": 0.6532769799232483, + "learning_rate": 
0.00019282890888823107, + "loss": 1.2779, + "step": 2742 + }, + { + "epoch": 0.48842592592592593, + "grad_norm": 0.5733640789985657, + "learning_rate": 0.000192823702888115, + "loss": 1.4127, + "step": 2743 + }, + { + "epoch": 0.4886039886039886, + "grad_norm": 0.5701746344566345, + "learning_rate": 0.00019281849506931132, + "loss": 1.138, + "step": 2744 + }, + { + "epoch": 0.48878205128205127, + "grad_norm": 0.5227449536323547, + "learning_rate": 0.000192813285431922, + "loss": 1.1831, + "step": 2745 + }, + { + "epoch": 0.48896011396011396, + "grad_norm": 0.48457080125808716, + "learning_rate": 0.00019280807397604915, + "loss": 1.2468, + "step": 2746 + }, + { + "epoch": 0.48913817663817666, + "grad_norm": 0.4596176743507385, + "learning_rate": 0.0001928028607017949, + "loss": 1.1098, + "step": 2747 + }, + { + "epoch": 0.4893162393162393, + "grad_norm": 0.5204966068267822, + "learning_rate": 0.00019279764560926142, + "loss": 1.1501, + "step": 2748 + }, + { + "epoch": 0.489494301994302, + "grad_norm": 0.5179490447044373, + "learning_rate": 0.0001927924286985508, + "loss": 1.2601, + "step": 2749 + }, + { + "epoch": 0.4896723646723647, + "grad_norm": 0.4563423693180084, + "learning_rate": 0.00019278720996976533, + "loss": 1.081, + "step": 2750 + }, + { + "epoch": 0.48985042735042733, + "grad_norm": 0.4906339943408966, + "learning_rate": 0.00019278198942300717, + "loss": 1.157, + "step": 2751 + }, + { + "epoch": 0.49002849002849, + "grad_norm": 0.42241403460502625, + "learning_rate": 0.00019277676705837873, + "loss": 1.0333, + "step": 2752 + }, + { + "epoch": 0.4902065527065527, + "grad_norm": 0.6310175657272339, + "learning_rate": 0.00019277154287598226, + "loss": 1.1225, + "step": 2753 + }, + { + "epoch": 0.49038461538461536, + "grad_norm": 0.5109034776687622, + "learning_rate": 0.0001927663168759201, + "loss": 1.1619, + "step": 2754 + }, + { + "epoch": 0.49056267806267806, + "grad_norm": 0.4809598922729492, + "learning_rate": 0.00019276108905829465, + "loss": 
1.0423, + "step": 2755 + }, + { + "epoch": 0.49074074074074076, + "grad_norm": 0.557502806186676, + "learning_rate": 0.00019275585942320837, + "loss": 0.8783, + "step": 2756 + }, + { + "epoch": 0.4909188034188034, + "grad_norm": 0.5434393882751465, + "learning_rate": 0.0001927506279707637, + "loss": 1.1701, + "step": 2757 + }, + { + "epoch": 0.4910968660968661, + "grad_norm": 0.49278944730758667, + "learning_rate": 0.00019274539470106317, + "loss": 1.0447, + "step": 2758 + }, + { + "epoch": 0.4912749287749288, + "grad_norm": 0.5634264349937439, + "learning_rate": 0.00019274015961420927, + "loss": 1.0639, + "step": 2759 + }, + { + "epoch": 0.49145299145299143, + "grad_norm": 0.5632645487785339, + "learning_rate": 0.00019273492271030464, + "loss": 0.9223, + "step": 2760 + }, + { + "epoch": 0.4916310541310541, + "grad_norm": 0.5949172377586365, + "learning_rate": 0.00019272968398945177, + "loss": 0.894, + "step": 2761 + }, + { + "epoch": 0.4918091168091168, + "grad_norm": 0.5375374555587769, + "learning_rate": 0.00019272444345175342, + "loss": 1.0311, + "step": 2762 + }, + { + "epoch": 0.49198717948717946, + "grad_norm": 0.5211305022239685, + "learning_rate": 0.00019271920109731222, + "loss": 1.1531, + "step": 2763 + }, + { + "epoch": 0.49216524216524216, + "grad_norm": 0.44022253155708313, + "learning_rate": 0.00019271395692623084, + "loss": 0.9147, + "step": 2764 + }, + { + "epoch": 0.49234330484330485, + "grad_norm": 0.4682174623012543, + "learning_rate": 0.0001927087109386121, + "loss": 1.081, + "step": 2765 + }, + { + "epoch": 0.49252136752136755, + "grad_norm": 0.4971517324447632, + "learning_rate": 0.0001927034631345588, + "loss": 1.1017, + "step": 2766 + }, + { + "epoch": 0.4926994301994302, + "grad_norm": 0.5015294551849365, + "learning_rate": 0.00019269821351417364, + "loss": 1.1093, + "step": 2767 + }, + { + "epoch": 0.4928774928774929, + "grad_norm": 0.5512694716453552, + "learning_rate": 0.00019269296207755958, + "loss": 0.9657, + "step": 2768 + }, + { + 
"epoch": 0.4930555555555556, + "grad_norm": 0.4914868474006653, + "learning_rate": 0.00019268770882481948, + "loss": 1.0379, + "step": 2769 + }, + { + "epoch": 0.4932336182336182, + "grad_norm": 0.567337691783905, + "learning_rate": 0.00019268245375605626, + "loss": 1.004, + "step": 2770 + }, + { + "epoch": 0.4934116809116809, + "grad_norm": 0.518489420413971, + "learning_rate": 0.0001926771968713729, + "loss": 1.0734, + "step": 2771 + }, + { + "epoch": 0.4935897435897436, + "grad_norm": 0.567742109298706, + "learning_rate": 0.00019267193817087237, + "loss": 1.1276, + "step": 2772 + }, + { + "epoch": 0.49376780626780625, + "grad_norm": 0.5287964344024658, + "learning_rate": 0.00019266667765465773, + "loss": 1.1429, + "step": 2773 + }, + { + "epoch": 0.49394586894586895, + "grad_norm": 0.5302085876464844, + "learning_rate": 0.00019266141532283207, + "loss": 1.0934, + "step": 2774 + }, + { + "epoch": 0.49412393162393164, + "grad_norm": 0.5569987297058105, + "learning_rate": 0.00019265615117549842, + "loss": 1.1453, + "step": 2775 + }, + { + "epoch": 0.4943019943019943, + "grad_norm": 0.519695520401001, + "learning_rate": 0.00019265088521275997, + "loss": 1.1255, + "step": 2776 + }, + { + "epoch": 0.494480056980057, + "grad_norm": 0.5073211193084717, + "learning_rate": 0.0001926456174347199, + "loss": 1.0609, + "step": 2777 + }, + { + "epoch": 0.4946581196581197, + "grad_norm": 0.45028239488601685, + "learning_rate": 0.00019264034784148142, + "loss": 0.9098, + "step": 2778 + }, + { + "epoch": 0.4948361823361823, + "grad_norm": 0.6641215682029724, + "learning_rate": 0.00019263507643314776, + "loss": 0.8903, + "step": 2779 + }, + { + "epoch": 0.495014245014245, + "grad_norm": 0.5281413793563843, + "learning_rate": 0.00019262980320982224, + "loss": 1.2906, + "step": 2780 + }, + { + "epoch": 0.4951923076923077, + "grad_norm": 0.6256437301635742, + "learning_rate": 0.0001926245281716081, + "loss": 1.4142, + "step": 2781 + }, + { + "epoch": 0.49537037037037035, + 
"grad_norm": 0.5422517657279968, + "learning_rate": 0.00019261925131860877, + "loss": 1.1606, + "step": 2782 + }, + { + "epoch": 0.49554843304843305, + "grad_norm": 0.46938949823379517, + "learning_rate": 0.0001926139726509276, + "loss": 1.0333, + "step": 2783 + }, + { + "epoch": 0.49572649572649574, + "grad_norm": 0.5799683928489685, + "learning_rate": 0.000192608692168668, + "loss": 1.0333, + "step": 2784 + }, + { + "epoch": 0.4959045584045584, + "grad_norm": 0.5231602787971497, + "learning_rate": 0.0001926034098719335, + "loss": 1.1847, + "step": 2785 + }, + { + "epoch": 0.4960826210826211, + "grad_norm": 0.477845698595047, + "learning_rate": 0.00019259812576082752, + "loss": 1.0746, + "step": 2786 + }, + { + "epoch": 0.4962606837606838, + "grad_norm": 0.5490350723266602, + "learning_rate": 0.00019259283983545365, + "loss": 1.2462, + "step": 2787 + }, + { + "epoch": 0.4964387464387464, + "grad_norm": 0.5788847208023071, + "learning_rate": 0.0001925875520959154, + "loss": 1.3485, + "step": 2788 + }, + { + "epoch": 0.4966168091168091, + "grad_norm": 0.46184736490249634, + "learning_rate": 0.00019258226254231643, + "loss": 0.8673, + "step": 2789 + }, + { + "epoch": 0.4967948717948718, + "grad_norm": 0.4890633225440979, + "learning_rate": 0.0001925769711747603, + "loss": 0.9474, + "step": 2790 + }, + { + "epoch": 0.49697293447293445, + "grad_norm": 0.5719282627105713, + "learning_rate": 0.00019257167799335078, + "loss": 1.2532, + "step": 2791 + }, + { + "epoch": 0.49715099715099714, + "grad_norm": 0.5385584235191345, + "learning_rate": 0.0001925663829981915, + "loss": 1.1326, + "step": 2792 + }, + { + "epoch": 0.49732905982905984, + "grad_norm": 0.5339545011520386, + "learning_rate": 0.00019256108618938625, + "loss": 1.1362, + "step": 2793 + }, + { + "epoch": 0.49750712250712253, + "grad_norm": 0.5017803907394409, + "learning_rate": 0.00019255578756703878, + "loss": 1.0449, + "step": 2794 + }, + { + "epoch": 0.4976851851851852, + "grad_norm": 0.6004226803779602, + 
"learning_rate": 0.00019255048713125294, + "loss": 0.9346, + "step": 2795 + }, + { + "epoch": 0.49786324786324787, + "grad_norm": 0.44581490755081177, + "learning_rate": 0.00019254518488213255, + "loss": 1.038, + "step": 2796 + }, + { + "epoch": 0.49804131054131057, + "grad_norm": 0.5180951356887817, + "learning_rate": 0.00019253988081978151, + "loss": 1.0479, + "step": 2797 + }, + { + "epoch": 0.4982193732193732, + "grad_norm": 0.53944993019104, + "learning_rate": 0.00019253457494430376, + "loss": 1.2598, + "step": 2798 + }, + { + "epoch": 0.4983974358974359, + "grad_norm": 0.5633010268211365, + "learning_rate": 0.00019252926725580322, + "loss": 1.205, + "step": 2799 + }, + { + "epoch": 0.4985754985754986, + "grad_norm": 0.6653175950050354, + "learning_rate": 0.0001925239577543839, + "loss": 1.2383, + "step": 2800 + }, + { + "epoch": 0.49875356125356124, + "grad_norm": 0.5083333849906921, + "learning_rate": 0.00019251864644014984, + "loss": 1.0649, + "step": 2801 + }, + { + "epoch": 0.49893162393162394, + "grad_norm": 0.4842020571231842, + "learning_rate": 0.00019251333331320506, + "loss": 1.1991, + "step": 2802 + }, + { + "epoch": 0.49910968660968663, + "grad_norm": 0.47987112402915955, + "learning_rate": 0.00019250801837365373, + "loss": 1.1686, + "step": 2803 + }, + { + "epoch": 0.49928774928774927, + "grad_norm": 0.5316333770751953, + "learning_rate": 0.00019250270162159992, + "loss": 1.1759, + "step": 2804 + }, + { + "epoch": 0.49946581196581197, + "grad_norm": 0.5015079379081726, + "learning_rate": 0.00019249738305714787, + "loss": 0.9424, + "step": 2805 + }, + { + "epoch": 0.49964387464387466, + "grad_norm": 0.6488274931907654, + "learning_rate": 0.00019249206268040172, + "loss": 1.066, + "step": 2806 + }, + { + "epoch": 0.4998219373219373, + "grad_norm": 0.40364864468574524, + "learning_rate": 0.00019248674049146574, + "loss": 0.6998, + "step": 2807 + }, + { + "epoch": 0.5, + "grad_norm": 0.5535672903060913, + "learning_rate": 0.00019248141649044423, + 
"loss": 1.2207, + "step": 2808 + }, + { + "epoch": 0.5, + "eval_loss": 1.1072274446487427, + "eval_runtime": 28.6913, + "eval_samples_per_second": 36.283, + "eval_steps_per_second": 18.159, + "step": 2808 + }, + { + "epoch": 0.5001780626780626, + "grad_norm": 0.4834389090538025, + "learning_rate": 0.00019247609067744143, + "loss": 1.1686, + "step": 2809 + }, + { + "epoch": 0.5003561253561254, + "grad_norm": 0.5007249712944031, + "learning_rate": 0.00019247076305256176, + "loss": 1.1343, + "step": 2810 + }, + { + "epoch": 0.500534188034188, + "grad_norm": 0.4773348271846771, + "learning_rate": 0.00019246543361590957, + "loss": 0.9324, + "step": 2811 + }, + { + "epoch": 0.5007122507122507, + "grad_norm": 0.47324609756469727, + "learning_rate": 0.0001924601023675893, + "loss": 1.0223, + "step": 2812 + }, + { + "epoch": 0.5008903133903134, + "grad_norm": 0.5583845973014832, + "learning_rate": 0.00019245476930770537, + "loss": 1.1328, + "step": 2813 + }, + { + "epoch": 0.5010683760683761, + "grad_norm": 0.4814579486846924, + "learning_rate": 0.00019244943443636232, + "loss": 1.0528, + "step": 2814 + }, + { + "epoch": 0.5012464387464387, + "grad_norm": 0.4996104836463928, + "learning_rate": 0.00019244409775366465, + "loss": 1.2482, + "step": 2815 + }, + { + "epoch": 0.5014245014245015, + "grad_norm": 0.47870904207229614, + "learning_rate": 0.0001924387592597169, + "loss": 0.9452, + "step": 2816 + }, + { + "epoch": 0.5016025641025641, + "grad_norm": 0.5617441534996033, + "learning_rate": 0.0001924334189546237, + "loss": 1.378, + "step": 2817 + }, + { + "epoch": 0.5017806267806267, + "grad_norm": 0.4872083365917206, + "learning_rate": 0.00019242807683848967, + "loss": 1.1571, + "step": 2818 + }, + { + "epoch": 0.5019586894586895, + "grad_norm": 0.5147804021835327, + "learning_rate": 0.00019242273291141947, + "loss": 1.1086, + "step": 2819 + }, + { + "epoch": 0.5021367521367521, + "grad_norm": 0.4698995351791382, + "learning_rate": 0.00019241738717351784, + "loss": 1.1579, 
+ "step": 2820 + }, + { + "epoch": 0.5023148148148148, + "grad_norm": 0.5158926844596863, + "learning_rate": 0.00019241203962488946, + "loss": 1.2763, + "step": 2821 + }, + { + "epoch": 0.5024928774928775, + "grad_norm": 0.5218976736068726, + "learning_rate": 0.00019240669026563914, + "loss": 1.0633, + "step": 2822 + }, + { + "epoch": 0.5026709401709402, + "grad_norm": 0.5511452555656433, + "learning_rate": 0.0001924013390958717, + "loss": 0.9939, + "step": 2823 + }, + { + "epoch": 0.5028490028490028, + "grad_norm": 0.5227555632591248, + "learning_rate": 0.00019239598611569191, + "loss": 1.2478, + "step": 2824 + }, + { + "epoch": 0.5030270655270656, + "grad_norm": 0.5444719791412354, + "learning_rate": 0.00019239063132520475, + "loss": 1.1574, + "step": 2825 + }, + { + "epoch": 0.5032051282051282, + "grad_norm": 0.4752781093120575, + "learning_rate": 0.0001923852747245151, + "loss": 0.9034, + "step": 2826 + }, + { + "epoch": 0.5033831908831908, + "grad_norm": 0.5286496877670288, + "learning_rate": 0.00019237991631372792, + "loss": 1.1391, + "step": 2827 + }, + { + "epoch": 0.5035612535612536, + "grad_norm": 0.5009933710098267, + "learning_rate": 0.00019237455609294815, + "loss": 1.2178, + "step": 2828 + }, + { + "epoch": 0.5037393162393162, + "grad_norm": 0.5012276768684387, + "learning_rate": 0.00019236919406228085, + "loss": 0.9877, + "step": 2829 + }, + { + "epoch": 0.5039173789173789, + "grad_norm": 0.576508104801178, + "learning_rate": 0.00019236383022183106, + "loss": 1.1299, + "step": 2830 + }, + { + "epoch": 0.5040954415954416, + "grad_norm": 0.4716590642929077, + "learning_rate": 0.0001923584645717039, + "loss": 1.0451, + "step": 2831 + }, + { + "epoch": 0.5042735042735043, + "grad_norm": 0.5817418098449707, + "learning_rate": 0.00019235309711200448, + "loss": 1.0911, + "step": 2832 + }, + { + "epoch": 0.5044515669515669, + "grad_norm": 0.5695745944976807, + "learning_rate": 0.000192347727842838, + "loss": 1.0229, + "step": 2833 + }, + { + "epoch": 
0.5046296296296297, + "grad_norm": 0.49127066135406494, + "learning_rate": 0.00019234235676430958, + "loss": 1.1377, + "step": 2834 + }, + { + "epoch": 0.5048076923076923, + "grad_norm": 0.5426172614097595, + "learning_rate": 0.00019233698387652453, + "loss": 1.2427, + "step": 2835 + }, + { + "epoch": 0.5049857549857549, + "grad_norm": 0.5342385172843933, + "learning_rate": 0.0001923316091795881, + "loss": 1.1427, + "step": 2836 + }, + { + "epoch": 0.5051638176638177, + "grad_norm": 0.5480486750602722, + "learning_rate": 0.00019232623267360558, + "loss": 1.0647, + "step": 2837 + }, + { + "epoch": 0.5053418803418803, + "grad_norm": 0.4584530293941498, + "learning_rate": 0.00019232085435868235, + "loss": 1.0461, + "step": 2838 + }, + { + "epoch": 0.5055199430199431, + "grad_norm": 0.5992119908332825, + "learning_rate": 0.00019231547423492371, + "loss": 1.1456, + "step": 2839 + }, + { + "epoch": 0.5056980056980057, + "grad_norm": 0.514018177986145, + "learning_rate": 0.00019231009230243515, + "loss": 1.2559, + "step": 2840 + }, + { + "epoch": 0.5058760683760684, + "grad_norm": 0.5392283797264099, + "learning_rate": 0.0001923047085613221, + "loss": 1.044, + "step": 2841 + }, + { + "epoch": 0.5060541310541311, + "grad_norm": 0.4486566483974457, + "learning_rate": 0.00019229932301169, + "loss": 1.0679, + "step": 2842 + }, + { + "epoch": 0.5062321937321937, + "grad_norm": 0.4523460566997528, + "learning_rate": 0.00019229393565364442, + "loss": 1.1651, + "step": 2843 + }, + { + "epoch": 0.5064102564102564, + "grad_norm": 0.6032688021659851, + "learning_rate": 0.0001922885464872909, + "loss": 1.15, + "step": 2844 + }, + { + "epoch": 0.5065883190883191, + "grad_norm": 0.5883688926696777, + "learning_rate": 0.000192283155512735, + "loss": 1.2179, + "step": 2845 + }, + { + "epoch": 0.5067663817663818, + "grad_norm": 0.5534378886222839, + "learning_rate": 0.00019227776273008238, + "loss": 1.0387, + "step": 2846 + }, + { + "epoch": 0.5069444444444444, + "grad_norm": 
0.5899033546447754, + "learning_rate": 0.00019227236813943872, + "loss": 1.0812, + "step": 2847 + }, + { + "epoch": 0.5071225071225072, + "grad_norm": 0.5718855261802673, + "learning_rate": 0.00019226697174090965, + "loss": 1.1375, + "step": 2848 + }, + { + "epoch": 0.5073005698005698, + "grad_norm": 0.5080967545509338, + "learning_rate": 0.00019226157353460094, + "loss": 1.1421, + "step": 2849 + }, + { + "epoch": 0.5074786324786325, + "grad_norm": 0.5253677368164062, + "learning_rate": 0.0001922561735206184, + "loss": 1.0166, + "step": 2850 + }, + { + "epoch": 0.5076566951566952, + "grad_norm": 0.47797444462776184, + "learning_rate": 0.00019225077169906772, + "loss": 1.0504, + "step": 2851 + }, + { + "epoch": 0.5078347578347578, + "grad_norm": 0.4911690652370453, + "learning_rate": 0.0001922453680700548, + "loss": 1.0629, + "step": 2852 + }, + { + "epoch": 0.5080128205128205, + "grad_norm": 0.49678200483322144, + "learning_rate": 0.00019223996263368557, + "loss": 1.1672, + "step": 2853 + }, + { + "epoch": 0.5081908831908832, + "grad_norm": 0.5451810359954834, + "learning_rate": 0.00019223455539006586, + "loss": 1.3031, + "step": 2854 + }, + { + "epoch": 0.5083689458689459, + "grad_norm": 0.5708984136581421, + "learning_rate": 0.00019222914633930166, + "loss": 1.0986, + "step": 2855 + }, + { + "epoch": 0.5085470085470085, + "grad_norm": 0.47232356667518616, + "learning_rate": 0.00019222373548149888, + "loss": 1.0449, + "step": 2856 + }, + { + "epoch": 0.5087250712250713, + "grad_norm": 0.6027610898017883, + "learning_rate": 0.0001922183228167636, + "loss": 0.862, + "step": 2857 + }, + { + "epoch": 0.5089031339031339, + "grad_norm": 0.5211802124977112, + "learning_rate": 0.00019221290834520188, + "loss": 1.1048, + "step": 2858 + }, + { + "epoch": 0.5090811965811965, + "grad_norm": 0.45101237297058105, + "learning_rate": 0.00019220749206691972, + "loss": 1.0046, + "step": 2859 + }, + { + "epoch": 0.5092592592592593, + "grad_norm": 0.5526158213615417, + 
"learning_rate": 0.00019220207398202335, + "loss": 1.2275, + "step": 2860 + }, + { + "epoch": 0.5094373219373219, + "grad_norm": 0.48322010040283203, + "learning_rate": 0.00019219665409061885, + "loss": 0.9974, + "step": 2861 + }, + { + "epoch": 0.5096153846153846, + "grad_norm": 0.4775219261646271, + "learning_rate": 0.00019219123239281244, + "loss": 1.1852, + "step": 2862 + }, + { + "epoch": 0.5097934472934473, + "grad_norm": 0.46184200048446655, + "learning_rate": 0.00019218580888871034, + "loss": 0.9393, + "step": 2863 + }, + { + "epoch": 0.50997150997151, + "grad_norm": 0.47495174407958984, + "learning_rate": 0.00019218038357841883, + "loss": 0.9631, + "step": 2864 + }, + { + "epoch": 0.5101495726495726, + "grad_norm": 0.48600029945373535, + "learning_rate": 0.00019217495646204418, + "loss": 1.0498, + "step": 2865 + }, + { + "epoch": 0.5103276353276354, + "grad_norm": 0.5801547169685364, + "learning_rate": 0.00019216952753969274, + "loss": 1.2181, + "step": 2866 + }, + { + "epoch": 0.510505698005698, + "grad_norm": 0.5082106590270996, + "learning_rate": 0.00019216409681147085, + "loss": 1.2009, + "step": 2867 + }, + { + "epoch": 0.5106837606837606, + "grad_norm": 0.4184330701828003, + "learning_rate": 0.00019215866427748493, + "loss": 0.8462, + "step": 2868 + }, + { + "epoch": 0.5108618233618234, + "grad_norm": 0.518099844455719, + "learning_rate": 0.00019215322993784147, + "loss": 1.2091, + "step": 2869 + }, + { + "epoch": 0.511039886039886, + "grad_norm": 0.569464921951294, + "learning_rate": 0.0001921477937926469, + "loss": 1.0264, + "step": 2870 + }, + { + "epoch": 0.5112179487179487, + "grad_norm": 0.526767909526825, + "learning_rate": 0.00019214235584200768, + "loss": 1.1192, + "step": 2871 + }, + { + "epoch": 0.5113960113960114, + "grad_norm": 0.6511057019233704, + "learning_rate": 0.00019213691608603047, + "loss": 1.3193, + "step": 2872 + }, + { + "epoch": 0.5115740740740741, + "grad_norm": 0.48536401987075806, + "learning_rate": 
0.00019213147452482173, + "loss": 1.1671, + "step": 2873 + }, + { + "epoch": 0.5117521367521367, + "grad_norm": 0.7972469329833984, + "learning_rate": 0.00019212603115848818, + "loss": 1.1393, + "step": 2874 + }, + { + "epoch": 0.5119301994301995, + "grad_norm": 0.5543264746665955, + "learning_rate": 0.00019212058598713642, + "loss": 1.1436, + "step": 2875 + }, + { + "epoch": 0.5121082621082621, + "grad_norm": 0.49688720703125, + "learning_rate": 0.0001921151390108731, + "loss": 1.0897, + "step": 2876 + }, + { + "epoch": 0.5122863247863247, + "grad_norm": 0.4928736090660095, + "learning_rate": 0.000192109690229805, + "loss": 1.2426, + "step": 2877 + }, + { + "epoch": 0.5124643874643875, + "grad_norm": 0.4917896091938019, + "learning_rate": 0.0001921042396440389, + "loss": 1.0047, + "step": 2878 + }, + { + "epoch": 0.5126424501424501, + "grad_norm": 0.5485204458236694, + "learning_rate": 0.00019209878725368152, + "loss": 1.2615, + "step": 2879 + }, + { + "epoch": 0.5128205128205128, + "grad_norm": 0.5229470133781433, + "learning_rate": 0.0001920933330588397, + "loss": 1.3249, + "step": 2880 + }, + { + "epoch": 0.5129985754985755, + "grad_norm": 0.4783077538013458, + "learning_rate": 0.00019208787705962037, + "loss": 1.2004, + "step": 2881 + }, + { + "epoch": 0.5131766381766382, + "grad_norm": 0.5106910467147827, + "learning_rate": 0.00019208241925613035, + "loss": 1.1745, + "step": 2882 + }, + { + "epoch": 0.5133547008547008, + "grad_norm": 0.5308730006217957, + "learning_rate": 0.00019207695964847666, + "loss": 0.9706, + "step": 2883 + }, + { + "epoch": 0.5135327635327636, + "grad_norm": 0.5489775538444519, + "learning_rate": 0.00019207149823676617, + "loss": 1.0073, + "step": 2884 + }, + { + "epoch": 0.5137108262108262, + "grad_norm": 0.4992835521697998, + "learning_rate": 0.00019206603502110596, + "loss": 1.1053, + "step": 2885 + }, + { + "epoch": 0.5138888888888888, + "grad_norm": 0.5304922461509705, + "learning_rate": 0.00019206057000160302, + "loss": 1.0565, + 
"step": 2886 + }, + { + "epoch": 0.5140669515669516, + "grad_norm": 0.46411609649658203, + "learning_rate": 0.00019205510317836448, + "loss": 0.9202, + "step": 2887 + }, + { + "epoch": 0.5142450142450142, + "grad_norm": 0.5236835479736328, + "learning_rate": 0.0001920496345514974, + "loss": 0.9075, + "step": 2888 + }, + { + "epoch": 0.5144230769230769, + "grad_norm": 0.4416964054107666, + "learning_rate": 0.00019204416412110895, + "loss": 0.9225, + "step": 2889 + }, + { + "epoch": 0.5146011396011396, + "grad_norm": 0.5470940470695496, + "learning_rate": 0.00019203869188730633, + "loss": 1.2195, + "step": 2890 + }, + { + "epoch": 0.5147792022792023, + "grad_norm": 0.5380414128303528, + "learning_rate": 0.0001920332178501967, + "loss": 1.0731, + "step": 2891 + }, + { + "epoch": 0.5149572649572649, + "grad_norm": 0.4405716359615326, + "learning_rate": 0.00019202774200988737, + "loss": 0.8739, + "step": 2892 + }, + { + "epoch": 0.5151353276353277, + "grad_norm": 0.5222984552383423, + "learning_rate": 0.0001920222643664856, + "loss": 1.1806, + "step": 2893 + }, + { + "epoch": 0.5153133903133903, + "grad_norm": 0.48545539379119873, + "learning_rate": 0.0001920167849200987, + "loss": 0.9939, + "step": 2894 + }, + { + "epoch": 0.5154914529914529, + "grad_norm": 0.45078009366989136, + "learning_rate": 0.0001920113036708341, + "loss": 1.0085, + "step": 2895 + }, + { + "epoch": 0.5156695156695157, + "grad_norm": 0.5029830932617188, + "learning_rate": 0.00019200582061879913, + "loss": 1.1095, + "step": 2896 + }, + { + "epoch": 0.5158475783475783, + "grad_norm": 0.5316143035888672, + "learning_rate": 0.00019200033576410118, + "loss": 0.9883, + "step": 2897 + }, + { + "epoch": 0.5160256410256411, + "grad_norm": 0.5282100439071655, + "learning_rate": 0.0001919948491068478, + "loss": 1.1441, + "step": 2898 + }, + { + "epoch": 0.5162037037037037, + "grad_norm": 0.5145367980003357, + "learning_rate": 0.00019198936064714647, + "loss": 1.1999, + "step": 2899 + }, + { + "epoch": 
0.5163817663817664, + "grad_norm": 0.5385651588439941, + "learning_rate": 0.00019198387038510468, + "loss": 1.1831, + "step": 2900 + }, + { + "epoch": 0.5165598290598291, + "grad_norm": 0.4971916377544403, + "learning_rate": 0.00019197837832083002, + "loss": 1.2518, + "step": 2901 + }, + { + "epoch": 0.5167378917378918, + "grad_norm": 0.5253807306289673, + "learning_rate": 0.00019197288445443016, + "loss": 1.0788, + "step": 2902 + }, + { + "epoch": 0.5169159544159544, + "grad_norm": 0.49724945425987244, + "learning_rate": 0.00019196738878601263, + "loss": 1.0985, + "step": 2903 + }, + { + "epoch": 0.5170940170940171, + "grad_norm": 0.5327325463294983, + "learning_rate": 0.0001919618913156852, + "loss": 1.2862, + "step": 2904 + }, + { + "epoch": 0.5172720797720798, + "grad_norm": 0.639999270439148, + "learning_rate": 0.00019195639204355554, + "loss": 1.2052, + "step": 2905 + }, + { + "epoch": 0.5174501424501424, + "grad_norm": 0.4630785584449768, + "learning_rate": 0.0001919508909697314, + "loss": 1.1157, + "step": 2906 + }, + { + "epoch": 0.5176282051282052, + "grad_norm": 0.513949990272522, + "learning_rate": 0.00019194538809432055, + "loss": 1.0047, + "step": 2907 + }, + { + "epoch": 0.5178062678062678, + "grad_norm": 0.488034725189209, + "learning_rate": 0.0001919398834174308, + "loss": 0.9008, + "step": 2908 + }, + { + "epoch": 0.5179843304843305, + "grad_norm": 0.4892788529396057, + "learning_rate": 0.00019193437693917006, + "loss": 1.1024, + "step": 2909 + }, + { + "epoch": 0.5181623931623932, + "grad_norm": 0.5503842830657959, + "learning_rate": 0.00019192886865964618, + "loss": 1.2283, + "step": 2910 + }, + { + "epoch": 0.5183404558404558, + "grad_norm": 0.48885393142700195, + "learning_rate": 0.00019192335857896707, + "loss": 0.9522, + "step": 2911 + }, + { + "epoch": 0.5185185185185185, + "grad_norm": 0.5479527115821838, + "learning_rate": 0.00019191784669724072, + "loss": 1.1616, + "step": 2912 + }, + { + "epoch": 0.5186965811965812, + "grad_norm": 
0.42701148986816406, + "learning_rate": 0.00019191233301457506, + "loss": 0.8434, + "step": 2913 + }, + { + "epoch": 0.5188746438746439, + "grad_norm": 0.4273422658443451, + "learning_rate": 0.00019190681753107822, + "loss": 0.8316, + "step": 2914 + }, + { + "epoch": 0.5190527065527065, + "grad_norm": 0.5047736763954163, + "learning_rate": 0.00019190130024685818, + "loss": 1.171, + "step": 2915 + }, + { + "epoch": 0.5192307692307693, + "grad_norm": 0.5221177935600281, + "learning_rate": 0.00019189578116202307, + "loss": 1.0256, + "step": 2916 + }, + { + "epoch": 0.5194088319088319, + "grad_norm": 0.4782322943210602, + "learning_rate": 0.00019189026027668105, + "loss": 0.8598, + "step": 2917 + }, + { + "epoch": 0.5195868945868946, + "grad_norm": 0.5627185702323914, + "learning_rate": 0.00019188473759094022, + "loss": 1.1825, + "step": 2918 + }, + { + "epoch": 0.5197649572649573, + "grad_norm": 0.5036423206329346, + "learning_rate": 0.00019187921310490888, + "loss": 1.0881, + "step": 2919 + }, + { + "epoch": 0.51994301994302, + "grad_norm": 0.4271143972873688, + "learning_rate": 0.0001918736868186952, + "loss": 0.9265, + "step": 2920 + }, + { + "epoch": 0.5201210826210826, + "grad_norm": 0.5427432656288147, + "learning_rate": 0.00019186815873240747, + "loss": 1.196, + "step": 2921 + }, + { + "epoch": 0.5202991452991453, + "grad_norm": 0.5494198203086853, + "learning_rate": 0.00019186262884615402, + "loss": 1.1207, + "step": 2922 + }, + { + "epoch": 0.520477207977208, + "grad_norm": 0.5305119752883911, + "learning_rate": 0.0001918570971600432, + "loss": 1.0393, + "step": 2923 + }, + { + "epoch": 0.5206552706552706, + "grad_norm": 0.46713170409202576, + "learning_rate": 0.00019185156367418333, + "loss": 0.9583, + "step": 2924 + }, + { + "epoch": 0.5208333333333334, + "grad_norm": 0.597776472568512, + "learning_rate": 0.00019184602838868292, + "loss": 1.2978, + "step": 2925 + }, + { + "epoch": 0.521011396011396, + "grad_norm": 0.520976722240448, + "learning_rate": 
0.00019184049130365036, + "loss": 1.0515, + "step": 2926 + }, + { + "epoch": 0.5211894586894587, + "grad_norm": 0.5266290307044983, + "learning_rate": 0.00019183495241919415, + "loss": 1.0437, + "step": 2927 + }, + { + "epoch": 0.5213675213675214, + "grad_norm": 0.50911545753479, + "learning_rate": 0.00019182941173542285, + "loss": 0.9977, + "step": 2928 + }, + { + "epoch": 0.521545584045584, + "grad_norm": 0.4924670457839966, + "learning_rate": 0.00019182386925244496, + "loss": 0.9309, + "step": 2929 + }, + { + "epoch": 0.5217236467236467, + "grad_norm": 0.4979301393032074, + "learning_rate": 0.00019181832497036912, + "loss": 0.87, + "step": 2930 + }, + { + "epoch": 0.5219017094017094, + "grad_norm": 0.6307916045188904, + "learning_rate": 0.0001918127788893039, + "loss": 1.2159, + "step": 2931 + }, + { + "epoch": 0.5220797720797721, + "grad_norm": 0.4915660619735718, + "learning_rate": 0.00019180723100935802, + "loss": 1.0828, + "step": 2932 + }, + { + "epoch": 0.5222578347578347, + "grad_norm": 0.4312742352485657, + "learning_rate": 0.00019180168133064017, + "loss": 1.0496, + "step": 2933 + }, + { + "epoch": 0.5224358974358975, + "grad_norm": 0.6006124019622803, + "learning_rate": 0.00019179612985325908, + "loss": 1.0751, + "step": 2934 + }, + { + "epoch": 0.5226139601139601, + "grad_norm": 0.5332220196723938, + "learning_rate": 0.0001917905765773235, + "loss": 1.2601, + "step": 2935 + }, + { + "epoch": 0.5227920227920227, + "grad_norm": 0.4877954423427582, + "learning_rate": 0.00019178502150294223, + "loss": 1.2279, + "step": 2936 + }, + { + "epoch": 0.5229700854700855, + "grad_norm": 0.5975968837738037, + "learning_rate": 0.00019177946463022418, + "loss": 1.3371, + "step": 2937 + }, + { + "epoch": 0.5231481481481481, + "grad_norm": 0.5363923907279968, + "learning_rate": 0.00019177390595927815, + "loss": 1.0705, + "step": 2938 + }, + { + "epoch": 0.5233262108262108, + "grad_norm": 0.4314909875392914, + "learning_rate": 0.0001917683454902131, + "loss": 0.9172, + 
"step": 2939 + }, + { + "epoch": 0.5235042735042735, + "grad_norm": 0.46187883615493774, + "learning_rate": 0.0001917627832231379, + "loss": 1.1201, + "step": 2940 + }, + { + "epoch": 0.5236823361823362, + "grad_norm": 0.4648260772228241, + "learning_rate": 0.00019175721915816162, + "loss": 1.1307, + "step": 2941 + }, + { + "epoch": 0.5238603988603988, + "grad_norm": 0.4427165687084198, + "learning_rate": 0.00019175165329539325, + "loss": 0.9459, + "step": 2942 + }, + { + "epoch": 0.5240384615384616, + "grad_norm": 0.4645056128501892, + "learning_rate": 0.0001917460856349418, + "loss": 0.9176, + "step": 2943 + }, + { + "epoch": 0.5242165242165242, + "grad_norm": 0.4939568042755127, + "learning_rate": 0.0001917405161769164, + "loss": 1.1056, + "step": 2944 + }, + { + "epoch": 0.5243945868945868, + "grad_norm": 0.6057310104370117, + "learning_rate": 0.00019173494492142617, + "loss": 1.2714, + "step": 2945 + }, + { + "epoch": 0.5245726495726496, + "grad_norm": 0.5038546323776245, + "learning_rate": 0.00019172937186858025, + "loss": 0.911, + "step": 2946 + }, + { + "epoch": 0.5247507122507122, + "grad_norm": 0.5521321296691895, + "learning_rate": 0.00019172379701848784, + "loss": 1.0781, + "step": 2947 + }, + { + "epoch": 0.5249287749287749, + "grad_norm": 0.516979455947876, + "learning_rate": 0.00019171822037125817, + "loss": 1.1051, + "step": 2948 + }, + { + "epoch": 0.5251068376068376, + "grad_norm": 0.5443150997161865, + "learning_rate": 0.0001917126419270005, + "loss": 1.0802, + "step": 2949 + }, + { + "epoch": 0.5252849002849003, + "grad_norm": 0.5373311042785645, + "learning_rate": 0.00019170706168582412, + "loss": 0.9313, + "step": 2950 + }, + { + "epoch": 0.5254629629629629, + "grad_norm": 0.7511917948722839, + "learning_rate": 0.0001917014796478384, + "loss": 1.1958, + "step": 2951 + }, + { + "epoch": 0.5256410256410257, + "grad_norm": 0.49893468618392944, + "learning_rate": 0.00019169589581315263, + "loss": 0.9387, + "step": 2952 + }, + { + "epoch": 
0.5258190883190883, + "grad_norm": 0.48010289669036865, + "learning_rate": 0.00019169031018187628, + "loss": 1.2459, + "step": 2953 + }, + { + "epoch": 0.5259971509971509, + "grad_norm": 0.48768678307533264, + "learning_rate": 0.0001916847227541188, + "loss": 1.0127, + "step": 2954 + }, + { + "epoch": 0.5261752136752137, + "grad_norm": 0.5973068475723267, + "learning_rate": 0.00019167913352998963, + "loss": 1.1685, + "step": 2955 + }, + { + "epoch": 0.5263532763532763, + "grad_norm": 0.5567806959152222, + "learning_rate": 0.00019167354250959826, + "loss": 1.142, + "step": 2956 + }, + { + "epoch": 0.5265313390313391, + "grad_norm": 0.47819700837135315, + "learning_rate": 0.00019166794969305428, + "loss": 0.712, + "step": 2957 + }, + { + "epoch": 0.5267094017094017, + "grad_norm": 0.5191744565963745, + "learning_rate": 0.00019166235508046725, + "loss": 1.2208, + "step": 2958 + }, + { + "epoch": 0.5268874643874644, + "grad_norm": 0.4987856149673462, + "learning_rate": 0.00019165675867194675, + "loss": 1.0466, + "step": 2959 + }, + { + "epoch": 0.5270655270655271, + "grad_norm": 0.5017665028572083, + "learning_rate": 0.0001916511604676025, + "loss": 1.1236, + "step": 2960 + }, + { + "epoch": 0.5272435897435898, + "grad_norm": 0.5115348696708679, + "learning_rate": 0.00019164556046754415, + "loss": 1.1497, + "step": 2961 + }, + { + "epoch": 0.5274216524216524, + "grad_norm": 0.4934345781803131, + "learning_rate": 0.0001916399586718814, + "loss": 1.0183, + "step": 2962 + }, + { + "epoch": 0.5275997150997151, + "grad_norm": 0.5033719539642334, + "learning_rate": 0.00019163435508072404, + "loss": 1.0256, + "step": 2963 + }, + { + "epoch": 0.5277777777777778, + "grad_norm": 0.5325372219085693, + "learning_rate": 0.00019162874969418184, + "loss": 1.1384, + "step": 2964 + }, + { + "epoch": 0.5279558404558404, + "grad_norm": 0.4901772141456604, + "learning_rate": 0.00019162314251236465, + "loss": 1.0831, + "step": 2965 + }, + { + "epoch": 0.5281339031339032, + "grad_norm": 
0.4743805229663849, + "learning_rate": 0.0001916175335353823, + "loss": 1.1894, + "step": 2966 + }, + { + "epoch": 0.5283119658119658, + "grad_norm": 0.5439450740814209, + "learning_rate": 0.00019161192276334466, + "loss": 1.2066, + "step": 2967 + }, + { + "epoch": 0.5284900284900285, + "grad_norm": 0.5123090744018555, + "learning_rate": 0.00019160631019636174, + "loss": 1.1829, + "step": 2968 + }, + { + "epoch": 0.5286680911680912, + "grad_norm": 0.5995343923568726, + "learning_rate": 0.00019160069583454346, + "loss": 1.4872, + "step": 2969 + }, + { + "epoch": 0.5288461538461539, + "grad_norm": 0.4596657156944275, + "learning_rate": 0.00019159507967799985, + "loss": 0.8948, + "step": 2970 + }, + { + "epoch": 0.5290242165242165, + "grad_norm": 0.5533682107925415, + "learning_rate": 0.0001915894617268409, + "loss": 1.1779, + "step": 2971 + }, + { + "epoch": 0.5292022792022792, + "grad_norm": 0.3860718309879303, + "learning_rate": 0.00019158384198117673, + "loss": 0.6424, + "step": 2972 + }, + { + "epoch": 0.5293803418803419, + "grad_norm": 0.47424063086509705, + "learning_rate": 0.0001915782204411174, + "loss": 1.1592, + "step": 2973 + }, + { + "epoch": 0.5295584045584045, + "grad_norm": 0.5050228834152222, + "learning_rate": 0.00019157259710677309, + "loss": 1.1971, + "step": 2974 + }, + { + "epoch": 0.5297364672364673, + "grad_norm": 0.6080113649368286, + "learning_rate": 0.00019156697197825396, + "loss": 1.1511, + "step": 2975 + }, + { + "epoch": 0.5299145299145299, + "grad_norm": 0.4805932641029358, + "learning_rate": 0.00019156134505567024, + "loss": 1.1033, + "step": 2976 + }, + { + "epoch": 0.5300925925925926, + "grad_norm": 0.4835345447063446, + "learning_rate": 0.00019155571633913215, + "loss": 1.1832, + "step": 2977 + }, + { + "epoch": 0.5302706552706553, + "grad_norm": 0.5183725953102112, + "learning_rate": 0.00019155008582875, + "loss": 0.9221, + "step": 2978 + }, + { + "epoch": 0.530448717948718, + "grad_norm": 0.48015761375427246, + "learning_rate": 
0.00019154445352463412, + "loss": 1.045, + "step": 2979 + }, + { + "epoch": 0.5306267806267806, + "grad_norm": 0.4670043885707855, + "learning_rate": 0.0001915388194268948, + "loss": 0.9025, + "step": 2980 + }, + { + "epoch": 0.5308048433048433, + "grad_norm": 0.5048824548721313, + "learning_rate": 0.0001915331835356425, + "loss": 1.0681, + "step": 2981 + }, + { + "epoch": 0.530982905982906, + "grad_norm": 0.4785633981227875, + "learning_rate": 0.00019152754585098758, + "loss": 1.0097, + "step": 2982 + }, + { + "epoch": 0.5311609686609686, + "grad_norm": 0.4829573333263397, + "learning_rate": 0.00019152190637304056, + "loss": 1.0856, + "step": 2983 + }, + { + "epoch": 0.5313390313390314, + "grad_norm": 0.5425563454627991, + "learning_rate": 0.00019151626510191189, + "loss": 1.2313, + "step": 2984 + }, + { + "epoch": 0.531517094017094, + "grad_norm": 0.5532251596450806, + "learning_rate": 0.0001915106220377121, + "loss": 1.0328, + "step": 2985 + }, + { + "epoch": 0.5316951566951567, + "grad_norm": 0.47016972303390503, + "learning_rate": 0.0001915049771805518, + "loss": 1.2003, + "step": 2986 + }, + { + "epoch": 0.5318732193732194, + "grad_norm": 0.5241743326187134, + "learning_rate": 0.00019149933053054153, + "loss": 1.046, + "step": 2987 + }, + { + "epoch": 0.532051282051282, + "grad_norm": 0.5043526887893677, + "learning_rate": 0.00019149368208779197, + "loss": 1.0022, + "step": 2988 + }, + { + "epoch": 0.5322293447293447, + "grad_norm": 0.5563312768936157, + "learning_rate": 0.00019148803185241374, + "loss": 1.1017, + "step": 2989 + }, + { + "epoch": 0.5324074074074074, + "grad_norm": 0.5414231419563293, + "learning_rate": 0.00019148237982451763, + "loss": 0.9649, + "step": 2990 + }, + { + "epoch": 0.5325854700854701, + "grad_norm": 0.5452231764793396, + "learning_rate": 0.0001914767260042143, + "loss": 1.2281, + "step": 2991 + }, + { + "epoch": 0.5327635327635327, + "grad_norm": 0.5500698685646057, + "learning_rate": 0.00019147107039161454, + "loss": 1.2865, + 
"step": 2992 + }, + { + "epoch": 0.5329415954415955, + "grad_norm": 0.49747416377067566, + "learning_rate": 0.00019146541298682918, + "loss": 1.1296, + "step": 2993 + }, + { + "epoch": 0.5331196581196581, + "grad_norm": 0.5684167742729187, + "learning_rate": 0.00019145975378996903, + "loss": 1.0685, + "step": 2994 + }, + { + "epoch": 0.5332977207977208, + "grad_norm": 0.5411235690116882, + "learning_rate": 0.00019145409280114502, + "loss": 1.1372, + "step": 2995 + }, + { + "epoch": 0.5334757834757835, + "grad_norm": 0.5006675720214844, + "learning_rate": 0.00019144843002046806, + "loss": 1.0688, + "step": 2996 + }, + { + "epoch": 0.5336538461538461, + "grad_norm": 0.4591315686702728, + "learning_rate": 0.00019144276544804908, + "loss": 1.1071, + "step": 2997 + }, + { + "epoch": 0.5338319088319088, + "grad_norm": 0.5615306496620178, + "learning_rate": 0.000191437099083999, + "loss": 1.1033, + "step": 2998 + }, + { + "epoch": 0.5340099715099715, + "grad_norm": 0.4986817240715027, + "learning_rate": 0.00019143143092842897, + "loss": 1.176, + "step": 2999 + }, + { + "epoch": 0.5341880341880342, + "grad_norm": 0.5017120242118835, + "learning_rate": 0.00019142576098144995, + "loss": 1.0174, + "step": 3000 + }, + { + "epoch": 0.5343660968660968, + "grad_norm": 0.508298397064209, + "learning_rate": 0.0001914200892431731, + "loss": 1.164, + "step": 3001 + }, + { + "epoch": 0.5345441595441596, + "grad_norm": 0.48068809509277344, + "learning_rate": 0.0001914144157137095, + "loss": 0.7959, + "step": 3002 + }, + { + "epoch": 0.5347222222222222, + "grad_norm": 0.6347028017044067, + "learning_rate": 0.0001914087403931703, + "loss": 1.1727, + "step": 3003 + }, + { + "epoch": 0.5349002849002849, + "grad_norm": 0.5558401942253113, + "learning_rate": 0.00019140306328166676, + "loss": 1.2282, + "step": 3004 + }, + { + "epoch": 0.5350783475783476, + "grad_norm": 0.5093596577644348, + "learning_rate": 0.00019139738437931004, + "loss": 1.3258, + "step": 3005 + }, + { + "epoch": 
0.5352564102564102, + "grad_norm": 0.4653106927871704, + "learning_rate": 0.0001913917036862114, + "loss": 1.1062, + "step": 3006 + }, + { + "epoch": 0.5354344729344729, + "grad_norm": 0.48085781931877136, + "learning_rate": 0.00019138602120248222, + "loss": 0.9019, + "step": 3007 + }, + { + "epoch": 0.5356125356125356, + "grad_norm": 0.5174745321273804, + "learning_rate": 0.0001913803369282338, + "loss": 1.044, + "step": 3008 + }, + { + "epoch": 0.5357905982905983, + "grad_norm": 0.5359669327735901, + "learning_rate": 0.00019137465086357746, + "loss": 1.0723, + "step": 3009 + }, + { + "epoch": 0.5359686609686609, + "grad_norm": 0.5583470463752747, + "learning_rate": 0.00019136896300862467, + "loss": 1.2192, + "step": 3010 + }, + { + "epoch": 0.5361467236467237, + "grad_norm": 0.4905693829059601, + "learning_rate": 0.00019136327336348688, + "loss": 1.2372, + "step": 3011 + }, + { + "epoch": 0.5363247863247863, + "grad_norm": 0.5741264820098877, + "learning_rate": 0.0001913575819282755, + "loss": 1.1703, + "step": 3012 + }, + { + "epoch": 0.5365028490028491, + "grad_norm": 0.577033281326294, + "learning_rate": 0.0001913518887031021, + "loss": 1.1555, + "step": 3013 + }, + { + "epoch": 0.5366809116809117, + "grad_norm": 0.46795153617858887, + "learning_rate": 0.00019134619368807822, + "loss": 0.8583, + "step": 3014 + }, + { + "epoch": 0.5368589743589743, + "grad_norm": 0.5973345637321472, + "learning_rate": 0.0001913404968833154, + "loss": 1.1509, + "step": 3015 + }, + { + "epoch": 0.5370370370370371, + "grad_norm": 0.62020343542099, + "learning_rate": 0.00019133479828892531, + "loss": 1.0781, + "step": 3016 + }, + { + "epoch": 0.5372150997150997, + "grad_norm": 0.5342286229133606, + "learning_rate": 0.00019132909790501958, + "loss": 1.1556, + "step": 3017 + }, + { + "epoch": 0.5373931623931624, + "grad_norm": 0.49612846970558167, + "learning_rate": 0.0001913233957317099, + "loss": 0.9027, + "step": 3018 + }, + { + "epoch": 0.5375712250712251, + "grad_norm": 
0.5403908491134644, + "learning_rate": 0.00019131769176910796, + "loss": 1.1125, + "step": 3019 + }, + { + "epoch": 0.5377492877492878, + "grad_norm": 0.4952050447463989, + "learning_rate": 0.0001913119860173256, + "loss": 1.2329, + "step": 3020 + }, + { + "epoch": 0.5379273504273504, + "grad_norm": 0.5877819657325745, + "learning_rate": 0.0001913062784764745, + "loss": 1.2855, + "step": 3021 + }, + { + "epoch": 0.5381054131054132, + "grad_norm": 0.49312907457351685, + "learning_rate": 0.00019130056914666655, + "loss": 1.0212, + "step": 3022 + }, + { + "epoch": 0.5382834757834758, + "grad_norm": 0.45544490218162537, + "learning_rate": 0.00019129485802801366, + "loss": 0.9748, + "step": 3023 + }, + { + "epoch": 0.5384615384615384, + "grad_norm": 0.5535242557525635, + "learning_rate": 0.00019128914512062762, + "loss": 1.2134, + "step": 3024 + }, + { + "epoch": 0.5386396011396012, + "grad_norm": 0.45369696617126465, + "learning_rate": 0.00019128343042462044, + "loss": 0.9964, + "step": 3025 + }, + { + "epoch": 0.5388176638176638, + "grad_norm": 0.6240725517272949, + "learning_rate": 0.00019127771394010406, + "loss": 1.425, + "step": 3026 + }, + { + "epoch": 0.5389957264957265, + "grad_norm": 0.4859573245048523, + "learning_rate": 0.0001912719956671905, + "loss": 1.087, + "step": 3027 + }, + { + "epoch": 0.5391737891737892, + "grad_norm": 0.47529762983322144, + "learning_rate": 0.0001912662756059918, + "loss": 0.9517, + "step": 3028 + }, + { + "epoch": 0.5393518518518519, + "grad_norm": 0.5317288637161255, + "learning_rate": 0.00019126055375661997, + "loss": 1.0945, + "step": 3029 + }, + { + "epoch": 0.5395299145299145, + "grad_norm": 0.55974280834198, + "learning_rate": 0.00019125483011918722, + "loss": 1.0794, + "step": 3030 + }, + { + "epoch": 0.5397079772079773, + "grad_norm": 0.48579123616218567, + "learning_rate": 0.0001912491046938056, + "loss": 1.1421, + "step": 3031 + }, + { + "epoch": 0.5398860398860399, + "grad_norm": 0.4917181134223938, + "learning_rate": 
0.00019124337748058733, + "loss": 0.9708, + "step": 3032 + }, + { + "epoch": 0.5400641025641025, + "grad_norm": 0.525291383266449, + "learning_rate": 0.00019123764847964466, + "loss": 1.064, + "step": 3033 + }, + { + "epoch": 0.5402421652421653, + "grad_norm": 0.5733301639556885, + "learning_rate": 0.00019123191769108977, + "loss": 1.2142, + "step": 3034 + }, + { + "epoch": 0.5404202279202279, + "grad_norm": 0.5400987863540649, + "learning_rate": 0.00019122618511503494, + "loss": 1.1309, + "step": 3035 + }, + { + "epoch": 0.5405982905982906, + "grad_norm": 0.6261051893234253, + "learning_rate": 0.00019122045075159257, + "loss": 1.2112, + "step": 3036 + }, + { + "epoch": 0.5407763532763533, + "grad_norm": 0.5483576059341431, + "learning_rate": 0.0001912147146008749, + "loss": 1.2705, + "step": 3037 + }, + { + "epoch": 0.540954415954416, + "grad_norm": 0.5442137122154236, + "learning_rate": 0.00019120897666299443, + "loss": 1.2512, + "step": 3038 + }, + { + "epoch": 0.5411324786324786, + "grad_norm": 0.5680811405181885, + "learning_rate": 0.00019120323693806355, + "loss": 1.392, + "step": 3039 + }, + { + "epoch": 0.5413105413105413, + "grad_norm": 0.5237287878990173, + "learning_rate": 0.00019119749542619466, + "loss": 1.1599, + "step": 3040 + }, + { + "epoch": 0.541488603988604, + "grad_norm": 0.48119300603866577, + "learning_rate": 0.00019119175212750032, + "loss": 1.0976, + "step": 3041 + }, + { + "epoch": 0.5416666666666666, + "grad_norm": 0.507033109664917, + "learning_rate": 0.00019118600704209302, + "loss": 1.0181, + "step": 3042 + }, + { + "epoch": 0.5418447293447294, + "grad_norm": 0.484672874212265, + "learning_rate": 0.00019118026017008531, + "loss": 1.1636, + "step": 3043 + }, + { + "epoch": 0.542022792022792, + "grad_norm": 0.4923502206802368, + "learning_rate": 0.00019117451151158985, + "loss": 1.0388, + "step": 3044 + }, + { + "epoch": 0.5422008547008547, + "grad_norm": 0.4882057309150696, + "learning_rate": 0.00019116876106671922, + "loss": 1.131, + 
"step": 3045 + }, + { + "epoch": 0.5423789173789174, + "grad_norm": 0.6068355441093445, + "learning_rate": 0.0001911630088355861, + "loss": 1.3218, + "step": 3046 + }, + { + "epoch": 0.54255698005698, + "grad_norm": 0.5012881755828857, + "learning_rate": 0.0001911572548183032, + "loss": 1.0514, + "step": 3047 + }, + { + "epoch": 0.5427350427350427, + "grad_norm": 0.49849793314933777, + "learning_rate": 0.00019115149901498328, + "loss": 1.0003, + "step": 3048 + }, + { + "epoch": 0.5429131054131054, + "grad_norm": 0.4934251010417938, + "learning_rate": 0.00019114574142573904, + "loss": 1.0319, + "step": 3049 + }, + { + "epoch": 0.5430911680911681, + "grad_norm": 0.4947762191295624, + "learning_rate": 0.00019113998205068334, + "loss": 1.0906, + "step": 3050 + }, + { + "epoch": 0.5432692307692307, + "grad_norm": 0.5449416041374207, + "learning_rate": 0.00019113422088992907, + "loss": 0.9093, + "step": 3051 + }, + { + "epoch": 0.5434472934472935, + "grad_norm": 0.49395284056663513, + "learning_rate": 0.00019112845794358902, + "loss": 1.0071, + "step": 3052 + }, + { + "epoch": 0.5436253561253561, + "grad_norm": 0.5478728413581848, + "learning_rate": 0.00019112269321177613, + "loss": 1.2124, + "step": 3053 + }, + { + "epoch": 0.5438034188034188, + "grad_norm": 0.6205173134803772, + "learning_rate": 0.0001911169266946034, + "loss": 1.021, + "step": 3054 + }, + { + "epoch": 0.5439814814814815, + "grad_norm": 0.4777783751487732, + "learning_rate": 0.00019111115839218372, + "loss": 0.9192, + "step": 3055 + }, + { + "epoch": 0.5441595441595442, + "grad_norm": 0.5541689991950989, + "learning_rate": 0.00019110538830463018, + "loss": 1.1248, + "step": 3056 + }, + { + "epoch": 0.5443376068376068, + "grad_norm": 0.4750942289829254, + "learning_rate": 0.0001910996164320558, + "loss": 1.3147, + "step": 3057 + }, + { + "epoch": 0.5445156695156695, + "grad_norm": 0.6283948421478271, + "learning_rate": 0.0001910938427745737, + "loss": 1.0919, + "step": 3058 + }, + { + "epoch": 
0.5446937321937322, + "grad_norm": 0.552725076675415, + "learning_rate": 0.00019108806733229698, + "loss": 1.3807, + "step": 3059 + }, + { + "epoch": 0.5448717948717948, + "grad_norm": 0.4832848310470581, + "learning_rate": 0.0001910822901053388, + "loss": 1.0705, + "step": 3060 + }, + { + "epoch": 0.5450498575498576, + "grad_norm": 0.6468375325202942, + "learning_rate": 0.00019107651109381233, + "loss": 1.0766, + "step": 3061 + }, + { + "epoch": 0.5452279202279202, + "grad_norm": 0.5464920401573181, + "learning_rate": 0.00019107073029783083, + "loss": 1.0453, + "step": 3062 + }, + { + "epoch": 0.5454059829059829, + "grad_norm": 0.5321210026741028, + "learning_rate": 0.0001910649477175076, + "loss": 1.2326, + "step": 3063 + }, + { + "epoch": 0.5455840455840456, + "grad_norm": 0.5572962164878845, + "learning_rate": 0.00019105916335295582, + "loss": 1.0673, + "step": 3064 + }, + { + "epoch": 0.5457621082621082, + "grad_norm": 0.5239177942276001, + "learning_rate": 0.00019105337720428894, + "loss": 1.04, + "step": 3065 + }, + { + "epoch": 0.5459401709401709, + "grad_norm": 0.5633319616317749, + "learning_rate": 0.00019104758927162023, + "loss": 0.9606, + "step": 3066 + }, + { + "epoch": 0.5461182336182336, + "grad_norm": 0.5317914485931396, + "learning_rate": 0.0001910417995550632, + "loss": 1.0651, + "step": 3067 + }, + { + "epoch": 0.5462962962962963, + "grad_norm": 0.5126453638076782, + "learning_rate": 0.00019103600805473118, + "loss": 1.0316, + "step": 3068 + }, + { + "epoch": 0.5464743589743589, + "grad_norm": 0.5262107253074646, + "learning_rate": 0.00019103021477073773, + "loss": 1.0752, + "step": 3069 + }, + { + "epoch": 0.5466524216524217, + "grad_norm": 0.5384877324104309, + "learning_rate": 0.0001910244197031963, + "loss": 1.1731, + "step": 3070 + }, + { + "epoch": 0.5468304843304843, + "grad_norm": 0.5126553773880005, + "learning_rate": 0.00019101862285222048, + "loss": 1.2229, + "step": 3071 + }, + { + "epoch": 0.5470085470085471, + "grad_norm": 
0.4841194450855255, + "learning_rate": 0.0001910128242179238, + "loss": 0.9955, + "step": 3072 + }, + { + "epoch": 0.5471866096866097, + "grad_norm": 0.526546061038971, + "learning_rate": 0.00019100702380041987, + "loss": 1.2436, + "step": 3073 + }, + { + "epoch": 0.5473646723646723, + "grad_norm": 0.5085833072662354, + "learning_rate": 0.0001910012215998224, + "loss": 1.011, + "step": 3074 + }, + { + "epoch": 0.5475427350427351, + "grad_norm": 0.5149994492530823, + "learning_rate": 0.000190995417616245, + "loss": 0.8632, + "step": 3075 + }, + { + "epoch": 0.5477207977207977, + "grad_norm": 0.48079630732536316, + "learning_rate": 0.00019098961184980145, + "loss": 1.1115, + "step": 3076 + }, + { + "epoch": 0.5478988603988604, + "grad_norm": 0.5769477486610413, + "learning_rate": 0.00019098380430060546, + "loss": 0.9544, + "step": 3077 + }, + { + "epoch": 0.5480769230769231, + "grad_norm": 0.5260093808174133, + "learning_rate": 0.0001909779949687708, + "loss": 1.2354, + "step": 3078 + }, + { + "epoch": 0.5482549857549858, + "grad_norm": 0.5518734455108643, + "learning_rate": 0.00019097218385441135, + "loss": 1.1944, + "step": 3079 + }, + { + "epoch": 0.5484330484330484, + "grad_norm": 0.5436808466911316, + "learning_rate": 0.00019096637095764095, + "loss": 1.0717, + "step": 3080 + }, + { + "epoch": 0.5486111111111112, + "grad_norm": 0.4749584197998047, + "learning_rate": 0.00019096055627857344, + "loss": 1.0417, + "step": 3081 + }, + { + "epoch": 0.5487891737891738, + "grad_norm": 0.5485591292381287, + "learning_rate": 0.0001909547398173228, + "loss": 1.2515, + "step": 3082 + }, + { + "epoch": 0.5489672364672364, + "grad_norm": 0.5751016736030579, + "learning_rate": 0.00019094892157400296, + "loss": 1.2112, + "step": 3083 + }, + { + "epoch": 0.5491452991452992, + "grad_norm": 0.5404475331306458, + "learning_rate": 0.00019094310154872795, + "loss": 0.4334, + "step": 3084 + }, + { + "epoch": 0.5493233618233618, + "grad_norm": 0.5198020935058594, + "learning_rate": 
0.00019093727974161178, + "loss": 0.9759, + "step": 3085 + }, + { + "epoch": 0.5495014245014245, + "grad_norm": 0.4893439710140228, + "learning_rate": 0.0001909314561527685, + "loss": 1.1287, + "step": 3086 + }, + { + "epoch": 0.5496794871794872, + "grad_norm": 0.5675956606864929, + "learning_rate": 0.00019092563078231228, + "loss": 1.234, + "step": 3087 + }, + { + "epoch": 0.5498575498575499, + "grad_norm": 0.5539132356643677, + "learning_rate": 0.00019091980363035714, + "loss": 1.2378, + "step": 3088 + }, + { + "epoch": 0.5500356125356125, + "grad_norm": 0.5194353461265564, + "learning_rate": 0.00019091397469701735, + "loss": 1.1338, + "step": 3089 + }, + { + "epoch": 0.5502136752136753, + "grad_norm": 0.5143756866455078, + "learning_rate": 0.0001909081439824071, + "loss": 0.9118, + "step": 3090 + }, + { + "epoch": 0.5503917378917379, + "grad_norm": 0.5624327659606934, + "learning_rate": 0.0001909023114866406, + "loss": 1.035, + "step": 3091 + }, + { + "epoch": 0.5505698005698005, + "grad_norm": 0.5285067558288574, + "learning_rate": 0.0001908964772098321, + "loss": 1.0451, + "step": 3092 + }, + { + "epoch": 0.5507478632478633, + "grad_norm": 0.5730587244033813, + "learning_rate": 0.000190890641152096, + "loss": 1.0672, + "step": 3093 + }, + { + "epoch": 0.5509259259259259, + "grad_norm": 0.5822951197624207, + "learning_rate": 0.0001908848033135466, + "loss": 1.1791, + "step": 3094 + }, + { + "epoch": 0.5511039886039886, + "grad_norm": 0.596161961555481, + "learning_rate": 0.00019087896369429826, + "loss": 1.0954, + "step": 3095 + }, + { + "epoch": 0.5512820512820513, + "grad_norm": 0.5138190984725952, + "learning_rate": 0.00019087312229446542, + "loss": 0.896, + "step": 3096 + }, + { + "epoch": 0.551460113960114, + "grad_norm": 0.5061872601509094, + "learning_rate": 0.0001908672791141625, + "loss": 1.1017, + "step": 3097 + }, + { + "epoch": 0.5516381766381766, + "grad_norm": 0.5189547538757324, + "learning_rate": 0.00019086143415350404, + "loss": 1.2906, + 
"step": 3098 + }, + { + "epoch": 0.5518162393162394, + "grad_norm": 0.5640039443969727, + "learning_rate": 0.00019085558741260448, + "loss": 1.1001, + "step": 3099 + }, + { + "epoch": 0.551994301994302, + "grad_norm": 0.453867107629776, + "learning_rate": 0.00019084973889157844, + "loss": 0.9731, + "step": 3100 + }, + { + "epoch": 0.5521723646723646, + "grad_norm": 0.5431303977966309, + "learning_rate": 0.0001908438885905405, + "loss": 1.3511, + "step": 3101 + }, + { + "epoch": 0.5523504273504274, + "grad_norm": 0.47693368792533875, + "learning_rate": 0.00019083803650960527, + "loss": 1.0426, + "step": 3102 + }, + { + "epoch": 0.55252849002849, + "grad_norm": 0.4663422703742981, + "learning_rate": 0.00019083218264888743, + "loss": 1.05, + "step": 3103 + }, + { + "epoch": 0.5527065527065527, + "grad_norm": 0.561354398727417, + "learning_rate": 0.00019082632700850164, + "loss": 0.9608, + "step": 3104 + }, + { + "epoch": 0.5528846153846154, + "grad_norm": 0.4981916844844818, + "learning_rate": 0.00019082046958856266, + "loss": 1.1935, + "step": 3105 + }, + { + "epoch": 0.5530626780626781, + "grad_norm": 0.5301326513290405, + "learning_rate": 0.0001908146103891852, + "loss": 1.0646, + "step": 3106 + }, + { + "epoch": 0.5532407407407407, + "grad_norm": 0.5023610591888428, + "learning_rate": 0.00019080874941048416, + "loss": 1.127, + "step": 3107 + }, + { + "epoch": 0.5534188034188035, + "grad_norm": 0.5172514319419861, + "learning_rate": 0.00019080288665257426, + "loss": 1.0435, + "step": 3108 + }, + { + "epoch": 0.5535968660968661, + "grad_norm": 0.6340598464012146, + "learning_rate": 0.00019079702211557048, + "loss": 1.3528, + "step": 3109 + }, + { + "epoch": 0.5537749287749287, + "grad_norm": 0.46882256865501404, + "learning_rate": 0.0001907911557995876, + "loss": 1.1361, + "step": 3110 + }, + { + "epoch": 0.5539529914529915, + "grad_norm": 0.6401382088661194, + "learning_rate": 0.00019078528770474068, + "loss": 1.2415, + "step": 3111 + }, + { + "epoch": 
0.5541310541310541, + "grad_norm": 0.5141328573226929, + "learning_rate": 0.00019077941783114463, + "loss": 1.0505, + "step": 3112 + }, + { + "epoch": 0.5543091168091168, + "grad_norm": 0.522318959236145, + "learning_rate": 0.00019077354617891444, + "loss": 1.0964, + "step": 3113 + }, + { + "epoch": 0.5544871794871795, + "grad_norm": 0.539551854133606, + "learning_rate": 0.00019076767274816517, + "loss": 1.0735, + "step": 3114 + }, + { + "epoch": 0.5546652421652422, + "grad_norm": 0.495320200920105, + "learning_rate": 0.00019076179753901195, + "loss": 0.9754, + "step": 3115 + }, + { + "epoch": 0.5548433048433048, + "grad_norm": 0.5499199628829956, + "learning_rate": 0.00019075592055156984, + "loss": 1.0043, + "step": 3116 + }, + { + "epoch": 0.5550213675213675, + "grad_norm": 0.5352509617805481, + "learning_rate": 0.00019075004178595396, + "loss": 1.1701, + "step": 3117 + }, + { + "epoch": 0.5551994301994302, + "grad_norm": 0.5392300486564636, + "learning_rate": 0.00019074416124227953, + "loss": 1.1612, + "step": 3118 + }, + { + "epoch": 0.5553774928774928, + "grad_norm": 0.5195050835609436, + "learning_rate": 0.0001907382789206618, + "loss": 1.0934, + "step": 3119 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 0.5276884436607361, + "learning_rate": 0.000190732394821216, + "loss": 0.9011, + "step": 3120 + }, + { + "epoch": 0.5557336182336182, + "grad_norm": 0.6115903258323669, + "learning_rate": 0.00019072650894405734, + "loss": 1.3065, + "step": 3121 + }, + { + "epoch": 0.5559116809116809, + "grad_norm": 0.5752483010292053, + "learning_rate": 0.00019072062128930127, + "loss": 1.0063, + "step": 3122 + }, + { + "epoch": 0.5560897435897436, + "grad_norm": 0.5508273243904114, + "learning_rate": 0.00019071473185706302, + "loss": 1.2598, + "step": 3123 + }, + { + "epoch": 0.5562678062678063, + "grad_norm": 0.49712198972702026, + "learning_rate": 0.00019070884064745808, + "loss": 0.924, + "step": 3124 + }, + { + "epoch": 0.5564458689458689, + "grad_norm": 
0.572849452495575, + "learning_rate": 0.00019070294766060185, + "loss": 0.9683, + "step": 3125 + }, + { + "epoch": 0.5566239316239316, + "grad_norm": 0.4807920753955841, + "learning_rate": 0.00019069705289660976, + "loss": 1.0998, + "step": 3126 + }, + { + "epoch": 0.5568019943019943, + "grad_norm": 0.5543031096458435, + "learning_rate": 0.0001906911563555973, + "loss": 1.0878, + "step": 3127 + }, + { + "epoch": 0.5569800569800569, + "grad_norm": 0.5710418820381165, + "learning_rate": 0.00019068525803768007, + "loss": 1.0381, + "step": 3128 + }, + { + "epoch": 0.5571581196581197, + "grad_norm": 0.5169163346290588, + "learning_rate": 0.00019067935794297357, + "loss": 1.1149, + "step": 3129 + }, + { + "epoch": 0.5573361823361823, + "grad_norm": 0.6474376916885376, + "learning_rate": 0.00019067345607159345, + "loss": 0.9828, + "step": 3130 + }, + { + "epoch": 0.5575142450142451, + "grad_norm": 0.5029847621917725, + "learning_rate": 0.0001906675524236553, + "loss": 0.797, + "step": 3131 + }, + { + "epoch": 0.5576923076923077, + "grad_norm": 0.5681431293487549, + "learning_rate": 0.00019066164699927478, + "loss": 1.1565, + "step": 3132 + }, + { + "epoch": 0.5578703703703703, + "grad_norm": 0.5654549598693848, + "learning_rate": 0.00019065573979856764, + "loss": 1.2488, + "step": 3133 + }, + { + "epoch": 0.5580484330484331, + "grad_norm": 0.47653043270111084, + "learning_rate": 0.0001906498308216496, + "loss": 1.0428, + "step": 3134 + }, + { + "epoch": 0.5582264957264957, + "grad_norm": 0.5068467259407043, + "learning_rate": 0.00019064392006863643, + "loss": 0.9659, + "step": 3135 + }, + { + "epoch": 0.5584045584045584, + "grad_norm": 0.7076661586761475, + "learning_rate": 0.00019063800753964393, + "loss": 1.1289, + "step": 3136 + }, + { + "epoch": 0.5585826210826211, + "grad_norm": 0.551456868648529, + "learning_rate": 0.000190632093234788, + "loss": 1.1925, + "step": 3137 + }, + { + "epoch": 0.5587606837606838, + "grad_norm": 0.518276035785675, + "learning_rate": 
0.00019062617715418442, + "loss": 0.8681, + "step": 3138 + }, + { + "epoch": 0.5589387464387464, + "grad_norm": 0.5272278785705566, + "learning_rate": 0.0001906202592979492, + "loss": 1.0865, + "step": 3139 + }, + { + "epoch": 0.5591168091168092, + "grad_norm": 0.5344942212104797, + "learning_rate": 0.00019061433966619822, + "loss": 1.1647, + "step": 3140 + }, + { + "epoch": 0.5592948717948718, + "grad_norm": 0.5833460092544556, + "learning_rate": 0.00019060841825904753, + "loss": 1.3403, + "step": 3141 + }, + { + "epoch": 0.5594729344729344, + "grad_norm": 0.5707054734230042, + "learning_rate": 0.00019060249507661306, + "loss": 1.1236, + "step": 3142 + }, + { + "epoch": 0.5596509971509972, + "grad_norm": 0.5446065664291382, + "learning_rate": 0.00019059657011901094, + "loss": 1.017, + "step": 3143 + }, + { + "epoch": 0.5598290598290598, + "grad_norm": 0.5285109281539917, + "learning_rate": 0.0001905906433863572, + "loss": 1.3186, + "step": 3144 + }, + { + "epoch": 0.5600071225071225, + "grad_norm": 0.5308659672737122, + "learning_rate": 0.00019058471487876802, + "loss": 0.8464, + "step": 3145 + }, + { + "epoch": 0.5601851851851852, + "grad_norm": 0.5218054056167603, + "learning_rate": 0.00019057878459635948, + "loss": 1.0219, + "step": 3146 + }, + { + "epoch": 0.5603632478632479, + "grad_norm": 0.45067787170410156, + "learning_rate": 0.00019057285253924785, + "loss": 1.0364, + "step": 3147 + }, + { + "epoch": 0.5605413105413105, + "grad_norm": 0.4856041669845581, + "learning_rate": 0.0001905669187075493, + "loss": 1.1928, + "step": 3148 + }, + { + "epoch": 0.5607193732193733, + "grad_norm": 0.506912112236023, + "learning_rate": 0.00019056098310138016, + "loss": 1.119, + "step": 3149 + }, + { + "epoch": 0.5608974358974359, + "grad_norm": 0.49049463868141174, + "learning_rate": 0.00019055504572085662, + "loss": 1.2165, + "step": 3150 + }, + { + "epoch": 0.5610754985754985, + "grad_norm": 0.5250293612480164, + "learning_rate": 0.0001905491065660951, + "loss": 1.1427, 
+ "step": 3151 + }, + { + "epoch": 0.5612535612535613, + "grad_norm": 0.43438446521759033, + "learning_rate": 0.00019054316563721195, + "loss": 0.884, + "step": 3152 + }, + { + "epoch": 0.5614316239316239, + "grad_norm": 0.5386807918548584, + "learning_rate": 0.00019053722293432354, + "loss": 1.1494, + "step": 3153 + }, + { + "epoch": 0.5616096866096866, + "grad_norm": 0.5403809547424316, + "learning_rate": 0.00019053127845754632, + "loss": 1.1743, + "step": 3154 + }, + { + "epoch": 0.5617877492877493, + "grad_norm": 0.4759823977947235, + "learning_rate": 0.00019052533220699678, + "loss": 1.0716, + "step": 3155 + }, + { + "epoch": 0.561965811965812, + "grad_norm": 0.45332327485084534, + "learning_rate": 0.0001905193841827914, + "loss": 0.8405, + "step": 3156 + }, + { + "epoch": 0.5621438746438746, + "grad_norm": 0.5617053508758545, + "learning_rate": 0.00019051343438504671, + "loss": 1.0422, + "step": 3157 + }, + { + "epoch": 0.5623219373219374, + "grad_norm": 0.5088049173355103, + "learning_rate": 0.00019050748281387931, + "loss": 1.0067, + "step": 3158 + }, + { + "epoch": 0.5625, + "grad_norm": 0.5174484848976135, + "learning_rate": 0.00019050152946940578, + "loss": 1.0623, + "step": 3159 + }, + { + "epoch": 0.5626780626780626, + "grad_norm": 0.6093568801879883, + "learning_rate": 0.0001904955743517428, + "loss": 1.24, + "step": 3160 + }, + { + "epoch": 0.5628561253561254, + "grad_norm": 0.49063584208488464, + "learning_rate": 0.00019048961746100703, + "loss": 0.8563, + "step": 3161 + }, + { + "epoch": 0.563034188034188, + "grad_norm": 0.583940863609314, + "learning_rate": 0.00019048365879731517, + "loss": 1.0695, + "step": 3162 + }, + { + "epoch": 0.5632122507122507, + "grad_norm": 0.4943268597126007, + "learning_rate": 0.000190477698360784, + "loss": 0.8606, + "step": 3163 + }, + { + "epoch": 0.5633903133903134, + "grad_norm": 0.5050932168960571, + "learning_rate": 0.00019047173615153028, + "loss": 1.1591, + "step": 3164 + }, + { + "epoch": 0.5635683760683761, 
+ "grad_norm": 0.5445677638053894, + "learning_rate": 0.0001904657721696708, + "loss": 1.262, + "step": 3165 + }, + { + "epoch": 0.5637464387464387, + "grad_norm": 0.5445297360420227, + "learning_rate": 0.00019045980641532246, + "loss": 1.223, + "step": 3166 + }, + { + "epoch": 0.5639245014245015, + "grad_norm": 0.5098413228988647, + "learning_rate": 0.00019045383888860213, + "loss": 1.0829, + "step": 3167 + }, + { + "epoch": 0.5641025641025641, + "grad_norm": 0.484998881816864, + "learning_rate": 0.0001904478695896267, + "loss": 1.0711, + "step": 3168 + }, + { + "epoch": 0.5642806267806267, + "grad_norm": 0.5515334010124207, + "learning_rate": 0.0001904418985185132, + "loss": 1.1583, + "step": 3169 + }, + { + "epoch": 0.5644586894586895, + "grad_norm": 0.545460045337677, + "learning_rate": 0.00019043592567537853, + "loss": 1.2321, + "step": 3170 + }, + { + "epoch": 0.5646367521367521, + "grad_norm": 0.5463964343070984, + "learning_rate": 0.0001904299510603398, + "loss": 1.1019, + "step": 3171 + }, + { + "epoch": 0.5648148148148148, + "grad_norm": 0.5619220733642578, + "learning_rate": 0.000190423974673514, + "loss": 1.1001, + "step": 3172 + }, + { + "epoch": 0.5649928774928775, + "grad_norm": 0.4448916018009186, + "learning_rate": 0.00019041799651501825, + "loss": 1.057, + "step": 3173 + }, + { + "epoch": 0.5651709401709402, + "grad_norm": 0.6073006987571716, + "learning_rate": 0.00019041201658496975, + "loss": 1.0306, + "step": 3174 + }, + { + "epoch": 0.5653490028490028, + "grad_norm": 0.5342072248458862, + "learning_rate": 0.0001904060348834855, + "loss": 0.9231, + "step": 3175 + }, + { + "epoch": 0.5655270655270656, + "grad_norm": 0.4505697786808014, + "learning_rate": 0.0001904000514106829, + "loss": 1.1134, + "step": 3176 + }, + { + "epoch": 0.5657051282051282, + "grad_norm": 0.5627852082252502, + "learning_rate": 0.00019039406616667902, + "loss": 1.2138, + "step": 3177 + }, + { + "epoch": 0.5658831908831908, + "grad_norm": 0.499734103679657, + 
"learning_rate": 0.0001903880791515912, + "loss": 1.1074, + "step": 3178 + }, + { + "epoch": 0.5660612535612536, + "grad_norm": 0.4768189489841461, + "learning_rate": 0.00019038209036553676, + "loss": 0.9442, + "step": 3179 + }, + { + "epoch": 0.5662393162393162, + "grad_norm": 0.5265373587608337, + "learning_rate": 0.00019037609980863298, + "loss": 1.0907, + "step": 3180 + }, + { + "epoch": 0.5664173789173789, + "grad_norm": 0.5506128072738647, + "learning_rate": 0.00019037010748099728, + "loss": 1.2541, + "step": 3181 + }, + { + "epoch": 0.5665954415954416, + "grad_norm": 0.44860872626304626, + "learning_rate": 0.00019036411338274703, + "loss": 0.893, + "step": 3182 + }, + { + "epoch": 0.5667735042735043, + "grad_norm": 0.4901522994041443, + "learning_rate": 0.00019035811751399973, + "loss": 1.0469, + "step": 3183 + }, + { + "epoch": 0.5669515669515669, + "grad_norm": 0.500868022441864, + "learning_rate": 0.0001903521198748728, + "loss": 1.0527, + "step": 3184 + }, + { + "epoch": 0.5671296296296297, + "grad_norm": 0.5508102774620056, + "learning_rate": 0.00019034612046548376, + "loss": 1.283, + "step": 3185 + }, + { + "epoch": 0.5673076923076923, + "grad_norm": 0.5079495906829834, + "learning_rate": 0.0001903401192859502, + "loss": 1.0808, + "step": 3186 + }, + { + "epoch": 0.5674857549857549, + "grad_norm": 0.5758788585662842, + "learning_rate": 0.00019033411633638964, + "loss": 1.1301, + "step": 3187 + }, + { + "epoch": 0.5676638176638177, + "grad_norm": 0.46557924151420593, + "learning_rate": 0.00019032811161691972, + "loss": 1.0205, + "step": 3188 + }, + { + "epoch": 0.5678418803418803, + "grad_norm": 0.5665056109428406, + "learning_rate": 0.0001903221051276581, + "loss": 1.1926, + "step": 3189 + }, + { + "epoch": 0.5680199430199431, + "grad_norm": 0.5948992967605591, + "learning_rate": 0.00019031609686872246, + "loss": 1.2724, + "step": 3190 + }, + { + "epoch": 0.5681980056980057, + "grad_norm": 0.6189367771148682, + "learning_rate": 0.00019031008684023055, 
+ "loss": 1.2762, + "step": 3191 + }, + { + "epoch": 0.5683760683760684, + "grad_norm": 0.49511992931365967, + "learning_rate": 0.00019030407504230006, + "loss": 1.0117, + "step": 3192 + }, + { + "epoch": 0.5685541310541311, + "grad_norm": 0.5358837842941284, + "learning_rate": 0.00019029806147504878, + "loss": 0.944, + "step": 3193 + }, + { + "epoch": 0.5687321937321937, + "grad_norm": 0.458636999130249, + "learning_rate": 0.00019029204613859463, + "loss": 0.8174, + "step": 3194 + }, + { + "epoch": 0.5689102564102564, + "grad_norm": 0.5168304443359375, + "learning_rate": 0.00019028602903305535, + "loss": 1.1533, + "step": 3195 + }, + { + "epoch": 0.5690883190883191, + "grad_norm": 0.5334134697914124, + "learning_rate": 0.00019028001015854892, + "loss": 1.1868, + "step": 3196 + }, + { + "epoch": 0.5692663817663818, + "grad_norm": 0.5649123191833496, + "learning_rate": 0.0001902739895151932, + "loss": 0.9876, + "step": 3197 + }, + { + "epoch": 0.5694444444444444, + "grad_norm": 0.5647651553153992, + "learning_rate": 0.0001902679671031062, + "loss": 1.0805, + "step": 3198 + }, + { + "epoch": 0.5696225071225072, + "grad_norm": 0.5251876711845398, + "learning_rate": 0.00019026194292240587, + "loss": 1.2335, + "step": 3199 + }, + { + "epoch": 0.5698005698005698, + "grad_norm": 0.5268014669418335, + "learning_rate": 0.0001902559169732103, + "loss": 1.19, + "step": 3200 + }, + { + "epoch": 0.5699786324786325, + "grad_norm": 0.5301041007041931, + "learning_rate": 0.00019024988925563752, + "loss": 1.1173, + "step": 3201 + }, + { + "epoch": 0.5701566951566952, + "grad_norm": 0.4531562030315399, + "learning_rate": 0.00019024385976980566, + "loss": 0.7576, + "step": 3202 + }, + { + "epoch": 0.5703347578347578, + "grad_norm": 0.5779716372489929, + "learning_rate": 0.00019023782851583282, + "loss": 1.1719, + "step": 3203 + }, + { + "epoch": 0.5705128205128205, + "grad_norm": 0.4886093735694885, + "learning_rate": 0.00019023179549383716, + "loss": 1.085, + "step": 3204 + }, + { + 
"epoch": 0.5706908831908832, + "grad_norm": 0.510117769241333, + "learning_rate": 0.0001902257607039369, + "loss": 0.8931, + "step": 3205 + }, + { + "epoch": 0.5708689458689459, + "grad_norm": 0.5195479393005371, + "learning_rate": 0.00019021972414625036, + "loss": 0.9922, + "step": 3206 + }, + { + "epoch": 0.5710470085470085, + "grad_norm": 0.5791407227516174, + "learning_rate": 0.00019021368582089568, + "loss": 1.112, + "step": 3207 + }, + { + "epoch": 0.5712250712250713, + "grad_norm": 0.5056005716323853, + "learning_rate": 0.00019020764572799122, + "loss": 0.8474, + "step": 3208 + }, + { + "epoch": 0.5714031339031339, + "grad_norm": 0.5060068964958191, + "learning_rate": 0.00019020160386765537, + "loss": 1.071, + "step": 3209 + }, + { + "epoch": 0.5715811965811965, + "grad_norm": 0.5396568775177002, + "learning_rate": 0.00019019556024000648, + "loss": 1.0436, + "step": 3210 + }, + { + "epoch": 0.5717592592592593, + "grad_norm": 0.6552190780639648, + "learning_rate": 0.0001901895148451629, + "loss": 0.9869, + "step": 3211 + }, + { + "epoch": 0.5719373219373219, + "grad_norm": 0.5177004337310791, + "learning_rate": 0.00019018346768324314, + "loss": 1.0193, + "step": 3212 + }, + { + "epoch": 0.5721153846153846, + "grad_norm": 0.5192117094993591, + "learning_rate": 0.0001901774187543657, + "loss": 1.1263, + "step": 3213 + }, + { + "epoch": 0.5722934472934473, + "grad_norm": 0.4857729971408844, + "learning_rate": 0.00019017136805864906, + "loss": 0.9808, + "step": 3214 + }, + { + "epoch": 0.57247150997151, + "grad_norm": 0.5800918936729431, + "learning_rate": 0.00019016531559621177, + "loss": 1.2334, + "step": 3215 + }, + { + "epoch": 0.5726495726495726, + "grad_norm": 0.4812086522579193, + "learning_rate": 0.00019015926136717242, + "loss": 1.2409, + "step": 3216 + }, + { + "epoch": 0.5728276353276354, + "grad_norm": 0.5128398537635803, + "learning_rate": 0.00019015320537164963, + "loss": 0.9036, + "step": 3217 + }, + { + "epoch": 0.573005698005698, + "grad_norm": 
0.4761141538619995, + "learning_rate": 0.00019014714760976205, + "loss": 1.1058, + "step": 3218 + }, + { + "epoch": 0.5731837606837606, + "grad_norm": 0.5850459933280945, + "learning_rate": 0.0001901410880816284, + "loss": 1.1011, + "step": 3219 + }, + { + "epoch": 0.5733618233618234, + "grad_norm": 0.5648714303970337, + "learning_rate": 0.00019013502678736738, + "loss": 1.0479, + "step": 3220 + }, + { + "epoch": 0.573539886039886, + "grad_norm": 0.5835902094841003, + "learning_rate": 0.00019012896372709774, + "loss": 1.0555, + "step": 3221 + }, + { + "epoch": 0.5737179487179487, + "grad_norm": 0.5155113935470581, + "learning_rate": 0.00019012289890093828, + "loss": 0.9488, + "step": 3222 + }, + { + "epoch": 0.5738960113960114, + "grad_norm": 0.5064889788627625, + "learning_rate": 0.00019011683230900784, + "loss": 0.9144, + "step": 3223 + }, + { + "epoch": 0.5740740740740741, + "grad_norm": 0.53825843334198, + "learning_rate": 0.00019011076395142527, + "loss": 1.0713, + "step": 3224 + }, + { + "epoch": 0.5742521367521367, + "grad_norm": 0.5341386198997498, + "learning_rate": 0.00019010469382830947, + "loss": 1.1438, + "step": 3225 + }, + { + "epoch": 0.5744301994301995, + "grad_norm": 0.5300050973892212, + "learning_rate": 0.00019009862193977936, + "loss": 1.0114, + "step": 3226 + }, + { + "epoch": 0.5746082621082621, + "grad_norm": 0.6033682823181152, + "learning_rate": 0.0001900925482859539, + "loss": 1.0458, + "step": 3227 + }, + { + "epoch": 0.5747863247863247, + "grad_norm": 0.5108983516693115, + "learning_rate": 0.00019008647286695215, + "loss": 1.1211, + "step": 3228 + }, + { + "epoch": 0.5749643874643875, + "grad_norm": 0.5263782739639282, + "learning_rate": 0.00019008039568289308, + "loss": 0.8647, + "step": 3229 + }, + { + "epoch": 0.5751424501424501, + "grad_norm": 0.47119566798210144, + "learning_rate": 0.0001900743167338958, + "loss": 1.019, + "step": 3230 + }, + { + "epoch": 0.5753205128205128, + "grad_norm": 0.56391841173172, + "learning_rate": 
0.00019006823602007937, + "loss": 0.9791, + "step": 3231 + }, + { + "epoch": 0.5754985754985755, + "grad_norm": 0.5364985466003418, + "learning_rate": 0.000190062153541563, + "loss": 1.1355, + "step": 3232 + }, + { + "epoch": 0.5756766381766382, + "grad_norm": 0.5098565220832825, + "learning_rate": 0.00019005606929846578, + "loss": 0.987, + "step": 3233 + }, + { + "epoch": 0.5758547008547008, + "grad_norm": 0.6640968918800354, + "learning_rate": 0.00019004998329090692, + "loss": 1.1165, + "step": 3234 + }, + { + "epoch": 0.5760327635327636, + "grad_norm": 0.5044721961021423, + "learning_rate": 0.00019004389551900578, + "loss": 0.8643, + "step": 3235 + }, + { + "epoch": 0.5762108262108262, + "grad_norm": 0.4822785258293152, + "learning_rate": 0.00019003780598288153, + "loss": 1.0735, + "step": 3236 + }, + { + "epoch": 0.5763888888888888, + "grad_norm": 0.505261242389679, + "learning_rate": 0.00019003171468265348, + "loss": 1.0001, + "step": 3237 + }, + { + "epoch": 0.5765669515669516, + "grad_norm": 0.5020412802696228, + "learning_rate": 0.00019002562161844102, + "loss": 0.9601, + "step": 3238 + }, + { + "epoch": 0.5767450142450142, + "grad_norm": 0.4920475482940674, + "learning_rate": 0.00019001952679036354, + "loss": 1.0111, + "step": 3239 + }, + { + "epoch": 0.5769230769230769, + "grad_norm": 0.5638813376426697, + "learning_rate": 0.00019001343019854042, + "loss": 1.1456, + "step": 3240 + }, + { + "epoch": 0.5771011396011396, + "grad_norm": 0.5519235134124756, + "learning_rate": 0.0001900073318430911, + "loss": 0.9258, + "step": 3241 + }, + { + "epoch": 0.5772792022792023, + "grad_norm": 0.5207770466804504, + "learning_rate": 0.0001900012317241351, + "loss": 0.9859, + "step": 3242 + }, + { + "epoch": 0.5774572649572649, + "grad_norm": 0.5493707656860352, + "learning_rate": 0.00018999512984179195, + "loss": 1.1183, + "step": 3243 + }, + { + "epoch": 0.5776353276353277, + "grad_norm": 0.4504764676094055, + "learning_rate": 0.00018998902619618116, + "loss": 0.9363, 
+ "step": 3244 + }, + { + "epoch": 0.5778133903133903, + "grad_norm": 0.5232836604118347, + "learning_rate": 0.00018998292078742233, + "loss": 1.1887, + "step": 3245 + }, + { + "epoch": 0.5779914529914529, + "grad_norm": 0.5715088248252869, + "learning_rate": 0.0001899768136156351, + "loss": 1.4524, + "step": 3246 + }, + { + "epoch": 0.5781695156695157, + "grad_norm": 0.59555584192276, + "learning_rate": 0.0001899707046809391, + "loss": 1.0922, + "step": 3247 + }, + { + "epoch": 0.5783475783475783, + "grad_norm": 0.4500894546508789, + "learning_rate": 0.00018996459398345404, + "loss": 1.0087, + "step": 3248 + }, + { + "epoch": 0.5785256410256411, + "grad_norm": 0.49126625061035156, + "learning_rate": 0.00018995848152329967, + "loss": 1.1512, + "step": 3249 + }, + { + "epoch": 0.5787037037037037, + "grad_norm": 0.4096335172653198, + "learning_rate": 0.00018995236730059574, + "loss": 0.7633, + "step": 3250 + }, + { + "epoch": 0.5788817663817664, + "grad_norm": 0.5364313721656799, + "learning_rate": 0.00018994625131546199, + "loss": 1.295, + "step": 3251 + }, + { + "epoch": 0.5790598290598291, + "grad_norm": 0.4897502660751343, + "learning_rate": 0.00018994013356801834, + "loss": 1.2197, + "step": 3252 + }, + { + "epoch": 0.5792378917378918, + "grad_norm": 0.5101368427276611, + "learning_rate": 0.00018993401405838456, + "loss": 1.1129, + "step": 3253 + }, + { + "epoch": 0.5794159544159544, + "grad_norm": 0.5426377654075623, + "learning_rate": 0.00018992789278668063, + "loss": 1.188, + "step": 3254 + }, + { + "epoch": 0.5795940170940171, + "grad_norm": 0.5066362023353577, + "learning_rate": 0.00018992176975302644, + "loss": 1.2802, + "step": 3255 + }, + { + "epoch": 0.5797720797720798, + "grad_norm": 0.5418947339057922, + "learning_rate": 0.00018991564495754196, + "loss": 1.1675, + "step": 3256 + }, + { + "epoch": 0.5799501424501424, + "grad_norm": 0.5139963626861572, + "learning_rate": 0.0001899095184003472, + "loss": 0.9717, + "step": 3257 + }, + { + "epoch": 
0.5801282051282052, + "grad_norm": 0.5167285799980164, + "learning_rate": 0.00018990339008156219, + "loss": 1.1529, + "step": 3258 + }, + { + "epoch": 0.5803062678062678, + "grad_norm": 0.53471440076828, + "learning_rate": 0.00018989726000130704, + "loss": 1.0711, + "step": 3259 + }, + { + "epoch": 0.5804843304843305, + "grad_norm": 0.49875229597091675, + "learning_rate": 0.0001898911281597018, + "loss": 1.1095, + "step": 3260 + }, + { + "epoch": 0.5806623931623932, + "grad_norm": 0.4473155438899994, + "learning_rate": 0.00018988499455686663, + "loss": 0.836, + "step": 3261 + }, + { + "epoch": 0.5808404558404558, + "grad_norm": 0.6181996464729309, + "learning_rate": 0.00018987885919292174, + "loss": 1.2787, + "step": 3262 + }, + { + "epoch": 0.5810185185185185, + "grad_norm": 0.4996899664402008, + "learning_rate": 0.00018987272206798733, + "loss": 1.2132, + "step": 3263 + }, + { + "epoch": 0.5811965811965812, + "grad_norm": 0.49979713559150696, + "learning_rate": 0.00018986658318218358, + "loss": 0.8388, + "step": 3264 + }, + { + "epoch": 0.5813746438746439, + "grad_norm": 0.5288876295089722, + "learning_rate": 0.00018986044253563084, + "loss": 1.1871, + "step": 3265 + }, + { + "epoch": 0.5815527065527065, + "grad_norm": 0.534063458442688, + "learning_rate": 0.00018985430012844937, + "loss": 0.96, + "step": 3266 + }, + { + "epoch": 0.5817307692307693, + "grad_norm": 0.5081285834312439, + "learning_rate": 0.00018984815596075953, + "loss": 1.1577, + "step": 3267 + }, + { + "epoch": 0.5819088319088319, + "grad_norm": 0.5648202896118164, + "learning_rate": 0.00018984201003268176, + "loss": 1.2235, + "step": 3268 + }, + { + "epoch": 0.5820868945868946, + "grad_norm": 0.495061993598938, + "learning_rate": 0.00018983586234433642, + "loss": 1.056, + "step": 3269 + }, + { + "epoch": 0.5822649572649573, + "grad_norm": 0.47149857878685, + "learning_rate": 0.000189829712895844, + "loss": 1.0844, + "step": 3270 + }, + { + "epoch": 0.58244301994302, + "grad_norm": 
0.6107062697410583, + "learning_rate": 0.00018982356168732492, + "loss": 0.9868, + "step": 3271 + }, + { + "epoch": 0.5826210826210826, + "grad_norm": 0.7355940341949463, + "learning_rate": 0.00018981740871889974, + "loss": 1.1448, + "step": 3272 + }, + { + "epoch": 0.5827991452991453, + "grad_norm": 0.5950441956520081, + "learning_rate": 0.00018981125399068907, + "loss": 0.9618, + "step": 3273 + }, + { + "epoch": 0.582977207977208, + "grad_norm": 0.47607290744781494, + "learning_rate": 0.0001898050975028134, + "loss": 0.957, + "step": 3274 + }, + { + "epoch": 0.5831552706552706, + "grad_norm": 0.541164755821228, + "learning_rate": 0.00018979893925539338, + "loss": 1.1426, + "step": 3275 + }, + { + "epoch": 0.5833333333333334, + "grad_norm": 0.5240640044212341, + "learning_rate": 0.00018979277924854974, + "loss": 1.1421, + "step": 3276 + }, + { + "epoch": 0.583511396011396, + "grad_norm": 0.48155727982521057, + "learning_rate": 0.00018978661748240307, + "loss": 1.0069, + "step": 3277 + }, + { + "epoch": 0.5836894586894587, + "grad_norm": 0.5559938549995422, + "learning_rate": 0.00018978045395707418, + "loss": 1.1227, + "step": 3278 + }, + { + "epoch": 0.5838675213675214, + "grad_norm": 0.5244291424751282, + "learning_rate": 0.0001897742886726838, + "loss": 1.1103, + "step": 3279 + }, + { + "epoch": 0.584045584045584, + "grad_norm": 0.5277758240699768, + "learning_rate": 0.00018976812162935268, + "loss": 1.2125, + "step": 3280 + }, + { + "epoch": 0.5842236467236467, + "grad_norm": 0.5415039658546448, + "learning_rate": 0.00018976195282720173, + "loss": 1.146, + "step": 3281 + }, + { + "epoch": 0.5844017094017094, + "grad_norm": 0.5152051448822021, + "learning_rate": 0.00018975578226635177, + "loss": 1.0092, + "step": 3282 + }, + { + "epoch": 0.5845797720797721, + "grad_norm": 0.5489452481269836, + "learning_rate": 0.00018974960994692371, + "loss": 1.2425, + "step": 3283 + }, + { + "epoch": 0.5847578347578347, + "grad_norm": 0.491274356842041, + "learning_rate": 
0.00018974343586903848, + "loss": 0.9559, + "step": 3284 + }, + { + "epoch": 0.5849358974358975, + "grad_norm": 0.5783739686012268, + "learning_rate": 0.00018973726003281707, + "loss": 1.1971, + "step": 3285 + }, + { + "epoch": 0.5851139601139601, + "grad_norm": 0.5056472420692444, + "learning_rate": 0.00018973108243838045, + "loss": 1.0313, + "step": 3286 + }, + { + "epoch": 0.5852920227920227, + "grad_norm": 0.4939729571342468, + "learning_rate": 0.00018972490308584962, + "loss": 1.1061, + "step": 3287 + }, + { + "epoch": 0.5854700854700855, + "grad_norm": 0.4889580011367798, + "learning_rate": 0.00018971872197534576, + "loss": 0.9157, + "step": 3288 + }, + { + "epoch": 0.5856481481481481, + "grad_norm": 0.40889349579811096, + "learning_rate": 0.00018971253910698993, + "loss": 0.8083, + "step": 3289 + }, + { + "epoch": 0.5858262108262108, + "grad_norm": 0.5221503973007202, + "learning_rate": 0.00018970635448090322, + "loss": 0.9995, + "step": 3290 + }, + { + "epoch": 0.5860042735042735, + "grad_norm": 0.47060561180114746, + "learning_rate": 0.00018970016809720687, + "loss": 0.9738, + "step": 3291 + }, + { + "epoch": 0.5861823361823362, + "grad_norm": 0.6083170771598816, + "learning_rate": 0.000189693979956022, + "loss": 1.188, + "step": 3292 + }, + { + "epoch": 0.5863603988603988, + "grad_norm": 0.4696751534938812, + "learning_rate": 0.00018968779005746998, + "loss": 1.089, + "step": 3293 + }, + { + "epoch": 0.5865384615384616, + "grad_norm": 0.5081014633178711, + "learning_rate": 0.00018968159840167202, + "loss": 1.1869, + "step": 3294 + }, + { + "epoch": 0.5867165242165242, + "grad_norm": 0.48042431473731995, + "learning_rate": 0.0001896754049887494, + "loss": 0.964, + "step": 3295 + }, + { + "epoch": 0.5868945868945868, + "grad_norm": 0.5075193643569946, + "learning_rate": 0.00018966920981882353, + "loss": 1.1884, + "step": 3296 + }, + { + "epoch": 0.5870726495726496, + "grad_norm": 0.5734842419624329, + "learning_rate": 0.00018966301289201576, + "loss": 
1.1475, + "step": 3297 + }, + { + "epoch": 0.5872507122507122, + "grad_norm": 0.5525311231613159, + "learning_rate": 0.00018965681420844753, + "loss": 1.241, + "step": 3298 + }, + { + "epoch": 0.5874287749287749, + "grad_norm": 0.48142680525779724, + "learning_rate": 0.00018965061376824025, + "loss": 1.0871, + "step": 3299 + }, + { + "epoch": 0.5876068376068376, + "grad_norm": 0.5360350608825684, + "learning_rate": 0.00018964441157151544, + "loss": 1.1895, + "step": 3300 + }, + { + "epoch": 0.5877849002849003, + "grad_norm": 0.5207685232162476, + "learning_rate": 0.00018963820761839457, + "loss": 0.9323, + "step": 3301 + }, + { + "epoch": 0.5879629629629629, + "grad_norm": 0.453620970249176, + "learning_rate": 0.00018963200190899926, + "loss": 0.802, + "step": 3302 + }, + { + "epoch": 0.5881410256410257, + "grad_norm": 0.5198796391487122, + "learning_rate": 0.00018962579444345106, + "loss": 1.0243, + "step": 3303 + }, + { + "epoch": 0.5883190883190883, + "grad_norm": 0.5597525835037231, + "learning_rate": 0.0001896195852218716, + "loss": 0.9351, + "step": 3304 + }, + { + "epoch": 0.5884971509971509, + "grad_norm": 0.5738299489021301, + "learning_rate": 0.00018961337424438254, + "loss": 1.3737, + "step": 3305 + }, + { + "epoch": 0.5886752136752137, + "grad_norm": 0.5569949150085449, + "learning_rate": 0.00018960716151110554, + "loss": 1.0469, + "step": 3306 + }, + { + "epoch": 0.5888532763532763, + "grad_norm": 0.5088010430335999, + "learning_rate": 0.00018960094702216238, + "loss": 1.0982, + "step": 3307 + }, + { + "epoch": 0.5890313390313391, + "grad_norm": 0.5127636790275574, + "learning_rate": 0.0001895947307776748, + "loss": 0.9986, + "step": 3308 + }, + { + "epoch": 0.5892094017094017, + "grad_norm": 0.5160682797431946, + "learning_rate": 0.00018958851277776456, + "loss": 1.0219, + "step": 3309 + }, + { + "epoch": 0.5893874643874644, + "grad_norm": 0.5380711555480957, + "learning_rate": 0.00018958229302255356, + "loss": 1.118, + "step": 3310 + }, + { + 
"epoch": 0.5895655270655271, + "grad_norm": 0.5571228861808777, + "learning_rate": 0.0001895760715121636, + "loss": 1.0302, + "step": 3311 + }, + { + "epoch": 0.5897435897435898, + "grad_norm": 0.542266309261322, + "learning_rate": 0.00018956984824671657, + "loss": 1.0372, + "step": 3312 + }, + { + "epoch": 0.5899216524216524, + "grad_norm": 0.48350459337234497, + "learning_rate": 0.00018956362322633446, + "loss": 1.2, + "step": 3313 + }, + { + "epoch": 0.5900997150997151, + "grad_norm": 0.5001645088195801, + "learning_rate": 0.0001895573964511392, + "loss": 0.9749, + "step": 3314 + }, + { + "epoch": 0.5902777777777778, + "grad_norm": 0.5227531790733337, + "learning_rate": 0.00018955116792125276, + "loss": 1.025, + "step": 3315 + }, + { + "epoch": 0.5904558404558404, + "grad_norm": 0.522251546382904, + "learning_rate": 0.00018954493763679727, + "loss": 1.0821, + "step": 3316 + }, + { + "epoch": 0.5906339031339032, + "grad_norm": 0.5423251390457153, + "learning_rate": 0.00018953870559789467, + "loss": 1.0961, + "step": 3317 + }, + { + "epoch": 0.5908119658119658, + "grad_norm": 0.5615720748901367, + "learning_rate": 0.0001895324718046672, + "loss": 1.1209, + "step": 3318 + }, + { + "epoch": 0.5909900284900285, + "grad_norm": 0.44746771454811096, + "learning_rate": 0.00018952623625723692, + "loss": 0.9935, + "step": 3319 + }, + { + "epoch": 0.5911680911680912, + "grad_norm": 0.5993229150772095, + "learning_rate": 0.00018951999895572597, + "loss": 1.1409, + "step": 3320 + }, + { + "epoch": 0.5913461538461539, + "grad_norm": 0.4969801902770996, + "learning_rate": 0.00018951375990025666, + "loss": 1.1568, + "step": 3321 + }, + { + "epoch": 0.5915242165242165, + "grad_norm": 0.6001267433166504, + "learning_rate": 0.00018950751909095116, + "loss": 1.1135, + "step": 3322 + }, + { + "epoch": 0.5917022792022792, + "grad_norm": 0.5386021733283997, + "learning_rate": 0.00018950127652793172, + "loss": 0.947, + "step": 3323 + }, + { + "epoch": 0.5918803418803419, + "grad_norm": 
0.49043843150138855, + "learning_rate": 0.00018949503221132074, + "loss": 0.9581, + "step": 3324 + }, + { + "epoch": 0.5920584045584045, + "grad_norm": 0.5241141319274902, + "learning_rate": 0.00018948878614124048, + "loss": 1.0797, + "step": 3325 + }, + { + "epoch": 0.5922364672364673, + "grad_norm": 0.5755026340484619, + "learning_rate": 0.00018948253831781338, + "loss": 1.1046, + "step": 3326 + }, + { + "epoch": 0.5924145299145299, + "grad_norm": 0.5004449486732483, + "learning_rate": 0.00018947628874116179, + "loss": 1.1416, + "step": 3327 + }, + { + "epoch": 0.5925925925925926, + "grad_norm": 0.53347247838974, + "learning_rate": 0.00018947003741140821, + "loss": 1.2718, + "step": 3328 + }, + { + "epoch": 0.5927706552706553, + "grad_norm": 0.6473469138145447, + "learning_rate": 0.0001894637843286751, + "loss": 1.2255, + "step": 3329 + }, + { + "epoch": 0.592948717948718, + "grad_norm": 0.4750518798828125, + "learning_rate": 0.00018945752949308498, + "loss": 1.0537, + "step": 3330 + }, + { + "epoch": 0.5931267806267806, + "grad_norm": 0.5636306405067444, + "learning_rate": 0.00018945127290476043, + "loss": 0.9906, + "step": 3331 + }, + { + "epoch": 0.5933048433048433, + "grad_norm": 0.4871736466884613, + "learning_rate": 0.00018944501456382397, + "loss": 1.0549, + "step": 3332 + }, + { + "epoch": 0.593482905982906, + "grad_norm": 0.5554637312889099, + "learning_rate": 0.0001894387544703983, + "loss": 1.1587, + "step": 3333 + }, + { + "epoch": 0.5936609686609686, + "grad_norm": 0.5385799407958984, + "learning_rate": 0.000189432492624606, + "loss": 0.9565, + "step": 3334 + }, + { + "epoch": 0.5938390313390314, + "grad_norm": 0.4996553063392639, + "learning_rate": 0.00018942622902656976, + "loss": 1.0456, + "step": 3335 + }, + { + "epoch": 0.594017094017094, + "grad_norm": 0.46810707449913025, + "learning_rate": 0.00018941996367641237, + "loss": 1.119, + "step": 3336 + }, + { + "epoch": 0.5941951566951567, + "grad_norm": 0.5672653913497925, + "learning_rate": 
0.0001894136965742565, + "loss": 1.1317, + "step": 3337 + }, + { + "epoch": 0.5943732193732194, + "grad_norm": 0.4790053367614746, + "learning_rate": 0.00018940742772022504, + "loss": 1.0967, + "step": 3338 + }, + { + "epoch": 0.594551282051282, + "grad_norm": 0.5935906171798706, + "learning_rate": 0.00018940115711444072, + "loss": 1.3044, + "step": 3339 + }, + { + "epoch": 0.5947293447293447, + "grad_norm": 0.4790516793727875, + "learning_rate": 0.00018939488475702647, + "loss": 1.074, + "step": 3340 + }, + { + "epoch": 0.5949074074074074, + "grad_norm": 0.474588006734848, + "learning_rate": 0.00018938861064810516, + "loss": 1.1476, + "step": 3341 + }, + { + "epoch": 0.5950854700854701, + "grad_norm": 0.4908665120601654, + "learning_rate": 0.0001893823347877997, + "loss": 1.216, + "step": 3342 + }, + { + "epoch": 0.5952635327635327, + "grad_norm": 0.531650960445404, + "learning_rate": 0.00018937605717623307, + "loss": 1.1057, + "step": 3343 + }, + { + "epoch": 0.5954415954415955, + "grad_norm": 0.5581082105636597, + "learning_rate": 0.00018936977781352823, + "loss": 0.7972, + "step": 3344 + }, + { + "epoch": 0.5956196581196581, + "grad_norm": 0.42370662093162537, + "learning_rate": 0.00018936349669980827, + "loss": 0.8888, + "step": 3345 + }, + { + "epoch": 0.5957977207977208, + "grad_norm": 0.5817318558692932, + "learning_rate": 0.00018935721383519624, + "loss": 1.2801, + "step": 3346 + }, + { + "epoch": 0.5959757834757835, + "grad_norm": 0.4766376316547394, + "learning_rate": 0.00018935092921981524, + "loss": 1.0918, + "step": 3347 + }, + { + "epoch": 0.5961538461538461, + "grad_norm": 0.5567346811294556, + "learning_rate": 0.00018934464285378836, + "loss": 1.0269, + "step": 3348 + }, + { + "epoch": 0.5963319088319088, + "grad_norm": 0.5285565257072449, + "learning_rate": 0.0001893383547372388, + "loss": 1.1887, + "step": 3349 + }, + { + "epoch": 0.5965099715099715, + "grad_norm": 0.49052694439888, + "learning_rate": 0.00018933206487028979, + "loss": 1.0773, + 
"step": 3350 + }, + { + "epoch": 0.5966880341880342, + "grad_norm": 0.6175199151039124, + "learning_rate": 0.0001893257732530645, + "loss": 1.0192, + "step": 3351 + }, + { + "epoch": 0.5968660968660968, + "grad_norm": 0.56049644947052, + "learning_rate": 0.00018931947988568628, + "loss": 0.9516, + "step": 3352 + }, + { + "epoch": 0.5970441595441596, + "grad_norm": 0.47873660922050476, + "learning_rate": 0.00018931318476827838, + "loss": 0.8174, + "step": 3353 + }, + { + "epoch": 0.5972222222222222, + "grad_norm": 0.4748854339122772, + "learning_rate": 0.00018930688790096416, + "loss": 1.0238, + "step": 3354 + }, + { + "epoch": 0.5974002849002849, + "grad_norm": 0.5382232666015625, + "learning_rate": 0.00018930058928386698, + "loss": 1.0815, + "step": 3355 + }, + { + "epoch": 0.5975783475783476, + "grad_norm": 0.5038299560546875, + "learning_rate": 0.00018929428891711027, + "loss": 1.0472, + "step": 3356 + }, + { + "epoch": 0.5977564102564102, + "grad_norm": 0.5185908079147339, + "learning_rate": 0.00018928798680081744, + "loss": 1.0435, + "step": 3357 + }, + { + "epoch": 0.5979344729344729, + "grad_norm": 0.5169877409934998, + "learning_rate": 0.00018928168293511202, + "loss": 1.0437, + "step": 3358 + }, + { + "epoch": 0.5981125356125356, + "grad_norm": 0.5218369960784912, + "learning_rate": 0.00018927537732011749, + "loss": 1.082, + "step": 3359 + }, + { + "epoch": 0.5982905982905983, + "grad_norm": 0.5358219742774963, + "learning_rate": 0.0001892690699559574, + "loss": 1.2523, + "step": 3360 + }, + { + "epoch": 0.5984686609686609, + "grad_norm": 0.47716647386550903, + "learning_rate": 0.0001892627608427553, + "loss": 1.2069, + "step": 3361 + }, + { + "epoch": 0.5986467236467237, + "grad_norm": 0.5484169125556946, + "learning_rate": 0.00018925644998063482, + "loss": 1.2016, + "step": 3362 + }, + { + "epoch": 0.5988247863247863, + "grad_norm": 0.46814846992492676, + "learning_rate": 0.00018925013736971965, + "loss": 0.7989, + "step": 3363 + }, + { + "epoch": 
0.5990028490028491, + "grad_norm": 0.5391258001327515, + "learning_rate": 0.0001892438230101334, + "loss": 1.224, + "step": 3364 + }, + { + "epoch": 0.5991809116809117, + "grad_norm": 0.5248384475708008, + "learning_rate": 0.00018923750690199987, + "loss": 1.1532, + "step": 3365 + }, + { + "epoch": 0.5993589743589743, + "grad_norm": 0.5074637532234192, + "learning_rate": 0.00018923118904544273, + "loss": 1.0968, + "step": 3366 + }, + { + "epoch": 0.5995370370370371, + "grad_norm": 0.5260029435157776, + "learning_rate": 0.00018922486944058581, + "loss": 1.1311, + "step": 3367 + }, + { + "epoch": 0.5997150997150997, + "grad_norm": 0.48497965931892395, + "learning_rate": 0.00018921854808755294, + "loss": 1.1208, + "step": 3368 + }, + { + "epoch": 0.5998931623931624, + "grad_norm": 0.5108651518821716, + "learning_rate": 0.00018921222498646792, + "loss": 1.147, + "step": 3369 + }, + { + "epoch": 0.6000712250712251, + "grad_norm": 0.5243437886238098, + "learning_rate": 0.00018920590013745471, + "loss": 0.9614, + "step": 3370 + }, + { + "epoch": 0.6002492877492878, + "grad_norm": 0.47022634744644165, + "learning_rate": 0.00018919957354063719, + "loss": 1.0579, + "step": 3371 + }, + { + "epoch": 0.6004273504273504, + "grad_norm": 0.6461413502693176, + "learning_rate": 0.00018919324519613931, + "loss": 1.2126, + "step": 3372 + }, + { + "epoch": 0.6006054131054132, + "grad_norm": 0.4654616713523865, + "learning_rate": 0.00018918691510408508, + "loss": 1.1476, + "step": 3373 + }, + { + "epoch": 0.6007834757834758, + "grad_norm": 0.48571303486824036, + "learning_rate": 0.00018918058326459854, + "loss": 1.2093, + "step": 3374 + }, + { + "epoch": 0.6009615384615384, + "grad_norm": 0.5255016684532166, + "learning_rate": 0.00018917424967780368, + "loss": 1.1538, + "step": 3375 + }, + { + "epoch": 0.6011396011396012, + "grad_norm": 0.5059894323348999, + "learning_rate": 0.00018916791434382468, + "loss": 1.0556, + "step": 3376 + }, + { + "epoch": 0.6013176638176638, + "grad_norm": 
0.4581229090690613, + "learning_rate": 0.00018916157726278561, + "loss": 1.1468, + "step": 3377 + }, + { + "epoch": 0.6014957264957265, + "grad_norm": 0.5701818466186523, + "learning_rate": 0.00018915523843481067, + "loss": 1.3641, + "step": 3378 + }, + { + "epoch": 0.6016737891737892, + "grad_norm": 0.5007243752479553, + "learning_rate": 0.00018914889786002403, + "loss": 1.2705, + "step": 3379 + }, + { + "epoch": 0.6018518518518519, + "grad_norm": 0.5192995071411133, + "learning_rate": 0.0001891425555385499, + "loss": 0.9922, + "step": 3380 + }, + { + "epoch": 0.6020299145299145, + "grad_norm": 0.5880612134933472, + "learning_rate": 0.00018913621147051258, + "loss": 0.8783, + "step": 3381 + }, + { + "epoch": 0.6022079772079773, + "grad_norm": 0.5161563158035278, + "learning_rate": 0.0001891298656560364, + "loss": 0.9634, + "step": 3382 + }, + { + "epoch": 0.6023860398860399, + "grad_norm": 0.48450782895088196, + "learning_rate": 0.00018912351809524563, + "loss": 0.809, + "step": 3383 + }, + { + "epoch": 0.6025641025641025, + "grad_norm": 0.621537983417511, + "learning_rate": 0.00018911716878826465, + "loss": 1.2031, + "step": 3384 + }, + { + "epoch": 0.6027421652421653, + "grad_norm": 0.6014544367790222, + "learning_rate": 0.00018911081773521787, + "loss": 1.1552, + "step": 3385 + }, + { + "epoch": 0.6029202279202279, + "grad_norm": 0.49995481967926025, + "learning_rate": 0.00018910446493622976, + "loss": 0.8569, + "step": 3386 + }, + { + "epoch": 0.6030982905982906, + "grad_norm": 0.5157307386398315, + "learning_rate": 0.00018909811039142472, + "loss": 0.9515, + "step": 3387 + }, + { + "epoch": 0.6032763532763533, + "grad_norm": 0.5164140462875366, + "learning_rate": 0.0001890917541009273, + "loss": 0.9803, + "step": 3388 + }, + { + "epoch": 0.603454415954416, + "grad_norm": 0.5555596947669983, + "learning_rate": 0.00018908539606486206, + "loss": 1.2994, + "step": 3389 + }, + { + "epoch": 0.6036324786324786, + "grad_norm": 0.605697512626648, + "learning_rate": 
0.00018907903628335353, + "loss": 1.2865, + "step": 3390 + }, + { + "epoch": 0.6038105413105413, + "grad_norm": 0.5700713992118835, + "learning_rate": 0.0001890726747565263, + "loss": 1.2493, + "step": 3391 + }, + { + "epoch": 0.603988603988604, + "grad_norm": 0.5516746044158936, + "learning_rate": 0.0001890663114845051, + "loss": 1.2743, + "step": 3392 + }, + { + "epoch": 0.6041666666666666, + "grad_norm": 0.5233162641525269, + "learning_rate": 0.0001890599464674145, + "loss": 0.9237, + "step": 3393 + }, + { + "epoch": 0.6043447293447294, + "grad_norm": 0.5709942579269409, + "learning_rate": 0.00018905357970537925, + "loss": 0.9922, + "step": 3394 + }, + { + "epoch": 0.604522792022792, + "grad_norm": 0.48403796553611755, + "learning_rate": 0.0001890472111985241, + "loss": 1.1255, + "step": 3395 + }, + { + "epoch": 0.6047008547008547, + "grad_norm": 0.628718376159668, + "learning_rate": 0.00018904084094697386, + "loss": 1.1458, + "step": 3396 + }, + { + "epoch": 0.6048789173789174, + "grad_norm": 0.46822869777679443, + "learning_rate": 0.00018903446895085328, + "loss": 0.8727, + "step": 3397 + }, + { + "epoch": 0.60505698005698, + "grad_norm": 0.505584180355072, + "learning_rate": 0.00018902809521028724, + "loss": 1.1595, + "step": 3398 + }, + { + "epoch": 0.6052350427350427, + "grad_norm": 0.4494974911212921, + "learning_rate": 0.00018902171972540058, + "loss": 0.6685, + "step": 3399 + }, + { + "epoch": 0.6054131054131054, + "grad_norm": 0.5101519227027893, + "learning_rate": 0.0001890153424963183, + "loss": 0.9313, + "step": 3400 + }, + { + "epoch": 0.6055911680911681, + "grad_norm": 0.5081079602241516, + "learning_rate": 0.00018900896352316528, + "loss": 1.2588, + "step": 3401 + }, + { + "epoch": 0.6057692307692307, + "grad_norm": 0.5784309506416321, + "learning_rate": 0.00018900258280606653, + "loss": 1.2077, + "step": 3402 + }, + { + "epoch": 0.6059472934472935, + "grad_norm": 0.4506312608718872, + "learning_rate": 0.00018899620034514705, + "loss": 1.05, + 
"step": 3403 + }, + { + "epoch": 0.6061253561253561, + "grad_norm": 0.5243048071861267, + "learning_rate": 0.0001889898161405319, + "loss": 1.2295, + "step": 3404 + }, + { + "epoch": 0.6063034188034188, + "grad_norm": 0.5447196364402771, + "learning_rate": 0.00018898343019234615, + "loss": 1.1476, + "step": 3405 + }, + { + "epoch": 0.6064814814814815, + "grad_norm": 0.46813663840293884, + "learning_rate": 0.00018897704250071492, + "loss": 1.2113, + "step": 3406 + }, + { + "epoch": 0.6066595441595442, + "grad_norm": 0.5340631604194641, + "learning_rate": 0.00018897065306576342, + "loss": 1.1656, + "step": 3407 + }, + { + "epoch": 0.6068376068376068, + "grad_norm": 0.513708233833313, + "learning_rate": 0.00018896426188761675, + "loss": 1.1616, + "step": 3408 + }, + { + "epoch": 0.6070156695156695, + "grad_norm": 0.594601035118103, + "learning_rate": 0.00018895786896640023, + "loss": 1.2564, + "step": 3409 + }, + { + "epoch": 0.6071937321937322, + "grad_norm": 0.45067599415779114, + "learning_rate": 0.000188951474302239, + "loss": 1.0107, + "step": 3410 + }, + { + "epoch": 0.6073717948717948, + "grad_norm": 0.5394250750541687, + "learning_rate": 0.00018894507789525843, + "loss": 1.4081, + "step": 3411 + }, + { + "epoch": 0.6075498575498576, + "grad_norm": 0.5612049102783203, + "learning_rate": 0.00018893867974558383, + "loss": 1.1015, + "step": 3412 + }, + { + "epoch": 0.6077279202279202, + "grad_norm": 0.4794061779975891, + "learning_rate": 0.00018893227985334056, + "loss": 1.2103, + "step": 3413 + }, + { + "epoch": 0.6079059829059829, + "grad_norm": 0.6060562133789062, + "learning_rate": 0.00018892587821865402, + "loss": 1.3693, + "step": 3414 + }, + { + "epoch": 0.6080840455840456, + "grad_norm": 0.44624534249305725, + "learning_rate": 0.00018891947484164963, + "loss": 0.8209, + "step": 3415 + }, + { + "epoch": 0.6082621082621082, + "grad_norm": 0.49297213554382324, + "learning_rate": 0.0001889130697224528, + "loss": 1.2027, + "step": 3416 + }, + { + "epoch": 
0.6084401709401709, + "grad_norm": 0.4431746304035187, + "learning_rate": 0.0001889066628611891, + "loss": 1.0347, + "step": 3417 + }, + { + "epoch": 0.6086182336182336, + "grad_norm": 0.5425933599472046, + "learning_rate": 0.00018890025425798404, + "loss": 1.0556, + "step": 3418 + }, + { + "epoch": 0.6087962962962963, + "grad_norm": 0.5502763390541077, + "learning_rate": 0.00018889384391296315, + "loss": 1.2362, + "step": 3419 + }, + { + "epoch": 0.6089743589743589, + "grad_norm": 0.5442292094230652, + "learning_rate": 0.00018888743182625203, + "loss": 1.1306, + "step": 3420 + }, + { + "epoch": 0.6091524216524217, + "grad_norm": 0.4651123583316803, + "learning_rate": 0.00018888101799797636, + "loss": 0.9305, + "step": 3421 + }, + { + "epoch": 0.6093304843304843, + "grad_norm": 0.4713892340660095, + "learning_rate": 0.00018887460242826177, + "loss": 1.0789, + "step": 3422 + }, + { + "epoch": 0.6095085470085471, + "grad_norm": 0.5283244848251343, + "learning_rate": 0.00018886818511723398, + "loss": 1.345, + "step": 3423 + }, + { + "epoch": 0.6096866096866097, + "grad_norm": 0.5527324080467224, + "learning_rate": 0.0001888617660650187, + "loss": 1.1297, + "step": 3424 + }, + { + "epoch": 0.6098646723646723, + "grad_norm": 0.5412901043891907, + "learning_rate": 0.00018885534527174168, + "loss": 1.1213, + "step": 3425 + }, + { + "epoch": 0.6100427350427351, + "grad_norm": 0.5295354127883911, + "learning_rate": 0.00018884892273752878, + "loss": 1.1217, + "step": 3426 + }, + { + "epoch": 0.6102207977207977, + "grad_norm": 0.461900532245636, + "learning_rate": 0.0001888424984625058, + "loss": 0.827, + "step": 3427 + }, + { + "epoch": 0.6103988603988604, + "grad_norm": 0.4922671616077423, + "learning_rate": 0.00018883607244679865, + "loss": 1.2216, + "step": 3428 + }, + { + "epoch": 0.6105769230769231, + "grad_norm": 0.5080927014350891, + "learning_rate": 0.00018882964469053317, + "loss": 1.2446, + "step": 3429 + }, + { + "epoch": 0.6107549857549858, + "grad_norm": 
0.5523943901062012, + "learning_rate": 0.00018882321519383534, + "loss": 1.3346, + "step": 3430 + }, + { + "epoch": 0.6109330484330484, + "grad_norm": 0.5105271935462952, + "learning_rate": 0.0001888167839568311, + "loss": 1.1311, + "step": 3431 + }, + { + "epoch": 0.6111111111111112, + "grad_norm": 0.5635872483253479, + "learning_rate": 0.0001888103509796465, + "loss": 1.1875, + "step": 3432 + }, + { + "epoch": 0.6112891737891738, + "grad_norm": 0.4619547426700592, + "learning_rate": 0.00018880391626240755, + "loss": 0.9176, + "step": 3433 + }, + { + "epoch": 0.6114672364672364, + "grad_norm": 0.5896356105804443, + "learning_rate": 0.00018879747980524034, + "loss": 1.0251, + "step": 3434 + }, + { + "epoch": 0.6116452991452992, + "grad_norm": 0.49062737822532654, + "learning_rate": 0.000188791041608271, + "loss": 1.1598, + "step": 3435 + }, + { + "epoch": 0.6118233618233618, + "grad_norm": 0.45717164874076843, + "learning_rate": 0.00018878460167162558, + "loss": 0.8647, + "step": 3436 + }, + { + "epoch": 0.6120014245014245, + "grad_norm": 0.5903525352478027, + "learning_rate": 0.00018877815999543038, + "loss": 0.9671, + "step": 3437 + }, + { + "epoch": 0.6121794871794872, + "grad_norm": 0.5315384268760681, + "learning_rate": 0.00018877171657981153, + "loss": 1.1759, + "step": 3438 + }, + { + "epoch": 0.6123575498575499, + "grad_norm": 0.5650150775909424, + "learning_rate": 0.0001887652714248953, + "loss": 1.0128, + "step": 3439 + }, + { + "epoch": 0.6125356125356125, + "grad_norm": 0.49841752648353577, + "learning_rate": 0.000188758824530808, + "loss": 1.1259, + "step": 3440 + }, + { + "epoch": 0.6127136752136753, + "grad_norm": 0.4985620975494385, + "learning_rate": 0.00018875237589767593, + "loss": 1.0158, + "step": 3441 + }, + { + "epoch": 0.6128917378917379, + "grad_norm": 0.45266565680503845, + "learning_rate": 0.00018874592552562536, + "loss": 0.93, + "step": 3442 + }, + { + "epoch": 0.6130698005698005, + "grad_norm": 0.5696130990982056, + "learning_rate": 
0.00018873947341478274, + "loss": 1.1432, + "step": 3443 + }, + { + "epoch": 0.6132478632478633, + "grad_norm": 0.5211645364761353, + "learning_rate": 0.00018873301956527451, + "loss": 1.1317, + "step": 3444 + }, + { + "epoch": 0.6134259259259259, + "grad_norm": 0.4991866946220398, + "learning_rate": 0.00018872656397722707, + "loss": 1.0362, + "step": 3445 + }, + { + "epoch": 0.6136039886039886, + "grad_norm": 0.5109508037567139, + "learning_rate": 0.00018872010665076694, + "loss": 1.2728, + "step": 3446 + }, + { + "epoch": 0.6137820512820513, + "grad_norm": 0.5838373899459839, + "learning_rate": 0.00018871364758602058, + "loss": 1.1131, + "step": 3447 + }, + { + "epoch": 0.613960113960114, + "grad_norm": 0.5139824151992798, + "learning_rate": 0.00018870718678311462, + "loss": 1.238, + "step": 3448 + }, + { + "epoch": 0.6141381766381766, + "grad_norm": 0.4852082431316376, + "learning_rate": 0.00018870072424217562, + "loss": 1.0677, + "step": 3449 + }, + { + "epoch": 0.6143162393162394, + "grad_norm": 0.5312315225601196, + "learning_rate": 0.00018869425996333018, + "loss": 1.178, + "step": 3450 + }, + { + "epoch": 0.614494301994302, + "grad_norm": 0.6343565583229065, + "learning_rate": 0.00018868779394670492, + "loss": 0.8839, + "step": 3451 + }, + { + "epoch": 0.6146723646723646, + "grad_norm": 0.6029773950576782, + "learning_rate": 0.00018868132619242662, + "loss": 1.1188, + "step": 3452 + }, + { + "epoch": 0.6148504273504274, + "grad_norm": 0.5246016383171082, + "learning_rate": 0.00018867485670062193, + "loss": 1.0797, + "step": 3453 + }, + { + "epoch": 0.61502849002849, + "grad_norm": 0.49307698011398315, + "learning_rate": 0.00018866838547141763, + "loss": 0.9749, + "step": 3454 + }, + { + "epoch": 0.6152065527065527, + "grad_norm": 0.5232903361320496, + "learning_rate": 0.00018866191250494052, + "loss": 1.0785, + "step": 3455 + }, + { + "epoch": 0.6153846153846154, + "grad_norm": 0.5545645356178284, + "learning_rate": 0.0001886554378013174, + "loss": 1.0496, 
+ "step": 3456 + }, + { + "epoch": 0.6155626780626781, + "grad_norm": 0.493945837020874, + "learning_rate": 0.00018864896136067515, + "loss": 0.9248, + "step": 3457 + }, + { + "epoch": 0.6157407407407407, + "grad_norm": 0.5223548412322998, + "learning_rate": 0.00018864248318314065, + "loss": 1.0617, + "step": 3458 + }, + { + "epoch": 0.6159188034188035, + "grad_norm": 0.5666514039039612, + "learning_rate": 0.00018863600326884082, + "loss": 0.9981, + "step": 3459 + }, + { + "epoch": 0.6160968660968661, + "grad_norm": 0.4648127257823944, + "learning_rate": 0.00018862952161790265, + "loss": 0.917, + "step": 3460 + }, + { + "epoch": 0.6162749287749287, + "grad_norm": 0.590326189994812, + "learning_rate": 0.0001886230382304531, + "loss": 1.044, + "step": 3461 + }, + { + "epoch": 0.6164529914529915, + "grad_norm": 0.5511625409126282, + "learning_rate": 0.00018861655310661925, + "loss": 1.0988, + "step": 3462 + }, + { + "epoch": 0.6166310541310541, + "grad_norm": 0.567182183265686, + "learning_rate": 0.0001886100662465281, + "loss": 1.3017, + "step": 3463 + }, + { + "epoch": 0.6168091168091168, + "grad_norm": 0.5708897709846497, + "learning_rate": 0.0001886035776503068, + "loss": 0.9123, + "step": 3464 + }, + { + "epoch": 0.6169871794871795, + "grad_norm": 0.4945180416107178, + "learning_rate": 0.0001885970873180824, + "loss": 1.1645, + "step": 3465 + }, + { + "epoch": 0.6171652421652422, + "grad_norm": 0.4713336229324341, + "learning_rate": 0.00018859059524998215, + "loss": 1.0546, + "step": 3466 + }, + { + "epoch": 0.6173433048433048, + "grad_norm": 0.532859206199646, + "learning_rate": 0.0001885841014461332, + "loss": 1.0795, + "step": 3467 + }, + { + "epoch": 0.6175213675213675, + "grad_norm": 0.5165733695030212, + "learning_rate": 0.00018857760590666284, + "loss": 1.1284, + "step": 3468 + }, + { + "epoch": 0.6176994301994302, + "grad_norm": 0.48623126745224, + "learning_rate": 0.00018857110863169826, + "loss": 0.8618, + "step": 3469 + }, + { + "epoch": 
0.6178774928774928, + "grad_norm": 0.628559947013855, + "learning_rate": 0.0001885646096213668, + "loss": 1.1089, + "step": 3470 + }, + { + "epoch": 0.6180555555555556, + "grad_norm": 0.503545880317688, + "learning_rate": 0.0001885581088757958, + "loss": 1.2311, + "step": 3471 + }, + { + "epoch": 0.6182336182336182, + "grad_norm": 0.6172101497650146, + "learning_rate": 0.00018855160639511264, + "loss": 1.2651, + "step": 3472 + }, + { + "epoch": 0.6184116809116809, + "grad_norm": 0.49572527408599854, + "learning_rate": 0.00018854510217944465, + "loss": 1.1026, + "step": 3473 + }, + { + "epoch": 0.6185897435897436, + "grad_norm": 0.5373549461364746, + "learning_rate": 0.00018853859622891938, + "loss": 1.2562, + "step": 3474 + }, + { + "epoch": 0.6187678062678063, + "grad_norm": 0.5272396206855774, + "learning_rate": 0.0001885320885436642, + "loss": 1.1763, + "step": 3475 + }, + { + "epoch": 0.6189458689458689, + "grad_norm": 0.46584269404411316, + "learning_rate": 0.00018852557912380665, + "loss": 1.1762, + "step": 3476 + }, + { + "epoch": 0.6191239316239316, + "grad_norm": 0.4798245131969452, + "learning_rate": 0.0001885190679694743, + "loss": 0.9229, + "step": 3477 + }, + { + "epoch": 0.6193019943019943, + "grad_norm": 0.5221366286277771, + "learning_rate": 0.0001885125550807947, + "loss": 1.1078, + "step": 3478 + }, + { + "epoch": 0.6194800569800569, + "grad_norm": 0.5051897168159485, + "learning_rate": 0.0001885060404578954, + "loss": 1.0055, + "step": 3479 + }, + { + "epoch": 0.6196581196581197, + "grad_norm": 0.492662250995636, + "learning_rate": 0.00018849952410090413, + "loss": 1.1172, + "step": 3480 + }, + { + "epoch": 0.6198361823361823, + "grad_norm": 0.4906775951385498, + "learning_rate": 0.00018849300600994853, + "loss": 1.1223, + "step": 3481 + }, + { + "epoch": 0.6200142450142451, + "grad_norm": 0.5032641291618347, + "learning_rate": 0.0001884864861851563, + "loss": 0.9541, + "step": 3482 + }, + { + "epoch": 0.6201923076923077, + "grad_norm": 
0.5262296795845032, + "learning_rate": 0.00018847996462665521, + "loss": 1.021, + "step": 3483 + }, + { + "epoch": 0.6203703703703703, + "grad_norm": 0.5253522992134094, + "learning_rate": 0.00018847344133457295, + "loss": 0.9075, + "step": 3484 + }, + { + "epoch": 0.6205484330484331, + "grad_norm": 0.4204299747943878, + "learning_rate": 0.00018846691630903744, + "loss": 0.895, + "step": 3485 + }, + { + "epoch": 0.6207264957264957, + "grad_norm": 0.557604193687439, + "learning_rate": 0.0001884603895501765, + "loss": 1.1758, + "step": 3486 + }, + { + "epoch": 0.6209045584045584, + "grad_norm": 0.5981321930885315, + "learning_rate": 0.00018845386105811795, + "loss": 1.1087, + "step": 3487 + }, + { + "epoch": 0.6210826210826211, + "grad_norm": 0.5285581946372986, + "learning_rate": 0.00018844733083298975, + "loss": 1.0692, + "step": 3488 + }, + { + "epoch": 0.6212606837606838, + "grad_norm": 0.5403170585632324, + "learning_rate": 0.00018844079887491986, + "loss": 1.1998, + "step": 3489 + }, + { + "epoch": 0.6214387464387464, + "grad_norm": 0.5471615791320801, + "learning_rate": 0.0001884342651840362, + "loss": 0.9556, + "step": 3490 + }, + { + "epoch": 0.6216168091168092, + "grad_norm": 0.6126871705055237, + "learning_rate": 0.00018842772976046686, + "loss": 1.2629, + "step": 3491 + }, + { + "epoch": 0.6217948717948718, + "grad_norm": 0.45669353008270264, + "learning_rate": 0.00018842119260433982, + "loss": 1.0203, + "step": 3492 + }, + { + "epoch": 0.6219729344729344, + "grad_norm": 0.4998520612716675, + "learning_rate": 0.0001884146537157832, + "loss": 1.0271, + "step": 3493 + }, + { + "epoch": 0.6221509971509972, + "grad_norm": 0.5820242166519165, + "learning_rate": 0.00018840811309492507, + "loss": 1.0321, + "step": 3494 + }, + { + "epoch": 0.6223290598290598, + "grad_norm": 0.581676185131073, + "learning_rate": 0.00018840157074189367, + "loss": 0.9219, + "step": 3495 + }, + { + "epoch": 0.6225071225071225, + "grad_norm": 0.6044120788574219, + "learning_rate": 
0.0001883950266568171, + "loss": 1.1621, + "step": 3496 + }, + { + "epoch": 0.6226851851851852, + "grad_norm": 0.5448858737945557, + "learning_rate": 0.0001883884808398236, + "loss": 1.0686, + "step": 3497 + }, + { + "epoch": 0.6228632478632479, + "grad_norm": 0.4921551048755646, + "learning_rate": 0.00018838193329104143, + "loss": 1.2259, + "step": 3498 + }, + { + "epoch": 0.6230413105413105, + "grad_norm": 0.5374335646629333, + "learning_rate": 0.00018837538401059888, + "loss": 1.2608, + "step": 3499 + }, + { + "epoch": 0.6232193732193733, + "grad_norm": 0.5123008489608765, + "learning_rate": 0.0001883688329986243, + "loss": 0.8682, + "step": 3500 + }, + { + "epoch": 0.6233974358974359, + "grad_norm": 0.566145122051239, + "learning_rate": 0.00018836228025524595, + "loss": 1.1807, + "step": 3501 + }, + { + "epoch": 0.6235754985754985, + "grad_norm": 0.6658587455749512, + "learning_rate": 0.00018835572578059233, + "loss": 1.1641, + "step": 3502 + }, + { + "epoch": 0.6237535612535613, + "grad_norm": 0.4992465078830719, + "learning_rate": 0.00018834916957479177, + "loss": 0.9125, + "step": 3503 + }, + { + "epoch": 0.6239316239316239, + "grad_norm": 0.5081812739372253, + "learning_rate": 0.00018834261163797278, + "loss": 1.0939, + "step": 3504 + }, + { + "epoch": 0.6241096866096866, + "grad_norm": 0.5168607234954834, + "learning_rate": 0.0001883360519702638, + "loss": 1.2382, + "step": 3505 + }, + { + "epoch": 0.6242877492877493, + "grad_norm": 0.5517697334289551, + "learning_rate": 0.00018832949057179344, + "loss": 1.206, + "step": 3506 + }, + { + "epoch": 0.624465811965812, + "grad_norm": 0.4505497217178345, + "learning_rate": 0.00018832292744269013, + "loss": 0.8485, + "step": 3507 + }, + { + "epoch": 0.6246438746438746, + "grad_norm": 0.5230690240859985, + "learning_rate": 0.0001883163625830826, + "loss": 1.1701, + "step": 3508 + }, + { + "epoch": 0.6248219373219374, + "grad_norm": 0.5062205195426941, + "learning_rate": 0.00018830979599309937, + "loss": 1.0602, + 
"step": 3509 + }, + { + "epoch": 0.625, + "grad_norm": 0.49922460317611694, + "learning_rate": 0.00018830322767286913, + "loss": 1.1937, + "step": 3510 + }, + { + "epoch": 0.6251780626780626, + "grad_norm": 0.4637366831302643, + "learning_rate": 0.0001882966576225206, + "loss": 1.038, + "step": 3511 + }, + { + "epoch": 0.6253561253561254, + "grad_norm": 0.5330080389976501, + "learning_rate": 0.00018829008584218246, + "loss": 0.9308, + "step": 3512 + }, + { + "epoch": 0.625534188034188, + "grad_norm": 0.5443428754806519, + "learning_rate": 0.0001882835123319835, + "loss": 1.0006, + "step": 3513 + }, + { + "epoch": 0.6257122507122507, + "grad_norm": 0.5534018874168396, + "learning_rate": 0.00018827693709205253, + "loss": 1.2383, + "step": 3514 + }, + { + "epoch": 0.6258903133903134, + "grad_norm": 0.49207547307014465, + "learning_rate": 0.00018827036012251832, + "loss": 0.9804, + "step": 3515 + }, + { + "epoch": 0.6260683760683761, + "grad_norm": 0.4900086224079132, + "learning_rate": 0.0001882637814235098, + "loss": 1.012, + "step": 3516 + }, + { + "epoch": 0.6262464387464387, + "grad_norm": 0.5267475247383118, + "learning_rate": 0.00018825720099515585, + "loss": 1.1104, + "step": 3517 + }, + { + "epoch": 0.6264245014245015, + "grad_norm": 0.5711902379989624, + "learning_rate": 0.00018825061883758534, + "loss": 1.0616, + "step": 3518 + }, + { + "epoch": 0.6266025641025641, + "grad_norm": 0.5007771849632263, + "learning_rate": 0.0001882440349509273, + "loss": 0.9578, + "step": 3519 + }, + { + "epoch": 0.6267806267806267, + "grad_norm": 0.5657192468643188, + "learning_rate": 0.00018823744933531075, + "loss": 1.2768, + "step": 3520 + }, + { + "epoch": 0.6269586894586895, + "grad_norm": 0.6077173352241516, + "learning_rate": 0.00018823086199086462, + "loss": 1.147, + "step": 3521 + }, + { + "epoch": 0.6271367521367521, + "grad_norm": 0.5114718079566956, + "learning_rate": 0.000188224272917718, + "loss": 1.1176, + "step": 3522 + }, + { + "epoch": 0.6273148148148148, + 
"grad_norm": 0.4831676185131073, + "learning_rate": 0.0001882176821160001, + "loss": 0.8021, + "step": 3523 + }, + { + "epoch": 0.6274928774928775, + "grad_norm": 0.6327390670776367, + "learning_rate": 0.00018821108958583994, + "loss": 0.9449, + "step": 3524 + }, + { + "epoch": 0.6276709401709402, + "grad_norm": 0.5541796684265137, + "learning_rate": 0.00018820449532736672, + "loss": 1.2018, + "step": 3525 + }, + { + "epoch": 0.6278490028490028, + "grad_norm": 0.5224639773368835, + "learning_rate": 0.00018819789934070968, + "loss": 1.0138, + "step": 3526 + }, + { + "epoch": 0.6280270655270656, + "grad_norm": 0.49359360337257385, + "learning_rate": 0.00018819130162599798, + "loss": 1.0768, + "step": 3527 + }, + { + "epoch": 0.6282051282051282, + "grad_norm": 0.5525050759315491, + "learning_rate": 0.00018818470218336092, + "loss": 1.0883, + "step": 3528 + }, + { + "epoch": 0.6283831908831908, + "grad_norm": 0.5563427209854126, + "learning_rate": 0.00018817810101292787, + "loss": 1.1491, + "step": 3529 + }, + { + "epoch": 0.6285612535612536, + "grad_norm": 0.49363306164741516, + "learning_rate": 0.00018817149811482803, + "loss": 1.1409, + "step": 3530 + }, + { + "epoch": 0.6287393162393162, + "grad_norm": 0.5102340579032898, + "learning_rate": 0.00018816489348919086, + "loss": 1.1914, + "step": 3531 + }, + { + "epoch": 0.6289173789173789, + "grad_norm": 0.5173332691192627, + "learning_rate": 0.00018815828713614576, + "loss": 0.9308, + "step": 3532 + }, + { + "epoch": 0.6290954415954416, + "grad_norm": 0.5093010067939758, + "learning_rate": 0.00018815167905582216, + "loss": 0.9429, + "step": 3533 + }, + { + "epoch": 0.6292735042735043, + "grad_norm": 0.5453153848648071, + "learning_rate": 0.00018814506924834954, + "loss": 1.0147, + "step": 3534 + }, + { + "epoch": 0.6294515669515669, + "grad_norm": 0.5850773453712463, + "learning_rate": 0.00018813845771385737, + "loss": 1.3372, + "step": 3535 + }, + { + "epoch": 0.6296296296296297, + "grad_norm": 0.5095621943473816, + 
"learning_rate": 0.00018813184445247525, + "loss": 1.0515, + "step": 3536 + }, + { + "epoch": 0.6298076923076923, + "grad_norm": 0.6216054558753967, + "learning_rate": 0.00018812522946433266, + "loss": 0.8703, + "step": 3537 + }, + { + "epoch": 0.6299857549857549, + "grad_norm": 0.4945531189441681, + "learning_rate": 0.00018811861274955932, + "loss": 1.1485, + "step": 3538 + }, + { + "epoch": 0.6301638176638177, + "grad_norm": 0.47882601618766785, + "learning_rate": 0.00018811199430828477, + "loss": 1.1107, + "step": 3539 + }, + { + "epoch": 0.6303418803418803, + "grad_norm": 0.5005326867103577, + "learning_rate": 0.00018810537414063876, + "loss": 1.0237, + "step": 3540 + }, + { + "epoch": 0.6305199430199431, + "grad_norm": 0.5382370352745056, + "learning_rate": 0.00018809875224675093, + "loss": 0.9965, + "step": 3541 + }, + { + "epoch": 0.6306980056980057, + "grad_norm": 0.47002625465393066, + "learning_rate": 0.0001880921286267511, + "loss": 1.065, + "step": 3542 + }, + { + "epoch": 0.6308760683760684, + "grad_norm": 0.4519105851650238, + "learning_rate": 0.00018808550328076897, + "loss": 0.9312, + "step": 3543 + }, + { + "epoch": 0.6310541310541311, + "grad_norm": 0.45360881090164185, + "learning_rate": 0.0001880788762089344, + "loss": 1.0739, + "step": 3544 + }, + { + "epoch": 0.6312321937321937, + "grad_norm": 0.5578218698501587, + "learning_rate": 0.00018807224741137723, + "loss": 1.2478, + "step": 3545 + }, + { + "epoch": 0.6314102564102564, + "grad_norm": 0.4838615655899048, + "learning_rate": 0.0001880656168882273, + "loss": 1.0221, + "step": 3546 + }, + { + "epoch": 0.6315883190883191, + "grad_norm": 0.5733556747436523, + "learning_rate": 0.0001880589846396146, + "loss": 1.1249, + "step": 3547 + }, + { + "epoch": 0.6317663817663818, + "grad_norm": 0.4939686954021454, + "learning_rate": 0.00018805235066566894, + "loss": 0.8559, + "step": 3548 + }, + { + "epoch": 0.6319444444444444, + "grad_norm": 0.5072234869003296, + "learning_rate": 
0.00018804571496652044, + "loss": 1.0842, + "step": 3549 + }, + { + "epoch": 0.6321225071225072, + "grad_norm": 0.4640493392944336, + "learning_rate": 0.00018803907754229903, + "loss": 1.0728, + "step": 3550 + }, + { + "epoch": 0.6323005698005698, + "grad_norm": 0.5314788818359375, + "learning_rate": 0.00018803243839313481, + "loss": 1.0752, + "step": 3551 + }, + { + "epoch": 0.6324786324786325, + "grad_norm": 0.5511462092399597, + "learning_rate": 0.0001880257975191578, + "loss": 1.0238, + "step": 3552 + }, + { + "epoch": 0.6326566951566952, + "grad_norm": 0.4980711042881012, + "learning_rate": 0.00018801915492049816, + "loss": 1.0981, + "step": 3553 + }, + { + "epoch": 0.6328347578347578, + "grad_norm": 0.7746123671531677, + "learning_rate": 0.00018801251059728604, + "loss": 1.0968, + "step": 3554 + }, + { + "epoch": 0.6330128205128205, + "grad_norm": 0.5006106495857239, + "learning_rate": 0.00018800586454965155, + "loss": 1.1802, + "step": 3555 + }, + { + "epoch": 0.6331908831908832, + "grad_norm": 0.49427780508995056, + "learning_rate": 0.000187999216777725, + "loss": 1.1257, + "step": 3556 + }, + { + "epoch": 0.6333689458689459, + "grad_norm": 0.5484146475791931, + "learning_rate": 0.00018799256728163662, + "loss": 1.1344, + "step": 3557 + }, + { + "epoch": 0.6335470085470085, + "grad_norm": 0.5007877349853516, + "learning_rate": 0.00018798591606151662, + "loss": 1.1328, + "step": 3558 + }, + { + "epoch": 0.6337250712250713, + "grad_norm": 0.5068148970603943, + "learning_rate": 0.00018797926311749544, + "loss": 0.976, + "step": 3559 + }, + { + "epoch": 0.6339031339031339, + "grad_norm": 0.44936859607696533, + "learning_rate": 0.00018797260844970334, + "loss": 0.9735, + "step": 3560 + }, + { + "epoch": 0.6340811965811965, + "grad_norm": 0.4592931866645813, + "learning_rate": 0.0001879659520582707, + "loss": 1.1306, + "step": 3561 + }, + { + "epoch": 0.6342592592592593, + "grad_norm": 0.4664020836353302, + "learning_rate": 0.00018795929394332795, + "loss": 
1.0577, + "step": 3562 + }, + { + "epoch": 0.6344373219373219, + "grad_norm": 0.5638116002082825, + "learning_rate": 0.00018795263410500556, + "loss": 1.1747, + "step": 3563 + }, + { + "epoch": 0.6346153846153846, + "grad_norm": 0.524736225605011, + "learning_rate": 0.00018794597254343401, + "loss": 0.8964, + "step": 3564 + }, + { + "epoch": 0.6347934472934473, + "grad_norm": 0.4645404517650604, + "learning_rate": 0.00018793930925874386, + "loss": 0.8673, + "step": 3565 + }, + { + "epoch": 0.63497150997151, + "grad_norm": 0.4800064265727997, + "learning_rate": 0.00018793264425106558, + "loss": 1.0334, + "step": 3566 + }, + { + "epoch": 0.6351495726495726, + "grad_norm": 0.6202501058578491, + "learning_rate": 0.0001879259775205298, + "loss": 1.1061, + "step": 3567 + }, + { + "epoch": 0.6353276353276354, + "grad_norm": 0.503383457660675, + "learning_rate": 0.00018791930906726718, + "loss": 0.8545, + "step": 3568 + }, + { + "epoch": 0.635505698005698, + "grad_norm": 0.5256780982017517, + "learning_rate": 0.00018791263889140832, + "loss": 1.0785, + "step": 3569 + }, + { + "epoch": 0.6356837606837606, + "grad_norm": 0.47562023997306824, + "learning_rate": 0.00018790596699308392, + "loss": 1.0041, + "step": 3570 + }, + { + "epoch": 0.6358618233618234, + "grad_norm": 0.5103238224983215, + "learning_rate": 0.00018789929337242469, + "loss": 1.1488, + "step": 3571 + }, + { + "epoch": 0.636039886039886, + "grad_norm": 0.5023695826530457, + "learning_rate": 0.0001878926180295614, + "loss": 1.0696, + "step": 3572 + }, + { + "epoch": 0.6362179487179487, + "grad_norm": 0.5302290916442871, + "learning_rate": 0.00018788594096462487, + "loss": 1.0554, + "step": 3573 + }, + { + "epoch": 0.6363960113960114, + "grad_norm": 0.4798361361026764, + "learning_rate": 0.00018787926217774588, + "loss": 0.8872, + "step": 3574 + }, + { + "epoch": 0.6365740740740741, + "grad_norm": 0.5529209971427917, + "learning_rate": 0.00018787258166905527, + "loss": 1.0976, + "step": 3575 + }, + { + "epoch": 
0.6367521367521367, + "grad_norm": 0.49757125973701477, + "learning_rate": 0.00018786589943868402, + "loss": 1.0049, + "step": 3576 + }, + { + "epoch": 0.6369301994301995, + "grad_norm": 0.5497848391532898, + "learning_rate": 0.00018785921548676295, + "loss": 1.2272, + "step": 3577 + }, + { + "epoch": 0.6371082621082621, + "grad_norm": 0.5061752200126648, + "learning_rate": 0.0001878525298134231, + "loss": 1.0307, + "step": 3578 + }, + { + "epoch": 0.6372863247863247, + "grad_norm": 0.5427432656288147, + "learning_rate": 0.00018784584241879538, + "loss": 1.1064, + "step": 3579 + }, + { + "epoch": 0.6374643874643875, + "grad_norm": 0.48312774300575256, + "learning_rate": 0.0001878391533030109, + "loss": 1.078, + "step": 3580 + }, + { + "epoch": 0.6376424501424501, + "grad_norm": 0.5059898495674133, + "learning_rate": 0.00018783246246620067, + "loss": 1.0922, + "step": 3581 + }, + { + "epoch": 0.6378205128205128, + "grad_norm": 0.5144124031066895, + "learning_rate": 0.00018782576990849581, + "loss": 1.0909, + "step": 3582 + }, + { + "epoch": 0.6379985754985755, + "grad_norm": 0.5535032153129578, + "learning_rate": 0.0001878190756300274, + "loss": 1.2579, + "step": 3583 + }, + { + "epoch": 0.6381766381766382, + "grad_norm": 0.49145692586898804, + "learning_rate": 0.00018781237963092667, + "loss": 1.0823, + "step": 3584 + }, + { + "epoch": 0.6383547008547008, + "grad_norm": 0.5245576500892639, + "learning_rate": 0.00018780568191132472, + "loss": 0.9595, + "step": 3585 + }, + { + "epoch": 0.6385327635327636, + "grad_norm": 0.5026637315750122, + "learning_rate": 0.00018779898247135287, + "loss": 1.153, + "step": 3586 + }, + { + "epoch": 0.6387108262108262, + "grad_norm": 0.5092771053314209, + "learning_rate": 0.00018779228131114234, + "loss": 1.0661, + "step": 3587 + }, + { + "epoch": 0.6388888888888888, + "grad_norm": 0.517387330532074, + "learning_rate": 0.00018778557843082444, + "loss": 1.0113, + "step": 3588 + }, + { + "epoch": 0.6390669515669516, + "grad_norm": 
0.5149948000907898, + "learning_rate": 0.00018777887383053047, + "loss": 0.9483, + "step": 3589 + }, + { + "epoch": 0.6392450142450142, + "grad_norm": 0.4854544997215271, + "learning_rate": 0.00018777216751039185, + "loss": 1.22, + "step": 3590 + }, + { + "epoch": 0.6394230769230769, + "grad_norm": 0.5317271947860718, + "learning_rate": 0.0001877654594705399, + "loss": 1.2483, + "step": 3591 + }, + { + "epoch": 0.6396011396011396, + "grad_norm": 0.4554755687713623, + "learning_rate": 0.0001877587497111061, + "loss": 0.9864, + "step": 3592 + }, + { + "epoch": 0.6397792022792023, + "grad_norm": 0.4833736717700958, + "learning_rate": 0.0001877520382322219, + "loss": 0.8895, + "step": 3593 + }, + { + "epoch": 0.6399572649572649, + "grad_norm": 0.5018072724342346, + "learning_rate": 0.00018774532503401878, + "loss": 1.2523, + "step": 3594 + }, + { + "epoch": 0.6401353276353277, + "grad_norm": 0.4478762447834015, + "learning_rate": 0.00018773861011662832, + "loss": 0.8833, + "step": 3595 + }, + { + "epoch": 0.6403133903133903, + "grad_norm": 0.5686985850334167, + "learning_rate": 0.00018773189348018205, + "loss": 0.9934, + "step": 3596 + }, + { + "epoch": 0.6404914529914529, + "grad_norm": 0.5144175291061401, + "learning_rate": 0.00018772517512481157, + "loss": 0.8149, + "step": 3597 + }, + { + "epoch": 0.6406695156695157, + "grad_norm": 0.5359936356544495, + "learning_rate": 0.00018771845505064852, + "loss": 1.1822, + "step": 3598 + }, + { + "epoch": 0.6408475783475783, + "grad_norm": 0.532573938369751, + "learning_rate": 0.00018771173325782457, + "loss": 1.0361, + "step": 3599 + }, + { + "epoch": 0.6410256410256411, + "grad_norm": 0.46121537685394287, + "learning_rate": 0.00018770500974647138, + "loss": 1.0792, + "step": 3600 + }, + { + "epoch": 0.6412037037037037, + "grad_norm": 0.4804821312427521, + "learning_rate": 0.00018769828451672076, + "loss": 1.1119, + "step": 3601 + }, + { + "epoch": 0.6413817663817664, + "grad_norm": 0.4955114722251892, + "learning_rate": 
0.00018769155756870443, + "loss": 0.9312, + "step": 3602 + }, + { + "epoch": 0.6415598290598291, + "grad_norm": 0.4987298250198364, + "learning_rate": 0.00018768482890255415, + "loss": 1.2326, + "step": 3603 + }, + { + "epoch": 0.6417378917378918, + "grad_norm": 0.47216179966926575, + "learning_rate": 0.0001876780985184018, + "loss": 1.0114, + "step": 3604 + }, + { + "epoch": 0.6419159544159544, + "grad_norm": 0.5891931653022766, + "learning_rate": 0.0001876713664163793, + "loss": 1.2963, + "step": 3605 + }, + { + "epoch": 0.6420940170940171, + "grad_norm": 0.4645081162452698, + "learning_rate": 0.00018766463259661846, + "loss": 1.0874, + "step": 3606 + }, + { + "epoch": 0.6422720797720798, + "grad_norm": 0.5275476574897766, + "learning_rate": 0.00018765789705925125, + "loss": 0.9453, + "step": 3607 + }, + { + "epoch": 0.6424501424501424, + "grad_norm": 0.5884957313537598, + "learning_rate": 0.00018765115980440964, + "loss": 1.0796, + "step": 3608 + }, + { + "epoch": 0.6426282051282052, + "grad_norm": 0.4843178987503052, + "learning_rate": 0.00018764442083222567, + "loss": 1.1657, + "step": 3609 + }, + { + "epoch": 0.6428062678062678, + "grad_norm": 0.5188381671905518, + "learning_rate": 0.00018763768014283126, + "loss": 1.1109, + "step": 3610 + }, + { + "epoch": 0.6429843304843305, + "grad_norm": 0.4101468324661255, + "learning_rate": 0.00018763093773635863, + "loss": 0.895, + "step": 3611 + }, + { + "epoch": 0.6431623931623932, + "grad_norm": 0.4552084505558014, + "learning_rate": 0.00018762419361293979, + "loss": 0.9418, + "step": 3612 + }, + { + "epoch": 0.6433404558404558, + "grad_norm": 0.5924661159515381, + "learning_rate": 0.0001876174477727069, + "loss": 1.2562, + "step": 3613 + }, + { + "epoch": 0.6435185185185185, + "grad_norm": 0.5072348713874817, + "learning_rate": 0.00018761070021579212, + "loss": 1.1501, + "step": 3614 + }, + { + "epoch": 0.6436965811965812, + "grad_norm": 0.5312697887420654, + "learning_rate": 0.0001876039509423277, + "loss": 
1.0751, + "step": 3615 + }, + { + "epoch": 0.6438746438746439, + "grad_norm": 0.6046462059020996, + "learning_rate": 0.0001875971999524458, + "loss": 1.0927, + "step": 3616 + }, + { + "epoch": 0.6440527065527065, + "grad_norm": 0.4992375373840332, + "learning_rate": 0.00018759044724627876, + "loss": 0.96, + "step": 3617 + }, + { + "epoch": 0.6442307692307693, + "grad_norm": 0.4983134865760803, + "learning_rate": 0.00018758369282395886, + "loss": 1.0599, + "step": 3618 + }, + { + "epoch": 0.6444088319088319, + "grad_norm": 0.5655683279037476, + "learning_rate": 0.00018757693668561843, + "loss": 1.2372, + "step": 3619 + }, + { + "epoch": 0.6445868945868946, + "grad_norm": 0.4968827962875366, + "learning_rate": 0.00018757017883138985, + "loss": 1.1639, + "step": 3620 + }, + { + "epoch": 0.6447649572649573, + "grad_norm": 0.5831420421600342, + "learning_rate": 0.00018756341926140553, + "loss": 0.9002, + "step": 3621 + }, + { + "epoch": 0.64494301994302, + "grad_norm": 0.4828467071056366, + "learning_rate": 0.0001875566579757979, + "loss": 0.9201, + "step": 3622 + }, + { + "epoch": 0.6451210826210826, + "grad_norm": 0.5067087411880493, + "learning_rate": 0.00018754989497469943, + "loss": 0.9874, + "step": 3623 + }, + { + "epoch": 0.6452991452991453, + "grad_norm": 0.5182318091392517, + "learning_rate": 0.00018754313025824267, + "loss": 1.1291, + "step": 3624 + }, + { + "epoch": 0.645477207977208, + "grad_norm": 0.472200483083725, + "learning_rate": 0.0001875363638265601, + "loss": 1.0286, + "step": 3625 + }, + { + "epoch": 0.6456552706552706, + "grad_norm": 0.4597308039665222, + "learning_rate": 0.0001875295956797843, + "loss": 0.7517, + "step": 3626 + }, + { + "epoch": 0.6458333333333334, + "grad_norm": 0.5358221530914307, + "learning_rate": 0.00018752282581804798, + "loss": 1.2264, + "step": 3627 + }, + { + "epoch": 0.646011396011396, + "grad_norm": 0.5268992781639099, + "learning_rate": 0.00018751605424148363, + "loss": 1.0801, + "step": 3628 + }, + { + "epoch": 
0.6461894586894587, + "grad_norm": 0.5917379260063171, + "learning_rate": 0.00018750928095022403, + "loss": 0.9538, + "step": 3629 + }, + { + "epoch": 0.6463675213675214, + "grad_norm": 0.44506707787513733, + "learning_rate": 0.00018750250594440183, + "loss": 0.9818, + "step": 3630 + }, + { + "epoch": 0.646545584045584, + "grad_norm": 0.5578880906105042, + "learning_rate": 0.00018749572922414982, + "loss": 0.9958, + "step": 3631 + }, + { + "epoch": 0.6467236467236467, + "grad_norm": 0.5155318975448608, + "learning_rate": 0.00018748895078960076, + "loss": 1.2888, + "step": 3632 + }, + { + "epoch": 0.6469017094017094, + "grad_norm": 0.5117297768592834, + "learning_rate": 0.0001874821706408874, + "loss": 1.0452, + "step": 3633 + }, + { + "epoch": 0.6470797720797721, + "grad_norm": 0.5169841647148132, + "learning_rate": 0.00018747538877814267, + "loss": 1.1649, + "step": 3634 + }, + { + "epoch": 0.6472578347578347, + "grad_norm": 0.5001181960105896, + "learning_rate": 0.00018746860520149942, + "loss": 1.1472, + "step": 3635 + }, + { + "epoch": 0.6474358974358975, + "grad_norm": 0.6289856433868408, + "learning_rate": 0.00018746181991109056, + "loss": 1.0351, + "step": 3636 + }, + { + "epoch": 0.6476139601139601, + "grad_norm": 0.5490612983703613, + "learning_rate": 0.00018745503290704897, + "loss": 0.8938, + "step": 3637 + }, + { + "epoch": 0.6477920227920227, + "grad_norm": 0.47378283739089966, + "learning_rate": 0.00018744824418950775, + "loss": 0.937, + "step": 3638 + }, + { + "epoch": 0.6479700854700855, + "grad_norm": 0.6079059839248657, + "learning_rate": 0.0001874414537585998, + "loss": 1.0486, + "step": 3639 + }, + { + "epoch": 0.6481481481481481, + "grad_norm": 0.5351769924163818, + "learning_rate": 0.00018743466161445823, + "loss": 1.0316, + "step": 3640 + }, + { + "epoch": 0.6483262108262108, + "grad_norm": 0.5516425967216492, + "learning_rate": 0.0001874278677572161, + "loss": 1.1552, + "step": 3641 + }, + { + "epoch": 0.6485042735042735, + "grad_norm": 
0.5027523636817932, + "learning_rate": 0.0001874210721870065, + "loss": 1.0491, + "step": 3642 + }, + { + "epoch": 0.6486823361823362, + "grad_norm": 0.5596168041229248, + "learning_rate": 0.00018741427490396258, + "loss": 1.0256, + "step": 3643 + }, + { + "epoch": 0.6488603988603988, + "grad_norm": 0.5601046681404114, + "learning_rate": 0.00018740747590821751, + "loss": 1.1604, + "step": 3644 + }, + { + "epoch": 0.6490384615384616, + "grad_norm": 0.49749523401260376, + "learning_rate": 0.0001874006751999046, + "loss": 1.0532, + "step": 3645 + }, + { + "epoch": 0.6492165242165242, + "grad_norm": 0.6226113438606262, + "learning_rate": 0.00018739387277915697, + "loss": 1.1402, + "step": 3646 + }, + { + "epoch": 0.6493945868945868, + "grad_norm": 0.6142009496688843, + "learning_rate": 0.00018738706864610794, + "loss": 1.2437, + "step": 3647 + }, + { + "epoch": 0.6495726495726496, + "grad_norm": 0.48814916610717773, + "learning_rate": 0.00018738026280089084, + "loss": 0.8429, + "step": 3648 + }, + { + "epoch": 0.6497507122507122, + "grad_norm": 0.5717982053756714, + "learning_rate": 0.00018737345524363902, + "loss": 1.1095, + "step": 3649 + }, + { + "epoch": 0.6499287749287749, + "grad_norm": 0.5150009989738464, + "learning_rate": 0.00018736664597448582, + "loss": 1.199, + "step": 3650 + }, + { + "epoch": 0.6501068376068376, + "grad_norm": 0.58461594581604, + "learning_rate": 0.00018735983499356472, + "loss": 1.0704, + "step": 3651 + }, + { + "epoch": 0.6502849002849003, + "grad_norm": 0.5108643770217896, + "learning_rate": 0.0001873530223010091, + "loss": 1.2039, + "step": 3652 + }, + { + "epoch": 0.6504629629629629, + "grad_norm": 0.513306736946106, + "learning_rate": 0.00018734620789695247, + "loss": 1.1448, + "step": 3653 + }, + { + "epoch": 0.6506410256410257, + "grad_norm": 0.5139986872673035, + "learning_rate": 0.00018733939178152835, + "loss": 1.0023, + "step": 3654 + }, + { + "epoch": 0.6508190883190883, + "grad_norm": 0.5187703967094421, + "learning_rate": 
0.00018733257395487027, + "loss": 1.1304, + "step": 3655 + }, + { + "epoch": 0.6509971509971509, + "grad_norm": 0.5470501184463501, + "learning_rate": 0.00018732575441711183, + "loss": 1.0272, + "step": 3656 + }, + { + "epoch": 0.6511752136752137, + "grad_norm": 0.537309467792511, + "learning_rate": 0.00018731893316838665, + "loss": 1.0806, + "step": 3657 + }, + { + "epoch": 0.6513532763532763, + "grad_norm": 0.5187864899635315, + "learning_rate": 0.00018731211020882836, + "loss": 1.0154, + "step": 3658 + }, + { + "epoch": 0.6515313390313391, + "grad_norm": 0.48373252153396606, + "learning_rate": 0.00018730528553857062, + "loss": 1.0135, + "step": 3659 + }, + { + "epoch": 0.6517094017094017, + "grad_norm": 0.5645000338554382, + "learning_rate": 0.00018729845915774716, + "loss": 0.8924, + "step": 3660 + }, + { + "epoch": 0.6518874643874644, + "grad_norm": 0.5722129940986633, + "learning_rate": 0.00018729163106649178, + "loss": 1.2416, + "step": 3661 + }, + { + "epoch": 0.6520655270655271, + "grad_norm": 0.5904877185821533, + "learning_rate": 0.00018728480126493823, + "loss": 0.9792, + "step": 3662 + }, + { + "epoch": 0.6522435897435898, + "grad_norm": 0.5224713087081909, + "learning_rate": 0.00018727796975322026, + "loss": 1.079, + "step": 3663 + }, + { + "epoch": 0.6524216524216524, + "grad_norm": 0.5667217969894409, + "learning_rate": 0.00018727113653147184, + "loss": 1.1397, + "step": 3664 + }, + { + "epoch": 0.6525997150997151, + "grad_norm": 0.5274622440338135, + "learning_rate": 0.00018726430159982677, + "loss": 1.0569, + "step": 3665 + }, + { + "epoch": 0.6527777777777778, + "grad_norm": 0.5745310187339783, + "learning_rate": 0.00018725746495841896, + "loss": 1.2129, + "step": 3666 + }, + { + "epoch": 0.6529558404558404, + "grad_norm": 0.6123398542404175, + "learning_rate": 0.0001872506266073824, + "loss": 1.186, + "step": 3667 + }, + { + "epoch": 0.6531339031339032, + "grad_norm": 0.4983387291431427, + "learning_rate": 0.00018724378654685106, + "loss": 
1.1957, + "step": 3668 + }, + { + "epoch": 0.6533119658119658, + "grad_norm": 0.5584192276000977, + "learning_rate": 0.00018723694477695897, + "loss": 1.0939, + "step": 3669 + }, + { + "epoch": 0.6534900284900285, + "grad_norm": 0.5318745374679565, + "learning_rate": 0.00018723010129784016, + "loss": 1.1869, + "step": 3670 + }, + { + "epoch": 0.6536680911680912, + "grad_norm": 0.4607617259025574, + "learning_rate": 0.0001872232561096287, + "loss": 0.8447, + "step": 3671 + }, + { + "epoch": 0.6538461538461539, + "grad_norm": 0.5312213897705078, + "learning_rate": 0.00018721640921245874, + "loss": 1.0623, + "step": 3672 + }, + { + "epoch": 0.6540242165242165, + "grad_norm": 0.5099136233329773, + "learning_rate": 0.0001872095606064644, + "loss": 0.7174, + "step": 3673 + }, + { + "epoch": 0.6542022792022792, + "grad_norm": 0.6894404888153076, + "learning_rate": 0.0001872027102917799, + "loss": 1.0251, + "step": 3674 + }, + { + "epoch": 0.6543803418803419, + "grad_norm": 0.5758535861968994, + "learning_rate": 0.00018719585826853944, + "loss": 1.1655, + "step": 3675 + }, + { + "epoch": 0.6545584045584045, + "grad_norm": 0.521824061870575, + "learning_rate": 0.0001871890045368773, + "loss": 1.1653, + "step": 3676 + }, + { + "epoch": 0.6547364672364673, + "grad_norm": 0.5370712280273438, + "learning_rate": 0.00018718214909692771, + "loss": 1.3152, + "step": 3677 + }, + { + "epoch": 0.6549145299145299, + "grad_norm": 0.4459827244281769, + "learning_rate": 0.000187175291948825, + "loss": 1.0953, + "step": 3678 + }, + { + "epoch": 0.6550925925925926, + "grad_norm": 0.44131460785865784, + "learning_rate": 0.00018716843309270353, + "loss": 0.8568, + "step": 3679 + }, + { + "epoch": 0.6552706552706553, + "grad_norm": 0.5529624819755554, + "learning_rate": 0.00018716157252869772, + "loss": 1.2085, + "step": 3680 + }, + { + "epoch": 0.655448717948718, + "grad_norm": 0.44604751467704773, + "learning_rate": 0.00018715471025694194, + "loss": 0.9605, + "step": 3681 + }, + { + "epoch": 
0.6556267806267806, + "grad_norm": 0.4662449359893799, + "learning_rate": 0.0001871478462775707, + "loss": 1.2092, + "step": 3682 + }, + { + "epoch": 0.6558048433048433, + "grad_norm": 0.42632922530174255, + "learning_rate": 0.0001871409805907184, + "loss": 0.9141, + "step": 3683 + }, + { + "epoch": 0.655982905982906, + "grad_norm": 0.534009575843811, + "learning_rate": 0.00018713411319651958, + "loss": 1.0147, + "step": 3684 + }, + { + "epoch": 0.6561609686609686, + "grad_norm": 0.5433241724967957, + "learning_rate": 0.00018712724409510888, + "loss": 1.1998, + "step": 3685 + }, + { + "epoch": 0.6563390313390314, + "grad_norm": 0.4771319627761841, + "learning_rate": 0.0001871203732866208, + "loss": 1.0384, + "step": 3686 + }, + { + "epoch": 0.656517094017094, + "grad_norm": 0.507641077041626, + "learning_rate": 0.00018711350077119, + "loss": 0.9608, + "step": 3687 + }, + { + "epoch": 0.6566951566951567, + "grad_norm": 0.5069413185119629, + "learning_rate": 0.00018710662654895108, + "loss": 1.055, + "step": 3688 + }, + { + "epoch": 0.6568732193732194, + "grad_norm": 0.512340247631073, + "learning_rate": 0.00018709975062003876, + "loss": 0.9506, + "step": 3689 + }, + { + "epoch": 0.657051282051282, + "grad_norm": 0.5156390070915222, + "learning_rate": 0.00018709287298458778, + "loss": 1.0089, + "step": 3690 + }, + { + "epoch": 0.6572293447293447, + "grad_norm": 0.5101696252822876, + "learning_rate": 0.0001870859936427329, + "loss": 1.0441, + "step": 3691 + }, + { + "epoch": 0.6574074074074074, + "grad_norm": 0.4394689202308655, + "learning_rate": 0.00018707911259460884, + "loss": 0.9124, + "step": 3692 + }, + { + "epoch": 0.6575854700854701, + "grad_norm": 0.4842554032802582, + "learning_rate": 0.00018707222984035043, + "loss": 1.0051, + "step": 3693 + }, + { + "epoch": 0.6577635327635327, + "grad_norm": 0.6418108344078064, + "learning_rate": 0.00018706534538009262, + "loss": 1.1165, + "step": 3694 + }, + { + "epoch": 0.6579415954415955, + "grad_norm": 
0.5596832036972046, + "learning_rate": 0.00018705845921397022, + "loss": 1.1127, + "step": 3695 + }, + { + "epoch": 0.6581196581196581, + "grad_norm": 0.6692909002304077, + "learning_rate": 0.00018705157134211813, + "loss": 1.2403, + "step": 3696 + }, + { + "epoch": 0.6582977207977208, + "grad_norm": 0.5046468377113342, + "learning_rate": 0.00018704468176467134, + "loss": 1.1016, + "step": 3697 + }, + { + "epoch": 0.6584757834757835, + "grad_norm": 0.6723586320877075, + "learning_rate": 0.00018703779048176485, + "loss": 1.1777, + "step": 3698 + }, + { + "epoch": 0.6586538461538461, + "grad_norm": 0.5269754528999329, + "learning_rate": 0.00018703089749353365, + "loss": 1.1441, + "step": 3699 + }, + { + "epoch": 0.6588319088319088, + "grad_norm": 0.5303323268890381, + "learning_rate": 0.0001870240028001128, + "loss": 1.07, + "step": 3700 + }, + { + "epoch": 0.6590099715099715, + "grad_norm": 0.4795511066913605, + "learning_rate": 0.00018701710640163738, + "loss": 1.0189, + "step": 3701 + }, + { + "epoch": 0.6591880341880342, + "grad_norm": 0.514659583568573, + "learning_rate": 0.00018701020829824255, + "loss": 1.0792, + "step": 3702 + }, + { + "epoch": 0.6593660968660968, + "grad_norm": 0.5407463312149048, + "learning_rate": 0.0001870033084900634, + "loss": 0.9346, + "step": 3703 + }, + { + "epoch": 0.6595441595441596, + "grad_norm": 0.5358424186706543, + "learning_rate": 0.0001869964069772352, + "loss": 1.1242, + "step": 3704 + }, + { + "epoch": 0.6597222222222222, + "grad_norm": 0.470825731754303, + "learning_rate": 0.00018698950375989307, + "loss": 0.9952, + "step": 3705 + }, + { + "epoch": 0.6599002849002849, + "grad_norm": 0.5711592435836792, + "learning_rate": 0.00018698259883817236, + "loss": 1.1678, + "step": 3706 + }, + { + "epoch": 0.6600783475783476, + "grad_norm": 0.5298995971679688, + "learning_rate": 0.00018697569221220832, + "loss": 0.869, + "step": 3707 + }, + { + "epoch": 0.6602564102564102, + "grad_norm": 0.5453875064849854, + "learning_rate": 
0.00018696878388213626, + "loss": 0.9706, + "step": 3708 + }, + { + "epoch": 0.6604344729344729, + "grad_norm": 0.6219926476478577, + "learning_rate": 0.00018696187384809154, + "loss": 1.1902, + "step": 3709 + }, + { + "epoch": 0.6606125356125356, + "grad_norm": 0.5972491502761841, + "learning_rate": 0.00018695496211020953, + "loss": 1.2054, + "step": 3710 + }, + { + "epoch": 0.6607905982905983, + "grad_norm": 0.5048904418945312, + "learning_rate": 0.0001869480486686257, + "loss": 1.0405, + "step": 3711 + }, + { + "epoch": 0.6609686609686609, + "grad_norm": 0.5474200248718262, + "learning_rate": 0.00018694113352347546, + "loss": 1.09, + "step": 3712 + }, + { + "epoch": 0.6611467236467237, + "grad_norm": 0.5073318481445312, + "learning_rate": 0.00018693421667489432, + "loss": 1.0698, + "step": 3713 + }, + { + "epoch": 0.6613247863247863, + "grad_norm": 0.5693208575248718, + "learning_rate": 0.0001869272981230178, + "loss": 0.9664, + "step": 3714 + }, + { + "epoch": 0.6615028490028491, + "grad_norm": 0.5678503513336182, + "learning_rate": 0.00018692037786798143, + "loss": 1.0895, + "step": 3715 + }, + { + "epoch": 0.6616809116809117, + "grad_norm": 0.4950976073741913, + "learning_rate": 0.00018691345590992082, + "loss": 0.9584, + "step": 3716 + }, + { + "epoch": 0.6618589743589743, + "grad_norm": 0.4944666624069214, + "learning_rate": 0.0001869065322489716, + "loss": 0.8607, + "step": 3717 + }, + { + "epoch": 0.6620370370370371, + "grad_norm": 0.5197804570198059, + "learning_rate": 0.0001868996068852694, + "loss": 1.2335, + "step": 3718 + }, + { + "epoch": 0.6622150997150997, + "grad_norm": 0.6550365686416626, + "learning_rate": 0.00018689267981894994, + "loss": 1.0441, + "step": 3719 + }, + { + "epoch": 0.6623931623931624, + "grad_norm": 0.5331503748893738, + "learning_rate": 0.00018688575105014888, + "loss": 1.1696, + "step": 3720 + }, + { + "epoch": 0.6625712250712251, + "grad_norm": 0.47304239869117737, + "learning_rate": 0.00018687882057900207, + "loss": 0.9695, 
+ "step": 3721 + }, + { + "epoch": 0.6627492877492878, + "grad_norm": 0.5653772354125977, + "learning_rate": 0.00018687188840564524, + "loss": 1.2082, + "step": 3722 + }, + { + "epoch": 0.6629273504273504, + "grad_norm": 0.5323491096496582, + "learning_rate": 0.00018686495453021417, + "loss": 0.9106, + "step": 3723 + }, + { + "epoch": 0.6631054131054132, + "grad_norm": 0.5612817406654358, + "learning_rate": 0.00018685801895284483, + "loss": 1.1302, + "step": 3724 + }, + { + "epoch": 0.6632834757834758, + "grad_norm": 0.4562164545059204, + "learning_rate": 0.000186851081673673, + "loss": 0.8886, + "step": 3725 + }, + { + "epoch": 0.6634615384615384, + "grad_norm": 0.5006430745124817, + "learning_rate": 0.00018684414269283463, + "loss": 0.9128, + "step": 3726 + }, + { + "epoch": 0.6636396011396012, + "grad_norm": 0.5305442810058594, + "learning_rate": 0.0001868372020104657, + "loss": 1.1766, + "step": 3727 + }, + { + "epoch": 0.6638176638176638, + "grad_norm": 0.6129274368286133, + "learning_rate": 0.0001868302596267022, + "loss": 1.04, + "step": 3728 + }, + { + "epoch": 0.6639957264957265, + "grad_norm": 0.5530399084091187, + "learning_rate": 0.00018682331554168013, + "loss": 1.4114, + "step": 3729 + }, + { + "epoch": 0.6641737891737892, + "grad_norm": 0.5397193431854248, + "learning_rate": 0.00018681636975553557, + "loss": 1.1945, + "step": 3730 + }, + { + "epoch": 0.6643518518518519, + "grad_norm": 0.5510205030441284, + "learning_rate": 0.00018680942226840456, + "loss": 1.0489, + "step": 3731 + }, + { + "epoch": 0.6645299145299145, + "grad_norm": 0.5519221425056458, + "learning_rate": 0.00018680247308042324, + "loss": 1.1633, + "step": 3732 + }, + { + "epoch": 0.6647079772079773, + "grad_norm": 0.4848768711090088, + "learning_rate": 0.00018679552219172784, + "loss": 0.8716, + "step": 3733 + }, + { + "epoch": 0.6648860398860399, + "grad_norm": 0.5490246415138245, + "learning_rate": 0.0001867885696024544, + "loss": 1.1347, + "step": 3734 + }, + { + "epoch": 
0.6650641025641025, + "grad_norm": 0.5281458497047424, + "learning_rate": 0.00018678161531273928, + "loss": 1.0987, + "step": 3735 + }, + { + "epoch": 0.6652421652421653, + "grad_norm": 0.5313079953193665, + "learning_rate": 0.00018677465932271867, + "loss": 0.9705, + "step": 3736 + }, + { + "epoch": 0.6654202279202279, + "grad_norm": 0.5425750017166138, + "learning_rate": 0.0001867677016325289, + "loss": 1.1847, + "step": 3737 + }, + { + "epoch": 0.6655982905982906, + "grad_norm": 0.5796298980712891, + "learning_rate": 0.0001867607422423062, + "loss": 1.2639, + "step": 3738 + }, + { + "epoch": 0.6657763532763533, + "grad_norm": 0.49738675355911255, + "learning_rate": 0.00018675378115218702, + "loss": 1.0536, + "step": 3739 + }, + { + "epoch": 0.665954415954416, + "grad_norm": 0.665250301361084, + "learning_rate": 0.0001867468183623077, + "loss": 1.2836, + "step": 3740 + }, + { + "epoch": 0.6661324786324786, + "grad_norm": 0.5184717178344727, + "learning_rate": 0.00018673985387280469, + "loss": 1.0497, + "step": 3741 + }, + { + "epoch": 0.6663105413105413, + "grad_norm": 0.5129656791687012, + "learning_rate": 0.00018673288768381442, + "loss": 1.2041, + "step": 3742 + }, + { + "epoch": 0.666488603988604, + "grad_norm": 0.5308768153190613, + "learning_rate": 0.00018672591979547337, + "loss": 1.2092, + "step": 3743 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.5059141516685486, + "learning_rate": 0.00018671895020791812, + "loss": 1.1929, + "step": 3744 + }, + { + "epoch": 0.6668447293447294, + "grad_norm": 0.5237857103347778, + "learning_rate": 0.00018671197892128517, + "loss": 1.2538, + "step": 3745 + }, + { + "epoch": 0.667022792022792, + "grad_norm": 0.450000137090683, + "learning_rate": 0.0001867050059357111, + "loss": 0.7138, + "step": 3746 + }, + { + "epoch": 0.6672008547008547, + "grad_norm": 0.5413795709609985, + "learning_rate": 0.00018669803125133258, + "loss": 1.1383, + "step": 3747 + }, + { + "epoch": 0.6673789173789174, + "grad_norm": 
0.4657825529575348, + "learning_rate": 0.00018669105486828622, + "loss": 1.0518, + "step": 3748 + }, + { + "epoch": 0.66755698005698, + "grad_norm": 0.6198551654815674, + "learning_rate": 0.00018668407678670875, + "loss": 1.2697, + "step": 3749 + }, + { + "epoch": 0.6677350427350427, + "grad_norm": 0.5112186074256897, + "learning_rate": 0.00018667709700673685, + "loss": 0.9907, + "step": 3750 + }, + { + "epoch": 0.6679131054131054, + "grad_norm": 0.5446593761444092, + "learning_rate": 0.00018667011552850728, + "loss": 1.0708, + "step": 3751 + }, + { + "epoch": 0.6680911680911681, + "grad_norm": 0.5673866271972656, + "learning_rate": 0.00018666313235215682, + "loss": 1.05, + "step": 3752 + }, + { + "epoch": 0.6682692307692307, + "grad_norm": 0.4821988046169281, + "learning_rate": 0.00018665614747782235, + "loss": 1.0543, + "step": 3753 + }, + { + "epoch": 0.6684472934472935, + "grad_norm": 0.5158842206001282, + "learning_rate": 0.00018664916090564067, + "loss": 1.0331, + "step": 3754 + }, + { + "epoch": 0.6686253561253561, + "grad_norm": 0.45486921072006226, + "learning_rate": 0.00018664217263574865, + "loss": 0.9262, + "step": 3755 + }, + { + "epoch": 0.6688034188034188, + "grad_norm": 0.46193036437034607, + "learning_rate": 0.00018663518266828327, + "loss": 0.9858, + "step": 3756 + }, + { + "epoch": 0.6689814814814815, + "grad_norm": 0.5144094824790955, + "learning_rate": 0.00018662819100338148, + "loss": 1.0302, + "step": 3757 + }, + { + "epoch": 0.6691595441595442, + "grad_norm": 0.5246134400367737, + "learning_rate": 0.0001866211976411802, + "loss": 1.064, + "step": 3758 + }, + { + "epoch": 0.6693376068376068, + "grad_norm": 0.4853166937828064, + "learning_rate": 0.0001866142025818165, + "loss": 0.9481, + "step": 3759 + }, + { + "epoch": 0.6695156695156695, + "grad_norm": 0.5029586553573608, + "learning_rate": 0.00018660720582542743, + "loss": 0.9443, + "step": 3760 + }, + { + "epoch": 0.6696937321937322, + "grad_norm": 0.5373172163963318, + "learning_rate": 
0.0001866002073721501, + "loss": 1.1401, + "step": 3761 + }, + { + "epoch": 0.6698717948717948, + "grad_norm": 0.6236287951469421, + "learning_rate": 0.00018659320722212158, + "loss": 1.1255, + "step": 3762 + }, + { + "epoch": 0.6700498575498576, + "grad_norm": 0.5470684766769409, + "learning_rate": 0.00018658620537547903, + "loss": 1.0622, + "step": 3763 + }, + { + "epoch": 0.6702279202279202, + "grad_norm": 0.63177090883255, + "learning_rate": 0.00018657920183235964, + "loss": 0.9736, + "step": 3764 + }, + { + "epoch": 0.6704059829059829, + "grad_norm": 0.5456309914588928, + "learning_rate": 0.00018657219659290068, + "loss": 1.027, + "step": 3765 + }, + { + "epoch": 0.6705840455840456, + "grad_norm": 0.4816138744354248, + "learning_rate": 0.00018656518965723935, + "loss": 0.7801, + "step": 3766 + }, + { + "epoch": 0.6707621082621082, + "grad_norm": 0.4811640679836273, + "learning_rate": 0.00018655818102551294, + "loss": 1.0535, + "step": 3767 + }, + { + "epoch": 0.6709401709401709, + "grad_norm": 0.4677673280239105, + "learning_rate": 0.00018655117069785884, + "loss": 1.1043, + "step": 3768 + }, + { + "epoch": 0.6711182336182336, + "grad_norm": 0.5628635883331299, + "learning_rate": 0.0001865441586744143, + "loss": 1.0392, + "step": 3769 + }, + { + "epoch": 0.6712962962962963, + "grad_norm": 0.5484504103660583, + "learning_rate": 0.00018653714495531673, + "loss": 1.1533, + "step": 3770 + }, + { + "epoch": 0.6714743589743589, + "grad_norm": 0.5830571055412292, + "learning_rate": 0.0001865301295407036, + "loss": 1.2479, + "step": 3771 + }, + { + "epoch": 0.6716524216524217, + "grad_norm": 0.5516841411590576, + "learning_rate": 0.00018652311243071235, + "loss": 1.2152, + "step": 3772 + }, + { + "epoch": 0.6718304843304843, + "grad_norm": 0.6360766291618347, + "learning_rate": 0.0001865160936254804, + "loss": 1.0752, + "step": 3773 + }, + { + "epoch": 0.6720085470085471, + "grad_norm": 0.6038610935211182, + "learning_rate": 0.00018650907312514533, + "loss": 1.2425, + 
"step": 3774 + }, + { + "epoch": 0.6721866096866097, + "grad_norm": 0.49572908878326416, + "learning_rate": 0.0001865020509298447, + "loss": 1.0057, + "step": 3775 + }, + { + "epoch": 0.6723646723646723, + "grad_norm": 0.4551616311073303, + "learning_rate": 0.00018649502703971607, + "loss": 1.0763, + "step": 3776 + }, + { + "epoch": 0.6725427350427351, + "grad_norm": 0.6621482372283936, + "learning_rate": 0.00018648800145489706, + "loss": 1.0306, + "step": 3777 + }, + { + "epoch": 0.6727207977207977, + "grad_norm": 0.5523806810379028, + "learning_rate": 0.0001864809741755253, + "loss": 0.9906, + "step": 3778 + }, + { + "epoch": 0.6728988603988604, + "grad_norm": 0.5527048110961914, + "learning_rate": 0.00018647394520173856, + "loss": 1.0734, + "step": 3779 + }, + { + "epoch": 0.6730769230769231, + "grad_norm": 0.573573887348175, + "learning_rate": 0.00018646691453367444, + "loss": 1.1409, + "step": 3780 + }, + { + "epoch": 0.6732549857549858, + "grad_norm": 0.6273239254951477, + "learning_rate": 0.00018645988217147079, + "loss": 0.9682, + "step": 3781 + }, + { + "epoch": 0.6734330484330484, + "grad_norm": 0.4917762279510498, + "learning_rate": 0.00018645284811526534, + "loss": 0.9681, + "step": 3782 + }, + { + "epoch": 0.6736111111111112, + "grad_norm": 0.4901154339313507, + "learning_rate": 0.0001864458123651959, + "loss": 1.1828, + "step": 3783 + }, + { + "epoch": 0.6737891737891738, + "grad_norm": 0.6292546391487122, + "learning_rate": 0.00018643877492140036, + "loss": 1.1987, + "step": 3784 + }, + { + "epoch": 0.6739672364672364, + "grad_norm": 0.5334137678146362, + "learning_rate": 0.0001864317357840166, + "loss": 1.0347, + "step": 3785 + }, + { + "epoch": 0.6741452991452992, + "grad_norm": 0.6064338684082031, + "learning_rate": 0.0001864246949531825, + "loss": 1.4154, + "step": 3786 + }, + { + "epoch": 0.6743233618233618, + "grad_norm": 0.5442034602165222, + "learning_rate": 0.000186417652429036, + "loss": 1.2604, + "step": 3787 + }, + { + "epoch": 
0.6745014245014245, + "grad_norm": 0.490858793258667, + "learning_rate": 0.00018641060821171518, + "loss": 1.1511, + "step": 3788 + }, + { + "epoch": 0.6746794871794872, + "grad_norm": 0.571116030216217, + "learning_rate": 0.00018640356230135798, + "loss": 1.1479, + "step": 3789 + }, + { + "epoch": 0.6748575498575499, + "grad_norm": 0.4857785105705261, + "learning_rate": 0.00018639651469810247, + "loss": 0.9, + "step": 3790 + }, + { + "epoch": 0.6750356125356125, + "grad_norm": 0.5320703983306885, + "learning_rate": 0.0001863894654020867, + "loss": 1.2284, + "step": 3791 + }, + { + "epoch": 0.6752136752136753, + "grad_norm": 0.5586925745010376, + "learning_rate": 0.0001863824144134488, + "loss": 1.1183, + "step": 3792 + }, + { + "epoch": 0.6753917378917379, + "grad_norm": 0.47740885615348816, + "learning_rate": 0.000186375361732327, + "loss": 1.1512, + "step": 3793 + }, + { + "epoch": 0.6755698005698005, + "grad_norm": 0.5867732167243958, + "learning_rate": 0.00018636830735885935, + "loss": 1.1903, + "step": 3794 + }, + { + "epoch": 0.6757478632478633, + "grad_norm": 0.5013887882232666, + "learning_rate": 0.0001863612512931842, + "loss": 0.8581, + "step": 3795 + }, + { + "epoch": 0.6759259259259259, + "grad_norm": 0.6026871204376221, + "learning_rate": 0.0001863541935354397, + "loss": 0.9581, + "step": 3796 + }, + { + "epoch": 0.6761039886039886, + "grad_norm": 0.5238468647003174, + "learning_rate": 0.00018634713408576415, + "loss": 1.0949, + "step": 3797 + }, + { + "epoch": 0.6762820512820513, + "grad_norm": 0.5128598213195801, + "learning_rate": 0.00018634007294429585, + "loss": 0.8992, + "step": 3798 + }, + { + "epoch": 0.676460113960114, + "grad_norm": 0.5092771053314209, + "learning_rate": 0.00018633301011117324, + "loss": 1.0793, + "step": 3799 + }, + { + "epoch": 0.6766381766381766, + "grad_norm": 0.592566728591919, + "learning_rate": 0.00018632594558653457, + "loss": 1.3242, + "step": 3800 + }, + { + "epoch": 0.6768162393162394, + "grad_norm": 
0.4953067898750305, + "learning_rate": 0.0001863188793705184, + "loss": 0.9925, + "step": 3801 + }, + { + "epoch": 0.676994301994302, + "grad_norm": 0.4989747107028961, + "learning_rate": 0.00018631181146326305, + "loss": 1.0677, + "step": 3802 + }, + { + "epoch": 0.6771723646723646, + "grad_norm": 0.5375261902809143, + "learning_rate": 0.00018630474186490705, + "loss": 1.0556, + "step": 3803 + }, + { + "epoch": 0.6773504273504274, + "grad_norm": 0.6512624025344849, + "learning_rate": 0.00018629767057558894, + "loss": 1.2041, + "step": 3804 + }, + { + "epoch": 0.67752849002849, + "grad_norm": 0.5428260564804077, + "learning_rate": 0.00018629059759544723, + "loss": 0.9645, + "step": 3805 + }, + { + "epoch": 0.6777065527065527, + "grad_norm": 0.5598662495613098, + "learning_rate": 0.00018628352292462052, + "loss": 1.1683, + "step": 3806 + }, + { + "epoch": 0.6778846153846154, + "grad_norm": 0.49351340532302856, + "learning_rate": 0.0001862764465632474, + "loss": 1.1622, + "step": 3807 + }, + { + "epoch": 0.6780626780626781, + "grad_norm": 0.4796701669692993, + "learning_rate": 0.00018626936851146657, + "loss": 1.0017, + "step": 3808 + }, + { + "epoch": 0.6782407407407407, + "grad_norm": 0.444533109664917, + "learning_rate": 0.00018626228876941664, + "loss": 0.9145, + "step": 3809 + }, + { + "epoch": 0.6784188034188035, + "grad_norm": 0.5197392702102661, + "learning_rate": 0.00018625520733723635, + "loss": 1.283, + "step": 3810 + }, + { + "epoch": 0.6785968660968661, + "grad_norm": 0.48785829544067383, + "learning_rate": 0.00018624812421506447, + "loss": 1.1084, + "step": 3811 + }, + { + "epoch": 0.6787749287749287, + "grad_norm": 0.5083680152893066, + "learning_rate": 0.00018624103940303974, + "loss": 0.9071, + "step": 3812 + }, + { + "epoch": 0.6789529914529915, + "grad_norm": 0.553819477558136, + "learning_rate": 0.00018623395290130103, + "loss": 0.9986, + "step": 3813 + }, + { + "epoch": 0.6791310541310541, + "grad_norm": 0.5347508788108826, + "learning_rate": 
0.00018622686470998713, + "loss": 1.0148, + "step": 3814 + }, + { + "epoch": 0.6793091168091168, + "grad_norm": 0.5080769062042236, + "learning_rate": 0.00018621977482923693, + "loss": 1.0169, + "step": 3815 + }, + { + "epoch": 0.6794871794871795, + "grad_norm": 0.5444077849388123, + "learning_rate": 0.00018621268325918938, + "loss": 1.172, + "step": 3816 + }, + { + "epoch": 0.6796652421652422, + "grad_norm": 0.521946132183075, + "learning_rate": 0.00018620558999998335, + "loss": 1.0247, + "step": 3817 + }, + { + "epoch": 0.6798433048433048, + "grad_norm": 0.5257413983345032, + "learning_rate": 0.00018619849505175786, + "loss": 1.1574, + "step": 3818 + }, + { + "epoch": 0.6800213675213675, + "grad_norm": 0.5473007559776306, + "learning_rate": 0.00018619139841465193, + "loss": 1.1254, + "step": 3819 + }, + { + "epoch": 0.6801994301994302, + "grad_norm": 0.5479872226715088, + "learning_rate": 0.00018618430008880463, + "loss": 1.0196, + "step": 3820 + }, + { + "epoch": 0.6803774928774928, + "grad_norm": 0.5918973088264465, + "learning_rate": 0.00018617720007435497, + "loss": 1.082, + "step": 3821 + }, + { + "epoch": 0.6805555555555556, + "grad_norm": 0.5411791801452637, + "learning_rate": 0.0001861700983714421, + "loss": 0.7723, + "step": 3822 + }, + { + "epoch": 0.6807336182336182, + "grad_norm": 0.5466326475143433, + "learning_rate": 0.00018616299498020516, + "loss": 1.0979, + "step": 3823 + }, + { + "epoch": 0.6809116809116809, + "grad_norm": 0.5405182838439941, + "learning_rate": 0.00018615588990078332, + "loss": 0.8891, + "step": 3824 + }, + { + "epoch": 0.6810897435897436, + "grad_norm": 0.5415780544281006, + "learning_rate": 0.00018614878313331579, + "loss": 1.0927, + "step": 3825 + }, + { + "epoch": 0.6812678062678063, + "grad_norm": 0.5284909605979919, + "learning_rate": 0.00018614167467794182, + "loss": 1.0684, + "step": 3826 + }, + { + "epoch": 0.6814458689458689, + "grad_norm": 0.4873995780944824, + "learning_rate": 0.00018613456453480062, + "loss": 
1.1653, + "step": 3827 + }, + { + "epoch": 0.6816239316239316, + "grad_norm": 0.5506551265716553, + "learning_rate": 0.0001861274527040316, + "loss": 0.9876, + "step": 3828 + }, + { + "epoch": 0.6818019943019943, + "grad_norm": 0.5031297206878662, + "learning_rate": 0.0001861203391857741, + "loss": 1.067, + "step": 3829 + }, + { + "epoch": 0.6819800569800569, + "grad_norm": 0.622346043586731, + "learning_rate": 0.0001861132239801674, + "loss": 1.1514, + "step": 3830 + }, + { + "epoch": 0.6821581196581197, + "grad_norm": 0.47706183791160583, + "learning_rate": 0.000186106107087351, + "loss": 0.9857, + "step": 3831 + }, + { + "epoch": 0.6823361823361823, + "grad_norm": 0.5082845091819763, + "learning_rate": 0.00018609898850746424, + "loss": 1.123, + "step": 3832 + }, + { + "epoch": 0.6825142450142451, + "grad_norm": 0.5119805932044983, + "learning_rate": 0.00018609186824064671, + "loss": 1.1386, + "step": 3833 + }, + { + "epoch": 0.6826923076923077, + "grad_norm": 0.5247541069984436, + "learning_rate": 0.00018608474628703788, + "loss": 0.9433, + "step": 3834 + }, + { + "epoch": 0.6828703703703703, + "grad_norm": 0.4618282616138458, + "learning_rate": 0.00018607762264677722, + "loss": 0.8727, + "step": 3835 + }, + { + "epoch": 0.6830484330484331, + "grad_norm": 0.6014040112495422, + "learning_rate": 0.00018607049732000436, + "loss": 1.1823, + "step": 3836 + }, + { + "epoch": 0.6832264957264957, + "grad_norm": 0.6489043831825256, + "learning_rate": 0.00018606337030685892, + "loss": 1.1466, + "step": 3837 + }, + { + "epoch": 0.6834045584045584, + "grad_norm": 0.5527763366699219, + "learning_rate": 0.00018605624160748053, + "loss": 1.3015, + "step": 3838 + }, + { + "epoch": 0.6835826210826211, + "grad_norm": 0.5628284215927124, + "learning_rate": 0.0001860491112220088, + "loss": 1.1504, + "step": 3839 + }, + { + "epoch": 0.6837606837606838, + "grad_norm": 0.5414566993713379, + "learning_rate": 0.00018604197915058355, + "loss": 1.0155, + "step": 3840 + }, + { + "epoch": 
0.6839387464387464, + "grad_norm": 0.5378929376602173, + "learning_rate": 0.00018603484539334443, + "loss": 0.8917, + "step": 3841 + }, + { + "epoch": 0.6841168091168092, + "grad_norm": 0.5953748822212219, + "learning_rate": 0.00018602770995043125, + "loss": 1.1971, + "step": 3842 + }, + { + "epoch": 0.6842948717948718, + "grad_norm": 0.511813759803772, + "learning_rate": 0.00018602057282198376, + "loss": 1.1345, + "step": 3843 + }, + { + "epoch": 0.6844729344729344, + "grad_norm": 0.5145484209060669, + "learning_rate": 0.00018601343400814185, + "loss": 1.0786, + "step": 3844 + }, + { + "epoch": 0.6846509971509972, + "grad_norm": 0.5199604034423828, + "learning_rate": 0.00018600629350904542, + "loss": 1.2063, + "step": 3845 + }, + { + "epoch": 0.6848290598290598, + "grad_norm": 0.5653825998306274, + "learning_rate": 0.0001859991513248343, + "loss": 1.0314, + "step": 3846 + }, + { + "epoch": 0.6850071225071225, + "grad_norm": 0.5660843849182129, + "learning_rate": 0.00018599200745564843, + "loss": 1.2754, + "step": 3847 + }, + { + "epoch": 0.6851851851851852, + "grad_norm": 0.5225719809532166, + "learning_rate": 0.00018598486190162788, + "loss": 1.0837, + "step": 3848 + }, + { + "epoch": 0.6853632478632479, + "grad_norm": 0.5011669397354126, + "learning_rate": 0.00018597771466291252, + "loss": 1.1, + "step": 3849 + }, + { + "epoch": 0.6855413105413105, + "grad_norm": 0.5923115015029907, + "learning_rate": 0.00018597056573964245, + "loss": 1.1875, + "step": 3850 + }, + { + "epoch": 0.6857193732193733, + "grad_norm": 0.5666482448577881, + "learning_rate": 0.00018596341513195776, + "loss": 1.1663, + "step": 3851 + }, + { + "epoch": 0.6858974358974359, + "grad_norm": 0.5396790504455566, + "learning_rate": 0.0001859562628399985, + "loss": 1.1179, + "step": 3852 + }, + { + "epoch": 0.6860754985754985, + "grad_norm": 0.5709532499313354, + "learning_rate": 0.00018594910886390485, + "loss": 1.0369, + "step": 3853 + }, + { + "epoch": 0.6862535612535613, + "grad_norm": 
0.45524322986602783, + "learning_rate": 0.00018594195320381692, + "loss": 1.0171, + "step": 3854 + }, + { + "epoch": 0.6864316239316239, + "grad_norm": 0.6130724549293518, + "learning_rate": 0.00018593479585987498, + "loss": 1.1944, + "step": 3855 + }, + { + "epoch": 0.6866096866096866, + "grad_norm": 0.5079745054244995, + "learning_rate": 0.0001859276368322192, + "loss": 1.2567, + "step": 3856 + }, + { + "epoch": 0.6867877492877493, + "grad_norm": 0.49919846653938293, + "learning_rate": 0.00018592047612098992, + "loss": 0.9459, + "step": 3857 + }, + { + "epoch": 0.686965811965812, + "grad_norm": 0.5776857733726501, + "learning_rate": 0.00018591331372632734, + "loss": 1.2456, + "step": 3858 + }, + { + "epoch": 0.6871438746438746, + "grad_norm": 0.4740692377090454, + "learning_rate": 0.00018590614964837188, + "loss": 1.0401, + "step": 3859 + }, + { + "epoch": 0.6873219373219374, + "grad_norm": 0.5015742182731628, + "learning_rate": 0.00018589898388726389, + "loss": 1.2052, + "step": 3860 + }, + { + "epoch": 0.6875, + "grad_norm": 0.4819730818271637, + "learning_rate": 0.0001858918164431437, + "loss": 1.007, + "step": 3861 + }, + { + "epoch": 0.6876780626780626, + "grad_norm": 0.5510426163673401, + "learning_rate": 0.00018588464731615184, + "loss": 1.0123, + "step": 3862 + }, + { + "epoch": 0.6878561253561254, + "grad_norm": 0.4950829744338989, + "learning_rate": 0.00018587747650642867, + "loss": 1.033, + "step": 3863 + }, + { + "epoch": 0.688034188034188, + "grad_norm": 0.5278680920600891, + "learning_rate": 0.0001858703040141148, + "loss": 1.0912, + "step": 3864 + }, + { + "epoch": 0.6882122507122507, + "grad_norm": 0.6359158158302307, + "learning_rate": 0.00018586312983935068, + "loss": 1.2868, + "step": 3865 + }, + { + "epoch": 0.6883903133903134, + "grad_norm": 0.5098239183425903, + "learning_rate": 0.0001858559539822769, + "loss": 0.8364, + "step": 3866 + }, + { + "epoch": 0.6885683760683761, + "grad_norm": 0.5651038289070129, + "learning_rate": 
0.000185848776443034, + "loss": 1.1983, + "step": 3867 + }, + { + "epoch": 0.6887464387464387, + "grad_norm": 0.5305678248405457, + "learning_rate": 0.00018584159722176272, + "loss": 1.32, + "step": 3868 + }, + { + "epoch": 0.6889245014245015, + "grad_norm": 0.5481845140457153, + "learning_rate": 0.00018583441631860368, + "loss": 1.013, + "step": 3869 + }, + { + "epoch": 0.6891025641025641, + "grad_norm": 0.5214795470237732, + "learning_rate": 0.00018582723373369753, + "loss": 1.172, + "step": 3870 + }, + { + "epoch": 0.6892806267806267, + "grad_norm": 0.6282780766487122, + "learning_rate": 0.00018582004946718502, + "loss": 1.7304, + "step": 3871 + }, + { + "epoch": 0.6894586894586895, + "grad_norm": 0.5266988277435303, + "learning_rate": 0.0001858128635192069, + "loss": 1.1418, + "step": 3872 + }, + { + "epoch": 0.6896367521367521, + "grad_norm": 0.4761001467704773, + "learning_rate": 0.000185805675889904, + "loss": 0.8585, + "step": 3873 + }, + { + "epoch": 0.6898148148148148, + "grad_norm": 0.528779923915863, + "learning_rate": 0.00018579848657941715, + "loss": 1.0036, + "step": 3874 + }, + { + "epoch": 0.6899928774928775, + "grad_norm": 0.5427684783935547, + "learning_rate": 0.00018579129558788716, + "loss": 0.9769, + "step": 3875 + }, + { + "epoch": 0.6901709401709402, + "grad_norm": 0.6229544281959534, + "learning_rate": 0.00018578410291545495, + "loss": 1.2848, + "step": 3876 + }, + { + "epoch": 0.6903490028490028, + "grad_norm": 0.6602693200111389, + "learning_rate": 0.00018577690856226147, + "loss": 1.2713, + "step": 3877 + }, + { + "epoch": 0.6905270655270656, + "grad_norm": 0.45884042978286743, + "learning_rate": 0.0001857697125284476, + "loss": 0.9143, + "step": 3878 + }, + { + "epoch": 0.6907051282051282, + "grad_norm": 0.4956444203853607, + "learning_rate": 0.00018576251481415443, + "loss": 0.9646, + "step": 3879 + }, + { + "epoch": 0.6908831908831908, + "grad_norm": 0.473561555147171, + "learning_rate": 0.00018575531541952292, + "loss": 0.843, + 
"step": 3880 + }, + { + "epoch": 0.6910612535612536, + "grad_norm": 0.4676312506198883, + "learning_rate": 0.00018574811434469415, + "loss": 0.9464, + "step": 3881 + }, + { + "epoch": 0.6912393162393162, + "grad_norm": 0.5452045202255249, + "learning_rate": 0.00018574091158980922, + "loss": 0.985, + "step": 3882 + }, + { + "epoch": 0.6914173789173789, + "grad_norm": 0.6274946331977844, + "learning_rate": 0.0001857337071550092, + "loss": 1.0357, + "step": 3883 + }, + { + "epoch": 0.6915954415954416, + "grad_norm": 0.5533788800239563, + "learning_rate": 0.00018572650104043531, + "loss": 1.2636, + "step": 3884 + }, + { + "epoch": 0.6917735042735043, + "grad_norm": 0.48312318325042725, + "learning_rate": 0.00018571929324622872, + "loss": 1.2402, + "step": 3885 + }, + { + "epoch": 0.6919515669515669, + "grad_norm": 0.6087453961372375, + "learning_rate": 0.00018571208377253062, + "loss": 1.2961, + "step": 3886 + }, + { + "epoch": 0.6921296296296297, + "grad_norm": 0.49156486988067627, + "learning_rate": 0.00018570487261948234, + "loss": 0.9585, + "step": 3887 + }, + { + "epoch": 0.6923076923076923, + "grad_norm": 0.5200015902519226, + "learning_rate": 0.0001856976597872251, + "loss": 0.9274, + "step": 3888 + }, + { + "epoch": 0.6924857549857549, + "grad_norm": 0.5185118913650513, + "learning_rate": 0.0001856904452759002, + "loss": 1.0015, + "step": 3889 + }, + { + "epoch": 0.6926638176638177, + "grad_norm": 0.5859049558639526, + "learning_rate": 0.00018568322908564904, + "loss": 1.0959, + "step": 3890 + }, + { + "epoch": 0.6928418803418803, + "grad_norm": 0.5882301926612854, + "learning_rate": 0.00018567601121661302, + "loss": 1.3214, + "step": 3891 + }, + { + "epoch": 0.6930199430199431, + "grad_norm": 0.6475503444671631, + "learning_rate": 0.0001856687916689335, + "loss": 1.3265, + "step": 3892 + }, + { + "epoch": 0.6931980056980057, + "grad_norm": 0.46175432205200195, + "learning_rate": 0.000185661570442752, + "loss": 0.8547, + "step": 3893 + }, + { + "epoch": 
0.6933760683760684, + "grad_norm": 0.5362716913223267, + "learning_rate": 0.00018565434753820998, + "loss": 0.974, + "step": 3894 + }, + { + "epoch": 0.6935541310541311, + "grad_norm": 0.4317963719367981, + "learning_rate": 0.00018564712295544896, + "loss": 0.7653, + "step": 3895 + }, + { + "epoch": 0.6937321937321937, + "grad_norm": 0.5679717659950256, + "learning_rate": 0.00018563989669461047, + "loss": 1.0691, + "step": 3896 + }, + { + "epoch": 0.6939102564102564, + "grad_norm": 0.5058363676071167, + "learning_rate": 0.00018563266875583608, + "loss": 1.0665, + "step": 3897 + }, + { + "epoch": 0.6940883190883191, + "grad_norm": 0.5365496277809143, + "learning_rate": 0.00018562543913926746, + "loss": 0.9963, + "step": 3898 + }, + { + "epoch": 0.6942663817663818, + "grad_norm": 0.49945300817489624, + "learning_rate": 0.0001856182078450462, + "loss": 0.8668, + "step": 3899 + }, + { + "epoch": 0.6944444444444444, + "grad_norm": 0.5869430899620056, + "learning_rate": 0.00018561097487331405, + "loss": 1.1942, + "step": 3900 + }, + { + "epoch": 0.6946225071225072, + "grad_norm": 0.5188950300216675, + "learning_rate": 0.0001856037402242127, + "loss": 0.9493, + "step": 3901 + }, + { + "epoch": 0.6948005698005698, + "grad_norm": 0.510788083076477, + "learning_rate": 0.00018559650389788384, + "loss": 0.9989, + "step": 3902 + }, + { + "epoch": 0.6949786324786325, + "grad_norm": 0.5360601544380188, + "learning_rate": 0.0001855892658944693, + "loss": 1.2766, + "step": 3903 + }, + { + "epoch": 0.6951566951566952, + "grad_norm": 0.522502601146698, + "learning_rate": 0.00018558202621411093, + "loss": 0.8774, + "step": 3904 + }, + { + "epoch": 0.6953347578347578, + "grad_norm": 0.5330635905265808, + "learning_rate": 0.00018557478485695052, + "loss": 0.972, + "step": 3905 + }, + { + "epoch": 0.6955128205128205, + "grad_norm": 0.5387479066848755, + "learning_rate": 0.00018556754182312996, + "loss": 1.0574, + "step": 3906 + }, + { + "epoch": 0.6956908831908832, + "grad_norm": 
0.5357984900474548, + "learning_rate": 0.00018556029711279116, + "loss": 1.396, + "step": 3907 + }, + { + "epoch": 0.6958689458689459, + "grad_norm": 0.5647178292274475, + "learning_rate": 0.00018555305072607612, + "loss": 1.3304, + "step": 3908 + }, + { + "epoch": 0.6960470085470085, + "grad_norm": 0.46460914611816406, + "learning_rate": 0.00018554580266312673, + "loss": 0.9574, + "step": 3909 + }, + { + "epoch": 0.6962250712250713, + "grad_norm": 0.6206206679344177, + "learning_rate": 0.00018553855292408503, + "loss": 1.1637, + "step": 3910 + }, + { + "epoch": 0.6964031339031339, + "grad_norm": 0.5899842977523804, + "learning_rate": 0.00018553130150909312, + "loss": 1.1067, + "step": 3911 + }, + { + "epoch": 0.6965811965811965, + "grad_norm": 0.47294262051582336, + "learning_rate": 0.000185524048418293, + "loss": 1.1516, + "step": 3912 + }, + { + "epoch": 0.6967592592592593, + "grad_norm": 0.5791197419166565, + "learning_rate": 0.00018551679365182684, + "loss": 1.0007, + "step": 3913 + }, + { + "epoch": 0.6969373219373219, + "grad_norm": 0.5678651332855225, + "learning_rate": 0.00018550953720983672, + "loss": 1.2698, + "step": 3914 + }, + { + "epoch": 0.6971153846153846, + "grad_norm": 0.6509683728218079, + "learning_rate": 0.0001855022790924649, + "loss": 1.0354, + "step": 3915 + }, + { + "epoch": 0.6972934472934473, + "grad_norm": 0.5176648497581482, + "learning_rate": 0.0001854950192998535, + "loss": 1.1243, + "step": 3916 + }, + { + "epoch": 0.69747150997151, + "grad_norm": 0.520631730556488, + "learning_rate": 0.00018548775783214477, + "loss": 1.1371, + "step": 3917 + }, + { + "epoch": 0.6976495726495726, + "grad_norm": 0.5408333539962769, + "learning_rate": 0.00018548049468948108, + "loss": 1.1185, + "step": 3918 + }, + { + "epoch": 0.6978276353276354, + "grad_norm": 0.5423790216445923, + "learning_rate": 0.00018547322987200461, + "loss": 1.1539, + "step": 3919 + }, + { + "epoch": 0.698005698005698, + "grad_norm": 0.5422113537788391, + "learning_rate": 
0.0001854659633798578, + "loss": 1.171, + "step": 3920 + }, + { + "epoch": 0.6981837606837606, + "grad_norm": 0.5113416314125061, + "learning_rate": 0.00018545869521318292, + "loss": 1.0597, + "step": 3921 + }, + { + "epoch": 0.6983618233618234, + "grad_norm": 0.49901214241981506, + "learning_rate": 0.00018545142537212248, + "loss": 1.1043, + "step": 3922 + }, + { + "epoch": 0.698539886039886, + "grad_norm": 0.6606622338294983, + "learning_rate": 0.00018544415385681885, + "loss": 1.1797, + "step": 3923 + }, + { + "epoch": 0.6987179487179487, + "grad_norm": 0.4786234498023987, + "learning_rate": 0.00018543688066741454, + "loss": 0.9532, + "step": 3924 + }, + { + "epoch": 0.6988960113960114, + "grad_norm": 0.5900700688362122, + "learning_rate": 0.00018542960580405203, + "loss": 1.1171, + "step": 3925 + }, + { + "epoch": 0.6990740740740741, + "grad_norm": 0.53485506772995, + "learning_rate": 0.00018542232926687383, + "loss": 1.1535, + "step": 3926 + }, + { + "epoch": 0.6992521367521367, + "grad_norm": 0.5269177556037903, + "learning_rate": 0.00018541505105602255, + "loss": 1.0287, + "step": 3927 + }, + { + "epoch": 0.6994301994301995, + "grad_norm": 0.5185505151748657, + "learning_rate": 0.0001854077711716408, + "loss": 1.2526, + "step": 3928 + }, + { + "epoch": 0.6996082621082621, + "grad_norm": 0.5615512132644653, + "learning_rate": 0.00018540048961387115, + "loss": 1.0189, + "step": 3929 + }, + { + "epoch": 0.6997863247863247, + "grad_norm": 0.4492493271827698, + "learning_rate": 0.00018539320638285637, + "loss": 0.8917, + "step": 3930 + }, + { + "epoch": 0.6999643874643875, + "grad_norm": 0.5062302947044373, + "learning_rate": 0.00018538592147873906, + "loss": 1.053, + "step": 3931 + }, + { + "epoch": 0.7001424501424501, + "grad_norm": 0.5508798956871033, + "learning_rate": 0.000185378634901662, + "loss": 0.9638, + "step": 3932 + }, + { + "epoch": 0.7003205128205128, + "grad_norm": 0.463980108499527, + "learning_rate": 0.00018537134665176793, + "loss": 1.0945, + 
"step": 3933 + }, + { + "epoch": 0.7004985754985755, + "grad_norm": 0.5027088522911072, + "learning_rate": 0.0001853640567291997, + "loss": 1.1745, + "step": 3934 + }, + { + "epoch": 0.7006766381766382, + "grad_norm": 0.5006551146507263, + "learning_rate": 0.00018535676513410009, + "loss": 0.8521, + "step": 3935 + }, + { + "epoch": 0.7008547008547008, + "grad_norm": 0.5870724320411682, + "learning_rate": 0.000185349471866612, + "loss": 0.9197, + "step": 3936 + }, + { + "epoch": 0.7010327635327636, + "grad_norm": 0.5030696392059326, + "learning_rate": 0.00018534217692687825, + "loss": 1.1049, + "step": 3937 + }, + { + "epoch": 0.7012108262108262, + "grad_norm": 0.5212681889533997, + "learning_rate": 0.00018533488031504186, + "loss": 1.3397, + "step": 3938 + }, + { + "epoch": 0.7013888888888888, + "grad_norm": 0.5649709105491638, + "learning_rate": 0.0001853275820312458, + "loss": 1.1994, + "step": 3939 + }, + { + "epoch": 0.7015669515669516, + "grad_norm": 0.4892779290676117, + "learning_rate": 0.00018532028207563297, + "loss": 1.1511, + "step": 3940 + }, + { + "epoch": 0.7017450142450142, + "grad_norm": 0.4929407835006714, + "learning_rate": 0.00018531298044834643, + "loss": 1.0792, + "step": 3941 + }, + { + "epoch": 0.7019230769230769, + "grad_norm": 0.5645940899848938, + "learning_rate": 0.00018530567714952932, + "loss": 1.0937, + "step": 3942 + }, + { + "epoch": 0.7021011396011396, + "grad_norm": 0.5471178293228149, + "learning_rate": 0.00018529837217932466, + "loss": 1.193, + "step": 3943 + }, + { + "epoch": 0.7022792022792023, + "grad_norm": 0.576627790927887, + "learning_rate": 0.00018529106553787558, + "loss": 1.1032, + "step": 3944 + }, + { + "epoch": 0.7024572649572649, + "grad_norm": 0.5015735626220703, + "learning_rate": 0.00018528375722532526, + "loss": 1.066, + "step": 3945 + }, + { + "epoch": 0.7026353276353277, + "grad_norm": 0.5315404534339905, + "learning_rate": 0.00018527644724181683, + "loss": 1.2059, + "step": 3946 + }, + { + "epoch": 
0.7028133903133903, + "grad_norm": 0.5516065955162048, + "learning_rate": 0.0001852691355874936, + "loss": 1.161, + "step": 3947 + }, + { + "epoch": 0.7029914529914529, + "grad_norm": 0.5026212930679321, + "learning_rate": 0.0001852618222624988, + "loss": 1.2616, + "step": 3948 + }, + { + "epoch": 0.7031695156695157, + "grad_norm": 0.49874603748321533, + "learning_rate": 0.0001852545072669757, + "loss": 0.805, + "step": 3949 + }, + { + "epoch": 0.7033475783475783, + "grad_norm": 0.47698748111724854, + "learning_rate": 0.00018524719060106763, + "loss": 1.2321, + "step": 3950 + }, + { + "epoch": 0.7035256410256411, + "grad_norm": 0.5201322436332703, + "learning_rate": 0.00018523987226491792, + "loss": 1.1577, + "step": 3951 + }, + { + "epoch": 0.7037037037037037, + "grad_norm": 0.5506543517112732, + "learning_rate": 0.00018523255225867002, + "loss": 1.2289, + "step": 3952 + }, + { + "epoch": 0.7038817663817664, + "grad_norm": 0.5691256523132324, + "learning_rate": 0.0001852252305824673, + "loss": 1.1945, + "step": 3953 + }, + { + "epoch": 0.7040598290598291, + "grad_norm": 0.5324838757514954, + "learning_rate": 0.00018521790723645322, + "loss": 1.1037, + "step": 3954 + }, + { + "epoch": 0.7042378917378918, + "grad_norm": 0.5238786339759827, + "learning_rate": 0.00018521058222077127, + "loss": 1.2075, + "step": 3955 + }, + { + "epoch": 0.7044159544159544, + "grad_norm": 0.4936453402042389, + "learning_rate": 0.00018520325553556498, + "loss": 1.0537, + "step": 3956 + }, + { + "epoch": 0.7045940170940171, + "grad_norm": 0.6198282837867737, + "learning_rate": 0.00018519592718097791, + "loss": 1.0728, + "step": 3957 + }, + { + "epoch": 0.7047720797720798, + "grad_norm": 0.44729140400886536, + "learning_rate": 0.0001851885971571536, + "loss": 0.8432, + "step": 3958 + }, + { + "epoch": 0.7049501424501424, + "grad_norm": 0.5884211659431458, + "learning_rate": 0.00018518126546423572, + "loss": 0.9515, + "step": 3959 + }, + { + "epoch": 0.7051282051282052, + "grad_norm": 
0.5293807983398438, + "learning_rate": 0.00018517393210236788, + "loss": 1.1178, + "step": 3960 + }, + { + "epoch": 0.7053062678062678, + "grad_norm": 0.6036825180053711, + "learning_rate": 0.00018516659707169374, + "loss": 1.0408, + "step": 3961 + }, + { + "epoch": 0.7054843304843305, + "grad_norm": 0.5157122015953064, + "learning_rate": 0.0001851592603723571, + "loss": 1.2136, + "step": 3962 + }, + { + "epoch": 0.7056623931623932, + "grad_norm": 0.5354781150817871, + "learning_rate": 0.00018515192200450163, + "loss": 0.7165, + "step": 3963 + }, + { + "epoch": 0.7058404558404558, + "grad_norm": 0.6073734760284424, + "learning_rate": 0.00018514458196827111, + "loss": 1.3079, + "step": 3964 + }, + { + "epoch": 0.7060185185185185, + "grad_norm": 0.4324839413166046, + "learning_rate": 0.0001851372402638094, + "loss": 0.7903, + "step": 3965 + }, + { + "epoch": 0.7061965811965812, + "grad_norm": 0.6530333161354065, + "learning_rate": 0.00018512989689126034, + "loss": 1.3179, + "step": 3966 + }, + { + "epoch": 0.7063746438746439, + "grad_norm": 0.5500404238700867, + "learning_rate": 0.00018512255185076782, + "loss": 1.0624, + "step": 3967 + }, + { + "epoch": 0.7065527065527065, + "grad_norm": 0.6277863383293152, + "learning_rate": 0.00018511520514247567, + "loss": 1.1056, + "step": 3968 + }, + { + "epoch": 0.7067307692307693, + "grad_norm": 0.580544650554657, + "learning_rate": 0.0001851078567665279, + "loss": 0.9849, + "step": 3969 + }, + { + "epoch": 0.7069088319088319, + "grad_norm": 0.4880999028682709, + "learning_rate": 0.00018510050672306848, + "loss": 1.0185, + "step": 3970 + }, + { + "epoch": 0.7070868945868946, + "grad_norm": 0.4919959306716919, + "learning_rate": 0.0001850931550122414, + "loss": 1.0334, + "step": 3971 + }, + { + "epoch": 0.7072649572649573, + "grad_norm": 0.6001213192939758, + "learning_rate": 0.0001850858016341907, + "loss": 1.0729, + "step": 3972 + }, + { + "epoch": 0.70744301994302, + "grad_norm": 0.538690447807312, + "learning_rate": 
0.00018507844658906052, + "loss": 1.0733, + "step": 3973 + }, + { + "epoch": 0.7076210826210826, + "grad_norm": 0.5427643656730652, + "learning_rate": 0.00018507108987699487, + "loss": 1.1207, + "step": 3974 + }, + { + "epoch": 0.7077991452991453, + "grad_norm": 0.43014347553253174, + "learning_rate": 0.00018506373149813795, + "loss": 0.7958, + "step": 3975 + }, + { + "epoch": 0.707977207977208, + "grad_norm": 0.56591796875, + "learning_rate": 0.00018505637145263394, + "loss": 1.2199, + "step": 3976 + }, + { + "epoch": 0.7081552706552706, + "grad_norm": 0.59147047996521, + "learning_rate": 0.000185049009740627, + "loss": 1.2354, + "step": 3977 + }, + { + "epoch": 0.7083333333333334, + "grad_norm": 0.5078346133232117, + "learning_rate": 0.00018504164636226137, + "loss": 0.976, + "step": 3978 + }, + { + "epoch": 0.708511396011396, + "grad_norm": 0.533302366733551, + "learning_rate": 0.00018503428131768135, + "loss": 0.9653, + "step": 3979 + }, + { + "epoch": 0.7086894586894587, + "grad_norm": 0.4985341727733612, + "learning_rate": 0.00018502691460703122, + "loss": 1.1485, + "step": 3980 + }, + { + "epoch": 0.7088675213675214, + "grad_norm": 0.5143141150474548, + "learning_rate": 0.00018501954623045532, + "loss": 1.148, + "step": 3981 + }, + { + "epoch": 0.709045584045584, + "grad_norm": 0.507189154624939, + "learning_rate": 0.00018501217618809804, + "loss": 0.9306, + "step": 3982 + }, + { + "epoch": 0.7092236467236467, + "grad_norm": 0.5246604084968567, + "learning_rate": 0.00018500480448010377, + "loss": 0.9116, + "step": 3983 + }, + { + "epoch": 0.7094017094017094, + "grad_norm": 0.5321049094200134, + "learning_rate": 0.00018499743110661693, + "loss": 0.9607, + "step": 3984 + }, + { + "epoch": 0.7095797720797721, + "grad_norm": 0.62645423412323, + "learning_rate": 0.000184990056067782, + "loss": 1.5834, + "step": 3985 + }, + { + "epoch": 0.7097578347578347, + "grad_norm": 0.486557275056839, + "learning_rate": 0.0001849826793637435, + "loss": 1.0598, + "step": 3986 
+ }, + { + "epoch": 0.7099358974358975, + "grad_norm": 0.5122783184051514, + "learning_rate": 0.0001849753009946459, + "loss": 1.2213, + "step": 3987 + }, + { + "epoch": 0.7101139601139601, + "grad_norm": 0.4864068627357483, + "learning_rate": 0.0001849679209606338, + "loss": 1.2708, + "step": 3988 + }, + { + "epoch": 0.7102920227920227, + "grad_norm": 0.5860990881919861, + "learning_rate": 0.00018496053926185183, + "loss": 1.2421, + "step": 3989 + }, + { + "epoch": 0.7104700854700855, + "grad_norm": 0.471194326877594, + "learning_rate": 0.00018495315589844453, + "loss": 0.879, + "step": 3990 + }, + { + "epoch": 0.7106481481481481, + "grad_norm": 0.5626323819160461, + "learning_rate": 0.00018494577087055662, + "loss": 1.1297, + "step": 3991 + }, + { + "epoch": 0.7108262108262108, + "grad_norm": 0.4706762135028839, + "learning_rate": 0.0001849383841783328, + "loss": 1.0444, + "step": 3992 + }, + { + "epoch": 0.7110042735042735, + "grad_norm": 0.5776444673538208, + "learning_rate": 0.00018493099582191783, + "loss": 1.1773, + "step": 3993 + }, + { + "epoch": 0.7111823361823362, + "grad_norm": 0.5493253469467163, + "learning_rate": 0.00018492360580145637, + "loss": 1.0354, + "step": 3994 + }, + { + "epoch": 0.7113603988603988, + "grad_norm": 0.5328514575958252, + "learning_rate": 0.0001849162141170933, + "loss": 0.9251, + "step": 3995 + }, + { + "epoch": 0.7115384615384616, + "grad_norm": 0.5814893841743469, + "learning_rate": 0.0001849088207689734, + "loss": 1.1066, + "step": 3996 + }, + { + "epoch": 0.7117165242165242, + "grad_norm": 0.5476071834564209, + "learning_rate": 0.00018490142575724154, + "loss": 1.1613, + "step": 3997 + }, + { + "epoch": 0.7118945868945868, + "grad_norm": 0.5216463208198547, + "learning_rate": 0.00018489402908204258, + "loss": 1.2574, + "step": 3998 + }, + { + "epoch": 0.7120726495726496, + "grad_norm": 0.5110020637512207, + "learning_rate": 0.00018488663074352153, + "loss": 1.0663, + "step": 3999 + }, + { + "epoch": 0.7122507122507122, + 
"grad_norm": 0.448090523481369, + "learning_rate": 0.00018487923074182326, + "loss": 0.6687, + "step": 4000 + }, + { + "epoch": 0.7124287749287749, + "grad_norm": 0.4980565011501312, + "learning_rate": 0.00018487182907709279, + "loss": 1.2365, + "step": 4001 + }, + { + "epoch": 0.7126068376068376, + "grad_norm": 0.485831081867218, + "learning_rate": 0.00018486442574947511, + "loss": 1.0941, + "step": 4002 + }, + { + "epoch": 0.7127849002849003, + "grad_norm": 0.4955040216445923, + "learning_rate": 0.00018485702075911534, + "loss": 1.248, + "step": 4003 + }, + { + "epoch": 0.7129629629629629, + "grad_norm": 0.5168375968933105, + "learning_rate": 0.00018484961410615845, + "loss": 1.1118, + "step": 4004 + }, + { + "epoch": 0.7131410256410257, + "grad_norm": 0.5255687832832336, + "learning_rate": 0.00018484220579074968, + "loss": 1.0558, + "step": 4005 + }, + { + "epoch": 0.7133190883190883, + "grad_norm": 0.5502219796180725, + "learning_rate": 0.00018483479581303416, + "loss": 1.1604, + "step": 4006 + }, + { + "epoch": 0.7134971509971509, + "grad_norm": 0.5155881643295288, + "learning_rate": 0.000184827384173157, + "loss": 0.8246, + "step": 4007 + }, + { + "epoch": 0.7136752136752137, + "grad_norm": 0.5321542024612427, + "learning_rate": 0.0001848199708712635, + "loss": 1.2058, + "step": 4008 + }, + { + "epoch": 0.7138532763532763, + "grad_norm": 0.4929848313331604, + "learning_rate": 0.00018481255590749884, + "loss": 1.4023, + "step": 4009 + }, + { + "epoch": 0.7140313390313391, + "grad_norm": 0.5070937871932983, + "learning_rate": 0.00018480513928200836, + "loss": 1.0561, + "step": 4010 + }, + { + "epoch": 0.7142094017094017, + "grad_norm": 0.5750083327293396, + "learning_rate": 0.00018479772099493728, + "loss": 1.0276, + "step": 4011 + }, + { + "epoch": 0.7143874643874644, + "grad_norm": 0.5265933275222778, + "learning_rate": 0.00018479030104643108, + "loss": 1.0295, + "step": 4012 + }, + { + "epoch": 0.7145655270655271, + "grad_norm": 0.526830792427063, + 
"learning_rate": 0.00018478287943663504, + "loss": 1.0157, + "step": 4013 + }, + { + "epoch": 0.7147435897435898, + "grad_norm": 0.5344091653823853, + "learning_rate": 0.00018477545616569458, + "loss": 1.1997, + "step": 4014 + }, + { + "epoch": 0.7149216524216524, + "grad_norm": 0.4935445189476013, + "learning_rate": 0.0001847680312337552, + "loss": 1.1858, + "step": 4015 + }, + { + "epoch": 0.7150997150997151, + "grad_norm": 0.5291212797164917, + "learning_rate": 0.0001847606046409623, + "loss": 0.926, + "step": 4016 + }, + { + "epoch": 0.7152777777777778, + "grad_norm": 0.559050977230072, + "learning_rate": 0.00018475317638746142, + "loss": 1.0947, + "step": 4017 + }, + { + "epoch": 0.7154558404558404, + "grad_norm": 0.4566570222377777, + "learning_rate": 0.00018474574647339814, + "loss": 1.0334, + "step": 4018 + }, + { + "epoch": 0.7156339031339032, + "grad_norm": 0.5156155824661255, + "learning_rate": 0.000184738314898918, + "loss": 1.0076, + "step": 4019 + }, + { + "epoch": 0.7158119658119658, + "grad_norm": 0.5008716583251953, + "learning_rate": 0.00018473088166416662, + "loss": 1.0378, + "step": 4020 + }, + { + "epoch": 0.7159900284900285, + "grad_norm": 0.49556368589401245, + "learning_rate": 0.0001847234467692896, + "loss": 1.15, + "step": 4021 + }, + { + "epoch": 0.7161680911680912, + "grad_norm": 0.5464680790901184, + "learning_rate": 0.00018471601021443265, + "loss": 1.2975, + "step": 4022 + }, + { + "epoch": 0.7163461538461539, + "grad_norm": 0.6291980147361755, + "learning_rate": 0.00018470857199974144, + "loss": 1.05, + "step": 4023 + }, + { + "epoch": 0.7165242165242165, + "grad_norm": 0.5566631555557251, + "learning_rate": 0.00018470113212536176, + "loss": 1.1296, + "step": 4024 + }, + { + "epoch": 0.7167022792022792, + "grad_norm": 0.5569562911987305, + "learning_rate": 0.00018469369059143933, + "loss": 1.2484, + "step": 4025 + }, + { + "epoch": 0.7168803418803419, + "grad_norm": 0.5804716944694519, + "learning_rate": 0.00018468624739812, + 
"loss": 1.0547, + "step": 4026 + }, + { + "epoch": 0.7170584045584045, + "grad_norm": 0.6316802501678467, + "learning_rate": 0.00018467880254554952, + "loss": 1.1188, + "step": 4027 + }, + { + "epoch": 0.7172364672364673, + "grad_norm": 0.6131419539451599, + "learning_rate": 0.00018467135603387385, + "loss": 1.1662, + "step": 4028 + }, + { + "epoch": 0.7174145299145299, + "grad_norm": 0.4703124761581421, + "learning_rate": 0.00018466390786323883, + "loss": 1.038, + "step": 4029 + }, + { + "epoch": 0.7175925925925926, + "grad_norm": 0.5718469023704529, + "learning_rate": 0.0001846564580337904, + "loss": 1.0786, + "step": 4030 + }, + { + "epoch": 0.7177706552706553, + "grad_norm": 0.5227612853050232, + "learning_rate": 0.00018464900654567457, + "loss": 1.0561, + "step": 4031 + }, + { + "epoch": 0.717948717948718, + "grad_norm": 0.5800358057022095, + "learning_rate": 0.00018464155339903727, + "loss": 1.0944, + "step": 4032 + }, + { + "epoch": 0.7181267806267806, + "grad_norm": 0.5562314987182617, + "learning_rate": 0.00018463409859402455, + "loss": 0.8573, + "step": 4033 + }, + { + "epoch": 0.7183048433048433, + "grad_norm": 0.6420153379440308, + "learning_rate": 0.0001846266421307825, + "loss": 1.088, + "step": 4034 + }, + { + "epoch": 0.718482905982906, + "grad_norm": 0.4745902717113495, + "learning_rate": 0.00018461918400945718, + "loss": 1.1679, + "step": 4035 + }, + { + "epoch": 0.7186609686609686, + "grad_norm": 0.5070300102233887, + "learning_rate": 0.00018461172423019475, + "loss": 1.1984, + "step": 4036 + }, + { + "epoch": 0.7188390313390314, + "grad_norm": 0.5339375138282776, + "learning_rate": 0.00018460426279314133, + "loss": 1.3038, + "step": 4037 + }, + { + "epoch": 0.719017094017094, + "grad_norm": 0.5947147607803345, + "learning_rate": 0.00018459679969844313, + "loss": 1.0103, + "step": 4038 + }, + { + "epoch": 0.7191951566951567, + "grad_norm": 0.5493791699409485, + "learning_rate": 0.00018458933494624642, + "loss": 1.1001, + "step": 4039 + }, + { + 
"epoch": 0.7193732193732194, + "grad_norm": 0.5700310468673706, + "learning_rate": 0.00018458186853669736, + "loss": 0.9006, + "step": 4040 + }, + { + "epoch": 0.719551282051282, + "grad_norm": 0.60371994972229, + "learning_rate": 0.0001845744004699423, + "loss": 1.3001, + "step": 4041 + }, + { + "epoch": 0.7197293447293447, + "grad_norm": 0.5469261407852173, + "learning_rate": 0.00018456693074612757, + "loss": 1.1745, + "step": 4042 + }, + { + "epoch": 0.7199074074074074, + "grad_norm": 0.5179165601730347, + "learning_rate": 0.00018455945936539947, + "loss": 0.9883, + "step": 4043 + }, + { + "epoch": 0.7200854700854701, + "grad_norm": 0.5396696329116821, + "learning_rate": 0.00018455198632790447, + "loss": 1.1277, + "step": 4044 + }, + { + "epoch": 0.7202635327635327, + "grad_norm": 0.4559909403324127, + "learning_rate": 0.00018454451163378888, + "loss": 0.9644, + "step": 4045 + }, + { + "epoch": 0.7204415954415955, + "grad_norm": 0.49863892793655396, + "learning_rate": 0.00018453703528319927, + "loss": 1.1276, + "step": 4046 + }, + { + "epoch": 0.7206196581196581, + "grad_norm": 0.4790710508823395, + "learning_rate": 0.000184529557276282, + "loss": 0.9443, + "step": 4047 + }, + { + "epoch": 0.7207977207977208, + "grad_norm": 0.541999876499176, + "learning_rate": 0.0001845220776131837, + "loss": 1.0681, + "step": 4048 + }, + { + "epoch": 0.7209757834757835, + "grad_norm": 0.5119109153747559, + "learning_rate": 0.00018451459629405088, + "loss": 1.2078, + "step": 4049 + }, + { + "epoch": 0.7211538461538461, + "grad_norm": 0.6141307353973389, + "learning_rate": 0.00018450711331903006, + "loss": 1.1071, + "step": 4050 + }, + { + "epoch": 0.7213319088319088, + "grad_norm": 0.48679864406585693, + "learning_rate": 0.00018449962868826795, + "loss": 0.9713, + "step": 4051 + }, + { + "epoch": 0.7215099715099715, + "grad_norm": 0.5548661947250366, + "learning_rate": 0.0001844921424019111, + "loss": 1.2099, + "step": 4052 + }, + { + "epoch": 0.7216880341880342, + "grad_norm": 
0.5000107884407043, + "learning_rate": 0.00018448465446010626, + "loss": 1.0184, + "step": 4053 + }, + { + "epoch": 0.7218660968660968, + "grad_norm": 0.6131454110145569, + "learning_rate": 0.00018447716486300013, + "loss": 1.2581, + "step": 4054 + }, + { + "epoch": 0.7220441595441596, + "grad_norm": 0.5145987868309021, + "learning_rate": 0.0001844696736107394, + "loss": 1.1646, + "step": 4055 + }, + { + "epoch": 0.7222222222222222, + "grad_norm": 0.4361337125301361, + "learning_rate": 0.00018446218070347094, + "loss": 0.8239, + "step": 4056 + }, + { + "epoch": 0.7224002849002849, + "grad_norm": 0.5549173355102539, + "learning_rate": 0.00018445468614134146, + "loss": 1.1935, + "step": 4057 + }, + { + "epoch": 0.7225783475783476, + "grad_norm": 0.5569297671318054, + "learning_rate": 0.00018444718992449789, + "loss": 1.0137, + "step": 4058 + }, + { + "epoch": 0.7227564102564102, + "grad_norm": 0.44866305589675903, + "learning_rate": 0.00018443969205308704, + "loss": 0.987, + "step": 4059 + }, + { + "epoch": 0.7229344729344729, + "grad_norm": 0.5142943263053894, + "learning_rate": 0.0001844321925272558, + "loss": 1.0837, + "step": 4060 + }, + { + "epoch": 0.7231125356125356, + "grad_norm": 0.4922119379043579, + "learning_rate": 0.0001844246913471512, + "loss": 0.8477, + "step": 4061 + }, + { + "epoch": 0.7232905982905983, + "grad_norm": 0.5245375633239746, + "learning_rate": 0.0001844171885129201, + "loss": 0.9985, + "step": 4062 + }, + { + "epoch": 0.7234686609686609, + "grad_norm": 0.45562678575515747, + "learning_rate": 0.00018440968402470956, + "loss": 0.8678, + "step": 4063 + }, + { + "epoch": 0.7236467236467237, + "grad_norm": 0.5388376712799072, + "learning_rate": 0.0001844021778826666, + "loss": 1.0586, + "step": 4064 + }, + { + "epoch": 0.7238247863247863, + "grad_norm": 0.48945263028144836, + "learning_rate": 0.00018439467008693833, + "loss": 1.0547, + "step": 4065 + }, + { + "epoch": 0.7240028490028491, + "grad_norm": 0.5202330350875854, + "learning_rate": 
0.00018438716063767178, + "loss": 1.3142, + "step": 4066 + }, + { + "epoch": 0.7241809116809117, + "grad_norm": 0.5432567000389099, + "learning_rate": 0.00018437964953501413, + "loss": 1.0192, + "step": 4067 + }, + { + "epoch": 0.7243589743589743, + "grad_norm": 0.5220325589179993, + "learning_rate": 0.00018437213677911253, + "loss": 1.0904, + "step": 4068 + }, + { + "epoch": 0.7245370370370371, + "grad_norm": 0.45711690187454224, + "learning_rate": 0.00018436462237011417, + "loss": 1.0417, + "step": 4069 + }, + { + "epoch": 0.7247150997150997, + "grad_norm": 0.560778021812439, + "learning_rate": 0.0001843571063081663, + "loss": 1.2316, + "step": 4070 + }, + { + "epoch": 0.7248931623931624, + "grad_norm": 0.591533362865448, + "learning_rate": 0.0001843495885934162, + "loss": 1.0294, + "step": 4071 + }, + { + "epoch": 0.7250712250712251, + "grad_norm": 0.5550443530082703, + "learning_rate": 0.00018434206922601106, + "loss": 1.0162, + "step": 4072 + }, + { + "epoch": 0.7252492877492878, + "grad_norm": 0.5744053721427917, + "learning_rate": 0.00018433454820609833, + "loss": 1.2774, + "step": 4073 + }, + { + "epoch": 0.7254273504273504, + "grad_norm": 0.6210703253746033, + "learning_rate": 0.0001843270255338253, + "loss": 1.2526, + "step": 4074 + }, + { + "epoch": 0.7256054131054132, + "grad_norm": 0.49684277176856995, + "learning_rate": 0.0001843195012093394, + "loss": 1.0786, + "step": 4075 + }, + { + "epoch": 0.7257834757834758, + "grad_norm": 0.5851606130599976, + "learning_rate": 0.00018431197523278802, + "loss": 1.14, + "step": 4076 + }, + { + "epoch": 0.7259615384615384, + "grad_norm": 0.5494425296783447, + "learning_rate": 0.00018430444760431862, + "loss": 1.211, + "step": 4077 + }, + { + "epoch": 0.7261396011396012, + "grad_norm": 0.5247658491134644, + "learning_rate": 0.00018429691832407867, + "loss": 0.8031, + "step": 4078 + }, + { + "epoch": 0.7263176638176638, + "grad_norm": 0.5012249946594238, + "learning_rate": 0.00018428938739221574, + "loss": 1.1258, + 
"step": 4079 + }, + { + "epoch": 0.7264957264957265, + "grad_norm": 0.5226427912712097, + "learning_rate": 0.0001842818548088774, + "loss": 1.0029, + "step": 4080 + }, + { + "epoch": 0.7266737891737892, + "grad_norm": 0.45008543133735657, + "learning_rate": 0.00018427432057421114, + "loss": 1.0681, + "step": 4081 + }, + { + "epoch": 0.7268518518518519, + "grad_norm": 0.5127285122871399, + "learning_rate": 0.00018426678468836467, + "loss": 1.1069, + "step": 4082 + }, + { + "epoch": 0.7270299145299145, + "grad_norm": 0.5406150221824646, + "learning_rate": 0.0001842592471514856, + "loss": 1.052, + "step": 4083 + }, + { + "epoch": 0.7272079772079773, + "grad_norm": 0.5001157522201538, + "learning_rate": 0.0001842517079637216, + "loss": 0.9157, + "step": 4084 + }, + { + "epoch": 0.7273860398860399, + "grad_norm": 0.6169779300689697, + "learning_rate": 0.00018424416712522042, + "loss": 1.3133, + "step": 4085 + }, + { + "epoch": 0.7275641025641025, + "grad_norm": 0.4891316890716553, + "learning_rate": 0.00018423662463612974, + "loss": 0.9505, + "step": 4086 + }, + { + "epoch": 0.7277421652421653, + "grad_norm": 0.5883708596229553, + "learning_rate": 0.00018422908049659743, + "loss": 1.2797, + "step": 4087 + }, + { + "epoch": 0.7279202279202279, + "grad_norm": 0.6679072976112366, + "learning_rate": 0.00018422153470677125, + "loss": 1.1096, + "step": 4088 + }, + { + "epoch": 0.7280982905982906, + "grad_norm": 0.5178479552268982, + "learning_rate": 0.00018421398726679904, + "loss": 1.0299, + "step": 4089 + }, + { + "epoch": 0.7282763532763533, + "grad_norm": 0.6343900561332703, + "learning_rate": 0.0001842064381768287, + "loss": 1.2983, + "step": 4090 + }, + { + "epoch": 0.728454415954416, + "grad_norm": 0.43816515803337097, + "learning_rate": 0.0001841988874370081, + "loss": 0.9452, + "step": 4091 + }, + { + "epoch": 0.7286324786324786, + "grad_norm": 0.579790472984314, + "learning_rate": 0.00018419133504748528, + "loss": 1.1037, + "step": 4092 + }, + { + "epoch": 
0.7288105413105413, + "grad_norm": 0.571374773979187, + "learning_rate": 0.00018418378100840807, + "loss": 1.1655, + "step": 4093 + }, + { + "epoch": 0.728988603988604, + "grad_norm": 0.5163514018058777, + "learning_rate": 0.0001841762253199246, + "loss": 1.1579, + "step": 4094 + }, + { + "epoch": 0.7291666666666666, + "grad_norm": 0.6553022265434265, + "learning_rate": 0.0001841686679821828, + "loss": 0.9664, + "step": 4095 + }, + { + "epoch": 0.7293447293447294, + "grad_norm": 0.5072969198226929, + "learning_rate": 0.00018416110899533084, + "loss": 0.9416, + "step": 4096 + }, + { + "epoch": 0.729522792022792, + "grad_norm": 0.5103251338005066, + "learning_rate": 0.00018415354835951675, + "loss": 1.0715, + "step": 4097 + }, + { + "epoch": 0.7297008547008547, + "grad_norm": 0.49752289056777954, + "learning_rate": 0.00018414598607488874, + "loss": 1.1848, + "step": 4098 + }, + { + "epoch": 0.7298789173789174, + "grad_norm": 0.5361882448196411, + "learning_rate": 0.00018413842214159488, + "loss": 1.1035, + "step": 4099 + }, + { + "epoch": 0.73005698005698, + "grad_norm": 0.5167670249938965, + "learning_rate": 0.00018413085655978343, + "loss": 1.0015, + "step": 4100 + }, + { + "epoch": 0.7302350427350427, + "grad_norm": 0.5930629372596741, + "learning_rate": 0.00018412328932960263, + "loss": 0.9766, + "step": 4101 + }, + { + "epoch": 0.7304131054131054, + "grad_norm": 0.5234778523445129, + "learning_rate": 0.00018411572045120073, + "loss": 1.0317, + "step": 4102 + }, + { + "epoch": 0.7305911680911681, + "grad_norm": 0.5361374020576477, + "learning_rate": 0.000184108149924726, + "loss": 1.1228, + "step": 4103 + }, + { + "epoch": 0.7307692307692307, + "grad_norm": 0.5845770239830017, + "learning_rate": 0.0001841005777503268, + "loss": 0.9541, + "step": 4104 + }, + { + "epoch": 0.7309472934472935, + "grad_norm": 0.49320483207702637, + "learning_rate": 0.0001840930039281515, + "loss": 0.9445, + "step": 4105 + }, + { + "epoch": 0.7311253561253561, + "grad_norm": 
0.5391250252723694, + "learning_rate": 0.00018408542845834845, + "loss": 1.1983, + "step": 4106 + }, + { + "epoch": 0.7313034188034188, + "grad_norm": 0.4890393316745758, + "learning_rate": 0.00018407785134106613, + "loss": 0.8353, + "step": 4107 + }, + { + "epoch": 0.7314814814814815, + "grad_norm": 0.5839747190475464, + "learning_rate": 0.00018407027257645296, + "loss": 1.4074, + "step": 4108 + }, + { + "epoch": 0.7316595441595442, + "grad_norm": 0.5957708358764648, + "learning_rate": 0.0001840626921646574, + "loss": 1.1032, + "step": 4109 + }, + { + "epoch": 0.7318376068376068, + "grad_norm": 0.5029017925262451, + "learning_rate": 0.00018405511010582805, + "loss": 1.095, + "step": 4110 + }, + { + "epoch": 0.7320156695156695, + "grad_norm": 0.6054347157478333, + "learning_rate": 0.00018404752640011345, + "loss": 1.0366, + "step": 4111 + }, + { + "epoch": 0.7321937321937322, + "grad_norm": 0.5476830005645752, + "learning_rate": 0.00018403994104766212, + "loss": 1.0976, + "step": 4112 + }, + { + "epoch": 0.7323717948717948, + "grad_norm": 0.5000962615013123, + "learning_rate": 0.00018403235404862277, + "loss": 1.0809, + "step": 4113 + }, + { + "epoch": 0.7325498575498576, + "grad_norm": 0.5119251012802124, + "learning_rate": 0.00018402476540314394, + "loss": 1.0176, + "step": 4114 + }, + { + "epoch": 0.7327279202279202, + "grad_norm": 0.5825830698013306, + "learning_rate": 0.00018401717511137445, + "loss": 1.2357, + "step": 4115 + }, + { + "epoch": 0.7329059829059829, + "grad_norm": 0.5702941417694092, + "learning_rate": 0.0001840095831734629, + "loss": 1.1549, + "step": 4116 + }, + { + "epoch": 0.7330840455840456, + "grad_norm": 0.5660699605941772, + "learning_rate": 0.00018400198958955807, + "loss": 1.1778, + "step": 4117 + }, + { + "epoch": 0.7332621082621082, + "grad_norm": 0.5241161584854126, + "learning_rate": 0.0001839943943598088, + "loss": 0.8587, + "step": 4118 + }, + { + "epoch": 0.7334401709401709, + "grad_norm": 0.581194281578064, + "learning_rate": 
0.0001839867974843638, + "loss": 1.2169, + "step": 4119 + }, + { + "epoch": 0.7336182336182336, + "grad_norm": 0.4342379570007324, + "learning_rate": 0.00018397919896337198, + "loss": 0.9182, + "step": 4120 + }, + { + "epoch": 0.7337962962962963, + "grad_norm": 0.5708567500114441, + "learning_rate": 0.00018397159879698224, + "loss": 1.1781, + "step": 4121 + }, + { + "epoch": 0.7339743589743589, + "grad_norm": 0.5827265977859497, + "learning_rate": 0.00018396399698534344, + "loss": 1.2905, + "step": 4122 + }, + { + "epoch": 0.7341524216524217, + "grad_norm": 0.5274056792259216, + "learning_rate": 0.00018395639352860457, + "loss": 1.1786, + "step": 4123 + }, + { + "epoch": 0.7343304843304843, + "grad_norm": 0.5094266533851624, + "learning_rate": 0.00018394878842691452, + "loss": 1.2016, + "step": 4124 + }, + { + "epoch": 0.7345085470085471, + "grad_norm": 0.48779475688934326, + "learning_rate": 0.0001839411816804224, + "loss": 1.0562, + "step": 4125 + }, + { + "epoch": 0.7346866096866097, + "grad_norm": 0.5805709958076477, + "learning_rate": 0.00018393357328927716, + "loss": 1.1705, + "step": 4126 + }, + { + "epoch": 0.7348646723646723, + "grad_norm": 0.4910700023174286, + "learning_rate": 0.00018392596325362791, + "loss": 1.0682, + "step": 4127 + }, + { + "epoch": 0.7350427350427351, + "grad_norm": 0.5297428369522095, + "learning_rate": 0.0001839183515736238, + "loss": 0.9505, + "step": 4128 + }, + { + "epoch": 0.7352207977207977, + "grad_norm": 0.45442086458206177, + "learning_rate": 0.00018391073824941385, + "loss": 0.9548, + "step": 4129 + }, + { + "epoch": 0.7353988603988604, + "grad_norm": 0.49299946427345276, + "learning_rate": 0.00018390312328114733, + "loss": 1.0868, + "step": 4130 + }, + { + "epoch": 0.7355769230769231, + "grad_norm": 0.4839940369129181, + "learning_rate": 0.0001838955066689734, + "loss": 0.9565, + "step": 4131 + }, + { + "epoch": 0.7357549857549858, + "grad_norm": 0.48600608110427856, + "learning_rate": 0.00018388788841304128, + "loss": 
1.2353, + "step": 4132 + }, + { + "epoch": 0.7359330484330484, + "grad_norm": 0.4893583357334137, + "learning_rate": 0.0001838802685135003, + "loss": 0.9595, + "step": 4133 + }, + { + "epoch": 0.7361111111111112, + "grad_norm": 0.4587398171424866, + "learning_rate": 0.00018387264697049963, + "loss": 1.1222, + "step": 4134 + }, + { + "epoch": 0.7362891737891738, + "grad_norm": 0.5361055731773376, + "learning_rate": 0.00018386502378418872, + "loss": 1.3304, + "step": 4135 + }, + { + "epoch": 0.7364672364672364, + "grad_norm": 0.5556629300117493, + "learning_rate": 0.00018385739895471686, + "loss": 1.0358, + "step": 4136 + }, + { + "epoch": 0.7366452991452992, + "grad_norm": 0.45555856823921204, + "learning_rate": 0.00018384977248223346, + "loss": 1.0081, + "step": 4137 + }, + { + "epoch": 0.7368233618233618, + "grad_norm": 0.5606052875518799, + "learning_rate": 0.00018384214436688797, + "loss": 0.9367, + "step": 4138 + }, + { + "epoch": 0.7370014245014245, + "grad_norm": 0.5428356528282166, + "learning_rate": 0.00018383451460882982, + "loss": 1.1391, + "step": 4139 + }, + { + "epoch": 0.7371794871794872, + "grad_norm": 0.4891330897808075, + "learning_rate": 0.00018382688320820853, + "loss": 0.9805, + "step": 4140 + }, + { + "epoch": 0.7373575498575499, + "grad_norm": 0.5407996773719788, + "learning_rate": 0.0001838192501651736, + "loss": 1.0532, + "step": 4141 + }, + { + "epoch": 0.7375356125356125, + "grad_norm": 0.5241971611976624, + "learning_rate": 0.00018381161547987454, + "loss": 0.9509, + "step": 4142 + }, + { + "epoch": 0.7377136752136753, + "grad_norm": 0.5370210409164429, + "learning_rate": 0.000183803979152461, + "loss": 1.2342, + "step": 4143 + }, + { + "epoch": 0.7378917378917379, + "grad_norm": 0.5470060706138611, + "learning_rate": 0.00018379634118308259, + "loss": 0.9621, + "step": 4144 + }, + { + "epoch": 0.7380698005698005, + "grad_norm": 0.546313464641571, + "learning_rate": 0.00018378870157188893, + "loss": 1.1253, + "step": 4145 + }, + { + 
"epoch": 0.7382478632478633, + "grad_norm": 0.502027153968811, + "learning_rate": 0.00018378106031902974, + "loss": 1.1919, + "step": 4146 + }, + { + "epoch": 0.7384259259259259, + "grad_norm": 0.5282283425331116, + "learning_rate": 0.0001837734174246547, + "loss": 1.0088, + "step": 4147 + }, + { + "epoch": 0.7386039886039886, + "grad_norm": 0.5152897238731384, + "learning_rate": 0.00018376577288891355, + "loss": 1.0813, + "step": 4148 + }, + { + "epoch": 0.7387820512820513, + "grad_norm": 0.5002804398536682, + "learning_rate": 0.0001837581267119561, + "loss": 0.9797, + "step": 4149 + }, + { + "epoch": 0.738960113960114, + "grad_norm": 0.5698176026344299, + "learning_rate": 0.00018375047889393215, + "loss": 1.1099, + "step": 4150 + }, + { + "epoch": 0.7391381766381766, + "grad_norm": 0.5384604930877686, + "learning_rate": 0.00018374282943499156, + "loss": 1.1944, + "step": 4151 + }, + { + "epoch": 0.7393162393162394, + "grad_norm": 0.5483044385910034, + "learning_rate": 0.00018373517833528418, + "loss": 1.1734, + "step": 4152 + }, + { + "epoch": 0.739494301994302, + "grad_norm": 0.4824066162109375, + "learning_rate": 0.0001837275255949599, + "loss": 0.9515, + "step": 4153 + }, + { + "epoch": 0.7396723646723646, + "grad_norm": 0.45413634181022644, + "learning_rate": 0.00018371987121416873, + "loss": 0.7534, + "step": 4154 + }, + { + "epoch": 0.7398504273504274, + "grad_norm": 0.5874246954917908, + "learning_rate": 0.00018371221519306055, + "loss": 0.9464, + "step": 4155 + }, + { + "epoch": 0.74002849002849, + "grad_norm": 0.5219913125038147, + "learning_rate": 0.00018370455753178544, + "loss": 1.0494, + "step": 4156 + }, + { + "epoch": 0.7402065527065527, + "grad_norm": 0.5937709212303162, + "learning_rate": 0.00018369689823049341, + "loss": 1.0529, + "step": 4157 + }, + { + "epoch": 0.7403846153846154, + "grad_norm": 0.5204295516014099, + "learning_rate": 0.00018368923728933449, + "loss": 1.0602, + "step": 4158 + }, + { + "epoch": 0.7405626780626781, + "grad_norm": 
0.5422890186309814, + "learning_rate": 0.00018368157470845885, + "loss": 0.9261, + "step": 4159 + }, + { + "epoch": 0.7407407407407407, + "grad_norm": 0.6163852214813232, + "learning_rate": 0.00018367391048801655, + "loss": 1.2771, + "step": 4160 + }, + { + "epoch": 0.7409188034188035, + "grad_norm": 0.5070751309394836, + "learning_rate": 0.00018366624462815785, + "loss": 1.0401, + "step": 4161 + }, + { + "epoch": 0.7410968660968661, + "grad_norm": 0.4477100968360901, + "learning_rate": 0.00018365857712903283, + "loss": 1.1463, + "step": 4162 + }, + { + "epoch": 0.7412749287749287, + "grad_norm": 0.5421462655067444, + "learning_rate": 0.0001836509079907918, + "loss": 0.9373, + "step": 4163 + }, + { + "epoch": 0.7414529914529915, + "grad_norm": 0.6162141561508179, + "learning_rate": 0.000183643237213585, + "loss": 1.1827, + "step": 4164 + }, + { + "epoch": 0.7416310541310541, + "grad_norm": 0.5653836131095886, + "learning_rate": 0.00018363556479756272, + "loss": 1.0689, + "step": 4165 + }, + { + "epoch": 0.7418091168091168, + "grad_norm": 0.57053542137146, + "learning_rate": 0.00018362789074287527, + "loss": 1.0289, + "step": 4166 + }, + { + "epoch": 0.7419871794871795, + "grad_norm": 0.5603055953979492, + "learning_rate": 0.00018362021504967304, + "loss": 1.1926, + "step": 4167 + }, + { + "epoch": 0.7421652421652422, + "grad_norm": 0.5460166335105896, + "learning_rate": 0.0001836125377181064, + "loss": 1.1488, + "step": 4168 + }, + { + "epoch": 0.7423433048433048, + "grad_norm": 0.5097107887268066, + "learning_rate": 0.00018360485874832579, + "loss": 1.0781, + "step": 4169 + }, + { + "epoch": 0.7425213675213675, + "grad_norm": 0.6280624270439148, + "learning_rate": 0.00018359717814048164, + "loss": 1.3625, + "step": 4170 + }, + { + "epoch": 0.7426994301994302, + "grad_norm": 0.4528210759162903, + "learning_rate": 0.0001835894958947244, + "loss": 0.8417, + "step": 4171 + }, + { + "epoch": 0.7428774928774928, + "grad_norm": 0.48735132813453674, + "learning_rate": 
0.00018358181201120468, + "loss": 0.9544, + "step": 4172 + }, + { + "epoch": 0.7430555555555556, + "grad_norm": 0.48388174176216125, + "learning_rate": 0.00018357412649007296, + "loss": 1.0663, + "step": 4173 + }, + { + "epoch": 0.7432336182336182, + "grad_norm": 0.5435357689857483, + "learning_rate": 0.00018356643933147986, + "loss": 1.2074, + "step": 4174 + }, + { + "epoch": 0.7434116809116809, + "grad_norm": 0.49890074133872986, + "learning_rate": 0.00018355875053557594, + "loss": 1.1322, + "step": 4175 + }, + { + "epoch": 0.7435897435897436, + "grad_norm": 0.5680708885192871, + "learning_rate": 0.0001835510601025119, + "loss": 1.1964, + "step": 4176 + }, + { + "epoch": 0.7437678062678063, + "grad_norm": 0.5002360939979553, + "learning_rate": 0.00018354336803243842, + "loss": 1.1396, + "step": 4177 + }, + { + "epoch": 0.7439458689458689, + "grad_norm": 0.5202965140342712, + "learning_rate": 0.00018353567432550616, + "loss": 1.1498, + "step": 4178 + }, + { + "epoch": 0.7441239316239316, + "grad_norm": 0.514492928981781, + "learning_rate": 0.00018352797898186588, + "loss": 1.0959, + "step": 4179 + }, + { + "epoch": 0.7443019943019943, + "grad_norm": 0.6395383477210999, + "learning_rate": 0.0001835202820016684, + "loss": 1.2867, + "step": 4180 + }, + { + "epoch": 0.7444800569800569, + "grad_norm": 0.5489062070846558, + "learning_rate": 0.00018351258338506447, + "loss": 1.1638, + "step": 4181 + }, + { + "epoch": 0.7446581196581197, + "grad_norm": 0.5705671906471252, + "learning_rate": 0.00018350488313220498, + "loss": 0.9493, + "step": 4182 + }, + { + "epoch": 0.7448361823361823, + "grad_norm": 0.5404297709465027, + "learning_rate": 0.00018349718124324076, + "loss": 0.9876, + "step": 4183 + }, + { + "epoch": 0.7450142450142451, + "grad_norm": 0.5841003060340881, + "learning_rate": 0.0001834894777183227, + "loss": 1.1225, + "step": 4184 + }, + { + "epoch": 0.7451923076923077, + "grad_norm": 0.49774688482284546, + "learning_rate": 0.00018348177255760178, + "loss": 
1.1442, + "step": 4185 + }, + { + "epoch": 0.7453703703703703, + "grad_norm": 0.5212422609329224, + "learning_rate": 0.00018347406576122894, + "loss": 1.101, + "step": 4186 + }, + { + "epoch": 0.7455484330484331, + "grad_norm": 0.615024983882904, + "learning_rate": 0.00018346635732935517, + "loss": 1.4188, + "step": 4187 + }, + { + "epoch": 0.7457264957264957, + "grad_norm": 0.46818843483924866, + "learning_rate": 0.00018345864726213154, + "loss": 1.0071, + "step": 4188 + }, + { + "epoch": 0.7459045584045584, + "grad_norm": 0.4921121895313263, + "learning_rate": 0.00018345093555970906, + "loss": 1.015, + "step": 4189 + }, + { + "epoch": 0.7460826210826211, + "grad_norm": 0.5042136311531067, + "learning_rate": 0.00018344322222223889, + "loss": 0.9974, + "step": 4190 + }, + { + "epoch": 0.7462606837606838, + "grad_norm": 0.5872490406036377, + "learning_rate": 0.0001834355072498721, + "loss": 1.3166, + "step": 4191 + }, + { + "epoch": 0.7464387464387464, + "grad_norm": 0.559117317199707, + "learning_rate": 0.00018342779064275984, + "loss": 1.2227, + "step": 4192 + }, + { + "epoch": 0.7466168091168092, + "grad_norm": 0.5269635319709778, + "learning_rate": 0.00018342007240105336, + "loss": 1.0281, + "step": 4193 + }, + { + "epoch": 0.7467948717948718, + "grad_norm": 0.4608335793018341, + "learning_rate": 0.00018341235252490387, + "loss": 0.98, + "step": 4194 + }, + { + "epoch": 0.7469729344729344, + "grad_norm": 0.5818259119987488, + "learning_rate": 0.00018340463101446255, + "loss": 1.1544, + "step": 4195 + }, + { + "epoch": 0.7471509971509972, + "grad_norm": 0.5577529668807983, + "learning_rate": 0.00018339690786988079, + "loss": 1.3059, + "step": 4196 + }, + { + "epoch": 0.7473290598290598, + "grad_norm": 0.5430468320846558, + "learning_rate": 0.00018338918309130983, + "loss": 1.2766, + "step": 4197 + }, + { + "epoch": 0.7475071225071225, + "grad_norm": 0.4941701591014862, + "learning_rate": 0.0001833814566789011, + "loss": 1.193, + "step": 4198 + }, + { + "epoch": 
0.7476851851851852, + "grad_norm": 0.5471884608268738, + "learning_rate": 0.00018337372863280589, + "loss": 1.2261, + "step": 4199 + }, + { + "epoch": 0.7478632478632479, + "grad_norm": 0.4641438126564026, + "learning_rate": 0.0001833659989531757, + "loss": 0.7953, + "step": 4200 + }, + { + "epoch": 0.7480413105413105, + "grad_norm": 0.5244714617729187, + "learning_rate": 0.0001833582676401619, + "loss": 0.9344, + "step": 4201 + }, + { + "epoch": 0.7482193732193733, + "grad_norm": 0.5964360237121582, + "learning_rate": 0.00018335053469391603, + "loss": 1.2072, + "step": 4202 + }, + { + "epoch": 0.7483974358974359, + "grad_norm": 0.4929158091545105, + "learning_rate": 0.00018334280011458954, + "loss": 1.2183, + "step": 4203 + }, + { + "epoch": 0.7485754985754985, + "grad_norm": 0.46221864223480225, + "learning_rate": 0.00018333506390233405, + "loss": 1.1957, + "step": 4204 + }, + { + "epoch": 0.7487535612535613, + "grad_norm": 0.6301732659339905, + "learning_rate": 0.0001833273260573011, + "loss": 1.0582, + "step": 4205 + }, + { + "epoch": 0.7489316239316239, + "grad_norm": 0.5606021881103516, + "learning_rate": 0.0001833195865796423, + "loss": 1.4034, + "step": 4206 + }, + { + "epoch": 0.7491096866096866, + "grad_norm": 0.44856077432632446, + "learning_rate": 0.00018331184546950926, + "loss": 0.8421, + "step": 4207 + }, + { + "epoch": 0.7492877492877493, + "grad_norm": 0.5487226247787476, + "learning_rate": 0.00018330410272705366, + "loss": 1.238, + "step": 4208 + }, + { + "epoch": 0.749465811965812, + "grad_norm": 0.6043636798858643, + "learning_rate": 0.00018329635835242724, + "loss": 1.1215, + "step": 4209 + }, + { + "epoch": 0.7496438746438746, + "grad_norm": 0.5145319104194641, + "learning_rate": 0.00018328861234578173, + "loss": 1.1002, + "step": 4210 + }, + { + "epoch": 0.7498219373219374, + "grad_norm": 0.5667078495025635, + "learning_rate": 0.00018328086470726884, + "loss": 1.2994, + "step": 4211 + }, + { + "epoch": 0.75, + "grad_norm": 0.5117634534835815, 
+ "learning_rate": 0.00018327311543704043, + "loss": 0.9448, + "step": 4212 + }, + { + "epoch": 0.75, + "eval_loss": 1.0982474088668823, + "eval_runtime": 24.6617, + "eval_samples_per_second": 42.211, + "eval_steps_per_second": 21.126, + "step": 4212 + }, + { + "epoch": 0.7501780626780626, + "grad_norm": 0.5451585054397583, + "learning_rate": 0.00018326536453524826, + "loss": 0.9023, + "step": 4213 + }, + { + "epoch": 0.7503561253561254, + "grad_norm": 0.6585208773612976, + "learning_rate": 0.0001832576120020443, + "loss": 1.2798, + "step": 4214 + }, + { + "epoch": 0.750534188034188, + "grad_norm": 0.6444812417030334, + "learning_rate": 0.00018324985783758037, + "loss": 1.3999, + "step": 4215 + }, + { + "epoch": 0.7507122507122507, + "grad_norm": 0.6178330779075623, + "learning_rate": 0.0001832421020420084, + "loss": 1.1846, + "step": 4216 + }, + { + "epoch": 0.7508903133903134, + "grad_norm": 0.509969174861908, + "learning_rate": 0.00018323434461548036, + "loss": 1.1831, + "step": 4217 + }, + { + "epoch": 0.7510683760683761, + "grad_norm": 0.5558911561965942, + "learning_rate": 0.00018322658555814826, + "loss": 1.1599, + "step": 4218 + }, + { + "epoch": 0.7512464387464387, + "grad_norm": 0.5714917778968811, + "learning_rate": 0.0001832188248701641, + "loss": 0.9702, + "step": 4219 + }, + { + "epoch": 0.7514245014245015, + "grad_norm": 0.6136442422866821, + "learning_rate": 0.00018321106255167995, + "loss": 0.9376, + "step": 4220 + }, + { + "epoch": 0.7516025641025641, + "grad_norm": 0.5832077264785767, + "learning_rate": 0.00018320329860284785, + "loss": 1.2564, + "step": 4221 + }, + { + "epoch": 0.7517806267806267, + "grad_norm": 0.45330923795700073, + "learning_rate": 0.00018319553302381997, + "loss": 0.9321, + "step": 4222 + }, + { + "epoch": 0.7519586894586895, + "grad_norm": 0.5278468132019043, + "learning_rate": 0.00018318776581474847, + "loss": 1.1334, + "step": 4223 + }, + { + "epoch": 0.7521367521367521, + "grad_norm": 0.49267473816871643, + 
"learning_rate": 0.00018317999697578549, + "loss": 1.1577, + "step": 4224 + }, + { + "epoch": 0.7523148148148148, + "grad_norm": 0.5372124314308167, + "learning_rate": 0.00018317222650708325, + "loss": 1.037, + "step": 4225 + }, + { + "epoch": 0.7524928774928775, + "grad_norm": 0.5879829525947571, + "learning_rate": 0.000183164454408794, + "loss": 1.1312, + "step": 4226 + }, + { + "epoch": 0.7526709401709402, + "grad_norm": 0.5363932251930237, + "learning_rate": 0.00018315668068107004, + "loss": 1.174, + "step": 4227 + }, + { + "epoch": 0.7528490028490028, + "grad_norm": 0.5585991740226746, + "learning_rate": 0.00018314890532406366, + "loss": 1.2106, + "step": 4228 + }, + { + "epoch": 0.7530270655270656, + "grad_norm": 0.49395787715911865, + "learning_rate": 0.0001831411283379272, + "loss": 1.1163, + "step": 4229 + }, + { + "epoch": 0.7532051282051282, + "grad_norm": 0.5081066489219666, + "learning_rate": 0.00018313334972281306, + "loss": 1.184, + "step": 4230 + }, + { + "epoch": 0.7533831908831908, + "grad_norm": 0.40304034948349, + "learning_rate": 0.0001831255694788736, + "loss": 0.7548, + "step": 4231 + }, + { + "epoch": 0.7535612535612536, + "grad_norm": 0.4999815821647644, + "learning_rate": 0.0001831177876062613, + "loss": 1.0092, + "step": 4232 + }, + { + "epoch": 0.7537393162393162, + "grad_norm": 0.48917025327682495, + "learning_rate": 0.00018311000410512862, + "loss": 1.0354, + "step": 4233 + }, + { + "epoch": 0.7539173789173789, + "grad_norm": 0.475606769323349, + "learning_rate": 0.00018310221897562806, + "loss": 0.8728, + "step": 4234 + }, + { + "epoch": 0.7540954415954416, + "grad_norm": 0.630439817905426, + "learning_rate": 0.00018309443221791214, + "loss": 1.1436, + "step": 4235 + }, + { + "epoch": 0.7542735042735043, + "grad_norm": 0.524740993976593, + "learning_rate": 0.00018308664383213344, + "loss": 1.0487, + "step": 4236 + }, + { + "epoch": 0.7544515669515669, + "grad_norm": 0.4734523892402649, + "learning_rate": 0.0001830788538184445, + 
"loss": 1.0681, + "step": 4237 + }, + { + "epoch": 0.7546296296296297, + "grad_norm": 0.5767266750335693, + "learning_rate": 0.00018307106217699807, + "loss": 1.0599, + "step": 4238 + }, + { + "epoch": 0.7548076923076923, + "grad_norm": 0.6276642084121704, + "learning_rate": 0.0001830632689079467, + "loss": 1.2837, + "step": 4239 + }, + { + "epoch": 0.7549857549857549, + "grad_norm": 0.5539988279342651, + "learning_rate": 0.00018305547401144316, + "loss": 0.9072, + "step": 4240 + }, + { + "epoch": 0.7551638176638177, + "grad_norm": 0.4551292061805725, + "learning_rate": 0.00018304767748764014, + "loss": 1.0204, + "step": 4241 + }, + { + "epoch": 0.7553418803418803, + "grad_norm": 0.47344550490379333, + "learning_rate": 0.00018303987933669034, + "loss": 1.0473, + "step": 4242 + }, + { + "epoch": 0.7555199430199431, + "grad_norm": 0.6050213575363159, + "learning_rate": 0.00018303207955874665, + "loss": 1.1552, + "step": 4243 + }, + { + "epoch": 0.7556980056980057, + "grad_norm": 0.48943889141082764, + "learning_rate": 0.00018302427815396186, + "loss": 1.0002, + "step": 4244 + }, + { + "epoch": 0.7558760683760684, + "grad_norm": 0.5664682984352112, + "learning_rate": 0.00018301647512248878, + "loss": 1.1865, + "step": 4245 + }, + { + "epoch": 0.7560541310541311, + "grad_norm": 0.5702242255210876, + "learning_rate": 0.00018300867046448034, + "loss": 1.3029, + "step": 4246 + }, + { + "epoch": 0.7562321937321937, + "grad_norm": 0.593207836151123, + "learning_rate": 0.00018300086418008942, + "loss": 1.109, + "step": 4247 + }, + { + "epoch": 0.7564102564102564, + "grad_norm": 0.5887887477874756, + "learning_rate": 0.000182993056269469, + "loss": 1.3022, + "step": 4248 + }, + { + "epoch": 0.7565883190883191, + "grad_norm": 0.5277966260910034, + "learning_rate": 0.00018298524673277203, + "loss": 1.1738, + "step": 4249 + }, + { + "epoch": 0.7567663817663818, + "grad_norm": 0.589347779750824, + "learning_rate": 0.00018297743557015155, + "loss": 1.0185, + "step": 4250 + }, + { 
+ "epoch": 0.7569444444444444, + "grad_norm": 0.49920859932899475, + "learning_rate": 0.0001829696227817606, + "loss": 1.118, + "step": 4251 + }, + { + "epoch": 0.7571225071225072, + "grad_norm": 0.502565324306488, + "learning_rate": 0.0001829618083677522, + "loss": 1.1856, + "step": 4252 + }, + { + "epoch": 0.7573005698005698, + "grad_norm": 0.49814435839653015, + "learning_rate": 0.00018295399232827955, + "loss": 1.0432, + "step": 4253 + }, + { + "epoch": 0.7574786324786325, + "grad_norm": 0.5087502598762512, + "learning_rate": 0.00018294617466349574, + "loss": 1.2325, + "step": 4254 + }, + { + "epoch": 0.7576566951566952, + "grad_norm": 0.5107288956642151, + "learning_rate": 0.00018293835537355394, + "loss": 1.0487, + "step": 4255 + }, + { + "epoch": 0.7578347578347578, + "grad_norm": 0.524725615978241, + "learning_rate": 0.00018293053445860732, + "loss": 1.1821, + "step": 4256 + }, + { + "epoch": 0.7580128205128205, + "grad_norm": 0.5234082937240601, + "learning_rate": 0.0001829227119188092, + "loss": 0.8896, + "step": 4257 + }, + { + "epoch": 0.7581908831908832, + "grad_norm": 0.5102918744087219, + "learning_rate": 0.00018291488775431275, + "loss": 1.0246, + "step": 4258 + }, + { + "epoch": 0.7583689458689459, + "grad_norm": 0.5552714467048645, + "learning_rate": 0.00018290706196527135, + "loss": 1.0193, + "step": 4259 + }, + { + "epoch": 0.7585470085470085, + "grad_norm": 0.5395022630691528, + "learning_rate": 0.00018289923455183825, + "loss": 1.3203, + "step": 4260 + }, + { + "epoch": 0.7587250712250713, + "grad_norm": 0.7474865913391113, + "learning_rate": 0.00018289140551416692, + "loss": 1.182, + "step": 4261 + }, + { + "epoch": 0.7589031339031339, + "grad_norm": 0.4892016649246216, + "learning_rate": 0.00018288357485241066, + "loss": 0.968, + "step": 4262 + }, + { + "epoch": 0.7590811965811965, + "grad_norm": 0.4627816081047058, + "learning_rate": 0.00018287574256672291, + "loss": 0.6895, + "step": 4263 + }, + { + "epoch": 0.7592592592592593, + 
"grad_norm": 0.6221280097961426, + "learning_rate": 0.00018286790865725715, + "loss": 0.9691, + "step": 4264 + }, + { + "epoch": 0.7594373219373219, + "grad_norm": 0.5542295575141907, + "learning_rate": 0.0001828600731241669, + "loss": 0.9996, + "step": 4265 + }, + { + "epoch": 0.7596153846153846, + "grad_norm": 0.5570770502090454, + "learning_rate": 0.00018285223596760562, + "loss": 1.1996, + "step": 4266 + }, + { + "epoch": 0.7597934472934473, + "grad_norm": 0.5495262742042542, + "learning_rate": 0.00018284439718772687, + "loss": 1.1572, + "step": 4267 + }, + { + "epoch": 0.75997150997151, + "grad_norm": 0.5006741881370544, + "learning_rate": 0.00018283655678468427, + "loss": 1.1215, + "step": 4268 + }, + { + "epoch": 0.7601495726495726, + "grad_norm": 0.4682157635688782, + "learning_rate": 0.00018282871475863144, + "loss": 1.0547, + "step": 4269 + }, + { + "epoch": 0.7603276353276354, + "grad_norm": 0.6275840997695923, + "learning_rate": 0.00018282087110972197, + "loss": 1.3855, + "step": 4270 + }, + { + "epoch": 0.760505698005698, + "grad_norm": 0.5341474413871765, + "learning_rate": 0.0001828130258381096, + "loss": 1.2024, + "step": 4271 + }, + { + "epoch": 0.7606837606837606, + "grad_norm": 0.4330833852291107, + "learning_rate": 0.000182805178943948, + "loss": 1.0508, + "step": 4272 + }, + { + "epoch": 0.7608618233618234, + "grad_norm": 0.6276537179946899, + "learning_rate": 0.00018279733042739094, + "loss": 1.1635, + "step": 4273 + }, + { + "epoch": 0.761039886039886, + "grad_norm": 0.5370199084281921, + "learning_rate": 0.00018278948028859217, + "loss": 1.0579, + "step": 4274 + }, + { + "epoch": 0.7612179487179487, + "grad_norm": 0.524959921836853, + "learning_rate": 0.00018278162852770552, + "loss": 1.0972, + "step": 4275 + }, + { + "epoch": 0.7613960113960114, + "grad_norm": 0.5029389262199402, + "learning_rate": 0.00018277377514488486, + "loss": 0.959, + "step": 4276 + }, + { + "epoch": 0.7615740740740741, + "grad_norm": 0.49772894382476807, + 
"learning_rate": 0.00018276592014028397, + "loss": 1.2773, + "step": 4277 + }, + { + "epoch": 0.7617521367521367, + "grad_norm": 0.5195719003677368, + "learning_rate": 0.00018275806351405685, + "loss": 1.0676, + "step": 4278 + }, + { + "epoch": 0.7619301994301995, + "grad_norm": 0.5167942643165588, + "learning_rate": 0.00018275020526635735, + "loss": 1.0615, + "step": 4279 + }, + { + "epoch": 0.7621082621082621, + "grad_norm": 0.4958035945892334, + "learning_rate": 0.0001827423453973395, + "loss": 0.9605, + "step": 4280 + }, + { + "epoch": 0.7622863247863247, + "grad_norm": 0.6256808042526245, + "learning_rate": 0.00018273448390715728, + "loss": 1.2526, + "step": 4281 + }, + { + "epoch": 0.7624643874643875, + "grad_norm": 0.5062580108642578, + "learning_rate": 0.0001827266207959647, + "loss": 1.0604, + "step": 4282 + }, + { + "epoch": 0.7626424501424501, + "grad_norm": 0.5080778002738953, + "learning_rate": 0.00018271875606391583, + "loss": 1.1246, + "step": 4283 + }, + { + "epoch": 0.7628205128205128, + "grad_norm": 0.5069389939308167, + "learning_rate": 0.00018271088971116479, + "loss": 1.3158, + "step": 4284 + }, + { + "epoch": 0.7629985754985755, + "grad_norm": 0.7280121445655823, + "learning_rate": 0.00018270302173786567, + "loss": 1.2066, + "step": 4285 + }, + { + "epoch": 0.7631766381766382, + "grad_norm": 0.6523470282554626, + "learning_rate": 0.00018269515214417267, + "loss": 1.3236, + "step": 4286 + }, + { + "epoch": 0.7633547008547008, + "grad_norm": 0.5799322724342346, + "learning_rate": 0.00018268728093023988, + "loss": 0.9786, + "step": 4287 + }, + { + "epoch": 0.7635327635327636, + "grad_norm": 0.46675166487693787, + "learning_rate": 0.00018267940809622163, + "loss": 0.8131, + "step": 4288 + }, + { + "epoch": 0.7637108262108262, + "grad_norm": 0.5566182732582092, + "learning_rate": 0.00018267153364227214, + "loss": 1.0565, + "step": 4289 + }, + { + "epoch": 0.7638888888888888, + "grad_norm": 0.532028079032898, + "learning_rate": 
0.00018266365756854566, + "loss": 0.952, + "step": 4290 + }, + { + "epoch": 0.7640669515669516, + "grad_norm": 0.5082666873931885, + "learning_rate": 0.00018265577987519653, + "loss": 1.0704, + "step": 4291 + }, + { + "epoch": 0.7642450142450142, + "grad_norm": 0.5223562717437744, + "learning_rate": 0.00018264790056237912, + "loss": 1.1161, + "step": 4292 + }, + { + "epoch": 0.7644230769230769, + "grad_norm": 0.48472318053245544, + "learning_rate": 0.00018264001963024778, + "loss": 0.8784, + "step": 4293 + }, + { + "epoch": 0.7646011396011396, + "grad_norm": 0.5901281833648682, + "learning_rate": 0.0001826321370789569, + "loss": 1.1031, + "step": 4294 + }, + { + "epoch": 0.7647792022792023, + "grad_norm": 0.570350706577301, + "learning_rate": 0.000182624252908661, + "loss": 0.9047, + "step": 4295 + }, + { + "epoch": 0.7649572649572649, + "grad_norm": 0.568373441696167, + "learning_rate": 0.00018261636711951445, + "loss": 1.0106, + "step": 4296 + }, + { + "epoch": 0.7651353276353277, + "grad_norm": 0.6175880432128906, + "learning_rate": 0.00018260847971167182, + "loss": 1.3531, + "step": 4297 + }, + { + "epoch": 0.7653133903133903, + "grad_norm": 0.5682594776153564, + "learning_rate": 0.00018260059068528762, + "loss": 1.1261, + "step": 4298 + }, + { + "epoch": 0.7654914529914529, + "grad_norm": 0.5050225257873535, + "learning_rate": 0.00018259270004051644, + "loss": 1.0921, + "step": 4299 + }, + { + "epoch": 0.7656695156695157, + "grad_norm": 0.5416565537452698, + "learning_rate": 0.0001825848077775129, + "loss": 1.0881, + "step": 4300 + }, + { + "epoch": 0.7658475783475783, + "grad_norm": 0.5418867468833923, + "learning_rate": 0.0001825769138964316, + "loss": 1.2069, + "step": 4301 + }, + { + "epoch": 0.7660256410256411, + "grad_norm": 0.5447866320610046, + "learning_rate": 0.00018256901839742718, + "loss": 1.1827, + "step": 4302 + }, + { + "epoch": 0.7662037037037037, + "grad_norm": 0.5482802987098694, + "learning_rate": 0.00018256112128065439, + "loss": 1.0492, + 
"step": 4303 + }, + { + "epoch": 0.7663817663817664, + "grad_norm": 0.5059601664543152, + "learning_rate": 0.0001825532225462679, + "loss": 1.0996, + "step": 4304 + }, + { + "epoch": 0.7665598290598291, + "grad_norm": 0.5153701901435852, + "learning_rate": 0.00018254532219442258, + "loss": 1.3237, + "step": 4305 + }, + { + "epoch": 0.7667378917378918, + "grad_norm": 0.5370768904685974, + "learning_rate": 0.0001825374202252731, + "loss": 0.9925, + "step": 4306 + }, + { + "epoch": 0.7669159544159544, + "grad_norm": 0.4516580402851105, + "learning_rate": 0.00018252951663897432, + "loss": 1.0749, + "step": 4307 + }, + { + "epoch": 0.7670940170940171, + "grad_norm": 0.5565171837806702, + "learning_rate": 0.0001825216114356811, + "loss": 1.1617, + "step": 4308 + }, + { + "epoch": 0.7672720797720798, + "grad_norm": 0.5212662220001221, + "learning_rate": 0.00018251370461554834, + "loss": 1.1108, + "step": 4309 + }, + { + "epoch": 0.7674501424501424, + "grad_norm": 0.49061715602874756, + "learning_rate": 0.00018250579617873095, + "loss": 1.0881, + "step": 4310 + }, + { + "epoch": 0.7676282051282052, + "grad_norm": 0.5535751581192017, + "learning_rate": 0.00018249788612538387, + "loss": 0.9341, + "step": 4311 + }, + { + "epoch": 0.7678062678062678, + "grad_norm": 0.5425209403038025, + "learning_rate": 0.00018248997445566208, + "loss": 1.1858, + "step": 4312 + }, + { + "epoch": 0.7679843304843305, + "grad_norm": 0.6224395036697388, + "learning_rate": 0.0001824820611697206, + "loss": 1.0836, + "step": 4313 + }, + { + "epoch": 0.7681623931623932, + "grad_norm": 0.4895690977573395, + "learning_rate": 0.00018247414626771445, + "loss": 0.8598, + "step": 4314 + }, + { + "epoch": 0.7683404558404558, + "grad_norm": 0.5279615521430969, + "learning_rate": 0.00018246622974979877, + "loss": 1.1742, + "step": 4315 + }, + { + "epoch": 0.7685185185185185, + "grad_norm": 0.45300471782684326, + "learning_rate": 0.0001824583116161286, + "loss": 0.8872, + "step": 4316 + }, + { + "epoch": 
0.7686965811965812, + "grad_norm": 0.6499692797660828, + "learning_rate": 0.00018245039186685916, + "loss": 1.2495, + "step": 4317 + }, + { + "epoch": 0.7688746438746439, + "grad_norm": 0.48151278495788574, + "learning_rate": 0.00018244247050214552, + "loss": 1.2382, + "step": 4318 + }, + { + "epoch": 0.7690527065527065, + "grad_norm": 0.6597028374671936, + "learning_rate": 0.0001824345475221429, + "loss": 1.3453, + "step": 4319 + }, + { + "epoch": 0.7692307692307693, + "grad_norm": 0.4536992609500885, + "learning_rate": 0.0001824266229270066, + "loss": 1.1141, + "step": 4320 + }, + { + "epoch": 0.7694088319088319, + "grad_norm": 0.5489405393600464, + "learning_rate": 0.00018241869671689184, + "loss": 1.0333, + "step": 4321 + }, + { + "epoch": 0.7695868945868946, + "grad_norm": 0.5741586089134216, + "learning_rate": 0.00018241076889195394, + "loss": 0.9939, + "step": 4322 + }, + { + "epoch": 0.7697649572649573, + "grad_norm": 0.47170960903167725, + "learning_rate": 0.00018240283945234823, + "loss": 0.9878, + "step": 4323 + }, + { + "epoch": 0.76994301994302, + "grad_norm": 0.4729093313217163, + "learning_rate": 0.00018239490839823004, + "loss": 1.0087, + "step": 4324 + }, + { + "epoch": 0.7701210826210826, + "grad_norm": 0.49869823455810547, + "learning_rate": 0.0001823869757297548, + "loss": 1.169, + "step": 4325 + }, + { + "epoch": 0.7702991452991453, + "grad_norm": 0.5118468403816223, + "learning_rate": 0.0001823790414470779, + "loss": 1.1092, + "step": 4326 + }, + { + "epoch": 0.770477207977208, + "grad_norm": 0.5076048970222473, + "learning_rate": 0.0001823711055503548, + "loss": 1.1028, + "step": 4327 + }, + { + "epoch": 0.7706552706552706, + "grad_norm": 0.5661569237709045, + "learning_rate": 0.00018236316803974098, + "loss": 1.1114, + "step": 4328 + }, + { + "epoch": 0.7708333333333334, + "grad_norm": 0.5542354583740234, + "learning_rate": 0.000182355228915392, + "loss": 1.0931, + "step": 4329 + }, + { + "epoch": 0.771011396011396, + "grad_norm": 
0.5476680994033813, + "learning_rate": 0.0001823472881774634, + "loss": 1.036, + "step": 4330 + }, + { + "epoch": 0.7711894586894587, + "grad_norm": 0.5449798703193665, + "learning_rate": 0.00018233934582611073, + "loss": 1.0682, + "step": 4331 + }, + { + "epoch": 0.7713675213675214, + "grad_norm": 0.61089026927948, + "learning_rate": 0.00018233140186148963, + "loss": 1.0748, + "step": 4332 + }, + { + "epoch": 0.771545584045584, + "grad_norm": 0.5015206336975098, + "learning_rate": 0.00018232345628375576, + "loss": 1.2032, + "step": 4333 + }, + { + "epoch": 0.7717236467236467, + "grad_norm": 0.579289972782135, + "learning_rate": 0.00018231550909306475, + "loss": 1.0764, + "step": 4334 + }, + { + "epoch": 0.7719017094017094, + "grad_norm": 0.5889299511909485, + "learning_rate": 0.00018230756028957235, + "loss": 1.1768, + "step": 4335 + }, + { + "epoch": 0.7720797720797721, + "grad_norm": 0.5328249335289001, + "learning_rate": 0.00018229960987343428, + "loss": 1.0055, + "step": 4336 + }, + { + "epoch": 0.7722578347578347, + "grad_norm": 0.5766382217407227, + "learning_rate": 0.0001822916578448063, + "loss": 0.9923, + "step": 4337 + }, + { + "epoch": 0.7724358974358975, + "grad_norm": 0.6448187828063965, + "learning_rate": 0.00018228370420384423, + "loss": 1.1135, + "step": 4338 + }, + { + "epoch": 0.7726139601139601, + "grad_norm": 0.5505210757255554, + "learning_rate": 0.00018227574895070394, + "loss": 1.2048, + "step": 4339 + }, + { + "epoch": 0.7727920227920227, + "grad_norm": 0.6278925538063049, + "learning_rate": 0.00018226779208554126, + "loss": 1.1045, + "step": 4340 + }, + { + "epoch": 0.7729700854700855, + "grad_norm": 0.5345009565353394, + "learning_rate": 0.00018225983360851207, + "loss": 1.0102, + "step": 4341 + }, + { + "epoch": 0.7731481481481481, + "grad_norm": 0.566633403301239, + "learning_rate": 0.00018225187351977233, + "loss": 1.0038, + "step": 4342 + }, + { + "epoch": 0.7733262108262108, + "grad_norm": 0.5066078901290894, + "learning_rate": 
0.000182243911819478, + "loss": 1.0339, + "step": 4343 + }, + { + "epoch": 0.7735042735042735, + "grad_norm": 0.5614920258522034, + "learning_rate": 0.00018223594850778503, + "loss": 1.1021, + "step": 4344 + }, + { + "epoch": 0.7736823361823362, + "grad_norm": 0.7747337818145752, + "learning_rate": 0.0001822279835848495, + "loss": 1.1129, + "step": 4345 + }, + { + "epoch": 0.7738603988603988, + "grad_norm": 0.7066529989242554, + "learning_rate": 0.00018222001705082744, + "loss": 1.3234, + "step": 4346 + }, + { + "epoch": 0.7740384615384616, + "grad_norm": 0.6340884566307068, + "learning_rate": 0.00018221204890587497, + "loss": 1.0726, + "step": 4347 + }, + { + "epoch": 0.7742165242165242, + "grad_norm": 0.5401145815849304, + "learning_rate": 0.00018220407915014818, + "loss": 0.9904, + "step": 4348 + }, + { + "epoch": 0.7743945868945868, + "grad_norm": 0.5069159269332886, + "learning_rate": 0.00018219610778380315, + "loss": 1.0654, + "step": 4349 + }, + { + "epoch": 0.7745726495726496, + "grad_norm": 0.5422839522361755, + "learning_rate": 0.00018218813480699623, + "loss": 1.1741, + "step": 4350 + }, + { + "epoch": 0.7747507122507122, + "grad_norm": 0.5550300478935242, + "learning_rate": 0.0001821801602198835, + "loss": 1.0033, + "step": 4351 + }, + { + "epoch": 0.7749287749287749, + "grad_norm": 0.5987736582756042, + "learning_rate": 0.00018217218402262123, + "loss": 0.935, + "step": 4352 + }, + { + "epoch": 0.7751068376068376, + "grad_norm": 0.6137008666992188, + "learning_rate": 0.00018216420621536573, + "loss": 1.17, + "step": 4353 + }, + { + "epoch": 0.7752849002849003, + "grad_norm": 0.47124359011650085, + "learning_rate": 0.0001821562267982733, + "loss": 0.8316, + "step": 4354 + }, + { + "epoch": 0.7754629629629629, + "grad_norm": 0.5057868361473083, + "learning_rate": 0.00018214824577150024, + "loss": 1.0246, + "step": 4355 + }, + { + "epoch": 0.7756410256410257, + "grad_norm": 0.604055643081665, + "learning_rate": 0.00018214026313520299, + "loss": 1.1272, + 
"step": 4356 + }, + { + "epoch": 0.7758190883190883, + "grad_norm": 0.6690384149551392, + "learning_rate": 0.0001821322788895379, + "loss": 1.0464, + "step": 4357 + }, + { + "epoch": 0.7759971509971509, + "grad_norm": 0.5458958745002747, + "learning_rate": 0.0001821242930346614, + "loss": 1.1712, + "step": 4358 + }, + { + "epoch": 0.7761752136752137, + "grad_norm": 0.6448663473129272, + "learning_rate": 0.00018211630557073, + "loss": 1.1125, + "step": 4359 + }, + { + "epoch": 0.7763532763532763, + "grad_norm": 0.49889448285102844, + "learning_rate": 0.00018210831649790018, + "loss": 1.097, + "step": 4360 + }, + { + "epoch": 0.7765313390313391, + "grad_norm": 0.5118046998977661, + "learning_rate": 0.00018210032581632843, + "loss": 1.009, + "step": 4361 + }, + { + "epoch": 0.7767094017094017, + "grad_norm": 0.5450068116188049, + "learning_rate": 0.00018209233352617135, + "loss": 1.1138, + "step": 4362 + }, + { + "epoch": 0.7768874643874644, + "grad_norm": 0.6147481203079224, + "learning_rate": 0.00018208433962758558, + "loss": 1.212, + "step": 4363 + }, + { + "epoch": 0.7770655270655271, + "grad_norm": 0.554176926612854, + "learning_rate": 0.00018207634412072764, + "loss": 1.1271, + "step": 4364 + }, + { + "epoch": 0.7772435897435898, + "grad_norm": 0.5872851014137268, + "learning_rate": 0.00018206834700575426, + "loss": 1.2793, + "step": 4365 + }, + { + "epoch": 0.7774216524216524, + "grad_norm": 0.5135685205459595, + "learning_rate": 0.00018206034828282207, + "loss": 0.9642, + "step": 4366 + }, + { + "epoch": 0.7775997150997151, + "grad_norm": 0.5699490308761597, + "learning_rate": 0.00018205234795208786, + "loss": 0.9086, + "step": 4367 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 0.5908057689666748, + "learning_rate": 0.00018204434601370832, + "loss": 1.1973, + "step": 4368 + }, + { + "epoch": 0.7779558404558404, + "grad_norm": 0.5777581334114075, + "learning_rate": 0.00018203634246784025, + "loss": 1.0447, + "step": 4369 + }, + { + "epoch": 
0.7781339031339032, + "grad_norm": 0.4822927713394165, + "learning_rate": 0.00018202833731464048, + "loss": 0.814, + "step": 4370 + }, + { + "epoch": 0.7783119658119658, + "grad_norm": 0.5343610644340515, + "learning_rate": 0.0001820203305542658, + "loss": 1.2785, + "step": 4371 + }, + { + "epoch": 0.7784900284900285, + "grad_norm": 0.5462222695350647, + "learning_rate": 0.00018201232218687316, + "loss": 1.1785, + "step": 4372 + }, + { + "epoch": 0.7786680911680912, + "grad_norm": 0.5177609324455261, + "learning_rate": 0.00018200431221261943, + "loss": 1.111, + "step": 4373 + }, + { + "epoch": 0.7788461538461539, + "grad_norm": 0.5324625968933105, + "learning_rate": 0.00018199630063166157, + "loss": 1.0738, + "step": 4374 + }, + { + "epoch": 0.7790242165242165, + "grad_norm": 0.6392876505851746, + "learning_rate": 0.0001819882874441565, + "loss": 1.1758, + "step": 4375 + }, + { + "epoch": 0.7792022792022792, + "grad_norm": 0.49964696168899536, + "learning_rate": 0.00018198027265026127, + "loss": 1.0556, + "step": 4376 + }, + { + "epoch": 0.7793803418803419, + "grad_norm": 0.6090660691261292, + "learning_rate": 0.00018197225625013287, + "loss": 1.0102, + "step": 4377 + }, + { + "epoch": 0.7795584045584045, + "grad_norm": 0.5242345929145813, + "learning_rate": 0.00018196423824392842, + "loss": 0.8335, + "step": 4378 + }, + { + "epoch": 0.7797364672364673, + "grad_norm": 0.5265036225318909, + "learning_rate": 0.00018195621863180498, + "loss": 1.0781, + "step": 4379 + }, + { + "epoch": 0.7799145299145299, + "grad_norm": 0.5115378499031067, + "learning_rate": 0.0001819481974139197, + "loss": 1.1658, + "step": 4380 + }, + { + "epoch": 0.7800925925925926, + "grad_norm": 0.6489549875259399, + "learning_rate": 0.00018194017459042972, + "loss": 1.0572, + "step": 4381 + }, + { + "epoch": 0.7802706552706553, + "grad_norm": 0.5800202488899231, + "learning_rate": 0.0001819321501614922, + "loss": 0.9593, + "step": 4382 + }, + { + "epoch": 0.780448717948718, + "grad_norm": 
0.5608528256416321, + "learning_rate": 0.00018192412412726443, + "loss": 1.0324, + "step": 4383 + }, + { + "epoch": 0.7806267806267806, + "grad_norm": 0.5596401691436768, + "learning_rate": 0.00018191609648790362, + "loss": 1.071, + "step": 4384 + }, + { + "epoch": 0.7808048433048433, + "grad_norm": 0.5712903141975403, + "learning_rate": 0.00018190806724356707, + "loss": 0.9011, + "step": 4385 + }, + { + "epoch": 0.780982905982906, + "grad_norm": 0.5079438090324402, + "learning_rate": 0.0001819000363944121, + "loss": 1.1194, + "step": 4386 + }, + { + "epoch": 0.7811609686609686, + "grad_norm": 0.5785079598426819, + "learning_rate": 0.00018189200394059602, + "loss": 1.1703, + "step": 4387 + }, + { + "epoch": 0.7813390313390314, + "grad_norm": 0.6901816129684448, + "learning_rate": 0.00018188396988227625, + "loss": 1.6689, + "step": 4388 + }, + { + "epoch": 0.781517094017094, + "grad_norm": 0.48107922077178955, + "learning_rate": 0.00018187593421961022, + "loss": 1.0116, + "step": 4389 + }, + { + "epoch": 0.7816951566951567, + "grad_norm": 0.5843084454536438, + "learning_rate": 0.0001818678969527553, + "loss": 1.1172, + "step": 4390 + }, + { + "epoch": 0.7818732193732194, + "grad_norm": 0.479034423828125, + "learning_rate": 0.00018185985808186902, + "loss": 0.811, + "step": 4391 + }, + { + "epoch": 0.782051282051282, + "grad_norm": 0.5864158272743225, + "learning_rate": 0.00018185181760710888, + "loss": 0.9522, + "step": 4392 + }, + { + "epoch": 0.7822293447293447, + "grad_norm": 0.4824625551700592, + "learning_rate": 0.00018184377552863242, + "loss": 0.9039, + "step": 4393 + }, + { + "epoch": 0.7824074074074074, + "grad_norm": 0.580102801322937, + "learning_rate": 0.00018183573184659717, + "loss": 1.2382, + "step": 4394 + }, + { + "epoch": 0.7825854700854701, + "grad_norm": 0.5300056338310242, + "learning_rate": 0.00018182768656116073, + "loss": 1.2268, + "step": 4395 + }, + { + "epoch": 0.7827635327635327, + "grad_norm": 0.5548123121261597, + "learning_rate": 
0.00018181963967248078, + "loss": 1.0628, + "step": 4396 + }, + { + "epoch": 0.7829415954415955, + "grad_norm": 0.5485070943832397, + "learning_rate": 0.00018181159118071496, + "loss": 0.9628, + "step": 4397 + }, + { + "epoch": 0.7831196581196581, + "grad_norm": 0.47405415773391724, + "learning_rate": 0.00018180354108602095, + "loss": 1.1413, + "step": 4398 + }, + { + "epoch": 0.7832977207977208, + "grad_norm": 0.5545752644538879, + "learning_rate": 0.0001817954893885565, + "loss": 1.3807, + "step": 4399 + }, + { + "epoch": 0.7834757834757835, + "grad_norm": 0.5339497327804565, + "learning_rate": 0.00018178743608847933, + "loss": 0.9978, + "step": 4400 + }, + { + "epoch": 0.7836538461538461, + "grad_norm": 0.5006352663040161, + "learning_rate": 0.00018177938118594725, + "loss": 0.8873, + "step": 4401 + }, + { + "epoch": 0.7838319088319088, + "grad_norm": 0.4845179319381714, + "learning_rate": 0.00018177132468111812, + "loss": 0.8866, + "step": 4402 + }, + { + "epoch": 0.7840099715099715, + "grad_norm": 0.5240967869758606, + "learning_rate": 0.0001817632665741497, + "loss": 1.0347, + "step": 4403 + }, + { + "epoch": 0.7841880341880342, + "grad_norm": 0.5311884880065918, + "learning_rate": 0.00018175520686519993, + "loss": 1.2065, + "step": 4404 + }, + { + "epoch": 0.7843660968660968, + "grad_norm": 0.5562815070152283, + "learning_rate": 0.00018174714555442673, + "loss": 1.1272, + "step": 4405 + }, + { + "epoch": 0.7845441595441596, + "grad_norm": 0.5524366497993469, + "learning_rate": 0.00018173908264198802, + "loss": 1.2337, + "step": 4406 + }, + { + "epoch": 0.7847222222222222, + "grad_norm": 0.5612216591835022, + "learning_rate": 0.0001817310181280418, + "loss": 1.1809, + "step": 4407 + }, + { + "epoch": 0.7849002849002849, + "grad_norm": 0.5315343737602234, + "learning_rate": 0.000181722952012746, + "loss": 1.0491, + "step": 4408 + }, + { + "epoch": 0.7850783475783476, + "grad_norm": 0.5233435034751892, + "learning_rate": 0.00018171488429625878, + "loss": 
1.0457, + "step": 4409 + }, + { + "epoch": 0.7852564102564102, + "grad_norm": 0.7809093594551086, + "learning_rate": 0.00018170681497873813, + "loss": 1.1578, + "step": 4410 + }, + { + "epoch": 0.7854344729344729, + "grad_norm": 0.49659839272499084, + "learning_rate": 0.00018169874406034217, + "loss": 1.0815, + "step": 4411 + }, + { + "epoch": 0.7856125356125356, + "grad_norm": 0.5020765066146851, + "learning_rate": 0.00018169067154122904, + "loss": 1.1985, + "step": 4412 + }, + { + "epoch": 0.7857905982905983, + "grad_norm": 0.6408432126045227, + "learning_rate": 0.0001816825974215569, + "loss": 1.2272, + "step": 4413 + }, + { + "epoch": 0.7859686609686609, + "grad_norm": 0.5062605142593384, + "learning_rate": 0.00018167452170148396, + "loss": 0.9663, + "step": 4414 + }, + { + "epoch": 0.7861467236467237, + "grad_norm": 0.5100119113922119, + "learning_rate": 0.0001816664443811684, + "loss": 1.0256, + "step": 4415 + }, + { + "epoch": 0.7863247863247863, + "grad_norm": 0.5277643799781799, + "learning_rate": 0.00018165836546076854, + "loss": 1.2885, + "step": 4416 + }, + { + "epoch": 0.7865028490028491, + "grad_norm": 0.5568150281906128, + "learning_rate": 0.0001816502849404426, + "loss": 1.2673, + "step": 4417 + }, + { + "epoch": 0.7866809116809117, + "grad_norm": 0.5061392188072205, + "learning_rate": 0.00018164220282034896, + "loss": 1.072, + "step": 4418 + }, + { + "epoch": 0.7868589743589743, + "grad_norm": 0.5383077263832092, + "learning_rate": 0.00018163411910064597, + "loss": 1.0621, + "step": 4419 + }, + { + "epoch": 0.7870370370370371, + "grad_norm": 0.5167948007583618, + "learning_rate": 0.00018162603378149198, + "loss": 1.099, + "step": 4420 + }, + { + "epoch": 0.7872150997150997, + "grad_norm": 0.5084534287452698, + "learning_rate": 0.0001816179468630454, + "loss": 1.3984, + "step": 4421 + }, + { + "epoch": 0.7873931623931624, + "grad_norm": 0.608762264251709, + "learning_rate": 0.00018160985834546475, + "loss": 1.3553, + "step": 4422 + }, + { + "epoch": 
0.7875712250712251, + "grad_norm": 0.4900866746902466, + "learning_rate": 0.00018160176822890842, + "loss": 1.0009, + "step": 4423 + }, + { + "epoch": 0.7877492877492878, + "grad_norm": 0.5928917527198792, + "learning_rate": 0.00018159367651353496, + "loss": 1.0523, + "step": 4424 + }, + { + "epoch": 0.7879273504273504, + "grad_norm": 0.624422013759613, + "learning_rate": 0.0001815855831995029, + "loss": 1.0519, + "step": 4425 + }, + { + "epoch": 0.7881054131054132, + "grad_norm": 0.5140150785446167, + "learning_rate": 0.00018157748828697082, + "loss": 1.048, + "step": 4426 + }, + { + "epoch": 0.7882834757834758, + "grad_norm": 0.47006943821907043, + "learning_rate": 0.00018156939177609732, + "loss": 1.0067, + "step": 4427 + }, + { + "epoch": 0.7884615384615384, + "grad_norm": 0.5178864002227783, + "learning_rate": 0.00018156129366704105, + "loss": 1.0583, + "step": 4428 + }, + { + "epoch": 0.7886396011396012, + "grad_norm": 0.5279985666275024, + "learning_rate": 0.00018155319395996066, + "loss": 1.3023, + "step": 4429 + }, + { + "epoch": 0.7888176638176638, + "grad_norm": 0.5238787531852722, + "learning_rate": 0.00018154509265501482, + "loss": 1.0851, + "step": 4430 + }, + { + "epoch": 0.7889957264957265, + "grad_norm": 0.5914917588233948, + "learning_rate": 0.00018153698975236228, + "loss": 0.9291, + "step": 4431 + }, + { + "epoch": 0.7891737891737892, + "grad_norm": 0.5046082735061646, + "learning_rate": 0.00018152888525216183, + "loss": 0.9951, + "step": 4432 + }, + { + "epoch": 0.7893518518518519, + "grad_norm": 0.5042256116867065, + "learning_rate": 0.00018152077915457225, + "loss": 1.0243, + "step": 4433 + }, + { + "epoch": 0.7895299145299145, + "grad_norm": 0.5950339436531067, + "learning_rate": 0.0001815126714597523, + "loss": 0.9803, + "step": 4434 + }, + { + "epoch": 0.7897079772079773, + "grad_norm": 0.5163764953613281, + "learning_rate": 0.0001815045621678609, + "loss": 1.0353, + "step": 4435 + }, + { + "epoch": 0.7898860398860399, + "grad_norm": 
0.5166211128234863, + "learning_rate": 0.00018149645127905691, + "loss": 0.9649, + "step": 4436 + }, + { + "epoch": 0.7900641025641025, + "grad_norm": 0.5239769220352173, + "learning_rate": 0.00018148833879349927, + "loss": 0.9747, + "step": 4437 + }, + { + "epoch": 0.7902421652421653, + "grad_norm": 0.5803237557411194, + "learning_rate": 0.00018148022471134692, + "loss": 1.315, + "step": 4438 + }, + { + "epoch": 0.7904202279202279, + "grad_norm": 0.5141370296478271, + "learning_rate": 0.00018147210903275877, + "loss": 1.0547, + "step": 4439 + }, + { + "epoch": 0.7905982905982906, + "grad_norm": 0.545788586139679, + "learning_rate": 0.00018146399175789394, + "loss": 1.0797, + "step": 4440 + }, + { + "epoch": 0.7907763532763533, + "grad_norm": 0.5273314714431763, + "learning_rate": 0.0001814558728869114, + "loss": 0.7928, + "step": 4441 + }, + { + "epoch": 0.790954415954416, + "grad_norm": 0.4614652693271637, + "learning_rate": 0.00018144775241997024, + "loss": 0.8826, + "step": 4442 + }, + { + "epoch": 0.7911324786324786, + "grad_norm": 0.6203590631484985, + "learning_rate": 0.00018143963035722958, + "loss": 1.2891, + "step": 4443 + }, + { + "epoch": 0.7913105413105413, + "grad_norm": 0.4870408773422241, + "learning_rate": 0.0001814315066988485, + "loss": 1.0717, + "step": 4444 + }, + { + "epoch": 0.791488603988604, + "grad_norm": 0.6468982696533203, + "learning_rate": 0.00018142338144498625, + "loss": 1.3398, + "step": 4445 + }, + { + "epoch": 0.7916666666666666, + "grad_norm": 0.4727918207645416, + "learning_rate": 0.00018141525459580197, + "loss": 1.0195, + "step": 4446 + }, + { + "epoch": 0.7918447293447294, + "grad_norm": 0.5080479979515076, + "learning_rate": 0.0001814071261514549, + "loss": 1.0163, + "step": 4447 + }, + { + "epoch": 0.792022792022792, + "grad_norm": 0.5380908250808716, + "learning_rate": 0.0001813989961121043, + "loss": 1.1673, + "step": 4448 + }, + { + "epoch": 0.7922008547008547, + "grad_norm": 0.5020384192466736, + "learning_rate": 
0.00018139086447790945, + "loss": 0.8591, + "step": 4449 + }, + { + "epoch": 0.7923789173789174, + "grad_norm": 0.5279949903488159, + "learning_rate": 0.0001813827312490297, + "loss": 1.1221, + "step": 4450 + }, + { + "epoch": 0.79255698005698, + "grad_norm": 0.6739233732223511, + "learning_rate": 0.00018137459642562437, + "loss": 1.2704, + "step": 4451 + }, + { + "epoch": 0.7927350427350427, + "grad_norm": 0.5112259984016418, + "learning_rate": 0.00018136646000785288, + "loss": 1.1161, + "step": 4452 + }, + { + "epoch": 0.7929131054131054, + "grad_norm": 0.5244031548500061, + "learning_rate": 0.00018135832199587463, + "loss": 0.7866, + "step": 4453 + }, + { + "epoch": 0.7930911680911681, + "grad_norm": 0.5803347229957581, + "learning_rate": 0.0001813501823898491, + "loss": 0.994, + "step": 4454 + }, + { + "epoch": 0.7932692307692307, + "grad_norm": 0.6191152930259705, + "learning_rate": 0.00018134204118993568, + "loss": 1.0725, + "step": 4455 + }, + { + "epoch": 0.7934472934472935, + "grad_norm": 0.549735963344574, + "learning_rate": 0.00018133389839629396, + "loss": 0.9915, + "step": 4456 + }, + { + "epoch": 0.7936253561253561, + "grad_norm": 0.4940381646156311, + "learning_rate": 0.00018132575400908347, + "loss": 1.1815, + "step": 4457 + }, + { + "epoch": 0.7938034188034188, + "grad_norm": 0.5009099245071411, + "learning_rate": 0.00018131760802846377, + "loss": 1.0833, + "step": 4458 + }, + { + "epoch": 0.7939814814814815, + "grad_norm": 0.595853865146637, + "learning_rate": 0.00018130946045459445, + "loss": 1.2774, + "step": 4459 + }, + { + "epoch": 0.7941595441595442, + "grad_norm": 0.534794807434082, + "learning_rate": 0.00018130131128763513, + "loss": 1.0891, + "step": 4460 + }, + { + "epoch": 0.7943376068376068, + "grad_norm": 0.5828582048416138, + "learning_rate": 0.00018129316052774557, + "loss": 1.0786, + "step": 4461 + }, + { + "epoch": 0.7945156695156695, + "grad_norm": 0.4750654697418213, + "learning_rate": 0.00018128500817508533, + "loss": 1.0818, + 
"step": 4462 + }, + { + "epoch": 0.7946937321937322, + "grad_norm": 0.5626576542854309, + "learning_rate": 0.00018127685422981426, + "loss": 1.0807, + "step": 4463 + }, + { + "epoch": 0.7948717948717948, + "grad_norm": 0.6434760093688965, + "learning_rate": 0.00018126869869209203, + "loss": 1.0908, + "step": 4464 + }, + { + "epoch": 0.7950498575498576, + "grad_norm": 0.5577414631843567, + "learning_rate": 0.00018126054156207853, + "loss": 1.0281, + "step": 4465 + }, + { + "epoch": 0.7952279202279202, + "grad_norm": 0.5001249313354492, + "learning_rate": 0.00018125238283993347, + "loss": 0.9083, + "step": 4466 + }, + { + "epoch": 0.7954059829059829, + "grad_norm": 0.5298314690589905, + "learning_rate": 0.00018124422252581676, + "loss": 0.971, + "step": 4467 + }, + { + "epoch": 0.7955840455840456, + "grad_norm": 0.4872737228870392, + "learning_rate": 0.00018123606061988832, + "loss": 1.0515, + "step": 4468 + }, + { + "epoch": 0.7957621082621082, + "grad_norm": 0.5895398259162903, + "learning_rate": 0.00018122789712230798, + "loss": 1.0771, + "step": 4469 + }, + { + "epoch": 0.7959401709401709, + "grad_norm": 0.5212514996528625, + "learning_rate": 0.00018121973203323577, + "loss": 1.0365, + "step": 4470 + }, + { + "epoch": 0.7961182336182336, + "grad_norm": 0.4679451584815979, + "learning_rate": 0.0001812115653528316, + "loss": 0.9445, + "step": 4471 + }, + { + "epoch": 0.7962962962962963, + "grad_norm": 0.5852653980255127, + "learning_rate": 0.00018120339708125552, + "loss": 1.1781, + "step": 4472 + }, + { + "epoch": 0.7964743589743589, + "grad_norm": 0.6081342697143555, + "learning_rate": 0.00018119522721866756, + "loss": 1.3881, + "step": 4473 + }, + { + "epoch": 0.7966524216524217, + "grad_norm": 0.5254155993461609, + "learning_rate": 0.00018118705576522777, + "loss": 1.2198, + "step": 4474 + }, + { + "epoch": 0.7968304843304843, + "grad_norm": 0.5959419012069702, + "learning_rate": 0.00018117888272109632, + "loss": 1.0922, + "step": 4475 + }, + { + "epoch": 
0.7970085470085471, + "grad_norm": 0.6243147253990173, + "learning_rate": 0.0001811707080864333, + "loss": 1.1782, + "step": 4476 + }, + { + "epoch": 0.7971866096866097, + "grad_norm": 0.5336906909942627, + "learning_rate": 0.0001811625318613988, + "loss": 1.167, + "step": 4477 + }, + { + "epoch": 0.7973646723646723, + "grad_norm": 0.5287907719612122, + "learning_rate": 0.00018115435404615315, + "loss": 0.9923, + "step": 4478 + }, + { + "epoch": 0.7975427350427351, + "grad_norm": 0.48941442370414734, + "learning_rate": 0.0001811461746408565, + "loss": 0.863, + "step": 4479 + }, + { + "epoch": 0.7977207977207977, + "grad_norm": 0.48465651273727417, + "learning_rate": 0.0001811379936456691, + "loss": 1.147, + "step": 4480 + }, + { + "epoch": 0.7978988603988604, + "grad_norm": 0.5676067471504211, + "learning_rate": 0.0001811298110607513, + "loss": 1.3121, + "step": 4481 + }, + { + "epoch": 0.7980769230769231, + "grad_norm": 0.4894018769264221, + "learning_rate": 0.00018112162688626337, + "loss": 1.1831, + "step": 4482 + }, + { + "epoch": 0.7982549857549858, + "grad_norm": 0.5626382827758789, + "learning_rate": 0.0001811134411223657, + "loss": 1.1977, + "step": 4483 + }, + { + "epoch": 0.7984330484330484, + "grad_norm": 0.564119815826416, + "learning_rate": 0.00018110525376921862, + "loss": 1.2686, + "step": 4484 + }, + { + "epoch": 0.7986111111111112, + "grad_norm": 0.6385740041732788, + "learning_rate": 0.00018109706482698256, + "loss": 1.2418, + "step": 4485 + }, + { + "epoch": 0.7987891737891738, + "grad_norm": 0.5550164580345154, + "learning_rate": 0.00018108887429581802, + "loss": 1.081, + "step": 4486 + }, + { + "epoch": 0.7989672364672364, + "grad_norm": 0.5583973526954651, + "learning_rate": 0.00018108068217588544, + "loss": 1.1757, + "step": 4487 + }, + { + "epoch": 0.7991452991452992, + "grad_norm": 0.5533342957496643, + "learning_rate": 0.00018107248846734527, + "loss": 1.1947, + "step": 4488 + }, + { + "epoch": 0.7993233618233618, + "grad_norm": 
0.5291479229927063, + "learning_rate": 0.00018106429317035815, + "loss": 1.2769, + "step": 4489 + }, + { + "epoch": 0.7995014245014245, + "grad_norm": 0.4680160582065582, + "learning_rate": 0.00018105609628508458, + "loss": 0.7059, + "step": 4490 + }, + { + "epoch": 0.7996794871794872, + "grad_norm": 0.5364881157875061, + "learning_rate": 0.00018104789781168517, + "loss": 1.0566, + "step": 4491 + }, + { + "epoch": 0.7998575498575499, + "grad_norm": 0.5917307734489441, + "learning_rate": 0.0001810396977503206, + "loss": 1.2263, + "step": 4492 + }, + { + "epoch": 0.8000356125356125, + "grad_norm": 0.6013199090957642, + "learning_rate": 0.0001810314961011515, + "loss": 1.2053, + "step": 4493 + }, + { + "epoch": 0.8002136752136753, + "grad_norm": 0.6005663275718689, + "learning_rate": 0.0001810232928643385, + "loss": 1.2241, + "step": 4494 + }, + { + "epoch": 0.8003917378917379, + "grad_norm": 0.49207603931427, + "learning_rate": 0.00018101508804004246, + "loss": 1.0661, + "step": 4495 + }, + { + "epoch": 0.8005698005698005, + "grad_norm": 0.4834063947200775, + "learning_rate": 0.00018100688162842401, + "loss": 1.1745, + "step": 4496 + }, + { + "epoch": 0.8007478632478633, + "grad_norm": 0.5347156524658203, + "learning_rate": 0.000180998673629644, + "loss": 1.0679, + "step": 4497 + }, + { + "epoch": 0.8009259259259259, + "grad_norm": 0.5815600156784058, + "learning_rate": 0.00018099046404386327, + "loss": 1.2652, + "step": 4498 + }, + { + "epoch": 0.8011039886039886, + "grad_norm": 0.5291135311126709, + "learning_rate": 0.00018098225287124263, + "loss": 1.2072, + "step": 4499 + }, + { + "epoch": 0.8012820512820513, + "grad_norm": 0.5779497027397156, + "learning_rate": 0.000180974040111943, + "loss": 1.3277, + "step": 4500 + }, + { + "epoch": 0.801460113960114, + "grad_norm": 0.44566696882247925, + "learning_rate": 0.0001809658257661252, + "loss": 0.7702, + "step": 4501 + }, + { + "epoch": 0.8016381766381766, + "grad_norm": 0.5407577753067017, + "learning_rate": 
0.00018095760983395027, + "loss": 1.2894, + "step": 4502 + }, + { + "epoch": 0.8018162393162394, + "grad_norm": 0.4771903455257416, + "learning_rate": 0.00018094939231557916, + "loss": 1.045, + "step": 4503 + }, + { + "epoch": 0.801994301994302, + "grad_norm": 0.5970945358276367, + "learning_rate": 0.00018094117321117286, + "loss": 1.2059, + "step": 4504 + }, + { + "epoch": 0.8021723646723646, + "grad_norm": 0.4959338903427124, + "learning_rate": 0.0001809329525208924, + "loss": 1.155, + "step": 4505 + }, + { + "epoch": 0.8023504273504274, + "grad_norm": 0.5142548084259033, + "learning_rate": 0.00018092473024489887, + "loss": 0.9413, + "step": 4506 + }, + { + "epoch": 0.80252849002849, + "grad_norm": 0.5336433053016663, + "learning_rate": 0.00018091650638335334, + "loss": 1.0699, + "step": 4507 + }, + { + "epoch": 0.8027065527065527, + "grad_norm": 0.47770628333091736, + "learning_rate": 0.00018090828093641698, + "loss": 1.1515, + "step": 4508 + }, + { + "epoch": 0.8028846153846154, + "grad_norm": 0.5443438291549683, + "learning_rate": 0.00018090005390425091, + "loss": 1.189, + "step": 4509 + }, + { + "epoch": 0.8030626780626781, + "grad_norm": 0.523179829120636, + "learning_rate": 0.00018089182528701632, + "loss": 1.1272, + "step": 4510 + }, + { + "epoch": 0.8032407407407407, + "grad_norm": 0.49628451466560364, + "learning_rate": 0.00018088359508487448, + "loss": 0.9754, + "step": 4511 + }, + { + "epoch": 0.8034188034188035, + "grad_norm": 0.5933086276054382, + "learning_rate": 0.00018087536329798663, + "loss": 1.2111, + "step": 4512 + }, + { + "epoch": 0.8035968660968661, + "grad_norm": 0.4565310776233673, + "learning_rate": 0.00018086712992651402, + "loss": 0.7729, + "step": 4513 + }, + { + "epoch": 0.8037749287749287, + "grad_norm": 0.5013461112976074, + "learning_rate": 0.00018085889497061798, + "loss": 1.2178, + "step": 4514 + }, + { + "epoch": 0.8039529914529915, + "grad_norm": 0.5170024633407593, + "learning_rate": 0.00018085065843045987, + "loss": 0.9181, 
+ "step": 4515 + }, + { + "epoch": 0.8041310541310541, + "grad_norm": 0.583363950252533, + "learning_rate": 0.00018084242030620104, + "loss": 1.1542, + "step": 4516 + }, + { + "epoch": 0.8043091168091168, + "grad_norm": 0.46835777163505554, + "learning_rate": 0.00018083418059800297, + "loss": 0.8954, + "step": 4517 + }, + { + "epoch": 0.8044871794871795, + "grad_norm": 0.5145657062530518, + "learning_rate": 0.000180825939306027, + "loss": 1.0417, + "step": 4518 + }, + { + "epoch": 0.8046652421652422, + "grad_norm": 0.47216105461120605, + "learning_rate": 0.00018081769643043467, + "loss": 0.9516, + "step": 4519 + }, + { + "epoch": 0.8048433048433048, + "grad_norm": 0.5059915781021118, + "learning_rate": 0.0001808094519713875, + "loss": 1.1643, + "step": 4520 + }, + { + "epoch": 0.8050213675213675, + "grad_norm": 0.5406439900398254, + "learning_rate": 0.00018080120592904692, + "loss": 1.2038, + "step": 4521 + }, + { + "epoch": 0.8051994301994302, + "grad_norm": 0.6123420000076294, + "learning_rate": 0.0001807929583035746, + "loss": 1.4004, + "step": 4522 + }, + { + "epoch": 0.8053774928774928, + "grad_norm": 0.49699845910072327, + "learning_rate": 0.00018078470909513208, + "loss": 1.0347, + "step": 4523 + }, + { + "epoch": 0.8055555555555556, + "grad_norm": 0.5369421243667603, + "learning_rate": 0.000180776458303881, + "loss": 1.0418, + "step": 4524 + }, + { + "epoch": 0.8057336182336182, + "grad_norm": 0.5407396554946899, + "learning_rate": 0.00018076820592998301, + "loss": 0.9546, + "step": 4525 + }, + { + "epoch": 0.8059116809116809, + "grad_norm": 0.5749752521514893, + "learning_rate": 0.00018075995197359984, + "loss": 1.1438, + "step": 4526 + }, + { + "epoch": 0.8060897435897436, + "grad_norm": 0.5523102283477783, + "learning_rate": 0.00018075169643489317, + "loss": 1.1312, + "step": 4527 + }, + { + "epoch": 0.8062678062678063, + "grad_norm": 0.5767508149147034, + "learning_rate": 0.00018074343931402472, + "loss": 1.1951, + "step": 4528 + }, + { + "epoch": 
0.8064458689458689, + "grad_norm": 0.5262924432754517, + "learning_rate": 0.00018073518061115633, + "loss": 1.1985, + "step": 4529 + }, + { + "epoch": 0.8066239316239316, + "grad_norm": 0.4742378294467926, + "learning_rate": 0.0001807269203264498, + "loss": 1.0126, + "step": 4530 + }, + { + "epoch": 0.8068019943019943, + "grad_norm": 0.5190158486366272, + "learning_rate": 0.00018071865846006692, + "loss": 0.9985, + "step": 4531 + }, + { + "epoch": 0.8069800569800569, + "grad_norm": 0.5910618305206299, + "learning_rate": 0.00018071039501216964, + "loss": 1.2776, + "step": 4532 + }, + { + "epoch": 0.8071581196581197, + "grad_norm": 0.5363098382949829, + "learning_rate": 0.00018070212998291983, + "loss": 1.3346, + "step": 4533 + }, + { + "epoch": 0.8073361823361823, + "grad_norm": 0.47711408138275146, + "learning_rate": 0.0001806938633724794, + "loss": 1.04, + "step": 4534 + }, + { + "epoch": 0.8075142450142451, + "grad_norm": 0.5092964172363281, + "learning_rate": 0.0001806855951810104, + "loss": 1.1409, + "step": 4535 + }, + { + "epoch": 0.8076923076923077, + "grad_norm": 0.5828777551651001, + "learning_rate": 0.00018067732540867472, + "loss": 1.3048, + "step": 4536 + }, + { + "epoch": 0.8078703703703703, + "grad_norm": 0.5779826045036316, + "learning_rate": 0.00018066905405563445, + "loss": 1.1599, + "step": 4537 + }, + { + "epoch": 0.8080484330484331, + "grad_norm": 0.49908435344696045, + "learning_rate": 0.00018066078112205167, + "loss": 1.1502, + "step": 4538 + }, + { + "epoch": 0.8082264957264957, + "grad_norm": 0.4772704839706421, + "learning_rate": 0.0001806525066080884, + "loss": 0.7925, + "step": 4539 + }, + { + "epoch": 0.8084045584045584, + "grad_norm": 0.4298383295536041, + "learning_rate": 0.00018064423051390683, + "loss": 0.7322, + "step": 4540 + }, + { + "epoch": 0.8085826210826211, + "grad_norm": 0.49349579215049744, + "learning_rate": 0.0001806359528396691, + "loss": 1.0021, + "step": 4541 + }, + { + "epoch": 0.8087606837606838, + "grad_norm": 
0.4698609411716461, + "learning_rate": 0.00018062767358553735, + "loss": 0.9751, + "step": 4542 + }, + { + "epoch": 0.8089387464387464, + "grad_norm": 0.4949014186859131, + "learning_rate": 0.00018061939275167385, + "loss": 0.9553, + "step": 4543 + }, + { + "epoch": 0.8091168091168092, + "grad_norm": 0.5604463815689087, + "learning_rate": 0.0001806111103382408, + "loss": 0.9894, + "step": 4544 + }, + { + "epoch": 0.8092948717948718, + "grad_norm": 0.5761561989784241, + "learning_rate": 0.00018060282634540053, + "loss": 1.258, + "step": 4545 + }, + { + "epoch": 0.8094729344729344, + "grad_norm": 0.5239115357398987, + "learning_rate": 0.00018059454077331527, + "loss": 0.9189, + "step": 4546 + }, + { + "epoch": 0.8096509971509972, + "grad_norm": 0.47902220487594604, + "learning_rate": 0.00018058625362214742, + "loss": 1.0389, + "step": 4547 + }, + { + "epoch": 0.8098290598290598, + "grad_norm": 0.6274173259735107, + "learning_rate": 0.00018057796489205936, + "loss": 1.3368, + "step": 4548 + }, + { + "epoch": 0.8100071225071225, + "grad_norm": 0.5789401531219482, + "learning_rate": 0.00018056967458321345, + "loss": 1.1473, + "step": 4549 + }, + { + "epoch": 0.8101851851851852, + "grad_norm": 0.5850043296813965, + "learning_rate": 0.0001805613826957721, + "loss": 1.2224, + "step": 4550 + }, + { + "epoch": 0.8103632478632479, + "grad_norm": 0.6310738921165466, + "learning_rate": 0.00018055308922989788, + "loss": 1.0707, + "step": 4551 + }, + { + "epoch": 0.8105413105413105, + "grad_norm": 0.5198429822921753, + "learning_rate": 0.00018054479418575317, + "loss": 0.8984, + "step": 4552 + }, + { + "epoch": 0.8107193732193733, + "grad_norm": 0.5757743120193481, + "learning_rate": 0.00018053649756350054, + "loss": 1.2007, + "step": 4553 + }, + { + "epoch": 0.8108974358974359, + "grad_norm": 0.5109567642211914, + "learning_rate": 0.0001805281993633025, + "loss": 1.0696, + "step": 4554 + }, + { + "epoch": 0.8110754985754985, + "grad_norm": 0.5030225515365601, + "learning_rate": 
0.00018051989958532173, + "loss": 0.9667, + "step": 4555 + }, + { + "epoch": 0.8112535612535613, + "grad_norm": 0.5291743874549866, + "learning_rate": 0.00018051159822972079, + "loss": 1.0219, + "step": 4556 + }, + { + "epoch": 0.8114316239316239, + "grad_norm": 0.5874896049499512, + "learning_rate": 0.00018050329529666233, + "loss": 0.8589, + "step": 4557 + }, + { + "epoch": 0.8116096866096866, + "grad_norm": 0.673284113407135, + "learning_rate": 0.000180494990786309, + "loss": 1.1902, + "step": 4558 + }, + { + "epoch": 0.8117877492877493, + "grad_norm": 0.4742524027824402, + "learning_rate": 0.00018048668469882354, + "loss": 1.0578, + "step": 4559 + }, + { + "epoch": 0.811965811965812, + "grad_norm": 0.5519167184829712, + "learning_rate": 0.0001804783770343687, + "loss": 1.083, + "step": 4560 + }, + { + "epoch": 0.8121438746438746, + "grad_norm": 0.5669941306114197, + "learning_rate": 0.00018047006779310727, + "loss": 1.0784, + "step": 4561 + }, + { + "epoch": 0.8123219373219374, + "grad_norm": 0.512759804725647, + "learning_rate": 0.000180461756975202, + "loss": 1.0361, + "step": 4562 + }, + { + "epoch": 0.8125, + "grad_norm": 0.5721749067306519, + "learning_rate": 0.00018045344458081575, + "loss": 1.0246, + "step": 4563 + }, + { + "epoch": 0.8126780626780626, + "grad_norm": 0.566430389881134, + "learning_rate": 0.00018044513061011137, + "loss": 1.1452, + "step": 4564 + }, + { + "epoch": 0.8128561253561254, + "grad_norm": 0.49391916394233704, + "learning_rate": 0.00018043681506325177, + "loss": 0.89, + "step": 4565 + }, + { + "epoch": 0.813034188034188, + "grad_norm": 0.5379437804222107, + "learning_rate": 0.00018042849794039988, + "loss": 1.1289, + "step": 4566 + }, + { + "epoch": 0.8132122507122507, + "grad_norm": 0.5667982697486877, + "learning_rate": 0.00018042017924171865, + "loss": 1.1596, + "step": 4567 + }, + { + "epoch": 0.8133903133903134, + "grad_norm": 0.6214209794998169, + "learning_rate": 0.00018041185896737109, + "loss": 1.0622, + "step": 4568 + 
}, + { + "epoch": 0.8135683760683761, + "grad_norm": 0.5442491173744202, + "learning_rate": 0.00018040353711752015, + "loss": 1.0536, + "step": 4569 + }, + { + "epoch": 0.8137464387464387, + "grad_norm": 0.5266172885894775, + "learning_rate": 0.00018039521369232894, + "loss": 1.0576, + "step": 4570 + }, + { + "epoch": 0.8139245014245015, + "grad_norm": 0.6057912111282349, + "learning_rate": 0.00018038688869196053, + "loss": 1.3067, + "step": 4571 + }, + { + "epoch": 0.8141025641025641, + "grad_norm": 0.489869087934494, + "learning_rate": 0.00018037856211657803, + "loss": 1.0279, + "step": 4572 + }, + { + "epoch": 0.8142806267806267, + "grad_norm": 0.5497978329658508, + "learning_rate": 0.00018037023396634457, + "loss": 1.1568, + "step": 4573 + }, + { + "epoch": 0.8144586894586895, + "grad_norm": 0.5243251919746399, + "learning_rate": 0.0001803619042414233, + "loss": 0.9767, + "step": 4574 + }, + { + "epoch": 0.8146367521367521, + "grad_norm": 0.503032922744751, + "learning_rate": 0.0001803535729419775, + "loss": 1.065, + "step": 4575 + }, + { + "epoch": 0.8148148148148148, + "grad_norm": 0.49955418705940247, + "learning_rate": 0.00018034524006817034, + "loss": 1.2752, + "step": 4576 + }, + { + "epoch": 0.8149928774928775, + "grad_norm": 0.5746406316757202, + "learning_rate": 0.00018033690562016508, + "loss": 1.098, + "step": 4577 + }, + { + "epoch": 0.8151709401709402, + "grad_norm": 0.5224192142486572, + "learning_rate": 0.00018032856959812507, + "loss": 1.1284, + "step": 4578 + }, + { + "epoch": 0.8153490028490028, + "grad_norm": 0.5484535694122314, + "learning_rate": 0.00018032023200221362, + "loss": 0.9182, + "step": 4579 + }, + { + "epoch": 0.8155270655270656, + "grad_norm": 0.5003355741500854, + "learning_rate": 0.00018031189283259405, + "loss": 1.136, + "step": 4580 + }, + { + "epoch": 0.8157051282051282, + "grad_norm": 0.5395768284797668, + "learning_rate": 0.00018030355208942977, + "loss": 1.2349, + "step": 4581 + }, + { + "epoch": 0.8158831908831908, + 
"grad_norm": 0.561966598033905, + "learning_rate": 0.0001802952097728842, + "loss": 0.999, + "step": 4582 + }, + { + "epoch": 0.8160612535612536, + "grad_norm": 0.4886479675769806, + "learning_rate": 0.00018028686588312083, + "loss": 0.9165, + "step": 4583 + }, + { + "epoch": 0.8162393162393162, + "grad_norm": 0.4769509732723236, + "learning_rate": 0.00018027852042030307, + "loss": 1.1377, + "step": 4584 + }, + { + "epoch": 0.8164173789173789, + "grad_norm": 0.4723633825778961, + "learning_rate": 0.00018027017338459448, + "loss": 1.0274, + "step": 4585 + }, + { + "epoch": 0.8165954415954416, + "grad_norm": 0.5773285627365112, + "learning_rate": 0.00018026182477615859, + "loss": 1.1468, + "step": 4586 + }, + { + "epoch": 0.8167735042735043, + "grad_norm": 0.5529203414916992, + "learning_rate": 0.00018025347459515895, + "loss": 1.0815, + "step": 4587 + }, + { + "epoch": 0.8169515669515669, + "grad_norm": 0.5449469685554504, + "learning_rate": 0.00018024512284175922, + "loss": 1.1637, + "step": 4588 + }, + { + "epoch": 0.8171296296296297, + "grad_norm": 0.5155341625213623, + "learning_rate": 0.00018023676951612298, + "loss": 1.1842, + "step": 4589 + }, + { + "epoch": 0.8173076923076923, + "grad_norm": 0.5569564700126648, + "learning_rate": 0.00018022841461841393, + "loss": 0.9254, + "step": 4590 + }, + { + "epoch": 0.8174857549857549, + "grad_norm": 0.45203131437301636, + "learning_rate": 0.00018022005814879573, + "loss": 0.9561, + "step": 4591 + }, + { + "epoch": 0.8176638176638177, + "grad_norm": 0.5735056400299072, + "learning_rate": 0.00018021170010743218, + "loss": 1.1402, + "step": 4592 + }, + { + "epoch": 0.8178418803418803, + "grad_norm": 0.6075260043144226, + "learning_rate": 0.00018020334049448697, + "loss": 0.8601, + "step": 4593 + }, + { + "epoch": 0.8180199430199431, + "grad_norm": 0.522682785987854, + "learning_rate": 0.0001801949793101239, + "loss": 1.0088, + "step": 4594 + }, + { + "epoch": 0.8181980056980057, + "grad_norm": 0.5648437142372131, + 
"learning_rate": 0.00018018661655450682, + "loss": 0.8359, + "step": 4595 + }, + { + "epoch": 0.8183760683760684, + "grad_norm": 0.5406472086906433, + "learning_rate": 0.00018017825222779954, + "loss": 1.1553, + "step": 4596 + }, + { + "epoch": 0.8185541310541311, + "grad_norm": 0.4917788803577423, + "learning_rate": 0.000180169886330166, + "loss": 1.2198, + "step": 4597 + }, + { + "epoch": 0.8187321937321937, + "grad_norm": 0.6293069124221802, + "learning_rate": 0.00018016151886177004, + "loss": 1.0245, + "step": 4598 + }, + { + "epoch": 0.8189102564102564, + "grad_norm": 0.47277843952178955, + "learning_rate": 0.00018015314982277564, + "loss": 1.1141, + "step": 4599 + }, + { + "epoch": 0.8190883190883191, + "grad_norm": 0.6132395267486572, + "learning_rate": 0.0001801447792133468, + "loss": 1.1227, + "step": 4600 + }, + { + "epoch": 0.8192663817663818, + "grad_norm": 0.46839597821235657, + "learning_rate": 0.00018013640703364747, + "loss": 0.9239, + "step": 4601 + }, + { + "epoch": 0.8194444444444444, + "grad_norm": 0.5055009722709656, + "learning_rate": 0.00018012803328384171, + "loss": 0.8486, + "step": 4602 + }, + { + "epoch": 0.8196225071225072, + "grad_norm": 0.5094841718673706, + "learning_rate": 0.00018011965796409362, + "loss": 0.9969, + "step": 4603 + }, + { + "epoch": 0.8198005698005698, + "grad_norm": 0.6177363395690918, + "learning_rate": 0.00018011128107456726, + "loss": 1.242, + "step": 4604 + }, + { + "epoch": 0.8199786324786325, + "grad_norm": 0.5280042290687561, + "learning_rate": 0.00018010290261542676, + "loss": 1.1569, + "step": 4605 + }, + { + "epoch": 0.8201566951566952, + "grad_norm": 0.5259367227554321, + "learning_rate": 0.00018009452258683625, + "loss": 0.9993, + "step": 4606 + }, + { + "epoch": 0.8203347578347578, + "grad_norm": 0.464469850063324, + "learning_rate": 0.00018008614098896, + "loss": 1.0288, + "step": 4607 + }, + { + "epoch": 0.8205128205128205, + "grad_norm": 0.6136324405670166, + "learning_rate": 0.00018007775782196214, + 
"loss": 1.1541, + "step": 4608 + }, + { + "epoch": 0.8206908831908832, + "grad_norm": 0.5376590490341187, + "learning_rate": 0.000180069373086007, + "loss": 1.0624, + "step": 4609 + }, + { + "epoch": 0.8208689458689459, + "grad_norm": 0.662916362285614, + "learning_rate": 0.0001800609867812588, + "loss": 1.1502, + "step": 4610 + }, + { + "epoch": 0.8210470085470085, + "grad_norm": 0.5153383612632751, + "learning_rate": 0.00018005259890788188, + "loss": 0.9789, + "step": 4611 + }, + { + "epoch": 0.8212250712250713, + "grad_norm": 0.5042359232902527, + "learning_rate": 0.00018004420946604057, + "loss": 0.9585, + "step": 4612 + }, + { + "epoch": 0.8214031339031339, + "grad_norm": 0.5395993590354919, + "learning_rate": 0.00018003581845589927, + "loss": 1.159, + "step": 4613 + }, + { + "epoch": 0.8215811965811965, + "grad_norm": 0.5561928749084473, + "learning_rate": 0.00018002742587762237, + "loss": 1.1604, + "step": 4614 + }, + { + "epoch": 0.8217592592592593, + "grad_norm": 0.5602710843086243, + "learning_rate": 0.00018001903173137432, + "loss": 0.9922, + "step": 4615 + }, + { + "epoch": 0.8219373219373219, + "grad_norm": 0.5529088377952576, + "learning_rate": 0.00018001063601731955, + "loss": 1.0943, + "step": 4616 + }, + { + "epoch": 0.8221153846153846, + "grad_norm": 0.5156456828117371, + "learning_rate": 0.00018000223873562254, + "loss": 1.1399, + "step": 4617 + }, + { + "epoch": 0.8222934472934473, + "grad_norm": 0.4868306517601013, + "learning_rate": 0.0001799938398864479, + "loss": 1.0692, + "step": 4618 + }, + { + "epoch": 0.82247150997151, + "grad_norm": 0.5372915267944336, + "learning_rate": 0.0001799854394699601, + "loss": 1.2675, + "step": 4619 + }, + { + "epoch": 0.8226495726495726, + "grad_norm": 0.6101839542388916, + "learning_rate": 0.0001799770374863238, + "loss": 0.9586, + "step": 4620 + }, + { + "epoch": 0.8228276353276354, + "grad_norm": 0.5034586787223816, + "learning_rate": 0.00017996863393570357, + "loss": 1.0885, + "step": 4621 + }, + { + 
"epoch": 0.823005698005698, + "grad_norm": 0.5608823299407959, + "learning_rate": 0.0001799602288182641, + "loss": 1.0002, + "step": 4622 + }, + { + "epoch": 0.8231837606837606, + "grad_norm": 0.5700048208236694, + "learning_rate": 0.00017995182213417, + "loss": 1.1484, + "step": 4623 + }, + { + "epoch": 0.8233618233618234, + "grad_norm": 0.5283229351043701, + "learning_rate": 0.00017994341388358608, + "loss": 1.0744, + "step": 4624 + }, + { + "epoch": 0.823539886039886, + "grad_norm": 0.5215758681297302, + "learning_rate": 0.00017993500406667703, + "loss": 1.2686, + "step": 4625 + }, + { + "epoch": 0.8237179487179487, + "grad_norm": 0.528883159160614, + "learning_rate": 0.0001799265926836076, + "loss": 1.1393, + "step": 4626 + }, + { + "epoch": 0.8238960113960114, + "grad_norm": 0.5589834451675415, + "learning_rate": 0.00017991817973454265, + "loss": 1.1744, + "step": 4627 + }, + { + "epoch": 0.8240740740740741, + "grad_norm": 0.49817174673080444, + "learning_rate": 0.00017990976521964697, + "loss": 1.0544, + "step": 4628 + }, + { + "epoch": 0.8242521367521367, + "grad_norm": 0.613961398601532, + "learning_rate": 0.00017990134913908542, + "loss": 1.0951, + "step": 4629 + }, + { + "epoch": 0.8244301994301995, + "grad_norm": 0.47278255224227905, + "learning_rate": 0.00017989293149302295, + "loss": 0.9742, + "step": 4630 + }, + { + "epoch": 0.8246082621082621, + "grad_norm": 0.49807092547416687, + "learning_rate": 0.00017988451228162443, + "loss": 1.0985, + "step": 4631 + }, + { + "epoch": 0.8247863247863247, + "grad_norm": 0.5624374747276306, + "learning_rate": 0.00017987609150505485, + "loss": 1.2446, + "step": 4632 + }, + { + "epoch": 0.8249643874643875, + "grad_norm": 0.4863535761833191, + "learning_rate": 0.00017986766916347916, + "loss": 1.0239, + "step": 4633 + }, + { + "epoch": 0.8251424501424501, + "grad_norm": 0.679585874080658, + "learning_rate": 0.00017985924525706245, + "loss": 1.1698, + "step": 4634 + }, + { + "epoch": 0.8253205128205128, + "grad_norm": 
0.5545455813407898, + "learning_rate": 0.00017985081978596967, + "loss": 1.0926, + "step": 4635 + }, + { + "epoch": 0.8254985754985755, + "grad_norm": 0.5303109288215637, + "learning_rate": 0.000179842392750366, + "loss": 1.0978, + "step": 4636 + }, + { + "epoch": 0.8256766381766382, + "grad_norm": 0.6053299307823181, + "learning_rate": 0.00017983396415041644, + "loss": 1.0596, + "step": 4637 + }, + { + "epoch": 0.8258547008547008, + "grad_norm": 0.5241885185241699, + "learning_rate": 0.00017982553398628625, + "loss": 0.8541, + "step": 4638 + }, + { + "epoch": 0.8260327635327636, + "grad_norm": 0.5934443473815918, + "learning_rate": 0.00017981710225814052, + "loss": 1.145, + "step": 4639 + }, + { + "epoch": 0.8262108262108262, + "grad_norm": 0.5341619849205017, + "learning_rate": 0.00017980866896614447, + "loss": 1.0745, + "step": 4640 + }, + { + "epoch": 0.8263888888888888, + "grad_norm": 0.6732913851737976, + "learning_rate": 0.00017980023411046336, + "loss": 1.0775, + "step": 4641 + }, + { + "epoch": 0.8265669515669516, + "grad_norm": 0.5134359002113342, + "learning_rate": 0.0001797917976912624, + "loss": 1.0298, + "step": 4642 + }, + { + "epoch": 0.8267450142450142, + "grad_norm": 0.5234783887863159, + "learning_rate": 0.00017978335970870698, + "loss": 1.1069, + "step": 4643 + }, + { + "epoch": 0.8269230769230769, + "grad_norm": 0.4776439964771271, + "learning_rate": 0.00017977492016296232, + "loss": 0.6367, + "step": 4644 + }, + { + "epoch": 0.8271011396011396, + "grad_norm": 0.53763347864151, + "learning_rate": 0.0001797664790541938, + "loss": 1.1356, + "step": 4645 + }, + { + "epoch": 0.8272792022792023, + "grad_norm": 0.5082212686538696, + "learning_rate": 0.00017975803638256682, + "loss": 0.7873, + "step": 4646 + }, + { + "epoch": 0.8274572649572649, + "grad_norm": 0.5156424641609192, + "learning_rate": 0.00017974959214824685, + "loss": 1.084, + "step": 4647 + }, + { + "epoch": 0.8276353276353277, + "grad_norm": 0.5275198817253113, + "learning_rate": 
0.00017974114635139926, + "loss": 1.1219, + "step": 4648 + }, + { + "epoch": 0.8278133903133903, + "grad_norm": 0.5548223257064819, + "learning_rate": 0.00017973269899218956, + "loss": 1.0808, + "step": 4649 + }, + { + "epoch": 0.8279914529914529, + "grad_norm": 0.535347580909729, + "learning_rate": 0.00017972425007078323, + "loss": 1.1211, + "step": 4650 + }, + { + "epoch": 0.8281695156695157, + "grad_norm": 0.5299580693244934, + "learning_rate": 0.00017971579958734587, + "loss": 0.9911, + "step": 4651 + }, + { + "epoch": 0.8283475783475783, + "grad_norm": 0.4863550066947937, + "learning_rate": 0.000179707347542043, + "loss": 0.9122, + "step": 4652 + }, + { + "epoch": 0.8285256410256411, + "grad_norm": 0.5284972190856934, + "learning_rate": 0.00017969889393504022, + "loss": 1.0424, + "step": 4653 + }, + { + "epoch": 0.8287037037037037, + "grad_norm": 0.5305661559104919, + "learning_rate": 0.00017969043876650317, + "loss": 1.1122, + "step": 4654 + }, + { + "epoch": 0.8288817663817664, + "grad_norm": 0.5645657777786255, + "learning_rate": 0.00017968198203659755, + "loss": 1.2195, + "step": 4655 + }, + { + "epoch": 0.8290598290598291, + "grad_norm": 0.521649181842804, + "learning_rate": 0.000179673523745489, + "loss": 1.2684, + "step": 4656 + }, + { + "epoch": 0.8292378917378918, + "grad_norm": 0.5984422564506531, + "learning_rate": 0.00017966506389334322, + "loss": 0.9894, + "step": 4657 + }, + { + "epoch": 0.8294159544159544, + "grad_norm": 0.5318729281425476, + "learning_rate": 0.00017965660248032603, + "loss": 1.2929, + "step": 4658 + }, + { + "epoch": 0.8295940170940171, + "grad_norm": 0.4666081368923187, + "learning_rate": 0.0001796481395066032, + "loss": 0.9646, + "step": 4659 + }, + { + "epoch": 0.8297720797720798, + "grad_norm": 0.5780388116836548, + "learning_rate": 0.00017963967497234054, + "loss": 1.1043, + "step": 4660 + }, + { + "epoch": 0.8299501424501424, + "grad_norm": 0.44089245796203613, + "learning_rate": 0.00017963120887770387, + "loss": 0.8932, 
+ "step": 4661 + }, + { + "epoch": 0.8301282051282052, + "grad_norm": 0.5198349356651306, + "learning_rate": 0.0001796227412228591, + "loss": 0.9378, + "step": 4662 + }, + { + "epoch": 0.8303062678062678, + "grad_norm": 0.5298343896865845, + "learning_rate": 0.00017961427200797206, + "loss": 1.0272, + "step": 4663 + }, + { + "epoch": 0.8304843304843305, + "grad_norm": 0.5087099671363831, + "learning_rate": 0.0001796058012332088, + "loss": 0.989, + "step": 4664 + }, + { + "epoch": 0.8306623931623932, + "grad_norm": 0.504228949546814, + "learning_rate": 0.0001795973288987352, + "loss": 1.0134, + "step": 4665 + }, + { + "epoch": 0.8308404558404558, + "grad_norm": 0.6788033843040466, + "learning_rate": 0.00017958885500471728, + "loss": 0.8856, + "step": 4666 + }, + { + "epoch": 0.8310185185185185, + "grad_norm": 0.5166172385215759, + "learning_rate": 0.00017958037955132113, + "loss": 0.8711, + "step": 4667 + }, + { + "epoch": 0.8311965811965812, + "grad_norm": 0.5712400078773499, + "learning_rate": 0.00017957190253871272, + "loss": 1.0418, + "step": 4668 + }, + { + "epoch": 0.8313746438746439, + "grad_norm": 0.5531231164932251, + "learning_rate": 0.0001795634239670582, + "loss": 0.9021, + "step": 4669 + }, + { + "epoch": 0.8315527065527065, + "grad_norm": 0.6165615916252136, + "learning_rate": 0.00017955494383652365, + "loss": 1.0927, + "step": 4670 + }, + { + "epoch": 0.8317307692307693, + "grad_norm": 0.5920368432998657, + "learning_rate": 0.00017954646214727525, + "loss": 1.231, + "step": 4671 + }, + { + "epoch": 0.8319088319088319, + "grad_norm": 0.5037244558334351, + "learning_rate": 0.00017953797889947915, + "loss": 0.85, + "step": 4672 + }, + { + "epoch": 0.8320868945868946, + "grad_norm": 0.5618211627006531, + "learning_rate": 0.0001795294940933016, + "loss": 1.145, + "step": 4673 + }, + { + "epoch": 0.8322649572649573, + "grad_norm": 0.6275593042373657, + "learning_rate": 0.00017952100772890877, + "loss": 0.9061, + "step": 4674 + }, + { + "epoch": 
0.83244301994302, + "grad_norm": 0.5376096367835999, + "learning_rate": 0.00017951251980646702, + "loss": 1.1948, + "step": 4675 + }, + { + "epoch": 0.8326210826210826, + "grad_norm": 0.5162268877029419, + "learning_rate": 0.0001795040303261426, + "loss": 1.2158, + "step": 4676 + }, + { + "epoch": 0.8327991452991453, + "grad_norm": 0.5730512142181396, + "learning_rate": 0.0001794955392881019, + "loss": 0.9962, + "step": 4677 + }, + { + "epoch": 0.832977207977208, + "grad_norm": 0.5128712058067322, + "learning_rate": 0.00017948704669251122, + "loss": 1.2797, + "step": 4678 + }, + { + "epoch": 0.8331552706552706, + "grad_norm": 0.5173979997634888, + "learning_rate": 0.00017947855253953697, + "loss": 1.1093, + "step": 4679 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.504646897315979, + "learning_rate": 0.0001794700568293456, + "loss": 1.3171, + "step": 4680 + }, + { + "epoch": 0.833511396011396, + "grad_norm": 0.5638105869293213, + "learning_rate": 0.00017946155956210356, + "loss": 0.9224, + "step": 4681 + }, + { + "epoch": 0.8336894586894587, + "grad_norm": 0.5289680361747742, + "learning_rate": 0.00017945306073797733, + "loss": 0.8919, + "step": 4682 + }, + { + "epoch": 0.8338675213675214, + "grad_norm": 0.5224629044532776, + "learning_rate": 0.0001794445603571334, + "loss": 1.0345, + "step": 4683 + }, + { + "epoch": 0.834045584045584, + "grad_norm": 0.5342282056808472, + "learning_rate": 0.00017943605841973836, + "loss": 1.2305, + "step": 4684 + }, + { + "epoch": 0.8342236467236467, + "grad_norm": 0.6118032336235046, + "learning_rate": 0.00017942755492595874, + "loss": 1.0316, + "step": 4685 + }, + { + "epoch": 0.8344017094017094, + "grad_norm": 0.49112311005592346, + "learning_rate": 0.00017941904987596121, + "loss": 0.9809, + "step": 4686 + }, + { + "epoch": 0.8345797720797721, + "grad_norm": 0.5044063925743103, + "learning_rate": 0.0001794105432699124, + "loss": 0.834, + "step": 4687 + }, + { + "epoch": 0.8347578347578347, + "grad_norm": 
0.4849987328052521, + "learning_rate": 0.00017940203510797892, + "loss": 0.9971, + "step": 4688 + }, + { + "epoch": 0.8349358974358975, + "grad_norm": 0.5539469122886658, + "learning_rate": 0.00017939352539032748, + "loss": 1.1599, + "step": 4689 + }, + { + "epoch": 0.8351139601139601, + "grad_norm": 0.5474258065223694, + "learning_rate": 0.00017938501411712485, + "loss": 1.25, + "step": 4690 + }, + { + "epoch": 0.8352920227920227, + "grad_norm": 0.4880213737487793, + "learning_rate": 0.0001793765012885378, + "loss": 1.1471, + "step": 4691 + }, + { + "epoch": 0.8354700854700855, + "grad_norm": 0.5602759718894958, + "learning_rate": 0.00017936798690473309, + "loss": 1.0723, + "step": 4692 + }, + { + "epoch": 0.8356481481481481, + "grad_norm": 0.627775251865387, + "learning_rate": 0.00017935947096587755, + "loss": 1.3768, + "step": 4693 + }, + { + "epoch": 0.8358262108262108, + "grad_norm": 0.5324847102165222, + "learning_rate": 0.00017935095347213804, + "loss": 0.9945, + "step": 4694 + }, + { + "epoch": 0.8360042735042735, + "grad_norm": 0.5244048237800598, + "learning_rate": 0.0001793424344236814, + "loss": 1.1725, + "step": 4695 + }, + { + "epoch": 0.8361823361823362, + "grad_norm": 0.5420708656311035, + "learning_rate": 0.00017933391382067462, + "loss": 1.1267, + "step": 4696 + }, + { + "epoch": 0.8363603988603988, + "grad_norm": 0.5285456776618958, + "learning_rate": 0.00017932539166328458, + "loss": 1.0368, + "step": 4697 + }, + { + "epoch": 0.8365384615384616, + "grad_norm": 0.5330373048782349, + "learning_rate": 0.00017931686795167825, + "loss": 1.1082, + "step": 4698 + }, + { + "epoch": 0.8367165242165242, + "grad_norm": 0.5516682267189026, + "learning_rate": 0.0001793083426860227, + "loss": 1.1833, + "step": 4699 + }, + { + "epoch": 0.8368945868945868, + "grad_norm": 0.5229935646057129, + "learning_rate": 0.0001792998158664849, + "loss": 0.8527, + "step": 4700 + }, + { + "epoch": 0.8370726495726496, + "grad_norm": 0.4821490943431854, + "learning_rate": 
0.00017929128749323195, + "loss": 1.1201, + "step": 4701 + }, + { + "epoch": 0.8372507122507122, + "grad_norm": 0.6276404857635498, + "learning_rate": 0.0001792827575664309, + "loss": 1.0986, + "step": 4702 + }, + { + "epoch": 0.8374287749287749, + "grad_norm": 0.5681334733963013, + "learning_rate": 0.00017927422608624897, + "loss": 1.3821, + "step": 4703 + }, + { + "epoch": 0.8376068376068376, + "grad_norm": 0.5257087349891663, + "learning_rate": 0.00017926569305285324, + "loss": 1.1033, + "step": 4704 + }, + { + "epoch": 0.8377849002849003, + "grad_norm": 0.5665168166160583, + "learning_rate": 0.0001792571584664109, + "loss": 1.104, + "step": 4705 + }, + { + "epoch": 0.8379629629629629, + "grad_norm": 0.5202076435089111, + "learning_rate": 0.00017924862232708918, + "loss": 1.052, + "step": 4706 + }, + { + "epoch": 0.8381410256410257, + "grad_norm": 0.5103010535240173, + "learning_rate": 0.00017924008463505534, + "loss": 1.1348, + "step": 4707 + }, + { + "epoch": 0.8383190883190883, + "grad_norm": 0.6811865568161011, + "learning_rate": 0.00017923154539047667, + "loss": 1.2804, + "step": 4708 + }, + { + "epoch": 0.8384971509971509, + "grad_norm": 0.46808311343193054, + "learning_rate": 0.00017922300459352042, + "loss": 0.9302, + "step": 4709 + }, + { + "epoch": 0.8386752136752137, + "grad_norm": 0.47713059186935425, + "learning_rate": 0.00017921446224435398, + "loss": 0.78, + "step": 4710 + }, + { + "epoch": 0.8388532763532763, + "grad_norm": 0.7579890489578247, + "learning_rate": 0.0001792059183431447, + "loss": 1.4776, + "step": 4711 + }, + { + "epoch": 0.8390313390313391, + "grad_norm": 0.6009423136711121, + "learning_rate": 0.00017919737289006, + "loss": 1.2679, + "step": 4712 + }, + { + "epoch": 0.8392094017094017, + "grad_norm": 0.56390780210495, + "learning_rate": 0.00017918882588526729, + "loss": 1.0402, + "step": 4713 + }, + { + "epoch": 0.8393874643874644, + "grad_norm": 0.5698862075805664, + "learning_rate": 0.00017918027732893404, + "loss": 1.2336, + 
"step": 4714 + }, + { + "epoch": 0.8395655270655271, + "grad_norm": 0.5016305446624756, + "learning_rate": 0.0001791717272212277, + "loss": 1.0373, + "step": 4715 + }, + { + "epoch": 0.8397435897435898, + "grad_norm": 0.5886971950531006, + "learning_rate": 0.0001791631755623159, + "loss": 1.1062, + "step": 4716 + }, + { + "epoch": 0.8399216524216524, + "grad_norm": 0.647833526134491, + "learning_rate": 0.00017915462235236607, + "loss": 1.0464, + "step": 4717 + }, + { + "epoch": 0.8400997150997151, + "grad_norm": 0.4961194396018982, + "learning_rate": 0.00017914606759154587, + "loss": 1.0763, + "step": 4718 + }, + { + "epoch": 0.8402777777777778, + "grad_norm": 0.47041359543800354, + "learning_rate": 0.00017913751128002288, + "loss": 1.0685, + "step": 4719 + }, + { + "epoch": 0.8404558404558404, + "grad_norm": 0.5752858519554138, + "learning_rate": 0.00017912895341796475, + "loss": 1.0577, + "step": 4720 + }, + { + "epoch": 0.8406339031339032, + "grad_norm": 0.5233224034309387, + "learning_rate": 0.00017912039400553914, + "loss": 1.1484, + "step": 4721 + }, + { + "epoch": 0.8408119658119658, + "grad_norm": 0.5327485203742981, + "learning_rate": 0.00017911183304291378, + "loss": 1.0028, + "step": 4722 + }, + { + "epoch": 0.8409900284900285, + "grad_norm": 0.5320752263069153, + "learning_rate": 0.00017910327053025638, + "loss": 1.1247, + "step": 4723 + }, + { + "epoch": 0.8411680911680912, + "grad_norm": 0.529617965221405, + "learning_rate": 0.00017909470646773477, + "loss": 1.1698, + "step": 4724 + }, + { + "epoch": 0.8413461538461539, + "grad_norm": 0.5055609345436096, + "learning_rate": 0.00017908614085551664, + "loss": 1.0925, + "step": 4725 + }, + { + "epoch": 0.8415242165242165, + "grad_norm": 0.5356255769729614, + "learning_rate": 0.00017907757369376985, + "loss": 1.0354, + "step": 4726 + }, + { + "epoch": 0.8417022792022792, + "grad_norm": 0.582834780216217, + "learning_rate": 0.00017906900498266233, + "loss": 1.1248, + "step": 4727 + }, + { + "epoch": 
0.8418803418803419, + "grad_norm": 0.5750834941864014, + "learning_rate": 0.00017906043472236188, + "loss": 1.0119, + "step": 4728 + }, + { + "epoch": 0.8420584045584045, + "grad_norm": 0.5923320055007935, + "learning_rate": 0.00017905186291303644, + "loss": 1.0662, + "step": 4729 + }, + { + "epoch": 0.8422364672364673, + "grad_norm": 0.4767811894416809, + "learning_rate": 0.00017904328955485396, + "loss": 1.0911, + "step": 4730 + }, + { + "epoch": 0.8424145299145299, + "grad_norm": 0.5294556021690369, + "learning_rate": 0.00017903471464798245, + "loss": 1.2861, + "step": 4731 + }, + { + "epoch": 0.8425925925925926, + "grad_norm": 0.599117636680603, + "learning_rate": 0.00017902613819258985, + "loss": 1.1707, + "step": 4732 + }, + { + "epoch": 0.8427706552706553, + "grad_norm": 0.5912977457046509, + "learning_rate": 0.00017901756018884424, + "loss": 1.1884, + "step": 4733 + }, + { + "epoch": 0.842948717948718, + "grad_norm": 0.587676465511322, + "learning_rate": 0.0001790089806369137, + "loss": 1.1054, + "step": 4734 + }, + { + "epoch": 0.8431267806267806, + "grad_norm": 0.6271800398826599, + "learning_rate": 0.0001790003995369663, + "loss": 1.2094, + "step": 4735 + }, + { + "epoch": 0.8433048433048433, + "grad_norm": 0.47198590636253357, + "learning_rate": 0.00017899181688917017, + "loss": 0.9561, + "step": 4736 + }, + { + "epoch": 0.843482905982906, + "grad_norm": 0.690732479095459, + "learning_rate": 0.00017898323269369351, + "loss": 1.1629, + "step": 4737 + }, + { + "epoch": 0.8436609686609686, + "grad_norm": 0.4926888048648834, + "learning_rate": 0.00017897464695070445, + "loss": 1.1097, + "step": 4738 + }, + { + "epoch": 0.8438390313390314, + "grad_norm": 0.7071278691291809, + "learning_rate": 0.00017896605966037128, + "loss": 1.195, + "step": 4739 + }, + { + "epoch": 0.844017094017094, + "grad_norm": 0.5650486350059509, + "learning_rate": 0.00017895747082286216, + "loss": 1.0107, + "step": 4740 + }, + { + "epoch": 0.8441951566951567, + "grad_norm": 
0.5291931629180908, + "learning_rate": 0.00017894888043834545, + "loss": 1.0104, + "step": 4741 + }, + { + "epoch": 0.8443732193732194, + "grad_norm": 0.5751241445541382, + "learning_rate": 0.00017894028850698942, + "loss": 1.2482, + "step": 4742 + }, + { + "epoch": 0.844551282051282, + "grad_norm": 0.5833632349967957, + "learning_rate": 0.0001789316950289624, + "loss": 1.0552, + "step": 4743 + }, + { + "epoch": 0.8447293447293447, + "grad_norm": 0.543729841709137, + "learning_rate": 0.00017892310000443282, + "loss": 1.1453, + "step": 4744 + }, + { + "epoch": 0.8449074074074074, + "grad_norm": 0.5674204230308533, + "learning_rate": 0.00017891450343356902, + "loss": 1.0757, + "step": 4745 + }, + { + "epoch": 0.8450854700854701, + "grad_norm": 0.5161892771720886, + "learning_rate": 0.00017890590531653946, + "loss": 1.1163, + "step": 4746 + }, + { + "epoch": 0.8452635327635327, + "grad_norm": 0.49907612800598145, + "learning_rate": 0.00017889730565351258, + "loss": 1.0356, + "step": 4747 + }, + { + "epoch": 0.8454415954415955, + "grad_norm": 0.4994732439517975, + "learning_rate": 0.00017888870444465692, + "loss": 1.026, + "step": 4748 + }, + { + "epoch": 0.8456196581196581, + "grad_norm": 0.6397520303726196, + "learning_rate": 0.00017888010169014095, + "loss": 0.957, + "step": 4749 + }, + { + "epoch": 0.8457977207977208, + "grad_norm": 0.5379729270935059, + "learning_rate": 0.00017887149739013327, + "loss": 1.1664, + "step": 4750 + }, + { + "epoch": 0.8459757834757835, + "grad_norm": 0.4487382769584656, + "learning_rate": 0.00017886289154480246, + "loss": 0.9377, + "step": 4751 + }, + { + "epoch": 0.8461538461538461, + "grad_norm": 0.5645943880081177, + "learning_rate": 0.00017885428415431707, + "loss": 1.273, + "step": 4752 + }, + { + "epoch": 0.8463319088319088, + "grad_norm": 0.5535289645195007, + "learning_rate": 0.00017884567521884577, + "loss": 1.1779, + "step": 4753 + }, + { + "epoch": 0.8465099715099715, + "grad_norm": 0.5039721131324768, + "learning_rate": 
0.0001788370647385573, + "loss": 1.0237, + "step": 4754 + }, + { + "epoch": 0.8466880341880342, + "grad_norm": 0.4543854892253876, + "learning_rate": 0.00017882845271362032, + "loss": 0.8149, + "step": 4755 + }, + { + "epoch": 0.8468660968660968, + "grad_norm": 0.5095639824867249, + "learning_rate": 0.00017881983914420352, + "loss": 1.0141, + "step": 4756 + }, + { + "epoch": 0.8470441595441596, + "grad_norm": 0.5341798663139343, + "learning_rate": 0.00017881122403047575, + "loss": 1.1885, + "step": 4757 + }, + { + "epoch": 0.8472222222222222, + "grad_norm": 0.5595062971115112, + "learning_rate": 0.00017880260737260573, + "loss": 0.8939, + "step": 4758 + }, + { + "epoch": 0.8474002849002849, + "grad_norm": 0.5355880260467529, + "learning_rate": 0.00017879398917076232, + "loss": 1.2434, + "step": 4759 + }, + { + "epoch": 0.8475783475783476, + "grad_norm": 0.49477261304855347, + "learning_rate": 0.0001787853694251144, + "loss": 0.979, + "step": 4760 + }, + { + "epoch": 0.8477564102564102, + "grad_norm": 0.5154359340667725, + "learning_rate": 0.00017877674813583078, + "loss": 1.0957, + "step": 4761 + }, + { + "epoch": 0.8479344729344729, + "grad_norm": 0.5651070475578308, + "learning_rate": 0.00017876812530308046, + "loss": 1.1884, + "step": 4762 + }, + { + "epoch": 0.8481125356125356, + "grad_norm": 0.537277340888977, + "learning_rate": 0.00017875950092703232, + "loss": 1.0272, + "step": 4763 + }, + { + "epoch": 0.8482905982905983, + "grad_norm": 0.5259691476821899, + "learning_rate": 0.00017875087500785538, + "loss": 1.1493, + "step": 4764 + }, + { + "epoch": 0.8484686609686609, + "grad_norm": 0.5491300225257874, + "learning_rate": 0.00017874224754571867, + "loss": 0.8316, + "step": 4765 + }, + { + "epoch": 0.8486467236467237, + "grad_norm": 0.5493744611740112, + "learning_rate": 0.00017873361854079116, + "loss": 1.2328, + "step": 4766 + }, + { + "epoch": 0.8488247863247863, + "grad_norm": 0.571002185344696, + "learning_rate": 0.00017872498799324197, + "loss": 
1.1384, + "step": 4767 + }, + { + "epoch": 0.8490028490028491, + "grad_norm": 0.538152813911438, + "learning_rate": 0.00017871635590324013, + "loss": 1.0581, + "step": 4768 + }, + { + "epoch": 0.8491809116809117, + "grad_norm": 0.5214923620223999, + "learning_rate": 0.00017870772227095486, + "loss": 1.0612, + "step": 4769 + }, + { + "epoch": 0.8493589743589743, + "grad_norm": 0.5714883804321289, + "learning_rate": 0.0001786990870965553, + "loss": 0.9076, + "step": 4770 + }, + { + "epoch": 0.8495370370370371, + "grad_norm": 0.4181775450706482, + "learning_rate": 0.00017869045038021054, + "loss": 0.8366, + "step": 4771 + }, + { + "epoch": 0.8497150997150997, + "grad_norm": 0.6266027688980103, + "learning_rate": 0.00017868181212208993, + "loss": 1.2047, + "step": 4772 + }, + { + "epoch": 0.8498931623931624, + "grad_norm": 0.5423732399940491, + "learning_rate": 0.0001786731723223626, + "loss": 1.3878, + "step": 4773 + }, + { + "epoch": 0.8500712250712251, + "grad_norm": 0.5512300133705139, + "learning_rate": 0.00017866453098119793, + "loss": 1.1132, + "step": 4774 + }, + { + "epoch": 0.8502492877492878, + "grad_norm": 0.5767185688018799, + "learning_rate": 0.00017865588809876519, + "loss": 0.97, + "step": 4775 + }, + { + "epoch": 0.8504273504273504, + "grad_norm": 0.5305790305137634, + "learning_rate": 0.00017864724367523368, + "loss": 1.1158, + "step": 4776 + }, + { + "epoch": 0.8506054131054132, + "grad_norm": 0.49702391028404236, + "learning_rate": 0.00017863859771077284, + "loss": 0.9669, + "step": 4777 + }, + { + "epoch": 0.8507834757834758, + "grad_norm": 0.5490063428878784, + "learning_rate": 0.00017862995020555205, + "loss": 1.0646, + "step": 4778 + }, + { + "epoch": 0.8509615384615384, + "grad_norm": 0.5308689475059509, + "learning_rate": 0.00017862130115974068, + "loss": 0.8922, + "step": 4779 + }, + { + "epoch": 0.8511396011396012, + "grad_norm": 0.5412983894348145, + "learning_rate": 0.00017861265057350826, + "loss": 1.1444, + "step": 4780 + }, + { + 
"epoch": 0.8513176638176638, + "grad_norm": 0.5857377052307129, + "learning_rate": 0.00017860399844702425, + "loss": 1.1643, + "step": 4781 + }, + { + "epoch": 0.8514957264957265, + "grad_norm": 0.599273681640625, + "learning_rate": 0.00017859534478045815, + "loss": 1.169, + "step": 4782 + }, + { + "epoch": 0.8516737891737892, + "grad_norm": 0.5677087903022766, + "learning_rate": 0.00017858668957397957, + "loss": 1.0793, + "step": 4783 + }, + { + "epoch": 0.8518518518518519, + "grad_norm": 0.5648362636566162, + "learning_rate": 0.00017857803282775807, + "loss": 1.1932, + "step": 4784 + }, + { + "epoch": 0.8520299145299145, + "grad_norm": 0.5138826966285706, + "learning_rate": 0.00017856937454196323, + "loss": 1.0011, + "step": 4785 + }, + { + "epoch": 0.8522079772079773, + "grad_norm": 0.5951429009437561, + "learning_rate": 0.0001785607147167647, + "loss": 1.3198, + "step": 4786 + }, + { + "epoch": 0.8523860398860399, + "grad_norm": 0.5341953039169312, + "learning_rate": 0.00017855205335233216, + "loss": 0.9094, + "step": 4787 + }, + { + "epoch": 0.8525641025641025, + "grad_norm": 0.5193579196929932, + "learning_rate": 0.00017854339044883535, + "loss": 0.892, + "step": 4788 + }, + { + "epoch": 0.8527421652421653, + "grad_norm": 0.5053097009658813, + "learning_rate": 0.00017853472600644392, + "loss": 1.0589, + "step": 4789 + }, + { + "epoch": 0.8529202279202279, + "grad_norm": 0.5819617509841919, + "learning_rate": 0.0001785260600253277, + "loss": 1.2646, + "step": 4790 + }, + { + "epoch": 0.8530982905982906, + "grad_norm": 0.5327470302581787, + "learning_rate": 0.00017851739250565645, + "loss": 1.056, + "step": 4791 + }, + { + "epoch": 0.8532763532763533, + "grad_norm": 0.5131269097328186, + "learning_rate": 0.0001785087234476, + "loss": 1.1192, + "step": 4792 + }, + { + "epoch": 0.853454415954416, + "grad_norm": 0.4698086977005005, + "learning_rate": 0.00017850005285132821, + "loss": 0.9849, + "step": 4793 + }, + { + "epoch": 0.8536324786324786, + "grad_norm": 
0.5503947734832764, + "learning_rate": 0.00017849138071701092, + "loss": 1.1139, + "step": 4794 + }, + { + "epoch": 0.8538105413105413, + "grad_norm": 0.5120903849601746, + "learning_rate": 0.0001784827070448181, + "loss": 0.9801, + "step": 4795 + }, + { + "epoch": 0.853988603988604, + "grad_norm": 0.47650405764579773, + "learning_rate": 0.00017847403183491968, + "loss": 1.0268, + "step": 4796 + }, + { + "epoch": 0.8541666666666666, + "grad_norm": 0.5773387551307678, + "learning_rate": 0.0001784653550874856, + "loss": 1.0336, + "step": 4797 + }, + { + "epoch": 0.8543447293447294, + "grad_norm": 0.545531153678894, + "learning_rate": 0.00017845667680268593, + "loss": 1.0532, + "step": 4798 + }, + { + "epoch": 0.854522792022792, + "grad_norm": 0.533161461353302, + "learning_rate": 0.0001784479969806906, + "loss": 1.1964, + "step": 4799 + }, + { + "epoch": 0.8547008547008547, + "grad_norm": 0.5880789160728455, + "learning_rate": 0.00017843931562166977, + "loss": 1.1588, + "step": 4800 + }, + { + "epoch": 0.8548789173789174, + "grad_norm": 0.5381524562835693, + "learning_rate": 0.00017843063272579346, + "loss": 1.1533, + "step": 4801 + }, + { + "epoch": 0.85505698005698, + "grad_norm": 0.6280176639556885, + "learning_rate": 0.00017842194829323187, + "loss": 1.0084, + "step": 4802 + }, + { + "epoch": 0.8552350427350427, + "grad_norm": 0.5098552703857422, + "learning_rate": 0.0001784132623241551, + "loss": 1.0804, + "step": 4803 + }, + { + "epoch": 0.8554131054131054, + "grad_norm": 0.5406526923179626, + "learning_rate": 0.00017840457481873328, + "loss": 1.2571, + "step": 4804 + }, + { + "epoch": 0.8555911680911681, + "grad_norm": 0.5859003663063049, + "learning_rate": 0.00017839588577713678, + "loss": 1.2462, + "step": 4805 + }, + { + "epoch": 0.8557692307692307, + "grad_norm": 0.6209002137184143, + "learning_rate": 0.00017838719519953572, + "loss": 1.307, + "step": 4806 + }, + { + "epoch": 0.8559472934472935, + "grad_norm": 0.525753915309906, + "learning_rate": 
0.00017837850308610037, + "loss": 1.2957, + "step": 4807 + }, + { + "epoch": 0.8561253561253561, + "grad_norm": 0.5096195340156555, + "learning_rate": 0.0001783698094370011, + "loss": 1.1433, + "step": 4808 + }, + { + "epoch": 0.8563034188034188, + "grad_norm": 0.5873076915740967, + "learning_rate": 0.0001783611142524082, + "loss": 1.2271, + "step": 4809 + }, + { + "epoch": 0.8564814814814815, + "grad_norm": 0.5093944668769836, + "learning_rate": 0.0001783524175324921, + "loss": 0.8788, + "step": 4810 + }, + { + "epoch": 0.8566595441595442, + "grad_norm": 0.5485084652900696, + "learning_rate": 0.00017834371927742307, + "loss": 1.256, + "step": 4811 + }, + { + "epoch": 0.8568376068376068, + "grad_norm": 0.5808873772621155, + "learning_rate": 0.00017833501948737163, + "loss": 0.9287, + "step": 4812 + }, + { + "epoch": 0.8570156695156695, + "grad_norm": 0.5113978385925293, + "learning_rate": 0.00017832631816250822, + "loss": 1.0372, + "step": 4813 + }, + { + "epoch": 0.8571937321937322, + "grad_norm": 0.5877016186714172, + "learning_rate": 0.0001783176153030033, + "loss": 1.3023, + "step": 4814 + }, + { + "epoch": 0.8573717948717948, + "grad_norm": 0.534328043460846, + "learning_rate": 0.00017830891090902742, + "loss": 1.1023, + "step": 4815 + }, + { + "epoch": 0.8575498575498576, + "grad_norm": 0.5781638026237488, + "learning_rate": 0.0001783002049807511, + "loss": 0.9562, + "step": 4816 + }, + { + "epoch": 0.8577279202279202, + "grad_norm": 0.5760263204574585, + "learning_rate": 0.00017829149751834487, + "loss": 0.8733, + "step": 4817 + }, + { + "epoch": 0.8579059829059829, + "grad_norm": 0.3887255787849426, + "learning_rate": 0.00017828278852197944, + "loss": 0.5949, + "step": 4818 + }, + { + "epoch": 0.8580840455840456, + "grad_norm": 0.47814446687698364, + "learning_rate": 0.00017827407799182537, + "loss": 1.0698, + "step": 4819 + }, + { + "epoch": 0.8582621082621082, + "grad_norm": 0.5520272254943848, + "learning_rate": 0.00017826536592805334, + "loss": 1.1314, 
+ "step": 4820 + }, + { + "epoch": 0.8584401709401709, + "grad_norm": 0.5285319685935974, + "learning_rate": 0.00017825665233083405, + "loss": 1.1618, + "step": 4821 + }, + { + "epoch": 0.8586182336182336, + "grad_norm": 0.6080102324485779, + "learning_rate": 0.0001782479372003382, + "loss": 1.3817, + "step": 4822 + }, + { + "epoch": 0.8587962962962963, + "grad_norm": 0.7474410533905029, + "learning_rate": 0.00017823922053673662, + "loss": 1.1321, + "step": 4823 + }, + { + "epoch": 0.8589743589743589, + "grad_norm": 0.559283435344696, + "learning_rate": 0.0001782305023402, + "loss": 1.1894, + "step": 4824 + }, + { + "epoch": 0.8591524216524217, + "grad_norm": 0.5620571374893188, + "learning_rate": 0.00017822178261089918, + "loss": 1.134, + "step": 4825 + }, + { + "epoch": 0.8593304843304843, + "grad_norm": 0.5553044676780701, + "learning_rate": 0.00017821306134900504, + "loss": 1.3222, + "step": 4826 + }, + { + "epoch": 0.8595085470085471, + "grad_norm": 0.6177778244018555, + "learning_rate": 0.00017820433855468846, + "loss": 1.2545, + "step": 4827 + }, + { + "epoch": 0.8596866096866097, + "grad_norm": 0.656233012676239, + "learning_rate": 0.0001781956142281203, + "loss": 1.1346, + "step": 4828 + }, + { + "epoch": 0.8598646723646723, + "grad_norm": 0.6710973381996155, + "learning_rate": 0.0001781868883694715, + "loss": 1.1361, + "step": 4829 + }, + { + "epoch": 0.8600427350427351, + "grad_norm": 0.5093601942062378, + "learning_rate": 0.0001781781609789131, + "loss": 1.0509, + "step": 4830 + }, + { + "epoch": 0.8602207977207977, + "grad_norm": 0.5707578063011169, + "learning_rate": 0.00017816943205661598, + "loss": 1.0964, + "step": 4831 + }, + { + "epoch": 0.8603988603988604, + "grad_norm": 0.6159597635269165, + "learning_rate": 0.00017816070160275125, + "loss": 1.0322, + "step": 4832 + }, + { + "epoch": 0.8605769230769231, + "grad_norm": 0.5430580377578735, + "learning_rate": 0.0001781519696174899, + "loss": 1.2464, + "step": 4833 + }, + { + "epoch": 
0.8607549857549858, + "grad_norm": 0.48104700446128845, + "learning_rate": 0.0001781432361010031, + "loss": 1.1031, + "step": 4834 + }, + { + "epoch": 0.8609330484330484, + "grad_norm": 0.5304946303367615, + "learning_rate": 0.0001781345010534619, + "loss": 1.0281, + "step": 4835 + }, + { + "epoch": 0.8611111111111112, + "grad_norm": 0.5230711698532104, + "learning_rate": 0.00017812576447503742, + "loss": 0.9499, + "step": 4836 + }, + { + "epoch": 0.8612891737891738, + "grad_norm": 0.5363606214523315, + "learning_rate": 0.00017811702636590093, + "loss": 1.1358, + "step": 4837 + }, + { + "epoch": 0.8614672364672364, + "grad_norm": 0.5880044102668762, + "learning_rate": 0.00017810828672622358, + "loss": 1.1765, + "step": 4838 + }, + { + "epoch": 0.8616452991452992, + "grad_norm": 0.5194395184516907, + "learning_rate": 0.0001780995455561766, + "loss": 1.1622, + "step": 4839 + }, + { + "epoch": 0.8618233618233618, + "grad_norm": 0.5114264488220215, + "learning_rate": 0.00017809080285593126, + "loss": 1.0081, + "step": 4840 + }, + { + "epoch": 0.8620014245014245, + "grad_norm": 0.6174240112304688, + "learning_rate": 0.00017808205862565886, + "loss": 1.0745, + "step": 4841 + }, + { + "epoch": 0.8621794871794872, + "grad_norm": 0.5662630200386047, + "learning_rate": 0.0001780733128655307, + "loss": 1.3369, + "step": 4842 + }, + { + "epoch": 0.8623575498575499, + "grad_norm": 0.5917882919311523, + "learning_rate": 0.00017806456557571817, + "loss": 1.1631, + "step": 4843 + }, + { + "epoch": 0.8625356125356125, + "grad_norm": 0.5305736660957336, + "learning_rate": 0.00017805581675639265, + "loss": 0.9875, + "step": 4844 + }, + { + "epoch": 0.8627136752136753, + "grad_norm": 0.5181219577789307, + "learning_rate": 0.00017804706640772556, + "loss": 0.9918, + "step": 4845 + }, + { + "epoch": 0.8628917378917379, + "grad_norm": 0.5467997789382935, + "learning_rate": 0.00017803831452988832, + "loss": 1.1395, + "step": 4846 + }, + { + "epoch": 0.8630698005698005, + "grad_norm": 
0.5494031310081482, + "learning_rate": 0.00017802956112305241, + "loss": 1.0312, + "step": 4847 + }, + { + "epoch": 0.8632478632478633, + "grad_norm": 0.5804065465927124, + "learning_rate": 0.00017802080618738931, + "loss": 1.1555, + "step": 4848 + }, + { + "epoch": 0.8634259259259259, + "grad_norm": 0.5424801111221313, + "learning_rate": 0.00017801204972307067, + "loss": 1.0215, + "step": 4849 + }, + { + "epoch": 0.8636039886039886, + "grad_norm": 0.5321891903877258, + "learning_rate": 0.0001780032917302679, + "loss": 1.0187, + "step": 4850 + }, + { + "epoch": 0.8637820512820513, + "grad_norm": 0.5543400049209595, + "learning_rate": 0.0001779945322091527, + "loss": 1.1972, + "step": 4851 + }, + { + "epoch": 0.863960113960114, + "grad_norm": 0.566649317741394, + "learning_rate": 0.00017798577115989668, + "loss": 1.0758, + "step": 4852 + }, + { + "epoch": 0.8641381766381766, + "grad_norm": 0.5538444519042969, + "learning_rate": 0.00017797700858267145, + "loss": 1.1338, + "step": 4853 + }, + { + "epoch": 0.8643162393162394, + "grad_norm": 0.5641313791275024, + "learning_rate": 0.0001779682444776487, + "loss": 1.256, + "step": 4854 + }, + { + "epoch": 0.864494301994302, + "grad_norm": 0.6377350091934204, + "learning_rate": 0.00017795947884500016, + "loss": 1.144, + "step": 4855 + }, + { + "epoch": 0.8646723646723646, + "grad_norm": 0.5581876039505005, + "learning_rate": 0.0001779507116848976, + "loss": 1.3163, + "step": 4856 + }, + { + "epoch": 0.8648504273504274, + "grad_norm": 0.5416772365570068, + "learning_rate": 0.0001779419429975128, + "loss": 1.0219, + "step": 4857 + }, + { + "epoch": 0.86502849002849, + "grad_norm": 0.5450608730316162, + "learning_rate": 0.0001779331727830175, + "loss": 1.0093, + "step": 4858 + }, + { + "epoch": 0.8652065527065527, + "grad_norm": 0.5151242017745972, + "learning_rate": 0.00017792440104158358, + "loss": 1.067, + "step": 4859 + }, + { + "epoch": 0.8653846153846154, + "grad_norm": 0.5225046873092651, + "learning_rate": 
0.0001779156277733829, + "loss": 1.0432, + "step": 4860 + }, + { + "epoch": 0.8655626780626781, + "grad_norm": 0.5168602466583252, + "learning_rate": 0.00017790685297858737, + "loss": 0.9665, + "step": 4861 + }, + { + "epoch": 0.8657407407407407, + "grad_norm": 0.5749059319496155, + "learning_rate": 0.00017789807665736889, + "loss": 1.1607, + "step": 4862 + }, + { + "epoch": 0.8659188034188035, + "grad_norm": 0.45656394958496094, + "learning_rate": 0.00017788929880989938, + "loss": 0.8362, + "step": 4863 + }, + { + "epoch": 0.8660968660968661, + "grad_norm": 0.5090615749359131, + "learning_rate": 0.00017788051943635086, + "loss": 0.9553, + "step": 4864 + }, + { + "epoch": 0.8662749287749287, + "grad_norm": 0.5381240248680115, + "learning_rate": 0.0001778717385368954, + "loss": 1.1391, + "step": 4865 + }, + { + "epoch": 0.8664529914529915, + "grad_norm": 0.522720456123352, + "learning_rate": 0.00017786295611170493, + "loss": 1.1869, + "step": 4866 + }, + { + "epoch": 0.8666310541310541, + "grad_norm": 0.530986487865448, + "learning_rate": 0.0001778541721609516, + "loss": 1.1046, + "step": 4867 + }, + { + "epoch": 0.8668091168091168, + "grad_norm": 0.5065864324569702, + "learning_rate": 0.0001778453866848075, + "loss": 1.008, + "step": 4868 + }, + { + "epoch": 0.8669871794871795, + "grad_norm": 0.5541394352912903, + "learning_rate": 0.00017783659968344476, + "loss": 1.0004, + "step": 4869 + }, + { + "epoch": 0.8671652421652422, + "grad_norm": 0.5059576630592346, + "learning_rate": 0.00017782781115703556, + "loss": 1.128, + "step": 4870 + }, + { + "epoch": 0.8673433048433048, + "grad_norm": 0.5052187442779541, + "learning_rate": 0.00017781902110575203, + "loss": 0.8544, + "step": 4871 + }, + { + "epoch": 0.8675213675213675, + "grad_norm": 0.5383397340774536, + "learning_rate": 0.00017781022952976646, + "loss": 1.1411, + "step": 4872 + }, + { + "epoch": 0.8676994301994302, + "grad_norm": 0.4760429859161377, + "learning_rate": 0.00017780143642925106, + "loss": 0.8246, + 
"step": 4873 + }, + { + "epoch": 0.8678774928774928, + "grad_norm": 0.5480535626411438, + "learning_rate": 0.00017779264180437817, + "loss": 1.013, + "step": 4874 + }, + { + "epoch": 0.8680555555555556, + "grad_norm": 0.5303317904472351, + "learning_rate": 0.00017778384565532004, + "loss": 1.0201, + "step": 4875 + }, + { + "epoch": 0.8682336182336182, + "grad_norm": 0.5365355014801025, + "learning_rate": 0.00017777504798224903, + "loss": 1.1107, + "step": 4876 + }, + { + "epoch": 0.8684116809116809, + "grad_norm": 0.5173360705375671, + "learning_rate": 0.00017776624878533754, + "loss": 1.0808, + "step": 4877 + }, + { + "epoch": 0.8685897435897436, + "grad_norm": 0.5088842511177063, + "learning_rate": 0.00017775744806475792, + "loss": 0.995, + "step": 4878 + }, + { + "epoch": 0.8687678062678063, + "grad_norm": 0.5796698927879333, + "learning_rate": 0.00017774864582068264, + "loss": 1.1485, + "step": 4879 + }, + { + "epoch": 0.8689458689458689, + "grad_norm": 0.5719375610351562, + "learning_rate": 0.00017773984205328417, + "loss": 1.0133, + "step": 4880 + }, + { + "epoch": 0.8691239316239316, + "grad_norm": 0.6396418213844299, + "learning_rate": 0.00017773103676273498, + "loss": 1.0932, + "step": 4881 + }, + { + "epoch": 0.8693019943019943, + "grad_norm": 0.5602468252182007, + "learning_rate": 0.00017772222994920763, + "loss": 0.9702, + "step": 4882 + }, + { + "epoch": 0.8694800569800569, + "grad_norm": 0.5167748332023621, + "learning_rate": 0.00017771342161287457, + "loss": 1.0528, + "step": 4883 + }, + { + "epoch": 0.8696581196581197, + "grad_norm": 0.5572916865348816, + "learning_rate": 0.00017770461175390848, + "loss": 1.1341, + "step": 4884 + }, + { + "epoch": 0.8698361823361823, + "grad_norm": 0.6666276454925537, + "learning_rate": 0.00017769580037248195, + "loss": 1.1948, + "step": 4885 + }, + { + "epoch": 0.8700142450142451, + "grad_norm": 0.5348601937294006, + "learning_rate": 0.0001776869874687676, + "loss": 1.0562, + "step": 4886 + }, + { + "epoch": 
0.8701923076923077, + "grad_norm": 0.5449648499488831, + "learning_rate": 0.00017767817304293812, + "loss": 0.988, + "step": 4887 + }, + { + "epoch": 0.8703703703703703, + "grad_norm": 0.5995045304298401, + "learning_rate": 0.0001776693570951662, + "loss": 1.2526, + "step": 4888 + }, + { + "epoch": 0.8705484330484331, + "grad_norm": 0.6575320959091187, + "learning_rate": 0.00017766053962562457, + "loss": 1.1717, + "step": 4889 + }, + { + "epoch": 0.8707264957264957, + "grad_norm": 0.5882139801979065, + "learning_rate": 0.00017765172063448597, + "loss": 1.238, + "step": 4890 + }, + { + "epoch": 0.8709045584045584, + "grad_norm": 0.5908389091491699, + "learning_rate": 0.00017764290012192325, + "loss": 1.0606, + "step": 4891 + }, + { + "epoch": 0.8710826210826211, + "grad_norm": 0.6169339418411255, + "learning_rate": 0.00017763407808810917, + "loss": 1.1456, + "step": 4892 + }, + { + "epoch": 0.8712606837606838, + "grad_norm": 0.5916035771369934, + "learning_rate": 0.0001776252545332166, + "loss": 1.0026, + "step": 4893 + }, + { + "epoch": 0.8714387464387464, + "grad_norm": 0.539995551109314, + "learning_rate": 0.00017761642945741843, + "loss": 1.2397, + "step": 4894 + }, + { + "epoch": 0.8716168091168092, + "grad_norm": 0.5346137881278992, + "learning_rate": 0.00017760760286088755, + "loss": 1.1232, + "step": 4895 + }, + { + "epoch": 0.8717948717948718, + "grad_norm": 0.570202112197876, + "learning_rate": 0.00017759877474379692, + "loss": 1.0708, + "step": 4896 + }, + { + "epoch": 0.8719729344729344, + "grad_norm": 0.5023398399353027, + "learning_rate": 0.00017758994510631948, + "loss": 1.1056, + "step": 4897 + }, + { + "epoch": 0.8721509971509972, + "grad_norm": 0.5447137951850891, + "learning_rate": 0.00017758111394862826, + "loss": 0.8776, + "step": 4898 + }, + { + "epoch": 0.8723290598290598, + "grad_norm": 0.5193906426429749, + "learning_rate": 0.00017757228127089625, + "loss": 0.9959, + "step": 4899 + }, + { + "epoch": 0.8725071225071225, + "grad_norm": 
0.5958787798881531, + "learning_rate": 0.00017756344707329656, + "loss": 1.092, + "step": 4900 + }, + { + "epoch": 0.8726851851851852, + "grad_norm": 0.521045982837677, + "learning_rate": 0.00017755461135600221, + "loss": 0.9864, + "step": 4901 + }, + { + "epoch": 0.8728632478632479, + "grad_norm": 0.5257635116577148, + "learning_rate": 0.00017754577411918638, + "loss": 1.216, + "step": 4902 + }, + { + "epoch": 0.8730413105413105, + "grad_norm": 0.5425964593887329, + "learning_rate": 0.0001775369353630222, + "loss": 1.1432, + "step": 4903 + }, + { + "epoch": 0.8732193732193733, + "grad_norm": 0.47995322942733765, + "learning_rate": 0.00017752809508768286, + "loss": 1.0227, + "step": 4904 + }, + { + "epoch": 0.8733974358974359, + "grad_norm": 0.5747429728507996, + "learning_rate": 0.0001775192532933415, + "loss": 0.9984, + "step": 4905 + }, + { + "epoch": 0.8735754985754985, + "grad_norm": 0.5745723247528076, + "learning_rate": 0.00017751040998017142, + "loss": 1.2559, + "step": 4906 + }, + { + "epoch": 0.8737535612535613, + "grad_norm": 0.6114141941070557, + "learning_rate": 0.0001775015651483459, + "loss": 1.3224, + "step": 4907 + }, + { + "epoch": 0.8739316239316239, + "grad_norm": 0.4757187068462372, + "learning_rate": 0.00017749271879803817, + "loss": 1.0352, + "step": 4908 + }, + { + "epoch": 0.8741096866096866, + "grad_norm": 0.48644450306892395, + "learning_rate": 0.0001774838709294216, + "loss": 1.0876, + "step": 4909 + }, + { + "epoch": 0.8742877492877493, + "grad_norm": 0.5652037262916565, + "learning_rate": 0.00017747502154266955, + "loss": 0.9189, + "step": 4910 + }, + { + "epoch": 0.874465811965812, + "grad_norm": 0.5289644002914429, + "learning_rate": 0.00017746617063795538, + "loss": 0.9431, + "step": 4911 + }, + { + "epoch": 0.8746438746438746, + "grad_norm": 0.594656229019165, + "learning_rate": 0.00017745731821545253, + "loss": 1.2408, + "step": 4912 + }, + { + "epoch": 0.8748219373219374, + "grad_norm": 0.5693240165710449, + "learning_rate": 
0.0001774484642753344, + "loss": 1.347, + "step": 4913 + }, + { + "epoch": 0.875, + "grad_norm": 0.5291008949279785, + "learning_rate": 0.00017743960881777456, + "loss": 1.161, + "step": 4914 + }, + { + "epoch": 0.8751780626780626, + "grad_norm": 0.5958300232887268, + "learning_rate": 0.00017743075184294642, + "loss": 1.2058, + "step": 4915 + }, + { + "epoch": 0.8753561253561254, + "grad_norm": 0.513884425163269, + "learning_rate": 0.00017742189335102354, + "loss": 1.0952, + "step": 4916 + }, + { + "epoch": 0.875534188034188, + "grad_norm": 0.5860681533813477, + "learning_rate": 0.00017741303334217948, + "loss": 1.1801, + "step": 4917 + }, + { + "epoch": 0.8757122507122507, + "grad_norm": 0.47962820529937744, + "learning_rate": 0.00017740417181658788, + "loss": 1.0785, + "step": 4918 + }, + { + "epoch": 0.8758903133903134, + "grad_norm": 0.5110440254211426, + "learning_rate": 0.00017739530877442227, + "loss": 1.1385, + "step": 4919 + }, + { + "epoch": 0.8760683760683761, + "grad_norm": 0.5106285214424133, + "learning_rate": 0.00017738644421585643, + "loss": 1.1204, + "step": 4920 + }, + { + "epoch": 0.8762464387464387, + "grad_norm": 0.5709205865859985, + "learning_rate": 0.00017737757814106393, + "loss": 1.0108, + "step": 4921 + }, + { + "epoch": 0.8764245014245015, + "grad_norm": 0.5850250124931335, + "learning_rate": 0.0001773687105502185, + "loss": 1.0059, + "step": 4922 + }, + { + "epoch": 0.8766025641025641, + "grad_norm": 0.5194727778434753, + "learning_rate": 0.00017735984144349396, + "loss": 0.9466, + "step": 4923 + }, + { + "epoch": 0.8767806267806267, + "grad_norm": 0.5246787667274475, + "learning_rate": 0.000177350970821064, + "loss": 1.1336, + "step": 4924 + }, + { + "epoch": 0.8769586894586895, + "grad_norm": 0.5798323154449463, + "learning_rate": 0.00017734209868310244, + "loss": 1.1641, + "step": 4925 + }, + { + "epoch": 0.8771367521367521, + "grad_norm": 0.5188565850257874, + "learning_rate": 0.00017733322502978314, + "loss": 0.9959, + "step": 4926 
+ }, + { + "epoch": 0.8773148148148148, + "grad_norm": 0.5969653725624084, + "learning_rate": 0.00017732434986127995, + "loss": 1.2162, + "step": 4927 + }, + { + "epoch": 0.8774928774928775, + "grad_norm": 0.5520089268684387, + "learning_rate": 0.00017731547317776674, + "loss": 1.0163, + "step": 4928 + }, + { + "epoch": 0.8776709401709402, + "grad_norm": 0.48789507150650024, + "learning_rate": 0.00017730659497941745, + "loss": 0.9757, + "step": 4929 + }, + { + "epoch": 0.8778490028490028, + "grad_norm": 0.6034960746765137, + "learning_rate": 0.000177297715266406, + "loss": 1.1278, + "step": 4930 + }, + { + "epoch": 0.8780270655270656, + "grad_norm": 0.53016597032547, + "learning_rate": 0.00017728883403890638, + "loss": 1.0637, + "step": 4931 + }, + { + "epoch": 0.8782051282051282, + "grad_norm": 0.5073726177215576, + "learning_rate": 0.00017727995129709266, + "loss": 1.1491, + "step": 4932 + }, + { + "epoch": 0.8783831908831908, + "grad_norm": 0.540605366230011, + "learning_rate": 0.00017727106704113878, + "loss": 1.0133, + "step": 4933 + }, + { + "epoch": 0.8785612535612536, + "grad_norm": 0.5346775054931641, + "learning_rate": 0.0001772621812712189, + "loss": 1.1781, + "step": 4934 + }, + { + "epoch": 0.8787393162393162, + "grad_norm": 0.5659036040306091, + "learning_rate": 0.00017725329398750702, + "loss": 1.1023, + "step": 4935 + }, + { + "epoch": 0.8789173789173789, + "grad_norm": 0.591063380241394, + "learning_rate": 0.00017724440519017738, + "loss": 1.0298, + "step": 4936 + }, + { + "epoch": 0.8790954415954416, + "grad_norm": 0.5173781514167786, + "learning_rate": 0.0001772355148794041, + "loss": 1.0483, + "step": 4937 + }, + { + "epoch": 0.8792735042735043, + "grad_norm": 0.5405352711677551, + "learning_rate": 0.0001772266230553613, + "loss": 1.0716, + "step": 4938 + }, + { + "epoch": 0.8794515669515669, + "grad_norm": 0.518442690372467, + "learning_rate": 0.00017721772971822323, + "loss": 1.1373, + "step": 4939 + }, + { + "epoch": 0.8796296296296297, + 
"grad_norm": 0.533673107624054, + "learning_rate": 0.0001772088348681642, + "loss": 1.0489, + "step": 4940 + }, + { + "epoch": 0.8798076923076923, + "grad_norm": 0.46117857098579407, + "learning_rate": 0.0001771999385053584, + "loss": 1.0297, + "step": 4941 + }, + { + "epoch": 0.8799857549857549, + "grad_norm": 0.4687997102737427, + "learning_rate": 0.0001771910406299802, + "loss": 1.071, + "step": 4942 + }, + { + "epoch": 0.8801638176638177, + "grad_norm": 0.5064153075218201, + "learning_rate": 0.0001771821412422039, + "loss": 0.9518, + "step": 4943 + }, + { + "epoch": 0.8803418803418803, + "grad_norm": 0.6561978459358215, + "learning_rate": 0.00017717324034220385, + "loss": 1.11, + "step": 4944 + }, + { + "epoch": 0.8805199430199431, + "grad_norm": 0.5551498532295227, + "learning_rate": 0.00017716433793015454, + "loss": 0.9719, + "step": 4945 + }, + { + "epoch": 0.8806980056980057, + "grad_norm": 0.47059500217437744, + "learning_rate": 0.00017715543400623025, + "loss": 0.8891, + "step": 4946 + }, + { + "epoch": 0.8808760683760684, + "grad_norm": 0.5035740733146667, + "learning_rate": 0.00017714652857060554, + "loss": 0.9671, + "step": 4947 + }, + { + "epoch": 0.8810541310541311, + "grad_norm": 0.4599960446357727, + "learning_rate": 0.00017713762162345487, + "loss": 0.9588, + "step": 4948 + }, + { + "epoch": 0.8812321937321937, + "grad_norm": 0.5087231397628784, + "learning_rate": 0.0001771287131649527, + "loss": 1.1433, + "step": 4949 + }, + { + "epoch": 0.8814102564102564, + "grad_norm": 0.5609854459762573, + "learning_rate": 0.00017711980319527366, + "loss": 1.2022, + "step": 4950 + }, + { + "epoch": 0.8815883190883191, + "grad_norm": 0.49460700154304504, + "learning_rate": 0.00017711089171459227, + "loss": 1.019, + "step": 4951 + }, + { + "epoch": 0.8817663817663818, + "grad_norm": 0.5047259330749512, + "learning_rate": 0.00017710197872308314, + "loss": 0.8301, + "step": 4952 + }, + { + "epoch": 0.8819444444444444, + "grad_norm": 0.5784406065940857, + 
"learning_rate": 0.0001770930642209209, + "loss": 0.9336, + "step": 4953 + }, + { + "epoch": 0.8821225071225072, + "grad_norm": 0.5037121772766113, + "learning_rate": 0.00017708414820828022, + "loss": 1.0199, + "step": 4954 + }, + { + "epoch": 0.8823005698005698, + "grad_norm": 0.5683804750442505, + "learning_rate": 0.00017707523068533575, + "loss": 0.9758, + "step": 4955 + }, + { + "epoch": 0.8824786324786325, + "grad_norm": 0.5167922973632812, + "learning_rate": 0.0001770663116522623, + "loss": 1.0389, + "step": 4956 + }, + { + "epoch": 0.8826566951566952, + "grad_norm": 0.5813606381416321, + "learning_rate": 0.0001770573911092345, + "loss": 1.3998, + "step": 4957 + }, + { + "epoch": 0.8828347578347578, + "grad_norm": 0.5280475616455078, + "learning_rate": 0.00017704846905642723, + "loss": 1.0545, + "step": 4958 + }, + { + "epoch": 0.8830128205128205, + "grad_norm": 0.5421732068061829, + "learning_rate": 0.00017703954549401528, + "loss": 0.899, + "step": 4959 + }, + { + "epoch": 0.8831908831908832, + "grad_norm": 0.5177720189094543, + "learning_rate": 0.00017703062042217344, + "loss": 0.975, + "step": 4960 + }, + { + "epoch": 0.8833689458689459, + "grad_norm": 0.639327883720398, + "learning_rate": 0.00017702169384107666, + "loss": 1.1936, + "step": 4961 + }, + { + "epoch": 0.8835470085470085, + "grad_norm": 0.5201572179794312, + "learning_rate": 0.00017701276575089975, + "loss": 0.9891, + "step": 4962 + }, + { + "epoch": 0.8837250712250713, + "grad_norm": 0.5304145216941833, + "learning_rate": 0.00017700383615181767, + "loss": 1.0569, + "step": 4963 + }, + { + "epoch": 0.8839031339031339, + "grad_norm": 0.6068132519721985, + "learning_rate": 0.00017699490504400538, + "loss": 1.2653, + "step": 4964 + }, + { + "epoch": 0.8840811965811965, + "grad_norm": 0.597895085811615, + "learning_rate": 0.00017698597242763787, + "loss": 1.2577, + "step": 4965 + }, + { + "epoch": 0.8842592592592593, + "grad_norm": 0.5356902480125427, + "learning_rate": 0.00017697703830289017, + 
"loss": 1.1056, + "step": 4966 + }, + { + "epoch": 0.8844373219373219, + "grad_norm": 0.5429540872573853, + "learning_rate": 0.0001769681026699373, + "loss": 1.0951, + "step": 4967 + }, + { + "epoch": 0.8846153846153846, + "grad_norm": 0.5789309144020081, + "learning_rate": 0.00017695916552895436, + "loss": 1.0786, + "step": 4968 + }, + { + "epoch": 0.8847934472934473, + "grad_norm": 0.5621341466903687, + "learning_rate": 0.0001769502268801164, + "loss": 1.0645, + "step": 4969 + }, + { + "epoch": 0.88497150997151, + "grad_norm": 0.5879453420639038, + "learning_rate": 0.00017694128672359865, + "loss": 1.2171, + "step": 4970 + }, + { + "epoch": 0.8851495726495726, + "grad_norm": 0.5005951523780823, + "learning_rate": 0.0001769323450595762, + "loss": 1.0725, + "step": 4971 + }, + { + "epoch": 0.8853276353276354, + "grad_norm": 0.5439660549163818, + "learning_rate": 0.00017692340188822425, + "loss": 1.162, + "step": 4972 + }, + { + "epoch": 0.885505698005698, + "grad_norm": 0.6309837698936462, + "learning_rate": 0.00017691445720971802, + "loss": 1.2861, + "step": 4973 + }, + { + "epoch": 0.8856837606837606, + "grad_norm": 0.4997463822364807, + "learning_rate": 0.00017690551102423282, + "loss": 1.1887, + "step": 4974 + }, + { + "epoch": 0.8858618233618234, + "grad_norm": 0.5430852174758911, + "learning_rate": 0.00017689656333194385, + "loss": 1.1231, + "step": 4975 + }, + { + "epoch": 0.886039886039886, + "grad_norm": 0.5414215922355652, + "learning_rate": 0.00017688761413302644, + "loss": 1.2345, + "step": 4976 + }, + { + "epoch": 0.8862179487179487, + "grad_norm": 0.5594443082809448, + "learning_rate": 0.00017687866342765601, + "loss": 1.0775, + "step": 4977 + }, + { + "epoch": 0.8863960113960114, + "grad_norm": 0.5827134847640991, + "learning_rate": 0.00017686971121600787, + "loss": 1.0609, + "step": 4978 + }, + { + "epoch": 0.8865740740740741, + "grad_norm": 0.5075414776802063, + "learning_rate": 0.00017686075749825738, + "loss": 0.796, + "step": 4979 + }, + { + 
"epoch": 0.8867521367521367, + "grad_norm": 0.6007544994354248, + "learning_rate": 0.00017685180227458003, + "loss": 1.1716, + "step": 4980 + }, + { + "epoch": 0.8869301994301995, + "grad_norm": 0.6458030343055725, + "learning_rate": 0.00017684284554515128, + "loss": 1.1945, + "step": 4981 + }, + { + "epoch": 0.8871082621082621, + "grad_norm": 0.5519212484359741, + "learning_rate": 0.00017683388731014657, + "loss": 1.2571, + "step": 4982 + }, + { + "epoch": 0.8872863247863247, + "grad_norm": 0.5079960227012634, + "learning_rate": 0.00017682492756974146, + "loss": 1.1186, + "step": 4983 + }, + { + "epoch": 0.8874643874643875, + "grad_norm": 0.63576740026474, + "learning_rate": 0.00017681596632411147, + "loss": 1.389, + "step": 4984 + }, + { + "epoch": 0.8876424501424501, + "grad_norm": 0.43325698375701904, + "learning_rate": 0.0001768070035734322, + "loss": 0.7757, + "step": 4985 + }, + { + "epoch": 0.8878205128205128, + "grad_norm": 0.49492064118385315, + "learning_rate": 0.00017679803931787923, + "loss": 1.0096, + "step": 4986 + }, + { + "epoch": 0.8879985754985755, + "grad_norm": 0.5561224222183228, + "learning_rate": 0.00017678907355762825, + "loss": 0.952, + "step": 4987 + }, + { + "epoch": 0.8881766381766382, + "grad_norm": 0.5392457246780396, + "learning_rate": 0.00017678010629285486, + "loss": 1.0442, + "step": 4988 + }, + { + "epoch": 0.8883547008547008, + "grad_norm": 0.4659234881401062, + "learning_rate": 0.00017677113752373482, + "loss": 0.8668, + "step": 4989 + }, + { + "epoch": 0.8885327635327636, + "grad_norm": 0.5139175057411194, + "learning_rate": 0.0001767621672504438, + "loss": 0.8386, + "step": 4990 + }, + { + "epoch": 0.8887108262108262, + "grad_norm": 0.5395823121070862, + "learning_rate": 0.00017675319547315755, + "loss": 0.9754, + "step": 4991 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.4751867949962616, + "learning_rate": 0.0001767442221920519, + "loss": 0.8775, + "step": 4992 + }, + { + "epoch": 0.8890669515669516, + 
"grad_norm": 0.5728281736373901, + "learning_rate": 0.00017673524740730265, + "loss": 1.2807, + "step": 4993 + }, + { + "epoch": 0.8892450142450142, + "grad_norm": 0.5545622110366821, + "learning_rate": 0.00017672627111908558, + "loss": 1.0039, + "step": 4994 + }, + { + "epoch": 0.8894230769230769, + "grad_norm": 0.5127374529838562, + "learning_rate": 0.00017671729332757665, + "loss": 1.0505, + "step": 4995 + }, + { + "epoch": 0.8896011396011396, + "grad_norm": 0.5238714218139648, + "learning_rate": 0.00017670831403295175, + "loss": 1.1775, + "step": 4996 + }, + { + "epoch": 0.8897792022792023, + "grad_norm": 0.5610160827636719, + "learning_rate": 0.00017669933323538674, + "loss": 1.0555, + "step": 4997 + }, + { + "epoch": 0.8899572649572649, + "grad_norm": 0.5481634736061096, + "learning_rate": 0.00017669035093505762, + "loss": 1.0802, + "step": 4998 + }, + { + "epoch": 0.8901353276353277, + "grad_norm": 0.4725174307823181, + "learning_rate": 0.0001766813671321404, + "loss": 0.9611, + "step": 4999 + }, + { + "epoch": 0.8903133903133903, + "grad_norm": 0.5184635519981384, + "learning_rate": 0.0001766723818268111, + "loss": 1.1659, + "step": 5000 + }, + { + "epoch": 0.8904914529914529, + "grad_norm": 0.5503578186035156, + "learning_rate": 0.00017666339501924575, + "loss": 1.2165, + "step": 5001 + }, + { + "epoch": 0.8906695156695157, + "grad_norm": 0.5299594402313232, + "learning_rate": 0.0001766544067096204, + "loss": 1.0196, + "step": 5002 + }, + { + "epoch": 0.8908475783475783, + "grad_norm": 0.5673944354057312, + "learning_rate": 0.00017664541689811118, + "loss": 1.2058, + "step": 5003 + }, + { + "epoch": 0.8910256410256411, + "grad_norm": 0.6057320833206177, + "learning_rate": 0.00017663642558489426, + "loss": 1.0136, + "step": 5004 + }, + { + "epoch": 0.8912037037037037, + "grad_norm": 0.4767026901245117, + "learning_rate": 0.00017662743277014578, + "loss": 0.8522, + "step": 5005 + }, + { + "epoch": 0.8913817663817664, + "grad_norm": 0.5346270203590393, + 
"learning_rate": 0.00017661843845404192, + "loss": 1.1568, + "step": 5006 + }, + { + "epoch": 0.8915598290598291, + "grad_norm": 0.5365738868713379, + "learning_rate": 0.00017660944263675891, + "loss": 1.0488, + "step": 5007 + }, + { + "epoch": 0.8917378917378918, + "grad_norm": 0.5536269545555115, + "learning_rate": 0.00017660044531847305, + "loss": 1.1216, + "step": 5008 + }, + { + "epoch": 0.8919159544159544, + "grad_norm": 0.6325978636741638, + "learning_rate": 0.00017659144649936055, + "loss": 1.2843, + "step": 5009 + }, + { + "epoch": 0.8920940170940171, + "grad_norm": 0.5890641212463379, + "learning_rate": 0.00017658244617959777, + "loss": 1.1976, + "step": 5010 + }, + { + "epoch": 0.8922720797720798, + "grad_norm": 0.604870080947876, + "learning_rate": 0.00017657344435936107, + "loss": 1.2881, + "step": 5011 + }, + { + "epoch": 0.8924501424501424, + "grad_norm": 0.49805206060409546, + "learning_rate": 0.00017656444103882676, + "loss": 0.8998, + "step": 5012 + }, + { + "epoch": 0.8926282051282052, + "grad_norm": 0.506926953792572, + "learning_rate": 0.0001765554362181713, + "loss": 1.0731, + "step": 5013 + }, + { + "epoch": 0.8928062678062678, + "grad_norm": 0.5353260636329651, + "learning_rate": 0.0001765464298975711, + "loss": 1.0676, + "step": 5014 + }, + { + "epoch": 0.8929843304843305, + "grad_norm": 0.5641853213310242, + "learning_rate": 0.0001765374220772026, + "loss": 0.9606, + "step": 5015 + }, + { + "epoch": 0.8931623931623932, + "grad_norm": 0.5049327611923218, + "learning_rate": 0.00017652841275724233, + "loss": 1.009, + "step": 5016 + }, + { + "epoch": 0.8933404558404558, + "grad_norm": 0.6255155205726624, + "learning_rate": 0.0001765194019378668, + "loss": 1.138, + "step": 5017 + }, + { + "epoch": 0.8935185185185185, + "grad_norm": 0.5816851854324341, + "learning_rate": 0.00017651038961925247, + "loss": 1.3398, + "step": 5018 + }, + { + "epoch": 0.8936965811965812, + "grad_norm": 0.5188020467758179, + "learning_rate": 0.00017650137580157605, + 
"loss": 1.0126, + "step": 5019 + }, + { + "epoch": 0.8938746438746439, + "grad_norm": 0.5231554508209229, + "learning_rate": 0.00017649236048501406, + "loss": 1.0328, + "step": 5020 + }, + { + "epoch": 0.8940527065527065, + "grad_norm": 0.7638634443283081, + "learning_rate": 0.0001764833436697432, + "loss": 1.3016, + "step": 5021 + }, + { + "epoch": 0.8942307692307693, + "grad_norm": 0.5354094505310059, + "learning_rate": 0.00017647432535594008, + "loss": 1.0646, + "step": 5022 + }, + { + "epoch": 0.8944088319088319, + "grad_norm": 0.6938086748123169, + "learning_rate": 0.0001764653055437814, + "loss": 1.2051, + "step": 5023 + }, + { + "epoch": 0.8945868945868946, + "grad_norm": 0.5546849370002747, + "learning_rate": 0.00017645628423344393, + "loss": 1.0671, + "step": 5024 + }, + { + "epoch": 0.8947649572649573, + "grad_norm": 0.49294665455818176, + "learning_rate": 0.0001764472614251044, + "loss": 1.0328, + "step": 5025 + }, + { + "epoch": 0.89494301994302, + "grad_norm": 0.5965796113014221, + "learning_rate": 0.00017643823711893956, + "loss": 1.0741, + "step": 5026 + }, + { + "epoch": 0.8951210826210826, + "grad_norm": 0.4846448302268982, + "learning_rate": 0.00017642921131512626, + "loss": 1.0409, + "step": 5027 + }, + { + "epoch": 0.8952991452991453, + "grad_norm": 0.5767390131950378, + "learning_rate": 0.00017642018401384135, + "loss": 1.018, + "step": 5028 + }, + { + "epoch": 0.895477207977208, + "grad_norm": 0.503027617931366, + "learning_rate": 0.00017641115521526167, + "loss": 1.0002, + "step": 5029 + }, + { + "epoch": 0.8956552706552706, + "grad_norm": 0.6668619513511658, + "learning_rate": 0.00017640212491956412, + "loss": 1.2154, + "step": 5030 + }, + { + "epoch": 0.8958333333333334, + "grad_norm": 0.5544148683547974, + "learning_rate": 0.00017639309312692566, + "loss": 1.2701, + "step": 5031 + }, + { + "epoch": 0.896011396011396, + "grad_norm": 0.6026872992515564, + "learning_rate": 0.00017638405983752323, + "loss": 0.9335, + "step": 5032 + }, + { + 
"epoch": 0.8961894586894587, + "grad_norm": 0.6288694143295288, + "learning_rate": 0.00017637502505153384, + "loss": 0.9075, + "step": 5033 + }, + { + "epoch": 0.8963675213675214, + "grad_norm": 0.4890204966068268, + "learning_rate": 0.00017636598876913446, + "loss": 0.8492, + "step": 5034 + }, + { + "epoch": 0.896545584045584, + "grad_norm": 0.5746598243713379, + "learning_rate": 0.00017635695099050218, + "loss": 1.1557, + "step": 5035 + }, + { + "epoch": 0.8967236467236467, + "grad_norm": 0.5165683031082153, + "learning_rate": 0.00017634791171581405, + "loss": 1.0899, + "step": 5036 + }, + { + "epoch": 0.8969017094017094, + "grad_norm": 0.4621037244796753, + "learning_rate": 0.0001763388709452472, + "loss": 1.0457, + "step": 5037 + }, + { + "epoch": 0.8970797720797721, + "grad_norm": 0.532358705997467, + "learning_rate": 0.00017632982867897876, + "loss": 1.139, + "step": 5038 + }, + { + "epoch": 0.8972578347578347, + "grad_norm": 0.5794399976730347, + "learning_rate": 0.00017632078491718587, + "loss": 1.031, + "step": 5039 + }, + { + "epoch": 0.8974358974358975, + "grad_norm": 0.5031905174255371, + "learning_rate": 0.00017631173966004576, + "loss": 0.9508, + "step": 5040 + }, + { + "epoch": 0.8976139601139601, + "grad_norm": 0.6528840065002441, + "learning_rate": 0.00017630269290773564, + "loss": 0.9974, + "step": 5041 + }, + { + "epoch": 0.8977920227920227, + "grad_norm": 0.6007558703422546, + "learning_rate": 0.00017629364466043273, + "loss": 1.0993, + "step": 5042 + }, + { + "epoch": 0.8979700854700855, + "grad_norm": 0.5104095339775085, + "learning_rate": 0.00017628459491831437, + "loss": 0.9175, + "step": 5043 + }, + { + "epoch": 0.8981481481481481, + "grad_norm": 0.5285516977310181, + "learning_rate": 0.00017627554368155782, + "loss": 0.998, + "step": 5044 + }, + { + "epoch": 0.8983262108262108, + "grad_norm": 0.5629046559333801, + "learning_rate": 0.00017626649095034045, + "loss": 1.2021, + "step": 5045 + }, + { + "epoch": 0.8985042735042735, + 
"grad_norm": 0.57548987865448, + "learning_rate": 0.00017625743672483962, + "loss": 1.2076, + "step": 5046 + }, + { + "epoch": 0.8986823361823362, + "grad_norm": 0.4883024990558624, + "learning_rate": 0.0001762483810052327, + "loss": 0.9761, + "step": 5047 + }, + { + "epoch": 0.8988603988603988, + "grad_norm": 0.6378034949302673, + "learning_rate": 0.0001762393237916972, + "loss": 1.2266, + "step": 5048 + }, + { + "epoch": 0.8990384615384616, + "grad_norm": 0.5201624035835266, + "learning_rate": 0.0001762302650844105, + "loss": 1.247, + "step": 5049 + }, + { + "epoch": 0.8992165242165242, + "grad_norm": 0.5438048243522644, + "learning_rate": 0.0001762212048835501, + "loss": 0.993, + "step": 5050 + }, + { + "epoch": 0.8993945868945868, + "grad_norm": 0.5928253531455994, + "learning_rate": 0.00017621214318929354, + "loss": 1.0469, + "step": 5051 + }, + { + "epoch": 0.8995726495726496, + "grad_norm": 0.6437996625900269, + "learning_rate": 0.00017620308000181831, + "loss": 1.3136, + "step": 5052 + }, + { + "epoch": 0.8997507122507122, + "grad_norm": 0.5961456298828125, + "learning_rate": 0.00017619401532130208, + "loss": 1.1495, + "step": 5053 + }, + { + "epoch": 0.8999287749287749, + "grad_norm": 0.497388631105423, + "learning_rate": 0.0001761849491479224, + "loss": 0.7783, + "step": 5054 + }, + { + "epoch": 0.9001068376068376, + "grad_norm": 0.5984451174736023, + "learning_rate": 0.00017617588148185687, + "loss": 1.3115, + "step": 5055 + }, + { + "epoch": 0.9002849002849003, + "grad_norm": 0.549163818359375, + "learning_rate": 0.0001761668123232832, + "loss": 1.1649, + "step": 5056 + }, + { + "epoch": 0.9004629629629629, + "grad_norm": 0.5831968188285828, + "learning_rate": 0.00017615774167237903, + "loss": 1.1749, + "step": 5057 + }, + { + "epoch": 0.9006410256410257, + "grad_norm": 0.5111076235771179, + "learning_rate": 0.00017614866952932214, + "loss": 0.8936, + "step": 5058 + }, + { + "epoch": 0.9008190883190883, + "grad_norm": 0.5740947723388672, + 
"learning_rate": 0.00017613959589429028, + "loss": 1.2606, + "step": 5059 + }, + { + "epoch": 0.9009971509971509, + "grad_norm": 0.5881099700927734, + "learning_rate": 0.0001761305207674612, + "loss": 1.3682, + "step": 5060 + }, + { + "epoch": 0.9011752136752137, + "grad_norm": 0.5007091760635376, + "learning_rate": 0.00017612144414901268, + "loss": 0.7788, + "step": 5061 + }, + { + "epoch": 0.9013532763532763, + "grad_norm": 0.5127760171890259, + "learning_rate": 0.00017611236603912262, + "loss": 1.0519, + "step": 5062 + }, + { + "epoch": 0.9015313390313391, + "grad_norm": 0.6185184121131897, + "learning_rate": 0.00017610328643796882, + "loss": 1.1672, + "step": 5063 + }, + { + "epoch": 0.9017094017094017, + "grad_norm": 0.49707287549972534, + "learning_rate": 0.00017609420534572926, + "loss": 1.1865, + "step": 5064 + }, + { + "epoch": 0.9018874643874644, + "grad_norm": 0.5667552351951599, + "learning_rate": 0.0001760851227625818, + "loss": 1.1388, + "step": 5065 + }, + { + "epoch": 0.9020655270655271, + "grad_norm": 0.50298011302948, + "learning_rate": 0.00017607603868870442, + "loss": 0.9552, + "step": 5066 + }, + { + "epoch": 0.9022435897435898, + "grad_norm": 0.5709219574928284, + "learning_rate": 0.0001760669531242751, + "loss": 1.2636, + "step": 5067 + }, + { + "epoch": 0.9024216524216524, + "grad_norm": 0.4943496286869049, + "learning_rate": 0.0001760578660694718, + "loss": 0.8951, + "step": 5068 + }, + { + "epoch": 0.9025997150997151, + "grad_norm": 0.5475931167602539, + "learning_rate": 0.00017604877752447267, + "loss": 1.1442, + "step": 5069 + }, + { + "epoch": 0.9027777777777778, + "grad_norm": 0.5280239582061768, + "learning_rate": 0.0001760396874894557, + "loss": 0.9537, + "step": 5070 + }, + { + "epoch": 0.9029558404558404, + "grad_norm": 0.5480797290802002, + "learning_rate": 0.000176030595964599, + "loss": 1.1557, + "step": 5071 + }, + { + "epoch": 0.9031339031339032, + "grad_norm": 0.5232734680175781, + "learning_rate": 0.00017602150295008073, + 
"loss": 1.0219, + "step": 5072 + }, + { + "epoch": 0.9033119658119658, + "grad_norm": 0.5448359251022339, + "learning_rate": 0.000176012408446079, + "loss": 1.1964, + "step": 5073 + }, + { + "epoch": 0.9034900284900285, + "grad_norm": 0.4841914474964142, + "learning_rate": 0.00017600331245277206, + "loss": 1.0667, + "step": 5074 + }, + { + "epoch": 0.9036680911680912, + "grad_norm": 0.5407083630561829, + "learning_rate": 0.0001759942149703381, + "loss": 1.1895, + "step": 5075 + }, + { + "epoch": 0.9038461538461539, + "grad_norm": 0.5140416026115417, + "learning_rate": 0.00017598511599895534, + "loss": 0.9402, + "step": 5076 + }, + { + "epoch": 0.9040242165242165, + "grad_norm": 0.6333765983581543, + "learning_rate": 0.00017597601553880207, + "loss": 1.239, + "step": 5077 + }, + { + "epoch": 0.9042022792022792, + "grad_norm": 0.4996028244495392, + "learning_rate": 0.00017596691359005664, + "loss": 1.0259, + "step": 5078 + }, + { + "epoch": 0.9043803418803419, + "grad_norm": 0.591892421245575, + "learning_rate": 0.00017595781015289732, + "loss": 1.2148, + "step": 5079 + }, + { + "epoch": 0.9045584045584045, + "grad_norm": 0.736499011516571, + "learning_rate": 0.0001759487052275025, + "loss": 1.1373, + "step": 5080 + }, + { + "epoch": 0.9047364672364673, + "grad_norm": 0.5951572060585022, + "learning_rate": 0.00017593959881405057, + "loss": 1.1833, + "step": 5081 + }, + { + "epoch": 0.9049145299145299, + "grad_norm": 0.5092006325721741, + "learning_rate": 0.00017593049091271996, + "loss": 0.8841, + "step": 5082 + }, + { + "epoch": 0.9050925925925926, + "grad_norm": 0.5679013729095459, + "learning_rate": 0.0001759213815236891, + "loss": 1.1056, + "step": 5083 + }, + { + "epoch": 0.9052706552706553, + "grad_norm": 0.5708174109458923, + "learning_rate": 0.0001759122706471365, + "loss": 1.1952, + "step": 5084 + }, + { + "epoch": 0.905448717948718, + "grad_norm": 0.5726733803749084, + "learning_rate": 0.00017590315828324067, + "loss": 1.1013, + "step": 5085 + }, + { + 
"epoch": 0.9056267806267806, + "grad_norm": 0.5821273326873779, + "learning_rate": 0.00017589404443218008, + "loss": 1.2323, + "step": 5086 + }, + { + "epoch": 0.9058048433048433, + "grad_norm": 0.5811445713043213, + "learning_rate": 0.00017588492909413337, + "loss": 1.2241, + "step": 5087 + }, + { + "epoch": 0.905982905982906, + "grad_norm": 0.5377545952796936, + "learning_rate": 0.0001758758122692791, + "loss": 0.9777, + "step": 5088 + }, + { + "epoch": 0.9061609686609686, + "grad_norm": 0.5985640287399292, + "learning_rate": 0.0001758666939577959, + "loss": 0.9737, + "step": 5089 + }, + { + "epoch": 0.9063390313390314, + "grad_norm": 0.6038222908973694, + "learning_rate": 0.00017585757415986247, + "loss": 1.2116, + "step": 5090 + }, + { + "epoch": 0.906517094017094, + "grad_norm": 0.6752246022224426, + "learning_rate": 0.00017584845287565743, + "loss": 1.1975, + "step": 5091 + }, + { + "epoch": 0.9066951566951567, + "grad_norm": 0.5400625467300415, + "learning_rate": 0.0001758393301053595, + "loss": 0.9669, + "step": 5092 + }, + { + "epoch": 0.9068732193732194, + "grad_norm": 0.5637784004211426, + "learning_rate": 0.00017583020584914746, + "loss": 1.2672, + "step": 5093 + }, + { + "epoch": 0.907051282051282, + "grad_norm": 0.4825877249240875, + "learning_rate": 0.00017582108010720006, + "loss": 0.9719, + "step": 5094 + }, + { + "epoch": 0.9072293447293447, + "grad_norm": 0.49902790784835815, + "learning_rate": 0.00017581195287969613, + "loss": 0.7941, + "step": 5095 + }, + { + "epoch": 0.9074074074074074, + "grad_norm": 0.5991541743278503, + "learning_rate": 0.0001758028241668144, + "loss": 1.049, + "step": 5096 + }, + { + "epoch": 0.9075854700854701, + "grad_norm": 0.5788859724998474, + "learning_rate": 0.00017579369396873384, + "loss": 1.0318, + "step": 5097 + }, + { + "epoch": 0.9077635327635327, + "grad_norm": 0.5914160013198853, + "learning_rate": 0.0001757845622856333, + "loss": 1.1007, + "step": 5098 + }, + { + "epoch": 0.9079415954415955, + "grad_norm": 
0.5361711382865906, + "learning_rate": 0.00017577542911769166, + "loss": 1.0694, + "step": 5099 + }, + { + "epoch": 0.9081196581196581, + "grad_norm": 0.5752849578857422, + "learning_rate": 0.00017576629446508792, + "loss": 1.1184, + "step": 5100 + }, + { + "epoch": 0.9082977207977208, + "grad_norm": 0.6042249798774719, + "learning_rate": 0.000175757158328001, + "loss": 1.2808, + "step": 5101 + }, + { + "epoch": 0.9084757834757835, + "grad_norm": 0.508352518081665, + "learning_rate": 0.00017574802070661, + "loss": 1.0038, + "step": 5102 + }, + { + "epoch": 0.9086538461538461, + "grad_norm": 0.5667358040809631, + "learning_rate": 0.00017573888160109385, + "loss": 1.0208, + "step": 5103 + }, + { + "epoch": 0.9088319088319088, + "grad_norm": 0.653619647026062, + "learning_rate": 0.00017572974101163165, + "loss": 1.2053, + "step": 5104 + }, + { + "epoch": 0.9090099715099715, + "grad_norm": 0.5069597363471985, + "learning_rate": 0.00017572059893840246, + "loss": 0.8634, + "step": 5105 + }, + { + "epoch": 0.9091880341880342, + "grad_norm": 0.6160602569580078, + "learning_rate": 0.00017571145538158547, + "loss": 1.2626, + "step": 5106 + }, + { + "epoch": 0.9093660968660968, + "grad_norm": 0.6335833668708801, + "learning_rate": 0.00017570231034135978, + "loss": 1.3381, + "step": 5107 + }, + { + "epoch": 0.9095441595441596, + "grad_norm": 0.5140398740768433, + "learning_rate": 0.00017569316381790454, + "loss": 1.1258, + "step": 5108 + }, + { + "epoch": 0.9097222222222222, + "grad_norm": 0.5682975649833679, + "learning_rate": 0.00017568401581139905, + "loss": 1.3367, + "step": 5109 + }, + { + "epoch": 0.9099002849002849, + "grad_norm": 0.49765729904174805, + "learning_rate": 0.00017567486632202246, + "loss": 1.1891, + "step": 5110 + }, + { + "epoch": 0.9100783475783476, + "grad_norm": 0.5139224529266357, + "learning_rate": 0.00017566571534995406, + "loss": 0.9768, + "step": 5111 + }, + { + "epoch": 0.9102564102564102, + "grad_norm": 0.5510922074317932, + "learning_rate": 
0.00017565656289537316, + "loss": 1.1552, + "step": 5112 + }, + { + "epoch": 0.9104344729344729, + "grad_norm": 0.6243364810943604, + "learning_rate": 0.00017564740895845908, + "loss": 1.1341, + "step": 5113 + }, + { + "epoch": 0.9106125356125356, + "grad_norm": 0.5334977507591248, + "learning_rate": 0.00017563825353939116, + "loss": 1.0894, + "step": 5114 + }, + { + "epoch": 0.9107905982905983, + "grad_norm": 0.5195826292037964, + "learning_rate": 0.00017562909663834878, + "loss": 1.1011, + "step": 5115 + }, + { + "epoch": 0.9109686609686609, + "grad_norm": 0.5298168063163757, + "learning_rate": 0.00017561993825551138, + "loss": 1.0079, + "step": 5116 + }, + { + "epoch": 0.9111467236467237, + "grad_norm": 0.5858965516090393, + "learning_rate": 0.00017561077839105835, + "loss": 1.2746, + "step": 5117 + }, + { + "epoch": 0.9113247863247863, + "grad_norm": 0.5572476387023926, + "learning_rate": 0.0001756016170451692, + "loss": 0.8169, + "step": 5118 + }, + { + "epoch": 0.9115028490028491, + "grad_norm": 0.5247095823287964, + "learning_rate": 0.0001755924542180234, + "loss": 1.1206, + "step": 5119 + }, + { + "epoch": 0.9116809116809117, + "grad_norm": 0.5605118274688721, + "learning_rate": 0.0001755832899098005, + "loss": 1.371, + "step": 5120 + }, + { + "epoch": 0.9118589743589743, + "grad_norm": 0.5732316970825195, + "learning_rate": 0.00017557412412068005, + "loss": 1.1248, + "step": 5121 + }, + { + "epoch": 0.9120370370370371, + "grad_norm": 0.6167279481887817, + "learning_rate": 0.0001755649568508416, + "loss": 0.94, + "step": 5122 + }, + { + "epoch": 0.9122150997150997, + "grad_norm": 0.5497499108314514, + "learning_rate": 0.00017555578810046483, + "loss": 1.0112, + "step": 5123 + }, + { + "epoch": 0.9123931623931624, + "grad_norm": 0.540762186050415, + "learning_rate": 0.00017554661786972931, + "loss": 1.1058, + "step": 5124 + }, + { + "epoch": 0.9125712250712251, + "grad_norm": 0.5943556427955627, + "learning_rate": 0.0001755374461588148, + "loss": 0.9086, + 
"step": 5125 + }, + { + "epoch": 0.9127492877492878, + "grad_norm": 0.5300756692886353, + "learning_rate": 0.0001755282729679009, + "loss": 1.1566, + "step": 5126 + }, + { + "epoch": 0.9129273504273504, + "grad_norm": 0.5390434861183167, + "learning_rate": 0.00017551909829716743, + "loss": 1.1395, + "step": 5127 + }, + { + "epoch": 0.9131054131054132, + "grad_norm": 0.627434492111206, + "learning_rate": 0.00017550992214679405, + "loss": 1.1537, + "step": 5128 + }, + { + "epoch": 0.9132834757834758, + "grad_norm": 0.4806903302669525, + "learning_rate": 0.00017550074451696063, + "loss": 0.7905, + "step": 5129 + }, + { + "epoch": 0.9134615384615384, + "grad_norm": 0.5714817047119141, + "learning_rate": 0.00017549156540784696, + "loss": 1.1042, + "step": 5130 + }, + { + "epoch": 0.9136396011396012, + "grad_norm": 0.5839236378669739, + "learning_rate": 0.0001754823848196329, + "loss": 1.0383, + "step": 5131 + }, + { + "epoch": 0.9138176638176638, + "grad_norm": 0.6089872717857361, + "learning_rate": 0.0001754732027524983, + "loss": 0.9399, + "step": 5132 + }, + { + "epoch": 0.9139957264957265, + "grad_norm": 0.4937956631183624, + "learning_rate": 0.00017546401920662307, + "loss": 0.7382, + "step": 5133 + }, + { + "epoch": 0.9141737891737892, + "grad_norm": 0.5918676257133484, + "learning_rate": 0.00017545483418218716, + "loss": 1.2207, + "step": 5134 + }, + { + "epoch": 0.9143518518518519, + "grad_norm": 0.5825346112251282, + "learning_rate": 0.0001754456476793705, + "loss": 0.9669, + "step": 5135 + }, + { + "epoch": 0.9145299145299145, + "grad_norm": 0.49829617142677307, + "learning_rate": 0.0001754364596983531, + "loss": 1.2247, + "step": 5136 + }, + { + "epoch": 0.9147079772079773, + "grad_norm": 0.5128271579742432, + "learning_rate": 0.00017542727023931497, + "loss": 0.9563, + "step": 5137 + }, + { + "epoch": 0.9148860398860399, + "grad_norm": 0.5789414644241333, + "learning_rate": 0.00017541807930243622, + "loss": 1.22, + "step": 5138 + }, + { + "epoch": 
0.9150641025641025, + "grad_norm": 0.44155433773994446, + "learning_rate": 0.00017540888688789683, + "loss": 0.9897, + "step": 5139 + }, + { + "epoch": 0.9152421652421653, + "grad_norm": 0.550464391708374, + "learning_rate": 0.00017539969299587696, + "loss": 1.0624, + "step": 5140 + }, + { + "epoch": 0.9154202279202279, + "grad_norm": 0.5019831657409668, + "learning_rate": 0.0001753904976265567, + "loss": 0.9045, + "step": 5141 + }, + { + "epoch": 0.9155982905982906, + "grad_norm": 0.589658796787262, + "learning_rate": 0.0001753813007801163, + "loss": 1.0454, + "step": 5142 + }, + { + "epoch": 0.9157763532763533, + "grad_norm": 0.5945459008216858, + "learning_rate": 0.00017537210245673586, + "loss": 1.0042, + "step": 5143 + }, + { + "epoch": 0.915954415954416, + "grad_norm": 0.5409809947013855, + "learning_rate": 0.00017536290265659566, + "loss": 1.0609, + "step": 5144 + }, + { + "epoch": 0.9161324786324786, + "grad_norm": 0.5302975177764893, + "learning_rate": 0.00017535370137987597, + "loss": 1.1394, + "step": 5145 + }, + { + "epoch": 0.9163105413105413, + "grad_norm": 0.5253351330757141, + "learning_rate": 0.00017534449862675698, + "loss": 1.2249, + "step": 5146 + }, + { + "epoch": 0.916488603988604, + "grad_norm": 0.6363829970359802, + "learning_rate": 0.00017533529439741908, + "loss": 1.1333, + "step": 5147 + }, + { + "epoch": 0.9166666666666666, + "grad_norm": 0.4703354835510254, + "learning_rate": 0.0001753260886920426, + "loss": 0.9971, + "step": 5148 + }, + { + "epoch": 0.9168447293447294, + "grad_norm": 0.6394907236099243, + "learning_rate": 0.00017531688151080786, + "loss": 1.5942, + "step": 5149 + }, + { + "epoch": 0.917022792022792, + "grad_norm": 0.5573459267616272, + "learning_rate": 0.00017530767285389527, + "loss": 0.9669, + "step": 5150 + }, + { + "epoch": 0.9172008547008547, + "grad_norm": 0.5000962615013123, + "learning_rate": 0.00017529846272148532, + "loss": 1.2151, + "step": 5151 + }, + { + "epoch": 0.9173789173789174, + "grad_norm": 
0.5550395846366882, + "learning_rate": 0.0001752892511137584, + "loss": 1.1765, + "step": 5152 + }, + { + "epoch": 0.91755698005698, + "grad_norm": 0.5461394786834717, + "learning_rate": 0.00017528003803089496, + "loss": 1.1136, + "step": 5153 + }, + { + "epoch": 0.9177350427350427, + "grad_norm": 0.5512672662734985, + "learning_rate": 0.00017527082347307558, + "loss": 1.1727, + "step": 5154 + }, + { + "epoch": 0.9179131054131054, + "grad_norm": 0.5210778713226318, + "learning_rate": 0.0001752616074404808, + "loss": 1.09, + "step": 5155 + }, + { + "epoch": 0.9180911680911681, + "grad_norm": 0.5214943289756775, + "learning_rate": 0.00017525238993329115, + "loss": 0.9654, + "step": 5156 + }, + { + "epoch": 0.9182692307692307, + "grad_norm": 0.5822862386703491, + "learning_rate": 0.00017524317095168724, + "loss": 1.0951, + "step": 5157 + }, + { + "epoch": 0.9184472934472935, + "grad_norm": 0.43948012590408325, + "learning_rate": 0.0001752339504958497, + "loss": 0.6984, + "step": 5158 + }, + { + "epoch": 0.9186253561253561, + "grad_norm": 0.5024449229240417, + "learning_rate": 0.00017522472856595916, + "loss": 0.983, + "step": 5159 + }, + { + "epoch": 0.9188034188034188, + "grad_norm": 0.5815144181251526, + "learning_rate": 0.00017521550516219636, + "loss": 0.9784, + "step": 5160 + }, + { + "epoch": 0.9189814814814815, + "grad_norm": 0.5519825220108032, + "learning_rate": 0.00017520628028474197, + "loss": 1.064, + "step": 5161 + }, + { + "epoch": 0.9191595441595442, + "grad_norm": 0.5615749955177307, + "learning_rate": 0.00017519705393377675, + "loss": 1.1284, + "step": 5162 + }, + { + "epoch": 0.9193376068376068, + "grad_norm": 0.5929917693138123, + "learning_rate": 0.00017518782610948148, + "loss": 1.1221, + "step": 5163 + }, + { + "epoch": 0.9195156695156695, + "grad_norm": 0.7116361856460571, + "learning_rate": 0.00017517859681203692, + "loss": 1.0188, + "step": 5164 + }, + { + "epoch": 0.9196937321937322, + "grad_norm": 0.5095893740653992, + "learning_rate": 
0.00017516936604162396, + "loss": 1.0724, + "step": 5165 + }, + { + "epoch": 0.9198717948717948, + "grad_norm": 0.5701385736465454, + "learning_rate": 0.00017516013379842337, + "loss": 1.0572, + "step": 5166 + }, + { + "epoch": 0.9200498575498576, + "grad_norm": 0.518412709236145, + "learning_rate": 0.00017515090008261613, + "loss": 1.0514, + "step": 5167 + }, + { + "epoch": 0.9202279202279202, + "grad_norm": 0.5324261784553528, + "learning_rate": 0.00017514166489438312, + "loss": 1.1708, + "step": 5168 + }, + { + "epoch": 0.9204059829059829, + "grad_norm": 0.5640990138053894, + "learning_rate": 0.00017513242823390525, + "loss": 1.2846, + "step": 5169 + }, + { + "epoch": 0.9205840455840456, + "grad_norm": 0.510352373123169, + "learning_rate": 0.00017512319010136356, + "loss": 1.0763, + "step": 5170 + }, + { + "epoch": 0.9207621082621082, + "grad_norm": 0.4994175136089325, + "learning_rate": 0.00017511395049693898, + "loss": 0.9665, + "step": 5171 + }, + { + "epoch": 0.9209401709401709, + "grad_norm": 0.43196994066238403, + "learning_rate": 0.00017510470942081258, + "loss": 0.761, + "step": 5172 + }, + { + "epoch": 0.9211182336182336, + "grad_norm": 0.558977484703064, + "learning_rate": 0.00017509546687316543, + "loss": 1.0758, + "step": 5173 + }, + { + "epoch": 0.9212962962962963, + "grad_norm": 0.573302149772644, + "learning_rate": 0.0001750862228541786, + "loss": 0.9635, + "step": 5174 + }, + { + "epoch": 0.9214743589743589, + "grad_norm": 0.5083786845207214, + "learning_rate": 0.00017507697736403321, + "loss": 1.0311, + "step": 5175 + }, + { + "epoch": 0.9216524216524217, + "grad_norm": 0.5478954911231995, + "learning_rate": 0.00017506773040291043, + "loss": 1.074, + "step": 5176 + }, + { + "epoch": 0.9218304843304843, + "grad_norm": 0.522376537322998, + "learning_rate": 0.00017505848197099137, + "loss": 1.1162, + "step": 5177 + }, + { + "epoch": 0.9220085470085471, + "grad_norm": 0.5946292281150818, + "learning_rate": 0.0001750492320684573, + "loss": 0.9494, + 
"step": 5178 + }, + { + "epoch": 0.9221866096866097, + "grad_norm": 0.5423247814178467, + "learning_rate": 0.00017503998069548943, + "loss": 1.0558, + "step": 5179 + }, + { + "epoch": 0.9223646723646723, + "grad_norm": 0.49960651993751526, + "learning_rate": 0.000175030727852269, + "loss": 1.0748, + "step": 5180 + }, + { + "epoch": 0.9225427350427351, + "grad_norm": 0.6066586375236511, + "learning_rate": 0.00017502147353897732, + "loss": 1.2066, + "step": 5181 + }, + { + "epoch": 0.9227207977207977, + "grad_norm": 0.57244473695755, + "learning_rate": 0.00017501221775579576, + "loss": 1.048, + "step": 5182 + }, + { + "epoch": 0.9228988603988604, + "grad_norm": 0.512464165687561, + "learning_rate": 0.00017500296050290557, + "loss": 1.1405, + "step": 5183 + }, + { + "epoch": 0.9230769230769231, + "grad_norm": 0.5380734801292419, + "learning_rate": 0.00017499370178048818, + "loss": 1.0641, + "step": 5184 + }, + { + "epoch": 0.9232549857549858, + "grad_norm": 0.47102874517440796, + "learning_rate": 0.000174984441588725, + "loss": 0.7948, + "step": 5185 + }, + { + "epoch": 0.9234330484330484, + "grad_norm": 0.6702211499214172, + "learning_rate": 0.00017497517992779747, + "loss": 1.3009, + "step": 5186 + }, + { + "epoch": 0.9236111111111112, + "grad_norm": 0.4685834050178528, + "learning_rate": 0.000174965916797887, + "loss": 0.8136, + "step": 5187 + }, + { + "epoch": 0.9237891737891738, + "grad_norm": 0.5414277911186218, + "learning_rate": 0.00017495665219917513, + "loss": 0.9708, + "step": 5188 + }, + { + "epoch": 0.9239672364672364, + "grad_norm": 0.5253050923347473, + "learning_rate": 0.0001749473861318434, + "loss": 1.0691, + "step": 5189 + }, + { + "epoch": 0.9241452991452992, + "grad_norm": 0.6009906530380249, + "learning_rate": 0.00017493811859607328, + "loss": 1.2023, + "step": 5190 + }, + { + "epoch": 0.9243233618233618, + "grad_norm": 0.5519336462020874, + "learning_rate": 0.00017492884959204643, + "loss": 1.189, + "step": 5191 + }, + { + "epoch": 
0.9245014245014245, + "grad_norm": 0.5024857521057129, + "learning_rate": 0.0001749195791199444, + "loss": 0.8685, + "step": 5192 + }, + { + "epoch": 0.9246794871794872, + "grad_norm": 0.5735679864883423, + "learning_rate": 0.00017491030717994887, + "loss": 1.1903, + "step": 5193 + }, + { + "epoch": 0.9248575498575499, + "grad_norm": 0.5338658094406128, + "learning_rate": 0.00017490103377224147, + "loss": 1.0442, + "step": 5194 + }, + { + "epoch": 0.9250356125356125, + "grad_norm": 0.46669119596481323, + "learning_rate": 0.0001748917588970039, + "loss": 0.6343, + "step": 5195 + }, + { + "epoch": 0.9252136752136753, + "grad_norm": 0.510910153388977, + "learning_rate": 0.00017488248255441793, + "loss": 0.9334, + "step": 5196 + }, + { + "epoch": 0.9253917378917379, + "grad_norm": 0.5732216238975525, + "learning_rate": 0.00017487320474466524, + "loss": 1.0483, + "step": 5197 + }, + { + "epoch": 0.9255698005698005, + "grad_norm": 0.5864318609237671, + "learning_rate": 0.00017486392546792762, + "loss": 1.0669, + "step": 5198 + }, + { + "epoch": 0.9257478632478633, + "grad_norm": 0.5074281096458435, + "learning_rate": 0.00017485464472438692, + "loss": 1.0636, + "step": 5199 + }, + { + "epoch": 0.9259259259259259, + "grad_norm": 0.5833215117454529, + "learning_rate": 0.00017484536251422496, + "loss": 1.2005, + "step": 5200 + }, + { + "epoch": 0.9261039886039886, + "grad_norm": 0.5624990463256836, + "learning_rate": 0.0001748360788376236, + "loss": 1.1623, + "step": 5201 + }, + { + "epoch": 0.9262820512820513, + "grad_norm": 0.5618230104446411, + "learning_rate": 0.00017482679369476472, + "loss": 1.0495, + "step": 5202 + }, + { + "epoch": 0.926460113960114, + "grad_norm": 0.6254985332489014, + "learning_rate": 0.00017481750708583024, + "loss": 0.9521, + "step": 5203 + }, + { + "epoch": 0.9266381766381766, + "grad_norm": 0.5488203763961792, + "learning_rate": 0.00017480821901100216, + "loss": 1.0689, + "step": 5204 + }, + { + "epoch": 0.9268162393162394, + "grad_norm": 
0.6157993674278259, + "learning_rate": 0.00017479892947046245, + "loss": 1.2852, + "step": 5205 + }, + { + "epoch": 0.926994301994302, + "grad_norm": 0.49653390049934387, + "learning_rate": 0.00017478963846439305, + "loss": 0.8616, + "step": 5206 + }, + { + "epoch": 0.9271723646723646, + "grad_norm": 0.5079081058502197, + "learning_rate": 0.00017478034599297603, + "loss": 1.0192, + "step": 5207 + }, + { + "epoch": 0.9273504273504274, + "grad_norm": 0.5392495393753052, + "learning_rate": 0.00017477105205639354, + "loss": 1.115, + "step": 5208 + }, + { + "epoch": 0.92752849002849, + "grad_norm": 0.5336191654205322, + "learning_rate": 0.00017476175665482756, + "loss": 1.1892, + "step": 5209 + }, + { + "epoch": 0.9277065527065527, + "grad_norm": 0.631712019443512, + "learning_rate": 0.00017475245978846026, + "loss": 0.9619, + "step": 5210 + }, + { + "epoch": 0.9278846153846154, + "grad_norm": 0.5123951435089111, + "learning_rate": 0.0001747431614574738, + "loss": 1.1477, + "step": 5211 + }, + { + "epoch": 0.9280626780626781, + "grad_norm": 0.5045743584632874, + "learning_rate": 0.00017473386166205038, + "loss": 0.9749, + "step": 5212 + }, + { + "epoch": 0.9282407407407407, + "grad_norm": 0.5296525359153748, + "learning_rate": 0.00017472456040237217, + "loss": 1.0736, + "step": 5213 + }, + { + "epoch": 0.9284188034188035, + "grad_norm": 0.6304933428764343, + "learning_rate": 0.00017471525767862145, + "loss": 1.2444, + "step": 5214 + }, + { + "epoch": 0.9285968660968661, + "grad_norm": 0.4851958155632019, + "learning_rate": 0.00017470595349098044, + "loss": 0.9049, + "step": 5215 + }, + { + "epoch": 0.9287749287749287, + "grad_norm": 0.5730679631233215, + "learning_rate": 0.00017469664783963148, + "loss": 1.0773, + "step": 5216 + }, + { + "epoch": 0.9289529914529915, + "grad_norm": 0.6020415425300598, + "learning_rate": 0.00017468734072475684, + "loss": 1.3247, + "step": 5217 + }, + { + "epoch": 0.9291310541310541, + "grad_norm": 0.47981077432632446, + "learning_rate": 
0.00017467803214653893, + "loss": 1.0009, + "step": 5218 + }, + { + "epoch": 0.9293091168091168, + "grad_norm": 0.5787527561187744, + "learning_rate": 0.0001746687221051601, + "loss": 1.2523, + "step": 5219 + }, + { + "epoch": 0.9294871794871795, + "grad_norm": 0.4495891332626343, + "learning_rate": 0.00017465941060080278, + "loss": 0.7364, + "step": 5220 + }, + { + "epoch": 0.9296652421652422, + "grad_norm": 0.5721768140792847, + "learning_rate": 0.0001746500976336494, + "loss": 1.015, + "step": 5221 + }, + { + "epoch": 0.9298433048433048, + "grad_norm": 0.5500208735466003, + "learning_rate": 0.0001746407832038824, + "loss": 1.053, + "step": 5222 + }, + { + "epoch": 0.9300213675213675, + "grad_norm": 0.5784386992454529, + "learning_rate": 0.00017463146731168437, + "loss": 0.9784, + "step": 5223 + }, + { + "epoch": 0.9301994301994302, + "grad_norm": 0.4960322082042694, + "learning_rate": 0.00017462214995723772, + "loss": 0.8674, + "step": 5224 + }, + { + "epoch": 0.9303774928774928, + "grad_norm": 0.5005537271499634, + "learning_rate": 0.00017461283114072508, + "loss": 1.0486, + "step": 5225 + }, + { + "epoch": 0.9305555555555556, + "grad_norm": 0.5064167380332947, + "learning_rate": 0.000174603510862329, + "loss": 0.9722, + "step": 5226 + }, + { + "epoch": 0.9307336182336182, + "grad_norm": 0.583558976650238, + "learning_rate": 0.0001745941891222321, + "loss": 0.9957, + "step": 5227 + }, + { + "epoch": 0.9309116809116809, + "grad_norm": 0.4982515871524811, + "learning_rate": 0.00017458486592061704, + "loss": 0.958, + "step": 5228 + }, + { + "epoch": 0.9310897435897436, + "grad_norm": 0.526549756526947, + "learning_rate": 0.0001745755412576664, + "loss": 1.1172, + "step": 5229 + }, + { + "epoch": 0.9312678062678063, + "grad_norm": 0.6129719018936157, + "learning_rate": 0.000174566215133563, + "loss": 1.2524, + "step": 5230 + }, + { + "epoch": 0.9314458689458689, + "grad_norm": 0.5385653972625732, + "learning_rate": 0.00017455688754848948, + "loss": 1.1655, + 
"step": 5231 + }, + { + "epoch": 0.9316239316239316, + "grad_norm": 0.5646410584449768, + "learning_rate": 0.0001745475585026287, + "loss": 0.9026, + "step": 5232 + }, + { + "epoch": 0.9318019943019943, + "grad_norm": 0.549223780632019, + "learning_rate": 0.0001745382279961633, + "loss": 0.804, + "step": 5233 + }, + { + "epoch": 0.9319800569800569, + "grad_norm": 0.48547953367233276, + "learning_rate": 0.0001745288960292762, + "loss": 1.0224, + "step": 5234 + }, + { + "epoch": 0.9321581196581197, + "grad_norm": 0.5260967016220093, + "learning_rate": 0.00017451956260215016, + "loss": 0.9688, + "step": 5235 + }, + { + "epoch": 0.9323361823361823, + "grad_norm": 0.6261999011039734, + "learning_rate": 0.00017451022771496812, + "loss": 1.2539, + "step": 5236 + }, + { + "epoch": 0.9325142450142451, + "grad_norm": 0.5801421999931335, + "learning_rate": 0.00017450089136791298, + "loss": 1.11, + "step": 5237 + }, + { + "epoch": 0.9326923076923077, + "grad_norm": 0.5833573937416077, + "learning_rate": 0.0001744915535611676, + "loss": 0.9328, + "step": 5238 + }, + { + "epoch": 0.9328703703703703, + "grad_norm": 0.5422634482383728, + "learning_rate": 0.00017448221429491496, + "loss": 1.034, + "step": 5239 + }, + { + "epoch": 0.9330484330484331, + "grad_norm": 0.5105658769607544, + "learning_rate": 0.00017447287356933808, + "loss": 0.8924, + "step": 5240 + }, + { + "epoch": 0.9332264957264957, + "grad_norm": 0.5114831924438477, + "learning_rate": 0.00017446353138461995, + "loss": 0.9328, + "step": 5241 + }, + { + "epoch": 0.9334045584045584, + "grad_norm": 0.5105039477348328, + "learning_rate": 0.00017445418774094358, + "loss": 1.0468, + "step": 5242 + }, + { + "epoch": 0.9335826210826211, + "grad_norm": 0.593250036239624, + "learning_rate": 0.00017444484263849208, + "loss": 1.0603, + "step": 5243 + }, + { + "epoch": 0.9337606837606838, + "grad_norm": 0.600788414478302, + "learning_rate": 0.00017443549607744853, + "loss": 1.1506, + "step": 5244 + }, + { + "epoch": 
0.9339387464387464, + "grad_norm": 0.5394418239593506, + "learning_rate": 0.00017442614805799605, + "loss": 1.038, + "step": 5245 + }, + { + "epoch": 0.9341168091168092, + "grad_norm": 0.5446375608444214, + "learning_rate": 0.00017441679858031786, + "loss": 1.079, + "step": 5246 + }, + { + "epoch": 0.9342948717948718, + "grad_norm": 0.5859794616699219, + "learning_rate": 0.00017440744764459702, + "loss": 1.1453, + "step": 5247 + }, + { + "epoch": 0.9344729344729344, + "grad_norm": 0.4899081289768219, + "learning_rate": 0.00017439809525101688, + "loss": 1.163, + "step": 5248 + }, + { + "epoch": 0.9346509971509972, + "grad_norm": 0.652846097946167, + "learning_rate": 0.00017438874139976055, + "loss": 1.1819, + "step": 5249 + }, + { + "epoch": 0.9348290598290598, + "grad_norm": 0.5402514934539795, + "learning_rate": 0.00017437938609101138, + "loss": 1.0159, + "step": 5250 + }, + { + "epoch": 0.9350071225071225, + "grad_norm": 0.565864086151123, + "learning_rate": 0.00017437002932495265, + "loss": 1.1121, + "step": 5251 + }, + { + "epoch": 0.9351851851851852, + "grad_norm": 0.611786425113678, + "learning_rate": 0.0001743606711017677, + "loss": 1.2511, + "step": 5252 + }, + { + "epoch": 0.9353632478632479, + "grad_norm": 0.5706882476806641, + "learning_rate": 0.00017435131142163988, + "loss": 1.128, + "step": 5253 + }, + { + "epoch": 0.9355413105413105, + "grad_norm": 0.5369367003440857, + "learning_rate": 0.00017434195028475253, + "loss": 1.0562, + "step": 5254 + }, + { + "epoch": 0.9357193732193733, + "grad_norm": 0.49957552552223206, + "learning_rate": 0.0001743325876912891, + "loss": 1.0568, + "step": 5255 + }, + { + "epoch": 0.9358974358974359, + "grad_norm": 0.5398106575012207, + "learning_rate": 0.00017432322364143305, + "loss": 1.1502, + "step": 5256 + }, + { + "epoch": 0.9360754985754985, + "grad_norm": 0.6522027254104614, + "learning_rate": 0.00017431385813536783, + "loss": 1.0591, + "step": 5257 + }, + { + "epoch": 0.9362535612535613, + "grad_norm": 
0.5872012972831726, + "learning_rate": 0.00017430449117327693, + "loss": 1.3737, + "step": 5258 + }, + { + "epoch": 0.9364316239316239, + "grad_norm": 0.5124474167823792, + "learning_rate": 0.00017429512275534382, + "loss": 1.0727, + "step": 5259 + }, + { + "epoch": 0.9366096866096866, + "grad_norm": 0.5103365778923035, + "learning_rate": 0.00017428575288175218, + "loss": 1.0339, + "step": 5260 + }, + { + "epoch": 0.9367877492877493, + "grad_norm": 0.585483729839325, + "learning_rate": 0.0001742763815526855, + "loss": 1.1844, + "step": 5261 + }, + { + "epoch": 0.936965811965812, + "grad_norm": 0.5855562090873718, + "learning_rate": 0.00017426700876832746, + "loss": 1.3234, + "step": 5262 + }, + { + "epoch": 0.9371438746438746, + "grad_norm": 0.5774588584899902, + "learning_rate": 0.00017425763452886162, + "loss": 1.0937, + "step": 5263 + }, + { + "epoch": 0.9373219373219374, + "grad_norm": 0.5718343257904053, + "learning_rate": 0.00017424825883447168, + "loss": 1.0783, + "step": 5264 + }, + { + "epoch": 0.9375, + "grad_norm": 0.5414558053016663, + "learning_rate": 0.00017423888168534136, + "loss": 1.1244, + "step": 5265 + }, + { + "epoch": 0.9376780626780626, + "grad_norm": 0.5818275809288025, + "learning_rate": 0.00017422950308165438, + "loss": 1.247, + "step": 5266 + }, + { + "epoch": 0.9378561253561254, + "grad_norm": 0.586398184299469, + "learning_rate": 0.00017422012302359448, + "loss": 1.0515, + "step": 5267 + }, + { + "epoch": 0.938034188034188, + "grad_norm": 0.5236606001853943, + "learning_rate": 0.00017421074151134544, + "loss": 1.1907, + "step": 5268 + }, + { + "epoch": 0.9382122507122507, + "grad_norm": 0.5108010172843933, + "learning_rate": 0.0001742013585450911, + "loss": 1.1125, + "step": 5269 + }, + { + "epoch": 0.9383903133903134, + "grad_norm": 0.4956454038619995, + "learning_rate": 0.00017419197412501527, + "loss": 1.0305, + "step": 5270 + }, + { + "epoch": 0.9385683760683761, + "grad_norm": 0.5432302951812744, + "learning_rate": 
0.0001741825882513018, + "loss": 1.1946, + "step": 5271 + }, + { + "epoch": 0.9387464387464387, + "grad_norm": 0.5119295716285706, + "learning_rate": 0.00017417320092413463, + "loss": 0.875, + "step": 5272 + }, + { + "epoch": 0.9389245014245015, + "grad_norm": 0.49740248918533325, + "learning_rate": 0.0001741638121436977, + "loss": 1.1093, + "step": 5273 + }, + { + "epoch": 0.9391025641025641, + "grad_norm": 0.5069027543067932, + "learning_rate": 0.00017415442191017491, + "loss": 1.2498, + "step": 5274 + }, + { + "epoch": 0.9392806267806267, + "grad_norm": 0.570264995098114, + "learning_rate": 0.00017414503022375027, + "loss": 1.0192, + "step": 5275 + }, + { + "epoch": 0.9394586894586895, + "grad_norm": 0.48129352927207947, + "learning_rate": 0.00017413563708460776, + "loss": 0.8467, + "step": 5276 + }, + { + "epoch": 0.9396367521367521, + "grad_norm": 0.5214534401893616, + "learning_rate": 0.00017412624249293148, + "loss": 0.9723, + "step": 5277 + }, + { + "epoch": 0.9398148148148148, + "grad_norm": 0.5150161385536194, + "learning_rate": 0.00017411684644890544, + "loss": 1.0906, + "step": 5278 + }, + { + "epoch": 0.9399928774928775, + "grad_norm": 0.5695852637290955, + "learning_rate": 0.00017410744895271377, + "loss": 1.2891, + "step": 5279 + }, + { + "epoch": 0.9401709401709402, + "grad_norm": 0.5613594651222229, + "learning_rate": 0.00017409805000454055, + "loss": 1.1373, + "step": 5280 + }, + { + "epoch": 0.9403490028490028, + "grad_norm": 0.5134239196777344, + "learning_rate": 0.00017408864960457004, + "loss": 1.1081, + "step": 5281 + }, + { + "epoch": 0.9405270655270656, + "grad_norm": 0.5256397724151611, + "learning_rate": 0.00017407924775298628, + "loss": 1.058, + "step": 5282 + }, + { + "epoch": 0.9407051282051282, + "grad_norm": 0.5145402550697327, + "learning_rate": 0.00017406984444997357, + "loss": 1.0667, + "step": 5283 + }, + { + "epoch": 0.9408831908831908, + "grad_norm": 0.5435704588890076, + "learning_rate": 0.0001740604396957161, + "loss": 
1.2275, + "step": 5284 + }, + { + "epoch": 0.9410612535612536, + "grad_norm": 0.5798762440681458, + "learning_rate": 0.0001740510334903982, + "loss": 1.2061, + "step": 5285 + }, + { + "epoch": 0.9412393162393162, + "grad_norm": 0.5461057424545288, + "learning_rate": 0.00017404162583420414, + "loss": 1.1585, + "step": 5286 + }, + { + "epoch": 0.9414173789173789, + "grad_norm": 0.5090487003326416, + "learning_rate": 0.00017403221672731818, + "loss": 1.2496, + "step": 5287 + }, + { + "epoch": 0.9415954415954416, + "grad_norm": 0.5171035528182983, + "learning_rate": 0.00017402280616992476, + "loss": 1.1947, + "step": 5288 + }, + { + "epoch": 0.9417735042735043, + "grad_norm": 0.5292364358901978, + "learning_rate": 0.00017401339416220818, + "loss": 1.0182, + "step": 5289 + }, + { + "epoch": 0.9419515669515669, + "grad_norm": 0.5011499524116516, + "learning_rate": 0.00017400398070435293, + "loss": 1.3363, + "step": 5290 + }, + { + "epoch": 0.9421296296296297, + "grad_norm": 0.4821554720401764, + "learning_rate": 0.0001739945657965434, + "loss": 0.9077, + "step": 5291 + }, + { + "epoch": 0.9423076923076923, + "grad_norm": 0.5849515199661255, + "learning_rate": 0.00017398514943896403, + "loss": 1.1582, + "step": 5292 + }, + { + "epoch": 0.9424857549857549, + "grad_norm": 0.49826139211654663, + "learning_rate": 0.00017397573163179937, + "loss": 1.1025, + "step": 5293 + }, + { + "epoch": 0.9426638176638177, + "grad_norm": 0.6031842827796936, + "learning_rate": 0.00017396631237523392, + "loss": 1.1932, + "step": 5294 + }, + { + "epoch": 0.9428418803418803, + "grad_norm": 0.6013330221176147, + "learning_rate": 0.00017395689166945224, + "loss": 1.2078, + "step": 5295 + }, + { + "epoch": 0.9430199430199431, + "grad_norm": 0.5147021412849426, + "learning_rate": 0.00017394746951463893, + "loss": 0.9988, + "step": 5296 + }, + { + "epoch": 0.9431980056980057, + "grad_norm": 0.5721762776374817, + "learning_rate": 0.0001739380459109785, + "loss": 1.1442, + "step": 5297 + }, + { + 
"epoch": 0.9433760683760684, + "grad_norm": 0.49272531270980835, + "learning_rate": 0.0001739286208586557, + "loss": 1.0481, + "step": 5298 + }, + { + "epoch": 0.9435541310541311, + "grad_norm": 0.6545688509941101, + "learning_rate": 0.00017391919435785514, + "loss": 1.1393, + "step": 5299 + }, + { + "epoch": 0.9437321937321937, + "grad_norm": 0.617756724357605, + "learning_rate": 0.00017390976640876152, + "loss": 1.1108, + "step": 5300 + }, + { + "epoch": 0.9439102564102564, + "grad_norm": 0.4870470464229584, + "learning_rate": 0.00017390033701155955, + "loss": 0.9028, + "step": 5301 + }, + { + "epoch": 0.9440883190883191, + "grad_norm": 0.5250138640403748, + "learning_rate": 0.000173890906166434, + "loss": 1.0326, + "step": 5302 + }, + { + "epoch": 0.9442663817663818, + "grad_norm": 0.5879467129707336, + "learning_rate": 0.00017388147387356964, + "loss": 1.1569, + "step": 5303 + }, + { + "epoch": 0.9444444444444444, + "grad_norm": 0.4790486991405487, + "learning_rate": 0.00017387204013315127, + "loss": 0.967, + "step": 5304 + }, + { + "epoch": 0.9446225071225072, + "grad_norm": 0.5884372591972351, + "learning_rate": 0.0001738626049453637, + "loss": 1.1342, + "step": 5305 + }, + { + "epoch": 0.9448005698005698, + "grad_norm": 0.4633975028991699, + "learning_rate": 0.00017385316831039187, + "loss": 0.8942, + "step": 5306 + }, + { + "epoch": 0.9449786324786325, + "grad_norm": 0.5301823019981384, + "learning_rate": 0.0001738437302284206, + "loss": 1.1683, + "step": 5307 + }, + { + "epoch": 0.9451566951566952, + "grad_norm": 0.5476770997047424, + "learning_rate": 0.00017383429069963484, + "loss": 1.1574, + "step": 5308 + }, + { + "epoch": 0.9453347578347578, + "grad_norm": 0.47689101099967957, + "learning_rate": 0.00017382484972421953, + "loss": 1.0792, + "step": 5309 + }, + { + "epoch": 0.9455128205128205, + "grad_norm": 0.526063084602356, + "learning_rate": 0.00017381540730235963, + "loss": 0.9012, + "step": 5310 + }, + { + "epoch": 0.9456908831908832, + 
"grad_norm": 0.5667058229446411, + "learning_rate": 0.0001738059634342402, + "loss": 1.0908, + "step": 5311 + }, + { + "epoch": 0.9458689458689459, + "grad_norm": 0.5402196645736694, + "learning_rate": 0.00017379651812004623, + "loss": 0.943, + "step": 5312 + }, + { + "epoch": 0.9460470085470085, + "grad_norm": 0.5288932919502258, + "learning_rate": 0.00017378707135996276, + "loss": 1.0055, + "step": 5313 + }, + { + "epoch": 0.9462250712250713, + "grad_norm": 0.5607456564903259, + "learning_rate": 0.00017377762315417492, + "loss": 1.2073, + "step": 5314 + }, + { + "epoch": 0.9464031339031339, + "grad_norm": 0.5737698674201965, + "learning_rate": 0.00017376817350286781, + "loss": 1.0001, + "step": 5315 + }, + { + "epoch": 0.9465811965811965, + "grad_norm": 0.6562079787254333, + "learning_rate": 0.00017375872240622657, + "loss": 1.1503, + "step": 5316 + }, + { + "epoch": 0.9467592592592593, + "grad_norm": 0.5407183170318604, + "learning_rate": 0.0001737492698644364, + "loss": 1.1169, + "step": 5317 + }, + { + "epoch": 0.9469373219373219, + "grad_norm": 0.5504152178764343, + "learning_rate": 0.00017373981587768248, + "loss": 1.0468, + "step": 5318 + }, + { + "epoch": 0.9471153846153846, + "grad_norm": 0.4813530743122101, + "learning_rate": 0.00017373036044615006, + "loss": 0.9707, + "step": 5319 + }, + { + "epoch": 0.9472934472934473, + "grad_norm": 0.5810509920120239, + "learning_rate": 0.00017372090357002437, + "loss": 1.4949, + "step": 5320 + }, + { + "epoch": 0.94747150997151, + "grad_norm": 0.5250222086906433, + "learning_rate": 0.00017371144524949074, + "loss": 1.0818, + "step": 5321 + }, + { + "epoch": 0.9476495726495726, + "grad_norm": 0.4852280914783478, + "learning_rate": 0.00017370198548473444, + "loss": 1.1793, + "step": 5322 + }, + { + "epoch": 0.9478276353276354, + "grad_norm": 0.5392420291900635, + "learning_rate": 0.00017369252427594086, + "loss": 1.153, + "step": 5323 + }, + { + "epoch": 0.948005698005698, + "grad_norm": 0.521294116973877, + 
"learning_rate": 0.00017368306162329533, + "loss": 0.8572, + "step": 5324 + }, + { + "epoch": 0.9481837606837606, + "grad_norm": 0.5579673647880554, + "learning_rate": 0.0001736735975269833, + "loss": 1.0452, + "step": 5325 + }, + { + "epoch": 0.9483618233618234, + "grad_norm": 0.6027318835258484, + "learning_rate": 0.0001736641319871901, + "loss": 1.3475, + "step": 5326 + }, + { + "epoch": 0.948539886039886, + "grad_norm": 0.5600738525390625, + "learning_rate": 0.00017365466500410132, + "loss": 1.0338, + "step": 5327 + }, + { + "epoch": 0.9487179487179487, + "grad_norm": 0.5691532492637634, + "learning_rate": 0.00017364519657790236, + "loss": 1.129, + "step": 5328 + }, + { + "epoch": 0.9488960113960114, + "grad_norm": 0.5161463022232056, + "learning_rate": 0.0001736357267087788, + "loss": 1.0438, + "step": 5329 + }, + { + "epoch": 0.9490740740740741, + "grad_norm": 0.5049656629562378, + "learning_rate": 0.0001736262553969161, + "loss": 0.9484, + "step": 5330 + }, + { + "epoch": 0.9492521367521367, + "grad_norm": 0.5477150678634644, + "learning_rate": 0.00017361678264249988, + "loss": 0.8995, + "step": 5331 + }, + { + "epoch": 0.9494301994301995, + "grad_norm": 0.5679608583450317, + "learning_rate": 0.0001736073084457157, + "loss": 1.241, + "step": 5332 + }, + { + "epoch": 0.9496082621082621, + "grad_norm": 0.5748196840286255, + "learning_rate": 0.00017359783280674926, + "loss": 1.0046, + "step": 5333 + }, + { + "epoch": 0.9497863247863247, + "grad_norm": 0.5677094459533691, + "learning_rate": 0.00017358835572578617, + "loss": 1.2913, + "step": 5334 + }, + { + "epoch": 0.9499643874643875, + "grad_norm": 0.49663659930229187, + "learning_rate": 0.0001735788772030121, + "loss": 1.0388, + "step": 5335 + }, + { + "epoch": 0.9501424501424501, + "grad_norm": 0.5687218904495239, + "learning_rate": 0.0001735693972386128, + "loss": 1.1631, + "step": 5336 + }, + { + "epoch": 0.9503205128205128, + "grad_norm": 0.520708441734314, + "learning_rate": 0.00017355991583277395, + 
"loss": 1.0744, + "step": 5337 + }, + { + "epoch": 0.9504985754985755, + "grad_norm": 0.5738952159881592, + "learning_rate": 0.00017355043298568137, + "loss": 1.318, + "step": 5338 + }, + { + "epoch": 0.9506766381766382, + "grad_norm": 0.5378455519676208, + "learning_rate": 0.00017354094869752085, + "loss": 0.9827, + "step": 5339 + }, + { + "epoch": 0.9508547008547008, + "grad_norm": 0.5047366619110107, + "learning_rate": 0.0001735314629684782, + "loss": 1.0966, + "step": 5340 + }, + { + "epoch": 0.9510327635327636, + "grad_norm": 0.5526043772697449, + "learning_rate": 0.0001735219757987393, + "loss": 1.059, + "step": 5341 + }, + { + "epoch": 0.9512108262108262, + "grad_norm": 0.5741400718688965, + "learning_rate": 0.00017351248718849003, + "loss": 1.1232, + "step": 5342 + }, + { + "epoch": 0.9513888888888888, + "grad_norm": 0.5421118140220642, + "learning_rate": 0.00017350299713791626, + "loss": 1.0427, + "step": 5343 + }, + { + "epoch": 0.9515669515669516, + "grad_norm": 0.4857081472873688, + "learning_rate": 0.00017349350564720392, + "loss": 0.8663, + "step": 5344 + }, + { + "epoch": 0.9517450142450142, + "grad_norm": 0.5411618947982788, + "learning_rate": 0.00017348401271653904, + "loss": 1.0317, + "step": 5345 + }, + { + "epoch": 0.9519230769230769, + "grad_norm": 0.5246246457099915, + "learning_rate": 0.00017347451834610756, + "loss": 1.0076, + "step": 5346 + }, + { + "epoch": 0.9521011396011396, + "grad_norm": 0.5278927683830261, + "learning_rate": 0.00017346502253609556, + "loss": 0.931, + "step": 5347 + }, + { + "epoch": 0.9522792022792023, + "grad_norm": 0.5934548377990723, + "learning_rate": 0.00017345552528668902, + "loss": 1.3205, + "step": 5348 + }, + { + "epoch": 0.9524572649572649, + "grad_norm": 0.5466100573539734, + "learning_rate": 0.00017344602659807406, + "loss": 0.8725, + "step": 5349 + }, + { + "epoch": 0.9526353276353277, + "grad_norm": 0.5220118761062622, + "learning_rate": 0.00017343652647043678, + "loss": 1.1642, + "step": 5350 + }, + { + 
"epoch": 0.9528133903133903, + "grad_norm": 0.6166301965713501, + "learning_rate": 0.0001734270249039633, + "loss": 0.8152, + "step": 5351 + }, + { + "epoch": 0.9529914529914529, + "grad_norm": 0.5173428058624268, + "learning_rate": 0.00017341752189883983, + "loss": 0.9296, + "step": 5352 + }, + { + "epoch": 0.9531695156695157, + "grad_norm": 0.5363461375236511, + "learning_rate": 0.0001734080174552525, + "loss": 1.3546, + "step": 5353 + }, + { + "epoch": 0.9533475783475783, + "grad_norm": 0.5333831906318665, + "learning_rate": 0.0001733985115733876, + "loss": 1.0401, + "step": 5354 + }, + { + "epoch": 0.9535256410256411, + "grad_norm": 0.5179334878921509, + "learning_rate": 0.00017338900425343132, + "loss": 1.1254, + "step": 5355 + }, + { + "epoch": 0.9537037037037037, + "grad_norm": 0.5171303153038025, + "learning_rate": 0.00017337949549556993, + "loss": 1.0518, + "step": 5356 + }, + { + "epoch": 0.9538817663817664, + "grad_norm": 0.5164596438407898, + "learning_rate": 0.00017336998529998978, + "loss": 0.8732, + "step": 5357 + }, + { + "epoch": 0.9540598290598291, + "grad_norm": 0.5555717349052429, + "learning_rate": 0.00017336047366687719, + "loss": 1.2312, + "step": 5358 + }, + { + "epoch": 0.9542378917378918, + "grad_norm": 0.45685622096061707, + "learning_rate": 0.00017335096059641847, + "loss": 0.8882, + "step": 5359 + }, + { + "epoch": 0.9544159544159544, + "grad_norm": 0.5260133743286133, + "learning_rate": 0.0001733414460888001, + "loss": 1.0952, + "step": 5360 + }, + { + "epoch": 0.9545940170940171, + "grad_norm": 0.4597703814506531, + "learning_rate": 0.0001733319301442084, + "loss": 1.0835, + "step": 5361 + }, + { + "epoch": 0.9547720797720798, + "grad_norm": 0.5279495120048523, + "learning_rate": 0.0001733224127628299, + "loss": 1.0295, + "step": 5362 + }, + { + "epoch": 0.9549501424501424, + "grad_norm": 0.48919400572776794, + "learning_rate": 0.00017331289394485104, + "loss": 0.9693, + "step": 5363 + }, + { + "epoch": 0.9551282051282052, + 
"grad_norm": 0.5639515519142151, + "learning_rate": 0.0001733033736904583, + "loss": 1.0893, + "step": 5364 + }, + { + "epoch": 0.9553062678062678, + "grad_norm": 0.49761319160461426, + "learning_rate": 0.00017329385199983823, + "loss": 1.038, + "step": 5365 + }, + { + "epoch": 0.9554843304843305, + "grad_norm": 0.5503305792808533, + "learning_rate": 0.0001732843288731774, + "loss": 0.9976, + "step": 5366 + }, + { + "epoch": 0.9556623931623932, + "grad_norm": 0.5633028745651245, + "learning_rate": 0.00017327480431066235, + "loss": 1.0602, + "step": 5367 + }, + { + "epoch": 0.9558404558404558, + "grad_norm": 0.48074454069137573, + "learning_rate": 0.00017326527831247973, + "loss": 1.0286, + "step": 5368 + }, + { + "epoch": 0.9560185185185185, + "grad_norm": 0.506597638130188, + "learning_rate": 0.0001732557508788162, + "loss": 0.9061, + "step": 5369 + }, + { + "epoch": 0.9561965811965812, + "grad_norm": 0.6570749282836914, + "learning_rate": 0.0001732462220098584, + "loss": 1.0852, + "step": 5370 + }, + { + "epoch": 0.9563746438746439, + "grad_norm": 0.5607653856277466, + "learning_rate": 0.00017323669170579302, + "loss": 1.0486, + "step": 5371 + }, + { + "epoch": 0.9565527065527065, + "grad_norm": 0.6047050356864929, + "learning_rate": 0.0001732271599668068, + "loss": 1.2175, + "step": 5372 + }, + { + "epoch": 0.9567307692307693, + "grad_norm": 0.5506869554519653, + "learning_rate": 0.00017321762679308651, + "loss": 1.0114, + "step": 5373 + }, + { + "epoch": 0.9569088319088319, + "grad_norm": 0.5868638157844543, + "learning_rate": 0.00017320809218481891, + "loss": 1.2983, + "step": 5374 + }, + { + "epoch": 0.9570868945868946, + "grad_norm": 0.539619505405426, + "learning_rate": 0.00017319855614219084, + "loss": 1.2361, + "step": 5375 + }, + { + "epoch": 0.9572649572649573, + "grad_norm": 0.5525495409965515, + "learning_rate": 0.0001731890186653891, + "loss": 1.1316, + "step": 5376 + }, + { + "epoch": 0.95744301994302, + "grad_norm": 0.5549767017364502, + 
"learning_rate": 0.0001731794797546006, + "loss": 1.0547, + "step": 5377 + }, + { + "epoch": 0.9576210826210826, + "grad_norm": 0.5356076955795288, + "learning_rate": 0.00017316993941001222, + "loss": 0.9942, + "step": 5378 + }, + { + "epoch": 0.9577991452991453, + "grad_norm": 0.5365784168243408, + "learning_rate": 0.00017316039763181084, + "loss": 1.226, + "step": 5379 + }, + { + "epoch": 0.957977207977208, + "grad_norm": 0.5190927386283875, + "learning_rate": 0.00017315085442018343, + "loss": 1.1704, + "step": 5380 + }, + { + "epoch": 0.9581552706552706, + "grad_norm": 0.526658833026886, + "learning_rate": 0.00017314130977531705, + "loss": 1.109, + "step": 5381 + }, + { + "epoch": 0.9583333333333334, + "grad_norm": 0.5373684763908386, + "learning_rate": 0.0001731317636973986, + "loss": 1.0018, + "step": 5382 + }, + { + "epoch": 0.958511396011396, + "grad_norm": 0.5714904069900513, + "learning_rate": 0.00017312221618661516, + "loss": 1.1855, + "step": 5383 + }, + { + "epoch": 0.9586894586894587, + "grad_norm": 0.5707863569259644, + "learning_rate": 0.00017311266724315377, + "loss": 0.9482, + "step": 5384 + }, + { + "epoch": 0.9588675213675214, + "grad_norm": 0.5856872797012329, + "learning_rate": 0.00017310311686720157, + "loss": 0.9543, + "step": 5385 + }, + { + "epoch": 0.959045584045584, + "grad_norm": 0.5041963458061218, + "learning_rate": 0.00017309356505894568, + "loss": 1.1427, + "step": 5386 + }, + { + "epoch": 0.9592236467236467, + "grad_norm": 0.5409179925918579, + "learning_rate": 0.00017308401181857316, + "loss": 0.8432, + "step": 5387 + }, + { + "epoch": 0.9594017094017094, + "grad_norm": 0.5248702764511108, + "learning_rate": 0.00017307445714627128, + "loss": 1.1403, + "step": 5388 + }, + { + "epoch": 0.9595797720797721, + "grad_norm": 0.50718092918396, + "learning_rate": 0.00017306490104222722, + "loss": 0.9066, + "step": 5389 + }, + { + "epoch": 0.9597578347578347, + "grad_norm": 0.5563821196556091, + "learning_rate": 0.0001730553435066282, + 
"loss": 1.0204, + "step": 5390 + }, + { + "epoch": 0.9599358974358975, + "grad_norm": 0.5696987509727478, + "learning_rate": 0.00017304578453966146, + "loss": 1.1405, + "step": 5391 + }, + { + "epoch": 0.9601139601139601, + "grad_norm": 0.5927395224571228, + "learning_rate": 0.00017303622414151435, + "loss": 1.0398, + "step": 5392 + }, + { + "epoch": 0.9602920227920227, + "grad_norm": 0.5375707745552063, + "learning_rate": 0.0001730266623123741, + "loss": 0.9519, + "step": 5393 + }, + { + "epoch": 0.9604700854700855, + "grad_norm": 0.457998126745224, + "learning_rate": 0.00017301709905242815, + "loss": 0.8743, + "step": 5394 + }, + { + "epoch": 0.9606481481481481, + "grad_norm": 0.5427796244621277, + "learning_rate": 0.00017300753436186382, + "loss": 1.078, + "step": 5395 + }, + { + "epoch": 0.9608262108262108, + "grad_norm": 0.5458595752716064, + "learning_rate": 0.0001729979682408685, + "loss": 1.1081, + "step": 5396 + }, + { + "epoch": 0.9610042735042735, + "grad_norm": 0.5495280027389526, + "learning_rate": 0.00017298840068962962, + "loss": 1.0141, + "step": 5397 + }, + { + "epoch": 0.9611823361823362, + "grad_norm": 0.5878560543060303, + "learning_rate": 0.00017297883170833465, + "loss": 1.302, + "step": 5398 + }, + { + "epoch": 0.9613603988603988, + "grad_norm": 0.5452881455421448, + "learning_rate": 0.00017296926129717108, + "loss": 0.9929, + "step": 5399 + }, + { + "epoch": 0.9615384615384616, + "grad_norm": 0.6021811366081238, + "learning_rate": 0.0001729596894563264, + "loss": 1.2629, + "step": 5400 + }, + { + "epoch": 0.9617165242165242, + "grad_norm": 0.5820204615592957, + "learning_rate": 0.0001729501161859882, + "loss": 1.0662, + "step": 5401 + }, + { + "epoch": 0.9618945868945868, + "grad_norm": 0.4953218102455139, + "learning_rate": 0.000172940541486344, + "loss": 1.047, + "step": 5402 + }, + { + "epoch": 0.9620726495726496, + "grad_norm": 0.5409793853759766, + "learning_rate": 0.00017293096535758143, + "loss": 1.1993, + "step": 5403 + }, + { + 
"epoch": 0.9622507122507122, + "grad_norm": 0.49702873826026917, + "learning_rate": 0.00017292138779988805, + "loss": 1.2471, + "step": 5404 + }, + { + "epoch": 0.9624287749287749, + "grad_norm": 0.5743489861488342, + "learning_rate": 0.00017291180881345158, + "loss": 1.0816, + "step": 5405 + }, + { + "epoch": 0.9626068376068376, + "grad_norm": 0.5747945308685303, + "learning_rate": 0.00017290222839845968, + "loss": 1.3548, + "step": 5406 + }, + { + "epoch": 0.9627849002849003, + "grad_norm": 0.5341345071792603, + "learning_rate": 0.00017289264655510005, + "loss": 1.0435, + "step": 5407 + }, + { + "epoch": 0.9629629629629629, + "grad_norm": 0.5719689130783081, + "learning_rate": 0.00017288306328356044, + "loss": 1.2319, + "step": 5408 + }, + { + "epoch": 0.9631410256410257, + "grad_norm": 0.4783279597759247, + "learning_rate": 0.0001728734785840286, + "loss": 0.9397, + "step": 5409 + }, + { + "epoch": 0.9633190883190883, + "grad_norm": 0.4730507731437683, + "learning_rate": 0.00017286389245669233, + "loss": 0.9384, + "step": 5410 + }, + { + "epoch": 0.9634971509971509, + "grad_norm": 0.5309939384460449, + "learning_rate": 0.00017285430490173944, + "loss": 1.098, + "step": 5411 + }, + { + "epoch": 0.9636752136752137, + "grad_norm": 0.5177853107452393, + "learning_rate": 0.0001728447159193578, + "loss": 1.2777, + "step": 5412 + }, + { + "epoch": 0.9638532763532763, + "grad_norm": 0.6437913775444031, + "learning_rate": 0.00017283512550973526, + "loss": 1.2661, + "step": 5413 + }, + { + "epoch": 0.9640313390313391, + "grad_norm": 0.6096072196960449, + "learning_rate": 0.00017282553367305975, + "loss": 0.9569, + "step": 5414 + }, + { + "epoch": 0.9642094017094017, + "grad_norm": 0.5104934573173523, + "learning_rate": 0.00017281594040951918, + "loss": 0.9666, + "step": 5415 + }, + { + "epoch": 0.9643874643874644, + "grad_norm": 0.6178240776062012, + "learning_rate": 0.00017280634571930153, + "loss": 1.1277, + "step": 5416 + }, + { + "epoch": 0.9645655270655271, + 
"grad_norm": 0.5749034881591797, + "learning_rate": 0.0001727967496025948, + "loss": 1.245, + "step": 5417 + }, + { + "epoch": 0.9647435897435898, + "grad_norm": 0.5036978721618652, + "learning_rate": 0.00017278715205958694, + "loss": 1.3049, + "step": 5418 + }, + { + "epoch": 0.9649216524216524, + "grad_norm": 0.5593041777610779, + "learning_rate": 0.00017277755309046605, + "loss": 1.2304, + "step": 5419 + }, + { + "epoch": 0.9650997150997151, + "grad_norm": 0.5446555614471436, + "learning_rate": 0.0001727679526954202, + "loss": 0.732, + "step": 5420 + }, + { + "epoch": 0.9652777777777778, + "grad_norm": 0.6063070297241211, + "learning_rate": 0.00017275835087463747, + "loss": 1.3723, + "step": 5421 + }, + { + "epoch": 0.9654558404558404, + "grad_norm": 0.4994211792945862, + "learning_rate": 0.00017274874762830602, + "loss": 1.0505, + "step": 5422 + }, + { + "epoch": 0.9656339031339032, + "grad_norm": 0.49396973848342896, + "learning_rate": 0.00017273914295661395, + "loss": 0.8691, + "step": 5423 + }, + { + "epoch": 0.9658119658119658, + "grad_norm": 0.5067027807235718, + "learning_rate": 0.0001727295368597495, + "loss": 0.9744, + "step": 5424 + }, + { + "epoch": 0.9659900284900285, + "grad_norm": 0.6720643043518066, + "learning_rate": 0.00017271992933790085, + "loss": 1.1513, + "step": 5425 + }, + { + "epoch": 0.9661680911680912, + "grad_norm": 0.5494341254234314, + "learning_rate": 0.00017271032039125624, + "loss": 0.8295, + "step": 5426 + }, + { + "epoch": 0.9663461538461539, + "grad_norm": 0.644332230091095, + "learning_rate": 0.00017270071002000394, + "loss": 1.0043, + "step": 5427 + }, + { + "epoch": 0.9665242165242165, + "grad_norm": 0.5658500790596008, + "learning_rate": 0.00017269109822433225, + "loss": 1.2575, + "step": 5428 + }, + { + "epoch": 0.9667022792022792, + "grad_norm": 0.5163155794143677, + "learning_rate": 0.00017268148500442952, + "loss": 1.1391, + "step": 5429 + }, + { + "epoch": 0.9668803418803419, + "grad_norm": 0.5113703608512878, + 
"learning_rate": 0.00017267187036048404, + "loss": 1.0819, + "step": 5430 + }, + { + "epoch": 0.9670584045584045, + "grad_norm": 0.6339422464370728, + "learning_rate": 0.00017266225429268426, + "loss": 1.0733, + "step": 5431 + }, + { + "epoch": 0.9672364672364673, + "grad_norm": 0.5158288478851318, + "learning_rate": 0.0001726526368012185, + "loss": 0.9518, + "step": 5432 + }, + { + "epoch": 0.9674145299145299, + "grad_norm": 0.593717634677887, + "learning_rate": 0.00017264301788627527, + "loss": 0.9416, + "step": 5433 + }, + { + "epoch": 0.9675925925925926, + "grad_norm": 0.49593186378479004, + "learning_rate": 0.00017263339754804301, + "loss": 1.0307, + "step": 5434 + }, + { + "epoch": 0.9677706552706553, + "grad_norm": 0.44032949209213257, + "learning_rate": 0.00017262377578671024, + "loss": 0.7884, + "step": 5435 + }, + { + "epoch": 0.967948717948718, + "grad_norm": 0.513073742389679, + "learning_rate": 0.00017261415260246538, + "loss": 0.9797, + "step": 5436 + }, + { + "epoch": 0.9681267806267806, + "grad_norm": 0.5737422108650208, + "learning_rate": 0.0001726045279954971, + "loss": 1.0487, + "step": 5437 + }, + { + "epoch": 0.9683048433048433, + "grad_norm": 0.5385867953300476, + "learning_rate": 0.0001725949019659939, + "loss": 1.4166, + "step": 5438 + }, + { + "epoch": 0.968482905982906, + "grad_norm": 0.5224326848983765, + "learning_rate": 0.00017258527451414438, + "loss": 1.195, + "step": 5439 + }, + { + "epoch": 0.9686609686609686, + "grad_norm": 0.5305148363113403, + "learning_rate": 0.0001725756456401372, + "loss": 1.0301, + "step": 5440 + }, + { + "epoch": 0.9688390313390314, + "grad_norm": 0.532588005065918, + "learning_rate": 0.000172566015344161, + "loss": 1.1269, + "step": 5441 + }, + { + "epoch": 0.969017094017094, + "grad_norm": 0.5812515020370483, + "learning_rate": 0.0001725563836264045, + "loss": 1.1787, + "step": 5442 + }, + { + "epoch": 0.9691951566951567, + "grad_norm": 0.4962109327316284, + "learning_rate": 0.00017254675048705638, + 
"loss": 1.0639, + "step": 5443 + }, + { + "epoch": 0.9693732193732194, + "grad_norm": 0.5094883441925049, + "learning_rate": 0.00017253711592630534, + "loss": 1.0922, + "step": 5444 + }, + { + "epoch": 0.969551282051282, + "grad_norm": 0.5728049874305725, + "learning_rate": 0.00017252747994434025, + "loss": 1.1237, + "step": 5445 + }, + { + "epoch": 0.9697293447293447, + "grad_norm": 0.5406180620193481, + "learning_rate": 0.00017251784254134983, + "loss": 1.1161, + "step": 5446 + }, + { + "epoch": 0.9699074074074074, + "grad_norm": 0.5724552869796753, + "learning_rate": 0.00017250820371752292, + "loss": 1.2205, + "step": 5447 + }, + { + "epoch": 0.9700854700854701, + "grad_norm": 0.5698846578598022, + "learning_rate": 0.0001724985634730484, + "loss": 1.1472, + "step": 5448 + }, + { + "epoch": 0.9702635327635327, + "grad_norm": 0.5315805673599243, + "learning_rate": 0.0001724889218081151, + "loss": 1.0253, + "step": 5449 + }, + { + "epoch": 0.9704415954415955, + "grad_norm": 0.5970377326011658, + "learning_rate": 0.000172479278722912, + "loss": 1.3033, + "step": 5450 + }, + { + "epoch": 0.9706196581196581, + "grad_norm": 0.6149488687515259, + "learning_rate": 0.00017246963421762798, + "loss": 1.0689, + "step": 5451 + }, + { + "epoch": 0.9707977207977208, + "grad_norm": 0.4848574995994568, + "learning_rate": 0.00017245998829245202, + "loss": 0.8829, + "step": 5452 + }, + { + "epoch": 0.9709757834757835, + "grad_norm": 0.6073294281959534, + "learning_rate": 0.00017245034094757312, + "loss": 1.2378, + "step": 5453 + }, + { + "epoch": 0.9711538461538461, + "grad_norm": 0.6362034678459167, + "learning_rate": 0.00017244069218318026, + "loss": 1.3606, + "step": 5454 + }, + { + "epoch": 0.9713319088319088, + "grad_norm": 0.5353880524635315, + "learning_rate": 0.00017243104199946257, + "loss": 1.1288, + "step": 5455 + }, + { + "epoch": 0.9715099715099715, + "grad_norm": 0.5096352100372314, + "learning_rate": 0.00017242139039660902, + "loss": 1.0056, + "step": 5456 + }, + { + 
"epoch": 0.9716880341880342, + "grad_norm": 0.5086682438850403, + "learning_rate": 0.00017241173737480884, + "loss": 1.091, + "step": 5457 + }, + { + "epoch": 0.9718660968660968, + "grad_norm": 0.5034295320510864, + "learning_rate": 0.000172402082934251, + "loss": 0.9749, + "step": 5458 + }, + { + "epoch": 0.9720441595441596, + "grad_norm": 0.5205379724502563, + "learning_rate": 0.0001723924270751248, + "loss": 1.1068, + "step": 5459 + }, + { + "epoch": 0.9722222222222222, + "grad_norm": 0.5904826521873474, + "learning_rate": 0.00017238276979761937, + "loss": 1.0613, + "step": 5460 + }, + { + "epoch": 0.9724002849002849, + "grad_norm": 0.6415045261383057, + "learning_rate": 0.0001723731111019239, + "loss": 1.2126, + "step": 5461 + }, + { + "epoch": 0.9725783475783476, + "grad_norm": 0.5769147872924805, + "learning_rate": 0.0001723634509882277, + "loss": 1.337, + "step": 5462 + }, + { + "epoch": 0.9727564102564102, + "grad_norm": 0.5585111975669861, + "learning_rate": 0.00017235378945671998, + "loss": 1.3922, + "step": 5463 + }, + { + "epoch": 0.9729344729344729, + "grad_norm": 0.5788411498069763, + "learning_rate": 0.00017234412650759008, + "loss": 0.8532, + "step": 5464 + }, + { + "epoch": 0.9731125356125356, + "grad_norm": 0.5617673397064209, + "learning_rate": 0.00017233446214102728, + "loss": 1.2575, + "step": 5465 + }, + { + "epoch": 0.9732905982905983, + "grad_norm": 0.4227815568447113, + "learning_rate": 0.00017232479635722093, + "loss": 1.0618, + "step": 5466 + }, + { + "epoch": 0.9734686609686609, + "grad_norm": 0.49751797318458557, + "learning_rate": 0.00017231512915636047, + "loss": 0.7714, + "step": 5467 + }, + { + "epoch": 0.9736467236467237, + "grad_norm": 0.5983800292015076, + "learning_rate": 0.0001723054605386353, + "loss": 1.2297, + "step": 5468 + }, + { + "epoch": 0.9738247863247863, + "grad_norm": 0.543394923210144, + "learning_rate": 0.0001722957905042348, + "loss": 1.0078, + "step": 5469 + }, + { + "epoch": 0.9740028490028491, + "grad_norm": 
0.5633566975593567, + "learning_rate": 0.00017228611905334846, + "loss": 1.0938, + "step": 5470 + }, + { + "epoch": 0.9741809116809117, + "grad_norm": 0.49377235770225525, + "learning_rate": 0.00017227644618616578, + "loss": 1.096, + "step": 5471 + }, + { + "epoch": 0.9743589743589743, + "grad_norm": 0.4963362216949463, + "learning_rate": 0.00017226677190287627, + "loss": 1.0003, + "step": 5472 + }, + { + "epoch": 0.9745370370370371, + "grad_norm": 0.4483006000518799, + "learning_rate": 0.00017225709620366953, + "loss": 0.8623, + "step": 5473 + }, + { + "epoch": 0.9747150997150997, + "grad_norm": 0.5429352521896362, + "learning_rate": 0.00017224741908873506, + "loss": 1.1383, + "step": 5474 + }, + { + "epoch": 0.9748931623931624, + "grad_norm": 0.5871657729148865, + "learning_rate": 0.0001722377405582625, + "loss": 1.2005, + "step": 5475 + }, + { + "epoch": 0.9750712250712251, + "grad_norm": 0.6002383828163147, + "learning_rate": 0.0001722280606124415, + "loss": 1.0696, + "step": 5476 + }, + { + "epoch": 0.9752492877492878, + "grad_norm": 0.5351617336273193, + "learning_rate": 0.00017221837925146164, + "loss": 1.243, + "step": 5477 + }, + { + "epoch": 0.9754273504273504, + "grad_norm": 0.46613118052482605, + "learning_rate": 0.00017220869647551268, + "loss": 1.0344, + "step": 5478 + }, + { + "epoch": 0.9756054131054132, + "grad_norm": 0.6015593409538269, + "learning_rate": 0.00017219901228478432, + "loss": 1.082, + "step": 5479 + }, + { + "epoch": 0.9757834757834758, + "grad_norm": 0.5829521417617798, + "learning_rate": 0.0001721893266794663, + "loss": 0.8683, + "step": 5480 + }, + { + "epoch": 0.9759615384615384, + "grad_norm": 0.6344960927963257, + "learning_rate": 0.00017217963965974838, + "loss": 1.1048, + "step": 5481 + }, + { + "epoch": 0.9761396011396012, + "grad_norm": 0.5586308240890503, + "learning_rate": 0.00017216995122582034, + "loss": 0.9657, + "step": 5482 + }, + { + "epoch": 0.9763176638176638, + "grad_norm": 0.48625239729881287, + "learning_rate": 
0.00017216026137787204, + "loss": 1.1026, + "step": 5483 + }, + { + "epoch": 0.9764957264957265, + "grad_norm": 0.5625223517417908, + "learning_rate": 0.00017215057011609332, + "loss": 1.1579, + "step": 5484 + }, + { + "epoch": 0.9766737891737892, + "grad_norm": 0.6016653776168823, + "learning_rate": 0.0001721408774406741, + "loss": 1.1777, + "step": 5485 + }, + { + "epoch": 0.9768518518518519, + "grad_norm": 0.5444921851158142, + "learning_rate": 0.00017213118335180418, + "loss": 1.119, + "step": 5486 + }, + { + "epoch": 0.9770299145299145, + "grad_norm": 0.5574755668640137, + "learning_rate": 0.0001721214878496736, + "loss": 1.1128, + "step": 5487 + }, + { + "epoch": 0.9772079772079773, + "grad_norm": 0.5486113429069519, + "learning_rate": 0.00017211179093447226, + "loss": 1.1673, + "step": 5488 + }, + { + "epoch": 0.9773860398860399, + "grad_norm": 0.5545483231544495, + "learning_rate": 0.00017210209260639018, + "loss": 1.1748, + "step": 5489 + }, + { + "epoch": 0.9775641025641025, + "grad_norm": 0.5756667256355286, + "learning_rate": 0.0001720923928656174, + "loss": 1.2377, + "step": 5490 + }, + { + "epoch": 0.9777421652421653, + "grad_norm": 0.5744972229003906, + "learning_rate": 0.00017208269171234392, + "loss": 1.1242, + "step": 5491 + }, + { + "epoch": 0.9779202279202279, + "grad_norm": 0.6109468340873718, + "learning_rate": 0.00017207298914675984, + "loss": 1.1948, + "step": 5492 + }, + { + "epoch": 0.9780982905982906, + "grad_norm": 0.5195167660713196, + "learning_rate": 0.00017206328516905525, + "loss": 1.0941, + "step": 5493 + }, + { + "epoch": 0.9782763532763533, + "grad_norm": 0.5549042224884033, + "learning_rate": 0.0001720535797794203, + "loss": 1.1503, + "step": 5494 + }, + { + "epoch": 0.978454415954416, + "grad_norm": 0.6317743062973022, + "learning_rate": 0.0001720438729780451, + "loss": 1.3468, + "step": 5495 + }, + { + "epoch": 0.9786324786324786, + "grad_norm": 0.5932528972625732, + "learning_rate": 0.0001720341647651199, + "loss": 1.105, + 
"step": 5496 + }, + { + "epoch": 0.9788105413105413, + "grad_norm": 0.607880175113678, + "learning_rate": 0.00017202445514083488, + "loss": 1.1465, + "step": 5497 + }, + { + "epoch": 0.978988603988604, + "grad_norm": 0.49227309226989746, + "learning_rate": 0.00017201474410538027, + "loss": 0.9075, + "step": 5498 + }, + { + "epoch": 0.9791666666666666, + "grad_norm": 0.5059443116188049, + "learning_rate": 0.00017200503165894636, + "loss": 1.0483, + "step": 5499 + }, + { + "epoch": 0.9793447293447294, + "grad_norm": 0.5792799592018127, + "learning_rate": 0.0001719953178017234, + "loss": 1.0987, + "step": 5500 + }, + { + "epoch": 0.979522792022792, + "grad_norm": 0.5010457038879395, + "learning_rate": 0.00017198560253390177, + "loss": 1.1051, + "step": 5501 + }, + { + "epoch": 0.9797008547008547, + "grad_norm": 0.5866543054580688, + "learning_rate": 0.0001719758858556718, + "loss": 1.2824, + "step": 5502 + }, + { + "epoch": 0.9798789173789174, + "grad_norm": 0.5392137169837952, + "learning_rate": 0.00017196616776722382, + "loss": 0.886, + "step": 5503 + }, + { + "epoch": 0.98005698005698, + "grad_norm": 0.5200899839401245, + "learning_rate": 0.00017195644826874834, + "loss": 1.1504, + "step": 5504 + }, + { + "epoch": 0.9802350427350427, + "grad_norm": 0.533159077167511, + "learning_rate": 0.00017194672736043569, + "loss": 1.1216, + "step": 5505 + }, + { + "epoch": 0.9804131054131054, + "grad_norm": 0.5543524622917175, + "learning_rate": 0.0001719370050424764, + "loss": 1.0161, + "step": 5506 + }, + { + "epoch": 0.9805911680911681, + "grad_norm": 0.5315365195274353, + "learning_rate": 0.00017192728131506092, + "loss": 1.0509, + "step": 5507 + }, + { + "epoch": 0.9807692307692307, + "grad_norm": 0.5406147837638855, + "learning_rate": 0.00017191755617837977, + "loss": 1.0695, + "step": 5508 + }, + { + "epoch": 0.9809472934472935, + "grad_norm": 0.4563386142253876, + "learning_rate": 0.00017190782963262354, + "loss": 0.995, + "step": 5509 + }, + { + "epoch": 
0.9811253561253561, + "grad_norm": 0.5456405282020569, + "learning_rate": 0.00017189810167798274, + "loss": 1.0546, + "step": 5510 + }, + { + "epoch": 0.9813034188034188, + "grad_norm": 0.6275575160980225, + "learning_rate": 0.00017188837231464795, + "loss": 1.0432, + "step": 5511 + }, + { + "epoch": 0.9814814814814815, + "grad_norm": 0.49735602736473083, + "learning_rate": 0.0001718786415428099, + "loss": 1.035, + "step": 5512 + }, + { + "epoch": 0.9816595441595442, + "grad_norm": 0.5234259963035583, + "learning_rate": 0.00017186890936265916, + "loss": 1.0918, + "step": 5513 + }, + { + "epoch": 0.9818376068376068, + "grad_norm": 0.5091170072555542, + "learning_rate": 0.00017185917577438643, + "loss": 1.0239, + "step": 5514 + }, + { + "epoch": 0.9820156695156695, + "grad_norm": 0.6155703067779541, + "learning_rate": 0.00017184944077818244, + "loss": 1.2366, + "step": 5515 + }, + { + "epoch": 0.9821937321937322, + "grad_norm": 0.5074070692062378, + "learning_rate": 0.0001718397043742379, + "loss": 1.0318, + "step": 5516 + }, + { + "epoch": 0.9823717948717948, + "grad_norm": 0.5234423279762268, + "learning_rate": 0.0001718299665627436, + "loss": 1.0322, + "step": 5517 + }, + { + "epoch": 0.9825498575498576, + "grad_norm": 0.5783474445343018, + "learning_rate": 0.0001718202273438903, + "loss": 0.9486, + "step": 5518 + }, + { + "epoch": 0.9827279202279202, + "grad_norm": 0.5708683133125305, + "learning_rate": 0.00017181048671786886, + "loss": 1.0785, + "step": 5519 + }, + { + "epoch": 0.9829059829059829, + "grad_norm": 0.5985961556434631, + "learning_rate": 0.00017180074468487009, + "loss": 1.198, + "step": 5520 + }, + { + "epoch": 0.9830840455840456, + "grad_norm": 0.5711352229118347, + "learning_rate": 0.0001717910012450849, + "loss": 1.0386, + "step": 5521 + }, + { + "epoch": 0.9832621082621082, + "grad_norm": 0.5338063836097717, + "learning_rate": 0.00017178125639870416, + "loss": 1.1594, + "step": 5522 + }, + { + "epoch": 0.9834401709401709, + "grad_norm": 
0.6144943237304688, + "learning_rate": 0.00017177151014591881, + "loss": 1.1083, + "step": 5523 + }, + { + "epoch": 0.9836182336182336, + "grad_norm": 0.547285795211792, + "learning_rate": 0.00017176176248691983, + "loss": 1.1507, + "step": 5524 + }, + { + "epoch": 0.9837962962962963, + "grad_norm": 0.5807644724845886, + "learning_rate": 0.00017175201342189817, + "loss": 1.3044, + "step": 5525 + }, + { + "epoch": 0.9839743589743589, + "grad_norm": 0.5229477882385254, + "learning_rate": 0.00017174226295104485, + "loss": 1.2622, + "step": 5526 + }, + { + "epoch": 0.9841524216524217, + "grad_norm": 0.6100695133209229, + "learning_rate": 0.00017173251107455094, + "loss": 1.2026, + "step": 5527 + }, + { + "epoch": 0.9843304843304843, + "grad_norm": 0.5410884618759155, + "learning_rate": 0.00017172275779260744, + "loss": 1.2964, + "step": 5528 + }, + { + "epoch": 0.9845085470085471, + "grad_norm": 0.5937406420707703, + "learning_rate": 0.00017171300310540554, + "loss": 1.1435, + "step": 5529 + }, + { + "epoch": 0.9846866096866097, + "grad_norm": 0.56817227602005, + "learning_rate": 0.00017170324701313634, + "loss": 1.0099, + "step": 5530 + }, + { + "epoch": 0.9848646723646723, + "grad_norm": 0.5776323080062866, + "learning_rate": 0.00017169348951599092, + "loss": 1.3539, + "step": 5531 + }, + { + "epoch": 0.9850427350427351, + "grad_norm": 0.5208535194396973, + "learning_rate": 0.0001716837306141605, + "loss": 1.2306, + "step": 5532 + }, + { + "epoch": 0.9852207977207977, + "grad_norm": 0.552173376083374, + "learning_rate": 0.0001716739703078363, + "loss": 1.0551, + "step": 5533 + }, + { + "epoch": 0.9853988603988604, + "grad_norm": 0.5327515602111816, + "learning_rate": 0.00017166420859720955, + "loss": 1.2443, + "step": 5534 + }, + { + "epoch": 0.9855769230769231, + "grad_norm": 0.5255244374275208, + "learning_rate": 0.0001716544454824715, + "loss": 1.005, + "step": 5535 + }, + { + "epoch": 0.9857549857549858, + "grad_norm": 0.4753847122192383, + "learning_rate": 
0.00017164468096381343, + "loss": 1.0081, + "step": 5536 + }, + { + "epoch": 0.9859330484330484, + "grad_norm": 0.5261829495429993, + "learning_rate": 0.00017163491504142665, + "loss": 1.2249, + "step": 5537 + }, + { + "epoch": 0.9861111111111112, + "grad_norm": 0.46499499678611755, + "learning_rate": 0.00017162514771550255, + "loss": 0.8759, + "step": 5538 + }, + { + "epoch": 0.9862891737891738, + "grad_norm": 0.5233004689216614, + "learning_rate": 0.00017161537898623247, + "loss": 1.0474, + "step": 5539 + }, + { + "epoch": 0.9864672364672364, + "grad_norm": 0.46905553340911865, + "learning_rate": 0.00017160560885380778, + "loss": 0.9033, + "step": 5540 + }, + { + "epoch": 0.9866452991452992, + "grad_norm": 0.5816231369972229, + "learning_rate": 0.00017159583731841998, + "loss": 1.0628, + "step": 5541 + }, + { + "epoch": 0.9868233618233618, + "grad_norm": 0.4575413167476654, + "learning_rate": 0.00017158606438026045, + "loss": 1.0446, + "step": 5542 + }, + { + "epoch": 0.9870014245014245, + "grad_norm": 0.5968109965324402, + "learning_rate": 0.00017157629003952067, + "loss": 1.032, + "step": 5543 + }, + { + "epoch": 0.9871794871794872, + "grad_norm": 0.5316148400306702, + "learning_rate": 0.00017156651429639218, + "loss": 0.9167, + "step": 5544 + }, + { + "epoch": 0.9873575498575499, + "grad_norm": 0.5185125470161438, + "learning_rate": 0.00017155673715106651, + "loss": 1.1527, + "step": 5545 + }, + { + "epoch": 0.9875356125356125, + "grad_norm": 0.5167772769927979, + "learning_rate": 0.00017154695860373525, + "loss": 0.9954, + "step": 5546 + }, + { + "epoch": 0.9877136752136753, + "grad_norm": 0.6406680345535278, + "learning_rate": 0.00017153717865458994, + "loss": 1.2758, + "step": 5547 + }, + { + "epoch": 0.9878917378917379, + "grad_norm": 0.5223956108093262, + "learning_rate": 0.00017152739730382223, + "loss": 1.1526, + "step": 5548 + }, + { + "epoch": 0.9880698005698005, + "grad_norm": 0.6131790280342102, + "learning_rate": 0.00017151761455162375, + "loss": 
1.1024, + "step": 5549 + }, + { + "epoch": 0.9882478632478633, + "grad_norm": 0.5574753880500793, + "learning_rate": 0.00017150783039818616, + "loss": 0.9733, + "step": 5550 + }, + { + "epoch": 0.9884259259259259, + "grad_norm": 0.5417882800102234, + "learning_rate": 0.0001714980448437012, + "loss": 1.2244, + "step": 5551 + }, + { + "epoch": 0.9886039886039886, + "grad_norm": 0.6217474341392517, + "learning_rate": 0.0001714882578883606, + "loss": 0.9224, + "step": 5552 + }, + { + "epoch": 0.9887820512820513, + "grad_norm": 0.5846285223960876, + "learning_rate": 0.00017147846953235606, + "loss": 1.2429, + "step": 5553 + }, + { + "epoch": 0.988960113960114, + "grad_norm": 0.5924782752990723, + "learning_rate": 0.00017146867977587936, + "loss": 0.9907, + "step": 5554 + }, + { + "epoch": 0.9891381766381766, + "grad_norm": 0.5756853818893433, + "learning_rate": 0.00017145888861912242, + "loss": 1.1266, + "step": 5555 + }, + { + "epoch": 0.9893162393162394, + "grad_norm": 0.5277376770973206, + "learning_rate": 0.00017144909606227693, + "loss": 1.1676, + "step": 5556 + }, + { + "epoch": 0.989494301994302, + "grad_norm": 0.5138902068138123, + "learning_rate": 0.00017143930210553485, + "loss": 0.9864, + "step": 5557 + }, + { + "epoch": 0.9896723646723646, + "grad_norm": 0.8072507977485657, + "learning_rate": 0.00017142950674908805, + "loss": 1.111, + "step": 5558 + }, + { + "epoch": 0.9898504273504274, + "grad_norm": 0.5641721487045288, + "learning_rate": 0.00017141970999312844, + "loss": 0.9106, + "step": 5559 + }, + { + "epoch": 0.99002849002849, + "grad_norm": 0.5260798931121826, + "learning_rate": 0.000171409911837848, + "loss": 1.1609, + "step": 5560 + }, + { + "epoch": 0.9902065527065527, + "grad_norm": 0.5398530960083008, + "learning_rate": 0.00017140011228343864, + "loss": 1.0368, + "step": 5561 + }, + { + "epoch": 0.9903846153846154, + "grad_norm": 0.6011313199996948, + "learning_rate": 0.00017139031133009245, + "loss": 1.1314, + "step": 5562 + }, + { + "epoch": 
0.9905626780626781, + "grad_norm": 0.6194971203804016, + "learning_rate": 0.00017138050897800135, + "loss": 1.3493, + "step": 5563 + }, + { + "epoch": 0.9907407407407407, + "grad_norm": 0.5779356956481934, + "learning_rate": 0.0001713707052273575, + "loss": 0.943, + "step": 5564 + }, + { + "epoch": 0.9909188034188035, + "grad_norm": 0.5321127772331238, + "learning_rate": 0.00017136090007835293, + "loss": 0.7914, + "step": 5565 + }, + { + "epoch": 0.9910968660968661, + "grad_norm": 0.5470426678657532, + "learning_rate": 0.00017135109353117977, + "loss": 1.2113, + "step": 5566 + }, + { + "epoch": 0.9912749287749287, + "grad_norm": 0.5551436543464661, + "learning_rate": 0.00017134128558603012, + "loss": 0.8932, + "step": 5567 + }, + { + "epoch": 0.9914529914529915, + "grad_norm": 0.45770928263664246, + "learning_rate": 0.0001713314762430962, + "loss": 1.0061, + "step": 5568 + }, + { + "epoch": 0.9916310541310541, + "grad_norm": 0.5578967332839966, + "learning_rate": 0.00017132166550257017, + "loss": 1.148, + "step": 5569 + }, + { + "epoch": 0.9918091168091168, + "grad_norm": 0.5086452960968018, + "learning_rate": 0.0001713118533646443, + "loss": 0.9803, + "step": 5570 + }, + { + "epoch": 0.9919871794871795, + "grad_norm": 0.4714745879173279, + "learning_rate": 0.00017130203982951078, + "loss": 1.0176, + "step": 5571 + }, + { + "epoch": 0.9921652421652422, + "grad_norm": 0.6254406571388245, + "learning_rate": 0.0001712922248973619, + "loss": 1.0932, + "step": 5572 + }, + { + "epoch": 0.9923433048433048, + "grad_norm": 0.5005003809928894, + "learning_rate": 0.00017128240856838998, + "loss": 1.0783, + "step": 5573 + }, + { + "epoch": 0.9925213675213675, + "grad_norm": 0.5668206214904785, + "learning_rate": 0.00017127259084278733, + "loss": 1.0404, + "step": 5574 + }, + { + "epoch": 0.9926994301994302, + "grad_norm": 0.4976036250591278, + "learning_rate": 0.00017126277172074632, + "loss": 1.1437, + "step": 5575 + }, + { + "epoch": 0.9928774928774928, + "grad_norm": 
0.567546546459198, + "learning_rate": 0.00017125295120245935, + "loss": 1.2188, + "step": 5576 + }, + { + "epoch": 0.9930555555555556, + "grad_norm": 0.5614372491836548, + "learning_rate": 0.0001712431292881188, + "loss": 0.9187, + "step": 5577 + }, + { + "epoch": 0.9932336182336182, + "grad_norm": 0.6117973327636719, + "learning_rate": 0.00017123330597791712, + "loss": 1.1285, + "step": 5578 + }, + { + "epoch": 0.9934116809116809, + "grad_norm": 0.6000342965126038, + "learning_rate": 0.00017122348127204676, + "loss": 0.9837, + "step": 5579 + }, + { + "epoch": 0.9935897435897436, + "grad_norm": 0.5453050136566162, + "learning_rate": 0.0001712136551707003, + "loss": 0.8771, + "step": 5580 + }, + { + "epoch": 0.9937678062678063, + "grad_norm": 0.49603891372680664, + "learning_rate": 0.00017120382767407018, + "loss": 1.0754, + "step": 5581 + }, + { + "epoch": 0.9939458689458689, + "grad_norm": 0.48031488060951233, + "learning_rate": 0.00017119399878234894, + "loss": 0.6933, + "step": 5582 + }, + { + "epoch": 0.9941239316239316, + "grad_norm": 0.6048742532730103, + "learning_rate": 0.0001711841684957292, + "loss": 0.9696, + "step": 5583 + }, + { + "epoch": 0.9943019943019943, + "grad_norm": 0.5183123350143433, + "learning_rate": 0.00017117433681440355, + "loss": 1.1313, + "step": 5584 + }, + { + "epoch": 0.9944800569800569, + "grad_norm": 0.504916250705719, + "learning_rate": 0.00017116450373856466, + "loss": 1.0273, + "step": 5585 + }, + { + "epoch": 0.9946581196581197, + "grad_norm": 0.5804886817932129, + "learning_rate": 0.0001711546692684051, + "loss": 1.1162, + "step": 5586 + }, + { + "epoch": 0.9948361823361823, + "grad_norm": 0.5531938672065735, + "learning_rate": 0.0001711448334041176, + "loss": 1.2893, + "step": 5587 + }, + { + "epoch": 0.9950142450142451, + "grad_norm": 0.5079928636550903, + "learning_rate": 0.00017113499614589492, + "loss": 1.0393, + "step": 5588 + }, + { + "epoch": 0.9951923076923077, + "grad_norm": 0.5421964526176453, + "learning_rate": 
0.00017112515749392973, + "loss": 0.8844, + "step": 5589 + }, + { + "epoch": 0.9953703703703703, + "grad_norm": 0.4834558367729187, + "learning_rate": 0.00017111531744841486, + "loss": 1.0187, + "step": 5590 + }, + { + "epoch": 0.9955484330484331, + "grad_norm": 0.6704340577125549, + "learning_rate": 0.00017110547600954307, + "loss": 0.8524, + "step": 5591 + }, + { + "epoch": 0.9957264957264957, + "grad_norm": 0.4578927159309387, + "learning_rate": 0.00017109563317750718, + "loss": 1.059, + "step": 5592 + }, + { + "epoch": 0.9959045584045584, + "grad_norm": 0.5563494563102722, + "learning_rate": 0.00017108578895250006, + "loss": 1.1211, + "step": 5593 + }, + { + "epoch": 0.9960826210826211, + "grad_norm": 0.5272170901298523, + "learning_rate": 0.00017107594333471454, + "loss": 0.9224, + "step": 5594 + }, + { + "epoch": 0.9962606837606838, + "grad_norm": 0.5697501301765442, + "learning_rate": 0.00017106609632434357, + "loss": 1.2223, + "step": 5595 + }, + { + "epoch": 0.9964387464387464, + "grad_norm": 0.5385653376579285, + "learning_rate": 0.00017105624792158007, + "loss": 1.0809, + "step": 5596 + }, + { + "epoch": 0.9966168091168092, + "grad_norm": 0.5608006119728088, + "learning_rate": 0.000171046398126617, + "loss": 1.3936, + "step": 5597 + }, + { + "epoch": 0.9967948717948718, + "grad_norm": 0.5063132643699646, + "learning_rate": 0.00017103654693964736, + "loss": 1.2086, + "step": 5598 + }, + { + "epoch": 0.9969729344729344, + "grad_norm": 0.6014235019683838, + "learning_rate": 0.00017102669436086415, + "loss": 1.1231, + "step": 5599 + }, + { + "epoch": 0.9971509971509972, + "grad_norm": 0.49549567699432373, + "learning_rate": 0.00017101684039046036, + "loss": 1.0013, + "step": 5600 + }, + { + "epoch": 0.9973290598290598, + "grad_norm": 0.517464816570282, + "learning_rate": 0.00017100698502862916, + "loss": 1.1143, + "step": 5601 + }, + { + "epoch": 0.9975071225071225, + "grad_norm": 0.514281153678894, + "learning_rate": 0.00017099712827556358, + "loss": 
1.0336, + "step": 5602 + }, + { + "epoch": 0.9976851851851852, + "grad_norm": 0.5378567576408386, + "learning_rate": 0.00017098727013145672, + "loss": 0.8278, + "step": 5603 + }, + { + "epoch": 0.9978632478632479, + "grad_norm": 0.5098404884338379, + "learning_rate": 0.0001709774105965018, + "loss": 0.9902, + "step": 5604 + }, + { + "epoch": 0.9980413105413105, + "grad_norm": 0.6231759190559387, + "learning_rate": 0.00017096754967089198, + "loss": 1.0564, + "step": 5605 + }, + { + "epoch": 0.9982193732193733, + "grad_norm": 0.47434380650520325, + "learning_rate": 0.00017095768735482042, + "loss": 0.7457, + "step": 5606 + }, + { + "epoch": 0.9983974358974359, + "grad_norm": 0.5771013498306274, + "learning_rate": 0.00017094782364848035, + "loss": 1.1191, + "step": 5607 + }, + { + "epoch": 0.9985754985754985, + "grad_norm": 0.5617234706878662, + "learning_rate": 0.00017093795855206508, + "loss": 1.0779, + "step": 5608 + }, + { + "epoch": 0.9987535612535613, + "grad_norm": 0.6573554873466492, + "learning_rate": 0.00017092809206576792, + "loss": 1.0191, + "step": 5609 + }, + { + "epoch": 0.9989316239316239, + "grad_norm": 0.482834130525589, + "learning_rate": 0.00017091822418978207, + "loss": 1.0119, + "step": 5610 + }, + { + "epoch": 0.9991096866096866, + "grad_norm": 0.47496405243873596, + "learning_rate": 0.000170908354924301, + "loss": 0.8297, + "step": 5611 + }, + { + "epoch": 0.9992877492877493, + "grad_norm": 0.5013265013694763, + "learning_rate": 0.00017089848426951796, + "loss": 1.1511, + "step": 5612 + }, + { + "epoch": 0.999465811965812, + "grad_norm": 0.5402522683143616, + "learning_rate": 0.00017088861222562643, + "loss": 1.1401, + "step": 5613 + }, + { + "epoch": 0.9996438746438746, + "grad_norm": 0.546302318572998, + "learning_rate": 0.00017087873879281977, + "loss": 0.8611, + "step": 5614 + }, + { + "epoch": 0.9998219373219374, + "grad_norm": 0.44279807806015015, + "learning_rate": 0.0001708688639712915, + "loss": 0.79, + "step": 5615 + }, + { + "epoch": 
1.0, + "grad_norm": 0.5514659285545349, + "learning_rate": 0.00017085898776123502, + "loss": 1.0709, + "step": 5616 + }, + { + "epoch": 1.0, + "eval_loss": 1.093075156211853, + "eval_runtime": 24.6155, + "eval_samples_per_second": 42.29, + "eval_steps_per_second": 21.166, + "step": 5616 + } + ], + "logging_steps": 1, + "max_steps": 22464, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 5616, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.1681189773991936e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-5616/training_args.bin b/checkpoint-5616/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..1245f6a2afbe9a6eefbb6d141231d555e0b0bf84 --- /dev/null +++ b/checkpoint-5616/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86de370014ed2be86ea27c820b434ceec5e097da2b5f9b08d0eac9aa564d8961 +size 6200 diff --git a/config.json b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..0c19ab480f00c0f7d7b5b9100fc78008e88f98aa --- /dev/null +++ b/config.json @@ -0,0 +1,47 @@ +{ + "_attn_implementation_autoset": true, + "_name_or_path": "openlm-research/open_llama_3b_v2", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 1, + "eos_token_id": 2, + "head_dim": 100, + "hidden_act": "silu", + "hidden_size": 3200, + "initializer_range": 0.02, + "intermediate_size": 8640, + "max_position_embeddings": 2048, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 26, + "num_key_value_heads": 32, + "pad_token_id": 0, + "pretraining_tp": 1, + "quantization_config": { + "_load_in_4bit": false, + 
"_load_in_8bit": true, + "bnb_4bit_compute_dtype": "float32", + "bnb_4bit_quant_storage": "uint8", + "bnb_4bit_quant_type": "fp4", + "bnb_4bit_use_double_quant": false, + "llm_int8_enable_fp32_cpu_offload": false, + "llm_int8_has_fp16_weight": false, + "llm_int8_skip_modules": null, + "llm_int8_threshold": 6.0, + "load_in_4bit": false, + "load_in_8bit": true, + "quant_method": "bitsandbytes" + }, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "tie_word_embeddings": false, + "torch_dtype": "float16", + "transformers_version": "4.46.1", + "use_cache": false, + "vocab_size": 32000 +} diff --git a/merged/config.json b/merged/config.json new file mode 100644 index 0000000000000000000000000000000000000000..de1fd07914c59e1f3619f6a02991d58ed3d50a5f --- /dev/null +++ b/merged/config.json @@ -0,0 +1,31 @@ +{ + "_name_or_path": "openlm-research/open_llama_3b_v2", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 1, + "eos_token_id": 2, + "head_dim": 100, + "hidden_act": "silu", + "hidden_size": 3200, + "initializer_range": 0.02, + "intermediate_size": 8640, + "max_position_embeddings": 2048, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 26, + "num_key_value_heads": 32, + "pad_token_id": 0, + "pretraining_tp": 1, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "tie_word_embeddings": false, + "torch_dtype": "float16", + "transformers_version": "4.46.1", + "use_cache": false, + "vocab_size": 32000 +} diff --git a/merged/generation_config.json b/merged/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..08ba3591b4ff1d0b847f1f5692758b0eefdd40c8 --- /dev/null +++ b/merged/generation_config.json @@ -0,0 +1,8 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": 2, + "pad_token_id": 0, + "transformers_version": "4.46.1" +} diff 
--git a/merged/pytorch_model-00001-of-00002.bin b/merged/pytorch_model-00001-of-00002.bin new file mode 100644 index 0000000000000000000000000000000000000000..261a38874c2f41af6006bc7f862381f37afddfbe --- /dev/null +++ b/merged/pytorch_model-00001-of-00002.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8c49a856649e62748a4ddf071cc144f832bd35945c16204dd76e4c590fdfa76 +size 4995377838 diff --git a/merged/pytorch_model-00002-of-00002.bin b/merged/pytorch_model-00002-of-00002.bin new file mode 100644 index 0000000000000000000000000000000000000000..e445c36d05d0984cb7b894b66ec882b47555d43d --- /dev/null +++ b/merged/pytorch_model-00002-of-00002.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89939bdc254a14ef5179d72f6886b5c898ce703a895164442f19292ce9b8e22a +size 1857653748 diff --git a/merged/pytorch_model.bin.index.json b/merged/pytorch_model.bin.index.json new file mode 100644 index 0000000000000000000000000000000000000000..8c89827f6f3500ec645cba494bc16b8e901a09fa --- /dev/null +++ b/merged/pytorch_model.bin.index.json @@ -0,0 +1,244 @@ +{ + "metadata": { + "total_size": 6852947200 + }, + "weight_map": { + "lm_head.weight": "pytorch_model-00002-of-00002.bin", + "model.embed_tokens.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.self_attn.v_proj.weight": 
"pytorch_model-00001-of-00002.bin", + "model.layers.1.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.1.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.1.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.1.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.1.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.1.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.1.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.1.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.1.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.11.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.11.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.11.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.11.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.11.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.11.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + 
"model.layers.11.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.11.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.11.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.14.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.14.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.14.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.14.mlp.up_proj.weight": 
"pytorch_model-00001-of-00002.bin", + "model.layers.14.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.14.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.14.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.14.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.14.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.17.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + 
"model.layers.17.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.17.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.17.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.17.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.17.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.17.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.17.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.17.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.19.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.19.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.19.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.19.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.19.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.19.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.19.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.19.self_attn.q_proj.weight": 
"pytorch_model-00001-of-00002.bin", + "model.layers.19.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.20.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.20.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.20.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.20.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.20.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.20.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.20.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.20.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.20.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.21.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.21.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.21.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.21.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.21.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + 
"model.layers.21.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.21.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.21.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.21.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.22.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.22.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.22.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.22.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.22.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.22.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.22.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.22.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.22.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.23.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.23.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.23.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.23.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.23.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.23.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.23.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.23.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.23.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.24.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.24.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.24.mlp.gate_proj.weight": 
"pytorch_model-00002-of-00002.bin", + "model.layers.24.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.24.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.24.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.24.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.24.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.24.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.3.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.3.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.3.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.3.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.3.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.3.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.3.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.3.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.3.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + 
"model.layers.4.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.4.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.4.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.4.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.4.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.4.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.4.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.4.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.4.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.6.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.6.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.6.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.6.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.6.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.6.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.6.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + 
"model.layers.6.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.6.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.9.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.9.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.9.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.9.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.9.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + 
"model.layers.9.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.9.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.9.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.9.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.norm.weight": "pytorch_model-00002-of-00002.bin" + } +} diff --git a/merged/special_tokens_map.json b/merged/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..72ecfeeb7e14d244c936169d2ed139eeae235ef1 --- /dev/null +++ b/merged/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/merged/tokenizer.model b/merged/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..98866ff8ae3631f331c57923c921a0c9ad22b97d --- /dev/null +++ b/merged/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91b289e85fa20fd375d8b33dc12f77616f18abc6359804471d1fafcb425fecb8 +size 511574 diff --git a/merged/tokenizer_config.json b/merged/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c218d1b7228e3ad6055bdcf0ec15c4f188dc7d79 --- /dev/null +++ b/merged/tokenizer_config.json @@ -0,0 +1,43 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, 
+ "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": true, + "model_max_length": 2048, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false, + "use_fast": true +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..72ecfeeb7e14d244c936169d2ed139eeae235ef1 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c218d1b7228e3ad6055bdcf0ec15c4f188dc7d79 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,43 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": true, + "model_max_length": 2048, + 
"pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false, + "use_fast": true +}